Add uniform_random gate mode to mergekit-moe (#303)
To better match initialization of `nn.Linear`.
cg123 authored May 4, 2024
1 parent 09c63e6 commit ca96e86
Showing 2 changed files with 16 additions and 1 deletion.
mergekit/moe/config.py — 4 additions, 1 deletion
@@ -41,10 +41,13 @@ class MoEMergeConfig(BaseModel):

     base_model: ModelReference
     experts: List[Expert]
-    gate_mode: str = "hidden"  # possible values: "hidden", "cheap_embed", "random"
+    gate_mode: str = (
+        "hidden"  # possible values: "hidden", "cheap_embed", "random", "uniform_random"
+    )
     # "hidden" uses hidden state vectors for the given prompts for each layer
     # "cheap_embed" uses the average of token embeddings for the prompts, same for each layer
     # "random" is random
+    # "uniform_random" matches default initialization for torch.nn.Linear
     dtype: Optional[str] = None
     experts_per_token: int = 2
     shared_experts: Optional[List[Expert]] = None
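For reference, the new value plugs into the same gate_mode field as the existing options. Below is a minimal sketch of a merge configuration selecting it, written as the Python dict that MoEMergeConfig would validate; the per-expert field names (source_model, positive_prompts) and the model identifiers are illustrative assumptions, not shown in this diff.

# Hypothetical mergekit-moe configuration using the new gate mode.
# Per-expert field names and model identifiers below are assumptions for
# illustration; only the top-level MoEMergeConfig fields appear in this diff.
moe_config = {
    "base_model": "path/or/hf-id-of-base-model",
    "gate_mode": "uniform_random",  # routers drawn from U(-1/sqrt(h), 1/sqrt(h))
    "dtype": "bfloat16",
    "experts_per_token": 2,
    "experts": [
        # Prompts are presumably unused by the random-style gate modes.
        {"source_model": "path/or/hf-id-of-expert-0", "positive_prompts": []},
        {"source_model": "path/or/hf-id-of-expert-1", "positive_prompts": []},
    ],
}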
mergekit/moe/router.py — 12 additions, 0 deletions
@@ -14,6 +14,7 @@
 # along with this program. If not, see http://www.gnu.org/licenses/.
 
 import logging
+import math
 from typing import Dict, List, Union
 
 import torch
@@ -99,6 +100,17 @@ def get_gate_params(
         return torch.randn(
             (model_cfg.num_hidden_layers, len(experts), model_cfg.hidden_size)
         )
+    elif mode == "uniform_random":
+        in_features = model_cfg.hidden_size
+        scale = math.sqrt(1.0 / in_features)
+        return (
+            torch.rand(
+                (model_cfg.num_hidden_layers, len(experts), model_cfg.hidden_size)
+            )
+            * 2
+            * scale
+            - scale
+        )
     elif mode == "cheap_embed":
         embed = model_ref.lazy_loader(lazy_unpickle=lazy_unpickle).get_tensor(
             "model.embed_tokens.weight"
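The arithmetic in the new branch maps torch.rand's [0, 1) samples onto [-scale, scale) with scale = 1/sqrt(hidden_size), the same bound torch.nn.Linear's default weight initialization (kaiming_uniform_ with a=sqrt(5)) resolves to for in_features = hidden_size. A small sanity-check sketch using only standard PyTorch, with placeholder sizes:

import math

import torch

# Placeholder dimensions standing in for the model_cfg values used above.
hidden_size, num_experts, num_layers = 4096, 8, 32

# The commit's formula: uniform on [-scale, scale), scale = 1/sqrt(in_features).
scale = math.sqrt(1.0 / hidden_size)
gate = torch.rand((num_layers, num_experts, hidden_size)) * 2 * scale - scale

# nn.Linear's default-initialized weight uses the same bound for this fan-in.
ref = torch.nn.Linear(hidden_size, num_experts).weight

print(gate.abs().max().item() <= scale)     # True: samples stay within the bound
print(ref.abs().max().item() <= scale)      # True: same bound as the default init
print(gate.std().item(), ref.std().item())  # both near scale / sqrt(3)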
