fix 'intermediate_size' in Llama configuration files after the 'mlp_type' option was removed (#1309)

* fix 'intermediate_size' in Llama configuration files after the 'mlp_type' option was removed

* config adjustments for llama and gated activations

* pre-commit

---------

Co-authored-by: jahatef <[email protected]>
Co-authored-by: Quentin Anthony <[email protected]>
3 people authored Nov 13, 2024
1 parent 96c242e commit fc74a0c
Showing 9 changed files with 14 additions and 5 deletions.
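
With 'mlp_type' gone, the Llama configs now carry an explicit "intermediate_size", and the second commit bullet notes that the values were adjusted for gated activations. A minimal sketch of the arithmetic behind the new numbers — assuming the gated-MLP path scales the configured width by 2/3, splits it across the gate and up projections, and rounds each projection up to a multiple of 256 (an assumption about the runtime behaviour, not code copied from the repo) — recovers the published LLaMA feed-forward widths from every value in this diff:

def per_gate_ffn_width(intermediate_size: int, multiple_of: int = 256) -> int:
    # Assumed behaviour: scale by 2/3, split across gate/up, round up to a multiple.
    gated = int(intermediate_size * 2 / 3) // 2
    return multiple_of * ((gated + multiple_of - 1) // multiple_of)

# (configured intermediate_size, published LLaMA FFN width)
checks = {
    "llama/7B": (32768, 11008),
    "llama/13B": (40960, 13824),
    "llama/30B": (53248, 17920),
    "llama/65B": (65536, 22016),
    "llama2/7B": (32768, 11008),
    "llama2/13B": (41472, 13824),   # 41472 == 3 * 13824
    "llama2/70B": (86016, 28672),   # 86016 == 3 * 28672
}
for name, (configured, expected) in checks.items():
    assert per_gate_ffn_width(configured) == expected, name
print("all config values map back to the published widths")
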
2 changes: 2 additions & 0 deletions configs/llama/13B.yml
@@ -6,6 +6,7 @@
  # model settings
  "num_layers": 40,
  "hidden_size": 5120,
+ "intermediate_size": 40960,
  "num_attention_heads": 40,
  "seq_length": 2048,
  "max_position_embeddings": 2048,
@@ -16,6 +17,7 @@
  "output_layer_parallelism": "column",
  "norm": "rmsnorm",
  "rms_norm_epsilon": 1.0e-6,
+ "use_bias_in_mlp": False,

  "scaled_upper_triang_masked_softmax_fusion": true,
  "bias_gelu_fusion": false,
2 changes: 2 additions & 0 deletions configs/llama/30B.yml
@@ -6,6 +6,7 @@
  # model settings
  "num_layers": 60,
  "hidden_size": 6656,
+ "intermediate_size": 53248,
  "num_attention_heads": 52,
  "seq_length": 2048,
  "max_position_embeddings": 2048,
@@ -16,6 +17,7 @@
  "output_layer_parallelism": "column",
  "norm": "rmsnorm",
  "rms_norm_epsilon": 1.0e-6,
+ "use_bias_in_mlp": False,

  "scaled_upper_triang_masked_softmax_fusion": true,
  "bias_gelu_fusion": false,
2 changes: 2 additions & 0 deletions configs/llama/65B.yml
@@ -6,6 +6,7 @@
  # model settings
  "num_layers": 80,
  "hidden_size": 8192,
+ "intermediate_size": 65536,
  "num_attention_heads": 64,
  "seq_length": 2048,
  "max_position_embeddings": 2048,
@@ -16,6 +17,7 @@
  "output_layer_parallelism": "column",
  "norm": "rmsnorm",
  "rms_norm_epsilon": 1.0e-6,
+ "use_bias_in_mlp": False,

  "scaled_upper_triang_masked_softmax_fusion": true,
  "bias_gelu_fusion": false,
2 changes: 2 additions & 0 deletions configs/llama/7B.yml
@@ -6,6 +6,7 @@
  # model settings
  "num_layers": 32,
  "hidden_size": 4096,
+ "intermediate_size": 32768,
  "num_attention_heads": 32,
  "seq_length": 2048,
  "max_position_embeddings": 2048,
@@ -16,6 +17,7 @@
  "output_layer_parallelism": "column",
  "norm": "rmsnorm",
  "rms_norm_epsilon": 1.0e-6,
+ "use_bias_in_mlp": False,

  "scaled_upper_triang_masked_softmax_fusion": true,
  "bias_gelu_fusion": false,
2 changes: 1 addition & 1 deletion configs/llama/train_config.yml
@@ -70,5 +70,5 @@
  "steps_per_print": 10,
  "keep_last_n_checkpoints": 4,
  "wall_clock_breakdown": true,
- "mlp_multiple_of": 256,
+
  }
1 change: 1 addition & 0 deletions configs/llama2/13B.yml
@@ -6,6 +6,7 @@
  # model settings
  "num_layers": 40,
  "hidden_size": 5120,
+ "intermediate_size": 41472,
  "num_attention_heads": 40,
  "seq_length": 4096,
  "max_position_embeddings": 4096,
2 changes: 1 addition & 1 deletion configs/llama2/70B.yml
@@ -6,7 +6,7 @@
  # model settings
  "num_layers": 80,
  "hidden_size": 8192,
- "intermediate_size": 28672,
+ "intermediate_size": 86016,
  "num_attention_heads": 64,
  "num_kv_heads": 8,
  "seq_length": 4096,
1 change: 1 addition & 0 deletions configs/llama2/7B.yml
@@ -6,6 +6,7 @@
  # model settings
  "num_layers": 32,
  "hidden_size": 4096,
+ "intermediate_size": 32768,
  "num_attention_heads": 32,
  "seq_length": 4096,
  "max_position_embeddings": 4096,
5 changes: 2 additions & 3 deletions megatron/model/transformer.py
@@ -1245,9 +1245,8 @@ def forward(self, x, attention_mask, layer_past=None):

          with torch.enable_grad() if not self.eval else nullcontext():
              if (
-                 self.activation == "swiglu"
-                 or self.num_experts > 1
-                 and self.moe_type == "deepspeed"
+                 mlp_bias == None,
+                 self.num_experts > 1 and self.moe_type == "deepspeed",
              ):
                  # No dropout either
                  assert mlp_bias is None
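
One Python detail worth noting about the rewritten condition: a parenthesized, comma-separated expression is a tuple, and a non-empty tuple is always truthy, so this `if` now enters its branch regardless of `mlp_bias` or the MoE settings; the `assert mlp_bias is None` inside then enforces the bias-free case. A two-line illustration of the tuple behaviour:

# Any non-empty tuple is truthy, whatever its elements evaluate to.
print(bool((False, False)))  # True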
