[AutoParallel]:add gpt & baichuan auto ci (#9551)
* [AutoParallel]:add gpt & baichuan auto ci

* [AutoParallel]:add gpt & baichuan auto ci

* [AutoParallel]:add gpt & baichuan auto ci

* [AutoParallel]:add gpt & baichuan auto ci

* [AutoParallel]:add gpt & baichuan auto ci

* [AutoParallel]:fix attention_error

* [AutoParallel]:fix attention_error

* [AutoParallel]:fix attention_error

* [AutoParallel]:fix attention_error

* [AutoParallel]:add gpt & baichuan ci

* [AutoParallel]:add gpt & baichuan ci

* [AutoParallel]:add gpt & baichuan ci

* [AutoParallel]:add gpt & baichuan ci
blacksheep-Aristotle authored Dec 9, 2024
1 parent eb4e6a1 commit 8ed579a
Showing 3 changed files with 179 additions and 26 deletions.
36 changes: 21 additions & 15 deletions llm/auto_parallel/gpt-3/run_pretrain_auto.py
@@ -68,12 +68,6 @@ class PreTrainingArguments(AutoTrainingArguments):
"help": "The steps use to control the learing rate. If the step > decay_steps, will use the min_learning_rate."
},
)
enable_linear_fused_grad_add: bool = field(
default=False,
metadata={
"help": "Enable fused linear grad add strategy, which will reduce elementwise add for grad accumulation in the backward of nn.Linear ."
},
)
job_schedule_profiler_start: int = field(
default=-1,
metadata={"help": "The step to start job_schedule_profiler."},
@@ -204,6 +198,15 @@ class ModelArguments:
default=False,
metadata={"help": "whether to fuse first up and gate proj in mlp block"},
)
# this optional can be use in run_pretrain.py
use_fast_layer_norm: bool = field(
default=False,
metadata={"help": "GPT3 model, use fast layernorm"},
)
use_fused_dropout_add: bool = field(
default=False,
metadata={"help": "Gpt3 model, use_fused_dropout_add"},
)
recompute_granularity: str = field(
default="full",
metadata={"help": "Choose among ['full', 'core_attn', 'full_attn']"},
@@ -353,6 +356,7 @@ def get_train_data_file(args):
class PretrainingTrainer(AutoTrainer):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.is_pretraining = True

def _wrap_for_dist_loader(self, train_dataloader):
dist_loader = super()._wrap_for_dist_loader(train_dataloader)
@@ -391,13 +395,18 @@ def init_seed(seed: int = 1234, args=None):
else:
assert not args.use_hybrid_parallel and args.enable_auto_parallel
if dist.get_world_size() > 1:
if args.hybrid_parallel_topo_order is None or args.hybrid_parallel_topo_order == "pp_first":
order = ["pp", "dp", "sharding", "mp", "sep"]
elif args.hybrid_parallel_topo_order == "sharding_first":
order = ["dp", "sharding", "pp", "mp", "sep"]
topo = Topology(
dist.get_rank(),
dist.get_world_size(),
dp_degree=args.data_parallel_degree,
dp_degree=max(args.data_parallel_degree, args.sharding_parallel_degree),
pp_degree=args.pipeline_parallel_degree,
mp_degree=args.tensor_parallel_degree,
sharding_degree=1, # auto_parallel's sharding is not orthogonal with dp, mp and pp
sharding_degree=1,
order=order,
)

global_seed, local_seed, random_seed = _get_distributed_seeds(args.seed, topo)
@@ -423,11 +432,6 @@ def main():
else:
model_args, data_args, training_args = parser.parse_args_into_dataclasses()

if training_args.enable_linear_fused_grad_add:
from fused_layers import mock_layers

mock_layers()

if model_args.tokenizer_name_or_path is None:
model_args.tokenizer_name_or_path = model_args.model_name_or_path

@@ -467,7 +471,7 @@ def main():
tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name_or_path)

config = config_class.from_pretrained(model_args.model_name_or_path)

config.use_fast_layer_norm = model_args.use_fast_layer_norm
config.seq_length = data_args.max_seq_length
# There are some technique extend RotaryEmbedding context. so don't change max_position_embeddings
if not model_args.continue_training:
@@ -491,7 +495,7 @@ def main():
config.num_attention_heads = (
model_args.num_attention_heads if model_args.num_attention_heads is not None else config.num_attention_heads
)

config.use_fused_dropout_add = model_args.use_fused_dropout_add
config.use_flash_attention = model_args.use_flash_attention
config.use_fused_rms_norm = model_args.use_fused_rms_norm
config.fuse_attention_qkv = model_args.fuse_attention_qkv
@@ -533,6 +537,8 @@ def main():
def fn(layer):
if hasattr(layer, "enable_recompute") and (layer.enable_recompute is False or layer.enable_recompute == 0):
layer.enable_recompute = True
if hasattr(layer, "layerwise_recompute"):
layer.layerwise_recompute = True

model.apply(fn)

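Taken together, the run_pretrain_auto.py changes add two GPT-3-only switches (use_fast_layer_norm, use_fused_dropout_add) that are copied straight onto the model config, and extend the recompute hook to also set layerwise_recompute. The sketch below illustrates that flow with stand-in classes; it is not the script itself.

```python
# Illustrative stand-ins only -- not the real ModelArguments/config classes
# from run_pretrain_auto.py.
from dataclasses import dataclass, field
from types import SimpleNamespace


@dataclass
class ModelArguments:
    # Mirrors the two fields added in the diff above.
    use_fast_layer_norm: bool = field(default=False, metadata={"help": "GPT3 model, use fast layernorm"})
    use_fused_dropout_add: bool = field(default=False, metadata={"help": "Gpt3 model, use_fused_dropout_add"})


model_args = ModelArguments(use_fast_layer_norm=True, use_fused_dropout_add=True)

# The script copies the flags onto the pretrained config; a namespace stands in here.
config = SimpleNamespace()
config.use_fast_layer_norm = model_args.use_fast_layer_norm
config.use_fused_dropout_add = model_args.use_fused_dropout_add


# Recompute hook applied via model.apply(fn): unchanged logic for
# enable_recompute, plus the new layerwise_recompute toggle.
def fn(layer):
    if hasattr(layer, "enable_recompute") and (layer.enable_recompute is False or layer.enable_recompute == 0):
        layer.enable_recompute = True
    if hasattr(layer, "layerwise_recompute"):
        layer.layerwise_recompute = True
```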
14 changes: 11 additions & 3 deletions paddlenlp/trainer/auto_training_args.py
@@ -25,6 +25,11 @@ class AutoTrainingArguments(TrainingArguments):
Training Arguments for auto_parallel.
"""

fused_linear: bool = field(
default=False,
metadata={"help": "Enable fused linear op, which will fuse matmul and bias add together."},
)

fused_linear_param_grad_add: bool = field(
default=False,
metadata={
@@ -46,17 +51,20 @@ def __post_init__(self):
super().__post_init__()
assert self.enable_auto_parallel

fused_passes = self.strategy.fused_passes

if self.fused_linear_param_grad_add:
fused_passes = self.strategy.fused_passes
fused_passes.enable = True
fused_passes.fused_passes_list.append("fused_linear_param_grad_add_pass")

if self.fuse_allreduce_split_to_reducescatter:
fused_passes = self.strategy.fused_passes
fused_passes.enable = True
fused_passes.fused_passes_list.append("fuse_allreduce_split_to_reducescatter_pass")

if self.eliminate_transpose:
fused_passes = self.strategy.fused_passes
fused_passes.enable = True
fused_passes.fused_passes_list.append("eliminate_transpose")

if self.fused_linear:
fused_passes.enable = True
fused_passes.fused_passes_list.append("fused_gemm_epilogue_pass")
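Functionally, the __post_init__ change hoists the fused_passes lookup and adds one more flag-to-pass mapping (--fused_linear enables fused_gemm_epilogue_pass). Below is a minimal, hedged sketch of that mapping using stand-in objects; the real strategy object comes from Paddle's auto-parallel Strategy and the real values from the parsed AutoTrainingArguments.

```python
# Hedged sketch of the flag-to-pass wiring in AutoTrainingArguments.__post_init__
# after this change. SimpleNamespace objects stand in for the real strategy and
# parsed arguments; only the two attributes used above are mimicked.
from types import SimpleNamespace

fused_passes = SimpleNamespace(enable=False, fused_passes_list=[])

# Hypothetical argument values chosen for illustration.
args = SimpleNamespace(
    fused_linear=True,
    fused_linear_param_grad_add=True,
    fuse_allreduce_split_to_reducescatter=False,
    eliminate_transpose=False,
)

# Each enabled flag switches the fused-pass machinery on and appends its pass
# name, mirroring the branches in __post_init__.
flag_to_pass = [
    ("fused_linear_param_grad_add", "fused_linear_param_grad_add_pass"),
    ("fuse_allreduce_split_to_reducescatter", "fuse_allreduce_split_to_reducescatter_pass"),
    ("eliminate_transpose", "eliminate_transpose"),
    ("fused_linear", "fused_gemm_epilogue_pass"),
]
for flag, pass_name in flag_to_pass:
    if getattr(args, flag):
        fused_passes.enable = True
        fused_passes.fused_passes_list.append(pass_name)

print(fused_passes.fused_passes_list)
# ['fused_linear_param_grad_add_pass', 'fused_gemm_epilogue_pass']
```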
155 changes: 147 additions & 8 deletions scripts/distribute/ci_case_auto.sh
@@ -101,6 +101,7 @@ function llama_case_list_auto() {
llama_align_dygraph_dy2st_pir_auto_grad_merge_bs2_fp32_DP1-MP1-PP1
llama_align_dy2st_fthenb_and_vpp_auto_bs2_fp32_DP1-MP1-PP4
llama_align_dygraph_dy2st_pir_auto_pp_bs2_bf16_DP1-MP1-PP4
llama_baichuan_pir_auto_fuse_ffn_attention_qkv_DP2_MP2_PP2
)
if [ $1 = "prepare_case" ]; then
restore_func $fun_list
@@ -126,6 +127,7 @@ function llm_gpt_case_list_auto() {
llm_gpt_dygraph_auto_bs8_fp16_DP2-MP2-PP2
llm_gpt_pir_auto_bs4_TP2
llm_gpt_pir_auto_bs4_TP2_PP2
llm_gpt_pir_auto_bs8_DP2_TP2_PP2
)
if [ $1 = "prepare_case" ]; then
restore_func $fun_list
@@ -1508,7 +1510,77 @@ function llama_convert_hybrid_ckpt_to_auto_parallel_bs2_fp32_DP2-MP1-PP1() {
check_result $FUNCNAME ${dy_loss} ${auto_loss} ${dy_ips} ${auto_ips} ${dy_mem} ${auto_mem}
echo "=========== $FUNCNAME run end ==========="
}
function llama_baichuan_pir_auto_fuse_ffn_attention_qkv_DP2_MP2_PP2(){
echo "=========== $FUNCNAME run begin ==========="
export PYTHONPATH=$root_path/:$PYTHONPATH
export FLAGS_call_stack_level=3
export NVIDIA_TF32_OVERRIDE=0
export FLAGS_enable_pir_api=1

task_name="llama_baichuan_pir_auto_fuse_ffn_attention_qkv_DP2_MP2_PP2"
case_out_dir="output/$task_name"
case_log_dir="output/$task_name""_log"
rm -rf $case_out_dir
rm -rf $case_log_dir

python -u -m paddle.distributed.launch --gpus "0,1,2,3,4,5,6,7" --log_dir $case_log_dir run_pretrain_auto.py \
--model_type "llama" \
--model_name_or_path "baichuan-inc/Baichuan2-13B-Base" \
--tokenizer_name_or_path "baichuan-inc/Baichuan2-13B-Base" \
--input_dir "./data" \
--output_dir $case_out_dir \
--split 949,50,1 \
--to_static true \
--pipeline_parallel_degree 2 \
--tensor_parallel_degree 2 \
--virtual_pp_degree 2\
--pipeline_schedule_mode "1F1B" \
--weight_decay 0.01 \
--warmup_ratio 0.01 \
--max_grad_norm 0.0 \
--learning_rate 3e-05 \
--min_learning_rate 3e-06 \
--max_steps 10 \
--logging_steps 1 \
--eval_steps 10000 \
--save_steps 1000 \
--continue_training 0 \
--do_train true \
--do_eval false \
--do_predict false \
--disable_tqdm true \
--save_total_limit 2 \
--device gpu \
--dataloader_num_workers 4 \
--distributed_dataloader 0 \
--enable_auto_parallel 1 \
--per_device_train_batch_size 1 \
--gradient_accumulation_steps 32 \
--per_device_eval_batch_size 1 \
--recompute false \
--recompute_use_reentrant true \
--recompute_granularity full \
--pp_recompute_interval 0 \
--bf16 true \
--fp16_opt_level "O2" \
--amp_master_grad true \
--fuse_attention_ffn true \
--fuse_attention_qkv true \
--use_flash_attention false \
--use_fused_rope true \
--use_fused_rms_norm false \
--max_seq_length 4096 \
--sequence_parallel false \
--sharding "stage1" \
--data_parallel_config "enable_allreduce_avg_in_gradinent_scale gradient_sync_after_accumulate " \
--sharding_parallel_config "enable_stage1_overlap" \
--tensor_parallel_config "enable_mp_async_allreduce" \
--pipeline_parallel_config "enable_send_recv_overlap" \
--auto_parallel_resume_form_hybrid_parallel true \
--num_hidden_layers 2 \
>>${log_path}/$FUNCNAME 2>&1
echo "=========== $FUNCNAME run end ==========="
}
function llm_gpt_dygraph_auto_bs8_fp32_DP2() {
echo "=========== $FUNCNAME run begin ==========="
export PYTHONPATH=$root_path/:$PYTHONPATH
@@ -1714,7 +1786,7 @@ function llm_gpt_dygraph_auto_bs8_fp32_DP2-MP2-PP2() {
mem=-1
echo "result: loss=$loss ips=$ips mem=$mem loss_md5=$loss_md5"
# loss_base=10.59993172 # note: need to debug
loss_base=10.59891224
loss_base=10.58103752
ips_base=-1
mem_base=-1
if [ $IS_A100 -ne 0 ];then
@@ -1787,7 +1859,7 @@ function llm_gpt_dygraph_auto_bs8_fp16_DP2-MP2-PP2() {
mem=-1
echo "result: loss=$loss ips=$ips mem=$mem loss_md5=$loss_md5"
# loss_base=10.58456802 # note: need to debug
loss_base=10.59941673
loss_base=10.58146572
ips_base=-1
mem_base=-1
if [ $IS_A100 -ne 0 ];then
@@ -1803,7 +1875,7 @@ function llm_gpt_pir_auto_bs4_TP2(){
export PYTHONPATH=$root_path/:$PYTHONPATH
export FLAGS_call_stack_level=3
export NVIDIA_TF32_OVERRIDE=0

export FLAGS_enable_pir_api=1
cd ${llm_gpt_case_path}

task_name="gpt3_auto_bs4_tp2"
@@ -1853,7 +1925,7 @@ function llm_gpt_pir_auto_bs4_TP2(){
--to_static 1 \
--fp16 0 \
--fp16_opt_level "O2" \
--num_hidden_layers 4 \
--num_hidden_layers 2 \
--intermediate_size 1024 \
>>${log_path}/$FUNCNAME 2>&1
echo "=========== $FUNCNAME run end ==========="
@@ -1864,7 +1936,7 @@ function llm_gpt_pir_auto_bs4_TP2_PP2(){
export PYTHONPATH=$root_path/:$PYTHONPATH
export FLAGS_call_stack_level=3
export NVIDIA_TF32_OVERRIDE=0

export FLAGS_enable_pir_api=1
cd ${llm_gpt_case_path}

task_name="gpt3_auto_bs4_tp2_pp2"
@@ -1888,7 +1960,7 @@ function llm_gpt_pir_auto_bs4_TP2_PP2(){
--tensor_parallel_degree 2 \
--pipeline_parallel_degree 2 \
--sequence_parallel 0 \
--fuse_attention_qkv 0 \
--fuse_attention_qkv 1 \
--use_flash_attention 0 \
--scale_loss 1024 \
--learning_rate 0.00001 \
@@ -1912,10 +1984,77 @@ function llm_gpt_pir_auto_bs4_TP2_PP2(){
--model_type "gpt" \
--enable_auto_parallel 1 \
--to_static 1 \
--fp16 0 \
--fp16 1 \
--fp16_opt_level "O2" \
--num_hidden_layers 4 \
--num_hidden_layers 2 \
--intermediate_size 1024 \
>>${log_path}/$FUNCNAME 2>&1
echo "=========== $FUNCNAME run end ==========="
}

function llm_gpt_pir_auto_bs8_DP2_TP2_PP2(){
echo "=========== $FUNCNAME run begin ==========="
export PYTHONPATH=$root_path/:$PYTHONPATH
export FLAGS_call_stack_level=3
export NVIDIA_TF32_OVERRIDE=0
export FLAGS_enable_pir_api=1
cd ${llm_gpt_case_path}

task_name="gpt3_auto_bs8_dp2_tp2_pp2"
case_out_dir="output/$task_name"
case_log_dir="output/$task_name""_log"
rm -rf $case_out_dir
rm -rf $case_log_dir

python -u -m paddle.distributed.launch --gpus "0,1,2,3,4,5,6,7" \
--log_dir $case_log_dir \
run_pretrain_auto.py \
--model_name_or_path gpt3-13B-en \
--tokenizer_name_or_path gpt3-13B-en \
--input_dir "$gpt_data_path/data" \
--output_dir "output/$task_name" \
--split 949,50,1 \
--max_seq_length 1024 \
--per_device_train_batch_size 1 \
--per_device_eval_batch_size 1 \
--sharding "stage1" \
--tensor_parallel_degree 2 \
--pipeline_parallel_degree 2 \
--pipeline_schedule_mode "1F1B" \
--sequence_parallel 0 \
--fuse_attention_qkv 1 \
--use_flash_attention 0 \
--fused_linear_param_grad_add 1\
--scale_loss 1024 \
--learning_rate 0.00001 \
--min_learning_rate 0.000005 \
--max_steps 10 \
--save_steps 50000 \
--weight_decay 0.01 \
--warmup_ratio 0.01 \
--max_grad_norm 1.0 \
--logging_steps 1\
--continue_training 0\
--dataloader_num_workers 1 \
--eval_steps 100000 \
--report_to "visualdl" \
--disable_tqdm true \
--recompute 0 \
--gradient_accumulation_steps 4 \
--do_train \
--do_eval \
--device "gpu" \
--model_type "gpt" \
--enable_auto_parallel 1 \
--to_static 1 \
--fp16 1 \
--fp16_opt_level "O2" \
--num_hidden_layers 2 \
--intermediate_size 1024 \
--sharding_parallel_config "enable_stage1_tensor_fusion enable_stage1_overlap" \
--tensor_parallel_config "enable_mp_async_allreduce" \
--data_parallel_config "enable_allreduce_avg_in_gradinent_scale gradient_sync_after_accumulate" \
--pipeline_parallel_config "enable_send_recv_overlap enable_split_backward" \
>>${log_path}/$FUNCNAME 2>&1
echo "=========== $FUNCNAME run end ==========="
}