diff --git a/tests/test_tipc/static/auto_parallel/baichuan2/N4C32/baichuan-inc-baichaun-2-13b_pretrain_bs32_bf16_DP1_MP2_PP4_1F1B_Sharding8_Stage2.sh b/tests/test_tipc/static/auto_parallel/baichuan2/N4C32/baichuan-inc-baichaun-2-13b_pretrain_bs32_bf16_DP1_MP2_PP4_1F1B_Sharding8_Stage2.sh
new file mode 100644
index 000000000000..4adc579b79de
--- /dev/null
+++ b/tests/test_tipc/static/auto_parallel/baichuan2/N4C32/baichuan-inc-baichaun-2-13b_pretrain_bs32_bf16_DP1_MP2_PP4_1F1B_Sharding8_Stage2.sh
@@ -0,0 +1,25 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+param="model_item=baichuan-inc-baichaun-2-13b_pretrain "
+param+="run_mode=DP1_MP2_PP4_1F1B_Sharding8_Stage2 "
+param+="device_num=N4C32 "
+param+="global_batch_size=32 "
+param+="nnodes=4 "
+param+="model_type=baichuan2_13b "
+
+cd ./tests
+bash ./test_tipc/static/auto_parallel/baichuan2/benchmark_common/prepare.sh
+
+bash -c "${param} bash ./test_tipc/static/auto_parallel/baichuan2/benchmark_common/run_benchmark.sh"
diff --git a/tests/test_tipc/static/auto_parallel/baichuan2/benchmark_common/prepare.sh b/tests/test_tipc/static/auto_parallel/baichuan2/benchmark_common/prepare.sh
new file mode 100644
index 000000000000..90f4ce0363eb
--- /dev/null
+++ b/tests/test_tipc/static/auto_parallel/baichuan2/benchmark_common/prepare.sh
@@ -0,0 +1,36 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
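+
+# NOTE: this script is assumed to be invoked from the PaddleNLP/tests
+# directory (the N4C32 entry script runs `cd ./tests` first); the relative
+# paths below depend on that working directory.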
+
+python -m pip install -r ../requirements.txt
+python -m pip install -r ../requirements-dev.txt
+
+# install fused_ln custom ops
+cd ../slm/model_zoo/gpt-3/external_ops/
+python setup.py install
+cd -
+
+# enter the training directory and install fast_dataindex
+cd ../llm/auto_parallel/llama
+python -m pip install fast_dataindex
+
+# download data
+wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_ids.npy
+wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_idx.npz
+mkdir -p data
+mv llama_openwebtext_100k_ids.npy ./data
+mv llama_openwebtext_100k_idx.npz ./data
+
+# copy pretrain_config
+rm -rf pretrain_config_*
+cp -r ../../../tests/test_tipc/static/auto_parallel/baichuan2/pretrain_config_* ./
diff --git a/tests/test_tipc/static/auto_parallel/baichuan2/benchmark_common/run_benchmark.sh b/tests/test_tipc/static/auto_parallel/baichuan2/benchmark_common/run_benchmark.sh
new file mode 100644
index 000000000000..e550b7256f7f
--- /dev/null
+++ b/tests/test_tipc/static/auto_parallel/baichuan2/benchmark_common/run_benchmark.sh
@@ -0,0 +1,249 @@
+#!/usr/bin/env bash
+
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Test training benchmark for a model.
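+#
+# Usage: parameters are passed as environment variables rather than positional
+# arguments. A sketch of a typical invocation, mirroring the N4C32 entry
+# script (values are illustrative):
+#
+#   model_item=baichuan-inc-baichaun-2-13b_pretrain \
+#   run_mode=DP1_MP2_PP4_1F1B_Sharding8_Stage2 \
+#   device_num=N4C32 global_batch_size=32 nnodes=4 model_type=baichuan2_13b \
+#   bash ./test_tipc/static/auto_parallel/baichuan2/benchmark_common/run_benchmark.sh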
+function _set_params(){
+    model_item=${model_item:-"baichuan-inc-baichaun-2-13b_pretrain"}
+    run_mode=${run_mode:-"MP4-PP2"}
+    device_num=${device_num:-"N4C32"}
+    global_batch_size=${global_batch_size:-64}
+    fp_item="bf16"
+    MODEL_TYPE=${model_type:-"baichuan2_13b"}
+
+    ip_lists=($(echo $TRAINER_INSTANCES | tr ',' ' '))
+    master_ip=${ip_lists[0]}
+    nnodes=${nnodes:-1}
+
+    base_batch_size=${global_batch_size}
+    profiling=${PROFILING:-"false"}          # (required) profiling switch, off by default, passed in as a global variable
+    model_repo="PaddleNLP"                   # (required) name of the model suite
+    speed_unit="tokens/s"                    # (required) unit of the speed metric
+    skip_steps=10                            # (required) number of initial unstable steps to skip when parsing the log
+    keyword="interval_tokens_per_second_per_device:" # (required) keyword marking the log lines that carry performance data
+    convergence_key="loss:"                  # (optional) keyword marking the log lines that carry convergence data, e.g. convergence_key="loss:"
+    model_mode=5                             # collect ips values and their unit; average only the steps after skip_steps and keep tokens/s as the unit
+
+    # Common setup below; usually no changes are needed.
+    model_name=${model_item}_bs${global_batch_size}_${fp_item}_${run_mode} # (required) do not change this format; it is aligned with competitor model names
+    device=${CUDA_VISIBLE_DEVICES//,/ }
+    arr=(${device})
+    num_gpu_devices=${#arr[*]}
+    run_log_path=${TRAIN_LOG_DIR:-$(pwd)}  # (required) TRAIN_LOG_DIR is set as a global variable by the benchmark framework
+    profiling_log_path=${PROFILING_LOG_DIR:-$(pwd)}  # (required) PROFILING_LOG_DIR is set as a global variable by the benchmark framework
+    speed_log_path=${LOG_PATH_INDEX_DIR:-$(pwd)}
+    train_log_file=${run_log_path}/${model_repo}_${model_name}_${device_num}_log
+    mkdir -p $(dirname ${train_log_file})
+
+    profiling_log_file=${profiling_log_path}/${model_repo}_${model_name}_${device_num}_profiling
+    mkdir -p $(dirname ${profiling_log_file})
+
+    speed_log_file=${speed_log_path}/${model_repo}_${model_name}_${device_num}_speed
+    mkdir -p $(dirname ${speed_log_file})
+
+    OUTPUT_PATH=${run_log_path}/output
+}
+
+# Periodically monitor the log file for new writes and check the training process status.
+monitor_log_file() {
+    local log_file="$1"        # path of the log file
+    local training_pid="$2"    # PID of the training process
+    local no_update_duration=0 # seconds elapsed without log updates
+    local last_size=0
+    local kill_flag_file="/tmp/monitor_killed_$training_pid"
+
+    echo "$(date '+%Y-%m-%d %H:%M:%S') Start monitoring process $training_pid and log file $log_file..."
+
+    while true; do
+        sleep 5  # check the log file every 5 seconds
+
+        # Check whether the log file exists.
+        if [ ! -f "$log_file" ]; then
+            echo "Log file $log_file does not exist; checking process status..."
+            # If the log file is missing, just check whether the process has finished.
+            if ! ps -p $training_pid > /dev/null; then
+                echo "$(date '+%Y-%m-%d %H:%M:%S') Process $training_pid has finished."
+                break
+            fi
+            continue  # skip the rest of the loop body while the file is missing
+        fi
+
+        # Get the current size of the log file.
+        new_size=$(stat -c %s "$log_file")
+
+        if [ "$last_size" -eq "$new_size" ]; then
+            # File size unchanged; accumulate the no-update duration.
+            no_update_duration=$((no_update_duration + 5))
+            echo "$(date '+%Y-%m-%d %H:%M:%S') No new writes to the log file..."
+            if [ "$no_update_duration" -ge 180 ]; then
+                echo "$(date '+%Y-%m-%d %H:%M:%S') No writes to the log file in the past 3 minutes; about to kill process $training_pid."
+                # Create the kill flag file.
+                touch "$kill_flag_file"
+                ls -l "$kill_flag_file"
+                kill -9 $training_pid  # kill the training process
+                echo "$(date '+%Y-%m-%d %H:%M:%S') Process $training_pid has been killed."
+                break
+            fi
+        else
+            # File size changed; reset the no-update duration.
+            echo "$(date '+%Y-%m-%d %H:%M:%S') Log file is still being written..."
+            no_update_duration=0
+            last_size=$new_size
+        fi
+
+        # Exit monitoring once the training process has finished.
+        if ! ps -p $training_pid > /dev/null; then
+            echo "$(date '+%Y-%m-%d %H:%M:%S') Process $training_pid has finished."
+            break
+        fi
+    done
+}
+
+function _train(){
+    batch_size=${per_device_train_batch_size}  # for multi-GPU single-process runs, compute the effective multi-GPU batch size inside _train
+
+    if [ -d $OUTPUT_PATH ]; then
+        rm -rf $OUTPUT_PATH
+    fi
+    mkdir $OUTPUT_PATH
+
+    echo "current CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}, model_name=${model_name}, device_num=${device_num}, is profiling=${profiling}"
+
+    if [ ${profiling} == "true" ];then
+        add_options="--profiler_options=\"batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile\""
+        log_file=${profiling_log_file}
+    else
+        add_options=""
+        log_file=${train_log_file}
+    fi
+
+    # Disabled due to a hanging bug
+    # if [ "${tensor_parallel_degree}" != "1" ]; then
+    #     export CUDA_DEVICE_MAX_CONNECTIONS=1
+    # fi
+
+    # if [ ${run_mode} == "autotuner" ]; then
+    #     unset PADDLE_ELASTIC_JOB_ID
+    #     unset PADDLE_TRAINER_ENDPOINTS
+    #     unset DISTRIBUTED_TRAINER_ENDPOINTS
+    #     unset FLAGS_START_PORT
+    #     unset PADDLE_ELASTIC_TIMEOUT
+    #     unset PADDLE_TRAINERS_NUM
+    #     unset PADDLE_TRAINER_ID
+    #     autoconfig_args="--auto_tuner_json ./auto_config_${MODEL_TYPE}/${MODEL_TYPE}_pretrain_autoconfig.json"
+    # else
+    #     autoconfig_args=""
+    # fi
+
+    if [ ${PADDLE_TRAINER_ID} ]; then
+        PADDLE_RANK_OPTION=" --rank ${PADDLE_TRAINER_ID}"
+    else
+        PADDLE_RANK_OPTION=""
+    fi
+
+    # if [ "$autoconfig_args" != "" ]; then
+    #     distributed_args="--master etcd://$master_ip:2379 --nnodes $nnodes:$nnodes"
+    # else
+    #     distributed_args="--master $master_ip:36677 --nnodes $nnodes ${PADDLE_RANK_OPTION} --run_mode=collective"
+    # fi
+
+    echo "==========System Env============="
+    env
+    echo "================================="
+
+    # Common launch commands below; usually no changes are needed.
+    case ${device_num} in
+    N1C8) echo "Run with: device_num=${device_num}, run_mode=${run_mode}"
+        train_cmd="python -u -m paddle.distributed.launch --gpus=0,1,2,3,4,5,6,7 \
+            --nnodes 1 --nproc_per_node 8 \
+            --log_dir mylog run_pretrain_auto.py \
+            ./pretrain_config_${MODEL_TYPE}/pretrain-${MODEL_TYPE}.json"
+        ;;
+    N4C32) echo "Run with: device_num=${device_num} run_mode=${run_mode}"
+        train_cmd="python -u -m paddle.distributed.launch --gpus=0,1,2,3,4,5,6,7 \
+            --log_dir mylog run_pretrain_auto.py \
+            ./pretrain_config_${MODEL_TYPE}/pretrain-${MODEL_TYPE}.json"
+        ;;
+    *) echo "Run with: device_num=${device_num}, run_mode=${run_mode}"
+        train_cmd="python -u -m paddle.distributed.launch --gpus=0,1,2,3,4,5,6,7 \
+            --log_dir mylog run_pretrain_auto.py \
+            ./pretrain_config_${MODEL_TYPE}/pretrain-${MODEL_TYPE}.json"
+        ;;
+    esac
+    cd ../llm/auto_parallel/llama
+    # rm -rf ./auto_config_${MODEL_TYPE}/*GBS*
+    # rm -rf ./auto_config_${MODEL_TYPE}/*auto_tuner.log
+    # rm -rf ./auto_config_${MODEL_TYPE}/*csv
+    # rm -rf ./auto_config_${MODEL_TYPE}/best_*
+    rm -rf mylog && rm -rf checkpoints
+
+    echo "train_cmd: ${train_cmd} log_file: ${log_file}"
+    timeout 40m ${train_cmd} > ${log_file} 2>&1 &
+    training_pid=$!  # PID of the background training process
+
+    # Monitor the training process and log updates.
+    monitor_log_file "$log_file" "$training_pid" &
+    monitor_log_file_pid=$!  # PID of the log-monitoring process
+
+    # Wait for the training process to finish.
+    wait $training_pid
+    exit_code=$?
+
+    # Report the exit code of the training process.
+    echo "Exit code of training process $training_pid is $exit_code"
+
+    # Clean up the background log-monitoring process.
+    kill $monitor_log_file_pid
+
+    if [ ${exit_code} -ne 0 ];then
+        echo -e "${model_name}, FAIL"
+        # If the program exited with an error on its own (rather than being
+        # killed by monitor_log_file), wait for the other nodes to be killed.
+        # Location of the kill flag file:
+        kill_flag_file="/tmp/monitor_killed_$training_pid"
+        if [ -f "$kill_flag_file" ]; then
+            echo "$(date '+%Y-%m-%d %H:%M:%S') Training process $training_pid was killed by monitor_log_file."
+            rm -f "$kill_flag_file"  # remove the flag file
+        else
+            echo "$(date '+%Y-%m-%d %H:%M:%S') Training process $training_pid exited with an error on its own."
+            sleep 120
+        fi
+    else
+        echo -e "${model_name}, SUCCESS"
+    fi
+
+    #kill -9 `ps -ef|grep 'python'|awk '{print $2}'`
+    if [ ${device_num} != "N1C1" ]; then
+        case_path=$PWD && cd - && mkdir -p mylog  # PaddleNLP/tests/mylog
+        cp -r ${case_path}/mylog/workerlog.* ./mylog/
+    fi
+}
+
+export FLAGS_selected_gpus="0,1,2,3,4,5,6,7"
+export NCCL_IB_DISABLE=0
+export PYTHONPATH=$(dirname "$PWD"):$PYTHONPATH
+# Affected by the merge of https://github.com/PaddlePaddle/Paddle/pull/69410:
+# if this flag is not set to 1, the sharding stage1 variant without tensor
+# fusion is selected by default.
+export FLAGS_enable_sharding_stage1_tensor_fusion=1
+
+# Only the 13b tasks need CUDA_DEVICE_MAX_CONNECTIONS enabled; it stays off for 7b.
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+export PARALLEL_CROSS_ENTROPY=true
+
+source ${BENCHMARK_ROOT}/scripts/run_model.sh  # run_model.sh parses performance data from benchmark-style logs with analysis.py; comment this line out to produce only the training log without parsing, but re-enable it before submitting
+_set_params $@
+#_train  # uncomment to produce only the training log, without parsing
+_run  # _run is defined in run_model.sh and calls _train; comment it out to produce only the training log, but re-enable it before submitting
diff --git a/tests/test_tipc/static/auto_parallel/baichuan2/pretrain_config_baichuan2_13b/pretrain-baichuan2_13b.json b/tests/test_tipc/static/auto_parallel/baichuan2/pretrain_config_baichuan2_13b/pretrain-baichuan2_13b.json
new file mode 100644
index 000000000000..5ef40643865b
--- /dev/null
+++ b/tests/test_tipc/static/auto_parallel/baichuan2/pretrain_config_baichuan2_13b/pretrain-baichuan2_13b.json
@@ -0,0 +1,55 @@
+{
+    "model_name_or_path": "baichuan-inc/Baichuan2-13B-Base",
+    "tokenizer_name_or_path": "baichuan-inc/Baichuan2-13B-Base",
+    "input_dir": "./data",
+    "output_dir": "./checkpoints/baichuan2_13b_ckpts",
+    "split": "949,50,1",
+    "to_static": true,
+    "pipeline_parallel_degree": 2,
+    "tensor_parallel_degree": 4,
+    "virtual_pp_degree": 2,
+    "pipeline_schedule_mode": "1F1B",
+    "weight_decay": 0.01,
+    "warmup_ratio": 0.01,
+    "max_grad_norm": 0.0,
+    "learning_rate": 0.00003,
+    "min_learning_rate": 0.000003,
+    "max_steps": 100,
+    "logging_steps": 1,
+    "eval_steps": 10000,
+    "save_steps": 1000,
+    "continue_training": 0,
+    "do_train": true,
+    "do_eval": false,
+    "do_predict": false,
+    "disable_tqdm": true,
+    "save_total_limit": 2,
+    "device": "gpu",
+    "dataloader_num_workers": 4,
+    "distributed_dataloader": 0,
+    "enable_auto_parallel": 1,
+    "per_device_train_batch_size": 1,
+    "gradient_accumulation_steps": 32,
+    "per_device_eval_batch_size": 1,
+    "recompute": false,
+    "recompute_use_reentrant": true,
+    "recompute_granularity": "full",
+    "pp_recompute_interval": 0,
+    "bf16": true,
+    "fp16_opt_level": "O2",
+    "amp_master_grad": true,
+    "fuse_attention_ffn": true,
+    "fuse_attention_qkv": true,
+    "use_flash_attention": true,
+    "fused_linear": 1,
+    "fused_linear_param_grad_add": 1,
+    "use_fused_rope": true,
+    "use_fused_rms_norm": false,
+    "max_seq_length": 4096,
+    "sequence_parallel": false,
+    "sharding": "stage1",
+    "sharding_parallel_config": "enable_stage1_tensor_fusion enable_stage1_overlap",
+    "tensor_parallel_config": "enable_mp_async_allreduce",
+    "data_parallel_config": "enable_allreduce_avg_in_gradinent_scale gradient_sync_after_accumulate",
"pipeline_parallel_config": "enable_send_recv_overlap enable_split_backward" +} \ No newline at end of file diff --git a/tests/test_tipc/static/auto_parallel/gpt3/N4C32/meta-llama-Llama-2-13b_pretrain_dy2st_bs32_bf16_DP1_MP1_PP4_1F1B_Sharding4_Stage1.sh b/tests/test_tipc/static/auto_parallel/gpt3/N4C32/meta-llama-Llama-2-13b_pretrain_dy2st_bs32_bf16_DP1_MP1_PP4_1F1B_Sharding4_Stage1.sh new file mode 100644 index 000000000000..3fe89367fb88 --- /dev/null +++ b/tests/test_tipc/static/auto_parallel/gpt3/N4C32/meta-llama-Llama-2-13b_pretrain_dy2st_bs32_bf16_DP1_MP1_PP4_1F1B_Sharding4_Stage1.sh @@ -0,0 +1,25 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +param="model_item=gpt3-13b_pretrain_dy2st " +param+="run_mode=DP1_MP2_PP4_1F1B_Sharding4_Stage1 " +param+="device_num=N4C32 " +param+="global_batch_size=32 " +param+="nnodes=4 " +param+="model_type=gpt3_13b " + +cd ./tests +bash ./test_tipc/static/auto_parallel/gpt3/benchmark_common/prepare.sh + +bash -c "${param} bash ./test_tipc/static/auto_parallel/gpt3/benchmark_common/run_benchmark.sh" diff --git a/tests/test_tipc/static/auto_parallel/gpt3/benchmark_common/prepare.sh b/tests/test_tipc/static/auto_parallel/gpt3/benchmark_common/prepare.sh new file mode 100644 index 000000000000..a7be57632792 --- /dev/null +++ b/tests/test_tipc/static/auto_parallel/gpt3/benchmark_common/prepare.sh @@ -0,0 +1,37 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+python -m pip install -r ../requirements.txt
+python -m pip install -r ../requirements-dev.txt
+
+# install fused_ln custom ops
+cd ../slm/model_zoo/gpt-3/external_ops/
+python setup.py install
+cd -
+
+# enter the training directory and install fast_dataindex
+cd ../llm/auto_parallel/gpt-3
+python -m pip install fast_dataindex
+
+# download data
+wget https://bj.bcebos.com/paddlenlp/models/transformers/gpt/data/gpt_en_dataset_300m_ids.npy
+wget https://bj.bcebos.com/paddlenlp/models/transformers/gpt/data/gpt_en_dataset_300m_idx.npz
+
+mkdir -p data
+mv gpt_en_dataset_300m_ids.npy ./data
+mv gpt_en_dataset_300m_idx.npz ./data
+
+# copy pretrain_config
+rm -rf pretrain_config_*
+cp -r ../../../tests/test_tipc/static/auto_parallel/gpt3/pretrain_config_* ./
diff --git a/tests/test_tipc/static/auto_parallel/gpt3/benchmark_common/run_benchmark.sh b/tests/test_tipc/static/auto_parallel/gpt3/benchmark_common/run_benchmark.sh
new file mode 100644
index 000000000000..f11a624ad854
--- /dev/null
+++ b/tests/test_tipc/static/auto_parallel/gpt3/benchmark_common/run_benchmark.sh
@@ -0,0 +1,253 @@
+#!/usr/bin/env bash
+
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Test training benchmark for a model.
+# Usage: parameters (model_item, run_mode, device_num, global_batch_size,
+# nnodes, model_type) are passed as environment variables; see the N4C32
+# entry scripts for an example invocation.
+function _set_params(){
+    model_item=${model_item:-"gpt3-13b_pretrain"}
+    run_mode=${run_mode:-"MP2-PP4"}
+    device_num=${device_num:-"N4C32"}
+    global_batch_size=${global_batch_size:-64}
+    fp_item="bf16"
+    MODEL_TYPE=${model_type:-"gpt3_13b"}
+
+    ip_lists=($(echo $TRAINER_INSTANCES | tr ',' ' '))
+    master_ip=${ip_lists[0]}
+    nnodes=${nnodes:-1}
+
+    base_batch_size=${global_batch_size}
+    profiling=${PROFILING:-"false"}          # (required) profiling switch, off by default, passed in as a global variable
+    model_repo="PaddleNLP"                   # (required) name of the model suite
+    speed_unit="tokens/s"                    # (required) unit of the speed metric
+    skip_steps=10                            # (required) number of initial unstable steps to skip when parsing the log
+    keyword="interval_tokens_per_second_per_device:" # (required) keyword marking the log lines that carry performance data
+    convergence_key="loss:"                  # (optional) keyword marking the log lines that carry convergence data, e.g. convergence_key="loss:"
+    model_mode=5                             # collect ips values and their unit; average only the steps after skip_steps and keep tokens/s as the unit
+
+    # Common setup below; usually no changes are needed.
+    model_name=${model_item}_bs${global_batch_size}_${fp_item}_${run_mode} # (required) do not change this format; it is aligned with competitor model names
+    device=${CUDA_VISIBLE_DEVICES//,/ }
+    arr=(${device})
+    num_gpu_devices=${#arr[*]}
+    run_log_path=${TRAIN_LOG_DIR:-$(pwd)}  # (required) TRAIN_LOG_DIR is set as a global variable by the benchmark framework
+    profiling_log_path=${PROFILING_LOG_DIR:-$(pwd)}  # (required) PROFILING_LOG_DIR is set as a global variable by the benchmark framework
+    speed_log_path=${LOG_PATH_INDEX_DIR:-$(pwd)}
+    train_log_file=${run_log_path}/${model_repo}_${model_name}_${device_num}_log
+    mkdir -p $(dirname ${train_log_file})
+
+    profiling_log_file=${profiling_log_path}/${model_repo}_${model_name}_${device_num}_profiling
+    mkdir -p $(dirname ${profiling_log_file})
+
+    speed_log_file=${speed_log_path}/${model_repo}_${model_name}_${device_num}_speed
+    mkdir -p $(dirname ${speed_log_file})
+
+    OUTPUT_PATH=${run_log_path}/output
+}
+
+# Periodically monitor the log file for new writes and check the training process status.
+monitor_log_file() {
+    local log_file="$1"        # path of the log file
+    local training_pid="$2"    # PID of the training process
+    local no_update_duration=0 # seconds elapsed without log updates
+    local last_size=0
+    local kill_flag_file="/tmp/monitor_killed_$training_pid"
+
+    echo "$(date '+%Y-%m-%d %H:%M:%S') Start monitoring process $training_pid and log file $log_file..."
+
+    while true; do
+        sleep 5  # check the log file every 5 seconds
+
+        # Check whether the log file exists.
+        if [ ! -f "$log_file" ]; then
+            echo "Log file $log_file does not exist; checking process status..."
+            # If the log file is missing, just check whether the process has finished.
+            if ! ps -p $training_pid > /dev/null; then
+                echo "$(date '+%Y-%m-%d %H:%M:%S') Process $training_pid has finished."
+                break
+            fi
+            continue  # skip the rest of the loop body while the file is missing
+        fi
+
+        # Get the current size of the log file.
+        new_size=$(stat -c %s "$log_file")
+
+        if [ "$last_size" -eq "$new_size" ]; then
+            # File size unchanged; accumulate the no-update duration.
+            no_update_duration=$((no_update_duration + 5))
+            echo "$(date '+%Y-%m-%d %H:%M:%S') No new writes to the log file..."
+            if [ "$no_update_duration" -ge 180 ]; then
+                echo "$(date '+%Y-%m-%d %H:%M:%S') No writes to the log file in the past 3 minutes; about to kill process $training_pid."
+                # Create the kill flag file.
+                touch "$kill_flag_file"
+                ls -l "$kill_flag_file"
+                kill -9 $training_pid  # kill the training process
+                echo "$(date '+%Y-%m-%d %H:%M:%S') Process $training_pid has been killed."
+                break
+            fi
+        else
+            # File size changed; reset the no-update duration.
+            echo "$(date '+%Y-%m-%d %H:%M:%S') Log file is still being written..."
+            no_update_duration=0
+            last_size=$new_size
+        fi
+
+        # Exit monitoring once the training process has finished.
+        if ! ps -p $training_pid > /dev/null; then
+            echo "$(date '+%Y-%m-%d %H:%M:%S') Process $training_pid has finished."
+            break
+        fi
+    done
+}
+
+function _train(){
+    batch_size=${per_device_train_batch_size}  # for multi-GPU single-process runs, compute the effective multi-GPU batch size inside _train
+
+    if [ -d $OUTPUT_PATH ]; then
+        rm -rf $OUTPUT_PATH
+    fi
+    mkdir $OUTPUT_PATH
+
+    echo "current CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}, model_name=${model_name}, device_num=${device_num}, is profiling=${profiling}"
+
+    if [ ${profiling} == "true" ];then
+        add_options="--profiler_options=\"batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile\""
+        log_file=${profiling_log_file}
+    else
+        add_options=""
+        log_file=${train_log_file}
+    fi
+
+    # 70b and 7b need this switch turned off.
+    if [[ "${MODEL_TYPE}" =~ "70b" || "${MODEL_TYPE}" =~ "7b" ]]; then
+        unset CUDA_DEVICE_MAX_CONNECTIONS
+    fi
+    # Disabled due to a hanging bug
+    # if [ "${tensor_parallel_degree}" != "1" ]; then
+    #     export CUDA_DEVICE_MAX_CONNECTIONS=1
+    # fi
+
+    # if [ ${run_mode} == "autotuner" ]; then
+    #     unset PADDLE_ELASTIC_JOB_ID
+    #     unset PADDLE_TRAINER_ENDPOINTS
+    #     unset DISTRIBUTED_TRAINER_ENDPOINTS
+    #     unset FLAGS_START_PORT
+    #     unset PADDLE_ELASTIC_TIMEOUT
+    #     unset PADDLE_TRAINERS_NUM
+    #     unset PADDLE_TRAINER_ID
+    #     autoconfig_args="--auto_tuner_json ./auto_config_${MODEL_TYPE}/${MODEL_TYPE}_pretrain_autoconfig.json"
+    # else
+    #     autoconfig_args=""
+    # fi
+
+    if [ ${PADDLE_TRAINER_ID} ]; then
+        PADDLE_RANK_OPTION=" --rank ${PADDLE_TRAINER_ID}"
+    else
+        PADDLE_RANK_OPTION=""
+    fi
+
+    # if [ "$autoconfig_args" != "" ]; then
+    #     distributed_args="--master etcd://$master_ip:2379 --nnodes $nnodes:$nnodes"
+    # else
+    #     distributed_args="--master $master_ip:36677 --nnodes $nnodes ${PADDLE_RANK_OPTION} --run_mode=collective"
+    # fi
+
+    echo "==========System Env============="
+    env
+    echo "================================="
+
+    # Common launch commands below; usually no changes are needed.
+    case ${device_num} in
+    N1C8) echo "Run with: device_num=${device_num}, run_mode=${run_mode}"
+        train_cmd="python -u -m paddle.distributed.launch --gpus=0,1,2,3,4,5,6,7 \
+            --nnodes 1 --nproc_per_node 8 \
+            --log_dir mylog run_pretrain_auto.py \
+            ./pretrain_config_${MODEL_TYPE}/pretrain-${MODEL_TYPE}.json"
+        ;;
+    N4C32) echo "Run with: device_num=${device_num} run_mode=${run_mode}"
+        train_cmd="python -u -m paddle.distributed.launch --gpus=0,1,2,3,4,5,6,7 \
+            --log_dir mylog run_pretrain_auto.py \
+            ./pretrain_config_${MODEL_TYPE}/pretrain-${MODEL_TYPE}.json"
+        ;;
+    *) echo "Run with: device_num=${device_num}, run_mode=${run_mode}"
+        train_cmd="python -u -m paddle.distributed.launch --gpus=0,1,2,3,4,5,6,7 \
+            --log_dir mylog run_pretrain_auto.py \
+            ./pretrain_config_${MODEL_TYPE}/pretrain-${MODEL_TYPE}.json"
+        ;;
+    esac
+    cd ../llm/auto_parallel/gpt-3
+    # rm -rf ./auto_config_${MODEL_TYPE}/*GBS*
+    # rm -rf ./auto_config_${MODEL_TYPE}/*auto_tuner.log
+    # rm -rf ./auto_config_${MODEL_TYPE}/*csv
+    # rm -rf ./auto_config_${MODEL_TYPE}/best_*
+    rm -rf mylog && rm -rf checkpoints
+
+    echo "train_cmd: ${train_cmd} log_file: ${log_file}"
+    timeout 40m ${train_cmd} > ${log_file} 2>&1 &
+    training_pid=$!  # PID of the background training process
+
+    # Monitor the training process and log updates.
+    monitor_log_file "$log_file" "$training_pid" &
+    monitor_log_file_pid=$!  # PID of the log-monitoring process
+
+    # Wait for the training process to finish.
+    wait $training_pid
+    exit_code=$?
+
+    # Report the exit code of the training process.
+    echo "Exit code of training process $training_pid is $exit_code"
+
+    # Clean up the background log-monitoring process.
+    kill $monitor_log_file_pid
+
+    if [ ${exit_code} -ne 0 ];then
+        echo -e "${model_name}, FAIL"
+        # If the program exited with an error on its own (rather than being
+        # killed by monitor_log_file), wait for the other nodes to be killed.
+        # Location of the kill flag file:
+        kill_flag_file="/tmp/monitor_killed_$training_pid"
+        if [ -f "$kill_flag_file" ]; then
+            echo "$(date '+%Y-%m-%d %H:%M:%S') Training process $training_pid was killed by monitor_log_file."
+            rm -f "$kill_flag_file"  # remove the flag file
+        else
+            echo "$(date '+%Y-%m-%d %H:%M:%S') Training process $training_pid exited with an error on its own."
+            sleep 120
+        fi
+    else
+        echo -e "${model_name}, SUCCESS"
+    fi
+
+    #kill -9 `ps -ef|grep 'python'|awk '{print $2}'`
+    if [ ${device_num} != "N1C1" ]; then
+        case_path=$PWD && cd - && mkdir -p mylog  # PaddleNLP/tests/mylog
+        cp -r ${case_path}/mylog/workerlog.* ./mylog/
+    fi
+}
+
+export FLAGS_selected_gpus="0,1,2,3,4,5,6,7"
+export NCCL_IB_DISABLE=0
+export PYTHONPATH=$(dirname "$PWD"):$PYTHONPATH
+# Affected by the merge of https://github.com/PaddlePaddle/Paddle/pull/69410:
+# if this flag is not set to 1, the sharding stage1 variant without tensor
+# fusion is selected by default.
+export FLAGS_enable_sharding_stage1_tensor_fusion=1
+
+# Only the 13b tasks need CUDA_DEVICE_MAX_CONNECTIONS enabled; _train unsets it for the 7b and 70b tasks.
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+export PARALLEL_CROSS_ENTROPY=true
+
+source ${BENCHMARK_ROOT}/scripts/run_model.sh  # run_model.sh parses performance data from benchmark-style logs with analysis.py; comment this line out to produce only the training log without parsing, but re-enable it before submitting
+_set_params $@
+#_train  # uncomment to produce only the training log, without parsing
+_run  # _run is defined in run_model.sh and calls _train; comment it out to produce only the training log, but re-enable it before submitting
diff --git a/tests/test_tipc/static/auto_parallel/gpt3/pretrain_config_gpt3_13b/pretrain-gpt3_13b.json b/tests/test_tipc/static/auto_parallel/gpt3/pretrain_config_gpt3_13b/pretrain-gpt3_13b.json
new file mode 100644
index 000000000000..129f55a349e2
--- /dev/null
+++ b/tests/test_tipc/static/auto_parallel/gpt3/pretrain_config_gpt3_13b/pretrain-gpt3_13b.json
@@ -0,0 +1,58 @@
+{
+    "model_name_or_path": "gpt3-13B-en",
+    "tokenizer_name_or_path": "gpt3-13B-en",
+    "to_static": true,
+    "enable_auto_parallel": 1,
+    "input_dir": "./data",
+    "output_dir": "./checkpoints/gpt_pretrain_ckpts",
+    "split": "949,50,1",
+    "max_seq_length": 4096,
+    "per_device_train_batch_size": 1,
+    "per_device_eval_batch_size": 1,
+    "scale_loss": 1024,
+    "learning_rate": 0.00001,
+    "min_learning_rate": 0.000001,
+    "max_steps": 100,
+    "save_steps": 50000,
+    "weight_decay": 0.01,
+    "warmup_ratio": 0.01,
+    "logging_steps": 1,
"continue_training": 0, + "dataloader_num_workers": 4, + "eval_steps": 100000, + "report_to": "visualdl", + "disable_tqdm": true, + "do_train": true, + "do_eval": true, + "device": "gpu", + "model_type": "gpt", + "sharding": "stage1", + "tensor_parallel_degree": 2, + "pipeline_parallel_degree": 4, + "virtual_pp_degree": 2, + "pipeline_schedule_mode": "1F1B", + "virtual_pipeline_seg_method": "GPTDecoderLayerAuto", + "sequence_parallel": 0, + "use_flash_attention": 1, + "fused_linear": 1, + "fuse_attention_ffn": 1, + "fuse_attention_qkv": 1, + "fused_linear_param_grad_add": 1, + "use_fused_rope": true, + "use_fused_rms_norm": false, + "recompute": 0, + "recompute_use_reentrant": true, + "recompute_granularity": "full", + "pp_recompute_interval": 1, + "gradient_accumulation_steps": 32, + "max_grad_norm": 0.1, + "bf16": 1, + "fp16_opt_level": "O2", + "amp_master_grad": true, + "attention_probs_dropout_prob": 0.1, + "hidden_dropout_prob": 0.1, + "sharding_parallel_config": "enable_stage1_tensor_fusion enable_stage1_overlap", + "tensor_parallel_config": "enable_mp_async_allreduce", + "data_parallel_config": "enable_allreduce_avg_in_gradinent_scale gradient_sync_after_accumulate", + "pipeline_parallel_config": "enable_send_recv_overlap enable_split_backward" +} \ No newline at end of file diff --git a/tests/test_tipc/static/auto_parallel/qwen/N4C32/qwen-14b_pretrain_dy2st_bs32_bf16_DP1_MP2_PP4_1F1B_Sharding4_Stage1.sh b/tests/test_tipc/static/auto_parallel/qwen/N4C32/qwen-14b_pretrain_dy2st_bs32_bf16_DP1_MP2_PP4_1F1B_Sharding4_Stage1.sh new file mode 100644 index 000000000000..be88c3dcddb9 --- /dev/null +++ b/tests/test_tipc/static/auto_parallel/qwen/N4C32/qwen-14b_pretrain_dy2st_bs32_bf16_DP1_MP2_PP4_1F1B_Sharding4_Stage1.sh @@ -0,0 +1,25 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +param="model_item=qwen-2-14b_pretrain_dy2st " +param+="run_mode=DP1_MP2_PP4_1F1B_Sharding4_Stage1 " +param+="device_num=N4C32 " +param+="global_batch_size=32 " +param+="nnodes=4 " +param+="model_type=qwen_14b " + +cd ./tests +bash ./test_tipc/static/auto_parallel/qwen/benchmark_common/prepare.sh + +bash -c "${param} bash ./test_tipc/static/auto_parallel/qwen/benchmark_common/run_benchmark.sh" diff --git a/tests/test_tipc/static/auto_parallel/qwen/benchmark_common/prepare.sh b/tests/test_tipc/static/auto_parallel/qwen/benchmark_common/prepare.sh new file mode 100644 index 000000000000..7239135a96c5 --- /dev/null +++ b/tests/test_tipc/static/auto_parallel/qwen/benchmark_common/prepare.sh @@ -0,0 +1,36 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+python -m pip install -r ../requirements.txt
+python -m pip install -r ../requirements-dev.txt
+
+# install fused_ln custom ops
+cd ../slm/model_zoo/gpt-3/external_ops/
+python setup.py install
+cd -
+
+# enter the training directory and install fast_dataindex
+cd ../llm/auto_parallel/qwen
+python -m pip install fast_dataindex
+
+# download data
+wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_ids.npy
+wget https://bj.bcebos.com/paddlenlp/models/transformers/llama/data/llama_openwebtext_100k_idx.npz
+mkdir -p data
+mv llama_openwebtext_100k_ids.npy ./data
+mv llama_openwebtext_100k_idx.npz ./data
+
+# copy pretrain_config
+rm -rf pretrain_config_*
+cp -r ../../../tests/test_tipc/static/auto_parallel/qwen/pretrain_config_* ./
diff --git a/tests/test_tipc/static/auto_parallel/qwen/benchmark_common/run_benchmark.sh b/tests/test_tipc/static/auto_parallel/qwen/benchmark_common/run_benchmark.sh
new file mode 100644
index 000000000000..41471ef5b21f
--- /dev/null
+++ b/tests/test_tipc/static/auto_parallel/qwen/benchmark_common/run_benchmark.sh
@@ -0,0 +1,253 @@
+#!/usr/bin/env bash
+
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Test training benchmark for a model.
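+#
+# Usage: parameters are passed as environment variables rather than positional
+# arguments. A sketch of a typical invocation, mirroring the N4C32 entry
+# script (values are illustrative):
+#
+#   model_item=qwen-14b_pretrain_dy2st \
+#   run_mode=DP1_MP2_PP4_1F1B_Sharding4_Stage1 \
+#   device_num=N4C32 global_batch_size=32 nnodes=4 model_type=qwen_14b \
+#   bash ./test_tipc/static/auto_parallel/qwen/benchmark_common/run_benchmark.sh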
+function _set_params(){
+    model_item=${model_item:-"qwen-14b_pretrain"}
+    run_mode=${run_mode:-"MP2-PP1"}
+    device_num=${device_num:-"N1C8"}
+    global_batch_size=${global_batch_size:-64}
+    fp_item="bf16"
+    MODEL_TYPE=${model_type:-"qwen_14b"}
+
+    ip_lists=($(echo $TRAINER_INSTANCES | tr ',' ' '))
+    master_ip=${ip_lists[0]}
+    nnodes=${nnodes:-1}
+
+    base_batch_size=${global_batch_size}
+    profiling=${PROFILING:-"false"}          # (required) profiling switch, off by default, passed in as a global variable
+    model_repo="PaddleNLP"                   # (required) name of the model suite
+    speed_unit="tokens/s"                    # (required) unit of the speed metric
+    skip_steps=10                            # (required) number of initial unstable steps to skip when parsing the log
+    keyword="interval_tokens_per_second_per_device:" # (required) keyword marking the log lines that carry performance data
+    convergence_key="loss:"                  # (optional) keyword marking the log lines that carry convergence data, e.g. convergence_key="loss:"
+    model_mode=5                             # collect ips values and their unit; average only the steps after skip_steps and keep tokens/s as the unit
+
+    # Common setup below; usually no changes are needed.
+    model_name=${model_item}_bs${global_batch_size}_${fp_item}_${run_mode} # (required) do not change this format; it is aligned with competitor model names
+    device=${CUDA_VISIBLE_DEVICES//,/ }
+    arr=(${device})
+    num_gpu_devices=${#arr[*]}
+    run_log_path=${TRAIN_LOG_DIR:-$(pwd)}  # (required) TRAIN_LOG_DIR is set as a global variable by the benchmark framework
+    profiling_log_path=${PROFILING_LOG_DIR:-$(pwd)}  # (required) PROFILING_LOG_DIR is set as a global variable by the benchmark framework
+    speed_log_path=${LOG_PATH_INDEX_DIR:-$(pwd)}
+    train_log_file=${run_log_path}/${model_repo}_${model_name}_${device_num}_log
+    mkdir -p $(dirname ${train_log_file})
+
+    profiling_log_file=${profiling_log_path}/${model_repo}_${model_name}_${device_num}_profiling
+    mkdir -p $(dirname ${profiling_log_file})
+
+    speed_log_file=${speed_log_path}/${model_repo}_${model_name}_${device_num}_speed
+    mkdir -p $(dirname ${speed_log_file})
+
+    OUTPUT_PATH=${run_log_path}/output
+}
+
+# Periodically monitor the log file for new writes and check the training process status.
+monitor_log_file() {
+    local log_file="$1"        # path of the log file
+    local training_pid="$2"    # PID of the training process
+    local no_update_duration=0 # seconds elapsed without log updates
+    local last_size=0
+    local kill_flag_file="/tmp/monitor_killed_$training_pid"
+
+    echo "$(date '+%Y-%m-%d %H:%M:%S') Start monitoring process $training_pid and log file $log_file..."
+
+    while true; do
+        sleep 5  # check the log file every 5 seconds
+
+        # Check whether the log file exists.
+        if [ ! -f "$log_file" ]; then
+            echo "Log file $log_file does not exist; checking process status..."
+            # If the log file is missing, just check whether the process has finished.
+            if ! ps -p $training_pid > /dev/null; then
+                echo "$(date '+%Y-%m-%d %H:%M:%S') Process $training_pid has finished."
+                break
+            fi
+            continue  # skip the rest of the loop body while the file is missing
+        fi
+
+        # Get the current size of the log file.
+        new_size=$(stat -c %s "$log_file")
+
+        if [ "$last_size" -eq "$new_size" ]; then
+            # File size unchanged; accumulate the no-update duration.
+            no_update_duration=$((no_update_duration + 5))
+            echo "$(date '+%Y-%m-%d %H:%M:%S') No new writes to the log file..."
+            if [ "$no_update_duration" -ge 180 ]; then
+                echo "$(date '+%Y-%m-%d %H:%M:%S') No writes to the log file in the past 3 minutes; about to kill process $training_pid."
+                # Create the kill flag file.
+                touch "$kill_flag_file"
+                ls -l "$kill_flag_file"
+                kill -9 $training_pid  # kill the training process
+                echo "$(date '+%Y-%m-%d %H:%M:%S') Process $training_pid has been killed."
+                break
+            fi
+        else
+            # File size changed; reset the no-update duration.
+            echo "$(date '+%Y-%m-%d %H:%M:%S') Log file is still being written..."
+            no_update_duration=0
+            last_size=$new_size
+        fi
+
+        # Exit monitoring once the training process has finished.
+        if ! ps -p $training_pid > /dev/null; then
+            echo "$(date '+%Y-%m-%d %H:%M:%S') Process $training_pid has finished."
+            break
+        fi
+    done
+}
+
+function _train(){
+    batch_size=${per_device_train_batch_size}  # for multi-GPU single-process runs, compute the effective multi-GPU batch size inside _train
+
+    if [ -d $OUTPUT_PATH ]; then
+        rm -rf $OUTPUT_PATH
+    fi
+    mkdir $OUTPUT_PATH
+
+    echo "current CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES}, model_name=${model_name}, device_num=${device_num}, is profiling=${profiling}"
+
+    if [ ${profiling} == "true" ];then
+        add_options="--profiler_options=\"batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile\""
+        log_file=${profiling_log_file}
+    else
+        add_options=""
+        log_file=${train_log_file}
+    fi
+
+    # 70b and 7b need this switch turned off.
+    if [[ "${MODEL_TYPE}" =~ "70b" || "${MODEL_TYPE}" =~ "7b" ]]; then
+        unset CUDA_DEVICE_MAX_CONNECTIONS
+    fi
+    # Disabled due to a hanging bug
+    # if [ "${tensor_parallel_degree}" != "1" ]; then
+    #     export CUDA_DEVICE_MAX_CONNECTIONS=1
+    # fi
+
+    # if [ ${run_mode} == "autotuner" ]; then
+    #     unset PADDLE_ELASTIC_JOB_ID
+    #     unset PADDLE_TRAINER_ENDPOINTS
+    #     unset DISTRIBUTED_TRAINER_ENDPOINTS
+    #     unset FLAGS_START_PORT
+    #     unset PADDLE_ELASTIC_TIMEOUT
+    #     unset PADDLE_TRAINERS_NUM
+    #     unset PADDLE_TRAINER_ID
+    #     autoconfig_args="--auto_tuner_json ./auto_config_${MODEL_TYPE}/${MODEL_TYPE}_pretrain_autoconfig.json"
+    # else
+    #     autoconfig_args=""
+    # fi
+
+    if [ ${PADDLE_TRAINER_ID} ]; then
+        PADDLE_RANK_OPTION=" --rank ${PADDLE_TRAINER_ID}"
+    else
+        PADDLE_RANK_OPTION=""
+    fi
+
+    # if [ "$autoconfig_args" != "" ]; then
+    #     distributed_args="--master etcd://$master_ip:2379 --nnodes $nnodes:$nnodes"
+    # else
+    #     distributed_args="--master $master_ip:36677 --nnodes $nnodes ${PADDLE_RANK_OPTION} --run_mode=collective"
+    # fi
+
+    echo "==========System Env============="
+    env
+    echo "================================="
+
+    # Common launch commands below; usually no changes are needed.
+    case ${device_num} in
+    N1C8) echo "Run with: device_num=${device_num}, run_mode=${run_mode}"
+        train_cmd="python -u -m paddle.distributed.launch --gpus=0,1,2,3,4,5,6,7 \
+            --nnodes 1 --nproc_per_node 8 \
+            --log_dir mylog run_pretrain_auto.py \
+            ./pretrain_config_${MODEL_TYPE}/pretrain-${MODEL_TYPE}.json"
+        ;;
+    N4C32) echo "Run with: device_num=${device_num} run_mode=${run_mode}"
+        train_cmd="python -u -m paddle.distributed.launch --gpus=0,1,2,3,4,5,6,7 \
+            --log_dir mylog run_pretrain_auto.py \
+            ./pretrain_config_${MODEL_TYPE}/pretrain-${MODEL_TYPE}.json"
+        ;;
+    *) echo "Run with: device_num=${device_num}, run_mode=${run_mode}"
+        train_cmd="python -u -m paddle.distributed.launch --gpus=0,1,2,3,4,5,6,7 \
+            --log_dir mylog run_pretrain_auto.py \
+            ./pretrain_config_${MODEL_TYPE}/pretrain-${MODEL_TYPE}.json"
+        ;;
+    esac
+    cd ../llm/auto_parallel/qwen
+    # rm -rf ./auto_config_${MODEL_TYPE}/*GBS*
+    # rm -rf ./auto_config_${MODEL_TYPE}/*auto_tuner.log
+    # rm -rf ./auto_config_${MODEL_TYPE}/*csv
+    # rm -rf ./auto_config_${MODEL_TYPE}/best_*
+    rm -rf mylog && rm -rf checkpoints
+
+    echo "train_cmd: ${train_cmd} log_file: ${log_file}"
+    timeout 40m ${train_cmd} > ${log_file} 2>&1 &
+    training_pid=$!  # PID of the background training process
+
+    # Monitor the training process and log updates.
+    monitor_log_file "$log_file" "$training_pid" &
+    monitor_log_file_pid=$!  # PID of the log-monitoring process
+
+    # Wait for the training process to finish.
+    wait $training_pid
+    exit_code=$?
+
+    # Report the exit code of the training process.
+    echo "Exit code of training process $training_pid is $exit_code"
+
+    # Clean up the background log-monitoring process.
+    kill $monitor_log_file_pid
+
+    if [ ${exit_code} -ne 0 ];then
+        echo -e "${model_name}, FAIL"
+        # If the program exited with an error on its own (rather than being
+        # killed by monitor_log_file), wait for the other nodes to be killed.
+        # Location of the kill flag file:
+        kill_flag_file="/tmp/monitor_killed_$training_pid"
+        if [ -f "$kill_flag_file" ]; then
+            echo "$(date '+%Y-%m-%d %H:%M:%S') Training process $training_pid was killed by monitor_log_file."
+            rm -f "$kill_flag_file"  # remove the flag file
+        else
+            echo "$(date '+%Y-%m-%d %H:%M:%S') Training process $training_pid exited with an error on its own."
+            sleep 120
+        fi
+    else
+        echo -e "${model_name}, SUCCESS"
+    fi
+
+    #kill -9 `ps -ef|grep 'python'|awk '{print $2}'`
+    if [ ${device_num} != "N1C1" ]; then
+        case_path=$PWD && cd - && mkdir -p mylog  # PaddleNLP/tests/mylog
+        cp -r ${case_path}/mylog/workerlog.* ./mylog/
+    fi
+}
+
+export FLAGS_selected_gpus="0,1,2,3,4,5,6,7"
+export NCCL_IB_DISABLE=0
+export PYTHONPATH=$(dirname "$PWD"):$PYTHONPATH
+# Affected by the merge of https://github.com/PaddlePaddle/Paddle/pull/69410:
+# if this flag is not set to 1, the sharding stage1 variant without tensor
+# fusion is selected by default.
+export FLAGS_enable_sharding_stage1_tensor_fusion=1
+
+# Only the 13b tasks need CUDA_DEVICE_MAX_CONNECTIONS enabled; _train unsets it for the 7b and 70b tasks.
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+export PARALLEL_CROSS_ENTROPY=true
+
+source ${BENCHMARK_ROOT}/scripts/run_model.sh  # run_model.sh parses performance data from benchmark-style logs with analysis.py; comment this line out to produce only the training log without parsing, but re-enable it before submitting
+_set_params $@
+#_train  # uncomment to produce only the training log, without parsing
+_run  # _run is defined in run_model.sh and calls _train; comment it out to produce only the training log, but re-enable it before submitting
diff --git a/tests/test_tipc/static/auto_parallel/qwen/pretrain_config_qwen_14b/pretrain-qwen_14b.json b/tests/test_tipc/static/auto_parallel/qwen/pretrain_config_qwen_14b/pretrain-qwen_14b.json
new file mode 100644
index 000000000000..9e8bacb911af
--- /dev/null
+++ b/tests/test_tipc/static/auto_parallel/qwen/pretrain_config_qwen_14b/pretrain-qwen_14b.json
@@ -0,0 +1,55 @@
+{
+    "model_name_or_path": "qwen/qwen-14b",
+    "tokenizer_name_or_path": "qwen/qwen-14b",
+    "input_dir": "./data",
+    "output_dir": "./checkpoints/qwen_pretrain_ckpts",
+    "per_device_train_batch_size": 1,
+    "gradient_accumulation_steps": 32,
+    "per_device_eval_batch_size": 16,
+    "sharding": "stage1",
+    "tensor_parallel_degree": 2,
+    "pipeline_parallel_degree": 4,
+    "virtual_pp_degree": 5,
+    "pipeline_schedule_mode": "1F1B",
+    "virtual_pipeline_seg_method": "QWenBlockAuto",
+    "use_flash_attention": true,
+    "use_fused_rms_norm": false,
+    "use_fused_rope": true,
+    "fused_linear": 1,
+    "fuse_attention_ffn": 1,
+    "fuse_attention_qkv": 1,
+    "fused_linear_param_grad_add": 1,
+    "max_seq_length": 4096,
+    "learning_rate": 0.00003,
+    "min_learning_rate": 0.000003,
+    "scale_loss": 1024,
+    "warmup_steps": 30,
+    "logging_steps": 1,
+    "max_steps": 100,
+    "save_steps": 1000,
+    "eval_steps": 10000,
+    "weight_decay": 0.01,
+    "bf16": true,
+    "fp16_opt_level": "O2",
+    "amp_master_grad": true,
+    "warmup_ratio": 0.01,
+    "max_grad_norm": 0.0,
+    "dataloader_num_workers": 4,
+    "continue_training": 0,
+    "do_train": true,
+    "do_eval": false,
+    "do_predict": false,
+    "disable_tqdm": true,
+    "recompute": false,
+    "recompute_granularity": "core_attn",
+    "recompute_use_reentrant": true,
+    "distributed_dataloader": 0,
+    "save_total_limit": 2,
+    "enable_auto_parallel": 1,
+    "to_static": 1,
+    "auto_parallel_resume_form_hybrid_parallel": true,
+    "data_parallel_config": "enable_allreduce_avg_in_gradinent_scale gradient_sync_after_accumulate",
+    "sharding_parallel_config": "enable_stage1_overlap",
+    "tensor_parallel_config": "enable_mp_async_allreduce",
+    "pipeline_parallel_config": "enable_send_recv_overlap enable_split_backward"
enable_split_backward" +} \ No newline at end of file