From e05170505dd13597ca3fa8dbe65a6a698a8b1ea4 Mon Sep 17 00:00:00 2001
From: XieYunshen <1084314248@qq.com>
Date: Tue, 17 Dec 2024 19:20:43 +0800
Subject: [PATCH 1/2] Fix handling of abnormal exits in multi-machine benchmark tasks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../llama2/benchmark_common/run_benchmark.sh | 88 ++++++++++++++++++-
 1 file changed, 85 insertions(+), 3 deletions(-)

diff --git a/tests/test_tipc/dygraph/hybrid_parallelism/llama2/benchmark_common/run_benchmark.sh b/tests/test_tipc/dygraph/hybrid_parallelism/llama2/benchmark_common/run_benchmark.sh
index 9afb2a0902c8..c45ea93451c5 100644
--- a/tests/test_tipc/dygraph/hybrid_parallelism/llama2/benchmark_common/run_benchmark.sh
+++ b/tests/test_tipc/dygraph/hybrid_parallelism/llama2/benchmark_common/run_benchmark.sh
@@ -57,6 +57,61 @@ function _set_params(){
     OUTPUT_PATH=${run_log_path}/output
 }
 
+# Monitor the log file for write activity and watch the training process
+monitor_log_file() {
+    local log_file="$1"          # path to the training log file
+    local training_pid="$2"      # PID of the training process
+    local no_update_duration=0   # seconds since the log was last written
+    local last_size=0
+    local kill_flag_file="/tmp/monitor_killed_$training_pid"
+
+    echo "$(date '+%Y-%m-%d %H:%M:%S') Start monitoring process $training_pid and log file $log_file..."
+
+    while true; do
+        sleep 5  # poll the log file every 5 seconds
+
+        # Check whether the log file exists yet
+        if [ ! -f "$log_file" ]; then
+            echo "Log file $log_file does not exist; checking process status..."
+            # Without a log file, just check whether the process has exited
+            if ! ps -p $training_pid > /dev/null; then
+                echo "$(date '+%Y-%m-%d %H:%M:%S') Process $training_pid has exited."
+                break
+            fi
+            continue  # skip the size check and keep polling
+        fi
+
+        # Get the current size of the log file
+        new_size=$(stat -c %s "$log_file")
+
+        if [ "$last_size" -eq "$new_size" ]; then
+            # Size unchanged: accumulate idle time
+            no_update_duration=$((no_update_duration + 5))
+            echo "$(date '+%Y-%m-%d %H:%M:%S') Log file is not being written..."
+            if [ "$no_update_duration" -ge 180 ]; then
+                echo "$(date '+%Y-%m-%d %H:%M:%S') No writes to the log file in the last 3 minutes; killing process $training_pid."
+                # Create the kill flag file
+                touch "$kill_flag_file"
+                ls -l "$kill_flag_file"
+                kill -9 $training_pid  # force-kill the stalled training process
+                echo "$(date '+%Y-%m-%d %H:%M:%S') Process $training_pid has been killed."
+                break
+            fi
+        else
+            # Size changed: reset the idle counter
+            echo "$(date '+%Y-%m-%d %H:%M:%S') Log file is still being written..."
+            no_update_duration=0
+            last_size=$new_size
+        fi
+
+        # Stop monitoring once the training process has exited
+        if ! ps -p $training_pid > /dev/null; then
+            echo "$(date '+%Y-%m-%d %H:%M:%S') Process $training_pid has exited."
+            break
+        fi
+    done
+}
+
 function _train(){
     batch_size=${per_device_train_batch_size} # for multi-card single-process runs, compute the multi-card batch size here in _train
 
@@ -134,16 +189,43 @@ function _train(){
     rm -rf ./auto_config_${MODEL_TYPE}/*csv
     rm -rf ./auto_config_${MODEL_TYPE}/best_*
     rm -rf mylog && rm -rf checkpoints
-
+
     echo "train_cmd: ${train_cmd} log_file: ${log_file}"
-    timeout 15m ${train_cmd} > ${log_file} 2>&1
+    timeout 40m ${train_cmd} > ${log_file} 2>&1 &
+    training_pid=$!  # PID of the background training process
+
+    # Monitor the training process and the log file's update status
+    monitor_log_file "$log_file" "$training_pid" &
+    monitor_log_file_pid=$!  # PID of the log monitor
 
-    if [ $? -ne 0 ];then
+    # Wait for the training process to finish
+    wait $training_pid
+    exit_code=$?
+
+    # Report the training process exit code
+    echo "Exit code of training process $training_pid is $exit_code"
+
+    # Clean up the background log monitor
+    kill $monitor_log_file_pid
+
+
+    if [ ${exit_code} -ne 0 ];then
         echo -e "${model_name}, FAIL"
+        # If the job errored out on its own (not killed by monitor_log_file), wait for the other machines to be killed
+        # location of the kill flag file
+        kill_flag_file="/tmp/monitor_killed_$training_pid"
+        if [ -f "$kill_flag_file" ]; then
+            echo "$(date '+%Y-%m-%d %H:%M:%S') Training process $training_pid was killed by monitor_log_file."
+            rm -f "$kill_flag_file"  # remove the flag file
+        else
+            echo "$(date '+%Y-%m-%d %H:%M:%S') Training process $training_pid exited with an error on its own."
+            sleep 120
+        fi
     else
         echo -e "${model_name}, SUCCESS"
     fi
+
     #kill -9 `ps -ef|grep 'python'|awk '{print $2}'`
     if [ ${device_num} != "N1C1" -a -d ./auto_config_${MODEL_TYPE}/best_cfg ]; then
         case_path=$PWD && cd - && mkdir -p mylog # PaddleNLP/tests/mylog

From dfe3634eac6a8c5c1ed4253e25ea7cd74edb684e Mon Sep 17 00:00:00 2001
From: XieYunshen <1084314248@qq.com>
Date: Tue, 17 Dec 2024 19:38:36 +0800
Subject: [PATCH 2/2] Fix virtual_pp_degree in the llama2_70b auto-tuner config

---
 .../auto_config_llama2_70b/pretrain-llama2_70b-auto_tuner.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_tipc/dygraph/hybrid_parallelism/llama2/auto_config_llama2_70b/pretrain-llama2_70b-auto_tuner.json b/tests/test_tipc/dygraph/hybrid_parallelism/llama2/auto_config_llama2_70b/pretrain-llama2_70b-auto_tuner.json
index 38b10cc6d665..3b63d696e8fb 100644
--- a/tests/test_tipc/dygraph/hybrid_parallelism/llama2/auto_config_llama2_70b/pretrain-llama2_70b-auto_tuner.json
+++ b/tests/test_tipc/dygraph/hybrid_parallelism/llama2/auto_config_llama2_70b/pretrain-llama2_70b-auto_tuner.json
@@ -12,7 +12,7 @@
     "sharding_parallel_config": "split_param enable_stage1_overlap",
     "tensor_parallel_config": "enable_delay_scale_loss enable_mp_async_allreduce enable_sp_async_reduce_scatter enable_mp_skip_c_identity enable_mp_fused_linear_param_grad_add",
     "pipeline_parallel_config": "enable_delay_scale_loss enable_release_grads disable_partial_send_recv enable_overlap_p2p_comm",
-    "virtual_pp_degree": 5,
+    "virtual_pp_degree": 10,
     "sequence_parallel": 1,
    "use_flash_attention": true,
    "use_fused_rms_norm": true,
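
Note (not part of the patches): monitor_log_file can be smoke-tested outside the benchmark
harness. The sketch below is a hypothetical test; the fake "training" writer and the
/tmp/fake_train.log path are made-up names, and it assumes the monitor_log_file definition
from run_benchmark.sh has been pasted or sourced into the current shell. Because the idle
threshold is hardcoded to 180 seconds, the test takes a little over three minutes to run.

    #!/usr/bin/env bash
    # Hypothetical smoke test for monitor_log_file (illustrative, not part of this patch).
    # Assumes the monitor_log_file function from run_benchmark.sh is defined in this shell.

    log=/tmp/fake_train.log
    rm -f "$log"

    # Fake "training": write one line per second for 10 seconds, then stall silently.
    # exec keeps the subshell's PID on the sleep, so kill -9 actually stops it.
    ( for i in $(seq 10); do echo "step $i" >> "$log"; sleep 1; done; exec sleep 600 ) &
    train_pid=$!

    # Runs in the foreground and returns once it has killed the stalled writer.
    monitor_log_file "$log" "$train_pid"

    # The monitor touches a flag file before killing, so a caller can tell a
    # watchdog kill apart from an ordinary training failure.
    if [ -f "/tmp/monitor_killed_$train_pid" ]; then
        echo "OK: stalled process was killed by the monitor"
        rm -f "/tmp/monitor_killed_$train_pid"
    fi

This mirrors how run_benchmark.sh itself distinguishes the two failure modes: a kill flag
present means the watchdog fired; absent means the training job raised its own error, in
which case the script sleeps 120 seconds so peer machines in a multi-node job are killed first.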