From e05170505dd13597ca3fa8dbe65a6a698a8b1ea4 Mon Sep 17 00:00:00 2001
From: XieYunshen <1084314248@qq.com>
Date: Tue, 17 Dec 2024 19:20:43 +0800
Subject: [PATCH 1/2] Fix handling of abnormal exits in multi-machine benchmark tasks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../llama2/benchmark_common/run_benchmark.sh | 88 ++++++++++++++++++-
 1 file changed, 85 insertions(+), 3 deletions(-)

diff --git a/tests/test_tipc/dygraph/hybrid_parallelism/llama2/benchmark_common/run_benchmark.sh b/tests/test_tipc/dygraph/hybrid_parallelism/llama2/benchmark_common/run_benchmark.sh
index 9afb2a0902c8..c45ea93451c5 100644
--- a/tests/test_tipc/dygraph/hybrid_parallelism/llama2/benchmark_common/run_benchmark.sh
+++ b/tests/test_tipc/dygraph/hybrid_parallelism/llama2/benchmark_common/run_benchmark.sh
@@ -57,6 +57,61 @@ function _set_params(){
     OUTPUT_PATH=${run_log_path}/output
 }
 
+# Monitor the log file for write activity and watch the training process
+monitor_log_file() {
+    local log_file="$1"          # path to the training log file
+    local training_pid="$2"      # PID of the training process
+    local no_update_duration=0   # seconds since the log was last written
+    local last_size=0
+    local kill_flag_file="/tmp/monitor_killed_$training_pid"
+
+    echo "$(date '+%Y-%m-%d %H:%M:%S') Start monitoring process $training_pid and log file $log_file..."
+
+    while true; do
+        sleep 5  # poll the log file every 5 seconds
+
+        # Check whether the log file exists yet
+        if [ ! -f "$log_file" ]; then
+            echo "Log file $log_file does not exist; checking process status..."
+            # Without a log file, just check whether the process has exited
+            if ! ps -p $training_pid > /dev/null; then
+                echo "$(date '+%Y-%m-%d %H:%M:%S') Process $training_pid has exited."
+                break
+            fi
+            continue  # skip the size check and keep polling
+        fi
+
+        # Get the current size of the log file
+        new_size=$(stat -c %s "$log_file")
+
+        if [ "$last_size" -eq "$new_size" ]; then
+            # Size unchanged: accumulate idle time
+            no_update_duration=$((no_update_duration + 5))
+            echo "$(date '+%Y-%m-%d %H:%M:%S') Log file is not being written..."
+            if [ "$no_update_duration" -ge 180 ]; then
+                echo "$(date '+%Y-%m-%d %H:%M:%S') No writes to the log file in the last 3 minutes; killing process $training_pid."
+                # Create the kill flag file
+                touch "$kill_flag_file"
+                ls -l "$kill_flag_file"
+                kill -9 $training_pid  # force-kill the stalled training process
+                echo "$(date '+%Y-%m-%d %H:%M:%S') Process $training_pid has been killed."
+                break
+            fi
+        else
+            # Size changed: reset the idle counter
+            echo "$(date '+%Y-%m-%d %H:%M:%S') Log file is still being written..."
+            no_update_duration=0
+            last_size=$new_size
+        fi
+
+        # Stop monitoring once the training process has exited
+        if ! ps -p $training_pid > /dev/null; then
+            echo "$(date '+%Y-%m-%d %H:%M:%S') Process $training_pid has exited."
+            break
+        fi
+    done
+}
+
 function _train(){
     batch_size=${per_device_train_batch_size} # for multi-card single-process runs, compute the multi-card batch size here in _train
 
@@ -134,16 +189,43 @@ function _train(){
     rm -rf ./auto_config_${MODEL_TYPE}/*csv
     rm -rf ./auto_config_${MODEL_TYPE}/best_*
     rm -rf mylog && rm -rf checkpoints
-
+
     echo "train_cmd: ${train_cmd} log_file: ${log_file}"
-    timeout 15m ${train_cmd} > ${log_file} 2>&1
+    timeout 40m ${train_cmd} > ${log_file} 2>&1 &
+    training_pid=$!  # PID of the background training process
+
+    # Monitor the training process and the log file's update status
+    monitor_log_file "$log_file" "$training_pid" &
+    monitor_log_file_pid=$!  # PID of the log monitor
 
-    if [ $? -ne 0 ];then
+    # Wait for the training process to finish
+    wait $training_pid
+    exit_code=$?
+
+    # Report the training process exit code
+    echo "Exit code of training process $training_pid is $exit_code"
+
+    # Clean up the background log monitor
+    kill $monitor_log_file_pid
+
+
+    if [ ${exit_code} -ne 0 ];then
         echo -e "${model_name}, FAIL"
+        # If the job errored out on its own (not killed by monitor_log_file), wait for the other machines to be killed
+        # location of the kill flag file
+        kill_flag_file="/tmp/monitor_killed_$training_pid"
+        if [ -f "$kill_flag_file" ]; then
+            echo "$(date '+%Y-%m-%d %H:%M:%S') Training process $training_pid was killed by monitor_log_file."
+            rm -f "$kill_flag_file"  # remove the flag file
+        else
+            echo "$(date '+%Y-%m-%d %H:%M:%S') Training process $training_pid exited with an error on its own."
+            sleep 120
+        fi
     else
         echo -e "${model_name}, SUCCESS"
     fi
+
     #kill -9 `ps -ef|grep 'python'|awk '{print $2}'`
     if [ ${device_num} != "N1C1" -a -d ./auto_config_${MODEL_TYPE}/best_cfg ]; then
         case_path=$PWD && cd - && mkdir -p mylog # PaddleNLP/tests/mylog

From dfe3634eac6a8c5c1ed4253e25ea7cd74edb684e Mon Sep 17 00:00:00 2001
From: XieYunshen <1084314248@qq.com>
Date: Tue, 17 Dec 2024 19:38:36 +0800
Subject: [PATCH 2/2] Fix virtual_pp_degree in the llama2_70b auto-tuner config

---
 .../auto_config_llama2_70b/pretrain-llama2_70b-auto_tuner.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_tipc/dygraph/hybrid_parallelism/llama2/auto_config_llama2_70b/pretrain-llama2_70b-auto_tuner.json b/tests/test_tipc/dygraph/hybrid_parallelism/llama2/auto_config_llama2_70b/pretrain-llama2_70b-auto_tuner.json
index 38b10cc6d665..3b63d696e8fb 100644
--- a/tests/test_tipc/dygraph/hybrid_parallelism/llama2/auto_config_llama2_70b/pretrain-llama2_70b-auto_tuner.json
+++ b/tests/test_tipc/dygraph/hybrid_parallelism/llama2/auto_config_llama2_70b/pretrain-llama2_70b-auto_tuner.json
@@ -12,7 +12,7 @@
     "sharding_parallel_config": "split_param enable_stage1_overlap",
     "tensor_parallel_config": "enable_delay_scale_loss enable_mp_async_allreduce enable_sp_async_reduce_scatter enable_mp_skip_c_identity enable_mp_fused_linear_param_grad_add",
     "pipeline_parallel_config": "enable_delay_scale_loss enable_release_grads disable_partial_send_recv enable_overlap_p2p_comm",
-    "virtual_pp_degree": 5,
+    "virtual_pp_degree": 10,
     "sequence_parallel": 1,
    "use_flash_attention": true,
    "use_fused_rms_norm": true,
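
Note (not part of the patches): monitor_log_file can be smoke-tested outside the benchmark
harness. The sketch below is a hypothetical test; the fake "training" writer and the
/tmp/fake_train.log path are made-up names, and it assumes the monitor_log_file definition
from run_benchmark.sh has been pasted or sourced into the current shell. Because the idle
threshold is hardcoded to 180 seconds, the test takes a little over three minutes to run.

    #!/usr/bin/env bash
    # Hypothetical smoke test for monitor_log_file (illustrative, not part of this patch).
    # Assumes the monitor_log_file function from run_benchmark.sh is defined in this shell.

    log=/tmp/fake_train.log
    rm -f "$log"

    # Fake "training": write one line per second for 10 seconds, then stall silently.
    # exec keeps the subshell's PID on the sleep, so kill -9 actually stops it.
    ( for i in $(seq 10); do echo "step $i" >> "$log"; sleep 1; done; exec sleep 600 ) &
    train_pid=$!

    # Runs in the foreground and returns once it has killed the stalled writer.
    monitor_log_file "$log" "$train_pid"

    # The monitor touches a flag file before killing, so a caller can tell a
    # watchdog kill apart from an ordinary training failure.
    if [ -f "/tmp/monitor_killed_$train_pid" ]; then
        echo "OK: stalled process was killed by the monitor"
        rm -f "/tmp/monitor_killed_$train_pid"
    fi

This mirrors how run_benchmark.sh itself distinguishes the two failure modes: a kill flag
present means the watchdog fired; absent means the training job raised its own error, in
which case the script sleeps 120 seconds so peer machines in a multi-node job are killed first.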