NOAA-EMC · TerrenceMcGuinness-NOAA · Nov 8, 2023 · Nov 8, 2023 · Nov 8, 2023 · Nov 9, 2023
@@ -85,8 +85,8 @@ for pr in ${pr_list}; do
   # shellcheck disable=SC2312
   if [[ -z $(ls -A "${pr_dir}/RUNTESTS/EXPDIR") ]] ; then
     "${GH}" pr edit --repo "${REPO_URL}" "${pr}" --remove-label "CI-${MACHINE_ID^}-Running" --add-label "CI-${MACHINE_ID^}-Passed"
+    sed -i "1 i\All CI Test Cases Passed on ${MACHINE_ID^}" "${output_ci}"
     sed -i "1 i\`\`\`" "${output_ci}"
-    sed -i "1 i\All CI Test Cases Passed:" "${output_ci}"
     "${GH}" pr comment "${pr}" --repo "${REPO_URL}" --body-file "${output_ci}"
     "${ROOT_DIR}/ci/scripts/pr_list_database.py" --remove_pr "${pr}" --dbfile "${pr_list_dbfile}"
     # Check to see if this PR that was opened by the weekly tests and if so close it if it passed on all platforms
@@ -131,7 +131,7 @@ for pr in ${pr_list}; do
       "${GH}" pr edit --repo "${REPO_URL}" "${pr}" --remove-label "CI-${MACHINE_ID^}-Running" --add-label "CI-${MACHINE_ID^}-Failed"
       error_logs=$("${rocotostat}" -d "${db}" -w "${xml}" | grep -E 'FAIL|DEAD' | awk '{print "-c", $1, "-t", $2}' | xargs "${rocotocheck}" -d "${db}" -w "${xml}" | grep join | awk '{print $2}') || true
       {
-       echo "Experiment ${pslot} Terminated: *** FAILED ***"
+       echo "Experiment ${pslot} Terminated: *** FAILED *** on ${MACHINE_ID^}"
        echo "Experiment ${pslot} Terminated with ${num_failed} tasks failed at $(date)" || true
        echo "Error logs:"
        echo "${error_logs}"
@@ -152,8 +152,8 @@ for pr in ${pr_list}; do
       rm -f "${output_ci_single}"
       # echo "\`\`\`" > "${output_ci_single}"
       DATE=$(date)
-      echo "Experiment ${pslot} **SUCCESS** ${DATE}" >> "${output_ci_single}"
-      echo "Experiment ${pslot} **SUCCESS** at ${DATE}" >> "${output_ci}"
+      echo "Experiment ${pslot} **SUCCESS** ${DATE +%Y%m%d} on ${MACHINE_ID^}" >> "${output_ci_single}"
+      echo "Experiment ${pslot} **SUCCESS** at ${DATE +%Y%m%d} on ${MACHIND_ID^}" >> "${output_ci}"
       "${GH}" pr comment "${pr}" --repo "${REPO_URL}" --body-file "${output_ci_single}"
 
     fi

@@ -159,7 +159,7 @@ for pr in ${pr_list}; do
       set +e
       export LOGFILE_PATH="${HOMEgfs}/ci/scripts/create_experiment.log"
       rm -f "${LOGFILE_PATH}"
-      "${HOMEgfs}/workflow/create_experiment.py" --yaml "${HOMEgfs}/ci/cases/pr/${case}.yaml" 2>&1 "${LOGFILE_PATH}"
+      "${HOMEgfs}/workflow/create_experiment.py" --yaml "${HOMEgfs}/ci/cases/pr/${case}.yaml" > "${LOGFILE_PATH}" 2>&1
       ci_status=$?
       set -e
       if [[ ${ci_status} -eq 0 ]]; then
@@ -174,8 +174,7 @@ for pr in ${pr_list}; do
         } >> "${output_ci}"
       else
         {
-          echo "*** Failed *** to create experiment: ${pslot}"
-          echo ""
+          echo "*** Failed *** to create experiment: ${pslot} on ${MACHINE_ID^} for PR #${pr}"
           cat "${LOGFILE_PATH}"
         } >> "${output_ci}"
         "${GH}" pr edit "${pr}" --repo "${REPO_URL}" --remove-label "CI-${MACHINE_ID^}-Building" --add-label "CI-${MACHINE_ID^}-Failed"

@@ -13,6 +13,8 @@
 scriptname=$(basename "${BASH_SOURCE[0]}")
 echo "Begin ${scriptname} at $(date -u)" || true
 export PS4='+ $(basename ${BASH_SOURCE})[${LINENO}]'
+GH=${HOME}/bin/gh
+REPO_URL="https://github.com/NOAA-EMC/global-workflow.git"
 
 #########################################################################
 #  Set up runtime environment varibles for accounts on supproted machines
@@ -81,7 +83,20 @@
     pslot=$(basename "${pslot_dir}")
     xml="${pslot_dir}/${pslot}.xml"
     db="${pslot_dir}/${pslot}.db"
-    echo "Running: ${rocotorun} -v 10 -w ${xml} -d ${db}"
-    "${rocotorun}" -v 10 -w "${xml}" -d "${db}"
+    set +e
+    "${ROOT_DIR}/ci/scripts/utils/rocoto_statcount.py" -d "${db}" -w "${xml}" --check_stalled
+    rc=$?
+    if [[ "${rc}" -ne 0 ]]; then
+      output_ci="${pr_dir}/output_runtime_single.log"
+      {
+        echo "${pslot} has *** STALLED **** on ${MACHINE_ID^}"
+        echo "A job in experiment ${pslot} in ${pslot_dir}"
+        echo "may have dependencies that are not being met"
+      } >> "${output_ci}"
+      sed -i "1 i\`\`\`" "${output_ci}"
+      "${GH}" pr comment "${pr}" --repo "${REPO_URL}" --body-file "${output_ci}"
+      "${GH}" pr edit --repo "${REPO_URL}" "${pr}" --remove-label "CI-${MACHINE_ID^}-Running" --add-label "CI-${MACHINE_ID^}-Failed"
+      "${ROOT_DIR}/ci/scripts/pr_list_database.py" --remove_pr "${pr}" --dbfile "${pr_list_dbfile}"
+    fi
   done
 done
@@ -0,0 +1,104 @@
+#!/usr/bin/env python3
+
+import sys
+import os
+
+from wxflow import Executable, which, Logger
+from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
+
+logger = Logger(level=os.environ.get("LOGGING_LEVEL", "DEBUG"), colored_log=False)
+
+
+def input_args():
+    """
+    Parse command-line arguments.
+
+    Returns
+    -------
+    args : Namespace
+        The parsed command-line arguments: ``w`` (workflow document path),
+        ``d`` (database file path) and ``check_stalled`` (boolean flag).
+    """
+
+    description = """
+        Using rocotostat that lists the status of all jobs this scripts
+        determines rocoto_state: if all cycles are done, then rocoto_state is Done.
+        If all cycles are not done, then rocoto_state is Running.
+        If the check_stalled is used then rocotorun is then issued and
+        rocotostat is run again and if all jobs do not advanced, then
+        rocoto_state is Stalled and the script exits with -1.
+        """
+
+    parser = ArgumentParser(description=description,
+                            formatter_class=ArgumentDefaultsHelpFormatter)
+
+    # Both paths are handed to os.path.abspath() by the code that consumes
+    # these arguments, so a missing value is rejected here with a usage
+    # message instead of surfacing later as a TypeError on None.
+    parser.add_argument('-w', help='workflow_document', type=str, required=True)
+    parser.add_argument('-d', help='database_file', type=str, required=True)
+    parser.add_argument('--check_stalled', help='check if any jobs do not advance (stalled)', action='store_true', required=False)
+
+    args = parser.parse_args()
+
+    return args
+
+def rocoto_statcount():
+    """
+    Run rocotostat and tally cycle and task states.
+
+    Command-line arguments are read through input_args(). rocotostat is
+    invoked twice on the same workflow/database pair: once with ``-s`` and
+    once with ``-a``.
+
+    Returns
+    -------
+    rocoto_status : dict
+        ``Cycles`` (number of lines kept from the ``-s`` output),
+        ``Done_Cycles`` (how many of those lines carry the token ``Done`` in
+        their first two columns) and, for every entry of ``status_cases``,
+        how often that token occurs in the first four columns of the ``-a``
+        output.
+    """
+
+    args = input_args()
+
+    rocotostat = which("rocotostat")
+    if not rocotostat:
+        # No exception is being handled here, so logger.exception() would
+        # append a meaningless "NoneType: None" traceback; log a plain error.
+        logger.error("rocotostat not found in PATH")
+        sys.exit(-1)
+
+    xml_file_path = os.path.abspath(args.w)
+    db_file_path = os.path.abspath(args.d)
+
+    # A second handle so each invocation carries its own default arguments.
+    rocotostat_all = which("rocotostat")
+    rocotostat.add_default_arg(['-w', xml_file_path, '-d', db_file_path, '-s'])
+    rocotostat_all.add_default_arg(['-w', xml_file_path, '-d', db_file_path, '-a'])
+
+    # The first output line is skipped (presumably a column header -- confirm
+    # against rocotostat output) and only the leading columns are kept.
+    cycle_lines = rocotostat(output=str).splitlines()[1:]
+    rocotostat_output = [line.split()[0:2] for line in cycle_lines]
+
+    task_lines = rocotostat_all(output=str).splitlines()[1:]
+    rocotostat_output_all = [line.split()[0:4] for line in task_lines]
+    # Lines that split into a single token are not counted.
+    rocotostat_output_all = [fields for fields in rocotostat_output_all if len(fields) != 1]
+
+    rocoto_status = {
+        'Cycles': len(rocotostat_output),
+        'Done_Cycles': sum(fields.count('Done') for fields in rocotostat_output)
+    }
+
+    # NOTE(review): list.count() matches whole tokens only -- confirm that
+    # rocotostat prints exactly these state names (e.g. 'FAIL' rather than
+    # 'FAILED') before relying on the per-state counts.
+    status_cases = ['SUCCEEDED', 'FAIL', 'DEAD', 'RUNNING', 'PENDING', 'QUEUED']
+    for case in status_cases:
+        rocoto_status[case] = sum(fields.count(case) for fields in rocotostat_output_all)
+
+    return rocoto_status
+
+if __name__ == '__main__':
+    # Print the per-state counts and the overall Rocoto state. With
+    # --check_stalled, rocotorun is issued and the counts are taken again;
+    # the script exits with -1 when the experiment is judged stalled.
+
+    args = input_args()
+
+    rocoto_status = rocoto_statcount()
+    for status in rocoto_status:
+        print(f'Number of {status} : {rocoto_status[status]}')
+    rocoto_state = 'Running'
+    if rocoto_status['Cycles'] == rocoto_status['Done_Cycles']:
+        rocoto_state = 'Done'
+
+    if args.check_stalled and rocoto_state != 'Done':
+        rocoto_run = which("rocotorun")
+        if not rocoto_run:
+            # Fail with a clear message instead of an AttributeError on None.
+            logger.error("rocotorun not found in PATH")
+            sys.exit(-1)
+        rocoto_run.add_default_arg(['-w', args.w, '-d', args.d])
+        rocoto_run()
+        rocoto_status2 = rocoto_statcount()
+        # Unchanged counts alone do not mean a stall: a job that is still
+        # running or waiting in the queue yields identical counts on an
+        # immediate re-check. Only report a stall when nothing is active.
+        active_jobs = rocoto_status2['RUNNING'] + rocoto_status2['QUEUED']
+        if rocoto_status2 == rocoto_status and active_jobs == 0:
+            rocoto_state = 'Stalled'
+            print(f'Rocoto State : {rocoto_state}')
+            sys.exit(-1)
+    print(f'Rocoto State : {rocoto_state}')