Robust CI Restarts (#2093)
Improved CI robustness for reverting to **CI-Ready** from any given state.

New Features:

- Improved `scancel` routine (refactored into a bash "subroutine" in
  `ci/scripts/utils/ci_utils.sh`; see the usage sketch below)
- Improved messaging (see below) whenever a user changes state
- All previous build scripts and running experiments are killed when a PR is
  reset to **Ready**

Resolves #2060
TerrenceMcGuinness-NOAA committed Dec 4, 2023
1 parent 73621e9 commit a286a11
Showing 5 changed files with 120 additions and 45 deletions.
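A minimal usage sketch of the new helper (not part of the commit itself; the checkout path is a placeholder, and the experiment name is the illustrative one from the helper's own comments):

```bash
#!/bin/env bash
# Sketch only: ROOT_DIR is assumed to point at a global-workflow checkout.
ROOT_DIR="/path/to/global-workflow"                # placeholder path
source "${ROOT_DIR}/ci/scripts/utils/ci_utils.sh"  # provides cancel_slurm_jobs()

# Cancel every Slurm job whose name contains this experiment's pslot substring
cancel_slurm_jobs "C48_ATM_3c4e7f74"
```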
2 changes: 1 addition & 1 deletion ci/platforms/config.orion
@@ -2,7 +2,7 @@

export GFS_CI_ROOT=/work2/noaa/stmp/GFS_CI_ROOT
export ICSDIR_ROOT=/work/noaa/global/glopara/data/ICSDIR
export STMP="/work/noaa/stmp/${USER}"
export STMP="/work2/noaa/stmp/${USER}"
export SLURM_ACCOUNT=nems
export max_concurrent_cases=5
export max_concurrent_pr=4
15 changes: 8 additions & 7 deletions ci/scripts/check_ci.sh
Expand Up @@ -33,6 +33,7 @@ case ${MACHINE_ID} in
esac
set +x
source "${ROOT_DIR}/ush/module-setup.sh"
source "${ROOT_DIR}/ci/scripts/utils/ci_utils.sh"
module use "${ROOT_DIR}/modulefiles"
module load "module_gwsetup.${MACHINE_ID}"
module list
@@ -86,7 +87,7 @@ for pr in ${pr_list}; do
if [[ -z $(ls -A "${pr_dir}/RUNTESTS/EXPDIR") ]] ; then
"${GH}" pr edit --repo "${REPO_URL}" "${pr}" --remove-label "CI-${MACHINE_ID^}-Running" --add-label "CI-${MACHINE_ID^}-Passed"
sed -i "1 i\`\`\`" "${output_ci}"
sed -i "1 i\All CI Test Cases Passed:" "${output_ci}"
sed -i "1 i\All CI Test Cases Passed on ${MACHINE_ID^}:" "${output_ci}"
"${GH}" pr comment "${pr}" --repo "${REPO_URL}" --body-file "${output_ci}"
"${ROOT_DIR}/ci/scripts/pr_list_database.py" --remove_pr "${pr}" --dbfile "${pr_list_dbfile}"
# Check to see if this PR was opened by the weekly tests and, if so, close it if it passed on all platforms
@@ -131,8 +132,8 @@ for pr in ${pr_list}; do
"${GH}" pr edit --repo "${REPO_URL}" "${pr}" --remove-label "CI-${MACHINE_ID^}-Running" --add-label "CI-${MACHINE_ID^}-Failed"
error_logs=$("${rocotostat}" -d "${db}" -w "${xml}" | grep -E 'FAIL|DEAD' | awk '{print "-c", $1, "-t", $2}' | xargs "${rocotocheck}" -d "${db}" -w "${xml}" | grep join | awk '{print $2}') || true
{
echo "Experiment ${pslot} Terminated: *** FAILED ***"
echo "Experiment ${pslot} Terminated with ${num_failed} tasks failed at $(date)" || true
echo "Experiment ${pslot} *** FAILED *** on ${MACHINE_ID^}"
echo "Experiment ${pslot} with ${num_failed} tasks failed at $(date +'%D %r')" || true
echo "Error logs:"
echo "${error_logs}"
} >> "${output_ci}"
@@ -141,7 +142,7 @@ for pr in ${pr_list}; do
"${ROOT_DIR}/ci/scripts/pr_list_database.py" --remove_pr "${pr}" --dbfile "${pr_list_dbfile}"
for kill_cases in "${pr_dir}/RUNTESTS/"*; do
pslot=$(basename "${kill_cases}")
sacct --format=jobid,jobname%35,WorkDir%100,stat | grep "${pslot}" | grep "PR\/${pr}\/RUNTESTS" | awk '{print $1}' | xargs scancel || true
cancel_slurm_jobs "${pslot}"
done
break
fi
@@ -151,9 +152,9 @@ for pr in ${pr_list}; do
rm -Rf "${pr_dir}/RUNTESTS/COMROT/${pslot}"
rm -f "${output_ci_single}"
# echo "\`\`\`" > "${output_ci_single}"
DATE=$(date)
echo "Experiment ${pslot} **SUCCESS** ${DATE}" >> "${output_ci_single}"
echo "Experiment ${pslot} **SUCCESS** at ${DATE}" >> "${output_ci}"
DATE=$(date +'%D %r')
echo "Experiment ${pslot} **SUCCESS** on ${MACHINE_ID^} at ${DATE}" >> "${output_ci_single}"
echo "Experiment ${pslot} *** SUCCESS *** at ${DATE}" >> "${output_ci}"
"${GH}" pr comment "${pr}" --repo "${REPO_URL}" --body-file "${output_ci_single}"

fi
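For reference, a hedged sketch of what the `error_logs` one-liner in check_ci.sh does, split into labeled steps. The column positions and the `join` field are assumptions about rocotostat/rocotocheck output rather than something this diff changes; `db`, `xml`, `rocotostat`, and `rocotocheck` are the variables already set in the script.

```bash
# Sketch only: same pipeline as above, one stage per line.
failed_tasks=$("${rocotostat}" -d "${db}" -w "${xml}" | grep -E 'FAIL|DEAD')  # failed or dead tasks
check_args=$(echo "${failed_tasks}" | awk '{print "-c", $1, "-t", $2}')       # column 1: cycle, column 2: task name
error_logs=$(echo "${check_args}" \
  | xargs "${rocotocheck}" -d "${db}" -w "${xml}" \
  | grep join | awk '{print $2}') || true                                     # the "join" line carries the task log path
echo "${error_logs}"
```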
20 changes: 13 additions & 7 deletions ci/scripts/clone-build_ci.sh
@@ -72,16 +72,17 @@ cd sorc || exit 1
set +e
./checkout.sh -c -g -u >> log.checkout 2>&1
checkout_status=$?
DATE=$(date +'%D %r')
if [[ ${checkout_status} != 0 ]]; then
{
echo "Checkout: *** FAILED ***"
echo "Checkout: Failed at $(date)" || true
echo "Checkout: Failed at ${DATE}"
echo "Checkout: see output at ${PWD}/log.checkout"
} >> "${outfile}"
exit "${checkout_status}"
else
{
echo "Checkout: Completed at $(date)" || true
echo "Checkout: Completed at ${DATE}"
} >> "${outfile}"
fi

@@ -92,25 +93,30 @@ rm -rf log.build
./build_all.sh >> log.build 2>&1
build_status=$?

DATE=$(date +'%D %r')
if [[ ${build_status} != 0 ]]; then
{
echo "Build: *** FAILED ***"
echo "Build: Failed at $(date)" || true
echo "Build: see output at ${PWD}/log.build"
echo "Build: Failed at ${DATE}"
cat "${PWD}/log.build"
} >> "${outfile}"
exit "${build_status}"
else
{
echo "Build: Completed at $(date)" || true
echo "Build: Completed at ${DATE}"
} >> "${outfile}"
fi

./link_workflow.sh
LINK_LOGFILE_PATH=link_workflow.log
rm -f "${LINK_LOGFILE_PATH}"
./link_workflow.sh >> "${LINK_LOGFILE_PATH}" 2>&1
link_status=$?
if [[ ${link_status} != 0 ]]; then
DATE=$(date +'%D %r')
{
echo "Link: *** FAILED ***"
echo "Link: Failed at $(date)" || true
echo "Link: Failed at ${DATE}"
cat "${LINK_LOGFILE_PATH}"
} >> "${outfile}"
exit "${link_status}"
fi
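The checkout, build, and link blocks above all repeat the same "run the step, append a timestamped status to ${outfile}, bail out on failure" pattern. A hedged sketch of how that pattern could be captured in one helper; `run_step` is an illustrative name, not something added by this PR:

```bash
#!/bin/env bash
# Sketch only: run a build step, log its outcome to the CI output file, and
# propagate a non-zero exit status.  ${outfile} mirrors the variable used in
# clone-build_ci.sh; step names and log paths are placeholders.
function run_step() {
  local name=$1; shift
  local logfile=$1; shift
  "$@" >> "${logfile}" 2>&1
  local status=$?
  local stamp
  stamp=$(date +'%D %r')
  if [[ ${status} -ne 0 ]]; then
    {
      echo "${name}: *** FAILED ***"
      echo "${name}: Failed at ${stamp}"
      echo "${name}: see output at ${PWD}/${logfile}"
    } >> "${outfile}"
    exit "${status}"
  fi
  echo "${name}: Completed at ${stamp}" >> "${outfile}"
}

# Example (mirrors the build step above): run_step "Build" "log.build" ./build_all.sh
```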
104 changes: 74 additions & 30 deletions ci/scripts/driver.sh
@@ -25,7 +25,7 @@ export REPO_URL=${REPO_URL:-"https://github.com/NOAA-EMC/global-workflow.git"}
################################################################
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." >/dev/null 2>&1 && pwd )"
scriptname=$(basename "${BASH_SOURCE[0]}")
echo "Begin ${scriptname} at $(date -u)" || true
echo "Begin ${scriptname} at $(date +'%D %r')" || true
export PS4='+ $(basename ${BASH_SOURCE})[${LINENO}]'

#########################################################################
@@ -48,6 +48,7 @@ esac
# setup runtime env for correct python install and git
######################################################
set +x
source "${ROOT_DIR}/ci/scripts/utils/ci_utils.sh"
source "${ROOT_DIR}/ush/module-setup.sh"
module use "${ROOT_DIR}/modulefiles"
module load "module_gwsetup.${MACHINE_ID}"
@@ -68,24 +69,57 @@ pr_list=$(${GH} pr list --repo "${REPO_URL}" --label "CI-${MACHINE_ID^}-Ready" -
for pr in ${pr_list}; do
pr_dir="${GFS_CI_ROOT}/PR/${pr}"
db_list=$("${ROOT_DIR}/ci/scripts/pr_list_database.py" --add_pr "${pr}" --dbfile "${pr_list_dbfile}")
pr_id=0
output_ci_single="${GFS_CI_ROOT}/PR/${pr}/output_single.log"
#############################################################
# Check if a Ready labeled PR has changed back from once set
# and in that case remove all previous jobs in scheduler and
# and remove PR from filesystem to start clean
# and in that case completely kill the previous driver.sh cron
# job and all its descendants, as well as removing all previous
# jobs in the scheduler and associated files in the PR
#############################################################
if [[ "${db_list}" == *"already is in list"* ]]; then
pr_id=$("${ROOT_DIR}/ci/scripts/pr_list_database.py" --dbfile "${pr_list_dbfile}" --display "${pr}" | awk '{print $4}') || true
pr_id=$((pr_id+1))
"${ROOT_DIR}/ci/scripts/pr_list_database.py" --dbfile "${pr_list_dbfile}" --update_pr "${pr}" Open Ready "${pr_id}"
for cases in "${pr_dir}/RUNTESTS/"*; do
if [[ -z "${cases+x}" ]]; then
break
# Get the PID and HOST of the driver.sh cron job
# that is stored in the CI database for this PR
driver_ID=$("${ROOT_DIR}/ci/scripts/pr_list_database.py" --dbfile "${pr_list_dbfile}" --display "${pr}" | awk '{print $4}') || true
driver_PID=$(echo "${driver_ID}" | cut -d":" -f1) || true
driver_HOST=$(echo "${driver_ID}" | cut -d":" -f2) || true
host_name=$(hostname -s)
rm -f "${output_ci_single}"
{
echo "CI Update on ${MACHINE_ID^} at $(date +'%D %r')" || true
echo "================================================="
echo "PR:${pr} Reset to ${MACHINE_ID^}-Ready by user and is now restarting CI tests" || true
} >> "${output_ci_single}"
if [[ "${driver_PID}" -ne 0 ]]; then
echo "Driver PID: ${driver_PID} no longer running this build having it killed"
if [[ "${driver_HOST}" == "${host_name}" ]]; then
# shellcheck disable=SC2312
pstree -A -p "${driver_PID}" | grep -Pow "(?<=\()[0-9]+(?=\))" | xargs kill
else
# shellcheck disable=SC2312
ssh "${driver_HOST}" 'pstree -A -p "${driver_PID}" | grep -Eow "[0-9]+" | xargs kill'
fi
pslot=$(basename "${cases}")
sacct --format=jobid,jobname%35,WorkDir%100,stat | grep "${pslot}" | grep "PR\/${pr}\/RUNTESTS" | awk '{print $1}' | xargs scancel || true
done
rm -Rf "${pr_dir}"
{
echo "Driver PID: Requested termination of ${driver_PID} and children on ${driver_HOST}"
echo "Driver PID: has restarted as $$ on ${host_name}"
} >> "${output_ci_single}"
fi

experiments=$(find "${pr_dir}/RUNTESTS/EXPDIR" -mindepth 1 -maxdepth 1 -type d) || true
if [[ -z "${experiments}" ]]; then
echo "No current experiments to cancel in PR: ${pr} on ${MACHINE_ID^}" >> "${output_ci_single}"
else
for case in ${experiments}; do
case_name=$(basename "${case}")
cancel_slurm_jobs "${case_name}"
{
echo "Canceled all jobs for experiment ${case_name} in PR:${pr} on ${MACHINE_ID^}"
} >> "${output_ci_single}"
done
fi
sed -i "1 i\`\`\`" "${output_ci_single}"
"${GH}" pr comment "${pr}" --repo "${REPO_URL}" --body-file "${output_ci_single}"
"${ROOT_DIR}/ci/scripts/pr_list_database.py" --remove_pr "${pr}" --dbfile "${pr_list_dbfile}"
"${ROOT_DIR}/ci/scripts/pr_list_database.py" --add_pr "${pr}" --dbfile "${pr_list_dbfile}"
fi
done

@@ -110,34 +144,44 @@ for pr in ${pr_list}; do
if [[ -z "${pr_building+x}" ]]; then
continue
fi
"${GH}" pr edit --repo "${REPO_URL}" "${pr}" --remove-label "CI-${MACHINE_ID^}-Ready" --add-label "CI-${MACHINE_ID^}-Building"
"${ROOT_DIR}/ci/scripts/pr_list_database.py" --dbfile "${pr_list_dbfile}" --update_pr "${pr}" Open Building
echo "Processing Pull Request #${pr}"
id=$("${GH}" pr view "${pr}" --repo "${REPO_URL}" --json id --jq '.id')
pr_dir="${GFS_CI_ROOT}/PR/${pr}"
output_ci="${pr_dir}/output_ci_${id}"
output_ci_single="${GFS_CI_ROOT}/PR/${pr}/output_single.log"
driver_build_PID=$$
driver_build_HOST=$(hostname -s)
"${GH}" pr edit --repo "${REPO_URL}" "${pr}" --remove-label "CI-${MACHINE_ID^}-Ready" --add-label "CI-${MACHINE_ID^}-Building"
"${ROOT_DIR}/ci/scripts/pr_list_database.py" --dbfile "${pr_list_dbfile}" --update_pr "${pr}" Open Building "${driver_build_PID}:${driver_build_HOST}"
rm -Rf "${pr_dir}"
mkdir -p "${pr_dir}"
# call clone-build_ci to clone and build PR
id=$("${GH}" pr view "${pr}" --repo "${REPO_URL}" --json id --jq '.id')
{
echo "CI Update on ${MACHINE_ID^} at $(date +'%D %r')" || true
echo "============================================"
echo "Cloning and Building global-workflow PR: ${pr}"
echo "with PID: ${driver_build_PID} on host: ${driver_build_HOST}"
echo ""
} >> "${output_ci_single}"
sed -i "1 i\`\`\`" "${output_ci_single}"
"${GH}" pr comment "${pr}" --repo "${REPO_URL}" --body-file "${output_ci_single}"
set +e
output_ci="${pr_dir}/output_build_${id}"
rm -f "${output_ci}"
"${ROOT_DIR}/ci/scripts/clone-build_ci.sh" -p "${pr}" -d "${pr_dir}" -o "${output_ci}"
#echo "SKIPPING: ${ROOT_DIR}/ci/scripts/clone-build_ci.sh"
ci_status=$?
##################################################################
# Checking for the special case where the Ready label was updated,
# which causes a running driver to exit with a failure because it was
# currently building, so we force an exit 0 instead so it does not get relabeled;
# but a race condition caused clone-build_ci.sh to start
# and this instance failed before it was killed. In that case
# we need to exit this instance of the driver script
#################################################################
if [[ ${ci_status} -ne 0 ]]; then
pr_id_check=$("${ROOT_DIR}/ci/scripts/pr_list_database.py" --display "${pr}" --dbfile "${pr_list_dbfile}" | awk '{print $4}') || true
if [[ "${pr_id}" -ne "${pr_id_check}" ]]; then
build_PID_check=$("${ROOT_DIR}/ci/scripts/pr_list_database.py" --display "${pr}" --dbfile "${pr_list_dbfile}" | awk '{print $4}' | cut -d":" -f1) || true
if [[ "${build_PID_check}" -ne "$$" ]]; then
echo "Driver build PID: ${build_PID_check} no longer running this build ... exiting"
exit 0
fi
fi
set -e
if [[ ${ci_status} -eq 0 ]]; then
"${ROOT_DIR}/ci/scripts/pr_list_database.py" --dbfile "${pr_list_dbfile}" --update_pr "${pr}" Open Built
"${ROOT_DIR}/ci/scripts/pr_list_database.py" --dbfile "${pr_list_dbfile}" --update_pr "${pr}" Open Built "0:0"
#setup space to put an experiment
# export RUNTESTS for yaml case files to pickup
export RUNTESTS="${pr_dir}/RUNTESTS"
@@ -159,7 +203,7 @@ for pr in ${pr_list}; do
set +e
export LOGFILE_PATH="${HOMEgfs}/ci/scripts/create_experiment.log"
rm -f "${LOGFILE_PATH}"
"${HOMEgfs}/workflow/create_experiment.py" --yaml "${HOMEgfs}/ci/cases/pr/${case}.yaml" 2>&1 "${LOGFILE_PATH}"
"${HOMEgfs}/workflow/create_experiment.py" --yaml "${HOMEgfs}/ci/cases/pr/${case}.yaml" > "${LOGFILE_PATH}" 2>&1
ci_status=$?
set -e
if [[ ${ci_status} -eq 0 ]]; then
@@ -174,7 +218,7 @@ for pr in ${pr_list}; do
} >> "${output_ci}"
else
{
echo "*** Failed *** to create experiment: ${pslot}"
echo "*** Failed *** to create experiment: ${pslot} on ${MACHINE_ID^}"
echo ""
cat "${LOGFILE_PATH}"
} >> "${output_ci}"
@@ -186,7 +230,7 @@ for pr in ${pr_list}; do
done

"${GH}" pr edit --repo "${REPO_URL}" "${pr}" --remove-label "CI-${MACHINE_ID^}-Building" --add-label "CI-${MACHINE_ID^}-Running"
"${ROOT_DIR}/ci/scripts/pr_list_database.py" --dbfile "${pr_list_dbfile}" --update_pr "${pr}" Open Running
"${ROOT_DIR}/ci/scripts/pr_list_database.py" --dbfile "${pr_list_dbfile}" --update_pr "${pr}" Open Running "0:0"
"${GH}" pr comment "${pr}" --repo "${REPO_URL}" --body-file "${output_ci}"

else
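A compact sketch of the process-tree kill used above when a PR is reset while an earlier driver.sh is still running. The "PID:HOST" pair comes from the CI database entry written at Building time; the remote branch assumes passwordless ssh between the CI hosts and that `pstree` is available there (assumptions, not verified by this diff), and the function name is illustrative:

```bash
#!/bin/env bash
# Sketch only (not part of this PR): kill a driver.sh instance and all of its
# descendants, locally or on the host recorded as "PID:HOST" in the CI database.
function kill_driver_tree() {
  local driver_PID=$1
  local driver_HOST=$2
  local here
  here=$(hostname -s)
  if [[ "${driver_HOST}" == "${here}" ]]; then
    # pstree -p prints "name(PID)"; extract every PID in the tree and kill it
    pstree -A -p "${driver_PID}" | grep -Pow "(?<=\()[0-9]+(?=\))" | xargs -r kill
  else
    # Expand the PID locally so the remote shell receives a literal number
    ssh "${driver_HOST}" "pstree -A -p ${driver_PID} | grep -Pow '(?<=\()[0-9]+(?=\))' | xargs -r kill"
  fi
}

# Example: kill_driver_tree "${driver_PID}" "${driver_HOST}"
```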
24 changes: 24 additions & 0 deletions ci/scripts/utils/ci_utils.sh
@@ -0,0 +1,24 @@
#!/bin/env bash

function cancel_slurm_jobs() {

# Usage: cancel_slurm_jobs <substring>
# Example: cancel_slurm_jobs "C48_ATM_3c4e7f74"
#
# Cancel all Slurm jobs that have the given substring in their name
# For example, all jobs with "C48_ATM_3c4e7f74"
# in their name will be canceled

local substring=$1
local job_ids
job_ids=$(squeue -u "${USER}" -h -o "%i")

for job_id in ${job_ids}; do
job_name=$(sacct -j "${job_id}" --format=JobName%100 | head -3 | tail -1 | sed -r 's/\s+//g') || true
if [[ "${job_name}" =~ ${substring} ]]; then
echo "Canceling Slurm Job ${job_name} with: scancel ${job_id}"
scancel "${job_id}"
continue
fi
done
}
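One note on the helper above: the `=~` test treats the substring as an extended regular expression, and `JobName%100` widens sacct's job-name column so long pslot-derived names are not truncated before matching. A hedged sketch of a dry-run variant that lists what would be canceled without canceling anything; `list_slurm_jobs` is an illustrative name, not part of this PR:

```bash
function list_slurm_jobs() {
  # Print "JOBID JOBNAME" for every job of ${USER} whose name matches a substring,
  # using the same matching logic as cancel_slurm_jobs, but without scancel.
  local substring=$1
  local job_id job_name
  for job_id in $(squeue -u "${USER}" -h -o "%i"); do
    job_name=$(sacct -j "${job_id}" --format=JobName%100 | head -3 | tail -1 | sed -r 's/\s+//g') || true
    if [[ "${job_name}" =~ ${substring} ]]; then
      echo "${job_id} ${job_name}"
    fi
  done
}

# Example: list_slurm_jobs "C48_ATM_3c4e7f74"
```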
