Skip to content

Commit

Permalink
Check that a PR driver is still running before trying to kill it (#2799)
Browse files Browse the repository at this point in the history
Adds a check to the SSH command used to kill child PIDs of a defunct
driver instance on a different head node, so that an invalid kill
command no longer causes CI failures.

Resolves #2798
  • Loading branch information
DavidHuber-NOAA committed Aug 7, 2024
1 parent 5c2e9b1 commit ad8d3e9
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 8 deletions.
8 changes: 4 additions & 4 deletions ci/scripts/check_ci.sh
Original file line number Diff line number Diff line change
Expand Up @@ -50,14 +50,14 @@ fi
export GH

rocotostat=$(command -v rocotostat)
if [[ -z ${rocotostat+x} ]]; then
if [[ -z ${rocotostat} ]]; then
echo "rocotostat not found on system"
exit 1
else
echo "rocotostat being used from ${rocotostat}"
fi
rocotocheck=$(command -v rocotocheck)
if [[ -z ${rocotocheck+x} ]]; then
if [[ -z ${rocotocheck} ]]; then
echo "rocotocheck not found on system"
exit 1
else
Expand All @@ -70,7 +70,7 @@ pr_list=""
if [[ -f "${pr_list_dbfile}" ]]; then
pr_list=$("${HOMEgfs}/ci/scripts/utils/pr_list_database.py" --dbfile "${pr_list_dbfile}" --list Open Running) || true
fi
if [[ -z "${pr_list+x}" ]]; then
if [[ -z "${pr_list}" ]]; then
echo "no PRs open and ready to run cases on .. exiting"
exit 0
fi
Expand Down Expand Up @@ -124,7 +124,7 @@ for pr in ${pr_list}; do

for pslot_dir in "${pr_dir}/RUNTESTS/EXPDIR/"*; do
pslot=$(basename "${pslot_dir}") || true
if [[ -z "${pslot+x}" ]]; then
if [[ -z "${pslot}" ]]; then
echo "No experiments found in ${pslot_dir} .. exiting"
exit 0
fi
Expand Down
11 changes: 7 additions & 4 deletions ci/scripts/driver.sh
Original file line number Diff line number Diff line change
Expand Up @@ -77,8 +77,9 @@ pr_list=$(${GH} pr list --repo "${REPO_URL}" --label "CI-${MACHINE_ID^}-Ready" -

for pr in ${pr_list}; do
pr_dir="${GFS_CI_ROOT}/PR/${pr}"
[[ ! -d ${pr_dir} ]] && mkdir -p "${pr_dir}"
db_list=$("${ROOT_DIR}/ci/scripts/utils/pr_list_database.py" --add_pr "${pr}" --dbfile "${pr_list_dbfile}")
output_ci_single="${GFS_CI_ROOT}/PR/${pr}/output_single.log"
output_ci_single="${pr_dir}/output_single.log"
#############################################################
# Check if a Ready labeled PR has changed back from once set
# and in that case completely kill the previous driver.sh cron
Expand Down Expand Up @@ -107,7 +108,9 @@ for pr in ${pr_list}; do
echo -e "${pstree_out}" | grep -Pow "(?<=\()[0-9]+(?=\))" | xargs kill
fi
else
ssh "${driver_HOST}" 'pstree -A -p "${driver_PID}" | grep -Eow "[0-9]+" | xargs kill'
# Attempt to kill the driver and all of its child processes on the head node; if the kill fails (e.g. the driver is no longer running), print a message instead
#shellcheck disable=SC2029
ssh "${driver_HOST}" "pstree -A -p \"${driver_PID}\" | grep -Eow \"[0-9]+\" | xargs kill || echo \"Failed to kill process with PID: ${driver_PID}, it may not be valid.\""
fi
{
echo "Driver PID: Requested termination of ${driver_PID} and children on ${driver_HOST}"
Expand Down Expand Up @@ -141,7 +144,7 @@ pr_list=""
if [[ -f "${pr_list_dbfile}" ]]; then
pr_list=$("${ROOT_DIR}/ci/scripts/utils/pr_list_database.py" --dbfile "${pr_list_dbfile}" --list Open Ready) || true
fi
if [[ -z "${pr_list+x}" ]]; then
if [[ -z "${pr_list}" ]]; then
echo "no PRs open and ready for checkout/build .. exiting"
exit 0
fi
Expand All @@ -155,7 +158,7 @@ fi
for pr in ${pr_list}; do
# Skip PRs that are currently Building, which can happen when overlapping driver scripts are being called from within cron
pr_building=$("${ROOT_DIR}/ci/scripts/utils/pr_list_database.py" --display "${pr}" --dbfile "${pr_list_dbfile}" | grep Building) || true
if [[ -z "${pr_building+x}" ]]; then
if [[ -n "${pr_building}" ]]; then
continue
fi
id=$("${GH}" pr view "${pr}" --repo "${REPO_URL}" --json id --jq '.id')
Expand Down

0 comments on commit ad8d3e9

Please sign in to comment.