From 336b78acb551794eaf0a3a5310a62ddf3cd8dcd4 Mon Sep 17 00:00:00 2001 From: David Huber <69919478+DavidHuber-NOAA@users.noreply.github.com> Date: Tue, 13 Aug 2024 16:37:52 +0000 Subject: [PATCH] Hotfix: Handle UNAVAILABLE and UNKNOWN rocoto status in Bash CI (#2820) # Description From time to time, PBS pro cannot return a `qstat` response within a given time limit set by `rocoto` (default is 45 seconds). If that happens, then an `UNAVAILABLE` status will be returned for the given job. This PR adds checking for this status to allow CI processing to continue. --- ci/scripts/utils/rocotostat.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/ci/scripts/utils/rocotostat.py b/ci/scripts/utils/rocotostat.py index 70c672f0e8..4afea5c8b5 100755 --- a/ci/scripts/utils/rocotostat.py +++ b/ci/scripts/utils/rocotostat.py @@ -136,7 +136,7 @@ def rocoto_statcount(rocotostat): rocotostat_output = [line.split()[0:4] for line in rocotostat_output] rocotostat_output = [line for line in rocotostat_output if len(line) != 1] - status_cases = ['SUCCEEDED', 'FAIL', 'DEAD', 'RUNNING', 'SUBMITTING', 'QUEUED'] + status_cases = ['SUCCEEDED', 'FAIL', 'DEAD', 'RUNNING', 'SUBMITTING', 'QUEUED', 'UNAVAILABLE'] rocoto_status = {} status_counts = Counter(case for sublist in rocotostat_output for case in sublist) @@ -214,9 +214,16 @@ def is_stalled(rocoto_status): elif rocoto_status['DEAD'] > 0: error_return = rocoto_status['FAIL'] + rocoto_status['DEAD'] rocoto_state = 'FAIL' - elif 'UNKNOWN' in rocoto_status: - error_return = rocoto_status['UNKNOWN'] - rocoto_state = 'UNKNOWN' + elif 'UNAVAILABLE' in rocoto_status or 'UNKNOWN' in rocoto_status: + rocoto_status = attempt_multiple_times(lambda: rocoto_statcount(rocotostat), 2, 120, ProcessError) + error_return = 0 + rocoto_state = 'RUNNING' + if 'UNAVAILABLE' in rocoto_status: + error_return = rocoto_status['UNAVAILABLE'] + rocoto_state = 'UNAVAILABLE' + if 'UNKNOWN' in rocoto_status: + error_return += rocoto_status['UNKNOWN'] + rocoto_state = 'UNKNOWN' elif is_stalled(rocoto_status): rocoto_status = attempt_multiple_times(lambda: rocoto_statcount(rocotostat), 2, 120, ProcessError) if is_stalled(rocoto_status):