def test_rocoto_done():
    """
    TEST_ROCOTO_DONE Check that the reference workflow is reported as done.

    Runs rocotostat_summary with the module-level rocotostat command and
    asserts that is_done accepts the resulting cycle counts.
    """

    result = rocotostat_summary(rocotostat)

    assert is_done(result)


def test_rocoto_stalled():
    """
    TEST_ROCOTO_STALLED Check that the stalled fixture is reported as stalled.

    Builds a dedicated rocotostat command that points at the stalled workflow
    and database under testdata/rocotostat_stalled, collects the per-status
    job counts with rocoto_statcount and asserts that is_stalled accepts them.
    """

    stalled_workflow = os.path.join(script_dir, "testdata/rocotostat_stalled/stalled.xml")
    stalled_database = os.path.join(script_dir, "testdata/rocotostat_stalled/stalled.db")

    # Use a fresh command instead of the module-level one: the shared command
    # is bound to the other workflow and may already carry the '--summary'
    # argument appended by rocotostat_summary in earlier tests.
    # NOTE(review): assumes which() returns a usable command object here, as
    # it does for the module-level rocotostat - confirm behaviour when the
    # rocotostat executable is not on PATH.
    stalled_rocotostat = which("rocotostat")
    for argument in ('-w', stalled_workflow, '-d', stalled_database):
        stalled_rocotostat.add_default_arg(argument)

    result = rocoto_statcount(stalled_rocotostat)

    assert is_stalled(result)
a/ci/scripts/tests/testdata/rocotostat_stalled/stalled.xml b/ci/scripts/tests/testdata/rocotostat_stalled/stalled.xml new file mode 100644 index 0000000000..dd4e5a6a68 --- /dev/null +++ b/ci/scripts/tests/testdata/rocotostat_stalled/stalled.xml @@ -0,0 +1,443 @@ + + + + + + +]> + + + + /RUNTESTS/EXPDIR/C48_S2SWA_gefs/logs/@Y@m@d@H.log + + + 202103231200 202103231200 24:00:00 + + + + /gefs/jobs/rocoto/stage_ic.sh + + C48_S2SWA_gefs_stage_ic_@H + nems + batch + orion + 00:15:00 + 1:ppn=1:tpp=1 + --export=NONE + + /RUNTESTS/COMROOT/C48_S2SWA_gefs/logs/@Y@m@d@H/stage_ic.log + + RUN_ENVIRemc + HOMEgfs/gefs + EXPDIR/RUNTESTS/EXPDIR/C48_S2SWA_gefs + NETgefs + CDUMPgefs + RUNgefs + CDATE@Y@m@d@H + PDY@Y@m@d + cyc@H + COMROOT/RUNTESTS/COMROOT + DATAROOT/work/noaa/stmp/mterry/RUNDIRS/C48_S2SWA_gefs + + + + /work/noaa/global/glopara/data/ICSDIR/prototype_ICs/gefs_test/@Y@m@d@H/mem000/atmos/gfs_ctrl.nc + /work/noaa/global/glopara/data/ICSDIR/prototype_ICs/gefs_test/@Y@m@d@H/mem000/wave/@Y@m@d.@H0000.restart.glo_500 + + + + + + + + /gefs/jobs/rocoto/waveinit.sh + + C48_S2SWA_gefs_wave_init_@H + nems + batch + orion + 00:10:00 + 1:ppn=40:tpp=1 + 2GB + --export=NONE + + /RUNTESTS/COMROOT/C48_S2SWA_gefs/logs/@Y@m@d@H/wave_init.log + + RUN_ENVIRemc + HOMEgfs/gefs + EXPDIR/RUNTESTS/EXPDIR/C48_S2SWA_gefs + NETgefs + CDUMPgefs + RUNgefs + CDATE@Y@m@d@H + PDY@Y@m@d + cyc@H + COMROOT/RUNTESTS/COMROOT + DATAROOT/work/noaa/stmp/mterry/RUNDIRS/C48_S2SWA_gefs + + + + + + /gefs/jobs/rocoto/fcst.sh + + C48_S2SWA_gefs_fcst_mem000_@H + nems + batch + orion + 03:00:00 + 1:ppn=40:tpp=1 + --export=NONE + + /RUNTESTS/COMROOT/C48_S2SWA_gefs/logs/@Y@m@d@H/fcst_mem000.log + + RUN_ENVIRemc + HOMEgfs/gefs + EXPDIR/RUNTESTS/EXPDIR/C48_S2SWA_gefs + NETgefs + CDUMPgefs + RUNgefs + CDATE@Y@m@d@H + PDY@Y@m@d + cyc@H + COMROOT/RUNTESTS/COMROOT + DATAROOT/work/noaa/stmp/mterry/RUNDIRS/C48_S2SWA_gefs + + + + + + + + + + + + + 001 002 + + + + /gefs/jobs/rocoto/fcst.sh + + C48_S2SWA_gefs_fcst_mem#member#_@H + 
nems + batch + orion + 03:00:00 + 1:ppn=40:tpp=1 + --export=NONE + + /RUNTESTS/COMROOT/C48_S2SWA_gefs/logs/@Y@m@d@H/fcst_mem#member#.log + + RUN_ENVIRemc + HOMEgfs/gefs + EXPDIR/RUNTESTS/EXPDIR/C48_S2SWA_gefs + NETgefs + CDUMPgefs + RUNgefs + CDATE@Y@m@d@H + PDY@Y@m@d + cyc@H + COMROOT/RUNTESTS/COMROOT + DATAROOT/work/noaa/stmp/mterry/RUNDIRS/C48_S2SWA_gefs + ENSMEM#member# + MEMDIRmem#member# + + + + + + + + + + + + + + + 000 001 002 + + + + 000 006 012 018 024 030 036 042 048 054 060 066 072 078 084 090 096 102 108 114 120 + + + + /gefs/jobs/rocoto/atmos_products.sh + + C48_S2SWA_gefs_atmos_prod_mem#member#_f#fhr#_@H + nems + batch + orion + 00:15:00 + 1:ppn=24:tpp=1 + --export=NONE + + /RUNTESTS/COMROOT/C48_S2SWA_gefs/logs/@Y@m@d@H/atmos_prod_mem#member#_f#fhr#.log + + RUN_ENVIRemc + HOMEgfs/gefs + EXPDIR/RUNTESTS/EXPDIR/C48_S2SWA_gefs + NETgefs + CDUMPgefs + RUNgefs + CDATE@Y@m@d@H + PDY@Y@m@d + cyc@H + COMROOT/RUNTESTS/COMROOT + DATAROOT/work/noaa/stmp/mterry/RUNDIRS/C48_S2SWA_gefs + ENSMEM#member# + MEMDIRmem#member# + FHRLST#fhr# + COMPONENTatmos + + + &ROTDIR;/gefs.@Y@m@d/@H/mem#member#/model_data/atmos/master/gefs.t@Hz.master.grb2f#fhr# + + + + + + + + + + + 000 006 012 018 024 030 036 042 048 054 060 066 072 078 084 090 096 102 108 114 120 + + + + /gefs/jobs/rocoto/atmos_ensstat.sh + + C48_S2SWA_gefs_atmos_ensstat_f#fhr#_@H + nems + batch + orion + 00:30:00 + 1:ppn=6:tpp=1 + --export=NONE + + /RUNTESTS/COMROOT/C48_S2SWA_gefs/logs/@Y@m@d@H/atmos_ensstat_f#fhr#.log + + RUN_ENVIRemc + HOMEgfs/gefs + EXPDIR/RUNTESTS/EXPDIR/C48_S2SWA_gefs + NETgefs + CDUMPgefs + RUNgefs + CDATE@Y@m@d@H + PDY@Y@m@d + cyc@H + COMROOT/RUNTESTS/COMROOT + DATAROOT/work/noaa/stmp/mterry/RUNDIRS/C48_S2SWA_gefs + FHRLST#fhr# + + + + + + + + + + + + + + + + 000 001 002 + + + + 006 012 018 024 030 036 042 048 054 060 066 072 078 084 090 096 102 108 114 120 + + + + /gefs/jobs/rocoto/oceanice_products.sh + + C48_S2SWA_gefs_ocean_prod_mem#member#_f#fhr#_@H + nems + batch + orion + 00:15:00 
+ 1:ppn=1:tpp=1 + 96GB + --export=NONE + + /RUNTESTS/COMROOT/C48_S2SWA_gefs/logs/@Y@m@d@H/ocean_prod_mem#member#_f#fhr#.log + + RUN_ENVIRemc + HOMEgfs/gefs + EXPDIR/RUNTESTS/EXPDIR/C48_S2SWA_gefs + NETgefs + CDUMPgefs + RUNgefs + CDATE@Y@m@d@H + PDY@Y@m@d + cyc@H + COMROOT/RUNTESTS/COMROOT + DATAROOT/work/noaa/stmp/mterry/RUNDIRS/C48_S2SWA_gefs + ENSMEM#member# + MEMDIRmem#member# + FHRLST#fhr# + COMPONENTocean + + + + &ROTDIR;/gefs.@Y@m@d/@H/mem#member#/model_data/ocean/history/gefs.ocean.t@Hz.24hr_avg.f#fhr#.nc + /gefs/ush/check_netcdf.sh &ROTDIR;/gefs.@Y@m@d/@H/mem#member#/model_data/ocean/history/gefs.ocean.t@Hz.24hr_avg.f#fhr#.nc + + + + + + + + + + + + 000 001 002 + + + + 006 012 018 024 030 036 042 048 054 060 066 072 078 084 090 096 102 108 114 120 + + + + /gefs/jobs/rocoto/oceanice_products.sh + + C48_S2SWA_gefs_ice_prod_mem#member#_f#fhr#_@H + nems + batch + orion + 00:15:00 + 1:ppn=1:tpp=1 + 96GB + --export=NONE + + /RUNTESTS/COMROOT/C48_S2SWA_gefs/logs/@Y@m@d@H/ice_prod_mem#member#_f#fhr#.log + + RUN_ENVIRemc + HOMEgfs/gefs + EXPDIR/RUNTESTS/EXPDIR/C48_S2SWA_gefs + NETgefs + CDUMPgefs + RUNgefs + CDATE@Y@m@d@H + PDY@Y@m@d + cyc@H + COMROOT/RUNTESTS/COMROOT + DATAROOT/work/noaa/stmp/mterry/RUNDIRS/C48_S2SWA_gefs + ENSMEM#member# + MEMDIRmem#member# + FHRLST#fhr# + COMPONENTice + + + &ROTDIR;/gefs.@Y@m@d/@H/mem#member#/model_data/ice/history/gefs.ice.t@Hz.24hr_avg.f#fhr#.nc + + + + + + + + + + + 000 001 002 + + + + /gefs/jobs/rocoto/wavepostsbs.sh + + C48_S2SWA_gefs_wave_post_grid_mem#member#_@H + nems + batch + orion + 03:00:00 + 1:ppn=40:tpp=1 + 10GB + --export=NONE + + /RUNTESTS/COMROOT/C48_S2SWA_gefs/logs/@Y@m@d@H/wave_post_grid_mem#member#.log + + RUN_ENVIRemc + HOMEgfs/gefs + EXPDIR/RUNTESTS/EXPDIR/C48_S2SWA_gefs + NETgefs + CDUMPgefs + RUNgefs + CDATE@Y@m@d@H + PDY@Y@m@d + cyc@H + COMROOT/RUNTESTS/COMROOT + DATAROOT/work/noaa/stmp/mterry/RUNDIRS/C48_S2SWA_gefs + ENSMEM#member# + MEMDIRmem#member# + + + + 
def is_done(rocoto_status):
    """
    IS_DONE Check if all cycles are done.

    IS_DONE(ROCOTO_STATUS) compares the total number of cycles with the number
    of done cycles stored in the ROCOTO_STATUS dictionary.

    Input:
    ROCOTO_STATUS - A dictionary holding the 'CYCLES_TOTAL' and 'CYCLES_DONE' counts.

    Output:
    BOOLEAN - True if all cycles are done, False otherwise.

    Raises:
    KeyError - If 'CYCLES_TOTAL' or 'CYCLES_DONE' is missing from ROCOTO_STATUS.
    """

    return rocoto_status['CYCLES_TOTAL'] == rocoto_status['CYCLES_DONE']


def is_stalled(rocoto_status):
    """
    IS_STALLED Check if all cycles are stalled.

    IS_STALLED(ROCOTO_STATUS) checks if all cycles are stalled by verifying if
    there are no jobs that are RUNNING, SUBMITTING, or QUEUED.

    A workflow whose cycles are all done has no active jobs either, so this
    check alone cannot tell a stalled workflow from a finished one; the
    script's main section evaluates is_done before it evaluates is_stalled.

    Input:
    ROCOTO_STATUS - A dictionary holding the 'RUNNING', 'SUBMITTING' and 'QUEUED' counts.

    Output:
    BOOLEAN - True if no job is running, submitting or queued, False otherwise.

    Raises:
    KeyError - If one of the three status counts is missing from ROCOTO_STATUS.
    """

    active_jobs = sum(rocoto_status[state] for state in ('RUNNING', 'SUBMITTING', 'QUEUED'))

    return active_jobs == 0