Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/develop' into fix/early_run_configs
Browse files Browse the repository at this point in the history
* origin/develop:
  support ATM forecast only on Azure (NOAA-EMC#2827)
  Convert staging job to python and yaml (NOAA-EMC#2651)
  Fixed test on UNAVAILBLE in python Rocoto check (NOAA-EMC#2842)
  • Loading branch information
DavidHuber-NOAA committed Aug 21, 2024
2 parents 0e60123 + 1b18f2f commit 59c2002
Show file tree
Hide file tree
Showing 50 changed files with 875 additions and 670 deletions.
2 changes: 1 addition & 1 deletion ci/cases/pr/C48mx500_3DVarAOWCDA.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ arguments:
resdetocean: 5.0
comroot: {{ 'RUNTESTS' | getenv }}/COMROOT
expdir: {{ 'RUNTESTS' | getenv }}/EXPDIR
icsdir: {{ 'ICSDIR_ROOT' | getenv }}/C48mx500
icsdir: {{ 'ICSDIR_ROOT' | getenv }}/C48mx500/20240610
idate: 2021032412
edate: 2021032418
nens: 0
Expand Down
2 changes: 1 addition & 1 deletion ci/cases/pr/C96C48_hybatmDA.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ arguments:
resensatmos: 48
comroot: {{ 'RUNTESTS' | getenv }}/COMROOT
expdir: {{ 'RUNTESTS' | getenv }}/EXPDIR
icsdir: {{ 'ICSDIR_ROOT' | getenv }}/C96C48
icsdir: {{ 'ICSDIR_ROOT' | getenv }}/C96C48/20240610
idate: 2021122018
edate: 2021122106
nens: 2
Expand Down
2 changes: 1 addition & 1 deletion ci/cases/pr/C96C48_ufs_hybatmDA.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ arguments:
resensatmos: 48
comroot: {{ 'RUNTESTS' | getenv }}/COMROOT
expdir: {{ 'RUNTESTS' | getenv }}/EXPDIR
icsdir: {{ 'ICSDIR_ROOT' | getenv }}/C96C48
icsdir: {{ 'ICSDIR_ROOT' | getenv }}/C96C48/20240610
idate: 2024022318
edate: 2024022400
nens: 2
Expand Down
2 changes: 1 addition & 1 deletion ci/cases/pr/C96_atm3DVar.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ arguments:
resdetatmos: 96
comroot: {{ 'RUNTESTS' | getenv }}/COMROOT
expdir: {{ 'RUNTESTS' | getenv }}/EXPDIR
icsdir: {{ 'ICSDIR_ROOT' | getenv }}/C96C48
icsdir: {{ 'ICSDIR_ROOT' | getenv }}/C96C48/20240610
idate: 2021122018
edate: 2021122106
nens: 0
Expand Down
2 changes: 1 addition & 1 deletion ci/cases/pr/C96_atm3DVar_extended.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ arguments:
resdetatmos: 96
comroot: {{ 'RUNTESTS' | getenv }}/COMROOT
expdir: {{ 'RUNTESTS' | getenv }}/EXPDIR
icsdir: {{ 'ICSDIR_ROOT' | getenv }}/C96C48
icsdir: {{ 'ICSDIR_ROOT' | getenv }}/C96C48/20240610
idate: 2021122018
edate: 2021122118
nens: 0
Expand Down
2 changes: 1 addition & 1 deletion ci/cases/pr/C96_atmaerosnowDA.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ arguments:
resdetatmos: 96
comroot: {{ 'RUNTESTS' | getenv }}/COMROOT
expdir: {{ 'RUNTESTS' | getenv }}/EXPDIR
icsdir: {{ 'ICSDIR_ROOT' | getenv }}/C96C48
icsdir: {{ 'ICSDIR_ROOT' | getenv }}/C96C48/20240610
idate: 2021122012
edate: 2021122100
nens: 0
Expand Down
8 changes: 4 additions & 4 deletions ci/scripts/utils/rocotostat.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ def rocoto_statcount(rocotostat):
rocotostat_output = [line.split()[0:4] for line in rocotostat_output]
rocotostat_output = [line for line in rocotostat_output if len(line) != 1]

status_cases = ['SUCCEEDED', 'FAIL', 'DEAD', 'RUNNING', 'SUBMITTING', 'QUEUED', 'UNAVAILABLE']
status_cases = ['SUCCEEDED', 'FAIL', 'DEAD', 'RUNNING', 'SUBMITTING', 'QUEUED', 'UNAVAILABLE', 'UNKNOWN']

rocoto_status = {}
status_counts = Counter(case for sublist in rocotostat_output for case in sublist)
Expand Down Expand Up @@ -214,14 +214,14 @@ def is_stalled(rocoto_status):
elif rocoto_status['DEAD'] > 0:
error_return = rocoto_status['FAIL'] + rocoto_status['DEAD']
rocoto_state = 'FAIL'
elif 'UNAVAILABLE' in rocoto_status or 'UNKNOWN' in rocoto_status:
elif rocoto_status['UNAVAILABLE'] > 0 or rocoto_status['UNKNOWN'] > 0:
rocoto_status = attempt_multiple_times(lambda: rocoto_statcount(rocotostat), 2, 120, ProcessError)
error_return = 0
rocoto_state = 'RUNNING'
if 'UNAVAILABLE' in rocoto_status:
if rocoto_status['UNAVAILABLE'] > 0:
error_return = rocoto_status['UNAVAILABLE']
rocoto_state = 'UNAVAILABLE'
if 'UNKNOWN' in rocoto_status:
if rocoto_status['UNKNOWN'] > 0:
error_return += rocoto_status['UNKNOWN']
rocoto_state = 'UNKNOWN'
elif is_stalled(rocoto_status):
Expand Down
10 changes: 5 additions & 5 deletions docs/source/init.rst
Original file line number Diff line number Diff line change
Expand Up @@ -125,14 +125,14 @@ Start date = 2021032312
├── enkfgdas.20210323
│   ├── 06
│   │   ├── mem001
│   │   │   └── model_data -> ../../../gdas.20210323/06/model_data
│   │   │   └── model -> ../../../gdas.20210323/06/model
│   │   ├── mem002
│   │   │   └── model_data -> ../../../gdas.20210323/06/model_data
│   │   │   └── model -> ../../../gdas.20210323/06/model
│   │   ├── mem003
│   │   │   └── model_data -> ../../../gdas.20210323/06/model_data
│   │   │   └── model -> ../../../gdas.20210323/06/model
...
│   │   └── mem080
│   │   └── model_data -> ../../../gdas.20210323/06/model_data
│   │   └── model -> ../../../gdas.20210323/06/model
│   └── 12
│   ├── mem001
│   │   └── analysis
Expand All @@ -153,7 +153,7 @@ Start date = 2021032312
│   └── gdas.t12z.ocninc.nc -> ../../../../../gdas.20210323/12/analysis/ocean/gdas.t12z.ocninc.nc
└── gdas.20210323
├── 06
│   └── model_data
│   └── model
│   ├── atmos
│   │   └── restart
│   │   ├── 20210323.120000.ca_data.tile1.nc
Expand Down
55 changes: 55 additions & 0 deletions env/AZUREPW.env
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
#! /usr/bin/env bash

if [[ $# -ne 1 ]]; then

echo "Must specify an input argument to set runtime environment variables!"
exit 1

fi

step=$1

export launcher="srun -l --export=ALL"
export mpmd_opt="--multi-prog --output=mpmd.%j.%t.out"

# Configure MPI environment
export OMP_STACKSIZE=2048000
export NTHSTACK=1024000000

ulimit -s unlimited
ulimit -a

# Calculate common variables
# Check first if the dependent variables are set
if [[ -n "${ntasks:-}" && -n "${max_tasks_per_node:-}" && -n "${tasks_per_node:-}" ]]; then
max_threads_per_task=$((max_tasks_per_node / tasks_per_node))
NTHREADSmax=${threads_per_task:-${max_threads_per_task}}
NTHREADS1=${threads_per_task:-1}
[[ ${NTHREADSmax} -gt ${max_threads_per_task} ]] && NTHREADSmax=${max_threads_per_task}
[[ ${NTHREADS1} -gt ${max_threads_per_task} ]] && NTHREADS1=${max_threads_per_task}
APRUN="${launcher} -n ${ntasks}"
else
echo "ERROR config.resources must be sourced before sourcing AZUREPW.env"
exit 2
fi

if [[ "${step}" = "fcst" ]] || [[ "${step}" = "efcs" ]]; then

export launcher="srun --mpi=pmi2 -l"

(( nnodes = (ntasks+tasks_per_node-1)/tasks_per_node ))
(( ufs_ntasks = nnodes*tasks_per_node ))
# With ESMF threading, the model wants to use the full node
export APRUN_UFS="${launcher} -n ${ufs_ntasks}"
unset nnodes ufs_ntasks

elif [[ "${step}" = "post" ]]; then

export NTHREADS_NP=${NTHREADS1}
export APRUN_NP="${APRUN}"

export NTHREADS_DWN=${threads_per_task_dwn:-1}
[[ ${NTHREADS_DWN} -gt ${max_threads_per_task} ]] && export NTHREADS_DWN=${max_threads_per_task}
export APRUN_DWN="${launcher} -n ${ntasks_dwn}"

fi
17 changes: 9 additions & 8 deletions jobs/JGLOBAL_STAGE_IC
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,16 @@
source "${HOMEgfs}/ush/preamble.sh"
source "${HOMEgfs}/ush/jjob_header.sh" -e "stage_ic" -c "base stage_ic"

# Restart conditions for GFS cycle come from GDAS
# shellcheck disable=SC2153
rCDUMP=${RUN}
# shellcheck disable=SC2153
[[ ${RUN} = "gfs" ]] && export rCDUMP="gdas"
export rCDUMP
# Execute staging
"${SCRgfs}/exglobal_stage_ic.py"
err=$?

# Execute the Script
"${SCRgfs}/exglobal_stage_ic.sh"
###############################################################
# Check for errors and exit if any of the above failed
if [[ "${err}" -ne 0 ]]; then
echo "FATAL ERROR: Unable to copy ICs to ${ROTDIR}; ABORT!"
exit "${err}"
fi

##########################################
# Remove the Temporary working directory
Expand Down
1 change: 0 additions & 1 deletion parm/config/gefs/config.base
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@ export FIXugwd=${FIXgfs}/ugwd
export PACKAGEROOT="@PACKAGEROOT@" # TODO: set via prod_envir in Ops
export COMROOT="@COMROOT@" # TODO: set via prod_envir in Ops
export COMINsyn="@COMINsyn@"
export BASE_CPLIC="@BASE_CPLIC@"

# USER specific paths
export HOMEDIR="@HOMEDIR@"
Expand Down
49 changes: 22 additions & 27 deletions parm/config/gefs/config.stage_ic
Original file line number Diff line number Diff line change
Expand Up @@ -7,32 +7,27 @@ echo "BEGIN: config.stage_ic"
# Get task specific resources
source "${EXPDIR}/config.resources" stage_ic

case "${CASE}" in
"C384")
export CPL_ATMIC=""
export CPL_ICEIC=""
export CPL_OCNIC=""
export CPL_WAVIC=""
export CPL_MEDIC=""
;;
"C96")
export CPL_ATMIC=""
export CPL_ICEIC=""
export CPL_OCNIC=""
export CPL_WAVIC=""
export CPL_MEDIC=""
;;
"C48")
export CPL_ATMIC="gefs_test"
export CPL_ICEIC="gefs_test"
export CPL_OCNIC="gefs_test"
export CPL_WAVIC="gefs_test"
export CPL_MEDIC="gefs_test"
;;
*)
echo "FATAL ERROR Unrecognized resolution: ${CASE}"
exit 1
;;
esac
export ICSDIR="@ICSDIR@" # User provided ICSDIR; blank if not provided
export BASE_IC="@BASE_IC@" # Platform home for staged ICs

export STAGE_IC_YAML_TMPL="${PARMgfs}/stage/master_gefs.yaml.j2"

# Set ICSDIR

if [[ -z "${ICSDIR}" ]] ; then

ic_ver="20240610"

if (( NMEM_ENS > 0 )) ; then
ensic="${CASE_ENS}"
fi

if [[ "${DO_OCN:-NO}" == "YES" ]] ; then
ocnic="mx${OCNRES}"
fi

export ICSDIR="${BASE_IC}/${CASE}${ensic:-}${ocnic:-}/${ic_ver}"

fi

echo "END: config.stage_ic"
7 changes: 3 additions & 4 deletions parm/config/gfs/config.base
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,6 @@ export PACKAGEROOT="@PACKAGEROOT@" # TODO: set via prod_envir in Ops
export COMROOT="@COMROOT@" # TODO: set via prod_envir in Ops
export COMINsyn="@COMINsyn@"
export DMPDIR="@DMPDIR@"
export BASE_CPLIC="@BASE_CPLIC@"

# Gempak from external models
# Default locations are to dummy locations for testing
Expand Down Expand Up @@ -483,9 +482,9 @@ export OFFSET_START_HOUR=0
# Number of regional collectives to create soundings for
export NUM_SND_COLLECTIVES=${NUM_SND_COLLECTIVES:-9}

# The tracker, genesis, and METplus jobs are not supported on AWS yet
# TODO: we should place these in workflow/hosts/awspw.yaml as part of AWS setup, not for general.
if [[ "${machine}" == "AWSPW" ]]; then
# The tracker, genesis, and METplus jobs are not supported on CSPs yet
# TODO: we should place these in workflow/hosts/awspw.yaml as part of AWS/AZURE setup, not for general.
if [[ "${machine}" =~ "PW" ]]; then
export DO_TRACKER="NO"
export DO_GENESIS="NO"
export DO_METP="NO"
Expand Down
32 changes: 16 additions & 16 deletions parm/config/gfs/config.com
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ echo "BEGIN: config.com"
# declare_from_tmpl [-rx] $var1[:$tmpl1] [$var2[:$tmpl2]] [...]]
#
# options:
# -r: Make variable read-only (same as `decalre -r`)
# -r: Make variable read-only (same as `declare -r`)
# -x: Mark variable for declare -rx (same as `declare -x`)
# var1, var2, etc: Variable names whose values will be generated from a template
# and declared
Expand Down Expand Up @@ -51,12 +51,12 @@ declare -rx COM_TOP_TMPL='${ROTDIR}/${RUN}.${YMD}/${HH}'
declare -rx COM_CONF_TMPL=${COM_BASE}'/conf'
declare -rx COM_OBS_JEDI=${COM_BASE}'/obs_jedi'

declare -rx COM_ATMOS_INPUT_TMPL=${COM_BASE}'/model_data/atmos/input'
declare -rx COM_ATMOS_RESTART_TMPL=${COM_BASE}'/model_data/atmos/restart'
declare -rx COM_ATMOS_INPUT_TMPL=${COM_BASE}'/model/atmos/input'
declare -rx COM_ATMOS_RESTART_TMPL=${COM_BASE}'/model/atmos/restart'
declare -rx COM_ATMOS_ANALYSIS_TMPL=${COM_BASE}'/analysis/atmos'
declare -rx COM_SNOW_ANALYSIS_TMPL=${COM_BASE}'/analysis/snow'
declare -rx COM_ATMOS_HISTORY_TMPL=${COM_BASE}'/model_data/atmos/history'
declare -rx COM_ATMOS_MASTER_TMPL=${COM_BASE}'/model_data/atmos/master'
declare -rx COM_ATMOS_HISTORY_TMPL=${COM_BASE}'/model/atmos/history'
declare -rx COM_ATMOS_MASTER_TMPL=${COM_BASE}'/model/atmos/master'
declare -rx COM_ATMOS_GRIB_TMPL=${COM_BASE}'/products/atmos/grib2'
declare -rx COM_ATMOS_GRIB_GRID_TMPL=${COM_ATMOS_GRIB_TMPL}'/${GRID}'
declare -rx COM_ATMOS_BUFR_TMPL=${COM_BASE}'/products/atmos/bufr'
Expand All @@ -70,17 +70,17 @@ declare -rx COM_ATMOS_RADMON_TMPL=${COM_BASE}'/products/atmos/radmon'
declare -rx COM_ATMOS_MINMON_TMPL=${COM_BASE}'/products/atmos/minmon'
declare -rx COM_ATMOS_WMO_TMPL=${COM_BASE}'/products/atmos/wmo'

declare -rx COM_WAVE_RESTART_TMPL=${COM_BASE}'/model_data/wave/restart'
declare -rx COM_WAVE_PREP_TMPL=${COM_BASE}'/model_data/wave/prep'
declare -rx COM_WAVE_HISTORY_TMPL=${COM_BASE}'/model_data/wave/history'
declare -rx COM_WAVE_RESTART_TMPL=${COM_BASE}'/model/wave/restart'
declare -rx COM_WAVE_PREP_TMPL=${COM_BASE}'/model/wave/prep'
declare -rx COM_WAVE_HISTORY_TMPL=${COM_BASE}'/model/wave/history'
declare -rx COM_WAVE_GRID_TMPL=${COM_BASE}'/products/wave/gridded'
declare -rx COM_WAVE_STATION_TMPL=${COM_BASE}'/products/wave/station'
declare -rx COM_WAVE_GEMPAK_TMPL=${COM_BASE}'/products/wave/gempak'
declare -rx COM_WAVE_WMO_TMPL=${COM_BASE}'/products/wave/wmo'

declare -rx COM_OCEAN_HISTORY_TMPL=${COM_BASE}'/model_data/ocean/history'
declare -rx COM_OCEAN_RESTART_TMPL=${COM_BASE}'/model_data/ocean/restart'
declare -rx COM_OCEAN_INPUT_TMPL=${COM_BASE}'/model_data/ocean/input'
declare -rx COM_OCEAN_HISTORY_TMPL=${COM_BASE}'/model/ocean/history'
declare -rx COM_OCEAN_RESTART_TMPL=${COM_BASE}'/model/ocean/restart'
declare -rx COM_OCEAN_INPUT_TMPL=${COM_BASE}'/model/ocean/input'
declare -rx COM_OCEAN_ANALYSIS_TMPL=${COM_BASE}'/analysis/ocean'
declare -rx COM_OCEAN_BMATRIX_TMPL=${COM_BASE}'/bmatrix/ocean'
declare -rx COM_OCEAN_NETCDF_TMPL=${COM_BASE}'/products/ocean/netcdf'
Expand All @@ -89,14 +89,14 @@ declare -rx COM_OCEAN_GRIB_GRID_TMPL=${COM_OCEAN_GRIB_TMPL}'/${GRID}'

declare -rx COM_ICE_ANALYSIS_TMPL=${COM_BASE}'/analysis/ice'
declare -rx COM_ICE_BMATRIX_TMPL=${COM_BASE}'/bmatrix/ice'
declare -rx COM_ICE_INPUT_TMPL=${COM_BASE}'/model_data/ice/input'
declare -rx COM_ICE_HISTORY_TMPL=${COM_BASE}'/model_data/ice/history'
declare -rx COM_ICE_RESTART_TMPL=${COM_BASE}'/model_data/ice/restart'
declare -rx COM_ICE_INPUT_TMPL=${COM_BASE}'/model/ice/input'
declare -rx COM_ICE_HISTORY_TMPL=${COM_BASE}'/model/ice/history'
declare -rx COM_ICE_RESTART_TMPL=${COM_BASE}'/model/ice/restart'
declare -rx COM_ICE_NETCDF_TMPL=${COM_BASE}'/products/ice/netcdf'
declare -rx COM_ICE_GRIB_TMPL=${COM_BASE}'/products/ice/grib2'
declare -rx COM_ICE_GRIB_GRID_TMPL=${COM_ICE_GRIB_TMPL}'/${GRID}'

declare -rx COM_CHEM_HISTORY_TMPL=${COM_BASE}'/model_data/chem/history'
declare -rx COM_CHEM_HISTORY_TMPL=${COM_BASE}'/model/chem/history'
declare -rx COM_CHEM_ANALYSIS_TMPL=${COM_BASE}'/analysis/chem'

declare -rx COM_MED_RESTART_TMPL=${COM_BASE}'/model_data/med/restart'
declare -rx COM_MED_RESTART_TMPL=${COM_BASE}'/model/med/restart'
8 changes: 8 additions & 0 deletions parm/config/gfs/config.resources
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,14 @@ case ${machine} in
# shellcheck disable=SC2034
mem_node_max=""
;;
"AZUREPW")
export PARTITION_BATCH="compute"
npe_node_max=24
max_tasks_per_node=24
# TODO Supply a max mem/node value for AZURE
# shellcheck disable=SC2034
mem_node_max=""
;;
"CONTAINER")
max_tasks_per_node=1
# TODO Supply a max mem/node value for a container
Expand Down
1 change: 1 addition & 0 deletions parm/config/gfs/config.resources.AWSPW
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
# AWS-specific job resources

export is_exclusive="True"
export memory=None

# shellcheck disable=SC2312
for mem_var in $(env | grep '^memory_' | cut -d= -f1); do
Expand Down
11 changes: 11 additions & 0 deletions parm/config/gfs/config.resources.AZUREPW
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#! /usr/bin/env bash

# AZURE-specific job resources

export is_exclusive="True"
unset memory

# shellcheck disable=SC2312
for mem_var in $(env | grep '^memory_' | cut -d= -f1); do
unset "${mem_var}"
done
Loading

0 comments on commit 59c2002

Please sign in to comment.