From 11943e36ba12b3df49c51942da780698fab02d38 Mon Sep 17 00:00:00 2001 From: DavidBurrows-NCO <82525974+DavidBurrows-NCO@users.noreply.github.com> Date: Tue, 2 Jul 2024 12:58:10 -0400 Subject: [PATCH] Fix xml file setup and complete C48 ATM and S2SW runs for CI on Gaea (#2701) This PR sets up the ability on Gaea for auto generation of a clean xml file, i.e., an xml file that does not need any alterations before running rocoto. Refs #2572 Refs #2664 --- env/GAEA.env | 40 +++++++++++++++++++++++---- parm/config/gfs/config.base | 1 + parm/config/gfs/config.resources.GAEA | 5 ++++ sorc/link_workflow.sh | 2 +- workflow/hosts/gaea.yaml | 21 ++++++++------ workflow/rocoto/tasks.py | 2 ++ 6 files changed, 55 insertions(+), 16 deletions(-) diff --git a/env/GAEA.env b/env/GAEA.env index 5509a29a3f..d72be6ba22 100755 --- a/env/GAEA.env +++ b/env/GAEA.env @@ -12,24 +12,52 @@ step=$1 export launcher="srun -l --export=ALL" export mpmd_opt="--multi-prog --output=mpmd.%j.%t.out" +export OMP_STACKSIZE=2048000 +export NTHSTACK=1024000000 + ulimit -s unlimited ulimit -a -if [[ "${step}" = "fcst" ]]; then +if [[ "${step}" = "waveinit" ]]; then + + export CFP_MP="YES" + if [[ "${step}" = "waveprep" ]]; then export MP_PULSE=0 ; fi + export wavempexec=${launcher} + export wave_mpmd=${mpmd_opt} - ppn="npe_node_${step}_${RUN}" - [[ -z "${!ppn+0}" ]] && ppn="npe_node_${step}" - nprocs="npe_${step}_${RUN}" - [[ -z ${!nprocs+0} ]] && nprocs="npe_${step}" +elif [[ "${step}" = "fcst" ]]; then + if [[ "${CDUMP}" =~ "gfs" ]]; then + nprocs="npe_${step}_gfs" + ppn="npe_node_${step}_gfs" || ppn="npe_node_${step}" + else + nprocs="npe_${step}" + ppn="npe_node_${step}" + fi (( nnodes = (${!nprocs}+${!ppn}-1)/${!ppn} )) (( ntasks = nnodes*${!ppn} )) # With ESMF threading, the model wants to use the full node export APRUN_UFS="${launcher} -n ${ntasks}" unset nprocs ppn nnodes ntasks + elif [[ "${step}" = "atmos_products" ]]; then - export USE_CFP="YES" # Use MPMD for downstream product generation + export USE_CFP="YES" # Use MPMD for downstream product generation on Hera + +elif [[ "${step}" = "oceanice_products" ]]; then + + nth_max=$((npe_node_max / npe_node_oceanice_products)) + + export NTHREADS_OCNICEPOST=${nth_oceanice_products:-1} + export APRUN_OCNICEPOST="${launcher} -n 1 --cpus-per-task=${NTHREADS_OCNICEPOST}" + +elif [[ "${step}" = "fit2obs" ]]; then + + nth_max=$((npe_node_max / npe_node_fit2obs)) + + export NTHREADS_FIT2OBS=${nth_fit2obs:-1} + [[ ${NTHREADS_FIT2OBS} -gt ${nth_max} ]] && export NTHREADS_FIT2OBS=${nth_max} + export MPIRUN="${launcher} -n ${npe_fit2obs} --cpus-per-task=${NTHREADS_FIT2OBS}" fi diff --git a/parm/config/gfs/config.base b/parm/config/gfs/config.base index 9fd494a9eb..f78c7fb400 100644 --- a/parm/config/gfs/config.base +++ b/parm/config/gfs/config.base @@ -19,6 +19,7 @@ export QUEUE_SERVICE="@QUEUE_SERVICE@" export PARTITION_BATCH="@PARTITION_BATCH@" export PARTITION_SERVICE="@PARTITION_SERVICE@" export RESERVATION="@RESERVATION@" +export CLUSTERS="@CLUSTERS@" # Project to use in mass store: export HPSS_PROJECT="@HPSS_PROJECT@" diff --git a/parm/config/gfs/config.resources.GAEA b/parm/config/gfs/config.resources.GAEA index 64990b299f..3f0934edc2 100644 --- a/parm/config/gfs/config.resources.GAEA +++ b/parm/config/gfs/config.resources.GAEA @@ -20,3 +20,8 @@ case ${step} in ;; esac + +# shellcheck disable=SC2312 +for mem_var in $(env | grep '^memory_' | cut -d= -f1); do + unset "${mem_var}" +done diff --git a/sorc/link_workflow.sh b/sorc/link_workflow.sh index 4973ab8d7d..8694f856b5 100755 --- a/sorc/link_workflow.sh +++ b/sorc/link_workflow.sh @@ -75,7 +75,7 @@ case "${machine}" in "hercules") FIX_DIR="/work/noaa/global/glopara/fix" ;; "jet") FIX_DIR="/lfs4/HFIP/hfv3gfs/glopara/git/fv3gfs/fix" ;; "s4") FIX_DIR="/data/prod/glopara/fix" ;; - "gaea") FIX_DIR="/gpfs/f5/epic/proj-shared/global/glopara/data/fix" ;; + "gaea") FIX_DIR="/gpfs/f5/ufs-ard/world-shared/global/glopara/data/fix" ;; *) echo "FATAL: Unknown target machine ${machine}, couldn't set FIX_DIR" exit 1 diff --git a/workflow/hosts/gaea.yaml b/workflow/hosts/gaea.yaml index 7ca8420997..ff9877e77b 100644 --- a/workflow/hosts/gaea.yaml +++ b/workflow/hosts/gaea.yaml @@ -1,19 +1,22 @@ -BASE_GIT: '/gpfs/f5/epic/proj-shared/global/glopara/data/git' -DMPDIR: '/gpfs/f5/epic/proj-shared/global/glopara/data/dump' -BASE_CPLIC: '/gpfs/f5/epic/proj-shared/global/glopara/data/ICSDIR/prototype_ICs' -PACKAGEROOT: '/gpfs/f5/epic/proj-shared/global/glopara/data/nwpara' -COMROOT: '/gpfs/f5/epic/proj-shared/global/glopara/data/com' +BASE_GIT: '/gpfs/f5/ufs-ard/world-shared/global/glopara/data/git' +DMPDIR: '/gpfs/f5/ufs-ard/world-shared/global/glopara/data/dump' +BASE_CPLIC: '/gpfs/f5/ufs-ard/world-shared/global/glopara/data/ICSDIR/prototype_ICs' +PACKAGEROOT: '/gpfs/f5/ufs-ard/world-shared/global/glopara/data/nwpara' +COMROOT: '/gpfs/f5/ufs-ard/world-shared/global/glopara/data/com' COMINsyn: '${COMROOT}/gfs/prod/syndat' -HOMEDIR: '/gpfs/f5/epic/scratch/${USER}' -STMP: '/gpfs/f5/epic/scratch/${USER}' -PTMP: '/gpfs/f5/epic/scratch/${USER}' +HOMEDIR: '/gpfs/f5/ufs-ard/scratch/${USER}' +STMP: '/gpfs/f5/ufs-ard/scratch/${USER}' +PTMP: '/gpfs/f5/ufs-ard/scratch/${USER}' NOSCRUB: $HOMEDIR -ACCOUNT: epic +ACCOUNT: ufs-ard +ACCOUNT_SERVICE: ufs-ard SCHEDULER: slurm QUEUE: normal QUEUE_SERVICE: normal PARTITION_BATCH: batch PARTITION_SERVICE: batch +RESERVATION: '' +CLUSTERS: 'c5' CHGRP_RSTPROD: 'NO' CHGRP_CMD: 'chgrp rstprod' HPSSARCH: 'NO' diff --git a/workflow/rocoto/tasks.py b/workflow/rocoto/tasks.py index a126992cee..e18b45ef28 100644 --- a/workflow/rocoto/tasks.py +++ b/workflow/rocoto/tasks.py @@ -226,6 +226,8 @@ def get_resource(self, task_name): native = '--export=NONE' if task_config['RESERVATION'] != "": native += '' if task_name in Tasks.SERVICE_TASKS else ' --reservation=' + task_config['RESERVATION'] + if task_config['CLUSTERS'] != "": + native += ' --clusters=' + task_config['CLUSTERS'] queue = task_config['QUEUE_SERVICE'] if task_name in Tasks.SERVICE_TASKS else task_config['QUEUE']