Skip to content

Commit

Permalink
Add support for forecast-only runs on AWS (#2711)
Browse files Browse the repository at this point in the history
The purpose of this PR it to merge code that allowing global-workflow
run on AWS, first focus on ATM forecast only.

Resolves: #2709
---------

Co-authored-by: David Huber <[email protected]>
  • Loading branch information
weihuang-jedi and DavidHuber-NOAA committed Aug 13, 2024
1 parent 5699167 commit eba813f
Show file tree
Hide file tree
Showing 19 changed files with 169 additions and 82 deletions.
6 changes: 4 additions & 2 deletions env/AWSPW.env
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@ fi

step=$1

export launcher="mpiexec.hydra"
export mpmd_opt=""
export launcher="srun -l --export=ALL"
export mpmd_opt="--multi-prog --output=mpmd.%j.%t.out"

# Configure MPI environment
export OMP_STACKSIZE=2048000
Expand All @@ -35,6 +35,8 @@ fi

if [[ "${step}" = "fcst" ]] || [[ "${step}" = "efcs" ]]; then

export launcher="srun --mpi=pmi2 -l"

(( nnodes = (ntasks+tasks_per_node-1)/tasks_per_node ))
(( ufs_ntasks = nnodes*tasks_per_node ))
# With ESMF threading, the model wants to use the full node
Expand Down
49 changes: 49 additions & 0 deletions modulefiles/module_base.noaacloud.lua
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
help([[
Load environment to run GFS on noaacloud
]])

local spack_mod_path=(os.getenv("spack_mod_path") or "None")
prepend_path("MODULEPATH", spack_mod_path)

load(pathJoin("stack-intel", (os.getenv("stack_intel_ver") or "None")))
load(pathJoin("stack-intel-oneapi-mpi", (os.getenv("stack_impi_ver") or "None")))
load(pathJoin("python", (os.getenv("python_ver") or "None")))

load(pathJoin("jasper", (os.getenv("jasper_ver") or "None")))
load(pathJoin("libpng", (os.getenv("libpng_ver") or "None")))
load(pathJoin("cdo", (os.getenv("cdo_ver") or "None")))
--load(pathJoin("R", (os.getenv("R_ver") or "None")))

load(pathJoin("hdf5", (os.getenv("hdf5_ver") or "None")))
load(pathJoin("netcdf-c", (os.getenv("netcdf_c_ver") or "None")))
load(pathJoin("netcdf-fortran", (os.getenv("netcdf_fortran_ver") or "None")))

load(pathJoin("nco", (os.getenv("nco_ver") or "None")))
load(pathJoin("prod_util", (os.getenv("prod_util_ver") or "None")))
load(pathJoin("grib-util", (os.getenv("grib_util_ver") or "None")))
load(pathJoin("g2tmpl", (os.getenv("g2tmpl_ver") or "None")))
load(pathJoin("gsi-ncdiag", (os.getenv("gsi_ncdiag_ver") or "None")))
load(pathJoin("crtm", (os.getenv("crtm_ver") or "None")))
load(pathJoin("bufr", (os.getenv("bufr_ver") or "None")))
load(pathJoin("wgrib2", (os.getenv("wgrib2_ver") or "None")))
load(pathJoin("py-f90nml", (os.getenv("py_f90nml_ver") or "None")))
load(pathJoin("py-netcdf4", (os.getenv("py_netcdf4_ver") or "None")))
load(pathJoin("py-pyyaml", (os.getenv("py_pyyaml_ver") or "None")))
load(pathJoin("py-jinja2", (os.getenv("py_jinja2_ver") or "None")))
load(pathJoin("py-pandas", (os.getenv("py_pandas_ver") or "None")))
load(pathJoin("py-python-dateutil", (os.getenv("py_python_dateutil_ver") or "None")))
--load(pathJoin("met", (os.getenv("met_ver") or "None")))
--load(pathJoin("metplus", (os.getenv("metplus_ver") or "None")))
load(pathJoin("py-xarray", (os.getenv("py_xarray_ver") or "None")))

setenv("WGRIB2","wgrib2")
setenv("UTILROOT",(os.getenv("prod_util_ROOT") or "None"))

--prepend_path("MODULEPATH", pathJoin("/scratch1/NCEPDEV/global/glopara/git/prepobs/v" .. (os.getenv("prepobs_run_ver") or "None"), "modulefiles"))
--prepend_path("MODULEPATH", pathJoin("/scratch1/NCEPDEV/global/glopara/git/prepobs/feature-GFSv17_com_reorg_log_update/modulefiles"))
--load(pathJoin("prepobs", (os.getenv("prepobs_run_ver") or "None")))

--prepend_path("MODULEPATH", pathJoin("/scratch1/NCEPDEV/global/glopara/git/Fit2Obs/v" .. (os.getenv("fit2obs_ver") or "None"), "modulefiles"))
--load(pathJoin("fit2obs", (os.getenv("fit2obs_ver") or "None")))

whatis("Description: GFS run environment")
15 changes: 15 additions & 0 deletions modulefiles/module_gwci.noaacloud.lua
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
help([[
Load environment to run GFS workflow setup scripts on noaacloud
]])

prepend_path("MODULEPATH", "/contrib/spack-stack/spack-stack-1.6.0/envs/unified-env/install/modulefiles/Core")

load(pathJoin("stack-intel", os.getenv("2021.3.0")))
load(pathJoin("stack-intel-oneapi-mpi", os.getenv("2021.3.0")))

load(pathJoin("netcdf-c", os.getenv("4.9.2")))
load(pathJoin("netcdf-fortran", os.getenv("4.6.1")))
load(pathJoin("nccmp","1.9.0.1"))
load(pathJoin("wgrib2", "2.0.8"))

whatis("Description: GFS run setup CI environment")
20 changes: 20 additions & 0 deletions modulefiles/module_gwsetup.noaacloud.lua
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
help([[
Load environment to run GFS workflow setup scripts on noaacloud
]])

load(pathJoin("rocoto"))

prepend_path("MODULEPATH", "/contrib/spack-stack/spack-stack-1.6.0/envs/unified-env/install/modulefiles/Core")

local stack_intel_ver=os.getenv("stack_intel_ver") or "2021.3.0"
local python_ver=os.getenv("python_ver") or "3.10.3"

load(pathJoin("stack-intel", stack_intel_ver))
load(pathJoin("python", python_ver))
load("py-jinja2")
load("py-pyyaml")
load("py-numpy")
local git_ver=os.getenv("git_ver") or "1.8.3.1"
load(pathJoin("git", git_ver))

whatis("Description: GFS run setup environment")
8 changes: 8 additions & 0 deletions parm/config/gfs/config.base
Original file line number Diff line number Diff line change
Expand Up @@ -483,4 +483,12 @@ export OFFSET_START_HOUR=0
# Number of regional collectives to create soundings for
export NUM_SND_COLLECTIVES=${NUM_SND_COLLECTIVES:-9}

# The tracker, genesis, and METplus jobs are not supported on AWS yet
# TODO: we should place these in workflow/hosts/awspw.yaml as part of AWS setup, not for general.
if [[ "${machine}" == "AWSPW" ]]; then
export DO_TRACKER="NO"
export DO_GENESIS="NO"
export DO_METP="NO"
fi

echo "END: config.base"
3 changes: 2 additions & 1 deletion parm/config/gfs/config.resources
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,8 @@ case ${machine} in
;;
"AWSPW")
export PARTITION_BATCH="compute"
max_tasks_per_node=40
npe_node_max=36
max_tasks_per_node=36
# TODO Supply a max mem/node value for AWS
# shellcheck disable=SC2034
mem_node_max=""
Expand Down
10 changes: 10 additions & 0 deletions parm/config/gfs/config.resources.AWSPW
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#! /usr/bin/env bash

# AWS-specific job resources

export is_exclusive="True"

# shellcheck disable=SC2312
for mem_var in $(env | grep '^memory_' | cut -d= -f1); do
unset "${mem_var}"
done
2 changes: 1 addition & 1 deletion sorc/build_all.sh
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,7 @@ build_opts["ww3prepost"]="${_wave_opt} ${_verbose_opt} ${_build_ufs_opt} ${_buil

# Optional DA builds
if [[ "${_build_ufsda}" == "YES" ]]; then
if [[ "${MACHINE_ID}" != "orion" && "${MACHINE_ID}" != "hera" && "${MACHINE_ID}" != "hercules" && "${MACHINE_ID}" != "wcoss2" ]]; then
if [[ "${MACHINE_ID}" != "orion" && "${MACHINE_ID}" != "hera" && "${MACHINE_ID}" != "hercules" && "${MACHINE_ID}" != "wcoss2" && "${MACHINE_ID}" != "noaacloud" ]]; then
echo "NOTE: The GDAS App is not supported on ${MACHINE_ID}. Disabling build."
else
build_jobs["gdas"]=8
Expand Down
29 changes: 4 additions & 25 deletions sorc/build_ufs.sh
Original file line number Diff line number Diff line change
Expand Up @@ -41,30 +41,9 @@ COMPILE_NR=0
CLEAN_BEFORE=YES
CLEAN_AFTER=NO

if [[ "${MACHINE_ID}" != "noaacloud" ]]; then
BUILD_JOBS=${BUILD_JOBS:-8} ./tests/compile.sh "${MACHINE_ID}" "${MAKE_OPT}" "${COMPILE_NR}" "intel" "${CLEAN_BEFORE}" "${CLEAN_AFTER}"
mv "./tests/fv3_${COMPILE_NR}.exe" ./tests/ufs_model.x
mv "./tests/modules.fv3_${COMPILE_NR}.lua" ./tests/modules.ufs_model.lua
cp "./modulefiles/ufs_common.lua" ./tests/ufs_common.lua
else

if [[ "${PW_CSP:-}" == "aws" ]]; then
set +x
# TODO: This will need to be addressed further when the EPIC stacks are available/supported.
module use /contrib/spack-stack/envs/ufswm/install/modulefiles/Core
module load stack-intel
module load stack-intel-oneapi-mpi
module load ufs-weather-model-env/1.0.0
# TODO: It is still uncertain why this is the only module that is
# missing; check the spack build as this needed to be added manually.
module load w3emc/2.9.2 # TODO: This has similar issues for the EPIC stack.
module list
set -x
fi

export CMAKE_FLAGS="${MAKE_OPT}"
BUILD_JOBS=${BUILD_JOBS:-8} ./build.sh
mv "${cwd}/ufs_model.fd/build/ufs_model" "${cwd}/ufs_model.fd/tests/ufs_model.x"
fi
BUILD_JOBS=${BUILD_JOBS:-8} ./tests/compile.sh "${MACHINE_ID}" "${MAKE_OPT}" "${COMPILE_NR}" "intel" "${CLEAN_BEFORE}" "${CLEAN_AFTER}"
mv "./tests/fv3_${COMPILE_NR}.exe" ./tests/ufs_model.x
mv "./tests/modules.fv3_${COMPILE_NR}.lua" ./tests/modules.ufs_model.lua
cp "./modulefiles/ufs_common.lua" ./tests/ufs_common.lua

exit 0
1 change: 1 addition & 0 deletions sorc/link_workflow.sh
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ case "${machine}" in
"jet") FIX_DIR="/lfs4/HFIP/hfv3gfs/glopara/git/fv3gfs/fix" ;;
"s4") FIX_DIR="/data/prod/glopara/fix" ;;
"gaea") FIX_DIR="/gpfs/f5/ufs-ard/world-shared/global/glopara/data/fix" ;;
"noaacloud") FIX_DIR="/contrib/global-workflow-shared-data/fix" ;;
*)
echo "FATAL: Unknown target machine ${machine}, couldn't set FIX_DIR"
exit 1
Expand Down
2 changes: 1 addition & 1 deletion ush/load_fv3gfs_modules.sh
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ source "${HOMEgfs}/versions/run.ver"
module use "${HOMEgfs}/modulefiles"

case "${MACHINE_ID}" in
"wcoss2" | "hera" | "orion" | "hercules" | "gaea" | "jet" | "s4")
"wcoss2" | "hera" | "orion" | "hercules" | "gaea" | "jet" | "s4" | "noaacloud")
module load "module_base.${MACHINE_ID}"
;;
*)
Expand Down
45 changes: 13 additions & 32 deletions ush/load_ufswm_modules.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,40 +11,21 @@ ulimit_s=$( ulimit -S -s )

source "${HOMEgfs}/ush/detect_machine.sh"
source "${HOMEgfs}/ush/module-setup.sh"
if [[ "${MACHINE_ID}" != "noaacloud" ]]; then
module use "${HOMEgfs}/sorc/ufs_model.fd/modulefiles"
module load "ufs_${MACHINE_ID}.intel"
module load prod_util
if [[ "${MACHINE_ID}" = "wcoss2" ]]; then
module load cray-pals
module load cfp
module load libjpeg
module load craype-network-ucx
module load cray-mpich-ucx
else
export UTILROOT=${prod_util_ROOT}
fi
module load wgrib2
export WGRIB2=wgrib2
fi
if [[ "${MACHINE_ID}" == "noaacloud" ]]; then
if [[ "${PW_CSP:-}" = "aws" ]]; then

# TODO: This can be cleaned-up; most of this is a hack for now.
module use "/contrib/spack-stack/envs/ufswm/install/modulefiles/Core"
module load "stack-intel"
module load "stack-intel-oneapi-mpi"
module use -a "/contrib/spack-stack/miniconda/modulefiles/miniconda/"
module load "py39_4.12.0"
module load "ufs-weather-model-env/1.0.0"
export NETCDF="/contrib/spack-stack/miniconda/apps/miniconda/py39_4.12.0"
# TODO: Are there plans for EPIC to maintain this package or should GW provide support?
export UTILROOT="/contrib/global-workflow/NCEPLIBS-prod_util"
export PATH="${PATH}:/contrib/global-workflow/bin"
ndate_path="$(command -v ndate)"
export NDATE="${ndate_path}"
fi
module use "${HOMEgfs}/sorc/ufs_model.fd/modulefiles"
module load "ufs_${MACHINE_ID}.intel"
module load prod_util
if [[ "${MACHINE_ID}" = "wcoss2" ]]; then
module load cray-pals
module load cfp
module load libjpeg
module load craype-network-ucx
module load cray-mpich-ucx
else
export UTILROOT=${prod_util_ROOT}
fi
module load wgrib2
export WGRIB2=wgrib2

module list
unset MACHINE_ID
Expand Down
6 changes: 2 additions & 4 deletions ush/module-setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -92,10 +92,8 @@ elif [[ ${MACHINE_ID} = discover* ]]; then
# TODO: This can likely be made more general once other cloud
# platforms come online.
elif [[ ${MACHINE_ID} = "noaacloud" ]]; then

export SPACK_ROOT=/contrib/global-workflow/spack-stack/spack
export PATH=${PATH}:${SPACK_ROOT}/bin
. "${SPACK_ROOT}"/share/spack/setup-env.sh
# We are on NOAA Cloud
module purge

else
echo WARNING: UNKNOWN PLATFORM 1>&2
Expand Down
5 changes: 5 additions & 0 deletions versions/build.noaacloud.ver
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
export stack_intel_ver=2021.3.0
export stack_impi_ver=2021.3.0
export spack_env=gsi-addon-env
source "${HOMEgfs:-}/versions/build.spack.ver"
export spack_mod_path="/contrib/spack-stack/spack-stack-${spack_stack_ver}/envs/gsi-addon-env/install/modulefiles/Core"
8 changes: 8 additions & 0 deletions versions/run.noaacloud.ver
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
export stack_intel_ver=2021.3.0
export stack_impi_ver=2021.3.0
export spack_env=gsi-addon-env

source "${HOMEgfs:-}/versions/run.spack.ver"
export spack_mod_path="/contrib/spack-stack/spack-stack-${spack_stack_ver}/envs/gsi-addon-env/install/modulefiles/Core"

export cdo_ver=2.2.0
2 changes: 1 addition & 1 deletion workflow/hosts.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def detect(cls):
elif container is not None:
machine = 'CONTAINER'
elif pw_csp is not None:
if pw_csp.lower() not in ['azure', 'aws', 'gcp']:
if pw_csp.lower() not in ['azure', 'aws', 'google']:
raise ValueError(
f'NOAA cloud service provider "{pw_csp}" is not supported.')
machine = f"{pw_csp.upper()}PW"
Expand Down
21 changes: 11 additions & 10 deletions workflow/hosts/awspw.yaml
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
BASE_GIT: '/scratch1/NCEPDEV/global/glopara/git' #TODO: This does not yet exist.
DMPDIR: '/scratch1/NCEPDEV/global/glopara/dump' # TODO: This does not yet exist.
PACKAGEROOT: '/scratch1/NCEPDEV/global/glopara/nwpara' #TODO: This does not yet exist.
COMINsyn: '/scratch1/NCEPDEV/global/glopara/com/gfs/prod/syndat' #TODO: This does not yet exist.
BASE_GIT: '' #TODO: This does not yet exist.
DMPDIR: '' # TODO: This does not yet exist.
PACKAGEROOT: '' #TODO: This does not yet exist.
COMINsyn: '' #TODO: This does not yet exist.
HOMEDIR: '/contrib/${USER}'
STMP: '/lustre/${USER}/stmp2/'
PTMP: '/lustre/${USER}/stmp4/'
NOSCRUB: ${HOMEDIR}
ACCOUNT: hwufscpldcld
STMP: '/lustre/${USER}/stmp/'
PTMP: '/lustre/${USER}/ptmp/'
NOSCRUB: '${HOMEDIR}'
ACCOUNT: '${USER}'
SCHEDULER: slurm
QUEUE: batch
QUEUE_SERVICE: batch
Expand All @@ -16,10 +16,11 @@ RESERVATION: ''
CLUSTERS: ''
CHGRP_RSTPROD: 'YES'
CHGRP_CMD: 'chgrp rstprod' # TODO: This is not yet supported.
HPSSARCH: 'YES'
HPSSARCH: 'NO'
HPSS_PROJECT: emc-global #TODO: See `ATARDIR` below.
BASE_CPLIC: '/bucket/global-workflow-shared-data/ICSDIR/prototype_ICs'
LOCALARCH: 'NO'
ATARDIR: '/NCEPDEV/${HPSS_PROJECT}/1year/${USER}/${machine}/scratch/${PSLOT}' # TODO: This will not yet work from AWS.
ATARDIR: '' # TODO: This will not yet work from AWS.
MAKE_NSSTBUFR: 'NO'
MAKE_ACFTBUFR: 'NO'
SUPPORTED_RESOLUTIONS: ['C48', 'C96'] # TODO: Test and support all cubed-sphere resolutions.
5 changes: 4 additions & 1 deletion workflow/rocoto/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,7 +209,10 @@ def get_resource(self, task_name):
else:
native += ':shared'
elif scheduler in ['slurm']:
native = '--export=NONE'
if task_config.get('is_exclusive', False):
native = '--exclusive'
else:
native = '--export=NONE'
if task_config['RESERVATION'] != "":
native += '' if task_name in Tasks.SERVICE_TASKS else ' --reservation=' + task_config['RESERVATION']
if task_config.get('CLUSTERS', "") not in ["", '@CLUSTERS@']:
Expand Down
14 changes: 10 additions & 4 deletions workflow/rocoto/workflow_xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,10 +157,16 @@ def _write_crontab(self, crontab_file: str = None, cronint: int = 5) -> None:

strings = ['',
f'#################### {pslot} ####################',
f'MAILTO="{replyto}"',
f'{cronintstr} {rocotorunstr}',
'#################################################################',
'']
f'MAILTO="{replyto}"'
]
# AWS need 'SHELL', and 'BASH_ENV' defined, or, the crontab job won't start.
if os.environ.get('PW_CSP', None) in ['aws', 'azure', 'google']:
strings.extend([f'SHELL="/bin/bash"',
f'BASH_ENV="/etc/bashrc"'
])
strings.extend([f'{cronintstr} {rocotorunstr}',
'#################################################################',
''])

if crontab_file is None:
crontab_file = f"{expdir}/{pslot}.crontab"
Expand Down

0 comments on commit eba813f

Please sign in to comment.