Skip to content

Commit

Permalink
Add AWS functionality to workflow setup and forecast job (#1667)
Browse files Browse the repository at this point in the history
  • Loading branch information
HenryRWinterbottom committed Jun 18, 2023
1 parent 53a88c3 commit 38cd0bc
Show file tree
Hide file tree
Showing 7 changed files with 227 additions and 11 deletions.
137 changes: 137 additions & 0 deletions env/AWSPW.env
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
#! /usr/bin/env bash

if [[ $# -ne 1 ]]; then

echo "Must specify an input argument to set runtime environment variables!"
echo "argument can be any one of the following:"
echo "atmanlrun atmensanlrun aeroanlrun landanlrun"
echo "anal sfcanl fcst post vrfy metp"
echo "eobs eupd ecen efcs epos"
echo "postsnd awips gempak"
exit 1

fi

step=$1

export npe_node_max=36
export launcher="mpiexec.hydra"
export mpmd_opt=""

# Configure MPI environment
export OMP_STACKSIZE=2048000
export NTHSTACK=1024000000

ulimit -s unlimited
ulimit -a

if [[ "${step}" = "fcst" ]] || [[ "${step}" = "efcs" ]]; then

if [[ "${CDUMP}" =~ "gfs" ]]; then
nprocs="npe_${step}_gfs"
ppn="npe_node_${step}_gfs" || ppn="npe_node_${step}"
else
nprocs="npe_${step}"
ppn="npe_node_${step}"
fi
(( nnodes = (${!nprocs}+${!ppn}-1)/${!ppn} ))
(( ntasks = nnodes*${!ppn} ))
# With ESMF threading, the model wants to use the full node
export APRUN_UFS="${launcher} -n ${ntasks}"
unset nprocs ppn nnodes ntasks

elif [[ "${step}" = "post" ]]; then

nth_max=$((npe_node_max / npe_node_post))

export NTHREADS_NP=${nth_np:-1}
[[ ${NTHREADS_NP} -gt ${nth_max} ]] && export NTHREADS_NP=${nth_max}
export APRUN_NP="${launcher} -n ${npe_post}"

export NTHREADS_DWN=${nth_dwn:-1}
[[ ${NTHREADS_DWN} -gt ${nth_max} ]] && export NTHREADS_DWN=${nth_max}
export APRUN_DWN="${launcher} -n ${npe_dwn}"

elif [[ "${step}" = "ecen" ]]; then

nth_max=$((npe_node_max / npe_node_ecen))

export NTHREADS_ECEN=${nth_ecen:-${nth_max}}
[[ ${NTHREADS_ECEN} -gt ${nth_max} ]] && export NTHREADS_ECEN=${nth_max}
export APRUN_ECEN="${launcher} -n ${npe_ecen}"

export NTHREADS_CHGRES=${nth_chgres:-12}
[[ ${NTHREADS_CHGRES} -gt ${npe_node_max} ]] && export NTHREADS_CHGRES=${npe_node_max}
export APRUN_CHGRES="time"

export NTHREADS_CALCINC=${nth_calcinc:-1}
[[ ${NTHREADS_CALCINC} -gt ${nth_max} ]] && export NTHREADS_CALCINC=${nth_max}
export APRUN_CALCINC="${launcher} -n ${npe_ecen}"

elif [[ "${step}" = "esfc" ]]; then

nth_max=$((npe_node_max / npe_node_esfc))

export NTHREADS_ESFC=${nth_esfc:-${nth_max}}
[[ ${NTHREADS_ESFC} -gt ${nth_max} ]] && export NTHREADS_ESFC=${nth_max}
export APRUN_ESFC="${launcher} -n ${npe_esfc}"

export NTHREADS_CYCLE=${nth_cycle:-14}
[[ ${NTHREADS_CYCLE} -gt ${npe_node_max} ]] && export NTHREADS_CYCLE=${npe_node_max}
export APRUN_CYCLE="${launcher} -n ${npe_esfc}"

elif [[ "${step}" = "epos" ]]; then

nth_max=$((npe_node_max / npe_node_epos))

export NTHREADS_EPOS=${nth_epos:-${nth_max}}
[[ ${NTHREADS_EPOS} -gt ${nth_max} ]] && export NTHREADS_EPOS=${nth_max}
export APRUN_EPOS="${launcher} -n ${npe_epos}"

elif [[ "${step}" = "postsnd" ]]; then

export CFP_MP="YES"

nth_max=$((npe_node_max / npe_node_postsnd))

export NTHREADS_POSTSND=${nth_postsnd:-1}
[[ ${NTHREADS_POSTSND} -gt ${nth_max} ]] && export NTHREADS_POSTSND=${nth_max}
export APRUN_POSTSND="${launcher} -n ${npe_postsnd}"

export NTHREADS_POSTSNDCFP=${nth_postsndcfp:-1}
[[ ${NTHREADS_POSTSNDCFP} -gt ${nth_max} ]] && export NTHREADS_POSTSNDCFP=${nth_max}
export APRUN_POSTSNDCFP="${launcher} -n ${npe_postsndcfp} ${mpmd_opt}"

elif [[ "${step}" = "awips" ]]; then

nth_max=$((npe_node_max / npe_node_awips))

export NTHREADS_AWIPS=${nth_awips:-2}
[[ ${NTHREADS_AWIPS} -gt ${nth_max} ]] && export NTHREADS_AWIPS=${nth_max}
export APRUN_AWIPSCFP="${launcher} -n ${npe_awips} ${mpmd_opt}"

elif [[ "${step}" = "gempak" ]]; then

export CFP_MP="YES"

if [[ ${CDUMP} == "gfs" ]]; then
npe_gempak=${npe_gempak_gfs}
npe_node_gempak=${npe_node_gempak_gfs}
fi

nth_max=$((npe_node_max / npe_node_gempak))

export NTHREADS_GEMPAK=${nth_gempak:-1}
[[ ${NTHREADS_GEMPAK} -gt ${nth_max} ]] && export NTHREADS_GEMPAK=${nth_max}
export APRUN="${launcher} -n ${npe_gempak} ${mpmd_opt}"


elif [[ "${step}" = "fit2obs" ]]; then

nth_max=$((npe_node_max / npe_node_fit2obs))

export NTHREADS_FIT2OBS=${nth_fit2obs:-1}
[[ ${NTHREADS_FIT2OBS} -gt ${nth_max} ]] && export NTHREADS_FIT2OBS=${nth_max}
export MPIRUN="${launcher} -n ${npe_fit2obs}"

fi
26 changes: 23 additions & 3 deletions jobs/rocoto/fcst.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,10 @@ source "${HOMEgfs}/ush/preamble.sh"
source "${HOMEgfs}/ush/detect_machine.sh"
set +x
source "${HOMEgfs}/ush/module-setup.sh"
module use "${HOMEgfs}/sorc/ufs_model.fd/tests"
module load modules.ufs_model.lua
module load prod_util
if [[ "${MACHINE_ID}" != "noaacloud" ]]; then
module use "${HOMEgfs}/sorc/ufs_model.fd/tests"
fi

if [[ "${MACHINE_ID}" = "wcoss2" ]]; then
module load cray-pals
fi
Expand All @@ -30,6 +31,25 @@ if [[ "${MACHINE_ID}" = "hera" ]]; then
#elif [[ "${MACHINE_ID}" = "wcoss2" ]]; then
# module load "python/3.7.5"
fi
if [[ "${MACHINE_ID}" == "noaacloud" ]]; then
if [[ "${PW_CSP:-}" = "aws" ]]; then

# TODO: This can be cleaned-up; most of this is a hack for now.
module use "/contrib/spack-stack/envs/ufswm/install/modulefiles/Core"
module load "stack-intel"
module load "stack-intel-oneapi-mpi"
module use -a "/contrib/spack-stack/miniconda/modulefiles/miniconda/"
module load "py39_4.12.0"
module load "ufs-weather-model-env/1.0.0"
export NETCDF="/contrib/spack-stack/miniconda/apps/miniconda/py39_4.12.0"
# TODO: Are there plans for EPIC to maintain this package or should GW provide support?
export UTILROOT="/contrib/global-workflow/NCEPLIBS-prod_util"
export PATH="${PATH}:/contrib/global-workflow/bin"
ndate_path="$(command -v ndate)"
export NDATE="${ndate_path}"
fi
fi

module list
unset MACHINE_ID
set_trace
Expand Down
28 changes: 23 additions & 5 deletions sorc/build_ufs.sh
Original file line number Diff line number Diff line change
Expand Up @@ -32,10 +32,28 @@ COMPILE_NR=0
CLEAN_BEFORE=YES
CLEAN_AFTER=NO

./tests/compile.sh "${MACHINE_ID}" "${MAKE_OPT}" "${COMPILE_NR}" "intel" "${CLEAN_BEFORE}" "${CLEAN_AFTER}"
mv "./tests/fv3_${COMPILE_NR}.exe" ./tests/ufs_model.x
mv "./tests/modules.fv3_${COMPILE_NR}.lua" ./tests/modules.ufs_model.lua
cp "./modulefiles/ufs_common.lua" ./tests/ufs_common.lua
cp "./modulefiles/ufs_common_spack.lua" ./tests/ufs_common_spack.lua
if [[ "${MACHINE_ID}" != "noaacloud" ]]; then
./tests/compile.sh "${MACHINE_ID}" "${MAKE_OPT}" "${COMPILE_NR}" "intel" "${CLEAN_BEFORE}" "${CLEAN_AFTER}"
mv "./tests/fv3_${COMPILE_NR}.exe" ./tests/ufs_model.x
mv "./tests/modules.fv3_${COMPILE_NR}.lua" ./tests/modules.ufs_model.lua
cp "./modulefiles/ufs_common.lua" ./tests/ufs_common.lua
else

if [[ "${PW_CSP}" == "aws" ]]; then
# TODO: This will need to be addressed further when the EPIC stacks are available/supported.
module use /contrib/spack-stack/envs/ufswm/install/modulefiles/Core
module load stack-intel
module load stack-intel-oneapi-mpi
module load ufs-weather-model-env/1.0.0
# TODO: It is still uncertain why this is the only module that is
# missing; check the spack build as this needed to be added manually.
module load w3emc/2.9.2 # TODO: This has similar issues for the EPIC stack.
module list
fi

export CMAKE_FLAGS="${MAKE_OPT}"
./build.sh
mv "${cwd}/ufs_model.fd/build/ufs_model" "${cwd}/ufs_model.fd/tests/ufs_model.x"
fi

exit 0
7 changes: 7 additions & 0 deletions ush/detect_machine.sh
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,13 @@ case $(hostname -f) in
*) MACHINE_ID=UNKNOWN ;; # Unknown platform
esac

if [[ ${MACHINE_ID} == "UNKNOWN" ]]; then
case ${PW_CSP:-} in
"aws" | "google" | "azure") MACHINE_ID=noaacloud ;;
*) PW_CSP="UNKNOWN"
esac
fi

# Overwrite auto-detect with MACHINE if set
MACHINE_ID=${MACHINE:-${MACHINE_ID}}

Expand Down
8 changes: 8 additions & 0 deletions ush/module-setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,14 @@ elif [[ ${MACHINE_ID} = discover* ]]; then
export PATH=${PATH}:${SPACK_ROOT}/bin
. "${SPACK_ROOT}"/share/spack/setup-env.sh

# TODO: This can likely be made more general once other cloud
# platforms come online.
elif [[ ${MACHINE_ID} = "noaacloud" ]]; then

export SPACK_ROOT=/contrib/global-workflow/spack-stack/spack
export PATH=${PATH}:${SPACK_ROOT}/bin
. "${SPACK_ROOT}"/share/spack/setup-env.sh

else
echo WARNING: UNKNOWN PLATFORM 1>&2
fi
8 changes: 5 additions & 3 deletions workflow/hosts.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,15 @@ class Host:
"""

SUPPORTED_HOSTS = ['HERA', 'ORION', 'JET',
'WCOSS2', 'S4', 'CONTAINER']
'WCOSS2', 'S4', 'CONTAINER', 'AWSPW']

def __init__(self, host=None):

detected_host = self.detect()

if host is not None and host != detected_host:
raise ValueError(f'detected host: "{detected_host}" does not match host: "{host}"')
raise ValueError(
f'detected host: "{detected_host}" does not match host: "{host}"')

self.machine = detected_host
self.info = self._get_info
Expand Down Expand Up @@ -57,7 +58,8 @@ def detect(cls):
@property
def _get_info(self) -> dict:

hostfile = Path(os.path.join(os.path.dirname(__file__), f'hosts/{self.machine.lower()}.yaml'))
hostfile = Path(os.path.join(os.path.dirname(__file__),
f'hosts/{self.machine.lower()}.yaml'))
try:
info = YAMLFile(path=hostfile)
except FileNotFoundError:
Expand Down
24 changes: 24 additions & 0 deletions workflow/hosts/awspw.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
BASE_GIT: '/scratch1/NCEPDEV/global/glopara/git' #TODO: This does not yet exist.
DMPDIR: '/scratch1/NCEPDEV/global/glopara/dump' # TODO: This does not yet exist.
PACKAGEROOT: '/scratch1/NCEPDEV/global/glopara/nwpara' #TODO: This does not yet exist.
COMROOT: '/scratch1/NCEPDEV/global/glopara/com' #TODO: This does not yet exist.
COMINsyn: '${COMROOT}/gfs/prod/syndat' #TODO: This does not yet exist.
HOMEDIR: '/contrib/${USER}'
STMP: '/lustre/${USER}/stmp2/'
PTMP: '/lustre/${USER}/stmp4/'
NOSCRUB: $HOMEDIR
ACCOUNT: hwufscpldcld
SCHEDULER: slurm
QUEUE: batch
QUEUE_SERVICE: batch
PARTITION_BATCH: compute
PARTITION_SERVICE: compute
CHGRP_RSTPROD: 'YES'
CHGRP_CMD: 'chgrp rstprod' # TODO: This is not yet supported.
HPSSARCH: 'YES'
HPSS_PROJECT: emc-global #TODO: See `ATARDIR` below.
LOCALARCH: 'NO'
ATARDIR: '/NCEPDEV/${HPSS_PROJECT}/1year/${USER}/${machine}/scratch/${PSLOT}' # TODO: This will not yet work from AWS.
MAKE_NSSTBUFR: 'NO'
MAKE_ACFTBUFR: 'NO'
SUPPORTED_RESOLUTIONS: ['C48'] # TODO: Test and support all cubed-sphere resolutions.

0 comments on commit 38cd0bc

Please sign in to comment.