diff --git a/env/AWSPW.env b/env/AWSPW.env index 867b9220ba..992281a1d7 100755 --- a/env/AWSPW.env +++ b/env/AWSPW.env @@ -9,8 +9,8 @@ fi step=$1 -export launcher="mpiexec.hydra" -export mpmd_opt="" +export launcher="srun -l --export=ALL" +export mpmd_opt="--multi-prog --output=mpmd.%j.%t.out" # Configure MPI environment export OMP_STACKSIZE=2048000 @@ -35,6 +35,8 @@ fi if [[ "${step}" = "fcst" ]] || [[ "${step}" = "efcs" ]]; then + export launcher="srun --mpi=pmi2 -l" + (( nnodes = (ntasks+tasks_per_node-1)/tasks_per_node )) (( ufs_ntasks = nnodes*tasks_per_node )) # With ESMF threading, the model wants to use the full node diff --git a/modulefiles/module_base.noaacloud.lua b/modulefiles/module_base.noaacloud.lua new file mode 100644 index 0000000000..7997b618e4 --- /dev/null +++ b/modulefiles/module_base.noaacloud.lua @@ -0,0 +1,49 @@ +help([[ +Load environment to run GFS on noaacloud +]]) + +local spack_mod_path=(os.getenv("spack_mod_path") or "None") +prepend_path("MODULEPATH", spack_mod_path) + +load(pathJoin("stack-intel", (os.getenv("stack_intel_ver") or "None"))) +load(pathJoin("stack-intel-oneapi-mpi", (os.getenv("stack_impi_ver") or "None"))) +load(pathJoin("python", (os.getenv("python_ver") or "None"))) + +load(pathJoin("jasper", (os.getenv("jasper_ver") or "None"))) +load(pathJoin("libpng", (os.getenv("libpng_ver") or "None"))) +load(pathJoin("cdo", (os.getenv("cdo_ver") or "None"))) +--load(pathJoin("R", (os.getenv("R_ver") or "None"))) + +load(pathJoin("hdf5", (os.getenv("hdf5_ver") or "None"))) +load(pathJoin("netcdf-c", (os.getenv("netcdf_c_ver") or "None"))) +load(pathJoin("netcdf-fortran", (os.getenv("netcdf_fortran_ver") or "None"))) + +load(pathJoin("nco", (os.getenv("nco_ver") or "None"))) +load(pathJoin("prod_util", (os.getenv("prod_util_ver") or "None"))) +load(pathJoin("grib-util", (os.getenv("grib_util_ver") or "None"))) +load(pathJoin("g2tmpl", (os.getenv("g2tmpl_ver") or "None"))) +load(pathJoin("gsi-ncdiag", (os.getenv("gsi_ncdiag_ver") 
or "None"))) +load(pathJoin("crtm", (os.getenv("crtm_ver") or "None"))) +load(pathJoin("bufr", (os.getenv("bufr_ver") or "None"))) +load(pathJoin("wgrib2", (os.getenv("wgrib2_ver") or "None"))) +load(pathJoin("py-f90nml", (os.getenv("py_f90nml_ver") or "None"))) +load(pathJoin("py-netcdf4", (os.getenv("py_netcdf4_ver") or "None"))) +load(pathJoin("py-pyyaml", (os.getenv("py_pyyaml_ver") or "None"))) +load(pathJoin("py-jinja2", (os.getenv("py_jinja2_ver") or "None"))) +load(pathJoin("py-pandas", (os.getenv("py_pandas_ver") or "None"))) +load(pathJoin("py-python-dateutil", (os.getenv("py_python_dateutil_ver") or "None"))) +--load(pathJoin("met", (os.getenv("met_ver") or "None"))) +--load(pathJoin("metplus", (os.getenv("metplus_ver") or "None"))) +load(pathJoin("py-xarray", (os.getenv("py_xarray_ver") or "None"))) + +setenv("WGRIB2","wgrib2") +setenv("UTILROOT",(os.getenv("prod_util_ROOT") or "None")) + +--prepend_path("MODULEPATH", pathJoin("/scratch1/NCEPDEV/global/glopara/git/prepobs/v" .. (os.getenv("prepobs_run_ver") or "None"), "modulefiles")) +--prepend_path("MODULEPATH", pathJoin("/scratch1/NCEPDEV/global/glopara/git/prepobs/feature-GFSv17_com_reorg_log_update/modulefiles")) +--load(pathJoin("prepobs", (os.getenv("prepobs_run_ver") or "None"))) + +--prepend_path("MODULEPATH", pathJoin("/scratch1/NCEPDEV/global/glopara/git/Fit2Obs/v" .. 
(os.getenv("fit2obs_ver") or "None"), "modulefiles")) +--load(pathJoin("fit2obs", (os.getenv("fit2obs_ver") or "None"))) + +whatis("Description: GFS run environment") diff --git a/modulefiles/module_gwci.noaacloud.lua b/modulefiles/module_gwci.noaacloud.lua new file mode 100644 index 0000000000..c3142cd60d --- /dev/null +++ b/modulefiles/module_gwci.noaacloud.lua @@ -0,0 +1,15 @@ +help([[ +Load environment to run GFS workflow setup scripts on noaacloud +]]) + +prepend_path("MODULEPATH", "/contrib/spack-stack/spack-stack-1.6.0/envs/unified-env/install/modulefiles/Core") + +load(pathJoin("stack-intel", os.getenv("2021.3.0"))) +load(pathJoin("stack-intel-oneapi-mpi", os.getenv("2021.3.0"))) + +load(pathJoin("netcdf-c", os.getenv("4.9.2"))) +load(pathJoin("netcdf-fortran", os.getenv("4.6.1"))) +load(pathJoin("nccmp","1.9.0.1")) +load(pathJoin("wgrib2", "2.0.8")) + +whatis("Description: GFS run setup CI environment") diff --git a/modulefiles/module_gwsetup.noaacloud.lua b/modulefiles/module_gwsetup.noaacloud.lua new file mode 100644 index 0000000000..f3845e8d72 --- /dev/null +++ b/modulefiles/module_gwsetup.noaacloud.lua @@ -0,0 +1,20 @@ +help([[ +Load environment to run GFS workflow setup scripts on noaacloud +]]) + +load(pathJoin("rocoto")) + +prepend_path("MODULEPATH", "/contrib/spack-stack/spack-stack-1.6.0/envs/unified-env/install/modulefiles/Core") + +local stack_intel_ver=os.getenv("stack_intel_ver") or "2021.3.0" +local python_ver=os.getenv("python_ver") or "3.10.3" + +load(pathJoin("stack-intel", stack_intel_ver)) +load(pathJoin("python", python_ver)) +load("py-jinja2") +load("py-pyyaml") +load("py-numpy") +local git_ver=os.getenv("git_ver") or "1.8.3.1" +load(pathJoin("git", git_ver)) + +whatis("Description: GFS run setup environment") diff --git a/parm/config/gfs/config.base b/parm/config/gfs/config.base index 2a7ffab0dd..e6a626cfe3 100644 --- a/parm/config/gfs/config.base +++ b/parm/config/gfs/config.base @@ -483,4 +483,12 @@ export OFFSET_START_HOUR=0 # 
Number of regional collectives to create soundings for export NUM_SND_COLLECTIVES=${NUM_SND_COLLECTIVES:-9} +# The tracker, genesis, and METplus jobs are not supported on AWS yet +# TODO: we should place these in workflow/hosts/awspw.yaml as part of AWS setup, not for general. +if [[ "${machine}" == "AWSPW" ]]; then + export DO_TRACKER="NO" + export DO_GENESIS="NO" + export DO_METP="NO" +fi + echo "END: config.base" diff --git a/parm/config/gfs/config.resources b/parm/config/gfs/config.resources index a596629e76..cec2aef238 100644 --- a/parm/config/gfs/config.resources +++ b/parm/config/gfs/config.resources @@ -106,7 +106,8 @@ case ${machine} in ;; "AWSPW") export PARTITION_BATCH="compute" - max_tasks_per_node=40 + npe_node_max=36 + max_tasks_per_node=36 # TODO Supply a max mem/node value for AWS # shellcheck disable=SC2034 mem_node_max="" diff --git a/parm/config/gfs/config.resources.AWSPW b/parm/config/gfs/config.resources.AWSPW new file mode 100644 index 0000000000..8649713bb7 --- /dev/null +++ b/parm/config/gfs/config.resources.AWSPW @@ -0,0 +1,10 @@ +#! /usr/bin/env bash + +# AWS-specific job resources + +export is_exclusive="True" + +# shellcheck disable=SC2312 +for mem_var in $(env | grep '^memory_' | cut -d= -f1); do + unset "${mem_var}" +done diff --git a/sorc/build_all.sh b/sorc/build_all.sh index 28f52fd306..b6c4e6cc1c 100755 --- a/sorc/build_all.sh +++ b/sorc/build_all.sh @@ -145,7 +145,7 @@ build_opts["ww3prepost"]="${_wave_opt} ${_verbose_opt} ${_build_ufs_opt} ${_buil # Optional DA builds if [[ "${_build_ufsda}" == "YES" ]]; then - if [[ "${MACHINE_ID}" != "orion" && "${MACHINE_ID}" != "hera" && "${MACHINE_ID}" != "hercules" && "${MACHINE_ID}" != "wcoss2" ]]; then + if [[ "${MACHINE_ID}" != "orion" && "${MACHINE_ID}" != "hera" && "${MACHINE_ID}" != "hercules" && "${MACHINE_ID}" != "wcoss2" && "${MACHINE_ID}" != "noaacloud" ]]; then echo "NOTE: The GDAS App is not supported on ${MACHINE_ID}. Disabling build." 
else build_jobs["gdas"]=8 diff --git a/sorc/build_ufs.sh b/sorc/build_ufs.sh index 7e84eaebc2..44c8c7a2ad 100755 --- a/sorc/build_ufs.sh +++ b/sorc/build_ufs.sh @@ -41,30 +41,9 @@ COMPILE_NR=0 CLEAN_BEFORE=YES CLEAN_AFTER=NO -if [[ "${MACHINE_ID}" != "noaacloud" ]]; then - BUILD_JOBS=${BUILD_JOBS:-8} ./tests/compile.sh "${MACHINE_ID}" "${MAKE_OPT}" "${COMPILE_NR}" "intel" "${CLEAN_BEFORE}" "${CLEAN_AFTER}" - mv "./tests/fv3_${COMPILE_NR}.exe" ./tests/ufs_model.x - mv "./tests/modules.fv3_${COMPILE_NR}.lua" ./tests/modules.ufs_model.lua - cp "./modulefiles/ufs_common.lua" ./tests/ufs_common.lua -else - - if [[ "${PW_CSP:-}" == "aws" ]]; then - set +x - # TODO: This will need to be addressed further when the EPIC stacks are available/supported. - module use /contrib/spack-stack/envs/ufswm/install/modulefiles/Core - module load stack-intel - module load stack-intel-oneapi-mpi - module load ufs-weather-model-env/1.0.0 - # TODO: It is still uncertain why this is the only module that is - # missing; check the spack build as this needed to be added manually. - module load w3emc/2.9.2 # TODO: This has similar issues for the EPIC stack. 
- module list - set -x - fi - - export CMAKE_FLAGS="${MAKE_OPT}" - BUILD_JOBS=${BUILD_JOBS:-8} ./build.sh - mv "${cwd}/ufs_model.fd/build/ufs_model" "${cwd}/ufs_model.fd/tests/ufs_model.x" -fi +BUILD_JOBS=${BUILD_JOBS:-8} ./tests/compile.sh "${MACHINE_ID}" "${MAKE_OPT}" "${COMPILE_NR}" "intel" "${CLEAN_BEFORE}" "${CLEAN_AFTER}" +mv "./tests/fv3_${COMPILE_NR}.exe" ./tests/ufs_model.x +mv "./tests/modules.fv3_${COMPILE_NR}.lua" ./tests/modules.ufs_model.lua +cp "./modulefiles/ufs_common.lua" ./tests/ufs_common.lua exit 0 diff --git a/sorc/link_workflow.sh b/sorc/link_workflow.sh index be912292fe..ae30e7a645 100755 --- a/sorc/link_workflow.sh +++ b/sorc/link_workflow.sh @@ -76,6 +76,7 @@ case "${machine}" in "jet") FIX_DIR="/lfs4/HFIP/hfv3gfs/glopara/git/fv3gfs/fix" ;; "s4") FIX_DIR="/data/prod/glopara/fix" ;; "gaea") FIX_DIR="/gpfs/f5/ufs-ard/world-shared/global/glopara/data/fix" ;; + "noaacloud") FIX_DIR="/contrib/global-workflow-shared-data/fix" ;; *) echo "FATAL: Unknown target machine ${machine}, couldn't set FIX_DIR" exit 1 diff --git a/ush/load_fv3gfs_modules.sh b/ush/load_fv3gfs_modules.sh index 5f6afb7e35..ff6f64cece 100755 --- a/ush/load_fv3gfs_modules.sh +++ b/ush/load_fv3gfs_modules.sh @@ -20,7 +20,7 @@ source "${HOMEgfs}/versions/run.ver" module use "${HOMEgfs}/modulefiles" case "${MACHINE_ID}" in - "wcoss2" | "hera" | "orion" | "hercules" | "gaea" | "jet" | "s4") + "wcoss2" | "hera" | "orion" | "hercules" | "gaea" | "jet" | "s4" | "noaacloud") module load "module_base.${MACHINE_ID}" ;; *) diff --git a/ush/load_ufswm_modules.sh b/ush/load_ufswm_modules.sh index 6477a8ff39..f00358095d 100755 --- a/ush/load_ufswm_modules.sh +++ b/ush/load_ufswm_modules.sh @@ -11,40 +11,21 @@ ulimit_s=$( ulimit -S -s ) source "${HOMEgfs}/ush/detect_machine.sh" source "${HOMEgfs}/ush/module-setup.sh" -if [[ "${MACHINE_ID}" != "noaacloud" ]]; then - module use "${HOMEgfs}/sorc/ufs_model.fd/modulefiles" - module load "ufs_${MACHINE_ID}.intel" - module load prod_util - if [[ 
"${MACHINE_ID}" = "wcoss2" ]]; then - module load cray-pals - module load cfp - module load libjpeg - module load craype-network-ucx - module load cray-mpich-ucx - else - export UTILROOT=${prod_util_ROOT} - fi - module load wgrib2 - export WGRIB2=wgrib2 -fi -if [[ "${MACHINE_ID}" == "noaacloud" ]]; then - if [[ "${PW_CSP:-}" = "aws" ]]; then - # TODO: This can be cleaned-up; most of this is a hack for now. - module use "/contrib/spack-stack/envs/ufswm/install/modulefiles/Core" - module load "stack-intel" - module load "stack-intel-oneapi-mpi" - module use -a "/contrib/spack-stack/miniconda/modulefiles/miniconda/" - module load "py39_4.12.0" - module load "ufs-weather-model-env/1.0.0" - export NETCDF="/contrib/spack-stack/miniconda/apps/miniconda/py39_4.12.0" - # TODO: Are there plans for EPIC to maintain this package or should GW provide support? - export UTILROOT="/contrib/global-workflow/NCEPLIBS-prod_util" - export PATH="${PATH}:/contrib/global-workflow/bin" - ndate_path="$(command -v ndate)" - export NDATE="${ndate_path}" - fi +module use "${HOMEgfs}/sorc/ufs_model.fd/modulefiles" +module load "ufs_${MACHINE_ID}.intel" +module load prod_util +if [[ "${MACHINE_ID}" = "wcoss2" ]]; then + module load cray-pals + module load cfp + module load libjpeg + module load craype-network-ucx + module load cray-mpich-ucx +else + export UTILROOT=${prod_util_ROOT} fi +module load wgrib2 +export WGRIB2=wgrib2 module list unset MACHINE_ID diff --git a/ush/module-setup.sh b/ush/module-setup.sh index b4ec3edafa..398562652d 100755 --- a/ush/module-setup.sh +++ b/ush/module-setup.sh @@ -92,10 +92,8 @@ elif [[ ${MACHINE_ID} = discover* ]]; then # TODO: This can likely be made more general once other cloud # platforms come online. elif [[ ${MACHINE_ID} = "noaacloud" ]]; then - - export SPACK_ROOT=/contrib/global-workflow/spack-stack/spack - export PATH=${PATH}:${SPACK_ROOT}/bin - . 
"${SPACK_ROOT}"/share/spack/setup-env.sh + # We are on NOAA Cloud + module purge else echo WARNING: UNKNOWN PLATFORM 1>&2 diff --git a/versions/build.noaacloud.ver b/versions/build.noaacloud.ver new file mode 100644 index 0000000000..ba47313675 --- /dev/null +++ b/versions/build.noaacloud.ver @@ -0,0 +1,5 @@ +export stack_intel_ver=2021.3.0 +export stack_impi_ver=2021.3.0 +export spack_env=gsi-addon-env +source "${HOMEgfs:-}/versions/build.spack.ver" +export spack_mod_path="/contrib/spack-stack/spack-stack-${spack_stack_ver}/envs/gsi-addon-env/install/modulefiles/Core" diff --git a/versions/run.noaacloud.ver b/versions/run.noaacloud.ver new file mode 100644 index 0000000000..4c9ac3cd42 --- /dev/null +++ b/versions/run.noaacloud.ver @@ -0,0 +1,8 @@ +export stack_intel_ver=2021.3.0 +export stack_impi_ver=2021.3.0 +export spack_env=gsi-addon-env + +source "${HOMEgfs:-}/versions/run.spack.ver" +export spack_mod_path="/contrib/spack-stack/spack-stack-${spack_stack_ver}/envs/gsi-addon-env/install/modulefiles/Core" + +export cdo_ver=2.2.0 diff --git a/workflow/hosts.py b/workflow/hosts.py index cd0cfe0083..eced460fd1 100644 --- a/workflow/hosts.py +++ b/workflow/hosts.py @@ -52,7 +52,7 @@ def detect(cls): elif container is not None: machine = 'CONTAINER' elif pw_csp is not None: - if pw_csp.lower() not in ['azure', 'aws', 'gcp']: + if pw_csp.lower() not in ['azure', 'aws', 'google']: raise ValueError( f'NOAA cloud service provider "{pw_csp}" is not supported.') machine = f"{pw_csp.upper()}PW" diff --git a/workflow/hosts/awspw.yaml b/workflow/hosts/awspw.yaml index 046dafcfa7..f925f54008 100644 --- a/workflow/hosts/awspw.yaml +++ b/workflow/hosts/awspw.yaml @@ -1,12 +1,12 @@ -BASE_GIT: '/scratch1/NCEPDEV/global/glopara/git' #TODO: This does not yet exist. -DMPDIR: '/scratch1/NCEPDEV/global/glopara/dump' # TODO: This does not yet exist. -PACKAGEROOT: '/scratch1/NCEPDEV/global/glopara/nwpara' #TODO: This does not yet exist. 
-COMINsyn: '/scratch1/NCEPDEV/global/glopara/com/gfs/prod/syndat' #TODO: This does not yet exist. +BASE_GIT: '' #TODO: This does not yet exist. +DMPDIR: '' # TODO: This does not yet exist. +PACKAGEROOT: '' #TODO: This does not yet exist. +COMINsyn: '' #TODO: This does not yet exist. HOMEDIR: '/contrib/${USER}' -STMP: '/lustre/${USER}/stmp2/' -PTMP: '/lustre/${USER}/stmp4/' -NOSCRUB: ${HOMEDIR} -ACCOUNT: hwufscpldcld +STMP: '/lustre/${USER}/stmp/' +PTMP: '/lustre/${USER}/ptmp/' +NOSCRUB: '${HOMEDIR}' +ACCOUNT: '${USER}' SCHEDULER: slurm QUEUE: batch QUEUE_SERVICE: batch @@ -16,10 +16,11 @@ RESERVATION: '' CLUSTERS: '' CHGRP_RSTPROD: 'YES' CHGRP_CMD: 'chgrp rstprod' # TODO: This is not yet supported. -HPSSARCH: 'YES' +HPSSARCH: 'NO' HPSS_PROJECT: emc-global #TODO: See `ATARDIR` below. +BASE_CPLIC: '/bucket/global-workflow-shared-data/ICSDIR/prototype_ICs' LOCALARCH: 'NO' -ATARDIR: '/NCEPDEV/${HPSS_PROJECT}/1year/${USER}/${machine}/scratch/${PSLOT}' # TODO: This will not yet work from AWS. +ATARDIR: '' # TODO: This will not yet work from AWS. MAKE_NSSTBUFR: 'NO' MAKE_ACFTBUFR: 'NO' SUPPORTED_RESOLUTIONS: ['C48', 'C96'] # TODO: Test and support all cubed-sphere resolutions. 
diff --git a/workflow/rocoto/tasks.py b/workflow/rocoto/tasks.py index 353d2aa943..64952498d4 100644 --- a/workflow/rocoto/tasks.py +++ b/workflow/rocoto/tasks.py @@ -209,7 +209,10 @@ def get_resource(self, task_name): else: native += ':shared' elif scheduler in ['slurm']: - native = '--export=NONE' + if task_config.get('is_exclusive', False): + native = '--exclusive' + else: + native = '--export=NONE' if task_config['RESERVATION'] != "": native += '' if task_name in Tasks.SERVICE_TASKS else ' --reservation=' + task_config['RESERVATION'] if task_config.get('CLUSTERS', "") not in ["", '@CLUSTERS@']: diff --git a/workflow/rocoto/workflow_xml.py b/workflow/rocoto/workflow_xml.py index 11b2cdfc45..ca54f3a5bb 100644 --- a/workflow/rocoto/workflow_xml.py +++ b/workflow/rocoto/workflow_xml.py @@ -157,10 +157,16 @@ def _write_crontab(self, crontab_file: str = None, cronint: int = 5) -> None: strings = ['', f'#################### {pslot} ####################', - f'MAILTO="{replyto}"', - f'{cronintstr} {rocotorunstr}', - '#################################################################', - ''] + f'MAILTO="{replyto}"' + ] + # AWS needs 'SHELL' and 'BASH_ENV' defined, or the crontab job won't start. + if os.environ.get('PW_CSP', None) in ['aws', 'azure', 'google']: + strings.extend(['SHELL="/bin/bash"', + 'BASH_ENV="/etc/bashrc"' + ]) + strings.extend([f'{cronintstr} {rocotorunstr}', + '#################################################################', + '']) if crontab_file is None: crontab_file = f"{expdir}/{pslot}.crontab"