# Fix GPU error on NVIDIA A100 with nvhpc/24.3 #1245
# Workflow file for this run
# This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# Workflow name.
name: Self-hosted CI
# Triggers: pushes to main or develop, pull requests (except those filtered
# out below), and manual dispatch.
on:
  push:
    branches:
      - main
      - develop
  pull_request:
    # Do not run for pull requests that target the documentation branch.
    branches-ignore:
      - documentation
  workflow_dispatch:
# Use bash for every `run:` step that does not set its own shell.
defaults:
  run:
    shell: bash
# A single job, CI, executed on the self-hosted runner labelled "cscs-ci".
# It is expanded over three compiler configurations times two floating-point
# models (DP, SP); each combination builds the libraries, runs the examples
# and tests, and then compares the results against a failure threshold.
jobs:
  CI:
    runs-on:
      labels: cscs-ci
    # Configurations flagged as experimental in the matrix may fail without
    # failing the workflow.
    continue-on-error: ${{ matrix.experimental }}
    strategy:
      # Let the remaining matrix combinations finish when one of them fails.
      fail-fast: false
      matrix:
        config-name: [nvidia-gpu-openacc, cce-cpu-icon-production, cce-gpu-openmp]
        fpmodel: [DP, SP]
        include:
          # The tests are not experimental by default:
          - experimental: false
          - config-name: nvidia-gpu-openacc
            rte-kernels: accel
            compiler-modules: "PrgEnv-nvidia nvidia craype-accel-nvidia60 cdt-cuda/21.09 !cray-libsci_acc"
            # Generic accelerator flag
            fcflags: "-O3 -acc -Mallocatable=03 -gopt"
          - config-name: cce-cpu-icon-production
            rte-kernels: default
            compiler-modules: "PrgEnv-cray"
            # Production flags for Icon model
            fcflags: "-hadd_paren -r am -Ktrap=divz,ovf,inv -hflex_mp=intolerant -hfp1 -hnoacc -O1,cache0"
          - config-name: cce-gpu-openmp
            rte-kernels: accel
            compiler-modules: "PrgEnv-cray craype-accel-nvidia60 cdt-cuda/22.05 cudatoolkit/11.2.0_3.39-2.1__gf93aa1c"
            # OpenMP flags from Nichols Romero (Argonne)
            fcflags: "-hnoacc -homp -O0"
            experimental: true
    env:
      # Core variables:
      FC: ftn
      # The precision macro (-DRTE_USE_DP or -DRTE_USE_SP) is appended to the
      # per-configuration compiler flags.
      FCFLAGS: ${{ matrix.fcflags }} -DRTE_USE_${{ matrix.fpmodel }}
      # Make variables:
      RRTMGP_ROOT: ${{ github.workspace }}
      RRTMGP_DATA: ${{ github.workspace }}/rrtmgp-data
      RTE_KERNELS: ${{ matrix.rte-kernels }}
      RUN_CMD: "srun -C gpu -A d56 -p cscsci -t 15:00"
      # Quoted so the value reaches the environment verbatim instead of being
      # re-typed as a YAML float; overridden for SP by a later step.
      FAILURE_THRESHOLD: "7.e-4"
    steps:
      #
      # Checks-out repository under $GITHUB_WORKSPACE
      #
      - uses: actions/checkout@v3
      #
      # Check out data
      #
      - name: Check out data
        uses: actions/checkout@v3
        with:
          repository: earth-system-radiation/rrtmgp-data
          path: rrtmgp-data
      #
      # Finalize build environment
      #
      - name: Finalize build environment
        run: |
          # There are significant limitations on what can go into ${GITHUB_ENV},
          # therefore, we use ${BASH_ENV} but only when necessary:
          BASH_ENV="${GITHUB_WORKSPACE}/.bash"
          echo "source '${GITHUB_WORKSPACE}/.github/workflows/module_switcher'" >> "${BASH_ENV}"
          echo "switch_for_module daint-gpu ${{ matrix.compiler-modules }} cray-netcdf cray-hdf5" >> "${BASH_ENV}"
          # Use custom Python environment:
          # The environment can be re-generated as follows:
          #   module load cray-python
          #   python3 -m venv /scratch/snx3000/rpincus/rte-rrtmgp-python
          #   /scratch/snx3000/rpincus/rte-rrtmgp-python/bin/pip3 install --upgrade pip
          #   /scratch/snx3000/rpincus/rte-rrtmgp-python/bin/pip3 install dask[array] netCDF4 numpy xarray
          echo 'PATH="/scratch/snx3000/rpincus/rte-rrtmgp-python/bin:${PATH}"' >> "${BASH_ENV}"
          # Make bash run the above on startup:
          echo "BASH_ENV=${BASH_ENV}" >> "${GITHUB_ENV}"
          # Compiler needs more temporary space than normally available.
          # Use -p so that a directory left over from an earlier run on this
          # self-hosted runner does not prevent TMPDIR from being exported:
          tmpdir='${{ github.workspace }}/tmp'
          mkdir -p "${tmpdir}" && echo "TMPDIR=${tmpdir}" >> "${GITHUB_ENV}"
          # We use the "non-default products" for the tests
          # (see https://support.hpe.com/hpesc/public/docDisplay?docId=a00113984en_us&page=Modify_Linking_Behavior_to_Use_Non-default_Libraries.html):
          echo 'LD_LIBRARY_PATH="${CRAY_LD_LIBRARY_PATH}:${LD_LIBRARY_PATH}"' >> "${BASH_ENV}"
          # SLURM jobs, user home directories and HDF5 file locking are
          # incompatible on Daint:
          echo 'HDF5_USE_FILE_LOCKING=FALSE' >> "${GITHUB_ENV}"
      #
      # Build libraries, examples and tests
      #
      - name: Build libraries, examples and tests
        run: |
          $FC --version
          make libs
          make -C build separate-libs
      #
      # Run examples and tests
      #
      - name: Run examples and tests
        run: make tests
      #
      # Relax failure thresholds for single precision
      # (written to ${GITHUB_ENV}, so it only affects the steps that follow)
      #
      - name: Relax failure threshold for single precision
        if: matrix.fpmodel == 'SP'
        run: echo "FAILURE_THRESHOLD=3.5e-1" >> "${GITHUB_ENV}"
      #
      # Compare the results
      #
      - name: Compare the results
        run: make check