Feature CUDA compatibility #890
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Workflow that runs on a self-hosted runner (see the runs-on labels of the job).
name: Self-hosted CI

# Run on every push and pull request, except for the "documentation" branch.
on:
  push:
    branches-ignore:
      - documentation
  pull_request:
    branches-ignore:
      - documentation

# Every "run" step in this workflow is executed with bash.
defaults:
  run:
    shell: bash
jobs:
  # Builds the libraries, runs the examples/tests and compares the results,
  # once per matrix combination, on the runner labelled "cscs-ci".
  CI:
    runs-on:
      labels: cscs-ci
    # A failing job of a configuration marked "experimental" does not fail
    # the workflow run.
    continue-on-error: ${{ matrix.experimental }}
    strategy:
      fail-fast: false
      matrix:
        config-name: [nvidia-gpu-openacc, cce-cpu-icon-production, cce-gpu-openmp]
        fpmodel: [DP, SP]
        # Entries whose config-name matches a value of the axis above extend
        # those combinations; an entry with a new config-name becomes an
        # additional, standalone combination.
        include:
          - config-name: nvidia-gpu-openacc
            rte-kernels: accel
            compiler-modules: "PrgEnv-nvidia nvidia craype-accel-nvidia60 cdt-cuda/21.09 !cray-libsci_acc"
            # Generic accelerator flag
            fcflags: "-O3 -acc -Mallocatable=03 -gopt"
            experimental: false
          - config-name: cce-cpu-icon-production
            rte-kernels: default
            compiler-modules: "PrgEnv-cray"
            # Production flags for Icon model
            fcflags: "-hadd_paren -r am -Ktrap=divz,ovf,inv -hflex_mp=intolerant -hfp1 -hnoacc -O1,cache0"
            experimental: false
          - config-name: cce-gpu-openmp
            rte-kernels: accel
            compiler-modules: "PrgEnv-cray craype-accel-nvidia60 cdt-cuda/22.05 cudatoolkit/11.2.0_3.39-2.1__gf93aa1c"
            # OpenMP flags from Nichols Romero (Argonne)
            fcflags: "-hnoacc -homp -O0"
            experimental: true
          - config-name: cuda-kernels
            # This config-name is not on the config-name axis, so the entry is
            # a standalone combination that does not pick up the fpmodel axis.
            # Set fpmodel explicitly so FCFLAGS does not end in a bare
            # "-DRTE_USE_".
            # NOTE(review): DP assumed to be the intended precision - confirm.
            fpmodel: DP
            # Fall back to OpenACC
            rte-kernels: extern
            compiler-modules: "PrgEnv-nvidia nvidia craype-accel-nvidia60 cdt-cuda/21.09 cudatoolkit/11.2.0_3.39-2.1__gf93aa1c !cray-libsci_acc"
            fcflags: "-g -O3 -acc -gopt -Mallocatable=03 -Mpreprocess -Minfo"
            experimental: true
    env:
      # Core variables:
      FC: ftn
      FCFLAGS: "${{ matrix.fcflags }} -DRTE_USE_${{ matrix.fpmodel }}"
      # Make variables:
      RRTMGP_ROOT: ${{ github.workspace }}
      RRTMGP_DATA: ${{ github.workspace }}/rrtmgp-data
      RTE_KERNELS: ${{ matrix.rte-kernels }}
      RUN_CMD: "srun -C gpu -A d56 -p cscsci -t 15:00"
      # Quoted so the value reaches the environment exactly as written;
      # overridden for single precision by a later step.
      FAILURE_THRESHOLD: "7.e-4"
    steps:
      #
      # Checks-out repository under $GITHUB_WORKSPACE
      #
      - name: Check out Fortran code
        uses: actions/checkout@v3
      #
      # Check out data
      #
      - name: Check out data
        uses: actions/checkout@v3
        with:
          repository: earth-system-radiation/rrtmgp-data
          path: rrtmgp-data
          ref: develop
      #
      # Check out CUDA kernels if needed
      #
      - name: Check out CUDA kernels
        if: matrix.config-name == 'cuda-kernels'
        uses: actions/checkout@v3
        with:
          repository: earth-system-radiation/rte-rrtmgp-cuda-kernels
          path: cuda-kernels
      #
      # Finalize build environment
      #
      - name: Finalize build environment
        run: |
          # There are significant limitations on what can go into ${GITHUB_ENV},
          # therefore, we use ${BASH_ENV} but only when necessary:
          BASH_ENV="${GITHUB_WORKSPACE}/.bash"
          echo "source '${GITHUB_WORKSPACE}/.github/workflows/module_switcher'" >> "${BASH_ENV}"
          echo "switch_for_module daint-gpu ${{ matrix.compiler-modules }} cray-netcdf cray-hdf5" >> "${BASH_ENV}"
          # Use custom Python environment:
          # The environment can be re-generated as follows:
          # module load cray-python
          # python3 -m venv /scratch/snx3000/rpincus/rte-rrtmgp-python
          # /scratch/snx3000/rpincus/rte-rrtmgp-python/bin/pip3 install --upgrade pip
          # /scratch/snx3000/rpincus/rte-rrtmgp-python/bin/pip3 install dask[array] netCDF4 numpy xarray
          echo 'PATH="/scratch/snx3000/rpincus/rte-rrtmgp-python/bin:${PATH}"' >> "${BASH_ENV}"
          # Make bash run the above on startup:
          echo "BASH_ENV=${BASH_ENV}" >> "${GITHUB_ENV}"
          # Compiler needs more temporary space than normally available:
          tmpdir='${{ github.workspace }}/tmp'
          # -p: an already existing directory must not prevent TMPDIR from
          # being exported.
          mkdir -p "${tmpdir}" && echo "TMPDIR=${tmpdir}" >> "${GITHUB_ENV}"
          # We use the "non-default products" for the tests
          # (see https://support.hpe.com/hpesc/public/docDisplay?docId=a00113984en_us&page=Modify_Linking_Behavior_to_Use_Non-default_Libraries.html):
          echo 'LD_LIBRARY_PATH="${CRAY_LD_LIBRARY_PATH}:${LD_LIBRARY_PATH}"' >> "${BASH_ENV}"
          # SLURM jobs, user home directories and HDF5 file locking are
          # incompatible on Daint:
          echo 'HDF5_USE_FILE_LOCKING=FALSE' >> "${GITHUB_ENV}"
      #
      # Build libraries
      #
      - name: Build libraries
        run: |
          $FC --version
          make libs
      #
      # Build library of CUDA kernels; copy to build directory
      #
      - name: Build CUDA kernels
        if: matrix.config-name == 'cuda-kernels'
        run: |
          make -C cuda-kernels/build
          cp cuda-kernels/build/librtecudakernels.a build/librtekernels.a
          cp cuda-kernels/build/librrtmgpcudakernels.a build/librrtmgpkernels.a
      #
      # Run examples and tests
      #
      - name: Run examples and tests
        run: make tests
      #
      # Relax failure thresholds for single precision
      #
      - name: Relax failure threshold for single precision
        if: matrix.fpmodel == 'SP'
        run: echo "FAILURE_THRESHOLD=3.5e-1" >> "${GITHUB_ENV}"
      #
      # Compare the results
      #
      - name: Compare the results
        run: make check