Feature CUDA compatability #1265
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name: Self-hosted CI | |
on: | |
push: | |
branches: | |
- main | |
- develop | |
pull_request: | |
branches-ignore: | |
- documentation | |
workflow_dispatch: | |
defaults: | |
run: | |
shell: bash | |
jobs: | |
CI: | |
if: github.repository == 'earth-system-radiation/rte-rrtmgp' | |
runs-on: | |
labels: cscs-ci | |
strategy: | |
fail-fast: false | |
matrix: | |
config-name: [nvidia-gpu-openacc, cce-cpu-icon-production, cce-gpu-openmp] | |
fpmodel: [DP, SP] | |
include: | |
- config-name: nvidia-gpu-openacc | |
rte-kernels: accel | |
compiler-modules: "PrgEnv-nvidia nvidia craype-accel-nvidia60 cdt-cuda/21.09 !cray-libsci_acc" | |
# Generic accelerator flag | |
fcflags: "-O3 -acc -Mallocatable=03 -gopt" | |
- config-name: cce-cpu-icon-production | |
rte-kernels: default | |
compiler-modules: "PrgEnv-cray" | |
# Production flags for Icon model | |
fcflags: "-hadd_paren -r am -Ktrap=divz,ovf,inv -hflex_mp=intolerant -hfp1 -hnoacc -O1,cache0" | |
- config-name: cce-gpu-openmp | |
rte-kernels: accel | |
compiler-modules: "PrgEnv-cray craype-accel-nvidia60 cdt-cuda/22.05 cudatoolkit/11.2.0_3.39-2.1__gf93aa1c" | |
# OpenMP flags from Nichols Romero (Argonne) | |
fcflags: "-hnoacc -homp -O0" | |
- config-name: cuda-kernels | |
# Fall back to OpenACC | |
rte-kernels: extern | |
compiler-modules: "PrgEnv-nvidia nvidia craype-accel-nvidia60 cdt-cuda/21.09 cudatoolkit/11.2.0_3.39-2.1__gf93aa1c !cray-libsci_acc" | |
fcflags: "-g -O3 -acc -gopt -Mallocatable=03 -Mpreprocess -Minfo" | |
experimental: true | |
env: | |
# Core variables: | |
FC: ftn | |
FCFLAGS: ${{ matrix.fcflags }} -DRTE_USE_${{ matrix.fpmodel }} | |
# Make variables: | |
RRTMGP_ROOT: ${{ github.workspace }} | |
RRTMGP_DATA: ${{ github.workspace }}/rrtmgp-data | |
RTE_KERNELS: ${{ matrix.rte-kernels }} | |
RUN_CMD: "srun -C gpu -A d56 -p cscsci -t 15:00" | |
FAILURE_THRESHOLD: 5.8e-2 # 7.e-4 | |
steps: | |
# | |
# Checks-out repository under $GITHUB_WORKSPACE | |
# | |
- name: Check out Fortran code | |
uses: actions/checkout@v3 | |
# | |
# Check out data | |
# | |
- name: Check out data | |
uses: actions/checkout@v3 | |
with: | |
repository: earth-system-radiation/rrtmgp-data | |
path: rrtmgp-data | |
# | |
# Check out CUDA kernels if needed | |
# | |
- name: Check out CUDA kernels | |
if: matrix.config-name == 'cuda-kernels' | |
uses: actions/checkout@v3 | |
with: | |
repository: earth-system-radiation/rte-rrtmgp-cuda-kernels | |
path: cuda-kernels | |
# | |
# Finalize build environment | |
# | |
- name: Finalize build environment | |
run: | | |
# There are significant limitations on what can go into ${GITHUB_ENV}, | |
# therefore, we use ${BASH_ENV} but only when necessary: | |
BASH_ENV="${GITHUB_WORKSPACE}/.bash" | |
echo "source '${GITHUB_WORKSPACE}/.github/workflows/module_switcher'" >> "${BASH_ENV}" | |
echo "switch_for_module daint-gpu ${{ matrix.compiler-modules }} cray-netcdf cray-hdf5" >> "${BASH_ENV}" | |
# Use custom Python environment: | |
# The environment can be re-generated as follows: | |
# module load cray-python | |
# python3 -m venv /scratch/snx3000/rpincus/rte-rrtmgp-python | |
# /scratch/snx3000/rpincus/rte-rrtmgp-python/bin/pip3 install --upgrade pip | |
# /scratch/snx3000/rpincus/rte-rrtmgp-python/bin/pip3 install dask[array] netCDF4 numpy xarray | |
echo 'PATH="/scratch/snx3000/rpincus/rte-rrtmgp-python/bin:${PATH}"' >> "${BASH_ENV}" | |
# Make bash run the above on startup: | |
echo "BASH_ENV=${BASH_ENV}" >> "${GITHUB_ENV}" | |
# Compiler needs more temporary space than normally available: | |
tmpdir='${{ github.workspace }}/tmp' | |
mkdir "${tmpdir}" && echo "TMPDIR=${tmpdir}" >> "${GITHUB_ENV}" | |
# We use the "non-default products" for the tests | |
# (see https://support.hpe.com/hpesc/public/docDisplay?docId=a00113984en_us&page=Modify_Linking_Behavior_to_Use_Non-default_Libraries.html): | |
echo 'LD_LIBRARY_PATH="${CRAY_LD_LIBRARY_PATH}:${LD_LIBRARY_PATH}"' >> "${BASH_ENV}" | |
# SLURM jobs, user home directories and HDF5 file locking are | |
# incompatible on Daint: | |
echo 'HDF5_USE_FILE_LOCKING=FALSE' >> "${GITHUB_ENV}" | |
# Build libraries, examples and tests | |
# | |
- name: Build libraries | |
run: | | |
$FC --version | |
make -j8 libs | |
# | |
# Build library of CUDA kernels; copy to build directory and overwrite defaults | |
# | |
- name: Build CUDA kernels | |
if: matrix.config-name == 'cuda-kernels' | |
run: | | |
make -C cuda-kernels/build | |
cp cuda-kernels/build/librtecudakernels.a build/librtekernels.a | |
cp cuda-kernels/build/librrtmgpcudakernels.a build/librrtmgpkernels.a | |
# | |
# Run examples and tests (expect success) | |
# | |
- name: Build and run examples and tests (expect success) | |
id: run-success | |
if: matrix.config-name != 'cce-gpu-openmp' | |
run: make -j8 tests | |
# | |
# Run examples and tests (expect failure) | |
# | |
- name: Build and run examples and tests (expect failure) | |
if: steps.run-success.outcome == 'skipped' | |
run: | | |
make -j8 tests && { | |
echo "Unexpected success" | |
exit 1 | |
} || echo "Expected failure" | |
# | |
# Relax failure thresholds for single precision | |
# | |
- name: Relax failure threshold for single precision | |
if: matrix.fpmodel == 'SP' && steps.run-success.outcome != 'skipped' | |
run: echo "FAILURE_THRESHOLD=3.5e-1" >> $GITHUB_ENV | |
# | |
# Compare the results | |
# | |
- name: Compare the results | |
if: steps.run-success.outcome != 'skipped' | |
run: make -j8 check |