Feature CUDA compatibility #1265

Workflow file for this run

.github/workflows/self-hosted-ci.yml at 9943839

	# Workflow name as displayed in the GitHub Actions UI.
	name: Self-hosted CI

	# Events that start this workflow.
	on:
	  # Pushes to the two long-lived branches.
	  push:
	    branches:
	      - main
	      - develop
	  # Pull requests, except those whose base branch is `documentation`.
	  pull_request:
	    branches-ignore:
	      - documentation
	  # Manual runs from the Actions tab or the API.
	  workflow_dispatch:

	# Every `run:` step in this workflow is executed with bash.
	defaults:
	  run:
	    shell: bash

	jobs:
	  # Single matrix job: builds the libraries, then builds/runs the examples and
	  # tests and compares the results, once per compiler configuration and
	  # floating-point model.
	  CI:
	    # Only run in the upstream repository.
	    if: github.repository == 'earth-system-radiation/rte-rrtmgp'
	    runs-on:
	      labels: cscs-ci
	    strategy:
	      # Let the remaining configurations finish when one of them fails.
	      fail-fast: false
	      matrix:
	        config-name: [nvidia-gpu-openacc, cce-cpu-icon-production, cce-gpu-openmp]
	        fpmodel: [DP, SP]
	        # The first three entries add per-configuration settings to the
	        # config-name x fpmodel combinations generated above.
	        include:
	          - config-name: nvidia-gpu-openacc
	            rte-kernels: accel
	            compiler-modules: "PrgEnv-nvidia nvidia craype-accel-nvidia60 cdt-cuda/21.09 !cray-libsci_acc"
	            # Generic accelerator flag
	            fcflags: "-O3 -acc -Mallocatable=03 -gopt"
	          - config-name: cce-cpu-icon-production
	            rte-kernels: default
	            compiler-modules: "PrgEnv-cray"
	            # Production flags for Icon model
	            fcflags: "-hadd_paren -r am -Ktrap=divz,ovf,inv -hflex_mp=intolerant -hfp1 -hnoacc -O1,cache0"
	          - config-name: cce-gpu-openmp
	            rte-kernels: accel
	            compiler-modules: "PrgEnv-cray craype-accel-nvidia60 cdt-cuda/22.05 cudatoolkit/11.2.0_3.39-2.1__gf93aa1c"
	            # OpenMP flags from Nichols Romero (Argonne)
	            fcflags: "-hnoacc -homp -O0"
	          # `cuda-kernels` is not one of the `config-name` values listed above,
	          # so this entry adds one extra job that has no `fpmodel` value.
	          - config-name: cuda-kernels
	            # Fall back to OpenACC
	            rte-kernels: extern
	            compiler-modules: "PrgEnv-nvidia nvidia craype-accel-nvidia60 cdt-cuda/21.09 cudatoolkit/11.2.0_3.39-2.1__gf93aa1c !cray-libsci_acc"
	            fcflags: "-g -O3 -acc -gopt -Mallocatable=03 -Mpreprocess -Minfo"
	            # NOTE(review): `experimental` is not read anywhere in the visible
	            # workflow (e.g. by `continue-on-error`) - confirm whether a
	            # failure of this job is meant to be tolerated.
	            experimental: true

	    env:
	      # Core variables:
	      FC: ftn
	      # `matrix.fpmodel` is empty for the cuda-kernels entry; default to DP so
	      # that the flag never degenerates to a bare `-DRTE_USE_`.
	      FCFLAGS: ${{ matrix.fcflags }} -DRTE_USE_${{ matrix.fpmodel || 'DP' }}
	      # Make variables:
	      RRTMGP_ROOT: ${{ github.workspace }}
	      RRTMGP_DATA: ${{ github.workspace }}/rrtmgp-data
	      RTE_KERNELS: ${{ matrix.rte-kernels }}
	      RUN_CMD: "srun -C gpu -A d56 -p cscsci -t 15:00"
	      # Quoted so that the value reaches the environment exactly as written
	      # instead of being parsed as a YAML float first.
	      FAILURE_THRESHOLD: "5.8e-2"  # 7.e-4
	    steps:
	      #
	      # Checks-out repository under $GITHUB_WORKSPACE
	      #
	      - name: Check out Fortran code
	        uses: actions/checkout@v3
	      #
	      # Check out data
	      #
	      - name: Check out data
	        uses: actions/checkout@v3
	        with:
	          repository: earth-system-radiation/rrtmgp-data
	          path: rrtmgp-data
	      #
	      # Check out CUDA kernels if needed
	      #
	      - name: Check out CUDA kernels
	        if: matrix.config-name == 'cuda-kernels'
	        uses: actions/checkout@v3
	        with:
	          repository: earth-system-radiation/rte-rrtmgp-cuda-kernels
	          path: cuda-kernels
	      #
	      # Finalize build environment
	      #
	      - name: Finalize build environment
	        run: |
	          # There are significant limitations on what can go into ${GITHUB_ENV},
	          # therefore, we use ${BASH_ENV} but only when necessary:
	          BASH_ENV="${GITHUB_WORKSPACE}/.bash"
	          echo "source '${GITHUB_WORKSPACE}/.github/workflows/module_switcher'" >> "${BASH_ENV}"
	          echo "switch_for_module daint-gpu ${{ matrix.compiler-modules }} cray-netcdf cray-hdf5" >> "${BASH_ENV}"
	          # Use custom Python environment:
	          # The environment can be re-generated as follows:
	          #   module load cray-python
	          #   python3 -m venv /scratch/snx3000/rpincus/rte-rrtmgp-python
	          #   /scratch/snx3000/rpincus/rte-rrtmgp-python/bin/pip3 install --upgrade pip
	          #   /scratch/snx3000/rpincus/rte-rrtmgp-python/bin/pip3 install dask[array] netCDF4 numpy xarray
	          echo 'PATH="/scratch/snx3000/rpincus/rte-rrtmgp-python/bin:${PATH}"' >> "${BASH_ENV}"
	          # Make bash run the above on startup:
	          echo "BASH_ENV=${BASH_ENV}" >> "${GITHUB_ENV}"
	          # Compiler needs more temporary space than normally available.
	          # `-p` tolerates a directory that already exists, and the export is a
	          # separate command so that a failing mkdir is not silently followed
	          # by a missing TMPDIR:
	          tmpdir='${{ github.workspace }}/tmp'
	          mkdir -p "${tmpdir}"
	          echo "TMPDIR=${tmpdir}" >> "${GITHUB_ENV}"
	          # We use the "non-default products" for the tests
	          # (see https://support.hpe.com/hpesc/public/docDisplay?docId=a00113984en_us&page=Modify_Linking_Behavior_to_Use_Non-default_Libraries.html):
	          echo 'LD_LIBRARY_PATH="${CRAY_LD_LIBRARY_PATH}:${LD_LIBRARY_PATH}"' >> "${BASH_ENV}"
	          # SLURM jobs, user home directories and HDF5 file locking are
	          # incompatible on Daint:
	          echo 'HDF5_USE_FILE_LOCKING=FALSE' >> "${GITHUB_ENV}"
	      #
	      # Build libraries, examples and tests
	      #
	      - name: Build libraries
	        run: |
	          $FC --version
	          make -j8 libs
	      #
	      # Build library of CUDA kernels; copy to build directory and overwrite defaults
	      #
	      - name: Build CUDA kernels
	        if: matrix.config-name == 'cuda-kernels'
	        run: |
	          make -C cuda-kernels/build
	          cp cuda-kernels/build/librtecudakernels.a build/librtekernels.a
	          cp cuda-kernels/build/librrtmgpcudakernels.a build/librrtmgpkernels.a
	      #
	      # Run examples and tests (expect success)
	      #
	      - name: Build and run examples and tests (expect success)
	        id: run-success
	        if: matrix.config-name != 'cce-gpu-openmp'
	        run: make -j8 tests
	      #
	      # Run examples and tests (expect failure)
	      #
	      # Runs only when the step above was skipped, i.e. for cce-gpu-openmp;
	      # the step fails if the tests unexpectedly pass.
	      - name: Build and run examples and tests (expect failure)
	        if: steps.run-success.outcome == 'skipped'
	        run: |
	          make -j8 tests && {
	            echo "Unexpected success"
	            exit 1
	          } || echo "Expected failure"
	      #
	      # Relax failure thresholds for single precision
	      #
	      # Overrides the job-level FAILURE_THRESHOLD for the steps that follow.
	      - name: Relax failure threshold for single precision
	        if: matrix.fpmodel == 'SP' && steps.run-success.outcome != 'skipped'
	        run: echo "FAILURE_THRESHOLD=3.5e-1" >> "${GITHUB_ENV}"
	      #
	      # Compare the results
	      #
	      - name: Compare the results
	        if: steps.run-success.outcome != 'skipped'
	        run: make -j8 check

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Feature CUDA compatibility #1265

Workflow file

Feature CUDA compatibility #1265

Jobs

Run details

Workflow file for this run