-
Notifications
You must be signed in to change notification settings - Fork 67
129 lines (127 loc) · 4.66 KB
/
self-hosted-ci.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
name: Self-hosted CI
# Triggers:
#   - pushes to the main and develop branches;
#   - pull requests, except those targeting the documentation branch;
#   - manual runs (workflow_dispatch).
on:
  push:
    branches:
      - main
      - develop
  pull_request:
    branches-ignore:
      - documentation
  workflow_dispatch:
# Every `run` step in this workflow is executed with bash.
defaults:
  run:
    shell: bash
jobs:
  # Builds the libraries, then builds and runs the examples and tests for
  # every combination of compiler configuration and floating-point model.
  CI:
    # Skip the job unless the workflow runs in this exact repository.
    if: github.repository == 'earth-system-radiation/rte-rrtmgp'
    runs-on:
      labels: cscs-ci
    strategy:
      # Let the remaining matrix combinations finish when one of them fails.
      fail-fast: false
      matrix:
        config-name:
          - nvidia-gpu-openacc
          - cce-cpu-icon-production
          - cce-gpu-openmp
        fpmodel: [DP, SP]
        # Per-configuration settings, matched to the entries above by
        # config-name.
        include:
          - config-name: nvidia-gpu-openacc
            rte-kernels: accel
            compiler-modules: "PrgEnv-nvidia nvidia craype-accel-nvidia60 cdt-cuda/21.09 !cray-libsci_acc"
            # Generic accelerator flag
            fcflags: "-O3 -acc -Mallocatable=03 -gopt"
          - config-name: cce-cpu-icon-production
            rte-kernels: default
            compiler-modules: "PrgEnv-cray"
            # Production flags for the ICON model
            fcflags: "-hadd_paren -r am -Ktrap=divz,ovf,inv -hflex_mp=intolerant -hfp1 -hnoacc -O1,cache0"
          - config-name: cce-gpu-openmp
            rte-kernels: accel
            compiler-modules: "PrgEnv-cray craype-accel-nvidia60 cdt-cuda/22.05 cudatoolkit/11.2.0_3.39-2.1__gf93aa1c"
            # OpenMP flags from Nichols Romero (Argonne)
            fcflags: "-hnoacc -homp -O0"
    env:
      # Core variables:
      FC: ftn
      FCFLAGS: ${{ matrix.fcflags }} -DRTE_USE_${{ matrix.fpmodel }}
      # Make variables:
      RRTMGP_ROOT: ${{ github.workspace }}
      RRTMGP_DATA: ${{ github.workspace }}/rrtmgp-data
      RTE_KERNELS: ${{ matrix.rte-kernels }}
      RUN_CMD: "srun -C gpu -A d56 -p cscsci -t 15:00"
      # Quoted so that the value is passed on verbatim instead of being
      # parsed as a YAML float and re-serialized. The single-precision
      # override further below writes the same textual form (3.5e-1).
      FAILURE_THRESHOLD: "7.e-4"
    steps:
      #
      # Check out the repository under $GITHUB_WORKSPACE
      #
      - uses: actions/checkout@v3
      #
      # Check out data
      #
      - name: Check out data
        uses: actions/checkout@v3
        with:
          repository: earth-system-radiation/rrtmgp-data
          path: rrtmgp-data
          ref: v1.8.1
      #
      # Finalize build environment
      #
      - name: Finalize build environment
        run: |
          # There are significant limitations on what can go into ${GITHUB_ENV},
          # therefore, we use ${BASH_ENV} but only when necessary:
          BASH_ENV="${GITHUB_WORKSPACE}/.bash"
          echo "source '${GITHUB_WORKSPACE}/.github/workflows/module_switcher'" >> "${BASH_ENV}"
          echo "switch_for_module daint-gpu ${{ matrix.compiler-modules }} cray-netcdf cray-hdf5" >> "${BASH_ENV}"
          # Use custom Python environment:
          #   The environment can be re-generated as follows:
          #     module load cray-python
          #     python3 -m venv /scratch/snx3000/rpincus/rte-rrtmgp-python
          #     /scratch/snx3000/rpincus/rte-rrtmgp-python/bin/pip3 install --upgrade pip
          #     /scratch/snx3000/rpincus/rte-rrtmgp-python/bin/pip3 install dask[array] netCDF4 numpy xarray
          echo 'PATH="/scratch/snx3000/rpincus/rte-rrtmgp-python/bin:${PATH}"' >> "${BASH_ENV}"
          # Make bash run the above on startup:
          echo "BASH_ENV=${BASH_ENV}" >> "${GITHUB_ENV}"
          # Compiler needs more temporary space than normally available.
          # Use -p so that a directory left over from an earlier run does not
          # silently prevent TMPDIR from being exported:
          tmpdir='${{ github.workspace }}/tmp'
          mkdir -p "${tmpdir}"
          echo "TMPDIR=${tmpdir}" >> "${GITHUB_ENV}"
          # We use the "non-default products" for the tests
          # (see https://support.hpe.com/hpesc/public/docDisplay?docId=a00113984en_us&page=Modify_Linking_Behavior_to_Use_Non-default_Libraries.html):
          echo 'LD_LIBRARY_PATH="${CRAY_LD_LIBRARY_PATH}:${LD_LIBRARY_PATH}"' >> "${BASH_ENV}"
          # SLURM jobs, user home directories and HDF5 file locking are
          # incompatible on Daint:
          echo 'HDF5_USE_FILE_LOCKING=FALSE' >> "${GITHUB_ENV}"
      #
      # Build libraries
      #
      - name: Build libraries
        run: |
          $FC --version
          make -j8 libs
      #
      # Run examples and tests (expect success)
      #
      - name: Build and run examples and tests (expect success)
        id: run-success
        if: matrix.config-name != 'cce-gpu-openmp'
        run: make -j8 tests
      #
      # Run examples and tests (expect failure)
      #
      # Runs only for the configuration skipped by the previous step and
      # fails the job when the tests unexpectedly pass.
      - name: Build and run examples and tests (expect failure)
        if: steps.run-success.outcome == 'skipped'
        run: |
          if make -j8 tests; then
            echo "Unexpected success"
            exit 1
          else
            echo "Expected failure"
          fi
      #
      # Relax failure thresholds for single precision
      #
      - name: Relax failure threshold for single precision
        if: matrix.fpmodel == 'SP' && steps.run-success.outcome != 'skipped'
        run: echo "FAILURE_THRESHOLD=3.5e-1" >> "${GITHUB_ENV}"
      #
      # Compare the results
      #
      - name: Compare the results
        if: steps.run-success.outcome != 'skipped'
        run: make -j8 check