Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GitHub Workflow Pipeline #1740

Merged
86 changes: 86 additions & 0 deletions .github/workflows/globalworkflow-ci.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
name: gw-ci-orion

on: [workflow_dispatch]

# TEST_DIR contains 2 directories;
# 1. HOMEgfs: clone of the global-workflow
# 2. RUNTESTS: A directory containing EXPDIR and COMROT for experiments
# e.g. $> tree ./TEST_DIR
# ./TEST_DIR
# ├── HOMEgfs
# └── RUNTESTS
# ├── COMROT
# │   └── ${pslot}
# └── EXPDIR
# └── ${pslot}
env:
TEST_DIR: ${{ github.workspace }}/${{ github.run_id }}
MACHINE_ID: orion

jobs:
checkout-build-link:
runs-on: [self-hosted, orion-ready]
timeout-minutes: 600

steps:
- name: Checkout global-workflow
uses: actions/checkout@v3
with:
path: ${{ github.run_id }}/HOMEgfs # This path needs to be relative

- name: Checkout components
run: |
cd ${{ env.TEST_DIR }}/HOMEgfs/sorc
./checkout.sh -c -g # Options e.g. -u can be added late

- name: Build components
run: |
cd ${{ env.TEST_DIR }}/HOMEgfs/sorc
./build_all.sh

- name: Link artifacts
run: |
cd ${{ env.TEST_DIR }}/HOMEgfs/sorc
./link_workflow.sh

create-experiments:
needs: checkout-build-link
runs-on: [self-hosted, orion-ready]
strategy:
matrix:
case: ["C48_S2S", "C96_atm3DVar"]

steps:
- name: Create Experiments ${{ matrix.case }}
env:
HOMEgfs_PR: ${{ env.TEST_DIR }}/HOMEgfs
RUNTESTS: ${{ env.TEST_DIR }}/RUNTESTS
pslot: ${{ matrix.case }}.${{ github.run_id }}
run: |
cd ${{ env.TEST_DIR }}/HOMEgfs
source workflow/gw_setup.sh
source ci/platforms/orion.sh
./ci/scripts/create_experiment.py --yaml ci/cases/${{ matrix.case }}.yaml --dir ${{ env.HOMEgfs_PR }}

run-experiments:
needs: create-experiments
runs-on: [self-hosted, orion-ready]
strategy:
max-parallel: 2
matrix:
case: ["C48_S2S", "C96_atm3DVar"]
steps:
- name: Run Experiment ${{ matrix.case }}
run: |
cd ${{ env.TEST_DIR }}/HOMEgfs
./ci/scripts/run-check_ci.sh ${{ env.TEST_DIR }} ${{ matrix.case }}.${{ github.run_id }}

clean-up:
needs: run-experiments
runs-on: [self-hosted, orion-ready]
steps:
- name: Clean-up
run: |
cd ${{ github.workspace }}
rm -rf ${{ github.run_id }}

107 changes: 107 additions & 0 deletions ci/scripts/run-check_ci.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
#!/bin/bash

set -eu

#####################################################################################
# Script description: script to check the status of an experiment as reported
# by Rocoto
#####################################################################################

TEST_DIR=${1:-${TEST_DIR:-?}} # Location of the root of the testing directory
pslot=${2:-${pslot:-?}} # Name of the experiment being tested by this script

# TEST_DIR contains 2 directories;
# 1. HOMEgfs: clone of the global-workflow
# 2. RUNTESTS: A directory containing EXPDIR and COMROT for experiments
# # e.g. $> tree ./TEST_DIR
# ./TEST_DIR
# ├── HOMEgfs
# └── RUNTESTS
# ├── COMROT
# │   └── ${pslot}
# └── EXPDIR
# └── ${pslot}
HOMEgfs="${TEST_DIR}/HOMEgfs"
RUNTESTS="${TEST_DIR}/RUNTESTS"

# Source modules and setup logging
echo "Source modules."
source "${HOMEgfs}/workflow/gw_setup.sh"

# cd into the experiment directory
echo "cd ${RUNTESTS}/EXPDIR/${pslot}"
cd "${RUNTESTS}/EXPDIR/${pslot}" || (echo "FATAL ERROR: Unable to cd into '${RUNTESTS}/EXPDIR/${pslot}', ABORT!"; exit 1)

# Name of the Rocoto XML and database files
xml="${pslot}.xml"
db="${pslot}.db"

# Ensure the XML is present for the experiment
if [[ ! -f "${xml}" ]]; then
echo "FATAL ERROR: XML file ${xml} not found in '${pslot}', experiment ${pslot} failed, ABORT!"
exit 1
fi

# Launch experiment
echo "Launch experiment with Rocoto."
rocotorun -v "${ROCOTO_VERBOSE:-0}" -w "${xml}" -d "${db}"
sleep 30
WalterKolczynski-NOAA marked this conversation as resolved.
Show resolved Hide resolved
if [[ ! -f "${db}" ]]; then
echo "FATAL ERROR: Rocoto database file ${db} not found, experiment ${pslot} failed, ABORT!"
exit 2
fi

# Experiment launched
rc=99
while true; do

echo "Run rocotorun."
rocotorun -v "${ROCOTO_VERBOSE:-0}" -w "${xml}" -d "${db}"

# Wait before running rocotostat
sleep 30

# Get job statistics
echo "Gather Rocoto statistics"
rocotostat_output=$(rocotostat -w "${xml}" -d "${db}" -s | grep -v CYCLE) || true
num_cycles=$(echo "${rocotostat_output}" | wc -l) || true
num_done=$(echo "${rocotostat_output}" | grep -c Done) || true
num_succeeded=$(rocotostat -w "${xml}" -d "${db}" -a | grep -c SUCCEEDED) || true
num_failed=$(rocotostat -w "${xml}" -d "${db}" -a | grep -c -E 'FAIL|DEAD') || true

echo "${pslot} Total Cycles: ${num_cycles} number done: ${num_done}"

if [[ ${num_failed} -ne 0 ]]; then
{
echo "Experiment ${pslot} Terminated with ${num_failed} tasks failed at $(date)" || true
echo "Experiment ${pslot} Terminated: *FAILED*"
} >> "${RUNTESTS}/ci.log"

error_logs=$(rocotostat -d "${db}" -w "${xml}" | grep -E 'FAIL|DEAD' | awk '{print "-c", $1, "-t", $2}' | xargs rocotocheck -d "${db}" -w "${xml}" | grep join | awk '{print $2}') || true
{
echo "Error logs:"
echo "${error_logs}"
} >> "${RUNTESTS}/ci.log"
sed -i "s/\`\`\`//2g" "${RUNTESTS}/ci.log"
sacct --format=jobid,jobname%35,WorkDir%100,stat | grep "${pslot}" | grep "${pr}\/RUNTESTS" | awk '{print $1}' | xargs scancel || true
rc=1
break
fi

if [[ "${num_done}" -eq "${num_cycles}" ]]; then
{
echo "Experiment ${pslot} Completed at $(date)" || true
echo "with ${num_succeeded} successfully completed jobs" || true
echo "Experiment ${pslot} Completed: *SUCCESS*"
} >> "${RUNTESTS}/ci.log"
sed -i "s/\`\`\`//2g" "${RUNTESTS}/ci.log"
rc=0
break
fi

# Wait before running rocotorun again
sleep 300

done

exit "${rc}"