-
Notifications
You must be signed in to change notification settings - Fork 168
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* added two files for this PR (now that develop has pslot tag updates) Co-authored-by: TerrenceMcGuinness-NOAA <[email protected]> Co-authored-by: Rahul Mahajan <[email protected]>
- Loading branch information
1 parent
15ef84d
commit 3e96ed5
Showing
2 changed files
with
193 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,86 @@ | ||
# CI workflow for the global-workflow, started manually (workflow_dispatch)
# and executed on the self-hosted runner labelled "orion-ready".
name: gw-ci-orion

on:
  workflow_dispatch:

# Every run works inside its own TEST_DIR (named after the run id), which
# holds two directories:
#   1. HOMEgfs:  clone of the global-workflow
#   2. RUNTESTS: A directory containing EXPDIR and COMROT for experiments
# e.g. $> tree ./TEST_DIR
# ./TEST_DIR
# ├── HOMEgfs
# └── RUNTESTS
#     ├── COMROT
#     │   └── ${pslot}
#     └── EXPDIR
#         └── ${pslot}
env:
  TEST_DIR: ${{ github.workspace }}/${{ github.run_id }}
  MACHINE_ID: orion

jobs:
  # Job 1: clone the workflow, fetch its components, build them and link them.
  checkout-build-link:
    runs-on:
      - self-hosted
      - orion-ready
    timeout-minutes: 600

    steps:
      - name: Checkout global-workflow
        uses: actions/checkout@v3
        with:
          path: ${{ github.run_id }}/HOMEgfs # This path needs to be relative

      - name: Checkout components
        run: |
          cd ${{ env.TEST_DIR }}/HOMEgfs/sorc
          ./checkout.sh -c -g # Options e.g. -u can be added later

      - name: Build components
        run: |
          cd ${{ env.TEST_DIR }}/HOMEgfs/sorc
          ./build_all.sh

      - name: Link artifacts
        run: |
          cd ${{ env.TEST_DIR }}/HOMEgfs/sorc
          ./link_workflow.sh

  # Job 2: create one experiment per matrix case from its YAML definition.
  create-experiments:
    needs: checkout-build-link
    runs-on:
      - self-hosted
      - orion-ready
    strategy:
      matrix:
        case:
          - "C48_S2S"
          - "C96_atm3DVar"

    steps:
      - name: Create Experiments ${{ matrix.case }}
        env:
          HOMEgfs_PR: ${{ env.TEST_DIR }}/HOMEgfs
          RUNTESTS: ${{ env.TEST_DIR }}/RUNTESTS
          # Experiment name is made unique per run by appending the run id
          pslot: ${{ matrix.case }}.${{ github.run_id }}
        run: |
          cd ${{ env.TEST_DIR }}/HOMEgfs
          source workflow/gw_setup.sh
          source ci/platforms/orion.sh
          ./ci/scripts/create_experiment.py --yaml ci/cases/${{ matrix.case }}.yaml --dir ${{ env.HOMEgfs_PR }}

  # Job 3: run each experiment and check it until it completes or fails.
  run-experiments:
    needs: create-experiments
    runs-on:
      - self-hosted
      - orion-ready
    strategy:
      max-parallel: 2
      matrix:
        case:
          - "C48_S2S"
          - "C96_atm3DVar"

    steps:
      - name: Run Experiment ${{ matrix.case }}
        run: |
          cd ${{ env.TEST_DIR }}/HOMEgfs
          ./ci/scripts/run-check_ci.sh ${{ env.TEST_DIR }} ${{ matrix.case }}.${{ github.run_id }}

  # Job 4: remove this run's directory from the runner workspace.
  clean-up:
    needs: run-experiments
    runs-on:
      - self-hosted
      - orion-ready

    steps:
      - name: Clean-up
        run: |
          cd ${{ github.workspace }}
          rm -rf ${{ github.run_id }}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,107 @@ | ||
#!/bin/bash

set -eu

#####################################################################################
# Script description: script to launch an experiment with Rocoto and check its
#                     status, as reported by Rocoto, until it completes or fails
#
# Usage: <this script> [TEST_DIR] [pslot]
#   TEST_DIR  Location of the root of the testing directory
#             (falls back to ${TEST_DIR} from the environment)
#   pslot     Name of the experiment being tested by this script
#             (falls back to ${pslot} from the environment)
#
# Optional environment:
#   ROCOTO_VERBOSE  Verbosity level handed to 'rocotorun -v' (default: 0)
#   pr              If set, prefix of '/RUNTESTS' used to select the Slurm jobs
#                   to cancel on failure; if unset, any '/RUNTESTS' work
#                   directory of this experiment matches
#
# Exit status:
#   0  every cycle is reported as Done
#   1  a task is reported as FAIL/DEAD, or the setup failed (missing argument,
#      experiment directory or XML file)
#   2  the Rocoto database was not created by the initial rocotorun
#####################################################################################

# ':?' aborts with a clear message when neither the argument nor the environment
# variable is provided, instead of silently continuing with a bogus value.
TEST_DIR=${1:-${TEST_DIR:?TEST_DIR must be passed as argument 1 or set in the environment}}  # Location of the root of the testing directory
pslot=${2:-${pslot:?pslot must be passed as argument 2 or set in the environment}}           # Name of the experiment being tested by this script

# TEST_DIR contains 2 directories;
# 1. HOMEgfs: clone of the global-workflow
# 2. RUNTESTS: A directory containing EXPDIR and COMROT for experiments
# e.g. $> tree ./TEST_DIR
# ./TEST_DIR
# ├── HOMEgfs
# └── RUNTESTS
#     ├── COMROT
#     │   └── ${pslot}
#     └── EXPDIR
#         └── ${pslot}
HOMEgfs="${TEST_DIR}/HOMEgfs"
RUNTESTS="${TEST_DIR}/RUNTESTS"

# Source modules and setup logging
echo "Source modules."
source "${HOMEgfs}/workflow/gw_setup.sh"

# cd into the experiment directory
echo "cd ${RUNTESTS}/EXPDIR/${pslot}"
# A brace group (not a subshell) is used so that 'exit' terminates this script
# itself rather than relying on 'set -e' to notice a failed subshell.
cd "${RUNTESTS}/EXPDIR/${pslot}" || {
  echo "FATAL ERROR: Unable to cd into '${RUNTESTS}/EXPDIR/${pslot}', ABORT!"
  exit 1
}

# Name of the Rocoto XML and database files
xml="${pslot}.xml"
db="${pslot}.db"

# Ensure the XML is present for the experiment
if [[ ! -f "${xml}" ]]; then
  echo "FATAL ERROR: XML file ${xml} not found in '${pslot}', experiment ${pslot} failed, ABORT!"
  exit 1
fi

# Launch experiment
echo "Launch experiment with Rocoto."
rocotorun -v "${ROCOTO_VERBOSE:-0}" -w "${xml}" -d "${db}"
sleep 30
if [[ ! -f "${db}" ]]; then
  echo "FATAL ERROR: Rocoto database file ${db} not found, experiment ${pslot} failed, ABORT!"
  exit 2
fi

# Experiment launched
rc=99
while true; do

  echo "Run rocotorun."
  rocotorun -v "${ROCOTO_VERBOSE:-0}" -w "${xml}" -d "${db}"

  # Wait before running rocotostat
  sleep 30

  # Get job statistics
  # '|| true' keeps 'set -e' from aborting when grep finds no match (exit 1).
  echo "Gather Rocoto statistics"
  # Summary output with the lines containing CYCLE filtered out
  rocotostat_output=$(rocotostat -w "${xml}" -d "${db}" -s | grep -v CYCLE) || true
  num_cycles=$(echo "${rocotostat_output}" | wc -l) || true
  num_done=$(echo "${rocotostat_output}" | grep -c Done) || true
  num_succeeded=$(rocotostat -w "${xml}" -d "${db}" -a | grep -c SUCCEEDED) || true
  num_failed=$(rocotostat -w "${xml}" -d "${db}" -a | grep -c -E 'FAIL|DEAD') || true

  echo "${pslot} Total Cycles: ${num_cycles} number done: ${num_done}"

  if [[ ${num_failed} -ne 0 ]]; then
    {
      echo "Experiment ${pslot} Terminated with ${num_failed} tasks failed at $(date)" || true
      echo "Experiment ${pslot} Terminated: *FAILED*"
    } >> "${RUNTESTS}/ci.log"

    # For every FAIL/DEAD task, ask rocotocheck about it and keep the second
    # field of the lines containing 'join' (NOTE(review): presumably the path
    # of the task's log file - confirm against rocotocheck output).
    error_logs=$(rocotostat -d "${db}" -w "${xml}" | grep -E 'FAIL|DEAD' | awk '{print "-c", $1, "-t", $2}' | xargs rocotocheck -d "${db}" -w "${xml}" | grep join | awk '{print $2}') || true
    {
      echo "Error logs:"
      echo "${error_logs}"
    } >> "${RUNTESTS}/ci.log"
    # On each line of ci.log, remove every ``` after the first one (GNU sed '2g')
    sed -i "s/\`\`\`//2g" "${RUNTESTS}/ci.log"
    # Cancel the jobs of this experiment that are still known to Slurm.
    # 'pr' is not set anywhere in this script: '${pr:-}' keeps 'set -u' from
    # aborting here (which would skip both the scancel and the 'rc=1' below),
    # and 'grep -F' matches the work directory as a literal string.
    sacct --format=jobid,jobname%35,WorkDir%100,stat | grep "${pslot}" | grep -F -- "${pr:-}/RUNTESTS" | awk '{print $1}' | xargs scancel || true
    rc=1
    break
  fi

  if [[ "${num_done}" -eq "${num_cycles}" ]]; then
    {
      echo "Experiment ${pslot} Completed at $(date)" || true
      echo "with ${num_succeeded} successfully completed jobs" || true
      echo "Experiment ${pslot} Completed: *SUCCESS*"
    } >> "${RUNTESTS}/ci.log"
    # On each line of ci.log, remove every ``` after the first one (GNU sed '2g')
    sed -i "s/\`\`\`//2g" "${RUNTESTS}/ci.log"
    rc=0
    break
  fi

  # Wait before running rocotorun again
  sleep 300

done

exit "${rc}"