Skip to content

Commit

Permalink
Merge branch 'NOAA-EMC:develop' into develop
Browse files Browse the repository at this point in the history
  • Loading branch information
CoryMartin-NOAA authored Aug 7, 2024
2 parents 09b1133 + 37c53ac commit b193128
Show file tree
Hide file tree
Showing 20 changed files with 197 additions and 92 deletions.
11 changes: 11 additions & 0 deletions .github/pull_request_template.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,17 @@
# Change characteristics
- Is this a breaking change (a change in existing functionality)? YES/NO
- Does this change require a documentation update? YES/NO
- Does this change require an update to any of the following submodules? YES/NO (If YES, please add a link to any PRs that are pending.)
- [ ] EMC verif-global
- [ ] GDAS
- [ ] GFS-utils
- [ ] GSI
- [ ] GSI-monitor
- [ ] GSI-utils
- [ ] UFS-utils
- [ ] UFS-weather-model
- [ ] wxflow


# How has this been tested?
<!-- Please list any test you conducted, including the machine.
Expand Down
4 changes: 2 additions & 2 deletions ci/Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -282,10 +282,10 @@ pipeline {
steps {
script {
sh(script: """
labels=\$(gh pr view ${env.CHANGE_ID} --repo ${repo_url} --json labels --jq '.labels[].name')
labels=\$(${GH} pr view ${env.CHANGE_ID} --repo ${repo_url} --json labels --jq '.labels[].name')
for label in \$labels; do
if [[ "\$label" == *"${Machine}"* ]]; then
gh pr edit ${env.CHANGE_ID} --repo ${repo_url} --remove-label "\$label"
${GH} pr edit ${env.CHANGE_ID} --repo ${repo_url} --remove-label "\$label"
fi
done
""", returnStatus: true)
Expand Down
143 changes: 125 additions & 18 deletions ci/scripts/utils/launch_java_agent.sh
Original file line number Diff line number Diff line change
@@ -1,8 +1,70 @@
#!/bin/env bash

set -e

# ==============================================================================
# Script Name: launch_java_agent.sh
#
# Description:
# This script automates the process of launching a Jenkins agent
# on a specified machine. It ensures that the necessary
# prerequisites are met, such as the availability of JAVA_HOME,
# the Jenkins agent launch directory, and proper authentication
# with GitHub.
#
# It then proceeds to check if the Jenkins node is online and
# decides whether to launch the Jenkins agent based on the node's
# status. The agent is launched in the background,
# and its PID is logged for reference.
#
# Prerequisites:
# JAVA_HOME must be set to a valid JDK installation.
# Jenkins agent launch directory must exist and be specified.
# GitHub CLI (gh) must be installed and authenticated for messeging
# from the Jenkins controller to GitHub PR via shell commands.
# Jenkins agent launch directory must exist and be specified.
# TODO: Must use GitHub CLI v2.25.1 (newer versoins have issues)
# https://github.com/cli/cli/releases/download/v2.25.1/gh_2.25.1_linux_amd64.tar.gz
# Jenkins controller URL and authentication token must be provided.
# jenkins-secret-file:
# Must be present in the Jenkins agent launch directory.
# This file contains the secret key for the Jenkins agent
# established by the Jenkins administrator for each Node.
# jenkins_token:
# Must be present in the Jenkins agent launch directory.
# This file contains the user authentication token for the Jenkins controller
# to use the Remote API. This token can be generated by the user
# on the Jenkins controller.
# controller_user:
# Must be set to the Jenkins controller username corresponing to the jenkins_token.
#
# Usage: ./launch_java_agent.sh [now] [-f]
# The optional 'now' argument forces the script to launch the Jenkins
# agent without waiting before trying again.
# The optional 'force' argument forces the script to launch the Jenkins regarless of the node status.
#
# ==============================================================================

force_launch="False"
skip_wait="False"
while getopts ":fnh" flag; do
case "${flag}" in
f) force_launch="True";;
n) skip_wait="True";;
h) echo "Usage: ./launch_java_agent.sh [now] [force]
Two mutually exclusive optional arguments:
-n (now) causes the script to launch the Jenkins agent without waiting before trying again.
-f (force) forces the script to launch the Jenkins regarless of its connection status."
exit 0 ;;
*) echo "Unknown flag: ${flag}"
exit 1;;
esac
done

controller_url="https://jenkins.epic.oarcloud.noaa.gov"
controller_user="terry.mcguinness"
controller_user=${controller_user:-"terry.mcguinness"}
controller_user_auth_token="jenkins_token"

HOMEgfs="$(cd "$(dirname "${BASH_SOURCE[0]}")/../../.." >/dev/null 2>&1 && pwd )"
host=$(hostname)

Expand All @@ -13,12 +75,10 @@ host=$(hostname)
source "${HOMEgfs}/ush/detect_machine.sh"
case ${MACHINE_ID} in
hera | orion | hercules | wcoss2)
echo "Launch Jenkins Java Controler on ${MACHINE_ID}"
;;
echo "Launch Jenkins Java Controler on ${MACHINE_ID}";;
*)
echo "Unsupported platform. Exiting with error."
exit 1
;;
exit 1;;
esac

LOG=lanuched_agent-$(date +%Y%m%d%M).log
Expand All @@ -43,9 +103,16 @@ echo "JAVA VERSION: "
${JAVA} -version

export GH="${HOME}/bin/gh"
command -v "${GH}"
[[ -f "${GH}" ]] || echo "gh is not installed in ${HOME}/bin"
${GH} --version

check_mark=$(gh auth status -t 2>&1 | grep "Token:" | awk '{print $1}') || true
if [[ "${check_mark}" != "" ]]; then
echo "gh not authenticating with emcbot token"
exit 1
fi
echo "gh authenticating with emcbot TOKEN ok"

if [[ -d "${JENKINS_AGENT_LANUCH_DIR}" ]]; then
echo "Jenkins Agent Lanuch Directory: ${JENKINS_AGENT_LANUCH_DIR}"
else
Expand All @@ -56,22 +123,62 @@ cd "${JENKINS_AGENT_LANUCH_DIR}"

if ! [[ -f agent.jar ]]; then
curl -sO "${controller_url}/jnlpJars/agent.jar"
echo "Updated agent.jar downloaded"
fi

if [[ ! -f "${controller_user_auth_token}" ]]; then
echo "User Jenkins authetication TOKEN to the controller for using the Remote API does not exist"
exit 1
fi
JENKINS_TOKEN=$(cat "${controller_user_auth_token}")

cat << EOF > parse.py
#!/usr/bin/env python3
import json,sys
with open(sys.argv[1], 'r') as file:
data = json.load(file)
print(data.get('offline','True'))
EOF
chmod u+x parse.py

JENKINS_TOKEN=$(cat jenkins_token)
check_node_online() {
rm -f curl_response
curl_response=$(curl --silent -u "${controller_user}:${JENKINS_TOKEN}" "${controller_url}/computer/${MACHINE_ID^}-EMC/api/json?pretty=true") || true
if [[ "${curl_response}" == "" ]]; then
echo "ERROR: Jenkins controller not reachable. Exiting with error."
exit 1
fi
echo -n "${curl_response}" > curl_response
./parse.py curl_response
}

lauch_agent () {
echo "Launching Jenkins Agent on ${host}"
command="nohup ${JAVA} -jar agent.jar -jnlpUrl ${controller_url}/computer/${MACHINE_ID^}-EMC/jenkins-agent.jnlp -secret @jenkins-secret-file -workDir ${JENKINS_WORK_DIR}"
echo -e "Launching Jenkins Agent on ${host} with the command:\n${command}" >& "${LOG}"
${command} >> "${LOG}" 2>&1 &
nohup_PID=$!
echo "Java agent running on PID: ${nohup_PID}" >> "${LOG}" 2>&1
}

if [[ "${force_launch}" == "True" ]]; then
lauch_agent
exit
fi

#
offline=$(curl --silent -u "${controller_user}:${JENKINS_TOKEN}" "${controller_url}/computer/${MACHINE_ID^}-EMC/api/json?pretty=true" | grep '\"offline\"' | awk '{gsub(/,/,"");print $3}') || true
echo "Jenkins Agent offline setting: ${offline}"
offline=$(set -e; check_node_online)

if [[ "${offline}" == "true" ]]; then
echo "Jenkins Agent is offline. Lanuching Jenkins Agent on ${host}"
command="nohup ${JAVA} -jar agent.jar -jnlpUrl ${controller_url}/computer/${MACHINE_ID^}-EMC/jenkins-agent.jnlp -secret @jenkins-secret-file -workDir ${JENKINS_WORK_DIR}"
echo -e "Lanuching Jenkins Agent on ${host} with the command:\n${command}" >& "${LOG}"
${command} >> "${LOG}" 2>&1 &
nohup_PID=$!
echo "Java agent running on PID: ${nohup_PID}" >> "${LOG}" 2>&1
echo "Java agent running on PID: ${nohup_PID}"
if [[ "${offline}" != "False" ]]; then
if [[ "${skip_wait}" != "True" ]]; then
echo "Jenkins Agent is offline. Waiting 5 more minutes to check again in the event it is a temp network issue"
sleep 300
offline=$(set -e; check_node_online)
fi
if [[ "${offline}" != "False" ]]; then
lauch_agent
else
echo "Jenkins Agent is online (nothing done)"
fi
else
echo "Jenkins Agent is online (nothing done)"
fi
2 changes: 1 addition & 1 deletion jobs/JGFS_ATMOS_FBWIND
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
# GFS FBWIND PRODUCT GENERATION
############################################
source "${HOMEgfs}/ush/preamble.sh"
source "${HOMEgfs}/ush/jjob_header.sh" -e "fbwind" -c "base"
source "${HOMEgfs}/ush/jjob_header.sh" -e "fbwind" -c "base fbwind"

###################################
# Specify NET and RUN Name and model
Expand Down
2 changes: 1 addition & 1 deletion jobs/JGFS_ATMOS_GEMPAK_META
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
# GFS GEMPAK META PRODUCT GENERATION
############################################
source "${HOMEgfs}/ush/preamble.sh"
source "${HOMEgfs}/ush/jjob_header.sh" -e "gempak_meta" -c "base"
source "${HOMEgfs}/ush/jjob_header.sh" -e "gempak_meta" -c "base gempak"


###############################################
Expand Down
2 changes: 1 addition & 1 deletion jobs/JGFS_ATMOS_GEMPAK_PGRB2_SPEC
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#! /usr/bin/env bash

source "${HOMEgfs}/ush/preamble.sh"
source "${HOMEgfs}/ush/jjob_header.sh" -e "gempak_spec" -c "base"
source "${HOMEgfs}/ush/jjob_header.sh" -e "gempak_spec" -c "base gempak"

############################################
# Set up model and cycle specific variables
Expand Down
6 changes: 3 additions & 3 deletions jobs/JGLOBAL_CLEANUP
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,13 @@ source "${HOMEgfs}/ush/jjob_header.sh" -e "cleanup" -c "base cleanup"

"${SCRgfs}/exglobal_cleanup.sh"
status=$?
[[ ${status} -ne 0 ]] && exit "${status}"
(( status != 0 )) && exit "${status}"

##########################################
# Remove the Temporary working directory
##########################################
cd "${DATAROOT}" || (echo "${DATAROOT} does not exist. ABORT!"; exit 1)
[[ ${KEEPDATA} = "NO" ]] && rm -rf "${DATA}"
# DATAROOT="${STMP}/RUNDIRS/${PSLOT}/${RUN}.${PDY}${cyc}"
# is removed in exglobal_cleanup.sh, nothing to do here.

exit 0

1 change: 0 additions & 1 deletion parm/config/gefs/config.base
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,6 @@ export PSLOT="@PSLOT@"
export EXPDIR="@EXPDIR@/${PSLOT}"
export ROTDIR="@COMROOT@/${PSLOT}"

export DATAROOT="${STMP}/RUNDIRS/${PSLOT}" # TODO: set via prod_envir in Ops
export ARCDIR="${NOSCRUB}/archive/${PSLOT}"
export ATARDIR="@ATARDIR@"

Expand Down
1 change: 0 additions & 1 deletion parm/config/gfs/config.base
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,6 @@ export DUMP_SUFFIX=""
if [[ "${PDY}${cyc}" -ge "2019092100" && "${PDY}${cyc}" -le "2019110700" ]]; then
export DUMP_SUFFIX="p" # Use dumps from NCO GFS v15.3 parallel
fi
export DATAROOT="${STMP}/RUNDIRS/${PSLOT}" # TODO: set via prod_envir in Ops
export ARCDIR="${NOSCRUB}/archive/${PSLOT}"
export ATARDIR="@ATARDIR@"

Expand Down
11 changes: 11 additions & 0 deletions parm/config/gfs/config.fbwind
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#! /usr/bin/env bash

########## config.gempak ##########
# GFS fbwind step specific

echo "BEGIN: config.fbwind"

# Get task specific resources
source "${EXPDIR}/config.resources" fbwind

echo "END: config.fbwind"
11 changes: 9 additions & 2 deletions parm/config/gfs/config.resources
Original file line number Diff line number Diff line change
Expand Up @@ -925,8 +925,8 @@ case ${step} in
threads_per_task=1
walltime_gdas="03:00:00"
walltime_gfs="06:00:00"
ntasks=4
tasks_per_node=4
ntasks=1
tasks_per_node=1
export memory="80G"
;;

Expand Down Expand Up @@ -1160,6 +1160,13 @@ case ${step} in
memory_gfs="2GB"
;;

"fbwind")
walltime="00:05:00"
ntasks=1
threads_per_task=1
memory="4GB"
;;

"mos_stn_prep")
walltime="00:10:00"
ntasks=3
Expand Down
33 changes: 9 additions & 24 deletions scripts/exglobal_cleanup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,41 +12,26 @@ if [[ -d "${DATAfcst}" ]]; then rm -rf "${DATAfcst}"; fi
#DATAefcs="${DATAROOT}/${RUN}efcs???${PDY:-}${cyc}"
rm -rf "${DATAROOT}/${RUN}efcs"*"${PDY:-}${cyc}"

# Search and delete files/directories from DATAROOT/ older than ${purge_every_days} days
# purge_every_days should be a positive integer
#purge_every_days=3

# Find and delete files older than ${purge_every_days} days
#find "${DATAROOT}/"* -type f -mtime "+${purge_every_days}" -exec rm -f {} \;

# Find and delete directories older than ${purge_every_days} days
#find "${DATAROOT}/"* -type d -mtime "+${purge_every_days}" -exec rm -rf {} \;
# In XML, DATAROOT is defined as:
#DATAROOT="${STMP}/RUNDIRS/${PSLOT}/${RUN}.${PDY}${cyc}"
# cleanup is only executed after the entire cycle is successfully completed.
# removing DATAROOT should be possible if that is the case.
rm -rf "${DATAROOT}"

echo "Cleanup ${DATAROOT} completed!"
###############################################################

###############################################################
# Clean up previous cycles; various depths
# PRIOR CYCLE: Leave the prior cycle alone
# shellcheck disable=SC2153
GDATE=$(date --utc +%Y%m%d%H -d "${PDY} ${cyc} -${assim_freq} hours")
# PREVIOUS to the PRIOR CYCLE
GDATE=$(date --utc +%Y%m%d%H -d "${GDATE:0:8} ${GDATE:8:2} -${assim_freq} hours")

# Remove the TMPDIR directory
# TODO Only prepbufr is currently using this directory, and all jobs should be
# cleaning up after themselves anyway
COMIN="${DATAROOT}/${GDATE}"
[[ -d ${COMIN} ]] && rm -rf "${COMIN}"

if [[ "${CLEANUP_COM:-YES}" == NO ]] ; then
exit 0
fi

###############################################################
# Clean up previous cycles; various depths

# Step back every assim_freq hours and remove old rotating directories
# for successful cycles (defaults from 24h to 120h).
# Retain files needed by Fit2Obs
last_date=$(date --utc +%Y%m%d%H -d "${PDY} ${cyc} -${RMOLDEND:-24} hours" )
last_date=$(date --utc +%Y%m%d%H -d "${PDY} ${cyc} -${RMOLDEND:-24} hours")
first_date=$(date --utc +%Y%m%d%H -d "${PDY} ${cyc} -${RMOLDSTD:-120} hours")
last_rtofs=$(date --utc +%Y%m%d%H -d "${PDY} ${cyc} -${RMOLDRTOFS:-48} hours")
function remove_files() {
Expand Down
4 changes: 2 additions & 2 deletions sorc/link_workflow.sh
Original file line number Diff line number Diff line change
Expand Up @@ -71,8 +71,8 @@ ${LINK_OR_COPY} "${HOMEgfs}/versions/run.${machine}.ver" "${HOMEgfs}/versions/ru
case "${machine}" in
"wcoss2") FIX_DIR="/lfs/h2/emc/global/noscrub/emc.global/FIX/fix" ;;
"hera") FIX_DIR="/scratch1/NCEPDEV/global/glopara/fix" ;;
"orion") FIX_DIR="/work/noaa/global/kfriedma/glopara/fix" ;;
"hercules") FIX_DIR="/work/noaa/global/kfriedma/glopara/fix" ;;
"orion") FIX_DIR="/work/noaa/global/glopara/fix" ;;
"hercules") FIX_DIR="/work/noaa/global/glopara/fix" ;;
"jet") FIX_DIR="/lfs4/HFIP/hfv3gfs/glopara/git/fv3gfs/fix" ;;
"s4") FIX_DIR="/data/prod/glopara/fix" ;;
"gaea") FIX_DIR="/gpfs/f5/ufs-ard/world-shared/global/glopara/data/fix" ;;
Expand Down
15 changes: 0 additions & 15 deletions ush/check_netcdf.sh

This file was deleted.

Loading

0 comments on commit b193128

Please sign in to comment.