Skip to content

Commit

Permalink
[develop] Fixes for PW Jenkins Nightly Builds (ufs-community#1091)
Browse files Browse the repository at this point in the history
* Adds logic to handle GCP's default conda env, which conflicts with the SRW App's conda env. Fixes a Parallel Works naming convention bug in the script.
* It also addresses a known issue with a Ruby warning on PW instances that prevents run_WE2E_tests.py from exiting gracefully. The solution we use in our bootstrap for /contrib doesn't seem to work for the /lustre directory, which is why the warning is hardcoded into the tests/WE2E/utils.py script.
* The new spack-stack build on Azure is missing a GNU library, so the path to this missing library was added to the proper run scripts, and the wflow noaacloud lua file was cleaned up.
* Removed the hardcoded log and error file directives from the qsub wrapper script so that qsub can generate these files with the job ID in the file names. Also fixed a typo in the wrapper script.
  • Loading branch information
EdwardSnyder-NOAA authored and Natalie Perlin committed Aug 15, 2024
1 parent fbc762e commit 8d9a137
Show file tree
Hide file tree
Showing 6 changed files with 17 additions and 12 deletions.
6 changes: 3 additions & 3 deletions .cicd/Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -235,19 +235,19 @@ pipeline {

sh "SRW_WE2E_COMPREHENSIVE_TESTS=${run_we2e_comprehensive_tests} SRW_WE2E_SINGLE_TEST=${single_test}" + ' bash --login "${WORKSPACE}/${SRW_PLATFORM}/.cicd/scripts/srw_test.sh"'

// Archive the test log files
sh "[[ -d ${SRW_WE2E_EXPERIMENT_BASE_DIR} ]] && cd ${SRW_WE2E_EXPERIMENT_BASE_DIR} && tar --create --gzip --verbose --dereference --file ${WORKSPACE}/${SRW_PLATFORM}/we2e_test_logs-${SRW_PLATFORM}-${SRW_COMPILER}.tgz */log.generate_FV3LAM_wflow */log/* ${WORKSPACE}/${SRW_PLATFORM}/tests/WE2E/WE2E_tests_*yaml WE2E_summary*txt ${WORKSPACE}/${SRW_PLATFORM}/tests/WE2E/log.* || cat /dev/null > ${WORKSPACE}/${SRW_PLATFORM}/we2e_test_logs-${SRW_PLATFORM}-${SRW_COMPILER}.tgz"
}
sh "STAGE_NAME=${env.STAGE_NAME} " + 'bash --login "${WORKSPACE}/${SRW_PLATFORM}/.cicd/scripts/disk_usage.sh"'
}
}

post {
success {
s3Upload consoleLogLevel: 'INFO', dontSetBuildResultOnFailure: false, dontWaitForConcurrentBuildCompletion: false, entries: [[bucket: 'noaa-epic-prod-jenkins-artifacts', excludedFile: '', flatten: false, gzipFiles: false, keepForever: false, managedArtifacts: true, noUploadOnFailure: false, selectedRegion: 'us-east-1', showDirectlyInBrowser: false, sourceFile: "${env.SRW_PLATFORM}/*_test_results-*-*.txt", storageClass: 'STANDARD', uploadFromSlave: false, useServerSideEncryption: false], [bucket: 'noaa-epic-prod-jenkins-artifacts', excludedFile: '', flatten: false, gzipFiles: false, keepForever: false, managedArtifacts: true, noUploadOnFailure: false, selectedRegion: 'us-east-1', showDirectlyInBrowser: false, sourceFile: "${env.SRW_PLATFORM}/we2e_test_logs-${env.SRW_PLATFORM}-${env.SRW_COMPILER}.tgz", storageClass: 'STANDARD', uploadFromSlave: false, useServerSideEncryption: false]], pluginFailureResultConstraint: 'FAILURE', profileName: 'main', userMetadata: []
s3Upload consoleLogLevel: 'INFO', dontSetBuildResultOnFailure: false, dontWaitForConcurrentBuildCompletion: false, entries: [[bucket: 'noaa-epic-prod-jenkins-artifacts', excludedFile: '', flatten: false, gzipFiles: false, keepForever: false, managedArtifacts: true, noUploadOnFailure: false, selectedRegion: 'us-east-1', showDirectlyInBrowser: false, sourceFile: "${env.SRW_PLATFORM}/*-skill-score.txt", storageClass: 'STANDARD', uploadFromSlave: false, useServerSideEncryption: false]], pluginFailureResultConstraint: 'FAILURE', profileName: 'main', userMetadata: []
}
always {
// Archive the test log files
sh "[[ -d ${SRW_WE2E_EXPERIMENT_BASE_DIR} ]] && cd ${SRW_WE2E_EXPERIMENT_BASE_DIR} && tar --create --gzip --verbose --dereference --file ${env.WORKSPACE}/${env.SRW_PLATFORM}/we2e_test_logs-${env.SRW_PLATFORM}-${env.SRW_COMPILER}.tgz */log.generate_FV3LAM_wflow */log/* ${env.WORKSPACE}/${env.SRW_PLATFORM}/tests/WE2E/WE2E_tests_*yaml WE2E_summary*txt ${env.WORKSPACE}/${env.SRW_PLATFORM}/tests/WE2E/log.* || cat /dev/null > ${env.WORKSPACE}/${env.SRW_PLATFORM}/we2e_test_logs-${env.SRW_PLATFORM}-${env.SRW_COMPILER}.tgz"
s3Upload consoleLogLevel: 'INFO', dontSetBuildResultOnFailure: false, dontWaitForConcurrentBuildCompletion: false, entries: [[bucket: 'noaa-epic-prod-jenkins-artifacts', excludedFile: '', flatten: false, gzipFiles: false, keepForever: false, managedArtifacts: true, noUploadOnFailure: false, selectedRegion: 'us-east-1', showDirectlyInBrowser: false, sourceFile: "${env.SRW_PLATFORM}/*_test_results-*-*.txt", storageClass: 'STANDARD', uploadFromSlave: false, useServerSideEncryption: false], [bucket: 'noaa-epic-prod-jenkins-artifacts', excludedFile: '', flatten: false, gzipFiles: false, keepForever: false, managedArtifacts: true, noUploadOnFailure: false, selectedRegion: 'us-east-1', showDirectlyInBrowser: false, sourceFile: "${env.SRW_PLATFORM}/we2e_test_logs-${env.SRW_PLATFORM}-${env.SRW_COMPILER}.tgz", storageClass: 'STANDARD', uploadFromSlave: false, useServerSideEncryption: false]], pluginFailureResultConstraint: 'FAILURE', profileName: 'main', userMetadata: []
s3Upload consoleLogLevel: 'INFO', dontSetBuildResultOnFailure: false, dontWaitForConcurrentBuildCompletion: false, entries: [[bucket: 'noaa-epic-prod-jenkins-artifacts', excludedFile: '', flatten: false, gzipFiles: false, keepForever: false, managedArtifacts: true, noUploadOnFailure: false, selectedRegion: 'us-east-1', showDirectlyInBrowser: false, sourceFile: "${env.SRW_PLATFORM}-*-time-srw_test.json", storageClass: 'STANDARD', uploadFromSlave: false, useServerSideEncryption: false]], pluginFailureResultConstraint: 'FAILURE', profileName: 'main', userMetadata: []
s3Upload consoleLogLevel: 'INFO', dontSetBuildResultOnFailure: false, dontWaitForConcurrentBuildCompletion: false, entries: [[bucket: 'noaa-epic-prod-jenkins-artifacts', excludedFile: '', flatten: false, gzipFiles: false, keepForever: false, managedArtifacts: true, noUploadOnFailure: false, selectedRegion: 'us-east-1', showDirectlyInBrowser: false, sourceFile: "${env.SRW_PLATFORM}-*-disk-usage${env.STAGE_NAME}.csv", storageClass: 'STANDARD', uploadFromSlave: false, useServerSideEncryption: false]], pluginFailureResultConstraint: 'FAILURE', profileName: 'main', userMetadata: []
// Remove the data sets from the experiments directory to conserve disk space
Expand Down
2 changes: 0 additions & 2 deletions .cicd/scripts/qsub_srw_ftest.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,5 @@
#PBS -l select=1:ncpus=24:mpiprocs=24:ompthreads=1
#PBS -l walltime=00:30:00
#PBS -V
#PBS -o log_wrap.%j.log
#PBS -e err_wrap.%j.err

bash ${WORKSPACE}/${SRW_PLATFORM}/.cicd/scripts/srw_ftest.sh
5 changes: 5 additions & 0 deletions .cicd/scripts/srw_ftest.sh
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,9 @@ sed "s|^workflow:|workflow:\n EXEC_SUBDIR: ${workspace}/install_${SRW_COMPILER}
# Decrease forecast length since we are running all the steps
sed "s|^ FCST_LEN_HRS: 12| FCST_LEN_HRS: 6|g" -i ush/config.yaml

# Update compiler
sed "s|^ COMPILER: intel| COMPILER: ${SRW_COMPILER}|g" -i ush/config.yaml

# DATA_LOCATION differs on each platform ... find it.
export DATA_LOCATION=$(grep TEST_EXTRN_MDL_SOURCE_BASEDIR ${workspace}/ush/machine/${platform,,}.yaml | awk '{printf "%s", $2}')
echo "DATA_LOCATION=${DATA_LOCATION}"
Expand All @@ -85,6 +88,8 @@ source etc/lmod-setup.sh ${platform,,}
module use modulefiles
module load build_${platform,,}_${SRW_COMPILER}
module load wflow_${platform,,}
# Deactivate conflicting conda env on GCP
[[ "${SRW_PLATFORM}" =~ "gclusternoaa" ]] && conda deactivate

[[ ${FORGIVE_CONDA} == true ]] && set +e +u # Some platforms have incomplete python3 or conda support, but wouldn't necessarily block workflow tests
conda activate srw_app
Expand Down
12 changes: 7 additions & 5 deletions .cicd/scripts/srw_metric.sh
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,8 @@ cd ${workspace}

# Activate workflow environment
module load wflow_${platform,,}
# Deactivate conflicting conda env on GCP
[[ "${SRW_PLATFORM}" =~ "gclusternoaa" ]] && conda deactivate

[[ ${FORGIVE_CONDA} == true ]] && set +e +u # Some platforms have incomplete python3 or conda support, but would not necessarily block workflow tests
conda activate srw_app
Expand All @@ -98,7 +100,7 @@ if [[ ${RUN_STAT_ANLY_OPT} == true ]]; then
# Clear out data
rm -rf ${workspace}/Indy-Severe-Weather/
# Check if metprd data exists locally otherwise get it from S3
TEST_EXTRN_MDL_SOURCE_BASEDIR=$(grep TEST_EXTRN_MDL_SOURCE_BASEDIR ${workspace}/ush/machine/${SRW_PLATFORM}.yaml | awk '{print $NF}')
TEST_EXTRN_MDL_SOURCE_BASEDIR=$(grep TEST_EXTRN_MDL_SOURCE_BASEDIR ${workspace}/ush/machine/${platform}.yaml | awk '{print $NF}')
if [[ -d $(dirname ${TEST_EXTRN_MDL_SOURCE_BASEDIR})/metprd/point_stat ]] ; then
mkdir -p Indy-Severe-Weather/metprd/point_stat
cp -rp $(dirname ${TEST_EXTRN_MDL_SOURCE_BASEDIR})/metprd/point_stat Indy-Severe-Weather/metprd
Expand All @@ -108,7 +110,7 @@ if [[ ${RUN_STAT_ANLY_OPT} == true ]]; then
wget https://noaa-ufs-srw-pds.s3.amazonaws.com/sample_cases/release-public-v2.1.0/Indy-Severe-Weather.tgz
tar xvfz Indy-Severe-Weather.tgz
fi
[[ -f ${platform,,}-${srw_compiler}-skill-score.txt ]] && rm ${platform,,}-${srw_compiler}-skill-score.txt
[[ -f ${SRW_PLATFORM,,}-${srw_compiler}-skill-score.txt ]] && rm ${SRW_PLATFORM,,}-${srw_compiler}-skill-score.txt
# Skill score index is computed over several terms that are defined in parm/metplus/STATAnalysisConfig_skill_score.
# It is computed by aggregating the output from earlier runs of the Point-Stat and/or Grid-Stat tools over one or more cases.
# In this example, skill score index is a weighted average of 4 skill scores of RMSE statistics for wind speed, dew point temperature,
Expand All @@ -126,15 +128,15 @@ if [[ ${RUN_STAT_ANLY_OPT} == true ]]; then
sed -i 's|--load("conda")|load("conda")|g' ${workspace}/modulefiles/tasks/${platform,,}/run_vx.local.lua
fi
# Run stat_analysis
stat_analysis -config parm/metplus/STATAnalysisConfig_skill_score -lookin ${workspace}/Indy-Severe-Weather/metprd/point_stat -v 2 -out ${platform,,}-${srw_compiler}-skill-score.txt
stat_analysis -config parm/metplus/STATAnalysisConfig_skill_score -lookin ${workspace}/Indy-Severe-Weather/metprd/point_stat -v 2 -out ${SRW_PLATFORM,,}-${srw_compiler}-skill-score.txt

# check skill-score.txt
cat ${platform,,}-${srw_compiler}-skill-score.txt
cat ${SRW_PLATFORM,,}-${srw_compiler}-skill-score.txt

# get skill-score (SS_INDEX) and check if it is significantly smaller than 1.0
# A value greater than 1.0 indicates that the forecast model outperforms the reference,
# while a value less than 1.0 indicates that the reference outperforms the forecast.
tmp_string=$( tail -2 ${platform,,}-${srw_compiler}-skill-score.txt | head -1 )
tmp_string=$( tail -2 ${SRW_PLATFORM,,}-${srw_compiler}-skill-score.txt | head -1 )
SS_INDEX=$(echo $tmp_string | awk -F " " '{print $NF}')
echo "Skill Score: ${SS_INDEX}"
if [[ ${SS_INDEX} < "0.700" ]]; then
Expand Down
2 changes: 1 addition & 1 deletion .cicd/scripts/wrapper_srw_ftest.sh
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ do
# Return exit code and check for results file first
results_file="${WORKSPACE}/${SRW_PLATFORM}/functional_test_results_${SRW_PLATFORM}_${SRW_COMPILER}.txt"
if [ ! -f "$results_file" ]; then
echo "Missing results file! \nexit 1"
echo -e "Missing results file! \nexit 1"
exit 1
fi

Expand Down
2 changes: 1 addition & 1 deletion tests/WE2E/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -530,7 +530,7 @@ def compare_rocotostat(expt_dict,name):
continue
line_array = line.split()
# Skip header lines
if line_array[0] == 'CYCLE':
if line_array[0] == 'CYCLE' or line_array[0] == '/apps/rocoto/1.3.3/lib/workflowmgr/launchserver.rb:40:':
continue
# We should now just have lines describing jobs, in the form:
# line_array = ['cycle','task','jobid','status','exit status','num tries','walltime']
Expand Down

0 comments on commit 8d9a137

Please sign in to comment.