Skip to content

Commit

Permalink
Feature #2612 forecast lead groups (#2698)
Browse files Browse the repository at this point in the history
* Refactor to prevent calling sys.exit from functions that could cause the calling script to exit unintentionally. Instead return None or False and only sys.exit a non-zero value from a script that is called explicitly. Refactored validate config logic to reduce cognitive complexity to satisfy SonarQube

* per #2596, ensure the same RUN_ID is set for a given METplus run even if instances are used in the PROCESS_LIST

* fixed linter complaints

* remove unused imports

* cleanup for SonarQube and linter

* remove unused imports

* import parent wrappers directly so that PyCharm can follow class hierarchy

* per #2612, add support for processing groups of forecast leads

* remove call to prune_empty function and ci-run-all-diff

* fixed check on DataFrame to properly check for an empty one

* remove prune_empty function that is no longer used

* update conda environment requirements for metdataio

* set label to Group<n> for LEAD_SEQ_<n> if LEAD_SEQ_<n>_LABEL is not set (instead of erroring), use forecast lead label for lead if set, log lead group label and formatted list of leads for each run

* increase timeout for request from 60 to 90

* handle exception and output an error message with a suggestion to rerun the failed GHA jobs when a hiccup with DockerHub prevents the list of available Docker data volumes from being obtained. FYI @jprestop

* cleanup to address SonarQube complaints, move call to exit out of main function so a script calling the function does not exit when the function errors

* cleanup to appease SonarQube

* per #2612, add support for creating groups of forecast leads by specifying a list of forecast leads and a time interval to create groups of leads

* per #2612, add unit tests for function that gets groups of forecast leads and added tests for new groups capabilities

* various refactoring to satisfy SonarQube complaints

* renamed config LEAD_SEQ_DIVISIONS to LEAD_SEQ_GROUP_SIZE and LEAD_SEQ_DIVISIONS_LABEL to LEAD_SEQ_GROUP_LABEL

* per #2612, add documentation for new config variables including examples

* fix empty check for DataFrame

* fix rst typo

* minor cleanup to satisfy linter

* Per #2612, if runtime freq is RUN_ONCE_PER_LEAD and groups of forecast leads are specified, then the wrapper will be run once per lead GROUP instead of once for each unique lead in the lead groups. Update unit test to reflect new behavior
  • Loading branch information
georgemccabe committed Sep 23, 2024
1 parent 81aed77 commit 3d752d1
Show file tree
Hide file tree
Showing 32 changed files with 624 additions and 475 deletions.
2 changes: 1 addition & 1 deletion .github/jobs/docker_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ def get_dockerhub_url(branch_name):
def docker_get_volumes_last_updated(current_branch):
import requests
dockerhub_url = get_dockerhub_url(current_branch)
dockerhub_request = requests.get(dockerhub_url, timeout=60)
dockerhub_request = requests.get(dockerhub_url, timeout=90)
if dockerhub_request.status_code != 200:
print(f"Could not find DockerHub URL: {dockerhub_url}")
return None
Expand Down
17 changes: 10 additions & 7 deletions .github/jobs/get_data_volumes.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,6 @@

import sys
import os
import subprocess
import shlex

from docker_utils import docker_get_volumes_last_updated, get_branch_name
from docker_utils import get_data_repo, DOCKERHUB_METPLUS_DATA_DEV
Expand Down Expand Up @@ -54,7 +52,12 @@ def main(args):
branch_name = branch_name[5:]

# get all docker data volumes associated with current branch
available_volumes = docker_get_volumes_last_updated(branch_name).keys()
try:
available_volumes = docker_get_volumes_last_updated(branch_name).keys()
except AttributeError:
print("ERROR: Could not get available Docker data volumes "
f"for {branch_name}. Try rerunning failed jobs in GitHub Actions")
return None

# loop through arguments and get data volume for each category
for model_app_name in args:
Expand All @@ -78,7 +81,7 @@ def main(args):
volume_name = f'{prefix}-{model_app_name}'

# add output- to model app name
model_app_name=f'output-{model_app_name}'
model_app_name = f'output-{model_app_name}'

# set DockerHub repo to dev version because all output data
# should be in dev repository
Expand Down Expand Up @@ -108,11 +111,11 @@ def main(args):

if __name__ == "__main__":
# split up command line args that have commas before passing into main
args = []
arg_list = []

for arg in sys.argv[1:]:
args.extend(arg.split(','))
out = main(args)
arg_list.extend(arg.split(','))
out = main(arg_list)
if out is None:
print("ERROR: Something went wrong")
sys.exit(1)
Expand Down
81 changes: 41 additions & 40 deletions .github/jobs/setup_and_run_use_cases.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ def main():
# use BuildKit to build image
os.environ['DOCKER_BUILDKIT'] = '1'

isOK = True
is_ok = True
failed_use_cases = []
for setup_commands, use_case_commands, requirements in all_commands:
# get environment image tag
Expand All @@ -79,45 +79,38 @@ def main():
f"-f {DOCKERFILE_DIR}/{dockerfile_name} ."
)

print(f'Building Docker environment/branch image...')
print('Building Docker environment/branch image...')
if not run_commands(docker_build_cmd):
isOK = False
is_ok = False
continue

commands = []

# print list of existing docker images
commands.append('docker images')

# print list of existing docker images,
# remove docker image after creating run env or prune untagged images
commands.append(f'docker image rm dtcenter/metplus-dev:{branch_name} -f')
commands.append('docker image prune -f')
run_commands(commands)

commands = []

# list docker images again after removal
commands.append('docker images')

# start interactive container in the background
commands.append(
f"docker run -d --rm -it -e GITHUB_WORKSPACE "
f"--name {RUN_TAG} "
f"{os.environ.get('NETWORK_ARG', '')} "
f"{' '.join(VOLUME_MOUNTS)} "
f"{volumes_from} --workdir {GITHUB_WORKSPACE} "
f'{RUN_TAG} bash'
)

# list running containers
commands.append('docker ps -a')

# execute setup commands in running docker container
commands.append(_format_docker_exec_command(setup_commands))
run_commands([
'docker images',
f'docker image rm dtcenter/metplus-dev:{branch_name} -f',
'docker image prune -f',
])

# list docker images again after removal,
# start interactive container in the background,
# list running containers,
# then execute setup commands in running docker container
commands = [
'docker images',
(f"docker run -d --rm -it -e GITHUB_WORKSPACE "
f"--name {RUN_TAG} "
f"{os.environ.get('NETWORK_ARG', '')} "
f"{' '.join(VOLUME_MOUNTS)} "
f"{volumes_from} --workdir {GITHUB_WORKSPACE} "
f'{RUN_TAG} bash'),
'docker ps -a',
_format_docker_exec_command(setup_commands),
]

# run docker commands and skip running cases if something went wrong
if not run_commands(commands):
isOK = False
is_ok = False

# force remove container if setup step fails
run_commands(f'docker rm -f {RUN_TAG}')
Expand All @@ -132,23 +125,28 @@ def main():
for use_case_command in use_case_commands:
if not run_commands(_format_docker_exec_command(use_case_command)):
failed_use_cases.append(use_case_command)
isOK = False
is_ok = False

# print bashrc file to see what was added by setup commands
# then force remove container to stop and remove it
if not run_commands([
_format_docker_exec_command('cat /root/.bashrc'),
f'docker rm -f {RUN_TAG}',
]):
isOK = False
is_ok = False

# print summary of use cases that failed
for failed_use_case in failed_use_cases:
print(f'ERROR: Use case failed: {failed_use_case}')
_print_failed_use_cases(failed_use_cases)

if not isOK:
if not is_ok:
print("ERROR: Some commands failed.")
sys.exit(1)

return is_ok


def _print_failed_use_cases(failed_use_cases):
for failed_use_case in failed_use_cases:
print(f'ERROR: Use case failed: {failed_use_case}')


def _format_docker_exec_command(command):
Expand Down Expand Up @@ -201,4 +199,7 @@ def _get_dockerfile_name(requirements):


if __name__ == '__main__':
main()
ret = main()
# exit non-zero if anything went wrong
if not ret:
sys.exit(1)
21 changes: 17 additions & 4 deletions docs/Users_Guide/glossary.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2244,12 +2244,25 @@ METplus Configuration Glossary
LEAD_SEQ_<n>
Specify the sequence of forecast lead times to include in the analysis. Comma separated list format, e.g.:0, 6, 12. <n> corresponds to the bin in which the user wishes to aggregate series by lead results.

| *Used by:* SeriesAnalysis
| *Used by:* All
LEAD_SEQ_<n>_LABEL
Required when SERIES_BY_LEAD_GROUP_FCSTS=True. Specify the label of the corresponding bin of series by lead results.
Specify the label for the :term:`LEAD_SEQ_\<n>` group of forecast leads.

| *Used by:* SeriesAnalysis
| *Used by:* All
LEAD_SEQ_GROUP_SIZE
Defines the size of forecast lead groups to create from :term:`LEAD_SEQ`.
See :ref:`grouping_forecast_leads` for more information.

| *Used by:* All
LEAD_SEQ_GROUP_LABEL
Defines the label to apply to each forecast lead group that is created
using :term:`LEAD_SEQ` and :term:`LEAD_SEQ_GROUP_SIZE`.
See :ref:`grouping_forecast_leads` for more information.

| *Used by:* All
LINE_TYPE
.. warning:: **DEPRECATED:** Please use :term:`LINE_TYPE_LIST` instead.
Expand Down Expand Up @@ -3545,7 +3558,7 @@ METplus Configuration Glossary
.. warning:: **DEPRECATED:** Please use :term:`LEAD_SEQ_\<n>` and :term:`SERIES_ANALYSIS_RUNTIME_FREQ` instead.

SERIES_BY_LEAD_GROUP_FCSTS
.. warning:: **DEPRECATED:** Please use :term:`SERIES_ANALYSIS_GROUP_FCSTS` instead.
.. warning:: **DEPRECATED:** Please use :term:`LEAD_SEQ_\<n>` and :term:`SERIES_ANALYSIS_RUNTIME_FREQ` instead.

SERIES_INIT_FILTERED_OUT_DIR
.. warning:: **DEPRECATED:** Please use :term:`SERIES_ANALYSIS_FILTERED_OUTPUT_DIR` instead.
Expand Down
63 changes: 58 additions & 5 deletions docs/Users_Guide/systemconfiguration.rst
Original file line number Diff line number Diff line change
Expand Up @@ -795,25 +795,78 @@ is equivalent to setting::
[config]
LEAD_SEQ = 0, 3, 6, 9, 12

.. _grouping_forecast_leads:

Grouping Forecast Leads
"""""""""""""""""""""""

Grouping forecast leads is possible as well using a special version of
the :term:`LEAD_SEQ` variable for the
**SeriesByLead Wrapper Only**.
If :term:`SERIES_BY_LEAD_GROUP_FCSTS` = True, then groups of
forecast leads can be defined to be evaluated together.
the :term:`LEAD_SEQ` variable.
If {APP_NAME}_RUNTIME_FREQ, e.g. SERIES_ANALYSIS_RUNTIME_FREQ, is set to
**RUN_ONCE_PER_INIT_OR_VALID**,
then groups of forecast leads can be defined to be evaluated together.
Any number of these groups can be defined by setting
configuration variables LEAD_SEQ_1, LEAD_SEQ_2, ..., :term:`LEAD_SEQ_\<n\>`.
The value can be defined with a
comma-separated list of integers (currently only hours are supported here)
or using :ref:`begin_end_incr`. Each :term:`LEAD_SEQ_\<n\>` can have a
corresponding label variable :term:`LEAD_SEQ_\<n\>_LABEL`; if it is not set,
the label defaults to Group<n>. For example::


[config]
LEAD_SEQ_1 = 0, 6, 12, 18
LEAD_SEQ_1_LABEL = Day1
LEAD_SEQ_2 = begin_end_incr(24,42,6)
LEAD_SEQ_2_LABEL = Day2

In this example, the label **Day1** will be used for 0, 6, 12, 18 and
the label **Day2** will be used for 24, 30, 36, 42.

Forecast leads can also be grouped by defining a single list of forecast leads
with :term:`LEAD_SEQ`, then specifying the size of each group using
:term:`LEAD_SEQ_GROUP_SIZE`. For example::

[config]
LEAD_SEQ = 0, 12, 24, 36
LEAD_SEQ_GROUP_SIZE = 1d

This configuration will create groups of forecast leads that each contain 1 day.
This is the equivalent of setting::

[config]
LEAD_SEQ_1 = 0, 12
LEAD_SEQ_2 = 24, 36

Each group will be labeled Group<INDEX> where <INDEX> is the group number.
In this example, the label **Group1** will be used for 0, 12 and
the label **Group2** will be used for 24, 36.
The label can be referenced in filename templates using {label}.

To change the text "Group" to something else, set :term:`LEAD_SEQ_GROUP_LABEL`.
Setting::

LEAD_SEQ_GROUP_LABEL = Day

will label the groups **Day1** and **Day2**.

:term:`LEAD_SEQ_<n>_LABEL` can also be used to change the label for a specific
group. From the previous example, setting::

LEAD_SEQ_2_LABEL = SecondDay

will label the groups **Day1** and **SecondDay**.

If the list of forecast leads contains a gap where there are no leads that fall
within a given group, that group will be skipped. For example::

[config]
LEAD_SEQ = 0, 12, 48, 60
LEAD_SEQ_GROUP_SIZE = 1d
LEAD_SEQ_GROUP_LABEL = Day

The label **Day1** will be used for 0, 12 and
the label **Day3** will be used for 48, 60.
Notice that a **Day2** label is not created.

:term:`INIT_SEQ`
""""""""""""""""

Expand Down
3 changes: 2 additions & 1 deletion internal/scripts/docker_env/scripts/metdataio_env.sh
Original file line number Diff line number Diff line change
Expand Up @@ -23,4 +23,5 @@ BASE_ENV=metplus_base.${METPLUS_VERSION}

mamba create -y --clone ${BASE_ENV} --name ${ENV_NAME}

mamba install -y --name ${ENV_NAME} -c conda-forge lxml==4.9.1 pymysql==1.0.2 pandas==1.5.1
#mamba install -y --name ${ENV_NAME} -c conda-forge lxml==4.9.1 pymysql==1.0.2 pandas==1.5.1
mamba install -y --name ${ENV_NAME} -c conda-forge pymysql==1.1.0 pyyaml==6.0 "xarray>=2023.1.0" lxml==4.9.1 netcdf4==1.6.2
Original file line number Diff line number Diff line change
Expand Up @@ -104,9 +104,9 @@ def test_is_var_item_valid_levels(metplus_config, item_list, configs_to_set, is_
@pytest.mark.parametrize(
'met_config_file, expected',
[
('GridStatConfig_good', (True, [])),
('GridStatConfig_bad', (False, [])),
('GridStatConfig_fake', (False, [])),
('GridStatConfig_good', True),
('GridStatConfig_bad', False),
('GridStatConfig_fake', False),
]
)
@pytest.mark.util
Expand Down
23 changes: 0 additions & 23 deletions internal/tests/pytests/util/system_util/test_system_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,29 +228,6 @@ def test_write_list_to_file(tmp_path_factory):
assert actual == '\n'.join(output_list) + '\n'


@pytest.mark.util
def test_prune_empty(tmp_path_factory):
    """prune_empty should remove directories whose contents are empty files
    and directories with no contents at all, while leaving non-empty files
    (and the directories that hold them) intact."""
    prune_dir = tmp_path_factory.mktemp('prune')

    # three scenarios: a dir holding only an empty file, a dir holding a
    # non-empty file, and a completely empty dir
    dir1 = prune_dir / 'empty_file_dir'
    dir2 = prune_dir / 'not_empty_file_dir'
    dir3 = prune_dir / 'empty_dir'
    for d in [dir1, dir2, dir3]:
        os.makedirs(d)

    # make two files, one empty one not.
    open(os.path.join(dir1, 'empty.txt'), 'a').close()
    file_with_content = os.path.join(dir2, 'not_empty.txt')
    with open(file_with_content, 'w') as f:
        f.write('Fee fi fo fum.')

    # second argument appears to be a logger-like object; a Mock stands in
    # here — TODO confirm against prune_empty's signature
    su.prune_empty(prune_dir, mock.Mock())

    # dir1 gone (its only file was empty), the non-empty file kept,
    # dir3 gone (it had no contents)
    assert not os.path.exists(dir1)
    assert os.path.exists(file_with_content)
    assert not os.path.exists(dir3)


@pytest.mark.parametrize(
'regex, expected', [
(r'\d', ['bigfoot/123.txt', 'yeti/234.txt']),
Expand Down
Loading

0 comments on commit 3d752d1

Please sign in to comment.