Improve how filenames are parsed to generate file id #91

Merged
merged 3 commits on Jun 16, 2023
Changes from all commits
86 changes: 28 additions & 58 deletions src/access_nri_intake/source/builders.py
@@ -6,14 +6,12 @@
import multiprocessing
import re
import traceback
from pathlib import Path

import xarray as xr
from ecgtools.builder import INVALID_ASSET, TRACEBACK, Builder

from ..utils import validate_against_schema
from . import ESM_JSONSCHEMA, PATH_COLUMN, VARIABLE_COLUMN
from .utils import get_timeinfo, redact_time_stamps
from .utils import parse_access_ncfile


class ParserError(Exception):
@@ -233,33 +231,18 @@ def parser(file):
            if realm == "ice":
                realm = "seaIce"

            filename = Path(file).stem

            # Get file id from filename without any time stamps
            file_id = redact_time_stamps(filename)

            with xr.open_dataset(
                file,
                chunks={},
                decode_cf=False,
                decode_times=False,
                decode_coords=False,
            ) as ds:
                variable_list = []
                variable_long_name_list = []
                variable_standard_name_list = []
                variable_cell_methods_list = []
                for var in ds.data_vars:
                    attrs = ds[var].attrs
                    if "long_name" in attrs:
                        variable_list.append(var)
                        variable_long_name_list.append(attrs["long_name"])
                    if "standard_name" in attrs:
                        variable_standard_name_list.append(attrs["standard_name"])
                    if "cell_methods" in attrs:
                        variable_cell_methods_list.append(attrs["cell_methods"])

                start_date, end_date, frequency = get_timeinfo(ds)
            (
                filename,
                file_id,
                _,
                frequency,
                start_date,
                end_date,
                variable_list,
                variable_long_name_list,
                variable_standard_name_list,
                variable_cell_methods_list,
            ) = parse_access_ncfile(file)

            info = {
                "path": str(file),
@@ -336,34 +319,21 @@ def parser(file):
            realm_mapping = {"atm": "atmos", "ocn": "ocean", "ice": "seaIce"}
            realm = realm_mapping[realm]

            filename = Path(file).stem

            # Get file id from filename without any time stamps or exp_id
            file_id = re.sub(exp_id, "", filename)
            file_id = redact_time_stamps(file_id)

            with xr.open_dataset(
                file,
                chunks={},
                decode_cf=False,
                decode_times=False,
                decode_coords=False,
            ) as ds:
                variable_list = []
                variable_long_name_list = []
                variable_standard_name_list = []
                variable_cell_methods_list = []
                for var in ds.data_vars:
                    attrs = ds[var].attrs
                    if "long_name" in attrs:
                        variable_list.append(var)
                        variable_long_name_list.append(attrs["long_name"])
                    if "standard_name" in attrs:
                        variable_standard_name_list.append(attrs["standard_name"])
                    if "cell_methods" in attrs:
                        variable_cell_methods_list.append(attrs["cell_methods"])

                start_date, end_date, frequency = get_timeinfo(ds)
            (
                filename,
                file_id,
                _,
                frequency,
                start_date,
                end_date,
                variable_list,
                variable_long_name_list,
                variable_standard_name_list,
                variable_cell_methods_list,
            ) = parse_access_ncfile(file)

            # Remove exp_id from file id so that members can be part of the same dataset
            file_id = re.sub(exp_id, "", file_id).strip("_")

            info = {
                "path": str(file),
166 changes: 133 additions & 33 deletions src/access_nri_intake/source/utils.py
@@ -4,8 +4,15 @@
""" Shared utilities for writing Intake-ESM builders and their parsers """

import re
import warnings
from pathlib import Path

import cftime
import xarray as xr


class EmptyFileError(Exception):
    pass


@@ -22,12 +29,12 @@ def get_timeinfo(ds, time_dim="time"):
        The name of the time dimension
    """

    if time_dim is None:
        return None

    time_var = ds[time_dim]
    has_bounds = hasattr(time_var, "bounds") and time_var.bounds in ds.variables

    if len(time_var) == 0:
        raise EmptyFileError("This file has a valid unlimited dimension, but no data")

    def _todate(t):
        return cftime.num2date(t, time_var.units, calendar=time_var.calendar)

@@ -68,44 +75,137 @@ def _todate(t):
    )


def redact_time_stamps(string, fill="X"):
def parse_access_filename(filename):
    """
    Sequentially try to redact time stamps from a filename string, starting from the right hand side.
    Then replace any "-" and "." with "_". E.g. "bz687a.pm107912_mon.nc" is redacted to
    "bz687a.pmXXXXXX_mon.nc".
    Parse an ACCESS model filename and return a file id and any time information

    Parameters
    ----------
    string: str
        A filename with the suffix (e.g. .nc) removed
    fill: str, optional
        The string to replace the digits in the time stamp with
    filename: str
        The filename to parse with the extension removed

    Returns
    -------
    file_id: str
        The file id constructed by redacting time information and replacing characters
        that are not allowed in Python identifiers with underscores
    timestamp: str
        The time information extracted from the filename, if present (e.g. "1990-01")
    frequency: str
        The frequency of the file if available in the filename
    """

    # TODO: this function is a horrible hack

    # Patterns are removed in this order. Matching stops once a match is made
    patterns = [
        r"\d{4}[-_]\d{2}[-_]\d{2}",
        r"\d{4}[-_]\d{2}",
        r"\d{8}",
        r"\d{6}",
        r"\d{4}",
        r"\d{3}",
        r"\d{2}",
    ]

    # Strip first matched pattern
    stripped = string
    # ACCESS output file patterns
    patterns = {
        r"^iceh.*\.(\d{4}-\d{2}-\d{2})$",
        r"^iceh.*\.(\d{4}-\d{2})$",
        r"^iceh.*\.(\d{4}-\d{2})-.[^\d].*",
        r"^iceh.*\.(\d{3})-.[^\d].*",
        r"^ocean.*[^\d]_(\d{4}_\d{2}_\d{2})$",
        r"^ocean.*[^\d]_(\d{4}_\d{2})$",
        r"^ocean.*[^\d]_(\d{4})$",
        r"^ocean.*[^\d]_(\d{2})$",
        r"^.*\.p.(\d{6})_.*",
        r"^.*\.p.-(\d{6})_.*",
    }
    # Frequency translations
    frequencies = {
        "daily": "1day",
        "_dai$": "1day",
        "month": "1mon",
        "_mon$": "1mon",
        "yearly": "1yr",
        "_ann$": "1yr",
    }
    redaction_fill = "X"

    # Try to determine frequency
    frequency = None
    for pattern, freq in frequencies.items():
        if re.search(pattern, filename):
            frequency = freq
            break

    # Parse file id
    file_id = filename
    timestamp = None
    for pattern in patterns:
        match = re.match(rf"^.*({pattern}(?!.*{pattern})).*$", stripped)
        match = re.match(pattern, file_id)
        if match:
            replace = re.sub(r"\d", fill, match.group(1))
            stripped = stripped[: match.start(1)] + replace + stripped[match.end(1) :]
            timestamp = match.group(1)
            redaction = re.sub(r"\d", redaction_fill, timestamp)
            file_id = file_id[: match.start(1)] + redaction + file_id[match.end(1) :]
            break

    # Enforce Python characters
    stripped = re.sub(r"[-.]", "_", stripped)
    # Remove non-python characters from file ids
    file_id = re.sub(r"[-.]", "_", file_id)
    file_id = re.sub(r"_+", "_", file_id).strip("_")

    return file_id, timestamp, frequency
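
For reference, a minimal usage sketch of the new helper; the example filenames and expected return values are taken from the tests added in this PR:

from access_nri_intake.source.utils import parse_access_filename

# Returns (file_id, timestamp, frequency)
parse_access_filename("bz687a.pm107912_mon")  # ("bz687a_pmXXXXXX_mon", "107912", "1mon")
parse_access_filename("iceh.1985-08-31")      # ("iceh_XXXX_XX_XX", "1985-08-31", None)
parse_access_filename("ocean_daily")          # ("ocean_daily", None, "1day")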


def parse_access_ncfile(file):
    """
    Get Intake-ESM datastore entry info from an ACCESS netcdf file

    Parameters
    ----------
    file: str
        The path to the netcdf file

    Returns
    -------
    outputs: tuple
        A tuple of (filename, file_id, filename_timestamp, frequency, start_date, end_date,
        variable_list, variable_long_name_list, variable_standard_name_list,
        variable_cell_methods_list)
    """

    file = Path(file)
    filename = file.name

    file_id, filename_timestamp, filename_frequency = parse_access_filename(file.stem)

    with xr.open_dataset(
        file,
        chunks={},
        decode_cf=False,
        decode_times=False,
        decode_coords=False,
    ) as ds:
        variable_list = []
        variable_long_name_list = []
        variable_standard_name_list = []
        variable_cell_methods_list = []
        for var in ds.data_vars:
            attrs = ds[var].attrs
            if "long_name" in attrs:
                variable_list.append(var)
                variable_long_name_list.append(attrs["long_name"])
            if "standard_name" in attrs:
                variable_standard_name_list.append(attrs["standard_name"])
            if "cell_methods" in attrs:
                variable_cell_methods_list.append(attrs["cell_methods"])

        start_date, end_date, frequency = get_timeinfo(ds)

    if filename_frequency:
        if filename_frequency != frequency:
            msg = (
                f"The frequency '{filename_frequency}' determined from filename {filename} does not "
                f"match the frequency '{frequency}' determined from the file contents."
            )
            if frequency == "fx":
                frequency = filename_frequency
            warnings.warn(f"{msg} Using '{frequency}'.")

    outputs = (
        filename,
        file_id,
        filename_timestamp,
        frequency,
        start_date,
        end_date,
        variable_list,
        variable_long_name_list,
        variable_standard_name_list,
        variable_cell_methods_list,
    )

    # Remove any double or dangling _
    return re.sub(r"__", "_", stripped).strip("_")
    return outputs
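
For context, the two builder parsers above consume this helper by unpacking its ten-element return value. A minimal sketch of that call; the path below is hypothetical and must point at a readable ACCESS netcdf output:

from access_nri_intake.source.utils import parse_access_ncfile

(
    filename,
    file_id,
    filename_timestamp,
    frequency,
    start_date,
    end_date,
    variable_list,
    variable_long_name_list,
    variable_standard_name_list,
    variable_cell_methods_list,
) = parse_access_ncfile("/path/to/output000/ocean/ocean_month.nc")  # hypothetical path
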
71 changes: 71 additions & 0 deletions tests/test_builder_utils.py
@@ -0,0 +1,71 @@
# Copyright 2023 ACCESS-NRI and contributors. See the top-level COPYRIGHT file for details.
# SPDX-License-Identifier: Apache-2.0

import pytest

from access_nri_intake.source.utils import parse_access_filename


@pytest.mark.parametrize(
"filename, expected",
[
# Example ACCESS-CM2 filenames
("bz687a.pm107912_mon", ("bz687a_pmXXXXXX_mon", "107912", "1mon")),
("bz687a.p7107912_mon", ("bz687a_p7XXXXXX_mon", "107912", "1mon")),
("bz687a.p7107912_dai", ("bz687a_p7XXXXXX_dai", "107912", "1day")),
("iceh_m.2014-06", ("iceh_m_XXXX_XX", "2014-06", None)),
("iceh.1917-05-daily", ("iceh_XXXX_XX_daily", "1917-05", "1day")),
("ocean_bgc_ann", ("ocean_bgc_ann", None, "1yr")),
("ocean_daily", ("ocean_daily", None, "1day")),
# Example ACCESS-ESM1.5 filenames
(
"PI-GWL-B2035.pe-109904_dai",
("PI_GWL_B2035_pe_XXXXXX_dai", "109904", "1day"),
),
(
"PI-GWL-B2035.pa-109904_mon",
("PI_GWL_B2035_pa_XXXXXX_mon", "109904", "1mon"),
),
(
"PI-1pct-02.pe-011802_dai.nc_dai",
("PI_1pct_02_pe_XXXXXX_dai_nc_dai", "011802", "1day"),
),
("iceh.1917-05", ("iceh_XXXX_XX", "1917-05", None)),
# Example ACCESS-OM2 filenames
("iceh.057-daily", ("iceh_XXX_daily", "057", "1day")),
("ocean", ("ocean", None, None)),
("ocean_month", ("ocean_month", None, "1mon")),
("ocean_daily_3d_vhrho_nt_07", ("ocean_daily_3d_vhrho_nt_XX", "07", "1day")),
(
"oceanbgc-3d-caco3-1-yearly-mean-y_2015",
("oceanbgc_3d_caco3_1_yearly_mean_y_XXXX", "2015", "1yr"),
),
(
"oceanbgc-2d-wdet100-1-daily-mean-y_2015",
("oceanbgc_2d_wdet100_1_daily_mean_y_XXXX", "2015", "1day"),
),
(
"ocean-3d-v-1-monthly-pow02-ym_1958_04",
("ocean_3d_v_1_monthly_pow02_ym_XXXX_XX", "1958_04", "1mon"),
),
(
"ocean-2d-sfc_salt_flux_restore-1-monthly-mean-ym_1958_04",
(
"ocean_2d_sfc_salt_flux_restore_1_monthly_mean_ym_XXXX_XX",
"1958_04",
"1mon",
),
),
(
"oceanbgc-3d-phy-1-daily-mean-3-sigfig-5-daily-ymd_2020_12_01",
(
"oceanbgc_3d_phy_1_daily_mean_3_sigfig_5_daily_ymd_XXXX_XX_XX",
"2020_12_01",
"1day",
),
),
("iceh.1985-08-31", ("iceh_XXXX_XX_XX", "1985-08-31", None)),
],
)
def test_parse_access_filename(filename, expected):
assert parse_access_filename(filename) == expected