From 12149ab704f36635debcfd385fa53c294d34f22f Mon Sep 17 00:00:00 2001 From: dougiesquire Date: Fri, 16 Jun 2023 10:57:57 +1000 Subject: [PATCH 1/3] add new filename parser --- src/access_nri_intake/source/utils.py | 83 ++++++++++++++++++--------- tests/test_builders.py | 64 +++++++++++++-------- 2 files changed, 96 insertions(+), 51 deletions(-) diff --git a/src/access_nri_intake/source/utils.py b/src/access_nri_intake/source/utils.py index 95840e8..c8bcd60 100644 --- a/src/access_nri_intake/source/utils.py +++ b/src/access_nri_intake/source/utils.py @@ -68,44 +68,71 @@ def _todate(t): ) -def redact_time_stamps(string, fill="X"): +def parse_access_filename(filename): """ - Sequentially try to redact time stamps from a filename string, starting from the right hand side. - Then replace any "-" and "." with "_". E.g. "bz687a.pm107912_mon.nc" is redacted to - bz687a.pmXXXXXX_mon.nc + Parse an ACCESS model filename and return a file id and any time information Parameters ---------- - string: str - A filename with the suffix (e.g. .nc) removed - fill: str, optional - The string to replace the digits in the time stamp with + filename: str + The filename to parse with the extension removed + + Returns + ------- + file_id: str + The file id constructed by redacting time information and replacing non-python characters + with underscores + timestamp: str + A string of the redacted time information (e.g. "1990-01") + frequency: str + The frequency of the file if available in the filename """ - # TODO: this function is a horrible hack - - # Patterns are removed in this order. Matching stops once a match is made - patterns = [ - r"\d{4}[-_]\d{2}[-_]\d{2}", - r"\d{4}[-_]\d{2}", - r"\d{8}", - r"\d{6}", - r"\d{4}", - r"\d{3}", - r"\d{2}", - ] - - # Strip first matched pattern - stripped = string + # ACCESS output file patterns + patterns = { + r"^iceh.*\.(\d{4}-\d{2}-\d{2})$", + r"^iceh.*\.(\d{4}-\d{2})$", + r"^iceh.*\.(\d{4}-\d{2})-.[^\d].*", + r"^iceh.*\.(\d{3})-.[^\d].*", + r"^ocean.*[^\d]_(\d{4}_\d{2}_\d{2})$", + r"^ocean.*[^\d]_(\d{4}_\d{2})$", + r"^ocean.*[^\d]_(\d{4})$", + r"^ocean.*[^\d]_(\d{2})$", + r"^.*\.p.(\d{6})_.*", + r"^.*\.p.-(\d{6})_.*", + } + # Frequency translations + frequencies = { + "daily": "1day", + "_dai$": "1day", + "month": "1mon", + "_mon$": "1mon", + "yearly": "1yr", + "_ann$": "1yr", + } + redaction_fill = "X" + + # Try to determine frequency + frequency = None + for pattern, freq in frequencies.items(): + if re.search(pattern, filename): + frequency = freq + break + + # Parse file id + file_id = filename + timestamp = None for pattern in patterns: - match = re.match(rf"^.*({pattern}(?!.*{pattern})).*$", stripped) + match = re.match(pattern, file_id) if match: - replace = re.sub(r"\d", fill, match.group(1)) - stripped = stripped[: match.start(1)] + replace + stripped[match.end(1) :] + timestamp = match.group(1) + redaction = re.sub(r"\d", redaction_fill, timestamp) + file_id = file_id[: match.start(1)] + redaction + file_id[match.end(1) :] break # Enforce Python characters - stripped = re.sub(r"[-.]", "_", stripped) + file_id = re.sub(r"[-.]", "_", file_id) + file_id = re.sub(r"__", "_", file_id).strip("_") # Remove any double or dangling _ - return re.sub(r"__", "_", stripped).strip("_") + return file_id, timestamp, frequency diff --git a/tests/test_builders.py b/tests/test_builders.py index c6fd24d..284522c 100644 --- a/tests/test_builders.py +++ b/tests/test_builders.py @@ -3,51 +3,69 @@ import pytest -from access_nri_intake.source.utils import redact_time_stamps +from access_nri_intake.source.utils import parse_access_filename @pytest.mark.parametrize( "filename, expected", [ # Example ACCESS-CM2 filenames - ("bz687a.pm107912_mon", "bz687a_pmXXXXXX_mon"), - ("bz687a.p7107912_mon", "bz687a_p7XXXXXX_mon"), - ("iceh_m.2014-06", "iceh_m_XXXX_XX"), - ("iceh.1917-05-daily", "iceh_XXXX_XX_daily"), - ("ocean_bgc_ann", "ocean_bgc_ann"), - ("ocean_daily", "ocean_daily"), + ("bz687a.pm107912_mon", ("bz687a_pmXXXXXX_mon", "107912", "1mon")), + ("bz687a.p7107912_mon", ("bz687a_p7XXXXXX_mon", "107912", "1mon")), + ("bz687a.p7107912_dai", ("bz687a_p7XXXXXX_dai", "107912", "1day")), + ("iceh_m.2014-06", ("iceh_m_XXXX_XX", "2014-06", None)), + ("iceh.1917-05-daily", ("iceh_XXXX_XX_daily", "1917-05", "1day")), + ("ocean_bgc_ann", ("ocean_bgc_ann", None, "1yr")), + ("ocean_daily", ("ocean_daily", None, "1day")), # Example ACCESS-ESM1.5 filenames - ("PI-GWL-B2035.pe-109904_dai", "PI_GWL_B2035_pe_XXXXXX_dai"), - ("PI-GWL-B2035.pa-109904_mon", "PI_GWL_B2035_pa_XXXXXX_mon"), - ("PI-1pct-02.pe-011802_dai.nc_dai", "PI_1pct_02_pe_XXXXXX_dai_nc_dai"), - ("iceh.1917-05", "iceh_XXXX_XX"), + ( + "PI-GWL-B2035.pe-109904_dai", + ("PI_GWL_B2035_pe_XXXXXX_dai", "109904", "1day"), + ), + ( + "PI-GWL-B2035.pa-109904_mon", + ("PI_GWL_B2035_pa_XXXXXX_mon", "109904", "1mon"), + ), + ( + "PI-1pct-02.pe-011802_dai.nc_dai", + ("PI_1pct_02_pe_XXXXXX_dai_nc_dai", "011802", "1day"), + ), + ("iceh.1917-05", ("iceh_XXXX_XX", "1917-05", None)), # Example ACCESS-OM2 filenames - ("iceh.057-daily", "iceh_XXX_daily"), - ("ocean", "ocean"), - ("ocean_month", "ocean_month"), - ("ocean_daily_3d_vhrho_nt_07", "ocean_daily_3d_vhrho_nt_XX"), + ("iceh.057-daily", ("iceh_XXX_daily", "057", "1day")), + ("ocean", ("ocean", None, None)), + ("ocean_month", ("ocean_month", None, "1mon")), + ("ocean_daily_3d_vhrho_nt_07", ("ocean_daily_3d_vhrho_nt_XX", "07", "1day")), ( "oceanbgc-3d-caco3-1-yearly-mean-y_2015", - "oceanbgc_3d_caco3_1_yearly_mean_y_XXXX", + ("oceanbgc_3d_caco3_1_yearly_mean_y_XXXX", "2015", "1yr"), ), ( "oceanbgc-2d-wdet100-1-daily-mean-y_2015", - "oceanbgc_2d_wdet100_1_daily_mean_y_XXXX", + ("oceanbgc_2d_wdet100_1_daily_mean_y_XXXX", "2015", "1day"), ), ( "ocean-3d-v-1-monthly-pow02-ym_1958_04", - "ocean_3d_v_1_monthly_pow02_ym_XXXX_XX", + ("ocean_3d_v_1_monthly_pow02_ym_XXXX_XX", "1958_04", "1mon"), ), ( "ocean-2d-sfc_salt_flux_restore-1-monthly-mean-ym_1958_04", - "ocean_2d_sfc_salt_flux_restore_1_monthly_mean_ym_XXXX_XX", + ( + "ocean_2d_sfc_salt_flux_restore_1_monthly_mean_ym_XXXX_XX", + "1958_04", + "1mon", + ), ), ( "oceanbgc-3d-phy-1-daily-mean-3-sigfig-5-daily-ymd_2020_12_01", - "oceanbgc_3d_phy_1_daily_mean_3_sigfig_5_daily_ymd_XXXX_XX_XX", + ( + "oceanbgc_3d_phy_1_daily_mean_3_sigfig_5_daily_ymd_XXXX_XX_XX", + "2020_12_01", + "1day", + ), ), - ("iceh.1985-08-31", "iceh_XXXX_XX_XX"), + ("iceh.1985-08-31", ("iceh_XXXX_XX_XX", "1985-08-31", None)), ], ) -def test_redact_time_stamps(filename, expected): - assert redact_time_stamps(filename) == expected +def test_parse_access_filename(filename, expected): + assert parse_access_filename(filename) == expected From c8568cf70359386c2cb1f1effa47422359d1fd61 Mon Sep 17 00:00:00 2001 From: dougiesquire Date: Fri, 16 Jun 2023 12:06:21 +1000 Subject: [PATCH 2/3] use new filename parser in Builders --- src/access_nri_intake/source/builders.py | 84 ++++++------------- src/access_nri_intake/source/utils.py | 83 ++++++++++++++++-- ...test_builders.py => test_builder_utils.py} | 0 3 files changed, 103 insertions(+), 64 deletions(-) rename tests/{test_builders.py => test_builder_utils.py} (100%) diff --git a/src/access_nri_intake/source/builders.py b/src/access_nri_intake/source/builders.py index 16dfb2a..8b81a4f 100644 --- a/src/access_nri_intake/source/builders.py +++ b/src/access_nri_intake/source/builders.py @@ -6,14 +6,12 @@ import multiprocessing import re import traceback -from pathlib import Path -import xarray as xr from ecgtools.builder import INVALID_ASSET, TRACEBACK, Builder from ..utils import validate_against_schema from . import ESM_JSONSCHEMA, PATH_COLUMN, VARIABLE_COLUMN -from .utils import get_timeinfo, redact_time_stamps +from .utils import parse_access_ncfile class ParserError(Exception): @@ -233,33 +231,17 @@ def parser(file): if realm == "ice": realm = "seaIce" - filename = Path(file).stem - - # Get file id from filename without any time stamps - file_id = redact_time_stamps(filename) - - with xr.open_dataset( - file, - chunks={}, - decode_cf=False, - decode_times=False, - decode_coords=False, - ) as ds: - variable_list = [] - variable_long_name_list = [] - variable_standard_name_list = [] - variable_cell_methods_list = [] - for var in ds.data_vars: - attrs = ds[var].attrs - if "long_name" in attrs: - variable_list.append(var) - variable_long_name_list.append(attrs["long_name"]) - if "standard_name" in attrs: - variable_standard_name_list.append(attrs["standard_name"]) - if "cell_methods" in attrs: - variable_cell_methods_list.append(attrs["cell_methods"]) - - start_date, end_date, frequency = get_timeinfo(ds) + ( + filename, + file_id, + frequency, + start_date, + end_date, + variable_list, + variable_long_name_list, + variable_standard_name_list, + variable_cell_methods_list, + ) = parse_access_ncfile(file) info = { "path": str(file), @@ -336,34 +318,20 @@ def parser(file): realm_mapping = {"atm": "atmos", "ocn": "ocean", "ice": "seaIce"} realm = realm_mapping[realm] - filename = Path(file).stem - - # Get file id from filename without any time stamps or exp_id - file_id = re.sub(exp_id, "", filename) - file_id = redact_time_stamps(file_id) - - with xr.open_dataset( - file, - chunks={}, - decode_cf=False, - decode_times=False, - decode_coords=False, - ) as ds: - variable_list = [] - variable_long_name_list = [] - variable_standard_name_list = [] - variable_cell_methods_list = [] - for var in ds.data_vars: - attrs = ds[var].attrs - if "long_name" in attrs: - variable_list.append(var) - variable_long_name_list.append(attrs["long_name"]) - if "standard_name" in attrs: - variable_standard_name_list.append(attrs["standard_name"]) - if "cell_methods" in attrs: - variable_cell_methods_list.append(attrs["cell_methods"]) - - start_date, end_date, frequency = get_timeinfo(ds) + ( + filename, + file_id, + frequency, + start_date, + end_date, + variable_list, + variable_long_name_list, + variable_standard_name_list, + variable_cell_methods_list, + ) = parse_access_ncfile(file) + + # Remove exp_id from file id so that members can be part of the same dataset + file_id = re.sub(exp_id, "", file_id).strip("_") info = { "path": str(file), diff --git a/src/access_nri_intake/source/utils.py b/src/access_nri_intake/source/utils.py index c8bcd60..9cdc798 100644 --- a/src/access_nri_intake/source/utils.py +++ b/src/access_nri_intake/source/utils.py @@ -4,8 +4,15 @@ """ Shared utilities for writing Intake-ESM builders and their parsers """ import re +import warnings +from pathlib import Path import cftime +import xarray as xr + + +class EmptyFileError(Exception): + pass def get_timeinfo(ds, time_dim="time"): @@ -22,12 +29,12 @@ def get_timeinfo(ds, time_dim="time"): The name of the time dimension """ - if time_dim is None: - return None - time_var = ds[time_dim] has_bounds = hasattr(time_var, "bounds") and time_var.bounds in ds.variables + if len(time_var) == 0: + raise EmptyFileError("This file has a valid unlimited dimension, but no data") + def _todate(t): return cftime.num2date(t, time_var.units, calendar=time_var.calendar) @@ -130,9 +137,73 @@ def parse_access_filename(filename): file_id = file_id[: match.start(1)] + redaction + file_id[match.end(1) :] break - # Enforce Python characters + # Remove non-python characters from file ids file_id = re.sub(r"[-.]", "_", file_id) - file_id = re.sub(r"__", "_", file_id).strip("_") + file_id = re.sub(r"_+", "_", file_id).strip("_") - # Remove any double or dangling _ return file_id, timestamp, frequency + + +def parse_access_ncfile(file): + """ + Get Intake-ESM datastore entry info from an ACCESS netcdf file + + Parameters + ---------- + file: str + The path to the netcdf file + + Returns + ------- + """ + + file = Path(file) + filename = file.name + + file_id, filename_timestamp, filename_frequency = parse_access_filename(file.stem) + + with xr.open_dataset( + file, + chunks={}, + decode_cf=False, + decode_times=False, + decode_coords=False, + ) as ds: + variable_list = [] + variable_long_name_list = [] + variable_standard_name_list = [] + variable_cell_methods_list = [] + for var in ds.data_vars: + attrs = ds[var].attrs + if "long_name" in attrs: + variable_list.append(var) + variable_long_name_list.append(attrs["long_name"]) + if "standard_name" in attrs: + variable_standard_name_list.append(attrs["standard_name"]) + if "cell_methods" in attrs: + variable_cell_methods_list.append(attrs["cell_methods"]) + + start_date, end_date, frequency = get_timeinfo(ds) + + if filename_frequency != frequency: + msg = ( + f"The frequency '{filename_frequency}' determined from filename {filename} does not " + f"match the frequency '{frequency}' determined from the file contents." + ) + if frequency == "fx": + frequency = filename_frequency + warnings.warn(f"{msg} Using '{frequency}'.") + + outputs = ( + filename, + file_id, + frequency, + start_date, + end_date, + variable_list, + variable_long_name_list, + variable_standard_name_list, + variable_cell_methods_list, + ) + + return outputs diff --git a/tests/test_builders.py b/tests/test_builder_utils.py similarity index 100% rename from tests/test_builders.py rename to tests/test_builder_utils.py From 26a0b6bac47db2316429e3e7de4493c9d03ffd2e Mon Sep 17 00:00:00 2001 From: dougiesquire Date: Fri, 16 Jun 2023 13:37:00 +1000 Subject: [PATCH 3/3] return timestamp from parse_access_ncfile --- src/access_nri_intake/source/builders.py | 2 ++ src/access_nri_intake/source/utils.py | 18 ++++++++++-------- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/src/access_nri_intake/source/builders.py b/src/access_nri_intake/source/builders.py index 8b81a4f..1d85fc3 100644 --- a/src/access_nri_intake/source/builders.py +++ b/src/access_nri_intake/source/builders.py @@ -234,6 +234,7 @@ def parser(file): ( filename, file_id, + _, frequency, start_date, end_date, @@ -321,6 +322,7 @@ def parser(file): ( filename, file_id, + _, frequency, start_date, end_date, diff --git a/src/access_nri_intake/source/utils.py b/src/access_nri_intake/source/utils.py index 9cdc798..93fd128 100644 --- a/src/access_nri_intake/source/utils.py +++ b/src/access_nri_intake/source/utils.py @@ -185,18 +185,20 @@ def parse_access_ncfile(file): start_date, end_date, frequency = get_timeinfo(ds) - if filename_frequency != frequency: - msg = ( - f"The frequency '{filename_frequency}' determined from filename {filename} does not " - f"match the frequency '{frequency}' determined from the file contents." - ) - if frequency == "fx": - frequency = filename_frequency - warnings.warn(f"{msg} Using '{frequency}'.") + if filename_frequency: + if filename_frequency != frequency: + msg = ( + f"The frequency '{filename_frequency}' determined from filename {filename} does not " + f"match the frequency '{frequency}' determined from the file contents." + ) + if frequency == "fx": + frequency = filename_frequency + warnings.warn(f"{msg} Using '{frequency}'.") outputs = ( filename, file_id, + filename_timestamp, frequency, start_date, end_date,