From 2527cf4cbbed2603aa3e0cd2ab61c3c5d8b60e2c Mon Sep 17 00:00:00 2001 From: dougiesquire Date: Thu, 21 Sep 2023 09:45:34 +1000 Subject: [PATCH 1/3] guess start and end dates when bounds are not available --- src/access_nri_intake/source/utils.py | 115 +++++++++++++++++++------- 1 file changed, 85 insertions(+), 30 deletions(-) diff --git a/src/access_nri_intake/source/utils.py b/src/access_nri_intake/source/utils.py index 27ef0d93..bc3338f9 100644 --- a/src/access_nri_intake/source/utils.py +++ b/src/access_nri_intake/source/utils.py @@ -5,6 +5,7 @@ import re import warnings +from datetime import timedelta from pathlib import Path import cftime @@ -15,9 +16,53 @@ class EmptyFileError(Exception): pass -def get_timeinfo(ds, time_dim="time"): +def _add_month_start(time, n): + """Add months to cftime datetime and truncate to start""" + year = time.year + ((time.month + n - 1) // 12) + month = (time.month + n - 1) % 12 + 1 + return time.replace( + year=year, month=month, day=1, hour=0, minute=0, second=0, microsecond=0 + ) + + +def _add_year_start(time, n): + """Add years to cftime datetime and truncate to start""" + return time.replace(year=time.year + n, month=1, day=1, hour=0, minute=0, second=0) + + +def _guess_start_end_dates(ts, te, frequency): + """Guess the start and end bounded times for a given frequency""" + warnings.warn( + "Time coordinate does not include bounds information. Guessing " + "start and end times." + ) + num, unit = frequency + if unit == "yr": + step_back = -int(num / 2) + step_fwd = num + step_back + ts = _add_year_start(ts, step_back) + te = _add_year_start(te, step_fwd) + elif unit == "mon": + step_back = -int(num / 2) + step_fwd = num + step_back + ts = _add_month_start(ts, step_back) + te = _add_month_start(te, step_fwd) + elif unit == "day": + dt = timedelta(days=num) / 2 + ts = ts - dt + te = te + dt + elif unit == "hr": + dt = timedelta(hours=num) / 2 + ts = ts - dt + te = te + dt + else: + warnings.warn("Cannot infer start and end times for subhourly frequencies.") + return ts, te + + +def get_timeinfo(ds, filename_frequency, time_dim): """ - Get start time, end time and frequency of a xarray dataset. Stolen and slightly adapted + Get start date, end date and frequency of a xarray dataset. Stolen and slightly adapted from cosima cookbook, see https://github.com/COSIMA/cosima-cookbook/blob/master/cosima_cookbook/database.py#L565 @@ -33,8 +78,8 @@ def _todate(t): return cftime.num2date(t, time_var.units, calendar=time_var.calendar) time_format = "%Y-%m-%d, %H:%M:%S" - start_time = "none" - end_time = "none" + start_date = "none" + end_date = "none" frequency = "fx" if time_dim in ds: @@ -64,22 +109,40 @@ def _todate(t): # TODO: This is not a very good way to get the frequency if dt.days >= 365: years = round(dt.days / 365) - frequency = f"{years}yr" + frequency = (years, "yr") elif dt.days >= 28: months = round(dt.days / 30) - frequency = f"{months}mon" + frequency = (months, "mon") elif dt.days >= 1: - frequency = f"{dt.days}day" + frequency = (dt.days, "day") elif dt.seconds >= 3600: hours = round(dt.seconds / 3600) - frequency = f"{hours}hr" + frequency = (hours, "hr") else: - frequency = "subhr" + frequency = (None, "subhr") - start_time = ts.strftime(time_format) - end_time = te.strftime(time_format) + if filename_frequency: + if filename_frequency != frequency: + msg = ( + f"The frequency '{filename_frequency}' determined from filename does not " + f"match the frequency '{frequency}' determined from the file contents." + ) + if frequency == "fx": + frequency = filename_frequency + warnings.warn(f"{msg} Using '{frequency}'.") - return (start_time, end_time, frequency) + if frequency != "fx": + if not has_bounds: + ts, te = _guess_start_end_dates(ts, te, frequency) + start_date = ts.strftime(time_format) + end_date = te.strftime(time_format) + + if frequency[0]: + frequency = f"{str(frequency[0])}{frequency[1]}" + else: + frequency = frequency[1] + + return start_date, end_date, frequency def parse_access_filename(filename): @@ -117,12 +180,12 @@ def parse_access_filename(filename): } # Frequency translations frequencies = { - "daily": "1day", - "_dai$": "1day", - "month": "1mon", - "_mon$": "1mon", - "yearly": "1yr", - "_ann$": "1yr", + "daily": (1, "day"), + "_dai$": (1, "day"), + "month": (1, "mon"), + "_mon$": (1, "mon"), + "yearly": (1, "yr"), + "_ann$": (1, "yr"), } redaction_fill = "X" @@ -151,7 +214,7 @@ def parse_access_filename(filename): return file_id, timestamp, frequency -def parse_access_ncfile(file): +def parse_access_ncfile(file, time_dim="time"): """ Get Intake-ESM datastore entry info from an ACCESS netcdf file @@ -159,6 +222,8 @@ def parse_access_ncfile(file): ---------- file: str The path to the netcdf file + time_dim: str + The name of the time dimension Returns ------- @@ -190,21 +255,11 @@ def parse_access_ncfile(file): if "cell_methods" in attrs: variable_cell_methods_list.append(attrs["cell_methods"]) - start_date, end_date, frequency = get_timeinfo(ds) + start_date, end_date, frequency = get_timeinfo(ds, filename_frequency, time_dim) if not variable_list: raise EmptyFileError("This file contains no variables") - if filename_frequency: - if filename_frequency != frequency: - msg = ( - f"The frequency '{filename_frequency}' determined from filename {filename} does not " - f"match the frequency '{frequency}' determined from the file contents." - ) - if frequency == "fx": - frequency = filename_frequency - warnings.warn(f"{msg} Using '{frequency}'.") - outputs = ( filename, file_id, From cc626711cf0809bab0bdbc32907a2e68a36996d4 Mon Sep 17 00:00:00 2001 From: dougiesquire Date: Thu, 21 Sep 2023 15:49:54 +1000 Subject: [PATCH 2/3] add/update tests --- src/access_nri_intake/source/utils.py | 4 +- tests/test_source_utils.py | 172 ++++++++++++++++++++------ 2 files changed, 139 insertions(+), 37 deletions(-) diff --git a/src/access_nri_intake/source/utils.py b/src/access_nri_intake/source/utils.py index bc3338f9..f853b442 100644 --- a/src/access_nri_intake/source/utils.py +++ b/src/access_nri_intake/source/utils.py @@ -62,8 +62,8 @@ def _guess_start_end_dates(ts, te, frequency): def get_timeinfo(ds, filename_frequency, time_dim): """ - Get start date, end date and frequency of a xarray dataset. Stolen and slightly adapted - from cosima cookbook, see + Get start date, end date and frequency of a xarray dataset. Stolen and adapted from the + cosima cookbook, see https://github.com/COSIMA/cosima-cookbook/blob/master/cosima_cookbook/database.py#L565 Parameters diff --git a/tests/test_source_utils.py b/tests/test_source_utils.py index 718aa732..65b69519 100644 --- a/tests/test_source_utils.py +++ b/tests/test_source_utils.py @@ -17,50 +17,53 @@ "filename, expected", [ # Example ACCESS-CM2 filenames - ("bz687a.pm107912_mon", ("bz687a_pmXXXXXX_mon", "107912", "1mon")), - ("bz687a.p7107912_mon", ("bz687a_p7XXXXXX_mon", "107912", "1mon")), - ("bz687a.p7107912_dai", ("bz687a_p7XXXXXX_dai", "107912", "1day")), + ("bz687a.pm107912_mon", ("bz687a_pmXXXXXX_mon", "107912", (1, "mon"))), + ("bz687a.p7107912_mon", ("bz687a_p7XXXXXX_mon", "107912", (1, "mon"))), + ("bz687a.p7107912_dai", ("bz687a_p7XXXXXX_dai", "107912", (1, "day"))), ("iceh_m.2014-06", ("iceh_m_XXXX_XX", "2014-06", None)), - ("iceh.1917-05-daily", ("iceh_XXXX_XX_daily", "1917-05", "1day")), - ("ocean_bgc_ann", ("ocean_bgc_ann", None, "1yr")), - ("ocean_daily", ("ocean_daily", None, "1day")), + ("iceh.1917-05-daily", ("iceh_XXXX_XX_daily", "1917-05", (1, "day"))), + ("ocean_bgc_ann", ("ocean_bgc_ann", None, (1, "yr"))), + ("ocean_daily", ("ocean_daily", None, (1, "day"))), # Example ACCESS-ESM1.5 filenames ( "PI-GWL-B2035.pe-109904_dai", - ("PI_GWL_B2035_pe_XXXXXX_dai", "109904", "1day"), + ("PI_GWL_B2035_pe_XXXXXX_dai", "109904", (1, "day")), ), ( "PI-GWL-B2035.pa-109904_mon", - ("PI_GWL_B2035_pa_XXXXXX_mon", "109904", "1mon"), + ("PI_GWL_B2035_pa_XXXXXX_mon", "109904", (1, "mon")), ), ( "PI-1pct-02.pe-011802_dai.nc_dai", - ("PI_1pct_02_pe_XXXXXX_dai_nc_dai", "011802", "1day"), + ("PI_1pct_02_pe_XXXXXX_dai_nc_dai", "011802", (1, "day")), ), ("iceh.1917-05", ("iceh_XXXX_XX", "1917-05", None)), # Example ACCESS-OM2 filenames - ("iceh.057-daily", ("iceh_XXX_daily", "057", "1day")), + ("iceh.057-daily", ("iceh_XXX_daily", "057", (1, "day"))), ("ocean", ("ocean", None, None)), - ("ocean_month", ("ocean_month", None, "1mon")), - ("ocean_daily_3d_vhrho_nt_07", ("ocean_daily_3d_vhrho_nt_XX", "07", "1day")), + ("ocean_month", ("ocean_month", None, (1, "mon"))), + ( + "ocean_daily_3d_vhrho_nt_07", + ("ocean_daily_3d_vhrho_nt_XX", "07", (1, "day")), + ), ( "oceanbgc-3d-caco3-1-yearly-mean-y_2015", - ("oceanbgc_3d_caco3_1_yearly_mean_y_XXXX", "2015", "1yr"), + ("oceanbgc_3d_caco3_1_yearly_mean_y_XXXX", "2015", (1, "yr")), ), ( "oceanbgc-2d-wdet100-1-daily-mean-y_2015", - ("oceanbgc_2d_wdet100_1_daily_mean_y_XXXX", "2015", "1day"), + ("oceanbgc_2d_wdet100_1_daily_mean_y_XXXX", "2015", (1, "day")), ), ( "ocean-3d-v-1-monthly-pow02-ym_1958_04", - ("ocean_3d_v_1_monthly_pow02_ym_XXXX_XX", "1958_04", "1mon"), + ("ocean_3d_v_1_monthly_pow02_ym_XXXX_XX", "1958_04", (1, "mon")), ), ( "ocean-2d-sfc_salt_flux_restore-1-monthly-mean-ym_1958_04", ( "ocean_2d_sfc_salt_flux_restore_1_monthly_mean_ym_XXXX_XX", "1958_04", - "1mon", + (1, "mon"), ), ), ( @@ -68,7 +71,7 @@ ( "oceanbgc_3d_phy_1_daily_mean_3_sigfig_5_daily_ymd_XXXX_XX_XX", "2020_12_01", - "1day", + (1, "day"), ), ), ("iceh.1985-08-31", ("iceh_XXXX_XX_XX", "1985-08-31", None)), @@ -136,8 +139,8 @@ def test_parse_access_filename(filename, expected): "ocean_month_inst_nobounds", None, "1mon", - "1900-01-16, 12:00:00", - "1900-01-16, 12:00:00", + "1900-01-01, 00:00:00", + "1900-02-01, 00:00:00", ["mld"], ["mixed layer depth determined by density criteria"], ["ocean_mixed_layer_thickness_defined_by_sigma_t"], @@ -306,38 +309,137 @@ def test_parse_access_ncfile(test_data, filename, expected): @pytest.mark.parametrize( - "start_end, expected", + "times, bounds, ffreq, expected", [ - ([0.0, 0.00625], ("1900-01-01, 00:00:00", "1900-01-01, 00:09:00", "subhr")), - ([0.0, 0.125], ("1900-01-01, 00:00:00", "1900-01-01, 03:00:00", "3hr")), - ([0.0, 0.25], ("1900-01-01, 00:00:00", "1900-01-01, 06:00:00", "6hr")), - ([0.0, 1.0], ("1900-01-01, 00:00:00", "1900-01-02, 00:00:00", "1day")), - ([0.0, 31.0], ("1900-01-01, 00:00:00", "1900-02-01, 00:00:00", "1mon")), - ([0.0, 90.0], ("1900-01-01, 00:00:00", "1900-04-01, 00:00:00", "3mon")), - ([0.0, 365.0], ("1900-01-01, 00:00:00", "1901-01-01, 00:00:00", "1yr")), - ([0.0, 730.0], ("1900-01-01, 00:00:00", "1902-01-01, 00:00:00", "2yr")), + ( + [365 / 2], + False, + (1, "yr"), + ("1900-01-01, 00:00:00", "1901-01-01, 00:00:00", "1yr"), + ), + ( + [31 / 2], + False, + (1, "mon"), + ("1900-01-01, 00:00:00", "1900-02-01, 00:00:00", "1mon"), + ), + ( + [1.5 / 24], + False, + (3, "hr"), + ("1900-01-01, 00:00:00", "1900-01-01, 03:00:00", "3hr"), + ), + ( + [0.0, 9 / 60 / 24], + True, + None, + ("1900-01-01, 00:00:00", "1900-01-01, 00:09:00", "subhr"), + ), + ( + [0.0, 3 / 24], + True, + None, + ("1900-01-01, 00:00:00", "1900-01-01, 03:00:00", "3hr"), + ), + ( + [0.0, 6 / 24], + True, + None, + ("1900-01-01, 00:00:00", "1900-01-01, 06:00:00", "6hr"), + ), + ( + [0.0, 1.0], + True, + None, + ("1900-01-01, 00:00:00", "1900-01-02, 00:00:00", "1day"), + ), + ( + [0.0, 31.0], + True, + None, + ("1900-01-01, 00:00:00", "1900-02-01, 00:00:00", "1mon"), + ), + ( + [0.0, 90.0], + True, + None, + ("1900-01-01, 00:00:00", "1900-04-01, 00:00:00", "3mon"), + ), + ( + [0.0, 365.0], + True, + None, + ("1900-01-01, 00:00:00", "1901-01-01, 00:00:00", "1yr"), + ), + ( + [0.0, 730.0], + True, + None, + ("1900-01-01, 00:00:00", "1902-01-01, 00:00:00", "2yr"), + ), + ( + [1.5 / 24, 4.5 / 24], + False, + None, + ("1900-01-01, 00:00:00", "1900-01-01, 06:00:00", "3hr"), + ), + ( + [3 / 24, 9 / 24], + False, + None, + ("1900-01-01, 00:00:00", "1900-01-01, 12:00:00", "6hr"), + ), + ( + [0.5, 1.5], + False, + None, + ("1900-01-01, 00:00:00", "1900-01-03, 00:00:00", "1day"), + ), + ( + [31 / 2, 45], + False, + None, + ("1900-01-01, 00:00:00", "1900-03-01, 00:00:00", "1mon"), + ), + ( + [45, 135.5], + False, + None, + ("1900-01-01, 00:00:00", "1900-07-01, 00:00:00", "3mon"), + ), + ( + [365 / 2, 365 + 365 / 2], + False, + None, + ("1900-01-01, 00:00:00", "1902-01-01, 00:00:00", "1yr"), + ), + ( + [365, 3 * 365], + False, + None, + ("1900-01-01, 00:00:00", "1904-01-01, 00:00:00", "2yr"), + ), ], ) -@pytest.mark.parametrize("bounds", [True, False]) -def test_get_timeinfo(start_end, expected, bounds): +def test_get_timeinfo(times, bounds, ffreq, expected): if bounds: - time = (start_end[0] + start_end[1]) / 2 + time = (times[0] + times[1]) / 2 ds = xr.Dataset( data_vars={ "dummy": ("time", [0]), - "time_bounds": (("time", "nv"), [start_end]), + "time_bounds": (("time", "nv"), [(times[0], times[1])]), }, coords={"time": [time]}, ) ds["time"].attrs = dict(bounds="time_bounds") else: ds = xr.Dataset( - data_vars={"dummy": ("time", [0, 0])}, - coords={"time": start_end}, + data_vars={"dummy": ("time", [0] * len(times))}, + coords={"time": times}, ) ds["time"].attrs |= dict( units="days since 1900-01-01 00:00:00", calendar="GREGORIAN" ) - assert get_timeinfo(ds) == expected + assert get_timeinfo(ds, filename_frequency=ffreq, time_dim="time") == expected From 4024c97ba6ea7ee16bd752f3a5a754802c43f238 Mon Sep 17 00:00:00 2001 From: dougiesquire Date: Thu, 28 Sep 2023 13:14:20 +1000 Subject: [PATCH 3/3] address review comments --- src/access_nri_intake/source/utils.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/access_nri_intake/source/utils.py b/src/access_nri_intake/source/utils.py index f853b442..8d35ed48 100644 --- a/src/access_nri_intake/source/utils.py +++ b/src/access_nri_intake/source/utils.py @@ -27,7 +27,9 @@ def _add_month_start(time, n): def _add_year_start(time, n): """Add years to cftime datetime and truncate to start""" - return time.replace(year=time.year + n, month=1, day=1, hour=0, minute=0, second=0) + return time.replace( + year=time.year + n, month=1, day=1, hour=0, minute=0, second=0, microsecond=0 + ) def _guess_start_end_dates(ts, te, frequency): @@ -81,8 +83,9 @@ def _todate(t): start_date = "none" end_date = "none" frequency = "fx" + has_time = time_dim in ds - if time_dim in ds: + if has_time: time_var = ds[time_dim] if len(time_var) == 0: @@ -131,7 +134,7 @@ def _todate(t): frequency = filename_frequency warnings.warn(f"{msg} Using '{frequency}'.") - if frequency != "fx": + if has_time & (frequency != "fx"): if not has_bounds: ts, te = _guess_start_end_dates(ts, te, frequency) start_date = ts.strftime(time_format)