Improve how filenames are parsed to generate file id #91

Merged
merged 3 commits on Jun 16, 2023
Changes from all commits
86 changes: 28 additions & 58 deletions src/access_nri_intake/source/builders.py
@@ -6,14 +6,12 @@
import multiprocessing
import re
import traceback
from pathlib import Path

import xarray as xr
from ecgtools.builder import INVALID_ASSET, TRACEBACK, Builder

from ..utils import validate_against_schema
from . import ESM_JSONSCHEMA, PATH_COLUMN, VARIABLE_COLUMN
from .utils import get_timeinfo, redact_time_stamps
from .utils import parse_access_ncfile


class ParserError(Exception):
@@ -233,33 +231,18 @@ def parser(file):
            if realm == "ice":
                realm = "seaIce"

            filename = Path(file).stem

            # Get file id from filename without any time stamps
            file_id = redact_time_stamps(filename)

            with xr.open_dataset(
                file,
                chunks={},
                decode_cf=False,
                decode_times=False,
                decode_coords=False,
            ) as ds:
                variable_list = []
                variable_long_name_list = []
                variable_standard_name_list = []
                variable_cell_methods_list = []
                for var in ds.data_vars:
                    attrs = ds[var].attrs
                    if "long_name" in attrs:
                        variable_list.append(var)
                        variable_long_name_list.append(attrs["long_name"])
                    if "standard_name" in attrs:
                        variable_standard_name_list.append(attrs["standard_name"])
                    if "cell_methods" in attrs:
                        variable_cell_methods_list.append(attrs["cell_methods"])

                start_date, end_date, frequency = get_timeinfo(ds)
            (
                filename,
                file_id,
                _,
                frequency,
                start_date,
                end_date,
                variable_list,
                variable_long_name_list,
                variable_standard_name_list,
                variable_cell_methods_list,
            ) = parse_access_ncfile(file)

            info = {
                "path": str(file),
@@ -336,34 +319,21 @@ def parser(file):
            realm_mapping = {"atm": "atmos", "ocn": "ocean", "ice": "seaIce"}
            realm = realm_mapping[realm]

            filename = Path(file).stem

            # Get file id from filename without any time stamps or exp_id
            file_id = re.sub(exp_id, "", filename)
            file_id = redact_time_stamps(file_id)

            with xr.open_dataset(
                file,
                chunks={},
                decode_cf=False,
                decode_times=False,
                decode_coords=False,
            ) as ds:
                variable_list = []
                variable_long_name_list = []
                variable_standard_name_list = []
                variable_cell_methods_list = []
                for var in ds.data_vars:
                    attrs = ds[var].attrs
                    if "long_name" in attrs:
                        variable_list.append(var)
                        variable_long_name_list.append(attrs["long_name"])
                    if "standard_name" in attrs:
                        variable_standard_name_list.append(attrs["standard_name"])
                    if "cell_methods" in attrs:
                        variable_cell_methods_list.append(attrs["cell_methods"])

                start_date, end_date, frequency = get_timeinfo(ds)
            (
                filename,
                file_id,
                _,
                frequency,
                start_date,
                end_date,
                variable_list,
                variable_long_name_list,
                variable_standard_name_list,
                variable_cell_methods_list,
            ) = parse_access_ncfile(file)

            # Remove exp_id from file id so that members can be part of the same dataset
            file_id = re.sub(exp_id, "", file_id).strip("_")

            info = {
                "path": str(file),
166 changes: 133 additions & 33 deletions src/access_nri_intake/source/utils.py
@@ -4,8 +4,15 @@
""" Shared utilities for writing Intake-ESM builders and their parsers """

import re
import warnings
from pathlib import Path

import cftime
import xarray as xr


class EmptyFileError(Exception):
    pass


@@ -22,12 +29,12 @@ def get_timeinfo(ds, time_dim="time"):
        The name of the time dimension
    """

    if time_dim is None:
        return None

    time_var = ds[time_dim]
    has_bounds = hasattr(time_var, "bounds") and time_var.bounds in ds.variables

    if len(time_var) == 0:
        raise EmptyFileError("This file has a valid unlimited dimension, but no data")

    def _todate(t):
        return cftime.num2date(t, time_var.units, calendar=time_var.calendar)

@@ -68,44 +75,137 @@ def _todate(t):
    )


def redact_time_stamps(string, fill="X"):
def parse_access_filename(filename):
    """
    Sequentially try to redact time stamps from a filename string, starting from the right hand side.
    Then replace any "-" and "." with "_". E.g. "bz687a.pm107912_mon.nc" is redacted to
    "bz687a.pmXXXXXX_mon.nc".
    Parse an ACCESS model filename and return a file id and any time information

    Parameters
    ----------
    string: str
        A filename with the suffix (e.g. .nc) removed
    fill: str, optional
        The string to replace the digits in the time stamp with
    filename: str
        The filename to parse with the extension removed

    Returns
    -------
    file_id: str
        The file id constructed by redacting time information and replacing characters
        that are not allowed in Python identifiers with underscores
    timestamp: str
        The time information extracted from the filename, if present (e.g. "1990-01")
    frequency: str
        The frequency of the file if available in the filename
    """

    # TODO: this function is a horrible hack

    # Patterns are removed in this order. Matching stops once a match is made
    patterns = [
        r"\d{4}[-_]\d{2}[-_]\d{2}",
        r"\d{4}[-_]\d{2}",
        r"\d{8}",
        r"\d{6}",
        r"\d{4}",
        r"\d{3}",
        r"\d{2}",
    ]

    # Strip first matched pattern
    stripped = string
    # ACCESS output file patterns
    patterns = {
        r"^iceh.*\.(\d{4}-\d{2}-\d{2})$",
        r"^iceh.*\.(\d{4}-\d{2})$",
        r"^iceh.*\.(\d{4}-\d{2})-.[^\d].*",
        r"^iceh.*\.(\d{3})-.[^\d].*",
        r"^ocean.*[^\d]_(\d{4}_\d{2}_\d{2})$",
        r"^ocean.*[^\d]_(\d{4}_\d{2})$",
        r"^ocean.*[^\d]_(\d{4})$",
        r"^ocean.*[^\d]_(\d{2})$",
        r"^.*\.p.(\d{6})_.*",
        r"^.*\.p.-(\d{6})_.*",
    }
    # Frequency translations
    frequencies = {
        "daily": "1day",
        "_dai$": "1day",
        "month": "1mon",
        "_mon$": "1mon",
        "yearly": "1yr",
        "_ann$": "1yr",
    }
    redaction_fill = "X"

    # Try to determine frequency
    frequency = None
    for pattern, freq in frequencies.items():
        if re.search(pattern, filename):
            frequency = freq
            break

    # Parse file id
    file_id = filename
    timestamp = None
    for pattern in patterns:
        match = re.match(rf"^.*({pattern}(?!.*{pattern})).*$", stripped)
        match = re.match(pattern, file_id)
        if match:
            replace = re.sub(r"\d", fill, match.group(1))
            stripped = stripped[: match.start(1)] + replace + stripped[match.end(1) :]
            timestamp = match.group(1)
            redaction = re.sub(r"\d", redaction_fill, timestamp)
            file_id = file_id[: match.start(1)] + redaction + file_id[match.end(1) :]
            break

    # Enforce Python characters
    stripped = re.sub(r"[-.]", "_", stripped)
    # Remove non-python characters from file ids
    file_id = re.sub(r"[-.]", "_", file_id)
    file_id = re.sub(r"_+", "_", file_id).strip("_")

    return file_id, timestamp, frequency
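
For reference, a minimal usage sketch of the new helper; the example filenames and expected return values are taken from the tests added in this PR:

from access_nri_intake.source.utils import parse_access_filename

# Returns (file_id, timestamp, frequency)
parse_access_filename("bz687a.pm107912_mon")  # ("bz687a_pmXXXXXX_mon", "107912", "1mon")
parse_access_filename("iceh.1985-08-31")      # ("iceh_XXXX_XX_XX", "1985-08-31", None)
parse_access_filename("ocean_daily")          # ("ocean_daily", None, "1day")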


def parse_access_ncfile(file):
    """
    Get Intake-ESM datastore entry info from an ACCESS netcdf file

    Parameters
    ----------
    file: str
        The path to the netcdf file

    Returns
    -------
    outputs: tuple
        A tuple of (filename, file_id, filename_timestamp, frequency, start_date, end_date,
        variable_list, variable_long_name_list, variable_standard_name_list,
        variable_cell_methods_list)
    """

    file = Path(file)
    filename = file.name

    file_id, filename_timestamp, filename_frequency = parse_access_filename(file.stem)

    with xr.open_dataset(
        file,
        chunks={},
        decode_cf=False,
        decode_times=False,
        decode_coords=False,
    ) as ds:
        variable_list = []
        variable_long_name_list = []
        variable_standard_name_list = []
        variable_cell_methods_list = []
        for var in ds.data_vars:
            attrs = ds[var].attrs
            if "long_name" in attrs:
                variable_list.append(var)
                variable_long_name_list.append(attrs["long_name"])
            if "standard_name" in attrs:
                variable_standard_name_list.append(attrs["standard_name"])
            if "cell_methods" in attrs:
                variable_cell_methods_list.append(attrs["cell_methods"])

        start_date, end_date, frequency = get_timeinfo(ds)

    if filename_frequency:
        if filename_frequency != frequency:
            msg = (
                f"The frequency '{filename_frequency}' determined from filename {filename} does not "
                f"match the frequency '{frequency}' determined from the file contents."
            )
            if frequency == "fx":
                frequency = filename_frequency
            warnings.warn(f"{msg} Using '{frequency}'.")

    outputs = (
        filename,
        file_id,
        filename_timestamp,
        frequency,
        start_date,
        end_date,
        variable_list,
        variable_long_name_list,
        variable_standard_name_list,
        variable_cell_methods_list,
    )

    # Remove any double or dangling _
    return re.sub(r"__", "_", stripped).strip("_")
    return outputs
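
For context, the two builder parsers above consume this helper by unpacking its ten-element return value. A minimal sketch of that call; the path below is hypothetical and must point at a readable ACCESS netcdf output:

from access_nri_intake.source.utils import parse_access_ncfile

(
    filename,
    file_id,
    filename_timestamp,
    frequency,
    start_date,
    end_date,
    variable_list,
    variable_long_name_list,
    variable_standard_name_list,
    variable_cell_methods_list,
) = parse_access_ncfile("/path/to/output000/ocean/ocean_month.nc")  # hypothetical path
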
71 changes: 71 additions & 0 deletions tests/test_builder_utils.py
@@ -0,0 +1,71 @@
# Copyright 2023 ACCESS-NRI and contributors. See the top-level COPYRIGHT file for details.
# SPDX-License-Identifier: Apache-2.0

import pytest

from access_nri_intake.source.utils import parse_access_filename


@pytest.mark.parametrize(
"filename, expected",
[
# Example ACCESS-CM2 filenames
("bz687a.pm107912_mon", ("bz687a_pmXXXXXX_mon", "107912", "1mon")),
("bz687a.p7107912_mon", ("bz687a_p7XXXXXX_mon", "107912", "1mon")),
("bz687a.p7107912_dai", ("bz687a_p7XXXXXX_dai", "107912", "1day")),
("iceh_m.2014-06", ("iceh_m_XXXX_XX", "2014-06", None)),
("iceh.1917-05-daily", ("iceh_XXXX_XX_daily", "1917-05", "1day")),
("ocean_bgc_ann", ("ocean_bgc_ann", None, "1yr")),
("ocean_daily", ("ocean_daily", None, "1day")),
# Example ACCESS-ESM1.5 filenames
(
"PI-GWL-B2035.pe-109904_dai",
("PI_GWL_B2035_pe_XXXXXX_dai", "109904", "1day"),
),
(
"PI-GWL-B2035.pa-109904_mon",
("PI_GWL_B2035_pa_XXXXXX_mon", "109904", "1mon"),
),
(
"PI-1pct-02.pe-011802_dai.nc_dai",
("PI_1pct_02_pe_XXXXXX_dai_nc_dai", "011802", "1day"),
),
("iceh.1917-05", ("iceh_XXXX_XX", "1917-05", None)),
# Example ACCESS-OM2 filenames
("iceh.057-daily", ("iceh_XXX_daily", "057", "1day")),
("ocean", ("ocean", None, None)),
("ocean_month", ("ocean_month", None, "1mon")),
("ocean_daily_3d_vhrho_nt_07", ("ocean_daily_3d_vhrho_nt_XX", "07", "1day")),
(
"oceanbgc-3d-caco3-1-yearly-mean-y_2015",
("oceanbgc_3d_caco3_1_yearly_mean_y_XXXX", "2015", "1yr"),
),
(
"oceanbgc-2d-wdet100-1-daily-mean-y_2015",
("oceanbgc_2d_wdet100_1_daily_mean_y_XXXX", "2015", "1day"),
),
(
"ocean-3d-v-1-monthly-pow02-ym_1958_04",
("ocean_3d_v_1_monthly_pow02_ym_XXXX_XX", "1958_04", "1mon"),
),
(
"ocean-2d-sfc_salt_flux_restore-1-monthly-mean-ym_1958_04",
(
"ocean_2d_sfc_salt_flux_restore_1_monthly_mean_ym_XXXX_XX",
"1958_04",
"1mon",
),
),
(
"oceanbgc-3d-phy-1-daily-mean-3-sigfig-5-daily-ymd_2020_12_01",
(
"oceanbgc_3d_phy_1_daily_mean_3_sigfig_5_daily_ymd_XXXX_XX_XX",
"2020_12_01",
"1day",
),
),
("iceh.1985-08-31", ("iceh_XXXX_XX_XX", "1985-08-31", None)),
],
)
def test_parse_access_filename(filename, expected):
assert parse_access_filename(filename) == expected