Skip to content

Commit

Permalink
Refactor parse_access_* functions into BaseBuilder class (#181)
Browse files Browse the repository at this point in the history
* Preparatory step - add kwargs to parse_access_filename

* Make parse_access_* be classmethods on BaseBuilder

* Update tests for latest changes; show pytest-cov output on terminal

(and curse VSCode for not saving things automatically like my last IDE)

* Refactor how the pattern 'helpers' are stored

* Completed reformat to parse_access_* being classmethods on builders, including tests

(some cleanup still required)

* Remove terminal pytest-cov output

* 'black' affected files

* Attempting to fix 'pre-commit' failure

* Fix mistake in returning patterns to kwargs

* Remove commented-out patterns

* Pre-commit test
  • Loading branch information
marc-white committed Aug 19, 2024
1 parent b927150 commit a07470a
Show file tree
Hide file tree
Showing 4 changed files with 811 additions and 715 deletions.
194 changes: 183 additions & 11 deletions src/access_nri_intake/source/builders.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,34 @@
import multiprocessing
import re
import traceback
from pathlib import Path

import xarray as xr
from ecgtools.builder import INVALID_ASSET, TRACEBACK, Builder

from ..utils import validate_against_schema
from . import ESM_JSONSCHEMA, PATH_COLUMN, VARIABLE_COLUMN
from .utils import parse_access_ncfile
from .utils import EmptyFileError, get_timeinfo

# Frequency translations
# Each key is a regular expression and each value is the frequency reported
# when that expression is found in a filename. BaseBuilder.parse_access_filename
# applies re.search to the keys in insertion order and stops at the first hit,
# so earlier entries take precedence. Each value is a tuple of an integer and
# a unit string.
FREQUENCIES = {
"daily": (1, "day"),
"_dai$": (1, "day"),
"month": (1, "mon"),
"_mon$": (1, "mon"),
"yearly": (1, "yr"),
"_ann$": (1, "yr"),
}

# ACCESS output file patterns
# Reusable regular-expression fragments that the builders' PATTERNS lists
# interpolate via f-strings.
#   not_multi_digit: a single character that does not start a run of two or
#                    more digits
#   om3_components:  one of the literal names cice, mom6 or ww3
#   ymds / ymd / ym / y: 4-digit, 2-digit, 2-digit (and, for ymds, 5-digit)
#                    numeric fields separated by "_", "," or "-"
#                    (presumably year, month, day and seconds -- TODO confirm
#                    the meaning of the 5-digit field)
PATTERNS_HELPERS = {
"not_multi_digit": "(?:\\d(?!\\d)|[^\\d](?=\\d)|[^\\d](?!\\d))",
"om3_components": "(?:cice|mom6|ww3)",
"ymds": "\\d{4}[_,-]\\d{2}[_,-]\\d{2}[_,-]\\d{5}",
"ymd": "\\d{4}[_,-]\\d{2}[_,-]\\d{2}",
"ym": "\\d{4}[_,-]\\d{2}",
"y": "\\d{4}",
}


class ParserError(Exception):
Expand All @@ -24,6 +46,9 @@ class BaseBuilder(Builder):
This builds on the ecgtools.Builder class.
"""

# Base class carries an empty list of patterns; subclasses override it
PATTERNS = []

def __init__(
self,
path,
Expand Down Expand Up @@ -183,10 +208,144 @@ def parser(file):
# This method should be overwritten
raise NotImplementedError

@classmethod
def parse_access_filename(
    cls, filename, patterns=None, frequencies=FREQUENCIES, redaction_fill: str = "X"
):
    """
    Parse an ACCESS model filename and return a file id and any time information

    Parameters
    ----------
    filename: str
        The filename to parse with the extension removed
    patterns: list of str, optional
        Regular expressions tried in order with ``re.match``. The first one that
        matches is used, and its first capture group is taken as the timestamp.
        Defaults to ``cls.PATTERNS``.
    frequencies: dict, optional
        Mapping of regular expression to frequency. The value belonging to the
        first key found in the filename (``re.search``) is returned as the
        frequency. Defaults to the module-level ``FREQUENCIES`` table.
    redaction_fill: str, optional
        Literal text substituted for every digit of the timestamp when building
        the file id. Defaults to "X".

    Returns
    -------
    file_id: str
        The file id constructed by redacting time information and replacing non-python characters
        with underscores
    timestamp: str or None
        A string of the redacted time information (e.g. "1990-01"), or None if no
        pattern matched
    frequency: object or None
        The entry of ``frequencies`` whose key was found in the filename (a tuple such
        as ``(1, "mon")`` with the default table), or None if no key was found
    """
    if patterns is None:
        patterns = cls.PATTERNS

    # Try to determine frequency; the first key found in the filename wins
    frequency = None
    for pattern, freq in frequencies.items():
        if re.search(pattern, filename):
            frequency = freq
            break

    # Parse file id; only the first matching pattern is applied
    file_id = filename
    timestamp = None
    for pattern in patterns:
        match = re.match(pattern, file_id)
        if match:
            timestamp = match.group(1)
            # A callable replacement keeps redaction_fill literal. A plain string
            # would be parsed as a replacement template, so a fill containing a
            # backslash would be misinterpreted or raise.
            redaction = re.sub(r"\d", lambda _digit: redaction_fill, timestamp)
            start, end = match.span(1)
            file_id = file_id[:start] + redaction + file_id[end:]
            break

    # Remove non-python characters from file ids: "-" and "." become "_", runs of
    # "_" are collapsed, and leading/trailing "_" are dropped
    file_id = re.sub(r"[-.]", "_", file_id)
    file_id = re.sub(r"_+", "_", file_id).strip("_")

    return file_id, timestamp, frequency

@classmethod
def parse_access_ncfile(cls, file, time_dim="time"):
    """
    Get Intake-ESM datastore entry info from an ACCESS netcdf file

    Parameters
    ----------
    file: str
        The path to the netcdf file
    time_dim: str
        The name of the time dimension

    Returns
    -------
    tuple
        ``(filename, file_id, filename_timestamp, frequency, start_date, end_date,
        variable_list, variable_long_name_list, variable_standard_name_list,
        variable_cell_methods_list, variable_units_list)``. The five lists are
        parallel and hold one entry per data variable that has a ``long_name``
        attribute; a missing ``standard_name``, ``cell_methods`` or ``units``
        attribute is recorded as an empty string.

    Raises
    ------
    EmptyFileError
        If no data variable in the file has a ``long_name`` attribute
    """
    nc_path = Path(file)

    # Time information encoded in the file name (extension stripped)
    file_id, stamp, freq_hint = cls.parse_access_filename(nc_path.stem)

    names = []
    long_names = []
    standard_names = []
    cell_methods = []
    units = []

    with xr.open_dataset(
        nc_path,
        chunks={},
        decode_cf=False,
        decode_times=False,
        decode_coords=False,
    ) as ds:
        for var in ds.data_vars:
            attrs = ds[var].attrs
            # Variables without a long_name are not catalogued
            if "long_name" not in attrs:
                continue
            names.append(var)
            long_names.append(attrs["long_name"])
            standard_names.append(attrs.get("standard_name", ""))
            cell_methods.append(attrs.get("cell_methods", ""))
            units.append(attrs.get("units", ""))

        # Must run while the dataset is still open
        start_date, end_date, frequency = get_timeinfo(ds, freq_hint, time_dim)

    if not names:
        raise EmptyFileError("This file contains no variables")

    return (
        nc_path.name,
        file_id,
        stamp,
        frequency,
        start_date,
        end_date,
        names,
        long_names,
        standard_names,
        cell_methods,
        units,
    )


class AccessOm2Builder(BaseBuilder):
"""Intake-ESM datastore builder for ACCESS-OM2 COSIMA datasets"""

# Filename patterns tried in order by parse_access_filename; capture group 1 of
# the first matching pattern is the timestamp that gets redacted in the file id
PATTERNS = [
rf"^iceh.*\.({PATTERNS_HELPERS['ymd']}|{PATTERNS_HELPERS['ym']})$", # ACCESS-ESM1.5/OM2/CM2 ice
rf"^iceh.*\.(\d{{3}})-{PATTERNS_HELPERS['not_multi_digit']}.*", # ACCESS-OM2 ice
rf"^ocean.*[_,-](?:ymd|ym|y)_({PATTERNS_HELPERS['ymd']}|{PATTERNS_HELPERS['ym']}|{PATTERNS_HELPERS['y']})(?:$|[_,-]{PATTERNS_HELPERS['not_multi_digit']}.*)", # ACCESS-OM2 ocean
r"^ocean.*[^\d]_(\d{2})$", # A few weird files in ACCESS-OM2 01deg_jra55v13_ryf9091
]

def __init__(self, path):
"""
Initialise a AccessOm2Builder
Expand Down Expand Up @@ -218,8 +377,8 @@ def __init__(self, path):

super().__init__(**kwargs)

@staticmethod
def parser(file):
@classmethod
def parser(cls, file):
try:
match_groups = re.match(r".*/output\d+/([^/]*)/.*\.nc", file).groups()
realm = match_groups[0]
Expand All @@ -239,7 +398,7 @@ def parser(file):
variable_standard_name_list,
variable_cell_methods_list,
variable_units_list,
) = parse_access_ncfile(file)
) = cls.parse_access_ncfile(file)

info = {
"path": str(file),
Expand All @@ -265,6 +424,10 @@ def parser(file):
class AccessOm3Builder(BaseBuilder):
"""Intake-ESM datastore builder for ACCESS-OM3 COSIMA datasets"""

PATTERNS = [
rf"[^\.]*\.{PATTERNS_HELPERS['om3_components']}\..*({PATTERNS_HELPERS['ymds']}|{PATTERNS_HELPERS['ymd']}|{PATTERNS_HELPERS['ym']})$", # ACCESS-OM3
]

def __init__(self, path):
"""
Initialise a AccessOm3Builder
Expand Down Expand Up @@ -302,8 +465,8 @@ def __init__(self, path):

super().__init__(**kwargs)

@staticmethod
def parser(file):
@classmethod
def parser(cls, file):
try:
(
filename,
Expand All @@ -317,7 +480,7 @@ def parser(file):
variable_standard_name_list,
variable_cell_methods_list,
variable_units_list,
) = parse_access_ncfile(file)
) = cls.parse_access_ncfile(file)

if "mom6" in filename:
realm = "ocean"
Expand Down Expand Up @@ -352,6 +515,11 @@ def parser(file):
class AccessEsm15Builder(BaseBuilder):
"""Intake-ESM datastore builder for ACCESS-ESM1.5 datasets"""

PATTERNS = [
rf"^iceh.*\.({PATTERNS_HELPERS['ymd']}|{PATTERNS_HELPERS['ym']})$", # ACCESS-ESM1.5/OM2/CM2 ice
r"^.*\.p.-(\d{6})_.*", # ACCESS-ESM1.5 atmosphere
]

def __init__(self, path, ensemble):
"""
Initialise a AccessEsm15Builder
Expand Down Expand Up @@ -394,8 +562,8 @@ def __init__(self, path, ensemble):

super().__init__(**kwargs)

@staticmethod
def parser(file):
@classmethod
def parser(cls, file):
try:
match_groups = re.match(r".*/([^/]*)/history/([^/]*)/.*\.nc", file).groups()
exp_id = match_groups[0]
Expand All @@ -416,7 +584,7 @@ def parser(file):
variable_standard_name_list,
variable_cell_methods_list,
variable_units_list,
) = parse_access_ncfile(file)
) = cls.parse_access_ncfile(file)

# Remove exp_id from file id so that members can be part of the same dataset
file_id = re.sub(exp_id, "", file_id).strip("_")
Expand Down Expand Up @@ -447,4 +615,8 @@ def parser(file):
class AccessCm2Builder(AccessEsm15Builder):
"""Intake-ESM datastore builder for ACCESS-CM2 datasets"""

pass
PATTERNS = [
rf"^iceh.*\.({PATTERNS_HELPERS['ymd']}|{PATTERNS_HELPERS['ym']})$", # ACCESS-ESM1.5/OM2/CM2 ice
rf"^iceh.*\.({PATTERNS_HELPERS['ym']})-{PATTERNS_HELPERS['not_multi_digit']}.*", # ACCESS-CM2 ice
r"^.*\.p.(\d{6})_.*", # ACCESS-CM2 atmosphere
]
Loading

0 comments on commit a07470a

Please sign in to comment.