Skip to content

Commit

Permalink
Add new pattern for OM4; add ability to have multiple redacted 'timestamp' groups in filename regexp
Browse files Browse the repository at this point in the history
  • Loading branch information
marc-white committed Aug 27, 2024
1 parent 6796f42 commit 138e2a6
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 4 deletions.
15 changes: 11 additions & 4 deletions src/access_nri_intake/source/builders.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
"not_multi_digit": "(?:\\d(?!\\d)|[^\\d](?=\\d)|[^\\d](?!\\d))",
"om3_components": "(?:cice|mom6|ww3)",
"om4_components": "(?:ocean|ice)",
"om4_added_timestamp": "(\\d{4}_\\d{3})",
"ymds": "\\d{4}[_,-]\\d{2}[_,-]\\d{2}[_,-]\\d{5}",
"ymd": "\\d{4}[_,-]\\d{2}[_,-]\\d{2}",
"ymd-ns": "\\d{4}\\d{2}\\d{2}",
Expand Down Expand Up @@ -255,11 +256,12 @@ def parse_access_filename(
match = re.match(pattern, file_id)
if match:
# FIXME switch to using named group for timestamp
# Loop over all found groups and redact
timestamp = match.group(1)
redaction = re.sub(r"\d", redaction_fill, timestamp)
file_id = (
file_id[: match.start(1)] + redaction + file_id[match.end(1) :]
)
for grp in match.groups():
if grp is not None:
redaction = re.sub(r"\d", redaction_fill, grp)
file_id = re.sub(grp, redaction, file_id)
break

# Remove non-python characters from file ids
Expand Down Expand Up @@ -521,10 +523,15 @@ def parser(cls, file):
return {INVALID_ASSET: file, TRACEBACK: traceback.format_exc()}


# FIXME refactor to be called Mom6Builder (TBC)
class AccessOm4Builder(BaseBuilder):
"""Intake-ESM datastore builder for ACCESS-OM4 COSIMA datasets"""

# FIXME should be able to make one super-pattern, but couldn't
# make it work with the ? selector after om4_added_timestamp
# NOTE: Order here is important!
PATTERNS = [
rf"[^\.]*({PATTERNS_HELPERS['ymd-ns']})\.{PATTERNS_HELPERS['om4_components']}.*{PATTERNS_HELPERS['om4_added_timestamp']}.*$", # Panan naming
rf"[^\.]*({PATTERNS_HELPERS['ymd-ns']})\.{PATTERNS_HELPERS['om4_components']}.*$", # ACCESS-OM4
]

Expand Down
9 changes: 9 additions & 0 deletions tests/test_builders.py
Original file line number Diff line number Diff line change
Expand Up @@ -540,6 +540,15 @@ def test_builder_columns_with_iterables(test_data):
None,
),
),
(
builders.AccessOm4Builder,
"20000201.ocean_daily_2000_032",
(
"XXXXXXXX_ocean_daily_XXXX_XXX",
"20000201",
(1, "day"),
),
),
],
)
def test_parse_access_filename(builder, filename, expected):
Expand Down

0 comments on commit 138e2a6

Please sign in to comment.