diff --git a/pyproject.toml b/pyproject.toml index cbd4a54..1a6c665 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -44,7 +44,11 @@ requires = [ where = ["src"] [tool.setuptools.package-data] -access_nri_intake = ["data/catalog.yaml"] +access_nri_intake = [ + "data/catalog.yaml", + "data/metadata_schema_experiment.json", + "data/metadata_schema_file.json", +] [tool.versioneer] VCS = "git" diff --git a/src/access_nri_intake/catalog/__init__.py b/src/access_nri_intake/catalog/__init__.py index b87c43c..386c7ef 100644 --- a/src/access_nri_intake/catalog/__init__.py +++ b/src/access_nri_intake/catalog/__init__.py @@ -18,11 +18,9 @@ NAME_COLUMN = "name" TRANSLATOR_GROUPBY_COLUMNS = ["model", "realm", "frequency"] -SCHEMA_URL = "https://raw.githubusercontent.com/ACCESS-NRI/schema/e9055da95093ec2faa555c090fc5af17923d1566/au.org.access-nri/model/output/experiment-metadata/1-0-2.json" -SCHEMA_HASH = "ecb72c1adde3679896ceeca96aa6500d07ea2e05810155ec7a5dc301593c1dc7" EXP_JSONSCHEMA, CATALOG_JSONSCHEMA = get_jsonschema( - url=SCHEMA_URL, known_hash=SCHEMA_HASH, required=CORE_COLUMNS + metadata_file="data/metadata_schema_experiment.json", required=CORE_COLUMNS ) COLUMNS_WITH_ITERABLES = [ diff --git a/src/access_nri_intake/cli.py b/src/access_nri_intake/cli.py index ed53c0a..5253ec5 100644 --- a/src/access_nri_intake/cli.py +++ b/src/access_nri_intake/cli.py @@ -16,7 +16,7 @@ from .catalog import EXP_JSONSCHEMA, translators from .catalog.manager import CatalogManager from .source import builders -from .utils import _can_be_array, load_metadata_yaml +from .utils import _can_be_array, get_catalog_fp, load_metadata_yaml class MetadataCheckError(Exception): @@ -214,9 +214,8 @@ def _get_project(path): # Save the catalog cm.save() - _here = os.path.abspath(os.path.dirname(__file__)) if update: - with open(os.path.join(_here, "data", "catalog.yaml"), "w") as fobj: + with get_catalog_fp().open(mode="w") as fobj: yaml.dump(yaml_dict, fobj) diff --git 
a/src/access_nri_intake/data/__init__.py b/src/access_nri_intake/data/__init__.py index bede7b7..2e08346 100644 --- a/src/access_nri_intake/data/__init__.py +++ b/src/access_nri_intake/data/__init__.py @@ -1,9 +1,8 @@ # Copyright 2023 ACCESS-NRI and contributors. See the top-level COPYRIGHT file for details. # SPDX-License-Identifier: Apache-2.0 -import os - import intake -_here = os.path.abspath(os.path.dirname(__file__)) -data = intake.open_catalog(os.path.join(_here, "catalog.yaml")).access_nri +from access_nri_intake.utils import get_catalog_fp + +data = intake.open_catalog(get_catalog_fp()).access_nri diff --git a/src/access_nri_intake/data/metadata_schema_experiment.json b/src/access_nri_intake/data/metadata_schema_experiment.json new file mode 100644 index 0000000..2d0d02b --- /dev/null +++ b/src/access_nri_intake/data/metadata_schema_experiment.json @@ -0,0 +1,169 @@ +{ + "$id": "https://raw.githubusercontent.com/ACCESS-NRI/schema/main/au.org.access-nri/model/output/experiment-metadata/1-0-2.json", + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "Experiment metadata", + "description": "The metadata associated with a model experiment", + "type": "object", + "properties": { + "schema_version": { + "const": "1-0-2", + "description": "The version of the schema (string)" + }, + "name": { + "type": "string", + "description": "The name of the experiment (string)" + }, + "experiment_uuid": { + "type": "string", + "format": "uuid", + "description": "Unique uuid for the experiment (string)" + }, + "description": { + "type": "string", + "description": "Short description of the experiment (string, < 150 char)" + }, + "long_description": { + "type": "string", + "description": "Long description of the experiment (string)" + }, + "model": { + "oneOf": [ + {"type": ["string", "null"]}, + { + "type": "array", + "items": {"type": ["string", "null"]} + } + ], + "description": "The name(s) of the model(s) used in the experiment (string)" + }, + "realm": 
{ + "type": "array", + "items": { + "oneOf": [ + {"type": "null"}, + { + "type": "string", + "enum": [ + "aerosol", + "atmos", + "atmosChem", + "land", + "landIce", + "none", + "ocean", + "ocnBgchem", + "seaIce", + "unknown", + "wave" + ] + } + ] + }, + "description": "The realm(s) included in the experiment (string)" + }, + "frequency": { + "type": "array", + "items": { + "oneOf": [ + {"type": "null"}, + { + "type": "string", + "oneOf": [ + { + "pattern": "^fx$" + }, + { + "pattern": "^subhr$" + }, + { + "pattern": "^\\d+hr$" + }, + { + "pattern": "^\\d+day$" + }, + { + "pattern": "^\\d+mon$" + }, + { + "pattern": "^\\d+yr$" + }, + { + "pattern": "^\\d+dec$" + } + ] + } + ] + }, + "description": "The frequency(/ies) included in the experiment (string)" + }, + "variable": { + "type": "array", + "items": { + "type": ["string", "null"] + }, + "description": "The variable(s) included in the experiment (string)" + }, + "nominal_resolution": { + "type": "array", + "items": {"type": ["string", "null"]}, + "description": "The nominal resolution(s) of model(s) used in the experiment (string)" + }, + "version": { + "type": ["number", "string", "null"], + "description": "The version of the experiment (number, string)" + }, + "contact": { + "type": ["string", "null"], + "description": "Contact name for the experiment (string)" + }, + "email": { + "type": ["string", "null"], + "description": "Email address of the contact for the experiment (string)" + }, + "created": { + "type": ["string", "null"], + "description": "Initial creation date of experiment (string)" + }, + "reference": { + "type": ["string", "null"], + "description": "Citation or reference information (string)" + }, + "license": { + "type": ["string", "null"], + "description": "License of the experiment (string)" + }, + "url": { + "type": ["string", "null"], + "description": "Relevant url, e.g. 
github repo for experiment configuration (string)" + }, + "parent_experiment": { + "type": ["string", "null"], + "description": "experiment_uuid for parent experiment if appropriate (string)" + }, + "related_experiments": { + "type": "array", + "items": { + "type": ["string", "null"] + }, + "description": "experiment_uuids for any related experiment(s) (string)" + }, + "notes": { + "type": ["string", "null"], + "description": "Additional notes (string)" + }, + "keywords": { + "type": "array", + "items": { + "type": ["string", "null"] + }, + "description": "Keywords associated with the experiment (string)" + } + }, + "required": [ + "name", + "experiment_uuid", + "description", + "long_description" + ], + "additionalProperties": false +} diff --git a/src/access_nri_intake/data/metadata_schema_file.json b/src/access_nri_intake/data/metadata_schema_file.json new file mode 100644 index 0000000..5e02b4b --- /dev/null +++ b/src/access_nri_intake/data/metadata_schema_file.json @@ -0,0 +1,132 @@ +{ + "$id": "https://raw.githubusercontent.com/ACCESS-NRI/schema/main/au.org.access-nri/model/output/file-metadata/1-0-1.json", + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "File metadata", + "description": "The metadata associated with a file containing or referencing climate model data", + "type": "object", + "properties": { + "schema_version": { + "const": "1-0-1", + "description": "The version of the schema (string)" + }, + "path": { + "type": "string", + "description": "The path to the file asset" + }, + "realm": { + "type": "string", + "enum": [ + "aerosol", + "atmos", + "atmosChem", + "land", + "landIce", + "none", + "ocean", + "ocnBgchem", + "seaIce", + "unknown", + "wave" + ], + "description": "The realm of the data in the file asset" + }, + "variable": { + "type": "array", + "items": { + "type": ["string", "null"] + }, + "description": "The variable(s) in the file asset" + }, + "variable_long_name": { + "type": "array", + "items": { + "type": 
["string", "null"] + }, + "description": "The long_name(s) of the variable(s) in the file asset" + }, + "variable_standard_name": { + "type": "array", + "items": { + "type": ["string", "null"] + }, + "description": "The standard_name(s) of the variable(s) in the file asset" + }, + "variable_cell_methods": { + "type": "array", + "items": { + "type": ["string", "null"] + }, + "description": "The cell_methods(s) of the variable(s) in the file asset" + }, + "variable_units": { + "type": "array", + "items": { + "type": ["string", "null"] + }, + "description": "The units of the variable(s) in the file asset" + }, + "frequency": { + "type": "string", + "oneOf": [ + { + "pattern": "^fx$" + }, + { + "pattern": "^subhr$" + }, + { + "pattern": "^\\d+hr$" + }, + { + "pattern": "^\\d+day$" + }, + { + "pattern": "^\\d+mon$" + }, + { + "pattern": "^\\d+yr$" + }, + { + "pattern": "^\\d+dec$" + } + ], + "description": "The frequency of the variable(s) in the file asset" + }, + "start_date": { + "type": "string", + "oneOf": [ + { + "pattern": "^\\d\\d\\d\\d-\\d\\d-\\d\\d,\\s\\d\\d:\\d\\d:\\d\\d$" + }, + { + "pattern": "none" + } + ], + "description": "The start date of the variable(s) in the file asset" + }, + "end_date": { + "type": "string", + "oneOf": [ + { + "pattern": "^\\d\\d\\d\\d-\\d\\d-\\d\\d,\\s\\d\\d:\\d\\d:\\d\\d$" + }, + { + "pattern": "none" + } + ], + "description": "The end date of the variable(s) in the file asset" + }, + "nominal_resolution": { + "type": "string", + "description": "The nominal resolution of the variable(s) in the file asset" + } + }, + "required": [ + "path", + "realm", + "variable", + "frequency", + "start_date", + "end_date" + ] +} diff --git a/src/access_nri_intake/source/__init__.py b/src/access_nri_intake/source/__init__.py index 4f0242a..9a9c5ac 100644 --- a/src/access_nri_intake/source/__init__.py +++ b/src/access_nri_intake/source/__init__.py @@ -17,9 +17,7 @@ PATH_COLUMN = "path" VARIABLE_COLUMN = "variable" -SCHEMA_URL = 
"https://raw.githubusercontent.com/ACCESS-NRI/schema/e9055da95093ec2faa555c090fc5af17923d1566/au.org.access-nri/model/output/file-metadata/1-0-1.json" -SCHEMA_HASH = "8f2f069fa06d81ff086b91daa6503f75615aa90385ab61ee2d1a7956dc96f9a6" _, ESM_JSONSCHEMA = get_jsonschema( - url=SCHEMA_URL, known_hash=SCHEMA_HASH, required=CORE_COLUMNS + metadata_file="data/metadata_schema_file.json", required=CORE_COLUMNS ) diff --git a/src/access_nri_intake/utils.py b/src/access_nri_intake/utils.py index af2e6a6..b2895f6 100644 --- a/src/access_nri_intake/utils.py +++ b/src/access_nri_intake/utils.py @@ -4,32 +4,25 @@ """ General utility functions for access-rni-intake """ import json +from importlib import resources as rsr from warnings import warn import jsonschema -import pooch import yaml -def get_jsonschema(url, known_hash, required): +def get_jsonschema(metadata_file, required): """ - Download a jsonschema from a url. Returns the unaltered jsonschema and a version with the "required" key - matching the properties provided. + Read in the required JSON schema, and annotate it with "required" fields. Parameters ---------- - url: str - The URL to the jsonschema file. ACCESS-NRI schema can be found at - https://github.com/ACCESS-NRI/schema. - known_hash: str - A known hash (checksum) of the file. See :py:func:`~pooch.retrieve`. 
required: list A list of the properties to include in the "required" key """ - schema_file = pooch.retrieve(url=url, known_hash=known_hash) - - with open(schema_file) as fpath: + schema_file = rsr.files("access_nri_intake").joinpath(metadata_file) + with schema_file.open(mode="r") as fpath: schema = json.load(fpath) schema_required = schema.copy() @@ -123,3 +116,7 @@ def _is_array(field): for nfield in field["oneOf"]: is_array = is_array or _is_array(nfield) return is_array + + +def get_catalog_fp(): + return rsr.files("access_nri_intake").joinpath("data/catalog.yaml") diff --git a/tests/test_data.py b/tests/test_data.py new file mode 100644 index 0000000..514df27 --- /dev/null +++ b/tests/test_data.py @@ -0,0 +1,15 @@ +# Copyright 2024 ACCESS-NRI and contributors. See the top-level COPYRIGHT file for details. +# SPDX-License-Identifier: Apache-2.0 + +import os + +from access_nri_intake.utils import get_catalog_fp + + +def test_get_catalog_fp(): + _oneup = os.path.abspath(os.path.dirname("../")) + assert str(get_catalog_fp()) == str( + os.path.join( + _oneup, "access-nri-intake-catalog/src/access_nri_intake/data/catalog.yaml" + ) + ) diff --git a/tests/test_utils.py b/tests/test_utils.py index 8bc3810..51748e5 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -14,26 +14,25 @@ @pytest.mark.parametrize( - "known_hash", - ["2a09030653f495939c90a22e95dd1c4587c8695f7f07e17b9129a6491469f9fc", None], + "schema_file", + ["data/metadata_schema_experiment.json", "data/metadata_schema_file.json"], ) -def test_get_jsonschema(known_hash): +def test_get_jsonschema(schema_file): """ - Test that jsonschema are correctly downloaded and required fields are overwritten + Test that required fields are overwritten """ - url = "https://raw.githubusercontent.com/ACCESS-NRI/schema/4e3d10e563d7c1c9f66e9ab92a2926cdec3d6893/file_asset.json" required = [ - "path", + "realm", ] schema, schema_required = get_jsonschema( - url=url, known_hash=known_hash, required=required + 
metadata_file=schema_file, required=required ) assert "$schema" in schema assert schema_required["required"] == required required += ["foo"] with pytest.warns(UserWarning): - _, _ = get_jsonschema(url=url, known_hash=known_hash, required=required) + _, _ = get_jsonschema(metadata_file=schema_file, required=required) def test_load_metadata_yaml(tmp_path):