From dd8cf80dfbd72c2bd6e74e84d93873062601a7ae Mon Sep 17 00:00:00 2001 From: Bart Schilperoort Date: Thu, 8 Feb 2024 13:27:48 +0100 Subject: [PATCH 01/22] Add soil temperature and soil moisture to ERA5 data --- src/zampy/conventions/ALMA.json | 8 ++++ src/zampy/datasets/cds_utils.py | 74 +++++++++++++++++++++++++++++++- src/zampy/datasets/era5.py | 17 +++++++- src/zampy/recipe.py | 4 +- src/zampy/reference/variables.py | 3 ++ 5 files changed, 102 insertions(+), 4 deletions(-) diff --git a/src/zampy/conventions/ALMA.json b/src/zampy/conventions/ALMA.json index ddeb60a..ece5985 100644 --- a/src/zampy/conventions/ALMA.json +++ b/src/zampy/conventions/ALMA.json @@ -84,5 +84,13 @@ "land_cover": { "variable": "land_cover", "units": "" + }, + "soil_temperature": { + "variable": "SoilTemp", + "units": "kelvin" + }, + "soil_moisture": { + "variable": "SoilMoist", + "units": "kilogram/meter**3" } } \ No newline at end of file diff --git a/src/zampy/datasets/cds_utils.py b/src/zampy/datasets/cds_utils.py index 4eade9e..23d9661 100644 --- a/src/zampy/datasets/cds_utils.py +++ b/src/zampy/datasets/cds_utils.py @@ -1,5 +1,6 @@ """CDS utilities used by ECMWF datasets.""" +from copy import copy from pathlib import Path import cdsapi import numpy as np @@ -44,6 +45,21 @@ "21:00", "22:00", "23:00", ] # fmt: skip +SPLIT_VARIABLES = { + "soil_temperature": ( + "soil_temperature_level_1", + "soil_temperature_level_2", + "soil_temperature_level_3", + "soil_temperature_level_4", + ), + "moisture_content_of_soil_layer": ( + "volumetric_soil_water_layer_1", + "volumetric_soil_water_layer_2", + "volumetric_soil_water_layer_3", + "volumetric_soil_water_layer_4", + ), +} + def cds_request( dataset: str, @@ -226,6 +242,12 @@ def retrieve_era5( # create list of year/month pairs year_month_pairs = time_bounds_to_year_month(time_bounds) + variables = copy(variables) # Prevent original input from being modified in-place + for split_var in SPLIT_VARIABLES: + if split_var in variables: + 
variables.remove(split_var) + variables.extend(SPLIT_VARIABLES[split_var]) + for (year, month), variable in product( year_month_pairs, variables, position=0, leave=True ): @@ -354,7 +376,8 @@ def convert_to_zampy( print(f"File '{ncfile.name}' already exists, skipping...") else: ds = parse_nc_file(file) - + # Rename the vswl data: + ncfile = Path(str(ncfile).replace("volumetric_soil_water", "soil_moisture")) ds.to_netcdf(path=ncfile) @@ -373,6 +396,28 @@ def convert_to_zampy( "co2": "co2_concentration", } +VAR_REFERENCE_MULTI_LAYER = { + "stl1": "soil_temperature", + "stl2": "soil_temperature", + "stl3": "soil_temperature", + "stl4": "soil_temperature", + "swvl1": "soil_moisture", + "swvl2": "soil_moisture", + "swvl3": "soil_moisture", + "swvl4": "soil_moisture", +} + +LAYER_BOUNDS = { + "stl1": [[0.0, 7.0]], + "stl2": [[7.0, 28.0]], + "stl3": [[28.0, 100.0]], + "stl4": [[100.0, 289.0]], + "swvl1": [[0.0, 7.0]], + "swvl2": [[7.0, 28.0]], + "swvl3": [[28.0, 100.0]], + "swvl4": [[100.0, 289.0]], +} + WATER_DENSITY = 997.0 # kg/m3 @@ -416,6 +461,33 @@ def parse_nc_file(file: Path) -> xr.Dataset: variable_name ].desc + if variable in VAR_REFERENCE_MULTI_LAYER: + if ( # Soil temperature/moisture routine + str(variable).startswith("stl") or str(variable).startswith("swvl") + ): + if str(variable).startswith("swvl"): + varname = "soil_moisture" + standard_name = "moisture_content_of_soil_layer" + ds[variable] *= WATER_DENSITY + ds[variable].attrs.update({"units": "kg m**-3"}) + else: + varname = "soil_temperature" + standard_name = "temperature_in_ground" + + da = ds[variable] + name = str(da.name) + da = da.expand_dims({"depth": [np.mean(LAYER_BOUNDS[name])]}) + da = da.rename(varname) + da.attrs.update( + { + "long_name": varname.replace("_", " "), + "standard_name": standard_name, + } + ) + + ds = da.to_dataset() + ds["depth_bounds"] = (("depth", "nv"), LAYER_BOUNDS[name]) + # TODO: add dataset attributes. 
return ds diff --git a/src/zampy/datasets/era5.py b/src/zampy/datasets/era5.py index dda319f..f0edfef 100644 --- a/src/zampy/datasets/era5.py +++ b/src/zampy/datasets/era5.py @@ -47,15 +47,30 @@ class ERA5Land(ECMWFDataset): # noqa: D101 raw_variables = [ Variable(name="t2m", unit=unit_registry.kelvin), Variable(name="d2m", unit=unit_registry.kelvin), + Variable(name="st", unit=unit_registry.kelvin), + Variable(name="swvl", unit=unit_registry.fraction), ] # variable names used in cdsapi downloading request cds_var_names = { "air_temperature": "2m_temperature", "dewpoint_temperature": "2m_dewpoint_temperature", + "soil_temperature_level_1": "soil_temperature_level_1", # Note: split variables + "soil_temperature_level_2": "soil_temperature_level_2", + "soil_temperature_level_3": "soil_temperature_level_3", + "soil_temperature_level_4": "soil_temperature_level_4", + "volumetric_soil_water_layer_1": "volumetric_soil_water_layer_1", + "volumetric_soil_water_layer_2": "volumetric_soil_water_layer_2", + "volumetric_soil_water_layer_3": "volumetric_soil_water_layer_3", + "volumetric_soil_water_layer_4": "volumetric_soil_water_layer_4", } - variable_names = list(cds_var_names.keys()) + variable_names = [ + "air_temperature", + "dewpoint_temperature", + "soil_temperature", + "soil_moisture", + ] variables = [VARIABLE_REFERENCE_LOOKUP[var] for var in variable_names] diff --git a/src/zampy/recipe.py b/src/zampy/recipe.py index 5cd26ac..a0b24c5 100644 --- a/src/zampy/recipe.py +++ b/src/zampy/recipe.py @@ -31,8 +31,8 @@ def recipe_loader(recipe_path: Path) -> dict: for key in ["convention", "frequency", "resolution"] ): msg = ( - "One of the following items are missing from the recipe:\n" - "name, download, convert." + "One of the following 'convert' items are missing from the recipe:\n" + "convention, frequency, resolution." 
) raise ValueError(msg) diff --git a/src/zampy/reference/variables.py b/src/zampy/reference/variables.py index d125e7a..d021e4d 100644 --- a/src/zampy/reference/variables.py +++ b/src/zampy/reference/variables.py @@ -13,6 +13,7 @@ def unit_registration() -> UnitRegistry: unit_registry.define("degree_east = degree = degree_E = degreeE") unit_registry.define("watt_per_square_meter = watt/meter**2") unit_registry.define("joule_per_square_meter = joule/meter**2") + unit_registry.define("kilogram_per_square_meter = kilogram/(meter**2)") unit_registry.define( "kilogram_per_square_meter_second = kilogram/(meter**2*second)" ) @@ -56,6 +57,8 @@ def unit_registration() -> UnitRegistry: Variable("elevation", unit=unit_registry.meter), Variable("leaf_area_index", unit=unit_registry.fraction), Variable("land_cover", unit=unit_registry.dimensionless), + Variable("soil_temperature", unit=unit_registry.kelvin), + Variable("soil_moisture", unit=unit_registry.kilogram_per_square_meter), ) VARIABLE_REFERENCE_LOOKUP = {var.name: var for var in VARIABLE_REFERENCE} From 711f3eb76d304c74ead43757bac2b7a7c0f53207 Mon Sep 17 00:00:00 2001 From: Bart Schilperoort Date: Fri, 9 Feb 2024 14:14:01 +0100 Subject: [PATCH 02/22] Fix incorrect name in SPLIT_VARIABLES constant --- src/zampy/datasets/cds_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/zampy/datasets/cds_utils.py b/src/zampy/datasets/cds_utils.py index 23d9661..c223a9a 100644 --- a/src/zampy/datasets/cds_utils.py +++ b/src/zampy/datasets/cds_utils.py @@ -52,7 +52,7 @@ "soil_temperature_level_3", "soil_temperature_level_4", ), - "moisture_content_of_soil_layer": ( + "soil_moisture": ( "volumetric_soil_water_layer_1", "volumetric_soil_water_layer_2", "volumetric_soil_water_layer_3", From 6686d58b47985752994e2f63d7794326949993ec Mon Sep 17 00:00:00 2001 From: Bart Schilperoort Date: Fri, 14 Jun 2024 14:46:14 +0200 Subject: [PATCH 03/22] Use Dask distributed by default: avoids memory issues --- 
src/zampy/cli.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/zampy/cli.py b/src/zampy/cli.py index e7b4564..f417066 100644 --- a/src/zampy/cli.py +++ b/src/zampy/cli.py @@ -1,6 +1,7 @@ """Implements CLI interface for Zampy.""" from pathlib import Path import click +import dask.distributed from zampy.recipe import RecipeManager @@ -14,4 +15,5 @@ def run_recipe(recipe: Path) -> None: if __name__ == "__main__": + dask.distributed.Client() run_recipe() From 5a9bbcdf3347b15c2f2590f955ba1a74b4883d12 Mon Sep 17 00:00:00 2001 From: Bart Schilperoort Date: Fri, 14 Jun 2024 14:47:06 +0200 Subject: [PATCH 04/22] Validate recipe with datasets before downloading --- src/zampy/recipe.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/zampy/recipe.py b/src/zampy/recipe.py index a0b24c5..82c0794 100644 --- a/src/zampy/recipe.py +++ b/src/zampy/recipe.py @@ -8,6 +8,7 @@ from zampy.datasets.dataset_protocol import Dataset from zampy.datasets.dataset_protocol import SpatialBounds from zampy.datasets.dataset_protocol import TimeBounds +from zampy.datasets.validation import validate_download_request def recipe_loader(recipe_path: Path) -> dict: @@ -92,9 +93,22 @@ def __init__(self, recipe_path: Path) -> None: def run(self) -> None: """Run the full recipe.""" + # First validate all inputs (before downloading, processing...) 
for dataset_name in self.datasets: _dataset = DATASETS[dataset_name.lower()] dataset: Dataset = _dataset() + + validate_download_request( + dataset, + self.download_dir, + self.timebounds, + self.spatialbounds, + self.datasets[dataset_name]["variables"], + ) + + for dataset_name in self.datasets: + _dataset = DATASETS[dataset_name.lower()] + dataset = _dataset() variables: list[str] = self.datasets[dataset_name]["variables"] # Download datset From d396e48e3a23d8bf2c4cf8432a26c23a4dbeb304 Mon Sep 17 00:00:00 2001 From: Bart Schilperoort Date: Fri, 14 Jun 2024 14:48:11 +0200 Subject: [PATCH 05/22] Delete in-memory dataset after saving to reduce mem usage --- src/zampy/recipe.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/zampy/recipe.py b/src/zampy/recipe.py index 82c0794..28ef2d5 100644 --- a/src/zampy/recipe.py +++ b/src/zampy/recipe.py @@ -141,6 +141,7 @@ def run(self) -> None: # e.g. "era5_2010-2020.nc" fname = f"{dataset_name.lower()}_{time_start}-{time_end}.nc" ds.to_netcdf(path=self.data_dir / fname, encoding=encoding) + del ds print( "Finished running the recipe. 
Output data can be found at:\n" From 33c5cd915f6d69515068edbc1257785347b53875 Mon Sep 17 00:00:00 2001 From: Bart Schilperoort Date: Wed, 10 Jul 2024 09:09:55 +0200 Subject: [PATCH 06/22] Add appropriate time bounds to canopy height data set --- src/zampy/datasets/eth_canopy_height.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/src/zampy/datasets/eth_canopy_height.py b/src/zampy/datasets/eth_canopy_height.py index c186de3..c7557ec 100644 --- a/src/zampy/datasets/eth_canopy_height.py +++ b/src/zampy/datasets/eth_canopy_height.py @@ -269,10 +269,17 @@ def parse_tiff_file(file: Path, sd_file: bool = False) -> xr.Dataset: da = da.isel(band=0) # get rid of band dim da = da.drop_vars(["band", "spatial_ref"]) # drop unnecessary coords ds = da.to_dataset() - ds = ds.assign_coords( # halfway in the year - {"time": np.datetime64("2020-07-01").astype("datetime64[ns]")} + ds = xr.concat( # Cover entirety of 2020 + ( + ds.assign_coords( + {"time": np.datetime64("2020-01-01").astype("datetime64[ns]")} + ), + ds.assign_coords( + {"time": np.datetime64("2021-01-01").astype("datetime64[ns]")} + ), + ), + dim="time", ) - ds = ds.expand_dims("time") ds = ds.rename( { "band_data": "height_of_vegetation_standard_deviation" From 0e654c9f405d8b9405b15910f64314e278dd2e14 Mon Sep 17 00:00:00 2001 From: Bart Schilperoort Date: Wed, 10 Jul 2024 09:14:02 +0200 Subject: [PATCH 07/22] New formatting --- src/zampy/__init__.py | 1 + src/zampy/cli.py | 1 + src/zampy/datasets/__init__.py | 1 + src/zampy/datasets/catalog.py | 1 + src/zampy/datasets/converter.py | 1 + src/zampy/datasets/dataset_protocol.py | 1 + src/zampy/datasets/eth_canopy_height.py | 1 + src/zampy/datasets/prism_dem.py | 1 + src/zampy/datasets/utils.py | 1 + src/zampy/datasets/validation.py | 1 + src/zampy/recipe.py | 3 ++- src/zampy/reference/variables.py | 1 + tests/test_data/fapar-lai/generate_fake_data.py | 1 + tests/test_datasets/__init__.py | 1 + tests/test_datasets/test_fapar_lai.py | 1 
+ tests/test_recipes/generate_test_data.py | 1 + tests/test_recipes/test_recipe_loader.py | 1 + tests/test_recipes/test_simple_recipe.py | 1 + 18 files changed, 19 insertions(+), 1 deletion(-) diff --git a/src/zampy/__init__.py b/src/zampy/__init__.py index e5cc82f..f13f2a7 100644 --- a/src/zampy/__init__.py +++ b/src/zampy/__init__.py @@ -1,4 +1,5 @@ """zampy.""" + from zampy import datasets diff --git a/src/zampy/cli.py b/src/zampy/cli.py index f417066..485f5b1 100644 --- a/src/zampy/cli.py +++ b/src/zampy/cli.py @@ -1,4 +1,5 @@ """Implements CLI interface for Zampy.""" + from pathlib import Path import click import dask.distributed diff --git a/src/zampy/datasets/__init__.py b/src/zampy/datasets/__init__.py index 155a9d2..cf0ca5a 100644 --- a/src/zampy/datasets/__init__.py +++ b/src/zampy/datasets/__init__.py @@ -1,4 +1,5 @@ """Datasets implementations.""" + from zampy.datasets import dataset_protocol from zampy.datasets import validation from zampy.datasets.catalog import DATASETS diff --git a/src/zampy/datasets/catalog.py b/src/zampy/datasets/catalog.py index 44e8ffb..7a30cad 100644 --- a/src/zampy/datasets/catalog.py +++ b/src/zampy/datasets/catalog.py @@ -1,4 +1,5 @@ """Catalog of datasets.""" + from zampy.datasets import dataset_protocol from zampy.datasets.cams import CAMS from zampy.datasets.era5 import ERA5 diff --git a/src/zampy/datasets/converter.py b/src/zampy/datasets/converter.py index 6692be8..1e6b75d 100644 --- a/src/zampy/datasets/converter.py +++ b/src/zampy/datasets/converter.py @@ -1,4 +1,5 @@ """Dataset formatter for different conventions.""" + import json import warnings from pathlib import Path diff --git a/src/zampy/datasets/dataset_protocol.py b/src/zampy/datasets/dataset_protocol.py index ce3a0cc..f58c57d 100644 --- a/src/zampy/datasets/dataset_protocol.py +++ b/src/zampy/datasets/dataset_protocol.py @@ -1,4 +1,5 @@ """Outline of the dataset protocol.""" + import json import shutil from dataclasses import dataclass diff --git 
a/src/zampy/datasets/eth_canopy_height.py b/src/zampy/datasets/eth_canopy_height.py index c7557ec..48953a2 100644 --- a/src/zampy/datasets/eth_canopy_height.py +++ b/src/zampy/datasets/eth_canopy_height.py @@ -1,4 +1,5 @@ """ETH canopy height dataset.""" + import gzip from pathlib import Path import numpy as np diff --git a/src/zampy/datasets/prism_dem.py b/src/zampy/datasets/prism_dem.py index 73440bb..0952ad8 100644 --- a/src/zampy/datasets/prism_dem.py +++ b/src/zampy/datasets/prism_dem.py @@ -1,4 +1,5 @@ """Prism DEM dataset.""" + import gzip import tarfile from pathlib import Path diff --git a/src/zampy/datasets/utils.py b/src/zampy/datasets/utils.py index 69489a2..a737baa 100644 --- a/src/zampy/datasets/utils.py +++ b/src/zampy/datasets/utils.py @@ -1,4 +1,5 @@ """Shared utilities from datasets.""" + import urllib.request from pathlib import Path import requests diff --git a/src/zampy/datasets/validation.py b/src/zampy/datasets/validation.py index df5f9f6..47cbdbc 100644 --- a/src/zampy/datasets/validation.py +++ b/src/zampy/datasets/validation.py @@ -1,4 +1,5 @@ """Checks for user input validation.""" + from pathlib import Path from zampy.datasets.dataset_protocol import Dataset from zampy.datasets.dataset_protocol import SpatialBounds diff --git a/src/zampy/recipe.py b/src/zampy/recipe.py index 28ef2d5..c9c799d 100644 --- a/src/zampy/recipe.py +++ b/src/zampy/recipe.py @@ -1,4 +1,5 @@ -""""All functionality to read and execute Zampy recipes.""" +""" "All functionality to read and execute Zampy recipes.""" + from pathlib import Path from typing import Any import numpy as np diff --git a/src/zampy/reference/variables.py b/src/zampy/reference/variables.py index d021e4d..642b6eb 100644 --- a/src/zampy/reference/variables.py +++ b/src/zampy/reference/variables.py @@ -1,4 +1,5 @@ """Variable reference for Zampy.""" + from pint import UnitRegistry from zampy.datasets.dataset_protocol import Variable diff --git a/tests/test_data/fapar-lai/generate_fake_data.py 
b/tests/test_data/fapar-lai/generate_fake_data.py index 051b651..a42c475 100644 --- a/tests/test_data/fapar-lai/generate_fake_data.py +++ b/tests/test_data/fapar-lai/generate_fake_data.py @@ -1,4 +1,5 @@ """Generate fake data for the fapar-lai tests.""" + import zipfile from pathlib import Path import numpy as np diff --git a/tests/test_datasets/__init__.py b/tests/test_datasets/__init__.py index ccb1f08..596750e 100644 --- a/tests/test_datasets/__init__.py +++ b/tests/test_datasets/__init__.py @@ -1,4 +1,5 @@ """This module contains all tests for datasets included in zampy.""" + from pathlib import Path diff --git a/tests/test_datasets/test_fapar_lai.py b/tests/test_datasets/test_fapar_lai.py index 8ead16d..2db36e9 100644 --- a/tests/test_datasets/test_fapar_lai.py +++ b/tests/test_datasets/test_fapar_lai.py @@ -1,4 +1,5 @@ """Unit tests for the FAPAR-LAI dataset.""" + import json from pathlib import Path from unittest.mock import patch diff --git a/tests/test_recipes/generate_test_data.py b/tests/test_recipes/generate_test_data.py index 016c6cf..2a88ff6 100644 --- a/tests/test_recipes/generate_test_data.py +++ b/tests/test_recipes/generate_test_data.py @@ -1,4 +1,5 @@ """Generates test data for running the recipe tests.""" + from pathlib import Path import numpy as np import pandas as pd diff --git a/tests/test_recipes/test_recipe_loader.py b/tests/test_recipes/test_recipe_loader.py index 1f3d7cc..042c996 100644 --- a/tests/test_recipes/test_recipe_loader.py +++ b/tests/test_recipes/test_recipe_loader.py @@ -1,4 +1,5 @@ """Test the recipe loader.""" + import pytest from zampy.recipe import recipe_loader diff --git a/tests/test_recipes/test_simple_recipe.py b/tests/test_recipes/test_simple_recipe.py index 536422f..2c8076a 100644 --- a/tests/test_recipes/test_simple_recipe.py +++ b/tests/test_recipes/test_simple_recipe.py @@ -1,4 +1,5 @@ """Testing a simple recipe.""" + from pathlib import Path from unittest.mock import patch import generate_test_data From 
adf73de29cd81ed56cad99f067995c6b6bf15933 Mon Sep 17 00:00:00 2001 From: Bart Schilperoort Date: Wed, 10 Jul 2024 10:49:19 +0200 Subject: [PATCH 08/22] Make tests use dask distributed scheduler --- pyproject.toml | 2 +- tests/test_datasets/test_fapar_lai.py | 3 +++ tests/test_recipes/test_simple_recipe.py | 4 ++++ 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 4a178f3..6331e99 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -134,7 +134,7 @@ python_version = "3.10" [tool.ruff] line-length = 88 -exclude = ["docs", "build"] +exclude = ["docs", "build", "tests"] target-version = "py310" [tool.ruff.lint] diff --git a/tests/test_datasets/test_fapar_lai.py b/tests/test_datasets/test_fapar_lai.py index 2db36e9..f9a5d58 100644 --- a/tests/test_datasets/test_fapar_lai.py +++ b/tests/test_datasets/test_fapar_lai.py @@ -10,6 +10,7 @@ from zampy.datasets.dataset_protocol import SpatialBounds from zampy.datasets.dataset_protocol import TimeBounds from . 
import data_folder +import dask.distributed @pytest.fixture(scope="function") @@ -86,6 +87,8 @@ def test_download(self, mock_retrieve, valid_path_config, dummy_dir): @pytest.mark.slow def test_ingest(self, dummy_dir): """Test ingest function.""" + dask.distributed.Client() + ingest_dir = Path(dummy_dir) / "ingest" ingest_dir.mkdir() diff --git a/tests/test_recipes/test_simple_recipe.py b/tests/test_recipes/test_simple_recipe.py index 2c8076a..a00bc20 100644 --- a/tests/test_recipes/test_simple_recipe.py +++ b/tests/test_recipes/test_simple_recipe.py @@ -12,6 +12,8 @@ from zampy.datasets.dataset_protocol import write_properties_file from zampy.recipe import RecipeManager from zampy.recipe import convert_time +import dask.distributed + RECIPE_FILE = Path(__file__).parent / "recipes" / "era5_recipe.yml" @@ -21,6 +23,8 @@ def test_recipe(tmp_path: Path, mocker): with ( patch.object(DATASETS["era5"], "download"), ): + dask.distributed.Client() + mocker.patch( "zampy.recipe.config_loader", return_value={"working_directory": str(tmp_path.absolute())}, From 014d632dc3303c83d235169854d5b692dba457fe Mon Sep 17 00:00:00 2001 From: Bart Schilperoort Date: Wed, 17 Jul 2024 09:26:01 +0200 Subject: [PATCH 09/22] Ignore typing in test folder --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 6331e99..6f38503 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -131,6 +131,7 @@ markers = [ ignore_missing_imports = true disallow_untyped_defs = true python_version = "3.10" +exclude = "tests" [tool.ruff] line-length = 88 From f26a700d0c29c9fe97c2358e59b7b528a7970660 Mon Sep 17 00:00:00 2001 From: Bart Schilperoort Date: Wed, 17 Jul 2024 09:26:34 +0200 Subject: [PATCH 10/22] Add interpolation for datasets with lower frequency than recipe target --- src/zampy/recipe.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/src/zampy/recipe.py b/src/zampy/recipe.py index c9c799d..6df130d 100644 --- 
a/src/zampy/recipe.py +++ b/src/zampy/recipe.py @@ -3,6 +3,8 @@ from pathlib import Path from typing import Any import numpy as np +import pandas as pd +import xarray as xr import yaml from zampy.datasets import DATASETS from zampy.datasets import converter @@ -133,7 +135,18 @@ def run(self) -> None: ds = converter.convert(ds, dataset, convention=self.convention) if "time" in ds.dims: # Dataset with only DEM (e.g.) has no time dim. - ds = ds.resample(time=self.frequency).mean() + freq = xr.infer_freq(ds["time"]) + if freq is None: # fallback: + freq = ( + ds["time"].isel(time=1).to_numpy() - + ds["time"].isel(time=0).to_numpy() + ) + data_freq = pd.to_timedelta(pd.tseries.frequencies.to_offset(freq)) + + if data_freq < pd.Timedelta(self.frequency): + ds = ds.resample(time=self.frequency).mean() + elif data_freq > pd.Timedelta(self.frequency): + ds = ds.resample(time=self.frequency).interpolate("nearest") comp = dict(zlib=True, complevel=5) encoding = {var: comp for var in ds.data_vars} From aa650d310568494e221007c84db5e033fccda4fc Mon Sep 17 00:00:00 2001 From: Bart Schilperoort Date: Wed, 17 Jul 2024 09:55:26 +0200 Subject: [PATCH 11/22] Update formatting and linting (new rules) --- src/zampy/recipe.py | 6 +++--- tests/test_datasets/test_eth_canopy_height.py | 6 +++--- tests/test_datasets/test_fapar_lai.py | 2 +- tests/test_datasets/test_prism_dem.py | 2 +- tests/test_recipes/test_simple_recipe.py | 3 +-- 5 files changed, 9 insertions(+), 10 deletions(-) diff --git a/src/zampy/recipe.py b/src/zampy/recipe.py index 6df130d..b1a0cef 100644 --- a/src/zampy/recipe.py +++ b/src/zampy/recipe.py @@ -1,4 +1,4 @@ -""" "All functionality to read and execute Zampy recipes.""" +"""All functionality to read and execute Zampy recipes.""" from pathlib import Path from typing import Any @@ -138,8 +138,8 @@ def run(self) -> None: freq = xr.infer_freq(ds["time"]) if freq is None: # fallback: freq = ( - ds["time"].isel(time=1).to_numpy() - - ds["time"].isel(time=0).to_numpy() + 
ds["time"].isel(time=1).to_numpy() + - ds["time"].isel(time=0).to_numpy() ) data_freq = pd.to_timedelta(pd.tseries.frequencies.to_offset(freq)) diff --git a/tests/test_datasets/test_eth_canopy_height.py b/tests/test_datasets/test_eth_canopy_height.py index 70112a1..b8ac145 100644 --- a/tests/test_datasets/test_eth_canopy_height.py +++ b/tests/test_datasets/test_eth_canopy_height.py @@ -71,7 +71,7 @@ def test_ingest(self, dummy_dir): """Test ingest function.""" ds, _ = self.ingest_dummy_data(dummy_dir) - assert type(ds) == xr.Dataset + assert isinstance(ds, xr.Dataset) def test_load(self, dummy_dir): """Test load function.""" @@ -153,7 +153,7 @@ def test_parse_tiff_file(): "ETH_GlobalCanopyHeight_10m_2020_N51E003_Map.tif", ) ) - assert type(dummy_ds) == xr.Dataset + assert isinstance(dummy_ds, xr.Dataset) def test_convert_tiff_to_netcdf(dummy_dir): @@ -171,4 +171,4 @@ def test_convert_tiff_to_netcdf(dummy_dir): ds = xr.load_dataset( Path(dummy_dir, "ETH_GlobalCanopyHeight_10m_2020_N51E003_Map.nc") ) - assert type(ds) == xr.Dataset + assert isinstance(ds, xr.Dataset) diff --git a/tests/test_datasets/test_fapar_lai.py b/tests/test_datasets/test_fapar_lai.py index f9a5d58..2e11c8d 100644 --- a/tests/test_datasets/test_fapar_lai.py +++ b/tests/test_datasets/test_fapar_lai.py @@ -3,6 +3,7 @@ import json from pathlib import Path from unittest.mock import patch +import dask.distributed import numpy as np import pytest import xarray as xr @@ -10,7 +11,6 @@ from zampy.datasets.dataset_protocol import SpatialBounds from zampy.datasets.dataset_protocol import TimeBounds from . 
import data_folder -import dask.distributed @pytest.fixture(scope="function") diff --git a/tests/test_datasets/test_prism_dem.py b/tests/test_datasets/test_prism_dem.py index 5c7d1e1..83b549f 100644 --- a/tests/test_datasets/test_prism_dem.py +++ b/tests/test_datasets/test_prism_dem.py @@ -69,7 +69,7 @@ def test_ingest(self, dummy_dir): """Test ingest function.""" ds, _ = self.ingest_dummy_data(dummy_dir) - assert type(ds) == xr.Dataset + assert isinstance(ds, xr.Dataset) def test_load(self, dummy_dir): """Test load function.""" diff --git a/tests/test_recipes/test_simple_recipe.py b/tests/test_recipes/test_simple_recipe.py index a00bc20..e8dfd18 100644 --- a/tests/test_recipes/test_simple_recipe.py +++ b/tests/test_recipes/test_simple_recipe.py @@ -2,6 +2,7 @@ from pathlib import Path from unittest.mock import patch +import dask.distributed import generate_test_data import numpy as np import pytest @@ -12,8 +13,6 @@ from zampy.datasets.dataset_protocol import write_properties_file from zampy.recipe import RecipeManager from zampy.recipe import convert_time -import dask.distributed - RECIPE_FILE = Path(__file__).parent / "recipes" / "era5_recipe.yml" From 1fe26d06c9e98f0655d7e4ef248742fe45269170 Mon Sep 17 00:00:00 2001 From: Bart Schilperoort Date: Fri, 26 Jul 2024 09:22:12 +0200 Subject: [PATCH 12/22] Try dask submit/result for Windows --- tests/test_datasets/test_fapar_lai.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/tests/test_datasets/test_fapar_lai.py b/tests/test_datasets/test_fapar_lai.py index 2e11c8d..0ac135b 100644 --- a/tests/test_datasets/test_fapar_lai.py +++ b/tests/test_datasets/test_fapar_lai.py @@ -87,15 +87,19 @@ def test_download(self, mock_retrieve, valid_path_config, dummy_dir): @pytest.mark.slow def test_ingest(self, dummy_dir): """Test ingest function.""" - dask.distributed.Client() + c = dask.distributed.Client() ingest_dir = Path(dummy_dir) / "ingest" ingest_dir.mkdir() - lai_dataset = FaparLAI() - 
lai_dataset.ingest( - download_dir=data_folder / "fapar-lai" / "download", ingest_dir=ingest_dir - ) + def ingest_lai(): + lai_dataset = FaparLAI() + lai_dataset.ingest( + download_dir=data_folder / "fapar-lai" / "download", ingest_dir=ingest_dir + ) + f = c.submit(ingest_lai) + f.result() + ds = xr.open_mfdataset((ingest_dir / "fapar-lai").glob("*.nc")) assert isinstance(ds, xr.Dataset) From b0908b25115578b6bf778df74821572d9036d1ce Mon Sep 17 00:00:00 2001 From: Bart Schilperoort Date: Fri, 26 Jul 2024 09:37:21 +0200 Subject: [PATCH 13/22] Try to fix windows again --- src/zampy/datasets/fapar_lai.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/zampy/datasets/fapar_lai.py b/src/zampy/datasets/fapar_lai.py index 545bfb6..b716620 100644 --- a/src/zampy/datasets/fapar_lai.py +++ b/src/zampy/datasets/fapar_lai.py @@ -257,6 +257,7 @@ def ingest_ncfile(ncfile: Path, ingest_folder: Path) -> None: path=ingest_folder / ncfile.name, encoding={"leaf_area_index": {"zlib": True, "complevel": 3}}, ) + ds.close() # explicitly close to release file to system (for Windows) def extract_fapar_zip( From f09e2d7cd575934fcb34609597248a722f55e664 Mon Sep 17 00:00:00 2001 From: Bart Schilperoort Date: Fri, 26 Jul 2024 10:04:15 +0200 Subject: [PATCH 14/22] Use context manager to ensure file closure --- tests/test_datasets/test_fapar_lai.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_datasets/test_fapar_lai.py b/tests/test_datasets/test_fapar_lai.py index 0ac135b..504f2d4 100644 --- a/tests/test_datasets/test_fapar_lai.py +++ b/tests/test_datasets/test_fapar_lai.py @@ -100,8 +100,8 @@ def ingest_lai(): f = c.submit(ingest_lai) f.result() - ds = xr.open_mfdataset((ingest_dir / "fapar-lai").glob("*.nc")) - assert isinstance(ds, xr.Dataset) + with xr.open_mfdataset((ingest_dir / "fapar-lai").glob("*.nc")) as ds: + assert isinstance(ds, xr.Dataset) def test_load(self): """Test load function.""" From 8d3875989ed7b24d3d0a8edfb71c127c7a0ca9c0 Mon Sep 
17 00:00:00 2001 From: Bart Schilperoort Date: Fri, 26 Jul 2024 10:35:37 +0200 Subject: [PATCH 15/22] Try unlinking files manually --- tests/test_datasets/test_fapar_lai.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/test_datasets/test_fapar_lai.py b/tests/test_datasets/test_fapar_lai.py index 504f2d4..e9f0db7 100644 --- a/tests/test_datasets/test_fapar_lai.py +++ b/tests/test_datasets/test_fapar_lai.py @@ -103,6 +103,9 @@ def ingest_lai(): with xr.open_mfdataset((ingest_dir / "fapar-lai").glob("*.nc")) as ds: assert isinstance(ds, xr.Dataset) + for file in (ingest_dir / "fapar-lai").glob("*.nc"): + file.unlink() + def test_load(self): """Test load function.""" times = TimeBounds(np.datetime64("2019-01-01"), np.datetime64("2019-01-31")) From ecbdb408ea78de73cbe628f8021c148ee958b5be Mon Sep 17 00:00:00 2001 From: Bart Schilperoort Date: Fri, 26 Jul 2024 10:51:19 +0200 Subject: [PATCH 16/22] Change how tempdir works on windows --- src/zampy/datasets/fapar_lai.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/zampy/datasets/fapar_lai.py b/src/zampy/datasets/fapar_lai.py index b716620..c026a3f 100644 --- a/src/zampy/datasets/fapar_lai.py +++ b/src/zampy/datasets/fapar_lai.py @@ -1,5 +1,6 @@ """Implementation of the FAPAR LAI dataset.""" +import os import shutil import tempfile import zipfile @@ -119,7 +120,10 @@ def ingest( # netCDF files follow CF-1.6, only unpacking the archives is required. 
for file in zip_files: - with tempfile.TemporaryDirectory(dir=tmp_path) as _tmpdir: + with tempfile.TemporaryDirectory( + dir=tmp_path, + ignore_cleanup_errors=True if os.name == "nt" else False, + ) as _tmpdir: tmpdir = Path(_tmpdir) extract_fapar_zip( From f8e0e636c06c13e467a1bd300314076b89130e7d Mon Sep 17 00:00:00 2001 From: Bart Schilperoort Date: Fri, 26 Jul 2024 16:14:32 +0200 Subject: [PATCH 17/22] Allow skipping downloads --- src/zampy/cli.py | 5 +++-- src/zampy/recipe.py | 18 ++++++++++-------- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/src/zampy/cli.py b/src/zampy/cli.py index 485f5b1..c247844 100644 --- a/src/zampy/cli.py +++ b/src/zampy/cli.py @@ -8,10 +8,11 @@ @click.command() @click.argument("recipe", type=click.Path(exists=True, path_type=Path)) -def run_recipe(recipe: Path) -> None: +@click.option('--skip-download', is_flag=True) +def run_recipe(recipe: Path, skip_download: bool) -> None: """Run the recipe using the CLI.""" click.echo(f"Executing recipe: {recipe}") - rm = RecipeManager(recipe) + rm = RecipeManager(recipe, skip_download) rm.run() diff --git a/src/zampy/recipe.py b/src/zampy/recipe.py index b1a0cef..63a360b 100644 --- a/src/zampy/recipe.py +++ b/src/zampy/recipe.py @@ -64,10 +64,11 @@ def config_loader() -> dict: class RecipeManager: """The recipe manager is used to get the required info, and then run the recipe.""" - def __init__(self, recipe_path: Path) -> None: + def __init__(self, recipe_path: Path, skip_download: bool = False) -> None: """Instantiate the recipe manager, using a prepared recipe.""" # Load & parse recipe recipe = recipe_loader(recipe_path) + self.skip_download = skip_download self.start_time, self.end_time = recipe["download"]["time"] self.timebounds = TimeBounds( @@ -114,13 +115,14 @@ def run(self) -> None: dataset = _dataset() variables: list[str] = self.datasets[dataset_name]["variables"] - # Download datset - dataset.download( - download_dir=self.download_dir, - 
time_bounds=self.timebounds, - spatial_bounds=self.spatialbounds, - variable_names=variables, - ) + # Download dataset + if not self.skip_download: + dataset.download( + download_dir=self.download_dir, + time_bounds=self.timebounds, + spatial_bounds=self.spatialbounds, + variable_names=variables, + ) dataset.ingest(self.download_dir, self.ingest_dir) From e206125a632584a26381bb706e4c45a4d40b2762 Mon Sep 17 00:00:00 2001 From: Bart Schilperoort Date: Fri, 26 Jul 2024 16:14:44 +0200 Subject: [PATCH 18/22] Actually run fast tests first --- .github/workflows/build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 9cf85ef..1e1fbb6 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -35,7 +35,7 @@ jobs: run: | python3 -m pip install --upgrade pip hatch - name: Run fast tests first - run: hatch run test + run: hatch run fast-test - name: Run full test suite & coverage run: hatch run test - name: Verify that we can build the package From 6577542281a445bde0949db3c8370d1d246ad11d Mon Sep 17 00:00:00 2001 From: Bart Schilperoort Date: Fri, 26 Jul 2024 16:19:52 +0200 Subject: [PATCH 19/22] Finalize windows fix --- src/zampy/datasets/fapar_lai.py | 1 + tests/test_datasets/test_fapar_lai.py | 16 +++++----------- 2 files changed, 6 insertions(+), 11 deletions(-) diff --git a/src/zampy/datasets/fapar_lai.py b/src/zampy/datasets/fapar_lai.py index c026a3f..fdfef0c 100644 --- a/src/zampy/datasets/fapar_lai.py +++ b/src/zampy/datasets/fapar_lai.py @@ -122,6 +122,7 @@ def ingest( for file in zip_files: with tempfile.TemporaryDirectory( dir=tmp_path, + # cleanup fails on windows. No clear idea on how to fix this. 
ignore_cleanup_errors=True if os.name == "nt" else False, ) as _tmpdir: tmpdir = Path(_tmpdir) diff --git a/tests/test_datasets/test_fapar_lai.py b/tests/test_datasets/test_fapar_lai.py index e9f0db7..44112aa 100644 --- a/tests/test_datasets/test_fapar_lai.py +++ b/tests/test_datasets/test_fapar_lai.py @@ -87,25 +87,19 @@ def test_download(self, mock_retrieve, valid_path_config, dummy_dir): @pytest.mark.slow def test_ingest(self, dummy_dir): """Test ingest function.""" - c = dask.distributed.Client() + dask.distributed.Client() ingest_dir = Path(dummy_dir) / "ingest" ingest_dir.mkdir() - def ingest_lai(): - lai_dataset = FaparLAI() - lai_dataset.ingest( - download_dir=data_folder / "fapar-lai" / "download", ingest_dir=ingest_dir - ) - f = c.submit(ingest_lai) - f.result() + lai_dataset = FaparLAI() + lai_dataset.ingest( + download_dir=data_folder / "fapar-lai" / "download", ingest_dir=ingest_dir + ) with xr.open_mfdataset((ingest_dir / "fapar-lai").glob("*.nc")) as ds: assert isinstance(ds, xr.Dataset) - for file in (ingest_dir / "fapar-lai").glob("*.nc"): - file.unlink() - def test_load(self): """Test load function.""" times = TimeBounds(np.datetime64("2019-01-01"), np.datetime64("2019-01-31")) From 2a447c3e38e722ea1e6a897eb03d503d9864b0ad Mon Sep 17 00:00:00 2001 From: Bart Schilperoort Date: Fri, 26 Jul 2024 16:22:16 +0200 Subject: [PATCH 20/22] Add stemmus scope input data recipe --- recipes/STEMMUS_SCOPE_input.yml | 41 +++++++++++++++++++++++++++++++++ src/zampy/cli.py | 2 +- 2 files changed, 42 insertions(+), 1 deletion(-) create mode 100644 recipes/STEMMUS_SCOPE_input.yml diff --git a/recipes/STEMMUS_SCOPE_input.yml b/recipes/STEMMUS_SCOPE_input.yml new file mode 100644 index 0000000..f296ac0 --- /dev/null +++ b/recipes/STEMMUS_SCOPE_input.yml @@ -0,0 +1,41 @@ +# config (folder, login info etc goes to a ~/.zampy/config file) +name: "STEMMUS_SCOPE_input" + +download: + time: ["2020-01-01", "2020-06-30"] + bbox: [60, 10, 50, 0] # NESW + datasets: + era5_land: 
+ variables: + - air_temperature + - dewpoint_temperature + - soil_temperature + - soil_moisture + era5: + variables: + - total_precipitation + - surface_thermal_radiation_downwards + - surface_solar_radiation_downwards + - surface_pressure + - eastward_component_of_wind + - northward_component_of_wind + eth_canopy_height: + variables: + - height_of_vegetation + fapar_lai: + variables: + - leaf_area_index + land_cover: + variables: + - land_cover + prism_dem_90: + variables: + - elevation + cams: + variables: + - co2_concentration + +convert: + convention: ALMA + frequency: 1H # outputs at 1 hour frequency. Pandas-like freq-keyword. + resolution: 0.25 # output resolution in degrees. diff --git a/src/zampy/cli.py b/src/zampy/cli.py index c247844..d98a34b 100644 --- a/src/zampy/cli.py +++ b/src/zampy/cli.py @@ -8,7 +8,7 @@ @click.command() @click.argument("recipe", type=click.Path(exists=True, path_type=Path)) -@click.option('--skip-download', is_flag=True) +@click.option("--skip-download", is_flag=True) def run_recipe(recipe: Path, skip_download: bool) -> None: """Run the recipe using the CLI.""" click.echo(f"Executing recipe: {recipe}") From 4b44e7562000be6d9e9165febc40f3c63c64ec20 Mon Sep 17 00:00:00 2001 From: Bart Schilperoort Date: Mon, 29 Jul 2024 10:27:11 +0200 Subject: [PATCH 21/22] Fix failing tests: segfault occurred due to data not being available --- tests/test_datasets/test_fapar_lai.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/test_datasets/test_fapar_lai.py b/tests/test_datasets/test_fapar_lai.py index 44112aa..52e6bf7 100644 --- a/tests/test_datasets/test_fapar_lai.py +++ b/tests/test_datasets/test_fapar_lai.py @@ -100,6 +100,7 @@ def test_ingest(self, dummy_dir): with xr.open_mfdataset((ingest_dir / "fapar-lai").glob("*.nc")) as ds: assert isinstance(ds, xr.Dataset) + @pytest.mark.slow # depends on ingested data being available def test_load(self): """Test load function.""" times = TimeBounds(np.datetime64("2019-01-01"), 
np.datetime64("2019-01-31")) @@ -123,6 +124,7 @@ def test_load(self): np.testing.assert_allclose(ds.latitude.values, expected_lat) np.testing.assert_allclose(ds.longitude.values, expected_lon) + @pytest.mark.slow # depends on ingested data being available def test_convert(self): """Test convert function.""" lai_dataset = FaparLAI() From 690694be62e2f474cc597aac3ea7782e195e9c07 Mon Sep 17 00:00:00 2001 From: Bart Schilperoort Date: Mon, 29 Jul 2024 10:33:25 +0200 Subject: [PATCH 22/22] Apparently Windows dislikes single quotes ' --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 6f38503..f742cc9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -106,7 +106,7 @@ format = [ "ruff check src/ tests/ --fix --exit-non-zero-on-fix", "lint", ] -fast-test = ["pytest -m 'not slow'"] +fast-test = ["pytest -m \"not slow\""] test = [ "pytest ./src/zampy/ ./tests/ --doctest-modules --doctest-ignore-import-errors", ]