EcoExtreML · SarahAlidoost · Aug 19, 2024 · Feb 8, 2024 · Feb 9, 2024 · Jun 14, 2024
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -35,7 +35,7 @@ jobs:
         run: |
           python3 -m pip install --upgrade pip hatch
       - name: Run fast tests first
-        run: hatch run test
+        run: hatch run fast-test
       - name: Run full test suite & coverage
         run: hatch run test
       - name: Verify that we can build the package

diff --git a/pyproject.toml b/pyproject.toml
@@ -106,7 +106,7 @@ format = [
   "ruff check src/ tests/ --fix --exit-non-zero-on-fix",
   "lint",
 ]
-fast-test = ["pytest -m 'not slow'"]
+fast-test = ["pytest -m \"not slow\""]
 test = [
   "pytest ./src/zampy/ ./tests/ --doctest-modules --doctest-ignore-import-errors",
 ]
@@ -131,10 +131,11 @@ markers = [
 ignore_missing_imports = true
 disallow_untyped_defs = true
 python_version = "3.10"
+exclude = "tests"
 
 [tool.ruff]
 line-length = 88
-exclude = ["docs", "build"]
+exclude = ["docs", "build", "tests"]
 target-version = "py310"
 
 [tool.ruff.lint]

diff --git a/recipes/STEMMUS_SCOPE_input.yml b/recipes/STEMMUS_SCOPE_input.yml
@@ -0,0 +1,41 @@
+# Config (folders, login info, etc.) goes in a ~/.zampy/config file.
+name: "STEMMUS_SCOPE_input"
+
+download:
+  time: ["2020-01-01", "2020-06-30"]
+  bbox: [60, 10, 50, 0]  # order: north, east, south, west (NESW)
+  datasets:
+    era5_land:
+      variables:
+        - air_temperature
+        - dewpoint_temperature
+        - soil_temperature
+        - soil_moisture
+    era5:
+      variables:
+        - total_precipitation
+        - surface_thermal_radiation_downwards
+        - surface_solar_radiation_downwards
+        - surface_pressure
+        - eastward_component_of_wind
+        - northward_component_of_wind
+    eth_canopy_height:
+      variables:
+        - height_of_vegetation
+    fapar_lai:
+      variables:
+        - leaf_area_index
+    land_cover:
+      variables:
+        - land_cover
+    prism_dem_90:
+      variables:
+        - elevation
+    cams:
+      variables:
+        - co2_concentration
+
+convert:
+  convention: ALMA
+  frequency: 1H  # output at a 1-hour frequency (pandas-style frequency string).
+  resolution: 0.25  # output resolution in degrees.
diff --git a/src/zampy/__init__.py b/src/zampy/__init__.py
@@ -1,4 +1,5 @@
 """zampy."""
+
 from zampy import datasets
 
 

diff --git a/src/zampy/cli.py b/src/zampy/cli.py
@@ -1,17 +1,21 @@
 """Implements CLI interface for Zampy."""
+
 from pathlib import Path
 import click
+import dask.distributed
 from zampy.recipe import RecipeManager
 
 
 @click.command()
 @click.argument("recipe", type=click.Path(exists=True, path_type=Path))
-def run_recipe(recipe: Path) -> None:
+@click.option("--skip-download", is_flag=True)
+def run_recipe(recipe: Path, skip_download: bool) -> None:
     """Run the recipe using the CLI."""
     click.echo(f"Executing recipe: {recipe}")
-    rm = RecipeManager(recipe)
+    rm = RecipeManager(recipe, skip_download)
     rm.run()
 
 
 if __name__ == "__main__":
+    dask.distributed.Client()
     run_recipe()
diff --git a/src/zampy/conventions/ALMA.json b/src/zampy/conventions/ALMA.json
@@ -84,5 +84,13 @@
     "land_cover": {
         "variable": "land_cover",
         "units": ""
+    },
+    "soil_temperature": {
+        "variable": "SoilTemp",
+        "units": "kelvin"
+    },
+    "soil_moisture": {
+        "variable": "SoilMoist",
+        "units": "kilogram/meter**3"
     }
 }
diff --git a/src/zampy/datasets/__init__.py b/src/zampy/datasets/__init__.py
@@ -1,4 +1,5 @@
 """Datasets implementations."""
+
 from zampy.datasets import dataset_protocol
 from zampy.datasets import validation
 from zampy.datasets.catalog import DATASETS

diff --git a/src/zampy/datasets/catalog.py b/src/zampy/datasets/catalog.py
@@ -1,4 +1,5 @@
 """Catalog of datasets."""
+
 from zampy.datasets import dataset_protocol
 from zampy.datasets.cams import CAMS
 from zampy.datasets.era5 import ERA5

diff --git a/src/zampy/datasets/cds_utils.py b/src/zampy/datasets/cds_utils.py
@@ -1,5 +1,6 @@
 """CDS utilities used by ECMWF datasets."""
 
+from copy import copy
 from pathlib import Path
 import cdsapi
 import numpy as np
@@ -44,6 +45,21 @@
     "21:00", "22:00", "23:00",
 ]  # fmt: skip
 
+SPLIT_VARIABLES = {
+    "soil_temperature": (
+        "soil_temperature_level_1",
+        "soil_temperature_level_2",
+        "soil_temperature_level_3",
+        "soil_temperature_level_4",
+    ),
+    "soil_moisture": (
+        "volumetric_soil_water_layer_1",
+        "volumetric_soil_water_layer_2",
+        "volumetric_soil_water_layer_3",
+        "volumetric_soil_water_layer_4",
+    ),
+}
+
 
 def cds_request(
     dataset: str,
@@ -226,6 +242,12 @@ def retrieve_era5(
     # create list of year/month pairs
     year_month_pairs = time_bounds_to_year_month(time_bounds)
 
+    variables = copy(variables)  # Prevent original input from being modified in-place
+    for split_var in SPLIT_VARIABLES:
+        if split_var in variables:
+            variables.remove(split_var)
+            variables.extend(SPLIT_VARIABLES[split_var])
+
     for (year, month), variable in product(
         year_month_pairs, variables, position=0, leave=True
     ):
@@ -354,7 +376,8 @@ def convert_to_zampy(
         print(f"File '{ncfile.name}' already exists, skipping...")
     else:
         ds = parse_nc_file(file)
-
+        # Store volumetric soil water (swvl) files under the "soil_moisture" name:
+        ncfile = Path(str(ncfile).replace("volumetric_soil_water", "soil_moisture"))
         ds.to_netcdf(path=ncfile)
 
 
@@ -373,6 +396,28 @@ def convert_to_zampy(
     "co2": "co2_concentration",
 }
 
+VAR_REFERENCE_MULTI_LAYER = {
+    "stl1": "soil_temperature",
+    "stl2": "soil_temperature",
+    "stl3": "soil_temperature",
+    "stl4": "soil_temperature",
+    "swvl1": "soil_moisture",
+    "swvl2": "soil_moisture",
+    "swvl3": "soil_moisture",
+    "swvl4": "soil_moisture",
+}
+
+LAYER_BOUNDS = {
+    "stl1": [[0.0, 7.0]],
+    "stl2": [[7.0, 28.0]],
+    "stl3": [[28.0, 100.0]],
+    "stl4": [[100.0, 289.0]],
+    "swvl1": [[0.0, 7.0]],
+    "swvl2": [[7.0, 28.0]],
+    "swvl3": [[28.0, 100.0]],
+    "swvl4": [[100.0, 289.0]],
+}
+
 WATER_DENSITY = 997.0  # kg/m3
 
 
@@ -416,6 +461,33 @@ def parse_nc_file(file: Path) -> xr.Dataset:
                 variable_name
             ].desc
 
+        if variable in VAR_REFERENCE_MULTI_LAYER:
+            if (  # Soil temperature/moisture routine
+                str(variable).startswith("stl") or str(variable).startswith("swvl")
+            ):
+                if str(variable).startswith("swvl"):
+                    varname = "soil_moisture"
+                    standard_name = "moisture_content_of_soil_layer"
+                    ds[variable] *= WATER_DENSITY
+                    ds[variable].attrs.update({"units": "kg m**-3"})
+                else:
+                    varname = "soil_temperature"
+                    standard_name = "temperature_in_ground"
+
+                da = ds[variable]
+                name = str(da.name)
+                da = da.expand_dims({"depth": [np.mean(LAYER_BOUNDS[name])]})
+                da = da.rename(varname)
+                da.attrs.update(
+                    {
+                        "long_name": varname.replace("_", " "),
+                        "standard_name": standard_name,
+                    }
+                )
+
+                ds = da.to_dataset()
+                ds["depth_bounds"] = (("depth", "nv"), LAYER_BOUNDS[name])
+
     # TODO: add dataset attributes.
 
     return ds
diff --git a/src/zampy/datasets/converter.py b/src/zampy/datasets/converter.py
@@ -1,4 +1,5 @@
 """Dataset formatter for different conventions."""
+
 import json
 import warnings
 from pathlib import Path

diff --git a/src/zampy/datasets/dataset_protocol.py b/src/zampy/datasets/dataset_protocol.py
@@ -1,4 +1,5 @@
 """Outline of the dataset protocol."""
+
 import json
 import shutil
 from dataclasses import dataclass

diff --git a/src/zampy/datasets/era5.py b/src/zampy/datasets/era5.py
@@ -47,15 +47,30 @@ class ERA5Land(ECMWFDataset):  # noqa: D101
     raw_variables = [
         Variable(name="t2m", unit=unit_registry.kelvin),
         Variable(name="d2m", unit=unit_registry.kelvin),
+        Variable(name="st", unit=unit_registry.kelvin),
+        Variable(name="swvl", unit=unit_registry.fraction),
     ]
 
     # variable names used in cdsapi downloading request
     cds_var_names = {
         "air_temperature": "2m_temperature",
         "dewpoint_temperature": "2m_dewpoint_temperature",
+        "soil_temperature_level_1": "soil_temperature_level_1",  # Note: split variables
+        "soil_temperature_level_2": "soil_temperature_level_2",
+        "soil_temperature_level_3": "soil_temperature_level_3",
+        "soil_temperature_level_4": "soil_temperature_level_4",
+        "volumetric_soil_water_layer_1": "volumetric_soil_water_layer_1",
+        "volumetric_soil_water_layer_2": "volumetric_soil_water_layer_2",
+        "volumetric_soil_water_layer_3": "volumetric_soil_water_layer_3",
+        "volumetric_soil_water_layer_4": "volumetric_soil_water_layer_4",
     }
 
-    variable_names = list(cds_var_names.keys())
+    variable_names = [
+        "air_temperature",
+        "dewpoint_temperature",
+        "soil_temperature",
+        "soil_moisture",
+    ]
 
     variables = [VARIABLE_REFERENCE_LOOKUP[var] for var in variable_names]
 

diff --git a/src/zampy/datasets/eth_canopy_height.py b/src/zampy/datasets/eth_canopy_height.py
@@ -1,4 +1,5 @@
 """ETH canopy height dataset."""
+
 import gzip
 from pathlib import Path
 import numpy as np
@@ -269,10 +270,17 @@ def parse_tiff_file(file: Path, sd_file: bool = False) -> xr.Dataset:
     da = da.isel(band=0)  # get rid of band dim
     da = da.drop_vars(["band", "spatial_ref"])  # drop unnecessary coords
     ds = da.to_dataset()
-    ds = ds.assign_coords(  # halfway in the year
-        {"time": np.datetime64("2020-07-01").astype("datetime64[ns]")}
+    ds = xr.concat(  # Cover entirety of 2020
+        (
+            ds.assign_coords(
+                {"time": np.datetime64("2020-01-01").astype("datetime64[ns]")}
+            ),
+            ds.assign_coords(
+                {"time": np.datetime64("2021-01-01").astype("datetime64[ns]")}
+            ),
+        ),
+        dim="time",
     )
-    ds = ds.expand_dims("time")
     ds = ds.rename(
         {
             "band_data": "height_of_vegetation_standard_deviation"

diff --git a/src/zampy/datasets/fapar_lai.py b/src/zampy/datasets/fapar_lai.py
@@ -1,5 +1,6 @@
 """Implementation of the FAPAR LAI dataset."""
 
+import os
 import shutil
 import tempfile
 import zipfile
@@ -119,7 +120,11 @@ def ingest(
 
         # netCDF files follow CF-1.6, only unpacking the archives is required.
         for file in zip_files:
-            with tempfile.TemporaryDirectory(dir=tmp_path) as _tmpdir:
+            with tempfile.TemporaryDirectory(
+                dir=tmp_path,
+                # Cleanup fails on Windows and no fix is known, so ignore errors there.
+                ignore_cleanup_errors=True if os.name == "nt" else False,
+            ) as _tmpdir:
                 tmpdir = Path(_tmpdir)
 
                 extract_fapar_zip(
@@ -257,6 +262,7 @@ def ingest_ncfile(ncfile: Path, ingest_folder: Path) -> None:
         path=ingest_folder / ncfile.name,
         encoding={"leaf_area_index": {"zlib": True, "complevel": 3}},
     )
+    ds.close()  # explicitly close to release file to system (for Windows)
 
 
 def extract_fapar_zip(

diff --git a/src/zampy/datasets/prism_dem.py b/src/zampy/datasets/prism_dem.py
@@ -1,4 +1,5 @@
 """Prism DEM dataset."""
+
 import gzip
 import tarfile
 from pathlib import Path

diff --git a/src/zampy/datasets/utils.py b/src/zampy/datasets/utils.py
@@ -1,4 +1,5 @@
 """Shared utilities from datasets."""
+
 import urllib.request
 from pathlib import Path
 import requests

diff --git a/src/zampy/datasets/validation.py b/src/zampy/datasets/validation.py
@@ -1,4 +1,5 @@
 """Checks for user input validation."""
+
 from pathlib import Path
 from zampy.datasets.dataset_protocol import Dataset
 from zampy.datasets.dataset_protocol import SpatialBounds