Implement regridding for Dataset.load() (#9)
* Add regridding as part of the `load` method.

* Correct convention type, add convention Path option

* Updated example recipe (based on demo comments)

* Test hatch-conda on CI

* Specify ci shells for run

* Debug with "micromamba info"

* Better xarray regrid w/ flox. Make ESMF optional.

* Fix CI errors

* Correct regrid docstring

* Add regrid method to dataset.load

* Typing modifications for 3.8, 3.9 compat.

* Ignore import errors in Pytest's doctest

* Update demo notebook

* Add note on rechunking
BSchilperoort committed Jun 29, 2023
1 parent 7199edf commit 54439a0
Showing 11 changed files with 775 additions and 411 deletions.
39 changes: 39 additions & 0 deletions .github/workflows/mamba.yml
@@ -0,0 +1,39 @@
name: Micromamba

on:
push:
branches:
- main
pull_request:
branches:
- main

jobs:

build:
name: Micromamba test on (${{ matrix.python-version }}, ${{ matrix.os }})
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
os: ['ubuntu-latest']
python-version: ['3.10']
env:
MPLBACKEND: Agg # https://github.com/orgs/community/discussions/26434
steps:
- uses: actions/checkout@v3
- uses: mamba-org/setup-micromamba@v1
with:
environment-file: environment.yml
cache-environment: true
init-shell: bash
- name: Micromamba info
shell: bash -el {0}
run: |
micromamba info
- name: Install dev dependencies
run: pip install .[dev]
shell: bash -el {0}
- name: Run pytest
run: pytest
shell: micromamba-shell {0}
20 changes: 20 additions & 0 deletions environment.yml
@@ -0,0 +1,20 @@
---
name: zampy
channels:
- conda-forge
dependencies:
- python==3.10
- xESMF
- requests
- netcdf4
- numpy
- pandas
- matplotlib
- xarray
- rioxarray # required for TIFF files
- tqdm
- dask[diagnostics]
- pint
- cf_xarray # required to auto-pint CF compliant datasets
- pint-xarray
- flox
755 changes: 371 additions & 384 deletions eth_dataset_demo.ipynb

Large diffs are not rendered by default.

11 changes: 4 additions & 7 deletions example_recipe.yml
@@ -1,23 +1,21 @@
# config (folder, login info etc goes to a ~/.zampy/config file)
download:
folder: /home/bart/Data/zampy/test/
years: [1980, 2020]
bbox: [3, 50, 6, 54]

datasets:
era5-land:
frequency: hourly
era5-land-hourly:
variables:
- air_temperature # will map to 2m_temperature...
- height_m: 2 # optional extra argument
- dewpoint_temperature
- height_m: 2

convert:
standard: ALMA
flavor: PLUMBER2 # More specified than ALMA.
folder: /home/bart/Data/zampy/output/
standard: ALMA-PLUMBER2
frequency: 1H # outputs at 1 hour frequency. Pandas-like freq-keyword.
resolution: 0.01 # output resolution in degrees.
conversion-method: "flox" # Either flox or xesmf. xesmf requires conda + linux.

additional_variables: # Possible future addition
saturation_vapor_pressure:
@@ -32,7 +30,6 @@ collections:
area: [3, 50, 6, 54]

dataset: era5-land
frequency: hourly
variables:
- air_temperature

15 changes: 14 additions & 1 deletion pyproject.toml
@@ -57,9 +57,11 @@ dependencies = [
"rioxarray", # required for TIFF files
"tqdm",
"dask[diagnostics]",
"distributed", # dask-distributed
"pint",
"cf_xarray", # required to auto-pint CF compliant datasets.
"pint-xarray",
"flox", # For better groupby methods.
]
dynamic = ["version"]

@@ -74,6 +76,7 @@ dev = [
"types-urllib3", # type stubs for url lib
"pytest",
"pytest-cov",
"pre-commit",
]

[tool.hatch.envs.default]
@@ -90,11 +93,21 @@ format = [
"ruff check src/ tests/ --fix --exit-non-zero-on-fix",
"lint",
]
test = ["pytest ./src/zampy/ ./tests/ --doctest-modules",]
test = ["pytest ./src/zampy/ ./tests/ --doctest-modules --doctest-ignore-import-errors",]
coverage = [
"pytest --cov --cov-report term --cov-report xml --junitxml=xunit-result.xml tests/",
]

# [tool.hatch.envs.conda]
# type = "conda"
# python = "3.10"
# command = "micromamba"
# environment-file = "environment.yml"
# extra-dependencies = ["pytest", "pytest-cov"]

# [tool.hatch.envs.conda.scripts]
# test = ["pytest ./tests/",]

[tool.pytest.ini_options]
testpaths = ["tests"]

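The new `--doctest-ignore-import-errors` flag and the commented-out hatch-conda environment exist because xESMF is now an optional, conda-only dependency. A minimal sketch of how such an optional import can be guarded (not code from this diff; the XESMF_AVAILABLE flag and regrid_with_esmf helper are illustrative):

import xarray as xr

try:
    import xesmf  # heavy optional dependency, normally installed from conda-forge

    XESMF_AVAILABLE = True
except ImportError:
    XESMF_AVAILABLE = False


def regrid_with_esmf(ds: xr.Dataset, target_grid: xr.Dataset) -> xr.Dataset:
    """Regrid `ds` onto `target_grid` with xESMF, or fail with a clear message."""
    if not XESMF_AVAILABLE:
        raise ImportError(
            "xESMF is not installed; install it from conda-forge "
            "or use the 'flox' regridding method instead."
        )
    regridder = xesmf.Regridder(ds, target_grid, method="conservative")
    return regridder(ds)

With this pattern, importing the module (and collecting its doctests) succeeds in a pip-only environment, while the esmf code path fails loudly only when actually requested.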
5 changes: 3 additions & 2 deletions src/zampy/__init__.py
@@ -1,10 +1,11 @@
"""zampy."""
from . import datasets
from zampy import datasets
from zampy import utils


__author__ = "Bart Schilperoort"
__email__ = "[email protected]"
__version__ = "0.1.0"


__all__ = ["datasets"]
__all__ = ["datasets", "utils"]
23 changes: 15 additions & 8 deletions src/zampy/datasets/converter.py
@@ -14,16 +14,23 @@
}


def check_convention(convention: str) -> None:
def check_convention(convention: Union[str, Path]) -> None:
"""Check if the given convention is supported."""
if convention.upper() not in CONVENTIONS:
raise ValueError(
f"The '{convention}' convention is not supported.\n"
"Please check the available conventions in the `conventions` "
"directory and choose one from there."
)
if isinstance(convention, str):
if convention.upper() not in CONVENTIONS:
raise ValueError(
f"The '{convention}' convention is not supported.\n"
"Please check the available conventions in the `conventions` "
"directory and choose one from there."
)
else:
print(f"Starting data conversion to the '{convention}' convention.")
else:
print(f"Start converting data to follow the '{convention}' convention.")
if not convention.exists():
raise FileNotFoundError(
f"Convention file '{convention}' could not be found."
)
print(f"Starting data conversion to the convention defined in '{convention}'")


def convert(
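A short usage sketch of the updated check_convention (the custom-convention file path is made up for illustration; built-in convention names such as "ALMA" come from the CONVENTIONS mapping above):

from pathlib import Path

from zampy.datasets import converter

# Built-in convention: matched case-insensitively against CONVENTIONS.
converter.check_convention("ALMA")

# Custom convention file: its existence is verified before conversion starts.
converter.check_convention(Path("conventions/my_custom_convention.json"))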
10 changes: 9 additions & 1 deletion src/zampy/datasets/dataset_protocol.py
@@ -131,11 +131,13 @@ def ingest(
...

@abstractmethod
def load(
def load( # noqa: PLR0913
self,
ingest_dir: Path,
time_bounds: TimeBounds,
spatial_bounds: SpatialBounds,
resolution: float,
regrid_method: str,
variable_names: List[str],
) -> xr.Dataset:
"""Get the dataset as an xarray Dataset.
@@ -145,6 +147,12 @@ def load(
time_bounds: The start and end time of the data that should be loaded.
spatial_bounds: The lat/lon bounding box for which the data should be
loaded.
resolution: The desired resolution of the loaded data. The ingested data
will be regridded to match this resolution.
regrid_method: Which routine to use for resampling: either "flox" (default) or
"esmf". Of the two, esmf is the more robust and accurate regridding
method, but it can be difficult to install.
variable_names: Which variables should be loaded.
Returns:
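A hedged sketch of calling load with the new arguments on a concrete dataset (the class name, bounds field names, and variable name are assumptions based on the rest of the repository, not guaranteed by this diff):

from pathlib import Path

import numpy as np

from zampy.datasets.dataset_protocol import SpatialBounds, TimeBounds
from zampy.datasets.eth_canopy_height import EthCanopyHeight  # class name assumed

dataset = EthCanopyHeight()
ds = dataset.load(
    ingest_dir=Path.home() / "Data" / "zampy" / "ingest",
    time_bounds=TimeBounds(
        start=np.datetime64("2020-01-01"), end=np.datetime64("2020-12-31")
    ),
    spatial_bounds=SpatialBounds(north=54, east=6, south=50, west=3),
    resolution=0.05,  # target grid spacing in degrees
    regrid_method="flox",  # pure-Python default; "esmf" requires xESMF
    variable_names=["height_of_vegetation"],  # illustrative variable name
)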
26 changes: 18 additions & 8 deletions src/zampy/datasets/eth_canopy_height.py
@@ -2,6 +2,7 @@
import gzip
from pathlib import Path
from typing import List
from typing import Union
import numpy as np
import xarray as xr
from zampy.datasets import converter
@@ -15,6 +16,7 @@
from zampy.datasets.dataset_protocol import write_properties_file
from zampy.reference.variables import VARIABLE_REFERENCE_LOOKUP
from zampy.reference.variables import unit_registry
from zampy.utils import regrid


VALID_NAME_FILE = (
@@ -119,11 +121,13 @@ def ingest(

return True

def load(
def load( # noqa: PLR0913
self,
ingest_dir: Path,
time_bounds: TimeBounds,
spatial_bounds: SpatialBounds,
resolution: float,
regrid_method: str,
variable_names: List[str],
) -> xr.Dataset:
files: List[Path] = []
@@ -133,17 +137,14 @@ def load(
files += (ingest_dir / self.name).glob("*Map_SD.nc")

ds = xr.open_mfdataset(files, chunks={"latitude": 2000, "longitude": 2000})
ds = ds.sel(
latitude=slice(spatial_bounds.south, spatial_bounds.north),
longitude=slice(spatial_bounds.west, spatial_bounds.east),
time=slice(time_bounds.start, time_bounds.end),
)
ds = ds.sel(time=slice(time_bounds.start, time_bounds.end))
ds = regrid.regrid_data(ds, spatial_bounds, resolution, regrid_method)
return ds

def convert(
self,
ingest_dir: Path,
convention: str,
convention: Union[str, Path],
) -> bool:
converter.check_convention(convention)
ingest_folder = ingest_dir / self.name
@@ -237,6 +238,13 @@ def convert_tiff_to_netcdf(
print(f"File '{ncfile.name}' already exists, skipping...")
else:
ds = parse_tiff_file(file, sd_file)

# Coarsen the data to 1/100 deg resolution instead of the native 1/12000 deg
ds = ds.coarsen({"latitude": 120, "longitude": 120}).mean() # type: ignore
ds = ds.compute()
ds = ds.interpolate_na(dim="longitude", limit=1)
ds = ds.interpolate_na(dim="latitude", limit=1)

ds.to_netcdf(
path=ncfile,
encoding=ds.encoding,
@@ -259,7 +267,9 @@ def parse_tiff_file(file: Path, sd_file: bool = False) -> xr.Dataset:
da = da.isel(band=0) # get rid of band dim
da = da.drop_vars(["band", "spatial_ref"]) # drop unnecessary coords
ds = da.to_dataset()
ds = ds.assign_coords({"time": np.datetime64("2020-07-01")}) # halfway in the year
ds = ds.assign_coords( # halfway in the year
{"time": np.datetime64("2020-07-01").astype("datetime64[ns]")}
)
ds = ds.expand_dims("time")
ds = ds.rename(
{
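The coarsening block added to convert_tiff_to_netcdf averages the native 1/12000-degree TIFF grid down by a factor of 120 before the NetCDF is written, then interpolates over single-cell gaps. A self-contained sketch of the same pattern on synthetic data (sizes shrunk so it runs in memory; the variable name is illustrative):

import numpy as np
import xarray as xr

# Synthetic high-resolution tile standing in for a parsed canopy-height TIFF.
lat = np.linspace(51.0, 52.0, 1200)
lon = np.linspace(5.0, 6.0, 1200)
data = np.random.default_rng(0).random((1200, 1200))
data[0, :10] = np.nan  # a few gaps, as real tiles have
ds = xr.Dataset(
    {"height_of_vegetation": (("latitude", "longitude"), data)},
    coords={"latitude": lat, "longitude": lon},
)

# Average 120x120 blocks into single cells, then interpolate over 1-cell gaps.
ds = ds.coarsen({"latitude": 120, "longitude": 120}).mean()
ds = ds.compute()
ds = ds.interpolate_na(dim="longitude", limit=1)
ds = ds.interpolate_na(dim="latitude", limit=1)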