Implement regridding for Dataset.load() (#9)
* Add regridding as part of the `load` method.

* Correct convention type, add convention Path option

* Updated example recipe (based on demo comments)

* Test hatch-conda on CI

* Specify ci shells for run

* Debug with "micromamba info"

* Better xarray regrid w/ flox. Make ESMF optional.

* Fix CI errors

* Correct regrid docstring

* Add regrid method to dataset.load

* Typing modifications for 3.8, 3.9 compat.

* Ignore import errors in Pytest's doctest

* Update demo notebook

* Add note on rechunking
BSchilperoort committed Jun 29, 2023
1 parent 7199edf commit 54439a0
Showing 11 changed files with 775 additions and 411 deletions.
39 changes: 39 additions & 0 deletions .github/workflows/mamba.yml
@@ -0,0 +1,39 @@
name: Micromamba

on:
push:
branches:
- main
pull_request:
branches:
- main

jobs:

build:
name: Micromamba test on (${{ matrix.python-version }}, ${{ matrix.os }})
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
os: ['ubuntu-latest']
python-version: ['3.10']
env:
MPLBACKEND: Agg # https://github.com/orgs/community/discussions/26434
steps:
- uses: actions/checkout@v3
- uses: mamba-org/setup-micromamba@v1
with:
environment-file: environment.yml
cache-environment: true
init-shell: bash
- name: Micromamba info
shell: bash -el {0}
run: |
micromamba info
- name: Install dev dependencies
run: pip install .[dev]
shell: bash -el {0}
- name: Run pytest
run: pytest
shell: micromamba-shell {0}
20 changes: 20 additions & 0 deletions environment.yml
@@ -0,0 +1,20 @@
---
name: zampy
channels:
- conda-forge
dependencies:
- python==3.10
- xESMF
- requests
- netcdf4
- numpy
- pandas
- matplotlib
- xarray
- rioxarray # required for TIFF files
- tqdm
- dask[diagnostics]
- pint
- cf_xarray # required to auto-pint CF compliant datasets
- pint-xarray
- flox
755 changes: 371 additions & 384 deletions eth_dataset_demo.ipynb

Large diffs are not rendered by default.

11 changes: 4 additions & 7 deletions example_recipe.yml
@@ -1,23 +1,21 @@
# config (folder, login info etc goes to a ~/.zampy/config file)
download:
folder: /home/bart/Data/zampy/test/
years: [1980, 2020]
bbox: [3, 50, 6, 54]

datasets:
era5-land:
frequency: hourly
era5-land-hourly:
variables:
- air_temperature # will map to 2m_temperature...
- height_m: 2 # optional extra argument
- dewpoint_temperature
- height_m: 2

convert:
standard: ALMA
flavor: PLUMBER2 # More specified than ALMA.
folder: /home/bart/Data/zampy/output/
standard: ALMA-PLUMBER2
frequency: 1H # outputs at 1 hour frequency. Pandas-like freq-keyword.
resolution: 0.01 # output resolution in degrees.
conversion-method: "flox" # Either flox or xesmf. xesmf requires conda + linux.

additional_variables: # Possible future addition
saturation_vapor_pressure:
@@ -32,7 +30,6 @@ collections:
area: [3, 50, 6, 54]

dataset: era5-land
frequency: hourly
variables:
- air_temperature

15 changes: 14 additions & 1 deletion pyproject.toml
@@ -57,9 +57,11 @@ dependencies = [
"rioxarray", # required for TIFF files
"tqdm",
"dask[diagnostics]",
"distributed", # dask-distributed
"pint",
"cf_xarray", # required to auto-pint CF compliant datasets.
"pint-xarray",
"flox", # For better groupby methods.
]
dynamic = ["version"]

@@ -74,6 +76,7 @@ dev = [
"types-urllib3", # type stubs for url lib
"pytest",
"pytest-cov",
"pre-commit",
]

[tool.hatch.envs.default]
@@ -90,11 +93,21 @@ format = [
"ruff check src/ tests/ --fix --exit-non-zero-on-fix",
"lint",
]
test = ["pytest ./src/zampy/ ./tests/ --doctest-modules",]
test = ["pytest ./src/zampy/ ./tests/ --doctest-modules --doctest-ignore-import-errors",]
coverage = [
"pytest --cov --cov-report term --cov-report xml --junitxml=xunit-result.xml tests/",
]

# [tool.hatch.envs.conda]
# type = "conda"
# python = "3.10"
# command = "micromamba"
# environment-file = "environment.yml"
# extra-dependencies = ["pytest", "pytest-cov"]

# [tool.hatch.envs.conda.scripts]
# test = ["pytest ./tests/",]

[tool.pytest.ini_options]
testpaths = ["tests"]

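The new `--doctest-ignore-import-errors` flag and the commented-out hatch-conda environment exist because xESMF is now an optional, conda-only dependency. A minimal sketch of how such an optional import can be guarded (not code from this diff; the XESMF_AVAILABLE flag and regrid_with_esmf helper are illustrative):

import xarray as xr

try:
    import xesmf  # heavy optional dependency, normally installed from conda-forge

    XESMF_AVAILABLE = True
except ImportError:
    XESMF_AVAILABLE = False


def regrid_with_esmf(ds: xr.Dataset, target_grid: xr.Dataset) -> xr.Dataset:
    """Regrid `ds` onto `target_grid` with xESMF, or fail with a clear message."""
    if not XESMF_AVAILABLE:
        raise ImportError(
            "xESMF is not installed; install it from conda-forge "
            "or use the 'flox' regridding method instead."
        )
    regridder = xesmf.Regridder(ds, target_grid, method="conservative")
    return regridder(ds)

With this pattern, importing the module (and collecting its doctests) succeeds in a pip-only environment, while the esmf code path fails loudly only when actually requested.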
5 changes: 3 additions & 2 deletions src/zampy/__init__.py
@@ -1,10 +1,11 @@
"""zampy."""
from . import datasets
from zampy import datasets
from zampy import utils


__author__ = "Bart Schilperoort"
__email__ = "[email protected]"
__version__ = "0.1.0"


__all__ = ["datasets"]
__all__ = ["datasets", "utils"]
23 changes: 15 additions & 8 deletions src/zampy/datasets/converter.py
@@ -14,16 +14,23 @@
}


def check_convention(convention: str) -> None:
def check_convention(convention: Union[str, Path]) -> None:
"""Check if the given convention is supported."""
if convention.upper() not in CONVENTIONS:
raise ValueError(
f"The '{convention}' convention is not supported.\n"
"Please check the available conventions in the `conventions` "
"directory and choose one from there."
)
if isinstance(convention, str):
if convention.upper() not in CONVENTIONS:
raise ValueError(
f"The '{convention}' convention is not supported.\n"
"Please check the available conventions in the `conventions` "
"directory and choose one from there."
)
else:
print(f"Starting data conversion to the '{convention}' convention.")
else:
print(f"Start converting data to follow the '{convention}' convention.")
if not convention.exists():
raise FileNotFoundError(
f"Convention file '{convention}' could not be found."
)
print(f"Starting data conversion to the convention defined in '{convention}'")


def convert(
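A short usage sketch of the updated check_convention (the custom-convention file path is made up for illustration; built-in convention names such as "ALMA" come from the CONVENTIONS mapping above):

from pathlib import Path

from zampy.datasets import converter

# Built-in convention: matched case-insensitively against CONVENTIONS.
converter.check_convention("ALMA")

# Custom convention file: its existence is verified before conversion starts.
converter.check_convention(Path("conventions/my_custom_convention.json"))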
10 changes: 9 additions & 1 deletion src/zampy/datasets/dataset_protocol.py
@@ -131,11 +131,13 @@ def ingest(
...

@abstractmethod
def load(
def load( # noqa: PLR0913
self,
ingest_dir: Path,
time_bounds: TimeBounds,
spatial_bounds: SpatialBounds,
resolution: float,
regrid_method: str,
variable_names: List[str],
) -> xr.Dataset:
"""Get the dataset as an xarray Dataset.
@@ -145,6 +147,12 @@ def load(
time_bounds: The start and end time of the data that should be loaded.
spatial_bounds: The lat/lon bounding box for which the data should be
loaded.
resolution: The desired resolution of the loaded data. The ingested data
will be regridded to match this resolution.
regrid_method: Which routine to use for resampling: either "flox" (default) or
"esmf". Of the two, esmf is the more robust and accurate regridding
method, but it can be difficult to install.
variable_names: Which variables should be loaded.
Returns:
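A hedged sketch of calling load with the new arguments on a concrete dataset (the class name, bounds field names, and variable name are assumptions based on the rest of the repository, not guaranteed by this diff):

from pathlib import Path

import numpy as np

from zampy.datasets.dataset_protocol import SpatialBounds, TimeBounds
from zampy.datasets.eth_canopy_height import EthCanopyHeight  # class name assumed

dataset = EthCanopyHeight()
ds = dataset.load(
    ingest_dir=Path.home() / "Data" / "zampy" / "ingest",
    time_bounds=TimeBounds(
        start=np.datetime64("2020-01-01"), end=np.datetime64("2020-12-31")
    ),
    spatial_bounds=SpatialBounds(north=54, east=6, south=50, west=3),
    resolution=0.05,  # target grid spacing in degrees
    regrid_method="flox",  # pure-Python default; "esmf" requires xESMF
    variable_names=["height_of_vegetation"],  # illustrative variable name
)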
26 changes: 18 additions & 8 deletions src/zampy/datasets/eth_canopy_height.py
@@ -2,6 +2,7 @@
import gzip
from pathlib import Path
from typing import List
from typing import Union
import numpy as np
import xarray as xr
from zampy.datasets import converter
@@ -15,6 +16,7 @@
from zampy.datasets.dataset_protocol import write_properties_file
from zampy.reference.variables import VARIABLE_REFERENCE_LOOKUP
from zampy.reference.variables import unit_registry
from zampy.utils import regrid


VALID_NAME_FILE = (
@@ -119,11 +121,13 @@ def ingest(

return True

def load(
def load( # noqa: PLR0913
self,
ingest_dir: Path,
time_bounds: TimeBounds,
spatial_bounds: SpatialBounds,
resolution: float,
regrid_method: str,
variable_names: List[str],
) -> xr.Dataset:
files: List[Path] = []
@@ -133,17 +137,14 @@ def load(
files += (ingest_dir / self.name).glob("*Map_SD.nc")

ds = xr.open_mfdataset(files, chunks={"latitude": 2000, "longitude": 2000})
ds = ds.sel(
latitude=slice(spatial_bounds.south, spatial_bounds.north),
longitude=slice(spatial_bounds.west, spatial_bounds.east),
time=slice(time_bounds.start, time_bounds.end),
)
ds = ds.sel(time=slice(time_bounds.start, time_bounds.end))
ds = regrid.regrid_data(ds, spatial_bounds, resolution, regrid_method)
return ds

def convert(
self,
ingest_dir: Path,
convention: str,
convention: Union[str, Path],
) -> bool:
converter.check_convention(convention)
ingest_folder = ingest_dir / self.name
@@ -237,6 +238,13 @@ def convert_tiff_to_netcdf(
print(f"File '{ncfile.name}' already exists, skipping...")
else:
ds = parse_tiff_file(file, sd_file)

# Coarsen the data to 1/100 deg resolution instead of the native 1/12000 deg
ds = ds.coarsen({"latitude": 120, "longitude": 120}).mean() # type: ignore
ds = ds.compute()
ds = ds.interpolate_na(dim="longitude", limit=1)
ds = ds.interpolate_na(dim="latitude", limit=1)

ds.to_netcdf(
path=ncfile,
encoding=ds.encoding,
@@ -259,7 +267,9 @@ def parse_tiff_file(file: Path, sd_file: bool = False) -> xr.Dataset:
da = da.isel(band=0) # get rid of band dim
da = da.drop_vars(["band", "spatial_ref"]) # drop unnecessary coords
ds = da.to_dataset()
ds = ds.assign_coords({"time": np.datetime64("2020-07-01")}) # halfway in the year
ds = ds.assign_coords( # halfway in the year
{"time": np.datetime64("2020-07-01").astype("datetime64[ns]")}
)
ds = ds.expand_dims("time")
ds = ds.rename(
{
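The coarsening block added to convert_tiff_to_netcdf averages the native 1/12000-degree TIFF grid down by a factor of 120 before the NetCDF is written, then interpolates over single-cell gaps. A self-contained sketch of the same pattern on synthetic data (sizes shrunk so it runs in memory; the variable name is illustrative):

import numpy as np
import xarray as xr

# Synthetic high-resolution tile standing in for a parsed canopy-height TIFF.
lat = np.linspace(51.0, 52.0, 1200)
lon = np.linspace(5.0, 6.0, 1200)
data = np.random.default_rng(0).random((1200, 1200))
data[0, :10] = np.nan  # a few gaps, as real tiles have
ds = xr.Dataset(
    {"height_of_vegetation": (("latitude", "longitude"), data)},
    coords={"latitude": lat, "longitude": lon},
)

# Average 120x120 blocks into single cells, then interpolate over 1-cell gaps.
ds = ds.coarsen({"latitude": 120, "longitude": 120}).mean()
ds = ds.compute()
ds = ds.interpolate_na(dim="longitude", limit=1)
ds = ds.interpolate_na(dim="latitude", limit=1)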