Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add soil temperature and soil moisture to ERA5 data, STEMMUS_SCOPE recipe #53

Merged
merged 22 commits into from
Aug 19, 2024
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
dd8cf80
Add soil temperature and soil moisture to ERA5 data
BSchilperoort Feb 8, 2024
711f3eb
Fix incorrect name in SPLIT_VARIABLES constant
BSchilperoort Feb 9, 2024
6686d58
Use Dask distributed by default: avoids memory issues
BSchilperoort Jun 14, 2024
5a9bbcd
Validate recipe with datasets before downloading
BSchilperoort Jun 14, 2024
d396e48
Delete in-memory dataset after saving to reduce mem usage
BSchilperoort Jun 14, 2024
33c5cd9
Add appropriate time bounds to canopy height data set
BSchilperoort Jul 10, 2024
0e654c9
New formatting
BSchilperoort Jul 10, 2024
adf73de
Make tests use dask distributed scheduler
BSchilperoort Jul 10, 2024
014d632
Ignore typing in test folder
BSchilperoort Jul 17, 2024
f26a700
Add interpolation for datasets with lower frequency than recipe target
BSchilperoort Jul 17, 2024
aa650d3
Update formatting and linting (new rules)
BSchilperoort Jul 17, 2024
1fe26d0
Try dask submit/result for Windows
BSchilperoort Jul 26, 2024
b0908b2
Try to fix windows again
BSchilperoort Jul 26, 2024
f09e2d7
Use context manager to ensure file closure
BSchilperoort Jul 26, 2024
8d38759
Try unlinking files manually
BSchilperoort Jul 26, 2024
ecbdb40
Change how tempdir works on windows
BSchilperoort Jul 26, 2024
f8e0e63
Allow skipping downloads
BSchilperoort Jul 26, 2024
e206125
Actually run fast tests first
BSchilperoort Jul 26, 2024
6577542
Finalize windows fix
BSchilperoort Jul 26, 2024
2a447c3
Add stemmus scope input data recipe
BSchilperoort Jul 26, 2024
4b44e75
Fix failing tests: segfault occurred due to data not being available
BSchilperoort Jul 29, 2024
690694b
Apparently Windows dislikes single quotes '
BSchilperoort Jul 29, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -131,10 +131,11 @@ markers = [
ignore_missing_imports = true
disallow_untyped_defs = true
python_version = "3.10"
exclude = "tests"

[tool.ruff]
line-length = 88
exclude = ["docs", "build"]
exclude = ["docs", "build", "tests"]
target-version = "py310"

[tool.ruff.lint]
Expand Down
1 change: 1 addition & 0 deletions src/zampy/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""zampy."""

from zampy import datasets


Expand Down
3 changes: 3 additions & 0 deletions src/zampy/cli.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
"""Implements CLI interface for Zampy."""

from pathlib import Path
import click
import dask.distributed
from zampy.recipe import RecipeManager


Expand All @@ -14,4 +16,5 @@ def run_recipe(recipe: Path) -> None:


if __name__ == "__main__":
dask.distributed.Client()
run_recipe()
8 changes: 8 additions & 0 deletions src/zampy/conventions/ALMA.json
Original file line number Diff line number Diff line change
Expand Up @@ -84,5 +84,13 @@
"land_cover": {
"variable": "land_cover",
"units": ""
},
"soil_temperature": {
"variable": "SoilTemp",
"units": "kelvin"
},
"soil_moisture": {
"variable": "SoilMoist",
"units": "kilogram/meter**3"
}
}
1 change: 1 addition & 0 deletions src/zampy/datasets/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Datasets implementations."""

from zampy.datasets import dataset_protocol
from zampy.datasets import validation
from zampy.datasets.catalog import DATASETS
Expand Down
1 change: 1 addition & 0 deletions src/zampy/datasets/catalog.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Catalog of datasets."""

from zampy.datasets import dataset_protocol
from zampy.datasets.cams import CAMS
from zampy.datasets.era5 import ERA5
Expand Down
74 changes: 73 additions & 1 deletion src/zampy/datasets/cds_utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""CDS utilities used by ECMWF datasets."""

from copy import copy
from pathlib import Path
import cdsapi
import numpy as np
Expand Down Expand Up @@ -44,6 +45,21 @@
"21:00", "22:00", "23:00",
] # fmt: skip

# Variables that are requested under a single name but are downloaded as
# several separate per-level / per-layer variables.
# Key: the combined variable name as it appears in the requested variables.
# Value: the individual names that replace the key in the request list
# (retrieve_era5 removes the key and extends the list with these names).
SPLIT_VARIABLES = {
"soil_temperature": (
"soil_temperature_level_1",
"soil_temperature_level_2",
"soil_temperature_level_3",
"soil_temperature_level_4",
),
"soil_moisture": (
"volumetric_soil_water_layer_1",
"volumetric_soil_water_layer_2",
"volumetric_soil_water_layer_3",
"volumetric_soil_water_layer_4",
),
}


def cds_request(
dataset: str,
Expand Down Expand Up @@ -226,6 +242,12 @@ def retrieve_era5(
# create list of year/month pairs
year_month_pairs = time_bounds_to_year_month(time_bounds)

variables = copy(variables) # Prevent original input from being modified in-place
for split_var in SPLIT_VARIABLES:
if split_var in variables:
variables.remove(split_var)
variables.extend(SPLIT_VARIABLES[split_var])

for (year, month), variable in product(
year_month_pairs, variables, position=0, leave=True
):
Expand Down Expand Up @@ -354,7 +376,8 @@ def convert_to_zampy(
print(f"File '{ncfile.name}' already exists, skipping...")
else:
ds = parse_nc_file(file)

# Rename the vswl data:
ncfile = Path(str(ncfile).replace("volumetric_soil_water", "soil_moisture"))
ds.to_netcdf(path=ncfile)


Expand All @@ -373,6 +396,28 @@ def convert_to_zampy(
"co2": "co2_concentration",
}

# Lookup from the per-layer variable names found in the downloaded files
# ("stl1".."stl4", "swvl1".."swvl4") to the combined variable they belong to.
# parse_nc_file uses membership in this dict to decide whether a variable
# goes through the multi-layer (soil temperature / soil moisture) routine.
VAR_REFERENCE_MULTI_LAYER = {
"stl1": "soil_temperature",
"stl2": "soil_temperature",
"stl3": "soil_temperature",
"stl4": "soil_temperature",
"swvl1": "soil_moisture",
"swvl2": "soil_moisture",
"swvl3": "soil_moisture",
"swvl4": "soil_moisture",
}

# [lower, upper] bounds of each soil layer, keyed by per-layer variable name.
# Each value is a 1x2 nested list so it can be stored directly as the
# ("depth", "nv") "depth_bounds" variable; the mean of the two bounds is used
# as the value of the "depth" coordinate in parse_nc_file.
# NOTE(review): the unit is not stated here -- presumably centimetres (ERA5
# soil layer definitions); confirm and record it in the dataset attributes.
LAYER_BOUNDS = {
"stl1": [[0.0, 7.0]],
"stl2": [[7.0, 28.0]],
"stl3": [[28.0, 100.0]],
"stl4": [[100.0, 289.0]],
"swvl1": [[0.0, 7.0]],
"swvl2": [[7.0, 28.0]],
"swvl3": [[28.0, 100.0]],
"swvl4": [[100.0, 289.0]],
}

WATER_DENSITY = 997.0 # kg/m3


Expand Down Expand Up @@ -416,6 +461,33 @@ def parse_nc_file(file: Path) -> xr.Dataset:
variable_name
].desc

if variable in VAR_REFERENCE_MULTI_LAYER:
if ( # Soil temperature/moisture routine
str(variable).startswith("stl") or str(variable).startswith("swvl")
):
if str(variable).startswith("swvl"):
varname = "soil_moisture"
standard_name = "moisture_content_of_soil_layer"
ds[variable] *= WATER_DENSITY
ds[variable].attrs.update({"units": "kg m**-3"})
else:
varname = "soil_temperature"
standard_name = "temperature_in_ground"

da = ds[variable]
name = str(da.name)
da = da.expand_dims({"depth": [np.mean(LAYER_BOUNDS[name])]})
da = da.rename(varname)
da.attrs.update(
{
"long_name": varname.replace("_", " "),
"standard_name": standard_name,
}
)

ds = da.to_dataset()
ds["depth_bounds"] = (("depth", "nv"), LAYER_BOUNDS[name])

# TODO: add dataset attributes.

return ds
1 change: 1 addition & 0 deletions src/zampy/datasets/converter.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Dataset formatter for different conventions."""

import json
import warnings
from pathlib import Path
Expand Down
1 change: 1 addition & 0 deletions src/zampy/datasets/dataset_protocol.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Outline of the dataset protocol."""

import json
import shutil
from dataclasses import dataclass
Expand Down
17 changes: 16 additions & 1 deletion src/zampy/datasets/era5.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,15 +47,30 @@ class ERA5Land(ECMWFDataset): # noqa: D101
raw_variables = [
Variable(name="t2m", unit=unit_registry.kelvin),
Variable(name="d2m", unit=unit_registry.kelvin),
Variable(name="st", unit=unit_registry.kelvin),
Variable(name="swvl", unit=unit_registry.fraction),
]

# variable names used in cdsapi downloading request
cds_var_names = {
"air_temperature": "2m_temperature",
"dewpoint_temperature": "2m_dewpoint_temperature",
"soil_temperature_level_1": "soil_temperature_level_1", # Note: split variables
"soil_temperature_level_2": "soil_temperature_level_2",
"soil_temperature_level_3": "soil_temperature_level_3",
"soil_temperature_level_4": "soil_temperature_level_4",
"volumetric_soil_water_layer_1": "volumetric_soil_water_layer_1",
"volumetric_soil_water_layer_2": "volumetric_soil_water_layer_2",
"volumetric_soil_water_layer_3": "volumetric_soil_water_layer_3",
"volumetric_soil_water_layer_4": "volumetric_soil_water_layer_4",
}

variable_names = list(cds_var_names.keys())
variable_names = [
"air_temperature",
"dewpoint_temperature",
"soil_temperature",
"soil_moisture",
]

variables = [VARIABLE_REFERENCE_LOOKUP[var] for var in variable_names]

Expand Down
14 changes: 11 additions & 3 deletions src/zampy/datasets/eth_canopy_height.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""ETH canopy height dataset."""

import gzip
from pathlib import Path
import numpy as np
Expand Down Expand Up @@ -269,10 +270,17 @@ def parse_tiff_file(file: Path, sd_file: bool = False) -> xr.Dataset:
da = da.isel(band=0) # get rid of band dim
da = da.drop_vars(["band", "spatial_ref"]) # drop unnecessary coords
ds = da.to_dataset()
ds = ds.assign_coords( # halfway in the year
{"time": np.datetime64("2020-07-01").astype("datetime64[ns]")}
ds = xr.concat( # Cover entirety of 2020
(
ds.assign_coords(
{"time": np.datetime64("2020-01-01").astype("datetime64[ns]")}
),
ds.assign_coords(
{"time": np.datetime64("2021-01-01").astype("datetime64[ns]")}
),
),
dim="time",
)
ds = ds.expand_dims("time")
ds = ds.rename(
{
"band_data": "height_of_vegetation_standard_deviation"
Expand Down
1 change: 1 addition & 0 deletions src/zampy/datasets/prism_dem.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Prism DEM dataset."""

import gzip
import tarfile
from pathlib import Path
Expand Down
1 change: 1 addition & 0 deletions src/zampy/datasets/utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Shared utilities from datasets."""

import urllib.request
from pathlib import Path
import requests
Expand Down
1 change: 1 addition & 0 deletions src/zampy/datasets/validation.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Checks for user input validation."""

from pathlib import Path
from zampy.datasets.dataset_protocol import Dataset
from zampy.datasets.dataset_protocol import SpatialBounds
Expand Down
37 changes: 33 additions & 4 deletions src/zampy/recipe.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,17 @@
""""All functionality to read and execute Zampy recipes."""
"""All functionality to read and execute Zampy recipes."""

from pathlib import Path
from typing import Any
import numpy as np
import pandas as pd
import xarray as xr
import yaml
from zampy.datasets import DATASETS
from zampy.datasets import converter
from zampy.datasets.dataset_protocol import Dataset
from zampy.datasets.dataset_protocol import SpatialBounds
from zampy.datasets.dataset_protocol import TimeBounds
from zampy.datasets.validation import validate_download_request


def recipe_loader(recipe_path: Path) -> dict:
Expand All @@ -31,8 +35,8 @@ def recipe_loader(recipe_path: Path) -> dict:
for key in ["convention", "frequency", "resolution"]
):
msg = (
"One of the following items are missing from the recipe:\n"
"name, download, convert."
"One of the following 'convert' items are missing from the recipe:\n"
"convention, frequency, resolution."
)
raise ValueError(msg)

Expand Down Expand Up @@ -92,9 +96,22 @@ def __init__(self, recipe_path: Path) -> None:

def run(self) -> None:
"""Run the full recipe."""
# First validate all inputs (before downloading, processing...)
for dataset_name in self.datasets:
_dataset = DATASETS[dataset_name.lower()]
dataset: Dataset = _dataset()

validate_download_request(
dataset,
self.download_dir,
self.timebounds,
self.spatialbounds,
self.datasets[dataset_name]["variables"],
)

for dataset_name in self.datasets:
_dataset = DATASETS[dataset_name.lower()]
dataset = _dataset()
variables: list[str] = self.datasets[dataset_name]["variables"]

# Download dataset
Expand All @@ -118,7 +135,18 @@ def run(self) -> None:
ds = converter.convert(ds, dataset, convention=self.convention)

if "time" in ds.dims: # Dataset with only DEM (e.g.) has no time dim.
ds = ds.resample(time=self.frequency).mean()
freq = xr.infer_freq(ds["time"])
if freq is None: # fallback:
freq = (
ds["time"].isel(time=1).to_numpy()
- ds["time"].isel(time=0).to_numpy()
)
data_freq = pd.to_timedelta(pd.tseries.frequencies.to_offset(freq))

if data_freq < pd.Timedelta(self.frequency):
ds = ds.resample(time=self.frequency).mean()
elif data_freq > pd.Timedelta(self.frequency):
ds = ds.resample(time=self.frequency).interpolate("nearest")

comp = dict(zlib=True, complevel=5)
encoding = {var: comp for var in ds.data_vars}
Expand All @@ -127,6 +155,7 @@ def run(self) -> None:
# e.g. "era5_2010-2020.nc"
fname = f"{dataset_name.lower()}_{time_start}-{time_end}.nc"
ds.to_netcdf(path=self.data_dir / fname, encoding=encoding)
del ds

print(
"Finished running the recipe. Output data can be found at:\n"
Expand Down
4 changes: 4 additions & 0 deletions src/zampy/reference/variables.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Variable reference for Zampy."""

from pint import UnitRegistry
from zampy.datasets.dataset_protocol import Variable

Expand All @@ -13,6 +14,7 @@ def unit_registration() -> UnitRegistry:
unit_registry.define("degree_east = degree = degree_E = degreeE")
unit_registry.define("watt_per_square_meter = watt/meter**2")
unit_registry.define("joule_per_square_meter = joule/meter**2")
unit_registry.define("kilogram_per_square_meter = kilogram/(meter**2)")
unit_registry.define(
"kilogram_per_square_meter_second = kilogram/(meter**2*second)"
)
Expand Down Expand Up @@ -56,6 +58,8 @@ def unit_registration() -> UnitRegistry:
Variable("elevation", unit=unit_registry.meter),
Variable("leaf_area_index", unit=unit_registry.fraction),
Variable("land_cover", unit=unit_registry.dimensionless),
Variable("soil_temperature", unit=unit_registry.kelvin),
Variable("soil_moisture", unit=unit_registry.kilogram_per_square_meter),
)

VARIABLE_REFERENCE_LOOKUP = {var.name: var for var in VARIABLE_REFERENCE}
1 change: 1 addition & 0 deletions tests/test_data/fapar-lai/generate_fake_data.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Generate fake data for the fapar-lai tests."""

import zipfile
from pathlib import Path
import numpy as np
Expand Down
1 change: 1 addition & 0 deletions tests/test_datasets/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""This module contains all tests for datasets included in zampy."""

from pathlib import Path


Expand Down
Loading
Loading