From d0fec005493727d742ab5b5bb31509bd956e9f9a Mon Sep 17 00:00:00 2001 From: Yang Date: Mon, 31 Jul 2023 14:33:03 +0200 Subject: [PATCH] Implement ERA5 data ingestion (#11) * add download function for era5 * check existence and overwrite * make mypy happy and add variables list * era5 data ingestion * update convention path and update demo notebook * update zampy variable names * update demo notebook * add tests for new utils * add tests for era5 dataset * fix tests for loading .cdsapirc * check size before downloading * add progress bar and message for downloading * silent too many arguments * fix unittest on windows using mktemp from pytest * update convention and variable units * update tests with all variables * compare units with pint unit registry * add all variables to tests * fix typo in convention * combine tests for similar variables * make linter happy * Apply suggestions from code review Co-authored-by: Bart Schilperoort * update variable names accordingly * convert precipitation with pint --------- Co-authored-by: Bart Schilperoort --- README.md | 15 +- demo/era5_dataset_demo.ipynb | 1204 +++++++++++++++++ .../eth_dataset_demo.ipynb | 0 pyproject.toml | 2 + src/zampy/conventions/ALMA.json | 28 + src/zampy/datasets/__init__.py | 3 +- src/zampy/datasets/converter.py | 13 +- src/zampy/datasets/dataset_protocol.py | 4 +- src/zampy/datasets/era5.py | 247 ++++ src/zampy/datasets/eth_canopy_height.py | 4 +- src/zampy/datasets/utils.py | 107 ++ src/zampy/reference/README.md | 9 + src/zampy/reference/variables.py | 39 +- .../era5_10m_u_component_of_wind_1996-1.nc | Bin 0 -> 33280 bytes .../era5_10m_v_component_of_wind_1996-1.nc | Bin 0 -> 33280 bytes ...a5_mean_total_precipitation_rate_1996-1.nc | Bin 0 -> 33280 bytes .../era5/era5_surface_pressure_1996-1.nc | Bin 0 -> 33280 bytes ...urface_solar_radiation_downwards_1996-1.nc | Bin 0 -> 33280 bytes ...face_thermal_radiation_downwards_1996-1.nc | Bin 0 -> 33280 bytes tests/test_data/era5/properties.json | 0 tests/test_datasets/test_era5.py | 218 +++ tests/test_utils.py | 72 + 22 files changed, 1944 insertions(+), 21 deletions(-) create mode 100644 demo/era5_dataset_demo.ipynb rename eth_dataset_demo.ipynb => demo/eth_dataset_demo.ipynb (100%) create mode 100644 src/zampy/datasets/era5.py create mode 100644 src/zampy/reference/README.md create mode 100644 tests/test_data/era5/era5_10m_u_component_of_wind_1996-1.nc create mode 100644 tests/test_data/era5/era5_10m_v_component_of_wind_1996-1.nc create mode 100644 tests/test_data/era5/era5_mean_total_precipitation_rate_1996-1.nc create mode 100644 tests/test_data/era5/era5_surface_pressure_1996-1.nc create mode 100644 tests/test_data/era5/era5_surface_solar_radiation_downwards_1996-1.nc create mode 100644 tests/test_data/era5/era5_surface_thermal_radiation_downwards_1996-1.nc create mode 100644 tests/test_data/era5/properties.json create mode 100644 tests/test_datasets/test_era5.py diff --git a/README.md b/README.md index 1da61af..9a40b89 100644 --- a/README.md +++ b/README.md @@ -24,4 +24,17 @@ Tool for downloading Land Surface Model input data - Load recipes using Pydantic ([for example](https://github.com/phenology/springtime/blob/main/src/springtime/datasets/daymet.py)). - Support both a CLI & Python API. -Note: items in *italic* will not be worked on for now/low priority, but we want to allow space for these in the future. \ No newline at end of file +Note: items in *italic* will not be worked on for now/low priority, but we want to allow space for these in the future. + +## Instructions for CDS datasets (e.g. ERA5) +To download the following datasets, users need access to CDS via cdsapi: + +- ERA5 +- ERA5 land +- LAI + +First, you need to be a registered user on CDS via the [registration page](https://cds.climate.copernicus.eu/user/register?destination=%2F%23!%2Fhome). + +Before submitting any request with `zampy`, please configure your `.cdsapirc` file following the instructions on https://cds.climate.copernicus.eu/api-how-to. + +When downloading a dataset for the first time, it is **necessary to agree to the Terms of Use of every datasets that you intend to download**. This can only be done via the CDS website. When you try to download these datasets, you will be prompted to go to the terms of use and accept them. \ No newline at end of file diff --git a/demo/era5_dataset_demo.ipynb b/demo/era5_dataset_demo.ipynb new file mode 100644 index 0000000..5232850 --- /dev/null +++ b/demo/era5_dataset_demo.ipynb @@ -0,0 +1,1204 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Handle ERA5 dataset with Zampy\n", + "Demo notebook for developers." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "from zampy.datasets import ERA5\n", + "from zampy.datasets.dataset_protocol import TimeBounds, SpatialBounds\n", + "from pathlib import Path" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "work_dir = Path(\"/home/yangliu/EcoExtreML/temp\")\n", + "download_dir = work_dir / \"download\"\n", + "ingest_dir = work_dir / \"ingest\"\n", + "times = TimeBounds(np.datetime64(\"2010-01-01T00:00:00\"), np.datetime64(\"2010-01-31T23:00:00\"))\n", + "bbox_demo = SpatialBounds(54, 56, 1, 3)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Download dataset." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "File 'era5_10m_v_component_of_wind_2010-1.nc' already exists, skipping...\n" + ] + }, + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "era5_dataset = ERA5()\n", + "era5_dataset.download(\n", + " download_dir=download_dir,\n", + " time_bounds=times,\n", + " spatial_bounds=bbox_demo,\n", + " variable_names=[\"10m_v_component_of_wind\"], #\"surface_pressure\", \"mean_total_precipitation_rate\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Data ingestion to the unified format in `zampy`." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "File 'era5_10m_v_component_of_wind_2010-1.nc' already exists, skipping...\n" + ] + }, + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "era5_dataset.ingest(download_dir, ingest_dir)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "ds = era5_dataset.load(\n", + " ingest_dir=ingest_dir,\n", + " time_bounds=times,\n", + " spatial_bounds=bbox_demo,\n", + " variable_names=[\"10m_v_component_of_wind\"],\n", + " resolution=1.0,\n", + " regrid_method=\"flox\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
<xarray.Dataset>\n",
+       "Dimensions:                  (time: 744, latitude: 54, longitude: 54)\n",
+       "Coordinates:\n",
+       "  * time                     (time) datetime64[ns] 2010-01-01 ... 2010-01-31T...\n",
+       "  * latitude                 (latitude) float64 1.0 2.0 3.0 ... 52.0 53.0 54.0\n",
+       "  * longitude                (longitude) float64 3.0 4.0 5.0 ... 54.0 55.0 56.0\n",
+       "Data variables:\n",
+       "    10m_v_component_of_wind  (time, latitude, longitude) float32 dask.array<chunksize=(744, 54, 54), meta=np.ndarray>\n",
+       "Attributes:\n",
+       "    Conventions:  CF-1.6\n",
+       "    history:      2023-07-11 10:17:10 GMT by grib_to_netcdf-2.25.1: /opt/ecmw...
" + ], + "text/plain": [ + "\n", + "Dimensions: (time: 744, latitude: 54, longitude: 54)\n", + "Coordinates:\n", + " * time (time) datetime64[ns] 2010-01-01 ... 2010-01-31T...\n", + " * latitude (latitude) float64 1.0 2.0 3.0 ... 52.0 53.0 54.0\n", + " * longitude (longitude) float64 3.0 4.0 5.0 ... 54.0 55.0 56.0\n", + "Data variables:\n", + " 10m_v_component_of_wind (time, latitude, longitude) float32 dask.array\n", + "Attributes:\n", + " Conventions: CF-1.6\n", + " history: 2023-07-11 10:17:10 GMT by grib_to_netcdf-2.25.1: /opt/ecmw..." + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ds" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "10m_v_component_of_wind renamed to Wind_E.\n", + "Conversion of dataset 'era5' following ALMA convention is complete!\n" + ] + } + ], + "source": [ + "from zampy.datasets import converter\n", + "\n", + "ds_convert = converter.convert(ds, era5_dataset, \"ALMA\")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
<xarray.Dataset>\n",
+       "Dimensions:    (time: 744, latitude: 54, longitude: 54)\n",
+       "Coordinates:\n",
+       "  * time       (time) datetime64[ns] 2010-01-01 ... 2010-01-31T23:00:00\n",
+       "  * latitude   (latitude) float64 1.0 2.0 3.0 4.0 5.0 ... 51.0 52.0 53.0 54.0\n",
+       "  * longitude  (longitude) float64 3.0 4.0 5.0 6.0 7.0 ... 53.0 54.0 55.0 56.0\n",
+       "Data variables:\n",
+       "    Wind_E     (time, latitude, longitude) float32 dask.array<chunksize=(744, 54, 54), meta=np.ndarray>\n",
+       "Attributes:\n",
+       "    Conventions:  ALMA\n",
+       "    history:      2023-07-11 10:17:10 GMT by grib_to_netcdf-2.25.1: /opt/ecmw...
" + ], + "text/plain": [ + "\n", + "Dimensions: (time: 744, latitude: 54, longitude: 54)\n", + "Coordinates:\n", + " * time (time) datetime64[ns] 2010-01-01 ... 2010-01-31T23:00:00\n", + " * latitude (latitude) float64 1.0 2.0 3.0 4.0 5.0 ... 51.0 52.0 53.0 54.0\n", + " * longitude (longitude) float64 3.0 4.0 5.0 6.0 7.0 ... 53.0 54.0 55.0 56.0\n", + "Data variables:\n", + " Wind_E (time, latitude, longitude) float32 dask.array\n", + "Attributes:\n", + " Conventions: ALMA\n", + " history: 2023-07-11 10:17:10 GMT by grib_to_netcdf-2.25.1: /opt/ecmw..." + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ds_convert" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "ecoextreml", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.0" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/eth_dataset_demo.ipynb b/demo/eth_dataset_demo.ipynb similarity index 100% rename from eth_dataset_demo.ipynb rename to demo/eth_dataset_demo.ipynb diff --git a/pyproject.toml b/pyproject.toml index 4816f7e..ad83aae 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -62,6 +62,7 @@ dependencies = [ "cf_xarray", # required to auto-pint CF compliant datasets. "pint-xarray", "flox", # For better groupby methods. + "cdsapi", ] dynamic = ["version"] @@ -142,6 +143,7 @@ extend-select = [ ] ignore = [ "PLR2004", # magic value used in comparsion (i.e. `if ndays == 28: month_is_feb`). + "PLR0913", # too many arguments ] line-length = 88 exclude = ["docs", "build"] diff --git a/src/zampy/conventions/ALMA.json b/src/zampy/conventions/ALMA.json index 0e62826..8398410 100644 --- a/src/zampy/conventions/ALMA.json +++ b/src/zampy/conventions/ALMA.json @@ -23,6 +23,16 @@ "units": "watt/meter**2", "positive_dir": "downward" }, + "surface_solar_radiation_downwards": { + "variable": "SWdown", + "units": "watt/meter**2", + "positive_dir": "downward" + }, + "surface_thermal_radiation_downwards": { + "variable": "LWdown", + "units": "watt/meter**2", + "positive_dir": "downward" + }, "latent_heat_flux": { "variable": "Qle", "units": "watt/meter**2", @@ -32,5 +42,23 @@ "variable": "Qh", "units": "watt/meter**2", "positive_dir": "downward" + }, + "surface_pressure": { + "variable": "Psurf", + "units": "pascal" + }, + "eastward_component_of_wind": { + "variable": "Wind_E", + "units": "meter/second", + "positive_dir": "eastward" + }, + "northward_component_of_wind": { + "variable": "Wind_N", + "units": "meter/second", + "positive_dir": "northward" + }, + "total_precipitation": { + "variable": "Rainf", + "units": "millimeter/second" } } \ No newline at end of file diff --git a/src/zampy/datasets/__init__.py b/src/zampy/datasets/__init__.py index fdff791..e1b1724 100644 --- a/src/zampy/datasets/__init__.py +++ b/src/zampy/datasets/__init__.py @@ -1,7 +1,8 @@ """Datasets implementations.""" from zampy.datasets import dataset_protocol from zampy.datasets import validation +from zampy.datasets.era5 import ERA5 from zampy.datasets.eth_canopy_height import EthCanopyHeight -__all__ = ["dataset_protocol", "validation", "EthCanopyHeight"] +__all__ = ["dataset_protocol", "validation", "EthCanopyHeight", "ERA5"] diff --git a/src/zampy/datasets/converter.py b/src/zampy/datasets/converter.py index 02c21e1..f641950 100644 --- a/src/zampy/datasets/converter.py +++ b/src/zampy/datasets/converter.py @@ -7,11 +7,11 @@ import pint_xarray # noqa: F401 import xarray as xr from zampy.datasets.dataset_protocol import Dataset +from zampy.reference.variables import unit_registry -CONVENTIONS = { - "ALMA": "src/zampy/conventions/ALMA.json", -} +CONVENTIONS = ["ALMA"] +conventions_path = Path(__file__).resolve().parents[1] / "conventions" def check_convention(convention: Union[str, Path]) -> None: @@ -48,7 +48,9 @@ def convert( """ converted = False if isinstance(convention, str): - convention_file = Path(CONVENTIONS[convention]).open(mode="r", encoding="UTF8") + convention_file = (conventions_path / f"{convention}.json").open( + mode="r", encoding="UTF8" + ) else: convention_file = convention.open(mode="r", encoding="UTF8") @@ -58,7 +60,7 @@ def convert( convert_units = convention_dict[var.lower()]["units"] var_name = convention_dict[var.lower()]["variable"] var_units = data[var].attrs["units"] - if var_units != convert_units: + if unit_registry(var_units) != unit_registry(convert_units): # lazy dask array data = _convert_var(data, var, convert_units) data = data.rename({var: var_name}) @@ -75,6 +77,7 @@ def convert( f"Conversion of dataset '{dataset.name}' following {convention} " "convention is complete!" ) + data.attrs["Conventions"] = convention else: warnings.warn( f"All variables already follow the {convention} convention or " diff --git a/src/zampy/datasets/dataset_protocol.py b/src/zampy/datasets/dataset_protocol.py index 2803cff..77b7042 100644 --- a/src/zampy/datasets/dataset_protocol.py +++ b/src/zampy/datasets/dataset_protocol.py @@ -88,7 +88,7 @@ def __init__(self) -> None: ... @abstractmethod - def download( # noqa: PLR0913 + def download( self, download_dir: Path, time_bounds: TimeBounds, @@ -131,7 +131,7 @@ def ingest( ... @abstractmethod - def load( # noqa: PLR0913 + def load( self, ingest_dir: Path, time_bounds: TimeBounds, diff --git a/src/zampy/datasets/era5.py b/src/zampy/datasets/era5.py new file mode 100644 index 0000000..6c7c64d --- /dev/null +++ b/src/zampy/datasets/era5.py @@ -0,0 +1,247 @@ +"""ERA5 dataset.""" + +from pathlib import Path +from typing import List +from typing import Union +import numpy as np +import xarray as xr +from zampy.datasets import converter +from zampy.datasets import utils +from zampy.datasets import validation +from zampy.datasets.dataset_protocol import Dataset +from zampy.datasets.dataset_protocol import SpatialBounds +from zampy.datasets.dataset_protocol import TimeBounds +from zampy.datasets.dataset_protocol import Variable +from zampy.datasets.dataset_protocol import copy_properties_file +from zampy.datasets.dataset_protocol import write_properties_file +from zampy.reference.variables import VARIABLE_REFERENCE_LOOKUP +from zampy.reference.variables import unit_registry +from zampy.utils import regrid + + +## Ignore missing class/method docstrings: they are implemented in the Dataset class. +# ruff: noqa: D102 + + +class ERA5(Dataset): # noqa: D101 + name = "era5" + time_bounds = TimeBounds(np.datetime64("1940-01-01"), np.datetime64("2023-06-30")) + spatial_bounds = SpatialBounds(90, 180, -90, -180) + + raw_variables = ( + Variable(name="mtpr", unit=unit_registry.kilogram_per_square_meter_second), + Variable(name="strd", unit=unit_registry.joule_per_square_meter), + Variable(name="ssrd", unit=unit_registry.joule_per_square_meter), + Variable(name="sp", unit=unit_registry.pascal), + Variable(name="u10", unit=unit_registry.meter_per_second), + Variable(name="v10", unit=unit_registry.meter_per_second), + ) + + # variable names used in cdsapi downloading request + variable_names = ( + "mean_total_precipitation_rate", + "surface_thermal_radiation_downwards", + "surface_solar_radiation_downwards", + "surface_pressure", + "10m_u_component_of_wind", + "10m_v_component_of_wind", + ) + + license = "cc-by-4.0" + bib = """ + @article{hersbach2020era5, + title={The ERA5 global reanalysis}, + author={Hersbach, Hans et al.}, + journal={Quarterly Journal of the Royal Meteorological Society}, + volume={146}, + number={730}, + pages={1999--2049}, + year={2020}, + publisher={Wiley Online Library} + } + """ + + def download( + self, + download_dir: Path, + time_bounds: TimeBounds, + spatial_bounds: SpatialBounds, + variable_names: List[str], + overwrite: bool = False, + ) -> bool: + validation.validate_download_request( + self, + download_dir, + time_bounds, + spatial_bounds, + variable_names, + ) + + download_folder = download_dir / self.name + download_folder.mkdir(parents=True, exist_ok=True) + + utils.cds_request( + dataset="reanalysis-era5-single-levels", + variables=variable_names, + time_bounds=time_bounds, + spatial_bounds=spatial_bounds, + path=download_folder, + overwrite=overwrite, + ) + + write_properties_file( + download_folder, spatial_bounds, time_bounds, variable_names + ) + + return True + + def ingest( + self, + download_dir: Path, + ingest_dir: Path, + overwrite: bool = False, + ) -> bool: + download_folder = download_dir / self.name + ingest_folder = ingest_dir / self.name + ingest_folder.mkdir(parents=True, exist_ok=True) + + data_file_pattern = "era5_*.nc" + data_files = list(download_folder.glob(data_file_pattern)) + + for file in data_files: + convert_to_zampy( + ingest_folder, + file=file, + overwrite=overwrite, + ) + + copy_properties_file(download_folder, ingest_folder) + + return True + + def load( + self, + ingest_dir: Path, + time_bounds: TimeBounds, + spatial_bounds: SpatialBounds, + resolution: float, + regrid_method: str, + variable_names: List[str], + ) -> xr.Dataset: + files: List[Path] = [] + for var in self.variable_names: + if var in variable_names: + files += (ingest_dir / self.name).glob(f"era5_{var}*.nc") + + ds = xr.open_mfdataset(files, chunks={"latitude": 200, "longitude": 200}) + ds = ds.sel(time=slice(time_bounds.start, time_bounds.end)) + ds = regrid.regrid_data(ds, spatial_bounds, resolution, regrid_method) + + return ds + + def convert( + self, + ingest_dir: Path, + convention: Union[str, Path], + ) -> bool: + converter.check_convention(convention) + ingest_folder = ingest_dir / self.name + + data_file_pattern = "era5_*.nc" + + data_files = list(ingest_folder.glob(data_file_pattern)) + + for file in data_files: + # start conversion process + print(f"Start processing file `{file.name}`.") + ds = xr.open_dataset(file, chunks={"x": 50, "y": 50}) + ds = converter.convert(ds, dataset=self, convention=convention) + # TODO: support derived variables + # TODO: other calculations + # call ds.compute() + + return True + + +def convert_to_zampy( + ingest_folder: Path, + file: Path, + overwrite: bool = False, +) -> None: + """Convert the downloaded nc files to standard CF/Zampy netCDF files. + + The downloaded ERA5 data already follows CF1.6 convention. However, it uses + (abbreviated) variable name instead of standard name, which prohibits the format + conversion. Therefore we need to ingest the downloaded files and rename all + variables to standard names. + + Args: + ingest_folder: Folder where the files have to be written to. + file: Path to the ERA5 nc file. + overwrite: Overwrite all existing files. If False, file that already exist will + be skipped. + """ + ncfile = ingest_folder / file.with_suffix(".nc").name + if ncfile.exists() and not overwrite: + print(f"File '{ncfile.name}' already exists, skipping...") + else: + ds = parse_nc_file(file) + + ds.to_netcdf(path=ncfile) + + +var_reference_era5_to_zampy = { + "mtpr": "total_precipitation", + "strd": "surface_thermal_radiation_downwards", + "ssrd": "surface_solar_radiation_downwards", + "sp": "surface_pressure", + "u10": "eastward_component_of_wind", + "v10": "northward_component_of_wind", +} + +WATER_DENSITY = 997.0 # kg/m3 + + +def parse_nc_file(file: Path) -> xr.Dataset: + """Parse the downloaded ERA5 nc files, to CF/Zampy standard dataset. + + Args: + file: Path to the ERA5 nc file. + + Returns: + CF/Zampy formatted xarray Dataset + """ + # Open chunked: will be dask array -> file writing can be parallelized. + ds = xr.open_dataset(file, chunks={"x": 50, "y": 50}) + + for variable in ds.variables: + if variable in var_reference_era5_to_zampy: + var = str(variable) # Cast to string to please mypy + variable_name = var_reference_era5_to_zampy[var] + ds = ds.rename({var: variable_name}) + # convert radiation to flux J/m2 to W/m2 + # https://confluence.ecmwf.int/pages/viewpage.action?pageId=155337784 + if variable_name in ( + "surface_solar_radiation_downwards", + "surface_thermal_radiation_downwards", + ): + ds[variable_name] = ds[variable_name] / 3600 + # conversion precipitation kg/m2s to mm/s + elif variable_name == "total_precipitation": + ds[variable_name] = ds[variable_name] / WATER_DENSITY + ds[variable_name].attrs["units"] = "meter_per_second" + # convert from m/s to mm/s + ds = converter._convert_var( + ds, variable_name, VARIABLE_REFERENCE_LOOKUP[variable_name].unit + ) + + ds[variable_name].attrs["units"] = str( + VARIABLE_REFERENCE_LOOKUP[variable_name].unit + ) + ds[variable_name].attrs["description"] = VARIABLE_REFERENCE_LOOKUP[ + variable_name + ].desc + + # TODO: add dataset attributes. + + return ds diff --git a/src/zampy/datasets/eth_canopy_height.py b/src/zampy/datasets/eth_canopy_height.py index bae9322..9e8a2a0 100644 --- a/src/zampy/datasets/eth_canopy_height.py +++ b/src/zampy/datasets/eth_canopy_height.py @@ -55,7 +55,7 @@ class EthCanopyHeight(Dataset): # noqa: D101 data_url = "https://share.phys.ethz.ch/~pf/nlangdata/ETH_GlobalCanopyHeight_10m_2020_version1/3deg_cogs/" - def download( # noqa: PLR0913 + def download( self, download_dir: Path, time_bounds: TimeBounds, @@ -121,7 +121,7 @@ def ingest( return True - def load( # noqa: PLR0913 + def load( self, ingest_dir: Path, time_bounds: TimeBounds, diff --git a/src/zampy/datasets/utils.py b/src/zampy/datasets/utils.py index 1b27975..c6a6214 100644 --- a/src/zampy/datasets/utils.py +++ b/src/zampy/datasets/utils.py @@ -1,10 +1,24 @@ """Shared utilities from datasets.""" import urllib.request from pathlib import Path +from typing import List from typing import Optional +from typing import Tuple from typing import Union +import cdsapi +import pandas as pd import requests from tqdm import tqdm +from tqdm.contrib.itertools import product +from zampy.datasets.dataset_protocol import SpatialBounds +from zampy.datasets.dataset_protocol import TimeBounds + + +PRODUCT_FNAME = { + "reanalysis-era5-single-levels": "era5", + "reanalysis-era5-land": "era5-land", +} +CDSAPI_CONFIG_PATH = Path.home() / ".cdsapirc" class TqdmUpdate(tqdm): @@ -54,3 +68,96 @@ def get_file_size(fpath: Path) -> int: return 0 else: return fpath.stat().st_size + + +def cds_request( + dataset: str, + variables: List[str], + time_bounds: TimeBounds, + spatial_bounds: SpatialBounds, + path: Path, + overwrite: bool, +) -> None: + """Download data via CDS API. + + To raise a request via CDS API, the user needs to set up the + configuration file `.cdsapirc` following the instructions on + https://cds.climate.copernicus.eu/api-how-to. + + Following the efficiency tips of request, + https://confluence.ecmwf.int/display/CKB/Climate+Data+Store+%28CDS%29+documentation + The downloading is organized by asking for one month of data per request. + + Args: + dataset: Dataset name for retrieval via `cdsapi`. + variables: Zampy variable. + time_bounds: Zampy time bounds object. + spatial_bounds: Zampy spatial bounds object. + path: File path to which the data should be saved. + overwrite: If an existing file (of the same size!) should be overwritten. + """ + fname = PRODUCT_FNAME[dataset] + + with CDSAPI_CONFIG_PATH.open(encoding="utf8") as f: + url = f.readline().split(":", 1)[1].strip() + api_key = f.readline().split(":", 1)[1].strip() + + c = cdsapi.Client( + url=url, + key=api_key, + verify=True, + quiet=True, + ) + + # create list of year/month pairs + year_month_pairs = time_bounds_to_year_month(time_bounds) + + for (year, month), variable in product( + year_month_pairs, variables, position=0, leave=True + ): + # raise download request + r = c.retrieve( + dataset, + { + "product_type": "reanalysis", + "variable": [variable], + "year": year, + "month": month, + # fmt: off + "day": [ + "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", + "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", + "21", "22", "23", "24", "25", "26", "27", "28", "29", "30", + "31", + ], + "time": [ + "00:00", "01:00", "02:00", "03:00", "04:00", "05:00", "06:00", + "07:00", "08:00", "09:00", "10:00", "11:00", "12:00", "13:00", + "14:00", "15:00", "16:00", "17:00", "18:00", "19:00", "20:00", + "21:00", "22:00", "23:00", + ], + # fmt: on + "area": [ + spatial_bounds.north, + spatial_bounds.west, + spatial_bounds.south, + spatial_bounds.east, + ], + "format": "netcdf", + }, + ) + # check existence and overwrite + fpath = path / f"{fname}_{variable}_{year}-{month}.nc" + + if get_file_size(fpath) != r.content_length or overwrite: + r.download(fpath) + tqdm.write(f"Download {fpath.name} successfully.") + else: + print(f"File '{fpath.name}' already exists, skipping...") + + +def time_bounds_to_year_month(time_bounds: TimeBounds) -> List[Tuple[str, str]]: + """Return year/month pairs.""" + date_range = pd.date_range(start=time_bounds.start, end=time_bounds.end, freq="M") + year_month_pairs = [(str(date.year), str(date.month)) for date in date_range] + return year_month_pairs diff --git a/src/zampy/reference/README.md b/src/zampy/reference/README.md new file mode 100644 index 0000000..630757d --- /dev/null +++ b/src/zampy/reference/README.md @@ -0,0 +1,9 @@ +### Reference + +To allow data ingestion into the unified `zampy` format and to enable variables conversion between different conventions, we need a unique identifier for each variable. Therefore we create a `variables.py` file to store the `zampy` unique variable names for reference. + +### Ingestion + +During data ingestion, all variables will be formatted to `zampy` variable format. + + \ No newline at end of file diff --git a/src/zampy/reference/variables.py b/src/zampy/reference/variables.py index 9ef9709..9968727 100644 --- a/src/zampy/reference/variables.py +++ b/src/zampy/reference/variables.py @@ -3,23 +3,42 @@ from zampy.datasets.dataset_protocol import Variable -unit_registry = UnitRegistry() -unit_registry.define("fraction = [] = frac") -unit_registry.define("percent = 1e-2 frac = pct") -unit_registry.define("ppm = 1e-6 fraction") -unit_registry.define("degree_north = degree = degree_N = degreeN") -unit_registry.define("degree_east = degree = degree_E = degreeE") +def unit_registration() -> UnitRegistry: + """Create unit registration for all custom units.""" + unit_registry = UnitRegistry() + unit_registry.define("fraction = [] = frac") + unit_registry.define("percent = 1e-2 frac = pct") + unit_registry.define("ppm = 1e-6 fraction") + unit_registry.define("degree_north = degree = degree_N = degreeN") + unit_registry.define("degree_east = degree = degree_E = degreeE") + unit_registry.define("watt_per_square_meter = watt/meter**2") + unit_registry.define("joule_per_square_meter = joule/meter**2") + unit_registry.define( + "kilogram_per_square_meter_second = kilogram/(meter**2*second)" + ) + unit_registry.define("milimeter_per_second = watt/meter**2") + return unit_registry + +unit_registry = unit_registration() # By default, we use the variable names and units following the CF convention: # https://cfconventions.org/Data/cf-standard-names/current/build/cf-standard-name-table.html VARIABLE_REFERENCE = ( - Variable("air-temperature", unit_registry.kelvin), - Variable("dewpoint-temperature", unit_registry.kelvin), - Variable("relative-humidity", unit_registry.percent), + Variable("air_temperature", unit_registry.kelvin), + Variable("dewpoint_temperature", unit_registry.kelvin), + Variable("relative_humidity", unit_registry.percent), + Variable("surface_pressure", unit_registry.pascal), + Variable("eastward_component_of_wind", unit_registry.meter_per_second), + Variable("northward_component_of_wind", unit_registry.meter_per_second), + Variable("surface_solar_radiation_downwards", unit_registry.watt_per_square_meter), + Variable( + "surface_thermal_radiation_downwards", unit_registry.watt_per_square_meter + ), + Variable("total_precipitation", unit_registry.millimeter_per_second), Variable( - "specific-humidity", + "specific_humidity", unit_registry.fraction, desc="Mass fraction of water in air.", ), diff --git a/tests/test_data/era5/era5_10m_u_component_of_wind_1996-1.nc b/tests/test_data/era5/era5_10m_u_component_of_wind_1996-1.nc new file mode 100644 index 0000000000000000000000000000000000000000..202dfb36fe5604129301de0ff4e5b9cf908ae292 GIT binary patch literal 33280 zcmeI5O>7%g5XYZC;yQ`*QAh)nv@uDbP}0*s=F3jaolIwz<`y#9rR4N{+C;{q{&#{VyMw%{IxUYe#J$!Z(obtxLGXmb8*gU2|#mx~wi;E-KTG+nH7zLi;ieZ!c4a z$W-&r73{15%DrC7c}iBz`IL9g%WutgIm(#&M}xmMFN4Oww{IU`abo}A^G#B>@dy6C zmQ{2owMuS`_P4jz!*}s{2scr-)K1%NBYT54zAba?+16@rAkm$}ZaIVP?Mb;+-%aRX zc+7ak;;CG*Uf32}3^_iMn_9QGej!qH_pOhBK{IXAVt#JvS#uw`_0#7b`mlsNQ)^sy zMs}jc;A!5c8Mo~3qJ2)Sf1L#6Z=O5<>5B`=*{9r+_(E>w_dGEDk9|y0zvKiVzhBCEAXh+?d1orikSF2jhY4O>-I9sk) zs?n-fS}sK6M~)mi5R21aG!{E*{~~dEhEb%Qj{VEIVxg4JRqPae>o~PmE9GKgwdO5n zS1N^-a>dJ)jM;Z3lznp#2Np?_4^7eQrpRWJPglyTg-Xq%2NusULLZ*ze4$eH%B88t z#!L3eHZ?i%Q}Ky-Y~pa}esZJdJ0pTySW!C6$5RV&+L41iD@>3flF|I* zF*E%6FB=?IV`Z+{Zj*547Tz0PD6Cx{v4hfjF+G!BOlOkmtX(LFC}ds(`CKhmEz~ww zOrAn6u@ENwPmgnb?~%|=a^}0k&DWM#b0Z0HxN{LHH2zE--`gC~nnbiF4jfVFj%Xwq zV!r(9ClVVC3P1rU00nMBffLDv`Q~>vg}uHq>yK?kZfn0jkG21CaSTc<+OP989``b? z{`!*ftgoVGYIeZN=sxmGr} z-Od_cMriG5h?BVzZ9r83*Ljf*`X Kc0%kyv3~(?yb`ql literal 0 HcmV?d00001 diff --git a/tests/test_data/era5/era5_10m_v_component_of_wind_1996-1.nc b/tests/test_data/era5/era5_10m_v_component_of_wind_1996-1.nc new file mode 100644 index 0000000000000000000000000000000000000000..f5ba129fdb2409d1a264140a5f6ba69a48ac5830 GIT binary patch literal 33280 zcmeI5Piz}i5XPUqj_V}OpF$d-q>V`eg_5Q=gqBvSaAG@6MCw2S96;b&v71_Q?Z|e* zjau9|0*4|Yq+U330>q(0YB?ZIs7Fu{swfgCR4$+#Kw;*+8OJfHz|i)9zNdJ{``+x^ z*>C6lcAd+-HJhC2>Dt@n4~2}jO=;#$*Atb2A5VsUUzklU`oo|2M{J+BONR82R@>z_ zK6dHv=j}l`#|7S@798ZQFW|AE=B>w~mjdp4aEPrhAiJisST3y;YHPXtP3K_t3>hBu ze5UoRx^LY1{O*7o7q48f;rERBd;{F)_xXH!*=oH3j(B(E896v^CcDk#CWYA1%Ojp% z1RcL|-d)#+L3f7dooPfCyk%tJDR%YDfCKVro8R!eiX;nGSbA zdg@fl`ge(cLuA_j`qz&>b$X57X1#tBz22v@aML?^NA+Ue+ShAzXT3UUGS;`4t=XKU zuRR+#=bc=$O>*hlQR{{I2J*di2^Zg%R+6c8msW4f>eA(+GVQvPX~iM5FVpbOGIfYd zwZgf)oz!qBhWYwHc7S0uN+p}GcGG^`L!JnI#L1W;X_oi2z*x&nnlhkee-e1?U ziteOV$&Jze_SSmn9zGA@Cc>85X}fJ?Z{XIqWsW`DTI~%ax^vhqXRy6JNw?~|2^|cN znL@GnQnt93-w|63IX;q`TDP}xK3sJ7t&e~~Gi}jges1Xn^8mT^)8{_=u!KBQYg~3l zcB01MY2K$9x9#tteNL@^oCM@=UR?j|)rG|Dvu;UTkz4s44^eA-I%7&#zj|$J!Vh>Q zQ+vmm_9o|f3n^QBz2VyEC+$EmeiDHrprwZd{{ zrIKGMR|?sZF$b;%Gw<%>z``l=p(%RZ6q$75xk`C8U#S)7fyHx-(1+()&sVC2a_Phq z<0boKJ25%&^h9j@Z0^icCt?$^=)}?BgXBifcSZ!akfL;$kEa&ov?B+3R+s=oB(3?! zV`ljCpEfzH#>%YQZWDLrHr^XvD6Cx{v4hfjF*TD~Or;a4j9n;4C}ds(xoj<4&DXY8 zOpZb>u@ENocaL*@?~%|=V&>bU&DR#MyOFp!+`Wht8h<3G_cupWmx$`(z!3%Sibj$l z=8G?Hkl1KY017|>DDW>7IGI?OZ+>S}*y}5^{@7OJw)X4uSo@z8$AHA5{W?$MaWCWQ zuTOdPe_bZle=umj`s==0^_SN<`E_NCo>$(b8vQy?cT1c)Pq?LD=jmzbSO4CYe)aEb z>DRX#d1Y?Iqx0-;@z;6mZx0&N>(zPqrHh;Pt3SVcbJKqH=XY>!+Q0pUivFdc7wFuu zg#u6j3P1rU00p1`6o3Ly017~X|Ca)Bxg1z01ayE7tN`)=d4N2?Isga20dN2u00+PU zZ~z-cF|JFU~Fb@{XopVr;eI(u4IPwVJu-Mm4G(-tSD@Au0-*UIL$ z*BN808`Jk8+21F2zgVu>%q?QK#yli;MC_>8F|iMeeMIa5v5$&A355XaXaah=5ZD5Qawv@uDbP}0Dt%j4~LDGO)2J8*AkNxKb;Eyu`ruj^hZASkJz@rE;*%_wAwDe z@v%#PKOYatJ>KRDmC!wW^aTTKs03GQYn4xft`Ztz>kCTNbQa5{6|c6QFI;yHX3vn} zlL4P;y{*=b%g?4@_MX{%&8FWo=JO4348PCk)5lgD7~qU|N1rsVC1EDJ&EzJf*wM>p z0{Rei{l-Q2TpxrunlaBf(?~3I!^9%f>>4lwCPE*R%@k_O`E!%Wa=DWCO1WC0YWnH^ z?%AU@QTGs}h}?uteWCb@L(s0~E81G#T?cr_0z4oRi%hSeJ~yL7s!OuKI9u;LWjmuYxs znK~#K2dsIk1sgR$W3QLSJPlSI^C@rD%Wog;a+NXb9}WJ}JPjHrzIo^5iWB>LpKp?y zjX&`BxvZi)sa0}gw7tE#9=?n3L$ryqrE=CD8>2UP<8GN_&$ec}B2g^|;1O)QC*@{+ zH=%>!G2<1B&*zHkg&ncQkmD=4sCBiCi;<%1w>|;}McSgp{M^#BW-qz*(}yX|5^^NX zN3D&>PE;5?&2@^nt-goqJhcAxARvG9+=WXoEhJ~3a#P}S>B{f;6qUBa8B@CQ)tRk; z@9|8g@}@KG1>}c~P-g_`k^XyU81uv6)mOGsJHk+=(n#%uyOY3vz2w!ZR&$zsc3hk< zuUD$ks#jVrMB~SgA3Yq4(_b_eJ7NDKae9YQrk#%c%ei8ql+RUc2<|!_T5FYZv9MP2 zma{9B!b-W~t)i6ylZ2Yb$bF+x2~{?T?9LP;$|BjnjDD%X#(J zx4inlEP?eO4BD>#T34(7vYeA&SH|dlWi8cc*EroRd1{C;$bZ02F`%Pyh-*f&Y~P;e>k}Sm?kCU>sl^U>sl`fCJzFH~!9%&dPoM6~r~B~fzI(dQp6;us`{?Pud4m$4_r&IMuQPV9Z2r1W z>UMWy`aUG}{bCP@|wDJ IVvmUZ3)n|K%K!iX literal 0 HcmV?d00001 diff --git a/tests/test_data/era5/era5_surface_pressure_1996-1.nc b/tests/test_data/era5/era5_surface_pressure_1996-1.nc new file mode 100644 index 0000000000000000000000000000000000000000..a69a10354ad6456d8206c9b9801d14c97b8a6409 GIT binary patch literal 33280 zcmeI5O>A355XZ+Kar5DP6w&}~+L$CzC~2BNXlbPgCyvuZqz)v&0R*lUyQz_DN48UL zG(ucB0tb)~5*Ln~0C7M_g#$tyP>-M@Q~?qvR4$+#Kw)Ng#&Jw4NNIaO|CR0W?#`Px zZ+7S{K3)hp1RUXsaB zfErI~eQI&4X8o&)SN1vg+Xn-IZkY-O0s*^hlc8?$cz5hk@*{B?YooCZHqz7~Gax^UUvF&T==0?OK2dex_unB=zR)9N+dUAA2;OuKGoT8Rrg4^!XvVQOMy zVz8P&mvdG_oO>Od^PH@f^HceA`Rvwg*Q1D5Kj`_TejC&#zIkhM*$4akKwylADH!_u zUbdk7P+Q1}YXz4+o+-cU+ZY`J zkLi4&@N}lIn%fpz0y#NSnn~YVzYs0>b?ancu%%5}%*`%7Nq4bY7r*b|w}enEnLj3- zMN#4=;HmGk#VviIy`-#l@UApUaE0rPXpJ zR>>EaaTkM?Al`HvDas0ud zqPw$=j}1REJUVnXd*qsLusEJ&n+Tp#|nC{ps+wBWqBua z)c5Jn8{$@NXRf(nihFSb*G4vq#IBRs!D+paoK7wzQ;B5SZIq+zvh0Curkbhbs+&6| z%Ptos5Y+rnw{v^&QP(Lk{q3>(Ym2Y>nYb$4I*$w*e@;yvsP|}1Jz7%*@hEajVx$>D zpMUWq2OA9$Km-s0MBq9ia56DJSO3gru(wBM`?YPzZ5+4zv5h~ZiedG|#?4OccCYiM zZ+Cgqe^CwF9}G5b`qnpT`uaGhzpjYvy!tFv8#g;`4St%PXv4VKX>S-e{f>rl)9-8; zx2GF@WUl#Rc6K-Dn;rMtgXipe&5k^DNwaa&muGKjHg5XvAD6oEtq)ZEmxf)SI~pH| z03v`0AOeU0B7g`W0*C-2fCwN0|EmbRupW2U1Bp0DA|MZt2gn2L18@Kw00+PUZ~z)?qpm9U7U=IpevJc5nPx)?nF#wT;*w%JQ;8bzId>F zwYq{ zoS47gseP%HEp4wCV`eg_5Q=gqBu{aAG@6MCw2S96;b&v71_Q?Z|e@ z0f}1NI06Td5E2)ToB(k^NQDF9gnFbZLJ=TwLgfO=0TgE5n{gbI3S8PA(DxMY_`RK- zo&EN`->!3+*XNS6Jze{|e4&uhwk6HH=0>8j8JC% zYmeAO-9?ZhauPP-hq>=JK}wc<6ARPJ%jQnn!E*cPlbk=DN-xhZrZdZl>4lUDkMX$Q zbhrc3)2CC`zf1fZ8dLw9-#+}r={0(j_4-}(dY8_^O>g8a)r)m&U$4>K_3ETxtZy+} zvs043_H15TaB|Hx$)#&Ytv}2+kngQdujXmRA*t2fw0c8Umo68TY0s^kR-8inG7ax8 zQwIg(fVIN;yp8Ioxz|f`o+hi#`DEdIA-8k3%TdOxe>C_@^D=1c`1YNdRVViMo;OLu z#^?Y0T2|4W)GE0#I^Nz|58c7%A=*UPQafX}joBNx{%x6O&$d>(B2g^|;2CUhPtvXW zZbApcW42H%K9?=7=Xb>xLynK+rq=Z~FNTZmzV#6>DAG197Uq|qG53&LKYf_eEFnkI zeAL>A>_m;h)7+|egR|?sZF^8@OGjAQ>#KI}^p%A?;L?)ehs#0FdS84@%U~!BQ`f!{J`AW4= zE=@f$Ub0WNsmY1QCt~C0a%Uf#icQ3#6UT%1k{dnV84=tbgoo12cSKfi->)A5}zRP-+my+G%N zEfjzPPyh-*0Vn_kpa2wr0#E=7{I3)kh`Y;yg$}F$<^kpb<^k3LH~eRqBbIh~*1;7I=C4{;Q0oY4-9W7q z*jg7*>;7q-KdtMhb^Nrfo!05ox_nxPPwVb!ojt9qr*-tSZr-5e$u+UL-R+F6mCaxG zOW#&Ertd@2KOpv?SgzU3En>IE+$VNK?5Nl=vG|wD~X4|%Lu}8#C Ih&?LyFNiVEW&i*H literal 0 HcmV?d00001 diff --git a/tests/test_data/era5/era5_surface_thermal_radiation_downwards_1996-1.nc b/tests/test_data/era5/era5_surface_thermal_radiation_downwards_1996-1.nc new file mode 100644 index 0000000000000000000000000000000000000000..2528c6961e087e0fa3a0281e1fbf9b38e40c971c GIT binary patch literal 33280 zcmeI5O>7%g5XYZC;yQ`*QAk56X=9Q=p`@t|p{11~oY+nikvfn72N1Yc?4~QZc4Rx{ zMlEg}fdfc@#Dyag5+Dw!QsIC&p&mg+r~)KTs9ZoffWpjsGmc|YflJ#1`ai`xes5=I zXaD=&Z`ZlZt8=N@p00ge{&3i6+mvQraU(I=@zcrhAB%ITC4b~&|A-w6?2=u2NUQDg z8y~y$_w)9Uoa0UIPz&9~TVF81hFY*%tK>fmx=v__tuH8D(^)K+R=wJKzHrSsm_0*= zcLsc>^|abIE64m2mCh{BFJ`jK$?1i(iHz}h zz;w6+GSjEh*1t>q8yZvp>)$^3#OXD9llA&t^m?1l!cA}FE!B&4YhSO?o%QOZV61O3 zSF=-+zV>WfTX1sCHOZxGM{OX&H<0hGPOs)^#UZKH-L!g5R+lapm1)&sS(A?{#IZu;S=X}aL=jFH0b~(zJ^^XRBXxCV$#gOA8xv6!%jSG>YyKj9242rZxi-q~+r_J5u)=wX%G)u^l zG#|A#B0EuI@HF=+;hsk4*OP$!%`@jeePJ;<_k>##pUbWMj(1UOJDf44%U`{? z74ThN$<$tVroDiCu@Sl%LAs>>elv{uVerb!Td5skC{t^scEX)WV834SYE`Q_Ej~Lh z&X(6J)o9f#trVj1YLO8jGE?XXEWa})0kFBI0UkJv%edMQ1dUP@<@>8xESM=51q1NmGnS1r`G zR!p8!F0l|M{7;W_eeIFdO>*|zW6jr=sJlH0akzaMDK!2}%^YaXs4f}R#ep*l-4=~x zhL|tD{E@^)g918aiRI0O<}LE%=%kfk=r`1&tn~bL>z;Xi;ioY#^YZ0 ztG_bgoo12cSKfi->)A8*uRP-+my+G%N zEfjzPPyh-*0Vn_kpa2wr0#E=7{I3+)n{bx{3msSi%md5=%mb_gZ~zvPWUs@Th7>oW;^{Ci?AiM=Y;Bol^PRBS@*X|c4+XB}MeVE(Fg1+|W#)(zA; zfvt4`weFwR`O~_7TE|bz+G(9Wt;?r%__Xex*4fj#dRj+M>*ftgo?H`~+g@jEt!)0f zPx`jHF?}DB{(i9s#B$AMZc)26=3cQQVn@Y}iM>zk{bCP_eL(DkVh@R>GTXL|i#;rM JLhKQ-e*szL{}ccK literal 0 HcmV?d00001 diff --git a/tests/test_data/era5/properties.json b/tests/test_data/era5/properties.json new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_datasets/test_era5.py b/tests/test_datasets/test_era5.py new file mode 100644 index 0000000..c3908b5 --- /dev/null +++ b/tests/test_datasets/test_era5.py @@ -0,0 +1,218 @@ +"""Unit test for ERA5 dataset.""" + +import json +from pathlib import Path +from unittest.mock import patch +import numpy as np +import pytest +import xarray as xr +from zampy.datasets import era5 +from zampy.datasets.dataset_protocol import SpatialBounds +from zampy.datasets.dataset_protocol import TimeBounds +from . import data_folder + + +@pytest.fixture(scope="function") +def valid_path_cds(tmp_path_factory): + """Create a dummy .cdsapirc file.""" + fn = tmp_path_factory.mktemp("usrhome") / ".cdsapirc" + with open(fn, mode="w", encoding="utf-8") as f: + f.write("url: a\nkey: 123:abc-def") + return fn + + +@pytest.fixture(scope="function") +def dummy_dir(tmp_path_factory): + """Create a dummpy directory for testing.""" + return tmp_path_factory.mktemp("data") + + +class TestERA5: + """Test the ERA5 class.""" + + @patch("cdsapi.Client.retrieve") + def test_download(self, mock_retrieve, valid_path_cds, dummy_dir): + """Test download functionality. + Here we mock the downloading and save property file to a fake path. + """ + times = TimeBounds(np.datetime64("2010-01-01"), np.datetime64("2010-01-31")) + bbox = SpatialBounds(54, 56, 1, 3) + variable = ["10m_v_component_of_wind"] + download_dir = Path(dummy_dir, "download") + + era5_dataset = era5.ERA5() + # create a dummy .cdsapirc + patching = patch("zampy.datasets.utils.CDSAPI_CONFIG_PATH", valid_path_cds) + with patching: + era5_dataset.download( + download_dir=download_dir, + time_bounds=times, + spatial_bounds=bbox, + variable_names=variable, + overwrite=True, + ) + + # make sure that the download is called + mock_retrieve.assert_called_once_with( + "reanalysis-era5-single-levels", + { + "product_type": "reanalysis", + "variable": variable, + "year": "2010", + "month": "1", + # fmt: off + "day": [ + "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", + "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", + "21", "22", "23", "24", "25", "26", "27", "28", "29", "30", + "31", + ], + "time": [ + "00:00", "01:00", "02:00", "03:00", "04:00", "05:00", "06:00", + "07:00", "08:00", "09:00", "10:00", "11:00", "12:00", "13:00", + "14:00", "15:00", "16:00", "17:00", "18:00", "19:00", "20:00", + "21:00", "22:00", "23:00", + ], + # fmt: on + "area": [ + bbox.north, + bbox.west, + bbox.south, + bbox.east, + ], + "format": "netcdf", + }, + ) + + # check property file + with (download_dir / "era5" / "properties.json").open( + mode="r", encoding="utf-8" + ) as file: + json_dict = json.load(file) + # check property + assert json_dict["variable_names"] == variable + + def ingest_dummy_data(self, temp_dir): + """Ingest dummy tif data to nc for other tests.""" + era5_dataset = era5.ERA5() + era5_dataset.ingest(download_dir=data_folder, ingest_dir=Path(temp_dir)) + ds = xr.load_dataset( + Path( + temp_dir, + "era5", + "era5_10m_v_component_of_wind_1996-1.nc", + ) + ) + + return ds, era5_dataset + + def test_ingest(self, dummy_dir): + """Test ingest function.""" + ds, _ = self.ingest_dummy_data(dummy_dir) + assert type(ds) == xr.Dataset + + def test_load(self): + """Test load function.""" + times = TimeBounds(np.datetime64("1996-01-01"), np.datetime64("1996-01-02")) + bbox = SpatialBounds(39, -107, 37, -109) + variable = ["10m_v_component_of_wind"] + + era5_dataset = era5.ERA5() + + ds = era5_dataset.load( + ingest_dir=Path(data_folder), + time_bounds=times, + spatial_bounds=bbox, + variable_names=variable, + resolution=1.0, + regrid_method="flox", + ) + + # we assert the regridded coordinates + expected_lat = [37.0, 38.0, 39.0] + expected_lon = [-109.0, -108.0, -107.0] + + np.testing.assert_allclose(ds.latitude.values, expected_lat) + np.testing.assert_allclose(ds.longitude.values, expected_lon) + + def test_convert(self, dummy_dir): + """Test convert function.""" + _, era5_dataset = self.ingest_dummy_data(dummy_dir) + era5_dataset.convert(ingest_dir=Path(dummy_dir), convention="ALMA") + # TODO: finish this test when the function is complete. + + +def test_convert_to_zampy(dummy_dir): + """Test function for converting file to zampy format.""" + ingest_folder = Path(data_folder, "era5") + era5.convert_to_zampy( + ingest_folder=Path(dummy_dir), + file=Path(ingest_folder, "era5_10m_v_component_of_wind_1996-1.nc"), + overwrite=True, + ) + + ds = xr.load_dataset(Path(dummy_dir, "era5_10m_v_component_of_wind_1996-1.nc")) + + assert list(ds.data_vars)[0] == "northward_component_of_wind" + + +def test_parse_nc_file_10m_wind(): + """Test parsing netcdf file function with 10 meter velocity u/v component.""" + variables = { + "10m_v_component_of_wind": "northward_component_of_wind", + "10m_u_component_of_wind": "eastward_component_of_wind", + } + for variable in variables: + ds = era5.parse_nc_file(data_folder / "era5" / f"era5_{variable}_1996-1.nc") + expected_var_name = variables[variable] + assert list(ds.data_vars)[0] == expected_var_name + assert ds[expected_var_name].attrs["units"] == "meter_per_second" + + +def test_parse_nc_file_radiation(): + """Test parsing netcdf file function with surface radiation.""" + variables = { + "surface_thermal_radiation_downwards": "strd", + "surface_solar_radiation_downwards": "ssrd", + } + for variable in variables: + ds_original = xr.load_dataset( + data_folder / "era5" / f"era5_{variable}_1996-1.nc" + ) + ds = era5.parse_nc_file(data_folder / "era5" / f"era5_{variable}_1996-1.nc") + + assert list(ds.data_vars)[0] == variable + assert ds[variable].attrs["units"] == "watt_per_square_meter" + assert np.allclose( + ds_original[variables[variable]].values, + ds[variable].values * 3600, + equal_nan=True, + ) + + +def test_parse_nc_file_precipitation(): + """Test parsing netcdf file function with precipitation.""" + ds_original = xr.load_dataset( + data_folder / "era5" / "era5_mean_total_precipitation_rate_1996-1.nc" + ) + ds = era5.parse_nc_file( + data_folder / "era5" / "era5_mean_total_precipitation_rate_1996-1.nc" + ) + expected_var_name = "total_precipitation" + + assert list(ds.data_vars)[0] == expected_var_name + assert ds["total_precipitation"].attrs["units"] == "millimeter_per_second" + assert np.allclose( + ds_original["mtpr"].values, + ds["total_precipitation"].values * era5.WATER_DENSITY / 1000, + equal_nan=True, + ) + + +def test_parse_nc_file_pressure(): + """Test parsing netcdf file function with surface pressure.""" + ds = era5.parse_nc_file(data_folder / "era5" / "era5_surface_pressure_1996-1.nc") + expected_var_name = "surface_pressure" + + assert list(ds.data_vars)[0] == expected_var_name + assert ds["surface_pressure"].attrs["units"] == "pascal" diff --git a/tests/test_utils.py b/tests/test_utils.py index f77ad62..5239c2c 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -3,7 +3,11 @@ import tempfile from pathlib import Path from unittest.mock import patch +import numpy as np +import pytest from zampy.datasets import utils +from zampy.datasets.dataset_protocol import SpatialBounds +from zampy.datasets.dataset_protocol import TimeBounds def test_tqdm_update(): @@ -66,3 +70,71 @@ def test_download_url(mock_urlretrieve): utils.download_url(url, fpath, overwrite) # assrt that the urlretrieve function is called. assert mock_urlretrieve.called + + +@pytest.fixture(scope="function") +def valid_path_cds(tmp_path_factory): + """Create a dummy .cdsapirc file.""" + fn = tmp_path_factory.mktemp("usrhome") / ".cdsapirc" + with open(fn, mode="w", encoding="utf-8") as f: + f.write("url: a\nkey: 123:abc-def") + return fn + + +@patch("cdsapi.Client.retrieve") +def test_cds_request(mock_retrieve, valid_path_cds): + """ "Test cds request for downloading data from CDS server.""" + product = "reanalysis-era5-single-levels" + variables = ["10m_v_component_of_wind"] + time_bounds = TimeBounds( + np.datetime64("2010-01-01T00:00:00"), np.datetime64("2010-01-31T23:00:00") + ) + spatial_bounds = SpatialBounds(54, 56, 1, 3) + path = Path(__file__).resolve().parent + overwrite = True + + # create a dummy .cdsapirc + patching = patch("zampy.datasets.utils.CDSAPI_CONFIG_PATH", valid_path_cds) + with patching: + utils.cds_request( + product, variables, time_bounds, spatial_bounds, path, overwrite + ) + + mock_retrieve.assert_called_with( + product, + { + "product_type": "reanalysis", + "variable": variables, + "year": "2010", + "month": "1", + # fmt: off + "day": [ + "01", "02", "03", "04", "05", "06", "07", "08", "09", "10", + "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", + "21", "22", "23", "24", "25", "26", "27", "28", "29", "30", + "31", + ], + "time": [ + "00:00", "01:00", "02:00", "03:00", "04:00", "05:00", "06:00", + "07:00", "08:00", "09:00", "10:00", "11:00", "12:00", "13:00", + "14:00", "15:00", "16:00", "17:00", "18:00", "19:00", "20:00", + "21:00", "22:00", "23:00", + ], + # fmt: on + "area": [ + spatial_bounds.north, + spatial_bounds.west, + spatial_bounds.south, + spatial_bounds.east, + ], + "format": "netcdf", + }, + ) + + +def test_time_bounds_to_year_month(): + """Test year and month pair converter function.""" + times = TimeBounds(np.datetime64("2010-01-01"), np.datetime64("2010-01-31")) + expected = [("2010", "1")] + year_month_pairs = utils.time_bounds_to_year_month(times) + assert expected == year_month_pairs