Move to new cds #62

Open
wants to merge 39 commits into base: main
Changes from 29 commits
39 commits
2950aee
edit the doc due to new cds api
SarahAlidoost Oct 16, 2024
2c3dac3
add a version to cdsapi in pyproject
SarahAlidoost Oct 16, 2024
bba68a9
fix regrid.most_common due to changes in version xarray_regrid v0.4.0
SarahAlidoost Oct 16, 2024
b63505f
fix the use of regrid.most_common in land_cover.py due to changes in …
SarahAlidoost Oct 18, 2024
c17eb37
fix ruff error
SarahAlidoost Oct 18, 2024
1b99791
rename valid_time dim to time in ecmwf_dataset, see issue 59
SarahAlidoost Oct 18, 2024
692bae5
unify chunks before regrid
SarahAlidoost Oct 18, 2024
dcb9bb0
fix calculating freq in recipe
SarahAlidoost Oct 18, 2024
5ce238c
fix version syntax in fapar_lai
SarahAlidoost Oct 18, 2024
47b57ce
fix version in land_cover cds request
SarahAlidoost Oct 18, 2024
00aa356
fix version in test_fapar_lai
SarahAlidoost Oct 18, 2024
d9ab924
fix version in test_land_cover
SarahAlidoost Oct 18, 2024
4e6fe3e
fix version of land_cover request in test_cds_utils
SarahAlidoost Oct 18, 2024
ddb7c58
add spatial bounds for landcover, fix spatial bounds in target grid i…
SarahAlidoost Oct 21, 2024
868bc0e
increase timeout in cds request
SarahAlidoost Oct 21, 2024
1459245
fix test cds_utils
SarahAlidoost Oct 21, 2024
d0d57a7
fix mypy error
SarahAlidoost Oct 21, 2024
71c2224
remove parallel from lai to avoid segmentation fault error
SarahAlidoost Oct 21, 2024
75e5a67
update doc
SarahAlidoost Oct 21, 2024
5b82327
reduce time range in recipe of stemmus_scope
SarahAlidoost Oct 21, 2024
75df274
replace frequency H with h due to deprecation errors of pandas
SarahAlidoost Oct 21, 2024
f4a5882
add a few more checks in tests
SarahAlidoost Oct 21, 2024
540b9e7
fix linter error
SarahAlidoost Oct 21, 2024
3ae6603
fix ruff errors
SarahAlidoost Oct 21, 2024
5ba1f7f
fix a minor thing
SarahAlidoost Oct 21, 2024
42edfc3
add more tests for frequency in recipe
SarahAlidoost Oct 22, 2024
1eab581
remove unused import from test_recipe
SarahAlidoost Oct 22, 2024
49637d1
fix ruff format
SarahAlidoost Oct 22, 2024
c24bbf7
refactor get_url_size
SarahAlidoost Oct 22, 2024
0cab2df
Update docs/index.md
SarahAlidoost Oct 25, 2024
b3bdeb7
update test dataset, and related tests
SarahAlidoost Oct 30, 2024
56c5a5b
change freq M to ME in cds_utils
SarahAlidoost Oct 30, 2024
0f70d77
add acheck before ds coarsen in eth_canopy_height
SarahAlidoost Oct 30, 2024
8b6070f
fix tests init
SarahAlidoost Oct 30, 2024
09ddc53
use dask unique instead of numpy unique
SarahAlidoost Nov 15, 2024
d03b0f4
fix linter errors
SarahAlidoost Nov 15, 2024
ab1cbc3
fix ruff format errors
SarahAlidoost Nov 15, 2024
09d38f6
add parallel=True to FaparLAI
SarahAlidoost Nov 15, 2024
eca46d1
fix get unique values in land cover
SarahAlidoost Nov 15, 2024
23 changes: 15 additions & 8 deletions docs/configuration.md
@@ -23,15 +23,25 @@ The configuration file should contain the `working_directory`, for instance:
working_directory: /path_to_a_working_directory/ #for example: /home/bart/Zampy
```

If you need access to data on CDS or ADS server, you should add your CDS or ADS credentials to `zampy_config.yml`:
The old Climate Data Store (CDS) was shut down on 3 September 2024. For more
information, see
[the-new-climate-data-store-beta](https://forum.ecmwf.int/t/the-new-climate-data-store-beta-cds-beta-is-now-live/3315).
To use the new CDS/ADS, you need an ECMWF account; your existing CDS/ADS
credentials will not work.

If you need access to data on the CDS or ADS servers, you should add your CDS/ADS
credentials to `zampy_config.yml`. To find your key, go to [CDS how to
api](https://cds.climate.copernicus.eu/how-to-api), or [ADS how to
api](https://ads.atmosphere.copernicus.eu/how-to-api). You can skip the steps
related to `.cdsapirc` and simply add the key to `zampy_config.yml`:

```yaml
cdsapi:
url: # for example https://cds.climate.copernicus.eu/api/v2
key: # for example 12345:xhashd-232jcsha-dsaj429-cdjajd29319
url: # for example https://cds.climate.copernicus.eu/api
key: # for example xhashd-232jcsha-dsaj429-cdjajd29319
adsapi:
url: # for example https://ads.atmosphere.copernicus.eu/api/v2
key: # for example 12345:xhashd-232jcsha-dsaj429-cdjajd29319
url: # for example https://ads.atmosphere.copernicus.eu/api
key: # for example xhashd-232jcsha-dsaj429-cdjajd29319
```
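
As an optional sanity check (not part of zampy itself), the same credentials can be passed directly to `cdsapi`. This assumes `cdsapi>=0.7.2`, which accepts the new-style key without the old `UID:` prefix; the values below are placeholders:

```python
import cdsapi

# Placeholder values: use the url and key you put in zampy_config.yml.
client = cdsapi.Client(
    url="https://cds.climate.copernicus.eu/api",
    key="xhashd-232jcsha-dsaj429-cdjajd29319",
)
print(client.url)  # confirms the client points at the new endpoint
```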

## Instructions for CDS/ADS datasets
@@ -45,9 +55,6 @@ To download the following datasets, users need access to CDS/ADS via `cdsapi`/`a
- ADS
- CAMS EGG4 (e.g. co2)

To generate these API keys, you need to be a registered user on *CDS* via the [registration page](https://cds.climate.copernicus.eu/user/register?destination=%2F%23!%2Fhome), or on *ADS* via the [registration page](https://ads.atmosphere.copernicus.eu/user/register?destination=%2F%23!%2Fhome).

Before submitting any request with `zampy`, please put your `cdsapi`/`adsapi` credentials in `zampy_config.yml`. Here is a short [instruction](https://cds.climate.copernicus.eu/api-how-to) about how to find your CDS/ADS API key. You can skip the steps related to `.cdsapirc` and simply add the key to `zampy_config.yml`.

### Agree to the Terms of Use on CDS/ADS

10 changes: 9 additions & 1 deletion docs/index.md
@@ -52,7 +52,7 @@ download:
cams:
variables:
- co2_concentration

convert:
convention: ALMA
frequency: 1H # outputs at 1 hour frequency. Pandas-like freq-keyword.
@@ -67,6 +67,14 @@ When you have your recipe created and saved on your disk, you can execute your
zampy /path_to_recipe/sample_recipe.yml
```

>NOTE: You may receive an error message from CDS/ADS if not all the required
>licences have been accepted. Follow the instructions in the error message to
>accept the licences and run the recipe again.
SarahAlidoost marked this conversation as resolved.

When the downloading process starts, you can also check the status of your requests
in your CDS/ADS profile.


### Interact with `zampy` in notebooks

It is possible to use `zampy` directly in Python via its Python API. This is not recommended, as it is more difficult to reproduce the workflow if there is no recipe.
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -62,7 +62,7 @@ dependencies = [
"pint",
"cf_xarray", # required to auto-pint CF compliant datasets.
"pint-xarray",
"cdsapi",
"cdsapi>=0.7.2",
"xarray-regrid", # for regridding
]
dynamic = ["version"]
4 changes: 2 additions & 2 deletions recipes/STEMMUS_SCOPE_input.yml
@@ -2,7 +2,7 @@
name: "STEMMUS_SCOPE_input"

download:
time: ["2020-01-01", "2020-06-30"]
time: ["2020-01-01", "2020-02-15"]
bbox: [60, 10, 50, 0] # NESW
datasets:
era5_land:
@@ -37,5 +37,5 @@ download:

convert:
convention: ALMA
frequency: 1H # outputs at 1 hour frequency. Pandas-like freq-keyword.
frequency: 1h # outputs at 1 hour frequency. Pandas-like freq-keyword.
resolution: 0.25 # output resolution in degrees.
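
The change from `1H` to `1h` follows the pandas deprecation of uppercase offset aliases (pandas 2.2 and later); a quick illustration of the two spellings:

```python
import pandas as pd

# "1h" is the current spelling of the hourly frequency alias.
pd.date_range("2020-01-01", periods=3, freq="1h")

# freq="1H" still works for now, but emits a FutureWarning on pandas >= 2.2.
```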
23 changes: 18 additions & 5 deletions src/zampy/datasets/cds_utils.py
@@ -35,10 +35,10 @@
"01", "02", "03", "04", "05", "06", "07", "08", "09", "10",
"11", "12", "13", "14", "15", "16", "17", "18", "19", "20",
"21", "22", "23", "24", "25", "26", "27", "28", "29", "30",
"31",
"31",
] # fmt: skip

ALL_HOURS = [
ALL_HOURS = [
"00:00", "01:00", "02:00", "03:00", "04:00", "05:00", "06:00",
"07:00", "08:00", "09:00", "10:00", "11:00", "12:00", "13:00",
"14:00", "15:00", "16:00", "17:00", "18:00", "19:00", "20:00",
@@ -97,11 +97,13 @@ def cds_request(

url, api_key = cds_api_key(fname)

# TODO: expose timeout, see issue 64
c = cdsapi.Client(
url=url,
key=api_key,
verify=True,
quiet=True,
timeout=300,
)
# choose retrieve function
retrieve_func = RETRIEVE_FUNCTION[fname]
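
The timeout is hard-coded to 300 seconds for now (see the TODO referencing issue 64). Purely as an illustration of how it might later be exposed, not what this PR implements, a sketch could look like:

```python
import cdsapi

def make_cds_client(url: str, api_key: str, timeout: int = 300) -> cdsapi.Client:
    # Hypothetical helper: same arguments as above, with the timeout configurable.
    return cdsapi.Client(
        url=url,
        key=api_key,
        verify=True,
        quiet=True,
        timeout=timeout,
    )
```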
@@ -124,7 +126,8 @@ def cds_request_land_cover(
dataset: str,
time_bounds: TimeBounds,
path: Path,
overwrite: bool,
spatial_bounds: SpatialBounds | None = None,
overwrite: bool = False,
) -> None:
"""Download land cover data via CDS API.

@@ -136,6 +139,7 @@
dataset: Dataset name for retrieval via `cdsapi`.
time_bounds: Zampy time bounds object.
path: File path to which the data should be saved.
spatial_bounds: Zampy spatial bounds object.
overwrite: If an existing file (of the same size!) should be overwritten.
"""
fname = PRODUCT_FNAME[dataset]
@@ -152,18 +156,27 @@
years_months = time_bounds_to_year_month(time_bounds)
years = {year for (year, _) in years_months}

if spatial_bounds is not None:
area = [
spatial_bounds.north,
spatial_bounds.west,
spatial_bounds.south,
spatial_bounds.east,
]

for year in tqdm(years):
if int(year) < 2016:
version = "v2.0.7cds"
version = "v2_0_7cds"
SarahAlidoost marked this conversation as resolved.
else:
version = "v2.1.1"
version = "v2_1_1"
r = c.retrieve(
dataset,
{
"variable": "all",
"format": "zip",
"year": year,
"version": version,
"area": area,
},
)
fpath = path / f"{fname}_LCCS_MAP_300m_{year}.zip"
10 changes: 9 additions & 1 deletion src/zampy/datasets/ecmwf_dataset.py
@@ -120,11 +120,19 @@ def load(
files += (ingest_dir / self.name).glob(f"{self.name}_{var}*.nc")

ds = xr.open_mfdataset(files, chunks={"latitude": 200, "longitude": 200})
ds = ds.sel(time=slice(time_bounds.start, time_bounds.end))

# rename valid_time to time
if "valid_time" in ds.dims:
ds = ds.rename({"valid_time": "time"})

ds = ds.sel(time=slice(time_bounds.start, time_bounds.end))
grid = xarray_regrid.create_regridding_dataset(
make_grid(spatial_bounds, resolution)
)

# this is needed before regrid
ds = ds.unify_chunks()
SarahAlidoost marked this conversation as resolved.

ds = ds.regrid.linear(grid)

return ds
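
A small standalone illustration of the `valid_time` rename guard above, using a toy dataset that mimics the new CDS netCDF output (the real files come via `cdsapi`):

```python
import numpy as np
import pandas as pd
import xarray as xr

# Toy dataset: the new CDS files name the time dimension "valid_time".
ds = xr.Dataset(
    {"t2m": ("valid_time", np.random.rand(3))},
    coords={"valid_time": pd.date_range("2020-01-01", periods=3, freq="1h")},
)

if "valid_time" in ds.dims:
    ds = ds.rename({"valid_time": "time"})

assert "time" in ds.dims and "valid_time" not in ds.dims
```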
5 changes: 2 additions & 3 deletions src/zampy/datasets/fapar_lai.py
@@ -150,8 +150,7 @@ def load(
variable_names: list[str],
) -> xr.Dataset:
files = list((ingest_dir / self.name).glob("*.nc"))

ds = xr.open_mfdataset(files, parallel=True)
ds = xr.open_mfdataset(files)
Contributor:
The parallel kwarg was to ensure the dataset is opened as Dask arrays, to avoid any memory issues. Will that go OK now?

Member Author:
If the parallel kwarg is True, the open and preprocess steps will be performed in parallel using Dask. So, we need to configure Dask with the number of jobs or CPUs for parallel processing. Otherwise dask.distributed.Client() will use all available cores by default, and this causes a segmentation fault error. This is the case in cli.py where n_workers is not set, see here.

Contributor:
OK. I do think that making use of dask.distributed is quite important for performance. You can configure the defaults with a config file https://docs.dask.org/en/latest/configuration.html#specify-configuration

> dask.distributed.Client() will use all available cores by default and this causes a segmentation fault error

I guess on some systems? Which seems like something else should be causing the segfault. On my laptop it's no problem starting as many workers as cpu threads, and it's the default behavior of dask for a reason.

Member Author:
Thanks for explaining. I did some tests, and it looks like the Dask configuration didn’t fix the issue. When I added parallel=True back, the segmentation fault errors showed up again on macOS and Linux, but not on Windows, see GA workflows below. We only use parallel=True for the fapar_lai.py and not for others. Do you know why it’s needed? After refactoring the code as below, the tests are passing locally. What do you think?

```python
client = Client()
ds = xr.open_mfdataset(files, parallel=True)
client.close()
```
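
A slightly more explicit variant of that idea is to size the Dask cluster by hand so that `parallel=True` cannot oversubscribe the machine; the worker counts and the path below are assumptions for illustration, not part of this PR:

```python
from pathlib import Path

import xarray as xr
from dask.distributed import Client

files = sorted(Path("/path/to/ingest/fapar-lai").glob("*.nc"))  # placeholder path

# A small, explicitly sized local cluster for the parallel open/decode step.
client = Client(n_workers=2, threads_per_worker=1)
try:
    ds = xr.open_mfdataset(files, parallel=True)
finally:
    client.close()
```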

ds = ds.sel(time=slice(time_bounds.start, time_bounds.end))

grid = xarray_regrid.create_regridding_dataset(
@@ -223,7 +222,7 @@ def download_fapar_lai(
"format": "zip",
"variable": "lai",
"horizontal_resolution": "1km",
"product_version": "V3",
"product_version": "v3",
"satellite": "spot" if year < 2014 else "proba",
"sensor": "vgt",
"month": f"{month:0>2}",
50 changes: 39 additions & 11 deletions src/zampy/datasets/land_cover.py
@@ -82,6 +82,7 @@ def download(
cds_utils.cds_request_land_cover(
dataset=self.cds_dataset,
time_bounds=time_bounds,
spatial_bounds=spatial_bounds,
path=download_folder,
overwrite=overwrite,
)
@@ -134,16 +135,30 @@ def load(
)
raise ValueError(msg)
files = list((ingest_dir / self.name).glob(f"{self.name}_*.nc"))

ds = xr.open_mfdataset(files, chunks={"latitude": 200, "longitude": 200})
ds = ds.sel(time=slice(time_bounds.start, time_bounds.end))

grid = xarray_regrid.create_regridding_dataset(
utils.make_grid(spatial_bounds, resolution)
)
ds = ds.regrid.most_common(grid, time_dim="time", max_mem=1e9)

return ds
ds_regrid = {}
for variable in variable_names:
# select the variable to be regridded
da = ds[variable]

# get values for most common method
if "flag_values" in da.attrs:
regrid_values = da.attrs["flag_values"]
else:
regrid_values = np.unique(da.values)
Contributor:
Do note that this is a very expensive operation, as every single value in the dataarray has to be checked...

It might be better to drop some variables or handle this better, otherwise it won't work on a large area (or be very slow).

The vars current_pixel_state and change_count have a valid_min and valid_max attr.

observation_count does too, but the range is very large, and it is probably irrelevant after regridding (doesn't make physical sense anymore).

Member Author:
Good point, I replaced np.unique with da.unique.
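
A minimal sketch of what the dask-based alternative could look like (toy data; the exact call in the later commit may differ):

```python
import dask.array as dsk
import numpy as np
import xarray as xr

# Toy land-cover-like array backed by dask chunks.
da = xr.DataArray(
    np.random.randint(0, 5, size=(4, 4)), dims=("latitude", "longitude")
).chunk({"latitude": 2, "longitude": 2})

# Derive the category values lazily rather than materializing the whole
# array in memory with np.unique.
if "flag_values" in da.attrs:
    regrid_values = da.attrs["flag_values"]
else:
    regrid_values = dsk.unique(da.data).compute()
```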


da_regrid = da.regrid.most_common(grid, values=regrid_values)

# make sure dtype is the same
ds_regrid[variable] = da_regrid.astype(da.dtype)
SarahAlidoost marked this conversation as resolved.

return xr.Dataset(ds_regrid)

def convert(
self,
@@ -207,27 +222,40 @@ def extract_netcdf_to_zampy(file: Path) -> xr.Dataset:

# only keep land cover class variable
with xr.open_dataset(unzip_folder / zipped_file_name) as ds:
var_list = [var for var in ds.data_vars]
var_list = list(ds.data_vars)
raw_variable = "lccs_class"
var_list.remove(raw_variable)
ds = ds.drop_vars(var_list) # noqa: PLW2901

ds = ds.sortby(["lat", "lon"]) # noqa: PLW2901
ds = ds.rename({"lat": "latitude", "lon": "longitude"}) # noqa: PLW2901
new_grid = xarray_regrid.Grid(
north=90,
Contributor:
I originally regridded the full world, as this would then only have to be done once during ingestion to the zampy format. Now this has to be redone every time the config is different

Member Author:
Now that spatial bounds can be specified for land cover (see issue #63), regridding to the whole globe, which is also memory expensive, is no longer needed.

east=180,
south=-90,
west=-180,
north=ds["latitude"].max().item(),
east=ds["longitude"].max().item(),
south=ds["latitude"].min().item(),
west=ds["longitude"].min().item(),
resolution_lat=0.05,
resolution_lon=0.05,
)

target_dataset = xarray_regrid.create_regridding_dataset(new_grid)

ds_regrid = ds.regrid.most_common(
target_dataset, time_dim="time", max_mem=1e9
)
# select the variable to be regridded
da = ds[raw_variable]

# get values for most common method
if "flag_values" in da.attrs:
regrid_values = da.attrs["flag_values"]
else:
regrid_values = np.unique(da.values)

da_regrid = da.regrid.most_common(target_dataset, values=regrid_values)

# make sure dtype is the same
da_regrid = da_regrid.astype(da.dtype)

# convert dataarray to dataset
ds_regrid = da_regrid.to_dataset()

# rename variable to follow the zampy convention
variable_name = "land_cover"
7 changes: 5 additions & 2 deletions src/zampy/datasets/utils.py
@@ -43,10 +43,13 @@ def download_url(url: str, fpath: Path, overwrite: bool) -> None:
print(f"File '{fpath.name}' already exists, skipping...")


def get_url_size(url: str) -> int:
def get_url_size(url: str) -> int | None:
"""Return the size (bytes) of a given URL."""
response = requests.head(url)
return int(response.headers["Content-Length"])
content_length = response.headers.get("Content-Length")
if content_length:
return int(content_length)
return None


def get_file_size(fpath: Path) -> int:
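
Since `get_url_size` can now return `None` (for servers that omit `Content-Length`), callers need a small guard; a hedged usage sketch, with a placeholder URL:

```python
from zampy.datasets.utils import get_url_size  # module path as in this diff

size = get_url_size("https://example.com/data.nc")  # placeholder URL
if size is None:
    print("Server did not report Content-Length; skipping the size pre-check.")
else:
    print(f"Remote file is {size / 1e6:.1f} MB")
```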
14 changes: 10 additions & 4 deletions src/zampy/recipe.py
@@ -137,13 +137,19 @@ def run(self) -> None:
ds = converter.convert(ds, dataset, convention=self.convention)

if "time" in ds.dims: # Dataset with only DEM (e.g.) has no time dim.
freq = xr.infer_freq(ds["time"])
if freq is None: # fallback:
freq = (
data_freq = None

if len(ds["time"]) == 1:
data_freq = pd.Timedelta(self.frequency)
elif len(ds["time"]) > 3: # see pandas _FrequencyInferer
freq = xr.infer_freq(ds["time"])
data_freq = pd.to_timedelta(pd.tseries.frequencies.to_offset(freq))

if data_freq is None: # fallback:
data_freq = pd.Timedelta(
ds["time"].isel(time=1).to_numpy()
- ds["time"].isel(time=0).to_numpy()
)
data_freq = pd.to_timedelta(pd.tseries.frequencies.to_offset(freq))

if data_freq < pd.Timedelta(self.frequency):
ds = ds.resample(time=self.frequency).mean()
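
To illustrate the intent of the frequency handling above, here is a small self-contained sketch; the dataset and the target frequency are made up for the example, and zampy runs this logic inside the recipe's `run()` method:

```python
import numpy as np
import pandas as pd
import xarray as xr

# Toy hourly dataset standing in for a converted zampy dataset.
times = pd.date_range("2020-01-01", periods=6, freq="1h")
ds = xr.Dataset({"t2m": ("time", np.arange(6.0))}, coords={"time": times})

target_frequency = "3h"  # assumed recipe frequency for this sketch

freq = xr.infer_freq(ds["time"])  # enough samples here, so infer_freq succeeds
data_freq = pd.to_timedelta(pd.tseries.frequencies.to_offset(freq))

if data_freq < pd.Timedelta(target_frequency):
    ds = ds.resample(time=target_frequency).mean()  # coarsen 1-hourly data to 3-hourly means
```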
4 changes: 3 additions & 1 deletion tests/test_cds_utils.py
@@ -130,6 +130,7 @@ def test_cds_request_land_cover(mock_retrieve, valid_path_config):
dataset,
time_bounds,
path,
SpatialBounds(54, 56, 1, 3),
overwrite,
)

@@ -139,7 +140,8 @@
"variable": "all",
"format": "zip",
"year": "1996",
"version": "v2.0.7cds",
"version": "v2_0_7cds",
"area": [54, 3, 1, 56],
},
)

3 changes: 3 additions & 0 deletions tests/test_datasets/test_era5.py
Contributor:
The test data is still based on the old cds files. Might be good to replace those with the new format...

Member Author:
fixed.

@@ -128,6 +128,9 @@ def test_load(self):
np.testing.assert_allclose(ds.latitude.values, expected_lat)
np.testing.assert_allclose(ds.longitude.values, expected_lon)

# check if valid_time not in the dataset
assert "valid_time" not in ds.dims

def test_convert(self, dummy_dir):
"""Test convert function."""
_, era5_dataset = self.ingest_dummy_data(dummy_dir)
3 changes: 3 additions & 0 deletions tests/test_datasets/test_era5_land.py
@@ -126,6 +126,9 @@ def test_load(self):
np.testing.assert_allclose(ds.latitude.values, expected_lat)
np.testing.assert_allclose(ds.longitude.values, expected_lon)

# check if valid_time not in the dataset
assert "valid_time" not in ds.dims

def test_convert(self, dummy_dir):
"""Test convert function."""
_, era5_land_dataset = self.ingest_dummy_data(dummy_dir)
2 changes: 1 addition & 1 deletion tests/test_datasets/test_fapar_lai.py
@@ -61,7 +61,7 @@ def test_download(self, mock_retrieve, valid_path_config, dummy_dir):
"format": "zip",
"variable": "lai",
"horizontal_resolution": "1km",
"product_version": "V3",
"product_version": "v3",
"satellite": "proba",
"sensor": "vgt",
"month": "01",