Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Let Xarray handle decode_times #232

Merged
merged 9 commits into from
Aug 27, 2024
8 changes: 6 additions & 2 deletions docs/releases.rst
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@ v1.0.1 (unreleased)
New Features
~~~~~~~~~~~~

- Adds `decode_times` to open_virtual_dataset (:pull:`232`)
By `Raphael Hagen <https://github.com/norlandrhagen>`_.

- Add parser for the OPeNDAP DMR++ XML format and integration with open_virtual_dataset (:pull:`113`)
By `Ayush Nag <https://github.com/ayushnag>`_.

Expand All @@ -17,17 +20,18 @@ New Features

Breaking changes
~~~~~~~~~~~~~~~~

- Serialize valid ZarrV3 metadata and require full compressor numcodec config (for :pull:`193`)
By `Gustavo Hidalgo <https://github.com/ghidalgo3>`_.
- VirtualiZarr's `ZArray`, `ChunkEntry`, and `Codec` no longer subclass
`pydantic.BaseModel` (:pull:`210`)
- `ZArray`'s `__init__` signature has changed to match `zarr.Array`'s (:pull:`xxx`)


Deprecations
~~~~~~~~~~~~

- Deprecates cftime_variables in open_virtual_dataset in favor of decode_times. (:pull:`232`)
By `Raphael Hagen <https://github.com/norlandrhagen>`_.

Bug fixes
~~~~~~~~~

Expand Down
12 changes: 7 additions & 5 deletions docs/usage.md
Original file line number Diff line number Diff line change
Expand Up @@ -306,8 +306,8 @@ Dimensions: (time: 2920, lat: 25, lon: 53)
Coordinates:
lat (lat) float32 100B ManifestArray<shape=(25,), dtype=float32, chu...
lon (lon) float32 212B ManifestArray<shape=(53,), dtype=float32, chu...
* time (time) float32 12kB 1.867e+06 1.867e+06 ... 1.885e+06 1.885e+06
Data variables:
* time (time) datetime64[ns] 23kB 2013-01-01 ... 2014-12-31T18:00:00
Data variables:
air (time, lat, lon) float64 31MB ...
Attributes:
Conventions: COARDS
Expand All @@ -325,13 +325,15 @@ Loading variables can be useful in a few scenarios:

### CF-encoded time variables

Notice that the `time` variable that was loaded above does not have the expected dtype. To correctly decode time variables according to the CF conventions (like `xr.open_dataset` does by default), you need to include them in an additional keyword argument `cftime_variables`:
To correctly decode time variables according to the CF conventions, you need to include `'time'` in `loadable_variables` and ensure that the `decode_times` argument of `open_virtual_dataset` is set to `True` (`decode_times` defaults to `None`).



```python
vds = open_virtual_dataset(
'air.nc',
loadable_variables=['air', 'time'],
cftime_variables=['time'],
decode_times=True,
indexes={},
)
```
Expand All @@ -352,7 +354,7 @@ Attributes:
title: 4x daily NMC reanalysis (1948)
```

Now the loaded time variable has a `datetime64[ns]` dtype. Any variables you want decoded in this way must also be included in `loadable_variables`.


## Writing virtual stores to disk

Expand Down
41 changes: 15 additions & 26 deletions virtualizarr/backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@

import xarray as xr
from xarray.backends import AbstractDataStore, BackendArray
from xarray.coding.times import CFDatetimeCoder
from xarray.core.indexes import Index, PandasIndex
from xarray.core.variable import IndexVariable

Expand Down Expand Up @@ -54,6 +53,7 @@ def open_virtual_dataset(
filetype: FileType | None = None,
drop_variables: Iterable[str] | None = None,
loadable_variables: Iterable[str] | None = None,
decode_times: bool | None = None,
cftime_variables: Iterable[str] | None = None,
Copy link
Contributor

@jsignell jsignell Sep 4, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just seeing this after getting back from break. Should the cftime_variables kwarg be removed?

EDIT: oohh I see it's being deprecated.

indexes: Mapping[str, Index] | None = None,
virtual_array_class=ManifestArray,
Expand All @@ -79,10 +79,8 @@ def open_virtual_dataset(
loadable_variables: list[str], default is None
Variables in the file to open as lazy numpy/dask arrays instead of instances of virtual_array_class.
Default is to open all variables as virtual arrays (i.e. ManifestArray).
cftime_variables : list[str], default is None
Interpret the value of specified vars using cftime, returning a datetime.
These will be automatically re-encoded with cftime. This list must be a subset
of ``loadable_variables``.
decode_times: bool | None, default is None
Passed through to Xarray's ``open_dataset``. If True, time variables are decoded into datetime objects.
indexes : Mapping[str, Index], default is None
Indexes to use on the returned xarray Dataset.
Default is None, which will read any 1D coordinate data to create in-memory Pandas indexes.
Expand All @@ -99,6 +97,15 @@ def open_virtual_dataset(
vds
An xarray Dataset containing instances of virtual_array_cls for each variable, or normal lazily indexed arrays for each variable in loadable_variables.
"""

if cftime_variables is not None:
# stacklevel=2 appears necessary for the warning to surface at the caller's call site.
warnings.warn(
"cftime_variables is depreciated and will be ignored. Pass decode_times=True and loadable_variables=['time'] to decode time values to datetime objects.",
DeprecationWarning,
stacklevel=2,
)

loadable_vars: dict[str, xr.Variable]
virtual_vars: dict[str, xr.Variable]
vars: dict[str, xr.Variable]
Expand All @@ -119,20 +126,6 @@ def open_virtual_dataset(
if common:
raise ValueError(f"Cannot both load and drop variables {common}")

if cftime_variables is None:
cftime_variables = []
elif isinstance(cftime_variables, str):
cftime_variables = [cftime_variables]
else:
cftime_variables = list(cftime_variables)

if diff := (set(cftime_variables) - set(loadable_variables)):
missing_str = ", ".join([f"'{v}'" for v in diff])
raise ValueError(
"All ``cftime_variables`` must be included in ``loadable_variables`` "
f"({missing_str} not in ``loadable_variables``)"
)

if virtual_array_class is not ManifestArray:
raise NotImplementedError()

Expand All @@ -150,9 +143,9 @@ def open_virtual_dataset(
elif filetype == FileType.dmrpp:
from virtualizarr.readers.dmrpp import DMRParser

if loadable_variables != [] or cftime_variables != [] or indexes is None:
if loadable_variables != [] or indexes is None:
raise NotImplementedError(
"Specifying `loadable_variables`, `cftime_variables` or auto-creating indexes with `indexes=None` is not supported for dmrpp files."
"Specifying `loadable_variables` or auto-creating indexes with `indexes=None` is not supported for dmrpp files."
)

fpath = _fsspec_openfile_from_filepath(
Expand Down Expand Up @@ -202,7 +195,7 @@ def open_virtual_dataset(
ds = xr.open_dataset(
cast(XArrayOpenT, fpath),
drop_variables=drop_variables,
decode_times=False,
decode_times=decode_times,
)

if indexes is None:
Expand All @@ -225,10 +218,6 @@ def open_virtual_dataset(
if name in loadable_variables
}

for name in cftime_variables:
var = loadable_vars[name]
loadable_vars[name] = CFDatetimeCoder().decode(var, name=name)

# if we only read the indexes we can just close the file right away as nothing is lazy
if loadable_vars == {}:
ds.close()
Expand Down
36 changes: 27 additions & 9 deletions virtualizarr/tests/test_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ def test_no_indexes(self, netcdf4_file):
def test_create_default_indexes(self, netcdf4_file):
with pytest.warns(UserWarning, match="will create in-memory pandas indexes"):
vds = open_virtual_dataset(netcdf4_file, indexes=None)
ds = open_dataset(netcdf4_file, decode_times=False)
ds = open_dataset(netcdf4_file, decode_times=True)

# TODO use xr.testing.assert_identical(vds.indexes, ds.indexes) instead once class supported by assertion comparison, see https://github.com/pydata/xarray/issues/5812
assert index_mappings_equal(vds.xindexes, ds.xindexes)
Expand All @@ -104,6 +104,31 @@ def index_mappings_equal(indexes1: Mapping[str, Index], indexes2: Mapping[str, I
return True


def test_cftime_index(tmpdir):
    """Ensure a virtual dataset contains the same indexes as an Xarray dataset"""
    # Note: Test was created to debug: https://github.com/zarr-developers/VirtualiZarr/issues/168
    time_values = np.array(["2023-01-01", "2023-01-02"], dtype="datetime64[ns]")
    ds = xr.Dataset(
        data_vars={"tasmax": (["time", "lat", "lon"], np.random.rand(2, 18, 36))},
        coords={
            "time": time_values,
            "lat": np.arange(-90, 90, 10),
            "lon": np.arange(-180, 180, 10),
        },
        attrs={"attr1_key": "attr1_val"},
    )
    netcdf_path = f"{tmpdir}/tmp.nc"
    ds.to_netcdf(netcdf_path)
    vds = open_virtual_dataset(
        netcdf_path, loadable_variables=["time", "lat", "lon"], indexes={}
    )
    # TODO use xr.testing.assert_identical(vds.indexes, ds.indexes) instead once class supported by assertion comparison, see https://github.com/pydata/xarray/issues/5812
    assert index_mappings_equal(vds.xindexes, ds.xindexes)
    assert list(ds.coords) == list(vds.coords)
    assert vds.dims == ds.dims
    assert vds.attrs == ds.attrs


class TestOpenVirtualDatasetAttrs:
def test_drop_array_dimensions(self, netcdf4_file):
# regression test for GH issue #150
Expand Down Expand Up @@ -211,7 +236,7 @@ def test_loadable_variables(self, netcdf4_file):
else:
assert isinstance(vds[name].data, ManifestArray), name

full_ds = xr.open_dataset(netcdf4_file, decode_times=False)
full_ds = xr.open_dataset(netcdf4_file, decode_times=True)

for name in full_ds.variables:
if name in vars_to_load:
Expand Down Expand Up @@ -246,10 +271,3 @@ def test_open_dataset_with_scalar(self, hdf5_scalar, tmpdir):
vds = open_virtual_dataset(hdf5_scalar)
assert vds.scalar.dims == ()
assert vds.scalar.attrs == {"scalar": "true"}


def test_cftime_variables_must_be_in_loadable_variables(tmpdir):
    """Requesting a cftime variable that is not also loadable must raise ValueError."""
    path = f"{tmpdir}/scalar.nc"
    xr.Dataset(data_vars={"time": ["2024-06-21"]}).to_netcdf(path)
    with pytest.raises(ValueError, match="'time' not in"):
        open_virtual_dataset(path, cftime_variables=["time"])
2 changes: 0 additions & 2 deletions virtualizarr/tests/test_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,13 +92,11 @@ def test_kerchunk_roundtrip_concat(self, tmpdir, format, decode_times, time_vars
f"{tmpdir}/air1.nc",
indexes={},
loadable_variables=time_vars,
cftime_variables=time_vars,
)
vds2 = open_virtual_dataset(
f"{tmpdir}/air2.nc",
indexes={},
loadable_variables=time_vars,
cftime_variables=time_vars,
)

if decode_times is False:
Expand Down
Loading