From b2c7a1b1ddfdc7e7d43d80fb26106c5df7b43e55 Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Tue, 9 Jul 2024 13:23:12 -0400 Subject: [PATCH 1/6] Add example of using cftime_variables to usage docs (#174) * add example of using cftime_variables to usage docs * add indexes={} * release notes * remove accidental copypasta * move arguments to separate lines --- docs/releases.rst | 3 +++ docs/usage.md | 37 ++++++++++++++++++++++++++++++++++--- 2 files changed, 37 insertions(+), 3 deletions(-) diff --git a/docs/releases.rst b/docs/releases.rst index 4ba7912a..ba267c3b 100644 --- a/docs/releases.rst +++ b/docs/releases.rst @@ -39,7 +39,10 @@ Bug fixes Documentation ~~~~~~~~~~~~~ +- Added example of using cftime_variables to usage docs. (:issue:`169`, :pull:`174`) + By `Tom Nicholas <https://github.com/TomNicholas>`_. - Updated the development roadmap in preparation for v1.0. (:pull:`164`) + By `Tom Nicholas <https://github.com/TomNicholas>`_. - Warn if user passes `indexes=None` to `open_virtual_dataset` to indicate that this is not yet fully supported. (:pull:`170`) By `Tom Nicholas <https://github.com/TomNicholas>`_. - Clarify that virtual datasets cannot be treated like normal xarray datasets. (:issue:`173`) diff --git a/docs/usage.md b/docs/usage.md index b58cc695..a443c0a0 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -298,7 +298,7 @@ TODO: Use preprocess to create a new index from the metadata Whilst the values of virtual variables (i.e. those backed by `ManifestArray` objects) cannot be loaded into memory, you do have the option of opening specific variables from the file as loadable lazy numpy/dask arrays, just like `xr.open_dataset` normally returns.
These variables are specified using the `loadable_variables` argument: ```python -vds = open_virtual_dataset('air.nc', loadable_variables=['air', 'time']) +vds = open_virtual_dataset('air.nc', loadable_variables=['air', 'time'], indexes={}) ``` ```python Size: 31MB @@ -306,7 +306,7 @@ Dimensions: (time: 2920, lat: 25, lon: 53) Coordinates: lat (lat) float32 100B ManifestArray Size: 31MB +Dimensions: (time: 2920, lat: 25, lon: 53) +Coordinates: + lat (lat) float32 100B ManifestArray Date: Tue, 9 Jul 2024 15:54:37 -0500 Subject: [PATCH 2/6] Future-proof offset and size records in chunkmanifest (#177) --- virtualizarr/manifests/manifest.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/virtualizarr/manifests/manifest.py b/virtualizarr/manifests/manifest.py index 0297d17a..cc196e6d 100644 --- a/virtualizarr/manifests/manifest.py +++ b/virtualizarr/manifests/manifest.py @@ -71,8 +71,8 @@ class ChunkManifest: """ _paths: np.ndarray[Any, np.dtypes.StringDType] # type: ignore[name-defined] - _offsets: np.ndarray[Any, np.dtype[np.int32]] - _lengths: np.ndarray[Any, np.dtype[np.int32]] + _offsets: np.ndarray[Any, np.dtype[np.uint64]] + _lengths: np.ndarray[Any, np.dtype[np.uint64]] def __init__(self, entries: dict) -> None: """ @@ -100,8 +100,8 @@ def __init__(self, entries: dict) -> None: # Initializing to empty implies that entries with path='' are treated as missing chunks paths = np.empty(shape=shape, dtype=np.dtypes.StringDType()) # type: ignore[attr-defined] - offsets = np.empty(shape=shape, dtype=np.dtype("int32")) - lengths = np.empty(shape=shape, dtype=np.dtype("int32")) + offsets = np.empty(shape=shape, dtype=np.dtype("uint64")) + lengths = np.empty(shape=shape, dtype=np.dtype("uint64")) # populate the arrays for key, entry in entries.items(): @@ -128,8 +128,8 @@ def __init__(self, entries: dict) -> None: def from_arrays( cls, paths: np.ndarray[Any, np.dtype[np.dtypes.StringDType]], # type: ignore[name-defined] - offsets: 
np.ndarray[Any, np.dtype[np.int32]], - lengths: np.ndarray[Any, np.dtype[np.int32]], + offsets: np.ndarray[Any, np.dtype[np.uint64]], + lengths: np.ndarray[Any, np.dtype[np.uint64]], ) -> "ChunkManifest": """ Create manifest directly from numpy arrays containing the path and byte range information. @@ -161,13 +161,13 @@ def from_arrays( raise ValueError( f"paths array must have a numpy variable-length string dtype, but got dtype {paths.dtype}" ) - if offsets.dtype != np.dtype("int32"): + if offsets.dtype != np.dtype("uint64"): raise ValueError( - f"offsets array must have 32-bit integer dtype, but got dtype {offsets.dtype}" + f"offsets array must have 64-bit unsigned integer dtype, but got dtype {offsets.dtype}" ) - if lengths.dtype != np.dtype("int32"): + if lengths.dtype != np.dtype("uint64"): raise ValueError( - f"lengths array must have 32-bit integer dtype, but got dtype {lengths.dtype}" + f"lengths array must have 64-bit unsigned integer dtype, but got dtype {lengths.dtype}" ) # check shapes From 1ac4efc770f6965ac2c08f063359b191b112647b Mon Sep 17 00:00:00 2001 From: Nathan Zimmerman Date: Tue, 9 Jul 2024 16:00:33 -0500 Subject: [PATCH 3/6] Use a set to avoid duplicate var names from kerchunk (#179) --- virtualizarr/kerchunk.py | 4 ++-- virtualizarr/tests/test_kerchunk.py | 13 ++++++++++++- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/virtualizarr/kerchunk.py b/virtualizarr/kerchunk.py index 97f64b1b..6e82067d 100644 --- a/virtualizarr/kerchunk.py +++ b/virtualizarr/kerchunk.py @@ -165,8 +165,8 @@ def find_var_names(ds_reference_dict: KerchunkStoreRefs) -> list[str]: """Find the names of zarr variables in this store/group.""" refs = ds_reference_dict["refs"] - found_var_names = [key.split("/")[0] for key in refs.keys() if "/" in key] - return found_var_names + found_var_names = {key.split("/")[0] for key in refs.keys() if "/" in key} + return list(found_var_names) def extract_array_refs( diff --git a/virtualizarr/tests/test_kerchunk.py 
b/virtualizarr/tests/test_kerchunk.py index 22d6d7df..a6693e29 100644 --- a/virtualizarr/tests/test_kerchunk.py +++ b/virtualizarr/tests/test_kerchunk.py @@ -5,7 +5,7 @@ import xarray as xr import xarray.testing as xrt -from virtualizarr.kerchunk import FileType, _automatically_determine_filetype +from virtualizarr.kerchunk import FileType, find_var_names, _automatically_determine_filetype, KerchunkStoreRefs from virtualizarr.manifests import ChunkManifest, ManifestArray from virtualizarr.xarray import dataset_from_kerchunk_refs @@ -266,3 +266,14 @@ def test_FileType(): assert "zarr" == FileType("zarr").name with pytest.raises(ValueError): FileType(None) + + +def test_no_duplicates_find_var_names(): + """Verify that we get a deduplicated list of var names""" + ref_dict = { + "refs": { + "x/something": {}, + "x/otherthing": {} + } + } + assert len(find_var_names(ref_dict)) == 1 \ No newline at end of file From 47303fa0df8f71dead57bbfc48a6f2db4aee6cba Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Tue, 9 Jul 2024 17:43:28 -0400 Subject: [PATCH 4/6] linting --- docs/usage.md | 6 +++--- virtualizarr/tests/test_kerchunk.py | 15 +++++++-------- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/docs/usage.md b/docs/usage.md index a443c0a0..b0935286 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -329,9 +329,9 @@ Notice that the `time` variable that was loaded above does not have the expected ```python vds = open_virtual_dataset( - 'air.nc', - loadable_variables=['air', 'time'], - cftime_variables=['time'], + 'air.nc', + loadable_variables=['air', 'time'], + cftime_variables=['time'], indexes={}, ) ``` diff --git a/virtualizarr/tests/test_kerchunk.py b/virtualizarr/tests/test_kerchunk.py index a6693e29..9aa934df 100644 --- a/virtualizarr/tests/test_kerchunk.py +++ b/virtualizarr/tests/test_kerchunk.py @@ -5,7 +5,11 @@ import xarray as xr import xarray.testing as xrt -from virtualizarr.kerchunk import FileType, find_var_names, 
_automatically_determine_filetype, KerchunkStoreRefs +from virtualizarr.kerchunk import ( + FileType, + _automatically_determine_filetype, + find_var_names, +) from virtualizarr.manifests import ChunkManifest, ManifestArray from virtualizarr.xarray import dataset_from_kerchunk_refs @@ -270,10 +274,5 @@ def test_FileType(): def test_no_duplicates_find_var_names(): """Verify that we get a deduplicated list of var names""" - ref_dict = { - "refs": { - "x/something": {}, - "x/otherthing": {} - } - } - assert len(find_var_names(ref_dict)) == 1 \ No newline at end of file + ref_dict = {"refs": {"x/something": {}, "x/otherthing": {}}} + assert len(find_var_names(ref_dict)) == 1 From aecdc3d11a3b226850a173b77d98b1a9910452fd Mon Sep 17 00:00:00 2001 From: Tom Nicholas Date: Tue, 9 Jul 2024 17:49:28 -0400 Subject: [PATCH 5/6] release notes for v1.0 (#181) --- docs/releases.rst | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/docs/releases.rst b/docs/releases.rst index ba267c3b..58f4f6f4 100644 --- a/docs/releases.rst +++ b/docs/releases.rst @@ -1,14 +1,41 @@ Release notes ============= -.. _v0.2: +.. _v1.1: -v0.2 (unreleased) +v1.1 (unreleased) ----------------- New Features ~~~~~~~~~~~~ +Breaking changes +~~~~~~~~~~~~~~~~ + +Deprecations +~~~~~~~~~~~~ + +Bug fixes +~~~~~~~~~ + +Documentation +~~~~~~~~~~~~~ + +Internal Changes +~~~~~~~~~~~~~~~~ + +.. _v1.0: + +v1.0 (9th July 2024) +-------------------- + +This release marks VirtualiZarr as mostly feature-complete, in the sense of achieving feature parity with kerchunk's logic for combining datasets, providing an easier way to manipulate kerchunk references in memory and generate kerchunk reference files on disk. + +Future VirtualiZarr development will focus on generalizing and upstreaming useful concepts into the Zarr specification, the Zarr-Python library, Xarray, and possibly some new packages. See the roadmap in the documentation for details. 
+ +New Features +~~~~~~~~~~~~ + - Now successfully opens both tiff and FITS files. (:issue:`160`, :pull:`162`) By `Tom Nicholas <https://github.com/TomNicholas>`_. - Added a `.rename_paths` convenience method to rename paths in a manifest according to a function. From a5ae1f6a1e13b8bb9dbedd20b29044b4f3c706e3 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Tue, 9 Jul 2024 17:53:13 -0400 Subject: [PATCH 6/6] v1.0->v1.0.0 --- docs/releases.rst | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/releases.rst b/docs/releases.rst index 58f4f6f4..c44ff245 100644 --- a/docs/releases.rst +++ b/docs/releases.rst @@ -1,10 +1,10 @@ Release notes ============= -.. _v1.1: +.. _v1.0.1: -v1.1 (unreleased) ------------------ +v1.0.1 (unreleased) +------------------- New Features ~~~~~~~~~~~~ @@ -24,10 +24,10 @@ Documentation Internal Changes ~~~~~~~~~~~~~~~~ -.. _v1.0: +.. _v1.0.0: -v1.0 (9th July 2024) --------------------- +v1.0.0 (9th July 2024) +---------------------- This release marks VirtualiZarr as mostly feature-complete, in the sense of achieving feature parity with kerchunk's logic for combining datasets, providing an easier way to manipulate kerchunk references in memory and generate kerchunk reference files on disk.