From 6b7abe2a0dc650ae7e6bf07c080cc9023a17bf2c Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Fri, 19 Apr 2024 13:25:28 -0600 Subject: [PATCH 01/68] Generate chunk manifest backed variable from HDF5 dataset. --- pyproject.toml | 1 + virtualizarr/readers/hdf.py | 135 ++++++++++++++++++++ virtualizarr/tests/test_readers/__init__.py | 0 virtualizarr/tests/test_readers/conftest.py | 91 +++++++++++++ virtualizarr/tests/test_readers/test_hdf.py | 71 ++++++++++ 5 files changed, 298 insertions(+) create mode 100644 virtualizarr/readers/hdf.py create mode 100644 virtualizarr/tests/test_readers/__init__.py create mode 100644 virtualizarr/tests/test_readers/conftest.py create mode 100644 virtualizarr/tests/test_readers/test_hdf.py diff --git a/pyproject.toml b/pyproject.toml index c7505bc..7994c92 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,6 +25,7 @@ dependencies = [ "kerchunk==0.2.2", "pydantic", "packaging", + "h5netcdf", ] [project.optional-dependencies] diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py new file mode 100644 index 0000000..a34ae34 --- /dev/null +++ b/virtualizarr/readers/hdf.py @@ -0,0 +1,135 @@ +from typing import List + +import h5py +import xarray as xr + +from virtualizarr.manifests import ChunkEntry, ChunkManifest, ManifestArray +from virtualizarr.zarr import ZArray + + +def _dataset_chunk_manifest(path: str, dataset: h5py.Dataset) -> ChunkManifest: + """ + Generate ChunkManifest for HDF5 dataset. + + Parameters + ---------- + path: str + The path the HDF5 container file + dset : h5py.Dataset + HDF5 dataset for which to create a ChunkManifest + + Returns + ------- + ChunkManifest + A Virtualizarr ChunkManifest + """ + dsid = dataset.id + + if dataset.chunks is None: + if dsid.get_offset() is None: + raise ValueError("Dataset has no space allocated in the file") + else: + key_list = [0] * (len(dataset.shape) or 1) + key = ".".join(map(str, key_list)) + chunk_entry = ChunkEntry( + path=path, + offset=dsid.get_offset(), + length=dsid.get_storage_size() + ) + chunk_entries = {key: chunk_entry} + chunk_manifest = ChunkManifest( + entries=chunk_entries + ) + return chunk_manifest + else: + num_chunks = dsid.get_num_chunks() + if num_chunks == 0: + raise ValueError("The dataset is chunked but contains no chunks") + + chunk_entries = dict() + + def get_key(blob): + key_list = [a // b for a, b in zip(blob.chunk_offset, dataset.chunks)] + key = ".".join(map(str, key_list)) + return key + + def store_chunk_entry(blob): + chunk_entries[get_key(blob)] = ChunkEntry( + path=path, + offset=blob.byte_offset, + length=blob.size + ) + + has_chunk_iter = callable(getattr(dsid, "chunk_iter", None)) + if has_chunk_iter: + dsid.chunk_iter(store_chunk_entry) + else: + for index in range(num_chunks): + store_chunk_entry(dsid.get_chunk_info(index)) + + chunk_manifest = ChunkManifest( + entries=chunk_entries + ) + return chunk_manifest + +def _dataset_dims(dataset: h5py.Dataset) -> List[str]: + """ + Get a list of dimension scale names attached to input HDF5 dataset. + + This is required by the xarray package to work with Zarr arrays. Only + one dimension scale per dataset dimension is allowed. If dataset is + dimension scale, it will be considered as the dimension to itself. + + Parameters + ---------- + dataset : h5py.Dataset + HDF5 dataset. + + Returns + ------- + list + List with HDF5 path names of dimension scales attached to input + dataset. + """ + dims = list() + rank = len(dataset.shape) + if rank: + for n in range(rank): + num_scales = len(dataset.dims[n]) + if num_scales == 1: + dims.append(dataset.dims[n][0].name[1:]) + elif h5py.h5ds.is_scale(dataset.id): + dims.append(dataset.name[1:]) + elif num_scales > 1: + raise ValueError( + f"{dataset.name}: {len(dataset.dims[n])} " + f"dimension scales attached to dimension #{n}" + ) + elif num_scales == 0: + # Some HDF5 files do not have dimension scales. + # If this is the case, `num_scales` will be 0. + # In this case, we mimic netCDF4 and assign phony dimension names. + # See https://github.com/fsspec/kerchunk/issues/41 + dims.append(f"phony_dim_{n}") + return dims + + +def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> xr.Variable: + # This chunk determination logic mirrors zarr-python's create + # https://github.com/zarr-developers/zarr-python/blob/main/zarr/creation.py#L62-L66 + chunks = dataset.chunks if dataset.chunks else dataset.shape + zarray = ZArray( + chunks=chunks, + compressor=dataset.compression, + dtype=dataset.dtype, + fill_value=dataset.fillvalue, + filters=None, + order="C", + shape=dataset.shape, + zarr_format=2, + ) + manifest = _dataset_chunk_manifest(path, dataset) + marray = ManifestArray(zarray=zarray, chunkmanifest=manifest) + dims = _dataset_dims(dataset) + variable = xr.Variable(data=marray, dims=dims) + return variable diff --git a/virtualizarr/tests/test_readers/__init__.py b/virtualizarr/tests/test_readers/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py new file mode 100644 index 0000000..b450483 --- /dev/null +++ b/virtualizarr/tests/test_readers/conftest.py @@ -0,0 +1,91 @@ +import h5py +import numpy as np +import pytest +import xarray as xr + + +@pytest.fixture +def empty_chunks_netcdf4_file(tmpdir): + ds = xr.Dataset({"data": []}) + filepath = f"{tmpdir}/empty_chunks.nc" + ds.to_netcdf(filepath, engine="h5netcdf") + return filepath + + +@pytest.fixture +def empty_dataset_netcdf4_file(tmpdir): + filepath = f"{tmpdir}/empty_dataset.nc" + f = h5py.File(filepath, "w") + f.create_dataset("data", shape=(0,), dtype="f") + return filepath + + +@pytest.fixture +def no_chunks_netcdf4_file(tmpdir): + filepath = f"{tmpdir}/no_chunks.nc" + f = h5py.File(filepath, "w") + data = np.random.random((10, 10)) + f.create_dataset(name="data", data=data, chunks=None) + return filepath + + +@pytest.fixture +def chunked_netcdf4_file(tmpdir): + filepath = f"{tmpdir}/chunks.nc" + f = h5py.File(filepath, "w") + data = np.random.random((100, 100)) + f.create_dataset(name="data", data=data, chunks=(50, 50)) + return filepath + + +@pytest.fixture +def single_dimension_scale_netcdf4_file(tmpdir): + filepath = f"{tmpdir}/single_dimension_scale.nc" + f = h5py.File(filepath, "w") + data = [1, 2] + x = [0, 1] + f.create_dataset(name="data", data=data) + f.create_dataset(name="x", data=x) + f["x"].make_scale() + f["data"].dims[0].attach_scale(f["x"]) + return filepath + + +@pytest.fixture +def is_scale_netcdf4_file(tmpdir): + filepath = f"{tmpdir}/is_scale.nc" + f = h5py.File(filepath, "w") + data = [1, 2] + f.create_dataset(name="data", data=data) + f["data"].make_scale() + return filepath + + +@pytest.fixture +def multiple_dimension_scales_netcdf4_file(tmpdir): + filepath = f"{tmpdir}/multiple_dimension_scales.nc" + f = h5py.File(filepath, "w") + data = [1, 2] + f.create_dataset(name="data", data=data) + f.create_dataset(name="x", data=[0, 1]) + f.create_dataset(name="y", data=[0, 1]) + f["x"].make_scale() + f["y"].make_scale() + f["data"].dims[0].attach_scale(f["x"]) + f["data"].dims[0].attach_scale(f["y"]) + return filepath + + +@pytest.fixture +def chunked_dimensions_netcdf4_file(tmpdir): + filepath = f"{tmpdir}/chunks_dimension.nc" + f = h5py.File(filepath, "w") + data = np.random.random((100, 100)) + x = np.random.random((100, 100)) + y = np.random.random((100, 100)) + f.create_dataset(name="data", data=data, chunks=(50, 50)) + f.create_dataset(name="x", data=x, chunks=(50, 50)) + f.create_dataset(name="y", data=y, chunks=(50, 50)) + f["data"].dims[0].attach_scale(f["x"]) + f["data"].dims[1].attach_scale(f["y"]) + return filepath diff --git a/virtualizarr/tests/test_readers/test_hdf.py b/virtualizarr/tests/test_readers/test_hdf.py new file mode 100644 index 0000000..b6b78c1 --- /dev/null +++ b/virtualizarr/tests/test_readers/test_hdf.py @@ -0,0 +1,71 @@ +import h5py +import pytest + +from virtualizarr.readers.hdf import (_dataset_chunk_manifest, _dataset_dims, + _dataset_to_variable) + + +class TestDatasetChunkManifest: + def test_empty_chunks(self, empty_chunks_netcdf4_file): + f = h5py.File(empty_chunks_netcdf4_file) + ds = f["data"] + with pytest.raises(ValueError, match="chunked but contains no chunks"): + _dataset_chunk_manifest(path=empty_chunks_netcdf4_file, dataset=ds) + + def test_empty_dataset(self, empty_dataset_netcdf4_file): + f = h5py.File(empty_dataset_netcdf4_file) + ds = f["data"] + with pytest.raises(ValueError, match="no space allocated in the file"): + _dataset_chunk_manifest(path=empty_dataset_netcdf4_file, dataset=ds) + + def test_no_chunking(self, no_chunks_netcdf4_file): + f = h5py.File(no_chunks_netcdf4_file) + ds = f["data"] + manifest = _dataset_chunk_manifest(path=no_chunks_netcdf4_file, dataset=ds) + assert len(manifest.entries) == 1 + + def test_chunked(self, chunked_netcdf4_file): + f = h5py.File(chunked_netcdf4_file) + ds = f["data"] + manifest = _dataset_chunk_manifest(path=chunked_netcdf4_file, dataset=ds) + assert len(manifest.entries) == 4 + + +class TestDatasetDims: + def test_single_dimension_scale(self, single_dimension_scale_netcdf4_file): + f = h5py.File(single_dimension_scale_netcdf4_file) + ds = f["data"] + dims = _dataset_dims(ds) + assert dims[0] == "x" + + def test_is_dimension_scale(self, is_scale_netcdf4_file): + f = h5py.File(is_scale_netcdf4_file) + ds = f["data"] + dims = _dataset_dims(ds) + assert dims[0] == "data" + + def test_multiple_dimension_scales(self, multiple_dimension_scales_netcdf4_file): + f = h5py.File(multiple_dimension_scales_netcdf4_file) + ds = f["data"] + with pytest.raises(ValueError, match="dimension scales attached"): + _dataset_dims(ds) + + def test_no_dimension_scales(self, no_chunks_netcdf4_file): + f = h5py.File(no_chunks_netcdf4_file) + ds = f["data"] + dims = _dataset_dims(ds) + assert dims == ["phony_dim_0", "phony_dim_1"] + + +class TestDatasetToVariable: + def test_chunked_dataset(self, chunked_dimensions_netcdf4_file): + f = h5py.File(chunked_dimensions_netcdf4_file) + ds = f["data"] + var = _dataset_to_variable(chunked_dimensions_netcdf4_file, ds) + assert var.chunks == (50, 50) + + def test_not_chunked_dataset(self, single_dimension_scale_netcdf4_file): + f = h5py.File(single_dimension_scale_netcdf4_file) + ds = f["data"] + var = _dataset_to_variable(single_dimension_scale_netcdf4_file, ds) + assert var.chunks == (2,) From bca0aabd6030625156b5fe1e58fb8d9a2ccf46f1 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Fri, 19 Apr 2024 14:20:38 -0600 Subject: [PATCH 02/68] Transfer dataset attrs to variable. --- virtualizarr/readers/hdf.py | 50 ++++++++++++++++++++- virtualizarr/tests/test_readers/conftest.py | 10 +++++ virtualizarr/tests/test_readers/test_hdf.py | 16 ++++++- 3 files changed, 74 insertions(+), 2 deletions(-) diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py index a34ae34..d6518a3 100644 --- a/virtualizarr/readers/hdf.py +++ b/virtualizarr/readers/hdf.py @@ -1,6 +1,7 @@ from typing import List import h5py +import numpy as np import xarray as xr from virtualizarr.manifests import ChunkEntry, ChunkManifest, ManifestArray @@ -114,6 +115,52 @@ def _dataset_dims(dataset: h5py.Dataset) -> List[str]: return dims +def _extract_attrs(dataset: h5py.Dataset): + """ + Extract attributes from an HDF5 dataset. + + Parameters + ---------- + dataset : h5py.Dataset + An HDF5 dataset. + """ + _HIDDEN_ATTRS = { + "REFERENCE_LIST", + "CLASS", + "DIMENSION_LIST", + "NAME", + "_Netcdf4Dimid", + "_Netcdf4Coordinates", + "_nc3_strict", + "_NCProperties", + } + attrs = {} + for n, v in dataset.attrs.items(): + if n in _HIDDEN_ATTRS: + continue + # Fix some attribute values to avoid JSON encoding exceptions... + if isinstance(v, bytes): + v = v.decode("utf-8") or " " + elif isinstance(v, (np.ndarray, np.number, np.bool_)): + if v.dtype.kind == "S": + v = v.astype(str) + if n == "_FillValue": + continue + elif v.size == 1: + v = v.flatten()[0] + if isinstance(v, (np.ndarray, np.number, np.bool_)): + v = v.tolist() + else: + v = v.tolist() + elif isinstance(v, h5py._hl.base.Empty): + v = "" + if v == "DIMENSION_SCALE": + continue + + attrs[n] = v + return attrs + + def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> xr.Variable: # This chunk determination logic mirrors zarr-python's create # https://github.com/zarr-developers/zarr-python/blob/main/zarr/creation.py#L62-L66 @@ -131,5 +178,6 @@ def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> xr.Variable: manifest = _dataset_chunk_manifest(path, dataset) marray = ManifestArray(zarray=zarray, chunkmanifest=manifest) dims = _dataset_dims(dataset) - variable = xr.Variable(data=marray, dims=dims) + attrs = _extract_attrs(dataset) + variable = xr.Variable(data=marray, dims=dims, attrs=attrs) return variable diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py index b450483..2c40fe1 100644 --- a/virtualizarr/tests/test_readers/conftest.py +++ b/virtualizarr/tests/test_readers/conftest.py @@ -89,3 +89,13 @@ def chunked_dimensions_netcdf4_file(tmpdir): f["data"].dims[0].attach_scale(f["x"]) f["data"].dims[1].attach_scale(f["y"]) return filepath + + +@pytest.fixture +def string_attribute_netcdf4_file(tmpdir): + filepath = f"{tmpdir}/attributes.nc" + f = h5py.File(filepath, "w") + data = np.random.random((10, 10)) + f.create_dataset(name="data", data=data, chunks=None) + f["data"].attrs["attribute_name"] = "attribute_name" + return filepath diff --git a/virtualizarr/tests/test_readers/test_hdf.py b/virtualizarr/tests/test_readers/test_hdf.py index b6b78c1..495b7de 100644 --- a/virtualizarr/tests/test_readers/test_hdf.py +++ b/virtualizarr/tests/test_readers/test_hdf.py @@ -2,7 +2,7 @@ import pytest from virtualizarr.readers.hdf import (_dataset_chunk_manifest, _dataset_dims, - _dataset_to_variable) + _dataset_to_variable, _extract_attrs) class TestDatasetChunkManifest: @@ -69,3 +69,17 @@ def test_not_chunked_dataset(self, single_dimension_scale_netcdf4_file): ds = f["data"] var = _dataset_to_variable(single_dimension_scale_netcdf4_file, ds) assert var.chunks == (2,) + + def test_dataset_attributes(self, string_attribute_netcdf4_file): + f = h5py.File(string_attribute_netcdf4_file) + ds = f["data"] + var = _dataset_to_variable(string_attribute_netcdf4_file, ds) + assert var.attrs["attribute_name"] == "attribute_name" + + +class TestExtractAttributes: + def test_string_attribute(self, string_attribute_netcdf4_file): + f = h5py.File(string_attribute_netcdf4_file) + ds = f["data"] + attrs = _extract_attrs(ds) + assert attrs["attribute_name"] == "attribute_name" From 384ff6bb2d75b68a4af1f23d56a6544b4e20d6b5 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Fri, 19 Apr 2024 15:26:58 -0600 Subject: [PATCH 03/68] Get virtual variables dict from HDF5 file. --- virtualizarr/readers/hdf.py | 14 +++++++++++++- virtualizarr/tests/test_readers/conftest.py | 16 ++++++++++++---- virtualizarr/tests/test_readers/test_hdf.py | 15 ++++++++++++++- 3 files changed, 39 insertions(+), 6 deletions(-) diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py index d6518a3..9c3ebf4 100644 --- a/virtualizarr/readers/hdf.py +++ b/virtualizarr/readers/hdf.py @@ -1,4 +1,4 @@ -from typing import List +from typing import Mapping, List import h5py import numpy as np @@ -181,3 +181,15 @@ def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> xr.Variable: attrs = _extract_attrs(dataset) variable = xr.Variable(data=marray, dims=dims, attrs=attrs) return variable + + +def virtual_vars_from_hdf(path: str, f: h5py.File) -> Mapping[str, xr.Variable]: + variables = {} + for key in f.keys(): + if isinstance(f[key], h5py.Dataset): + variable = _dataset_to_variable(path, f[key]) + variables[key] = variable + else: + raise NotImplementedError("Nested groups are not yet supported") + + return variables diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py index 2c40fe1..735e922 100644 --- a/virtualizarr/tests/test_readers/conftest.py +++ b/virtualizarr/tests/test_readers/conftest.py @@ -81,11 +81,11 @@ def chunked_dimensions_netcdf4_file(tmpdir): filepath = f"{tmpdir}/chunks_dimension.nc" f = h5py.File(filepath, "w") data = np.random.random((100, 100)) - x = np.random.random((100, 100)) - y = np.random.random((100, 100)) + x = np.random.random((100)) + y = np.random.random((100)) f.create_dataset(name="data", data=data, chunks=(50, 50)) - f.create_dataset(name="x", data=x, chunks=(50, 50)) - f.create_dataset(name="y", data=y, chunks=(50, 50)) + f.create_dataset(name="x", data=x) + f.create_dataset(name="y", data=y) f["data"].dims[0].attach_scale(f["x"]) f["data"].dims[1].attach_scale(f["y"]) return filepath @@ -99,3 +99,11 @@ def string_attribute_netcdf4_file(tmpdir): f.create_dataset(name="data", data=data, chunks=None) f["data"].attrs["attribute_name"] = "attribute_name" return filepath + + +@pytest.fixture +def group_netcdf4_file(tmpdir): + filepath = f"{tmpdir}/group.nc" + f = h5py.File(filepath, "w") + f.create_group("group") + return filepath diff --git a/virtualizarr/tests/test_readers/test_hdf.py b/virtualizarr/tests/test_readers/test_hdf.py index 495b7de..da331ed 100644 --- a/virtualizarr/tests/test_readers/test_hdf.py +++ b/virtualizarr/tests/test_readers/test_hdf.py @@ -2,7 +2,8 @@ import pytest from virtualizarr.readers.hdf import (_dataset_chunk_manifest, _dataset_dims, - _dataset_to_variable, _extract_attrs) + _dataset_to_variable, _extract_attrs, + virtual_vars_from_hdf) class TestDatasetChunkManifest: @@ -83,3 +84,15 @@ def test_string_attribute(self, string_attribute_netcdf4_file): ds = f["data"] attrs = _extract_attrs(ds) assert attrs["attribute_name"] == "attribute_name" + + +class TestVirtualVarsFromHDF: + def test_variable_with_dimensions(self, chunked_dimensions_netcdf4_file): + f = h5py.File(chunked_dimensions_netcdf4_file) + variables = virtual_vars_from_hdf(chunked_dimensions_netcdf4_file, f) + assert len(variables) == 3 + + def test_groups_not_implemented(self, group_netcdf4_file): + f = h5py.File(group_netcdf4_file) + with pytest.raises(NotImplementedError): + virtual_vars_from_hdf(group_netcdf4_file, f) From 4c5f9bd30186aee61ff79223a70a3172b1c17d00 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Mon, 22 Apr 2024 12:33:24 -0600 Subject: [PATCH 04/68] Update virtual_vars_from_hdf to use fsspec and drop_variables arg. --- pyproject.toml | 2 +- virtualizarr/readers/hdf.py | 25 +++++++++++++++------ virtualizarr/tests/test_readers/conftest.py | 10 +++++++++ virtualizarr/tests/test_readers/test_hdf.py | 13 +++++++---- 4 files changed, 38 insertions(+), 12 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 7994c92..d08621e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,7 +25,6 @@ dependencies = [ "kerchunk==0.2.2", "pydantic", "packaging", - "h5netcdf", ] [project.optional-dependencies] @@ -35,6 +34,7 @@ test = [ "pytest", "scipy", "pooch", + "h5netcdf", ] diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py index 9c3ebf4..c4ab292 100644 --- a/virtualizarr/readers/hdf.py +++ b/virtualizarr/readers/hdf.py @@ -1,5 +1,6 @@ -from typing import Mapping, List +from typing import List, Mapping, Optional +import fsspec import h5py import numpy as np import xarray as xr @@ -73,6 +74,7 @@ def store_chunk_entry(blob): ) return chunk_manifest + def _dataset_dims(dataset: h5py.Dataset) -> List[str]: """ Get a list of dimension scale names attached to input HDF5 dataset. @@ -183,13 +185,22 @@ def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> xr.Variable: return variable -def virtual_vars_from_hdf(path: str, f: h5py.File) -> Mapping[str, xr.Variable]: +def virtual_vars_from_hdf( + path: str, + drop_variables: Optional[List[str]] = None, +) -> Mapping[str, xr.Variable]: + if drop_variables is None: + drop_variables = [] + fs, file_path = fsspec.core.url_to_fs(path) + open_file = fs.open(path, "rb") + f = h5py.File(open_file, mode="r") variables = {} for key in f.keys(): - if isinstance(f[key], h5py.Dataset): - variable = _dataset_to_variable(path, f[key]) - variables[key] = variable - else: - raise NotImplementedError("Nested groups are not yet supported") + if key not in drop_variables: + if isinstance(f[key], h5py.Dataset): + variable = _dataset_to_variable(path, f[key]) + variables[key] = variable + else: + raise NotImplementedError("Nested groups are not yet supported") return variables diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py index 735e922..aa2b0fe 100644 --- a/virtualizarr/tests/test_readers/conftest.py +++ b/virtualizarr/tests/test_readers/conftest.py @@ -107,3 +107,13 @@ def group_netcdf4_file(tmpdir): f = h5py.File(filepath, "w") f.create_group("group") return filepath + + +@pytest.fixture +def multiple_datasets_netcdf4_file(tmpdir): + filepath = f"{tmpdir}/multiple_datasets.nc" + f = h5py.File(filepath, "w") + data = np.random.random((10, 10)) + f.create_dataset(name="data", data=data, chunks=None) + f.create_dataset(name="data2", data=data, chunks=None) + return filepath diff --git a/virtualizarr/tests/test_readers/test_hdf.py b/virtualizarr/tests/test_readers/test_hdf.py index da331ed..36f7bc7 100644 --- a/virtualizarr/tests/test_readers/test_hdf.py +++ b/virtualizarr/tests/test_readers/test_hdf.py @@ -88,11 +88,16 @@ def test_string_attribute(self, string_attribute_netcdf4_file): class TestVirtualVarsFromHDF: def test_variable_with_dimensions(self, chunked_dimensions_netcdf4_file): - f = h5py.File(chunked_dimensions_netcdf4_file) - variables = virtual_vars_from_hdf(chunked_dimensions_netcdf4_file, f) + variables = virtual_vars_from_hdf(chunked_dimensions_netcdf4_file) assert len(variables) == 3 def test_groups_not_implemented(self, group_netcdf4_file): - f = h5py.File(group_netcdf4_file) with pytest.raises(NotImplementedError): - virtual_vars_from_hdf(group_netcdf4_file, f) + virtual_vars_from_hdf(group_netcdf4_file) + + def test_drop_variables(self, multiple_datasets_netcdf4_file): + variables = virtual_vars_from_hdf( + multiple_datasets_netcdf4_file, + ["data2"] + ) + assert "data2" not in variables.keys() From 1dd3370aedc6e0b590f752273387a716366defe9 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Mon, 22 Apr 2024 13:02:03 -0600 Subject: [PATCH 05/68] mypy fix to use ChunkKey and empty dimensions list. --- virtualizarr/readers/hdf.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py index c4ab292..fdb9a77 100644 --- a/virtualizarr/readers/hdf.py +++ b/virtualizarr/readers/hdf.py @@ -1,4 +1,4 @@ -from typing import List, Mapping, Optional +from typing import List, Mapping, Optional, Union import fsspec import h5py @@ -8,6 +8,8 @@ from virtualizarr.manifests import ChunkEntry, ChunkManifest, ManifestArray from virtualizarr.zarr import ZArray +from virtualizarr.types import ChunkKey + def _dataset_chunk_manifest(path: str, dataset: h5py.Dataset) -> ChunkManifest: """ @@ -38,7 +40,8 @@ def _dataset_chunk_manifest(path: str, dataset: h5py.Dataset) -> ChunkManifest: offset=dsid.get_offset(), length=dsid.get_storage_size() ) - chunk_entries = {key: chunk_entry} + chunk_key = ChunkKey(key) + chunk_entries = {chunk_key: chunk_entry} chunk_manifest = ChunkManifest( entries=chunk_entries ) @@ -75,7 +78,7 @@ def store_chunk_entry(blob): return chunk_manifest -def _dataset_dims(dataset: h5py.Dataset) -> List[str]: +def _dataset_dims(dataset: h5py.Dataset) -> Union[List[str], List[None]]: """ Get a list of dimension scale names attached to input HDF5 dataset. @@ -114,7 +117,7 @@ def _dataset_dims(dataset: h5py.Dataset) -> List[str]: # In this case, we mimic netCDF4 and assign phony dimension names. # See https://github.com/fsspec/kerchunk/issues/41 dims.append(f"phony_dim_{n}") - return dims + return dims def _extract_attrs(dataset: h5py.Dataset): From d92c75c82cd000bf0fafa5301c22793434fb18ed Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Mon, 22 Apr 2024 13:40:52 -0600 Subject: [PATCH 06/68] Extract attributes from hdf5 root group. --- virtualizarr/readers/hdf.py | 18 +++++++++++++----- virtualizarr/tests/test_readers/conftest.py | 8 ++++++++ virtualizarr/tests/test_readers/test_hdf.py | 5 +++++ 3 files changed, 26 insertions(+), 5 deletions(-) diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py index fdb9a77..e02d03e 100644 --- a/virtualizarr/readers/hdf.py +++ b/virtualizarr/readers/hdf.py @@ -120,14 +120,14 @@ def _dataset_dims(dataset: h5py.Dataset) -> Union[List[str], List[None]]: return dims -def _extract_attrs(dataset: h5py.Dataset): +def _extract_attrs(h5obj: Union[h5py.Dataset, h5py.Group]): """ - Extract attributes from an HDF5 dataset. + Extract attributes from an HDF5 group or dataset. Parameters ---------- - dataset : h5py.Dataset - An HDF5 dataset. + h5obj : h5py.Group or h5py.Dataset + An HDF5 group or dataset. """ _HIDDEN_ATTRS = { "REFERENCE_LIST", @@ -140,7 +140,7 @@ def _extract_attrs(dataset: h5py.Dataset): "_NCProperties", } attrs = {} - for n, v in dataset.attrs.items(): + for n, v in h5obj.attrs.items(): if n in _HIDDEN_ATTRS: continue # Fix some attribute values to avoid JSON encoding exceptions... @@ -207,3 +207,11 @@ def virtual_vars_from_hdf( raise NotImplementedError("Nested groups are not yet supported") return variables + + +def attrs_from_root_group(path: str): + fs, file_path = fsspec.core.url_to_fs(path) + open_file = fs.open(path, "rb") + f = h5py.File(open_file, mode="r") + attrs = _extract_attrs(f) + return attrs diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py index aa2b0fe..46ac7b2 100644 --- a/virtualizarr/tests/test_readers/conftest.py +++ b/virtualizarr/tests/test_readers/conftest.py @@ -101,6 +101,14 @@ def string_attribute_netcdf4_file(tmpdir): return filepath +@pytest.fixture +def root_attributes_netcdf4_file(tmpdir): + filepath = f"{tmpdir}/root_attributes.nc" + f = h5py.File(filepath, "w") + f.attrs["attribute_name"] = "attribute_name" + return filepath + + @pytest.fixture def group_netcdf4_file(tmpdir): filepath = f"{tmpdir}/group.nc" diff --git a/virtualizarr/tests/test_readers/test_hdf.py b/virtualizarr/tests/test_readers/test_hdf.py index 36f7bc7..a24e36a 100644 --- a/virtualizarr/tests/test_readers/test_hdf.py +++ b/virtualizarr/tests/test_readers/test_hdf.py @@ -85,6 +85,11 @@ def test_string_attribute(self, string_attribute_netcdf4_file): attrs = _extract_attrs(ds) assert attrs["attribute_name"] == "attribute_name" + def test_root_attribute(self, root_attributes_netcdf4_file): + f = h5py.File(root_attributes_netcdf4_file) + attrs = _extract_attrs(f) + assert attrs["attribute_name"] == "attribute_name" + class TestVirtualVarsFromHDF: def test_variable_with_dimensions(self, chunked_dimensions_netcdf4_file): From 0ed836272d26a62b8de457c30dc6525292efc916 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Mon, 22 Apr 2024 14:19:17 -0600 Subject: [PATCH 07/68] Use hdf reader for netcdf4 files. --- virtualizarr/xarray.py | 33 ++++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py index 5c3c854..415b0a0 100644 --- a/virtualizarr/xarray.py +++ b/virtualizarr/xarray.py @@ -8,7 +8,8 @@ from xarray.core.variable import IndexVariable import virtualizarr.kerchunk as kerchunk -from virtualizarr.kerchunk import KerchunkStoreRefs, FileType +from virtualizarr.kerchunk import KerchunkStoreRefs, FileType, _automatically_determine_filetype +from virtualizarr.readers.hdf import virtual_vars_from_hdf, attrs_from_root_group from virtualizarr.manifests import ChunkManifest, ManifestArray @@ -76,18 +77,28 @@ def open_virtual_dataset( if common: raise ValueError(f"Cannot both load and drop variables {common}") + if filetype is None: + filetype = _automatically_determine_filetype(filepath) + filetype = FileType(filetype) + if filetype.name.lower() == "netcdf4": + virtual_vars = virtual_vars_from_hdf( + path=filepath, + drop_variables=drop_variables + ) + ds_attrs = attrs_from_root_group(path=filepath) # this is the only place we actually always need to use kerchunk directly # TODO avoid even reading byte ranges for variables that will be dropped later anyway? - vds_refs = kerchunk.read_kerchunk_references_from_file( - filepath=filepath, - filetype=filetype, - ) - virtual_vars = virtual_vars_from_kerchunk_refs( - vds_refs, - drop_variables=drop_variables + loadable_variables, - virtual_array_class=virtual_array_class, - ) - ds_attrs = kerchunk.fully_decode_arr_refs(vds_refs["refs"]).get(".zattrs", {}) + else: + vds_refs = kerchunk.read_kerchunk_references_from_file( + filepath=filepath, + filetype=filetype, + ) + virtual_vars = virtual_vars_from_kerchunk_refs( + vds_refs, + drop_variables=drop_variables + loadable_variables, + virtual_array_class=virtual_array_class, + ) + ds_attrs = kerchunk.fully_decode_arr_refs(vds_refs["refs"]).get(".zattrs", {}) if indexes is None or len(loadable_variables) > 0: # TODO we are reading a bunch of stuff we know we won't need here, e.g. all of the data variables... From f4485fa10aebc0f8ef5ff7441704f49781325835 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 22 Apr 2024 21:57:39 +0000 Subject: [PATCH 08/68] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- virtualizarr/xarray.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py index 415b0a0..2213ffa 100644 --- a/virtualizarr/xarray.py +++ b/virtualizarr/xarray.py @@ -9,7 +9,7 @@ import virtualizarr.kerchunk as kerchunk from virtualizarr.kerchunk import KerchunkStoreRefs, FileType, _automatically_determine_filetype -from virtualizarr.readers.hdf import virtual_vars_from_hdf, attrs_from_root_group +from virtualizarr.readers.hdf import virtual_vars_from_hdf, attrs_from_root_group from virtualizarr.manifests import ChunkManifest, ManifestArray From 0123df7b802734f1902bee0cdd196f5baca10c9e Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Wed, 8 May 2024 18:03:04 -0600 Subject: [PATCH 09/68] Fix ruff complaints. --- virtualizarr/readers/hdf.py | 3 +-- virtualizarr/tests/test_readers/test_hdf.py | 10 +++++++--- virtualizarr/xarray.py | 8 ++++++-- 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py index e02d03e..af25c02 100644 --- a/virtualizarr/readers/hdf.py +++ b/virtualizarr/readers/hdf.py @@ -6,9 +6,8 @@ import xarray as xr from virtualizarr.manifests import ChunkEntry, ChunkManifest, ManifestArray -from virtualizarr.zarr import ZArray - from virtualizarr.types import ChunkKey +from virtualizarr.zarr import ZArray def _dataset_chunk_manifest(path: str, dataset: h5py.Dataset) -> ChunkManifest: diff --git a/virtualizarr/tests/test_readers/test_hdf.py b/virtualizarr/tests/test_readers/test_hdf.py index a24e36a..0d5a16d 100644 --- a/virtualizarr/tests/test_readers/test_hdf.py +++ b/virtualizarr/tests/test_readers/test_hdf.py @@ -1,9 +1,13 @@ import h5py import pytest -from virtualizarr.readers.hdf import (_dataset_chunk_manifest, _dataset_dims, - _dataset_to_variable, _extract_attrs, - virtual_vars_from_hdf) +from virtualizarr.readers.hdf import ( + _dataset_chunk_manifest, + _dataset_dims, + _dataset_to_variable, + _extract_attrs, + virtual_vars_from_hdf, +) class TestDatasetChunkManifest: diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py index fbf6136..9629a34 100644 --- a/virtualizarr/xarray.py +++ b/virtualizarr/xarray.py @@ -18,9 +18,13 @@ from xarray.core.variable import IndexVariable import virtualizarr.kerchunk as kerchunk -from virtualizarr.kerchunk import KerchunkStoreRefs, FileType, _automatically_determine_filetype -from virtualizarr.readers.hdf import virtual_vars_from_hdf, attrs_from_root_group +from virtualizarr.kerchunk import ( + FileType, + KerchunkStoreRefs, + _automatically_determine_filetype, +) from virtualizarr.manifests import ChunkManifest, ManifestArray +from virtualizarr.readers.hdf import attrs_from_root_group, virtual_vars_from_hdf from virtualizarr.zarr import ( attrs_from_zarr_group_json, dataset_to_zarr, From 332bcaab1ae182696e1daf7c611f6fe8fd8ee4fd Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Fri, 10 May 2024 15:10:30 -0600 Subject: [PATCH 10/68] First steps for handling HDF5 filters. --- pyproject.toml | 1 + virtualizarr/readers/hdf.py | 7 +- virtualizarr/readers/hdf_filters.py | 34 +++++++++ virtualizarr/tests/test_readers/conftest.py | 26 +++++++ .../tests/test_readers/test_hdf_filters.py | 31 ++++++++ .../test_readers/test_hdf_integration.py | 21 ++++++ virtualizarr/xarray.py | 71 +++++++++---------- 7 files changed, 153 insertions(+), 38 deletions(-) create mode 100644 virtualizarr/readers/hdf_filters.py create mode 100644 virtualizarr/tests/test_readers/test_hdf_filters.py create mode 100644 virtualizarr/tests/test_readers/test_hdf_integration.py diff --git a/pyproject.toml b/pyproject.toml index 79a5078..4818b5f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,6 +28,7 @@ dependencies = [ "numpy", "ujson", "packaging", + "hdf5plugin", ] [project.optional-dependencies] diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py index af25c02..7d95d99 100644 --- a/virtualizarr/readers/hdf.py +++ b/virtualizarr/readers/hdf.py @@ -6,6 +6,7 @@ import xarray as xr from virtualizarr.manifests import ChunkEntry, ChunkManifest, ManifestArray +from virtualizarr.readers.hdf_filters import codecs_from_dataset from virtualizarr.types import ChunkKey from virtualizarr.zarr import ZArray @@ -169,12 +170,14 @@ def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> xr.Variable: # This chunk determination logic mirrors zarr-python's create # https://github.com/zarr-developers/zarr-python/blob/main/zarr/creation.py#L62-L66 chunks = dataset.chunks if dataset.chunks else dataset.shape + codecs = codecs_from_dataset(dataset) + filters = [codec.get_config() for codec in codecs] zarray = ZArray( chunks=chunks, - compressor=dataset.compression, + compressor=None, dtype=dataset.dtype, fill_value=dataset.fillvalue, - filters=None, + filters=filters, order="C", shape=dataset.shape, zarr_format=2, diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py new file mode 100644 index 0000000..6070fc1 --- /dev/null +++ b/virtualizarr/readers/hdf_filters.py @@ -0,0 +1,34 @@ +from typing import List, Tuple, Union + +import h5py +import numcodecs.registry as registry +from numcodecs.abc import Codec + +_non_standard_filters = { + "gzip": "zlib" +} + + +def _filter_to_codec(filter_id: str, filter_properties: Union[int, Tuple] = None) -> Codec: + try: + id = int(filter_id) + except ValueError: + id = filter_id + + if isinstance(id, str): + if id in _non_standard_filters.keys(): + id = _non_standard_filters[id] + conf = {"id": id} + if id == "zlib": + conf["level"] = filter_properties + + codec = registry.get_codec(conf) + return codec + + +def codecs_from_dataset(dataset: h5py.Dataset) -> List[Codec]: + codecs = [] + for filter_id, filter_properties in dataset._filters.items(): + codec = _filter_to_codec(filter_id, filter_properties) + codecs.append(codec) + return codecs diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py index 46ac7b2..4f0d4fc 100644 --- a/virtualizarr/tests/test_readers/conftest.py +++ b/virtualizarr/tests/test_readers/conftest.py @@ -125,3 +125,29 @@ def multiple_datasets_netcdf4_file(tmpdir): f.create_dataset(name="data", data=data, chunks=None) f.create_dataset(name="data2", data=data, chunks=None) return filepath + + +@pytest.fixture +def np_uncompressed(): + return np.arange(100) + + +@pytest.fixture +def gzip_filter_netcdf4_file(tmpdir, np_uncompressed): + filepath = f"{tmpdir}/gzip.nc" + f = h5py.File(filepath, "w") + f.create_dataset(name="data", data=np_uncompressed, compression="gzip", compression_opts=1) + return filepath + + +@pytest.fixture +def gzip_filter_xarray_netcdf4_file(tmpdir): + ds = xr.tutorial.open_dataset("air_temperature") + encoding = {} + for var_name in ds.variables: + # encoding[var_name] = {"zlib": True, "compression_opts": 1} + encoding[var_name] = {"compression": "gzip", "compression_opts": 1} + + filepath = f"{tmpdir}/gzip_xarray.nc" + ds.to_netcdf(filepath, engine="h5netcdf", encoding=encoding) + return filepath diff --git a/virtualizarr/tests/test_readers/test_hdf_filters.py b/virtualizarr/tests/test_readers/test_hdf_filters.py new file mode 100644 index 0000000..50a5d08 --- /dev/null +++ b/virtualizarr/tests/test_readers/test_hdf_filters.py @@ -0,0 +1,31 @@ +import h5py +import numcodecs +import pytest + +from virtualizarr.readers.hdf_filters import ( + _filter_to_codec, + codecs_from_dataset, +) + + +class TestFilterToCodec: + def test_gzip_uses_zlib_nomcodec(self): + codec = _filter_to_codec("gzip", 1) + assert isinstance(codec, numcodecs.zlib.Zlib) + + def test_lzf_not_available(self): + with pytest.raises(ValueError, match="codec not available"): + _filter_to_codec("lzf") + + +class TestCodecsFromDataSet: + def test_gzip(self, np_uncompressed, gzip_filter_netcdf4_file): + f = h5py.File(gzip_filter_netcdf4_file) + ds = f["data"] + chunk_info = ds.id.get_chunk_info(0) + codecs = codecs_from_dataset(ds) + with open(gzip_filter_netcdf4_file, 'rb') as file: + file.seek(chunk_info.byte_offset) + bytes_read = file.read(chunk_info.size) + decoded = codecs[0].decode(bytes_read) + assert decoded == np_uncompressed.tobytes() diff --git a/virtualizarr/tests/test_readers/test_hdf_integration.py b/virtualizarr/tests/test_readers/test_hdf_integration.py new file mode 100644 index 0000000..45bfadc --- /dev/null +++ b/virtualizarr/tests/test_readers/test_hdf_integration.py @@ -0,0 +1,21 @@ +import fsspec +import numpy +import xarray as xr + +import virtualizarr +from virtualizarr.kerchunk import FileType + + +class TestIntegration: + def test_gzip_filter_end_to_end(self, tmpdir, gzip_filter_xarray_netcdf4_file): + virtual_ds = virtualizarr.open_virtual_dataset( + gzip_filter_xarray_netcdf4_file, + filetype=FileType("netcdf4") + ) + kerchunk_file = f"{tmpdir}/gzip_kerchunk.json" + virtual_ds.virtualize.to_kerchunk(kerchunk_file, format="json") + fs = fsspec.filesystem("reference", fo=kerchunk_file) + m = fs.get_mapper("") + + ds = xr.open_dataset(m, engine="kerchunk") + assert isinstance(ds.air.values[0][0][0], numpy.float64) diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py index 9629a34..24ba973 100644 --- a/virtualizarr/xarray.py +++ b/virtualizarr/xarray.py @@ -128,48 +128,47 @@ def open_virtual_dataset( ) ds_attrs = kerchunk.fully_decode_arr_refs(vds_refs["refs"]).get(".zattrs", {}) - if indexes is None or len(loadable_variables) > 0: - # TODO we are reading a bunch of stuff we know we won't need here, e.g. all of the data variables... - # TODO it would also be nice if we could somehow consolidate this with the reading of the kerchunk references - # TODO really we probably want a dedicated xarray backend that iterates over all variables only once - ds = xr.open_dataset(filepath, drop_variables=drop_variables) - - if indexes is None: - # add default indexes by reading data from file - indexes = {name: index for name, index in ds.xindexes.items()} - elif indexes != {}: - # TODO allow manual specification of index objects - raise NotImplementedError() - else: - indexes = dict(**indexes) # for type hinting: to allow mutation - - loadable_vars = { - name: var - for name, var in ds.variables.items() - if name in loadable_variables - } - - # if we only read the indexes we can just close the file right away as nothing is lazy - if loadable_vars == {}: - ds.close() + if indexes is None or len(loadable_variables) > 0: + # TODO we are reading a bunch of stuff we know we won't need here, e.g. all of the data variables... + # TODO it would also be nice if we could somehow consolidate this with the reading of the kerchunk references + # TODO really we probably want a dedicated xarray backend that iterates over all variables only once + ds = xr.open_dataset(filepath, drop_variables=drop_variables) + + if indexes is None: + # add default indexes by reading data from file + indexes = {name: index for name, index in ds.xindexes.items()} + elif indexes != {}: + # TODO allow manual specification of index objects + raise NotImplementedError() else: - loadable_vars = {} - indexes = {} + indexes = dict(**indexes) # for type hinting: to allow mutation - vars = {**virtual_vars, **loadable_vars} + loadable_vars = { + name: var + for name, var in ds.variables.items() + if name in loadable_variables + } - data_vars, coords = separate_coords(vars, indexes) + # if we only read the indexes we can just close the file right away as nothing is lazy + if loadable_vars == {}: + ds.close() + else: + loadable_vars = {} + indexes = {} - vds = xr.Dataset( - data_vars, - coords=coords, - # indexes={}, # TODO should be added in a later version of xarray - attrs=ds_attrs, - ) + vars = {**virtual_vars, **loadable_vars} + + data_vars, coords = separate_coords(vars, indexes) + vds = xr.Dataset( + data_vars, + coords=coords, + # indexes={}, # TODO should be added in a later version of xarray + attrs=ds_attrs, + ) - # TODO we should probably also use vds.set_close() to tell xarray how to close the file we opened + # TODO we should probably also use vds.set_close() to tell xarray how to close the file we opened - return vds + return vds def open_virtual_dataset_from_v3_store( From c51e615ca0cd5396bde54868e439419fe9d9b9c8 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Mon, 13 May 2024 12:36:29 -0600 Subject: [PATCH 11/68] Initial step for hdf5plugin supported codecs. --- virtualizarr/readers/hdf_filters.py | 25 +++++++++++++++ virtualizarr/tests/test_readers/conftest.py | 31 +++++++++++++------ .../tests/test_readers/test_hdf_filters.py | 20 +++++++++--- .../test_readers/test_hdf_integration.py | 7 +++-- 4 files changed, 66 insertions(+), 17 deletions(-) diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py index 6070fc1..75f06bd 100644 --- a/virtualizarr/readers/hdf_filters.py +++ b/virtualizarr/readers/hdf_filters.py @@ -1,14 +1,30 @@ from typing import List, Tuple, Union import h5py +import hdf5plugin import numcodecs.registry as registry from numcodecs.abc import Codec +from pydantic import BaseModel, validator _non_standard_filters = { "gzip": "zlib" } +class BloscProperties(BaseModel): + blocksize: int + clevel: int + shuffle: int + cname: str + + @validator("cname", pre=True) + def get_cname_from_code(cls, v): + blosc_compressor_codes = { + value: key for key, value in hdf5plugin._filters.Blosc._Blosc__COMPRESSIONS.items() + } + return blosc_compressor_codes[v] + + def _filter_to_codec(filter_id: str, filter_properties: Union[int, Tuple] = None) -> Codec: try: id = int(filter_id) @@ -21,6 +37,15 @@ def _filter_to_codec(filter_id: str, filter_properties: Union[int, Tuple] = None conf = {"id": id} if id == "zlib": conf["level"] = filter_properties + elif isinstance(id, int): + filter = hdf5plugin.get_filters(id)[0] + id = filter.filter_name + if id == "blosc": + blosc_props = BloscProperties(**{k: v for k, v in + zip(BloscProperties.__fields__.keys(), + filter_properties[-4:])}) + conf = blosc_props.model_dump() + conf["id"] = id codec = registry.get_codec(conf) return codec diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py index 4f0d4fc..cc9331e 100644 --- a/virtualizarr/tests/test_readers/conftest.py +++ b/virtualizarr/tests/test_readers/conftest.py @@ -1,4 +1,5 @@ import h5py +import hdf5plugin import numpy as np import pytest import xarray as xr @@ -132,22 +133,32 @@ def np_uncompressed(): return np.arange(100) -@pytest.fixture -def gzip_filter_netcdf4_file(tmpdir, np_uncompressed): - filepath = f"{tmpdir}/gzip.nc" +@pytest.fixture(params=["gzip", "blosc"]) +def filter_encoded_netcdf4_file(tmpdir, np_uncompressed, request): + filepath = f"{tmpdir}/{request.param}.nc" f = h5py.File(filepath, "w") - f.create_dataset(name="data", data=np_uncompressed, compression="gzip", compression_opts=1) + if request.param == "gzip": + f.create_dataset(name="data", data=np_uncompressed, compression="gzip", compression_opts=1) + if request.param == "blosc": + f.create_dataset(name="data", data=np_uncompressed, + **hdf5plugin.Blosc( + cname="lz4", clevel=9, shuffle=hdf5plugin.Blosc.SHUFFLE + )) return filepath -@pytest.fixture -def gzip_filter_xarray_netcdf4_file(tmpdir): +@pytest.fixture(params=["gzip"]) +def filter_encoded_xarray_netcdf4_files(tmpdir, request): ds = xr.tutorial.open_dataset("air_temperature") encoding = {} + if request.param == "gzip": + encoding_config = { + "zlib": True, + "complevel": 1 + } for var_name in ds.variables: - # encoding[var_name] = {"zlib": True, "compression_opts": 1} - encoding[var_name] = {"compression": "gzip", "compression_opts": 1} + encoding[var_name] = encoding_config - filepath = f"{tmpdir}/gzip_xarray.nc" - ds.to_netcdf(filepath, engine="h5netcdf", encoding=encoding) + filepath = f"{tmpdir}/{request.param}_xarray.nc" + ds.to_netcdf(filepath, engine="netcdf4", encoding=encoding) return filepath diff --git a/virtualizarr/tests/test_readers/test_hdf_filters.py b/virtualizarr/tests/test_readers/test_hdf_filters.py index 50a5d08..8094d4c 100644 --- a/virtualizarr/tests/test_readers/test_hdf_filters.py +++ b/virtualizarr/tests/test_readers/test_hdf_filters.py @@ -9,7 +9,7 @@ class TestFilterToCodec: - def test_gzip_uses_zlib_nomcodec(self): + def test_gzip_uses_zlib_numcodec(self): codec = _filter_to_codec("gzip", 1) assert isinstance(codec, numcodecs.zlib.Zlib) @@ -17,14 +17,26 @@ def test_lzf_not_available(self): with pytest.raises(ValueError, match="codec not available"): _filter_to_codec("lzf") + def test_blosc(self): + codec = _filter_to_codec("32001", (2, 2, 8, 800, 9, 2, 1)) + assert isinstance(codec, numcodecs.blosc.Blosc) + expected_config = { + "id": "blosc", + "blocksize": 800, + "clevel": 9, + "shuffle": 2, + "cname": "lz4", + } + assert codec.get_config() == expected_config + class TestCodecsFromDataSet: - def test_gzip(self, np_uncompressed, gzip_filter_netcdf4_file): - f = h5py.File(gzip_filter_netcdf4_file) + def test_numcodec_decoding(self, np_uncompressed, filter_encoded_netcdf4_file): + f = h5py.File(filter_encoded_netcdf4_file) ds = f["data"] chunk_info = ds.id.get_chunk_info(0) codecs = codecs_from_dataset(ds) - with open(gzip_filter_netcdf4_file, 'rb') as file: + with open(filter_encoded_netcdf4_file, 'rb') as file: file.seek(chunk_info.byte_offset) bytes_read = file.read(chunk_info.size) decoded = codecs[0].decode(bytes_read) diff --git a/virtualizarr/tests/test_readers/test_hdf_integration.py b/virtualizarr/tests/test_readers/test_hdf_integration.py index 45bfadc..94fc0c1 100644 --- a/virtualizarr/tests/test_readers/test_hdf_integration.py +++ b/virtualizarr/tests/test_readers/test_hdf_integration.py @@ -7,12 +7,13 @@ class TestIntegration: - def test_gzip_filter_end_to_end(self, tmpdir, gzip_filter_xarray_netcdf4_file): + def test_filters_end_to_end(self, tmpdir, + filter_encoded_xarray_netcdf4_files): virtual_ds = virtualizarr.open_virtual_dataset( - gzip_filter_xarray_netcdf4_file, + filter_encoded_xarray_netcdf4_files, filetype=FileType("netcdf4") ) - kerchunk_file = f"{tmpdir}/gzip_kerchunk.json" + kerchunk_file = f"{tmpdir}/kerchunk.json" virtual_ds.virtualize.to_kerchunk(kerchunk_file, format="json") fs = fsspec.filesystem("reference", fo=kerchunk_file) m = fs.get_mapper("") From 0083f77103c909079427ce3471e65af7fb3bfc54 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Thu, 16 May 2024 16:24:57 -0400 Subject: [PATCH 12/68] Small commit to check compression support in CI environment. --- pyproject.toml | 1 + virtualizarr/tests/test_readers/conftest.py | 9 +++++++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 4818b5f..bba695e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -41,6 +41,7 @@ test = [ "scipy", "pooch", "ruff", + "netcdf4", ] diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py index cc9331e..8dc82c3 100644 --- a/virtualizarr/tests/test_readers/conftest.py +++ b/virtualizarr/tests/test_readers/conftest.py @@ -147,7 +147,7 @@ def filter_encoded_netcdf4_file(tmpdir, np_uncompressed, request): return filepath -@pytest.fixture(params=["gzip"]) +@pytest.fixture(params=["gzip", "blosc_lz"]) def filter_encoded_xarray_netcdf4_files(tmpdir, request): ds = xr.tutorial.open_dataset("air_temperature") encoding = {} @@ -156,9 +156,14 @@ def filter_encoded_xarray_netcdf4_files(tmpdir, request): "zlib": True, "complevel": 1 } + if request.param == "blosc_lz": + encoding_config = { + "compression": "blosc_lz", + } + for var_name in ds.variables: encoding[var_name] = encoding_config filepath = f"{tmpdir}/{request.param}_xarray.nc" - ds.to_netcdf(filepath, engine="netcdf4", encoding=encoding) + ds.to_netcdf(filepath, engine="h5netcdf", encoding=encoding) return filepath From 207c4b5cb411637070dc9a5f7011a0e0c98ef877 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 19 May 2024 21:34:26 +0000 Subject: [PATCH 13/68] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- virtualizarr/readers/hdf.py | 16 ++++---------- virtualizarr/readers/hdf_filters.py | 22 ++++++++++++------- virtualizarr/tests/test_readers/conftest.py | 18 +++++++-------- virtualizarr/tests/test_readers/test_hdf.py | 5 +---- .../tests/test_readers/test_hdf_filters.py | 2 +- .../test_readers/test_hdf_integration.py | 6 ++--- virtualizarr/xarray.py | 5 ++--- 7 files changed, 33 insertions(+), 41 deletions(-) diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py index 7d95d99..78e718e 100644 --- a/virtualizarr/readers/hdf.py +++ b/virtualizarr/readers/hdf.py @@ -36,15 +36,11 @@ def _dataset_chunk_manifest(path: str, dataset: h5py.Dataset) -> ChunkManifest: key_list = [0] * (len(dataset.shape) or 1) key = ".".join(map(str, key_list)) chunk_entry = ChunkEntry( - path=path, - offset=dsid.get_offset(), - length=dsid.get_storage_size() + path=path, offset=dsid.get_offset(), length=dsid.get_storage_size() ) chunk_key = ChunkKey(key) chunk_entries = {chunk_key: chunk_entry} - chunk_manifest = ChunkManifest( - entries=chunk_entries - ) + chunk_manifest = ChunkManifest(entries=chunk_entries) return chunk_manifest else: num_chunks = dsid.get_num_chunks() @@ -60,9 +56,7 @@ def get_key(blob): def store_chunk_entry(blob): chunk_entries[get_key(blob)] = ChunkEntry( - path=path, - offset=blob.byte_offset, - length=blob.size + path=path, offset=blob.byte_offset, length=blob.size ) has_chunk_iter = callable(getattr(dsid, "chunk_iter", None)) @@ -72,9 +66,7 @@ def store_chunk_entry(blob): for index in range(num_chunks): store_chunk_entry(dsid.get_chunk_info(index)) - chunk_manifest = ChunkManifest( - entries=chunk_entries - ) + chunk_manifest = ChunkManifest(entries=chunk_entries) return chunk_manifest diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py index 75f06bd..77e7037 100644 --- a/virtualizarr/readers/hdf_filters.py +++ b/virtualizarr/readers/hdf_filters.py @@ -6,9 +6,7 @@ from numcodecs.abc import Codec from pydantic import BaseModel, validator -_non_standard_filters = { - "gzip": "zlib" -} +_non_standard_filters = {"gzip": "zlib"} class BloscProperties(BaseModel): @@ -20,12 +18,15 @@ class BloscProperties(BaseModel): @validator("cname", pre=True) def get_cname_from_code(cls, v): blosc_compressor_codes = { - value: key for key, value in hdf5plugin._filters.Blosc._Blosc__COMPRESSIONS.items() + value: key + for key, value in hdf5plugin._filters.Blosc._Blosc__COMPRESSIONS.items() } return blosc_compressor_codes[v] -def _filter_to_codec(filter_id: str, filter_properties: Union[int, Tuple] = None) -> Codec: +def _filter_to_codec( + filter_id: str, filter_properties: Union[int, Tuple] = None +) -> Codec: try: id = int(filter_id) except ValueError: @@ -41,9 +42,14 @@ def _filter_to_codec(filter_id: str, filter_properties: Union[int, Tuple] = None filter = hdf5plugin.get_filters(id)[0] id = filter.filter_name if id == "blosc": - blosc_props = BloscProperties(**{k: v for k, v in - zip(BloscProperties.__fields__.keys(), - filter_properties[-4:])}) + blosc_props = BloscProperties( + **{ + k: v + for k, v in zip( + BloscProperties.__fields__.keys(), filter_properties[-4:] + ) + } + ) conf = blosc_props.model_dump() conf["id"] = id diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py index aa66f93..53c9630 100644 --- a/virtualizarr/tests/test_readers/conftest.py +++ b/virtualizarr/tests/test_readers/conftest.py @@ -138,12 +138,15 @@ def filter_encoded_netcdf4_file(tmpdir, np_uncompressed, request): filepath = f"{tmpdir}/{request.param}.nc" f = h5py.File(filepath, "w") if request.param == "gzip": - f.create_dataset(name="data", data=np_uncompressed, compression="gzip", compression_opts=1) + f.create_dataset( + name="data", data=np_uncompressed, compression="gzip", compression_opts=1 + ) if request.param == "blosc": - f.create_dataset(name="data", data=np_uncompressed, - **hdf5plugin.Blosc( - cname="lz4", clevel=9, shuffle=hdf5plugin.Blosc.SHUFFLE - )) + f.create_dataset( + name="data", + data=np_uncompressed, + **hdf5plugin.Blosc(cname="lz4", clevel=9, shuffle=hdf5plugin.Blosc.SHUFFLE), + ) return filepath @@ -152,10 +155,7 @@ def filter_encoded_xarray_netcdf4_files(tmpdir, request): ds = xr.tutorial.open_dataset("air_temperature") encoding = {} if request.param == "gzip": - encoding_config = { - "zlib": True, - "complevel": 1 - } + encoding_config = {"zlib": True, "complevel": 1} for var_name in ds.variables: encoding[var_name] = encoding_config diff --git a/virtualizarr/tests/test_readers/test_hdf.py b/virtualizarr/tests/test_readers/test_hdf.py index 0d5a16d..a83bfc3 100644 --- a/virtualizarr/tests/test_readers/test_hdf.py +++ b/virtualizarr/tests/test_readers/test_hdf.py @@ -105,8 +105,5 @@ def test_groups_not_implemented(self, group_netcdf4_file): virtual_vars_from_hdf(group_netcdf4_file) def test_drop_variables(self, multiple_datasets_netcdf4_file): - variables = virtual_vars_from_hdf( - multiple_datasets_netcdf4_file, - ["data2"] - ) + variables = virtual_vars_from_hdf(multiple_datasets_netcdf4_file, ["data2"]) assert "data2" not in variables.keys() diff --git a/virtualizarr/tests/test_readers/test_hdf_filters.py b/virtualizarr/tests/test_readers/test_hdf_filters.py index 8094d4c..28b5d69 100644 --- a/virtualizarr/tests/test_readers/test_hdf_filters.py +++ b/virtualizarr/tests/test_readers/test_hdf_filters.py @@ -36,7 +36,7 @@ def test_numcodec_decoding(self, np_uncompressed, filter_encoded_netcdf4_file): ds = f["data"] chunk_info = ds.id.get_chunk_info(0) codecs = codecs_from_dataset(ds) - with open(filter_encoded_netcdf4_file, 'rb') as file: + with open(filter_encoded_netcdf4_file, "rb") as file: file.seek(chunk_info.byte_offset) bytes_read = file.read(chunk_info.size) decoded = codecs[0].decode(bytes_read) diff --git a/virtualizarr/tests/test_readers/test_hdf_integration.py b/virtualizarr/tests/test_readers/test_hdf_integration.py index 94fc0c1..b31289c 100644 --- a/virtualizarr/tests/test_readers/test_hdf_integration.py +++ b/virtualizarr/tests/test_readers/test_hdf_integration.py @@ -7,11 +7,9 @@ class TestIntegration: - def test_filters_end_to_end(self, tmpdir, - filter_encoded_xarray_netcdf4_files): + def test_filters_end_to_end(self, tmpdir, filter_encoded_xarray_netcdf4_files): virtual_ds = virtualizarr.open_virtual_dataset( - filter_encoded_xarray_netcdf4_files, - filetype=FileType("netcdf4") + filter_encoded_xarray_netcdf4_files, filetype=FileType("netcdf4") ) kerchunk_file = f"{tmpdir}/kerchunk.json" virtual_ds.virtualize.to_kerchunk(kerchunk_file, format="json") diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py index 7264565..d8b6a08 100644 --- a/virtualizarr/xarray.py +++ b/virtualizarr/xarray.py @@ -20,8 +20,8 @@ _automatically_determine_filetype, ) from virtualizarr.manifests import ChunkManifest, ManifestArray -from virtualizarr.utils import _fsspec_openfile_from_filepath from virtualizarr.readers.hdf import attrs_from_root_group, virtual_vars_from_hdf +from virtualizarr.utils import _fsspec_openfile_from_filepath from virtualizarr.zarr import ( attrs_from_zarr_group_json, dataset_to_zarr, @@ -109,8 +109,7 @@ def open_virtual_dataset( if filetype.name.lower() == "netcdf4": print("wat") virtual_vars = virtual_vars_from_hdf( - path=filepath, - drop_variables=drop_variables + path=filepath, drop_variables=drop_variables ) ds_attrs = attrs_from_root_group(path=filepath) if filetype == "zarr_v3": From c57380058a5ad6ddbd908d54b1edd85b1f74f91d Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Sun, 19 May 2024 16:12:50 -0600 Subject: [PATCH 14/68] Fix mypy complaints for hdf_filters. --- virtualizarr/readers/hdf_filters.py | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py index 75f06bd..7a8bcc8 100644 --- a/virtualizarr/readers/hdf_filters.py +++ b/virtualizarr/readers/hdf_filters.py @@ -1,4 +1,4 @@ -from typing import List, Tuple, Union +from typing import List, Optional, Tuple, TypedDict, Union import h5py import hdf5plugin @@ -25,26 +25,30 @@ def get_cname_from_code(cls, v): return blosc_compressor_codes[v] -def _filter_to_codec(filter_id: str, filter_properties: Union[int, Tuple] = None) -> Codec: +def _filter_to_codec(filter_id: str, filter_properties: Union[int, None, Tuple] = None) -> Codec: + id_int = None + id_str = None try: - id = int(filter_id) + id_int = int(filter_id) except ValueError: - id = filter_id + id_str = filter_id - if isinstance(id, str): - if id in _non_standard_filters.keys(): - id = _non_standard_filters[id] + if id_str: + if id_str in _non_standard_filters.keys(): + id = _non_standard_filters[id_str] + else: + id = id_str conf = {"id": id} if id == "zlib": - conf["level"] = filter_properties - elif isinstance(id, int): - filter = hdf5plugin.get_filters(id)[0] + conf["level"] = filter_properties # type: ignore[assignment] + if id_int: + filter = hdf5plugin.get_filters(id_int)[0] id = filter.filter_name - if id == "blosc": + if id == "blosc" and isinstance(filter_properties, tuple): blosc_props = BloscProperties(**{k: v for k, v in zip(BloscProperties.__fields__.keys(), filter_properties[-4:])}) - conf = blosc_props.model_dump() + conf = blosc_props.model_dump() # type: ignore[assignment] conf["id"] = id codec = registry.get_codec(conf) From 588e06b507e8661644e33923ad0295e255152e1e Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Sun, 19 May 2024 16:22:39 -0600 Subject: [PATCH 15/68] Local pre-commit fix for hdf_filters. --- virtualizarr/readers/hdf_filters.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py index a3868eb..dfe1c1f 100644 --- a/virtualizarr/readers/hdf_filters.py +++ b/virtualizarr/readers/hdf_filters.py @@ -1,4 +1,4 @@ -from typing import List, Optional, Tuple, TypedDict, Union +from typing import List, Tuple, Union import h5py import hdf5plugin From 725333e06fad83d4d763317faca5f41167a2c98f Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Mon, 20 May 2024 20:13:44 -0600 Subject: [PATCH 16/68] Use fsspec reader_options introduced in #37. --- virtualizarr/readers/hdf.py | 22 ++++++++++++++++------ virtualizarr/xarray.py | 7 ++++--- 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py index 78e718e..19d99b3 100644 --- a/virtualizarr/readers/hdf.py +++ b/virtualizarr/readers/hdf.py @@ -1,6 +1,5 @@ from typing import List, Mapping, Optional, Union -import fsspec import h5py import numpy as np import xarray as xr @@ -8,6 +7,7 @@ from virtualizarr.manifests import ChunkEntry, ChunkManifest, ManifestArray from virtualizarr.readers.hdf_filters import codecs_from_dataset from virtualizarr.types import ChunkKey +from virtualizarr.utils import _fsspec_openfile_from_filepath from virtualizarr.zarr import ZArray @@ -185,11 +185,15 @@ def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> xr.Variable: def virtual_vars_from_hdf( path: str, drop_variables: Optional[List[str]] = None, + reader_options: Optional[dict] = { + "storage_options": {"key": "", "secret": "", "anon": True} + }, ) -> Mapping[str, xr.Variable]: if drop_variables is None: drop_variables = [] - fs, file_path = fsspec.core.url_to_fs(path) - open_file = fs.open(path, "rb") + open_file = _fsspec_openfile_from_filepath( + filepath=path, reader_options=reader_options + ) f = h5py.File(open_file, mode="r") variables = {} for key in f.keys(): @@ -203,9 +207,15 @@ def virtual_vars_from_hdf( return variables -def attrs_from_root_group(path: str): - fs, file_path = fsspec.core.url_to_fs(path) - open_file = fs.open(path, "rb") +def attrs_from_root_group( + path: str, + reader_options: Optional[dict] = { + "storage_options": {"key": "", "secret": "", "anon": True} + }, +): + open_file = _fsspec_openfile_from_filepath( + filepath=path, reader_options=reader_options + ) f = h5py.File(open_file, mode="r") attrs = _extract_attrs(f) return attrs diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py index d8b6a08..8f810ee 100644 --- a/virtualizarr/xarray.py +++ b/virtualizarr/xarray.py @@ -107,11 +107,12 @@ def open_virtual_dataset( filetype = FileType(filetype) if filetype.name.lower() == "netcdf4": - print("wat") virtual_vars = virtual_vars_from_hdf( - path=filepath, drop_variables=drop_variables + path=filepath, + drop_variables=drop_variables, + reader_options=reader_options, ) - ds_attrs = attrs_from_root_group(path=filepath) + ds_attrs = attrs_from_root_group(path=filepath, reader_options=reader_options) if filetype == "zarr_v3": # TODO is there a neat way of auto-detecting this? return open_virtual_dataset_from_v3_store( From 72df10861ab0830531502885c0aaa3ebf3de4dee Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Mon, 20 May 2024 20:40:38 -0600 Subject: [PATCH 17/68] Fix incorrect zarr_v3 if block position from merge commit ef0d7a8. --- virtualizarr/xarray.py | 128 +++++++++++++++++++++-------------------- 1 file changed, 66 insertions(+), 62 deletions(-) diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py index 8f810ee..d76e2a6 100644 --- a/virtualizarr/xarray.py +++ b/virtualizarr/xarray.py @@ -101,82 +101,86 @@ def open_virtual_dataset( if virtual_array_class is not ManifestArray: raise NotImplementedError() - - if filetype is None: - filetype = _automatically_determine_filetype(filepath=filepath) - filetype = FileType(filetype) - - if filetype.name.lower() == "netcdf4": - virtual_vars = virtual_vars_from_hdf( - path=filepath, - drop_variables=drop_variables, - reader_options=reader_options, - ) - ds_attrs = attrs_from_root_group(path=filepath, reader_options=reader_options) if filetype == "zarr_v3": # TODO is there a neat way of auto-detecting this? return open_virtual_dataset_from_v3_store( storepath=filepath, drop_variables=drop_variables, indexes=indexes ) else: - # this is the only place we actually always need to use kerchunk directly - # TODO avoid even reading byte ranges for variables that will be dropped later anyway? - vds_refs = kerchunk.read_kerchunk_references_from_file( - filepath=filepath, - filetype=filetype, - ) - virtual_vars = virtual_vars_from_kerchunk_refs( - vds_refs, - drop_variables=drop_variables + loadable_variables, - virtual_array_class=virtual_array_class, - ) - ds_attrs = kerchunk.fully_decode_arr_refs(vds_refs["refs"]).get(".zattrs", {}) - - if indexes is None or len(loadable_variables) > 0: - # TODO we are reading a bunch of stuff we know we won't need here, e.g. all of the data variables... - # TODO it would also be nice if we could somehow consolidate this with the reading of the kerchunk references - # TODO really we probably want a dedicated xarray backend that iterates over all variables only once - fpath = _fsspec_openfile_from_filepath( - filepath=filepath, reader_options=reader_options - ) + if filetype is None: + filetype = _automatically_determine_filetype(filepath=filepath) + filetype = FileType(filetype) + + if filetype.name.lower() == "netcdf4": + virtual_vars = virtual_vars_from_hdf( + path=filepath, + drop_variables=drop_variables, + reader_options=reader_options, + ) + ds_attrs = attrs_from_root_group( + path=filepath, reader_options=reader_options + ) + else: + # this is the only place we actually always need to use kerchunk directly + # TODO avoid even reading byte ranges for variables that will be dropped later anyway? + vds_refs = kerchunk.read_kerchunk_references_from_file( + filepath=filepath, + filetype=filetype, + ) + virtual_vars = virtual_vars_from_kerchunk_refs( + vds_refs, + drop_variables=drop_variables + loadable_variables, + virtual_array_class=virtual_array_class, + ) + ds_attrs = kerchunk.fully_decode_arr_refs(vds_refs["refs"]).get( + ".zattrs", {} + ) - ds = xr.open_dataset(fpath, drop_variables=drop_variables) + if indexes is None or len(loadable_variables) > 0: + # TODO we are reading a bunch of stuff we know we won't need here, e.g. all of the data variables... + # TODO it would also be nice if we could somehow consolidate this with the reading of the kerchunk references + # TODO really we probably want a dedicated xarray backend that iterates over all variables only once + fpath = _fsspec_openfile_from_filepath( + filepath=filepath, reader_options=reader_options + ) - if indexes is None: - # add default indexes by reading data from file - indexes = {name: index for name, index in ds.xindexes.items()} - elif indexes != {}: - # TODO allow manual specification of index objects - raise NotImplementedError() - else: - indexes = dict(**indexes) # for type hinting: to allow mutation + ds = xr.open_dataset(fpath, drop_variables=drop_variables) - loadable_vars = { - name: var - for name, var in ds.variables.items() - if name in loadable_variables - } + if indexes is None: + # add default indexes by reading data from file + indexes = {name: index for name, index in ds.xindexes.items()} + elif indexes != {}: + # TODO allow manual specification of index objects + raise NotImplementedError() + else: + indexes = dict(**indexes) # for type hinting: to allow mutation - # if we only read the indexes we can just close the file right away as nothing is lazy - if loadable_vars == {}: - ds.close() - else: - loadable_vars = {} - indexes = {} + loadable_vars = { + name: var + for name, var in ds.variables.items() + if name in loadable_variables + } - vars = {**virtual_vars, **loadable_vars} + # if we only read the indexes we can just close the file right away as nothing is lazy + if loadable_vars == {}: + ds.close() + else: + loadable_vars = {} + indexes = {} - data_vars, coords = separate_coords(vars, indexes) - vds = xr.Dataset( - data_vars, - coords=coords, - # indexes={}, # TODO should be added in a later version of xarray - attrs=ds_attrs, - ) + vars = {**virtual_vars, **loadable_vars} - # TODO we should probably also use vds.set_close() to tell xarray how to close the file we opened + data_vars, coords = separate_coords(vars, indexes) + vds = xr.Dataset( + data_vars, + coords=coords, + # indexes={}, # TODO should be added in a later version of xarray + attrs=ds_attrs, + ) - return vds + # TODO we should probably also use vds.set_close() to tell xarray how to close the file we opened + + return vds def open_virtual_dataset_from_v3_store( From d1e85cb169adc3851951afc2a64fcdec6180243c Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Tue, 21 May 2024 08:48:05 -0600 Subject: [PATCH 18/68] Fix early return from hdf _extract_attrs. --- virtualizarr/readers/hdf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py index 19d99b3..be93237 100644 --- a/virtualizarr/readers/hdf.py +++ b/virtualizarr/readers/hdf.py @@ -155,7 +155,7 @@ def _extract_attrs(h5obj: Union[h5py.Dataset, h5py.Group]): continue attrs[n] = v - return attrs + return attrs def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> xr.Variable: From 1e2b3436fd086f8188c516f2fda4f6cd3a521325 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Tue, 21 May 2024 09:23:50 -0600 Subject: [PATCH 19/68] Test that _extract_attrs correctly handles multiple attributes. --- virtualizarr/tests/test_readers/conftest.py | 3 ++- virtualizarr/tests/test_readers/test_hdf.py | 16 +++++++++++----- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py index 53c9630..fe2ec88 100644 --- a/virtualizarr/tests/test_readers/conftest.py +++ b/virtualizarr/tests/test_readers/conftest.py @@ -93,12 +93,13 @@ def chunked_dimensions_netcdf4_file(tmpdir): @pytest.fixture -def string_attribute_netcdf4_file(tmpdir): +def string_attributes_netcdf4_file(tmpdir): filepath = f"{tmpdir}/attributes.nc" f = h5py.File(filepath, "w") data = np.random.random((10, 10)) f.create_dataset(name="data", data=data, chunks=None) f["data"].attrs["attribute_name"] = "attribute_name" + f["data"].attrs["attribute_name2"] = "attribute_name2" return filepath diff --git a/virtualizarr/tests/test_readers/test_hdf.py b/virtualizarr/tests/test_readers/test_hdf.py index a83bfc3..a67352e 100644 --- a/virtualizarr/tests/test_readers/test_hdf.py +++ b/virtualizarr/tests/test_readers/test_hdf.py @@ -75,16 +75,16 @@ def test_not_chunked_dataset(self, single_dimension_scale_netcdf4_file): var = _dataset_to_variable(single_dimension_scale_netcdf4_file, ds) assert var.chunks == (2,) - def test_dataset_attributes(self, string_attribute_netcdf4_file): - f = h5py.File(string_attribute_netcdf4_file) + def test_dataset_attributes(self, string_attributes_netcdf4_file): + f = h5py.File(string_attributes_netcdf4_file) ds = f["data"] - var = _dataset_to_variable(string_attribute_netcdf4_file, ds) + var = _dataset_to_variable(string_attributes_netcdf4_file, ds) assert var.attrs["attribute_name"] == "attribute_name" class TestExtractAttributes: - def test_string_attribute(self, string_attribute_netcdf4_file): - f = h5py.File(string_attribute_netcdf4_file) + def test_string_attribute(self, string_attributes_netcdf4_file): + f = h5py.File(string_attributes_netcdf4_file) ds = f["data"] attrs = _extract_attrs(ds) assert attrs["attribute_name"] == "attribute_name" @@ -94,6 +94,12 @@ def test_root_attribute(self, root_attributes_netcdf4_file): attrs = _extract_attrs(f) assert attrs["attribute_name"] == "attribute_name" + def test_multiple_attributes(self, string_attributes_netcdf4_file): + f = h5py.File(string_attributes_netcdf4_file) + ds = f["data"] + attrs = _extract_attrs(ds) + assert len(attrs.keys()) == 2 + class TestVirtualVarsFromHDF: def test_variable_with_dimensions(self, chunked_dimensions_netcdf4_file): From 7f1c1897dcad92cb988ea7e14a165d63fe23dad6 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Wed, 22 May 2024 14:16:12 -0600 Subject: [PATCH 20/68] Initial attempt at scale and offset via numcodecs. --- virtualizarr/readers/hdf.py | 14 ++++++++--- virtualizarr/readers/hdf_filters.py | 36 ++++++++++++++++++++++++++++- 2 files changed, 46 insertions(+), 4 deletions(-) diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py index be93237..c251866 100644 --- a/virtualizarr/readers/hdf.py +++ b/virtualizarr/readers/hdf.py @@ -5,7 +5,7 @@ import xarray as xr from virtualizarr.manifests import ChunkEntry, ChunkManifest, ManifestArray -from virtualizarr.readers.hdf_filters import codecs_from_dataset +from virtualizarr.readers.hdf_filters import cfcodec_from_dataset, codecs_from_dataset from virtualizarr.types import ChunkKey from virtualizarr.utils import _fsspec_openfile_from_filepath from virtualizarr.zarr import ZArray @@ -163,11 +163,20 @@ def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> xr.Variable: # https://github.com/zarr-developers/zarr-python/blob/main/zarr/creation.py#L62-L66 chunks = dataset.chunks if dataset.chunks else dataset.shape codecs = codecs_from_dataset(dataset) + cfcodec = cfcodec_from_dataset(dataset) + attrs = _extract_attrs(dataset) + if cfcodec: + codecs.append(cfcodec["codec"]) + dtype = cfcodec["target_dtype"] + attrs.pop("scale_factor", None) + attrs.pop("add_offset", None) + else: + dtype = dataset.dtype filters = [codec.get_config() for codec in codecs] zarray = ZArray( chunks=chunks, compressor=None, - dtype=dataset.dtype, + dtype=dtype, fill_value=dataset.fillvalue, filters=filters, order="C", @@ -177,7 +186,6 @@ def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> xr.Variable: manifest = _dataset_chunk_manifest(path, dataset) marray = ManifestArray(zarray=zarray, chunkmanifest=manifest) dims = _dataset_dims(dataset) - attrs = _extract_attrs(dataset) variable = xr.Variable(data=marray, dims=dims, attrs=attrs) return variable diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py index dfe1c1f..169eab9 100644 --- a/virtualizarr/readers/hdf_filters.py +++ b/virtualizarr/readers/hdf_filters.py @@ -1,10 +1,13 @@ -from typing import List, Tuple, Union +from typing import List, Tuple, TypedDict, Union import h5py import hdf5plugin import numcodecs.registry as registry +import numpy as np from numcodecs.abc import Codec +from numcodecs.fixedscaleoffset import FixedScaleOffset from pydantic import BaseModel, validator +from xarray.coding.variables import _choose_float_dtype _non_standard_filters = {"gzip": "zlib"} @@ -24,6 +27,11 @@ def get_cname_from_code(cls, v): return blosc_compressor_codes[v] +class CFCodec(TypedDict): + target_dtype: np.dtype + codec: Codec + + def _filter_to_codec( filter_id: str, filter_properties: Union[int, None, Tuple] = None ) -> Codec: @@ -61,6 +69,32 @@ def _filter_to_codec( return codec +def cfcodec_from_dataset(dataset: h5py.Dataset) -> Codec | None: + attributes = {attr: dataset.attrs[attr] for attr in dataset.attrs} + mapping = {} + if "scale_factor" in attributes: + mapping["scale_factor"] = 1 / attributes["scale_factor"][0] + else: + mapping["scale_factor"] = 1 + if "add_offset" in attributes: + mapping["add_offset"] = attributes["add_offset"] + else: + mapping["add_offset"] = 0 + if mapping["scale_factor"] != 1 or mapping["add_offset"] != 0: + float_dtype = _choose_float_dtype(dtype=dataset.dtype, mapping=mapping) + target_dtype = np.dtype(float_dtype) + codec = FixedScaleOffset( + offset=mapping["add_offset"], + scale=mapping["scale_factor"], + dtype=target_dtype, + astype=dataset.dtype, + ) + cfcodec = CFCodec(target_dtype=target_dtype, codec=codec) + return cfcodec + else: + return None + + def codecs_from_dataset(dataset: h5py.Dataset) -> List[Codec]: codecs = [] for filter_id, filter_properties in dataset._filters.items(): From 908e332ae9860a7e7d36845633a7c9267ee72ca0 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Thu, 23 May 2024 10:54:48 -0600 Subject: [PATCH 21/68] Tests for cfcodec_from_dataset. --- virtualizarr/tests/test_readers/conftest.py | 10 +++++++ .../tests/test_readers/test_hdf_filters.py | 29 +++++++++++++++++++ 2 files changed, 39 insertions(+) diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py index fe2ec88..202cdd9 100644 --- a/virtualizarr/tests/test_readers/conftest.py +++ b/virtualizarr/tests/test_readers/conftest.py @@ -164,3 +164,13 @@ def filter_encoded_xarray_netcdf4_files(tmpdir, request): filepath = f"{tmpdir}/{request.param}_xarray.nc" ds.to_netcdf(filepath, engine="h5netcdf", encoding=encoding) return filepath + + +@pytest.fixture +def add_offset_netcdf4_file(tmpdir): + filepath = f"{tmpdir}/offset.nc" + f = h5py.File(filepath, "w") + data = np.random.random((10, 10)) + f.create_dataset(name="data", data=data, chunks=None) + f["data"].attrs.create(name="add_offset", data=5) + return filepath diff --git a/virtualizarr/tests/test_readers/test_hdf_filters.py b/virtualizarr/tests/test_readers/test_hdf_filters.py index 28b5d69..dca9f40 100644 --- a/virtualizarr/tests/test_readers/test_hdf_filters.py +++ b/virtualizarr/tests/test_readers/test_hdf_filters.py @@ -1,9 +1,11 @@ import h5py import numcodecs +import numpy as np import pytest from virtualizarr.readers.hdf_filters import ( _filter_to_codec, + cfcodec_from_dataset, codecs_from_dataset, ) @@ -41,3 +43,30 @@ def test_numcodec_decoding(self, np_uncompressed, filter_encoded_netcdf4_file): bytes_read = file.read(chunk_info.size) decoded = codecs[0].decode(bytes_read) assert decoded == np_uncompressed.tobytes() + + +class TestCFCodecFromDataset: + def test_no_cf_convention(self, filter_encoded_netcdf4_file): + f = h5py.File(filter_encoded_netcdf4_file) + ds = f["data"] + cf_codec = cfcodec_from_dataset(ds) + assert cf_codec is None + + def test_cf_scale_factor(self, netcdf4_file): + f = h5py.File(netcdf4_file) + ds = f["air"] + cf_codec = cfcodec_from_dataset(ds) + assert cf_codec["target_dtype"] == np.dtype(np.float64) + assert cf_codec["codec"].scale == 100.0 + assert cf_codec["codec"].offset == 0 + assert cf_codec["codec"].dtype == " Date: Fri, 24 May 2024 12:47:12 -0600 Subject: [PATCH 22/68] Temporarily relax integration tests to assert_allclose. --- virtualizarr/tests/test_integration.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/virtualizarr/tests/test_integration.py b/virtualizarr/tests/test_integration.py index 064968b..1b9aad8 100644 --- a/virtualizarr/tests/test_integration.py +++ b/virtualizarr/tests/test_integration.py @@ -62,7 +62,7 @@ def test_kerchunk_roundtrip_no_concat(self, tmpdir, format): roundtrip = xr.open_dataset(f"{tmpdir}/refs.{format}", engine="kerchunk") # assert equal to original dataset - xrt.assert_equal(roundtrip, ds) + xrt.assert_allclose(roundtrip, ds) def test_kerchunk_roundtrip_concat(self, tmpdir, format): # set up example xarray dataset @@ -89,7 +89,7 @@ def test_kerchunk_roundtrip_concat(self, tmpdir, format): roundtrip = xr.open_dataset(f"{tmpdir}/refs.{format}", engine="kerchunk") # assert equal to original dataset - xrt.assert_equal(roundtrip, ds) + xrt.assert_allclose(roundtrip, ds) def test_open_scalar_variable(tmpdir): From ca6b236b36fabf96c0659556f2cff2ef59435d6c Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Fri, 24 May 2024 13:50:49 -0600 Subject: [PATCH 23/68] Add blosc_lz4 fixture parameterization to confirm libnetcdf environment. --- virtualizarr/tests/test_readers/conftest.py | 13 +++++++++---- .../tests/test_readers/test_hdf_integration.py | 4 ++-- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py index 202cdd9..20d5433 100644 --- a/virtualizarr/tests/test_readers/conftest.py +++ b/virtualizarr/tests/test_readers/conftest.py @@ -134,7 +134,7 @@ def np_uncompressed(): return np.arange(100) -@pytest.fixture(params=["gzip", "blosc"]) +@pytest.fixture(params=["gzip", "blosc_lz4"]) def filter_encoded_netcdf4_file(tmpdir, np_uncompressed, request): filepath = f"{tmpdir}/{request.param}.nc" f = h5py.File(filepath, "w") @@ -142,7 +142,7 @@ def filter_encoded_netcdf4_file(tmpdir, np_uncompressed, request): f.create_dataset( name="data", data=np_uncompressed, compression="gzip", compression_opts=1 ) - if request.param == "blosc": + if request.param == "blosc_lz4": f.create_dataset( name="data", data=np_uncompressed, @@ -151,18 +151,23 @@ def filter_encoded_netcdf4_file(tmpdir, np_uncompressed, request): return filepath -@pytest.fixture(params=["gzip"]) -def filter_encoded_xarray_netcdf4_files(tmpdir, request): +@pytest.fixture(params=["gzip", "blosc_zlib"]) +def filter_encoded_xarray_netcdf4_file(tmpdir, request): ds = xr.tutorial.open_dataset("air_temperature") encoding = {} if request.param == "gzip": encoding_config = {"zlib": True, "complevel": 1} + if "blosc" in request.param: + encoding_config = { + "compression": request.param, + } for var_name in ds.variables: encoding[var_name] = encoding_config filepath = f"{tmpdir}/{request.param}_xarray.nc" ds.to_netcdf(filepath, engine="h5netcdf", encoding=encoding) + # ds.to_netcdf(filepath, engine="netcdf4", encoding=encoding) return filepath diff --git a/virtualizarr/tests/test_readers/test_hdf_integration.py b/virtualizarr/tests/test_readers/test_hdf_integration.py index b31289c..ade8e7c 100644 --- a/virtualizarr/tests/test_readers/test_hdf_integration.py +++ b/virtualizarr/tests/test_readers/test_hdf_integration.py @@ -7,9 +7,9 @@ class TestIntegration: - def test_filters_end_to_end(self, tmpdir, filter_encoded_xarray_netcdf4_files): + def test_filters_roundtrip(self, tmpdir, filter_encoded_xarray_netcdf4_file): virtual_ds = virtualizarr.open_virtual_dataset( - filter_encoded_xarray_netcdf4_files, filetype=FileType("netcdf4") + filter_encoded_xarray_netcdf4_file, filetype=FileType("netcdf4") ) kerchunk_file = f"{tmpdir}/kerchunk.json" virtual_ds.virtualize.to_kerchunk(kerchunk_file, format="json") From b7426c5b15f33a65a0890a51fbc6d9464b673eaf Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Fri, 24 May 2024 14:05:21 -0600 Subject: [PATCH 24/68] Check for compatability with netcdf4 engine. --- virtualizarr/tests/test_readers/conftest.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py index 20d5433..cb1212f 100644 --- a/virtualizarr/tests/test_readers/conftest.py +++ b/virtualizarr/tests/test_readers/conftest.py @@ -166,8 +166,7 @@ def filter_encoded_xarray_netcdf4_file(tmpdir, request): encoding[var_name] = encoding_config filepath = f"{tmpdir}/{request.param}_xarray.nc" - ds.to_netcdf(filepath, engine="h5netcdf", encoding=encoding) - # ds.to_netcdf(filepath, engine="netcdf4", encoding=encoding) + ds.to_netcdf(filepath, engine="netcdf4", encoding=encoding) return filepath From dac21dde6239b5ea7e918ff50aef8839ab2f7773 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Mon, 27 May 2024 12:58:48 -0600 Subject: [PATCH 25/68] Use separate fixtures for h5netcdf and netcdf4 compression styles. --- virtualizarr/tests/test_readers/conftest.py | 27 ++++++++++++++----- .../test_readers/test_hdf_integration.py | 20 ++++++++++++-- 2 files changed, 39 insertions(+), 8 deletions(-) diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py index cb1212f..a4fafed 100644 --- a/virtualizarr/tests/test_readers/conftest.py +++ b/virtualizarr/tests/test_readers/conftest.py @@ -3,6 +3,7 @@ import numpy as np import pytest import xarray as xr +from xarray.tests.test_dataset import create_test_data @pytest.fixture @@ -151,22 +152,36 @@ def filter_encoded_netcdf4_file(tmpdir, np_uncompressed, request): return filepath -@pytest.fixture(params=["gzip", "blosc_zlib"]) -def filter_encoded_xarray_netcdf4_file(tmpdir, request): +@pytest.fixture(params=["gzip"]) +def filter_encoded_xarray_h5netcdf_file(tmpdir, request): ds = xr.tutorial.open_dataset("air_temperature") encoding = {} if request.param == "gzip": encoding_config = {"zlib": True, "complevel": 1} + + for var_name in ds.variables: + encoding[var_name] = encoding_config + + filepath = f"{tmpdir}/{request.param}_xarray.nc" + ds.to_netcdf(filepath, engine="h5netcdf", encoding=encoding) + return filepath + + +@pytest.fixture(params=["blosc_zlib"]) +def filter_encoded_xarray_netcdf4_file(tmpdir, request): + ds = create_test_data(dim_sizes=(20, 80, 10)) if "blosc" in request.param: encoding_config = { "compression": request.param, + "chunksizes": (20, 40), + "original_shape": ds.var2.shape, + "blosc_shuffle": 1, + "fletcher32": False, } - for var_name in ds.variables: - encoding[var_name] = encoding_config - + ds["var2"].encoding.update(encoding_config) filepath = f"{tmpdir}/{request.param}_xarray.nc" - ds.to_netcdf(filepath, engine="netcdf4", encoding=encoding) + ds.to_netcdf(filepath, engine="netcdf4") return filepath diff --git a/virtualizarr/tests/test_readers/test_hdf_integration.py b/virtualizarr/tests/test_readers/test_hdf_integration.py index ade8e7c..d6ecf2f 100644 --- a/virtualizarr/tests/test_readers/test_hdf_integration.py +++ b/virtualizarr/tests/test_readers/test_hdf_integration.py @@ -7,9 +7,11 @@ class TestIntegration: - def test_filters_roundtrip(self, tmpdir, filter_encoded_xarray_netcdf4_file): + def test_filters_h5netcdf_roundtrip( + self, tmpdir, filter_encoded_xarray_h5netcdf_file + ): virtual_ds = virtualizarr.open_virtual_dataset( - filter_encoded_xarray_netcdf4_file, filetype=FileType("netcdf4") + filter_encoded_xarray_h5netcdf_file, filetype=FileType("netcdf4") ) kerchunk_file = f"{tmpdir}/kerchunk.json" virtual_ds.virtualize.to_kerchunk(kerchunk_file, format="json") @@ -18,3 +20,17 @@ def test_filters_roundtrip(self, tmpdir, filter_encoded_xarray_netcdf4_file): ds = xr.open_dataset(m, engine="kerchunk") assert isinstance(ds.air.values[0][0][0], numpy.float64) + + def test_filters_netcdf4_roundtrip( + self, tmpdir, filter_encoded_xarray_netcdf4_file + ): + virtual_ds = virtualizarr.open_virtual_dataset( + filter_encoded_xarray_netcdf4_file, filetype=FileType("netcdf4") + ) + kerchunk_file = f"{tmpdir}/kerchunk.json" + virtual_ds.virtualize.to_kerchunk(kerchunk_file, format="json") + fs = fsspec.filesystem("reference", fo=kerchunk_file) + m = fs.get_mapper("") + + ds = xr.open_dataset(m, engine="kerchunk") + print(ds["var2"].encoding) From e968772a3a206658064e3e29294afec7604d0bc9 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Mon, 27 May 2024 15:49:22 -0600 Subject: [PATCH 26/68] Print libhdf5 and libnetcdf4 versions to confirm compiled environment. --- virtualizarr/tests/test_readers/conftest.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py index a4fafed..8904dd3 100644 --- a/virtualizarr/tests/test_readers/conftest.py +++ b/virtualizarr/tests/test_readers/conftest.py @@ -4,6 +4,7 @@ import pytest import xarray as xr from xarray.tests.test_dataset import create_test_data +from xarray.util.print_versions import netcdf_and_hdf5_versions @pytest.fixture @@ -181,6 +182,7 @@ def filter_encoded_xarray_netcdf4_file(tmpdir, request): ds["var2"].encoding.update(encoding_config) filepath = f"{tmpdir}/{request.param}_xarray.nc" + print(netcdf_and_hdf5_versions()) ds.to_netcdf(filepath, engine="netcdf4") return filepath From 9a98e57e55fd020bcf3d682604eee2f03775ff26 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Mon, 27 May 2024 17:07:51 -0600 Subject: [PATCH 27/68] Skip netcdf4 style compression tests when libhdf5 < 1.14. --- virtualizarr/tests/test_readers/conftest.py | 15 ++++++++++++--- .../test_readers/test_hdf_integration.py | 19 ++++++++++++++++--- 2 files changed, 28 insertions(+), 6 deletions(-) diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py index 8904dd3..0ddb2a0 100644 --- a/virtualizarr/tests/test_readers/conftest.py +++ b/virtualizarr/tests/test_readers/conftest.py @@ -3,6 +3,7 @@ import numpy as np import pytest import xarray as xr +from packaging.version import Version from xarray.tests.test_dataset import create_test_data from xarray.util.print_versions import netcdf_and_hdf5_versions @@ -168,8 +169,17 @@ def filter_encoded_xarray_h5netcdf_file(tmpdir, request): return filepath +@pytest.fixture() +def skip_test_for_libhdf5_version(): + versions = netcdf_and_hdf5_versions() + libhdf5_version = Version(versions[0][1]) + return libhdf5_version < Version("1.14") + + @pytest.fixture(params=["blosc_zlib"]) -def filter_encoded_xarray_netcdf4_file(tmpdir, request): +def filter_encoded_xarray_netcdf4_file(tmpdir, request, skip_test_for_libhdf5_version): + if skip_test_for_libhdf5_version: + pytest.skip("Requires libhdf5 >= 1.14") ds = create_test_data(dim_sizes=(20, 80, 10)) if "blosc" in request.param: encoding_config = { @@ -182,9 +192,8 @@ def filter_encoded_xarray_netcdf4_file(tmpdir, request): ds["var2"].encoding.update(encoding_config) filepath = f"{tmpdir}/{request.param}_xarray.nc" - print(netcdf_and_hdf5_versions()) ds.to_netcdf(filepath, engine="netcdf4") - return filepath + return {"filepath": filepath, "compressor": request.param} @pytest.fixture diff --git a/virtualizarr/tests/test_readers/test_hdf_integration.py b/virtualizarr/tests/test_readers/test_hdf_integration.py index d6ecf2f..f51ebd4 100644 --- a/virtualizarr/tests/test_readers/test_hdf_integration.py +++ b/virtualizarr/tests/test_readers/test_hdf_integration.py @@ -24,13 +24,26 @@ def test_filters_h5netcdf_roundtrip( def test_filters_netcdf4_roundtrip( self, tmpdir, filter_encoded_xarray_netcdf4_file ): + filepath = filter_encoded_xarray_netcdf4_file["filepath"] + compressor = filter_encoded_xarray_netcdf4_file["compressor"] virtual_ds = virtualizarr.open_virtual_dataset( - filter_encoded_xarray_netcdf4_file, filetype=FileType("netcdf4") + filepath, filetype=FileType("netcdf4") ) kerchunk_file = f"{tmpdir}/kerchunk.json" virtual_ds.virtualize.to_kerchunk(kerchunk_file, format="json") fs = fsspec.filesystem("reference", fo=kerchunk_file) m = fs.get_mapper("") - ds = xr.open_dataset(m, engine="kerchunk") - print(ds["var2"].encoding) + + expected_encoding = ds["var2"].encoding.copy() + compression = expected_encoding.pop("compression") + blosc_shuffle = expected_encoding.pop("blosc_shuffle") + if compression is not None: + if "blosc" in compression and blosc_shuffle: + expected_encoding["blosc"] = { + "compressor": compressor, + "shuffle": blosc_shuffle, + } + expected_encoding["shuffle"] = False + actual_encoding = ds["var2"].encoding + assert expected_encoding.items() <= actual_encoding.items() From 7590b87e375f0dea6683aceba4322ca5a0c8a95d Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Tue, 11 Jun 2024 13:57:51 -0600 Subject: [PATCH 28/68] Include imagecodecs.numcodecs to support HDF5 lzf filters. --- pyproject.toml | 1 + virtualizarr/readers/hdf_filters.py | 2 +- virtualizarr/tests/test_readers/test_hdf_filters.py | 8 ++++---- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index f0563f0..773cccc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -46,6 +46,7 @@ test = [ "fsspec", "s3fs", "fastparquet", + "imagecodecs-numcodecs", ] diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py index 169eab9..08a3bba 100644 --- a/virtualizarr/readers/hdf_filters.py +++ b/virtualizarr/readers/hdf_filters.py @@ -9,7 +9,7 @@ from pydantic import BaseModel, validator from xarray.coding.variables import _choose_float_dtype -_non_standard_filters = {"gzip": "zlib"} +_non_standard_filters = {"gzip": "zlib", "lzf": "imagecodecs_lzf"} class BloscProperties(BaseModel): diff --git a/virtualizarr/tests/test_readers/test_hdf_filters.py b/virtualizarr/tests/test_readers/test_hdf_filters.py index dca9f40..b5b0404 100644 --- a/virtualizarr/tests/test_readers/test_hdf_filters.py +++ b/virtualizarr/tests/test_readers/test_hdf_filters.py @@ -1,7 +1,7 @@ import h5py +import imagecodecs import numcodecs import numpy as np -import pytest from virtualizarr.readers.hdf_filters import ( _filter_to_codec, @@ -15,9 +15,9 @@ def test_gzip_uses_zlib_numcodec(self): codec = _filter_to_codec("gzip", 1) assert isinstance(codec, numcodecs.zlib.Zlib) - def test_lzf_not_available(self): - with pytest.raises(ValueError, match="codec not available"): - _filter_to_codec("lzf") + def test_lzf(self): + codec = _filter_to_codec("lzf") + assert isinstance(codec, imagecodecs.numcodecs.Lzf) def test_blosc(self): codec = _filter_to_codec("32001", (2, 2, 8, 800, 9, 2, 1)) From 14bd7098545bd7f443b791f24aafa11bcc00fdbb Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Tue, 11 Jun 2024 16:24:30 -0600 Subject: [PATCH 29/68] Remove test that verifies call to read_kerchunk_references_from_file. --- virtualizarr/tests/test_xarray.py | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/virtualizarr/tests/test_xarray.py b/virtualizarr/tests/test_xarray.py index 695759b..d145550 100644 --- a/virtualizarr/tests/test_xarray.py +++ b/virtualizarr/tests/test_xarray.py @@ -1,5 +1,4 @@ from collections.abc import Mapping -from unittest.mock import patch import numpy as np import pytest @@ -304,16 +303,3 @@ def test_loadable_variables(self, netcdf4_file): for name in full_ds.variables: if name in vars_to_load: xrt.assert_identical(vds.variables[name], full_ds.variables[name]) - - @patch("virtualizarr.kerchunk.read_kerchunk_references_from_file") - def test_open_virtual_dataset_passes_expected_args( - self, mock_read_kerchunk, netcdf4_file - ): - reader_options = {"option1": "value1", "option2": "value2"} - open_virtual_dataset(netcdf4_file, indexes={}, reader_options=reader_options) - args = { - "filepath": netcdf4_file, - "filetype": None, - "reader_options": reader_options, - } - mock_read_kerchunk.assert_called_once_with(**args) From acdf0d76557a5abdf2657f1278f57c732a4dd347 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Wed, 12 Jun 2024 15:05:34 -0600 Subject: [PATCH 30/68] Add additional codec support structures for imagecodecs and numcodecs. --- virtualizarr/readers/hdf_filters.py | 23 +++++++++++++++++---- virtualizarr/tests/test_readers/conftest.py | 9 +++++++- 2 files changed, 27 insertions(+), 5 deletions(-) diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py index 08a3bba..667ff09 100644 --- a/virtualizarr/readers/hdf_filters.py +++ b/virtualizarr/readers/hdf_filters.py @@ -9,7 +9,12 @@ from pydantic import BaseModel, validator from xarray.coding.variables import _choose_float_dtype -_non_standard_filters = {"gzip": "zlib", "lzf": "imagecodecs_lzf"} +_non_standard_filters = { + "gzip": "zlib", + "lzf": "imagecodecs_lzf", +} + +_hdf5plugin_imagecodecs = {"lz4": "imagecodecs_lz4h5", "bzip2": "imagecodecs_bz2"} class BloscProperties(BaseModel): @@ -27,6 +32,10 @@ def get_cname_from_code(cls, v): return blosc_compressor_codes[v] +class ZstdProperties(BaseModel): + level: int + + class CFCodec(TypedDict): target_dtype: np.dtype codec: Codec @@ -41,18 +50,20 @@ def _filter_to_codec( id_int = int(filter_id) except ValueError: id_str = filter_id - + conf = {} if id_str: if id_str in _non_standard_filters.keys(): id = _non_standard_filters[id_str] else: id = id_str - conf = {"id": id} + conf["id"] = id # type: ignore[assignment] if id == "zlib": conf["level"] = filter_properties # type: ignore[assignment] if id_int: filter = hdf5plugin.get_filters(id_int)[0] id = filter.filter_name + if id in _hdf5plugin_imagecodecs.keys(): + id = _hdf5plugin_imagecodecs[id] if id == "blosc" and isinstance(filter_properties, tuple): blosc_props = BloscProperties( **{ @@ -63,7 +74,11 @@ def _filter_to_codec( } ) conf = blosc_props.model_dump() # type: ignore[assignment] - conf["id"] = id + if id == "zstd" and isinstance(filter_properties, tuple): + zstd_props = ZstdProperties(level=filter_properties[0]) + conf = zstd_props.model_dump() # type: ignore[assignment] + + conf["id"] = id codec = registry.get_codec(conf) return codec diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py index 0ddb2a0..3e6f9c3 100644 --- a/virtualizarr/tests/test_readers/conftest.py +++ b/virtualizarr/tests/test_readers/conftest.py @@ -137,7 +137,7 @@ def np_uncompressed(): return np.arange(100) -@pytest.fixture(params=["gzip", "blosc_lz4"]) +@pytest.fixture(params=["gzip", "blosc_lz4", "lz4", "bzip2", "zstd"]) def filter_encoded_netcdf4_file(tmpdir, np_uncompressed, request): filepath = f"{tmpdir}/{request.param}.nc" f = h5py.File(filepath, "w") @@ -151,6 +151,13 @@ def filter_encoded_netcdf4_file(tmpdir, np_uncompressed, request): data=np_uncompressed, **hdf5plugin.Blosc(cname="lz4", clevel=9, shuffle=hdf5plugin.Blosc.SHUFFLE), ) + if request.param == "lz4": + f.create_dataset(name="data", data=np_uncompressed, **hdf5plugin.LZ4(nbytes=0)) + if request.param == "bzip2": + f.create_dataset(name="data", data=np_uncompressed, **hdf5plugin.BZip2()) + if request.param == "zstd": + f.create_dataset(name="data", data=np_uncompressed, **hdf5plugin.Zstd(clevel=2)) + return filepath From 4ba323a6c862deb8908706373b6df429fd78f986 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Wed, 12 Jun 2024 16:17:04 -0600 Subject: [PATCH 31/68] Add codec config test for Zstd. --- virtualizarr/tests/test_readers/test_hdf_filters.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/virtualizarr/tests/test_readers/test_hdf_filters.py b/virtualizarr/tests/test_readers/test_hdf_filters.py index b5b0404..4d23a75 100644 --- a/virtualizarr/tests/test_readers/test_hdf_filters.py +++ b/virtualizarr/tests/test_readers/test_hdf_filters.py @@ -31,6 +31,12 @@ def test_blosc(self): } assert codec.get_config() == expected_config + def test_zstd(self): + codec = _filter_to_codec("32015", (5,)) + assert isinstance(codec, numcodecs.zstd.Zstd) + expected_config = {"id": "zstd", "level": 5} + assert codec.get_config() == expected_config + class TestCodecsFromDataSet: def test_numcodec_decoding(self, np_uncompressed, filter_encoded_netcdf4_file): From e14e53b0fc2bb7ed1ca3d5b73fc43594aff77426 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Thu, 20 Jun 2024 18:03:26 -0600 Subject: [PATCH 32/68] Include initial cf decoding tests. --- virtualizarr/readers/hdf_filters.py | 3 +- virtualizarr/tests/test_readers/conftest.py | 34 ++++++++++++++++--- .../tests/test_readers/test_hdf_filters.py | 28 +++++++++++++++ 3 files changed, 60 insertions(+), 5 deletions(-) diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py index 667ff09..f4e2dcf 100644 --- a/virtualizarr/readers/hdf_filters.py +++ b/virtualizarr/readers/hdf_filters.py @@ -88,7 +88,8 @@ def cfcodec_from_dataset(dataset: h5py.Dataset) -> Codec | None: attributes = {attr: dataset.attrs[attr] for attr in dataset.attrs} mapping = {} if "scale_factor" in attributes: - mapping["scale_factor"] = 1 / attributes["scale_factor"][0] + mapping["scale_factor"] = 1 / attributes["scale_factor"] + # mapping["scale_factor"] =attributes["scale_factor"][0] else: mapping["scale_factor"] = 1 if "add_offset" in attributes: diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py index 3e6f9c3..e1a53c5 100644 --- a/virtualizarr/tests/test_readers/conftest.py +++ b/virtualizarr/tests/test_readers/conftest.py @@ -204,10 +204,36 @@ def filter_encoded_xarray_netcdf4_file(tmpdir, request, skip_test_for_libhdf5_ve @pytest.fixture -def add_offset_netcdf4_file(tmpdir): +def np_uncompressed_int16(): + return np.arange(100, dtype=np.int16) + + +@pytest.fixture +def offset(): + return np.float32(5.0) + + +@pytest.fixture +def add_offset_netcdf4_file(tmpdir, np_uncompressed_int16, offset): filepath = f"{tmpdir}/offset.nc" f = h5py.File(filepath, "w") - data = np.random.random((10, 10)) - f.create_dataset(name="data", data=data, chunks=None) - f["data"].attrs.create(name="add_offset", data=5) + data = np_uncompressed_int16 - offset + f.create_dataset(name="data", data=data, chunks=True) + f["data"].attrs.create(name="add_offset", data=offset) + return filepath + + +@pytest.fixture +def scale_factor(): + return 0.01 + + +@pytest.fixture +def scale_add_offset_netcdf4_file(tmpdir, np_uncompressed_int16, offset, scale_factor): + filepath = f"{tmpdir}/scale_offset.nc" + f = h5py.File(filepath, "w") + data = (np_uncompressed_int16 - offset) / scale_factor + f.create_dataset(name="data", data=data, chunks=True) + f["data"].attrs.create(name="add_offset", data=offset) + f["data"].attrs.create(name="scale_factor", data=np.array([scale_factor])) return filepath diff --git a/virtualizarr/tests/test_readers/test_hdf_filters.py b/virtualizarr/tests/test_readers/test_hdf_filters.py index 4d23a75..960bcf2 100644 --- a/virtualizarr/tests/test_readers/test_hdf_filters.py +++ b/virtualizarr/tests/test_readers/test_hdf_filters.py @@ -76,3 +76,31 @@ def test_cf_add_offset(self, add_offset_netcdf4_file): assert cf_codec["codec"].scale == 1 assert cf_codec["codec"].offset == 5 assert cf_codec["codec"].dtype == " Date: Thu, 20 Jun 2024 19:49:54 -0600 Subject: [PATCH 33/68] Revert typo for scale_factor retrieval. --- virtualizarr/readers/hdf_filters.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py index f4e2dcf..667ff09 100644 --- a/virtualizarr/readers/hdf_filters.py +++ b/virtualizarr/readers/hdf_filters.py @@ -88,8 +88,7 @@ def cfcodec_from_dataset(dataset: h5py.Dataset) -> Codec | None: attributes = {attr: dataset.attrs[attr] for attr in dataset.attrs} mapping = {} if "scale_factor" in attributes: - mapping["scale_factor"] = 1 / attributes["scale_factor"] - # mapping["scale_factor"] =attributes["scale_factor"][0] + mapping["scale_factor"] = 1 / attributes["scale_factor"][0] else: mapping["scale_factor"] = 1 if "add_offset" in attributes: From 01a3980f541a45c8a33a907dd6d3bed722eacae9 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Thu, 20 Jun 2024 20:12:44 -0600 Subject: [PATCH 34/68] Update reader to use new numpy manifest representation. --- virtualizarr/readers/hdf.py | 29 ++++++++++----------- virtualizarr/tests/test_readers/test_hdf.py | 4 +-- 2 files changed, 16 insertions(+), 17 deletions(-) diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py index c251866..b96bdff 100644 --- a/virtualizarr/readers/hdf.py +++ b/virtualizarr/readers/hdf.py @@ -39,34 +39,33 @@ def _dataset_chunk_manifest(path: str, dataset: h5py.Dataset) -> ChunkManifest: path=path, offset=dsid.get_offset(), length=dsid.get_storage_size() ) chunk_key = ChunkKey(key) - chunk_entries = {chunk_key: chunk_entry} + chunk_entries = {chunk_key: chunk_entry.dict()} chunk_manifest = ChunkManifest(entries=chunk_entries) return chunk_manifest else: num_chunks = dsid.get_num_chunks() if num_chunks == 0: raise ValueError("The dataset is chunked but contains no chunks") + paths = np.full(num_chunks, path, dtype=np.dtypes.StringDType) # type: ignore + offsets = np.empty((num_chunks), dtype=np.int32) + lengths = np.empty((num_chunks), dtype=np.int32) - chunk_entries = dict() - - def get_key(blob): - key_list = [a // b for a, b in zip(blob.chunk_offset, dataset.chunks)] - key = ".".join(map(str, key_list)) - return key - - def store_chunk_entry(blob): - chunk_entries[get_key(blob)] = ChunkEntry( - path=path, offset=blob.byte_offset, length=blob.size - ) + def add_chunk_info(blob, chunk_index): + offsets[chunk_index] = blob.byte_offset + lengths[chunk_index] = blob.size + chunk_index += 1 has_chunk_iter = callable(getattr(dsid, "chunk_iter", None)) if has_chunk_iter: - dsid.chunk_iter(store_chunk_entry) + chunk_index = 0 + dsid.chunk_iter(add_chunk_info, chunk_index) else: for index in range(num_chunks): - store_chunk_entry(dsid.get_chunk_info(index)) + add_chunk_info(dsid.get_chunk_info(index), index) - chunk_manifest = ChunkManifest(entries=chunk_entries) + chunk_manifest = ChunkManifest.from_arrays( + paths=paths, offsets=offsets, lengths=lengths + ) return chunk_manifest diff --git a/virtualizarr/tests/test_readers/test_hdf.py b/virtualizarr/tests/test_readers/test_hdf.py index a67352e..8c5a40a 100644 --- a/virtualizarr/tests/test_readers/test_hdf.py +++ b/virtualizarr/tests/test_readers/test_hdf.py @@ -27,13 +27,13 @@ def test_no_chunking(self, no_chunks_netcdf4_file): f = h5py.File(no_chunks_netcdf4_file) ds = f["data"] manifest = _dataset_chunk_manifest(path=no_chunks_netcdf4_file, dataset=ds) - assert len(manifest.entries) == 1 + assert len(manifest) == 1 def test_chunked(self, chunked_netcdf4_file): f = h5py.File(chunked_netcdf4_file) ds = f["data"] manifest = _dataset_chunk_manifest(path=chunked_netcdf4_file, dataset=ds) - assert len(manifest.entries) == 4 + assert len(manifest) == 4 class TestDatasetDims: From c37d9e526239ad5207f76d400924fffaabb578ec Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Fri, 21 Jun 2024 19:05:01 -0600 Subject: [PATCH 35/68] Temporarily skip test until blosc netcdf4 issue is solved. --- virtualizarr/tests/test_readers/test_hdf_integration.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/virtualizarr/tests/test_readers/test_hdf_integration.py b/virtualizarr/tests/test_readers/test_hdf_integration.py index f51ebd4..dca34db 100644 --- a/virtualizarr/tests/test_readers/test_hdf_integration.py +++ b/virtualizarr/tests/test_readers/test_hdf_integration.py @@ -1,5 +1,6 @@ import fsspec import numpy +import pytest import xarray as xr import virtualizarr @@ -21,6 +22,9 @@ def test_filters_h5netcdf_roundtrip( ds = xr.open_dataset(m, engine="kerchunk") assert isinstance(ds.air.values[0][0][0], numpy.float64) + @pytest.mark.skip( + reason="Issue with xr 'dim1' serialization and blosc availability" + ) def test_filters_netcdf4_roundtrip( self, tmpdir, filter_encoded_xarray_netcdf4_file ): From 17b30d4149603c952e0b24892b2d104ed7499a52 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Fri, 21 Jun 2024 19:24:07 -0600 Subject: [PATCH 36/68] Fix Pydantic 2 migration warnings. --- virtualizarr/readers/hdf_filters.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py index 667ff09..cc8e810 100644 --- a/virtualizarr/readers/hdf_filters.py +++ b/virtualizarr/readers/hdf_filters.py @@ -6,7 +6,7 @@ import numpy as np from numcodecs.abc import Codec from numcodecs.fixedscaleoffset import FixedScaleOffset -from pydantic import BaseModel, validator +from pydantic import BaseModel, field_validator from xarray.coding.variables import _choose_float_dtype _non_standard_filters = { @@ -23,7 +23,7 @@ class BloscProperties(BaseModel): shuffle: int cname: str - @validator("cname", pre=True) + @field_validator("cname", mode="before") def get_cname_from_code(cls, v): blosc_compressor_codes = { value: key @@ -69,7 +69,7 @@ def _filter_to_codec( **{ k: v for k, v in zip( - BloscProperties.__fields__.keys(), filter_properties[-4:] + BloscProperties.model_fields.keys(), filter_properties[-4:] ) } ) From f6b596a6563aff90a70acb0b8190898399368f32 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Fri, 21 Jun 2024 19:30:55 -0600 Subject: [PATCH 37/68] Include hdf5plugin and imagecodecs-numcodecs in mamba test environment. --- ci/environment.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ci/environment.yml b/ci/environment.yml index 0385ea5..e909bee 100644 --- a/ci/environment.yml +++ b/ci/environment.yml @@ -14,6 +14,7 @@ dependencies: - ujson - packaging - universal_pathlib + - hdf5plugin # Testing - codecov - pre-commit @@ -26,3 +27,4 @@ dependencies: - fsspec - s3fs - fastparquet + - imagecodecs-numcodecs From eb6e24d10385fa68a9a8909d0c6cfb9a97a34461 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Fri, 21 Jun 2024 19:35:24 -0600 Subject: [PATCH 38/68] Mamba attempt with imagecodecs rather than imagecodecs-numcodecs. --- ci/environment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/environment.yml b/ci/environment.yml index e909bee..20784a6 100644 --- a/ci/environment.yml +++ b/ci/environment.yml @@ -27,4 +27,4 @@ dependencies: - fsspec - s3fs - fastparquet - - imagecodecs-numcodecs + - imagecodecs From c85bd168025d4c96c1112aff22cc82fc0e07cbfd Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Fri, 21 Jun 2024 19:41:14 -0600 Subject: [PATCH 39/68] Mamba attempt with latest imagecodecs release. --- ci/environment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/environment.yml b/ci/environment.yml index 20784a6..fb967bc 100644 --- a/ci/environment.yml +++ b/ci/environment.yml @@ -27,4 +27,4 @@ dependencies: - fsspec - s3fs - fastparquet - - imagecodecs + - imagecodecs>=2024.6.1 From ca435da5007263136bf489ffe647cb690145cbd7 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Tue, 25 Jun 2024 19:34:35 -0600 Subject: [PATCH 40/68] Use correct iter_chunks callback function signtature. --- virtualizarr/readers/hdf.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py index b96bdff..d082b71 100644 --- a/virtualizarr/readers/hdf.py +++ b/virtualizarr/readers/hdf.py @@ -53,12 +53,22 @@ def _dataset_chunk_manifest(path: str, dataset: h5py.Dataset) -> ChunkManifest: def add_chunk_info(blob, chunk_index): offsets[chunk_index] = blob.byte_offset lengths[chunk_index] = blob.size - chunk_index += 1 has_chunk_iter = callable(getattr(dsid, "chunk_iter", None)) if has_chunk_iter: - chunk_index = 0 - dsid.chunk_iter(add_chunk_info, chunk_index) + + def create_callback(initial=0): + value = initial + + def callback(blob): + nonlocal value + add_chunk_info(blob, chunk_index=value) + value += 1 + + return callback + + callback = create_callback() + dsid.chunk_iter(callback) else: for index in range(num_chunks): add_chunk_info(dsid.get_chunk_info(index), index) From 3017951549fe4b3d9d7099b1357aa76136d23f16 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Tue, 25 Jun 2024 19:35:40 -0600 Subject: [PATCH 41/68] Include pip based imagecodecs-numcodecs until conda-forge availability. --- ci/environment.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ci/environment.yml b/ci/environment.yml index fb967bc..e2f5a86 100644 --- a/ci/environment.yml +++ b/ci/environment.yml @@ -28,3 +28,5 @@ dependencies: - s3fs - fastparquet - imagecodecs>=2024.6.1 + - pip: + - imagecodecs-numcodecs From 32ba13537070fbee7e861d8618f6a77eacbe0da8 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Thu, 27 Jun 2024 15:43:10 -0600 Subject: [PATCH 42/68] Handle non-coordinate dims which are serialized to hdf as empty dataset. --- virtualizarr/readers/hdf.py | 65 ++++++++++++--------- virtualizarr/tests/test_integration.py | 18 +++++- virtualizarr/tests/test_readers/test_hdf.py | 1 + virtualizarr/xarray.py | 2 +- 4 files changed, 53 insertions(+), 33 deletions(-) diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py index d082b71..cbbe824 100644 --- a/virtualizarr/readers/hdf.py +++ b/virtualizarr/readers/hdf.py @@ -11,7 +11,9 @@ from virtualizarr.zarr import ZArray -def _dataset_chunk_manifest(path: str, dataset: h5py.Dataset) -> ChunkManifest: +def _dataset_chunk_manifest( + path: str, dataset: h5py.Dataset +) -> Optional[ChunkManifest]: """ Generate ChunkManifest for HDF5 dataset. @@ -31,7 +33,7 @@ def _dataset_chunk_manifest(path: str, dataset: h5py.Dataset) -> ChunkManifest: if dataset.chunks is None: if dsid.get_offset() is None: - raise ValueError("Dataset has no space allocated in the file") + return None else: key_list = [0] * (len(dataset.shape) or 1) key = ".".join(map(str, key_list)) @@ -167,35 +169,39 @@ def _extract_attrs(h5obj: Union[h5py.Dataset, h5py.Group]): return attrs -def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> xr.Variable: +def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> Optional[xr.Variable]: # This chunk determination logic mirrors zarr-python's create # https://github.com/zarr-developers/zarr-python/blob/main/zarr/creation.py#L62-L66 - chunks = dataset.chunks if dataset.chunks else dataset.shape - codecs = codecs_from_dataset(dataset) - cfcodec = cfcodec_from_dataset(dataset) - attrs = _extract_attrs(dataset) - if cfcodec: - codecs.append(cfcodec["codec"]) - dtype = cfcodec["target_dtype"] - attrs.pop("scale_factor", None) - attrs.pop("add_offset", None) - else: - dtype = dataset.dtype - filters = [codec.get_config() for codec in codecs] - zarray = ZArray( - chunks=chunks, - compressor=None, - dtype=dtype, - fill_value=dataset.fillvalue, - filters=filters, - order="C", - shape=dataset.shape, - zarr_format=2, - ) + manifest = _dataset_chunk_manifest(path, dataset) - marray = ManifestArray(zarray=zarray, chunkmanifest=manifest) - dims = _dataset_dims(dataset) - variable = xr.Variable(data=marray, dims=dims, attrs=attrs) + if manifest: + chunks = dataset.chunks if dataset.chunks else dataset.shape + codecs = codecs_from_dataset(dataset) + cfcodec = cfcodec_from_dataset(dataset) + attrs = _extract_attrs(dataset) + if cfcodec: + codecs.append(cfcodec["codec"]) + dtype = cfcodec["target_dtype"] + attrs.pop("scale_factor", None) + attrs.pop("add_offset", None) + else: + dtype = dataset.dtype + filters = [codec.get_config() for codec in codecs] + zarray = ZArray( + chunks=chunks, + compressor=None, + dtype=dtype, + fill_value=dataset.fillvalue, + filters=filters, + order="C", + shape=dataset.shape, + zarr_format=2, + ) + marray = ManifestArray(zarray=zarray, chunkmanifest=manifest) + dims = _dataset_dims(dataset) + variable = xr.Variable(data=marray, dims=dims, attrs=attrs) + else: + variable = None return variable @@ -217,7 +223,8 @@ def virtual_vars_from_hdf( if key not in drop_variables: if isinstance(f[key], h5py.Dataset): variable = _dataset_to_variable(path, f[key]) - variables[key] = variable + if variable is not None: + variables[key] = variable else: raise NotImplementedError("Nested groups are not yet supported") diff --git a/virtualizarr/tests/test_integration.py b/virtualizarr/tests/test_integration.py index 451862c..6a1f91e 100644 --- a/virtualizarr/tests/test_integration.py +++ b/virtualizarr/tests/test_integration.py @@ -71,9 +71,13 @@ def test_kerchunk_roundtrip_no_concat(self, tmpdir, format): f"{tmpdir}/refs.{format}", engine="kerchunk", decode_times=False ) - # assert identical to original dataset + # assert all_close to original dataset xrt.assert_allclose(roundtrip, ds) + # assert coordinate attributes are maintained + for coord in ds.coords: + assert ds.coords[coord].attrs == roundtrip.coords[coord].attrs + def test_kerchunk_roundtrip_concat(self, tmpdir, format): # set up example xarray dataset ds = xr.tutorial.open_dataset("air_temperature", decode_times=False) @@ -107,8 +111,12 @@ def test_kerchunk_roundtrip_concat(self, tmpdir, format): f"{tmpdir}/refs.{format}", engine="kerchunk", decode_times=False ) - # assert identical to original dataset - xrt.assert_identical(roundtrip, ds) + # assert all_close to original dataset + xrt.assert_allclose(roundtrip, ds) + + # assert coordinate attributes are maintained + for coord in ds.coords: + assert ds.coords[coord].attrs == roundtrip.coords[coord].attrs def test_non_dimension_coordinates(self, tmpdir, format): # regression test for GH issue #105 @@ -142,6 +150,10 @@ def test_non_dimension_coordinates(self, tmpdir, format): # assert equal to original dataset xrt.assert_allclose(roundtrip, ds) + # assert coordinate attributes are maintained + for coord in ds.coords: + assert ds.coords[coord].attrs == roundtrip.coords[coord].attrs + def test_open_scalar_variable(tmpdir): # regression test for GH issue #100 diff --git a/virtualizarr/tests/test_readers/test_hdf.py b/virtualizarr/tests/test_readers/test_hdf.py index 8c5a40a..c744cd6 100644 --- a/virtualizarr/tests/test_readers/test_hdf.py +++ b/virtualizarr/tests/test_readers/test_hdf.py @@ -17,6 +17,7 @@ def test_empty_chunks(self, empty_chunks_netcdf4_file): with pytest.raises(ValueError, match="chunked but contains no chunks"): _dataset_chunk_manifest(path=empty_chunks_netcdf4_file, dataset=ds) + @pytest.mark.skip("Need to differentiate non coordinate dimensions from empty") def test_empty_dataset(self, empty_dataset_netcdf4_file): f = h5py.File(empty_dataset_netcdf4_file) ds = f["data"] diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py index 39bd067..a8a2369 100644 --- a/virtualizarr/xarray.py +++ b/virtualizarr/xarray.py @@ -121,7 +121,7 @@ def open_virtual_dataset( ds_attrs = attrs_from_root_group( path=filepath, reader_options=reader_options ) - coord_names = None + coord_names = ds_attrs.pop("coordinates", []) else: # this is the only place we actually always need to use kerchunk directly # TODO avoid even reading byte ranges for variables that will be dropped later anyway? From 64f446c8d452291548bba2c73a104bf068dc2d7e Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Thu, 27 Jun 2024 16:23:43 -0600 Subject: [PATCH 43/68] Use reader_options for filetype check and update failing kerchunk call. --- virtualizarr/tests/test_xarray.py | 18 +++++++++++++----- virtualizarr/xarray.py | 4 +++- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/virtualizarr/tests/test_xarray.py b/virtualizarr/tests/test_xarray.py index e55583b..282d4ad 100644 --- a/virtualizarr/tests/test_xarray.py +++ b/virtualizarr/tests/test_xarray.py @@ -8,6 +8,7 @@ from xarray.core.indexes import Index from virtualizarr import open_virtual_dataset +from virtualizarr.kerchunk import FileType from virtualizarr.manifests import ChunkManifest, ManifestArray from virtualizarr.tests import network, requires_s3fs from virtualizarr.zarr import ZArray @@ -325,18 +326,25 @@ def test_loadable_variables(self, netcdf4_file): if name in vars_to_load: xrt.assert_identical(vds.variables[name], full_ds.variables[name]) - @patch("virtualizarr.kerchunk.read_kerchunk_references_from_file") + @patch("virtualizarr.xarray._automatically_determine_filetype") + @patch("virtualizarr.xarray.virtual_vars_from_hdf") def test_open_virtual_dataset_passes_expected_args( - self, mock_read_kerchunk, netcdf4_file + self, mock_reader, mock_determine_filetype, netcdf4_file ): reader_options = {"option1": "value1", "option2": "value2"} + mock_determine_filetype.return_value = FileType.netcdf4 open_virtual_dataset(netcdf4_file, indexes={}, reader_options=reader_options) - args = { + reader_args = { + "path": netcdf4_file, + "drop_variables": [], + "reader_options": reader_options, + } + mock_reader.assert_called_once_with(**reader_args) + filetype_args = { "filepath": netcdf4_file, - "filetype": None, "reader_options": reader_options, } - mock_read_kerchunk.assert_called_once_with(**args) + mock_determine_filetype.assert_called_once_with(**filetype_args) class TestRenamePaths: diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py index a8a2369..86a59c8 100644 --- a/virtualizarr/xarray.py +++ b/virtualizarr/xarray.py @@ -109,7 +109,9 @@ def open_virtual_dataset( ) else: if filetype is None: - filetype = _automatically_determine_filetype(filepath=filepath) + filetype = _automatically_determine_filetype( + filepath=filepath, reader_options=reader_options + ) filetype = FileType(filetype) if filetype.name.lower() == "netcdf4": From 9797346463e443d6f48b567569156f4ca01490cf Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Sat, 29 Jun 2024 18:20:06 -0600 Subject: [PATCH 44/68] Fix chunkmanifest shaping for chunked datasets. --- virtualizarr/readers/hdf.py | 36 +++++++++------------ virtualizarr/tests/test_readers/test_hdf.py | 10 ++++-- 2 files changed, 24 insertions(+), 22 deletions(-) diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py index cbbe824..d683f69 100644 --- a/virtualizarr/readers/hdf.py +++ b/virtualizarr/readers/hdf.py @@ -1,3 +1,4 @@ +import math from typing import List, Mapping, Optional, Union import h5py @@ -48,32 +49,27 @@ def _dataset_chunk_manifest( num_chunks = dsid.get_num_chunks() if num_chunks == 0: raise ValueError("The dataset is chunked but contains no chunks") - paths = np.full(num_chunks, path, dtype=np.dtypes.StringDType) # type: ignore - offsets = np.empty((num_chunks), dtype=np.int32) - lengths = np.empty((num_chunks), dtype=np.int32) - def add_chunk_info(blob, chunk_index): - offsets[chunk_index] = blob.byte_offset - lengths[chunk_index] = blob.size + shape = tuple(math.ceil(a / b) for a, b in zip(dataset.shape, dataset.chunks)) + paths = np.empty(shape, dtype=np.dtypes.StringDType) # type: ignore + offsets = np.empty(shape, dtype=np.int32) + lengths = np.empty(shape, dtype=np.int32) - has_chunk_iter = callable(getattr(dsid, "chunk_iter", None)) - if has_chunk_iter: - - def create_callback(initial=0): - value = initial + def get_key(blob): + return tuple([a // b for a, b in zip(blob.chunk_offset, dataset.chunks)]) - def callback(blob): - nonlocal value - add_chunk_info(blob, chunk_index=value) - value += 1 + def add_chunk_info(blob): + key = get_key(blob) + paths[key] = path + offsets[key] = blob.byte_offset + lengths[key] = blob.size - return callback - - callback = create_callback() - dsid.chunk_iter(callback) + has_chunk_iter = callable(getattr(dsid, "chunk_iter", None)) + if has_chunk_iter: + dsid.chunk_iter(add_chunk_info) else: for index in range(num_chunks): - add_chunk_info(dsid.get_chunk_info(index), index) + add_chunk_info(dsid.get_chunk_info(index)) chunk_manifest = ChunkManifest.from_arrays( paths=paths, offsets=offsets, lengths=lengths diff --git a/virtualizarr/tests/test_readers/test_hdf.py b/virtualizarr/tests/test_readers/test_hdf.py index c744cd6..25caab9 100644 --- a/virtualizarr/tests/test_readers/test_hdf.py +++ b/virtualizarr/tests/test_readers/test_hdf.py @@ -28,13 +28,19 @@ def test_no_chunking(self, no_chunks_netcdf4_file): f = h5py.File(no_chunks_netcdf4_file) ds = f["data"] manifest = _dataset_chunk_manifest(path=no_chunks_netcdf4_file, dataset=ds) - assert len(manifest) == 1 + assert manifest.shape_chunk_grid == (1, 1) def test_chunked(self, chunked_netcdf4_file): f = h5py.File(chunked_netcdf4_file) ds = f["data"] manifest = _dataset_chunk_manifest(path=chunked_netcdf4_file, dataset=ds) - assert len(manifest) == 4 + assert manifest.shape_chunk_grid == (2, 2) + + def test_chunked_roundtrip(self, chunked_roundtrip): + f = h5py.File(chunked_roundtrip) + ds = f["var2"] + manifest = _dataset_chunk_manifest(path=chunked_roundtrip, dataset=ds) + assert manifest.shape_chunk_grid == (2, 8) class TestDatasetDims: From c833e191abb773e409aec6eeb47ab6438d0ee0a9 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Sat, 29 Jun 2024 18:22:05 -0600 Subject: [PATCH 45/68] Handle scale_factor attribute serialization for compressed files. --- virtualizarr/readers/hdf_filters.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py index cc8e810..1a3c222 100644 --- a/virtualizarr/readers/hdf_filters.py +++ b/virtualizarr/readers/hdf_filters.py @@ -88,7 +88,11 @@ def cfcodec_from_dataset(dataset: h5py.Dataset) -> Codec | None: attributes = {attr: dataset.attrs[attr] for attr in dataset.attrs} mapping = {} if "scale_factor" in attributes: - mapping["scale_factor"] = 1 / attributes["scale_factor"][0] + try: + scale_factor = attributes["scale_factor"][0] + except IndexError: + scale_factor = attributes["scale_factor"] + mapping["scale_factor"] = 1 / scale_factor else: mapping["scale_factor"] = 1 if "add_offset" in attributes: From 701bcfad494326a71ec08c454465bceaa33803e9 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Sat, 29 Jun 2024 18:24:13 -0600 Subject: [PATCH 46/68] Include chunked roundtrip fixture. --- virtualizarr/tests/test_readers/conftest.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py index e1a53c5..5fbec00 100644 --- a/virtualizarr/tests/test_readers/conftest.py +++ b/virtualizarr/tests/test_readers/conftest.py @@ -196,7 +196,8 @@ def filter_encoded_xarray_netcdf4_file(tmpdir, request, skip_test_for_libhdf5_ve "blosc_shuffle": 1, "fletcher32": False, } - + # Check on how handle scalar dim. + ds = ds.drop_dims("dim3") ds["var2"].encoding.update(encoding_config) filepath = f"{tmpdir}/{request.param}_xarray.nc" ds.to_netcdf(filepath, engine="netcdf4") @@ -237,3 +238,14 @@ def scale_add_offset_netcdf4_file(tmpdir, np_uncompressed_int16, offset, scale_f f["data"].attrs.create(name="add_offset", data=offset) f["data"].attrs.create(name="scale_factor", data=np.array([scale_factor])) return filepath + + +@pytest.fixture() +def chunked_roundtrip(tmpdir): + ds = create_test_data(dim_sizes=(20, 80, 10)) + ds = ds.drop_dims("dim3") + filepath = f"{tmpdir}/chunked_xarray.nc" + ds.to_netcdf( + filepath, engine="netcdf4", encoding={"var2": {"chunksizes": (10, 10)}} + ) + return filepath From 08c988e2c16a7366a4ea99f2fc073da407b326d5 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Sat, 29 Jun 2024 18:24:48 -0600 Subject: [PATCH 47/68] Standardize xarray integration tests for hdf filters. --- .../test_readers/test_hdf_integration.py | 47 ++++++------------- 1 file changed, 14 insertions(+), 33 deletions(-) diff --git a/virtualizarr/tests/test_readers/test_hdf_integration.py b/virtualizarr/tests/test_readers/test_hdf_integration.py index dca34db..abc23df 100644 --- a/virtualizarr/tests/test_readers/test_hdf_integration.py +++ b/virtualizarr/tests/test_readers/test_hdf_integration.py @@ -1,53 +1,34 @@ -import fsspec -import numpy import pytest import xarray as xr +import xarray.testing as xrt import virtualizarr from virtualizarr.kerchunk import FileType class TestIntegration: + @pytest.mark.xfail(reason="Investigate initial time value decoding issue") def test_filters_h5netcdf_roundtrip( self, tmpdir, filter_encoded_xarray_h5netcdf_file ): - virtual_ds = virtualizarr.open_virtual_dataset( + ds = xr.open_dataset(filter_encoded_xarray_h5netcdf_file, decode_times=False) + vds = virtualizarr.open_virtual_dataset( filter_encoded_xarray_h5netcdf_file, filetype=FileType("netcdf4") ) kerchunk_file = f"{tmpdir}/kerchunk.json" - virtual_ds.virtualize.to_kerchunk(kerchunk_file, format="json") - fs = fsspec.filesystem("reference", fo=kerchunk_file) - m = fs.get_mapper("") - - ds = xr.open_dataset(m, engine="kerchunk") - assert isinstance(ds.air.values[0][0][0], numpy.float64) + vds.virtualize.to_kerchunk(kerchunk_file, format="json") + roundtrip = xr.open_dataset( + kerchunk_file, engine="kerchunk", decode_times=False + ) + xrt.assert_allclose(ds, roundtrip) - @pytest.mark.skip( - reason="Issue with xr 'dim1' serialization and blosc availability" - ) def test_filters_netcdf4_roundtrip( self, tmpdir, filter_encoded_xarray_netcdf4_file ): filepath = filter_encoded_xarray_netcdf4_file["filepath"] - compressor = filter_encoded_xarray_netcdf4_file["compressor"] - virtual_ds = virtualizarr.open_virtual_dataset( - filepath, filetype=FileType("netcdf4") - ) + ds = xr.open_dataset(filepath) + vds = virtualizarr.open_virtual_dataset(filepath, filetype=FileType("netcdf4")) kerchunk_file = f"{tmpdir}/kerchunk.json" - virtual_ds.virtualize.to_kerchunk(kerchunk_file, format="json") - fs = fsspec.filesystem("reference", fo=kerchunk_file) - m = fs.get_mapper("") - ds = xr.open_dataset(m, engine="kerchunk") - - expected_encoding = ds["var2"].encoding.copy() - compression = expected_encoding.pop("compression") - blosc_shuffle = expected_encoding.pop("blosc_shuffle") - if compression is not None: - if "blosc" in compression and blosc_shuffle: - expected_encoding["blosc"] = { - "compressor": compressor, - "shuffle": blosc_shuffle, - } - expected_encoding["shuffle"] = False - actual_encoding = ds["var2"].encoding - assert expected_encoding.items() <= actual_encoding.items() + vds.virtualize.to_kerchunk(kerchunk_file, format="json") + roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk") + xrt.assert_equal(ds, roundtrip) From 4cb4bac261a7825f44798e247c13a6faeb752a5a Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Sat, 29 Jun 2024 20:00:56 -0600 Subject: [PATCH 48/68] Update reader selection logic for new filetype determination. --- virtualizarr/xarray.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py index 1a795e5..9671264 100644 --- a/virtualizarr/xarray.py +++ b/virtualizarr/xarray.py @@ -136,8 +136,7 @@ def open_virtual_dataset( filepath=filepath, reader_options=reader_options ) filetype = FileType(filetype) - - if filetype.name.lower() == "netcdf4": + if filetype.name.lower() == "netcdf4" or filetype.name.lower() == "hdf5": virtual_vars = virtual_vars_from_hdf( path=filepath, drop_variables=drop_variables, From d352104393d0eeacfc3b566a9f0cb79c7e688c8f Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Sun, 30 Jun 2024 13:07:17 -0600 Subject: [PATCH 49/68] Use decode_times for integration test. --- .../tests/test_readers/test_hdf_integration.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/virtualizarr/tests/test_readers/test_hdf_integration.py b/virtualizarr/tests/test_readers/test_hdf_integration.py index abc23df..882dea3 100644 --- a/virtualizarr/tests/test_readers/test_hdf_integration.py +++ b/virtualizarr/tests/test_readers/test_hdf_integration.py @@ -1,4 +1,3 @@ -import pytest import xarray as xr import xarray.testing as xrt @@ -7,19 +6,18 @@ class TestIntegration: - @pytest.mark.xfail(reason="Investigate initial time value decoding issue") def test_filters_h5netcdf_roundtrip( self, tmpdir, filter_encoded_xarray_h5netcdf_file ): - ds = xr.open_dataset(filter_encoded_xarray_h5netcdf_file, decode_times=False) + ds = xr.open_dataset(filter_encoded_xarray_h5netcdf_file, decode_times=True) vds = virtualizarr.open_virtual_dataset( - filter_encoded_xarray_h5netcdf_file, filetype=FileType("netcdf4") + filter_encoded_xarray_h5netcdf_file, + loadable_variables=["time"], + cftime_variables=["time"], ) kerchunk_file = f"{tmpdir}/kerchunk.json" vds.virtualize.to_kerchunk(kerchunk_file, format="json") - roundtrip = xr.open_dataset( - kerchunk_file, engine="kerchunk", decode_times=False - ) + roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk", decode_times=True) xrt.assert_allclose(ds, roundtrip) def test_filters_netcdf4_roundtrip( From 3d89ea426ccb0f8abdcb961e55773887d48d38d6 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Sun, 30 Jun 2024 13:38:46 -0600 Subject: [PATCH 50/68] Standardize fixture names for hdf5 vs netcdf4 file types. --- virtualizarr/tests/test_readers/conftest.py | 36 +++++---- virtualizarr/tests/test_readers/test_hdf.py | 78 +++++++++---------- .../tests/test_readers/test_hdf_filters.py | 26 +++---- .../test_readers/test_hdf_integration.py | 10 +-- 4 files changed, 76 insertions(+), 74 deletions(-) diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py index 5fbec00..539b2fb 100644 --- a/virtualizarr/tests/test_readers/conftest.py +++ b/virtualizarr/tests/test_readers/conftest.py @@ -9,7 +9,7 @@ @pytest.fixture -def empty_chunks_netcdf4_file(tmpdir): +def empty_chunks_hdf5_file(tmpdir): ds = xr.Dataset({"data": []}) filepath = f"{tmpdir}/empty_chunks.nc" ds.to_netcdf(filepath, engine="h5netcdf") @@ -17,7 +17,7 @@ def empty_chunks_netcdf4_file(tmpdir): @pytest.fixture -def empty_dataset_netcdf4_file(tmpdir): +def empty_dataset_hdf5_file(tmpdir): filepath = f"{tmpdir}/empty_dataset.nc" f = h5py.File(filepath, "w") f.create_dataset("data", shape=(0,), dtype="f") @@ -25,7 +25,7 @@ def empty_dataset_netcdf4_file(tmpdir): @pytest.fixture -def no_chunks_netcdf4_file(tmpdir): +def no_chunks_hdf5_file(tmpdir): filepath = f"{tmpdir}/no_chunks.nc" f = h5py.File(filepath, "w") data = np.random.random((10, 10)) @@ -34,7 +34,7 @@ def no_chunks_netcdf4_file(tmpdir): @pytest.fixture -def chunked_netcdf4_file(tmpdir): +def chunked_hdf5_file(tmpdir): filepath = f"{tmpdir}/chunks.nc" f = h5py.File(filepath, "w") data = np.random.random((100, 100)) @@ -43,7 +43,7 @@ def chunked_netcdf4_file(tmpdir): @pytest.fixture -def single_dimension_scale_netcdf4_file(tmpdir): +def single_dimension_scale_hdf5_file(tmpdir): filepath = f"{tmpdir}/single_dimension_scale.nc" f = h5py.File(filepath, "w") data = [1, 2] @@ -56,7 +56,7 @@ def single_dimension_scale_netcdf4_file(tmpdir): @pytest.fixture -def is_scale_netcdf4_file(tmpdir): +def is_scale_hdf5_file(tmpdir): filepath = f"{tmpdir}/is_scale.nc" f = h5py.File(filepath, "w") data = [1, 2] @@ -66,7 +66,7 @@ def is_scale_netcdf4_file(tmpdir): @pytest.fixture -def multiple_dimension_scales_netcdf4_file(tmpdir): +def multiple_dimension_scales_hdf5_file(tmpdir): filepath = f"{tmpdir}/multiple_dimension_scales.nc" f = h5py.File(filepath, "w") data = [1, 2] @@ -96,7 +96,7 @@ def chunked_dimensions_netcdf4_file(tmpdir): @pytest.fixture -def string_attributes_netcdf4_file(tmpdir): +def string_attributes_hdf5_file(tmpdir): filepath = f"{tmpdir}/attributes.nc" f = h5py.File(filepath, "w") data = np.random.random((10, 10)) @@ -107,7 +107,7 @@ def string_attributes_netcdf4_file(tmpdir): @pytest.fixture -def root_attributes_netcdf4_file(tmpdir): +def root_attributes_hdf5_file(tmpdir): filepath = f"{tmpdir}/root_attributes.nc" f = h5py.File(filepath, "w") f.attrs["attribute_name"] = "attribute_name" @@ -115,7 +115,7 @@ def root_attributes_netcdf4_file(tmpdir): @pytest.fixture -def group_netcdf4_file(tmpdir): +def group_hdf5_file(tmpdir): filepath = f"{tmpdir}/group.nc" f = h5py.File(filepath, "w") f.create_group("group") @@ -123,7 +123,7 @@ def group_netcdf4_file(tmpdir): @pytest.fixture -def multiple_datasets_netcdf4_file(tmpdir): +def multiple_datasets_hdf5_file(tmpdir): filepath = f"{tmpdir}/multiple_datasets.nc" f = h5py.File(filepath, "w") data = np.random.random((10, 10)) @@ -138,7 +138,7 @@ def np_uncompressed(): @pytest.fixture(params=["gzip", "blosc_lz4", "lz4", "bzip2", "zstd"]) -def filter_encoded_netcdf4_file(tmpdir, np_uncompressed, request): +def filter_encoded_hdf5_file(tmpdir, np_uncompressed, request): filepath = f"{tmpdir}/{request.param}.nc" f = h5py.File(filepath, "w") if request.param == "gzip": @@ -162,7 +162,7 @@ def filter_encoded_netcdf4_file(tmpdir, np_uncompressed, request): @pytest.fixture(params=["gzip"]) -def filter_encoded_xarray_h5netcdf_file(tmpdir, request): +def filter_encoded_roundtrip_hdf5_file(tmpdir, request): ds = xr.tutorial.open_dataset("air_temperature") encoding = {} if request.param == "gzip": @@ -184,7 +184,9 @@ def skip_test_for_libhdf5_version(): @pytest.fixture(params=["blosc_zlib"]) -def filter_encoded_xarray_netcdf4_file(tmpdir, request, skip_test_for_libhdf5_version): +def filter_encoded_roundtrip_netcdf4_file( + tmpdir, request, skip_test_for_libhdf5_version +): if skip_test_for_libhdf5_version: pytest.skip("Requires libhdf5 >= 1.14") ds = create_test_data(dim_sizes=(20, 80, 10)) @@ -215,7 +217,7 @@ def offset(): @pytest.fixture -def add_offset_netcdf4_file(tmpdir, np_uncompressed_int16, offset): +def add_offset_hdf5_file(tmpdir, np_uncompressed_int16, offset): filepath = f"{tmpdir}/offset.nc" f = h5py.File(filepath, "w") data = np_uncompressed_int16 - offset @@ -230,7 +232,7 @@ def scale_factor(): @pytest.fixture -def scale_add_offset_netcdf4_file(tmpdir, np_uncompressed_int16, offset, scale_factor): +def scale_add_offset_hdf5_file(tmpdir, np_uncompressed_int16, offset, scale_factor): filepath = f"{tmpdir}/scale_offset.nc" f = h5py.File(filepath, "w") data = (np_uncompressed_int16 - offset) / scale_factor @@ -241,7 +243,7 @@ def scale_add_offset_netcdf4_file(tmpdir, np_uncompressed_int16, offset, scale_f @pytest.fixture() -def chunked_roundtrip(tmpdir): +def chunked_roundtrip_hdf5_file(tmpdir): ds = create_test_data(dim_sizes=(20, 80, 10)) ds = ds.drop_dims("dim3") filepath = f"{tmpdir}/chunked_xarray.nc" diff --git a/virtualizarr/tests/test_readers/test_hdf.py b/virtualizarr/tests/test_readers/test_hdf.py index 25caab9..1fb0f6e 100644 --- a/virtualizarr/tests/test_readers/test_hdf.py +++ b/virtualizarr/tests/test_readers/test_hdf.py @@ -11,59 +11,59 @@ class TestDatasetChunkManifest: - def test_empty_chunks(self, empty_chunks_netcdf4_file): - f = h5py.File(empty_chunks_netcdf4_file) + def test_empty_chunks(self, empty_chunks_hdf5_file): + f = h5py.File(empty_chunks_hdf5_file) ds = f["data"] with pytest.raises(ValueError, match="chunked but contains no chunks"): - _dataset_chunk_manifest(path=empty_chunks_netcdf4_file, dataset=ds) + _dataset_chunk_manifest(path=empty_chunks_hdf5_file, dataset=ds) @pytest.mark.skip("Need to differentiate non coordinate dimensions from empty") - def test_empty_dataset(self, empty_dataset_netcdf4_file): - f = h5py.File(empty_dataset_netcdf4_file) + def test_empty_dataset(self, empty_dataset_hdf5_file): + f = h5py.File(empty_dataset_hdf5_file) ds = f["data"] with pytest.raises(ValueError, match="no space allocated in the file"): - _dataset_chunk_manifest(path=empty_dataset_netcdf4_file, dataset=ds) + _dataset_chunk_manifest(path=empty_dataset_hdf5_file, dataset=ds) - def test_no_chunking(self, no_chunks_netcdf4_file): - f = h5py.File(no_chunks_netcdf4_file) + def test_no_chunking(self, no_chunks_hdf5_file): + f = h5py.File(no_chunks_hdf5_file) ds = f["data"] - manifest = _dataset_chunk_manifest(path=no_chunks_netcdf4_file, dataset=ds) + manifest = _dataset_chunk_manifest(path=no_chunks_hdf5_file, dataset=ds) assert manifest.shape_chunk_grid == (1, 1) - def test_chunked(self, chunked_netcdf4_file): - f = h5py.File(chunked_netcdf4_file) + def test_chunked(self, chunked_hdf5_file): + f = h5py.File(chunked_hdf5_file) ds = f["data"] - manifest = _dataset_chunk_manifest(path=chunked_netcdf4_file, dataset=ds) + manifest = _dataset_chunk_manifest(path=chunked_hdf5_file, dataset=ds) assert manifest.shape_chunk_grid == (2, 2) - def test_chunked_roundtrip(self, chunked_roundtrip): - f = h5py.File(chunked_roundtrip) + def test_chunked_roundtrip(self, chunked_roundtrip_hdf5_file): + f = h5py.File(chunked_roundtrip_hdf5_file) ds = f["var2"] - manifest = _dataset_chunk_manifest(path=chunked_roundtrip, dataset=ds) + manifest = _dataset_chunk_manifest(path=chunked_roundtrip_hdf5_file, dataset=ds) assert manifest.shape_chunk_grid == (2, 8) class TestDatasetDims: - def test_single_dimension_scale(self, single_dimension_scale_netcdf4_file): - f = h5py.File(single_dimension_scale_netcdf4_file) + def test_single_dimension_scale(self, single_dimension_scale_hdf5_file): + f = h5py.File(single_dimension_scale_hdf5_file) ds = f["data"] dims = _dataset_dims(ds) assert dims[0] == "x" - def test_is_dimension_scale(self, is_scale_netcdf4_file): - f = h5py.File(is_scale_netcdf4_file) + def test_is_dimension_scale(self, is_scale_hdf5_file): + f = h5py.File(is_scale_hdf5_file) ds = f["data"] dims = _dataset_dims(ds) assert dims[0] == "data" - def test_multiple_dimension_scales(self, multiple_dimension_scales_netcdf4_file): - f = h5py.File(multiple_dimension_scales_netcdf4_file) + def test_multiple_dimension_scales(self, multiple_dimension_scales_hdf5_file): + f = h5py.File(multiple_dimension_scales_hdf5_file) ds = f["data"] with pytest.raises(ValueError, match="dimension scales attached"): _dataset_dims(ds) - def test_no_dimension_scales(self, no_chunks_netcdf4_file): - f = h5py.File(no_chunks_netcdf4_file) + def test_no_dimension_scales(self, no_chunks_hdf5_file): + f = h5py.File(no_chunks_hdf5_file) ds = f["data"] dims = _dataset_dims(ds) assert dims == ["phony_dim_0", "phony_dim_1"] @@ -76,33 +76,33 @@ def test_chunked_dataset(self, chunked_dimensions_netcdf4_file): var = _dataset_to_variable(chunked_dimensions_netcdf4_file, ds) assert var.chunks == (50, 50) - def test_not_chunked_dataset(self, single_dimension_scale_netcdf4_file): - f = h5py.File(single_dimension_scale_netcdf4_file) + def test_not_chunked_dataset(self, single_dimension_scale_hdf5_file): + f = h5py.File(single_dimension_scale_hdf5_file) ds = f["data"] - var = _dataset_to_variable(single_dimension_scale_netcdf4_file, ds) + var = _dataset_to_variable(single_dimension_scale_hdf5_file, ds) assert var.chunks == (2,) - def test_dataset_attributes(self, string_attributes_netcdf4_file): - f = h5py.File(string_attributes_netcdf4_file) + def test_dataset_attributes(self, string_attributes_hdf5_file): + f = h5py.File(string_attributes_hdf5_file) ds = f["data"] - var = _dataset_to_variable(string_attributes_netcdf4_file, ds) + var = _dataset_to_variable(string_attributes_hdf5_file, ds) assert var.attrs["attribute_name"] == "attribute_name" class TestExtractAttributes: - def test_string_attribute(self, string_attributes_netcdf4_file): - f = h5py.File(string_attributes_netcdf4_file) + def test_string_attribute(self, string_attributes_hdf5_file): + f = h5py.File(string_attributes_hdf5_file) ds = f["data"] attrs = _extract_attrs(ds) assert attrs["attribute_name"] == "attribute_name" - def test_root_attribute(self, root_attributes_netcdf4_file): - f = h5py.File(root_attributes_netcdf4_file) + def test_root_attribute(self, root_attributes_hdf5_file): + f = h5py.File(root_attributes_hdf5_file) attrs = _extract_attrs(f) assert attrs["attribute_name"] == "attribute_name" - def test_multiple_attributes(self, string_attributes_netcdf4_file): - f = h5py.File(string_attributes_netcdf4_file) + def test_multiple_attributes(self, string_attributes_hdf5_file): + f = h5py.File(string_attributes_hdf5_file) ds = f["data"] attrs = _extract_attrs(ds) assert len(attrs.keys()) == 2 @@ -113,10 +113,10 @@ def test_variable_with_dimensions(self, chunked_dimensions_netcdf4_file): variables = virtual_vars_from_hdf(chunked_dimensions_netcdf4_file) assert len(variables) == 3 - def test_groups_not_implemented(self, group_netcdf4_file): + def test_groups_not_implemented(self, group_hdf5_file): with pytest.raises(NotImplementedError): - virtual_vars_from_hdf(group_netcdf4_file) + virtual_vars_from_hdf(group_hdf5_file) - def test_drop_variables(self, multiple_datasets_netcdf4_file): - variables = virtual_vars_from_hdf(multiple_datasets_netcdf4_file, ["data2"]) + def test_drop_variables(self, multiple_datasets_hdf5_file): + variables = virtual_vars_from_hdf(multiple_datasets_hdf5_file, ["data2"]) assert "data2" not in variables.keys() diff --git a/virtualizarr/tests/test_readers/test_hdf_filters.py b/virtualizarr/tests/test_readers/test_hdf_filters.py index 960bcf2..99b3af4 100644 --- a/virtualizarr/tests/test_readers/test_hdf_filters.py +++ b/virtualizarr/tests/test_readers/test_hdf_filters.py @@ -39,12 +39,12 @@ def test_zstd(self): class TestCodecsFromDataSet: - def test_numcodec_decoding(self, np_uncompressed, filter_encoded_netcdf4_file): - f = h5py.File(filter_encoded_netcdf4_file) + def test_numcodec_decoding(self, np_uncompressed, filter_encoded_hdf5_file): + f = h5py.File(filter_encoded_hdf5_file) ds = f["data"] chunk_info = ds.id.get_chunk_info(0) codecs = codecs_from_dataset(ds) - with open(filter_encoded_netcdf4_file, "rb") as file: + with open(filter_encoded_hdf5_file, "rb") as file: file.seek(chunk_info.byte_offset) bytes_read = file.read(chunk_info.size) decoded = codecs[0].decode(bytes_read) @@ -52,8 +52,8 @@ def test_numcodec_decoding(self, np_uncompressed, filter_encoded_netcdf4_file): class TestCFCodecFromDataset: - def test_no_cf_convention(self, filter_encoded_netcdf4_file): - f = h5py.File(filter_encoded_netcdf4_file) + def test_no_cf_convention(self, filter_encoded_hdf5_file): + f = h5py.File(filter_encoded_hdf5_file) ds = f["data"] cf_codec = cfcodec_from_dataset(ds) assert cf_codec is None @@ -68,8 +68,8 @@ def test_cf_scale_factor(self, netcdf4_file): assert cf_codec["codec"].dtype == " Date: Sun, 30 Jun 2024 22:14:26 -0600 Subject: [PATCH 51/68] Handle array add_offset property for compressed data. --- virtualizarr/readers/hdf_filters.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py index 1a3c222..5b35d8f 100644 --- a/virtualizarr/readers/hdf_filters.py +++ b/virtualizarr/readers/hdf_filters.py @@ -96,7 +96,11 @@ def cfcodec_from_dataset(dataset: h5py.Dataset) -> Codec | None: else: mapping["scale_factor"] = 1 if "add_offset" in attributes: - mapping["add_offset"] = attributes["add_offset"] + try: + offset = attributes["add_offset"][0] + except IndexError: + offset = attributes["add_offset"] + mapping["add_offset"] = offset else: mapping["add_offset"] = 0 if mapping["scale_factor"] != 1 or mapping["add_offset"] != 0: From db5b4213b0c4b512c872ce4acdce04c66936a6a5 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Mon, 1 Jul 2024 16:57:11 -0600 Subject: [PATCH 52/68] Include h5py shuffle filter. --- virtualizarr/readers/hdf_filters.py | 18 ++++++++++++++---- .../tests/test_readers/test_hdf_filters.py | 11 ++++++++++- 2 files changed, 24 insertions(+), 5 deletions(-) diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py index 5b35d8f..a60dd56 100644 --- a/virtualizarr/readers/hdf_filters.py +++ b/virtualizarr/readers/hdf_filters.py @@ -36,6 +36,14 @@ class ZstdProperties(BaseModel): level: int +class ShuffleProperties(BaseModel): + elementsize: int + + +class ZlibProperties(BaseModel): + level: int + + class CFCodec(TypedDict): target_dtype: np.dtype codec: Codec @@ -56,9 +64,13 @@ def _filter_to_codec( id = _non_standard_filters[id_str] else: id = id_str - conf["id"] = id # type: ignore[assignment] if id == "zlib": - conf["level"] = filter_properties # type: ignore[assignment] + zlib_props = ZlibProperties(level=filter_properties) + conf = zlib_props.model_dump() # type: ignore[assignment] + if id == "shuffle" and isinstance(filter_properties, tuple): + shuffle_props = ShuffleProperties(elementsize=filter_properties[0]) + conf = shuffle_props.model_dump() # type: ignore[assignment] + conf["id"] = id # type: ignore[assignment] if id_int: filter = hdf5plugin.get_filters(id_int)[0] id = filter.filter_name @@ -77,9 +89,7 @@ def _filter_to_codec( if id == "zstd" and isinstance(filter_properties, tuple): zstd_props = ZstdProperties(level=filter_properties[0]) conf = zstd_props.model_dump() # type: ignore[assignment] - conf["id"] = id - codec = registry.get_codec(conf) return codec diff --git a/virtualizarr/tests/test_readers/test_hdf_filters.py b/virtualizarr/tests/test_readers/test_hdf_filters.py index 99b3af4..efaad78 100644 --- a/virtualizarr/tests/test_readers/test_hdf_filters.py +++ b/virtualizarr/tests/test_readers/test_hdf_filters.py @@ -37,6 +37,12 @@ def test_zstd(self): expected_config = {"id": "zstd", "level": 5} assert codec.get_config() == expected_config + def test_shuffle(self): + codec = _filter_to_codec("shuffle", (7,)) + assert isinstance(codec, numcodecs.shuffle.Shuffle) + expected_config = {"id": "shuffle", "elementsize": 7} + assert codec.get_config() == expected_config + class TestCodecsFromDataSet: def test_numcodec_decoding(self, np_uncompressed, filter_encoded_hdf5_file): @@ -48,7 +54,10 @@ def test_numcodec_decoding(self, np_uncompressed, filter_encoded_hdf5_file): file.seek(chunk_info.byte_offset) bytes_read = file.read(chunk_info.size) decoded = codecs[0].decode(bytes_read) - assert decoded == np_uncompressed.tobytes() + if isinstance(decoded, np.ndarray): + assert decoded.tobytes() == np_uncompressed.tobytes() + else: + assert decoded == np_uncompressed.tobytes() class TestCFCodecFromDataset: From 9a1da321e186f56d230cb5609dc787f7d9ec557b Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Mon, 1 Jul 2024 17:03:46 -0600 Subject: [PATCH 53/68] Make ScaleAndOffset codec last in filters list. --- virtualizarr/readers/hdf.py | 2 +- virtualizarr/tests/test_readers/conftest.py | 36 ++++++++++++++++++- .../test_readers/test_hdf_integration.py | 10 ++++++ 3 files changed, 46 insertions(+), 2 deletions(-) diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py index d683f69..f3337c0 100644 --- a/virtualizarr/readers/hdf.py +++ b/virtualizarr/readers/hdf.py @@ -176,7 +176,7 @@ def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> Optional[xr.Variab cfcodec = cfcodec_from_dataset(dataset) attrs = _extract_attrs(dataset) if cfcodec: - codecs.append(cfcodec["codec"]) + codecs.insert(0, cfcodec["codec"]) dtype = cfcodec["target_dtype"] attrs.pop("scale_factor", None) attrs.pop("add_offset", None) diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py index 539b2fb..afc0bee 100644 --- a/virtualizarr/tests/test_readers/conftest.py +++ b/virtualizarr/tests/test_readers/conftest.py @@ -137,7 +137,7 @@ def np_uncompressed(): return np.arange(100) -@pytest.fixture(params=["gzip", "blosc_lz4", "lz4", "bzip2", "zstd"]) +@pytest.fixture(params=["gzip", "blosc_lz4", "lz4", "bzip2", "zstd", "shuffle"]) def filter_encoded_hdf5_file(tmpdir, np_uncompressed, request): filepath = f"{tmpdir}/{request.param}.nc" f = h5py.File(filepath, "w") @@ -157,6 +157,8 @@ def filter_encoded_hdf5_file(tmpdir, np_uncompressed, request): f.create_dataset(name="data", data=np_uncompressed, **hdf5plugin.BZip2()) if request.param == "zstd": f.create_dataset(name="data", data=np_uncompressed, **hdf5plugin.Zstd(clevel=2)) + if request.param == "shuffle": + f.create_dataset(name="data", data=np_uncompressed, shuffle=True) return filepath @@ -251,3 +253,35 @@ def chunked_roundtrip_hdf5_file(tmpdir): filepath, engine="netcdf4", encoding={"var2": {"chunksizes": (10, 10)}} ) return filepath + + +@pytest.fixture(params=["gzip", "zlib"]) +def filter_and_cf_roundtrip_hdf5_file(tmpdir, request): + x = np.arange(100) + y = np.arange(100) + temperature = 0.1 * x[:, None] + 0.1 * y[None, :] + ds = xr.Dataset( + {"temperature": (["x", "y"], temperature)}, + coords={"x": np.arange(100), "y": np.arange(100)}, + ) + encoding = { + "temperature": { + "dtype": "int16", + "scale_factor": 0.1, + "add_offset": 273.15, + } + } + if request.param == "gzip": + encoding["temperature"]["compression"] = "gzip" + encoding["temperature"]["compression_opts"] = 7 + + if request.param == "zlib": + encoding["temperature"]["zlib"] = True + encoding["temperature"]["complevel"] = 9 + + from random import randint + + filepath = f"{tmpdir}/{request.param}_{randint(0,100)}_cf_roundtrip.nc" + ds.to_netcdf(filepath, engine="h5netcdf", encoding=encoding) + + return filepath diff --git a/virtualizarr/tests/test_readers/test_hdf_integration.py b/virtualizarr/tests/test_readers/test_hdf_integration.py index 4fc7bd3..dd8d6c3 100644 --- a/virtualizarr/tests/test_readers/test_hdf_integration.py +++ b/virtualizarr/tests/test_readers/test_hdf_integration.py @@ -1,3 +1,4 @@ +import pytest import xarray as xr import xarray.testing as xrt @@ -30,3 +31,12 @@ def test_filters_netcdf4_roundtrip( vds.virtualize.to_kerchunk(kerchunk_file, format="json") roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk") xrt.assert_equal(ds, roundtrip) + + @pytest.mark.xfail(reason="Investigate kerchunk _FillValue logic") + def test_filter_and_cf_roundtrip(self, tmpdir, filter_and_cf_roundtrip_hdf5_file): + ds = xr.open_dataset(filter_and_cf_roundtrip_hdf5_file) + vds = virtualizarr.open_virtual_dataset(filter_and_cf_roundtrip_hdf5_file) + kerchunk_file = f"{tmpdir}/filter_cf_kerchunk.json" + vds.virtualize.to_kerchunk(kerchunk_file, format="json") + roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk") + xrt.assert_allclose(ds, roundtrip) From 9b2b0f8a2b94073c2bf50fe78d8dd068e6d1332c Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Tue, 2 Jul 2024 13:23:23 -0600 Subject: [PATCH 54/68] Apply ScaleAndOffset codec to _FillValue since it's value is now downstream. --- virtualizarr/readers/hdf.py | 4 +++- virtualizarr/tests/test_readers/conftest.py | 7 ++++++- virtualizarr/tests/test_readers/test_hdf_integration.py | 2 -- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py index f3337c0..6197067 100644 --- a/virtualizarr/readers/hdf.py +++ b/virtualizarr/readers/hdf.py @@ -180,14 +180,16 @@ def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> Optional[xr.Variab dtype = cfcodec["target_dtype"] attrs.pop("scale_factor", None) attrs.pop("add_offset", None) + fill_value = cfcodec["codec"].decode(dataset.fillvalue) else: dtype = dataset.dtype + fill_value = dataset.fillvalue filters = [codec.get_config() for codec in codecs] zarray = ZArray( chunks=chunks, compressor=None, dtype=dtype, - fill_value=dataset.fillvalue, + fill_value=fill_value, filters=filters, order="C", shape=dataset.shape, diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py index afc0bee..ec4132b 100644 --- a/virtualizarr/tests/test_readers/conftest.py +++ b/virtualizarr/tests/test_readers/conftest.py @@ -259,7 +259,9 @@ def chunked_roundtrip_hdf5_file(tmpdir): def filter_and_cf_roundtrip_hdf5_file(tmpdir, request): x = np.arange(100) y = np.arange(100) + fill_value = np.int16(-9999) temperature = 0.1 * x[:, None] + 0.1 * y[None, :] + temperature[0][0] = fill_value ds = xr.Dataset( {"temperature": (["x", "y"], temperature)}, coords={"x": np.arange(100), "y": np.arange(100)}, @@ -269,7 +271,10 @@ def filter_and_cf_roundtrip_hdf5_file(tmpdir, request): "dtype": "int16", "scale_factor": 0.1, "add_offset": 273.15, - } + "_FillValue": fill_value, + }, + "x": {"_FillValue": fill_value}, + "y": {"_FillValue": fill_value}, } if request.param == "gzip": encoding["temperature"]["compression"] = "gzip" diff --git a/virtualizarr/tests/test_readers/test_hdf_integration.py b/virtualizarr/tests/test_readers/test_hdf_integration.py index dd8d6c3..5cf3f79 100644 --- a/virtualizarr/tests/test_readers/test_hdf_integration.py +++ b/virtualizarr/tests/test_readers/test_hdf_integration.py @@ -1,4 +1,3 @@ -import pytest import xarray as xr import xarray.testing as xrt @@ -32,7 +31,6 @@ def test_filters_netcdf4_roundtrip( roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk") xrt.assert_equal(ds, roundtrip) - @pytest.mark.xfail(reason="Investigate kerchunk _FillValue logic") def test_filter_and_cf_roundtrip(self, tmpdir, filter_and_cf_roundtrip_hdf5_file): ds = xr.open_dataset(filter_and_cf_roundtrip_hdf5_file) vds = virtualizarr.open_virtual_dataset(filter_and_cf_roundtrip_hdf5_file) From 9ef136275ff636535dcb7e6ecc5b35c1e7149065 Mon Sep 17 00:00:00 2001 From: sharkinsspatial Date: Tue, 2 Jul 2024 15:12:04 -0600 Subject: [PATCH 55/68] Coerce scale and add_offset values to native float for JSON serialization. --- virtualizarr/readers/hdf_filters.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py index a60dd56..ae232fe 100644 --- a/virtualizarr/readers/hdf_filters.py +++ b/virtualizarr/readers/hdf_filters.py @@ -102,7 +102,7 @@ def cfcodec_from_dataset(dataset: h5py.Dataset) -> Codec | None: scale_factor = attributes["scale_factor"][0] except IndexError: scale_factor = attributes["scale_factor"] - mapping["scale_factor"] = 1 / scale_factor + mapping["scale_factor"] = float(1 / scale_factor) else: mapping["scale_factor"] = 1 if "add_offset" in attributes: @@ -110,7 +110,7 @@ def cfcodec_from_dataset(dataset: h5py.Dataset) -> Codec | None: offset = attributes["add_offset"][0] except IndexError: offset = attributes["add_offset"] - mapping["add_offset"] = offset + mapping["add_offset"] = float(offset) else: mapping["add_offset"] = 0 if mapping["scale_factor"] != 1 or mapping["add_offset"] != 0: From eb16bc1ab249a5a2d9b48ae1b7920c6f0d7a4c1d Mon Sep 17 00:00:00 2001 From: Gustavo Hidalgo Date: Wed, 17 Jul 2024 16:27:30 -0400 Subject: [PATCH 56/68] Conformant ZarrV3 codecs --- virtualizarr/tests/test_integration.py | 2 +- virtualizarr/tests/test_zarr.py | 62 ++++++++++++++-- virtualizarr/zarr.py | 97 ++++++++++++++++++++++++-- 3 files changed, 148 insertions(+), 13 deletions(-) diff --git a/virtualizarr/tests/test_integration.py b/virtualizarr/tests/test_integration.py index 2e612de..239316a 100644 --- a/virtualizarr/tests/test_integration.py +++ b/virtualizarr/tests/test_integration.py @@ -138,7 +138,7 @@ def test_non_dimension_coordinates(self, tmpdir, format): # regression test for GH issue #105 # set up example xarray dataset containing non-dimension coordinate variables - ds = xr.Dataset(coords={"lat": (["x", "y"], np.arange(6).reshape(2, 3))}) + ds = xr.Dataset(coords={"lat": (["x", "y"], np.arange(6.0).reshape(2, 3))}) # save it to disk as netCDF (in temporary directory) ds.to_netcdf(f"{tmpdir}/non_dim_coords.nc") diff --git a/virtualizarr/tests/test_zarr.py b/virtualizarr/tests/test_zarr.py index 80d04b9..01ac7e5 100644 --- a/virtualizarr/tests/test_zarr.py +++ b/virtualizarr/tests/test_zarr.py @@ -1,12 +1,17 @@ +import json + import numpy as np +import pytest import xarray as xr import xarray.testing as xrt from virtualizarr import ManifestArray, open_virtual_dataset from virtualizarr.manifests.manifest import ChunkManifest +from virtualizarr.zarr import dataset_to_zarr, metadata_from_zarr_json -def test_zarr_v3_roundtrip(tmpdir): +@pytest.fixture +def vds_with_manifest_arrays() -> xr.Dataset: arr = ManifestArray( chunkmanifest=ChunkManifest( entries={"0.0": dict(path="test.nc", offset=6144, length=48)} @@ -15,18 +20,63 @@ def test_zarr_v3_roundtrip(tmpdir): shape=(2, 3), dtype=np.dtype(" 1 + and all(isconfigurable(codec) for codec in metadata["codecs"]) + ) diff --git a/virtualizarr/zarr.py b/virtualizarr/zarr.py index 545a86f..0ffc224 100644 --- a/virtualizarr/zarr.py +++ b/virtualizarr/zarr.py @@ -6,8 +6,10 @@ Literal, NewType, Optional, + Union, ) +import numcodecs import numpy as np import ujson # type: ignore import xarray as xr @@ -103,6 +105,8 @@ def dict(self) -> dict[str, Any]: if zarray_dict["fill_value"] is np.nan: zarray_dict["fill_value"] = None + else: + zarray_dict["fill_value"] = self._default_fill_value() return zarray_dict @@ -134,6 +138,80 @@ def replace( zarr_format=zarr_format if zarr_format is not None else self.zarr_format, ) + def _default_fill_value(self) -> Union[bool, int, float, str, list]: + """ + The value and format of the fill_value depend on the data_type of the array. + See here for spec: + https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html#fill-value + """ + # numpy dtypes's hierarchy lets us avoid checking for all the widths + # https://numpy.org/doc/stable/reference/arrays.scalars.html + if self.dtype is np.dtype("bool"): + return False + elif self.dtype is np.dtype("int"): + return 0 + elif self.dtype is np.dtype("float"): + return "NaN" + elif self.dtype is np.dtype("complex"): + return ["NaN", "NaN"] + else: + return "NaN" + + def _v3_codec_pipeline(self) -> list: + """ + VirtualiZarr internally uses the `filters`, `compressor`, and `order` attributes + from zarr v2, but to create conformant zarr v3 metadata those 3 must be turned into `codecs` objects. + Not all codecs are created equal though: https://github.com/zarr-developers/zarr-python/issues/1943 + An array _must_ declare a single ArrayBytes codec, and 0 or more ArrayArray, BytesBytes codecs. + Roughly, this is the mapping: + ``` + filters: Iterable[ArrayArrayCodec] #optional + compressor: ArrayBytesCodec #mandatory + post_compressor: Iterable[BytesBytesCodec] #optional + ``` + """ + if self.filters: + filter_codecs_configs = [ + numcodecs.get_codec(filter).get_config() for filter in self.filters + ] + filters = [ + dict(name=codec.pop("id"), configuration=codec) + for codec in filter_codecs_configs + ] + else: + filters = [] + + # Noting here that zarr v3 has very few codecs specificed in the official spec, + # and that there are far more codecs in `numcodecs`. We take a gamble and assume + # that the codec names and configuration are simply mapped into zarrv3 "configurables". + compressor_codec = numcodecs.get_codec( + # default to gzip because it is officially specified in the zarr v3 spec + dict(id=self.compressor or "gzip") + ).get_config() + compressor_id = compressor_codec.pop("id") + compressor = dict(name=compressor_id, configuration=compressor_codec) + + # https://zarr-specs.readthedocs.io/en/latest/v3/codecs/transpose/v1.0.html#transpose-codec-v1 + # Either "C" or "F", defining the layout of bytes within each chunk of the array. + # "C" means row-major order, i.e., the last dimension varies fastest; + # "F" means column-major order, i.e., the first dimension varies fastest. + if self.order == "C": + order = tuple(range(len(self.shape))) + elif self.order == "F": + order = tuple(reversed(range(len(self.shape)))) + + transpose = dict(name="transpose", configuration=dict(order=order)) + # https://github.com/zarr-developers/zarr-python/pull/1944#issuecomment-2151994097 + # "If no ArrayBytesCodec is supplied, we can auto-add a BytesCodec" + bytes = dict( + name="bytes", configuration={} + ) # TODO need to handle endianess configuration + + # The order here is significant! + # [ArrayArray] -> ArrayBytes -> [BytesBytes] + codec_pipeline = [transpose, bytes] + [compressor] + filters + return codec_pipeline + def encode_dtype(dtype: np.dtype) -> str: # TODO not sure if there is a better way to get the ' "name": "default", "configuration": {"separator": "/"}, } - metadata["codecs"] = metadata.pop("filters") - metadata.pop("compressor") # TODO this should be entered in codecs somehow - metadata.pop("order") # TODO this should be replaced by a transpose codec + metadata["codecs"] = zarray._v3_codec_pipeline() + metadata.pop("filters") + metadata.pop("compressor") + metadata.pop("order") # indicate that we're using the manifest storage transformer ZEP metadata["storage_transformers"] = [ @@ -282,13 +361,19 @@ def metadata_from_zarr_json(filepath: Path) -> tuple[ZArray, list[str], dict]: fill_value = np.nan else: fill_value = metadata["fill_value"] - + all_codecs = [ + codec + for codec in metadata["codecs"] + if codec["name"] not in ("transpose", "bytes") + ] + compressor = all_codecs[0] + filters = [dict(id=f.pop("name"), **f) for f in all_codecs[1:]] or None zarray = ZArray( chunks=metadata["chunk_grid"]["configuration"]["chunk_shape"], - compressor=metadata["codecs"], + compressor=compressor["name"], dtype=np.dtype(metadata["data_type"]), fill_value=fill_value, - filters=metadata.get("filters", None), + filters=filters, order="C", shape=chunk_shape, zarr_format=3, From 5f1b7f9aff309c53e95bdd85fdf6dee7a2caae3e Mon Sep 17 00:00:00 2001 From: Gustavo Hidalgo Date: Wed, 17 Jul 2024 16:42:39 -0400 Subject: [PATCH 57/68] Update docs --- docs/releases.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/releases.rst b/docs/releases.rst index c44ff24..1451191 100644 --- a/docs/releases.rst +++ b/docs/releases.rst @@ -12,6 +12,9 @@ New Features Breaking changes ~~~~~~~~~~~~~~~~ +- Serialize valid ZarrV3 metadata (for :pull:`193`). + By `Gustavo Hidalgo `_. + Deprecations ~~~~~~~~~~~~ From 519d45d6c62480d3ee6bf378cc4629fd92cb6cb0 Mon Sep 17 00:00:00 2001 From: Gustavo Hidalgo Date: Wed, 17 Jul 2024 21:19:27 -0400 Subject: [PATCH 58/68] Update virtualizarr/zarr.py Co-authored-by: Tom Augspurger --- virtualizarr/zarr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/virtualizarr/zarr.py b/virtualizarr/zarr.py index 0ffc224..45f7874 100644 --- a/virtualizarr/zarr.py +++ b/virtualizarr/zarr.py @@ -196,7 +196,7 @@ def _v3_codec_pipeline(self) -> list: # "C" means row-major order, i.e., the last dimension varies fastest; # "F" means column-major order, i.e., the first dimension varies fastest. if self.order == "C": - order = tuple(range(len(self.shape))) + order = tuple(enumerate(self.shape)) elif self.order == "F": order = tuple(reversed(range(len(self.shape)))) From 76e9c8ecc0f4da63db1f41539f2a9a655129214b Mon Sep 17 00:00:00 2001 From: Gustavo Hidalgo Date: Wed, 17 Jul 2024 21:19:44 -0400 Subject: [PATCH 59/68] Update virtualizarr/zarr.py Co-authored-by: Tom Augspurger --- virtualizarr/zarr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/virtualizarr/zarr.py b/virtualizarr/zarr.py index 45f7874..1bb8cc3 100644 --- a/virtualizarr/zarr.py +++ b/virtualizarr/zarr.py @@ -198,7 +198,7 @@ def _v3_codec_pipeline(self) -> list: if self.order == "C": order = tuple(enumerate(self.shape)) elif self.order == "F": - order = tuple(reversed(range(len(self.shape)))) + order = tuple(reversed(enumerate(self.shape))) transpose = dict(name="transpose", configuration=dict(order=order)) # https://github.com/zarr-developers/zarr-python/pull/1944#issuecomment-2151994097 From 000c52072fafc80cfb9defae39a6310ec4574b54 Mon Sep 17 00:00:00 2001 From: Gustavo Hidalgo Date: Wed, 17 Jul 2024 21:38:48 -0400 Subject: [PATCH 60/68] Change default_fill to 0s --- virtualizarr/zarr.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/virtualizarr/zarr.py b/virtualizarr/zarr.py index 0ffc224..cdc45df 100644 --- a/virtualizarr/zarr.py +++ b/virtualizarr/zarr.py @@ -151,11 +151,11 @@ def _default_fill_value(self) -> Union[bool, int, float, str, list]: elif self.dtype is np.dtype("int"): return 0 elif self.dtype is np.dtype("float"): - return "NaN" + return 0.0 elif self.dtype is np.dtype("complex"): - return ["NaN", "NaN"] + return [0.0, 0.0] else: - return "NaN" + return 0.0 def _v3_codec_pipeline(self) -> list: """ From c2e7279edc68e0d40f3b99c114d54d3e2e08f746 Mon Sep 17 00:00:00 2001 From: Gustavo Hidalgo Date: Wed, 17 Jul 2024 22:26:04 -0400 Subject: [PATCH 61/68] Generate permutation --- virtualizarr/zarr.py | 77 ++++++++++++++++++++++++++------------------ 1 file changed, 45 insertions(+), 32 deletions(-) diff --git a/virtualizarr/zarr.py b/virtualizarr/zarr.py index 2b2dd04..a00c7a2 100644 --- a/virtualizarr/zarr.py +++ b/virtualizarr/zarr.py @@ -13,7 +13,14 @@ import numpy as np import ujson # type: ignore import xarray as xr -from pydantic import BaseModel, ConfigDict, field_validator +from pydantic import ( + BaseModel, + ConfigDict, + Field, + field_validator, + model_validator, +) +from typing_extensions import Self from virtualizarr.vendor.zarr.utils import json_dumps @@ -24,6 +31,7 @@ ZAttrs = NewType( "ZAttrs", dict[str, Any] ) # just the .zattrs (for one array or for the whole store/group) +FillValueT = bool | str | float | int | list | None class Codec(BaseModel): @@ -46,7 +54,7 @@ class ZArray(BaseModel): chunks: tuple[int, ...] compressor: str | None = None dtype: np.dtype - fill_value: float | int | None = np.nan # float or int? + fill_value: FillValueT = Field(default=0.0, validate_default=True) filters: list[dict] | None = None order: Literal["C", "F"] shape: tuple[int, ...] @@ -66,6 +74,12 @@ def __post_init__(self) -> None: f"Array shape {self.shape} has ndim={self.shape} but chunk shape {self.chunks} has ndim={len(self.chunks)}" ) + @model_validator(mode="after") + def _check_fill_value(self) -> Self: + if self.fill_value is None: + self.fill_value = _default_fill_value(self.dtype) + return self + @property def codec(self) -> Codec: """For comparison against other arrays.""" @@ -100,18 +114,14 @@ def from_kerchunk_refs(cls, decoded_arr_refs_zarray) -> "ZArray": def dict(self) -> dict[str, Any]: zarray_dict = dict(self) - zarray_dict["dtype"] = encode_dtype(zarray_dict["dtype"]) - - if zarray_dict["fill_value"] is np.nan: - zarray_dict["fill_value"] = None - else: - zarray_dict["fill_value"] = self._default_fill_value() - return zarray_dict def to_kerchunk_json(self) -> str: - return ujson.dumps(self.dict()) + zarray_dict = self.dict() + if zarray_dict["fill_value"] is np.nan: + zarray_dict["fill_value"] = None + return ujson.dumps(zarray_dict) def replace( self, @@ -138,25 +148,6 @@ def replace( zarr_format=zarr_format if zarr_format is not None else self.zarr_format, ) - def _default_fill_value(self) -> Union[bool, int, float, str, list]: - """ - The value and format of the fill_value depend on the data_type of the array. - See here for spec: - https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html#fill-value - """ - # numpy dtypes's hierarchy lets us avoid checking for all the widths - # https://numpy.org/doc/stable/reference/arrays.scalars.html - if self.dtype is np.dtype("bool"): - return False - elif self.dtype is np.dtype("int"): - return 0 - elif self.dtype is np.dtype("float"): - return 0.0 - elif self.dtype is np.dtype("complex"): - return [0.0, 0.0] - else: - return 0.0 - def _v3_codec_pipeline(self) -> list: """ VirtualiZarr internally uses the `filters`, `compressor`, and `order` attributes @@ -196,9 +187,9 @@ def _v3_codec_pipeline(self) -> list: # "C" means row-major order, i.e., the last dimension varies fastest; # "F" means column-major order, i.e., the first dimension varies fastest. if self.order == "C": - order = tuple(enumerate(self.shape)) + order = tuple(range(len(self.shape))) elif self.order == "F": - order = tuple(reversed(enumerate(self.shape))) + order = tuple(reversed(range(len(self.shape)))) transpose = dict(name="transpose", configuration=dict(order=order)) # https://github.com/zarr-developers/zarr-python/pull/1944#issuecomment-2151994097 @@ -358,7 +349,9 @@ def metadata_from_zarr_json(filepath: Path) -> tuple[ZArray, list[str], dict]: chunk_shape = metadata["chunk_grid"]["configuration"]["chunk_shape"] if metadata["fill_value"] is None: - fill_value = np.nan + raise ValueError( + "fill_value must be specified https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html#fill-value" + ) else: fill_value = metadata["fill_value"] all_codecs = [ @@ -380,3 +373,23 @@ def metadata_from_zarr_json(filepath: Path) -> tuple[ZArray, list[str], dict]: ) return zarray, dim_names, attrs + + +def _default_fill_value(dtype: np.dtype) -> Union[bool, int, float, str, list]: + """ + The value and format of the fill_value depend on the data_type of the array. + See here for spec: + https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html#fill-value + """ + # numpy dtypes's hierarchy lets us avoid checking for all the widths + # https://numpy.org/doc/stable/reference/arrays.scalars.html + if dtype is np.dtype("bool"): + return False + elif dtype is np.dtype("int"): + return 0 + elif dtype is np.dtype("float"): + return 0.0 + elif dtype is np.dtype("complex"): + return [0.0, 0.0] + else: + return 0.0 From 145960a6e42c21dd111dc10fa03b4657c92c7480 Mon Sep 17 00:00:00 2001 From: Gustavo Hidalgo Date: Thu, 18 Jul 2024 10:31:14 -0400 Subject: [PATCH 62/68] Pythonic isinstance check --- virtualizarr/tests/test_zarr.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/virtualizarr/tests/test_zarr.py b/virtualizarr/tests/test_zarr.py index 01ac7e5..5967f7d 100644 --- a/virtualizarr/tests/test_zarr.py +++ b/virtualizarr/tests/test_zarr.py @@ -72,9 +72,7 @@ def test_zarr_v3_metadata_conformance(tmpdir, vds_with_manifest_arrays: xr.Datas ) assert isconfigurable(metadata["chunk_grid"]) assert isconfigurable(metadata["chunk_key_encoding"]) - assert any( - isinstance(metadata["fill_value"], t) for t in (bool, int, float, str, list) - ) + assert isinstance(metadata["fill_value"], (bool, int, float, str, list)) assert ( isinstance(metadata["codecs"], list) and len(metadata["codecs"]) > 1 From c051f04523ae3d9a4244c1ece92ffc95a633498b Mon Sep 17 00:00:00 2001 From: Gustavo Hidalgo Date: Thu, 18 Jul 2024 10:31:59 -0400 Subject: [PATCH 63/68] Add return type to isconfigurable Co-authored-by: Tom Augspurger --- virtualizarr/tests/test_zarr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/virtualizarr/tests/test_zarr.py b/virtualizarr/tests/test_zarr.py index 5967f7d..29db840 100644 --- a/virtualizarr/tests/test_zarr.py +++ b/virtualizarr/tests/test_zarr.py @@ -30,7 +30,7 @@ def vds_with_manifest_arrays() -> xr.Dataset: return xr.Dataset({"a": (["x", "y"], arr)}, attrs={"something": 0}) -def isconfigurable(value: dict): +def isconfigurable(value: dict) -> bool: """ Several metadata attributes in ZarrV3 use a dictionary with keys "name" : str and "configuration" : dict """ From 7b093247075b6eb3204a8fe7069ef985b7b8747b Mon Sep 17 00:00:00 2001 From: Tria McNeely Date: Fri, 19 Jul 2024 14:30:59 -0400 Subject: [PATCH 64/68] Changes from pair programming for zarrv3 to kerchunk file reading --- virtualizarr/kerchunk.py | 2 +- virtualizarr/zarr.py | 50 ++++++++++++++++++++++++++-------------- 2 files changed, 34 insertions(+), 18 deletions(-) diff --git a/virtualizarr/kerchunk.py b/virtualizarr/kerchunk.py index 6e82067..122b86b 100644 --- a/virtualizarr/kerchunk.py +++ b/virtualizarr/kerchunk.py @@ -266,7 +266,7 @@ def variable_to_kerchunk_arr_refs(var: xr.Variable, var_name: str) -> KerchunkAr for chunk_key, entry in marr.manifest.dict().items() } - zarray = marr.zarray + zarray = marr.zarray.replace(zarr_format=2) else: try: diff --git a/virtualizarr/zarr.py b/virtualizarr/zarr.py index a00c7a2..7e5674e 100644 --- a/virtualizarr/zarr.py +++ b/virtualizarr/zarr.py @@ -35,7 +35,7 @@ class Codec(BaseModel): - compressor: str | None = None + compressor: dict | None = None filters: list[dict] | None = None def __repr__(self) -> str: @@ -52,7 +52,7 @@ class ZArray(BaseModel): ) chunks: tuple[int, ...] - compressor: str | None = None + compressor: dict | None = None dtype: np.dtype fill_value: FillValueT = Field(default=0.0, validate_default=True) filters: list[dict] | None = None @@ -98,8 +98,8 @@ def from_kerchunk_refs(cls, decoded_arr_refs_zarray) -> "ZArray": compressor = decoded_arr_refs_zarray["compressor"] # deal with an inconsistency in kerchunk's tiff_to_zarr function # TODO should this be moved to the point where we actually call tiff_to_zarr? Or ideally made consistent upstream. - if compressor is not None and "id" in compressor: - compressor = compressor["id"] + # if compressor is not None and "id" in compressor: + # compressor = compressor["id"] return ZArray( chunks=tuple(decoded_arr_refs_zarray["chunks"]), @@ -126,7 +126,7 @@ def to_kerchunk_json(self) -> str: def replace( self, chunks: Optional[tuple[int, ...]] = None, - compressor: Optional[str] = None, + compressor: Optional[dict] = None, dtype: Optional[np.dtype] = None, fill_value: Optional[float] = None, # float or int? filters: Optional[list[dict]] = None, # type: ignore[valid-type] @@ -175,12 +175,10 @@ def _v3_codec_pipeline(self) -> list: # Noting here that zarr v3 has very few codecs specificed in the official spec, # and that there are far more codecs in `numcodecs`. We take a gamble and assume # that the codec names and configuration are simply mapped into zarrv3 "configurables". - compressor_codec = numcodecs.get_codec( - # default to gzip because it is officially specified in the zarr v3 spec - dict(id=self.compressor or "gzip") - ).get_config() - compressor_id = compressor_codec.pop("id") - compressor = dict(name=compressor_id, configuration=compressor_codec) + if self.compressor: + compressor = [_num_codec_config_to_configurable(self.compressor)] + else: + compressor = [] # https://zarr-specs.readthedocs.io/en/latest/v3/codecs/transpose/v1.0.html#transpose-codec-v1 # Either "C" or "F", defining the layout of bytes within each chunk of the array. @@ -200,7 +198,7 @@ def _v3_codec_pipeline(self) -> list: # The order here is significant! # [ArrayArray] -> ArrayBytes -> [BytesBytes] - codec_pipeline = [transpose, bytes] + [compressor] + filters + codec_pipeline = [transpose, bytes] + compressor + filters return codec_pipeline @@ -347,6 +345,8 @@ def metadata_from_zarr_json(filepath: Path) -> tuple[ZArray, list[str], dict]: dim_names = metadata.pop("dimension_names") chunk_shape = metadata["chunk_grid"]["configuration"]["chunk_shape"] + shape = metadata["shape"] + zarr_format = metadata["zarr_format"] if metadata["fill_value"] is None: raise ValueError( @@ -359,21 +359,37 @@ def metadata_from_zarr_json(filepath: Path) -> tuple[ZArray, list[str], dict]: for codec in metadata["codecs"] if codec["name"] not in ("transpose", "bytes") ] - compressor = all_codecs[0] - filters = [dict(id=f.pop("name"), **f) for f in all_codecs[1:]] or None + # TODO: hdf.py treats all codecs as filter, but maybe one needs to be the compressor? + compressor = None #all_codecs[0] if all_codecs else None + filters = [_configurable_to_num_codec_config(_filter) for _filter in all_codecs] or None zarray = ZArray( chunks=metadata["chunk_grid"]["configuration"]["chunk_shape"], - compressor=compressor["name"], + compressor=_configurable_to_num_codec_config(compressor) if compressor else None, dtype=np.dtype(metadata["data_type"]), fill_value=fill_value, filters=filters, order="C", - shape=chunk_shape, - zarr_format=3, + shape=shape, + zarr_format=zarr_format, ) return zarray, dim_names, attrs +def _configurable_to_num_codec_config(configurable: dict) -> dict: + """ + Convert a zarr v3 configurable into a numcodecs codec. + """ + configurable_copy = configurable.copy() + codec_id = configurable_copy.pop("name") + configuration = configurable_copy.pop("configuration") + return numcodecs.get_codec({"id": codec_id, **configuration}).get_config() + +def _num_codec_config_to_configurable(num_codec: dict) -> dict: + """ + Convert a numcodecs codec into a zarr v3 configurable. + """ + num_codec_copy = num_codec.copy() + return {"name": num_codec_copy.pop("id"), "configuration": num_codec_copy} def _default_fill_value(dtype: np.dtype) -> Union[bool, int, float, str, list]: """ From 2c59256424752ba7acab8f32038b19c7cb535b2f Mon Sep 17 00:00:00 2001 From: Tria McNeely Date: Fri, 19 Jul 2024 14:43:42 -0400 Subject: [PATCH 65/68] Revert "Merge remote-tracking branch 'upstream/hdf5_reader' into codecs" This reverts commit 7a65fbdc8eda1dfedaa59e90bd2d8fe652819085, reversing changes made to c051f04523ae3d9a4244c1ece92ffc95a633498b. --- ci/environment.yml | 4 - pyproject.toml | 2 - virtualizarr/readers/hdf.py | 243 --------------- virtualizarr/readers/hdf_filters.py | 136 -------- virtualizarr/tests/test_integration.py | 23 +- virtualizarr/tests/test_readers/__init__.py | 0 virtualizarr/tests/test_readers/conftest.py | 292 ------------------ virtualizarr/tests/test_readers/test_hdf.py | 122 -------- .../tests/test_readers/test_hdf_filters.py | 115 ------- .../test_readers/test_hdf_integration.py | 40 --- virtualizarr/tests/test_xarray.py | 18 +- virtualizarr/xarray.py | 56 ++-- 12 files changed, 28 insertions(+), 1023 deletions(-) delete mode 100644 virtualizarr/readers/hdf.py delete mode 100644 virtualizarr/readers/hdf_filters.py delete mode 100644 virtualizarr/tests/test_readers/__init__.py delete mode 100644 virtualizarr/tests/test_readers/conftest.py delete mode 100644 virtualizarr/tests/test_readers/test_hdf.py delete mode 100644 virtualizarr/tests/test_readers/test_hdf_filters.py delete mode 100644 virtualizarr/tests/test_readers/test_hdf_integration.py diff --git a/ci/environment.yml b/ci/environment.yml index 5ba1f8d..a41a99d 100644 --- a/ci/environment.yml +++ b/ci/environment.yml @@ -14,7 +14,6 @@ dependencies: - ujson - packaging - universal_pathlib - - hdf5plugin # Testing - codecov - pre-commit @@ -27,10 +26,7 @@ dependencies: - fsspec - s3fs - fastparquet - - imagecodecs>=2024.6.1 # for opening tiff files - tifffile # for opening FITS files - astropy - - pip: - - imagecodecs-numcodecs diff --git a/pyproject.toml b/pyproject.toml index 7baa87b..9fe0468 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,7 +29,6 @@ dependencies = [ "ujson", "packaging", "universal-pathlib", - "hdf5plugin", ] [project.optional-dependencies] @@ -46,7 +45,6 @@ test = [ "fsspec", "s3fs", "fastparquet", - "imagecodecs-numcodecs", ] diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py deleted file mode 100644 index 6197067..0000000 --- a/virtualizarr/readers/hdf.py +++ /dev/null @@ -1,243 +0,0 @@ -import math -from typing import List, Mapping, Optional, Union - -import h5py -import numpy as np -import xarray as xr - -from virtualizarr.manifests import ChunkEntry, ChunkManifest, ManifestArray -from virtualizarr.readers.hdf_filters import cfcodec_from_dataset, codecs_from_dataset -from virtualizarr.types import ChunkKey -from virtualizarr.utils import _fsspec_openfile_from_filepath -from virtualizarr.zarr import ZArray - - -def _dataset_chunk_manifest( - path: str, dataset: h5py.Dataset -) -> Optional[ChunkManifest]: - """ - Generate ChunkManifest for HDF5 dataset. - - Parameters - ---------- - path: str - The path the HDF5 container file - dset : h5py.Dataset - HDF5 dataset for which to create a ChunkManifest - - Returns - ------- - ChunkManifest - A Virtualizarr ChunkManifest - """ - dsid = dataset.id - - if dataset.chunks is None: - if dsid.get_offset() is None: - return None - else: - key_list = [0] * (len(dataset.shape) or 1) - key = ".".join(map(str, key_list)) - chunk_entry = ChunkEntry( - path=path, offset=dsid.get_offset(), length=dsid.get_storage_size() - ) - chunk_key = ChunkKey(key) - chunk_entries = {chunk_key: chunk_entry.dict()} - chunk_manifest = ChunkManifest(entries=chunk_entries) - return chunk_manifest - else: - num_chunks = dsid.get_num_chunks() - if num_chunks == 0: - raise ValueError("The dataset is chunked but contains no chunks") - - shape = tuple(math.ceil(a / b) for a, b in zip(dataset.shape, dataset.chunks)) - paths = np.empty(shape, dtype=np.dtypes.StringDType) # type: ignore - offsets = np.empty(shape, dtype=np.int32) - lengths = np.empty(shape, dtype=np.int32) - - def get_key(blob): - return tuple([a // b for a, b in zip(blob.chunk_offset, dataset.chunks)]) - - def add_chunk_info(blob): - key = get_key(blob) - paths[key] = path - offsets[key] = blob.byte_offset - lengths[key] = blob.size - - has_chunk_iter = callable(getattr(dsid, "chunk_iter", None)) - if has_chunk_iter: - dsid.chunk_iter(add_chunk_info) - else: - for index in range(num_chunks): - add_chunk_info(dsid.get_chunk_info(index)) - - chunk_manifest = ChunkManifest.from_arrays( - paths=paths, offsets=offsets, lengths=lengths - ) - return chunk_manifest - - -def _dataset_dims(dataset: h5py.Dataset) -> Union[List[str], List[None]]: - """ - Get a list of dimension scale names attached to input HDF5 dataset. - - This is required by the xarray package to work with Zarr arrays. Only - one dimension scale per dataset dimension is allowed. If dataset is - dimension scale, it will be considered as the dimension to itself. - - Parameters - ---------- - dataset : h5py.Dataset - HDF5 dataset. - - Returns - ------- - list - List with HDF5 path names of dimension scales attached to input - dataset. - """ - dims = list() - rank = len(dataset.shape) - if rank: - for n in range(rank): - num_scales = len(dataset.dims[n]) - if num_scales == 1: - dims.append(dataset.dims[n][0].name[1:]) - elif h5py.h5ds.is_scale(dataset.id): - dims.append(dataset.name[1:]) - elif num_scales > 1: - raise ValueError( - f"{dataset.name}: {len(dataset.dims[n])} " - f"dimension scales attached to dimension #{n}" - ) - elif num_scales == 0: - # Some HDF5 files do not have dimension scales. - # If this is the case, `num_scales` will be 0. - # In this case, we mimic netCDF4 and assign phony dimension names. - # See https://github.com/fsspec/kerchunk/issues/41 - dims.append(f"phony_dim_{n}") - return dims - - -def _extract_attrs(h5obj: Union[h5py.Dataset, h5py.Group]): - """ - Extract attributes from an HDF5 group or dataset. - - Parameters - ---------- - h5obj : h5py.Group or h5py.Dataset - An HDF5 group or dataset. - """ - _HIDDEN_ATTRS = { - "REFERENCE_LIST", - "CLASS", - "DIMENSION_LIST", - "NAME", - "_Netcdf4Dimid", - "_Netcdf4Coordinates", - "_nc3_strict", - "_NCProperties", - } - attrs = {} - for n, v in h5obj.attrs.items(): - if n in _HIDDEN_ATTRS: - continue - # Fix some attribute values to avoid JSON encoding exceptions... - if isinstance(v, bytes): - v = v.decode("utf-8") or " " - elif isinstance(v, (np.ndarray, np.number, np.bool_)): - if v.dtype.kind == "S": - v = v.astype(str) - if n == "_FillValue": - continue - elif v.size == 1: - v = v.flatten()[0] - if isinstance(v, (np.ndarray, np.number, np.bool_)): - v = v.tolist() - else: - v = v.tolist() - elif isinstance(v, h5py._hl.base.Empty): - v = "" - if v == "DIMENSION_SCALE": - continue - - attrs[n] = v - return attrs - - -def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> Optional[xr.Variable]: - # This chunk determination logic mirrors zarr-python's create - # https://github.com/zarr-developers/zarr-python/blob/main/zarr/creation.py#L62-L66 - - manifest = _dataset_chunk_manifest(path, dataset) - if manifest: - chunks = dataset.chunks if dataset.chunks else dataset.shape - codecs = codecs_from_dataset(dataset) - cfcodec = cfcodec_from_dataset(dataset) - attrs = _extract_attrs(dataset) - if cfcodec: - codecs.insert(0, cfcodec["codec"]) - dtype = cfcodec["target_dtype"] - attrs.pop("scale_factor", None) - attrs.pop("add_offset", None) - fill_value = cfcodec["codec"].decode(dataset.fillvalue) - else: - dtype = dataset.dtype - fill_value = dataset.fillvalue - filters = [codec.get_config() for codec in codecs] - zarray = ZArray( - chunks=chunks, - compressor=None, - dtype=dtype, - fill_value=fill_value, - filters=filters, - order="C", - shape=dataset.shape, - zarr_format=2, - ) - marray = ManifestArray(zarray=zarray, chunkmanifest=manifest) - dims = _dataset_dims(dataset) - variable = xr.Variable(data=marray, dims=dims, attrs=attrs) - else: - variable = None - return variable - - -def virtual_vars_from_hdf( - path: str, - drop_variables: Optional[List[str]] = None, - reader_options: Optional[dict] = { - "storage_options": {"key": "", "secret": "", "anon": True} - }, -) -> Mapping[str, xr.Variable]: - if drop_variables is None: - drop_variables = [] - open_file = _fsspec_openfile_from_filepath( - filepath=path, reader_options=reader_options - ) - f = h5py.File(open_file, mode="r") - variables = {} - for key in f.keys(): - if key not in drop_variables: - if isinstance(f[key], h5py.Dataset): - variable = _dataset_to_variable(path, f[key]) - if variable is not None: - variables[key] = variable - else: - raise NotImplementedError("Nested groups are not yet supported") - - return variables - - -def attrs_from_root_group( - path: str, - reader_options: Optional[dict] = { - "storage_options": {"key": "", "secret": "", "anon": True} - }, -): - open_file = _fsspec_openfile_from_filepath( - filepath=path, reader_options=reader_options - ) - f = h5py.File(open_file, mode="r") - attrs = _extract_attrs(f) - return attrs diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py deleted file mode 100644 index ae232fe..0000000 --- a/virtualizarr/readers/hdf_filters.py +++ /dev/null @@ -1,136 +0,0 @@ -from typing import List, Tuple, TypedDict, Union - -import h5py -import hdf5plugin -import numcodecs.registry as registry -import numpy as np -from numcodecs.abc import Codec -from numcodecs.fixedscaleoffset import FixedScaleOffset -from pydantic import BaseModel, field_validator -from xarray.coding.variables import _choose_float_dtype - -_non_standard_filters = { - "gzip": "zlib", - "lzf": "imagecodecs_lzf", -} - -_hdf5plugin_imagecodecs = {"lz4": "imagecodecs_lz4h5", "bzip2": "imagecodecs_bz2"} - - -class BloscProperties(BaseModel): - blocksize: int - clevel: int - shuffle: int - cname: str - - @field_validator("cname", mode="before") - def get_cname_from_code(cls, v): - blosc_compressor_codes = { - value: key - for key, value in hdf5plugin._filters.Blosc._Blosc__COMPRESSIONS.items() - } - return blosc_compressor_codes[v] - - -class ZstdProperties(BaseModel): - level: int - - -class ShuffleProperties(BaseModel): - elementsize: int - - -class ZlibProperties(BaseModel): - level: int - - -class CFCodec(TypedDict): - target_dtype: np.dtype - codec: Codec - - -def _filter_to_codec( - filter_id: str, filter_properties: Union[int, None, Tuple] = None -) -> Codec: - id_int = None - id_str = None - try: - id_int = int(filter_id) - except ValueError: - id_str = filter_id - conf = {} - if id_str: - if id_str in _non_standard_filters.keys(): - id = _non_standard_filters[id_str] - else: - id = id_str - if id == "zlib": - zlib_props = ZlibProperties(level=filter_properties) - conf = zlib_props.model_dump() # type: ignore[assignment] - if id == "shuffle" and isinstance(filter_properties, tuple): - shuffle_props = ShuffleProperties(elementsize=filter_properties[0]) - conf = shuffle_props.model_dump() # type: ignore[assignment] - conf["id"] = id # type: ignore[assignment] - if id_int: - filter = hdf5plugin.get_filters(id_int)[0] - id = filter.filter_name - if id in _hdf5plugin_imagecodecs.keys(): - id = _hdf5plugin_imagecodecs[id] - if id == "blosc" and isinstance(filter_properties, tuple): - blosc_props = BloscProperties( - **{ - k: v - for k, v in zip( - BloscProperties.model_fields.keys(), filter_properties[-4:] - ) - } - ) - conf = blosc_props.model_dump() # type: ignore[assignment] - if id == "zstd" and isinstance(filter_properties, tuple): - zstd_props = ZstdProperties(level=filter_properties[0]) - conf = zstd_props.model_dump() # type: ignore[assignment] - conf["id"] = id - codec = registry.get_codec(conf) - return codec - - -def cfcodec_from_dataset(dataset: h5py.Dataset) -> Codec | None: - attributes = {attr: dataset.attrs[attr] for attr in dataset.attrs} - mapping = {} - if "scale_factor" in attributes: - try: - scale_factor = attributes["scale_factor"][0] - except IndexError: - scale_factor = attributes["scale_factor"] - mapping["scale_factor"] = float(1 / scale_factor) - else: - mapping["scale_factor"] = 1 - if "add_offset" in attributes: - try: - offset = attributes["add_offset"][0] - except IndexError: - offset = attributes["add_offset"] - mapping["add_offset"] = float(offset) - else: - mapping["add_offset"] = 0 - if mapping["scale_factor"] != 1 or mapping["add_offset"] != 0: - float_dtype = _choose_float_dtype(dtype=dataset.dtype, mapping=mapping) - target_dtype = np.dtype(float_dtype) - codec = FixedScaleOffset( - offset=mapping["add_offset"], - scale=mapping["scale_factor"], - dtype=target_dtype, - astype=dataset.dtype, - ) - cfcodec = CFCodec(target_dtype=target_dtype, codec=codec) - return cfcodec - else: - return None - - -def codecs_from_dataset(dataset: h5py.Dataset) -> List[Codec]: - codecs = [] - for filter_id, filter_properties in dataset._filters.items(): - codec = _filter_to_codec(filter_id, filter_properties) - codecs.append(codec) - return codecs diff --git a/virtualizarr/tests/test_integration.py b/virtualizarr/tests/test_integration.py index 65b9c71..239316a 100644 --- a/virtualizarr/tests/test_integration.py +++ b/virtualizarr/tests/test_integration.py @@ -69,12 +69,8 @@ def test_kerchunk_roundtrip_no_concat(self, tmpdir, format): f"{tmpdir}/refs.{format}", engine="kerchunk", decode_times=False ) - # assert all_close to original dataset - xrt.assert_allclose(roundtrip, ds) - - # assert coordinate attributes are maintained - for coord in ds.coords: - assert ds.coords[coord].attrs == roundtrip.coords[coord].attrs + # assert identical to original dataset + xrt.assert_identical(roundtrip, ds) @pytest.mark.parametrize("decode_times,time_vars", [(False, []), (True, ["time"])]) def test_kerchunk_roundtrip_concat(self, tmpdir, format, decode_times, time_vars): @@ -128,14 +124,9 @@ def test_kerchunk_roundtrip_concat(self, tmpdir, format, decode_times, time_vars roundtrip = xr.open_dataset( f"{tmpdir}/refs.{format}", engine="kerchunk", decode_times=decode_times ) - if decode_times is False: - # assert all_close to original dataset - xrt.assert_allclose(roundtrip, ds) - - # assert coordinate attributes are maintained - for coord in ds.coords: - assert ds.coords[coord].attrs == roundtrip.coords[coord].attrs + # assert identical to original dataset + xrt.assert_identical(roundtrip, ds) else: # they are very very close! But assert_allclose doesn't seem to work on datetimes assert (roundtrip.time - ds.time).sum() == 0 @@ -173,11 +164,7 @@ def test_non_dimension_coordinates(self, tmpdir, format): ) # assert equal to original dataset - xrt.assert_allclose(roundtrip, ds) - - # assert coordinate attributes are maintained - for coord in ds.coords: - assert ds.coords[coord].attrs == roundtrip.coords[coord].attrs + xrt.assert_identical(roundtrip, ds) def test_open_scalar_variable(tmpdir): diff --git a/virtualizarr/tests/test_readers/__init__.py b/virtualizarr/tests/test_readers/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py deleted file mode 100644 index ec4132b..0000000 --- a/virtualizarr/tests/test_readers/conftest.py +++ /dev/null @@ -1,292 +0,0 @@ -import h5py -import hdf5plugin -import numpy as np -import pytest -import xarray as xr -from packaging.version import Version -from xarray.tests.test_dataset import create_test_data -from xarray.util.print_versions import netcdf_and_hdf5_versions - - -@pytest.fixture -def empty_chunks_hdf5_file(tmpdir): - ds = xr.Dataset({"data": []}) - filepath = f"{tmpdir}/empty_chunks.nc" - ds.to_netcdf(filepath, engine="h5netcdf") - return filepath - - -@pytest.fixture -def empty_dataset_hdf5_file(tmpdir): - filepath = f"{tmpdir}/empty_dataset.nc" - f = h5py.File(filepath, "w") - f.create_dataset("data", shape=(0,), dtype="f") - return filepath - - -@pytest.fixture -def no_chunks_hdf5_file(tmpdir): - filepath = f"{tmpdir}/no_chunks.nc" - f = h5py.File(filepath, "w") - data = np.random.random((10, 10)) - f.create_dataset(name="data", data=data, chunks=None) - return filepath - - -@pytest.fixture -def chunked_hdf5_file(tmpdir): - filepath = f"{tmpdir}/chunks.nc" - f = h5py.File(filepath, "w") - data = np.random.random((100, 100)) - f.create_dataset(name="data", data=data, chunks=(50, 50)) - return filepath - - -@pytest.fixture -def single_dimension_scale_hdf5_file(tmpdir): - filepath = f"{tmpdir}/single_dimension_scale.nc" - f = h5py.File(filepath, "w") - data = [1, 2] - x = [0, 1] - f.create_dataset(name="data", data=data) - f.create_dataset(name="x", data=x) - f["x"].make_scale() - f["data"].dims[0].attach_scale(f["x"]) - return filepath - - -@pytest.fixture -def is_scale_hdf5_file(tmpdir): - filepath = f"{tmpdir}/is_scale.nc" - f = h5py.File(filepath, "w") - data = [1, 2] - f.create_dataset(name="data", data=data) - f["data"].make_scale() - return filepath - - -@pytest.fixture -def multiple_dimension_scales_hdf5_file(tmpdir): - filepath = f"{tmpdir}/multiple_dimension_scales.nc" - f = h5py.File(filepath, "w") - data = [1, 2] - f.create_dataset(name="data", data=data) - f.create_dataset(name="x", data=[0, 1]) - f.create_dataset(name="y", data=[0, 1]) - f["x"].make_scale() - f["y"].make_scale() - f["data"].dims[0].attach_scale(f["x"]) - f["data"].dims[0].attach_scale(f["y"]) - return filepath - - -@pytest.fixture -def chunked_dimensions_netcdf4_file(tmpdir): - filepath = f"{tmpdir}/chunks_dimension.nc" - f = h5py.File(filepath, "w") - data = np.random.random((100, 100)) - x = np.random.random((100)) - y = np.random.random((100)) - f.create_dataset(name="data", data=data, chunks=(50, 50)) - f.create_dataset(name="x", data=x) - f.create_dataset(name="y", data=y) - f["data"].dims[0].attach_scale(f["x"]) - f["data"].dims[1].attach_scale(f["y"]) - return filepath - - -@pytest.fixture -def string_attributes_hdf5_file(tmpdir): - filepath = f"{tmpdir}/attributes.nc" - f = h5py.File(filepath, "w") - data = np.random.random((10, 10)) - f.create_dataset(name="data", data=data, chunks=None) - f["data"].attrs["attribute_name"] = "attribute_name" - f["data"].attrs["attribute_name2"] = "attribute_name2" - return filepath - - -@pytest.fixture -def root_attributes_hdf5_file(tmpdir): - filepath = f"{tmpdir}/root_attributes.nc" - f = h5py.File(filepath, "w") - f.attrs["attribute_name"] = "attribute_name" - return filepath - - -@pytest.fixture -def group_hdf5_file(tmpdir): - filepath = f"{tmpdir}/group.nc" - f = h5py.File(filepath, "w") - f.create_group("group") - return filepath - - -@pytest.fixture -def multiple_datasets_hdf5_file(tmpdir): - filepath = f"{tmpdir}/multiple_datasets.nc" - f = h5py.File(filepath, "w") - data = np.random.random((10, 10)) - f.create_dataset(name="data", data=data, chunks=None) - f.create_dataset(name="data2", data=data, chunks=None) - return filepath - - -@pytest.fixture -def np_uncompressed(): - return np.arange(100) - - -@pytest.fixture(params=["gzip", "blosc_lz4", "lz4", "bzip2", "zstd", "shuffle"]) -def filter_encoded_hdf5_file(tmpdir, np_uncompressed, request): - filepath = f"{tmpdir}/{request.param}.nc" - f = h5py.File(filepath, "w") - if request.param == "gzip": - f.create_dataset( - name="data", data=np_uncompressed, compression="gzip", compression_opts=1 - ) - if request.param == "blosc_lz4": - f.create_dataset( - name="data", - data=np_uncompressed, - **hdf5plugin.Blosc(cname="lz4", clevel=9, shuffle=hdf5plugin.Blosc.SHUFFLE), - ) - if request.param == "lz4": - f.create_dataset(name="data", data=np_uncompressed, **hdf5plugin.LZ4(nbytes=0)) - if request.param == "bzip2": - f.create_dataset(name="data", data=np_uncompressed, **hdf5plugin.BZip2()) - if request.param == "zstd": - f.create_dataset(name="data", data=np_uncompressed, **hdf5plugin.Zstd(clevel=2)) - if request.param == "shuffle": - f.create_dataset(name="data", data=np_uncompressed, shuffle=True) - - return filepath - - -@pytest.fixture(params=["gzip"]) -def filter_encoded_roundtrip_hdf5_file(tmpdir, request): - ds = xr.tutorial.open_dataset("air_temperature") - encoding = {} - if request.param == "gzip": - encoding_config = {"zlib": True, "complevel": 1} - - for var_name in ds.variables: - encoding[var_name] = encoding_config - - filepath = f"{tmpdir}/{request.param}_xarray.nc" - ds.to_netcdf(filepath, engine="h5netcdf", encoding=encoding) - return filepath - - -@pytest.fixture() -def skip_test_for_libhdf5_version(): - versions = netcdf_and_hdf5_versions() - libhdf5_version = Version(versions[0][1]) - return libhdf5_version < Version("1.14") - - -@pytest.fixture(params=["blosc_zlib"]) -def filter_encoded_roundtrip_netcdf4_file( - tmpdir, request, skip_test_for_libhdf5_version -): - if skip_test_for_libhdf5_version: - pytest.skip("Requires libhdf5 >= 1.14") - ds = create_test_data(dim_sizes=(20, 80, 10)) - if "blosc" in request.param: - encoding_config = { - "compression": request.param, - "chunksizes": (20, 40), - "original_shape": ds.var2.shape, - "blosc_shuffle": 1, - "fletcher32": False, - } - # Check on how handle scalar dim. - ds = ds.drop_dims("dim3") - ds["var2"].encoding.update(encoding_config) - filepath = f"{tmpdir}/{request.param}_xarray.nc" - ds.to_netcdf(filepath, engine="netcdf4") - return {"filepath": filepath, "compressor": request.param} - - -@pytest.fixture -def np_uncompressed_int16(): - return np.arange(100, dtype=np.int16) - - -@pytest.fixture -def offset(): - return np.float32(5.0) - - -@pytest.fixture -def add_offset_hdf5_file(tmpdir, np_uncompressed_int16, offset): - filepath = f"{tmpdir}/offset.nc" - f = h5py.File(filepath, "w") - data = np_uncompressed_int16 - offset - f.create_dataset(name="data", data=data, chunks=True) - f["data"].attrs.create(name="add_offset", data=offset) - return filepath - - -@pytest.fixture -def scale_factor(): - return 0.01 - - -@pytest.fixture -def scale_add_offset_hdf5_file(tmpdir, np_uncompressed_int16, offset, scale_factor): - filepath = f"{tmpdir}/scale_offset.nc" - f = h5py.File(filepath, "w") - data = (np_uncompressed_int16 - offset) / scale_factor - f.create_dataset(name="data", data=data, chunks=True) - f["data"].attrs.create(name="add_offset", data=offset) - f["data"].attrs.create(name="scale_factor", data=np.array([scale_factor])) - return filepath - - -@pytest.fixture() -def chunked_roundtrip_hdf5_file(tmpdir): - ds = create_test_data(dim_sizes=(20, 80, 10)) - ds = ds.drop_dims("dim3") - filepath = f"{tmpdir}/chunked_xarray.nc" - ds.to_netcdf( - filepath, engine="netcdf4", encoding={"var2": {"chunksizes": (10, 10)}} - ) - return filepath - - -@pytest.fixture(params=["gzip", "zlib"]) -def filter_and_cf_roundtrip_hdf5_file(tmpdir, request): - x = np.arange(100) - y = np.arange(100) - fill_value = np.int16(-9999) - temperature = 0.1 * x[:, None] + 0.1 * y[None, :] - temperature[0][0] = fill_value - ds = xr.Dataset( - {"temperature": (["x", "y"], temperature)}, - coords={"x": np.arange(100), "y": np.arange(100)}, - ) - encoding = { - "temperature": { - "dtype": "int16", - "scale_factor": 0.1, - "add_offset": 273.15, - "_FillValue": fill_value, - }, - "x": {"_FillValue": fill_value}, - "y": {"_FillValue": fill_value}, - } - if request.param == "gzip": - encoding["temperature"]["compression"] = "gzip" - encoding["temperature"]["compression_opts"] = 7 - - if request.param == "zlib": - encoding["temperature"]["zlib"] = True - encoding["temperature"]["complevel"] = 9 - - from random import randint - - filepath = f"{tmpdir}/{request.param}_{randint(0,100)}_cf_roundtrip.nc" - ds.to_netcdf(filepath, engine="h5netcdf", encoding=encoding) - - return filepath diff --git a/virtualizarr/tests/test_readers/test_hdf.py b/virtualizarr/tests/test_readers/test_hdf.py deleted file mode 100644 index 1fb0f6e..0000000 --- a/virtualizarr/tests/test_readers/test_hdf.py +++ /dev/null @@ -1,122 +0,0 @@ -import h5py -import pytest - -from virtualizarr.readers.hdf import ( - _dataset_chunk_manifest, - _dataset_dims, - _dataset_to_variable, - _extract_attrs, - virtual_vars_from_hdf, -) - - -class TestDatasetChunkManifest: - def test_empty_chunks(self, empty_chunks_hdf5_file): - f = h5py.File(empty_chunks_hdf5_file) - ds = f["data"] - with pytest.raises(ValueError, match="chunked but contains no chunks"): - _dataset_chunk_manifest(path=empty_chunks_hdf5_file, dataset=ds) - - @pytest.mark.skip("Need to differentiate non coordinate dimensions from empty") - def test_empty_dataset(self, empty_dataset_hdf5_file): - f = h5py.File(empty_dataset_hdf5_file) - ds = f["data"] - with pytest.raises(ValueError, match="no space allocated in the file"): - _dataset_chunk_manifest(path=empty_dataset_hdf5_file, dataset=ds) - - def test_no_chunking(self, no_chunks_hdf5_file): - f = h5py.File(no_chunks_hdf5_file) - ds = f["data"] - manifest = _dataset_chunk_manifest(path=no_chunks_hdf5_file, dataset=ds) - assert manifest.shape_chunk_grid == (1, 1) - - def test_chunked(self, chunked_hdf5_file): - f = h5py.File(chunked_hdf5_file) - ds = f["data"] - manifest = _dataset_chunk_manifest(path=chunked_hdf5_file, dataset=ds) - assert manifest.shape_chunk_grid == (2, 2) - - def test_chunked_roundtrip(self, chunked_roundtrip_hdf5_file): - f = h5py.File(chunked_roundtrip_hdf5_file) - ds = f["var2"] - manifest = _dataset_chunk_manifest(path=chunked_roundtrip_hdf5_file, dataset=ds) - assert manifest.shape_chunk_grid == (2, 8) - - -class TestDatasetDims: - def test_single_dimension_scale(self, single_dimension_scale_hdf5_file): - f = h5py.File(single_dimension_scale_hdf5_file) - ds = f["data"] - dims = _dataset_dims(ds) - assert dims[0] == "x" - - def test_is_dimension_scale(self, is_scale_hdf5_file): - f = h5py.File(is_scale_hdf5_file) - ds = f["data"] - dims = _dataset_dims(ds) - assert dims[0] == "data" - - def test_multiple_dimension_scales(self, multiple_dimension_scales_hdf5_file): - f = h5py.File(multiple_dimension_scales_hdf5_file) - ds = f["data"] - with pytest.raises(ValueError, match="dimension scales attached"): - _dataset_dims(ds) - - def test_no_dimension_scales(self, no_chunks_hdf5_file): - f = h5py.File(no_chunks_hdf5_file) - ds = f["data"] - dims = _dataset_dims(ds) - assert dims == ["phony_dim_0", "phony_dim_1"] - - -class TestDatasetToVariable: - def test_chunked_dataset(self, chunked_dimensions_netcdf4_file): - f = h5py.File(chunked_dimensions_netcdf4_file) - ds = f["data"] - var = _dataset_to_variable(chunked_dimensions_netcdf4_file, ds) - assert var.chunks == (50, 50) - - def test_not_chunked_dataset(self, single_dimension_scale_hdf5_file): - f = h5py.File(single_dimension_scale_hdf5_file) - ds = f["data"] - var = _dataset_to_variable(single_dimension_scale_hdf5_file, ds) - assert var.chunks == (2,) - - def test_dataset_attributes(self, string_attributes_hdf5_file): - f = h5py.File(string_attributes_hdf5_file) - ds = f["data"] - var = _dataset_to_variable(string_attributes_hdf5_file, ds) - assert var.attrs["attribute_name"] == "attribute_name" - - -class TestExtractAttributes: - def test_string_attribute(self, string_attributes_hdf5_file): - f = h5py.File(string_attributes_hdf5_file) - ds = f["data"] - attrs = _extract_attrs(ds) - assert attrs["attribute_name"] == "attribute_name" - - def test_root_attribute(self, root_attributes_hdf5_file): - f = h5py.File(root_attributes_hdf5_file) - attrs = _extract_attrs(f) - assert attrs["attribute_name"] == "attribute_name" - - def test_multiple_attributes(self, string_attributes_hdf5_file): - f = h5py.File(string_attributes_hdf5_file) - ds = f["data"] - attrs = _extract_attrs(ds) - assert len(attrs.keys()) == 2 - - -class TestVirtualVarsFromHDF: - def test_variable_with_dimensions(self, chunked_dimensions_netcdf4_file): - variables = virtual_vars_from_hdf(chunked_dimensions_netcdf4_file) - assert len(variables) == 3 - - def test_groups_not_implemented(self, group_hdf5_file): - with pytest.raises(NotImplementedError): - virtual_vars_from_hdf(group_hdf5_file) - - def test_drop_variables(self, multiple_datasets_hdf5_file): - variables = virtual_vars_from_hdf(multiple_datasets_hdf5_file, ["data2"]) - assert "data2" not in variables.keys() diff --git a/virtualizarr/tests/test_readers/test_hdf_filters.py b/virtualizarr/tests/test_readers/test_hdf_filters.py deleted file mode 100644 index efaad78..0000000 --- a/virtualizarr/tests/test_readers/test_hdf_filters.py +++ /dev/null @@ -1,115 +0,0 @@ -import h5py -import imagecodecs -import numcodecs -import numpy as np - -from virtualizarr.readers.hdf_filters import ( - _filter_to_codec, - cfcodec_from_dataset, - codecs_from_dataset, -) - - -class TestFilterToCodec: - def test_gzip_uses_zlib_numcodec(self): - codec = _filter_to_codec("gzip", 1) - assert isinstance(codec, numcodecs.zlib.Zlib) - - def test_lzf(self): - codec = _filter_to_codec("lzf") - assert isinstance(codec, imagecodecs.numcodecs.Lzf) - - def test_blosc(self): - codec = _filter_to_codec("32001", (2, 2, 8, 800, 9, 2, 1)) - assert isinstance(codec, numcodecs.blosc.Blosc) - expected_config = { - "id": "blosc", - "blocksize": 800, - "clevel": 9, - "shuffle": 2, - "cname": "lz4", - } - assert codec.get_config() == expected_config - - def test_zstd(self): - codec = _filter_to_codec("32015", (5,)) - assert isinstance(codec, numcodecs.zstd.Zstd) - expected_config = {"id": "zstd", "level": 5} - assert codec.get_config() == expected_config - - def test_shuffle(self): - codec = _filter_to_codec("shuffle", (7,)) - assert isinstance(codec, numcodecs.shuffle.Shuffle) - expected_config = {"id": "shuffle", "elementsize": 7} - assert codec.get_config() == expected_config - - -class TestCodecsFromDataSet: - def test_numcodec_decoding(self, np_uncompressed, filter_encoded_hdf5_file): - f = h5py.File(filter_encoded_hdf5_file) - ds = f["data"] - chunk_info = ds.id.get_chunk_info(0) - codecs = codecs_from_dataset(ds) - with open(filter_encoded_hdf5_file, "rb") as file: - file.seek(chunk_info.byte_offset) - bytes_read = file.read(chunk_info.size) - decoded = codecs[0].decode(bytes_read) - if isinstance(decoded, np.ndarray): - assert decoded.tobytes() == np_uncompressed.tobytes() - else: - assert decoded == np_uncompressed.tobytes() - - -class TestCFCodecFromDataset: - def test_no_cf_convention(self, filter_encoded_hdf5_file): - f = h5py.File(filter_encoded_hdf5_file) - ds = f["data"] - cf_codec = cfcodec_from_dataset(ds) - assert cf_codec is None - - def test_cf_scale_factor(self, netcdf4_file): - f = h5py.File(netcdf4_file) - ds = f["air"] - cf_codec = cfcodec_from_dataset(ds) - assert cf_codec["target_dtype"] == np.dtype(np.float64) - assert cf_codec["codec"].scale == 100.0 - assert cf_codec["codec"].offset == 0 - assert cf_codec["codec"].dtype == " 0: # TODO we are reading a bunch of stuff we know we won't need here, e.g. all of the data variables... @@ -211,6 +190,7 @@ def open_virtual_dataset( vars = {**virtual_vars, **loadable_vars} data_vars, coords = separate_coords(vars, indexes, coord_names) + vds = xr.Dataset( data_vars, coords=coords, From 50c3dcd43d7569eaf57ebad0b85701293eec9101 Mon Sep 17 00:00:00 2001 From: Gustavo Hidalgo Date: Fri, 19 Jul 2024 16:26:45 -0400 Subject: [PATCH 66/68] Fix unit tests --- docs/releases.rst | 2 +- virtualizarr/tests/__init__.py | 4 ++-- virtualizarr/tests/test_manifests/test_array.py | 12 ++++++------ virtualizarr/tests/test_xarray.py | 10 +++++----- virtualizarr/tests/test_zarr.py | 2 +- virtualizarr/zarr.py | 16 ++++++++++------ 6 files changed, 25 insertions(+), 21 deletions(-) diff --git a/docs/releases.rst b/docs/releases.rst index 1451191..3eeed7e 100644 --- a/docs/releases.rst +++ b/docs/releases.rst @@ -12,7 +12,7 @@ New Features Breaking changes ~~~~~~~~~~~~~~~~ -- Serialize valid ZarrV3 metadata (for :pull:`193`). +- Serialize valid ZarrV3 metadata and require full compressor numcodec config (for :pull:`193`) By `Gustavo Hidalgo `_. Deprecations diff --git a/virtualizarr/tests/__init__.py b/virtualizarr/tests/__init__.py index 3856a6b..7df13d1 100644 --- a/virtualizarr/tests/__init__.py +++ b/virtualizarr/tests/__init__.py @@ -48,9 +48,9 @@ def create_manifestarray( zarray = ZArray( chunks=chunks, - compressor="zlib", + compressor={"id": "blosc", "clevel": 5, "cname": "lz4", "shuffle": 1}, dtype=np.dtype("float32"), - fill_value=0.0, # TODO change this to NaN? + fill_value=0.0, filters=None, order="C", shape=shape, diff --git a/virtualizarr/tests/test_manifests/test_array.py b/virtualizarr/tests/test_manifests/test_array.py index 459e60b..6d5ede7 100644 --- a/virtualizarr/tests/test_manifests/test_array.py +++ b/virtualizarr/tests/test_manifests/test_array.py @@ -19,7 +19,7 @@ def test_create_manifestarray(self): shape = (5, 2, 20) zarray = ZArray( chunks=chunks, - compressor="zlib", + compressor={"id": "zlib", "level": 1}, dtype=np.dtype("int32"), fill_value=0.0, filters=None, @@ -74,7 +74,7 @@ def test_equals(self): shape = (5, 2, 20) zarray = ZArray( chunks=chunks, - compressor="zlib", + compressor={"id": "zlib", "level": 1}, dtype=np.dtype("int32"), fill_value=0.0, filters=None, @@ -95,7 +95,7 @@ def test_not_equal_chunk_entries(self): # both manifest arrays in this example have the same zarray properties zarray = ZArray( chunks=(5, 1, 10), - compressor="zlib", + compressor={"id": "zlib", "level": 1}, dtype=np.dtype("int32"), fill_value=0.0, filters=None, @@ -209,7 +209,7 @@ def test_concat(self): # both manifest arrays in this example have the same zarray properties zarray = ZArray( chunks=(5, 1, 10), - compressor="zlib", + compressor={"id": "zlib", "level": 1}, dtype=np.dtype("int32"), fill_value=0.0, filters=None, @@ -254,7 +254,7 @@ def test_stack(self): # both manifest arrays in this example have the same zarray properties zarray = ZArray( chunks=(5, 10), - compressor="zlib", + compressor={"id": "zlib", "level": 1}, dtype=np.dtype("int32"), fill_value=0.0, filters=None, @@ -299,7 +299,7 @@ def test_refuse_combine(): zarray_common = { "chunks": (5, 1, 10), - "compressor": "zlib", + "compressor": {"id": "zlib", "level": 1}, "dtype": np.dtype("int32"), "fill_value": 0.0, "filters": None, diff --git a/virtualizarr/tests/test_xarray.py b/virtualizarr/tests/test_xarray.py index d0fe2e3..7fb7a02 100644 --- a/virtualizarr/tests/test_xarray.py +++ b/virtualizarr/tests/test_xarray.py @@ -19,7 +19,7 @@ def test_wrapping(): dtype = np.dtype("int32") zarray = ZArray( chunks=chunks, - compressor="zlib", + compressor={"id": "zlib", "level": 1}, dtype=dtype, fill_value=0.0, filters=None, @@ -49,7 +49,7 @@ def test_equals(self): shape = (5, 20) zarray = ZArray( chunks=chunks, - compressor="zlib", + compressor={"id": "zlib", "level": 1}, dtype=np.dtype("int32"), fill_value=0.0, filters=None, @@ -86,7 +86,7 @@ def test_concat_along_existing_dim(self): # both manifest arrays in this example have the same zarray properties zarray = ZArray( chunks=(1, 10), - compressor="zlib", + compressor={"id": "zlib", "level": 1}, dtype=np.dtype("int32"), fill_value=0.0, filters=None, @@ -133,7 +133,7 @@ def test_concat_along_new_dim(self): # both manifest arrays in this example have the same zarray properties zarray = ZArray( chunks=(5, 10), - compressor="zlib", + compressor={"id": "zlib", "level": 1}, dtype=np.dtype("int32"), fill_value=0.0, filters=None, @@ -183,7 +183,7 @@ def test_concat_dim_coords_along_existing_dim(self): # both manifest arrays in this example have the same zarray properties zarray = ZArray( chunks=(10,), - compressor="zlib", + compressor={"id": "zlib", "level": 1}, dtype=np.dtype("int32"), fill_value=0.0, filters=None, diff --git a/virtualizarr/tests/test_zarr.py b/virtualizarr/tests/test_zarr.py index 29db840..7715d24 100644 --- a/virtualizarr/tests/test_zarr.py +++ b/virtualizarr/tests/test_zarr.py @@ -20,7 +20,7 @@ def vds_with_manifest_arrays() -> xr.Dataset: shape=(2, 3), dtype=np.dtype(" tuple[ZArray, list[str], dict]: ) else: fill_value = metadata["fill_value"] + all_codecs = [ codec for codec in metadata["codecs"] if codec["name"] not in ("transpose", "bytes") ] - # TODO: hdf.py treats all codecs as filter, but maybe one needs to be the compressor? - compressor = None #all_codecs[0] if all_codecs else None - filters = [_configurable_to_num_codec_config(_filter) for _filter in all_codecs] or None + compressor, *filters = [ + _configurable_to_num_codec_config(_filter) for _filter in all_codecs + ] zarray = ZArray( - chunks=metadata["chunk_grid"]["configuration"]["chunk_shape"], - compressor=_configurable_to_num_codec_config(compressor) if compressor else None, + chunks=chunk_shape, + compressor=compressor, dtype=np.dtype(metadata["data_type"]), fill_value=fill_value, - filters=filters, + filters=filters or None, order="C", shape=shape, zarr_format=zarr_format, @@ -375,6 +376,7 @@ def metadata_from_zarr_json(filepath: Path) -> tuple[ZArray, list[str], dict]: return zarray, dim_names, attrs + def _configurable_to_num_codec_config(configurable: dict) -> dict: """ Convert a zarr v3 configurable into a numcodecs codec. @@ -384,6 +386,7 @@ def _configurable_to_num_codec_config(configurable: dict) -> dict: configuration = configurable_copy.pop("configuration") return numcodecs.get_codec({"id": codec_id, **configuration}).get_config() + def _num_codec_config_to_configurable(num_codec: dict) -> dict: """ Convert a numcodecs codec into a zarr v3 configurable. @@ -391,6 +394,7 @@ def _num_codec_config_to_configurable(num_codec: dict) -> dict: num_codec_copy = num_codec.copy() return {"name": num_codec_copy.pop("id"), "configuration": num_codec_copy} + def _default_fill_value(dtype: np.dtype) -> Union[bool, int, float, str, list]: """ The value and format of the fill_value depend on the data_type of the array. From ab97e6398c3c97712bf56bb249a8ea2420bc28cb Mon Sep 17 00:00:00 2001 From: Gustavo Hidalgo Date: Mon, 22 Jul 2024 11:36:10 -0400 Subject: [PATCH 67/68] PR comments --- virtualizarr/zarr.py | 42 +++++++++++++++--------------------------- 1 file changed, 15 insertions(+), 27 deletions(-) diff --git a/virtualizarr/zarr.py b/virtualizarr/zarr.py index f772e8d..932e7da 100644 --- a/virtualizarr/zarr.py +++ b/virtualizarr/zarr.py @@ -6,7 +6,6 @@ Literal, NewType, Optional, - Union, ) import numcodecs @@ -33,6 +32,20 @@ ) # just the .zattrs (for one array or for the whole store/group) FillValueT = bool | str | float | int | list | None +ZARR_DEFAULT_FILL_VALUE: dict[np.dtype, FillValueT] = { + # numpy dtypes's hierarchy lets us avoid checking for all the widths + # https://numpy.org/doc/stable/reference/arrays.scalars.html + np.dtype("bool"): False, + np.dtype("int"): 0, + np.dtype("float"): 0.0, + np.dtype("complex"): [0.0, 0.0], +} +""" +The value and format of the fill_value depend on the `data_type` of the array. +See here for spec: +https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html#fill-value +""" + class Codec(BaseModel): compressor: dict | None = None @@ -77,7 +90,7 @@ def __post_init__(self) -> None: @model_validator(mode="after") def _check_fill_value(self) -> Self: if self.fill_value is None: - self.fill_value = _default_fill_value(self.dtype) + self.fill_value = ZARR_DEFAULT_FILL_VALUE.get(self.dtype, default=0.0) return self @property @@ -96,11 +109,6 @@ def from_kerchunk_refs(cls, decoded_arr_refs_zarray) -> "ZArray": fill_value = np.nan compressor = decoded_arr_refs_zarray["compressor"] - # deal with an inconsistency in kerchunk's tiff_to_zarr function - # TODO should this be moved to the point where we actually call tiff_to_zarr? Or ideally made consistent upstream. - # if compressor is not None and "id" in compressor: - # compressor = compressor["id"] - return ZArray( chunks=tuple(decoded_arr_refs_zarray["chunks"]), compressor=compressor, @@ -393,23 +401,3 @@ def _num_codec_config_to_configurable(num_codec: dict) -> dict: """ num_codec_copy = num_codec.copy() return {"name": num_codec_copy.pop("id"), "configuration": num_codec_copy} - - -def _default_fill_value(dtype: np.dtype) -> Union[bool, int, float, str, list]: - """ - The value and format of the fill_value depend on the data_type of the array. - See here for spec: - https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html#fill-value - """ - # numpy dtypes's hierarchy lets us avoid checking for all the widths - # https://numpy.org/doc/stable/reference/arrays.scalars.html - if dtype is np.dtype("bool"): - return False - elif dtype is np.dtype("int"): - return 0 - elif dtype is np.dtype("float"): - return 0.0 - elif dtype is np.dtype("complex"): - return [0.0, 0.0] - else: - return 0.0 From 0be0728416a4d572d3aab0fb356d776f7173876c Mon Sep 17 00:00:00 2001 From: Gustavo Hidalgo Date: Mon, 22 Jul 2024 11:44:05 -0400 Subject: [PATCH 68/68] Remove kwarg in dict default --- virtualizarr/zarr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/virtualizarr/zarr.py b/virtualizarr/zarr.py index 932e7da..e5015b3 100644 --- a/virtualizarr/zarr.py +++ b/virtualizarr/zarr.py @@ -90,7 +90,7 @@ def __post_init__(self) -> None: @model_validator(mode="after") def _check_fill_value(self) -> Self: if self.fill_value is None: - self.fill_value = ZARR_DEFAULT_FILL_VALUE.get(self.dtype, default=0.0) + self.fill_value = ZARR_DEFAULT_FILL_VALUE.get(self.dtype, 0.0) return self @property