From 6b7abe2a0dc650ae7e6bf07c080cc9023a17bf2c Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Fri, 19 Apr 2024 13:25:28 -0600
Subject: [PATCH 01/68] Generate chunk manifest backed variable from HDF5
 dataset.

---
 pyproject.toml                              |   1 +
 virtualizarr/readers/hdf.py                 | 135 ++++++++++++++++++++
 virtualizarr/tests/test_readers/__init__.py |   0
 virtualizarr/tests/test_readers/conftest.py |  91 +++++++++++++
 virtualizarr/tests/test_readers/test_hdf.py |  71 ++++++++++
 5 files changed, 298 insertions(+)
 create mode 100644 virtualizarr/readers/hdf.py
 create mode 100644 virtualizarr/tests/test_readers/__init__.py
 create mode 100644 virtualizarr/tests/test_readers/conftest.py
 create mode 100644 virtualizarr/tests/test_readers/test_hdf.py

diff --git a/pyproject.toml b/pyproject.toml
index c7505bc..7994c92 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -25,6 +25,7 @@ dependencies = [
     "kerchunk==0.2.2",
     "pydantic",
     "packaging",
+    "h5netcdf",
 ]
 
 [project.optional-dependencies]
diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py
new file mode 100644
index 0000000..a34ae34
--- /dev/null
+++ b/virtualizarr/readers/hdf.py
@@ -0,0 +1,135 @@
+from typing import List
+
+import h5py
+import xarray as xr
+
+from virtualizarr.manifests import ChunkEntry, ChunkManifest, ManifestArray
+from virtualizarr.zarr import ZArray
+
+
+def _dataset_chunk_manifest(path: str, dataset: h5py.Dataset) -> ChunkManifest:
+    """
+    Generate ChunkManifest for HDF5 dataset.
+
+    Parameters
+    ----------
+    path: str
+        The path the HDF5 container file
+     dset : h5py.Dataset
+        HDF5 dataset for which to create a ChunkManifest
+
+    Returns
+    -------
+    ChunkManifest
+        A Virtualizarr ChunkManifest
+    """
+    dsid = dataset.id
+
+    if dataset.chunks is None:
+        if dsid.get_offset() is None:
+            raise ValueError("Dataset has no space allocated in the file")
+        else:
+            key_list = [0] * (len(dataset.shape) or 1)
+            key = ".".join(map(str, key_list))
+            chunk_entry = ChunkEntry(
+                path=path,
+                offset=dsid.get_offset(),
+                length=dsid.get_storage_size()
+            )
+            chunk_entries = {key: chunk_entry}
+            chunk_manifest = ChunkManifest(
+                entries=chunk_entries
+            )
+            return chunk_manifest
+    else:
+        num_chunks = dsid.get_num_chunks()
+        if num_chunks == 0:
+            raise ValueError("The dataset is chunked but contains no chunks")
+
+        chunk_entries = dict()
+
+        def get_key(blob):
+            key_list = [a // b for a, b in zip(blob.chunk_offset, dataset.chunks)]
+            key = ".".join(map(str, key_list))
+            return key
+
+        def store_chunk_entry(blob):
+            chunk_entries[get_key(blob)] = ChunkEntry(
+                path=path,
+                offset=blob.byte_offset,
+                length=blob.size
+            )
+
+        has_chunk_iter = callable(getattr(dsid, "chunk_iter", None))
+        if has_chunk_iter:
+            dsid.chunk_iter(store_chunk_entry)
+        else:
+            for index in range(num_chunks):
+                store_chunk_entry(dsid.get_chunk_info(index))
+
+        chunk_manifest = ChunkManifest(
+            entries=chunk_entries
+        )
+        return chunk_manifest
+
+def _dataset_dims(dataset: h5py.Dataset) -> List[str]:
+    """
+    Get a list of dimension scale names attached to input HDF5 dataset.
+
+    This is required by the xarray package to work with Zarr arrays. Only
+    one dimension scale per dataset dimension is allowed. If dataset is
+    dimension scale, it will be considered as the dimension to itself.
+
+    Parameters
+    ----------
+    dataset : h5py.Dataset
+        HDF5 dataset.
+
+    Returns
+    -------
+    list
+        List with HDF5 path names of dimension scales attached to input
+        dataset.
+    """
+    dims = list()
+    rank = len(dataset.shape)
+    if rank:
+        for n in range(rank):
+            num_scales = len(dataset.dims[n])
+            if num_scales == 1:
+                dims.append(dataset.dims[n][0].name[1:])
+            elif h5py.h5ds.is_scale(dataset.id):
+                dims.append(dataset.name[1:])
+            elif num_scales > 1:
+                raise ValueError(
+                    f"{dataset.name}: {len(dataset.dims[n])} "
+                    f"dimension scales attached to dimension #{n}"
+                )
+            elif num_scales == 0:
+                # Some HDF5 files do not have dimension scales.
+                # If this is the case, `num_scales` will be 0.
+                # In this case, we mimic netCDF4 and assign phony dimension names.
+                # See https://github.com/fsspec/kerchunk/issues/41
+                dims.append(f"phony_dim_{n}")
+        return dims
+
+
+def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> xr.Variable:
+    # This chunk determination logic mirrors zarr-python's create
+    # https://github.com/zarr-developers/zarr-python/blob/main/zarr/creation.py#L62-L66
+    chunks = dataset.chunks if dataset.chunks else dataset.shape
+    zarray = ZArray(
+        chunks=chunks,
+        compressor=dataset.compression,
+        dtype=dataset.dtype,
+        fill_value=dataset.fillvalue,
+        filters=None,
+        order="C",
+        shape=dataset.shape,
+        zarr_format=2,
+    )
+    manifest = _dataset_chunk_manifest(path, dataset)
+    marray = ManifestArray(zarray=zarray, chunkmanifest=manifest)
+    dims = _dataset_dims(dataset)
+    variable = xr.Variable(data=marray, dims=dims)
+    return variable
diff --git a/virtualizarr/tests/test_readers/__init__.py b/virtualizarr/tests/test_readers/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py
new file mode 100644
index 0000000..b450483
--- /dev/null
+++ b/virtualizarr/tests/test_readers/conftest.py
@@ -0,0 +1,91 @@
+import h5py
+import numpy as np
+import pytest
+import xarray as xr
+
+
+@pytest.fixture
+def empty_chunks_netcdf4_file(tmpdir):
+    ds = xr.Dataset({"data": []})
+    filepath = f"{tmpdir}/empty_chunks.nc"
+    ds.to_netcdf(filepath, engine="h5netcdf")
+    return filepath
+
+
+@pytest.fixture
+def empty_dataset_netcdf4_file(tmpdir):
+    filepath = f"{tmpdir}/empty_dataset.nc"
+    f = h5py.File(filepath, "w")
+    f.create_dataset("data", shape=(0,), dtype="f")
+    return filepath
+
+
+@pytest.fixture
+def no_chunks_netcdf4_file(tmpdir):
+    filepath = f"{tmpdir}/no_chunks.nc"
+    f = h5py.File(filepath, "w")
+    data = np.random.random((10, 10))
+    f.create_dataset(name="data", data=data, chunks=None)
+    return filepath
+
+
+@pytest.fixture
+def chunked_netcdf4_file(tmpdir):
+    filepath = f"{tmpdir}/chunks.nc"
+    f = h5py.File(filepath, "w")
+    data = np.random.random((100, 100))
+    f.create_dataset(name="data", data=data, chunks=(50, 50))
+    return filepath
+
+
+@pytest.fixture
+def single_dimension_scale_netcdf4_file(tmpdir):
+    filepath = f"{tmpdir}/single_dimension_scale.nc"
+    f = h5py.File(filepath, "w")
+    data = [1, 2]
+    x = [0, 1]
+    f.create_dataset(name="data", data=data)
+    f.create_dataset(name="x", data=x)
+    f["x"].make_scale()
+    f["data"].dims[0].attach_scale(f["x"])
+    return filepath
+
+
+@pytest.fixture
+def is_scale_netcdf4_file(tmpdir):
+    filepath = f"{tmpdir}/is_scale.nc"
+    f = h5py.File(filepath, "w")
+    data = [1, 2]
+    f.create_dataset(name="data", data=data)
+    f["data"].make_scale()
+    return filepath
+
+
+@pytest.fixture
+def multiple_dimension_scales_netcdf4_file(tmpdir):
+    filepath = f"{tmpdir}/multiple_dimension_scales.nc"
+    f = h5py.File(filepath, "w")
+    data = [1, 2]
+    f.create_dataset(name="data", data=data)
+    f.create_dataset(name="x", data=[0, 1])
+    f.create_dataset(name="y", data=[0, 1])
+    f["x"].make_scale()
+    f["y"].make_scale()
+    f["data"].dims[0].attach_scale(f["x"])
+    f["data"].dims[0].attach_scale(f["y"])
+    return filepath
+
+
+@pytest.fixture
+def chunked_dimensions_netcdf4_file(tmpdir):
+    filepath = f"{tmpdir}/chunks_dimension.nc"
+    f = h5py.File(filepath, "w")
+    data = np.random.random((100, 100))
+    x = np.random.random((100, 100))
+    y = np.random.random((100, 100))
+    f.create_dataset(name="data", data=data, chunks=(50, 50))
+    f.create_dataset(name="x", data=x, chunks=(50, 50))
+    f.create_dataset(name="y", data=y, chunks=(50, 50))
+    f["data"].dims[0].attach_scale(f["x"])
+    f["data"].dims[1].attach_scale(f["y"])
+    return filepath
diff --git a/virtualizarr/tests/test_readers/test_hdf.py b/virtualizarr/tests/test_readers/test_hdf.py
new file mode 100644
index 0000000..b6b78c1
--- /dev/null
+++ b/virtualizarr/tests/test_readers/test_hdf.py
@@ -0,0 +1,71 @@
+import h5py
+import pytest
+
+from virtualizarr.readers.hdf import (_dataset_chunk_manifest, _dataset_dims,
+                                      _dataset_to_variable)
+
+
+class TestDatasetChunkManifest:
+    def test_empty_chunks(self, empty_chunks_netcdf4_file):
+        f = h5py.File(empty_chunks_netcdf4_file)
+        ds = f["data"]
+        with pytest.raises(ValueError, match="chunked but contains no chunks"):
+            _dataset_chunk_manifest(path=empty_chunks_netcdf4_file, dataset=ds)
+
+    def test_empty_dataset(self, empty_dataset_netcdf4_file):
+        f = h5py.File(empty_dataset_netcdf4_file)
+        ds = f["data"]
+        with pytest.raises(ValueError, match="no space allocated in the file"):
+            _dataset_chunk_manifest(path=empty_dataset_netcdf4_file, dataset=ds)
+
+    def test_no_chunking(self, no_chunks_netcdf4_file):
+        f = h5py.File(no_chunks_netcdf4_file)
+        ds = f["data"]
+        manifest = _dataset_chunk_manifest(path=no_chunks_netcdf4_file, dataset=ds)
+        assert len(manifest.entries) == 1
+
+    def test_chunked(self, chunked_netcdf4_file):
+        f = h5py.File(chunked_netcdf4_file)
+        ds = f["data"]
+        manifest = _dataset_chunk_manifest(path=chunked_netcdf4_file, dataset=ds)
+        assert len(manifest.entries) == 4
+
+
+class TestDatasetDims:
+    def test_single_dimension_scale(self, single_dimension_scale_netcdf4_file):
+        f = h5py.File(single_dimension_scale_netcdf4_file)
+        ds = f["data"]
+        dims = _dataset_dims(ds)
+        assert dims[0] == "x"
+
+    def test_is_dimension_scale(self, is_scale_netcdf4_file):
+        f = h5py.File(is_scale_netcdf4_file)
+        ds = f["data"]
+        dims = _dataset_dims(ds)
+        assert dims[0] == "data"
+
+    def test_multiple_dimension_scales(self, multiple_dimension_scales_netcdf4_file):
+        f = h5py.File(multiple_dimension_scales_netcdf4_file)
+        ds = f["data"]
+        with pytest.raises(ValueError, match="dimension scales attached"):
+            _dataset_dims(ds)
+
+    def test_no_dimension_scales(self, no_chunks_netcdf4_file):
+        f = h5py.File(no_chunks_netcdf4_file)
+        ds = f["data"]
+        dims = _dataset_dims(ds)
+        assert dims == ["phony_dim_0", "phony_dim_1"]
+
+
+class TestDatasetToVariable:
+    def test_chunked_dataset(self, chunked_dimensions_netcdf4_file):
+        f = h5py.File(chunked_dimensions_netcdf4_file)
+        ds = f["data"]
+        var = _dataset_to_variable(chunked_dimensions_netcdf4_file, ds)
+        assert var.chunks == (50, 50)
+
+    def test_not_chunked_dataset(self, single_dimension_scale_netcdf4_file):
+        f = h5py.File(single_dimension_scale_netcdf4_file)
+        ds = f["data"]
+        var = _dataset_to_variable(single_dimension_scale_netcdf4_file, ds)
+        assert var.chunks == (2,)

From bca0aabd6030625156b5fe1e58fb8d9a2ccf46f1 Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Fri, 19 Apr 2024 14:20:38 -0600
Subject: [PATCH 02/68] Transfer dataset attrs to variable.

---
 virtualizarr/readers/hdf.py                 | 50 ++++++++++++++++++++-
 virtualizarr/tests/test_readers/conftest.py | 10 +++++
 virtualizarr/tests/test_readers/test_hdf.py | 16 ++++++-
 3 files changed, 74 insertions(+), 2 deletions(-)

diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py
index a34ae34..d6518a3 100644
--- a/virtualizarr/readers/hdf.py
+++ b/virtualizarr/readers/hdf.py
@@ -1,6 +1,7 @@
 from typing import List
 
 import h5py
+import numpy as np
 import xarray as xr
 
 from virtualizarr.manifests import ChunkEntry, ChunkManifest, ManifestArray
@@ -114,6 +115,52 @@ def _dataset_dims(dataset: h5py.Dataset) -> List[str]:
         return dims
 
 
+def _extract_attrs(dataset: h5py.Dataset):
+    """
+    Extract attributes from an HDF5 dataset.
+
+    Parameters
+    ----------
+    dataset : h5py.Dataset
+        An HDF5 dataset.
+    """
+    _HIDDEN_ATTRS = {
+        "REFERENCE_LIST",
+        "CLASS",
+        "DIMENSION_LIST",
+        "NAME",
+        "_Netcdf4Dimid",
+        "_Netcdf4Coordinates",
+        "_nc3_strict",
+        "_NCProperties",
+    }
+    attrs = {}
+    for n, v in dataset.attrs.items():
+        if n in _HIDDEN_ATTRS:
+            continue
+        # Fix some attribute values to avoid JSON encoding exceptions...
+        if isinstance(v, bytes):
+            v = v.decode("utf-8") or " "
+        elif isinstance(v, (np.ndarray, np.number, np.bool_)):
+            if v.dtype.kind == "S":
+                v = v.astype(str)
+            if n == "_FillValue":
+                continue
+            elif v.size == 1:
+                v = v.flatten()[0]
+                if isinstance(v, (np.ndarray, np.number, np.bool_)):
+                    v = v.tolist()
+            else:
+                v = v.tolist()
+        elif isinstance(v, h5py._hl.base.Empty):
+            v = ""
+        if v == "DIMENSION_SCALE":
+            continue
+
+        attrs[n] = v
+        return attrs
+
+
 def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> xr.Variable:
     # This chunk determination logic mirrors zarr-python's create
     # https://github.com/zarr-developers/zarr-python/blob/main/zarr/creation.py#L62-L66
@@ -131,5 +178,6 @@ def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> xr.Variable:
     manifest = _dataset_chunk_manifest(path, dataset)
     marray = ManifestArray(zarray=zarray, chunkmanifest=manifest)
     dims = _dataset_dims(dataset)
-    variable = xr.Variable(data=marray, dims=dims)
+    attrs = _extract_attrs(dataset)
+    variable = xr.Variable(data=marray, dims=dims, attrs=attrs)
     return variable
diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py
index b450483..2c40fe1 100644
--- a/virtualizarr/tests/test_readers/conftest.py
+++ b/virtualizarr/tests/test_readers/conftest.py
@@ -89,3 +89,13 @@ def chunked_dimensions_netcdf4_file(tmpdir):
     f["data"].dims[0].attach_scale(f["x"])
     f["data"].dims[1].attach_scale(f["y"])
     return filepath
+
+
+@pytest.fixture
+def string_attribute_netcdf4_file(tmpdir):
+    filepath = f"{tmpdir}/attributes.nc"
+    f = h5py.File(filepath, "w")
+    data = np.random.random((10, 10))
+    f.create_dataset(name="data", data=data, chunks=None)
+    f["data"].attrs["attribute_name"] = "attribute_name"
+    return filepath
diff --git a/virtualizarr/tests/test_readers/test_hdf.py b/virtualizarr/tests/test_readers/test_hdf.py
index b6b78c1..495b7de 100644
--- a/virtualizarr/tests/test_readers/test_hdf.py
+++ b/virtualizarr/tests/test_readers/test_hdf.py
@@ -2,7 +2,7 @@
 import pytest
 
 from virtualizarr.readers.hdf import (_dataset_chunk_manifest, _dataset_dims,
-                                      _dataset_to_variable)
+                                      _dataset_to_variable, _extract_attrs)
 
 
 class TestDatasetChunkManifest:
@@ -69,3 +69,17 @@ def test_not_chunked_dataset(self, single_dimension_scale_netcdf4_file):
         ds = f["data"]
         var = _dataset_to_variable(single_dimension_scale_netcdf4_file, ds)
         assert var.chunks == (2,)
+
+    def test_dataset_attributes(self, string_attribute_netcdf4_file):
+        f = h5py.File(string_attribute_netcdf4_file)
+        ds = f["data"]
+        var = _dataset_to_variable(string_attribute_netcdf4_file, ds)
+        assert var.attrs["attribute_name"] == "attribute_name"
+
+
+class TestExtractAttributes:
+    def test_string_attribute(self, string_attribute_netcdf4_file):
+        f = h5py.File(string_attribute_netcdf4_file)
+        ds = f["data"]
+        attrs = _extract_attrs(ds)
+        assert attrs["attribute_name"] == "attribute_name"

From 384ff6bb2d75b68a4af1f23d56a6544b4e20d6b5 Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Fri, 19 Apr 2024 15:26:58 -0600
Subject: [PATCH 03/68] Get virtual variables dict from HDF5 file.

---
 virtualizarr/readers/hdf.py                 | 14 +++++++++++++-
 virtualizarr/tests/test_readers/conftest.py | 16 ++++++++++++----
 virtualizarr/tests/test_readers/test_hdf.py | 15 ++++++++++++++-
 3 files changed, 39 insertions(+), 6 deletions(-)

diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py
index d6518a3..9c3ebf4 100644
--- a/virtualizarr/readers/hdf.py
+++ b/virtualizarr/readers/hdf.py
@@ -1,4 +1,4 @@
-from typing import List
+from typing import Mapping, List
 
 import h5py
 import numpy as np
@@ -181,3 +181,15 @@ def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> xr.Variable:
     attrs = _extract_attrs(dataset)
     variable = xr.Variable(data=marray, dims=dims, attrs=attrs)
     return variable
+
+
+def virtual_vars_from_hdf(path: str, f: h5py.File) -> Mapping[str, xr.Variable]:
+    variables = {}
+    for key in f.keys():
+        if isinstance(f[key], h5py.Dataset):
+            variable = _dataset_to_variable(path, f[key])
+            variables[key] = variable
+        else:
+            raise NotImplementedError("Nested groups are not yet supported")
+
+    return variables
diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py
index 2c40fe1..735e922 100644
--- a/virtualizarr/tests/test_readers/conftest.py
+++ b/virtualizarr/tests/test_readers/conftest.py
@@ -81,11 +81,11 @@ def chunked_dimensions_netcdf4_file(tmpdir):
     filepath = f"{tmpdir}/chunks_dimension.nc"
     f = h5py.File(filepath, "w")
     data = np.random.random((100, 100))
-    x = np.random.random((100, 100))
-    y = np.random.random((100, 100))
+    x = np.random.random((100))
+    y = np.random.random((100))
     f.create_dataset(name="data", data=data, chunks=(50, 50))
-    f.create_dataset(name="x", data=x, chunks=(50, 50))
-    f.create_dataset(name="y", data=y, chunks=(50, 50))
+    f.create_dataset(name="x", data=x)
+    f.create_dataset(name="y", data=y)
     f["data"].dims[0].attach_scale(f["x"])
     f["data"].dims[1].attach_scale(f["y"])
     return filepath
@@ -99,3 +99,11 @@ def string_attribute_netcdf4_file(tmpdir):
     f.create_dataset(name="data", data=data, chunks=None)
     f["data"].attrs["attribute_name"] = "attribute_name"
     return filepath
+
+
+@pytest.fixture
+def group_netcdf4_file(tmpdir):
+    filepath = f"{tmpdir}/group.nc"
+    f = h5py.File(filepath, "w")
+    f.create_group("group")
+    return filepath
diff --git a/virtualizarr/tests/test_readers/test_hdf.py b/virtualizarr/tests/test_readers/test_hdf.py
index 495b7de..da331ed 100644
--- a/virtualizarr/tests/test_readers/test_hdf.py
+++ b/virtualizarr/tests/test_readers/test_hdf.py
@@ -2,7 +2,8 @@
 import pytest
 
 from virtualizarr.readers.hdf import (_dataset_chunk_manifest, _dataset_dims,
-                                      _dataset_to_variable, _extract_attrs)
+                                      _dataset_to_variable, _extract_attrs,
+                                      virtual_vars_from_hdf)
 
 
 class TestDatasetChunkManifest:
@@ -83,3 +84,15 @@ def test_string_attribute(self, string_attribute_netcdf4_file):
         ds = f["data"]
         attrs = _extract_attrs(ds)
         assert attrs["attribute_name"] == "attribute_name"
+
+
+class TestVirtualVarsFromHDF:
+    def test_variable_with_dimensions(self, chunked_dimensions_netcdf4_file):
+        f = h5py.File(chunked_dimensions_netcdf4_file)
+        variables = virtual_vars_from_hdf(chunked_dimensions_netcdf4_file, f)
+        assert len(variables) == 3
+
+    def test_groups_not_implemented(self, group_netcdf4_file):
+        f = h5py.File(group_netcdf4_file)
+        with pytest.raises(NotImplementedError):
+            virtual_vars_from_hdf(group_netcdf4_file, f)

From 4c5f9bd30186aee61ff79223a70a3172b1c17d00 Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Mon, 22 Apr 2024 12:33:24 -0600
Subject: [PATCH 04/68] Update virtual_vars_from_hdf to use fsspec and
 drop_variables arg.

---
 pyproject.toml                              |  2 +-
 virtualizarr/readers/hdf.py                 | 25 +++++++++++++++------
 virtualizarr/tests/test_readers/conftest.py | 10 +++++++++
 virtualizarr/tests/test_readers/test_hdf.py | 13 +++++++----
 4 files changed, 38 insertions(+), 12 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 7994c92..d08621e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -25,7 +25,6 @@ dependencies = [
     "kerchunk==0.2.2",
     "pydantic",
     "packaging",
-    "h5netcdf",
 ]
 
 [project.optional-dependencies]
@@ -35,6 +34,7 @@ test = [
     "pytest",
     "scipy",
     "pooch",
+    "h5netcdf",
 ]
 
 
diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py
index 9c3ebf4..c4ab292 100644
--- a/virtualizarr/readers/hdf.py
+++ b/virtualizarr/readers/hdf.py
@@ -1,5 +1,6 @@
-from typing import Mapping, List
+from typing import List, Mapping, Optional
 
+import fsspec
 import h5py
 import numpy as np
 import xarray as xr
@@ -73,6 +74,7 @@ def store_chunk_entry(blob):
         )
         return chunk_manifest
 
+
 def _dataset_dims(dataset: h5py.Dataset) -> List[str]:
     """
     Get a list of dimension scale names attached to input HDF5 dataset.
@@ -183,13 +185,22 @@ def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> xr.Variable:
     return variable
 
 
-def virtual_vars_from_hdf(path: str, f: h5py.File) -> Mapping[str, xr.Variable]:
+def virtual_vars_from_hdf(
+    path: str,
+    drop_variables: Optional[List[str]] = None,
+) -> Mapping[str, xr.Variable]:
+    if drop_variables is None:
+        drop_variables = []
+    fs, file_path = fsspec.core.url_to_fs(path)
+    open_file = fs.open(path, "rb")
+    f = h5py.File(open_file, mode="r")
     variables = {}
     for key in f.keys():
-        if isinstance(f[key], h5py.Dataset):
-            variable = _dataset_to_variable(path, f[key])
-            variables[key] = variable
-        else:
-            raise NotImplementedError("Nested groups are not yet supported")
+        if key not in drop_variables:
+            if isinstance(f[key], h5py.Dataset):
+                variable = _dataset_to_variable(path, f[key])
+                variables[key] = variable
+            else:
+                raise NotImplementedError("Nested groups are not yet supported")
 
     return variables
diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py
index 735e922..aa2b0fe 100644
--- a/virtualizarr/tests/test_readers/conftest.py
+++ b/virtualizarr/tests/test_readers/conftest.py
@@ -107,3 +107,13 @@ def group_netcdf4_file(tmpdir):
     f = h5py.File(filepath, "w")
     f.create_group("group")
     return filepath
+
+
+@pytest.fixture
+def multiple_datasets_netcdf4_file(tmpdir):
+    filepath = f"{tmpdir}/multiple_datasets.nc"
+    f = h5py.File(filepath, "w")
+    data = np.random.random((10, 10))
+    f.create_dataset(name="data", data=data, chunks=None)
+    f.create_dataset(name="data2", data=data, chunks=None)
+    return filepath
diff --git a/virtualizarr/tests/test_readers/test_hdf.py b/virtualizarr/tests/test_readers/test_hdf.py
index da331ed..36f7bc7 100644
--- a/virtualizarr/tests/test_readers/test_hdf.py
+++ b/virtualizarr/tests/test_readers/test_hdf.py
@@ -88,11 +88,16 @@ def test_string_attribute(self, string_attribute_netcdf4_file):
 
 class TestVirtualVarsFromHDF:
     def test_variable_with_dimensions(self, chunked_dimensions_netcdf4_file):
-        f = h5py.File(chunked_dimensions_netcdf4_file)
-        variables = virtual_vars_from_hdf(chunked_dimensions_netcdf4_file, f)
+        variables = virtual_vars_from_hdf(chunked_dimensions_netcdf4_file)
         assert len(variables) == 3
 
     def test_groups_not_implemented(self, group_netcdf4_file):
-        f = h5py.File(group_netcdf4_file)
         with pytest.raises(NotImplementedError):
-            virtual_vars_from_hdf(group_netcdf4_file, f)
+            virtual_vars_from_hdf(group_netcdf4_file)
+
+    def test_drop_variables(self, multiple_datasets_netcdf4_file):
+        variables = virtual_vars_from_hdf(
+            multiple_datasets_netcdf4_file,
+            ["data2"]
+        )
+        assert "data2" not in variables.keys()

From 1dd3370aedc6e0b590f752273387a716366defe9 Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Mon, 22 Apr 2024 13:02:03 -0600
Subject: [PATCH 05/68] mypy fix to use ChunkKey and empty dimensions list.

---
 virtualizarr/readers/hdf.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py
index c4ab292..fdb9a77 100644
--- a/virtualizarr/readers/hdf.py
+++ b/virtualizarr/readers/hdf.py
@@ -1,4 +1,4 @@
-from typing import List, Mapping, Optional
+from typing import List, Mapping, Optional, Union
 
 import fsspec
 import h5py
@@ -8,6 +8,8 @@
 from virtualizarr.manifests import ChunkEntry, ChunkManifest, ManifestArray
 from virtualizarr.zarr import ZArray
 
+from virtualizarr.types import ChunkKey
+
 
 def _dataset_chunk_manifest(path: str, dataset: h5py.Dataset) -> ChunkManifest:
     """
@@ -38,7 +40,8 @@ def _dataset_chunk_manifest(path: str, dataset: h5py.Dataset) -> ChunkManifest:
                 offset=dsid.get_offset(),
                 length=dsid.get_storage_size()
             )
-            chunk_entries = {key: chunk_entry}
+            chunk_key = ChunkKey(key)
+            chunk_entries = {chunk_key: chunk_entry}
             chunk_manifest = ChunkManifest(
                 entries=chunk_entries
             )
@@ -75,7 +78,7 @@ def store_chunk_entry(blob):
         return chunk_manifest
 
 
-def _dataset_dims(dataset: h5py.Dataset) -> List[str]:
+def _dataset_dims(dataset: h5py.Dataset) -> Union[List[str], List[None]]:
     """
     Get a list of dimension scale names attached to input HDF5 dataset.
 
@@ -114,7 +117,7 @@ def _dataset_dims(dataset: h5py.Dataset) -> List[str]:
                 # In this case, we mimic netCDF4 and assign phony dimension names.
                 # See https://github.com/fsspec/kerchunk/issues/41
                 dims.append(f"phony_dim_{n}")
-        return dims
+    return dims
 
 
 def _extract_attrs(dataset: h5py.Dataset):

From d92c75c82cd000bf0fafa5301c22793434fb18ed Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Mon, 22 Apr 2024 13:40:52 -0600
Subject: [PATCH 06/68] Extract attributes from hdf5 root group.

---
 virtualizarr/readers/hdf.py                 | 18 +++++++++++++-----
 virtualizarr/tests/test_readers/conftest.py |  8 ++++++++
 virtualizarr/tests/test_readers/test_hdf.py |  5 +++++
 3 files changed, 26 insertions(+), 5 deletions(-)

diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py
index fdb9a77..e02d03e 100644
--- a/virtualizarr/readers/hdf.py
+++ b/virtualizarr/readers/hdf.py
@@ -120,14 +120,14 @@ def _dataset_dims(dataset: h5py.Dataset) -> Union[List[str], List[None]]:
     return dims
 
 
-def _extract_attrs(dataset: h5py.Dataset):
+def _extract_attrs(h5obj: Union[h5py.Dataset, h5py.Group]):
     """
-    Extract attributes from an HDF5 dataset.
+    Extract attributes from an HDF5 group or dataset.
 
     Parameters
     ----------
-    dataset : h5py.Dataset
-        An HDF5 dataset.
+    h5obj : h5py.Group or h5py.Dataset
+        An HDF5 group or dataset.
     """
     _HIDDEN_ATTRS = {
         "REFERENCE_LIST",
@@ -140,7 +140,7 @@ def _extract_attrs(dataset: h5py.Dataset):
         "_NCProperties",
     }
     attrs = {}
-    for n, v in dataset.attrs.items():
+    for n, v in h5obj.attrs.items():
         if n in _HIDDEN_ATTRS:
             continue
         # Fix some attribute values to avoid JSON encoding exceptions...
@@ -207,3 +207,11 @@ def virtual_vars_from_hdf(
                 raise NotImplementedError("Nested groups are not yet supported")
 
     return variables
+
+
+def attrs_from_root_group(path: str):
+    fs, file_path = fsspec.core.url_to_fs(path)
+    open_file = fs.open(path, "rb")
+    f = h5py.File(open_file, mode="r")
+    attrs = _extract_attrs(f)
+    return attrs
diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py
index aa2b0fe..46ac7b2 100644
--- a/virtualizarr/tests/test_readers/conftest.py
+++ b/virtualizarr/tests/test_readers/conftest.py
@@ -101,6 +101,14 @@ def string_attribute_netcdf4_file(tmpdir):
     return filepath
 
 
+@pytest.fixture
+def root_attributes_netcdf4_file(tmpdir):
+    filepath = f"{tmpdir}/root_attributes.nc"
+    f = h5py.File(filepath, "w")
+    f.attrs["attribute_name"] = "attribute_name"
+    return filepath
+
+
 @pytest.fixture
 def group_netcdf4_file(tmpdir):
     filepath = f"{tmpdir}/group.nc"
diff --git a/virtualizarr/tests/test_readers/test_hdf.py b/virtualizarr/tests/test_readers/test_hdf.py
index 36f7bc7..a24e36a 100644
--- a/virtualizarr/tests/test_readers/test_hdf.py
+++ b/virtualizarr/tests/test_readers/test_hdf.py
@@ -85,6 +85,11 @@ def test_string_attribute(self, string_attribute_netcdf4_file):
         attrs = _extract_attrs(ds)
         assert attrs["attribute_name"] == "attribute_name"
 
+    def test_root_attribute(self, root_attributes_netcdf4_file):
+        f = h5py.File(root_attributes_netcdf4_file)
+        attrs = _extract_attrs(f)
+        assert attrs["attribute_name"] == "attribute_name"
+
 
 class TestVirtualVarsFromHDF:
     def test_variable_with_dimensions(self, chunked_dimensions_netcdf4_file):

From 0ed836272d26a62b8de457c30dc6525292efc916 Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Mon, 22 Apr 2024 14:19:17 -0600
Subject: [PATCH 07/68] Use hdf reader for netcdf4 files.

---
 virtualizarr/xarray.py | 33 ++++++++++++++++++++++-----------
 1 file changed, 22 insertions(+), 11 deletions(-)

diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py
index 5c3c854..415b0a0 100644
--- a/virtualizarr/xarray.py
+++ b/virtualizarr/xarray.py
@@ -8,7 +8,8 @@
 from xarray.core.variable import IndexVariable
 
 import virtualizarr.kerchunk as kerchunk
-from virtualizarr.kerchunk import KerchunkStoreRefs, FileType
+from virtualizarr.kerchunk import KerchunkStoreRefs, FileType, _automatically_determine_filetype
+from virtualizarr.readers.hdf import virtual_vars_from_hdf, attrs_from_root_group 
 from virtualizarr.manifests import ChunkManifest, ManifestArray
 
 
@@ -76,18 +77,28 @@ def open_virtual_dataset(
     if common:
         raise ValueError(f"Cannot both load and drop variables {common}")
 
+    if filetype is None:
+        filetype = _automatically_determine_filetype(filepath)
+    filetype = FileType(filetype)
+    if filetype.name.lower() == "netcdf4":
+        virtual_vars = virtual_vars_from_hdf(
+            path=filepath,
+            drop_variables=drop_variables
+        )
+        ds_attrs = attrs_from_root_group(path=filepath)
     # this is the only place we actually always need to use kerchunk directly
     # TODO avoid even reading byte ranges for variables that will be dropped later anyway?
-    vds_refs = kerchunk.read_kerchunk_references_from_file(
-        filepath=filepath,
-        filetype=filetype,
-    )
-    virtual_vars = virtual_vars_from_kerchunk_refs(
-        vds_refs,
-        drop_variables=drop_variables + loadable_variables,
-        virtual_array_class=virtual_array_class,
-    )
-    ds_attrs = kerchunk.fully_decode_arr_refs(vds_refs["refs"]).get(".zattrs", {})
+    else:
+        vds_refs = kerchunk.read_kerchunk_references_from_file(
+            filepath=filepath,
+            filetype=filetype,
+        )
+        virtual_vars = virtual_vars_from_kerchunk_refs(
+            vds_refs,
+            drop_variables=drop_variables + loadable_variables,
+            virtual_array_class=virtual_array_class,
+        )
+        ds_attrs = kerchunk.fully_decode_arr_refs(vds_refs["refs"]).get(".zattrs", {})
 
     if indexes is None or len(loadable_variables) > 0:
         # TODO we are reading a bunch of stuff we know we won't need here, e.g. all of the data variables...

From f4485fa10aebc0f8ef5ff7441704f49781325835 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 22 Apr 2024 21:57:39 +0000
Subject: [PATCH 08/68] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 virtualizarr/xarray.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py
index 415b0a0..2213ffa 100644
--- a/virtualizarr/xarray.py
+++ b/virtualizarr/xarray.py
@@ -9,7 +9,7 @@
 
 import virtualizarr.kerchunk as kerchunk
 from virtualizarr.kerchunk import KerchunkStoreRefs, FileType, _automatically_determine_filetype
-from virtualizarr.readers.hdf import virtual_vars_from_hdf, attrs_from_root_group 
+from virtualizarr.readers.hdf import virtual_vars_from_hdf, attrs_from_root_group
 from virtualizarr.manifests import ChunkManifest, ManifestArray
 
 

From 0123df7b802734f1902bee0cdd196f5baca10c9e Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Wed, 8 May 2024 18:03:04 -0600
Subject: [PATCH 09/68] Fix ruff complaints.

---
 virtualizarr/readers/hdf.py                 |  3 +--
 virtualizarr/tests/test_readers/test_hdf.py | 10 +++++++---
 virtualizarr/xarray.py                      |  8 ++++++--
 3 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py
index e02d03e..af25c02 100644
--- a/virtualizarr/readers/hdf.py
+++ b/virtualizarr/readers/hdf.py
@@ -6,9 +6,8 @@
 import xarray as xr
 
 from virtualizarr.manifests import ChunkEntry, ChunkManifest, ManifestArray
-from virtualizarr.zarr import ZArray
-
 from virtualizarr.types import ChunkKey
+from virtualizarr.zarr import ZArray
 
 
 def _dataset_chunk_manifest(path: str, dataset: h5py.Dataset) -> ChunkManifest:
diff --git a/virtualizarr/tests/test_readers/test_hdf.py b/virtualizarr/tests/test_readers/test_hdf.py
index a24e36a..0d5a16d 100644
--- a/virtualizarr/tests/test_readers/test_hdf.py
+++ b/virtualizarr/tests/test_readers/test_hdf.py
@@ -1,9 +1,13 @@
 import h5py
 import pytest
 
-from virtualizarr.readers.hdf import (_dataset_chunk_manifest, _dataset_dims,
-                                      _dataset_to_variable, _extract_attrs,
-                                      virtual_vars_from_hdf)
+from virtualizarr.readers.hdf import (
+    _dataset_chunk_manifest,
+    _dataset_dims,
+    _dataset_to_variable,
+    _extract_attrs,
+    virtual_vars_from_hdf,
+)
 
 
 class TestDatasetChunkManifest:
diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py
index fbf6136..9629a34 100644
--- a/virtualizarr/xarray.py
+++ b/virtualizarr/xarray.py
@@ -18,9 +18,13 @@
 from xarray.core.variable import IndexVariable
 
 import virtualizarr.kerchunk as kerchunk
-from virtualizarr.kerchunk import KerchunkStoreRefs, FileType, _automatically_determine_filetype
-from virtualizarr.readers.hdf import virtual_vars_from_hdf, attrs_from_root_group
+from virtualizarr.kerchunk import (
+    FileType,
+    KerchunkStoreRefs,
+    _automatically_determine_filetype,
+)
 from virtualizarr.manifests import ChunkManifest, ManifestArray
+from virtualizarr.readers.hdf import attrs_from_root_group, virtual_vars_from_hdf
 from virtualizarr.zarr import (
     attrs_from_zarr_group_json,
     dataset_to_zarr,

From 332bcaab1ae182696e1daf7c611f6fe8fd8ee4fd Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Fri, 10 May 2024 15:10:30 -0600
Subject: [PATCH 10/68] First steps for handling HDF5 filters.

---
 pyproject.toml                                |  1 +
 virtualizarr/readers/hdf.py                   |  7 +-
 virtualizarr/readers/hdf_filters.py           | 34 +++++++++
 virtualizarr/tests/test_readers/conftest.py   | 26 +++++++
 .../tests/test_readers/test_hdf_filters.py    | 31 ++++++++
 .../test_readers/test_hdf_integration.py      | 21 ++++++
 virtualizarr/xarray.py                        | 71 +++++++++----------
 7 files changed, 153 insertions(+), 38 deletions(-)
 create mode 100644 virtualizarr/readers/hdf_filters.py
 create mode 100644 virtualizarr/tests/test_readers/test_hdf_filters.py
 create mode 100644 virtualizarr/tests/test_readers/test_hdf_integration.py

diff --git a/pyproject.toml b/pyproject.toml
index 79a5078..4818b5f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -28,6 +28,7 @@ dependencies = [
     "numpy",
     "ujson",
     "packaging",
+    "hdf5plugin",
 ]
 
 [project.optional-dependencies]
diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py
index af25c02..7d95d99 100644
--- a/virtualizarr/readers/hdf.py
+++ b/virtualizarr/readers/hdf.py
@@ -6,6 +6,7 @@
 import xarray as xr
 
 from virtualizarr.manifests import ChunkEntry, ChunkManifest, ManifestArray
+from virtualizarr.readers.hdf_filters import codecs_from_dataset
 from virtualizarr.types import ChunkKey
 from virtualizarr.zarr import ZArray
 
@@ -169,12 +170,14 @@ def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> xr.Variable:
     # This chunk determination logic mirrors zarr-python's create
     # https://github.com/zarr-developers/zarr-python/blob/main/zarr/creation.py#L62-L66
     chunks = dataset.chunks if dataset.chunks else dataset.shape
+    codecs = codecs_from_dataset(dataset)
+    filters = [codec.get_config() for codec in codecs]
     zarray = ZArray(
         chunks=chunks,
-        compressor=dataset.compression,
+        compressor=None,
         dtype=dataset.dtype,
         fill_value=dataset.fillvalue,
-        filters=None,
+        filters=filters,
         order="C",
         shape=dataset.shape,
         zarr_format=2,
diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py
new file mode 100644
index 0000000..6070fc1
--- /dev/null
+++ b/virtualizarr/readers/hdf_filters.py
@@ -0,0 +1,34 @@
+from typing import List, Tuple, Union
+
+import h5py
+import numcodecs.registry as registry
+from numcodecs.abc import Codec
+
+_non_standard_filters = {
+    "gzip": "zlib"
+}
+
+
+def _filter_to_codec(filter_id: str, filter_properties: Union[int, Tuple] = None) -> Codec:
+    try:
+        id = int(filter_id)
+    except ValueError:
+        id = filter_id
+
+    if isinstance(id, str):
+        if id in _non_standard_filters.keys():
+            id = _non_standard_filters[id]
+        conf = {"id": id}
+        if id == "zlib":
+            conf["level"] = filter_properties
+
+    codec = registry.get_codec(conf)
+    return codec
+
+
+def codecs_from_dataset(dataset: h5py.Dataset) -> List[Codec]:
+    codecs = []
+    for filter_id, filter_properties in dataset._filters.items():
+        codec = _filter_to_codec(filter_id, filter_properties)
+        codecs.append(codec)
+    return codecs
diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py
index 46ac7b2..4f0d4fc 100644
--- a/virtualizarr/tests/test_readers/conftest.py
+++ b/virtualizarr/tests/test_readers/conftest.py
@@ -125,3 +125,29 @@ def multiple_datasets_netcdf4_file(tmpdir):
     f.create_dataset(name="data", data=data, chunks=None)
     f.create_dataset(name="data2", data=data, chunks=None)
     return filepath
+
+
+@pytest.fixture
+def np_uncompressed():
+    return np.arange(100)
+
+
+@pytest.fixture
+def gzip_filter_netcdf4_file(tmpdir, np_uncompressed):
+    filepath = f"{tmpdir}/gzip.nc"
+    f = h5py.File(filepath, "w")
+    f.create_dataset(name="data", data=np_uncompressed, compression="gzip", compression_opts=1)
+    return filepath
+
+
+@pytest.fixture
+def gzip_filter_xarray_netcdf4_file(tmpdir):
+    ds = xr.tutorial.open_dataset("air_temperature")
+    encoding = {}
+    for var_name in ds.variables:
+        #  encoding[var_name] = {"zlib": True, "compression_opts": 1}
+        encoding[var_name] = {"compression": "gzip", "compression_opts": 1}
+
+    filepath = f"{tmpdir}/gzip_xarray.nc"
+    ds.to_netcdf(filepath, engine="h5netcdf", encoding=encoding)
+    return filepath
diff --git a/virtualizarr/tests/test_readers/test_hdf_filters.py b/virtualizarr/tests/test_readers/test_hdf_filters.py
new file mode 100644
index 0000000..50a5d08
--- /dev/null
+++ b/virtualizarr/tests/test_readers/test_hdf_filters.py
@@ -0,0 +1,31 @@
+import h5py
+import numcodecs
+import pytest
+
+from virtualizarr.readers.hdf_filters import (
+    _filter_to_codec,
+    codecs_from_dataset,
+)
+
+
+class TestFilterToCodec:
+    def test_gzip_uses_zlib_nomcodec(self):
+        codec = _filter_to_codec("gzip", 1)
+        assert isinstance(codec, numcodecs.zlib.Zlib)
+
+    def test_lzf_not_available(self):
+        with pytest.raises(ValueError, match="codec not available"):
+            _filter_to_codec("lzf")
+
+
+class TestCodecsFromDataSet:
+    def test_gzip(self, np_uncompressed, gzip_filter_netcdf4_file):
+        f = h5py.File(gzip_filter_netcdf4_file)
+        ds = f["data"]
+        chunk_info = ds.id.get_chunk_info(0)
+        codecs = codecs_from_dataset(ds)
+        with open(gzip_filter_netcdf4_file, 'rb') as file:
+            file.seek(chunk_info.byte_offset)
+            bytes_read = file.read(chunk_info.size)
+            decoded = codecs[0].decode(bytes_read)
+            assert decoded == np_uncompressed.tobytes()
diff --git a/virtualizarr/tests/test_readers/test_hdf_integration.py b/virtualizarr/tests/test_readers/test_hdf_integration.py
new file mode 100644
index 0000000..45bfadc
--- /dev/null
+++ b/virtualizarr/tests/test_readers/test_hdf_integration.py
@@ -0,0 +1,21 @@
+import fsspec
+import numpy
+import xarray as xr
+
+import virtualizarr
+from virtualizarr.kerchunk import FileType
+
+
+class TestIntegration:
+    def test_gzip_filter_end_to_end(self, tmpdir, gzip_filter_xarray_netcdf4_file):
+        virtual_ds = virtualizarr.open_virtual_dataset(
+            gzip_filter_xarray_netcdf4_file,
+            filetype=FileType("netcdf4")
+        )
+        kerchunk_file = f"{tmpdir}/gzip_kerchunk.json"
+        virtual_ds.virtualize.to_kerchunk(kerchunk_file, format="json")
+        fs = fsspec.filesystem("reference", fo=kerchunk_file)
+        m = fs.get_mapper("")
+
+        ds = xr.open_dataset(m, engine="kerchunk")
+        assert isinstance(ds.air.values[0][0][0], numpy.float64)
diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py
index 9629a34..24ba973 100644
--- a/virtualizarr/xarray.py
+++ b/virtualizarr/xarray.py
@@ -128,48 +128,47 @@ def open_virtual_dataset(
         )
         ds_attrs = kerchunk.fully_decode_arr_refs(vds_refs["refs"]).get(".zattrs", {})
 
-        if indexes is None or len(loadable_variables) > 0:
-            # TODO we are reading a bunch of stuff we know we won't need here, e.g. all of the data variables...
-            # TODO it would also be nice if we could somehow consolidate this with the reading of the kerchunk references
-            # TODO really we probably want a dedicated xarray backend that iterates over all variables only once
-            ds = xr.open_dataset(filepath, drop_variables=drop_variables)
-
-            if indexes is None:
-                # add default indexes by reading data from file
-                indexes = {name: index for name, index in ds.xindexes.items()}
-            elif indexes != {}:
-                # TODO allow manual specification of index objects
-                raise NotImplementedError()
-            else:
-                indexes = dict(**indexes)  # for type hinting: to allow mutation
-
-            loadable_vars = {
-                name: var
-                for name, var in ds.variables.items()
-                if name in loadable_variables
-            }
-
-            # if we only read the indexes we can just close the file right away as nothing is lazy
-            if loadable_vars == {}:
-                ds.close()
+    if indexes is None or len(loadable_variables) > 0:
+        # TODO we are reading a bunch of stuff we know we won't need here, e.g. all of the data variables...
+        # TODO it would also be nice if we could somehow consolidate this with the reading of the kerchunk references
+        # TODO really we probably want a dedicated xarray backend that iterates over all variables only once
+        ds = xr.open_dataset(filepath, drop_variables=drop_variables)
+
+        if indexes is None:
+            # add default indexes by reading data from file
+            indexes = {name: index for name, index in ds.xindexes.items()}
+        elif indexes != {}:
+            # TODO allow manual specification of index objects
+            raise NotImplementedError()
         else:
-            loadable_vars = {}
-            indexes = {}
+            indexes = dict(**indexes)  # for type hinting: to allow mutation
 
-        vars = {**virtual_vars, **loadable_vars}
+        loadable_vars = {
+            name: var
+            for name, var in ds.variables.items()
+            if name in loadable_variables
+        }
 
-        data_vars, coords = separate_coords(vars, indexes)
+        # if we only read the indexes we can just close the file right away as nothing is lazy
+        if loadable_vars == {}:
+            ds.close()
+    else:
+        loadable_vars = {}
+        indexes = {}
 
-        vds = xr.Dataset(
-            data_vars,
-            coords=coords,
-            # indexes={},  # TODO should be added in a later version of xarray
-            attrs=ds_attrs,
-        )
+    vars = {**virtual_vars, **loadable_vars}
+
+    data_vars, coords = separate_coords(vars, indexes)
+    vds = xr.Dataset(
+        data_vars,
+        coords=coords,
+        # indexes={},  # TODO should be added in a later version of xarray
+        attrs=ds_attrs,
+    )
 
-        # TODO we should probably also use vds.set_close() to tell xarray how to close the file we opened
+    # TODO we should probably also use vds.set_close() to tell xarray how to close the file we opened
 
-        return vds
+    return vds
 
 
 def open_virtual_dataset_from_v3_store(

From c51e615ca0cd5396bde54868e439419fe9d9b9c8 Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Mon, 13 May 2024 12:36:29 -0600
Subject: [PATCH 11/68] Initial step for hdf5plugin supported codecs.

---
 virtualizarr/readers/hdf_filters.py           | 25 +++++++++++++++
 virtualizarr/tests/test_readers/conftest.py   | 31 +++++++++++++------
 .../tests/test_readers/test_hdf_filters.py    | 20 +++++++++---
 .../test_readers/test_hdf_integration.py      |  7 +++--
 4 files changed, 66 insertions(+), 17 deletions(-)

diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py
index 6070fc1..75f06bd 100644
--- a/virtualizarr/readers/hdf_filters.py
+++ b/virtualizarr/readers/hdf_filters.py
@@ -1,14 +1,30 @@
 from typing import List, Tuple, Union
 
 import h5py
+import hdf5plugin
 import numcodecs.registry as registry
 from numcodecs.abc import Codec
+from pydantic import BaseModel, validator
 
 _non_standard_filters = {
     "gzip": "zlib"
 }
 
 
+class BloscProperties(BaseModel):
+    blocksize: int
+    clevel: int
+    shuffle: int
+    cname: str
+
+    @validator("cname", pre=True)
+    def get_cname_from_code(cls, v):
+        blosc_compressor_codes = {
+            value: key for key, value in hdf5plugin._filters.Blosc._Blosc__COMPRESSIONS.items()
+        }
+        return blosc_compressor_codes[v]
+
+
 def _filter_to_codec(filter_id: str, filter_properties: Union[int, Tuple] = None) -> Codec:
     try:
         id = int(filter_id)
@@ -21,6 +37,15 @@ def _filter_to_codec(filter_id: str, filter_properties: Union[int, Tuple] = None
         conf = {"id": id}
         if id == "zlib":
             conf["level"] = filter_properties
+    elif isinstance(id, int):
+        filter = hdf5plugin.get_filters(id)[0]
+        id = filter.filter_name
+        if id == "blosc":
+            blosc_props = BloscProperties(**{k: v for k, v in
+                                             zip(BloscProperties.__fields__.keys(),
+                                                 filter_properties[-4:])})
+            conf = blosc_props.model_dump()
+            conf["id"] = id
 
     codec = registry.get_codec(conf)
     return codec
diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py
index 4f0d4fc..cc9331e 100644
--- a/virtualizarr/tests/test_readers/conftest.py
+++ b/virtualizarr/tests/test_readers/conftest.py
@@ -1,4 +1,5 @@
 import h5py
+import hdf5plugin
 import numpy as np
 import pytest
 import xarray as xr
@@ -132,22 +133,32 @@ def np_uncompressed():
     return np.arange(100)
 
 
-@pytest.fixture
-def gzip_filter_netcdf4_file(tmpdir, np_uncompressed):
-    filepath = f"{tmpdir}/gzip.nc"
+@pytest.fixture(params=["gzip", "blosc"])
+def filter_encoded_netcdf4_file(tmpdir, np_uncompressed, request):
+    filepath = f"{tmpdir}/{request.param}.nc"
     f = h5py.File(filepath, "w")
-    f.create_dataset(name="data", data=np_uncompressed, compression="gzip", compression_opts=1)
+    if request.param == "gzip":
+        f.create_dataset(name="data", data=np_uncompressed, compression="gzip", compression_opts=1)
+    if request.param == "blosc":
+        f.create_dataset(name="data", data=np_uncompressed,
+                         **hdf5plugin.Blosc(
+                             cname="lz4", clevel=9, shuffle=hdf5plugin.Blosc.SHUFFLE
+                         ))
     return filepath
 
 
-@pytest.fixture
-def gzip_filter_xarray_netcdf4_file(tmpdir):
+@pytest.fixture(params=["gzip"])
+def filter_encoded_xarray_netcdf4_files(tmpdir, request):
     ds = xr.tutorial.open_dataset("air_temperature")
     encoding = {}
+    if request.param == "gzip":
+        encoding_config = {
+            "zlib": True,
+            "complevel": 1
+        }
     for var_name in ds.variables:
-        #  encoding[var_name] = {"zlib": True, "compression_opts": 1}
-        encoding[var_name] = {"compression": "gzip", "compression_opts": 1}
+        encoding[var_name] = encoding_config
 
-    filepath = f"{tmpdir}/gzip_xarray.nc"
-    ds.to_netcdf(filepath, engine="h5netcdf", encoding=encoding)
+    filepath = f"{tmpdir}/{request.param}_xarray.nc"
+    ds.to_netcdf(filepath, engine="netcdf4", encoding=encoding)
     return filepath
diff --git a/virtualizarr/tests/test_readers/test_hdf_filters.py b/virtualizarr/tests/test_readers/test_hdf_filters.py
index 50a5d08..8094d4c 100644
--- a/virtualizarr/tests/test_readers/test_hdf_filters.py
+++ b/virtualizarr/tests/test_readers/test_hdf_filters.py
@@ -9,7 +9,7 @@
 
 
 class TestFilterToCodec:
-    def test_gzip_uses_zlib_nomcodec(self):
+    def test_gzip_uses_zlib_numcodec(self):
         codec = _filter_to_codec("gzip", 1)
         assert isinstance(codec, numcodecs.zlib.Zlib)
 
@@ -17,14 +17,26 @@ def test_lzf_not_available(self):
         with pytest.raises(ValueError, match="codec not available"):
             _filter_to_codec("lzf")
 
+    def test_blosc(self):
+        codec = _filter_to_codec("32001", (2, 2, 8, 800, 9, 2, 1))
+        assert isinstance(codec, numcodecs.blosc.Blosc)
+        expected_config = {
+            "id": "blosc",
+            "blocksize": 800,
+            "clevel": 9,
+            "shuffle": 2,
+            "cname": "lz4",
+        }
+        assert codec.get_config() == expected_config
+
 
 class TestCodecsFromDataSet:
-    def test_gzip(self, np_uncompressed, gzip_filter_netcdf4_file):
-        f = h5py.File(gzip_filter_netcdf4_file)
+    def test_numcodec_decoding(self, np_uncompressed, filter_encoded_netcdf4_file):
+        f = h5py.File(filter_encoded_netcdf4_file)
         ds = f["data"]
         chunk_info = ds.id.get_chunk_info(0)
         codecs = codecs_from_dataset(ds)
-        with open(gzip_filter_netcdf4_file, 'rb') as file:
+        with open(filter_encoded_netcdf4_file, 'rb') as file:
             file.seek(chunk_info.byte_offset)
             bytes_read = file.read(chunk_info.size)
             decoded = codecs[0].decode(bytes_read)
diff --git a/virtualizarr/tests/test_readers/test_hdf_integration.py b/virtualizarr/tests/test_readers/test_hdf_integration.py
index 45bfadc..94fc0c1 100644
--- a/virtualizarr/tests/test_readers/test_hdf_integration.py
+++ b/virtualizarr/tests/test_readers/test_hdf_integration.py
@@ -7,12 +7,13 @@
 
 
 class TestIntegration:
-    def test_gzip_filter_end_to_end(self, tmpdir, gzip_filter_xarray_netcdf4_file):
+    def test_filters_end_to_end(self, tmpdir,
+                                    filter_encoded_xarray_netcdf4_files):
         virtual_ds = virtualizarr.open_virtual_dataset(
-            gzip_filter_xarray_netcdf4_file,
+            filter_encoded_xarray_netcdf4_files,
             filetype=FileType("netcdf4")
         )
-        kerchunk_file = f"{tmpdir}/gzip_kerchunk.json"
+        kerchunk_file = f"{tmpdir}/kerchunk.json"
         virtual_ds.virtualize.to_kerchunk(kerchunk_file, format="json")
         fs = fsspec.filesystem("reference", fo=kerchunk_file)
         m = fs.get_mapper("")

From 0083f77103c909079427ce3471e65af7fb3bfc54 Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Thu, 16 May 2024 16:24:57 -0400
Subject: [PATCH 12/68] Small commit to check compression support in CI
 environment.

---
 pyproject.toml                              | 1 +
 virtualizarr/tests/test_readers/conftest.py | 9 +++++++--
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 4818b5f..bba695e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -41,6 +41,7 @@ test = [
     "scipy",
     "pooch",
     "ruff",
+    "netcdf4",
 ]
 
 
diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py
index cc9331e..8dc82c3 100644
--- a/virtualizarr/tests/test_readers/conftest.py
+++ b/virtualizarr/tests/test_readers/conftest.py
@@ -147,7 +147,7 @@ def filter_encoded_netcdf4_file(tmpdir, np_uncompressed, request):
     return filepath
 
 
-@pytest.fixture(params=["gzip"])
+@pytest.fixture(params=["gzip", "blosc_lz"])
 def filter_encoded_xarray_netcdf4_files(tmpdir, request):
     ds = xr.tutorial.open_dataset("air_temperature")
     encoding = {}
@@ -156,9 +156,14 @@ def filter_encoded_xarray_netcdf4_files(tmpdir, request):
             "zlib": True,
             "complevel": 1
         }
+    if request.param == "blosc_lz":
+        encoding_config = {
+            "compression": "blosc_lz",
+        }
+
     for var_name in ds.variables:
         encoding[var_name] = encoding_config
 
     filepath = f"{tmpdir}/{request.param}_xarray.nc"
-    ds.to_netcdf(filepath, engine="netcdf4", encoding=encoding)
+    ds.to_netcdf(filepath, engine="h5netcdf", encoding=encoding)
     return filepath

From 207c4b5cb411637070dc9a5f7011a0e0c98ef877 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Sun, 19 May 2024 21:34:26 +0000
Subject: [PATCH 13/68] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 virtualizarr/readers/hdf.py                   | 16 ++++----------
 virtualizarr/readers/hdf_filters.py           | 22 ++++++++++++-------
 virtualizarr/tests/test_readers/conftest.py   | 18 +++++++--------
 virtualizarr/tests/test_readers/test_hdf.py   |  5 +----
 .../tests/test_readers/test_hdf_filters.py    |  2 +-
 .../test_readers/test_hdf_integration.py      |  6 ++---
 virtualizarr/xarray.py                        |  5 ++---
 7 files changed, 33 insertions(+), 41 deletions(-)

diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py
index 7d95d99..78e718e 100644
--- a/virtualizarr/readers/hdf.py
+++ b/virtualizarr/readers/hdf.py
@@ -36,15 +36,11 @@ def _dataset_chunk_manifest(path: str, dataset: h5py.Dataset) -> ChunkManifest:
             key_list = [0] * (len(dataset.shape) or 1)
             key = ".".join(map(str, key_list))
             chunk_entry = ChunkEntry(
-                path=path,
-                offset=dsid.get_offset(),
-                length=dsid.get_storage_size()
+                path=path, offset=dsid.get_offset(), length=dsid.get_storage_size()
             )
             chunk_key = ChunkKey(key)
             chunk_entries = {chunk_key: chunk_entry}
-            chunk_manifest = ChunkManifest(
-                entries=chunk_entries
-            )
+            chunk_manifest = ChunkManifest(entries=chunk_entries)
             return chunk_manifest
     else:
         num_chunks = dsid.get_num_chunks()
@@ -60,9 +56,7 @@ def get_key(blob):
 
         def store_chunk_entry(blob):
             chunk_entries[get_key(blob)] = ChunkEntry(
-                path=path,
-                offset=blob.byte_offset,
-                length=blob.size
+                path=path, offset=blob.byte_offset, length=blob.size
             )
 
         has_chunk_iter = callable(getattr(dsid, "chunk_iter", None))
@@ -72,9 +66,7 @@ def store_chunk_entry(blob):
             for index in range(num_chunks):
                 store_chunk_entry(dsid.get_chunk_info(index))
 
-        chunk_manifest = ChunkManifest(
-            entries=chunk_entries
-        )
+        chunk_manifest = ChunkManifest(entries=chunk_entries)
         return chunk_manifest
 
 
diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py
index 75f06bd..77e7037 100644
--- a/virtualizarr/readers/hdf_filters.py
+++ b/virtualizarr/readers/hdf_filters.py
@@ -6,9 +6,7 @@
 from numcodecs.abc import Codec
 from pydantic import BaseModel, validator
 
-_non_standard_filters = {
-    "gzip": "zlib"
-}
+_non_standard_filters = {"gzip": "zlib"}
 
 
 class BloscProperties(BaseModel):
@@ -20,12 +18,15 @@ class BloscProperties(BaseModel):
     @validator("cname", pre=True)
     def get_cname_from_code(cls, v):
         blosc_compressor_codes = {
-            value: key for key, value in hdf5plugin._filters.Blosc._Blosc__COMPRESSIONS.items()
+            value: key
+            for key, value in hdf5plugin._filters.Blosc._Blosc__COMPRESSIONS.items()
         }
         return blosc_compressor_codes[v]
 
 
-def _filter_to_codec(filter_id: str, filter_properties: Union[int, Tuple] = None) -> Codec:
+def _filter_to_codec(
+    filter_id: str, filter_properties: Union[int, Tuple] = None
+) -> Codec:
     try:
         id = int(filter_id)
     except ValueError:
@@ -41,9 +42,14 @@ def _filter_to_codec(filter_id: str, filter_properties: Union[int, Tuple] = None
         filter = hdf5plugin.get_filters(id)[0]
         id = filter.filter_name
         if id == "blosc":
-            blosc_props = BloscProperties(**{k: v for k, v in
-                                             zip(BloscProperties.__fields__.keys(),
-                                                 filter_properties[-4:])})
+            blosc_props = BloscProperties(
+                **{
+                    k: v
+                    for k, v in zip(
+                        BloscProperties.__fields__.keys(), filter_properties[-4:]
+                    )
+                }
+            )
             conf = blosc_props.model_dump()
             conf["id"] = id
 
diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py
index aa66f93..53c9630 100644
--- a/virtualizarr/tests/test_readers/conftest.py
+++ b/virtualizarr/tests/test_readers/conftest.py
@@ -138,12 +138,15 @@ def filter_encoded_netcdf4_file(tmpdir, np_uncompressed, request):
     filepath = f"{tmpdir}/{request.param}.nc"
     f = h5py.File(filepath, "w")
     if request.param == "gzip":
-        f.create_dataset(name="data", data=np_uncompressed, compression="gzip", compression_opts=1)
+        f.create_dataset(
+            name="data", data=np_uncompressed, compression="gzip", compression_opts=1
+        )
     if request.param == "blosc":
-        f.create_dataset(name="data", data=np_uncompressed,
-                         **hdf5plugin.Blosc(
-                             cname="lz4", clevel=9, shuffle=hdf5plugin.Blosc.SHUFFLE
-                         ))
+        f.create_dataset(
+            name="data",
+            data=np_uncompressed,
+            **hdf5plugin.Blosc(cname="lz4", clevel=9, shuffle=hdf5plugin.Blosc.SHUFFLE),
+        )
     return filepath
 
 
@@ -152,10 +155,7 @@ def filter_encoded_xarray_netcdf4_files(tmpdir, request):
     ds = xr.tutorial.open_dataset("air_temperature")
     encoding = {}
     if request.param == "gzip":
-        encoding_config = {
-            "zlib": True,
-            "complevel": 1
-        }
+        encoding_config = {"zlib": True, "complevel": 1}
 
     for var_name in ds.variables:
         encoding[var_name] = encoding_config
diff --git a/virtualizarr/tests/test_readers/test_hdf.py b/virtualizarr/tests/test_readers/test_hdf.py
index 0d5a16d..a83bfc3 100644
--- a/virtualizarr/tests/test_readers/test_hdf.py
+++ b/virtualizarr/tests/test_readers/test_hdf.py
@@ -105,8 +105,5 @@ def test_groups_not_implemented(self, group_netcdf4_file):
             virtual_vars_from_hdf(group_netcdf4_file)
 
     def test_drop_variables(self, multiple_datasets_netcdf4_file):
-        variables = virtual_vars_from_hdf(
-            multiple_datasets_netcdf4_file,
-            ["data2"]
-        )
+        variables = virtual_vars_from_hdf(multiple_datasets_netcdf4_file, ["data2"])
         assert "data2" not in variables.keys()
diff --git a/virtualizarr/tests/test_readers/test_hdf_filters.py b/virtualizarr/tests/test_readers/test_hdf_filters.py
index 8094d4c..28b5d69 100644
--- a/virtualizarr/tests/test_readers/test_hdf_filters.py
+++ b/virtualizarr/tests/test_readers/test_hdf_filters.py
@@ -36,7 +36,7 @@ def test_numcodec_decoding(self, np_uncompressed, filter_encoded_netcdf4_file):
         ds = f["data"]
         chunk_info = ds.id.get_chunk_info(0)
         codecs = codecs_from_dataset(ds)
-        with open(filter_encoded_netcdf4_file, 'rb') as file:
+        with open(filter_encoded_netcdf4_file, "rb") as file:
             file.seek(chunk_info.byte_offset)
             bytes_read = file.read(chunk_info.size)
             decoded = codecs[0].decode(bytes_read)
diff --git a/virtualizarr/tests/test_readers/test_hdf_integration.py b/virtualizarr/tests/test_readers/test_hdf_integration.py
index 94fc0c1..b31289c 100644
--- a/virtualizarr/tests/test_readers/test_hdf_integration.py
+++ b/virtualizarr/tests/test_readers/test_hdf_integration.py
@@ -7,11 +7,9 @@
 
 
 class TestIntegration:
-    def test_filters_end_to_end(self, tmpdir,
-                                    filter_encoded_xarray_netcdf4_files):
+    def test_filters_end_to_end(self, tmpdir, filter_encoded_xarray_netcdf4_files):
         virtual_ds = virtualizarr.open_virtual_dataset(
-            filter_encoded_xarray_netcdf4_files,
-            filetype=FileType("netcdf4")
+            filter_encoded_xarray_netcdf4_files, filetype=FileType("netcdf4")
         )
         kerchunk_file = f"{tmpdir}/kerchunk.json"
         virtual_ds.virtualize.to_kerchunk(kerchunk_file, format="json")
diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py
index 7264565..d8b6a08 100644
--- a/virtualizarr/xarray.py
+++ b/virtualizarr/xarray.py
@@ -20,8 +20,8 @@
     _automatically_determine_filetype,
 )
 from virtualizarr.manifests import ChunkManifest, ManifestArray
-from virtualizarr.utils import _fsspec_openfile_from_filepath
 from virtualizarr.readers.hdf import attrs_from_root_group, virtual_vars_from_hdf
+from virtualizarr.utils import _fsspec_openfile_from_filepath
 from virtualizarr.zarr import (
     attrs_from_zarr_group_json,
     dataset_to_zarr,
@@ -109,8 +109,7 @@ def open_virtual_dataset(
     if filetype.name.lower() == "netcdf4":
         print("wat")
         virtual_vars = virtual_vars_from_hdf(
-            path=filepath,
-            drop_variables=drop_variables
+            path=filepath, drop_variables=drop_variables
         )
         ds_attrs = attrs_from_root_group(path=filepath)
     if filetype == "zarr_v3":

From c57380058a5ad6ddbd908d54b1edd85b1f74f91d Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Sun, 19 May 2024 16:12:50 -0600
Subject: [PATCH 14/68] Fix mypy complaints for hdf_filters.

---
 virtualizarr/readers/hdf_filters.py | 28 ++++++++++++++++------------
 1 file changed, 16 insertions(+), 12 deletions(-)

diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py
index 75f06bd..7a8bcc8 100644
--- a/virtualizarr/readers/hdf_filters.py
+++ b/virtualizarr/readers/hdf_filters.py
@@ -1,4 +1,4 @@
-from typing import List, Tuple, Union
+from typing import List, Optional, Tuple, TypedDict, Union
 
 import h5py
 import hdf5plugin
@@ -25,26 +25,30 @@ def get_cname_from_code(cls, v):
         return blosc_compressor_codes[v]
 
 
-def _filter_to_codec(filter_id: str, filter_properties: Union[int, Tuple] = None) -> Codec:
+def _filter_to_codec(filter_id: str, filter_properties: Union[int, None, Tuple] = None) -> Codec:
+    id_int = None
+    id_str = None
     try:
-        id = int(filter_id)
+        id_int = int(filter_id)
     except ValueError:
-        id = filter_id
+        id_str = filter_id
 
-    if isinstance(id, str):
-        if id in _non_standard_filters.keys():
-            id = _non_standard_filters[id]
+    if id_str:
+        if id_str in _non_standard_filters.keys():
+            id = _non_standard_filters[id_str]
+        else:
+            id = id_str
         conf = {"id": id}
         if id == "zlib":
-            conf["level"] = filter_properties
-    elif isinstance(id, int):
-        filter = hdf5plugin.get_filters(id)[0]
+            conf["level"] = filter_properties # type: ignore[assignment]
+    if id_int:
+        filter = hdf5plugin.get_filters(id_int)[0]
         id = filter.filter_name
-        if id == "blosc":
+        if id == "blosc" and isinstance(filter_properties, tuple):
             blosc_props = BloscProperties(**{k: v for k, v in
                                              zip(BloscProperties.__fields__.keys(),
                                                  filter_properties[-4:])})
-            conf = blosc_props.model_dump()
+            conf = blosc_props.model_dump() # type: ignore[assignment]
             conf["id"] = id
 
     codec = registry.get_codec(conf)

From 588e06b507e8661644e33923ad0295e255152e1e Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Sun, 19 May 2024 16:22:39 -0600
Subject: [PATCH 15/68] Local pre-commit fix for hdf_filters.

---
 virtualizarr/readers/hdf_filters.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py
index a3868eb..dfe1c1f 100644
--- a/virtualizarr/readers/hdf_filters.py
+++ b/virtualizarr/readers/hdf_filters.py
@@ -1,4 +1,4 @@
-from typing import List, Optional, Tuple, TypedDict, Union
+from typing import List, Tuple, Union
 
 import h5py
 import hdf5plugin

From 725333e06fad83d4d763317faca5f41167a2c98f Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Mon, 20 May 2024 20:13:44 -0600
Subject: [PATCH 16/68] Use fsspec reader_options introduced in #37.

---
 virtualizarr/readers/hdf.py | 22 ++++++++++++++++------
 virtualizarr/xarray.py      |  7 ++++---
 2 files changed, 20 insertions(+), 9 deletions(-)

diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py
index 78e718e..19d99b3 100644
--- a/virtualizarr/readers/hdf.py
+++ b/virtualizarr/readers/hdf.py
@@ -1,6 +1,5 @@
 from typing import List, Mapping, Optional, Union
 
-import fsspec
 import h5py
 import numpy as np
 import xarray as xr
@@ -8,6 +7,7 @@
 from virtualizarr.manifests import ChunkEntry, ChunkManifest, ManifestArray
 from virtualizarr.readers.hdf_filters import codecs_from_dataset
 from virtualizarr.types import ChunkKey
+from virtualizarr.utils import _fsspec_openfile_from_filepath
 from virtualizarr.zarr import ZArray
 
 
@@ -185,11 +185,15 @@ def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> xr.Variable:
 def virtual_vars_from_hdf(
     path: str,
     drop_variables: Optional[List[str]] = None,
+    reader_options: Optional[dict] = {
+        "storage_options": {"key": "", "secret": "", "anon": True}
+    },
 ) -> Mapping[str, xr.Variable]:
     if drop_variables is None:
         drop_variables = []
-    fs, file_path = fsspec.core.url_to_fs(path)
-    open_file = fs.open(path, "rb")
+    open_file = _fsspec_openfile_from_filepath(
+        filepath=path, reader_options=reader_options
+    )
     f = h5py.File(open_file, mode="r")
     variables = {}
     for key in f.keys():
@@ -203,9 +207,15 @@ def virtual_vars_from_hdf(
     return variables
 
 
-def attrs_from_root_group(path: str):
-    fs, file_path = fsspec.core.url_to_fs(path)
-    open_file = fs.open(path, "rb")
+def attrs_from_root_group(
+    path: str,
+    reader_options: Optional[dict] = {
+        "storage_options": {"key": "", "secret": "", "anon": True}
+    },
+):
+    open_file = _fsspec_openfile_from_filepath(
+        filepath=path, reader_options=reader_options
+    )
     f = h5py.File(open_file, mode="r")
     attrs = _extract_attrs(f)
     return attrs
diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py
index d8b6a08..8f810ee 100644
--- a/virtualizarr/xarray.py
+++ b/virtualizarr/xarray.py
@@ -107,11 +107,12 @@ def open_virtual_dataset(
     filetype = FileType(filetype)
 
     if filetype.name.lower() == "netcdf4":
-        print("wat")
         virtual_vars = virtual_vars_from_hdf(
-            path=filepath, drop_variables=drop_variables
+            path=filepath,
+            drop_variables=drop_variables,
+            reader_options=reader_options,
         )
-        ds_attrs = attrs_from_root_group(path=filepath)
+        ds_attrs = attrs_from_root_group(path=filepath, reader_options=reader_options)
     if filetype == "zarr_v3":
         # TODO is there a neat way of auto-detecting this?
         return open_virtual_dataset_from_v3_store(

From 72df10861ab0830531502885c0aaa3ebf3de4dee Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Mon, 20 May 2024 20:40:38 -0600
Subject: [PATCH 17/68] Fix incorrect zarr_v3 if block position from merge
 commit ef0d7a8.

---
 virtualizarr/xarray.py | 128 +++++++++++++++++++++--------------------
 1 file changed, 66 insertions(+), 62 deletions(-)

diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py
index 8f810ee..d76e2a6 100644
--- a/virtualizarr/xarray.py
+++ b/virtualizarr/xarray.py
@@ -101,82 +101,86 @@ def open_virtual_dataset(
 
     if virtual_array_class is not ManifestArray:
         raise NotImplementedError()
-
-    if filetype is None:
-        filetype = _automatically_determine_filetype(filepath=filepath)
-    filetype = FileType(filetype)
-
-    if filetype.name.lower() == "netcdf4":
-        virtual_vars = virtual_vars_from_hdf(
-            path=filepath,
-            drop_variables=drop_variables,
-            reader_options=reader_options,
-        )
-        ds_attrs = attrs_from_root_group(path=filepath, reader_options=reader_options)
     if filetype == "zarr_v3":
         # TODO is there a neat way of auto-detecting this?
         return open_virtual_dataset_from_v3_store(
             storepath=filepath, drop_variables=drop_variables, indexes=indexes
         )
     else:
-        # this is the only place we actually always need to use kerchunk directly
-        # TODO avoid even reading byte ranges for variables that will be dropped later anyway?
-        vds_refs = kerchunk.read_kerchunk_references_from_file(
-            filepath=filepath,
-            filetype=filetype,
-        )
-        virtual_vars = virtual_vars_from_kerchunk_refs(
-            vds_refs,
-            drop_variables=drop_variables + loadable_variables,
-            virtual_array_class=virtual_array_class,
-        )
-        ds_attrs = kerchunk.fully_decode_arr_refs(vds_refs["refs"]).get(".zattrs", {})
-
-    if indexes is None or len(loadable_variables) > 0:
-        # TODO we are reading a bunch of stuff we know we won't need here, e.g. all of the data variables...
-        # TODO it would also be nice if we could somehow consolidate this with the reading of the kerchunk references
-        # TODO really we probably want a dedicated xarray backend that iterates over all variables only once
-        fpath = _fsspec_openfile_from_filepath(
-            filepath=filepath, reader_options=reader_options
-        )
+        if filetype is None:
+            filetype = _automatically_determine_filetype(filepath=filepath)
+        filetype = FileType(filetype)
+
+        if filetype.name.lower() == "netcdf4":
+            virtual_vars = virtual_vars_from_hdf(
+                path=filepath,
+                drop_variables=drop_variables,
+                reader_options=reader_options,
+            )
+            ds_attrs = attrs_from_root_group(
+                path=filepath, reader_options=reader_options
+            )
+        else:
+            # this is the only place we actually always need to use kerchunk directly
+            # TODO avoid even reading byte ranges for variables that will be dropped later anyway?
+            vds_refs = kerchunk.read_kerchunk_references_from_file(
+                filepath=filepath,
+                filetype=filetype,
+            )
+            virtual_vars = virtual_vars_from_kerchunk_refs(
+                vds_refs,
+                drop_variables=drop_variables + loadable_variables,
+                virtual_array_class=virtual_array_class,
+            )
+            ds_attrs = kerchunk.fully_decode_arr_refs(vds_refs["refs"]).get(
+                ".zattrs", {}
+            )
 
-        ds = xr.open_dataset(fpath, drop_variables=drop_variables)
+        if indexes is None or len(loadable_variables) > 0:
+            # TODO we are reading a bunch of stuff we know we won't need here, e.g. all of the data variables...
+            # TODO it would also be nice if we could somehow consolidate this with the reading of the kerchunk references
+            # TODO really we probably want a dedicated xarray backend that iterates over all variables only once
+            fpath = _fsspec_openfile_from_filepath(
+                filepath=filepath, reader_options=reader_options
+            )
 
-        if indexes is None:
-            # add default indexes by reading data from file
-            indexes = {name: index for name, index in ds.xindexes.items()}
-        elif indexes != {}:
-            # TODO allow manual specification of index objects
-            raise NotImplementedError()
-        else:
-            indexes = dict(**indexes)  # for type hinting: to allow mutation
+            ds = xr.open_dataset(fpath, drop_variables=drop_variables)
 
-        loadable_vars = {
-            name: var
-            for name, var in ds.variables.items()
-            if name in loadable_variables
-        }
+            if indexes is None:
+                # add default indexes by reading data from file
+                indexes = {name: index for name, index in ds.xindexes.items()}
+            elif indexes != {}:
+                # TODO allow manual specification of index objects
+                raise NotImplementedError()
+            else:
+                indexes = dict(**indexes)  # for type hinting: to allow mutation
 
-        # if we only read the indexes we can just close the file right away as nothing is lazy
-        if loadable_vars == {}:
-            ds.close()
-    else:
-        loadable_vars = {}
-        indexes = {}
+            loadable_vars = {
+                name: var
+                for name, var in ds.variables.items()
+                if name in loadable_variables
+            }
 
-    vars = {**virtual_vars, **loadable_vars}
+            # if we only read the indexes we can just close the file right away as nothing is lazy
+            if loadable_vars == {}:
+                ds.close()
+        else:
+            loadable_vars = {}
+            indexes = {}
 
-    data_vars, coords = separate_coords(vars, indexes)
-    vds = xr.Dataset(
-        data_vars,
-        coords=coords,
-        # indexes={},  # TODO should be added in a later version of xarray
-        attrs=ds_attrs,
-    )
+        vars = {**virtual_vars, **loadable_vars}
 
-    # TODO we should probably also use vds.set_close() to tell xarray how to close the file we opened
+        data_vars, coords = separate_coords(vars, indexes)
+        vds = xr.Dataset(
+            data_vars,
+            coords=coords,
+            # indexes={},  # TODO should be added in a later version of xarray
+            attrs=ds_attrs,
+        )
 
-    return vds
+        # TODO we should probably also use vds.set_close() to tell xarray how to close the file we opened
+
+        return vds
 
 
 def open_virtual_dataset_from_v3_store(

From d1e85cb169adc3851951afc2a64fcdec6180243c Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Tue, 21 May 2024 08:48:05 -0600
Subject: [PATCH 18/68] Fix early return from hdf _extract_attrs.

---
 virtualizarr/readers/hdf.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py
index 19d99b3..be93237 100644
--- a/virtualizarr/readers/hdf.py
+++ b/virtualizarr/readers/hdf.py
@@ -155,7 +155,7 @@ def _extract_attrs(h5obj: Union[h5py.Dataset, h5py.Group]):
             continue
 
         attrs[n] = v
-        return attrs
+    return attrs
 
 
 def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> xr.Variable:

From 1e2b3436fd086f8188c516f2fda4f6cd3a521325 Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Tue, 21 May 2024 09:23:50 -0600
Subject: [PATCH 19/68] Test that _extract_attrs correctly handles multiple
 attributes.

---
 virtualizarr/tests/test_readers/conftest.py |  3 ++-
 virtualizarr/tests/test_readers/test_hdf.py | 16 +++++++++++-----
 2 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py
index 53c9630..fe2ec88 100644
--- a/virtualizarr/tests/test_readers/conftest.py
+++ b/virtualizarr/tests/test_readers/conftest.py
@@ -93,12 +93,13 @@ def chunked_dimensions_netcdf4_file(tmpdir):
 
 
 @pytest.fixture
-def string_attribute_netcdf4_file(tmpdir):
+def string_attributes_netcdf4_file(tmpdir):
     filepath = f"{tmpdir}/attributes.nc"
     f = h5py.File(filepath, "w")
     data = np.random.random((10, 10))
     f.create_dataset(name="data", data=data, chunks=None)
     f["data"].attrs["attribute_name"] = "attribute_name"
+    f["data"].attrs["attribute_name2"] = "attribute_name2"
     return filepath
 
 
diff --git a/virtualizarr/tests/test_readers/test_hdf.py b/virtualizarr/tests/test_readers/test_hdf.py
index a83bfc3..a67352e 100644
--- a/virtualizarr/tests/test_readers/test_hdf.py
+++ b/virtualizarr/tests/test_readers/test_hdf.py
@@ -75,16 +75,16 @@ def test_not_chunked_dataset(self, single_dimension_scale_netcdf4_file):
         var = _dataset_to_variable(single_dimension_scale_netcdf4_file, ds)
         assert var.chunks == (2,)
 
-    def test_dataset_attributes(self, string_attribute_netcdf4_file):
-        f = h5py.File(string_attribute_netcdf4_file)
+    def test_dataset_attributes(self, string_attributes_netcdf4_file):
+        f = h5py.File(string_attributes_netcdf4_file)
         ds = f["data"]
-        var = _dataset_to_variable(string_attribute_netcdf4_file, ds)
+        var = _dataset_to_variable(string_attributes_netcdf4_file, ds)
         assert var.attrs["attribute_name"] == "attribute_name"
 
 
 class TestExtractAttributes:
-    def test_string_attribute(self, string_attribute_netcdf4_file):
-        f = h5py.File(string_attribute_netcdf4_file)
+    def test_string_attribute(self, string_attributes_netcdf4_file):
+        f = h5py.File(string_attributes_netcdf4_file)
         ds = f["data"]
         attrs = _extract_attrs(ds)
         assert attrs["attribute_name"] == "attribute_name"
@@ -94,6 +94,12 @@ def test_root_attribute(self, root_attributes_netcdf4_file):
         attrs = _extract_attrs(f)
         assert attrs["attribute_name"] == "attribute_name"
 
+    def test_multiple_attributes(self, string_attributes_netcdf4_file):
+        f = h5py.File(string_attributes_netcdf4_file)
+        ds = f["data"]
+        attrs = _extract_attrs(ds)
+        assert len(attrs.keys()) == 2
+
 
 class TestVirtualVarsFromHDF:
     def test_variable_with_dimensions(self, chunked_dimensions_netcdf4_file):

From 7f1c1897dcad92cb988ea7e14a165d63fe23dad6 Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Wed, 22 May 2024 14:16:12 -0600
Subject: [PATCH 20/68] Initial attempt at scale and offset via numcodecs.

---
 virtualizarr/readers/hdf.py         | 14 ++++++++---
 virtualizarr/readers/hdf_filters.py | 36 ++++++++++++++++++++++++++++-
 2 files changed, 46 insertions(+), 4 deletions(-)

diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py
index be93237..c251866 100644
--- a/virtualizarr/readers/hdf.py
+++ b/virtualizarr/readers/hdf.py
@@ -5,7 +5,7 @@
 import xarray as xr
 
 from virtualizarr.manifests import ChunkEntry, ChunkManifest, ManifestArray
-from virtualizarr.readers.hdf_filters import codecs_from_dataset
+from virtualizarr.readers.hdf_filters import cfcodec_from_dataset, codecs_from_dataset
 from virtualizarr.types import ChunkKey
 from virtualizarr.utils import _fsspec_openfile_from_filepath
 from virtualizarr.zarr import ZArray
@@ -163,11 +163,20 @@ def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> xr.Variable:
     # https://github.com/zarr-developers/zarr-python/blob/main/zarr/creation.py#L62-L66
     chunks = dataset.chunks if dataset.chunks else dataset.shape
     codecs = codecs_from_dataset(dataset)
+    cfcodec = cfcodec_from_dataset(dataset)
+    attrs = _extract_attrs(dataset)
+    if cfcodec:
+        codecs.append(cfcodec["codec"])
+        dtype = cfcodec["target_dtype"]
+        attrs.pop("scale_factor", None)
+        attrs.pop("add_offset", None)
+    else:
+        dtype = dataset.dtype
     filters = [codec.get_config() for codec in codecs]
     zarray = ZArray(
         chunks=chunks,
         compressor=None,
-        dtype=dataset.dtype,
+        dtype=dtype,
         fill_value=dataset.fillvalue,
         filters=filters,
         order="C",
@@ -177,7 +186,6 @@ def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> xr.Variable:
     manifest = _dataset_chunk_manifest(path, dataset)
     marray = ManifestArray(zarray=zarray, chunkmanifest=manifest)
     dims = _dataset_dims(dataset)
-    attrs = _extract_attrs(dataset)
     variable = xr.Variable(data=marray, dims=dims, attrs=attrs)
     return variable
 
diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py
index dfe1c1f..169eab9 100644
--- a/virtualizarr/readers/hdf_filters.py
+++ b/virtualizarr/readers/hdf_filters.py
@@ -1,10 +1,13 @@
-from typing import List, Tuple, Union
+from typing import List, Tuple, TypedDict, Union
 
 import h5py
 import hdf5plugin
 import numcodecs.registry as registry
+import numpy as np
 from numcodecs.abc import Codec
+from numcodecs.fixedscaleoffset import FixedScaleOffset
 from pydantic import BaseModel, validator
+from xarray.coding.variables import _choose_float_dtype
 
 _non_standard_filters = {"gzip": "zlib"}
 
@@ -24,6 +27,11 @@ def get_cname_from_code(cls, v):
         return blosc_compressor_codes[v]
 
 
+class CFCodec(TypedDict):
+    target_dtype: np.dtype
+    codec: Codec
+
+
 def _filter_to_codec(
     filter_id: str, filter_properties: Union[int, None, Tuple] = None
 ) -> Codec:
@@ -61,6 +69,32 @@ def _filter_to_codec(
     return codec
 
 
+def cfcodec_from_dataset(dataset: h5py.Dataset) -> Codec | None:
+    attributes = {attr: dataset.attrs[attr] for attr in dataset.attrs}
+    mapping = {}
+    if "scale_factor" in attributes:
+        mapping["scale_factor"] = 1 / attributes["scale_factor"][0]
+    else:
+        mapping["scale_factor"] = 1
+    if "add_offset" in attributes:
+        mapping["add_offset"] = attributes["add_offset"]
+    else:
+        mapping["add_offset"] = 0
+    if mapping["scale_factor"] != 1 or mapping["add_offset"] != 0:
+        float_dtype = _choose_float_dtype(dtype=dataset.dtype, mapping=mapping)
+        target_dtype = np.dtype(float_dtype)
+        codec = FixedScaleOffset(
+            offset=mapping["add_offset"],
+            scale=mapping["scale_factor"],
+            dtype=target_dtype,
+            astype=dataset.dtype,
+        )
+        cfcodec = CFCodec(target_dtype=target_dtype, codec=codec)
+        return cfcodec
+    else:
+        return None
+
+
 def codecs_from_dataset(dataset: h5py.Dataset) -> List[Codec]:
     codecs = []
     for filter_id, filter_properties in dataset._filters.items():

From 908e332ae9860a7e7d36845633a7c9267ee72ca0 Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Thu, 23 May 2024 10:54:48 -0600
Subject: [PATCH 21/68] Tests for cfcodec_from_dataset.

---
 virtualizarr/tests/test_readers/conftest.py   | 10 +++++++
 .../tests/test_readers/test_hdf_filters.py    | 29 +++++++++++++++++++
 2 files changed, 39 insertions(+)

diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py
index fe2ec88..202cdd9 100644
--- a/virtualizarr/tests/test_readers/conftest.py
+++ b/virtualizarr/tests/test_readers/conftest.py
@@ -164,3 +164,13 @@ def filter_encoded_xarray_netcdf4_files(tmpdir, request):
     filepath = f"{tmpdir}/{request.param}_xarray.nc"
     ds.to_netcdf(filepath, engine="h5netcdf", encoding=encoding)
     return filepath
+
+
+@pytest.fixture
+def add_offset_netcdf4_file(tmpdir):
+    filepath = f"{tmpdir}/offset.nc"
+    f = h5py.File(filepath, "w")
+    data = np.random.random((10, 10))
+    f.create_dataset(name="data", data=data, chunks=None)
+    f["data"].attrs.create(name="add_offset", data=5)
+    return filepath
diff --git a/virtualizarr/tests/test_readers/test_hdf_filters.py b/virtualizarr/tests/test_readers/test_hdf_filters.py
index 28b5d69..dca9f40 100644
--- a/virtualizarr/tests/test_readers/test_hdf_filters.py
+++ b/virtualizarr/tests/test_readers/test_hdf_filters.py
@@ -1,9 +1,11 @@
 import h5py
 import numcodecs
+import numpy as np
 import pytest
 
 from virtualizarr.readers.hdf_filters import (
     _filter_to_codec,
+    cfcodec_from_dataset,
     codecs_from_dataset,
 )
 
@@ -41,3 +43,30 @@ def test_numcodec_decoding(self, np_uncompressed, filter_encoded_netcdf4_file):
             bytes_read = file.read(chunk_info.size)
             decoded = codecs[0].decode(bytes_read)
             assert decoded == np_uncompressed.tobytes()
+
+
+class TestCFCodecFromDataset:
+    def test_no_cf_convention(self, filter_encoded_netcdf4_file):
+        f = h5py.File(filter_encoded_netcdf4_file)
+        ds = f["data"]
+        cf_codec = cfcodec_from_dataset(ds)
+        assert cf_codec is None
+
+    def test_cf_scale_factor(self, netcdf4_file):
+        f = h5py.File(netcdf4_file)
+        ds = f["air"]
+        cf_codec = cfcodec_from_dataset(ds)
+        assert cf_codec["target_dtype"] == np.dtype(np.float64)
+        assert cf_codec["codec"].scale == 100.0
+        assert cf_codec["codec"].offset == 0
+        assert cf_codec["codec"].dtype == "<f8"
+        assert cf_codec["codec"].astype == "<i2"
+
+    def test_cf_add_offset(self, add_offset_netcdf4_file):
+        f = h5py.File(add_offset_netcdf4_file)
+        ds = f["data"]
+        cf_codec = cfcodec_from_dataset(ds)
+        assert cf_codec["target_dtype"] == np.dtype(np.float64)
+        assert cf_codec["codec"].scale == 1
+        assert cf_codec["codec"].offset == 5
+        assert cf_codec["codec"].dtype == "<f8"

From 0df332d04d438f291abc7b952a15e8851e6e9777 Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Fri, 24 May 2024 12:47:12 -0600
Subject: [PATCH 22/68] Temporarily relax integration tests to assert_allclose.

---
 virtualizarr/tests/test_integration.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/virtualizarr/tests/test_integration.py b/virtualizarr/tests/test_integration.py
index 064968b..1b9aad8 100644
--- a/virtualizarr/tests/test_integration.py
+++ b/virtualizarr/tests/test_integration.py
@@ -62,7 +62,7 @@ def test_kerchunk_roundtrip_no_concat(self, tmpdir, format):
         roundtrip = xr.open_dataset(f"{tmpdir}/refs.{format}", engine="kerchunk")
 
         # assert equal to original dataset
-        xrt.assert_equal(roundtrip, ds)
+        xrt.assert_allclose(roundtrip, ds)
 
     def test_kerchunk_roundtrip_concat(self, tmpdir, format):
         # set up example xarray dataset
@@ -89,7 +89,7 @@ def test_kerchunk_roundtrip_concat(self, tmpdir, format):
         roundtrip = xr.open_dataset(f"{tmpdir}/refs.{format}", engine="kerchunk")
 
         # assert equal to original dataset
-        xrt.assert_equal(roundtrip, ds)
+        xrt.assert_allclose(roundtrip, ds)
 
 
 def test_open_scalar_variable(tmpdir):

From ca6b236b36fabf96c0659556f2cff2ef59435d6c Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Fri, 24 May 2024 13:50:49 -0600
Subject: [PATCH 23/68] Add blosc_lz4 fixture parameterization to confirm
 libnetcdf environment.

---
 virtualizarr/tests/test_readers/conftest.py         | 13 +++++++++----
 .../tests/test_readers/test_hdf_integration.py      |  4 ++--
 2 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py
index 202cdd9..20d5433 100644
--- a/virtualizarr/tests/test_readers/conftest.py
+++ b/virtualizarr/tests/test_readers/conftest.py
@@ -134,7 +134,7 @@ def np_uncompressed():
     return np.arange(100)
 
 
-@pytest.fixture(params=["gzip", "blosc"])
+@pytest.fixture(params=["gzip", "blosc_lz4"])
 def filter_encoded_netcdf4_file(tmpdir, np_uncompressed, request):
     filepath = f"{tmpdir}/{request.param}.nc"
     f = h5py.File(filepath, "w")
@@ -142,7 +142,7 @@ def filter_encoded_netcdf4_file(tmpdir, np_uncompressed, request):
         f.create_dataset(
             name="data", data=np_uncompressed, compression="gzip", compression_opts=1
         )
-    if request.param == "blosc":
+    if request.param == "blosc_lz4":
         f.create_dataset(
             name="data",
             data=np_uncompressed,
@@ -151,18 +151,23 @@ def filter_encoded_netcdf4_file(tmpdir, np_uncompressed, request):
     return filepath
 
 
-@pytest.fixture(params=["gzip"])
-def filter_encoded_xarray_netcdf4_files(tmpdir, request):
+@pytest.fixture(params=["gzip", "blosc_zlib"])
+def filter_encoded_xarray_netcdf4_file(tmpdir, request):
     ds = xr.tutorial.open_dataset("air_temperature")
     encoding = {}
     if request.param == "gzip":
         encoding_config = {"zlib": True, "complevel": 1}
+    if "blosc" in request.param:
+        encoding_config = {
+            "compression": request.param,
+        }
 
     for var_name in ds.variables:
         encoding[var_name] = encoding_config
 
     filepath = f"{tmpdir}/{request.param}_xarray.nc"
     ds.to_netcdf(filepath, engine="h5netcdf", encoding=encoding)
+    #  ds.to_netcdf(filepath, engine="netcdf4", encoding=encoding)
     return filepath
 
 
diff --git a/virtualizarr/tests/test_readers/test_hdf_integration.py b/virtualizarr/tests/test_readers/test_hdf_integration.py
index b31289c..ade8e7c 100644
--- a/virtualizarr/tests/test_readers/test_hdf_integration.py
+++ b/virtualizarr/tests/test_readers/test_hdf_integration.py
@@ -7,9 +7,9 @@
 
 
 class TestIntegration:
-    def test_filters_end_to_end(self, tmpdir, filter_encoded_xarray_netcdf4_files):
+    def test_filters_roundtrip(self, tmpdir, filter_encoded_xarray_netcdf4_file):
         virtual_ds = virtualizarr.open_virtual_dataset(
-            filter_encoded_xarray_netcdf4_files, filetype=FileType("netcdf4")
+            filter_encoded_xarray_netcdf4_file, filetype=FileType("netcdf4")
         )
         kerchunk_file = f"{tmpdir}/kerchunk.json"
         virtual_ds.virtualize.to_kerchunk(kerchunk_file, format="json")

From b7426c5b15f33a65a0890a51fbc6d9464b673eaf Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Fri, 24 May 2024 14:05:21 -0600
Subject: [PATCH 24/68] Check for compatability with netcdf4 engine.

---
 virtualizarr/tests/test_readers/conftest.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py
index 20d5433..cb1212f 100644
--- a/virtualizarr/tests/test_readers/conftest.py
+++ b/virtualizarr/tests/test_readers/conftest.py
@@ -166,8 +166,7 @@ def filter_encoded_xarray_netcdf4_file(tmpdir, request):
         encoding[var_name] = encoding_config
 
     filepath = f"{tmpdir}/{request.param}_xarray.nc"
-    ds.to_netcdf(filepath, engine="h5netcdf", encoding=encoding)
-    #  ds.to_netcdf(filepath, engine="netcdf4", encoding=encoding)
+    ds.to_netcdf(filepath, engine="netcdf4", encoding=encoding)
     return filepath
 
 

From dac21dde6239b5ea7e918ff50aef8839ab2f7773 Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Mon, 27 May 2024 12:58:48 -0600
Subject: [PATCH 25/68] Use separate fixtures for h5netcdf and netcdf4
 compression styles.

---
 virtualizarr/tests/test_readers/conftest.py   | 27 ++++++++++++++-----
 .../test_readers/test_hdf_integration.py      | 20 ++++++++++++--
 2 files changed, 39 insertions(+), 8 deletions(-)

diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py
index cb1212f..a4fafed 100644
--- a/virtualizarr/tests/test_readers/conftest.py
+++ b/virtualizarr/tests/test_readers/conftest.py
@@ -3,6 +3,7 @@
 import numpy as np
 import pytest
 import xarray as xr
+from xarray.tests.test_dataset import create_test_data
 
 
 @pytest.fixture
@@ -151,22 +152,36 @@ def filter_encoded_netcdf4_file(tmpdir, np_uncompressed, request):
     return filepath
 
 
-@pytest.fixture(params=["gzip", "blosc_zlib"])
-def filter_encoded_xarray_netcdf4_file(tmpdir, request):
+@pytest.fixture(params=["gzip"])
+def filter_encoded_xarray_h5netcdf_file(tmpdir, request):
     ds = xr.tutorial.open_dataset("air_temperature")
     encoding = {}
     if request.param == "gzip":
         encoding_config = {"zlib": True, "complevel": 1}
+
+    for var_name in ds.variables:
+        encoding[var_name] = encoding_config
+
+    filepath = f"{tmpdir}/{request.param}_xarray.nc"
+    ds.to_netcdf(filepath, engine="h5netcdf", encoding=encoding)
+    return filepath
+
+
+@pytest.fixture(params=["blosc_zlib"])
+def filter_encoded_xarray_netcdf4_file(tmpdir, request):
+    ds = create_test_data(dim_sizes=(20, 80, 10))
     if "blosc" in request.param:
         encoding_config = {
             "compression": request.param,
+            "chunksizes": (20, 40),
+            "original_shape": ds.var2.shape,
+            "blosc_shuffle": 1,
+            "fletcher32": False,
         }
 
-    for var_name in ds.variables:
-        encoding[var_name] = encoding_config
-
+    ds["var2"].encoding.update(encoding_config)
     filepath = f"{tmpdir}/{request.param}_xarray.nc"
-    ds.to_netcdf(filepath, engine="netcdf4", encoding=encoding)
+    ds.to_netcdf(filepath, engine="netcdf4")
     return filepath
 
 
diff --git a/virtualizarr/tests/test_readers/test_hdf_integration.py b/virtualizarr/tests/test_readers/test_hdf_integration.py
index ade8e7c..d6ecf2f 100644
--- a/virtualizarr/tests/test_readers/test_hdf_integration.py
+++ b/virtualizarr/tests/test_readers/test_hdf_integration.py
@@ -7,9 +7,11 @@
 
 
 class TestIntegration:
-    def test_filters_roundtrip(self, tmpdir, filter_encoded_xarray_netcdf4_file):
+    def test_filters_h5netcdf_roundtrip(
+        self, tmpdir, filter_encoded_xarray_h5netcdf_file
+    ):
         virtual_ds = virtualizarr.open_virtual_dataset(
-            filter_encoded_xarray_netcdf4_file, filetype=FileType("netcdf4")
+            filter_encoded_xarray_h5netcdf_file, filetype=FileType("netcdf4")
         )
         kerchunk_file = f"{tmpdir}/kerchunk.json"
         virtual_ds.virtualize.to_kerchunk(kerchunk_file, format="json")
@@ -18,3 +20,17 @@ def test_filters_roundtrip(self, tmpdir, filter_encoded_xarray_netcdf4_file):
 
         ds = xr.open_dataset(m, engine="kerchunk")
         assert isinstance(ds.air.values[0][0][0], numpy.float64)
+
+    def test_filters_netcdf4_roundtrip(
+        self, tmpdir, filter_encoded_xarray_netcdf4_file
+    ):
+        virtual_ds = virtualizarr.open_virtual_dataset(
+            filter_encoded_xarray_netcdf4_file, filetype=FileType("netcdf4")
+        )
+        kerchunk_file = f"{tmpdir}/kerchunk.json"
+        virtual_ds.virtualize.to_kerchunk(kerchunk_file, format="json")
+        fs = fsspec.filesystem("reference", fo=kerchunk_file)
+        m = fs.get_mapper("")
+
+        ds = xr.open_dataset(m, engine="kerchunk")
+        print(ds["var2"].encoding)

From e968772a3a206658064e3e29294afec7604d0bc9 Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Mon, 27 May 2024 15:49:22 -0600
Subject: [PATCH 26/68] Print libhdf5 and libnetcdf4 versions to confirm
 compiled environment.

---
 virtualizarr/tests/test_readers/conftest.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py
index a4fafed..8904dd3 100644
--- a/virtualizarr/tests/test_readers/conftest.py
+++ b/virtualizarr/tests/test_readers/conftest.py
@@ -4,6 +4,7 @@
 import pytest
 import xarray as xr
 from xarray.tests.test_dataset import create_test_data
+from xarray.util.print_versions import netcdf_and_hdf5_versions
 
 
 @pytest.fixture
@@ -181,6 +182,7 @@ def filter_encoded_xarray_netcdf4_file(tmpdir, request):
 
     ds["var2"].encoding.update(encoding_config)
     filepath = f"{tmpdir}/{request.param}_xarray.nc"
+    print(netcdf_and_hdf5_versions())
     ds.to_netcdf(filepath, engine="netcdf4")
     return filepath
 

From 9a98e57e55fd020bcf3d682604eee2f03775ff26 Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Mon, 27 May 2024 17:07:51 -0600
Subject: [PATCH 27/68] Skip netcdf4 style compression tests when libhdf5 <
 1.14.

---
 virtualizarr/tests/test_readers/conftest.py   | 15 ++++++++++++---
 .../test_readers/test_hdf_integration.py      | 19 ++++++++++++++++---
 2 files changed, 28 insertions(+), 6 deletions(-)

diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py
index 8904dd3..0ddb2a0 100644
--- a/virtualizarr/tests/test_readers/conftest.py
+++ b/virtualizarr/tests/test_readers/conftest.py
@@ -3,6 +3,7 @@
 import numpy as np
 import pytest
 import xarray as xr
+from packaging.version import Version
 from xarray.tests.test_dataset import create_test_data
 from xarray.util.print_versions import netcdf_and_hdf5_versions
 
@@ -168,8 +169,17 @@ def filter_encoded_xarray_h5netcdf_file(tmpdir, request):
     return filepath
 
 
+@pytest.fixture()
+def skip_test_for_libhdf5_version():
+    versions = netcdf_and_hdf5_versions()
+    libhdf5_version = Version(versions[0][1])
+    return libhdf5_version < Version("1.14")
+
+
 @pytest.fixture(params=["blosc_zlib"])
-def filter_encoded_xarray_netcdf4_file(tmpdir, request):
+def filter_encoded_xarray_netcdf4_file(tmpdir, request, skip_test_for_libhdf5_version):
+    if skip_test_for_libhdf5_version:
+        pytest.skip("Requires libhdf5 >= 1.14")
     ds = create_test_data(dim_sizes=(20, 80, 10))
     if "blosc" in request.param:
         encoding_config = {
@@ -182,9 +192,8 @@ def filter_encoded_xarray_netcdf4_file(tmpdir, request):
 
     ds["var2"].encoding.update(encoding_config)
     filepath = f"{tmpdir}/{request.param}_xarray.nc"
-    print(netcdf_and_hdf5_versions())
     ds.to_netcdf(filepath, engine="netcdf4")
-    return filepath
+    return {"filepath": filepath, "compressor": request.param}
 
 
 @pytest.fixture
diff --git a/virtualizarr/tests/test_readers/test_hdf_integration.py b/virtualizarr/tests/test_readers/test_hdf_integration.py
index d6ecf2f..f51ebd4 100644
--- a/virtualizarr/tests/test_readers/test_hdf_integration.py
+++ b/virtualizarr/tests/test_readers/test_hdf_integration.py
@@ -24,13 +24,26 @@ def test_filters_h5netcdf_roundtrip(
     def test_filters_netcdf4_roundtrip(
         self, tmpdir, filter_encoded_xarray_netcdf4_file
     ):
+        filepath = filter_encoded_xarray_netcdf4_file["filepath"]
+        compressor = filter_encoded_xarray_netcdf4_file["compressor"]
         virtual_ds = virtualizarr.open_virtual_dataset(
-            filter_encoded_xarray_netcdf4_file, filetype=FileType("netcdf4")
+            filepath, filetype=FileType("netcdf4")
         )
         kerchunk_file = f"{tmpdir}/kerchunk.json"
         virtual_ds.virtualize.to_kerchunk(kerchunk_file, format="json")
         fs = fsspec.filesystem("reference", fo=kerchunk_file)
         m = fs.get_mapper("")
-
         ds = xr.open_dataset(m, engine="kerchunk")
-        print(ds["var2"].encoding)
+
+        expected_encoding = ds["var2"].encoding.copy()
+        compression = expected_encoding.pop("compression")
+        blosc_shuffle = expected_encoding.pop("blosc_shuffle")
+        if compression is not None:
+            if "blosc" in compression and blosc_shuffle:
+                expected_encoding["blosc"] = {
+                    "compressor": compressor,
+                    "shuffle": blosc_shuffle,
+                }
+                expected_encoding["shuffle"] = False
+        actual_encoding = ds["var2"].encoding
+        assert expected_encoding.items() <= actual_encoding.items()

From 7590b87e375f0dea6683aceba4322ca5a0c8a95d Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Tue, 11 Jun 2024 13:57:51 -0600
Subject: [PATCH 28/68] Include imagecodecs.numcodecs to support HDF5 lzf
 filters.

---
 pyproject.toml                                      | 1 +
 virtualizarr/readers/hdf_filters.py                 | 2 +-
 virtualizarr/tests/test_readers/test_hdf_filters.py | 8 ++++----
 3 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index f0563f0..773cccc 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -46,6 +46,7 @@ test = [
     "fsspec",
     "s3fs",
     "fastparquet",
+    "imagecodecs-numcodecs",
 ]
 
 
diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py
index 169eab9..08a3bba 100644
--- a/virtualizarr/readers/hdf_filters.py
+++ b/virtualizarr/readers/hdf_filters.py
@@ -9,7 +9,7 @@
 from pydantic import BaseModel, validator
 from xarray.coding.variables import _choose_float_dtype
 
-_non_standard_filters = {"gzip": "zlib"}
+_non_standard_filters = {"gzip": "zlib", "lzf": "imagecodecs_lzf"}
 
 
 class BloscProperties(BaseModel):
diff --git a/virtualizarr/tests/test_readers/test_hdf_filters.py b/virtualizarr/tests/test_readers/test_hdf_filters.py
index dca9f40..b5b0404 100644
--- a/virtualizarr/tests/test_readers/test_hdf_filters.py
+++ b/virtualizarr/tests/test_readers/test_hdf_filters.py
@@ -1,7 +1,7 @@
 import h5py
+import imagecodecs
 import numcodecs
 import numpy as np
-import pytest
 
 from virtualizarr.readers.hdf_filters import (
     _filter_to_codec,
@@ -15,9 +15,9 @@ def test_gzip_uses_zlib_numcodec(self):
         codec = _filter_to_codec("gzip", 1)
         assert isinstance(codec, numcodecs.zlib.Zlib)
 
-    def test_lzf_not_available(self):
-        with pytest.raises(ValueError, match="codec not available"):
-            _filter_to_codec("lzf")
+    def test_lzf(self):
+        codec = _filter_to_codec("lzf")
+        assert isinstance(codec, imagecodecs.numcodecs.Lzf)
 
     def test_blosc(self):
         codec = _filter_to_codec("32001", (2, 2, 8, 800, 9, 2, 1))

From 14bd7098545bd7f443b791f24aafa11bcc00fdbb Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Tue, 11 Jun 2024 16:24:30 -0600
Subject: [PATCH 29/68] Remove test that verifies call to
 read_kerchunk_references_from_file.

---
 virtualizarr/tests/test_xarray.py | 14 --------------
 1 file changed, 14 deletions(-)

diff --git a/virtualizarr/tests/test_xarray.py b/virtualizarr/tests/test_xarray.py
index 695759b..d145550 100644
--- a/virtualizarr/tests/test_xarray.py
+++ b/virtualizarr/tests/test_xarray.py
@@ -1,5 +1,4 @@
 from collections.abc import Mapping
-from unittest.mock import patch
 
 import numpy as np
 import pytest
@@ -304,16 +303,3 @@ def test_loadable_variables(self, netcdf4_file):
         for name in full_ds.variables:
             if name in vars_to_load:
                 xrt.assert_identical(vds.variables[name], full_ds.variables[name])
-
-    @patch("virtualizarr.kerchunk.read_kerchunk_references_from_file")
-    def test_open_virtual_dataset_passes_expected_args(
-        self, mock_read_kerchunk, netcdf4_file
-    ):
-        reader_options = {"option1": "value1", "option2": "value2"}
-        open_virtual_dataset(netcdf4_file, indexes={}, reader_options=reader_options)
-        args = {
-            "filepath": netcdf4_file,
-            "filetype": None,
-            "reader_options": reader_options,
-        }
-        mock_read_kerchunk.assert_called_once_with(**args)

From acdf0d76557a5abdf2657f1278f57c732a4dd347 Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Wed, 12 Jun 2024 15:05:34 -0600
Subject: [PATCH 30/68] Add additional codec support structures for imagecodecs
 and numcodecs.

---
 virtualizarr/readers/hdf_filters.py         | 23 +++++++++++++++++----
 virtualizarr/tests/test_readers/conftest.py |  9 +++++++-
 2 files changed, 27 insertions(+), 5 deletions(-)

diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py
index 08a3bba..667ff09 100644
--- a/virtualizarr/readers/hdf_filters.py
+++ b/virtualizarr/readers/hdf_filters.py
@@ -9,7 +9,12 @@
 from pydantic import BaseModel, validator
 from xarray.coding.variables import _choose_float_dtype
 
-_non_standard_filters = {"gzip": "zlib", "lzf": "imagecodecs_lzf"}
+_non_standard_filters = {
+    "gzip": "zlib",
+    "lzf": "imagecodecs_lzf",
+}
+
+_hdf5plugin_imagecodecs = {"lz4": "imagecodecs_lz4h5", "bzip2": "imagecodecs_bz2"}
 
 
 class BloscProperties(BaseModel):
@@ -27,6 +32,10 @@ def get_cname_from_code(cls, v):
         return blosc_compressor_codes[v]
 
 
+class ZstdProperties(BaseModel):
+    level: int
+
+
 class CFCodec(TypedDict):
     target_dtype: np.dtype
     codec: Codec
@@ -41,18 +50,20 @@ def _filter_to_codec(
         id_int = int(filter_id)
     except ValueError:
         id_str = filter_id
-
+    conf = {}
     if id_str:
         if id_str in _non_standard_filters.keys():
             id = _non_standard_filters[id_str]
         else:
             id = id_str
-        conf = {"id": id}
+        conf["id"] = id  # type: ignore[assignment]
         if id == "zlib":
             conf["level"] = filter_properties  # type: ignore[assignment]
     if id_int:
         filter = hdf5plugin.get_filters(id_int)[0]
         id = filter.filter_name
+        if id in _hdf5plugin_imagecodecs.keys():
+            id = _hdf5plugin_imagecodecs[id]
         if id == "blosc" and isinstance(filter_properties, tuple):
             blosc_props = BloscProperties(
                 **{
@@ -63,7 +74,11 @@ def _filter_to_codec(
                 }
             )
             conf = blosc_props.model_dump()  # type: ignore[assignment]
-            conf["id"] = id
+        if id == "zstd" and isinstance(filter_properties, tuple):
+            zstd_props = ZstdProperties(level=filter_properties[0])
+            conf = zstd_props.model_dump()  # type: ignore[assignment]
+
+        conf["id"] = id
 
     codec = registry.get_codec(conf)
     return codec
diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py
index 0ddb2a0..3e6f9c3 100644
--- a/virtualizarr/tests/test_readers/conftest.py
+++ b/virtualizarr/tests/test_readers/conftest.py
@@ -137,7 +137,7 @@ def np_uncompressed():
     return np.arange(100)
 
 
-@pytest.fixture(params=["gzip", "blosc_lz4"])
+@pytest.fixture(params=["gzip", "blosc_lz4", "lz4", "bzip2", "zstd"])
 def filter_encoded_netcdf4_file(tmpdir, np_uncompressed, request):
     filepath = f"{tmpdir}/{request.param}.nc"
     f = h5py.File(filepath, "w")
@@ -151,6 +151,13 @@ def filter_encoded_netcdf4_file(tmpdir, np_uncompressed, request):
             data=np_uncompressed,
             **hdf5plugin.Blosc(cname="lz4", clevel=9, shuffle=hdf5plugin.Blosc.SHUFFLE),
         )
+    if request.param == "lz4":
+        f.create_dataset(name="data", data=np_uncompressed, **hdf5plugin.LZ4(nbytes=0))
+    if request.param == "bzip2":
+        f.create_dataset(name="data", data=np_uncompressed, **hdf5plugin.BZip2())
+    if request.param == "zstd":
+        f.create_dataset(name="data", data=np_uncompressed, **hdf5plugin.Zstd(clevel=2))
+
     return filepath
 
 

From 4ba323a6c862deb8908706373b6df429fd78f986 Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Wed, 12 Jun 2024 16:17:04 -0600
Subject: [PATCH 31/68] Add codec config test for Zstd.

---
 virtualizarr/tests/test_readers/test_hdf_filters.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/virtualizarr/tests/test_readers/test_hdf_filters.py b/virtualizarr/tests/test_readers/test_hdf_filters.py
index b5b0404..4d23a75 100644
--- a/virtualizarr/tests/test_readers/test_hdf_filters.py
+++ b/virtualizarr/tests/test_readers/test_hdf_filters.py
@@ -31,6 +31,12 @@ def test_blosc(self):
         }
         assert codec.get_config() == expected_config
 
+    def test_zstd(self):
+        codec = _filter_to_codec("32015", (5,))
+        assert isinstance(codec, numcodecs.zstd.Zstd)
+        expected_config = {"id": "zstd", "level": 5}
+        assert codec.get_config() == expected_config
+
 
 class TestCodecsFromDataSet:
     def test_numcodec_decoding(self, np_uncompressed, filter_encoded_netcdf4_file):

From e14e53b0fc2bb7ed1ca3d5b73fc43594aff77426 Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Thu, 20 Jun 2024 18:03:26 -0600
Subject: [PATCH 32/68] Include initial cf decoding tests.

---
 virtualizarr/readers/hdf_filters.py           |  3 +-
 virtualizarr/tests/test_readers/conftest.py   | 34 ++++++++++++++++---
 .../tests/test_readers/test_hdf_filters.py    | 28 +++++++++++++++
 3 files changed, 60 insertions(+), 5 deletions(-)

diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py
index 667ff09..f4e2dcf 100644
--- a/virtualizarr/readers/hdf_filters.py
+++ b/virtualizarr/readers/hdf_filters.py
@@ -88,7 +88,8 @@ def cfcodec_from_dataset(dataset: h5py.Dataset) -> Codec | None:
     attributes = {attr: dataset.attrs[attr] for attr in dataset.attrs}
     mapping = {}
     if "scale_factor" in attributes:
-        mapping["scale_factor"] = 1 / attributes["scale_factor"][0]
+        mapping["scale_factor"] = 1 / attributes["scale_factor"]
+        # mapping["scale_factor"] =attributes["scale_factor"][0]
     else:
         mapping["scale_factor"] = 1
     if "add_offset" in attributes:
diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py
index 3e6f9c3..e1a53c5 100644
--- a/virtualizarr/tests/test_readers/conftest.py
+++ b/virtualizarr/tests/test_readers/conftest.py
@@ -204,10 +204,36 @@ def filter_encoded_xarray_netcdf4_file(tmpdir, request, skip_test_for_libhdf5_ve
 
 
 @pytest.fixture
-def add_offset_netcdf4_file(tmpdir):
+def np_uncompressed_int16():
+    return np.arange(100, dtype=np.int16)
+
+
+@pytest.fixture
+def offset():
+    return np.float32(5.0)
+
+
+@pytest.fixture
+def add_offset_netcdf4_file(tmpdir, np_uncompressed_int16, offset):
     filepath = f"{tmpdir}/offset.nc"
     f = h5py.File(filepath, "w")
-    data = np.random.random((10, 10))
-    f.create_dataset(name="data", data=data, chunks=None)
-    f["data"].attrs.create(name="add_offset", data=5)
+    data = np_uncompressed_int16 - offset
+    f.create_dataset(name="data", data=data, chunks=True)
+    f["data"].attrs.create(name="add_offset", data=offset)
+    return filepath
+
+
+@pytest.fixture
+def scale_factor():
+    return 0.01
+
+
+@pytest.fixture
+def scale_add_offset_netcdf4_file(tmpdir, np_uncompressed_int16, offset, scale_factor):
+    filepath = f"{tmpdir}/scale_offset.nc"
+    f = h5py.File(filepath, "w")
+    data = (np_uncompressed_int16 - offset) / scale_factor
+    f.create_dataset(name="data", data=data, chunks=True)
+    f["data"].attrs.create(name="add_offset", data=offset)
+    f["data"].attrs.create(name="scale_factor", data=np.array([scale_factor]))
     return filepath
diff --git a/virtualizarr/tests/test_readers/test_hdf_filters.py b/virtualizarr/tests/test_readers/test_hdf_filters.py
index 4d23a75..960bcf2 100644
--- a/virtualizarr/tests/test_readers/test_hdf_filters.py
+++ b/virtualizarr/tests/test_readers/test_hdf_filters.py
@@ -76,3 +76,31 @@ def test_cf_add_offset(self, add_offset_netcdf4_file):
         assert cf_codec["codec"].scale == 1
         assert cf_codec["codec"].offset == 5
         assert cf_codec["codec"].dtype == "<f8"
+
+    def test_cf_codec_decoding_offset(
+        self, add_offset_netcdf4_file, np_uncompressed_int16
+    ):
+        f = h5py.File(add_offset_netcdf4_file)
+        ds = f["data"]
+        chunk_info = ds.id.get_chunk_info(0)
+        cfcodec = cfcodec_from_dataset(ds)
+        with open(add_offset_netcdf4_file, "rb") as file:
+            file.seek(chunk_info.byte_offset)
+            bytes_read = file.read(chunk_info.size)
+            decoded = cfcodec["codec"].decode(bytes_read)
+            assert np.array_equal(decoded, np_uncompressed_int16)
+            assert decoded.dtype == np.float64
+
+    def test_cf_codec_decoding_scale_offset(
+        self, scale_add_offset_netcdf4_file, np_uncompressed_int16
+    ):
+        f = h5py.File(scale_add_offset_netcdf4_file)
+        ds = f["data"]
+        chunk_info = ds.id.get_chunk_info(0)
+        cfcodec = cfcodec_from_dataset(ds)
+        with open(scale_add_offset_netcdf4_file, "rb") as file:
+            file.seek(chunk_info.byte_offset)
+            bytes_read = file.read(chunk_info.size)
+            decoded = cfcodec["codec"].decode(bytes_read)
+            assert np.allclose(decoded, np_uncompressed_int16)
+            assert decoded.dtype == np.float64

From b052f8c8f88e287bfdc684db0b595f32dfa88b15 Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Thu, 20 Jun 2024 19:49:54 -0600
Subject: [PATCH 33/68] Revert typo for scale_factor retrieval.

---
 virtualizarr/readers/hdf_filters.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py
index f4e2dcf..667ff09 100644
--- a/virtualizarr/readers/hdf_filters.py
+++ b/virtualizarr/readers/hdf_filters.py
@@ -88,8 +88,7 @@ def cfcodec_from_dataset(dataset: h5py.Dataset) -> Codec | None:
     attributes = {attr: dataset.attrs[attr] for attr in dataset.attrs}
     mapping = {}
     if "scale_factor" in attributes:
-        mapping["scale_factor"] = 1 / attributes["scale_factor"]
-        # mapping["scale_factor"] =attributes["scale_factor"][0]
+        mapping["scale_factor"] = 1 / attributes["scale_factor"][0]
     else:
         mapping["scale_factor"] = 1
     if "add_offset" in attributes:

From 01a3980f541a45c8a33a907dd6d3bed722eacae9 Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Thu, 20 Jun 2024 20:12:44 -0600
Subject: [PATCH 34/68] Update reader to use new numpy manifest representation.

---
 virtualizarr/readers/hdf.py                 | 29 ++++++++++-----------
 virtualizarr/tests/test_readers/test_hdf.py |  4 +--
 2 files changed, 16 insertions(+), 17 deletions(-)

diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py
index c251866..b96bdff 100644
--- a/virtualizarr/readers/hdf.py
+++ b/virtualizarr/readers/hdf.py
@@ -39,34 +39,33 @@ def _dataset_chunk_manifest(path: str, dataset: h5py.Dataset) -> ChunkManifest:
                 path=path, offset=dsid.get_offset(), length=dsid.get_storage_size()
             )
             chunk_key = ChunkKey(key)
-            chunk_entries = {chunk_key: chunk_entry}
+            chunk_entries = {chunk_key: chunk_entry.dict()}
             chunk_manifest = ChunkManifest(entries=chunk_entries)
             return chunk_manifest
     else:
         num_chunks = dsid.get_num_chunks()
         if num_chunks == 0:
             raise ValueError("The dataset is chunked but contains no chunks")
+        paths = np.full(num_chunks, path, dtype=np.dtypes.StringDType)  # type: ignore
+        offsets = np.empty((num_chunks), dtype=np.int32)
+        lengths = np.empty((num_chunks), dtype=np.int32)
 
-        chunk_entries = dict()
-
-        def get_key(blob):
-            key_list = [a // b for a, b in zip(blob.chunk_offset, dataset.chunks)]
-            key = ".".join(map(str, key_list))
-            return key
-
-        def store_chunk_entry(blob):
-            chunk_entries[get_key(blob)] = ChunkEntry(
-                path=path, offset=blob.byte_offset, length=blob.size
-            )
+        def add_chunk_info(blob, chunk_index):
+            offsets[chunk_index] = blob.byte_offset
+            lengths[chunk_index] = blob.size
+            chunk_index += 1
 
         has_chunk_iter = callable(getattr(dsid, "chunk_iter", None))
         if has_chunk_iter:
-            dsid.chunk_iter(store_chunk_entry)
+            chunk_index = 0
+            dsid.chunk_iter(add_chunk_info, chunk_index)
         else:
             for index in range(num_chunks):
-                store_chunk_entry(dsid.get_chunk_info(index))
+                add_chunk_info(dsid.get_chunk_info(index), index)
 
-        chunk_manifest = ChunkManifest(entries=chunk_entries)
+        chunk_manifest = ChunkManifest.from_arrays(
+            paths=paths, offsets=offsets, lengths=lengths
+        )
         return chunk_manifest
 
 
diff --git a/virtualizarr/tests/test_readers/test_hdf.py b/virtualizarr/tests/test_readers/test_hdf.py
index a67352e..8c5a40a 100644
--- a/virtualizarr/tests/test_readers/test_hdf.py
+++ b/virtualizarr/tests/test_readers/test_hdf.py
@@ -27,13 +27,13 @@ def test_no_chunking(self, no_chunks_netcdf4_file):
         f = h5py.File(no_chunks_netcdf4_file)
         ds = f["data"]
         manifest = _dataset_chunk_manifest(path=no_chunks_netcdf4_file, dataset=ds)
-        assert len(manifest.entries) == 1
+        assert len(manifest) == 1
 
     def test_chunked(self, chunked_netcdf4_file):
         f = h5py.File(chunked_netcdf4_file)
         ds = f["data"]
         manifest = _dataset_chunk_manifest(path=chunked_netcdf4_file, dataset=ds)
-        assert len(manifest.entries) == 4
+        assert len(manifest) == 4
 
 
 class TestDatasetDims:

From c37d9e526239ad5207f76d400924fffaabb578ec Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Fri, 21 Jun 2024 19:05:01 -0600
Subject: [PATCH 35/68] Temporarily skip test until blosc netcdf4 issue is
 solved.

---
 virtualizarr/tests/test_readers/test_hdf_integration.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/virtualizarr/tests/test_readers/test_hdf_integration.py b/virtualizarr/tests/test_readers/test_hdf_integration.py
index f51ebd4..dca34db 100644
--- a/virtualizarr/tests/test_readers/test_hdf_integration.py
+++ b/virtualizarr/tests/test_readers/test_hdf_integration.py
@@ -1,5 +1,6 @@
 import fsspec
 import numpy
+import pytest
 import xarray as xr
 
 import virtualizarr
@@ -21,6 +22,9 @@ def test_filters_h5netcdf_roundtrip(
         ds = xr.open_dataset(m, engine="kerchunk")
         assert isinstance(ds.air.values[0][0][0], numpy.float64)
 
+    @pytest.mark.skip(
+        reason="Issue with xr 'dim1' serialization and blosc availability"
+    )
     def test_filters_netcdf4_roundtrip(
         self, tmpdir, filter_encoded_xarray_netcdf4_file
     ):

From 17b30d4149603c952e0b24892b2d104ed7499a52 Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Fri, 21 Jun 2024 19:24:07 -0600
Subject: [PATCH 36/68] Fix Pydantic 2 migration warnings.

---
 virtualizarr/readers/hdf_filters.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py
index 667ff09..cc8e810 100644
--- a/virtualizarr/readers/hdf_filters.py
+++ b/virtualizarr/readers/hdf_filters.py
@@ -6,7 +6,7 @@
 import numpy as np
 from numcodecs.abc import Codec
 from numcodecs.fixedscaleoffset import FixedScaleOffset
-from pydantic import BaseModel, validator
+from pydantic import BaseModel, field_validator
 from xarray.coding.variables import _choose_float_dtype
 
 _non_standard_filters = {
@@ -23,7 +23,7 @@ class BloscProperties(BaseModel):
     shuffle: int
     cname: str
 
-    @validator("cname", pre=True)
+    @field_validator("cname", mode="before")
     def get_cname_from_code(cls, v):
         blosc_compressor_codes = {
             value: key
@@ -69,7 +69,7 @@ def _filter_to_codec(
                 **{
                     k: v
                     for k, v in zip(
-                        BloscProperties.__fields__.keys(), filter_properties[-4:]
+                        BloscProperties.model_fields.keys(), filter_properties[-4:]
                     )
                 }
             )

From f6b596a6563aff90a70acb0b8190898399368f32 Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Fri, 21 Jun 2024 19:30:55 -0600
Subject: [PATCH 37/68] Include hdf5plugin and imagecodecs-numcodecs in mamba
 test environment.

---
 ci/environment.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/ci/environment.yml b/ci/environment.yml
index 0385ea5..e909bee 100644
--- a/ci/environment.yml
+++ b/ci/environment.yml
@@ -14,6 +14,7 @@ dependencies:
   - ujson
   - packaging
   - universal_pathlib
+  - hdf5plugin
   # Testing
   - codecov
   - pre-commit
@@ -26,3 +27,4 @@ dependencies:
   - fsspec
   - s3fs
   - fastparquet
+  - imagecodecs-numcodecs

From eb6e24d10385fa68a9a8909d0c6cfb9a97a34461 Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Fri, 21 Jun 2024 19:35:24 -0600
Subject: [PATCH 38/68] Mamba attempt with imagecodecs rather than
 imagecodecs-numcodecs.

---
 ci/environment.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ci/environment.yml b/ci/environment.yml
index e909bee..20784a6 100644
--- a/ci/environment.yml
+++ b/ci/environment.yml
@@ -27,4 +27,4 @@ dependencies:
   - fsspec
   - s3fs
   - fastparquet
-  - imagecodecs-numcodecs
+  - imagecodecs

From c85bd168025d4c96c1112aff22cc82fc0e07cbfd Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Fri, 21 Jun 2024 19:41:14 -0600
Subject: [PATCH 39/68] Mamba attempt with latest imagecodecs release.

---
 ci/environment.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ci/environment.yml b/ci/environment.yml
index 20784a6..fb967bc 100644
--- a/ci/environment.yml
+++ b/ci/environment.yml
@@ -27,4 +27,4 @@ dependencies:
   - fsspec
   - s3fs
   - fastparquet
-  - imagecodecs
+  - imagecodecs>=2024.6.1

From ca435da5007263136bf489ffe647cb690145cbd7 Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Tue, 25 Jun 2024 19:34:35 -0600
Subject: [PATCH 40/68] Use correct iter_chunks callback function signtature.

---
 virtualizarr/readers/hdf.py | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py
index b96bdff..d082b71 100644
--- a/virtualizarr/readers/hdf.py
+++ b/virtualizarr/readers/hdf.py
@@ -53,12 +53,22 @@ def _dataset_chunk_manifest(path: str, dataset: h5py.Dataset) -> ChunkManifest:
         def add_chunk_info(blob, chunk_index):
             offsets[chunk_index] = blob.byte_offset
             lengths[chunk_index] = blob.size
-            chunk_index += 1
 
         has_chunk_iter = callable(getattr(dsid, "chunk_iter", None))
         if has_chunk_iter:
-            chunk_index = 0
-            dsid.chunk_iter(add_chunk_info, chunk_index)
+
+            def create_callback(initial=0):
+                value = initial
+
+                def callback(blob):
+                    nonlocal value
+                    add_chunk_info(blob, chunk_index=value)
+                    value += 1
+
+                return callback
+
+            callback = create_callback()
+            dsid.chunk_iter(callback)
         else:
             for index in range(num_chunks):
                 add_chunk_info(dsid.get_chunk_info(index), index)

From 3017951549fe4b3d9d7099b1357aa76136d23f16 Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Tue, 25 Jun 2024 19:35:40 -0600
Subject: [PATCH 41/68] Include pip based imagecodecs-numcodecs until
 conda-forge availability.

---
 ci/environment.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/ci/environment.yml b/ci/environment.yml
index fb967bc..e2f5a86 100644
--- a/ci/environment.yml
+++ b/ci/environment.yml
@@ -28,3 +28,5 @@ dependencies:
   - s3fs
   - fastparquet
   - imagecodecs>=2024.6.1
+  - pip:
+    - imagecodecs-numcodecs

From 32ba13537070fbee7e861d8618f6a77eacbe0da8 Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Thu, 27 Jun 2024 15:43:10 -0600
Subject: [PATCH 42/68] Handle non-coordinate dims which are serialized to hdf
 as empty dataset.

---
 virtualizarr/readers/hdf.py                 | 65 ++++++++++++---------
 virtualizarr/tests/test_integration.py      | 18 +++++-
 virtualizarr/tests/test_readers/test_hdf.py |  1 +
 virtualizarr/xarray.py                      |  2 +-
 4 files changed, 53 insertions(+), 33 deletions(-)

diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py
index d082b71..cbbe824 100644
--- a/virtualizarr/readers/hdf.py
+++ b/virtualizarr/readers/hdf.py
@@ -11,7 +11,9 @@
 from virtualizarr.zarr import ZArray
 
 
-def _dataset_chunk_manifest(path: str, dataset: h5py.Dataset) -> ChunkManifest:
+def _dataset_chunk_manifest(
+    path: str, dataset: h5py.Dataset
+) -> Optional[ChunkManifest]:
     """
     Generate ChunkManifest for HDF5 dataset.
 
@@ -31,7 +33,7 @@ def _dataset_chunk_manifest(path: str, dataset: h5py.Dataset) -> ChunkManifest:
 
     if dataset.chunks is None:
         if dsid.get_offset() is None:
-            raise ValueError("Dataset has no space allocated in the file")
+            return None
         else:
             key_list = [0] * (len(dataset.shape) or 1)
             key = ".".join(map(str, key_list))
@@ -167,35 +169,39 @@ def _extract_attrs(h5obj: Union[h5py.Dataset, h5py.Group]):
     return attrs
 
 
-def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> xr.Variable:
+def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> Optional[xr.Variable]:
     # This chunk determination logic mirrors zarr-python's create
     # https://github.com/zarr-developers/zarr-python/blob/main/zarr/creation.py#L62-L66
-    chunks = dataset.chunks if dataset.chunks else dataset.shape
-    codecs = codecs_from_dataset(dataset)
-    cfcodec = cfcodec_from_dataset(dataset)
-    attrs = _extract_attrs(dataset)
-    if cfcodec:
-        codecs.append(cfcodec["codec"])
-        dtype = cfcodec["target_dtype"]
-        attrs.pop("scale_factor", None)
-        attrs.pop("add_offset", None)
-    else:
-        dtype = dataset.dtype
-    filters = [codec.get_config() for codec in codecs]
-    zarray = ZArray(
-        chunks=chunks,
-        compressor=None,
-        dtype=dtype,
-        fill_value=dataset.fillvalue,
-        filters=filters,
-        order="C",
-        shape=dataset.shape,
-        zarr_format=2,
-    )
+
     manifest = _dataset_chunk_manifest(path, dataset)
-    marray = ManifestArray(zarray=zarray, chunkmanifest=manifest)
-    dims = _dataset_dims(dataset)
-    variable = xr.Variable(data=marray, dims=dims, attrs=attrs)
+    if manifest:
+        chunks = dataset.chunks if dataset.chunks else dataset.shape
+        codecs = codecs_from_dataset(dataset)
+        cfcodec = cfcodec_from_dataset(dataset)
+        attrs = _extract_attrs(dataset)
+        if cfcodec:
+            codecs.append(cfcodec["codec"])
+            dtype = cfcodec["target_dtype"]
+            attrs.pop("scale_factor", None)
+            attrs.pop("add_offset", None)
+        else:
+            dtype = dataset.dtype
+        filters = [codec.get_config() for codec in codecs]
+        zarray = ZArray(
+            chunks=chunks,
+            compressor=None,
+            dtype=dtype,
+            fill_value=dataset.fillvalue,
+            filters=filters,
+            order="C",
+            shape=dataset.shape,
+            zarr_format=2,
+        )
+        marray = ManifestArray(zarray=zarray, chunkmanifest=manifest)
+        dims = _dataset_dims(dataset)
+        variable = xr.Variable(data=marray, dims=dims, attrs=attrs)
+    else:
+        variable = None
     return variable
 
 
@@ -217,7 +223,8 @@ def virtual_vars_from_hdf(
         if key not in drop_variables:
             if isinstance(f[key], h5py.Dataset):
                 variable = _dataset_to_variable(path, f[key])
-                variables[key] = variable
+                if variable is not None:
+                    variables[key] = variable
             else:
                 raise NotImplementedError("Nested groups are not yet supported")
 
diff --git a/virtualizarr/tests/test_integration.py b/virtualizarr/tests/test_integration.py
index 451862c..6a1f91e 100644
--- a/virtualizarr/tests/test_integration.py
+++ b/virtualizarr/tests/test_integration.py
@@ -71,9 +71,13 @@ def test_kerchunk_roundtrip_no_concat(self, tmpdir, format):
                 f"{tmpdir}/refs.{format}", engine="kerchunk", decode_times=False
             )
 
-        # assert identical to original dataset
+        # assert all_close to original dataset
         xrt.assert_allclose(roundtrip, ds)
 
+        # assert coordinate attributes are maintained
+        for coord in ds.coords:
+            assert ds.coords[coord].attrs == roundtrip.coords[coord].attrs
+
     def test_kerchunk_roundtrip_concat(self, tmpdir, format):
         # set up example xarray dataset
         ds = xr.tutorial.open_dataset("air_temperature", decode_times=False)
@@ -107,8 +111,12 @@ def test_kerchunk_roundtrip_concat(self, tmpdir, format):
                 f"{tmpdir}/refs.{format}", engine="kerchunk", decode_times=False
             )
 
-        # assert identical to original dataset
-        xrt.assert_identical(roundtrip, ds)
+        # assert all_close to original dataset
+        xrt.assert_allclose(roundtrip, ds)
+
+        # assert coordinate attributes are maintained
+        for coord in ds.coords:
+            assert ds.coords[coord].attrs == roundtrip.coords[coord].attrs
 
     def test_non_dimension_coordinates(self, tmpdir, format):
         # regression test for GH issue #105
@@ -142,6 +150,10 @@ def test_non_dimension_coordinates(self, tmpdir, format):
         # assert equal to original dataset
         xrt.assert_allclose(roundtrip, ds)
 
+        # assert coordinate attributes are maintained
+        for coord in ds.coords:
+            assert ds.coords[coord].attrs == roundtrip.coords[coord].attrs
+
 
 def test_open_scalar_variable(tmpdir):
     # regression test for GH issue #100
diff --git a/virtualizarr/tests/test_readers/test_hdf.py b/virtualizarr/tests/test_readers/test_hdf.py
index 8c5a40a..c744cd6 100644
--- a/virtualizarr/tests/test_readers/test_hdf.py
+++ b/virtualizarr/tests/test_readers/test_hdf.py
@@ -17,6 +17,7 @@ def test_empty_chunks(self, empty_chunks_netcdf4_file):
         with pytest.raises(ValueError, match="chunked but contains no chunks"):
             _dataset_chunk_manifest(path=empty_chunks_netcdf4_file, dataset=ds)
 
+    @pytest.mark.skip("Need to differentiate non coordinate dimensions from empty")
     def test_empty_dataset(self, empty_dataset_netcdf4_file):
         f = h5py.File(empty_dataset_netcdf4_file)
         ds = f["data"]
diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py
index 39bd067..a8a2369 100644
--- a/virtualizarr/xarray.py
+++ b/virtualizarr/xarray.py
@@ -121,7 +121,7 @@ def open_virtual_dataset(
             ds_attrs = attrs_from_root_group(
                 path=filepath, reader_options=reader_options
             )
-            coord_names = None
+            coord_names = ds_attrs.pop("coordinates", [])
         else:
             # this is the only place we actually always need to use kerchunk directly
             # TODO avoid even reading byte ranges for variables that will be dropped later anyway?

From 64f446c8d452291548bba2c73a104bf068dc2d7e Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Thu, 27 Jun 2024 16:23:43 -0600
Subject: [PATCH 43/68] Use reader_options for filetype check and update
 failing kerchunk call.

---
 virtualizarr/tests/test_xarray.py | 18 +++++++++++++-----
 virtualizarr/xarray.py            |  4 +++-
 2 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/virtualizarr/tests/test_xarray.py b/virtualizarr/tests/test_xarray.py
index e55583b..282d4ad 100644
--- a/virtualizarr/tests/test_xarray.py
+++ b/virtualizarr/tests/test_xarray.py
@@ -8,6 +8,7 @@
 from xarray.core.indexes import Index
 
 from virtualizarr import open_virtual_dataset
+from virtualizarr.kerchunk import FileType
 from virtualizarr.manifests import ChunkManifest, ManifestArray
 from virtualizarr.tests import network, requires_s3fs
 from virtualizarr.zarr import ZArray
@@ -325,18 +326,25 @@ def test_loadable_variables(self, netcdf4_file):
             if name in vars_to_load:
                 xrt.assert_identical(vds.variables[name], full_ds.variables[name])
 
-    @patch("virtualizarr.kerchunk.read_kerchunk_references_from_file")
+    @patch("virtualizarr.xarray._automatically_determine_filetype")
+    @patch("virtualizarr.xarray.virtual_vars_from_hdf")
     def test_open_virtual_dataset_passes_expected_args(
-        self, mock_read_kerchunk, netcdf4_file
+        self, mock_reader, mock_determine_filetype, netcdf4_file
     ):
         reader_options = {"option1": "value1", "option2": "value2"}
+        mock_determine_filetype.return_value = FileType.netcdf4
         open_virtual_dataset(netcdf4_file, indexes={}, reader_options=reader_options)
-        args = {
+        reader_args = {
+            "path": netcdf4_file,
+            "drop_variables": [],
+            "reader_options": reader_options,
+        }
+        mock_reader.assert_called_once_with(**reader_args)
+        filetype_args = {
             "filepath": netcdf4_file,
-            "filetype": None,
             "reader_options": reader_options,
         }
-        mock_read_kerchunk.assert_called_once_with(**args)
+        mock_determine_filetype.assert_called_once_with(**filetype_args)
 
 
 class TestRenamePaths:
diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py
index a8a2369..86a59c8 100644
--- a/virtualizarr/xarray.py
+++ b/virtualizarr/xarray.py
@@ -109,7 +109,9 @@ def open_virtual_dataset(
         )
     else:
         if filetype is None:
-            filetype = _automatically_determine_filetype(filepath=filepath)
+            filetype = _automatically_determine_filetype(
+                filepath=filepath, reader_options=reader_options
+            )
         filetype = FileType(filetype)
 
         if filetype.name.lower() == "netcdf4":

From 9797346463e443d6f48b567569156f4ca01490cf Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Sat, 29 Jun 2024 18:20:06 -0600
Subject: [PATCH 44/68] Fix chunkmanifest shaping for chunked datasets.

---
 virtualizarr/readers/hdf.py                 | 36 +++++++++------------
 virtualizarr/tests/test_readers/test_hdf.py | 10 ++++--
 2 files changed, 24 insertions(+), 22 deletions(-)

diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py
index cbbe824..d683f69 100644
--- a/virtualizarr/readers/hdf.py
+++ b/virtualizarr/readers/hdf.py
@@ -1,3 +1,4 @@
+import math
 from typing import List, Mapping, Optional, Union
 
 import h5py
@@ -48,32 +49,27 @@ def _dataset_chunk_manifest(
         num_chunks = dsid.get_num_chunks()
         if num_chunks == 0:
             raise ValueError("The dataset is chunked but contains no chunks")
-        paths = np.full(num_chunks, path, dtype=np.dtypes.StringDType)  # type: ignore
-        offsets = np.empty((num_chunks), dtype=np.int32)
-        lengths = np.empty((num_chunks), dtype=np.int32)
 
-        def add_chunk_info(blob, chunk_index):
-            offsets[chunk_index] = blob.byte_offset
-            lengths[chunk_index] = blob.size
+        shape = tuple(math.ceil(a / b) for a, b in zip(dataset.shape, dataset.chunks))
+        paths = np.empty(shape, dtype=np.dtypes.StringDType)  # type: ignore
+        offsets = np.empty(shape, dtype=np.int32)
+        lengths = np.empty(shape, dtype=np.int32)
 
-        has_chunk_iter = callable(getattr(dsid, "chunk_iter", None))
-        if has_chunk_iter:
-
-            def create_callback(initial=0):
-                value = initial
+        def get_key(blob):
+            return tuple([a // b for a, b in zip(blob.chunk_offset, dataset.chunks)])
 
-                def callback(blob):
-                    nonlocal value
-                    add_chunk_info(blob, chunk_index=value)
-                    value += 1
+        def add_chunk_info(blob):
+            key = get_key(blob)
+            paths[key] = path
+            offsets[key] = blob.byte_offset
+            lengths[key] = blob.size
 
-                return callback
-
-            callback = create_callback()
-            dsid.chunk_iter(callback)
+        has_chunk_iter = callable(getattr(dsid, "chunk_iter", None))
+        if has_chunk_iter:
+            dsid.chunk_iter(add_chunk_info)
         else:
             for index in range(num_chunks):
-                add_chunk_info(dsid.get_chunk_info(index), index)
+                add_chunk_info(dsid.get_chunk_info(index))
 
         chunk_manifest = ChunkManifest.from_arrays(
             paths=paths, offsets=offsets, lengths=lengths
diff --git a/virtualizarr/tests/test_readers/test_hdf.py b/virtualizarr/tests/test_readers/test_hdf.py
index c744cd6..25caab9 100644
--- a/virtualizarr/tests/test_readers/test_hdf.py
+++ b/virtualizarr/tests/test_readers/test_hdf.py
@@ -28,13 +28,19 @@ def test_no_chunking(self, no_chunks_netcdf4_file):
         f = h5py.File(no_chunks_netcdf4_file)
         ds = f["data"]
         manifest = _dataset_chunk_manifest(path=no_chunks_netcdf4_file, dataset=ds)
-        assert len(manifest) == 1
+        assert manifest.shape_chunk_grid == (1, 1)
 
     def test_chunked(self, chunked_netcdf4_file):
         f = h5py.File(chunked_netcdf4_file)
         ds = f["data"]
         manifest = _dataset_chunk_manifest(path=chunked_netcdf4_file, dataset=ds)
-        assert len(manifest) == 4
+        assert manifest.shape_chunk_grid == (2, 2)
+
+    def test_chunked_roundtrip(self, chunked_roundtrip):
+        f = h5py.File(chunked_roundtrip)
+        ds = f["var2"]
+        manifest = _dataset_chunk_manifest(path=chunked_roundtrip, dataset=ds)
+        assert manifest.shape_chunk_grid == (2, 8)
 
 
 class TestDatasetDims:

From c833e191abb773e409aec6eeb47ab6438d0ee0a9 Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Sat, 29 Jun 2024 18:22:05 -0600
Subject: [PATCH 45/68] Handle scale_factor attribute serialization for
 compressed files.

---
 virtualizarr/readers/hdf_filters.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py
index cc8e810..1a3c222 100644
--- a/virtualizarr/readers/hdf_filters.py
+++ b/virtualizarr/readers/hdf_filters.py
@@ -88,7 +88,11 @@ def cfcodec_from_dataset(dataset: h5py.Dataset) -> Codec | None:
     attributes = {attr: dataset.attrs[attr] for attr in dataset.attrs}
     mapping = {}
     if "scale_factor" in attributes:
-        mapping["scale_factor"] = 1 / attributes["scale_factor"][0]
+        try:
+            scale_factor = attributes["scale_factor"][0]
+        except IndexError:
+            scale_factor = attributes["scale_factor"]
+        mapping["scale_factor"] = 1 / scale_factor
     else:
         mapping["scale_factor"] = 1
     if "add_offset" in attributes:

From 701bcfad494326a71ec08c454465bceaa33803e9 Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Sat, 29 Jun 2024 18:24:13 -0600
Subject: [PATCH 46/68] Include chunked roundtrip fixture.

---
 virtualizarr/tests/test_readers/conftest.py | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py
index e1a53c5..5fbec00 100644
--- a/virtualizarr/tests/test_readers/conftest.py
+++ b/virtualizarr/tests/test_readers/conftest.py
@@ -196,7 +196,8 @@ def filter_encoded_xarray_netcdf4_file(tmpdir, request, skip_test_for_libhdf5_ve
             "blosc_shuffle": 1,
             "fletcher32": False,
         }
-
+    #  Check on how handle scalar dim.
+    ds = ds.drop_dims("dim3")
     ds["var2"].encoding.update(encoding_config)
     filepath = f"{tmpdir}/{request.param}_xarray.nc"
     ds.to_netcdf(filepath, engine="netcdf4")
@@ -237,3 +238,14 @@ def scale_add_offset_netcdf4_file(tmpdir, np_uncompressed_int16, offset, scale_f
     f["data"].attrs.create(name="add_offset", data=offset)
     f["data"].attrs.create(name="scale_factor", data=np.array([scale_factor]))
     return filepath
+
+
+@pytest.fixture()
+def chunked_roundtrip(tmpdir):
+    ds = create_test_data(dim_sizes=(20, 80, 10))
+    ds = ds.drop_dims("dim3")
+    filepath = f"{tmpdir}/chunked_xarray.nc"
+    ds.to_netcdf(
+        filepath, engine="netcdf4", encoding={"var2": {"chunksizes": (10, 10)}}
+    )
+    return filepath

From 08c988e2c16a7366a4ea99f2fc073da407b326d5 Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Sat, 29 Jun 2024 18:24:48 -0600
Subject: [PATCH 47/68] Standardize xarray integration tests for hdf filters.

---
 .../test_readers/test_hdf_integration.py      | 47 ++++++-------------
 1 file changed, 14 insertions(+), 33 deletions(-)

diff --git a/virtualizarr/tests/test_readers/test_hdf_integration.py b/virtualizarr/tests/test_readers/test_hdf_integration.py
index dca34db..abc23df 100644
--- a/virtualizarr/tests/test_readers/test_hdf_integration.py
+++ b/virtualizarr/tests/test_readers/test_hdf_integration.py
@@ -1,53 +1,34 @@
-import fsspec
-import numpy
 import pytest
 import xarray as xr
+import xarray.testing as xrt
 
 import virtualizarr
 from virtualizarr.kerchunk import FileType
 
 
 class TestIntegration:
+    @pytest.mark.xfail(reason="Investigate initial time value decoding issue")
     def test_filters_h5netcdf_roundtrip(
         self, tmpdir, filter_encoded_xarray_h5netcdf_file
     ):
-        virtual_ds = virtualizarr.open_virtual_dataset(
+        ds = xr.open_dataset(filter_encoded_xarray_h5netcdf_file, decode_times=False)
+        vds = virtualizarr.open_virtual_dataset(
             filter_encoded_xarray_h5netcdf_file, filetype=FileType("netcdf4")
         )
         kerchunk_file = f"{tmpdir}/kerchunk.json"
-        virtual_ds.virtualize.to_kerchunk(kerchunk_file, format="json")
-        fs = fsspec.filesystem("reference", fo=kerchunk_file)
-        m = fs.get_mapper("")
-
-        ds = xr.open_dataset(m, engine="kerchunk")
-        assert isinstance(ds.air.values[0][0][0], numpy.float64)
+        vds.virtualize.to_kerchunk(kerchunk_file, format="json")
+        roundtrip = xr.open_dataset(
+            kerchunk_file, engine="kerchunk", decode_times=False
+        )
+        xrt.assert_allclose(ds, roundtrip)
 
-    @pytest.mark.skip(
-        reason="Issue with xr 'dim1' serialization and blosc availability"
-    )
     def test_filters_netcdf4_roundtrip(
         self, tmpdir, filter_encoded_xarray_netcdf4_file
     ):
         filepath = filter_encoded_xarray_netcdf4_file["filepath"]
-        compressor = filter_encoded_xarray_netcdf4_file["compressor"]
-        virtual_ds = virtualizarr.open_virtual_dataset(
-            filepath, filetype=FileType("netcdf4")
-        )
+        ds = xr.open_dataset(filepath)
+        vds = virtualizarr.open_virtual_dataset(filepath, filetype=FileType("netcdf4"))
         kerchunk_file = f"{tmpdir}/kerchunk.json"
-        virtual_ds.virtualize.to_kerchunk(kerchunk_file, format="json")
-        fs = fsspec.filesystem("reference", fo=kerchunk_file)
-        m = fs.get_mapper("")
-        ds = xr.open_dataset(m, engine="kerchunk")
-
-        expected_encoding = ds["var2"].encoding.copy()
-        compression = expected_encoding.pop("compression")
-        blosc_shuffle = expected_encoding.pop("blosc_shuffle")
-        if compression is not None:
-            if "blosc" in compression and blosc_shuffle:
-                expected_encoding["blosc"] = {
-                    "compressor": compressor,
-                    "shuffle": blosc_shuffle,
-                }
-                expected_encoding["shuffle"] = False
-        actual_encoding = ds["var2"].encoding
-        assert expected_encoding.items() <= actual_encoding.items()
+        vds.virtualize.to_kerchunk(kerchunk_file, format="json")
+        roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk")
+        xrt.assert_equal(ds, roundtrip)

From 4cb4bac261a7825f44798e247c13a6faeb752a5a Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Sat, 29 Jun 2024 20:00:56 -0600
Subject: [PATCH 48/68] Update reader selection logic for new filetype
 determination.

---
 virtualizarr/xarray.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py
index 1a795e5..9671264 100644
--- a/virtualizarr/xarray.py
+++ b/virtualizarr/xarray.py
@@ -136,8 +136,7 @@ def open_virtual_dataset(
                 filepath=filepath, reader_options=reader_options
             )
         filetype = FileType(filetype)
-
-        if filetype.name.lower() == "netcdf4":
+        if filetype.name.lower() == "netcdf4" or filetype.name.lower() == "hdf5":
             virtual_vars = virtual_vars_from_hdf(
                 path=filepath,
                 drop_variables=drop_variables,

From d352104393d0eeacfc3b566a9f0cb79c7e688c8f Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Sun, 30 Jun 2024 13:07:17 -0600
Subject: [PATCH 49/68] Use decode_times for integration test.

---
 .../tests/test_readers/test_hdf_integration.py       | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/virtualizarr/tests/test_readers/test_hdf_integration.py b/virtualizarr/tests/test_readers/test_hdf_integration.py
index abc23df..882dea3 100644
--- a/virtualizarr/tests/test_readers/test_hdf_integration.py
+++ b/virtualizarr/tests/test_readers/test_hdf_integration.py
@@ -1,4 +1,3 @@
-import pytest
 import xarray as xr
 import xarray.testing as xrt
 
@@ -7,19 +6,18 @@
 
 
 class TestIntegration:
-    @pytest.mark.xfail(reason="Investigate initial time value decoding issue")
     def test_filters_h5netcdf_roundtrip(
         self, tmpdir, filter_encoded_xarray_h5netcdf_file
     ):
-        ds = xr.open_dataset(filter_encoded_xarray_h5netcdf_file, decode_times=False)
+        ds = xr.open_dataset(filter_encoded_xarray_h5netcdf_file, decode_times=True)
         vds = virtualizarr.open_virtual_dataset(
-            filter_encoded_xarray_h5netcdf_file, filetype=FileType("netcdf4")
+            filter_encoded_xarray_h5netcdf_file,
+            loadable_variables=["time"],
+            cftime_variables=["time"],
         )
         kerchunk_file = f"{tmpdir}/kerchunk.json"
         vds.virtualize.to_kerchunk(kerchunk_file, format="json")
-        roundtrip = xr.open_dataset(
-            kerchunk_file, engine="kerchunk", decode_times=False
-        )
+        roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk", decode_times=True)
         xrt.assert_allclose(ds, roundtrip)
 
     def test_filters_netcdf4_roundtrip(

From 3d89ea426ccb0f8abdcb961e55773887d48d38d6 Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Sun, 30 Jun 2024 13:38:46 -0600
Subject: [PATCH 50/68] Standardize fixture names for hdf5 vs netcdf4 file
 types.

---
 virtualizarr/tests/test_readers/conftest.py   | 36 +++++----
 virtualizarr/tests/test_readers/test_hdf.py   | 78 +++++++++----------
 .../tests/test_readers/test_hdf_filters.py    | 26 +++----
 .../test_readers/test_hdf_integration.py      | 10 +--
 4 files changed, 76 insertions(+), 74 deletions(-)

diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py
index 5fbec00..539b2fb 100644
--- a/virtualizarr/tests/test_readers/conftest.py
+++ b/virtualizarr/tests/test_readers/conftest.py
@@ -9,7 +9,7 @@
 
 
 @pytest.fixture
-def empty_chunks_netcdf4_file(tmpdir):
+def empty_chunks_hdf5_file(tmpdir):
     ds = xr.Dataset({"data": []})
     filepath = f"{tmpdir}/empty_chunks.nc"
     ds.to_netcdf(filepath, engine="h5netcdf")
@@ -17,7 +17,7 @@ def empty_chunks_netcdf4_file(tmpdir):
 
 
 @pytest.fixture
-def empty_dataset_netcdf4_file(tmpdir):
+def empty_dataset_hdf5_file(tmpdir):
     filepath = f"{tmpdir}/empty_dataset.nc"
     f = h5py.File(filepath, "w")
     f.create_dataset("data", shape=(0,), dtype="f")
@@ -25,7 +25,7 @@ def empty_dataset_netcdf4_file(tmpdir):
 
 
 @pytest.fixture
-def no_chunks_netcdf4_file(tmpdir):
+def no_chunks_hdf5_file(tmpdir):
     filepath = f"{tmpdir}/no_chunks.nc"
     f = h5py.File(filepath, "w")
     data = np.random.random((10, 10))
@@ -34,7 +34,7 @@ def no_chunks_netcdf4_file(tmpdir):
 
 
 @pytest.fixture
-def chunked_netcdf4_file(tmpdir):
+def chunked_hdf5_file(tmpdir):
     filepath = f"{tmpdir}/chunks.nc"
     f = h5py.File(filepath, "w")
     data = np.random.random((100, 100))
@@ -43,7 +43,7 @@ def chunked_netcdf4_file(tmpdir):
 
 
 @pytest.fixture
-def single_dimension_scale_netcdf4_file(tmpdir):
+def single_dimension_scale_hdf5_file(tmpdir):
     filepath = f"{tmpdir}/single_dimension_scale.nc"
     f = h5py.File(filepath, "w")
     data = [1, 2]
@@ -56,7 +56,7 @@ def single_dimension_scale_netcdf4_file(tmpdir):
 
 
 @pytest.fixture
-def is_scale_netcdf4_file(tmpdir):
+def is_scale_hdf5_file(tmpdir):
     filepath = f"{tmpdir}/is_scale.nc"
     f = h5py.File(filepath, "w")
     data = [1, 2]
@@ -66,7 +66,7 @@ def is_scale_netcdf4_file(tmpdir):
 
 
 @pytest.fixture
-def multiple_dimension_scales_netcdf4_file(tmpdir):
+def multiple_dimension_scales_hdf5_file(tmpdir):
     filepath = f"{tmpdir}/multiple_dimension_scales.nc"
     f = h5py.File(filepath, "w")
     data = [1, 2]
@@ -96,7 +96,7 @@ def chunked_dimensions_netcdf4_file(tmpdir):
 
 
 @pytest.fixture
-def string_attributes_netcdf4_file(tmpdir):
+def string_attributes_hdf5_file(tmpdir):
     filepath = f"{tmpdir}/attributes.nc"
     f = h5py.File(filepath, "w")
     data = np.random.random((10, 10))
@@ -107,7 +107,7 @@ def string_attributes_netcdf4_file(tmpdir):
 
 
 @pytest.fixture
-def root_attributes_netcdf4_file(tmpdir):
+def root_attributes_hdf5_file(tmpdir):
     filepath = f"{tmpdir}/root_attributes.nc"
     f = h5py.File(filepath, "w")
     f.attrs["attribute_name"] = "attribute_name"
@@ -115,7 +115,7 @@ def root_attributes_netcdf4_file(tmpdir):
 
 
 @pytest.fixture
-def group_netcdf4_file(tmpdir):
+def group_hdf5_file(tmpdir):
     filepath = f"{tmpdir}/group.nc"
     f = h5py.File(filepath, "w")
     f.create_group("group")
@@ -123,7 +123,7 @@ def group_netcdf4_file(tmpdir):
 
 
 @pytest.fixture
-def multiple_datasets_netcdf4_file(tmpdir):
+def multiple_datasets_hdf5_file(tmpdir):
     filepath = f"{tmpdir}/multiple_datasets.nc"
     f = h5py.File(filepath, "w")
     data = np.random.random((10, 10))
@@ -138,7 +138,7 @@ def np_uncompressed():
 
 
 @pytest.fixture(params=["gzip", "blosc_lz4", "lz4", "bzip2", "zstd"])
-def filter_encoded_netcdf4_file(tmpdir, np_uncompressed, request):
+def filter_encoded_hdf5_file(tmpdir, np_uncompressed, request):
     filepath = f"{tmpdir}/{request.param}.nc"
     f = h5py.File(filepath, "w")
     if request.param == "gzip":
@@ -162,7 +162,7 @@ def filter_encoded_netcdf4_file(tmpdir, np_uncompressed, request):
 
 
 @pytest.fixture(params=["gzip"])
-def filter_encoded_xarray_h5netcdf_file(tmpdir, request):
+def filter_encoded_roundtrip_hdf5_file(tmpdir, request):
     ds = xr.tutorial.open_dataset("air_temperature")
     encoding = {}
     if request.param == "gzip":
@@ -184,7 +184,9 @@ def skip_test_for_libhdf5_version():
 
 
 @pytest.fixture(params=["blosc_zlib"])
-def filter_encoded_xarray_netcdf4_file(tmpdir, request, skip_test_for_libhdf5_version):
+def filter_encoded_roundtrip_netcdf4_file(
+    tmpdir, request, skip_test_for_libhdf5_version
+):
     if skip_test_for_libhdf5_version:
         pytest.skip("Requires libhdf5 >= 1.14")
     ds = create_test_data(dim_sizes=(20, 80, 10))
@@ -215,7 +217,7 @@ def offset():
 
 
 @pytest.fixture
-def add_offset_netcdf4_file(tmpdir, np_uncompressed_int16, offset):
+def add_offset_hdf5_file(tmpdir, np_uncompressed_int16, offset):
     filepath = f"{tmpdir}/offset.nc"
     f = h5py.File(filepath, "w")
     data = np_uncompressed_int16 - offset
@@ -230,7 +232,7 @@ def scale_factor():
 
 
 @pytest.fixture
-def scale_add_offset_netcdf4_file(tmpdir, np_uncompressed_int16, offset, scale_factor):
+def scale_add_offset_hdf5_file(tmpdir, np_uncompressed_int16, offset, scale_factor):
     filepath = f"{tmpdir}/scale_offset.nc"
     f = h5py.File(filepath, "w")
     data = (np_uncompressed_int16 - offset) / scale_factor
@@ -241,7 +243,7 @@ def scale_add_offset_netcdf4_file(tmpdir, np_uncompressed_int16, offset, scale_f
 
 
 @pytest.fixture()
-def chunked_roundtrip(tmpdir):
+def chunked_roundtrip_hdf5_file(tmpdir):
     ds = create_test_data(dim_sizes=(20, 80, 10))
     ds = ds.drop_dims("dim3")
     filepath = f"{tmpdir}/chunked_xarray.nc"
diff --git a/virtualizarr/tests/test_readers/test_hdf.py b/virtualizarr/tests/test_readers/test_hdf.py
index 25caab9..1fb0f6e 100644
--- a/virtualizarr/tests/test_readers/test_hdf.py
+++ b/virtualizarr/tests/test_readers/test_hdf.py
@@ -11,59 +11,59 @@
 
 
 class TestDatasetChunkManifest:
-    def test_empty_chunks(self, empty_chunks_netcdf4_file):
-        f = h5py.File(empty_chunks_netcdf4_file)
+    def test_empty_chunks(self, empty_chunks_hdf5_file):
+        f = h5py.File(empty_chunks_hdf5_file)
         ds = f["data"]
         with pytest.raises(ValueError, match="chunked but contains no chunks"):
-            _dataset_chunk_manifest(path=empty_chunks_netcdf4_file, dataset=ds)
+            _dataset_chunk_manifest(path=empty_chunks_hdf5_file, dataset=ds)
 
     @pytest.mark.skip("Need to differentiate non coordinate dimensions from empty")
-    def test_empty_dataset(self, empty_dataset_netcdf4_file):
-        f = h5py.File(empty_dataset_netcdf4_file)
+    def test_empty_dataset(self, empty_dataset_hdf5_file):
+        f = h5py.File(empty_dataset_hdf5_file)
         ds = f["data"]
         with pytest.raises(ValueError, match="no space allocated in the file"):
-            _dataset_chunk_manifest(path=empty_dataset_netcdf4_file, dataset=ds)
+            _dataset_chunk_manifest(path=empty_dataset_hdf5_file, dataset=ds)
 
-    def test_no_chunking(self, no_chunks_netcdf4_file):
-        f = h5py.File(no_chunks_netcdf4_file)
+    def test_no_chunking(self, no_chunks_hdf5_file):
+        f = h5py.File(no_chunks_hdf5_file)
         ds = f["data"]
-        manifest = _dataset_chunk_manifest(path=no_chunks_netcdf4_file, dataset=ds)
+        manifest = _dataset_chunk_manifest(path=no_chunks_hdf5_file, dataset=ds)
         assert manifest.shape_chunk_grid == (1, 1)
 
-    def test_chunked(self, chunked_netcdf4_file):
-        f = h5py.File(chunked_netcdf4_file)
+    def test_chunked(self, chunked_hdf5_file):
+        f = h5py.File(chunked_hdf5_file)
         ds = f["data"]
-        manifest = _dataset_chunk_manifest(path=chunked_netcdf4_file, dataset=ds)
+        manifest = _dataset_chunk_manifest(path=chunked_hdf5_file, dataset=ds)
         assert manifest.shape_chunk_grid == (2, 2)
 
-    def test_chunked_roundtrip(self, chunked_roundtrip):
-        f = h5py.File(chunked_roundtrip)
+    def test_chunked_roundtrip(self, chunked_roundtrip_hdf5_file):
+        f = h5py.File(chunked_roundtrip_hdf5_file)
         ds = f["var2"]
-        manifest = _dataset_chunk_manifest(path=chunked_roundtrip, dataset=ds)
+        manifest = _dataset_chunk_manifest(path=chunked_roundtrip_hdf5_file, dataset=ds)
         assert manifest.shape_chunk_grid == (2, 8)
 
 
 class TestDatasetDims:
-    def test_single_dimension_scale(self, single_dimension_scale_netcdf4_file):
-        f = h5py.File(single_dimension_scale_netcdf4_file)
+    def test_single_dimension_scale(self, single_dimension_scale_hdf5_file):
+        f = h5py.File(single_dimension_scale_hdf5_file)
         ds = f["data"]
         dims = _dataset_dims(ds)
         assert dims[0] == "x"
 
-    def test_is_dimension_scale(self, is_scale_netcdf4_file):
-        f = h5py.File(is_scale_netcdf4_file)
+    def test_is_dimension_scale(self, is_scale_hdf5_file):
+        f = h5py.File(is_scale_hdf5_file)
         ds = f["data"]
         dims = _dataset_dims(ds)
         assert dims[0] == "data"
 
-    def test_multiple_dimension_scales(self, multiple_dimension_scales_netcdf4_file):
-        f = h5py.File(multiple_dimension_scales_netcdf4_file)
+    def test_multiple_dimension_scales(self, multiple_dimension_scales_hdf5_file):
+        f = h5py.File(multiple_dimension_scales_hdf5_file)
         ds = f["data"]
         with pytest.raises(ValueError, match="dimension scales attached"):
             _dataset_dims(ds)
 
-    def test_no_dimension_scales(self, no_chunks_netcdf4_file):
-        f = h5py.File(no_chunks_netcdf4_file)
+    def test_no_dimension_scales(self, no_chunks_hdf5_file):
+        f = h5py.File(no_chunks_hdf5_file)
         ds = f["data"]
         dims = _dataset_dims(ds)
         assert dims == ["phony_dim_0", "phony_dim_1"]
@@ -76,33 +76,33 @@ def test_chunked_dataset(self, chunked_dimensions_netcdf4_file):
         var = _dataset_to_variable(chunked_dimensions_netcdf4_file, ds)
         assert var.chunks == (50, 50)
 
-    def test_not_chunked_dataset(self, single_dimension_scale_netcdf4_file):
-        f = h5py.File(single_dimension_scale_netcdf4_file)
+    def test_not_chunked_dataset(self, single_dimension_scale_hdf5_file):
+        f = h5py.File(single_dimension_scale_hdf5_file)
         ds = f["data"]
-        var = _dataset_to_variable(single_dimension_scale_netcdf4_file, ds)
+        var = _dataset_to_variable(single_dimension_scale_hdf5_file, ds)
         assert var.chunks == (2,)
 
-    def test_dataset_attributes(self, string_attributes_netcdf4_file):
-        f = h5py.File(string_attributes_netcdf4_file)
+    def test_dataset_attributes(self, string_attributes_hdf5_file):
+        f = h5py.File(string_attributes_hdf5_file)
         ds = f["data"]
-        var = _dataset_to_variable(string_attributes_netcdf4_file, ds)
+        var = _dataset_to_variable(string_attributes_hdf5_file, ds)
         assert var.attrs["attribute_name"] == "attribute_name"
 
 
 class TestExtractAttributes:
-    def test_string_attribute(self, string_attributes_netcdf4_file):
-        f = h5py.File(string_attributes_netcdf4_file)
+    def test_string_attribute(self, string_attributes_hdf5_file):
+        f = h5py.File(string_attributes_hdf5_file)
         ds = f["data"]
         attrs = _extract_attrs(ds)
         assert attrs["attribute_name"] == "attribute_name"
 
-    def test_root_attribute(self, root_attributes_netcdf4_file):
-        f = h5py.File(root_attributes_netcdf4_file)
+    def test_root_attribute(self, root_attributes_hdf5_file):
+        f = h5py.File(root_attributes_hdf5_file)
         attrs = _extract_attrs(f)
         assert attrs["attribute_name"] == "attribute_name"
 
-    def test_multiple_attributes(self, string_attributes_netcdf4_file):
-        f = h5py.File(string_attributes_netcdf4_file)
+    def test_multiple_attributes(self, string_attributes_hdf5_file):
+        f = h5py.File(string_attributes_hdf5_file)
         ds = f["data"]
         attrs = _extract_attrs(ds)
         assert len(attrs.keys()) == 2
@@ -113,10 +113,10 @@ def test_variable_with_dimensions(self, chunked_dimensions_netcdf4_file):
         variables = virtual_vars_from_hdf(chunked_dimensions_netcdf4_file)
         assert len(variables) == 3
 
-    def test_groups_not_implemented(self, group_netcdf4_file):
+    def test_groups_not_implemented(self, group_hdf5_file):
         with pytest.raises(NotImplementedError):
-            virtual_vars_from_hdf(group_netcdf4_file)
+            virtual_vars_from_hdf(group_hdf5_file)
 
-    def test_drop_variables(self, multiple_datasets_netcdf4_file):
-        variables = virtual_vars_from_hdf(multiple_datasets_netcdf4_file, ["data2"])
+    def test_drop_variables(self, multiple_datasets_hdf5_file):
+        variables = virtual_vars_from_hdf(multiple_datasets_hdf5_file, ["data2"])
         assert "data2" not in variables.keys()
diff --git a/virtualizarr/tests/test_readers/test_hdf_filters.py b/virtualizarr/tests/test_readers/test_hdf_filters.py
index 960bcf2..99b3af4 100644
--- a/virtualizarr/tests/test_readers/test_hdf_filters.py
+++ b/virtualizarr/tests/test_readers/test_hdf_filters.py
@@ -39,12 +39,12 @@ def test_zstd(self):
 
 
 class TestCodecsFromDataSet:
-    def test_numcodec_decoding(self, np_uncompressed, filter_encoded_netcdf4_file):
-        f = h5py.File(filter_encoded_netcdf4_file)
+    def test_numcodec_decoding(self, np_uncompressed, filter_encoded_hdf5_file):
+        f = h5py.File(filter_encoded_hdf5_file)
         ds = f["data"]
         chunk_info = ds.id.get_chunk_info(0)
         codecs = codecs_from_dataset(ds)
-        with open(filter_encoded_netcdf4_file, "rb") as file:
+        with open(filter_encoded_hdf5_file, "rb") as file:
             file.seek(chunk_info.byte_offset)
             bytes_read = file.read(chunk_info.size)
             decoded = codecs[0].decode(bytes_read)
@@ -52,8 +52,8 @@ def test_numcodec_decoding(self, np_uncompressed, filter_encoded_netcdf4_file):
 
 
 class TestCFCodecFromDataset:
-    def test_no_cf_convention(self, filter_encoded_netcdf4_file):
-        f = h5py.File(filter_encoded_netcdf4_file)
+    def test_no_cf_convention(self, filter_encoded_hdf5_file):
+        f = h5py.File(filter_encoded_hdf5_file)
         ds = f["data"]
         cf_codec = cfcodec_from_dataset(ds)
         assert cf_codec is None
@@ -68,8 +68,8 @@ def test_cf_scale_factor(self, netcdf4_file):
         assert cf_codec["codec"].dtype == "<f8"
         assert cf_codec["codec"].astype == "<i2"
 
-    def test_cf_add_offset(self, add_offset_netcdf4_file):
-        f = h5py.File(add_offset_netcdf4_file)
+    def test_cf_add_offset(self, add_offset_hdf5_file):
+        f = h5py.File(add_offset_hdf5_file)
         ds = f["data"]
         cf_codec = cfcodec_from_dataset(ds)
         assert cf_codec["target_dtype"] == np.dtype(np.float64)
@@ -78,13 +78,13 @@ def test_cf_add_offset(self, add_offset_netcdf4_file):
         assert cf_codec["codec"].dtype == "<f8"
 
     def test_cf_codec_decoding_offset(
-        self, add_offset_netcdf4_file, np_uncompressed_int16
+        self, add_offset_hdf5_file, np_uncompressed_int16
     ):
-        f = h5py.File(add_offset_netcdf4_file)
+        f = h5py.File(add_offset_hdf5_file)
         ds = f["data"]
         chunk_info = ds.id.get_chunk_info(0)
         cfcodec = cfcodec_from_dataset(ds)
-        with open(add_offset_netcdf4_file, "rb") as file:
+        with open(add_offset_hdf5_file, "rb") as file:
             file.seek(chunk_info.byte_offset)
             bytes_read = file.read(chunk_info.size)
             decoded = cfcodec["codec"].decode(bytes_read)
@@ -92,13 +92,13 @@ def test_cf_codec_decoding_offset(
             assert decoded.dtype == np.float64
 
     def test_cf_codec_decoding_scale_offset(
-        self, scale_add_offset_netcdf4_file, np_uncompressed_int16
+        self, scale_add_offset_hdf5_file, np_uncompressed_int16
     ):
-        f = h5py.File(scale_add_offset_netcdf4_file)
+        f = h5py.File(scale_add_offset_hdf5_file)
         ds = f["data"]
         chunk_info = ds.id.get_chunk_info(0)
         cfcodec = cfcodec_from_dataset(ds)
-        with open(scale_add_offset_netcdf4_file, "rb") as file:
+        with open(scale_add_offset_hdf5_file, "rb") as file:
             file.seek(chunk_info.byte_offset)
             bytes_read = file.read(chunk_info.size)
             decoded = cfcodec["codec"].decode(bytes_read)
diff --git a/virtualizarr/tests/test_readers/test_hdf_integration.py b/virtualizarr/tests/test_readers/test_hdf_integration.py
index 882dea3..4fc7bd3 100644
--- a/virtualizarr/tests/test_readers/test_hdf_integration.py
+++ b/virtualizarr/tests/test_readers/test_hdf_integration.py
@@ -7,11 +7,11 @@
 
 class TestIntegration:
     def test_filters_h5netcdf_roundtrip(
-        self, tmpdir, filter_encoded_xarray_h5netcdf_file
+        self, tmpdir, filter_encoded_roundtrip_hdf5_file
     ):
-        ds = xr.open_dataset(filter_encoded_xarray_h5netcdf_file, decode_times=True)
+        ds = xr.open_dataset(filter_encoded_roundtrip_hdf5_file, decode_times=True)
         vds = virtualizarr.open_virtual_dataset(
-            filter_encoded_xarray_h5netcdf_file,
+            filter_encoded_roundtrip_hdf5_file,
             loadable_variables=["time"],
             cftime_variables=["time"],
         )
@@ -21,9 +21,9 @@ def test_filters_h5netcdf_roundtrip(
         xrt.assert_allclose(ds, roundtrip)
 
     def test_filters_netcdf4_roundtrip(
-        self, tmpdir, filter_encoded_xarray_netcdf4_file
+        self, tmpdir, filter_encoded_roundtrip_netcdf4_file
     ):
-        filepath = filter_encoded_xarray_netcdf4_file["filepath"]
+        filepath = filter_encoded_roundtrip_netcdf4_file["filepath"]
         ds = xr.open_dataset(filepath)
         vds = virtualizarr.open_virtual_dataset(filepath, filetype=FileType("netcdf4"))
         kerchunk_file = f"{tmpdir}/kerchunk.json"

From c9dd0d9cbcc638a4f9d116e3b36a86de997140b6 Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Sun, 30 Jun 2024 22:14:26 -0600
Subject: [PATCH 51/68] Handle array add_offset property for compressed data.

---
 virtualizarr/readers/hdf_filters.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py
index 1a3c222..5b35d8f 100644
--- a/virtualizarr/readers/hdf_filters.py
+++ b/virtualizarr/readers/hdf_filters.py
@@ -96,7 +96,11 @@ def cfcodec_from_dataset(dataset: h5py.Dataset) -> Codec | None:
     else:
         mapping["scale_factor"] = 1
     if "add_offset" in attributes:
-        mapping["add_offset"] = attributes["add_offset"]
+        try:
+            offset = attributes["add_offset"][0]
+        except IndexError:
+            offset = attributes["add_offset"]
+        mapping["add_offset"] = offset
     else:
         mapping["add_offset"] = 0
     if mapping["scale_factor"] != 1 or mapping["add_offset"] != 0:

From db5b4213b0c4b512c872ce4acdce04c66936a6a5 Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Mon, 1 Jul 2024 16:57:11 -0600
Subject: [PATCH 52/68] Include h5py shuffle filter.

---
 virtualizarr/readers/hdf_filters.py            | 18 ++++++++++++++----
 .../tests/test_readers/test_hdf_filters.py     | 11 ++++++++++-
 2 files changed, 24 insertions(+), 5 deletions(-)

diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py
index 5b35d8f..a60dd56 100644
--- a/virtualizarr/readers/hdf_filters.py
+++ b/virtualizarr/readers/hdf_filters.py
@@ -36,6 +36,14 @@ class ZstdProperties(BaseModel):
     level: int
 
 
+class ShuffleProperties(BaseModel):
+    elementsize: int
+
+
+class ZlibProperties(BaseModel):
+    level: int
+
+
 class CFCodec(TypedDict):
     target_dtype: np.dtype
     codec: Codec
@@ -56,9 +64,13 @@ def _filter_to_codec(
             id = _non_standard_filters[id_str]
         else:
             id = id_str
-        conf["id"] = id  # type: ignore[assignment]
         if id == "zlib":
-            conf["level"] = filter_properties  # type: ignore[assignment]
+            zlib_props = ZlibProperties(level=filter_properties)
+            conf = zlib_props.model_dump()  # type: ignore[assignment]
+        if id == "shuffle" and isinstance(filter_properties, tuple):
+            shuffle_props = ShuffleProperties(elementsize=filter_properties[0])
+            conf = shuffle_props.model_dump()  # type: ignore[assignment]
+        conf["id"] = id  # type: ignore[assignment]
     if id_int:
         filter = hdf5plugin.get_filters(id_int)[0]
         id = filter.filter_name
@@ -77,9 +89,7 @@ def _filter_to_codec(
         if id == "zstd" and isinstance(filter_properties, tuple):
             zstd_props = ZstdProperties(level=filter_properties[0])
             conf = zstd_props.model_dump()  # type: ignore[assignment]
-
         conf["id"] = id
-
     codec = registry.get_codec(conf)
     return codec
 
diff --git a/virtualizarr/tests/test_readers/test_hdf_filters.py b/virtualizarr/tests/test_readers/test_hdf_filters.py
index 99b3af4..efaad78 100644
--- a/virtualizarr/tests/test_readers/test_hdf_filters.py
+++ b/virtualizarr/tests/test_readers/test_hdf_filters.py
@@ -37,6 +37,12 @@ def test_zstd(self):
         expected_config = {"id": "zstd", "level": 5}
         assert codec.get_config() == expected_config
 
+    def test_shuffle(self):
+        codec = _filter_to_codec("shuffle", (7,))
+        assert isinstance(codec, numcodecs.shuffle.Shuffle)
+        expected_config = {"id": "shuffle", "elementsize": 7}
+        assert codec.get_config() == expected_config
+
 
 class TestCodecsFromDataSet:
     def test_numcodec_decoding(self, np_uncompressed, filter_encoded_hdf5_file):
@@ -48,7 +54,10 @@ def test_numcodec_decoding(self, np_uncompressed, filter_encoded_hdf5_file):
             file.seek(chunk_info.byte_offset)
             bytes_read = file.read(chunk_info.size)
             decoded = codecs[0].decode(bytes_read)
-            assert decoded == np_uncompressed.tobytes()
+            if isinstance(decoded, np.ndarray):
+                assert decoded.tobytes() == np_uncompressed.tobytes()
+            else:
+                assert decoded == np_uncompressed.tobytes()
 
 
 class TestCFCodecFromDataset:

From 9a1da321e186f56d230cb5609dc787f7d9ec557b Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Mon, 1 Jul 2024 17:03:46 -0600
Subject: [PATCH 53/68] Make ScaleAndOffset codec last in filters list.

---
 virtualizarr/readers/hdf.py                   |  2 +-
 virtualizarr/tests/test_readers/conftest.py   | 36 ++++++++++++++++++-
 .../test_readers/test_hdf_integration.py      | 10 ++++++
 3 files changed, 46 insertions(+), 2 deletions(-)

diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py
index d683f69..f3337c0 100644
--- a/virtualizarr/readers/hdf.py
+++ b/virtualizarr/readers/hdf.py
@@ -176,7 +176,7 @@ def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> Optional[xr.Variab
         cfcodec = cfcodec_from_dataset(dataset)
         attrs = _extract_attrs(dataset)
         if cfcodec:
-            codecs.append(cfcodec["codec"])
+            codecs.insert(0, cfcodec["codec"])
             dtype = cfcodec["target_dtype"]
             attrs.pop("scale_factor", None)
             attrs.pop("add_offset", None)
diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py
index 539b2fb..afc0bee 100644
--- a/virtualizarr/tests/test_readers/conftest.py
+++ b/virtualizarr/tests/test_readers/conftest.py
@@ -137,7 +137,7 @@ def np_uncompressed():
     return np.arange(100)
 
 
-@pytest.fixture(params=["gzip", "blosc_lz4", "lz4", "bzip2", "zstd"])
+@pytest.fixture(params=["gzip", "blosc_lz4", "lz4", "bzip2", "zstd", "shuffle"])
 def filter_encoded_hdf5_file(tmpdir, np_uncompressed, request):
     filepath = f"{tmpdir}/{request.param}.nc"
     f = h5py.File(filepath, "w")
@@ -157,6 +157,8 @@ def filter_encoded_hdf5_file(tmpdir, np_uncompressed, request):
         f.create_dataset(name="data", data=np_uncompressed, **hdf5plugin.BZip2())
     if request.param == "zstd":
         f.create_dataset(name="data", data=np_uncompressed, **hdf5plugin.Zstd(clevel=2))
+    if request.param == "shuffle":
+        f.create_dataset(name="data", data=np_uncompressed, shuffle=True)
 
     return filepath
 
@@ -251,3 +253,35 @@ def chunked_roundtrip_hdf5_file(tmpdir):
         filepath, engine="netcdf4", encoding={"var2": {"chunksizes": (10, 10)}}
     )
     return filepath
+
+
+@pytest.fixture(params=["gzip", "zlib"])
+def filter_and_cf_roundtrip_hdf5_file(tmpdir, request):
+    x = np.arange(100)
+    y = np.arange(100)
+    temperature = 0.1 * x[:, None] + 0.1 * y[None, :]
+    ds = xr.Dataset(
+        {"temperature": (["x", "y"], temperature)},
+        coords={"x": np.arange(100), "y": np.arange(100)},
+    )
+    encoding = {
+        "temperature": {
+            "dtype": "int16",
+            "scale_factor": 0.1,
+            "add_offset": 273.15,
+        }
+    }
+    if request.param == "gzip":
+        encoding["temperature"]["compression"] = "gzip"
+        encoding["temperature"]["compression_opts"] = 7
+
+    if request.param == "zlib":
+        encoding["temperature"]["zlib"] = True
+        encoding["temperature"]["complevel"] = 9
+
+    from random import randint
+
+    filepath = f"{tmpdir}/{request.param}_{randint(0,100)}_cf_roundtrip.nc"
+    ds.to_netcdf(filepath, engine="h5netcdf", encoding=encoding)
+
+    return filepath
diff --git a/virtualizarr/tests/test_readers/test_hdf_integration.py b/virtualizarr/tests/test_readers/test_hdf_integration.py
index 4fc7bd3..dd8d6c3 100644
--- a/virtualizarr/tests/test_readers/test_hdf_integration.py
+++ b/virtualizarr/tests/test_readers/test_hdf_integration.py
@@ -1,3 +1,4 @@
+import pytest
 import xarray as xr
 import xarray.testing as xrt
 
@@ -30,3 +31,12 @@ def test_filters_netcdf4_roundtrip(
         vds.virtualize.to_kerchunk(kerchunk_file, format="json")
         roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk")
         xrt.assert_equal(ds, roundtrip)
+
+    @pytest.mark.xfail(reason="Investigate kerchunk _FillValue logic")
+    def test_filter_and_cf_roundtrip(self, tmpdir, filter_and_cf_roundtrip_hdf5_file):
+        ds = xr.open_dataset(filter_and_cf_roundtrip_hdf5_file)
+        vds = virtualizarr.open_virtual_dataset(filter_and_cf_roundtrip_hdf5_file)
+        kerchunk_file = f"{tmpdir}/filter_cf_kerchunk.json"
+        vds.virtualize.to_kerchunk(kerchunk_file, format="json")
+        roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk")
+        xrt.assert_allclose(ds, roundtrip)

From 9b2b0f8a2b94073c2bf50fe78d8dd068e6d1332c Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Tue, 2 Jul 2024 13:23:23 -0600
Subject: [PATCH 54/68] Apply ScaleAndOffset codec to _FillValue since it's
 value is now downstream.

---
 virtualizarr/readers/hdf.py                             | 4 +++-
 virtualizarr/tests/test_readers/conftest.py             | 7 ++++++-
 virtualizarr/tests/test_readers/test_hdf_integration.py | 2 --
 3 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py
index f3337c0..6197067 100644
--- a/virtualizarr/readers/hdf.py
+++ b/virtualizarr/readers/hdf.py
@@ -180,14 +180,16 @@ def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> Optional[xr.Variab
             dtype = cfcodec["target_dtype"]
             attrs.pop("scale_factor", None)
             attrs.pop("add_offset", None)
+            fill_value = cfcodec["codec"].decode(dataset.fillvalue)
         else:
             dtype = dataset.dtype
+            fill_value = dataset.fillvalue
         filters = [codec.get_config() for codec in codecs]
         zarray = ZArray(
             chunks=chunks,
             compressor=None,
             dtype=dtype,
-            fill_value=dataset.fillvalue,
+            fill_value=fill_value,
             filters=filters,
             order="C",
             shape=dataset.shape,
diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py
index afc0bee..ec4132b 100644
--- a/virtualizarr/tests/test_readers/conftest.py
+++ b/virtualizarr/tests/test_readers/conftest.py
@@ -259,7 +259,9 @@ def chunked_roundtrip_hdf5_file(tmpdir):
 def filter_and_cf_roundtrip_hdf5_file(tmpdir, request):
     x = np.arange(100)
     y = np.arange(100)
+    fill_value = np.int16(-9999)
     temperature = 0.1 * x[:, None] + 0.1 * y[None, :]
+    temperature[0][0] = fill_value
     ds = xr.Dataset(
         {"temperature": (["x", "y"], temperature)},
         coords={"x": np.arange(100), "y": np.arange(100)},
@@ -269,7 +271,10 @@ def filter_and_cf_roundtrip_hdf5_file(tmpdir, request):
             "dtype": "int16",
             "scale_factor": 0.1,
             "add_offset": 273.15,
-        }
+            "_FillValue": fill_value,
+        },
+        "x": {"_FillValue": fill_value},
+        "y": {"_FillValue": fill_value},
     }
     if request.param == "gzip":
         encoding["temperature"]["compression"] = "gzip"
diff --git a/virtualizarr/tests/test_readers/test_hdf_integration.py b/virtualizarr/tests/test_readers/test_hdf_integration.py
index dd8d6c3..5cf3f79 100644
--- a/virtualizarr/tests/test_readers/test_hdf_integration.py
+++ b/virtualizarr/tests/test_readers/test_hdf_integration.py
@@ -1,4 +1,3 @@
-import pytest
 import xarray as xr
 import xarray.testing as xrt
 
@@ -32,7 +31,6 @@ def test_filters_netcdf4_roundtrip(
         roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk")
         xrt.assert_equal(ds, roundtrip)
 
-    @pytest.mark.xfail(reason="Investigate kerchunk _FillValue logic")
     def test_filter_and_cf_roundtrip(self, tmpdir, filter_and_cf_roundtrip_hdf5_file):
         ds = xr.open_dataset(filter_and_cf_roundtrip_hdf5_file)
         vds = virtualizarr.open_virtual_dataset(filter_and_cf_roundtrip_hdf5_file)

From 9ef136275ff636535dcb7e6ecc5b35c1e7149065 Mon Sep 17 00:00:00 2001
From: sharkinsspatial <sharkinsgis@gmail.com>
Date: Tue, 2 Jul 2024 15:12:04 -0600
Subject: [PATCH 55/68] Coerce scale and add_offset values to native float for
 JSON serialization.

---
 virtualizarr/readers/hdf_filters.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py
index a60dd56..ae232fe 100644
--- a/virtualizarr/readers/hdf_filters.py
+++ b/virtualizarr/readers/hdf_filters.py
@@ -102,7 +102,7 @@ def cfcodec_from_dataset(dataset: h5py.Dataset) -> Codec | None:
             scale_factor = attributes["scale_factor"][0]
         except IndexError:
             scale_factor = attributes["scale_factor"]
-        mapping["scale_factor"] = 1 / scale_factor
+        mapping["scale_factor"] = float(1 / scale_factor)
     else:
         mapping["scale_factor"] = 1
     if "add_offset" in attributes:
@@ -110,7 +110,7 @@ def cfcodec_from_dataset(dataset: h5py.Dataset) -> Codec | None:
             offset = attributes["add_offset"][0]
         except IndexError:
             offset = attributes["add_offset"]
-        mapping["add_offset"] = offset
+        mapping["add_offset"] = float(offset)
     else:
         mapping["add_offset"] = 0
     if mapping["scale_factor"] != 1 or mapping["add_offset"] != 0:

From eb16bc1ab249a5a2d9b48ae1b7920c6f0d7a4c1d Mon Sep 17 00:00:00 2001
From: Gustavo Hidalgo <guhidalgo@microsoft.com>
Date: Wed, 17 Jul 2024 16:27:30 -0400
Subject: [PATCH 56/68] Conformant ZarrV3 codecs

---
 virtualizarr/tests/test_integration.py |  2 +-
 virtualizarr/tests/test_zarr.py        | 62 ++++++++++++++--
 virtualizarr/zarr.py                   | 97 ++++++++++++++++++++++++--
 3 files changed, 148 insertions(+), 13 deletions(-)

diff --git a/virtualizarr/tests/test_integration.py b/virtualizarr/tests/test_integration.py
index 2e612de..239316a 100644
--- a/virtualizarr/tests/test_integration.py
+++ b/virtualizarr/tests/test_integration.py
@@ -138,7 +138,7 @@ def test_non_dimension_coordinates(self, tmpdir, format):
         # regression test for GH issue #105
 
         # set up example xarray dataset containing non-dimension coordinate variables
-        ds = xr.Dataset(coords={"lat": (["x", "y"], np.arange(6).reshape(2, 3))})
+        ds = xr.Dataset(coords={"lat": (["x", "y"], np.arange(6.0).reshape(2, 3))})
 
         # save it to disk as netCDF (in temporary directory)
         ds.to_netcdf(f"{tmpdir}/non_dim_coords.nc")
diff --git a/virtualizarr/tests/test_zarr.py b/virtualizarr/tests/test_zarr.py
index 80d04b9..01ac7e5 100644
--- a/virtualizarr/tests/test_zarr.py
+++ b/virtualizarr/tests/test_zarr.py
@@ -1,12 +1,17 @@
+import json
+
 import numpy as np
+import pytest
 import xarray as xr
 import xarray.testing as xrt
 
 from virtualizarr import ManifestArray, open_virtual_dataset
 from virtualizarr.manifests.manifest import ChunkManifest
+from virtualizarr.zarr import dataset_to_zarr, metadata_from_zarr_json
 
 
-def test_zarr_v3_roundtrip(tmpdir):
+@pytest.fixture
+def vds_with_manifest_arrays() -> xr.Dataset:
     arr = ManifestArray(
         chunkmanifest=ChunkManifest(
             entries={"0.0": dict(path="test.nc", offset=6144, length=48)}
@@ -15,18 +20,63 @@ def test_zarr_v3_roundtrip(tmpdir):
             shape=(2, 3),
             dtype=np.dtype("<i8"),
             chunks=(2, 3),
-            compressor=None,
+            compressor="gzip",
             filters=None,
-            fill_value=np.nan,
+            fill_value=0,
             order="C",
             zarr_format=3,
         ),
     )
-    original = xr.Dataset({"a": (["x", "y"], arr)}, attrs={"something": 0})
+    return xr.Dataset({"a": (["x", "y"], arr)}, attrs={"something": 0})
+
+
+def isconfigurable(value: dict):
+    """
+    Several metadata attributes in ZarrV3 use a dictionary with keys "name" : str and "configuration" : dict
+    """
+    return "name" in value and "configuration" in value
 
-    original.virtualize.to_zarr(tmpdir / "store.zarr")
+
+def test_zarr_v3_roundtrip(tmpdir, vds_with_manifest_arrays: xr.Dataset):
+    vds_with_manifest_arrays.virtualize.to_zarr(tmpdir / "store.zarr")
     roundtrip = open_virtual_dataset(
         tmpdir / "store.zarr", filetype="zarr_v3", indexes={}
     )
 
-    xrt.assert_identical(roundtrip, original)
+    xrt.assert_identical(roundtrip, vds_with_manifest_arrays)
+
+
+def test_metadata_roundtrip(tmpdir, vds_with_manifest_arrays: xr.Dataset):
+    dataset_to_zarr(vds_with_manifest_arrays, tmpdir / "store.zarr")
+    zarray, _, _ = metadata_from_zarr_json(tmpdir / "store.zarr/a/zarr.json")
+    assert zarray == vds_with_manifest_arrays.a.data.zarray
+
+
+def test_zarr_v3_metadata_conformance(tmpdir, vds_with_manifest_arrays: xr.Dataset):
+    """
+    Checks that the output metadata of an array variable conforms to this spec
+    for the required attributes:
+    https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html#metadata
+    """
+    dataset_to_zarr(vds_with_manifest_arrays, tmpdir / "store.zarr")
+    # read the a variable's metadata
+    with open(tmpdir / "store.zarr/a/zarr.json", mode="r") as f:
+        metadata = json.loads(f.read())
+    assert metadata["zarr_format"] == 3
+    assert metadata["node_type"] == "array"
+    assert isinstance(metadata["shape"], list) and all(
+        isinstance(dim, int) for dim in metadata["shape"]
+    )
+    assert isinstance(metadata["data_type"], str) or isconfigurable(
+        metadata["data_type"]
+    )
+    assert isconfigurable(metadata["chunk_grid"])
+    assert isconfigurable(metadata["chunk_key_encoding"])
+    assert any(
+        isinstance(metadata["fill_value"], t) for t in (bool, int, float, str, list)
+    )
+    assert (
+        isinstance(metadata["codecs"], list)
+        and len(metadata["codecs"]) > 1
+        and all(isconfigurable(codec) for codec in metadata["codecs"])
+    )
diff --git a/virtualizarr/zarr.py b/virtualizarr/zarr.py
index 545a86f..0ffc224 100644
--- a/virtualizarr/zarr.py
+++ b/virtualizarr/zarr.py
@@ -6,8 +6,10 @@
     Literal,
     NewType,
     Optional,
+    Union,
 )
 
+import numcodecs
 import numpy as np
 import ujson  # type: ignore
 import xarray as xr
@@ -103,6 +105,8 @@ def dict(self) -> dict[str, Any]:
 
         if zarray_dict["fill_value"] is np.nan:
             zarray_dict["fill_value"] = None
+        else:
+            zarray_dict["fill_value"] = self._default_fill_value()
 
         return zarray_dict
 
@@ -134,6 +138,80 @@ def replace(
             zarr_format=zarr_format if zarr_format is not None else self.zarr_format,
         )
 
+    def _default_fill_value(self) -> Union[bool, int, float, str, list]:
+        """
+        The value and format of the fill_value depend on the data_type of the array.
+        See here for spec:
+        https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html#fill-value
+        """
+        # numpy dtypes's hierarchy lets us avoid checking for all the widths
+        # https://numpy.org/doc/stable/reference/arrays.scalars.html
+        if self.dtype is np.dtype("bool"):
+            return False
+        elif self.dtype is np.dtype("int"):
+            return 0
+        elif self.dtype is np.dtype("float"):
+            return "NaN"
+        elif self.dtype is np.dtype("complex"):
+            return ["NaN", "NaN"]
+        else:
+            return "NaN"
+
+    def _v3_codec_pipeline(self) -> list:
+        """
+        VirtualiZarr internally uses the `filters`, `compressor`, and `order` attributes
+        from zarr v2, but to create conformant zarr v3 metadata those 3 must be turned into `codecs` objects.
+        Not all codecs are created equal though: https://github.com/zarr-developers/zarr-python/issues/1943
+        An array _must_ declare a single ArrayBytes codec, and 0 or more ArrayArray, BytesBytes codecs.
+        Roughly, this is the mapping:
+        ```
+            filters: Iterable[ArrayArrayCodec] #optional
+            compressor: ArrayBytesCodec #mandatory
+            post_compressor: Iterable[BytesBytesCodec] #optional
+        ```
+        """
+        if self.filters:
+            filter_codecs_configs = [
+                numcodecs.get_codec(filter).get_config() for filter in self.filters
+            ]
+            filters = [
+                dict(name=codec.pop("id"), configuration=codec)
+                for codec in filter_codecs_configs
+            ]
+        else:
+            filters = []
+
+        # Noting here that zarr v3 has very few codecs specificed in the official spec,
+        # and that there are far more codecs in `numcodecs`. We take a gamble and assume
+        # that the codec names and configuration are simply mapped into zarrv3 "configurables".
+        compressor_codec = numcodecs.get_codec(
+            # default to gzip because it is officially specified in the zarr v3 spec
+            dict(id=self.compressor or "gzip")
+        ).get_config()
+        compressor_id = compressor_codec.pop("id")
+        compressor = dict(name=compressor_id, configuration=compressor_codec)
+
+        # https://zarr-specs.readthedocs.io/en/latest/v3/codecs/transpose/v1.0.html#transpose-codec-v1
+        # Either "C" or "F", defining the layout of bytes within each chunk of the array.
+        # "C" means row-major order, i.e., the last dimension varies fastest;
+        # "F" means column-major order, i.e., the first dimension varies fastest.
+        if self.order == "C":
+            order = tuple(range(len(self.shape)))
+        elif self.order == "F":
+            order = tuple(reversed(range(len(self.shape))))
+
+        transpose = dict(name="transpose", configuration=dict(order=order))
+        # https://github.com/zarr-developers/zarr-python/pull/1944#issuecomment-2151994097
+        # "If no ArrayBytesCodec is supplied, we can auto-add a BytesCodec"
+        bytes = dict(
+            name="bytes", configuration={}
+        )  # TODO need to handle endianess configuration
+
+        # The order here is significant!
+        # [ArrayArray] -> ArrayBytes -> [BytesBytes]
+        codec_pipeline = [transpose, bytes] + [compressor] + filters
+        return codec_pipeline
+
 
 def encode_dtype(dtype: np.dtype) -> str:
     # TODO not sure if there is a better way to get the '<i4' style representation of the dtype out
@@ -234,9 +312,10 @@ def zarr_v3_array_metadata(zarray: ZArray, dim_names: list[str], attrs: dict) ->
         "name": "default",
         "configuration": {"separator": "/"},
     }
-    metadata["codecs"] = metadata.pop("filters")
-    metadata.pop("compressor")  # TODO this should be entered in codecs somehow
-    metadata.pop("order")  # TODO this should be replaced by a transpose codec
+    metadata["codecs"] = zarray._v3_codec_pipeline()
+    metadata.pop("filters")
+    metadata.pop("compressor")
+    metadata.pop("order")
 
     # indicate that we're using the manifest storage transformer ZEP
     metadata["storage_transformers"] = [
@@ -282,13 +361,19 @@ def metadata_from_zarr_json(filepath: Path) -> tuple[ZArray, list[str], dict]:
         fill_value = np.nan
     else:
         fill_value = metadata["fill_value"]
-
+    all_codecs = [
+        codec
+        for codec in metadata["codecs"]
+        if codec["name"] not in ("transpose", "bytes")
+    ]
+    compressor = all_codecs[0]
+    filters = [dict(id=f.pop("name"), **f) for f in all_codecs[1:]] or None
     zarray = ZArray(
         chunks=metadata["chunk_grid"]["configuration"]["chunk_shape"],
-        compressor=metadata["codecs"],
+        compressor=compressor["name"],
         dtype=np.dtype(metadata["data_type"]),
         fill_value=fill_value,
-        filters=metadata.get("filters", None),
+        filters=filters,
         order="C",
         shape=chunk_shape,
         zarr_format=3,

From 5f1b7f9aff309c53e95bdd85fdf6dee7a2caae3e Mon Sep 17 00:00:00 2001
From: Gustavo Hidalgo <guhidalgo@microsoft.com>
Date: Wed, 17 Jul 2024 16:42:39 -0400
Subject: [PATCH 57/68] Update docs

---
 docs/releases.rst | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/docs/releases.rst b/docs/releases.rst
index c44ff24..1451191 100644
--- a/docs/releases.rst
+++ b/docs/releases.rst
@@ -12,6 +12,9 @@ New Features
 Breaking changes
 ~~~~~~~~~~~~~~~~
 
+- Serialize valid ZarrV3 metadata (for :pull:`193`).
+  By `Gustavo Hidalgo <https://github.com/ghidalgo3>`_.
+
 Deprecations
 ~~~~~~~~~~~~
 

From 519d45d6c62480d3ee6bf378cc4629fd92cb6cb0 Mon Sep 17 00:00:00 2001
From: Gustavo Hidalgo <zambrano.hidalgo@gmail.com>
Date: Wed, 17 Jul 2024 21:19:27 -0400
Subject: [PATCH 58/68] Update virtualizarr/zarr.py

Co-authored-by: Tom Augspurger <tom.augspurger88@gmail.com>
---
 virtualizarr/zarr.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/virtualizarr/zarr.py b/virtualizarr/zarr.py
index 0ffc224..45f7874 100644
--- a/virtualizarr/zarr.py
+++ b/virtualizarr/zarr.py
@@ -196,7 +196,7 @@ def _v3_codec_pipeline(self) -> list:
         # "C" means row-major order, i.e., the last dimension varies fastest;
         # "F" means column-major order, i.e., the first dimension varies fastest.
         if self.order == "C":
-            order = tuple(range(len(self.shape)))
+            order = tuple(enumerate(self.shape))
         elif self.order == "F":
             order = tuple(reversed(range(len(self.shape))))
 

From 76e9c8ecc0f4da63db1f41539f2a9a655129214b Mon Sep 17 00:00:00 2001
From: Gustavo Hidalgo <zambrano.hidalgo@gmail.com>
Date: Wed, 17 Jul 2024 21:19:44 -0400
Subject: [PATCH 59/68] Update virtualizarr/zarr.py

Co-authored-by: Tom Augspurger <tom.augspurger88@gmail.com>
---
 virtualizarr/zarr.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/virtualizarr/zarr.py b/virtualizarr/zarr.py
index 45f7874..1bb8cc3 100644
--- a/virtualizarr/zarr.py
+++ b/virtualizarr/zarr.py
@@ -198,7 +198,7 @@ def _v3_codec_pipeline(self) -> list:
         if self.order == "C":
             order = tuple(enumerate(self.shape))
         elif self.order == "F":
-            order = tuple(reversed(range(len(self.shape))))
+            order = tuple(reversed(enumerate(self.shape)))
 
         transpose = dict(name="transpose", configuration=dict(order=order))
         # https://github.com/zarr-developers/zarr-python/pull/1944#issuecomment-2151994097

From 000c52072fafc80cfb9defae39a6310ec4574b54 Mon Sep 17 00:00:00 2001
From: Gustavo Hidalgo <guhidalgo@microsoft.com>
Date: Wed, 17 Jul 2024 21:38:48 -0400
Subject: [PATCH 60/68] Change default_fill to 0s

---
 virtualizarr/zarr.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/virtualizarr/zarr.py b/virtualizarr/zarr.py
index 0ffc224..cdc45df 100644
--- a/virtualizarr/zarr.py
+++ b/virtualizarr/zarr.py
@@ -151,11 +151,11 @@ def _default_fill_value(self) -> Union[bool, int, float, str, list]:
         elif self.dtype is np.dtype("int"):
             return 0
         elif self.dtype is np.dtype("float"):
-            return "NaN"
+            return 0.0
         elif self.dtype is np.dtype("complex"):
-            return ["NaN", "NaN"]
+            return [0.0, 0.0]
         else:
-            return "NaN"
+            return 0.0
 
     def _v3_codec_pipeline(self) -> list:
         """

From c2e7279edc68e0d40f3b99c114d54d3e2e08f746 Mon Sep 17 00:00:00 2001
From: Gustavo Hidalgo <guhidalgo@microsoft.com>
Date: Wed, 17 Jul 2024 22:26:04 -0400
Subject: [PATCH 61/68] Generate permutation

---
 virtualizarr/zarr.py | 77 ++++++++++++++++++++++++++------------------
 1 file changed, 45 insertions(+), 32 deletions(-)

diff --git a/virtualizarr/zarr.py b/virtualizarr/zarr.py
index 2b2dd04..a00c7a2 100644
--- a/virtualizarr/zarr.py
+++ b/virtualizarr/zarr.py
@@ -13,7 +13,14 @@
 import numpy as np
 import ujson  # type: ignore
 import xarray as xr
-from pydantic import BaseModel, ConfigDict, field_validator
+from pydantic import (
+    BaseModel,
+    ConfigDict,
+    Field,
+    field_validator,
+    model_validator,
+)
+from typing_extensions import Self
 
 from virtualizarr.vendor.zarr.utils import json_dumps
 
@@ -24,6 +31,7 @@
 ZAttrs = NewType(
     "ZAttrs", dict[str, Any]
 )  # just the .zattrs (for one array or for the whole store/group)
+FillValueT = bool | str | float | int | list | None
 
 
 class Codec(BaseModel):
@@ -46,7 +54,7 @@ class ZArray(BaseModel):
     chunks: tuple[int, ...]
     compressor: str | None = None
     dtype: np.dtype
-    fill_value: float | int | None = np.nan  # float or int?
+    fill_value: FillValueT = Field(default=0.0, validate_default=True)
     filters: list[dict] | None = None
     order: Literal["C", "F"]
     shape: tuple[int, ...]
@@ -66,6 +74,12 @@ def __post_init__(self) -> None:
                 f"Array shape {self.shape} has ndim={self.shape} but chunk shape {self.chunks} has ndim={len(self.chunks)}"
             )
 
+    @model_validator(mode="after")
+    def _check_fill_value(self) -> Self:
+        if self.fill_value is None:
+            self.fill_value = _default_fill_value(self.dtype)
+        return self
+
     @property
     def codec(self) -> Codec:
         """For comparison against other arrays."""
@@ -100,18 +114,14 @@ def from_kerchunk_refs(cls, decoded_arr_refs_zarray) -> "ZArray":
 
     def dict(self) -> dict[str, Any]:
         zarray_dict = dict(self)
-
         zarray_dict["dtype"] = encode_dtype(zarray_dict["dtype"])
-
-        if zarray_dict["fill_value"] is np.nan:
-            zarray_dict["fill_value"] = None
-        else:
-            zarray_dict["fill_value"] = self._default_fill_value()
-
         return zarray_dict
 
     def to_kerchunk_json(self) -> str:
-        return ujson.dumps(self.dict())
+        zarray_dict = self.dict()
+        if zarray_dict["fill_value"] is np.nan:
+            zarray_dict["fill_value"] = None
+        return ujson.dumps(zarray_dict)
 
     def replace(
         self,
@@ -138,25 +148,6 @@ def replace(
             zarr_format=zarr_format if zarr_format is not None else self.zarr_format,
         )
 
-    def _default_fill_value(self) -> Union[bool, int, float, str, list]:
-        """
-        The value and format of the fill_value depend on the data_type of the array.
-        See here for spec:
-        https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html#fill-value
-        """
-        # numpy dtypes's hierarchy lets us avoid checking for all the widths
-        # https://numpy.org/doc/stable/reference/arrays.scalars.html
-        if self.dtype is np.dtype("bool"):
-            return False
-        elif self.dtype is np.dtype("int"):
-            return 0
-        elif self.dtype is np.dtype("float"):
-            return 0.0
-        elif self.dtype is np.dtype("complex"):
-            return [0.0, 0.0]
-        else:
-            return 0.0
-
     def _v3_codec_pipeline(self) -> list:
         """
         VirtualiZarr internally uses the `filters`, `compressor`, and `order` attributes
@@ -196,9 +187,9 @@ def _v3_codec_pipeline(self) -> list:
         # "C" means row-major order, i.e., the last dimension varies fastest;
         # "F" means column-major order, i.e., the first dimension varies fastest.
         if self.order == "C":
-            order = tuple(enumerate(self.shape))
+            order = tuple(range(len(self.shape)))
         elif self.order == "F":
-            order = tuple(reversed(enumerate(self.shape)))
+            order = tuple(reversed(range(len(self.shape))))
 
         transpose = dict(name="transpose", configuration=dict(order=order))
         # https://github.com/zarr-developers/zarr-python/pull/1944#issuecomment-2151994097
@@ -358,7 +349,9 @@ def metadata_from_zarr_json(filepath: Path) -> tuple[ZArray, list[str], dict]:
     chunk_shape = metadata["chunk_grid"]["configuration"]["chunk_shape"]
 
     if metadata["fill_value"] is None:
-        fill_value = np.nan
+        raise ValueError(
+            "fill_value must be specified https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html#fill-value"
+        )
     else:
         fill_value = metadata["fill_value"]
     all_codecs = [
@@ -380,3 +373,23 @@ def metadata_from_zarr_json(filepath: Path) -> tuple[ZArray, list[str], dict]:
     )
 
     return zarray, dim_names, attrs
+
+
+def _default_fill_value(dtype: np.dtype) -> Union[bool, int, float, str, list]:
+    """
+    The value and format of the fill_value depend on the data_type of the array.
+    See here for spec:
+    https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html#fill-value
+    """
+    # numpy dtypes's hierarchy lets us avoid checking for all the widths
+    # https://numpy.org/doc/stable/reference/arrays.scalars.html
+    if dtype is np.dtype("bool"):
+        return False
+    elif dtype is np.dtype("int"):
+        return 0
+    elif dtype is np.dtype("float"):
+        return 0.0
+    elif dtype is np.dtype("complex"):
+        return [0.0, 0.0]
+    else:
+        return 0.0

From 145960a6e42c21dd111dc10fa03b4657c92c7480 Mon Sep 17 00:00:00 2001
From: Gustavo Hidalgo <guhidalgo@microsoft.com>
Date: Thu, 18 Jul 2024 10:31:14 -0400
Subject: [PATCH 62/68] Pythonic isinstance check

---
 virtualizarr/tests/test_zarr.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/virtualizarr/tests/test_zarr.py b/virtualizarr/tests/test_zarr.py
index 01ac7e5..5967f7d 100644
--- a/virtualizarr/tests/test_zarr.py
+++ b/virtualizarr/tests/test_zarr.py
@@ -72,9 +72,7 @@ def test_zarr_v3_metadata_conformance(tmpdir, vds_with_manifest_arrays: xr.Datas
     )
     assert isconfigurable(metadata["chunk_grid"])
     assert isconfigurable(metadata["chunk_key_encoding"])
-    assert any(
-        isinstance(metadata["fill_value"], t) for t in (bool, int, float, str, list)
-    )
+    assert isinstance(metadata["fill_value"], (bool, int, float, str, list))
     assert (
         isinstance(metadata["codecs"], list)
         and len(metadata["codecs"]) > 1

From c051f04523ae3d9a4244c1ece92ffc95a633498b Mon Sep 17 00:00:00 2001
From: Gustavo Hidalgo <zambrano.hidalgo@gmail.com>
Date: Thu, 18 Jul 2024 10:31:59 -0400
Subject: [PATCH 63/68] Add return type to isconfigurable

Co-authored-by: Tom Augspurger <tom.augspurger88@gmail.com>
---
 virtualizarr/tests/test_zarr.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/virtualizarr/tests/test_zarr.py b/virtualizarr/tests/test_zarr.py
index 5967f7d..29db840 100644
--- a/virtualizarr/tests/test_zarr.py
+++ b/virtualizarr/tests/test_zarr.py
@@ -30,7 +30,7 @@ def vds_with_manifest_arrays() -> xr.Dataset:
     return xr.Dataset({"a": (["x", "y"], arr)}, attrs={"something": 0})
 
 
-def isconfigurable(value: dict):
+def isconfigurable(value: dict) -> bool:
     """
     Several metadata attributes in ZarrV3 use a dictionary with keys "name" : str and "configuration" : dict
     """

From 7b093247075b6eb3204a8fe7069ef985b7b8747b Mon Sep 17 00:00:00 2001
From: Tria McNeely <triamcnely@microsoft.com>
Date: Fri, 19 Jul 2024 14:30:59 -0400
Subject: [PATCH 64/68] Changes from pair programming for zarrv3 to kerchunk
 file reading

---
 virtualizarr/kerchunk.py |  2 +-
 virtualizarr/zarr.py     | 50 ++++++++++++++++++++++++++--------------
 2 files changed, 34 insertions(+), 18 deletions(-)

diff --git a/virtualizarr/kerchunk.py b/virtualizarr/kerchunk.py
index 6e82067..122b86b 100644
--- a/virtualizarr/kerchunk.py
+++ b/virtualizarr/kerchunk.py
@@ -266,7 +266,7 @@ def variable_to_kerchunk_arr_refs(var: xr.Variable, var_name: str) -> KerchunkAr
             for chunk_key, entry in marr.manifest.dict().items()
         }
 
-        zarray = marr.zarray
+        zarray = marr.zarray.replace(zarr_format=2)
 
     else:
         try:
diff --git a/virtualizarr/zarr.py b/virtualizarr/zarr.py
index a00c7a2..7e5674e 100644
--- a/virtualizarr/zarr.py
+++ b/virtualizarr/zarr.py
@@ -35,7 +35,7 @@
 
 
 class Codec(BaseModel):
-    compressor: str | None = None
+    compressor: dict | None = None
     filters: list[dict] | None = None
 
     def __repr__(self) -> str:
@@ -52,7 +52,7 @@ class ZArray(BaseModel):
     )
 
     chunks: tuple[int, ...]
-    compressor: str | None = None
+    compressor: dict | None = None
     dtype: np.dtype
     fill_value: FillValueT = Field(default=0.0, validate_default=True)
     filters: list[dict] | None = None
@@ -98,8 +98,8 @@ def from_kerchunk_refs(cls, decoded_arr_refs_zarray) -> "ZArray":
         compressor = decoded_arr_refs_zarray["compressor"]
         # deal with an inconsistency in kerchunk's tiff_to_zarr function
         # TODO should this be moved to the point where we actually call tiff_to_zarr? Or ideally made consistent upstream.
-        if compressor is not None and "id" in compressor:
-            compressor = compressor["id"]
+        # if compressor is not None and "id" in compressor:
+        #     compressor = compressor["id"]
 
         return ZArray(
             chunks=tuple(decoded_arr_refs_zarray["chunks"]),
@@ -126,7 +126,7 @@ def to_kerchunk_json(self) -> str:
     def replace(
         self,
         chunks: Optional[tuple[int, ...]] = None,
-        compressor: Optional[str] = None,
+        compressor: Optional[dict] = None,
         dtype: Optional[np.dtype] = None,
         fill_value: Optional[float] = None,  # float or int?
         filters: Optional[list[dict]] = None,  # type: ignore[valid-type]
@@ -175,12 +175,10 @@ def _v3_codec_pipeline(self) -> list:
         # Noting here that zarr v3 has very few codecs specificed in the official spec,
         # and that there are far more codecs in `numcodecs`. We take a gamble and assume
         # that the codec names and configuration are simply mapped into zarrv3 "configurables".
-        compressor_codec = numcodecs.get_codec(
-            # default to gzip because it is officially specified in the zarr v3 spec
-            dict(id=self.compressor or "gzip")
-        ).get_config()
-        compressor_id = compressor_codec.pop("id")
-        compressor = dict(name=compressor_id, configuration=compressor_codec)
+        if self.compressor:
+            compressor = [_num_codec_config_to_configurable(self.compressor)]
+        else:
+            compressor = []
 
         # https://zarr-specs.readthedocs.io/en/latest/v3/codecs/transpose/v1.0.html#transpose-codec-v1
         # Either "C" or "F", defining the layout of bytes within each chunk of the array.
@@ -200,7 +198,7 @@ def _v3_codec_pipeline(self) -> list:
 
         # The order here is significant!
         # [ArrayArray] -> ArrayBytes -> [BytesBytes]
-        codec_pipeline = [transpose, bytes] + [compressor] + filters
+        codec_pipeline = [transpose, bytes] + compressor + filters
         return codec_pipeline
 
 
@@ -347,6 +345,8 @@ def metadata_from_zarr_json(filepath: Path) -> tuple[ZArray, list[str], dict]:
     dim_names = metadata.pop("dimension_names")
 
     chunk_shape = metadata["chunk_grid"]["configuration"]["chunk_shape"]
+    shape = metadata["shape"]
+    zarr_format = metadata["zarr_format"]
 
     if metadata["fill_value"] is None:
         raise ValueError(
@@ -359,21 +359,37 @@ def metadata_from_zarr_json(filepath: Path) -> tuple[ZArray, list[str], dict]:
         for codec in metadata["codecs"]
         if codec["name"] not in ("transpose", "bytes")
     ]
-    compressor = all_codecs[0]
-    filters = [dict(id=f.pop("name"), **f) for f in all_codecs[1:]] or None
+    # TODO: hdf.py treats all codecs as filter, but maybe one needs to be the compressor?
+    compressor = None #all_codecs[0] if all_codecs else None
+    filters = [_configurable_to_num_codec_config(_filter) for _filter in all_codecs] or None
     zarray = ZArray(
         chunks=metadata["chunk_grid"]["configuration"]["chunk_shape"],
-        compressor=compressor["name"],
+        compressor=_configurable_to_num_codec_config(compressor) if compressor else None,
         dtype=np.dtype(metadata["data_type"]),
         fill_value=fill_value,
         filters=filters,
         order="C",
-        shape=chunk_shape,
-        zarr_format=3,
+        shape=shape,
+        zarr_format=zarr_format,
     )
 
     return zarray, dim_names, attrs
 
+def _configurable_to_num_codec_config(configurable: dict) -> dict:
+    """
+    Convert a zarr v3 configurable into a numcodecs codec.
+    """
+    configurable_copy = configurable.copy()
+    codec_id = configurable_copy.pop("name")
+    configuration = configurable_copy.pop("configuration")
+    return numcodecs.get_codec({"id": codec_id, **configuration}).get_config()
+
+def _num_codec_config_to_configurable(num_codec: dict) -> dict:
+    """
+    Convert a numcodecs codec into a zarr v3 configurable.
+    """
+    num_codec_copy = num_codec.copy()
+    return {"name": num_codec_copy.pop("id"), "configuration": num_codec_copy}
 
 def _default_fill_value(dtype: np.dtype) -> Union[bool, int, float, str, list]:
     """

From 2c59256424752ba7acab8f32038b19c7cb535b2f Mon Sep 17 00:00:00 2001
From: Tria McNeely <triamcnely@microsoft.com>
Date: Fri, 19 Jul 2024 14:43:42 -0400
Subject: [PATCH 65/68] Revert "Merge remote-tracking branch
 'upstream/hdf5_reader' into codecs"

This reverts commit 7a65fbdc8eda1dfedaa59e90bd2d8fe652819085, reversing
changes made to c051f04523ae3d9a4244c1ece92ffc95a633498b.
---
 ci/environment.yml                            |   4 -
 pyproject.toml                                |   2 -
 virtualizarr/readers/hdf.py                   | 243 ---------------
 virtualizarr/readers/hdf_filters.py           | 136 --------
 virtualizarr/tests/test_integration.py        |  23 +-
 virtualizarr/tests/test_readers/__init__.py   |   0
 virtualizarr/tests/test_readers/conftest.py   | 292 ------------------
 virtualizarr/tests/test_readers/test_hdf.py   | 122 --------
 .../tests/test_readers/test_hdf_filters.py    | 115 -------
 .../test_readers/test_hdf_integration.py      |  40 ---
 virtualizarr/tests/test_xarray.py             |  18 +-
 virtualizarr/xarray.py                        |  56 ++--
 12 files changed, 28 insertions(+), 1023 deletions(-)
 delete mode 100644 virtualizarr/readers/hdf.py
 delete mode 100644 virtualizarr/readers/hdf_filters.py
 delete mode 100644 virtualizarr/tests/test_readers/__init__.py
 delete mode 100644 virtualizarr/tests/test_readers/conftest.py
 delete mode 100644 virtualizarr/tests/test_readers/test_hdf.py
 delete mode 100644 virtualizarr/tests/test_readers/test_hdf_filters.py
 delete mode 100644 virtualizarr/tests/test_readers/test_hdf_integration.py

diff --git a/ci/environment.yml b/ci/environment.yml
index 5ba1f8d..a41a99d 100644
--- a/ci/environment.yml
+++ b/ci/environment.yml
@@ -14,7 +14,6 @@ dependencies:
   - ujson
   - packaging
   - universal_pathlib
-  - hdf5plugin
   # Testing
   - codecov
   - pre-commit
@@ -27,10 +26,7 @@ dependencies:
   - fsspec
   - s3fs
   - fastparquet
-  - imagecodecs>=2024.6.1
   # for opening tiff files
   - tifffile
   # for opening FITS files
   - astropy
-  - pip:
-    - imagecodecs-numcodecs
diff --git a/pyproject.toml b/pyproject.toml
index 7baa87b..9fe0468 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -29,7 +29,6 @@ dependencies = [
     "ujson",
     "packaging",
     "universal-pathlib",
-    "hdf5plugin",
 ]
 
 [project.optional-dependencies]
@@ -46,7 +45,6 @@ test = [
     "fsspec",
     "s3fs",
     "fastparquet",
-    "imagecodecs-numcodecs",
 ]
 
 
diff --git a/virtualizarr/readers/hdf.py b/virtualizarr/readers/hdf.py
deleted file mode 100644
index 6197067..0000000
--- a/virtualizarr/readers/hdf.py
+++ /dev/null
@@ -1,243 +0,0 @@
-import math
-from typing import List, Mapping, Optional, Union
-
-import h5py
-import numpy as np
-import xarray as xr
-
-from virtualizarr.manifests import ChunkEntry, ChunkManifest, ManifestArray
-from virtualizarr.readers.hdf_filters import cfcodec_from_dataset, codecs_from_dataset
-from virtualizarr.types import ChunkKey
-from virtualizarr.utils import _fsspec_openfile_from_filepath
-from virtualizarr.zarr import ZArray
-
-
-def _dataset_chunk_manifest(
-    path: str, dataset: h5py.Dataset
-) -> Optional[ChunkManifest]:
-    """
-    Generate ChunkManifest for HDF5 dataset.
-
-    Parameters
-    ----------
-    path: str
-        The path the HDF5 container file
-     dset : h5py.Dataset
-        HDF5 dataset for which to create a ChunkManifest
-
-    Returns
-    -------
-    ChunkManifest
-        A Virtualizarr ChunkManifest
-    """
-    dsid = dataset.id
-
-    if dataset.chunks is None:
-        if dsid.get_offset() is None:
-            return None
-        else:
-            key_list = [0] * (len(dataset.shape) or 1)
-            key = ".".join(map(str, key_list))
-            chunk_entry = ChunkEntry(
-                path=path, offset=dsid.get_offset(), length=dsid.get_storage_size()
-            )
-            chunk_key = ChunkKey(key)
-            chunk_entries = {chunk_key: chunk_entry.dict()}
-            chunk_manifest = ChunkManifest(entries=chunk_entries)
-            return chunk_manifest
-    else:
-        num_chunks = dsid.get_num_chunks()
-        if num_chunks == 0:
-            raise ValueError("The dataset is chunked but contains no chunks")
-
-        shape = tuple(math.ceil(a / b) for a, b in zip(dataset.shape, dataset.chunks))
-        paths = np.empty(shape, dtype=np.dtypes.StringDType)  # type: ignore
-        offsets = np.empty(shape, dtype=np.int32)
-        lengths = np.empty(shape, dtype=np.int32)
-
-        def get_key(blob):
-            return tuple([a // b for a, b in zip(blob.chunk_offset, dataset.chunks)])
-
-        def add_chunk_info(blob):
-            key = get_key(blob)
-            paths[key] = path
-            offsets[key] = blob.byte_offset
-            lengths[key] = blob.size
-
-        has_chunk_iter = callable(getattr(dsid, "chunk_iter", None))
-        if has_chunk_iter:
-            dsid.chunk_iter(add_chunk_info)
-        else:
-            for index in range(num_chunks):
-                add_chunk_info(dsid.get_chunk_info(index))
-
-        chunk_manifest = ChunkManifest.from_arrays(
-            paths=paths, offsets=offsets, lengths=lengths
-        )
-        return chunk_manifest
-
-
-def _dataset_dims(dataset: h5py.Dataset) -> Union[List[str], List[None]]:
-    """
-    Get a list of dimension scale names attached to input HDF5 dataset.
-
-    This is required by the xarray package to work with Zarr arrays. Only
-    one dimension scale per dataset dimension is allowed. If dataset is
-    dimension scale, it will be considered as the dimension to itself.
-
-    Parameters
-    ----------
-    dataset : h5py.Dataset
-        HDF5 dataset.
-
-    Returns
-    -------
-    list
-        List with HDF5 path names of dimension scales attached to input
-        dataset.
-    """
-    dims = list()
-    rank = len(dataset.shape)
-    if rank:
-        for n in range(rank):
-            num_scales = len(dataset.dims[n])
-            if num_scales == 1:
-                dims.append(dataset.dims[n][0].name[1:])
-            elif h5py.h5ds.is_scale(dataset.id):
-                dims.append(dataset.name[1:])
-            elif num_scales > 1:
-                raise ValueError(
-                    f"{dataset.name}: {len(dataset.dims[n])} "
-                    f"dimension scales attached to dimension #{n}"
-                )
-            elif num_scales == 0:
-                # Some HDF5 files do not have dimension scales.
-                # If this is the case, `num_scales` will be 0.
-                # In this case, we mimic netCDF4 and assign phony dimension names.
-                # See https://github.com/fsspec/kerchunk/issues/41
-                dims.append(f"phony_dim_{n}")
-    return dims
-
-
-def _extract_attrs(h5obj: Union[h5py.Dataset, h5py.Group]):
-    """
-    Extract attributes from an HDF5 group or dataset.
-
-    Parameters
-    ----------
-    h5obj : h5py.Group or h5py.Dataset
-        An HDF5 group or dataset.
-    """
-    _HIDDEN_ATTRS = {
-        "REFERENCE_LIST",
-        "CLASS",
-        "DIMENSION_LIST",
-        "NAME",
-        "_Netcdf4Dimid",
-        "_Netcdf4Coordinates",
-        "_nc3_strict",
-        "_NCProperties",
-    }
-    attrs = {}
-    for n, v in h5obj.attrs.items():
-        if n in _HIDDEN_ATTRS:
-            continue
-        # Fix some attribute values to avoid JSON encoding exceptions...
-        if isinstance(v, bytes):
-            v = v.decode("utf-8") or " "
-        elif isinstance(v, (np.ndarray, np.number, np.bool_)):
-            if v.dtype.kind == "S":
-                v = v.astype(str)
-            if n == "_FillValue":
-                continue
-            elif v.size == 1:
-                v = v.flatten()[0]
-                if isinstance(v, (np.ndarray, np.number, np.bool_)):
-                    v = v.tolist()
-            else:
-                v = v.tolist()
-        elif isinstance(v, h5py._hl.base.Empty):
-            v = ""
-        if v == "DIMENSION_SCALE":
-            continue
-
-        attrs[n] = v
-    return attrs
-
-
-def _dataset_to_variable(path: str, dataset: h5py.Dataset) -> Optional[xr.Variable]:
-    # This chunk determination logic mirrors zarr-python's create
-    # https://github.com/zarr-developers/zarr-python/blob/main/zarr/creation.py#L62-L66
-
-    manifest = _dataset_chunk_manifest(path, dataset)
-    if manifest:
-        chunks = dataset.chunks if dataset.chunks else dataset.shape
-        codecs = codecs_from_dataset(dataset)
-        cfcodec = cfcodec_from_dataset(dataset)
-        attrs = _extract_attrs(dataset)
-        if cfcodec:
-            codecs.insert(0, cfcodec["codec"])
-            dtype = cfcodec["target_dtype"]
-            attrs.pop("scale_factor", None)
-            attrs.pop("add_offset", None)
-            fill_value = cfcodec["codec"].decode(dataset.fillvalue)
-        else:
-            dtype = dataset.dtype
-            fill_value = dataset.fillvalue
-        filters = [codec.get_config() for codec in codecs]
-        zarray = ZArray(
-            chunks=chunks,
-            compressor=None,
-            dtype=dtype,
-            fill_value=fill_value,
-            filters=filters,
-            order="C",
-            shape=dataset.shape,
-            zarr_format=2,
-        )
-        marray = ManifestArray(zarray=zarray, chunkmanifest=manifest)
-        dims = _dataset_dims(dataset)
-        variable = xr.Variable(data=marray, dims=dims, attrs=attrs)
-    else:
-        variable = None
-    return variable
-
-
-def virtual_vars_from_hdf(
-    path: str,
-    drop_variables: Optional[List[str]] = None,
-    reader_options: Optional[dict] = {
-        "storage_options": {"key": "", "secret": "", "anon": True}
-    },
-) -> Mapping[str, xr.Variable]:
-    if drop_variables is None:
-        drop_variables = []
-    open_file = _fsspec_openfile_from_filepath(
-        filepath=path, reader_options=reader_options
-    )
-    f = h5py.File(open_file, mode="r")
-    variables = {}
-    for key in f.keys():
-        if key not in drop_variables:
-            if isinstance(f[key], h5py.Dataset):
-                variable = _dataset_to_variable(path, f[key])
-                if variable is not None:
-                    variables[key] = variable
-            else:
-                raise NotImplementedError("Nested groups are not yet supported")
-
-    return variables
-
-
-def attrs_from_root_group(
-    path: str,
-    reader_options: Optional[dict] = {
-        "storage_options": {"key": "", "secret": "", "anon": True}
-    },
-):
-    open_file = _fsspec_openfile_from_filepath(
-        filepath=path, reader_options=reader_options
-    )
-    f = h5py.File(open_file, mode="r")
-    attrs = _extract_attrs(f)
-    return attrs
diff --git a/virtualizarr/readers/hdf_filters.py b/virtualizarr/readers/hdf_filters.py
deleted file mode 100644
index ae232fe..0000000
--- a/virtualizarr/readers/hdf_filters.py
+++ /dev/null
@@ -1,136 +0,0 @@
-from typing import List, Tuple, TypedDict, Union
-
-import h5py
-import hdf5plugin
-import numcodecs.registry as registry
-import numpy as np
-from numcodecs.abc import Codec
-from numcodecs.fixedscaleoffset import FixedScaleOffset
-from pydantic import BaseModel, field_validator
-from xarray.coding.variables import _choose_float_dtype
-
-_non_standard_filters = {
-    "gzip": "zlib",
-    "lzf": "imagecodecs_lzf",
-}
-
-_hdf5plugin_imagecodecs = {"lz4": "imagecodecs_lz4h5", "bzip2": "imagecodecs_bz2"}
-
-
-class BloscProperties(BaseModel):
-    blocksize: int
-    clevel: int
-    shuffle: int
-    cname: str
-
-    @field_validator("cname", mode="before")
-    def get_cname_from_code(cls, v):
-        blosc_compressor_codes = {
-            value: key
-            for key, value in hdf5plugin._filters.Blosc._Blosc__COMPRESSIONS.items()
-        }
-        return blosc_compressor_codes[v]
-
-
-class ZstdProperties(BaseModel):
-    level: int
-
-
-class ShuffleProperties(BaseModel):
-    elementsize: int
-
-
-class ZlibProperties(BaseModel):
-    level: int
-
-
-class CFCodec(TypedDict):
-    target_dtype: np.dtype
-    codec: Codec
-
-
-def _filter_to_codec(
-    filter_id: str, filter_properties: Union[int, None, Tuple] = None
-) -> Codec:
-    id_int = None
-    id_str = None
-    try:
-        id_int = int(filter_id)
-    except ValueError:
-        id_str = filter_id
-    conf = {}
-    if id_str:
-        if id_str in _non_standard_filters.keys():
-            id = _non_standard_filters[id_str]
-        else:
-            id = id_str
-        if id == "zlib":
-            zlib_props = ZlibProperties(level=filter_properties)
-            conf = zlib_props.model_dump()  # type: ignore[assignment]
-        if id == "shuffle" and isinstance(filter_properties, tuple):
-            shuffle_props = ShuffleProperties(elementsize=filter_properties[0])
-            conf = shuffle_props.model_dump()  # type: ignore[assignment]
-        conf["id"] = id  # type: ignore[assignment]
-    if id_int:
-        filter = hdf5plugin.get_filters(id_int)[0]
-        id = filter.filter_name
-        if id in _hdf5plugin_imagecodecs.keys():
-            id = _hdf5plugin_imagecodecs[id]
-        if id == "blosc" and isinstance(filter_properties, tuple):
-            blosc_props = BloscProperties(
-                **{
-                    k: v
-                    for k, v in zip(
-                        BloscProperties.model_fields.keys(), filter_properties[-4:]
-                    )
-                }
-            )
-            conf = blosc_props.model_dump()  # type: ignore[assignment]
-        if id == "zstd" and isinstance(filter_properties, tuple):
-            zstd_props = ZstdProperties(level=filter_properties[0])
-            conf = zstd_props.model_dump()  # type: ignore[assignment]
-        conf["id"] = id
-    codec = registry.get_codec(conf)
-    return codec
-
-
-def cfcodec_from_dataset(dataset: h5py.Dataset) -> Codec | None:
-    attributes = {attr: dataset.attrs[attr] for attr in dataset.attrs}
-    mapping = {}
-    if "scale_factor" in attributes:
-        try:
-            scale_factor = attributes["scale_factor"][0]
-        except IndexError:
-            scale_factor = attributes["scale_factor"]
-        mapping["scale_factor"] = float(1 / scale_factor)
-    else:
-        mapping["scale_factor"] = 1
-    if "add_offset" in attributes:
-        try:
-            offset = attributes["add_offset"][0]
-        except IndexError:
-            offset = attributes["add_offset"]
-        mapping["add_offset"] = float(offset)
-    else:
-        mapping["add_offset"] = 0
-    if mapping["scale_factor"] != 1 or mapping["add_offset"] != 0:
-        float_dtype = _choose_float_dtype(dtype=dataset.dtype, mapping=mapping)
-        target_dtype = np.dtype(float_dtype)
-        codec = FixedScaleOffset(
-            offset=mapping["add_offset"],
-            scale=mapping["scale_factor"],
-            dtype=target_dtype,
-            astype=dataset.dtype,
-        )
-        cfcodec = CFCodec(target_dtype=target_dtype, codec=codec)
-        return cfcodec
-    else:
-        return None
-
-
-def codecs_from_dataset(dataset: h5py.Dataset) -> List[Codec]:
-    codecs = []
-    for filter_id, filter_properties in dataset._filters.items():
-        codec = _filter_to_codec(filter_id, filter_properties)
-        codecs.append(codec)
-    return codecs
diff --git a/virtualizarr/tests/test_integration.py b/virtualizarr/tests/test_integration.py
index 65b9c71..239316a 100644
--- a/virtualizarr/tests/test_integration.py
+++ b/virtualizarr/tests/test_integration.py
@@ -69,12 +69,8 @@ def test_kerchunk_roundtrip_no_concat(self, tmpdir, format):
                 f"{tmpdir}/refs.{format}", engine="kerchunk", decode_times=False
             )
 
-        # assert all_close to original dataset
-        xrt.assert_allclose(roundtrip, ds)
-
-        # assert coordinate attributes are maintained
-        for coord in ds.coords:
-            assert ds.coords[coord].attrs == roundtrip.coords[coord].attrs
+        # assert identical to original dataset
+        xrt.assert_identical(roundtrip, ds)
 
     @pytest.mark.parametrize("decode_times,time_vars", [(False, []), (True, ["time"])])
     def test_kerchunk_roundtrip_concat(self, tmpdir, format, decode_times, time_vars):
@@ -128,14 +124,9 @@ def test_kerchunk_roundtrip_concat(self, tmpdir, format, decode_times, time_vars
             roundtrip = xr.open_dataset(
                 f"{tmpdir}/refs.{format}", engine="kerchunk", decode_times=decode_times
             )
-
         if decode_times is False:
-            # assert all_close to original dataset
-            xrt.assert_allclose(roundtrip, ds)
-
-            # assert coordinate attributes are maintained
-            for coord in ds.coords:
-                assert ds.coords[coord].attrs == roundtrip.coords[coord].attrs
+            # assert identical to original dataset
+            xrt.assert_identical(roundtrip, ds)
         else:
             # they are very very close! But assert_allclose doesn't seem to work on datetimes
             assert (roundtrip.time - ds.time).sum() == 0
@@ -173,11 +164,7 @@ def test_non_dimension_coordinates(self, tmpdir, format):
             )
 
         # assert equal to original dataset
-        xrt.assert_allclose(roundtrip, ds)
-
-        # assert coordinate attributes are maintained
-        for coord in ds.coords:
-            assert ds.coords[coord].attrs == roundtrip.coords[coord].attrs
+        xrt.assert_identical(roundtrip, ds)
 
 
 def test_open_scalar_variable(tmpdir):
diff --git a/virtualizarr/tests/test_readers/__init__.py b/virtualizarr/tests/test_readers/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/virtualizarr/tests/test_readers/conftest.py b/virtualizarr/tests/test_readers/conftest.py
deleted file mode 100644
index ec4132b..0000000
--- a/virtualizarr/tests/test_readers/conftest.py
+++ /dev/null
@@ -1,292 +0,0 @@
-import h5py
-import hdf5plugin
-import numpy as np
-import pytest
-import xarray as xr
-from packaging.version import Version
-from xarray.tests.test_dataset import create_test_data
-from xarray.util.print_versions import netcdf_and_hdf5_versions
-
-
-@pytest.fixture
-def empty_chunks_hdf5_file(tmpdir):
-    ds = xr.Dataset({"data": []})
-    filepath = f"{tmpdir}/empty_chunks.nc"
-    ds.to_netcdf(filepath, engine="h5netcdf")
-    return filepath
-
-
-@pytest.fixture
-def empty_dataset_hdf5_file(tmpdir):
-    filepath = f"{tmpdir}/empty_dataset.nc"
-    f = h5py.File(filepath, "w")
-    f.create_dataset("data", shape=(0,), dtype="f")
-    return filepath
-
-
-@pytest.fixture
-def no_chunks_hdf5_file(tmpdir):
-    filepath = f"{tmpdir}/no_chunks.nc"
-    f = h5py.File(filepath, "w")
-    data = np.random.random((10, 10))
-    f.create_dataset(name="data", data=data, chunks=None)
-    return filepath
-
-
-@pytest.fixture
-def chunked_hdf5_file(tmpdir):
-    filepath = f"{tmpdir}/chunks.nc"
-    f = h5py.File(filepath, "w")
-    data = np.random.random((100, 100))
-    f.create_dataset(name="data", data=data, chunks=(50, 50))
-    return filepath
-
-
-@pytest.fixture
-def single_dimension_scale_hdf5_file(tmpdir):
-    filepath = f"{tmpdir}/single_dimension_scale.nc"
-    f = h5py.File(filepath, "w")
-    data = [1, 2]
-    x = [0, 1]
-    f.create_dataset(name="data", data=data)
-    f.create_dataset(name="x", data=x)
-    f["x"].make_scale()
-    f["data"].dims[0].attach_scale(f["x"])
-    return filepath
-
-
-@pytest.fixture
-def is_scale_hdf5_file(tmpdir):
-    filepath = f"{tmpdir}/is_scale.nc"
-    f = h5py.File(filepath, "w")
-    data = [1, 2]
-    f.create_dataset(name="data", data=data)
-    f["data"].make_scale()
-    return filepath
-
-
-@pytest.fixture
-def multiple_dimension_scales_hdf5_file(tmpdir):
-    filepath = f"{tmpdir}/multiple_dimension_scales.nc"
-    f = h5py.File(filepath, "w")
-    data = [1, 2]
-    f.create_dataset(name="data", data=data)
-    f.create_dataset(name="x", data=[0, 1])
-    f.create_dataset(name="y", data=[0, 1])
-    f["x"].make_scale()
-    f["y"].make_scale()
-    f["data"].dims[0].attach_scale(f["x"])
-    f["data"].dims[0].attach_scale(f["y"])
-    return filepath
-
-
-@pytest.fixture
-def chunked_dimensions_netcdf4_file(tmpdir):
-    filepath = f"{tmpdir}/chunks_dimension.nc"
-    f = h5py.File(filepath, "w")
-    data = np.random.random((100, 100))
-    x = np.random.random((100))
-    y = np.random.random((100))
-    f.create_dataset(name="data", data=data, chunks=(50, 50))
-    f.create_dataset(name="x", data=x)
-    f.create_dataset(name="y", data=y)
-    f["data"].dims[0].attach_scale(f["x"])
-    f["data"].dims[1].attach_scale(f["y"])
-    return filepath
-
-
-@pytest.fixture
-def string_attributes_hdf5_file(tmpdir):
-    filepath = f"{tmpdir}/attributes.nc"
-    f = h5py.File(filepath, "w")
-    data = np.random.random((10, 10))
-    f.create_dataset(name="data", data=data, chunks=None)
-    f["data"].attrs["attribute_name"] = "attribute_name"
-    f["data"].attrs["attribute_name2"] = "attribute_name2"
-    return filepath
-
-
-@pytest.fixture
-def root_attributes_hdf5_file(tmpdir):
-    filepath = f"{tmpdir}/root_attributes.nc"
-    f = h5py.File(filepath, "w")
-    f.attrs["attribute_name"] = "attribute_name"
-    return filepath
-
-
-@pytest.fixture
-def group_hdf5_file(tmpdir):
-    filepath = f"{tmpdir}/group.nc"
-    f = h5py.File(filepath, "w")
-    f.create_group("group")
-    return filepath
-
-
-@pytest.fixture
-def multiple_datasets_hdf5_file(tmpdir):
-    filepath = f"{tmpdir}/multiple_datasets.nc"
-    f = h5py.File(filepath, "w")
-    data = np.random.random((10, 10))
-    f.create_dataset(name="data", data=data, chunks=None)
-    f.create_dataset(name="data2", data=data, chunks=None)
-    return filepath
-
-
-@pytest.fixture
-def np_uncompressed():
-    return np.arange(100)
-
-
-@pytest.fixture(params=["gzip", "blosc_lz4", "lz4", "bzip2", "zstd", "shuffle"])
-def filter_encoded_hdf5_file(tmpdir, np_uncompressed, request):
-    filepath = f"{tmpdir}/{request.param}.nc"
-    f = h5py.File(filepath, "w")
-    if request.param == "gzip":
-        f.create_dataset(
-            name="data", data=np_uncompressed, compression="gzip", compression_opts=1
-        )
-    if request.param == "blosc_lz4":
-        f.create_dataset(
-            name="data",
-            data=np_uncompressed,
-            **hdf5plugin.Blosc(cname="lz4", clevel=9, shuffle=hdf5plugin.Blosc.SHUFFLE),
-        )
-    if request.param == "lz4":
-        f.create_dataset(name="data", data=np_uncompressed, **hdf5plugin.LZ4(nbytes=0))
-    if request.param == "bzip2":
-        f.create_dataset(name="data", data=np_uncompressed, **hdf5plugin.BZip2())
-    if request.param == "zstd":
-        f.create_dataset(name="data", data=np_uncompressed, **hdf5plugin.Zstd(clevel=2))
-    if request.param == "shuffle":
-        f.create_dataset(name="data", data=np_uncompressed, shuffle=True)
-
-    return filepath
-
-
-@pytest.fixture(params=["gzip"])
-def filter_encoded_roundtrip_hdf5_file(tmpdir, request):
-    ds = xr.tutorial.open_dataset("air_temperature")
-    encoding = {}
-    if request.param == "gzip":
-        encoding_config = {"zlib": True, "complevel": 1}
-
-    for var_name in ds.variables:
-        encoding[var_name] = encoding_config
-
-    filepath = f"{tmpdir}/{request.param}_xarray.nc"
-    ds.to_netcdf(filepath, engine="h5netcdf", encoding=encoding)
-    return filepath
-
-
-@pytest.fixture()
-def skip_test_for_libhdf5_version():
-    versions = netcdf_and_hdf5_versions()
-    libhdf5_version = Version(versions[0][1])
-    return libhdf5_version < Version("1.14")
-
-
-@pytest.fixture(params=["blosc_zlib"])
-def filter_encoded_roundtrip_netcdf4_file(
-    tmpdir, request, skip_test_for_libhdf5_version
-):
-    if skip_test_for_libhdf5_version:
-        pytest.skip("Requires libhdf5 >= 1.14")
-    ds = create_test_data(dim_sizes=(20, 80, 10))
-    if "blosc" in request.param:
-        encoding_config = {
-            "compression": request.param,
-            "chunksizes": (20, 40),
-            "original_shape": ds.var2.shape,
-            "blosc_shuffle": 1,
-            "fletcher32": False,
-        }
-    #  Check on how handle scalar dim.
-    ds = ds.drop_dims("dim3")
-    ds["var2"].encoding.update(encoding_config)
-    filepath = f"{tmpdir}/{request.param}_xarray.nc"
-    ds.to_netcdf(filepath, engine="netcdf4")
-    return {"filepath": filepath, "compressor": request.param}
-
-
-@pytest.fixture
-def np_uncompressed_int16():
-    return np.arange(100, dtype=np.int16)
-
-
-@pytest.fixture
-def offset():
-    return np.float32(5.0)
-
-
-@pytest.fixture
-def add_offset_hdf5_file(tmpdir, np_uncompressed_int16, offset):
-    filepath = f"{tmpdir}/offset.nc"
-    f = h5py.File(filepath, "w")
-    data = np_uncompressed_int16 - offset
-    f.create_dataset(name="data", data=data, chunks=True)
-    f["data"].attrs.create(name="add_offset", data=offset)
-    return filepath
-
-
-@pytest.fixture
-def scale_factor():
-    return 0.01
-
-
-@pytest.fixture
-def scale_add_offset_hdf5_file(tmpdir, np_uncompressed_int16, offset, scale_factor):
-    filepath = f"{tmpdir}/scale_offset.nc"
-    f = h5py.File(filepath, "w")
-    data = (np_uncompressed_int16 - offset) / scale_factor
-    f.create_dataset(name="data", data=data, chunks=True)
-    f["data"].attrs.create(name="add_offset", data=offset)
-    f["data"].attrs.create(name="scale_factor", data=np.array([scale_factor]))
-    return filepath
-
-
-@pytest.fixture()
-def chunked_roundtrip_hdf5_file(tmpdir):
-    ds = create_test_data(dim_sizes=(20, 80, 10))
-    ds = ds.drop_dims("dim3")
-    filepath = f"{tmpdir}/chunked_xarray.nc"
-    ds.to_netcdf(
-        filepath, engine="netcdf4", encoding={"var2": {"chunksizes": (10, 10)}}
-    )
-    return filepath
-
-
-@pytest.fixture(params=["gzip", "zlib"])
-def filter_and_cf_roundtrip_hdf5_file(tmpdir, request):
-    x = np.arange(100)
-    y = np.arange(100)
-    fill_value = np.int16(-9999)
-    temperature = 0.1 * x[:, None] + 0.1 * y[None, :]
-    temperature[0][0] = fill_value
-    ds = xr.Dataset(
-        {"temperature": (["x", "y"], temperature)},
-        coords={"x": np.arange(100), "y": np.arange(100)},
-    )
-    encoding = {
-        "temperature": {
-            "dtype": "int16",
-            "scale_factor": 0.1,
-            "add_offset": 273.15,
-            "_FillValue": fill_value,
-        },
-        "x": {"_FillValue": fill_value},
-        "y": {"_FillValue": fill_value},
-    }
-    if request.param == "gzip":
-        encoding["temperature"]["compression"] = "gzip"
-        encoding["temperature"]["compression_opts"] = 7
-
-    if request.param == "zlib":
-        encoding["temperature"]["zlib"] = True
-        encoding["temperature"]["complevel"] = 9
-
-    from random import randint
-
-    filepath = f"{tmpdir}/{request.param}_{randint(0,100)}_cf_roundtrip.nc"
-    ds.to_netcdf(filepath, engine="h5netcdf", encoding=encoding)
-
-    return filepath
diff --git a/virtualizarr/tests/test_readers/test_hdf.py b/virtualizarr/tests/test_readers/test_hdf.py
deleted file mode 100644
index 1fb0f6e..0000000
--- a/virtualizarr/tests/test_readers/test_hdf.py
+++ /dev/null
@@ -1,122 +0,0 @@
-import h5py
-import pytest
-
-from virtualizarr.readers.hdf import (
-    _dataset_chunk_manifest,
-    _dataset_dims,
-    _dataset_to_variable,
-    _extract_attrs,
-    virtual_vars_from_hdf,
-)
-
-
-class TestDatasetChunkManifest:
-    def test_empty_chunks(self, empty_chunks_hdf5_file):
-        f = h5py.File(empty_chunks_hdf5_file)
-        ds = f["data"]
-        with pytest.raises(ValueError, match="chunked but contains no chunks"):
-            _dataset_chunk_manifest(path=empty_chunks_hdf5_file, dataset=ds)
-
-    @pytest.mark.skip("Need to differentiate non coordinate dimensions from empty")
-    def test_empty_dataset(self, empty_dataset_hdf5_file):
-        f = h5py.File(empty_dataset_hdf5_file)
-        ds = f["data"]
-        with pytest.raises(ValueError, match="no space allocated in the file"):
-            _dataset_chunk_manifest(path=empty_dataset_hdf5_file, dataset=ds)
-
-    def test_no_chunking(self, no_chunks_hdf5_file):
-        f = h5py.File(no_chunks_hdf5_file)
-        ds = f["data"]
-        manifest = _dataset_chunk_manifest(path=no_chunks_hdf5_file, dataset=ds)
-        assert manifest.shape_chunk_grid == (1, 1)
-
-    def test_chunked(self, chunked_hdf5_file):
-        f = h5py.File(chunked_hdf5_file)
-        ds = f["data"]
-        manifest = _dataset_chunk_manifest(path=chunked_hdf5_file, dataset=ds)
-        assert manifest.shape_chunk_grid == (2, 2)
-
-    def test_chunked_roundtrip(self, chunked_roundtrip_hdf5_file):
-        f = h5py.File(chunked_roundtrip_hdf5_file)
-        ds = f["var2"]
-        manifest = _dataset_chunk_manifest(path=chunked_roundtrip_hdf5_file, dataset=ds)
-        assert manifest.shape_chunk_grid == (2, 8)
-
-
-class TestDatasetDims:
-    def test_single_dimension_scale(self, single_dimension_scale_hdf5_file):
-        f = h5py.File(single_dimension_scale_hdf5_file)
-        ds = f["data"]
-        dims = _dataset_dims(ds)
-        assert dims[0] == "x"
-
-    def test_is_dimension_scale(self, is_scale_hdf5_file):
-        f = h5py.File(is_scale_hdf5_file)
-        ds = f["data"]
-        dims = _dataset_dims(ds)
-        assert dims[0] == "data"
-
-    def test_multiple_dimension_scales(self, multiple_dimension_scales_hdf5_file):
-        f = h5py.File(multiple_dimension_scales_hdf5_file)
-        ds = f["data"]
-        with pytest.raises(ValueError, match="dimension scales attached"):
-            _dataset_dims(ds)
-
-    def test_no_dimension_scales(self, no_chunks_hdf5_file):
-        f = h5py.File(no_chunks_hdf5_file)
-        ds = f["data"]
-        dims = _dataset_dims(ds)
-        assert dims == ["phony_dim_0", "phony_dim_1"]
-
-
-class TestDatasetToVariable:
-    def test_chunked_dataset(self, chunked_dimensions_netcdf4_file):
-        f = h5py.File(chunked_dimensions_netcdf4_file)
-        ds = f["data"]
-        var = _dataset_to_variable(chunked_dimensions_netcdf4_file, ds)
-        assert var.chunks == (50, 50)
-
-    def test_not_chunked_dataset(self, single_dimension_scale_hdf5_file):
-        f = h5py.File(single_dimension_scale_hdf5_file)
-        ds = f["data"]
-        var = _dataset_to_variable(single_dimension_scale_hdf5_file, ds)
-        assert var.chunks == (2,)
-
-    def test_dataset_attributes(self, string_attributes_hdf5_file):
-        f = h5py.File(string_attributes_hdf5_file)
-        ds = f["data"]
-        var = _dataset_to_variable(string_attributes_hdf5_file, ds)
-        assert var.attrs["attribute_name"] == "attribute_name"
-
-
-class TestExtractAttributes:
-    def test_string_attribute(self, string_attributes_hdf5_file):
-        f = h5py.File(string_attributes_hdf5_file)
-        ds = f["data"]
-        attrs = _extract_attrs(ds)
-        assert attrs["attribute_name"] == "attribute_name"
-
-    def test_root_attribute(self, root_attributes_hdf5_file):
-        f = h5py.File(root_attributes_hdf5_file)
-        attrs = _extract_attrs(f)
-        assert attrs["attribute_name"] == "attribute_name"
-
-    def test_multiple_attributes(self, string_attributes_hdf5_file):
-        f = h5py.File(string_attributes_hdf5_file)
-        ds = f["data"]
-        attrs = _extract_attrs(ds)
-        assert len(attrs.keys()) == 2
-
-
-class TestVirtualVarsFromHDF:
-    def test_variable_with_dimensions(self, chunked_dimensions_netcdf4_file):
-        variables = virtual_vars_from_hdf(chunked_dimensions_netcdf4_file)
-        assert len(variables) == 3
-
-    def test_groups_not_implemented(self, group_hdf5_file):
-        with pytest.raises(NotImplementedError):
-            virtual_vars_from_hdf(group_hdf5_file)
-
-    def test_drop_variables(self, multiple_datasets_hdf5_file):
-        variables = virtual_vars_from_hdf(multiple_datasets_hdf5_file, ["data2"])
-        assert "data2" not in variables.keys()
diff --git a/virtualizarr/tests/test_readers/test_hdf_filters.py b/virtualizarr/tests/test_readers/test_hdf_filters.py
deleted file mode 100644
index efaad78..0000000
--- a/virtualizarr/tests/test_readers/test_hdf_filters.py
+++ /dev/null
@@ -1,115 +0,0 @@
-import h5py
-import imagecodecs
-import numcodecs
-import numpy as np
-
-from virtualizarr.readers.hdf_filters import (
-    _filter_to_codec,
-    cfcodec_from_dataset,
-    codecs_from_dataset,
-)
-
-
-class TestFilterToCodec:
-    def test_gzip_uses_zlib_numcodec(self):
-        codec = _filter_to_codec("gzip", 1)
-        assert isinstance(codec, numcodecs.zlib.Zlib)
-
-    def test_lzf(self):
-        codec = _filter_to_codec("lzf")
-        assert isinstance(codec, imagecodecs.numcodecs.Lzf)
-
-    def test_blosc(self):
-        codec = _filter_to_codec("32001", (2, 2, 8, 800, 9, 2, 1))
-        assert isinstance(codec, numcodecs.blosc.Blosc)
-        expected_config = {
-            "id": "blosc",
-            "blocksize": 800,
-            "clevel": 9,
-            "shuffle": 2,
-            "cname": "lz4",
-        }
-        assert codec.get_config() == expected_config
-
-    def test_zstd(self):
-        codec = _filter_to_codec("32015", (5,))
-        assert isinstance(codec, numcodecs.zstd.Zstd)
-        expected_config = {"id": "zstd", "level": 5}
-        assert codec.get_config() == expected_config
-
-    def test_shuffle(self):
-        codec = _filter_to_codec("shuffle", (7,))
-        assert isinstance(codec, numcodecs.shuffle.Shuffle)
-        expected_config = {"id": "shuffle", "elementsize": 7}
-        assert codec.get_config() == expected_config
-
-
-class TestCodecsFromDataSet:
-    def test_numcodec_decoding(self, np_uncompressed, filter_encoded_hdf5_file):
-        f = h5py.File(filter_encoded_hdf5_file)
-        ds = f["data"]
-        chunk_info = ds.id.get_chunk_info(0)
-        codecs = codecs_from_dataset(ds)
-        with open(filter_encoded_hdf5_file, "rb") as file:
-            file.seek(chunk_info.byte_offset)
-            bytes_read = file.read(chunk_info.size)
-            decoded = codecs[0].decode(bytes_read)
-            if isinstance(decoded, np.ndarray):
-                assert decoded.tobytes() == np_uncompressed.tobytes()
-            else:
-                assert decoded == np_uncompressed.tobytes()
-
-
-class TestCFCodecFromDataset:
-    def test_no_cf_convention(self, filter_encoded_hdf5_file):
-        f = h5py.File(filter_encoded_hdf5_file)
-        ds = f["data"]
-        cf_codec = cfcodec_from_dataset(ds)
-        assert cf_codec is None
-
-    def test_cf_scale_factor(self, netcdf4_file):
-        f = h5py.File(netcdf4_file)
-        ds = f["air"]
-        cf_codec = cfcodec_from_dataset(ds)
-        assert cf_codec["target_dtype"] == np.dtype(np.float64)
-        assert cf_codec["codec"].scale == 100.0
-        assert cf_codec["codec"].offset == 0
-        assert cf_codec["codec"].dtype == "<f8"
-        assert cf_codec["codec"].astype == "<i2"
-
-    def test_cf_add_offset(self, add_offset_hdf5_file):
-        f = h5py.File(add_offset_hdf5_file)
-        ds = f["data"]
-        cf_codec = cfcodec_from_dataset(ds)
-        assert cf_codec["target_dtype"] == np.dtype(np.float64)
-        assert cf_codec["codec"].scale == 1
-        assert cf_codec["codec"].offset == 5
-        assert cf_codec["codec"].dtype == "<f8"
-
-    def test_cf_codec_decoding_offset(
-        self, add_offset_hdf5_file, np_uncompressed_int16
-    ):
-        f = h5py.File(add_offset_hdf5_file)
-        ds = f["data"]
-        chunk_info = ds.id.get_chunk_info(0)
-        cfcodec = cfcodec_from_dataset(ds)
-        with open(add_offset_hdf5_file, "rb") as file:
-            file.seek(chunk_info.byte_offset)
-            bytes_read = file.read(chunk_info.size)
-            decoded = cfcodec["codec"].decode(bytes_read)
-            assert np.array_equal(decoded, np_uncompressed_int16)
-            assert decoded.dtype == np.float64
-
-    def test_cf_codec_decoding_scale_offset(
-        self, scale_add_offset_hdf5_file, np_uncompressed_int16
-    ):
-        f = h5py.File(scale_add_offset_hdf5_file)
-        ds = f["data"]
-        chunk_info = ds.id.get_chunk_info(0)
-        cfcodec = cfcodec_from_dataset(ds)
-        with open(scale_add_offset_hdf5_file, "rb") as file:
-            file.seek(chunk_info.byte_offset)
-            bytes_read = file.read(chunk_info.size)
-            decoded = cfcodec["codec"].decode(bytes_read)
-            assert np.allclose(decoded, np_uncompressed_int16)
-            assert decoded.dtype == np.float64
diff --git a/virtualizarr/tests/test_readers/test_hdf_integration.py b/virtualizarr/tests/test_readers/test_hdf_integration.py
deleted file mode 100644
index 5cf3f79..0000000
--- a/virtualizarr/tests/test_readers/test_hdf_integration.py
+++ /dev/null
@@ -1,40 +0,0 @@
-import xarray as xr
-import xarray.testing as xrt
-
-import virtualizarr
-from virtualizarr.kerchunk import FileType
-
-
-class TestIntegration:
-    def test_filters_h5netcdf_roundtrip(
-        self, tmpdir, filter_encoded_roundtrip_hdf5_file
-    ):
-        ds = xr.open_dataset(filter_encoded_roundtrip_hdf5_file, decode_times=True)
-        vds = virtualizarr.open_virtual_dataset(
-            filter_encoded_roundtrip_hdf5_file,
-            loadable_variables=["time"],
-            cftime_variables=["time"],
-        )
-        kerchunk_file = f"{tmpdir}/kerchunk.json"
-        vds.virtualize.to_kerchunk(kerchunk_file, format="json")
-        roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk", decode_times=True)
-        xrt.assert_allclose(ds, roundtrip)
-
-    def test_filters_netcdf4_roundtrip(
-        self, tmpdir, filter_encoded_roundtrip_netcdf4_file
-    ):
-        filepath = filter_encoded_roundtrip_netcdf4_file["filepath"]
-        ds = xr.open_dataset(filepath)
-        vds = virtualizarr.open_virtual_dataset(filepath, filetype=FileType("netcdf4"))
-        kerchunk_file = f"{tmpdir}/kerchunk.json"
-        vds.virtualize.to_kerchunk(kerchunk_file, format="json")
-        roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk")
-        xrt.assert_equal(ds, roundtrip)
-
-    def test_filter_and_cf_roundtrip(self, tmpdir, filter_and_cf_roundtrip_hdf5_file):
-        ds = xr.open_dataset(filter_and_cf_roundtrip_hdf5_file)
-        vds = virtualizarr.open_virtual_dataset(filter_and_cf_roundtrip_hdf5_file)
-        kerchunk_file = f"{tmpdir}/filter_cf_kerchunk.json"
-        vds.virtualize.to_kerchunk(kerchunk_file, format="json")
-        roundtrip = xr.open_dataset(kerchunk_file, engine="kerchunk")
-        xrt.assert_allclose(ds, roundtrip)
diff --git a/virtualizarr/tests/test_xarray.py b/virtualizarr/tests/test_xarray.py
index 4490d54..d0fe2e3 100644
--- a/virtualizarr/tests/test_xarray.py
+++ b/virtualizarr/tests/test_xarray.py
@@ -8,7 +8,6 @@
 from xarray.core.indexes import Index
 
 from virtualizarr import open_virtual_dataset
-from virtualizarr.kerchunk import FileType
 from virtualizarr.manifests import ChunkManifest, ManifestArray
 from virtualizarr.tests import has_astropy, has_tifffile, network, requires_s3fs
 from virtualizarr.zarr import ZArray
@@ -407,25 +406,18 @@ def test_explicit_filetype(self, netcdf4_file):
         with pytest.raises(NotImplementedError):
             open_virtual_dataset(netcdf4_file, filetype="grib")
 
-    @patch("virtualizarr.xarray._automatically_determine_filetype")
-    @patch("virtualizarr.xarray.virtual_vars_from_hdf")
+    @patch("virtualizarr.kerchunk.read_kerchunk_references_from_file")
     def test_open_virtual_dataset_passes_expected_args(
-        self, mock_reader, mock_determine_filetype, netcdf4_file
+        self, mock_read_kerchunk, netcdf4_file
     ):
         reader_options = {"option1": "value1", "option2": "value2"}
-        mock_determine_filetype.return_value = FileType.netcdf4
         open_virtual_dataset(netcdf4_file, indexes={}, reader_options=reader_options)
-        reader_args = {
-            "path": netcdf4_file,
-            "drop_variables": [],
-            "reader_options": reader_options,
-        }
-        mock_reader.assert_called_once_with(**reader_args)
-        filetype_args = {
+        args = {
             "filepath": netcdf4_file,
+            "filetype": None,
             "reader_options": reader_options,
         }
-        mock_determine_filetype.assert_called_once_with(**filetype_args)
+        mock_read_kerchunk.assert_called_once_with(**args)
 
 
 class TestRenamePaths:
diff --git a/virtualizarr/xarray.py b/virtualizarr/xarray.py
index d11a69f..df84717 100644
--- a/virtualizarr/xarray.py
+++ b/virtualizarr/xarray.py
@@ -17,13 +17,8 @@
 from xarray.core.variable import IndexVariable
 
 import virtualizarr.kerchunk as kerchunk
-from virtualizarr.kerchunk import (
-    FileType,
-    KerchunkStoreRefs,
-    _automatically_determine_filetype,
-)
+from virtualizarr.kerchunk import FileType, KerchunkStoreRefs
 from virtualizarr.manifests import ChunkManifest, ManifestArray
-from virtualizarr.readers.hdf import attrs_from_root_group, virtual_vars_from_hdf
 from virtualizarr.utils import _fsspec_openfile_from_filepath
 from virtualizarr.zarr import (
     attrs_from_zarr_group_json,
@@ -122,6 +117,7 @@ def open_virtual_dataset(
 
     if virtual_array_class is not ManifestArray:
         raise NotImplementedError()
+
     if filetype == "zarr_v3":
         # TODO is there a neat way of auto-detecting this?
         return open_virtual_dataset_from_v3_store(
@@ -132,38 +128,21 @@ def open_virtual_dataset(
             reader_options = {
                 "storage_options": {"key": "", "secret": "", "anon": True}
             }
-        if filetype is None:
-            filetype = _automatically_determine_filetype(
-                filepath=filepath, reader_options=reader_options
-            )
-        filetype = FileType(filetype)
-        if filetype.name.lower() == "netcdf4" or filetype.name.lower() == "hdf5":
-            virtual_vars = virtual_vars_from_hdf(
-                path=filepath,
-                drop_variables=drop_variables,
-                reader_options=reader_options,
-            )
-            ds_attrs = attrs_from_root_group(
-                path=filepath, reader_options=reader_options
-            )
-            coord_names = ds_attrs.pop("coordinates", [])
-        else:
-            # this is the only place we actually always need to use kerchunk directly
-            # TODO avoid even reading byte ranges for variables that will be dropped later anyway?
-            vds_refs = kerchunk.read_kerchunk_references_from_file(
-                filepath=filepath,
-                filetype=filetype,
-                reader_options=reader_options,
-            )
-            virtual_vars = virtual_vars_from_kerchunk_refs(
-                vds_refs,
-                drop_variables=drop_variables + loadable_variables,
-                virtual_array_class=virtual_array_class,
-            )
-            ds_attrs = kerchunk.fully_decode_arr_refs(vds_refs["refs"]).get(
-                ".zattrs", {}
-            )
-            coord_names = ds_attrs.pop("coordinates", [])
+
+        # this is the only place we actually always need to use kerchunk directly
+        # TODO avoid even reading byte ranges for variables that will be dropped later anyway?
+        vds_refs = kerchunk.read_kerchunk_references_from_file(
+            filepath=filepath,
+            filetype=filetype,
+            reader_options=reader_options,
+        )
+        virtual_vars = virtual_vars_from_kerchunk_refs(
+            vds_refs,
+            drop_variables=drop_variables + loadable_variables,
+            virtual_array_class=virtual_array_class,
+        )
+        ds_attrs = kerchunk.fully_decode_arr_refs(vds_refs["refs"]).get(".zattrs", {})
+        coord_names = ds_attrs.pop("coordinates", [])
 
         if indexes is None or len(loadable_variables) > 0:
             # TODO we are reading a bunch of stuff we know we won't need here, e.g. all of the data variables...
@@ -211,6 +190,7 @@ def open_virtual_dataset(
         vars = {**virtual_vars, **loadable_vars}
 
         data_vars, coords = separate_coords(vars, indexes, coord_names)
+
         vds = xr.Dataset(
             data_vars,
             coords=coords,

From 50c3dcd43d7569eaf57ebad0b85701293eec9101 Mon Sep 17 00:00:00 2001
From: Gustavo Hidalgo <guhidalgo@microsoft.com>
Date: Fri, 19 Jul 2024 16:26:45 -0400
Subject: [PATCH 66/68] Fix unit tests

---
 docs/releases.rst                               |  2 +-
 virtualizarr/tests/__init__.py                  |  4 ++--
 virtualizarr/tests/test_manifests/test_array.py | 12 ++++++------
 virtualizarr/tests/test_xarray.py               | 10 +++++-----
 virtualizarr/tests/test_zarr.py                 |  2 +-
 virtualizarr/zarr.py                            | 16 ++++++++++------
 6 files changed, 25 insertions(+), 21 deletions(-)

diff --git a/docs/releases.rst b/docs/releases.rst
index 1451191..3eeed7e 100644
--- a/docs/releases.rst
+++ b/docs/releases.rst
@@ -12,7 +12,7 @@ New Features
 Breaking changes
 ~~~~~~~~~~~~~~~~
 
-- Serialize valid ZarrV3 metadata (for :pull:`193`).
+- Serialize valid ZarrV3 metadata and require full compressor numcodec config (for :pull:`193`)
   By `Gustavo Hidalgo <https://github.com/ghidalgo3>`_.
 
 Deprecations
diff --git a/virtualizarr/tests/__init__.py b/virtualizarr/tests/__init__.py
index 3856a6b..7df13d1 100644
--- a/virtualizarr/tests/__init__.py
+++ b/virtualizarr/tests/__init__.py
@@ -48,9 +48,9 @@ def create_manifestarray(
 
     zarray = ZArray(
         chunks=chunks,
-        compressor="zlib",
+        compressor={"id": "blosc", "clevel": 5, "cname": "lz4", "shuffle": 1},
         dtype=np.dtype("float32"),
-        fill_value=0.0,  # TODO change this to NaN?
+        fill_value=0.0,
         filters=None,
         order="C",
         shape=shape,
diff --git a/virtualizarr/tests/test_manifests/test_array.py b/virtualizarr/tests/test_manifests/test_array.py
index 459e60b..6d5ede7 100644
--- a/virtualizarr/tests/test_manifests/test_array.py
+++ b/virtualizarr/tests/test_manifests/test_array.py
@@ -19,7 +19,7 @@ def test_create_manifestarray(self):
         shape = (5, 2, 20)
         zarray = ZArray(
             chunks=chunks,
-            compressor="zlib",
+            compressor={"id": "zlib", "level": 1},
             dtype=np.dtype("int32"),
             fill_value=0.0,
             filters=None,
@@ -74,7 +74,7 @@ def test_equals(self):
         shape = (5, 2, 20)
         zarray = ZArray(
             chunks=chunks,
-            compressor="zlib",
+            compressor={"id": "zlib", "level": 1},
             dtype=np.dtype("int32"),
             fill_value=0.0,
             filters=None,
@@ -95,7 +95,7 @@ def test_not_equal_chunk_entries(self):
         # both manifest arrays in this example have the same zarray properties
         zarray = ZArray(
             chunks=(5, 1, 10),
-            compressor="zlib",
+            compressor={"id": "zlib", "level": 1},
             dtype=np.dtype("int32"),
             fill_value=0.0,
             filters=None,
@@ -209,7 +209,7 @@ def test_concat(self):
         # both manifest arrays in this example have the same zarray properties
         zarray = ZArray(
             chunks=(5, 1, 10),
-            compressor="zlib",
+            compressor={"id": "zlib", "level": 1},
             dtype=np.dtype("int32"),
             fill_value=0.0,
             filters=None,
@@ -254,7 +254,7 @@ def test_stack(self):
         # both manifest arrays in this example have the same zarray properties
         zarray = ZArray(
             chunks=(5, 10),
-            compressor="zlib",
+            compressor={"id": "zlib", "level": 1},
             dtype=np.dtype("int32"),
             fill_value=0.0,
             filters=None,
@@ -299,7 +299,7 @@ def test_refuse_combine():
 
     zarray_common = {
         "chunks": (5, 1, 10),
-        "compressor": "zlib",
+        "compressor": {"id": "zlib", "level": 1},
         "dtype": np.dtype("int32"),
         "fill_value": 0.0,
         "filters": None,
diff --git a/virtualizarr/tests/test_xarray.py b/virtualizarr/tests/test_xarray.py
index d0fe2e3..7fb7a02 100644
--- a/virtualizarr/tests/test_xarray.py
+++ b/virtualizarr/tests/test_xarray.py
@@ -19,7 +19,7 @@ def test_wrapping():
     dtype = np.dtype("int32")
     zarray = ZArray(
         chunks=chunks,
-        compressor="zlib",
+        compressor={"id": "zlib", "level": 1},
         dtype=dtype,
         fill_value=0.0,
         filters=None,
@@ -49,7 +49,7 @@ def test_equals(self):
         shape = (5, 20)
         zarray = ZArray(
             chunks=chunks,
-            compressor="zlib",
+            compressor={"id": "zlib", "level": 1},
             dtype=np.dtype("int32"),
             fill_value=0.0,
             filters=None,
@@ -86,7 +86,7 @@ def test_concat_along_existing_dim(self):
         # both manifest arrays in this example have the same zarray properties
         zarray = ZArray(
             chunks=(1, 10),
-            compressor="zlib",
+            compressor={"id": "zlib", "level": 1},
             dtype=np.dtype("int32"),
             fill_value=0.0,
             filters=None,
@@ -133,7 +133,7 @@ def test_concat_along_new_dim(self):
         # both manifest arrays in this example have the same zarray properties
         zarray = ZArray(
             chunks=(5, 10),
-            compressor="zlib",
+            compressor={"id": "zlib", "level": 1},
             dtype=np.dtype("int32"),
             fill_value=0.0,
             filters=None,
@@ -183,7 +183,7 @@ def test_concat_dim_coords_along_existing_dim(self):
         # both manifest arrays in this example have the same zarray properties
         zarray = ZArray(
             chunks=(10,),
-            compressor="zlib",
+            compressor={"id": "zlib", "level": 1},
             dtype=np.dtype("int32"),
             fill_value=0.0,
             filters=None,
diff --git a/virtualizarr/tests/test_zarr.py b/virtualizarr/tests/test_zarr.py
index 29db840..7715d24 100644
--- a/virtualizarr/tests/test_zarr.py
+++ b/virtualizarr/tests/test_zarr.py
@@ -20,7 +20,7 @@ def vds_with_manifest_arrays() -> xr.Dataset:
             shape=(2, 3),
             dtype=np.dtype("<i8"),
             chunks=(2, 3),
-            compressor="gzip",
+            compressor={"id": "zlib", "level": 1},
             filters=None,
             fill_value=0,
             order="C",
diff --git a/virtualizarr/zarr.py b/virtualizarr/zarr.py
index 7e5674e..f772e8d 100644
--- a/virtualizarr/zarr.py
+++ b/virtualizarr/zarr.py
@@ -354,20 +354,21 @@ def metadata_from_zarr_json(filepath: Path) -> tuple[ZArray, list[str], dict]:
         )
     else:
         fill_value = metadata["fill_value"]
+
     all_codecs = [
         codec
         for codec in metadata["codecs"]
         if codec["name"] not in ("transpose", "bytes")
     ]
-    # TODO: hdf.py treats all codecs as filter, but maybe one needs to be the compressor?
-    compressor = None #all_codecs[0] if all_codecs else None
-    filters = [_configurable_to_num_codec_config(_filter) for _filter in all_codecs] or None
+    compressor, *filters = [
+        _configurable_to_num_codec_config(_filter) for _filter in all_codecs
+    ]
     zarray = ZArray(
-        chunks=metadata["chunk_grid"]["configuration"]["chunk_shape"],
-        compressor=_configurable_to_num_codec_config(compressor) if compressor else None,
+        chunks=chunk_shape,
+        compressor=compressor,
         dtype=np.dtype(metadata["data_type"]),
         fill_value=fill_value,
-        filters=filters,
+        filters=filters or None,
         order="C",
         shape=shape,
         zarr_format=zarr_format,
@@ -375,6 +376,7 @@ def metadata_from_zarr_json(filepath: Path) -> tuple[ZArray, list[str], dict]:
 
     return zarray, dim_names, attrs
 
+
 def _configurable_to_num_codec_config(configurable: dict) -> dict:
     """
     Convert a zarr v3 configurable into a numcodecs codec.
@@ -384,6 +386,7 @@ def _configurable_to_num_codec_config(configurable: dict) -> dict:
     configuration = configurable_copy.pop("configuration")
     return numcodecs.get_codec({"id": codec_id, **configuration}).get_config()
 
+
 def _num_codec_config_to_configurable(num_codec: dict) -> dict:
     """
     Convert a numcodecs codec into a zarr v3 configurable.
@@ -391,6 +394,7 @@ def _num_codec_config_to_configurable(num_codec: dict) -> dict:
     num_codec_copy = num_codec.copy()
     return {"name": num_codec_copy.pop("id"), "configuration": num_codec_copy}
 
+
 def _default_fill_value(dtype: np.dtype) -> Union[bool, int, float, str, list]:
     """
     The value and format of the fill_value depend on the data_type of the array.

From ab97e6398c3c97712bf56bb249a8ea2420bc28cb Mon Sep 17 00:00:00 2001
From: Gustavo Hidalgo <guhidalgo@microsoft.com>
Date: Mon, 22 Jul 2024 11:36:10 -0400
Subject: [PATCH 67/68] PR comments

---
 virtualizarr/zarr.py | 42 +++++++++++++++---------------------------
 1 file changed, 15 insertions(+), 27 deletions(-)

diff --git a/virtualizarr/zarr.py b/virtualizarr/zarr.py
index f772e8d..932e7da 100644
--- a/virtualizarr/zarr.py
+++ b/virtualizarr/zarr.py
@@ -6,7 +6,6 @@
     Literal,
     NewType,
     Optional,
-    Union,
 )
 
 import numcodecs
@@ -33,6 +32,20 @@
 )  # just the .zattrs (for one array or for the whole store/group)
 FillValueT = bool | str | float | int | list | None
 
+ZARR_DEFAULT_FILL_VALUE: dict[np.dtype, FillValueT] = {
+    # numpy dtypes's hierarchy lets us avoid checking for all the widths
+    # https://numpy.org/doc/stable/reference/arrays.scalars.html
+    np.dtype("bool"): False,
+    np.dtype("int"): 0,
+    np.dtype("float"): 0.0,
+    np.dtype("complex"): [0.0, 0.0],
+}
+"""
+The value and format of the fill_value depend on the `data_type` of the array.
+See here for spec:
+https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html#fill-value
+"""
+
 
 class Codec(BaseModel):
     compressor: dict | None = None
@@ -77,7 +90,7 @@ def __post_init__(self) -> None:
     @model_validator(mode="after")
     def _check_fill_value(self) -> Self:
         if self.fill_value is None:
-            self.fill_value = _default_fill_value(self.dtype)
+            self.fill_value = ZARR_DEFAULT_FILL_VALUE.get(self.dtype, default=0.0)
         return self
 
     @property
@@ -96,11 +109,6 @@ def from_kerchunk_refs(cls, decoded_arr_refs_zarray) -> "ZArray":
             fill_value = np.nan
 
         compressor = decoded_arr_refs_zarray["compressor"]
-        # deal with an inconsistency in kerchunk's tiff_to_zarr function
-        # TODO should this be moved to the point where we actually call tiff_to_zarr? Or ideally made consistent upstream.
-        # if compressor is not None and "id" in compressor:
-        #     compressor = compressor["id"]
-
         return ZArray(
             chunks=tuple(decoded_arr_refs_zarray["chunks"]),
             compressor=compressor,
@@ -393,23 +401,3 @@ def _num_codec_config_to_configurable(num_codec: dict) -> dict:
     """
     num_codec_copy = num_codec.copy()
     return {"name": num_codec_copy.pop("id"), "configuration": num_codec_copy}
-
-
-def _default_fill_value(dtype: np.dtype) -> Union[bool, int, float, str, list]:
-    """
-    The value and format of the fill_value depend on the data_type of the array.
-    See here for spec:
-    https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html#fill-value
-    """
-    # numpy dtypes's hierarchy lets us avoid checking for all the widths
-    # https://numpy.org/doc/stable/reference/arrays.scalars.html
-    if dtype is np.dtype("bool"):
-        return False
-    elif dtype is np.dtype("int"):
-        return 0
-    elif dtype is np.dtype("float"):
-        return 0.0
-    elif dtype is np.dtype("complex"):
-        return [0.0, 0.0]
-    else:
-        return 0.0

From 0be0728416a4d572d3aab0fb356d776f7173876c Mon Sep 17 00:00:00 2001
From: Gustavo Hidalgo <guhidalgo@microsoft.com>
Date: Mon, 22 Jul 2024 11:44:05 -0400
Subject: [PATCH 68/68] Remove kwarg in dict default

---
 virtualizarr/zarr.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/virtualizarr/zarr.py b/virtualizarr/zarr.py
index 932e7da..e5015b3 100644
--- a/virtualizarr/zarr.py
+++ b/virtualizarr/zarr.py
@@ -90,7 +90,7 @@ def __post_init__(self) -> None:
     @model_validator(mode="after")
     def _check_fill_value(self) -> Self:
         if self.fill_value is None:
-            self.fill_value = ZARR_DEFAULT_FILL_VALUE.get(self.dtype, default=0.0)
+            self.fill_value = ZARR_DEFAULT_FILL_VALUE.get(self.dtype, 0.0)
         return self
 
     @property