Conformant ZarrV3 codecs and fill values #193

Merged
Changes from 7 commits
79 commits
6b7abe2
Generate chunk manifest backed variable from HDF5 dataset.
sharkinsspatial Apr 19, 2024
bca0aab
Transfer dataset attrs to variable.
sharkinsspatial Apr 19, 2024
384ff6b
Get virtual variables dict from HDF5 file.
sharkinsspatial Apr 19, 2024
4c5f9bd
Update virtual_vars_from_hdf to use fsspec and drop_variables arg.
sharkinsspatial Apr 22, 2024
1dd3370
mypy fix to use ChunkKey and empty dimensions list.
sharkinsspatial Apr 22, 2024
d92c75c
Extract attributes from hdf5 root group.
sharkinsspatial Apr 22, 2024
0ed8362
Use hdf reader for netcdf4 files.
sharkinsspatial Apr 22, 2024
f4485fa
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 22, 2024
3cc1254
Merge branch 'main' into hdf5_reader
sharkinsspatial May 8, 2024
0123df7
Fix ruff complaints.
sharkinsspatial May 9, 2024
332bcaa
First steps for handling HDF5 filters.
sharkinsspatial May 10, 2024
c51e615
Initial step for hdf5plugin supported codecs.
sharkinsspatial May 13, 2024
0083f77
Small commit to check compression support in CI environment.
sharkinsspatial May 16, 2024
3c00071
Merge branch 'main' into hdf5_reader
sharkinsspatial May 18, 2024
207c4b5
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] May 19, 2024
c573800
Fix mypy complaints for hdf_filters.
sharkinsspatial May 19, 2024
ef0d7a8
Merge branch 'hdf5_reader' of https://github.com/TomNicholas/Virtuali…
sharkinsspatial May 19, 2024
588e06b
Local pre-commit fix for hdf_filters.
sharkinsspatial May 19, 2024
725333e
Use fsspec reader_options introduced in #37.
sharkinsspatial May 21, 2024
72df108
Fix incorrect zarr_v3 if block position from merge commit ef0d7a8.
sharkinsspatial May 21, 2024
d1e85cb
Fix early return from hdf _extract_attrs.
sharkinsspatial May 21, 2024
1e2b343
Test that _extract_attrs correctly handles multiple attributes.
sharkinsspatial May 21, 2024
7f1c189
Initial attempt at scale and offset via numcodecs.
sharkinsspatial May 22, 2024
908e332
Tests for cfcodec_from_dataset.
sharkinsspatial May 23, 2024
0df332d
Temporarily relax integration tests to assert_allclose.
sharkinsspatial May 24, 2024
ca6b236
Add blosc_lz4 fixture parameterization to confirm libnetcdf environment.
sharkinsspatial May 24, 2024
b7426c5
Check for compatibility with netcdf4 engine.
sharkinsspatial May 24, 2024
dac21dd
Use separate fixtures for h5netcdf and netcdf4 compression styles.
sharkinsspatial May 27, 2024
e968772
Print libhdf5 and libnetcdf4 versions to confirm compiled environment.
sharkinsspatial May 27, 2024
9a98e57
Skip netcdf4 style compression tests when libhdf5 < 1.14.
sharkinsspatial May 27, 2024
7590b87
Include imagecodecs.numcodecs to support HDF5 lzf filters.
sharkinsspatial Jun 11, 2024
e9fbc8a
Merge branch 'main' into hdf5_reader
sharkinsspatial Jun 11, 2024
14bd709
Remove test that verifies call to read_kerchunk_references_from_file.
sharkinsspatial Jun 11, 2024
acdf0d7
Add additional codec support structures for imagecodecs and numcodecs.
sharkinsspatial Jun 12, 2024
4ba323a
Add codec config test for Zstd.
sharkinsspatial Jun 12, 2024
e14e53b
Include initial cf decoding tests.
sharkinsspatial Jun 21, 2024
b808ded
Merge branch 'main' into hdf5_reader
sharkinsspatial Jun 21, 2024
b052f8c
Revert typo for scale_factor retrieval.
sharkinsspatial Jun 21, 2024
01a3980
Update reader to use new numpy manifest representation.
sharkinsspatial Jun 21, 2024
c37d9e5
Temporarily skip test until blosc netcdf4 issue is solved.
sharkinsspatial Jun 22, 2024
17b30d4
Fix Pydantic 2 migration warnings.
sharkinsspatial Jun 22, 2024
f6b596a
Include hdf5plugin and imagecodecs-numcodecs in mamba test environment.
sharkinsspatial Jun 22, 2024
eb6e24d
Mamba attempt with imagecodecs rather than imagecodecs-numcodecs.
sharkinsspatial Jun 22, 2024
c85bd16
Mamba attempt with latest imagecodecs release.
sharkinsspatial Jun 22, 2024
ca435da
Use correct iter_chunks callback function signature.
sharkinsspatial Jun 26, 2024
3017951
Include pip based imagecodecs-numcodecs until conda-forge availability.
sharkinsspatial Jun 26, 2024
ccf0b73
Merge branch 'main' into hdf5_reader
sharkinsspatial Jun 26, 2024
32ba135
Handle non-coordinate dims which are serialized to hdf as empty dataset.
sharkinsspatial Jun 27, 2024
64f446c
Use reader_options for filetype check and update failing kerchunk call.
sharkinsspatial Jun 27, 2024
1c590bb
Merge branch 'main' into hdf5_reader
sharkinsspatial Jun 27, 2024
9797346
Fix chunkmanifest shaping for chunked datasets.
sharkinsspatial Jun 30, 2024
c833e19
Handle scale_factor attribute serialization for compressed files.
sharkinsspatial Jun 30, 2024
701bcfa
Include chunked roundtrip fixture.
sharkinsspatial Jun 30, 2024
08c988e
Standardize xarray integration tests for hdf filters.
sharkinsspatial Jun 30, 2024
e6076bd
Merge branch 'hdf5_reader' of https://github.com/TomNicholas/Virtuali…
sharkinsspatial Jun 30, 2024
d684a84
Merge branch 'main' into hdf5_reader
sharkinsspatial Jun 30, 2024
4cb4bac
Update reader selection logic for new filetype determination.
sharkinsspatial Jun 30, 2024
d352104
Use decode_times for integration test.
sharkinsspatial Jun 30, 2024
3d89ea4
Standardize fixture names for hdf5 vs netcdf4 file types.
sharkinsspatial Jun 30, 2024
c9dd0d9
Handle array add_offset property for compressed data.
sharkinsspatial Jul 1, 2024
db5b421
Include h5py shuffle filter.
sharkinsspatial Jul 1, 2024
9a1da32
Make ScaleAndOffset codec last in filters list.
sharkinsspatial Jul 1, 2024
9b2b0f8
Apply ScaleAndOffset codec to _FillValue since its value is now down…
sharkinsspatial Jul 2, 2024
9ef1362
Coerce scale and add_offset values to native float for JSON serializa…
sharkinsspatial Jul 2, 2024
eb16bc1
Conformant ZarrV3 codecs
ghidalgo3 Jul 17, 2024
5f1b7f9
Update docs
ghidalgo3 Jul 17, 2024
519d45d
Update virtualizarr/zarr.py
ghidalgo3 Jul 18, 2024
76e9c8e
Update virtualizarr/zarr.py
ghidalgo3 Jul 18, 2024
000c520
Change default_fill to 0s
ghidalgo3 Jul 18, 2024
25d04b9
Merge branch 'guhidalgo/fixmetadatacodecs' of https://github.com/ghid…
ghidalgo3 Jul 18, 2024
c2e7279
Generate permutation
ghidalgo3 Jul 18, 2024
145960a
Pythonic isinstance check
ghidalgo3 Jul 18, 2024
c051f04
Add return type to isconfigurable
ghidalgo3 Jul 18, 2024
7a65fbd
Merge remote-tracking branch 'upstream/hdf5_reader' into codecs
Jul 18, 2024
7b09324
Changes from pair programming for zarrv3 to kerchunk file reading
Jul 19, 2024
2c59256
Revert "Merge remote-tracking branch 'upstream/hdf5_reader' into codecs"
Jul 19, 2024
50c3dcd
Fix unit tests
ghidalgo3 Jul 19, 2024
ab97e63
PR comments
ghidalgo3 Jul 22, 2024
0be0728
Remove kwarg in dict default
ghidalgo3 Jul 22, 2024
3 changes: 3 additions & 0 deletions docs/releases.rst
@@ -12,6 +12,9 @@ New Features
Breaking changes
~~~~~~~~~~~~~~~~

- Serialize valid ZarrV3 metadata (for :pull:`193`).
By `Gustavo Hidalgo <https://github.com/ghidalgo3>`_.

Deprecations
~~~~~~~~~~~~

2 changes: 1 addition & 1 deletion virtualizarr/tests/test_integration.py
@@ -138,7 +138,7 @@ def test_non_dimension_coordinates(self, tmpdir, format):
# regression test for GH issue #105

# set up example xarray dataset containing non-dimension coordinate variables
ds = xr.Dataset(coords={"lat": (["x", "y"], np.arange(6).reshape(2, 3))})
ds = xr.Dataset(coords={"lat": (["x", "y"], np.arange(6.0).reshape(2, 3))})

# save it to disk as netCDF (in temporary directory)
ds.to_netcdf(f"{tmpdir}/non_dim_coords.nc")
62 changes: 56 additions & 6 deletions virtualizarr/tests/test_zarr.py
@@ -1,12 +1,17 @@
import json

import numpy as np
import pytest
import xarray as xr
import xarray.testing as xrt

from virtualizarr import ManifestArray, open_virtual_dataset
from virtualizarr.manifests.manifest import ChunkManifest
from virtualizarr.zarr import dataset_to_zarr, metadata_from_zarr_json


def test_zarr_v3_roundtrip(tmpdir):
@pytest.fixture
def vds_with_manifest_arrays() -> xr.Dataset:
arr = ManifestArray(
chunkmanifest=ChunkManifest(
entries={"0.0": dict(path="test.nc", offset=6144, length=48)}
@@ -15,18 +20,63 @@ def test_zarr_v3_roundtrip(tmpdir):
shape=(2, 3),
dtype=np.dtype("<i8"),
chunks=(2, 3),
compressor=None,
compressor="gzip",
filters=None,
fill_value=np.nan,
fill_value=0,
order="C",
zarr_format=3,
),
)
original = xr.Dataset({"a": (["x", "y"], arr)}, attrs={"something": 0})
return xr.Dataset({"a": (["x", "y"], arr)}, attrs={"something": 0})


def isconfigurable(value: dict) -> bool:
"""
Several metadata attributes in ZarrV3 use a dictionary with keys "name" : str and "configuration" : dict
"""
return "name" in value and "configuration" in value

original.virtualize.to_zarr(tmpdir / "store.zarr")

def test_zarr_v3_roundtrip(tmpdir, vds_with_manifest_arrays: xr.Dataset):
vds_with_manifest_arrays.virtualize.to_zarr(tmpdir / "store.zarr")
roundtrip = open_virtual_dataset(
tmpdir / "store.zarr", filetype="zarr_v3", indexes={}
)

xrt.assert_identical(roundtrip, original)
xrt.assert_identical(roundtrip, vds_with_manifest_arrays)


def test_metadata_roundtrip(tmpdir, vds_with_manifest_arrays: xr.Dataset):
dataset_to_zarr(vds_with_manifest_arrays, tmpdir / "store.zarr")
zarray, _, _ = metadata_from_zarr_json(tmpdir / "store.zarr/a/zarr.json")
assert zarray == vds_with_manifest_arrays.a.data.zarray


def test_zarr_v3_metadata_conformance(tmpdir, vds_with_manifest_arrays: xr.Dataset):
"""
Checks that the output metadata of an array variable conforms to this spec
for the required attributes:
https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html#metadata
"""
dataset_to_zarr(vds_with_manifest_arrays, tmpdir / "store.zarr")
# read the a variable's metadata
with open(tmpdir / "store.zarr/a/zarr.json", mode="r") as f:
metadata = json.loads(f.read())
assert metadata["zarr_format"] == 3
assert metadata["node_type"] == "array"
assert isinstance(metadata["shape"], list) and all(
isinstance(dim, int) for dim in metadata["shape"]
)
assert isinstance(metadata["data_type"], str) or isconfigurable(
metadata["data_type"]
)
assert isconfigurable(metadata["chunk_grid"])
assert isconfigurable(metadata["chunk_key_encoding"])
assert any(
isinstance(metadata["fill_value"], t) for t in (bool, int, float, str, list)
)
assert (
isinstance(metadata["codecs"], list)
and len(metadata["codecs"]) > 1
and all(isconfigurable(codec) for codec in metadata["codecs"])
)
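For orientation, a hand-written `zarr.json` payload of the kind the conformance test above accepts (all values are illustrative, not necessarily what VirtualiZarr emits):

```python
example_metadata = {
    "zarr_format": 3,
    "node_type": "array",
    "shape": [2, 3],
    "data_type": "int64",
    "chunk_grid": {"name": "regular", "configuration": {"chunk_shape": [2, 3]}},
    "chunk_key_encoding": {"name": "default", "configuration": {"separator": "/"}},
    "fill_value": 0,
    "codecs": [
        {"name": "transpose", "configuration": {"order": [0, 1]}},
        {"name": "bytes", "configuration": {}},
        {"name": "gzip", "configuration": {"level": 1}},
    ],
}
```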
128 changes: 113 additions & 15 deletions virtualizarr/zarr.py
@@ -6,12 +6,21 @@
Literal,
NewType,
Optional,
Union,
)

import numcodecs
import numpy as np
import ujson # type: ignore
import xarray as xr
from pydantic import BaseModel, ConfigDict, field_validator
from pydantic import (
BaseModel,
ConfigDict,
Field,
field_validator,
model_validator,
)
from typing_extensions import Self

from virtualizarr.vendor.zarr.utils import json_dumps

@@ -22,6 +31,7 @@
ZAttrs = NewType(
"ZAttrs", dict[str, Any]
) # just the .zattrs (for one array or for the whole store/group)
FillValueT = bool | str | float | int | list | None


class Codec(BaseModel):
@@ -44,7 +54,7 @@ class ZArray(BaseModel):
chunks: tuple[int, ...]
compressor: str | None = None
dtype: np.dtype
fill_value: float | int | None = np.nan # float or int?
fill_value: FillValueT = Field(default=0.0, validate_default=True)
filters: list[dict] | None = None
order: Literal["C", "F"]
shape: tuple[int, ...]
@@ -64,6 +74,12 @@ def __post_init__(self) -> None:
f"Array shape {self.shape} has ndim={len(self.shape)} but chunk shape {self.chunks} has ndim={len(self.chunks)}"
)

@model_validator(mode="after")
def _check_fill_value(self) -> Self:
if self.fill_value is None:
self.fill_value = _default_fill_value(self.dtype)
return self
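A small sketch of the validator's effect (field values chosen for illustration; it relies on the `_default_fill_value` helper defined at the bottom of this module):

```python
arr = ZArray(shape=(2, 3), chunks=(2, 3), dtype=np.dtype("<i8"), order="C", fill_value=None)
assert arr.fill_value == 0  # None was replaced by the dtype's default fill value
```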

@property
def codec(self) -> Codec:
"""For comparison against other arrays."""
@@ -98,16 +114,14 @@ def from_kerchunk_refs(cls, decoded_arr_refs_zarray) -> "ZArray":

def dict(self) -> dict[str, Any]:
zarray_dict = dict(self)

zarray_dict["dtype"] = encode_dtype(zarray_dict["dtype"])

if zarray_dict["fill_value"] is np.nan:
zarray_dict["fill_value"] = None

return zarray_dict

def to_kerchunk_json(self) -> str:
return ujson.dumps(self.dict())
zarray_dict = self.dict()
if zarray_dict["fill_value"] is np.nan:
zarray_dict["fill_value"] = None
return ujson.dumps(zarray_dict)
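A sketch of the split behaviour after this refactor (array parameters are illustrative): `dict()` now leaves `np.nan` untouched for internal use, while `to_kerchunk_json` still maps it to JSON `null`.

```python
arr = ZArray(shape=(2, 3), chunks=(2, 3), dtype=np.dtype("<f8"), order="C", fill_value=np.nan)
assert np.isnan(arr.dict()["fill_value"])             # nan survives .dict()
assert '"fill_value":null' in arr.to_kerchunk_json()  # but becomes null for kerchunk
```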

def replace(
self,
@@ -134,6 +148,61 @@ def replace(
zarr_format=zarr_format if zarr_format is not None else self.zarr_format,
)

def _v3_codec_pipeline(self) -> list:
"""
VirtualiZarr internally uses the `filters`, `compressor`, and `order` attributes
from zarr v2, but to create conformant zarr v3 metadata those 3 must be turned into `codecs` objects.
Not all codecs are created equal though: https://github.com/zarr-developers/zarr-python/issues/1943
An array _must_ declare a single ArrayBytes codec, and 0 or more ArrayArray, BytesBytes codecs.
Roughly, this is the mapping:
```
filters: Iterable[ArrayArrayCodec] #optional
compressor: ArrayBytesCodec #mandatory
post_compressor: Iterable[BytesBytesCodec] #optional
```
"""
if self.filters:
filter_codecs_configs = [
numcodecs.get_codec(filter).get_config() for filter in self.filters
]
filters = [
dict(name=codec.pop("id"), configuration=codec)
for codec in filter_codecs_configs
]
else:
filters = []

# Noting here that zarr v3 has very few codecs specified in the official spec,
# and that there are far more codecs in `numcodecs`. We take a gamble and assume
# that the codec names and configuration are simply mapped into zarrv3 "configurables".
compressor_codec = numcodecs.get_codec(
# default to gzip because it is officially specified in the zarr v3 spec
dict(id=self.compressor or "gzip")
).get_config()
compressor_id = compressor_codec.pop("id")
compressor = dict(name=compressor_id, configuration=compressor_codec)

# https://zarr-specs.readthedocs.io/en/latest/v3/codecs/transpose/v1.0.html#transpose-codec-v1
# Either "C" or "F", defining the layout of bytes within each chunk of the array.
# "C" means row-major order, i.e., the last dimension varies fastest;
# "F" means column-major order, i.e., the first dimension varies fastest.
if self.order == "C":
order = tuple(range(len(self.shape)))
elif self.order == "F":
order = tuple(reversed(range(len(self.shape))))

transpose = dict(name="transpose", configuration=dict(order=order))
# https://github.com/zarr-developers/zarr-python/pull/1944#issuecomment-2151994097
# "If no ArrayBytesCodec is supplied, we can auto-add a BytesCodec"
bytes = dict(
name="bytes", configuration={}
) # TODO need to handle endianness configuration

# The order here is significant!
# [ArrayArray] -> ArrayBytes -> [BytesBytes]
codec_pipeline = [transpose, bytes] + [compressor] + filters
return codec_pipeline
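To make the mapping concrete, a sketch of the pipeline this produces for a plain 2-D array with no filters and the default compressor (exact configuration values depend on the installed numcodecs version):

```python
arr = ZArray(shape=(2, 3), chunks=(2, 3), dtype=np.dtype("<i8"), order="C")
print(arr._v3_codec_pipeline())
# [{'name': 'transpose', 'configuration': {'order': (0, 1)}},
#  {'name': 'bytes', 'configuration': {}},
#  {'name': 'gzip', 'configuration': {'level': 1}}]
```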


def encode_dtype(dtype: np.dtype) -> str:
# TODO not sure if there is a better way to get the '<i4' style representation of the dtype out
@@ -234,9 +303,10 @@ def zarr_v3_array_metadata(zarray: ZArray, dim_names: list[str], attrs: dict) ->
"name": "default",
"configuration": {"separator": "/"},
}
metadata["codecs"] = metadata.pop("filters")
metadata.pop("compressor") # TODO this should be entered in codecs somehow
metadata.pop("order") # TODO this should be replaced by a transpose codec
metadata["codecs"] = zarray._v3_codec_pipeline()
metadata.pop("filters")
metadata.pop("compressor")
metadata.pop("order")

# indicate that we're using the manifest storage transformer ZEP
metadata["storage_transformers"] = [
@@ -279,19 +349,47 @@ def metadata_from_zarr_json(filepath: Path) -> tuple[ZArray, list[str], dict]:
chunk_shape = metadata["chunk_grid"]["configuration"]["chunk_shape"]

if metadata["fill_value"] is None:
fill_value = np.nan
raise ValueError(
"fill_value must be specified https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html#fill-value"
)
else:
fill_value = metadata["fill_value"]

all_codecs = [
codec
for codec in metadata["codecs"]
if codec["name"] not in ("transpose", "bytes")
]
compressor = all_codecs[0]
filters = [dict(id=f.pop("name"), **f) for f in all_codecs[1:]] or None
zarray = ZArray(
chunks=metadata["chunk_grid"]["configuration"]["chunk_shape"],
compressor=metadata["codecs"],
compressor=compressor["name"],
dtype=np.dtype(metadata["data_type"]),
fill_value=fill_value,
filters=metadata.get("filters", None),
filters=filters,
order="C",
shape=metadata["shape"],
zarr_format=3,
)

return zarray, dim_names, attrs
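The codec-splitting step above, run in isolation on an illustrative codec list (the shuffle entry is made up to show how extra codecs land in `filters`):

```python
codecs = [
    {"name": "transpose", "configuration": {"order": [0, 1]}},
    {"name": "bytes", "configuration": {}},
    {"name": "gzip", "configuration": {"level": 1}},
    {"name": "shuffle", "configuration": {"elementsize": 8}},
]
all_codecs = [c for c in codecs if c["name"] not in ("transpose", "bytes")]
compressor = all_codecs[0]  # {'name': 'gzip', 'configuration': {'level': 1}}
filters = [dict(id=f.pop("name"), **f) for f in all_codecs[1:]] or None
# filters == [{'id': 'shuffle', 'configuration': {'elementsize': 8}}]
```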


def _default_fill_value(dtype: np.dtype) -> Union[bool, int, float, str, list]:
"""
The value and format of the fill_value depend on the data_type of the array.
See here for spec:
https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html#fill-value
"""
# numpy's dtype hierarchy lets us avoid checking for every width explicitly
# https://numpy.org/doc/stable/reference/arrays.scalars.html
if np.issubdtype(dtype, np.bool_):
return False
elif np.issubdtype(dtype, np.integer):
return 0
elif np.issubdtype(dtype, np.floating):
return 0.0
elif np.issubdtype(dtype, np.complexfloating):
return [0.0, 0.0]
else:
return 0.0
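A quick spot-check of the mapping (a sketch; expected values follow the zarr v3 fill-value table linked above):

```python
for dt, expected in [("bool", False), ("uint16", 0), ("float32", 0.0), ("complex64", [0.0, 0.0])]:
    assert _default_fill_value(np.dtype(dt)) == expected
```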