diff --git a/docs/releases.rst b/docs/releases.rst index c44ff24..3eeed7e 100644 --- a/docs/releases.rst +++ b/docs/releases.rst @@ -12,6 +12,9 @@ New Features Breaking changes ~~~~~~~~~~~~~~~~ +- Serialize valid ZarrV3 metadata and require full compressor numcodec config (for :pull:`193`) + By `Gustavo Hidalgo <https://github.com/ghidalgo3>`_. + Deprecations ~~~~~~~~~~~~ diff --git a/virtualizarr/kerchunk.py b/virtualizarr/kerchunk.py index 6e82067..122b86b 100644 --- a/virtualizarr/kerchunk.py +++ b/virtualizarr/kerchunk.py @@ -266,7 +266,7 @@ def variable_to_kerchunk_arr_refs(var: xr.Variable, var_name: str) -> KerchunkAr for chunk_key, entry in marr.manifest.dict().items() } - zarray = marr.zarray + zarray = marr.zarray.replace(zarr_format=2) else: try: diff --git a/virtualizarr/tests/__init__.py b/virtualizarr/tests/__init__.py index 3856a6b..7df13d1 100644 --- a/virtualizarr/tests/__init__.py +++ b/virtualizarr/tests/__init__.py @@ -48,9 +48,9 @@ def create_manifestarray( zarray = ZArray( chunks=chunks, - compressor="zlib", + compressor={"id": "blosc", "clevel": 5, "cname": "lz4", "shuffle": 1}, dtype=np.dtype("float32"), - fill_value=0.0, # TODO change this to NaN? 
+ fill_value=0.0, filters=None, order="C", shape=shape, diff --git a/virtualizarr/tests/test_integration.py b/virtualizarr/tests/test_integration.py index 2e612de..239316a 100644 --- a/virtualizarr/tests/test_integration.py +++ b/virtualizarr/tests/test_integration.py @@ -138,7 +138,7 @@ def test_non_dimension_coordinates(self, tmpdir, format): # regression test for GH issue #105 # set up example xarray dataset containing non-dimension coordinate variables - ds = xr.Dataset(coords={"lat": (["x", "y"], np.arange(6).reshape(2, 3))}) + ds = xr.Dataset(coords={"lat": (["x", "y"], np.arange(6.0).reshape(2, 3))}) # save it to disk as netCDF (in temporary directory) ds.to_netcdf(f"{tmpdir}/non_dim_coords.nc") diff --git a/virtualizarr/tests/test_manifests/test_array.py b/virtualizarr/tests/test_manifests/test_array.py index 459e60b..6d5ede7 100644 --- a/virtualizarr/tests/test_manifests/test_array.py +++ b/virtualizarr/tests/test_manifests/test_array.py @@ -19,7 +19,7 @@ def test_create_manifestarray(self): shape = (5, 2, 20) zarray = ZArray( chunks=chunks, - compressor="zlib", + compressor={"id": "zlib", "level": 1}, dtype=np.dtype("int32"), fill_value=0.0, filters=None, @@ -74,7 +74,7 @@ def test_equals(self): shape = (5, 2, 20) zarray = ZArray( chunks=chunks, - compressor="zlib", + compressor={"id": "zlib", "level": 1}, dtype=np.dtype("int32"), fill_value=0.0, filters=None, @@ -95,7 +95,7 @@ def test_not_equal_chunk_entries(self): # both manifest arrays in this example have the same zarray properties zarray = ZArray( chunks=(5, 1, 10), - compressor="zlib", + compressor={"id": "zlib", "level": 1}, dtype=np.dtype("int32"), fill_value=0.0, filters=None, @@ -209,7 +209,7 @@ def test_concat(self): # both manifest arrays in this example have the same zarray properties zarray = ZArray( chunks=(5, 1, 10), - compressor="zlib", + compressor={"id": "zlib", "level": 1}, dtype=np.dtype("int32"), fill_value=0.0, filters=None, @@ -254,7 +254,7 @@ def test_stack(self): # both manifest 
arrays in this example have the same zarray properties zarray = ZArray( chunks=(5, 10), - compressor="zlib", + compressor={"id": "zlib", "level": 1}, dtype=np.dtype("int32"), fill_value=0.0, filters=None, @@ -299,7 +299,7 @@ def test_refuse_combine(): zarray_common = { "chunks": (5, 1, 10), - "compressor": "zlib", + "compressor": {"id": "zlib", "level": 1}, "dtype": np.dtype("int32"), "fill_value": 0.0, "filters": None, diff --git a/virtualizarr/tests/test_xarray.py b/virtualizarr/tests/test_xarray.py index d0fe2e3..7fb7a02 100644 --- a/virtualizarr/tests/test_xarray.py +++ b/virtualizarr/tests/test_xarray.py @@ -19,7 +19,7 @@ def test_wrapping(): dtype = np.dtype("int32") zarray = ZArray( chunks=chunks, - compressor="zlib", + compressor={"id": "zlib", "level": 1}, dtype=dtype, fill_value=0.0, filters=None, @@ -49,7 +49,7 @@ def test_equals(self): shape = (5, 20) zarray = ZArray( chunks=chunks, - compressor="zlib", + compressor={"id": "zlib", "level": 1}, dtype=np.dtype("int32"), fill_value=0.0, filters=None, @@ -86,7 +86,7 @@ def test_concat_along_existing_dim(self): # both manifest arrays in this example have the same zarray properties zarray = ZArray( chunks=(1, 10), - compressor="zlib", + compressor={"id": "zlib", "level": 1}, dtype=np.dtype("int32"), fill_value=0.0, filters=None, @@ -133,7 +133,7 @@ def test_concat_along_new_dim(self): # both manifest arrays in this example have the same zarray properties zarray = ZArray( chunks=(5, 10), - compressor="zlib", + compressor={"id": "zlib", "level": 1}, dtype=np.dtype("int32"), fill_value=0.0, filters=None, @@ -183,7 +183,7 @@ def test_concat_dim_coords_along_existing_dim(self): # both manifest arrays in this example have the same zarray properties zarray = ZArray( chunks=(10,), - compressor="zlib", + compressor={"id": "zlib", "level": 1}, dtype=np.dtype("int32"), fill_value=0.0, filters=None, diff --git a/virtualizarr/tests/test_zarr.py b/virtualizarr/tests/test_zarr.py index 80d04b9..7715d24 100644 --- 
a/virtualizarr/tests/test_zarr.py +++ b/virtualizarr/tests/test_zarr.py @@ -1,12 +1,17 @@ +import json + import numpy as np +import pytest import xarray as xr import xarray.testing as xrt from virtualizarr import ManifestArray, open_virtual_dataset from virtualizarr.manifests.manifest import ChunkManifest +from virtualizarr.zarr import dataset_to_zarr, metadata_from_zarr_json -def test_zarr_v3_roundtrip(tmpdir): +@pytest.fixture +def vds_with_manifest_arrays() -> xr.Dataset: arr = ManifestArray( chunkmanifest=ChunkManifest( entries={"0.0": dict(path="test.nc", offset=6144, length=48)} @@ -15,18 +20,61 @@ def test_zarr_v3_roundtrip(tmpdir): shape=(2, 3), dtype=np.dtype(" bool: + """ + Several metadata attributes in ZarrV3 use a dictionary with keys "name" : str and "configuration" : dict + """ + return "name" in value and "configuration" in value - original.virtualize.to_zarr(tmpdir / "store.zarr") + +def test_zarr_v3_roundtrip(tmpdir, vds_with_manifest_arrays: xr.Dataset): + vds_with_manifest_arrays.virtualize.to_zarr(tmpdir / "store.zarr") roundtrip = open_virtual_dataset( tmpdir / "store.zarr", filetype="zarr_v3", indexes={} ) - xrt.assert_identical(roundtrip, original) + xrt.assert_identical(roundtrip, vds_with_manifest_arrays) + + +def test_metadata_roundtrip(tmpdir, vds_with_manifest_arrays: xr.Dataset): + dataset_to_zarr(vds_with_manifest_arrays, tmpdir / "store.zarr") + zarray, _, _ = metadata_from_zarr_json(tmpdir / "store.zarr/a/zarr.json") + assert zarray == vds_with_manifest_arrays.a.data.zarray + + +def test_zarr_v3_metadata_conformance(tmpdir, vds_with_manifest_arrays: xr.Dataset): + """ + Checks that the output metadata of an array variable conforms to this spec + for the required attributes: + https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html#metadata + """ + dataset_to_zarr(vds_with_manifest_arrays, tmpdir / "store.zarr") + # read the a variable's metadata + with open(tmpdir / "store.zarr/a/zarr.json", mode="r") as f: + metadata = 
json.loads(f.read()) + assert metadata["zarr_format"] == 3 + assert metadata["node_type"] == "array" + assert isinstance(metadata["shape"], list) and all( + isinstance(dim, int) for dim in metadata["shape"] + ) + assert isinstance(metadata["data_type"], str) or isconfigurable( + metadata["data_type"] + ) + assert isconfigurable(metadata["chunk_grid"]) + assert isconfigurable(metadata["chunk_key_encoding"]) + assert isinstance(metadata["fill_value"], (bool, int, float, str, list)) + assert ( + isinstance(metadata["codecs"], list) + and len(metadata["codecs"]) > 1 + and all(isconfigurable(codec) for codec in metadata["codecs"]) + ) diff --git a/virtualizarr/zarr.py b/virtualizarr/zarr.py index 545a86f..e5015b3 100644 --- a/virtualizarr/zarr.py +++ b/virtualizarr/zarr.py @@ -8,10 +8,18 @@ Optional, ) +import numcodecs import numpy as np import ujson # type: ignore import xarray as xr -from pydantic import BaseModel, ConfigDict, field_validator +from pydantic import ( + BaseModel, + ConfigDict, + Field, + field_validator, + model_validator, +) +from typing_extensions import Self from virtualizarr.vendor.zarr.utils import json_dumps @@ -22,10 +30,25 @@ ZAttrs = NewType( "ZAttrs", dict[str, Any] ) # just the .zattrs (for one array or for the whole store/group) +FillValueT = bool | str | float | int | list | None + +ZARR_DEFAULT_FILL_VALUE: dict[np.dtype, FillValueT] = { + # numpy dtypes's hierarchy lets us avoid checking for all the widths + # https://numpy.org/doc/stable/reference/arrays.scalars.html + np.dtype("bool"): False, + np.dtype("int"): 0, + np.dtype("float"): 0.0, + np.dtype("complex"): [0.0, 0.0], +} +""" +The value and format of the fill_value depend on the `data_type` of the array. 
+See here for spec: +https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html#fill-value +""" class Codec(BaseModel): - compressor: str | None = None + compressor: dict | None = None filters: list[dict] | None = None def __repr__(self) -> str: @@ -42,9 +65,9 @@ class ZArray(BaseModel): ) chunks: tuple[int, ...] - compressor: str | None = None + compressor: dict | None = None dtype: np.dtype - fill_value: float | int | None = np.nan # float or int? + fill_value: FillValueT = Field(default=0.0, validate_default=True) filters: list[dict] | None = None order: Literal["C", "F"] shape: tuple[int, ...] @@ -64,6 +87,12 @@ def __post_init__(self) -> None: f"Array shape {self.shape} has ndim={self.shape} but chunk shape {self.chunks} has ndim={len(self.chunks)}" ) + @model_validator(mode="after") + def _check_fill_value(self) -> Self: + if self.fill_value is None: + self.fill_value = ZARR_DEFAULT_FILL_VALUE.get(self.dtype, 0.0) + return self + @property def codec(self) -> Codec: """For comparison against other arrays.""" @@ -80,11 +109,6 @@ def from_kerchunk_refs(cls, decoded_arr_refs_zarray) -> "ZArray": fill_value = np.nan compressor = decoded_arr_refs_zarray["compressor"] - # deal with an inconsistency in kerchunk's tiff_to_zarr function - # TODO should this be moved to the point where we actually call tiff_to_zarr? Or ideally made consistent upstream. 
- if compressor is not None and "id" in compressor: - compressor = compressor["id"] - return ZArray( chunks=tuple(decoded_arr_refs_zarray["chunks"]), compressor=compressor, @@ -98,21 +122,19 @@ def from_kerchunk_refs(cls, decoded_arr_refs_zarray) -> "ZArray": def dict(self) -> dict[str, Any]: zarray_dict = dict(self) - zarray_dict["dtype"] = encode_dtype(zarray_dict["dtype"]) - - if zarray_dict["fill_value"] is np.nan: - zarray_dict["fill_value"] = None - return zarray_dict def to_kerchunk_json(self) -> str: - return ujson.dumps(self.dict()) + zarray_dict = self.dict() + if zarray_dict["fill_value"] is np.nan: + zarray_dict["fill_value"] = None + return ujson.dumps(zarray_dict) def replace( self, chunks: Optional[tuple[int, ...]] = None, - compressor: Optional[str] = None, + compressor: Optional[dict] = None, dtype: Optional[np.dtype] = None, fill_value: Optional[float] = None, # float or int? filters: Optional[list[dict]] = None, # type: ignore[valid-type] @@ -134,6 +156,59 @@ def replace( zarr_format=zarr_format if zarr_format is not None else self.zarr_format, ) + def _v3_codec_pipeline(self) -> list: + """ + VirtualiZarr internally uses the `filters`, `compressor`, and `order` attributes + from zarr v2, but to create conformant zarr v3 metadata those 3 must be turned into `codecs` objects. + Not all codecs are created equal though: https://github.com/zarr-developers/zarr-python/issues/1943 + An array _must_ declare a single ArrayBytes codec, and 0 or more ArrayArray, BytesBytes codecs. 
+ Roughly, this is the mapping: + ``` + filters: Iterable[ArrayArrayCodec] #optional + compressor: ArrayBytesCodec #mandatory + post_compressor: Iterable[BytesBytesCodec] #optional + ``` + """ + if self.filters: + filter_codecs_configs = [ + numcodecs.get_codec(filter).get_config() for filter in self.filters + ] + filters = [ + dict(name=codec.pop("id"), configuration=codec) + for codec in filter_codecs_configs + ] + else: + filters = [] + + # Noting here that zarr v3 has very few codecs specified in the official spec, + # and that there are far more codecs in `numcodecs`. We take a gamble and assume + # that the codec names and configuration are simply mapped into zarrv3 "configurables". + if self.compressor: + compressor = [_num_codec_config_to_configurable(self.compressor)] + else: + compressor = [] + + # https://zarr-specs.readthedocs.io/en/latest/v3/codecs/transpose/v1.0.html#transpose-codec-v1 + # Either "C" or "F", defining the layout of bytes within each chunk of the array. + # "C" means row-major order, i.e., the last dimension varies fastest; + # "F" means column-major order, i.e., the first dimension varies fastest. + if self.order == "C": + order = tuple(range(len(self.shape))) + elif self.order == "F": + order = tuple(reversed(range(len(self.shape)))) + + transpose = dict(name="transpose", configuration=dict(order=order)) + # https://github.com/zarr-developers/zarr-python/pull/1944#issuecomment-2151994097 + # "If no ArrayBytesCodec is supplied, we can auto-add a BytesCodec" + bytes = dict( + name="bytes", configuration={} + ) # TODO need to handle endianness configuration + + # The order here is significant! 
+ # [ArrayArray] -> ArrayBytes -> [BytesBytes] + codec_pipeline = [transpose, bytes] + compressor + filters + return codec_pipeline + def encode_dtype(dtype: np.dtype) -> str: # TODO not sure if there is a better way to get the ' "name": "default", "configuration": {"separator": "/"}, } - metadata["codecs"] = metadata.pop("filters") - metadata.pop("compressor") # TODO this should be entered in codecs somehow - metadata.pop("order") # TODO this should be replaced by a transpose codec + metadata["codecs"] = zarray._v3_codec_pipeline() + metadata.pop("filters") + metadata.pop("compressor") + metadata.pop("order") # indicate that we're using the manifest storage transformer ZEP metadata["storage_transformers"] = [ @@ -277,21 +353,51 @@ def metadata_from_zarr_json(filepath: Path) -> tuple[ZArray, list[str], dict]: dim_names = metadata.pop("dimension_names") chunk_shape = metadata["chunk_grid"]["configuration"]["chunk_shape"] + shape = metadata["shape"] + zarr_format = metadata["zarr_format"] if metadata["fill_value"] is None: - fill_value = np.nan + raise ValueError( + "fill_value must be specified https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html#fill-value" + ) else: fill_value = metadata["fill_value"] + all_codecs = [ + codec + for codec in metadata["codecs"] + if codec["name"] not in ("transpose", "bytes") + ] + compressor, *filters = [ + _configurable_to_num_codec_config(_filter) for _filter in all_codecs + ] zarray = ZArray( - chunks=metadata["chunk_grid"]["configuration"]["chunk_shape"], - compressor=metadata["codecs"], + chunks=chunk_shape, + compressor=compressor, dtype=np.dtype(metadata["data_type"]), fill_value=fill_value, - filters=metadata.get("filters", None), + filters=filters or None, order="C", - shape=chunk_shape, - zarr_format=3, + shape=shape, + zarr_format=zarr_format, ) return zarray, dim_names, attrs + + +def _configurable_to_num_codec_config(configurable: dict) -> dict: + """ + Convert a zarr v3 configurable into a numcodecs codec. 
+ """ + configurable_copy = configurable.copy() + codec_id = configurable_copy.pop("name") + configuration = configurable_copy.pop("configuration") + return numcodecs.get_codec({"id": codec_id, **configuration}).get_config() + + +def _num_codec_config_to_configurable(num_codec: dict) -> dict: + """ + Convert a numcodecs codec into a zarr v3 configurable. + """ + num_codec_copy = num_codec.copy() + return {"name": num_codec_copy.pop("id"), "configuration": num_codec_copy}