diff --git a/virtualizarr/kerchunk.py b/virtualizarr/kerchunk.py index 97f64b1b..f3b2ed73 100644 --- a/virtualizarr/kerchunk.py +++ b/virtualizarr/kerchunk.py @@ -9,10 +9,11 @@ import ujson # type: ignore import xarray as xr from xarray.coding.times import CFDatetimeCoder +from zarr.array import ArrayMetadata, ArrayV2Metadata from virtualizarr.manifests.manifest import join from virtualizarr.utils import _fsspec_openfile_from_filepath -from virtualizarr.zarr import ZArray, ZAttrs +from virtualizarr.zarr import ZAttrs, from_kerchunk_refs, to_kerchunk_json # Distinguishing these via type hints makes it a lot easier to mentally keep track of what the opaque kerchunk "reference dicts" actually mean # (idea from https://kobzol.github.io/rust/python/2023/05/20/writing-python-like-its-rust.html) @@ -195,8 +196,8 @@ def extract_array_refs( def parse_array_refs( arr_refs: KerchunkArrRefs, -) -> tuple[dict, ZArray, ZAttrs]: - zarray = ZArray.from_kerchunk_refs(arr_refs.pop(".zarray")) +) -> tuple[dict, ArrayMetadata, ZAttrs]: + zarray = from_kerchunk_refs(arr_refs.pop(".zarray")) zattrs = arr_refs.pop(".zattrs", {}) chunk_dict = arr_refs @@ -296,15 +297,15 @@ def variable_to_kerchunk_arr_refs(var: xr.Variable, var_name: str) -> KerchunkAr # TODO can this be generalized to save individual chunks of a dask array? # TODO will this fail for a scalar? arr_refs = {join(0 for _ in np_arr.shape): inlined_data} - - zarray = ZArray( + zarray = ArrayV2Metadata( chunks=np_arr.shape, shape=np_arr.shape, dtype=np_arr.dtype, order="C", + fill_value=np.nan, ) - zarray_dict = zarray.to_kerchunk_json() + zarray_dict = to_kerchunk_json(zarray) arr_refs[".zarray"] = zarray_dict zattrs = {**var.attrs, **var.encoding} diff --git a/virtualizarr/manifests/array.py b/virtualizarr/manifests/array.py index a0983dec..6b2039a1 100644 --- a/virtualizarr/manifests/array.py +++ b/virtualizarr/manifests/array.py @@ -2,9 +2,9 @@ from typing import Any, Callable, Union import numpy as np +from zarr.array import ArrayMetadata, ArrayV2Metadata, ArrayV3Metadata, RegularChunkGrid from ..kerchunk import KerchunkArrRefs -from ..zarr import ZArray from .array_api import MANIFESTARRAY_HANDLED_ARRAY_FUNCTIONS from .manifest import ChunkManifest @@ -22,11 +22,11 @@ class ManifestArray: """ _manifest: ChunkManifest - _zarray: ZArray + _zarray: ArrayMetadata def __init__( self, - zarray: ZArray | dict, + zarray: ArrayMetadata | dict, chunkmanifest: dict | ChunkManifest, ) -> None: """ @@ -34,15 +34,20 @@ def __init__( Parameters ---------- - zarray : dict or ZArray + zarray : dict or zarr.array.ArrayMetadata chunkmanifest : dict or ChunkManifest """ - if isinstance(zarray, ZArray): - _zarray = zarray - else: - # try unpacking the dict - _zarray = ZArray(**zarray) + match zarray: + case ArrayMetadata(): + _zarray = zarray + case dict(): + zarray = zarray.copy() + zarr_format = zarray.pop("zarr_format", None) + if zarr_format == 3: + _zarray = ArrayV3Metadata(**zarray) + else: + _zarray = ArrayV2Metadata(**zarray) if isinstance(chunkmanifest, ChunkManifest): _chunkmanifest = chunkmanifest @@ -79,12 +84,20 @@ def manifest(self) -> ChunkManifest: return self._manifest @property - def zarray(self) -> ZArray: + def zarray(self) -> ArrayMetadata: return self._zarray @property def chunks(self) -> tuple[int, ...]: - return tuple(self.zarray.chunks) + """ + Individual chunk size by number of elements. + """ + if isinstance(self._zarray.chunk_grid, RegularChunkGrid): + return self._zarray.chunk_grid.chunk_shape + else: + raise NotImplementedError( + "Only RegularChunkGrid is currently supported for chunk size" + ) @property def dtype(self) -> np.dtype: @@ -93,6 +106,9 @@ def dtype(self) -> np.dtype: @property def shape(self) -> tuple[int, ...]: + """ + Array shape by number of elements along each dimension. + """ return tuple(int(length) for length in list(self.zarray.shape)) @property diff --git a/virtualizarr/manifests/array_api.py b/virtualizarr/manifests/array_api.py index 0ecdc023..96ac776c 100644 --- a/virtualizarr/manifests/array_api.py +++ b/virtualizarr/manifests/array_api.py @@ -1,6 +1,9 @@ -from typing import TYPE_CHECKING, Callable, Iterable +from dataclasses import replace +from typing import TYPE_CHECKING, Callable, Iterable, Union import numpy as np +from zarr.abc.codec import Codec as ZCodec +from zarr.array import ArrayMetadata, ArrayV2Metadata, ArrayV3Metadata, RegularChunkGrid from virtualizarr.zarr import Codec, ceildiv @@ -34,7 +37,7 @@ def _check_combineable_zarr_arrays(arrays: Iterable["ManifestArray"]) -> None: # Can't combine different codecs in one manifest # see https://github.com/zarr-developers/zarr-specs/issues/288 - _check_same_codecs([arr.zarray.codec for arr in arrays]) + _check_same_codecs([arr.zarray for arr in arrays]) # Would require variable-length chunks ZEP _check_same_chunk_shapes([arr.chunks for arr in arrays]) @@ -51,7 +54,20 @@ def _check_same_dtypes(dtypes: list[np.dtype]) -> None: ) -def _check_same_codecs(codecs: list[Codec]) -> None: +def _check_same_codecs(zarrays: list[ArrayMetadata]) -> None: + if len({zarry.zarr_format for zarry in zarrays}) > 1: + raise ValueError("Cannot concatenate arrays with different zarr formats.") + + def to_codec(zarray: ArrayMetadata) -> Union[Codec | tuple[ZCodec, ...]]: + match zarray: + case ArrayV2Metadata(compressor=compressor, filters=filters): + return Codec(compressor=compressor, filters=filters) + case ArrayV3Metadata(codecs=codecs): + return codecs + case _: + raise ValueError("Unknown ArrayMetadata type") + + codecs = [to_codec(zarray) for zarray in zarrays] first_codec, *other_codecs = codecs for codec in other_codecs: if codec != first_codec: @@ -144,10 +160,7 @@ def concatenate( ) # chunk shape has not changed, there are just now more chunks along the concatenation axis - new_zarray = first_arr.zarray.replace( - shape=tuple(new_shape), - ) - + new_zarray = replace(first_arr.zarray, shape=tuple(new_shape)) return ManifestArray(chunkmanifest=concatenated_manifest, zarray=new_zarray) @@ -239,10 +252,10 @@ def stack( old_chunks = first_arr.chunks new_chunks = list(old_chunks) new_chunks.insert(axis, 1) - - new_zarray = first_arr.zarray.replace( - chunks=tuple(new_chunks), + new_zarray = replace( + first_arr.zarray, shape=tuple(new_shape), + chunk_grid=RegularChunkGrid(chunk_shape=tuple(new_chunks)), ) return ManifestArray(chunkmanifest=stacked_manifest, zarray=new_zarray) @@ -314,9 +327,10 @@ def broadcast_to(x: "ManifestArray", /, shape: tuple[int, ...]) -> "ManifestArra lengths=broadcasted_lengths, ) - new_zarray = x.zarray.replace( - chunks=new_chunk_shape, - shape=new_shape, + new_zarray = replace( + x.zarray, + shape=tuple(new_shape), + chunk_grid=RegularChunkGrid(chunk_shape=tuple(new_chunk_shape)), ) return ManifestArray(chunkmanifest=broadcasted_manifest, zarray=new_zarray) diff --git a/virtualizarr/tests/__init__.py b/virtualizarr/tests/__init__.py index 3856a6ba..f82d9315 100644 --- a/virtualizarr/tests/__init__.py +++ b/virtualizarr/tests/__init__.py @@ -4,10 +4,11 @@ import numpy as np import pytest from packaging.version import Version +from zarr.array import ArrayV2Metadata from virtualizarr.manifests import ChunkManifest, ManifestArray from virtualizarr.manifests.manifest import join -from virtualizarr.zarr import ZArray, ceildiv +from virtualizarr.zarr import ceildiv network = pytest.mark.network @@ -46,15 +47,14 @@ def create_manifestarray( The manifest is populated with a (somewhat) unique path, offset, and length for each key. """ - zarray = ZArray( + zarray = ArrayV2Metadata( chunks=chunks, - compressor="zlib", + compressor={"id": "zlib"}, dtype=np.dtype("float32"), fill_value=0.0, # TODO change this to NaN? filters=None, order="C", shape=shape, - zarr_format=2, ) chunk_grid_shape = tuple( diff --git a/virtualizarr/tests/test_zarr.py b/virtualizarr/tests/test_zarr.py index 80d04b9c..9d863b15 100644 --- a/virtualizarr/tests/test_zarr.py +++ b/virtualizarr/tests/test_zarr.py @@ -13,12 +13,13 @@ def test_zarr_v3_roundtrip(tmpdir): ), zarray=dict( shape=(2, 3), - dtype=np.dtype(" str: return f"Codec(compressor={self.compressor}, filters={self.filters})" -class ZArray(BaseModel): - """Just the .zarray information""" - - # TODO will this work for V3? +def to_kerchunk_json(array: ArrayMetadata) -> str: + return ujson.dumps(array.to_dict()) - model_config = ConfigDict( - arbitrary_types_allowed=True, # only here so pydantic doesn't complain about the numpy dtype field - ) - chunks: tuple[int, ...] - compressor: str | None = None - dtype: np.dtype - fill_value: float | int | None = np.nan # float or int? - filters: list[dict] | None = None - order: Literal["C", "F"] - shape: tuple[int, ...] - zarr_format: Literal[2, 3] = 2 - - @field_validator("dtype") - @classmethod - def validate_dtype(cls, dtype) -> np.dtype: - # Your custom validation logic here - # Convert numpy.dtype to a format suitable for Pydantic - return np.dtype(dtype) - - def __post_init__(self) -> None: - if len(self.shape) != len(self.chunks): - raise ValueError( - "Dimension mismatch between array shape and chunk shape. " - f"Array shape {self.shape} has ndim={self.shape} but chunk shape {self.chunks} has ndim={len(self.chunks)}" - ) +def from_kerchunk_refs(decoded_arr_refs_zarray: dict[str, Any]) -> ArrayMetadata: + # coerce type of fill_value as kerchunk can be inconsistent with this + fill_value = decoded_arr_refs_zarray["fill_value"] + if fill_value is None or fill_value == "NaN" or fill_value == "nan": + fill_value = np.nan - @property - def codec(self) -> Codec: - """For comparison against other arrays.""" - return Codec(compressor=self.compressor, filters=self.filters) + compressor = decoded_arr_refs_zarray["compressor"] + # deal with an inconsistency in kerchunk's tiff_to_zarr function + # TODO should this be moved to the point where we actually call tiff_to_zarr? Or ideally made consistent upstream. + if compressor is not None and "id" in compressor: + compressor = compressor["id"] + # return ArrayV2Metadata.from_dict(decoded_arr_refs_zarray) - def __repr__(self) -> str: - return f"ZArray(shape={self.shape}, chunks={self.chunks}, dtype={self.dtype}, compressor={self.compressor}, filters={self.filters}, fill_value={self.fill_value})" - - @classmethod - def from_kerchunk_refs(cls, decoded_arr_refs_zarray) -> "ZArray": - # coerce type of fill_value as kerchunk can be inconsistent with this - fill_value = decoded_arr_refs_zarray["fill_value"] - if fill_value is None or fill_value == "NaN" or fill_value == "nan": - fill_value = np.nan - - compressor = decoded_arr_refs_zarray["compressor"] - # deal with an inconsistency in kerchunk's tiff_to_zarr function - # TODO should this be moved to the point where we actually call tiff_to_zarr? Or ideally made consistent upstream. - if compressor is not None and "id" in compressor: - compressor = compressor["id"] - - return ZArray( - chunks=tuple(decoded_arr_refs_zarray["chunks"]), - compressor=compressor, + if int(decoded_arr_refs_zarray["zarr_format"]) == 2: + return ArrayV2Metadata( + shape=tuple(decoded_arr_refs_zarray["shape"]), dtype=np.dtype(decoded_arr_refs_zarray["dtype"]), + chunks=tuple(decoded_arr_refs_zarray["chunks"]), + order=decoded_arr_refs_zarray["order"], fill_value=fill_value, + compressor=compressor, filters=decoded_arr_refs_zarray["filters"], - order=decoded_arr_refs_zarray["order"], - shape=tuple(decoded_arr_refs_zarray["shape"]), - zarr_format=int(decoded_arr_refs_zarray["zarr_format"]), - ) - - def dict(self) -> dict[str, Any]: - zarray_dict = dict(self) - - zarray_dict["dtype"] = encode_dtype(zarray_dict["dtype"]) - - if zarray_dict["fill_value"] is np.nan: - zarray_dict["fill_value"] = None - - return zarray_dict - - def to_kerchunk_json(self) -> str: - return ujson.dumps(self.dict()) - - def replace( - self, - chunks: Optional[tuple[int, ...]] = None, - compressor: Optional[str] = None, - dtype: Optional[np.dtype] = None, - fill_value: Optional[float] = None, # float or int? - filters: Optional[list[dict]] = None, # type: ignore[valid-type] - order: Optional[Literal["C"] | Literal["F"]] = None, - shape: Optional[tuple[int, ...]] = None, - zarr_format: Optional[Literal[2] | Literal[3]] = None, - ) -> "ZArray": - """ - Convenience method to create a new ZArray from an existing one by altering only certain attributes. - """ - return ZArray( - chunks=chunks if chunks is not None else self.chunks, - compressor=compressor if compressor is not None else self.compressor, - dtype=dtype if dtype is not None else self.dtype, - fill_value=fill_value if fill_value is not None else self.fill_value, - filters=filters if filters is not None else self.filters, - shape=shape if shape is not None else self.shape, - order=order if order is not None else self.order, - zarr_format=zarr_format if zarr_format is not None else self.zarr_format, ) + else: + raise ValueError("Only Zarr format 2 is supported.") def encode_dtype(dtype: np.dtype) -> str: @@ -216,41 +152,63 @@ def to_zarr_json(var: xr.Variable, array_dir: Path) -> None: metadata_file.write(json_dumps(metadata)) -def zarr_v3_array_metadata(zarray: ZArray, dim_names: list[str], attrs: dict) -> dict: +def zarr_v3_array_metadata( + zarray: ArrayMetadata, dim_names: list[str], attrs: dict +) -> dict: """Construct a v3-compliant metadata dict from v2 zarray + information stored on the xarray variable.""" # TODO it would be nice if we could use the zarr-python metadata.ArrayMetadata classes to do this conversion for us - metadata = zarray.dict() - - # adjust to match v3 spec - metadata["zarr_format"] = 3 - metadata["node_type"] = "array" - metadata["data_type"] = str(np.dtype(metadata.pop("dtype"))) - metadata["chunk_grid"] = { - "name": "regular", - "configuration": {"chunk_shape": metadata.pop("chunks")}, - } - metadata["chunk_key_encoding"] = { - "name": "default", - "configuration": {"separator": "/"}, - } - metadata["codecs"] = metadata.pop("filters") - metadata.pop("compressor") # TODO this should be entered in codecs somehow - metadata.pop("order") # TODO this should be replaced by a transpose codec - - # indicate that we're using the manifest storage transformer ZEP - metadata["storage_transformers"] = [ - { - "name": "chunk-manifest-json", - "configuration": {"manifest": "./manifest.json"}, - } - ] - - # add information from xarray object - metadata["dimension_names"] = dim_names - metadata["attributes"] = attrs - - return metadata + match zarray: + case ArrayV3Metadata() as v3_metadata: + metadata = replace( + v3_metadata, dimension_names=tuple(dim_names), attributes=attrs + ).to_dict() + metadata["data_type"] = str(metadata["data_type"]) + + # Can remove when ZarrV3 metadata includes storage transformers + # https://github.com/zarr-developers/zarr-python/issues/2009 + if "storage_transformers" not in metadata: + metadata["storage_transformers"] = [ + { + "name": "chunk-manifest-json", + "configuration": {"manifest": "./manifest.json"}, + } + ] + return metadata + + case ArrayV2Metadata(): + metadata = zarray.to_dict() + # adjust to match v3 spec + metadata["zarr_format"] = 3 + metadata["node_type"] = "array" + metadata["data_type"] = str(np.dtype(metadata.pop("dtype"))) + metadata["chunk_grid"] = { + "name": "regular", + "configuration": {"chunk_shape": metadata.pop("chunks")}, + } + metadata["chunk_key_encoding"] = { + "name": "default", + "configuration": {"separator": "/"}, + } + metadata["codecs"] = metadata.pop("filters") + metadata.pop("compressor") # TODO this should be entered in codecs somehow + metadata.pop("order") # TODO this should be replaced by a transpose codec + + # indicate that we're using the manifest storage transformer ZEP + metadata["storage_transformers"] = [ + { + "name": "chunk-manifest-json", + "configuration": {"manifest": "./manifest.json"}, + } + ] + + # add information from xarray object + metadata["dimension_names"] = dim_names + metadata["attributes"] = attrs + + return metadata + case _: + raise ValueError("Unknown array metadata type") def attrs_from_zarr_group_json(filepath: Path) -> dict: @@ -259,7 +217,7 @@ def attrs_from_zarr_group_json(filepath: Path) -> dict: return attrs["attributes"] -def metadata_from_zarr_json(filepath: Path) -> tuple[ZArray, list[str], dict]: +def metadata_from_zarr_json(filepath: Path) -> tuple[ArrayMetadata, list[str], dict]: with open(filepath) as metadata_file: metadata = json.load(metadata_file) @@ -272,26 +230,8 @@ def metadata_from_zarr_json(filepath: Path) -> tuple[ZArray, list[str], dict]: raise ValueError( "Can only read byte ranges from Zarr v3 stores which implement the manifest storage transformer ZEP." ) - - attrs = metadata.pop("attributes") - dim_names = metadata.pop("dimension_names") - - chunk_shape = metadata["chunk_grid"]["configuration"]["chunk_shape"] - - if metadata["fill_value"] is None: - fill_value = np.nan - else: - fill_value = metadata["fill_value"] - - zarray = ZArray( - chunks=metadata["chunk_grid"]["configuration"]["chunk_shape"], - compressor=metadata["codecs"], - dtype=np.dtype(metadata["data_type"]), - fill_value=fill_value, - filters=metadata.get("filters", None), - order="C", - shape=chunk_shape, - zarr_format=3, - ) - - return zarray, dim_names, attrs + # Can remove when ZarrV3 metadata includes storage transformers + # https://github.com/zarr-developers/zarr-python/issues/2009 + metadata.pop("storage_transformers") + metadata = ArrayV3Metadata.from_dict(metadata) + return metadata, metadata.dimension_names, metadata.attributes