zarr-developers · TomAugspurger · Aug 8, 2024 · Aug 4, 2024 · Aug 5, 2024 · Aug 5, 2024
diff --git a/ci/environment.yml b/ci/environment.yml
@@ -9,7 +9,6 @@ dependencies:
   - netcdf4
   - xarray>=2024.6.0
   - kerchunk>=0.2.5
-  - pydantic
   - numpy>=2.0.0
   - ujson
   - packaging

diff --git a/docs/releases.rst b/docs/releases.rst
@@ -17,6 +17,10 @@ Breaking changes
 
 - Serialize valid ZarrV3 metadata and require full compressor numcodec config (for :pull:`193`)
   By `Gustavo Hidalgo <https://github.com/ghidalgo3>`_.
+- VirtualiZarr's `ZArray`, `ChunkEntry`, and `Codec` no longer subclass
+  `pydantic.BaseModel` (:pull:`210`)
+- `ZArray`'s `__init__` signature has changed to match `zarr.Array`'s (:pull:`210`)
+
 
 Deprecations
 ~~~~~~~~~~~~

diff --git a/pyproject.toml b/pyproject.toml
@@ -24,7 +24,6 @@ dependencies = [
     "xarray>=2024.06.0",
     "kerchunk>=0.2.5",
     "h5netcdf",
-    "pydantic",
     "numpy>=2.0.0",
     "ujson",
     "packaging",

diff --git a/virtualizarr/manifests/manifest.py b/virtualizarr/manifests/manifest.py
@@ -1,10 +1,10 @@
+import dataclasses
 import json
 import re
 from collections.abc import Iterable, Iterator
 from typing import Any, Callable, Dict, NewType, Tuple, TypedDict, cast
 
 import numpy as np
-from pydantic import BaseModel, ConfigDict
 from upath import UPath
 
 from virtualizarr.types import ChunkKey
@@ -25,22 +25,18 @@ class ChunkDictEntry(TypedDict):
 ChunkDict = NewType("ChunkDict", dict[ChunkKey, ChunkDictEntry])
 
 
-class ChunkEntry(BaseModel):
+@dataclasses.dataclass(frozen=True)
+class ChunkEntry:
     """
     Information for a single chunk in the manifest.
 
     Stored in the form `{"path": "s3://bucket/foo.nc", "offset": 100, "length": 100}`.
     """
 
-    model_config = ConfigDict(frozen=True)
-
     path: str  # TODO stricter typing/validation of possible local / remote paths?
     offset: int
     length: int
 
-    def __repr__(self) -> str:
-        return f"ChunkEntry(path='{self.path}', offset={self.offset}, length={self.length})"
-
     @classmethod
     def from_kerchunk(
         cls, path_and_byte_range_info: tuple[str] | tuple[str, int, int]
@@ -57,8 +53,12 @@ def to_kerchunk(self) -> tuple[str, int, int]:
         """Write out in the format that kerchunk uses for chunk entries."""
         return (self.path, self.offset, self.length)
 
-    def dict(self) -> ChunkDictEntry:  # type: ignore[override]
-        return ChunkDictEntry(path=self.path, offset=self.offset, length=self.length)
+    def dict(self) -> ChunkDictEntry:
+        return ChunkDictEntry(
+            path=self.path,
+            offset=self.offset,
+            length=self.length,
+        )
 
 
 class ChunkManifest:

diff --git a/virtualizarr/tests/test_kerchunk.py b/virtualizarr/tests/test_kerchunk.py
@@ -94,7 +94,7 @@ def test_accessor_to_kerchunk_dict(self):
             "refs": {
                 ".zgroup": '{"zarr_format":2}',
                 ".zattrs": "{}",
-                "a/.zarray": '{"chunks":[2,3],"compressor":null,"dtype":"<i8","fill_value":null,"filters":null,"order":"C","shape":[2,3],"zarr_format":2}',
+                "a/.zarray": '{"shape":[2,3],"chunks":[2,3],"dtype":"<i8","fill_value":null,"order":"C","compressor":null,"filters":null,"zarr_format":2}',
                 "a/.zattrs": '{"_ARRAY_DIMENSIONS":["x","y"]}',
                 "a/0.0": ["test.nc", 6144, 48],
             },
@@ -133,7 +133,7 @@ def test_accessor_to_kerchunk_json(self, tmp_path):
             "refs": {
                 ".zgroup": '{"zarr_format":2}',
                 ".zattrs": "{}",
-                "a/.zarray": '{"chunks":[2,3],"compressor":null,"dtype":"<i8","fill_value":null,"filters":null,"order":"C","shape":[2,3],"zarr_format":2}',
+                "a/.zarray": '{"shape":[2,3],"chunks":[2,3],"dtype":"<i8","fill_value":null,"order":"C","compressor":null,"filters":null,"zarr_format":2}',
                 "a/.zattrs": '{"_ARRAY_DIMENSIONS":["x","y"]}',
                 "a/0.0": ["test.nc", 6144, 48],
             },

diff --git a/virtualizarr/tests/test_manifests/test_manifest.py b/virtualizarr/tests/test_manifests/test_manifest.py
@@ -27,16 +27,6 @@ def test_invalid_chunk_entries(self):
         with pytest.raises(ValueError, match="must be of the form"):
             ChunkManifest(entries=chunks)
 
-        chunks = {
-            "0.0.0": {
-                "path": "s3://bucket/foo.nc",
-                "offset": "some nonsense",
-                "length": 100,
-            },
-        }
-        with pytest.raises(ValueError, match="must be of the form"):
-            ChunkManifest(entries=chunks)
-
     def test_invalid_chunk_keys(self):
         chunks = {
             "0.0.": {"path": "s3://bucket/foo.nc", "offset": 100, "length": 100},

diff --git a/virtualizarr/tests/test_zarr.py b/virtualizarr/tests/test_zarr.py
@@ -8,7 +8,7 @@
 from virtualizarr import ManifestArray, open_virtual_dataset
 from virtualizarr.kerchunk import FileType
 from virtualizarr.manifests.manifest import ChunkManifest
-from virtualizarr.zarr import dataset_to_zarr, metadata_from_zarr_json
+from virtualizarr.zarr import ZArray, dataset_to_zarr, metadata_from_zarr_json
 
 
 @pytest.fixture
@@ -79,3 +79,29 @@ def test_zarr_v3_metadata_conformance(tmpdir, vds_with_manifest_arrays: xr.Datas
         and len(metadata["codecs"]) > 1
         and all(isconfigurable(codec) for codec in metadata["codecs"])
     )
+
+
+def test_replace_partial():
+    arr = ZArray(shape=(2, 3), chunks=(1, 1), dtype=np.dtype("<i8"))
+    result = arr.replace(chunks=(2, 3))
+    expected = ZArray(shape=(2, 3), chunks=(2, 3), dtype=np.dtype("<i8"))
+    assert result == expected
+    assert result.shape == (2, 3)
+    assert result.chunks == (2, 3)
+
+
+def test_replace_total():
+    arr = ZArray(shape=(2, 3), chunks=(1, 1), dtype=np.dtype("<i8"))
+    kwargs = dict(
+        shape=(4, 4),
+        chunks=(2, 2),
+        dtype=np.dtype("<f8"),
+        fill_value=-1.0,
+        order="F",
+        compressor={"id": "zlib", "level": 1},
+        filters=[{"id": "blosc", "clevel": 5}],
+        zarr_format=3,
+    )
+    result = arr.replace(**kwargs)
+    expected = ZArray(**kwargs)
+    assert result == expected
diff --git a/virtualizarr/zarr.py b/virtualizarr/zarr.py
@@ -1,26 +1,12 @@
+import dataclasses
 import json
 from pathlib import Path
-from typing import (
-    TYPE_CHECKING,
-    Any,
-    Literal,
-    NewType,
-    Optional,
-    cast,
-)
+from typing import TYPE_CHECKING, Any, Literal, NewType, cast
 
 import numcodecs
 import numpy as np
 import ujson  # type: ignore
 import xarray as xr
-from pydantic import (
-    BaseModel,
-    ConfigDict,
-    Field,
-    field_validator,
-    model_validator,
-)
-from typing_extensions import Self
 
 from virtualizarr.vendor.zarr.utils import json_dumps
 
@@ -50,38 +36,26 @@
 """
 
 
-class Codec(BaseModel):
+@dataclasses.dataclass
+class Codec:
     compressor: dict | None = None
     filters: list[dict] | None = None
 
-    def __repr__(self) -> str:
-        return f"Codec(compressor={self.compressor}, filters={self.filters})"
 
-
-class ZArray(BaseModel):
+@dataclasses.dataclass
+class ZArray:
     """Just the .zarray information"""
 
     # TODO will this work for V3?
 
-    model_config = ConfigDict(
-        arbitrary_types_allowed=True,  # only here so pydantic doesn't complain about the numpy dtype field
-    )
-
+    shape: tuple[int, ...]
     chunks: tuple[int, ...]
-    compressor: dict | None = None
     dtype: np.dtype
-    fill_value: FillValueT = Field(None, validate_default=True)
+    fill_value: FillValueT = dataclasses.field(default=None)
+    order: Literal["C", "F"] = "C"
+    compressor: dict | None = None
     filters: list[dict] | None = None
-    order: Literal["C", "F"]
-    shape: tuple[int, ...]
-    zarr_format: ZARR_FORMAT = 2
-
-    @field_validator("dtype")
-    @classmethod
-    def validate_dtype(cls, dtype) -> np.dtype:
-        # Your custom validation logic here
-        # Convert numpy.dtype to a format suitable for Pydantic
-        return np.dtype(dtype)
+    zarr_format: Literal[2, 3] = 2
 
     def __post_init__(self) -> None:
         if len(self.shape) != len(self.chunks):
@@ -90,20 +64,18 @@ def __post_init__(self) -> None:
                 f"Array shape {self.shape} has ndim={self.shape} but chunk shape {self.chunks} has ndim={len(self.chunks)}"
             )
 
-    @model_validator(mode="after")
-    def _check_fill_value(self) -> Self:
+        if isinstance(self.dtype, str):
+            # Convert dtype string to numpy.dtype
+            self.dtype = np.dtype(self.dtype)
+
         if self.fill_value is None:
             self.fill_value = ZARR_DEFAULT_FILL_VALUE.get(self.dtype.kind, 0.0)
-        return self
 
     @property
     def codec(self) -> Codec:
         """For comparison against other arrays."""
         return Codec(compressor=self.compressor, filters=self.filters)
 
-    def __repr__(self) -> str:
-        return f"ZArray(shape={self.shape}, chunks={self.chunks}, dtype={self.dtype}, compressor={self.compressor}, filters={self.filters}, fill_value={self.fill_value})"
-
     @classmethod
     def from_kerchunk_refs(cls, decoded_arr_refs_zarray) -> "ZArray":
         # coerce type of fill_value as kerchunk can be inconsistent with this
@@ -127,8 +99,8 @@ def from_kerchunk_refs(cls, decoded_arr_refs_zarray) -> "ZArray":
             zarr_format=cast(ZARR_FORMAT, zarr_format),
         )
 
-    def dict(self) -> dict[str, Any]:  # type: ignore
-        zarray_dict = dict(self)
+    def dict(self) -> dict[str, Any]:
+        zarray_dict = dataclasses.asdict(self)
         zarray_dict["dtype"] = encode_dtype(zarray_dict["dtype"])
         return zarray_dict
 
@@ -138,30 +110,40 @@ def to_kerchunk_json(self) -> str:
             zarray_dict["fill_value"] = None
         return ujson.dumps(zarray_dict)
 
+    # The `dict` method above shadows the builtin `dict` in this class body,
+    # so the `dict` annotations in the signature below need a type ignore.
     def replace(
         self,
-        chunks: Optional[tuple[int, ...]] = None,
-        compressor: Optional[dict] = None,  # type: ignore[valid-type]
-        dtype: Optional[np.dtype] = None,
-        fill_value: Optional[float] = None,  # float or int?
-        filters: Optional[list[dict]] = None,  # type: ignore[valid-type]
-        order: Optional[Literal["C"] | Literal["F"]] = None,
-        shape: Optional[tuple[int, ...]] = None,
-        zarr_format: Optional[Literal[2] | Literal[3]] = None,
+        shape: tuple[int, ...] | None = None,
+        chunks: tuple[int, ...] | None = None,
+        dtype: np.dtype | str | None = None,
+        fill_value: FillValueT = None,
+        order: Literal["C", "F"] | None = None,
+        compressor: "dict | None" = None,  # type: ignore[valid-type]
+        filters: list[dict] | None = None,  # type: ignore[valid-type]
+        zarr_format: Literal[2, 3] | None = None,
     ) -> "ZArray":
         """
         Convenience method to create a new ZArray from an existing one by altering only certain attributes.
         """
-        return ZArray(
-            chunks=chunks if chunks is not None else self.chunks,
-            compressor=compressor if compressor is not None else self.compressor,
-            dtype=dtype if dtype is not None else self.dtype,
-            fill_value=fill_value if fill_value is not None else self.fill_value,
-            filters=filters if filters is not None else self.filters,
-            shape=shape if shape is not None else self.shape,
-            order=order if order is not None else self.order,
-            zarr_format=zarr_format if zarr_format is not None else self.zarr_format,
-        )
+        replacements: dict[str, Any] = {}
+        if shape is not None:
+            replacements["shape"] = shape
+        if chunks is not None:
+            replacements["chunks"] = chunks
+        if dtype is not None:
+            replacements["dtype"] = dtype
+        if fill_value is not None:
+            replacements["fill_value"] = fill_value
+        if order is not None:
+            replacements["order"] = order
+        if compressor is not None:
+            replacements["compressor"] = compressor
+        if filters is not None:
+            replacements["filters"] = filters
+        if zarr_format is not None:
+            replacements["zarr_format"] = zarr_format
+        return dataclasses.replace(self, **replacements)
 
     def _v3_codec_pipeline(self) -> list:
         """
@@ -361,8 +343,8 @@ def metadata_from_zarr_json(filepath: Path) -> tuple[ZArray, list[str], dict]:
     attrs = metadata.pop("attributes")
     dim_names = metadata.pop("dimension_names")
 
-    chunk_shape = metadata["chunk_grid"]["configuration"]["chunk_shape"]
-    shape = metadata["shape"]
+    chunk_shape = tuple(metadata["chunk_grid"]["configuration"]["chunk_shape"])
+    shape = tuple(metadata["shape"])
     zarr_format = metadata["zarr_format"]
 
     if metadata["fill_value"] is None: