Implement pydantic models as dataclasses #210

Merged
1 change: 0 additions & 1 deletion ci/environment.yml
@@ -9,7 +9,6 @@ dependencies:
- netcdf4
- xarray>=2024.6.0
- kerchunk>=0.2.5
- pydantic
- numpy>=2.0.0
- ujson
- packaging
3 changes: 3 additions & 0 deletions docs/releases.rst
@@ -17,6 +17,9 @@ Breaking changes

- Serialize valid ZarrV3 metadata and require full compressor numcodec config (for :pull:`193`)
By `Gustavo Hidalgo <https://github.com/ghidalgo3>`_.
- VirtualiZarr's `ZArray`, `ChunkEntry`, and `Codec` no longer inherit from `pydantic.BaseModel` (:pull:`xxx`)
- `ZArray`'s `__init__` signature has changed to match `zarr.Array`'s (:pull:`xxx`)


Deprecations
~~~~~~~~~~~~
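To illustrate the breaking changes above, here is a sketch (assuming the package is installed; the field names are taken from the `ZArray` dataclass in the virtualizarr/zarr.py diff further down this page, not from separate documentation): keyword construction of `ZArray` continues to work, while positional construction now follows the dataclass field order.

```python
import numpy as np

from virtualizarr.zarr import ZArray

# Keyword arguments keep working regardless of the reordering; only positional
# construction depends on the new dataclass field order.
zarray = ZArray(
    shape=(2, 3),
    chunks=(2, 3),
    dtype=np.dtype("<i8"),
    fill_value=0.0,
    order="C",
    compressor=None,
    filters=None,
    zarr_format=2,
)
```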
1 change: 0 additions & 1 deletion pyproject.toml
@@ -24,7 +24,6 @@ dependencies = [
"xarray>=2024.06.0",
"kerchunk>=0.2.5",
"h5netcdf",
"pydantic",
"numpy>=2.0.0",
"ujson",
"packaging",
12 changes: 4 additions & 8 deletions virtualizarr/manifests/manifest.py
@@ -1,10 +1,10 @@
import dataclasses
import json
import re
from collections.abc import Iterable, Iterator
from typing import Any, Callable, Dict, NewType, Tuple, TypedDict, cast

import numpy as np
from pydantic import BaseModel, ConfigDict
from upath import UPath

from virtualizarr.types import ChunkKey
@@ -25,22 +25,18 @@ class ChunkDictEntry(TypedDict):
ChunkDict = NewType("ChunkDict", dict[ChunkKey, ChunkDictEntry])


class ChunkEntry(BaseModel):
@dataclasses.dataclass(frozen=True)
class ChunkEntry:
"""
Information for a single chunk in the manifest.

Stored in the form `{"path": "s3://bucket/foo.nc", "offset": 100, "length": 100}`.
"""

model_config = ConfigDict(frozen=True)

path: str # TODO stricter typing/validation of possible local / remote paths?
offset: int
length: int

def __repr__(self) -> str:
return f"ChunkEntry(path='{self.path}', offset={self.offset}, length={self.length})"

@classmethod
def from_kerchunk(
cls, path_and_byte_range_info: tuple[str] | tuple[str, int, int]
@@ -58,7 +54,7 @@ def to_kerchunk(self) -> tuple[str, int, int]:
return (self.path, self.offset, self.length)

def dict(self) -> ChunkDictEntry:
return ChunkDictEntry(path=self.path, offset=self.offset, length=self.length)
return dataclasses.asdict(self)


class ChunkManifest:
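As a self-contained illustration of the conversion above (standard library only; the class name is made up for the example), a frozen dataclass supplies immutability and an auto-generated repr, and `dataclasses.asdict` takes over from pydantic's `.dict()`:

```python
import dataclasses


@dataclasses.dataclass(frozen=True)
class ChunkEntrySketch:
    """Hypothetical stand-in mirroring the ChunkEntry fields above."""

    path: str
    offset: int
    length: int


entry = ChunkEntrySketch(path="s3://bucket/foo.nc", offset=100, length=100)

# The auto-generated repr makes the handwritten __repr__ from the pydantic version unnecessary.
print(entry)  # ChunkEntrySketch(path='s3://bucket/foo.nc', offset=100, length=100)

# dataclasses.asdict plays the role of pydantic's BaseModel.dict().
print(dataclasses.asdict(entry))  # {'path': 's3://bucket/foo.nc', 'offset': 100, 'length': 100}

# frozen=True makes instances immutable, much like ConfigDict(frozen=True) did.
try:
    entry.offset = 0  # type: ignore[misc]
except dataclasses.FrozenInstanceError:
    print("ChunkEntrySketch instances are immutable")
```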
4 changes: 2 additions & 2 deletions virtualizarr/tests/test_kerchunk.py
@@ -94,7 +94,7 @@ def test_accessor_to_kerchunk_dict(self):
"refs": {
".zgroup": '{"zarr_format":2}',
".zattrs": "{}",
"a/.zarray": '{"chunks":[2,3],"compressor":null,"dtype":"<i8","fill_value":null,"filters":null,"order":"C","shape":[2,3],"zarr_format":2}',
"a/.zarray": '{"shape":[2,3],"chunks":[2,3],"dtype":"<i8","fill_value":null,"order":"C","compressor":null,"filters":null,"zarr_format":2}',
Collaborator:
So order of fields in the kerchunk references matters now? Did it matter before? This should be fine but it's also the sort of thing that kerchunk might not be consistent about...

Contributor Author:
No, it didn't matter before and it still doesn't, as far as using Kerchunk goes :) The reason it matters for the test is that we're doing a string comparison on the "a/.zarray" field. We could also parse that JSON into a dict before doing the comparison, and then I wouldn't have needed to update the expected value.
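For illustration only (the helper name is hypothetical, not part of this PR): parsing both reference strings into dicts before asserting makes the comparison independent of the serializer's field order.

```python
import json


def assert_zarray_refs_equal(actual: str, expected: str) -> None:
    """Compare two .zarray reference strings by parsed content, not by text."""
    assert json.loads(actual) == json.loads(expected)


# The two orderings from this diff compare equal once parsed:
assert_zarray_refs_equal(
    '{"shape":[2,3],"chunks":[2,3],"dtype":"<i8","fill_value":null,'
    '"order":"C","compressor":null,"filters":null,"zarr_format":2}',
    '{"chunks":[2,3],"compressor":null,"dtype":"<i8","fill_value":null,'
    '"filters":null,"order":"C","shape":[2,3],"zarr_format":2}',
)
```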

"a/.zattrs": '{"_ARRAY_DIMENSIONS":["x","y"]}',
"a/0.0": ["test.nc", 6144, 48],
},
@@ -133,7 +133,7 @@ def test_accessor_to_kerchunk_json(self, tmp_path):
"refs": {
".zgroup": '{"zarr_format":2}',
".zattrs": "{}",
"a/.zarray": '{"chunks":[2,3],"compressor":null,"dtype":"<i8","fill_value":null,"filters":null,"order":"C","shape":[2,3],"zarr_format":2}',
"a/.zarray": '{"shape":[2,3],"chunks":[2,3],"dtype":"<i8","fill_value":null,"order":"C","compressor":null,"filters":null,"zarr_format":2}',
"a/.zattrs": '{"_ARRAY_DIMENSIONS":["x","y"]}',
"a/0.0": ["test.nc", 6144, 48],
},
10 changes: 0 additions & 10 deletions virtualizarr/tests/test_manifests/test_manifest.py
@@ -27,16 +27,6 @@ def test_invalid_chunk_entries(self):
with pytest.raises(ValueError, match="must be of the form"):
ChunkManifest(entries=chunks)

chunks = {
"0.0.0": {
"path": "s3://bucket/foo.nc",
"offset": "some nonsense",
"length": 100,
},
}
with pytest.raises(ValueError, match="must be of the form"):
ChunkManifest(entries=chunks)

def test_invalid_chunk_keys(self):
chunks = {
"0.0.": {"path": "s3://bucket/foo.nc", "offset": 100, "length": 100},
73 changes: 18 additions & 55 deletions virtualizarr/zarr.py
@@ -1,3 +1,4 @@
import dataclasses
import json
from pathlib import Path
from typing import (
@@ -12,14 +13,6 @@
import numpy as np
import ujson # type: ignore
import xarray as xr
from pydantic import (
BaseModel,
ConfigDict,
Field,
field_validator,
model_validator,
)
from typing_extensions import Self

from virtualizarr.vendor.zarr.utils import json_dumps

@@ -47,60 +40,46 @@
"""


class Codec(BaseModel):
@dataclasses.dataclass
class Codec:
compressor: dict | None = None
filters: list[dict] | None = None

def __repr__(self) -> str:
return f"Codec(compressor={self.compressor}, filters={self.filters})"


class ZArray(BaseModel):
@dataclasses.dataclass
class ZArray:
"""Just the .zarray information"""

# TODO will this work for V3?

model_config = ConfigDict(
arbitrary_types_allowed=True, # only here so pydantic doesn't complain about the numpy dtype field
)

shape: tuple[int, ...]
chunks: tuple[int, ...]
compressor: dict | None = None
dtype: np.dtype
fill_value: FillValueT = Field(default=0.0, validate_default=True)
fill_value: FillValueT = dataclasses.field(default=0.0)
order: Literal["C", "F"] = "C"
compressor: dict | None = None
filters: list[dict] | None = None
order: Literal["C", "F"]
shape: tuple[int, ...]
zarr_format: Literal[2, 3] = 2

@field_validator("dtype")
@classmethod
def validate_dtype(cls, dtype) -> np.dtype:
# Your custom validation logic here
# Convert numpy.dtype to a format suitable for Pydantic
return np.dtype(dtype)

def __post_init__(self) -> None:
if len(self.shape) != len(self.chunks):
raise ValueError(
"Dimension mismatch between array shape and chunk shape. "
f"Array shape {self.shape} has ndim={len(self.shape)} but chunk shape {self.chunks} has ndim={len(self.chunks)}"
)

@model_validator(mode="after")
def _check_fill_value(self) -> Self:
if isinstance(self.dtype, str):
# Convert dtype string to numpy.dtype
self.dtype = np.dtype(self.dtype)

if self.fill_value is None:
self.fill_value = ZARR_DEFAULT_FILL_VALUE.get(self.dtype, 0.0)
return self

@property
def codec(self) -> Codec:
"""For comparison against other arrays."""
return Codec(compressor=self.compressor, filters=self.filters)

def __repr__(self) -> str:
return f"ZArray(shape={self.shape}, chunks={self.chunks}, dtype={self.dtype}, compressor={self.compressor}, filters={self.filters}, fill_value={self.fill_value})"

@classmethod
def from_kerchunk_refs(cls, decoded_arr_refs_zarray) -> "ZArray":
# coerce type of fill_value as kerchunk can be inconsistent with this
@@ -121,7 +100,7 @@ def from_kerchunk_refs(cls, decoded_arr_refs_zarray) -> "ZArray":
)

def dict(self) -> dict[str, Any]:
zarray_dict = dict(self)
zarray_dict = dataclasses.asdict(self)
zarray_dict["dtype"] = encode_dtype(zarray_dict["dtype"])
return zarray_dict

@@ -133,28 +112,12 @@ def to_kerchunk_json(self) -> str:

def replace(
self,
chunks: Optional[tuple[int, ...]] = None,
compressor: Optional[dict] = None,
dtype: Optional[np.dtype] = None,
fill_value: Optional[float] = None, # float or int?
filters: Optional[list[dict]] = None, # type: ignore[valid-type]
order: Optional[Literal["C"] | Literal["F"]] = None,
shape: Optional[tuple[int, ...]] = None,
zarr_format: Optional[Literal[2] | Literal[3]] = None,
**kwargs: Any,
Contributor Author:
Do we want the list of fields that can be replaced here in the signature?

Contributor:
As opposed to letting users call dataclasses.replace? I think ZArray shouldn't have fields that aren't replaceable; are there any fields like that?

Collaborator (@TomNicholas, Aug 5, 2024):

I think if dataclasses.replace exists already then explicitly writing the list of fields isn't a good enough reason to override the default method.

Collaborator:
Oh no, wait: dataclasses.replace is a module-level function, not a method on the dataclass type. In that case I change my mind: we should list out the arguments and call dataclasses.replace from within the ZArray.replace method.

Contributor Author:

> I think ZArray shouldn't have fields that aren't replaceable, are there any fields like that?

Just to be clear, the behavior here (and on main) is to return a new instance of ZArray with the same values as the old instance, but with new values for any fields specified in the keyword arguments.

> we should list out the arguments

Sounds good. I think we'll be able to do that (we'll need to remember to keep the two signatures in step though).

) -> "ZArray":
"""
Convenience method to create a new ZArray from an existing one by altering only certain attributes.
"""
return ZArray(
chunks=chunks if chunks is not None else self.chunks,
compressor=compressor if compressor is not None else self.compressor,
dtype=dtype if dtype is not None else self.dtype,
fill_value=fill_value if fill_value is not None else self.fill_value,
filters=filters if filters is not None else self.filters,
shape=shape if shape is not None else self.shape,
order=order if order is not None else self.order,
zarr_format=zarr_format if zarr_format is not None else self.zarr_format,
)
return dataclasses.replace(self, **kwargs)
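A minimal sketch (not part of this diff) of the approach the thread above converges on: keep an explicit keyword list in `replace` for discoverability, but delegate the copying to the module-level `dataclasses.replace`. `ZArraySketch` is a hypothetical stand-in using the field names visible in this file; the real `ZArray` adds `__post_init__` validation and dtype handling.

```python
import dataclasses
from typing import Literal, Optional

import numpy as np


@dataclasses.dataclass
class ZArraySketch:
    """Hypothetical stand-in with the same fields as the ZArray dataclass above."""

    shape: tuple[int, ...]
    chunks: tuple[int, ...]
    dtype: np.dtype
    fill_value: Optional[float] = 0.0
    order: Literal["C", "F"] = "C"
    compressor: Optional[dict] = None
    filters: Optional[list[dict]] = None
    zarr_format: Literal[2, 3] = 2

    def replace(
        self,
        shape: Optional[tuple[int, ...]] = None,
        chunks: Optional[tuple[int, ...]] = None,
        dtype: Optional[np.dtype] = None,
        fill_value: Optional[float] = None,
        order: Optional[Literal["C", "F"]] = None,
        compressor: Optional[dict] = None,
        filters: Optional[list[dict]] = None,
        zarr_format: Optional[Literal[2, 3]] = None,
    ) -> "ZArraySketch":
        """Return a copy of self, changing only the fields that were passed."""
        # Keep the explicit keyword list for discoverability, then delegate to
        # dataclasses.replace (a module-level function, not a dataclass method).
        changes = {
            key: value
            for key, value in {
                "shape": shape,
                "chunks": chunks,
                "dtype": dtype,
                "fill_value": fill_value,
                "order": order,
                "compressor": compressor,
                "filters": filters,
                "zarr_format": zarr_format,
            }.items()
            if value is not None
        }
        return dataclasses.replace(self, **changes)


original = ZArraySketch(shape=(4, 6), chunks=(2, 3), dtype=np.dtype("<i8"))
rechunked = original.replace(chunks=(4, 6))
assert rechunked.chunks == (4, 6) and rechunked.shape == original.shape
```

Like the version on main, this uses None as the "not supplied" sentinel, so a field cannot be reset back to None through replace.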

def _v3_codec_pipeline(self) -> list:
"""
@@ -352,8 +315,8 @@ def metadata_from_zarr_json(filepath: Path) -> tuple[ZArray, list[str], dict]:
attrs = metadata.pop("attributes")
dim_names = metadata.pop("dimension_names")

chunk_shape = metadata["chunk_grid"]["configuration"]["chunk_shape"]
shape = metadata["shape"]
chunk_shape = tuple(metadata["chunk_grid"]["configuration"]["chunk_shape"])
shape = tuple(metadata["shape"])
zarr_format = metadata["zarr_format"]

if metadata["fill_value"] is None: