From 0461d205816483fde1fa27e3449e4286efae656c Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sat, 3 Aug 2024 19:39:23 -0500 Subject: [PATCH 1/5] Implement pydantic models as dataclasses This removes our pydantic dependency by reimplementing them with dataclasses. There are a few breaking changes: 1. The classes won't automatically cast the argumetns to the declared type. IMO, that's the preferable behavior. Some backwards compatability shims have been added for np.dtype, but perhpas we want to remove that too. 2. The models won't have any of the methods they previously inherited from pydantic.BaseModel. This is probably good for user-facing objects, we now have full control over the public API. 3. We had to reorder some of the fields on ZArray, since dataclasses is stricter about positional arguments. I've aligned the order with `zarr.create`. The tests did need to be updated in a few places (we compare some strings, and the order changed, and we don't automatically cast lists to tuples in the init for shape and size). --- ci/environment.yml | 1 - docs/releases.rst | 3 + pyproject.toml | 1 - virtualizarr/manifests/manifest.py | 12 +-- virtualizarr/tests/test_kerchunk.py | 4 +- .../tests/test_manifests/test_manifest.py | 10 --- virtualizarr/zarr.py | 73 +++++-------------- 7 files changed, 27 insertions(+), 77 deletions(-) diff --git a/ci/environment.yml b/ci/environment.yml index a41a99d..de0532f 100644 --- a/ci/environment.yml +++ b/ci/environment.yml @@ -9,7 +9,6 @@ dependencies: - netcdf4 - xarray>=2024.6.0 - kerchunk>=0.2.5 - - pydantic - numpy>=2.0.0 - ujson - packaging diff --git a/docs/releases.rst b/docs/releases.rst index 81a0aea..c79ed7c 100644 --- a/docs/releases.rst +++ b/docs/releases.rst @@ -17,6 +17,9 @@ Breaking changes - Serialize valid ZarrV3 metadata and require full compressor numcodec config (for :pull:`193`) By `Gustavo Hidalgo `_. +- VirtualiZarr's `ZArray`, `ChunkEntry`, and `Codec` no longer subclasses `zarr.Array`, no longer inherit from `pydantic.BaseModel` (:pull:`xxx`) +- `ZArray`'s `__init__` signature has changed to match `zarr.Array`'s (:pull:`xxx`) + Deprecations ~~~~~~~~~~~~ diff --git a/pyproject.toml b/pyproject.toml index 4496116..6ae86b7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,7 +24,6 @@ dependencies = [ "xarray>=2024.06.0", "kerchunk>=0.2.5", "h5netcdf", - "pydantic", "numpy>=2.0.0", "ujson", "packaging", diff --git a/virtualizarr/manifests/manifest.py b/virtualizarr/manifests/manifest.py index bf7c24f..534502a 100644 --- a/virtualizarr/manifests/manifest.py +++ b/virtualizarr/manifests/manifest.py @@ -1,10 +1,10 @@ +import dataclasses import json import re from collections.abc import Iterable, Iterator from typing import Any, Callable, Dict, NewType, Tuple, TypedDict, cast import numpy as np -from pydantic import BaseModel, ConfigDict from upath import UPath from virtualizarr.types import ChunkKey @@ -25,22 +25,18 @@ class ChunkDictEntry(TypedDict): ChunkDict = NewType("ChunkDict", dict[ChunkKey, ChunkDictEntry]) -class ChunkEntry(BaseModel): +@dataclasses.dataclass(frozen=True) +class ChunkEntry: """ Information for a single chunk in the manifest. Stored in the form `{"path": "s3://bucket/foo.nc", "offset": 100, "length": 100}`. """ - model_config = ConfigDict(frozen=True) - path: str # TODO stricter typing/validation of possible local / remote paths? offset: int length: int - def __repr__(self) -> str: - return f"ChunkEntry(path='{self.path}', offset={self.offset}, length={self.length})" - @classmethod def from_kerchunk( cls, path_and_byte_range_info: tuple[str] | tuple[str, int, int] @@ -58,7 +54,7 @@ def to_kerchunk(self) -> tuple[str, int, int]: return (self.path, self.offset, self.length) def dict(self) -> ChunkDictEntry: - return ChunkDictEntry(path=self.path, offset=self.offset, length=self.length) + return dataclasses.asdict(self) class ChunkManifest: diff --git a/virtualizarr/tests/test_kerchunk.py b/virtualizarr/tests/test_kerchunk.py index 9aa934d..379c43a 100644 --- a/virtualizarr/tests/test_kerchunk.py +++ b/virtualizarr/tests/test_kerchunk.py @@ -94,7 +94,7 @@ def test_accessor_to_kerchunk_dict(self): "refs": { ".zgroup": '{"zarr_format":2}', ".zattrs": "{}", - "a/.zarray": '{"chunks":[2,3],"compressor":null,"dtype":" str: - return f"Codec(compressor={self.compressor}, filters={self.filters})" - -class ZArray(BaseModel): +@dataclasses.dataclass +class ZArray: """Just the .zarray information""" # TODO will this work for V3? - model_config = ConfigDict( - arbitrary_types_allowed=True, # only here so pydantic doesn't complain about the numpy dtype field - ) - + shape: tuple[int, ...] chunks: tuple[int, ...] - compressor: dict | None = None dtype: np.dtype - fill_value: FillValueT = Field(default=0.0, validate_default=True) + fill_value: FillValueT = dataclasses.field(default=0.0) + order: Literal["C", "F"] = "C" + compressor: dict | None = None filters: list[dict] | None = None - order: Literal["C", "F"] - shape: tuple[int, ...] zarr_format: Literal[2, 3] = 2 - @field_validator("dtype") - @classmethod - def validate_dtype(cls, dtype) -> np.dtype: - # Your custom validation logic here - # Convert numpy.dtype to a format suitable for Pydantic - return np.dtype(dtype) - def __post_init__(self) -> None: if len(self.shape) != len(self.chunks): raise ValueError( @@ -87,20 +68,18 @@ def __post_init__(self) -> None: f"Array shape {self.shape} has ndim={self.shape} but chunk shape {self.chunks} has ndim={len(self.chunks)}" ) - @model_validator(mode="after") - def _check_fill_value(self) -> Self: + if isinstance(self.dtype, str): + # Convert dtype string to numpy.dtype + self.dtype = np.dtype(self.dtype) + if self.fill_value is None: self.fill_value = ZARR_DEFAULT_FILL_VALUE.get(self.dtype, 0.0) - return self @property def codec(self) -> Codec: """For comparison against other arrays.""" return Codec(compressor=self.compressor, filters=self.filters) - def __repr__(self) -> str: - return f"ZArray(shape={self.shape}, chunks={self.chunks}, dtype={self.dtype}, compressor={self.compressor}, filters={self.filters}, fill_value={self.fill_value})" - @classmethod def from_kerchunk_refs(cls, decoded_arr_refs_zarray) -> "ZArray": # coerce type of fill_value as kerchunk can be inconsistent with this @@ -121,7 +100,7 @@ def from_kerchunk_refs(cls, decoded_arr_refs_zarray) -> "ZArray": ) def dict(self) -> dict[str, Any]: - zarray_dict = dict(self) + zarray_dict = dataclasses.asdict(self) zarray_dict["dtype"] = encode_dtype(zarray_dict["dtype"]) return zarray_dict @@ -133,28 +112,12 @@ def to_kerchunk_json(self) -> str: def replace( self, - chunks: Optional[tuple[int, ...]] = None, - compressor: Optional[dict] = None, - dtype: Optional[np.dtype] = None, - fill_value: Optional[float] = None, # float or int? - filters: Optional[list[dict]] = None, # type: ignore[valid-type] - order: Optional[Literal["C"] | Literal["F"]] = None, - shape: Optional[tuple[int, ...]] = None, - zarr_format: Optional[Literal[2] | Literal[3]] = None, + **kwargs: Any, ) -> "ZArray": """ Convenience method to create a new ZArray from an existing one by altering only certain attributes. """ - return ZArray( - chunks=chunks if chunks is not None else self.chunks, - compressor=compressor if compressor is not None else self.compressor, - dtype=dtype if dtype is not None else self.dtype, - fill_value=fill_value if fill_value is not None else self.fill_value, - filters=filters if filters is not None else self.filters, - shape=shape if shape is not None else self.shape, - order=order if order is not None else self.order, - zarr_format=zarr_format if zarr_format is not None else self.zarr_format, - ) + return dataclasses.replace(self, **kwargs) def _v3_codec_pipeline(self) -> list: """ @@ -352,8 +315,8 @@ def metadata_from_zarr_json(filepath: Path) -> tuple[ZArray, list[str], dict]: attrs = metadata.pop("attributes") dim_names = metadata.pop("dimension_names") - chunk_shape = metadata["chunk_grid"]["configuration"]["chunk_shape"] - shape = metadata["shape"] + chunk_shape = tuple(metadata["chunk_grid"]["configuration"]["chunk_shape"]) + shape = tuple(metadata["shape"]) zarr_format = metadata["zarr_format"] if metadata["fill_value"] is None: From 0fd50e82a3c5c9a508fbaa7e4a12e2c5ce149d86 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 5 Aug 2024 15:18:19 -0500 Subject: [PATCH 2/5] Fixups --- docs/releases.rst | 3 ++- virtualizarr/manifests/manifest.py | 2 +- virtualizarr/tests/test_zarr.py | 27 +++++++++++++++++++++++++- virtualizarr/zarr.py | 31 ++++++++++++++++++++++++++++-- 4 files changed, 58 insertions(+), 5 deletions(-) diff --git a/docs/releases.rst b/docs/releases.rst index 88df206..7283df7 100644 --- a/docs/releases.rst +++ b/docs/releases.rst @@ -17,7 +17,8 @@ Breaking changes - Serialize valid ZarrV3 metadata and require full compressor numcodec config (for :pull:`193`) By `Gustavo Hidalgo `_. -- VirtualiZarr's `ZArray`, `ChunkEntry`, and `Codec` no longer subclasses `zarr.Array`, no longer inherit from `pydantic.BaseModel` (:pull:`xxx`) +- VirtualiZarr's `ZArray`, `ChunkEntry`, and `Codec` no longer subclass + `pydantic.BaseModel` (:pull:`210`) - `ZArray`'s `__init__` signature has changed to match `zarr.Array`'s (:pull:`xxx`) diff --git a/virtualizarr/manifests/manifest.py b/virtualizarr/manifests/manifest.py index 534502a..e153375 100644 --- a/virtualizarr/manifests/manifest.py +++ b/virtualizarr/manifests/manifest.py @@ -54,7 +54,7 @@ def to_kerchunk(self) -> tuple[str, int, int]: return (self.path, self.offset, self.length) def dict(self) -> ChunkDictEntry: - return dataclasses.asdict(self) + return ChunkDictEntry(dataclasses.asdict(self)) class ChunkManifest: diff --git a/virtualizarr/tests/test_zarr.py b/virtualizarr/tests/test_zarr.py index 7715d24..6ba44c9 100644 --- a/virtualizarr/tests/test_zarr.py +++ b/virtualizarr/tests/test_zarr.py @@ -7,7 +7,7 @@ from virtualizarr import ManifestArray, open_virtual_dataset from virtualizarr.manifests.manifest import ChunkManifest -from virtualizarr.zarr import dataset_to_zarr, metadata_from_zarr_json +from virtualizarr.zarr import dataset_to_zarr, metadata_from_zarr_json, ZArray @pytest.fixture @@ -78,3 +78,28 @@ def test_zarr_v3_metadata_conformance(tmpdir, vds_with_manifest_arrays: xr.Datas and len(metadata["codecs"]) > 1 and all(isconfigurable(codec) for codec in metadata["codecs"]) ) + + +def test_replace_partial(): + arr = ZArray(shape=(2, 3), chunks=(1, 1), dtype=np.dtype(" str: zarray_dict["fill_value"] = None return ujson.dumps(zarray_dict) + # ZArray.dict seems to shadow "dict", so need to use __builtins__ in the + # type signature below. def replace( self, - **kwargs: Any, + shape: tuple[int, ...] | None = None, + chunks: tuple[int, ...] | None = None, + dtype: np.dtype | str | None = None, + fill_value: FillValueT = None, + order: Literal["C", "F"] | None = None, + compressor: __builtins__["dict"] | None = None, + filters: list[dict] | None = None, + zarr_format: Literal[2, 3] | None = None, ) -> "ZArray": """ Convenience method to create a new ZArray from an existing one by altering only certain attributes. """ - return dataclasses.replace(self, **kwargs) + replacements = {} + if shape is not None: + replacements["shape"] = shape + if chunks is not None: + replacements["chunks"] = chunks + if dtype is not None: + replacements["dtype"] = dtype + if fill_value is not None: + replacements["fill_value"] = fill_value + if order is not None: + replacements["order"] = order + if compressor is not None: + replacements["compressor"] = compressor + if filters is not None: + replacements["filters"] = filters + if zarr_format is not None: + replacements["zarr_format"] = zarr_format + return dataclasses.replace(self, **replacements) def _v3_codec_pipeline(self) -> list: """ From ea15b8e8bdeb55ee8496892b395b87f669263d78 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 5 Aug 2024 15:19:17 -0500 Subject: [PATCH 3/5] lint --- virtualizarr/tests/test_zarr.py | 5 +++-- virtualizarr/zarr.py | 2 -- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/virtualizarr/tests/test_zarr.py b/virtualizarr/tests/test_zarr.py index 6ba44c9..9eda044 100644 --- a/virtualizarr/tests/test_zarr.py +++ b/virtualizarr/tests/test_zarr.py @@ -7,7 +7,7 @@ from virtualizarr import ManifestArray, open_virtual_dataset from virtualizarr.manifests.manifest import ChunkManifest -from virtualizarr.zarr import dataset_to_zarr, metadata_from_zarr_json, ZArray +from virtualizarr.zarr import ZArray, dataset_to_zarr, metadata_from_zarr_json @pytest.fixture @@ -88,6 +88,7 @@ def test_replace_partial(): assert result.shape == (2, 3) assert result.chunks == (2, 3) + def test_replace_total(): arr = ZArray(shape=(2, 3), chunks=(1, 1), dtype=np.dtype(" Date: Wed, 7 Aug 2024 07:22:17 -0500 Subject: [PATCH 4/5] fixups --- virtualizarr/manifests/manifest.py | 6 +++++- virtualizarr/zarr.py | 10 +++++----- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/virtualizarr/manifests/manifest.py b/virtualizarr/manifests/manifest.py index fc54ad7..3aaebb4 100644 --- a/virtualizarr/manifests/manifest.py +++ b/virtualizarr/manifests/manifest.py @@ -54,7 +54,11 @@ def to_kerchunk(self) -> tuple[str, int, int]: return (self.path, self.offset, self.length) def dict(self) -> ChunkDictEntry: - return ChunkDictEntry(dataclasses.asdict(self)) + return ChunkDictEntry( + path=self.path, + offset=self.offset, + length=self.length, + ) class ChunkManifest: diff --git a/virtualizarr/zarr.py b/virtualizarr/zarr.py index af1b219..c783ad9 100644 --- a/virtualizarr/zarr.py +++ b/virtualizarr/zarr.py @@ -110,8 +110,8 @@ def to_kerchunk_json(self) -> str: zarray_dict["fill_value"] = None return ujson.dumps(zarray_dict) - # ZArray.dict seems to shadow "dict", so need to use __builtins__ in the - # type signature below. + # ZArray.dict seems to shadow "dict", so we need the type ignore in + # the signature below. def replace( self, shape: tuple[int, ...] | None = None, @@ -119,14 +119,14 @@ def replace( dtype: np.dtype | str | None = None, fill_value: FillValueT = None, order: Literal["C", "F"] | None = None, - compressor: __builtins__["dict"] | None = None, - filters: list[dict] | None = None, + compressor: dict | None = None, # type: ignore[valid-type] + filters: list[dict] | None = None, # type: ignore[valid-type] zarr_format: Literal[2, 3] | None = None, ) -> "ZArray": """ Convenience method to create a new ZArray from an existing one by altering only certain attributes. """ - replacements = {} + replacements: dict[str, Any] = {} if shape is not None: replacements["shape"] = shape if chunks is not None: From 1b2795459b08bf17c36bce250b8dc69f78841462 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 7 Aug 2024 12:37:05 -0500 Subject: [PATCH 5/5] fixup --- virtualizarr/zarr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/virtualizarr/zarr.py b/virtualizarr/zarr.py index c783ad9..824892c 100644 --- a/virtualizarr/zarr.py +++ b/virtualizarr/zarr.py @@ -119,7 +119,7 @@ def replace( dtype: np.dtype | str | None = None, fill_value: FillValueT = None, order: Literal["C", "F"] | None = None, - compressor: dict | None = None, # type: ignore[valid-type] + compressor: "dict | None" = None, # type: ignore[valid-type] filters: list[dict] | None = None, # type: ignore[valid-type] zarr_format: Literal[2, 3] | None = None, ) -> "ZArray":