diff --git a/ci/environment.yml b/ci/environment.yml index 5368784..883463a 100644 --- a/ci/environment.yml +++ b/ci/environment.yml @@ -9,7 +9,6 @@ dependencies: - netcdf4 - xarray>=2024.6.0 - kerchunk>=0.2.5 - - pydantic - numpy>=2.0.0 - ujson - packaging diff --git a/docs/releases.rst b/docs/releases.rst index e7f6df2..7283df7 100644 --- a/docs/releases.rst +++ b/docs/releases.rst @@ -17,6 +17,10 @@ Breaking changes - Serialize valid ZarrV3 metadata and require full compressor numcodec config (for :pull:`193`) By `Gustavo Hidalgo `_. +- VirtualiZarr's `ZArray`, `ChunkEntry`, and `Codec` no longer subclass + `pydantic.BaseModel` (:pull:`210`) +- `ZArray`'s `__init__` signature has changed to match `zarr.Array`'s (:pull:`xxx`) + Deprecations ~~~~~~~~~~~~ diff --git a/pyproject.toml b/pyproject.toml index 859dd22..6b0efe8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,7 +24,6 @@ dependencies = [ "xarray>=2024.06.0", "kerchunk>=0.2.5", "h5netcdf", - "pydantic", "numpy>=2.0.0", "ujson", "packaging", diff --git a/virtualizarr/manifests/manifest.py b/virtualizarr/manifests/manifest.py index a8621ed..3aaebb4 100644 --- a/virtualizarr/manifests/manifest.py +++ b/virtualizarr/manifests/manifest.py @@ -1,10 +1,10 @@ +import dataclasses import json import re from collections.abc import Iterable, Iterator from typing import Any, Callable, Dict, NewType, Tuple, TypedDict, cast import numpy as np -from pydantic import BaseModel, ConfigDict from upath import UPath from virtualizarr.types import ChunkKey @@ -25,22 +25,18 @@ class ChunkDictEntry(TypedDict): ChunkDict = NewType("ChunkDict", dict[ChunkKey, ChunkDictEntry]) -class ChunkEntry(BaseModel): +@dataclasses.dataclass(frozen=True) +class ChunkEntry: """ Information for a single chunk in the manifest. Stored in the form `{"path": "s3://bucket/foo.nc", "offset": 100, "length": 100}`. """ - model_config = ConfigDict(frozen=True) - path: str # TODO stricter typing/validation of possible local / remote paths? offset: int length: int - def __repr__(self) -> str: - return f"ChunkEntry(path='{self.path}', offset={self.offset}, length={self.length})" - @classmethod def from_kerchunk( cls, path_and_byte_range_info: tuple[str] | tuple[str, int, int] @@ -57,8 +53,12 @@ def to_kerchunk(self) -> tuple[str, int, int]: """Write out in the format that kerchunk uses for chunk entries.""" return (self.path, self.offset, self.length) - def dict(self) -> ChunkDictEntry: # type: ignore[override] - return ChunkDictEntry(path=self.path, offset=self.offset, length=self.length) + def dict(self) -> ChunkDictEntry: + return ChunkDictEntry( + path=self.path, + offset=self.offset, + length=self.length, + ) class ChunkManifest: diff --git a/virtualizarr/tests/test_kerchunk.py b/virtualizarr/tests/test_kerchunk.py index 9aa934d..379c43a 100644 --- a/virtualizarr/tests/test_kerchunk.py +++ b/virtualizarr/tests/test_kerchunk.py @@ -94,7 +94,7 @@ def test_accessor_to_kerchunk_dict(self): "refs": { ".zgroup": '{"zarr_format":2}', ".zattrs": "{}", - "a/.zarray": '{"chunks":[2,3],"compressor":null,"dtype":" 1 and all(isconfigurable(codec) for codec in metadata["codecs"]) ) + + +def test_replace_partial(): + arr = ZArray(shape=(2, 3), chunks=(1, 1), dtype=np.dtype(" str: - return f"Codec(compressor={self.compressor}, filters={self.filters})" - -class ZArray(BaseModel): +@dataclasses.dataclass +class ZArray: """Just the .zarray information""" # TODO will this work for V3? - model_config = ConfigDict( - arbitrary_types_allowed=True, # only here so pydantic doesn't complain about the numpy dtype field - ) - + shape: tuple[int, ...] chunks: tuple[int, ...] - compressor: dict | None = None dtype: np.dtype - fill_value: FillValueT = Field(None, validate_default=True) + fill_value: FillValueT = dataclasses.field(default=None) + order: Literal["C", "F"] = "C" + compressor: dict | None = None filters: list[dict] | None = None - order: Literal["C", "F"] - shape: tuple[int, ...] - zarr_format: ZARR_FORMAT = 2 - - @field_validator("dtype") - @classmethod - def validate_dtype(cls, dtype) -> np.dtype: - # Your custom validation logic here - # Convert numpy.dtype to a format suitable for Pydantic - return np.dtype(dtype) + zarr_format: Literal[2, 3] = 2 def __post_init__(self) -> None: if len(self.shape) != len(self.chunks): @@ -90,20 +64,18 @@ def __post_init__(self) -> None: f"Array shape {self.shape} has ndim={self.shape} but chunk shape {self.chunks} has ndim={len(self.chunks)}" ) - @model_validator(mode="after") - def _check_fill_value(self) -> Self: + if isinstance(self.dtype, str): + # Convert dtype string to numpy.dtype + self.dtype = np.dtype(self.dtype) + if self.fill_value is None: self.fill_value = ZARR_DEFAULT_FILL_VALUE.get(self.dtype.kind, 0.0) - return self @property def codec(self) -> Codec: """For comparison against other arrays.""" return Codec(compressor=self.compressor, filters=self.filters) - def __repr__(self) -> str: - return f"ZArray(shape={self.shape}, chunks={self.chunks}, dtype={self.dtype}, compressor={self.compressor}, filters={self.filters}, fill_value={self.fill_value})" - @classmethod def from_kerchunk_refs(cls, decoded_arr_refs_zarray) -> "ZArray": # coerce type of fill_value as kerchunk can be inconsistent with this @@ -127,8 +99,8 @@ def from_kerchunk_refs(cls, decoded_arr_refs_zarray) -> "ZArray": zarr_format=cast(ZARR_FORMAT, zarr_format), ) - def dict(self) -> dict[str, Any]: # type: ignore - zarray_dict = dict(self) + def dict(self) -> dict[str, Any]: + zarray_dict = dataclasses.asdict(self) zarray_dict["dtype"] = encode_dtype(zarray_dict["dtype"]) return zarray_dict @@ -138,30 +110,40 @@ def to_kerchunk_json(self) -> str: zarray_dict["fill_value"] = None return ujson.dumps(zarray_dict) + # ZArray.dict seems to shadow "dict", so we need the type ignore in + # the signature below. def replace( self, - chunks: Optional[tuple[int, ...]] = None, - compressor: Optional[dict] = None, # type: ignore[valid-type] - dtype: Optional[np.dtype] = None, - fill_value: Optional[float] = None, # float or int? - filters: Optional[list[dict]] = None, # type: ignore[valid-type] - order: Optional[Literal["C"] | Literal["F"]] = None, - shape: Optional[tuple[int, ...]] = None, - zarr_format: Optional[Literal[2] | Literal[3]] = None, + shape: tuple[int, ...] | None = None, + chunks: tuple[int, ...] | None = None, + dtype: np.dtype | str | None = None, + fill_value: FillValueT = None, + order: Literal["C", "F"] | None = None, + compressor: "dict | None" = None, # type: ignore[valid-type] + filters: list[dict] | None = None, # type: ignore[valid-type] + zarr_format: Literal[2, 3] | None = None, ) -> "ZArray": """ Convenience method to create a new ZArray from an existing one by altering only certain attributes. """ - return ZArray( - chunks=chunks if chunks is not None else self.chunks, - compressor=compressor if compressor is not None else self.compressor, - dtype=dtype if dtype is not None else self.dtype, - fill_value=fill_value if fill_value is not None else self.fill_value, - filters=filters if filters is not None else self.filters, - shape=shape if shape is not None else self.shape, - order=order if order is not None else self.order, - zarr_format=zarr_format if zarr_format is not None else self.zarr_format, - ) + replacements: dict[str, Any] = {} + if shape is not None: + replacements["shape"] = shape + if chunks is not None: + replacements["chunks"] = chunks + if dtype is not None: + replacements["dtype"] = dtype + if fill_value is not None: + replacements["fill_value"] = fill_value + if order is not None: + replacements["order"] = order + if compressor is not None: + replacements["compressor"] = compressor + if filters is not None: + replacements["filters"] = filters + if zarr_format is not None: + replacements["zarr_format"] = zarr_format + return dataclasses.replace(self, **replacements) def _v3_codec_pipeline(self) -> list: """ @@ -361,8 +343,8 @@ def metadata_from_zarr_json(filepath: Path) -> tuple[ZArray, list[str], dict]: attrs = metadata.pop("attributes") dim_names = metadata.pop("dimension_names") - chunk_shape = metadata["chunk_grid"]["configuration"]["chunk_shape"] - shape = metadata["shape"] + chunk_shape = tuple(metadata["chunk_grid"]["configuration"]["chunk_shape"]) + shape = tuple(metadata["shape"]) zarr_format = metadata["zarr_format"] if metadata["fill_value"] is None: