Internal refactor to separate reading and writing concerns (#231)
* split xarray.py into backend.py and accessor.py

* move the kerchunk serialization code out into a new writers submodule

* separate out the zarr reading code as a separate reader

* actually include new accessor.py file

* actually include new kerchunk writers file

* actually include new zarr writer file

* update test to import from the new location of zarr code

* refactor to create a kerchunk 'reader'

* split test_xarray.py into two files

* split up the kerchunk tests into tests of writing and reading kerchunk

* absolute imports in top-level init

* kerchunk.py -> types.kerchunk.py

* fix some mypy issues

* release notes

* update module paths in API docs

* separate zarr writer tests out

* forgot file I moved the zarr tests to

* move left-behind test

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
TomNicholas and pre-commit-ci[bot] committed Aug 27, 2024
1 parent d7f0c57 commit 515d157
Showing 24 changed files with 1,247 additions and 1,173 deletions.
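Taken together, the commit message above describes a re-partitioning of the package by concern: reading code moves to backend.py (plus dedicated readers), writing code moves to accessor.py and a new writers submodule, and the kerchunk reference types move under types/. As a rough sketch of the resulting import paths (names taken from this diff's own imports; illustrative, not an exhaustive listing of the new layout):

    # Post-refactor module paths, as imported by the new accessor.py below
    from virtualizarr.backend import open_virtual_dataset  # reading
    from virtualizarr.accessor import VirtualiZarrDatasetAccessor  # writing
    from virtualizarr.writers.kerchunk import dataset_to_kerchunk_refs
    from virtualizarr.writers.zarr import dataset_to_zarr
    from virtualizarr.types.kerchunk import KerchunkStoreRefs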
6 changes: 3 additions & 3 deletions docs/api.rst
@@ -21,7 +21,7 @@ Manifests
Reading
=======

-.. currentmodule:: virtualizarr.xarray
+.. currentmodule:: virtualizarr.backend
.. autosummary::
:nosignatures:
:toctree: generated/
@@ -32,7 +32,7 @@ Reading
Serialization
=============

-.. currentmodule:: virtualizarr.xarray
+.. currentmodule:: virtualizarr.accessor
.. autosummary::
:nosignatures:
:toctree: generated/
@@ -44,7 +44,7 @@ Serialization
Rewriting
=============

-.. currentmodule:: virtualizarr.xarray
+.. currentmodule:: virtualizarr.accessor
.. autosummary::
:nosignatures:
:toctree: generated/
5 changes: 4 additions & 1 deletion docs/releases.rst
@@ -34,7 +34,7 @@ Bug fixes
- Exclude empty chunks during `ChunkDict` construction. (:pull:`198`)
By `Gustavo Hidalgo <https://github.com/ghidalgo3>`_.
- Fixed regression in `fill_value` handling for datetime dtypes making virtual
-  Zarr stores unreadable (:pr:`206`)
+  Zarr stores unreadable (:pull:`206`)
By `Timothy Hodson <https://github.com/thodson-usgs>`_

Documentation
@@ -43,6 +43,9 @@ Documentation
Internal Changes
~~~~~~~~~~~~~~~~

+- Refactored internal structure significantly to split up everything to do with reading references from that to do with writing references.
+  (:issue:`229`) (:pull:`231`) By `Tom Nicholas <https://github.com/TomNicholas>`_.

.. _v1.0.0:

v1.0.0 (9th July 2024)
6 changes: 3 additions & 3 deletions virtualizarr/__init__.py
@@ -1,6 +1,6 @@
-from .manifests import ChunkManifest, ManifestArray # type: ignore # noqa
-from .xarray import VirtualiZarrDatasetAccessor # type: ignore # noqa
-from .xarray import open_virtual_dataset # noqa: F401
+from virtualizarr.manifests import ChunkManifest, ManifestArray # type: ignore # noqa
+from virtualizarr.accessor import VirtualiZarrDatasetAccessor # type: ignore # noqa
+from virtualizarr.backend import open_virtual_dataset # noqa: F401

from importlib.metadata import version as _version

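Only the import style (relative to absolute) and the module backing each name change here; the names re-exported from the top-level package are unchanged, so downstream code along these lines should be unaffected:

    # Top-level imports keep working after the refactor
    from virtualizarr import ChunkManifest, ManifestArray, open_virtual_dataset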
166 changes: 166 additions & 0 deletions virtualizarr/accessor.py
@@ -0,0 +1,166 @@
from pathlib import Path
from typing import (
    Callable,
    Literal,
    overload,
)

import ujson # type: ignore
from xarray import Dataset, register_dataset_accessor

from virtualizarr.manifests import ManifestArray
from virtualizarr.types.kerchunk import KerchunkStoreRefs
from virtualizarr.writers.kerchunk import dataset_to_kerchunk_refs
from virtualizarr.writers.zarr import dataset_to_zarr


@register_dataset_accessor("virtualize")
class VirtualiZarrDatasetAccessor:
    """
    Xarray accessor for writing out virtual datasets to disk.

    Methods on this object are called via `ds.virtualize.{method}`.
    """

    def __init__(self, ds: Dataset):
        self.ds: Dataset = ds

    def to_zarr(self, storepath: str) -> None:
        """
        Serialize all virtualized arrays in this xarray dataset as a Zarr store.

        Currently requires all variables to be backed by ManifestArray objects.

        Not very useful until some implementation of a Zarr reader can actually read these manifest.json files.
        See https://github.com/zarr-developers/zarr-specs/issues/287

        Parameters
        ----------
        storepath : str
        """
        dataset_to_zarr(self.ds, storepath)

    @overload
    def to_kerchunk(
        self, filepath: None, format: Literal["dict"]
    ) -> KerchunkStoreRefs: ...

    @overload
    def to_kerchunk(self, filepath: str | Path, format: Literal["json"]) -> None: ...

    @overload
    def to_kerchunk(
        self,
        filepath: str | Path,
        format: Literal["parquet"],
        record_size: int = 100_000,
        categorical_threshold: int = 10,
    ) -> None: ...

    def to_kerchunk(
        self,
        filepath: str | Path | None = None,
        format: Literal["dict", "json", "parquet"] = "dict",
        record_size: int = 100_000,
        categorical_threshold: int = 10,
    ) -> KerchunkStoreRefs | None:
        """
        Serialize all virtualized arrays in this xarray dataset into the kerchunk references format.

        Parameters
        ----------
        filepath : str, default: None
            File path to write kerchunk references into. Not required if format is 'dict'.
        format : 'dict', 'json', or 'parquet'
            Format to serialize the kerchunk references as.
            If 'json' or 'parquet' then the 'filepath' argument is required.
        record_size (parquet only) : int
            Number of references to store in each reference file (default 100,000). Bigger values
            mean fewer read requests but larger memory footprint.
        categorical_threshold (parquet only) : int
            Encode urls as pandas.Categorical to reduce memory footprint if the ratio
            of the number of unique urls to total number of refs for each variable
            is greater than or equal to this number. (default 10)

        References
        ----------
        https://fsspec.github.io/kerchunk/spec.html
        """
        refs = dataset_to_kerchunk_refs(self.ds)

        if format == "dict":
            return refs
        elif format == "json":
            if filepath is None:
                raise ValueError("Filepath must be provided when format is 'json'")

            with open(filepath, "w") as json_file:
                ujson.dump(refs, json_file)

            return None
        elif format == "parquet":
            from kerchunk.df import refs_to_dataframe

            # guard against an unbound url below, mirroring the json branch
            if filepath is None:
                raise ValueError("Filepath must be provided when format is 'parquet'")

            if isinstance(filepath, Path):
                url = str(filepath)
            else:
                url = filepath

            # refs_to_dataframe is responsible for writing to parquet.
            # at no point does it create a full in-memory dataframe.
            refs_to_dataframe(
                refs,
                url=url,
                record_size=record_size,
                categorical_threshold=categorical_threshold,
            )
            return None
        else:
            raise ValueError(f"Unrecognized output format: {format}")

    def rename_paths(
        self,
        new: str | Callable[[str], str],
    ) -> Dataset:
        """
        Rename paths to chunks in every ManifestArray in this dataset.

        Accepts either a string, in which case this new path will be used for all chunks, or
        a function which accepts the old path and returns the new path.

        Parameters
        ----------
        new
            New path to use for all chunks, either as a string, or as a function which accepts and returns strings.

        Returns
        -------
        Dataset

        Examples
        --------
        Rename paths to reflect moving the referenced files from local storage to an S3 bucket.

        >>> def local_to_s3_url(old_local_path: str) -> str:
        ...     from pathlib import Path
        ...
        ...     new_s3_bucket_url = "http://s3.amazonaws.com/my_bucket/"
        ...
        ...     filename = Path(old_local_path).name
        ...     return new_s3_bucket_url + filename
        >>> ds.virtualize.rename_paths(local_to_s3_url)

        See Also
        --------
        ManifestArray.rename_paths
        ChunkManifest.rename_paths
        """
        new_ds = self.ds.copy()
        for var_name in new_ds.variables:
            data = new_ds[var_name].data
            if isinstance(data, ManifestArray):
                new_ds[var_name].data = data.rename_paths(new=new)

        return new_ds
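For orientation, a minimal usage sketch of the accessor defined in this new file, assuming ds is an xarray.Dataset whose variables wrap ManifestArray objects (e.g. one returned by open_virtual_dataset); the file paths are hypothetical placeholders:

    import virtualizarr  # registers the "virtualize" accessor on xarray.Dataset

    refs = ds.virtualize.to_kerchunk(format="dict")  # in-memory kerchunk references
    ds.virtualize.to_kerchunk("refs.json", format="json")  # write references as JSON
    ds.virtualize.to_kerchunk("refs.parquet", format="parquet", record_size=100_000)
    ds.virtualize.to_zarr("store.zarr")  # manifest-based Zarr store; see the to_zarr docstring caveat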
