Internal refactor to separate reading and writing concerns (#231)
* split xarray.py into backend.py and accessor.py

* move the kerchunk serialization code out into a new writers submodule

* separate out the zarr reading code as a separate reader

* actually include new accessor.py file

* actually include new kerchunk writers file

* actually include new zarr writer file

* update test to import from the new location of zarr code

* refactor to create a kerchunk 'reader'

* split test_xarray.py into two files

* split up the kerchunk tests into tests of writing and reading kerchunk

* absolute imports in top-level init

* kerchunk.py -> types.kerchunk.py

* fix some mypy issues

* release notes

* update module paths in API docs

* separate zarr writer tests out

* forgot file I moved the zarr tests to

* move left-behind test

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
TomNicholas and pre-commit-ci[bot] committed Aug 27, 2024
1 parent d7f0c57 commit 515d157
Showing 24 changed files with 1,247 additions and 1,173 deletions.
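Taken together, the commit message above describes a re-partitioning of the package by concern: reading code moves to backend.py (plus dedicated readers), writing code moves to accessor.py and a new writers submodule, and the kerchunk reference types move under types/. As a rough sketch of the resulting import paths (names taken from this diff's own imports; illustrative, not an exhaustive listing of the new layout):

    # Post-refactor module paths, as imported by the new accessor.py below
    from virtualizarr.backend import open_virtual_dataset  # reading
    from virtualizarr.accessor import VirtualiZarrDatasetAccessor  # writing
    from virtualizarr.writers.kerchunk import dataset_to_kerchunk_refs
    from virtualizarr.writers.zarr import dataset_to_zarr
    from virtualizarr.types.kerchunk import KerchunkStoreRefs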
6 changes: 3 additions & 3 deletions docs/api.rst
@@ -21,7 +21,7 @@ Manifests
Reading
=======

-.. currentmodule:: virtualizarr.xarray
+.. currentmodule:: virtualizarr.backend
.. autosummary::
:nosignatures:
:toctree: generated/
@@ -32,7 +32,7 @@ Reading
Serialization
=============

-.. currentmodule:: virtualizarr.xarray
+.. currentmodule:: virtualizarr.accessor
.. autosummary::
:nosignatures:
:toctree: generated/
@@ -44,7 +44,7 @@ Serialization
Rewriting
=============

-.. currentmodule:: virtualizarr.xarray
+.. currentmodule:: virtualizarr.accessor
.. autosummary::
:nosignatures:
:toctree: generated/
5 changes: 4 additions & 1 deletion docs/releases.rst
@@ -34,7 +34,7 @@ Bug fixes
- Exclude empty chunks during `ChunkDict` construction. (:pull:`198`)
By `Gustavo Hidalgo <https://github.com/ghidalgo3>`_.
- Fixed regression in `fill_value` handling for datetime dtypes making virtual
-  Zarr stores unreadable (:pr:`206`)
+  Zarr stores unreadable (:pull:`206`)
By `Timothy Hodson <https://github.com/thodson-usgs>`_

Documentation
@@ -43,6 +43,9 @@ Documentation
Internal Changes
~~~~~~~~~~~~~~~~

+- Refactored internal structure significantly to split up everything to do with reading references from that to do with writing references.
+  (:issue:`229`) (:pull:`231`) By `Tom Nicholas <https://github.com/TomNicholas>`_.

.. _v1.0.0:

v1.0.0 (9th July 2024)
6 changes: 3 additions & 3 deletions virtualizarr/__init__.py
@@ -1,6 +1,6 @@
-from .manifests import ChunkManifest, ManifestArray # type: ignore # noqa
-from .xarray import VirtualiZarrDatasetAccessor # type: ignore # noqa
-from .xarray import open_virtual_dataset # noqa: F401
+from virtualizarr.manifests import ChunkManifest, ManifestArray # type: ignore # noqa
+from virtualizarr.accessor import VirtualiZarrDatasetAccessor # type: ignore # noqa
+from virtualizarr.backend import open_virtual_dataset # noqa: F401

from importlib.metadata import version as _version

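Only the import style (relative to absolute) and the module backing each name change here; the names re-exported from the top-level package are unchanged, so downstream code along these lines should be unaffected:

    # Top-level imports keep working after the refactor
    from virtualizarr import ChunkManifest, ManifestArray, open_virtual_dataset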
166 changes: 166 additions & 0 deletions virtualizarr/accessor.py
@@ -0,0 +1,166 @@
from pathlib import Path
from typing import (
    Callable,
    Literal,
    overload,
)

import ujson # type: ignore
from xarray import Dataset, register_dataset_accessor

from virtualizarr.manifests import ManifestArray
from virtualizarr.types.kerchunk import KerchunkStoreRefs
from virtualizarr.writers.kerchunk import dataset_to_kerchunk_refs
from virtualizarr.writers.zarr import dataset_to_zarr


@register_dataset_accessor("virtualize")
class VirtualiZarrDatasetAccessor:
    """
    Xarray accessor for writing out virtual datasets to disk.

    Methods on this object are called via `ds.virtualize.{method}`.
    """

    def __init__(self, ds: Dataset):
        self.ds: Dataset = ds

    def to_zarr(self, storepath: str) -> None:
        """
        Serialize all virtualized arrays in this xarray dataset as a Zarr store.

        Currently requires all variables to be backed by ManifestArray objects.

        Not very useful until some implementation of a Zarr reader can actually read these manifest.json files.
        See https://github.com/zarr-developers/zarr-specs/issues/287

        Parameters
        ----------
        storepath : str
        """
        dataset_to_zarr(self.ds, storepath)

    @overload
    def to_kerchunk(
        self, filepath: None, format: Literal["dict"]
    ) -> KerchunkStoreRefs: ...

    @overload
    def to_kerchunk(self, filepath: str | Path, format: Literal["json"]) -> None: ...

    @overload
    def to_kerchunk(
        self,
        filepath: str | Path,
        format: Literal["parquet"],
        record_size: int = 100_000,
        categorical_threshold: int = 10,
    ) -> None: ...

    def to_kerchunk(
        self,
        filepath: str | Path | None = None,
        format: Literal["dict", "json", "parquet"] = "dict",
        record_size: int = 100_000,
        categorical_threshold: int = 10,
    ) -> KerchunkStoreRefs | None:
        """
        Serialize all virtualized arrays in this xarray dataset into the kerchunk references format.

        Parameters
        ----------
        filepath : str, default: None
            File path to write kerchunk references into. Not required if format is 'dict'.
        format : 'dict', 'json', or 'parquet'
            Format to serialize the kerchunk references as.
            If 'json' or 'parquet' then the 'filepath' argument is required.
        record_size (parquet only) : int
            Number of references to store in each reference file (default 100,000). Bigger values
            mean fewer read requests but larger memory footprint.
        categorical_threshold (parquet only) : int
            Encode urls as pandas.Categorical to reduce memory footprint if the ratio
            of the number of unique urls to total number of refs for each variable
            is greater than or equal to this number. (default 10)

        References
        ----------
        https://fsspec.github.io/kerchunk/spec.html
        """
        refs = dataset_to_kerchunk_refs(self.ds)

        if format == "dict":
            return refs
        elif format == "json":
            if filepath is None:
                raise ValueError("Filepath must be provided when format is 'json'")

            with open(filepath, "w") as json_file:
                ujson.dump(refs, json_file)

            return None
        elif format == "parquet":
            from kerchunk.df import refs_to_dataframe

            # guard against an unbound url below, mirroring the json branch
            if filepath is None:
                raise ValueError("Filepath must be provided when format is 'parquet'")

            if isinstance(filepath, Path):
                url = str(filepath)
            else:
                url = filepath

            # refs_to_dataframe is responsible for writing to parquet.
            # at no point does it create a full in-memory dataframe.
            refs_to_dataframe(
                refs,
                url=url,
                record_size=record_size,
                categorical_threshold=categorical_threshold,
            )
            return None
        else:
            raise ValueError(f"Unrecognized output format: {format}")

    def rename_paths(
        self,
        new: str | Callable[[str], str],
    ) -> Dataset:
        """
        Rename paths to chunks in every ManifestArray in this dataset.

        Accepts either a string, in which case this new path will be used for all chunks, or
        a function which accepts the old path and returns the new path.

        Parameters
        ----------
        new
            New path to use for all chunks, either as a string, or as a function which accepts and returns strings.

        Returns
        -------
        Dataset

        Examples
        --------
        Rename paths to reflect moving the referenced files from local storage to an S3 bucket.

        >>> def local_to_s3_url(old_local_path: str) -> str:
        ...     from pathlib import Path
        ...
        ...     new_s3_bucket_url = "http://s3.amazonaws.com/my_bucket/"
        ...
        ...     filename = Path(old_local_path).name
        ...     return new_s3_bucket_url + filename
        >>> ds.virtualize.rename_paths(local_to_s3_url)

        See Also
        --------
        ManifestArray.rename_paths
        ChunkManifest.rename_paths
        """
        new_ds = self.ds.copy()
        for var_name in new_ds.variables:
            data = new_ds[var_name].data
            if isinstance(data, ManifestArray):
                new_ds[var_name].data = data.rename_paths(new=new)

        return new_ds
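For orientation, a minimal usage sketch of the accessor defined in this new file, assuming ds is an xarray.Dataset whose variables wrap ManifestArray objects (e.g. one returned by open_virtual_dataset); the file paths are hypothetical placeholders:

    import virtualizarr  # registers the "virtualize" accessor on xarray.Dataset

    refs = ds.virtualize.to_kerchunk(format="dict")  # in-memory kerchunk references
    ds.virtualize.to_kerchunk("refs.json", format="json")  # write references as JSON
    ds.virtualize.to_kerchunk("refs.parquet", format="parquet", record_size=100_000)
    ds.virtualize.to_zarr("store.zarr")  # manifest-based Zarr store; see the to_zarr docstring caveat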
