Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Convert zarr helpers to utilities, update numpy chunk encoding in zarr router #260

Merged
merged 14 commits into from
Apr 12, 2024
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions docs/source/user-guide/plugins.md
Original file line number Diff line number Diff line change
Expand Up @@ -312,9 +312,18 @@ class TutorialDataset(Plugin):
return None
```

```{note}
Some routers may want to cache data computed from datasets that they serve to avoid unnecessary recomputation. In this case, routers may rely on the
`_xpublish_id` attribute (`DATASET_ID_ATTR_KEY` from `xpublish.api`) on each dataset. If this attribute is set, it should be a unique identifier for the dataset, otherwise the `dataset_id` used to load the dataset will be set as the `_xpublish_id` automatically.
```

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's probably worth being more explicit, and making it more important than alert. Also could you tweak the example above to explicit set the attr as an example?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you mean move that up and change it to a warning? I'll update the example code too

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ya, warning is probably the right level. I was thinking of tweaking the tone as well: 'you need to set a unique `DATASET_ID_ATTR_KEY` (from `xpublish.api`) on each dataset for routers to manage caching appropriately' to go with the warning.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice. I like that, K updated it!

## Hook Spec Plugins

Plugins can also provide new hook specifications that other plugins can then implement.
This allows Xpublish to support things that we haven't even thought of yet.

These return a class of hookspecs from {py:meth}`xpublish.plugins.hooks.PluginSpec.register_hookspec`.

```

```
9 changes: 4 additions & 5 deletions tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,18 +49,17 @@ def test_invalid_encoding_chunks_with_dask_raise():
data = dask.array.zeros((10, 20, 30), chunks=expected)
ds = xr.Dataset({'foo': (['x', 'y', 'z'], data)})
ds['foo'].encoding['chunks'] = [8, 5, 1]
with pytest.raises(NotImplementedError) as excinfo:
with pytest.raises(ValueError) as excinfo:
_ = create_zmetadata(ds)
excinfo.match(r'Specified zarr chunks .*')


def test_invalid_encoding_chunks_with_numpy_raise():
def test_ignore_encoding_chunks_with_numpy():
data = np.zeros((10, 20, 30))
ds = xr.Dataset({'foo': (['x', 'y', 'z'], data)})
ds['foo'].encoding['chunks'] = [8, 5, 1]
with pytest.raises(ValueError) as excinfo:
_ = create_zmetadata(ds)
excinfo.match(r'Encoding chunks do not match inferred.*')
zmetadata = create_zmetadata(ds)
assert zmetadata['metadata']['foo/.zarray']['chunks'] == [10, 20, 30]


def test_get_data_chunk_numpy():
Expand Down
2 changes: 1 addition & 1 deletion tests/test_rest_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -348,7 +348,7 @@ def test_cache(airtemp_ds):

response1 = client.get('/zarr/air/0.0.0')
assert response1.status_code == 200
assert '/air/0.0.0' in rest.cache
assert 'airtemp/air/0.0.0' in rest.cache
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

All datasets now have an _xpublish_id attr so the cache string changed in this test


# test that we can retrieve
response2 = client.get('/zarr/air/0.0.0')
Expand Down
65 changes: 3 additions & 62 deletions xpublish/dependencies.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,10 @@
"""Helper functions to use a FastAPI dependencies."""
from typing import (
TYPE_CHECKING,
Dict,
List,
)

from typing import TYPE_CHECKING, Dict, List

import cachey
import pluggy
import xarray as xr
from fastapi import Depends

from .utils.api import DATASET_ID_ATTR_KEY
from .utils.zarr import ZARR_METADATA_KEY, create_zmetadata, create_zvariables

if TYPE_CHECKING:
from .plugins import Plugin # pragma: no cover
Expand All @@ -28,6 +21,7 @@ def get_dataset_ids() -> List[str]:

Returns:
A list of unique keys for datasets

"""
return [] # pragma: no cover

Expand Down Expand Up @@ -66,58 +60,6 @@ def get_cache() -> cachey.Cache:
return cachey.Cache(available_bytes=1e6) # pragma: no cover


def get_zvariables(
    dataset: xr.Dataset = Depends(get_dataset),
    cache: cachey.Cache = Depends(get_cache),
) -> dict:
    """FastAPI dependency that returns a dictionary of zarr encoded variables.

    Results are memoized in the shared cache, keyed on the dataset's
    ``DATASET_ID_ATTR_KEY`` attribute (empty string when absent).

    Args:
        dataset: The dataset to get the zvariables from.
        cache: The cache to use for storing the zvariables.

    Returns:
        A dictionary of zarr encoded variables.
    """
    dataset_id = dataset.attrs.get(DATASET_ID_ATTR_KEY, '')
    cache_key = f'{dataset_id}/zvariables'

    cached = cache.get(cache_key)
    if cached is not None:
        return cached

    zvariables = create_zvariables(dataset)
    # High cost value so cachey effectively never evicts this entry.
    cache.put(cache_key, zvariables, 99999)
    return zvariables


def get_zmetadata(
    dataset: xr.Dataset = Depends(get_dataset),
    cache: cachey.Cache = Depends(get_cache),
    zvariables: dict = Depends(get_zvariables),
) -> dict:
    """FastAPI dependency that returns a consolidated zmetadata dictionary.

    Results are memoized in the shared cache, keyed on the dataset's
    ``DATASET_ID_ATTR_KEY`` attribute (empty string when absent).

    Args:
        dataset: The dataset to get the zmetadata from.
        cache: The cache to use for storing the zmetadata.
        zvariables: The zvariables to use for creating the zmetadata.

    Returns:
        A consolidated zmetadata dictionary.
    """
    dataset_id = dataset.attrs.get(DATASET_ID_ATTR_KEY, '')
    cache_key = f'{dataset_id}/{ZARR_METADATA_KEY}'

    cached = cache.get(cache_key)
    if cached is not None:
        return cached

    zmeta = create_zmetadata(dataset)
    # High cost value so cachey effectively never evicts this entry.
    cache.put(cache_key, zmeta, 99999)
    return zmeta


def get_plugins() -> Dict[str, 'Plugin']:
"""FastAPI dependency that returns the a dictionary of loaded plugins.

Expand All @@ -129,4 +71,3 @@ def get_plugins() -> Dict[str, 'Plugin']:

def get_plugin_manager() -> pluggy.PluginManager:
"""Return the active plugin manager."""
...
13 changes: 6 additions & 7 deletions xpublish/plugins/included/dataset_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,21 +7,20 @@

from xpublish.utils.api import JSONResponse

from ...dependencies import get_zmetadata, get_zvariables
from ...utils.zarr import get_zmetadata, get_zvariables
from .. import Dependencies, Plugin, hookimpl


class DatasetInfoPlugin(Plugin):
"""Dataset metadata and schema routes."""
"""Dataset metadata."""

name: str = 'dataset_info'

dataset_router_prefix: str = ''
dataset_router_tags: Sequence[str] = ['dataset_info']

@hookimpl
def dataset_router(self, deps: Dependencies) -> APIRouter:
"""Returns a router with dataset metadata and schema routes."""
def dataset_router(self, deps: Dependencies) -> APIRouter: # noqa: D102
router = APIRouter(
prefix=self.dataset_router_prefix,
tags=list(self.dataset_router_tags),
Expand All @@ -39,22 +38,22 @@ def html_representation(
def list_keys(
dataset=Depends(deps.dataset),
) -> list[str]:
"""Returns a of the keys in a dataset."""
"""List of the keys in a dataset."""
return JSONResponse(list(dataset.variables))

@router.get('/dict')
def to_dict(
dataset=Depends(deps.dataset),
) -> dict:
"""Returns the full dataset as a dictionary."""
"""The full dataset as a dictionary."""
return JSONResponse(dataset.to_dict(data=False))

@router.get('/info')
def info(
dataset=Depends(deps.dataset),
cache=Depends(deps.cache),
) -> dict:
"""Returns the dataset schema (close to the NCO-JSON schema)."""
"""Dataset schema (close to the NCO-JSON schema)."""
zvariables = get_zvariables(dataset, cache)
zmetadata = get_zmetadata(dataset, cache, zvariables)

Expand Down
12 changes: 6 additions & 6 deletions xpublish/plugins/included/zarr.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,14 @@

from xpublish.utils.api import JSONResponse

from ...dependencies import get_zmetadata, get_zvariables
from ...utils.api import DATASET_ID_ATTR_KEY
from ...utils.cache import CostTimer
from ...utils.zarr import (
ZARR_METADATA_KEY,
encode_chunk,
get_data_chunk,
get_zmetadata,
get_zvariables,
jsonify_zmetadata,
)
from .. import Dependencies, Plugin, hookimpl
Expand All @@ -32,8 +33,7 @@ class ZarrPlugin(Plugin):
dataset_router_tags: Sequence[str] = ['zarr']

@hookimpl
def dataset_router(self, deps: Dependencies) -> APIRouter:
"""Returns a router with Zarr-like accessing endpoints for datasets."""
def dataset_router(self, deps: Dependencies) -> APIRouter: # noqa: D102
router = APIRouter(
prefix=self.dataset_router_prefix,
tags=list(self.dataset_router_tags),
Expand All @@ -44,7 +44,7 @@ def get_zarr_metadata(
dataset=Depends(deps.dataset),
cache=Depends(deps.cache),
) -> dict:
"""Returns consolidated Zarr metadata."""
"""Consolidated Zarr metadata."""
zvariables = get_zvariables(dataset, cache)
zmetadata = get_zmetadata(dataset, cache, zvariables)

Expand All @@ -57,7 +57,7 @@ def get_zarr_group(
dataset=Depends(deps.dataset),
cache=Depends(deps.cache),
) -> dict:
"""Returns Zarr group data."""
"""Zarr group data."""
zvariables = get_zvariables(dataset, cache)
zmetadata = get_zmetadata(dataset, cache, zvariables)

Expand All @@ -68,7 +68,7 @@ def get_zarr_attrs(
dataset=Depends(deps.dataset),
cache=Depends(deps.cache),
) -> dict:
"""Returns Zarr attributes."""
"""Zarr attributes."""
zvariables = get_zvariables(dataset, cache)
zmetadata = get_zmetadata(dataset, cache, zvariables)

Expand Down
3 changes: 3 additions & 0 deletions xpublish/rest.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
)
from .routers import dataset_collection_router
from .utils.api import (
DATASET_ID_ATTR_KEY,
SingleDatasetOpenAPIOverrider,
check_route_conflicts,
normalize_app_routers,
Expand Down Expand Up @@ -163,6 +164,8 @@ def get_dataset_from_plugins(
dataset = self.pm.hook.get_dataset(dataset_id=dataset_id)

if dataset:
if dataset.attrs.get(DATASET_ID_ATTR_KEY, None) is None:
dataset.attrs[DATASET_ID_ATTR_KEY] = dataset_id
return dataset

if dataset_id not in self._datasets:
Expand Down
Loading
Loading