Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Chunk-aligned indexing #183

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
133 changes: 110 additions & 23 deletions virtualizarr/manifests/array.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import warnings
from types import EllipsisType, NoneType
from typing import Any, Callable, Union

import numpy as np
Expand Down Expand Up @@ -198,36 +199,85 @@ def astype(self, dtype: np.dtype, /, *, copy: bool = True) -> "ManifestArray":

def __getitem__(
self,
key,
key: Union[
int,
slice,
EllipsisType,
None,
tuple[Union[int, slice, EllipsisType, None], ...],
np.ndarray,
],
/,
) -> "ManifestArray":
"""
Only supports extremely limited indexing.
Only supports indexing where slices are aligned with chunk boundaries.

Only here because xarray will apparently attempt to index into its lazy indexing classes even if the operation would be a no-op anyway.
Follows the array API standard otherwise.
"""
from xarray.core.indexing import BasicIndexer
indexer = key

if isinstance(key, BasicIndexer):
indexer = key.tuple
indexer_nd: tuple[Union[int, slice, EllipsisType, None, np.ndarray], ...]
if isinstance(indexer, (int, slice, EllipsisType, NoneType, np.ndarray)):
indexer_nd = (indexer,)
else:
indexer = key
indexer_nd = indexer

indexer = _possibly_expand_trailing_ellipsis(key, self.ndim)
# _validate_indexer(indexer)
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(probably don't need this as separate function)


if len(indexer) != self.ndim:
indexer_nd = _possibly_expand_trailing_ellipses(indexer_nd, self.ndim)

if len(indexer_nd) != self.ndim:
raise ValueError(
f"Invalid indexer for array with ndim={self.ndim}: {indexer}"
f"Invalid indexer for array with ndim={self.ndim}: {indexer_nd}"
)

if all(
isinstance(axis_indexer, slice) and axis_indexer == slice(None)
for axis_indexer in indexer
chunk_slices = []
new_arr_shape = []
for axis_num, (indexer_1d, arr_length, chunk_length) in enumerate(
zip(indexer_nd, self.shape, self.chunks)
):
# indexer is all slice(None)'s, so this is a no-op
return self
else:
raise NotImplementedError(f"Doesn't support slicing with {indexer}")
if isinstance(indexer_1d, int):
array_slice_1d = slice(indexer_1d, indexer_1d + 1, 1)
elif isinstance(indexer_1d, NoneType):
array_slice_1d = slice(0, arr_length, 1)
elif isinstance(indexer_1d, slice):
array_slice_1d = slice(
indexer_1d.start if indexer_1d.start is not None else 0,
indexer_1d.stop if indexer_1d.stop is not None else arr_length,
indexer_1d.step if indexer_1d.step is not None else 1,
)
else:
# TODO we could attempt to also support indexing with numpy arrays
raise TypeError(
f"Can only perform indexing with keys of type (int, slice, EllipsisType, NoneType), but got type {type(indexer_1d)} for axis {axis_num}"
)

chunk_slice_1d = _array_slice_to_chunk_slice(
array_slice_1d, arr_length, chunk_length
)
chunk_slices.append(chunk_slice_1d)

n_elements_in_slice = abs(
(array_slice_1d.start - array_slice_1d.stop) / array_slice_1d.step
)
new_arr_shape.append(n_elements_in_slice)

print(chunk_slices)

# do slicing of entries in manifest
sliced_paths = self.manifest._paths[tuple(chunk_slices)]
sliced_offsets = self.manifest._offsets[tuple(chunk_slices)]
sliced_lengths = self.manifest._lengths[tuple(chunk_slices)]
sliced_manifest = ChunkManifest.from_arrays(
paths=sliced_paths,
offsets=sliced_offsets,
lengths=sliced_lengths,
)

# chunk sizes are unchanged by slicing that aligns with chunks
new_zarray = self.zarray.replace(shape=tuple(new_arr_shape))

return ManifestArray(chunkmanifest=sliced_manifest, zarray=new_zarray)

def rename_paths(
self,
Expand Down Expand Up @@ -270,10 +320,47 @@ def rename_paths(
return ManifestArray(zarray=self.zarray, chunkmanifest=renamed_manifest)


def _possibly_expand_trailing_ellipsis(key, ndim: int):
if key[-1] == ...:
extra_slices_needed = ndim - (len(key) - 1)
*indexer, ellipsis = key
return tuple(tuple(indexer) + (slice(None),) * extra_slices_needed)
def _array_slice_to_chunk_slice(
array_slice: slice,
arr_length: int,
chunk_length: int,
) -> slice:
"""
Translate a slice into an array into a corresponding slice into the underlying chunk grid.
Comment on lines +323 to +329
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@d-v-b this is a first step in the direction of #71


Will raise on any array slices that require slicing within individual chunks.
"""

if chunk_length == 1:
# alot of indexing is possible only in this case, because this is basically just a normal array along that axis
chunk_slice = array_slice
return chunk_slice
Comment on lines +334 to +337
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Kind of interesting to realise this. You can even do things like slice(None, None, -1) to reverse the array in this case.


# Check that start of slice aligns with start of a chunk
if array_slice.start % chunk_length != 0:
raise NotImplementedError(
f"Cannot index ManifestArray axis of length {arr_length} and chunk length {chunk_length} with {array_slice} as slice would split individual chunks"
)

# Check that slice spans integer number of chunks
slice_length = array_slice.stop - array_slice.start
if slice_length % chunk_length != 0:
raise NotImplementedError(
f"Cannot index ManifestArray axis of length {arr_length} and chunk length {chunk_length} with {array_slice} as slice would split individual chunks"
)

index_of_first_chunk = int(array_slice.start / chunk_length)
n_chunks = int(slice_length / chunk_length)

chunk_slice = slice(index_of_first_chunk, index_of_first_chunk + n_chunks, 1)

return chunk_slice


def _possibly_expand_trailing_ellipses(indexer, ndim: int):
if indexer[-1] == ...:
extra_slices_needed = ndim - (len(indexer) - 1)
*indexer_1d, ellipsis = indexer
return tuple(tuple(indexer_1d) + (slice(None),) * extra_slices_needed)
else:
return key
return indexer
23 changes: 23 additions & 0 deletions virtualizarr/tests/test_manifests/test_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -337,3 +337,26 @@ def test_refuse_combine():
for func in [np.concatenate, np.stack]:
with pytest.raises(ValueError, match="inconsistent dtypes"):
func([marr1, marr2], axis=0)


class TestIndexing:
def test_slice_aligned_with_chunks(self):
marr = create_manifestarray(shape=(4,), chunks=(2,))
marr[0:2]
marr[2:4]
marr[0:4]
Comment on lines +344 to +347
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Need to come back and improve these placeholder tests


with pytest.raises(
NotImplementedError, match="slice would split individual chunks"
):
marr[0]

with pytest.raises(
NotImplementedError, match="slice would split individual chunks"
):
marr[0:1]

with pytest.raises(
NotImplementedError, match="slice would split individual chunks"
):
marr[0:3]
Loading