From 27adc36f9ff2332be0bb082ad44c1a996744f9d0 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Thu, 11 Jul 2024 14:04:56 -0400 Subject: [PATCH 1/2] basic test --- .../tests/test_manifests/test_array.py | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/virtualizarr/tests/test_manifests/test_array.py b/virtualizarr/tests/test_manifests/test_array.py index 459e60b..c31b6a3 100644 --- a/virtualizarr/tests/test_manifests/test_array.py +++ b/virtualizarr/tests/test_manifests/test_array.py @@ -337,3 +337,26 @@ def test_refuse_combine(): for func in [np.concatenate, np.stack]: with pytest.raises(ValueError, match="inconsistent dtypes"): func([marr1, marr2], axis=0) + + +class TestIndexing: + def test_slice_aligned_with_chunks(self): + marr = create_manifestarray(shape=(4,), chunks=(2,)) + marr[0:2] + marr[2:4] + marr[0:4] + + with pytest.raises( + NotImplementedError, match="slice would split individual chunks" + ): + marr[0] + + with pytest.raises( + NotImplementedError, match="slice would split individual chunks" + ): + marr[0:1] + + with pytest.raises( + NotImplementedError, match="slice would split individual chunks" + ): + marr[0:3] From efac3157cf30059cf79f084307030fea3abed7ad Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Thu, 11 Jul 2024 14:05:10 -0400 Subject: [PATCH 2/2] implementation --- virtualizarr/manifests/array.py | 133 ++++++++++++++++++++++++++------ 1 file changed, 110 insertions(+), 23 deletions(-) diff --git a/virtualizarr/manifests/array.py b/virtualizarr/manifests/array.py index a0983de..b6839bc 100644 --- a/virtualizarr/manifests/array.py +++ b/virtualizarr/manifests/array.py @@ -1,4 +1,5 @@ import warnings +from types import EllipsisType, NoneType from typing import Any, Callable, Union import numpy as np @@ -193,36 +194,85 @@ def astype(self, dtype: np.dtype, /, *, copy: bool = True) -> "ManifestArray": def __getitem__( self, - key, + key: Union[ + int, + slice, + EllipsisType, + None, + tuple[Union[int, slice, EllipsisType, None], ...], + np.ndarray, + ], /, ) -> "ManifestArray": """ - Only supports extremely limited indexing. + Only supports indexing where slices are aligned with chunk boundaries. - Only here because xarray will apparently attempt to index into its lazy indexing classes even if the operation would be a no-op anyway. + Follows the array API standard otherwise. """ - from xarray.core.indexing import BasicIndexer + indexer = key - if isinstance(key, BasicIndexer): - indexer = key.tuple + indexer_nd: tuple[Union[int, slice, EllipsisType, None, np.ndarray], ...] + if isinstance(indexer, (int, slice, EllipsisType, NoneType, np.ndarray)): + indexer_nd = (indexer,) else: - indexer = key + indexer_nd = indexer - indexer = _possibly_expand_trailing_ellipsis(key, self.ndim) + # _validate_indexer(indexer) - if len(indexer) != self.ndim: + indexer_nd = _possibly_expand_trailing_ellipses(indexer_nd, self.ndim) + + if len(indexer_nd) != self.ndim: raise ValueError( - f"Invalid indexer for array with ndim={self.ndim}: {indexer}" + f"Invalid indexer for array with ndim={self.ndim}: {indexer_nd}" ) - if all( - isinstance(axis_indexer, slice) and axis_indexer == slice(None) - for axis_indexer in indexer + chunk_slices = [] + new_arr_shape = [] + for axis_num, (indexer_1d, arr_length, chunk_length) in enumerate( + zip(indexer_nd, self.shape, self.chunks) ): - # indexer is all slice(None)'s, so this is a no-op - return self - else: - raise NotImplementedError(f"Doesn't support slicing with {indexer}") + if isinstance(indexer_1d, int): + array_slice_1d = slice(indexer_1d, indexer_1d + 1, 1) + elif isinstance(indexer_1d, NoneType): + array_slice_1d = slice(0, arr_length, 1) + elif isinstance(indexer_1d, slice): + array_slice_1d = slice( + indexer_1d.start if indexer_1d.start is not None else 0, + indexer_1d.stop if indexer_1d.stop is not None else arr_length, + indexer_1d.step if indexer_1d.step is not None else 1, + ) + else: + # TODO we could attempt to also support indexing with numpy arrays + raise TypeError( + f"Can only perform indexing with keys of type (int, slice, EllipsisType, NoneType), but got type {type(indexer_1d)} for axis {axis_num}" + ) + + chunk_slice_1d = _array_slice_to_chunk_slice( + array_slice_1d, arr_length, chunk_length + ) + chunk_slices.append(chunk_slice_1d) + + n_elements_in_slice = abs( + (array_slice_1d.start - array_slice_1d.stop) / array_slice_1d.step + ) + new_arr_shape.append(n_elements_in_slice) + + print(chunk_slices) + + # do slicing of entries in manifest + sliced_paths = self.manifest._paths[tuple(chunk_slices)] + sliced_offsets = self.manifest._offsets[tuple(chunk_slices)] + sliced_lengths = self.manifest._lengths[tuple(chunk_slices)] + sliced_manifest = ChunkManifest.from_arrays( + paths=sliced_paths, + offsets=sliced_offsets, + lengths=sliced_lengths, + ) + + # chunk sizes are unchanged by slicing that aligns with chunks + new_zarray = self.zarray.replace(shape=tuple(new_arr_shape)) + + return ManifestArray(chunkmanifest=sliced_manifest, zarray=new_zarray) def rename_paths( self, @@ -265,10 +315,47 @@ def rename_paths( return ManifestArray(zarray=self.zarray, chunkmanifest=renamed_manifest) -def _possibly_expand_trailing_ellipsis(key, ndim: int): - if key[-1] == ...: - extra_slices_needed = ndim - (len(key) - 1) - *indexer, ellipsis = key - return tuple(tuple(indexer) + (slice(None),) * extra_slices_needed) +def _array_slice_to_chunk_slice( + array_slice: slice, + arr_length: int, + chunk_length: int, +) -> slice: + """ + Translate a slice into an array into a corresponding slice into the underlying chunk grid. + + Will raise on any array slices that require slicing within individual chunks. + """ + + if chunk_length == 1: + # alot of indexing is possible only in this case, because this is basically just a normal array along that axis + chunk_slice = array_slice + return chunk_slice + + # Check that start of slice aligns with start of a chunk + if array_slice.start % chunk_length != 0: + raise NotImplementedError( + f"Cannot index ManifestArray axis of length {arr_length} and chunk length {chunk_length} with {array_slice} as slice would split individual chunks" + ) + + # Check that slice spans integer number of chunks + slice_length = array_slice.stop - array_slice.start + if slice_length % chunk_length != 0: + raise NotImplementedError( + f"Cannot index ManifestArray axis of length {arr_length} and chunk length {chunk_length} with {array_slice} as slice would split individual chunks" + ) + + index_of_first_chunk = int(array_slice.start / chunk_length) + n_chunks = int(slice_length / chunk_length) + + chunk_slice = slice(index_of_first_chunk, index_of_first_chunk + n_chunks, 1) + + return chunk_slice + + +def _possibly_expand_trailing_ellipses(indexer, ndim: int): + if indexer[-1] == ...: + extra_slices_needed = ndim - (len(indexer) - 1) + *indexer_1d, ellipsis = indexer + return tuple(tuple(indexer_1d) + (slice(None),) * extra_slices_needed) else: - return key + return indexer