pydata · dcherian · Aug 7, 2024 · Aug 7, 2024 · Aug 7, 2024 · Aug 7, 2024
diff --git a/doc/api.rst b/doc/api.rst
@@ -250,6 +250,7 @@ Reshaping and reorganizing
    Dataset.roll
    Dataset.pad
    Dataset.sortby
+   Dataset.shuffle_by
    Dataset.broadcast_like
 
 DataArray
@@ -588,6 +589,7 @@ Reshaping and reorganizing
    DataArray.roll
    DataArray.pad
    DataArray.sortby
+   DataArray.shuffle_by
    DataArray.broadcast_like
 
 IO / Conversion
@@ -771,6 +773,7 @@ Dataset
    DatasetGroupBy.var
    DatasetGroupBy.dims
    DatasetGroupBy.groups
+   DatasetGroupBy.shuffle
 
 DataArray
 ---------
@@ -802,6 +805,7 @@ DataArray
    DataArrayGroupBy.var
    DataArrayGroupBy.dims
    DataArrayGroupBy.groups
+   DataArrayGroupBy.shuffle
 
 Grouper Objects
 ---------------

diff --git a/xarray/core/common.py b/xarray/core/common.py
@@ -52,7 +52,7 @@
         T_Variable,
     )
     from xarray.core.variable import Variable
-    from xarray.groupers import Resampler
+    from xarray.groupers import Grouper, Resampler
 
     DTypeMaybeMapping = Union[DTypeLikeSave, Mapping[Any, DTypeLikeSave]]
 
@@ -874,6 +874,68 @@ def rolling_exp(
 
         return rolling_exp.RollingExp(self, window, window_type)
 
+    def shuffle_by(
+        self,
+        group: Hashable | DataArray | Mapping[Any, Grouper] | None = None,
+        chunks: T_Chunks = None,
+        **groupers: Grouper,
+    ) -> Self:
+        """
+        Sort or "shuffle" this object by a Grouper.
+
+        "Shuffle" means the object is sorted so that all group members occur sequentially,
+        in the same chunk. Multiple groups may occur in the same chunk.
+        This method is particularly useful for chunked arrays (e.g. dask, cubed).
+        For chunked array types, the order of appearance is not guaranteed, but will depend on
+        the input chunking.
+
+        Parameters
+        ----------
+        group : Hashable or DataArray or IndexVariable or mapping of Hashable to Grouper
+            Array whose unique values should be used to group this array. If a
+            Hashable, must be the name of a coordinate contained in this dataarray. If a dictionary,
+            must map an existing variable name to a :py:class:`Grouper` instance.
+        chunks : int, tuple of int, "auto" or mapping of hashable to int or a TimeResampler, optional
+            How to adjust chunks along dimensions not present in the array being grouped by.
+        **groupers : Grouper
+           Grouper objects using which to shuffle the data.
+
+        Examples
+        --------
+        >>> import dask
+        >>> from xarray.groupers import UniqueGrouper
+        >>> da = xr.DataArray(
+        ...     dims="x",
+        ...     data=dask.array.arange(10, chunks=1),
+        ...     coords={"x": [1, 2, 3, 1, 2, 3, 1, 2, 3, 0]},
+        ...     name="a",
+        ... )
+        >>> da
+        <xarray.DataArray 'a' (x: 10)> Size: 80B
+        dask.array<arange, shape=(10,), dtype=int64, chunksize=(1,), chunktype=numpy.ndarray>
+        Coordinates:
+          * x        (x) int64 80B 1 2 3 1 2 3 1 2 3 0
+
+        >>> da.shuffle_by(x=UniqueGrouper())
+        <xarray.DataArray 'a' (x: 10)> Size: 80B
+        dask.array<shuffle, shape=(10,), dtype=int64, chunksize=(3,), chunktype=numpy.ndarray>
+        Coordinates:
+          * x        (x) int64 80B 0 1 1 1 2 2 2 3 3 3
+
+        Returns
+        -------
+        DataArray or Dataset
+            The same type as this object
+
+        See Also
+        --------
+        DataArrayGroupBy.shuffle
+        DatasetGroupBy.shuffle
+        dask.dataframe.DataFrame.shuffle
+        dask.array.shuffle
+        """
+        return self.groupby(group=group, **groupers)._shuffle_obj(chunks)
+
     def _resample(
         self,
         resample_cls: type[T_Resample],

diff --git a/xarray/core/groupby.py b/xarray/core/groupby.py
@@ -45,14 +45,15 @@
     peek_at,
 )
 from xarray.core.variable import IndexVariable, Variable
+from xarray.namedarray.pycompat import is_chunked_array
 from xarray.util.deprecation_helpers import _deprecate_positional_args
 
 if TYPE_CHECKING:
     from numpy.typing import ArrayLike
 
     from xarray.core.dataarray import DataArray
     from xarray.core.dataset import Dataset
-    from xarray.core.types import GroupIndex, GroupIndices, GroupKey
+    from xarray.core.types import GroupIndex, GroupIndices, GroupKey, Self, T_Chunks
     from xarray.core.utils import Frozen
     from xarray.groupers import Grouper
 
@@ -517,6 +518,90 @@ def sizes(self) -> Mapping[Hashable, int]:
             self._sizes = self._obj.isel({self._group_dim: index}).sizes
         return self._sizes
 
+    def shuffle(self, chunks: T_Chunks = None) -> Self:
+        """
+        Sort or "shuffle" the underlying object.
+
+        "Shuffle" means the object is sorted so that all group members occur sequentially,
+        in the same chunk. Multiple groups may occur in the same chunk.
+        This method is particularly useful for chunked arrays (e.g. dask, cubed).
+        particularly when you need to map a function that requires all members of a group
+        to be present in a single chunk. For chunked array types, the order of appearance
+        is not guaranteed, but will depend on the input chunking.
+
+        Parameters
+        ----------
+        chunks : int, tuple of int, "auto" or mapping of hashable to int or a TimeResampler, optional
+            How to adjust chunks along dimensions not present in the array being grouped by.
+
+        Returns
+        -------
+        DataArrayGroupBy or DatasetGroupBy
+
+        Examples
+        --------
+        >>> import dask
+        >>> da = xr.DataArray(
+        ...     dims="x",
+        ...     data=dask.array.arange(10, chunks=3),
+        ...     coords={"x": [1, 2, 3, 1, 2, 3, 1, 2, 3, 0]},
+        ...     name="a",
+        ... )
+        >>> shuffled = da.groupby("x").shuffle()
+        >>> shuffled.quantile(q=0.5).compute()
+        <xarray.DataArray 'a' (x: 4)> Size: 32B
+        array([9., 3., 4., 5.])
+        Coordinates:
+            quantile  float64 8B 0.5
+          * x         (x) int64 32B 0 1 2 3
+
+        See Also
+        --------
+        dask.dataframe.shuffle
+        dask.array.shuffle
+        """
+        (grouper,) = self.groupers
+        return self._shuffle_obj(chunks).groupby(
+            {grouper.name: grouper.grouper.reset()},
+            restore_coord_dims=self._restore_coord_dims,
+        )
+
+    def _shuffle_obj(self, chunks: T_Chunks) -> T_DataWithCoords:
+        from xarray.core.dataarray import DataArray
+
+        (grouper,) = self.groupers
+        dim = self._group_dim
+        size = self._obj.sizes[dim]
+        was_array = isinstance(self._obj, DataArray)
+        as_dataset = self._obj._to_temp_dataset() if was_array else self._obj
+        no_slices: list[list[int]] = [
+            list(range(*idx.indices(size))) if isinstance(idx, slice) else idx
+            for idx in self._group_indices
+        ]
+
+        if grouper.name not in as_dataset._variables:
+            as_dataset.coords[grouper.name] = grouper.group1d
+
+        # Shuffling is only different from `isel` for chunked arrays.
+        # Extract them out, and treat them specially. The rest, we route through isel.
+        # This makes it easy to ensure correct handling of indexes.
+        is_chunked = {
+            name: var
+            for name, var in as_dataset._variables.items()
+            if is_chunked_array(var._data)
+        }
+        subset = as_dataset[
+            [name for name in as_dataset._variables if name not in is_chunked]
+        ]
+        shuffled = subset.isel({dim: np.concatenate(no_slices)})
+        for name, var in is_chunked.items():
+            shuffled[name] = var._shuffle(
+                indices=list(self._group_indices), dim=dim, chunks=chunks
+            )
+        shuffled = self._maybe_unstack(shuffled)
+        new_obj = self._obj._from_temp_dataset(shuffled) if was_array else shuffled
+        return new_obj
+
     def map(
         self,
         func: Callable,

diff --git a/xarray/core/types.py b/xarray/core/types.py
@@ -298,7 +298,7 @@ def copy(
 ZarrWriteModes = Literal["w", "w-", "a", "a-", "r+", "r"]
 
 GroupKey = Any
-GroupIndex = Union[int, slice, list[int]]
+GroupIndex = Union[slice, list[int]]
 GroupIndices = tuple[GroupIndex, ...]
 Bins = Union[
     int, Sequence[int], Sequence[float], Sequence[pd.Timestamp], np.ndarray, pd.Index

diff --git a/xarray/core/variable.py b/xarray/core/variable.py
@@ -44,7 +44,13 @@
     maybe_coerce_to_str,
 )
 from xarray.namedarray.core import NamedArray, _raise_if_any_duplicate_dimensions
-from xarray.namedarray.pycompat import integer_types, is_0d_dask_array, to_duck_array
+from xarray.namedarray.parallelcompat import get_chunked_array_type
+from xarray.namedarray.pycompat import (
+    integer_types,
+    is_0d_dask_array,
+    is_chunked_array,
+    to_duck_array,
+)
 from xarray.util.deprecation_helpers import deprecate_dims
 
 NON_NUMPY_SUPPORTED_ARRAY_TYPES = (
@@ -998,6 +1004,24 @@ def compute(self, **kwargs):
         new = self.copy(deep=False)
         return new.load(**kwargs)
 
+    def _shuffle(
+        self, indices: list[list[int]], dim: Hashable, chunks: T_Chunks
+    ) -> Self:
+        array = self._data
+        if is_chunked_array(array):
+            chunkmanager = get_chunked_array_type(array)
+            return self._replace(
+                data=chunkmanager.shuffle(
+                    array,
+                    indexer=indices,
+                    axis=self.get_axis_num(dim),
+                    chunks=chunks,
+                )
+            )
+        else:
+            assert False, "this should be unreachable"
+            return self.isel({dim: np.concatenate(indices)})
+
     def isel(
         self,
         indexers: Mapping[Any, Any] | None = None,

diff --git a/xarray/groupers.py b/xarray/groupers.py
@@ -20,7 +20,7 @@
 from xarray.core.groupby import T_Group, _DummyGroup
 from xarray.core.indexes import safe_cast_to_index
 from xarray.core.resample_cftime import CFTimeGrouper
-from xarray.core.types import Bins, DatetimeLike, GroupIndices, SideOptions
+from xarray.core.types import Bins, DatetimeLike, GroupIndices, Self, SideOptions
 from xarray.core.variable import Variable
 
 __all__ = [
@@ -90,6 +90,13 @@ def factorize(self, group: T_Group) -> EncodedGroups:
         """
         pass
 
+    @abstractmethod
+    def reset(self) -> Self:
+        """
+        Creates a new version of this Grouper clearing any caches.
+        """
+        pass
+
 
 class Resampler(Grouper):
     """
@@ -114,6 +121,9 @@ def group_as_index(self) -> pd.Index:
             self._group_as_index = self.group.to_index()
         return self._group_as_index
 
+    def reset(self) -> Self:
+        return type(self)()
+
     def factorize(self, group1d: T_Group) -> EncodedGroups:
         self.group = group1d
 
@@ -221,6 +231,16 @@ class BinGrouper(Grouper):
     include_lowest: bool = False
     duplicates: Literal["raise", "drop"] = "raise"
 
+    def reset(self) -> Self:
+        return type(self)(
+            bins=self.bins,
+            right=self.right,
+            labels=self.labels,
+            precision=self.precision,
+            include_lowest=self.include_lowest,
+            duplicates=self.duplicates,
+        )
+
     def __post_init__(self) -> None:
         if duck_array_ops.isnull(self.bins).all():
             raise ValueError("All bin edges are NaN.")
@@ -302,6 +322,15 @@ class TimeResampler(Resampler):
     index_grouper: CFTimeGrouper | pd.Grouper = field(init=False, repr=False)
     group_as_index: pd.Index = field(init=False, repr=False)
 
+    def reset(self) -> Self:
+        return type(self)(
+            freq=self.freq,
+            closed=self.closed,
+            label=self.label,
+            origin=self.origin,
+            offset=self.offset,
+        )
+
     def _init_properties(self, group: T_Group) -> None:
         from xarray import CFTimeIndex
 

diff --git a/xarray/namedarray/_typing.py b/xarray/namedarray/_typing.py
@@ -78,7 +78,8 @@ def dtype(self) -> _DType_co: ...
 _Chunks = tuple[_Shape, ...]
 _NormalizedChunks = tuple[tuple[int, ...], ...]
 # FYI in some cases we don't allow `None`, which this doesn't take account of.
-T_ChunkDim: TypeAlias = int | Literal["auto"] | None | tuple[int, ...]
+# # FYI the `str` is for a size string, e.g. "16MB", supported by dask.
+T_ChunkDim: TypeAlias = str | int | Literal["auto"] | None | tuple[int, ...]
 # We allow the tuple form of this (though arguably we could transition to named dims only)
 T_Chunks: TypeAlias = T_ChunkDim | Mapping[Any, T_ChunkDim]
 

diff --git a/xarray/namedarray/daskmanager.py b/xarray/namedarray/daskmanager.py
@@ -251,3 +251,16 @@ def store(
             targets=targets,
             **kwargs,
         )
+
+    def shuffle(
+        self, x: DaskArray, indexer: list[list[int]], axis: int, chunks: T_Chunks
+    ) -> DaskArray:
+        import dask.array
+
+        if not module_available("dask", minversion="2024.08.0"):
+            raise ValueError(
+                "This method is very inefficient on dask<2024.08.0. Please upgrade."
+            )
+        if chunks is not None:
+            raise NotImplementedError
+        return dask.array.shuffle(x, indexer, axis)
diff --git a/xarray/namedarray/parallelcompat.py b/xarray/namedarray/parallelcompat.py
@@ -19,6 +19,7 @@
 
 if TYPE_CHECKING:
     from xarray.namedarray._typing import (
+        T_Chunks,
         _Chunks,
         _DType,
         _DType_co,
@@ -356,6 +357,11 @@ def compute(
         """
         raise NotImplementedError()
 
+    def shuffle(
+        self, x: T_ChunkedArray, indexer: list[list[int]], axis: int, chunks: T_Chunks
+    ) -> T_ChunkedArray:
+        raise NotImplementedError()
+
     @property
     def array_api(self) -> Any:
         """

diff --git a/xarray/tests/__init__.py b/xarray/tests/__init__.py
@@ -106,6 +106,7 @@ def _importorskip(
     has_h5netcdf, requires_h5netcdf = _importorskip("h5netcdf")
 has_cftime, requires_cftime = _importorskip("cftime")
 has_dask, requires_dask = _importorskip("dask")
+has_dask_ge_2024_08_0, _ = _importorskip("dask", minversion="2024.08.0")
 with warnings.catch_warnings():
     warnings.filterwarnings(
         "ignore",