pydata · dcherian · Aug 7, 2024 · Aug 7, 2024 · Aug 7, 2024 · Aug 7, 2024
diff --git a/xarray/core/groupby.py b/xarray/core/groupby.py
@@ -517,6 +517,50 @@ def sizes(self) -> Mapping[Hashable, int]:
             self._sizes = self._obj.isel({self._group_dim: index}).sizes
         return self._sizes
 
+    def shuffle(self) -> None:
+        """
+        Shuffle the underlying object so that all members in a group occur sequentially.
+
+        The order of appearance is not guaranteed. This method modifies this GroupBy
+        object in place: the wrapped object is replaced by a reordered copy, and the
+        stored group indices are replaced by contiguous slices into that copy.
+
+        Use this method first if you need to map a function that requires all members
+        of a group be in a single chunk.
+
+        Raises
+        ------
+        ValueError
+            If this GroupBy does not have exactly one grouper.
+        """
+        from xarray.core.dataarray import DataArray
+        from xarray.core.dataset import Dataset
+
+        # Only a single grouper is supported; the unpacking raises ValueError otherwise.
+        (_,) = self.groupers
+        dim = self._group_dim
+        was_array = isinstance(self._obj, DataArray)
+        as_dataset = self._obj._to_temp_dataset() if was_array else self._obj
+
+        # Materialize the per-group indexers once instead of once per variable.
+        indices = list(self._group_indices)
+
+        # Seed with the original attrs so Dataset-level metadata survives the rebuild.
+        shuffled = Dataset(attrs=as_dataset.attrs)
+        for name, var in as_dataset._variables.items():
+            # Variables that do not span the grouped dimension are carried over as-is.
+            if dim in var.dims:
+                var = var._shuffle(indices=indices, dim=dim)
+            shuffled[name] = var
+        # Item assignment stores non-dimension variables as data variables, so
+        # re-mark everything that was a coordinate on the input.
+        shuffled = shuffled.set_coords(list(as_dataset.coords))
+
+        # Replace self._group_indices with slices into the shuffled object.
+        slices = []
+        start = 0
+        for idxr in indices:
+            if isinstance(idxr, slice):
+                # Slices have no len() (this is the state after a previous shuffle);
+                # count the positions they select along the pre-shuffle dimension.
+                count = len(range(*idxr.indices(as_dataset.sizes[dim])))
+            else:
+                count = len(idxr)
+            slices.append(slice(start, start + count))
+            start += count
+
+        # TODO: this breaks the invariant that
+        # self._group_indices == self.groupers[0].group_indices
+        self._group_indices = tuple(slices)
+        if was_array:
+            if TYPE_CHECKING:
+                assert isinstance(self._obj, DataArray)
+            self._obj = self._obj._from_temp_dataset(shuffled)
+        else:
+            self._obj = shuffled
+
     def map(
         self,
         func: Callable,

diff --git a/xarray/core/types.py b/xarray/core/types.py
@@ -297,7 +297,7 @@ def copy(
 ZarrWriteModes = Literal["w", "w-", "a", "a-", "r+", "r"]
 
 GroupKey = Any
-GroupIndex = Union[int, slice, list[int]]
+# Indexer for the members of one group: a slice or a list of integer positions.
+GroupIndex = Union[slice, list[int]]
 GroupIndices = tuple[GroupIndex, ...]
 Bins = Union[
     int, Sequence[int], Sequence[float], Sequence[pd.Timestamp], np.ndarray, pd.Index

diff --git a/xarray/core/variable.py b/xarray/core/variable.py
@@ -44,7 +44,13 @@
     maybe_coerce_to_str,
 )
 from xarray.namedarray.core import NamedArray, _raise_if_any_duplicate_dimensions
-from xarray.namedarray.pycompat import integer_types, is_0d_dask_array, to_duck_array
+from xarray.namedarray.parallelcompat import get_chunked_array_type
+from xarray.namedarray.pycompat import (
+    integer_types,
+    is_0d_dask_array,
+    is_chunked_array,
+    to_duck_array,
+)
 from xarray.util.deprecation_helpers import deprecate_dims
 
 NON_NUMPY_SUPPORTED_ARRAY_TYPES = (
@@ -998,6 +1004,21 @@ def compute(self, **kwargs):
         new = self.copy(deep=False)
         return new.load(**kwargs)
 
+    def _shuffle(self, indices: list[slice | list[int]], dim: Hashable) -> Self:
+        """
+        Reorder this variable along ``dim`` using one indexer per group.
+
+        Parameters
+        ----------
+        indices : list of slice or list of int
+            One indexer per group, selecting positions along ``dim``. Slices are
+            expanded to explicit integer positions using the size of ``dim``.
+        dim : Hashable
+            Name of the dimension to reorder.
+
+        Returns
+        -------
+        Variable
+            A new variable. For chunked data the reordering is delegated to the
+            chunk manager's ``shuffle``; otherwise the data is indexed with the
+            concatenation of all per-group positions.
+        """
+        size = self.sizes[dim]
+        # Bind the expanded indexers to a new name: re-annotating the ``indices``
+        # parameter with a narrower type is rejected by type checkers.
+        positions: list[list[int]] = [
+            list(range(*idx.indices(size))) if isinstance(idx, slice) else idx
+            for idx in indices
+        ]
+        array = self._data
+        if is_chunked_array(array):
+            chunkmanager = get_chunked_array_type(array)
+            shuffled = chunkmanager.shuffle(
+                array, indexer=positions, axis=self.get_axis_num(dim)
+            )
+            # The chunk manager hands back a bare chunked array; wrap it so both
+            # branches return a Variable, as the signature promises.
+            return self._replace(data=shuffled)
+        return self.isel({dim: np.concatenate(positions)})
+
     def isel(
         self,
         indexers: Mapping[Any, Any] | None = None,

diff --git a/xarray/namedarray/daskmanager.py b/xarray/namedarray/daskmanager.py
@@ -251,3 +251,12 @@ def store(
             targets=targets,
             **kwargs,
         )
+
+    def shuffle(self, x: DaskArray, indexer: list[list[int]], axis: int) -> DaskArray:
+        """
+        Shuffle ``x`` along ``axis`` by delegating to ``dask.array.shuffle``.
+
+        Parameters
+        ----------
+        x : DaskArray
+            Array to shuffle.
+        indexer : list of list of int
+            Passed through unchanged to ``dask.array.shuffle``
+            (presumably one list of positions per group; confirm against callers).
+        axis : int
+            Axis of ``x`` along which to shuffle.
+
+        Raises
+        ------
+        ValueError
+            If ``module_available("dask", minversion="2024.08.0")`` is false.
+        """
+        import dask.array as da
+
+        dask_is_recent_enough = module_available("dask", minversion="2024.08.0")
+        if dask_is_recent_enough:
+            return da.shuffle(x, indexer, axis)
+        raise ValueError(
+            "This method is very inefficient on dask<2024.08.0. Please upgrade."
+        )
diff --git a/xarray/namedarray/parallelcompat.py b/xarray/namedarray/parallelcompat.py
@@ -364,6 +364,11 @@ def compute(
         """
         raise NotImplementedError()
 
+    def shuffle(
+        self, x: T_ChunkedArray, indexer: list[list[int]], axis: int
+    ) -> T_ChunkedArray:
+        """
+        Shuffle the chunked array ``x`` along ``axis`` according to ``indexer``.
+
+        This base implementation always raises ``NotImplementedError``; a chunk
+        manager that supports shuffling is expected to override it.
+
+        Parameters
+        ----------
+        x : chunked array
+            Array to shuffle.
+        indexer : list of list of int
+            Integer positions along ``axis``, grouped into lists.
+        axis : int
+            Axis of ``x`` along which to shuffle.
+
+        Raises
+        ------
+        NotImplementedError
+            Always, unless overridden.
+        """
+        raise NotImplementedError()
+
     @property
     def array_api(self) -> Any:
         """

diff --git a/xarray/tests/__init__.py b/xarray/tests/__init__.py
@@ -106,6 +106,7 @@ def _importorskip(
     has_h5netcdf, requires_h5netcdf = _importorskip("h5netcdf")
 has_cftime, requires_cftime = _importorskip("cftime")
 has_dask, requires_dask = _importorskip("dask")
+# Flag only: the second value returned by _importorskip is discarded here.
+# 2024.08.0 is the minimum dask version the dask chunk manager's shuffle accepts.
+has_dask_ge_2024_08_0, _ = _importorskip("dask", minversion="2024.08.0")
 with warnings.catch_warnings():
     warnings.filterwarnings(
         "ignore",

diff --git a/xarray/tests/test_groupby.py b/xarray/tests/test_groupby.py
@@ -21,6 +21,7 @@
     assert_identical,
     create_test_data,
     has_cftime,
+    has_dask_ge_2024_08_0,
     has_flox,
     requires_cftime,
     requires_dask,
@@ -1293,11 +1294,26 @@ def test_groupby_sum(self) -> None:
         assert_allclose(expected_sum_axis1, grouped.reduce(np.sum, "y"))
         assert_allclose(expected_sum_axis1, grouped.sum("y"))
 
+    @pytest.mark.parametrize(
+        "shuffle",
+        [
+            pytest.param(
+                True,
+                marks=pytest.mark.skipif(
+                    not has_dask_ge_2024_08_0, reason="dask too old"
+                ),
+            ),
+            False,
+        ],
+    )
     @pytest.mark.parametrize("method", ["sum", "mean", "median"])
-    def test_groupby_reductions(self, method) -> None:
+    def test_groupby_reductions(self, method: str, shuffle: bool) -> None:
         array = self.da
         grouped = array.groupby("abc")
 
+        if shuffle:
+            grouped.shuffle()
+
         reduction = getattr(np, method)
         expected = Dataset(
             {