pydata · dcherian · Aug 7, 2024 · Aug 7, 2024 · Aug 7, 2024 · Aug 7, 2024
diff --git a/xarray/core/groupby.py b/xarray/core/groupby.py
@@ -4,7 +4,7 @@
 import warnings
 from collections.abc import Hashable, Iterator, Mapping, Sequence
 from dataclasses import dataclass, field
-from typing import TYPE_CHECKING, Any, Callable, Generic, Literal, Union
+from typing import TYPE_CHECKING, Any, Callable, Generic, Literal, Self, Union
 
 import numpy as np
 import pandas as pd
@@ -45,6 +45,7 @@
     peek_at,
 )
 from xarray.core.variable import IndexVariable, Variable
+from xarray.namedarray.pycompat import is_chunked_array
 from xarray.util.deprecation_helpers import _deprecate_positional_args
 
 if TYPE_CHECKING:
@@ -517,6 +518,51 @@ def sizes(self) -> Mapping[Hashable, int]:
             self._sizes = self._obj.isel({self._group_dim: index}).sizes
         return self._sizes
 
+    def shuffle(self) -> Self:
+        """
+        Shuffle the underlying object so that all members in a group occur sequentially.
+
+        The order of appearance is not guaranteed.
+
+        Use this method first if you need to map a function that requires all members of a group
+        be in a single chunk.
+
+        Returns
+        -------
+        A new GroupBy over the shuffled object, created by calling ``groupby``
+        with a ``reset()`` copy of the original grouper and the same
+        ``restore_coord_dims`` setting.
+        """
+        from xarray.core.dataarray import DataArray
+
+        # Exactly one grouper is expected; the unpacking raises otherwise.
+        (grouper,) = self.groupers
+        dim = self._group_dim
+        size = self._obj.sizes[dim]
+        was_array = isinstance(self._obj, DataArray)
+        as_dataset = self._obj._to_temp_dataset() if was_array else self._obj
+        # Expand every slice into explicit integer positions so that both the
+        # `isel` path and the chunked path receive plain lists of ints.
+        no_slices: list[list[int]] = [
+            list(range(*idx.indices(size))) if isinstance(idx, slice) else idx
+            for idx in self._group_indices
+        ]
+
+        # NOTE(review): when the grouped object is not a DataArray, `as_dataset`
+        # is `self._obj` itself, so this assignment writes to it -- confirm
+        # that adding the coordinate to the caller's object is intended.
+        if grouper.name not in as_dataset._variables:
+            as_dataset.coords[grouper.name] = grouper.group1d
+
+        # Shuffling is only different from `isel` for chunked arrays.
+        # Extract them out, and treat them specially. The rest, we route through isel.
+        # This makes it easy to ensure correct handling of indexes.
+        is_chunked = {
+            name: var
+            for name, var in as_dataset._variables.items()
+            if is_chunked_array(var._data)
+        }
+        subset = as_dataset[
+            [name for name in as_dataset._variables if name not in is_chunked]
+        ]
+        shuffled = subset.isel({dim: np.concatenate(no_slices)})
+        for name, var in is_chunked.items():
+            # Pass the slice-free indices: `Variable._shuffle` is typed to take
+            # `list[list[int]]`, whereas the raw group indices may hold slices.
+            shuffled[name] = var._shuffle(indices=no_slices, dim=dim)
+        shuffled = self._maybe_unstack(shuffled)
+        new_obj = self._obj._from_temp_dataset(shuffled) if was_array else shuffled
+        # Regroup with a fresh grouper so no state cached for the unshuffled
+        # object is reused.
+        return new_obj.groupby(
+            {grouper.name: grouper.grouper.reset()},
+            restore_coord_dims=self._restore_coord_dims,
+        )
+
     def map(
         self,
         func: Callable,

diff --git a/xarray/core/types.py b/xarray/core/types.py
@@ -297,7 +297,7 @@ def copy(
 ZarrWriteModes = Literal["w", "w-", "a", "a-", "r+", "r"]
 
 GroupKey = Any
-GroupIndex = Union[int, slice, list[int]]
+GroupIndex = Union[slice, list[int]]
 GroupIndices = tuple[GroupIndex, ...]
 Bins = Union[
     int, Sequence[int], Sequence[float], Sequence[pd.Timestamp], np.ndarray, pd.Index

diff --git a/xarray/core/variable.py b/xarray/core/variable.py
@@ -44,7 +44,13 @@
     maybe_coerce_to_str,
 )
 from xarray.namedarray.core import NamedArray, _raise_if_any_duplicate_dimensions
-from xarray.namedarray.pycompat import integer_types, is_0d_dask_array, to_duck_array
+from xarray.namedarray.parallelcompat import get_chunked_array_type
+from xarray.namedarray.pycompat import (
+    integer_types,
+    is_0d_dask_array,
+    is_chunked_array,
+    to_duck_array,
+)
 from xarray.util.deprecation_helpers import deprecate_dims
 
 NON_NUMPY_SUPPORTED_ARRAY_TYPES = (
@@ -998,6 +1004,19 @@ def compute(self, **kwargs):
         new = self.copy(deep=False)
         return new.load(**kwargs)
 
+    def _shuffle(self, indices: list[list[int]], dim: Hashable) -> Self:
+        """
+        Reorder this variable along ``dim`` using the given groups of positions.
+
+        Parameters
+        ----------
+        indices : list of list of int
+            Integer positions along ``dim``, one list per group.
+        dim : Hashable
+            Name of the dimension to reorder.
+
+        Returns
+        -------
+        A new variable. For chunked data the work is delegated to the chunk
+        manager's ``shuffle``; otherwise the positions are concatenated and
+        applied with ``isel``.
+        """
+        array = self._data
+        if is_chunked_array(array):
+            chunkmanager = get_chunked_array_type(array)
+            return self._replace(
+                data=chunkmanager.shuffle(
+                    array, indexer=indices, axis=self.get_axis_num(dim)
+                )
+            )
+        # Non-chunked data: plain positional indexing gives the same ordering.
+        # (Previously guarded by `assert False`, which is stripped under
+        # `python -O` and so behaved differently depending on interpreter flags.)
+        return self.isel({dim: np.concatenate(indices)})
+
     def isel(
         self,
         indexers: Mapping[Any, Any] | None = None,

diff --git a/xarray/groupers.py b/xarray/groupers.py
@@ -9,7 +9,7 @@
 import datetime
 from abc import ABC, abstractmethod
 from dataclasses import dataclass, field
-from typing import Any, Literal, cast
+from typing import Any, Literal, Self, cast
 
 import numpy as np
 import pandas as pd
@@ -90,6 +90,13 @@ def factorize(self, group: T_Group) -> EncodedGroups:
         """
         pass
 
+    @abstractmethod
+    def reset(self) -> Self:
+        """
+        Return a new instance of this Grouper with any cached state cleared.
+
+        Subclasses must override this method.
+        """
+        ...
+
 
 class Resampler(Grouper):
     """
@@ -114,6 +121,9 @@ def group_as_index(self) -> pd.Index:
             self._group_as_index = self.group.to_index()
         return self._group_as_index
 
+    def reset(self) -> Self:
+        """Return a new, argument-free instance of this grouper's own type."""
+        grouper_cls = type(self)
+        return grouper_cls()
+
     def factorize(self, group1d: T_Group) -> EncodedGroups:
         self.group = group1d
 
@@ -221,6 +231,16 @@ class BinGrouper(Grouper):
     include_lowest: bool = False
     duplicates: Literal["raise", "drop"] = "raise"
 
+    def reset(self) -> Self:
+        """
+        Return a new grouper of the same type built from this one's settings.
+
+        Only the constructor arguments listed below are carried over.
+        """
+        settings: dict[str, Any] = {
+            "bins": self.bins,
+            "right": self.right,
+            "labels": self.labels,
+            "precision": self.precision,
+            "include_lowest": self.include_lowest,
+            "duplicates": self.duplicates,
+        }
+        grouper_cls = type(self)
+        return grouper_cls(**settings)
+
     def __post_init__(self) -> None:
         if duck_array_ops.isnull(self.bins).all():
             raise ValueError("All bin edges are NaN.")
@@ -302,6 +322,15 @@ class TimeResampler(Resampler):
     index_grouper: CFTimeGrouper | pd.Grouper = field(init=False, repr=False)
     group_as_index: pd.Index = field(init=False, repr=False)
 
+    def reset(self) -> Self:
+        """
+        Return a new resampler of the same type built from this one's settings.
+
+        Only ``freq``, ``closed``, ``label``, ``origin`` and ``offset`` are
+        passed to the constructor.
+        """
+        settings: dict[str, Any] = {
+            "freq": self.freq,
+            "closed": self.closed,
+            "label": self.label,
+            "origin": self.origin,
+            "offset": self.offset,
+        }
+        resampler_cls = type(self)
+        return resampler_cls(**settings)
+
     def _init_properties(self, group: T_Group) -> None:
         from xarray import CFTimeIndex
 

diff --git a/xarray/namedarray/daskmanager.py b/xarray/namedarray/daskmanager.py
@@ -251,3 +251,12 @@ def store(
             targets=targets,
             **kwargs,
         )
+
+    def shuffle(self, x: DaskArray, indexer: list[list[int]], axis: int) -> DaskArray:
+        """
+        Shuffle ``x`` along ``axis`` by delegating to ``dask.array.shuffle``.
+
+        Raises
+        ------
+        ValueError
+            If the installed dask is older than 2024.08.0.
+        """
+        import dask.array
+
+        # Supported versions take the early return; anything older is rejected.
+        if module_available("dask", minversion="2024.08.0"):
+            return dask.array.shuffle(x, indexer, axis)
+
+        raise ValueError(
+            "This method is very inefficient on dask<2024.08.0. Please upgrade."
+        )
diff --git a/xarray/namedarray/parallelcompat.py b/xarray/namedarray/parallelcompat.py
@@ -364,6 +364,11 @@ def compute(
         """
         raise NotImplementedError()
 
+    def shuffle(
+        self, x: T_ChunkedArray, indexer: list[list[int]], axis: int
+    ) -> T_ChunkedArray:
+        """
+        Shuffle the chunked array ``x`` along ``axis`` according to ``indexer``.
+
+        This base implementation always raises ``NotImplementedError``.
+        """
+        raise NotImplementedError()
+
     @property
     def array_api(self) -> Any:
         """

diff --git a/xarray/tests/__init__.py b/xarray/tests/__init__.py
@@ -106,6 +106,7 @@ def _importorskip(
     has_h5netcdf, requires_h5netcdf = _importorskip("h5netcdf")
 has_cftime, requires_cftime = _importorskip("cftime")
 has_dask, requires_dask = _importorskip("dask")
+has_dask_ge_2024_08_0, _ = _importorskip("dask", minversion="2024.08.0")
 with warnings.catch_warnings():
     warnings.filterwarnings(
         "ignore",

diff --git a/xarray/tests/test_groupby.py b/xarray/tests/test_groupby.py
@@ -2,6 +2,7 @@
 
 import operator
 import warnings
+from typing import Literal
 from unittest import mock
 
 import numpy as np
@@ -21,7 +22,9 @@
     assert_identical,
     create_test_data,
     has_cftime,
+    has_dask_ge_2024_08_0,
     has_flox,
+    raise_if_dask_computes,
     requires_cftime,
     requires_dask,
     requires_flox,
@@ -582,7 +585,12 @@
 
 @pytest.mark.filterwarnings("ignore:Converting non-nanosecond")
 @pytest.mark.filterwarnings("ignore:invalid value encountered in divide:RuntimeWarning")
-def test_groupby_drops_nans() -> None:
+@pytest.mark.parametrize("shuffle", [True, False])
+@pytest.mark.parametrize("chunk", [dict(lat=1), dict(lat=2, lon=2), False])
+def test_groupby_drops_nans(shuffle: bool, chunk: Literal[False] | dict) -> None:
+    xr.set_options(use_flox=False)  # TODO: remove
+    if shuffle and chunk and not has_dask_ge_2024_08_0:
+        pytest.skip()
     # GH2383
     # nan in 2D data variable (requires stacking)
     ds = xr.Dataset(
@@ -597,13 +605,17 @@
     ds["id"].values[3, 0] = np.nan
     ds["id"].values[-1, -1] = np.nan
 
+    if chunk:
+        ds = ds.chunk(chunk)
     grouped = ds.groupby(ds.id)
+    if shuffle:
+        grouped = grouped.shuffle()
 
     # non reduction operation
     expected1 = ds.copy()
-    expected1.variable.values[0, 0, :] = np.nan
-    expected1.variable.values[-1, -1, :] = np.nan
-    expected1.variable.values[3, 0, :] = np.nan
+    expected1.variable.data[0, 0, :] = np.nan
+    expected1.variable.data[-1, -1, :] = np.nan
+    expected1.variable.data[3, 0, :] = np.nan
     actual1 = grouped.map(lambda x: x).transpose(*ds.variable.dims)
     assert_identical(actual1, expected1)
 
@@ -1293,11 +1305,19 @@
         assert_allclose(expected_sum_axis1, grouped.reduce(np.sum, "y"))
         assert_allclose(expected_sum_axis1, grouped.sum("y"))
 
+    @pytest.mark.parametrize("use_flox", [True, False])
+    @pytest.mark.parametrize("shuffle", [True, False])
+    @pytest.mark.parametrize("chunk", [True, False])
     @pytest.mark.parametrize("method", ["sum", "mean", "median"])
-    def test_groupby_reductions(self, method) -> None:
-        array = self.da
-        grouped = array.groupby("abc")
+    def test_groupby_reductions(
+        self, use_flox: bool, method: str, shuffle: bool, chunk: bool
+    ) -> None:
+        if shuffle and chunk and not has_dask_ge_2024_08_0:
+            pytest.skip()
 
+        array = self.da
+        if chunk:
+            array.data = array.chunk({"y": 5}).data
         reduction = getattr(np, method)
         expected = Dataset(
             {
@@ -1315,14 +1335,14 @@
             }
         )["foo"]
 
-        with xr.set_options(use_flox=False):
-            actual_legacy = getattr(grouped, method)(dim="y")
+        with raise_if_dask_computes():
+            grouped = array.groupby("abc")
+            if shuffle:
+                grouped = grouped.shuffle()
 
-        with xr.set_options(use_flox=True):
-            actual_npg = getattr(grouped, method)(dim="y")
-
-        assert_allclose(expected, actual_legacy)
-        assert_allclose(expected, actual_npg)
+            with xr.set_options(use_flox=use_flox):
+                actual = getattr(grouped, method)(dim="y")
+        assert_allclose(expected, actual)
 
     def test_groupby_count(self) -> None:
         array = DataArray(
@@ -2497,7 +2517,7 @@
    with mock.patch("flox.xarray.xarray_reduce", return_value=result) as mocked_reduce:
        da.groupby("label").sum()

    kwargs = mocked_reduce.call_args.kwargs
    if Version(flox.__version__) < Version("0.9.0"):
        assert kwargs["method"] == "cohorts"
    else:
@@ -2543,7 +2563,7 @@
    ds = da.to_dataset()

    expected = ds.groupby("time.year").mean()
    actual = ds.groupby(time=YearGrouper()).mean()
    assert_identical(expected, actual)

    actual = ds.groupby({"time": YearGrouper()}).mean()