diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 96921fe0311c0..4be9a8ad254d2 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -58,6 +58,39 @@ Styler - Fixed bug in :class:`CSSToExcelConverter` leading to ``TypeError`` when border color provided without border style for ``xlsxwriter`` engine (:issue:`42276`) +.. _whatsnew_150.notable_bug_fixes.groupby_transform_dropna: + +Using ``dropna=True`` with ``groupby`` transforms +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +A transform is an operation whose result has the same size as its input. When the +result is a :class:`DataFrame` or :class:`Series`, it is also required that the +index of the result matches that of the input. In pandas 1.4, using +:meth:`.DataFrameGroupBy.transform` or :meth:`.SeriesGroupBy.transform` with null +values in the groups and ``dropna=True`` gave incorrect results. As demonstrated by the +examples below, the incorrect results either contained incorrect values, or the result +did not have the same index as the input. + +.. ipython:: python + + df = pd.DataFrame({'a': [1, 1, np.nan], 'b': [2, 3, 4]}) + +*Old behavior*: + +.. code-block:: ipython + + In [3]: df.groupby('a', dropna=True).transform(lambda x: x) + Out[3]: + b + 0 2 + 1 3 + +*New behavior*: + +.. ipython:: python + + df.groupby('a', dropna=True).transform(lambda x: x) + .. _whatsnew_150.notable_bug_fixes.notable_bug_fix2: notable_bug_fix2 diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index ad2ec2a88af82..a91c88722593c 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -108,6 +108,7 @@ class providing the base-class of operations. 
CategoricalIndex, Index, MultiIndex, + RangeIndex, ) from pandas.core.internals.blocks import ensure_block_shape import pandas.core.sample as sample @@ -1093,21 +1094,15 @@ def _set_result_index_ordered( return result # row order is scrambled => sort the rows by position in original index - original_positions = Index( - np.concatenate(self._get_indices(self.grouper.result_index)) - ) + original_positions = Index(self.grouper.result_ilocs()) result.set_axis(original_positions, axis=self.axis, inplace=True) result = result.sort_index(axis=self.axis) - - dropped_rows = len(result.index) < len(self.obj.index) - - if dropped_rows: - # get index by slicing original index according to original positions - # slice drops attrs => use set_axis when no rows were dropped - sorted_indexer = result.index - result.index = self._selected_obj.index[sorted_indexer] - else: - result.set_axis(self.obj._get_axis(self.axis), axis=self.axis, inplace=True) + obj_axis = self.obj._get_axis(self.axis) + if self.grouper.has_dropped_na: + # Add back in any missing rows due to dropna - index here is integral + # with values referring to the row of the input so can use RangeIndex + result = result.reindex(RangeIndex(len(obj_axis)), axis=self.axis) + result.set_axis(obj_axis, axis=self.axis, inplace=True) return result diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index c3c191ff53532..b4fef6fcdff19 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -795,6 +795,30 @@ def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]: keys = [ping.group_index for ping in self.groupings] return get_indexer_dict(codes_list, keys) + @final + def result_ilocs(self) -> npt.NDArray[np.intp]: + """ + Get the original integer locations of result_index in the input. + """ + # Original indices are where group_index would go via sorting. + # But when dropna is true, we need to remove null values while accounting for + # any gaps that then occur because of them. 
+ group_index = get_group_index(self.codes, self.shape, sort=False, xnull=True) + + if self.has_dropped_na: + mask = np.where(group_index >= 0) + # Count how many gaps are caused by previous null values for each position + null_gaps = np.cumsum(group_index == -1)[mask] + group_index = group_index[mask] + + result = get_group_index_sorter(group_index, self.ngroups) + + if self.has_dropped_na: + # Shift by the number of prior null gaps + result += np.take(null_gaps, result) + + return result + @final @property def codes(self) -> list[npt.NDArray[np.signedinteger]]: @@ -837,6 +861,14 @@ def is_monotonic(self) -> bool: # return if my group orderings are monotonic return Index(self.group_info[0]).is_monotonic_increasing + @final + @cache_readonly + def has_dropped_na(self) -> bool: + """ + Whether grouper has null value(s) that are dropped. + """ + return bool((self.group_info[0] < 0).any()) + @cache_readonly def group_info(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]: comp_ids, obs_group_ids = self._get_compressed_codes() diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 4256142556894..73682620b8353 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -18,8 +18,6 @@ import numpy as np import pytest -from pandas.compat import pa_version_under2p0 - import pandas as pd from pandas.core.arrays import ArrowStringArray from pandas.core.arrays.string_ import StringDtype @@ -193,10 +191,6 @@ class TestPrinting(base.BasePrintingTests): class TestGroupBy(base.BaseGroupbyTests): def test_groupby_extension_transform(self, data_for_grouping, request): - if data_for_grouping.dtype.storage == "pyarrow" and pa_version_under2p0: - # failure observed in 1.0.1, not in 2.0 or later - mark = pytest.mark.xfail(reason="pyarrow raises in self._data[item]") - request.node.add_marker(mark) super().test_groupby_extension_transform(data_for_grouping) diff --git 
a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index ab568e24ff029..9fcc28d8f65a9 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -171,52 +171,30 @@ def test_grouper_dropna_propagation(dropna): @pytest.mark.parametrize( - "dropna,input_index,expected_data,expected_index", + "index", [ - (True, pd.RangeIndex(0, 4), {"B": [2, 2, 1]}, pd.RangeIndex(0, 3)), - (True, list("abcd"), {"B": [2, 2, 1]}, list("abc")), - ( - True, - pd.MultiIndex.from_tuples( - [(1, "R"), (1, "B"), (2, "R"), (2, "B")], names=["num", "col"] - ), - {"B": [2, 2, 1]}, - pd.MultiIndex.from_tuples( - [(1, "R"), (1, "B"), (2, "R")], names=["num", "col"] - ), - ), - (False, pd.RangeIndex(0, 4), {"B": [2, 2, 1, 1]}, pd.RangeIndex(0, 4)), - (False, list("abcd"), {"B": [2, 2, 1, 1]}, list("abcd")), - ( - False, - pd.MultiIndex.from_tuples( - [(1, "R"), (1, "B"), (2, "R"), (2, "B")], names=["num", "col"] - ), - {"B": [2, 2, 1, 1]}, - pd.MultiIndex.from_tuples( - [(1, "R"), (1, "B"), (2, "R"), (2, "B")], names=["num", "col"] - ), - ), + pd.RangeIndex(0, 4), + list("abcd"), + pd.MultiIndex.from_product([(1, 2), ("R", "B")], names=["num", "col"]), ], ) -def test_groupby_dataframe_slice_then_transform( - dropna, input_index, expected_data, expected_index -): +def test_groupby_dataframe_slice_then_transform(dropna, index): # GH35014 & GH35612 + expected_data = {"B": [2, 2, 1, np.nan if dropna else 1]} - df = pd.DataFrame({"A": [0, 0, 1, None], "B": [1, 2, 3, None]}, index=input_index) + df = pd.DataFrame({"A": [0, 0, 1, None], "B": [1, 2, 3, None]}, index=index) gb = df.groupby("A", dropna=dropna) result = gb.transform(len) - expected = pd.DataFrame(expected_data, index=expected_index) + expected = pd.DataFrame(expected_data, index=index) tm.assert_frame_equal(result, expected) result = gb[["B"]].transform(len) - expected = pd.DataFrame(expected_data, index=expected_index) + expected = 
pd.DataFrame(expected_data, index=index) tm.assert_frame_equal(result, expected) result = gb["B"].transform(len) - expected = pd.Series(expected_data["B"], index=expected_index, name="B") + expected = pd.Series(expected_data["B"], index=index, name="B") tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index e0bcefba9cf3c..5cc9ac9b09ae9 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -1290,9 +1290,21 @@ def test_transform_cumcount(): tm.assert_series_equal(result, expected) -def test_null_group_lambda_self(): +def test_null_group_lambda_self(sort, dropna): # GH 17093 - df = DataFrame({"A": [1, np.nan], "B": [1, 1]}) - result = df.groupby("A").transform(lambda x: x) - expected = DataFrame([1], columns=["B"]) + np.random.seed(0) + keys = np.random.randint(0, 5, size=50).astype(float) + nulls = np.random.choice([0, 1], keys.shape).astype(bool) + keys[nulls] = np.nan + values = np.random.randint(0, 5, size=keys.shape) + df = DataFrame({"A": keys, "B": values}) + + expected_values = values + if dropna and nulls.any(): + expected_values = expected_values.astype(float) + expected_values[nulls] = np.nan + expected = DataFrame(expected_values, columns=["B"]) + + gb = df.groupby("A", dropna=dropna, sort=sort) + result = gb.transform(lambda x: x) tm.assert_frame_equal(result, expected)