BUG: Fix some cases of groupby(...).transform with dropna=True (panda…
rhshadrach authored Feb 27, 2022
1 parent 429f294 commit 1efa4fb
Showing 6 changed files with 99 additions and 55 deletions.
33 changes: 33 additions & 0 deletions doc/source/whatsnew/v1.5.0.rst
@@ -58,6 +58,39 @@ Styler

- Fixed bug in :class:`CSSToExcelConverter` leading to ``TypeError`` when border color provided without border style for ``xlsxwriter`` engine (:issue:`42276`)

.. _whatsnew_150.notable_bug_fixes.groupby_transform_dropna:

Using ``dropna=True`` with ``groupby`` transforms
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

A transform is an operation whose result has the same size as its input. When the
result is a :class:`DataFrame` or :class:`Series`, it is also required that the
index of the result matches that of the input. In pandas 1.4, using
:meth:`.DataFrameGroupBy.transform` or :meth:`.SeriesGroupBy.transform` with null
values in the groups and ``dropna=True`` gave incorrect results. As demonstrated by the
examples below, the incorrect results either contained incorrect values or did not have
the same index as the input.

.. ipython:: python

    df = pd.DataFrame({'a': [1, 1, np.nan], 'b': [2, 3, 4]})

*Old behavior*:

.. code-block:: ipython

    In [3]: df.groupby('a', dropna=True).transform(lambda x: x)
    Out[3]:
       b
    0  2
    1  3

*New behavior*:

.. ipython:: python

    df.groupby('a', dropna=True).transform(lambda x: x)

.. _whatsnew_150.notable_bug_fixes.notable_bug_fix2:

notable_bug_fix2
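A minimal runnable sketch of the behavior described in the whatsnew entry above (the frame and the identity lambda mirror the whatsnew example; the commented output is what the fixed 1.5 behavior is intended to produce):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"a": [1, 1, np.nan], "b": [2, 3, 4]})

    # With dropna=True the row whose group key is NaN is excluded from the
    # groups, but the transform result still aligns with df's full index:
    # that row comes back as NaN (and the column is upcast to float).
    print(df.groupby("a", dropna=True).transform(lambda x: x))
    #      b
    # 0  2.0
    # 1  3.0
    # 2  NaN

    # With dropna=False the NaN key forms its own group, so row 2 keeps its value.
    print(df.groupby("a", dropna=False).transform(lambda x: x))
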
21 changes: 8 additions & 13 deletions pandas/core/groupby/groupby.py
@@ -108,6 +108,7 @@ class providing the base-class of operations.
CategoricalIndex,
Index,
MultiIndex,
RangeIndex,
)
from pandas.core.internals.blocks import ensure_block_shape
import pandas.core.sample as sample
@@ -1093,21 +1094,15 @@ def _set_result_index_ordered(
return result

# row order is scrambled => sort the rows by position in original index
original_positions = Index(
np.concatenate(self._get_indices(self.grouper.result_index))
)
original_positions = Index(self.grouper.result_ilocs())
result.set_axis(original_positions, axis=self.axis, inplace=True)
result = result.sort_index(axis=self.axis)

dropped_rows = len(result.index) < len(self.obj.index)

if dropped_rows:
# get index by slicing original index according to original positions
# slice drops attrs => use set_axis when no rows were dropped
sorted_indexer = result.index
result.index = self._selected_obj.index[sorted_indexer]
else:
result.set_axis(self.obj._get_axis(self.axis), axis=self.axis, inplace=True)
obj_axis = self.obj._get_axis(self.axis)
if self.grouper.has_dropped_na:
# Add back in any missing rows due to dropna - index here is integral
# with values referring to the row of the input so can use RangeIndex
result = result.reindex(RangeIndex(len(obj_axis)), axis=self.axis)
result.set_axis(obj_axis, axis=self.axis, inplace=True)

return result

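The new branch in ``_set_result_index_ordered`` relies on the intermediate result being labelled by integer positions (ilocs) into the input. Below is a small standalone sketch of that reindex-then-relabel step, with an illustrative ``partial`` frame standing in for the sorted intermediate result rather than anything produced by pandas internals:

    import pandas as pd

    # Suppose the input had 4 rows and the row at position 2 had a null group
    # key that was dropped; the transform result is labelled by the original
    # integer positions of the surviving rows.
    partial = pd.DataFrame({"B": [1.0, 2.0, 4.0]}, index=[0, 1, 3])
    obj_axis = pd.Index(["w", "x", "y", "z"])  # the input's original axis labels

    # Reindexing against RangeIndex(len(obj_axis)) reinserts the dropped
    # position as an all-NaN row; set_axis then swaps the original labels back in.
    full = partial.reindex(pd.RangeIndex(len(obj_axis)))
    full = full.set_axis(obj_axis)
    print(full)
    #      B
    # w  1.0
    # x  2.0
    # y  NaN
    # z  4.0
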
32 changes: 32 additions & 0 deletions pandas/core/groupby/ops.py
@@ -795,6 +795,30 @@ def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]:
keys = [ping.group_index for ping in self.groupings]
return get_indexer_dict(codes_list, keys)

@final
def result_ilocs(self) -> npt.NDArray[np.intp]:
"""
Get the original integer locations of result_index in the input.
"""
# Original indices are where group_index would go via sorting.
# But when dropna is true, we need to remove null values while accounting for
# any gaps that then occur because of them.
group_index = get_group_index(self.codes, self.shape, sort=False, xnull=True)

if self.has_dropped_na:
mask = np.where(group_index >= 0)
# Count how many gaps are caused by previous null values for each position
null_gaps = np.cumsum(group_index == -1)[mask]
group_index = group_index[mask]

result = get_group_index_sorter(group_index, self.ngroups)

if self.has_dropped_na:
# Shift by the number of prior null gaps
result += np.take(null_gaps, result)

return result

@final
@property
def codes(self) -> list[npt.NDArray[np.signedinteger]]:
@@ -837,6 +861,14 @@ def is_monotonic(self) -> bool:
# return if my group orderings are monotonic
return Index(self.group_info[0]).is_monotonic_increasing

@final
@cache_readonly
def has_dropped_na(self) -> bool:
"""
Whether grouper has null value(s) that are dropped.
"""
return bool((self.group_info[0] < 0).any())

@cache_readonly
def group_info(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]:
comp_ids, obs_group_ids = self._get_compressed_codes()
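The gap arithmetic in the new ``result_ilocs`` can be seen on a toy array. This sketch substitutes a plain stable ``argsort`` for pandas' ``get_group_index_sorter`` and uses hand-written group codes, so it illustrates the idea rather than the exact internals:

    import numpy as np

    # Single-grouping toy example: -1 marks a row whose key is null (dropna=True).
    group_index = np.array([0, -1, 1, 0, -1, 1])

    mask = group_index >= 0
    # For each surviving position, how many null rows precede it in the input.
    null_gaps = np.cumsum(group_index == -1)[mask]
    kept = group_index[mask]

    # A stable sort by group id gives positions relative to the *compressed*
    # (null-free) input...
    order = np.argsort(kept, kind="stable")

    # ...and adding back the number of nulls dropped before each position
    # recovers ilocs into the *original* input.
    result = order + np.take(null_gaps, order)
    print(result)  # [0 3 2 5] -> rows of group 0, then rows of group 1
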
6 changes: 0 additions & 6 deletions pandas/tests/extension/test_string.py
@@ -18,8 +18,6 @@
import numpy as np
import pytest

from pandas.compat import pa_version_under2p0

import pandas as pd
from pandas.core.arrays import ArrowStringArray
from pandas.core.arrays.string_ import StringDtype
@@ -193,10 +191,6 @@ class TestPrinting(base.BasePrintingTests):

class TestGroupBy(base.BaseGroupbyTests):
def test_groupby_extension_transform(self, data_for_grouping, request):
if data_for_grouping.dtype.storage == "pyarrow" and pa_version_under2p0:
# failure observed in 1.0.1, not in 2.0 or later
mark = pytest.mark.xfail(reason="pyarrow raises in self._data[item]")
request.node.add_marker(mark)
super().test_groupby_extension_transform(data_for_grouping)


42 changes: 10 additions & 32 deletions pandas/tests/groupby/test_groupby_dropna.py
@@ -171,52 +171,30 @@ def test_grouper_dropna_propagation(dropna):


@pytest.mark.parametrize(
"dropna,input_index,expected_data,expected_index",
"index",
[
(True, pd.RangeIndex(0, 4), {"B": [2, 2, 1]}, pd.RangeIndex(0, 3)),
(True, list("abcd"), {"B": [2, 2, 1]}, list("abc")),
(
True,
pd.MultiIndex.from_tuples(
[(1, "R"), (1, "B"), (2, "R"), (2, "B")], names=["num", "col"]
),
{"B": [2, 2, 1]},
pd.MultiIndex.from_tuples(
[(1, "R"), (1, "B"), (2, "R")], names=["num", "col"]
),
),
(False, pd.RangeIndex(0, 4), {"B": [2, 2, 1, 1]}, pd.RangeIndex(0, 4)),
(False, list("abcd"), {"B": [2, 2, 1, 1]}, list("abcd")),
(
False,
pd.MultiIndex.from_tuples(
[(1, "R"), (1, "B"), (2, "R"), (2, "B")], names=["num", "col"]
),
{"B": [2, 2, 1, 1]},
pd.MultiIndex.from_tuples(
[(1, "R"), (1, "B"), (2, "R"), (2, "B")], names=["num", "col"]
),
),
pd.RangeIndex(0, 4),
list("abcd"),
pd.MultiIndex.from_product([(1, 2), ("R", "B")], names=["num", "col"]),
],
)
def test_groupby_dataframe_slice_then_transform(
dropna, input_index, expected_data, expected_index
):
def test_groupby_dataframe_slice_then_transform(dropna, index):
# GH35014 & GH35612
expected_data = {"B": [2, 2, 1, np.nan if dropna else 1]}

df = pd.DataFrame({"A": [0, 0, 1, None], "B": [1, 2, 3, None]}, index=input_index)
df = pd.DataFrame({"A": [0, 0, 1, None], "B": [1, 2, 3, None]}, index=index)
gb = df.groupby("A", dropna=dropna)

result = gb.transform(len)
expected = pd.DataFrame(expected_data, index=expected_index)
expected = pd.DataFrame(expected_data, index=index)
tm.assert_frame_equal(result, expected)

result = gb[["B"]].transform(len)
expected = pd.DataFrame(expected_data, index=expected_index)
expected = pd.DataFrame(expected_data, index=index)
tm.assert_frame_equal(result, expected)

result = gb["B"].transform(len)
expected = pd.Series(expected_data["B"], index=expected_index, name="B")
expected = pd.Series(expected_data["B"], index=index, name="B")
tm.assert_series_equal(result, expected)


20 changes: 16 additions & 4 deletions pandas/tests/groupby/transform/test_transform.py
@@ -1290,9 +1290,21 @@ def test_transform_cumcount():
tm.assert_series_equal(result, expected)


def test_null_group_lambda_self():
def test_null_group_lambda_self(sort, dropna):
# GH 17093
df = DataFrame({"A": [1, np.nan], "B": [1, 1]})
result = df.groupby("A").transform(lambda x: x)
expected = DataFrame([1], columns=["B"])
np.random.seed(0)
keys = np.random.randint(0, 5, size=50).astype(float)
nulls = np.random.choice([0, 1], keys.shape).astype(bool)
keys[nulls] = np.nan
values = np.random.randint(0, 5, size=keys.shape)
df = DataFrame({"A": keys, "B": values})

expected_values = values
if dropna and nulls.any():
expected_values = expected_values.astype(float)
expected_values[nulls] = np.nan
expected = DataFrame(expected_values, columns=["B"])

gb = df.groupby("A", dropna=dropna, sort=sort)
result = gb.transform(lambda x: x)
tm.assert_frame_equal(result, expected)
