Skip to content

Commit

Permalink
BUG: GroupBy.cumsum td64, skipna=False (pandas-dev#46216)
Browse files Browse the repository at this point in the history
  • Loading branch information
jbrockmendel authored Mar 4, 2022
1 parent 87803d0 commit eac1734
Show file tree
Hide file tree
Showing 3 changed files with 76 additions and 21 deletions.
3 changes: 3 additions & 0 deletions doc/source/whatsnew/v1.5.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -429,6 +429,9 @@ Groupby/resample/rolling
- Bug in :meth:`.ExponentialMovingWindow.mean` with ``axis=1`` and ``engine='numba'`` when the :class:`DataFrame` has more columns than rows (:issue:`46086`)
- Bug when using ``engine="numba"`` would return the same jitted function when modifying ``engine_kwargs`` (:issue:`46086`)
- Bug in :meth:`.DataFrameGroupby.transform` fails when ``axis=1`` and ``func`` is ``"first"`` or ``"last"`` (:issue:`45986`)
- Bug in :meth:`.DataFrameGroupBy.cumsum` with ``skipna=False`` giving incorrect results (:issue:`46216`)
- Bug in :meth:`.GroupBy.cumsum` with ``timedelta64[ns]`` dtype failing to recognize ``NaT`` as a null value (:issue:`46216`)
-

Reshaping
^^^^^^^^^
Expand Down
60 changes: 39 additions & 21 deletions pandas/_libs/groupby.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ from pandas._libs.algos import (
from pandas._libs.dtypes cimport (
iu_64_floating_obj_t,
iu_64_floating_t,
numeric_object_t,
numeric_t,
)
from pandas._libs.missing cimport checknull
Expand Down Expand Up @@ -211,7 +212,7 @@ def group_cumsum(
ndarray[numeric_t, ndim=2] values,
const intp_t[::1] labels,
int ngroups,
is_datetimelike,
bint is_datetimelike,
bint skipna=True,
) -> None:
"""
Expand All @@ -238,14 +239,23 @@ def group_cumsum(
"""
cdef:
Py_ssize_t i, j, N, K, size
numeric_t val, y, t
numeric_t val, y, t, na_val
numeric_t[:, ::1] accum, compensation
intp_t lab
bint isna_entry, isna_prev = False

N, K = (<object>values).shape
accum = np.zeros((ngroups, K), dtype=np.asarray(values).dtype)
compensation = np.zeros((ngroups, K), dtype=np.asarray(values).dtype)

if numeric_t == float32_t or numeric_t == float64_t:
na_val = NaN
elif numeric_t is int64_t and is_datetimelike:
na_val = NPY_NAT
else:
        # Will not be used, but define to avoid uninitialized warning.
na_val = 0

with nogil:
for i in range(N):
lab = labels[i]
Expand All @@ -255,22 +265,30 @@ def group_cumsum(
for j in range(K):
val = values[i, j]

# For floats, use Kahan summation to reduce floating-point
# error (https://en.wikipedia.org/wiki/Kahan_summation_algorithm)
if numeric_t == float32_t or numeric_t == float64_t:
if val == val:
isna_entry = _treat_as_na(val, is_datetimelike)

if not skipna:
isna_prev = _treat_as_na(accum[lab, j], is_datetimelike)
if isna_prev:
out[i, j] = na_val
continue


if isna_entry:
out[i, j] = na_val
if not skipna:
accum[lab, j] = na_val

else:
# For floats, use Kahan summation to reduce floating-point
# error (https://en.wikipedia.org/wiki/Kahan_summation_algorithm)
if numeric_t == float32_t or numeric_t == float64_t:
y = val - compensation[lab, j]
t = accum[lab, j] + y
compensation[lab, j] = t - accum[lab, j] - y
accum[lab, j] = t
out[i, j] = t
else:
out[i, j] = NaN
if not skipna:
accum[lab, j] = NaN
break
else:
t = val + accum[lab, j]
t = val + accum[lab, j]

accum[lab, j] = t
out[i, j] = t

Expand Down Expand Up @@ -962,19 +980,19 @@ def group_quantile(
# group_nth, group_last, group_rank
# ----------------------------------------------------------------------

cdef inline bint _treat_as_na(iu_64_floating_obj_t val, bint is_datetimelike) nogil:
if iu_64_floating_obj_t is object:
cdef inline bint _treat_as_na(numeric_object_t val, bint is_datetimelike) nogil:
if numeric_object_t is object:
# Should never be used, but we need to avoid the `val != val` below
# or else cython will raise about gil acquisition.
raise NotImplementedError

elif iu_64_floating_obj_t is int64_t:
elif numeric_object_t is int64_t:
return is_datetimelike and val == NPY_NAT
elif iu_64_floating_obj_t is uint64_t:
# There is no NA value for uint64
return False
else:
elif numeric_object_t is float32_t or numeric_object_t is float64_t:
return val != val
else:
# non-datetimelike integer
return False


# TODO(cython3): GH#31710 use memorviews once cython 0.30 is released so we can
Expand Down
34 changes: 34 additions & 0 deletions pandas/tests/groupby/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -2468,6 +2468,40 @@ def test_groupby_numerical_stability_cumsum():
tm.assert_frame_equal(result, expected, check_exact=True)


def test_groupby_cumsum_skipna_false():
    # GH#46216 — with skipna=False a NaN must only poison the running sum
    # from its own row onward, not be propagated above the diagonal.
    values = np.random.randn(5, 5)
    np.fill_diagonal(values, np.nan)

    frame = DataFrame(values)
    frame["A"] = 1
    grouped = frame.groupby("A")

    result = grouped.cumsum(skipna=False)

    # With a single group, the grouped cumsum must match the plain
    # column-wise DataFrame.cumsum over the original numeric columns.
    expected = frame[[0, 1, 2, 3, 4]].cumsum(skipna=False)
    tm.assert_frame_equal(result, expected)


def test_groupby_cumsum_timedelta64():
    # GH#46216 — libgroupby.group_cumsum must honor is_datetimelike so
    # that NaT in a timedelta64[ns] column is treated as a null value.
    stamps = date_range("2016-01-01", periods=5)
    deltas = Series(stamps) - stamps[0]
    deltas[2] = pd.NaT

    frame = DataFrame({"A": 1, "B": deltas})
    grouped = frame.groupby("A")

    # skipna=True: NaT is skipped — the running sum continues past it.
    result = grouped.cumsum(numeric_only=False, skipna=True)
    expected = DataFrame(
        {"B": [deltas[0], deltas[1], pd.NaT, deltas[4], deltas[4] * 2]}
    )
    tm.assert_frame_equal(result, expected)

    # skipna=False: NaT poisons every subsequent entry in the group.
    result = grouped.cumsum(numeric_only=False, skipna=False)
    expected = DataFrame({"B": [deltas[0], deltas[1], pd.NaT, pd.NaT, pd.NaT]})
    tm.assert_frame_equal(result, expected)


def test_groupby_mean_duplicate_index(rand_series_with_duplicate_datetimeindex):
dups = rand_series_with_duplicate_datetimeindex
result = dups.groupby(level=0).mean()
Expand Down

0 comments on commit eac1734

Please sign in to comment.