From eac17345ebb1c0208e549636541326697c3fc33e Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 3 Mar 2022 17:18:43 -0800 Subject: [PATCH] BUG: GroupBy.cumsum td64, skipna=False (#46216) --- doc/source/whatsnew/v1.5.0.rst | 3 ++ pandas/_libs/groupby.pyx | 60 ++++++++++++++++++---------- pandas/tests/groupby/test_groupby.py | 34 ++++++++++++++++ 3 files changed, 76 insertions(+), 21 deletions(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index adee9271c23b0..52c8b82ab47f9 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -429,6 +429,9 @@ Groupby/resample/rolling - Bug in :meth:`.ExponentialMovingWindow.mean` with ``axis=1`` and ``engine='numba'`` when the :class:`DataFrame` has more columns than rows (:issue:`46086`) - Bug when using ``engine="numba"`` would return the same jitted function when modifying ``engine_kwargs`` (:issue:`46086`) - Bug in :meth:`.DataFrameGroupby.transform` fails when ``axis=1`` and ``func`` is ``"first"`` or ``"last"`` (:issue:`45986`) +- Bug in :meth:`.DataFrameGroupby.cumsum` with ``skipna=False`` giving incorrect results (:issue:`46216`) +- Bug in :meth:`.GroupBy.cumsum` with ``timedelta64[ns]`` dtype failing to recognize ``NaT`` as a null value (:issue:`46216`) +- Reshaping ^^^^^^^^^ diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index d250a69c1985b..05023028dccb3 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -43,6 +43,7 @@ from pandas._libs.algos import ( from pandas._libs.dtypes cimport ( iu_64_floating_obj_t, iu_64_floating_t, + numeric_object_t, numeric_t, ) from pandas._libs.missing cimport checknull @@ -211,7 +212,7 @@ def group_cumsum( ndarray[numeric_t, ndim=2] values, const intp_t[::1] labels, int ngroups, - is_datetimelike, + bint is_datetimelike, bint skipna=True, ) -> None: """ @@ -238,14 +239,23 @@ def group_cumsum( """ cdef: Py_ssize_t i, j, N, K, size - numeric_t val, y, t + numeric_t val, y, t, na_val 
numeric_t[:, ::1] accum, compensation intp_t lab + bint isna_entry, isna_prev = False N, K = (values).shape accum = np.zeros((ngroups, K), dtype=np.asarray(values).dtype) compensation = np.zeros((ngroups, K), dtype=np.asarray(values).dtype) + if numeric_t == float32_t or numeric_t == float64_t: + na_val = NaN + elif numeric_t is int64_t and is_datetimelike: + na_val = NPY_NAT + else: + # Will not be used, but define to avoid uninitialized warning. + na_val = 0 + with nogil: for i in range(N): lab = labels[i] @@ -255,22 +265,30 @@ def group_cumsum( for j in range(K): val = values[i, j] - # For floats, use Kahan summation to reduce floating-point - # error (https://en.wikipedia.org/wiki/Kahan_summation_algorithm) - if numeric_t == float32_t or numeric_t == float64_t: - if val == val: + isna_entry = _treat_as_na(val, is_datetimelike) + + if not skipna: + isna_prev = _treat_as_na(accum[lab, j], is_datetimelike) + if isna_prev: + out[i, j] = na_val + continue + + + if isna_entry: + out[i, j] = na_val + if not skipna: + accum[lab, j] = na_val + + else: + # For floats, use Kahan summation to reduce floating-point + # error (https://en.wikipedia.org/wiki/Kahan_summation_algorithm) + if numeric_t == float32_t or numeric_t == float64_t: y = val - compensation[lab, j] t = accum[lab, j] + y compensation[lab, j] = t - accum[lab, j] - y - accum[lab, j] = t - out[i, j] = t else: - out[i, j] = NaN - if not skipna: - accum[lab, j] = NaN - break - else: - t = val + accum[lab, j] + t = val + accum[lab, j] + accum[lab, j] = t out[i, j] = t @@ -962,19 +980,19 @@ def group_quantile( # group_nth, group_last, group_rank # ---------------------------------------------------------------------- -cdef inline bint _treat_as_na(iu_64_floating_obj_t val, bint is_datetimelike) nogil: - if iu_64_floating_obj_t is object: +cdef inline bint _treat_as_na(numeric_object_t val, bint is_datetimelike) nogil: + if numeric_object_t is object: # Should never be used, but we need to avoid the `val != val` 
below # or else cython will raise about gil acquisition. raise NotImplementedError - elif iu_64_floating_obj_t is int64_t: + elif numeric_object_t is int64_t: return is_datetimelike and val == NPY_NAT - elif iu_64_floating_obj_t is uint64_t: - # There is no NA value for uint64 - return False - else: + elif numeric_object_t is float32_t or numeric_object_t is float64_t: return val != val + else: + # non-datetimelike integer + return False # TODO(cython3): GH#31710 use memorviews once cython 0.30 is released so we can diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index d48477618767c..3fa7d3646a0bb 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2468,6 +2468,40 @@ def test_groupby_numerical_stability_cumsum(): tm.assert_frame_equal(result, expected, check_exact=True) +def test_groupby_cumsum_skipna_false(): + # don't propagate np.nan above the diagonal + arr = np.random.randn(5, 5) + df = DataFrame(arr) + for i in range(5): + df.iloc[i, i] = np.nan + + df["A"] = 1 + gb = df.groupby("A") + + res = gb.cumsum(skipna=False) + + expected = df[[0, 1, 2, 3, 4]].cumsum(skipna=False) + tm.assert_frame_equal(res, expected) + + +def test_groupby_cumsum_timedelta64(): + # don't ignore is_datetimelike in libgroupby.group_cumsum + dti = date_range("2016-01-01", periods=5) + ser = Series(dti) - dti[0] + ser[2] = pd.NaT + + df = DataFrame({"A": 1, "B": ser}) + gb = df.groupby("A") + + res = gb.cumsum(numeric_only=False, skipna=True) + exp = DataFrame({"B": [ser[0], ser[1], pd.NaT, ser[4], ser[4] * 2]}) + tm.assert_frame_equal(res, exp) + + res = gb.cumsum(numeric_only=False, skipna=False) + exp = DataFrame({"B": [ser[0], ser[1], pd.NaT, pd.NaT, pd.NaT]}) + tm.assert_frame_equal(res, exp) + + def test_groupby_mean_duplicate_index(rand_series_with_duplicate_datetimeindex): dups = rand_series_with_duplicate_datetimeindex result = dups.groupby(level=0).mean()