Skip to content

Commit

Permalink
BUG: GroupBy.cumsum td64, skipna=False (pandas-dev#46216)
Browse files Browse the repository at this point in the history
  • Loading branch information
jbrockmendel authored Mar 4, 2022
1 parent 87803d0 commit eac1734
Show file tree
Hide file tree
Showing 3 changed files with 76 additions and 21 deletions.
3 changes: 3 additions & 0 deletions doc/source/whatsnew/v1.5.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -429,6 +429,9 @@ Groupby/resample/rolling
- Bug in :meth:`.ExponentialMovingWindow.mean` with ``axis=1`` and ``engine='numba'`` when the :class:`DataFrame` has more columns than rows (:issue:`46086`)
- Bug when using ``engine="numba"`` would return the same jitted function when modifying ``engine_kwargs`` (:issue:`46086`)
- Bug in :meth:`.DataFrameGroupby.transform` fails when ``axis=1`` and ``func`` is ``"first"`` or ``"last"`` (:issue:`45986`)
- Bug in :meth:`.DataFrameGroupBy.cumsum` with ``skipna=False`` giving incorrect results (:issue:`46216`)
- Bug in :meth:`.GroupBy.cumsum` with ``timedelta64[ns]`` dtype failing to recognize ``NaT`` as a null value (:issue:`46216`)
-

Reshaping
^^^^^^^^^
Expand Down
60 changes: 39 additions & 21 deletions pandas/_libs/groupby.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ from pandas._libs.algos import (
from pandas._libs.dtypes cimport (
iu_64_floating_obj_t,
iu_64_floating_t,
numeric_object_t,
numeric_t,
)
from pandas._libs.missing cimport checknull
Expand Down Expand Up @@ -211,7 +212,7 @@ def group_cumsum(
ndarray[numeric_t, ndim=2] values,
const intp_t[::1] labels,
int ngroups,
is_datetimelike,
bint is_datetimelike,
bint skipna=True,
) -> None:
"""
Expand All @@ -238,14 +239,23 @@ def group_cumsum(
"""
cdef:
Py_ssize_t i, j, N, K, size
numeric_t val, y, t
numeric_t val, y, t, na_val
numeric_t[:, ::1] accum, compensation
intp_t lab
bint isna_entry, isna_prev = False

N, K = (<object>values).shape
accum = np.zeros((ngroups, K), dtype=np.asarray(values).dtype)
compensation = np.zeros((ngroups, K), dtype=np.asarray(values).dtype)

if numeric_t == float32_t or numeric_t == float64_t:
na_val = NaN
elif numeric_t is int64_t and is_datetimelike:
na_val = NPY_NAT
else:
        # Will not be used, but define to avoid uninitialized warning.
na_val = 0

with nogil:
for i in range(N):
lab = labels[i]
Expand All @@ -255,22 +265,30 @@ def group_cumsum(
for j in range(K):
val = values[i, j]

# For floats, use Kahan summation to reduce floating-point
# error (https://en.wikipedia.org/wiki/Kahan_summation_algorithm)
if numeric_t == float32_t or numeric_t == float64_t:
if val == val:
isna_entry = _treat_as_na(val, is_datetimelike)

if not skipna:
isna_prev = _treat_as_na(accum[lab, j], is_datetimelike)
if isna_prev:
out[i, j] = na_val
continue


if isna_entry:
out[i, j] = na_val
if not skipna:
accum[lab, j] = na_val

else:
# For floats, use Kahan summation to reduce floating-point
# error (https://en.wikipedia.org/wiki/Kahan_summation_algorithm)
if numeric_t == float32_t or numeric_t == float64_t:
y = val - compensation[lab, j]
t = accum[lab, j] + y
compensation[lab, j] = t - accum[lab, j] - y
accum[lab, j] = t
out[i, j] = t
else:
out[i, j] = NaN
if not skipna:
accum[lab, j] = NaN
break
else:
t = val + accum[lab, j]
t = val + accum[lab, j]

accum[lab, j] = t
out[i, j] = t

Expand Down Expand Up @@ -962,19 +980,19 @@ def group_quantile(
# group_nth, group_last, group_rank
# ----------------------------------------------------------------------

cdef inline bint _treat_as_na(iu_64_floating_obj_t val, bint is_datetimelike) nogil:
if iu_64_floating_obj_t is object:
cdef inline bint _treat_as_na(numeric_object_t val, bint is_datetimelike) nogil:
if numeric_object_t is object:
# Should never be used, but we need to avoid the `val != val` below
# or else cython will raise about gil acquisition.
raise NotImplementedError

elif iu_64_floating_obj_t is int64_t:
elif numeric_object_t is int64_t:
return is_datetimelike and val == NPY_NAT
elif iu_64_floating_obj_t is uint64_t:
# There is no NA value for uint64
return False
else:
elif numeric_object_t is float32_t or numeric_object_t is float64_t:
return val != val
else:
# non-datetimelike integer
return False


# TODO(cython3): GH#31710 use memorviews once cython 0.30 is released so we can
Expand Down
34 changes: 34 additions & 0 deletions pandas/tests/groupby/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -2468,6 +2468,40 @@ def test_groupby_numerical_stability_cumsum():
tm.assert_frame_equal(result, expected, check_exact=True)


def test_groupby_cumsum_skipna_false():
    # GH#46216 — with skipna=False a NaN must only poison the running sum
    # from its own row onward, not be propagated above the diagonal.
    values = np.random.randn(5, 5)
    np.fill_diagonal(values, np.nan)

    frame = DataFrame(values)
    frame["A"] = 1
    grouped = frame.groupby("A")

    result = grouped.cumsum(skipna=False)

    # With a single group, the grouped cumsum must match the plain
    # column-wise DataFrame.cumsum over the original numeric columns.
    expected = frame[[0, 1, 2, 3, 4]].cumsum(skipna=False)
    tm.assert_frame_equal(result, expected)


def test_groupby_cumsum_timedelta64():
    # GH#46216 — libgroupby.group_cumsum must honor is_datetimelike so
    # that NaT in a timedelta64[ns] column is treated as a null value.
    stamps = date_range("2016-01-01", periods=5)
    deltas = Series(stamps) - stamps[0]
    deltas[2] = pd.NaT

    frame = DataFrame({"A": 1, "B": deltas})
    grouped = frame.groupby("A")

    # skipna=True: NaT is skipped — the running sum continues past it.
    result = grouped.cumsum(numeric_only=False, skipna=True)
    expected = DataFrame(
        {"B": [deltas[0], deltas[1], pd.NaT, deltas[4], deltas[4] * 2]}
    )
    tm.assert_frame_equal(result, expected)

    # skipna=False: NaT poisons every subsequent entry in the group.
    result = grouped.cumsum(numeric_only=False, skipna=False)
    expected = DataFrame({"B": [deltas[0], deltas[1], pd.NaT, pd.NaT, pd.NaT]})
    tm.assert_frame_equal(result, expected)


def test_groupby_mean_duplicate_index(rand_series_with_duplicate_datetimeindex):
dups = rand_series_with_duplicate_datetimeindex
result = dups.groupby(level=0).mean()
Expand Down

0 comments on commit eac1734

Please sign in to comment.