From 367f8a18bd04bb9c521ee5788b5fce24be63275d Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 28 Feb 2022 13:44:26 -0800 Subject: [PATCH] REF: implement _reconstruct_ea_result, _ea_to_cython_values (#46172) --- pandas/_libs/groupby.pyx | 8 +++++++ pandas/core/groupby/ops.py | 47 +++++++++++++++++++++++--------------- 2 files changed, 36 insertions(+), 19 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index e9d6b6813c6fd..79457bcd6506f 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -1350,6 +1350,10 @@ cdef group_min_max(iu_64_floating_t[:, ::1] out, else: if uses_mask: result_mask[i, j] = True + # set out[i, j] to 0 to be deterministic, as + # it was initialized with np.empty. Also ensures + # we can downcast out if appropriate. + out[i, j] = 0 else: out[i, j] = nan_val else: @@ -1494,6 +1498,10 @@ cdef group_cummin_max(iu_64_floating_t[:, ::1] out, if not skipna and na_possible and seen_na[lab, j]: if uses_mask: mask[i, j] = 1 # FIXME: shouldn't alter inplace + # Set to 0 ensures that we are deterministic and can + # downcast if appropriate + out[i, j] = 0 + else: out[i, j] = na_val else: diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index b4fef6fcdff19..8e38d542349d9 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -339,6 +339,26 @@ def _ea_wrap_cython_operation( **kwargs, ) + npvalues = self._ea_to_cython_values(values) + + res_values = self._cython_op_ndim_compat( + npvalues, + min_count=min_count, + ngroups=ngroups, + comp_ids=comp_ids, + mask=None, + **kwargs, + ) + + if self.how in ["rank"]: + # i.e. how in WrappedCythonOp.cast_blocklist, since + # other cast_blocklist methods dont go through cython_operation + return res_values + + return self._reconstruct_ea_result(values, res_values) + + def _ea_to_cython_values(self, values: ExtensionArray): + # GH#43682 if isinstance(values, (DatetimeArray, PeriodArray, TimedeltaArray)): # All of the functions implemented here are ordinal, so we can # operate on the tz-naive equivalents @@ -356,22 +376,7 @@ def _ea_wrap_cython_operation( raise NotImplementedError( f"function is not implemented for this dtype: {values.dtype}" ) - - res_values = self._cython_op_ndim_compat( - npvalues, - min_count=min_count, - ngroups=ngroups, - comp_ids=comp_ids, - mask=None, - **kwargs, - ) - - if self.how in ["rank"]: - # i.e. how in WrappedCythonOp.cast_blocklist, since - # other cast_blocklist methods dont go through cython_operation - return res_values - - return self._reconstruct_ea_result(values, res_values) + return npvalues def _reconstruct_ea_result(self, values, res_values): """ @@ -387,6 +392,7 @@ def _reconstruct_ea_result(self, values, res_values): return cls._from_sequence(res_values, dtype=dtype) elif needs_i8_conversion(values.dtype): + assert res_values.dtype.kind != "f" # just to be on the safe side i8values = res_values.view("i8") return type(values)(i8values, dtype=values.dtype) @@ -577,9 +583,12 @@ def _call_cython_op( cutoff = max(1, min_count) empty_groups = counts < cutoff if empty_groups.any(): - # Note: this conversion could be lossy, see GH#40767 - result = result.astype("float64") - result[empty_groups] = np.nan + if result_mask is not None and self.uses_mask(): + assert result_mask[empty_groups].all() + else: + # Note: this conversion could be lossy, see GH#40767 + result = result.astype("float64") + result[empty_groups] = np.nan result = result.T