From d801a4b19dd7f3d1ad2ef984114cb7e7389d4700 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 28 Feb 2022 05:51:00 -0800 Subject: [PATCH] PERF: avoid cast in algos.rank (#46175) --- pandas/_libs/algos.pyi | 4 +- pandas/_libs/algos.pyx | 106 ++++++++++++++++++++++++-------------- pandas/core/algorithms.py | 5 -- 3 files changed, 70 insertions(+), 45 deletions(-) diff --git a/pandas/_libs/algos.pyi b/pandas/_libs/algos.pyi index 2447d456b7478..60bdb504c545b 100644 --- a/pandas/_libs/algos.pyi +++ b/pandas/_libs/algos.pyi @@ -102,7 +102,7 @@ def is_monotonic( # ---------------------------------------------------------------------- def rank_1d( - values: np.ndarray, # ndarray[iu_64_floating_obj_t, ndim=1] + values: np.ndarray, # ndarray[numeric_object_t, ndim=1] labels: np.ndarray | None = ..., # const int64_t[:]=None is_datetimelike: bool = ..., ties_method=..., @@ -111,7 +111,7 @@ def rank_1d( na_option=..., ) -> np.ndarray: ... # np.ndarray[float64_t, ndim=1] def rank_2d( - in_arr: np.ndarray, # ndarray[iu_64_floating_obj_t, ndim=2] + in_arr: np.ndarray, # ndarray[numeric_object_t, ndim=2] axis: int = ..., is_datetimelike: bool = ..., ties_method=..., diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 1b33b90ba0fa5..4e3cdc0fdd14a 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -46,7 +46,6 @@ cnp.import_array() cimport pandas._libs.util as util from pandas._libs.dtypes cimport ( - iu_64_floating_obj_t, numeric_object_t, numeric_t, ) @@ -821,9 +820,9 @@ def is_monotonic(ndarray[numeric_object_t, ndim=1] arr, bint timelike): # rank_1d, rank_2d # ---------------------------------------------------------------------- -cdef iu_64_floating_obj_t get_rank_nan_fill_val( - bint rank_nans_highest, - iu_64_floating_obj_t[:] _=None +cdef numeric_object_t get_rank_nan_fill_val( + bint rank_nans_highest, + numeric_object_t[:] _=None ): """ Return the value we'll use to represent missing values when sorting depending @@ -831,20 +830,44 @@ cdef iu_64_floating_obj_t get_rank_nan_fill_val( is unused, but needed for fused type specialization) """ if rank_nans_highest: - if iu_64_floating_obj_t is object: + if numeric_object_t is object: return Infinity() - elif iu_64_floating_obj_t is int64_t: + elif numeric_object_t is int64_t: return util.INT64_MAX - elif iu_64_floating_obj_t is uint64_t: + elif numeric_object_t is int32_t: + return util.INT32_MAX + elif numeric_object_t is int16_t: + return util.INT16_MAX + elif numeric_object_t is int8_t: + return util.INT8_MAX + elif numeric_object_t is uint64_t: return util.UINT64_MAX + elif numeric_object_t is uint32_t: + return util.UINT32_MAX + elif numeric_object_t is uint16_t: + return util.UINT16_MAX + elif numeric_object_t is uint8_t: + return util.UINT8_MAX else: return np.inf else: - if iu_64_floating_obj_t is object: + if numeric_object_t is object: return NegInfinity() - elif iu_64_floating_obj_t is int64_t: + elif numeric_object_t is int64_t: return NPY_NAT - elif iu_64_floating_obj_t is uint64_t: + elif numeric_object_t is int32_t: + return util.INT32_MIN + elif numeric_object_t is int16_t: + return util.INT16_MIN + elif numeric_object_t is int8_t: + return util.INT8_MIN + elif numeric_object_t is uint64_t: + return 0 + elif numeric_object_t is uint32_t: + return 0 + elif numeric_object_t is uint16_t: + return 0 + elif numeric_object_t is uint8_t: return 0 else: return -np.inf @@ -853,7 +876,7 @@ cdef iu_64_floating_obj_t get_rank_nan_fill_val( @cython.wraparound(False) @cython.boundscheck(False) def rank_1d( - ndarray[iu_64_floating_obj_t, ndim=1] values, + ndarray[numeric_object_t, ndim=1] values, const intp_t[:] labels=None, bint is_datetimelike=False, ties_method="average", @@ -866,7 +889,7 @@ def rank_1d( Parameters ---------- - values : array of iu_64_floating_obj_t values to be ranked + values : array of numeric_object_t values to be ranked labels : np.ndarray[np.intp] or None Array containing unique label for each group, with its ordering matching up to the corresponding record in `values`. If not called @@ -896,11 +919,11 @@ def rank_1d( int64_t[::1] grp_sizes intp_t[:] lexsort_indexer float64_t[::1] out - ndarray[iu_64_floating_obj_t, ndim=1] masked_vals - iu_64_floating_obj_t[:] masked_vals_memview + ndarray[numeric_object_t, ndim=1] masked_vals + numeric_object_t[:] masked_vals_memview uint8_t[:] mask bint keep_na, nans_rank_highest, check_labels, check_mask - iu_64_floating_obj_t nan_fill_val + numeric_object_t nan_fill_val tiebreak = tiebreakers[ties_method] if tiebreak == TIEBREAK_FIRST: @@ -921,22 +944,26 @@ def rank_1d( check_labels = labels is not None # For cases where a mask is not possible, we can avoid mask checks - check_mask = not (iu_64_floating_obj_t is uint64_t or - (iu_64_floating_obj_t is int64_t and not is_datetimelike)) + check_mask = ( + numeric_object_t is float32_t + or numeric_object_t is float64_t + or numeric_object_t is object + or (numeric_object_t is int64_t and is_datetimelike) + ) # Copy values into new array in order to fill missing data # with mask, without obfuscating location of missing data # in values array - if iu_64_floating_obj_t is object and values.dtype != np.object_: + if numeric_object_t is object and values.dtype != np.object_: masked_vals = values.astype('O') else: masked_vals = values.copy() - if iu_64_floating_obj_t is object: + if numeric_object_t is object: mask = missing.isnaobj(masked_vals) - elif iu_64_floating_obj_t is int64_t and is_datetimelike: + elif numeric_object_t is int64_t and is_datetimelike: mask = (masked_vals == NPY_NAT).astype(np.uint8) - elif iu_64_floating_obj_t is float64_t: + elif numeric_object_t is float64_t or numeric_object_t is float32_t: mask = np.isnan(masked_vals).astype(np.uint8) else: mask = np.zeros(shape=len(masked_vals), dtype=np.uint8) @@ -948,7 +975,7 @@ def rank_1d( # will flip the ordering to still end up with lowest rank. # Symmetric logic applies to `na_option == 'bottom'` nans_rank_highest = ascending ^ (na_option == 'top') - nan_fill_val = get_rank_nan_fill_val[iu_64_floating_obj_t](nans_rank_highest) + nan_fill_val = get_rank_nan_fill_val[numeric_object_t](nans_rank_highest) if nans_rank_highest: order = [masked_vals, mask] else: @@ -994,8 +1021,8 @@ cdef void rank_sorted_1d( float64_t[::1] out, int64_t[::1] grp_sizes, const intp_t[:] sort_indexer, - # Can make const with cython3 (https://github.com/cython/cython/issues/3222) - iu_64_floating_obj_t[:] masked_vals, + # TODO(cython3): make const (https://github.com/cython/cython/issues/3222) + numeric_object_t[:] masked_vals, const uint8_t[:] mask, bint check_mask, Py_ssize_t N, @@ -1019,7 +1046,7 @@ cdef void rank_sorted_1d( if labels is None. sort_indexer : intp_t[:] Array of indices which sorts masked_vals - masked_vals : iu_64_floating_obj_t[:] + masked_vals : numeric_object_t[:] The values input to rank_1d, with missing values replaced by fill values mask : uint8_t[:] Array where entries are True if the value is missing, False otherwise. @@ -1051,7 +1078,7 @@ cdef void rank_sorted_1d( # that sorted value for retrieval back from the original # values / masked_vals arrays # TODO(cython3): de-duplicate once cython supports conditional nogil - if iu_64_floating_obj_t is object: + if numeric_object_t is object: with gil: for i in range(N): at_end = i == N - 1 @@ -1259,7 +1286,7 @@ cdef void rank_sorted_1d( def rank_2d( - ndarray[iu_64_floating_obj_t, ndim=2] in_arr, + ndarray[numeric_object_t, ndim=2] in_arr, int axis=0, bint is_datetimelike=False, ties_method="average", @@ -1274,13 +1301,13 @@ def rank_2d( Py_ssize_t k, n, col float64_t[::1, :] out # Column-major so columns are contiguous int64_t[::1] grp_sizes - ndarray[iu_64_floating_obj_t, ndim=2] values - iu_64_floating_obj_t[:, :] masked_vals + ndarray[numeric_object_t, ndim=2] values + numeric_object_t[:, :] masked_vals intp_t[:, :] sort_indexer uint8_t[:, :] mask TiebreakEnumType tiebreak bint check_mask, keep_na, nans_rank_highest - iu_64_floating_obj_t nan_fill_val + numeric_object_t nan_fill_val tiebreak = tiebreakers[ties_method] if tiebreak == TIEBREAK_FIRST: @@ -1290,29 +1317,32 @@ def rank_2d( keep_na = na_option == 'keep' # For cases where a mask is not possible, we can avoid mask checks - check_mask = not (iu_64_floating_obj_t is uint64_t or - (iu_64_floating_obj_t is int64_t and not is_datetimelike)) + check_mask = ( + numeric_object_t is float32_t + or numeric_object_t is float64_t + or numeric_object_t is object + or (numeric_object_t is int64_t and is_datetimelike) + ) if axis == 1: values = np.asarray(in_arr).T.copy() else: values = np.asarray(in_arr).copy() - if iu_64_floating_obj_t is object: + if numeric_object_t is object: if values.dtype != np.object_: values = values.astype('O') nans_rank_highest = ascending ^ (na_option == 'top') if check_mask: - nan_fill_val = get_rank_nan_fill_val[iu_64_floating_obj_t](nans_rank_highest) + nan_fill_val = get_rank_nan_fill_val[numeric_object_t](nans_rank_highest) - if iu_64_floating_obj_t is object: + if numeric_object_t is object: mask = missing.isnaobj2d(values).view(np.uint8) - elif iu_64_floating_obj_t is float64_t: + elif numeric_object_t is float64_t or numeric_object_t is float32_t: mask = np.isnan(values).view(np.uint8) - - # int64 and datetimelike else: + # i.e. int64 and datetimelike mask = (values == NPY_NAT).view(np.uint8) np.putmask(values, mask, nan_fill_val) else: diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index e901f91b8fe40..b2e2299c0f2b6 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1003,11 +1003,6 @@ def rank( is_datetimelike = needs_i8_conversion(values.dtype) values = _ensure_data(values) - if values.dtype.kind in ["i", "u", "f"]: - # rank_t includes only object, int64, uint64, float64 - dtype = values.dtype.kind + "8" - values = values.astype(dtype, copy=False) - if values.ndim == 1: ranks = algos.rank_1d( values,