From 87803d0c3abfbbac5261a2b1a94e6a76a934b1bf Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 3 Mar 2022 15:35:28 -0800 Subject: [PATCH] PERF: avoid object dtype cast for Categorical in _ensure_data (#46208) --- pandas/core/algorithms.py | 33 +++++++++++++-------------------- 1 file changed, 13 insertions(+), 20 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 55d1bbaa20c71..5d5380be38de1 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -136,7 +136,6 @@ def _ensure_data(values: ArrayLike) -> np.ndarray: # extract_array would raise values = extract_array(values, extract_numpy=True) - # we check some simple dtypes first if is_object_dtype(values.dtype): return ensure_object(np.asarray(values)) @@ -149,17 +148,19 @@ def _ensure_data(values: ArrayLike) -> np.ndarray: return _ensure_data(values._data) return np.asarray(values) + elif is_categorical_dtype(values.dtype): + # NB: cases that go through here should NOT be using _reconstruct_data + # on the back-end. + values = cast("Categorical", values) + return values.codes + elif is_bool_dtype(values.dtype): if isinstance(values, np.ndarray): # i.e. actually dtype == np.dtype("bool") return np.asarray(values).view("uint8") else: - # i.e. all-bool Categorical, BooleanArray - try: - return np.asarray(values).astype("uint8", copy=False) - except (TypeError, ValueError): - # GH#42107 we have pd.NAs present - return np.asarray(values) + # e.g. Sparse[bool, False] # TODO: no test cases get here + return np.asarray(values).astype("uint8", copy=False) elif is_integer_dtype(values.dtype): return np.asarray(values) @@ -174,10 +175,7 @@ def _ensure_data(values: ArrayLike) -> np.ndarray: return np.asarray(values) elif is_complex_dtype(values.dtype): - # Incompatible return value type (got "Tuple[Union[Any, ExtensionArray, - # ndarray[Any, Any]], Union[Any, ExtensionDtype]]", expected - # "Tuple[ndarray[Any, Any], Union[dtype[Any], ExtensionDtype]]") - return values # type: ignore[return-value] + return cast(np.ndarray, values) # datetimelike elif needs_i8_conversion(values.dtype): @@ -187,11 +185,6 @@ def _ensure_data(values: ArrayLike) -> np.ndarray: npvalues = cast(np.ndarray, npvalues) return npvalues - elif is_categorical_dtype(values.dtype): - values = cast("Categorical", values) - values = values.codes - return values - # we have failed, return object values = np.asarray(values, dtype=object) return ensure_object(values) @@ -218,7 +211,8 @@ def _reconstruct_data( return values if not isinstance(dtype, np.dtype): - # i.e. ExtensionDtype + # i.e. ExtensionDtype; note we have ruled out above the possibility + # that values.dtype == dtype cls = dtype.construct_array_type() values = cls._from_sequence(values, dtype=dtype) @@ -938,9 +932,8 @@ def mode(values: ArrayLike, dropna: bool = True) -> ArrayLike: if needs_i8_conversion(values.dtype): # Got here with ndarray; dispatch to DatetimeArray/TimedeltaArray. values = ensure_wrapped_if_datetimelike(values) - # error: Item "ndarray[Any, Any]" of "Union[ExtensionArray, - # ndarray[Any, Any]]" has no attribute "_mode" - return values._mode(dropna=dropna) # type: ignore[union-attr] + values = cast("ExtensionArray", values) + return values._mode(dropna=dropna) values = _ensure_data(values)