diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 8c838d8944d14..55d1bbaa20c71 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -11,7 +11,6 @@ Hashable, Literal, Sequence, - Union, cast, final, ) @@ -30,7 +29,6 @@ ArrayLike, DtypeObj, IndexLabel, - Scalar, TakeIndexer, npt, ) @@ -105,9 +103,7 @@ ) from pandas.core.arrays import ( BaseMaskedArray, - DatetimeArray, ExtensionArray, - TimedeltaArray, ) @@ -539,6 +535,14 @@ def factorize_array( codes : ndarray[np.intp] uniques : ndarray """ + original = values + if values.dtype.kind in ["m", "M"]: + # _get_hashtable_algo will cast dt64/td64 to i8 via _ensure_data, so we + # need to do the same to na_value. We are assuming here that the passed + # na_value is an appropriately-typed NaT. + # e.g. test_where_datetimelike_categorical + na_value = iNaT + hash_klass, values = _get_hashtable_algo(values) table = hash_klass(size_hint or len(values)) @@ -546,6 +550,9 @@ def factorize_array( values, na_sentinel=na_sentinel, na_value=na_value, mask=mask ) + # re-cast e.g. i8->dt64/td64, uint8->bool + uniques = _reconstruct_data(uniques, original.dtype, original) + codes = ensure_platform_int(codes) return codes, uniques @@ -720,33 +727,18 @@ def factorize( isinstance(values, (ABCDatetimeArray, ABCTimedeltaArray)) and values.freq is not None ): + # The presence of 'freq' means we can fast-path sorting and know there + # aren't NAs codes, uniques = values.factorize(sort=sort) - if isinstance(original, ABCIndex): - uniques = original._shallow_copy(uniques, name=None) - elif isinstance(original, ABCSeries): - from pandas import Index - - uniques = Index(uniques) - return codes, uniques + return _re_wrap_factorize(original, uniques, codes) if not isinstance(values.dtype, np.dtype): # i.e. 
ExtensionDtype codes, uniques = values.factorize(na_sentinel=na_sentinel) - dtype = original.dtype else: - dtype = values.dtype - values = _ensure_data(values) - na_value: Scalar | None - - if original.dtype.kind in ["m", "M"]: - # Note: factorize_array will cast NaT bc it has a __int__ - # method, but will not cast the more-correct dtype.type("nat") - na_value = iNaT - else: - na_value = None - + values = np.asarray(values) # convert DTA/TDA/MultiIndex codes, uniques = factorize_array( - values, na_sentinel=na_sentinel, size_hint=size_hint, na_value=na_value + values, na_sentinel=na_sentinel, size_hint=size_hint ) if sort and len(uniques) > 0: @@ -759,23 +751,20 @@ def factorize( # na_value is set based on the dtype of uniques, and compat set to False is # because we do not want na_value to be 0 for integers na_value = na_value_for_dtype(uniques.dtype, compat=False) - # Argument 2 to "append" has incompatible type "List[Union[str, float, Period, - # Timestamp, Timedelta, Any]]"; expected "Union[_SupportsArray[dtype[Any]], - # _NestedSequence[_SupportsArray[dtype[Any]]] - # , bool, int, float, complex, str, bytes, _NestedSequence[Union[bool, int, - # float, complex, str, bytes]]]" [arg-type] - uniques = np.append(uniques, [na_value]) # type: ignore[arg-type] + uniques = np.append(uniques, [na_value]) codes = np.where(code_is_na, len(uniques) - 1, codes) - uniques = _reconstruct_data(uniques, dtype, original) + uniques = _reconstruct_data(uniques, original.dtype, original) + + return _re_wrap_factorize(original, uniques, codes) + - # return original tenor +def _re_wrap_factorize(original, uniques, codes: np.ndarray): + """ + Wrap factorize results in Series or Index depending on original type. 
+ """ if isinstance(original, ABCIndex): - if original.dtype.kind in ["m", "M"] and isinstance(uniques, np.ndarray): - original._data = cast( - "Union[DatetimeArray, TimedeltaArray]", original._data - ) - uniques = type(original._data)._simple_new(uniques, dtype=original.dtype) + uniques = ensure_wrapped_if_datetimelike(uniques) uniques = original._shallow_copy(uniques, name=None) elif isinstance(original, ABCSeries): from pandas import Index diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py index cc58d7ddd03dc..cb018103d07b4 100644 --- a/pandas/core/arrays/_mixins.py +++ b/pandas/core/arrays/_mixins.py @@ -182,6 +182,11 @@ def equals(self, other) -> bool: return False return bool(array_equivalent(self._ndarray, other._ndarray)) + @classmethod + def _from_factorized(cls, values, original): + assert values.dtype == original._ndarray.dtype + return original._from_backing_data(values) + def _values_for_argsort(self) -> np.ndarray: return self._ndarray diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index bea99a741fd6f..709ddd049b07b 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2305,12 +2305,6 @@ def unique(self): def _values_for_factorize(self): return self._ndarray, -1 - @classmethod - def _from_factorized(cls, uniques, original): - # ensure we have the same itemsize for codes - codes = coerce_indexer_dtype(uniques, original.dtype.categories) - return original._from_backing_data(codes) - def _cast_quantile_result(self, res_values: np.ndarray) -> np.ndarray: # make sure we have correct itemsize for resulting codes res_values = coerce_indexer_dtype(res_values, self.dtype.categories) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index a0d9fab764811..dc079c3431ba3 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -550,14 +550,7 @@ def copy(self: DatetimeLikeArrayT, order="C") -> DatetimeLikeArrayT: 
return new_obj def _values_for_factorize(self): - # int64 instead of int ensures we have a "view" method - return self._ndarray, np.int64(iNaT) - - @classmethod - def _from_factorized( - cls: type[DatetimeLikeArrayT], values, original: DatetimeLikeArrayT - ) -> DatetimeLikeArrayT: - return cls(values, dtype=original.dtype) + return self._ndarray, self._internal_fill_value # ------------------------------------------------------------------ # Validation Methods diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 4dffd7e76dc2e..f2cf74fa41ba0 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -874,8 +874,9 @@ def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]: codes, uniques = factorize_array(arr, na_sentinel=na_sentinel, mask=mask) - # the hashtables don't handle all different types of bits - uniques = uniques.astype(self.dtype.numpy_dtype, copy=False) + # check that factorize_array correctly preserves dtype. + assert uniques.dtype == self.dtype.numpy_dtype, (uniques.dtype, self.dtype) + uniques_ea = type(self)(uniques, np.zeros(len(uniques), dtype=bool)) return codes, uniques_ea diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index d9c2ec0786ced..bf41457ad6109 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -109,10 +109,6 @@ def _from_sequence( result = result.copy() return cls(result) - @classmethod - def _from_factorized(cls, values, original) -> PandasArray: - return original._from_backing_data(values) - def _from_backing_data(self, arr: np.ndarray) -> PandasArray: return type(self)(arr)