From cf2dfa76d2736af151e429843be986884e1e74e4 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 16 Feb 2022 05:36:21 -0800 Subject: [PATCH] REF: Share NumericArray/NumericDtype methods (#45997) --- pandas/core/arrays/floating.py | 40 ++----------- pandas/core/arrays/integer.py | 71 +++++------------------ pandas/core/arrays/numeric.py | 64 ++++++++++++++++++-- pandas/core/arrays/string_.py | 4 +- pandas/core/groupby/ops.py | 10 ++-- pandas/io/stata.py | 6 +- pandas/tests/frame/methods/test_astype.py | 5 -- 7 files changed, 89 insertions(+), 111 deletions(-) diff --git a/pandas/core/arrays/floating.py b/pandas/core/arrays/floating.py index d55aef953b5b5..49a71922f331b 100644 --- a/pandas/core/arrays/floating.py +++ b/pandas/core/arrays/floating.py @@ -3,8 +3,8 @@ import numpy as np from pandas._typing import DtypeObj -from pandas.util._decorators import cache_readonly +from pandas.core.dtypes.common import is_float_dtype from pandas.core.dtypes.dtypes import register_extension_dtype from pandas.core.arrays.numeric import ( @@ -24,13 +24,7 @@ class FloatingDtype(NumericDtype): """ _default_np_dtype = np.dtype(np.float64) - - def __repr__(self) -> str: - return f"{self.name}Dtype()" - - @property - def _is_numeric(self) -> bool: - return True + _checker = is_float_dtype @classmethod def construct_array_type(cls) -> type[FloatingArray]: @@ -58,18 +52,8 @@ def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None: return None @classmethod - def _standardize_dtype(cls, dtype) -> FloatingDtype: - if isinstance(dtype, str) and dtype.startswith("Float"): - # Avoid DeprecationWarning from NumPy about np.dtype("Float64") - # https://github.com/numpy/numpy/pull/7476 - dtype = dtype.lower() - - if not issubclass(type(dtype), FloatingDtype): - try: - dtype = FLOAT_STR_TO_DTYPE[str(np.dtype(dtype))] - except KeyError as err: - raise ValueError(f"invalid dtype specified {dtype}") from err - return dtype + def _str_to_dtype_mapping(cls): + return FLOAT_STR_TO_DTYPE @classmethod def _safe_cast(cls, values: np.ndarray, dtype: np.dtype, copy: bool) -> np.ndarray: @@ -151,22 +135,6 @@ class FloatingArray(NumericArray): _truthy_value = 1.0 _falsey_value = 0.0 - @cache_readonly - def dtype(self) -> FloatingDtype: - return FLOAT_STR_TO_DTYPE[str(self._data.dtype)] - - def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False): - if not (isinstance(values, np.ndarray) and values.dtype.kind == "f"): - raise TypeError( - "values should be floating numpy array. Use " - "the 'pd.array' function instead" - ) - if values.dtype == np.float16: - # If we don't raise here, then accessing self.dtype would raise - raise TypeError("FloatingArray does not support np.float16 dtype.") - - super().__init__(values, mask, copy=copy) - _dtype_docstring = """ An ExtensionDtype for {dtype} data. diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 056669f40ca87..9ef3939656ecd 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -3,9 +3,9 @@ import numpy as np from pandas._typing import DtypeObj -from pandas.util._decorators import cache_readonly from pandas.core.dtypes.base import register_extension_dtype +from pandas.core.dtypes.common import is_integer_dtype from pandas.core.arrays.masked import BaseMaskedDtype from pandas.core.arrays.numeric import ( @@ -14,33 +14,18 @@ ) -class _IntegerDtype(NumericDtype): +class IntegerDtype(NumericDtype): """ An ExtensionDtype to hold a single size & kind of integer dtype. These specific implementations are subclasses of the non-public - _IntegerDtype. For example we have Int8Dtype to represent signed int 8s. + IntegerDtype. For example we have Int8Dtype to represent signed int 8s. The attributes name & type are set when these subclasses are created. """ _default_np_dtype = np.dtype(np.int64) - - def __repr__(self) -> str: - sign = "U" if self.is_unsigned_integer else "" - return f"{sign}Int{8 * self.itemsize}Dtype()" - - @cache_readonly - def is_signed_integer(self) -> bool: - return self.kind == "i" - - @cache_readonly - def is_unsigned_integer(self) -> bool: - return self.kind == "u" - - @property - def _is_numeric(self) -> bool: - return True + _checker = is_integer_dtype @classmethod def construct_array_type(cls) -> type[IntegerArray]: @@ -86,20 +71,8 @@ def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None: return None @classmethod - def _standardize_dtype(cls, dtype) -> _IntegerDtype: - if isinstance(dtype, str) and ( - dtype.startswith("Int") or dtype.startswith("UInt") - ): - # Avoid DeprecationWarning from NumPy about np.dtype("Int64") - # https://github.com/numpy/numpy/pull/7476 - dtype = dtype.lower() - - if not issubclass(type(dtype), _IntegerDtype): - try: - dtype = INT_STR_TO_DTYPE[str(np.dtype(dtype))] - except KeyError as err: - raise ValueError(f"invalid dtype specified {dtype}") from err - return dtype + def _str_to_dtype_mapping(cls): + return INT_STR_TO_DTYPE @classmethod def _safe_cast(cls, values: np.ndarray, dtype: np.dtype, copy: bool) -> np.ndarray: @@ -189,7 +162,7 @@ class IntegerArray(NumericArray): Length: 3, dtype: UInt16 """ - _dtype_cls = _IntegerDtype + _dtype_cls = IntegerDtype # The value used to fill '_data' to avoid upcasting _internal_fill_value = 1 @@ -197,18 +170,6 @@ class IntegerArray(NumericArray): _truthy_value = 1 _falsey_value = 0 - @cache_readonly - def dtype(self) -> _IntegerDtype: - return INT_STR_TO_DTYPE[str(self._data.dtype)] - - def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False): - if not (isinstance(values, np.ndarray) and values.dtype.kind in ["i", "u"]): - raise TypeError( - "values should be integer numpy array. Use " - "the 'pd.array' function instead" - ) - super().__init__(values, mask, copy=copy) - _dtype_docstring = """ An ExtensionDtype for {dtype} integer data. @@ -231,62 +192,62 @@ def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False): @register_extension_dtype -class Int8Dtype(_IntegerDtype): +class Int8Dtype(IntegerDtype): type = np.int8 name = "Int8" __doc__ = _dtype_docstring.format(dtype="int8") @register_extension_dtype -class Int16Dtype(_IntegerDtype): +class Int16Dtype(IntegerDtype): type = np.int16 name = "Int16" __doc__ = _dtype_docstring.format(dtype="int16") @register_extension_dtype -class Int32Dtype(_IntegerDtype): +class Int32Dtype(IntegerDtype): type = np.int32 name = "Int32" __doc__ = _dtype_docstring.format(dtype="int32") @register_extension_dtype -class Int64Dtype(_IntegerDtype): +class Int64Dtype(IntegerDtype): type = np.int64 name = "Int64" __doc__ = _dtype_docstring.format(dtype="int64") @register_extension_dtype -class UInt8Dtype(_IntegerDtype): +class UInt8Dtype(IntegerDtype): type = np.uint8 name = "UInt8" __doc__ = _dtype_docstring.format(dtype="uint8") @register_extension_dtype -class UInt16Dtype(_IntegerDtype): +class UInt16Dtype(IntegerDtype): type = np.uint16 name = "UInt16" __doc__ = _dtype_docstring.format(dtype="uint16") @register_extension_dtype -class UInt32Dtype(_IntegerDtype): +class UInt32Dtype(IntegerDtype): type = np.uint32 name = "UInt32" __doc__ = _dtype_docstring.format(dtype="uint32") @register_extension_dtype -class UInt64Dtype(_IntegerDtype): +class UInt64Dtype(IntegerDtype): type = np.uint64 name = "UInt64" __doc__ = _dtype_docstring.format(dtype="uint64") -INT_STR_TO_DTYPE: dict[str, _IntegerDtype] = { +INT_STR_TO_DTYPE: dict[str, IntegerDtype] = { "int8": Int8Dtype(), "int16": Int16Dtype(), "int32": Int32Dtype(), diff --git a/pandas/core/arrays/numeric.py b/pandas/core/arrays/numeric.py index 5ab1a9908fd02..958c9f7b0b3f1 100644 --- a/pandas/core/arrays/numeric.py +++ b/pandas/core/arrays/numeric.py @@ -3,6 +3,8 @@ import numbers from typing import ( TYPE_CHECKING, + Any, + Callable, TypeVar, ) @@ -17,6 +19,7 @@ DtypeObj, ) from pandas.errors import AbstractMethodError +from pandas.util._decorators import cache_readonly from pandas.core.dtypes.common import ( is_bool_dtype, @@ -41,6 +44,22 @@ class NumericDtype(BaseMaskedDtype): _default_np_dtype: np.dtype + _checker: Callable[[Any], bool] # is_foo_dtype + + def __repr__(self) -> str: + return f"{self.name}Dtype()" + + @cache_readonly + def is_signed_integer(self) -> bool: + return self.kind == "i" + + @cache_readonly + def is_unsigned_integer(self) -> bool: + return self.kind == "u" + + @property + def _is_numeric(self) -> bool: + return True def __from_arrow__( self, array: pyarrow.Array | pyarrow.ChunkedArray @@ -90,12 +109,27 @@ def __from_arrow__( else: return array_class._concat_same_type(results) + @classmethod + def _str_to_dtype_mapping(cls): + raise AbstractMethodError(cls) + @classmethod def _standardize_dtype(cls, dtype) -> NumericDtype: """ Convert a string representation or a numpy dtype to NumericDtype. """ - raise AbstractMethodError(cls) + if isinstance(dtype, str) and (dtype.startswith(("Int", "UInt", "Float"))): + # Avoid DeprecationWarning from NumPy about np.dtype("Int64") + # https://github.com/numpy/numpy/pull/7476 + dtype = dtype.lower() + + if not issubclass(type(dtype), cls): + mapping = cls._str_to_dtype_mapping() + try: + dtype = mapping[str(np.dtype(dtype))] + except KeyError as err: + raise ValueError(f"invalid dtype specified {dtype}") from err + return dtype @classmethod def _safe_cast(cls, values: np.ndarray, dtype: np.dtype, copy: bool) -> np.ndarray: @@ -108,10 +142,7 @@ def _safe_cast(cls, values: np.ndarray, dtype: np.dtype, copy: bool) -> np.ndarr def _coerce_to_data_and_mask(values, mask, dtype, copy, dtype_cls, default_dtype): - if default_dtype.kind == "f": - checker = is_float_dtype - else: - checker = is_integer_dtype + checker = dtype_cls._checker inferred_type = None @@ -188,6 +219,29 @@ class NumericArray(BaseMaskedArray): _dtype_cls: type[NumericDtype] + def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False): + checker = self._dtype_cls._checker + if not (isinstance(values, np.ndarray) and checker(values.dtype)): + descr = ( + "floating" + if self._dtype_cls.kind == "f" # type: ignore[comparison-overlap] + else "integer" + ) + raise TypeError( + f"values should be {descr} numpy array. Use " + "the 'pd.array' function instead" + ) + if values.dtype == np.float16: + # If we don't raise here, then accessing self.dtype would raise + raise TypeError("FloatingArray does not support np.float16 dtype.") + + super().__init__(values, mask, copy=copy) + + @cache_readonly + def dtype(self) -> NumericDtype: + mapping = self._dtype_cls._str_to_dtype_mapping() + return mapping[str(self._data.dtype)] + @classmethod def _coerce_to_array( cls, value, *, dtype: DtypeObj, copy: bool = False diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index af1756470a9c0..ca4348e3bd06a 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -45,7 +45,7 @@ ) from pandas.core.arrays.base import ExtensionArray from pandas.core.arrays.floating import FloatingDtype -from pandas.core.arrays.integer import _IntegerDtype +from pandas.core.arrays.integer import IntegerDtype from pandas.core.construction import extract_array from pandas.core.indexers import check_array_indexer from pandas.core.missing import isna @@ -432,7 +432,7 @@ def astype(self, dtype, copy: bool = True): return self.copy() return self - elif isinstance(dtype, _IntegerDtype): + elif isinstance(dtype, IntegerDtype): arr = self._ndarray.copy() mask = self.isna() arr[mask] = 0 diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index d4aa6ae9f4059..cf046d92dd6f3 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -76,7 +76,7 @@ ) from pandas.core.arrays.integer import ( Int64Dtype, - _IntegerDtype, + IntegerDtype, ) from pandas.core.arrays.masked import ( BaseMaskedArray, @@ -300,10 +300,10 @@ def _get_result_dtype(self, dtype: DtypeObj) -> DtypeObj: if how in ["add", "cumsum", "sum", "prod"]: if dtype == np.dtype(bool): return np.dtype(np.int64) - elif isinstance(dtype, (BooleanDtype, _IntegerDtype)): + elif isinstance(dtype, (BooleanDtype, IntegerDtype)): return Int64Dtype() elif how in ["mean", "median", "var"]: - if isinstance(dtype, (BooleanDtype, _IntegerDtype)): + if isinstance(dtype, (BooleanDtype, IntegerDtype)): return Float64Dtype() elif is_float_dtype(dtype) or is_complex_dtype(dtype): return dtype @@ -341,7 +341,7 @@ def _ea_wrap_cython_operation( # All of the functions implemented here are ordinal, so we can # operate on the tz-naive equivalents npvalues = values._ndarray.view("M8[ns]") - elif isinstance(values.dtype, (BooleanDtype, _IntegerDtype)): + elif isinstance(values.dtype, (BooleanDtype, IntegerDtype)): # IntegerArray or BooleanArray npvalues = values.to_numpy("float64", na_value=np.nan) elif isinstance(values.dtype, FloatingDtype): @@ -378,7 +378,7 @@ def _reconstruct_ea_result(self, values, res_values): # TODO: allow EAs to override this logic if isinstance( - values.dtype, (BooleanDtype, _IntegerDtype, FloatingDtype, StringDtype) + values.dtype, (BooleanDtype, IntegerDtype, FloatingDtype, StringDtype) ): dtype = self._get_result_dtype(values.dtype) cls = dtype.construct_array_type() diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 883cc36e4c1f1..60c4634662296 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -62,7 +62,7 @@ to_timedelta, ) from pandas.core.arrays.boolean import BooleanDtype -from pandas.core.arrays.integer import _IntegerDtype +from pandas.core.arrays.integer import IntegerDtype from pandas.core.frame import DataFrame from pandas.core.indexes.base import Index from pandas.core.series import Series @@ -585,7 +585,7 @@ def _cast_to_stata_types(data: DataFrame) -> DataFrame: for col in data: # Cast from unsupported types to supported types - is_nullable_int = isinstance(data[col].dtype, (_IntegerDtype, BooleanDtype)) + is_nullable_int = isinstance(data[col].dtype, (IntegerDtype, BooleanDtype)) orig = data[col] # We need to find orig_missing before altering data below orig_missing = orig.isna() @@ -593,7 +593,7 @@ def _cast_to_stata_types(data: DataFrame) -> DataFrame: missing_loc = data[col].isna() if missing_loc.any(): # Replace with always safe value - fv = 0 if isinstance(data[col].dtype, _IntegerDtype) else False + fv = 0 if isinstance(data[col].dtype, IntegerDtype) else False data.loc[missing_loc, col] = fv # Replace with NumPy-compatible column data[col] = data[col].astype(data[col].dtype.numpy_dtype) diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py index 0e7e4b537c719..6d343de9f5d3a 100644 --- a/pandas/tests/frame/methods/test_astype.py +++ b/pandas/tests/frame/methods/test_astype.py @@ -762,11 +762,6 @@ def test_astype_categorical_to_string_missing(self): class IntegerArrayNoCopy(pd.core.arrays.IntegerArray): # GH 42501 - @classmethod - def _from_sequence(cls, scalars, *, dtype=None, copy=False): - values, mask = cls._coerce_to_array(scalars, dtype=dtype, copy=copy) - return IntegerArrayNoCopy(values, mask) - def copy(self): assert False