diff --git a/pandas/core/arrays/_arrow_utils.py b/pandas/core/arrays/_arrow_utils.py
index 0e5be675f8fcd..112772bc9e842 100644
--- a/pandas/core/arrays/_arrow_utils.py
+++ b/pandas/core/arrays/_arrow_utils.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 import json
 
 import numpy as np
diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py
index cace08aac5c7e..9fe8c6f78f875 100644
--- a/pandas/core/arrays/_mixins.py
+++ b/pandas/core/arrays/_mixins.py
@@ -28,6 +28,7 @@
     npt,
     type_t,
 )
+from pandas.compat import pa_version_under2p0
 from pandas.errors import AbstractMethodError
 from pandas.util._decorators import doc
 from pandas.util._validators import (
@@ -66,6 +67,8 @@
 
 if TYPE_CHECKING:
+    import pyarrow as pa
+
     from pandas._typing import (
         NumpySorter,
         NumpyValueArrayLike,
     )
@@ -508,3 +511,89 @@ def _empty(
         arr = cls._from_sequence([], dtype=dtype)
         backing = np.empty(shape, dtype=arr._ndarray.dtype)
         return arr._from_backing_data(backing)
+
+
+ArrowExtensionArrayT = TypeVar("ArrowExtensionArrayT", bound="ArrowExtensionArray")
+
+
+class ArrowExtensionArray(ExtensionArray):
+    """
+    Base class for ExtensionArray backed by Arrow array.
+    """
+
+    _data: pa.ChunkedArray
+
+    def __init__(self, values: pa.ChunkedArray):
+        self._data = values
+
+    def __arrow_array__(self, type=None):
+        """Convert myself to a pyarrow Array or ChunkedArray."""
+        return self._data
+
+    def equals(self, other) -> bool:
+        if not isinstance(other, ArrowExtensionArray):
+            return False
+        # I'm told that pyarrow makes __eq__ behave like pandas' equals;
+        #  TODO: is this documented somewhere?
+        return self._data == other._data
+
+    @property
+    def nbytes(self) -> int:
+        """
+        The number of bytes needed to store this object in memory.
+        """
+        return self._data.nbytes
+
+    def __len__(self) -> int:
+        """
+        Length of this array.
+
+        Returns
+        -------
+        length : int
+        """
+        return len(self._data)
+
+    def isna(self) -> npt.NDArray[np.bool_]:
+        """
+        Boolean NumPy array indicating if each value is missing.
+
+        This should return a 1-D array the same length as 'self'.
+        """
+        if pa_version_under2p0:
+            return self._data.is_null().to_pandas().values
+        else:
+            return self._data.is_null().to_numpy()
+
+    def copy(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
+        """
+        Return a shallow copy of the array.
+
+        Underlying ChunkedArray is immutable, so a deep copy is unnecessary.
+
+        Returns
+        -------
+        type(self)
+        """
+        return type(self)(self._data)
+
+    @classmethod
+    def _concat_same_type(
+        cls: type[ArrowExtensionArrayT], to_concat
+    ) -> ArrowExtensionArrayT:
+        """
+        Concatenate multiple ArrowExtensionArrays.
+
+        Parameters
+        ----------
+        to_concat : sequence of ArrowExtensionArrays
+
+        Returns
+        -------
+        ArrowExtensionArray
+        """
+        import pyarrow as pa
+
+        chunks = [array for ea in to_concat for array in ea._data.iterchunks()]
+        arr = pa.chunked_array(chunks)
+        return cls(arr)
diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index ca4348e3bd06a..fce1942433cf7 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -39,11 +39,11 @@
 from pandas.core import ops
 from pandas.core.array_algos import masked_reductions
 from pandas.core.arrays import (
+    ExtensionArray,
     FloatingArray,
     IntegerArray,
     PandasArray,
 )
-from pandas.core.arrays.base import ExtensionArray
 from pandas.core.arrays.floating import FloatingDtype
 from pandas.core.arrays.integer import IntegerDtype
 from pandas.core.construction import extract_array
@@ -224,6 +224,10 @@ def __from_arrow__(
 
 
 class BaseStringArray(ExtensionArray):
+    """
+    Mixin class for StringArray, ArrowStringArray.
+    """
+
     pass
 
 
diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index 002def4d31e72..63fdf930f9ee6 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -48,6 +48,7 @@
 from pandas.core.dtypes.missing import isna
 
 from pandas.core.arraylike import OpsMixin
+from pandas.core.arrays._mixins import ArrowExtensionArray
 from pandas.core.arrays.base import ExtensionArray
 from pandas.core.arrays.boolean import BooleanDtype
 from pandas.core.arrays.integer import Int64Dtype
@@ -94,7 +95,9 @@ def _chk_pyarrow_available() -> None:
 # fallback for the ones that pyarrow doesn't yet support
 
 
-class ArrowStringArray(OpsMixin, BaseStringArray, ObjectStringArrayMixin):
+class ArrowStringArray(
+    OpsMixin, ArrowExtensionArray, BaseStringArray, ObjectStringArrayMixin
+):
     """
     Extension array for string data in a ``pyarrow.ChunkedArray``.
 
@@ -191,10 +194,6 @@ def __array__(self, dtype: NpDtype | None = None) -> np.ndarray:
         """Correctly construct numpy arrays when passed to `np.asarray()`."""
         return self.to_numpy(dtype=dtype)
 
-    def __arrow_array__(self, type=None):
-        """Convert myself to a pyarrow Array or ChunkedArray."""
-        return self._data
-
     def to_numpy(
         self,
         dtype: npt.DTypeLike | None = None,
@@ -216,16 +215,6 @@ def to_numpy(
             result[mask] = na_value
         return result
 
-    def __len__(self) -> int:
-        """
-        Length of this array.
-
-        Returns
-        -------
-        length : int
-        """
-        return len(self._data)
-
     @doc(ExtensionArray.factorize)
     def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]:
         encoded = self._data.dictionary_encode()
@@ -243,25 +232,6 @@ def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]:
 
         return indices.values, uniques
 
-    @classmethod
-    def _concat_same_type(cls, to_concat) -> ArrowStringArray:
-        """
-        Concatenate multiple ArrowStringArray.
-
-        Parameters
-        ----------
-        to_concat : sequence of ArrowStringArray
-
-        Returns
-        -------
-        ArrowStringArray
-        """
-        return cls(
-            pa.chunked_array(
-                [array for ea in to_concat for array in ea._data.iterchunks()]
-            )
-        )
-
     @overload
     def __getitem__(self, item: ScalarIndexer) -> ArrowStringScalarOrNAT:
         ...
@@ -342,34 +312,6 @@ def _as_pandas_scalar(self, arrow_scalar: pa.Scalar):
         else:
             return scalar
 
-    @property
-    def nbytes(self) -> int:
-        """
-        The number of bytes needed to store this object in memory.
-        """
-        return self._data.nbytes
-
-    def isna(self) -> np.ndarray:
-        """
-        Boolean NumPy array indicating if each value is missing.
-
-        This should return a 1-D array the same length as 'self'.
-        """
-        # TODO: Implement .to_numpy for ChunkedArray
-        return self._data.is_null().to_pandas().values
-
-    def copy(self) -> ArrowStringArray:
-        """
-        Return a shallow copy of the array.
-
-        Underlying ChunkedArray is immutable, so a deep copy is unnecessary.
-
-        Returns
-        -------
-        ArrowStringArray
-        """
-        return type(self)(self._data)
-
     def _cmp_method(self, other, op):
         from pandas.arrays import BooleanArray
 
diff --git a/pandas/tests/extension/arrow/arrays.py b/pandas/tests/extension/arrow/arrays.py
index 3707447151ae3..1ab3d49392052 100644
--- a/pandas/tests/extension/arrow/arrays.py
+++ b/pandas/tests/extension/arrow/arrays.py
@@ -8,7 +8,6 @@
 """
 from __future__ import annotations
 
-import copy
 import itertools
 import operator
 
@@ -19,13 +18,13 @@
 
 import pandas as pd
 from pandas.api.extensions import (
-    ExtensionArray,
     ExtensionDtype,
     register_extension_dtype,
     take,
 )
 from pandas.api.types import is_scalar
 from pandas.core.arraylike import OpsMixin
+from pandas.core.arrays._mixins import ArrowExtensionArray as _ArrowExtensionArray
 from pandas.core.construction import extract_array
 
 
@@ -73,7 +72,7 @@ def construct_array_type(cls) -> type_t[ArrowStringArray]:
         return ArrowStringArray
 
 
-class ArrowExtensionArray(OpsMixin, ExtensionArray):
+class ArrowExtensionArray(OpsMixin, _ArrowExtensionArray):
     _data: pa.ChunkedArray
 
     @classmethod
@@ -111,9 +110,6 @@ def __getitem__(self, item):
         vals = self._data.to_pandas()[item]
         return type(self)._from_sequence(vals)
 
-    def __len__(self):
-        return len(self._data)
-
     def astype(self, dtype, copy=True):
         # needed to fix this astype for the Series constructor.
         if isinstance(dtype, type(self.dtype)) and dtype == self.dtype:
@@ -142,19 +138,6 @@ def __eq__(self, other):
 
         return self._logical_method(other, operator.eq)
 
-    @property
-    def nbytes(self) -> int:
-        return sum(
-            x.size
-            for chunk in self._data.chunks
-            for x in chunk.buffers()
-            if x is not None
-        )
-
-    def isna(self):
-        nas = pd.isna(self._data.to_pandas())
-        return type(self)._from_sequence(nas)
-
     def take(self, indices, allow_fill=False, fill_value=None):
         data = self._data.to_pandas()
         data = extract_array(data, extract_numpy=True)
@@ -165,9 +148,6 @@ def take(self, indices, allow_fill=False, fill_value=None):
         result = take(data, indices, fill_value=fill_value, allow_fill=allow_fill)
         return self._from_sequence(result, dtype=self.dtype)
 
-    def copy(self):
-        return type(self)(copy.copy(self._data))
-
     @classmethod
     def _concat_same_type(cls, to_concat):
         chunks = list(itertools.chain.from_iterable(x._data.chunks for x in to_concat))
diff --git a/pandas/tests/extension/arrow/test_bool.py b/pandas/tests/extension/arrow/test_bool.py
index ddd10dfcb2d60..bdfbbef937019 100644
--- a/pandas/tests/extension/arrow/test_bool.py
+++ b/pandas/tests/extension/arrow/test_bool.py
@@ -62,11 +62,6 @@ def test_contains(self, data, data_missing):
 
 class TestConstructors(BaseArrowTests, base.BaseConstructorsTests):
-    # seems like some bug in isna on empty BoolArray returning floats.
- @pytest.mark.xfail(reason="bad is-na for empty data") - def test_from_sequence_from_cls(self, data): - super().test_from_sequence_from_cls(data) - @pytest.mark.xfail(reason="pa.NULL is not recognised as scalar, GH-33899") def test_series_constructor_no_data_with_index(self, dtype, na_value): # pyarrow.lib.ArrowInvalid: only handle 1-dimensional arrays @@ -77,10 +72,6 @@ def test_series_constructor_scalar_na_with_index(self, dtype, na_value): # pyarrow.lib.ArrowInvalid: only handle 1-dimensional arrays super().test_series_constructor_scalar_na_with_index(dtype, na_value) - @pytest.mark.xfail(reason="ufunc 'invert' not supported for the input types") - def test_construct_empty_dataframe(self, dtype): - super().test_construct_empty_dataframe(dtype) - @pytest.mark.xfail(reason="_from_sequence ignores dtype keyword") def test_empty(self, dtype): super().test_empty(dtype)