REF: implement ArrowExtensionArray base class (pandas-dev#46102)
jbrockmendel authored Feb 26, 2022 · 1 parent c2188de · commit 7dea5ae
Showing 6 changed files with 102 additions and 94 deletions.
2 changes: 2 additions & 0 deletions pandas/core/arrays/_arrow_utils.py
@@ -1,3 +1,5 @@
from __future__ import annotations

import json

import numpy as np
89 changes: 89 additions & 0 deletions pandas/core/arrays/_mixins.py
@@ -28,6 +28,7 @@
npt,
type_t,
)
from pandas.compat import pa_version_under2p0
from pandas.errors import AbstractMethodError
from pandas.util._decorators import doc
from pandas.util._validators import (
@@ -66,6 +67,8 @@

if TYPE_CHECKING:

import pyarrow as pa

from pandas._typing import (
NumpySorter,
NumpyValueArrayLike,
@@ -508,3 +511,89 @@ def _empty(
arr = cls._from_sequence([], dtype=dtype)
backing = np.empty(shape, dtype=arr._ndarray.dtype)
return arr._from_backing_data(backing)


ArrowExtensionArrayT = TypeVar("ArrowExtensionArrayT", bound="ArrowExtensionArray")


class ArrowExtensionArray(ExtensionArray):
"""
Base class for ExtensionArray backed by Arrow array.
"""

_data: pa.ChunkedArray

def __init__(self, values: pa.ChunkedArray):
self._data = values

def __arrow_array__(self, type=None):
"""Convert myself to a pyarrow Array or ChunkedArray."""
return self._data

def equals(self, other) -> bool:
if not isinstance(other, ArrowExtensionArray):
return False
# I'm told that pyarrow makes __eq__ behave like pandas' equals;
# TODO: is this documented somewhere?
return self._data == other._data

@property
def nbytes(self) -> int:
"""
The number of bytes needed to store this object in memory.
"""
return self._data.nbytes

def __len__(self) -> int:
"""
Length of this array.

Returns
-------
length : int
"""
return len(self._data)

def isna(self) -> npt.NDArray[np.bool_]:
"""
Boolean NumPy array indicating if each value is missing.

This should return a 1-D array the same length as 'self'.
"""
if pa_version_under2p0:
return self._data.is_null().to_pandas().values
else:
return self._data.is_null().to_numpy()

def copy(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
"""
Return a shallow copy of the array.

Underlying ChunkedArray is immutable, so a deep copy is unnecessary.

Returns
-------
type(self)
"""
return type(self)(self._data)

@classmethod
def _concat_same_type(
cls: type[ArrowExtensionArrayT], to_concat
) -> ArrowExtensionArrayT:
"""
Concatenate multiple ArrowExtensionArrays.

Parameters
----------
to_concat : sequence of ArrowExtensionArrays

Returns
-------
ArrowExtensionArray
"""
import pyarrow as pa

chunks = [array for ea in to_concat for array in ea._data.iterchunks()]
arr = pa.chunked_array(chunks)
return cls(arr)
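
Not part of the diff: a minimal usage sketch of the new base class, assuming a pandas checkout that includes this commit (the pandas.core.arrays._mixins import path is specific to this revision) and an installed pyarrow. It only exercises the behaviour implemented above.

    import pyarrow as pa

    from pandas.core.arrays._mixins import ArrowExtensionArray

    # Wrap a pyarrow ChunkedArray; everything below is delegated to it.
    chunked = pa.chunked_array([["a", "b", None], ["c"]])
    arr = ArrowExtensionArray(chunked)

    len(arr)             # 4
    arr.isna()           # array([False, False,  True, False])
    arr.nbytes           # total size of the Arrow buffers
    copied = arr.copy()  # shallow copy; the ChunkedArray is immutable

    # _concat_same_type re-chunks without copying the underlying buffers.
    both = ArrowExtensionArray._concat_same_type([arr, copied])
    len(both)            # 8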
6 changes: 5 additions & 1 deletion pandas/core/arrays/string_.py
@@ -39,11 +39,11 @@
from pandas.core import ops
from pandas.core.array_algos import masked_reductions
from pandas.core.arrays import (
ExtensionArray,
FloatingArray,
IntegerArray,
PandasArray,
)
from pandas.core.arrays.base import ExtensionArray
from pandas.core.arrays.floating import FloatingDtype
from pandas.core.arrays.integer import IntegerDtype
from pandas.core.construction import extract_array
@@ -224,6 +224,10 @@ def __from_arrow__(


class BaseStringArray(ExtensionArray):
"""
Mixin class for StringArray, ArrowStringArray.
"""

pass


66 changes: 4 additions & 62 deletions pandas/core/arrays/string_arrow.py
@@ -48,6 +48,7 @@
from pandas.core.dtypes.missing import isna

from pandas.core.arraylike import OpsMixin
from pandas.core.arrays._mixins import ArrowExtensionArray
from pandas.core.arrays.base import ExtensionArray
from pandas.core.arrays.boolean import BooleanDtype
from pandas.core.arrays.integer import Int64Dtype
@@ -94,7 +95,9 @@ def _chk_pyarrow_available() -> None:
# fallback for the ones that pyarrow doesn't yet support


class ArrowStringArray(OpsMixin, BaseStringArray, ObjectStringArrayMixin):
class ArrowStringArray(
OpsMixin, ArrowExtensionArray, BaseStringArray, ObjectStringArrayMixin
):
"""
Extension array for string data in a ``pyarrow.ChunkedArray``.
@@ -191,10 +194,6 @@ def __array__(self, dtype: NpDtype | None = None) -> np.ndarray:
"""Correctly construct numpy arrays when passed to `np.asarray()`."""
return self.to_numpy(dtype=dtype)

def __arrow_array__(self, type=None):
"""Convert myself to a pyarrow Array or ChunkedArray."""
return self._data

def to_numpy(
self,
dtype: npt.DTypeLike | None = None,
@@ -216,16 +215,6 @@ def to_numpy(
result[mask] = na_value
return result

def __len__(self) -> int:
"""
Length of this array.

Returns
-------
length : int
"""
return len(self._data)

@doc(ExtensionArray.factorize)
def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]:
encoded = self._data.dictionary_encode()
@@ -243,25 +232,6 @@ def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]:

return indices.values, uniques

@classmethod
def _concat_same_type(cls, to_concat) -> ArrowStringArray:
"""
Concatenate multiple ArrowStringArray.

Parameters
----------
to_concat : sequence of ArrowStringArray

Returns
-------
ArrowStringArray
"""
return cls(
pa.chunked_array(
[array for ea in to_concat for array in ea._data.iterchunks()]
)
)

@overload
def __getitem__(self, item: ScalarIndexer) -> ArrowStringScalarOrNAT:
...
@@ -342,34 +312,6 @@ def _as_pandas_scalar(self, arrow_scalar: pa.Scalar):
else:
return scalar

@property
def nbytes(self) -> int:
"""
The number of bytes needed to store this object in memory.
"""
return self._data.nbytes

def isna(self) -> np.ndarray:
"""
Boolean NumPy array indicating if each value is missing.

This should return a 1-D array the same length as 'self'.
"""
# TODO: Implement .to_numpy for ChunkedArray
return self._data.is_null().to_pandas().values

def copy(self) -> ArrowStringArray:
"""
Return a shallow copy of the array.

Underlying ChunkedArray is immutable, so a deep copy is unnecessary.

Returns
-------
ArrowStringArray
"""
return type(self)(self._data)

def _cmp_method(self, other, op):
from pandas.arrays import BooleanArray

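
Net effect on ArrowStringArray: the deleted __arrow_array__, __len__, nbytes, isna, copy and _concat_same_type now resolve to the shared base class through the new MRO. A small sanity check, again assuming a checkout at this commit with pyarrow installed:

    from pandas.core.arrays._mixins import ArrowExtensionArray
    from pandas.core.arrays.string_arrow import ArrowStringArray

    arr = ArrowStringArray._from_sequence(["a", None, "c"])
    assert isinstance(arr, ArrowExtensionArray)

    # These methods are no longer defined on ArrowStringArray itself, so
    # attribute lookup falls through to ArrowExtensionArray.
    assert "isna" not in ArrowStringArray.__dict__
    assert ArrowStringArray.isna is ArrowExtensionArray.isna
    assert ArrowStringArray.copy is ArrowExtensionArray.copy

    arr.isna()  # array([False,  True, False]) -- inherited implementation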
24 changes: 2 additions & 22 deletions pandas/tests/extension/arrow/arrays.py
@@ -8,7 +8,6 @@
"""
from __future__ import annotations

import copy
import itertools
import operator

@@ -19,13 +18,13 @@

import pandas as pd
from pandas.api.extensions import (
ExtensionArray,
ExtensionDtype,
register_extension_dtype,
take,
)
from pandas.api.types import is_scalar
from pandas.core.arraylike import OpsMixin
from pandas.core.arrays._mixins import ArrowExtensionArray as _ArrowExtensionArray
from pandas.core.construction import extract_array


@@ -73,7 +72,7 @@ def construct_array_type(cls) -> type_t[ArrowStringArray]:
return ArrowStringArray


class ArrowExtensionArray(OpsMixin, ExtensionArray):
class ArrowExtensionArray(OpsMixin, _ArrowExtensionArray):
_data: pa.ChunkedArray

@classmethod
@@ -111,9 +110,6 @@ def __getitem__(self, item):
vals = self._data.to_pandas()[item]
return type(self)._from_sequence(vals)

def __len__(self):
return len(self._data)

def astype(self, dtype, copy=True):
# needed to fix this astype for the Series constructor.
if isinstance(dtype, type(self.dtype)) and dtype == self.dtype:
@@ -142,19 +138,6 @@ def __eq__(self, other):

return self._logical_method(other, operator.eq)

@property
def nbytes(self) -> int:
return sum(
x.size
for chunk in self._data.chunks
for x in chunk.buffers()
if x is not None
)

def isna(self):
nas = pd.isna(self._data.to_pandas())
return type(self)._from_sequence(nas)

def take(self, indices, allow_fill=False, fill_value=None):
data = self._data.to_pandas()
data = extract_array(data, extract_numpy=True)
@@ -165,9 +148,6 @@ def take(self, indices, allow_fill=False, fill_value=None):
result = take(data, indices, fill_value=fill_value, allow_fill=allow_fill)
return self._from_sequence(result, dtype=self.dtype)

def copy(self):
return type(self)(copy.copy(self._data))

@classmethod
def _concat_same_type(cls, to_concat):
chunks = list(itertools.chain.from_iterable(x._data.chunks for x in to_concat))
9 changes: 0 additions & 9 deletions pandas/tests/extension/arrow/test_bool.py
@@ -62,11 +62,6 @@ def test_contains(self, data, data_missing):


class TestConstructors(BaseArrowTests, base.BaseConstructorsTests):
# seems like some bug in isna on empty BoolArray returning floats.
@pytest.mark.xfail(reason="bad is-na for empty data")
def test_from_sequence_from_cls(self, data):
super().test_from_sequence_from_cls(data)

@pytest.mark.xfail(reason="pa.NULL is not recognised as scalar, GH-33899")
def test_series_constructor_no_data_with_index(self, dtype, na_value):
# pyarrow.lib.ArrowInvalid: only handle 1-dimensional arrays
@@ -77,10 +72,6 @@ def test_series_constructor_scalar_na_with_index(self, dtype, na_value):
# pyarrow.lib.ArrowInvalid: only handle 1-dimensional arrays
super().test_series_constructor_scalar_na_with_index(dtype, na_value)

@pytest.mark.xfail(reason="ufunc 'invert' not supported for the input types")
def test_construct_empty_dataframe(self, dtype):
super().test_construct_empty_dataframe(dtype)

@pytest.mark.xfail(reason="_from_sequence ignores dtype keyword")
def test_empty(self, dtype):
super().test_empty(dtype)
