REF: implement ArrowExtensionArray base class (pandas-dev#46102)
jbrockmendel authored Feb 26, 2022 · 1 parent c2188de · commit 7dea5ae
Showing 6 changed files with 102 additions and 94 deletions.
2 changes: 2 additions & 0 deletions pandas/core/arrays/_arrow_utils.py
@@ -1,3 +1,5 @@
from __future__ import annotations

import json

import numpy as np
89 changes: 89 additions & 0 deletions pandas/core/arrays/_mixins.py
@@ -28,6 +28,7 @@
npt,
type_t,
)
from pandas.compat import pa_version_under2p0
from pandas.errors import AbstractMethodError
from pandas.util._decorators import doc
from pandas.util._validators import (
@@ -66,6 +67,8 @@

if TYPE_CHECKING:

import pyarrow as pa

from pandas._typing import (
NumpySorter,
NumpyValueArrayLike,
@@ -508,3 +511,89 @@ def _empty(
arr = cls._from_sequence([], dtype=dtype)
backing = np.empty(shape, dtype=arr._ndarray.dtype)
return arr._from_backing_data(backing)


ArrowExtensionArrayT = TypeVar("ArrowExtensionArrayT", bound="ArrowExtensionArray")


class ArrowExtensionArray(ExtensionArray):
"""
Base class for ExtensionArray backed by Arrow array.
"""

_data: pa.ChunkedArray

def __init__(self, values: pa.ChunkedArray):
self._data = values

def __arrow_array__(self, type=None):
"""Convert myself to a pyarrow Array or ChunkedArray."""
return self._data

def equals(self, other) -> bool:
if not isinstance(other, ArrowExtensionArray):
return False
# I'm told that pyarrow makes __eq__ behave like pandas' equals;
# TODO: is this documented somewhere?
return self._data == other._data

@property
def nbytes(self) -> int:
"""
The number of bytes needed to store this object in memory.
"""
return self._data.nbytes

def __len__(self) -> int:
"""
Length of this array.

Returns
-------
length : int
"""
return len(self._data)

def isna(self) -> npt.NDArray[np.bool_]:
"""
Boolean NumPy array indicating if each value is missing.

This should return a 1-D array the same length as 'self'.
"""
if pa_version_under2p0:
return self._data.is_null().to_pandas().values
else:
return self._data.is_null().to_numpy()

def copy(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
"""
Return a shallow copy of the array.

Underlying ChunkedArray is immutable, so a deep copy is unnecessary.

Returns
-------
type(self)
"""
return type(self)(self._data)

@classmethod
def _concat_same_type(
cls: type[ArrowExtensionArrayT], to_concat
) -> ArrowExtensionArrayT:
"""
Concatenate multiple ArrowExtensionArrays.

Parameters
----------
to_concat : sequence of ArrowExtensionArrays

Returns
-------
ArrowExtensionArray
"""
import pyarrow as pa

chunks = [array for ea in to_concat for array in ea._data.iterchunks()]
arr = pa.chunked_array(chunks)
return cls(arr)
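
Not part of the diff: a minimal usage sketch of the new base class, assuming a pandas checkout that includes this commit (the pandas.core.arrays._mixins import path is specific to this revision) and an installed pyarrow. It only exercises the behaviour implemented above.

    import pyarrow as pa

    from pandas.core.arrays._mixins import ArrowExtensionArray

    # Wrap a pyarrow ChunkedArray; everything below is delegated to it.
    chunked = pa.chunked_array([["a", "b", None], ["c"]])
    arr = ArrowExtensionArray(chunked)

    len(arr)             # 4
    arr.isna()           # array([False, False,  True, False])
    arr.nbytes           # total size of the Arrow buffers
    copied = arr.copy()  # shallow copy; the ChunkedArray is immutable

    # _concat_same_type re-chunks without copying the underlying buffers.
    both = ArrowExtensionArray._concat_same_type([arr, copied])
    len(both)            # 8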
6 changes: 5 additions & 1 deletion pandas/core/arrays/string_.py
@@ -39,11 +39,11 @@
from pandas.core import ops
from pandas.core.array_algos import masked_reductions
from pandas.core.arrays import (
ExtensionArray,
FloatingArray,
IntegerArray,
PandasArray,
)
from pandas.core.arrays.base import ExtensionArray
from pandas.core.arrays.floating import FloatingDtype
from pandas.core.arrays.integer import IntegerDtype
from pandas.core.construction import extract_array
@@ -224,6 +224,10 @@ def __from_arrow__(


class BaseStringArray(ExtensionArray):
"""
Mixin class for StringArray, ArrowStringArray.
"""

pass


66 changes: 4 additions & 62 deletions pandas/core/arrays/string_arrow.py
@@ -48,6 +48,7 @@
from pandas.core.dtypes.missing import isna

from pandas.core.arraylike import OpsMixin
from pandas.core.arrays._mixins import ArrowExtensionArray
from pandas.core.arrays.base import ExtensionArray
from pandas.core.arrays.boolean import BooleanDtype
from pandas.core.arrays.integer import Int64Dtype
@@ -94,7 +95,9 @@ def _chk_pyarrow_available() -> None:
# fallback for the ones that pyarrow doesn't yet support


class ArrowStringArray(OpsMixin, BaseStringArray, ObjectStringArrayMixin):
class ArrowStringArray(
OpsMixin, ArrowExtensionArray, BaseStringArray, ObjectStringArrayMixin
):
"""
Extension array for string data in a ``pyarrow.ChunkedArray``.
@@ -191,10 +194,6 @@ def __array__(self, dtype: NpDtype | None = None) -> np.ndarray:
"""Correctly construct numpy arrays when passed to `np.asarray()`."""
return self.to_numpy(dtype=dtype)

def __arrow_array__(self, type=None):
"""Convert myself to a pyarrow Array or ChunkedArray."""
return self._data

def to_numpy(
self,
dtype: npt.DTypeLike | None = None,
@@ -216,16 +215,6 @@ def to_numpy(
result[mask] = na_value
return result

def __len__(self) -> int:
"""
Length of this array.

Returns
-------
length : int
"""
return len(self._data)

@doc(ExtensionArray.factorize)
def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]:
encoded = self._data.dictionary_encode()
@@ -243,25 +232,6 @@ def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]:

return indices.values, uniques

@classmethod
def _concat_same_type(cls, to_concat) -> ArrowStringArray:
"""
Concatenate multiple ArrowStringArray.

Parameters
----------
to_concat : sequence of ArrowStringArray

Returns
-------
ArrowStringArray
"""
return cls(
pa.chunked_array(
[array for ea in to_concat for array in ea._data.iterchunks()]
)
)

@overload
def __getitem__(self, item: ScalarIndexer) -> ArrowStringScalarOrNAT:
...
@@ -342,34 +312,6 @@ def _as_pandas_scalar(self, arrow_scalar: pa.Scalar):
else:
return scalar

@property
def nbytes(self) -> int:
"""
The number of bytes needed to store this object in memory.
"""
return self._data.nbytes

def isna(self) -> np.ndarray:
"""
Boolean NumPy array indicating if each value is missing.

This should return a 1-D array the same length as 'self'.
"""
# TODO: Implement .to_numpy for ChunkedArray
return self._data.is_null().to_pandas().values

def copy(self) -> ArrowStringArray:
"""
Return a shallow copy of the array.

Underlying ChunkedArray is immutable, so a deep copy is unnecessary.

Returns
-------
ArrowStringArray
"""
return type(self)(self._data)

def _cmp_method(self, other, op):
from pandas.arrays import BooleanArray

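
Net effect on ArrowStringArray: the deleted __arrow_array__, __len__, nbytes, isna, copy and _concat_same_type now resolve to the shared base class through the new MRO. A small sanity check, again assuming a checkout at this commit with pyarrow installed:

    from pandas.core.arrays._mixins import ArrowExtensionArray
    from pandas.core.arrays.string_arrow import ArrowStringArray

    arr = ArrowStringArray._from_sequence(["a", None, "c"])
    assert isinstance(arr, ArrowExtensionArray)

    # These methods are no longer defined on ArrowStringArray itself, so
    # attribute lookup falls through to ArrowExtensionArray.
    assert "isna" not in ArrowStringArray.__dict__
    assert ArrowStringArray.isna is ArrowExtensionArray.isna
    assert ArrowStringArray.copy is ArrowExtensionArray.copy

    arr.isna()  # array([False,  True, False]) -- inherited implementation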
24 changes: 2 additions & 22 deletions pandas/tests/extension/arrow/arrays.py
@@ -8,7 +8,6 @@
"""
from __future__ import annotations

import copy
import itertools
import operator

@@ -19,13 +18,13 @@

import pandas as pd
from pandas.api.extensions import (
ExtensionArray,
ExtensionDtype,
register_extension_dtype,
take,
)
from pandas.api.types import is_scalar
from pandas.core.arraylike import OpsMixin
from pandas.core.arrays._mixins import ArrowExtensionArray as _ArrowExtensionArray
from pandas.core.construction import extract_array


@@ -73,7 +72,7 @@ def construct_array_type(cls) -> type_t[ArrowStringArray]:
return ArrowStringArray


class ArrowExtensionArray(OpsMixin, ExtensionArray):
class ArrowExtensionArray(OpsMixin, _ArrowExtensionArray):
_data: pa.ChunkedArray

@classmethod
@@ -111,9 +110,6 @@ def __getitem__(self, item):
vals = self._data.to_pandas()[item]
return type(self)._from_sequence(vals)

def __len__(self):
return len(self._data)

def astype(self, dtype, copy=True):
# needed to fix this astype for the Series constructor.
if isinstance(dtype, type(self.dtype)) and dtype == self.dtype:
@@ -142,19 +138,6 @@ def __eq__(self, other):

return self._logical_method(other, operator.eq)

@property
def nbytes(self) -> int:
return sum(
x.size
for chunk in self._data.chunks
for x in chunk.buffers()
if x is not None
)

def isna(self):
nas = pd.isna(self._data.to_pandas())
return type(self)._from_sequence(nas)

def take(self, indices, allow_fill=False, fill_value=None):
data = self._data.to_pandas()
data = extract_array(data, extract_numpy=True)
@@ -165,9 +148,6 @@ def take(self, indices, allow_fill=False, fill_value=None):
result = take(data, indices, fill_value=fill_value, allow_fill=allow_fill)
return self._from_sequence(result, dtype=self.dtype)

def copy(self):
return type(self)(copy.copy(self._data))

@classmethod
def _concat_same_type(cls, to_concat):
chunks = list(itertools.chain.from_iterable(x._data.chunks for x in to_concat))
9 changes: 0 additions & 9 deletions pandas/tests/extension/arrow/test_bool.py
@@ -62,11 +62,6 @@ def test_contains(self, data, data_missing):


class TestConstructors(BaseArrowTests, base.BaseConstructorsTests):
# seems like some bug in isna on empty BoolArray returning floats.
@pytest.mark.xfail(reason="bad is-na for empty data")
def test_from_sequence_from_cls(self, data):
super().test_from_sequence_from_cls(data)

@pytest.mark.xfail(reason="pa.NULL is not recognised as scalar, GH-33899")
def test_series_constructor_no_data_with_index(self, dtype, na_value):
# pyarrow.lib.ArrowInvalid: only handle 1-dimensional arrays
@@ -77,10 +72,6 @@ def test_series_constructor_scalar_na_with_index(self, dtype, na_value):
# pyarrow.lib.ArrowInvalid: only handle 1-dimensional arrays
super().test_series_constructor_scalar_na_with_index(dtype, na_value)

@pytest.mark.xfail(reason="ufunc 'invert' not supported for the input types")
def test_construct_empty_dataframe(self, dtype):
super().test_construct_empty_dataframe(dtype)

@pytest.mark.xfail(reason="_from_sequence ignores dtype keyword")
def test_empty(self, dtype):
super().test_empty(dtype)
