Skip to content

Commit

Permalink
BUG: PandasArray._quantile when empty (pandas-dev#46110)
Browse files Browse the repository at this point in the history
  • Loading branch information
jbrockmendel authored Feb 26, 2022
1 parent c3abb52 commit 3f52f4e
Show file tree
Hide file tree
Showing 9 changed files with 59 additions and 7 deletions.
8 changes: 5 additions & 3 deletions pandas/core/array_algos/quantile.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@

import numpy as np

from pandas._libs import lib
from pandas._typing import (
ArrayLike,
Scalar,
Expand Down Expand Up @@ -128,7 +127,10 @@ def _nanpercentile_1d(
values = values[~mask]

if len(values) == 0:
return np.array([na_value] * len(qs), dtype=values.dtype)
# Can't pass dtype=values.dtype here because we might have na_value=np.nan
# with values.dtype=int64; see test_quantile_empty
# equiv: 'np.array([na_value] * len(qs))' but much faster
return np.full(len(qs), na_value)

return np.percentile(values, qs, **{np_percentile_argname: interpolation})

Expand Down Expand Up @@ -173,7 +175,7 @@ def _nanpercentile(
# have float result at this point, not i8
return result.astype(values.dtype)

if not lib.is_scalar(mask) and mask.any():
if mask.any():
# Caller is responsible for ensuring mask shape match
assert mask.shape == values.shape
result = [
Expand Down
21 changes: 17 additions & 4 deletions pandas/core/arrays/_mixins.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,12 @@ class NDArrayBackedExtensionArray(NDArrayBacked, ExtensionArray):

_ndarray: np.ndarray

# scalar used to denote NA value inside our self._ndarray, e.g. -1
# for Categorical, iNaT for Period. Outside of object dtype,
# self.isna() should be exactly the locations in self._ndarray that hold
# _internal_fill_value.
_internal_fill_value: Any

def _box_func(self, x):
"""
Wrap numpy type in our dtype.type if necessary.
Expand Down Expand Up @@ -463,18 +469,25 @@ def _quantile(
mask = np.atleast_2d(mask)

arr = np.atleast_2d(self._ndarray)
# TODO: something NDArrayBacked-specific instead of _values_for_factorize[1]?
fill_value = self._values_for_factorize()[1]
fill_value = self._internal_fill_value

res_values = quantile_with_mask(arr, mask, fill_value, qs, interpolation)

result = type(self)._from_factorized(res_values, self)
res_values = self._cast_quantile_result(res_values)
result = self._from_backing_data(res_values)
if self.ndim == 1:
assert result.shape == (1, len(qs)), result.shape
result = result[0]

return result

# TODO: see if we can share this with other dispatch-wrapping methods
def _cast_quantile_result(self, res_values: np.ndarray) -> np.ndarray:
"""
Cast the result of quantile_with_mask to an appropriate dtype
to pass to _from_backing_data in _quantile.
"""
return res_values

# ------------------------------------------------------------------------
# numpy-like methods

Expand Down
6 changes: 6 additions & 0 deletions pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -356,6 +356,7 @@ class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMi
# For comparisons, so that numpy uses our implementation of the compare
# ops, which raise
__array_priority__ = 1000
_internal_fill_value = -1
# tolist is not actually deprecated, just suppressed in the __dir__
_hidden_attrs = PandasObject._hidden_attrs | frozenset(["tolist"])
_typ = "categorical"
Expand Down Expand Up @@ -2316,6 +2317,11 @@ def _from_factorized(cls, uniques, original):
original.categories.take(uniques), dtype=original.dtype
)

def _cast_quantile_result(self, res_values: np.ndarray) -> np.ndarray:
# make sure we have correct itemsize for resulting codes
res_values = coerce_indexer_dtype(res_values, self.dtype.categories)
return res_values

def equals(self, other: object) -> bool:
"""
Returns True if categorical arrays are equal.
Expand Down
1 change: 1 addition & 0 deletions pandas/core/arrays/datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,7 @@ class DatetimeArray(dtl.TimelikeOps, dtl.DatelikeOps):

_typ = "datetimearray"
_scalar_type = Timestamp
_internal_fill_value = np.datetime64("NaT", "ns")
_recognized_scalars = (datetime, np.datetime64)
_is_recognized_dtype = is_datetime64_any_dtype
_infer_matches = ("datetime", "datetime64", "date")
Expand Down
1 change: 1 addition & 0 deletions pandas/core/arrays/numpy_.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ class PandasArray(
__array_priority__ = 1000
_ndarray: np.ndarray
_dtype: PandasDtype
_internal_fill_value = np.nan

# ------------------------------------------------------------------------
# Constructors
Expand Down
7 changes: 7 additions & 0 deletions pandas/core/arrays/period.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,7 @@ class PeriodArray(dtl.DatelikeOps):
__array_priority__ = 1000
_typ = "periodarray" # ABCPeriodArray
_scalar_type = Period
_internal_fill_value = np.int64(iNaT)
_recognized_scalars = (Period,)
_is_recognized_dtype = is_period_dtype
_infer_matches = ("period",)
Expand Down Expand Up @@ -697,6 +698,12 @@ def fillna(self, value=None, method=None, limit=None) -> PeriodArray:
return result.view(self.dtype) # type: ignore[return-value]
return super().fillna(value=value, method=method, limit=limit)

# TODO: alternately could override _quantile like searchsorted
def _cast_quantile_result(self, res_values: np.ndarray) -> np.ndarray:
# quantile_with_mask may return float64 instead of int64, in which
# case we need to cast back
return res_values.astype(np.int64, copy=False)

# ------------------------------------------------------------------
# Arithmetic Methods

Expand Down
1 change: 1 addition & 0 deletions pandas/core/arrays/timedeltas.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,7 @@ class TimedeltaArray(dtl.TimelikeOps):

_typ = "timedeltaarray"
_scalar_type = Timedelta
_internal_fill_value = np.timedelta64("NaT", "ns")
_recognized_scalars = (timedelta, np.timedelta64, Tick)
_is_recognized_dtype = is_timedelta64_dtype
_infer_matches = ("timedelta", "timedelta64")
Expand Down
10 changes: 10 additions & 0 deletions pandas/tests/arrays/categorical/test_analytics.py
Original file line number Diff line number Diff line change
Expand Up @@ -363,3 +363,13 @@ def test_validate_inplace_raises(self, value):

with pytest.raises(ValueError, match=msg):
cat.sort_values(inplace=value)

def test_quantile_empty(self):
    """
    _quantile on an empty Categorical keeps the original codes itemsize.
    """
    full = Categorical(["A", "B"])
    empty = full[:0]
    qs = Index([0.0, 0.5])

    res = empty._quantile(qs, interpolation="linear")

    # make sure we have correct itemsize on resulting codes
    assert res._codes.dtype == np.int8

    # both requested quantiles should come back as missing values
    all_na = full.take([-1, -1], allow_fill=True)
    tm.assert_extension_array_equal(res, all_na)
11 changes: 11 additions & 0 deletions pandas/tests/arrays/numpy_/test_numpy.py
Original file line number Diff line number Diff line change
Expand Up @@ -298,3 +298,14 @@ def test_setitem_preserves_views():
arr[-1] = 2.5
view1[-1] = 5
assert arr[-1] == 5


@pytest.mark.parametrize("dtype", [np.int64, np.uint64])
def test_quantile_empty(dtype):
    """
    _quantile on an empty integer-dtype PandasArray returns NaNs.
    """
    qs = pd.Index([0.0, 0.5])
    empty = PandasArray(np.array([], dtype=dtype))

    # we should get back np.nans, not -1s
    res = empty._quantile(qs, interpolation="linear")

    all_nan = PandasArray(np.array([np.nan, np.nan]))
    tm.assert_extension_array_equal(res, all_nan)

0 comments on commit 3f52f4e

Please sign in to comment.