diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index e47751245cb65..a6fb2fcce430a 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -295,6 +295,7 @@ Performance improvements - Performance improvement in :meth:`MultiIndex.get_locs` (:issue:`45681`, :issue:`46040`) - Performance improvement in :func:`merge` when left and/or right are empty (:issue:`45838`) - Performance improvement in :meth:`DataFrame.join` when left and/or right are empty (:issue:`46015`) +- Performance improvement in :func:`factorize` (:issue:`46109`) - Performance improvement in :class:`DataFrame` and :class:`Series` constructors for extension dtype scalars (:issue:`45854`) - diff --git a/pandas/_libs/groupby.pyi b/pandas/_libs/groupby.pyi index 8048f40ecf330..197a8bdc0cd7c 100644 --- a/pandas/_libs/groupby.pyi +++ b/pandas/_libs/groupby.pyi @@ -56,7 +56,7 @@ def group_add( values: np.ndarray, # ndarray[complexfloating_t, ndim=2] labels: np.ndarray, # const intp_t[:] min_count: int = ..., - datetimelike: bool = ..., + is_datetimelike: bool = ..., ) -> None: ... def group_prod( out: np.ndarray, # floating[:, ::1] @@ -104,6 +104,8 @@ def group_last( counts: np.ndarray, # int64_t[::1] values: np.ndarray, # ndarray[rank_t, ndim=2] labels: np.ndarray, # const int64_t[:] + mask: npt.NDArray[np.bool_] | None, + result_mask: npt.NDArray[np.bool_] | None, min_count: int = ..., # Py_ssize_t ) -> None: ... def group_nth( diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 0c3c57d40d42d..d250a69c1985b 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -104,11 +104,13 @@ cdef inline float64_t median_linear(float64_t* a, int n) nogil: @cython.boundscheck(False) @cython.wraparound(False) -def group_median_float64(ndarray[float64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float64_t, ndim=2] values, - ndarray[intp_t] labels, - Py_ssize_t min_count=-1) -> None: +def group_median_float64( + ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[intp_t] labels, + Py_ssize_t min_count=-1, +) -> None: """ Only aggregates on axis=0 """ @@ -145,12 +147,14 @@ def group_median_float64(ndarray[float64_t, ndim=2] out, @cython.boundscheck(False) @cython.wraparound(False) -def group_cumprod_float64(float64_t[:, ::1] out, - const float64_t[:, :] values, - const intp_t[::1] labels, - int ngroups, - bint is_datetimelike, - bint skipna=True) -> None: +def group_cumprod_float64( + float64_t[:, ::1] out, + const float64_t[:, :] values, + const intp_t[::1] labels, + int ngroups, + bint is_datetimelike, + bint skipna=True, +) -> None: """ Cumulative product of columns of `values`, in row groups `labels`. @@ -202,12 +206,14 @@ def group_cumprod_float64(float64_t[:, ::1] out, @cython.boundscheck(False) @cython.wraparound(False) -def group_cumsum(numeric_t[:, ::1] out, - ndarray[numeric_t, ndim=2] values, - const intp_t[::1] labels, - int ngroups, - is_datetimelike, - bint skipna=True) -> None: +def group_cumsum( + numeric_t[:, ::1] out, + ndarray[numeric_t, ndim=2] values, + const intp_t[::1] labels, + int ngroups, + is_datetimelike, + bint skipna=True, +) -> None: """ Cumulative sum of columns of `values`, in row groups `labels`. 
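The two hunks above only reflow the `group_cumprod_float64` and `group_cumsum` signatures, so for orientation here is a minimal NumPy sketch of the semantics the `group_cumsum` docstring describes: per-column cumulative sums that restart for each row group in `labels`. The helper name `naive_group_cumsum` and the NaN fill for unlabeled rows are illustrative assumptions, not the Cython kernel's exact behavior.

import numpy as np

def naive_group_cumsum(values: np.ndarray, labels: np.ndarray, ngroups: int) -> np.ndarray:
    # values: (N, K) floats; labels: (N,) group ids, with -1 meaning "no group"
    out = np.empty_like(values, dtype=np.float64)
    accum = np.zeros((ngroups, values.shape[1]), dtype=np.float64)
    for i, lab in enumerate(labels):
        if lab < 0:
            out[i] = np.nan  # assumption: unlabeled rows come back as NaN
            continue
        accum[lab] += values[i]  # running per-group, per-column total
        out[i] = accum[lab]
    return out

vals = np.array([[1.0], [2.0], [3.0], [4.0]])
labs = np.array([0, 1, 0, 1])
print(naive_group_cumsum(vals, labs, ngroups=2))  # [[1.] [2.] [4.] [6.]]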
@@ -271,8 +277,12 @@ def group_cumsum(numeric_t[:, ::1] out, @cython.boundscheck(False) @cython.wraparound(False) -def group_shift_indexer(int64_t[::1] out, const intp_t[::1] labels, - int ngroups, int periods) -> None: +def group_shift_indexer( + int64_t[::1] out, + const intp_t[::1] labels, + int ngroups, + int periods, +) -> None: cdef: Py_ssize_t N, i, j, ii, lab int offset = 0, sign @@ -323,10 +333,15 @@ def group_shift_indexer(int64_t[::1] out, const intp_t[::1] labels, @cython.wraparound(False) @cython.boundscheck(False) -def group_fillna_indexer(ndarray[intp_t] out, ndarray[intp_t] labels, - ndarray[intp_t] sorted_labels, - ndarray[uint8_t] mask, str direction, - int64_t limit, bint dropna) -> None: +def group_fillna_indexer( + ndarray[intp_t] out, + ndarray[intp_t] labels, + ndarray[intp_t] sorted_labels, + ndarray[uint8_t] mask, + str direction, + int64_t limit, + bint dropna, +) -> None: """ Indexes how to fill values forwards or backwards within a group. @@ -388,13 +403,15 @@ def group_fillna_indexer(ndarray[intp_t] out, ndarray[intp_t] labels, @cython.boundscheck(False) @cython.wraparound(False) -def group_any_all(int8_t[:, ::1] out, - const int8_t[:, :] values, - const intp_t[::1] labels, - const uint8_t[:, :] mask, - str val_test, - bint skipna, - bint nullable) -> None: +def group_any_all( + int8_t[:, ::1] out, + const int8_t[:, :] values, + const intp_t[::1] labels, + const uint8_t[:, :] mask, + str val_test, + bint skipna, + bint nullable, +) -> None: """ Aggregated boolean values to show truthfulness of group elements. If the input is a nullable type (nullable=True), the result will be computed @@ -488,12 +505,14 @@ ctypedef fused add_t: @cython.wraparound(False) @cython.boundscheck(False) -def group_add(add_t[:, ::1] out, - int64_t[::1] counts, - ndarray[add_t, ndim=2] values, - const intp_t[::1] labels, - Py_ssize_t min_count=0, - bint datetimelike=False) -> None: +def group_add( + add_t[:, ::1] out, + int64_t[::1] counts, + ndarray[add_t, ndim=2] values, + const intp_t[::1] labels, + Py_ssize_t min_count=0, + bint is_datetimelike=False, +) -> None: """ Only aggregates on axis=0 using Kahan summation """ @@ -560,7 +579,7 @@ def group_add(add_t[:, ::1] out, # is otherwise the same as in _treat_as_na if val == val and not ( add_t is float64_t - and datetimelike + and is_datetimelike and val == NPY_NAT ): nobs[lab, j] += 1 @@ -579,11 +598,13 @@ def group_add(add_t[:, ::1] out, @cython.wraparound(False) @cython.boundscheck(False) -def group_prod(floating[:, ::1] out, - int64_t[::1] counts, - ndarray[floating, ndim=2] values, - const intp_t[::1] labels, - Py_ssize_t min_count=0) -> None: +def group_prod( + floating[:, ::1] out, + int64_t[::1] counts, + ndarray[floating, ndim=2] values, + const intp_t[::1] labels, + Py_ssize_t min_count=0, +) -> None: """ Only aggregates on axis=0 """ @@ -628,12 +649,14 @@ def group_prod(floating[:, ::1] out, @cython.wraparound(False) @cython.boundscheck(False) @cython.cdivision(True) -def group_var(floating[:, ::1] out, - int64_t[::1] counts, - ndarray[floating, ndim=2] values, - const intp_t[::1] labels, - Py_ssize_t min_count=-1, - int64_t ddof=1) -> None: +def group_var( + floating[:, ::1] out, + int64_t[::1] counts, + ndarray[floating, ndim=2] values, + const intp_t[::1] labels, + Py_ssize_t min_count=-1, + int64_t ddof=1, +) -> None: cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) floating val, ct, oldmean @@ -682,15 +705,16 @@ def group_var(floating[:, ::1] out, @cython.wraparound(False) @cython.boundscheck(False) -def 
group_mean(mean_t[:, ::1] out, - int64_t[::1] counts, - ndarray[mean_t, ndim=2] values, - const intp_t[::1] labels, - Py_ssize_t min_count=-1, - bint is_datetimelike=False, - const uint8_t[:, ::1] mask=None, - uint8_t[:, ::1] result_mask=None - ) -> None: +def group_mean( + mean_t[:, ::1] out, + int64_t[::1] counts, + ndarray[mean_t, ndim=2] values, + const intp_t[::1] labels, + Py_ssize_t min_count=-1, + bint is_datetimelike=False, + const uint8_t[:, ::1] mask=None, + uint8_t[:, ::1] result_mask=None, +) -> None: """ Compute the mean per label given a label assignment for each value. NaN values are ignored. @@ -770,11 +794,13 @@ def group_mean(mean_t[:, ::1] out, @cython.wraparound(False) @cython.boundscheck(False) -def group_ohlc(floating[:, ::1] out, - int64_t[::1] counts, - ndarray[floating, ndim=2] values, - const intp_t[::1] labels, - Py_ssize_t min_count=-1) -> None: +def group_ohlc( + floating[:, ::1] out, + int64_t[::1] counts, + ndarray[floating, ndim=2] values, + const intp_t[::1] labels, + Py_ssize_t min_count=-1, +) -> None: """ Only aggregates on axis=0 """ @@ -817,13 +843,15 @@ def group_ohlc(floating[:, ::1] out, @cython.boundscheck(False) @cython.wraparound(False) -def group_quantile(ndarray[float64_t, ndim=2] out, - ndarray[numeric_t, ndim=1] values, - ndarray[intp_t] labels, - ndarray[uint8_t] mask, - const intp_t[:] sort_indexer, - const float64_t[:] qs, - str interpolation) -> None: +def group_quantile( + ndarray[float64_t, ndim=2] out, + ndarray[numeric_t, ndim=1] values, + ndarray[intp_t] labels, + ndarray[uint8_t] mask, + const intp_t[:] sort_indexer, + const float64_t[:] qs, + str interpolation, +) -> None: """ Calculate the quantile per group. @@ -949,17 +977,19 @@ cdef inline bint _treat_as_na(iu_64_floating_obj_t val, bint is_datetimelike) no return val != val -# GH#31710 use memorviews once cython 0.30 is released so we can +# TODO(cython3): GH#31710 use memoryviews once Cython 3.0 is released so we can # use `const iu_64_floating_obj_t[:, :] values` @cython.wraparound(False) @cython.boundscheck(False) -def group_last(iu_64_floating_obj_t[:, ::1] out, - int64_t[::1] counts, - ndarray[iu_64_floating_obj_t, ndim=2] values, - const intp_t[::1] labels, - const uint8_t[:, :] mask, - uint8_t[:, ::1] result_mask=None, - Py_ssize_t min_count=-1) -> None: +def group_last( + iu_64_floating_obj_t[:, ::1] out, + int64_t[::1] counts, + ndarray[iu_64_floating_obj_t, ndim=2] values, + const intp_t[::1] labels, + const uint8_t[:, :] mask, + uint8_t[:, ::1] result_mask=None, + Py_ssize_t min_count=-1, +) -> None: """ Only aggregates on axis=0 """ @@ -1058,19 +1088,20 @@ def group_last(iu_64_floating_obj_t[:, ::1] out, raise RuntimeError("empty group with uint64_t") -# GH#31710 use memorviews once cython 0.30 is released so we can +# TODO(cython3): GH#31710 use memoryviews once Cython 3.0 is released so we can # use `const iu_64_floating_obj_t[:, :] values` @cython.wraparound(False) @cython.boundscheck(False) -def group_nth(iu_64_floating_obj_t[:, ::1] out, - int64_t[::1] counts, - ndarray[iu_64_floating_obj_t, ndim=2] values, - const intp_t[::1] labels, - const uint8_t[:, :] mask, - uint8_t[:, ::1] result_mask=None, - int64_t min_count=-1, - int64_t rank=1, - ) -> None: +def group_nth( + iu_64_floating_obj_t[:, ::1] out, + int64_t[::1] counts, + ndarray[iu_64_floating_obj_t, ndim=2] values, + const intp_t[::1] labels, + const uint8_t[:, :] mask, + uint8_t[:, ::1] result_mask=None, + int64_t min_count=-1, + int64_t rank=1, +) -> None: """ Only aggregates on axis=0 """ @@ 
-1173,12 +1204,17 @@ def group_nth(iu_64_floating_obj_t[:, ::1] out, @cython.boundscheck(False) @cython.wraparound(False) -def group_rank(float64_t[:, ::1] out, - ndarray[iu_64_floating_obj_t, ndim=2] values, - const intp_t[::1] labels, - int ngroups, - bint is_datetimelike, str ties_method="average", - bint ascending=True, bint pct=False, str na_option="keep") -> None: +def group_rank( + float64_t[:, ::1] out, + ndarray[iu_64_floating_obj_t, ndim=2] values, + const intp_t[::1] labels, + int ngroups, + bint is_datetimelike, + str ties_method="average", + bint ascending=True, + bint pct=False, + str na_option="keep", +) -> None: """ Provides the rank of values within each group. @@ -1244,15 +1280,17 @@ def group_rank(float64_t[:, ::1] out, @cython.wraparound(False) @cython.boundscheck(False) -cdef group_min_max(iu_64_floating_t[:, ::1] out, - int64_t[::1] counts, - ndarray[iu_64_floating_t, ndim=2] values, - const intp_t[::1] labels, - Py_ssize_t min_count=-1, - bint is_datetimelike=False, - bint compute_max=True, - const uint8_t[:, ::1] mask=None, - uint8_t[:, ::1] result_mask=None): +cdef group_min_max( + iu_64_floating_t[:, ::1] out, + int64_t[::1] counts, + ndarray[iu_64_floating_t, ndim=2] values, + const intp_t[::1] labels, + Py_ssize_t min_count=-1, + bint is_datetimelike=False, + bint compute_max=True, + const uint8_t[:, ::1] mask=None, + uint8_t[:, ::1] result_mask=None, +): """ Compute minimum/maximum of columns of `values`, in row groups `labels`. @@ -1366,14 +1404,16 @@ cdef group_min_max(iu_64_floating_t[:, ::1] out, @cython.wraparound(False) @cython.boundscheck(False) -def group_max(iu_64_floating_t[:, ::1] out, - int64_t[::1] counts, - ndarray[iu_64_floating_t, ndim=2] values, - const intp_t[::1] labels, - Py_ssize_t min_count=-1, - bint is_datetimelike=False, - const uint8_t[:, ::1] mask=None, - uint8_t[:, ::1] result_mask=None) -> None: +def group_max( + iu_64_floating_t[:, ::1] out, + int64_t[::1] counts, + ndarray[iu_64_floating_t, ndim=2] values, + const intp_t[::1] labels, + Py_ssize_t min_count=-1, + bint is_datetimelike=False, + const uint8_t[:, ::1] mask=None, + uint8_t[:, ::1] result_mask=None, +) -> None: """See group_min_max.__doc__""" group_min_max( out, @@ -1390,14 +1430,16 @@ def group_max(iu_64_floating_t[:, ::1] out, @cython.wraparound(False) @cython.boundscheck(False) -def group_min(iu_64_floating_t[:, ::1] out, - int64_t[::1] counts, - ndarray[iu_64_floating_t, ndim=2] values, - const intp_t[::1] labels, - Py_ssize_t min_count=-1, - bint is_datetimelike=False, - const uint8_t[:, ::1] mask=None, - uint8_t[:, ::1] result_mask=None) -> None: +def group_min( + iu_64_floating_t[:, ::1] out, + int64_t[::1] counts, + ndarray[iu_64_floating_t, ndim=2] values, + const intp_t[::1] labels, + Py_ssize_t min_count=-1, + bint is_datetimelike=False, + const uint8_t[:, ::1] mask=None, + uint8_t[:, ::1] result_mask=None, +) -> None: """See group_min_max.__doc__""" group_min_max( out, @@ -1414,14 +1456,16 @@ def group_min(iu_64_floating_t[:, ::1] out, @cython.boundscheck(False) @cython.wraparound(False) -cdef group_cummin_max(iu_64_floating_t[:, ::1] out, - ndarray[iu_64_floating_t, ndim=2] values, - uint8_t[:, ::1] mask, - const intp_t[::1] labels, - int ngroups, - bint is_datetimelike, - bint skipna, - bint compute_max): +cdef group_cummin_max( + iu_64_floating_t[:, ::1] out, + ndarray[iu_64_floating_t, ndim=2] values, + uint8_t[:, ::1] mask, + const intp_t[::1] labels, + int ngroups, + bint is_datetimelike, + bint skipna, + bint compute_max, +): """ Cumulative 
minimum/maximum of columns of `values`, in row groups `labels`. @@ -1527,13 +1571,15 @@ cdef group_cummin_max(iu_64_floating_t[:, ::1] out, @cython.boundscheck(False) @cython.wraparound(False) -def group_cummin(iu_64_floating_t[:, ::1] out, - ndarray[iu_64_floating_t, ndim=2] values, - const intp_t[::1] labels, - int ngroups, - bint is_datetimelike, - uint8_t[:, ::1] mask=None, - bint skipna=True) -> None: +def group_cummin( + iu_64_floating_t[:, ::1] out, + ndarray[iu_64_floating_t, ndim=2] values, + const intp_t[::1] labels, + int ngroups, + bint is_datetimelike, + uint8_t[:, ::1] mask=None, + bint skipna=True, +) -> None: """See group_cummin_max.__doc__""" group_cummin_max( out, @@ -1549,13 +1595,15 @@ def group_cummin(iu_64_floating_t[:, ::1] out, @cython.boundscheck(False) @cython.wraparound(False) -def group_cummax(iu_64_floating_t[:, ::1] out, - ndarray[iu_64_floating_t, ndim=2] values, - const intp_t[::1] labels, - int ngroups, - bint is_datetimelike, - uint8_t[:, ::1] mask=None, - bint skipna=True) -> None: +def group_cummax( + iu_64_floating_t[:, ::1] out, + ndarray[iu_64_floating_t, ndim=2] values, + const intp_t[::1] labels, + int ngroups, + bint is_datetimelike, + uint8_t[:, ::1] mask=None, + bint skipna=True, +) -> None: """See group_cummin_max.__doc__""" group_cummin_max( out, diff --git a/pandas/_libs/missing.pyx b/pandas/_libs/missing.pyx index 62977f0fd2b4c..e65196c9105d4 100644 --- a/pandas/_libs/missing.pyx +++ b/pandas/_libs/missing.pyx @@ -312,7 +312,7 @@ def _create_binary_propagating_op(name, is_divmod=False): def method(self, other): if (other is C_NA or isinstance(other, str) or isinstance(other, (numbers.Number, np.bool_)) - or isinstance(other, np.ndarray) and not other.shape): + or util.is_array(other) and not other.shape): # Need the other.shape clause to handle NumPy scalars, # since we do a setitem on `out` below, which # won't work for NumPy scalars. 
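The `other.shape` clause that this hunk preserves is easy to misread, so here is a self-contained sketch of the NumPy behavior the comment above refers to: 0-d arrays report `shape == ()`, which is falsy, and they reject the sliced setitem (`out[:] = NA`) that the array branch performs. This is illustration only and does not use pandas.

import numpy as np

zero_d = np.array(5)      # 0-d ndarray: shape == ()
one_d = np.array([5, 6])  # 1-d ndarray: shape == (2,)

print(bool(zero_d.shape), bool(one_d.shape))  # False True

out = np.empty(one_d.shape, dtype=object)
out[:] = None  # sliced setitem works for ndim >= 1
try:
    np.empty(zero_d.shape, dtype=object)[:] = None
except IndexError as exc:
    print("0-d setitem fails:", exc)  # 0-d arrays cannot be sliced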
@@ -321,7 +321,7 @@ def _create_binary_propagating_op(name, is_divmod=False): else: return NA - elif isinstance(other, np.ndarray): + elif util.is_array(other): out = np.empty(other.shape, dtype=object) out[:] = NA @@ -433,7 +433,7 @@ class NAType(C_NAType): return type(other)(1) else: return NA - elif isinstance(other, np.ndarray): + elif util.is_array(other): return np.where(other == 0, other.dtype.type(1), NA) return NotImplemented @@ -446,7 +446,7 @@ class NAType(C_NAType): return other else: return NA - elif isinstance(other, np.ndarray): + elif util.is_array(other): return np.where(other == 1, other, NA) return NotImplemented diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index b4d2c60837a7e..59a86751964e6 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -1313,7 +1313,7 @@ cdef class TextReader: # Factor out code common to TextReader.__dealloc__ and TextReader.close # It cannot be a class method, since calling self.close() in __dealloc__ -# which causes a class attribute lookup and violates best parctices +# which causes a class attribute lookup and violates best practices # https://cython.readthedocs.io/en/latest/src/userguide/special_methods.html#finalization-method-dealloc cdef _close(TextReader reader): # also preemptively free all allocated memory diff --git a/pandas/_libs/tslibs/vectorized.pyx b/pandas/_libs/tslibs/vectorized.pyx index 02bdae3a8dbac..63cf7d2ce23ee 100644 --- a/pandas/_libs/tslibs/vectorized.pyx +++ b/pandas/_libs/tslibs/vectorized.pyx @@ -85,7 +85,7 @@ cdef inline object create_time_from_ts( @cython.wraparound(False) @cython.boundscheck(False) def ints_to_pydatetime( - const int64_t[:] arr, + const int64_t[:] stamps, tzinfo tz=None, object freq=None, bint fold=False, @@ -96,7 +96,7 @@ def ints_to_pydatetime( Parameters ---------- - arr : array of i8 + stamps : array of i8 tz : str, optional convert to this timezone freq : str/Offset, optional @@ -119,14 +119,14 @@ def ints_to_pydatetime( ndarray[object] of type specified by box """ cdef: - Py_ssize_t i, n = len(arr) + Py_ssize_t i, n = len(stamps) ndarray[int64_t] trans int64_t[:] deltas intp_t[:] pos npy_datetimestruct dts object dt, new_tz str typ - int64_t value, local_value, delta = NPY_NAT # dummy for delta + int64_t value, local_val, delta = NPY_NAT # dummy for delta ndarray[object] result = np.empty(n, dtype=object) object (*func_create)(int64_t, npy_datetimestruct, tzinfo, object, bint) bint use_utc = False, use_tzlocal = False, use_fixed = False @@ -161,34 +161,34 @@ def ints_to_pydatetime( use_fixed = True delta = deltas[0] else: - pos = trans.searchsorted(arr, side="right") - 1 + pos = trans.searchsorted(stamps, side="right") - 1 use_pytz = typ == "pytz" for i in range(n): new_tz = tz - value = arr[i] + value = stamps[i] if value == NPY_NAT: result[i] = NaT else: if use_utc: - local_value = value + local_val = value elif use_tzlocal: - local_value = tz_convert_utc_to_tzlocal(value, tz) + local_val = tz_convert_utc_to_tzlocal(value, tz) elif use_fixed: - local_value = value + delta + local_val = value + delta elif not use_pytz: # i.e. dateutil # no zone-name change for dateutil tzs - dst etc # represented in single object. 
- local_value = value + deltas[pos[i]] + local_val = value + deltas[pos[i]] else: # pytz # find right representation of dst etc in pytz timezone new_tz = tz._tzinfos[tz._transition_info[pos[i]]] - local_value = value + deltas[pos[i]] + local_val = value + deltas[pos[i]] - dt64_to_dtstruct(local_value, &dts) + dt64_to_dtstruct(local_val, &dts) result[i] = func_create(value, dts, new_tz, freq, fold) return result diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index b2e2299c0f2b6..8c838d8944d14 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -224,14 +224,10 @@ def _reconstruct_data( if not isinstance(dtype, np.dtype): # i.e. ExtensionDtype cls = dtype.construct_array_type() - if isinstance(values, cls) and values.dtype == dtype: - return values values = cls._from_sequence(values, dtype=dtype) - elif is_bool_dtype(dtype): - values = values.astype(dtype, copy=False) - elif dtype is not None: + else: if is_datetime64_dtype(dtype): dtype = np.dtype("datetime64[ns]") elif is_timedelta64_dtype(dtype): @@ -858,6 +854,7 @@ def value_counts( counts = result._values else: + values = _ensure_arraylike(values) keys, counts = value_counts_arraylike(values, dropna) # For backwards compatibility, we let Index do its normal type @@ -878,19 +875,18 @@ def value_counts( # Called once from SparseArray, otherwise could be private -def value_counts_arraylike(values, dropna: bool): +def value_counts_arraylike(values: np.ndarray, dropna: bool): """ Parameters ---------- - values : arraylike + values : np.ndarray dropna : bool Returns ------- - uniques : np.ndarray or ExtensionArray - counts : np.ndarray + uniques : np.ndarray + counts : np.ndarray[np.int64] """ - values = _ensure_arraylike(values) original = values values = _ensure_data(values) @@ -900,8 +896,8 @@ def value_counts_arraylike(values, dropna: bool): # datetime, timedelta, or period if dropna: - msk = keys != iNaT - keys, counts = keys[msk], counts[msk] + mask = keys != iNaT + keys, counts = keys[mask], counts[mask] res_keys = _reconstruct_data(keys, original.dtype, original) return res_keys, counts @@ -1029,7 +1025,7 @@ def rank( def checked_add_with_arr( - arr: np.ndarray, + arr: npt.NDArray[np.int64], b, arr_mask: npt.NDArray[np.bool_] | None = None, b_mask: npt.NDArray[np.bool_] | None = None, @@ -1044,7 +1040,7 @@ def checked_add_with_arr( Parameters ---------- - arr : array addend. + arr : np.ndarray[int64] addend. b : array or scalar addend. 
arr_mask : np.ndarray[bool] or None, default None array indicating which elements to exclude from checking diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 238f1382890c9..74a225bb18fd8 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -595,7 +595,7 @@ def normalize_dictlike_arg( for k, v in func.items(): if not is_aggregator(v): # mypy can't realize v is not a list here - new_func[k] = [v] # type:ignore[list-item] + new_func[k] = [v] # type: ignore[list-item] else: new_func[k] = v func = new_func diff --git a/pandas/core/array_algos/quantile.py b/pandas/core/array_algos/quantile.py index 6bfc2b63448ae..12c6691fe6c63 100644 --- a/pandas/core/array_algos/quantile.py +++ b/pandas/core/array_algos/quantile.py @@ -103,7 +103,7 @@ def _nanpercentile_1d( mask: npt.NDArray[np.bool_], qs: npt.NDArray[np.float64], na_value: Scalar, - interpolation, + interpolation: str, ) -> Scalar | np.ndarray: """ Wrapper for np.percentile that skips missing values, specialized to @@ -132,7 +132,14 @@ def _nanpercentile_1d( # equiv: 'np.array([na_value] * len(qs))' but much faster return np.full(len(qs), na_value) - return np.percentile(values, qs, **{np_percentile_argname: interpolation}) + return np.percentile( + values, + qs, + # error: No overload variant of "percentile" matches argument types + # "ndarray[Any, Any]", "ndarray[Any, dtype[floating[_64Bit]]]", + # "int", "Dict[str, str]" + **{np_percentile_argname: interpolation}, # type: ignore[call-overload] + ) def _nanpercentile( @@ -141,7 +148,7 @@ def _nanpercentile( *, na_value, mask: npt.NDArray[np.bool_], - interpolation, + interpolation: str, ): """ Wrapper for np.percentile that skips missing values. @@ -186,5 +193,11 @@ def _nanpercentile( return result else: return np.percentile( - values, qs, axis=1, **{np_percentile_argname: interpolation} + values, + qs, + axis=1, + # error: No overload variant of "percentile" matches argument types + # "ndarray[Any, Any]", "ndarray[Any, dtype[floating[_64Bit]]]", + # "int", "Dict[str, str]" + **{np_percentile_argname: interpolation}, # type: ignore[call-overload] ) diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py index 9fe8c6f78f875..cc58d7ddd03dc 100644 --- a/pandas/core/arrays/_mixins.py +++ b/pandas/core/arrays/_mixins.py @@ -186,7 +186,7 @@ def _values_for_argsort(self) -> np.ndarray: return self._ndarray # Signature of "argmin" incompatible with supertype "ExtensionArray" - def argmin(self, axis: int = 0, skipna: bool = True): # type:ignore[override] + def argmin(self, axis: int = 0, skipna: bool = True): # type: ignore[override] # override base class by adding axis keyword validate_bool_kwarg(skipna, "skipna") if not skipna and self._hasna: @@ -194,7 +194,7 @@ def argmin(self, axis: int = 0, skipna: bool = True): # type:ignore[override] return nargminmax(self, "argmin", axis=axis) # Signature of "argmax" incompatible with supertype "ExtensionArray" - def argmax(self, axis: int = 0, skipna: bool = True): # type:ignore[override] + def argmax(self, axis: int = 0, skipna: bool = True): # type: ignore[override] # override base class by adding axis keyword validate_bool_kwarg(skipna, "skipna") if not skipna and self._hasna: @@ -229,12 +229,16 @@ def searchsorted( side: Literal["left", "right"] = "left", sorter: NumpySorter = None, ) -> npt.NDArray[np.intp] | np.intp: + # TODO(2.0): use _validate_setitem_value once dt64tz mismatched-timezone + # deprecation is enforced npvalue = self._validate_searchsorted_value(value) return 
self._ndarray.searchsorted(npvalue, side=side, sorter=sorter) def _validate_searchsorted_value( self, value: NumpyValueArrayLike | ExtensionArray ) -> NumpyValueArrayLike: + # TODO(2.0): after deprecation in datetimelikearraymixin is enforced, + # we can remove this and use _validate_setitem_value directly if isinstance(value, ExtensionArray): return value.to_numpy() else: diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index bedde2dbf2558..fe49a003cd863 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -361,6 +361,8 @@ class SparseArray(OpsMixin, PandasObject, ExtensionArray): _subtyp = "sparse_array" # register ABCSparseArray _hidden_attrs = PandasObject._hidden_attrs | frozenset(["get_values"]) _sparse_index: SparseIndex + _sparse_values: np.ndarray + _dtype: SparseDtype def __init__( self, diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 8e38d542349d9..132c1d11dfe73 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -529,11 +529,11 @@ def _call_cython_op( counts = np.zeros(ngroups, dtype=np.int64) if self.how in ["min", "max", "mean"]: func( - result, - counts, - values, - comp_ids, - min_count, + out=result, + counts=counts, + values=values, + labels=comp_ids, + min_count=min_count, mask=mask, result_mask=result_mask, is_datetimelike=is_datetimelike, @@ -551,12 +551,12 @@ def _call_cython_op( elif self.how in ["add"]: # We support datetimelike func( - result, - counts, - values, - comp_ids, - min_count, - datetimelike=is_datetimelike, + out=result, + counts=counts, + values=values, + labels=comp_ids, + min_count=min_count, + is_datetimelike=is_datetimelike, ) else: func(result, counts, values, comp_ids, min_count) @@ -564,11 +564,11 @@ def _call_cython_op( # TODO: min_count if self.uses_mask(): func( - result, - values, - comp_ids, - ngroups, - is_datetimelike, + out=result, + values=values, + labels=comp_ids, + ngroups=ngroups, + is_datetimelike=is_datetimelike, mask=mask, **kwargs, ) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 25112ead2f0b7..2d3105939427c 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -890,7 +890,7 @@ def _engine( # error: Argument 1 to "ExtensionEngine" has incompatible type # "ndarray[Any, Any]"; expected "ExtensionArray" - return self._engine_type(target_values) # type:ignore[arg-type] + return self._engine_type(target_values) # type: ignore[arg-type] @final @cache_readonly @@ -3934,7 +3934,7 @@ def _get_indexer( # error: Argument 1 to "get_indexer" of "IndexEngine" has incompatible # type "Union[ExtensionArray, ndarray[Any, Any]]"; expected # "ndarray[Any, Any]" - indexer = self._engine.get_indexer(tgt_values) # type:ignore[arg-type] + indexer = self._engine.get_indexer(tgt_values) # type: ignore[arg-type] return ensure_platform_int(indexer) @@ -7167,7 +7167,7 @@ def ensure_index_from_sequences(sequences, names=None) -> Index: if len(sequences) == 1: if names is not None: names = names[0] - return Index(sequences[0], name=names) + return Index._with_infer(sequences[0], name=names) else: return MultiIndex.from_arrays(sequences, names=names) diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 2731c9ab447b7..5c6fa8d771210 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -227,6 +227,7 @@ def _parse_with_reso(self, label: str): return parsed, reso def _get_string_slice(self, key: str): + # 
overridden by TimedeltaIndex parsed, reso = self._parse_with_reso(key) try: return self._partial_date_slice(reso, parsed) diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index e6f4b352aea69..a296ce130bbf5 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -174,27 +174,27 @@ def asfreq(self, freq=None, how: str = "E") -> PeriodIndex: return type(self)._simple_new(arr, name=self.name) @doc(PeriodArray.to_timestamp) - def to_timestamp(self, freq=None, how="start") -> DatetimeIndex: + def to_timestamp(self, freq=None, how: str = "start") -> DatetimeIndex: arr = self._data.to_timestamp(freq, how) return DatetimeIndex._simple_new(arr, name=self.name) # https://github.com/python/mypy/issues/1362 # error: Decorated property not supported - @property # type:ignore[misc] + @property # type: ignore[misc] @doc(PeriodArray.hour.fget) def hour(self) -> Int64Index: return Int64Index(self._data.hour, name=self.name) # https://github.com/python/mypy/issues/1362 # error: Decorated property not supported - @property # type:ignore[misc] + @property # type: ignore[misc] @doc(PeriodArray.minute.fget) def minute(self) -> Int64Index: return Int64Index(self._data.minute, name=self.name) # https://github.com/python/mypy/issues/1362 # error: Decorated property not supported - @property # type:ignore[misc] + @property # type: ignore[misc] @doc(PeriodArray.second.fget) def second(self) -> Int64Index: return Int64Index(self._data.second, name=self.name) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 941220f25e12f..b873ffc6ee487 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -745,7 +745,7 @@ def _tupleize_axis_indexer(self, key) -> tuple: new_key = [slice(None)] * self.ndim # error: Invalid index type "Optional[Any]" for "List[slice]"; expected # type "SupportsIndex" - new_key[self.axis] = key # type:ignore[index] + new_key[self.axis] = key # type: ignore[index] return tuple(new_key) @final diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index 89e74cf19ee8e..9ec046fe18c53 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -3551,7 +3551,7 @@ def from_custom_template( # mypy doesn't like dynamically-defined classes # error: Variable "cls" is not valid as a type # error: Invalid base class "cls" - class MyStyler(cls): # type:ignore[valid-type,misc] + class MyStyler(cls): # type: ignore[valid-type,misc] env = jinja2.Environment(loader=loader) if html_table: template_html_table = env.get_template(html_table) diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 910ba63b5bb3a..ab04240e8e791 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -1067,7 +1067,7 @@ def _parse_numpy(self): # gets multiple values for keyword argument "dtype_if_empty self.obj = create_series_with_explicit_dtype( *data, dtype_if_empty=object - ) # type:ignore[misc] + ) # type: ignore[misc] else: self.obj = create_series_with_explicit_dtype(data, dtype_if_empty=object) diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index eda902d34bff5..3113aa1ad8452 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -1346,10 +1346,7 @@ def check_frame_setitem(self, elem, index: Index, inplace: bool): if inplace: # assertion here implies setting was done inplace - - # error: Item "ArrayManager" of "Union[ArrayManager, BlockManager]" has no - # attribute "blocks" - assert 
df._mgr.blocks[0].values is arr # type:ignore[union-attr] + assert df._mgr.arrays[0] is arr else: assert df.dtypes[0] == object diff --git a/pandas/tests/util/test_show_versions.py b/pandas/tests/util/test_show_versions.py index 7a1363099e7a0..57cd2e1a144b6 100644 --- a/pandas/tests/util/test_show_versions.py +++ b/pandas/tests/util/test_show_versions.py @@ -32,6 +32,7 @@ # https://github.com/pandas-dev/pandas/issues/35252 "ignore:Distutils:UserWarning" ) +@pytest.mark.filterwarnings("ignore:Setuptools is replacing distutils:UserWarning") def test_show_versions(tmpdir): # GH39701 as_json = os.path.join(tmpdir, "test_output.json")
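For context on the last hunk: `@pytest.mark.filterwarnings` takes a warning filter in pytest's "action:message:category" form, where the message part is a regex matched against the start of the warning text, and the filter applies only to the decorated test. A minimal sketch under those assumptions (the test name here is hypothetical):

import warnings

import pytest

@pytest.mark.filterwarnings("ignore:Setuptools is replacing distutils:UserWarning")
def test_distutils_warning_suppressed():
    # Under a strict config (e.g. filterwarnings = error), this warning
    # would otherwise fail the test; the mark scopes the ignore to it.
    warnings.warn("Setuptools is replacing distutils", UserWarning)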