From 7001e9a13699bf3fe53b25f231781e6b84b51fd5 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 1 Mar 2022 04:55:15 -0800 Subject: [PATCH] CLN: assorted (#46181) --- doc/source/whatsnew/v1.5.0.rst | 1 + pandas/_libs/groupby.pyi | 4 +- pandas/_libs/groupby.pyx | 328 +++++++++++++---------- pandas/_libs/missing.pyx | 8 +- pandas/_libs/parsers.pyx | 2 +- pandas/_libs/tslibs/vectorized.pyx | 24 +- pandas/core/algorithms.py | 24 +- pandas/core/apply.py | 2 +- pandas/core/array_algos/quantile.py | 21 +- pandas/core/arrays/_mixins.py | 8 +- pandas/core/arrays/sparse/array.py | 2 + pandas/core/groupby/ops.py | 32 +-- pandas/core/indexes/base.py | 6 +- pandas/core/indexes/datetimelike.py | 1 + pandas/core/indexes/period.py | 8 +- pandas/core/indexing.py | 2 +- pandas/io/formats/style.py | 2 +- pandas/io/json/_json.py | 2 +- pandas/tests/internals/test_internals.py | 5 +- pandas/tests/util/test_show_versions.py | 1 + 20 files changed, 274 insertions(+), 209 deletions(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index e47751245cb65..a6fb2fcce430a 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -295,6 +295,7 @@ Performance improvements - Performance improvement in :meth:`MultiIndex.get_locs` (:issue:`45681`, :issue:`46040`) - Performance improvement in :func:`merge` when left and/or right are empty (:issue:`45838`) - Performance improvement in :meth:`DataFrame.join` when left and/or right are empty (:issue:`46015`) +- Performance improvement in :func:`factorize` (:issue:`46109`) - Performance improvement in :class:`DataFrame` and :class:`Series` constructors for extension dtype scalars (:issue:`45854`) - diff --git a/pandas/_libs/groupby.pyi b/pandas/_libs/groupby.pyi index 8048f40ecf330..197a8bdc0cd7c 100644 --- a/pandas/_libs/groupby.pyi +++ b/pandas/_libs/groupby.pyi @@ -56,7 +56,7 @@ def group_add( values: np.ndarray, # ndarray[complexfloating_t, ndim=2] labels: np.ndarray, # const intp_t[:] min_count: int = ..., - datetimelike: bool = ..., + is_datetimelike: bool = ..., ) -> None: ... def group_prod( out: np.ndarray, # floating[:, ::1] @@ -104,6 +104,8 @@ def group_last( counts: np.ndarray, # int64_t[::1] values: np.ndarray, # ndarray[rank_t, ndim=2] labels: np.ndarray, # const int64_t[:] + mask: npt.NDArray[np.bool_] | None, + result_mask: npt.NDArray[np.bool_] | None, min_count: int = ..., # Py_ssize_t ) -> None: ... 
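The mask / result_mask pair added to the group_last stub here follows the masked-kernel convention used throughout these declarations: mask flags NA positions in the input, while result_mask is written in place to flag groups that produced no valid value. A minimal 1-D NumPy sketch of that convention — masked_group_last is a hypothetical name for illustration, not the Cython kernel:

    import numpy as np

    def masked_group_last(values, labels, mask, ngroups):
        # Sketch of the convention only: ``mask`` marks NA positions in
        # the input; ``result_mask`` marks groups that never saw a valid
        # value.
        out = np.zeros(ngroups, dtype=values.dtype)
        result_mask = np.ones(ngroups, dtype=bool)  # start as "all NA"
        for val, lab, is_na in zip(values, labels, mask):
            if lab < 0 or is_na:
                continue  # unassigned row or input NA: leave group untouched
            out[lab] = val  # later rows overwrite earlier ones, hence "last"
            result_mask[lab] = False
        return out, result_mask

    # masked_group_last(np.array([1.0, 2.0, 3.0]), np.array([0, 0, 1]),
    #                   np.array([False, True, False]), ngroups=2)
    # -> (array([1., 3.]), array([False, False]))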
def group_nth( diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 0c3c57d40d42d..d250a69c1985b 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -104,11 +104,13 @@ cdef inline float64_t median_linear(float64_t* a, int n) nogil: @cython.boundscheck(False) @cython.wraparound(False) -def group_median_float64(ndarray[float64_t, ndim=2] out, - ndarray[int64_t] counts, - ndarray[float64_t, ndim=2] values, - ndarray[intp_t] labels, - Py_ssize_t min_count=-1) -> None: +def group_median_float64( + ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[intp_t] labels, + Py_ssize_t min_count=-1, +) -> None: """ Only aggregates on axis=0 """ @@ -145,12 +147,14 @@ def group_median_float64(ndarray[float64_t, ndim=2] out, @cython.boundscheck(False) @cython.wraparound(False) -def group_cumprod_float64(float64_t[:, ::1] out, - const float64_t[:, :] values, - const intp_t[::1] labels, - int ngroups, - bint is_datetimelike, - bint skipna=True) -> None: +def group_cumprod_float64( + float64_t[:, ::1] out, + const float64_t[:, :] values, + const intp_t[::1] labels, + int ngroups, + bint is_datetimelike, + bint skipna=True, +) -> None: """ Cumulative product of columns of `values`, in row groups `labels`. @@ -202,12 +206,14 @@ def group_cumprod_float64(float64_t[:, ::1] out, @cython.boundscheck(False) @cython.wraparound(False) -def group_cumsum(numeric_t[:, ::1] out, - ndarray[numeric_t, ndim=2] values, - const intp_t[::1] labels, - int ngroups, - is_datetimelike, - bint skipna=True) -> None: +def group_cumsum( + numeric_t[:, ::1] out, + ndarray[numeric_t, ndim=2] values, + const intp_t[::1] labels, + int ngroups, + is_datetimelike, + bint skipna=True, +) -> None: """ Cumulative sum of columns of `values`, in row groups `labels`. @@ -271,8 +277,12 @@ def group_cumsum(numeric_t[:, ::1] out, @cython.boundscheck(False) @cython.wraparound(False) -def group_shift_indexer(int64_t[::1] out, const intp_t[::1] labels, - int ngroups, int periods) -> None: +def group_shift_indexer( + int64_t[::1] out, + const intp_t[::1] labels, + int ngroups, + int periods, +) -> None: cdef: Py_ssize_t N, i, j, ii, lab int offset = 0, sign @@ -323,10 +333,15 @@ def group_shift_indexer(int64_t[::1] out, const intp_t[::1] labels, @cython.wraparound(False) @cython.boundscheck(False) -def group_fillna_indexer(ndarray[intp_t] out, ndarray[intp_t] labels, - ndarray[intp_t] sorted_labels, - ndarray[uint8_t] mask, str direction, - int64_t limit, bint dropna) -> None: +def group_fillna_indexer( + ndarray[intp_t] out, + ndarray[intp_t] labels, + ndarray[intp_t] sorted_labels, + ndarray[uint8_t] mask, + str direction, + int64_t limit, + bint dropna, +) -> None: """ Indexes how to fill values forwards or backwards within a group. @@ -388,13 +403,15 @@ def group_fillna_indexer(ndarray[intp_t] out, ndarray[intp_t] labels, @cython.boundscheck(False) @cython.wraparound(False) -def group_any_all(int8_t[:, ::1] out, - const int8_t[:, :] values, - const intp_t[::1] labels, - const uint8_t[:, :] mask, - str val_test, - bint skipna, - bint nullable) -> None: +def group_any_all( + int8_t[:, ::1] out, + const int8_t[:, :] values, + const intp_t[::1] labels, + const uint8_t[:, :] mask, + str val_test, + bint skipna, + bint nullable, +) -> None: """ Aggregated boolean values to show truthfulness of group elements. 
If the input is a nullable type (nullable=True), the result will be computed @@ -488,12 +505,14 @@ ctypedef fused add_t: @cython.wraparound(False) @cython.boundscheck(False) -def group_add(add_t[:, ::1] out, - int64_t[::1] counts, - ndarray[add_t, ndim=2] values, - const intp_t[::1] labels, - Py_ssize_t min_count=0, - bint datetimelike=False) -> None: +def group_add( + add_t[:, ::1] out, + int64_t[::1] counts, + ndarray[add_t, ndim=2] values, + const intp_t[::1] labels, + Py_ssize_t min_count=0, + bint is_datetimelike=False, +) -> None: """ Only aggregates on axis=0 using Kahan summation """ @@ -560,7 +579,7 @@ def group_add(add_t[:, ::1] out, # is otherwise the same as in _treat_as_na if val == val and not ( add_t is float64_t - and datetimelike + and is_datetimelike and val == NPY_NAT ): nobs[lab, j] += 1 @@ -579,11 +598,13 @@ def group_add(add_t[:, ::1] out, @cython.wraparound(False) @cython.boundscheck(False) -def group_prod(floating[:, ::1] out, - int64_t[::1] counts, - ndarray[floating, ndim=2] values, - const intp_t[::1] labels, - Py_ssize_t min_count=0) -> None: +def group_prod( + floating[:, ::1] out, + int64_t[::1] counts, + ndarray[floating, ndim=2] values, + const intp_t[::1] labels, + Py_ssize_t min_count=0, +) -> None: """ Only aggregates on axis=0 """ @@ -628,12 +649,14 @@ def group_prod(floating[:, ::1] out, @cython.wraparound(False) @cython.boundscheck(False) @cython.cdivision(True) -def group_var(floating[:, ::1] out, - int64_t[::1] counts, - ndarray[floating, ndim=2] values, - const intp_t[::1] labels, - Py_ssize_t min_count=-1, - int64_t ddof=1) -> None: +def group_var( + floating[:, ::1] out, + int64_t[::1] counts, + ndarray[floating, ndim=2] values, + const intp_t[::1] labels, + Py_ssize_t min_count=-1, + int64_t ddof=1, +) -> None: cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) floating val, ct, oldmean @@ -682,15 +705,16 @@ def group_var(floating[:, ::1] out, @cython.wraparound(False) @cython.boundscheck(False) -def group_mean(mean_t[:, ::1] out, - int64_t[::1] counts, - ndarray[mean_t, ndim=2] values, - const intp_t[::1] labels, - Py_ssize_t min_count=-1, - bint is_datetimelike=False, - const uint8_t[:, ::1] mask=None, - uint8_t[:, ::1] result_mask=None - ) -> None: +def group_mean( + mean_t[:, ::1] out, + int64_t[::1] counts, + ndarray[mean_t, ndim=2] values, + const intp_t[::1] labels, + Py_ssize_t min_count=-1, + bint is_datetimelike=False, + const uint8_t[:, ::1] mask=None, + uint8_t[:, ::1] result_mask=None, +) -> None: """ Compute the mean per label given a label assignment for each value. NaN values are ignored. 
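The reformatted group_add above keeps the behavior its docstring names: Kahan (compensated) summation, where a per-group compensation term recaptures the low-order bits lost to float rounding. A scalar sketch of the technique, assuming plain Python floats (the kernel keeps one total and one compensation per group/column pair):

    def kahan_sum(values):
        # Compensated (Kahan) summation: ``comp`` captures the low-order
        # bits that ``total`` loses when it absorbs each new value.
        total = 0.0
        comp = 0.0
        for val in values:
            y = val - comp
            t = total + y
            comp = (t - total) - y  # the part of ``y`` that was rounded away
            total = t
        return total

    # plain sum([1.0] + [1e-16] * 1000) stays at exactly 1.0, since each
    # 1e-16 is below half an ulp of 1.0; kahan_sum of the same list comes
    # out near the true value 1.0000000000001.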
@@ -770,11 +794,13 @@ def group_mean(mean_t[:, ::1] out, @cython.wraparound(False) @cython.boundscheck(False) -def group_ohlc(floating[:, ::1] out, - int64_t[::1] counts, - ndarray[floating, ndim=2] values, - const intp_t[::1] labels, - Py_ssize_t min_count=-1) -> None: +def group_ohlc( + floating[:, ::1] out, + int64_t[::1] counts, + ndarray[floating, ndim=2] values, + const intp_t[::1] labels, + Py_ssize_t min_count=-1, +) -> None: """ Only aggregates on axis=0 """ @@ -817,13 +843,15 @@ def group_ohlc(floating[:, ::1] out, @cython.boundscheck(False) @cython.wraparound(False) -def group_quantile(ndarray[float64_t, ndim=2] out, - ndarray[numeric_t, ndim=1] values, - ndarray[intp_t] labels, - ndarray[uint8_t] mask, - const intp_t[:] sort_indexer, - const float64_t[:] qs, - str interpolation) -> None: +def group_quantile( + ndarray[float64_t, ndim=2] out, + ndarray[numeric_t, ndim=1] values, + ndarray[intp_t] labels, + ndarray[uint8_t] mask, + const intp_t[:] sort_indexer, + const float64_t[:] qs, + str interpolation, +) -> None: """ Calculate the quantile per group. @@ -949,17 +977,19 @@ cdef inline bint _treat_as_na(iu_64_floating_obj_t val, bint is_datetimelike) no return val != val -# GH#31710 use memorviews once cython 0.30 is released so we can +# TODO(cython3): GH#31710 use memorviews once cython 0.30 is released so we can # use `const iu_64_floating_obj_t[:, :] values` @cython.wraparound(False) @cython.boundscheck(False) -def group_last(iu_64_floating_obj_t[:, ::1] out, - int64_t[::1] counts, - ndarray[iu_64_floating_obj_t, ndim=2] values, - const intp_t[::1] labels, - const uint8_t[:, :] mask, - uint8_t[:, ::1] result_mask=None, - Py_ssize_t min_count=-1) -> None: +def group_last( + iu_64_floating_obj_t[:, ::1] out, + int64_t[::1] counts, + ndarray[iu_64_floating_obj_t, ndim=2] values, + const intp_t[::1] labels, + const uint8_t[:, :] mask, + uint8_t[:, ::1] result_mask=None, + Py_ssize_t min_count=-1, +) -> None: """ Only aggregates on axis=0 """ @@ -1058,19 +1088,20 @@ def group_last(iu_64_floating_obj_t[:, ::1] out, raise RuntimeError("empty group with uint64_t") -# GH#31710 use memorviews once cython 0.30 is released so we can +# TODO(cython3): GH#31710 use memorviews once cython 0.30 is released so we can # use `const iu_64_floating_obj_t[:, :] values` @cython.wraparound(False) @cython.boundscheck(False) -def group_nth(iu_64_floating_obj_t[:, ::1] out, - int64_t[::1] counts, - ndarray[iu_64_floating_obj_t, ndim=2] values, - const intp_t[::1] labels, - const uint8_t[:, :] mask, - uint8_t[:, ::1] result_mask=None, - int64_t min_count=-1, - int64_t rank=1, - ) -> None: +def group_nth( + iu_64_floating_obj_t[:, ::1] out, + int64_t[::1] counts, + ndarray[iu_64_floating_obj_t, ndim=2] values, + const intp_t[::1] labels, + const uint8_t[:, :] mask, + uint8_t[:, ::1] result_mask=None, + int64_t min_count=-1, + int64_t rank=1, +) -> None: """ Only aggregates on axis=0 """ @@ -1173,12 +1204,17 @@ def group_nth(iu_64_floating_obj_t[:, ::1] out, @cython.boundscheck(False) @cython.wraparound(False) -def group_rank(float64_t[:, ::1] out, - ndarray[iu_64_floating_obj_t, ndim=2] values, - const intp_t[::1] labels, - int ngroups, - bint is_datetimelike, str ties_method="average", - bint ascending=True, bint pct=False, str na_option="keep") -> None: +def group_rank( + float64_t[:, ::1] out, + ndarray[iu_64_floating_obj_t, ndim=2] values, + const intp_t[::1] labels, + int ngroups, + bint is_datetimelike, + str ties_method="average", + bint ascending=True, + bint pct=False, + str na_option="keep", 
+) -> None: """ Provides the rank of values within each group. @@ -1244,15 +1280,17 @@ def group_rank(float64_t[:, ::1] out, @cython.wraparound(False) @cython.boundscheck(False) -cdef group_min_max(iu_64_floating_t[:, ::1] out, - int64_t[::1] counts, - ndarray[iu_64_floating_t, ndim=2] values, - const intp_t[::1] labels, - Py_ssize_t min_count=-1, - bint is_datetimelike=False, - bint compute_max=True, - const uint8_t[:, ::1] mask=None, - uint8_t[:, ::1] result_mask=None): +cdef group_min_max( + iu_64_floating_t[:, ::1] out, + int64_t[::1] counts, + ndarray[iu_64_floating_t, ndim=2] values, + const intp_t[::1] labels, + Py_ssize_t min_count=-1, + bint is_datetimelike=False, + bint compute_max=True, + const uint8_t[:, ::1] mask=None, + uint8_t[:, ::1] result_mask=None, +): """ Compute minimum/maximum of columns of `values`, in row groups `labels`. @@ -1366,14 +1404,16 @@ cdef group_min_max(iu_64_floating_t[:, ::1] out, @cython.wraparound(False) @cython.boundscheck(False) -def group_max(iu_64_floating_t[:, ::1] out, - int64_t[::1] counts, - ndarray[iu_64_floating_t, ndim=2] values, - const intp_t[::1] labels, - Py_ssize_t min_count=-1, - bint is_datetimelike=False, - const uint8_t[:, ::1] mask=None, - uint8_t[:, ::1] result_mask=None) -> None: +def group_max( + iu_64_floating_t[:, ::1] out, + int64_t[::1] counts, + ndarray[iu_64_floating_t, ndim=2] values, + const intp_t[::1] labels, + Py_ssize_t min_count=-1, + bint is_datetimelike=False, + const uint8_t[:, ::1] mask=None, + uint8_t[:, ::1] result_mask=None, +) -> None: """See group_min_max.__doc__""" group_min_max( out, @@ -1390,14 +1430,16 @@ def group_max(iu_64_floating_t[:, ::1] out, @cython.wraparound(False) @cython.boundscheck(False) -def group_min(iu_64_floating_t[:, ::1] out, - int64_t[::1] counts, - ndarray[iu_64_floating_t, ndim=2] values, - const intp_t[::1] labels, - Py_ssize_t min_count=-1, - bint is_datetimelike=False, - const uint8_t[:, ::1] mask=None, - uint8_t[:, ::1] result_mask=None) -> None: +def group_min( + iu_64_floating_t[:, ::1] out, + int64_t[::1] counts, + ndarray[iu_64_floating_t, ndim=2] values, + const intp_t[::1] labels, + Py_ssize_t min_count=-1, + bint is_datetimelike=False, + const uint8_t[:, ::1] mask=None, + uint8_t[:, ::1] result_mask=None, +) -> None: """See group_min_max.__doc__""" group_min_max( out, @@ -1414,14 +1456,16 @@ def group_min(iu_64_floating_t[:, ::1] out, @cython.boundscheck(False) @cython.wraparound(False) -cdef group_cummin_max(iu_64_floating_t[:, ::1] out, - ndarray[iu_64_floating_t, ndim=2] values, - uint8_t[:, ::1] mask, - const intp_t[::1] labels, - int ngroups, - bint is_datetimelike, - bint skipna, - bint compute_max): +cdef group_cummin_max( + iu_64_floating_t[:, ::1] out, + ndarray[iu_64_floating_t, ndim=2] values, + uint8_t[:, ::1] mask, + const intp_t[::1] labels, + int ngroups, + bint is_datetimelike, + bint skipna, + bint compute_max, +): """ Cumulative minimum/maximum of columns of `values`, in row groups `labels`. 
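group_cummin_max keeps one running extremum per (group, column) and rewrites each row with its group's value so far; group_cummin and group_cummax below are thin wrappers that pick the direction via compute_max. A hypothetical 1-D float sketch of the idea, without the skipna/mask handling the real kernel carries:

    import numpy as np

    def group_cummax_sketch(values, labels, ngroups):
        # One accumulator per group, updated row by row; each row's output
        # is its group's maximum over the rows seen so far.
        accum = np.full(ngroups, -np.inf)
        out = np.empty(len(values), dtype=np.float64)
        for i, (val, lab) in enumerate(zip(values, labels)):
            if lab < 0:
                out[i] = np.nan  # row not assigned to any group
                continue
            accum[lab] = max(accum[lab], val)
            out[i] = accum[lab]
        return out

    # group_cummax_sketch(np.array([1.0, 5.0, 2.0]), np.array([0, 1, 0]), 2)
    # -> array([1., 5., 2.])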
@@ -1527,13 +1571,15 @@ cdef group_cummin_max(iu_64_floating_t[:, ::1] out, @cython.boundscheck(False) @cython.wraparound(False) -def group_cummin(iu_64_floating_t[:, ::1] out, - ndarray[iu_64_floating_t, ndim=2] values, - const intp_t[::1] labels, - int ngroups, - bint is_datetimelike, - uint8_t[:, ::1] mask=None, - bint skipna=True) -> None: +def group_cummin( + iu_64_floating_t[:, ::1] out, + ndarray[iu_64_floating_t, ndim=2] values, + const intp_t[::1] labels, + int ngroups, + bint is_datetimelike, + uint8_t[:, ::1] mask=None, + bint skipna=True, +) -> None: """See group_cummin_max.__doc__""" group_cummin_max( out, @@ -1549,13 +1595,15 @@ def group_cummin(iu_64_floating_t[:, ::1] out, @cython.boundscheck(False) @cython.wraparound(False) -def group_cummax(iu_64_floating_t[:, ::1] out, - ndarray[iu_64_floating_t, ndim=2] values, - const intp_t[::1] labels, - int ngroups, - bint is_datetimelike, - uint8_t[:, ::1] mask=None, - bint skipna=True) -> None: +def group_cummax( + iu_64_floating_t[:, ::1] out, + ndarray[iu_64_floating_t, ndim=2] values, + const intp_t[::1] labels, + int ngroups, + bint is_datetimelike, + uint8_t[:, ::1] mask=None, + bint skipna=True, +) -> None: """See group_cummin_max.__doc__""" group_cummin_max( out, diff --git a/pandas/_libs/missing.pyx b/pandas/_libs/missing.pyx index 62977f0fd2b4c..e65196c9105d4 100644 --- a/pandas/_libs/missing.pyx +++ b/pandas/_libs/missing.pyx @@ -312,7 +312,7 @@ def _create_binary_propagating_op(name, is_divmod=False): def method(self, other): if (other is C_NA or isinstance(other, str) or isinstance(other, (numbers.Number, np.bool_)) - or isinstance(other, np.ndarray) and not other.shape): + or util.is_array(other) and not other.shape): # Need the other.shape clause to handle NumPy scalars, # since we do a setitem on `out` below, which # won't work for NumPy scalars. 
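In the missing.pyx hunks, util.is_array is a Cython-level check for the same thing as isinstance(other, np.ndarray), and the surviving `not other.shape` clause matters because 0-d ndarrays have an empty shape tuple and must take the scalar return path rather than the `out[:] = NA` setitem path. A quick illustration of the distinction, using only plain NumPy:

    import numpy as np

    zero_d = np.array(1.5)                    # 0-d ndarray
    print(zero_d.shape, bool(zero_d.shape))   # () False  -> scalar branch
    one_d = np.array([1.5, 2.5])
    print(one_d.shape, bool(one_d.shape))     # (2,) True -> array branch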
@@ -321,7 +321,7 @@ def _create_binary_propagating_op(name, is_divmod=False): else: return NA - elif isinstance(other, np.ndarray): + elif util.is_array(other): out = np.empty(other.shape, dtype=object) out[:] = NA @@ -433,7 +433,7 @@ class NAType(C_NAType): return type(other)(1) else: return NA - elif isinstance(other, np.ndarray): + elif util.is_array(other): return np.where(other == 0, other.dtype.type(1), NA) return NotImplemented @@ -446,7 +446,7 @@ class NAType(C_NAType): return other else: return NA - elif isinstance(other, np.ndarray): + elif util.is_array(other): return np.where(other == 1, other, NA) return NotImplemented diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index b4d2c60837a7e..59a86751964e6 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -1313,7 +1313,7 @@ cdef class TextReader: # Factor out code common to TextReader.__dealloc__ and TextReader.close # It cannot be a class method, since calling self.close() in __dealloc__ -# which causes a class attribute lookup and violates best parctices +# which causes a class attribute lookup and violates best practices # https://cython.readthedocs.io/en/latest/src/userguide/special_methods.html#finalization-method-dealloc cdef _close(TextReader reader): # also preemptively free all allocated memory diff --git a/pandas/_libs/tslibs/vectorized.pyx b/pandas/_libs/tslibs/vectorized.pyx index 02bdae3a8dbac..63cf7d2ce23ee 100644 --- a/pandas/_libs/tslibs/vectorized.pyx +++ b/pandas/_libs/tslibs/vectorized.pyx @@ -85,7 +85,7 @@ cdef inline object create_time_from_ts( @cython.wraparound(False) @cython.boundscheck(False) def ints_to_pydatetime( - const int64_t[:] arr, + const int64_t[:] stamps, tzinfo tz=None, object freq=None, bint fold=False, @@ -96,7 +96,7 @@ def ints_to_pydatetime( Parameters ---------- - arr : array of i8 + stamps : array of i8 tz : str, optional convert to this timezone freq : str/Offset, optional @@ -119,14 +119,14 @@ def ints_to_pydatetime( ndarray[object] of type specified by box """ cdef: - Py_ssize_t i, n = len(arr) + Py_ssize_t i, n = len(stamps) ndarray[int64_t] trans int64_t[:] deltas intp_t[:] pos npy_datetimestruct dts object dt, new_tz str typ - int64_t value, local_value, delta = NPY_NAT # dummy for delta + int64_t value, local_val, delta = NPY_NAT # dummy for delta ndarray[object] result = np.empty(n, dtype=object) object (*func_create)(int64_t, npy_datetimestruct, tzinfo, object, bint) bint use_utc = False, use_tzlocal = False, use_fixed = False @@ -161,34 +161,34 @@ def ints_to_pydatetime( use_fixed = True delta = deltas[0] else: - pos = trans.searchsorted(arr, side="right") - 1 + pos = trans.searchsorted(stamps, side="right") - 1 use_pytz = typ == "pytz" for i in range(n): new_tz = tz - value = arr[i] + value = stamps[i] if value == NPY_NAT: result[i] = NaT else: if use_utc: - local_value = value + local_val = value elif use_tzlocal: - local_value = tz_convert_utc_to_tzlocal(value, tz) + local_val = tz_convert_utc_to_tzlocal(value, tz) elif use_fixed: - local_value = value + delta + local_val = value + delta elif not use_pytz: # i.e. dateutil # no zone-name change for dateutil tzs - dst etc # represented in single object. 
- local_value = value + deltas[pos[i]] + local_val = value + deltas[pos[i]] else: # pytz # find right representation of dst etc in pytz timezone new_tz = tz._tzinfos[tz._transition_info[pos[i]]] - local_value = value + deltas[pos[i]] + local_val = value + deltas[pos[i]] - dt64_to_dtstruct(local_value, &dts) + dt64_to_dtstruct(local_val, &dts) result[i] = func_create(value, dts, new_tz, freq, fold) return result diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index b2e2299c0f2b6..8c838d8944d14 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -224,14 +224,10 @@ def _reconstruct_data( if not isinstance(dtype, np.dtype): # i.e. ExtensionDtype cls = dtype.construct_array_type() - if isinstance(values, cls) and values.dtype == dtype: - return values values = cls._from_sequence(values, dtype=dtype) - elif is_bool_dtype(dtype): - values = values.astype(dtype, copy=False) - elif dtype is not None: + else: if is_datetime64_dtype(dtype): dtype = np.dtype("datetime64[ns]") elif is_timedelta64_dtype(dtype): @@ -858,6 +854,7 @@ def value_counts( counts = result._values else: + values = _ensure_arraylike(values) keys, counts = value_counts_arraylike(values, dropna) # For backwards compatibility, we let Index do its normal type @@ -878,19 +875,18 @@ def value_counts( # Called once from SparseArray, otherwise could be private -def value_counts_arraylike(values, dropna: bool): +def value_counts_arraylike(values: np.ndarray, dropna: bool): """ Parameters ---------- - values : arraylike + values : np.ndarray dropna : bool Returns ------- - uniques : np.ndarray or ExtensionArray - counts : np.ndarray + uniques : np.ndarray + counts : np.ndarray[np.int64] """ - values = _ensure_arraylike(values) original = values values = _ensure_data(values) @@ -900,8 +896,8 @@ def value_counts_arraylike(values, dropna: bool): # datetime, timedelta, or period if dropna: - msk = keys != iNaT - keys, counts = keys[msk], counts[msk] + mask = keys != iNaT + keys, counts = keys[mask], counts[mask] res_keys = _reconstruct_data(keys, original.dtype, original) return res_keys, counts @@ -1029,7 +1025,7 @@ def rank( def checked_add_with_arr( - arr: np.ndarray, + arr: npt.NDArray[np.int64], b, arr_mask: npt.NDArray[np.bool_] | None = None, b_mask: npt.NDArray[np.bool_] | None = None, @@ -1044,7 +1040,7 @@ def checked_add_with_arr( Parameters ---------- - arr : array addend. + arr : np.ndarray[int64] addend. b : array or scalar addend. 
arr_mask : np.ndarray[bool] or None, default None array indicating which elements to exclude from checking diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 238f1382890c9..74a225bb18fd8 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -595,7 +595,7 @@ def normalize_dictlike_arg( for k, v in func.items(): if not is_aggregator(v): # mypy can't realize v is not a list here - new_func[k] = [v] # type:ignore[list-item] + new_func[k] = [v] # type: ignore[list-item] else: new_func[k] = v func = new_func diff --git a/pandas/core/array_algos/quantile.py b/pandas/core/array_algos/quantile.py index 6bfc2b63448ae..12c6691fe6c63 100644 --- a/pandas/core/array_algos/quantile.py +++ b/pandas/core/array_algos/quantile.py @@ -103,7 +103,7 @@ def _nanpercentile_1d( mask: npt.NDArray[np.bool_], qs: npt.NDArray[np.float64], na_value: Scalar, - interpolation, + interpolation: str, ) -> Scalar | np.ndarray: """ Wrapper for np.percentile that skips missing values, specialized to @@ -132,7 +132,14 @@ def _nanpercentile_1d( # equiv: 'np.array([na_value] * len(qs))' but much faster return np.full(len(qs), na_value) - return np.percentile(values, qs, **{np_percentile_argname: interpolation}) + return np.percentile( + values, + qs, + # error: No overload variant of "percentile" matches argument types + # "ndarray[Any, Any]", "ndarray[Any, dtype[floating[_64Bit]]]", + # "int", "Dict[str, str]" + **{np_percentile_argname: interpolation}, # type: ignore[call-overload] + ) def _nanpercentile( @@ -141,7 +148,7 @@ def _nanpercentile( *, na_value, mask: npt.NDArray[np.bool_], - interpolation, + interpolation: str, ): """ Wrapper for np.percentile that skips missing values. @@ -186,5 +193,11 @@ def _nanpercentile( return result else: return np.percentile( - values, qs, axis=1, **{np_percentile_argname: interpolation} + values, + qs, + axis=1, + # error: No overload variant of "percentile" matches argument types + # "ndarray[Any, Any]", "ndarray[Any, dtype[floating[_64Bit]]]", + # "int", "Dict[str, str]" + **{np_percentile_argname: interpolation}, # type: ignore[call-overload] ) diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py index 9fe8c6f78f875..cc58d7ddd03dc 100644 --- a/pandas/core/arrays/_mixins.py +++ b/pandas/core/arrays/_mixins.py @@ -186,7 +186,7 @@ def _values_for_argsort(self) -> np.ndarray: return self._ndarray # Signature of "argmin" incompatible with supertype "ExtensionArray" - def argmin(self, axis: int = 0, skipna: bool = True): # type:ignore[override] + def argmin(self, axis: int = 0, skipna: bool = True): # type: ignore[override] # override base class by adding axis keyword validate_bool_kwarg(skipna, "skipna") if not skipna and self._hasna: @@ -194,7 +194,7 @@ def argmin(self, axis: int = 0, skipna: bool = True): # type:ignore[override] return nargminmax(self, "argmin", axis=axis) # Signature of "argmax" incompatible with supertype "ExtensionArray" - def argmax(self, axis: int = 0, skipna: bool = True): # type:ignore[override] + def argmax(self, axis: int = 0, skipna: bool = True): # type: ignore[override] # override base class by adding axis keyword validate_bool_kwarg(skipna, "skipna") if not skipna and self._hasna: @@ -229,12 +229,16 @@ def searchsorted( side: Literal["left", "right"] = "left", sorter: NumpySorter = None, ) -> npt.NDArray[np.intp] | np.intp: + # TODO(2.0): use _validate_setitem_value once dt64tz mismatched-timezone + # deprecation is enforced npvalue = self._validate_searchsorted_value(value) return 
self._ndarray.searchsorted(npvalue, side=side, sorter=sorter) def _validate_searchsorted_value( self, value: NumpyValueArrayLike | ExtensionArray ) -> NumpyValueArrayLike: + # TODO(2.0): after deprecation in datetimelikearraymixin is enforced, + # we can remove this and use _validate_setitem_value directly if isinstance(value, ExtensionArray): return value.to_numpy() else: diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index bedde2dbf2558..fe49a003cd863 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -361,6 +361,8 @@ class SparseArray(OpsMixin, PandasObject, ExtensionArray): _subtyp = "sparse_array" # register ABCSparseArray _hidden_attrs = PandasObject._hidden_attrs | frozenset(["get_values"]) _sparse_index: SparseIndex + _sparse_values: np.ndarray + _dtype: SparseDtype def __init__( self, diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 8e38d542349d9..132c1d11dfe73 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -529,11 +529,11 @@ def _call_cython_op( counts = np.zeros(ngroups, dtype=np.int64) if self.how in ["min", "max", "mean"]: func( - result, - counts, - values, - comp_ids, - min_count, + out=result, + counts=counts, + values=values, + labels=comp_ids, + min_count=min_count, mask=mask, result_mask=result_mask, is_datetimelike=is_datetimelike, @@ -551,12 +551,12 @@ def _call_cython_op( elif self.how in ["add"]: # We support datetimelike func( - result, - counts, - values, - comp_ids, - min_count, - datetimelike=is_datetimelike, + out=result, + counts=counts, + values=values, + labels=comp_ids, + min_count=min_count, + is_datetimelike=is_datetimelike, ) else: func(result, counts, values, comp_ids, min_count) @@ -564,11 +564,11 @@ def _call_cython_op( # TODO: min_count if self.uses_mask(): func( - result, - values, - comp_ids, - ngroups, - is_datetimelike, + out=result, + values=values, + labels=comp_ids, + ngroups=ngroups, + is_datetimelike=is_datetimelike, mask=mask, **kwargs, ) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 25112ead2f0b7..2d3105939427c 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -890,7 +890,7 @@ def _engine( # error: Argument 1 to "ExtensionEngine" has incompatible type # "ndarray[Any, Any]"; expected "ExtensionArray" - return self._engine_type(target_values) # type:ignore[arg-type] + return self._engine_type(target_values) # type: ignore[arg-type] @final @cache_readonly @@ -3934,7 +3934,7 @@ def _get_indexer( # error: Argument 1 to "get_indexer" of "IndexEngine" has incompatible # type "Union[ExtensionArray, ndarray[Any, Any]]"; expected # "ndarray[Any, Any]" - indexer = self._engine.get_indexer(tgt_values) # type:ignore[arg-type] + indexer = self._engine.get_indexer(tgt_values) # type: ignore[arg-type] return ensure_platform_int(indexer) @@ -7167,7 +7167,7 @@ def ensure_index_from_sequences(sequences, names=None) -> Index: if len(sequences) == 1: if names is not None: names = names[0] - return Index(sequences[0], name=names) + return Index._with_infer(sequences[0], name=names) else: return MultiIndex.from_arrays(sequences, names=names) diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 2731c9ab447b7..5c6fa8d771210 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -227,6 +227,7 @@ def _parse_with_reso(self, label: str): return parsed, reso def _get_string_slice(self, key: str): + # 
overridden by TimedeltaIndex parsed, reso = self._parse_with_reso(key) try: return self._partial_date_slice(reso, parsed) diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index e6f4b352aea69..a296ce130bbf5 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -174,27 +174,27 @@ def asfreq(self, freq=None, how: str = "E") -> PeriodIndex: return type(self)._simple_new(arr, name=self.name) @doc(PeriodArray.to_timestamp) - def to_timestamp(self, freq=None, how="start") -> DatetimeIndex: + def to_timestamp(self, freq=None, how: str = "start") -> DatetimeIndex: arr = self._data.to_timestamp(freq, how) return DatetimeIndex._simple_new(arr, name=self.name) # https://github.com/python/mypy/issues/1362 # error: Decorated property not supported - @property # type:ignore[misc] + @property # type: ignore[misc] @doc(PeriodArray.hour.fget) def hour(self) -> Int64Index: return Int64Index(self._data.hour, name=self.name) # https://github.com/python/mypy/issues/1362 # error: Decorated property not supported - @property # type:ignore[misc] + @property # type: ignore[misc] @doc(PeriodArray.minute.fget) def minute(self) -> Int64Index: return Int64Index(self._data.minute, name=self.name) # https://github.com/python/mypy/issues/1362 # error: Decorated property not supported - @property # type:ignore[misc] + @property # type: ignore[misc] @doc(PeriodArray.second.fget) def second(self) -> Int64Index: return Int64Index(self._data.second, name=self.name) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 941220f25e12f..b873ffc6ee487 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -745,7 +745,7 @@ def _tupleize_axis_indexer(self, key) -> tuple: new_key = [slice(None)] * self.ndim # error: Invalid index type "Optional[Any]" for "List[slice]"; expected # type "SupportsIndex" - new_key[self.axis] = key # type:ignore[index] + new_key[self.axis] = key # type: ignore[index] return tuple(new_key) @final diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index 89e74cf19ee8e..9ec046fe18c53 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -3551,7 +3551,7 @@ def from_custom_template( # mypy doesn't like dynamically-defined classes # error: Variable "cls" is not valid as a type # error: Invalid base class "cls" - class MyStyler(cls): # type:ignore[valid-type,misc] + class MyStyler(cls): # type: ignore[valid-type,misc] env = jinja2.Environment(loader=loader) if html_table: template_html_table = env.get_template(html_table) diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 910ba63b5bb3a..ab04240e8e791 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -1067,7 +1067,7 @@ def _parse_numpy(self): # gets multiple values for keyword argument "dtype_if_empty self.obj = create_series_with_explicit_dtype( *data, dtype_if_empty=object - ) # type:ignore[misc] + ) # type: ignore[misc] else: self.obj = create_series_with_explicit_dtype(data, dtype_if_empty=object) diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index eda902d34bff5..3113aa1ad8452 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -1346,10 +1346,7 @@ def check_frame_setitem(self, elem, index: Index, inplace: bool): if inplace: # assertion here implies setting was done inplace - - # error: Item "ArrayManager" of "Union[ArrayManager, BlockManager]" has no - # attribute "blocks" - assert 
df._mgr.blocks[0].values is arr # type:ignore[union-attr] + assert df._mgr.arrays[0] is arr else: assert df.dtypes[0] == object diff --git a/pandas/tests/util/test_show_versions.py b/pandas/tests/util/test_show_versions.py index 7a1363099e7a0..57cd2e1a144b6 100644 --- a/pandas/tests/util/test_show_versions.py +++ b/pandas/tests/util/test_show_versions.py @@ -32,6 +32,7 @@ # https://github.com/pandas-dev/pandas/issues/35252 "ignore:Distutils:UserWarning" ) +@pytest.mark.filterwarnings("ignore:Setuptools is replacing distutils:UserWarning") def test_show_versions(tmpdir): # GH39701 as_json = os.path.join(tmpdir, "test_output.json")
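A note on the pandas/core/groupby/ops.py hunks above: passing the Cython kernels their arguments by keyword (out=, counts=, values=, labels=, ...) makes renames such as group_add's datetimelike -> is_datetimelike fail loudly at the call site instead of silently binding positionally. A small stand-in showing the failure mode — kernel here is hypothetical, not the real signature:

    def kernel(out, counts, values, labels, min_count=0, is_datetimelike=False):
        # stand-in for a groupby aggregation kernel after the rename
        return is_datetimelike

    # The keyword call site breaks immediately under the old spelling:
    try:
        kernel(out=[], counts=[], values=[], labels=[], datetimelike=True)
    except TypeError as exc:
        print(exc)  # ... got an unexpected keyword argument 'datetimelike'

    # A positional call would have kept "working" without complaint:
    print(kernel([], [], [], [], 0, True))  # True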