From 85be99eac9b78afcf98955cd85c60d75c5726242 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 5 Sep 2024 07:24:22 -1000 Subject: [PATCH 001/224] PERF: CategoricalDtype.update_dtype (#59647) * PERF: CategoricalDtype.update_dtype * Add whatsnew number add comment * Fix unit test * short circut only for the dtype --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/dtypes/dtypes.py | 7 +++++++ 2 files changed, 8 insertions(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 75d3ff1193f8d..cd353b60d1a6e 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -528,6 +528,7 @@ Performance improvements - Performance improvement in :meth:`RangeIndex.reindex` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57647`, :issue:`57752`) - Performance improvement in :meth:`RangeIndex.take` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57445`, :issue:`57752`) - Performance improvement in :func:`merge` if hash-join can be used (:issue:`57970`) +- Performance improvement in :meth:`CategoricalDtype.update_dtype` when ``dtype`` is a :class:`CategoricalDtype` with non ``None`` categories and ordered (:issue:`59647`) - Performance improvement in :meth:`to_hdf` avoid unnecessary reopenings of the HDF5 file to speedup data addition to files with a very large number of groups . (:issue:`58248`) - Performance improvement in ``DataFrameGroupBy.__len__`` and ``SeriesGroupBy.__len__`` (:issue:`57595`) - Performance improvement in indexing operations for string dtypes (:issue:`56997`) diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 54003e67be7ba..68b4807961d19 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -611,6 +611,13 @@ def update_dtype(self, dtype: str_type | CategoricalDtype) -> CategoricalDtype: dtype = cast(CategoricalDtype, dtype) # update categories/ordered unless they've been explicitly passed as None + if ( + isinstance(dtype, CategoricalDtype) + and dtype.categories is not None + and dtype.ordered is not None + ): + # Avoid re-validation in CategoricalDtype constructor + return dtype new_categories = ( dtype.categories if dtype.categories is not None else self.categories ) From 4f1052e390ea6d33e81ec1dc7c6801bb6b5b79ef Mon Sep 17 00:00:00 2001 From: "Mien (Josephine) Nguyen" Date: Thu, 5 Sep 2024 14:07:07 -0400 Subject: [PATCH 002/224] TST: Update BooleanArray _logical_method test to fail on incorrect length comparison operator (#59708) Test --- pandas/tests/arrays/boolean/test_logical.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/pandas/tests/arrays/boolean/test_logical.py b/pandas/tests/arrays/boolean/test_logical.py index 66c117ea3fc66..97a24e0f24756 100644 --- a/pandas/tests/arrays/boolean/test_logical.py +++ b/pandas/tests/arrays/boolean/test_logical.py @@ -60,19 +60,20 @@ def test_eq_mismatched_type(self, other): expected = pd.array([True, True]) tm.assert_extension_array_equal(result, expected) - def test_logical_length_mismatch_raises(self, all_logical_operators): + @pytest.mark.parametrize("other", [[True, False], [True, False, True, False]]) + def test_logical_length_mismatch_raises(self, other, all_logical_operators): op_name = all_logical_operators a = pd.array([True, False, None], dtype="boolean") msg = "Lengths must match" with pytest.raises(ValueError, match=msg): - getattr(a, op_name)([True, False]) + 
getattr(a, op_name)(other) with pytest.raises(ValueError, match=msg): - getattr(a, op_name)(np.array([True, False])) + getattr(a, op_name)(np.array(other)) with pytest.raises(ValueError, match=msg): - getattr(a, op_name)(pd.array([True, False], dtype="boolean")) + getattr(a, op_name)(pd.array(other, dtype="boolean")) def test_logical_nan_raises(self, all_logical_operators): op_name = all_logical_operators From 6c30aa22c4537e3ccf5fd968d00c328cd1865545 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 5 Sep 2024 16:21:06 -0700 Subject: [PATCH 003/224] REF (string): de-duplicate _str_contains (#59709) * REF: de-duplicate _str_contains * pyright ignore --- pandas/core/arrays/_arrow_string_mixins.py | 15 +++++++++++++++ pandas/core/arrays/arrow/array.py | 15 --------------- pandas/core/arrays/string_arrow.py | 14 ++++---------- 3 files changed, 19 insertions(+), 25 deletions(-) diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index ba20111e0d858..5b34a7e2c7cef 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -190,3 +190,18 @@ def _str_istitle(self): def _str_isupper(self): result = pc.utf8_is_upper(self._pa_array) return self._convert_bool_result(result) + + def _str_contains( + self, pat, case: bool = True, flags: int = 0, na=None, regex: bool = True + ): + if flags: + raise NotImplementedError(f"contains not implemented with {flags=}") + + if regex: + pa_contains = pc.match_substring_regex + else: + pa_contains = pc.match_substring + result = pa_contains(self._pa_array, pat, ignore_case=not case) + if not isna(na): # pyright: ignore [reportGeneralTypeIssues] + result = result.fill_null(na) + return self._convert_bool_result(result) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 807854a13f285..40819ba4ab338 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -2322,21 +2322,6 @@ def _str_count(self, pat: str, flags: int = 0) -> Self: raise NotImplementedError(f"count not implemented with {flags=}") return type(self)(pc.count_substring_regex(self._pa_array, pat)) - def _str_contains( - self, pat, case: bool = True, flags: int = 0, na=None, regex: bool = True - ) -> Self: - if flags: - raise NotImplementedError(f"contains not implemented with {flags=}") - - if regex: - pa_contains = pc.match_substring_regex - else: - pa_contains = pc.match_substring - result = pa_contains(self._pa_array, pat, ignore_case=not case) - if not isna(na): - result = result.fill_null(na) - return type(self)(result) - def _result_converter(self, result): return type(self)(result) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 6dd0ca2de11ba..e18beb629d0c4 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -223,10 +223,8 @@ def insert(self, loc: int, item) -> ArrowStringArray: raise TypeError("Scalar must be NA or str") return super().insert(loc, item) - def _convert_bool_result(self, values, na=None): + def _convert_bool_result(self, values): if self.dtype.na_value is np.nan: - if not isna(na): - values = values.fill_null(bool(na)) return ArrowExtensionArray(values).to_numpy(na_value=np.nan) return BooleanDtype().__from_arrow__(values) @@ -304,11 +302,6 @@ def _str_contains( fallback_performancewarning() return super()._str_contains(pat, case, flags, na, regex) - if regex: - result = pc.match_substring_regex(self._pa_array, pat, ignore_case=not 
case) - else: - result = pc.match_substring(self._pa_array, pat, ignore_case=not case) - result = self._convert_bool_result(result, na=na) if not isna(na): if not isinstance(na, bool): # GH#59561 @@ -318,8 +311,9 @@ def _str_contains( FutureWarning, stacklevel=find_stack_level(), ) - result[isna(result)] = bool(na) - return result + na = bool(na) + + return ArrowStringArrayMixin._str_contains(self, pat, case, flags, na, regex) def _str_replace( self, From 3a4526516ae2e64cd9815e87c4c9e23c24b191e9 Mon Sep 17 00:00:00 2001 From: Manlai Amar <70603274+amanlai@users.noreply.github.com> Date: Thu, 5 Sep 2024 17:47:52 -0700 Subject: [PATCH 004/224] DOC: Fix some docstring validation errors #59698 (#59713) * fix some docstring errors * removed trailing whitespace * pd.Series.dt.microseconds has the same documentation as pd.TimedeltaIndex.microseconds and SA01 was cleared for both in the previous commit --- ci/code_checks.sh | 4 ---- pandas/_libs/tslibs/timedeltas.pyx | 13 ++++++++++--- pandas/core/arrays/timedeltas.py | 12 ++++++++++++ 3 files changed, 22 insertions(+), 7 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 0714c6f74f0c2..fcbeb20d083d6 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -92,7 +92,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Series.dt.day_name PR01,PR02" \ -i "pandas.Series.dt.floor PR01,PR02" \ -i "pandas.Series.dt.freq GL08" \ - -i "pandas.Series.dt.microseconds SA01" \ -i "pandas.Series.dt.month_name PR01,PR02" \ -i "pandas.Series.dt.nanoseconds SA01" \ -i "pandas.Series.dt.normalize PR01" \ @@ -113,12 +112,9 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Timedelta.max PR02" \ -i "pandas.Timedelta.min PR02" \ -i "pandas.Timedelta.resolution PR02" \ - -i "pandas.Timedelta.to_numpy PR01" \ -i "pandas.Timedelta.to_timedelta64 SA01" \ -i "pandas.Timedelta.total_seconds SA01" \ -i "pandas.Timedelta.view SA01" \ - -i "pandas.TimedeltaIndex.components SA01" \ - -i "pandas.TimedeltaIndex.microseconds SA01" \ -i "pandas.TimedeltaIndex.nanoseconds SA01" \ -i "pandas.TimedeltaIndex.seconds SA01" \ -i "pandas.TimedeltaIndex.to_pytimedelta RT03,SA01" \ diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index 36be1812b0187..a7bc2de5ad837 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -1421,9 +1421,16 @@ cdef class _Timedelta(timedelta): """ Convert the Timedelta to a NumPy timedelta64. - This is an alias method for `Timedelta.to_timedelta64()`. The dtype and - copy parameters are available here only for compatibility. Their values - will not affect the return value. + This is an alias method for `Timedelta.to_timedelta64()`. + + Parameters + ---------- + dtype : NoneType + It is available here only for compatibility. Its value will not + affect the return value. + copy : bool, default False + It is available here only for compatibility. Its value will not + affect the return value. Returns ------- diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index b2cfbe7338c0d..c8a86ffc187d0 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -876,6 +876,12 @@ def to_pytimedelta(self) -> npt.NDArray[np.object_]: microseconds_docstring = textwrap.dedent( """Number of microseconds (>= 0 and less than 1 second) for each element. + See Also + -------- + pd.Timedelta.microseconds : Number of microseconds (>= 0 and less than 1 second). 
+ pd.Timedelta.to_pytimedelta.microseconds : Number of microseconds (>= 0 and less + than 1 second) of a datetime.timedelta. + Examples -------- For Series: @@ -955,6 +961,12 @@ def components(self) -> DataFrame: ------- DataFrame + See Also + -------- + TimedeltaIndex.total_seconds : Return total duration expressed in seconds. + Timedelta.components : Return a components namedtuple-like of a single + timedelta. + Examples -------- >>> tdelta_idx = pd.to_timedelta(["1 day 3 min 2 us 42 ns"]) From 08431f17333a91f8191146646b2a136f91bfe7d2 Mon Sep 17 00:00:00 2001 From: Deepak Kapila Date: Fri, 6 Sep 2024 09:53:43 -0400 Subject: [PATCH 005/224] DOC: Clarify docs for df.to_sql (#59727) --- pandas/core/generic.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index bc47b662a08d3..42516f0a85e07 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2812,8 +2812,8 @@ def to_sql( `index` is True, then the index names are used. A sequence should be given if the DataFrame uses MultiIndex. chunksize : int, optional - Specify the number of rows in each batch to be written at a time. - By default, all rows will be written at once. + Specify the number of rows in each batch to be written to the database connection at a time. + By default, all rows will be written at once. Also see the method keyword. dtype : dict or scalar, optional Specifying the datatype for columns. If a dictionary is used, the keys should be the column names and the values should be the From 3f8d3e495a3a26f0be960ec70dee20e2411a4bb4 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 6 Sep 2024 08:06:15 -0700 Subject: [PATCH 006/224] BUG (string): ArrowStringArray.find corner cases (#59562) --- pandas/core/arrays/_arrow_string_mixins.py | 44 +++++++++++++++++++++- pandas/core/arrays/arrow/array.py | 23 ----------- pandas/core/arrays/string_arrow.py | 18 ++++----- pandas/tests/extension/test_arrow.py | 31 ++++++--------- 4 files changed, 61 insertions(+), 55 deletions(-) diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index 5b34a7e2c7cef..950d4cd7cc92e 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -3,6 +3,7 @@ from functools import partial from typing import ( TYPE_CHECKING, + Any, Literal, ) @@ -10,6 +11,7 @@ from pandas.compat import ( pa_version_under10p1, + pa_version_under13p0, pa_version_under17p0, ) @@ -20,7 +22,10 @@ import pyarrow.compute as pc if TYPE_CHECKING: - from collections.abc import Sized + from collections.abc import ( + Callable, + Sized, + ) from pandas._typing import ( Scalar, @@ -42,6 +47,9 @@ def _convert_int_result(self, result): # Convert an integer-dtype result to the appropriate result type raise NotImplementedError + def _apply_elementwise(self, func: Callable) -> list[list[Any]]: + raise NotImplementedError + def _str_pad( self, width: int, @@ -205,3 +213,37 @@ def _str_contains( if not isna(na): # pyright: ignore [reportGeneralTypeIssues] result = result.fill_null(na) return self._convert_bool_result(result) + + def _str_find(self, sub: str, start: int = 0, end: int | None = None): + if ( + pa_version_under13p0 + and not (start != 0 and end is not None) + and not (start == 0 and end is None) + ): + # GH#59562 + res_list = self._apply_elementwise(lambda val: val.find(sub, start, end)) + return self._convert_int_result(pa.chunked_array(res_list)) + + if (start == 0 or start is None) and end is None: + 
result = pc.find_substring(self._pa_array, sub) + else: + if sub == "": + # GH#56792 + res_list = self._apply_elementwise( + lambda val: val.find(sub, start, end) + ) + return self._convert_int_result(pa.chunked_array(res_list)) + if start is None: + start_offset = 0 + start = 0 + elif start < 0: + start_offset = pc.add(start, pc.utf8_length(self._pa_array)) + start_offset = pc.if_else(pc.less(start_offset, 0), 0, start_offset) + else: + start_offset = start + slices = pc.utf8_slice_codeunits(self._pa_array, start, stop=end) + result = pc.find_substring(slices, sub) + found = pc.not_equal(result, pa.scalar(-1, type=result.type)) + offset_result = pc.add(result, start_offset) + result = pc.if_else(found, offset_result, -1) + return self._convert_int_result(result) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 40819ba4ab338..15f9ba611a642 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -2373,29 +2373,6 @@ def _str_fullmatch( pat = f"{pat}$" return self._str_match(pat, case, flags, na) - def _str_find(self, sub: str, start: int = 0, end: int | None = None) -> Self: - if (start == 0 or start is None) and end is None: - result = pc.find_substring(self._pa_array, sub) - else: - if sub == "": - # GH 56792 - result = self._apply_elementwise(lambda val: val.find(sub, start, end)) - return type(self)(pa.chunked_array(result)) - if start is None: - start_offset = 0 - start = 0 - elif start < 0: - start_offset = pc.add(start, pc.utf8_length(self._pa_array)) - start_offset = pc.if_else(pc.less(start_offset, 0), 0, start_offset) - else: - start_offset = start - slices = pc.utf8_slice_codeunits(self._pa_array, start, stop=end) - result = pc.find_substring(slices, sub) - found = pc.not_equal(result, pa.scalar(-1, type=result.type)) - offset_result = pc.add(result, start_offset) - result = pc.if_else(found, offset_result, -1) - return type(self)(result) - def _str_join(self, sep: str) -> Self: if pa.types.is_string(self._pa_array.type) or pa.types.is_large_string( self._pa_array.type diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index e18beb629d0c4..97381b82ceab9 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -416,18 +416,14 @@ def _str_count(self, pat: str, flags: int = 0): return self._convert_int_result(result) def _str_find(self, sub: str, start: int = 0, end: int | None = None): - if start != 0 and end is not None: - slices = pc.utf8_slice_codeunits(self._pa_array, start, stop=end) - result = pc.find_substring(slices, sub) - not_found = pc.equal(result, -1) - offset_result = pc.add(result, end - start) - result = pc.if_else(not_found, result, offset_result) - elif start == 0 and end is None: - slices = self._pa_array - result = pc.find_substring(slices, sub) - else: + if ( + pa_version_under13p0 + and not (start != 0 and end is not None) + and not (start == 0 and end is None) + ): + # GH#59562 return super()._str_find(sub, start, end) - return self._convert_int_result(result) + return ArrowStringArrayMixin._str_find(self, sub, start, end) def _str_get_dummies(self, sep: str = "|"): dummies_pa, labels = ArrowExtensionArray(self._pa_array)._str_get_dummies(sep) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 3dbdda388d035..fc4f14882b9d7 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -32,8 +32,6 @@ import numpy as np import pytest -from 
pandas._config import using_string_dtype - from pandas._libs import lib from pandas._libs.tslibs import timezones from pandas.compat import ( @@ -1947,14 +1945,9 @@ def test_str_find_negative_start(): def test_str_find_no_end(): ser = pd.Series(["abc", None], dtype=ArrowDtype(pa.string())) - if pa_version_under13p0: - # https://github.com/apache/arrow/issues/36311 - with pytest.raises(pa.lib.ArrowInvalid, match="Negative buffer resize"): - ser.str.find("ab", start=1) - else: - result = ser.str.find("ab", start=1) - expected = pd.Series([-1, None], dtype="int64[pyarrow]") - tm.assert_series_equal(result, expected) + result = ser.str.find("ab", start=1) + expected = pd.Series([-1, None], dtype="int64[pyarrow]") + tm.assert_series_equal(result, expected) def test_str_find_negative_start_negative_end(): @@ -1968,17 +1961,11 @@ def test_str_find_negative_start_negative_end(): def test_str_find_large_start(): # GH 56791 ser = pd.Series(["abcdefg", None], dtype=ArrowDtype(pa.string())) - if pa_version_under13p0: - # https://github.com/apache/arrow/issues/36311 - with pytest.raises(pa.lib.ArrowInvalid, match="Negative buffer resize"): - ser.str.find(sub="d", start=16) - else: - result = ser.str.find(sub="d", start=16) - expected = pd.Series([-1, None], dtype=ArrowDtype(pa.int64())) - tm.assert_series_equal(result, expected) + result = ser.str.find(sub="d", start=16) + expected = pd.Series([-1, None], dtype=ArrowDtype(pa.int64())) + tm.assert_series_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.skipif( pa_version_under13p0, reason="https://github.com/apache/arrow/issues/36311" ) @@ -1990,11 +1977,15 @@ def test_str_find_e2e(start, end, sub): ["abcaadef", "abc", "abcdeddefgj8292", "ab", "a", ""], dtype=ArrowDtype(pa.string()), ) - object_series = s.astype(pd.StringDtype()) + object_series = s.astype(pd.StringDtype(storage="python")) result = s.str.find(sub, start, end) expected = object_series.str.find(sub, start, end).astype(result.dtype) tm.assert_series_equal(result, expected) + arrow_str_series = s.astype(pd.StringDtype(storage="pyarrow")) + result2 = arrow_str_series.str.find(sub, start, end).astype(result.dtype) + tm.assert_series_equal(result2, expected) + def test_str_find_negative_start_negative_end_no_match(): # GH 56791 From 38ccb331b15dd301a85b3413673ae144498d4c1f Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Fri, 6 Sep 2024 23:06:00 +0530 Subject: [PATCH 007/224] DOC: fix SA01 for pandas.Period.to_timestamp (#59730) --- ci/code_checks.sh | 1 - pandas/_libs/tslibs/period.pyx | 6 ++++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index fcbeb20d083d6..2fc9c1a83c097 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -73,7 +73,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.NA SA01" \ -i "pandas.Period.freq GL08" \ -i "pandas.Period.ordinal GL08" \ - -i "pandas.Period.to_timestamp SA01" \ -i "pandas.PeriodDtype.freq SA01" \ -i "pandas.RangeIndex.from_range PR01,SA01" \ -i "pandas.RangeIndex.start SA01" \ diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index e4771feeb804e..c563ab91c4142 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -2001,6 +2001,12 @@ cdef class _Period(PeriodMixin): ------- Timestamp + See Also + -------- + Timestamp : A class representing a single point in time. + Period : Represents a span of time with a fixed frequency. 
+ PeriodIndex.to_timestamp : Convert a `PeriodIndex` to a `DatetimeIndex`. + Examples -------- >>> period = pd.Period('2023-1-1', freq='D') From 8cd761a2d1553d7dfa986f3c574f03f2fc62587e Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Fri, 6 Sep 2024 23:06:50 +0530 Subject: [PATCH 008/224] DOC: fix SA01,ES01 for pandas.Timedelta.view (#59733) --- ci/code_checks.sh | 1 - pandas/_libs/tslibs/timedeltas.pyx | 15 +++++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 2fc9c1a83c097..7ed5103b3b796 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -113,7 +113,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Timedelta.resolution PR02" \ -i "pandas.Timedelta.to_timedelta64 SA01" \ -i "pandas.Timedelta.total_seconds SA01" \ - -i "pandas.Timedelta.view SA01" \ -i "pandas.TimedeltaIndex.nanoseconds SA01" \ -i "pandas.TimedeltaIndex.seconds SA01" \ -i "pandas.TimedeltaIndex.to_pytimedelta RT03,SA01" \ diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index a7bc2de5ad837..4f90f26cf31ab 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -1458,11 +1458,26 @@ cdef class _Timedelta(timedelta): """ Array view compatibility. + This method allows you to reinterpret the underlying data of a Timedelta + object as a different dtype. The `view` method provides a way to reinterpret + the internal representation of the `Timedelta` object without modifying its + data. This is particularly useful when you need to work with the underlying + data directly, such as for performance optimizations or interfacing with + low-level APIs. The returned value is typically the number of nanoseconds + since the epoch, represented as an integer or another specified dtype. + Parameters ---------- dtype : str or dtype The dtype to view the underlying data as. + See Also + -------- + numpy.ndarray.view : Returns a view of an array with the same data. + Timedelta.to_numpy : Converts the Timedelta to a NumPy timedelta64. + Timedelta.total_seconds : Returns the total duration of the Timedelta + object in seconds. 
+ Examples -------- >>> td = pd.Timedelta('3D') From 4a16b44bfabc70854f1d3a1447e7050725ff16d9 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 6 Sep 2024 19:37:42 +0200 Subject: [PATCH 009/224] String dtype: implement _get_common_dtype (#59682) * String dtype: implement _get_common_dtype * add specific tests * try fix typing * try fix typing * suppress typing error * support numpy 2.0 string * fix typo --- pandas/core/arrays/string_.py | 32 ++++++++- pandas/tests/arrays/categorical/test_api.py | 3 - pandas/tests/arrays/string_/test_concat.py | 73 +++++++++++++++++++++ 3 files changed, 103 insertions(+), 5 deletions(-) create mode 100644 pandas/tests/arrays/string_/test_concat.py diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 143a13c54dbbb..88fd1481031f8 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -171,9 +171,9 @@ def __init__( # a consistent NaN value (and we can use `dtype.na_value is np.nan`) na_value = np.nan elif na_value is not libmissing.NA: - raise ValueError("'na_value' must be np.nan or pd.NA, got {na_value}") + raise ValueError(f"'na_value' must be np.nan or pd.NA, got {na_value}") - self.storage = storage + self.storage = cast(str, storage) self._na_value = na_value def __repr__(self) -> str: @@ -284,6 +284,34 @@ def construct_array_type( # type: ignore[override] else: return ArrowStringArrayNumpySemantics + def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None: + storages = set() + na_values = set() + + for dtype in dtypes: + if isinstance(dtype, StringDtype): + storages.add(dtype.storage) + na_values.add(dtype.na_value) + elif isinstance(dtype, np.dtype) and dtype.kind in ("U", "T"): + continue + else: + return None + + if len(storages) == 2: + # if both python and pyarrow storage -> priority to pyarrow + storage = "pyarrow" + else: + storage = next(iter(storages)) # type: ignore[assignment] + + na_value: libmissing.NAType | float + if len(na_values) == 2: + # if both NaN and NA -> priority to NA + na_value = libmissing.NA + else: + na_value = next(iter(na_values)) + + return StringDtype(storage=storage, na_value=na_value) + def __from_arrow__( self, array: pyarrow.Array | pyarrow.ChunkedArray ) -> BaseStringArray: diff --git a/pandas/tests/arrays/categorical/test_api.py b/pandas/tests/arrays/categorical/test_api.py index 2ccc5781c608e..2791fd55f54d7 100644 --- a/pandas/tests/arrays/categorical/test_api.py +++ b/pandas/tests/arrays/categorical/test_api.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.compat import PY311 from pandas import ( @@ -151,7 +149,6 @@ def test_reorder_categories_raises(self, new_categories): with pytest.raises(ValueError, match=msg): cat.reorder_categories(new_categories) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_add_categories(self): cat = Categorical(["a", "b", "c", "a"], ordered=True) old = cat.copy() diff --git a/pandas/tests/arrays/string_/test_concat.py b/pandas/tests/arrays/string_/test_concat.py new file mode 100644 index 0000000000000..320d700b2b6c3 --- /dev/null +++ b/pandas/tests/arrays/string_/test_concat.py @@ -0,0 +1,73 @@ +import numpy as np +import pytest + +from pandas.compat import HAS_PYARROW + +from pandas.core.dtypes.cast import find_common_type + +import pandas as pd +import pandas._testing as tm +from pandas.util.version import Version + + +@pytest.mark.parametrize( + "to_concat_dtypes, result_dtype", + [ + # same types + 
([("pyarrow", pd.NA), ("pyarrow", pd.NA)], ("pyarrow", pd.NA)), + ([("pyarrow", np.nan), ("pyarrow", np.nan)], ("pyarrow", np.nan)), + ([("python", pd.NA), ("python", pd.NA)], ("python", pd.NA)), + ([("python", np.nan), ("python", np.nan)], ("python", np.nan)), + # pyarrow preference + ([("pyarrow", pd.NA), ("python", pd.NA)], ("pyarrow", pd.NA)), + # NA preference + ([("python", pd.NA), ("python", np.nan)], ("python", pd.NA)), + ], +) +def test_concat_series(request, to_concat_dtypes, result_dtype): + if any(storage == "pyarrow" for storage, _ in to_concat_dtypes) and not HAS_PYARROW: + pytest.skip("Could not import 'pyarrow'") + + ser_list = [ + pd.Series(["a", "b", None], dtype=pd.StringDtype(storage, na_value)) + for storage, na_value in to_concat_dtypes + ] + + result = pd.concat(ser_list, ignore_index=True) + expected = pd.Series( + ["a", "b", None, "a", "b", None], dtype=pd.StringDtype(*result_dtype) + ) + tm.assert_series_equal(result, expected) + + # order doesn't matter for result + result = pd.concat(ser_list[::1], ignore_index=True) + tm.assert_series_equal(result, expected) + + +def test_concat_with_object(string_dtype_arguments): + # _get_common_dtype cannot inspect values, so object dtype with strings still + # results in object dtype + result = pd.concat( + [ + pd.Series(["a", "b", None], dtype=pd.StringDtype(*string_dtype_arguments)), + pd.Series(["a", "b", None], dtype=object), + ] + ) + assert result.dtype == np.dtype("object") + + +def test_concat_with_numpy(string_dtype_arguments): + # common type with a numpy string dtype always preserves the pandas string dtype + dtype = pd.StringDtype(*string_dtype_arguments) + assert find_common_type([dtype, np.dtype("U")]) == dtype + assert find_common_type([np.dtype("U"), dtype]) == dtype + assert find_common_type([dtype, np.dtype("U10")]) == dtype + assert find_common_type([np.dtype("U10"), dtype]) == dtype + + # with any other numpy dtype -> object + assert find_common_type([dtype, np.dtype("S")]) == np.dtype("object") + assert find_common_type([dtype, np.dtype("int64")]) == np.dtype("object") + + if Version(np.__version__) >= Version("2"): + assert find_common_type([dtype, np.dtypes.StringDType()]) == dtype + assert find_common_type([np.dtypes.StringDType(), dtype]) == dtype From 5a3a4f350440cd215efae034c506cbead6a1ad9e Mon Sep 17 00:00:00 2001 From: Natalia Mokeeva <91160475+natmokval@users.noreply.github.com> Date: Fri, 6 Sep 2024 20:13:05 +0200 Subject: [PATCH 010/224] DOC: move `idxmin` and `idxmax` docs from core/shared_docs.py to core/frame.py (#59735) move idxmin, idxmax docstring from shared_docs.py to frame.py --- pandas/core/frame.py | 144 ++++++++++++++++++++++++++++++++++++- pandas/core/shared_docs.py | 130 --------------------------------- 2 files changed, 142 insertions(+), 132 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index f47acf579d79c..fe88cb86693e8 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -12745,10 +12745,80 @@ def nunique(self, axis: Axis = 0, dropna: bool = True) -> Series: """ return self.apply(Series.nunique, axis=axis, dropna=dropna) - @doc(_shared_docs["idxmin"], numeric_only_default="False") def idxmin( self, axis: Axis = 0, skipna: bool = True, numeric_only: bool = False ) -> Series: + """ + Return index of first occurrence of minimum over requested axis. + + NA/null values are excluded. + + Parameters + ---------- + axis : {{0 or 'index', 1 or 'columns'}}, default 0 + The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise. 
+        skipna : bool, default True
+            Exclude NA/null values. If the entire DataFrame is NA,
+            or if ``skipna=False`` and there is an NA value, this method
+            will raise a ``ValueError``.
+        numeric_only : bool, default False
+            Include only `float`, `int` or `boolean` data.
+
+            .. versionadded:: 1.5.0
+
+        Returns
+        -------
+        Series
+            Indexes of minima along the specified axis.
+
+        Raises
+        ------
+        ValueError
+            * If the row/column is empty
+
+        See Also
+        --------
+        Series.idxmin : Return index of the minimum element.
+
+        Notes
+        -----
+        This method is the DataFrame version of ``ndarray.argmin``.
+
+        Examples
+        --------
+        Consider a dataset containing food consumption in Argentina.
+
+        >>> df = pd.DataFrame(
+        ...     {
+        ...         "consumption": [10.51, 103.11, 55.48],
+        ...         "co2_emissions": [37.2, 19.66, 1712],
+        ...     },
+        ...     index=["Pork", "Wheat Products", "Beef"],
+        ... )
+
+        >>> df
+                        consumption  co2_emissions
+        Pork                  10.51          37.20
+        Wheat Products       103.11          19.66
+        Beef                  55.48        1712.00
+
+        By default, it returns the index for the minimum value in each column.
+
+        >>> df.idxmin()
+        consumption                Pork
+        co2_emissions    Wheat Products
+        dtype: object
+
+        To return the index for the minimum value in each row, use ``axis="columns"``.
+
+        >>> df.idxmin(axis="columns")
+        Pork                consumption
+        Wheat Products    co2_emissions
+        Beef                consumption
+        dtype: object
+        """
         axis = self._get_axis_number(axis)

         if self.empty and len(self.axes[axis]):
@@ -12782,10 +12850,78 @@ def idxmin(
         final_result = data._constructor_sliced(result, index=data._get_agg_axis(axis))
         return final_result.__finalize__(self, method="idxmin")

-    @doc(_shared_docs["idxmax"], numeric_only_default="False")
     def idxmax(
         self, axis: Axis = 0, skipna: bool = True, numeric_only: bool = False
     ) -> Series:
+        """
+        Return index of first occurrence of maximum over requested axis.
+
+        NA/null values are excluded.
+
+        Parameters
+        ----------
+        axis : {0 or 'index', 1 or 'columns'}, default 0
+            The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise.
+        skipna : bool, default True
+            Exclude NA/null values. If the entire DataFrame is NA,
+            or if ``skipna=False`` and there is an NA value, this method
+            will raise a ``ValueError``.
+        numeric_only : bool, default False
+            Include only `float`, `int` or `boolean` data.
+
+            .. versionadded:: 1.5.0
+
+        Returns
+        -------
+        Series
+            Indexes of maxima along the specified axis.
+
+        Raises
+        ------
+        ValueError
+            * If the row/column is empty
+
+        See Also
+        --------
+        Series.idxmax : Return index of the maximum element.
+
+        Notes
+        -----
+        This method is the DataFrame version of ``ndarray.argmax``.
+
+        Examples
+        --------
+        Consider a dataset containing food consumption in Argentina.
+
+        >>> df = pd.DataFrame(
+        ...     {
+        ...         "consumption": [10.51, 103.11, 55.48],
+        ...         "co2_emissions": [37.2, 19.66, 1712],
+        ...     },
+        ...     index=["Pork", "Wheat Products", "Beef"],
+        ... )
+
+        >>> df
+                        consumption  co2_emissions
+        Pork                  10.51          37.20
+        Wheat Products       103.11          19.66
+        Beef                  55.48        1712.00
+
+        By default, it returns the index for the maximum value in each column.
+
+        >>> df.idxmax()
+        consumption     Wheat Products
+        co2_emissions             Beef
+        dtype: object
+
+        To return the index for the maximum value in each row, use ``axis="columns"``.
+ + >>> df.idxmax(axis="columns") + Pork co2_emissions + Wheat Products consumption + Beef co2_emissions + dtype: object + """ axis = self._get_axis_number(axis) if self.empty and len(self.axes[axis]): diff --git a/pandas/core/shared_docs.py b/pandas/core/shared_docs.py index 5725b96f66cd4..cb0c3d241534c 100644 --- a/pandas/core/shared_docs.py +++ b/pandas/core/shared_docs.py @@ -649,133 +649,3 @@ 3 3 d e 4 4 e e """ - -_shared_docs["idxmin"] = """ - Return index of first occurrence of minimum over requested axis. - - NA/null values are excluded. - - Parameters - ---------- - axis : {{0 or 'index', 1 or 'columns'}}, default 0 - The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise. - skipna : bool, default True - Exclude NA/null values. If the entire Series is NA, or if ``skipna=False`` - and there is an NA value, this method will raise a ``ValueError``. - numeric_only : bool, default {numeric_only_default} - Include only `float`, `int` or `boolean` data. - - .. versionadded:: 1.5.0 - - Returns - ------- - Series - Indexes of minima along the specified axis. - - Raises - ------ - ValueError - * If the row/column is empty - - See Also - -------- - Series.idxmin : Return index of the minimum element. - - Notes - ----- - This method is the DataFrame version of ``ndarray.argmin``. - - Examples - -------- - Consider a dataset containing food consumption in Argentina. - - >>> df = pd.DataFrame({{'consumption': [10.51, 103.11, 55.48], - ... 'co2_emissions': [37.2, 19.66, 1712]}}, - ... index=['Pork', 'Wheat Products', 'Beef']) - - >>> df - consumption co2_emissions - Pork 10.51 37.20 - Wheat Products 103.11 19.66 - Beef 55.48 1712.00 - - By default, it returns the index for the minimum value in each column. - - >>> df.idxmin() - consumption Pork - co2_emissions Wheat Products - dtype: object - - To return the index for the minimum value in each row, use ``axis="columns"``. - - >>> df.idxmin(axis="columns") - Pork consumption - Wheat Products co2_emissions - Beef consumption - dtype: object -""" - -_shared_docs["idxmax"] = """ - Return index of first occurrence of maximum over requested axis. - - NA/null values are excluded. - - Parameters - ---------- - axis : {{0 or 'index', 1 or 'columns'}}, default 0 - The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise. - skipna : bool, default True - Exclude NA/null values. If the entire Series is NA, or if ``skipna=False`` - and there is an NA value, this method will raise a ``ValueError``. - numeric_only : bool, default {numeric_only_default} - Include only `float`, `int` or `boolean` data. - - .. versionadded:: 1.5.0 - - Returns - ------- - Series - Indexes of maxima along the specified axis. - - Raises - ------ - ValueError - * If the row/column is empty - - See Also - -------- - Series.idxmax : Return index of the maximum element. - - Notes - ----- - This method is the DataFrame version of ``ndarray.argmax``. - - Examples - -------- - Consider a dataset containing food consumption in Argentina. - - >>> df = pd.DataFrame({{'consumption': [10.51, 103.11, 55.48], - ... 'co2_emissions': [37.2, 19.66, 1712]}}, - ... index=['Pork', 'Wheat Products', 'Beef']) - - >>> df - consumption co2_emissions - Pork 10.51 37.20 - Wheat Products 103.11 19.66 - Beef 55.48 1712.00 - - By default, it returns the index for the maximum value in each column. - - >>> df.idxmax() - consumption Wheat Products - co2_emissions Beef - dtype: object - - To return the index for the maximum value in each row, use ``axis="columns"``. 
- - >>> df.idxmax(axis="columns") - Pork co2_emissions - Wheat Products consumption - Beef co2_emissions - dtype: object -""" From 5a07ed5a8e1522886b177bcce21568ecbfe63410 Mon Sep 17 00:00:00 2001 From: ammar-qazi Date: Fri, 6 Sep 2024 20:14:09 +0200 Subject: [PATCH 011/224] Resolves #59670 by documenting that DataFrame.from_records()'s columns filters (includes) data. (#59723) Update frames.py to factor in explain columns reordering --- pandas/core/frame.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index fe88cb86693e8..97df71e2c02a0 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2124,9 +2124,10 @@ def from_records( columns : sequence, default None Column names to use. If the passed data do not have names associated with them, this argument provides names for the - columns. Otherwise this argument indicates the order of the columns + columns. Otherwise, this argument indicates the order of the columns in the result (any names not found in the data will become all-NA - columns). + columns) and limits the data to these columns if not all column names + are provided. coerce_float : bool, default False Attempt to convert values of non-string, non-numeric objects (like decimal.Decimal) to floating point, useful for SQL result sets. From 352289b3b6e282fcf36d7634a45a5b93839be8fa Mon Sep 17 00:00:00 2001 From: Florian Bourgey Date: Fri, 6 Sep 2024 14:15:31 -0400 Subject: [PATCH 012/224] Missing source link (#59549) * merged DataFrame.index and DataFrame.columns with other Axes section. * small clean for DataFrame.columns * reverted frame.rst file --- pandas/core/frame.py | 43 +++++++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 97df71e2c02a0..c80e9dfd23ba2 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -13629,26 +13629,29 @@ def isin_(x): ) columns = properties.AxisProperty( axis=0, - doc=dedent( - """ - The column labels of the DataFrame. - - See Also - -------- - DataFrame.index: The index (row labels) of the DataFrame. - DataFrame.axes: Return a list representing the axes of the DataFrame. - - Examples - -------- - >>> df = pd.DataFrame({'A': [1, 2], 'B': [3, 4]}) - >>> df - A B - 0 1 3 - 1 2 4 - >>> df.columns - Index(['A', 'B'], dtype='object') - """ - ), + doc=""" + The column labels of the DataFrame. + + Returns + ------- + pandas.Index + The column labels of the DataFrame. + + See Also + -------- + DataFrame.index: The index (row labels) of the DataFrame. + DataFrame.axes: Return a list representing the axes of the DataFrame. 
+ + Examples + -------- + >>> df = pd.DataFrame({'A': [1, 2], 'B': [3, 4]}) + >>> df + A B + 0 1 3 + 1 2 4 + >>> df.columns + Index(['A', 'B'], dtype='object') + """, ) # ---------------------------------------------------------------------- From 80b685027108245086b78dbd9a176b096c92570a Mon Sep 17 00:00:00 2001 From: matiaslindgren Date: Sat, 7 Sep 2024 13:53:28 +0200 Subject: [PATCH 013/224] BUG: Fix inconsistent pivot table subaggregation when index is None (#59629) --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/reshape/pivot.py | 11 +++++++---- pandas/tests/reshape/test_pivot.py | 28 ++++++++++++++++++++++++++++ 3 files changed, 36 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index cd353b60d1a6e..9a29ff4d49966 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -668,6 +668,7 @@ Reshaping - Bug in :meth:`DataFrame.join` when a :class:`DataFrame` with a :class:`MultiIndex` would raise an ``AssertionError`` when :attr:`MultiIndex.names` contained ``None``. (:issue:`58721`) - Bug in :meth:`DataFrame.merge` where merging on a column containing only ``NaN`` values resulted in an out-of-bounds array access (:issue:`59421`) - Bug in :meth:`DataFrame.unstack` producing incorrect results when ``sort=False`` (:issue:`54987`, :issue:`55516`) +- Bug in :meth:`DataFrame.pivot_table` incorrectly subaggregating results when called without an ``index`` argument (:issue:`58722`) - Bug in :meth:`DataFrame.unstack` producing incorrect results when manipulating empty :class:`DataFrame` with an :class:`ExtentionDtype` (:issue:`59123`) Sparse diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 0886aad310034..cfc6f91557781 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -557,7 +557,12 @@ def _all_key(key): table_pieces.append(piece) margin_keys.append(all_key) else: - from pandas import DataFrame + margin = ( + data[cols[:1] + values] + .groupby(cols[:1], observed=observed) + .agg(aggfunc, **kwargs) + .T + ) cat_axis = 0 for key, piece in table.groupby(level=0, observed=observed): @@ -566,9 +571,7 @@ def _all_key(key): else: all_key = margins_name table_pieces.append(piece) - # GH31016 this is to calculate margin for each group, and assign - # corresponded key as index - transformed_piece = DataFrame(piece.apply(aggfunc, **kwargs)).T + transformed_piece = margin[key].to_frame().T if isinstance(piece.index, MultiIndex): # We are adding an empty level transformed_piece.index = MultiIndex.from_tuples( diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 44b96afaa4ef5..8cfe565ebdd65 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -2785,3 +2785,31 @@ def test_pivot_empty_with_datetime(self): index="category", columns="value", values="timestamp" ) assert df_pivoted.empty + + def test_pivot_margins_with_none_index(self): + # GH#58722 + df = DataFrame( + { + "x": [1, 1, 2], + "y": [3, 3, 4], + "z": [5, 5, 6], + "w": [7, 8, 9], + } + ) + result = df.pivot_table( + index=None, + columns=["y", "z"], + values="w", + margins=True, + aggfunc="count", + ) + expected = DataFrame( + [[2, 2, 1, 1]], + index=["w"], + columns=MultiIndex( + levels=[[3, 4], [5, 6, "All"]], + codes=[[0, 0, 1, 1], [0, 2, 1, 2]], + names=["y", "z"], + ), + ) + tm.assert_frame_equal(result, expected) From 13f45e70989625850dda374c5588d4beb54bd48c Mon Sep 17 00:00:00 2001 From: Patrick Hoefler 
<61934744+phofl@users.noreply.github.com> Date: Mon, 9 Sep 2024 05:53:48 -0500 Subject: [PATCH 014/224] TST/BUG (string dtype): Fix and adjust indexes string tests (#59544) Co-authored-by: Joris Van den Bossche --- pandas/core/construction.py | 5 +++- pandas/core/indexes/base.py | 6 ++++- .../tests/indexes/base_class/test_setops.py | 6 ++--- pandas/tests/indexes/test_base.py | 11 ++------ pandas/tests/indexes/test_old_base.py | 26 ++++++++----------- 5 files changed, 24 insertions(+), 30 deletions(-) diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 665eb75953078..bb3aa3867ab08 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -611,7 +611,10 @@ def sanitize_array( dtype = StringDtype(na_value=np.nan) subarr = dtype.construct_array_type()._from_sequence(data, dtype=dtype) - if subarr is data and copy: + if ( + subarr is data + or (subarr.dtype == "str" and subarr.dtype.storage == "python") # type: ignore[union-attr] + ) and copy: subarr = subarr.copy() else: diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 582e1f96fa562..2346c20004210 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -504,7 +504,8 @@ def __new__( elif is_ea_or_datetimelike_dtype(dtype): # non-EA dtype indexes have special casting logic, so we punt here - pass + if isinstance(data, (set, frozenset)): + data = list(data) elif is_ea_or_datetimelike_dtype(data_dtype): pass @@ -6877,6 +6878,9 @@ def insert(self, loc: int, item) -> Index: # We cannot keep the same dtype, so cast to the (often object) # minimal shared dtype before doing the insert. dtype = self._find_common_type_compat(item) + if dtype == self.dtype: + # EA's might run into recursion errors if loc is invalid + raise return self.astype(dtype).insert(loc, item) if arr.dtype != object or not isinstance( diff --git a/pandas/tests/indexes/base_class/test_setops.py b/pandas/tests/indexes/base_class/test_setops.py index f9636ec19f2ec..0e9fb77d6e8dd 100644 --- a/pandas/tests/indexes/base_class/test_setops.py +++ b/pandas/tests/indexes/base_class/test_setops.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd from pandas import ( Index, @@ -233,7 +231,6 @@ def test_tuple_union_bug(self, method, expected, sort): expected = Index(expected) tm.assert_index_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("first_list", [["b", "a"], []]) @pytest.mark.parametrize("second_list", [["a", "b"], []]) @pytest.mark.parametrize( @@ -243,6 +240,7 @@ def test_tuple_union_bug(self, method, expected, sort): def test_union_name_preservation( self, first_list, second_list, first_name, second_name, expected_name, sort ): + expected_dtype = object if not first_list or not second_list else "str" first = Index(first_list, name=first_name) second = Index(second_list, name=second_name) union = first.union(second, sort=sort) @@ -253,7 +251,7 @@ def test_union_name_preservation( expected = Index(sorted(vals), name=expected_name) tm.assert_index_equal(union, expected) else: - expected = Index(vals, name=expected_name) + expected = Index(vals, name=expected_name, dtype=expected_dtype) tm.assert_index_equal(union.sort_values(), expected.sort_values()) @pytest.mark.parametrize( diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 7ec66100b7291..486b24845d2ff 100644 --- a/pandas/tests/indexes/test_base.py +++ 
b/pandas/tests/indexes/test_base.py @@ -76,9 +76,6 @@ def test_constructor_casting(self, index): tm.assert_contains_all(arr, new_index) tm.assert_index_equal(index, new_index) - @pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" - ) def test_constructor_copy(self, using_infer_string): index = Index(list("abc"), name="name") arr = np.array(index) @@ -343,11 +340,6 @@ def test_constructor_empty_special(self, empty, klass): def test_view_with_args(self, index): index.view("i8") - @pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, - reason="TODO(infer_string)", - strict=False, - ) @pytest.mark.parametrize( "index", [ @@ -364,7 +356,8 @@ def test_view_with_args_object_array_raises(self, index): msg = "When changing to a larger dtype" with pytest.raises(ValueError, match=msg): index.view("i8") - elif index.dtype == "string": + elif index.dtype == "str" and not index.dtype.storage == "python": + # TODO(infer_string): Make the errors consistent with pytest.raises(NotImplementedError, match="i8"): index.view("i8") else: diff --git a/pandas/tests/indexes/test_old_base.py b/pandas/tests/indexes/test_old_base.py index b41871ee921fd..75284a8f8fd47 100644 --- a/pandas/tests/indexes/test_old_base.py +++ b/pandas/tests/indexes/test_old_base.py @@ -6,10 +6,7 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas._libs.tslibs import Timestamp -from pandas.compat import HAS_PYARROW from pandas.core.dtypes.common import ( is_integer_dtype, @@ -28,6 +25,7 @@ PeriodIndex, RangeIndex, Series, + StringDtype, TimedeltaIndex, isna, period_range, @@ -229,7 +227,6 @@ def test_logical_compat(self, simple_index): with pytest.raises(TypeError, match=msg): idx.any() - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_repr_roundtrip(self, simple_index): if isinstance(simple_index, IntervalIndex): pytest.skip(f"Not a valid repr for {type(simple_index).__name__}") @@ -246,11 +243,6 @@ def test_repr_max_seq_item_setting(self, simple_index): repr(idx) assert "..." 
not in str(idx) - @pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, - reason="TODO(infer_string)", - strict=False, - ) @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") def test_ensure_copied_data(self, index): # Check the "copy" argument of each Index.__new__ is honoured @@ -296,7 +288,9 @@ def test_ensure_copied_data(self, index): tm.assert_numpy_array_equal( index._values._mask, result._values._mask, check_same="same" ) - elif index.dtype == "string[python]": + elif ( + isinstance(index.dtype, StringDtype) and index.dtype.storage == "python" + ): assert np.shares_memory(index._values._ndarray, result._values._ndarray) tm.assert_numpy_array_equal( index._values._ndarray, result._values._ndarray, check_same="same" @@ -444,11 +438,7 @@ def test_insert_base(self, index): result = trimmed.insert(0, index[0]) assert index[0:4].equals(result) - @pytest.mark.skipif( - using_string_dtype(), - reason="completely different behavior, tested elsewher", - ) - def test_insert_out_of_bounds(self, index): + def test_insert_out_of_bounds(self, index, using_infer_string): # TypeError/IndexError matches what np.insert raises in these cases if len(index) > 0: @@ -460,6 +450,12 @@ def test_insert_out_of_bounds(self, index): msg = "index (0|0.5) is out of bounds for axis 0 with size 0" else: msg = "slice indices must be integers or None or have an __index__ method" + + if using_infer_string and ( + index.dtype == "string" or index.dtype == "category" # noqa: PLR1714 + ): + msg = "loc must be an integer between" + with pytest.raises(err, match=msg): index.insert(0.5, "foo") From b7dedf56ad529a2b18f17ae621a69644867c69c7 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Mon, 9 Sep 2024 06:40:22 -0500 Subject: [PATCH 015/224] TST (string dtype): Adjust indexing string tests (#59541) Co-authored-by: Joris Van den Bossche --- pandas/core/arrays/string_.py | 4 ++ pandas/core/arrays/string_arrow.py | 2 +- pandas/tests/arrays/string_/test_string.py | 5 +-- pandas/tests/indexing/test_iloc.py | 31 +++++++------- pandas/tests/indexing/test_indexing.py | 18 ++++---- pandas/tests/indexing/test_loc.py | 48 +++++++++++++--------- 6 files changed, 57 insertions(+), 51 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 88fd1481031f8..a46475a7d1ec2 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -715,6 +715,10 @@ def __setitem__(self, key, value) -> None: else: if not is_array_like(value): value = np.asarray(value, dtype=object) + else: + # cast categories and friends to arrays to see if values are + # compatible, compatibility with arrow backed strings + value = np.asarray(value) if len(value) and not lib.is_string_array(value, skipna=True): raise TypeError("Must provide strings.") diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 97381b82ceab9..1e5adf106752f 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -240,7 +240,7 @@ def _maybe_convert_setitem_value(self, value): value[isna(value)] = None for v in value: if not (v is None or isinstance(v, str)): - raise TypeError("Scalar must be NA or str") + raise TypeError("Must provide strings") return super()._maybe_convert_setitem_value(value) def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]: diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 
dd87dbf8e9a43..87bd1d5921caa 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -102,10 +102,7 @@ def test_setitem_validates(cls, dtype): with pytest.raises(TypeError, match=msg): arr[0] = 10 - if dtype.storage == "python": - msg = "Must provide strings." - else: - msg = "Scalar must be NA or str" + msg = "Must provide strings" with pytest.raises(TypeError, match=msg): arr[:] = np.array([1, 2]) diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index b05b5d3dea2dc..dc95e1bb1b8a0 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -6,8 +6,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.errors import IndexingError from pandas import ( @@ -1198,22 +1196,25 @@ def test_iloc_getitem_int_single_ea_block_view(self): arr[2] = arr[-1] assert ser[0] == arr[-1] - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") - def test_iloc_setitem_multicolumn_to_datetime(self): + def test_iloc_setitem_multicolumn_to_datetime(self, using_infer_string): # GH#20511 df = DataFrame({"A": ["2022-01-01", "2022-01-02"], "B": ["2021", "2022"]}) - df.iloc[:, [0]] = DataFrame({"A": to_datetime(["2021", "2022"])}) - expected = DataFrame( - { - "A": [ - Timestamp("2021-01-01 00:00:00"), - Timestamp("2022-01-01 00:00:00"), - ], - "B": ["2021", "2022"], - } - ) - tm.assert_frame_equal(df, expected, check_dtype=False) + if using_infer_string: + with pytest.raises(TypeError, match="Invalid value"): + df.iloc[:, [0]] = DataFrame({"A": to_datetime(["2021", "2022"])}) + else: + df.iloc[:, [0]] = DataFrame({"A": to_datetime(["2021", "2022"])}) + expected = DataFrame( + { + "A": [ + Timestamp("2021-01-01 00:00:00"), + Timestamp("2022-01-01 00:00:00"), + ], + "B": ["2021", "2022"], + } + ) + tm.assert_frame_equal(df, expected, check_dtype=False) class TestILocErrors: diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index f7ada06e3ecb2..fb7e6649c534f 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -8,8 +8,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.errors import IndexingError from pandas.core.dtypes.common import ( @@ -528,12 +526,12 @@ def test_string_slice_empty(self): with pytest.raises(KeyError, match="^0$"): df.loc["2011", 0] - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_astype_assignment(self, using_infer_string): # GH4312 (iloc) df_orig = DataFrame( [["1", "2", "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG") ) + df_orig[list("ABCDG")] = df_orig[list("ABCDG")].astype(object) df = df_orig.copy() @@ -543,9 +541,9 @@ def test_astype_assignment(self, using_infer_string): expected = DataFrame( [[1, 2, "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG") ) - if not using_infer_string: - expected["A"] = expected["A"].astype(object) - expected["B"] = expected["B"].astype(object) + expected[list("CDG")] = expected[list("CDG")].astype(object) + expected["A"] = expected["A"].astype(object) + expected["B"] = expected["B"].astype(object) tm.assert_frame_equal(df, expected) # GH5702 (loc) @@ -554,18 +552,16 @@ def test_astype_assignment(self, using_infer_string): expected = DataFrame( [[1, "2", "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG") ) - if not using_infer_string: - expected["A"] = expected["A"].astype(object) + expected[list("ABCDG")] = 
expected[list("ABCDG")].astype(object) tm.assert_frame_equal(df, expected) df = df_orig.copy() + df.loc[:, ["B", "C"]] = df.loc[:, ["B", "C"]].astype(np.int64) expected = DataFrame( [["1", 2, 3, ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG") ) - if not using_infer_string: - expected["B"] = expected["B"].astype(object) - expected["C"] = expected["C"].astype(object) + expected[list("ABCDG")] = expected[list("ABCDG")].astype(object) tm.assert_frame_equal(df, expected) def test_astype_assignment_full_replacements(self): diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index e007b8c4e97ac..36b08ee1df790 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -1,6 +1,7 @@ """test label based indexing with loc""" from collections import namedtuple +import contextlib from datetime import ( date, datetime, @@ -13,10 +14,7 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas._libs import index as libindex -from pandas.compat import HAS_PYARROW from pandas.errors import IndexingError import pandas as pd @@ -615,8 +613,7 @@ def test_loc_setitem_consistency_empty(self): expected["x"] = expected["x"].astype(np.int64) tm.assert_frame_equal(df, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") - def test_loc_setitem_consistency_slice_column_len(self): + def test_loc_setitem_consistency_slice_column_len(self, using_infer_string): # .loc[:,column] setting with slice == len of the column # GH10408 levels = [ @@ -640,12 +637,23 @@ def test_loc_setitem_consistency_slice_column_len(self): ] df = DataFrame(values, index=mi, columns=cols) - df.loc[:, ("Respondent", "StartDate")] = to_datetime( - df.loc[:, ("Respondent", "StartDate")] - ) - df.loc[:, ("Respondent", "EndDate")] = to_datetime( - df.loc[:, ("Respondent", "EndDate")] - ) + ctx = contextlib.nullcontext() + if using_infer_string: + ctx = pytest.raises(TypeError, match="Invalid value") + + with ctx: + df.loc[:, ("Respondent", "StartDate")] = to_datetime( + df.loc[:, ("Respondent", "StartDate")] + ) + with ctx: + df.loc[:, ("Respondent", "EndDate")] = to_datetime( + df.loc[:, ("Respondent", "EndDate")] + ) + + if using_infer_string: + # infer-objects won't infer stuff anymore + return + df = df.infer_objects() # Adding a new key @@ -1211,20 +1219,23 @@ def test_loc_reverse_assignment(self): tm.assert_series_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="can't set int into string") - def test_loc_setitem_str_to_small_float_conversion_type(self): + def test_loc_setitem_str_to_small_float_conversion_type(self, using_infer_string): # GH#20388 col_data = [str(np.random.default_rng(2).random() * 1e-12) for _ in range(5)] result = DataFrame(col_data, columns=["A"]) - expected = DataFrame(col_data, columns=["A"], dtype=object) + expected = DataFrame(col_data, columns=["A"]) tm.assert_frame_equal(result, expected) # assigning with loc/iloc attempts to set the values inplace, which # in this case is successful - result.loc[result.index, "A"] = [float(x) for x in col_data] - expected = DataFrame(col_data, columns=["A"], dtype=float).astype(object) - tm.assert_frame_equal(result, expected) + if using_infer_string: + with pytest.raises(TypeError, match="Must provide strings"): + result.loc[result.index, "A"] = [float(x) for x in col_data] + else: + result.loc[result.index, "A"] = [float(x) for x in col_data] + expected = DataFrame(col_data, columns=["A"], dtype=float).astype(object) + 
tm.assert_frame_equal(result, expected) # assigning the entire column using __setitem__ swaps in the new array # GH#??? @@ -1389,9 +1400,6 @@ def test_loc_setitem_categorical_values_partial_column_slice(self): df.loc[1:2, "a"] = Categorical(["b", "b"], categories=["a", "b"]) df.loc[2:3, "b"] = Categorical(["b", "b"], categories=["a", "b"]) - @pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" - ) def test_loc_setitem_single_row_categorical(self, using_infer_string): # GH#25495 df = DataFrame({"Alpha": ["a"], "Numeric": [0]}) From b0593e20c4a661250df5ab4d832510c1f5819103 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 9 Sep 2024 09:38:47 -0700 Subject: [PATCH 016/224] Bump deadsnakes/action from 3.1.0 to 3.2.0 (#59757) Bumps [deadsnakes/action](https://github.com/deadsnakes/action) from 3.1.0 to 3.2.0. - [Release notes](https://github.com/deadsnakes/action/releases) - [Commits](https://github.com/deadsnakes/action/compare/v3.1.0...v3.2.0) --- updated-dependencies: - dependency-name: deadsnakes/action dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/unit-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index d392c84be66fe..d145836f3e596 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -380,7 +380,7 @@ jobs: fetch-depth: 0 - name: Set up Python Free-threading Version - uses: deadsnakes/action@v3.1.0 + uses: deadsnakes/action@v3.2.0 with: python-version: 3.13-dev nogil: true From 53cadbbd89a3393d615e4d7abf48f3ec1903fe7b Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 9 Sep 2024 19:15:02 +0200 Subject: [PATCH 017/224] TST (string dtype): adjust pandas/tests/reshape tests (#59762) --- pandas/tests/reshape/concat/test_concat.py | 11 ++------ pandas/tests/reshape/merge/test_merge_asof.py | 10 ++----- pandas/tests/reshape/test_get_dummies.py | 10 ++----- pandas/tests/reshape/test_melt.py | 25 ++++++----------- pandas/tests/reshape/test_pivot.py | 28 ++++++++++++------- 5 files changed, 34 insertions(+), 50 deletions(-) diff --git a/pandas/tests/reshape/concat/test_concat.py b/pandas/tests/reshape/concat/test_concat.py index 8af224f1ad64f..d3edee17366f7 100644 --- a/pandas/tests/reshape/concat/test_concat.py +++ b/pandas/tests/reshape/concat/test_concat.py @@ -10,8 +10,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.errors import InvalidIndexError import pandas as pd @@ -47,18 +45,11 @@ def test_append_concat(self): assert isinstance(result.index, PeriodIndex) assert result.index[0] == s1.index[0] - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_concat_copy(self): df = DataFrame(np.random.default_rng(2).standard_normal((4, 3))) df2 = DataFrame(np.random.default_rng(2).integers(0, 10, size=4).reshape(4, 1)) df3 = DataFrame({5: "foo"}, index=range(4)) - # These are actual copies. - result = concat([df, df2, df3], axis=1) - for block in result._mgr.blocks: - assert block.values.base is not None - - # These are the same. 
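# As an aside, a minimal sketch of the memory-sharing check this test builds
# on (tm.shares_memory is pandas' own testing helper; values illustrative):
#
#     result = concat([df, df2, df3], axis=1)
#     arr = result._mgr.blocks[0].values
#     tm.shares_memory(arr, df)  # True only while the block still views df's buffer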
result = concat([df, df2, df3], axis=1)
         for block in result._mgr.blocks:
@@ -69,6 +60,8 @@ def test_concat_copy(self):
                 assert arr.base is df2._mgr.blocks[0].values.base
             elif arr.dtype == object:
                 assert arr.base is not None
+            elif arr.dtype == "string":
+                assert tm.shares_memory(arr, df3._mgr.blocks[0].values)
 
         # Float block was consolidated.
         df4 = DataFrame(np.random.default_rng(2).standard_normal((4, 1)))
diff --git a/pandas/tests/reshape/merge/test_merge_asof.py b/pandas/tests/reshape/merge/test_merge_asof.py
index 8d972087b0dff..f7b0876c5a605 100644
--- a/pandas/tests/reshape/merge/test_merge_asof.py
+++ b/pandas/tests/reshape/merge/test_merge_asof.py
@@ -3,8 +3,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 import pandas.util._test_decorators as td
 
 import pandas as pd
@@ -3064,12 +3062,8 @@ def test_on_float_by_int(self):
 
         tm.assert_frame_equal(result, expected)
 
-    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
-    def test_merge_datatype_error_raises(self, using_infer_string):
-        if using_infer_string:
-            msg = "incompatible merge keys"
-        else:
-            msg = r"Incompatible merge dtype, .*, both sides must have numeric dtype"
+    def test_merge_datatype_error_raises(self):
+        msg = r"Incompatible merge dtype, .*, both sides must have numeric dtype"
 
         left = pd.DataFrame({"left_val": [1, 5, 10], "a": ["a", "b", "c"]})
         right = pd.DataFrame({"right_val": [1, 2, 3, 6, 7], "a": [1, 2, 3, 6, 7]})
diff --git a/pandas/tests/reshape/test_get_dummies.py b/pandas/tests/reshape/test_get_dummies.py
index 27a34decae7b0..f07c6845366da 100644
--- a/pandas/tests/reshape/test_get_dummies.py
+++ b/pandas/tests/reshape/test_get_dummies.py
@@ -4,8 +4,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 import pandas.util._test_decorators as td
 
 from pandas.core.dtypes.common import is_integer_dtype
@@ -216,11 +214,10 @@ def test_dataframe_dummies_all_obj(self, df, sparse):
 
         tm.assert_frame_equal(result, expected)
 
-    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
-    def test_dataframe_dummies_string_dtype(self, df, using_infer_string):
+    def test_dataframe_dummies_string_dtype(self, df, any_string_dtype):
         # GH44965
         df = df[["A", "B"]]
-        df = df.astype({"A": "object", "B": "string"})
+        df = df.astype({"A": "str", "B": any_string_dtype})
         result = get_dummies(df)
         expected = DataFrame(
             {
@@ -231,8 +228,7 @@ def test_dataframe_dummies_string_dtype(self, df, using_infer_string):
             },
             dtype=bool,
         )
-        if not using_infer_string:
-            # infer_string returns numpy bools
+        if any_string_dtype == "string" and any_string_dtype.na_value is pd.NA:
             expected[["B_b", "B_c"]] = expected[["B_b", "B_c"]].astype("boolean")
         tm.assert_frame_equal(result, expected)
 
diff --git a/pandas/tests/reshape/test_melt.py b/pandas/tests/reshape/test_melt.py
index be4f2ab4d183d..4a12404f6775a 100644
--- a/pandas/tests/reshape/test_melt.py
+++ b/pandas/tests/reshape/test_melt.py
@@ -3,8 +3,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 import pandas as pd
 from pandas import (
     DataFrame,
@@ -21,7 +19,7 @@ def df():
     res = DataFrame(
         np.random.default_rng(2).standard_normal((10, 4)),
-        columns=Index(list("ABCD"), dtype=object),
+        columns=Index(list("ABCD")),
         index=date_range("2000-01-01", periods=10, freq="B"),
     )
     res["id1"] = (res["A"] > 0).astype(np.int64)
@@ -83,7 +81,6 @@ def test_default_col_names(self, df):
         result2 = df.melt(id_vars=["id1", "id2"])
         assert result2.columns.tolist() == ["id1", "id2", "variable", "value"]
 
-    
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_value_vars(self, df): result3 = df.melt(id_vars=["id1", "id2"], value_vars="A") assert len(result3) == 10 @@ -100,7 +97,6 @@ def test_value_vars(self, df): ) tm.assert_frame_equal(result4, expected4) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize("type_", (tuple, list, np.array)) def test_value_vars_types(self, type_, df): # GH 15348 @@ -178,7 +174,6 @@ def test_tuple_vars_fail_with_multiindex(self, id_vars, value_vars, df1): with pytest.raises(ValueError, match=msg): df1.melt(id_vars=id_vars, value_vars=value_vars) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_custom_var_name(self, df, var_name): result5 = df.melt(var_name=var_name) assert result5.columns.tolist() == ["var", "value"] @@ -206,7 +201,6 @@ def test_custom_var_name(self, df, var_name): ) tm.assert_frame_equal(result9, expected9) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_custom_value_name(self, df, value_name): result10 = df.melt(value_name=value_name) assert result10.columns.tolist() == ["variable", "val"] @@ -236,7 +230,6 @@ def test_custom_value_name(self, df, value_name): ) tm.assert_frame_equal(result14, expected14) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_custom_var_and_value_name(self, df, value_name, var_name): result15 = df.melt(var_name=var_name, value_name=value_name) assert result15.columns.tolist() == ["var", "val"] @@ -361,7 +354,6 @@ def test_melt_missing_columns_raises(self): with pytest.raises(KeyError, match=msg): df.melt(["A"], ["F"], col_level=0) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_melt_mixed_int_str_id_vars(self): # GH 29718 df = DataFrame({0: ["foo"], "a": ["bar"], "b": [1], "d": [2]}) @@ -369,6 +361,8 @@ def test_melt_mixed_int_str_id_vars(self): expected = DataFrame( {0: ["foo"] * 2, "a": ["bar"] * 2, "variable": list("bd"), "value": [1, 2]} ) + # the df's columns are mixed type and thus object -> preserves object dtype + expected["variable"] = expected["variable"].astype(object) tm.assert_frame_equal(result, expected) def test_melt_mixed_int_str_value_vars(self): @@ -1222,12 +1216,10 @@ def test_raise_of_column_name_value(self): ): df.melt(id_vars="value", value_name="value") - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) - @pytest.mark.parametrize("dtype", ["O", "string"]) - def test_missing_stubname(self, dtype): + def test_missing_stubname(self, any_string_dtype): # GH46044 df = DataFrame({"id": ["1", "2"], "a-1": [100, 200], "a-2": [300, 400]}) - df = df.astype({"id": dtype}) + df = df.astype({"id": any_string_dtype}) result = wide_to_long( df, stubnames=["a", "b"], @@ -1243,12 +1235,13 @@ def test_missing_stubname(self, dtype): {"a": [100, 200, 300, 400], "b": [np.nan] * 4}, index=index, ) - new_level = expected.index.levels[0].astype(dtype) + new_level = expected.index.levels[0].astype(any_string_dtype) + if any_string_dtype == "object": + new_level = expected.index.levels[0].astype("str") expected.index = expected.index.set_levels(new_level, level=0) tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_wide_to_long_pyarrow_string_columns(): # GH 57066 pytest.importorskip("pyarrow") @@ -1267,7 +1260,7 @@ def test_wide_to_long_pyarrow_string_columns(): ) expected = DataFrame( [[1, 1], [1, 1], [1, 2]], - 
columns=Index(["D", "R"], dtype=object), + columns=Index(["D", "R"]), index=pd.MultiIndex.from_arrays( [ [1, 1, 1], diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 8cfe565ebdd65..eccf676b87f89 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -1068,7 +1068,6 @@ def test_margins_dtype_len(self, data): tm.assert_frame_equal(expected, result) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("cols", [(1, 2), ("a", "b"), (1, "b"), ("a", 1)]) def test_pivot_table_multiindex_only(self, cols): # GH 17038 @@ -1078,7 +1077,7 @@ def test_pivot_table_multiindex_only(self, cols): expected = DataFrame( [[4.0, 5.0, 6.0]], columns=MultiIndex.from_tuples([(1, 1), (2, 2), (3, 3)], names=cols), - index=Index(["v"], dtype=object), + index=Index(["v"], dtype="str" if cols == ("a", "b") else "object"), ) tm.assert_frame_equal(result, expected) @@ -2570,13 +2569,16 @@ def test_pivot_empty(self): expected = DataFrame(index=[], columns=[]) tm.assert_frame_equal(result, expected, check_names=False) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) - @pytest.mark.parametrize("dtype", [object, "string"]) - def test_pivot_integer_bug(self, dtype): - df = DataFrame(data=[("A", "1", "A1"), ("B", "2", "B2")], dtype=dtype) + def test_pivot_integer_bug(self, any_string_dtype): + df = DataFrame( + data=[("A", "1", "A1"), ("B", "2", "B2")], dtype=any_string_dtype + ) result = df.pivot(index=1, columns=0, values=2) - tm.assert_index_equal(result.columns, Index(["A", "B"], name=0, dtype=dtype)) + expected_columns = Index(["A", "B"], name=0, dtype=any_string_dtype) + if any_string_dtype == "object": + expected_columns = expected_columns.astype("str") + tm.assert_index_equal(result.columns, expected_columns) def test_pivot_index_none(self): # GH#3962 @@ -2658,7 +2660,9 @@ def test_pivot_columns_not_given(self): with pytest.raises(TypeError, match="missing 1 required keyword-only argument"): df.pivot() - @pytest.mark.xfail(using_string_dtype(), reason="None is cast to NaN") + @pytest.mark.xfail( + using_string_dtype(), reason="TODO(infer_string) None is cast to NaN" + ) def test_pivot_columns_is_none(self): # GH#48293 df = DataFrame({None: [1], "b": 2, "c": 3}) @@ -2674,7 +2678,9 @@ def test_pivot_columns_is_none(self): expected = DataFrame({1: 3}, index=Index([2], name="b")) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="None is cast to NaN") + @pytest.mark.xfail( + using_string_dtype(), reason="TODO(infer_string) None is cast to NaN" + ) def test_pivot_index_is_none(self): # GH#48293 df = DataFrame({None: [1], "b": 2, "c": 3}) @@ -2688,7 +2694,9 @@ def test_pivot_index_is_none(self): expected = DataFrame(3, index=[1], columns=Index([2], name="b")) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="None is cast to NaN") + @pytest.mark.xfail( + using_string_dtype(), reason="TODO(infer_string) None is cast to NaN" + ) def test_pivot_values_is_none(self): # GH#48293 df = DataFrame({None: [1], "b": 2, "c": 3}) From 078b73226eb06b6a13bd5822efa5fba7fe47e97c Mon Sep 17 00:00:00 2001 From: Abhinav Reddy Date: Mon, 9 Sep 2024 13:22:08 -0400 Subject: [PATCH 018/224] Fix docs for api.types (#59753) * Fix is_bool * Fix is_categorical_dtype * Fix is_complex * Fix is_complex_dtype * Fix is_datetime64_dtype * Fix is_datetime64_ns_dtype * Fix is_datetime64tz_dtype --------- 
Co-authored-by: Abhinav Thimma --- ci/code_checks.sh | 7 ------- pandas/_libs/lib.pyx | 23 +++++++++++++++++++++++ pandas/core/dtypes/common.py | 35 +++++++++++++++++++++++++++++++++++ 3 files changed, 58 insertions(+), 7 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 7ed5103b3b796..44a6b91aeb565 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -123,13 +123,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Timestamp.tzinfo GL08" \ -i "pandas.Timestamp.year GL08" \ -i "pandas.api.extensions.ExtensionArray.interpolate PR01,SA01" \ - -i "pandas.api.types.is_bool PR01,SA01" \ - -i "pandas.api.types.is_categorical_dtype SA01" \ - -i "pandas.api.types.is_complex PR01,SA01" \ - -i "pandas.api.types.is_complex_dtype SA01" \ - -i "pandas.api.types.is_datetime64_dtype SA01" \ - -i "pandas.api.types.is_datetime64_ns_dtype SA01" \ - -i "pandas.api.types.is_datetime64tz_dtype SA01" \ -i "pandas.api.types.is_dict_like PR07,SA01" \ -i "pandas.api.types.is_extension_array_dtype SA01" \ -i "pandas.api.types.is_file_like PR07,SA01" \ diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index e1a2a0142c52e..47a31954b9d6c 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1123,10 +1123,21 @@ def is_bool(obj: object) -> bool: """ Return True if given object is boolean. + Parameters + ---------- + obj : object + Object to check. + Returns ------- bool + See Also + -------- + api.types.is_scalar : Check if the input is a scalar. + api.types.is_integer : Check if the input is an integer. + api.types.is_float : Check if the input is a float. + Examples -------- >>> pd.api.types.is_bool(True) @@ -1142,10 +1153,22 @@ def is_complex(obj: object) -> bool: """ Return True if given object is complex. + Parameters + ---------- + obj : object + Object to check. + Returns ------- bool + See Also + -------- + api.types.is_complex_dtype: Check whether the provided array or + dtype is of a complex dtype. + api.types.is_number: Check if the object is a number. + api.types.is_integer: Return True if given object is integer. + Examples -------- >>> pd.api.types.is_complex(1 + 1j) diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index bcf1ade9b0320..16f6bd396fe93 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -279,6 +279,13 @@ def is_datetime64_dtype(arr_or_dtype) -> bool: boolean Whether or not the array-like or dtype is of the datetime64 dtype. + See Also + -------- + api.types.is_datetime64_ns_dtype: Check whether the provided array or + dtype is of the datetime64[ns] dtype. + api.types.is_datetime64_any_dtype: Check whether the provided array or + dtype is of the datetime64 dtype. + Examples -------- >>> from pandas.api.types import is_datetime64_dtype @@ -316,6 +323,13 @@ def is_datetime64tz_dtype(arr_or_dtype) -> bool: boolean Whether or not the array-like or dtype is of a DatetimeTZDtype dtype. + See Also + -------- + api.types.is_datetime64_dtype: Check whether an array-like or + dtype is of the datetime64 dtype. + api.types.is_datetime64_any_dtype: Check whether the provided array or + dtype is of the datetime64 dtype. + Examples -------- >>> from pandas.api.types import is_datetime64tz_dtype @@ -514,6 +528,12 @@ def is_categorical_dtype(arr_or_dtype) -> bool: boolean Whether or not the array-like or dtype is of the Categorical dtype. + See Also + -------- + api.types.is_list_like: Check if the object is list-like. 
+ api.types.is_complex_dtype: Check whether the provided array or + dtype is of a complex dtype. + Examples -------- >>> from pandas.api.types import is_categorical_dtype @@ -977,6 +997,13 @@ def is_datetime64_ns_dtype(arr_or_dtype) -> bool: bool Whether or not the array or dtype is of the datetime64[ns] dtype. + See Also + -------- + api.types.is_datetime64_dtype: Check whether an array-like or + dtype is of the datetime64 dtype. + api.types.is_datetime64_any_dtype: Check whether the provided array or + dtype is of the datetime64 dtype. + Examples -------- >>> from pandas.api.types import is_datetime64_ns_dtype @@ -1436,6 +1463,14 @@ def is_complex_dtype(arr_or_dtype) -> bool: boolean Whether or not the array or dtype is of a complex dtype. + See Also + -------- + api.types.is_complex: Return True if given object is complex. + api.types.is_numeric_dtype: Check whether the provided array or + dtype is of a numeric dtype. + api.types.is_integer_dtype: Check whether the provided array or + dtype is of an integer dtype. + Examples -------- >>> from pandas.api.types import is_complex_dtype From f3d19fb5298e98b2ff0a16dd03b6f30e32b38069 Mon Sep 17 00:00:00 2001 From: "Mien (Josephine) Nguyen" Date: Mon, 9 Sep 2024 13:27:15 -0400 Subject: [PATCH 019/224] TST: Update IntervalArray min/max test to fail on changed default skipna (#59747) Test --- pandas/tests/arrays/interval/test_interval.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/pandas/tests/arrays/interval/test_interval.py b/pandas/tests/arrays/interval/test_interval.py index 58ba340441d86..8e13dcf25ceba 100644 --- a/pandas/tests/arrays/interval/test_interval.py +++ b/pandas/tests/arrays/interval/test_interval.py @@ -222,9 +222,10 @@ def test_min_max(self, left_right_dtypes, index_or_series_or_array): res = arr_na.max(skipna=False) assert np.isnan(res) - res = arr_na.min(skipna=True) - assert res == MIN - assert type(res) == type(MIN) - res = arr_na.max(skipna=True) - assert res == MAX - assert type(res) == type(MAX) + for kws in [{"skipna": True}, {}]: + res = arr_na.min(**kws) + assert res == MIN + assert type(res) == type(MIN) + res = arr_na.max(**kws) + assert res == MAX + assert type(res) == type(MAX) From ea22788f6193eeb1aa9dea25481ab7fe72ea41c5 Mon Sep 17 00:00:00 2001 From: ivanpan0626 <151955212+ivanpan0626@users.noreply.github.com> Date: Mon, 9 Sep 2024 13:28:44 -0400 Subject: [PATCH 020/224] DOCS: fix docstring validation errors for groupby.DataFrameGroupBy.filter, groupby.SeriesGroupBy.filter (#59742) * DOCS: fix docstring validation errors for pandas.core.groupby.DataFrameGroupBy.filter DOC string fix for both groupby.DataFrameGroupBy.filter and groupby.SeriesGroupBy.filter * Update generic.py * Update generic.py * Update generic.py * quickfix --- ci/code_checks.sh | 2 -- pandas/core/groupby/generic.py | 18 +++++++++++++++--- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 44a6b91aeb565..fdacd2fed7729 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -156,7 +156,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.core.groupby.DataFrameGroupBy.agg RT03" \ -i "pandas.core.groupby.DataFrameGroupBy.aggregate RT03" \ -i "pandas.core.groupby.DataFrameGroupBy.boxplot PR07,RT03,SA01" \ - -i "pandas.core.groupby.DataFrameGroupBy.filter SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.get_group RT03,SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.groups SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.hist RT03" \ @@ 
-172,7 +171,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         -i "pandas.core.groupby.SeriesGroupBy.__iter__ RT03,SA01" \
         -i "pandas.core.groupby.SeriesGroupBy.agg RT03" \
         -i "pandas.core.groupby.SeriesGroupBy.aggregate RT03" \
-        -i "pandas.core.groupby.SeriesGroupBy.filter PR01,SA01" \
         -i "pandas.core.groupby.SeriesGroupBy.get_group RT03,SA01" \
         -i "pandas.core.groupby.SeriesGroupBy.groups SA01" \
         -i "pandas.core.groupby.SeriesGroupBy.indices SA01" \
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
index c112d9b6a4b54..230f61bab96df 100644
--- a/pandas/core/groupby/generic.py
+++ b/pandas/core/groupby/generic.py
@@ -600,15 +600,23 @@ def filter(self, func, dropna: bool = True, *args, **kwargs):
         ----------
         func : function
             Criterion to apply to each group. Should return True or False.
-        dropna : bool
+        dropna : bool, optional
            Drop groups that do not pass the filter. True by default; if False,
            groups that evaluate False are filled with NaNs.
+        *args : tuple
+            Optional positional arguments to pass to `func`.
+        **kwargs : dict
+            Optional keyword arguments to pass to `func`.
 
         Returns
         -------
         Series
             The filtered subset of the original Series.
 
+        See Also
+        --------
+        DataFrameGroupBy.filter : Filter elements from groups based on a criterion.
+
         Notes
         -----
         Functions that mutate the passed object can produce unexpected
@@ -1943,9 +1951,9 @@ def filter(self, func, dropna: bool = True, *args, **kwargs) -> DataFrame:
         dropna : bool
             Drop groups that do not pass the filter. True by default; if False,
             groups that evaluate False are filled with NaNs.
-        *args
+        *args : tuple
             Additional positional arguments to pass to `func`.
-        **kwargs
+        **kwargs : dict
             Additional keyword arguments to pass to `func`.
 
         Returns
@@ -1953,6 +1961,10 @@ def filter(self, func, dropna: bool = True, *args, **kwargs) -> DataFrame:
         DataFrame
             The filtered subset of the original DataFrame.
 
+        See Also
+        --------
+        SeriesGroupBy.filter : Filter elements from groups based on a criterion.
+
         Notes
         -----
         Each subframe is endowed the attribute 'name' in case you need to know

From 6b74d6f61552f152422ffa53191301aa94b82ade Mon Sep 17 00:00:00 2001
From: Tuhin Sharma
Date: Mon, 9 Sep 2024 23:04:13 +0530
Subject: [PATCH 021/224] DOC: fix SA01,ES01 for pandas.RangeIndex.stop (#59729)

* DOC: fix SA01,ES01 for pandas.RangeIndex.stop

* remove superfluous description of RangeIndex
---
 ci/code_checks.sh            |  1 -
 pandas/core/indexes/range.py | 11 +++++++++++
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/ci/code_checks.sh b/ci/code_checks.sh
index fdacd2fed7729..fa4e7ed8c3104 100755
--- a/ci/code_checks.sh
+++ b/ci/code_checks.sh
@@ -77,7 +77,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         -i "pandas.RangeIndex.from_range PR01,SA01" \
         -i "pandas.RangeIndex.start SA01" \
         -i "pandas.RangeIndex.step SA01" \
-        -i "pandas.RangeIndex.stop SA01" \
         -i "pandas.Series.cat.add_categories PR01,PR02" \
         -i "pandas.Series.cat.as_ordered PR01" \
         -i "pandas.Series.cat.as_unordered PR01" \
diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py
index b11ce6bd7b919..154e142c41db2 100644
--- a/pandas/core/indexes/range.py
+++ b/pandas/core/indexes/range.py
@@ -313,6 +313,17 @@ def stop(self) -> int:
         """
         The value of the `stop` parameter.
 
+        This property returns the `stop` value of the RangeIndex, which defines the
+        upper (or lower, in case of negative steps) bound of the index range. The
+        `stop` value is exclusive, meaning the RangeIndex includes values up to but
+        not including this value.
+ + See Also + -------- + RangeIndex : Immutable index representing a range of integers. + RangeIndex.start : The start value of the RangeIndex. + RangeIndex.step : The step size between elements in the RangeIndex. + Examples -------- >>> idx = pd.RangeIndex(5) From 9ec6b2a4771170f9fdf70f0e166229eb54ad3a75 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Mon, 9 Sep 2024 23:04:38 +0530 Subject: [PATCH 022/224] DOC: fix SA01,ES01 for pandas.RangeIndex.start (#59728) * DOC: fix SA01,ES01 for pandas.RangeIndex.start * remove superfluous description of RangeIndex --- ci/code_checks.sh | 1 - pandas/core/indexes/range.py | 10 ++++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index fa4e7ed8c3104..2870de5a0c85a 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -75,7 +75,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Period.ordinal GL08" \ -i "pandas.PeriodDtype.freq SA01" \ -i "pandas.RangeIndex.from_range PR01,SA01" \ - -i "pandas.RangeIndex.start SA01" \ -i "pandas.RangeIndex.step SA01" \ -i "pandas.Series.cat.add_categories PR01,PR02" \ -i "pandas.Series.cat.as_ordered PR01" \ diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 154e142c41db2..75d0dfbeb6f01 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -295,6 +295,16 @@ def start(self) -> int: """ The value of the `start` parameter (``0`` if this was not supplied). + This property returns the starting value of the `RangeIndex`. If the `start` + value is not explicitly provided during the creation of the `RangeIndex`, + it defaults to 0. + + See Also + -------- + RangeIndex : Immutable index implementing a range-based index. + RangeIndex.stop : Returns the stop value of the `RangeIndex`. + RangeIndex.step : Returns the step value of the `RangeIndex`. 
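(For orientation, a small doctest-style illustration of how the three
attributes documented in these two commits relate; values picked arbitrarily:)

    >>> idx = pd.RangeIndex(start=2, stop=20, step=3)
    >>> (idx.start, idx.stop, idx.step)
    (2, 20, 3)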
+ Examples -------- >>> idx = pd.RangeIndex(5) From 871703dfc6150db112dde10a0135d3a758e77cd8 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Mon, 9 Sep 2024 18:36:52 +0100 Subject: [PATCH 023/224] fix: use fastpath for PyCapsule export when starting from pyarrow-backed Series, respect requested_schema (#59683) * fix: use fastpath for PyCapsule export when starting from pyarrow-backed Series, respect requested_schema * simplify * stringdtype test --- pandas/core/series.py | 11 ++++-- pandas/tests/series/test_arrow_interface.py | 38 +++++++++++++++++++++ 2 files changed, 47 insertions(+), 2 deletions(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index 4f79e30f48f3c..0c26ce27c680c 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -580,8 +580,15 @@ def __arrow_c_stream__(self, requested_schema=None): PyCapsule """ pa = import_optional_dependency("pyarrow", min_version="16.0.0") - ca = pa.chunked_array([pa.Array.from_pandas(self, type=requested_schema)]) - return ca.__arrow_c_stream__(requested_schema) + type = ( + pa.DataType._import_from_c_capsule(requested_schema) + if requested_schema is not None + else None + ) + ca = pa.array(self, type=type) + if not isinstance(ca, pa.ChunkedArray): + ca = pa.chunked_array([ca]) + return ca.__arrow_c_stream__() # ---------------------------------------------------------------------- diff --git a/pandas/tests/series/test_arrow_interface.py b/pandas/tests/series/test_arrow_interface.py index 34a2a638e4185..e73cf9bee6aeb 100644 --- a/pandas/tests/series/test_arrow_interface.py +++ b/pandas/tests/series/test_arrow_interface.py @@ -21,3 +21,41 @@ def test_series_arrow_interface(): ca = pa.chunked_array(s) expected = pa.chunked_array([[1, 4, 2]]) assert ca.equals(expected) + ca = pa.chunked_array(s, type=pa.int32()) + expected = pa.chunked_array([[1, 4, 2]], type=pa.int32()) + assert ca.equals(expected) + + +def test_series_arrow_interface_arrow_dtypes(): + s = pd.Series([1, 4, 2], dtype="Int64[pyarrow]") + + capsule = s.__arrow_c_stream__() + assert ( + ctypes.pythonapi.PyCapsule_IsValid( + ctypes.py_object(capsule), b"arrow_array_stream" + ) + == 1 + ) + + ca = pa.chunked_array(s) + expected = pa.chunked_array([[1, 4, 2]]) + assert ca.equals(expected) + ca = pa.chunked_array(s, type=pa.int32()) + expected = pa.chunked_array([[1, 4, 2]], type=pa.int32()) + assert ca.equals(expected) + + +def test_series_arrow_interface_stringdtype(): + s = pd.Series(["foo", "bar"], dtype="string[pyarrow]") + + capsule = s.__arrow_c_stream__() + assert ( + ctypes.pythonapi.PyCapsule_IsValid( + ctypes.py_object(capsule), b"arrow_array_stream" + ) + == 1 + ) + + ca = pa.chunked_array(s) + expected = pa.chunked_array([["foo", "bar"]], type=pa.large_string()) + assert ca.equals(expected) From 47b56ea9ced016fc1c273c2453981a53666038a7 Mon Sep 17 00:00:00 2001 From: Katsia <47710336+KatsiarynaDzibrova@users.noreply.github.com> Date: Mon, 9 Sep 2024 18:38:13 +0100 Subject: [PATCH 024/224] DOC: Fix pandas.Series.dt seconds, nanoseconds GL08, SA01 (#59582) * fix pandas.Series.dt.freq * fix seconds, nanoseconds, microseconds * remove fixed objects from code_checks.sh * Remove Timedelta Index checks * fix freq example * remove freq * bring back microseconds --- ci/code_checks.sh | 4 ---- pandas/core/arrays/timedeltas.py | 10 ++++++++++ 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 2870de5a0c85a..06078d8958492 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -90,10 +90,8 @@ if [[ 
-z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Series.dt.floor PR01,PR02" \ -i "pandas.Series.dt.freq GL08" \ -i "pandas.Series.dt.month_name PR01,PR02" \ - -i "pandas.Series.dt.nanoseconds SA01" \ -i "pandas.Series.dt.normalize PR01" \ -i "pandas.Series.dt.round PR01,PR02" \ - -i "pandas.Series.dt.seconds SA01" \ -i "pandas.Series.dt.strftime PR01,PR02" \ -i "pandas.Series.dt.to_period PR01,PR02" \ -i "pandas.Series.dt.total_seconds PR01" \ @@ -111,8 +109,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Timedelta.resolution PR02" \ -i "pandas.Timedelta.to_timedelta64 SA01" \ -i "pandas.Timedelta.total_seconds SA01" \ - -i "pandas.TimedeltaIndex.nanoseconds SA01" \ - -i "pandas.TimedeltaIndex.seconds SA01" \ -i "pandas.TimedeltaIndex.to_pytimedelta RT03,SA01" \ -i "pandas.Timestamp.max PR02" \ -i "pandas.Timestamp.min PR02" \ diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index c8a86ffc187d0..754ae277e359a 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -842,6 +842,11 @@ def to_pytimedelta(self) -> npt.NDArray[np.object_]: seconds_docstring = textwrap.dedent( """Number of seconds (>= 0 and less than 1 day) for each element. + See Also + -------- + Series.dt.seconds : Return number of seconds for each element. + Series.dt.nanoseconds : Return number of nanoseconds for each element. + Examples -------- For Series: @@ -917,6 +922,11 @@ def to_pytimedelta(self) -> npt.NDArray[np.object_]: nanoseconds_docstring = textwrap.dedent( """Number of nanoseconds (>= 0 and less than 1 microsecond) for each element. + See Also + -------- + Series.dt.seconds : Return number of seconds for each element. + Series.dt.microseconds : Return number of nanoseconds for each element. + Examples -------- For Series: From b717abb3131a4cd344b463583c8dd828cd1632bc Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 9 Sep 2024 22:21:36 +0200 Subject: [PATCH 025/224] BUG (string dtype): fix inplace mutation with copy=False in ensure_string_array (#59756) * BUG (string dtype): fix inplace mutation with copy=False in ensure_string_array * update --- pandas/_libs/lib.pyx | 18 ++++++++++++------ pandas/tests/copy_view/test_astype.py | 18 +++++++++++++----- pandas/tests/libs/test_lib.py | 14 ++++++++++++++ 3 files changed, 39 insertions(+), 11 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 47a31954b9d6c..75f58f565dd6f 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -733,7 +733,9 @@ cpdef ndarray[object] ensure_string_array( convert_na_value : bool, default True If False, existing na values will be used unchanged in the new array. copy : bool, default True - Whether to ensure that a new array is returned. + Whether to ensure that a new array is returned. When True, a new array + is always returned. When False, a new array is only returned when needed + to avoid mutating the input array. skipna : bool, default True Whether or not to coerce nulls to their stringified form (e.g. if False, NaN becomes 'nan'). @@ -762,11 +764,15 @@ cpdef ndarray[object] ensure_string_array( result = np.asarray(arr, dtype="object") - if copy and (result is arr or np.shares_memory(arr, result)): - # GH#54654 - result = result.copy() - elif not copy and result is arr: - already_copied = False + if result is arr or np.may_share_memory(arr, result): + # if np.asarray(..) 
did not make a copy of the input arr, we still need + # to do that to avoid mutating the input array + # GH#54654: share_memory check is needed for rare cases where np.asarray + # returns a new object without making a copy of the actual data + if copy: + result = result.copy() + else: + already_copied = False elif not copy and not result.flags.writeable: # Weird edge case where result is a view already_copied = False diff --git a/pandas/tests/copy_view/test_astype.py b/pandas/tests/copy_view/test_astype.py index de56d5e4a07ee..80c30f2d0c26e 100644 --- a/pandas/tests/copy_view/test_astype.py +++ b/pandas/tests/copy_view/test_astype.py @@ -7,7 +7,6 @@ from pandas.compat import HAS_PYARROW from pandas.compat.pyarrow import pa_version_under12p0 -import pandas.util._test_decorators as td from pandas import ( DataFrame, @@ -111,7 +110,8 @@ def test_astype_string_and_object_update_original(dtype, new_dtype): tm.assert_frame_equal(df2, df_orig) -def test_astype_string_copy_on_pickle_roundrip(): +def test_astype_str_copy_on_pickle_roundrip(): + # TODO(infer_string) this test can be removed after 3.0 (once str is the default) # https://github.com/pandas-dev/pandas/issues/54654 # ensure_string_array may alter array inplace base = Series(np.array([(1, 2), None, 1], dtype="object")) @@ -120,14 +120,22 @@ def test_astype_string_copy_on_pickle_roundrip(): tm.assert_series_equal(base, base_copy) -@td.skip_if_no("pyarrow") -def test_astype_string_read_only_on_pickle_roundrip(): +def test_astype_string_copy_on_pickle_roundrip(any_string_dtype): + # https://github.com/pandas-dev/pandas/issues/54654 + # ensure_string_array may alter array inplace + base = Series(np.array([(1, 2), None, 1], dtype="object")) + base_copy = pickle.loads(pickle.dumps(base)) + base_copy.astype(any_string_dtype) + tm.assert_series_equal(base, base_copy) + + +def test_astype_string_read_only_on_pickle_roundrip(any_string_dtype): # https://github.com/pandas-dev/pandas/issues/54654 # ensure_string_array may alter read-only array inplace base = Series(np.array([(1, 2), None, 1], dtype="object")) base_copy = pickle.loads(pickle.dumps(base)) base_copy._values.flags.writeable = False - base_copy.astype("string[pyarrow]") + base_copy.astype(any_string_dtype) tm.assert_series_equal(base, base_copy) diff --git a/pandas/tests/libs/test_lib.py b/pandas/tests/libs/test_lib.py index 8583d8bcc052c..17dae1879f3b8 100644 --- a/pandas/tests/libs/test_lib.py +++ b/pandas/tests/libs/test_lib.py @@ -1,3 +1,5 @@ +import pickle + import numpy as np import pytest @@ -283,3 +285,15 @@ def test_no_default_pickle(): # GH#40397 obj = tm.round_trip_pickle(lib.no_default) assert obj is lib.no_default + + +def test_ensure_string_array_copy(): + # ensure the original array is not modified in case of copy=False with + # pickle-roundtripped object dtype array + # https://github.com/pandas-dev/pandas/issues/54654 + arr = np.array(["a", None], dtype=object) + arr = pickle.loads(pickle.dumps(arr)) + result = lib.ensure_string_array(arr, copy=False) + assert not np.shares_memory(arr, result) + assert arr[1] is None + assert result[1] is np.nan From 83fd9babc73fff1a5be53c3f33e8973ed9416b6e Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 9 Sep 2024 22:34:28 +0200 Subject: [PATCH 026/224] TST (string dtype): remove usage of 'string[pyarrow_numpy]' alias (#59758) --- pandas/conftest.py | 28 +++++++++++++++++++ pandas/tests/apply/test_numba.py | 6 ++-- .../tests/arrays/string_/test_string_arrow.py | 5 ++-- pandas/tests/base/test_misc.py | 4 +-- 
pandas/tests/frame/indexing/test_indexing.py | 10 ++----- pandas/tests/frame/methods/test_rank.py | 14 +++++----- pandas/tests/frame/test_constructors.py | 7 ++--- pandas/tests/groupby/methods/test_size.py | 13 ++------- .../groupby/methods/test_value_counts.py | 14 ++-------- pandas/tests/groupby/test_groupby.py | 11 ++------ pandas/tests/groupby/test_reductions.py | 5 ++-- .../indexes/base_class/test_constructors.py | 4 +-- .../tests/indexes/base_class/test_reshape.py | 7 ++--- pandas/tests/indexes/object/test_indexing.py | 23 ++++----------- pandas/tests/indexes/test_base.py | 5 ++-- pandas/tests/indexes/test_old_base.py | 5 +++- pandas/tests/interchange/test_impl.py | 8 ++++-- pandas/tests/io/json/test_pandas.py | 8 +++--- .../io/parser/dtypes/test_dtypes_basic.py | 11 +++----- pandas/tests/io/pytables/test_read.py | 5 ++-- pandas/tests/io/test_feather.py | 4 ++- pandas/tests/io/test_orc.py | 4 +-- pandas/tests/io/test_parquet.py | 8 +++--- pandas/tests/io/test_sql.py | 3 +- pandas/tests/reshape/test_get_dummies.py | 22 +++++++-------- pandas/tests/reshape/test_melt.py | 8 +++--- pandas/tests/series/test_logical_ops.py | 3 +- pandas/tests/strings/test_find_replace.py | 2 +- pandas/tests/util/test_shares_memory.py | 6 ++-- 29 files changed, 119 insertions(+), 134 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index d11213f1164bc..222aefb4afda8 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1272,6 +1272,34 @@ def string_dtype(request): return request.param +@pytest.fixture( + params=[ + ("python", pd.NA), + pytest.param(("pyarrow", pd.NA), marks=td.skip_if_no("pyarrow")), + pytest.param(("pyarrow", np.nan), marks=td.skip_if_no("pyarrow")), + ("python", np.nan), + ], + ids=[ + "string=string[python]", + "string=string[pyarrow]", + "string=str[pyarrow]", + "string=str[python]", + ], +) +def string_dtype_no_object(request): + """ + Parametrized fixture for string dtypes. 
+ * 'string[python]' (NA variant) + * 'string[pyarrow]' (NA variant) + * 'str' (NaN variant, with pyarrow) + * 'str' (NaN variant, without pyarrow) + """ + # need to instantiate the StringDtype here instead of in the params + # to avoid importing pyarrow during test collection + storage, na_value = request.param + return pd.StringDtype(storage, na_value) + + @pytest.fixture( params=[ "string[python]", diff --git a/pandas/tests/apply/test_numba.py b/pandas/tests/apply/test_numba.py index d86eeadbaa0fe..825d295043e69 100644 --- a/pandas/tests/apply/test_numba.py +++ b/pandas/tests/apply/test_numba.py @@ -5,6 +5,7 @@ import pandas.util._test_decorators as td +import pandas as pd from pandas import ( DataFrame, Index, @@ -29,11 +30,10 @@ def test_numba_vs_python_noop(float_frame, apply_axis): def test_numba_vs_python_string_index(): # GH#56189 - pytest.importorskip("pyarrow") df = DataFrame( 1, - index=Index(["a", "b"], dtype="string[pyarrow_numpy]"), - columns=Index(["x", "y"], dtype="string[pyarrow_numpy]"), + index=Index(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)), + columns=Index(["x", "y"], dtype=pd.StringDtype(na_value=np.nan)), ) func = lambda x: x result = df.apply(func, engine="numba", axis=0) diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py index b042cf632288b..d4363171788d4 100644 --- a/pandas/tests/arrays/string_/test_string_arrow.py +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -241,10 +241,11 @@ def test_setitem_invalid_indexer_raises(): arr[[0, 1]] = ["foo", "bar", "baz"] -@pytest.mark.parametrize("dtype", ["string[pyarrow]", "string[pyarrow_numpy]"]) -def test_pickle_roundtrip(dtype): +@pytest.mark.parametrize("na_value", [pd.NA, np.nan]) +def test_pickle_roundtrip(na_value): # GH 42600 pytest.importorskip("pyarrow") + dtype = StringDtype("pyarrow", na_value=na_value) expected = pd.Series(range(10), dtype=dtype) expected_sliced = expected.head(2) full_pickled = pickle.dumps(expected) diff --git a/pandas/tests/base/test_misc.py b/pandas/tests/base/test_misc.py index bbd9b150b88a8..7819b7b75f065 100644 --- a/pandas/tests/base/test_misc.py +++ b/pandas/tests/base/test_misc.py @@ -183,9 +183,7 @@ def test_access_by_position(index_flat): assert index[-1] == index[size - 1] msg = f"index {size} is out of bounds for axis 0 with size {size}" - if is_dtype_equal(index.dtype, "string[pyarrow]") or is_dtype_equal( - index.dtype, "string[pyarrow_numpy]" - ): + if isinstance(index.dtype, pd.StringDtype) and index.dtype.storage == "pyarrow": msg = "index out of bounds" with pytest.raises(IndexError, match=msg): index[size] diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 8ce4e8725d632..0723c3c70091c 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -1864,13 +1864,11 @@ def test_adding_new_conditional_column() -> None: ("dtype", "infer_string"), [ (object, False), - ("string[pyarrow_numpy]", True), + (pd.StringDtype(na_value=np.nan), True), ], ) def test_adding_new_conditional_column_with_string(dtype, infer_string) -> None: # https://github.com/pandas-dev/pandas/issues/56204 - pytest.importorskip("pyarrow") - df = DataFrame({"a": [1, 2], "b": [3, 4]}) with pd.option_context("future.infer_string", infer_string): df.loc[df["a"] == 1, "c"] = "1" @@ -1880,16 +1878,14 @@ def test_adding_new_conditional_column_with_string(dtype, infer_string) -> None: tm.assert_frame_equal(df, expected) 
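# A minimal sketch of the option these tests exercise (assuming pandas 3.x,
# where inference picks the NaN-variant string dtype):
#
#     with pd.option_context("future.infer_string", True):
#         ser = pd.Series(["a", "b"])
#     ser.dtype == pd.StringDtype(na_value=np.nan)  # True when inference is on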
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_add_new_column_infer_string(): # GH#55366 - pytest.importorskip("pyarrow") df = DataFrame({"x": [1]}) with pd.option_context("future.infer_string", True): df.loc[df["x"] == 1, "y"] = "1" expected = DataFrame( - {"x": [1], "y": Series(["1"], dtype="string[pyarrow_numpy]")}, - columns=Index(["x", "y"], dtype=object), + {"x": [1], "y": Series(["1"], dtype=pd.StringDtype(na_value=np.nan))}, + columns=Index(["x", "y"], dtype="str"), ) tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/frame/methods/test_rank.py b/pandas/tests/frame/methods/test_rank.py index 4b1435babe6b1..c1cdeaa6c10dd 100644 --- a/pandas/tests/frame/methods/test_rank.py +++ b/pandas/tests/frame/methods/test_rank.py @@ -14,6 +14,7 @@ ) from pandas.compat import HAS_PYARROW +import pandas as pd from pandas import ( DataFrame, Index, @@ -502,14 +503,13 @@ def test_rank_mixed_axis_zero(self, data, expected): result = df.rank(numeric_only=True) tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize( - "dtype, exp_dtype", - [("string[pyarrow]", "Int64"), ("string[pyarrow_numpy]", "float64")], - ) - def test_rank_string_dtype(self, dtype, exp_dtype): + def test_rank_string_dtype(self, string_dtype_no_object): # GH#55362 - pytest.importorskip("pyarrow") - obj = Series(["foo", "foo", None, "foo"], dtype=dtype) + obj = Series(["foo", "foo", None, "foo"], dtype=string_dtype_no_object) result = obj.rank(method="first") + exp_dtype = "Int64" if string_dtype_no_object.na_value is pd.NA else "float64" + if string_dtype_no_object.storage == "python": + # TODO nullable string[python] should also return nullable Int64 + exp_dtype = "float64" expected = Series([1, 2, None, 3], dtype=exp_dtype) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 0176a36fe78d7..3d46e03547c38 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -2655,8 +2655,7 @@ def test_construct_with_strings_and_none(self): def test_frame_string_inference(self): # GH#54430 - pytest.importorskip("pyarrow") - dtype = "string[pyarrow_numpy]" + dtype = pd.StringDtype(na_value=np.nan) expected = DataFrame( {"a": ["a", "b"]}, dtype=dtype, columns=Index(["a"], dtype=dtype) ) @@ -2690,8 +2689,7 @@ def test_frame_string_inference(self): def test_frame_string_inference_array_string_dtype(self): # GH#54496 - pytest.importorskip("pyarrow") - dtype = "string[pyarrow_numpy]" + dtype = pd.StringDtype(na_value=np.nan) expected = DataFrame( {"a": ["a", "b"]}, dtype=dtype, columns=Index(["a"], dtype=dtype) ) @@ -2715,7 +2713,6 @@ def test_frame_string_inference_array_string_dtype(self): def test_frame_string_inference_block_dim(self): # GH#55363 - pytest.importorskip("pyarrow") with pd.option_context("future.infer_string", True): df = DataFrame(np.array([["hello", "goodbye"], ["hello", "Hello"]])) assert df._mgr.blocks[0].ndim == 2 diff --git a/pandas/tests/groupby/methods/test_size.py b/pandas/tests/groupby/methods/test_size.py index edeac642551a0..91200f53e36bd 100644 --- a/pandas/tests/groupby/methods/test_size.py +++ b/pandas/tests/groupby/methods/test_size.py @@ -3,8 +3,6 @@ from pandas._config import using_string_dtype -import pandas.util._test_decorators as td - from pandas import ( DataFrame, Index, @@ -79,16 +77,9 @@ def test_size_series_masked_type_returns_Int64(dtype): @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", 
strict=False) -@pytest.mark.parametrize( - "dtype", - [ - object, - pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")), - pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")), - ], -) -def test_size_strings(dtype): +def test_size_strings(any_string_dtype): # GH#55627 + dtype = any_string_dtype df = DataFrame({"a": ["a", "a", "b"], "b": "a"}, dtype=dtype) result = df.groupby("a")["b"].size() exp_dtype = "Int64" if dtype == "string[pyarrow]" else "int64" diff --git a/pandas/tests/groupby/methods/test_value_counts.py b/pandas/tests/groupby/methods/test_value_counts.py index da3d626f2d777..8f8f7f64aba75 100644 --- a/pandas/tests/groupby/methods/test_value_counts.py +++ b/pandas/tests/groupby/methods/test_value_counts.py @@ -7,8 +7,6 @@ import numpy as np import pytest -import pandas.util._test_decorators as td - from pandas import ( Categorical, CategoricalIndex, @@ -373,14 +371,6 @@ def test_against_frame_and_seriesgroupby( tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize( - "dtype", - [ - object, - pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")), - pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")), - ], -) @pytest.mark.parametrize("normalize", [True, False]) @pytest.mark.parametrize( "sort, ascending, expected_rows, expected_count, expected_group_size", @@ -398,9 +388,10 @@ def test_compound( expected_rows, expected_count, expected_group_size, - dtype, + any_string_dtype, using_infer_string, ): + dtype = any_string_dtype education_df = education_df.astype(dtype) education_df.columns = education_df.columns.astype(dtype) # Multiple groupby keys and as_index=False @@ -417,6 +408,7 @@ def test_compound( expected["proportion"] = expected_count expected["proportion"] /= expected_group_size if dtype == "string[pyarrow]": + # TODO(nullable) also string[python] should return nullable dtypes expected["proportion"] = expected["proportion"].convert_dtypes() else: expected["count"] = expected_count diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 11b874d0b1608..6393468fb8ccd 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2466,20 +2466,13 @@ def test_rolling_wrong_param_min_period(): test_df.groupby("name")["val"].rolling(window=2, min_period=1).sum() -@pytest.mark.parametrize( - "dtype", - [ - object, - pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")), - ], -) -def test_by_column_values_with_same_starting_value(dtype): +def test_by_column_values_with_same_starting_value(any_string_dtype): # GH29635 df = DataFrame( { "Name": ["Thomas", "Thomas", "Thomas John"], "Credit": [1200, 1300, 900], - "Mood": Series(["sad", "happy", "happy"], dtype=dtype), + "Mood": Series(["sad", "happy", "happy"], dtype=any_string_dtype), } ) aggregate_details = {"Mood": Series.mode, "Credit": "sum"} diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py index 8a421654cdf9b..a6ea1502103c5 100644 --- a/pandas/tests/groupby/test_reductions.py +++ b/pandas/tests/groupby/test_reductions.py @@ -714,10 +714,9 @@ def test_groupby_min_max_categorical(func): @pytest.mark.parametrize("func", ["min", "max"]) -def test_min_empty_string_dtype(func): +def test_min_empty_string_dtype(func, string_dtype_no_object): # GH#55619 - pytest.importorskip("pyarrow") - dtype = "string[pyarrow_numpy]" + dtype = string_dtype_no_object df = DataFrame({"a": ["a"], "b": "a", "c": "a"}, dtype=dtype).iloc[:0] result = 
getattr(df.groupby("a"), func)() expected = DataFrame( diff --git a/pandas/tests/indexes/base_class/test_constructors.py b/pandas/tests/indexes/base_class/test_constructors.py index 6036eddce7a01..0896b97e8a40e 100644 --- a/pandas/tests/indexes/base_class/test_constructors.py +++ b/pandas/tests/indexes/base_class/test_constructors.py @@ -47,9 +47,7 @@ def test_construct_empty_tuples(self, tuple_list): def test_index_string_inference(self): # GH#54430 - pytest.importorskip("pyarrow") - dtype = "string[pyarrow_numpy]" - expected = Index(["a", "b"], dtype=dtype) + expected = Index(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)) with pd.option_context("future.infer_string", True): ser = Index(["a", "b"]) tm.assert_index_equal(ser, expected) diff --git a/pandas/tests/indexes/base_class/test_reshape.py b/pandas/tests/indexes/base_class/test_reshape.py index e17e39a334acc..56cdca49cb2b0 100644 --- a/pandas/tests/indexes/base_class/test_reshape.py +++ b/pandas/tests/indexes/base_class/test_reshape.py @@ -57,12 +57,11 @@ def test_insert_datetime_into_object(self, loc, val): tm.assert_index_equal(result, expected) assert type(expected[2]) is type(val) - def test_insert_none_into_string_numpy(self): + def test_insert_none_into_string_numpy(self, string_dtype_no_object): # GH#55365 - pytest.importorskip("pyarrow") - index = Index(["a", "b", "c"], dtype="string[pyarrow_numpy]") + index = Index(["a", "b", "c"], dtype=string_dtype_no_object) result = index.insert(-1, None) - expected = Index(["a", "b", None, "c"], dtype="string[pyarrow_numpy]") + expected = Index(["a", "b", None, "c"], dtype=string_dtype_no_object) tm.assert_index_equal(result, expected) @pytest.mark.parametrize( diff --git a/pandas/tests/indexes/object/test_indexing.py b/pandas/tests/indexes/object/test_indexing.py index 2e9ba007a45c1..ea3d068a673e8 100644 --- a/pandas/tests/indexes/object/test_indexing.py +++ b/pandas/tests/indexes/object/test_indexing.py @@ -7,7 +7,6 @@ NA, is_matching_na, ) -import pandas.util._test_decorators as td import pandas as pd from pandas import Index @@ -160,14 +159,6 @@ def test_get_indexer_non_unique_np_nats(self, np_nat_fixture, np_nat_fixture2): class TestSliceLocs: - # TODO(infer_string) parametrize over multiple string dtypes - @pytest.mark.parametrize( - "dtype", - [ - "object", - pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")), - ], - ) @pytest.mark.parametrize( "in_slice,expected", [ @@ -191,24 +182,22 @@ class TestSliceLocs: (pd.IndexSlice["m":"m":-1], ""), # type: ignore[misc] ], ) - def test_slice_locs_negative_step(self, in_slice, expected, dtype): - index = Index(list("bcdxy"), dtype=dtype) + def test_slice_locs_negative_step(self, in_slice, expected, any_string_dtype): + index = Index(list("bcdxy"), dtype=any_string_dtype) s_start, s_stop = index.slice_locs(in_slice.start, in_slice.stop, in_slice.step) result = index[s_start : s_stop : in_slice.step] - expected = Index(list(expected), dtype=dtype) + expected = Index(list(expected), dtype=any_string_dtype) tm.assert_index_equal(result, expected) - # TODO(infer_string) parametrize over multiple string dtypes - @td.skip_if_no("pyarrow") - def test_slice_locs_negative_step_oob(self): - index = Index(list("bcdxy"), dtype="string[pyarrow_numpy]") + def test_slice_locs_negative_step_oob(self, any_string_dtype): + index = Index(list("bcdxy"), dtype=any_string_dtype) result = index[-10:5:1] tm.assert_index_equal(result, index) result = index[4:-10:-1] - expected = Index(list("yxdcb"), dtype="string[pyarrow_numpy]") + expected 
= Index(list("yxdcb"), dtype=any_string_dtype) tm.assert_index_equal(result, expected) def test_slice_locs_dup(self): diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 486b24845d2ff..2b62b384930d6 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -933,10 +933,9 @@ def test_isin_empty(self, empty): result = index.isin(empty) tm.assert_numpy_array_equal(expected, result) - @td.skip_if_no("pyarrow") - def test_isin_arrow_string_null(self): + def test_isin_string_null(self, string_dtype_no_object): # GH#55821 - index = Index(["a", "b"], dtype="string[pyarrow_numpy]") + index = Index(["a", "b"], dtype=string_dtype_no_object) result = index.isin([None]) expected = np.array([False, False]) tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/indexes/test_old_base.py b/pandas/tests/indexes/test_old_base.py index 75284a8f8fd47..cd3d599abd30e 100644 --- a/pandas/tests/indexes/test_old_base.py +++ b/pandas/tests/indexes/test_old_base.py @@ -295,7 +295,10 @@ def test_ensure_copied_data(self, index): tm.assert_numpy_array_equal( index._values._ndarray, result._values._ndarray, check_same="same" ) - elif index.dtype in ("string[pyarrow]", "string[pyarrow_numpy]"): + elif ( + isinstance(index.dtype, StringDtype) + and index.dtype.storage == "pyarrow" + ): assert tm.shares_memory(result._values, index._values) else: raise NotImplementedError(index.dtype) diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py index 76910db941d36..38961345dc1f2 100644 --- a/pandas/tests/interchange/test_impl.py +++ b/pandas/tests/interchange/test_impl.py @@ -465,7 +465,7 @@ def test_non_str_names_w_duplicates(): ([1.0, 2.25, None], "Float32[pyarrow]", "float32"), ([True, False, None], "boolean", "bool"), ([True, False, None], "boolean[pyarrow]", "bool"), - (["much ado", "about", None], "string[pyarrow_numpy]", "large_string"), + (["much ado", "about", None], pd.StringDtype(na_value=np.nan), "large_string"), (["much ado", "about", None], "string[pyarrow]", "large_string"), ( [datetime(2020, 1, 1), datetime(2020, 1, 2), None], @@ -528,7 +528,11 @@ def test_pandas_nullable_with_missing_values( ([1.0, 2.25, 5.0], "Float32[pyarrow]", "float32"), ([True, False, False], "boolean", "bool"), ([True, False, False], "boolean[pyarrow]", "bool"), - (["much ado", "about", "nothing"], "string[pyarrow_numpy]", "large_string"), + ( + ["much ado", "about", "nothing"], + pd.StringDtype(na_value=np.nan), + "large_string", + ), (["much ado", "about", "nothing"], "string[pyarrow]", "large_string"), ( [datetime(2020, 1, 1), datetime(2020, 1, 2), datetime(2020, 1, 3)], diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 3d07c0219691e..1c54232b8b510 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -2245,18 +2245,18 @@ def test_pyarrow_engine_lines_false(): def test_json_roundtrip_string_inference(orient): - pytest.importorskip("pyarrow") df = DataFrame( [["a", "b"], ["c", "d"]], index=["row 1", "row 2"], columns=["col 1", "col 2"] ) out = df.to_json() with pd.option_context("future.infer_string", True): result = read_json(StringIO(out)) + dtype = pd.StringDtype(na_value=np.nan) expected = DataFrame( [["a", "b"], ["c", "d"]], - dtype="string[pyarrow_numpy]", - index=Index(["row 1", "row 2"], dtype="string[pyarrow_numpy]"), - columns=Index(["col 1", "col 2"], dtype="string[pyarrow_numpy]"), + dtype=dtype, + index=Index(["row 1", "row 
2"], dtype=dtype), + columns=Index(["col 1", "col 2"], dtype=dtype), ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index 07f29518b7881..b664423364f6b 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -547,8 +547,7 @@ def test_ea_int_avoid_overflow(all_parsers): def test_string_inference(all_parsers): # GH#54430 - pytest.importorskip("pyarrow") - dtype = "string[pyarrow_numpy]" + dtype = pd.StringDtype(na_value=np.nan) data = """a,b x,1 @@ -568,8 +567,6 @@ def test_string_inference(all_parsers): @pytest.mark.parametrize("dtype", ["O", object, "object", np.object_, str, np.str_]) def test_string_inference_object_dtype(all_parsers, dtype): # GH#56047 - pytest.importorskip("pyarrow") - data = """a,b x,a y,a @@ -583,7 +580,7 @@ def test_string_inference_object_dtype(all_parsers, dtype): "a": pd.Series(["x", "y", "z"], dtype=object), "b": pd.Series(["a", "a", "a"], dtype=object), }, - columns=pd.Index(["a", "b"], dtype="string[pyarrow_numpy]"), + columns=pd.Index(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)), ) tm.assert_frame_equal(result, expected) @@ -593,9 +590,9 @@ def test_string_inference_object_dtype(all_parsers, dtype): expected = DataFrame( { "a": pd.Series(["x", "y", "z"], dtype=object), - "b": pd.Series(["a", "a", "a"], dtype="string[pyarrow_numpy]"), + "b": pd.Series(["a", "a", "a"], dtype=pd.StringDtype(na_value=np.nan)), }, - columns=pd.Index(["a", "b"], dtype="string[pyarrow_numpy]"), + columns=pd.Index(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)), ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/pytables/test_read.py b/pandas/tests/io/pytables/test_read.py index dd3a0eabe95ae..8ae87d4bab52d 100644 --- a/pandas/tests/io/pytables/test_read.py +++ b/pandas/tests/io/pytables/test_read.py @@ -310,7 +310,6 @@ def test_read_hdf_series_mode_r(tmp_path, format, setup_path): def test_read_infer_string(tmp_path, setup_path): # GH#54431 - pytest.importorskip("pyarrow") df = DataFrame({"a": ["a", "b", None]}) path = tmp_path / setup_path df.to_hdf(path, key="data", format="table") @@ -318,8 +317,8 @@ def test_read_infer_string(tmp_path, setup_path): result = read_hdf(path, key="data", mode="r") expected = DataFrame( {"a": ["a", "b", None]}, - dtype="string[pyarrow_numpy]", - columns=Index(["a"], dtype="string[pyarrow_numpy]"), + dtype=pd.StringDtype(na_value=np.nan), + columns=Index(["a"], dtype=pd.StringDtype(na_value=np.nan)), ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index a1f3babb1ae3b..9721d045b7b91 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -243,5 +243,7 @@ def test_string_inference(self, tmp_path): df.to_feather(path) with pd.option_context("future.infer_string", True): result = read_feather(path) - expected = pd.DataFrame(data={"a": ["x", "y"]}, dtype="string[pyarrow_numpy]") + expected = pd.DataFrame( + data={"a": ["x", "y"]}, dtype=pd.StringDtype(na_value=np.nan) + ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py index 90133344fdfc9..efb3dffecd856 100644 --- a/pandas/tests/io/test_orc.py +++ b/pandas/tests/io/test_orc.py @@ -436,7 +436,7 @@ def test_string_inference(tmp_path): result = read_orc(path) expected = pd.DataFrame( data={"a": ["x", "y"]}, - dtype="string[pyarrow_numpy]", - 
columns=pd.Index(["a"], dtype="string[pyarrow_numpy]"), + dtype=pd.StringDtype(na_value=np.nan), + columns=pd.Index(["a"], dtype=pd.StringDtype(na_value=np.nan)), ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index a29e479b7c9f1..4c2ea036f08dc 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -1109,8 +1109,8 @@ def test_string_inference(self, tmp_path, pa): result = read_parquet(path, engine="pyarrow") expected = pd.DataFrame( data={"a": ["x", "y"]}, - dtype="string[pyarrow_numpy]", - index=pd.Index(["a", "b"], dtype="string[pyarrow_numpy]"), + dtype=pd.StringDtype(na_value=np.nan), + index=pd.Index(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)), ) tm.assert_frame_equal(result, expected) @@ -1140,8 +1140,8 @@ def test_infer_string_large_string_type(self, tmp_path, pa): result = read_parquet(path) expected = pd.DataFrame( data={"a": [None, "b", "c"]}, - dtype="string[pyarrow_numpy]", - columns=pd.Index(["a"], dtype="string[pyarrow_numpy]"), + dtype=pd.StringDtype(na_value=np.nan), + columns=pd.Index(["a"], dtype=pd.StringDtype(na_value=np.nan)), ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 980c88f070b89..c28a33069d23f 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -3809,7 +3809,6 @@ class Test(BaseModel): def test_read_sql_string_inference(sqlite_engine): conn = sqlite_engine # GH#54430 - pytest.importorskip("pyarrow") table = "test" df = DataFrame({"a": ["x", "y"]}) df.to_sql(table, con=conn, index=False, if_exists="replace") @@ -3817,7 +3816,7 @@ def test_read_sql_string_inference(sqlite_engine): with pd.option_context("future.infer_string", True): result = read_sql_table(table, conn) - dtype = "string[pyarrow_numpy]" + dtype = pd.StringDtype(na_value=np.nan) expected = DataFrame( {"a": ["x", "y"]}, dtype=dtype, columns=Index(["a"], dtype=dtype) ) diff --git a/pandas/tests/reshape/test_get_dummies.py b/pandas/tests/reshape/test_get_dummies.py index f07c6845366da..9ce2c925a368b 100644 --- a/pandas/tests/reshape/test_get_dummies.py +++ b/pandas/tests/reshape/test_get_dummies.py @@ -708,19 +708,17 @@ def test_get_dummies_ea_dtype_dataframe(self, any_numeric_ea_and_arrow_dtype): ) tm.assert_frame_equal(result, expected) - @td.skip_if_no("pyarrow") - def test_get_dummies_ea_dtype(self): + @pytest.mark.parametrize("dtype_type", ["string", "category"]) + def test_get_dummies_ea_dtype(self, dtype_type, string_dtype_no_object): # GH#56273 - for dtype, exp_dtype in [ - ("string[pyarrow]", "boolean"), - ("string[pyarrow_numpy]", "bool"), - (CategoricalDtype(Index(["a"], dtype="string[pyarrow]")), "boolean"), - (CategoricalDtype(Index(["a"], dtype="string[pyarrow_numpy]")), "bool"), - ]: - df = DataFrame({"name": Series(["a"], dtype=dtype), "x": 1}) - result = get_dummies(df) - expected = DataFrame({"x": 1, "name_a": Series([True], dtype=exp_dtype)}) - tm.assert_frame_equal(result, expected) + dtype = string_dtype_no_object + exp_dtype = "boolean" if dtype.na_value is pd.NA else "bool" + if dtype_type == "category": + dtype = CategoricalDtype(Index(["a"], dtype)) + df = DataFrame({"name": Series(["a"], dtype=dtype), "x": 1}) + result = get_dummies(df) + expected = DataFrame({"x": 1, "name_a": Series([True], dtype=exp_dtype)}) + tm.assert_frame_equal(result, expected) @td.skip_if_no("pyarrow") def test_get_dummies_arrow_dtype(self): diff --git a/pandas/tests/reshape/test_melt.py 
b/pandas/tests/reshape/test_melt.py index 4a12404f6775a..95aa5291cb45a 100644 --- a/pandas/tests/reshape/test_melt.py +++ b/pandas/tests/reshape/test_melt.py @@ -1242,9 +1242,9 @@ def test_missing_stubname(self, any_string_dtype): tm.assert_frame_equal(result, expected) -def test_wide_to_long_pyarrow_string_columns(): +def test_wide_to_long_string_columns(string_storage): # GH 57066 - pytest.importorskip("pyarrow") + string_dtype = pd.StringDtype(string_storage, na_value=np.nan) df = DataFrame( { "ID": {0: 1}, @@ -1254,7 +1254,7 @@ def test_wide_to_long_pyarrow_string_columns(): "D": {0: 1}, } ) - df.columns = df.columns.astype("string[pyarrow_numpy]") + df.columns = df.columns.astype(string_dtype) result = wide_to_long( df, stubnames="R", i="ID", j="UNPIVOTED", sep="_", suffix=".*" ) @@ -1264,7 +1264,7 @@ def test_wide_to_long_pyarrow_string_columns(): index=pd.MultiIndex.from_arrays( [ [1, 1, 1], - Index(["test1", "test2", "test3"], dtype="string[pyarrow_numpy]"), + Index(["test1", "test2", "test3"], dtype=string_dtype), ], names=["ID", "UNPIVOTED"], ), diff --git a/pandas/tests/series/test_logical_ops.py b/pandas/tests/series/test_logical_ops.py index 1586195e79a9d..8516018e8aa93 100644 --- a/pandas/tests/series/test_logical_ops.py +++ b/pandas/tests/series/test_logical_ops.py @@ -9,6 +9,7 @@ DataFrame, Index, Series, + StringDtype, bdate_range, ) import pandas._testing as tm @@ -514,7 +515,7 @@ def test_pyarrow_numpy_string_invalid(self): # GH#56008 pa = pytest.importorskip("pyarrow") ser = Series([False, True]) - ser2 = Series(["a", "b"], dtype="string[pyarrow_numpy]") + ser2 = Series(["a", "b"], dtype=StringDtype(na_value=np.nan)) result = ser == ser2 expected_eq = Series(False, index=ser.index) tm.assert_series_equal(result, expected_eq) diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index bf01c4996bb32..ea9f89ed129aa 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -22,7 +22,7 @@ def using_pyarrow(dtype): - return dtype in ("string[pyarrow]", "string[pyarrow_numpy]") + return dtype == "string" and dtype.storage == "pyarrow" def test_contains(any_string_dtype): diff --git a/pandas/tests/util/test_shares_memory.py b/pandas/tests/util/test_shares_memory.py index 00a897d574a07..8f1ac93b40247 100644 --- a/pandas/tests/util/test_shares_memory.py +++ b/pandas/tests/util/test_shares_memory.py @@ -1,3 +1,5 @@ +import numpy as np + import pandas.util._test_decorators as td import pandas as pd @@ -20,10 +22,10 @@ def test_shares_memory_string(): # GH#55823 import pyarrow as pa - obj = pd.array(["a", "b"], dtype="string[pyarrow]") + obj = pd.array(["a", "b"], dtype=pd.StringDtype("pyarrow", na_value=pd.NA)) assert tm.shares_memory(obj, obj) - obj = pd.array(["a", "b"], dtype="string[pyarrow_numpy]") + obj = pd.array(["a", "b"], dtype=pd.StringDtype("pyarrow", na_value=np.nan)) assert tm.shares_memory(obj, obj) obj = pd.array(["a", "b"], dtype=pd.ArrowDtype(pa.string())) From 715585de0d66383c51ce290ad6b18a036254d007 Mon Sep 17 00:00:00 2001 From: aaronchucarroll <120818400+aaronchucarroll@users.noreply.github.com> Date: Mon, 9 Sep 2024 17:38:04 -0400 Subject: [PATCH 027/224] ENH: Add dtype argument to StringMethods get_dummies() (#59577) --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/arrays/arrow/array.py | 15 +++- pandas/core/arrays/categorical.py | 4 +- pandas/core/arrays/string_arrow.py | 19 ++++- pandas/core/strings/accessor.py | 27 ++++++- pandas/core/strings/base.py | 3 +- 
pandas/core/strings/object_array.py | 13 +++- pandas/tests/strings/test_get_dummies.py | 99 ++++++++++++++++++++---- 8 files changed, 154 insertions(+), 27 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 9a29ff4d49966..819318e119668 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -55,6 +55,7 @@ Other enhancements - :meth:`Series.plot` now correctly handle the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`) - :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`) - :meth:`pandas.concat` will raise a ``ValueError`` when ``ignore_index=True`` and ``keys`` is not ``None`` (:issue:`59274`) +- :meth:`str.get_dummies` now accepts a ``dtype`` parameter to specify the dtype of the resulting DataFrame (:issue:`47872`) - Multiplying two :class:`DateOffset` objects will now raise a ``TypeError`` instead of a ``RecursionError`` (:issue:`59442`) - Restore support for reading Stata 104-format and enable reading 103-format dta files (:issue:`58554`) - Support passing a :class:`Iterable[Hashable]` input to :meth:`DataFrame.drop_duplicates` (:issue:`59237`) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 15f9ba611a642..4edf464be74f1 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -41,6 +41,7 @@ is_list_like, is_numeric_dtype, is_scalar, + pandas_dtype, ) from pandas.core.dtypes.dtypes import DatetimeTZDtype from pandas.core.dtypes.missing import isna @@ -2475,7 +2476,9 @@ def _str_findall(self, pat: str, flags: int = 0) -> Self: result = self._apply_elementwise(predicate) return type(self)(pa.chunked_array(result)) - def _str_get_dummies(self, sep: str = "|"): + def _str_get_dummies(self, sep: str = "|", dtype: NpDtype | None = None): + if dtype is None: + dtype = np.bool_ split = pc.split_pattern(self._pa_array, sep) flattened_values = pc.list_flatten(split) uniques = flattened_values.unique() @@ -2485,7 +2488,15 @@ def _str_get_dummies(self, sep: str = "|"): n_cols = len(uniques) indices = pc.index_in(flattened_values, uniques_sorted).to_numpy() indices = indices + np.arange(n_rows).repeat(lengths) * n_cols - dummies = np.zeros(n_rows * n_cols, dtype=np.bool_) + _dtype = pandas_dtype(dtype) + dummies_dtype: NpDtype + if isinstance(_dtype, np.dtype): + dummies_dtype = _dtype + else: + dummies_dtype = np.bool_ + dummies = np.zeros(n_rows * n_cols, dtype=dummies_dtype) + if dtype == str: + dummies[:] = False dummies[indices] = True dummies = dummies.reshape((n_rows, n_cols)) result = type(self)(pa.array(list(dummies))) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index c613a345686cc..8e0225b31e17b 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2681,11 +2681,11 @@ def _str_map( result = NumpyExtensionArray(categories.to_numpy())._str_map(f, na_value, dtype) return take_nd(result, codes, fill_value=na_value) - def _str_get_dummies(self, sep: str = "|"): + def _str_get_dummies(self, sep: str = "|", dtype: NpDtype | None = None): # sep may not be in categories. Just bail on this. 
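For reference, the flattened-index trick in the pyarrow-backed ``_str_get_dummies`` above can be reproduced standalone. This is a minimal sketch with made-up sample data, assuming pyarrow is installed; it is not the exact library code:

    import numpy as np
    import pyarrow as pa
    import pyarrow.compute as pc

    arr = pa.chunked_array([pa.array(["a|b", "a|c"])])
    split = pc.split_pattern(arr, "|")          # [["a", "b"], ["a", "c"]]
    flat = pc.list_flatten(split)               # ["a", "b", "a", "c"]
    uniques = flat.unique()
    uniques_sorted = uniques.take(pc.array_sort_indices(uniques))
    lengths = pc.list_value_length(split).fill_null(0).to_numpy()
    n_rows, n_cols = len(split), len(uniques)
    # column index of every token, offset to its row's slot in a flat buffer
    indices = pc.index_in(flat, uniques_sorted).to_numpy()
    indices = indices + np.arange(n_rows).repeat(lengths) * n_cols
    dummies = np.zeros(n_rows * n_cols, dtype=np.bool_)
    dummies[indices] = True
    print(dummies.reshape((n_rows, n_cols)))    # -> [[ True  True False] [ True False  True]]

The requested ``dtype`` only changes what that boolean buffer is allocated or cast as.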
from pandas.core.arrays import NumpyExtensionArray - return NumpyExtensionArray(self.astype(str))._str_get_dummies(sep) + return NumpyExtensionArray(self.astype(str))._str_get_dummies(sep, dtype) # ------------------------------------------------------------------------ # GroupBy Methods diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 1e5adf106752f..fa8c662b68f3c 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -56,6 +56,7 @@ ArrayLike, AxisInt, Dtype, + NpDtype, Scalar, Self, npt, @@ -425,12 +426,22 @@ def _str_find(self, sub: str, start: int = 0, end: int | None = None): return super()._str_find(sub, start, end) return ArrowStringArrayMixin._str_find(self, sub, start, end) - def _str_get_dummies(self, sep: str = "|"): - dummies_pa, labels = ArrowExtensionArray(self._pa_array)._str_get_dummies(sep) + def _str_get_dummies(self, sep: str = "|", dtype: NpDtype | None = None): + if dtype is None: + dtype = np.int64 + dummies_pa, labels = ArrowExtensionArray(self._pa_array)._str_get_dummies( + sep, dtype + ) if len(labels) == 0: - return np.empty(shape=(0, 0), dtype=np.int64), labels + return np.empty(shape=(0, 0), dtype=dtype), labels dummies = np.vstack(dummies_pa.to_numpy()) - return dummies.astype(np.int64, copy=False), labels + _dtype = pandas_dtype(dtype) + dummies_dtype: NpDtype + if isinstance(_dtype, np.dtype): + dummies_dtype = _dtype + else: + dummies_dtype = np.bool_ + return dummies.astype(dummies_dtype, copy=False), labels def _convert_int_result(self, result): if self.dtype.na_value is np.nan: diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index bdb88e981bcda..6d10365a1b968 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -26,6 +26,7 @@ from pandas.core.dtypes.common import ( ensure_object, is_bool_dtype, + is_extension_array_dtype, is_integer, is_list_like, is_object_dtype, @@ -54,6 +55,8 @@ Iterator, ) + from pandas._typing import NpDtype + from pandas import ( DataFrame, Index, @@ -2431,7 +2434,11 @@ def wrap( return self._wrap_result(result) @forbid_nonstring_types(["bytes"]) - def get_dummies(self, sep: str = "|"): + def get_dummies( + self, + sep: str = "|", + dtype: NpDtype | None = None, + ): """ Return DataFrame of dummy/indicator variables for Series. @@ -2442,6 +2449,8 @@ def get_dummies(self, sep: str = "|"): ---------- sep : str, default "|" String to split on. + dtype : dtype, default np.int64 + Data type for new columns. Only a single dtype is allowed. Returns ------- @@ -2466,10 +2475,24 @@ def get_dummies(self, sep: str = "|"): 0 1 1 0 1 0 0 0 2 1 0 1 + + >>> pd.Series(["a|b", np.nan, "a|c"]).str.get_dummies(dtype=bool) + a b c + 0 True True False + 1 False False False + 2 True False True """ + from pandas.core.frame import DataFrame + # we need to cast to Series of strings as only that has all # methods available for making the dummies... 
- result, name = self._data.array._str_get_dummies(sep) + result, name = self._data.array._str_get_dummies(sep, dtype) + if is_extension_array_dtype(dtype) or isinstance(dtype, ArrowDtype): + return self._wrap_result( + DataFrame(result, columns=name, dtype=dtype), + name=name, + returns_string=False, + ) return self._wrap_result( result, name=name, diff --git a/pandas/core/strings/base.py b/pandas/core/strings/base.py index 1281a03e297f9..97d906e3df077 100644 --- a/pandas/core/strings/base.py +++ b/pandas/core/strings/base.py @@ -16,6 +16,7 @@ import re from pandas._typing import ( + NpDtype, Scalar, Self, ) @@ -163,7 +164,7 @@ def _str_wrap(self, width: int, **kwargs): pass @abc.abstractmethod - def _str_get_dummies(self, sep: str = "|"): + def _str_get_dummies(self, sep: str = "|", dtype: NpDtype | None = None): pass @abc.abstractmethod diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index c6b18d7049c57..6211c7b528db9 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -18,6 +18,7 @@ import pandas._libs.ops as libops from pandas.util._exceptions import find_stack_level +from pandas.core.dtypes.common import pandas_dtype from pandas.core.dtypes.missing import isna from pandas.core.strings.base import BaseStringArrayMethods @@ -398,9 +399,11 @@ def _str_wrap(self, width: int, **kwargs): tw = textwrap.TextWrapper(**kwargs) return self._str_map(lambda s: "\n".join(tw.wrap(s))) - def _str_get_dummies(self, sep: str = "|"): + def _str_get_dummies(self, sep: str = "|", dtype: NpDtype | None = None): from pandas import Series + if dtype is None: + dtype = np.int64 arr = Series(self).fillna("") try: arr = sep + arr + sep @@ -412,7 +415,13 @@ def _str_get_dummies(self, sep: str = "|"): tags.update(ts) tags2 = sorted(tags - {""}) - dummies = np.empty((len(arr), len(tags2)), dtype=np.int64) + _dtype = pandas_dtype(dtype) + dummies_dtype: NpDtype + if isinstance(_dtype, np.dtype): + dummies_dtype = _dtype + else: + dummies_dtype = np.bool_ + dummies = np.empty((len(arr), len(tags2)), dtype=dummies_dtype) def _isin(test_elements: str, element: str) -> bool: return element in test_elements diff --git a/pandas/tests/strings/test_get_dummies.py b/pandas/tests/strings/test_get_dummies.py index 31386e4e342ae..0656f505dc745 100644 --- a/pandas/tests/strings/test_get_dummies.py +++ b/pandas/tests/strings/test_get_dummies.py @@ -1,4 +1,7 @@ import numpy as np +import pytest + +import pandas.util._test_decorators as td from pandas import ( DataFrame, @@ -8,6 +11,11 @@ _testing as tm, ) +try: + import pyarrow as pa +except ImportError: + pa = None + def test_get_dummies(any_string_dtype): s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype) @@ -32,22 +40,85 @@ def test_get_dummies_index(): tm.assert_index_equal(result, expected) -def test_get_dummies_with_name_dummy(any_string_dtype): - # GH 12180 - # Dummies named 'name' should work as expected - s = Series(["a", "b,name", "b"], dtype=any_string_dtype) - result = s.str.get_dummies(",") - expected = DataFrame([[1, 0, 0], [0, 1, 1], [0, 1, 0]], columns=["a", "b", "name"]) +# GH#47872 +@pytest.mark.parametrize( + "dtype", + [ + np.uint8, + np.int16, + np.uint16, + np.int32, + np.uint32, + np.int64, + np.uint64, + bool, + "Int8", + "Int16", + "Int32", + "Int64", + "boolean", + ], +) +def test_get_dummies_with_dtype(any_string_dtype, dtype): + s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype) + result = s.str.get_dummies("|", dtype=dtype) + expected = DataFrame( + [[1, 
1, 0], [1, 0, 1], [0, 0, 0]], columns=list("abc"), dtype=dtype + ) tm.assert_frame_equal(result, expected) -def test_get_dummies_with_name_dummy_index(): - # GH 12180 - # Dummies named 'name' should work as expected - idx = Index(["a|b", "name|c", "b|name"]) - result = idx.str.get_dummies("|") +# GH#47872 +@td.skip_if_no("pyarrow") +@pytest.mark.parametrize( + "dtype", + [ + "int8[pyarrow]", + "uint8[pyarrow]", + "int16[pyarrow]", + "uint16[pyarrow]", + "int32[pyarrow]", + "uint32[pyarrow]", + "int64[pyarrow]", + "uint64[pyarrow]", + "bool[pyarrow]", + ], +) +def test_get_dummies_with_pyarrow_dtype(any_string_dtype, dtype): + s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype) + result = s.str.get_dummies("|", dtype=dtype) + expected = DataFrame( + [[1, 1, 0], [1, 0, 1], [0, 0, 0]], + columns=list("abc"), + dtype=dtype, + ) + tm.assert_frame_equal(result, expected) - expected = MultiIndex.from_tuples( - [(1, 1, 0, 0), (0, 0, 1, 1), (0, 1, 0, 1)], names=("a", "b", "c", "name") + +# GH#47872 +def test_get_dummies_with_str_dtype(any_string_dtype): + s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype) + result = s.str.get_dummies("|", dtype=str) + expected = DataFrame( + [["T", "T", "F"], ["T", "F", "T"], ["F", "F", "F"]], + columns=list("abc"), + dtype=str, ) - tm.assert_index_equal(result, expected) + tm.assert_frame_equal(result, expected) + + +# GH#47872 +@td.skip_if_no("pyarrow") +def test_get_dummies_with_pa_str_dtype(any_string_dtype): + s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype) + result = s.str.get_dummies("|", dtype="str[pyarrow]") + expected = DataFrame( + [ + ["true", "true", "false"], + ["true", "false", "true"], + ["false", "false", "false"], + ], + columns=list("abc"), + dtype="str[pyarrow]", + ) + tm.assert_frame_equal(result, expected) From 50ac1907abeef8e6824472988a9f015dcd25bb21 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 10 Sep 2024 01:18:29 -0700 Subject: [PATCH 028/224] BUG (string): Series.str.slice with negative step (#59724) Co-authored-by: Joris Van den Bossche --- doc/source/whatsnew/v2.3.0.rst | 3 +- pandas/core/arrays/_arrow_string_mixins.py | 32 ++++++++++++++++------ pandas/core/arrays/arrow/array.py | 11 -------- pandas/core/arrays/string_arrow.py | 14 +--------- pandas/tests/extension/test_arrow.py | 1 + pandas/tests/strings/test_strings.py | 1 + 6 files changed, 28 insertions(+), 34 deletions(-) diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst index 03355f655eb28..03b3a6b55dff6 100644 --- a/doc/source/whatsnew/v2.3.0.rst +++ b/doc/source/whatsnew/v2.3.0.rst @@ -103,8 +103,9 @@ Conversion Strings ^^^^^^^ - Bug in :meth:`Series.str.replace` when ``n < 0`` for :class:`StringDtype` with ``storage="pyarrow"`` (:issue:`59628`) +- Bug in ``ser.str.slice`` with negative ``step`` with :class:`ArrowDtype` and :class:`StringDtype` with ``storage="pyarrow"`` giving incorrect results (:issue:`59710`) - Bug in the ``center`` method on :class:`Series` and :class:`Index` object ``str`` accessors with pyarrow-backed dtype not matching the python behavior in corner cases with an odd number of fill characters (:issue:`54792`) - +- Interval ^^^^^^^^ diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index 950d4cd7cc92e..32fa5e7c383b5 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -11,6 +11,7 @@ from pandas.compat import ( pa_version_under10p1, + pa_version_under11p0, pa_version_under13p0, 
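Sketching the user-visible change this patch is building toward: with a negative ``step`` and no explicit ``start``, slicing now matches Python's slice semantics instead of producing incorrect results (GH 59710). A hypothetical check, assuming a pyarrow-backed string dtype is available:

    import pandas as pd

    ser = pd.Series(["abcdef", None], dtype="string[pyarrow]")
    # "abcdef"[::-1] == "fedcba"; start falls back to -1 when step < 0
    print(ser.str.slice(step=-1))          # ["fedcba", <NA>]
    # "abcdef"[:2:-1] == "fed"
    print(ser.str.slice(stop=2, step=-1))  # ["fed", <NA>]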
pa_version_under17p0, ) @@ -22,10 +23,7 @@ import pyarrow.compute as pc if TYPE_CHECKING: - from collections.abc import ( - Callable, - Sized, - ) + from collections.abc import Callable from pandas._typing import ( Scalar, @@ -34,7 +32,7 @@ class ArrowStringArrayMixin: - _pa_array: Sized + _pa_array: pa.ChunkedArray def __init__(self, *args, **kwargs) -> None: raise NotImplementedError @@ -96,13 +94,29 @@ def _str_get(self, i: int) -> Self: selected = pc.utf8_slice_codeunits( self._pa_array, start=start, stop=stop, step=step ) - null_value = pa.scalar( - None, - type=self._pa_array.type, # type: ignore[attr-defined] - ) + null_value = pa.scalar(None, type=self._pa_array.type) result = pc.if_else(not_out_of_bounds, selected, null_value) return type(self)(result) + def _str_slice( + self, start: int | None = None, stop: int | None = None, step: int | None = None + ) -> Self: + if pa_version_under11p0: + # GH#59724 + result = self._apply_elementwise(lambda val: val[start:stop:step]) + return type(self)(pa.chunked_array(result, type=self._pa_array.type)) + if start is None: + if step is not None and step < 0: + # GH#59710 + start = -1 + else: + start = 0 + if step is None: + step = 1 + return type(self)( + pc.utf8_slice_codeunits(self._pa_array, start=start, stop=stop, step=step) + ) + def _str_slice_replace( self, start: int | None = None, stop: int | None = None, repl: str | None = None ) -> Self: diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 4edf464be74f1..41d40d8304e8f 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -2394,17 +2394,6 @@ def _str_rpartition(self, sep: str, expand: bool) -> Self: result = self._apply_elementwise(predicate) return type(self)(pa.chunked_array(result)) - def _str_slice( - self, start: int | None = None, stop: int | None = None, step: int | None = None - ) -> Self: - if start is None: - start = 0 - if step is None: - step = 1 - return type(self)( - pc.utf8_slice_codeunits(self._pa_array, start=start, stop=stop, step=step) - ) - def _str_len(self) -> Self: return type(self)(pc.utf8_length(self._pa_array)) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index fa8c662b68f3c..73dc822bb8ef5 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -294,6 +294,7 @@ def astype(self, dtype, copy: bool = True): _str_startswith = ArrowStringArrayMixin._str_startswith _str_endswith = ArrowStringArrayMixin._str_endswith _str_pad = ArrowStringArrayMixin._str_pad + _str_slice = ArrowStringArrayMixin._str_slice def _str_contains( self, pat, case: bool = True, flags: int = 0, na=np.nan, regex: bool = True @@ -352,19 +353,6 @@ def _str_fullmatch( pat = f"{pat}$" return self._str_match(pat, case, flags, na) - def _str_slice( - self, start: int | None = None, stop: int | None = None, step: int | None = None - ) -> Self: - if stop is None: - return super()._str_slice(start, stop, step) - if start is None: - start = 0 - if step is None: - step = 1 - return type(self)( - pc.utf8_slice_codeunits(self._pa_array, start=start, stop=stop, step=step) - ) - def _str_len(self): result = pc.utf8_length(self._pa_array) return self._convert_int_result(result) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index fc4f14882b9d7..f86d927ddda67 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -2036,6 +2036,7 @@ def test_str_join_string_type(): [None, 2, None, 
["ab", None]], [None, 2, 1, ["ab", None]], [1, 3, 1, ["bc", None]], + (None, None, -1, ["dcba", None]), ], ) def test_str_slice(start, stop, step, exp): diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py index 1ce46497c3c22..4995b448f7e94 100644 --- a/pandas/tests/strings/test_strings.py +++ b/pandas/tests/strings/test_strings.py @@ -394,6 +394,7 @@ def test_pipe_failures(any_string_dtype): (2, 5, None, ["foo", "bar", np.nan, "baz"]), (0, 3, -1, ["", "", np.nan, ""]), (None, None, -1, ["owtoofaa", "owtrabaa", np.nan, "xuqzabaa"]), + (None, 2, -1, ["owtoo", "owtra", np.nan, "xuqza"]), (3, 10, 2, ["oto", "ato", np.nan, "aqx"]), (3, 0, -1, ["ofa", "aba", np.nan, "aba"]), ], From de51d336d10f198cb5594ba55530c9401b4eff18 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 10 Sep 2024 16:35:18 +0200 Subject: [PATCH 029/224] String dtype: remove fallback Perfomance warnings for string methods (#59760) --- pandas/core/arrays/arrow/_arrow_utils.py | 19 ---- pandas/core/arrays/string_arrow.py | 8 -- pandas/tests/extension/test_string.py | 1 - pandas/tests/indexes/test_setops.py | 12 --- pandas/tests/strings/test_find_replace.py | 103 ++++++---------------- pandas/tests/strings/test_string_array.py | 1 - 6 files changed, 27 insertions(+), 117 deletions(-) diff --git a/pandas/core/arrays/arrow/_arrow_utils.py b/pandas/core/arrays/arrow/_arrow_utils.py index cbc9ce0252750..285c3fd465ffc 100644 --- a/pandas/core/arrays/arrow/_arrow_utils.py +++ b/pandas/core/arrays/arrow/_arrow_utils.py @@ -1,27 +1,8 @@ from __future__ import annotations -import warnings - import numpy as np import pyarrow -from pandas._config.config import get_option - -from pandas.errors import PerformanceWarning -from pandas.util._exceptions import find_stack_level - - -def fallback_performancewarning(version: str | None = None) -> None: - """ - Raise a PerformanceWarning for falling back to ExtensionArray's - non-pyarrow method - """ - if get_option("performance_warnings"): - msg = "Falling back on a non-pyarrow code path which may decrease performance." - if version is not None: - msg += f" Upgrade to pyarrow >={version} to possibly suppress this warning." 
- warnings.warn(msg, PerformanceWarning, stacklevel=find_stack_level()) - def pyarrow_array_to_numpy_and_mask( arr, dtype: np.dtype diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 73dc822bb8ef5..a669b6d669b48 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -10,8 +10,6 @@ import numpy as np -from pandas._config.config import get_option - from pandas._libs import ( lib, missing as libmissing, @@ -43,8 +41,6 @@ import pyarrow as pa import pyarrow.compute as pc - from pandas.core.arrays.arrow._arrow_utils import fallback_performancewarning - if TYPE_CHECKING: from collections.abc import ( @@ -300,8 +296,6 @@ def _str_contains( self, pat, case: bool = True, flags: int = 0, na=np.nan, regex: bool = True ): if flags: - if get_option("mode.performance_warnings"): - fallback_performancewarning() return super()._str_contains(pat, case, flags, na, regex) if not isna(na): @@ -327,8 +321,6 @@ def _str_replace( regex: bool = True, ): if isinstance(pat, re.Pattern) or callable(repl) or not case or flags: - if get_option("mode.performance_warnings"): - fallback_performancewarning() return super()._str_replace(pat, repl, n, case, flags, regex) return ArrowExtensionArray._str_replace(self, pat, repl, n, case, flags, regex) diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 17f6eb8282b23..509ae653e4793 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -209,7 +209,6 @@ def test_compare_scalar(self, data, comparison_op): ser = pd.Series(data) self._compare_other(ser, data, comparison_op, "abc") - @pytest.mark.filterwarnings("ignore:Falling back:pandas.errors.PerformanceWarning") def test_groupby_extension_apply(self, data_for_grouping, groupby_apply_op): super().test_groupby_extension_apply(data_for_grouping, groupby_apply_op) diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py index 8fd349dacf9e9..e5dc47be20677 100644 --- a/pandas/tests/indexes/test_setops.py +++ b/pandas/tests/indexes/test_setops.py @@ -246,9 +246,6 @@ def test_intersection_base(self, index): with pytest.raises(TypeError, match=msg): first.intersection([1, 2, 3]) - @pytest.mark.filterwarnings( - "ignore:Falling back on a non-pyarrow:pandas.errors.PerformanceWarning" - ) @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") def test_union_base(self, index): index = index.unique() @@ -276,9 +273,6 @@ def test_union_base(self, index): first.union([1, 2, 3]) @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") - @pytest.mark.filterwarnings( - "ignore:Falling back on a non-pyarrow:pandas.errors.PerformanceWarning" - ) def test_difference_base(self, sort, index): first = index[2:] second = index[:4] @@ -305,9 +299,6 @@ def test_difference_base(self, sort, index): first.difference([1, 2, 3], sort) @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") - @pytest.mark.filterwarnings( - "ignore:Falling back on a non-pyarrow:pandas.errors.PerformanceWarning" - ) def test_symmetric_difference(self, index): if isinstance(index, CategoricalIndex): pytest.skip(f"Not relevant for {type(index).__name__}") @@ -529,9 +520,6 @@ def test_intersection_difference_match_empty(self, index, sort): @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") -@pytest.mark.filterwarnings( - "ignore:Falling back on a 
non-pyarrow:pandas.errors.PerformanceWarning" -) @pytest.mark.parametrize( "method", ["intersection", "union", "difference", "symmetric_difference"] ) diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index ea9f89ed129aa..f3698a2ea33cf 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -21,10 +21,6 @@ # -------------------------------------------------------------------------------------- -def using_pyarrow(dtype): - return dtype == "string" and dtype.storage == "pyarrow" - - def test_contains(any_string_dtype): values = np.array( ["foo", np.nan, "fooommm__foo", "mmm_", "foommm[_]+bar"], dtype=np.object_ @@ -458,13 +454,10 @@ def test_replace_mixed_object(): tm.assert_series_equal(result, expected) -def test_replace_unicode(any_string_dtype, performance_warning): +def test_replace_unicode(any_string_dtype): ser = Series([b"abcd,\xc3\xa0".decode("utf-8")], dtype=any_string_dtype) expected = Series([b"abcd, \xc3\xa0".decode("utf-8")], dtype=any_string_dtype) - with tm.maybe_produces_warning( - performance_warning, using_pyarrow(any_string_dtype) - ): - result = ser.str.replace(r"(?<=\w),(?=\w)", ", ", flags=re.UNICODE, regex=True) + result = ser.str.replace(r"(?<=\w),(?=\w)", ", ", flags=re.UNICODE, regex=True) tm.assert_series_equal(result, expected) @@ -478,16 +471,13 @@ def test_replace_wrong_repl_type_raises(any_string_dtype, index_or_series, repl, obj.str.replace("a", repl) -def test_replace_callable(any_string_dtype, performance_warning): +def test_replace_callable(any_string_dtype): # GH 15055 ser = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype) # test with callable repl = lambda m: m.group(0).swapcase() - with tm.maybe_produces_warning( - performance_warning, using_pyarrow(any_string_dtype) - ): - result = ser.str.replace("[a-z][A-Z]{2}", repl, n=2, regex=True) + result = ser.str.replace("[a-z][A-Z]{2}", repl, n=2, regex=True) expected = Series(["foObaD__baRbaD", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) @@ -495,7 +485,7 @@ def test_replace_callable(any_string_dtype, performance_warning): @pytest.mark.parametrize( "repl", [lambda: None, lambda m, x: None, lambda m, x, y=None: None] ) -def test_replace_callable_raises(any_string_dtype, performance_warning, repl): +def test_replace_callable_raises(any_string_dtype, repl): # GH 15055 values = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype) @@ -504,43 +494,31 @@ def test_replace_callable_raises(any_string_dtype, performance_warning, repl): r"((takes)|(missing)) (?(2)from \d+ to )?\d+ " r"(?(3)required )positional arguments?" 
) - if not using_pyarrow(any_string_dtype): - performance_warning = False with pytest.raises(TypeError, match=msg): - with tm.assert_produces_warning(performance_warning): - values.str.replace("a", repl, regex=True) + values.str.replace("a", repl, regex=True) -def test_replace_callable_named_groups(any_string_dtype, performance_warning): +def test_replace_callable_named_groups(any_string_dtype): # test regex named groups ser = Series(["Foo Bar Baz", np.nan], dtype=any_string_dtype) pat = r"(?P\w+) (?P\w+) (?P\w+)" repl = lambda m: m.group("middle").swapcase() - with tm.maybe_produces_warning( - performance_warning, using_pyarrow(any_string_dtype) - ): - result = ser.str.replace(pat, repl, regex=True) + result = ser.str.replace(pat, repl, regex=True) expected = Series(["bAR", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) -def test_replace_compiled_regex(any_string_dtype, performance_warning): +def test_replace_compiled_regex(any_string_dtype): # GH 15446 ser = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype) # test with compiled regex pat = re.compile(r"BAD_*") - with tm.maybe_produces_warning( - performance_warning, using_pyarrow(any_string_dtype) - ): - result = ser.str.replace(pat, "", regex=True) + result = ser.str.replace(pat, "", regex=True) expected = Series(["foobar", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) - with tm.maybe_produces_warning( - performance_warning, using_pyarrow(any_string_dtype) - ): - result = ser.str.replace(pat, "", n=1, regex=True) + result = ser.str.replace(pat, "", n=1, regex=True) expected = Series(["foobarBAD", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) @@ -557,14 +535,11 @@ def test_replace_compiled_regex_mixed_object(): tm.assert_series_equal(result, expected) -def test_replace_compiled_regex_unicode(any_string_dtype, performance_warning): +def test_replace_compiled_regex_unicode(any_string_dtype): ser = Series([b"abcd,\xc3\xa0".decode("utf-8")], dtype=any_string_dtype) expected = Series([b"abcd, \xc3\xa0".decode("utf-8")], dtype=any_string_dtype) pat = re.compile(r"(?<=\w),(?=\w)", flags=re.UNICODE) - with tm.maybe_produces_warning( - performance_warning, using_pyarrow(any_string_dtype) - ): - result = ser.str.replace(pat, ", ", regex=True) + result = ser.str.replace(pat, ", ", regex=True) tm.assert_series_equal(result, expected) @@ -586,15 +561,12 @@ def test_replace_compiled_regex_raises(any_string_dtype): ser.str.replace(pat, "", case=True, regex=True) -def test_replace_compiled_regex_callable(any_string_dtype, performance_warning): +def test_replace_compiled_regex_callable(any_string_dtype): # test with callable ser = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype) repl = lambda m: m.group(0).swapcase() pat = re.compile("[a-z][A-Z]{2}") - with tm.maybe_produces_warning( - performance_warning, using_pyarrow(any_string_dtype) - ): - result = ser.str.replace(pat, repl, n=2, regex=True) + result = ser.str.replace(pat, repl, n=2, regex=True) expected = Series(["foObaD__baRbaD", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) @@ -626,7 +598,7 @@ def test_replace_literal_compiled_raises(any_string_dtype): ser.str.replace(pat, "", regex=False) -def test_replace_moar(any_string_dtype, performance_warning): +def test_replace_moar(any_string_dtype): # PR #1179 ser = Series( ["A", "B", "C", "Aaba", "Baca", "", np.nan, "CABA", "dog", "cat"], @@ -640,10 +612,7 @@ def test_replace_moar(any_string_dtype, performance_warning): ) 
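These test simplifications mirror the behavior change: string methods that fall back to the object code path no longer emit ``PerformanceWarning``. A quick check, assuming pandas built from this branch with pyarrow installed:

    import re
    import warnings

    import pandas as pd

    ser = pd.Series(["fooBAD", None], dtype="string[pyarrow]")
    with warnings.catch_warnings():
        warnings.simplefilter("error")  # a fallback warning would now raise
        result = ser.str.replace(re.compile("BAD"), "", regex=True)
    print(result)  # ["foo", <NA>], computed on the object path, silently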
tm.assert_series_equal(result, expected) - with tm.maybe_produces_warning( - performance_warning, using_pyarrow(any_string_dtype) - ): - result = ser.str.replace("A", "YYY", case=False) + result = ser.str.replace("A", "YYY", case=False) expected = Series( [ "YYY", @@ -661,10 +630,7 @@ def test_replace_moar(any_string_dtype, performance_warning): ) tm.assert_series_equal(result, expected) - with tm.maybe_produces_warning( - performance_warning, using_pyarrow(any_string_dtype) - ): - result = ser.str.replace("^.a|dog", "XX-XX ", case=False, regex=True) + result = ser.str.replace("^.a|dog", "XX-XX ", case=False, regex=True) expected = Series( [ "A", @@ -683,21 +649,15 @@ def test_replace_moar(any_string_dtype, performance_warning): tm.assert_series_equal(result, expected) -def test_replace_not_case_sensitive_not_regex(any_string_dtype, performance_warning): +def test_replace_not_case_sensitive_not_regex(any_string_dtype): # https://github.com/pandas-dev/pandas/issues/41602 ser = Series(["A.", "a.", "Ab", "ab", np.nan], dtype=any_string_dtype) - with tm.maybe_produces_warning( - performance_warning, using_pyarrow(any_string_dtype) - ): - result = ser.str.replace("a", "c", case=False, regex=False) + result = ser.str.replace("a", "c", case=False, regex=False) expected = Series(["c.", "c.", "cb", "cb", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) - with tm.maybe_produces_warning( - performance_warning, using_pyarrow(any_string_dtype) - ): - result = ser.str.replace("a.", "c.", case=False, regex=False) + result = ser.str.replace("a.", "c.", case=False, regex=False) expected = Series(["c.", "c.", "Ab", "ab", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) @@ -853,7 +813,7 @@ def test_fullmatch_na_kwarg(any_string_dtype): tm.assert_series_equal(result, expected) -def test_fullmatch_case_kwarg(any_string_dtype, performance_warning): +def test_fullmatch_case_kwarg(any_string_dtype): ser = Series(["ab", "AB", "abc", "ABC"], dtype=any_string_dtype) expected_dtype = ( np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" @@ -869,10 +829,7 @@ def test_fullmatch_case_kwarg(any_string_dtype, performance_warning): result = ser.str.fullmatch("ab", case=False) tm.assert_series_equal(result, expected) - with tm.maybe_produces_warning( - performance_warning, using_pyarrow(any_string_dtype) - ): - result = ser.str.fullmatch("ab", flags=re.IGNORECASE) + result = ser.str.fullmatch("ab", flags=re.IGNORECASE) tm.assert_series_equal(result, expected) @@ -1046,7 +1003,7 @@ def test_translate_mixed_object(): # -------------------------------------------------------------------------------------- -def test_flags_kwarg(any_string_dtype, performance_warning): +def test_flags_kwarg(any_string_dtype): data = { "Dave": "dave@google.com", "Steve": "steve@gmail.com", @@ -1057,17 +1014,13 @@ def test_flags_kwarg(any_string_dtype, performance_warning): pat = r"([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})" - use_pyarrow = using_pyarrow(any_string_dtype) - result = data.str.extract(pat, flags=re.IGNORECASE, expand=True) assert result.iloc[0].tolist() == ["dave", "google", "com"] - with tm.maybe_produces_warning(performance_warning, use_pyarrow): - result = data.str.match(pat, flags=re.IGNORECASE) + result = data.str.match(pat, flags=re.IGNORECASE) assert result.iloc[0] - with tm.maybe_produces_warning(performance_warning, use_pyarrow): - result = data.str.fullmatch(pat, flags=re.IGNORECASE) + result = data.str.fullmatch(pat, flags=re.IGNORECASE) assert 
result.iloc[0] result = data.str.findall(pat, flags=re.IGNORECASE) @@ -1077,8 +1030,6 @@ def test_flags_kwarg(any_string_dtype): assert result.iloc[0] == 1 msg = "has match groups" - with tm.assert_produces_warning( - UserWarning, match=msg, raise_on_extra_warnings=not use_pyarrow - ): + with tm.assert_produces_warning(UserWarning, match=msg): result = data.str.contains(pat, flags=re.IGNORECASE) assert result.iloc[0] diff --git a/pandas/tests/strings/test_string_array.py b/pandas/tests/strings/test_string_array.py index 0b3f368afea5e..517ddb164985c 100644 --- a/pandas/tests/strings/test_string_array.py +++ b/pandas/tests/strings/test_string_array.py @@ -12,7 +12,6 @@ ) -@pytest.mark.filterwarnings("ignore:Falling back") def test_string_array(nullable_string_dtype, any_string_method): method_name, args, kwargs = any_string_method From 16b7288eccdf29efcb430616e77cd701497fe8ed Mon Sep 17 00:00:00 2001 From: ammar-qazi Date: Tue, 10 Sep 2024 19:11:32 +0200 Subject: [PATCH 030/224] DOC: Add docstring for ExtensionArray interpolate (#59749) * Update docstring of ExtensionArray.interpolate * Remove ExtensionArray.interpolate from code_checks.sh * Resolving pre-commit errors * Resolving pre-commit errors 2 * Resolved ruff formatting error * Fix issues after review --- ci/code_checks.sh | 1 - pandas/core/arrays/base.py | 78 +++++++++++++++++++++++++++++++++++++- 2 files changed, 76 insertions(+), 3 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 06078d8958492..2aa256b65a493 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -116,7 +116,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Timestamp.resolution PR02" \ -i "pandas.Timestamp.tzinfo GL08" \ -i "pandas.Timestamp.year GL08" \ - -i "pandas.api.extensions.ExtensionArray.interpolate PR01,SA01" \ -i "pandas.api.types.is_dict_like PR07,SA01" \ -i "pandas.api.types.is_extension_array_dtype SA01" \ -i "pandas.api.types.is_file_like PR07,SA01" \ diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 536c7303a2f92..a933a9ce11646 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -999,16 +999,74 @@ def interpolate( **kwargs, ) -> Self: """ - See DataFrame.interpolate.__doc__. + Fill NaN values using an interpolation method. + + Parameters + ---------- + method : str, default 'linear' + Interpolation technique to use. One of: + * 'linear': Ignore the index and treat the values as equally spaced. + This is the only method supported on MultiIndexes. + * 'time': Works on daily and higher resolution data to interpolate + given length of interval. + * 'index', 'values': use the actual numerical values of the index. + * 'pad': Fill in NaNs using existing values. + * 'nearest', 'zero', 'slinear', 'quadratic', 'cubic', 'barycentric', + 'polynomial': Passed to scipy.interpolate.interp1d, whereas 'spline' + is passed to scipy.interpolate.UnivariateSpline. These methods use + the numerical values of the index. + Both 'polynomial' and 'spline' require that you also specify an + order (int), e.g. arr.interpolate(method='polynomial', order=5). + * 'krogh', 'piecewise_polynomial', 'spline', 'pchip', 'akima', + 'cubicspline': Wrappers around the SciPy interpolation methods + of similar names. See Notes. + * 'from_derivatives': Refers to scipy.interpolate.BPoly.from_derivatives. + axis : int + Axis to interpolate along. For 1-dimensional data, use 0. + index : Index + Index to use for interpolation.
+ limit : int or None + Maximum number of consecutive NaNs to fill. Must be greater than 0. + limit_direction : {'forward', 'backward', 'both'} + Consecutive NaNs will be filled in this direction. + limit_area : {'inside', 'outside'} or None + If limit is specified, consecutive NaNs will be filled with this + restriction. + * None: No fill restriction. + * 'inside': Only fill NaNs surrounded by valid values (interpolate). + * 'outside': Only fill NaNs outside valid values (extrapolate). + copy : bool + If True, a copy of the object is returned with interpolated values. + **kwargs : optional + Keyword arguments to pass on to the interpolating function. + + Returns + ------- + ExtensionArray + An ExtensionArray with interpolated values. + + See Also + -------- + Series.interpolate : Interpolate values in a Series. + DataFrame.interpolate : Interpolate values in a DataFrame. + + Notes + ----- + - All parameters must be specified as keyword arguments. + - The 'krogh', 'piecewise_polynomial', 'spline', 'pchip' and 'akima' + methods are wrappers around the respective SciPy implementations of + similar names. These use the actual numerical values of the index. Examples -------- + Interpolating values in a NumPy array: + >>> arr = pd.arrays.NumpyExtensionArray(np.array([0, 1, np.nan, 3])) >>> arr.interpolate( ... method="linear", ... limit=3, ... limit_direction="forward", - ... index=pd.Index([1, 2, 3, 4]), + ... index=pd.Index(range(len(arr))), ... fill_value=1, ... copy=False, ... axis=0, @@ -1017,6 +1075,22 @@ def interpolate( [0.0, 1.0, 2.0, 3.0] Length: 4, dtype: float64 + + Interpolating values in a FloatingArray: + + >>> arr = pd.array([1.0, pd.NA, 3.0, 4.0, pd.NA, 6.0], dtype="Float64") + >>> arr.interpolate( + ... method="linear", + ... axis=0, + ... index=pd.Index(range(len(arr))), + ... limit=None, + ... limit_direction="both", + ... limit_area=None, + ... copy=True, + ... 
) + + [1.0, 2.0, 3.0, 4.0, 5.0, 6.0] + Length: 6, dtype: Float64 """ # NB: we return type(self) even if copy=False raise NotImplementedError( From 4444e5279b2a42b927044d65cbd894abd33fa724 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 11 Sep 2024 12:40:01 -0700 Subject: [PATCH 031/224] REF (string): de-duplicate ArrowStringArray methods (#59555) --- pandas/core/arrays/_arrow_string_mixins.py | 83 ++++++++++++++++ pandas/core/arrays/arrow/array.py | 86 +---------------- pandas/core/arrays/string_arrow.py | 106 ++++----------------- 3 files changed, 103 insertions(+), 172 deletions(-) diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index 32fa5e7c383b5..aa5b28c71b12a 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -1,6 +1,7 @@ from __future__ import annotations from functools import partial +import re from typing import ( TYPE_CHECKING, Any, @@ -48,6 +49,37 @@ def _convert_int_result(self, result): def _apply_elementwise(self, func: Callable) -> list[list[Any]]: raise NotImplementedError + def _str_len(self): + result = pc.utf8_length(self._pa_array) + return self._convert_int_result(result) + + def _str_lower(self) -> Self: + return type(self)(pc.utf8_lower(self._pa_array)) + + def _str_upper(self) -> Self: + return type(self)(pc.utf8_upper(self._pa_array)) + + def _str_strip(self, to_strip=None) -> Self: + if to_strip is None: + result = pc.utf8_trim_whitespace(self._pa_array) + else: + result = pc.utf8_trim(self._pa_array, characters=to_strip) + return type(self)(result) + + def _str_lstrip(self, to_strip=None) -> Self: + if to_strip is None: + result = pc.utf8_ltrim_whitespace(self._pa_array) + else: + result = pc.utf8_ltrim(self._pa_array, characters=to_strip) + return type(self)(result) + + def _str_rstrip(self, to_strip=None) -> Self: + if to_strip is None: + result = pc.utf8_rtrim_whitespace(self._pa_array) + else: + result = pc.utf8_rtrim(self._pa_array, characters=to_strip) + return type(self)(result) + def _str_pad( self, width: int, @@ -128,6 +160,33 @@ def _str_slice_replace( stop = np.iinfo(np.int64).max return type(self)(pc.utf8_replace_slice(self._pa_array, start, stop, repl)) + def _str_replace( + self, + pat: str | re.Pattern, + repl: str | Callable, + n: int = -1, + case: bool = True, + flags: int = 0, + regex: bool = True, + ) -> Self: + if isinstance(pat, re.Pattern) or callable(repl) or not case or flags: + raise NotImplementedError( + "replace is not supported with a re.Pattern, callable repl, " + "case=False, or flags!=0" + ) + + func = pc.replace_substring_regex if regex else pc.replace_substring + # https://github.com/apache/arrow/issues/39149 + # GH 56404, unexpected behavior with negative max_replacements with pyarrow. 
+ pa_max_replacements = None if n < 0 else n + result = func( + self._pa_array, + pattern=pat, + replacement=repl, + max_replacements=pa_max_replacements, + ) + return type(self)(result) + def _str_capitalize(self) -> Self: return type(self)(pc.utf8_capitalize(self._pa_array)) @@ -137,6 +196,16 @@ def _str_title(self) -> Self: def _str_swapcase(self) -> Self: return type(self)(pc.utf8_swapcase(self._pa_array)) + def _str_removeprefix(self, prefix: str): + if not pa_version_under13p0: + starts_with = pc.starts_with(self._pa_array, pattern=prefix) + removed = pc.utf8_slice_codeunits(self._pa_array, len(prefix)) + result = pc.if_else(starts_with, removed, self._pa_array) + return type(self)(result) + predicate = lambda val: val.removeprefix(prefix) + result = self._apply_elementwise(predicate) + return type(self)(pa.chunked_array(result)) + def _str_removesuffix(self, suffix: str): ends_with = pc.ends_with(self._pa_array, pattern=suffix) removed = pc.utf8_slice_codeunits(self._pa_array, 0, stop=-len(suffix)) @@ -228,6 +297,20 @@ def _str_contains( result = result.fill_null(na) return self._convert_bool_result(result) + def _str_match( + self, pat: str, case: bool = True, flags: int = 0, na: Scalar | None = None + ): + if not pat.startswith("^"): + pat = f"^{pat}" + return self._str_contains(pat, case, flags, na, regex=True) + + def _str_fullmatch( + self, pat, case: bool = True, flags: int = 0, na: Scalar | None = None + ): + if not pat.endswith("$") or pat.endswith("\\$"): + pat = f"{pat}$" + return self._str_match(pat, case, flags, na) + def _str_find(self, sub: str, start: int = 0, end: int | None = None): if ( pa_version_under13p0 diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 41d40d8304e8f..bd94447f0cd80 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1999,7 +1999,7 @@ def _rank( """ See Series.rank.__doc__. """ - return type(self)( + return self._convert_int_result( self._rank_calc( axis=axis, method=method, @@ -2323,36 +2323,6 @@ def _str_count(self, pat: str, flags: int = 0) -> Self: raise NotImplementedError(f"count not implemented with {flags=}") return type(self)(pc.count_substring_regex(self._pa_array, pat)) - def _result_converter(self, result): - return type(self)(result) - - def _str_replace( - self, - pat: str | re.Pattern, - repl: str | Callable, - n: int = -1, - case: bool = True, - flags: int = 0, - regex: bool = True, - ) -> Self: - if isinstance(pat, re.Pattern) or callable(repl) or not case or flags: - raise NotImplementedError( - "replace is not supported with a re.Pattern, callable repl, " - "case=False, or flags!=0" - ) - - func = pc.replace_substring_regex if regex else pc.replace_substring - # https://github.com/apache/arrow/issues/39149 - # GH 56404, unexpected behavior with negative max_replacements with pyarrow. 
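The ``pa_max_replacements`` mapping above (moved into the mixin from ``ArrowExtensionArray``) encodes the GH 56404 note: pandas' ``n < 0`` convention for "replace all" is translated to pyarrow's ``max_replacements=None`` rather than forwarded, since pyarrow handles negative caps differently (apache/arrow#39149). A standalone illustration with made-up data:

    import pyarrow as pa
    import pyarrow.compute as pc

    arr = pa.chunked_array([["aaa", "aba"]])
    # max_replacements=None is pyarrow's spelling of "replace every match"
    out = pc.replace_substring(arr, pattern="a", replacement="b", max_replacements=None)
    print(out)  # ["bbb", "bbb"]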
- pa_max_replacements = None if n < 0 else n - result = func( - self._pa_array, - pattern=pat, - replacement=repl, - max_replacements=pa_max_replacements, - ) - return type(self)(result) - def _str_repeat(self, repeats: int | Sequence[int]) -> Self: if not isinstance(repeats, int): raise NotImplementedError( @@ -2360,20 +2330,6 @@ def _str_repeat(self, repeats: int | Sequence[int]) -> Self: ) return type(self)(pc.binary_repeat(self._pa_array, repeats)) - def _str_match( - self, pat: str, case: bool = True, flags: int = 0, na: Scalar | None = None - ) -> Self: - if not pat.startswith("^"): - pat = f"^{pat}" - return self._str_contains(pat, case, flags, na, regex=True) - - def _str_fullmatch( - self, pat, case: bool = True, flags: int = 0, na: Scalar | None = None - ) -> Self: - if not pat.endswith("$") or pat.endswith("\\$"): - pat = f"{pat}$" - return self._str_match(pat, case, flags, na) - def _str_join(self, sep: str) -> Self: if pa.types.is_string(self._pa_array.type) or pa.types.is_large_string( self._pa_array.type @@ -2394,46 +2350,6 @@ def _str_rpartition(self, sep: str, expand: bool) -> Self: result = self._apply_elementwise(predicate) return type(self)(pa.chunked_array(result)) - def _str_len(self) -> Self: - return type(self)(pc.utf8_length(self._pa_array)) - - def _str_lower(self) -> Self: - return type(self)(pc.utf8_lower(self._pa_array)) - - def _str_upper(self) -> Self: - return type(self)(pc.utf8_upper(self._pa_array)) - - def _str_strip(self, to_strip=None) -> Self: - if to_strip is None: - result = pc.utf8_trim_whitespace(self._pa_array) - else: - result = pc.utf8_trim(self._pa_array, characters=to_strip) - return type(self)(result) - - def _str_lstrip(self, to_strip=None) -> Self: - if to_strip is None: - result = pc.utf8_ltrim_whitespace(self._pa_array) - else: - result = pc.utf8_ltrim(self._pa_array, characters=to_strip) - return type(self)(result) - - def _str_rstrip(self, to_strip=None) -> Self: - if to_strip is None: - result = pc.utf8_rtrim_whitespace(self._pa_array) - else: - result = pc.utf8_rtrim(self._pa_array, characters=to_strip) - return type(self)(result) - - def _str_removeprefix(self, prefix: str): - if not pa_version_under13p0: - starts_with = pc.starts_with(self._pa_array, pattern=prefix) - removed = pc.utf8_slice_codeunits(self._pa_array, len(prefix)) - result = pc.if_else(starts_with, removed, self._pa_array) - return type(self)(result) - predicate = lambda val: val.removeprefix(prefix) - result = self._apply_elementwise(predicate) - return type(self)(pa.chunked_array(result)) - def _str_casefold(self) -> Self: predicate = lambda val: val.casefold() result = self._apply_elementwise(predicate) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index a669b6d669b48..f446cc5bde147 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -50,10 +50,8 @@ from pandas._typing import ( ArrayLike, - AxisInt, Dtype, NpDtype, - Scalar, Self, npt, ) @@ -290,6 +288,20 @@ def astype(self, dtype, copy: bool = True): _str_startswith = ArrowStringArrayMixin._str_startswith _str_endswith = ArrowStringArrayMixin._str_endswith _str_pad = ArrowStringArrayMixin._str_pad + _str_match = ArrowStringArrayMixin._str_match + _str_fullmatch = ArrowStringArrayMixin._str_fullmatch + _str_lower = ArrowStringArrayMixin._str_lower + _str_upper = ArrowStringArrayMixin._str_upper + _str_strip = ArrowStringArrayMixin._str_strip + _str_lstrip = ArrowStringArrayMixin._str_lstrip + _str_rstrip = 
ArrowStringArrayMixin._str_rstrip + _str_removesuffix = ArrowStringArrayMixin._str_removesuffix + _str_get = ArrowStringArrayMixin._str_get + _str_capitalize = ArrowStringArrayMixin._str_capitalize + _str_title = ArrowStringArrayMixin._str_title + _str_swapcase = ArrowStringArrayMixin._str_swapcase + _str_slice_replace = ArrowStringArrayMixin._str_slice_replace + _str_len = ArrowStringArrayMixin._str_len _str_slice = ArrowStringArrayMixin._str_slice def _str_contains( @@ -323,73 +335,21 @@ def _str_replace( if isinstance(pat, re.Pattern) or callable(repl) or not case or flags: return super()._str_replace(pat, repl, n, case, flags, regex) - return ArrowExtensionArray._str_replace(self, pat, repl, n, case, flags, regex) + return ArrowStringArrayMixin._str_replace( + self, pat, repl, n, case, flags, regex + ) def _str_repeat(self, repeats: int | Sequence[int]): if not isinstance(repeats, int): return super()._str_repeat(repeats) else: - return type(self)(pc.binary_repeat(self._pa_array, repeats)) - - def _str_match( - self, pat: str, case: bool = True, flags: int = 0, na: Scalar | None = None - ): - if not pat.startswith("^"): - pat = f"^{pat}" - return self._str_contains(pat, case, flags, na, regex=True) - - def _str_fullmatch( - self, pat, case: bool = True, flags: int = 0, na: Scalar | None = None - ): - if not pat.endswith("$") or pat.endswith("\\$"): - pat = f"{pat}$" - return self._str_match(pat, case, flags, na) - - def _str_len(self): - result = pc.utf8_length(self._pa_array) - return self._convert_int_result(result) - - def _str_lower(self) -> Self: - return type(self)(pc.utf8_lower(self._pa_array)) - - def _str_upper(self) -> Self: - return type(self)(pc.utf8_upper(self._pa_array)) - - def _str_strip(self, to_strip=None) -> Self: - if to_strip is None: - result = pc.utf8_trim_whitespace(self._pa_array) - else: - result = pc.utf8_trim(self._pa_array, characters=to_strip) - return type(self)(result) - - def _str_lstrip(self, to_strip=None) -> Self: - if to_strip is None: - result = pc.utf8_ltrim_whitespace(self._pa_array) - else: - result = pc.utf8_ltrim(self._pa_array, characters=to_strip) - return type(self)(result) - - def _str_rstrip(self, to_strip=None) -> Self: - if to_strip is None: - result = pc.utf8_rtrim_whitespace(self._pa_array) - else: - result = pc.utf8_rtrim(self._pa_array, characters=to_strip) - return type(self)(result) + return ArrowExtensionArray._str_repeat(self, repeats=repeats) def _str_removeprefix(self, prefix: str): if not pa_version_under13p0: - starts_with = pc.starts_with(self._pa_array, pattern=prefix) - removed = pc.utf8_slice_codeunits(self._pa_array, len(prefix)) - result = pc.if_else(starts_with, removed, self._pa_array) - return type(self)(result) + return ArrowStringArrayMixin._str_removeprefix(self, prefix) return super()._str_removeprefix(prefix) - def _str_removesuffix(self, suffix: str): - ends_with = pc.ends_with(self._pa_array, pattern=suffix) - removed = pc.utf8_slice_codeunits(self._pa_array, 0, stop=-len(suffix)) - result = pc.if_else(ends_with, removed, self._pa_array) - return type(self)(result) - def _str_count(self, pat: str, flags: int = 0): if flags: return super()._str_count(pat, flags) @@ -456,28 +416,6 @@ def _reduce( else: return result - def _rank( - self, - *, - axis: AxisInt = 0, - method: str = "average", - na_option: str = "keep", - ascending: bool = True, - pct: bool = False, - ): - """ - See Series.rank.__doc__. 
- """ - return self._convert_int_result( - self._rank_calc( - axis=axis, - method=method, - na_option=na_option, - ascending=ascending, - pct=pct, - ) - ) - def value_counts(self, dropna: bool = True) -> Series: result = super().value_counts(dropna=dropna) if self.dtype.na_value is np.nan: @@ -499,9 +437,3 @@ def _cmp_method(self, other, op): class ArrowStringArrayNumpySemantics(ArrowStringArray): _na_value = np.nan - _str_get = ArrowStringArrayMixin._str_get - _str_removesuffix = ArrowStringArrayMixin._str_removesuffix - _str_capitalize = ArrowStringArrayMixin._str_capitalize - _str_title = ArrowStringArrayMixin._str_title - _str_swapcase = ArrowStringArrayMixin._str_swapcase - _str_slice_replace = ArrowStringArrayMixin._str_slice_replace From 7acb9659afafbe308d2d78345021487aa7f2f73f Mon Sep 17 00:00:00 2001 From: sshu2017 <66704517+sshu2017@users.noreply.github.com> Date: Wed, 11 Sep 2024 18:15:52 -0700 Subject: [PATCH 032/224] Fix/na_values_GH59303 (#59755) * fixed GH#59303 * pre-commit done * updated v3.0.0.rst * sort my entry in v3.0.0.rst * changes based on comments on PR * reformat long lines * reformat test_na_values.py * reformat test_na_values.py again --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/io/parsers/readers.py | 2 +- pandas/tests/io/parser/test_na_values.py | 18 ++++++++++++++++++ 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 819318e119668..89a1c388b3ba1 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -627,6 +627,7 @@ I/O - Bug in :meth:`read_csv` causing segmentation fault when ``encoding_errors`` is not a string. (:issue:`59059`) - Bug in :meth:`read_csv` raising ``TypeError`` when ``index_col`` is specified and ``na_values`` is a dict containing the key ``None``. (:issue:`57547`) - Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`) +- Bug in :meth:`read_csv` where the order of the ``na_values`` makes an inconsistency when ``na_values`` is a list non-string values. (:issue:`59303`) - Bug in :meth:`read_excel` raising ``ValueError`` when passing array of boolean values when ``dtype="boolean"``. (:issue:`58159`) - Bug in :meth:`read_json` not validating the ``typ`` argument to not be exactly ``"frame"`` or ``"series"`` (:issue:`59124`) - Bug in :meth:`read_stata` raising ``KeyError`` when input file is stored in big-endian format and contains strL data. 
(:issue:`58638`) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 2916e4d98cce4..ffc2690a5efdf 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -1648,7 +1648,7 @@ def _clean_na_values(na_values, keep_default_na: bool = True, floatify: bool = T if keep_default_na: v = set(v) | STR_NA_VALUES - na_values[k] = v + na_values[k] = _stringify_na_values(v, floatify) na_fvalues = {k: _floatify_na_values(v) for k, v in na_values.items()} else: if not is_list_like(na_values): diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py index 360a5feebe073..b612e60c959b1 100644 --- a/pandas/tests/io/parser/test_na_values.py +++ b/pandas/tests/io/parser/test_na_values.py @@ -812,3 +812,21 @@ def test_bool_and_nan_to_float(all_parsers): result = parser.read_csv(StringIO(data), dtype="float") expected = DataFrame.from_dict({"0": [np.nan, 1.0, 0.0]}) tm.assert_frame_equal(result, expected) + + +@xfail_pyarrow +@pytest.mark.parametrize( + "na_values", + [[-99.0, -99], [-99, -99.0]], +) +def test_na_values_dict_without_dtype(all_parsers, na_values): + parser = all_parsers + data = """A +-99 +-99 +-99.0 +-99.0""" + + result = parser.read_csv(StringIO(data), na_values=na_values) + expected = DataFrame({"A": [np.nan, np.nan, np.nan, np.nan]}) + tm.assert_frame_equal(result, expected) From 2a3cf8300b183f4230cc9dd4911604e454134450 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 12 Sep 2024 08:39:37 +0200 Subject: [PATCH 033/224] BUG: avoid triggering numpy deprecation warning in assert functions for nested array with empty array/list (#59778) --- pandas/_libs/lib.pyx | 2 ++ pandas/tests/dtypes/test_missing.py | 12 +----------- pandas/tests/series/methods/test_equals.py | 12 +----------- pandas/tests/util/test_assert_almost_equal.py | 4 ++++ 4 files changed, 8 insertions(+), 22 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 75f58f565dd6f..3f2dfbfb3b404 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -600,6 +600,8 @@ def array_equivalent_object(ndarray left, ndarray right) -> bool: if not array_equivalent(x, y): return False + elif PyArray_Check(x) or PyArray_Check(y): + return False elif (x is C_NA) ^ (y is C_NA): return False elif not ( diff --git a/pandas/tests/dtypes/test_missing.py b/pandas/tests/dtypes/test_missing.py index f86ed6f49759f..73c462d492d2d 100644 --- a/pandas/tests/dtypes/test_missing.py +++ b/pandas/tests/dtypes/test_missing.py @@ -1,4 +1,3 @@ -from contextlib import nullcontext from datetime import datetime from decimal import Decimal @@ -7,7 +6,6 @@ from pandas._libs import missing as libmissing from pandas._libs.tslibs import iNaT -from pandas.compat.numpy import np_version_gte1p25 from pandas.core.dtypes.common import ( is_float, @@ -458,15 +456,7 @@ def test_array_equivalent_dti(dtype_equal): ) def test_array_equivalent_series(val): arr = np.array([1, 2]) - msg = "elementwise comparison failed" - cm = ( - # stacklevel is chosen to make sense when called from .equals - tm.assert_produces_warning(FutureWarning, match=msg, check_stacklevel=False) - if isinstance(val, str) and not np_version_gte1p25 - else nullcontext() - ) - with cm: - assert not array_equivalent(Series([arr, arr]), Series([arr, val])) + assert not array_equivalent(Series([arr, arr]), Series([arr, val])) def test_array_equivalent_array_mismatched_shape(): diff --git a/pandas/tests/series/methods/test_equals.py b/pandas/tests/series/methods/test_equals.py index 
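An aside on the ``array_equivalent_object`` change in this patch: the new ``PyArray_Check`` branch reports inequality directly when exactly one side of an element-wise comparison is an ndarray. A minimal sketch of the behavior, mirroring the simplified ``test_equals_list_array`` case above (sample values are illustrative):

```python
import numpy as np
import pandas as pd

arr = np.array([1, 2])
s1 = pd.Series([arr, arr])  # object dtype holding nested ndarrays
s2 = s1.copy()
s1[1] = "a"  # replace one nested array with a non-array value
# Comparing an ndarray element against a string used to fall into an
# elementwise comparison that tripped a NumPy deprecation warning on
# older NumPy; it now simply compares unequal.
assert not s1.equals(s2)
```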
b94723b7cbddf..0c52eacd7e516 100644 --- a/pandas/tests/series/methods/test_equals.py +++ b/pandas/tests/series/methods/test_equals.py @@ -1,11 +1,9 @@ -from contextlib import nullcontext import copy import numpy as np import pytest from pandas._libs.missing import is_matching_na -from pandas.compat.numpy import np_version_gte1p25 from pandas.core.dtypes.common import is_float @@ -14,7 +12,6 @@ MultiIndex, Series, ) -import pandas._testing as tm @pytest.mark.parametrize( @@ -48,14 +45,7 @@ def test_equals_list_array(val): assert s1.equals(s2) s1[1] = val - - cm = ( - tm.assert_produces_warning(FutureWarning, check_stacklevel=False) - if isinstance(val, str) and not np_version_gte1p25 - else nullcontext() - ) - with cm: - assert not s1.equals(s2) + assert not s1.equals(s2) def test_equals_false_negative(): diff --git a/pandas/tests/util/test_assert_almost_equal.py b/pandas/tests/util/test_assert_almost_equal.py index bcc2e4e03f367..091670ed69f11 100644 --- a/pandas/tests/util/test_assert_almost_equal.py +++ b/pandas/tests/util/test_assert_almost_equal.py @@ -534,6 +534,10 @@ def test_assert_almost_equal_iterable_values_mismatch(): np.array([np.array([1, 2, 3]), np.array([4, 5])], dtype=object), np.array([[1, 2, 3], [4, 5]], dtype=object), ), + ( + np.array([np.array([], dtype=object), None], dtype=object), + np.array([[], None], dtype=object), + ), ( np.array( [ From 5927bd8c66f126897d97d03865e1526a0072f6f4 Mon Sep 17 00:00:00 2001 From: ktseng4096 <32848825+ktseng4096@users.noreply.github.com> Date: Thu, 12 Sep 2024 14:07:29 -0700 Subject: [PATCH 034/224] DOC: Update GroupBy docstrings with See Also requirements (#59748) * update groupby docstrings * fix function name --- ci/code_checks.sh | 6 ------ pandas/core/groupby/generic.py | 2 ++ pandas/core/groupby/groupby.py | 9 +++++++++ 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 2aa256b65a493..c2ab1e6b62352 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -153,14 +153,11 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.core.groupby.DataFrameGroupBy.groups SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.hist RT03" \ -i "pandas.core.groupby.DataFrameGroupBy.indices SA01" \ - -i "pandas.core.groupby.DataFrameGroupBy.max SA01" \ - -i "pandas.core.groupby.DataFrameGroupBy.min SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.nth PR02" \ -i "pandas.core.groupby.DataFrameGroupBy.nunique SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.ohlc SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.plot PR02" \ -i "pandas.core.groupby.DataFrameGroupBy.sem SA01" \ - -i "pandas.core.groupby.DataFrameGroupBy.sum SA01" \ -i "pandas.core.groupby.SeriesGroupBy.__iter__ RT03,SA01" \ -i "pandas.core.groupby.SeriesGroupBy.agg RT03" \ -i "pandas.core.groupby.SeriesGroupBy.aggregate RT03" \ @@ -169,13 +166,10 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.core.groupby.SeriesGroupBy.indices SA01" \ -i "pandas.core.groupby.SeriesGroupBy.is_monotonic_decreasing SA01" \ -i "pandas.core.groupby.SeriesGroupBy.is_monotonic_increasing SA01" \ - -i "pandas.core.groupby.SeriesGroupBy.max SA01" \ - -i "pandas.core.groupby.SeriesGroupBy.min SA01" \ -i "pandas.core.groupby.SeriesGroupBy.nth PR02" \ -i "pandas.core.groupby.SeriesGroupBy.ohlc SA01" \ -i "pandas.core.groupby.SeriesGroupBy.plot PR02" \ -i "pandas.core.groupby.SeriesGroupBy.sem SA01" \ - -i "pandas.core.groupby.SeriesGroupBy.sum SA01" \ -i "pandas.core.resample.Resampler.__iter__ RT03,SA01" \ -i 
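Looking back at the ``na_values`` change in patch 032 above: with the list entries stringified up front in ``_clean_na_values``, a list of numeric NA markers now behaves the same regardless of ordering. A short sketch mirroring the new parser test (the data is illustrative):

```python
from io import StringIO

import pandas as pd

data = "A\n-99\n-99\n-99.0\n-99.0"
# Both orderings now flag every row as missing; previously the result
# could depend on whether -99 or -99.0 came first in the list.
for na_values in ([-99.0, -99], [-99, -99.0]):
    result = pd.read_csv(StringIO(data), na_values=na_values)
    assert result["A"].isna().all()
```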
"pandas.core.resample.Resampler.ffill RT03" \ -i "pandas.core.resample.Resampler.get_group RT03,SA01" \ diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 230f61bab96df..eae33ddc1df29 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -615,6 +615,7 @@ def filter(self, func, dropna: bool = True, *args, **kwargs): See Also -------- + Series.filter: Filter elements of ungrouped Series. DataFrameGroupBy.filter : Filter elements from groups base on criterion. Notes @@ -1963,6 +1964,7 @@ def filter(self, func, dropna: bool = True, *args, **kwargs) -> DataFrame: See Also -------- + DataFrame.filter: Filter elements of ungrouped DataFrame. SeriesGroupBy.filter : Filter elements from groups base on criterion. Notes diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 79fe78b7e5405..38dad446b4c39 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -199,6 +199,15 @@ class providing the base-class of operations. Series or DataFrame Computed {fname} of values within each group. +See Also +-------- +SeriesGroupBy.min : Return the min of the group values. +DataFrameGroupBy.min : Return the min of the group values. +SeriesGroupBy.max : Return the max of the group values. +DataFrameGroupBy.max : Return the max of the group values. +SeriesGroupBy.sum : Return the sum of the group values. +DataFrameGroupBy.sum : Return the sum of the group values. + Examples -------- {example} From 2c49f555a004a86a2065525b1f424d1b17208b87 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 12 Sep 2024 23:08:34 +0200 Subject: [PATCH 035/224] BUG/API (string dtype): return float dtype for series[str].rank() (#59768) * BUG/API (string dtype): return float dtype for series[str].rank() * update frame tests * add whatsnew * correct whatsnew note --- doc/source/whatsnew/v2.3.0.rst | 1 + pandas/core/arrays/arrow/array.py | 5 +- pandas/core/arrays/string_arrow.py | 11 ++++ pandas/tests/frame/methods/test_rank.py | 23 ++------ pandas/tests/series/methods/test_rank.py | 72 ++++++++++++++++++------ 5 files changed, 76 insertions(+), 36 deletions(-) diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst index 03b3a6b55dff6..01c2ed3821d7a 100644 --- a/doc/source/whatsnew/v2.3.0.rst +++ b/doc/source/whatsnew/v2.3.0.rst @@ -102,6 +102,7 @@ Conversion Strings ^^^^^^^ +- Bug in :meth:`Series.rank` for :class:`StringDtype` with ``storage="pyarrow"`` incorrectly returning integer results in case of ``method="average"`` and raising an error if it would truncate results (:issue:`59768`) - Bug in :meth:`Series.str.replace` when ``n < 0`` for :class:`StringDtype` with ``storage="pyarrow"`` (:issue:`59628`) - Bug in ``ser.str.slice`` with negative ``step`` with :class:`ArrowDtype` and :class:`StringDtype` with ``storage="pyarrow"`` giving incorrect results (:issue:`59710`) - Bug in the ``center`` method on :class:`Series` and :class:`Index` object ``str`` accessors with pyarrow-backed dtype not matching the python behavior in corner cases with an odd number of fill characters (:issue:`54792`) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index bd94447f0cd80..39cae5b8e2683 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1999,7 +1999,7 @@ def _rank( """ See Series.rank.__doc__. 
""" - return self._convert_int_result( + return self._convert_rank_result( self._rank_calc( axis=axis, method=method, @@ -2318,6 +2318,9 @@ def _convert_bool_result(self, result): def _convert_int_result(self, result): return type(self)(result) + def _convert_rank_result(self, result): + return type(self)(result) + def _str_count(self, pat: str, flags: int = 0) -> Self: if flags: raise NotImplementedError(f"count not implemented with {flags=}") diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index f446cc5bde147..75bb1f8fb1a65 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -29,6 +29,7 @@ from pandas.core.arrays._arrow_string_mixins import ArrowStringArrayMixin from pandas.core.arrays.arrow import ArrowExtensionArray from pandas.core.arrays.boolean import BooleanDtype +from pandas.core.arrays.floating import Float64Dtype from pandas.core.arrays.integer import Int64Dtype from pandas.core.arrays.numeric import NumericDtype from pandas.core.arrays.string_ import ( @@ -395,6 +396,16 @@ def _convert_int_result(self, result): return Int64Dtype().__from_arrow__(result) + def _convert_rank_result(self, result): + if self.dtype.na_value is np.nan: + if isinstance(result, pa.Array): + result = result.to_numpy(zero_copy_only=False) + else: + result = result.to_numpy() + return result.astype("float64", copy=False) + + return Float64Dtype().__from_arrow__(result) + def _reduce( self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs ): diff --git a/pandas/tests/frame/methods/test_rank.py b/pandas/tests/frame/methods/test_rank.py index c1cdeaa6c10dd..6c6c208ee0c78 100644 --- a/pandas/tests/frame/methods/test_rank.py +++ b/pandas/tests/frame/methods/test_rank.py @@ -6,15 +6,11 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas._libs.algos import ( Infinity, NegInfinity, ) -from pandas.compat import HAS_PYARROW -import pandas as pd from pandas import ( DataFrame, Index, @@ -467,23 +463,10 @@ def test_rank_inf_nans_na_option( ("top", False, [2.0, 3.0, 1.0, 4.0]), ], ) - def test_rank_object_first( - self, - request, - frame_or_series, - na_option, - ascending, - expected, - using_infer_string, - ): + def test_rank_object_first(self, frame_or_series, na_option, ascending, expected): obj = frame_or_series(["foo", "foo", None, "foo"]) - if using_string_dtype() and not HAS_PYARROW and isinstance(obj, Series): - request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)")) - result = obj.rank(method="first", na_option=na_option, ascending=ascending) expected = frame_or_series(expected) - if using_infer_string and isinstance(obj, Series): - expected = expected.astype("uint64") tm.assert_equal(result, expected) @pytest.mark.parametrize( @@ -507,7 +490,9 @@ def test_rank_string_dtype(self, string_dtype_no_object): # GH#55362 obj = Series(["foo", "foo", None, "foo"], dtype=string_dtype_no_object) result = obj.rank(method="first") - exp_dtype = "Int64" if string_dtype_no_object.na_value is pd.NA else "float64" + exp_dtype = ( + "Float64" if string_dtype_no_object == "string[pyarrow]" else "float64" + ) if string_dtype_no_object.storage == "python": # TODO nullable string[python] should also return nullable Int64 exp_dtype = "float64" diff --git a/pandas/tests/series/methods/test_rank.py b/pandas/tests/series/methods/test_rank.py index 2d7fde130ce70..7c6a7893ba3a0 100644 --- a/pandas/tests/series/methods/test_rank.py +++ b/pandas/tests/series/methods/test_rank.py @@ 
-33,7 +33,8 @@ def ser(): ["max", np.array([2, 6, 7, 4, np.nan, 4, 2, 8, np.nan, 6])], ["first", np.array([1, 5, 7, 3, np.nan, 4, 2, 8, np.nan, 6])], ["dense", np.array([1, 3, 4, 2, np.nan, 2, 1, 5, np.nan, 3])], - ] + ], + ids=lambda x: x[0], ) def results(request): return request.param @@ -48,12 +49,29 @@ def results(request): "Int64", pytest.param("float64[pyarrow]", marks=td.skip_if_no("pyarrow")), pytest.param("int64[pyarrow]", marks=td.skip_if_no("pyarrow")), + pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")), + "string[python]", + "str", ] ) def dtype(request): return request.param +def expected_dtype(dtype, method, pct=False): + exp_dtype = "float64" + # elif dtype in ["Int64", "Float64", "string[pyarrow]", "string[python]"]: + if dtype in ["string[pyarrow]"]: + exp_dtype = "Float64" + elif dtype in ["float64[pyarrow]", "int64[pyarrow]"]: + if method == "average" or pct: + exp_dtype = "double[pyarrow]" + else: + exp_dtype = "uint64[pyarrow]" + + return exp_dtype + + class TestSeriesRank: def test_rank(self, datetime_series): sp_stats = pytest.importorskip("scipy.stats") @@ -251,12 +269,14 @@ def test_rank_signature(self): with pytest.raises(ValueError, match=msg): s.rank("average") - @pytest.mark.parametrize("dtype", [None, object]) - def test_rank_tie_methods(self, ser, results, dtype): + def test_rank_tie_methods(self, ser, results, dtype, using_infer_string): method, exp = results + if dtype == "int64" or (not using_infer_string and dtype == "str"): + pytest.skip("int64/str does not support NaN") + ser = ser if dtype is None else ser.astype(dtype) result = ser.rank(method=method) - tm.assert_series_equal(result, Series(exp)) + tm.assert_series_equal(result, Series(exp, dtype=expected_dtype(dtype, method))) @pytest.mark.parametrize("na_option", ["top", "bottom", "keep"]) @pytest.mark.parametrize( @@ -357,25 +377,35 @@ def test_rank_methods_series(self, rank_method, op, value): ], ) def test_rank_dense_method(self, dtype, ser, exp): + if ser[0] < 0 and dtype.startswith("str"): + exp = exp[::-1] s = Series(ser).astype(dtype) result = s.rank(method="dense") - expected = Series(exp).astype(result.dtype) + expected = Series(exp).astype(expected_dtype(dtype, "dense")) tm.assert_series_equal(result, expected) - def test_rank_descending(self, ser, results, dtype): + def test_rank_descending(self, ser, results, dtype, using_infer_string): method, _ = results - if "i" in dtype: + if dtype == "int64" or (not using_infer_string and dtype == "str"): s = ser.dropna() else: s = ser.astype(dtype) res = s.rank(ascending=False) - expected = (s.max() - s).rank() - tm.assert_series_equal(res, expected) + if dtype.startswith("str"): + expected = (s.astype("float64").max() - s.astype("float64")).rank() + else: + expected = (s.max() - s).rank() + tm.assert_series_equal(res, expected.astype(expected_dtype(dtype, "average"))) - expected = (s.max() - s).rank(method=method) + if dtype.startswith("str"): + expected = (s.astype("float64").max() - s.astype("float64")).rank( + method=method + ) + else: + expected = (s.max() - s).rank(method=method) res2 = s.rank(method=method, ascending=False) - tm.assert_series_equal(res2, expected) + tm.assert_series_equal(res2, expected.astype(expected_dtype(dtype, method))) def test_rank_int(self, ser, results): method, exp = results @@ -432,9 +462,11 @@ def test_rank_ea_small_values(self): ], ) def test_rank_dense_pct(dtype, ser, exp): + if ser[0] < 0 and dtype.startswith("str"): + exp = exp[::-1] s = Series(ser).astype(dtype) result = 
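To make the intent of the ``expected_dtype`` helper above concrete: ranking a pyarrow-backed string Series now produces a nullable float result. A brief sketch (requires pyarrow; the values are illustrative):

```python
import pandas as pd

s = pd.Series(["foo", "foo", None, "foo"], dtype="string[pyarrow]")
result = s.rank(method="first")
# After this patch the result dtype is Float64 rather than an integer
# dtype, consistent with rank() on other dtypes.
print(result.dtype)     # Float64
print(result.tolist())  # [1.0, 2.0, <NA>, 3.0]
```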
s.rank(method="dense", pct=True) - expected = Series(exp).astype(result.dtype) + expected = Series(exp).astype(expected_dtype(dtype, "dense", pct=True)) tm.assert_series_equal(result, expected) @@ -453,9 +485,11 @@ def test_rank_dense_pct(dtype, ser, exp): ], ) def test_rank_min_pct(dtype, ser, exp): + if ser[0] < 0 and dtype.startswith("str"): + exp = exp[::-1] s = Series(ser).astype(dtype) result = s.rank(method="min", pct=True) - expected = Series(exp).astype(result.dtype) + expected = Series(exp).astype(expected_dtype(dtype, "min", pct=True)) tm.assert_series_equal(result, expected) @@ -474,9 +508,11 @@ def test_rank_min_pct(dtype, ser, exp): ], ) def test_rank_max_pct(dtype, ser, exp): + if ser[0] < 0 and dtype.startswith("str"): + exp = exp[::-1] s = Series(ser).astype(dtype) result = s.rank(method="max", pct=True) - expected = Series(exp).astype(result.dtype) + expected = Series(exp).astype(expected_dtype(dtype, "max", pct=True)) tm.assert_series_equal(result, expected) @@ -495,9 +531,11 @@ def test_rank_max_pct(dtype, ser, exp): ], ) def test_rank_average_pct(dtype, ser, exp): + if ser[0] < 0 and dtype.startswith("str"): + exp = exp[::-1] s = Series(ser).astype(dtype) result = s.rank(method="average", pct=True) - expected = Series(exp).astype(result.dtype) + expected = Series(exp).astype(expected_dtype(dtype, "average", pct=True)) tm.assert_series_equal(result, expected) @@ -516,9 +554,11 @@ def test_rank_average_pct(dtype, ser, exp): ], ) def test_rank_first_pct(dtype, ser, exp): + if ser[0] < 0 and dtype.startswith("str"): + exp = exp[::-1] s = Series(ser).astype(dtype) result = s.rank(method="first", pct=True) - expected = Series(exp).astype(result.dtype) + expected = Series(exp).astype(expected_dtype(dtype, "first", pct=True)) tm.assert_series_equal(result, expected) From 0d2505dca9c34b666155c1483d592877206081aa Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 12 Sep 2024 23:11:52 +0200 Subject: [PATCH 036/224] String dtype: fix isin() values handling for python storage (#59759) * String dtype: fix isin() values handling for python storage * address feedback --- pandas/conftest.py | 9 ++++- pandas/core/arrays/string_.py | 20 +++++++++++ pandas/tests/arrays/string_/test_string.py | 41 +++++++++++++++++++--- 3 files changed, 64 insertions(+), 6 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 222aefb4afda8..e2db9260ac37d 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1338,7 +1338,13 @@ def string_storage(request): pytest.param(("pyarrow", pd.NA), marks=td.skip_if_no("pyarrow")), pytest.param(("pyarrow", np.nan), marks=td.skip_if_no("pyarrow")), ("python", np.nan), - ] + ], + ids=[ + "string=string[python]", + "string=string[pyarrow]", + "string=str[pyarrow]", + "string=str[python]", + ], ) def string_dtype_arguments(request): """ @@ -1369,6 +1375,7 @@ def dtype_backend(request): # Alias so we can test with cartesian product of string_storage string_storage2 = string_storage +string_dtype_arguments2 = string_dtype_arguments @pytest.fixture(params=tm.BYTES_DTYPES) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index a46475a7d1ec2..b3aa782341c77 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -46,6 +46,7 @@ nanops, ops, ) +from pandas.core.algorithms import isin from pandas.core.array_algos import masked_reductions from pandas.core.arrays.base import ExtensionArray from pandas.core.arrays.floating import ( @@ -65,6 +66,7 @@ import pyarrow from pandas._typing import ( + 
ArrayLike, AxisInt, Dtype, DtypeObj, @@ -735,6 +737,24 @@ def _putmask(self, mask: npt.NDArray[np.bool_], value) -> None: # base class implementation that uses __setitem__ ExtensionArray._putmask(self, mask, value) + def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]: + if isinstance(values, BaseStringArray) or ( + isinstance(values, ExtensionArray) and is_string_dtype(values.dtype) + ): + values = values.astype(self.dtype, copy=False) + else: + if not lib.is_string_array(np.asarray(values), skipna=True): + values = np.array( + [val for val in values if isinstance(val, str) or isna(val)], + dtype=object, + ) + if not len(values): + return np.zeros(self.shape, dtype=bool) + + values = self._from_sequence(values, dtype=self.dtype) + + return isin(np.asarray(self), np.asarray(values)) + def astype(self, dtype, copy: bool = True): dtype = pandas_dtype(dtype) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 87bd1d5921caa..33708be497f31 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -30,6 +30,12 @@ def dtype(string_dtype_arguments): return pd.StringDtype(storage=storage, na_value=na_value) +@pytest.fixture +def dtype2(string_dtype_arguments2): + storage, na_value = string_dtype_arguments2 + return pd.StringDtype(storage=storage, na_value=na_value) + + @pytest.fixture def cls(dtype): """Fixture giving array type from parametrized 'dtype'""" @@ -662,11 +668,7 @@ def test_isin(dtype, fixed_now_ts): tm.assert_series_equal(result, expected) result = s.isin(["a", pd.NA]) - if dtype.storage == "python" and dtype.na_value is np.nan: - # TODO(infer_string) we should make this consistent - expected = pd.Series([True, False, False]) - else: - expected = pd.Series([True, False, True]) + expected = pd.Series([True, False, True]) tm.assert_series_equal(result, expected) result = s.isin([]) @@ -677,6 +679,35 @@ def test_isin(dtype, fixed_now_ts): expected = pd.Series([True, False, False]) tm.assert_series_equal(result, expected) + result = s.isin([fixed_now_ts]) + expected = pd.Series([False, False, False]) + tm.assert_series_equal(result, expected) + + +def test_isin_string_array(dtype, dtype2): + s = pd.Series(["a", "b", None], dtype=dtype) + + result = s.isin(pd.array(["a", "c"], dtype=dtype2)) + expected = pd.Series([True, False, False]) + tm.assert_series_equal(result, expected) + + result = s.isin(pd.array(["a", None], dtype=dtype2)) + expected = pd.Series([True, False, True]) + tm.assert_series_equal(result, expected) + + +def test_isin_arrow_string_array(dtype): + pa = pytest.importorskip("pyarrow") + s = pd.Series(["a", "b", None], dtype=dtype) + + result = s.isin(pd.array(["a", "c"], dtype=pd.ArrowDtype(pa.string()))) + expected = pd.Series([True, False, False]) + tm.assert_series_equal(result, expected) + + result = s.isin(pd.array(["a", None], dtype=pd.ArrowDtype(pa.string()))) + expected = pd.Series([True, False, True]) + tm.assert_series_equal(result, expected) + def test_setitem_scalar_with_mask_validation(dtype): # https://github.com/pandas-dev/pandas/issues/47628 From 73c4fce2fe8c8893b1d370ce04211c59c8182d61 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Fri, 13 Sep 2024 23:55:51 +0530 Subject: [PATCH 037/224] DOC: fix SA01 for pandas.NA (#59787) --- ci/code_checks.sh | 1 - pandas/_libs/missing.pyx | 8 ++++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index c2ab1e6b62352..7ad29b3a2a1f3 100755 --- 
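The ``isin`` override added above filters the ``values`` argument down to strings and NA before matching. A short sketch of the resulting behavior (the values are illustrative):

```python
import pandas as pd

s = pd.Series(["a", "b", None], dtype="string[python]")
# NA in the values list now matches missing entries for python storage
# as well, consistent with the pyarrow-backed dtype.
print(s.isin(["a", pd.NA]).tolist())  # [True, False, True]
# Non-string values are simply ignored rather than raising.
print(s.isin(["a", 1]).tolist())      # [True, False, False]
```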
a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -70,7 +70,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then --format=actions \ -i ES01 `# For now it is ok if docstrings are missing the extended summary` \ -i "pandas.Series.dt PR01" `# Accessors are implemented as classes, but we do not document the Parameters section` \ - -i "pandas.NA SA01" \ -i "pandas.Period.freq GL08" \ -i "pandas.Period.ordinal GL08" \ -i "pandas.PeriodDtype.freq SA01" \ diff --git a/pandas/_libs/missing.pyx b/pandas/_libs/missing.pyx index 2f44128cda822..390a527c22bbb 100644 --- a/pandas/_libs/missing.pyx +++ b/pandas/_libs/missing.pyx @@ -347,6 +347,14 @@ class NAType(C_NAType): The NA singleton is a missing value indicator defined by pandas. It is used in certain new extension dtypes (currently the "string" dtype). + See Also + -------- + numpy.nan : Floating point representation of Not a Number (NaN) for numerical data. + isna : Detect missing values for an array-like object. + notna : Detect non-missing values for an array-like object. + DataFrame.fillna : Fill missing values in a DataFrame. + Series.fillna : Fill missing values in a Series. + Examples -------- >>> pd.NA From a71df34cb841d5aefb94458767d6987caf02ae67 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Sat, 14 Sep 2024 22:21:08 +0530 Subject: [PATCH 038/224] DOC: fix SA01,ES01 for pandas.Timedelta.components (#59799) --- ci/code_checks.sh | 1 - pandas/_libs/tslibs/timedeltas.pyx | 11 +++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 7ad29b3a2a1f3..fd42fa70a6f7c 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -102,7 +102,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Series.sparse.from_coo PR07,SA01" \ -i "pandas.Series.sparse.npoints SA01" \ -i "pandas.Series.sparse.sp_values SA01" \ - -i "pandas.Timedelta.components SA01" \ -i "pandas.Timedelta.max PR02" \ -i "pandas.Timedelta.min PR02" \ -i "pandas.Timedelta.resolution PR02" \ diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index 4f90f26cf31ab..6159bd0dadb47 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -1493,6 +1493,17 @@ cdef class _Timedelta(timedelta): """ Return a components namedtuple-like. + Each component represents a different time unit, allowing you to access the + breakdown of the total duration in terms of days, hours, minutes, seconds, + milliseconds, microseconds, and nanoseconds. + + See Also + -------- + Timedelta.total_seconds : Returns the total duration of the Timedelta in + seconds. + to_timedelta : Convert argument to Timedelta. + Timedelta : Represents a duration, the difference between two dates or times. + Examples -------- >>> td = pd.Timedelta('2 day 4 min 3 us 42 ns') From 695dbde594f6d7eef732340f57b4439f6661e74a Mon Sep 17 00:00:00 2001 From: ammar-qazi Date: Sat, 14 Sep 2024 18:53:36 +0200 Subject: [PATCH 039/224] Update ExtensionArray.interpolate to remove outdated method of pad (#59798) --- pandas/core/arrays/base.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index a933a9ce11646..5f2c2a7772f78 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1010,7 +1010,6 @@ def interpolate( * 'time': Works on daily and higher resolution data to interpolate given length of interval. * 'index', 'values': use the actual numerical values of the index. - * 'pad': Fill in NaNs using existing values. 
* 'nearest', 'zero', 'slinear', 'quadratic', 'cubic', 'barycentric', 'polynomial': Passed to scipy.interpolate.interp1d, whereas 'spline' is passed to scipy.interpolate.UnivariateSpline. These methods use From e3bcd10d7dedd71a70a5229ce2b53c543feb63c5 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Sat, 14 Sep 2024 22:24:30 +0530 Subject: [PATCH 040/224] DOC: fix SA01,ES01 for pandas.PeriodDtype.freq (#59796) --- ci/code_checks.sh | 1 - pandas/core/dtypes/dtypes.py | 14 ++++++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index fd42fa70a6f7c..73b389e427648 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -72,7 +72,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Series.dt PR01" `# Accessors are implemented as classes, but we do not document the Parameters section` \ -i "pandas.Period.freq GL08" \ -i "pandas.Period.ordinal GL08" \ - -i "pandas.PeriodDtype.freq SA01" \ -i "pandas.RangeIndex.from_range PR01,SA01" \ -i "pandas.RangeIndex.step SA01" \ -i "pandas.Series.cat.add_categories PR01,PR02" \ diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 68b4807961d19..bb6610c514375 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -1065,6 +1065,20 @@ def freq(self) -> BaseOffset: """ The frequency object of this PeriodDtype. + The `freq` property returns the `BaseOffset` object that represents the + frequency of the PeriodDtype. This frequency specifies the interval (e.g., + daily, monthly, yearly) associated with the Period type. It is essential + for operations that depend on time-based calculations within a period index + or series. + + See Also + -------- + Period : Represents a period of time. + PeriodIndex : Immutable ndarray holding ordinal values indicating + regular periods. + PeriodDtype : An ExtensionDtype for Period data. + date_range : Return a fixed frequency range of dates. + Examples -------- >>> dtype = pd.PeriodDtype(freq="D") From e215121f71a59ba44b614f1962a960b8415864ad Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Sun, 15 Sep 2024 22:45:39 +0530 Subject: [PATCH 041/224] DOC: fix SA01,ES01 for pandas.Timedelta.total_seconds (#59800) * DOC: fix SA01,ES01 for pandas.Timedelta.total_seconds * DOC: fix SA01,ES01 for pandas.Timedelta.total_seconds --- ci/code_checks.sh | 1 - pandas/_libs/tslibs/nattype.pyx | 8 ++++++++ pandas/_libs/tslibs/timedeltas.pyx | 8 ++++++++ 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 73b389e427648..606ede4e861fa 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -105,7 +105,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Timedelta.min PR02" \ -i "pandas.Timedelta.resolution PR02" \ -i "pandas.Timedelta.to_timedelta64 SA01" \ - -i "pandas.Timedelta.total_seconds SA01" \ -i "pandas.TimedeltaIndex.to_pytimedelta RT03,SA01" \ -i "pandas.Timestamp.max PR02" \ -i "pandas.Timestamp.min PR02" \ diff --git a/pandas/_libs/tslibs/nattype.pyx b/pandas/_libs/tslibs/nattype.pyx index 60afc1acdc297..620e0846c750e 100644 --- a/pandas/_libs/tslibs/nattype.pyx +++ b/pandas/_libs/tslibs/nattype.pyx @@ -493,6 +493,14 @@ class NaTType(_NaT): """ Total seconds in the duration. + This method calculates the total duration in seconds by combining + the days, seconds, and microseconds of the `Timedelta` object. + + See Also + -------- + to_timedelta : Convert argument to timedelta. 
+ Timedelta : Represents a duration, the difference between two dates or times. + Examples -------- >>> td = pd.Timedelta('1min') diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index 6159bd0dadb47..0ff5c5fb81df8 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -1189,6 +1189,14 @@ cdef class _Timedelta(timedelta): """ Total seconds in the duration. + This method calculates the total duration in seconds by combining + the days, seconds, and microseconds of the `Timedelta` object. + + See Also + -------- + to_timedelta : Convert argument to timedelta. + Timedelta : Represents a duration, the difference between two dates or times. + Examples -------- >>> td = pd.Timedelta('1min') From 679578742669e208265b9089b6afe3f0451be680 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Sun, 15 Sep 2024 22:46:32 +0530 Subject: [PATCH 042/224] DOC: fix SA01 for pandas.api.types.is_array_like (#59802) --- ci/code_checks.sh | 1 - pandas/core/dtypes/common.py | 4 ++++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 606ede4e861fa..ff5bfee1518c4 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -113,7 +113,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Timestamp.tzinfo GL08" \ -i "pandas.Timestamp.year GL08" \ -i "pandas.api.types.is_dict_like PR07,SA01" \ - -i "pandas.api.types.is_extension_array_dtype SA01" \ -i "pandas.api.types.is_file_like PR07,SA01" \ -i "pandas.api.types.is_float PR01,SA01" \ -i "pandas.api.types.is_float_dtype SA01" \ diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 16f6bd396fe93..de38395cecad3 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -1401,6 +1401,10 @@ def is_extension_array_dtype(arr_or_dtype) -> bool: bool Whether the `arr_or_dtype` is an extension array type. + See Also + -------- + api.extensions.ExtensionArray : Abstract base class for pandas extension arrays. + Notes ----- This checks whether an object implements the pandas extension From 2b37219aa0617edce4f4326aec71e049b6acc1d2 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Sun, 15 Sep 2024 22:47:05 +0530 Subject: [PATCH 043/224] DOC: fix SA01 for pandas.api.types.is_integer_dtype (#59803) --- ci/code_checks.sh | 1 - pandas/core/dtypes/common.py | 9 +++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index ff5bfee1518c4..ffa540291e560 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -119,7 +119,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.api.types.is_hashable PR01,RT03,SA01" \ -i "pandas.api.types.is_int64_dtype SA01" \ -i "pandas.api.types.is_integer PR01,SA01" \ - -i "pandas.api.types.is_integer_dtype SA01" \ -i "pandas.api.types.is_interval_dtype SA01" \ -i "pandas.api.types.is_iterator PR07,SA01" \ -i "pandas.api.types.is_list_like SA01" \ diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index de38395cecad3..ff855f97a352b 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -694,6 +694,15 @@ def is_integer_dtype(arr_or_dtype) -> bool: Whether or not the array or dtype is of an integer dtype and not an instance of timedelta64. + See Also + -------- + api.types.is_integer : Return True if given object is integer. + api.types.is_numeric_dtype : Check whether the provided array or dtype is of a + numeric dtype. 
+ api.types.is_float_dtype : Check whether the provided array or dtype is of a + float dtype. + Int64Dtype : An ExtensionDtype for Int64Dtype integer data. + Examples -------- >>> from pandas.api.types import is_integer_dtype From 1d80ac59028b01d3efc15b97119cf6b3c896c1da Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Sun, 15 Sep 2024 22:47:47 +0530 Subject: [PATCH 044/224] DOC: fix SA01 for pandas.arrays.FloatingArray (#59804) --- ci/code_checks.sh | 1 - pandas/core/arrays/floating.py | 8 ++++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index ffa540291e560..f022e0176a987 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -130,7 +130,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.arrays.ArrowExtensionArray PR07,SA01" \ -i "pandas.arrays.BooleanArray SA01" \ -i "pandas.arrays.DatetimeArray SA01" \ - -i "pandas.arrays.FloatingArray SA01" \ -i "pandas.arrays.IntegerArray SA01" \ -i "pandas.arrays.IntervalArray.left SA01" \ -i "pandas.arrays.IntervalArray.length SA01" \ diff --git a/pandas/core/arrays/floating.py b/pandas/core/arrays/floating.py index b3fbf0f92c32d..67c23f4825a7f 100644 --- a/pandas/core/arrays/floating.py +++ b/pandas/core/arrays/floating.py @@ -96,6 +96,14 @@ class FloatingArray(NumericArray): ------- FloatingArray + See Also + -------- + array : Create an array. + Float32Dtype : Float32 dtype for FloatingArray. + Float64Dtype : Float64 dtype for FloatingArray. + Series : One-dimensional labeled array capable of holding data. + DataFrame : Two-dimensional, size-mutable, potentially heterogeneous tabular data. + Examples -------- Create an FloatingArray with :func:`pandas.array`: From 235e1bea1366f9ffd54866e7a997d2a75016bf84 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torsten=20W=C3=B6rtwein?= <6618166+twoertwein@users.noreply.github.com> Date: Sun, 15 Sep 2024 14:24:36 -0400 Subject: [PATCH 045/224] WEB: update list of (in)active core devs (#59808) --- web/pandas/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/web/pandas/config.yml b/web/pandas/config.yml index 74e7fda2e7983..a49aadd45204a 100644 --- a/web/pandas/config.yml +++ b/web/pandas/config.yml @@ -89,7 +89,6 @@ maintainers: - phofl - attack68 - fangchenli - - twoertwein - lithomas1 - lukemanley - noatamir @@ -108,6 +107,7 @@ maintainers: - wesm - gfyoung - mzeitlin11 + - twoertwein workgroups: coc: name: Code of Conduct From 3e8ac12d1dacc2308b2f4c2869fa7bc2079bd323 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sun, 15 Sep 2024 22:00:26 +0200 Subject: [PATCH 046/224] BUG (CoW): fix reference tracking in replace_list with None (#59807) --- pandas/core/internals/blocks.py | 2 +- pandas/tests/copy_view/test_replace.py | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index dced92ba04520..cb40e920149fa 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -915,7 +915,7 @@ def _replace_coerce( nb = nb.copy() putmask_inplace(nb.values, mask, value) return [nb] - return [self] + return [self.copy(deep=False)] return self.replace( to_replace=to_replace, value=value, diff --git a/pandas/tests/copy_view/test_replace.py b/pandas/tests/copy_view/test_replace.py index 58c979fb05089..a8acd446ff5f5 100644 --- a/pandas/tests/copy_view/test_replace.py +++ b/pandas/tests/copy_view/test_replace.py @@ -286,6 +286,12 @@ def test_replace_list_none(): assert not np.shares_memory(get_array(df, 
"a"), get_array(df2, "a")) + # replace multiple values that don't actually replace anything with None + # https://github.com/pandas-dev/pandas/issues/59770 + df3 = df.replace(["d", "e", "f"], value=None) + tm.assert_frame_equal(df3, df_orig) + assert tm.shares_memory(get_array(df, "a"), get_array(df3, "a")) + def test_replace_list_none_inplace_refs(): df = DataFrame({"a": ["a", "b", "c"]}) From 122fc4c6f45b8e603132b57c4cf99c8837bac43e Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sun, 15 Sep 2024 23:03:33 +0200 Subject: [PATCH 047/224] DOC: add whatsnew for v2.2.3 (#59811) * DOC: add whatsnew for v2.2.3 * fix warning --------- Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com> --- doc/source/whatsnew/index.rst | 1 + doc/source/whatsnew/v2.2.3.rst | 36 ++++++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+) create mode 100644 doc/source/whatsnew/v2.2.3.rst diff --git a/doc/source/whatsnew/index.rst b/doc/source/whatsnew/index.rst index 2f7ec52d117f8..1dd6c5fabef04 100644 --- a/doc/source/whatsnew/index.rst +++ b/doc/source/whatsnew/index.rst @@ -32,6 +32,7 @@ Version 2.2 .. toctree:: :maxdepth: 2 + v2.2.3 v2.2.2 v2.2.1 v2.2.0 diff --git a/doc/source/whatsnew/v2.2.3.rst b/doc/source/whatsnew/v2.2.3.rst new file mode 100644 index 0000000000000..aa6e241e74b0a --- /dev/null +++ b/doc/source/whatsnew/v2.2.3.rst @@ -0,0 +1,36 @@ +.. _whatsnew_223: + +What's new in 2.2.3 (September XX, 2024) +---------------------------------------- + +These are the changes in pandas 2.2.3. See :ref:`release` for a full changelog +including other versions of pandas. + +{{ header }} + +.. --------------------------------------------------------------------------- +.. _whatsnew_223.regressions: + +Fixed regressions +~~~~~~~~~~~~~~~~~ +- + +.. --------------------------------------------------------------------------- +.. _whatsnew_223.bug_fixes: + +Bug fixes +~~~~~~~~~ +- + +.. --------------------------------------------------------------------------- +.. _whatsnew_223.other: + +Other +~~~~~ +- + +.. --------------------------------------------------------------------------- +.. _whatsnew_223.contributors: + +Contributors +~~~~~~~~~~~~ From 160b3eb4be5150a2d2bcb6b4e47dc8a44a4c0922 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Mon, 16 Sep 2024 02:43:06 +0530 Subject: [PATCH 048/224] DOC: fix SA01 for pandas.errors.MergeError (#59805) * DOC: fix SA01 for pandas.errors.MergeError * DOC: fix SA01 for pandas.errors.MergeError Co-authored-by: mroeschke --------- Co-authored-by: mroeschke --- ci/code_checks.sh | 1 - pandas/errors/__init__.py | 5 +++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index f022e0176a987..b57426dbb2078 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -191,7 +191,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.errors.IntCastingNaNError SA01" \ -i "pandas.errors.InvalidIndexError SA01" \ -i "pandas.errors.InvalidVersion SA01" \ - -i "pandas.errors.MergeError SA01" \ -i "pandas.errors.NullFrequencyError SA01" \ -i "pandas.errors.NumExprClobberingError SA01" \ -i "pandas.errors.NumbaUtilError SA01" \ diff --git a/pandas/errors/__init__.py b/pandas/errors/__init__.py index 2f625090e0492..7851bc90c5782 100644 --- a/pandas/errors/__init__.py +++ b/pandas/errors/__init__.py @@ -261,6 +261,11 @@ class MergeError(ValueError): Subclass of ``ValueError``. + See Also + -------- + DataFrame.join : For joining DataFrames on their indexes. 
+ merge : For merging two DataFrames on a common set of keys. + Examples -------- >>> left = pd.DataFrame( From 013ac6702c738b73a6729aa75399eebe9ef52f45 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 16 Sep 2024 19:25:59 +0200 Subject: [PATCH 049/224] String dtype: allow string dtype in query/eval with default numexpr engine (#59810) String dtype: allow string dtype in query/eval with default mumexpr engine --- pandas/core/computation/eval.py | 12 +++++++++--- pandas/core/computation/expr.py | 6 +++++- pandas/tests/frame/test_query_eval.py | 24 ++++++------------------ 3 files changed, 20 insertions(+), 22 deletions(-) diff --git a/pandas/core/computation/eval.py b/pandas/core/computation/eval.py index aad768d31483a..485c7f87d6f33 100644 --- a/pandas/core/computation/eval.py +++ b/pandas/core/computation/eval.py @@ -14,7 +14,10 @@ from pandas.util._exceptions import find_stack_level from pandas.util._validators import validate_bool_kwarg -from pandas.core.dtypes.common import is_extension_array_dtype +from pandas.core.dtypes.common import ( + is_extension_array_dtype, + is_string_dtype, +) from pandas.core.computation.engines import ENGINES from pandas.core.computation.expr import ( @@ -345,10 +348,13 @@ def eval( parsed_expr = Expr(expr, engine=engine, parser=parser, env=env) if engine == "numexpr" and ( - is_extension_array_dtype(parsed_expr.terms.return_type) + ( + is_extension_array_dtype(parsed_expr.terms.return_type) + and not is_string_dtype(parsed_expr.terms.return_type) + ) or getattr(parsed_expr.terms, "operand_types", None) is not None and any( - is_extension_array_dtype(elem) + (is_extension_array_dtype(elem) and not is_string_dtype(elem)) for elem in parsed_expr.terms.operand_types ) ): diff --git a/pandas/core/computation/expr.py b/pandas/core/computation/expr.py index b074e768e0842..f45bc453d2541 100644 --- a/pandas/core/computation/expr.py +++ b/pandas/core/computation/expr.py @@ -21,6 +21,8 @@ from pandas.errors import UndefinedVariableError +from pandas.core.dtypes.common import is_string_dtype + import pandas.core.common as com from pandas.core.computation.ops import ( ARITH_OPS_SYMS, @@ -524,10 +526,12 @@ def _maybe_evaluate_binop( elif self.engine != "pytables": if ( getattr(lhs, "return_type", None) == object + or is_string_dtype(getattr(lhs, "return_type", None)) or getattr(rhs, "return_type", None) == object + or is_string_dtype(getattr(rhs, "return_type", None)) ): # evaluate "==" and "!=" in python if either of our operands - # has an object return type + # has an object or string return type return self._maybe_eval(res, eval_in_python + maybe_eval_in_python) return res diff --git a/pandas/tests/frame/test_query_eval.py b/pandas/tests/frame/test_query_eval.py index fa71153d01157..a574989860957 100644 --- a/pandas/tests/frame/test_query_eval.py +++ b/pandas/tests/frame/test_query_eval.py @@ -4,8 +4,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.errors import ( NumExprClobberingError, UndefinedVariableError, @@ -762,7 +760,6 @@ def test_inf(self, op, f, engine, parser): result = df.query(q, engine=engine, parser=parser) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_check_tz_aware_index_query(self, tz_aware_fixture): # https://github.com/pandas-dev/pandas/issues/29463 tz = tz_aware_fixture @@ -775,6 +772,7 @@ def test_check_tz_aware_index_query(self, tz_aware_fixture): tm.assert_frame_equal(result, expected) expected = 
DataFrame(df_index) + expected.columns = expected.columns.astype(object) result = df.reset_index().query('"2018-01-03 00:00:00+00" < time') tm.assert_frame_equal(result, expected) @@ -1072,7 +1070,7 @@ def test_query_with_string_columns(self, parser, engine): with pytest.raises(NotImplementedError, match=msg): df.query("a in b and c < d", parser=parser, engine=engine) - def test_object_array_eq_ne(self, parser, engine, using_infer_string): + def test_object_array_eq_ne(self, parser, engine): df = DataFrame( { "a": list("aaaabbbbcccc"), @@ -1081,14 +1079,11 @@ def test_object_array_eq_ne(self, parser, engine, using_infer_string): "d": np.random.default_rng(2).integers(9, size=12), } ) - warning = RuntimeWarning if using_infer_string and engine == "numexpr" else None - with tm.assert_produces_warning(warning): - res = df.query("a == b", parser=parser, engine=engine) + res = df.query("a == b", parser=parser, engine=engine) exp = df[df.a == df.b] tm.assert_frame_equal(res, exp) - with tm.assert_produces_warning(warning): - res = df.query("a != b", parser=parser, engine=engine) + res = df.query("a != b", parser=parser, engine=engine) exp = df[df.a != df.b] tm.assert_frame_equal(res, exp) @@ -1128,15 +1123,13 @@ def test_query_with_nested_special_character(self, parser, engine): ], ) def test_query_lex_compare_strings( - self, parser, engine, op, func, using_infer_string + self, parser, engine, op, func ): a = Series(np.random.default_rng(2).choice(list("abcde"), 20)) b = Series(np.arange(a.size)) df = DataFrame({"X": a, "Y": b}) - warning = RuntimeWarning if using_infer_string and engine == "numexpr" else None - with tm.assert_produces_warning(warning): - res = df.query(f'X {op} "d"', engine=engine, parser=parser) + res = df.query(f'X {op} "d"', engine=engine, parser=parser) expected = df[func(df.X, "d")] tm.assert_frame_equal(res, expected) @@ -1400,7 +1393,6 @@ def test_expr_with_column_name_with_backtick(self): expected = df[df["a`b"] < 2] tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_expr_with_string_with_backticks(self): # GH 59285 df = DataFrame(("`", "`````", "``````````"), columns=["#backticks"]) @@ -1408,7 +1400,6 @@ def test_expr_with_string_with_backticks(self): expected = df["```" < df["#backticks"]] tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_expr_with_string_with_backticked_substring_same_as_column_name(self): # GH 59285 df = DataFrame(("`", "`````", "``````````"), columns=["#backticks"]) @@ -1439,7 +1430,6 @@ def test_expr_with_column_names_with_special_characters(self, col1, col2, expr): expected = df[df[col1] < df[col2]] tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_expr_with_no_backticks(self): # GH 59285 df = DataFrame(("aaa", "vvv", "zzz"), columns=["column_name"]) @@ -1483,7 +1473,6 @@ def test_expr_with_quote_opened_before_backtick_and_quote_is_unmatched(self): ): df.query("`column-name` < 'It`s that\\'s \"quote\" #hash") - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_expr_with_quote_opened_before_backtick_and_quote_is_matched_at_end(self): # GH 59285 df = DataFrame(("aaa", "vvv", "zzz"), columns=["column-name"]) @@ -1491,7 +1480,6 @@ def test_expr_with_quote_opened_before_backtick_and_quote_is_matched_at_end(self expected = df[df["column-name"] < 'It`s that\'s "quote" #hash'] tm.assert_frame_equal(result, expected) 
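The ``eval`` and ``expr`` changes above allow string-dtype operands under the default ``numexpr`` engine by routing ``==``/``!=`` comparisons on strings through Python. A short sketch (assumes numexpr is installed; the data is illustrative):

```python
import pandas as pd

df = pd.DataFrame({"a": list("aabb"), "b": list("abab")})
# With str-dtype columns this no longer raises or warns under the
# default engine; the string comparison itself is evaluated in Python.
print(df.query("a == b"))
```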
- @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_expr_with_quote_opened_before_backtick_and_quote_is_matched_in_mid(self): # GH 59285 df = DataFrame(("aaa", "vvv", "zzz"), columns=["column-name"]) From 081dcdee8d754af90e307cf2311b06b3d02fae2a Mon Sep 17 00:00:00 2001 From: Lysandros Nikolaou Date: Mon, 16 Sep 2024 19:40:33 +0200 Subject: [PATCH 050/224] BUG: Remove np._get_promotion_state usage (#59818) --- pandas/tests/series/indexing/test_setitem.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py index 71ba2dab671ef..789e3ac752097 100644 --- a/pandas/tests/series/indexing/test_setitem.py +++ b/pandas/tests/series/indexing/test_setitem.py @@ -4,13 +4,17 @@ datetime, ) from decimal import Decimal +import os import numpy as np import pytest from pandas._config import using_string_dtype -from pandas.compat import HAS_PYARROW +from pandas.compat import ( + HAS_PYARROW, + WASM, +) from pandas.compat.numpy import np_version_gte1p24 from pandas.errors import IndexingError @@ -1446,7 +1450,11 @@ def obj(self): marks=pytest.mark.xfail( ( not np_version_gte1p24 - or (np_version_gte1p24 and np._get_promotion_state() != "weak") + or ( + np_version_gte1p24 + and os.environ.get("NPY_PROMOTION_STATE", "weak") != "weak" + ) + or WASM ), reason="np.float32(1.1) ends up as 1.100000023841858, so " "np_can_hold_element raises and we cast to float64", From 8b1b2114ea72b9b79220e3cb2828b3e562bb5e07 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Tue, 17 Sep 2024 19:00:50 -0400 Subject: [PATCH 051/224] CI: Debug failing ARM builds (#59813) * try bumping cython? * maybe pinning numpy helps? 
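Some context for the reworked xfail condition in the previous patch (#59818): whether ``np.float32(1.1)`` can hold the Python float ``1.1`` losslessly is what decides between keeping float32 and upcasting, and that in turn depends on NumPy's promotion rules (now read from the ``NPY_PROMOTION_STATE`` environment variable instead of the removed ``np._get_promotion_state``). A tiny illustration:

```python
import numpy as np

# 1.1 is not exactly representable in float32:
print(float(np.float32(1.1)))  # 1.100000023841858
# so it differs from the float64 value, which is why
# np_can_hold_element may reject it and force a float64 upcast.
print(np.float32(1.1) == np.float64(1.1))  # False
```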
* skip tests * Update test_sparse.py * go for green * Update test_sparse.py * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- pandas/tests/extension/test_sparse.py | 5 +++++ pandas/tests/series/test_ufunc.py | 5 ++++- pyproject.toml | 12 ++++++++++++ 3 files changed, 21 insertions(+), 1 deletion(-) diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py index 56c023d99bb1c..b7685a61d4937 100644 --- a/pandas/tests/extension/test_sparse.py +++ b/pandas/tests/extension/test_sparse.py @@ -340,11 +340,16 @@ def test_argmin_argmax_all_na(self, method, data, na_value): self._check_unsupported(data) super().test_argmin_argmax_all_na(method, data, na_value) + @pytest.mark.fails_arm_wheels @pytest.mark.parametrize("box", [pd.array, pd.Series, pd.DataFrame]) def test_equals(self, data, na_value, as_series, box): self._check_unsupported(data) super().test_equals(data, na_value, as_series, box) + @pytest.mark.fails_arm_wheels + def test_equals_same_data_different_object(self, data): + super().test_equals_same_data_different_object(data) + @pytest.mark.parametrize( "func, na_action, expected", [ diff --git a/pandas/tests/series/test_ufunc.py b/pandas/tests/series/test_ufunc.py index 36a2afb2162c2..a5976bb2518c9 100644 --- a/pandas/tests/series/test_ufunc.py +++ b/pandas/tests/series/test_ufunc.py @@ -16,7 +16,10 @@ def ufunc(request): return request.param -@pytest.fixture(params=[True, False], ids=["sparse", "dense"]) +@pytest.fixture( + params=[pytest.param(True, marks=pytest.mark.fails_arm_wheels), False], + ids=["sparse", "dense"], +) def sparse(request): return request.param diff --git a/pyproject.toml b/pyproject.toml index 645ded35f3d18..9e4199ab735c6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -163,6 +163,14 @@ before-test = "bash {package}/scripts/cibw_before_test.sh" before-build = "pip install delvewheel && bash {package}/scripts/cibw_before_build.sh" repair-wheel-command = "delvewheel repair -w {dest_dir} {wheel}" +[[tool.cibuildwheel.overrides]] +select = "*-manylinux_aarch64*" +test-command = """ + PANDAS_CI='1' python -c 'import pandas as pd; \ + pd.test(extra_args=["-m not clipboard and not single_cpu and not slow and not network and not db and not fails_arm_wheels", "-n 2", "--no-strict-data-files"]); \ + pd.test(extra_args=["-m not clipboard and single_cpu and not slow and not network and not db", "--no-strict-data-files"]);' \ + """ + [[tool.cibuildwheel.overrides]] select = "*-musllinux*" before-test = "apk update && apk add musl-locales && bash {package}/scripts/cibw_before_test.sh" @@ -478,6 +486,10 @@ markers = [ "clipboard: mark a pd.read_clipboard test", "arm_slow: mark a test as slow for arm64 architecture", "skip_ubsan: Tests known to fail UBSAN check", + # TODO: someone should investigate this ... 
+ # these tests only fail in the wheel builder and don't fail in regular + # ARM CI + "fails_arm_wheels: Tests that fail in the ARM wheel build only", ] [tool.mypy] From a851438906ad5ec5f33df4a28ced85c4a0dcb492 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Wed, 18 Sep 2024 22:20:41 +0530 Subject: [PATCH 052/224] DOC: fix SA01,ES01 for pandas.tseries.offsets.WeekOfMonth (#59834) --- ci/code_checks.sh | 1 - pandas/_libs/tslibs/offsets.pyx | 11 +++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index b57426dbb2078..f2d9f582d8932 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -383,7 +383,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.tseries.offsets.Week.n GL08" \ -i "pandas.tseries.offsets.Week.normalize GL08" \ -i "pandas.tseries.offsets.Week.weekday GL08" \ - -i "pandas.tseries.offsets.WeekOfMonth SA01" \ -i "pandas.tseries.offsets.WeekOfMonth.is_on_offset GL08" \ -i "pandas.tseries.offsets.WeekOfMonth.n GL08" \ -i "pandas.tseries.offsets.WeekOfMonth.normalize GL08" \ diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 043c029ec900c..4fa1af0ec882c 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -3582,6 +3582,11 @@ cdef class WeekOfMonth(WeekOfMonthMixin): """ Describes monthly dates like "the Tuesday of the 2nd week of each month". + This offset allows for generating or adjusting dates by specifying + a particular week and weekday within a month. The week is zero-indexed, + where 0 corresponds to the first week of the month, and weekday follows + a Monday=0 convention. + Attributes ---------- n : int, default 1 @@ -3602,6 +3607,12 @@ cdef class WeekOfMonth(WeekOfMonthMixin): - 5 is Saturday - 6 is Sunday. + See Also + -------- + offsets.Week : Describes weekly frequency adjustments. + offsets.MonthEnd : Describes month-end frequency adjustments. + date_range : Generates a range of dates based on a specific frequency. + Examples -------- >>> ts = pd.Timestamp(2022, 1, 1) From 0ad2c0d549ecc866a334e482afadc96845a01efa Mon Sep 17 00:00:00 2001 From: Matthew Simpson <156332325+ms041223@users.noreply.github.com> Date: Wed, 18 Sep 2024 19:25:53 +0100 Subject: [PATCH 053/224] DOC: Adding ArcticDB to the ecosystem.md page (#59830) * adding ArcticDB to the ecosystem.md page * Update web/pandas/community/ecosystem.md Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> * making pandas lower case --------- Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- web/pandas/community/ecosystem.md | 91 +++++++++++++++++++++++++++++++ 1 file changed, 91 insertions(+) diff --git a/web/pandas/community/ecosystem.md b/web/pandas/community/ecosystem.md index 73a3cb6429790..2ea10954fc929 100644 --- a/web/pandas/community/ecosystem.md +++ b/web/pandas/community/ecosystem.md @@ -367,6 +367,97 @@ pandas-gbq provides high performance reads and writes to and from these methods were exposed as `pandas.read_gbq` and `DataFrame.to_gbq`. Use `pandas_gbq.read_gbq` and `pandas_gbq.to_gbq`, instead. + +### [ArcticDB](https://github.com/man-group/ArcticDB) + +ArcticDB is a serverless DataFrame database engine designed for the Python Data Science ecosystem. ArcticDB enables you to store, retrieve, and process pandas DataFrames at scale. It is a storage engine designed for object storage and also supports local-disk storage using LMDB. 
ArcticDB requires zero additional infrastructure beyond a running Python environment and access to object storage and can be installed in seconds. Please find full documentation [here](https://docs.arcticdb.io/latest/). + +#### ArcticDB Terminology + +ArcticDB is structured to provide a scalable and efficient way to manage and retrieve DataFrames, organized into several key components: + +- `Object Store` Collections of libraries. Used to separate logical environments from each other. Analogous to a database server. +- `Library` Contains multiple symbols which are grouped in a certain way (different users, markets, etc). Analogous to a database. +- `Symbol` Atomic unit of data storage. Identified by a string name. Data stored under a symbol strongly resembles a pandas DataFrame. Analogous to tables. +- `Version` Every modifying action (write, append, update) performed on a symbol creates a new version of that object. + +#### Installation + +To install, simply run: + +```console +pip install arcticdb +``` + +To get started, we can import ArcticDB and instantiate it: + +```python +import arcticdb as adb +import numpy as np +import pandas as pd +# this will set up the storage using the local file system +arctic = adb.Arctic("lmdb://arcticdb_test") +``` + +> **Note:** ArcticDB supports any S3 API compatible storage, including AWS. ArcticDB also supports Azure Blob storage. +> ArcticDB also supports LMDB for local/file based storage - to use LMDB, pass an LMDB path as the URI: `adb.Arctic('lmdb://path/to/desired/database')`. + +#### Library Setup + +ArcticDB is geared towards storing many (potentially millions) of tables. Individual tables (DataFrames) are called symbols and are stored in collections called libraries. A single library can store many symbols. Libraries must first be initialized prior to use: + +```python +lib = arctic.get_library('sample', create_if_missing=True) +``` + +#### Writing Data to ArcticDB + +Now we have a library set up, we can get to reading and writing data. ArcticDB has a set of simple functions for DataFrame storage. Let's write a DataFrame to storage. + +```python +df = pd.DataFrame( + { + "a": list("abc"), + "b": list(range(1, 4)), + "c": np.arange(3, 6).astype("u1"), + "d": np.arange(4.0, 7.0, dtype="float64"), + "e": [True, False, True], + "f": pd.date_range("20130101", periods=3) + } +) + +df +df.dtypes +``` + +Write to ArcticDB. + +```python +write_record = lib.write("test", df) +``` + +> **Note:** When writing pandas DataFrames, ArcticDB supports the following index types: +> +> - `pandas.Index` containing int64 (or the corresponding dedicated types Int64Index, UInt64Index) +> - `RangeIndex` +> - `DatetimeIndex` +> - `MultiIndex` composed of above supported types +> +> The "row" concept in `head`/`tail` refers to the row number ('iloc'), not the value in the `pandas.Index` ('loc'). + +#### Reading Data from ArcticDB + +Read the data back from storage: + +```python +read_record = lib.read("test") +read_record.data +df.dtypes +``` + +ArcticDB also supports appending, updating, and querying data from storage to a pandas DataFrame. Please find more information [here](https://docs.arcticdb.io/latest/api/query_builder/). 
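
A minimal runnable sketch of the append-and-query workflow mentioned just above (an illustrative aside, not part of the committed doc change), reusing the library and the `"test"` symbol written in the preceding examples, and assuming ArcticDB's documented `Library.append` and `QueryBuilder` APIs:

```python
import arcticdb as adb
import numpy as np
import pandas as pd

# Reconnect to the LMDB-backed library created in the examples above.
arctic = adb.Arctic("lmdb://arcticdb_test")
lib = arctic.get_library("sample", create_if_missing=True)

# Append two more rows to the "test" symbol with the same schema as the
# original DataFrame; per the terminology section above, every modifying
# action creates a new version, so the previous version stays readable.
extra = pd.DataFrame(
    {
        "a": list("de"),
        "b": [4, 5],
        "c": np.arange(6, 8).astype("u1"),
        "d": np.arange(7.0, 9.0, dtype="float64"),
        "e": [False, True],
        "f": pd.date_range("20130104", periods=2),
    }
)
lib.append("test", extra)

# Filter on the storage side before materializing a pandas DataFrame.
q = adb.QueryBuilder()
q = q[q["b"] > 2]  # keep only rows where column "b" exceeds 2
filtered = lib.read("test", query_builder=q).data
```

The point of the `QueryBuilder` step is that the filter is evaluated inside ArcticDB before any DataFrame is materialized, rather than reading the full symbol and filtering with pandas afterwards; the exact signatures come from the ArcticDB documentation linked in this section and should be treated as illustrative.
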
+ + ## Out-of-core ### [Bodo](https://bodo.ai/) From 09c7a873cacfcb2caa38329d4bb27d61fd153d74 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Wed, 18 Sep 2024 17:16:55 -0400 Subject: [PATCH 054/224] BLD: Fix bad Cython annotation (#59836) --- pandas/_libs/tslibs/np_datetime.pxd | 2 +- pandas/_libs/tslibs/np_datetime.pyx | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/tslibs/np_datetime.pxd b/pandas/_libs/tslibs/np_datetime.pxd index 43240046c6500..3e5654b70cd92 100644 --- a/pandas/_libs/tslibs/np_datetime.pxd +++ b/pandas/_libs/tslibs/np_datetime.pxd @@ -89,7 +89,7 @@ cdef int string_to_dts( int* out_local, int* out_tzoffset, bint want_exc, - format: str | None = *, + str format = *, bint exact = * ) except? -1 diff --git a/pandas/_libs/tslibs/np_datetime.pyx b/pandas/_libs/tslibs/np_datetime.pyx index 61095b3f034fd..0b02fc13246f0 100644 --- a/pandas/_libs/tslibs/np_datetime.pyx +++ b/pandas/_libs/tslibs/np_datetime.pyx @@ -331,7 +331,7 @@ cdef int string_to_dts( int* out_local, int* out_tzoffset, bint want_exc, - format: str | None=None, + str format=None, bint exact=True, ) except? -1: cdef: From 22372175e04f05f73521cab1b26f0818d6766717 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 19 Sep 2024 00:46:07 +0200 Subject: [PATCH 055/224] BLD/RLS: build wheels with released numpy/cython for Python 3.13 (#59819) --- .github/workflows/wheels.yml | 6 +----- MANIFEST.in | 1 - pyproject.toml | 3 +-- scripts/cibw_before_build.sh | 8 +++----- scripts/cibw_before_test.sh | 8 -------- 5 files changed, 5 insertions(+), 21 deletions(-) delete mode 100644 scripts/cibw_before_test.sh diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index 67d8715f72614..2aaec8c9b56b0 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -102,9 +102,7 @@ jobs: python: [["cp310", "3.10"], ["cp311", "3.11"], ["cp312", "3.12"], ["cp313", "3.13"], ["cp313t", "3.13"]] include: # TODO: Remove this plus installing build deps in cibw_before_build.sh - # and test deps in cibw_before_test.sh after pandas can be built with a released NumPy/Cython - - python: ["cp313", "3.13"] - cibw_build_frontend: 'pip; args: --no-build-isolation' + # after pandas can be built with a released NumPy/Cython - python: ["cp313t", "3.13"] cibw_build_frontend: 'pip; args: --no-build-isolation' # Build Pyodide wheels and upload them to Anaconda.org @@ -187,11 +185,9 @@ jobs: - name: Test Windows Wheels if: ${{ matrix.buildplat[1] == 'win_amd64' }} shell: pwsh - # TODO: Remove NumPy nightly install when there's a 3.13 wheel on PyPI run: | $TST_CMD = @" python -m pip install hypothesis>=6.84.0 pytest>=7.3.2 pytest-xdist>=3.4.0; - ${{ matrix.python[1] == '3.13' && 'python -m pip install -i https://pypi.anaconda.org/scientific-python-nightly-wheels/simple numpy;' }} python -m pip install `$(Get-Item pandas\wheelhouse\*.whl); python -c `'import pandas as pd; pd.test(extra_args=[`\"--no-strict-data-files`\", `\"-m not clipboard and not single_cpu and not slow and not network and not db`\"])`'; "@ diff --git a/MANIFEST.in b/MANIFEST.in index f586d457eaaf8..a7d7d7eb4e062 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -65,4 +65,3 @@ graft pandas/_libs/include # Include cibw script in sdist since it's needed for building wheels include scripts/cibw_before_build.sh -include scripts/cibw_before_test.sh diff --git a/pyproject.toml b/pyproject.toml index 9e4199ab735c6..5ffd9d9a5608c 100644 --- a/pyproject.toml +++ 
b/pyproject.toml @@ -157,7 +157,6 @@ test-command = """ """ free-threaded-support = true before-build = "bash {package}/scripts/cibw_before_build.sh" -before-test = "bash {package}/scripts/cibw_before_test.sh" [tool.cibuildwheel.windows] before-build = "pip install delvewheel && bash {package}/scripts/cibw_before_build.sh" @@ -173,7 +172,7 @@ test-command = """ [[tool.cibuildwheel.overrides]] select = "*-musllinux*" -before-test = "apk update && apk add musl-locales && bash {package}/scripts/cibw_before_test.sh" +before-test = "apk update && apk add musl-locales" [[tool.cibuildwheel.overrides]] select = "*-win*" diff --git a/scripts/cibw_before_build.sh b/scripts/cibw_before_build.sh index f3049b27ed5d1..6186340807f8f 100644 --- a/scripts/cibw_before_build.sh +++ b/scripts/cibw_before_build.sh @@ -1,8 +1,6 @@ -# TODO: Delete when there's PyPI NumPy/Cython releases the support Python 3.13. -# If free-threading support is not included in those releases, this script will have -# to whether this runs for a free-threaded build instead. -PYTHON_VERSION="$(python -c "import sys; print(f'{sys.version_info.major}{sys.version_info.minor}')")" -if [[ $PYTHON_VERSION == "313" ]]; then +# TODO: Delete when there's a PyPI Cython release that supports free-threaded Python 3.13. +FREE_THREADED_BUILD="$(python -c"import sysconfig; print(bool(sysconfig.get_config_var('Py_GIL_DISABLED')))")" +if [[ $FREE_THREADED_BUILD == "True" ]]; then python -m pip install -U pip python -m pip install -i https://pypi.anaconda.org/scientific-python-nightly-wheels/simple numpy cython python -m pip install ninja meson-python versioneer[toml] diff --git a/scripts/cibw_before_test.sh b/scripts/cibw_before_test.sh deleted file mode 100644 index 7d1b143881ced..0000000000000 --- a/scripts/cibw_before_test.sh +++ /dev/null @@ -1,8 +0,0 @@ -# TODO: Delete when there's PyPI NumPy/Cython releases the support Python 3.13. -# If free-threading support is not included in those releases, this script will have -# to whether this runs for a free-threaded build instead. -PYTHON_VERSION="$(python -c "import sys; print(f'{sys.version_info.major}{sys.version_info.minor}')")" -if [[ $PYTHON_VERSION == "313" ]]; then - python -m pip install -U pip - python -m pip install -i https://pypi.anaconda.org/scientific-python-nightly-wheels/simple numpy -fi From f1e6cc184ae0534e11c0a2947f4948bc4c5e0a9d Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Thu, 19 Sep 2024 15:39:08 -0400 Subject: [PATCH 056/224] BLD: Final release prep for 2.2.3 (#59840) * BLD: Final release prep * change back perms * debug * try to fix license addition * silence stable version warning? --- doc/source/conf.py | 4 +++- doc/source/whatsnew/v2.2.2.rst | 2 +- doc/source/whatsnew/v2.2.3.rst | 23 ++++++++++++++++------- doc/source/whatsnew/v3.0.0.rst | 1 - pyproject.toml | 2 +- scripts/cibw_before_build.sh | 5 +++++ 6 files changed, 26 insertions(+), 11 deletions(-) diff --git a/doc/source/conf.py b/doc/source/conf.py index 77dd5d03d311c..ddbda0aa3bf65 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -254,7 +254,9 @@ "json_url": "https://pandas.pydata.org/versions.json", "version_match": switcher_version, }, - "show_version_warning_banner": True, + # This shows a warning for patch releases since the + # patch version doesn't compare as equal (e.g. 
2.2.1 != 2.2.0 but it should be) + "show_version_warning_banner": False, "icon_links": [ { "name": "Mastodon", diff --git a/doc/source/whatsnew/v2.2.2.rst b/doc/source/whatsnew/v2.2.2.rst index 72a2f84c4aaee..fbe5e9b4febb5 100644 --- a/doc/source/whatsnew/v2.2.2.rst +++ b/doc/source/whatsnew/v2.2.2.rst @@ -56,4 +56,4 @@ Other Contributors ~~~~~~~~~~~~ -.. contributors:: v2.2.1..v2.2.2|HEAD +.. contributors:: v2.2.1..v2.2.2 diff --git a/doc/source/whatsnew/v2.2.3.rst b/doc/source/whatsnew/v2.2.3.rst index aa6e241e74b0a..1696a7b6449af 100644 --- a/doc/source/whatsnew/v2.2.3.rst +++ b/doc/source/whatsnew/v2.2.3.rst @@ -1,6 +1,6 @@ .. _whatsnew_223: -What's new in 2.2.3 (September XX, 2024) +What's new in 2.2.3 (September 20, 2024) ---------------------------------------- These are the changes in pandas 2.2.3. See :ref:`release` for a full changelog @@ -9,28 +9,37 @@ including other versions of pandas. {{ header }} .. --------------------------------------------------------------------------- -.. _whatsnew_223.regressions: -Fixed regressions -~~~~~~~~~~~~~~~~~ -- +.. _whatsnew_220.py13_compat: + +Pandas 2.2.3 is now compatible with Python 3.13 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Pandas 2.2.3 is the first version of pandas that is generally compatible with the upcoming +Python 3.13, and both wheels for free-threaded and normal Python 3.13 will be uploaded for +this release. + +As usual please report any bugs discovered to our `issue tracker `_ .. --------------------------------------------------------------------------- .. _whatsnew_223.bug_fixes: Bug fixes ~~~~~~~~~ -- +- Bug in :func:`eval` on :class:`complex` including division ``/`` discards imaginary part. (:issue:`21374`) +- Minor fixes for numpy 2.1 compatibility. (:issue:`59444`) .. --------------------------------------------------------------------------- .. _whatsnew_223.other: Other ~~~~~ -- +- Missing licenses for 3rd party dependencies were added back into the wheels. (:issue:`58632`) .. --------------------------------------------------------------------------- .. _whatsnew_223.contributors: Contributors ~~~~~~~~~~~~ + +.. contributors:: v2.2.2..v2.2.3|HEAD diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 89a1c388b3ba1..c2a56afbc580e 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -693,7 +693,6 @@ Other ^^^^^ - Bug in :class:`DataFrame` when passing a ``dict`` with a NA scalar and ``columns`` that would always return ``np.nan`` (:issue:`57205`) - Bug in :func:`eval` on :class:`ExtensionArray` on including division ``/`` failed with a ``TypeError``. (:issue:`58748`) -- Bug in :func:`eval` on :class:`complex` including division ``/`` discards imaginary part. (:issue:`21374`) - Bug in :func:`eval` where the names of the :class:`Series` were not preserved when using ``engine="numexpr"``. 
(:issue:`10239`) - Bug in :func:`unique` on :class:`Index` not always returning :class:`Index` (:issue:`57043`) - Bug in :meth:`DataFrame.apply` where passing ``engine="numba"`` ignored ``args`` passed to the applied function (:issue:`58712`) diff --git a/pyproject.toml b/pyproject.toml index 5ffd9d9a5608c..d0fcdc4b21b33 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -156,7 +156,7 @@ test-command = """ pd.test(extra_args=["-m not clipboard and single_cpu and not slow and not network and not db", "--no-strict-data-files"]);' \ """ free-threaded-support = true -before-build = "bash {package}/scripts/cibw_before_build.sh" +before-build = "PACKAGE_DIR={package} bash {package}/scripts/cibw_before_build.sh" [tool.cibuildwheel.windows] before-build = "pip install delvewheel && bash {package}/scripts/cibw_before_build.sh" diff --git a/scripts/cibw_before_build.sh b/scripts/cibw_before_build.sh index 6186340807f8f..679b91e3280ec 100644 --- a/scripts/cibw_before_build.sh +++ b/scripts/cibw_before_build.sh @@ -1,3 +1,8 @@ +# Add 3rd party licenses, like numpy does +for file in $PACKAGE_DIR/LICENSES/*; do + cat $file >> $PACKAGE_DIR/LICENSE +done + # TODO: Delete when there's a PyPI Cython release that supports free-threaded Python 3.13. FREE_THREADED_BUILD="$(python -c"import sysconfig; print(bool(sysconfig.get_config_var('Py_GIL_DISABLED')))")" if [[ $FREE_THREADED_BUILD == "True" ]]; then From 2419343bfea5dba678146139ca9663d831c47b22 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Fri, 20 Sep 2024 07:39:20 -0400 Subject: [PATCH 057/224] BLD: Build wheels for Python 3.13 on aarch64 as well (#59847) * BLD: Build wheels for Python 3.13 on aarch64 as well * some fixups * another typo --- .circleci/config.yml | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 27b6829dcda70..9c986e5b1b054 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -92,7 +92,13 @@ jobs: no_output_timeout: 30m # Sometimes the tests won't generate any output, make sure the job doesn't get killed by that command: | pip3 install cibuildwheel==2.20.0 - cibuildwheel --output-dir wheelhouse + if [[ $CIBW_BUILD == cp313t* ]]; then + # TODO: temporarily run 3.13 free threaded builds without build isolation + # since we need pre-release cython + CIBW_BUILD_FRONTEND="pip; args: --no-build-isolation" cibuildwheel --output-dir wheelhouse + else + cibuildwheel --output-dir wheelhouse + fi environment: CIBW_BUILD: << parameters.cibw-build >> @@ -141,6 +147,10 @@ workflows: cibw-build: ["cp310-manylinux_aarch64", "cp311-manylinux_aarch64", "cp312-manylinux_aarch64", + "cp313-manylinux_aarch64", + "cp313t-manylinux_aarch64", "cp310-musllinux_aarch64", "cp311-musllinux_aarch64", - "cp312-musllinux_aarch64",] + "cp312-musllinux_aarch64", + "cp313-musllinux_aarch64", + "cp313t-musllinux_aarch64"] From 71b395f2cf513f7c4ef8b50c608072bf3950e596 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Sun, 22 Sep 2024 19:27:28 +0530 Subject: [PATCH 058/224] DOC: fix RT03 for pandas.core.groupby.DataFrameGroupBy.hist (#59870) --- ci/code_checks.sh | 1 - pandas/core/groupby/generic.py | 4 +++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index f2d9f582d8932..21104c2e00450 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -144,7 +144,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.core.groupby.DataFrameGroupBy.boxplot PR07,RT03,SA01" \ -i 
"pandas.core.groupby.DataFrameGroupBy.get_group RT03,SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.groups SA01" \ - -i "pandas.core.groupby.DataFrameGroupBy.hist RT03" \ -i "pandas.core.groupby.DataFrameGroupBy.indices SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.nth PR02" \ -i "pandas.core.groupby.DataFrameGroupBy.nunique SA01" \ diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index eae33ddc1df29..bec9d344d42e2 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -2694,7 +2694,9 @@ def hist( Returns ------- - matplotlib.Axes or numpy.ndarray of them + matplotlib.Axes or numpy.ndarray + A ``matplotlib.Axes`` object or an array of ``Axes`` objects, depending on + the layout and grouping. See Also -------- From 2cdb97e2f806d83965c7dee8fb5fcf164a340379 Mon Sep 17 00:00:00 2001 From: Fawaz Ahmed Date: Tue, 24 Sep 2024 06:21:39 +0530 Subject: [PATCH 059/224] BUG: Fix precision loss in read_json (#59284) --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/io/json/_json.py | 3 ++- pandas/tests/io/json/test_pandas.py | 12 ++++++++++++ 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index c2a56afbc580e..3b5183c43bcd0 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -630,6 +630,7 @@ I/O - Bug in :meth:`read_csv` where the order of the ``na_values`` makes an inconsistency when ``na_values`` is a list non-string values. (:issue:`59303`) - Bug in :meth:`read_excel` raising ``ValueError`` when passing array of boolean values when ``dtype="boolean"``. (:issue:`58159`) - Bug in :meth:`read_json` not validating the ``typ`` argument to not be exactly ``"frame"`` or ``"series"`` (:issue:`59124`) +- Bug in :meth:`read_json` where extreme value integers in string format were incorrectly parsed as a different integer number (:issue:`20608`) - Bug in :meth:`read_stata` raising ``KeyError`` when input file is stored in big-endian format and contains strL data. (:issue:`58638`) - Bug in :meth:`read_stata` where extreme value integers were incorrectly interpreted as missing for format versions 111 and prior (:issue:`58130`) - Bug in :meth:`read_stata` where the missing code for double was not recognised for format versions 105 and prior (:issue:`58149`) diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index d077b9e0c4568..e9c9f5ba225a5 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -1168,6 +1168,7 @@ def _try_convert_data( """ Try to parse a Series into a column by inferring dtype. 
""" + org_data = data # don't try to coerce, unless a force conversion if use_dtypes: if not self.dtype: @@ -1222,7 +1223,7 @@ def _try_convert_data( if len(data) and data.dtype in ("float", "object"): # coerce ints if we can try: - new_data = data.astype("int64") + new_data = org_data.astype("int64") if (new_data == data).all(): data = new_data converted = True diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 1c54232b8b510..d3328d1dfcaef 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -2286,3 +2286,15 @@ def test_read_json_lines_rangeindex(): result = read_json(StringIO(data), lines=True).index expected = RangeIndex(2) tm.assert_index_equal(result, expected, exact=True) + + +def test_large_number(): + # GH#20608 + result = read_json( + StringIO('["9999999999999999"]'), + orient="values", + typ="series", + convert_dates=False, + ) + expected = Series([9999999999999999]) + tm.assert_series_equal(result, expected) From dc24410c0fbbfff2b191247dc7dc963cc92c0321 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Wed, 25 Sep 2024 23:15:06 +0530 Subject: [PATCH 060/224] DOC: fix SA01 for pandas.api.types.is_int64_dtype (#59862) --- ci/code_checks.sh | 1 - pandas/core/dtypes/common.py | 10 ++++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 21104c2e00450..3d31781b886ab 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -117,7 +117,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.api.types.is_float PR01,SA01" \ -i "pandas.api.types.is_float_dtype SA01" \ -i "pandas.api.types.is_hashable PR01,RT03,SA01" \ - -i "pandas.api.types.is_int64_dtype SA01" \ -i "pandas.api.types.is_integer PR01,SA01" \ -i "pandas.api.types.is_interval_dtype SA01" \ -i "pandas.api.types.is_iterator PR07,SA01" \ diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index ff855f97a352b..0252927241ef4 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -886,6 +886,16 @@ def is_int64_dtype(arr_or_dtype) -> bool: boolean Whether or not the array or dtype is of the int64 dtype. + See Also + -------- + api.types.is_float_dtype : Check whether the provided array or dtype is of a + float dtype. + api.types.is_bool_dtype : Check whether the provided array or dtype is of a + boolean dtype. + api.types.is_object_dtype : Check whether an array-like or dtype is of the + object dtype. + numpy.int64 : Numpy's 64-bit integer type. 
+ Notes ----- Depending on system architecture, the return value of `is_int64_dtype( From b91be12f8854d87e0f1c6cf9e2db7a5e68983be1 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Wed, 25 Sep 2024 23:16:03 +0530 Subject: [PATCH 061/224] DOC: fix SA01, ES01 for pandas.api.types.is_float_dtype (#59861) --- ci/code_checks.sh | 1 - pandas/core/dtypes/common.py | 12 ++++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 3d31781b886ab..119c6e2b33684 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -115,7 +115,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.api.types.is_dict_like PR07,SA01" \ -i "pandas.api.types.is_file_like PR07,SA01" \ -i "pandas.api.types.is_float PR01,SA01" \ - -i "pandas.api.types.is_float_dtype SA01" \ -i "pandas.api.types.is_hashable PR01,RT03,SA01" \ -i "pandas.api.types.is_integer PR01,SA01" \ -i "pandas.api.types.is_interval_dtype SA01" \ diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 0252927241ef4..48d2106aff124 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -1285,6 +1285,9 @@ def is_float_dtype(arr_or_dtype) -> bool: """ Check whether the provided array or dtype is of a float dtype. + The function checks for floating-point data types, which represent real numbers + that may have fractional components. + Parameters ---------- arr_or_dtype : array-like or dtype @@ -1295,6 +1298,15 @@ def is_float_dtype(arr_or_dtype) -> bool: boolean Whether or not the array or dtype is of a float dtype. + See Also + -------- + api.types.is_numeric_dtype : Check whether the provided array or dtype is of + a numeric dtype. + api.types.is_integer_dtype : Check whether the provided array or dtype is of + an integer dtype. + api.types.is_object_dtype : Check whether an array-like or dtype is of the + object dtype. + Examples -------- >>> from pandas.api.types import is_float_dtype From b81ed16389385ad1272e94d2796db31ce8ccbafd Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Wed, 25 Sep 2024 23:19:34 +0530 Subject: [PATCH 062/224] DOC: fix SA01, ES01 for pandas.Series.sparse.sp_values (#59859) --- ci/code_checks.sh | 1 - pandas/core/arrays/sparse/array.py | 12 ++++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 119c6e2b33684..42955a6476734 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -100,7 +100,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Series.sparse.fill_value SA01" \ -i "pandas.Series.sparse.from_coo PR07,SA01" \ -i "pandas.Series.sparse.npoints SA01" \ - -i "pandas.Series.sparse.sp_values SA01" \ -i "pandas.Timedelta.max PR02" \ -i "pandas.Timedelta.min PR02" \ -i "pandas.Timedelta.resolution PR02" \ diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index a09dc20af3b36..40012357f40cd 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -603,6 +603,18 @@ def sp_values(self) -> np.ndarray: """ An ndarray containing the non- ``fill_value`` values. + This property returns the actual data values stored in the sparse + representation, excluding the values that are equal to the ``fill_value``. + The result is an ndarray of the underlying values, preserving the sparse + structure by omitting the default ``fill_value`` entries. + + See Also + -------- + Series.sparse.to_dense : Convert a Series from sparse values to dense. 
+ Series.sparse.fill_value : Elements in `data` that are `fill_value` are + not stored. + Series.sparse.density : The percent of non- ``fill_value`` points, as decimal. + Examples -------- >>> from pandas.arrays import SparseArray From 7cebd7822ba0598f53fdd6dd8141c66b949c9023 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Wed, 25 Sep 2024 23:22:11 +0530 Subject: [PATCH 063/224] DOC: fix SA01 for pandas.Series.sparse.fill_value (#59858) --- ci/code_checks.sh | 1 - pandas/core/arrays/sparse/array.py | 6 ++++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 42955a6476734..e0d6efa0278e4 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -97,7 +97,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Series.dt.tz_localize PR01,PR02" \ -i "pandas.Series.dt.unit GL08" \ -i "pandas.Series.pad PR01,SA01" \ - -i "pandas.Series.sparse.fill_value SA01" \ -i "pandas.Series.sparse.from_coo PR07,SA01" \ -i "pandas.Series.sparse.npoints SA01" \ -i "pandas.Timedelta.max PR02" \ diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 40012357f40cd..c8ec4068ca199 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -635,6 +635,12 @@ def fill_value(self): For memory savings, this should be the most common value in the array. + See Also + -------- + SparseDtype : Dtype for data stored in :class:`SparseArray`. + Series.value_counts : Return a Series containing counts of unique values. + Series.fillna : Fill NA/NaN in a Series with a specified value. + Examples -------- >>> ser = pd.Series([0, 0, 2, 2, 2], dtype="Sparse[int]") From 5b6997ca14187b31a87490b9e61e3af4cbdda6d7 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Wed, 25 Sep 2024 23:24:42 +0530 Subject: [PATCH 064/224] DOC: fix SA01, ES01 for pandas.tseries.offsets.SemiMonthEnd (#59856) DOC: fix SA01 for pandas.tseries.offsets.SemiMonthEnd --- ci/code_checks.sh | 1 - pandas/_libs/tslibs/offsets.pyx | 12 ++++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index e0d6efa0278e4..7cc314007aabd 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -364,7 +364,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.tseries.offsets.SemiMonthBegin.n GL08" \ -i "pandas.tseries.offsets.SemiMonthBegin.normalize GL08" \ -i "pandas.tseries.offsets.SemiMonthBegin.rule_code GL08" \ - -i "pandas.tseries.offsets.SemiMonthEnd SA01" \ -i "pandas.tseries.offsets.SemiMonthEnd.day_of_month GL08" \ -i "pandas.tseries.offsets.SemiMonthEnd.is_on_offset GL08" \ -i "pandas.tseries.offsets.SemiMonthEnd.n GL08" \ diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 4fa1af0ec882c..4db96fbaa3aad 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -3316,6 +3316,11 @@ cdef class SemiMonthEnd(SemiMonthOffset): """ Two DateOffset's per month repeating on the last day of the month & day_of_month. + This offset allows for flexibility in generating date ranges or adjusting dates + to the end of a month or a specific day in the month, such as the 15th or the last + day of the month. It is useful for financial or scheduling applications where + events occur bi-monthly. + Attributes ---------- n : int, default 1 @@ -3325,6 +3330,13 @@ cdef class SemiMonthEnd(SemiMonthOffset): day_of_month : int, {1, 3,...,27}, default 15 A specific integer for the day of the month. 
+ See Also + -------- + tseries.offsets.SemiMonthBegin : Offset for semi-monthly frequencies, starting at + the beginning of the month. + tseries.offsets.MonthEnd : Offset to the last calendar day of the month. + tseries.offsets.MonthBegin : Offset to the first calendar day of the month. + Examples -------- >>> ts = pd.Timestamp(2022, 1, 14) From a9e30c5f62d080aea7629ca17cf1e9c0e8c3e080 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 25 Sep 2024 19:57:49 +0200 Subject: [PATCH 065/224] String dtype: map builtin str alias to StringDtype (#59685) * String dtype: map builtin str alias to StringDtype * fix tests * fix datetimelike astype and more tests * remove xfails * try fix typing * fix copy_view tests * fix remaining tests with infer_string enabled * ignore typing issue for now * move to common.py * simplify Categorical._str_get_dummies * small cleanup * fix ensure_string_array to not modify extension arrays inplace * fix ensure_string_array once more + fix is_extension_array_dtype for str * still xfail TestArrowArray::test_astype_str when not using infer_string * ensure maybe_convert_objects copies object dtype input array when inferring StringDtype * update test_1d_object_array_does_not_copy test * update constructor copy test + do not copy in maybe_convert_objects? * skip str.get_dummies test for now * use pandas_dtype() instead of registry.find * fix corner cases for calling pandas_dtype * add TODO comment in ensure_string_array --- pandas/_libs/lib.pyx | 9 +++- pandas/_testing/__init__.py | 2 +- pandas/core/arrays/categorical.py | 4 +- pandas/core/arrays/datetimelike.py | 10 ++++- pandas/core/dtypes/common.py | 18 +++++++- pandas/core/indexes/base.py | 6 ++- pandas/core/indexes/interval.py | 3 +- pandas/tests/arrays/floating/test_astype.py | 6 +-- pandas/tests/arrays/integer/test_dtypes.py | 6 +-- pandas/tests/arrays/sparse/test_astype.py | 4 +- pandas/tests/arrays/sparse/test_dtype.py | 2 +- pandas/tests/dtypes/test_common.py | 12 ++++++ pandas/tests/extension/base/casting.py | 4 +- pandas/tests/extension/json/array.py | 3 +- pandas/tests/extension/test_arrow.py | 29 +++---------- pandas/tests/frame/methods/test_astype.py | 17 ++++---- .../tests/frame/methods/test_select_dtypes.py | 5 ++- pandas/tests/frame/test_constructors.py | 41 +++++++++++++++---- .../indexes/datetimes/methods/test_astype.py | 15 ++++--- pandas/tests/indexes/object/test_astype.py | 4 +- .../indexes/period/methods/test_astype.py | 9 +++- .../indexes/timedeltas/methods/test_astype.py | 9 +++- pandas/tests/interchange/test_impl.py | 1 + pandas/tests/io/excel/test_readers.py | 6 +-- .../io/parser/dtypes/test_dtypes_basic.py | 17 ++++---- pandas/tests/io/parser/test_na_values.py | 2 - .../io/parser/test_python_parser_only.py | 6 +-- pandas/tests/series/methods/test_astype.py | 30 ++++++++------ pandas/tests/series/methods/test_map.py | 4 +- pandas/tests/series/test_constructors.py | 2 +- pandas/tests/strings/test_get_dummies.py | 3 ++ pandas/tests/test_algos.py | 7 +++- 32 files changed, 185 insertions(+), 111 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 3f2dfbfb3b404..8af48a861967a 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -754,7 +754,14 @@ cpdef ndarray[object] ensure_string_array( if hasattr(arr, "to_numpy"): - if hasattr(arr, "dtype") and arr.dtype.kind in "mM": + if ( + hasattr(arr, "dtype") + and arr.dtype.kind in "mM" + # TODO: we should add a custom ArrowExtensionArray.astype implementation + # that handles astype(str) specifically, avoiding 
ending up here and + # then we can remove the below check for `_pa_array` (for ArrowEA) + and not hasattr(arr, "_pa_array") + ): # dtype check to exclude DataFrame # GH#41409 TODO: not a great place for this out = arr.astype(str).astype(object) diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index 5fa1a984b8aea..0be01da1816a2 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -108,7 +108,7 @@ COMPLEX_DTYPES: list[Dtype] = [complex, "complex64", "complex128"] if using_string_dtype(): - STRING_DTYPES: list[Dtype] = [str, "U"] + STRING_DTYPES: list[Dtype] = ["U"] else: STRING_DTYPES: list[Dtype] = [str, "str", "U"] # type: ignore[no-redef] COMPLEX_FLOAT_DTYPES: list[Dtype] = [*COMPLEX_DTYPES, *FLOAT_NUMPY_DTYPES] diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 8e0225b31e17b..a69e197df851d 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2685,7 +2685,9 @@ def _str_get_dummies(self, sep: str = "|", dtype: NpDtype | None = None): # sep may not be in categories. Just bail on this. from pandas.core.arrays import NumpyExtensionArray - return NumpyExtensionArray(self.astype(str))._str_get_dummies(sep, dtype) + return NumpyExtensionArray(self.to_numpy(str, na_value="NaN"))._str_get_dummies( + sep, dtype + ) # ------------------------------------------------------------------------ # GroupBy Methods diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index fbe1677b95b33..7be8daa09c758 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -471,10 +471,16 @@ def astype(self, dtype, copy: bool = True): return self._box_values(self.asi8.ravel()).reshape(self.shape) + elif is_string_dtype(dtype): + if isinstance(dtype, ExtensionDtype): + arr_object = self._format_native_types(na_rep=dtype.na_value) # type: ignore[arg-type] + cls = dtype.construct_array_type() + return cls._from_sequence(arr_object, dtype=dtype, copy=False) + else: + return self._format_native_types() + elif isinstance(dtype, ExtensionDtype): return super().astype(dtype, copy=copy) - elif is_string_dtype(dtype): - return self._format_native_types() elif dtype.kind in "iu": # we deliberately ignore int32 vs. int64 here. # See https://github.com/pandas-dev/pandas/issues/24381 for more. diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 48d2106aff124..1a38bb03b2c1c 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -12,6 +12,8 @@ import numpy as np +from pandas._config import using_string_dtype + from pandas._libs import ( Interval, Period, @@ -1470,7 +1472,15 @@ def is_extension_array_dtype(arr_or_dtype) -> bool: elif isinstance(dtype, np.dtype): return False else: - return registry.find(dtype) is not None + try: + with warnings.catch_warnings(): + # pandas_dtype(..) can raise UserWarning for class input + warnings.simplefilter("ignore", UserWarning) + dtype = pandas_dtype(dtype) + except (TypeError, ValueError): + # np.dtype(..) 
can raise ValueError + return False + return isinstance(dtype, ExtensionDtype) def is_ea_or_datetimelike_dtype(dtype: DtypeObj | None) -> bool: @@ -1773,6 +1783,12 @@ def pandas_dtype(dtype) -> DtypeObj: elif isinstance(dtype, (np.dtype, ExtensionDtype)): return dtype + # builtin aliases + if dtype is str and using_string_dtype(): + from pandas.core.arrays.string_ import StringDtype + + return StringDtype(na_value=np.nan) + # registered extension types result = registry.find(dtype) if result is not None: diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 2346c20004210..852049804a4f5 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -6262,7 +6262,11 @@ def _should_compare(self, other: Index) -> bool: return False dtype = _unpack_nested_dtype(other) - return self._is_comparable_dtype(dtype) or is_object_dtype(dtype) + return ( + self._is_comparable_dtype(dtype) + or is_object_dtype(dtype) + or is_string_dtype(dtype) + ) def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: """ diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 359cdf880937b..8feac890883eb 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -51,6 +51,7 @@ is_number, is_object_dtype, is_scalar, + is_string_dtype, pandas_dtype, ) from pandas.core.dtypes.dtypes import ( @@ -712,7 +713,7 @@ def _get_indexer( # left/right get_indexer, compare elementwise, equality -> match indexer = self._get_indexer_unique_sides(target) - elif not is_object_dtype(target.dtype): + elif not (is_object_dtype(target.dtype) or is_string_dtype(target.dtype)): # homogeneous scalar index: use IntervalTree # we should always have self._should_partial_index(target) here target = self._maybe_convert_i8(target) diff --git a/pandas/tests/arrays/floating/test_astype.py b/pandas/tests/arrays/floating/test_astype.py index ccf644b34051d..752ebe194ffcf 100644 --- a/pandas/tests/arrays/floating/test_astype.py +++ b/pandas/tests/arrays/floating/test_astype.py @@ -68,11 +68,9 @@ def test_astype_str(using_infer_string): if using_infer_string: expected = pd.array(["0.1", "0.2", None], dtype=pd.StringDtype(na_value=np.nan)) - tm.assert_extension_array_equal(a.astype("str"), expected) - # TODO(infer_string) this should also be a string array like above - expected = np.array(["0.1", "0.2", ""], dtype="U32") - tm.assert_numpy_array_equal(a.astype(str), expected) + tm.assert_extension_array_equal(a.astype(str), expected) + tm.assert_extension_array_equal(a.astype("str"), expected) else: expected = np.array(["0.1", "0.2", ""], dtype="U32") diff --git a/pandas/tests/arrays/integer/test_dtypes.py b/pandas/tests/arrays/integer/test_dtypes.py index fadd7ac67b58d..7972ba7b9fb0f 100644 --- a/pandas/tests/arrays/integer/test_dtypes.py +++ b/pandas/tests/arrays/integer/test_dtypes.py @@ -281,11 +281,9 @@ def test_astype_str(using_infer_string): if using_infer_string: expected = pd.array(["1", "2", None], dtype=pd.StringDtype(na_value=np.nan)) - tm.assert_extension_array_equal(a.astype("str"), expected) - # TODO(infer_string) this should also be a string array like above - expected = np.array(["1", "2", ""], dtype=f"{tm.ENDIAN}U21") - tm.assert_numpy_array_equal(a.astype(str), expected) + tm.assert_extension_array_equal(a.astype(str), expected) + tm.assert_extension_array_equal(a.astype("str"), expected) else: expected = np.array(["1", "2", ""], dtype=f"{tm.ENDIAN}U21") diff --git a/pandas/tests/arrays/sparse/test_astype.py 
b/pandas/tests/arrays/sparse/test_astype.py index 83a507e679d46..e6e4a11a0f5ab 100644 --- a/pandas/tests/arrays/sparse/test_astype.py +++ b/pandas/tests/arrays/sparse/test_astype.py @@ -81,8 +81,8 @@ def test_astype_all(self, any_real_numpy_dtype): ), ( SparseArray([0, 1, 10]), - str, - SparseArray(["0", "1", "10"], dtype=SparseDtype(str, "0")), + np.str_, + SparseArray(["0", "1", "10"], dtype=SparseDtype(np.str_, "0")), ), (SparseArray(["10", "20"]), float, SparseArray([10.0, 20.0])), ( diff --git a/pandas/tests/arrays/sparse/test_dtype.py b/pandas/tests/arrays/sparse/test_dtype.py index 1819744d9a9ae..6143163735ab8 100644 --- a/pandas/tests/arrays/sparse/test_dtype.py +++ b/pandas/tests/arrays/sparse/test_dtype.py @@ -184,7 +184,7 @@ def test_construct_from_string_fill_value_raises(string): [ (SparseDtype(int, 0), float, SparseDtype(float, 0.0)), (SparseDtype(int, 1), float, SparseDtype(float, 1.0)), - (SparseDtype(int, 1), str, SparseDtype(object, "1")), + (SparseDtype(int, 1), np.str_, SparseDtype(object, "1")), (SparseDtype(float, 1.5), int, SparseDtype(int, 1)), ], ) diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index 2c2dff7a957fe..e338fb1331734 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -810,11 +810,23 @@ def test_pandas_dtype_string_dtypes(string_storage): "pyarrow" if HAS_PYARROW else "python", na_value=np.nan ) + with pd.option_context("future.infer_string", True): + # with the default string_storage setting + result = pandas_dtype(str) + assert result == pd.StringDtype( + "pyarrow" if HAS_PYARROW else "python", na_value=np.nan + ) + with pd.option_context("future.infer_string", True): with pd.option_context("string_storage", string_storage): result = pandas_dtype("str") assert result == pd.StringDtype(string_storage, na_value=np.nan) + with pd.option_context("future.infer_string", True): + with pd.option_context("string_storage", string_storage): + result = pandas_dtype(str) + assert result == pd.StringDtype(string_storage, na_value=np.nan) + with pd.option_context("future.infer_string", False): with pd.option_context("string_storage", string_storage): result = pandas_dtype("str") diff --git a/pandas/tests/extension/base/casting.py b/pandas/tests/extension/base/casting.py index e924e38ee5030..8e3f21e1a4f56 100644 --- a/pandas/tests/extension/base/casting.py +++ b/pandas/tests/extension/base/casting.py @@ -44,8 +44,8 @@ def test_tolist(self, data): assert result == expected def test_astype_str(self, data): - result = pd.Series(data[:5]).astype(str) - expected = pd.Series([str(x) for x in data[:5]], dtype=str) + result = pd.Series(data[:2]).astype(str) + expected = pd.Series([str(x) for x in data[:2]], dtype=str) tm.assert_series_equal(result, expected) @pytest.mark.parametrize( diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py index 3a4391edc99ef..4fa48023fbc95 100644 --- a/pandas/tests/extension/json/array.py +++ b/pandas/tests/extension/json/array.py @@ -208,9 +208,8 @@ def astype(self, dtype, copy=True): return self.copy() return self elif isinstance(dtype, StringDtype): - value = self.astype(str) # numpy doesn't like nested dicts arr_cls = dtype.construct_array_type() - return arr_cls._from_sequence(value, dtype=dtype, copy=False) + return arr_cls._from_sequence(self, dtype=dtype, copy=False) elif not copy: return np.asarray([dict(x) for x in self], dtype=dtype) else: diff --git a/pandas/tests/extension/test_arrow.py 
b/pandas/tests/extension/test_arrow.py index f86d927ddda67..f56094dfd47ca 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -43,7 +43,6 @@ pa_version_under13p0, pa_version_under14p0, ) -import pandas.util._test_decorators as td from pandas.core.dtypes.dtypes import ( ArrowDtype, @@ -292,7 +291,7 @@ def test_map(self, data_missing, na_action): expected = data_missing.to_numpy() tm.assert_numpy_array_equal(result, expected) - def test_astype_str(self, data, request): + def test_astype_str(self, data, request, using_infer_string): pa_dtype = data.dtype.pyarrow_dtype if pa.types.is_binary(pa_dtype): request.applymarker( @@ -300,9 +299,10 @@ def test_astype_str(self, data, request): reason=f"For {pa_dtype} .astype(str) decodes.", ) ) - elif ( - pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is None - ) or pa.types.is_duration(pa_dtype): + elif not using_infer_string and ( + (pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is None) + or pa.types.is_duration(pa_dtype) + ): request.applymarker( pytest.mark.xfail( reason="pd.Timestamp/pd.Timedelta repr different from numpy repr", @@ -310,25 +310,6 @@ def test_astype_str(self, data, request): ) super().test_astype_str(data) - @pytest.mark.parametrize( - "nullable_string_dtype", - [ - "string[python]", - pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")), - ], - ) - def test_astype_string(self, data, nullable_string_dtype, request): - pa_dtype = data.dtype.pyarrow_dtype - if ( - pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is None - ) or pa.types.is_duration(pa_dtype): - request.applymarker( - pytest.mark.xfail( - reason="pd.Timestamp/pd.Timedelta repr different from numpy repr", - ) - ) - super().test_astype_string(data, nullable_string_dtype) - def test_from_dtype(self, data, request): pa_dtype = data.dtype.pyarrow_dtype if pa.types.is_string(pa_dtype) or pa.types.is_decimal(pa_dtype): diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py index 8647df0e8ad96..ab3743283ea13 100644 --- a/pandas/tests/frame/methods/test_astype.py +++ b/pandas/tests/frame/methods/test_astype.py @@ -168,21 +168,21 @@ def test_astype_str(self): "d": list(map(str, d._values)), "e": list(map(str, e._values)), }, - dtype="object", + dtype="str", ) tm.assert_frame_equal(result, expected) - def test_astype_str_float(self): + def test_astype_str_float(self, using_infer_string): # see GH#11302 result = DataFrame([np.nan]).astype(str) - expected = DataFrame(["nan"], dtype="object") + expected = DataFrame([np.nan if using_infer_string else "nan"], dtype="str") tm.assert_frame_equal(result, expected) result = DataFrame([1.12345678901234567890]).astype(str) val = "1.1234567890123457" - expected = DataFrame([val], dtype="object") + expected = DataFrame([val], dtype="str") tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("dtype_class", [dict, Series]) @@ -284,7 +284,7 @@ def test_astype_duplicate_col_series_arg(self): result = df.astype(dtypes) expected = DataFrame( { - 0: Series(vals[:, 0].astype(str), dtype=object), + 0: Series(vals[:, 0].astype(str), dtype="str"), 1: vals[:, 1], 2: pd.array(vals[:, 2], dtype="Float64"), 3: vals[:, 3], @@ -647,9 +647,10 @@ def test_astype_dt64tz(self, timezone_frame): # dt64tz->dt64 deprecated timezone_frame.astype("datetime64[ns]") - def test_astype_dt64tz_to_str(self, timezone_frame): + def test_astype_dt64tz_to_str(self, timezone_frame, using_infer_string): # str formatting result = timezone_frame.astype(str) + na_value = 
np.nan if using_infer_string else "NaT" expected = DataFrame( [ [ @@ -657,7 +658,7 @@ def test_astype_dt64tz_to_str(self, timezone_frame): "2013-01-01 00:00:00-05:00", "2013-01-01 00:00:00+01:00", ], - ["2013-01-02", "NaT", "NaT"], + ["2013-01-02", na_value, na_value], [ "2013-01-03", "2013-01-03 00:00:00-05:00", @@ -665,7 +666,7 @@ def test_astype_dt64tz_to_str(self, timezone_frame): ], ], columns=timezone_frame.columns, - dtype="object", + dtype="str", ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_select_dtypes.py b/pandas/tests/frame/methods/test_select_dtypes.py index 875dca321635f..0354e9df3d168 100644 --- a/pandas/tests/frame/methods/test_select_dtypes.py +++ b/pandas/tests/frame/methods/test_select_dtypes.py @@ -99,6 +99,9 @@ def test_select_dtypes_include_using_list_like(self, using_infer_string): ei = df[["a"]] tm.assert_frame_equal(ri, ei) + ri = df.select_dtypes(include=[str]) + tm.assert_frame_equal(ri, ei) + def test_select_dtypes_exclude_using_list_like(self): df = DataFrame( { @@ -358,7 +361,7 @@ def test_select_dtypes_datetime_with_tz(self): @pytest.mark.parametrize("dtype", [str, "str", np.bytes_, "S1", np.str_, "U1"]) @pytest.mark.parametrize("arg", ["include", "exclude"]) def test_select_dtypes_str_raises(self, dtype, arg, using_infer_string): - if using_infer_string and dtype == "str": + if using_infer_string and (dtype == "str" or dtype is str): # this is tested below pytest.skip("Selecting string columns works with future strings") df = DataFrame( diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 3d46e03547c38..0a924aa393be5 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -24,7 +24,6 @@ from pandas._config import using_string_dtype from pandas._libs import lib -from pandas.compat import HAS_PYARROW from pandas.compat.numpy import np_version_gt2 from pandas.errors import IntCastingNaNError @@ -82,7 +81,7 @@ def test_constructor_from_ndarray_with_str_dtype(self): # with an array of strings each of which is e.g. 
"[0 1 2]" arr = np.arange(12).reshape(4, 3) df = DataFrame(arr, dtype=str) - expected = DataFrame(arr.astype(str), dtype=object) + expected = DataFrame(arr.astype(str), dtype="str") tm.assert_frame_equal(df, expected) def test_constructor_from_2d_datetimearray(self): @@ -300,18 +299,38 @@ def test_constructor_dtype_nocast_view_2d_array(self): df2 = DataFrame(df.values, dtype=df[0].dtype) assert df2._mgr.blocks[0].values.flags.c_contiguous - @pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="conversion copies") - def test_1d_object_array_does_not_copy(self): + def test_1d_object_array_does_not_copy(self, using_infer_string): # https://github.com/pandas-dev/pandas/issues/39272 arr = np.array(["a", "b"], dtype="object") df = DataFrame(arr, copy=False) + if using_infer_string: + if df[0].dtype.storage == "pyarrow": + # object dtype strings are converted to arrow memory, + # no numpy arrays to compare + pass + else: + assert np.shares_memory(df[0].to_numpy(), arr) + else: + assert np.shares_memory(df.values, arr) + + df = DataFrame(arr, dtype=object, copy=False) assert np.shares_memory(df.values, arr) - @pytest.mark.xfail(using_string_dtype(), reason="conversion copies") - def test_2d_object_array_does_not_copy(self): + def test_2d_object_array_does_not_copy(self, using_infer_string): # https://github.com/pandas-dev/pandas/issues/39272 arr = np.array([["a", "b"], ["c", "d"]], dtype="object") df = DataFrame(arr, copy=False) + if using_infer_string: + if df[0].dtype.storage == "pyarrow": + # object dtype strings are converted to arrow memory, + # no numpy arrays to compare + pass + else: + assert np.shares_memory(df[0].to_numpy(), arr) + else: + assert np.shares_memory(df.values, arr) + + df = DataFrame(arr, dtype=object, copy=False) assert np.shares_memory(df.values, arr) def test_constructor_dtype_list_data(self): @@ -1766,12 +1785,18 @@ def test_constructor_column_duplicates(self): tm.assert_frame_equal(idf, edf) - def test_constructor_empty_with_string_dtype(self): + def test_constructor_empty_with_string_dtype(self, using_infer_string): # GH 9428 expected = DataFrame(index=[0, 1], columns=[0, 1], dtype=object) + expected_str = DataFrame( + index=[0, 1], columns=[0, 1], dtype=pd.StringDtype(na_value=np.nan) + ) df = DataFrame(index=[0, 1], columns=[0, 1], dtype=str) - tm.assert_frame_equal(df, expected) + if using_infer_string: + tm.assert_frame_equal(df, expected_str) + else: + tm.assert_frame_equal(df, expected) df = DataFrame(index=[0, 1], columns=[0, 1], dtype=np.str_) tm.assert_frame_equal(df, expected) df = DataFrame(index=[0, 1], columns=[0, 1], dtype="U5") diff --git a/pandas/tests/indexes/datetimes/methods/test_astype.py b/pandas/tests/indexes/datetimes/methods/test_astype.py index 81dc3b3ecc45e..62be8903da206 100644 --- a/pandas/tests/indexes/datetimes/methods/test_astype.py +++ b/pandas/tests/indexes/datetimes/methods/test_astype.py @@ -101,13 +101,16 @@ def test_astype_tznaive_to_tzaware(self): # dt64->dt64tz deprecated idx._data.astype("datetime64[ns, US/Eastern]") - def test_astype_str_nat(self): + def test_astype_str_nat(self, using_infer_string): # GH 13149, GH 13209 # verify that we are returning NaT as a string (and not unicode) idx = DatetimeIndex(["2016-05-16", "NaT", NaT, np.nan]) result = idx.astype(str) - expected = Index(["2016-05-16", "NaT", "NaT", "NaT"], dtype=object) + if using_infer_string: + expected = Index(["2016-05-16", None, None, None], dtype="str") + else: + expected = Index(["2016-05-16", "NaT", "NaT", "NaT"], dtype=object) 
tm.assert_index_equal(result, expected) def test_astype_str(self): @@ -117,7 +120,7 @@ def test_astype_str(self): expected = Index( ["2012-01-01", "2012-01-02", "2012-01-03", "2012-01-04"], name="test_name", - dtype=object, + dtype="str", ) tm.assert_index_equal(result, expected) @@ -132,7 +135,7 @@ def test_astype_str_tz_and_name(self): "2012-01-03 00:00:00-05:00", ], name="test_name", - dtype=object, + dtype="str", ) tm.assert_index_equal(result, expected) @@ -143,7 +146,7 @@ def test_astype_str_freq_and_name(self): expected = Index( ["2011-01-01 00:00:00", "2011-01-01 01:00:00", "2011-01-01 02:00:00"], name="test_name", - dtype=object, + dtype="str", ) tm.assert_index_equal(result, expected) @@ -155,7 +158,7 @@ def test_astype_str_freq_and_tz(self): result = dti.astype(str) expected = Index( ["2012-03-06 00:00:00+00:00", "2012-03-06 01:00:00+00:00"], - dtype=object, + dtype="str", name="test_name", ) tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/object/test_astype.py b/pandas/tests/indexes/object/test_astype.py index 9c1ef302c5b51..ce05b5e9f2238 100644 --- a/pandas/tests/indexes/object/test_astype.py +++ b/pandas/tests/indexes/object/test_astype.py @@ -15,12 +15,12 @@ def test_astype_str_from_bytes(): # ensure_string_array which does f"{val}" idx = Index(["あ", b"a"], dtype="object") result = idx.astype(str) - expected = Index(["あ", "a"], dtype="object") + expected = Index(["あ", "a"], dtype="str") tm.assert_index_equal(result, expected) # while we're here, check that Series.astype behaves the same result = Series(idx).astype(str) - expected = Series(expected, dtype=object) + expected = Series(expected, dtype="str") tm.assert_series_equal(result, expected) diff --git a/pandas/tests/indexes/period/methods/test_astype.py b/pandas/tests/indexes/period/methods/test_astype.py index d545bfd2fae0f..af3c2667f51b4 100644 --- a/pandas/tests/indexes/period/methods/test_astype.py +++ b/pandas/tests/indexes/period/methods/test_astype.py @@ -22,7 +22,7 @@ def test_astype_raises(self, dtype): with pytest.raises(TypeError, match=msg): idx.astype(dtype) - def test_astype_conversion(self): + def test_astype_conversion(self, using_infer_string): # GH#13149, GH#13209 idx = PeriodIndex(["2016-05-16", "NaT", NaT, np.nan], freq="D", name="idx") @@ -41,7 +41,12 @@ def test_astype_conversion(self): tm.assert_index_equal(result, expected) result = idx.astype(str) - expected = Index([str(x) for x in idx], name="idx", dtype=object) + if using_infer_string: + expected = Index( + [str(x) if x is not NaT else None for x in idx], name="idx", dtype="str" + ) + else: + expected = Index([str(x) for x in idx], name="idx", dtype=object) tm.assert_index_equal(result, expected) idx = period_range("1990", "2009", freq="Y", name="idx") diff --git a/pandas/tests/indexes/timedeltas/methods/test_astype.py b/pandas/tests/indexes/timedeltas/methods/test_astype.py index 311f2b5c9aa59..5166cadae499e 100644 --- a/pandas/tests/indexes/timedeltas/methods/test_astype.py +++ b/pandas/tests/indexes/timedeltas/methods/test_astype.py @@ -44,7 +44,7 @@ def test_astype_object_with_nat(self): tm.assert_index_equal(result, expected) assert idx.tolist() == expected_list - def test_astype(self): + def test_astype(self, using_infer_string): # GH 13149, GH 13209 idx = TimedeltaIndex([1e14, "NaT", NaT, np.nan], name="idx") @@ -61,7 +61,12 @@ def test_astype(self): tm.assert_index_equal(result, expected) result = idx.astype(str) - expected = Index([str(x) for x in idx], name="idx", dtype=object) + if using_infer_string: + 
expected = Index( + [str(x) if x is not NaT else None for x in idx], name="idx", dtype="str" + ) + else: + expected = Index([str(x) for x in idx], name="idx", dtype=object) tm.assert_index_equal(result, expected) rng = timedelta_range("1 days", periods=10) diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py index 38961345dc1f2..29ce9d0c03111 100644 --- a/pandas/tests/interchange/test_impl.py +++ b/pandas/tests/interchange/test_impl.py @@ -401,6 +401,7 @@ def test_interchange_from_corrected_buffer_dtypes(monkeypatch) -> None: pd.api.interchange.from_dataframe(df) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_empty_string_column(): # https://github.com/pandas-dev/pandas/issues/56703 df = pd.DataFrame({"a": []}, dtype=str) diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index b831ec3bb2c6a..3989e022dbbd2 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -587,7 +587,7 @@ def test_reader_dtype(self, read_ext): expected["a"] = expected["a"].astype("float64") expected["b"] = expected["b"].astype("float32") - expected["c"] = Series(["001", "002", "003", "004"], dtype=object) + expected["c"] = Series(["001", "002", "003", "004"], dtype="str") tm.assert_frame_equal(actual, expected) msg = "Unable to convert column d to type int64" @@ -611,8 +611,8 @@ def test_reader_dtype(self, read_ext): { "a": Series([1, 2, 3, 4], dtype="float64"), "b": Series([2.5, 3.5, 4.5, 5.5], dtype="float32"), - "c": Series(["001", "002", "003", "004"], dtype=object), - "d": Series(["1", "2", np.nan, "4"], dtype=object), + "c": Series(["001", "002", "003", "004"], dtype="str"), + "d": Series(["1", "2", np.nan, "4"], dtype="str"), }, ), ], diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index b664423364f6b..e02562ac8d93d 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -31,7 +31,7 @@ @pytest.mark.parametrize("dtype", [str, object]) @pytest.mark.parametrize("check_orig", [True, False]) @pytest.mark.usefixtures("pyarrow_xfail") -def test_dtype_all_columns(all_parsers, dtype, check_orig): +def test_dtype_all_columns(all_parsers, dtype, check_orig, using_infer_string): # see gh-3795, gh-6607 parser = all_parsers @@ -49,8 +49,10 @@ def test_dtype_all_columns(all_parsers, dtype, check_orig): if check_orig: expected = df.copy() result = result.astype(float) - else: + elif using_infer_string and dtype is str: expected = df.astype(str) + else: + expected = df.astype(str).astype(object) tm.assert_frame_equal(result, expected) @@ -300,7 +302,6 @@ def test_true_values_cast_to_bool(all_parsers): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.usefixtures("pyarrow_xfail") @pytest.mark.parametrize("dtypes, exp_value", [({}, "1"), ({"a.1": "int64"}, 1)]) def test_dtype_mangle_dup_cols(all_parsers, dtypes, exp_value): @@ -316,7 +317,6 @@ def test_dtype_mangle_dup_cols(all_parsers, dtypes, exp_value): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.usefixtures("pyarrow_xfail") def test_dtype_mangle_dup_cols_single_dtype(all_parsers): # GH#42022 @@ -565,7 +565,7 @@ def test_string_inference(all_parsers): @pytest.mark.parametrize("dtype", ["O", object, "object", np.object_, str, np.str_]) -def 
test_string_inference_object_dtype(all_parsers, dtype): +def test_string_inference_object_dtype(all_parsers, dtype, using_infer_string): # GH#56047 data = """a,b x,a @@ -575,10 +575,11 @@ def test_string_inference_object_dtype(all_parsers, dtype): with pd.option_context("future.infer_string", True): result = parser.read_csv(StringIO(data), dtype=dtype) + expected_dtype = pd.StringDtype(na_value=np.nan) if dtype is str else object expected = DataFrame( { - "a": pd.Series(["x", "y", "z"], dtype=object), - "b": pd.Series(["a", "a", "a"], dtype=object), + "a": pd.Series(["x", "y", "z"], dtype=expected_dtype), + "b": pd.Series(["a", "a", "a"], dtype=expected_dtype), }, columns=pd.Index(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)), ) @@ -589,7 +590,7 @@ def test_string_inference_object_dtype(all_parsers, dtype): expected = DataFrame( { - "a": pd.Series(["x", "y", "z"], dtype=object), + "a": pd.Series(["x", "y", "z"], dtype=expected_dtype), "b": pd.Series(["a", "a", "a"], dtype=pd.StringDtype(na_value=np.nan)), }, columns=pd.Index(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)), diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py index b612e60c959b1..89645b526f2ee 100644 --- a/pandas/tests/io/parser/test_na_values.py +++ b/pandas/tests/io/parser/test_na_values.py @@ -667,7 +667,6 @@ def test_inf_na_values_with_int_index(all_parsers): tm.assert_frame_equal(out, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @xfail_pyarrow # mismatched shape @pytest.mark.parametrize("na_filter", [True, False]) def test_na_values_with_dtype_str_and_na_filter(all_parsers, na_filter): @@ -719,7 +718,6 @@ def test_cast_NA_to_bool_raises_error(all_parsers, data, na_values): # TODO: this test isn't about the na_values keyword, it is about the empty entries # being returned with NaN entries, whereas the pyarrow engine returns "nan" @xfail_pyarrow # mismatched shapes -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_str_nan_dropped(all_parsers): # see gh-21131 parser = all_parsers diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py index 26480010fc687..a5bb151e84f47 100644 --- a/pandas/tests/io/parser/test_python_parser_only.py +++ b/pandas/tests/io/parser/test_python_parser_only.py @@ -18,8 +18,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.errors import ( ParserError, ParserWarning, @@ -499,7 +497,6 @@ def test_header_int_do_not_infer_multiindex_names_on_different_line(python_parse tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize( "dtype", [{"a": object}, {"a": str, "b": np.int64, "c": np.int64}] ) @@ -524,10 +521,11 @@ def test_no_thousand_convert_with_dot_for_non_numeric_cols(python_parser_only, d "c": [0, 4000, 131], } ) + if dtype["a"] == object: + expected["a"] = expected["a"].astype(object) tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize( "dtype,expected", [ diff --git a/pandas/tests/series/methods/test_astype.py b/pandas/tests/series/methods/test_astype.py index 579d41f964df0..4a7e204ee4161 100644 --- a/pandas/tests/series/methods/test_astype.py +++ b/pandas/tests/series/methods/test_astype.py @@ -76,7 +76,7 @@ def test_astype_dict_like(self, dtype_class): dt1 = dtype_class({"abc": str}) result = ser.astype(dt1) - 
expected = Series(["0", "2", "4", "6", "8"], name="abc", dtype=object) + expected = Series(["0", "2", "4", "6", "8"], name="abc", dtype="str") tm.assert_series_equal(result, expected) dt2 = dtype_class({"abc": "float64"}) @@ -173,10 +173,14 @@ def test_astype_empty_constructor_equality(self, dtype): def test_astype_str_map(self, dtype, data, using_infer_string): # see GH#4405 series = Series(data) + using_string_dtype = using_infer_string and dtype is str result = series.astype(dtype) - expected = series.map(str) - if using_infer_string: - expected = expected.astype(object) + if using_string_dtype: + expected = series.map(lambda val: str(val) if val is not np.nan else np.nan) + else: + expected = series.map(str) + if using_infer_string: + expected = expected.astype(object) tm.assert_series_equal(result, expected) def test_astype_float_to_period(self): @@ -213,7 +217,7 @@ def test_astype_dt64_to_str(self): # GH#10442 : testing astype(str) is correct for Series/DatetimeIndex dti = date_range("2012-01-01", periods=3) result = Series(dti).astype(str) - expected = Series(["2012-01-01", "2012-01-02", "2012-01-03"], dtype=object) + expected = Series(["2012-01-01", "2012-01-02", "2012-01-03"], dtype="str") tm.assert_series_equal(result, expected) def test_astype_dt64tz_to_str(self): @@ -226,7 +230,7 @@ def test_astype_dt64tz_to_str(self): "2012-01-02 00:00:00-05:00", "2012-01-03 00:00:00-05:00", ], - dtype=object, + dtype="str", ) tm.assert_series_equal(result, expected) @@ -286,13 +290,13 @@ def test_astype_str_cast_dt64(self): ts = Series([Timestamp("2010-01-04 00:00:00")]) res = ts.astype(str) - expected = Series(["2010-01-04"], dtype=object) + expected = Series(["2010-01-04"], dtype="str") tm.assert_series_equal(res, expected) ts = Series([Timestamp("2010-01-04 00:00:00", tz="US/Eastern")]) res = ts.astype(str) - expected = Series(["2010-01-04 00:00:00-05:00"], dtype=object) + expected = Series(["2010-01-04 00:00:00-05:00"], dtype="str") tm.assert_series_equal(res, expected) def test_astype_str_cast_td64(self): @@ -301,7 +305,7 @@ def test_astype_str_cast_td64(self): td = Series([Timedelta(1, unit="D")]) ser = td.astype(str) - expected = Series(["1 days"], dtype=object) + expected = Series(["1 days"], dtype="str") tm.assert_series_equal(ser, expected) def test_dt64_series_astype_object(self): @@ -347,7 +351,7 @@ def test_astype_from_float_to_str(self, any_float_dtype): # https://github.com/pandas-dev/pandas/issues/36451 ser = Series([0.1], dtype=any_float_dtype) result = ser.astype(str) - expected = Series(["0.1"], dtype=object) + expected = Series(["0.1"], dtype="str") tm.assert_series_equal(result, expected) @pytest.mark.parametrize( @@ -358,11 +362,13 @@ def test_astype_from_float_to_str(self, any_float_dtype): (NA, ""), ], ) - def test_astype_to_str_preserves_na(self, value, string_value): + def test_astype_to_str_preserves_na(self, value, string_value, using_infer_string): # https://github.com/pandas-dev/pandas/issues/36904 ser = Series(["a", "b", value], dtype=object) result = ser.astype(str) - expected = Series(["a", "b", string_value], dtype=object) + expected = Series( + ["a", "b", None if using_infer_string else string_value], dtype="str" + ) tm.assert_series_equal(result, expected) @pytest.mark.parametrize("dtype", ["float32", "float64", "int64", "int32"]) diff --git a/pandas/tests/series/methods/test_map.py b/pandas/tests/series/methods/test_map.py index fe84ffafa70b4..7fa8686fcc6c8 100644 --- a/pandas/tests/series/methods/test_map.py +++ b/pandas/tests/series/methods/test_map.py 
@@ -549,13 +549,11 @@ def f(x): (list(range(3)), {0: 42}, [42] + [np.nan] * 3), ], ) -def test_map_missing_mixed(vals, mapping, exp, using_infer_string): +def test_map_missing_mixed(vals, mapping, exp): # GH20495 s = Series(vals + [np.nan]) result = s.map(mapping) exp = Series(exp) - if using_infer_string and mapping == {np.nan: "not NaN"}: - exp.iloc[-1] = np.nan tm.assert_series_equal(result, exp) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 1771a4dfdb71f..69f42b5e42878 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -229,7 +229,7 @@ def test_constructor_empty(self, input_class, using_infer_string): # GH 19853 : with empty string, index and dtype str empty = Series("", dtype=str, index=range(3)) if using_infer_string: - empty2 = Series("", index=range(3), dtype=object) + empty2 = Series("", index=range(3), dtype="str") else: empty2 = Series("", index=range(3)) tm.assert_series_equal(empty, empty2) diff --git a/pandas/tests/strings/test_get_dummies.py b/pandas/tests/strings/test_get_dummies.py index 0656f505dc745..3b989e284ca25 100644 --- a/pandas/tests/strings/test_get_dummies.py +++ b/pandas/tests/strings/test_get_dummies.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas.util._test_decorators as td from pandas import ( @@ -96,6 +98,7 @@ def test_get_dummies_with_pyarrow_dtype(any_string_dtype, dtype): # GH#47872 +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_get_dummies_with_str_dtype(any_string_dtype): s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype) result = s.str.get_dummies("|", dtype=str) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 06fd81ed722d9..dac74a0e32a42 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1877,13 +1877,16 @@ def test_strobj_mode(self): tm.assert_series_equal(ser.mode(), exp) @pytest.mark.parametrize("dt", [str, object]) - def test_strobj_multi_char(self, dt): + def test_strobj_multi_char(self, dt, using_infer_string): exp = ["bar"] data = ["foo"] * 2 + ["bar"] * 3 ser = Series(data, dtype=dt) exp = Series(exp, dtype=dt) - tm.assert_numpy_array_equal(algos.mode(ser.values), exp.values) + if using_infer_string and dt is str: + tm.assert_extension_array_equal(algos.mode(ser.values), exp.values) + else: + tm.assert_numpy_array_equal(algos.mode(ser.values), exp.values) tm.assert_series_equal(ser.mode(), exp) def test_datelike_mode(self): From 0962007726634e55f75150db82aadb754bea9752 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Wed, 25 Sep 2024 23:29:16 +0530 Subject: [PATCH 066/224] DOC: fix SA01 for pandas.api.types.is_interval_dtype (#59863) --- ci/code_checks.sh | 1 - pandas/core/dtypes/common.py | 9 +++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 7cc314007aabd..a436acd01013b 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -115,7 +115,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.api.types.is_float PR01,SA01" \ -i "pandas.api.types.is_hashable PR01,RT03,SA01" \ -i "pandas.api.types.is_integer PR01,SA01" \ - -i "pandas.api.types.is_interval_dtype SA01" \ -i "pandas.api.types.is_iterator PR07,SA01" \ -i "pandas.api.types.is_list_like SA01" \ -i "pandas.api.types.is_named_tuple PR07,SA01" \ diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 
1a38bb03b2c1c..1093b35afa8a0 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -481,6 +481,15 @@ def is_interval_dtype(arr_or_dtype) -> bool: boolean Whether or not the array-like or dtype is of the Interval dtype. + See Also + -------- + api.types.is_object_dtype : Check whether an array-like or dtype is of the + object dtype. + api.types.is_numeric_dtype : Check whether the provided array or dtype is + of a numeric dtype. + api.types.is_categorical_dtype : Check whether an array-like or dtype is of + the Categorical dtype. + Examples -------- >>> from pandas.core.dtypes.common import is_interval_dtype From ffb3c1523747738369bd27d5cdb924ee6884100d Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Wed, 25 Sep 2024 23:30:08 +0530 Subject: [PATCH 067/224] DOC: fix SA01 for pandas.api.types.is_list_like (#59864) --- ci/code_checks.sh | 1 - pandas/_libs/lib.pyx | 6 ++++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index a436acd01013b..dd1b441b51772 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -116,7 +116,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.api.types.is_hashable PR01,RT03,SA01" \ -i "pandas.api.types.is_integer PR01,SA01" \ -i "pandas.api.types.is_iterator PR07,SA01" \ - -i "pandas.api.types.is_list_like SA01" \ -i "pandas.api.types.is_named_tuple PR07,SA01" \ -i "pandas.api.types.is_object_dtype SA01" \ -i "pandas.api.types.is_re PR07,SA01" \ diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 8af48a861967a..de7d9af731010 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1220,6 +1220,12 @@ def is_list_like(obj: object, allow_sets: bool = True) -> bool: bool Whether `obj` has list-like properties. + See Also + -------- + Series : One-dimensional ndarray with axis labels (including time series). + Index : Immutable sequence used for indexing and alignment. + numpy.ndarray : Array object from NumPy, which is considered list-like. 
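
For a concrete feel for the predicate documented above, a minimal doctest-style sketch (an editorial illustration, not part of the patch; the outputs follow the documented semantics, in particular that strings are deliberately not treated as list-like):

>>> from pandas.api.types import is_list_like
>>> is_list_like([1, 2, 3])
True
>>> is_list_like("foo")
False
>>> is_list_like({1, 2}, allow_sets=False)
False
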
+
     Examples
     --------
     >>> import datetime

From 4b22453651cb71684ce1f56aa67ff6fc451af053 Mon Sep 17 00:00:00 2001
From: musvaage <112724366+musvaage@users.noreply.github.com>
Date: Wed, 25 Sep 2024 20:16:32 +0200
Subject: [PATCH 068/224] typo (#59852)

---
 pandas/io/pytables.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
index be7b8dc6640ba..618254fee9259 100644
--- a/pandas/io/pytables.py
+++ b/pandas/io/pytables.py
@@ -3580,7 +3580,7 @@ def is_transposed(self) -> bool:
 
     @property
     def data_orientation(self) -> tuple[int, ...]:
-        """return a tuple of my permutated axes, non_indexable at the front"""
+        """return a tuple of my permuted axes, non_indexable at the front"""
         return tuple(
             itertools.chain(
                 [int(a[0]) for a in self.non_index_axes],

From 7543426cdf2728635e92b59585203963035ae536 Mon Sep 17 00:00:00 2001
From: Vibavari Gurunathan
Date: Wed, 25 Sep 2024 11:17:57 -0700
Subject: [PATCH 069/224] BUG: Fix from_records() column reorder issue, if
 columns!=None use passed param (#59717) (#59809)

* BUG: Fix columns param reorder issue - if columns!=None, use passed param (#59717)

* Add tests for to_arrays()

* Fix import order with isort

* fix sort

* Update datatype to int32

* Fis test

* Revert commit

* Add test for DaaFrame.from_records()

* Apply comments

* Delete test_to_arrays.py

---
 doc/source/whatsnew/v3.0.0.rst                |  1 +
 pandas/core/internals/construction.py         |  3 ++-
 .../frame/constructors/test_from_records.py   | 23 +++++++++++++++++++
 3 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
index 3b5183c43bcd0..516a5d938fb18 100644
--- a/doc/source/whatsnew/v3.0.0.rst
+++ b/doc/source/whatsnew/v3.0.0.rst
@@ -619,6 +619,7 @@ I/O
 ^^^
 - Bug in :class:`DataFrame` and :class:`Series` ``repr`` of :py:class:`collections.abc.Mapping`` elements. (:issue:`57915`)
 - Bug in :meth:`.DataFrame.to_json` when ``"index"`` was a value in the :attr:`DataFrame.column` and :attr:`Index.name` was ``None``. Now, this will fail with a ``ValueError`` (:issue:`58925`)
+- Bug in :meth:`DataFrame.from_records` where ``columns`` parameter with numpy structured array was not reordering and filtering out the columns (:issue:`59717`)
 - Bug in :meth:`DataFrame.to_dict` raises unnecessary ``UserWarning`` when columns are not unique and ``orient='tight'``. (:issue:`58281`)
 - Bug in :meth:`DataFrame.to_excel` when writing empty :class:`DataFrame` with :class:`MultiIndex` on both axes (:issue:`57696`)
 - Bug in :meth:`DataFrame.to_stata` when writing :class:`DataFrame` and ``byteorder=`big```. (:issue:`58969`)
diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py
index 07465e7b87fcd..959e572b2b35b 100644
--- a/pandas/core/internals/construction.py
+++ b/pandas/core/internals/construction.py
@@ -750,7 +750,8 @@ def to_arrays(
     elif isinstance(data, np.ndarray) and data.dtype.names is not None:
         # e.g.
recarray - columns = Index(list(data.dtype.names)) + if columns is None: + columns = Index(data.dtype.names) arrays = [data[k] for k in columns] return arrays, columns diff --git a/pandas/tests/frame/constructors/test_from_records.py b/pandas/tests/frame/constructors/test_from_records.py index abc3aab1c1492..1d4a2c0075e3e 100644 --- a/pandas/tests/frame/constructors/test_from_records.py +++ b/pandas/tests/frame/constructors/test_from_records.py @@ -469,3 +469,26 @@ def test_from_records_empty2(self): alt = DataFrame(arr) tm.assert_frame_equal(alt, expected) + + def test_from_records_structured_array(self): + # GH 59717 + data = np.array( + [ + ("John", 25, "New York", 50000), + ("Jane", 30, "San Francisco", 75000), + ("Bob", 35, "Chicago", 65000), + ("Alice", 28, "Los Angeles", 60000), + ], + dtype=[("name", "U10"), ("age", "i4"), ("city", "U15"), ("salary", "i4")], + ) + + actual_result = DataFrame.from_records(data, columns=["name", "salary", "city"]) + + modified_data = { + "name": ["John", "Jane", "Bob", "Alice"], + "salary": np.array([50000, 75000, 65000, 60000], dtype="int32"), + "city": ["New York", "San Francisco", "Chicago", "Los Angeles"], + } + expected_result = DataFrame(modified_data) + + tm.assert_frame_equal(actual_result, expected_result) From e38409c304f8da88efd7cf074819a1cf7d12be31 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Wed, 25 Sep 2024 23:49:00 +0530 Subject: [PATCH 070/224] DOC: fix SA01 for pandas.arrays.BooleanArray (#59866) --- ci/code_checks.sh | 1 - pandas/core/arrays/boolean.py | 7 +++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index dd1b441b51772..40582f3069e97 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -122,7 +122,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.api.types.is_re_compilable PR07,SA01" \ -i "pandas.api.types.pandas_dtype PR07,RT03,SA01" \ -i "pandas.arrays.ArrowExtensionArray PR07,SA01" \ - -i "pandas.arrays.BooleanArray SA01" \ -i "pandas.arrays.DatetimeArray SA01" \ -i "pandas.arrays.IntegerArray SA01" \ -i "pandas.arrays.IntervalArray.left SA01" \ diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index 74c0cd7719c13..53ebc35b68d14 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -286,6 +286,13 @@ class BooleanArray(BaseMaskedArray): ------- BooleanArray + See Also + -------- + array : Create an array from data with the appropriate dtype. + BooleanDtype : Extension dtype for boolean data. + Series : One-dimensional ndarray with axis labels (including time series). + DataFrame : Two-dimensional, size-mutable, potentially heterogeneous tabular data. 
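
To illustrate the fix in PATCH 069 above, a minimal doctest-style sketch distilled from its new test (an editorial example; the output shown is what the patched ``from_records`` should produce, with ``columns`` now both filtering and reordering the structured array's fields):

>>> import numpy as np
>>> import pandas as pd
>>> data = np.array(
...     [("John", 25), ("Jane", 30)],
...     dtype=[("name", "U10"), ("age", "i4")],
... )
>>> pd.DataFrame.from_records(data, columns=["age", "name"])
   age  name
0   25  John
1   30  Jane
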
+ Examples -------- Create an BooleanArray with :func:`pandas.array`: From f049159d8245959bf313e05d1109ed33f778a077 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Wed, 25 Sep 2024 23:49:56 +0530 Subject: [PATCH 071/224] DOC: fix SA01, ES01 for pandas.api.types.is_object_dtype (#59865) --- ci/code_checks.sh | 1 - pandas/core/dtypes/common.py | 14 ++++++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 40582f3069e97..4eb9d4055e1f8 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -117,7 +117,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.api.types.is_integer PR01,SA01" \ -i "pandas.api.types.is_iterator PR07,SA01" \ -i "pandas.api.types.is_named_tuple PR07,SA01" \ - -i "pandas.api.types.is_object_dtype SA01" \ -i "pandas.api.types.is_re PR07,SA01" \ -i "pandas.api.types.is_re_compilable PR07,SA01" \ -i "pandas.api.types.pandas_dtype PR07,RT03,SA01" \ diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 1093b35afa8a0..98c770ec4a8b0 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -141,6 +141,11 @@ def is_object_dtype(arr_or_dtype) -> bool: """ Check whether an array-like or dtype is of the object dtype. + This method examines the input to determine if it is of the + object data type. Object dtype is a generic data type that can + hold any Python objects, including strings, lists, and custom + objects. + Parameters ---------- arr_or_dtype : array-like or dtype @@ -151,6 +156,15 @@ def is_object_dtype(arr_or_dtype) -> bool: boolean Whether or not the array-like or dtype is of the object dtype. + See Also + -------- + api.types.is_numeric_dtype : Check whether the provided array or dtype is of a + numeric dtype. + api.types.is_string_dtype : Check whether the provided array or dtype is of + the string dtype. + api.types.is_bool_dtype : Check whether the provided array or dtype is of a + boolean dtype. + Examples -------- >>> from pandas.api.types import is_object_dtype From e221fa48a5d5e61f9adc830ed33562548bea9dd4 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Wed, 25 Sep 2024 23:51:58 +0530 Subject: [PATCH 072/224] DOC: fix RT03 for pandas.date_range (#59868) --- ci/code_checks.sh | 1 - pandas/core/indexes/datetimes.py | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 4eb9d4055e1f8..72e12effb1104 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -170,7 +170,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.core.resample.Resampler.sum SA01" \ -i "pandas.core.resample.Resampler.transform PR01,RT03,SA01" \ -i "pandas.core.resample.Resampler.var SA01" \ - -i "pandas.date_range RT03" \ -i "pandas.errors.AttributeConflictWarning SA01" \ -i "pandas.errors.CSSWarning SA01" \ -i "pandas.errors.CategoricalConversionWarning SA01" \ diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 3b3cda8f7cd33..536f22d38468d 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -875,6 +875,7 @@ def date_range( Returns ------- DatetimeIndex + A DatetimeIndex object of the generated dates. 
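
For reference, the return value being documented in PATCH 072 looks like the following in practice (a standard illustrative example, not taken from the patch):

>>> import pandas as pd
>>> pd.date_range("2024-01-01", periods=3)
DatetimeIndex(['2024-01-01', '2024-01-02', '2024-01-03'], dtype='datetime64[ns]', freq='D')
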
See Also -------- From cf79ac87545744d7c7af7e49b443b2ed0b3ed047 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Wed, 25 Sep 2024 23:52:33 +0530 Subject: [PATCH 073/224] DOC: fix RT03, ES01 for pandas.core.resample.Resampler.ffill (#59871) --- ci/code_checks.sh | 1 - pandas/core/resample.py | 8 +++++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 72e12effb1104..49702dce0e258 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -155,7 +155,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.core.groupby.SeriesGroupBy.plot PR02" \ -i "pandas.core.groupby.SeriesGroupBy.sem SA01" \ -i "pandas.core.resample.Resampler.__iter__ RT03,SA01" \ - -i "pandas.core.resample.Resampler.ffill RT03" \ -i "pandas.core.resample.Resampler.get_group RT03,SA01" \ -i "pandas.core.resample.Resampler.groups SA01" \ -i "pandas.core.resample.Resampler.indices SA01" \ diff --git a/pandas/core/resample.py b/pandas/core/resample.py index b621fcf9a6415..711396096a5e3 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -529,6 +529,11 @@ def ffill(self, limit: int | None = None): """ Forward fill the values. + This method fills missing values by propagating the last valid + observation forward, up to the next valid observation. It is commonly + used in time series analysis when resampling data to a higher frequency + (upsampling) and filling gaps in the resampled output. + Parameters ---------- limit : int, optional @@ -536,7 +541,8 @@ def ffill(self, limit: int | None = None): Returns ------- - An upsampled Series. + Series + The resampled data with missing values filled forward. See Also -------- From 1ddf028c9469a9d6264171c4c79ef1691fe2c680 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Thu, 26 Sep 2024 00:08:18 +0530 Subject: [PATCH 074/224] DOC: fix SA01, ES01 for pandas.arrays.IntervalArray.mid (#59867) * DOC: fix SA01, ES01 for pandas.arrays.IntervalArray.mid * DOC: add double backticks for sphinx compatibility Co-authored-by: mroeschke --------- Co-authored-by: mroeschke --- ci/code_checks.sh | 1 - pandas/core/arrays/interval.py | 10 ++++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 49702dce0e258..3dfd5a3931ecd 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -125,7 +125,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.arrays.IntegerArray SA01" \ -i "pandas.arrays.IntervalArray.left SA01" \ -i "pandas.arrays.IntervalArray.length SA01" \ - -i "pandas.arrays.IntervalArray.mid SA01" \ -i "pandas.arrays.IntervalArray.right SA01" \ -i "pandas.arrays.NumpyExtensionArray SA01" \ -i "pandas.arrays.SparseArray PR07,SA01" \ diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 52d64162358c8..2ac9c77bef322 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -1291,6 +1291,16 @@ def mid(self) -> Index: """ Return the midpoint of each Interval in the IntervalArray as an Index. + The midpoint of an interval is calculated as the average of its + ``left`` and ``right`` bounds. This property returns a ``pandas.Index`` object + containing the midpoint for each interval. + + See Also + -------- + Interval.left : Return left bound for the interval. + Interval.right : Return right bound for the interval. + Interval.length : Return the length of each interval. 
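
A short doctest-style sketch of the ``mid`` semantics documented above (illustrative only; the midpoint is the average of each interval's left and right bounds):

>>> import pandas as pd
>>> arr = pd.arrays.IntervalArray.from_breaks([0, 1, 2])
>>> arr.mid
Index([0.5, 1.5], dtype='float64')
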
+ Examples -------- From 22055e4d3d42c297b1c86306d77f7a27fad8dcf8 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Thu, 26 Sep 2024 00:08:59 +0530 Subject: [PATCH 075/224] DOC: fix SA01, ES01 for pandas.RangeIndex.step (#59857) * DOC: fix SA01, ES01 for pandas.RangeIndex.step * DOC: add double backticks for sphinx compatibility Co-authored-by: mroeschke --------- Co-authored-by: mroeschke --- ci/code_checks.sh | 1 - pandas/core/indexes/range.py | 9 +++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 3dfd5a3931ecd..01486f0e3f926 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -73,7 +73,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Period.freq GL08" \ -i "pandas.Period.ordinal GL08" \ -i "pandas.RangeIndex.from_range PR01,SA01" \ - -i "pandas.RangeIndex.step SA01" \ -i "pandas.Series.cat.add_categories PR01,PR02" \ -i "pandas.Series.cat.as_ordered PR01" \ -i "pandas.Series.cat.as_unordered PR01" \ diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 75d0dfbeb6f01..dc96d1c11db74 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -351,6 +351,15 @@ def step(self) -> int: """ The value of the `step` parameter (``1`` if this was not supplied). + The ``step`` parameter determines the increment (or decrement in the case + of negative values) between consecutive elements in the ``RangeIndex``. + + See Also + -------- + RangeIndex : Immutable index implementing a range-based index. + RangeIndex.stop : Returns the stop value of the RangeIndex. + RangeIndex.start : Returns the start value of the RangeIndex. + Examples -------- >>> idx = pd.RangeIndex(5) From efbc29666d820cf62854556cdeadf044b489de4c Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Thu, 26 Sep 2024 00:09:46 +0530 Subject: [PATCH 076/224] DOC: fix SA01, ES01 for pandas.Timedelta.to_timedelta64 (#59860) * DOC: fix SA01, ES01 for pandas.Timedelta.to_timedelta64 * DOC: add double backticks for sphinx compatibility Co-authored-by: mroeschke --------- Co-authored-by: mroeschke --- ci/code_checks.sh | 1 - pandas/_libs/tslibs/timedeltas.pyx | 12 ++++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 01486f0e3f926..20e75f0f6f616 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -101,7 +101,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Timedelta.max PR02" \ -i "pandas.Timedelta.min PR02" \ -i "pandas.Timedelta.resolution PR02" \ - -i "pandas.Timedelta.to_timedelta64 SA01" \ -i "pandas.TimedeltaIndex.to_pytimedelta RT03,SA01" \ -i "pandas.Timestamp.max PR02" \ -i "pandas.Timestamp.min PR02" \ diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index 0ff5c5fb81df8..84ca48c96459f 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -1411,6 +1411,18 @@ cdef class _Timedelta(timedelta): """ Return a numpy.timedelta64 object with 'ns' precision. + Since NumPy uses ``timedelta64`` objects for its time operations, converting + a pandas ``Timedelta`` into a NumPy ``timedelta64`` provides seamless + integration between the two libraries, especially when working in environments + that heavily rely on NumPy for array-based calculations. + + See Also + -------- + to_timedelta : Convert argument to timedelta. + numpy.timedelta64 : A NumPy object for time duration. 
+ Timedelta : Represents a duration, the difference between two dates + or times. + Examples -------- >>> td = pd.Timedelta('3D') From c5cfe5d32c7fef4d42e1b22e188a438b5607b804 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Thu, 26 Sep 2024 00:12:43 +0530 Subject: [PATCH 077/224] DOC: fix SA01, ES01 for pandas.errors.EmptyDataError (#59872) --- ci/code_checks.sh | 1 - pandas/errors/__init__.py | 11 +++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 20e75f0f6f616..f662b4781e84b 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -173,7 +173,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.errors.ClosedFileError SA01" \ -i "pandas.errors.DataError SA01" \ -i "pandas.errors.DuplicateLabelError SA01" \ - -i "pandas.errors.EmptyDataError SA01" \ -i "pandas.errors.IntCastingNaNError SA01" \ -i "pandas.errors.InvalidIndexError SA01" \ -i "pandas.errors.InvalidVersion SA01" \ diff --git a/pandas/errors/__init__.py b/pandas/errors/__init__.py index 7851bc90c5782..b9ceae341afd3 100644 --- a/pandas/errors/__init__.py +++ b/pandas/errors/__init__.py @@ -205,6 +205,17 @@ class EmptyDataError(ValueError): """ Exception raised in ``pd.read_csv`` when empty data or header is encountered. + This error is typically encountered when attempting to read an empty file or + an invalid file where no data or headers are present. + + See Also + -------- + read_csv : Read a comma-separated values (CSV) file into DataFrame. + errors.ParserError : Exception that is raised by an error encountered in parsing + file contents. + errors.DtypeWarning : Warning raised when reading different dtypes in a column + from a file. + Examples -------- >>> from io import StringIO From 7e5282f5f125406cff7fdf80b452e114adfa4c26 Mon Sep 17 00:00:00 2001 From: Jonathan Marriott <34217286+JonathanMarriott@users.noreply.github.com> Date: Wed, 25 Sep 2024 20:14:49 +0100 Subject: [PATCH 078/224] DOC: Fix inconsistent and incomplete documentation of `pandas.eval` (#59855) * Improve content and organisation of eval documentation * Link to pd.eval in pd.DataFrame.query * Correct name for `//` is floor division * Include arctan2 Co-authored-by: Xiao Yuan --------- Co-authored-by: Xiao Yuan --- pandas/core/computation/eval.py | 37 +++++++++++++++++++++++++-------- pandas/core/frame.py | 33 +++++++++++++++-------------- 2 files changed, 45 insertions(+), 25 deletions(-) diff --git a/pandas/core/computation/eval.py b/pandas/core/computation/eval.py index 485c7f87d6f33..4ccfbd71d9ce8 100644 --- a/pandas/core/computation/eval.py +++ b/pandas/core/computation/eval.py @@ -188,15 +188,6 @@ def eval( """ Evaluate a Python expression as a string using various backends. - The following arithmetic operations are supported: ``+``, ``-``, ``*``, - ``/``, ``**``, ``%``, ``//`` (python engine only) along with the following - boolean operations: ``|`` (or), ``&`` (and), and ``~`` (not). - Additionally, the ``'pandas'`` parser allows the use of :keyword:`and`, - :keyword:`or`, and :keyword:`not` with the same semantics as the - corresponding bitwise operators. :class:`~pandas.Series` and - :class:`~pandas.DataFrame` objects are supported and behave as they would - with plain ol' Python evaluation. - .. warning:: ``eval`` can run arbitrary code which can make you vulnerable to code @@ -210,6 +201,34 @@ def eval( `__, only Python `expressions `__. 
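
To ground the supported-operations list added just below, a minimal illustrative sketch of :func:`pandas.eval` itself (an editorial example, not part of the patch; the local ``df`` is resolved from the calling scope):

>>> import pandas as pd
>>> df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
>>> pd.eval("df.a + df.b")
0    4
1    6
dtype: int64
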
+
+        By default, with the numexpr engine, the following operations are supported:
+
+        - Arithmetic operations: ``+``, ``-``, ``*``, ``/``, ``**``, ``%``
+        - Boolean operations: ``|`` (or), ``&`` (and), and ``~`` (not)
+        - Comparison operators: ``<``, ``<=``, ``==``, ``!=``, ``>=``, ``>``
+
+        Furthermore, the following mathematical functions are supported:
+
+        - Trigonometric: ``sin``, ``cos``, ``tan``, ``arcsin``, ``arccos``, \
+          ``arctan``, ``arctan2``, ``sinh``, ``cosh``, ``tanh``, ``arcsinh``, \
+          ``arccosh`` and ``arctanh``
+        - Logarithms: ``log`` natural, ``log10`` base 10, ``log1p`` log(1+x)
+        - Absolute Value ``abs``
+        - Square root ``sqrt``
+        - Exponential ``exp`` and Exponential minus one ``expm1``
+
+        See the numexpr engine `documentation
+        `__
+        for further function support details.
+
+        Using the ``'python'`` engine allows the use of native Python operators
+        such as floor division ``//``, in addition to built-in and user-defined
+        Python functions.
+
+        Additionally, the ``'pandas'`` parser allows the use of :keyword:`and`,
+        :keyword:`or`, and :keyword:`not` with the same semantics as the
+        corresponding bitwise operators.
     parser : {'pandas', 'python'}, default 'pandas'
         The parser to use to construct the syntax tree from the
         expression. The default of ``'pandas'`` parses code slightly
         different than standard Python.
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index c80e9dfd23ba2..4c56948a48eb2 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -4479,20 +4479,11 @@ def query(self, expr: str, *, inplace: bool = False, **kwargs) -> DataFrame | No
         expr : str
             The query string to evaluate.
 
-            You can refer to variables
-            in the environment by prefixing them with an '@' character like
-            ``@a + b``.
-
-            You can refer to column names that are not valid Python variable names
-            by surrounding them in backticks. Thus, column names containing spaces
-            or punctuation (besides underscores) or starting with digits must be
-            surrounded by backticks. (For example, a column named "Area (cm^2)" would
-            be referenced as ```Area (cm^2)```). Column names which are Python keywords
-            (like "if", "for", "import", etc) cannot be used.
-
-            For example, if one of your columns is called ``a a`` and you want
-            to sum it with ``b``, your query should be ```a a` + b``.
+            See the documentation for :func:`eval` for details of
+            supported operations and functions in the query string.
 
+            See the documentation for :meth:`DataFrame.eval` for details on
+            referring to column names and variables in the query string.
         inplace : bool
             Whether to modify the DataFrame rather than creating a new one.
         **kwargs
@@ -4651,8 +4642,18 @@ def eval(self, expr: str, *, inplace: bool = False, **kwargs) -> Any | None:
             in the environment by prefixing them with an '@' character like
             ``@a + b``.
 
-            You can refer to column names that are not valid Python variable
-            names by surrounding them with backticks `````.
+            You can refer to column names that are not valid Python variable names
+            by surrounding them in backticks. Thus, column names containing spaces
+            or punctuation (besides underscores) or starting with digits must be
+            surrounded by backticks. (For example, a column named "Area (cm^2)" would
+            be referenced as ```Area (cm^2)```). Column names which are Python keywords
+            (like "if", "for", "import", etc) cannot be used.
+
+            For example, if one of your columns is called ``a a`` and you want
+            to sum it with ``b``, your query should be ```a a` + b``.
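
The backtick rule restated above can be exercised as follows (an illustrative sketch; output assumes default options):

>>> import pandas as pd
>>> df = pd.DataFrame({"a a": [1, 2], "b": [3, 4]})
>>> df.query("`a a` + b > 4")
   a a  b
1    2  4
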
+ + See the documentation for :func:`eval` for full details of + supported operations and functions in the expression string. inplace : bool, default False If the expression contains an assignment, whether to perform the operation inplace and mutate the existing DataFrame. Otherwise, @@ -4660,7 +4661,7 @@ def eval(self, expr: str, *, inplace: bool = False, **kwargs) -> Any | None: **kwargs See the documentation for :func:`eval` for complete details on the keyword arguments accepted by - :meth:`~pandas.DataFrame.query`. + :meth:`~pandas.DataFrame.eval`. Returns ------- From c8a67401932c773ace0f62660f09b5684f39a148 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 25 Sep 2024 21:16:04 +0200 Subject: [PATCH 079/224] String dtype: allow string dtype for non-raw apply with numba engine (#59854) * String dtype: allow string dtype for non-raw apply with numba engine * remove xfails * clean-up --- pandas/core/_numba/extensions.py | 3 ++- pandas/core/apply.py | 5 ----- pandas/tests/apply/test_frame_apply.py | 1 - pandas/tests/apply/test_numba.py | 4 ---- 4 files changed, 2 insertions(+), 11 deletions(-) diff --git a/pandas/core/_numba/extensions.py b/pandas/core/_numba/extensions.py index e6f0427de2a3a..413fdafc7fd04 100644 --- a/pandas/core/_numba/extensions.py +++ b/pandas/core/_numba/extensions.py @@ -53,7 +53,8 @@ @contextmanager def set_numba_data(index: Index): numba_data = index._data - if numba_data.dtype == object: + if numba_data.dtype in (object, "string"): + numba_data = np.asarray(numba_data) if not lib.is_string_array(numba_data): raise ValueError( "The numba engine only supports using string or numeric column names" diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 5959156d11123..7d50b466f5126 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -1172,12 +1172,7 @@ def apply_with_numba(self) -> dict[int, Any]: from pandas.core._numba.extensions import set_numba_data index = self.obj.index - if index.dtype == "string": - index = index.astype(object) - columns = self.obj.columns - if columns.dtype == "string": - columns = columns.astype(object) # Convert from numba dict to regular dict # Our isinstance checks in the df constructor don't pass for numbas typed dict diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py index 3be3562d23cd6..dee0efcd8fd15 100644 --- a/pandas/tests/apply/test_frame_apply.py +++ b/pandas/tests/apply/test_frame_apply.py @@ -65,7 +65,6 @@ def test_apply(float_frame, engine, request): assert result.index is float_frame.index -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("axis", [0, 1]) @pytest.mark.parametrize("raw", [True, False]) @pytest.mark.parametrize("nopython", [True, False]) diff --git a/pandas/tests/apply/test_numba.py b/pandas/tests/apply/test_numba.py index 825d295043e69..d6cd9c321ace6 100644 --- a/pandas/tests/apply/test_numba.py +++ b/pandas/tests/apply/test_numba.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas.util._test_decorators as td import pandas as pd @@ -20,7 +18,6 @@ def apply_axis(request): return request.param -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_numba_vs_python_noop(float_frame, apply_axis): func = lambda x: x result = float_frame.apply(func, engine="numba", axis=apply_axis) @@ -43,7 +40,6 @@ def test_numba_vs_python_string_index(): ) -@pytest.mark.xfail(using_string_dtype(), 
reason="TODO(infer_string)") def test_numba_vs_python_indexing(): frame = DataFrame( {"a": [1, 2, 3], "b": [4, 5, 6], "c": [7.0, 8.0, 9.0]}, From b87bf854519466182b43f9f7d5b6c9d91be87ad0 Mon Sep 17 00:00:00 2001 From: Naresh Kumar Date: Wed, 25 Sep 2024 12:18:03 -0700 Subject: [PATCH 080/224] ENH: Add kwargs to Series.map (#59843) Co-authored-by: Naresh Kumar --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/series.py | 9 +++++++++ pandas/tests/series/methods/test_map.py | 7 +++++++ 3 files changed, 17 insertions(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 516a5d938fb18..41ba80989a0ce 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -54,6 +54,7 @@ Other enhancements - :meth:`Series.cummin` and :meth:`Series.cummax` now supports :class:`CategoricalDtype` (:issue:`52335`) - :meth:`Series.plot` now correctly handle the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`) - :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`) +- :meth:`Series.map` can now accept kwargs to pass on to func (:issue:`59814`) - :meth:`pandas.concat` will raise a ``ValueError`` when ``ignore_index=True`` and ``keys`` is not ``None`` (:issue:`59274`) - :meth:`str.get_dummies` now accepts a ``dtype`` parameter to specify the dtype of the resulting DataFrame (:issue:`47872`) - Multiplying two :class:`DateOffset` objects will now raise a ``TypeError`` instead of a ``RecursionError`` (:issue:`59442`) diff --git a/pandas/core/series.py b/pandas/core/series.py index 0c26ce27c680c..bbcb6615aeefd 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -11,6 +11,7 @@ Mapping, Sequence, ) +import functools import operator import sys from textwrap import dedent @@ -4312,6 +4313,7 @@ def map( self, arg: Callable | Mapping | Series, na_action: Literal["ignore"] | None = None, + **kwargs, ) -> Series: """ Map values of Series according to an input mapping or function. @@ -4327,6 +4329,11 @@ def map( na_action : {None, 'ignore'}, default None If 'ignore', propagate NaN values, without passing them to the mapping correspondence. + **kwargs + Additional keyword arguments to pass as keywords arguments to + `arg`. + + .. 
versionadded:: 3.0.0 Returns ------- @@ -4388,6 +4395,8 @@ def map( 3 I am a rabbit dtype: object """ + if callable(arg): + arg = functools.partial(arg, **kwargs) new_values = self._map_values(arg, na_action=na_action) return self._constructor(new_values, index=self.index, copy=False).__finalize__( self, method="map" diff --git a/pandas/tests/series/methods/test_map.py b/pandas/tests/series/methods/test_map.py index 7fa8686fcc6c8..84b60a2afe6eb 100644 --- a/pandas/tests/series/methods/test_map.py +++ b/pandas/tests/series/methods/test_map.py @@ -597,3 +597,10 @@ def test_map_type(): result = s.map(type) expected = Series([int, str, type], index=["a", "b", "c"]) tm.assert_series_equal(result, expected) + + +def test_map_kwargs(): + # GH 59814 + result = Series([2, 4, 5]).map(lambda x, y: x + y, y=2) + expected = Series([4, 6, 7]) + tm.assert_series_equal(result, expected) From a9f76d753dfe3db9206e5556c90ffac0e0ebf46d Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 25 Sep 2024 12:19:47 -0700 Subject: [PATCH 081/224] REF: pass dtype explicitly to _from_sequence inside pd.array (#59773) REF: pass dtype explicitly to _from_sequence --- pandas/core/construction.py | 6 ++++-- pandas/tests/extension/base/methods.py | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/pandas/core/construction.py b/pandas/core/construction.py index bb3aa3867ab08..1e1292f8ef089 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -358,7 +358,8 @@ def array( return cls._from_sequence(data, dtype=dtype, copy=copy) elif data.dtype.kind in "iu": - return IntegerArray._from_sequence(data, copy=copy) + dtype = IntegerArray._dtype_cls._get_dtype_mapping()[data.dtype] + return IntegerArray._from_sequence(data, dtype=dtype, copy=copy) elif data.dtype.kind == "f": # GH#44715 Exclude np.float16 bc FloatingArray does not support it; # we will fall back to NumpyExtensionArray. 
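
Looking back at PATCH 080, the new ``**kwargs`` pass-through on :meth:`Series.map` behaves as in the test added there (an illustrative sketch of the new behavior):

>>> import pandas as pd
>>> pd.Series([2, 4, 5]).map(lambda x, y: x + y, y=2)
0    4
1    6
2    7
dtype: int64
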
@@ -366,7 +367,8 @@ def array( return NumpyExtensionArray._from_sequence( data, dtype=data.dtype, copy=copy ) - return FloatingArray._from_sequence(data, copy=copy) + dtype = FloatingArray._dtype_cls._get_dtype_mapping()[data.dtype] + return FloatingArray._from_sequence(data, dtype=dtype, copy=copy) elif data.dtype.kind == "b": return BooleanArray._from_sequence(data, dtype="boolean", copy=copy) diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index dd2ed0bd62a02..fd9fec0cb490c 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -549,7 +549,7 @@ def _test_searchsorted_bool_dtypes(self, data_for_sorting, as_series): dtype = data_for_sorting.dtype data_for_sorting = pd.array([True, False], dtype=dtype) b, a = data_for_sorting - arr = type(data_for_sorting)._from_sequence([a, b]) + arr = type(data_for_sorting)._from_sequence([a, b], dtype=dtype) if as_series: arr = pd.Series(arr) From a92b919a1bb676252b45e574d102b2af29daac12 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 25 Sep 2024 12:21:12 -0700 Subject: [PATCH 082/224] REF: pass dtype explicitly to _from_sequence (#59774) --- pandas/core/arrays/arrow/array.py | 8 ++++++- pandas/core/arrays/datetimelike.py | 6 ++--- pandas/core/arrays/datetimes.py | 6 +---- pandas/core/arrays/period.py | 2 +- .../arrays/datetimes/test_constructors.py | 22 ++++++++++++------- pandas/tests/arrays/test_array.py | 8 +++++-- pandas/tests/arrays/test_datetimelike.py | 10 +++++---- pandas/tests/arrays/test_datetimes.py | 12 +++++++--- pandas/tests/arrays/test_timedeltas.py | 8 +++---- pandas/tests/base/test_conversion.py | 3 ++- pandas/tests/dtypes/test_generic.py | 4 ++-- .../series/accessors/test_dt_accessor.py | 3 ++- 12 files changed, 57 insertions(+), 35 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 39cae5b8e2683..00d46ab9296d0 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -2300,7 +2300,13 @@ def _groupby_op( ) if isinstance(result, np.ndarray): return result - return type(self)._from_sequence(result, copy=False) + elif isinstance(result, BaseMaskedArray): + pa_result = result.__arrow_array__() + return type(self)(pa_result) + else: + # DatetimeArray, TimedeltaArray + pa_result = pa.array(result, from_pandas=True) + return type(self)(pa_result) def _apply_elementwise(self, func: Callable) -> list[list[Any]]: """Apply a callable to each element while maintaining the chunking structure.""" diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 7be8daa09c758..a25a698856747 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -1393,7 +1393,7 @@ def __add__(self, other): if isinstance(result, np.ndarray) and lib.is_np_dtype(result.dtype, "m"): from pandas.core.arrays import TimedeltaArray - return TimedeltaArray._from_sequence(result) + return TimedeltaArray._from_sequence(result, dtype=result.dtype) return result def __radd__(self, other): @@ -1453,7 +1453,7 @@ def __sub__(self, other): if isinstance(result, np.ndarray) and lib.is_np_dtype(result.dtype, "m"): from pandas.core.arrays import TimedeltaArray - return TimedeltaArray._from_sequence(result) + return TimedeltaArray._from_sequence(result, dtype=result.dtype) return result def __rsub__(self, other): @@ -1472,7 +1472,7 @@ def __rsub__(self, other): # Avoid down-casting DatetimeIndex from pandas.core.arrays import DatetimeArray - other = 
DatetimeArray._from_sequence(other) + other = DatetimeArray._from_sequence(other, dtype=other.dtype) return other - self elif self.dtype.kind == "M" and hasattr(other, "dtype") and not other_is_dt64: # GH#19959 datetime - datetime is well-defined as timedelta, diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 201c449185057..43f4428118aa7 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -818,11 +818,7 @@ def _add_offset(self, offset: BaseOffset) -> Self: stacklevel=find_stack_level(), ) res_values = self.astype("O") + offset - # TODO(GH#55564): as_unit will be unnecessary - result = type(self)._from_sequence(res_values).as_unit(self.unit) - if not len(self): - # GH#30336 _from_sequence won't be able to infer self.tz - return result.tz_localize(self.tz) + result = type(self)._from_sequence(res_values, dtype=self.dtype) else: result = type(self)._simple_new(res_values, dtype=res_values.dtype) diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index aa8dacbd6aad5..7d0ad74f851f0 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -812,7 +812,7 @@ def to_timestamp(self, freq=None, how: str = "start") -> DatetimeArray: new_parr = self.asfreq(freq, how=how) new_data = libperiod.periodarr_to_dt64arr(new_parr.asi8, base) - dta = DatetimeArray._from_sequence(new_data) + dta = DatetimeArray._from_sequence(new_data, dtype=np.dtype("M8[ns]")) if self.freq.name == "B": # See if we can retain BDay instead of Day in cases where diff --git a/pandas/tests/arrays/datetimes/test_constructors.py b/pandas/tests/arrays/datetimes/test_constructors.py index d7264c002c67f..74cc3e991bb76 100644 --- a/pandas/tests/arrays/datetimes/test_constructors.py +++ b/pandas/tests/arrays/datetimes/test_constructors.py @@ -28,10 +28,12 @@ def test_mixing_naive_tzaware_raises(self, meth): # GH#24569 arr = np.array([pd.Timestamp("2000"), pd.Timestamp("2000", tz="CET")]) - msg = ( - "Cannot mix tz-aware with tz-naive values|" - "Tz-aware datetime.datetime cannot be converted " - "to datetime64 unless utc=True" + msg = "|".join( + [ + "Cannot mix tz-aware with tz-naive values", + "Tz-aware datetime.datetime cannot be converted " + "to datetime64 unless utc=True", + ] ) for obj in [arr, arr[::-1]]: @@ -63,10 +65,10 @@ def test_bool_dtype_raises(self): def test_copy(self): data = np.array([1, 2, 3], dtype="M8[ns]") - arr = DatetimeArray._from_sequence(data, copy=False) + arr = DatetimeArray._from_sequence(data, dtype=data.dtype, copy=False) assert arr._ndarray is data - arr = DatetimeArray._from_sequence(data, copy=True) + arr = DatetimeArray._from_sequence(data, dtype=data.dtype, copy=True) assert arr._ndarray is not data def test_numpy_datetime_unit(self, unit): @@ -163,7 +165,9 @@ def test_from_arrow_from_empty(unit, tz): dtype = DatetimeTZDtype(unit=unit, tz=tz) result = dtype.__from_arrow__(arr) - expected = DatetimeArray._from_sequence(np.array(data, dtype=f"datetime64[{unit}]")) + expected = DatetimeArray._from_sequence( + np.array(data, dtype=f"datetime64[{unit}]"), dtype=np.dtype(f"M8[{unit}]") + ) expected = expected.tz_localize(tz=tz) tm.assert_extension_array_equal(result, expected) @@ -179,7 +183,9 @@ def test_from_arrow_from_integers(): dtype = DatetimeTZDtype(unit="ns", tz="UTC") result = dtype.__from_arrow__(arr) - expected = DatetimeArray._from_sequence(np.array(data, dtype="datetime64[ns]")) + expected = DatetimeArray._from_sequence( + np.array(data, dtype="datetime64[ns]"), 
dtype=np.dtype("M8[ns]") + ) expected = expected.tz_localize("UTC") tm.assert_extension_array_equal(result, expected) diff --git a/pandas/tests/arrays/test_array.py b/pandas/tests/arrays/test_array.py index 4070a2844846f..3c0ef1e4d928b 100644 --- a/pandas/tests/arrays/test_array.py +++ b/pandas/tests/arrays/test_array.py @@ -370,11 +370,15 @@ def test_array_copy(): ), ( np.array([1, 2], dtype="m8[ns]"), - TimedeltaArray._from_sequence(np.array([1, 2], dtype="m8[ns]")), + TimedeltaArray._from_sequence( + np.array([1, 2], dtype="m8[ns]"), dtype=np.dtype("m8[ns]") + ), ), ( np.array([1, 2], dtype="m8[us]"), - TimedeltaArray._from_sequence(np.array([1, 2], dtype="m8[us]")), + TimedeltaArray._from_sequence( + np.array([1, 2], dtype="m8[us]"), dtype=np.dtype("m8[us]") + ), ), # integer ([1, 2], IntegerArray._from_sequence([1, 2], dtype="Int64")), diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index 6dd1ef9d59ab4..0c8eefab95464 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -257,7 +257,8 @@ def test_fillna_method_doesnt_change_orig(self, method): if self.array_cls is PeriodArray: arr = self.array_cls(data, dtype="period[D]") else: - arr = self.array_cls._from_sequence(data) + dtype = "M8[ns]" if self.array_cls is DatetimeArray else "m8[ns]" + arr = self.array_cls._from_sequence(data, dtype=np.dtype(dtype)) arr[4] = NaT fill_value = arr[3] if method == "pad" else arr[5] @@ -273,7 +274,8 @@ def test_searchsorted(self): if self.array_cls is PeriodArray: arr = self.array_cls(data, dtype="period[D]") else: - arr = self.array_cls._from_sequence(data) + dtype = "M8[ns]" if self.array_cls is DatetimeArray else "m8[ns]" + arr = self.array_cls._from_sequence(data, dtype=np.dtype(dtype)) # scalar result = arr.searchsorted(arr[1]) @@ -739,10 +741,10 @@ def test_array_i8_dtype(self, arr1d): def test_from_array_keeps_base(self): # Ensure that DatetimeArray._ndarray.base isn't lost. 
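
PATCH 081's explicit dtype plumbing is behavior-preserving; for reference, the NumPy-to-masked-dtype mapping it relies on looks like this (an illustrative sketch of long-standing :func:`pandas.array` behavior, not code from the patch):

>>> import numpy as np
>>> import pandas as pd
>>> pd.array(np.array([1, 2], dtype="int32"))
<IntegerArray>
[1, 2]
Length: 2, dtype: Int32
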
arr = np.array(["2000-01-01", "2000-01-02"], dtype="M8[ns]") - dta = DatetimeArray._from_sequence(arr) + dta = DatetimeArray._from_sequence(arr, dtype=arr.dtype) assert dta._ndarray is arr - dta = DatetimeArray._from_sequence(arr[:0]) + dta = DatetimeArray._from_sequence(arr[:0], dtype=arr.dtype) assert dta._ndarray.base is arr def test_from_dti(self, arr1d): diff --git a/pandas/tests/arrays/test_datetimes.py b/pandas/tests/arrays/test_datetimes.py index 8e348805de978..e3f49d04a0ff2 100644 --- a/pandas/tests/arrays/test_datetimes.py +++ b/pandas/tests/arrays/test_datetimes.py @@ -499,7 +499,7 @@ def test_value_counts_preserves_tz(self): @pytest.mark.parametrize("method", ["pad", "backfill"]) def test_fillna_preserves_tz(self, method): dti = pd.date_range("2000-01-01", periods=5, freq="D", tz="US/Central") - arr = DatetimeArray._from_sequence(dti, copy=True) + arr = DatetimeArray._from_sequence(dti, dtype=dti.dtype, copy=True) arr[2] = pd.NaT fill_val = dti[1] if method == "pad" else dti[3] @@ -665,7 +665,9 @@ def test_shift_fill_value(self): dti = pd.date_range("2016-01-01", periods=3) dta = dti._data - expected = DatetimeArray._from_sequence(np.roll(dta._ndarray, 1)) + expected = DatetimeArray._from_sequence( + np.roll(dta._ndarray, 1), dtype=dti.dtype + ) fv = dta[-1] for fill_value in [fv, fv.to_pydatetime(), fv.to_datetime64()]: @@ -731,7 +733,11 @@ def test_iter_zoneinfo_fold(self, tz): ) utc_vals *= 1_000_000_000 - dta = DatetimeArray._from_sequence(utc_vals).tz_localize("UTC").tz_convert(tz) + dta = ( + DatetimeArray._from_sequence(utc_vals, dtype=np.dtype("M8[ns]")) + .tz_localize("UTC") + .tz_convert(tz) + ) left = dta[2] right = list(dta)[2] diff --git a/pandas/tests/arrays/test_timedeltas.py b/pandas/tests/arrays/test_timedeltas.py index bcc52f197ee51..fb7c7afdc6ff9 100644 --- a/pandas/tests/arrays/test_timedeltas.py +++ b/pandas/tests/arrays/test_timedeltas.py @@ -263,10 +263,10 @@ def test_searchsorted_invalid_types(self, other, index): class TestUnaryOps: def test_abs(self): vals = np.array([-3600 * 10**9, "NaT", 7200 * 10**9], dtype="m8[ns]") - arr = TimedeltaArray._from_sequence(vals) + arr = TimedeltaArray._from_sequence(vals, dtype=vals.dtype) evals = np.array([3600 * 10**9, "NaT", 7200 * 10**9], dtype="m8[ns]") - expected = TimedeltaArray._from_sequence(evals) + expected = TimedeltaArray._from_sequence(evals, dtype=evals.dtype) result = abs(arr) tm.assert_timedelta_array_equal(result, expected) @@ -276,7 +276,7 @@ def test_abs(self): def test_pos(self): vals = np.array([-3600 * 10**9, "NaT", 7200 * 10**9], dtype="m8[ns]") - arr = TimedeltaArray._from_sequence(vals) + arr = TimedeltaArray._from_sequence(vals, dtype=vals.dtype) result = +arr tm.assert_timedelta_array_equal(result, arr) @@ -288,7 +288,7 @@ def test_pos(self): def test_neg(self): vals = np.array([-3600 * 10**9, "NaT", 7200 * 10**9], dtype="m8[ns]") - arr = TimedeltaArray._from_sequence(vals) + arr = TimedeltaArray._from_sequence(vals, dtype=vals.dtype) evals = np.array([3600 * 10**9, "NaT", -7200 * 10**9], dtype="m8[ns]") expected = TimedeltaArray._from_sequence(evals) diff --git a/pandas/tests/base/test_conversion.py b/pandas/tests/base/test_conversion.py index 13a3ff048c79e..d8af7abe83084 100644 --- a/pandas/tests/base/test_conversion.py +++ b/pandas/tests/base/test_conversion.py @@ -333,7 +333,8 @@ def test_array_multiindex_raises(): # Timedelta ( TimedeltaArray._from_sequence( - np.array([0, 3600000000000], dtype="i8").view("m8[ns]") + np.array([0, 3600000000000], dtype="i8").view("m8[ns]"), + 
dtype=np.dtype("m8[ns]"), ), np.array([0, 3600000000000], dtype="m8[ns]"), ), diff --git a/pandas/tests/dtypes/test_generic.py b/pandas/tests/dtypes/test_generic.py index 261f86bfb0326..2b90886a8d070 100644 --- a/pandas/tests/dtypes/test_generic.py +++ b/pandas/tests/dtypes/test_generic.py @@ -20,8 +20,8 @@ class TestABCClasses: df = pd.DataFrame({"names": ["a", "b", "c"]}, index=multi_index) sparse_array = pd.arrays.SparseArray(np.random.default_rng(2).standard_normal(10)) - datetime_array = pd.core.arrays.DatetimeArray._from_sequence(datetime_index) - timedelta_array = pd.core.arrays.TimedeltaArray._from_sequence(timedelta_index) + datetime_array = datetime_index.array + timedelta_array = timedelta_index.array abc_pairs = [ ("ABCMultiIndex", multi_index), diff --git a/pandas/tests/series/accessors/test_dt_accessor.py b/pandas/tests/series/accessors/test_dt_accessor.py index 9b9a8ea3600ae..885adb3543b46 100644 --- a/pandas/tests/series/accessors/test_dt_accessor.py +++ b/pandas/tests/series/accessors/test_dt_accessor.py @@ -790,7 +790,8 @@ def test_end_time_timevalues(self, input_vals): # GH#17157 # Check that the time part of the Period is adjusted by end_time # when using the dt accessor on a Series - input_vals = PeriodArray._from_sequence(np.asarray(input_vals)) + dtype = pd.PeriodDtype(input_vals[0].freq) + input_vals = PeriodArray._from_sequence(np.asarray(input_vals), dtype=dtype) ser = Series(input_vals) result = ser.dt.end_time From b96491a11b7938c9146a26bfac339a6ebe0ca4a2 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 25 Sep 2024 09:22:49 -1000 Subject: [PATCH 083/224] DOC: Emphasize team managed pandas in installation docs (#59822) * DOC: Emphasize team managed pandas in installation docs * grammar --- doc/source/development/maintaining.rst | 2 +- doc/source/getting_started/index.rst | 3 +- doc/source/getting_started/install.rst | 151 ++++++------------ web/pandas/getting_started.md | 29 +--- .../static/img/install/anaconda_prompt.png | Bin 1373 -> 0 bytes .../static/img/install/jupyterlab_home.png | Bin 1962 -> 0 bytes .../img/install/pandas_import_and_version.png | Bin 2252 -> 0 bytes 7 files changed, 52 insertions(+), 133 deletions(-) delete mode 100644 web/pandas/static/img/install/anaconda_prompt.png delete mode 100644 web/pandas/static/img/install/jupyterlab_home.png delete mode 100644 web/pandas/static/img/install/pandas_import_and_version.png diff --git a/doc/source/development/maintaining.rst b/doc/source/development/maintaining.rst index 50d380cab1d50..1e4a851d0e72d 100644 --- a/doc/source/development/maintaining.rst +++ b/doc/source/development/maintaining.rst @@ -344,7 +344,7 @@ in the next places: - Git repo with a `new tag `_ - Source distribution in a `GitHub release `_ - Pip packages in the `PyPI `_ -- Conda/Mamba packages in `conda-forge `_ +- Conda packages in `conda-forge `_ The process for releasing a new version of pandas is detailed next section. diff --git a/doc/source/getting_started/index.rst b/doc/source/getting_started/index.rst index 36ed553d9d88e..a17699a71fbd3 100644 --- a/doc/source/getting_started/index.rst +++ b/doc/source/getting_started/index.rst @@ -17,8 +17,7 @@ Installation :columns: 12 12 6 6 :padding: 3 - pandas is part of the `Anaconda `__ - distribution and can be installed with Anaconda or Miniconda: + pandas can be installed via conda from `conda-forge `__. 
++++++++++++++++++++++ diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index 8e6cb9e9a132d..b3982c4ad091f 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -6,15 +6,16 @@ Installation ============ -The easiest way to install pandas is to install it -as part of the `Anaconda `__ distribution, a -cross platform distribution for data analysis and scientific computing. -The `Conda `__ package manager is the -recommended installation method for most users. +The pandas development team officially distributes pandas for installation +through the following methods: -Instructions for installing :ref:`from source `, -:ref:`PyPI `, or a -:ref:`development version ` are also provided. +* Available on `conda-forge `__ for installation with the conda package manager. +* Available on `PyPI `__ for installation with pip. +* Available on `Github `__ for installation from source. + +.. note:: + pandas may be installable from other sources besides the ones listed above, + but they are **not** managed by the pandas development team. .. _install.version: @@ -26,68 +27,54 @@ See :ref:`Python support policy `. Installing pandas ----------------- -.. _install.anaconda: +.. _install.conda: -Installing with Anaconda -~~~~~~~~~~~~~~~~~~~~~~~~ +Installing with Conda +~~~~~~~~~~~~~~~~~~~~~ -For users that are new to Python, the easiest way to install Python, pandas, and the -packages that make up the `PyData `__ stack -(`SciPy `__, `NumPy `__, -`Matplotlib `__, `and more `__) -is with `Anaconda `__, a cross-platform -(Linux, macOS, Windows) Python distribution for data analytics and -scientific computing. Installation instructions for Anaconda -`can be found here `__. +For users working with the `Conda `__ package manager, +pandas can be installed from the ``conda-forge`` channel. -.. _install.miniconda: +.. code-block:: shell -Installing with Miniconda -~~~~~~~~~~~~~~~~~~~~~~~~~ + conda install -c conda-forge pandas -For users experienced with Python, the recommended way to install pandas with -`Miniconda `__. -Miniconda allows you to create a minimal, self-contained Python installation compared to Anaconda and use the -`Conda `__ package manager to install additional packages -and create a virtual environment for your installation. Installation instructions for Miniconda -`can be found here `__. +To install the Conda package manager on your system, the +`Miniforge distribution `__ +is recommended. -The next step is to create a new conda environment. A conda environment is like a -virtualenv that allows you to specify a specific version of Python and set of libraries. -Run the following commands from a terminal window. +Additionally, it is recommended to install and run pandas from a virtual environment. .. code-block:: shell conda create -c conda-forge -n name_of_my_env python pandas - -This will create a minimal environment with only Python and pandas installed. -To put your self inside this environment run. - -.. code-block:: shell - + # On Linux or MacOS source activate name_of_my_env # On Windows activate name_of_my_env -.. _install.pypi: +.. tip:: + For users that are new to Python, the easiest way to install Python, pandas, and the + packages that make up the `PyData `__ stack such as + `SciPy `__, `NumPy `__ and + `Matplotlib `__ + is with `Anaconda `__, a cross-platform + (Linux, macOS, Windows) Python distribution for data analytics and + scientific computing. 
-Installing from PyPI -~~~~~~~~~~~~~~~~~~~~ + However, pandas from Anaconda is **not** officially managed by the pandas development team. -pandas can be installed via pip from -`PyPI `__. +.. _install.pip: -.. code-block:: shell - - pip install pandas +Installing with pip +~~~~~~~~~~~~~~~~~~~ -.. note:: - You must have ``pip>=19.3`` to install from PyPI. +For users working with the `pip `__ package manager, +pandas can be installed from `PyPI `__. -.. note:: +.. code-block:: shell - It is recommended to install and run pandas from a virtual environment, for example, - using the Python standard library's `venv `__ + pip install pandas pandas can also be installed with sets of optional dependencies to enable certain functionality. For example, to install pandas with the optional dependencies to read Excel files. @@ -98,25 +85,8 @@ to install pandas with the optional dependencies to read Excel files. The full list of extras that can be installed can be found in the :ref:`dependency section.` -Handling ImportErrors -~~~~~~~~~~~~~~~~~~~~~ - -If you encounter an ``ImportError``, it usually means that Python couldn't find pandas in the list of available -libraries. Python internally has a list of directories it searches through, to find packages. You can -obtain these directories with. - -.. code-block:: python - - import sys - sys.path - -One way you could be encountering this error is if you have multiple Python installations on your system -and you don't have pandas installed in the Python installation you're currently using. -In Linux/Mac you can run ``which python`` on your terminal and it will tell you which Python installation you're -using. If it's something like "/usr/bin/python", you're using the Python from the system, which is not recommended. - -It is highly recommended to use ``conda``, for quick installation and for package and dependency updates. -You can find simple installation instructions for pandas :ref:`in this document `. +Additionally, it is recommended to install and run pandas from a virtual environment, for example, +using the Python standard library's `venv `__ .. _install.source: @@ -144,49 +114,24 @@ index from the PyPI registry of anaconda.org. You can install it by running. pip install --pre --extra-index https://pypi.anaconda.org/scientific-python-nightly-wheels/simple pandas -Note that you might be required to uninstall an existing version of pandas to install the development version. +.. note:: + You might be required to uninstall an existing version of pandas to install the development version. -.. code-block:: shell + .. code-block:: shell - pip uninstall pandas -y + pip uninstall pandas -y Running the test suite ---------------------- -pandas is equipped with an exhaustive set of unit tests. The packages required to run the tests -can be installed with ``pip install "pandas[test]"``. To run the tests from a -Python terminal. - -.. code-block:: python - - >>> import pandas as pd - >>> pd.test() - running: pytest -m "not slow and not network and not db" /home/user/anaconda3/lib/python3.10/site-packages/pandas - - ============================= test session starts ============================== - platform linux -- Python 3.9.7, pytest-6.2.5, py-1.11.0, pluggy-1.0.0 - rootdir: /home/user - plugins: dash-1.19.0, anyio-3.5.0, hypothesis-6.29.3 - collected 154975 items / 4 skipped / 154971 selected - ........................................................................ [ 0%] - ........................................................................ 
[ 99%] - ....................................... [100%] - - ==================================== ERRORS ==================================== - - =================================== FAILURES =================================== - - =============================== warnings summary =============================== - - =========================== short test summary info ============================ - - = 1 failed, 146194 passed, 7402 skipped, 1367 xfailed, 5 xpassed, 197 warnings, 10 errors in 1090.16s (0:18:10) = +If pandas has been installed :ref:`from source `, running ``pytest pandas`` will run all of pandas unit tests. +The unit tests can also be run from the pandas module itself with the :func:`test` function. The packages required to run the tests +can be installed with ``pip install "pandas[test]"``. .. note:: - This is just an example of what information is shown. Test failures are not necessarily indicative - of a broken pandas installation. + Test failures are not necessarily indicative of a broken pandas installation. .. _install.dependencies: @@ -219,7 +164,7 @@ For example, :func:`pandas.read_hdf` requires the ``pytables`` package, while optional dependency is not installed, pandas will raise an ``ImportError`` when the method requiring that dependency is called. -If using pip, optional pandas dependencies can be installed or managed in a file (e.g. requirements.txt or pyproject.toml) +With pip, optional pandas dependencies can be installed or managed in a file (e.g. requirements.txt or pyproject.toml) as optional extras (e.g. ``pandas[performance, aws]``). All optional dependencies can be installed with ``pandas[all]``, and specific sets of dependencies are listed in the sections below. diff --git a/web/pandas/getting_started.md b/web/pandas/getting_started.md index 0c4219e1ae12e..801081a9ef391 100644 --- a/web/pandas/getting_started.md +++ b/web/pandas/getting_started.md @@ -2,33 +2,8 @@ ## Installation instructions -The next steps provides the easiest and recommended way to set up your -environment to use pandas. Other installation options can be found in -the [advanced installation page]({{ base_url}}docs/getting_started/install.html). - -1. Download [Anaconda](https://www.anaconda.com/download/) for your operating system and - the latest Python version, run the installer, and follow the steps. Please note: - - - It is not needed (and discouraged) to install Anaconda as root or administrator. - - When asked if you wish to initialize Anaconda3, answer yes. - - Restart the terminal after completing the installation. - - Detailed instructions on how to install Anaconda can be found in the - [Anaconda documentation](https://docs.anaconda.com/anaconda/install/). - -2. In the Anaconda prompt (or terminal in Linux or macOS), start JupyterLab: - - - -3. In JupyterLab, create a new (Python 3) notebook: - - - -4. In the first cell of the notebook, you can import pandas and check the version with: - - - -5. Now you are ready to use pandas, and you can write your code in the next cells. +To install pandas, please reference the [installation page]({{ base_url}}docs/getting_started/install.html) +from the pandas documentation. 
## Tutorials

diff --git a/web/pandas/static/img/install/anaconda_prompt.png b/web/pandas/static/img/install/anaconda_prompt.png
deleted file mode 100644
index 7b547e4ebb02a6102ecf615ddddf576dc74ccd15..0000000000000000000000000000000000000000
GIT binary patch
[base85 binary patch data for the deleted image omitted]
diff --git a/web/pandas/static/img/install/jupyterlab_home.png b/web/pandas/static/img/install/jupyterlab_home.png
deleted file mode 100644
index c62d33a5e0fc605be6d66c4a7be9f31d9baee8bc..0000000000000000000000000000000000000000
GIT binary patch
[base85 binary patch data for the deleted image omitted]
diff --git a/web/pandas/static/img/install/pandas_import_and_version.png b/web/pandas/static/img/install/pandas_import_and_version.png
deleted file mode 100644
index 64c1303ac495ccf72a7c649401cce26c47c15ace..0000000000000000000000000000000000000000
GIT binary patch
[base85 binary patch data for the deleted image omitted]

From b9488218ae27b70d1669a932ab16e8ce5a257cf0 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Wed, 25 Sep 2024 14:47:10 -1000
Subject: [PATCH 084/224] CI/TST: Check for tzset in set_timezone (#59893)

* CI/TST: Check for tzset in set_timezone

* adjust test message
---
 pandas/_testing/contexts.py         | 17 +++++++++--------
 pandas/tests/tslibs/test_parsing.py | 11 +++++++----
 2 files changed, 16 insertions(+), 12 deletions(-)

diff --git a/pandas/_testing/contexts.py b/pandas/_testing/contexts.py
index 91b5d2a981bef..4ca67d6fc082d 100644
--- a/pandas/_testing/contexts.py
+++ b/pandas/_testing/contexts.py
@@ -73,14 +73,15 @@ def set_timezone(tz: str) -> Generator[None, None, None]:
     import time

     def setTZ(tz) -> None:
-        if tz is None:
-            try:
-                del os.environ["TZ"]
-            except KeyError:
-                pass
-        else:
-            os.environ["TZ"] = tz
-        time.tzset()
+        if hasattr(time, "tzset"):
+            if tz is None:
+                try:
+                    del os.environ["TZ"]
+                except KeyError:
+                    pass
+            else:
+                os.environ["TZ"] = tz
+            time.tzset()

     orig_tz = os.environ.get("TZ")
     setTZ(tz)
diff --git a/pandas/tests/tslibs/test_parsing.py b/pandas/tests/tslibs/test_parsing.py
index 9b64beaf09273..07425af8ed37a 100644
--- a/pandas/tests/tslibs/test_parsing.py
+++ b/pandas/tests/tslibs/test_parsing.py
@@ -37,10 +37,13 @@
 )
 def test_parsing_tzlocal_deprecated():
     # GH#50791
-    msg = (
-        r"Parsing 'EST' as tzlocal \(dependent on system timezone\) "
-        r"is no longer supported\. "
-        "Pass the 'tz' keyword or call tz_localize after construction instead"
+    msg = "|".join(
+        [
+            r"Parsing 'EST' as tzlocal \(dependent on system timezone\) "
+            r"is no longer supported\. "
+            "Pass the 'tz' keyword or call tz_localize after construction instead",
+            ".*included an un-recognized timezone",
+        ]
     )

     dtstr = "Jan 15 2004 03:00 EST"

From 23c497bb2f7e05af1fda966e7fb04db942453559 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Thu, 26 Sep 2024 05:06:07 -1000
Subject: [PATCH 085/224] DOC: Recommend conda from miniforge for contributing
 environment (#59894)

---
 doc/source/development/contributing.rst      |  6 ++---
 .../development/contributing_codebase.rst    |  2 +-
 .../development/contributing_environment.rst | 23 +++++++++----------
 3 files changed, 15 insertions(+), 16 deletions(-)

diff --git a/doc/source/development/contributing.rst b/doc/source/development/contributing.rst
index fe5271dab7132..4d99f282aa695 100644
--- a/doc/source/development/contributing.rst
+++ b/doc/source/development/contributing.rst
@@ -305,15 +305,15 @@ It is important to periodically update your local ``main`` branch with updates f
 branch and update your development environment to reflect any changes to the
 various packages that are used during development.

-If using :ref:`mamba `, run:
+If using :ref:`conda `, run:

.. 
code-block:: shell git checkout main git fetch upstream git merge upstream/main - mamba activate pandas-dev - mamba env update -f environment.yml --prune + conda activate pandas-dev + conda env update -f environment.yml --prune If using :ref:`pip ` , do: diff --git a/doc/source/development/contributing_codebase.rst b/doc/source/development/contributing_codebase.rst index 9d5a992e911b6..670ffe6996302 100644 --- a/doc/source/development/contributing_codebase.rst +++ b/doc/source/development/contributing_codebase.rst @@ -244,7 +244,7 @@ in your python environment. .. warning:: - * Please be aware that the above commands will use the current python environment. If your python packages are older/newer than those installed by the pandas CI, the above commands might fail. This is often the case when the ``mypy`` or ``numpy`` versions do not match. Please see :ref:`how to setup the python environment ` or select a `recently succeeded workflow `_, select the "Docstring validation, typing, and other manual pre-commit hooks" job, then click on "Set up Conda" and "Environment info" to see which versions the pandas CI installs. + * Please be aware that the above commands will use the current python environment. If your python packages are older/newer than those installed by the pandas CI, the above commands might fail. This is often the case when the ``mypy`` or ``numpy`` versions do not match. Please see :ref:`how to setup the python environment ` or select a `recently succeeded workflow `_, select the "Docstring validation, typing, and other manual pre-commit hooks" job, then click on "Set up Conda" and "Environment info" to see which versions the pandas CI installs. .. _contributing.ci: diff --git a/doc/source/development/contributing_environment.rst b/doc/source/development/contributing_environment.rst index 643021db7b823..1426d3a84a748 100644 --- a/doc/source/development/contributing_environment.rst +++ b/doc/source/development/contributing_environment.rst @@ -43,7 +43,7 @@ and consult the ``Linux`` instructions below. **macOS** -To use the :ref:`mamba `-based compilers, you will need to install the +To use the :ref:`conda `-based compilers, you will need to install the Developer Tools using ``xcode-select --install``. If you prefer to use a different compiler, general information can be found here: @@ -51,9 +51,9 @@ https://devguide.python.org/setup/#macos **Linux** -For Linux-based :ref:`mamba ` installations, you won't have to install any -additional components outside of the mamba environment. The instructions -below are only needed if your setup isn't based on mamba environments. +For Linux-based :ref:`conda ` installations, you won't have to install any +additional components outside of the conda environment. The instructions +below are only needed if your setup isn't based on conda environments. Some Linux distributions will come with a pre-installed C compiler. To find out which compilers (and versions) are installed on your system:: @@ -82,19 +82,18 @@ Before we begin, please: * Make sure that you have :any:`cloned the repository ` * ``cd`` to the pandas source directory you just created with the clone command -.. _contributing.mamba: +.. 
_contributing.conda: -Option 1: using mamba (recommended) +Option 1: using conda (recommended) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -* Install miniforge to get `mamba `_ -* Make sure your mamba is up to date (``mamba update mamba``) -* Create and activate the ``pandas-dev`` mamba environment using the following commands: +* Install miniforge to get `conda `_ +* Create and activate the ``pandas-dev`` conda environment using the following commands: -.. code-block:: none +.. code-block:: bash - mamba env create --file environment.yml - mamba activate pandas-dev + conda env create --file environment.yml + conda activate pandas-dev .. _contributing.pip: From 5ced458f6318f0319877ab655b8cb6b86092ea62 Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Sat, 28 Sep 2024 07:51:30 -0400 Subject: [PATCH 086/224] CI: Pin micromamba to 1.x (#59912) --- .github/actions/setup-conda/action.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/actions/setup-conda/action.yml b/.github/actions/setup-conda/action.yml index 3eb68bdd2a15c..4fe901998cbcc 100644 --- a/.github/actions/setup-conda/action.yml +++ b/.github/actions/setup-conda/action.yml @@ -9,6 +9,8 @@ runs: - name: Install ${{ inputs.environment-file }} uses: mamba-org/setup-micromamba@v1 with: + # Pinning to avoid 2.0 failures + micromamba-version: '1.5.10-0' environment-file: ${{ inputs.environment-file }} environment-name: test condarc-file: ci/.condarc From 96de1f13103cd21417101de9d555f203cf93867a Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Sun, 29 Sep 2024 00:07:34 +0530 Subject: [PATCH 087/224] DOC: fix SA01, ES01 for pandas.Series.sparse.npoints (#59896) * DOC: fix SA01, ES01 for pandas.Series.sparse.npoints * Update pandas/core/arrays/sparse/array.py --- ci/code_checks.sh | 1 - pandas/core/arrays/sparse/array.py | 12 ++++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index f662b4781e84b..149c5c0326733 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -97,7 +97,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Series.dt.unit GL08" \ -i "pandas.Series.pad PR01,SA01" \ -i "pandas.Series.sparse.from_coo PR07,SA01" \ - -i "pandas.Series.sparse.npoints SA01" \ -i "pandas.Timedelta.max PR02" \ -i "pandas.Timedelta.min PR02" \ -i "pandas.Timedelta.resolution PR02" \ diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index c8ec4068ca199..0c76280e7fdb4 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -708,6 +708,18 @@ def npoints(self) -> int: """ The number of non- ``fill_value`` points. + This property returns the number of elements in the sparse series that are + not equal to the ``fill_value``. Sparse data structures store only the + non-``fill_value`` elements, reducing memory usage when the majority of + values are the same. + + See Also + -------- + Series.sparse.to_dense : Convert a Series from sparse values to dense. + Series.sparse.fill_value : Elements in ``data`` that are ``fill_value`` are + not stored. + Series.sparse.density : The percent of non- ``fill_value`` points, as decimal. 
+ Examples -------- >>> from pandas.arrays import SparseArray From cf12e6722cfaba646e7f0a1e5e8db88be8d076cd Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Sun, 29 Sep 2024 00:08:55 +0530 Subject: [PATCH 088/224] DOC: fix RT03, ES01 for pandas.core.groupby.DataFrameGroupBy.agg and pandas.core.groupby.DataFrameGroupBy.aggregate (#59869) * DOC: add double backticks for sphinx compatibility Co-authored-by: mroeschke * DOC: remove _agg_template_frame Co-authored-by: mroeschke Co-authored-by: rhshadrach * DOC: fix RT03, ES01 for pandas.core.groupby.DataFrameGroupBy.aggregate --------- Co-authored-by: mroeschke Co-authored-by: rhshadrach --- ci/code_checks.sh | 2 - pandas/core/groupby/generic.py | 176 +++++++++++++++++++++++++- pandas/core/groupby/groupby.py | 78 ------------ scripts/validate_unwanted_patterns.py | 1 - 4 files changed, 174 insertions(+), 83 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 149c5c0326733..669c793737161 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -127,8 +127,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.arrays.SparseArray PR07,SA01" \ -i "pandas.arrays.TimedeltaArray PR07,SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.__iter__ RT03,SA01" \ - -i "pandas.core.groupby.DataFrameGroupBy.agg RT03" \ - -i "pandas.core.groupby.DataFrameGroupBy.aggregate RT03" \ -i "pandas.core.groupby.DataFrameGroupBy.boxplot PR07,RT03,SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.get_group RT03,SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.groups SA01" \ diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index bec9d344d42e2..0c211afb5073c 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -67,7 +67,6 @@ from pandas.core.groupby.groupby import ( GroupBy, GroupByPlot, - _agg_template_frame, _agg_template_series, _transform_template, ) @@ -1515,8 +1514,181 @@ class DataFrameGroupBy(GroupBy[DataFrame]): """ ) - @doc(_agg_template_frame, examples=_agg_examples_doc, klass="DataFrame") def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs): + """ + Aggregate using one or more operations. + + The ``aggregate`` function allows the application of one or more aggregation + operations on groups of data within a DataFrameGroupBy object. It supports + various aggregation methods, including user-defined functions and predefined + functions such as 'sum', 'mean', etc. + + Parameters + ---------- + func : function, str, list, dict or None + Function to use for aggregating the data. If a function, must either + work when passed a DataFrame or when passed to DataFrame.apply. + + Accepted combinations are: + + - function + - string function name + - list of functions and/or function names, e.g. ``[np.sum, 'mean']`` + - dict of index labels -> functions, function names or list of such. + - None, in which case ``**kwargs`` are used with Named Aggregation. Here the + output has one column for each element in ``**kwargs``. The name of the + column is keyword, whereas the value determines the aggregation used to + compute the values in the column. + + Can also accept a Numba JIT function with + ``engine='numba'`` specified. Only passing a single function is supported + with this engine. + + If the ``'numba'`` engine is chosen, the function must be + a user defined function with ``values`` and ``index`` as the + first and second arguments respectively in the function signature. 
+ Each group's index will be passed to the user defined function + and optionally available for use. + + *args + Positional arguments to pass to func. + engine : str, default None + * ``'cython'`` : Runs the function through C-extensions from cython. + * ``'numba'`` : Runs the function through JIT compiled code from numba. + * ``None`` : Defaults to ``'cython'`` or globally setting + ``compute.use_numba`` + + engine_kwargs : dict, default None + * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` + * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` + and ``parallel`` dictionary keys. The values must either be ``True`` or + ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is + ``{'nopython': True, 'nogil': False, 'parallel': False}`` and will be + applied to the function + + **kwargs + * If ``func`` is None, ``**kwargs`` are used to define the output names and + aggregations via Named Aggregation. See ``func`` entry. + * Otherwise, keyword arguments to be passed into func. + + Returns + ------- + DataFrame + Aggregated DataFrame based on the grouping and the applied aggregation + functions. + + See Also + -------- + DataFrame.groupby.apply : Apply function func group-wise + and combine the results together. + DataFrame.groupby.transform : Transforms the Series on each group + based on the given function. + DataFrame.aggregate : Aggregate using one or more operations. + + Notes + ----- + When using ``engine='numba'``, there will be no "fall back" behavior internally. + The group data and group index will be passed as numpy arrays to the JITed + user defined function, and no alternative execution attempts will be tried. + + Functions that mutate the passed object can produce unexpected + behavior or errors and are not supported. See :ref:`gotchas.udf-mutation` + for more details. + + .. versionchanged:: 1.3.0 + + The resulting dtype will reflect the return value of the passed ``func``, + see the examples below. + + Examples + -------- + >>> data = { + ... "A": [1, 1, 2, 2], + ... "B": [1, 2, 3, 4], + ... "C": [0.362838, 0.227877, 1.267767, -0.562860], + ... } + >>> df = pd.DataFrame(data) + >>> df + A B C + 0 1 1 0.362838 + 1 1 2 0.227877 + 2 2 3 1.267767 + 3 2 4 -0.562860 + + The aggregation is for each column. + + >>> df.groupby("A").agg("min") + B C + A + 1 1 0.227877 + 2 3 -0.562860 + + Multiple aggregations + + >>> df.groupby("A").agg(["min", "max"]) + B C + min max min max + A + 1 1 2 0.227877 0.362838 + 2 3 4 -0.562860 1.267767 + + Select a column for aggregation + + >>> df.groupby("A").B.agg(["min", "max"]) + min max + A + 1 1 2 + 2 3 4 + + User-defined function for aggregation + + >>> df.groupby("A").agg(lambda x: sum(x) + 2) + B C + A + 1 5 2.590715 + 2 9 2.704907 + + Different aggregations per column + + >>> df.groupby("A").agg({"B": ["min", "max"], "C": "sum"}) + B C + min max sum + A + 1 1 2 0.590715 + 2 3 4 0.704907 + + To control the output names with different aggregations per column, + pandas supports "named aggregation" + + >>> df.groupby("A").agg( + ... b_min=pd.NamedAgg(column="B", aggfunc="min"), + ... c_sum=pd.NamedAgg(column="C", aggfunc="sum"), + ... ) + b_min c_sum + A + 1 1 0.590715 + 2 3 0.704907 + + - The keywords are the *output* column names + - The values are tuples whose first element is the column to select + and the second element is the aggregation to apply to that column. 
+ Pandas provides the ``pandas.NamedAgg`` namedtuple with the fields + ``['column', 'aggfunc']`` to make it clearer what the arguments are. + As usual, the aggregation can be a callable or a string alias. + + See :ref:`groupby.aggregate.named` for more. + + .. versionchanged:: 1.3.0 + + The resulting dtype will reflect the return value of the aggregating + function. + + >>> df.groupby("A")[["B"]].agg(lambda x: x.astype(float).min()) + B + A + 1 1.0 + 2 3.0 + """ relabeling, func, columns, order = reconstruct_func(func, **kwargs) func = maybe_mangle_lambdas(func) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 38dad446b4c39..9e36837bc679f 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -445,84 +445,6 @@ class providing the base-class of operations. see the examples below. {examples}""" -_agg_template_frame = """ -Aggregate using one or more operations. - -Parameters ----------- -func : function, str, list, dict or None - Function to use for aggregating the data. If a function, must either - work when passed a {klass} or when passed to {klass}.apply. - - Accepted combinations are: - - - function - - string function name - - list of functions and/or function names, e.g. ``[np.sum, 'mean']`` - - dict of index labels -> functions, function names or list of such. - - None, in which case ``**kwargs`` are used with Named Aggregation. Here the - output has one column for each element in ``**kwargs``. The name of the - column is keyword, whereas the value determines the aggregation used to compute - the values in the column. - - Can also accept a Numba JIT function with - ``engine='numba'`` specified. Only passing a single function is supported - with this engine. - - If the ``'numba'`` engine is chosen, the function must be - a user defined function with ``values`` and ``index`` as the - first and second arguments respectively in the function signature. - Each group's index will be passed to the user defined function - and optionally available for use. - -*args - Positional arguments to pass to func. -engine : str, default None - * ``'cython'`` : Runs the function through C-extensions from cython. - * ``'numba'`` : Runs the function through JIT compiled code from numba. - * ``None`` : Defaults to ``'cython'`` or globally setting ``compute.use_numba`` - -engine_kwargs : dict, default None - * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` - * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` - and ``parallel`` dictionary keys. The values must either be ``True`` or - ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is - ``{{'nopython': True, 'nogil': False, 'parallel': False}}`` and will be - applied to the function - -**kwargs - * If ``func`` is None, ``**kwargs`` are used to define the output names and - aggregations via Named Aggregation. See ``func`` entry. - * Otherwise, keyword arguments to be passed into func. - -Returns -------- -{klass} - -See Also --------- -{klass}.groupby.apply : Apply function func group-wise - and combine the results together. -{klass}.groupby.transform : Transforms the Series on each group - based on the given function. -{klass}.aggregate : Aggregate using one or more operations. - -Notes ------ -When using ``engine='numba'``, there will be no "fall back" behavior internally. -The group data and group index will be passed as numpy arrays to the JITed -user defined function, and no alternative execution attempts will be tried. 
- -Functions that mutate the passed object can produce unexpected -behavior or errors and are not supported. See :ref:`gotchas.udf-mutation` -for more details. - -.. versionchanged:: 1.3.0 - - The resulting dtype will reflect the return value of the passed ``func``, - see the examples below. -{examples}""" - @final class GroupByPlot(PandasObject): diff --git a/scripts/validate_unwanted_patterns.py b/scripts/validate_unwanted_patterns.py index 35f6ffb4980df..5962709056ae8 100755 --- a/scripts/validate_unwanted_patterns.py +++ b/scripts/validate_unwanted_patterns.py @@ -30,7 +30,6 @@ "_new_Index", "_new_PeriodIndex", "_agg_template_series", - "_agg_template_frame", "_pipe_template", "_apply_groupings_depr", "__main__", From d538a1cd1ad5d1e506c2dc36144e4cac5534858a Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Sun, 29 Sep 2024 01:08:32 +0530 Subject: [PATCH 089/224] DOC: fix RT03, ES01 for pandas.core.groupby.SeriesGroupBy.agg and pandas.core.groupby.SeriesGroupBy.aggregate (#59898) * DOC: fix RT03, ES01 for pandas.core.groupby.SeriesGroupBy.agg * DOC: remove _agg_template_series Co-authored-by: mroeschke Co-authored-by: rhshadrach * DOC: remove _agg_template_series Co-authored-by: mroeschke Co-authored-by: rhshadrach * DOC: remove _agg_template_seris --------- Co-authored-by: mroeschke Co-authored-by: rhshadrach --- ci/code_checks.sh | 2 - pandas/core/groupby/generic.py | 136 +++++++++++++++++++++++++- pandas/core/groupby/groupby.py | 81 --------------- scripts/validate_unwanted_patterns.py | 1 - 4 files changed, 134 insertions(+), 86 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 669c793737161..b65dcedbd8a10 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -137,8 +137,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.core.groupby.DataFrameGroupBy.plot PR02" \ -i "pandas.core.groupby.DataFrameGroupBy.sem SA01" \ -i "pandas.core.groupby.SeriesGroupBy.__iter__ RT03,SA01" \ - -i "pandas.core.groupby.SeriesGroupBy.agg RT03" \ - -i "pandas.core.groupby.SeriesGroupBy.aggregate RT03" \ -i "pandas.core.groupby.SeriesGroupBy.get_group RT03,SA01" \ -i "pandas.core.groupby.SeriesGroupBy.groups SA01" \ -i "pandas.core.groupby.SeriesGroupBy.indices SA01" \ diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 0c211afb5073c..110c0ea88a0a1 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -67,7 +67,6 @@ from pandas.core.groupby.groupby import ( GroupBy, GroupByPlot, - _agg_template_series, _transform_template, ) from pandas.core.indexes.api import ( @@ -323,8 +322,141 @@ def apply(self, func, *args, **kwargs) -> Series: """ return super().apply(func, *args, **kwargs) - @doc(_agg_template_series, examples=_agg_examples_doc, klass="Series") def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs): + """ + Aggregate using one or more operations. + + The ``aggregate`` method enables flexible and efficient aggregation of grouped + data using a variety of functions, including built-in, user-defined, and + optimized JIT-compiled functions. + + Parameters + ---------- + func : function, str, list, dict or None + Function to use for aggregating the data. If a function, must either + work when passed a Series or when passed to Series.apply. + + Accepted combinations are: + + - function + - string function name + - list of functions and/or function names, e.g. ``[np.sum, 'mean']`` + - None, in which case ``**kwargs`` are used with Named Aggregation. 
Here + the output has one column for each element in ``**kwargs``. The name of + the column is keyword, whereas the value determines the aggregation + used to compute the values in the column. + + Can also accept a Numba JIT function with + ``engine='numba'`` specified. Only passing a single function is supported + with this engine. + + If the ``'numba'`` engine is chosen, the function must be + a user defined function with ``values`` and ``index`` as the + first and second arguments respectively in the function signature. + Each group's index will be passed to the user defined function + and optionally available for use. + + .. deprecated:: 2.1.0 + + Passing a dictionary is deprecated and will raise in a future version + of pandas. Pass a list of aggregations instead. + *args + Positional arguments to pass to func. + engine : str, default None + * ``'cython'`` : Runs the function through C-extensions from cython. + * ``'numba'`` : Runs the function through JIT compiled code from numba. + * ``None`` : Defaults to ``'cython'`` or globally setting + ``compute.use_numba`` + + engine_kwargs : dict, default None + * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` + * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` + and ``parallel`` dictionary keys. The values must either be ``True`` or + ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is + ``{'nopython': True, 'nogil': False, 'parallel': False}`` and will be + applied to the function + + **kwargs + * If ``func`` is None, ``**kwargs`` are used to define the output names and + aggregations via Named Aggregation. See ``func`` entry. + * Otherwise, keyword arguments to be passed into func. + + Returns + ------- + Series + Aggregated Series based on the grouping and the applied aggregation + functions. + + See Also + -------- + SeriesGroupBy.apply : Apply function func group-wise + and combine the results together. + SeriesGroupBy.transform : Transforms the Series on each group + based on the given function. + Series.aggregate : Aggregate using one or more operations. + + Notes + ----- + When using ``engine='numba'``, there will be no "fall back" behavior internally. + The group data and group index will be passed as numpy arrays to the JITed + user defined function, and no alternative execution attempts will be tried. + + Functions that mutate the passed object can produce unexpected + behavior or errors and are not supported. See :ref:`gotchas.udf-mutation` + for more details. + + .. versionchanged:: 1.3.0 + + The resulting dtype will reflect the return value of the passed ``func``, + see the examples below. + + Examples + -------- + >>> s = pd.Series([1, 2, 3, 4]) + + >>> s + 0 1 + 1 2 + 2 3 + 3 4 + dtype: int64 + + >>> s.groupby([1, 1, 2, 2]).min() + 1 1 + 2 3 + dtype: int64 + + >>> s.groupby([1, 1, 2, 2]).agg("min") + 1 1 + 2 3 + dtype: int64 + + >>> s.groupby([1, 1, 2, 2]).agg(["min", "max"]) + min max + 1 1 2 + 2 3 4 + + The output column names can be controlled by passing + the desired column names and aggregations as keyword arguments. + + >>> s.groupby([1, 1, 2, 2]).agg( + ... minimum="min", + ... maximum="max", + ... ) + minimum maximum + 1 1 2 + 2 3 4 + + .. versionchanged:: 1.3.0 + + The resulting dtype will reflect the return value of the aggregating + function. 
+ + >>> s.groupby([1, 1, 2, 2]).agg(lambda x: x.astype(float).min()) + 1 1.0 + 2 3.0 + dtype: float64 + """ relabeling = func is None columns = None if relabeling: diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 9e36837bc679f..e2410788ea95e 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -364,87 +364,6 @@ class providing the base-class of operations. -------- %(example)s""" -_agg_template_series = """ -Aggregate using one or more operations. - -Parameters ----------- -func : function, str, list, dict or None - Function to use for aggregating the data. If a function, must either - work when passed a {klass} or when passed to {klass}.apply. - - Accepted combinations are: - - - function - - string function name - - list of functions and/or function names, e.g. ``[np.sum, 'mean']`` - - None, in which case ``**kwargs`` are used with Named Aggregation. Here the - output has one column for each element in ``**kwargs``. The name of the - column is keyword, whereas the value determines the aggregation used to compute - the values in the column. - - Can also accept a Numba JIT function with - ``engine='numba'`` specified. Only passing a single function is supported - with this engine. - - If the ``'numba'`` engine is chosen, the function must be - a user defined function with ``values`` and ``index`` as the - first and second arguments respectively in the function signature. - Each group's index will be passed to the user defined function - and optionally available for use. - - .. deprecated:: 2.1.0 - - Passing a dictionary is deprecated and will raise in a future version - of pandas. Pass a list of aggregations instead. -*args - Positional arguments to pass to func. -engine : str, default None - * ``'cython'`` : Runs the function through C-extensions from cython. - * ``'numba'`` : Runs the function through JIT compiled code from numba. - * ``None`` : Defaults to ``'cython'`` or globally setting ``compute.use_numba`` - -engine_kwargs : dict, default None - * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` - * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` - and ``parallel`` dictionary keys. The values must either be ``True`` or - ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is - ``{{'nopython': True, 'nogil': False, 'parallel': False}}`` and will be - applied to the function - -**kwargs - * If ``func`` is None, ``**kwargs`` are used to define the output names and - aggregations via Named Aggregation. See ``func`` entry. - * Otherwise, keyword arguments to be passed into func. - -Returns -------- -{klass} - -See Also --------- -{klass}GroupBy.apply : Apply function func group-wise - and combine the results together. -{klass}GroupBy.transform : Transforms the Series on each group - based on the given function. -{klass}.aggregate : Aggregate using one or more operations. - -Notes ------ -When using ``engine='numba'``, there will be no "fall back" behavior internally. -The group data and group index will be passed as numpy arrays to the JITed -user defined function, and no alternative execution attempts will be tried. - -Functions that mutate the passed object can produce unexpected -behavior or errors and are not supported. See :ref:`gotchas.udf-mutation` -for more details. - -.. versionchanged:: 1.3.0 - - The resulting dtype will reflect the return value of the passed ``func``, - see the examples below. 
-{examples}""" - @final class GroupByPlot(PandasObject): diff --git a/scripts/validate_unwanted_patterns.py b/scripts/validate_unwanted_patterns.py index 5962709056ae8..076acc359f933 100755 --- a/scripts/validate_unwanted_patterns.py +++ b/scripts/validate_unwanted_patterns.py @@ -29,7 +29,6 @@ "_shared_docs", "_new_Index", "_new_PeriodIndex", - "_agg_template_series", "_pipe_template", "_apply_groupings_depr", "__main__", From 34f546f8e73386659457fec0b3fa1ef5b0c6d569 Mon Sep 17 00:00:00 2001 From: Deepak Saldanha Date: Sun, 29 Sep 2024 23:05:45 +0530 Subject: [PATCH 090/224] DOC: fix docstrings for multiple api.types methods (#59920) fix docstrings for api.types --- ci/code_checks.sh | 5 --- pandas/core/dtypes/inference.py | 63 ++++++++++++++++++++++++++++++--- 2 files changed, 59 insertions(+), 9 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index b65dcedbd8a10..2b3e83d64ab21 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -107,14 +107,9 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Timestamp.resolution PR02" \ -i "pandas.Timestamp.tzinfo GL08" \ -i "pandas.Timestamp.year GL08" \ - -i "pandas.api.types.is_dict_like PR07,SA01" \ - -i "pandas.api.types.is_file_like PR07,SA01" \ -i "pandas.api.types.is_float PR01,SA01" \ - -i "pandas.api.types.is_hashable PR01,RT03,SA01" \ -i "pandas.api.types.is_integer PR01,SA01" \ -i "pandas.api.types.is_iterator PR07,SA01" \ - -i "pandas.api.types.is_named_tuple PR07,SA01" \ - -i "pandas.api.types.is_re PR07,SA01" \ -i "pandas.api.types.is_re_compilable PR07,SA01" \ -i "pandas.api.types.pandas_dtype PR07,RT03,SA01" \ -i "pandas.arrays.ArrowExtensionArray PR07,SA01" \ diff --git a/pandas/core/dtypes/inference.py b/pandas/core/dtypes/inference.py index f042911b53d2b..6adb34ff0f777 100644 --- a/pandas/core/dtypes/inference.py +++ b/pandas/core/dtypes/inference.py @@ -113,13 +113,24 @@ def is_file_like(obj: object) -> bool: Parameters ---------- - obj : The object to check + obj : object + The object to check for file-like properties. + This can be any Python object, and the function will + check if it has attributes typically associated with + file-like objects (e.g., `read`, `write`, `__iter__`). Returns ------- bool Whether `obj` has file-like properties. + See Also + -------- + api.types.is_dict_like : Check if the object is dict-like. + api.types.is_hashable : Return True if hash(obj) will succeed, False otherwise. + api.types.is_named_tuple : Check if the object is a named tuple. + api.types.is_iterator : Check if the object is an iterator. + Examples -------- >>> import io @@ -142,13 +153,24 @@ def is_re(obj: object) -> TypeGuard[Pattern]: Parameters ---------- - obj : The object to check + obj : object + The object to check for being a regex pattern. Typically, + this would be an object that you expect to be a compiled + pattern from the `re` module. Returns ------- bool Whether `obj` is a regex pattern. + See Also + -------- + api.types.is_float : Return True if given object is float. + api.types.is_iterator : Check if the object is an iterator. + api.types.is_integer : Return True if given object is integer. + api.types.is_re_compilable : Check if the object can be compiled + into a regex pattern instance. + Examples -------- >>> from pandas.api.types import is_re @@ -275,13 +297,22 @@ def is_dict_like(obj: object) -> bool: Parameters ---------- - obj : The object to check + obj : object + The object to check. 
This can be any Python object, + and the function will determine whether it + behaves like a dictionary. Returns ------- bool Whether `obj` has dict-like properties. + See Also + -------- + api.types.is_list_like : Check if the object is list-like. + api.types.is_file_like : Check if the object is a file-like. + api.types.is_named_tuple : Check if the object is a named tuple. + Examples -------- >>> from pandas.api.types import is_dict_like @@ -308,13 +339,22 @@ def is_named_tuple(obj: object) -> bool: Parameters ---------- - obj : The object to check + obj : object + The object that will be checked to determine + whether it is a named tuple. Returns ------- bool Whether `obj` is a named tuple. + See Also + -------- + api.types.is_dict_like: Check if the object is dict-like. + api.types.is_hashable: Return True if hash(obj) + will succeed, False otherwise. + api.types.is_categorical_dtype : Check if the dtype is categorical. + Examples -------- >>> from collections import namedtuple @@ -340,9 +380,24 @@ def is_hashable(obj: object) -> TypeGuard[Hashable]: Distinguish between these and other types by trying the call to hash() and seeing if they raise TypeError. + Parameters + ---------- + obj : object + The object to check for hashability. Any Python object can be passed here. + Returns ------- bool + True if object can be hashed (i.e., does not raise TypeError when + passed to hash()), and False otherwise (e.g., if object is mutable + like a list or dictionary). + + See Also + -------- + api.types.is_float : Return True if given object is float. + api.types.is_iterator : Check if the object is an iterator. + api.types.is_list_like : Check if the object is list-like. + api.types.is_dict_like : Check if the object is dict-like. Examples -------- From 5b35c77041a74b53ebd7c330ca5930fa22929726 Mon Sep 17 00:00:00 2001 From: gameofby Date: Mon, 30 Sep 2024 01:36:34 +0800 Subject: [PATCH 091/224] DOC: the table name should be `air_quality_parameters` rather than `air_quality_parameters_name` (#59918) --- .../getting_started/intro_tutorials/08_combine_dataframes.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/getting_started/intro_tutorials/08_combine_dataframes.rst b/doc/source/getting_started/intro_tutorials/08_combine_dataframes.rst index 05729809491b5..024300bb8a9b0 100644 --- a/doc/source/getting_started/intro_tutorials/08_combine_dataframes.rst +++ b/doc/source/getting_started/intro_tutorials/08_combine_dataframes.rst @@ -271,7 +271,7 @@ Add the parameters' full description and name, provided by the parameters metada Compared to the previous example, there is no common column name. However, the ``parameter`` column in the ``air_quality`` table and the -``id`` column in the ``air_quality_parameters_name`` both provide the +``id`` column in the ``air_quality_parameters`` table both provide the measured variable in a common format. The ``left_on`` and ``right_on`` arguments are used here (instead of just ``on``) to make the link between the two tables. 
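As a usage reference for the merge pattern the tutorial fix above describes, here is a
minimal sketch of ``left_on``/``right_on``. The two toy frames are illustrative
stand-ins for the tutorial's ``air_quality`` and ``air_quality_parameters`` tables,
not data shipped with pandas:

.. code-block:: python

    import pandas as pd

    # No column name is shared between the two tables: the measured variable
    # is called "parameter" on the left and "id" on the right.
    air_quality = pd.DataFrame({"parameter": ["no2", "pm25"], "value": [23.0, 6.5]})
    air_quality_parameters = pd.DataFrame(
        {"id": ["no2", "pm25"], "name": ["Nitrogen dioxide", "Particulate matter 2.5"]}
    )

    # left_on/right_on make the link explicit when the key columns differ in name.
    merged = air_quality.merge(
        air_quality_parameters, how="left", left_on="parameter", right_on="id"
    )
    print(merged.columns.tolist())  # ['parameter', 'value', 'id', 'name']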
From 90c26ce7ce04d97fdabb394e604ecee0a558c019 Mon Sep 17 00:00:00 2001 From: Deepak Saldanha Date: Mon, 30 Sep 2024 00:25:17 +0530 Subject: [PATCH 092/224] DOC: Separate out examples - pandas.str.is methods (#59850) --- pandas/core/strings/accessor.py | 193 ++++++++++++++++++++++++++------ 1 file changed, 156 insertions(+), 37 deletions(-) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 6d10365a1b968..10117aa6bf503 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -3443,10 +3443,10 @@ def casefold(self): Series or Index of bool Series or Index of boolean values with the same length as the original Series/Index. - + """ + _shared_docs["isalpha"] = """ See Also -------- - Series.str.isalpha : Check whether all characters are alphabetic. Series.str.isnumeric : Check whether all characters are numeric. Series.str.isalnum : Check whether all characters are alphanumeric. Series.str.isdigit : Check whether all characters are digits. @@ -3458,24 +3458,56 @@ def casefold(self): Examples -------- - **Checks for Alphabetic and Numeric Characters** >>> s1 = pd.Series(['one', 'one1', '1', '']) - >>> s1.str.isalpha() 0 True 1 False 2 False 3 False dtype: bool + """ + _shared_docs["isnumeric"] = """ + See Also + -------- + Series.str.isalpha : Check whether all characters are alphabetic. + Series.str.isalnum : Check whether all characters are alphanumeric. + Series.str.isdigit : Check whether all characters are digits. + Series.str.isdecimal : Check whether all characters are decimal. + Series.str.isspace : Check whether all characters are whitespace. + Series.str.islower : Check whether all characters are lowercase. + Series.str.isupper : Check whether all characters are uppercase. + Series.str.istitle : Check whether all characters are titlecase. + + Examples + -------- + The ``s.str.isnumeric`` method is the same as ``s3.str.isdigit`` but + also includes other characters that can represent quantities such as + unicode fractions. + >>> s1 = pd.Series(['one', 'one1', '1', '']) >>> s1.str.isnumeric() 0 False 1 False 2 True 3 False dtype: bool + """ + _shared_docs["isalnum"] = """ + See Also + -------- + Series.str.isalpha : Check whether all characters are alphabetic. + Series.str.isnumeric : Check whether all characters are numeric. + Series.str.isdigit : Check whether all characters are digits. + Series.str.isdecimal : Check whether all characters are decimal. + Series.str.isspace : Check whether all characters are whitespace. + Series.str.islower : Check whether all characters are lowercase. + Series.str.isupper : Check whether all characters are uppercase. + Series.str.istitle : Check whether all characters are titlecase. + Examples + -------- + >>> s1 = pd.Series(['one', 'one1', '1', '']) >>> s1.str.isalnum() 0 True 1 True @@ -3492,47 +3524,72 @@ def casefold(self): 1 False 2 False dtype: bool + """ + _shared_docs["isdecimal"] = """ + See Also + -------- + Series.str.isalpha : Check whether all characters are alphabetic. + Series.str.isnumeric : Check whether all characters are numeric. + Series.str.isalnum : Check whether all characters are alphanumeric. + Series.str.isdigit : Check whether all characters are digits. + Series.str.isspace : Check whether all characters are whitespace. + Series.str.islower : Check whether all characters are lowercase. + Series.str.isupper : Check whether all characters are uppercase. + Series.str.istitle : Check whether all characters are titlecase. 
- **More Detailed Checks for Numeric Characters** - - There are several different but overlapping sets of numeric characters that - can be checked for. + Examples + -------- + The ``s3.str.isdecimal`` method checks for characters used to form + numbers in base 10. >>> s3 = pd.Series(['23', '³', '⅕', '']) - - The ``s3.str.isdecimal`` method checks for characters used to form numbers - in base 10. - >>> s3.str.isdecimal() 0 True 1 False 2 False 3 False dtype: bool + """ + _shared_docs["isdigit"] = """ + See Also + -------- + Series.str.isalpha : Check whether all characters are alphabetic. + Series.str.isnumeric : Check whether all characters are numeric. + Series.str.isalnum : Check whether all characters are alphanumeric. + Series.str.isdecimal : Check whether all characters are decimal. + Series.str.isspace : Check whether all characters are whitespace. + Series.str.islower : Check whether all characters are lowercase. + Series.str.isupper : Check whether all characters are uppercase. + Series.str.istitle : Check whether all characters are titlecase. - The ``s.str.isdigit`` method is the same as ``s3.str.isdecimal`` but also - includes special digits, like superscripted and subscripted digits in - unicode. + Examples + -------- + Similar to ``str.isdecimal`` but also includes special digits, like + superscripted and subscripted digits in unicode. + >>> s3 = pd.Series(['23', '³', '⅕', '']) >>> s3.str.isdigit() 0 True 1 True 2 False 3 False dtype: bool + """ - The ``s.str.isnumeric`` method is the same as ``s3.str.isdigit`` but also - includes other characters that can represent quantities such as unicode - fractions. - - >>> s3.str.isnumeric() - 0 True - 1 True - 2 True - 3 False - dtype: bool + _shared_docs["isspace"] = """ + See Also + -------- + Series.str.isalpha : Check whether all characters are alphabetic. + Series.str.isnumeric : Check whether all characters are numeric. + Series.str.isalnum : Check whether all characters are alphanumeric. + Series.str.isdigit : Check whether all characters are digits. + Series.str.isdecimal : Check whether all characters are decimal. + Series.str.islower : Check whether all characters are lowercase. + Series.str.isupper : Check whether all characters are uppercase. + Series.str.istitle : Check whether all characters are titlecase. - **Checks for Whitespace** + Examples + -------- >>> s4 = pd.Series([' ', '\\t\\r\\n ', '']) >>> s4.str.isspace() @@ -3540,30 +3597,74 @@ def casefold(self): 1 True 2 False dtype: bool + """ + _shared_docs["islower"] = """ + See Also + -------- + Series.str.isalpha : Check whether all characters are alphabetic. + Series.str.isnumeric : Check whether all characters are numeric. + Series.str.isalnum : Check whether all characters are alphanumeric. + Series.str.isdigit : Check whether all characters are digits. + Series.str.isdecimal : Check whether all characters are decimal. + Series.str.isspace : Check whether all characters are whitespace. + Series.str.isupper : Check whether all characters are uppercase. + Series.str.istitle : Check whether all characters are titlecase. - **Checks for Character Case** + Examples + -------- >>> s5 = pd.Series(['leopard', 'Golden Eagle', 'SNAKE', '']) - >>> s5.str.islower() 0 True 1 False 2 False 3 False dtype: bool + """ + + _shared_docs["isupper"] = """ + See Also + -------- + Series.str.isalpha : Check whether all characters are alphabetic. + Series.str.isnumeric : Check whether all characters are numeric. + Series.str.isalnum : Check whether all characters are alphanumeric. 
+ Series.str.isdigit : Check whether all characters are digits. + Series.str.isdecimal : Check whether all characters are decimal. + Series.str.isspace : Check whether all characters are whitespace. + Series.str.islower : Check whether all characters are lowercase. + Series.str.istitle : Check whether all characters are titlecase. + Examples + -------- + + >>> s5 = pd.Series(['leopard', 'Golden Eagle', 'SNAKE', '']) >>> s5.str.isupper() 0 False 1 False 2 True 3 False dtype: bool + """ + _shared_docs["istitle"] = """ + See Also + -------- + Series.str.isalpha : Check whether all characters are alphabetic. + Series.str.isnumeric : Check whether all characters are numeric. + Series.str.isalnum : Check whether all characters are alphanumeric. + Series.str.isdigit : Check whether all characters are digits. + Series.str.isdecimal : Check whether all characters are decimal. + Series.str.isspace : Check whether all characters are whitespace. + Series.str.islower : Check whether all characters are lowercase. + Series.str.isupper : Check whether all characters are uppercase. + Examples + ------------ The ``s5.str.istitle`` method checks for whether all words are in title case (whether only the first letter of each word is capitalized). Words are assumed to be as any sequence of non-numeric characters separated by whitespace characters. + >>> s5 = pd.Series(['leopard', 'Golden Eagle', 'SNAKE', '']) >>> s5.str.istitle() 0 False 1 True @@ -3583,31 +3684,49 @@ def casefold(self): # force _noarg_wrapper return type with dtype=np.dtype(bool) (GH 29624) isalnum = _map_and_wrap( - "isalnum", docstring=_shared_docs["ismethods"] % _doc_args["isalnum"] + "isalnum", + docstring=_shared_docs["ismethods"] % _doc_args["isalnum"] + + _shared_docs["isalnum"], ) isalpha = _map_and_wrap( - "isalpha", docstring=_shared_docs["ismethods"] % _doc_args["isalpha"] + "isalpha", + docstring=_shared_docs["ismethods"] % _doc_args["isalpha"] + + _shared_docs["isalpha"], ) isdigit = _map_and_wrap( - "isdigit", docstring=_shared_docs["ismethods"] % _doc_args["isdigit"] + "isdigit", + docstring=_shared_docs["ismethods"] % _doc_args["isdigit"] + + _shared_docs["isdigit"], ) isspace = _map_and_wrap( - "isspace", docstring=_shared_docs["ismethods"] % _doc_args["isspace"] + "isspace", + docstring=_shared_docs["ismethods"] % _doc_args["isspace"] + + _shared_docs["isspace"], ) islower = _map_and_wrap( - "islower", docstring=_shared_docs["ismethods"] % _doc_args["islower"] + "islower", + docstring=_shared_docs["ismethods"] % _doc_args["islower"] + + _shared_docs["islower"], ) isupper = _map_and_wrap( - "isupper", docstring=_shared_docs["ismethods"] % _doc_args["isupper"] + "isupper", + docstring=_shared_docs["ismethods"] % _doc_args["isupper"] + + _shared_docs["isupper"], ) istitle = _map_and_wrap( - "istitle", docstring=_shared_docs["ismethods"] % _doc_args["istitle"] + "istitle", + docstring=_shared_docs["ismethods"] % _doc_args["istitle"] + + _shared_docs["istitle"], ) isnumeric = _map_and_wrap( - "isnumeric", docstring=_shared_docs["ismethods"] % _doc_args["isnumeric"] + "isnumeric", + docstring=_shared_docs["ismethods"] % _doc_args["isnumeric"] + + _shared_docs["isnumeric"], ) isdecimal = _map_and_wrap( - "isdecimal", docstring=_shared_docs["ismethods"] % _doc_args["isdecimal"] + "isdecimal", + docstring=_shared_docs["ismethods"] % _doc_args["isdecimal"] + + _shared_docs["isdecimal"], ) From d66d5823607ecf4c6d1f8eac9ae679863218f2ba Mon Sep 17 00:00:00 2001 From: Deepak Saldanha Date: Mon, 30 Sep 2024 21:29:09 +0530 Subject: [PATCH 
093/224] DOC: fix pandas.TimedeltaIndex.to_pytimedelta RT03,SA01 (#59914)

* update docstrings

* Update pandas/core/arrays/timedeltas.py

Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>

---------

Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
---
 ci/code_checks.sh                |  1 -
 pandas/core/arrays/timedeltas.py | 21 +++++++++++++++++++++
 2 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/ci/code_checks.sh b/ci/code_checks.sh
index 2b3e83d64ab21..fa23adca6d61e 100755
--- a/ci/code_checks.sh
+++ b/ci/code_checks.sh
@@ -100,7 +100,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         -i "pandas.Timedelta.max PR02" \
         -i "pandas.Timedelta.min PR02" \
         -i "pandas.Timedelta.resolution PR02" \
-        -i "pandas.TimedeltaIndex.to_pytimedelta RT03,SA01" \
         -i "pandas.Timestamp.max PR02" \
         -i "pandas.Timestamp.min PR02" \
         -i "pandas.Timestamp.nanosecond GL08" \

diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py
index 754ae277e359a..a8a0037d0bbb9 100644
--- a/pandas/core/arrays/timedeltas.py
+++ b/pandas/core/arrays/timedeltas.py
@@ -790,6 +790,19 @@ def to_pytimedelta(self) -> npt.NDArray[np.object_]:

         Returns
         -------
         numpy.ndarray
+            A NumPy array of ``datetime.timedelta`` objects, each representing
+            the same duration as the corresponding element of the original
+            ``TimedeltaIndex``. Because ``datetime.timedelta`` stores durations
+            with at most microsecond resolution, any sub-microsecond component
+            of the original nanosecond-resolution data is lost.
+
+        See Also
+        --------
+        to_timedelta : Convert argument to timedelta format.
+        Timedelta : Represents a duration between two dates or times.
+        DatetimeIndex : Index of datetime64 data.
+        Timedelta.components : Return a components namedtuple-like
+            of a single timedelta.
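Because the values come back as ``datetime.timedelta`` objects, which store at most microseconds, nanosecond detail does not survive the conversion. A small sketch of that caveat (expected output on a current pandas build):

    >>> import pandas as pd
    >>> tdi = pd.to_timedelta(["1 days 00:00:00.000000001"])
    >>> tdi.to_pytimedelta()
    array([datetime.timedelta(days=1)], dtype=object)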
Examples -------- @@ -800,6 +813,14 @@ def to_pytimedelta(self) -> npt.NDArray[np.object_]: >>> tdelta_idx.to_pytimedelta() array([datetime.timedelta(days=1), datetime.timedelta(days=2), datetime.timedelta(days=3)], dtype=object) + + >>> tidx = pd.TimedeltaIndex(data=["1 days 02:30:45", "3 days 04:15:10"]) + >>> tidx + TimedeltaIndex(['1 days 02:30:45', '3 days 04:15:10'], + dtype='timedelta64[ns]', freq=None) + >>> tidx.to_pytimedelta() + array([datetime.timedelta(days=1, seconds=9045), + datetime.timedelta(days=3, seconds=15310)], dtype=object) """ return ints_to_pytimedelta(self._ndarray) From 111ff84bb958cc7b13a060d9539f83b67ced8f02 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Mon, 30 Sep 2024 21:29:49 +0530 Subject: [PATCH 094/224] DOC: fix SA01, ES01 for pandas.errors.ClosedFileError (#59924) --- ci/code_checks.sh | 1 - pandas/errors/__init__.py | 10 ++++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index fa23adca6d61e..42eedfe8e223b 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -159,7 +159,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.errors.CSSWarning SA01" \ -i "pandas.errors.CategoricalConversionWarning SA01" \ -i "pandas.errors.ChainedAssignmentError SA01" \ - -i "pandas.errors.ClosedFileError SA01" \ -i "pandas.errors.DataError SA01" \ -i "pandas.errors.DuplicateLabelError SA01" \ -i "pandas.errors.IntCastingNaNError SA01" \ diff --git a/pandas/errors/__init__.py b/pandas/errors/__init__.py index b9ceae341afd3..46e090cc3a589 100644 --- a/pandas/errors/__init__.py +++ b/pandas/errors/__init__.py @@ -615,6 +615,16 @@ class ClosedFileError(Exception): """ Exception is raised when trying to perform an operation on a closed HDFStore file. + ``ClosedFileError`` is specific to operations on ``HDFStore`` objects. Once an + HDFStore is closed, its resources are no longer available, and any further attempt + to access data or perform file operations will raise this exception. + + See Also + -------- + HDFStore.close : Closes the PyTables file handle. + HDFStore.open : Opens the file in the specified mode. + HDFStore.is_open : Returns a boolean indicating whether the file is open. + Examples -------- >>> store = pd.HDFStore("my-store", "a") # doctest: +SKIP From 1baec153e72f98e7184e972f1e937626703e42a6 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Mon, 30 Sep 2024 21:30:32 +0530 Subject: [PATCH 095/224] DOC: fix SA01, ES01 for pandas.errors.OutOfBoundsDatetime (#59925) --- ci/code_checks.sh | 1 - pandas/_libs/tslibs/np_datetime.pyx | 9 +++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 42eedfe8e223b..4a1a0042405e3 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -168,7 +168,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.errors.NumExprClobberingError SA01" \ -i "pandas.errors.NumbaUtilError SA01" \ -i "pandas.errors.OptionError SA01" \ - -i "pandas.errors.OutOfBoundsDatetime SA01" \ -i "pandas.errors.OutOfBoundsTimedelta SA01" \ -i "pandas.errors.PerformanceWarning SA01" \ -i "pandas.errors.PossibleDataLossError SA01" \ diff --git a/pandas/_libs/tslibs/np_datetime.pyx b/pandas/_libs/tslibs/np_datetime.pyx index 0b02fc13246f0..193556b2697a9 100644 --- a/pandas/_libs/tslibs/np_datetime.pyx +++ b/pandas/_libs/tslibs/np_datetime.pyx @@ -176,6 +176,15 @@ class OutOfBoundsDatetime(ValueError): """ Raised when the datetime is outside the range that can be represented. 
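For the default nanosecond resolution, the bounds behind "outside the range that can be represented" can be inspected directly; a quick reference check:

    >>> import pandas as pd
    >>> pd.Timestamp.min
    Timestamp('1677-09-21 00:12:43.145224193')
    >>> pd.Timestamp.max
    Timestamp('2262-04-11 23:47:16.854775807')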
+ This error occurs when attempting to convert or parse a datetime value + that exceeds the bounds supported by pandas' internal datetime + representation. + + See Also + -------- + to_datetime : Convert argument to datetime. + Timestamp : Pandas replacement for python ``datetime.datetime`` object. + Examples -------- >>> pd.to_datetime("08335394550") From 74d36ac1c1fe7e735f5b7392cb9dd1bff57f729b Mon Sep 17 00:00:00 2001 From: Florian Bourgey Date: Mon, 30 Sep 2024 12:02:41 -0400 Subject: [PATCH 096/224] Fix docstring Timedelta.to_timedelta64 SA01, Timedelta.total_seconds SA01, Timedelta.view SA01 (#59719) * Add 'See Also' section for Timedelta.to_timedelta64 * Fix SA01 for Timedelta.total_seconds() * Fix SA01 for Timedelta.view * Add space * Fix test_nat_doc_strings * Revert "Fix test_nat_doc_strings" This reverts commit 9d0965805daa2dbd02eaa1878858cfb0eb97df02. * Match doc of total_seconds method in nattype.pyx --- pandas/_libs/tslibs/nattype.pyx | 2 ++ pandas/_libs/tslibs/timedeltas.pyx | 3 +++ 2 files changed, 5 insertions(+) diff --git a/pandas/_libs/tslibs/nattype.pyx b/pandas/_libs/tslibs/nattype.pyx index 620e0846c750e..1c0a99eb1ea25 100644 --- a/pandas/_libs/tslibs/nattype.pyx +++ b/pandas/_libs/tslibs/nattype.pyx @@ -500,6 +500,8 @@ class NaTType(_NaT): -------- to_timedelta : Convert argument to timedelta. Timedelta : Represents a duration, the difference between two dates or times. + Timedelta.seconds : Returns the seconds component of the timedelta. + Timedelta.microseconds : Returns the microseconds component of the timedelta. Examples -------- diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index 84ca48c96459f..bbefea7c47fc3 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -1196,6 +1196,8 @@ cdef class _Timedelta(timedelta): -------- to_timedelta : Convert argument to timedelta. Timedelta : Represents a duration, the difference between two dates or times. + Timedelta.seconds : Returns the seconds component of the timedelta. + Timedelta.microseconds : Returns the microseconds component of the timedelta. Examples -------- @@ -1493,6 +1495,7 @@ cdef class _Timedelta(timedelta): See Also -------- + Timedelta.asm8 : Return a numpy timedelta64 array scalar view. numpy.ndarray.view : Returns a view of an array with the same data. Timedelta.to_numpy : Converts the Timedelta to a NumPy timedelta64. Timedelta.total_seconds : Returns the total duration of the Timedelta From 00855f81bd84cc6ed9ae42c5f66916b2208dbe04 Mon Sep 17 00:00:00 2001 From: Qaiser Abbasi <3501767+bbq2100@users.noreply.github.com> Date: Mon, 30 Sep 2024 18:05:22 +0200 Subject: [PATCH 097/224] Fix typo in 10min.rst (#59921) --- doc/source/user_guide/10min.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/user_guide/10min.rst b/doc/source/user_guide/10min.rst index 887ffd5580a52..72bb93d21a99f 100644 --- a/doc/source/user_guide/10min.rst +++ b/doc/source/user_guide/10min.rst @@ -177,7 +177,7 @@ See the indexing documentation :ref:`Indexing and Selecting Data ` and Getitem (``[]``) ~~~~~~~~~~~~~~~~ -For a :class:`DataFrame`, passing a single label selects a columns and +For a :class:`DataFrame`, passing a single label selects a column and yields a :class:`Series` equivalent to ``df.A``: .. 
ipython:: python From cf480366a6bd9979124b91dd894230cbb510ca4d Mon Sep 17 00:00:00 2001 From: Marc Mueller <30130371+cdce8p@users.noreply.github.com> Date: Mon, 30 Sep 2024 22:47:40 +0200 Subject: [PATCH 098/224] BLD: Fix armv7 build (#59906) --- pandas/_libs/src/vendored/ujson/python/JSONtoObj.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/src/vendored/ujson/python/JSONtoObj.c b/pandas/_libs/src/vendored/ujson/python/JSONtoObj.c index 7cc20a52f1849..4cfead8ac77a5 100644 --- a/pandas/_libs/src/vendored/ujson/python/JSONtoObj.c +++ b/pandas/_libs/src/vendored/ujson/python/JSONtoObj.c @@ -38,9 +38,11 @@ Numeric decoder derived from TCL library // Licence at LICENSES/ULTRAJSON_LICENSE -#include "pandas/vendored/ujson/lib/ultrajson.h" +// clang-format off #define PY_SSIZE_T_CLEAN #include +#include "pandas/vendored/ujson/lib/ultrajson.h" +// clang-format on static int Object_objectAddKey(void *Py_UNUSED(prv), JSOBJ obj, JSOBJ name, JSOBJ value) { From e78ebd3f845c086af1d71c0604701ec49df97228 Mon Sep 17 00:00:00 2001 From: Florian Bourgey Date: Mon, 30 Sep 2024 17:50:16 -0400 Subject: [PATCH 099/224] DOC: Fix intro to datastructures Series constructor behavior (#59793) --- doc/source/user_guide/dsintro.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/doc/source/user_guide/dsintro.rst b/doc/source/user_guide/dsintro.rst index 9757a72f13fa8..b9c285ca30c96 100644 --- a/doc/source/user_guide/dsintro.rst +++ b/doc/source/user_guide/dsintro.rst @@ -87,8 +87,9 @@ index will be pulled out. **From scalar value** -If ``data`` is a scalar value, an index must be -provided. The value will be repeated to match the length of **index**. +If ``data`` is a scalar value, the value will be repeated to match +the length of **index**. If the **index** is not provided, it defaults +to ``RangeIndex(1)``. .. ipython:: python From f598670353311a6fff4e6e1e96074ccf0737e6b7 Mon Sep 17 00:00:00 2001 From: Petroncini <59212480+Petroncini@users.noreply.github.com> Date: Tue, 1 Oct 2024 17:33:42 -0300 Subject: [PATCH 100/224] BUG: groupby().any() returns true for groups with timedelta all NaT (#59782) --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/groupby/ops.py | 8 +++++--- pandas/tests/groupby/test_grouping.py | 12 ++++++++++++ 3 files changed, 18 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 41ba80989a0ce..6ebb51cd3ef89 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -652,6 +652,7 @@ Plotting Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ - Bug in :meth:`.DataFrameGroupBy.__len__` and :meth:`.SeriesGroupBy.__len__` would raise when the grouping contained NA values and ``dropna=False`` (:issue:`58644`) +- Bug in :meth:`.DataFrameGroupBy.any` that returned True for groups where all Timedelta values are NaT. (:issue:`59712`) - Bug in :meth:`.DataFrameGroupBy.groups` and :meth:`.SeriesGroupby.groups` that would not respect groupby argument ``dropna`` (:issue:`55919`) - Bug in :meth:`.DataFrameGroupBy.median` where nat values gave an incorrect result. 
(:issue:`57926`) - Bug in :meth:`.DataFrameGroupBy.quantile` when ``interpolation="nearest"`` is inconsistent with :meth:`DataFrame.quantile` (:issue:`47942`) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index da80969b613cd..0e99178642715 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -371,6 +371,10 @@ def _call_cython_op( is_datetimelike = dtype.kind in "mM" + if self.how in ["any", "all"]: + if mask is None: + mask = isna(values) + if is_datetimelike: values = values.view("int64") is_numeric = True @@ -380,12 +384,10 @@ def _call_cython_op( values = values.astype(np.float32) if self.how in ["any", "all"]: - if mask is None: - mask = isna(values) if dtype == object: if kwargs["skipna"]: # GH#37501: don't raise on pd.NA when skipna=True - if mask.any(): + if mask is not None and mask.any(): # mask on original values computed separately values = values.copy() values[mask] = True diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index fc2a8a970010a..6bb2eaf89b5d7 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -1180,3 +1180,15 @@ def test_grouping_by_key_is_in_axis(): result = gb.sum() expected = DataFrame({"a": [1, 2], "b": [1, 2], "c": [7, 5]}) tm.assert_frame_equal(result, expected) + + +def test_groupby_any_with_timedelta(): + # GH#59712 + df = DataFrame({"value": [pd.Timedelta(1), pd.NaT]}) + + result = df.groupby(np.array([0, 1], dtype=np.int64))["value"].any() + + expected = Series({0: True, 1: False}, name="value", dtype=bool) + expected.index = expected.index.astype(np.int64) + + tm.assert_series_equal(result, expected) From f738d9754ff3eb9b92fef9f294e4bd3699191903 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 2 Oct 2024 12:57:25 +0200 Subject: [PATCH 101/224] CI: Run jobs on 2.3.x branch (#59939) --- .github/workflows/code-checks.yml | 4 ++-- .github/workflows/docbuild-and-upload.yml | 4 ++-- .github/workflows/package-checks.yml | 4 ++-- .github/workflows/unit-tests.yml | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml index 7e9c056e75131..e1d2d1ea846b8 100644 --- a/.github/workflows/code-checks.yml +++ b/.github/workflows/code-checks.yml @@ -4,11 +4,11 @@ on: push: branches: - main - - 2.2.x + - 2.3.x pull_request: branches: - main - - 2.2.x + - 2.3.x env: ENV_FILE: environment.yml diff --git a/.github/workflows/docbuild-and-upload.yml b/.github/workflows/docbuild-and-upload.yml index 47b97fa57852a..908baa87815ab 100644 --- a/.github/workflows/docbuild-and-upload.yml +++ b/.github/workflows/docbuild-and-upload.yml @@ -4,13 +4,13 @@ on: push: branches: - main - - 2.2.x + - 2.3.x tags: - '*' pull_request: branches: - main - - 2.2.x + - 2.3.x env: ENV_FILE: environment.yml diff --git a/.github/workflows/package-checks.yml b/.github/workflows/package-checks.yml index 97f90c1588962..6748832903e30 100644 --- a/.github/workflows/package-checks.yml +++ b/.github/workflows/package-checks.yml @@ -4,11 +4,11 @@ on: push: branches: - main - - 2.2.x + - 2.3.x pull_request: branches: - main - - 2.2.x + - 2.3.x types: [ labeled, opened, synchronize, reopened ] permissions: diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index d145836f3e596..60b234d613a38 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -4,11 +4,11 @@ on: push: branches: - main - - 2.2.x + - 2.3.x pull_request: 
     branches:
       - main
-      - 2.2.x
+      - 2.3.x
     paths-ignore:
       - "doc/**"
       - "web/**"

From fd823d22578b684b6070d956def006230e3f6bb3 Mon Sep 17 00:00:00 2001
From: Marc Mueller <30130371+cdce8p@users.noreply.github.com>
Date: Wed, 2 Oct 2024 15:25:48 +0200
Subject: [PATCH 102/224] Include Python.h first (#59929)

---
 pandas/_libs/include/pandas/datetime/date_conversions.h | 1 +
 pandas/_libs/include/pandas/parser/io.h                  | 3 ++-
 pandas/_libs/include/pandas/parser/pd_parser.h           | 3 ++-
 pandas/_libs/include/pandas/vendored/klib/khash_python.h | 1 +
 pandas/_libs/src/vendored/ujson/python/JSONtoObj.c       | 3 +--
 pandas/_libs/src/vendored/ujson/python/ujson.c           | 1 +
 6 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/pandas/_libs/include/pandas/datetime/date_conversions.h b/pandas/_libs/include/pandas/datetime/date_conversions.h
index e039991847a62..043805a8b25f4 100644
--- a/pandas/_libs/include/pandas/datetime/date_conversions.h
+++ b/pandas/_libs/include/pandas/datetime/date_conversions.h
@@ -9,6 +9,7 @@ The full license is in the LICENSE file, distributed with this software.
 
 #define PY_SSIZE_T_CLEAN
 #include <Python.h>
+
 #include <numpy/ndarraytypes.h>
 
 // Scales value inplace from nanosecond resolution to unit resolution
diff --git a/pandas/_libs/include/pandas/parser/io.h b/pandas/_libs/include/pandas/parser/io.h
index c707c23b567d2..41f1bb9312724 100644
--- a/pandas/_libs/include/pandas/parser/io.h
+++ b/pandas/_libs/include/pandas/parser/io.h
@@ -10,9 +10,10 @@ The full license is in the LICENSE file, distributed with this software.
 #pragma once
 
 #define PY_SSIZE_T_CLEAN
-#include "tokenizer.h"
 #include <Python.h>
 
+#include "tokenizer.h"
+
 #define FS(source) ((file_source *)source)
 
 typedef struct _rd_source {
diff --git a/pandas/_libs/include/pandas/parser/pd_parser.h b/pandas/_libs/include/pandas/parser/pd_parser.h
index 58a09ae1bba39..543839b5d75bf 100644
--- a/pandas/_libs/include/pandas/parser/pd_parser.h
+++ b/pandas/_libs/include/pandas/parser/pd_parser.h
@@ -13,9 +13,10 @@ extern "C" {
 #endif
 
 #define PY_SSIZE_T_CLEAN
-#include "pandas/parser/tokenizer.h"
 #include <Python.h>
 
+#include "pandas/parser/tokenizer.h"
+
 typedef struct {
   int (*to_double)(char *, double *, char, char, int *);
   int (*floatify)(PyObject *, double *, int *);
diff --git a/pandas/_libs/include/pandas/vendored/klib/khash_python.h b/pandas/_libs/include/pandas/vendored/klib/khash_python.h
index 2fa61642968cf..9706a8211b61f 100644
--- a/pandas/_libs/include/pandas/vendored/klib/khash_python.h
+++ b/pandas/_libs/include/pandas/vendored/klib/khash_python.h
@@ -3,6 +3,7 @@
 #pragma once
 
 #include <Python.h>
+
 #include <string.h>
 #include <stdint.h>
diff --git a/pandas/_libs/src/vendored/ujson/python/JSONtoObj.c b/pandas/_libs/src/vendored/ujson/python/JSONtoObj.c
index 4cfead8ac77a5..ef6f1104a1fb9 100644
--- a/pandas/_libs/src/vendored/ujson/python/JSONtoObj.c
+++ b/pandas/_libs/src/vendored/ujson/python/JSONtoObj.c
@@ -38,11 +38,10 @@ Numeric decoder derived from TCL library
 
 // Licence at LICENSES/ULTRAJSON_LICENSE
 
-// clang-format off
 #define PY_SSIZE_T_CLEAN
 #include <Python.h>
+
 #include "pandas/vendored/ujson/lib/ultrajson.h"
-// clang-format on
 
 static int Object_objectAddKey(void *Py_UNUSED(prv), JSOBJ obj, JSOBJ name,
                                JSOBJ value) {
diff --git a/pandas/_libs/src/vendored/ujson/python/ujson.c b/pandas/_libs/src/vendored/ujson/python/ujson.c
index f369d122a3dbe..2ee084b9304f4 100644
--- a/pandas/_libs/src/vendored/ujson/python/ujson.c
+++ b/pandas/_libs/src/vendored/ujson/python/ujson.c
@@ -40,6 +40,7 @@ Numeric decoder derived from TCL library
 
 #define PY_SSIZE_T_CLEAN
 #include <Python.h>
+
 #define PY_ARRAY_UNIQUE_SYMBOL UJSON_NUMPY
#include "numpy/arrayobject.h" From ba7e83da18ac8bfc4f0a521855c0b2ad05ccbbd4 Mon Sep 17 00:00:00 2001 From: FuzzyParrabellum <58094668+FuzzyParrabellum@users.noreply.github.com> Date: Wed, 2 Oct 2024 21:38:49 +0200 Subject: [PATCH 103/224] DOC: Fix docstring of pandas.Series.compare list indent formatting (#59911) Co-authored-by: rdzantoine.pro@gmail.com Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/core/shared_docs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/shared_docs.py b/pandas/core/shared_docs.py index cb0c3d241534c..81fa508ae6d23 100644 --- a/pandas/core/shared_docs.py +++ b/pandas/core/shared_docs.py @@ -65,9 +65,9 @@ Determine which axis to align the comparison on. * 0, or 'index' : Resulting differences are stacked vertically - with rows drawn alternately from self and other. + with rows drawn alternately from self and other. * 1, or 'columns' : Resulting differences are aligned horizontally - with columns drawn alternately from self and other. + with columns drawn alternately from self and other. keep_shape : bool, default False If true, all rows and columns are kept. From 198ed865420c2a206dc062a32be47c7cc5e76bc0 Mon Sep 17 00:00:00 2001 From: Xiao Yuan Date: Fri, 4 Oct 2024 00:08:49 +0800 Subject: [PATCH 104/224] BUG: pd.eval with engine="numexpr" fails with float division (#59907) * BUG: pd.eval with engine="numexpr" fails with float division * Add skip * Add whatsnew * update --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/computation/align.py | 2 +- pandas/tests/computation/test_eval.py | 8 ++++++++ 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 6ebb51cd3ef89..346e2b9e7997e 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -698,6 +698,7 @@ Other - Bug in :class:`DataFrame` when passing a ``dict`` with a NA scalar and ``columns`` that would always return ``np.nan`` (:issue:`57205`) - Bug in :func:`eval` on :class:`ExtensionArray` on including division ``/`` failed with a ``TypeError``. (:issue:`58748`) - Bug in :func:`eval` where the names of the :class:`Series` were not preserved when using ``engine="numexpr"``. (:issue:`10239`) +- Bug in :func:`eval` with ``engine="numexpr"`` returning unexpected result for float division. (:issue:`59736`) - Bug in :func:`unique` on :class:`Index` not always returning :class:`Index` (:issue:`57043`) - Bug in :meth:`DataFrame.apply` where passing ``engine="numba"`` ignored ``args`` passed to the applied function (:issue:`58712`) - Bug in :meth:`DataFrame.eval` and :meth:`DataFrame.query` which caused an exception when using NumPy attributes via ``@`` notation, e.g., ``df.eval("@np.floor(a)")``. (:issue:`58041`) diff --git a/pandas/core/computation/align.py b/pandas/core/computation/align.py index 7de4d8cdf99e1..6158c4f4d0539 100644 --- a/pandas/core/computation/align.py +++ b/pandas/core/computation/align.py @@ -213,7 +213,7 @@ def reconstruct_object(typ, obj, axes, dtype, name): if hasattr(res_t, "type") and typ == np.bool_ and res_t != np.bool_: ret_value = res_t.type(obj) else: - ret_value = typ(obj).astype(res_t) + ret_value = res_t.type(obj) # The condition is to distinguish 0-dim array (returned in case of # scalar) and 1 element array # e.g. 
np.array(0) and np.array([0]) diff --git a/pandas/tests/computation/test_eval.py b/pandas/tests/computation/test_eval.py index 31d568d7c1e0c..3c0bf6c35866c 100644 --- a/pandas/tests/computation/test_eval.py +++ b/pandas/tests/computation/test_eval.py @@ -1998,3 +1998,11 @@ def test_validate_bool_args(value): msg = 'For argument "inplace" expected type bool, received type' with pytest.raises(ValueError, match=msg): pd.eval("2+2", inplace=value) + + +@td.skip_if_no("numexpr") +def test_eval_float_div_numexpr(): + # GH 59736 + result = pd.eval("1 / 2", engine="numexpr") + expected = 0.5 + assert result == expected From c47296ad3b9908f77fba5830ec9dbb7f546cb720 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 3 Oct 2024 06:09:36 -1000 Subject: [PATCH 105/224] CLN: indexes/base.py (#59928) CLN: indexes.base.py --- pandas/core/indexes/base.py | 22 ++++++---------------- 1 file changed, 6 insertions(+), 16 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 852049804a4f5..749a5fea4d513 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4153,7 +4153,8 @@ def reindex( preserve_names = not hasattr(target, "name") # GH7774: preserve dtype/tz if target is empty and not an Index. - target = ensure_has_len(target) # target may be an iterator + if is_iterator(target): + target = list(target) if not isinstance(target, Index) and len(target) == 0: if level is not None and self._is_multi: @@ -7568,21 +7569,9 @@ def ensure_index(index_like: Axes, copy: bool = False) -> Index: return Index(index_like, copy=copy) -def ensure_has_len(seq): - """ - If seq is an iterator, put its values into a list. - """ - try: - len(seq) - except TypeError: - return list(seq) - else: - return seq - - def trim_front(strings: list[str]) -> list[str]: """ - Trims zeros and decimal points. + Trims leading spaces evenly among all strings. Examples -------- @@ -7594,8 +7583,9 @@ def trim_front(strings: list[str]) -> list[str]: """ if not strings: return strings - while all(strings) and all(x[0] == " " for x in strings): - strings = [x[1:] for x in strings] + smallest_leading_space = min(len(x) - len(x.lstrip()) for x in strings) + if smallest_leading_space > 0: + strings = [x[smallest_leading_space:] for x in strings] return strings From 139def2145b83d40364235c6297e1833eab7bb05 Mon Sep 17 00:00:00 2001 From: Deepak Saldanha Date: Fri, 4 Oct 2024 01:39:46 +0530 Subject: [PATCH 106/224] BUG: fix html float display (#59930) * fix html display float/strings * add test under io, update whatsnew * fix linting * changes to fix floats only * Revert "fix linting" This reverts commit 1061442e0a1cf8f745b0863762f2aa023d388336. * test script for float format * remove nbsp implementation, keep floats * Trigger CI * implement changes post review * lint check * update test_formats.py * rfc test_format.py * update test cases --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/frame.py | 3 ++- pandas/tests/io/formats/test_format.py | 17 +++++++++++++++++ 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 346e2b9e7997e..a5b4560a47bc4 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -620,6 +620,7 @@ I/O ^^^ - Bug in :class:`DataFrame` and :class:`Series` ``repr`` of :py:class:`collections.abc.Mapping`` elements. 
(:issue:`57915`) - Bug in :meth:`.DataFrame.to_json` when ``"index"`` was a value in the :attr:`DataFrame.column` and :attr:`Index.name` was ``None``. Now, this will fail with a ``ValueError`` (:issue:`58925`) +- Bug in :meth:`DataFrame._repr_html_` which ignored the ``"display.float_format"`` option (:issue:`59876`) - Bug in :meth:`DataFrame.from_records` where ``columns`` parameter with numpy structured array was not reordering and filtering out the columns (:issue:`59717`) - Bug in :meth:`DataFrame.to_dict` raises unnecessary ``UserWarning`` when columns are not unique and ``orient='tight'``. (:issue:`58281`) - Bug in :meth:`DataFrame.to_excel` when writing empty :class:`DataFrame` with :class:`MultiIndex` on both axes (:issue:`57696`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 4c56948a48eb2..f184aab4070d7 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1192,6 +1192,7 @@ def _repr_html_(self) -> str | None: min_rows = get_option("display.min_rows") max_cols = get_option("display.max_columns") show_dimensions = get_option("display.show_dimensions") + show_floats = get_option("display.float_format") formatter = fmt.DataFrameFormatter( self, @@ -1199,7 +1200,7 @@ def _repr_html_(self) -> str | None: col_space=None, na_rep="NaN", formatters=None, - float_format=None, + float_format=show_floats, sparsify=None, justify=None, index_names=True, diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index af7b04d66096a..82cc3a838ca68 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -368,6 +368,23 @@ def test_repr_min_rows(self): assert ".." not in repr(df) assert ".." not in df._repr_html_() + @pytest.mark.parametrize( + "data, format_option, expected_values", + [ + (12345.6789, "{:12.3f}", "12345.679"), + (None, "{:.3f}", "None"), + ("", "{:.2f}", ""), + (112345.6789, "{:6.3f}", "112345.679"), + ], + ) + def test_repr_float_formatting_html_output( + self, data, format_option, expected_values + ): + with option_context("display.float_format", format_option.format): + df = DataFrame({"A": [data]}) + html_output = df._repr_html_() + assert expected_values in html_output + def test_str_max_colwidth(self): # GH 7856 df = DataFrame( From 4ad6c7a287009f727a8b627b091ba19ba06d9342 Mon Sep 17 00:00:00 2001 From: Deepak Saldanha Date: Sat, 5 Oct 2024 01:15:29 +0530 Subject: [PATCH 107/224] BUG: fix nbsp for html formatting (#59964) * nbsp for strings * update changes post review --- pandas/io/formats/html.py | 2 ++ pandas/tests/io/formats/test_format.py | 19 ++++++++++++++++++- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/pandas/io/formats/html.py b/pandas/io/formats/html.py index adaeed017d7bf..fdea1831d5596 100644 --- a/pandas/io/formats/html.py +++ b/pandas/io/formats/html.py @@ -195,6 +195,8 @@ def _write_cell( esc = {} rs = pprint_thing(s, escape_chars=esc).strip() + # replace spaces betweens strings with non-breaking spaces + rs = rs.replace(" ", "  ") if self.render_links and is_url(rs): rs_unescaped = pprint_thing(s, escape_chars={}).strip() diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index 82cc3a838ca68..0dc16e1ebc723 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -375,12 +375,29 @@ def test_repr_min_rows(self): (None, "{:.3f}", "None"), ("", "{:.2f}", ""), (112345.6789, "{:6.3f}", "112345.679"), + ("foo foo", None, "foo      foo"), + (" foo", None, 
"foo"), + ( + "foo foo foo", + None, + "foo foo       foo", + ), # odd no.of spaces + ( + "foo foo foo", + None, + "foo foo    foo", + ), # even no.of spaces ], ) def test_repr_float_formatting_html_output( self, data, format_option, expected_values ): - with option_context("display.float_format", format_option.format): + if format_option is not None: + with option_context("display.float_format", format_option.format): + df = DataFrame({"A": [data]}) + html_output = df._repr_html_() + assert expected_values in html_output + else: df = DataFrame({"A": [data]}) html_output = df._repr_html_() assert expected_values in html_output From 58de332785ecac78dbea2d19b5a25253eecf78a0 Mon Sep 17 00:00:00 2001 From: Deepak Saldanha Date: Sat, 5 Oct 2024 02:49:57 +0530 Subject: [PATCH 108/224] BUG: fix treatment of NaNs when .apply() function is used on categorical columns. (#59966) * remove action=ignore for .apply() on cat dtype * add PR reference in comments * fix pytest linting * refac failing test_series_apply.py * Trigger CI * changes post review * rephrase change log --- doc/source/whatsnew/v3.0.0.rst | 2 +- pandas/core/apply.py | 14 ++------------ pandas/tests/apply/test_frame_apply.py | 3 ++- pandas/tests/apply/test_series_apply.py | 6 +++--- 4 files changed, 8 insertions(+), 17 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index a5b4560a47bc4..52debcc49eb27 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -544,7 +544,7 @@ Bug fixes Categorical ^^^^^^^^^^^ -- +- Bug in :func:`Series.apply` where ``nan`` was ignored for :class:`CategoricalDtype` (:issue:`59938`) - Datetimelike diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 7d50b466f5126..1f13459724d78 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -38,10 +38,7 @@ is_numeric_dtype, is_sequence, ) -from pandas.core.dtypes.dtypes import ( - CategoricalDtype, - ExtensionDtype, -) +from pandas.core.dtypes.dtypes import ExtensionDtype from pandas.core.dtypes.generic import ( ABCDataFrame, ABCNDFrame, @@ -1465,14 +1462,7 @@ def curried(x): else: curried = func - - # row-wise access - # apply doesn't have a `na_action` keyword and for backward compat reasons - # we need to give `na_action="ignore"` for categorical data. - # TODO: remove the `na_action="ignore"` when that default has been changed in - # Categorical (GH51645). 
- action = "ignore" if isinstance(obj.dtype, CategoricalDtype) else None - mapped = obj._map_values(mapper=curried, na_action=action) + mapped = obj._map_values(mapper=curried) if len(mapped) and isinstance(mapped[0], ABCSeries): # GH#43986 Need to do list(mapped) in order to get treated as nested diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py index dee0efcd8fd15..f0ab01e9e960e 100644 --- a/pandas/tests/apply/test_frame_apply.py +++ b/pandas/tests/apply/test_frame_apply.py @@ -741,8 +741,9 @@ def test_apply_category_equalness(val): result = df.a.apply(lambda x: x == val) expected = Series( - [np.nan if pd.isnull(x) else x == val for x in df_values], name="a" + [False if pd.isnull(x) else x == val for x in df_values], name="a" ) + # False since behavior of NaN for categorical dtype has been changed (GH 59966) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/apply/test_series_apply.py b/pandas/tests/apply/test_series_apply.py index 76704de6f2d10..9541b0b7495c7 100644 --- a/pandas/tests/apply/test_series_apply.py +++ b/pandas/tests/apply/test_series_apply.py @@ -236,10 +236,10 @@ def test_apply_categorical_with_nan_values(series, by_row): with pytest.raises(AttributeError, match=msg): s.apply(lambda x: x.split("-")[0], by_row=by_row) return - - result = s.apply(lambda x: x.split("-")[0], by_row=by_row) + # NaN for cat dtype fixed in (GH 59966) + result = s.apply(lambda x: x.split("-")[0] if pd.notna(x) else False, by_row=by_row) result = result.astype(object) - expected = Series(["1", "1", np.nan], dtype="category") + expected = Series(["1", "1", False], dtype="category") expected = expected.astype(object) tm.assert_series_equal(result, expected) From 7f54bec678694b1bb8e91ab4dc8944431d1c7ae1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Janez=20Dem=C5=A1ar?= Date: Sat, 5 Oct 2024 00:09:52 +0200 Subject: [PATCH 109/224] BUG: Fix SparseFrameAccessor.to_dense return type (#59967) * BUG: Fix SparseFrameAccessor.to_dense return type * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/arrays/sparse/accessor.py | 6 +++--- pandas/tests/arrays/sparse/test_accessor.py | 4 ++++ 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 52debcc49eb27..35963a90b5d07 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -682,6 +682,7 @@ Sparse ^^^^^^ - Bug in :class:`SparseDtype` for equal comparison with na fill value. (:issue:`54770`) - Bug in :meth:`DataFrame.sparse.from_spmatrix` which hard coded an invalid ``fill_value`` for certain subtypes. 
(:issue:`59063`) +- Bug in :meth:`DataFrame.sparse.to_dense` which ignored subclassing and always returned an instance of :class:`DataFrame` (:issue:`59913`) ExtensionArray ^^^^^^^^^^^^^^ diff --git a/pandas/core/arrays/sparse/accessor.py b/pandas/core/arrays/sparse/accessor.py index e610e018c5a74..8083371ed171a 100644 --- a/pandas/core/arrays/sparse/accessor.py +++ b/pandas/core/arrays/sparse/accessor.py @@ -369,10 +369,10 @@ def to_dense(self) -> DataFrame: 1 1 2 0 """ - from pandas import DataFrame - data = {k: v.array.to_dense() for k, v in self._parent.items()} - return DataFrame(data, index=self._parent.index, columns=self._parent.columns) + return self._parent._constructor( + data, index=self._parent.index, columns=self._parent.columns + ) def to_coo(self) -> spmatrix: """ diff --git a/pandas/tests/arrays/sparse/test_accessor.py b/pandas/tests/arrays/sparse/test_accessor.py index bd3298940ae3a..08bfd5b69fdd9 100644 --- a/pandas/tests/arrays/sparse/test_accessor.py +++ b/pandas/tests/arrays/sparse/test_accessor.py @@ -252,3 +252,7 @@ def test_with_column_named_sparse(self): # https://github.com/pandas-dev/pandas/issues/30758 df = pd.DataFrame({"sparse": pd.arrays.SparseArray([1, 2])}) assert isinstance(df.sparse, pd.core.arrays.sparse.accessor.SparseFrameAccessor) + + def test_subclassing(self): + df = tm.SubclassedDataFrame({"sparse": pd.arrays.SparseArray([1, 2])}) + assert isinstance(df.sparse.to_dense(), tm.SubclassedDataFrame) From aea1643c6428cbf52abfa07b068c445149b98827 Mon Sep 17 00:00:00 2001 From: invalidarg Date: Sat, 5 Oct 2024 00:10:36 +0200 Subject: [PATCH 110/224] BUG: CSS strings truncated at ":" (#59720) * second item in tuple is no longer truncated at first colon https://github.com/pandas-dev/pandas/issues/59623 * added testcase for maybe_convert_css_to_tuples #59623 * maybe_convert_css_to_tuples() raises on strings without ":" * fixed implicit str concatination * Fixed raise on empty string * Update test_style.py * attr:; -> ("attr","") Same behavior as before patch * add test for "attr:;", ie empty value * str concatenation in the test broke mypy * revert explicit str concat * Invalidarg patch black (#1) * black test_style * Update style_render.py --------- Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/io/formats/style_render.py | 24 ++++++++++----------- pandas/tests/io/formats/style/test_style.py | 13 ++++++++++- 2 files changed, 24 insertions(+), 13 deletions(-) diff --git a/pandas/io/formats/style_render.py b/pandas/io/formats/style_render.py index 8a6383f7e8f82..08d9fd938c873 100644 --- a/pandas/io/formats/style_render.py +++ b/pandas/io/formats/style_render.py @@ -906,9 +906,9 @@ def concatenated_visible_rows(obj): row_body_headers = [ { **col, - "display_value": col["display_value"] - if col["is_visible"] - else "", + "display_value": ( + col["display_value"] if col["is_visible"] else "" + ), "cellstyle": self.ctx_index[r, c], } for c, col in enumerate(row[:index_levels]) @@ -2069,18 +2069,18 @@ def maybe_convert_css_to_tuples(style: CSSProperties) -> CSSList: ('border','1px solid red')] """ if isinstance(style, str): - s = style.split(";") - try: - return [ - (x.split(":")[0].strip(), x.split(":")[1].strip()) - for x in s - if x.strip() != "" - ] - except IndexError as err: + if style and ":" not in style: raise ValueError( "Styles supplied as string must follow CSS rule formats, " f"for example 'attr: val;'. '{style}' was given." 
- ) from err + ) + s = style.split(";") + return [ + (x.split(":")[0].strip(), ":".join(x.split(":")[1:]).strip()) + for x in s + if x.strip() != "" + ] + return style diff --git a/pandas/tests/io/formats/style/test_style.py b/pandas/tests/io/formats/style/test_style.py index 89addbbbc1ded..e9fc2b2d27afd 100644 --- a/pandas/tests/io/formats/style/test_style.py +++ b/pandas/tests/io/formats/style/test_style.py @@ -886,8 +886,19 @@ def test_maybe_convert_css_to_tuples(self): expected = [] assert maybe_convert_css_to_tuples("") == expected + # issue #59623 + expected = [("a", "b"), ("c", "url('data:123')")] + assert maybe_convert_css_to_tuples("a:b;c: url('data:123');") == expected + + # if no value, return attr and empty string + expected = [("a", ""), ("c", "")] + assert maybe_convert_css_to_tuples("a:;c: ") == expected + def test_maybe_convert_css_to_tuples_err(self): - msg = "Styles supplied as string must follow CSS rule formats" + msg = ( + "Styles supplied as string must follow CSS rule formats, " + "for example 'attr: val;'. 'err' was given." + ) with pytest.raises(ValueError, match=msg): maybe_convert_css_to_tuples("err") From 24190fdb0efd781be9f0a886256edc595587c20f Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Sat, 5 Oct 2024 22:38:20 +0530 Subject: [PATCH 111/224] DOC: fix RT03,SA01 for pandas.period_range (#59958) --- ci/code_checks.sh | 1 - pandas/core/indexes/period.py | 8 ++++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 4a1a0042405e3..c9d2f54eba1ed 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -184,7 +184,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.io.stata.StataReader.variable_labels RT03,SA01" \ -i "pandas.io.stata.StataWriter.write_file SA01" \ -i "pandas.json_normalize RT03,SA01" \ - -i "pandas.period_range RT03,SA01" \ -i "pandas.plotting.andrews_curves RT03,SA01" \ -i "pandas.plotting.lag_plot RT03,SA01" \ -i "pandas.plotting.scatter_matrix PR07,SA01" \ diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index b5f05ef0ab78f..377406e24b1d3 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -563,6 +563,14 @@ def period_range( Returns ------- PeriodIndex + A PeriodIndex of fixed frequency periods. + + See Also + -------- + date_range : Returns a fixed frequency DatetimeIndex. + Period : Represents a period of time. + PeriodIndex : Immutable ndarray holding ordinal values indicating regular periods + in time. 
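A one-line usage sketch to go with the ``Returns`` description added above (expected output on a current pandas build):

    >>> import pandas as pd
    >>> pd.period_range(start="2024-01", periods=3, freq="M")
    PeriodIndex(['2024-01', '2024-02', '2024-03'], dtype='period[M]')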
Notes ----- From b63c7954d5195b3999cd867b788758e412bf30e1 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Sat, 5 Oct 2024 22:40:04 +0530 Subject: [PATCH 112/224] DOC: fix SA01, ES01 for pandas.io.stata.StataReader.data_label (#59962) --- ci/code_checks.sh | 1 - pandas/io/stata.py | 10 ++++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index c9d2f54eba1ed..ad6ea5b0deb9f 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -179,7 +179,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.errors.ValueLabelTypeMismatch SA01" \ -i "pandas.infer_freq SA01" \ -i "pandas.io.json.build_table_schema PR07,RT03,SA01" \ - -i "pandas.io.stata.StataReader.data_label SA01" \ -i "pandas.io.stata.StataReader.value_labels RT03,SA01" \ -i "pandas.io.stata.StataReader.variable_labels RT03,SA01" \ -i "pandas.io.stata.StataWriter.write_file SA01" \ diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 4be06f93689f2..6b988d8fed6bf 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -2004,6 +2004,16 @@ def data_label(self) -> str: """ Return data label of Stata file. + The data label is a descriptive string associated with the dataset + stored in the Stata file. This property provides access to that + label, if one is present. + + See Also + -------- + io.stata.StataReader.variable_labels : Return a dict associating each variable + name with corresponding label. + DataFrame.to_stata : Export DataFrame object to Stata dta format. + Examples -------- >>> df = pd.DataFrame([(1,)], columns=["variable"]) From e740857e6399c589e2704da5376a0a28cc251a38 Mon Sep 17 00:00:00 2001 From: Xiao Yuan Date: Sun, 6 Oct 2024 01:12:09 +0800 Subject: [PATCH 113/224] BUG: fix to_numeric raises TypeError for Timedelta and Timestamp scalar (#59974) * BUG: fix to_numeric raises TypeError for Timedelta and Timestamp scalar * Add whatsnew --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/tools/numeric.py | 6 ++++++ pandas/tests/tools/test_to_numeric.py | 15 +++++++++++++++ 3 files changed, 22 insertions(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 35963a90b5d07..ed0836233553b 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -701,6 +701,7 @@ Other - Bug in :func:`eval` on :class:`ExtensionArray` on including division ``/`` failed with a ``TypeError``. (:issue:`58748`) - Bug in :func:`eval` where the names of the :class:`Series` were not preserved when using ``engine="numexpr"``. (:issue:`10239`) - Bug in :func:`eval` with ``engine="numexpr"`` returning unexpected result for float division. (:issue:`59736`) +- Bug in :func:`to_numeric` raising ``TypeError`` when ``arg`` is a :class:`Timedelta` or :class:`Timestamp` scalar. (:issue:`59944`) - Bug in :func:`unique` on :class:`Index` not always returning :class:`Index` (:issue:`57043`) - Bug in :meth:`DataFrame.apply` where passing ``engine="numba"`` ignored ``args`` passed to the applied function (:issue:`58712`) - Bug in :meth:`DataFrame.eval` and :meth:`DataFrame.query` which caused an exception when using NumPy attributes via ``@`` notation, e.g., ``df.eval("@np.floor(a)")``. 
(:issue:`58041`) diff --git a/pandas/core/tools/numeric.py b/pandas/core/tools/numeric.py index 982851d0557c3..f159babb7e018 100644 --- a/pandas/core/tools/numeric.py +++ b/pandas/core/tools/numeric.py @@ -11,6 +11,10 @@ lib, missing as libmissing, ) +from pandas._libs.tslibs import ( + Timedelta, + Timestamp, +) from pandas.util._validators import check_dtype_backend from pandas.core.dtypes.cast import maybe_downcast_numeric @@ -189,6 +193,8 @@ def to_numeric( return float(arg) if is_number(arg): return arg + if isinstance(arg, (Timedelta, Timestamp)): + return arg._value is_scalars = True values = np.array([arg], dtype="O") elif getattr(arg, "ndim", 1) > 1: diff --git a/pandas/tests/tools/test_to_numeric.py b/pandas/tests/tools/test_to_numeric.py index 585b7ca94f730..f3645bf0649bd 100644 --- a/pandas/tests/tools/test_to_numeric.py +++ b/pandas/tests/tools/test_to_numeric.py @@ -384,6 +384,21 @@ def test_timedelta(transform_assert_equal): assert_equal(result, expected) +@pytest.mark.parametrize( + "scalar", + [ + pd.Timedelta(1, "D"), + pd.Timestamp("2017-01-01T12"), + pd.Timestamp("2017-01-01T12", tz="US/Pacific"), + ], +) +def test_timedelta_timestamp_scalar(scalar): + # GH#59944 + result = to_numeric(scalar) + expected = to_numeric(Series(scalar))[0] + assert result == expected + + def test_period(request, transform_assert_equal): transform, assert_equal = transform_assert_equal From 05fa9583f7bc22796076b7e2a7b94058bebca511 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Sat, 5 Oct 2024 22:43:23 +0530 Subject: [PATCH 114/224] DOC: fix SA01, ES01 for pandas.testing.assert_extension_array_equal (#59975) --- ci/code_checks.sh | 1 - pandas/_testing/asserters.py | 10 ++++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index ad6ea5b0deb9f..16a3a22bc4876 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -187,7 +187,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.plotting.lag_plot RT03,SA01" \ -i "pandas.plotting.scatter_matrix PR07,SA01" \ -i "pandas.set_eng_float_format RT03,SA01" \ - -i "pandas.testing.assert_extension_array_equal SA01" \ -i "pandas.tseries.offsets.BDay PR02,SA01" \ -i "pandas.tseries.offsets.BQuarterBegin.is_on_offset GL08" \ -i "pandas.tseries.offsets.BQuarterBegin.n GL08" \ diff --git a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py index bbd5e60a5a812..01c4dcd92ee40 100644 --- a/pandas/_testing/asserters.py +++ b/pandas/_testing/asserters.py @@ -701,6 +701,10 @@ def assert_extension_array_equal( """ Check that left and right ExtensionArrays are equal. + This method compares two ``ExtensionArray`` instances for equality, + including checks for missing values, the dtype of the arrays, and + the exactness of the comparison (or tolerance when comparing floats). + Parameters ---------- left, right : ExtensionArray @@ -726,6 +730,12 @@ def assert_extension_array_equal( .. versionadded:: 2.0.0 + See Also + -------- + testing.assert_series_equal : Check that left and right ``Series`` are equal. + testing.assert_frame_equal : Check that left and right ``DataFrame`` are equal. + testing.assert_index_equal : Check that left and right ``Index`` are equal. + Notes ----- Missing values are checked separately from valid values. 
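Because missing values are compared separately, two masked arrays agree when their valid values and their ``NA`` positions both match; a minimal sketch of the assertion in use:

    >>> import pandas as pd
    >>> from pandas import testing as tm
    >>> left = pd.array([1, 2, pd.NA], dtype="Int64")
    >>> right = pd.array([1, 2, pd.NA], dtype="Int64")
    >>> tm.assert_extension_array_equal(left, right)  # passes, returns None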
From c8813aeebcff18699e558ea0ee56abb9dde6a6f6 Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Sun, 6 Oct 2024 13:33:14 -0400 Subject: [PATCH 115/224] API: value_counts to consistently maintain order of input (#59745) * API: value_counts to consistently maintain order of input * Docs * Cleanup * Test & docs fixups * Refine whatsnew * Refine whatsnew --- doc/source/whatsnew/v3.0.0.rst | 61 ++++++++++++++++++ pandas/core/frame.py | 10 ++- pandas/core/groupby/generic.py | 28 ++++---- pandas/core/groupby/groupby.py | 4 +- pandas/core/groupby/ops.py | 39 +++++++++-- .../tests/frame/methods/test_value_counts.py | 4 +- .../groupby/methods/test_value_counts.py | 64 +++++++++---------- 7 files changed, 157 insertions(+), 53 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index ed0836233553b..321005272817d 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -203,6 +203,67 @@ In cases with mixed-resolution inputs, the highest resolution is used: In [2]: pd.to_datetime([pd.Timestamp("2024-03-22 11:43:01"), "2024-03-22 11:43:01.002"]).dtype Out[2]: dtype('>> df.groupby("gender").value_counts() gender education country - female high FR 1 - US 1 + female high US 1 + FR 1 male low FR 2 US 1 medium FR 1 @@ -2682,8 +2688,8 @@ def value_counts( >>> df.groupby("gender").value_counts(ascending=True) gender education country - female high FR 1 - US 1 + female high US 1 + FR 1 male low US 1 medium FR 1 low FR 2 @@ -2691,8 +2697,8 @@ def value_counts( >>> df.groupby("gender").value_counts(normalize=True) gender education country - female high FR 0.50 - US 0.50 + female high US 0.50 + FR 0.50 male low FR 0.50 US 0.25 medium FR 0.25 @@ -2700,16 +2706,16 @@ def value_counts( >>> df.groupby("gender", as_index=False).value_counts() gender education country count - 0 female high FR 1 - 1 female high US 1 + 0 female high US 1 + 1 female high FR 1 2 male low FR 2 3 male low US 1 4 male medium FR 1 >>> df.groupby("gender", as_index=False).value_counts(normalize=True) gender education country proportion - 0 female high FR 0.50 - 1 female high US 0.50 + 0 female high US 0.50 + 1 female high FR 0.50 2 male low FR 0.50 3 male low US 0.25 4 male medium FR 0.25 diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index e2410788ea95e..68314567d1b5e 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2519,7 +2519,7 @@ def _value_counts( grouper, _, _ = get_grouper( df, key=key, - sort=self.sort, + sort=False, observed=False, dropna=dropna, ) @@ -2528,7 +2528,7 @@ def _value_counts( # Take the size of the overall columns gb = df.groupby( groupings, - sort=self.sort, + sort=False, observed=self.observed, dropna=self.dropna, ) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 0e99178642715..a82e77140d274 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -755,6 +755,7 @@ def result_index_and_ids(self) -> tuple[Index, npt.NDArray[np.intp]]: obs = [ ping._observed or not ping._passed_categorical for ping in self.groupings ] + sorts = [ping._sort for ping in self.groupings] # When passed a categorical grouping, keep all categories for k, (ping, level) in enumerate(zip(self.groupings, levels)): if ping._passed_categorical: @@ -765,7 +766,9 @@ def result_index_and_ids(self) -> tuple[Index, npt.NDArray[np.intp]]: result_index.name = self.names[0] ids = ensure_platform_int(self.codes[0]) elif all(obs): - 
result_index, ids = self._ob_index_and_ids(levels, self.codes, self.names) + result_index, ids = self._ob_index_and_ids( + levels, self.codes, self.names, sorts + ) elif not any(obs): result_index, ids = self._unob_index_and_ids(levels, self.codes, self.names) else: @@ -778,6 +781,7 @@ def result_index_and_ids(self) -> tuple[Index, npt.NDArray[np.intp]]: levels=[levels[idx] for idx in ob_indices], codes=[codes[idx] for idx in ob_indices], names=[names[idx] for idx in ob_indices], + sorts=[sorts[idx] for idx in ob_indices], ) unob_index, unob_ids = self._unob_index_and_ids( levels=[levels[idx] for idx in unob_indices], @@ -800,9 +804,18 @@ def result_index_and_ids(self) -> tuple[Index, npt.NDArray[np.intp]]: ).reorder_levels(index) ids = len(unob_index) * ob_ids + unob_ids - if self._sort: + if any(sorts): # Sort result_index and recode ids using the new order - sorter = result_index.argsort() + n_levels = len(sorts) + drop_levels = [ + n_levels - idx + for idx, sort in enumerate(reversed(sorts), 1) + if not sort + ] + if len(drop_levels) > 0: + sorter = result_index._drop_level_numbers(drop_levels).argsort() + else: + sorter = result_index.argsort() result_index = result_index.take(sorter) _, index = np.unique(sorter, return_index=True) ids = ensure_platform_int(ids) @@ -837,10 +850,13 @@ def _ob_index_and_ids( levels: list[Index], codes: list[npt.NDArray[np.intp]], names: list[Hashable], + sorts: list[bool], ) -> tuple[MultiIndex, npt.NDArray[np.intp]]: + consistent_sorting = all(sorts[0] == sort for sort in sorts[1:]) + sort_in_compress = sorts[0] if consistent_sorting else False shape = tuple(len(level) for level in levels) group_index = get_group_index(codes, shape, sort=True, xnull=True) - ob_ids, obs_group_ids = compress_group_index(group_index, sort=self._sort) + ob_ids, obs_group_ids = compress_group_index(group_index, sort=sort_in_compress) ob_ids = ensure_platform_int(ob_ids) ob_index_codes = decons_obs_group_ids( ob_ids, obs_group_ids, shape, codes, xnull=True @@ -851,6 +867,21 @@ def _ob_index_and_ids( names=names, verify_integrity=False, ) + if not consistent_sorting: + # Sort by the levels where the corresponding sort argument is True + n_levels = len(sorts) + drop_levels = [ + n_levels - idx + for idx, sort in enumerate(reversed(sorts), 1) + if not sort + ] + if len(drop_levels) > 0: + sorter = ob_index._drop_level_numbers(drop_levels).argsort() + else: + sorter = ob_index.argsort() + ob_index = ob_index.take(sorter) + _, index = np.unique(sorter, return_index=True) + ob_ids = np.where(ob_ids == -1, -1, index.take(ob_ids)) ob_ids = ensure_platform_int(ob_ids) return ob_index, ob_ids diff --git a/pandas/tests/frame/methods/test_value_counts.py b/pandas/tests/frame/methods/test_value_counts.py index 7670b53f23173..de5029b9f18b2 100644 --- a/pandas/tests/frame/methods/test_value_counts.py +++ b/pandas/tests/frame/methods/test_value_counts.py @@ -128,7 +128,7 @@ def test_data_frame_value_counts_dropna_true(nulls_fixture): expected = pd.Series( data=[1, 1], index=pd.MultiIndex.from_arrays( - [("Beth", "John"), ("Louise", "Smith")], names=["first_name", "middle_name"] + [("John", "Beth"), ("Smith", "Louise")], names=["first_name", "middle_name"] ), name="count", ) @@ -156,7 +156,7 @@ def test_data_frame_value_counts_dropna_false(nulls_fixture): pd.Index(["Anne", "Beth", "John"]), pd.Index(["Louise", "Smith", np.nan]), ], - codes=[[0, 1, 2, 2], [2, 0, 1, 2]], + codes=[[2, 0, 2, 1], [1, 2, 2, 0]], names=["first_name", "middle_name"], ), name="count", diff --git 
a/pandas/tests/groupby/methods/test_value_counts.py b/pandas/tests/groupby/methods/test_value_counts.py index 8f8f7f64aba75..8f3022fbe551c 100644 --- a/pandas/tests/groupby/methods/test_value_counts.py +++ b/pandas/tests/groupby/methods/test_value_counts.py @@ -255,10 +255,10 @@ def test_basic(education_df, request): index=MultiIndex.from_tuples( [ ("FR", "male", "low"), - ("FR", "female", "high"), ("FR", "male", "medium"), - ("US", "female", "high"), + ("FR", "female", "high"), ("US", "male", "low"), + ("US", "female", "high"), ], names=["country", "gender", "education"], ), @@ -472,11 +472,11 @@ def test_data_frame_value_counts( ( False, False, - [0, 1, 3, 5, 7, 6, 8, 2, 4], + [0, 1, 3, 5, 6, 7, 8, 2, 4], [0.5, 0.5, 1.0, 0.25, 0.25, 0.25, 0.25, 1.0, 1.0], ), (False, True, [0, 1, 3, 5, 2, 4], [0.5, 0.5, 1.0, 1.0, 1.0, 1.0]), - (True, False, [0, 1, 5, 7, 6, 8], [0.5, 0.5, 0.25, 0.25, 0.25, 0.25]), + (True, False, [0, 1, 5, 6, 7, 8], [0.5, 0.5, 0.25, 0.25, 0.25, 0.25]), (True, True, [0, 1, 5], [0.5, 0.5, 1.0]), ], ) @@ -518,7 +518,7 @@ def test_dropna_combinations( True, [1, 1], MultiIndex.from_arrays( - [(1, 1), ("Beth", "John"), ("Louise", "Smith")], + [(1, 1), ("John", "Beth"), ("Smith", "Louise")], names=["key", "first_name", "middle_name"], ), ), @@ -531,7 +531,7 @@ def test_dropna_combinations( Index(["Anne", "Beth", "John"]), Index(["Louise", "Smith", np.nan]), ], - codes=[[0, 0, 0, 0], [0, 1, 2, 2], [2, 0, 1, 2]], + codes=[[0, 0, 0, 0], [2, 0, 2, 1], [1, 2, 2, 0]], names=["key", "first_name", "middle_name"], ), ), @@ -609,17 +609,17 @@ def test_categorical_single_grouper_with_only_observed_categories( expected_index = MultiIndex.from_tuples( [ ("FR", "male", "low"), - ("FR", "female", "high"), ("FR", "male", "medium"), + ("FR", "female", "high"), + ("FR", "male", "high"), ("FR", "female", "low"), ("FR", "female", "medium"), - ("FR", "male", "high"), - ("US", "female", "high"), ("US", "male", "low"), + ("US", "female", "high"), + ("US", "male", "medium"), + ("US", "male", "high"), ("US", "female", "low"), ("US", "female", "medium"), - ("US", "male", "high"), - ("US", "male", "medium"), ], names=["country", "gender", "education"], ) @@ -711,17 +711,17 @@ def test_categorical_single_grouper_observed_true( expected_index = [ ("FR", "male", "low"), - ("FR", "female", "high"), ("FR", "male", "medium"), + ("FR", "female", "high"), + ("FR", "male", "high"), ("FR", "female", "low"), ("FR", "female", "medium"), - ("FR", "male", "high"), - ("US", "female", "high"), ("US", "male", "low"), + ("US", "female", "high"), + ("US", "male", "medium"), + ("US", "male", "high"), ("US", "female", "low"), ("US", "female", "medium"), - ("US", "male", "high"), - ("US", "male", "medium"), ] assert_categorical_single_grouper( @@ -791,23 +791,23 @@ def test_categorical_single_grouper_observed_false( expected_index = [ ("FR", "male", "low"), - ("FR", "female", "high"), ("FR", "male", "medium"), + ("FR", "female", "high"), + ("FR", "male", "high"), ("FR", "female", "low"), ("FR", "female", "medium"), - ("FR", "male", "high"), - ("US", "female", "high"), ("US", "male", "low"), + ("US", "female", "high"), + ("US", "male", "medium"), + ("US", "male", "high"), ("US", "female", "low"), ("US", "female", "medium"), - ("US", "male", "high"), - ("US", "male", "medium"), - ("ASIA", "female", "high"), - ("ASIA", "female", "low"), - ("ASIA", "female", "medium"), - ("ASIA", "male", "high"), ("ASIA", "male", "low"), ("ASIA", "male", "medium"), + ("ASIA", "male", "high"), + ("ASIA", "female", "low"), + ("ASIA", "female", 
"medium"), + ("ASIA", "female", "high"), ] assert_categorical_single_grouper( @@ -837,8 +837,8 @@ def test_categorical_single_grouper_observed_false( ("US", "high", "male"), ("US", "low", "male"), ("US", "low", "female"), - ("US", "medium", "female"), ("US", "medium", "male"), + ("US", "medium", "female"), ], ), ( @@ -949,17 +949,17 @@ def test_categorical_non_groupers( expected_index = [ ("FR", "male", "low"), - ("FR", "female", "high"), ("FR", "male", "medium"), + ("FR", "female", "high"), + ("FR", "male", "high"), ("FR", "female", "low"), ("FR", "female", "medium"), - ("FR", "male", "high"), - ("US", "female", "high"), ("US", "male", "low"), + ("US", "female", "high"), + ("US", "male", "medium"), + ("US", "male", "high"), ("US", "female", "low"), ("US", "female", "medium"), - ("US", "male", "high"), - ("US", "male", "medium"), ] expected_series = Series( data=expected_data, @@ -1178,7 +1178,7 @@ def test_value_counts_sort(sort, vc_sort, normalize): if sort and vc_sort: taker = [0, 1, 2] elif sort and not vc_sort: - taker = [0, 1, 2] + taker = [1, 0, 2] elif not sort and vc_sort: taker = [0, 2, 1] else: From febfc0b32c92326a6ca3a4a0aa25dd4d88ab19ad Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Sun, 6 Oct 2024 23:07:24 +0530 Subject: [PATCH 116/224] DOC: fix PR07,SA01,ES01 for pandas.Series.sparse.from_coo (#59980) --- ci/code_checks.sh | 1 - pandas/core/arrays/sparse/accessor.py | 14 ++++++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 16a3a22bc4876..c93dbf511aec0 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -96,7 +96,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Series.dt.tz_localize PR01,PR02" \ -i "pandas.Series.dt.unit GL08" \ -i "pandas.Series.pad PR01,SA01" \ - -i "pandas.Series.sparse.from_coo PR07,SA01" \ -i "pandas.Timedelta.max PR02" \ -i "pandas.Timedelta.min PR02" \ -i "pandas.Timedelta.resolution PR02" \ diff --git a/pandas/core/arrays/sparse/accessor.py b/pandas/core/arrays/sparse/accessor.py index 8083371ed171a..0ed5f69fe4703 100644 --- a/pandas/core/arrays/sparse/accessor.py +++ b/pandas/core/arrays/sparse/accessor.py @@ -88,9 +88,17 @@ def from_coo(cls, A, dense_index: bool = False) -> Series: """ Create a Series with sparse values from a scipy.sparse.coo_matrix. + This method takes a ``scipy.sparse.coo_matrix`` (coordinate format) as input and + returns a pandas ``Series`` where the non-zero elements are represented as + sparse values. The index of the Series can either include only the coordinates + of non-zero elements (default behavior) or the full sorted set of coordinates + from the matrix if ``dense_index`` is set to `True`. + Parameters ---------- A : scipy.sparse.coo_matrix + The sparse matrix in coordinate format from which the sparse Series + will be created. dense_index : bool, default False If False (default), the index consists of only the coords of the non-null entries of the original coo_matrix. @@ -102,6 +110,12 @@ def from_coo(cls, A, dense_index: bool = False) -> Series: s : Series A Series with sparse values. + See Also + -------- + DataFrame.sparse.from_spmatrix : Create a new DataFrame from a scipy sparse + matrix. + scipy.sparse.coo_matrix : A sparse matrix in COOrdinate format. 
+ Examples -------- >>> from scipy import sparse From e4905bfd1825f490ff12b12c2659d3194882a9e3 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Sun, 6 Oct 2024 23:08:02 +0530 Subject: [PATCH 117/224] DOC: fix PR01,SA01,ES01 for pandas.api.types.is_float (#59981) --- ci/code_checks.sh | 1 - pandas/_libs/lib.pyx | 14 ++++++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index c93dbf511aec0..6f4534ba4a4de 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -105,7 +105,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Timestamp.resolution PR02" \ -i "pandas.Timestamp.tzinfo GL08" \ -i "pandas.Timestamp.year GL08" \ - -i "pandas.api.types.is_float PR01,SA01" \ -i "pandas.api.types.is_integer PR01,SA01" \ -i "pandas.api.types.is_iterator PR07,SA01" \ -i "pandas.api.types.is_re_compilable PR07,SA01" \ diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index de7d9af731010..23e0f387466aa 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1089,9 +1089,23 @@ def is_float(obj: object) -> bool: """ Return True if given object is float. + This method checks whether the passed object is a float type. It + returns `True` if the object is a float, and `False` otherwise. + + Parameters + ---------- + obj : object + The object to check for float type. + Returns ------- bool + `True` if the object is of float type, otherwise `False`. + + See Also + -------- + api.types.is_integer : Check if an object is of integer type. + api.types.is_numeric_dtype : Check if an object is of numeric type. Examples -------- From 8c0777ed0d00cafe32fbb1b37e40396898601490 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Sun, 6 Oct 2024 23:08:49 +0530 Subject: [PATCH 118/224] DOC: fix SA01 for pandas.arrays.DatetimeArray (#59982) --- ci/code_checks.sh | 1 - pandas/core/arrays/datetimes.py | 8 ++++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 6f4534ba4a4de..5ef4f26e66134 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -110,7 +110,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.api.types.is_re_compilable PR07,SA01" \ -i "pandas.api.types.pandas_dtype PR07,RT03,SA01" \ -i "pandas.arrays.ArrowExtensionArray PR07,SA01" \ - -i "pandas.arrays.DatetimeArray SA01" \ -i "pandas.arrays.IntegerArray SA01" \ -i "pandas.arrays.IntervalArray.left SA01" \ -i "pandas.arrays.IntervalArray.length SA01" \ diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 43f4428118aa7..41128e52e31b3 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -205,6 +205,14 @@ class DatetimeArray(dtl.TimelikeOps, dtl.DatelikeOps): # type: ignore[misc] ------- None + See Also + -------- + DatetimeIndex : Immutable Index for datetime-like data. + Series : One-dimensional labeled array capable of holding datetime-like data. + Timestamp : Pandas replacement for python datetime.datetime object. + to_datetime : Convert argument to datetime. + period_range : Return a fixed frequency PeriodIndex. 
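As the new cross references hint, a DatetimeArray is normally obtained from an existing datetime-like object rather than through the private constructor shown in the example that follows; a short sketch of the usual route:

    import pandas as pd

    # to_datetime builds a DatetimeIndex; its .array attribute exposes the
    # underlying DatetimeArray
    arr = pd.to_datetime(["2024-01-01", "2024-01-02"]).array
    assert isinstance(arr, pd.arrays.DatetimeArray)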
+ Examples -------- >>> pd.arrays.DatetimeArray._from_sequence( From 4c9620545dcfdf69cc995cf14313e46b75385816 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Sun, 6 Oct 2024 23:09:30 +0530 Subject: [PATCH 119/224] DOC: fix SA01 for pandas.errors.SpecificationError (#59983) --- ci/code_checks.sh | 1 - pandas/errors/__init__.py | 5 +++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 5ef4f26e66134..453c163792fa4 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -169,7 +169,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.errors.PerformanceWarning SA01" \ -i "pandas.errors.PossibleDataLossError SA01" \ -i "pandas.errors.PossiblePrecisionLoss SA01" \ - -i "pandas.errors.SpecificationError SA01" \ -i "pandas.errors.UndefinedVariableError PR01,SA01" \ -i "pandas.errors.UnsortedIndexError SA01" \ -i "pandas.errors.UnsupportedFunctionCall SA01" \ diff --git a/pandas/errors/__init__.py b/pandas/errors/__init__.py index 46e090cc3a589..cf2a9d3f4a238 100644 --- a/pandas/errors/__init__.py +++ b/pandas/errors/__init__.py @@ -444,6 +444,11 @@ class SpecificationError(Exception): The second way is calling ``agg`` on a Dataframe with duplicated functions names without assigning column name. + See Also + -------- + DataFrame.agg : Aggregate using one or more operations over the specified axis. + Series.agg : Aggregate using one or more operations over the specified axis. + Examples -------- >>> df = pd.DataFrame({"A": [1, 1, 1, 2, 2], "B": range(5), "C": range(5)}) From 3c2c5f425ba03508d323f793c933c14bebd39dce Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Sun, 6 Oct 2024 23:10:02 +0530 Subject: [PATCH 120/224] DOC: fix SA01 for pandas.errors.InvalidVersion (#59984) --- ci/code_checks.sh | 1 - pandas/util/version/__init__.py | 8 ++++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 453c163792fa4..5487dc19338da 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -160,7 +160,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.errors.DuplicateLabelError SA01" \ -i "pandas.errors.IntCastingNaNError SA01" \ -i "pandas.errors.InvalidIndexError SA01" \ - -i "pandas.errors.InvalidVersion SA01" \ -i "pandas.errors.NullFrequencyError SA01" \ -i "pandas.errors.NumExprClobberingError SA01" \ -i "pandas.errors.NumbaUtilError SA01" \ diff --git a/pandas/util/version/__init__.py b/pandas/util/version/__init__.py index b5d975a0db1d8..bd741140f6542 100644 --- a/pandas/util/version/__init__.py +++ b/pandas/util/version/__init__.py @@ -114,6 +114,14 @@ class InvalidVersion(ValueError): """ An invalid version was found, users should refer to PEP 440. + The ``InvalidVersion`` exception is raised when a version string is + improperly formatted. Pandas uses this exception to ensure that all + version strings are PEP 440 compliant. + + See Also + -------- + util.version.Version : Class for handling and parsing version strings. + Examples -------- >>> pd.util.version.Version("1.") From 5829e3ea20adc978ebfb82f08d3d5347108be0f0 Mon Sep 17 00:00:00 2001 From: Steffen Rehberg Date: Sun, 6 Oct 2024 23:29:45 +0200 Subject: [PATCH 121/224] DOC: Fix typos in plotting.table (#59986) Fix typos in pandas.plotting.table docstring. 
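With both typos fixed, the docstring example below should run as-is in an environment with matplotlib installed:

    import matplotlib.pyplot as plt
    import pandas as pd

    df = pd.DataFrame({"A": [1, 2], "B": [3, 4]})
    fig, ax = plt.subplots()  # previously misspelled as "fix, ax"
    ax.axis("off")
    table = pd.plotting.table(
        ax, df, loc="center", cellLoc="center", colWidths=[0.2, 0.2]
    )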
--- pandas/plotting/_misc.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/plotting/_misc.py b/pandas/plotting/_misc.py index d8455f44ef0d1..03701f8778065 100644 --- a/pandas/plotting/_misc.py +++ b/pandas/plotting/_misc.py @@ -39,7 +39,7 @@ def table(ax: Axes, data: DataFrame | Series, **kwargs) -> Table: **kwargs Keyword arguments to be passed to matplotlib.table.table. If `rowLabels` or `colLabels` is not specified, data index or column - name will be used. + names will be used. Returns ------- @@ -59,11 +59,11 @@ def table(ax: Axes, data: DataFrame | Series, **kwargs) -> Table: >>> import matplotlib.pyplot as plt >>> df = pd.DataFrame({"A": [1, 2], "B": [3, 4]}) - >>> fix, ax = plt.subplots() + >>> fig, ax = plt.subplots() >>> ax.axis("off") (0.0, 1.0, 0.0, 1.0) >>> table = pd.plotting.table( - ... ax, df, loc="center", cellLoc="center", colWidths=list([0.2, 0.2]) + ... ax, df, loc="center", cellLoc="center", colWidths=[0.2, 0.2] ... ) """ plot_backend = _get_plot_backend("matplotlib") From 2d9c95ddb70f9c68e1ad4893d07bf0f68a23316e Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 7 Oct 2024 10:00:48 -0700 Subject: [PATCH 122/224] Bump mamba-org/setup-micromamba from 1 to 2 (#59988) Bumps [mamba-org/setup-micromamba](https://github.com/mamba-org/setup-micromamba) from 1 to 2. - [Release notes](https://github.com/mamba-org/setup-micromamba/releases) - [Commits](https://github.com/mamba-org/setup-micromamba/compare/v1...v2) --- updated-dependencies: - dependency-name: mamba-org/setup-micromamba dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/package-checks.yml | 2 +- .github/workflows/wheels.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/package-checks.yml b/.github/workflows/package-checks.yml index 6748832903e30..331af6e05b650 100644 --- a/.github/workflows/package-checks.yml +++ b/.github/workflows/package-checks.yml @@ -67,7 +67,7 @@ jobs: fetch-depth: 0 - name: Set up Python - uses: mamba-org/setup-micromamba@v1 + uses: mamba-org/setup-micromamba@v2 with: environment-name: recipe-test create-args: >- diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index 2aaec8c9b56b0..de59a454c827c 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -165,7 +165,7 @@ jobs: CIBW_PLATFORM: ${{ matrix.buildplat[1] == 'pyodide_wasm32' && 'pyodide' || 'auto' }} - name: Set up Python - uses: mamba-org/setup-micromamba@v1 + uses: mamba-org/setup-micromamba@v2 with: environment-name: wheel-env # Use a fixed Python, since we might have an unreleased Python not From e5dc0646bb4b945cec03cc328ac0989cfe0fa60a Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Mon, 7 Oct 2024 22:44:19 +0530 Subject: [PATCH 123/224] DOC: fix RT03,SA01,ES01 for pandas.io.stata.StataReader.value_labels (#59991) --- ci/code_checks.sh | 1 - pandas/io/stata.py | 10 ++++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 5487dc19338da..102abf4be187c 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -174,7 +174,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.errors.ValueLabelTypeMismatch SA01" \ -i "pandas.infer_freq SA01" \ -i "pandas.io.json.build_table_schema PR07,RT03,SA01" \ - -i 
"pandas.io.stata.StataReader.value_labels RT03,SA01" \ -i "pandas.io.stata.StataReader.variable_labels RT03,SA01" \ -i "pandas.io.stata.StataWriter.write_file SA01" \ -i "pandas.json_normalize RT03,SA01" \ diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 6b988d8fed6bf..f1d289726c9c8 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -2076,9 +2076,19 @@ def value_labels(self) -> dict[str, dict[int, str]]: """ Return a nested dict associating each variable name to its value and label. + This method retrieves the value labels from a Stata file. Value labels are + mappings between the coded values and their corresponding descriptive labels + in a Stata dataset. + Returns ------- dict + A python dictionary. + + See Also + -------- + read_stata : Read Stata file into DataFrame. + DataFrame.to_stata : Export DataFrame object to Stata dta format. Examples -------- From b3d0b9622bcd5bdf9733100407bd8b2695bc9af6 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Mon, 7 Oct 2024 22:45:35 +0530 Subject: [PATCH 124/224] DOC: fix RT03,SA01,ES01 for pandas.plotting.lag_plot (#59990) --- ci/code_checks.sh | 1 - pandas/plotting/_misc.py | 11 +++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 102abf4be187c..6a1b613eccb8b 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -178,7 +178,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.io.stata.StataWriter.write_file SA01" \ -i "pandas.json_normalize RT03,SA01" \ -i "pandas.plotting.andrews_curves RT03,SA01" \ - -i "pandas.plotting.lag_plot RT03,SA01" \ -i "pandas.plotting.scatter_matrix PR07,SA01" \ -i "pandas.set_eng_float_format RT03,SA01" \ -i "pandas.tseries.offsets.BDay PR02,SA01" \ diff --git a/pandas/plotting/_misc.py b/pandas/plotting/_misc.py index 03701f8778065..81940613dd2b0 100644 --- a/pandas/plotting/_misc.py +++ b/pandas/plotting/_misc.py @@ -549,6 +549,10 @@ def lag_plot(series: Series, lag: int = 1, ax: Axes | None = None, **kwds) -> Ax """ Lag plot for time series. + A lag plot is a scatter plot of a time series against a lag of itself. It helps + in visualizing the temporal dependence between observations by plotting the values + at time `t` on the x-axis and the values at time `t + lag` on the y-axis. + Parameters ---------- series : Series @@ -563,6 +567,13 @@ def lag_plot(series: Series, lag: int = 1, ax: Axes | None = None, **kwds) -> Ax Returns ------- matplotlib.axes.Axes + The matplotlib Axes object containing the lag plot. + + See Also + -------- + plotting.autocorrelation_plot : Autocorrelation plot for time series. + matplotlib.pyplot.scatter : A scatter plot of y vs. x with varying marker size + and/or color in Matplotlib. Examples -------- From 02267e55586c33a4724dd5e9dbaecfe12e3aa8b4 Mon Sep 17 00:00:00 2001 From: Randolf Scholz Date: Mon, 7 Oct 2024 19:22:27 +0200 Subject: [PATCH 125/224] Typing: Added missing methods to `NaTType` stub (#59995) added missing methods to NaTType stub --- pandas/_libs/tslibs/nattype.pyi | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/pandas/_libs/tslibs/nattype.pyi b/pandas/_libs/tslibs/nattype.pyi index f49e894a0bfec..fa1577f033fff 100644 --- a/pandas/_libs/tslibs/nattype.pyi +++ b/pandas/_libs/tslibs/nattype.pyi @@ -9,6 +9,7 @@ from typing import ( Literal, NoReturn, TypeAlias, + overload, ) import numpy as np @@ -159,15 +160,31 @@ class NaTType: # inject Period properties @property def qyear(self) -> float: ... 
+ # comparisons def __eq__(self, other: object) -> bool: ... def __ne__(self, other: object) -> bool: ... __lt__: _NatComparison __le__: _NatComparison __gt__: _NatComparison __ge__: _NatComparison + # unary operators + def __pos__(self) -> Self: ... + def __neg__(self) -> Self: ... + # binary operators def __sub__(self, other: Self | timedelta | datetime) -> Self: ... def __rsub__(self, other: Self | timedelta | datetime) -> Self: ... def __add__(self, other: Self | timedelta | datetime) -> Self: ... def __radd__(self, other: Self | timedelta | datetime) -> Self: ... + def __mul__(self, other: float) -> Self: ... # analogous to timedelta + def __rmul__(self, other: float) -> Self: ... + @overload # analogous to timedelta + def __truediv__(self, other: Self | timedelta) -> float: ... # Literal[NaN] + @overload + def __truediv__(self, other: float) -> Self: ... + @overload # analogous to timedelta + def __floordiv__(self, other: Self | timedelta) -> float: ... # Literal[NaN] + @overload + def __floordiv__(self, other: float) -> Self: ... + # other def __hash__(self) -> int: ... def as_unit(self, unit: str, round_ok: bool = ...) -> NaTType: ... From 37c31afa1be8b51af545a2dc3354acaf42a9c95e Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Mon, 7 Oct 2024 18:30:40 -0400 Subject: [PATCH 126/224] REGR: groupby.value_counts with all NA values (#59999) * REGR: groupby.value_counts with all NA values * Better implementation --- pandas/core/groupby/ops.py | 2 +- .../groupby/methods/test_value_counts.py | 22 +++++++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index a82e77140d274..b32119a2ddbde 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -867,7 +867,7 @@ def _ob_index_and_ids( names=names, verify_integrity=False, ) - if not consistent_sorting: + if not consistent_sorting and len(ob_index) > 0: # Sort by the levels where the corresponding sort argument is True n_levels = len(sorts) drop_levels = [ diff --git a/pandas/tests/groupby/methods/test_value_counts.py b/pandas/tests/groupby/methods/test_value_counts.py index 8f3022fbe551c..8ca6593a19f20 100644 --- a/pandas/tests/groupby/methods/test_value_counts.py +++ b/pandas/tests/groupby/methods/test_value_counts.py @@ -1219,3 +1219,25 @@ def test_value_counts_sort_categorical(sort, vc_sort, normalize): expected = expected.take(taker) tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("groupby_sort", [True, False]) +def test_value_counts_all_na(sort, dropna, groupby_sort): + # GH#59989 + df = DataFrame({"a": [2, 1, 1], "b": np.nan}) + gb = df.groupby("a", sort=groupby_sort) + result = gb.value_counts(sort=sort, dropna=dropna) + + kwargs = {"levels": [[1, 2], [np.nan]], "names": ["a", "b"]} + if dropna: + data = [] + index = MultiIndex(codes=[[], []], **kwargs) + elif not groupby_sort and not sort: + data = [1, 2] + index = MultiIndex(codes=[[1, 0], [0, 0]], **kwargs) + else: + data = [2, 1] + index = MultiIndex(codes=[[0, 1], [0, 0]], **kwargs) + expected = Series(data, index=index, dtype="int64", name="count") + + tm.assert_series_equal(result, expected) From 5126dcaf88167ff869db874be40a520bb86a27ed Mon Sep 17 00:00:00 2001 From: Deepak Saldanha Date: Tue, 8 Oct 2024 08:05:04 +0530 Subject: [PATCH 127/224] Doc: Update docstring for `dummy_na` parameter (#60000) * update docstring for dummy_na parameter * Update pandas/core/reshape/encoding.py Co-authored-by: Matthew 
Roeschke <10647082+mroeschke@users.noreply.github.com> --------- Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/core/reshape/encoding.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/reshape/encoding.py b/pandas/core/reshape/encoding.py index c397c1c2566a5..33ff182f5baee 100644 --- a/pandas/core/reshape/encoding.py +++ b/pandas/core/reshape/encoding.py @@ -68,7 +68,8 @@ def get_dummies( If appending prefix, separator/delimiter to use. Or pass a list or dictionary as with `prefix`. dummy_na : bool, default False - Add a column to indicate NaNs, if False NaNs are ignored. + If True, a NaN indicator column will be added even if no NaN values are present. + If False, NA values are encoded as all zero. columns : list-like, default None Column names in the DataFrame to be encoded. If `columns` is None then all the columns with From 5ea5bd95d5bb93434fb5f1686f50b176c46dbac8 Mon Sep 17 00:00:00 2001 From: Randolf Scholz Date: Tue, 8 Oct 2024 20:36:41 +0200 Subject: [PATCH 128/224] Typing: More precise NaT stub (#60002) * more precise NaT stub * ruff format * updated == and != to return literal --- pandas/_libs/tslibs/nattype.pyi | 40 +++++++++++++++------------------ 1 file changed, 18 insertions(+), 22 deletions(-) diff --git a/pandas/_libs/tslibs/nattype.pyi b/pandas/_libs/tslibs/nattype.pyi index fa1577f033fff..d3b10fbe79cb9 100644 --- a/pandas/_libs/tslibs/nattype.pyi +++ b/pandas/_libs/tslibs/nattype.pyi @@ -25,12 +25,8 @@ NaT: NaTType iNaT: int nat_strings: set[str] -_NaTComparisonTypes: TypeAlias = ( - datetime | timedelta | Period | np.datetime64 | np.timedelta64 -) - -class _NatComparison: - def __call__(self, other: _NaTComparisonTypes) -> bool: ... +_TimeLike: TypeAlias = datetime | timedelta | Period | np.datetime64 | np.timedelta64 +_TimeDelta: TypeAlias = timedelta | np.timedelta64 class NaTType: _value: np.int64 @@ -161,30 +157,30 @@ class NaTType: @property def qyear(self) -> float: ... # comparisons - def __eq__(self, other: object) -> bool: ... - def __ne__(self, other: object) -> bool: ... - __lt__: _NatComparison - __le__: _NatComparison - __gt__: _NatComparison - __ge__: _NatComparison + def __eq__(self, other: object, /) -> Literal[False]: ... + def __ne__(self, other: object, /) -> Literal[True]: ... + def __lt__(self, other: Self | _TimeLike, /) -> Literal[False]: ... + def __le__(self, other: Self | _TimeLike, /) -> Literal[False]: ... + def __gt__(self, other: Self | _TimeLike, /) -> Literal[False]: ... + def __ge__(self, other: Self | _TimeLike, /) -> Literal[False]: ... # unary operators def __pos__(self) -> Self: ... def __neg__(self) -> Self: ... # binary operators - def __sub__(self, other: Self | timedelta | datetime) -> Self: ... - def __rsub__(self, other: Self | timedelta | datetime) -> Self: ... - def __add__(self, other: Self | timedelta | datetime) -> Self: ... - def __radd__(self, other: Self | timedelta | datetime) -> Self: ... - def __mul__(self, other: float) -> Self: ... # analogous to timedelta - def __rmul__(self, other: float) -> Self: ... + def __sub__(self, other: Self | _TimeLike, /) -> Self: ... + def __rsub__(self, other: Self | _TimeLike, /) -> Self: ... + def __add__(self, other: Self | _TimeLike, /) -> Self: ... + def __radd__(self, other: Self | _TimeLike, /) -> Self: ... + def __mul__(self, other: float, /) -> Self: ... # analogous to timedelta + def __rmul__(self, other: float, /) -> Self: ... 
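The division overloads that follow encode the timedelta-like arithmetic called out in the inline comments: scaling NaT by a number keeps NaT, while dividing NaT by a timedelta degrades to a float NaN. A runtime sketch of what the stub promises:

    import math

    import pandas as pd

    assert pd.NaT * 2 is pd.NaT                     # Self, matching __mul__
    assert math.isnan(pd.NaT / pd.Timedelta("1h"))  # float, matching __truediv__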
@overload # analogous to timedelta - def __truediv__(self, other: Self | timedelta) -> float: ... # Literal[NaN] + def __truediv__(self, other: Self | _TimeDelta, /) -> float: ... # Literal[NaN] @overload - def __truediv__(self, other: float) -> Self: ... + def __truediv__(self, other: float, /) -> Self: ... @overload # analogous to timedelta - def __floordiv__(self, other: Self | timedelta) -> float: ... # Literal[NaN] + def __floordiv__(self, other: Self | _TimeDelta, /) -> float: ... # Literal[NaN] @overload - def __floordiv__(self, other: float) -> Self: ... + def __floordiv__(self, other: float, /) -> Self: ... # other def __hash__(self) -> int: ... def as_unit(self, unit: str, round_ok: bool = ...) -> NaTType: ... From f94860e1ce75b57db9eda2c37154c5b22b661121 Mon Sep 17 00:00:00 2001 From: Deepak Saldanha Date: Wed, 9 Oct 2024 00:11:39 +0530 Subject: [PATCH 129/224] DOC: Refactor _create_delegator_method using functools (#59878) * add tag dt.to_timestamp, series.rst * add doc strings for dt.to_timestamp * update datetimes.py * refactor _create_delegator_method to use functools wrap * changes to accessor.py * remove from code_checks.sh * update code_checks.sh * update code_checks.sh * rewrite functools, adjust unit tests * update change log * remove dup entry * update code_checks.sh * update * revert all dt related changes * update series.rst * update imports * format use of functools import --- ci/code_checks.sh | 20 -------------------- pandas/core/accessor.py | 7 ++++--- pandas/core/arrays/categorical.py | 14 ++++++++++++-- 3 files changed, 16 insertions(+), 25 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 6a1b613eccb8b..6fb675069e81d 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -73,27 +73,7 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Period.freq GL08" \ -i "pandas.Period.ordinal GL08" \ -i "pandas.RangeIndex.from_range PR01,SA01" \ - -i "pandas.Series.cat.add_categories PR01,PR02" \ - -i "pandas.Series.cat.as_ordered PR01" \ - -i "pandas.Series.cat.as_unordered PR01" \ - -i "pandas.Series.cat.remove_categories PR01,PR02" \ - -i "pandas.Series.cat.remove_unused_categories PR01" \ - -i "pandas.Series.cat.rename_categories PR01,PR02" \ - -i "pandas.Series.cat.reorder_categories PR01,PR02" \ - -i "pandas.Series.cat.set_categories PR01,PR02" \ - -i "pandas.Series.dt.as_unit PR01,PR02" \ - -i "pandas.Series.dt.ceil PR01,PR02" \ - -i "pandas.Series.dt.day_name PR01,PR02" \ - -i "pandas.Series.dt.floor PR01,PR02" \ -i "pandas.Series.dt.freq GL08" \ - -i "pandas.Series.dt.month_name PR01,PR02" \ - -i "pandas.Series.dt.normalize PR01" \ - -i "pandas.Series.dt.round PR01,PR02" \ - -i "pandas.Series.dt.strftime PR01,PR02" \ - -i "pandas.Series.dt.to_period PR01,PR02" \ - -i "pandas.Series.dt.total_seconds PR01" \ - -i "pandas.Series.dt.tz_convert PR01,PR02" \ - -i "pandas.Series.dt.tz_localize PR01,PR02" \ -i "pandas.Series.dt.unit GL08" \ -i "pandas.Series.pad PR01,SA01" \ -i "pandas.Timedelta.max PR02" \ diff --git a/pandas/core/accessor.py b/pandas/core/accessor.py index d8463fda34caa..78684eacf2d66 100644 --- a/pandas/core/accessor.py +++ b/pandas/core/accessor.py @@ -7,6 +7,7 @@ from __future__ import annotations +import functools from typing import ( TYPE_CHECKING, final, @@ -117,12 +118,12 @@ def _setter(self, new_values): ) def _create_delegator_method(name: str): + method = getattr(delegate, accessor_mapping(name)) + + @functools.wraps(method) def f(self, *args, **kwargs): return self._delegate_method(name, *args, 
**kwargs) - f.__name__ = name - f.__doc__ = getattr(delegate, accessor_mapping(name)).__doc__ - return f for name in accessors: diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index a69e197df851d..0484ef89f61c2 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1155,6 +1155,12 @@ def rename_categories(self, new_categories) -> Self: """ Rename categories. + This method is commonly used to re-label or adjust the + category names in categorical data without changing the + underlying data. It is useful in situations where you want + to modify the labels used for clarity, consistency, + or readability. + Parameters ---------- new_categories : list-like, dict-like or callable @@ -1371,8 +1377,8 @@ def remove_categories(self, removals) -> Self: """ Remove the specified categories. - `removals` must be included in the old categories. Values which were in - the removed categories will be set to NaN + The ``removals`` argument must be a subset of the current categories. + Any values that were part of the removed categories will be set to NaN. Parameters ---------- @@ -1431,6 +1437,10 @@ def remove_unused_categories(self) -> Self: """ Remove categories which are not used. + This method is useful when working with datasets + that undergo dynamic changes where categories may no longer be + relevant, allowing to maintain a clean, efficient data structure. + Returns ------- Categorical From b975191afe1401f13ab5e15d3df83b5d95dffe75 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Tue, 8 Oct 2024 22:00:43 +0200 Subject: [PATCH 130/224] Fix Styler docstring (#60001) * Fix Styler docstring * Remove blankspaces --- pandas/io/formats/style.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index 6e5ae09485951..eb6773310da69 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -222,6 +222,7 @@ class Styler(StylerRenderer): * ``level`` where `k` is the level in a MultiIndex * Column label cells include + * ``col_heading`` * ``col`` where `n` is the numeric position of the column * ``level`` where `k` is the level in a MultiIndex @@ -231,7 +232,7 @@ class Styler(StylerRenderer): * Trimmed cells include ``col_trim`` or ``row_trim``. Any, or all, or these classes can be renamed by using the ``css_class_names`` - argument in ``Styler.set_table_classes``, giving a value such as + argument in ``Styler.set_table_styles``, giving a value such as *{"row": "MY_ROW_CLASS", "col_trim": "", "row_trim": ""}*. 
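As a rough illustration of that renaming hook (a sketch, not part of the patch; the mapping reuses the example values quoted above):

    import pandas as pd

    df = pd.DataFrame({"A": [1, 2], "B": [3, 4]})
    styler = df.style.set_table_styles(
        css_class_names={"row": "MY_ROW_CLASS", "col_trim": "", "row_trim": ""}
    )
    html = styler.to_html()  # data rows are classed MY_ROW_CLASS0, MY_ROW_CLASS1, ...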
Examples From a0f9140b942d9f596889cd26ac395551dcdf3afb Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 8 Oct 2024 14:51:49 -0700 Subject: [PATCH 131/224] [pre-commit.ci] pre-commit autoupdate (#59998) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * [pre-commit.ci] pre-commit autoupdate updates: - [github.com/astral-sh/ruff-pre-commit: v0.5.0 → v0.6.9](https://github.com/astral-sh/ruff-pre-commit/compare/v0.5.0...v0.6.9) - [github.com/jendrikseipp/vulture: v2.11 → v2.13](https://github.com/jendrikseipp/vulture/compare/v2.11...v2.13) - [github.com/pre-commit/pre-commit-hooks: v4.6.0 → v5.0.0](https://github.com/pre-commit/pre-commit-hooks/compare/v4.6.0...v5.0.0) - [github.com/asottile/pyupgrade: v3.16.0 → v3.17.0](https://github.com/asottile/pyupgrade/compare/v3.16.0...v3.17.0) - [github.com/sphinx-contrib/sphinx-lint: v0.9.1 → v1.0.0](https://github.com/sphinx-contrib/sphinx-lint/compare/v0.9.1...v1.0.0) - [github.com/pre-commit/mirrors-clang-format: v18.1.8 → v19.1.1](https://github.com/pre-commit/mirrors-clang-format/compare/v18.1.8...v19.1.1) * Update .pre-commit-config.yaml * fix style.ipynb, ignore some pylint * pyupgrade * Revert "pyupgrade" This reverts commit b539c71009ff15769c501cf170ed9894a49ddcfb. * don't bump pyupgrade * Typo in random call * Delete hidden cell * Undo max/min rule from ruff --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- .pre-commit-config.yaml | 10 +- doc/source/user_guide/style.ipynb | 689 ++++++++++++++------------ pandas/core/arrays/string_.py | 2 +- pandas/tests/indexes/test_old_base.py | 2 +- pyproject.toml | 9 +- 5 files changed, 396 insertions(+), 316 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f6717dd503c9b..7c9ebf7d94173 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -19,7 +19,7 @@ ci: skip: [pyright, mypy] repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.5.0 + rev: v0.6.9 hooks: - id: ruff args: [--exit-non-zero-on-fix] @@ -34,7 +34,7 @@ repos: - id: ruff-format exclude: ^scripts|^pandas/tests/frame/test_query_eval.py - repo: https://github.com/jendrikseipp/vulture - rev: 'v2.11' + rev: 'v2.13' hooks: - id: vulture entry: python scripts/run_vulture.py @@ -52,7 +52,7 @@ repos: - id: cython-lint - id: double-quote-cython-strings - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.6.0 + rev: v5.0.0 hooks: - id: check-case-conflict - id: check-toml @@ -90,12 +90,12 @@ repos: types: [text] # overwrite types: [rst] types_or: [python, rst] - repo: https://github.com/sphinx-contrib/sphinx-lint - rev: v0.9.1 + rev: v1.0.0 hooks: - id: sphinx-lint args: ["--enable", "all", "--disable", "line-too-long"] - repo: https://github.com/pre-commit/mirrors-clang-format - rev: v18.1.8 + rev: v19.1.1 hooks: - id: clang-format files: ^pandas/_libs/src|^pandas/_libs/include diff --git a/doc/source/user_guide/style.ipynb b/doc/source/user_guide/style.ipynb index daecfce6ecebc..abb7181fc8d72 100644 --- a/doc/source/user_guide/style.ipynb +++ b/doc/source/user_guide/style.ipynb @@ -38,19 +38,6 @@ "[concatfunc]: ../reference/api/pandas.io.formats.style.Styler.concat.rst" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "nbsphinx": "hidden" - }, - "outputs": [], - "source": [ - "import matplotlib.pyplot\n", - "# 
We have this here to trigger matplotlib's font cache stuff.\n", - "# This cell is hidden from the output" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -78,17 +65,13 @@ "source": [ "import pandas as pd\n", "import numpy as np\n", - "import matplotlib as mpl\n", "\n", - "df = pd.DataFrame({\n", - " \"strings\": [\"Adam\", \"Mike\"],\n", - " \"ints\": [1, 3],\n", - " \"floats\": [1.123, 1000.23]\n", - "})\n", - "df.style \\\n", - " .format(precision=3, thousands=\".\", decimal=\",\") \\\n", - " .format_index(str.upper, axis=1) \\\n", - " .relabel_index([\"row 1\", \"row 2\"], axis=0)" + "df = pd.DataFrame(\n", + " {\"strings\": [\"Adam\", \"Mike\"], \"ints\": [1, 3], \"floats\": [1.123, 1000.23]}\n", + ")\n", + "df.style.format(precision=3, thousands=\".\", decimal=\",\").format_index(\n", + " str.upper, axis=1\n", + ").relabel_index([\"row 1\", \"row 2\"], axis=0)" ] }, { @@ -104,17 +87,21 @@ "metadata": {}, "outputs": [], "source": [ - "weather_df = pd.DataFrame(np.random.rand(10,2)*5, \n", - " index=pd.date_range(start=\"2021-01-01\", periods=10),\n", - " columns=[\"Tokyo\", \"Beijing\"])\n", + "weather_df = pd.DataFrame(\n", + " np.random.default_rng().random((10, 2)) * 5,\n", + " index=pd.date_range(start=\"2021-01-01\", periods=10),\n", + " columns=[\"Tokyo\", \"Beijing\"],\n", + ")\n", + "\n", "\n", - "def rain_condition(v): \n", + "def rain_condition(v):\n", " if v < 1.75:\n", " return \"Dry\"\n", " elif v < 2.75:\n", " return \"Rain\"\n", " return \"Heavy Rain\"\n", "\n", + "\n", "def make_pretty(styler):\n", " styler.set_caption(\"Weather Conditions\")\n", " styler.format(rain_condition)\n", @@ -122,6 +109,7 @@ " styler.background_gradient(axis=None, vmin=1, vmax=5, cmap=\"YlGnBu\")\n", " return styler\n", "\n", + "\n", "weather_df" ] }, @@ -157,10 +145,8 @@ "metadata": {}, "outputs": [], "source": [ - "df = pd.DataFrame(np.random.randn(5, 5))\n", - "df.style \\\n", - " .hide(subset=[0, 2, 4], axis=0) \\\n", - " .hide(subset=[0, 2, 4], axis=1)" + "df = pd.DataFrame(np.random.default_rng().standard_normal((5, 5)))\n", + "df.style.hide(subset=[0, 2, 4], axis=0).hide(subset=[0, 2, 4], axis=1)" ] }, { @@ -177,9 +163,9 @@ "outputs": [], "source": [ "show = [0, 2, 4]\n", - "df.style \\\n", - " .hide([row for row in df.index if row not in show], axis=0) \\\n", - " .hide([col for col in df.columns if col not in show], axis=1)" + "df.style.hide([row for row in df.index if row not in show], axis=0).hide(\n", + " [col for col in df.columns if col not in show], axis=1\n", + ")" ] }, { @@ -199,9 +185,9 @@ "metadata": {}, "outputs": [], "source": [ - "summary_styler = df.agg([\"sum\", \"mean\"]).style \\\n", - " .format(precision=3) \\\n", - " .relabel_index([\"Sum\", \"Average\"])\n", + "summary_styler = (\n", + " df.agg([\"sum\", \"mean\"]).style.format(precision=3).relabel_index([\"Sum\", \"Average\"])\n", + ")\n", "df.style.format(precision=1).concat(summary_styler)" ] }, @@ -227,9 +213,16 @@ "metadata": {}, "outputs": [], "source": [ - "df = pd.DataFrame([[38.0, 2.0, 18.0, 22.0, 21, np.nan],[19, 439, 6, 452, 226,232]], \n", - " index=pd.Index(['Tumour (Positive)', 'Non-Tumour (Negative)'], name='Actual Label:'), \n", - " columns=pd.MultiIndex.from_product([['Decision Tree', 'Regression', 'Random'],['Tumour', 'Non-Tumour']], names=['Model:', 'Predicted:']))\n", + "idx = pd.Index([\"Tumour (Positive)\", \"Non-Tumour (Negative)\"], name=\"Actual Label:\")\n", + "cols = pd.MultiIndex.from_product(\n", + " [[\"Decision Tree\", \"Regression\", \"Random\"], [\"Tumour\", 
\"Non-Tumour\"]],\n", + " names=[\"Model:\", \"Predicted:\"],\n", + ")\n", + "df = pd.DataFrame(\n", + " [[38.0, 2.0, 18.0, 22.0, 21, np.nan], [19, 439, 6, 452, 226, 232]],\n", + " index=idx,\n", + " columns=cols,\n", + ")\n", "df.style" ] }, @@ -242,63 +235,68 @@ "outputs": [], "source": [ "# Hidden cell to just create the below example: code is covered throughout the guide.\n", - "s = df.style\\\n", - " .hide([('Random', 'Tumour'), ('Random', 'Non-Tumour')], axis='columns')\\\n", - " .format('{:.0f}')\\\n", - " .set_table_styles([{\n", - " 'selector': '',\n", - " 'props': 'border-collapse: separate;'\n", - " },{\n", - " 'selector': 'caption',\n", - " 'props': 'caption-side: bottom; font-size:1.3em;'\n", - " },{\n", - " 'selector': '.index_name',\n", - " 'props': 'font-style: italic; color: darkgrey; font-weight:normal;'\n", - " },{\n", - " 'selector': 'th:not(.index_name)',\n", - " 'props': 'background-color: #000066; color: white;'\n", - " },{\n", - " 'selector': 'th.col_heading',\n", - " 'props': 'text-align: center;'\n", - " },{\n", - " 'selector': 'th.col_heading.level0',\n", - " 'props': 'font-size: 1.5em;'\n", - " },{\n", - " 'selector': 'th.col2',\n", - " 'props': 'border-left: 1px solid white;'\n", - " },{\n", - " 'selector': '.col2',\n", - " 'props': 'border-left: 1px solid #000066;'\n", - " },{\n", - " 'selector': 'td',\n", - " 'props': 'text-align: center; font-weight:bold;'\n", - " },{\n", - " 'selector': '.true',\n", - " 'props': 'background-color: #e6ffe6;'\n", - " },{\n", - " 'selector': '.false',\n", - " 'props': 'background-color: #ffe6e6;'\n", - " },{\n", - " 'selector': '.border-red',\n", - " 'props': 'border: 2px dashed red;'\n", - " },{\n", - " 'selector': '.border-green',\n", - " 'props': 'border: 2px dashed green;'\n", - " },{\n", - " 'selector': 'td:hover',\n", - " 'props': 'background-color: #ffffb3;'\n", - " }])\\\n", - " .set_td_classes(pd.DataFrame([['true border-green', 'false', 'true', 'false border-red', '', ''],\n", - " ['false', 'true', 'false', 'true', '', '']], \n", - " index=df.index, columns=df.columns))\\\n", - " .set_caption(\"Confusion matrix for multiple cancer prediction models.\")\\\n", - " .set_tooltips(pd.DataFrame([['This model has a very strong true positive rate', '', '', \"This model's total number of false negatives is too high\", '', ''],\n", - " ['', '', '', '', '', '']], \n", - " index=df.index, columns=df.columns),\n", - " css_class='pd-tt', props=\n", - " 'visibility: hidden; position: absolute; z-index: 1; border: 1px solid #000066;'\n", - " 'background-color: white; color: #000066; font-size: 0.8em;' \n", - " 'transform: translate(0px, -24px); padding: 0.6em; border-radius: 0.5em;')\n" + "s = (\n", + " df.style.hide([(\"Random\", \"Tumour\"), (\"Random\", \"Non-Tumour\")], axis=\"columns\")\n", + " .format(\"{:.0f}\")\n", + " .set_table_styles(\n", + " [\n", + " {\"selector\": \"\", \"props\": \"border-collapse: separate;\"},\n", + " {\"selector\": \"caption\", \"props\": \"caption-side: bottom; font-size:1.3em;\"},\n", + " {\n", + " \"selector\": \".index_name\",\n", + " \"props\": \"font-style: italic; color: darkgrey; font-weight:normal;\",\n", + " },\n", + " {\n", + " \"selector\": \"th:not(.index_name)\",\n", + " \"props\": \"background-color: #000066; color: white;\",\n", + " },\n", + " {\"selector\": \"th.col_heading\", \"props\": \"text-align: center;\"},\n", + " {\"selector\": \"th.col_heading.level0\", \"props\": \"font-size: 1.5em;\"},\n", + " {\"selector\": \"th.col2\", \"props\": \"border-left: 1px solid white;\"},\n", 
+ " {\"selector\": \".col2\", \"props\": \"border-left: 1px solid #000066;\"},\n", + " {\"selector\": \"td\", \"props\": \"text-align: center; font-weight:bold;\"},\n", + " {\"selector\": \".true\", \"props\": \"background-color: #e6ffe6;\"},\n", + " {\"selector\": \".false\", \"props\": \"background-color: #ffe6e6;\"},\n", + " {\"selector\": \".border-red\", \"props\": \"border: 2px dashed red;\"},\n", + " {\"selector\": \".border-green\", \"props\": \"border: 2px dashed green;\"},\n", + " {\"selector\": \"td:hover\", \"props\": \"background-color: #ffffb3;\"},\n", + " ]\n", + " )\n", + " .set_td_classes(\n", + " pd.DataFrame(\n", + " [\n", + " [\"true border-green\", \"false\", \"true\", \"false border-red\", \"\", \"\"],\n", + " [\"false\", \"true\", \"false\", \"true\", \"\", \"\"],\n", + " ],\n", + " index=df.index,\n", + " columns=df.columns,\n", + " )\n", + " )\n", + " .set_caption(\"Confusion matrix for multiple cancer prediction models.\")\n", + " .set_tooltips(\n", + " pd.DataFrame(\n", + " [\n", + " [\n", + " \"This model has a very strong true positive rate\",\n", + " \"\",\n", + " \"\",\n", + " \"This model's total number of false negatives is too high\",\n", + " \"\",\n", + " \"\",\n", + " ],\n", + " [\"\", \"\", \"\", \"\", \"\", \"\"],\n", + " ],\n", + " index=df.index,\n", + " columns=df.columns,\n", + " ),\n", + " css_class=\"pd-tt\",\n", + " props=\"visibility: hidden; \"\n", + " \"position: absolute; z-index: 1; \"\n", + " \"border: 1px solid #000066;\"\n", + " \"background-color: white; color: #000066; font-size: 0.8em;\"\n", + " \"transform: translate(0px, -24px); padding: 0.6em; border-radius: 0.5em;\",\n", + " )\n", + ")" ] }, { @@ -325,7 +323,9 @@ "metadata": {}, "outputs": [], "source": [ - "s = df.style.format('{:.0f}').hide([('Random', 'Tumour'), ('Random', 'Non-Tumour')], axis=\"columns\")\n", + "s = df.style.format(\"{:.0f}\").hide(\n", + " [(\"Random\", \"Tumour\"), (\"Random\", \"Non-Tumour\")], axis=\"columns\"\n", + ")\n", "s" ] }, @@ -337,8 +337,8 @@ }, "outputs": [], "source": [ - "# Hidden cell to avoid CSS clashes and latter code upcoding previous formatting \n", - "s.set_uuid('after_hide')" + "# Hidden cell to avoid CSS clashes and latter code upcoding previous formatting\n", + "s.set_uuid(\"after_hide\")" ] }, { @@ -395,16 +395,16 @@ "outputs": [], "source": [ "cell_hover = { # for row hover use instead of \n", - " 'selector': 'td:hover',\n", - " 'props': [('background-color', '#ffffb3')]\n", + " \"selector\": \"td:hover\",\n", + " \"props\": [(\"background-color\", \"#ffffb3\")],\n", "}\n", "index_names = {\n", - " 'selector': '.index_name',\n", - " 'props': 'font-style: italic; color: darkgrey; font-weight:normal;'\n", + " \"selector\": \".index_name\",\n", + " \"props\": \"font-style: italic; color: darkgrey; font-weight:normal;\",\n", "}\n", "headers = {\n", - " 'selector': 'th:not(.index_name)',\n", - " 'props': 'background-color: #000066; color: white;'\n", + " \"selector\": \"th:not(.index_name)\",\n", + " \"props\": \"background-color: #000066; color: white;\",\n", "}\n", "s.set_table_styles([cell_hover, index_names, headers])" ] @@ -417,8 +417,8 @@ }, "outputs": [], "source": [ - "# Hidden cell to avoid CSS clashes and latter code upcoding previous formatting \n", - "s.set_uuid('after_tab_styles1')" + "# Hidden cell to avoid CSS clashes and latter code upcoding previous formatting\n", + "s.set_uuid(\"after_tab_styles1\")" ] }, { @@ -434,11 +434,14 @@ "metadata": {}, "outputs": [], "source": [ - "s.set_table_styles([\n", - " {'selector': 
'th.col_heading', 'props': 'text-align: center;'},\n", - " {'selector': 'th.col_heading.level0', 'props': 'font-size: 1.5em;'},\n", - " {'selector': 'td', 'props': 'text-align: center; font-weight: bold;'},\n", - "], overwrite=False)" + "s.set_table_styles(\n", + " [\n", + " {\"selector\": \"th.col_heading\", \"props\": \"text-align: center;\"},\n", + " {\"selector\": \"th.col_heading.level0\", \"props\": \"font-size: 1.5em;\"},\n", + " {\"selector\": \"td\", \"props\": \"text-align: center; font-weight: bold;\"},\n", + " ],\n", + " overwrite=False,\n", + ")" ] }, { @@ -449,8 +452,8 @@ }, "outputs": [], "source": [ - "# Hidden cell to avoid CSS clashes and latter code upcoding previous formatting \n", - "s.set_uuid('after_tab_styles2')" + "# Hidden cell to avoid CSS clashes and latter code upcoding previous formatting\n", + "s.set_uuid(\"after_tab_styles2\")" ] }, { @@ -468,10 +471,16 @@ "metadata": {}, "outputs": [], "source": [ - "s.set_table_styles({\n", - " ('Regression', 'Tumour'): [{'selector': 'th', 'props': 'border-left: 1px solid white'},\n", - " {'selector': 'td', 'props': 'border-left: 1px solid #000066'}]\n", - "}, overwrite=False, axis=0)" + "s.set_table_styles(\n", + " {\n", + " (\"Regression\", \"Tumour\"): [\n", + " {\"selector\": \"th\", \"props\": \"border-left: 1px solid white\"},\n", + " {\"selector\": \"td\", \"props\": \"border-left: 1px solid #000066\"},\n", + " ]\n", + " },\n", + " overwrite=False,\n", + " axis=0,\n", + ")" ] }, { @@ -482,8 +491,8 @@ }, "outputs": [], "source": [ - "# Hidden cell to avoid CSS clashes and latter code upcoding previous formatting \n", - "s.set_uuid('xyz01')" + "# Hidden cell to avoid CSS clashes and latter code upcoding previous formatting\n", + "s.set_uuid(\"xyz01\")" ] }, { @@ -508,7 +517,7 @@ "outputs": [], "source": [ "out = s.set_table_attributes('class=\"my-table-cls\"').to_html()\n", - "print(out[out.find('