From 85be99eac9b78afcf98955cd85c60d75c5726242 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 5 Sep 2024 07:24:22 -1000 Subject: [PATCH 001/224] PERF: CategoricalDtype.update_dtype (#59647) * PERF: CategoricalDtype.update_dtype * Add whatsnew number add comment * Fix unit test * short circut only for the dtype --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/dtypes/dtypes.py | 7 +++++++ 2 files changed, 8 insertions(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 75d3ff1193f8d..cd353b60d1a6e 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -528,6 +528,7 @@ Performance improvements - Performance improvement in :meth:`RangeIndex.reindex` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57647`, :issue:`57752`) - Performance improvement in :meth:`RangeIndex.take` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57445`, :issue:`57752`) - Performance improvement in :func:`merge` if hash-join can be used (:issue:`57970`) +- Performance improvement in :meth:`CategoricalDtype.update_dtype` when ``dtype`` is a :class:`CategoricalDtype` with non ``None`` categories and ordered (:issue:`59647`) - Performance improvement in :meth:`to_hdf` avoid unnecessary reopenings of the HDF5 file to speedup data addition to files with a very large number of groups . (:issue:`58248`) - Performance improvement in ``DataFrameGroupBy.__len__`` and ``SeriesGroupBy.__len__`` (:issue:`57595`) - Performance improvement in indexing operations for string dtypes (:issue:`56997`) diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 54003e67be7ba..68b4807961d19 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -611,6 +611,13 @@ def update_dtype(self, dtype: str_type | CategoricalDtype) -> CategoricalDtype: dtype = cast(CategoricalDtype, dtype) # update categories/ordered unless they've been explicitly passed as None + if ( + isinstance(dtype, CategoricalDtype) + and dtype.categories is not None + and dtype.ordered is not None + ): + # Avoid re-validation in CategoricalDtype constructor + return dtype new_categories = ( dtype.categories if dtype.categories is not None else self.categories ) From 4f1052e390ea6d33e81ec1dc7c6801bb6b5b79ef Mon Sep 17 00:00:00 2001 From: "Mien (Josephine) Nguyen" Date: Thu, 5 Sep 2024 14:07:07 -0400 Subject: [PATCH 002/224] TST: Update BooleanArray _logical_method test to fail on incorrect length comparison operator (#59708) Test --- pandas/tests/arrays/boolean/test_logical.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/pandas/tests/arrays/boolean/test_logical.py b/pandas/tests/arrays/boolean/test_logical.py index 66c117ea3fc66..97a24e0f24756 100644 --- a/pandas/tests/arrays/boolean/test_logical.py +++ b/pandas/tests/arrays/boolean/test_logical.py @@ -60,19 +60,20 @@ def test_eq_mismatched_type(self, other): expected = pd.array([True, True]) tm.assert_extension_array_equal(result, expected) - def test_logical_length_mismatch_raises(self, all_logical_operators): + @pytest.mark.parametrize("other", [[True, False], [True, False, True, False]]) + def test_logical_length_mismatch_raises(self, other, all_logical_operators): op_name = all_logical_operators a = pd.array([True, False, None], dtype="boolean") msg = "Lengths must match" with pytest.raises(ValueError, match=msg): - getattr(a, op_name)([True, False]) + 
getattr(a, op_name)(other) with pytest.raises(ValueError, match=msg): - getattr(a, op_name)(np.array([True, False])) + getattr(a, op_name)(np.array(other)) with pytest.raises(ValueError, match=msg): - getattr(a, op_name)(pd.array([True, False], dtype="boolean")) + getattr(a, op_name)(pd.array(other, dtype="boolean")) def test_logical_nan_raises(self, all_logical_operators): op_name = all_logical_operators From 6c30aa22c4537e3ccf5fd968d00c328cd1865545 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 5 Sep 2024 16:21:06 -0700 Subject: [PATCH 003/224] REF (string): de-duplicate _str_contains (#59709) * REF: de-duplicate _str_contains * pyright ignore --- pandas/core/arrays/_arrow_string_mixins.py | 15 +++++++++++++++ pandas/core/arrays/arrow/array.py | 15 --------------- pandas/core/arrays/string_arrow.py | 14 ++++---------- 3 files changed, 19 insertions(+), 25 deletions(-) diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index ba20111e0d858..5b34a7e2c7cef 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -190,3 +190,18 @@ def _str_istitle(self): def _str_isupper(self): result = pc.utf8_is_upper(self._pa_array) return self._convert_bool_result(result) + + def _str_contains( + self, pat, case: bool = True, flags: int = 0, na=None, regex: bool = True + ): + if flags: + raise NotImplementedError(f"contains not implemented with {flags=}") + + if regex: + pa_contains = pc.match_substring_regex + else: + pa_contains = pc.match_substring + result = pa_contains(self._pa_array, pat, ignore_case=not case) + if not isna(na): # pyright: ignore [reportGeneralTypeIssues] + result = result.fill_null(na) + return self._convert_bool_result(result) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 807854a13f285..40819ba4ab338 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -2322,21 +2322,6 @@ def _str_count(self, pat: str, flags: int = 0) -> Self: raise NotImplementedError(f"count not implemented with {flags=}") return type(self)(pc.count_substring_regex(self._pa_array, pat)) - def _str_contains( - self, pat, case: bool = True, flags: int = 0, na=None, regex: bool = True - ) -> Self: - if flags: - raise NotImplementedError(f"contains not implemented with {flags=}") - - if regex: - pa_contains = pc.match_substring_regex - else: - pa_contains = pc.match_substring - result = pa_contains(self._pa_array, pat, ignore_case=not case) - if not isna(na): - result = result.fill_null(na) - return type(self)(result) - def _result_converter(self, result): return type(self)(result) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 6dd0ca2de11ba..e18beb629d0c4 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -223,10 +223,8 @@ def insert(self, loc: int, item) -> ArrowStringArray: raise TypeError("Scalar must be NA or str") return super().insert(loc, item) - def _convert_bool_result(self, values, na=None): + def _convert_bool_result(self, values): if self.dtype.na_value is np.nan: - if not isna(na): - values = values.fill_null(bool(na)) return ArrowExtensionArray(values).to_numpy(na_value=np.nan) return BooleanDtype().__from_arrow__(values) @@ -304,11 +302,6 @@ def _str_contains( fallback_performancewarning() return super()._str_contains(pat, case, flags, na, regex) - if regex: - result = pc.match_substring_regex(self._pa_array, pat, ignore_case=not 
case) - else: - result = pc.match_substring(self._pa_array, pat, ignore_case=not case) - result = self._convert_bool_result(result, na=na) if not isna(na): if not isinstance(na, bool): # GH#59561 @@ -318,8 +311,9 @@ def _str_contains( FutureWarning, stacklevel=find_stack_level(), ) - result[isna(result)] = bool(na) - return result + na = bool(na) + + return ArrowStringArrayMixin._str_contains(self, pat, case, flags, na, regex) def _str_replace( self, From 3a4526516ae2e64cd9815e87c4c9e23c24b191e9 Mon Sep 17 00:00:00 2001 From: Manlai Amar <70603274+amanlai@users.noreply.github.com> Date: Thu, 5 Sep 2024 17:47:52 -0700 Subject: [PATCH 004/224] DOC: Fix some docstring validation errors #59698 (#59713) * fix some docstring errors * removed trailing whitespace * pd.Series.dt.microseconds has the same documentation as pd.TimedeltaIndex.microseconds and SA01 was cleared for both in the previous commit --- ci/code_checks.sh | 4 ---- pandas/_libs/tslibs/timedeltas.pyx | 13 ++++++++++--- pandas/core/arrays/timedeltas.py | 12 ++++++++++++ 3 files changed, 22 insertions(+), 7 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 0714c6f74f0c2..fcbeb20d083d6 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -92,7 +92,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Series.dt.day_name PR01,PR02" \ -i "pandas.Series.dt.floor PR01,PR02" \ -i "pandas.Series.dt.freq GL08" \ - -i "pandas.Series.dt.microseconds SA01" \ -i "pandas.Series.dt.month_name PR01,PR02" \ -i "pandas.Series.dt.nanoseconds SA01" \ -i "pandas.Series.dt.normalize PR01" \ @@ -113,12 +112,9 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Timedelta.max PR02" \ -i "pandas.Timedelta.min PR02" \ -i "pandas.Timedelta.resolution PR02" \ - -i "pandas.Timedelta.to_numpy PR01" \ -i "pandas.Timedelta.to_timedelta64 SA01" \ -i "pandas.Timedelta.total_seconds SA01" \ -i "pandas.Timedelta.view SA01" \ - -i "pandas.TimedeltaIndex.components SA01" \ - -i "pandas.TimedeltaIndex.microseconds SA01" \ -i "pandas.TimedeltaIndex.nanoseconds SA01" \ -i "pandas.TimedeltaIndex.seconds SA01" \ -i "pandas.TimedeltaIndex.to_pytimedelta RT03,SA01" \ diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index 36be1812b0187..a7bc2de5ad837 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -1421,9 +1421,16 @@ cdef class _Timedelta(timedelta): """ Convert the Timedelta to a NumPy timedelta64. - This is an alias method for `Timedelta.to_timedelta64()`. The dtype and - copy parameters are available here only for compatibility. Their values - will not affect the return value. + This is an alias method for `Timedelta.to_timedelta64()`. + + Parameters + ---------- + dtype : NoneType + It is available here only for compatibility. Its value will not + affect the return value. + copy : bool, default False + It is available here only for compatibility. Its value will not + affect the return value. Returns ------- diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index b2cfbe7338c0d..c8a86ffc187d0 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -876,6 +876,12 @@ def to_pytimedelta(self) -> npt.NDArray[np.object_]: microseconds_docstring = textwrap.dedent( """Number of microseconds (>= 0 and less than 1 second) for each element. + See Also + -------- + pd.Timedelta.microseconds : Number of microseconds (>= 0 and less than 1 second). 
+ pd.Timedelta.to_pytimedelta.microseconds : Number of microseconds (>= 0 and less + than 1 second) of a datetime.timedelta. + Examples -------- For Series: @@ -955,6 +961,12 @@ def components(self) -> DataFrame: ------- DataFrame + See Also + -------- + TimedeltaIndex.total_seconds : Return total duration expressed in seconds. + Timedelta.components : Return a components namedtuple-like of a single + timedelta. + Examples -------- >>> tdelta_idx = pd.to_timedelta(["1 day 3 min 2 us 42 ns"]) From 08431f17333a91f8191146646b2a136f91bfe7d2 Mon Sep 17 00:00:00 2001 From: Deepak Kapila Date: Fri, 6 Sep 2024 09:53:43 -0400 Subject: [PATCH 005/224] DOC: Clarify docs for df.to_sql (#59727) --- pandas/core/generic.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index bc47b662a08d3..42516f0a85e07 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2812,8 +2812,8 @@ def to_sql( `index` is True, then the index names are used. A sequence should be given if the DataFrame uses MultiIndex. chunksize : int, optional - Specify the number of rows in each batch to be written at a time. - By default, all rows will be written at once. + Specify the number of rows in each batch to be written to the database connection at a time. + By default, all rows will be written at once. Also see the method keyword. dtype : dict or scalar, optional Specifying the datatype for columns. If a dictionary is used, the keys should be the column names and the values should be the From 3f8d3e495a3a26f0be960ec70dee20e2411a4bb4 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 6 Sep 2024 08:06:15 -0700 Subject: [PATCH 006/224] BUG (string): ArrowStringArray.find corner cases (#59562) --- pandas/core/arrays/_arrow_string_mixins.py | 44 +++++++++++++++++++++- pandas/core/arrays/arrow/array.py | 23 ----------- pandas/core/arrays/string_arrow.py | 18 ++++----- pandas/tests/extension/test_arrow.py | 31 ++++++--------- 4 files changed, 61 insertions(+), 55 deletions(-) diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index 5b34a7e2c7cef..950d4cd7cc92e 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -3,6 +3,7 @@ from functools import partial from typing import ( TYPE_CHECKING, + Any, Literal, ) @@ -10,6 +11,7 @@ from pandas.compat import ( pa_version_under10p1, + pa_version_under13p0, pa_version_under17p0, ) @@ -20,7 +22,10 @@ import pyarrow.compute as pc if TYPE_CHECKING: - from collections.abc import Sized + from collections.abc import ( + Callable, + Sized, + ) from pandas._typing import ( Scalar, @@ -42,6 +47,9 @@ def _convert_int_result(self, result): # Convert an integer-dtype result to the appropriate result type raise NotImplementedError + def _apply_elementwise(self, func: Callable) -> list[list[Any]]: + raise NotImplementedError + def _str_pad( self, width: int, @@ -205,3 +213,37 @@ def _str_contains( if not isna(na): # pyright: ignore [reportGeneralTypeIssues] result = result.fill_null(na) return self._convert_bool_result(result) + + def _str_find(self, sub: str, start: int = 0, end: int | None = None): + if ( + pa_version_under13p0 + and not (start != 0 and end is not None) + and not (start == 0 and end is None) + ): + # GH#59562 + res_list = self._apply_elementwise(lambda val: val.find(sub, start, end)) + return self._convert_int_result(pa.chunked_array(res_list)) + + if (start == 0 or start is None) and end is None: + 
result = pc.find_substring(self._pa_array, sub) + else: + if sub == "": + # GH#56792 + res_list = self._apply_elementwise( + lambda val: val.find(sub, start, end) + ) + return self._convert_int_result(pa.chunked_array(res_list)) + if start is None: + start_offset = 0 + start = 0 + elif start < 0: + start_offset = pc.add(start, pc.utf8_length(self._pa_array)) + start_offset = pc.if_else(pc.less(start_offset, 0), 0, start_offset) + else: + start_offset = start + slices = pc.utf8_slice_codeunits(self._pa_array, start, stop=end) + result = pc.find_substring(slices, sub) + found = pc.not_equal(result, pa.scalar(-1, type=result.type)) + offset_result = pc.add(result, start_offset) + result = pc.if_else(found, offset_result, -1) + return self._convert_int_result(result) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 40819ba4ab338..15f9ba611a642 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -2373,29 +2373,6 @@ def _str_fullmatch( pat = f"{pat}$" return self._str_match(pat, case, flags, na) - def _str_find(self, sub: str, start: int = 0, end: int | None = None) -> Self: - if (start == 0 or start is None) and end is None: - result = pc.find_substring(self._pa_array, sub) - else: - if sub == "": - # GH 56792 - result = self._apply_elementwise(lambda val: val.find(sub, start, end)) - return type(self)(pa.chunked_array(result)) - if start is None: - start_offset = 0 - start = 0 - elif start < 0: - start_offset = pc.add(start, pc.utf8_length(self._pa_array)) - start_offset = pc.if_else(pc.less(start_offset, 0), 0, start_offset) - else: - start_offset = start - slices = pc.utf8_slice_codeunits(self._pa_array, start, stop=end) - result = pc.find_substring(slices, sub) - found = pc.not_equal(result, pa.scalar(-1, type=result.type)) - offset_result = pc.add(result, start_offset) - result = pc.if_else(found, offset_result, -1) - return type(self)(result) - def _str_join(self, sep: str) -> Self: if pa.types.is_string(self._pa_array.type) or pa.types.is_large_string( self._pa_array.type diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index e18beb629d0c4..97381b82ceab9 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -416,18 +416,14 @@ def _str_count(self, pat: str, flags: int = 0): return self._convert_int_result(result) def _str_find(self, sub: str, start: int = 0, end: int | None = None): - if start != 0 and end is not None: - slices = pc.utf8_slice_codeunits(self._pa_array, start, stop=end) - result = pc.find_substring(slices, sub) - not_found = pc.equal(result, -1) - offset_result = pc.add(result, end - start) - result = pc.if_else(not_found, result, offset_result) - elif start == 0 and end is None: - slices = self._pa_array - result = pc.find_substring(slices, sub) - else: + if ( + pa_version_under13p0 + and not (start != 0 and end is not None) + and not (start == 0 and end is None) + ): + # GH#59562 return super()._str_find(sub, start, end) - return self._convert_int_result(result) + return ArrowStringArrayMixin._str_find(self, sub, start, end) def _str_get_dummies(self, sep: str = "|"): dummies_pa, labels = ArrowExtensionArray(self._pa_array)._str_get_dummies(sep) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 3dbdda388d035..fc4f14882b9d7 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -32,8 +32,6 @@ import numpy as np import pytest -from 
pandas._config import using_string_dtype - from pandas._libs import lib from pandas._libs.tslibs import timezones from pandas.compat import ( @@ -1947,14 +1945,9 @@ def test_str_find_negative_start(): def test_str_find_no_end(): ser = pd.Series(["abc", None], dtype=ArrowDtype(pa.string())) - if pa_version_under13p0: - # https://github.com/apache/arrow/issues/36311 - with pytest.raises(pa.lib.ArrowInvalid, match="Negative buffer resize"): - ser.str.find("ab", start=1) - else: - result = ser.str.find("ab", start=1) - expected = pd.Series([-1, None], dtype="int64[pyarrow]") - tm.assert_series_equal(result, expected) + result = ser.str.find("ab", start=1) + expected = pd.Series([-1, None], dtype="int64[pyarrow]") + tm.assert_series_equal(result, expected) def test_str_find_negative_start_negative_end(): @@ -1968,17 +1961,11 @@ def test_str_find_negative_start_negative_end(): def test_str_find_large_start(): # GH 56791 ser = pd.Series(["abcdefg", None], dtype=ArrowDtype(pa.string())) - if pa_version_under13p0: - # https://github.com/apache/arrow/issues/36311 - with pytest.raises(pa.lib.ArrowInvalid, match="Negative buffer resize"): - ser.str.find(sub="d", start=16) - else: - result = ser.str.find(sub="d", start=16) - expected = pd.Series([-1, None], dtype=ArrowDtype(pa.int64())) - tm.assert_series_equal(result, expected) + result = ser.str.find(sub="d", start=16) + expected = pd.Series([-1, None], dtype=ArrowDtype(pa.int64())) + tm.assert_series_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.skipif( pa_version_under13p0, reason="https://github.com/apache/arrow/issues/36311" ) @@ -1990,11 +1977,15 @@ def test_str_find_e2e(start, end, sub): ["abcaadef", "abc", "abcdeddefgj8292", "ab", "a", ""], dtype=ArrowDtype(pa.string()), ) - object_series = s.astype(pd.StringDtype()) + object_series = s.astype(pd.StringDtype(storage="python")) result = s.str.find(sub, start, end) expected = object_series.str.find(sub, start, end).astype(result.dtype) tm.assert_series_equal(result, expected) + arrow_str_series = s.astype(pd.StringDtype(storage="pyarrow")) + result2 = arrow_str_series.str.find(sub, start, end).astype(result.dtype) + tm.assert_series_equal(result2, expected) + def test_str_find_negative_start_negative_end_no_match(): # GH 56791 From 38ccb331b15dd301a85b3413673ae144498d4c1f Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Fri, 6 Sep 2024 23:06:00 +0530 Subject: [PATCH 007/224] DOC: fix SA01 for pandas.Period.to_timestamp (#59730) --- ci/code_checks.sh | 1 - pandas/_libs/tslibs/period.pyx | 6 ++++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index fcbeb20d083d6..2fc9c1a83c097 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -73,7 +73,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.NA SA01" \ -i "pandas.Period.freq GL08" \ -i "pandas.Period.ordinal GL08" \ - -i "pandas.Period.to_timestamp SA01" \ -i "pandas.PeriodDtype.freq SA01" \ -i "pandas.RangeIndex.from_range PR01,SA01" \ -i "pandas.RangeIndex.start SA01" \ diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index e4771feeb804e..c563ab91c4142 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -2001,6 +2001,12 @@ cdef class _Period(PeriodMixin): ------- Timestamp + See Also + -------- + Timestamp : A class representing a single point in time. + Period : Represents a span of time with a fixed frequency. 
+ PeriodIndex.to_timestamp : Convert a `PeriodIndex` to a `DatetimeIndex`. + Examples -------- >>> period = pd.Period('2023-1-1', freq='D') From 8cd761a2d1553d7dfa986f3c574f03f2fc62587e Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Fri, 6 Sep 2024 23:06:50 +0530 Subject: [PATCH 008/224] DOC: fix SA01,ES01 for pandas.Timedelta.view (#59733) --- ci/code_checks.sh | 1 - pandas/_libs/tslibs/timedeltas.pyx | 15 +++++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 2fc9c1a83c097..7ed5103b3b796 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -113,7 +113,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Timedelta.resolution PR02" \ -i "pandas.Timedelta.to_timedelta64 SA01" \ -i "pandas.Timedelta.total_seconds SA01" \ - -i "pandas.Timedelta.view SA01" \ -i "pandas.TimedeltaIndex.nanoseconds SA01" \ -i "pandas.TimedeltaIndex.seconds SA01" \ -i "pandas.TimedeltaIndex.to_pytimedelta RT03,SA01" \ diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index a7bc2de5ad837..4f90f26cf31ab 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -1458,11 +1458,26 @@ cdef class _Timedelta(timedelta): """ Array view compatibility. + This method allows you to reinterpret the underlying data of a Timedelta + object as a different dtype. The `view` method provides a way to reinterpret + the internal representation of the `Timedelta` object without modifying its + data. This is particularly useful when you need to work with the underlying + data directly, such as for performance optimizations or interfacing with + low-level APIs. The returned value is typically the number of nanoseconds + since the epoch, represented as an integer or another specified dtype. + Parameters ---------- dtype : str or dtype The dtype to view the underlying data as. + See Also + -------- + numpy.ndarray.view : Returns a view of an array with the same data. + Timedelta.to_numpy : Converts the Timedelta to a NumPy timedelta64. + Timedelta.total_seconds : Returns the total duration of the Timedelta + object in seconds. 
+ Examples -------- >>> td = pd.Timedelta('3D') From 4a16b44bfabc70854f1d3a1447e7050725ff16d9 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 6 Sep 2024 19:37:42 +0200 Subject: [PATCH 009/224] String dtype: implement _get_common_dtype (#59682) * String dtype: implement _get_common_dtype * add specific tests * try fix typing * try fix typing * suppress typing error * support numpy 2.0 string * fix typo --- pandas/core/arrays/string_.py | 32 ++++++++- pandas/tests/arrays/categorical/test_api.py | 3 - pandas/tests/arrays/string_/test_concat.py | 73 +++++++++++++++++++++ 3 files changed, 103 insertions(+), 5 deletions(-) create mode 100644 pandas/tests/arrays/string_/test_concat.py diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 143a13c54dbbb..88fd1481031f8 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -171,9 +171,9 @@ def __init__( # a consistent NaN value (and we can use `dtype.na_value is np.nan`) na_value = np.nan elif na_value is not libmissing.NA: - raise ValueError("'na_value' must be np.nan or pd.NA, got {na_value}") + raise ValueError(f"'na_value' must be np.nan or pd.NA, got {na_value}") - self.storage = storage + self.storage = cast(str, storage) self._na_value = na_value def __repr__(self) -> str: @@ -284,6 +284,34 @@ def construct_array_type( # type: ignore[override] else: return ArrowStringArrayNumpySemantics + def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None: + storages = set() + na_values = set() + + for dtype in dtypes: + if isinstance(dtype, StringDtype): + storages.add(dtype.storage) + na_values.add(dtype.na_value) + elif isinstance(dtype, np.dtype) and dtype.kind in ("U", "T"): + continue + else: + return None + + if len(storages) == 2: + # if both python and pyarrow storage -> priority to pyarrow + storage = "pyarrow" + else: + storage = next(iter(storages)) # type: ignore[assignment] + + na_value: libmissing.NAType | float + if len(na_values) == 2: + # if both NaN and NA -> priority to NA + na_value = libmissing.NA + else: + na_value = next(iter(na_values)) + + return StringDtype(storage=storage, na_value=na_value) + def __from_arrow__( self, array: pyarrow.Array | pyarrow.ChunkedArray ) -> BaseStringArray: diff --git a/pandas/tests/arrays/categorical/test_api.py b/pandas/tests/arrays/categorical/test_api.py index 2ccc5781c608e..2791fd55f54d7 100644 --- a/pandas/tests/arrays/categorical/test_api.py +++ b/pandas/tests/arrays/categorical/test_api.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.compat import PY311 from pandas import ( @@ -151,7 +149,6 @@ def test_reorder_categories_raises(self, new_categories): with pytest.raises(ValueError, match=msg): cat.reorder_categories(new_categories) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_add_categories(self): cat = Categorical(["a", "b", "c", "a"], ordered=True) old = cat.copy() diff --git a/pandas/tests/arrays/string_/test_concat.py b/pandas/tests/arrays/string_/test_concat.py new file mode 100644 index 0000000000000..320d700b2b6c3 --- /dev/null +++ b/pandas/tests/arrays/string_/test_concat.py @@ -0,0 +1,73 @@ +import numpy as np +import pytest + +from pandas.compat import HAS_PYARROW + +from pandas.core.dtypes.cast import find_common_type + +import pandas as pd +import pandas._testing as tm +from pandas.util.version import Version + + +@pytest.mark.parametrize( + "to_concat_dtypes, result_dtype", + [ + # same types + 
([("pyarrow", pd.NA), ("pyarrow", pd.NA)], ("pyarrow", pd.NA)), + ([("pyarrow", np.nan), ("pyarrow", np.nan)], ("pyarrow", np.nan)), + ([("python", pd.NA), ("python", pd.NA)], ("python", pd.NA)), + ([("python", np.nan), ("python", np.nan)], ("python", np.nan)), + # pyarrow preference + ([("pyarrow", pd.NA), ("python", pd.NA)], ("pyarrow", pd.NA)), + # NA preference + ([("python", pd.NA), ("python", np.nan)], ("python", pd.NA)), + ], +) +def test_concat_series(request, to_concat_dtypes, result_dtype): + if any(storage == "pyarrow" for storage, _ in to_concat_dtypes) and not HAS_PYARROW: + pytest.skip("Could not import 'pyarrow'") + + ser_list = [ + pd.Series(["a", "b", None], dtype=pd.StringDtype(storage, na_value)) + for storage, na_value in to_concat_dtypes + ] + + result = pd.concat(ser_list, ignore_index=True) + expected = pd.Series( + ["a", "b", None, "a", "b", None], dtype=pd.StringDtype(*result_dtype) + ) + tm.assert_series_equal(result, expected) + + # order doesn't matter for result + result = pd.concat(ser_list[::1], ignore_index=True) + tm.assert_series_equal(result, expected) + + +def test_concat_with_object(string_dtype_arguments): + # _get_common_dtype cannot inspect values, so object dtype with strings still + # results in object dtype + result = pd.concat( + [ + pd.Series(["a", "b", None], dtype=pd.StringDtype(*string_dtype_arguments)), + pd.Series(["a", "b", None], dtype=object), + ] + ) + assert result.dtype == np.dtype("object") + + +def test_concat_with_numpy(string_dtype_arguments): + # common type with a numpy string dtype always preserves the pandas string dtype + dtype = pd.StringDtype(*string_dtype_arguments) + assert find_common_type([dtype, np.dtype("U")]) == dtype + assert find_common_type([np.dtype("U"), dtype]) == dtype + assert find_common_type([dtype, np.dtype("U10")]) == dtype + assert find_common_type([np.dtype("U10"), dtype]) == dtype + + # with any other numpy dtype -> object + assert find_common_type([dtype, np.dtype("S")]) == np.dtype("object") + assert find_common_type([dtype, np.dtype("int64")]) == np.dtype("object") + + if Version(np.__version__) >= Version("2"): + assert find_common_type([dtype, np.dtypes.StringDType()]) == dtype + assert find_common_type([np.dtypes.StringDType(), dtype]) == dtype From 5a3a4f350440cd215efae034c506cbead6a1ad9e Mon Sep 17 00:00:00 2001 From: Natalia Mokeeva <91160475+natmokval@users.noreply.github.com> Date: Fri, 6 Sep 2024 20:13:05 +0200 Subject: [PATCH 010/224] DOC: move `idxmin` and `idxmax` docs from core/shared_docs.py to core/frame.py (#59735) move idxmin, idxmax docstring from shared_docs.py to frame.py --- pandas/core/frame.py | 144 ++++++++++++++++++++++++++++++++++++- pandas/core/shared_docs.py | 130 --------------------------------- 2 files changed, 142 insertions(+), 132 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index f47acf579d79c..fe88cb86693e8 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -12745,10 +12745,80 @@ def nunique(self, axis: Axis = 0, dropna: bool = True) -> Series: """ return self.apply(Series.nunique, axis=axis, dropna=dropna) - @doc(_shared_docs["idxmin"], numeric_only_default="False") def idxmin( self, axis: Axis = 0, skipna: bool = True, numeric_only: bool = False ) -> Series: + """ + Return index of first occurrence of minimum over requested axis. + + NA/null values are excluded. + + Parameters + ---------- + axis : {{0 or 'index', 1 or 'columns'}}, default 0 + The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise. 
+        skipna : bool, default True
+            Exclude NA/null values. If the entire DataFrame is NA,
+            or if ``skipna=False`` and there is an NA value, this method
+            will raise a ``ValueError``.
+        numeric_only : bool, default False
+            Include only `float`, `int` or `boolean` data.
+
+            .. versionadded:: 1.5.0
+
+        Returns
+        -------
+        Series
+            Indexes of minima along the specified axis.
+
+        Raises
+        ------
+        ValueError
+            * If the row/column is empty
+
+        See Also
+        --------
+        Series.idxmin : Return index of the minimum element.
+
+        Notes
+        -----
+        This method is the DataFrame version of ``ndarray.argmin``.
+
+        Examples
+        --------
+        Consider a dataset containing food consumption in Argentina.
+
+        >>> df = pd.DataFrame(
+        ...     {
+        ...         "consumption": [10.51, 103.11, 55.48],
+        ...         "co2_emissions": [37.2, 19.66, 1712],
+        ...     },
+        ...     index=["Pork", "Wheat Products", "Beef"],
+        ... )
+
+        >>> df
+                        consumption  co2_emissions
+        Pork                  10.51          37.20
+        Wheat Products       103.11          19.66
+        Beef                  55.48        1712.00
+
+        By default, it returns the index for the minimum value in each column.
+
+        >>> df.idxmin()
+        consumption                Pork
+        co2_emissions    Wheat Products
+        dtype: object
+
+        To return the index for the minimum value in each row, use ``axis="columns"``.
+
+        >>> df.idxmin(axis="columns")
+        Pork                consumption
+        Wheat Products    co2_emissions
+        Beef                consumption
+        dtype: object
+        """
         axis = self._get_axis_number(axis)

         if self.empty and len(self.axes[axis]):
@@ -12782,10 +12850,78 @@ def idxmin(
         final_result = data._constructor_sliced(result, index=data._get_agg_axis(axis))
         return final_result.__finalize__(self, method="idxmin")

-    @doc(_shared_docs["idxmax"], numeric_only_default="False")
     def idxmax(
         self, axis: Axis = 0, skipna: bool = True, numeric_only: bool = False
     ) -> Series:
+        """
+        Return index of first occurrence of maximum over requested axis.
+
+        NA/null values are excluded.
+
+        Parameters
+        ----------
+        axis : {0 or 'index', 1 or 'columns'}, default 0
+            The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise.
+        skipna : bool, default True
+            Exclude NA/null values. If the entire DataFrame is NA,
+            or if ``skipna=False`` and there is an NA value, this method
+            will raise a ``ValueError``.
+        numeric_only : bool, default False
+            Include only `float`, `int` or `boolean` data.
+
+            .. versionadded:: 1.5.0
+
+        Returns
+        -------
+        Series
+            Indexes of maxima along the specified axis.
+
+        Raises
+        ------
+        ValueError
+            * If the row/column is empty
+
+        See Also
+        --------
+        Series.idxmax : Return index of the maximum element.
+
+        Notes
+        -----
+        This method is the DataFrame version of ``ndarray.argmax``.
+
+        Examples
+        --------
+        Consider a dataset containing food consumption in Argentina.
+
+        >>> df = pd.DataFrame(
+        ...     {
+        ...         "consumption": [10.51, 103.11, 55.48],
+        ...         "co2_emissions": [37.2, 19.66, 1712],
+        ...     },
+        ...     index=["Pork", "Wheat Products", "Beef"],
+        ... )
+
+        >>> df
+                        consumption  co2_emissions
+        Pork                  10.51          37.20
+        Wheat Products       103.11          19.66
+        Beef                  55.48        1712.00
+
+        By default, it returns the index for the maximum value in each column.
+
+        >>> df.idxmax()
+        consumption     Wheat Products
+        co2_emissions             Beef
+        dtype: object
+
+        To return the index for the maximum value in each row, use ``axis="columns"``.
+ + >>> df.idxmax(axis="columns") + Pork co2_emissions + Wheat Products consumption + Beef co2_emissions + dtype: object + """ axis = self._get_axis_number(axis) if self.empty and len(self.axes[axis]): diff --git a/pandas/core/shared_docs.py b/pandas/core/shared_docs.py index 5725b96f66cd4..cb0c3d241534c 100644 --- a/pandas/core/shared_docs.py +++ b/pandas/core/shared_docs.py @@ -649,133 +649,3 @@ 3 3 d e 4 4 e e """ - -_shared_docs["idxmin"] = """ - Return index of first occurrence of minimum over requested axis. - - NA/null values are excluded. - - Parameters - ---------- - axis : {{0 or 'index', 1 or 'columns'}}, default 0 - The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise. - skipna : bool, default True - Exclude NA/null values. If the entire Series is NA, or if ``skipna=False`` - and there is an NA value, this method will raise a ``ValueError``. - numeric_only : bool, default {numeric_only_default} - Include only `float`, `int` or `boolean` data. - - .. versionadded:: 1.5.0 - - Returns - ------- - Series - Indexes of minima along the specified axis. - - Raises - ------ - ValueError - * If the row/column is empty - - See Also - -------- - Series.idxmin : Return index of the minimum element. - - Notes - ----- - This method is the DataFrame version of ``ndarray.argmin``. - - Examples - -------- - Consider a dataset containing food consumption in Argentina. - - >>> df = pd.DataFrame({{'consumption': [10.51, 103.11, 55.48], - ... 'co2_emissions': [37.2, 19.66, 1712]}}, - ... index=['Pork', 'Wheat Products', 'Beef']) - - >>> df - consumption co2_emissions - Pork 10.51 37.20 - Wheat Products 103.11 19.66 - Beef 55.48 1712.00 - - By default, it returns the index for the minimum value in each column. - - >>> df.idxmin() - consumption Pork - co2_emissions Wheat Products - dtype: object - - To return the index for the minimum value in each row, use ``axis="columns"``. - - >>> df.idxmin(axis="columns") - Pork consumption - Wheat Products co2_emissions - Beef consumption - dtype: object -""" - -_shared_docs["idxmax"] = """ - Return index of first occurrence of maximum over requested axis. - - NA/null values are excluded. - - Parameters - ---------- - axis : {{0 or 'index', 1 or 'columns'}}, default 0 - The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise. - skipna : bool, default True - Exclude NA/null values. If the entire Series is NA, or if ``skipna=False`` - and there is an NA value, this method will raise a ``ValueError``. - numeric_only : bool, default {numeric_only_default} - Include only `float`, `int` or `boolean` data. - - .. versionadded:: 1.5.0 - - Returns - ------- - Series - Indexes of maxima along the specified axis. - - Raises - ------ - ValueError - * If the row/column is empty - - See Also - -------- - Series.idxmax : Return index of the maximum element. - - Notes - ----- - This method is the DataFrame version of ``ndarray.argmax``. - - Examples - -------- - Consider a dataset containing food consumption in Argentina. - - >>> df = pd.DataFrame({{'consumption': [10.51, 103.11, 55.48], - ... 'co2_emissions': [37.2, 19.66, 1712]}}, - ... index=['Pork', 'Wheat Products', 'Beef']) - - >>> df - consumption co2_emissions - Pork 10.51 37.20 - Wheat Products 103.11 19.66 - Beef 55.48 1712.00 - - By default, it returns the index for the maximum value in each column. - - >>> df.idxmax() - consumption Wheat Products - co2_emissions Beef - dtype: object - - To return the index for the maximum value in each row, use ``axis="columns"``. 
- - >>> df.idxmax(axis="columns") - Pork co2_emissions - Wheat Products consumption - Beef co2_emissions - dtype: object -""" From 5a07ed5a8e1522886b177bcce21568ecbfe63410 Mon Sep 17 00:00:00 2001 From: ammar-qazi Date: Fri, 6 Sep 2024 20:14:09 +0200 Subject: [PATCH 011/224] Resolves #59670 by documenting that DataFrame.from_records()'s columns filters (includes) data. (#59723) Update frames.py to factor in explain columns reordering --- pandas/core/frame.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index fe88cb86693e8..97df71e2c02a0 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2124,9 +2124,10 @@ def from_records( columns : sequence, default None Column names to use. If the passed data do not have names associated with them, this argument provides names for the - columns. Otherwise this argument indicates the order of the columns + columns. Otherwise, this argument indicates the order of the columns in the result (any names not found in the data will become all-NA - columns). + columns) and limits the data to these columns if not all column names + are provided. coerce_float : bool, default False Attempt to convert values of non-string, non-numeric objects (like decimal.Decimal) to floating point, useful for SQL result sets. From 352289b3b6e282fcf36d7634a45a5b93839be8fa Mon Sep 17 00:00:00 2001 From: Florian Bourgey Date: Fri, 6 Sep 2024 14:15:31 -0400 Subject: [PATCH 012/224] Missing source link (#59549) * merged DataFrame.index and DataFrame.columns with other Axes section. * small clean for DataFrame.columns * reverted frame.rst file --- pandas/core/frame.py | 43 +++++++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 97df71e2c02a0..c80e9dfd23ba2 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -13629,26 +13629,29 @@ def isin_(x): ) columns = properties.AxisProperty( axis=0, - doc=dedent( - """ - The column labels of the DataFrame. - - See Also - -------- - DataFrame.index: The index (row labels) of the DataFrame. - DataFrame.axes: Return a list representing the axes of the DataFrame. - - Examples - -------- - >>> df = pd.DataFrame({'A': [1, 2], 'B': [3, 4]}) - >>> df - A B - 0 1 3 - 1 2 4 - >>> df.columns - Index(['A', 'B'], dtype='object') - """ - ), + doc=""" + The column labels of the DataFrame. + + Returns + ------- + pandas.Index + The column labels of the DataFrame. + + See Also + -------- + DataFrame.index: The index (row labels) of the DataFrame. + DataFrame.axes: Return a list representing the axes of the DataFrame. 
+ + Examples + -------- + >>> df = pd.DataFrame({'A': [1, 2], 'B': [3, 4]}) + >>> df + A B + 0 1 3 + 1 2 4 + >>> df.columns + Index(['A', 'B'], dtype='object') + """, ) # ---------------------------------------------------------------------- From 80b685027108245086b78dbd9a176b096c92570a Mon Sep 17 00:00:00 2001 From: matiaslindgren Date: Sat, 7 Sep 2024 13:53:28 +0200 Subject: [PATCH 013/224] BUG: Fix inconsistent pivot table subaggregation when index is None (#59629) --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/reshape/pivot.py | 11 +++++++---- pandas/tests/reshape/test_pivot.py | 28 ++++++++++++++++++++++++++++ 3 files changed, 36 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index cd353b60d1a6e..9a29ff4d49966 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -668,6 +668,7 @@ Reshaping - Bug in :meth:`DataFrame.join` when a :class:`DataFrame` with a :class:`MultiIndex` would raise an ``AssertionError`` when :attr:`MultiIndex.names` contained ``None``. (:issue:`58721`) - Bug in :meth:`DataFrame.merge` where merging on a column containing only ``NaN`` values resulted in an out-of-bounds array access (:issue:`59421`) - Bug in :meth:`DataFrame.unstack` producing incorrect results when ``sort=False`` (:issue:`54987`, :issue:`55516`) +- Bug in :meth:`DataFrame.pivot_table` incorrectly subaggregating results when called without an ``index`` argument (:issue:`58722`) - Bug in :meth:`DataFrame.unstack` producing incorrect results when manipulating empty :class:`DataFrame` with an :class:`ExtentionDtype` (:issue:`59123`) Sparse diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 0886aad310034..cfc6f91557781 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -557,7 +557,12 @@ def _all_key(key): table_pieces.append(piece) margin_keys.append(all_key) else: - from pandas import DataFrame + margin = ( + data[cols[:1] + values] + .groupby(cols[:1], observed=observed) + .agg(aggfunc, **kwargs) + .T + ) cat_axis = 0 for key, piece in table.groupby(level=0, observed=observed): @@ -566,9 +571,7 @@ def _all_key(key): else: all_key = margins_name table_pieces.append(piece) - # GH31016 this is to calculate margin for each group, and assign - # corresponded key as index - transformed_piece = DataFrame(piece.apply(aggfunc, **kwargs)).T + transformed_piece = margin[key].to_frame().T if isinstance(piece.index, MultiIndex): # We are adding an empty level transformed_piece.index = MultiIndex.from_tuples( diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 44b96afaa4ef5..8cfe565ebdd65 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -2785,3 +2785,31 @@ def test_pivot_empty_with_datetime(self): index="category", columns="value", values="timestamp" ) assert df_pivoted.empty + + def test_pivot_margins_with_none_index(self): + # GH#58722 + df = DataFrame( + { + "x": [1, 1, 2], + "y": [3, 3, 4], + "z": [5, 5, 6], + "w": [7, 8, 9], + } + ) + result = df.pivot_table( + index=None, + columns=["y", "z"], + values="w", + margins=True, + aggfunc="count", + ) + expected = DataFrame( + [[2, 2, 1, 1]], + index=["w"], + columns=MultiIndex( + levels=[[3, 4], [5, 6, "All"]], + codes=[[0, 0, 1, 1], [0, 2, 1, 2]], + names=["y", "z"], + ), + ) + tm.assert_frame_equal(result, expected) From 13f45e70989625850dda374c5588d4beb54bd48c Mon Sep 17 00:00:00 2001 From: Patrick Hoefler 
<61934744+phofl@users.noreply.github.com> Date: Mon, 9 Sep 2024 05:53:48 -0500 Subject: [PATCH 014/224] TST/BUG (string dtype): Fix and adjust indexes string tests (#59544) Co-authored-by: Joris Van den Bossche --- pandas/core/construction.py | 5 +++- pandas/core/indexes/base.py | 6 ++++- .../tests/indexes/base_class/test_setops.py | 6 ++--- pandas/tests/indexes/test_base.py | 11 ++------ pandas/tests/indexes/test_old_base.py | 26 ++++++++----------- 5 files changed, 24 insertions(+), 30 deletions(-) diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 665eb75953078..bb3aa3867ab08 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -611,7 +611,10 @@ def sanitize_array( dtype = StringDtype(na_value=np.nan) subarr = dtype.construct_array_type()._from_sequence(data, dtype=dtype) - if subarr is data and copy: + if ( + subarr is data + or (subarr.dtype == "str" and subarr.dtype.storage == "python") # type: ignore[union-attr] + ) and copy: subarr = subarr.copy() else: diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 582e1f96fa562..2346c20004210 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -504,7 +504,8 @@ def __new__( elif is_ea_or_datetimelike_dtype(dtype): # non-EA dtype indexes have special casting logic, so we punt here - pass + if isinstance(data, (set, frozenset)): + data = list(data) elif is_ea_or_datetimelike_dtype(data_dtype): pass @@ -6877,6 +6878,9 @@ def insert(self, loc: int, item) -> Index: # We cannot keep the same dtype, so cast to the (often object) # minimal shared dtype before doing the insert. dtype = self._find_common_type_compat(item) + if dtype == self.dtype: + # EA's might run into recursion errors if loc is invalid + raise return self.astype(dtype).insert(loc, item) if arr.dtype != object or not isinstance( diff --git a/pandas/tests/indexes/base_class/test_setops.py b/pandas/tests/indexes/base_class/test_setops.py index f9636ec19f2ec..0e9fb77d6e8dd 100644 --- a/pandas/tests/indexes/base_class/test_setops.py +++ b/pandas/tests/indexes/base_class/test_setops.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd from pandas import ( Index, @@ -233,7 +231,6 @@ def test_tuple_union_bug(self, method, expected, sort): expected = Index(expected) tm.assert_index_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("first_list", [["b", "a"], []]) @pytest.mark.parametrize("second_list", [["a", "b"], []]) @pytest.mark.parametrize( @@ -243,6 +240,7 @@ def test_tuple_union_bug(self, method, expected, sort): def test_union_name_preservation( self, first_list, second_list, first_name, second_name, expected_name, sort ): + expected_dtype = object if not first_list or not second_list else "str" first = Index(first_list, name=first_name) second = Index(second_list, name=second_name) union = first.union(second, sort=sort) @@ -253,7 +251,7 @@ def test_union_name_preservation( expected = Index(sorted(vals), name=expected_name) tm.assert_index_equal(union, expected) else: - expected = Index(vals, name=expected_name) + expected = Index(vals, name=expected_name, dtype=expected_dtype) tm.assert_index_equal(union.sort_values(), expected.sort_values()) @pytest.mark.parametrize( diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 7ec66100b7291..486b24845d2ff 100644 --- a/pandas/tests/indexes/test_base.py +++ 
b/pandas/tests/indexes/test_base.py @@ -76,9 +76,6 @@ def test_constructor_casting(self, index): tm.assert_contains_all(arr, new_index) tm.assert_index_equal(index, new_index) - @pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" - ) def test_constructor_copy(self, using_infer_string): index = Index(list("abc"), name="name") arr = np.array(index) @@ -343,11 +340,6 @@ def test_constructor_empty_special(self, empty, klass): def test_view_with_args(self, index): index.view("i8") - @pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, - reason="TODO(infer_string)", - strict=False, - ) @pytest.mark.parametrize( "index", [ @@ -364,7 +356,8 @@ def test_view_with_args_object_array_raises(self, index): msg = "When changing to a larger dtype" with pytest.raises(ValueError, match=msg): index.view("i8") - elif index.dtype == "string": + elif index.dtype == "str" and not index.dtype.storage == "python": + # TODO(infer_string): Make the errors consistent with pytest.raises(NotImplementedError, match="i8"): index.view("i8") else: diff --git a/pandas/tests/indexes/test_old_base.py b/pandas/tests/indexes/test_old_base.py index b41871ee921fd..75284a8f8fd47 100644 --- a/pandas/tests/indexes/test_old_base.py +++ b/pandas/tests/indexes/test_old_base.py @@ -6,10 +6,7 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas._libs.tslibs import Timestamp -from pandas.compat import HAS_PYARROW from pandas.core.dtypes.common import ( is_integer_dtype, @@ -28,6 +25,7 @@ PeriodIndex, RangeIndex, Series, + StringDtype, TimedeltaIndex, isna, period_range, @@ -229,7 +227,6 @@ def test_logical_compat(self, simple_index): with pytest.raises(TypeError, match=msg): idx.any() - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_repr_roundtrip(self, simple_index): if isinstance(simple_index, IntervalIndex): pytest.skip(f"Not a valid repr for {type(simple_index).__name__}") @@ -246,11 +243,6 @@ def test_repr_max_seq_item_setting(self, simple_index): repr(idx) assert "..." 
not in str(idx) - @pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, - reason="TODO(infer_string)", - strict=False, - ) @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") def test_ensure_copied_data(self, index): # Check the "copy" argument of each Index.__new__ is honoured @@ -296,7 +288,9 @@ def test_ensure_copied_data(self, index): tm.assert_numpy_array_equal( index._values._mask, result._values._mask, check_same="same" ) - elif index.dtype == "string[python]": + elif ( + isinstance(index.dtype, StringDtype) and index.dtype.storage == "python" + ): assert np.shares_memory(index._values._ndarray, result._values._ndarray) tm.assert_numpy_array_equal( index._values._ndarray, result._values._ndarray, check_same="same" @@ -444,11 +438,7 @@ def test_insert_base(self, index): result = trimmed.insert(0, index[0]) assert index[0:4].equals(result) - @pytest.mark.skipif( - using_string_dtype(), - reason="completely different behavior, tested elsewher", - ) - def test_insert_out_of_bounds(self, index): + def test_insert_out_of_bounds(self, index, using_infer_string): # TypeError/IndexError matches what np.insert raises in these cases if len(index) > 0: @@ -460,6 +450,12 @@ def test_insert_out_of_bounds(self, index): msg = "index (0|0.5) is out of bounds for axis 0 with size 0" else: msg = "slice indices must be integers or None or have an __index__ method" + + if using_infer_string and ( + index.dtype == "string" or index.dtype == "category" # noqa: PLR1714 + ): + msg = "loc must be an integer between" + with pytest.raises(err, match=msg): index.insert(0.5, "foo") From b7dedf56ad529a2b18f17ae621a69644867c69c7 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Mon, 9 Sep 2024 06:40:22 -0500 Subject: [PATCH 015/224] TST (string dtype): Adjust indexing string tests (#59541) Co-authored-by: Joris Van den Bossche --- pandas/core/arrays/string_.py | 4 ++ pandas/core/arrays/string_arrow.py | 2 +- pandas/tests/arrays/string_/test_string.py | 5 +-- pandas/tests/indexing/test_iloc.py | 31 +++++++------- pandas/tests/indexing/test_indexing.py | 18 ++++---- pandas/tests/indexing/test_loc.py | 48 +++++++++++++--------- 6 files changed, 57 insertions(+), 51 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 88fd1481031f8..a46475a7d1ec2 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -715,6 +715,10 @@ def __setitem__(self, key, value) -> None: else: if not is_array_like(value): value = np.asarray(value, dtype=object) + else: + # cast categories and friends to arrays to see if values are + # compatible, compatibility with arrow backed strings + value = np.asarray(value) if len(value) and not lib.is_string_array(value, skipna=True): raise TypeError("Must provide strings.") diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 97381b82ceab9..1e5adf106752f 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -240,7 +240,7 @@ def _maybe_convert_setitem_value(self, value): value[isna(value)] = None for v in value: if not (v is None or isinstance(v, str)): - raise TypeError("Scalar must be NA or str") + raise TypeError("Must provide strings") return super()._maybe_convert_setitem_value(value) def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]: diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 
dd87dbf8e9a43..87bd1d5921caa 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -102,10 +102,7 @@ def test_setitem_validates(cls, dtype): with pytest.raises(TypeError, match=msg): arr[0] = 10 - if dtype.storage == "python": - msg = "Must provide strings." - else: - msg = "Scalar must be NA or str" + msg = "Must provide strings" with pytest.raises(TypeError, match=msg): arr[:] = np.array([1, 2]) diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index b05b5d3dea2dc..dc95e1bb1b8a0 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -6,8 +6,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.errors import IndexingError from pandas import ( @@ -1198,22 +1196,25 @@ def test_iloc_getitem_int_single_ea_block_view(self): arr[2] = arr[-1] assert ser[0] == arr[-1] - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") - def test_iloc_setitem_multicolumn_to_datetime(self): + def test_iloc_setitem_multicolumn_to_datetime(self, using_infer_string): # GH#20511 df = DataFrame({"A": ["2022-01-01", "2022-01-02"], "B": ["2021", "2022"]}) - df.iloc[:, [0]] = DataFrame({"A": to_datetime(["2021", "2022"])}) - expected = DataFrame( - { - "A": [ - Timestamp("2021-01-01 00:00:00"), - Timestamp("2022-01-01 00:00:00"), - ], - "B": ["2021", "2022"], - } - ) - tm.assert_frame_equal(df, expected, check_dtype=False) + if using_infer_string: + with pytest.raises(TypeError, match="Invalid value"): + df.iloc[:, [0]] = DataFrame({"A": to_datetime(["2021", "2022"])}) + else: + df.iloc[:, [0]] = DataFrame({"A": to_datetime(["2021", "2022"])}) + expected = DataFrame( + { + "A": [ + Timestamp("2021-01-01 00:00:00"), + Timestamp("2022-01-01 00:00:00"), + ], + "B": ["2021", "2022"], + } + ) + tm.assert_frame_equal(df, expected, check_dtype=False) class TestILocErrors: diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index f7ada06e3ecb2..fb7e6649c534f 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -8,8 +8,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.errors import IndexingError from pandas.core.dtypes.common import ( @@ -528,12 +526,12 @@ def test_string_slice_empty(self): with pytest.raises(KeyError, match="^0$"): df.loc["2011", 0] - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_astype_assignment(self, using_infer_string): # GH4312 (iloc) df_orig = DataFrame( [["1", "2", "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG") ) + df_orig[list("ABCDG")] = df_orig[list("ABCDG")].astype(object) df = df_orig.copy() @@ -543,9 +541,9 @@ def test_astype_assignment(self, using_infer_string): expected = DataFrame( [[1, 2, "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG") ) - if not using_infer_string: - expected["A"] = expected["A"].astype(object) - expected["B"] = expected["B"].astype(object) + expected[list("CDG")] = expected[list("CDG")].astype(object) + expected["A"] = expected["A"].astype(object) + expected["B"] = expected["B"].astype(object) tm.assert_frame_equal(df, expected) # GH5702 (loc) @@ -554,18 +552,16 @@ def test_astype_assignment(self, using_infer_string): expected = DataFrame( [[1, "2", "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG") ) - if not using_infer_string: - expected["A"] = expected["A"].astype(object) + expected[list("ABCDG")] = 
expected[list("ABCDG")].astype(object) tm.assert_frame_equal(df, expected) df = df_orig.copy() + df.loc[:, ["B", "C"]] = df.loc[:, ["B", "C"]].astype(np.int64) expected = DataFrame( [["1", 2, 3, ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG") ) - if not using_infer_string: - expected["B"] = expected["B"].astype(object) - expected["C"] = expected["C"].astype(object) + expected[list("ABCDG")] = expected[list("ABCDG")].astype(object) tm.assert_frame_equal(df, expected) def test_astype_assignment_full_replacements(self): diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index e007b8c4e97ac..36b08ee1df790 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -1,6 +1,7 @@ """test label based indexing with loc""" from collections import namedtuple +import contextlib from datetime import ( date, datetime, @@ -13,10 +14,7 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas._libs import index as libindex -from pandas.compat import HAS_PYARROW from pandas.errors import IndexingError import pandas as pd @@ -615,8 +613,7 @@ def test_loc_setitem_consistency_empty(self): expected["x"] = expected["x"].astype(np.int64) tm.assert_frame_equal(df, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") - def test_loc_setitem_consistency_slice_column_len(self): + def test_loc_setitem_consistency_slice_column_len(self, using_infer_string): # .loc[:,column] setting with slice == len of the column # GH10408 levels = [ @@ -640,12 +637,23 @@ def test_loc_setitem_consistency_slice_column_len(self): ] df = DataFrame(values, index=mi, columns=cols) - df.loc[:, ("Respondent", "StartDate")] = to_datetime( - df.loc[:, ("Respondent", "StartDate")] - ) - df.loc[:, ("Respondent", "EndDate")] = to_datetime( - df.loc[:, ("Respondent", "EndDate")] - ) + ctx = contextlib.nullcontext() + if using_infer_string: + ctx = pytest.raises(TypeError, match="Invalid value") + + with ctx: + df.loc[:, ("Respondent", "StartDate")] = to_datetime( + df.loc[:, ("Respondent", "StartDate")] + ) + with ctx: + df.loc[:, ("Respondent", "EndDate")] = to_datetime( + df.loc[:, ("Respondent", "EndDate")] + ) + + if using_infer_string: + # infer-objects won't infer stuff anymore + return + df = df.infer_objects() # Adding a new key @@ -1211,20 +1219,23 @@ def test_loc_reverse_assignment(self): tm.assert_series_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="can't set int into string") - def test_loc_setitem_str_to_small_float_conversion_type(self): + def test_loc_setitem_str_to_small_float_conversion_type(self, using_infer_string): # GH#20388 col_data = [str(np.random.default_rng(2).random() * 1e-12) for _ in range(5)] result = DataFrame(col_data, columns=["A"]) - expected = DataFrame(col_data, columns=["A"], dtype=object) + expected = DataFrame(col_data, columns=["A"]) tm.assert_frame_equal(result, expected) # assigning with loc/iloc attempts to set the values inplace, which # in this case is successful - result.loc[result.index, "A"] = [float(x) for x in col_data] - expected = DataFrame(col_data, columns=["A"], dtype=float).astype(object) - tm.assert_frame_equal(result, expected) + if using_infer_string: + with pytest.raises(TypeError, match="Must provide strings"): + result.loc[result.index, "A"] = [float(x) for x in col_data] + else: + result.loc[result.index, "A"] = [float(x) for x in col_data] + expected = DataFrame(col_data, columns=["A"], dtype=float).astype(object) + 
tm.assert_frame_equal(result, expected) # assigning the entire column using __setitem__ swaps in the new array # GH#??? @@ -1389,9 +1400,6 @@ def test_loc_setitem_categorical_values_partial_column_slice(self): df.loc[1:2, "a"] = Categorical(["b", "b"], categories=["a", "b"]) df.loc[2:3, "b"] = Categorical(["b", "b"], categories=["a", "b"]) - @pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" - ) def test_loc_setitem_single_row_categorical(self, using_infer_string): # GH#25495 df = DataFrame({"Alpha": ["a"], "Numeric": [0]}) From b0593e20c4a661250df5ab4d832510c1f5819103 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 9 Sep 2024 09:38:47 -0700 Subject: [PATCH 016/224] Bump deadsnakes/action from 3.1.0 to 3.2.0 (#59757) Bumps [deadsnakes/action](https://github.com/deadsnakes/action) from 3.1.0 to 3.2.0. - [Release notes](https://github.com/deadsnakes/action/releases) - [Commits](https://github.com/deadsnakes/action/compare/v3.1.0...v3.2.0) --- updated-dependencies: - dependency-name: deadsnakes/action dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/unit-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index d392c84be66fe..d145836f3e596 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -380,7 +380,7 @@ jobs: fetch-depth: 0 - name: Set up Python Free-threading Version - uses: deadsnakes/action@v3.1.0 + uses: deadsnakes/action@v3.2.0 with: python-version: 3.13-dev nogil: true From 53cadbbd89a3393d615e4d7abf48f3ec1903fe7b Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 9 Sep 2024 19:15:02 +0200 Subject: [PATCH 017/224] TST (string dtype): adjust pandas/tests/reshape tests (#59762) --- pandas/tests/reshape/concat/test_concat.py | 11 ++------ pandas/tests/reshape/merge/test_merge_asof.py | 10 ++----- pandas/tests/reshape/test_get_dummies.py | 10 ++----- pandas/tests/reshape/test_melt.py | 25 ++++++----------- pandas/tests/reshape/test_pivot.py | 28 ++++++++++++------- 5 files changed, 34 insertions(+), 50 deletions(-) diff --git a/pandas/tests/reshape/concat/test_concat.py b/pandas/tests/reshape/concat/test_concat.py index 8af224f1ad64f..d3edee17366f7 100644 --- a/pandas/tests/reshape/concat/test_concat.py +++ b/pandas/tests/reshape/concat/test_concat.py @@ -10,8 +10,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.errors import InvalidIndexError import pandas as pd @@ -47,18 +45,11 @@ def test_append_concat(self): assert isinstance(result.index, PeriodIndex) assert result.index[0] == s1.index[0] - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_concat_copy(self): df = DataFrame(np.random.default_rng(2).standard_normal((4, 3))) df2 = DataFrame(np.random.default_rng(2).integers(0, 10, size=4).reshape(4, 1)) df3 = DataFrame({5: "foo"}, index=range(4)) - # These are actual copies. - result = concat([df, df2, df3], axis=1) - for block in result._mgr.blocks: - assert block.values.base is not None - - # These are the same. 
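# As an aside, a minimal sketch of the memory-sharing check this test builds
# on (tm.shares_memory is pandas' own testing helper; values illustrative):
#
#     result = concat([df, df2, df3], axis=1)
#     arr = result._mgr.blocks[0].values
#     tm.shares_memory(arr, df)  # True only while the block still views df's buffer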
result = concat([df, df2, df3], axis=1)
         for block in result._mgr.blocks:
@@ -69,6 +60,8 @@ def test_concat_copy(self):
                 assert arr.base is df2._mgr.blocks[0].values.base
             elif arr.dtype == object:
                 assert arr.base is not None
+            elif arr.dtype == "string":
+                assert tm.shares_memory(arr, df3._mgr.blocks[0].values)
 
         # Float block was consolidated.
         df4 = DataFrame(np.random.default_rng(2).standard_normal((4, 1)))
diff --git a/pandas/tests/reshape/merge/test_merge_asof.py b/pandas/tests/reshape/merge/test_merge_asof.py
index 8d972087b0dff..f7b0876c5a605 100644
--- a/pandas/tests/reshape/merge/test_merge_asof.py
+++ b/pandas/tests/reshape/merge/test_merge_asof.py
@@ -3,8 +3,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 import pandas.util._test_decorators as td
 
 import pandas as pd
@@ -3064,12 +3062,8 @@ def test_on_float_by_int(self):
 
         tm.assert_frame_equal(result, expected)
 
-    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
-    def test_merge_datatype_error_raises(self, using_infer_string):
-        if using_infer_string:
-            msg = "incompatible merge keys"
-        else:
-            msg = r"Incompatible merge dtype, .*, both sides must have numeric dtype"
+    def test_merge_datatype_error_raises(self):
+        msg = r"Incompatible merge dtype, .*, both sides must have numeric dtype"
 
         left = pd.DataFrame({"left_val": [1, 5, 10], "a": ["a", "b", "c"]})
         right = pd.DataFrame({"right_val": [1, 2, 3, 6, 7], "a": [1, 2, 3, 6, 7]})
diff --git a/pandas/tests/reshape/test_get_dummies.py b/pandas/tests/reshape/test_get_dummies.py
index 27a34decae7b0..f07c6845366da 100644
--- a/pandas/tests/reshape/test_get_dummies.py
+++ b/pandas/tests/reshape/test_get_dummies.py
@@ -4,8 +4,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 import pandas.util._test_decorators as td
 
 from pandas.core.dtypes.common import is_integer_dtype
@@ -216,11 +214,10 @@ def test_dataframe_dummies_all_obj(self, df, sparse):
 
         tm.assert_frame_equal(result, expected)
 
-    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
-    def test_dataframe_dummies_string_dtype(self, df, using_infer_string):
+    def test_dataframe_dummies_string_dtype(self, df, any_string_dtype):
         # GH44965
         df = df[["A", "B"]]
-        df = df.astype({"A": "object", "B": "string"})
+        df = df.astype({"A": "str", "B": any_string_dtype})
         result = get_dummies(df)
         expected = DataFrame(
             {
@@ -231,8 +228,7 @@ def test_dataframe_dummies_string_dtype(self, df, using_infer_string):
             },
             dtype=bool,
         )
-        if not using_infer_string:
-            # infer_string returns numpy bools
+        if any_string_dtype == "string" and any_string_dtype.na_value is pd.NA:
             expected[["B_b", "B_c"]] = expected[["B_b", "B_c"]].astype("boolean")
         tm.assert_frame_equal(result, expected)
 
diff --git a/pandas/tests/reshape/test_melt.py b/pandas/tests/reshape/test_melt.py
index be4f2ab4d183d..4a12404f6775a 100644
--- a/pandas/tests/reshape/test_melt.py
+++ b/pandas/tests/reshape/test_melt.py
@@ -3,8 +3,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 import pandas as pd
 from pandas import (
     DataFrame,
@@ -21,7 +19,7 @@ def df():
     res = DataFrame(
         np.random.default_rng(2).standard_normal((10, 4)),
-        columns=Index(list("ABCD"), dtype=object),
+        columns=Index(list("ABCD")),
         index=date_range("2000-01-01", periods=10, freq="B"),
     )
     res["id1"] = (res["A"] > 0).astype(np.int64)
@@ -83,7 +81,6 @@ def test_default_col_names(self, df):
         result2 = df.melt(id_vars=["id1", "id2"])
         assert result2.columns.tolist() == ["id1", "id2", "variable", "value"]
 
-    
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_value_vars(self, df): result3 = df.melt(id_vars=["id1", "id2"], value_vars="A") assert len(result3) == 10 @@ -100,7 +97,6 @@ def test_value_vars(self, df): ) tm.assert_frame_equal(result4, expected4) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize("type_", (tuple, list, np.array)) def test_value_vars_types(self, type_, df): # GH 15348 @@ -178,7 +174,6 @@ def test_tuple_vars_fail_with_multiindex(self, id_vars, value_vars, df1): with pytest.raises(ValueError, match=msg): df1.melt(id_vars=id_vars, value_vars=value_vars) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_custom_var_name(self, df, var_name): result5 = df.melt(var_name=var_name) assert result5.columns.tolist() == ["var", "value"] @@ -206,7 +201,6 @@ def test_custom_var_name(self, df, var_name): ) tm.assert_frame_equal(result9, expected9) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_custom_value_name(self, df, value_name): result10 = df.melt(value_name=value_name) assert result10.columns.tolist() == ["variable", "val"] @@ -236,7 +230,6 @@ def test_custom_value_name(self, df, value_name): ) tm.assert_frame_equal(result14, expected14) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_custom_var_and_value_name(self, df, value_name, var_name): result15 = df.melt(var_name=var_name, value_name=value_name) assert result15.columns.tolist() == ["var", "val"] @@ -361,7 +354,6 @@ def test_melt_missing_columns_raises(self): with pytest.raises(KeyError, match=msg): df.melt(["A"], ["F"], col_level=0) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_melt_mixed_int_str_id_vars(self): # GH 29718 df = DataFrame({0: ["foo"], "a": ["bar"], "b": [1], "d": [2]}) @@ -369,6 +361,8 @@ def test_melt_mixed_int_str_id_vars(self): expected = DataFrame( {0: ["foo"] * 2, "a": ["bar"] * 2, "variable": list("bd"), "value": [1, 2]} ) + # the df's columns are mixed type and thus object -> preserves object dtype + expected["variable"] = expected["variable"].astype(object) tm.assert_frame_equal(result, expected) def test_melt_mixed_int_str_value_vars(self): @@ -1222,12 +1216,10 @@ def test_raise_of_column_name_value(self): ): df.melt(id_vars="value", value_name="value") - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) - @pytest.mark.parametrize("dtype", ["O", "string"]) - def test_missing_stubname(self, dtype): + def test_missing_stubname(self, any_string_dtype): # GH46044 df = DataFrame({"id": ["1", "2"], "a-1": [100, 200], "a-2": [300, 400]}) - df = df.astype({"id": dtype}) + df = df.astype({"id": any_string_dtype}) result = wide_to_long( df, stubnames=["a", "b"], @@ -1243,12 +1235,13 @@ def test_missing_stubname(self, dtype): {"a": [100, 200, 300, 400], "b": [np.nan] * 4}, index=index, ) - new_level = expected.index.levels[0].astype(dtype) + new_level = expected.index.levels[0].astype(any_string_dtype) + if any_string_dtype == "object": + new_level = expected.index.levels[0].astype("str") expected.index = expected.index.set_levels(new_level, level=0) tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_wide_to_long_pyarrow_string_columns(): # GH 57066 pytest.importorskip("pyarrow") @@ -1267,7 +1260,7 @@ def test_wide_to_long_pyarrow_string_columns(): ) expected = DataFrame( [[1, 1], [1, 1], [1, 2]], - 
columns=Index(["D", "R"], dtype=object), + columns=Index(["D", "R"]), index=pd.MultiIndex.from_arrays( [ [1, 1, 1], diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 8cfe565ebdd65..eccf676b87f89 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -1068,7 +1068,6 @@ def test_margins_dtype_len(self, data): tm.assert_frame_equal(expected, result) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("cols", [(1, 2), ("a", "b"), (1, "b"), ("a", 1)]) def test_pivot_table_multiindex_only(self, cols): # GH 17038 @@ -1078,7 +1077,7 @@ def test_pivot_table_multiindex_only(self, cols): expected = DataFrame( [[4.0, 5.0, 6.0]], columns=MultiIndex.from_tuples([(1, 1), (2, 2), (3, 3)], names=cols), - index=Index(["v"], dtype=object), + index=Index(["v"], dtype="str" if cols == ("a", "b") else "object"), ) tm.assert_frame_equal(result, expected) @@ -2570,13 +2569,16 @@ def test_pivot_empty(self): expected = DataFrame(index=[], columns=[]) tm.assert_frame_equal(result, expected, check_names=False) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) - @pytest.mark.parametrize("dtype", [object, "string"]) - def test_pivot_integer_bug(self, dtype): - df = DataFrame(data=[("A", "1", "A1"), ("B", "2", "B2")], dtype=dtype) + def test_pivot_integer_bug(self, any_string_dtype): + df = DataFrame( + data=[("A", "1", "A1"), ("B", "2", "B2")], dtype=any_string_dtype + ) result = df.pivot(index=1, columns=0, values=2) - tm.assert_index_equal(result.columns, Index(["A", "B"], name=0, dtype=dtype)) + expected_columns = Index(["A", "B"], name=0, dtype=any_string_dtype) + if any_string_dtype == "object": + expected_columns = expected_columns.astype("str") + tm.assert_index_equal(result.columns, expected_columns) def test_pivot_index_none(self): # GH#3962 @@ -2658,7 +2660,9 @@ def test_pivot_columns_not_given(self): with pytest.raises(TypeError, match="missing 1 required keyword-only argument"): df.pivot() - @pytest.mark.xfail(using_string_dtype(), reason="None is cast to NaN") + @pytest.mark.xfail( + using_string_dtype(), reason="TODO(infer_string) None is cast to NaN" + ) def test_pivot_columns_is_none(self): # GH#48293 df = DataFrame({None: [1], "b": 2, "c": 3}) @@ -2674,7 +2678,9 @@ def test_pivot_columns_is_none(self): expected = DataFrame({1: 3}, index=Index([2], name="b")) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="None is cast to NaN") + @pytest.mark.xfail( + using_string_dtype(), reason="TODO(infer_string) None is cast to NaN" + ) def test_pivot_index_is_none(self): # GH#48293 df = DataFrame({None: [1], "b": 2, "c": 3}) @@ -2688,7 +2694,9 @@ def test_pivot_index_is_none(self): expected = DataFrame(3, index=[1], columns=Index([2], name="b")) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="None is cast to NaN") + @pytest.mark.xfail( + using_string_dtype(), reason="TODO(infer_string) None is cast to NaN" + ) def test_pivot_values_is_none(self): # GH#48293 df = DataFrame({None: [1], "b": 2, "c": 3}) From 078b73226eb06b6a13bd5822efa5fba7fe47e97c Mon Sep 17 00:00:00 2001 From: Abhinav Reddy Date: Mon, 9 Sep 2024 13:22:08 -0400 Subject: [PATCH 018/224] Fix docs for api.types (#59753) * Fix is_bool * Fix is_categorical_dtype * Fix is_complex * Fix is_complex_dtype * Fix is_datetime64_dtype * Fix is_datetime64_ns_dtype * Fix is_datetime64tz_dtype --------- 
Co-authored-by: Abhinav Thimma --- ci/code_checks.sh | 7 ------- pandas/_libs/lib.pyx | 23 +++++++++++++++++++++++ pandas/core/dtypes/common.py | 35 +++++++++++++++++++++++++++++++++++ 3 files changed, 58 insertions(+), 7 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 7ed5103b3b796..44a6b91aeb565 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -123,13 +123,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Timestamp.tzinfo GL08" \ -i "pandas.Timestamp.year GL08" \ -i "pandas.api.extensions.ExtensionArray.interpolate PR01,SA01" \ - -i "pandas.api.types.is_bool PR01,SA01" \ - -i "pandas.api.types.is_categorical_dtype SA01" \ - -i "pandas.api.types.is_complex PR01,SA01" \ - -i "pandas.api.types.is_complex_dtype SA01" \ - -i "pandas.api.types.is_datetime64_dtype SA01" \ - -i "pandas.api.types.is_datetime64_ns_dtype SA01" \ - -i "pandas.api.types.is_datetime64tz_dtype SA01" \ -i "pandas.api.types.is_dict_like PR07,SA01" \ -i "pandas.api.types.is_extension_array_dtype SA01" \ -i "pandas.api.types.is_file_like PR07,SA01" \ diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index e1a2a0142c52e..47a31954b9d6c 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1123,10 +1123,21 @@ def is_bool(obj: object) -> bool: """ Return True if given object is boolean. + Parameters + ---------- + obj : object + Object to check. + Returns ------- bool + See Also + -------- + api.types.is_scalar : Check if the input is a scalar. + api.types.is_integer : Check if the input is an integer. + api.types.is_float : Check if the input is a float. + Examples -------- >>> pd.api.types.is_bool(True) @@ -1142,10 +1153,22 @@ def is_complex(obj: object) -> bool: """ Return True if given object is complex. + Parameters + ---------- + obj : object + Object to check. + Returns ------- bool + See Also + -------- + api.types.is_complex_dtype: Check whether the provided array or + dtype is of a complex dtype. + api.types.is_number: Check if the object is a number. + api.types.is_integer: Return True if given object is integer. + Examples -------- >>> pd.api.types.is_complex(1 + 1j) diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index bcf1ade9b0320..16f6bd396fe93 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -279,6 +279,13 @@ def is_datetime64_dtype(arr_or_dtype) -> bool: boolean Whether or not the array-like or dtype is of the datetime64 dtype. + See Also + -------- + api.types.is_datetime64_ns_dtype: Check whether the provided array or + dtype is of the datetime64[ns] dtype. + api.types.is_datetime64_any_dtype: Check whether the provided array or + dtype is of the datetime64 dtype. + Examples -------- >>> from pandas.api.types import is_datetime64_dtype @@ -316,6 +323,13 @@ def is_datetime64tz_dtype(arr_or_dtype) -> bool: boolean Whether or not the array-like or dtype is of a DatetimeTZDtype dtype. + See Also + -------- + api.types.is_datetime64_dtype: Check whether an array-like or + dtype is of the datetime64 dtype. + api.types.is_datetime64_any_dtype: Check whether the provided array or + dtype is of the datetime64 dtype. + Examples -------- >>> from pandas.api.types import is_datetime64tz_dtype @@ -514,6 +528,12 @@ def is_categorical_dtype(arr_or_dtype) -> bool: boolean Whether or not the array-like or dtype is of the Categorical dtype. + See Also + -------- + api.types.is_list_like: Check if the object is list-like. 
+ api.types.is_complex_dtype: Check whether the provided array or + dtype is of a complex dtype. + Examples -------- >>> from pandas.api.types import is_categorical_dtype @@ -977,6 +997,13 @@ def is_datetime64_ns_dtype(arr_or_dtype) -> bool: bool Whether or not the array or dtype is of the datetime64[ns] dtype. + See Also + -------- + api.types.is_datetime64_dtype: Check whether an array-like or + dtype is of the datetime64 dtype. + api.types.is_datetime64_any_dtype: Check whether the provided array or + dtype is of the datetime64 dtype. + Examples -------- >>> from pandas.api.types import is_datetime64_ns_dtype @@ -1436,6 +1463,14 @@ def is_complex_dtype(arr_or_dtype) -> bool: boolean Whether or not the array or dtype is of a complex dtype. + See Also + -------- + api.types.is_complex: Return True if given object is complex. + api.types.is_numeric_dtype: Check whether the provided array or + dtype is of a numeric dtype. + api.types.is_integer_dtype: Check whether the provided array or + dtype is of an integer dtype. + Examples -------- >>> from pandas.api.types import is_complex_dtype From f3d19fb5298e98b2ff0a16dd03b6f30e32b38069 Mon Sep 17 00:00:00 2001 From: "Mien (Josephine) Nguyen" Date: Mon, 9 Sep 2024 13:27:15 -0400 Subject: [PATCH 019/224] TST: Update IntervalArray min/max test to fail on changed default skipna (#59747) Test --- pandas/tests/arrays/interval/test_interval.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/pandas/tests/arrays/interval/test_interval.py b/pandas/tests/arrays/interval/test_interval.py index 58ba340441d86..8e13dcf25ceba 100644 --- a/pandas/tests/arrays/interval/test_interval.py +++ b/pandas/tests/arrays/interval/test_interval.py @@ -222,9 +222,10 @@ def test_min_max(self, left_right_dtypes, index_or_series_or_array): res = arr_na.max(skipna=False) assert np.isnan(res) - res = arr_na.min(skipna=True) - assert res == MIN - assert type(res) == type(MIN) - res = arr_na.max(skipna=True) - assert res == MAX - assert type(res) == type(MAX) + for kws in [{"skipna": True}, {}]: + res = arr_na.min(**kws) + assert res == MIN + assert type(res) == type(MIN) + res = arr_na.max(**kws) + assert res == MAX + assert type(res) == type(MAX) From ea22788f6193eeb1aa9dea25481ab7fe72ea41c5 Mon Sep 17 00:00:00 2001 From: ivanpan0626 <151955212+ivanpan0626@users.noreply.github.com> Date: Mon, 9 Sep 2024 13:28:44 -0400 Subject: [PATCH 020/224] DOCS: fix docstring validation errors for groupby.DataFrameGroupBy.filter, groupby.SeriesGroupBy.filter (#59742) * DOCS: fix docstring validation errors for pandas.core.groupby.DataFrameGroupBy.filter DOC string fix for both groupby.DataFrameGroupBy.filter and groupby.SeriesGroupBy.filter * Update generic.py * Update generic.py * Update generic.py * quickfix --- ci/code_checks.sh | 2 -- pandas/core/groupby/generic.py | 18 +++++++++++++++--- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 44a6b91aeb565..fdacd2fed7729 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -156,7 +156,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.core.groupby.DataFrameGroupBy.agg RT03" \ -i "pandas.core.groupby.DataFrameGroupBy.aggregate RT03" \ -i "pandas.core.groupby.DataFrameGroupBy.boxplot PR07,RT03,SA01" \ - -i "pandas.core.groupby.DataFrameGroupBy.filter SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.get_group RT03,SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.groups SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.hist RT03" \ @@ 
-172,7 +171,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         -i "pandas.core.groupby.SeriesGroupBy.__iter__ RT03,SA01" \
         -i "pandas.core.groupby.SeriesGroupBy.agg RT03" \
         -i "pandas.core.groupby.SeriesGroupBy.aggregate RT03" \
-        -i "pandas.core.groupby.SeriesGroupBy.filter PR01,SA01" \
         -i "pandas.core.groupby.SeriesGroupBy.get_group RT03,SA01" \
         -i "pandas.core.groupby.SeriesGroupBy.groups SA01" \
         -i "pandas.core.groupby.SeriesGroupBy.indices SA01" \
diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
index c112d9b6a4b54..230f61bab96df 100644
--- a/pandas/core/groupby/generic.py
+++ b/pandas/core/groupby/generic.py
@@ -600,15 +600,23 @@ def filter(self, func, dropna: bool = True, *args, **kwargs):
         ----------
         func : function
             Criterion to apply to each group. Should return True or False.
-        dropna : bool
+        dropna : bool, optional
            Drop groups that do not pass the filter. True by default; if False,
            groups that evaluate False are filled with NaNs.
+        *args : tuple
+            Optional positional arguments to pass to `func`.
+        **kwargs : dict
+            Optional keyword arguments to pass to `func`.
 
         Returns
         -------
         Series
             The filtered subset of the original Series.
 
+        See Also
+        --------
+        DataFrameGroupBy.filter : Filter elements from groups based on a criterion.
+
         Notes
         -----
         Functions that mutate the passed object can produce unexpected
@@ -1943,9 +1951,9 @@ def filter(self, func, dropna: bool = True, *args, **kwargs) -> DataFrame:
         dropna : bool
             Drop groups that do not pass the filter. True by default; if False,
             groups that evaluate False are filled with NaNs.
-        *args
+        *args : tuple
             Additional positional arguments to pass to `func`.
-        **kwargs
+        **kwargs : dict
             Additional keyword arguments to pass to `func`.
 
         Returns
@@ -1953,6 +1961,10 @@ def filter(self, func, dropna: bool = True, *args, **kwargs) -> DataFrame:
         DataFrame
             The filtered subset of the original DataFrame.
 
+        See Also
+        --------
+        SeriesGroupBy.filter : Filter elements from groups based on a criterion.
+
         Notes
         -----
         Each subframe is endowed the attribute 'name' in case you need to know

From 6b74d6f61552f152422ffa53191301aa94b82ade Mon Sep 17 00:00:00 2001
From: Tuhin Sharma
Date: Mon, 9 Sep 2024 23:04:13 +0530
Subject: [PATCH 021/224] DOC: fix SA01,ES01 for pandas.RangeIndex.stop (#59729)

* DOC: fix SA01,ES01 for pandas.RangeIndex.stop

* remove superfluous description of RangeIndex
---
 ci/code_checks.sh            |  1 -
 pandas/core/indexes/range.py | 11 +++++++++++
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/ci/code_checks.sh b/ci/code_checks.sh
index fdacd2fed7729..fa4e7ed8c3104 100755
--- a/ci/code_checks.sh
+++ b/ci/code_checks.sh
@@ -77,7 +77,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         -i "pandas.RangeIndex.from_range PR01,SA01" \
         -i "pandas.RangeIndex.start SA01" \
         -i "pandas.RangeIndex.step SA01" \
-        -i "pandas.RangeIndex.stop SA01" \
         -i "pandas.Series.cat.add_categories PR01,PR02" \
         -i "pandas.Series.cat.as_ordered PR01" \
         -i "pandas.Series.cat.as_unordered PR01" \
diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py
index b11ce6bd7b919..154e142c41db2 100644
--- a/pandas/core/indexes/range.py
+++ b/pandas/core/indexes/range.py
@@ -313,6 +313,17 @@ def stop(self) -> int:
         """
         The value of the `stop` parameter.
 
+        This property returns the `stop` value of the RangeIndex, which defines the
+        upper (or lower, in case of negative steps) bound of the index range. The
+        `stop` value is exclusive, meaning the RangeIndex includes values up to but
+        not including this value.
+ + See Also + -------- + RangeIndex : Immutable index representing a range of integers. + RangeIndex.start : The start value of the RangeIndex. + RangeIndex.step : The step size between elements in the RangeIndex. + Examples -------- >>> idx = pd.RangeIndex(5) From 9ec6b2a4771170f9fdf70f0e166229eb54ad3a75 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Mon, 9 Sep 2024 23:04:38 +0530 Subject: [PATCH 022/224] DOC: fix SA01,ES01 for pandas.RangeIndex.start (#59728) * DOC: fix SA01,ES01 for pandas.RangeIndex.start * remove superfluous description of RangeIndex --- ci/code_checks.sh | 1 - pandas/core/indexes/range.py | 10 ++++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index fa4e7ed8c3104..2870de5a0c85a 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -75,7 +75,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Period.ordinal GL08" \ -i "pandas.PeriodDtype.freq SA01" \ -i "pandas.RangeIndex.from_range PR01,SA01" \ - -i "pandas.RangeIndex.start SA01" \ -i "pandas.RangeIndex.step SA01" \ -i "pandas.Series.cat.add_categories PR01,PR02" \ -i "pandas.Series.cat.as_ordered PR01" \ diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 154e142c41db2..75d0dfbeb6f01 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -295,6 +295,16 @@ def start(self) -> int: """ The value of the `start` parameter (``0`` if this was not supplied). + This property returns the starting value of the `RangeIndex`. If the `start` + value is not explicitly provided during the creation of the `RangeIndex`, + it defaults to 0. + + See Also + -------- + RangeIndex : Immutable index implementing a range-based index. + RangeIndex.stop : Returns the stop value of the `RangeIndex`. + RangeIndex.step : Returns the step value of the `RangeIndex`. 
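(For orientation, a small doctest-style illustration of how the three
attributes documented in these two commits relate; values picked arbitrarily:)

    >>> idx = pd.RangeIndex(start=2, stop=20, step=3)
    >>> (idx.start, idx.stop, idx.step)
    (2, 20, 3)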
+ Examples -------- >>> idx = pd.RangeIndex(5) From 871703dfc6150db112dde10a0135d3a758e77cd8 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Mon, 9 Sep 2024 18:36:52 +0100 Subject: [PATCH 023/224] fix: use fastpath for PyCapsule export when starting from pyarrow-backed Series, respect requested_schema (#59683) * fix: use fastpath for PyCapsule export when starting from pyarrow-backed Series, respect requested_schema * simplify * stringdtype test --- pandas/core/series.py | 11 ++++-- pandas/tests/series/test_arrow_interface.py | 38 +++++++++++++++++++++ 2 files changed, 47 insertions(+), 2 deletions(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index 4f79e30f48f3c..0c26ce27c680c 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -580,8 +580,15 @@ def __arrow_c_stream__(self, requested_schema=None): PyCapsule """ pa = import_optional_dependency("pyarrow", min_version="16.0.0") - ca = pa.chunked_array([pa.Array.from_pandas(self, type=requested_schema)]) - return ca.__arrow_c_stream__(requested_schema) + type = ( + pa.DataType._import_from_c_capsule(requested_schema) + if requested_schema is not None + else None + ) + ca = pa.array(self, type=type) + if not isinstance(ca, pa.ChunkedArray): + ca = pa.chunked_array([ca]) + return ca.__arrow_c_stream__() # ---------------------------------------------------------------------- diff --git a/pandas/tests/series/test_arrow_interface.py b/pandas/tests/series/test_arrow_interface.py index 34a2a638e4185..e73cf9bee6aeb 100644 --- a/pandas/tests/series/test_arrow_interface.py +++ b/pandas/tests/series/test_arrow_interface.py @@ -21,3 +21,41 @@ def test_series_arrow_interface(): ca = pa.chunked_array(s) expected = pa.chunked_array([[1, 4, 2]]) assert ca.equals(expected) + ca = pa.chunked_array(s, type=pa.int32()) + expected = pa.chunked_array([[1, 4, 2]], type=pa.int32()) + assert ca.equals(expected) + + +def test_series_arrow_interface_arrow_dtypes(): + s = pd.Series([1, 4, 2], dtype="Int64[pyarrow]") + + capsule = s.__arrow_c_stream__() + assert ( + ctypes.pythonapi.PyCapsule_IsValid( + ctypes.py_object(capsule), b"arrow_array_stream" + ) + == 1 + ) + + ca = pa.chunked_array(s) + expected = pa.chunked_array([[1, 4, 2]]) + assert ca.equals(expected) + ca = pa.chunked_array(s, type=pa.int32()) + expected = pa.chunked_array([[1, 4, 2]], type=pa.int32()) + assert ca.equals(expected) + + +def test_series_arrow_interface_stringdtype(): + s = pd.Series(["foo", "bar"], dtype="string[pyarrow]") + + capsule = s.__arrow_c_stream__() + assert ( + ctypes.pythonapi.PyCapsule_IsValid( + ctypes.py_object(capsule), b"arrow_array_stream" + ) + == 1 + ) + + ca = pa.chunked_array(s) + expected = pa.chunked_array([["foo", "bar"]], type=pa.large_string()) + assert ca.equals(expected) From 47b56ea9ced016fc1c273c2453981a53666038a7 Mon Sep 17 00:00:00 2001 From: Katsia <47710336+KatsiarynaDzibrova@users.noreply.github.com> Date: Mon, 9 Sep 2024 18:38:13 +0100 Subject: [PATCH 024/224] DOC: Fix pandas.Series.dt seconds, nanoseconds GL08, SA01 (#59582) * fix pandas.Series.dt.freq * fix seconds, nanoseconds, microseconds * remove fixed objects from code_checks.sh * Remove Timedelta Index checks * fix freq example * remove freq * bring back microseconds --- ci/code_checks.sh | 4 ---- pandas/core/arrays/timedeltas.py | 10 ++++++++++ 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 2870de5a0c85a..06078d8958492 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -90,10 +90,8 @@ if [[ 
-z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Series.dt.floor PR01,PR02" \ -i "pandas.Series.dt.freq GL08" \ -i "pandas.Series.dt.month_name PR01,PR02" \ - -i "pandas.Series.dt.nanoseconds SA01" \ -i "pandas.Series.dt.normalize PR01" \ -i "pandas.Series.dt.round PR01,PR02" \ - -i "pandas.Series.dt.seconds SA01" \ -i "pandas.Series.dt.strftime PR01,PR02" \ -i "pandas.Series.dt.to_period PR01,PR02" \ -i "pandas.Series.dt.total_seconds PR01" \ @@ -111,8 +109,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Timedelta.resolution PR02" \ -i "pandas.Timedelta.to_timedelta64 SA01" \ -i "pandas.Timedelta.total_seconds SA01" \ - -i "pandas.TimedeltaIndex.nanoseconds SA01" \ - -i "pandas.TimedeltaIndex.seconds SA01" \ -i "pandas.TimedeltaIndex.to_pytimedelta RT03,SA01" \ -i "pandas.Timestamp.max PR02" \ -i "pandas.Timestamp.min PR02" \ diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index c8a86ffc187d0..754ae277e359a 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -842,6 +842,11 @@ def to_pytimedelta(self) -> npt.NDArray[np.object_]: seconds_docstring = textwrap.dedent( """Number of seconds (>= 0 and less than 1 day) for each element. + See Also + -------- + Series.dt.seconds : Return number of seconds for each element. + Series.dt.nanoseconds : Return number of nanoseconds for each element. + Examples -------- For Series: @@ -917,6 +922,11 @@ def to_pytimedelta(self) -> npt.NDArray[np.object_]: nanoseconds_docstring = textwrap.dedent( """Number of nanoseconds (>= 0 and less than 1 microsecond) for each element. + See Also + -------- + Series.dt.seconds : Return number of seconds for each element. + Series.dt.microseconds : Return number of nanoseconds for each element. + Examples -------- For Series: From b717abb3131a4cd344b463583c8dd828cd1632bc Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 9 Sep 2024 22:21:36 +0200 Subject: [PATCH 025/224] BUG (string dtype): fix inplace mutation with copy=False in ensure_string_array (#59756) * BUG (string dtype): fix inplace mutation with copy=False in ensure_string_array * update --- pandas/_libs/lib.pyx | 18 ++++++++++++------ pandas/tests/copy_view/test_astype.py | 18 +++++++++++++----- pandas/tests/libs/test_lib.py | 14 ++++++++++++++ 3 files changed, 39 insertions(+), 11 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 47a31954b9d6c..75f58f565dd6f 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -733,7 +733,9 @@ cpdef ndarray[object] ensure_string_array( convert_na_value : bool, default True If False, existing na values will be used unchanged in the new array. copy : bool, default True - Whether to ensure that a new array is returned. + Whether to ensure that a new array is returned. When True, a new array + is always returned. When False, a new array is only returned when needed + to avoid mutating the input array. skipna : bool, default True Whether or not to coerce nulls to their stringified form (e.g. if False, NaN becomes 'nan'). @@ -762,11 +764,15 @@ cpdef ndarray[object] ensure_string_array( result = np.asarray(arr, dtype="object") - if copy and (result is arr or np.shares_memory(arr, result)): - # GH#54654 - result = result.copy() - elif not copy and result is arr: - already_copied = False + if result is arr or np.may_share_memory(arr, result): + # if np.asarray(..) 
did not make a copy of the input arr, we still need + # to do that to avoid mutating the input array + # GH#54654: share_memory check is needed for rare cases where np.asarray + # returns a new object without making a copy of the actual data + if copy: + result = result.copy() + else: + already_copied = False elif not copy and not result.flags.writeable: # Weird edge case where result is a view already_copied = False diff --git a/pandas/tests/copy_view/test_astype.py b/pandas/tests/copy_view/test_astype.py index de56d5e4a07ee..80c30f2d0c26e 100644 --- a/pandas/tests/copy_view/test_astype.py +++ b/pandas/tests/copy_view/test_astype.py @@ -7,7 +7,6 @@ from pandas.compat import HAS_PYARROW from pandas.compat.pyarrow import pa_version_under12p0 -import pandas.util._test_decorators as td from pandas import ( DataFrame, @@ -111,7 +110,8 @@ def test_astype_string_and_object_update_original(dtype, new_dtype): tm.assert_frame_equal(df2, df_orig) -def test_astype_string_copy_on_pickle_roundrip(): +def test_astype_str_copy_on_pickle_roundrip(): + # TODO(infer_string) this test can be removed after 3.0 (once str is the default) # https://github.com/pandas-dev/pandas/issues/54654 # ensure_string_array may alter array inplace base = Series(np.array([(1, 2), None, 1], dtype="object")) @@ -120,14 +120,22 @@ def test_astype_string_copy_on_pickle_roundrip(): tm.assert_series_equal(base, base_copy) -@td.skip_if_no("pyarrow") -def test_astype_string_read_only_on_pickle_roundrip(): +def test_astype_string_copy_on_pickle_roundrip(any_string_dtype): + # https://github.com/pandas-dev/pandas/issues/54654 + # ensure_string_array may alter array inplace + base = Series(np.array([(1, 2), None, 1], dtype="object")) + base_copy = pickle.loads(pickle.dumps(base)) + base_copy.astype(any_string_dtype) + tm.assert_series_equal(base, base_copy) + + +def test_astype_string_read_only_on_pickle_roundrip(any_string_dtype): # https://github.com/pandas-dev/pandas/issues/54654 # ensure_string_array may alter read-only array inplace base = Series(np.array([(1, 2), None, 1], dtype="object")) base_copy = pickle.loads(pickle.dumps(base)) base_copy._values.flags.writeable = False - base_copy.astype("string[pyarrow]") + base_copy.astype(any_string_dtype) tm.assert_series_equal(base, base_copy) diff --git a/pandas/tests/libs/test_lib.py b/pandas/tests/libs/test_lib.py index 8583d8bcc052c..17dae1879f3b8 100644 --- a/pandas/tests/libs/test_lib.py +++ b/pandas/tests/libs/test_lib.py @@ -1,3 +1,5 @@ +import pickle + import numpy as np import pytest @@ -283,3 +285,15 @@ def test_no_default_pickle(): # GH#40397 obj = tm.round_trip_pickle(lib.no_default) assert obj is lib.no_default + + +def test_ensure_string_array_copy(): + # ensure the original array is not modified in case of copy=False with + # pickle-roundtripped object dtype array + # https://github.com/pandas-dev/pandas/issues/54654 + arr = np.array(["a", None], dtype=object) + arr = pickle.loads(pickle.dumps(arr)) + result = lib.ensure_string_array(arr, copy=False) + assert not np.shares_memory(arr, result) + assert arr[1] is None + assert result[1] is np.nan From 83fd9babc73fff1a5be53c3f33e8973ed9416b6e Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 9 Sep 2024 22:34:28 +0200 Subject: [PATCH 026/224] TST (string dtype): remove usage of 'string[pyarrow_numpy]' alias (#59758) --- pandas/conftest.py | 28 +++++++++++++++++++ pandas/tests/apply/test_numba.py | 6 ++-- .../tests/arrays/string_/test_string_arrow.py | 5 ++-- pandas/tests/base/test_misc.py | 4 +-- 
pandas/tests/frame/indexing/test_indexing.py | 10 ++----- pandas/tests/frame/methods/test_rank.py | 14 +++++----- pandas/tests/frame/test_constructors.py | 7 ++--- pandas/tests/groupby/methods/test_size.py | 13 ++------- .../groupby/methods/test_value_counts.py | 14 ++-------- pandas/tests/groupby/test_groupby.py | 11 ++------ pandas/tests/groupby/test_reductions.py | 5 ++-- .../indexes/base_class/test_constructors.py | 4 +-- .../tests/indexes/base_class/test_reshape.py | 7 ++--- pandas/tests/indexes/object/test_indexing.py | 23 ++++----------- pandas/tests/indexes/test_base.py | 5 ++-- pandas/tests/indexes/test_old_base.py | 5 +++- pandas/tests/interchange/test_impl.py | 8 ++++-- pandas/tests/io/json/test_pandas.py | 8 +++--- .../io/parser/dtypes/test_dtypes_basic.py | 11 +++----- pandas/tests/io/pytables/test_read.py | 5 ++-- pandas/tests/io/test_feather.py | 4 ++- pandas/tests/io/test_orc.py | 4 +-- pandas/tests/io/test_parquet.py | 8 +++--- pandas/tests/io/test_sql.py | 3 +- pandas/tests/reshape/test_get_dummies.py | 22 +++++++-------- pandas/tests/reshape/test_melt.py | 8 +++--- pandas/tests/series/test_logical_ops.py | 3 +- pandas/tests/strings/test_find_replace.py | 2 +- pandas/tests/util/test_shares_memory.py | 6 ++-- 29 files changed, 119 insertions(+), 134 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index d11213f1164bc..222aefb4afda8 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1272,6 +1272,34 @@ def string_dtype(request): return request.param +@pytest.fixture( + params=[ + ("python", pd.NA), + pytest.param(("pyarrow", pd.NA), marks=td.skip_if_no("pyarrow")), + pytest.param(("pyarrow", np.nan), marks=td.skip_if_no("pyarrow")), + ("python", np.nan), + ], + ids=[ + "string=string[python]", + "string=string[pyarrow]", + "string=str[pyarrow]", + "string=str[python]", + ], +) +def string_dtype_no_object(request): + """ + Parametrized fixture for string dtypes. 
+ * 'string[python]' (NA variant) + * 'string[pyarrow]' (NA variant) + * 'str' (NaN variant, with pyarrow) + * 'str' (NaN variant, without pyarrow) + """ + # need to instantiate the StringDtype here instead of in the params + # to avoid importing pyarrow during test collection + storage, na_value = request.param + return pd.StringDtype(storage, na_value) + + @pytest.fixture( params=[ "string[python]", diff --git a/pandas/tests/apply/test_numba.py b/pandas/tests/apply/test_numba.py index d86eeadbaa0fe..825d295043e69 100644 --- a/pandas/tests/apply/test_numba.py +++ b/pandas/tests/apply/test_numba.py @@ -5,6 +5,7 @@ import pandas.util._test_decorators as td +import pandas as pd from pandas import ( DataFrame, Index, @@ -29,11 +30,10 @@ def test_numba_vs_python_noop(float_frame, apply_axis): def test_numba_vs_python_string_index(): # GH#56189 - pytest.importorskip("pyarrow") df = DataFrame( 1, - index=Index(["a", "b"], dtype="string[pyarrow_numpy]"), - columns=Index(["x", "y"], dtype="string[pyarrow_numpy]"), + index=Index(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)), + columns=Index(["x", "y"], dtype=pd.StringDtype(na_value=np.nan)), ) func = lambda x: x result = df.apply(func, engine="numba", axis=0) diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py index b042cf632288b..d4363171788d4 100644 --- a/pandas/tests/arrays/string_/test_string_arrow.py +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -241,10 +241,11 @@ def test_setitem_invalid_indexer_raises(): arr[[0, 1]] = ["foo", "bar", "baz"] -@pytest.mark.parametrize("dtype", ["string[pyarrow]", "string[pyarrow_numpy]"]) -def test_pickle_roundtrip(dtype): +@pytest.mark.parametrize("na_value", [pd.NA, np.nan]) +def test_pickle_roundtrip(na_value): # GH 42600 pytest.importorskip("pyarrow") + dtype = StringDtype("pyarrow", na_value=na_value) expected = pd.Series(range(10), dtype=dtype) expected_sliced = expected.head(2) full_pickled = pickle.dumps(expected) diff --git a/pandas/tests/base/test_misc.py b/pandas/tests/base/test_misc.py index bbd9b150b88a8..7819b7b75f065 100644 --- a/pandas/tests/base/test_misc.py +++ b/pandas/tests/base/test_misc.py @@ -183,9 +183,7 @@ def test_access_by_position(index_flat): assert index[-1] == index[size - 1] msg = f"index {size} is out of bounds for axis 0 with size {size}" - if is_dtype_equal(index.dtype, "string[pyarrow]") or is_dtype_equal( - index.dtype, "string[pyarrow_numpy]" - ): + if isinstance(index.dtype, pd.StringDtype) and index.dtype.storage == "pyarrow": msg = "index out of bounds" with pytest.raises(IndexError, match=msg): index[size] diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 8ce4e8725d632..0723c3c70091c 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -1864,13 +1864,11 @@ def test_adding_new_conditional_column() -> None: ("dtype", "infer_string"), [ (object, False), - ("string[pyarrow_numpy]", True), + (pd.StringDtype(na_value=np.nan), True), ], ) def test_adding_new_conditional_column_with_string(dtype, infer_string) -> None: # https://github.com/pandas-dev/pandas/issues/56204 - pytest.importorskip("pyarrow") - df = DataFrame({"a": [1, 2], "b": [3, 4]}) with pd.option_context("future.infer_string", infer_string): df.loc[df["a"] == 1, "c"] = "1" @@ -1880,16 +1878,14 @@ def test_adding_new_conditional_column_with_string(dtype, infer_string) -> None: tm.assert_frame_equal(df, expected) 
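# A minimal sketch of the option these tests exercise (assuming pandas 3.x,
# where inference picks the NaN-variant string dtype):
#
#     with pd.option_context("future.infer_string", True):
#         ser = pd.Series(["a", "b"])
#     ser.dtype == pd.StringDtype(na_value=np.nan)  # True when inference is on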
-@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_add_new_column_infer_string(): # GH#55366 - pytest.importorskip("pyarrow") df = DataFrame({"x": [1]}) with pd.option_context("future.infer_string", True): df.loc[df["x"] == 1, "y"] = "1" expected = DataFrame( - {"x": [1], "y": Series(["1"], dtype="string[pyarrow_numpy]")}, - columns=Index(["x", "y"], dtype=object), + {"x": [1], "y": Series(["1"], dtype=pd.StringDtype(na_value=np.nan))}, + columns=Index(["x", "y"], dtype="str"), ) tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/frame/methods/test_rank.py b/pandas/tests/frame/methods/test_rank.py index 4b1435babe6b1..c1cdeaa6c10dd 100644 --- a/pandas/tests/frame/methods/test_rank.py +++ b/pandas/tests/frame/methods/test_rank.py @@ -14,6 +14,7 @@ ) from pandas.compat import HAS_PYARROW +import pandas as pd from pandas import ( DataFrame, Index, @@ -502,14 +503,13 @@ def test_rank_mixed_axis_zero(self, data, expected): result = df.rank(numeric_only=True) tm.assert_frame_equal(result, expected) - @pytest.mark.parametrize( - "dtype, exp_dtype", - [("string[pyarrow]", "Int64"), ("string[pyarrow_numpy]", "float64")], - ) - def test_rank_string_dtype(self, dtype, exp_dtype): + def test_rank_string_dtype(self, string_dtype_no_object): # GH#55362 - pytest.importorskip("pyarrow") - obj = Series(["foo", "foo", None, "foo"], dtype=dtype) + obj = Series(["foo", "foo", None, "foo"], dtype=string_dtype_no_object) result = obj.rank(method="first") + exp_dtype = "Int64" if string_dtype_no_object.na_value is pd.NA else "float64" + if string_dtype_no_object.storage == "python": + # TODO nullable string[python] should also return nullable Int64 + exp_dtype = "float64" expected = Series([1, 2, None, 3], dtype=exp_dtype) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 0176a36fe78d7..3d46e03547c38 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -2655,8 +2655,7 @@ def test_construct_with_strings_and_none(self): def test_frame_string_inference(self): # GH#54430 - pytest.importorskip("pyarrow") - dtype = "string[pyarrow_numpy]" + dtype = pd.StringDtype(na_value=np.nan) expected = DataFrame( {"a": ["a", "b"]}, dtype=dtype, columns=Index(["a"], dtype=dtype) ) @@ -2690,8 +2689,7 @@ def test_frame_string_inference(self): def test_frame_string_inference_array_string_dtype(self): # GH#54496 - pytest.importorskip("pyarrow") - dtype = "string[pyarrow_numpy]" + dtype = pd.StringDtype(na_value=np.nan) expected = DataFrame( {"a": ["a", "b"]}, dtype=dtype, columns=Index(["a"], dtype=dtype) ) @@ -2715,7 +2713,6 @@ def test_frame_string_inference_array_string_dtype(self): def test_frame_string_inference_block_dim(self): # GH#55363 - pytest.importorskip("pyarrow") with pd.option_context("future.infer_string", True): df = DataFrame(np.array([["hello", "goodbye"], ["hello", "Hello"]])) assert df._mgr.blocks[0].ndim == 2 diff --git a/pandas/tests/groupby/methods/test_size.py b/pandas/tests/groupby/methods/test_size.py index edeac642551a0..91200f53e36bd 100644 --- a/pandas/tests/groupby/methods/test_size.py +++ b/pandas/tests/groupby/methods/test_size.py @@ -3,8 +3,6 @@ from pandas._config import using_string_dtype -import pandas.util._test_decorators as td - from pandas import ( DataFrame, Index, @@ -79,16 +77,9 @@ def test_size_series_masked_type_returns_Int64(dtype): @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", 
strict=False) -@pytest.mark.parametrize( - "dtype", - [ - object, - pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")), - pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")), - ], -) -def test_size_strings(dtype): +def test_size_strings(any_string_dtype): # GH#55627 + dtype = any_string_dtype df = DataFrame({"a": ["a", "a", "b"], "b": "a"}, dtype=dtype) result = df.groupby("a")["b"].size() exp_dtype = "Int64" if dtype == "string[pyarrow]" else "int64" diff --git a/pandas/tests/groupby/methods/test_value_counts.py b/pandas/tests/groupby/methods/test_value_counts.py index da3d626f2d777..8f8f7f64aba75 100644 --- a/pandas/tests/groupby/methods/test_value_counts.py +++ b/pandas/tests/groupby/methods/test_value_counts.py @@ -7,8 +7,6 @@ import numpy as np import pytest -import pandas.util._test_decorators as td - from pandas import ( Categorical, CategoricalIndex, @@ -373,14 +371,6 @@ def test_against_frame_and_seriesgroupby( tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize( - "dtype", - [ - object, - pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")), - pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")), - ], -) @pytest.mark.parametrize("normalize", [True, False]) @pytest.mark.parametrize( "sort, ascending, expected_rows, expected_count, expected_group_size", @@ -398,9 +388,10 @@ def test_compound( expected_rows, expected_count, expected_group_size, - dtype, + any_string_dtype, using_infer_string, ): + dtype = any_string_dtype education_df = education_df.astype(dtype) education_df.columns = education_df.columns.astype(dtype) # Multiple groupby keys and as_index=False @@ -417,6 +408,7 @@ def test_compound( expected["proportion"] = expected_count expected["proportion"] /= expected_group_size if dtype == "string[pyarrow]": + # TODO(nullable) also string[python] should return nullable dtypes expected["proportion"] = expected["proportion"].convert_dtypes() else: expected["count"] = expected_count diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 11b874d0b1608..6393468fb8ccd 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2466,20 +2466,13 @@ def test_rolling_wrong_param_min_period(): test_df.groupby("name")["val"].rolling(window=2, min_period=1).sum() -@pytest.mark.parametrize( - "dtype", - [ - object, - pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")), - ], -) -def test_by_column_values_with_same_starting_value(dtype): +def test_by_column_values_with_same_starting_value(any_string_dtype): # GH29635 df = DataFrame( { "Name": ["Thomas", "Thomas", "Thomas John"], "Credit": [1200, 1300, 900], - "Mood": Series(["sad", "happy", "happy"], dtype=dtype), + "Mood": Series(["sad", "happy", "happy"], dtype=any_string_dtype), } ) aggregate_details = {"Mood": Series.mode, "Credit": "sum"} diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py index 8a421654cdf9b..a6ea1502103c5 100644 --- a/pandas/tests/groupby/test_reductions.py +++ b/pandas/tests/groupby/test_reductions.py @@ -714,10 +714,9 @@ def test_groupby_min_max_categorical(func): @pytest.mark.parametrize("func", ["min", "max"]) -def test_min_empty_string_dtype(func): +def test_min_empty_string_dtype(func, string_dtype_no_object): # GH#55619 - pytest.importorskip("pyarrow") - dtype = "string[pyarrow_numpy]" + dtype = string_dtype_no_object df = DataFrame({"a": ["a"], "b": "a", "c": "a"}, dtype=dtype).iloc[:0] result = 
getattr(df.groupby("a"), func)() expected = DataFrame( diff --git a/pandas/tests/indexes/base_class/test_constructors.py b/pandas/tests/indexes/base_class/test_constructors.py index 6036eddce7a01..0896b97e8a40e 100644 --- a/pandas/tests/indexes/base_class/test_constructors.py +++ b/pandas/tests/indexes/base_class/test_constructors.py @@ -47,9 +47,7 @@ def test_construct_empty_tuples(self, tuple_list): def test_index_string_inference(self): # GH#54430 - pytest.importorskip("pyarrow") - dtype = "string[pyarrow_numpy]" - expected = Index(["a", "b"], dtype=dtype) + expected = Index(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)) with pd.option_context("future.infer_string", True): ser = Index(["a", "b"]) tm.assert_index_equal(ser, expected) diff --git a/pandas/tests/indexes/base_class/test_reshape.py b/pandas/tests/indexes/base_class/test_reshape.py index e17e39a334acc..56cdca49cb2b0 100644 --- a/pandas/tests/indexes/base_class/test_reshape.py +++ b/pandas/tests/indexes/base_class/test_reshape.py @@ -57,12 +57,11 @@ def test_insert_datetime_into_object(self, loc, val): tm.assert_index_equal(result, expected) assert type(expected[2]) is type(val) - def test_insert_none_into_string_numpy(self): + def test_insert_none_into_string_numpy(self, string_dtype_no_object): # GH#55365 - pytest.importorskip("pyarrow") - index = Index(["a", "b", "c"], dtype="string[pyarrow_numpy]") + index = Index(["a", "b", "c"], dtype=string_dtype_no_object) result = index.insert(-1, None) - expected = Index(["a", "b", None, "c"], dtype="string[pyarrow_numpy]") + expected = Index(["a", "b", None, "c"], dtype=string_dtype_no_object) tm.assert_index_equal(result, expected) @pytest.mark.parametrize( diff --git a/pandas/tests/indexes/object/test_indexing.py b/pandas/tests/indexes/object/test_indexing.py index 2e9ba007a45c1..ea3d068a673e8 100644 --- a/pandas/tests/indexes/object/test_indexing.py +++ b/pandas/tests/indexes/object/test_indexing.py @@ -7,7 +7,6 @@ NA, is_matching_na, ) -import pandas.util._test_decorators as td import pandas as pd from pandas import Index @@ -160,14 +159,6 @@ def test_get_indexer_non_unique_np_nats(self, np_nat_fixture, np_nat_fixture2): class TestSliceLocs: - # TODO(infer_string) parametrize over multiple string dtypes - @pytest.mark.parametrize( - "dtype", - [ - "object", - pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")), - ], - ) @pytest.mark.parametrize( "in_slice,expected", [ @@ -191,24 +182,22 @@ class TestSliceLocs: (pd.IndexSlice["m":"m":-1], ""), # type: ignore[misc] ], ) - def test_slice_locs_negative_step(self, in_slice, expected, dtype): - index = Index(list("bcdxy"), dtype=dtype) + def test_slice_locs_negative_step(self, in_slice, expected, any_string_dtype): + index = Index(list("bcdxy"), dtype=any_string_dtype) s_start, s_stop = index.slice_locs(in_slice.start, in_slice.stop, in_slice.step) result = index[s_start : s_stop : in_slice.step] - expected = Index(list(expected), dtype=dtype) + expected = Index(list(expected), dtype=any_string_dtype) tm.assert_index_equal(result, expected) - # TODO(infer_string) parametrize over multiple string dtypes - @td.skip_if_no("pyarrow") - def test_slice_locs_negative_step_oob(self): - index = Index(list("bcdxy"), dtype="string[pyarrow_numpy]") + def test_slice_locs_negative_step_oob(self, any_string_dtype): + index = Index(list("bcdxy"), dtype=any_string_dtype) result = index[-10:5:1] tm.assert_index_equal(result, index) result = index[4:-10:-1] - expected = Index(list("yxdcb"), dtype="string[pyarrow_numpy]") + expected 
= Index(list("yxdcb"), dtype=any_string_dtype) tm.assert_index_equal(result, expected) def test_slice_locs_dup(self): diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 486b24845d2ff..2b62b384930d6 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -933,10 +933,9 @@ def test_isin_empty(self, empty): result = index.isin(empty) tm.assert_numpy_array_equal(expected, result) - @td.skip_if_no("pyarrow") - def test_isin_arrow_string_null(self): + def test_isin_string_null(self, string_dtype_no_object): # GH#55821 - index = Index(["a", "b"], dtype="string[pyarrow_numpy]") + index = Index(["a", "b"], dtype=string_dtype_no_object) result = index.isin([None]) expected = np.array([False, False]) tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/indexes/test_old_base.py b/pandas/tests/indexes/test_old_base.py index 75284a8f8fd47..cd3d599abd30e 100644 --- a/pandas/tests/indexes/test_old_base.py +++ b/pandas/tests/indexes/test_old_base.py @@ -295,7 +295,10 @@ def test_ensure_copied_data(self, index): tm.assert_numpy_array_equal( index._values._ndarray, result._values._ndarray, check_same="same" ) - elif index.dtype in ("string[pyarrow]", "string[pyarrow_numpy]"): + elif ( + isinstance(index.dtype, StringDtype) + and index.dtype.storage == "pyarrow" + ): assert tm.shares_memory(result._values, index._values) else: raise NotImplementedError(index.dtype) diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py index 76910db941d36..38961345dc1f2 100644 --- a/pandas/tests/interchange/test_impl.py +++ b/pandas/tests/interchange/test_impl.py @@ -465,7 +465,7 @@ def test_non_str_names_w_duplicates(): ([1.0, 2.25, None], "Float32[pyarrow]", "float32"), ([True, False, None], "boolean", "bool"), ([True, False, None], "boolean[pyarrow]", "bool"), - (["much ado", "about", None], "string[pyarrow_numpy]", "large_string"), + (["much ado", "about", None], pd.StringDtype(na_value=np.nan), "large_string"), (["much ado", "about", None], "string[pyarrow]", "large_string"), ( [datetime(2020, 1, 1), datetime(2020, 1, 2), None], @@ -528,7 +528,11 @@ def test_pandas_nullable_with_missing_values( ([1.0, 2.25, 5.0], "Float32[pyarrow]", "float32"), ([True, False, False], "boolean", "bool"), ([True, False, False], "boolean[pyarrow]", "bool"), - (["much ado", "about", "nothing"], "string[pyarrow_numpy]", "large_string"), + ( + ["much ado", "about", "nothing"], + pd.StringDtype(na_value=np.nan), + "large_string", + ), (["much ado", "about", "nothing"], "string[pyarrow]", "large_string"), ( [datetime(2020, 1, 1), datetime(2020, 1, 2), datetime(2020, 1, 3)], diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 3d07c0219691e..1c54232b8b510 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -2245,18 +2245,18 @@ def test_pyarrow_engine_lines_false(): def test_json_roundtrip_string_inference(orient): - pytest.importorskip("pyarrow") df = DataFrame( [["a", "b"], ["c", "d"]], index=["row 1", "row 2"], columns=["col 1", "col 2"] ) out = df.to_json() with pd.option_context("future.infer_string", True): result = read_json(StringIO(out)) + dtype = pd.StringDtype(na_value=np.nan) expected = DataFrame( [["a", "b"], ["c", "d"]], - dtype="string[pyarrow_numpy]", - index=Index(["row 1", "row 2"], dtype="string[pyarrow_numpy]"), - columns=Index(["col 1", "col 2"], dtype="string[pyarrow_numpy]"), + dtype=dtype, + index=Index(["row 1", "row 
2"], dtype=dtype), + columns=Index(["col 1", "col 2"], dtype=dtype), ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index 07f29518b7881..b664423364f6b 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -547,8 +547,7 @@ def test_ea_int_avoid_overflow(all_parsers): def test_string_inference(all_parsers): # GH#54430 - pytest.importorskip("pyarrow") - dtype = "string[pyarrow_numpy]" + dtype = pd.StringDtype(na_value=np.nan) data = """a,b x,1 @@ -568,8 +567,6 @@ def test_string_inference(all_parsers): @pytest.mark.parametrize("dtype", ["O", object, "object", np.object_, str, np.str_]) def test_string_inference_object_dtype(all_parsers, dtype): # GH#56047 - pytest.importorskip("pyarrow") - data = """a,b x,a y,a @@ -583,7 +580,7 @@ def test_string_inference_object_dtype(all_parsers, dtype): "a": pd.Series(["x", "y", "z"], dtype=object), "b": pd.Series(["a", "a", "a"], dtype=object), }, - columns=pd.Index(["a", "b"], dtype="string[pyarrow_numpy]"), + columns=pd.Index(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)), ) tm.assert_frame_equal(result, expected) @@ -593,9 +590,9 @@ def test_string_inference_object_dtype(all_parsers, dtype): expected = DataFrame( { "a": pd.Series(["x", "y", "z"], dtype=object), - "b": pd.Series(["a", "a", "a"], dtype="string[pyarrow_numpy]"), + "b": pd.Series(["a", "a", "a"], dtype=pd.StringDtype(na_value=np.nan)), }, - columns=pd.Index(["a", "b"], dtype="string[pyarrow_numpy]"), + columns=pd.Index(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)), ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/pytables/test_read.py b/pandas/tests/io/pytables/test_read.py index dd3a0eabe95ae..8ae87d4bab52d 100644 --- a/pandas/tests/io/pytables/test_read.py +++ b/pandas/tests/io/pytables/test_read.py @@ -310,7 +310,6 @@ def test_read_hdf_series_mode_r(tmp_path, format, setup_path): def test_read_infer_string(tmp_path, setup_path): # GH#54431 - pytest.importorskip("pyarrow") df = DataFrame({"a": ["a", "b", None]}) path = tmp_path / setup_path df.to_hdf(path, key="data", format="table") @@ -318,8 +317,8 @@ def test_read_infer_string(tmp_path, setup_path): result = read_hdf(path, key="data", mode="r") expected = DataFrame( {"a": ["a", "b", None]}, - dtype="string[pyarrow_numpy]", - columns=Index(["a"], dtype="string[pyarrow_numpy]"), + dtype=pd.StringDtype(na_value=np.nan), + columns=Index(["a"], dtype=pd.StringDtype(na_value=np.nan)), ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index a1f3babb1ae3b..9721d045b7b91 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -243,5 +243,7 @@ def test_string_inference(self, tmp_path): df.to_feather(path) with pd.option_context("future.infer_string", True): result = read_feather(path) - expected = pd.DataFrame(data={"a": ["x", "y"]}, dtype="string[pyarrow_numpy]") + expected = pd.DataFrame( + data={"a": ["x", "y"]}, dtype=pd.StringDtype(na_value=np.nan) + ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py index 90133344fdfc9..efb3dffecd856 100644 --- a/pandas/tests/io/test_orc.py +++ b/pandas/tests/io/test_orc.py @@ -436,7 +436,7 @@ def test_string_inference(tmp_path): result = read_orc(path) expected = pd.DataFrame( data={"a": ["x", "y"]}, - dtype="string[pyarrow_numpy]", - 
columns=pd.Index(["a"], dtype="string[pyarrow_numpy]"), + dtype=pd.StringDtype(na_value=np.nan), + columns=pd.Index(["a"], dtype=pd.StringDtype(na_value=np.nan)), ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index a29e479b7c9f1..4c2ea036f08dc 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -1109,8 +1109,8 @@ def test_string_inference(self, tmp_path, pa): result = read_parquet(path, engine="pyarrow") expected = pd.DataFrame( data={"a": ["x", "y"]}, - dtype="string[pyarrow_numpy]", - index=pd.Index(["a", "b"], dtype="string[pyarrow_numpy]"), + dtype=pd.StringDtype(na_value=np.nan), + index=pd.Index(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)), ) tm.assert_frame_equal(result, expected) @@ -1140,8 +1140,8 @@ def test_infer_string_large_string_type(self, tmp_path, pa): result = read_parquet(path) expected = pd.DataFrame( data={"a": [None, "b", "c"]}, - dtype="string[pyarrow_numpy]", - columns=pd.Index(["a"], dtype="string[pyarrow_numpy]"), + dtype=pd.StringDtype(na_value=np.nan), + columns=pd.Index(["a"], dtype=pd.StringDtype(na_value=np.nan)), ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 980c88f070b89..c28a33069d23f 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -3809,7 +3809,6 @@ class Test(BaseModel): def test_read_sql_string_inference(sqlite_engine): conn = sqlite_engine # GH#54430 - pytest.importorskip("pyarrow") table = "test" df = DataFrame({"a": ["x", "y"]}) df.to_sql(table, con=conn, index=False, if_exists="replace") @@ -3817,7 +3816,7 @@ def test_read_sql_string_inference(sqlite_engine): with pd.option_context("future.infer_string", True): result = read_sql_table(table, conn) - dtype = "string[pyarrow_numpy]" + dtype = pd.StringDtype(na_value=np.nan) expected = DataFrame( {"a": ["x", "y"]}, dtype=dtype, columns=Index(["a"], dtype=dtype) ) diff --git a/pandas/tests/reshape/test_get_dummies.py b/pandas/tests/reshape/test_get_dummies.py index f07c6845366da..9ce2c925a368b 100644 --- a/pandas/tests/reshape/test_get_dummies.py +++ b/pandas/tests/reshape/test_get_dummies.py @@ -708,19 +708,17 @@ def test_get_dummies_ea_dtype_dataframe(self, any_numeric_ea_and_arrow_dtype): ) tm.assert_frame_equal(result, expected) - @td.skip_if_no("pyarrow") - def test_get_dummies_ea_dtype(self): + @pytest.mark.parametrize("dtype_type", ["string", "category"]) + def test_get_dummies_ea_dtype(self, dtype_type, string_dtype_no_object): # GH#56273 - for dtype, exp_dtype in [ - ("string[pyarrow]", "boolean"), - ("string[pyarrow_numpy]", "bool"), - (CategoricalDtype(Index(["a"], dtype="string[pyarrow]")), "boolean"), - (CategoricalDtype(Index(["a"], dtype="string[pyarrow_numpy]")), "bool"), - ]: - df = DataFrame({"name": Series(["a"], dtype=dtype), "x": 1}) - result = get_dummies(df) - expected = DataFrame({"x": 1, "name_a": Series([True], dtype=exp_dtype)}) - tm.assert_frame_equal(result, expected) + dtype = string_dtype_no_object + exp_dtype = "boolean" if dtype.na_value is pd.NA else "bool" + if dtype_type == "category": + dtype = CategoricalDtype(Index(["a"], dtype)) + df = DataFrame({"name": Series(["a"], dtype=dtype), "x": 1}) + result = get_dummies(df) + expected = DataFrame({"x": 1, "name_a": Series([True], dtype=exp_dtype)}) + tm.assert_frame_equal(result, expected) @td.skip_if_no("pyarrow") def test_get_dummies_arrow_dtype(self): diff --git a/pandas/tests/reshape/test_melt.py 
b/pandas/tests/reshape/test_melt.py index 4a12404f6775a..95aa5291cb45a 100644 --- a/pandas/tests/reshape/test_melt.py +++ b/pandas/tests/reshape/test_melt.py @@ -1242,9 +1242,9 @@ def test_missing_stubname(self, any_string_dtype): tm.assert_frame_equal(result, expected) -def test_wide_to_long_pyarrow_string_columns(): +def test_wide_to_long_string_columns(string_storage): # GH 57066 - pytest.importorskip("pyarrow") + string_dtype = pd.StringDtype(string_storage, na_value=np.nan) df = DataFrame( { "ID": {0: 1}, @@ -1254,7 +1254,7 @@ def test_wide_to_long_pyarrow_string_columns(): "D": {0: 1}, } ) - df.columns = df.columns.astype("string[pyarrow_numpy]") + df.columns = df.columns.astype(string_dtype) result = wide_to_long( df, stubnames="R", i="ID", j="UNPIVOTED", sep="_", suffix=".*" ) @@ -1264,7 +1264,7 @@ def test_wide_to_long_pyarrow_string_columns(): index=pd.MultiIndex.from_arrays( [ [1, 1, 1], - Index(["test1", "test2", "test3"], dtype="string[pyarrow_numpy]"), + Index(["test1", "test2", "test3"], dtype=string_dtype), ], names=["ID", "UNPIVOTED"], ), diff --git a/pandas/tests/series/test_logical_ops.py b/pandas/tests/series/test_logical_ops.py index 1586195e79a9d..8516018e8aa93 100644 --- a/pandas/tests/series/test_logical_ops.py +++ b/pandas/tests/series/test_logical_ops.py @@ -9,6 +9,7 @@ DataFrame, Index, Series, + StringDtype, bdate_range, ) import pandas._testing as tm @@ -514,7 +515,7 @@ def test_pyarrow_numpy_string_invalid(self): # GH#56008 pa = pytest.importorskip("pyarrow") ser = Series([False, True]) - ser2 = Series(["a", "b"], dtype="string[pyarrow_numpy]") + ser2 = Series(["a", "b"], dtype=StringDtype(na_value=np.nan)) result = ser == ser2 expected_eq = Series(False, index=ser.index) tm.assert_series_equal(result, expected_eq) diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index bf01c4996bb32..ea9f89ed129aa 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -22,7 +22,7 @@ def using_pyarrow(dtype): - return dtype in ("string[pyarrow]", "string[pyarrow_numpy]") + return dtype == "string" and dtype.storage == "pyarrow" def test_contains(any_string_dtype): diff --git a/pandas/tests/util/test_shares_memory.py b/pandas/tests/util/test_shares_memory.py index 00a897d574a07..8f1ac93b40247 100644 --- a/pandas/tests/util/test_shares_memory.py +++ b/pandas/tests/util/test_shares_memory.py @@ -1,3 +1,5 @@ +import numpy as np + import pandas.util._test_decorators as td import pandas as pd @@ -20,10 +22,10 @@ def test_shares_memory_string(): # GH#55823 import pyarrow as pa - obj = pd.array(["a", "b"], dtype="string[pyarrow]") + obj = pd.array(["a", "b"], dtype=pd.StringDtype("pyarrow", na_value=pd.NA)) assert tm.shares_memory(obj, obj) - obj = pd.array(["a", "b"], dtype="string[pyarrow_numpy]") + obj = pd.array(["a", "b"], dtype=pd.StringDtype("pyarrow", na_value=np.nan)) assert tm.shares_memory(obj, obj) obj = pd.array(["a", "b"], dtype=pd.ArrowDtype(pa.string())) From 715585de0d66383c51ce290ad6b18a036254d007 Mon Sep 17 00:00:00 2001 From: aaronchucarroll <120818400+aaronchucarroll@users.noreply.github.com> Date: Mon, 9 Sep 2024 17:38:04 -0400 Subject: [PATCH 027/224] ENH: Add dtype argument to StringMethods get_dummies() (#59577) --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/arrays/arrow/array.py | 15 +++- pandas/core/arrays/categorical.py | 4 +- pandas/core/arrays/string_arrow.py | 19 ++++- pandas/core/strings/accessor.py | 27 ++++++- pandas/core/strings/base.py | 3 +- 
pandas/core/strings/object_array.py | 13 +++- pandas/tests/strings/test_get_dummies.py | 99 ++++++++++++++++++++---- 8 files changed, 154 insertions(+), 27 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 9a29ff4d49966..819318e119668 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -55,6 +55,7 @@ Other enhancements - :meth:`Series.plot` now correctly handle the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`) - :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`) - :meth:`pandas.concat` will raise a ``ValueError`` when ``ignore_index=True`` and ``keys`` is not ``None`` (:issue:`59274`) +- :meth:`str.get_dummies` now accepts a ``dtype`` parameter to specify the dtype of the resulting DataFrame (:issue:`47872`) - Multiplying two :class:`DateOffset` objects will now raise a ``TypeError`` instead of a ``RecursionError`` (:issue:`59442`) - Restore support for reading Stata 104-format and enable reading 103-format dta files (:issue:`58554`) - Support passing a :class:`Iterable[Hashable]` input to :meth:`DataFrame.drop_duplicates` (:issue:`59237`) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 15f9ba611a642..4edf464be74f1 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -41,6 +41,7 @@ is_list_like, is_numeric_dtype, is_scalar, + pandas_dtype, ) from pandas.core.dtypes.dtypes import DatetimeTZDtype from pandas.core.dtypes.missing import isna @@ -2475,7 +2476,9 @@ def _str_findall(self, pat: str, flags: int = 0) -> Self: result = self._apply_elementwise(predicate) return type(self)(pa.chunked_array(result)) - def _str_get_dummies(self, sep: str = "|"): + def _str_get_dummies(self, sep: str = "|", dtype: NpDtype | None = None): + if dtype is None: + dtype = np.bool_ split = pc.split_pattern(self._pa_array, sep) flattened_values = pc.list_flatten(split) uniques = flattened_values.unique() @@ -2485,7 +2488,15 @@ def _str_get_dummies(self, sep: str = "|"): n_cols = len(uniques) indices = pc.index_in(flattened_values, uniques_sorted).to_numpy() indices = indices + np.arange(n_rows).repeat(lengths) * n_cols - dummies = np.zeros(n_rows * n_cols, dtype=np.bool_) + _dtype = pandas_dtype(dtype) + dummies_dtype: NpDtype + if isinstance(_dtype, np.dtype): + dummies_dtype = _dtype + else: + dummies_dtype = np.bool_ + dummies = np.zeros(n_rows * n_cols, dtype=dummies_dtype) + if dtype == str: + dummies[:] = False dummies[indices] = True dummies = dummies.reshape((n_rows, n_cols)) result = type(self)(pa.array(list(dummies))) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index c613a345686cc..8e0225b31e17b 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2681,11 +2681,11 @@ def _str_map( result = NumpyExtensionArray(categories.to_numpy())._str_map(f, na_value, dtype) return take_nd(result, codes, fill_value=na_value) - def _str_get_dummies(self, sep: str = "|"): + def _str_get_dummies(self, sep: str = "|", dtype: NpDtype | None = None): # sep may not be in categories. Just bail on this. 
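For reference, the flattened-index trick in the pyarrow-backed ``_str_get_dummies`` above can be reproduced standalone. This is a minimal sketch with made-up sample data, assuming pyarrow is installed; it is not the exact library code:

    import numpy as np
    import pyarrow as pa
    import pyarrow.compute as pc

    arr = pa.chunked_array([pa.array(["a|b", "a|c"])])
    split = pc.split_pattern(arr, "|")          # [["a", "b"], ["a", "c"]]
    flat = pc.list_flatten(split)               # ["a", "b", "a", "c"]
    uniques = flat.unique()
    uniques_sorted = uniques.take(pc.array_sort_indices(uniques))
    lengths = pc.list_value_length(split).fill_null(0).to_numpy()
    n_rows, n_cols = len(split), len(uniques)
    # column index of every token, offset to its row's slot in a flat buffer
    indices = pc.index_in(flat, uniques_sorted).to_numpy()
    indices = indices + np.arange(n_rows).repeat(lengths) * n_cols
    dummies = np.zeros(n_rows * n_cols, dtype=np.bool_)
    dummies[indices] = True
    print(dummies.reshape((n_rows, n_cols)))    # -> [[ True  True False] [ True False  True]]

The requested ``dtype`` only changes what that boolean buffer is allocated or cast as.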
from pandas.core.arrays import NumpyExtensionArray - return NumpyExtensionArray(self.astype(str))._str_get_dummies(sep) + return NumpyExtensionArray(self.astype(str))._str_get_dummies(sep, dtype) # ------------------------------------------------------------------------ # GroupBy Methods diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 1e5adf106752f..fa8c662b68f3c 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -56,6 +56,7 @@ ArrayLike, AxisInt, Dtype, + NpDtype, Scalar, Self, npt, @@ -425,12 +426,22 @@ def _str_find(self, sub: str, start: int = 0, end: int | None = None): return super()._str_find(sub, start, end) return ArrowStringArrayMixin._str_find(self, sub, start, end) - def _str_get_dummies(self, sep: str = "|"): - dummies_pa, labels = ArrowExtensionArray(self._pa_array)._str_get_dummies(sep) + def _str_get_dummies(self, sep: str = "|", dtype: NpDtype | None = None): + if dtype is None: + dtype = np.int64 + dummies_pa, labels = ArrowExtensionArray(self._pa_array)._str_get_dummies( + sep, dtype + ) if len(labels) == 0: - return np.empty(shape=(0, 0), dtype=np.int64), labels + return np.empty(shape=(0, 0), dtype=dtype), labels dummies = np.vstack(dummies_pa.to_numpy()) - return dummies.astype(np.int64, copy=False), labels + _dtype = pandas_dtype(dtype) + dummies_dtype: NpDtype + if isinstance(_dtype, np.dtype): + dummies_dtype = _dtype + else: + dummies_dtype = np.bool_ + return dummies.astype(dummies_dtype, copy=False), labels def _convert_int_result(self, result): if self.dtype.na_value is np.nan: diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index bdb88e981bcda..6d10365a1b968 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -26,6 +26,7 @@ from pandas.core.dtypes.common import ( ensure_object, is_bool_dtype, + is_extension_array_dtype, is_integer, is_list_like, is_object_dtype, @@ -54,6 +55,8 @@ Iterator, ) + from pandas._typing import NpDtype + from pandas import ( DataFrame, Index, @@ -2431,7 +2434,11 @@ def wrap( return self._wrap_result(result) @forbid_nonstring_types(["bytes"]) - def get_dummies(self, sep: str = "|"): + def get_dummies( + self, + sep: str = "|", + dtype: NpDtype | None = None, + ): """ Return DataFrame of dummy/indicator variables for Series. @@ -2442,6 +2449,8 @@ def get_dummies(self, sep: str = "|"): ---------- sep : str, default "|" String to split on. + dtype : dtype, default np.int64 + Data type for new columns. Only a single dtype is allowed. Returns ------- @@ -2466,10 +2475,24 @@ def get_dummies(self, sep: str = "|"): 0 1 1 0 1 0 0 0 2 1 0 1 + + >>> pd.Series(["a|b", np.nan, "a|c"]).str.get_dummies(dtype=bool) + a b c + 0 True True False + 1 False False False + 2 True False True """ + from pandas.core.frame import DataFrame + # we need to cast to Series of strings as only that has all # methods available for making the dummies... 
- result, name = self._data.array._str_get_dummies(sep) + result, name = self._data.array._str_get_dummies(sep, dtype) + if is_extension_array_dtype(dtype) or isinstance(dtype, ArrowDtype): + return self._wrap_result( + DataFrame(result, columns=name, dtype=dtype), + name=name, + returns_string=False, + ) return self._wrap_result( result, name=name, diff --git a/pandas/core/strings/base.py b/pandas/core/strings/base.py index 1281a03e297f9..97d906e3df077 100644 --- a/pandas/core/strings/base.py +++ b/pandas/core/strings/base.py @@ -16,6 +16,7 @@ import re from pandas._typing import ( + NpDtype, Scalar, Self, ) @@ -163,7 +164,7 @@ def _str_wrap(self, width: int, **kwargs): pass @abc.abstractmethod - def _str_get_dummies(self, sep: str = "|"): + def _str_get_dummies(self, sep: str = "|", dtype: NpDtype | None = None): pass @abc.abstractmethod diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index c6b18d7049c57..6211c7b528db9 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -18,6 +18,7 @@ import pandas._libs.ops as libops from pandas.util._exceptions import find_stack_level +from pandas.core.dtypes.common import pandas_dtype from pandas.core.dtypes.missing import isna from pandas.core.strings.base import BaseStringArrayMethods @@ -398,9 +399,11 @@ def _str_wrap(self, width: int, **kwargs): tw = textwrap.TextWrapper(**kwargs) return self._str_map(lambda s: "\n".join(tw.wrap(s))) - def _str_get_dummies(self, sep: str = "|"): + def _str_get_dummies(self, sep: str = "|", dtype: NpDtype | None = None): from pandas import Series + if dtype is None: + dtype = np.int64 arr = Series(self).fillna("") try: arr = sep + arr + sep @@ -412,7 +415,13 @@ def _str_get_dummies(self, sep: str = "|"): tags.update(ts) tags2 = sorted(tags - {""}) - dummies = np.empty((len(arr), len(tags2)), dtype=np.int64) + _dtype = pandas_dtype(dtype) + dummies_dtype: NpDtype + if isinstance(_dtype, np.dtype): + dummies_dtype = _dtype + else: + dummies_dtype = np.bool_ + dummies = np.empty((len(arr), len(tags2)), dtype=dummies_dtype) def _isin(test_elements: str, element: str) -> bool: return element in test_elements diff --git a/pandas/tests/strings/test_get_dummies.py b/pandas/tests/strings/test_get_dummies.py index 31386e4e342ae..0656f505dc745 100644 --- a/pandas/tests/strings/test_get_dummies.py +++ b/pandas/tests/strings/test_get_dummies.py @@ -1,4 +1,7 @@ import numpy as np +import pytest + +import pandas.util._test_decorators as td from pandas import ( DataFrame, @@ -8,6 +11,11 @@ _testing as tm, ) +try: + import pyarrow as pa +except ImportError: + pa = None + def test_get_dummies(any_string_dtype): s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype) @@ -32,22 +40,85 @@ def test_get_dummies_index(): tm.assert_index_equal(result, expected) -def test_get_dummies_with_name_dummy(any_string_dtype): - # GH 12180 - # Dummies named 'name' should work as expected - s = Series(["a", "b,name", "b"], dtype=any_string_dtype) - result = s.str.get_dummies(",") - expected = DataFrame([[1, 0, 0], [0, 1, 1], [0, 1, 0]], columns=["a", "b", "name"]) +# GH#47872 +@pytest.mark.parametrize( + "dtype", + [ + np.uint8, + np.int16, + np.uint16, + np.int32, + np.uint32, + np.int64, + np.uint64, + bool, + "Int8", + "Int16", + "Int32", + "Int64", + "boolean", + ], +) +def test_get_dummies_with_dtype(any_string_dtype, dtype): + s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype) + result = s.str.get_dummies("|", dtype=dtype) + expected = DataFrame( + [[1, 
1, 0], [1, 0, 1], [0, 0, 0]], columns=list("abc"), dtype=dtype + ) tm.assert_frame_equal(result, expected) -def test_get_dummies_with_name_dummy_index(): - # GH 12180 - # Dummies named 'name' should work as expected - idx = Index(["a|b", "name|c", "b|name"]) - result = idx.str.get_dummies("|") +# GH#47872 +@td.skip_if_no("pyarrow") +@pytest.mark.parametrize( + "dtype", + [ + "int8[pyarrow]", + "uint8[pyarrow]", + "int16[pyarrow]", + "uint16[pyarrow]", + "int32[pyarrow]", + "uint32[pyarrow]", + "int64[pyarrow]", + "uint64[pyarrow]", + "bool[pyarrow]", + ], +) +def test_get_dummies_with_pyarrow_dtype(any_string_dtype, dtype): + s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype) + result = s.str.get_dummies("|", dtype=dtype) + expected = DataFrame( + [[1, 1, 0], [1, 0, 1], [0, 0, 0]], + columns=list("abc"), + dtype=dtype, + ) + tm.assert_frame_equal(result, expected) - expected = MultiIndex.from_tuples( - [(1, 1, 0, 0), (0, 0, 1, 1), (0, 1, 0, 1)], names=("a", "b", "c", "name") + +# GH#47872 +def test_get_dummies_with_str_dtype(any_string_dtype): + s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype) + result = s.str.get_dummies("|", dtype=str) + expected = DataFrame( + [["T", "T", "F"], ["T", "F", "T"], ["F", "F", "F"]], + columns=list("abc"), + dtype=str, ) - tm.assert_index_equal(result, expected) + tm.assert_frame_equal(result, expected) + + +# GH#47872 +@td.skip_if_no("pyarrow") +def test_get_dummies_with_pa_str_dtype(any_string_dtype): + s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype) + result = s.str.get_dummies("|", dtype="str[pyarrow]") + expected = DataFrame( + [ + ["true", "true", "false"], + ["true", "false", "true"], + ["false", "false", "false"], + ], + columns=list("abc"), + dtype="str[pyarrow]", + ) + tm.assert_frame_equal(result, expected) From 50ac1907abeef8e6824472988a9f015dcd25bb21 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 10 Sep 2024 01:18:29 -0700 Subject: [PATCH 028/224] BUG (string): Series.str.slice with negative step (#59724) Co-authored-by: Joris Van den Bossche --- doc/source/whatsnew/v2.3.0.rst | 3 +- pandas/core/arrays/_arrow_string_mixins.py | 32 ++++++++++++++++------ pandas/core/arrays/arrow/array.py | 11 -------- pandas/core/arrays/string_arrow.py | 14 +--------- pandas/tests/extension/test_arrow.py | 1 + pandas/tests/strings/test_strings.py | 1 + 6 files changed, 28 insertions(+), 34 deletions(-) diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst index 03355f655eb28..03b3a6b55dff6 100644 --- a/doc/source/whatsnew/v2.3.0.rst +++ b/doc/source/whatsnew/v2.3.0.rst @@ -103,8 +103,9 @@ Conversion Strings ^^^^^^^ - Bug in :meth:`Series.str.replace` when ``n < 0`` for :class:`StringDtype` with ``storage="pyarrow"`` (:issue:`59628`) +- Bug in ``ser.str.slice`` with negative ``step`` with :class:`ArrowDtype` and :class:`StringDtype` with ``storage="pyarrow"`` giving incorrect results (:issue:`59710`) - Bug in the ``center`` method on :class:`Series` and :class:`Index` object ``str`` accessors with pyarrow-backed dtype not matching the python behavior in corner cases with an odd number of fill characters (:issue:`54792`) - +- Interval ^^^^^^^^ diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index 950d4cd7cc92e..32fa5e7c383b5 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -11,6 +11,7 @@ from pandas.compat import ( pa_version_under10p1, + pa_version_under11p0, pa_version_under13p0, 
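Sketching the user-visible change this patch is building toward: with a negative ``step`` and no explicit ``start``, slicing now matches Python's slice semantics instead of producing incorrect results (GH 59710). A hypothetical check, assuming a pyarrow-backed string dtype is available:

    import pandas as pd

    ser = pd.Series(["abcdef", None], dtype="string[pyarrow]")
    # "abcdef"[::-1] == "fedcba"; start falls back to -1 when step < 0
    print(ser.str.slice(step=-1))          # ["fedcba", <NA>]
    # "abcdef"[:2:-1] == "fed"
    print(ser.str.slice(stop=2, step=-1))  # ["fed", <NA>]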
pa_version_under17p0, ) @@ -22,10 +23,7 @@ import pyarrow.compute as pc if TYPE_CHECKING: - from collections.abc import ( - Callable, - Sized, - ) + from collections.abc import Callable from pandas._typing import ( Scalar, @@ -34,7 +32,7 @@ class ArrowStringArrayMixin: - _pa_array: Sized + _pa_array: pa.ChunkedArray def __init__(self, *args, **kwargs) -> None: raise NotImplementedError @@ -96,13 +94,29 @@ def _str_get(self, i: int) -> Self: selected = pc.utf8_slice_codeunits( self._pa_array, start=start, stop=stop, step=step ) - null_value = pa.scalar( - None, - type=self._pa_array.type, # type: ignore[attr-defined] - ) + null_value = pa.scalar(None, type=self._pa_array.type) result = pc.if_else(not_out_of_bounds, selected, null_value) return type(self)(result) + def _str_slice( + self, start: int | None = None, stop: int | None = None, step: int | None = None + ) -> Self: + if pa_version_under11p0: + # GH#59724 + result = self._apply_elementwise(lambda val: val[start:stop:step]) + return type(self)(pa.chunked_array(result, type=self._pa_array.type)) + if start is None: + if step is not None and step < 0: + # GH#59710 + start = -1 + else: + start = 0 + if step is None: + step = 1 + return type(self)( + pc.utf8_slice_codeunits(self._pa_array, start=start, stop=stop, step=step) + ) + def _str_slice_replace( self, start: int | None = None, stop: int | None = None, repl: str | None = None ) -> Self: diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 4edf464be74f1..41d40d8304e8f 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -2394,17 +2394,6 @@ def _str_rpartition(self, sep: str, expand: bool) -> Self: result = self._apply_elementwise(predicate) return type(self)(pa.chunked_array(result)) - def _str_slice( - self, start: int | None = None, stop: int | None = None, step: int | None = None - ) -> Self: - if start is None: - start = 0 - if step is None: - step = 1 - return type(self)( - pc.utf8_slice_codeunits(self._pa_array, start=start, stop=stop, step=step) - ) - def _str_len(self) -> Self: return type(self)(pc.utf8_length(self._pa_array)) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index fa8c662b68f3c..73dc822bb8ef5 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -294,6 +294,7 @@ def astype(self, dtype, copy: bool = True): _str_startswith = ArrowStringArrayMixin._str_startswith _str_endswith = ArrowStringArrayMixin._str_endswith _str_pad = ArrowStringArrayMixin._str_pad + _str_slice = ArrowStringArrayMixin._str_slice def _str_contains( self, pat, case: bool = True, flags: int = 0, na=np.nan, regex: bool = True @@ -352,19 +353,6 @@ def _str_fullmatch( pat = f"{pat}$" return self._str_match(pat, case, flags, na) - def _str_slice( - self, start: int | None = None, stop: int | None = None, step: int | None = None - ) -> Self: - if stop is None: - return super()._str_slice(start, stop, step) - if start is None: - start = 0 - if step is None: - step = 1 - return type(self)( - pc.utf8_slice_codeunits(self._pa_array, start=start, stop=stop, step=step) - ) - def _str_len(self): result = pc.utf8_length(self._pa_array) return self._convert_int_result(result) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index fc4f14882b9d7..f86d927ddda67 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -2036,6 +2036,7 @@ def test_str_join_string_type(): [None, 2, None, 
["ab", None]], [None, 2, 1, ["ab", None]], [1, 3, 1, ["bc", None]], + (None, None, -1, ["dcba", None]), ], ) def test_str_slice(start, stop, step, exp): diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py index 1ce46497c3c22..4995b448f7e94 100644 --- a/pandas/tests/strings/test_strings.py +++ b/pandas/tests/strings/test_strings.py @@ -394,6 +394,7 @@ def test_pipe_failures(any_string_dtype): (2, 5, None, ["foo", "bar", np.nan, "baz"]), (0, 3, -1, ["", "", np.nan, ""]), (None, None, -1, ["owtoofaa", "owtrabaa", np.nan, "xuqzabaa"]), + (None, 2, -1, ["owtoo", "owtra", np.nan, "xuqza"]), (3, 10, 2, ["oto", "ato", np.nan, "aqx"]), (3, 0, -1, ["ofa", "aba", np.nan, "aba"]), ], From de51d336d10f198cb5594ba55530c9401b4eff18 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 10 Sep 2024 16:35:18 +0200 Subject: [PATCH 029/224] String dtype: remove fallback Perfomance warnings for string methods (#59760) --- pandas/core/arrays/arrow/_arrow_utils.py | 19 ---- pandas/core/arrays/string_arrow.py | 8 -- pandas/tests/extension/test_string.py | 1 - pandas/tests/indexes/test_setops.py | 12 --- pandas/tests/strings/test_find_replace.py | 103 ++++++---------------- pandas/tests/strings/test_string_array.py | 1 - 6 files changed, 27 insertions(+), 117 deletions(-) diff --git a/pandas/core/arrays/arrow/_arrow_utils.py b/pandas/core/arrays/arrow/_arrow_utils.py index cbc9ce0252750..285c3fd465ffc 100644 --- a/pandas/core/arrays/arrow/_arrow_utils.py +++ b/pandas/core/arrays/arrow/_arrow_utils.py @@ -1,27 +1,8 @@ from __future__ import annotations -import warnings - import numpy as np import pyarrow -from pandas._config.config import get_option - -from pandas.errors import PerformanceWarning -from pandas.util._exceptions import find_stack_level - - -def fallback_performancewarning(version: str | None = None) -> None: - """ - Raise a PerformanceWarning for falling back to ExtensionArray's - non-pyarrow method - """ - if get_option("performance_warnings"): - msg = "Falling back on a non-pyarrow code path which may decrease performance." - if version is not None: - msg += f" Upgrade to pyarrow >={version} to possibly suppress this warning." 
- warnings.warn(msg, PerformanceWarning, stacklevel=find_stack_level()) - def pyarrow_array_to_numpy_and_mask( arr, dtype: np.dtype diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 73dc822bb8ef5..a669b6d669b48 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -10,8 +10,6 @@ import numpy as np -from pandas._config.config import get_option - from pandas._libs import ( lib, missing as libmissing, @@ -43,8 +41,6 @@ import pyarrow as pa import pyarrow.compute as pc - from pandas.core.arrays.arrow._arrow_utils import fallback_performancewarning - if TYPE_CHECKING: from collections.abc import ( @@ -300,8 +296,6 @@ def _str_contains( self, pat, case: bool = True, flags: int = 0, na=np.nan, regex: bool = True ): if flags: - if get_option("mode.performance_warnings"): - fallback_performancewarning() return super()._str_contains(pat, case, flags, na, regex) if not isna(na): @@ -327,8 +321,6 @@ def _str_replace( regex: bool = True, ): if isinstance(pat, re.Pattern) or callable(repl) or not case or flags: - if get_option("mode.performance_warnings"): - fallback_performancewarning() return super()._str_replace(pat, repl, n, case, flags, regex) return ArrowExtensionArray._str_replace(self, pat, repl, n, case, flags, regex) diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 17f6eb8282b23..509ae653e4793 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -209,7 +209,6 @@ def test_compare_scalar(self, data, comparison_op): ser = pd.Series(data) self._compare_other(ser, data, comparison_op, "abc") - @pytest.mark.filterwarnings("ignore:Falling back:pandas.errors.PerformanceWarning") def test_groupby_extension_apply(self, data_for_grouping, groupby_apply_op): super().test_groupby_extension_apply(data_for_grouping, groupby_apply_op) diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py index 8fd349dacf9e9..e5dc47be20677 100644 --- a/pandas/tests/indexes/test_setops.py +++ b/pandas/tests/indexes/test_setops.py @@ -246,9 +246,6 @@ def test_intersection_base(self, index): with pytest.raises(TypeError, match=msg): first.intersection([1, 2, 3]) - @pytest.mark.filterwarnings( - "ignore:Falling back on a non-pyarrow:pandas.errors.PerformanceWarning" - ) @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") def test_union_base(self, index): index = index.unique() @@ -276,9 +273,6 @@ def test_union_base(self, index): first.union([1, 2, 3]) @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") - @pytest.mark.filterwarnings( - "ignore:Falling back on a non-pyarrow:pandas.errors.PerformanceWarning" - ) def test_difference_base(self, sort, index): first = index[2:] second = index[:4] @@ -305,9 +299,6 @@ def test_difference_base(self, sort, index): first.difference([1, 2, 3], sort) @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") - @pytest.mark.filterwarnings( - "ignore:Falling back on a non-pyarrow:pandas.errors.PerformanceWarning" - ) def test_symmetric_difference(self, index): if isinstance(index, CategoricalIndex): pytest.skip(f"Not relevant for {type(index).__name__}") @@ -529,9 +520,6 @@ def test_intersection_difference_match_empty(self, index, sort): @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") -@pytest.mark.filterwarnings( - "ignore:Falling back on a 
non-pyarrow:pandas.errors.PerformanceWarning" -) @pytest.mark.parametrize( "method", ["intersection", "union", "difference", "symmetric_difference"] ) diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index ea9f89ed129aa..f3698a2ea33cf 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -21,10 +21,6 @@ # -------------------------------------------------------------------------------------- -def using_pyarrow(dtype): - return dtype == "string" and dtype.storage == "pyarrow" - - def test_contains(any_string_dtype): values = np.array( ["foo", np.nan, "fooommm__foo", "mmm_", "foommm[_]+bar"], dtype=np.object_ @@ -458,13 +454,10 @@ def test_replace_mixed_object(): tm.assert_series_equal(result, expected) -def test_replace_unicode(any_string_dtype, performance_warning): +def test_replace_unicode(any_string_dtype): ser = Series([b"abcd,\xc3\xa0".decode("utf-8")], dtype=any_string_dtype) expected = Series([b"abcd, \xc3\xa0".decode("utf-8")], dtype=any_string_dtype) - with tm.maybe_produces_warning( - performance_warning, using_pyarrow(any_string_dtype) - ): - result = ser.str.replace(r"(?<=\w),(?=\w)", ", ", flags=re.UNICODE, regex=True) + result = ser.str.replace(r"(?<=\w),(?=\w)", ", ", flags=re.UNICODE, regex=True) tm.assert_series_equal(result, expected) @@ -478,16 +471,13 @@ def test_replace_wrong_repl_type_raises(any_string_dtype, index_or_series, repl, obj.str.replace("a", repl) -def test_replace_callable(any_string_dtype, performance_warning): +def test_replace_callable(any_string_dtype): # GH 15055 ser = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype) # test with callable repl = lambda m: m.group(0).swapcase() - with tm.maybe_produces_warning( - performance_warning, using_pyarrow(any_string_dtype) - ): - result = ser.str.replace("[a-z][A-Z]{2}", repl, n=2, regex=True) + result = ser.str.replace("[a-z][A-Z]{2}", repl, n=2, regex=True) expected = Series(["foObaD__baRbaD", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) @@ -495,7 +485,7 @@ def test_replace_callable(any_string_dtype, performance_warning): @pytest.mark.parametrize( "repl", [lambda: None, lambda m, x: None, lambda m, x, y=None: None] ) -def test_replace_callable_raises(any_string_dtype, performance_warning, repl): +def test_replace_callable_raises(any_string_dtype, repl): # GH 15055 values = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype) @@ -504,43 +494,31 @@ def test_replace_callable_raises(any_string_dtype, performance_warning, repl): r"((takes)|(missing)) (?(2)from \d+ to )?\d+ " r"(?(3)required )positional arguments?" 
) - if not using_pyarrow(any_string_dtype): - performance_warning = False with pytest.raises(TypeError, match=msg): - with tm.assert_produces_warning(performance_warning): - values.str.replace("a", repl, regex=True) + values.str.replace("a", repl, regex=True) -def test_replace_callable_named_groups(any_string_dtype, performance_warning): +def test_replace_callable_named_groups(any_string_dtype): # test regex named groups ser = Series(["Foo Bar Baz", np.nan], dtype=any_string_dtype) pat = r"(?P\w+) (?P\w+) (?P\w+)" repl = lambda m: m.group("middle").swapcase() - with tm.maybe_produces_warning( - performance_warning, using_pyarrow(any_string_dtype) - ): - result = ser.str.replace(pat, repl, regex=True) + result = ser.str.replace(pat, repl, regex=True) expected = Series(["bAR", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) -def test_replace_compiled_regex(any_string_dtype, performance_warning): +def test_replace_compiled_regex(any_string_dtype): # GH 15446 ser = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype) # test with compiled regex pat = re.compile(r"BAD_*") - with tm.maybe_produces_warning( - performance_warning, using_pyarrow(any_string_dtype) - ): - result = ser.str.replace(pat, "", regex=True) + result = ser.str.replace(pat, "", regex=True) expected = Series(["foobar", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) - with tm.maybe_produces_warning( - performance_warning, using_pyarrow(any_string_dtype) - ): - result = ser.str.replace(pat, "", n=1, regex=True) + result = ser.str.replace(pat, "", n=1, regex=True) expected = Series(["foobarBAD", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) @@ -557,14 +535,11 @@ def test_replace_compiled_regex_mixed_object(): tm.assert_series_equal(result, expected) -def test_replace_compiled_regex_unicode(any_string_dtype, performance_warning): +def test_replace_compiled_regex_unicode(any_string_dtype): ser = Series([b"abcd,\xc3\xa0".decode("utf-8")], dtype=any_string_dtype) expected = Series([b"abcd, \xc3\xa0".decode("utf-8")], dtype=any_string_dtype) pat = re.compile(r"(?<=\w),(?=\w)", flags=re.UNICODE) - with tm.maybe_produces_warning( - performance_warning, using_pyarrow(any_string_dtype) - ): - result = ser.str.replace(pat, ", ", regex=True) + result = ser.str.replace(pat, ", ", regex=True) tm.assert_series_equal(result, expected) @@ -586,15 +561,12 @@ def test_replace_compiled_regex_raises(any_string_dtype): ser.str.replace(pat, "", case=True, regex=True) -def test_replace_compiled_regex_callable(any_string_dtype, performance_warning): +def test_replace_compiled_regex_callable(any_string_dtype): # test with callable ser = Series(["fooBAD__barBAD", np.nan], dtype=any_string_dtype) repl = lambda m: m.group(0).swapcase() pat = re.compile("[a-z][A-Z]{2}") - with tm.maybe_produces_warning( - performance_warning, using_pyarrow(any_string_dtype) - ): - result = ser.str.replace(pat, repl, n=2, regex=True) + result = ser.str.replace(pat, repl, n=2, regex=True) expected = Series(["foObaD__baRbaD", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) @@ -626,7 +598,7 @@ def test_replace_literal_compiled_raises(any_string_dtype): ser.str.replace(pat, "", regex=False) -def test_replace_moar(any_string_dtype, performance_warning): +def test_replace_moar(any_string_dtype): # PR #1179 ser = Series( ["A", "B", "C", "Aaba", "Baca", "", np.nan, "CABA", "dog", "cat"], @@ -640,10 +612,7 @@ def test_replace_moar(any_string_dtype, performance_warning): ) 
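These test simplifications mirror the behavior change: string methods that fall back to the object code path no longer emit ``PerformanceWarning``. A quick check, assuming pandas built from this branch with pyarrow installed:

    import re
    import warnings

    import pandas as pd

    ser = pd.Series(["fooBAD", None], dtype="string[pyarrow]")
    with warnings.catch_warnings():
        warnings.simplefilter("error")  # a fallback warning would now raise
        result = ser.str.replace(re.compile("BAD"), "", regex=True)
    print(result)  # ["foo", <NA>], computed on the object path, silently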
tm.assert_series_equal(result, expected) - with tm.maybe_produces_warning( - performance_warning, using_pyarrow(any_string_dtype) - ): - result = ser.str.replace("A", "YYY", case=False) + result = ser.str.replace("A", "YYY", case=False) expected = Series( [ "YYY", @@ -661,10 +630,7 @@ def test_replace_moar(any_string_dtype, performance_warning): ) tm.assert_series_equal(result, expected) - with tm.maybe_produces_warning( - performance_warning, using_pyarrow(any_string_dtype) - ): - result = ser.str.replace("^.a|dog", "XX-XX ", case=False, regex=True) + result = ser.str.replace("^.a|dog", "XX-XX ", case=False, regex=True) expected = Series( [ "A", @@ -683,21 +649,15 @@ def test_replace_moar(any_string_dtype, performance_warning): tm.assert_series_equal(result, expected) -def test_replace_not_case_sensitive_not_regex(any_string_dtype, performance_warning): +def test_replace_not_case_sensitive_not_regex(any_string_dtype): # https://github.com/pandas-dev/pandas/issues/41602 ser = Series(["A.", "a.", "Ab", "ab", np.nan], dtype=any_string_dtype) - with tm.maybe_produces_warning( - performance_warning, using_pyarrow(any_string_dtype) - ): - result = ser.str.replace("a", "c", case=False, regex=False) + result = ser.str.replace("a", "c", case=False, regex=False) expected = Series(["c.", "c.", "cb", "cb", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) - with tm.maybe_produces_warning( - performance_warning, using_pyarrow(any_string_dtype) - ): - result = ser.str.replace("a.", "c.", case=False, regex=False) + result = ser.str.replace("a.", "c.", case=False, regex=False) expected = Series(["c.", "c.", "Ab", "ab", np.nan], dtype=any_string_dtype) tm.assert_series_equal(result, expected) @@ -853,7 +813,7 @@ def test_fullmatch_na_kwarg(any_string_dtype): tm.assert_series_equal(result, expected) -def test_fullmatch_case_kwarg(any_string_dtype, performance_warning): +def test_fullmatch_case_kwarg(any_string_dtype): ser = Series(["ab", "AB", "abc", "ABC"], dtype=any_string_dtype) expected_dtype = ( np.bool_ if is_object_or_nan_string_dtype(any_string_dtype) else "boolean" @@ -869,10 +829,7 @@ def test_fullmatch_case_kwarg(any_string_dtype, performance_warning): result = ser.str.fullmatch("ab", case=False) tm.assert_series_equal(result, expected) - with tm.maybe_produces_warning( - performance_warning, using_pyarrow(any_string_dtype) - ): - result = ser.str.fullmatch("ab", flags=re.IGNORECASE) + result = ser.str.fullmatch("ab", flags=re.IGNORECASE) tm.assert_series_equal(result, expected) @@ -1046,7 +1003,7 @@ def test_translate_mixed_object(): # -------------------------------------------------------------------------------------- -def test_flags_kwarg(any_string_dtype, performance_warning): +def test_flags_kwarg(any_string_dtype): data = { "Dave": "dave@google.com", "Steve": "steve@gmail.com", @@ -1057,17 +1014,13 @@ def test_flags_kwarg(any_string_dtype, performance_warning): pat = r"([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})" - use_pyarrow = using_pyarrow(any_string_dtype) - result = data.str.extract(pat, flags=re.IGNORECASE, expand=True) assert result.iloc[0].tolist() == ["dave", "google", "com"] - with tm.maybe_produces_warning(performance_warning, use_pyarrow): - result = data.str.match(pat, flags=re.IGNORECASE) + result = data.str.match(pat, flags=re.IGNORECASE) assert result.iloc[0] - with tm.maybe_produces_warning(performance_warning, use_pyarrow): - result = data.str.fullmatch(pat, flags=re.IGNORECASE) + result = data.str.fullmatch(pat, flags=re.IGNORECASE) assert 
result.iloc[0] result = data.str.findall(pat, flags=re.IGNORECASE) @@ -1077,8 +1030,6 @@ def test_flags_kwarg(any_string_dtype): assert result.iloc[0] == 1 msg = "has match groups" - with tm.assert_produces_warning( - UserWarning, match=msg, raise_on_extra_warnings=not use_pyarrow - ): + with tm.assert_produces_warning(UserWarning, match=msg): result = data.str.contains(pat, flags=re.IGNORECASE) assert result.iloc[0] diff --git a/pandas/tests/strings/test_string_array.py b/pandas/tests/strings/test_string_array.py index 0b3f368afea5e..517ddb164985c 100644 --- a/pandas/tests/strings/test_string_array.py +++ b/pandas/tests/strings/test_string_array.py @@ -12,7 +12,6 @@ ) -@pytest.mark.filterwarnings("ignore:Falling back") def test_string_array(nullable_string_dtype, any_string_method): method_name, args, kwargs = any_string_method From 16b7288eccdf29efcb430616e77cd701497fe8ed Mon Sep 17 00:00:00 2001 From: ammar-qazi Date: Tue, 10 Sep 2024 19:11:32 +0200 Subject: [PATCH 030/224] DOC: Add docstring for ExtensionArray interpolate (#59749) * Update docstring of ExtensionArray.interpolate * Remove ExtensionArray.interpolate from code_checks.sh * Resolving pre-commit errors * Resolving pre-commit errors 2 * Resolved ruff formatting error * Fix issues after review --- ci/code_checks.sh | 1 - pandas/core/arrays/base.py | 78 +++++++++++++++++++++++++++++++++++++- 2 files changed, 76 insertions(+), 3 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 06078d8958492..2aa256b65a493 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -116,7 +116,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Timestamp.resolution PR02" \ -i "pandas.Timestamp.tzinfo GL08" \ -i "pandas.Timestamp.year GL08" \ - -i "pandas.api.extensions.ExtensionArray.interpolate PR01,SA01" \ -i "pandas.api.types.is_dict_like PR07,SA01" \ -i "pandas.api.types.is_extension_array_dtype SA01" \ -i "pandas.api.types.is_file_like PR07,SA01" \ diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 536c7303a2f92..a933a9ce11646 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -999,16 +999,74 @@ def interpolate( **kwargs, ) -> Self: """ - See DataFrame.interpolate.__doc__. + Fill NaN values using an interpolation method. + + Parameters + ---------- + method : str, default 'linear' + Interpolation technique to use. One of: + * 'linear': Ignore the index and treat the values as equally spaced. + This is the only method supported on MultiIndexes. + * 'time': Works on daily and higher resolution data to interpolate + given length of interval. + * 'index', 'values': use the actual numerical values of the index. + * 'pad': Fill in NaNs using existing values. + * 'nearest', 'zero', 'slinear', 'quadratic', 'cubic', 'barycentric', + 'polynomial': Passed to scipy.interpolate.interp1d, whereas 'spline' + is passed to scipy.interpolate.UnivariateSpline. These methods use + the numerical values of the index. + Both 'polynomial' and 'spline' require that you also specify an + order (int), e.g. arr.interpolate(method='polynomial', order=5). + * 'krogh', 'piecewise_polynomial', 'spline', 'pchip', 'akima', + 'cubicspline': Wrappers around the SciPy interpolation methods + of similar names. See Notes. + * 'from_derivatives': Refers to scipy.interpolate.BPoly.from_derivatives. + axis : int + Axis to interpolate along. For 1-dimensional data, use 0. + index : Index + Index to use for interpolation.
+ limit : int or None + Maximum number of consecutive NaNs to fill. Must be greater than 0. + limit_direction : {'forward', 'backward', 'both'} + Consecutive NaNs will be filled in this direction. + limit_area : {'inside', 'outside'} or None + If limit is specified, consecutive NaNs will be filled with this + restriction. + * None: No fill restriction. + * 'inside': Only fill NaNs surrounded by valid values (interpolate). + * 'outside': Only fill NaNs outside valid values (extrapolate). + copy : bool + If True, a copy of the object is returned with interpolated values. + **kwargs : optional + Keyword arguments to pass on to the interpolating function. + + Returns + ------- + ExtensionArray + An ExtensionArray with interpolated values. + + See Also + -------- + Series.interpolate : Interpolate values in a Series. + DataFrame.interpolate : Interpolate values in a DataFrame. + + Notes + ----- + - All parameters must be specified as keyword arguments. + - The 'krogh', 'piecewise_polynomial', 'spline', 'pchip' and 'akima' + methods are wrappers around the respective SciPy implementations of + similar names. These use the actual numerical values of the index. Examples -------- + Interpolating values in a NumPy array: + >>> arr = pd.arrays.NumpyExtensionArray(np.array([0, 1, np.nan, 3])) >>> arr.interpolate( ... method="linear", ... limit=3, ... limit_direction="forward", - ... index=pd.Index([1, 2, 3, 4]), + ... index=pd.Index(range(len(arr))), ... fill_value=1, ... copy=False, ... axis=0, @@ -1017,6 +1075,22 @@ def interpolate( [0.0, 1.0, 2.0, 3.0] Length: 4, dtype: float64 + + Interpolating values in a FloatingArray: + + >>> arr = pd.array([1.0, pd.NA, 3.0, 4.0, pd.NA, 6.0], dtype="Float64") + >>> arr.interpolate( + ... method="linear", + ... axis=0, + ... index=pd.Index(range(len(arr))), + ... limit=None, + ... limit_direction="both", + ... limit_area=None, + ... copy=True, + ... 
) + + [1.0, 2.0, 3.0, 4.0, 5.0, 6.0] + Length: 6, dtype: Float64 """ # NB: we return type(self) even if copy=False raise NotImplementedError( From 4444e5279b2a42b927044d65cbd894abd33fa724 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 11 Sep 2024 12:40:01 -0700 Subject: [PATCH 031/224] REF (string): de-duplicate ArrowStringArray methods (#59555) --- pandas/core/arrays/_arrow_string_mixins.py | 83 ++++++++++++++++ pandas/core/arrays/arrow/array.py | 86 +---------------- pandas/core/arrays/string_arrow.py | 106 ++++----------------- 3 files changed, 103 insertions(+), 172 deletions(-) diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index 32fa5e7c383b5..aa5b28c71b12a 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -1,6 +1,7 @@ from __future__ import annotations from functools import partial +import re from typing import ( TYPE_CHECKING, Any, @@ -48,6 +49,37 @@ def _convert_int_result(self, result): def _apply_elementwise(self, func: Callable) -> list[list[Any]]: raise NotImplementedError + def _str_len(self): + result = pc.utf8_length(self._pa_array) + return self._convert_int_result(result) + + def _str_lower(self) -> Self: + return type(self)(pc.utf8_lower(self._pa_array)) + + def _str_upper(self) -> Self: + return type(self)(pc.utf8_upper(self._pa_array)) + + def _str_strip(self, to_strip=None) -> Self: + if to_strip is None: + result = pc.utf8_trim_whitespace(self._pa_array) + else: + result = pc.utf8_trim(self._pa_array, characters=to_strip) + return type(self)(result) + + def _str_lstrip(self, to_strip=None) -> Self: + if to_strip is None: + result = pc.utf8_ltrim_whitespace(self._pa_array) + else: + result = pc.utf8_ltrim(self._pa_array, characters=to_strip) + return type(self)(result) + + def _str_rstrip(self, to_strip=None) -> Self: + if to_strip is None: + result = pc.utf8_rtrim_whitespace(self._pa_array) + else: + result = pc.utf8_rtrim(self._pa_array, characters=to_strip) + return type(self)(result) + def _str_pad( self, width: int, @@ -128,6 +160,33 @@ def _str_slice_replace( stop = np.iinfo(np.int64).max return type(self)(pc.utf8_replace_slice(self._pa_array, start, stop, repl)) + def _str_replace( + self, + pat: str | re.Pattern, + repl: str | Callable, + n: int = -1, + case: bool = True, + flags: int = 0, + regex: bool = True, + ) -> Self: + if isinstance(pat, re.Pattern) or callable(repl) or not case or flags: + raise NotImplementedError( + "replace is not supported with a re.Pattern, callable repl, " + "case=False, or flags!=0" + ) + + func = pc.replace_substring_regex if regex else pc.replace_substring + # https://github.com/apache/arrow/issues/39149 + # GH 56404, unexpected behavior with negative max_replacements with pyarrow. 
+ pa_max_replacements = None if n < 0 else n + result = func( + self._pa_array, + pattern=pat, + replacement=repl, + max_replacements=pa_max_replacements, + ) + return type(self)(result) + def _str_capitalize(self) -> Self: return type(self)(pc.utf8_capitalize(self._pa_array)) @@ -137,6 +196,16 @@ def _str_title(self) -> Self: def _str_swapcase(self) -> Self: return type(self)(pc.utf8_swapcase(self._pa_array)) + def _str_removeprefix(self, prefix: str): + if not pa_version_under13p0: + starts_with = pc.starts_with(self._pa_array, pattern=prefix) + removed = pc.utf8_slice_codeunits(self._pa_array, len(prefix)) + result = pc.if_else(starts_with, removed, self._pa_array) + return type(self)(result) + predicate = lambda val: val.removeprefix(prefix) + result = self._apply_elementwise(predicate) + return type(self)(pa.chunked_array(result)) + def _str_removesuffix(self, suffix: str): ends_with = pc.ends_with(self._pa_array, pattern=suffix) removed = pc.utf8_slice_codeunits(self._pa_array, 0, stop=-len(suffix)) @@ -228,6 +297,20 @@ def _str_contains( result = result.fill_null(na) return self._convert_bool_result(result) + def _str_match( + self, pat: str, case: bool = True, flags: int = 0, na: Scalar | None = None + ): + if not pat.startswith("^"): + pat = f"^{pat}" + return self._str_contains(pat, case, flags, na, regex=True) + + def _str_fullmatch( + self, pat, case: bool = True, flags: int = 0, na: Scalar | None = None + ): + if not pat.endswith("$") or pat.endswith("\\$"): + pat = f"{pat}$" + return self._str_match(pat, case, flags, na) + def _str_find(self, sub: str, start: int = 0, end: int | None = None): if ( pa_version_under13p0 diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 41d40d8304e8f..bd94447f0cd80 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1999,7 +1999,7 @@ def _rank( """ See Series.rank.__doc__. """ - return type(self)( + return self._convert_int_result( self._rank_calc( axis=axis, method=method, @@ -2323,36 +2323,6 @@ def _str_count(self, pat: str, flags: int = 0) -> Self: raise NotImplementedError(f"count not implemented with {flags=}") return type(self)(pc.count_substring_regex(self._pa_array, pat)) - def _result_converter(self, result): - return type(self)(result) - - def _str_replace( - self, - pat: str | re.Pattern, - repl: str | Callable, - n: int = -1, - case: bool = True, - flags: int = 0, - regex: bool = True, - ) -> Self: - if isinstance(pat, re.Pattern) or callable(repl) or not case or flags: - raise NotImplementedError( - "replace is not supported with a re.Pattern, callable repl, " - "case=False, or flags!=0" - ) - - func = pc.replace_substring_regex if regex else pc.replace_substring - # https://github.com/apache/arrow/issues/39149 - # GH 56404, unexpected behavior with negative max_replacements with pyarrow. 
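The ``pa_max_replacements`` mapping above (moved into the mixin from ``ArrowExtensionArray``) encodes the GH 56404 note: pandas' ``n < 0`` convention for "replace all" is translated to pyarrow's ``max_replacements=None`` rather than forwarded, since pyarrow handles negative caps differently (apache/arrow#39149). A standalone illustration with made-up data:

    import pyarrow as pa
    import pyarrow.compute as pc

    arr = pa.chunked_array([["aaa", "aba"]])
    # max_replacements=None is pyarrow's spelling of "replace every match"
    out = pc.replace_substring(arr, pattern="a", replacement="b", max_replacements=None)
    print(out)  # ["bbb", "bbb"]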
- pa_max_replacements = None if n < 0 else n - result = func( - self._pa_array, - pattern=pat, - replacement=repl, - max_replacements=pa_max_replacements, - ) - return type(self)(result) - def _str_repeat(self, repeats: int | Sequence[int]) -> Self: if not isinstance(repeats, int): raise NotImplementedError( @@ -2360,20 +2330,6 @@ def _str_repeat(self, repeats: int | Sequence[int]) -> Self: ) return type(self)(pc.binary_repeat(self._pa_array, repeats)) - def _str_match( - self, pat: str, case: bool = True, flags: int = 0, na: Scalar | None = None - ) -> Self: - if not pat.startswith("^"): - pat = f"^{pat}" - return self._str_contains(pat, case, flags, na, regex=True) - - def _str_fullmatch( - self, pat, case: bool = True, flags: int = 0, na: Scalar | None = None - ) -> Self: - if not pat.endswith("$") or pat.endswith("\\$"): - pat = f"{pat}$" - return self._str_match(pat, case, flags, na) - def _str_join(self, sep: str) -> Self: if pa.types.is_string(self._pa_array.type) or pa.types.is_large_string( self._pa_array.type @@ -2394,46 +2350,6 @@ def _str_rpartition(self, sep: str, expand: bool) -> Self: result = self._apply_elementwise(predicate) return type(self)(pa.chunked_array(result)) - def _str_len(self) -> Self: - return type(self)(pc.utf8_length(self._pa_array)) - - def _str_lower(self) -> Self: - return type(self)(pc.utf8_lower(self._pa_array)) - - def _str_upper(self) -> Self: - return type(self)(pc.utf8_upper(self._pa_array)) - - def _str_strip(self, to_strip=None) -> Self: - if to_strip is None: - result = pc.utf8_trim_whitespace(self._pa_array) - else: - result = pc.utf8_trim(self._pa_array, characters=to_strip) - return type(self)(result) - - def _str_lstrip(self, to_strip=None) -> Self: - if to_strip is None: - result = pc.utf8_ltrim_whitespace(self._pa_array) - else: - result = pc.utf8_ltrim(self._pa_array, characters=to_strip) - return type(self)(result) - - def _str_rstrip(self, to_strip=None) -> Self: - if to_strip is None: - result = pc.utf8_rtrim_whitespace(self._pa_array) - else: - result = pc.utf8_rtrim(self._pa_array, characters=to_strip) - return type(self)(result) - - def _str_removeprefix(self, prefix: str): - if not pa_version_under13p0: - starts_with = pc.starts_with(self._pa_array, pattern=prefix) - removed = pc.utf8_slice_codeunits(self._pa_array, len(prefix)) - result = pc.if_else(starts_with, removed, self._pa_array) - return type(self)(result) - predicate = lambda val: val.removeprefix(prefix) - result = self._apply_elementwise(predicate) - return type(self)(pa.chunked_array(result)) - def _str_casefold(self) -> Self: predicate = lambda val: val.casefold() result = self._apply_elementwise(predicate) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index a669b6d669b48..f446cc5bde147 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -50,10 +50,8 @@ from pandas._typing import ( ArrayLike, - AxisInt, Dtype, NpDtype, - Scalar, Self, npt, ) @@ -290,6 +288,20 @@ def astype(self, dtype, copy: bool = True): _str_startswith = ArrowStringArrayMixin._str_startswith _str_endswith = ArrowStringArrayMixin._str_endswith _str_pad = ArrowStringArrayMixin._str_pad + _str_match = ArrowStringArrayMixin._str_match + _str_fullmatch = ArrowStringArrayMixin._str_fullmatch + _str_lower = ArrowStringArrayMixin._str_lower + _str_upper = ArrowStringArrayMixin._str_upper + _str_strip = ArrowStringArrayMixin._str_strip + _str_lstrip = ArrowStringArrayMixin._str_lstrip + _str_rstrip = 
ArrowStringArrayMixin._str_rstrip + _str_removesuffix = ArrowStringArrayMixin._str_removesuffix + _str_get = ArrowStringArrayMixin._str_get + _str_capitalize = ArrowStringArrayMixin._str_capitalize + _str_title = ArrowStringArrayMixin._str_title + _str_swapcase = ArrowStringArrayMixin._str_swapcase + _str_slice_replace = ArrowStringArrayMixin._str_slice_replace + _str_len = ArrowStringArrayMixin._str_len _str_slice = ArrowStringArrayMixin._str_slice def _str_contains( @@ -323,73 +335,21 @@ def _str_replace( if isinstance(pat, re.Pattern) or callable(repl) or not case or flags: return super()._str_replace(pat, repl, n, case, flags, regex) - return ArrowExtensionArray._str_replace(self, pat, repl, n, case, flags, regex) + return ArrowStringArrayMixin._str_replace( + self, pat, repl, n, case, flags, regex + ) def _str_repeat(self, repeats: int | Sequence[int]): if not isinstance(repeats, int): return super()._str_repeat(repeats) else: - return type(self)(pc.binary_repeat(self._pa_array, repeats)) - - def _str_match( - self, pat: str, case: bool = True, flags: int = 0, na: Scalar | None = None - ): - if not pat.startswith("^"): - pat = f"^{pat}" - return self._str_contains(pat, case, flags, na, regex=True) - - def _str_fullmatch( - self, pat, case: bool = True, flags: int = 0, na: Scalar | None = None - ): - if not pat.endswith("$") or pat.endswith("\\$"): - pat = f"{pat}$" - return self._str_match(pat, case, flags, na) - - def _str_len(self): - result = pc.utf8_length(self._pa_array) - return self._convert_int_result(result) - - def _str_lower(self) -> Self: - return type(self)(pc.utf8_lower(self._pa_array)) - - def _str_upper(self) -> Self: - return type(self)(pc.utf8_upper(self._pa_array)) - - def _str_strip(self, to_strip=None) -> Self: - if to_strip is None: - result = pc.utf8_trim_whitespace(self._pa_array) - else: - result = pc.utf8_trim(self._pa_array, characters=to_strip) - return type(self)(result) - - def _str_lstrip(self, to_strip=None) -> Self: - if to_strip is None: - result = pc.utf8_ltrim_whitespace(self._pa_array) - else: - result = pc.utf8_ltrim(self._pa_array, characters=to_strip) - return type(self)(result) - - def _str_rstrip(self, to_strip=None) -> Self: - if to_strip is None: - result = pc.utf8_rtrim_whitespace(self._pa_array) - else: - result = pc.utf8_rtrim(self._pa_array, characters=to_strip) - return type(self)(result) + return ArrowExtensionArray._str_repeat(self, repeats=repeats) def _str_removeprefix(self, prefix: str): if not pa_version_under13p0: - starts_with = pc.starts_with(self._pa_array, pattern=prefix) - removed = pc.utf8_slice_codeunits(self._pa_array, len(prefix)) - result = pc.if_else(starts_with, removed, self._pa_array) - return type(self)(result) + return ArrowStringArrayMixin._str_removeprefix(self, prefix) return super()._str_removeprefix(prefix) - def _str_removesuffix(self, suffix: str): - ends_with = pc.ends_with(self._pa_array, pattern=suffix) - removed = pc.utf8_slice_codeunits(self._pa_array, 0, stop=-len(suffix)) - result = pc.if_else(ends_with, removed, self._pa_array) - return type(self)(result) - def _str_count(self, pat: str, flags: int = 0): if flags: return super()._str_count(pat, flags) @@ -456,28 +416,6 @@ def _reduce( else: return result - def _rank( - self, - *, - axis: AxisInt = 0, - method: str = "average", - na_option: str = "keep", - ascending: bool = True, - pct: bool = False, - ): - """ - See Series.rank.__doc__. 
- """ - return self._convert_int_result( - self._rank_calc( - axis=axis, - method=method, - na_option=na_option, - ascending=ascending, - pct=pct, - ) - ) - def value_counts(self, dropna: bool = True) -> Series: result = super().value_counts(dropna=dropna) if self.dtype.na_value is np.nan: @@ -499,9 +437,3 @@ def _cmp_method(self, other, op): class ArrowStringArrayNumpySemantics(ArrowStringArray): _na_value = np.nan - _str_get = ArrowStringArrayMixin._str_get - _str_removesuffix = ArrowStringArrayMixin._str_removesuffix - _str_capitalize = ArrowStringArrayMixin._str_capitalize - _str_title = ArrowStringArrayMixin._str_title - _str_swapcase = ArrowStringArrayMixin._str_swapcase - _str_slice_replace = ArrowStringArrayMixin._str_slice_replace From 7acb9659afafbe308d2d78345021487aa7f2f73f Mon Sep 17 00:00:00 2001 From: sshu2017 <66704517+sshu2017@users.noreply.github.com> Date: Wed, 11 Sep 2024 18:15:52 -0700 Subject: [PATCH 032/224] Fix/na_values_GH59303 (#59755) * fixed GH#59303 * pre-commit done * updated v3.0.0.rst * sort my entry in v3.0.0.rst * changes based on comments on PR * reformat long lines * reformat test_na_values.py * reformat test_na_values.py again --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/io/parsers/readers.py | 2 +- pandas/tests/io/parser/test_na_values.py | 18 ++++++++++++++++++ 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 819318e119668..89a1c388b3ba1 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -627,6 +627,7 @@ I/O - Bug in :meth:`read_csv` causing segmentation fault when ``encoding_errors`` is not a string. (:issue:`59059`) - Bug in :meth:`read_csv` raising ``TypeError`` when ``index_col`` is specified and ``na_values`` is a dict containing the key ``None``. (:issue:`57547`) - Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`) +- Bug in :meth:`read_csv` where the order of the ``na_values`` makes an inconsistency when ``na_values`` is a list non-string values. (:issue:`59303`) - Bug in :meth:`read_excel` raising ``ValueError`` when passing array of boolean values when ``dtype="boolean"``. (:issue:`58159`) - Bug in :meth:`read_json` not validating the ``typ`` argument to not be exactly ``"frame"`` or ``"series"`` (:issue:`59124`) - Bug in :meth:`read_stata` raising ``KeyError`` when input file is stored in big-endian format and contains strL data. 
(:issue:`58638`) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 2916e4d98cce4..ffc2690a5efdf 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -1648,7 +1648,7 @@ def _clean_na_values(na_values, keep_default_na: bool = True, floatify: bool = T if keep_default_na: v = set(v) | STR_NA_VALUES - na_values[k] = v + na_values[k] = _stringify_na_values(v, floatify) na_fvalues = {k: _floatify_na_values(v) for k, v in na_values.items()} else: if not is_list_like(na_values): diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py index 360a5feebe073..b612e60c959b1 100644 --- a/pandas/tests/io/parser/test_na_values.py +++ b/pandas/tests/io/parser/test_na_values.py @@ -812,3 +812,21 @@ def test_bool_and_nan_to_float(all_parsers): result = parser.read_csv(StringIO(data), dtype="float") expected = DataFrame.from_dict({"0": [np.nan, 1.0, 0.0]}) tm.assert_frame_equal(result, expected) + + +@xfail_pyarrow +@pytest.mark.parametrize( + "na_values", + [[-99.0, -99], [-99, -99.0]], +) +def test_na_values_dict_without_dtype(all_parsers, na_values): + parser = all_parsers + data = """A +-99 +-99 +-99.0 +-99.0""" + + result = parser.read_csv(StringIO(data), na_values=na_values) + expected = DataFrame({"A": [np.nan, np.nan, np.nan, np.nan]}) + tm.assert_frame_equal(result, expected) From 2a3cf8300b183f4230cc9dd4911604e454134450 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 12 Sep 2024 08:39:37 +0200 Subject: [PATCH 033/224] BUG: avoid triggering numpy deprecation warning in assert functions for nested array with empty array/list (#59778) --- pandas/_libs/lib.pyx | 2 ++ pandas/tests/dtypes/test_missing.py | 12 +----------- pandas/tests/series/methods/test_equals.py | 12 +----------- pandas/tests/util/test_assert_almost_equal.py | 4 ++++ 4 files changed, 8 insertions(+), 22 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 75f58f565dd6f..3f2dfbfb3b404 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -600,6 +600,8 @@ def array_equivalent_object(ndarray left, ndarray right) -> bool: if not array_equivalent(x, y): return False + elif PyArray_Check(x) or PyArray_Check(y): + return False elif (x is C_NA) ^ (y is C_NA): return False elif not ( diff --git a/pandas/tests/dtypes/test_missing.py b/pandas/tests/dtypes/test_missing.py index f86ed6f49759f..73c462d492d2d 100644 --- a/pandas/tests/dtypes/test_missing.py +++ b/pandas/tests/dtypes/test_missing.py @@ -1,4 +1,3 @@ -from contextlib import nullcontext from datetime import datetime from decimal import Decimal @@ -7,7 +6,6 @@ from pandas._libs import missing as libmissing from pandas._libs.tslibs import iNaT -from pandas.compat.numpy import np_version_gte1p25 from pandas.core.dtypes.common import ( is_float, @@ -458,15 +456,7 @@ def test_array_equivalent_dti(dtype_equal): ) def test_array_equivalent_series(val): arr = np.array([1, 2]) - msg = "elementwise comparison failed" - cm = ( - # stacklevel is chosen to make sense when called from .equals - tm.assert_produces_warning(FutureWarning, match=msg, check_stacklevel=False) - if isinstance(val, str) and not np_version_gte1p25 - else nullcontext() - ) - with cm: - assert not array_equivalent(Series([arr, arr]), Series([arr, val])) + assert not array_equivalent(Series([arr, arr]), Series([arr, val])) def test_array_equivalent_array_mismatched_shape(): diff --git a/pandas/tests/series/methods/test_equals.py b/pandas/tests/series/methods/test_equals.py index 
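An aside on the ``array_equivalent_object`` change in this patch: the new ``PyArray_Check`` branch reports inequality directly when exactly one side of an element-wise comparison is an ndarray. A minimal sketch of the behavior, mirroring the simplified ``test_equals_list_array`` case above (sample values are illustrative):

```python
import numpy as np
import pandas as pd

arr = np.array([1, 2])
s1 = pd.Series([arr, arr])  # object dtype holding nested ndarrays
s2 = s1.copy()
s1[1] = "a"  # replace one nested array with a non-array value
# Comparing an ndarray element against a string used to fall into an
# elementwise comparison that tripped a NumPy deprecation warning on
# older NumPy; it now simply compares unequal.
assert not s1.equals(s2)
```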
b94723b7cbddf..0c52eacd7e516 100644 --- a/pandas/tests/series/methods/test_equals.py +++ b/pandas/tests/series/methods/test_equals.py @@ -1,11 +1,9 @@ -from contextlib import nullcontext import copy import numpy as np import pytest from pandas._libs.missing import is_matching_na -from pandas.compat.numpy import np_version_gte1p25 from pandas.core.dtypes.common import is_float @@ -14,7 +12,6 @@ MultiIndex, Series, ) -import pandas._testing as tm @pytest.mark.parametrize( @@ -48,14 +45,7 @@ def test_equals_list_array(val): assert s1.equals(s2) s1[1] = val - - cm = ( - tm.assert_produces_warning(FutureWarning, check_stacklevel=False) - if isinstance(val, str) and not np_version_gte1p25 - else nullcontext() - ) - with cm: - assert not s1.equals(s2) + assert not s1.equals(s2) def test_equals_false_negative(): diff --git a/pandas/tests/util/test_assert_almost_equal.py b/pandas/tests/util/test_assert_almost_equal.py index bcc2e4e03f367..091670ed69f11 100644 --- a/pandas/tests/util/test_assert_almost_equal.py +++ b/pandas/tests/util/test_assert_almost_equal.py @@ -534,6 +534,10 @@ def test_assert_almost_equal_iterable_values_mismatch(): np.array([np.array([1, 2, 3]), np.array([4, 5])], dtype=object), np.array([[1, 2, 3], [4, 5]], dtype=object), ), + ( + np.array([np.array([], dtype=object), None], dtype=object), + np.array([[], None], dtype=object), + ), ( np.array( [ From 5927bd8c66f126897d97d03865e1526a0072f6f4 Mon Sep 17 00:00:00 2001 From: ktseng4096 <32848825+ktseng4096@users.noreply.github.com> Date: Thu, 12 Sep 2024 14:07:29 -0700 Subject: [PATCH 034/224] DOC: Update GroupBy docstrings with See Also requirements (#59748) * update groupby docstrings * fix function name --- ci/code_checks.sh | 6 ------ pandas/core/groupby/generic.py | 2 ++ pandas/core/groupby/groupby.py | 9 +++++++++ 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 2aa256b65a493..c2ab1e6b62352 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -153,14 +153,11 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.core.groupby.DataFrameGroupBy.groups SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.hist RT03" \ -i "pandas.core.groupby.DataFrameGroupBy.indices SA01" \ - -i "pandas.core.groupby.DataFrameGroupBy.max SA01" \ - -i "pandas.core.groupby.DataFrameGroupBy.min SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.nth PR02" \ -i "pandas.core.groupby.DataFrameGroupBy.nunique SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.ohlc SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.plot PR02" \ -i "pandas.core.groupby.DataFrameGroupBy.sem SA01" \ - -i "pandas.core.groupby.DataFrameGroupBy.sum SA01" \ -i "pandas.core.groupby.SeriesGroupBy.__iter__ RT03,SA01" \ -i "pandas.core.groupby.SeriesGroupBy.agg RT03" \ -i "pandas.core.groupby.SeriesGroupBy.aggregate RT03" \ @@ -169,13 +166,10 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.core.groupby.SeriesGroupBy.indices SA01" \ -i "pandas.core.groupby.SeriesGroupBy.is_monotonic_decreasing SA01" \ -i "pandas.core.groupby.SeriesGroupBy.is_monotonic_increasing SA01" \ - -i "pandas.core.groupby.SeriesGroupBy.max SA01" \ - -i "pandas.core.groupby.SeriesGroupBy.min SA01" \ -i "pandas.core.groupby.SeriesGroupBy.nth PR02" \ -i "pandas.core.groupby.SeriesGroupBy.ohlc SA01" \ -i "pandas.core.groupby.SeriesGroupBy.plot PR02" \ -i "pandas.core.groupby.SeriesGroupBy.sem SA01" \ - -i "pandas.core.groupby.SeriesGroupBy.sum SA01" \ -i "pandas.core.resample.Resampler.__iter__ RT03,SA01" \ -i 
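Looking back at the ``na_values`` change in patch 032 above: with the list entries stringified up front in ``_clean_na_values``, a list of numeric NA markers now behaves the same regardless of ordering. A short sketch mirroring the new parser test (the data is illustrative):

```python
from io import StringIO

import pandas as pd

data = "A\n-99\n-99\n-99.0\n-99.0"
# Both orderings now flag every row as missing; previously the result
# could depend on whether -99 or -99.0 came first in the list.
for na_values in ([-99.0, -99], [-99, -99.0]):
    result = pd.read_csv(StringIO(data), na_values=na_values)
    assert result["A"].isna().all()
```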
"pandas.core.resample.Resampler.ffill RT03" \ -i "pandas.core.resample.Resampler.get_group RT03,SA01" \ diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 230f61bab96df..eae33ddc1df29 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -615,6 +615,7 @@ def filter(self, func, dropna: bool = True, *args, **kwargs): See Also -------- + Series.filter: Filter elements of ungrouped Series. DataFrameGroupBy.filter : Filter elements from groups base on criterion. Notes @@ -1963,6 +1964,7 @@ def filter(self, func, dropna: bool = True, *args, **kwargs) -> DataFrame: See Also -------- + DataFrame.filter: Filter elements of ungrouped DataFrame. SeriesGroupBy.filter : Filter elements from groups base on criterion. Notes diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 79fe78b7e5405..38dad446b4c39 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -199,6 +199,15 @@ class providing the base-class of operations. Series or DataFrame Computed {fname} of values within each group. +See Also +-------- +SeriesGroupBy.min : Return the min of the group values. +DataFrameGroupBy.min : Return the min of the group values. +SeriesGroupBy.max : Return the max of the group values. +DataFrameGroupBy.max : Return the max of the group values. +SeriesGroupBy.sum : Return the sum of the group values. +DataFrameGroupBy.sum : Return the sum of the group values. + Examples -------- {example} From 2c49f555a004a86a2065525b1f424d1b17208b87 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 12 Sep 2024 23:08:34 +0200 Subject: [PATCH 035/224] BUG/API (string dtype): return float dtype for series[str].rank() (#59768) * BUG/API (string dtype): return float dtype for series[str].rank() * update frame tests * add whatsnew * correct whatsnew note --- doc/source/whatsnew/v2.3.0.rst | 1 + pandas/core/arrays/arrow/array.py | 5 +- pandas/core/arrays/string_arrow.py | 11 ++++ pandas/tests/frame/methods/test_rank.py | 23 ++------ pandas/tests/series/methods/test_rank.py | 72 ++++++++++++++++++------ 5 files changed, 76 insertions(+), 36 deletions(-) diff --git a/doc/source/whatsnew/v2.3.0.rst b/doc/source/whatsnew/v2.3.0.rst index 03b3a6b55dff6..01c2ed3821d7a 100644 --- a/doc/source/whatsnew/v2.3.0.rst +++ b/doc/source/whatsnew/v2.3.0.rst @@ -102,6 +102,7 @@ Conversion Strings ^^^^^^^ +- Bug in :meth:`Series.rank` for :class:`StringDtype` with ``storage="pyarrow"`` incorrectly returning integer results in case of ``method="average"`` and raising an error if it would truncate results (:issue:`59768`) - Bug in :meth:`Series.str.replace` when ``n < 0`` for :class:`StringDtype` with ``storage="pyarrow"`` (:issue:`59628`) - Bug in ``ser.str.slice`` with negative ``step`` with :class:`ArrowDtype` and :class:`StringDtype` with ``storage="pyarrow"`` giving incorrect results (:issue:`59710`) - Bug in the ``center`` method on :class:`Series` and :class:`Index` object ``str`` accessors with pyarrow-backed dtype not matching the python behavior in corner cases with an odd number of fill characters (:issue:`54792`) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index bd94447f0cd80..39cae5b8e2683 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1999,7 +1999,7 @@ def _rank( """ See Series.rank.__doc__. 
""" - return self._convert_int_result( + return self._convert_rank_result( self._rank_calc( axis=axis, method=method, @@ -2318,6 +2318,9 @@ def _convert_bool_result(self, result): def _convert_int_result(self, result): return type(self)(result) + def _convert_rank_result(self, result): + return type(self)(result) + def _str_count(self, pat: str, flags: int = 0) -> Self: if flags: raise NotImplementedError(f"count not implemented with {flags=}") diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index f446cc5bde147..75bb1f8fb1a65 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -29,6 +29,7 @@ from pandas.core.arrays._arrow_string_mixins import ArrowStringArrayMixin from pandas.core.arrays.arrow import ArrowExtensionArray from pandas.core.arrays.boolean import BooleanDtype +from pandas.core.arrays.floating import Float64Dtype from pandas.core.arrays.integer import Int64Dtype from pandas.core.arrays.numeric import NumericDtype from pandas.core.arrays.string_ import ( @@ -395,6 +396,16 @@ def _convert_int_result(self, result): return Int64Dtype().__from_arrow__(result) + def _convert_rank_result(self, result): + if self.dtype.na_value is np.nan: + if isinstance(result, pa.Array): + result = result.to_numpy(zero_copy_only=False) + else: + result = result.to_numpy() + return result.astype("float64", copy=False) + + return Float64Dtype().__from_arrow__(result) + def _reduce( self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs ): diff --git a/pandas/tests/frame/methods/test_rank.py b/pandas/tests/frame/methods/test_rank.py index c1cdeaa6c10dd..6c6c208ee0c78 100644 --- a/pandas/tests/frame/methods/test_rank.py +++ b/pandas/tests/frame/methods/test_rank.py @@ -6,15 +6,11 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas._libs.algos import ( Infinity, NegInfinity, ) -from pandas.compat import HAS_PYARROW -import pandas as pd from pandas import ( DataFrame, Index, @@ -467,23 +463,10 @@ def test_rank_inf_nans_na_option( ("top", False, [2.0, 3.0, 1.0, 4.0]), ], ) - def test_rank_object_first( - self, - request, - frame_or_series, - na_option, - ascending, - expected, - using_infer_string, - ): + def test_rank_object_first(self, frame_or_series, na_option, ascending, expected): obj = frame_or_series(["foo", "foo", None, "foo"]) - if using_string_dtype() and not HAS_PYARROW and isinstance(obj, Series): - request.applymarker(pytest.mark.xfail(reason="TODO(infer_string)")) - result = obj.rank(method="first", na_option=na_option, ascending=ascending) expected = frame_or_series(expected) - if using_infer_string and isinstance(obj, Series): - expected = expected.astype("uint64") tm.assert_equal(result, expected) @pytest.mark.parametrize( @@ -507,7 +490,9 @@ def test_rank_string_dtype(self, string_dtype_no_object): # GH#55362 obj = Series(["foo", "foo", None, "foo"], dtype=string_dtype_no_object) result = obj.rank(method="first") - exp_dtype = "Int64" if string_dtype_no_object.na_value is pd.NA else "float64" + exp_dtype = ( + "Float64" if string_dtype_no_object == "string[pyarrow]" else "float64" + ) if string_dtype_no_object.storage == "python": # TODO nullable string[python] should also return nullable Int64 exp_dtype = "float64" diff --git a/pandas/tests/series/methods/test_rank.py b/pandas/tests/series/methods/test_rank.py index 2d7fde130ce70..7c6a7893ba3a0 100644 --- a/pandas/tests/series/methods/test_rank.py +++ b/pandas/tests/series/methods/test_rank.py @@ 
-33,7 +33,8 @@ def ser(): ["max", np.array([2, 6, 7, 4, np.nan, 4, 2, 8, np.nan, 6])], ["first", np.array([1, 5, 7, 3, np.nan, 4, 2, 8, np.nan, 6])], ["dense", np.array([1, 3, 4, 2, np.nan, 2, 1, 5, np.nan, 3])], - ] + ], + ids=lambda x: x[0], ) def results(request): return request.param @@ -48,12 +49,29 @@ def results(request): "Int64", pytest.param("float64[pyarrow]", marks=td.skip_if_no("pyarrow")), pytest.param("int64[pyarrow]", marks=td.skip_if_no("pyarrow")), + pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")), + "string[python]", + "str", ] ) def dtype(request): return request.param +def expected_dtype(dtype, method, pct=False): + exp_dtype = "float64" + # elif dtype in ["Int64", "Float64", "string[pyarrow]", "string[python]"]: + if dtype in ["string[pyarrow]"]: + exp_dtype = "Float64" + elif dtype in ["float64[pyarrow]", "int64[pyarrow]"]: + if method == "average" or pct: + exp_dtype = "double[pyarrow]" + else: + exp_dtype = "uint64[pyarrow]" + + return exp_dtype + + class TestSeriesRank: def test_rank(self, datetime_series): sp_stats = pytest.importorskip("scipy.stats") @@ -251,12 +269,14 @@ def test_rank_signature(self): with pytest.raises(ValueError, match=msg): s.rank("average") - @pytest.mark.parametrize("dtype", [None, object]) - def test_rank_tie_methods(self, ser, results, dtype): + def test_rank_tie_methods(self, ser, results, dtype, using_infer_string): method, exp = results + if dtype == "int64" or (not using_infer_string and dtype == "str"): + pytest.skip("int64/str does not support NaN") + ser = ser if dtype is None else ser.astype(dtype) result = ser.rank(method=method) - tm.assert_series_equal(result, Series(exp)) + tm.assert_series_equal(result, Series(exp, dtype=expected_dtype(dtype, method))) @pytest.mark.parametrize("na_option", ["top", "bottom", "keep"]) @pytest.mark.parametrize( @@ -357,25 +377,35 @@ def test_rank_methods_series(self, rank_method, op, value): ], ) def test_rank_dense_method(self, dtype, ser, exp): + if ser[0] < 0 and dtype.startswith("str"): + exp = exp[::-1] s = Series(ser).astype(dtype) result = s.rank(method="dense") - expected = Series(exp).astype(result.dtype) + expected = Series(exp).astype(expected_dtype(dtype, "dense")) tm.assert_series_equal(result, expected) - def test_rank_descending(self, ser, results, dtype): + def test_rank_descending(self, ser, results, dtype, using_infer_string): method, _ = results - if "i" in dtype: + if dtype == "int64" or (not using_infer_string and dtype == "str"): s = ser.dropna() else: s = ser.astype(dtype) res = s.rank(ascending=False) - expected = (s.max() - s).rank() - tm.assert_series_equal(res, expected) + if dtype.startswith("str"): + expected = (s.astype("float64").max() - s.astype("float64")).rank() + else: + expected = (s.max() - s).rank() + tm.assert_series_equal(res, expected.astype(expected_dtype(dtype, "average"))) - expected = (s.max() - s).rank(method=method) + if dtype.startswith("str"): + expected = (s.astype("float64").max() - s.astype("float64")).rank( + method=method + ) + else: + expected = (s.max() - s).rank(method=method) res2 = s.rank(method=method, ascending=False) - tm.assert_series_equal(res2, expected) + tm.assert_series_equal(res2, expected.astype(expected_dtype(dtype, method))) def test_rank_int(self, ser, results): method, exp = results @@ -432,9 +462,11 @@ def test_rank_ea_small_values(self): ], ) def test_rank_dense_pct(dtype, ser, exp): + if ser[0] < 0 and dtype.startswith("str"): + exp = exp[::-1] s = Series(ser).astype(dtype) result = 
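To make the intent of the ``expected_dtype`` helper above concrete: ranking a pyarrow-backed string Series now produces a nullable float result. A brief sketch (requires pyarrow; the values are illustrative):

```python
import pandas as pd

s = pd.Series(["foo", "foo", None, "foo"], dtype="string[pyarrow]")
result = s.rank(method="first")
# After this patch the result dtype is Float64 rather than an integer
# dtype, consistent with rank() on other dtypes.
print(result.dtype)     # Float64
print(result.tolist())  # [1.0, 2.0, <NA>, 3.0]
```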
s.rank(method="dense", pct=True) - expected = Series(exp).astype(result.dtype) + expected = Series(exp).astype(expected_dtype(dtype, "dense", pct=True)) tm.assert_series_equal(result, expected) @@ -453,9 +485,11 @@ def test_rank_dense_pct(dtype, ser, exp): ], ) def test_rank_min_pct(dtype, ser, exp): + if ser[0] < 0 and dtype.startswith("str"): + exp = exp[::-1] s = Series(ser).astype(dtype) result = s.rank(method="min", pct=True) - expected = Series(exp).astype(result.dtype) + expected = Series(exp).astype(expected_dtype(dtype, "min", pct=True)) tm.assert_series_equal(result, expected) @@ -474,9 +508,11 @@ def test_rank_min_pct(dtype, ser, exp): ], ) def test_rank_max_pct(dtype, ser, exp): + if ser[0] < 0 and dtype.startswith("str"): + exp = exp[::-1] s = Series(ser).astype(dtype) result = s.rank(method="max", pct=True) - expected = Series(exp).astype(result.dtype) + expected = Series(exp).astype(expected_dtype(dtype, "max", pct=True)) tm.assert_series_equal(result, expected) @@ -495,9 +531,11 @@ def test_rank_max_pct(dtype, ser, exp): ], ) def test_rank_average_pct(dtype, ser, exp): + if ser[0] < 0 and dtype.startswith("str"): + exp = exp[::-1] s = Series(ser).astype(dtype) result = s.rank(method="average", pct=True) - expected = Series(exp).astype(result.dtype) + expected = Series(exp).astype(expected_dtype(dtype, "average", pct=True)) tm.assert_series_equal(result, expected) @@ -516,9 +554,11 @@ def test_rank_average_pct(dtype, ser, exp): ], ) def test_rank_first_pct(dtype, ser, exp): + if ser[0] < 0 and dtype.startswith("str"): + exp = exp[::-1] s = Series(ser).astype(dtype) result = s.rank(method="first", pct=True) - expected = Series(exp).astype(result.dtype) + expected = Series(exp).astype(expected_dtype(dtype, "first", pct=True)) tm.assert_series_equal(result, expected) From 0d2505dca9c34b666155c1483d592877206081aa Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 12 Sep 2024 23:11:52 +0200 Subject: [PATCH 036/224] String dtype: fix isin() values handling for python storage (#59759) * String dtype: fix isin() values handling for python storage * address feedback --- pandas/conftest.py | 9 ++++- pandas/core/arrays/string_.py | 20 +++++++++++ pandas/tests/arrays/string_/test_string.py | 41 +++++++++++++++++++--- 3 files changed, 64 insertions(+), 6 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 222aefb4afda8..e2db9260ac37d 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1338,7 +1338,13 @@ def string_storage(request): pytest.param(("pyarrow", pd.NA), marks=td.skip_if_no("pyarrow")), pytest.param(("pyarrow", np.nan), marks=td.skip_if_no("pyarrow")), ("python", np.nan), - ] + ], + ids=[ + "string=string[python]", + "string=string[pyarrow]", + "string=str[pyarrow]", + "string=str[python]", + ], ) def string_dtype_arguments(request): """ @@ -1369,6 +1375,7 @@ def dtype_backend(request): # Alias so we can test with cartesian product of string_storage string_storage2 = string_storage +string_dtype_arguments2 = string_dtype_arguments @pytest.fixture(params=tm.BYTES_DTYPES) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index a46475a7d1ec2..b3aa782341c77 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -46,6 +46,7 @@ nanops, ops, ) +from pandas.core.algorithms import isin from pandas.core.array_algos import masked_reductions from pandas.core.arrays.base import ExtensionArray from pandas.core.arrays.floating import ( @@ -65,6 +66,7 @@ import pyarrow from pandas._typing import ( + 
ArrayLike, AxisInt, Dtype, DtypeObj, @@ -735,6 +737,24 @@ def _putmask(self, mask: npt.NDArray[np.bool_], value) -> None: # base class implementation that uses __setitem__ ExtensionArray._putmask(self, mask, value) + def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]: + if isinstance(values, BaseStringArray) or ( + isinstance(values, ExtensionArray) and is_string_dtype(values.dtype) + ): + values = values.astype(self.dtype, copy=False) + else: + if not lib.is_string_array(np.asarray(values), skipna=True): + values = np.array( + [val for val in values if isinstance(val, str) or isna(val)], + dtype=object, + ) + if not len(values): + return np.zeros(self.shape, dtype=bool) + + values = self._from_sequence(values, dtype=self.dtype) + + return isin(np.asarray(self), np.asarray(values)) + def astype(self, dtype, copy: bool = True): dtype = pandas_dtype(dtype) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 87bd1d5921caa..33708be497f31 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -30,6 +30,12 @@ def dtype(string_dtype_arguments): return pd.StringDtype(storage=storage, na_value=na_value) +@pytest.fixture +def dtype2(string_dtype_arguments2): + storage, na_value = string_dtype_arguments2 + return pd.StringDtype(storage=storage, na_value=na_value) + + @pytest.fixture def cls(dtype): """Fixture giving array type from parametrized 'dtype'""" @@ -662,11 +668,7 @@ def test_isin(dtype, fixed_now_ts): tm.assert_series_equal(result, expected) result = s.isin(["a", pd.NA]) - if dtype.storage == "python" and dtype.na_value is np.nan: - # TODO(infer_string) we should make this consistent - expected = pd.Series([True, False, False]) - else: - expected = pd.Series([True, False, True]) + expected = pd.Series([True, False, True]) tm.assert_series_equal(result, expected) result = s.isin([]) @@ -677,6 +679,35 @@ def test_isin(dtype, fixed_now_ts): expected = pd.Series([True, False, False]) tm.assert_series_equal(result, expected) + result = s.isin([fixed_now_ts]) + expected = pd.Series([False, False, False]) + tm.assert_series_equal(result, expected) + + +def test_isin_string_array(dtype, dtype2): + s = pd.Series(["a", "b", None], dtype=dtype) + + result = s.isin(pd.array(["a", "c"], dtype=dtype2)) + expected = pd.Series([True, False, False]) + tm.assert_series_equal(result, expected) + + result = s.isin(pd.array(["a", None], dtype=dtype2)) + expected = pd.Series([True, False, True]) + tm.assert_series_equal(result, expected) + + +def test_isin_arrow_string_array(dtype): + pa = pytest.importorskip("pyarrow") + s = pd.Series(["a", "b", None], dtype=dtype) + + result = s.isin(pd.array(["a", "c"], dtype=pd.ArrowDtype(pa.string()))) + expected = pd.Series([True, False, False]) + tm.assert_series_equal(result, expected) + + result = s.isin(pd.array(["a", None], dtype=pd.ArrowDtype(pa.string()))) + expected = pd.Series([True, False, True]) + tm.assert_series_equal(result, expected) + def test_setitem_scalar_with_mask_validation(dtype): # https://github.com/pandas-dev/pandas/issues/47628 From 73c4fce2fe8c8893b1d370ce04211c59c8182d61 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Fri, 13 Sep 2024 23:55:51 +0530 Subject: [PATCH 037/224] DOC: fix SA01 for pandas.NA (#59787) --- ci/code_checks.sh | 1 - pandas/_libs/missing.pyx | 8 ++++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index c2ab1e6b62352..7ad29b3a2a1f3 100755 --- 
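The ``isin`` override added above filters the ``values`` argument down to strings and NA before matching. A short sketch of the resulting behavior (the values are illustrative):

```python
import pandas as pd

s = pd.Series(["a", "b", None], dtype="string[python]")
# NA in the values list now matches missing entries for python storage
# as well, consistent with the pyarrow-backed dtype.
print(s.isin(["a", pd.NA]).tolist())  # [True, False, True]
# Non-string values are simply ignored rather than raising.
print(s.isin(["a", 1]).tolist())      # [True, False, False]
```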
a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -70,7 +70,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then --format=actions \ -i ES01 `# For now it is ok if docstrings are missing the extended summary` \ -i "pandas.Series.dt PR01" `# Accessors are implemented as classes, but we do not document the Parameters section` \ - -i "pandas.NA SA01" \ -i "pandas.Period.freq GL08" \ -i "pandas.Period.ordinal GL08" \ -i "pandas.PeriodDtype.freq SA01" \ diff --git a/pandas/_libs/missing.pyx b/pandas/_libs/missing.pyx index 2f44128cda822..390a527c22bbb 100644 --- a/pandas/_libs/missing.pyx +++ b/pandas/_libs/missing.pyx @@ -347,6 +347,14 @@ class NAType(C_NAType): The NA singleton is a missing value indicator defined by pandas. It is used in certain new extension dtypes (currently the "string" dtype). + See Also + -------- + numpy.nan : Floating point representation of Not a Number (NaN) for numerical data. + isna : Detect missing values for an array-like object. + notna : Detect non-missing values for an array-like object. + DataFrame.fillna : Fill missing values in a DataFrame. + Series.fillna : Fill missing values in a Series. + Examples -------- >>> pd.NA From a71df34cb841d5aefb94458767d6987caf02ae67 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Sat, 14 Sep 2024 22:21:08 +0530 Subject: [PATCH 038/224] DOC: fix SA01,ES01 for pandas.Timedelta.components (#59799) --- ci/code_checks.sh | 1 - pandas/_libs/tslibs/timedeltas.pyx | 11 +++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 7ad29b3a2a1f3..fd42fa70a6f7c 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -102,7 +102,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Series.sparse.from_coo PR07,SA01" \ -i "pandas.Series.sparse.npoints SA01" \ -i "pandas.Series.sparse.sp_values SA01" \ - -i "pandas.Timedelta.components SA01" \ -i "pandas.Timedelta.max PR02" \ -i "pandas.Timedelta.min PR02" \ -i "pandas.Timedelta.resolution PR02" \ diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index 4f90f26cf31ab..6159bd0dadb47 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -1493,6 +1493,17 @@ cdef class _Timedelta(timedelta): """ Return a components namedtuple-like. + Each component represents a different time unit, allowing you to access the + breakdown of the total duration in terms of days, hours, minutes, seconds, + milliseconds, microseconds, and nanoseconds. + + See Also + -------- + Timedelta.total_seconds : Returns the total duration of the Timedelta in + seconds. + to_timedelta : Convert argument to Timedelta. + Timedelta : Represents a duration, the difference between two dates or times. + Examples -------- >>> td = pd.Timedelta('2 day 4 min 3 us 42 ns') From 695dbde594f6d7eef732340f57b4439f6661e74a Mon Sep 17 00:00:00 2001 From: ammar-qazi Date: Sat, 14 Sep 2024 18:53:36 +0200 Subject: [PATCH 039/224] Update ExtensionArray.interpolate to remove outdated method of pad (#59798) --- pandas/core/arrays/base.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index a933a9ce11646..5f2c2a7772f78 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1010,7 +1010,6 @@ def interpolate( * 'time': Works on daily and higher resolution data to interpolate given length of interval. * 'index', 'values': use the actual numerical values of the index. - * 'pad': Fill in NaNs using existing values. 
* 'nearest', 'zero', 'slinear', 'quadratic', 'cubic', 'barycentric', 'polynomial': Passed to scipy.interpolate.interp1d, whereas 'spline' is passed to scipy.interpolate.UnivariateSpline. These methods use From e3bcd10d7dedd71a70a5229ce2b53c543feb63c5 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Sat, 14 Sep 2024 22:24:30 +0530 Subject: [PATCH 040/224] DOC: fix SA01,ES01 for pandas.PeriodDtype.freq (#59796) --- ci/code_checks.sh | 1 - pandas/core/dtypes/dtypes.py | 14 ++++++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index fd42fa70a6f7c..73b389e427648 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -72,7 +72,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Series.dt PR01" `# Accessors are implemented as classes, but we do not document the Parameters section` \ -i "pandas.Period.freq GL08" \ -i "pandas.Period.ordinal GL08" \ - -i "pandas.PeriodDtype.freq SA01" \ -i "pandas.RangeIndex.from_range PR01,SA01" \ -i "pandas.RangeIndex.step SA01" \ -i "pandas.Series.cat.add_categories PR01,PR02" \ diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 68b4807961d19..bb6610c514375 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -1065,6 +1065,20 @@ def freq(self) -> BaseOffset: """ The frequency object of this PeriodDtype. + The `freq` property returns the `BaseOffset` object that represents the + frequency of the PeriodDtype. This frequency specifies the interval (e.g., + daily, monthly, yearly) associated with the Period type. It is essential + for operations that depend on time-based calculations within a period index + or series. + + See Also + -------- + Period : Represents a period of time. + PeriodIndex : Immutable ndarray holding ordinal values indicating + regular periods. + PeriodDtype : An ExtensionDtype for Period data. + date_range : Return a fixed frequency range of dates. + Examples -------- >>> dtype = pd.PeriodDtype(freq="D") From e215121f71a59ba44b614f1962a960b8415864ad Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Sun, 15 Sep 2024 22:45:39 +0530 Subject: [PATCH 041/224] DOC: fix SA01,ES01 for pandas.Timedelta.total_seconds (#59800) * DOC: fix SA01,ES01 for pandas.Timedelta.total_seconds * DOC: fix SA01,ES01 for pandas.Timedelta.total_seconds --- ci/code_checks.sh | 1 - pandas/_libs/tslibs/nattype.pyx | 8 ++++++++ pandas/_libs/tslibs/timedeltas.pyx | 8 ++++++++ 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 73b389e427648..606ede4e861fa 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -105,7 +105,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Timedelta.min PR02" \ -i "pandas.Timedelta.resolution PR02" \ -i "pandas.Timedelta.to_timedelta64 SA01" \ - -i "pandas.Timedelta.total_seconds SA01" \ -i "pandas.TimedeltaIndex.to_pytimedelta RT03,SA01" \ -i "pandas.Timestamp.max PR02" \ -i "pandas.Timestamp.min PR02" \ diff --git a/pandas/_libs/tslibs/nattype.pyx b/pandas/_libs/tslibs/nattype.pyx index 60afc1acdc297..620e0846c750e 100644 --- a/pandas/_libs/tslibs/nattype.pyx +++ b/pandas/_libs/tslibs/nattype.pyx @@ -493,6 +493,14 @@ class NaTType(_NaT): """ Total seconds in the duration. + This method calculates the total duration in seconds by combining + the days, seconds, and microseconds of the `Timedelta` object. + + See Also + -------- + to_timedelta : Convert argument to timedelta. 
+ Timedelta : Represents a duration, the difference between two dates or times. + Examples -------- >>> td = pd.Timedelta('1min') diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index 6159bd0dadb47..0ff5c5fb81df8 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -1189,6 +1189,14 @@ cdef class _Timedelta(timedelta): """ Total seconds in the duration. + This method calculates the total duration in seconds by combining + the days, seconds, and microseconds of the `Timedelta` object. + + See Also + -------- + to_timedelta : Convert argument to timedelta. + Timedelta : Represents a duration, the difference between two dates or times. + Examples -------- >>> td = pd.Timedelta('1min') From 679578742669e208265b9089b6afe3f0451be680 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Sun, 15 Sep 2024 22:46:32 +0530 Subject: [PATCH 042/224] DOC: fix SA01 for pandas.api.types.is_array_like (#59802) --- ci/code_checks.sh | 1 - pandas/core/dtypes/common.py | 4 ++++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 606ede4e861fa..ff5bfee1518c4 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -113,7 +113,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Timestamp.tzinfo GL08" \ -i "pandas.Timestamp.year GL08" \ -i "pandas.api.types.is_dict_like PR07,SA01" \ - -i "pandas.api.types.is_extension_array_dtype SA01" \ -i "pandas.api.types.is_file_like PR07,SA01" \ -i "pandas.api.types.is_float PR01,SA01" \ -i "pandas.api.types.is_float_dtype SA01" \ diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 16f6bd396fe93..de38395cecad3 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -1401,6 +1401,10 @@ def is_extension_array_dtype(arr_or_dtype) -> bool: bool Whether the `arr_or_dtype` is an extension array type. + See Also + -------- + api.extensions.ExtensionArray : Abstract base class for pandas extension arrays. + Notes ----- This checks whether an object implements the pandas extension From 2b37219aa0617edce4f4326aec71e049b6acc1d2 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Sun, 15 Sep 2024 22:47:05 +0530 Subject: [PATCH 043/224] DOC: fix SA01 for pandas.api.types.is_integer_dtype (#59803) --- ci/code_checks.sh | 1 - pandas/core/dtypes/common.py | 9 +++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index ff5bfee1518c4..ffa540291e560 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -119,7 +119,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.api.types.is_hashable PR01,RT03,SA01" \ -i "pandas.api.types.is_int64_dtype SA01" \ -i "pandas.api.types.is_integer PR01,SA01" \ - -i "pandas.api.types.is_integer_dtype SA01" \ -i "pandas.api.types.is_interval_dtype SA01" \ -i "pandas.api.types.is_iterator PR07,SA01" \ -i "pandas.api.types.is_list_like SA01" \ diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index de38395cecad3..ff855f97a352b 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -694,6 +694,15 @@ def is_integer_dtype(arr_or_dtype) -> bool: Whether or not the array or dtype is of an integer dtype and not an instance of timedelta64. + See Also + -------- + api.types.is_integer : Return True if given object is integer. + api.types.is_numeric_dtype : Check whether the provided array or dtype is of a + numeric dtype. 
+ api.types.is_float_dtype : Check whether the provided array or dtype is of a + float dtype. + Int64Dtype : An ExtensionDtype for Int64Dtype integer data. + Examples -------- >>> from pandas.api.types import is_integer_dtype From 1d80ac59028b01d3efc15b97119cf6b3c896c1da Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Sun, 15 Sep 2024 22:47:47 +0530 Subject: [PATCH 044/224] DOC: fix SA01 for pandas.arrays.FloatingArray (#59804) --- ci/code_checks.sh | 1 - pandas/core/arrays/floating.py | 8 ++++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index ffa540291e560..f022e0176a987 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -130,7 +130,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.arrays.ArrowExtensionArray PR07,SA01" \ -i "pandas.arrays.BooleanArray SA01" \ -i "pandas.arrays.DatetimeArray SA01" \ - -i "pandas.arrays.FloatingArray SA01" \ -i "pandas.arrays.IntegerArray SA01" \ -i "pandas.arrays.IntervalArray.left SA01" \ -i "pandas.arrays.IntervalArray.length SA01" \ diff --git a/pandas/core/arrays/floating.py b/pandas/core/arrays/floating.py index b3fbf0f92c32d..67c23f4825a7f 100644 --- a/pandas/core/arrays/floating.py +++ b/pandas/core/arrays/floating.py @@ -96,6 +96,14 @@ class FloatingArray(NumericArray): ------- FloatingArray + See Also + -------- + array : Create an array. + Float32Dtype : Float32 dtype for FloatingArray. + Float64Dtype : Float64 dtype for FloatingArray. + Series : One-dimensional labeled array capable of holding data. + DataFrame : Two-dimensional, size-mutable, potentially heterogeneous tabular data. + Examples -------- Create an FloatingArray with :func:`pandas.array`: From 235e1bea1366f9ffd54866e7a997d2a75016bf84 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torsten=20W=C3=B6rtwein?= <6618166+twoertwein@users.noreply.github.com> Date: Sun, 15 Sep 2024 14:24:36 -0400 Subject: [PATCH 045/224] WEB: update list of (in)active core devs (#59808) --- web/pandas/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/web/pandas/config.yml b/web/pandas/config.yml index 74e7fda2e7983..a49aadd45204a 100644 --- a/web/pandas/config.yml +++ b/web/pandas/config.yml @@ -89,7 +89,6 @@ maintainers: - phofl - attack68 - fangchenli - - twoertwein - lithomas1 - lukemanley - noatamir @@ -108,6 +107,7 @@ maintainers: - wesm - gfyoung - mzeitlin11 + - twoertwein workgroups: coc: name: Code of Conduct From 3e8ac12d1dacc2308b2f4c2869fa7bc2079bd323 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sun, 15 Sep 2024 22:00:26 +0200 Subject: [PATCH 046/224] BUG (CoW): fix reference tracking in replace_list with None (#59807) --- pandas/core/internals/blocks.py | 2 +- pandas/tests/copy_view/test_replace.py | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index dced92ba04520..cb40e920149fa 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -915,7 +915,7 @@ def _replace_coerce( nb = nb.copy() putmask_inplace(nb.values, mask, value) return [nb] - return [self] + return [self.copy(deep=False)] return self.replace( to_replace=to_replace, value=value, diff --git a/pandas/tests/copy_view/test_replace.py b/pandas/tests/copy_view/test_replace.py index 58c979fb05089..a8acd446ff5f5 100644 --- a/pandas/tests/copy_view/test_replace.py +++ b/pandas/tests/copy_view/test_replace.py @@ -286,6 +286,12 @@ def test_replace_list_none(): assert not np.shares_memory(get_array(df, 
"a"), get_array(df2, "a")) + # replace multiple values that don't actually replace anything with None + # https://github.com/pandas-dev/pandas/issues/59770 + df3 = df.replace(["d", "e", "f"], value=None) + tm.assert_frame_equal(df3, df_orig) + assert tm.shares_memory(get_array(df, "a"), get_array(df3, "a")) + def test_replace_list_none_inplace_refs(): df = DataFrame({"a": ["a", "b", "c"]}) From 122fc4c6f45b8e603132b57c4cf99c8837bac43e Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sun, 15 Sep 2024 23:03:33 +0200 Subject: [PATCH 047/224] DOC: add whatsnew for v2.2.3 (#59811) * DOC: add whatsnew for v2.2.3 * fix warning --------- Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com> --- doc/source/whatsnew/index.rst | 1 + doc/source/whatsnew/v2.2.3.rst | 36 ++++++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+) create mode 100644 doc/source/whatsnew/v2.2.3.rst diff --git a/doc/source/whatsnew/index.rst b/doc/source/whatsnew/index.rst index 2f7ec52d117f8..1dd6c5fabef04 100644 --- a/doc/source/whatsnew/index.rst +++ b/doc/source/whatsnew/index.rst @@ -32,6 +32,7 @@ Version 2.2 .. toctree:: :maxdepth: 2 + v2.2.3 v2.2.2 v2.2.1 v2.2.0 diff --git a/doc/source/whatsnew/v2.2.3.rst b/doc/source/whatsnew/v2.2.3.rst new file mode 100644 index 0000000000000..aa6e241e74b0a --- /dev/null +++ b/doc/source/whatsnew/v2.2.3.rst @@ -0,0 +1,36 @@ +.. _whatsnew_223: + +What's new in 2.2.3 (September XX, 2024) +---------------------------------------- + +These are the changes in pandas 2.2.3. See :ref:`release` for a full changelog +including other versions of pandas. + +{{ header }} + +.. --------------------------------------------------------------------------- +.. _whatsnew_223.regressions: + +Fixed regressions +~~~~~~~~~~~~~~~~~ +- + +.. --------------------------------------------------------------------------- +.. _whatsnew_223.bug_fixes: + +Bug fixes +~~~~~~~~~ +- + +.. --------------------------------------------------------------------------- +.. _whatsnew_223.other: + +Other +~~~~~ +- + +.. --------------------------------------------------------------------------- +.. _whatsnew_223.contributors: + +Contributors +~~~~~~~~~~~~ From 160b3eb4be5150a2d2bcb6b4e47dc8a44a4c0922 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Mon, 16 Sep 2024 02:43:06 +0530 Subject: [PATCH 048/224] DOC: fix SA01 for pandas.errors.MergeError (#59805) * DOC: fix SA01 for pandas.errors.MergeError * DOC: fix SA01 for pandas.errors.MergeError Co-authored-by: mroeschke --------- Co-authored-by: mroeschke --- ci/code_checks.sh | 1 - pandas/errors/__init__.py | 5 +++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index f022e0176a987..b57426dbb2078 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -191,7 +191,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.errors.IntCastingNaNError SA01" \ -i "pandas.errors.InvalidIndexError SA01" \ -i "pandas.errors.InvalidVersion SA01" \ - -i "pandas.errors.MergeError SA01" \ -i "pandas.errors.NullFrequencyError SA01" \ -i "pandas.errors.NumExprClobberingError SA01" \ -i "pandas.errors.NumbaUtilError SA01" \ diff --git a/pandas/errors/__init__.py b/pandas/errors/__init__.py index 2f625090e0492..7851bc90c5782 100644 --- a/pandas/errors/__init__.py +++ b/pandas/errors/__init__.py @@ -261,6 +261,11 @@ class MergeError(ValueError): Subclass of ``ValueError``. + See Also + -------- + DataFrame.join : For joining DataFrames on their indexes. 
+ merge : For merging two DataFrames on a common set of keys. + Examples -------- >>> left = pd.DataFrame( From 013ac6702c738b73a6729aa75399eebe9ef52f45 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 16 Sep 2024 19:25:59 +0200 Subject: [PATCH 049/224] String dtype: allow string dtype in query/eval with default numexpr engine (#59810) String dtype: allow string dtype in query/eval with default mumexpr engine --- pandas/core/computation/eval.py | 12 +++++++++--- pandas/core/computation/expr.py | 6 +++++- pandas/tests/frame/test_query_eval.py | 24 ++++++------------------ 3 files changed, 20 insertions(+), 22 deletions(-) diff --git a/pandas/core/computation/eval.py b/pandas/core/computation/eval.py index aad768d31483a..485c7f87d6f33 100644 --- a/pandas/core/computation/eval.py +++ b/pandas/core/computation/eval.py @@ -14,7 +14,10 @@ from pandas.util._exceptions import find_stack_level from pandas.util._validators import validate_bool_kwarg -from pandas.core.dtypes.common import is_extension_array_dtype +from pandas.core.dtypes.common import ( + is_extension_array_dtype, + is_string_dtype, +) from pandas.core.computation.engines import ENGINES from pandas.core.computation.expr import ( @@ -345,10 +348,13 @@ def eval( parsed_expr = Expr(expr, engine=engine, parser=parser, env=env) if engine == "numexpr" and ( - is_extension_array_dtype(parsed_expr.terms.return_type) + ( + is_extension_array_dtype(parsed_expr.terms.return_type) + and not is_string_dtype(parsed_expr.terms.return_type) + ) or getattr(parsed_expr.terms, "operand_types", None) is not None and any( - is_extension_array_dtype(elem) + (is_extension_array_dtype(elem) and not is_string_dtype(elem)) for elem in parsed_expr.terms.operand_types ) ): diff --git a/pandas/core/computation/expr.py b/pandas/core/computation/expr.py index b074e768e0842..f45bc453d2541 100644 --- a/pandas/core/computation/expr.py +++ b/pandas/core/computation/expr.py @@ -21,6 +21,8 @@ from pandas.errors import UndefinedVariableError +from pandas.core.dtypes.common import is_string_dtype + import pandas.core.common as com from pandas.core.computation.ops import ( ARITH_OPS_SYMS, @@ -524,10 +526,12 @@ def _maybe_evaluate_binop( elif self.engine != "pytables": if ( getattr(lhs, "return_type", None) == object + or is_string_dtype(getattr(lhs, "return_type", None)) or getattr(rhs, "return_type", None) == object + or is_string_dtype(getattr(rhs, "return_type", None)) ): # evaluate "==" and "!=" in python if either of our operands - # has an object return type + # has an object or string return type return self._maybe_eval(res, eval_in_python + maybe_eval_in_python) return res diff --git a/pandas/tests/frame/test_query_eval.py b/pandas/tests/frame/test_query_eval.py index fa71153d01157..a574989860957 100644 --- a/pandas/tests/frame/test_query_eval.py +++ b/pandas/tests/frame/test_query_eval.py @@ -4,8 +4,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.errors import ( NumExprClobberingError, UndefinedVariableError, @@ -762,7 +760,6 @@ def test_inf(self, op, f, engine, parser): result = df.query(q, engine=engine, parser=parser) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_check_tz_aware_index_query(self, tz_aware_fixture): # https://github.com/pandas-dev/pandas/issues/29463 tz = tz_aware_fixture @@ -775,6 +772,7 @@ def test_check_tz_aware_index_query(self, tz_aware_fixture): tm.assert_frame_equal(result, expected) expected = 
DataFrame(df_index) + expected.columns = expected.columns.astype(object) result = df.reset_index().query('"2018-01-03 00:00:00+00" < time') tm.assert_frame_equal(result, expected) @@ -1072,7 +1070,7 @@ def test_query_with_string_columns(self, parser, engine): with pytest.raises(NotImplementedError, match=msg): df.query("a in b and c < d", parser=parser, engine=engine) - def test_object_array_eq_ne(self, parser, engine, using_infer_string): + def test_object_array_eq_ne(self, parser, engine): df = DataFrame( { "a": list("aaaabbbbcccc"), @@ -1081,14 +1079,11 @@ def test_object_array_eq_ne(self, parser, engine, using_infer_string): "d": np.random.default_rng(2).integers(9, size=12), } ) - warning = RuntimeWarning if using_infer_string and engine == "numexpr" else None - with tm.assert_produces_warning(warning): - res = df.query("a == b", parser=parser, engine=engine) + res = df.query("a == b", parser=parser, engine=engine) exp = df[df.a == df.b] tm.assert_frame_equal(res, exp) - with tm.assert_produces_warning(warning): - res = df.query("a != b", parser=parser, engine=engine) + res = df.query("a != b", parser=parser, engine=engine) exp = df[df.a != df.b] tm.assert_frame_equal(res, exp) @@ -1128,15 +1123,13 @@ def test_query_with_nested_special_character(self, parser, engine): ], ) def test_query_lex_compare_strings( - self, parser, engine, op, func, using_infer_string + self, parser, engine, op, func ): a = Series(np.random.default_rng(2).choice(list("abcde"), 20)) b = Series(np.arange(a.size)) df = DataFrame({"X": a, "Y": b}) - warning = RuntimeWarning if using_infer_string and engine == "numexpr" else None - with tm.assert_produces_warning(warning): - res = df.query(f'X {op} "d"', engine=engine, parser=parser) + res = df.query(f'X {op} "d"', engine=engine, parser=parser) expected = df[func(df.X, "d")] tm.assert_frame_equal(res, expected) @@ -1400,7 +1393,6 @@ def test_expr_with_column_name_with_backtick(self): expected = df[df["a`b"] < 2] tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_expr_with_string_with_backticks(self): # GH 59285 df = DataFrame(("`", "`````", "``````````"), columns=["#backticks"]) @@ -1408,7 +1400,6 @@ def test_expr_with_string_with_backticks(self): expected = df["```" < df["#backticks"]] tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_expr_with_string_with_backticked_substring_same_as_column_name(self): # GH 59285 df = DataFrame(("`", "`````", "``````````"), columns=["#backticks"]) @@ -1439,7 +1430,6 @@ def test_expr_with_column_names_with_special_characters(self, col1, col2, expr): expected = df[df[col1] < df[col2]] tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_expr_with_no_backticks(self): # GH 59285 df = DataFrame(("aaa", "vvv", "zzz"), columns=["column_name"]) @@ -1483,7 +1473,6 @@ def test_expr_with_quote_opened_before_backtick_and_quote_is_unmatched(self): ): df.query("`column-name` < 'It`s that\\'s \"quote\" #hash") - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_expr_with_quote_opened_before_backtick_and_quote_is_matched_at_end(self): # GH 59285 df = DataFrame(("aaa", "vvv", "zzz"), columns=["column-name"]) @@ -1491,7 +1480,6 @@ def test_expr_with_quote_opened_before_backtick_and_quote_is_matched_at_end(self expected = df[df["column-name"] < 'It`s that\'s "quote" #hash'] tm.assert_frame_equal(result, expected) 
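The ``eval`` and ``expr`` changes above allow string-dtype operands under the default ``numexpr`` engine by routing ``==``/``!=`` comparisons on strings through Python. A short sketch (assumes numexpr is installed; the data is illustrative):

```python
import pandas as pd

df = pd.DataFrame({"a": list("aabb"), "b": list("abab")})
# With str-dtype columns this no longer raises or warns under the
# default engine; the string comparison itself is evaluated in Python.
print(df.query("a == b"))
```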
- @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_expr_with_quote_opened_before_backtick_and_quote_is_matched_in_mid(self): # GH 59285 df = DataFrame(("aaa", "vvv", "zzz"), columns=["column-name"]) From 081dcdee8d754af90e307cf2311b06b3d02fae2a Mon Sep 17 00:00:00 2001 From: Lysandros Nikolaou Date: Mon, 16 Sep 2024 19:40:33 +0200 Subject: [PATCH 050/224] BUG: Remove np._get_promotion_state usage (#59818) --- pandas/tests/series/indexing/test_setitem.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py index 71ba2dab671ef..789e3ac752097 100644 --- a/pandas/tests/series/indexing/test_setitem.py +++ b/pandas/tests/series/indexing/test_setitem.py @@ -4,13 +4,17 @@ datetime, ) from decimal import Decimal +import os import numpy as np import pytest from pandas._config import using_string_dtype -from pandas.compat import HAS_PYARROW +from pandas.compat import ( + HAS_PYARROW, + WASM, +) from pandas.compat.numpy import np_version_gte1p24 from pandas.errors import IndexingError @@ -1446,7 +1450,11 @@ def obj(self): marks=pytest.mark.xfail( ( not np_version_gte1p24 - or (np_version_gte1p24 and np._get_promotion_state() != "weak") + or ( + np_version_gte1p24 + and os.environ.get("NPY_PROMOTION_STATE", "weak") != "weak" + ) + or WASM ), reason="np.float32(1.1) ends up as 1.100000023841858, so " "np_can_hold_element raises and we cast to float64", From 8b1b2114ea72b9b79220e3cb2828b3e562bb5e07 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Tue, 17 Sep 2024 19:00:50 -0400 Subject: [PATCH 051/224] CI: Debug failing ARM builds (#59813) * try bumping cython? * maybe pinning numpy helps? 
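Some context for the reworked xfail condition in the previous patch (#59818): whether ``np.float32(1.1)`` can hold the Python float ``1.1`` losslessly is what decides between keeping float32 and upcasting, and that in turn depends on NumPy's promotion rules (now read from the ``NPY_PROMOTION_STATE`` environment variable instead of the removed ``np._get_promotion_state``). A tiny illustration:

```python
import numpy as np

# 1.1 is not exactly representable in float32:
print(float(np.float32(1.1)))  # 1.100000023841858
# so it differs from the float64 value, which is why
# np_can_hold_element may reject it and force a float64 upcast.
print(np.float32(1.1) == np.float64(1.1))  # False
```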
* skip tests * Update test_sparse.py * go for green * Update test_sparse.py * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- pandas/tests/extension/test_sparse.py | 5 +++++ pandas/tests/series/test_ufunc.py | 5 ++++- pyproject.toml | 12 ++++++++++++ 3 files changed, 21 insertions(+), 1 deletion(-) diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py index 56c023d99bb1c..b7685a61d4937 100644 --- a/pandas/tests/extension/test_sparse.py +++ b/pandas/tests/extension/test_sparse.py @@ -340,11 +340,16 @@ def test_argmin_argmax_all_na(self, method, data, na_value): self._check_unsupported(data) super().test_argmin_argmax_all_na(method, data, na_value) + @pytest.mark.fails_arm_wheels @pytest.mark.parametrize("box", [pd.array, pd.Series, pd.DataFrame]) def test_equals(self, data, na_value, as_series, box): self._check_unsupported(data) super().test_equals(data, na_value, as_series, box) + @pytest.mark.fails_arm_wheels + def test_equals_same_data_different_object(self, data): + super().test_equals_same_data_different_object(data) + @pytest.mark.parametrize( "func, na_action, expected", [ diff --git a/pandas/tests/series/test_ufunc.py b/pandas/tests/series/test_ufunc.py index 36a2afb2162c2..a5976bb2518c9 100644 --- a/pandas/tests/series/test_ufunc.py +++ b/pandas/tests/series/test_ufunc.py @@ -16,7 +16,10 @@ def ufunc(request): return request.param -@pytest.fixture(params=[True, False], ids=["sparse", "dense"]) +@pytest.fixture( + params=[pytest.param(True, marks=pytest.mark.fails_arm_wheels), False], + ids=["sparse", "dense"], +) def sparse(request): return request.param diff --git a/pyproject.toml b/pyproject.toml index 645ded35f3d18..9e4199ab735c6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -163,6 +163,14 @@ before-test = "bash {package}/scripts/cibw_before_test.sh" before-build = "pip install delvewheel && bash {package}/scripts/cibw_before_build.sh" repair-wheel-command = "delvewheel repair -w {dest_dir} {wheel}" +[[tool.cibuildwheel.overrides]] +select = "*-manylinux_aarch64*" +test-command = """ + PANDAS_CI='1' python -c 'import pandas as pd; \ + pd.test(extra_args=["-m not clipboard and not single_cpu and not slow and not network and not db and not fails_arm_wheels", "-n 2", "--no-strict-data-files"]); \ + pd.test(extra_args=["-m not clipboard and single_cpu and not slow and not network and not db", "--no-strict-data-files"]);' \ + """ + [[tool.cibuildwheel.overrides]] select = "*-musllinux*" before-test = "apk update && apk add musl-locales && bash {package}/scripts/cibw_before_test.sh" @@ -478,6 +486,10 @@ markers = [ "clipboard: mark a pd.read_clipboard test", "arm_slow: mark a test as slow for arm64 architecture", "skip_ubsan: Tests known to fail UBSAN check", + # TODO: someone should investigate this ... 
+ # these tests only fail in the wheel builder and don't fail in regular + # ARM CI + "fails_arm_wheels: Tests that fail in the ARM wheel build only", ] [tool.mypy] From a851438906ad5ec5f33df4a28ced85c4a0dcb492 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Wed, 18 Sep 2024 22:20:41 +0530 Subject: [PATCH 052/224] DOC: fix SA01,ES01 for pandas.tseries.offsets.WeekOfMonth (#59834) --- ci/code_checks.sh | 1 - pandas/_libs/tslibs/offsets.pyx | 11 +++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index b57426dbb2078..f2d9f582d8932 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -383,7 +383,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.tseries.offsets.Week.n GL08" \ -i "pandas.tseries.offsets.Week.normalize GL08" \ -i "pandas.tseries.offsets.Week.weekday GL08" \ - -i "pandas.tseries.offsets.WeekOfMonth SA01" \ -i "pandas.tseries.offsets.WeekOfMonth.is_on_offset GL08" \ -i "pandas.tseries.offsets.WeekOfMonth.n GL08" \ -i "pandas.tseries.offsets.WeekOfMonth.normalize GL08" \ diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 043c029ec900c..4fa1af0ec882c 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -3582,6 +3582,11 @@ cdef class WeekOfMonth(WeekOfMonthMixin): """ Describes monthly dates like "the Tuesday of the 2nd week of each month". + This offset allows for generating or adjusting dates by specifying + a particular week and weekday within a month. The week is zero-indexed, + where 0 corresponds to the first week of the month, and weekday follows + a Monday=0 convention. + Attributes ---------- n : int, default 1 @@ -3602,6 +3607,12 @@ cdef class WeekOfMonth(WeekOfMonthMixin): - 5 is Saturday - 6 is Sunday. + See Also + -------- + offsets.Week : Describes weekly frequency adjustments. + offsets.MonthEnd : Describes month-end frequency adjustments. + date_range : Generates a range of dates based on a specific frequency. + Examples -------- >>> ts = pd.Timestamp(2022, 1, 1) From 0ad2c0d549ecc866a334e482afadc96845a01efa Mon Sep 17 00:00:00 2001 From: Matthew Simpson <156332325+ms041223@users.noreply.github.com> Date: Wed, 18 Sep 2024 19:25:53 +0100 Subject: [PATCH 053/224] DOC: Adding ArcticDB to the ecosystem.md page (#59830) * adding ArcticDB to the ecosystem.md page * Update web/pandas/community/ecosystem.md Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> * making pandas lower case --------- Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- web/pandas/community/ecosystem.md | 91 +++++++++++++++++++++++++++++++ 1 file changed, 91 insertions(+) diff --git a/web/pandas/community/ecosystem.md b/web/pandas/community/ecosystem.md index 73a3cb6429790..2ea10954fc929 100644 --- a/web/pandas/community/ecosystem.md +++ b/web/pandas/community/ecosystem.md @@ -367,6 +367,97 @@ pandas-gbq provides high performance reads and writes to and from these methods were exposed as `pandas.read_gbq` and `DataFrame.to_gbq`. Use `pandas_gbq.read_gbq` and `pandas_gbq.to_gbq`, instead. + +### [ArcticDB](https://github.com/man-group/ArcticDB) + +ArcticDB is a serverless DataFrame database engine designed for the Python Data Science ecosystem. ArcticDB enables you to store, retrieve, and process pandas DataFrames at scale. It is a storage engine designed for object storage and also supports local-disk storage using LMDB. 
ArcticDB requires zero additional infrastructure beyond a running Python environment and access to object storage and can be installed in seconds. Please find full documentation [here](https://docs.arcticdb.io/latest/). + +#### ArcticDB Terminology + +ArcticDB is structured to provide a scalable and efficient way to manage and retrieve DataFrames, organized into several key components: + +- `Object Store` Collections of libraries. Used to separate logical environments from each other. Analogous to a database server. +- `Library` Contains multiple symbols which are grouped in a certain way (different users, markets, etc). Analogous to a database. +- `Symbol` Atomic unit of data storage. Identified by a string name. Data stored under a symbol strongly resembles a pandas DataFrame. Analogous to tables. +- `Version` Every modifying action (write, append, update) performed on a symbol creates a new version of that object. + +#### Installation + +To install, simply run: + +```console +pip install arcticdb +``` + +To get started, we can import ArcticDB and instantiate it: + +```python +import arcticdb as adb +import numpy as np +import pandas as pd +# this will set up the storage using the local file system +arctic = adb.Arctic("lmdb://arcticdb_test") +``` + +> **Note:** ArcticDB supports any S3 API compatible storage, including AWS. ArcticDB also supports Azure Blob storage. +> ArcticDB also supports LMDB for local/file based storage - to use LMDB, pass an LMDB path as the URI: `adb.Arctic('lmdb://path/to/desired/database')`. + +#### Library Setup + +ArcticDB is geared towards storing many (potentially millions) of tables. Individual tables (DataFrames) are called symbols and are stored in collections called libraries. A single library can store many symbols. Libraries must first be initialized prior to use: + +```python +lib = arctic.get_library('sample', create_if_missing=True) +``` + +#### Writing Data to ArcticDB + +Now we have a library set up, we can get to reading and writing data. ArcticDB has a set of simple functions for DataFrame storage. Let's write a DataFrame to storage. + +```python +df = pd.DataFrame( + { + "a": list("abc"), + "b": list(range(1, 4)), + "c": np.arange(3, 6).astype("u1"), + "d": np.arange(4.0, 7.0, dtype="float64"), + "e": [True, False, True], + "f": pd.date_range("20130101", periods=3) + } +) + +df +df.dtypes +``` + +Write to ArcticDB. + +```python +write_record = lib.write("test", df) +``` + +> **Note:** When writing pandas DataFrames, ArcticDB supports the following index types: +> +> - `pandas.Index` containing int64 (or the corresponding dedicated types Int64Index, UInt64Index) +> - `RangeIndex` +> - `DatetimeIndex` +> - `MultiIndex` composed of above supported types +> +> The "row" concept in `head`/`tail` refers to the row number ('iloc'), not the value in the `pandas.Index` ('loc'). + +#### Reading Data from ArcticDB + +Read the data back from storage: + +```python +read_record = lib.read("test") +read_record.data +df.dtypes +``` + +ArcticDB also supports appending, updating, and querying data from storage to a pandas DataFrame. Please find more information [here](https://docs.arcticdb.io/latest/api/query_builder/). 
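
A minimal runnable sketch of the append-and-query workflow mentioned just above (an illustrative aside, not part of the committed doc change), reusing the library and the `"test"` symbol written in the preceding examples, and assuming ArcticDB's documented `Library.append` and `QueryBuilder` APIs:

```python
import arcticdb as adb
import numpy as np
import pandas as pd

# Reconnect to the LMDB-backed library created in the examples above.
arctic = adb.Arctic("lmdb://arcticdb_test")
lib = arctic.get_library("sample", create_if_missing=True)

# Append two more rows to the "test" symbol with the same schema as the
# original DataFrame; per the terminology section above, every modifying
# action creates a new version, so the previous version stays readable.
extra = pd.DataFrame(
    {
        "a": list("de"),
        "b": [4, 5],
        "c": np.arange(6, 8).astype("u1"),
        "d": np.arange(7.0, 9.0, dtype="float64"),
        "e": [False, True],
        "f": pd.date_range("20130104", periods=2),
    }
)
lib.append("test", extra)

# Filter on the storage side before materializing a pandas DataFrame.
q = adb.QueryBuilder()
q = q[q["b"] > 2]  # keep only rows where column "b" exceeds 2
filtered = lib.read("test", query_builder=q).data
```

The point of the `QueryBuilder` step is that the filter is evaluated inside ArcticDB before any DataFrame is materialized, rather than reading the full symbol and filtering with pandas afterwards; the exact signatures come from the ArcticDB documentation linked in this section and should be treated as illustrative.
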
+ + ## Out-of-core ### [Bodo](https://bodo.ai/) From 09c7a873cacfcb2caa38329d4bb27d61fd153d74 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Wed, 18 Sep 2024 17:16:55 -0400 Subject: [PATCH 054/224] BLD: Fix bad Cython annotation (#59836) --- pandas/_libs/tslibs/np_datetime.pxd | 2 +- pandas/_libs/tslibs/np_datetime.pyx | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/tslibs/np_datetime.pxd b/pandas/_libs/tslibs/np_datetime.pxd index 43240046c6500..3e5654b70cd92 100644 --- a/pandas/_libs/tslibs/np_datetime.pxd +++ b/pandas/_libs/tslibs/np_datetime.pxd @@ -89,7 +89,7 @@ cdef int string_to_dts( int* out_local, int* out_tzoffset, bint want_exc, - format: str | None = *, + str format = *, bint exact = * ) except? -1 diff --git a/pandas/_libs/tslibs/np_datetime.pyx b/pandas/_libs/tslibs/np_datetime.pyx index 61095b3f034fd..0b02fc13246f0 100644 --- a/pandas/_libs/tslibs/np_datetime.pyx +++ b/pandas/_libs/tslibs/np_datetime.pyx @@ -331,7 +331,7 @@ cdef int string_to_dts( int* out_local, int* out_tzoffset, bint want_exc, - format: str | None=None, + str format=None, bint exact=True, ) except? -1: cdef: From 22372175e04f05f73521cab1b26f0818d6766717 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 19 Sep 2024 00:46:07 +0200 Subject: [PATCH 055/224] BLD/RLS: build wheels with released numpy/cython for Python 3.13 (#59819) --- .github/workflows/wheels.yml | 6 +----- MANIFEST.in | 1 - pyproject.toml | 3 +-- scripts/cibw_before_build.sh | 8 +++----- scripts/cibw_before_test.sh | 8 -------- 5 files changed, 5 insertions(+), 21 deletions(-) delete mode 100644 scripts/cibw_before_test.sh diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index 67d8715f72614..2aaec8c9b56b0 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -102,9 +102,7 @@ jobs: python: [["cp310", "3.10"], ["cp311", "3.11"], ["cp312", "3.12"], ["cp313", "3.13"], ["cp313t", "3.13"]] include: # TODO: Remove this plus installing build deps in cibw_before_build.sh - # and test deps in cibw_before_test.sh after pandas can be built with a released NumPy/Cython - - python: ["cp313", "3.13"] - cibw_build_frontend: 'pip; args: --no-build-isolation' + # after pandas can be built with a released NumPy/Cython - python: ["cp313t", "3.13"] cibw_build_frontend: 'pip; args: --no-build-isolation' # Build Pyodide wheels and upload them to Anaconda.org @@ -187,11 +185,9 @@ jobs: - name: Test Windows Wheels if: ${{ matrix.buildplat[1] == 'win_amd64' }} shell: pwsh - # TODO: Remove NumPy nightly install when there's a 3.13 wheel on PyPI run: | $TST_CMD = @" python -m pip install hypothesis>=6.84.0 pytest>=7.3.2 pytest-xdist>=3.4.0; - ${{ matrix.python[1] == '3.13' && 'python -m pip install -i https://pypi.anaconda.org/scientific-python-nightly-wheels/simple numpy;' }} python -m pip install `$(Get-Item pandas\wheelhouse\*.whl); python -c `'import pandas as pd; pd.test(extra_args=[`\"--no-strict-data-files`\", `\"-m not clipboard and not single_cpu and not slow and not network and not db`\"])`'; "@ diff --git a/MANIFEST.in b/MANIFEST.in index f586d457eaaf8..a7d7d7eb4e062 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -65,4 +65,3 @@ graft pandas/_libs/include # Include cibw script in sdist since it's needed for building wheels include scripts/cibw_before_build.sh -include scripts/cibw_before_test.sh diff --git a/pyproject.toml b/pyproject.toml index 9e4199ab735c6..5ffd9d9a5608c 100644 --- a/pyproject.toml +++ 
b/pyproject.toml @@ -157,7 +157,6 @@ test-command = """ """ free-threaded-support = true before-build = "bash {package}/scripts/cibw_before_build.sh" -before-test = "bash {package}/scripts/cibw_before_test.sh" [tool.cibuildwheel.windows] before-build = "pip install delvewheel && bash {package}/scripts/cibw_before_build.sh" @@ -173,7 +172,7 @@ test-command = """ [[tool.cibuildwheel.overrides]] select = "*-musllinux*" -before-test = "apk update && apk add musl-locales && bash {package}/scripts/cibw_before_test.sh" +before-test = "apk update && apk add musl-locales" [[tool.cibuildwheel.overrides]] select = "*-win*" diff --git a/scripts/cibw_before_build.sh b/scripts/cibw_before_build.sh index f3049b27ed5d1..6186340807f8f 100644 --- a/scripts/cibw_before_build.sh +++ b/scripts/cibw_before_build.sh @@ -1,8 +1,6 @@ -# TODO: Delete when there's PyPI NumPy/Cython releases the support Python 3.13. -# If free-threading support is not included in those releases, this script will have -# to whether this runs for a free-threaded build instead. -PYTHON_VERSION="$(python -c "import sys; print(f'{sys.version_info.major}{sys.version_info.minor}')")" -if [[ $PYTHON_VERSION == "313" ]]; then +# TODO: Delete when there's a PyPI Cython release that supports free-threaded Python 3.13. +FREE_THREADED_BUILD="$(python -c"import sysconfig; print(bool(sysconfig.get_config_var('Py_GIL_DISABLED')))")" +if [[ $FREE_THREADED_BUILD == "True" ]]; then python -m pip install -U pip python -m pip install -i https://pypi.anaconda.org/scientific-python-nightly-wheels/simple numpy cython python -m pip install ninja meson-python versioneer[toml] diff --git a/scripts/cibw_before_test.sh b/scripts/cibw_before_test.sh deleted file mode 100644 index 7d1b143881ced..0000000000000 --- a/scripts/cibw_before_test.sh +++ /dev/null @@ -1,8 +0,0 @@ -# TODO: Delete when there's PyPI NumPy/Cython releases the support Python 3.13. -# If free-threading support is not included in those releases, this script will have -# to whether this runs for a free-threaded build instead. -PYTHON_VERSION="$(python -c "import sys; print(f'{sys.version_info.major}{sys.version_info.minor}')")" -if [[ $PYTHON_VERSION == "313" ]]; then - python -m pip install -U pip - python -m pip install -i https://pypi.anaconda.org/scientific-python-nightly-wheels/simple numpy -fi From f1e6cc184ae0534e11c0a2947f4948bc4c5e0a9d Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Thu, 19 Sep 2024 15:39:08 -0400 Subject: [PATCH 056/224] BLD: Final release prep for 2.2.3 (#59840) * BLD: Final release prep * change back perms * debug * try to fix license addition * silence stable version warning? --- doc/source/conf.py | 4 +++- doc/source/whatsnew/v2.2.2.rst | 2 +- doc/source/whatsnew/v2.2.3.rst | 23 ++++++++++++++++------- doc/source/whatsnew/v3.0.0.rst | 1 - pyproject.toml | 2 +- scripts/cibw_before_build.sh | 5 +++++ 6 files changed, 26 insertions(+), 11 deletions(-) diff --git a/doc/source/conf.py b/doc/source/conf.py index 77dd5d03d311c..ddbda0aa3bf65 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -254,7 +254,9 @@ "json_url": "https://pandas.pydata.org/versions.json", "version_match": switcher_version, }, - "show_version_warning_banner": True, + # This shows a warning for patch releases since the + # patch version doesn't compare as equal (e.g. 
2.2.1 != 2.2.0 but it should be) + "show_version_warning_banner": False, "icon_links": [ { "name": "Mastodon", diff --git a/doc/source/whatsnew/v2.2.2.rst b/doc/source/whatsnew/v2.2.2.rst index 72a2f84c4aaee..fbe5e9b4febb5 100644 --- a/doc/source/whatsnew/v2.2.2.rst +++ b/doc/source/whatsnew/v2.2.2.rst @@ -56,4 +56,4 @@ Other Contributors ~~~~~~~~~~~~ -.. contributors:: v2.2.1..v2.2.2|HEAD +.. contributors:: v2.2.1..v2.2.2 diff --git a/doc/source/whatsnew/v2.2.3.rst b/doc/source/whatsnew/v2.2.3.rst index aa6e241e74b0a..1696a7b6449af 100644 --- a/doc/source/whatsnew/v2.2.3.rst +++ b/doc/source/whatsnew/v2.2.3.rst @@ -1,6 +1,6 @@ .. _whatsnew_223: -What's new in 2.2.3 (September XX, 2024) +What's new in 2.2.3 (September 20, 2024) ---------------------------------------- These are the changes in pandas 2.2.3. See :ref:`release` for a full changelog @@ -9,28 +9,37 @@ including other versions of pandas. {{ header }} .. --------------------------------------------------------------------------- -.. _whatsnew_223.regressions: -Fixed regressions -~~~~~~~~~~~~~~~~~ -- +.. _whatsnew_220.py13_compat: + +Pandas 2.2.3 is now compatible with Python 3.13 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Pandas 2.2.3 is the first version of pandas that is generally compatible with the upcoming +Python 3.13, and both wheels for free-threaded and normal Python 3.13 will be uploaded for +this release. + +As usual please report any bugs discovered to our `issue tracker `_ .. --------------------------------------------------------------------------- .. _whatsnew_223.bug_fixes: Bug fixes ~~~~~~~~~ -- +- Bug in :func:`eval` on :class:`complex` including division ``/`` discards imaginary part. (:issue:`21374`) +- Minor fixes for numpy 2.1 compatibility. (:issue:`59444`) .. --------------------------------------------------------------------------- .. _whatsnew_223.other: Other ~~~~~ -- +- Missing licenses for 3rd party dependencies were added back into the wheels. (:issue:`58632`) .. --------------------------------------------------------------------------- .. _whatsnew_223.contributors: Contributors ~~~~~~~~~~~~ + +.. contributors:: v2.2.2..v2.2.3|HEAD diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 89a1c388b3ba1..c2a56afbc580e 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -693,7 +693,6 @@ Other ^^^^^ - Bug in :class:`DataFrame` when passing a ``dict`` with a NA scalar and ``columns`` that would always return ``np.nan`` (:issue:`57205`) - Bug in :func:`eval` on :class:`ExtensionArray` on including division ``/`` failed with a ``TypeError``. (:issue:`58748`) -- Bug in :func:`eval` on :class:`complex` including division ``/`` discards imaginary part. (:issue:`21374`) - Bug in :func:`eval` where the names of the :class:`Series` were not preserved when using ``engine="numexpr"``. 
(:issue:`10239`) - Bug in :func:`unique` on :class:`Index` not always returning :class:`Index` (:issue:`57043`) - Bug in :meth:`DataFrame.apply` where passing ``engine="numba"`` ignored ``args`` passed to the applied function (:issue:`58712`) diff --git a/pyproject.toml b/pyproject.toml index 5ffd9d9a5608c..d0fcdc4b21b33 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -156,7 +156,7 @@ test-command = """ pd.test(extra_args=["-m not clipboard and single_cpu and not slow and not network and not db", "--no-strict-data-files"]);' \ """ free-threaded-support = true -before-build = "bash {package}/scripts/cibw_before_build.sh" +before-build = "PACKAGE_DIR={package} bash {package}/scripts/cibw_before_build.sh" [tool.cibuildwheel.windows] before-build = "pip install delvewheel && bash {package}/scripts/cibw_before_build.sh" diff --git a/scripts/cibw_before_build.sh b/scripts/cibw_before_build.sh index 6186340807f8f..679b91e3280ec 100644 --- a/scripts/cibw_before_build.sh +++ b/scripts/cibw_before_build.sh @@ -1,3 +1,8 @@ +# Add 3rd party licenses, like numpy does +for file in $PACKAGE_DIR/LICENSES/*; do + cat $file >> $PACKAGE_DIR/LICENSE +done + # TODO: Delete when there's a PyPI Cython release that supports free-threaded Python 3.13. FREE_THREADED_BUILD="$(python -c"import sysconfig; print(bool(sysconfig.get_config_var('Py_GIL_DISABLED')))")" if [[ $FREE_THREADED_BUILD == "True" ]]; then From 2419343bfea5dba678146139ca9663d831c47b22 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Fri, 20 Sep 2024 07:39:20 -0400 Subject: [PATCH 057/224] BLD: Build wheels for Python 3.13 on aarch64 as well (#59847) * BLD: Build wheels for Python 3.13 on aarch64 as well * some fixups * another typo --- .circleci/config.yml | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 27b6829dcda70..9c986e5b1b054 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -92,7 +92,13 @@ jobs: no_output_timeout: 30m # Sometimes the tests won't generate any output, make sure the job doesn't get killed by that command: | pip3 install cibuildwheel==2.20.0 - cibuildwheel --output-dir wheelhouse + if [[ $CIBW_BUILD == cp313t* ]]; then + # TODO: temporarily run 3.13 free threaded builds without build isolation + # since we need pre-release cython + CIBW_BUILD_FRONTEND="pip; args: --no-build-isolation" cibuildwheel --output-dir wheelhouse + else + cibuildwheel --output-dir wheelhouse + fi environment: CIBW_BUILD: << parameters.cibw-build >> @@ -141,6 +147,10 @@ workflows: cibw-build: ["cp310-manylinux_aarch64", "cp311-manylinux_aarch64", "cp312-manylinux_aarch64", + "cp313-manylinux_aarch64", + "cp313t-manylinux_aarch64", "cp310-musllinux_aarch64", "cp311-musllinux_aarch64", - "cp312-musllinux_aarch64",] + "cp312-musllinux_aarch64", + "cp313-musllinux_aarch64", + "cp313t-musllinux_aarch64"] From 71b395f2cf513f7c4ef8b50c608072bf3950e596 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Sun, 22 Sep 2024 19:27:28 +0530 Subject: [PATCH 058/224] DOC: fix RT03 for pandas.core.groupby.DataFrameGroupBy.hist (#59870) --- ci/code_checks.sh | 1 - pandas/core/groupby/generic.py | 4 +++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index f2d9f582d8932..21104c2e00450 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -144,7 +144,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.core.groupby.DataFrameGroupBy.boxplot PR07,RT03,SA01" \ -i 
"pandas.core.groupby.DataFrameGroupBy.get_group RT03,SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.groups SA01" \ - -i "pandas.core.groupby.DataFrameGroupBy.hist RT03" \ -i "pandas.core.groupby.DataFrameGroupBy.indices SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.nth PR02" \ -i "pandas.core.groupby.DataFrameGroupBy.nunique SA01" \ diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index eae33ddc1df29..bec9d344d42e2 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -2694,7 +2694,9 @@ def hist( Returns ------- - matplotlib.Axes or numpy.ndarray of them + matplotlib.Axes or numpy.ndarray + A ``matplotlib.Axes`` object or an array of ``Axes`` objects, depending on + the layout and grouping. See Also -------- From 2cdb97e2f806d83965c7dee8fb5fcf164a340379 Mon Sep 17 00:00:00 2001 From: Fawaz Ahmed Date: Tue, 24 Sep 2024 06:21:39 +0530 Subject: [PATCH 059/224] BUG: Fix precision loss in read_json (#59284) --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/io/json/_json.py | 3 ++- pandas/tests/io/json/test_pandas.py | 12 ++++++++++++ 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index c2a56afbc580e..3b5183c43bcd0 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -630,6 +630,7 @@ I/O - Bug in :meth:`read_csv` where the order of the ``na_values`` makes an inconsistency when ``na_values`` is a list non-string values. (:issue:`59303`) - Bug in :meth:`read_excel` raising ``ValueError`` when passing array of boolean values when ``dtype="boolean"``. (:issue:`58159`) - Bug in :meth:`read_json` not validating the ``typ`` argument to not be exactly ``"frame"`` or ``"series"`` (:issue:`59124`) +- Bug in :meth:`read_json` where extreme value integers in string format were incorrectly parsed as a different integer number (:issue:`20608`) - Bug in :meth:`read_stata` raising ``KeyError`` when input file is stored in big-endian format and contains strL data. (:issue:`58638`) - Bug in :meth:`read_stata` where extreme value integers were incorrectly interpreted as missing for format versions 111 and prior (:issue:`58130`) - Bug in :meth:`read_stata` where the missing code for double was not recognised for format versions 105 and prior (:issue:`58149`) diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index d077b9e0c4568..e9c9f5ba225a5 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -1168,6 +1168,7 @@ def _try_convert_data( """ Try to parse a Series into a column by inferring dtype. 
""" + org_data = data # don't try to coerce, unless a force conversion if use_dtypes: if not self.dtype: @@ -1222,7 +1223,7 @@ def _try_convert_data( if len(data) and data.dtype in ("float", "object"): # coerce ints if we can try: - new_data = data.astype("int64") + new_data = org_data.astype("int64") if (new_data == data).all(): data = new_data converted = True diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 1c54232b8b510..d3328d1dfcaef 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -2286,3 +2286,15 @@ def test_read_json_lines_rangeindex(): result = read_json(StringIO(data), lines=True).index expected = RangeIndex(2) tm.assert_index_equal(result, expected, exact=True) + + +def test_large_number(): + # GH#20608 + result = read_json( + StringIO('["9999999999999999"]'), + orient="values", + typ="series", + convert_dates=False, + ) + expected = Series([9999999999999999]) + tm.assert_series_equal(result, expected) From dc24410c0fbbfff2b191247dc7dc963cc92c0321 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Wed, 25 Sep 2024 23:15:06 +0530 Subject: [PATCH 060/224] DOC: fix SA01 for pandas.api.types.is_int64_dtype (#59862) --- ci/code_checks.sh | 1 - pandas/core/dtypes/common.py | 10 ++++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 21104c2e00450..3d31781b886ab 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -117,7 +117,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.api.types.is_float PR01,SA01" \ -i "pandas.api.types.is_float_dtype SA01" \ -i "pandas.api.types.is_hashable PR01,RT03,SA01" \ - -i "pandas.api.types.is_int64_dtype SA01" \ -i "pandas.api.types.is_integer PR01,SA01" \ -i "pandas.api.types.is_interval_dtype SA01" \ -i "pandas.api.types.is_iterator PR07,SA01" \ diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index ff855f97a352b..0252927241ef4 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -886,6 +886,16 @@ def is_int64_dtype(arr_or_dtype) -> bool: boolean Whether or not the array or dtype is of the int64 dtype. + See Also + -------- + api.types.is_float_dtype : Check whether the provided array or dtype is of a + float dtype. + api.types.is_bool_dtype : Check whether the provided array or dtype is of a + boolean dtype. + api.types.is_object_dtype : Check whether an array-like or dtype is of the + object dtype. + numpy.int64 : Numpy's 64-bit integer type. 
+ Notes ----- Depending on system architecture, the return value of `is_int64_dtype( From b91be12f8854d87e0f1c6cf9e2db7a5e68983be1 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Wed, 25 Sep 2024 23:16:03 +0530 Subject: [PATCH 061/224] DOC: fix SA01, ES01 for pandas.api.types.is_float_dtype (#59861) --- ci/code_checks.sh | 1 - pandas/core/dtypes/common.py | 12 ++++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 3d31781b886ab..119c6e2b33684 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -115,7 +115,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.api.types.is_dict_like PR07,SA01" \ -i "pandas.api.types.is_file_like PR07,SA01" \ -i "pandas.api.types.is_float PR01,SA01" \ - -i "pandas.api.types.is_float_dtype SA01" \ -i "pandas.api.types.is_hashable PR01,RT03,SA01" \ -i "pandas.api.types.is_integer PR01,SA01" \ -i "pandas.api.types.is_interval_dtype SA01" \ diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 0252927241ef4..48d2106aff124 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -1285,6 +1285,9 @@ def is_float_dtype(arr_or_dtype) -> bool: """ Check whether the provided array or dtype is of a float dtype. + The function checks for floating-point data types, which represent real numbers + that may have fractional components. + Parameters ---------- arr_or_dtype : array-like or dtype @@ -1295,6 +1298,15 @@ def is_float_dtype(arr_or_dtype) -> bool: boolean Whether or not the array or dtype is of a float dtype. + See Also + -------- + api.types.is_numeric_dtype : Check whether the provided array or dtype is of + a numeric dtype. + api.types.is_integer_dtype : Check whether the provided array or dtype is of + an integer dtype. + api.types.is_object_dtype : Check whether an array-like or dtype is of the + object dtype. + Examples -------- >>> from pandas.api.types import is_float_dtype From b81ed16389385ad1272e94d2796db31ce8ccbafd Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Wed, 25 Sep 2024 23:19:34 +0530 Subject: [PATCH 062/224] DOC: fix SA01, ES01 for pandas.Series.sparse.sp_values (#59859) --- ci/code_checks.sh | 1 - pandas/core/arrays/sparse/array.py | 12 ++++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 119c6e2b33684..42955a6476734 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -100,7 +100,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Series.sparse.fill_value SA01" \ -i "pandas.Series.sparse.from_coo PR07,SA01" \ -i "pandas.Series.sparse.npoints SA01" \ - -i "pandas.Series.sparse.sp_values SA01" \ -i "pandas.Timedelta.max PR02" \ -i "pandas.Timedelta.min PR02" \ -i "pandas.Timedelta.resolution PR02" \ diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index a09dc20af3b36..40012357f40cd 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -603,6 +603,18 @@ def sp_values(self) -> np.ndarray: """ An ndarray containing the non- ``fill_value`` values. + This property returns the actual data values stored in the sparse + representation, excluding the values that are equal to the ``fill_value``. + The result is an ndarray of the underlying values, preserving the sparse + structure by omitting the default ``fill_value`` entries. + + See Also + -------- + Series.sparse.to_dense : Convert a Series from sparse values to dense. 
+ Series.sparse.fill_value : Elements in `data` that are `fill_value` are + not stored. + Series.sparse.density : The percent of non- ``fill_value`` points, as decimal. + Examples -------- >>> from pandas.arrays import SparseArray From 7cebd7822ba0598f53fdd6dd8141c66b949c9023 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Wed, 25 Sep 2024 23:22:11 +0530 Subject: [PATCH 063/224] DOC: fix SA01 for pandas.Series.sparse.fill_value (#59858) --- ci/code_checks.sh | 1 - pandas/core/arrays/sparse/array.py | 6 ++++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 42955a6476734..e0d6efa0278e4 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -97,7 +97,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Series.dt.tz_localize PR01,PR02" \ -i "pandas.Series.dt.unit GL08" \ -i "pandas.Series.pad PR01,SA01" \ - -i "pandas.Series.sparse.fill_value SA01" \ -i "pandas.Series.sparse.from_coo PR07,SA01" \ -i "pandas.Series.sparse.npoints SA01" \ -i "pandas.Timedelta.max PR02" \ diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 40012357f40cd..c8ec4068ca199 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -635,6 +635,12 @@ def fill_value(self): For memory savings, this should be the most common value in the array. + See Also + -------- + SparseDtype : Dtype for data stored in :class:`SparseArray`. + Series.value_counts : Return a Series containing counts of unique values. + Series.fillna : Fill NA/NaN in a Series with a specified value. + Examples -------- >>> ser = pd.Series([0, 0, 2, 2, 2], dtype="Sparse[int]") From 5b6997ca14187b31a87490b9e61e3af4cbdda6d7 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Wed, 25 Sep 2024 23:24:42 +0530 Subject: [PATCH 064/224] DOC: fix SA01, ES01 for pandas.tseries.offsets.SemiMonthEnd (#59856) DOC: fix SA01 for pandas.tseries.offsets.SemiMonthEnd --- ci/code_checks.sh | 1 - pandas/_libs/tslibs/offsets.pyx | 12 ++++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index e0d6efa0278e4..7cc314007aabd 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -364,7 +364,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.tseries.offsets.SemiMonthBegin.n GL08" \ -i "pandas.tseries.offsets.SemiMonthBegin.normalize GL08" \ -i "pandas.tseries.offsets.SemiMonthBegin.rule_code GL08" \ - -i "pandas.tseries.offsets.SemiMonthEnd SA01" \ -i "pandas.tseries.offsets.SemiMonthEnd.day_of_month GL08" \ -i "pandas.tseries.offsets.SemiMonthEnd.is_on_offset GL08" \ -i "pandas.tseries.offsets.SemiMonthEnd.n GL08" \ diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 4fa1af0ec882c..4db96fbaa3aad 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -3316,6 +3316,11 @@ cdef class SemiMonthEnd(SemiMonthOffset): """ Two DateOffset's per month repeating on the last day of the month & day_of_month. + This offset allows for flexibility in generating date ranges or adjusting dates + to the end of a month or a specific day in the month, such as the 15th or the last + day of the month. It is useful for financial or scheduling applications where + events occur bi-monthly. + Attributes ---------- n : int, default 1 @@ -3325,6 +3330,13 @@ cdef class SemiMonthEnd(SemiMonthOffset): day_of_month : int, {1, 3,...,27}, default 15 A specific integer for the day of the month. 
+ See Also + -------- + tseries.offsets.SemiMonthBegin : Offset for semi-monthly frequencies, starting at + the beginning of the month. + tseries.offsets.MonthEnd : Offset to the last calendar day of the month. + tseries.offsets.MonthBegin : Offset to the first calendar day of the month. + Examples -------- >>> ts = pd.Timestamp(2022, 1, 14) From a9e30c5f62d080aea7629ca17cf1e9c0e8c3e080 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 25 Sep 2024 19:57:49 +0200 Subject: [PATCH 065/224] String dtype: map builtin str alias to StringDtype (#59685) * String dtype: map builtin str alias to StringDtype * fix tests * fix datetimelike astype and more tests * remove xfails * try fix typing * fix copy_view tests * fix remaining tests with infer_string enabled * ignore typing issue for now * move to common.py * simplify Categorical._str_get_dummies * small cleanup * fix ensure_string_array to not modify extension arrays inplace * fix ensure_string_array once more + fix is_extension_array_dtype for str * still xfail TestArrowArray::test_astype_str when not using infer_string * ensure maybe_convert_objects copies object dtype input array when inferring StringDtype * update test_1d_object_array_does_not_copy test * update constructor copy test + do not copy in maybe_convert_objects? * skip str.get_dummies test for now * use pandas_dtype() instead of registry.find * fix corner cases for calling pandas_dtype * add TODO comment in ensure_string_array --- pandas/_libs/lib.pyx | 9 +++- pandas/_testing/__init__.py | 2 +- pandas/core/arrays/categorical.py | 4 +- pandas/core/arrays/datetimelike.py | 10 ++++- pandas/core/dtypes/common.py | 18 +++++++- pandas/core/indexes/base.py | 6 ++- pandas/core/indexes/interval.py | 3 +- pandas/tests/arrays/floating/test_astype.py | 6 +-- pandas/tests/arrays/integer/test_dtypes.py | 6 +-- pandas/tests/arrays/sparse/test_astype.py | 4 +- pandas/tests/arrays/sparse/test_dtype.py | 2 +- pandas/tests/dtypes/test_common.py | 12 ++++++ pandas/tests/extension/base/casting.py | 4 +- pandas/tests/extension/json/array.py | 3 +- pandas/tests/extension/test_arrow.py | 29 +++---------- pandas/tests/frame/methods/test_astype.py | 17 ++++---- .../tests/frame/methods/test_select_dtypes.py | 5 ++- pandas/tests/frame/test_constructors.py | 41 +++++++++++++++---- .../indexes/datetimes/methods/test_astype.py | 15 ++++--- pandas/tests/indexes/object/test_astype.py | 4 +- .../indexes/period/methods/test_astype.py | 9 +++- .../indexes/timedeltas/methods/test_astype.py | 9 +++- pandas/tests/interchange/test_impl.py | 1 + pandas/tests/io/excel/test_readers.py | 6 +-- .../io/parser/dtypes/test_dtypes_basic.py | 17 ++++---- pandas/tests/io/parser/test_na_values.py | 2 - .../io/parser/test_python_parser_only.py | 6 +-- pandas/tests/series/methods/test_astype.py | 30 ++++++++------ pandas/tests/series/methods/test_map.py | 4 +- pandas/tests/series/test_constructors.py | 2 +- pandas/tests/strings/test_get_dummies.py | 3 ++ pandas/tests/test_algos.py | 7 +++- 32 files changed, 185 insertions(+), 111 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 3f2dfbfb3b404..8af48a861967a 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -754,7 +754,14 @@ cpdef ndarray[object] ensure_string_array( if hasattr(arr, "to_numpy"): - if hasattr(arr, "dtype") and arr.dtype.kind in "mM": + if ( + hasattr(arr, "dtype") + and arr.dtype.kind in "mM" + # TODO: we should add a custom ArrowExtensionArray.astype implementation + # that handles astype(str) specifically, avoiding 
ending up here and + # then we can remove the below check for `_pa_array` (for ArrowEA) + and not hasattr(arr, "_pa_array") + ): # dtype check to exclude DataFrame # GH#41409 TODO: not a great place for this out = arr.astype(str).astype(object) diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index 5fa1a984b8aea..0be01da1816a2 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -108,7 +108,7 @@ COMPLEX_DTYPES: list[Dtype] = [complex, "complex64", "complex128"] if using_string_dtype(): - STRING_DTYPES: list[Dtype] = [str, "U"] + STRING_DTYPES: list[Dtype] = ["U"] else: STRING_DTYPES: list[Dtype] = [str, "str", "U"] # type: ignore[no-redef] COMPLEX_FLOAT_DTYPES: list[Dtype] = [*COMPLEX_DTYPES, *FLOAT_NUMPY_DTYPES] diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 8e0225b31e17b..a69e197df851d 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2685,7 +2685,9 @@ def _str_get_dummies(self, sep: str = "|", dtype: NpDtype | None = None): # sep may not be in categories. Just bail on this. from pandas.core.arrays import NumpyExtensionArray - return NumpyExtensionArray(self.astype(str))._str_get_dummies(sep, dtype) + return NumpyExtensionArray(self.to_numpy(str, na_value="NaN"))._str_get_dummies( + sep, dtype + ) # ------------------------------------------------------------------------ # GroupBy Methods diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index fbe1677b95b33..7be8daa09c758 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -471,10 +471,16 @@ def astype(self, dtype, copy: bool = True): return self._box_values(self.asi8.ravel()).reshape(self.shape) + elif is_string_dtype(dtype): + if isinstance(dtype, ExtensionDtype): + arr_object = self._format_native_types(na_rep=dtype.na_value) # type: ignore[arg-type] + cls = dtype.construct_array_type() + return cls._from_sequence(arr_object, dtype=dtype, copy=False) + else: + return self._format_native_types() + elif isinstance(dtype, ExtensionDtype): return super().astype(dtype, copy=copy) - elif is_string_dtype(dtype): - return self._format_native_types() elif dtype.kind in "iu": # we deliberately ignore int32 vs. int64 here. # See https://github.com/pandas-dev/pandas/issues/24381 for more. diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 48d2106aff124..1a38bb03b2c1c 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -12,6 +12,8 @@ import numpy as np +from pandas._config import using_string_dtype + from pandas._libs import ( Interval, Period, @@ -1470,7 +1472,15 @@ def is_extension_array_dtype(arr_or_dtype) -> bool: elif isinstance(dtype, np.dtype): return False else: - return registry.find(dtype) is not None + try: + with warnings.catch_warnings(): + # pandas_dtype(..) can raise UserWarning for class input + warnings.simplefilter("ignore", UserWarning) + dtype = pandas_dtype(dtype) + except (TypeError, ValueError): + # np.dtype(..) 
can raise ValueError + return False + return isinstance(dtype, ExtensionDtype) def is_ea_or_datetimelike_dtype(dtype: DtypeObj | None) -> bool: @@ -1773,6 +1783,12 @@ def pandas_dtype(dtype) -> DtypeObj: elif isinstance(dtype, (np.dtype, ExtensionDtype)): return dtype + # builtin aliases + if dtype is str and using_string_dtype(): + from pandas.core.arrays.string_ import StringDtype + + return StringDtype(na_value=np.nan) + # registered extension types result = registry.find(dtype) if result is not None: diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 2346c20004210..852049804a4f5 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -6262,7 +6262,11 @@ def _should_compare(self, other: Index) -> bool: return False dtype = _unpack_nested_dtype(other) - return self._is_comparable_dtype(dtype) or is_object_dtype(dtype) + return ( + self._is_comparable_dtype(dtype) + or is_object_dtype(dtype) + or is_string_dtype(dtype) + ) def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: """ diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 359cdf880937b..8feac890883eb 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -51,6 +51,7 @@ is_number, is_object_dtype, is_scalar, + is_string_dtype, pandas_dtype, ) from pandas.core.dtypes.dtypes import ( @@ -712,7 +713,7 @@ def _get_indexer( # left/right get_indexer, compare elementwise, equality -> match indexer = self._get_indexer_unique_sides(target) - elif not is_object_dtype(target.dtype): + elif not (is_object_dtype(target.dtype) or is_string_dtype(target.dtype)): # homogeneous scalar index: use IntervalTree # we should always have self._should_partial_index(target) here target = self._maybe_convert_i8(target) diff --git a/pandas/tests/arrays/floating/test_astype.py b/pandas/tests/arrays/floating/test_astype.py index ccf644b34051d..752ebe194ffcf 100644 --- a/pandas/tests/arrays/floating/test_astype.py +++ b/pandas/tests/arrays/floating/test_astype.py @@ -68,11 +68,9 @@ def test_astype_str(using_infer_string): if using_infer_string: expected = pd.array(["0.1", "0.2", None], dtype=pd.StringDtype(na_value=np.nan)) - tm.assert_extension_array_equal(a.astype("str"), expected) - # TODO(infer_string) this should also be a string array like above - expected = np.array(["0.1", "0.2", ""], dtype="U32") - tm.assert_numpy_array_equal(a.astype(str), expected) + tm.assert_extension_array_equal(a.astype(str), expected) + tm.assert_extension_array_equal(a.astype("str"), expected) else: expected = np.array(["0.1", "0.2", ""], dtype="U32") diff --git a/pandas/tests/arrays/integer/test_dtypes.py b/pandas/tests/arrays/integer/test_dtypes.py index fadd7ac67b58d..7972ba7b9fb0f 100644 --- a/pandas/tests/arrays/integer/test_dtypes.py +++ b/pandas/tests/arrays/integer/test_dtypes.py @@ -281,11 +281,9 @@ def test_astype_str(using_infer_string): if using_infer_string: expected = pd.array(["1", "2", None], dtype=pd.StringDtype(na_value=np.nan)) - tm.assert_extension_array_equal(a.astype("str"), expected) - # TODO(infer_string) this should also be a string array like above - expected = np.array(["1", "2", ""], dtype=f"{tm.ENDIAN}U21") - tm.assert_numpy_array_equal(a.astype(str), expected) + tm.assert_extension_array_equal(a.astype(str), expected) + tm.assert_extension_array_equal(a.astype("str"), expected) else: expected = np.array(["1", "2", ""], dtype=f"{tm.ENDIAN}U21") diff --git a/pandas/tests/arrays/sparse/test_astype.py 
b/pandas/tests/arrays/sparse/test_astype.py index 83a507e679d46..e6e4a11a0f5ab 100644 --- a/pandas/tests/arrays/sparse/test_astype.py +++ b/pandas/tests/arrays/sparse/test_astype.py @@ -81,8 +81,8 @@ def test_astype_all(self, any_real_numpy_dtype): ), ( SparseArray([0, 1, 10]), - str, - SparseArray(["0", "1", "10"], dtype=SparseDtype(str, "0")), + np.str_, + SparseArray(["0", "1", "10"], dtype=SparseDtype(np.str_, "0")), ), (SparseArray(["10", "20"]), float, SparseArray([10.0, 20.0])), ( diff --git a/pandas/tests/arrays/sparse/test_dtype.py b/pandas/tests/arrays/sparse/test_dtype.py index 1819744d9a9ae..6143163735ab8 100644 --- a/pandas/tests/arrays/sparse/test_dtype.py +++ b/pandas/tests/arrays/sparse/test_dtype.py @@ -184,7 +184,7 @@ def test_construct_from_string_fill_value_raises(string): [ (SparseDtype(int, 0), float, SparseDtype(float, 0.0)), (SparseDtype(int, 1), float, SparseDtype(float, 1.0)), - (SparseDtype(int, 1), str, SparseDtype(object, "1")), + (SparseDtype(int, 1), np.str_, SparseDtype(object, "1")), (SparseDtype(float, 1.5), int, SparseDtype(int, 1)), ], ) diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index 2c2dff7a957fe..e338fb1331734 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -810,11 +810,23 @@ def test_pandas_dtype_string_dtypes(string_storage): "pyarrow" if HAS_PYARROW else "python", na_value=np.nan ) + with pd.option_context("future.infer_string", True): + # with the default string_storage setting + result = pandas_dtype(str) + assert result == pd.StringDtype( + "pyarrow" if HAS_PYARROW else "python", na_value=np.nan + ) + with pd.option_context("future.infer_string", True): with pd.option_context("string_storage", string_storage): result = pandas_dtype("str") assert result == pd.StringDtype(string_storage, na_value=np.nan) + with pd.option_context("future.infer_string", True): + with pd.option_context("string_storage", string_storage): + result = pandas_dtype(str) + assert result == pd.StringDtype(string_storage, na_value=np.nan) + with pd.option_context("future.infer_string", False): with pd.option_context("string_storage", string_storage): result = pandas_dtype("str") diff --git a/pandas/tests/extension/base/casting.py b/pandas/tests/extension/base/casting.py index e924e38ee5030..8e3f21e1a4f56 100644 --- a/pandas/tests/extension/base/casting.py +++ b/pandas/tests/extension/base/casting.py @@ -44,8 +44,8 @@ def test_tolist(self, data): assert result == expected def test_astype_str(self, data): - result = pd.Series(data[:5]).astype(str) - expected = pd.Series([str(x) for x in data[:5]], dtype=str) + result = pd.Series(data[:2]).astype(str) + expected = pd.Series([str(x) for x in data[:2]], dtype=str) tm.assert_series_equal(result, expected) @pytest.mark.parametrize( diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py index 3a4391edc99ef..4fa48023fbc95 100644 --- a/pandas/tests/extension/json/array.py +++ b/pandas/tests/extension/json/array.py @@ -208,9 +208,8 @@ def astype(self, dtype, copy=True): return self.copy() return self elif isinstance(dtype, StringDtype): - value = self.astype(str) # numpy doesn't like nested dicts arr_cls = dtype.construct_array_type() - return arr_cls._from_sequence(value, dtype=dtype, copy=False) + return arr_cls._from_sequence(self, dtype=dtype, copy=False) elif not copy: return np.asarray([dict(x) for x in self], dtype=dtype) else: diff --git a/pandas/tests/extension/test_arrow.py 
b/pandas/tests/extension/test_arrow.py index f86d927ddda67..f56094dfd47ca 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -43,7 +43,6 @@ pa_version_under13p0, pa_version_under14p0, ) -import pandas.util._test_decorators as td from pandas.core.dtypes.dtypes import ( ArrowDtype, @@ -292,7 +291,7 @@ def test_map(self, data_missing, na_action): expected = data_missing.to_numpy() tm.assert_numpy_array_equal(result, expected) - def test_astype_str(self, data, request): + def test_astype_str(self, data, request, using_infer_string): pa_dtype = data.dtype.pyarrow_dtype if pa.types.is_binary(pa_dtype): request.applymarker( @@ -300,9 +299,10 @@ def test_astype_str(self, data, request): reason=f"For {pa_dtype} .astype(str) decodes.", ) ) - elif ( - pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is None - ) or pa.types.is_duration(pa_dtype): + elif not using_infer_string and ( + (pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is None) + or pa.types.is_duration(pa_dtype) + ): request.applymarker( pytest.mark.xfail( reason="pd.Timestamp/pd.Timedelta repr different from numpy repr", @@ -310,25 +310,6 @@ def test_astype_str(self, data, request): ) super().test_astype_str(data) - @pytest.mark.parametrize( - "nullable_string_dtype", - [ - "string[python]", - pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")), - ], - ) - def test_astype_string(self, data, nullable_string_dtype, request): - pa_dtype = data.dtype.pyarrow_dtype - if ( - pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is None - ) or pa.types.is_duration(pa_dtype): - request.applymarker( - pytest.mark.xfail( - reason="pd.Timestamp/pd.Timedelta repr different from numpy repr", - ) - ) - super().test_astype_string(data, nullable_string_dtype) - def test_from_dtype(self, data, request): pa_dtype = data.dtype.pyarrow_dtype if pa.types.is_string(pa_dtype) or pa.types.is_decimal(pa_dtype): diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py index 8647df0e8ad96..ab3743283ea13 100644 --- a/pandas/tests/frame/methods/test_astype.py +++ b/pandas/tests/frame/methods/test_astype.py @@ -168,21 +168,21 @@ def test_astype_str(self): "d": list(map(str, d._values)), "e": list(map(str, e._values)), }, - dtype="object", + dtype="str", ) tm.assert_frame_equal(result, expected) - def test_astype_str_float(self): + def test_astype_str_float(self, using_infer_string): # see GH#11302 result = DataFrame([np.nan]).astype(str) - expected = DataFrame(["nan"], dtype="object") + expected = DataFrame([np.nan if using_infer_string else "nan"], dtype="str") tm.assert_frame_equal(result, expected) result = DataFrame([1.12345678901234567890]).astype(str) val = "1.1234567890123457" - expected = DataFrame([val], dtype="object") + expected = DataFrame([val], dtype="str") tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("dtype_class", [dict, Series]) @@ -284,7 +284,7 @@ def test_astype_duplicate_col_series_arg(self): result = df.astype(dtypes) expected = DataFrame( { - 0: Series(vals[:, 0].astype(str), dtype=object), + 0: Series(vals[:, 0].astype(str), dtype="str"), 1: vals[:, 1], 2: pd.array(vals[:, 2], dtype="Float64"), 3: vals[:, 3], @@ -647,9 +647,10 @@ def test_astype_dt64tz(self, timezone_frame): # dt64tz->dt64 deprecated timezone_frame.astype("datetime64[ns]") - def test_astype_dt64tz_to_str(self, timezone_frame): + def test_astype_dt64tz_to_str(self, timezone_frame, using_infer_string): # str formatting result = timezone_frame.astype(str) + na_value = 
np.nan if using_infer_string else "NaT" expected = DataFrame( [ [ @@ -657,7 +658,7 @@ def test_astype_dt64tz_to_str(self, timezone_frame): "2013-01-01 00:00:00-05:00", "2013-01-01 00:00:00+01:00", ], - ["2013-01-02", "NaT", "NaT"], + ["2013-01-02", na_value, na_value], [ "2013-01-03", "2013-01-03 00:00:00-05:00", @@ -665,7 +666,7 @@ def test_astype_dt64tz_to_str(self, timezone_frame): ], ], columns=timezone_frame.columns, - dtype="object", + dtype="str", ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_select_dtypes.py b/pandas/tests/frame/methods/test_select_dtypes.py index 875dca321635f..0354e9df3d168 100644 --- a/pandas/tests/frame/methods/test_select_dtypes.py +++ b/pandas/tests/frame/methods/test_select_dtypes.py @@ -99,6 +99,9 @@ def test_select_dtypes_include_using_list_like(self, using_infer_string): ei = df[["a"]] tm.assert_frame_equal(ri, ei) + ri = df.select_dtypes(include=[str]) + tm.assert_frame_equal(ri, ei) + def test_select_dtypes_exclude_using_list_like(self): df = DataFrame( { @@ -358,7 +361,7 @@ def test_select_dtypes_datetime_with_tz(self): @pytest.mark.parametrize("dtype", [str, "str", np.bytes_, "S1", np.str_, "U1"]) @pytest.mark.parametrize("arg", ["include", "exclude"]) def test_select_dtypes_str_raises(self, dtype, arg, using_infer_string): - if using_infer_string and dtype == "str": + if using_infer_string and (dtype == "str" or dtype is str): # this is tested below pytest.skip("Selecting string columns works with future strings") df = DataFrame( diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 3d46e03547c38..0a924aa393be5 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -24,7 +24,6 @@ from pandas._config import using_string_dtype from pandas._libs import lib -from pandas.compat import HAS_PYARROW from pandas.compat.numpy import np_version_gt2 from pandas.errors import IntCastingNaNError @@ -82,7 +81,7 @@ def test_constructor_from_ndarray_with_str_dtype(self): # with an array of strings each of which is e.g. 
"[0 1 2]" arr = np.arange(12).reshape(4, 3) df = DataFrame(arr, dtype=str) - expected = DataFrame(arr.astype(str), dtype=object) + expected = DataFrame(arr.astype(str), dtype="str") tm.assert_frame_equal(df, expected) def test_constructor_from_2d_datetimearray(self): @@ -300,18 +299,38 @@ def test_constructor_dtype_nocast_view_2d_array(self): df2 = DataFrame(df.values, dtype=df[0].dtype) assert df2._mgr.blocks[0].values.flags.c_contiguous - @pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="conversion copies") - def test_1d_object_array_does_not_copy(self): + def test_1d_object_array_does_not_copy(self, using_infer_string): # https://github.com/pandas-dev/pandas/issues/39272 arr = np.array(["a", "b"], dtype="object") df = DataFrame(arr, copy=False) + if using_infer_string: + if df[0].dtype.storage == "pyarrow": + # object dtype strings are converted to arrow memory, + # no numpy arrays to compare + pass + else: + assert np.shares_memory(df[0].to_numpy(), arr) + else: + assert np.shares_memory(df.values, arr) + + df = DataFrame(arr, dtype=object, copy=False) assert np.shares_memory(df.values, arr) - @pytest.mark.xfail(using_string_dtype(), reason="conversion copies") - def test_2d_object_array_does_not_copy(self): + def test_2d_object_array_does_not_copy(self, using_infer_string): # https://github.com/pandas-dev/pandas/issues/39272 arr = np.array([["a", "b"], ["c", "d"]], dtype="object") df = DataFrame(arr, copy=False) + if using_infer_string: + if df[0].dtype.storage == "pyarrow": + # object dtype strings are converted to arrow memory, + # no numpy arrays to compare + pass + else: + assert np.shares_memory(df[0].to_numpy(), arr) + else: + assert np.shares_memory(df.values, arr) + + df = DataFrame(arr, dtype=object, copy=False) assert np.shares_memory(df.values, arr) def test_constructor_dtype_list_data(self): @@ -1766,12 +1785,18 @@ def test_constructor_column_duplicates(self): tm.assert_frame_equal(idf, edf) - def test_constructor_empty_with_string_dtype(self): + def test_constructor_empty_with_string_dtype(self, using_infer_string): # GH 9428 expected = DataFrame(index=[0, 1], columns=[0, 1], dtype=object) + expected_str = DataFrame( + index=[0, 1], columns=[0, 1], dtype=pd.StringDtype(na_value=np.nan) + ) df = DataFrame(index=[0, 1], columns=[0, 1], dtype=str) - tm.assert_frame_equal(df, expected) + if using_infer_string: + tm.assert_frame_equal(df, expected_str) + else: + tm.assert_frame_equal(df, expected) df = DataFrame(index=[0, 1], columns=[0, 1], dtype=np.str_) tm.assert_frame_equal(df, expected) df = DataFrame(index=[0, 1], columns=[0, 1], dtype="U5") diff --git a/pandas/tests/indexes/datetimes/methods/test_astype.py b/pandas/tests/indexes/datetimes/methods/test_astype.py index 81dc3b3ecc45e..62be8903da206 100644 --- a/pandas/tests/indexes/datetimes/methods/test_astype.py +++ b/pandas/tests/indexes/datetimes/methods/test_astype.py @@ -101,13 +101,16 @@ def test_astype_tznaive_to_tzaware(self): # dt64->dt64tz deprecated idx._data.astype("datetime64[ns, US/Eastern]") - def test_astype_str_nat(self): + def test_astype_str_nat(self, using_infer_string): # GH 13149, GH 13209 # verify that we are returning NaT as a string (and not unicode) idx = DatetimeIndex(["2016-05-16", "NaT", NaT, np.nan]) result = idx.astype(str) - expected = Index(["2016-05-16", "NaT", "NaT", "NaT"], dtype=object) + if using_infer_string: + expected = Index(["2016-05-16", None, None, None], dtype="str") + else: + expected = Index(["2016-05-16", "NaT", "NaT", "NaT"], dtype=object) 
tm.assert_index_equal(result, expected) def test_astype_str(self): @@ -117,7 +120,7 @@ def test_astype_str(self): expected = Index( ["2012-01-01", "2012-01-02", "2012-01-03", "2012-01-04"], name="test_name", - dtype=object, + dtype="str", ) tm.assert_index_equal(result, expected) @@ -132,7 +135,7 @@ def test_astype_str_tz_and_name(self): "2012-01-03 00:00:00-05:00", ], name="test_name", - dtype=object, + dtype="str", ) tm.assert_index_equal(result, expected) @@ -143,7 +146,7 @@ def test_astype_str_freq_and_name(self): expected = Index( ["2011-01-01 00:00:00", "2011-01-01 01:00:00", "2011-01-01 02:00:00"], name="test_name", - dtype=object, + dtype="str", ) tm.assert_index_equal(result, expected) @@ -155,7 +158,7 @@ def test_astype_str_freq_and_tz(self): result = dti.astype(str) expected = Index( ["2012-03-06 00:00:00+00:00", "2012-03-06 01:00:00+00:00"], - dtype=object, + dtype="str", name="test_name", ) tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/object/test_astype.py b/pandas/tests/indexes/object/test_astype.py index 9c1ef302c5b51..ce05b5e9f2238 100644 --- a/pandas/tests/indexes/object/test_astype.py +++ b/pandas/tests/indexes/object/test_astype.py @@ -15,12 +15,12 @@ def test_astype_str_from_bytes(): # ensure_string_array which does f"{val}" idx = Index(["あ", b"a"], dtype="object") result = idx.astype(str) - expected = Index(["あ", "a"], dtype="object") + expected = Index(["あ", "a"], dtype="str") tm.assert_index_equal(result, expected) # while we're here, check that Series.astype behaves the same result = Series(idx).astype(str) - expected = Series(expected, dtype=object) + expected = Series(expected, dtype="str") tm.assert_series_equal(result, expected) diff --git a/pandas/tests/indexes/period/methods/test_astype.py b/pandas/tests/indexes/period/methods/test_astype.py index d545bfd2fae0f..af3c2667f51b4 100644 --- a/pandas/tests/indexes/period/methods/test_astype.py +++ b/pandas/tests/indexes/period/methods/test_astype.py @@ -22,7 +22,7 @@ def test_astype_raises(self, dtype): with pytest.raises(TypeError, match=msg): idx.astype(dtype) - def test_astype_conversion(self): + def test_astype_conversion(self, using_infer_string): # GH#13149, GH#13209 idx = PeriodIndex(["2016-05-16", "NaT", NaT, np.nan], freq="D", name="idx") @@ -41,7 +41,12 @@ def test_astype_conversion(self): tm.assert_index_equal(result, expected) result = idx.astype(str) - expected = Index([str(x) for x in idx], name="idx", dtype=object) + if using_infer_string: + expected = Index( + [str(x) if x is not NaT else None for x in idx], name="idx", dtype="str" + ) + else: + expected = Index([str(x) for x in idx], name="idx", dtype=object) tm.assert_index_equal(result, expected) idx = period_range("1990", "2009", freq="Y", name="idx") diff --git a/pandas/tests/indexes/timedeltas/methods/test_astype.py b/pandas/tests/indexes/timedeltas/methods/test_astype.py index 311f2b5c9aa59..5166cadae499e 100644 --- a/pandas/tests/indexes/timedeltas/methods/test_astype.py +++ b/pandas/tests/indexes/timedeltas/methods/test_astype.py @@ -44,7 +44,7 @@ def test_astype_object_with_nat(self): tm.assert_index_equal(result, expected) assert idx.tolist() == expected_list - def test_astype(self): + def test_astype(self, using_infer_string): # GH 13149, GH 13209 idx = TimedeltaIndex([1e14, "NaT", NaT, np.nan], name="idx") @@ -61,7 +61,12 @@ def test_astype(self): tm.assert_index_equal(result, expected) result = idx.astype(str) - expected = Index([str(x) for x in idx], name="idx", dtype=object) + if using_infer_string: + 
expected = Index( + [str(x) if x is not NaT else None for x in idx], name="idx", dtype="str" + ) + else: + expected = Index([str(x) for x in idx], name="idx", dtype=object) tm.assert_index_equal(result, expected) rng = timedelta_range("1 days", periods=10) diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py index 38961345dc1f2..29ce9d0c03111 100644 --- a/pandas/tests/interchange/test_impl.py +++ b/pandas/tests/interchange/test_impl.py @@ -401,6 +401,7 @@ def test_interchange_from_corrected_buffer_dtypes(monkeypatch) -> None: pd.api.interchange.from_dataframe(df) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_empty_string_column(): # https://github.com/pandas-dev/pandas/issues/56703 df = pd.DataFrame({"a": []}, dtype=str) diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index b831ec3bb2c6a..3989e022dbbd2 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -587,7 +587,7 @@ def test_reader_dtype(self, read_ext): expected["a"] = expected["a"].astype("float64") expected["b"] = expected["b"].astype("float32") - expected["c"] = Series(["001", "002", "003", "004"], dtype=object) + expected["c"] = Series(["001", "002", "003", "004"], dtype="str") tm.assert_frame_equal(actual, expected) msg = "Unable to convert column d to type int64" @@ -611,8 +611,8 @@ def test_reader_dtype(self, read_ext): { "a": Series([1, 2, 3, 4], dtype="float64"), "b": Series([2.5, 3.5, 4.5, 5.5], dtype="float32"), - "c": Series(["001", "002", "003", "004"], dtype=object), - "d": Series(["1", "2", np.nan, "4"], dtype=object), + "c": Series(["001", "002", "003", "004"], dtype="str"), + "d": Series(["1", "2", np.nan, "4"], dtype="str"), }, ), ], diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index b664423364f6b..e02562ac8d93d 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -31,7 +31,7 @@ @pytest.mark.parametrize("dtype", [str, object]) @pytest.mark.parametrize("check_orig", [True, False]) @pytest.mark.usefixtures("pyarrow_xfail") -def test_dtype_all_columns(all_parsers, dtype, check_orig): +def test_dtype_all_columns(all_parsers, dtype, check_orig, using_infer_string): # see gh-3795, gh-6607 parser = all_parsers @@ -49,8 +49,10 @@ def test_dtype_all_columns(all_parsers, dtype, check_orig): if check_orig: expected = df.copy() result = result.astype(float) - else: + elif using_infer_string and dtype is str: expected = df.astype(str) + else: + expected = df.astype(str).astype(object) tm.assert_frame_equal(result, expected) @@ -300,7 +302,6 @@ def test_true_values_cast_to_bool(all_parsers): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.usefixtures("pyarrow_xfail") @pytest.mark.parametrize("dtypes, exp_value", [({}, "1"), ({"a.1": "int64"}, 1)]) def test_dtype_mangle_dup_cols(all_parsers, dtypes, exp_value): @@ -316,7 +317,6 @@ def test_dtype_mangle_dup_cols(all_parsers, dtypes, exp_value): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.usefixtures("pyarrow_xfail") def test_dtype_mangle_dup_cols_single_dtype(all_parsers): # GH#42022 @@ -565,7 +565,7 @@ def test_string_inference(all_parsers): @pytest.mark.parametrize("dtype", ["O", object, "object", np.object_, str, np.str_]) -def 
test_string_inference_object_dtype(all_parsers, dtype): +def test_string_inference_object_dtype(all_parsers, dtype, using_infer_string): # GH#56047 data = """a,b x,a @@ -575,10 +575,11 @@ def test_string_inference_object_dtype(all_parsers, dtype): with pd.option_context("future.infer_string", True): result = parser.read_csv(StringIO(data), dtype=dtype) + expected_dtype = pd.StringDtype(na_value=np.nan) if dtype is str else object expected = DataFrame( { - "a": pd.Series(["x", "y", "z"], dtype=object), - "b": pd.Series(["a", "a", "a"], dtype=object), + "a": pd.Series(["x", "y", "z"], dtype=expected_dtype), + "b": pd.Series(["a", "a", "a"], dtype=expected_dtype), }, columns=pd.Index(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)), ) @@ -589,7 +590,7 @@ def test_string_inference_object_dtype(all_parsers, dtype): expected = DataFrame( { - "a": pd.Series(["x", "y", "z"], dtype=object), + "a": pd.Series(["x", "y", "z"], dtype=expected_dtype), "b": pd.Series(["a", "a", "a"], dtype=pd.StringDtype(na_value=np.nan)), }, columns=pd.Index(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)), diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py index b612e60c959b1..89645b526f2ee 100644 --- a/pandas/tests/io/parser/test_na_values.py +++ b/pandas/tests/io/parser/test_na_values.py @@ -667,7 +667,6 @@ def test_inf_na_values_with_int_index(all_parsers): tm.assert_frame_equal(out, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @xfail_pyarrow # mismatched shape @pytest.mark.parametrize("na_filter", [True, False]) def test_na_values_with_dtype_str_and_na_filter(all_parsers, na_filter): @@ -719,7 +718,6 @@ def test_cast_NA_to_bool_raises_error(all_parsers, data, na_values): # TODO: this test isn't about the na_values keyword, it is about the empty entries # being returned with NaN entries, whereas the pyarrow engine returns "nan" @xfail_pyarrow # mismatched shapes -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_str_nan_dropped(all_parsers): # see gh-21131 parser = all_parsers diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py index 26480010fc687..a5bb151e84f47 100644 --- a/pandas/tests/io/parser/test_python_parser_only.py +++ b/pandas/tests/io/parser/test_python_parser_only.py @@ -18,8 +18,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.errors import ( ParserError, ParserWarning, @@ -499,7 +497,6 @@ def test_header_int_do_not_infer_multiindex_names_on_different_line(python_parse tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize( "dtype", [{"a": object}, {"a": str, "b": np.int64, "c": np.int64}] ) @@ -524,10 +521,11 @@ def test_no_thousand_convert_with_dot_for_non_numeric_cols(python_parser_only, d "c": [0, 4000, 131], } ) + if dtype["a"] == object: + expected["a"] = expected["a"].astype(object) tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize( "dtype,expected", [ diff --git a/pandas/tests/series/methods/test_astype.py b/pandas/tests/series/methods/test_astype.py index 579d41f964df0..4a7e204ee4161 100644 --- a/pandas/tests/series/methods/test_astype.py +++ b/pandas/tests/series/methods/test_astype.py @@ -76,7 +76,7 @@ def test_astype_dict_like(self, dtype_class): dt1 = dtype_class({"abc": str}) result = ser.astype(dt1) - 
expected = Series(["0", "2", "4", "6", "8"], name="abc", dtype=object) + expected = Series(["0", "2", "4", "6", "8"], name="abc", dtype="str") tm.assert_series_equal(result, expected) dt2 = dtype_class({"abc": "float64"}) @@ -173,10 +173,14 @@ def test_astype_empty_constructor_equality(self, dtype): def test_astype_str_map(self, dtype, data, using_infer_string): # see GH#4405 series = Series(data) + using_string_dtype = using_infer_string and dtype is str result = series.astype(dtype) - expected = series.map(str) - if using_infer_string: - expected = expected.astype(object) + if using_string_dtype: + expected = series.map(lambda val: str(val) if val is not np.nan else np.nan) + else: + expected = series.map(str) + if using_infer_string: + expected = expected.astype(object) tm.assert_series_equal(result, expected) def test_astype_float_to_period(self): @@ -213,7 +217,7 @@ def test_astype_dt64_to_str(self): # GH#10442 : testing astype(str) is correct for Series/DatetimeIndex dti = date_range("2012-01-01", periods=3) result = Series(dti).astype(str) - expected = Series(["2012-01-01", "2012-01-02", "2012-01-03"], dtype=object) + expected = Series(["2012-01-01", "2012-01-02", "2012-01-03"], dtype="str") tm.assert_series_equal(result, expected) def test_astype_dt64tz_to_str(self): @@ -226,7 +230,7 @@ def test_astype_dt64tz_to_str(self): "2012-01-02 00:00:00-05:00", "2012-01-03 00:00:00-05:00", ], - dtype=object, + dtype="str", ) tm.assert_series_equal(result, expected) @@ -286,13 +290,13 @@ def test_astype_str_cast_dt64(self): ts = Series([Timestamp("2010-01-04 00:00:00")]) res = ts.astype(str) - expected = Series(["2010-01-04"], dtype=object) + expected = Series(["2010-01-04"], dtype="str") tm.assert_series_equal(res, expected) ts = Series([Timestamp("2010-01-04 00:00:00", tz="US/Eastern")]) res = ts.astype(str) - expected = Series(["2010-01-04 00:00:00-05:00"], dtype=object) + expected = Series(["2010-01-04 00:00:00-05:00"], dtype="str") tm.assert_series_equal(res, expected) def test_astype_str_cast_td64(self): @@ -301,7 +305,7 @@ def test_astype_str_cast_td64(self): td = Series([Timedelta(1, unit="D")]) ser = td.astype(str) - expected = Series(["1 days"], dtype=object) + expected = Series(["1 days"], dtype="str") tm.assert_series_equal(ser, expected) def test_dt64_series_astype_object(self): @@ -347,7 +351,7 @@ def test_astype_from_float_to_str(self, any_float_dtype): # https://github.com/pandas-dev/pandas/issues/36451 ser = Series([0.1], dtype=any_float_dtype) result = ser.astype(str) - expected = Series(["0.1"], dtype=object) + expected = Series(["0.1"], dtype="str") tm.assert_series_equal(result, expected) @pytest.mark.parametrize( @@ -358,11 +362,13 @@ def test_astype_from_float_to_str(self, any_float_dtype): (NA, ""), ], ) - def test_astype_to_str_preserves_na(self, value, string_value): + def test_astype_to_str_preserves_na(self, value, string_value, using_infer_string): # https://github.com/pandas-dev/pandas/issues/36904 ser = Series(["a", "b", value], dtype=object) result = ser.astype(str) - expected = Series(["a", "b", string_value], dtype=object) + expected = Series( + ["a", "b", None if using_infer_string else string_value], dtype="str" + ) tm.assert_series_equal(result, expected) @pytest.mark.parametrize("dtype", ["float32", "float64", "int64", "int32"]) diff --git a/pandas/tests/series/methods/test_map.py b/pandas/tests/series/methods/test_map.py index fe84ffafa70b4..7fa8686fcc6c8 100644 --- a/pandas/tests/series/methods/test_map.py +++ b/pandas/tests/series/methods/test_map.py 
@@ -549,13 +549,11 @@ def f(x): (list(range(3)), {0: 42}, [42] + [np.nan] * 3), ], ) -def test_map_missing_mixed(vals, mapping, exp, using_infer_string): +def test_map_missing_mixed(vals, mapping, exp): # GH20495 s = Series(vals + [np.nan]) result = s.map(mapping) exp = Series(exp) - if using_infer_string and mapping == {np.nan: "not NaN"}: - exp.iloc[-1] = np.nan tm.assert_series_equal(result, exp) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 1771a4dfdb71f..69f42b5e42878 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -229,7 +229,7 @@ def test_constructor_empty(self, input_class, using_infer_string): # GH 19853 : with empty string, index and dtype str empty = Series("", dtype=str, index=range(3)) if using_infer_string: - empty2 = Series("", index=range(3), dtype=object) + empty2 = Series("", index=range(3), dtype="str") else: empty2 = Series("", index=range(3)) tm.assert_series_equal(empty, empty2) diff --git a/pandas/tests/strings/test_get_dummies.py b/pandas/tests/strings/test_get_dummies.py index 0656f505dc745..3b989e284ca25 100644 --- a/pandas/tests/strings/test_get_dummies.py +++ b/pandas/tests/strings/test_get_dummies.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas.util._test_decorators as td from pandas import ( @@ -96,6 +98,7 @@ def test_get_dummies_with_pyarrow_dtype(any_string_dtype, dtype): # GH#47872 +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_get_dummies_with_str_dtype(any_string_dtype): s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype) result = s.str.get_dummies("|", dtype=str) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 06fd81ed722d9..dac74a0e32a42 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1877,13 +1877,16 @@ def test_strobj_mode(self): tm.assert_series_equal(ser.mode(), exp) @pytest.mark.parametrize("dt", [str, object]) - def test_strobj_multi_char(self, dt): + def test_strobj_multi_char(self, dt, using_infer_string): exp = ["bar"] data = ["foo"] * 2 + ["bar"] * 3 ser = Series(data, dtype=dt) exp = Series(exp, dtype=dt) - tm.assert_numpy_array_equal(algos.mode(ser.values), exp.values) + if using_infer_string and dt is str: + tm.assert_extension_array_equal(algos.mode(ser.values), exp.values) + else: + tm.assert_numpy_array_equal(algos.mode(ser.values), exp.values) tm.assert_series_equal(ser.mode(), exp) def test_datelike_mode(self): From 0962007726634e55f75150db82aadb754bea9752 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Wed, 25 Sep 2024 23:29:16 +0530 Subject: [PATCH 066/224] DOC: fix SA01 for pandas.api.types.is_interval_dtype (#59863) --- ci/code_checks.sh | 1 - pandas/core/dtypes/common.py | 9 +++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 7cc314007aabd..a436acd01013b 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -115,7 +115,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.api.types.is_float PR01,SA01" \ -i "pandas.api.types.is_hashable PR01,RT03,SA01" \ -i "pandas.api.types.is_integer PR01,SA01" \ - -i "pandas.api.types.is_interval_dtype SA01" \ -i "pandas.api.types.is_iterator PR07,SA01" \ -i "pandas.api.types.is_list_like SA01" \ -i "pandas.api.types.is_named_tuple PR07,SA01" \ diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 
1a38bb03b2c1c..1093b35afa8a0 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -481,6 +481,15 @@ def is_interval_dtype(arr_or_dtype) -> bool: boolean Whether or not the array-like or dtype is of the Interval dtype. + See Also + -------- + api.types.is_object_dtype : Check whether an array-like or dtype is of the + object dtype. + api.types.is_numeric_dtype : Check whether the provided array or dtype is + of a numeric dtype. + api.types.is_categorical_dtype : Check whether an array-like or dtype is of + the Categorical dtype. + Examples -------- >>> from pandas.core.dtypes.common import is_interval_dtype From ffb3c1523747738369bd27d5cdb924ee6884100d Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Wed, 25 Sep 2024 23:30:08 +0530 Subject: [PATCH 067/224] DOC: fix SA01 for pandas.api.types.is_list_like (#59864) --- ci/code_checks.sh | 1 - pandas/_libs/lib.pyx | 6 ++++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index a436acd01013b..dd1b441b51772 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -116,7 +116,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.api.types.is_hashable PR01,RT03,SA01" \ -i "pandas.api.types.is_integer PR01,SA01" \ -i "pandas.api.types.is_iterator PR07,SA01" \ - -i "pandas.api.types.is_list_like SA01" \ -i "pandas.api.types.is_named_tuple PR07,SA01" \ -i "pandas.api.types.is_object_dtype SA01" \ -i "pandas.api.types.is_re PR07,SA01" \ diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 8af48a861967a..de7d9af731010 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1220,6 +1220,12 @@ def is_list_like(obj: object, allow_sets: bool = True) -> bool: bool Whether `obj` has list-like properties. + See Also + -------- + Series : One-dimensional ndarray with axis labels (including time series). + Index : Immutable sequence used for indexing and alignment. + numpy.ndarray : Array object from NumPy, which is considered list-like. 
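
For a concrete feel for the predicate documented above, a minimal doctest-style sketch (an editorial illustration, not part of the patch; the outputs follow the documented semantics, in particular that strings are deliberately not treated as list-like):

>>> from pandas.api.types import is_list_like
>>> is_list_like([1, 2, 3])
True
>>> is_list_like("foo")
False
>>> is_list_like({1, 2}, allow_sets=False)
False
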
+
     Examples
     --------
     >>> import datetime

From 4b22453651cb71684ce1f56aa67ff6fc451af053 Mon Sep 17 00:00:00 2001
From: musvaage <112724366+musvaage@users.noreply.github.com>
Date: Wed, 25 Sep 2024 20:16:32 +0200
Subject: [PATCH 068/224] typo (#59852)

---
 pandas/io/pytables.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
index be7b8dc6640ba..618254fee9259 100644
--- a/pandas/io/pytables.py
+++ b/pandas/io/pytables.py
@@ -3580,7 +3580,7 @@ def is_transposed(self) -> bool:
 
     @property
     def data_orientation(self) -> tuple[int, ...]:
-        """return a tuple of my permutated axes, non_indexable at the front"""
+        """return a tuple of my permuted axes, non_indexable at the front"""
         return tuple(
             itertools.chain(
                 [int(a[0]) for a in self.non_index_axes],

From 7543426cdf2728635e92b59585203963035ae536 Mon Sep 17 00:00:00 2001
From: Vibavari Gurunathan
Date: Wed, 25 Sep 2024 11:17:57 -0700
Subject: [PATCH 069/224] BUG: Fix from_records() column reorder issue, if
 columns!=None use passed param (#59717) (#59809)

* BUG: Fix columns param reorder issue - if columns!=None, use passed param (#59717)

* Add tests for to_arrays()

* Fix import order with isort

* fix sort

* Update datatype to int32

* Fis test

* Revert commit

* Add test for DaaFrame.from_records()

* Apply comments

* Delete test_to_arrays.py

---
 doc/source/whatsnew/v3.0.0.rst                |  1 +
 pandas/core/internals/construction.py         |  3 ++-
 .../frame/constructors/test_from_records.py   | 23 +++++++++++++++++++
 3 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
index 3b5183c43bcd0..516a5d938fb18 100644
--- a/doc/source/whatsnew/v3.0.0.rst
+++ b/doc/source/whatsnew/v3.0.0.rst
@@ -619,6 +619,7 @@ I/O
 ^^^
 - Bug in :class:`DataFrame` and :class:`Series` ``repr`` of :py:class:`collections.abc.Mapping`` elements. (:issue:`57915`)
 - Bug in :meth:`.DataFrame.to_json` when ``"index"`` was a value in the :attr:`DataFrame.column` and :attr:`Index.name` was ``None``. Now, this will fail with a ``ValueError`` (:issue:`58925`)
+- Bug in :meth:`DataFrame.from_records` where ``columns`` parameter with numpy structured array was not reordering and filtering out the columns (:issue:`59717`)
 - Bug in :meth:`DataFrame.to_dict` raises unnecessary ``UserWarning`` when columns are not unique and ``orient='tight'``. (:issue:`58281`)
 - Bug in :meth:`DataFrame.to_excel` when writing empty :class:`DataFrame` with :class:`MultiIndex` on both axes (:issue:`57696`)
 - Bug in :meth:`DataFrame.to_stata` when writing :class:`DataFrame` and ``byteorder=`big```. (:issue:`58969`)
diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py
index 07465e7b87fcd..959e572b2b35b 100644
--- a/pandas/core/internals/construction.py
+++ b/pandas/core/internals/construction.py
@@ -750,7 +750,8 @@ def to_arrays(
     elif isinstance(data, np.ndarray) and data.dtype.names is not None:
         # e.g.
recarray - columns = Index(list(data.dtype.names)) + if columns is None: + columns = Index(data.dtype.names) arrays = [data[k] for k in columns] return arrays, columns diff --git a/pandas/tests/frame/constructors/test_from_records.py b/pandas/tests/frame/constructors/test_from_records.py index abc3aab1c1492..1d4a2c0075e3e 100644 --- a/pandas/tests/frame/constructors/test_from_records.py +++ b/pandas/tests/frame/constructors/test_from_records.py @@ -469,3 +469,26 @@ def test_from_records_empty2(self): alt = DataFrame(arr) tm.assert_frame_equal(alt, expected) + + def test_from_records_structured_array(self): + # GH 59717 + data = np.array( + [ + ("John", 25, "New York", 50000), + ("Jane", 30, "San Francisco", 75000), + ("Bob", 35, "Chicago", 65000), + ("Alice", 28, "Los Angeles", 60000), + ], + dtype=[("name", "U10"), ("age", "i4"), ("city", "U15"), ("salary", "i4")], + ) + + actual_result = DataFrame.from_records(data, columns=["name", "salary", "city"]) + + modified_data = { + "name": ["John", "Jane", "Bob", "Alice"], + "salary": np.array([50000, 75000, 65000, 60000], dtype="int32"), + "city": ["New York", "San Francisco", "Chicago", "Los Angeles"], + } + expected_result = DataFrame(modified_data) + + tm.assert_frame_equal(actual_result, expected_result) From e38409c304f8da88efd7cf074819a1cf7d12be31 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Wed, 25 Sep 2024 23:49:00 +0530 Subject: [PATCH 070/224] DOC: fix SA01 for pandas.arrays.BooleanArray (#59866) --- ci/code_checks.sh | 1 - pandas/core/arrays/boolean.py | 7 +++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index dd1b441b51772..40582f3069e97 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -122,7 +122,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.api.types.is_re_compilable PR07,SA01" \ -i "pandas.api.types.pandas_dtype PR07,RT03,SA01" \ -i "pandas.arrays.ArrowExtensionArray PR07,SA01" \ - -i "pandas.arrays.BooleanArray SA01" \ -i "pandas.arrays.DatetimeArray SA01" \ -i "pandas.arrays.IntegerArray SA01" \ -i "pandas.arrays.IntervalArray.left SA01" \ diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index 74c0cd7719c13..53ebc35b68d14 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -286,6 +286,13 @@ class BooleanArray(BaseMaskedArray): ------- BooleanArray + See Also + -------- + array : Create an array from data with the appropriate dtype. + BooleanDtype : Extension dtype for boolean data. + Series : One-dimensional ndarray with axis labels (including time series). + DataFrame : Two-dimensional, size-mutable, potentially heterogeneous tabular data. 
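
To illustrate the fix in PATCH 069 above, a minimal doctest-style sketch distilled from its new test (an editorial example; the output shown is what the patched ``from_records`` should produce, with ``columns`` now both filtering and reordering the structured array's fields):

>>> import numpy as np
>>> import pandas as pd
>>> data = np.array(
...     [("John", 25), ("Jane", 30)],
...     dtype=[("name", "U10"), ("age", "i4")],
... )
>>> pd.DataFrame.from_records(data, columns=["age", "name"])
   age  name
0   25  John
1   30  Jane
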
+ Examples -------- Create an BooleanArray with :func:`pandas.array`: From f049159d8245959bf313e05d1109ed33f778a077 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Wed, 25 Sep 2024 23:49:56 +0530 Subject: [PATCH 071/224] DOC: fix SA01, ES01 for pandas.api.types.is_object_dtype (#59865) --- ci/code_checks.sh | 1 - pandas/core/dtypes/common.py | 14 ++++++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 40582f3069e97..4eb9d4055e1f8 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -117,7 +117,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.api.types.is_integer PR01,SA01" \ -i "pandas.api.types.is_iterator PR07,SA01" \ -i "pandas.api.types.is_named_tuple PR07,SA01" \ - -i "pandas.api.types.is_object_dtype SA01" \ -i "pandas.api.types.is_re PR07,SA01" \ -i "pandas.api.types.is_re_compilable PR07,SA01" \ -i "pandas.api.types.pandas_dtype PR07,RT03,SA01" \ diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 1093b35afa8a0..98c770ec4a8b0 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -141,6 +141,11 @@ def is_object_dtype(arr_or_dtype) -> bool: """ Check whether an array-like or dtype is of the object dtype. + This method examines the input to determine if it is of the + object data type. Object dtype is a generic data type that can + hold any Python objects, including strings, lists, and custom + objects. + Parameters ---------- arr_or_dtype : array-like or dtype @@ -151,6 +156,15 @@ def is_object_dtype(arr_or_dtype) -> bool: boolean Whether or not the array-like or dtype is of the object dtype. + See Also + -------- + api.types.is_numeric_dtype : Check whether the provided array or dtype is of a + numeric dtype. + api.types.is_string_dtype : Check whether the provided array or dtype is of + the string dtype. + api.types.is_bool_dtype : Check whether the provided array or dtype is of a + boolean dtype. + Examples -------- >>> from pandas.api.types import is_object_dtype From e221fa48a5d5e61f9adc830ed33562548bea9dd4 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Wed, 25 Sep 2024 23:51:58 +0530 Subject: [PATCH 072/224] DOC: fix RT03 for pandas.date_range (#59868) --- ci/code_checks.sh | 1 - pandas/core/indexes/datetimes.py | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 4eb9d4055e1f8..72e12effb1104 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -170,7 +170,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.core.resample.Resampler.sum SA01" \ -i "pandas.core.resample.Resampler.transform PR01,RT03,SA01" \ -i "pandas.core.resample.Resampler.var SA01" \ - -i "pandas.date_range RT03" \ -i "pandas.errors.AttributeConflictWarning SA01" \ -i "pandas.errors.CSSWarning SA01" \ -i "pandas.errors.CategoricalConversionWarning SA01" \ diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 3b3cda8f7cd33..536f22d38468d 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -875,6 +875,7 @@ def date_range( Returns ------- DatetimeIndex + A DatetimeIndex object of the generated dates. 
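
For reference, the return value being documented in PATCH 072 looks like the following in practice (a standard illustrative example, not taken from the patch):

>>> import pandas as pd
>>> pd.date_range("2024-01-01", periods=3)
DatetimeIndex(['2024-01-01', '2024-01-02', '2024-01-03'], dtype='datetime64[ns]', freq='D')
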
See Also -------- From cf79ac87545744d7c7af7e49b443b2ed0b3ed047 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Wed, 25 Sep 2024 23:52:33 +0530 Subject: [PATCH 073/224] DOC: fix RT03, ES01 for pandas.core.resample.Resampler.ffill (#59871) --- ci/code_checks.sh | 1 - pandas/core/resample.py | 8 +++++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 72e12effb1104..49702dce0e258 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -155,7 +155,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.core.groupby.SeriesGroupBy.plot PR02" \ -i "pandas.core.groupby.SeriesGroupBy.sem SA01" \ -i "pandas.core.resample.Resampler.__iter__ RT03,SA01" \ - -i "pandas.core.resample.Resampler.ffill RT03" \ -i "pandas.core.resample.Resampler.get_group RT03,SA01" \ -i "pandas.core.resample.Resampler.groups SA01" \ -i "pandas.core.resample.Resampler.indices SA01" \ diff --git a/pandas/core/resample.py b/pandas/core/resample.py index b621fcf9a6415..711396096a5e3 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -529,6 +529,11 @@ def ffill(self, limit: int | None = None): """ Forward fill the values. + This method fills missing values by propagating the last valid + observation forward, up to the next valid observation. It is commonly + used in time series analysis when resampling data to a higher frequency + (upsampling) and filling gaps in the resampled output. + Parameters ---------- limit : int, optional @@ -536,7 +541,8 @@ def ffill(self, limit: int | None = None): Returns ------- - An upsampled Series. + Series + The resampled data with missing values filled forward. See Also -------- From 1ddf028c9469a9d6264171c4c79ef1691fe2c680 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Thu, 26 Sep 2024 00:08:18 +0530 Subject: [PATCH 074/224] DOC: fix SA01, ES01 for pandas.arrays.IntervalArray.mid (#59867) * DOC: fix SA01, ES01 for pandas.arrays.IntervalArray.mid * DOC: add double backticks for sphinx compatibility Co-authored-by: mroeschke --------- Co-authored-by: mroeschke --- ci/code_checks.sh | 1 - pandas/core/arrays/interval.py | 10 ++++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 49702dce0e258..3dfd5a3931ecd 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -125,7 +125,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.arrays.IntegerArray SA01" \ -i "pandas.arrays.IntervalArray.left SA01" \ -i "pandas.arrays.IntervalArray.length SA01" \ - -i "pandas.arrays.IntervalArray.mid SA01" \ -i "pandas.arrays.IntervalArray.right SA01" \ -i "pandas.arrays.NumpyExtensionArray SA01" \ -i "pandas.arrays.SparseArray PR07,SA01" \ diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 52d64162358c8..2ac9c77bef322 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -1291,6 +1291,16 @@ def mid(self) -> Index: """ Return the midpoint of each Interval in the IntervalArray as an Index. + The midpoint of an interval is calculated as the average of its + ``left`` and ``right`` bounds. This property returns a ``pandas.Index`` object + containing the midpoint for each interval. + + See Also + -------- + Interval.left : Return left bound for the interval. + Interval.right : Return right bound for the interval. + Interval.length : Return the length of each interval. 
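
A short doctest-style sketch of the ``mid`` semantics documented above (illustrative only; the midpoint is the average of each interval's left and right bounds):

>>> import pandas as pd
>>> arr = pd.arrays.IntervalArray.from_breaks([0, 1, 2])
>>> arr.mid
Index([0.5, 1.5], dtype='float64')
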
+ Examples -------- From 22055e4d3d42c297b1c86306d77f7a27fad8dcf8 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Thu, 26 Sep 2024 00:08:59 +0530 Subject: [PATCH 075/224] DOC: fix SA01, ES01 for pandas.RangeIndex.step (#59857) * DOC: fix SA01, ES01 for pandas.RangeIndex.step * DOC: add double backticks for sphinx compatibility Co-authored-by: mroeschke --------- Co-authored-by: mroeschke --- ci/code_checks.sh | 1 - pandas/core/indexes/range.py | 9 +++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 3dfd5a3931ecd..01486f0e3f926 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -73,7 +73,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Period.freq GL08" \ -i "pandas.Period.ordinal GL08" \ -i "pandas.RangeIndex.from_range PR01,SA01" \ - -i "pandas.RangeIndex.step SA01" \ -i "pandas.Series.cat.add_categories PR01,PR02" \ -i "pandas.Series.cat.as_ordered PR01" \ -i "pandas.Series.cat.as_unordered PR01" \ diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 75d0dfbeb6f01..dc96d1c11db74 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -351,6 +351,15 @@ def step(self) -> int: """ The value of the `step` parameter (``1`` if this was not supplied). + The ``step`` parameter determines the increment (or decrement in the case + of negative values) between consecutive elements in the ``RangeIndex``. + + See Also + -------- + RangeIndex : Immutable index implementing a range-based index. + RangeIndex.stop : Returns the stop value of the RangeIndex. + RangeIndex.start : Returns the start value of the RangeIndex. + Examples -------- >>> idx = pd.RangeIndex(5) From efbc29666d820cf62854556cdeadf044b489de4c Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Thu, 26 Sep 2024 00:09:46 +0530 Subject: [PATCH 076/224] DOC: fix SA01, ES01 for pandas.Timedelta.to_timedelta64 (#59860) * DOC: fix SA01, ES01 for pandas.Timedelta.to_timedelta64 * DOC: add double backticks for sphinx compatibility Co-authored-by: mroeschke --------- Co-authored-by: mroeschke --- ci/code_checks.sh | 1 - pandas/_libs/tslibs/timedeltas.pyx | 12 ++++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 01486f0e3f926..20e75f0f6f616 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -101,7 +101,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Timedelta.max PR02" \ -i "pandas.Timedelta.min PR02" \ -i "pandas.Timedelta.resolution PR02" \ - -i "pandas.Timedelta.to_timedelta64 SA01" \ -i "pandas.TimedeltaIndex.to_pytimedelta RT03,SA01" \ -i "pandas.Timestamp.max PR02" \ -i "pandas.Timestamp.min PR02" \ diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index 0ff5c5fb81df8..84ca48c96459f 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -1411,6 +1411,18 @@ cdef class _Timedelta(timedelta): """ Return a numpy.timedelta64 object with 'ns' precision. + Since NumPy uses ``timedelta64`` objects for its time operations, converting + a pandas ``Timedelta`` into a NumPy ``timedelta64`` provides seamless + integration between the two libraries, especially when working in environments + that heavily rely on NumPy for array-based calculations. + + See Also + -------- + to_timedelta : Convert argument to timedelta. + numpy.timedelta64 : A NumPy object for time duration. 
+ Timedelta : Represents a duration, the difference between two dates + or times. + Examples -------- >>> td = pd.Timedelta('3D') From c5cfe5d32c7fef4d42e1b22e188a438b5607b804 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Thu, 26 Sep 2024 00:12:43 +0530 Subject: [PATCH 077/224] DOC: fix SA01, ES01 for pandas.errors.EmptyDataError (#59872) --- ci/code_checks.sh | 1 - pandas/errors/__init__.py | 11 +++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 20e75f0f6f616..f662b4781e84b 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -173,7 +173,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.errors.ClosedFileError SA01" \ -i "pandas.errors.DataError SA01" \ -i "pandas.errors.DuplicateLabelError SA01" \ - -i "pandas.errors.EmptyDataError SA01" \ -i "pandas.errors.IntCastingNaNError SA01" \ -i "pandas.errors.InvalidIndexError SA01" \ -i "pandas.errors.InvalidVersion SA01" \ diff --git a/pandas/errors/__init__.py b/pandas/errors/__init__.py index 7851bc90c5782..b9ceae341afd3 100644 --- a/pandas/errors/__init__.py +++ b/pandas/errors/__init__.py @@ -205,6 +205,17 @@ class EmptyDataError(ValueError): """ Exception raised in ``pd.read_csv`` when empty data or header is encountered. + This error is typically encountered when attempting to read an empty file or + an invalid file where no data or headers are present. + + See Also + -------- + read_csv : Read a comma-separated values (CSV) file into DataFrame. + errors.ParserError : Exception that is raised by an error encountered in parsing + file contents. + errors.DtypeWarning : Warning raised when reading different dtypes in a column + from a file. + Examples -------- >>> from io import StringIO From 7e5282f5f125406cff7fdf80b452e114adfa4c26 Mon Sep 17 00:00:00 2001 From: Jonathan Marriott <34217286+JonathanMarriott@users.noreply.github.com> Date: Wed, 25 Sep 2024 20:14:49 +0100 Subject: [PATCH 078/224] DOC: Fix inconsistent and incomplete documentation of `pandas.eval` (#59855) * Improve content and organisation of eval documentation * Link to pd.eval in pd.DataFrame.query * Correct name for `//` is floor division * Include arctan2 Co-authored-by: Xiao Yuan --------- Co-authored-by: Xiao Yuan --- pandas/core/computation/eval.py | 37 +++++++++++++++++++++++++-------- pandas/core/frame.py | 33 +++++++++++++++-------------- 2 files changed, 45 insertions(+), 25 deletions(-) diff --git a/pandas/core/computation/eval.py b/pandas/core/computation/eval.py index 485c7f87d6f33..4ccfbd71d9ce8 100644 --- a/pandas/core/computation/eval.py +++ b/pandas/core/computation/eval.py @@ -188,15 +188,6 @@ def eval( """ Evaluate a Python expression as a string using various backends. - The following arithmetic operations are supported: ``+``, ``-``, ``*``, - ``/``, ``**``, ``%``, ``//`` (python engine only) along with the following - boolean operations: ``|`` (or), ``&`` (and), and ``~`` (not). - Additionally, the ``'pandas'`` parser allows the use of :keyword:`and`, - :keyword:`or`, and :keyword:`not` with the same semantics as the - corresponding bitwise operators. :class:`~pandas.Series` and - :class:`~pandas.DataFrame` objects are supported and behave as they would - with plain ol' Python evaluation. - .. warning:: ``eval`` can run arbitrary code which can make you vulnerable to code @@ -210,6 +201,34 @@ def eval( `__, only Python `expressions `__. 
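
To ground the supported-operations list added just below, a minimal illustrative sketch of :func:`pandas.eval` itself (an editorial example, not part of the patch; the local ``df`` is resolved from the calling scope):

>>> import pandas as pd
>>> df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
>>> pd.eval("df.a + df.b")
0    4
1    6
dtype: int64
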
+
+        By default, with the numexpr engine, the following operations are supported:
+
+        - Arithmetic operations: ``+``, ``-``, ``*``, ``/``, ``**``, ``%``
+        - Boolean operations: ``|`` (or), ``&`` (and), and ``~`` (not)
+        - Comparison operators: ``<``, ``<=``, ``==``, ``!=``, ``>=``, ``>``
+
+        Furthermore, the following mathematical functions are supported:
+
+        - Trigonometric: ``sin``, ``cos``, ``tan``, ``arcsin``, ``arccos``, \
+          ``arctan``, ``arctan2``, ``sinh``, ``cosh``, ``tanh``, ``arcsinh``, \
+          ``arccosh`` and ``arctanh``
+        - Logarithms: ``log`` natural, ``log10`` base 10, ``log1p`` log(1+x)
+        - Absolute Value ``abs``
+        - Square root ``sqrt``
+        - Exponential ``exp`` and Exponential minus one ``expm1``
+
+        See the numexpr engine `documentation
+        `__
+        for further function support details.
+
+        Using the ``'python'`` engine allows the use of native Python operators
+        such as floor division ``//``, in addition to built-in and user-defined
+        Python functions.
+
+        Additionally, the ``'pandas'`` parser allows the use of :keyword:`and`,
+        :keyword:`or`, and :keyword:`not` with the same semantics as the
+        corresponding bitwise operators.
     parser : {'pandas', 'python'}, default 'pandas'
         The parser to use to construct the syntax tree from the
         expression. The default of ``'pandas'`` parses code slightly
         different than standard Python.
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index c80e9dfd23ba2..4c56948a48eb2 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -4479,20 +4479,11 @@ def query(self, expr: str, *, inplace: bool = False, **kwargs) -> DataFrame | No
         expr : str
             The query string to evaluate.
 
-            You can refer to variables
-            in the environment by prefixing them with an '@' character like
-            ``@a + b``.
-
-            You can refer to column names that are not valid Python variable names
-            by surrounding them in backticks. Thus, column names containing spaces
-            or punctuation (besides underscores) or starting with digits must be
-            surrounded by backticks. (For example, a column named "Area (cm^2)" would
-            be referenced as ```Area (cm^2)```). Column names which are Python keywords
-            (like "if", "for", "import", etc) cannot be used.
-
-            For example, if one of your columns is called ``a a`` and you want
-            to sum it with ``b``, your query should be ```a a` + b``.
+            See the documentation for :func:`eval` for details of
+            supported operations and functions in the query string.
 
+            See the documentation for :meth:`DataFrame.eval` for details on
+            referring to column names and variables in the query string.
         inplace : bool
             Whether to modify the DataFrame rather than creating a new one.
         **kwargs
@@ -4651,8 +4642,18 @@ def eval(self, expr: str, *, inplace: bool = False, **kwargs) -> Any | None:
             in the environment by prefixing them with an '@' character like
             ``@a + b``.
 
-            You can refer to column names that are not valid Python variable
-            names by surrounding them with backticks `````.
+            You can refer to column names that are not valid Python variable names
+            by surrounding them in backticks. Thus, column names containing spaces
+            or punctuation (besides underscores) or starting with digits must be
+            surrounded by backticks. (For example, a column named "Area (cm^2)" would
+            be referenced as ```Area (cm^2)```). Column names which are Python keywords
+            (like "if", "for", "import", etc) cannot be used.
+
+            For example, if one of your columns is called ``a a`` and you want
+            to sum it with ``b``, your query should be ```a a` + b``.
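
The backtick rule restated above can be exercised as follows (an illustrative sketch; output assumes default options):

>>> import pandas as pd
>>> df = pd.DataFrame({"a a": [1, 2], "b": [3, 4]})
>>> df.query("`a a` + b > 4")
   a a  b
1    2  4
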
+ + See the documentation for :func:`eval` for full details of + supported operations and functions in the expression string. inplace : bool, default False If the expression contains an assignment, whether to perform the operation inplace and mutate the existing DataFrame. Otherwise, @@ -4660,7 +4661,7 @@ def eval(self, expr: str, *, inplace: bool = False, **kwargs) -> Any | None: **kwargs See the documentation for :func:`eval` for complete details on the keyword arguments accepted by - :meth:`~pandas.DataFrame.query`. + :meth:`~pandas.DataFrame.eval`. Returns ------- From c8a67401932c773ace0f62660f09b5684f39a148 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 25 Sep 2024 21:16:04 +0200 Subject: [PATCH 079/224] String dtype: allow string dtype for non-raw apply with numba engine (#59854) * String dtype: allow string dtype for non-raw apply with numba engine * remove xfails * clean-up --- pandas/core/_numba/extensions.py | 3 ++- pandas/core/apply.py | 5 ----- pandas/tests/apply/test_frame_apply.py | 1 - pandas/tests/apply/test_numba.py | 4 ---- 4 files changed, 2 insertions(+), 11 deletions(-) diff --git a/pandas/core/_numba/extensions.py b/pandas/core/_numba/extensions.py index e6f0427de2a3a..413fdafc7fd04 100644 --- a/pandas/core/_numba/extensions.py +++ b/pandas/core/_numba/extensions.py @@ -53,7 +53,8 @@ @contextmanager def set_numba_data(index: Index): numba_data = index._data - if numba_data.dtype == object: + if numba_data.dtype in (object, "string"): + numba_data = np.asarray(numba_data) if not lib.is_string_array(numba_data): raise ValueError( "The numba engine only supports using string or numeric column names" diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 5959156d11123..7d50b466f5126 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -1172,12 +1172,7 @@ def apply_with_numba(self) -> dict[int, Any]: from pandas.core._numba.extensions import set_numba_data index = self.obj.index - if index.dtype == "string": - index = index.astype(object) - columns = self.obj.columns - if columns.dtype == "string": - columns = columns.astype(object) # Convert from numba dict to regular dict # Our isinstance checks in the df constructor don't pass for numbas typed dict diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py index 3be3562d23cd6..dee0efcd8fd15 100644 --- a/pandas/tests/apply/test_frame_apply.py +++ b/pandas/tests/apply/test_frame_apply.py @@ -65,7 +65,6 @@ def test_apply(float_frame, engine, request): assert result.index is float_frame.index -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("axis", [0, 1]) @pytest.mark.parametrize("raw", [True, False]) @pytest.mark.parametrize("nopython", [True, False]) diff --git a/pandas/tests/apply/test_numba.py b/pandas/tests/apply/test_numba.py index 825d295043e69..d6cd9c321ace6 100644 --- a/pandas/tests/apply/test_numba.py +++ b/pandas/tests/apply/test_numba.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas.util._test_decorators as td import pandas as pd @@ -20,7 +18,6 @@ def apply_axis(request): return request.param -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_numba_vs_python_noop(float_frame, apply_axis): func = lambda x: x result = float_frame.apply(func, engine="numba", axis=apply_axis) @@ -43,7 +40,6 @@ def test_numba_vs_python_string_index(): ) -@pytest.mark.xfail(using_string_dtype(), 
reason="TODO(infer_string)") def test_numba_vs_python_indexing(): frame = DataFrame( {"a": [1, 2, 3], "b": [4, 5, 6], "c": [7.0, 8.0, 9.0]}, From b87bf854519466182b43f9f7d5b6c9d91be87ad0 Mon Sep 17 00:00:00 2001 From: Naresh Kumar Date: Wed, 25 Sep 2024 12:18:03 -0700 Subject: [PATCH 080/224] ENH: Add kwargs to Series.map (#59843) Co-authored-by: Naresh Kumar --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/series.py | 9 +++++++++ pandas/tests/series/methods/test_map.py | 7 +++++++ 3 files changed, 17 insertions(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 516a5d938fb18..41ba80989a0ce 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -54,6 +54,7 @@ Other enhancements - :meth:`Series.cummin` and :meth:`Series.cummax` now supports :class:`CategoricalDtype` (:issue:`52335`) - :meth:`Series.plot` now correctly handle the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`) - :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`) +- :meth:`Series.map` can now accept kwargs to pass on to func (:issue:`59814`) - :meth:`pandas.concat` will raise a ``ValueError`` when ``ignore_index=True`` and ``keys`` is not ``None`` (:issue:`59274`) - :meth:`str.get_dummies` now accepts a ``dtype`` parameter to specify the dtype of the resulting DataFrame (:issue:`47872`) - Multiplying two :class:`DateOffset` objects will now raise a ``TypeError`` instead of a ``RecursionError`` (:issue:`59442`) diff --git a/pandas/core/series.py b/pandas/core/series.py index 0c26ce27c680c..bbcb6615aeefd 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -11,6 +11,7 @@ Mapping, Sequence, ) +import functools import operator import sys from textwrap import dedent @@ -4312,6 +4313,7 @@ def map( self, arg: Callable | Mapping | Series, na_action: Literal["ignore"] | None = None, + **kwargs, ) -> Series: """ Map values of Series according to an input mapping or function. @@ -4327,6 +4329,11 @@ def map( na_action : {None, 'ignore'}, default None If 'ignore', propagate NaN values, without passing them to the mapping correspondence. + **kwargs + Additional keyword arguments to pass as keywords arguments to + `arg`. + + .. 
versionadded:: 3.0.0 Returns ------- @@ -4388,6 +4395,8 @@ def map( 3 I am a rabbit dtype: object """ + if callable(arg): + arg = functools.partial(arg, **kwargs) new_values = self._map_values(arg, na_action=na_action) return self._constructor(new_values, index=self.index, copy=False).__finalize__( self, method="map" diff --git a/pandas/tests/series/methods/test_map.py b/pandas/tests/series/methods/test_map.py index 7fa8686fcc6c8..84b60a2afe6eb 100644 --- a/pandas/tests/series/methods/test_map.py +++ b/pandas/tests/series/methods/test_map.py @@ -597,3 +597,10 @@ def test_map_type(): result = s.map(type) expected = Series([int, str, type], index=["a", "b", "c"]) tm.assert_series_equal(result, expected) + + +def test_map_kwargs(): + # GH 59814 + result = Series([2, 4, 5]).map(lambda x, y: x + y, y=2) + expected = Series([4, 6, 7]) + tm.assert_series_equal(result, expected) From a9f76d753dfe3db9206e5556c90ffac0e0ebf46d Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 25 Sep 2024 12:19:47 -0700 Subject: [PATCH 081/224] REF: pass dtype explicitly to _from_sequence inside pd.array (#59773) REF: pass dtype explicitly to _from_sequence --- pandas/core/construction.py | 6 ++++-- pandas/tests/extension/base/methods.py | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/pandas/core/construction.py b/pandas/core/construction.py index bb3aa3867ab08..1e1292f8ef089 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -358,7 +358,8 @@ def array( return cls._from_sequence(data, dtype=dtype, copy=copy) elif data.dtype.kind in "iu": - return IntegerArray._from_sequence(data, copy=copy) + dtype = IntegerArray._dtype_cls._get_dtype_mapping()[data.dtype] + return IntegerArray._from_sequence(data, dtype=dtype, copy=copy) elif data.dtype.kind == "f": # GH#44715 Exclude np.float16 bc FloatingArray does not support it; # we will fall back to NumpyExtensionArray. 
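
Looking back at PATCH 080, the new ``**kwargs`` pass-through on :meth:`Series.map` behaves as in the test added there (an illustrative sketch of the new behavior):

>>> import pandas as pd
>>> pd.Series([2, 4, 5]).map(lambda x, y: x + y, y=2)
0    4
1    6
2    7
dtype: int64
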
@@ -366,7 +367,8 @@ def array( return NumpyExtensionArray._from_sequence( data, dtype=data.dtype, copy=copy ) - return FloatingArray._from_sequence(data, copy=copy) + dtype = FloatingArray._dtype_cls._get_dtype_mapping()[data.dtype] + return FloatingArray._from_sequence(data, dtype=dtype, copy=copy) elif data.dtype.kind == "b": return BooleanArray._from_sequence(data, dtype="boolean", copy=copy) diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index dd2ed0bd62a02..fd9fec0cb490c 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -549,7 +549,7 @@ def _test_searchsorted_bool_dtypes(self, data_for_sorting, as_series): dtype = data_for_sorting.dtype data_for_sorting = pd.array([True, False], dtype=dtype) b, a = data_for_sorting - arr = type(data_for_sorting)._from_sequence([a, b]) + arr = type(data_for_sorting)._from_sequence([a, b], dtype=dtype) if as_series: arr = pd.Series(arr) From a92b919a1bb676252b45e574d102b2af29daac12 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 25 Sep 2024 12:21:12 -0700 Subject: [PATCH 082/224] REF: pass dtype explicitly to _from_sequence (#59774) --- pandas/core/arrays/arrow/array.py | 8 ++++++- pandas/core/arrays/datetimelike.py | 6 ++--- pandas/core/arrays/datetimes.py | 6 +---- pandas/core/arrays/period.py | 2 +- .../arrays/datetimes/test_constructors.py | 22 ++++++++++++------- pandas/tests/arrays/test_array.py | 8 +++++-- pandas/tests/arrays/test_datetimelike.py | 10 +++++---- pandas/tests/arrays/test_datetimes.py | 12 +++++++--- pandas/tests/arrays/test_timedeltas.py | 8 +++---- pandas/tests/base/test_conversion.py | 3 ++- pandas/tests/dtypes/test_generic.py | 4 ++-- .../series/accessors/test_dt_accessor.py | 3 ++- 12 files changed, 57 insertions(+), 35 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 39cae5b8e2683..00d46ab9296d0 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -2300,7 +2300,13 @@ def _groupby_op( ) if isinstance(result, np.ndarray): return result - return type(self)._from_sequence(result, copy=False) + elif isinstance(result, BaseMaskedArray): + pa_result = result.__arrow_array__() + return type(self)(pa_result) + else: + # DatetimeArray, TimedeltaArray + pa_result = pa.array(result, from_pandas=True) + return type(self)(pa_result) def _apply_elementwise(self, func: Callable) -> list[list[Any]]: """Apply a callable to each element while maintaining the chunking structure.""" diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 7be8daa09c758..a25a698856747 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -1393,7 +1393,7 @@ def __add__(self, other): if isinstance(result, np.ndarray) and lib.is_np_dtype(result.dtype, "m"): from pandas.core.arrays import TimedeltaArray - return TimedeltaArray._from_sequence(result) + return TimedeltaArray._from_sequence(result, dtype=result.dtype) return result def __radd__(self, other): @@ -1453,7 +1453,7 @@ def __sub__(self, other): if isinstance(result, np.ndarray) and lib.is_np_dtype(result.dtype, "m"): from pandas.core.arrays import TimedeltaArray - return TimedeltaArray._from_sequence(result) + return TimedeltaArray._from_sequence(result, dtype=result.dtype) return result def __rsub__(self, other): @@ -1472,7 +1472,7 @@ def __rsub__(self, other): # Avoid down-casting DatetimeIndex from pandas.core.arrays import DatetimeArray - other = 
DatetimeArray._from_sequence(other) + other = DatetimeArray._from_sequence(other, dtype=other.dtype) return other - self elif self.dtype.kind == "M" and hasattr(other, "dtype") and not other_is_dt64: # GH#19959 datetime - datetime is well-defined as timedelta, diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 201c449185057..43f4428118aa7 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -818,11 +818,7 @@ def _add_offset(self, offset: BaseOffset) -> Self: stacklevel=find_stack_level(), ) res_values = self.astype("O") + offset - # TODO(GH#55564): as_unit will be unnecessary - result = type(self)._from_sequence(res_values).as_unit(self.unit) - if not len(self): - # GH#30336 _from_sequence won't be able to infer self.tz - return result.tz_localize(self.tz) + result = type(self)._from_sequence(res_values, dtype=self.dtype) else: result = type(self)._simple_new(res_values, dtype=res_values.dtype) diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index aa8dacbd6aad5..7d0ad74f851f0 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -812,7 +812,7 @@ def to_timestamp(self, freq=None, how: str = "start") -> DatetimeArray: new_parr = self.asfreq(freq, how=how) new_data = libperiod.periodarr_to_dt64arr(new_parr.asi8, base) - dta = DatetimeArray._from_sequence(new_data) + dta = DatetimeArray._from_sequence(new_data, dtype=np.dtype("M8[ns]")) if self.freq.name == "B": # See if we can retain BDay instead of Day in cases where diff --git a/pandas/tests/arrays/datetimes/test_constructors.py b/pandas/tests/arrays/datetimes/test_constructors.py index d7264c002c67f..74cc3e991bb76 100644 --- a/pandas/tests/arrays/datetimes/test_constructors.py +++ b/pandas/tests/arrays/datetimes/test_constructors.py @@ -28,10 +28,12 @@ def test_mixing_naive_tzaware_raises(self, meth): # GH#24569 arr = np.array([pd.Timestamp("2000"), pd.Timestamp("2000", tz="CET")]) - msg = ( - "Cannot mix tz-aware with tz-naive values|" - "Tz-aware datetime.datetime cannot be converted " - "to datetime64 unless utc=True" + msg = "|".join( + [ + "Cannot mix tz-aware with tz-naive values", + "Tz-aware datetime.datetime cannot be converted " + "to datetime64 unless utc=True", + ] ) for obj in [arr, arr[::-1]]: @@ -63,10 +65,10 @@ def test_bool_dtype_raises(self): def test_copy(self): data = np.array([1, 2, 3], dtype="M8[ns]") - arr = DatetimeArray._from_sequence(data, copy=False) + arr = DatetimeArray._from_sequence(data, dtype=data.dtype, copy=False) assert arr._ndarray is data - arr = DatetimeArray._from_sequence(data, copy=True) + arr = DatetimeArray._from_sequence(data, dtype=data.dtype, copy=True) assert arr._ndarray is not data def test_numpy_datetime_unit(self, unit): @@ -163,7 +165,9 @@ def test_from_arrow_from_empty(unit, tz): dtype = DatetimeTZDtype(unit=unit, tz=tz) result = dtype.__from_arrow__(arr) - expected = DatetimeArray._from_sequence(np.array(data, dtype=f"datetime64[{unit}]")) + expected = DatetimeArray._from_sequence( + np.array(data, dtype=f"datetime64[{unit}]"), dtype=np.dtype(f"M8[{unit}]") + ) expected = expected.tz_localize(tz=tz) tm.assert_extension_array_equal(result, expected) @@ -179,7 +183,9 @@ def test_from_arrow_from_integers(): dtype = DatetimeTZDtype(unit="ns", tz="UTC") result = dtype.__from_arrow__(arr) - expected = DatetimeArray._from_sequence(np.array(data, dtype="datetime64[ns]")) + expected = DatetimeArray._from_sequence( + np.array(data, dtype="datetime64[ns]"), 
dtype=np.dtype("M8[ns]") + ) expected = expected.tz_localize("UTC") tm.assert_extension_array_equal(result, expected) diff --git a/pandas/tests/arrays/test_array.py b/pandas/tests/arrays/test_array.py index 4070a2844846f..3c0ef1e4d928b 100644 --- a/pandas/tests/arrays/test_array.py +++ b/pandas/tests/arrays/test_array.py @@ -370,11 +370,15 @@ def test_array_copy(): ), ( np.array([1, 2], dtype="m8[ns]"), - TimedeltaArray._from_sequence(np.array([1, 2], dtype="m8[ns]")), + TimedeltaArray._from_sequence( + np.array([1, 2], dtype="m8[ns]"), dtype=np.dtype("m8[ns]") + ), ), ( np.array([1, 2], dtype="m8[us]"), - TimedeltaArray._from_sequence(np.array([1, 2], dtype="m8[us]")), + TimedeltaArray._from_sequence( + np.array([1, 2], dtype="m8[us]"), dtype=np.dtype("m8[us]") + ), ), # integer ([1, 2], IntegerArray._from_sequence([1, 2], dtype="Int64")), diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index 6dd1ef9d59ab4..0c8eefab95464 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -257,7 +257,8 @@ def test_fillna_method_doesnt_change_orig(self, method): if self.array_cls is PeriodArray: arr = self.array_cls(data, dtype="period[D]") else: - arr = self.array_cls._from_sequence(data) + dtype = "M8[ns]" if self.array_cls is DatetimeArray else "m8[ns]" + arr = self.array_cls._from_sequence(data, dtype=np.dtype(dtype)) arr[4] = NaT fill_value = arr[3] if method == "pad" else arr[5] @@ -273,7 +274,8 @@ def test_searchsorted(self): if self.array_cls is PeriodArray: arr = self.array_cls(data, dtype="period[D]") else: - arr = self.array_cls._from_sequence(data) + dtype = "M8[ns]" if self.array_cls is DatetimeArray else "m8[ns]" + arr = self.array_cls._from_sequence(data, dtype=np.dtype(dtype)) # scalar result = arr.searchsorted(arr[1]) @@ -739,10 +741,10 @@ def test_array_i8_dtype(self, arr1d): def test_from_array_keeps_base(self): # Ensure that DatetimeArray._ndarray.base isn't lost. 
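
PATCH 081's explicit dtype plumbing is behavior-preserving; for reference, the NumPy-to-masked-dtype mapping it relies on looks like this (an illustrative sketch of long-standing :func:`pandas.array` behavior, not code from the patch):

>>> import numpy as np
>>> import pandas as pd
>>> pd.array(np.array([1, 2], dtype="int32"))
<IntegerArray>
[1, 2]
Length: 2, dtype: Int32
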
arr = np.array(["2000-01-01", "2000-01-02"], dtype="M8[ns]") - dta = DatetimeArray._from_sequence(arr) + dta = DatetimeArray._from_sequence(arr, dtype=arr.dtype) assert dta._ndarray is arr - dta = DatetimeArray._from_sequence(arr[:0]) + dta = DatetimeArray._from_sequence(arr[:0], dtype=arr.dtype) assert dta._ndarray.base is arr def test_from_dti(self, arr1d): diff --git a/pandas/tests/arrays/test_datetimes.py b/pandas/tests/arrays/test_datetimes.py index 8e348805de978..e3f49d04a0ff2 100644 --- a/pandas/tests/arrays/test_datetimes.py +++ b/pandas/tests/arrays/test_datetimes.py @@ -499,7 +499,7 @@ def test_value_counts_preserves_tz(self): @pytest.mark.parametrize("method", ["pad", "backfill"]) def test_fillna_preserves_tz(self, method): dti = pd.date_range("2000-01-01", periods=5, freq="D", tz="US/Central") - arr = DatetimeArray._from_sequence(dti, copy=True) + arr = DatetimeArray._from_sequence(dti, dtype=dti.dtype, copy=True) arr[2] = pd.NaT fill_val = dti[1] if method == "pad" else dti[3] @@ -665,7 +665,9 @@ def test_shift_fill_value(self): dti = pd.date_range("2016-01-01", periods=3) dta = dti._data - expected = DatetimeArray._from_sequence(np.roll(dta._ndarray, 1)) + expected = DatetimeArray._from_sequence( + np.roll(dta._ndarray, 1), dtype=dti.dtype + ) fv = dta[-1] for fill_value in [fv, fv.to_pydatetime(), fv.to_datetime64()]: @@ -731,7 +733,11 @@ def test_iter_zoneinfo_fold(self, tz): ) utc_vals *= 1_000_000_000 - dta = DatetimeArray._from_sequence(utc_vals).tz_localize("UTC").tz_convert(tz) + dta = ( + DatetimeArray._from_sequence(utc_vals, dtype=np.dtype("M8[ns]")) + .tz_localize("UTC") + .tz_convert(tz) + ) left = dta[2] right = list(dta)[2] diff --git a/pandas/tests/arrays/test_timedeltas.py b/pandas/tests/arrays/test_timedeltas.py index bcc52f197ee51..fb7c7afdc6ff9 100644 --- a/pandas/tests/arrays/test_timedeltas.py +++ b/pandas/tests/arrays/test_timedeltas.py @@ -263,10 +263,10 @@ def test_searchsorted_invalid_types(self, other, index): class TestUnaryOps: def test_abs(self): vals = np.array([-3600 * 10**9, "NaT", 7200 * 10**9], dtype="m8[ns]") - arr = TimedeltaArray._from_sequence(vals) + arr = TimedeltaArray._from_sequence(vals, dtype=vals.dtype) evals = np.array([3600 * 10**9, "NaT", 7200 * 10**9], dtype="m8[ns]") - expected = TimedeltaArray._from_sequence(evals) + expected = TimedeltaArray._from_sequence(evals, dtype=evals.dtype) result = abs(arr) tm.assert_timedelta_array_equal(result, expected) @@ -276,7 +276,7 @@ def test_abs(self): def test_pos(self): vals = np.array([-3600 * 10**9, "NaT", 7200 * 10**9], dtype="m8[ns]") - arr = TimedeltaArray._from_sequence(vals) + arr = TimedeltaArray._from_sequence(vals, dtype=vals.dtype) result = +arr tm.assert_timedelta_array_equal(result, arr) @@ -288,7 +288,7 @@ def test_pos(self): def test_neg(self): vals = np.array([-3600 * 10**9, "NaT", 7200 * 10**9], dtype="m8[ns]") - arr = TimedeltaArray._from_sequence(vals) + arr = TimedeltaArray._from_sequence(vals, dtype=vals.dtype) evals = np.array([3600 * 10**9, "NaT", -7200 * 10**9], dtype="m8[ns]") expected = TimedeltaArray._from_sequence(evals) diff --git a/pandas/tests/base/test_conversion.py b/pandas/tests/base/test_conversion.py index 13a3ff048c79e..d8af7abe83084 100644 --- a/pandas/tests/base/test_conversion.py +++ b/pandas/tests/base/test_conversion.py @@ -333,7 +333,8 @@ def test_array_multiindex_raises(): # Timedelta ( TimedeltaArray._from_sequence( - np.array([0, 3600000000000], dtype="i8").view("m8[ns]") + np.array([0, 3600000000000], dtype="i8").view("m8[ns]"), + 
dtype=np.dtype("m8[ns]"), ), np.array([0, 3600000000000], dtype="m8[ns]"), ), diff --git a/pandas/tests/dtypes/test_generic.py b/pandas/tests/dtypes/test_generic.py index 261f86bfb0326..2b90886a8d070 100644 --- a/pandas/tests/dtypes/test_generic.py +++ b/pandas/tests/dtypes/test_generic.py @@ -20,8 +20,8 @@ class TestABCClasses: df = pd.DataFrame({"names": ["a", "b", "c"]}, index=multi_index) sparse_array = pd.arrays.SparseArray(np.random.default_rng(2).standard_normal(10)) - datetime_array = pd.core.arrays.DatetimeArray._from_sequence(datetime_index) - timedelta_array = pd.core.arrays.TimedeltaArray._from_sequence(timedelta_index) + datetime_array = datetime_index.array + timedelta_array = timedelta_index.array abc_pairs = [ ("ABCMultiIndex", multi_index), diff --git a/pandas/tests/series/accessors/test_dt_accessor.py b/pandas/tests/series/accessors/test_dt_accessor.py index 9b9a8ea3600ae..885adb3543b46 100644 --- a/pandas/tests/series/accessors/test_dt_accessor.py +++ b/pandas/tests/series/accessors/test_dt_accessor.py @@ -790,7 +790,8 @@ def test_end_time_timevalues(self, input_vals): # GH#17157 # Check that the time part of the Period is adjusted by end_time # when using the dt accessor on a Series - input_vals = PeriodArray._from_sequence(np.asarray(input_vals)) + dtype = pd.PeriodDtype(input_vals[0].freq) + input_vals = PeriodArray._from_sequence(np.asarray(input_vals), dtype=dtype) ser = Series(input_vals) result = ser.dt.end_time From b96491a11b7938c9146a26bfac339a6ebe0ca4a2 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 25 Sep 2024 09:22:49 -1000 Subject: [PATCH 083/224] DOC: Emphasize team managed pandas in installation docs (#59822) * DOC: Emphasize team managed pandas in installation docs * grammar --- doc/source/development/maintaining.rst | 2 +- doc/source/getting_started/index.rst | 3 +- doc/source/getting_started/install.rst | 151 ++++++------------ web/pandas/getting_started.md | 29 +--- .../static/img/install/anaconda_prompt.png | Bin 1373 -> 0 bytes .../static/img/install/jupyterlab_home.png | Bin 1962 -> 0 bytes .../img/install/pandas_import_and_version.png | Bin 2252 -> 0 bytes 7 files changed, 52 insertions(+), 133 deletions(-) delete mode 100644 web/pandas/static/img/install/anaconda_prompt.png delete mode 100644 web/pandas/static/img/install/jupyterlab_home.png delete mode 100644 web/pandas/static/img/install/pandas_import_and_version.png diff --git a/doc/source/development/maintaining.rst b/doc/source/development/maintaining.rst index 50d380cab1d50..1e4a851d0e72d 100644 --- a/doc/source/development/maintaining.rst +++ b/doc/source/development/maintaining.rst @@ -344,7 +344,7 @@ in the next places: - Git repo with a `new tag `_ - Source distribution in a `GitHub release `_ - Pip packages in the `PyPI `_ -- Conda/Mamba packages in `conda-forge `_ +- Conda packages in `conda-forge `_ The process for releasing a new version of pandas is detailed next section. diff --git a/doc/source/getting_started/index.rst b/doc/source/getting_started/index.rst index 36ed553d9d88e..a17699a71fbd3 100644 --- a/doc/source/getting_started/index.rst +++ b/doc/source/getting_started/index.rst @@ -17,8 +17,7 @@ Installation :columns: 12 12 6 6 :padding: 3 - pandas is part of the `Anaconda `__ - distribution and can be installed with Anaconda or Miniconda: + pandas can be installed via conda from `conda-forge `__. 
++++++++++++++++++++++ diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index 8e6cb9e9a132d..b3982c4ad091f 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -6,15 +6,16 @@ Installation ============ -The easiest way to install pandas is to install it -as part of the `Anaconda `__ distribution, a -cross platform distribution for data analysis and scientific computing. -The `Conda `__ package manager is the -recommended installation method for most users. +The pandas development team officially distributes pandas for installation +through the following methods: -Instructions for installing :ref:`from source `, -:ref:`PyPI `, or a -:ref:`development version ` are also provided. +* Available on `conda-forge `__ for installation with the conda package manager. +* Available on `PyPI `__ for installation with pip. +* Available on `Github `__ for installation from source. + +.. note:: + pandas may be installable from other sources besides the ones listed above, + but they are **not** managed by the pandas development team. .. _install.version: @@ -26,68 +27,54 @@ See :ref:`Python support policy `. Installing pandas ----------------- -.. _install.anaconda: +.. _install.conda: -Installing with Anaconda -~~~~~~~~~~~~~~~~~~~~~~~~ +Installing with Conda +~~~~~~~~~~~~~~~~~~~~~ -For users that are new to Python, the easiest way to install Python, pandas, and the -packages that make up the `PyData `__ stack -(`SciPy `__, `NumPy `__, -`Matplotlib `__, `and more `__) -is with `Anaconda `__, a cross-platform -(Linux, macOS, Windows) Python distribution for data analytics and -scientific computing. Installation instructions for Anaconda -`can be found here `__. +For users working with the `Conda `__ package manager, +pandas can be installed from the ``conda-forge`` channel. -.. _install.miniconda: +.. code-block:: shell -Installing with Miniconda -~~~~~~~~~~~~~~~~~~~~~~~~~ + conda install -c conda-forge pandas -For users experienced with Python, the recommended way to install pandas with -`Miniconda `__. -Miniconda allows you to create a minimal, self-contained Python installation compared to Anaconda and use the -`Conda `__ package manager to install additional packages -and create a virtual environment for your installation. Installation instructions for Miniconda -`can be found here `__. +To install the Conda package manager on your system, the +`Miniforge distribution `__ +is recommended. -The next step is to create a new conda environment. A conda environment is like a -virtualenv that allows you to specify a specific version of Python and set of libraries. -Run the following commands from a terminal window. +Additionally, it is recommended to install and run pandas from a virtual environment. .. code-block:: shell conda create -c conda-forge -n name_of_my_env python pandas - -This will create a minimal environment with only Python and pandas installed. -To put your self inside this environment run. - -.. code-block:: shell - + # On Linux or MacOS source activate name_of_my_env # On Windows activate name_of_my_env -.. _install.pypi: +.. tip:: + For users that are new to Python, the easiest way to install Python, pandas, and the + packages that make up the `PyData `__ stack such as + `SciPy `__, `NumPy `__ and + `Matplotlib `__ + is with `Anaconda `__, a cross-platform + (Linux, macOS, Windows) Python distribution for data analytics and + scientific computing. 
-Installing from PyPI -~~~~~~~~~~~~~~~~~~~~ + However, pandas from Anaconda is **not** officially managed by the pandas development team. -pandas can be installed via pip from -`PyPI `__. +.. _install.pip: -.. code-block:: shell - - pip install pandas +Installing with pip +~~~~~~~~~~~~~~~~~~~ -.. note:: - You must have ``pip>=19.3`` to install from PyPI. +For users working with the `pip `__ package manager, +pandas can be installed from `PyPI `__. -.. note:: +.. code-block:: shell - It is recommended to install and run pandas from a virtual environment, for example, - using the Python standard library's `venv `__ + pip install pandas pandas can also be installed with sets of optional dependencies to enable certain functionality. For example, to install pandas with the optional dependencies to read Excel files. @@ -98,25 +85,8 @@ to install pandas with the optional dependencies to read Excel files. The full list of extras that can be installed can be found in the :ref:`dependency section.` -Handling ImportErrors -~~~~~~~~~~~~~~~~~~~~~ - -If you encounter an ``ImportError``, it usually means that Python couldn't find pandas in the list of available -libraries. Python internally has a list of directories it searches through, to find packages. You can -obtain these directories with. - -.. code-block:: python - - import sys - sys.path - -One way you could be encountering this error is if you have multiple Python installations on your system -and you don't have pandas installed in the Python installation you're currently using. -In Linux/Mac you can run ``which python`` on your terminal and it will tell you which Python installation you're -using. If it's something like "/usr/bin/python", you're using the Python from the system, which is not recommended. - -It is highly recommended to use ``conda``, for quick installation and for package and dependency updates. -You can find simple installation instructions for pandas :ref:`in this document `. +Additionally, it is recommended to install and run pandas from a virtual environment, for example, +using the Python standard library's `venv `__ .. _install.source: @@ -144,49 +114,24 @@ index from the PyPI registry of anaconda.org. You can install it by running. pip install --pre --extra-index https://pypi.anaconda.org/scientific-python-nightly-wheels/simple pandas -Note that you might be required to uninstall an existing version of pandas to install the development version. +.. note:: + You might be required to uninstall an existing version of pandas to install the development version. -.. code-block:: shell + .. code-block:: shell - pip uninstall pandas -y + pip uninstall pandas -y Running the test suite ---------------------- -pandas is equipped with an exhaustive set of unit tests. The packages required to run the tests -can be installed with ``pip install "pandas[test]"``. To run the tests from a -Python terminal. - -.. code-block:: python - - >>> import pandas as pd - >>> pd.test() - running: pytest -m "not slow and not network and not db" /home/user/anaconda3/lib/python3.10/site-packages/pandas - - ============================= test session starts ============================== - platform linux -- Python 3.9.7, pytest-6.2.5, py-1.11.0, pluggy-1.0.0 - rootdir: /home/user - plugins: dash-1.19.0, anyio-3.5.0, hypothesis-6.29.3 - collected 154975 items / 4 skipped / 154971 selected - ........................................................................ [ 0%] - ........................................................................ 
[ 99%] - ....................................... [100%] - - ==================================== ERRORS ==================================== - - =================================== FAILURES =================================== - - =============================== warnings summary =============================== - - =========================== short test summary info ============================ - - = 1 failed, 146194 passed, 7402 skipped, 1367 xfailed, 5 xpassed, 197 warnings, 10 errors in 1090.16s (0:18:10) = +If pandas has been installed :ref:`from source `, running ``pytest pandas`` will run all of pandas unit tests. +The unit tests can also be run from the pandas module itself with the :func:`test` function. The packages required to run the tests +can be installed with ``pip install "pandas[test]"``. .. note:: - This is just an example of what information is shown. Test failures are not necessarily indicative - of a broken pandas installation. + Test failures are not necessarily indicative of a broken pandas installation. .. _install.dependencies: @@ -219,7 +164,7 @@ For example, :func:`pandas.read_hdf` requires the ``pytables`` package, while optional dependency is not installed, pandas will raise an ``ImportError`` when the method requiring that dependency is called. -If using pip, optional pandas dependencies can be installed or managed in a file (e.g. requirements.txt or pyproject.toml) +With pip, optional pandas dependencies can be installed or managed in a file (e.g. requirements.txt or pyproject.toml) as optional extras (e.g. ``pandas[performance, aws]``). All optional dependencies can be installed with ``pandas[all]``, and specific sets of dependencies are listed in the sections below. diff --git a/web/pandas/getting_started.md b/web/pandas/getting_started.md index 0c4219e1ae12e..801081a9ef391 100644 --- a/web/pandas/getting_started.md +++ b/web/pandas/getting_started.md @@ -2,33 +2,8 @@ ## Installation instructions -The next steps provides the easiest and recommended way to set up your -environment to use pandas. Other installation options can be found in -the [advanced installation page]({{ base_url}}docs/getting_started/install.html). - -1. Download [Anaconda](https://www.anaconda.com/download/) for your operating system and - the latest Python version, run the installer, and follow the steps. Please note: - - - It is not needed (and discouraged) to install Anaconda as root or administrator. - - When asked if you wish to initialize Anaconda3, answer yes. - - Restart the terminal after completing the installation. - - Detailed instructions on how to install Anaconda can be found in the - [Anaconda documentation](https://docs.anaconda.com/anaconda/install/). - -2. In the Anaconda prompt (or terminal in Linux or macOS), start JupyterLab: - - - -3. In JupyterLab, create a new (Python 3) notebook: - - - -4. In the first cell of the notebook, you can import pandas and check the version with: - - - -5. Now you are ready to use pandas, and you can write your code in the next cells. +To install pandas, please reference the [installation page]({{ base_url}}docs/getting_started/install.html) +from the pandas documentation. 
## Tutorials

diff --git a/web/pandas/static/img/install/anaconda_prompt.png b/web/pandas/static/img/install/anaconda_prompt.png
deleted file mode 100644
index 7b547e4ebb02a6102ecf615ddddf576dc74ccd15..0000000000000000000000000000000000000000
GIT binary patch
[base85 binary patch data for the deleted image omitted]
diff --git a/web/pandas/static/img/install/jupyterlab_home.png b/web/pandas/static/img/install/jupyterlab_home.png
deleted file mode 100644
index c62d33a5e0fc605be6d66c4a7be9f31d9baee8bc..0000000000000000000000000000000000000000
GIT binary patch
[base85 binary patch data for the deleted image omitted]
diff --git a/web/pandas/static/img/install/pandas_import_and_version.png b/web/pandas/static/img/install/pandas_import_and_version.png
deleted file mode 100644
index 64c1303ac495ccf72a7c649401cce26c47c15ace..0000000000000000000000000000000000000000
GIT binary patch
[base85 binary patch data for the deleted image omitted]

From b9488218ae27b70d1669a932ab16e8ce5a257cf0 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Wed, 25 Sep 2024 14:47:10 -1000
Subject: [PATCH 084/224] CI/TST: Check for tzset in set_timezone (#59893)

* CI/TST: Check for tzset in set_timezone

* adjust test message
---
 pandas/_testing/contexts.py         | 17 +++++++++--------
 pandas/tests/tslibs/test_parsing.py | 11 +++++++----
 2 files changed, 16 insertions(+), 12 deletions(-)

diff --git a/pandas/_testing/contexts.py b/pandas/_testing/contexts.py
index 91b5d2a981bef..4ca67d6fc082d 100644
--- a/pandas/_testing/contexts.py
+++ b/pandas/_testing/contexts.py
@@ -73,14 +73,15 @@ def set_timezone(tz: str) -> Generator[None, None, None]:
     import time

     def setTZ(tz) -> None:
-        if tz is None:
-            try:
-                del os.environ["TZ"]
-            except KeyError:
-                pass
-        else:
-            os.environ["TZ"] = tz
-        time.tzset()
+        if hasattr(time, "tzset"):
+            if tz is None:
+                try:
+                    del os.environ["TZ"]
+                except KeyError:
+                    pass
+            else:
+                os.environ["TZ"] = tz
+            time.tzset()

     orig_tz = os.environ.get("TZ")
     setTZ(tz)
diff --git a/pandas/tests/tslibs/test_parsing.py b/pandas/tests/tslibs/test_parsing.py
index 9b64beaf09273..07425af8ed37a 100644
--- a/pandas/tests/tslibs/test_parsing.py
+++ b/pandas/tests/tslibs/test_parsing.py
@@ -37,10 +37,13 @@
 )
 def test_parsing_tzlocal_deprecated():
     # GH#50791
-    msg = (
-        r"Parsing 'EST' as tzlocal \(dependent on system timezone\) "
-        r"is no longer supported\. "
-        "Pass the 'tz' keyword or call tz_localize after construction instead"
+    msg = "|".join(
+        [
+            r"Parsing 'EST' as tzlocal \(dependent on system timezone\) "
+            r"is no longer supported\. "
+            "Pass the 'tz' keyword or call tz_localize after construction instead",
+            ".*included an un-recognized timezone",
+        ]
     )

     dtstr = "Jan 15 2004 03:00 EST"

From 23c497bb2f7e05af1fda966e7fb04db942453559 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Thu, 26 Sep 2024 05:06:07 -1000
Subject: [PATCH 085/224] DOC: Recommend conda from miniforge for contributing
 environment (#59894)

---
 doc/source/development/contributing.rst      |  6 ++---
 .../development/contributing_codebase.rst    |  2 +-
 .../development/contributing_environment.rst | 23 +++++++++----------
 3 files changed, 15 insertions(+), 16 deletions(-)

diff --git a/doc/source/development/contributing.rst b/doc/source/development/contributing.rst
index fe5271dab7132..4d99f282aa695 100644
--- a/doc/source/development/contributing.rst
+++ b/doc/source/development/contributing.rst
@@ -305,15 +305,15 @@ It is important to periodically update your local ``main`` branch with updates f
 branch and update your development environment to reflect any changes to the
 various packages that are used during development.

-If using :ref:`mamba `, run:
+If using :ref:`conda `, run:

.. 
code-block:: shell git checkout main git fetch upstream git merge upstream/main - mamba activate pandas-dev - mamba env update -f environment.yml --prune + conda activate pandas-dev + conda env update -f environment.yml --prune If using :ref:`pip ` , do: diff --git a/doc/source/development/contributing_codebase.rst b/doc/source/development/contributing_codebase.rst index 9d5a992e911b6..670ffe6996302 100644 --- a/doc/source/development/contributing_codebase.rst +++ b/doc/source/development/contributing_codebase.rst @@ -244,7 +244,7 @@ in your python environment. .. warning:: - * Please be aware that the above commands will use the current python environment. If your python packages are older/newer than those installed by the pandas CI, the above commands might fail. This is often the case when the ``mypy`` or ``numpy`` versions do not match. Please see :ref:`how to setup the python environment ` or select a `recently succeeded workflow `_, select the "Docstring validation, typing, and other manual pre-commit hooks" job, then click on "Set up Conda" and "Environment info" to see which versions the pandas CI installs. + * Please be aware that the above commands will use the current python environment. If your python packages are older/newer than those installed by the pandas CI, the above commands might fail. This is often the case when the ``mypy`` or ``numpy`` versions do not match. Please see :ref:`how to setup the python environment ` or select a `recently succeeded workflow `_, select the "Docstring validation, typing, and other manual pre-commit hooks" job, then click on "Set up Conda" and "Environment info" to see which versions the pandas CI installs. .. _contributing.ci: diff --git a/doc/source/development/contributing_environment.rst b/doc/source/development/contributing_environment.rst index 643021db7b823..1426d3a84a748 100644 --- a/doc/source/development/contributing_environment.rst +++ b/doc/source/development/contributing_environment.rst @@ -43,7 +43,7 @@ and consult the ``Linux`` instructions below. **macOS** -To use the :ref:`mamba `-based compilers, you will need to install the +To use the :ref:`conda `-based compilers, you will need to install the Developer Tools using ``xcode-select --install``. If you prefer to use a different compiler, general information can be found here: @@ -51,9 +51,9 @@ https://devguide.python.org/setup/#macos **Linux** -For Linux-based :ref:`mamba ` installations, you won't have to install any -additional components outside of the mamba environment. The instructions -below are only needed if your setup isn't based on mamba environments. +For Linux-based :ref:`conda ` installations, you won't have to install any +additional components outside of the conda environment. The instructions +below are only needed if your setup isn't based on conda environments. Some Linux distributions will come with a pre-installed C compiler. To find out which compilers (and versions) are installed on your system:: @@ -82,19 +82,18 @@ Before we begin, please: * Make sure that you have :any:`cloned the repository ` * ``cd`` to the pandas source directory you just created with the clone command -.. _contributing.mamba: +.. 
_contributing.conda: -Option 1: using mamba (recommended) +Option 1: using conda (recommended) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -* Install miniforge to get `mamba `_ -* Make sure your mamba is up to date (``mamba update mamba``) -* Create and activate the ``pandas-dev`` mamba environment using the following commands: +* Install miniforge to get `conda `_ +* Create and activate the ``pandas-dev`` conda environment using the following commands: -.. code-block:: none +.. code-block:: bash - mamba env create --file environment.yml - mamba activate pandas-dev + conda env create --file environment.yml + conda activate pandas-dev .. _contributing.pip: From 5ced458f6318f0319877ab655b8cb6b86092ea62 Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Sat, 28 Sep 2024 07:51:30 -0400 Subject: [PATCH 086/224] CI: Pin micromamba to 1.x (#59912) --- .github/actions/setup-conda/action.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/actions/setup-conda/action.yml b/.github/actions/setup-conda/action.yml index 3eb68bdd2a15c..4fe901998cbcc 100644 --- a/.github/actions/setup-conda/action.yml +++ b/.github/actions/setup-conda/action.yml @@ -9,6 +9,8 @@ runs: - name: Install ${{ inputs.environment-file }} uses: mamba-org/setup-micromamba@v1 with: + # Pinning to avoid 2.0 failures + micromamba-version: '1.5.10-0' environment-file: ${{ inputs.environment-file }} environment-name: test condarc-file: ci/.condarc From 96de1f13103cd21417101de9d555f203cf93867a Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Sun, 29 Sep 2024 00:07:34 +0530 Subject: [PATCH 087/224] DOC: fix SA01, ES01 for pandas.Series.sparse.npoints (#59896) * DOC: fix SA01, ES01 for pandas.Series.sparse.npoints * Update pandas/core/arrays/sparse/array.py --- ci/code_checks.sh | 1 - pandas/core/arrays/sparse/array.py | 12 ++++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index f662b4781e84b..149c5c0326733 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -97,7 +97,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Series.dt.unit GL08" \ -i "pandas.Series.pad PR01,SA01" \ -i "pandas.Series.sparse.from_coo PR07,SA01" \ - -i "pandas.Series.sparse.npoints SA01" \ -i "pandas.Timedelta.max PR02" \ -i "pandas.Timedelta.min PR02" \ -i "pandas.Timedelta.resolution PR02" \ diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index c8ec4068ca199..0c76280e7fdb4 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -708,6 +708,18 @@ def npoints(self) -> int: """ The number of non- ``fill_value`` points. + This property returns the number of elements in the sparse series that are + not equal to the ``fill_value``. Sparse data structures store only the + non-``fill_value`` elements, reducing memory usage when the majority of + values are the same. + + See Also + -------- + Series.sparse.to_dense : Convert a Series from sparse values to dense. + Series.sparse.fill_value : Elements in ``data`` that are ``fill_value`` are + not stored. + Series.sparse.density : The percent of non- ``fill_value`` points, as decimal. 
+ Examples -------- >>> from pandas.arrays import SparseArray From cf12e6722cfaba646e7f0a1e5e8db88be8d076cd Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Sun, 29 Sep 2024 00:08:55 +0530 Subject: [PATCH 088/224] DOC: fix RT03, ES01 for pandas.core.groupby.DataFrameGroupBy.agg and pandas.core.groupby.DataFrameGroupBy.aggregate (#59869) * DOC: add double backticks for sphinx compatibility Co-authored-by: mroeschke * DOC: remove _agg_template_frame Co-authored-by: mroeschke Co-authored-by: rhshadrach * DOC: fix RT03, ES01 for pandas.core.groupby.DataFrameGroupBy.aggregate --------- Co-authored-by: mroeschke Co-authored-by: rhshadrach --- ci/code_checks.sh | 2 - pandas/core/groupby/generic.py | 176 +++++++++++++++++++++++++- pandas/core/groupby/groupby.py | 78 ------------ scripts/validate_unwanted_patterns.py | 1 - 4 files changed, 174 insertions(+), 83 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 149c5c0326733..669c793737161 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -127,8 +127,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.arrays.SparseArray PR07,SA01" \ -i "pandas.arrays.TimedeltaArray PR07,SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.__iter__ RT03,SA01" \ - -i "pandas.core.groupby.DataFrameGroupBy.agg RT03" \ - -i "pandas.core.groupby.DataFrameGroupBy.aggregate RT03" \ -i "pandas.core.groupby.DataFrameGroupBy.boxplot PR07,RT03,SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.get_group RT03,SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.groups SA01" \ diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index bec9d344d42e2..0c211afb5073c 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -67,7 +67,6 @@ from pandas.core.groupby.groupby import ( GroupBy, GroupByPlot, - _agg_template_frame, _agg_template_series, _transform_template, ) @@ -1515,8 +1514,181 @@ class DataFrameGroupBy(GroupBy[DataFrame]): """ ) - @doc(_agg_template_frame, examples=_agg_examples_doc, klass="DataFrame") def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs): + """ + Aggregate using one or more operations. + + The ``aggregate`` function allows the application of one or more aggregation + operations on groups of data within a DataFrameGroupBy object. It supports + various aggregation methods, including user-defined functions and predefined + functions such as 'sum', 'mean', etc. + + Parameters + ---------- + func : function, str, list, dict or None + Function to use for aggregating the data. If a function, must either + work when passed a DataFrame or when passed to DataFrame.apply. + + Accepted combinations are: + + - function + - string function name + - list of functions and/or function names, e.g. ``[np.sum, 'mean']`` + - dict of index labels -> functions, function names or list of such. + - None, in which case ``**kwargs`` are used with Named Aggregation. Here the + output has one column for each element in ``**kwargs``. The name of the + column is keyword, whereas the value determines the aggregation used to + compute the values in the column. + + Can also accept a Numba JIT function with + ``engine='numba'`` specified. Only passing a single function is supported + with this engine. + + If the ``'numba'`` engine is chosen, the function must be + a user defined function with ``values`` and ``index`` as the + first and second arguments respectively in the function signature. 
+ Each group's index will be passed to the user defined function + and optionally available for use. + + *args + Positional arguments to pass to func. + engine : str, default None + * ``'cython'`` : Runs the function through C-extensions from cython. + * ``'numba'`` : Runs the function through JIT compiled code from numba. + * ``None`` : Defaults to ``'cython'`` or globally setting + ``compute.use_numba`` + + engine_kwargs : dict, default None + * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` + * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` + and ``parallel`` dictionary keys. The values must either be ``True`` or + ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is + ``{'nopython': True, 'nogil': False, 'parallel': False}`` and will be + applied to the function + + **kwargs + * If ``func`` is None, ``**kwargs`` are used to define the output names and + aggregations via Named Aggregation. See ``func`` entry. + * Otherwise, keyword arguments to be passed into func. + + Returns + ------- + DataFrame + Aggregated DataFrame based on the grouping and the applied aggregation + functions. + + See Also + -------- + DataFrame.groupby.apply : Apply function func group-wise + and combine the results together. + DataFrame.groupby.transform : Transforms the Series on each group + based on the given function. + DataFrame.aggregate : Aggregate using one or more operations. + + Notes + ----- + When using ``engine='numba'``, there will be no "fall back" behavior internally. + The group data and group index will be passed as numpy arrays to the JITed + user defined function, and no alternative execution attempts will be tried. + + Functions that mutate the passed object can produce unexpected + behavior or errors and are not supported. See :ref:`gotchas.udf-mutation` + for more details. + + .. versionchanged:: 1.3.0 + + The resulting dtype will reflect the return value of the passed ``func``, + see the examples below. + + Examples + -------- + >>> data = { + ... "A": [1, 1, 2, 2], + ... "B": [1, 2, 3, 4], + ... "C": [0.362838, 0.227877, 1.267767, -0.562860], + ... } + >>> df = pd.DataFrame(data) + >>> df + A B C + 0 1 1 0.362838 + 1 1 2 0.227877 + 2 2 3 1.267767 + 3 2 4 -0.562860 + + The aggregation is for each column. + + >>> df.groupby("A").agg("min") + B C + A + 1 1 0.227877 + 2 3 -0.562860 + + Multiple aggregations + + >>> df.groupby("A").agg(["min", "max"]) + B C + min max min max + A + 1 1 2 0.227877 0.362838 + 2 3 4 -0.562860 1.267767 + + Select a column for aggregation + + >>> df.groupby("A").B.agg(["min", "max"]) + min max + A + 1 1 2 + 2 3 4 + + User-defined function for aggregation + + >>> df.groupby("A").agg(lambda x: sum(x) + 2) + B C + A + 1 5 2.590715 + 2 9 2.704907 + + Different aggregations per column + + >>> df.groupby("A").agg({"B": ["min", "max"], "C": "sum"}) + B C + min max sum + A + 1 1 2 0.590715 + 2 3 4 0.704907 + + To control the output names with different aggregations per column, + pandas supports "named aggregation" + + >>> df.groupby("A").agg( + ... b_min=pd.NamedAgg(column="B", aggfunc="min"), + ... c_sum=pd.NamedAgg(column="C", aggfunc="sum"), + ... ) + b_min c_sum + A + 1 1 0.590715 + 2 3 0.704907 + + - The keywords are the *output* column names + - The values are tuples whose first element is the column to select + and the second element is the aggregation to apply to that column. 
+ Pandas provides the ``pandas.NamedAgg`` namedtuple with the fields + ``['column', 'aggfunc']`` to make it clearer what the arguments are. + As usual, the aggregation can be a callable or a string alias. + + See :ref:`groupby.aggregate.named` for more. + + .. versionchanged:: 1.3.0 + + The resulting dtype will reflect the return value of the aggregating + function. + + >>> df.groupby("A")[["B"]].agg(lambda x: x.astype(float).min()) + B + A + 1 1.0 + 2 3.0 + """ relabeling, func, columns, order = reconstruct_func(func, **kwargs) func = maybe_mangle_lambdas(func) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 38dad446b4c39..9e36837bc679f 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -445,84 +445,6 @@ class providing the base-class of operations. see the examples below. {examples}""" -_agg_template_frame = """ -Aggregate using one or more operations. - -Parameters ----------- -func : function, str, list, dict or None - Function to use for aggregating the data. If a function, must either - work when passed a {klass} or when passed to {klass}.apply. - - Accepted combinations are: - - - function - - string function name - - list of functions and/or function names, e.g. ``[np.sum, 'mean']`` - - dict of index labels -> functions, function names or list of such. - - None, in which case ``**kwargs`` are used with Named Aggregation. Here the - output has one column for each element in ``**kwargs``. The name of the - column is keyword, whereas the value determines the aggregation used to compute - the values in the column. - - Can also accept a Numba JIT function with - ``engine='numba'`` specified. Only passing a single function is supported - with this engine. - - If the ``'numba'`` engine is chosen, the function must be - a user defined function with ``values`` and ``index`` as the - first and second arguments respectively in the function signature. - Each group's index will be passed to the user defined function - and optionally available for use. - -*args - Positional arguments to pass to func. -engine : str, default None - * ``'cython'`` : Runs the function through C-extensions from cython. - * ``'numba'`` : Runs the function through JIT compiled code from numba. - * ``None`` : Defaults to ``'cython'`` or globally setting ``compute.use_numba`` - -engine_kwargs : dict, default None - * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` - * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` - and ``parallel`` dictionary keys. The values must either be ``True`` or - ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is - ``{{'nopython': True, 'nogil': False, 'parallel': False}}`` and will be - applied to the function - -**kwargs - * If ``func`` is None, ``**kwargs`` are used to define the output names and - aggregations via Named Aggregation. See ``func`` entry. - * Otherwise, keyword arguments to be passed into func. - -Returns -------- -{klass} - -See Also --------- -{klass}.groupby.apply : Apply function func group-wise - and combine the results together. -{klass}.groupby.transform : Transforms the Series on each group - based on the given function. -{klass}.aggregate : Aggregate using one or more operations. - -Notes ------ -When using ``engine='numba'``, there will be no "fall back" behavior internally. -The group data and group index will be passed as numpy arrays to the JITed -user defined function, and no alternative execution attempts will be tried. 
- -Functions that mutate the passed object can produce unexpected -behavior or errors and are not supported. See :ref:`gotchas.udf-mutation` -for more details. - -.. versionchanged:: 1.3.0 - - The resulting dtype will reflect the return value of the passed ``func``, - see the examples below. -{examples}""" - @final class GroupByPlot(PandasObject): diff --git a/scripts/validate_unwanted_patterns.py b/scripts/validate_unwanted_patterns.py index 35f6ffb4980df..5962709056ae8 100755 --- a/scripts/validate_unwanted_patterns.py +++ b/scripts/validate_unwanted_patterns.py @@ -30,7 +30,6 @@ "_new_Index", "_new_PeriodIndex", "_agg_template_series", - "_agg_template_frame", "_pipe_template", "_apply_groupings_depr", "__main__", From d538a1cd1ad5d1e506c2dc36144e4cac5534858a Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Sun, 29 Sep 2024 01:08:32 +0530 Subject: [PATCH 089/224] DOC: fix RT03, ES01 for pandas.core.groupby.SeriesGroupBy.agg and pandas.core.groupby.SeriesGroupBy.aggregate (#59898) * DOC: fix RT03, ES01 for pandas.core.groupby.SeriesGroupBy.agg * DOC: remove _agg_template_series Co-authored-by: mroeschke Co-authored-by: rhshadrach * DOC: remove _agg_template_series Co-authored-by: mroeschke Co-authored-by: rhshadrach * DOC: remove _agg_template_seris --------- Co-authored-by: mroeschke Co-authored-by: rhshadrach --- ci/code_checks.sh | 2 - pandas/core/groupby/generic.py | 136 +++++++++++++++++++++++++- pandas/core/groupby/groupby.py | 81 --------------- scripts/validate_unwanted_patterns.py | 1 - 4 files changed, 134 insertions(+), 86 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 669c793737161..b65dcedbd8a10 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -137,8 +137,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.core.groupby.DataFrameGroupBy.plot PR02" \ -i "pandas.core.groupby.DataFrameGroupBy.sem SA01" \ -i "pandas.core.groupby.SeriesGroupBy.__iter__ RT03,SA01" \ - -i "pandas.core.groupby.SeriesGroupBy.agg RT03" \ - -i "pandas.core.groupby.SeriesGroupBy.aggregate RT03" \ -i "pandas.core.groupby.SeriesGroupBy.get_group RT03,SA01" \ -i "pandas.core.groupby.SeriesGroupBy.groups SA01" \ -i "pandas.core.groupby.SeriesGroupBy.indices SA01" \ diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 0c211afb5073c..110c0ea88a0a1 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -67,7 +67,6 @@ from pandas.core.groupby.groupby import ( GroupBy, GroupByPlot, - _agg_template_series, _transform_template, ) from pandas.core.indexes.api import ( @@ -323,8 +322,141 @@ def apply(self, func, *args, **kwargs) -> Series: """ return super().apply(func, *args, **kwargs) - @doc(_agg_template_series, examples=_agg_examples_doc, klass="Series") def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs): + """ + Aggregate using one or more operations. + + The ``aggregate`` method enables flexible and efficient aggregation of grouped + data using a variety of functions, including built-in, user-defined, and + optimized JIT-compiled functions. + + Parameters + ---------- + func : function, str, list, dict or None + Function to use for aggregating the data. If a function, must either + work when passed a Series or when passed to Series.apply. + + Accepted combinations are: + + - function + - string function name + - list of functions and/or function names, e.g. ``[np.sum, 'mean']`` + - None, in which case ``**kwargs`` are used with Named Aggregation. 
Here + the output has one column for each element in ``**kwargs``. The name of + the column is keyword, whereas the value determines the aggregation + used to compute the values in the column. + + Can also accept a Numba JIT function with + ``engine='numba'`` specified. Only passing a single function is supported + with this engine. + + If the ``'numba'`` engine is chosen, the function must be + a user defined function with ``values`` and ``index`` as the + first and second arguments respectively in the function signature. + Each group's index will be passed to the user defined function + and optionally available for use. + + .. deprecated:: 2.1.0 + + Passing a dictionary is deprecated and will raise in a future version + of pandas. Pass a list of aggregations instead. + *args + Positional arguments to pass to func. + engine : str, default None + * ``'cython'`` : Runs the function through C-extensions from cython. + * ``'numba'`` : Runs the function through JIT compiled code from numba. + * ``None`` : Defaults to ``'cython'`` or globally setting + ``compute.use_numba`` + + engine_kwargs : dict, default None + * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` + * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` + and ``parallel`` dictionary keys. The values must either be ``True`` or + ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is + ``{'nopython': True, 'nogil': False, 'parallel': False}`` and will be + applied to the function + + **kwargs + * If ``func`` is None, ``**kwargs`` are used to define the output names and + aggregations via Named Aggregation. See ``func`` entry. + * Otherwise, keyword arguments to be passed into func. + + Returns + ------- + Series + Aggregated Series based on the grouping and the applied aggregation + functions. + + See Also + -------- + SeriesGroupBy.apply : Apply function func group-wise + and combine the results together. + SeriesGroupBy.transform : Transforms the Series on each group + based on the given function. + Series.aggregate : Aggregate using one or more operations. + + Notes + ----- + When using ``engine='numba'``, there will be no "fall back" behavior internally. + The group data and group index will be passed as numpy arrays to the JITed + user defined function, and no alternative execution attempts will be tried. + + Functions that mutate the passed object can produce unexpected + behavior or errors and are not supported. See :ref:`gotchas.udf-mutation` + for more details. + + .. versionchanged:: 1.3.0 + + The resulting dtype will reflect the return value of the passed ``func``, + see the examples below. + + Examples + -------- + >>> s = pd.Series([1, 2, 3, 4]) + + >>> s + 0 1 + 1 2 + 2 3 + 3 4 + dtype: int64 + + >>> s.groupby([1, 1, 2, 2]).min() + 1 1 + 2 3 + dtype: int64 + + >>> s.groupby([1, 1, 2, 2]).agg("min") + 1 1 + 2 3 + dtype: int64 + + >>> s.groupby([1, 1, 2, 2]).agg(["min", "max"]) + min max + 1 1 2 + 2 3 4 + + The output column names can be controlled by passing + the desired column names and aggregations as keyword arguments. + + >>> s.groupby([1, 1, 2, 2]).agg( + ... minimum="min", + ... maximum="max", + ... ) + minimum maximum + 1 1 2 + 2 3 4 + + .. versionchanged:: 1.3.0 + + The resulting dtype will reflect the return value of the aggregating + function. 
+ + >>> s.groupby([1, 1, 2, 2]).agg(lambda x: x.astype(float).min()) + 1 1.0 + 2 3.0 + dtype: float64 + """ relabeling = func is None columns = None if relabeling: diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 9e36837bc679f..e2410788ea95e 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -364,87 +364,6 @@ class providing the base-class of operations. -------- %(example)s""" -_agg_template_series = """ -Aggregate using one or more operations. - -Parameters ----------- -func : function, str, list, dict or None - Function to use for aggregating the data. If a function, must either - work when passed a {klass} or when passed to {klass}.apply. - - Accepted combinations are: - - - function - - string function name - - list of functions and/or function names, e.g. ``[np.sum, 'mean']`` - - None, in which case ``**kwargs`` are used with Named Aggregation. Here the - output has one column for each element in ``**kwargs``. The name of the - column is keyword, whereas the value determines the aggregation used to compute - the values in the column. - - Can also accept a Numba JIT function with - ``engine='numba'`` specified. Only passing a single function is supported - with this engine. - - If the ``'numba'`` engine is chosen, the function must be - a user defined function with ``values`` and ``index`` as the - first and second arguments respectively in the function signature. - Each group's index will be passed to the user defined function - and optionally available for use. - - .. deprecated:: 2.1.0 - - Passing a dictionary is deprecated and will raise in a future version - of pandas. Pass a list of aggregations instead. -*args - Positional arguments to pass to func. -engine : str, default None - * ``'cython'`` : Runs the function through C-extensions from cython. - * ``'numba'`` : Runs the function through JIT compiled code from numba. - * ``None`` : Defaults to ``'cython'`` or globally setting ``compute.use_numba`` - -engine_kwargs : dict, default None - * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` - * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` - and ``parallel`` dictionary keys. The values must either be ``True`` or - ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is - ``{{'nopython': True, 'nogil': False, 'parallel': False}}`` and will be - applied to the function - -**kwargs - * If ``func`` is None, ``**kwargs`` are used to define the output names and - aggregations via Named Aggregation. See ``func`` entry. - * Otherwise, keyword arguments to be passed into func. - -Returns -------- -{klass} - -See Also --------- -{klass}GroupBy.apply : Apply function func group-wise - and combine the results together. -{klass}GroupBy.transform : Transforms the Series on each group - based on the given function. -{klass}.aggregate : Aggregate using one or more operations. - -Notes ------ -When using ``engine='numba'``, there will be no "fall back" behavior internally. -The group data and group index will be passed as numpy arrays to the JITed -user defined function, and no alternative execution attempts will be tried. - -Functions that mutate the passed object can produce unexpected -behavior or errors and are not supported. See :ref:`gotchas.udf-mutation` -for more details. - -.. versionchanged:: 1.3.0 - - The resulting dtype will reflect the return value of the passed ``func``, - see the examples below. 
-{examples}""" - @final class GroupByPlot(PandasObject): diff --git a/scripts/validate_unwanted_patterns.py b/scripts/validate_unwanted_patterns.py index 5962709056ae8..076acc359f933 100755 --- a/scripts/validate_unwanted_patterns.py +++ b/scripts/validate_unwanted_patterns.py @@ -29,7 +29,6 @@ "_shared_docs", "_new_Index", "_new_PeriodIndex", - "_agg_template_series", "_pipe_template", "_apply_groupings_depr", "__main__", From 34f546f8e73386659457fec0b3fa1ef5b0c6d569 Mon Sep 17 00:00:00 2001 From: Deepak Saldanha Date: Sun, 29 Sep 2024 23:05:45 +0530 Subject: [PATCH 090/224] DOC: fix docstrings for multiple api.types methods (#59920) fix docstrings for api.types --- ci/code_checks.sh | 5 --- pandas/core/dtypes/inference.py | 63 ++++++++++++++++++++++++++++++--- 2 files changed, 59 insertions(+), 9 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index b65dcedbd8a10..2b3e83d64ab21 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -107,14 +107,9 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Timestamp.resolution PR02" \ -i "pandas.Timestamp.tzinfo GL08" \ -i "pandas.Timestamp.year GL08" \ - -i "pandas.api.types.is_dict_like PR07,SA01" \ - -i "pandas.api.types.is_file_like PR07,SA01" \ -i "pandas.api.types.is_float PR01,SA01" \ - -i "pandas.api.types.is_hashable PR01,RT03,SA01" \ -i "pandas.api.types.is_integer PR01,SA01" \ -i "pandas.api.types.is_iterator PR07,SA01" \ - -i "pandas.api.types.is_named_tuple PR07,SA01" \ - -i "pandas.api.types.is_re PR07,SA01" \ -i "pandas.api.types.is_re_compilable PR07,SA01" \ -i "pandas.api.types.pandas_dtype PR07,RT03,SA01" \ -i "pandas.arrays.ArrowExtensionArray PR07,SA01" \ diff --git a/pandas/core/dtypes/inference.py b/pandas/core/dtypes/inference.py index f042911b53d2b..6adb34ff0f777 100644 --- a/pandas/core/dtypes/inference.py +++ b/pandas/core/dtypes/inference.py @@ -113,13 +113,24 @@ def is_file_like(obj: object) -> bool: Parameters ---------- - obj : The object to check + obj : object + The object to check for file-like properties. + This can be any Python object, and the function will + check if it has attributes typically associated with + file-like objects (e.g., `read`, `write`, `__iter__`). Returns ------- bool Whether `obj` has file-like properties. + See Also + -------- + api.types.is_dict_like : Check if the object is dict-like. + api.types.is_hashable : Return True if hash(obj) will succeed, False otherwise. + api.types.is_named_tuple : Check if the object is a named tuple. + api.types.is_iterator : Check if the object is an iterator. + Examples -------- >>> import io @@ -142,13 +153,24 @@ def is_re(obj: object) -> TypeGuard[Pattern]: Parameters ---------- - obj : The object to check + obj : object + The object to check for being a regex pattern. Typically, + this would be an object that you expect to be a compiled + pattern from the `re` module. Returns ------- bool Whether `obj` is a regex pattern. + See Also + -------- + api.types.is_float : Return True if given object is float. + api.types.is_iterator : Check if the object is an iterator. + api.types.is_integer : Return True if given object is integer. + api.types.is_re_compilable : Check if the object can be compiled + into a regex pattern instance. + Examples -------- >>> from pandas.api.types import is_re @@ -275,13 +297,22 @@ def is_dict_like(obj: object) -> bool: Parameters ---------- - obj : The object to check + obj : object + The object to check. 
This can be any Python object, + and the function will determine whether it + behaves like a dictionary. Returns ------- bool Whether `obj` has dict-like properties. + See Also + -------- + api.types.is_list_like : Check if the object is list-like. + api.types.is_file_like : Check if the object is a file-like. + api.types.is_named_tuple : Check if the object is a named tuple. + Examples -------- >>> from pandas.api.types import is_dict_like @@ -308,13 +339,22 @@ def is_named_tuple(obj: object) -> bool: Parameters ---------- - obj : The object to check + obj : object + The object that will be checked to determine + whether it is a named tuple. Returns ------- bool Whether `obj` is a named tuple. + See Also + -------- + api.types.is_dict_like: Check if the object is dict-like. + api.types.is_hashable: Return True if hash(obj) + will succeed, False otherwise. + api.types.is_categorical_dtype : Check if the dtype is categorical. + Examples -------- >>> from collections import namedtuple @@ -340,9 +380,24 @@ def is_hashable(obj: object) -> TypeGuard[Hashable]: Distinguish between these and other types by trying the call to hash() and seeing if they raise TypeError. + Parameters + ---------- + obj : object + The object to check for hashability. Any Python object can be passed here. + Returns ------- bool + True if object can be hashed (i.e., does not raise TypeError when + passed to hash()), and False otherwise (e.g., if object is mutable + like a list or dictionary). + + See Also + -------- + api.types.is_float : Return True if given object is float. + api.types.is_iterator : Check if the object is an iterator. + api.types.is_list_like : Check if the object is list-like. + api.types.is_dict_like : Check if the object is dict-like. Examples -------- From 5b35c77041a74b53ebd7c330ca5930fa22929726 Mon Sep 17 00:00:00 2001 From: gameofby Date: Mon, 30 Sep 2024 01:36:34 +0800 Subject: [PATCH 091/224] DOC: the table name should be `air_quality_parameters` rather than `air_quality_parameters_name` (#59918) --- .../getting_started/intro_tutorials/08_combine_dataframes.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/getting_started/intro_tutorials/08_combine_dataframes.rst b/doc/source/getting_started/intro_tutorials/08_combine_dataframes.rst index 05729809491b5..024300bb8a9b0 100644 --- a/doc/source/getting_started/intro_tutorials/08_combine_dataframes.rst +++ b/doc/source/getting_started/intro_tutorials/08_combine_dataframes.rst @@ -271,7 +271,7 @@ Add the parameters' full description and name, provided by the parameters metada Compared to the previous example, there is no common column name. However, the ``parameter`` column in the ``air_quality`` table and the -``id`` column in the ``air_quality_parameters_name`` both provide the +``id`` column in the ``air_quality_parameters`` table both provide the measured variable in a common format. The ``left_on`` and ``right_on`` arguments are used here (instead of just ``on``) to make the link between the two tables. 
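As a usage reference for the merge pattern the tutorial fix above describes, here is a
minimal sketch of ``left_on``/``right_on``. The two toy frames are illustrative
stand-ins for the tutorial's ``air_quality`` and ``air_quality_parameters`` tables,
not data shipped with pandas:

.. code-block:: python

    import pandas as pd

    # No column name is shared between the two tables: the measured variable
    # is called "parameter" on the left and "id" on the right.
    air_quality = pd.DataFrame({"parameter": ["no2", "pm25"], "value": [23.0, 6.5]})
    air_quality_parameters = pd.DataFrame(
        {"id": ["no2", "pm25"], "name": ["Nitrogen dioxide", "Particulate matter 2.5"]}
    )

    # left_on/right_on make the link explicit when the key columns differ in name.
    merged = air_quality.merge(
        air_quality_parameters, how="left", left_on="parameter", right_on="id"
    )
    print(merged.columns.tolist())  # ['parameter', 'value', 'id', 'name']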
From 90c26ce7ce04d97fdabb394e604ecee0a558c019 Mon Sep 17 00:00:00 2001 From: Deepak Saldanha Date: Mon, 30 Sep 2024 00:25:17 +0530 Subject: [PATCH 092/224] DOC: Separate out examples - pandas.str.is methods (#59850) --- pandas/core/strings/accessor.py | 193 ++++++++++++++++++++++++++------ 1 file changed, 156 insertions(+), 37 deletions(-) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 6d10365a1b968..10117aa6bf503 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -3443,10 +3443,10 @@ def casefold(self): Series or Index of bool Series or Index of boolean values with the same length as the original Series/Index. - + """ + _shared_docs["isalpha"] = """ See Also -------- - Series.str.isalpha : Check whether all characters are alphabetic. Series.str.isnumeric : Check whether all characters are numeric. Series.str.isalnum : Check whether all characters are alphanumeric. Series.str.isdigit : Check whether all characters are digits. @@ -3458,24 +3458,56 @@ def casefold(self): Examples -------- - **Checks for Alphabetic and Numeric Characters** >>> s1 = pd.Series(['one', 'one1', '1', '']) - >>> s1.str.isalpha() 0 True 1 False 2 False 3 False dtype: bool + """ + _shared_docs["isnumeric"] = """ + See Also + -------- + Series.str.isalpha : Check whether all characters are alphabetic. + Series.str.isalnum : Check whether all characters are alphanumeric. + Series.str.isdigit : Check whether all characters are digits. + Series.str.isdecimal : Check whether all characters are decimal. + Series.str.isspace : Check whether all characters are whitespace. + Series.str.islower : Check whether all characters are lowercase. + Series.str.isupper : Check whether all characters are uppercase. + Series.str.istitle : Check whether all characters are titlecase. + + Examples + -------- + The ``s.str.isnumeric`` method is the same as ``s3.str.isdigit`` but + also includes other characters that can represent quantities such as + unicode fractions. + >>> s1 = pd.Series(['one', 'one1', '1', '']) >>> s1.str.isnumeric() 0 False 1 False 2 True 3 False dtype: bool + """ + _shared_docs["isalnum"] = """ + See Also + -------- + Series.str.isalpha : Check whether all characters are alphabetic. + Series.str.isnumeric : Check whether all characters are numeric. + Series.str.isdigit : Check whether all characters are digits. + Series.str.isdecimal : Check whether all characters are decimal. + Series.str.isspace : Check whether all characters are whitespace. + Series.str.islower : Check whether all characters are lowercase. + Series.str.isupper : Check whether all characters are uppercase. + Series.str.istitle : Check whether all characters are titlecase. + Examples + -------- + >>> s1 = pd.Series(['one', 'one1', '1', '']) >>> s1.str.isalnum() 0 True 1 True @@ -3492,47 +3524,72 @@ def casefold(self): 1 False 2 False dtype: bool + """ + _shared_docs["isdecimal"] = """ + See Also + -------- + Series.str.isalpha : Check whether all characters are alphabetic. + Series.str.isnumeric : Check whether all characters are numeric. + Series.str.isalnum : Check whether all characters are alphanumeric. + Series.str.isdigit : Check whether all characters are digits. + Series.str.isspace : Check whether all characters are whitespace. + Series.str.islower : Check whether all characters are lowercase. + Series.str.isupper : Check whether all characters are uppercase. + Series.str.istitle : Check whether all characters are titlecase. 
- **More Detailed Checks for Numeric Characters** - - There are several different but overlapping sets of numeric characters that - can be checked for. + Examples + -------- + The ``s3.str.isdecimal`` method checks for characters used to form + numbers in base 10. >>> s3 = pd.Series(['23', '³', '⅕', '']) - - The ``s3.str.isdecimal`` method checks for characters used to form numbers - in base 10. - >>> s3.str.isdecimal() 0 True 1 False 2 False 3 False dtype: bool + """ + _shared_docs["isdigit"] = """ + See Also + -------- + Series.str.isalpha : Check whether all characters are alphabetic. + Series.str.isnumeric : Check whether all characters are numeric. + Series.str.isalnum : Check whether all characters are alphanumeric. + Series.str.isdecimal : Check whether all characters are decimal. + Series.str.isspace : Check whether all characters are whitespace. + Series.str.islower : Check whether all characters are lowercase. + Series.str.isupper : Check whether all characters are uppercase. + Series.str.istitle : Check whether all characters are titlecase. - The ``s.str.isdigit`` method is the same as ``s3.str.isdecimal`` but also - includes special digits, like superscripted and subscripted digits in - unicode. + Examples + -------- + Similar to ``str.isdecimal`` but also includes special digits, like + superscripted and subscripted digits in unicode. + >>> s3 = pd.Series(['23', '³', '⅕', '']) >>> s3.str.isdigit() 0 True 1 True 2 False 3 False dtype: bool + """ - The ``s.str.isnumeric`` method is the same as ``s3.str.isdigit`` but also - includes other characters that can represent quantities such as unicode - fractions. - - >>> s3.str.isnumeric() - 0 True - 1 True - 2 True - 3 False - dtype: bool + _shared_docs["isspace"] = """ + See Also + -------- + Series.str.isalpha : Check whether all characters are alphabetic. + Series.str.isnumeric : Check whether all characters are numeric. + Series.str.isalnum : Check whether all characters are alphanumeric. + Series.str.isdigit : Check whether all characters are digits. + Series.str.isdecimal : Check whether all characters are decimal. + Series.str.islower : Check whether all characters are lowercase. + Series.str.isupper : Check whether all characters are uppercase. + Series.str.istitle : Check whether all characters are titlecase. - **Checks for Whitespace** + Examples + -------- >>> s4 = pd.Series([' ', '\\t\\r\\n ', '']) >>> s4.str.isspace() @@ -3540,30 +3597,74 @@ def casefold(self): 1 True 2 False dtype: bool + """ + _shared_docs["islower"] = """ + See Also + -------- + Series.str.isalpha : Check whether all characters are alphabetic. + Series.str.isnumeric : Check whether all characters are numeric. + Series.str.isalnum : Check whether all characters are alphanumeric. + Series.str.isdigit : Check whether all characters are digits. + Series.str.isdecimal : Check whether all characters are decimal. + Series.str.isspace : Check whether all characters are whitespace. + Series.str.isupper : Check whether all characters are uppercase. + Series.str.istitle : Check whether all characters are titlecase. - **Checks for Character Case** + Examples + -------- >>> s5 = pd.Series(['leopard', 'Golden Eagle', 'SNAKE', '']) - >>> s5.str.islower() 0 True 1 False 2 False 3 False dtype: bool + """ + + _shared_docs["isupper"] = """ + See Also + -------- + Series.str.isalpha : Check whether all characters are alphabetic. + Series.str.isnumeric : Check whether all characters are numeric. + Series.str.isalnum : Check whether all characters are alphanumeric. 
+ Series.str.isdigit : Check whether all characters are digits. + Series.str.isdecimal : Check whether all characters are decimal. + Series.str.isspace : Check whether all characters are whitespace. + Series.str.islower : Check whether all characters are lowercase. + Series.str.istitle : Check whether all characters are titlecase. + Examples + -------- + + >>> s5 = pd.Series(['leopard', 'Golden Eagle', 'SNAKE', '']) >>> s5.str.isupper() 0 False 1 False 2 True 3 False dtype: bool + """ + _shared_docs["istitle"] = """ + See Also + -------- + Series.str.isalpha : Check whether all characters are alphabetic. + Series.str.isnumeric : Check whether all characters are numeric. + Series.str.isalnum : Check whether all characters are alphanumeric. + Series.str.isdigit : Check whether all characters are digits. + Series.str.isdecimal : Check whether all characters are decimal. + Series.str.isspace : Check whether all characters are whitespace. + Series.str.islower : Check whether all characters are lowercase. + Series.str.isupper : Check whether all characters are uppercase. + Examples + ------------ The ``s5.str.istitle`` method checks for whether all words are in title case (whether only the first letter of each word is capitalized). Words are assumed to be as any sequence of non-numeric characters separated by whitespace characters. + >>> s5 = pd.Series(['leopard', 'Golden Eagle', 'SNAKE', '']) >>> s5.str.istitle() 0 False 1 True @@ -3583,31 +3684,49 @@ def casefold(self): # force _noarg_wrapper return type with dtype=np.dtype(bool) (GH 29624) isalnum = _map_and_wrap( - "isalnum", docstring=_shared_docs["ismethods"] % _doc_args["isalnum"] + "isalnum", + docstring=_shared_docs["ismethods"] % _doc_args["isalnum"] + + _shared_docs["isalnum"], ) isalpha = _map_and_wrap( - "isalpha", docstring=_shared_docs["ismethods"] % _doc_args["isalpha"] + "isalpha", + docstring=_shared_docs["ismethods"] % _doc_args["isalpha"] + + _shared_docs["isalpha"], ) isdigit = _map_and_wrap( - "isdigit", docstring=_shared_docs["ismethods"] % _doc_args["isdigit"] + "isdigit", + docstring=_shared_docs["ismethods"] % _doc_args["isdigit"] + + _shared_docs["isdigit"], ) isspace = _map_and_wrap( - "isspace", docstring=_shared_docs["ismethods"] % _doc_args["isspace"] + "isspace", + docstring=_shared_docs["ismethods"] % _doc_args["isspace"] + + _shared_docs["isspace"], ) islower = _map_and_wrap( - "islower", docstring=_shared_docs["ismethods"] % _doc_args["islower"] + "islower", + docstring=_shared_docs["ismethods"] % _doc_args["islower"] + + _shared_docs["islower"], ) isupper = _map_and_wrap( - "isupper", docstring=_shared_docs["ismethods"] % _doc_args["isupper"] + "isupper", + docstring=_shared_docs["ismethods"] % _doc_args["isupper"] + + _shared_docs["isupper"], ) istitle = _map_and_wrap( - "istitle", docstring=_shared_docs["ismethods"] % _doc_args["istitle"] + "istitle", + docstring=_shared_docs["ismethods"] % _doc_args["istitle"] + + _shared_docs["istitle"], ) isnumeric = _map_and_wrap( - "isnumeric", docstring=_shared_docs["ismethods"] % _doc_args["isnumeric"] + "isnumeric", + docstring=_shared_docs["ismethods"] % _doc_args["isnumeric"] + + _shared_docs["isnumeric"], ) isdecimal = _map_and_wrap( - "isdecimal", docstring=_shared_docs["ismethods"] % _doc_args["isdecimal"] + "isdecimal", + docstring=_shared_docs["ismethods"] % _doc_args["isdecimal"] + + _shared_docs["isdecimal"], ) From d66d5823607ecf4c6d1f8eac9ae679863218f2ba Mon Sep 17 00:00:00 2001 From: Deepak Saldanha Date: Mon, 30 Sep 2024 21:29:09 +0530 Subject: [PATCH 
093/224] DOC: fix pandas.TimedeltaIndex.to_pytimedelta RT03,SA01 (#59914)

* update docstrings

* Update pandas/core/arrays/timedeltas.py

Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>

---------

Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
---
 ci/code_checks.sh                |  1 -
 pandas/core/arrays/timedeltas.py | 21 +++++++++++++++++++++
 2 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/ci/code_checks.sh b/ci/code_checks.sh
index 2b3e83d64ab21..fa23adca6d61e 100755
--- a/ci/code_checks.sh
+++ b/ci/code_checks.sh
@@ -100,7 +100,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         -i "pandas.Timedelta.max PR02" \
         -i "pandas.Timedelta.min PR02" \
         -i "pandas.Timedelta.resolution PR02" \
-        -i "pandas.TimedeltaIndex.to_pytimedelta RT03,SA01" \
         -i "pandas.Timestamp.max PR02" \
         -i "pandas.Timestamp.min PR02" \
         -i "pandas.Timestamp.nanosecond GL08" \

diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py
index 754ae277e359a..a8a0037d0bbb9 100644
--- a/pandas/core/arrays/timedeltas.py
+++ b/pandas/core/arrays/timedeltas.py
@@ -790,6 +790,19 @@ def to_pytimedelta(self) -> npt.NDArray[np.object_]:

         Returns
         -------
         numpy.ndarray
+            A NumPy array of ``datetime.timedelta`` objects, each representing
+            the same duration as the corresponding element of the original
+            ``TimedeltaIndex``. Because ``datetime.timedelta`` stores durations
+            with at most microsecond resolution, any sub-microsecond component
+            of the original nanosecond-resolution data is lost.
+
+        See Also
+        --------
+        to_timedelta : Convert argument to timedelta format.
+        Timedelta : Represents a duration between two dates or times.
+        DatetimeIndex : Index of datetime64 data.
+        Timedelta.components : Return a components namedtuple-like
+            of a single timedelta.
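Because the values come back as ``datetime.timedelta`` objects, which store at most microseconds, nanosecond detail does not survive the conversion. A small sketch of that caveat (expected output on a current pandas build):

    >>> import pandas as pd
    >>> tdi = pd.to_timedelta(["1 days 00:00:00.000000001"])
    >>> tdi.to_pytimedelta()
    array([datetime.timedelta(days=1)], dtype=object)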
Examples -------- @@ -800,6 +813,14 @@ def to_pytimedelta(self) -> npt.NDArray[np.object_]: >>> tdelta_idx.to_pytimedelta() array([datetime.timedelta(days=1), datetime.timedelta(days=2), datetime.timedelta(days=3)], dtype=object) + + >>> tidx = pd.TimedeltaIndex(data=["1 days 02:30:45", "3 days 04:15:10"]) + >>> tidx + TimedeltaIndex(['1 days 02:30:45', '3 days 04:15:10'], + dtype='timedelta64[ns]', freq=None) + >>> tidx.to_pytimedelta() + array([datetime.timedelta(days=1, seconds=9045), + datetime.timedelta(days=3, seconds=15310)], dtype=object) """ return ints_to_pytimedelta(self._ndarray) From 111ff84bb958cc7b13a060d9539f83b67ced8f02 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Mon, 30 Sep 2024 21:29:49 +0530 Subject: [PATCH 094/224] DOC: fix SA01, ES01 for pandas.errors.ClosedFileError (#59924) --- ci/code_checks.sh | 1 - pandas/errors/__init__.py | 10 ++++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index fa23adca6d61e..42eedfe8e223b 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -159,7 +159,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.errors.CSSWarning SA01" \ -i "pandas.errors.CategoricalConversionWarning SA01" \ -i "pandas.errors.ChainedAssignmentError SA01" \ - -i "pandas.errors.ClosedFileError SA01" \ -i "pandas.errors.DataError SA01" \ -i "pandas.errors.DuplicateLabelError SA01" \ -i "pandas.errors.IntCastingNaNError SA01" \ diff --git a/pandas/errors/__init__.py b/pandas/errors/__init__.py index b9ceae341afd3..46e090cc3a589 100644 --- a/pandas/errors/__init__.py +++ b/pandas/errors/__init__.py @@ -615,6 +615,16 @@ class ClosedFileError(Exception): """ Exception is raised when trying to perform an operation on a closed HDFStore file. + ``ClosedFileError`` is specific to operations on ``HDFStore`` objects. Once an + HDFStore is closed, its resources are no longer available, and any further attempt + to access data or perform file operations will raise this exception. + + See Also + -------- + HDFStore.close : Closes the PyTables file handle. + HDFStore.open : Opens the file in the specified mode. + HDFStore.is_open : Returns a boolean indicating whether the file is open. + Examples -------- >>> store = pd.HDFStore("my-store", "a") # doctest: +SKIP From 1baec153e72f98e7184e972f1e937626703e42a6 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Mon, 30 Sep 2024 21:30:32 +0530 Subject: [PATCH 095/224] DOC: fix SA01, ES01 for pandas.errors.OutOfBoundsDatetime (#59925) --- ci/code_checks.sh | 1 - pandas/_libs/tslibs/np_datetime.pyx | 9 +++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 42eedfe8e223b..4a1a0042405e3 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -168,7 +168,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.errors.NumExprClobberingError SA01" \ -i "pandas.errors.NumbaUtilError SA01" \ -i "pandas.errors.OptionError SA01" \ - -i "pandas.errors.OutOfBoundsDatetime SA01" \ -i "pandas.errors.OutOfBoundsTimedelta SA01" \ -i "pandas.errors.PerformanceWarning SA01" \ -i "pandas.errors.PossibleDataLossError SA01" \ diff --git a/pandas/_libs/tslibs/np_datetime.pyx b/pandas/_libs/tslibs/np_datetime.pyx index 0b02fc13246f0..193556b2697a9 100644 --- a/pandas/_libs/tslibs/np_datetime.pyx +++ b/pandas/_libs/tslibs/np_datetime.pyx @@ -176,6 +176,15 @@ class OutOfBoundsDatetime(ValueError): """ Raised when the datetime is outside the range that can be represented. 
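For the default nanosecond resolution, the bounds behind "outside the range that can be represented" can be inspected directly; a quick reference check:

    >>> import pandas as pd
    >>> pd.Timestamp.min
    Timestamp('1677-09-21 00:12:43.145224193')
    >>> pd.Timestamp.max
    Timestamp('2262-04-11 23:47:16.854775807')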
+ This error occurs when attempting to convert or parse a datetime value + that exceeds the bounds supported by pandas' internal datetime + representation. + + See Also + -------- + to_datetime : Convert argument to datetime. + Timestamp : Pandas replacement for python ``datetime.datetime`` object. + Examples -------- >>> pd.to_datetime("08335394550") From 74d36ac1c1fe7e735f5b7392cb9dd1bff57f729b Mon Sep 17 00:00:00 2001 From: Florian Bourgey Date: Mon, 30 Sep 2024 12:02:41 -0400 Subject: [PATCH 096/224] Fix docstring Timedelta.to_timedelta64 SA01, Timedelta.total_seconds SA01, Timedelta.view SA01 (#59719) * Add 'See Also' section for Timedelta.to_timedelta64 * Fix SA01 for Timedelta.total_seconds() * Fix SA01 for Timedelta.view * Add space * Fix test_nat_doc_strings * Revert "Fix test_nat_doc_strings" This reverts commit 9d0965805daa2dbd02eaa1878858cfb0eb97df02. * Match doc of total_seconds method in nattype.pyx --- pandas/_libs/tslibs/nattype.pyx | 2 ++ pandas/_libs/tslibs/timedeltas.pyx | 3 +++ 2 files changed, 5 insertions(+) diff --git a/pandas/_libs/tslibs/nattype.pyx b/pandas/_libs/tslibs/nattype.pyx index 620e0846c750e..1c0a99eb1ea25 100644 --- a/pandas/_libs/tslibs/nattype.pyx +++ b/pandas/_libs/tslibs/nattype.pyx @@ -500,6 +500,8 @@ class NaTType(_NaT): -------- to_timedelta : Convert argument to timedelta. Timedelta : Represents a duration, the difference between two dates or times. + Timedelta.seconds : Returns the seconds component of the timedelta. + Timedelta.microseconds : Returns the microseconds component of the timedelta. Examples -------- diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index 84ca48c96459f..bbefea7c47fc3 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -1196,6 +1196,8 @@ cdef class _Timedelta(timedelta): -------- to_timedelta : Convert argument to timedelta. Timedelta : Represents a duration, the difference between two dates or times. + Timedelta.seconds : Returns the seconds component of the timedelta. + Timedelta.microseconds : Returns the microseconds component of the timedelta. Examples -------- @@ -1493,6 +1495,7 @@ cdef class _Timedelta(timedelta): See Also -------- + Timedelta.asm8 : Return a numpy timedelta64 array scalar view. numpy.ndarray.view : Returns a view of an array with the same data. Timedelta.to_numpy : Converts the Timedelta to a NumPy timedelta64. Timedelta.total_seconds : Returns the total duration of the Timedelta From 00855f81bd84cc6ed9ae42c5f66916b2208dbe04 Mon Sep 17 00:00:00 2001 From: Qaiser Abbasi <3501767+bbq2100@users.noreply.github.com> Date: Mon, 30 Sep 2024 18:05:22 +0200 Subject: [PATCH 097/224] Fix typo in 10min.rst (#59921) --- doc/source/user_guide/10min.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/user_guide/10min.rst b/doc/source/user_guide/10min.rst index 887ffd5580a52..72bb93d21a99f 100644 --- a/doc/source/user_guide/10min.rst +++ b/doc/source/user_guide/10min.rst @@ -177,7 +177,7 @@ See the indexing documentation :ref:`Indexing and Selecting Data ` and Getitem (``[]``) ~~~~~~~~~~~~~~~~ -For a :class:`DataFrame`, passing a single label selects a columns and +For a :class:`DataFrame`, passing a single label selects a column and yields a :class:`Series` equivalent to ``df.A``: .. 
ipython:: python From cf480366a6bd9979124b91dd894230cbb510ca4d Mon Sep 17 00:00:00 2001 From: Marc Mueller <30130371+cdce8p@users.noreply.github.com> Date: Mon, 30 Sep 2024 22:47:40 +0200 Subject: [PATCH 098/224] BLD: Fix armv7 build (#59906) --- pandas/_libs/src/vendored/ujson/python/JSONtoObj.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/src/vendored/ujson/python/JSONtoObj.c b/pandas/_libs/src/vendored/ujson/python/JSONtoObj.c index 7cc20a52f1849..4cfead8ac77a5 100644 --- a/pandas/_libs/src/vendored/ujson/python/JSONtoObj.c +++ b/pandas/_libs/src/vendored/ujson/python/JSONtoObj.c @@ -38,9 +38,11 @@ Numeric decoder derived from TCL library // Licence at LICENSES/ULTRAJSON_LICENSE -#include "pandas/vendored/ujson/lib/ultrajson.h" +// clang-format off #define PY_SSIZE_T_CLEAN #include +#include "pandas/vendored/ujson/lib/ultrajson.h" +// clang-format on static int Object_objectAddKey(void *Py_UNUSED(prv), JSOBJ obj, JSOBJ name, JSOBJ value) { From e78ebd3f845c086af1d71c0604701ec49df97228 Mon Sep 17 00:00:00 2001 From: Florian Bourgey Date: Mon, 30 Sep 2024 17:50:16 -0400 Subject: [PATCH 099/224] DOC: Fix intro to datastructures Series constructor behavior (#59793) --- doc/source/user_guide/dsintro.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/doc/source/user_guide/dsintro.rst b/doc/source/user_guide/dsintro.rst index 9757a72f13fa8..b9c285ca30c96 100644 --- a/doc/source/user_guide/dsintro.rst +++ b/doc/source/user_guide/dsintro.rst @@ -87,8 +87,9 @@ index will be pulled out. **From scalar value** -If ``data`` is a scalar value, an index must be -provided. The value will be repeated to match the length of **index**. +If ``data`` is a scalar value, the value will be repeated to match +the length of **index**. If the **index** is not provided, it defaults +to ``RangeIndex(1)``. .. ipython:: python From f598670353311a6fff4e6e1e96074ccf0737e6b7 Mon Sep 17 00:00:00 2001 From: Petroncini <59212480+Petroncini@users.noreply.github.com> Date: Tue, 1 Oct 2024 17:33:42 -0300 Subject: [PATCH 100/224] BUG: groupby().any() returns true for groups with timedelta all NaT (#59782) --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/groupby/ops.py | 8 +++++--- pandas/tests/groupby/test_grouping.py | 12 ++++++++++++ 3 files changed, 18 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 41ba80989a0ce..6ebb51cd3ef89 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -652,6 +652,7 @@ Plotting Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ - Bug in :meth:`.DataFrameGroupBy.__len__` and :meth:`.SeriesGroupBy.__len__` would raise when the grouping contained NA values and ``dropna=False`` (:issue:`58644`) +- Bug in :meth:`.DataFrameGroupBy.any` that returned True for groups where all Timedelta values are NaT. (:issue:`59712`) - Bug in :meth:`.DataFrameGroupBy.groups` and :meth:`.SeriesGroupby.groups` that would not respect groupby argument ``dropna`` (:issue:`55919`) - Bug in :meth:`.DataFrameGroupBy.median` where nat values gave an incorrect result. 
(:issue:`57926`) - Bug in :meth:`.DataFrameGroupBy.quantile` when ``interpolation="nearest"`` is inconsistent with :meth:`DataFrame.quantile` (:issue:`47942`) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index da80969b613cd..0e99178642715 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -371,6 +371,10 @@ def _call_cython_op( is_datetimelike = dtype.kind in "mM" + if self.how in ["any", "all"]: + if mask is None: + mask = isna(values) + if is_datetimelike: values = values.view("int64") is_numeric = True @@ -380,12 +384,10 @@ def _call_cython_op( values = values.astype(np.float32) if self.how in ["any", "all"]: - if mask is None: - mask = isna(values) if dtype == object: if kwargs["skipna"]: # GH#37501: don't raise on pd.NA when skipna=True - if mask.any(): + if mask is not None and mask.any(): # mask on original values computed separately values = values.copy() values[mask] = True diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index fc2a8a970010a..6bb2eaf89b5d7 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -1180,3 +1180,15 @@ def test_grouping_by_key_is_in_axis(): result = gb.sum() expected = DataFrame({"a": [1, 2], "b": [1, 2], "c": [7, 5]}) tm.assert_frame_equal(result, expected) + + +def test_groupby_any_with_timedelta(): + # GH#59712 + df = DataFrame({"value": [pd.Timedelta(1), pd.NaT]}) + + result = df.groupby(np.array([0, 1], dtype=np.int64))["value"].any() + + expected = Series({0: True, 1: False}, name="value", dtype=bool) + expected.index = expected.index.astype(np.int64) + + tm.assert_series_equal(result, expected) From f738d9754ff3eb9b92fef9f294e4bd3699191903 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 2 Oct 2024 12:57:25 +0200 Subject: [PATCH 101/224] CI: Run jobs on 2.3.x branch (#59939) --- .github/workflows/code-checks.yml | 4 ++-- .github/workflows/docbuild-and-upload.yml | 4 ++-- .github/workflows/package-checks.yml | 4 ++-- .github/workflows/unit-tests.yml | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml index 7e9c056e75131..e1d2d1ea846b8 100644 --- a/.github/workflows/code-checks.yml +++ b/.github/workflows/code-checks.yml @@ -4,11 +4,11 @@ on: push: branches: - main - - 2.2.x + - 2.3.x pull_request: branches: - main - - 2.2.x + - 2.3.x env: ENV_FILE: environment.yml diff --git a/.github/workflows/docbuild-and-upload.yml b/.github/workflows/docbuild-and-upload.yml index 47b97fa57852a..908baa87815ab 100644 --- a/.github/workflows/docbuild-and-upload.yml +++ b/.github/workflows/docbuild-and-upload.yml @@ -4,13 +4,13 @@ on: push: branches: - main - - 2.2.x + - 2.3.x tags: - '*' pull_request: branches: - main - - 2.2.x + - 2.3.x env: ENV_FILE: environment.yml diff --git a/.github/workflows/package-checks.yml b/.github/workflows/package-checks.yml index 97f90c1588962..6748832903e30 100644 --- a/.github/workflows/package-checks.yml +++ b/.github/workflows/package-checks.yml @@ -4,11 +4,11 @@ on: push: branches: - main - - 2.2.x + - 2.3.x pull_request: branches: - main - - 2.2.x + - 2.3.x types: [ labeled, opened, synchronize, reopened ] permissions: diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index d145836f3e596..60b234d613a38 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -4,11 +4,11 @@ on: push: branches: - main - - 2.2.x + - 2.3.x pull_request: 
     branches:
       - main
-      - 2.2.x
+      - 2.3.x
     paths-ignore:
       - "doc/**"
       - "web/**"

From fd823d22578b684b6070d956def006230e3f6bb3 Mon Sep 17 00:00:00 2001
From: Marc Mueller <30130371+cdce8p@users.noreply.github.com>
Date: Wed, 2 Oct 2024 15:25:48 +0200
Subject: [PATCH 102/224] Include Python.h first (#59929)

---
 pandas/_libs/include/pandas/datetime/date_conversions.h | 1 +
 pandas/_libs/include/pandas/parser/io.h                  | 3 ++-
 pandas/_libs/include/pandas/parser/pd_parser.h           | 3 ++-
 pandas/_libs/include/pandas/vendored/klib/khash_python.h | 1 +
 pandas/_libs/src/vendored/ujson/python/JSONtoObj.c       | 3 +--
 pandas/_libs/src/vendored/ujson/python/ujson.c           | 1 +
 6 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/pandas/_libs/include/pandas/datetime/date_conversions.h b/pandas/_libs/include/pandas/datetime/date_conversions.h
index e039991847a62..043805a8b25f4 100644
--- a/pandas/_libs/include/pandas/datetime/date_conversions.h
+++ b/pandas/_libs/include/pandas/datetime/date_conversions.h
@@ -9,6 +9,7 @@ The full license is in the LICENSE file, distributed with this software.
 
 #define PY_SSIZE_T_CLEAN
 #include <Python.h>
+
 #include <numpy/ndarraytypes.h>
 
 // Scales value inplace from nanosecond resolution to unit resolution
diff --git a/pandas/_libs/include/pandas/parser/io.h b/pandas/_libs/include/pandas/parser/io.h
index c707c23b567d2..41f1bb9312724 100644
--- a/pandas/_libs/include/pandas/parser/io.h
+++ b/pandas/_libs/include/pandas/parser/io.h
@@ -10,9 +10,10 @@ The full license is in the LICENSE file, distributed with this software.
 #pragma once
 
 #define PY_SSIZE_T_CLEAN
-#include "tokenizer.h"
 #include <Python.h>
 
+#include "tokenizer.h"
+
 #define FS(source) ((file_source *)source)
 
 typedef struct _rd_source {
diff --git a/pandas/_libs/include/pandas/parser/pd_parser.h b/pandas/_libs/include/pandas/parser/pd_parser.h
index 58a09ae1bba39..543839b5d75bf 100644
--- a/pandas/_libs/include/pandas/parser/pd_parser.h
+++ b/pandas/_libs/include/pandas/parser/pd_parser.h
@@ -13,9 +13,10 @@ extern "C" {
 #endif
 
 #define PY_SSIZE_T_CLEAN
-#include "pandas/parser/tokenizer.h"
 #include <Python.h>
 
+#include "pandas/parser/tokenizer.h"
+
 typedef struct {
   int (*to_double)(char *, double *, char, char, int *);
   int (*floatify)(PyObject *, double *, int *);
diff --git a/pandas/_libs/include/pandas/vendored/klib/khash_python.h b/pandas/_libs/include/pandas/vendored/klib/khash_python.h
index 2fa61642968cf..9706a8211b61f 100644
--- a/pandas/_libs/include/pandas/vendored/klib/khash_python.h
+++ b/pandas/_libs/include/pandas/vendored/klib/khash_python.h
@@ -3,6 +3,7 @@
 #pragma once
 
 #include <Python.h>
+
 #include <string.h>
 #include <stdint.h>
diff --git a/pandas/_libs/src/vendored/ujson/python/JSONtoObj.c b/pandas/_libs/src/vendored/ujson/python/JSONtoObj.c
index 4cfead8ac77a5..ef6f1104a1fb9 100644
--- a/pandas/_libs/src/vendored/ujson/python/JSONtoObj.c
+++ b/pandas/_libs/src/vendored/ujson/python/JSONtoObj.c
@@ -38,11 +38,10 @@ Numeric decoder derived from TCL library
 
 // Licence at LICENSES/ULTRAJSON_LICENSE
 
-// clang-format off
 #define PY_SSIZE_T_CLEAN
 #include <Python.h>
+
 #include "pandas/vendored/ujson/lib/ultrajson.h"
-// clang-format on
 
 static int Object_objectAddKey(void *Py_UNUSED(prv), JSOBJ obj, JSOBJ name,
                                JSOBJ value) {
diff --git a/pandas/_libs/src/vendored/ujson/python/ujson.c b/pandas/_libs/src/vendored/ujson/python/ujson.c
index f369d122a3dbe..2ee084b9304f4 100644
--- a/pandas/_libs/src/vendored/ujson/python/ujson.c
+++ b/pandas/_libs/src/vendored/ujson/python/ujson.c
@@ -40,6 +40,7 @@ Numeric decoder derived from TCL library
 
 #define PY_SSIZE_T_CLEAN
 #include <Python.h>
+
 #define PY_ARRAY_UNIQUE_SYMBOL UJSON_NUMPY
#include "numpy/arrayobject.h" From ba7e83da18ac8bfc4f0a521855c0b2ad05ccbbd4 Mon Sep 17 00:00:00 2001 From: FuzzyParrabellum <58094668+FuzzyParrabellum@users.noreply.github.com> Date: Wed, 2 Oct 2024 21:38:49 +0200 Subject: [PATCH 103/224] DOC: Fix docstring of pandas.Series.compare list indent formatting (#59911) Co-authored-by: rdzantoine.pro@gmail.com Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/core/shared_docs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/shared_docs.py b/pandas/core/shared_docs.py index cb0c3d241534c..81fa508ae6d23 100644 --- a/pandas/core/shared_docs.py +++ b/pandas/core/shared_docs.py @@ -65,9 +65,9 @@ Determine which axis to align the comparison on. * 0, or 'index' : Resulting differences are stacked vertically - with rows drawn alternately from self and other. + with rows drawn alternately from self and other. * 1, or 'columns' : Resulting differences are aligned horizontally - with columns drawn alternately from self and other. + with columns drawn alternately from self and other. keep_shape : bool, default False If true, all rows and columns are kept. From 198ed865420c2a206dc062a32be47c7cc5e76bc0 Mon Sep 17 00:00:00 2001 From: Xiao Yuan Date: Fri, 4 Oct 2024 00:08:49 +0800 Subject: [PATCH 104/224] BUG: pd.eval with engine="numexpr" fails with float division (#59907) * BUG: pd.eval with engine="numexpr" fails with float division * Add skip * Add whatsnew * update --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/computation/align.py | 2 +- pandas/tests/computation/test_eval.py | 8 ++++++++ 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 6ebb51cd3ef89..346e2b9e7997e 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -698,6 +698,7 @@ Other - Bug in :class:`DataFrame` when passing a ``dict`` with a NA scalar and ``columns`` that would always return ``np.nan`` (:issue:`57205`) - Bug in :func:`eval` on :class:`ExtensionArray` on including division ``/`` failed with a ``TypeError``. (:issue:`58748`) - Bug in :func:`eval` where the names of the :class:`Series` were not preserved when using ``engine="numexpr"``. (:issue:`10239`) +- Bug in :func:`eval` with ``engine="numexpr"`` returning unexpected result for float division. (:issue:`59736`) - Bug in :func:`unique` on :class:`Index` not always returning :class:`Index` (:issue:`57043`) - Bug in :meth:`DataFrame.apply` where passing ``engine="numba"`` ignored ``args`` passed to the applied function (:issue:`58712`) - Bug in :meth:`DataFrame.eval` and :meth:`DataFrame.query` which caused an exception when using NumPy attributes via ``@`` notation, e.g., ``df.eval("@np.floor(a)")``. (:issue:`58041`) diff --git a/pandas/core/computation/align.py b/pandas/core/computation/align.py index 7de4d8cdf99e1..6158c4f4d0539 100644 --- a/pandas/core/computation/align.py +++ b/pandas/core/computation/align.py @@ -213,7 +213,7 @@ def reconstruct_object(typ, obj, axes, dtype, name): if hasattr(res_t, "type") and typ == np.bool_ and res_t != np.bool_: ret_value = res_t.type(obj) else: - ret_value = typ(obj).astype(res_t) + ret_value = res_t.type(obj) # The condition is to distinguish 0-dim array (returned in case of # scalar) and 1 element array # e.g. 
np.array(0) and np.array([0]) diff --git a/pandas/tests/computation/test_eval.py b/pandas/tests/computation/test_eval.py index 31d568d7c1e0c..3c0bf6c35866c 100644 --- a/pandas/tests/computation/test_eval.py +++ b/pandas/tests/computation/test_eval.py @@ -1998,3 +1998,11 @@ def test_validate_bool_args(value): msg = 'For argument "inplace" expected type bool, received type' with pytest.raises(ValueError, match=msg): pd.eval("2+2", inplace=value) + + +@td.skip_if_no("numexpr") +def test_eval_float_div_numexpr(): + # GH 59736 + result = pd.eval("1 / 2", engine="numexpr") + expected = 0.5 + assert result == expected From c47296ad3b9908f77fba5830ec9dbb7f546cb720 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 3 Oct 2024 06:09:36 -1000 Subject: [PATCH 105/224] CLN: indexes/base.py (#59928) CLN: indexes.base.py --- pandas/core/indexes/base.py | 22 ++++++---------------- 1 file changed, 6 insertions(+), 16 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 852049804a4f5..749a5fea4d513 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4153,7 +4153,8 @@ def reindex( preserve_names = not hasattr(target, "name") # GH7774: preserve dtype/tz if target is empty and not an Index. - target = ensure_has_len(target) # target may be an iterator + if is_iterator(target): + target = list(target) if not isinstance(target, Index) and len(target) == 0: if level is not None and self._is_multi: @@ -7568,21 +7569,9 @@ def ensure_index(index_like: Axes, copy: bool = False) -> Index: return Index(index_like, copy=copy) -def ensure_has_len(seq): - """ - If seq is an iterator, put its values into a list. - """ - try: - len(seq) - except TypeError: - return list(seq) - else: - return seq - - def trim_front(strings: list[str]) -> list[str]: """ - Trims zeros and decimal points. + Trims leading spaces evenly among all strings. Examples -------- @@ -7594,8 +7583,9 @@ def trim_front(strings: list[str]) -> list[str]: """ if not strings: return strings - while all(strings) and all(x[0] == " " for x in strings): - strings = [x[1:] for x in strings] + smallest_leading_space = min(len(x) - len(x.lstrip()) for x in strings) + if smallest_leading_space > 0: + strings = [x[smallest_leading_space:] for x in strings] return strings From 139def2145b83d40364235c6297e1833eab7bb05 Mon Sep 17 00:00:00 2001 From: Deepak Saldanha Date: Fri, 4 Oct 2024 01:39:46 +0530 Subject: [PATCH 106/224] BUG: fix html float display (#59930) * fix html display float/strings * add test under io, update whatsnew * fix linting * changes to fix floats only * Revert "fix linting" This reverts commit 1061442e0a1cf8f745b0863762f2aa023d388336. * test script for float format * remove nbsp implementation, keep floats * Trigger CI * implement changes post review * lint check * update test_formats.py * rfc test_format.py * update test cases --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/frame.py | 3 ++- pandas/tests/io/formats/test_format.py | 17 +++++++++++++++++ 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 346e2b9e7997e..a5b4560a47bc4 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -620,6 +620,7 @@ I/O ^^^ - Bug in :class:`DataFrame` and :class:`Series` ``repr`` of :py:class:`collections.abc.Mapping`` elements. 
(:issue:`57915`) - Bug in :meth:`.DataFrame.to_json` when ``"index"`` was a value in the :attr:`DataFrame.column` and :attr:`Index.name` was ``None``. Now, this will fail with a ``ValueError`` (:issue:`58925`) +- Bug in :meth:`DataFrame._repr_html_` which ignored the ``"display.float_format"`` option (:issue:`59876`) - Bug in :meth:`DataFrame.from_records` where ``columns`` parameter with numpy structured array was not reordering and filtering out the columns (:issue:`59717`) - Bug in :meth:`DataFrame.to_dict` raises unnecessary ``UserWarning`` when columns are not unique and ``orient='tight'``. (:issue:`58281`) - Bug in :meth:`DataFrame.to_excel` when writing empty :class:`DataFrame` with :class:`MultiIndex` on both axes (:issue:`57696`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 4c56948a48eb2..f184aab4070d7 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1192,6 +1192,7 @@ def _repr_html_(self) -> str | None: min_rows = get_option("display.min_rows") max_cols = get_option("display.max_columns") show_dimensions = get_option("display.show_dimensions") + show_floats = get_option("display.float_format") formatter = fmt.DataFrameFormatter( self, @@ -1199,7 +1200,7 @@ def _repr_html_(self) -> str | None: col_space=None, na_rep="NaN", formatters=None, - float_format=None, + float_format=show_floats, sparsify=None, justify=None, index_names=True, diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index af7b04d66096a..82cc3a838ca68 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -368,6 +368,23 @@ def test_repr_min_rows(self): assert ".." not in repr(df) assert ".." not in df._repr_html_() + @pytest.mark.parametrize( + "data, format_option, expected_values", + [ + (12345.6789, "{:12.3f}", "12345.679"), + (None, "{:.3f}", "None"), + ("", "{:.2f}", ""), + (112345.6789, "{:6.3f}", "112345.679"), + ], + ) + def test_repr_float_formatting_html_output( + self, data, format_option, expected_values + ): + with option_context("display.float_format", format_option.format): + df = DataFrame({"A": [data]}) + html_output = df._repr_html_() + assert expected_values in html_output + def test_str_max_colwidth(self): # GH 7856 df = DataFrame( From 4ad6c7a287009f727a8b627b091ba19ba06d9342 Mon Sep 17 00:00:00 2001 From: Deepak Saldanha Date: Sat, 5 Oct 2024 01:15:29 +0530 Subject: [PATCH 107/224] BUG: fix nbsp for html formatting (#59964) * nbsp for strings * update changes post review --- pandas/io/formats/html.py | 2 ++ pandas/tests/io/formats/test_format.py | 19 ++++++++++++++++++- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/pandas/io/formats/html.py b/pandas/io/formats/html.py index adaeed017d7bf..fdea1831d5596 100644 --- a/pandas/io/formats/html.py +++ b/pandas/io/formats/html.py @@ -195,6 +195,8 @@ def _write_cell( esc = {} rs = pprint_thing(s, escape_chars=esc).strip() + # replace spaces betweens strings with non-breaking spaces + rs = rs.replace(" ", "  ") if self.render_links and is_url(rs): rs_unescaped = pprint_thing(s, escape_chars={}).strip() diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index 82cc3a838ca68..0dc16e1ebc723 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -375,12 +375,29 @@ def test_repr_min_rows(self): (None, "{:.3f}", "None"), ("", "{:.2f}", ""), (112345.6789, "{:6.3f}", "112345.679"), + ("foo foo", None, "foo      foo"), + (" foo", None, 
"foo"), + ( + "foo foo foo", + None, + "foo foo       foo", + ), # odd no.of spaces + ( + "foo foo foo", + None, + "foo foo    foo", + ), # even no.of spaces ], ) def test_repr_float_formatting_html_output( self, data, format_option, expected_values ): - with option_context("display.float_format", format_option.format): + if format_option is not None: + with option_context("display.float_format", format_option.format): + df = DataFrame({"A": [data]}) + html_output = df._repr_html_() + assert expected_values in html_output + else: df = DataFrame({"A": [data]}) html_output = df._repr_html_() assert expected_values in html_output From 58de332785ecac78dbea2d19b5a25253eecf78a0 Mon Sep 17 00:00:00 2001 From: Deepak Saldanha Date: Sat, 5 Oct 2024 02:49:57 +0530 Subject: [PATCH 108/224] BUG: fix treatment of NaNs when .apply() function is used on categorical columns. (#59966) * remove action=ignore for .apply() on cat dtype * add PR reference in comments * fix pytest linting * refac failing test_series_apply.py * Trigger CI * changes post review * rephrase change log --- doc/source/whatsnew/v3.0.0.rst | 2 +- pandas/core/apply.py | 14 ++------------ pandas/tests/apply/test_frame_apply.py | 3 ++- pandas/tests/apply/test_series_apply.py | 6 +++--- 4 files changed, 8 insertions(+), 17 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index a5b4560a47bc4..52debcc49eb27 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -544,7 +544,7 @@ Bug fixes Categorical ^^^^^^^^^^^ -- +- Bug in :func:`Series.apply` where ``nan`` was ignored for :class:`CategoricalDtype` (:issue:`59938`) - Datetimelike diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 7d50b466f5126..1f13459724d78 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -38,10 +38,7 @@ is_numeric_dtype, is_sequence, ) -from pandas.core.dtypes.dtypes import ( - CategoricalDtype, - ExtensionDtype, -) +from pandas.core.dtypes.dtypes import ExtensionDtype from pandas.core.dtypes.generic import ( ABCDataFrame, ABCNDFrame, @@ -1465,14 +1462,7 @@ def curried(x): else: curried = func - - # row-wise access - # apply doesn't have a `na_action` keyword and for backward compat reasons - # we need to give `na_action="ignore"` for categorical data. - # TODO: remove the `na_action="ignore"` when that default has been changed in - # Categorical (GH51645). 
- action = "ignore" if isinstance(obj.dtype, CategoricalDtype) else None - mapped = obj._map_values(mapper=curried, na_action=action) + mapped = obj._map_values(mapper=curried) if len(mapped) and isinstance(mapped[0], ABCSeries): # GH#43986 Need to do list(mapped) in order to get treated as nested diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py index dee0efcd8fd15..f0ab01e9e960e 100644 --- a/pandas/tests/apply/test_frame_apply.py +++ b/pandas/tests/apply/test_frame_apply.py @@ -741,8 +741,9 @@ def test_apply_category_equalness(val): result = df.a.apply(lambda x: x == val) expected = Series( - [np.nan if pd.isnull(x) else x == val for x in df_values], name="a" + [False if pd.isnull(x) else x == val for x in df_values], name="a" ) + # False since behavior of NaN for categorical dtype has been changed (GH 59966) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/apply/test_series_apply.py b/pandas/tests/apply/test_series_apply.py index 76704de6f2d10..9541b0b7495c7 100644 --- a/pandas/tests/apply/test_series_apply.py +++ b/pandas/tests/apply/test_series_apply.py @@ -236,10 +236,10 @@ def test_apply_categorical_with_nan_values(series, by_row): with pytest.raises(AttributeError, match=msg): s.apply(lambda x: x.split("-")[0], by_row=by_row) return - - result = s.apply(lambda x: x.split("-")[0], by_row=by_row) + # NaN for cat dtype fixed in (GH 59966) + result = s.apply(lambda x: x.split("-")[0] if pd.notna(x) else False, by_row=by_row) result = result.astype(object) - expected = Series(["1", "1", np.nan], dtype="category") + expected = Series(["1", "1", False], dtype="category") expected = expected.astype(object) tm.assert_series_equal(result, expected) From 7f54bec678694b1bb8e91ab4dc8944431d1c7ae1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Janez=20Dem=C5=A1ar?= Date: Sat, 5 Oct 2024 00:09:52 +0200 Subject: [PATCH 109/224] BUG: Fix SparseFrameAccessor.to_dense return type (#59967) * BUG: Fix SparseFrameAccessor.to_dense return type * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/arrays/sparse/accessor.py | 6 +++--- pandas/tests/arrays/sparse/test_accessor.py | 4 ++++ 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 52debcc49eb27..35963a90b5d07 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -682,6 +682,7 @@ Sparse ^^^^^^ - Bug in :class:`SparseDtype` for equal comparison with na fill value. (:issue:`54770`) - Bug in :meth:`DataFrame.sparse.from_spmatrix` which hard coded an invalid ``fill_value`` for certain subtypes. 
(:issue:`59063`) +- Bug in :meth:`DataFrame.sparse.to_dense` which ignored subclassing and always returned an instance of :class:`DataFrame` (:issue:`59913`) ExtensionArray ^^^^^^^^^^^^^^ diff --git a/pandas/core/arrays/sparse/accessor.py b/pandas/core/arrays/sparse/accessor.py index e610e018c5a74..8083371ed171a 100644 --- a/pandas/core/arrays/sparse/accessor.py +++ b/pandas/core/arrays/sparse/accessor.py @@ -369,10 +369,10 @@ def to_dense(self) -> DataFrame: 1 1 2 0 """ - from pandas import DataFrame - data = {k: v.array.to_dense() for k, v in self._parent.items()} - return DataFrame(data, index=self._parent.index, columns=self._parent.columns) + return self._parent._constructor( + data, index=self._parent.index, columns=self._parent.columns + ) def to_coo(self) -> spmatrix: """ diff --git a/pandas/tests/arrays/sparse/test_accessor.py b/pandas/tests/arrays/sparse/test_accessor.py index bd3298940ae3a..08bfd5b69fdd9 100644 --- a/pandas/tests/arrays/sparse/test_accessor.py +++ b/pandas/tests/arrays/sparse/test_accessor.py @@ -252,3 +252,7 @@ def test_with_column_named_sparse(self): # https://github.com/pandas-dev/pandas/issues/30758 df = pd.DataFrame({"sparse": pd.arrays.SparseArray([1, 2])}) assert isinstance(df.sparse, pd.core.arrays.sparse.accessor.SparseFrameAccessor) + + def test_subclassing(self): + df = tm.SubclassedDataFrame({"sparse": pd.arrays.SparseArray([1, 2])}) + assert isinstance(df.sparse.to_dense(), tm.SubclassedDataFrame) From aea1643c6428cbf52abfa07b068c445149b98827 Mon Sep 17 00:00:00 2001 From: invalidarg Date: Sat, 5 Oct 2024 00:10:36 +0200 Subject: [PATCH 110/224] BUG: CSS strings truncated at ":" (#59720) * second item in tuple is no longer truncated at first colon https://github.com/pandas-dev/pandas/issues/59623 * added testcase for maybe_convert_css_to_tuples #59623 * maybe_convert_css_to_tuples() raises on strings without ":" * fixed implicit str concatination * Fixed raise on empty string * Update test_style.py * attr:; -> ("attr","") Same behavior as before patch * add test for "attr:;", ie empty value * str concatenation in the test broke mypy * revert explicit str concat * Invalidarg patch black (#1) * black test_style * Update style_render.py --------- Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/io/formats/style_render.py | 24 ++++++++++----------- pandas/tests/io/formats/style/test_style.py | 13 ++++++++++- 2 files changed, 24 insertions(+), 13 deletions(-) diff --git a/pandas/io/formats/style_render.py b/pandas/io/formats/style_render.py index 8a6383f7e8f82..08d9fd938c873 100644 --- a/pandas/io/formats/style_render.py +++ b/pandas/io/formats/style_render.py @@ -906,9 +906,9 @@ def concatenated_visible_rows(obj): row_body_headers = [ { **col, - "display_value": col["display_value"] - if col["is_visible"] - else "", + "display_value": ( + col["display_value"] if col["is_visible"] else "" + ), "cellstyle": self.ctx_index[r, c], } for c, col in enumerate(row[:index_levels]) @@ -2069,18 +2069,18 @@ def maybe_convert_css_to_tuples(style: CSSProperties) -> CSSList: ('border','1px solid red')] """ if isinstance(style, str): - s = style.split(";") - try: - return [ - (x.split(":")[0].strip(), x.split(":")[1].strip()) - for x in s - if x.strip() != "" - ] - except IndexError as err: + if style and ":" not in style: raise ValueError( "Styles supplied as string must follow CSS rule formats, " f"for example 'attr: val;'. '{style}' was given." 
- ) from err + ) + s = style.split(";") + return [ + (x.split(":")[0].strip(), ":".join(x.split(":")[1:]).strip()) + for x in s + if x.strip() != "" + ] + return style diff --git a/pandas/tests/io/formats/style/test_style.py b/pandas/tests/io/formats/style/test_style.py index 89addbbbc1ded..e9fc2b2d27afd 100644 --- a/pandas/tests/io/formats/style/test_style.py +++ b/pandas/tests/io/formats/style/test_style.py @@ -886,8 +886,19 @@ def test_maybe_convert_css_to_tuples(self): expected = [] assert maybe_convert_css_to_tuples("") == expected + # issue #59623 + expected = [("a", "b"), ("c", "url('data:123')")] + assert maybe_convert_css_to_tuples("a:b;c: url('data:123');") == expected + + # if no value, return attr and empty string + expected = [("a", ""), ("c", "")] + assert maybe_convert_css_to_tuples("a:;c: ") == expected + def test_maybe_convert_css_to_tuples_err(self): - msg = "Styles supplied as string must follow CSS rule formats" + msg = ( + "Styles supplied as string must follow CSS rule formats, " + "for example 'attr: val;'. 'err' was given." + ) with pytest.raises(ValueError, match=msg): maybe_convert_css_to_tuples("err") From 24190fdb0efd781be9f0a886256edc595587c20f Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Sat, 5 Oct 2024 22:38:20 +0530 Subject: [PATCH 111/224] DOC: fix RT03,SA01 for pandas.period_range (#59958) --- ci/code_checks.sh | 1 - pandas/core/indexes/period.py | 8 ++++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 4a1a0042405e3..c9d2f54eba1ed 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -184,7 +184,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.io.stata.StataReader.variable_labels RT03,SA01" \ -i "pandas.io.stata.StataWriter.write_file SA01" \ -i "pandas.json_normalize RT03,SA01" \ - -i "pandas.period_range RT03,SA01" \ -i "pandas.plotting.andrews_curves RT03,SA01" \ -i "pandas.plotting.lag_plot RT03,SA01" \ -i "pandas.plotting.scatter_matrix PR07,SA01" \ diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index b5f05ef0ab78f..377406e24b1d3 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -563,6 +563,14 @@ def period_range( Returns ------- PeriodIndex + A PeriodIndex of fixed frequency periods. + + See Also + -------- + date_range : Returns a fixed frequency DatetimeIndex. + Period : Represents a period of time. + PeriodIndex : Immutable ndarray holding ordinal values indicating regular periods + in time. 
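A one-line usage sketch to go with the ``Returns`` description added above (expected output on a current pandas build):

    >>> import pandas as pd
    >>> pd.period_range(start="2024-01", periods=3, freq="M")
    PeriodIndex(['2024-01', '2024-02', '2024-03'], dtype='period[M]')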
Notes ----- From b63c7954d5195b3999cd867b788758e412bf30e1 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Sat, 5 Oct 2024 22:40:04 +0530 Subject: [PATCH 112/224] DOC: fix SA01, ES01 for pandas.io.stata.StataReader.data_label (#59962) --- ci/code_checks.sh | 1 - pandas/io/stata.py | 10 ++++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index c9d2f54eba1ed..ad6ea5b0deb9f 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -179,7 +179,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.errors.ValueLabelTypeMismatch SA01" \ -i "pandas.infer_freq SA01" \ -i "pandas.io.json.build_table_schema PR07,RT03,SA01" \ - -i "pandas.io.stata.StataReader.data_label SA01" \ -i "pandas.io.stata.StataReader.value_labels RT03,SA01" \ -i "pandas.io.stata.StataReader.variable_labels RT03,SA01" \ -i "pandas.io.stata.StataWriter.write_file SA01" \ diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 4be06f93689f2..6b988d8fed6bf 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -2004,6 +2004,16 @@ def data_label(self) -> str: """ Return data label of Stata file. + The data label is a descriptive string associated with the dataset + stored in the Stata file. This property provides access to that + label, if one is present. + + See Also + -------- + io.stata.StataReader.variable_labels : Return a dict associating each variable + name with corresponding label. + DataFrame.to_stata : Export DataFrame object to Stata dta format. + Examples -------- >>> df = pd.DataFrame([(1,)], columns=["variable"]) From e740857e6399c589e2704da5376a0a28cc251a38 Mon Sep 17 00:00:00 2001 From: Xiao Yuan Date: Sun, 6 Oct 2024 01:12:09 +0800 Subject: [PATCH 113/224] BUG: fix to_numeric raises TypeError for Timedelta and Timestamp scalar (#59974) * BUG: fix to_numeric raises TypeError for Timedelta and Timestamp scalar * Add whatsnew --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/tools/numeric.py | 6 ++++++ pandas/tests/tools/test_to_numeric.py | 15 +++++++++++++++ 3 files changed, 22 insertions(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 35963a90b5d07..ed0836233553b 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -701,6 +701,7 @@ Other - Bug in :func:`eval` on :class:`ExtensionArray` on including division ``/`` failed with a ``TypeError``. (:issue:`58748`) - Bug in :func:`eval` where the names of the :class:`Series` were not preserved when using ``engine="numexpr"``. (:issue:`10239`) - Bug in :func:`eval` with ``engine="numexpr"`` returning unexpected result for float division. (:issue:`59736`) +- Bug in :func:`to_numeric` raising ``TypeError`` when ``arg`` is a :class:`Timedelta` or :class:`Timestamp` scalar. (:issue:`59944`) - Bug in :func:`unique` on :class:`Index` not always returning :class:`Index` (:issue:`57043`) - Bug in :meth:`DataFrame.apply` where passing ``engine="numba"`` ignored ``args`` passed to the applied function (:issue:`58712`) - Bug in :meth:`DataFrame.eval` and :meth:`DataFrame.query` which caused an exception when using NumPy attributes via ``@`` notation, e.g., ``df.eval("@np.floor(a)")``. 
(:issue:`58041`) diff --git a/pandas/core/tools/numeric.py b/pandas/core/tools/numeric.py index 982851d0557c3..f159babb7e018 100644 --- a/pandas/core/tools/numeric.py +++ b/pandas/core/tools/numeric.py @@ -11,6 +11,10 @@ lib, missing as libmissing, ) +from pandas._libs.tslibs import ( + Timedelta, + Timestamp, +) from pandas.util._validators import check_dtype_backend from pandas.core.dtypes.cast import maybe_downcast_numeric @@ -189,6 +193,8 @@ def to_numeric( return float(arg) if is_number(arg): return arg + if isinstance(arg, (Timedelta, Timestamp)): + return arg._value is_scalars = True values = np.array([arg], dtype="O") elif getattr(arg, "ndim", 1) > 1: diff --git a/pandas/tests/tools/test_to_numeric.py b/pandas/tests/tools/test_to_numeric.py index 585b7ca94f730..f3645bf0649bd 100644 --- a/pandas/tests/tools/test_to_numeric.py +++ b/pandas/tests/tools/test_to_numeric.py @@ -384,6 +384,21 @@ def test_timedelta(transform_assert_equal): assert_equal(result, expected) +@pytest.mark.parametrize( + "scalar", + [ + pd.Timedelta(1, "D"), + pd.Timestamp("2017-01-01T12"), + pd.Timestamp("2017-01-01T12", tz="US/Pacific"), + ], +) +def test_timedelta_timestamp_scalar(scalar): + # GH#59944 + result = to_numeric(scalar) + expected = to_numeric(Series(scalar))[0] + assert result == expected + + def test_period(request, transform_assert_equal): transform, assert_equal = transform_assert_equal From 05fa9583f7bc22796076b7e2a7b94058bebca511 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Sat, 5 Oct 2024 22:43:23 +0530 Subject: [PATCH 114/224] DOC: fix SA01, ES01 for pandas.testing.assert_extension_array_equal (#59975) --- ci/code_checks.sh | 1 - pandas/_testing/asserters.py | 10 ++++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index ad6ea5b0deb9f..16a3a22bc4876 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -187,7 +187,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.plotting.lag_plot RT03,SA01" \ -i "pandas.plotting.scatter_matrix PR07,SA01" \ -i "pandas.set_eng_float_format RT03,SA01" \ - -i "pandas.testing.assert_extension_array_equal SA01" \ -i "pandas.tseries.offsets.BDay PR02,SA01" \ -i "pandas.tseries.offsets.BQuarterBegin.is_on_offset GL08" \ -i "pandas.tseries.offsets.BQuarterBegin.n GL08" \ diff --git a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py index bbd5e60a5a812..01c4dcd92ee40 100644 --- a/pandas/_testing/asserters.py +++ b/pandas/_testing/asserters.py @@ -701,6 +701,10 @@ def assert_extension_array_equal( """ Check that left and right ExtensionArrays are equal. + This method compares two ``ExtensionArray`` instances for equality, + including checks for missing values, the dtype of the arrays, and + the exactness of the comparison (or tolerance when comparing floats). + Parameters ---------- left, right : ExtensionArray @@ -726,6 +730,12 @@ def assert_extension_array_equal( .. versionadded:: 2.0.0 + See Also + -------- + testing.assert_series_equal : Check that left and right ``Series`` are equal. + testing.assert_frame_equal : Check that left and right ``DataFrame`` are equal. + testing.assert_index_equal : Check that left and right ``Index`` are equal. + Notes ----- Missing values are checked separately from valid values. 
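Because missing values are compared separately, two masked arrays agree when their valid values and their ``NA`` positions both match; a minimal sketch of the assertion in use:

    >>> import pandas as pd
    >>> from pandas import testing as tm
    >>> left = pd.array([1, 2, pd.NA], dtype="Int64")
    >>> right = pd.array([1, 2, pd.NA], dtype="Int64")
    >>> tm.assert_extension_array_equal(left, right)  # passes, returns None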
From c8813aeebcff18699e558ea0ee56abb9dde6a6f6 Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Sun, 6 Oct 2024 13:33:14 -0400 Subject: [PATCH 115/224] API: value_counts to consistently maintain order of input (#59745) * API: value_counts to consistently maintain order of input * Docs * Cleanup * Test & docs fixups * Refine whatsnew * Refine whatsnew --- doc/source/whatsnew/v3.0.0.rst | 61 ++++++++++++++++++ pandas/core/frame.py | 10 ++- pandas/core/groupby/generic.py | 28 ++++---- pandas/core/groupby/groupby.py | 4 +- pandas/core/groupby/ops.py | 39 +++++++++-- .../tests/frame/methods/test_value_counts.py | 4 +- .../groupby/methods/test_value_counts.py | 64 +++++++++---------- 7 files changed, 157 insertions(+), 53 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index ed0836233553b..321005272817d 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -203,6 +203,67 @@ In cases with mixed-resolution inputs, the highest resolution is used: In [2]: pd.to_datetime([pd.Timestamp("2024-03-22 11:43:01"), "2024-03-22 11:43:01.002"]).dtype Out[2]: dtype('>> df.groupby("gender").value_counts() gender education country - female high FR 1 - US 1 + female high US 1 + FR 1 male low FR 2 US 1 medium FR 1 @@ -2682,8 +2688,8 @@ def value_counts( >>> df.groupby("gender").value_counts(ascending=True) gender education country - female high FR 1 - US 1 + female high US 1 + FR 1 male low US 1 medium FR 1 low FR 2 @@ -2691,8 +2697,8 @@ def value_counts( >>> df.groupby("gender").value_counts(normalize=True) gender education country - female high FR 0.50 - US 0.50 + female high US 0.50 + FR 0.50 male low FR 0.50 US 0.25 medium FR 0.25 @@ -2700,16 +2706,16 @@ def value_counts( >>> df.groupby("gender", as_index=False).value_counts() gender education country count - 0 female high FR 1 - 1 female high US 1 + 0 female high US 1 + 1 female high FR 1 2 male low FR 2 3 male low US 1 4 male medium FR 1 >>> df.groupby("gender", as_index=False).value_counts(normalize=True) gender education country proportion - 0 female high FR 0.50 - 1 female high US 0.50 + 0 female high US 0.50 + 1 female high FR 0.50 2 male low FR 0.50 3 male low US 0.25 4 male medium FR 0.25 diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index e2410788ea95e..68314567d1b5e 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2519,7 +2519,7 @@ def _value_counts( grouper, _, _ = get_grouper( df, key=key, - sort=self.sort, + sort=False, observed=False, dropna=dropna, ) @@ -2528,7 +2528,7 @@ def _value_counts( # Take the size of the overall columns gb = df.groupby( groupings, - sort=self.sort, + sort=False, observed=self.observed, dropna=self.dropna, ) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 0e99178642715..a82e77140d274 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -755,6 +755,7 @@ def result_index_and_ids(self) -> tuple[Index, npt.NDArray[np.intp]]: obs = [ ping._observed or not ping._passed_categorical for ping in self.groupings ] + sorts = [ping._sort for ping in self.groupings] # When passed a categorical grouping, keep all categories for k, (ping, level) in enumerate(zip(self.groupings, levels)): if ping._passed_categorical: @@ -765,7 +766,9 @@ def result_index_and_ids(self) -> tuple[Index, npt.NDArray[np.intp]]: result_index.name = self.names[0] ids = ensure_platform_int(self.codes[0]) elif all(obs): - 
result_index, ids = self._ob_index_and_ids(levels, self.codes, self.names) + result_index, ids = self._ob_index_and_ids( + levels, self.codes, self.names, sorts + ) elif not any(obs): result_index, ids = self._unob_index_and_ids(levels, self.codes, self.names) else: @@ -778,6 +781,7 @@ def result_index_and_ids(self) -> tuple[Index, npt.NDArray[np.intp]]: levels=[levels[idx] for idx in ob_indices], codes=[codes[idx] for idx in ob_indices], names=[names[idx] for idx in ob_indices], + sorts=[sorts[idx] for idx in ob_indices], ) unob_index, unob_ids = self._unob_index_and_ids( levels=[levels[idx] for idx in unob_indices], @@ -800,9 +804,18 @@ def result_index_and_ids(self) -> tuple[Index, npt.NDArray[np.intp]]: ).reorder_levels(index) ids = len(unob_index) * ob_ids + unob_ids - if self._sort: + if any(sorts): # Sort result_index and recode ids using the new order - sorter = result_index.argsort() + n_levels = len(sorts) + drop_levels = [ + n_levels - idx + for idx, sort in enumerate(reversed(sorts), 1) + if not sort + ] + if len(drop_levels) > 0: + sorter = result_index._drop_level_numbers(drop_levels).argsort() + else: + sorter = result_index.argsort() result_index = result_index.take(sorter) _, index = np.unique(sorter, return_index=True) ids = ensure_platform_int(ids) @@ -837,10 +850,13 @@ def _ob_index_and_ids( levels: list[Index], codes: list[npt.NDArray[np.intp]], names: list[Hashable], + sorts: list[bool], ) -> tuple[MultiIndex, npt.NDArray[np.intp]]: + consistent_sorting = all(sorts[0] == sort for sort in sorts[1:]) + sort_in_compress = sorts[0] if consistent_sorting else False shape = tuple(len(level) for level in levels) group_index = get_group_index(codes, shape, sort=True, xnull=True) - ob_ids, obs_group_ids = compress_group_index(group_index, sort=self._sort) + ob_ids, obs_group_ids = compress_group_index(group_index, sort=sort_in_compress) ob_ids = ensure_platform_int(ob_ids) ob_index_codes = decons_obs_group_ids( ob_ids, obs_group_ids, shape, codes, xnull=True @@ -851,6 +867,21 @@ def _ob_index_and_ids( names=names, verify_integrity=False, ) + if not consistent_sorting: + # Sort by the levels where the corresponding sort argument is True + n_levels = len(sorts) + drop_levels = [ + n_levels - idx + for idx, sort in enumerate(reversed(sorts), 1) + if not sort + ] + if len(drop_levels) > 0: + sorter = ob_index._drop_level_numbers(drop_levels).argsort() + else: + sorter = ob_index.argsort() + ob_index = ob_index.take(sorter) + _, index = np.unique(sorter, return_index=True) + ob_ids = np.where(ob_ids == -1, -1, index.take(ob_ids)) ob_ids = ensure_platform_int(ob_ids) return ob_index, ob_ids diff --git a/pandas/tests/frame/methods/test_value_counts.py b/pandas/tests/frame/methods/test_value_counts.py index 7670b53f23173..de5029b9f18b2 100644 --- a/pandas/tests/frame/methods/test_value_counts.py +++ b/pandas/tests/frame/methods/test_value_counts.py @@ -128,7 +128,7 @@ def test_data_frame_value_counts_dropna_true(nulls_fixture): expected = pd.Series( data=[1, 1], index=pd.MultiIndex.from_arrays( - [("Beth", "John"), ("Louise", "Smith")], names=["first_name", "middle_name"] + [("John", "Beth"), ("Smith", "Louise")], names=["first_name", "middle_name"] ), name="count", ) @@ -156,7 +156,7 @@ def test_data_frame_value_counts_dropna_false(nulls_fixture): pd.Index(["Anne", "Beth", "John"]), pd.Index(["Louise", "Smith", np.nan]), ], - codes=[[0, 1, 2, 2], [2, 0, 1, 2]], + codes=[[2, 0, 2, 1], [1, 2, 2, 0]], names=["first_name", "middle_name"], ), name="count", diff --git 
a/pandas/tests/groupby/methods/test_value_counts.py b/pandas/tests/groupby/methods/test_value_counts.py index 8f8f7f64aba75..8f3022fbe551c 100644 --- a/pandas/tests/groupby/methods/test_value_counts.py +++ b/pandas/tests/groupby/methods/test_value_counts.py @@ -255,10 +255,10 @@ def test_basic(education_df, request): index=MultiIndex.from_tuples( [ ("FR", "male", "low"), - ("FR", "female", "high"), ("FR", "male", "medium"), - ("US", "female", "high"), + ("FR", "female", "high"), ("US", "male", "low"), + ("US", "female", "high"), ], names=["country", "gender", "education"], ), @@ -472,11 +472,11 @@ def test_data_frame_value_counts( ( False, False, - [0, 1, 3, 5, 7, 6, 8, 2, 4], + [0, 1, 3, 5, 6, 7, 8, 2, 4], [0.5, 0.5, 1.0, 0.25, 0.25, 0.25, 0.25, 1.0, 1.0], ), (False, True, [0, 1, 3, 5, 2, 4], [0.5, 0.5, 1.0, 1.0, 1.0, 1.0]), - (True, False, [0, 1, 5, 7, 6, 8], [0.5, 0.5, 0.25, 0.25, 0.25, 0.25]), + (True, False, [0, 1, 5, 6, 7, 8], [0.5, 0.5, 0.25, 0.25, 0.25, 0.25]), (True, True, [0, 1, 5], [0.5, 0.5, 1.0]), ], ) @@ -518,7 +518,7 @@ def test_dropna_combinations( True, [1, 1], MultiIndex.from_arrays( - [(1, 1), ("Beth", "John"), ("Louise", "Smith")], + [(1, 1), ("John", "Beth"), ("Smith", "Louise")], names=["key", "first_name", "middle_name"], ), ), @@ -531,7 +531,7 @@ def test_dropna_combinations( Index(["Anne", "Beth", "John"]), Index(["Louise", "Smith", np.nan]), ], - codes=[[0, 0, 0, 0], [0, 1, 2, 2], [2, 0, 1, 2]], + codes=[[0, 0, 0, 0], [2, 0, 2, 1], [1, 2, 2, 0]], names=["key", "first_name", "middle_name"], ), ), @@ -609,17 +609,17 @@ def test_categorical_single_grouper_with_only_observed_categories( expected_index = MultiIndex.from_tuples( [ ("FR", "male", "low"), - ("FR", "female", "high"), ("FR", "male", "medium"), + ("FR", "female", "high"), + ("FR", "male", "high"), ("FR", "female", "low"), ("FR", "female", "medium"), - ("FR", "male", "high"), - ("US", "female", "high"), ("US", "male", "low"), + ("US", "female", "high"), + ("US", "male", "medium"), + ("US", "male", "high"), ("US", "female", "low"), ("US", "female", "medium"), - ("US", "male", "high"), - ("US", "male", "medium"), ], names=["country", "gender", "education"], ) @@ -711,17 +711,17 @@ def test_categorical_single_grouper_observed_true( expected_index = [ ("FR", "male", "low"), - ("FR", "female", "high"), ("FR", "male", "medium"), + ("FR", "female", "high"), + ("FR", "male", "high"), ("FR", "female", "low"), ("FR", "female", "medium"), - ("FR", "male", "high"), - ("US", "female", "high"), ("US", "male", "low"), + ("US", "female", "high"), + ("US", "male", "medium"), + ("US", "male", "high"), ("US", "female", "low"), ("US", "female", "medium"), - ("US", "male", "high"), - ("US", "male", "medium"), ] assert_categorical_single_grouper( @@ -791,23 +791,23 @@ def test_categorical_single_grouper_observed_false( expected_index = [ ("FR", "male", "low"), - ("FR", "female", "high"), ("FR", "male", "medium"), + ("FR", "female", "high"), + ("FR", "male", "high"), ("FR", "female", "low"), ("FR", "female", "medium"), - ("FR", "male", "high"), - ("US", "female", "high"), ("US", "male", "low"), + ("US", "female", "high"), + ("US", "male", "medium"), + ("US", "male", "high"), ("US", "female", "low"), ("US", "female", "medium"), - ("US", "male", "high"), - ("US", "male", "medium"), - ("ASIA", "female", "high"), - ("ASIA", "female", "low"), - ("ASIA", "female", "medium"), - ("ASIA", "male", "high"), ("ASIA", "male", "low"), ("ASIA", "male", "medium"), + ("ASIA", "male", "high"), + ("ASIA", "female", "low"), + ("ASIA", "female", 
"medium"), + ("ASIA", "female", "high"), ] assert_categorical_single_grouper( @@ -837,8 +837,8 @@ def test_categorical_single_grouper_observed_false( ("US", "high", "male"), ("US", "low", "male"), ("US", "low", "female"), - ("US", "medium", "female"), ("US", "medium", "male"), + ("US", "medium", "female"), ], ), ( @@ -949,17 +949,17 @@ def test_categorical_non_groupers( expected_index = [ ("FR", "male", "low"), - ("FR", "female", "high"), ("FR", "male", "medium"), + ("FR", "female", "high"), + ("FR", "male", "high"), ("FR", "female", "low"), ("FR", "female", "medium"), - ("FR", "male", "high"), - ("US", "female", "high"), ("US", "male", "low"), + ("US", "female", "high"), + ("US", "male", "medium"), + ("US", "male", "high"), ("US", "female", "low"), ("US", "female", "medium"), - ("US", "male", "high"), - ("US", "male", "medium"), ] expected_series = Series( data=expected_data, @@ -1178,7 +1178,7 @@ def test_value_counts_sort(sort, vc_sort, normalize): if sort and vc_sort: taker = [0, 1, 2] elif sort and not vc_sort: - taker = [0, 1, 2] + taker = [1, 0, 2] elif not sort and vc_sort: taker = [0, 2, 1] else: From febfc0b32c92326a6ca3a4a0aa25dd4d88ab19ad Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Sun, 6 Oct 2024 23:07:24 +0530 Subject: [PATCH 116/224] DOC: fix PR07,SA01,ES01 for pandas.Series.sparse.from_coo (#59980) --- ci/code_checks.sh | 1 - pandas/core/arrays/sparse/accessor.py | 14 ++++++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 16a3a22bc4876..c93dbf511aec0 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -96,7 +96,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Series.dt.tz_localize PR01,PR02" \ -i "pandas.Series.dt.unit GL08" \ -i "pandas.Series.pad PR01,SA01" \ - -i "pandas.Series.sparse.from_coo PR07,SA01" \ -i "pandas.Timedelta.max PR02" \ -i "pandas.Timedelta.min PR02" \ -i "pandas.Timedelta.resolution PR02" \ diff --git a/pandas/core/arrays/sparse/accessor.py b/pandas/core/arrays/sparse/accessor.py index 8083371ed171a..0ed5f69fe4703 100644 --- a/pandas/core/arrays/sparse/accessor.py +++ b/pandas/core/arrays/sparse/accessor.py @@ -88,9 +88,17 @@ def from_coo(cls, A, dense_index: bool = False) -> Series: """ Create a Series with sparse values from a scipy.sparse.coo_matrix. + This method takes a ``scipy.sparse.coo_matrix`` (coordinate format) as input and + returns a pandas ``Series`` where the non-zero elements are represented as + sparse values. The index of the Series can either include only the coordinates + of non-zero elements (default behavior) or the full sorted set of coordinates + from the matrix if ``dense_index`` is set to `True`. + Parameters ---------- A : scipy.sparse.coo_matrix + The sparse matrix in coordinate format from which the sparse Series + will be created. dense_index : bool, default False If False (default), the index consists of only the coords of the non-null entries of the original coo_matrix. @@ -102,6 +110,12 @@ def from_coo(cls, A, dense_index: bool = False) -> Series: s : Series A Series with sparse values. + See Also + -------- + DataFrame.sparse.from_spmatrix : Create a new DataFrame from a scipy sparse + matrix. + scipy.sparse.coo_matrix : A sparse matrix in COOrdinate format. 
+ Examples -------- >>> from scipy import sparse From e4905bfd1825f490ff12b12c2659d3194882a9e3 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Sun, 6 Oct 2024 23:08:02 +0530 Subject: [PATCH 117/224] DOC: fix PR01,SA01,ES01 for pandas.api.types.is_float (#59981) --- ci/code_checks.sh | 1 - pandas/_libs/lib.pyx | 14 ++++++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index c93dbf511aec0..6f4534ba4a4de 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -105,7 +105,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Timestamp.resolution PR02" \ -i "pandas.Timestamp.tzinfo GL08" \ -i "pandas.Timestamp.year GL08" \ - -i "pandas.api.types.is_float PR01,SA01" \ -i "pandas.api.types.is_integer PR01,SA01" \ -i "pandas.api.types.is_iterator PR07,SA01" \ -i "pandas.api.types.is_re_compilable PR07,SA01" \ diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index de7d9af731010..23e0f387466aa 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1089,9 +1089,23 @@ def is_float(obj: object) -> bool: """ Return True if given object is float. + This method checks whether the passed object is a float type. It + returns `True` if the object is a float, and `False` otherwise. + + Parameters + ---------- + obj : object + The object to check for float type. + Returns ------- bool + `True` if the object is of float type, otherwise `False`. + + See Also + -------- + api.types.is_integer : Check if an object is of integer type. + api.types.is_numeric_dtype : Check if an object is of numeric type. Examples -------- From 8c0777ed0d00cafe32fbb1b37e40396898601490 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Sun, 6 Oct 2024 23:08:49 +0530 Subject: [PATCH 118/224] DOC: fix SA01 for pandas.arrays.DatetimeArray (#59982) --- ci/code_checks.sh | 1 - pandas/core/arrays/datetimes.py | 8 ++++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 6f4534ba4a4de..5ef4f26e66134 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -110,7 +110,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.api.types.is_re_compilable PR07,SA01" \ -i "pandas.api.types.pandas_dtype PR07,RT03,SA01" \ -i "pandas.arrays.ArrowExtensionArray PR07,SA01" \ - -i "pandas.arrays.DatetimeArray SA01" \ -i "pandas.arrays.IntegerArray SA01" \ -i "pandas.arrays.IntervalArray.left SA01" \ -i "pandas.arrays.IntervalArray.length SA01" \ diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 43f4428118aa7..41128e52e31b3 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -205,6 +205,14 @@ class DatetimeArray(dtl.TimelikeOps, dtl.DatelikeOps): # type: ignore[misc] ------- None + See Also + -------- + DatetimeIndex : Immutable Index for datetime-like data. + Series : One-dimensional labeled array capable of holding datetime-like data. + Timestamp : Pandas replacement for python datetime.datetime object. + to_datetime : Convert argument to datetime. + period_range : Return a fixed frequency PeriodIndex. 
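As the new cross references hint, a DatetimeArray is normally obtained from an existing datetime-like object rather than through the private constructor shown in the example that follows; a short sketch of the usual route:

    import pandas as pd

    # to_datetime builds a DatetimeIndex; its .array attribute exposes the
    # underlying DatetimeArray
    arr = pd.to_datetime(["2024-01-01", "2024-01-02"]).array
    assert isinstance(arr, pd.arrays.DatetimeArray)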
+ Examples -------- >>> pd.arrays.DatetimeArray._from_sequence( From 4c9620545dcfdf69cc995cf14313e46b75385816 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Sun, 6 Oct 2024 23:09:30 +0530 Subject: [PATCH 119/224] DOC: fix SA01 for pandas.errors.SpecificationError (#59983) --- ci/code_checks.sh | 1 - pandas/errors/__init__.py | 5 +++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 5ef4f26e66134..453c163792fa4 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -169,7 +169,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.errors.PerformanceWarning SA01" \ -i "pandas.errors.PossibleDataLossError SA01" \ -i "pandas.errors.PossiblePrecisionLoss SA01" \ - -i "pandas.errors.SpecificationError SA01" \ -i "pandas.errors.UndefinedVariableError PR01,SA01" \ -i "pandas.errors.UnsortedIndexError SA01" \ -i "pandas.errors.UnsupportedFunctionCall SA01" \ diff --git a/pandas/errors/__init__.py b/pandas/errors/__init__.py index 46e090cc3a589..cf2a9d3f4a238 100644 --- a/pandas/errors/__init__.py +++ b/pandas/errors/__init__.py @@ -444,6 +444,11 @@ class SpecificationError(Exception): The second way is calling ``agg`` on a Dataframe with duplicated functions names without assigning column name. + See Also + -------- + DataFrame.agg : Aggregate using one or more operations over the specified axis. + Series.agg : Aggregate using one or more operations over the specified axis. + Examples -------- >>> df = pd.DataFrame({"A": [1, 1, 1, 2, 2], "B": range(5), "C": range(5)}) From 3c2c5f425ba03508d323f793c933c14bebd39dce Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Sun, 6 Oct 2024 23:10:02 +0530 Subject: [PATCH 120/224] DOC: fix SA01 for pandas.errors.InvalidVersion (#59984) --- ci/code_checks.sh | 1 - pandas/util/version/__init__.py | 8 ++++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 453c163792fa4..5487dc19338da 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -160,7 +160,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.errors.DuplicateLabelError SA01" \ -i "pandas.errors.IntCastingNaNError SA01" \ -i "pandas.errors.InvalidIndexError SA01" \ - -i "pandas.errors.InvalidVersion SA01" \ -i "pandas.errors.NullFrequencyError SA01" \ -i "pandas.errors.NumExprClobberingError SA01" \ -i "pandas.errors.NumbaUtilError SA01" \ diff --git a/pandas/util/version/__init__.py b/pandas/util/version/__init__.py index b5d975a0db1d8..bd741140f6542 100644 --- a/pandas/util/version/__init__.py +++ b/pandas/util/version/__init__.py @@ -114,6 +114,14 @@ class InvalidVersion(ValueError): """ An invalid version was found, users should refer to PEP 440. + The ``InvalidVersion`` exception is raised when a version string is + improperly formatted. Pandas uses this exception to ensure that all + version strings are PEP 440 compliant. + + See Also + -------- + util.version.Version : Class for handling and parsing version strings. + Examples -------- >>> pd.util.version.Version("1.") From 5829e3ea20adc978ebfb82f08d3d5347108be0f0 Mon Sep 17 00:00:00 2001 From: Steffen Rehberg Date: Sun, 6 Oct 2024 23:29:45 +0200 Subject: [PATCH 121/224] DOC: Fix typos in plotting.table (#59986) Fix typos in pandas.plotting.table docstring. 
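With both typos fixed, the docstring example below should run as-is in an environment with matplotlib installed:

    import matplotlib.pyplot as plt
    import pandas as pd

    df = pd.DataFrame({"A": [1, 2], "B": [3, 4]})
    fig, ax = plt.subplots()  # previously misspelled as "fix, ax"
    ax.axis("off")
    table = pd.plotting.table(
        ax, df, loc="center", cellLoc="center", colWidths=[0.2, 0.2]
    )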
--- pandas/plotting/_misc.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/plotting/_misc.py b/pandas/plotting/_misc.py index d8455f44ef0d1..03701f8778065 100644 --- a/pandas/plotting/_misc.py +++ b/pandas/plotting/_misc.py @@ -39,7 +39,7 @@ def table(ax: Axes, data: DataFrame | Series, **kwargs) -> Table: **kwargs Keyword arguments to be passed to matplotlib.table.table. If `rowLabels` or `colLabels` is not specified, data index or column - name will be used. + names will be used. Returns ------- @@ -59,11 +59,11 @@ def table(ax: Axes, data: DataFrame | Series, **kwargs) -> Table: >>> import matplotlib.pyplot as plt >>> df = pd.DataFrame({"A": [1, 2], "B": [3, 4]}) - >>> fix, ax = plt.subplots() + >>> fig, ax = plt.subplots() >>> ax.axis("off") (0.0, 1.0, 0.0, 1.0) >>> table = pd.plotting.table( - ... ax, df, loc="center", cellLoc="center", colWidths=list([0.2, 0.2]) + ... ax, df, loc="center", cellLoc="center", colWidths=[0.2, 0.2] ... ) """ plot_backend = _get_plot_backend("matplotlib") From 2d9c95ddb70f9c68e1ad4893d07bf0f68a23316e Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 7 Oct 2024 10:00:48 -0700 Subject: [PATCH 122/224] Bump mamba-org/setup-micromamba from 1 to 2 (#59988) Bumps [mamba-org/setup-micromamba](https://github.com/mamba-org/setup-micromamba) from 1 to 2. - [Release notes](https://github.com/mamba-org/setup-micromamba/releases) - [Commits](https://github.com/mamba-org/setup-micromamba/compare/v1...v2) --- updated-dependencies: - dependency-name: mamba-org/setup-micromamba dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/package-checks.yml | 2 +- .github/workflows/wheels.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/package-checks.yml b/.github/workflows/package-checks.yml index 6748832903e30..331af6e05b650 100644 --- a/.github/workflows/package-checks.yml +++ b/.github/workflows/package-checks.yml @@ -67,7 +67,7 @@ jobs: fetch-depth: 0 - name: Set up Python - uses: mamba-org/setup-micromamba@v1 + uses: mamba-org/setup-micromamba@v2 with: environment-name: recipe-test create-args: >- diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index 2aaec8c9b56b0..de59a454c827c 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -165,7 +165,7 @@ jobs: CIBW_PLATFORM: ${{ matrix.buildplat[1] == 'pyodide_wasm32' && 'pyodide' || 'auto' }} - name: Set up Python - uses: mamba-org/setup-micromamba@v1 + uses: mamba-org/setup-micromamba@v2 with: environment-name: wheel-env # Use a fixed Python, since we might have an unreleased Python not From e5dc0646bb4b945cec03cc328ac0989cfe0fa60a Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Mon, 7 Oct 2024 22:44:19 +0530 Subject: [PATCH 123/224] DOC: fix RT03,SA01,ES01 for pandas.io.stata.StataReader.value_labels (#59991) --- ci/code_checks.sh | 1 - pandas/io/stata.py | 10 ++++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 5487dc19338da..102abf4be187c 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -174,7 +174,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.errors.ValueLabelTypeMismatch SA01" \ -i "pandas.infer_freq SA01" \ -i "pandas.io.json.build_table_schema PR07,RT03,SA01" \ - -i 
"pandas.io.stata.StataReader.value_labels RT03,SA01" \ -i "pandas.io.stata.StataReader.variable_labels RT03,SA01" \ -i "pandas.io.stata.StataWriter.write_file SA01" \ -i "pandas.json_normalize RT03,SA01" \ diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 6b988d8fed6bf..f1d289726c9c8 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -2076,9 +2076,19 @@ def value_labels(self) -> dict[str, dict[int, str]]: """ Return a nested dict associating each variable name to its value and label. + This method retrieves the value labels from a Stata file. Value labels are + mappings between the coded values and their corresponding descriptive labels + in a Stata dataset. + Returns ------- dict + A python dictionary. + + See Also + -------- + read_stata : Read Stata file into DataFrame. + DataFrame.to_stata : Export DataFrame object to Stata dta format. Examples -------- From b3d0b9622bcd5bdf9733100407bd8b2695bc9af6 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Mon, 7 Oct 2024 22:45:35 +0530 Subject: [PATCH 124/224] DOC: fix RT03,SA01,ES01 for pandas.plotting.lag_plot (#59990) --- ci/code_checks.sh | 1 - pandas/plotting/_misc.py | 11 +++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 102abf4be187c..6a1b613eccb8b 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -178,7 +178,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.io.stata.StataWriter.write_file SA01" \ -i "pandas.json_normalize RT03,SA01" \ -i "pandas.plotting.andrews_curves RT03,SA01" \ - -i "pandas.plotting.lag_plot RT03,SA01" \ -i "pandas.plotting.scatter_matrix PR07,SA01" \ -i "pandas.set_eng_float_format RT03,SA01" \ -i "pandas.tseries.offsets.BDay PR02,SA01" \ diff --git a/pandas/plotting/_misc.py b/pandas/plotting/_misc.py index 03701f8778065..81940613dd2b0 100644 --- a/pandas/plotting/_misc.py +++ b/pandas/plotting/_misc.py @@ -549,6 +549,10 @@ def lag_plot(series: Series, lag: int = 1, ax: Axes | None = None, **kwds) -> Ax """ Lag plot for time series. + A lag plot is a scatter plot of a time series against a lag of itself. It helps + in visualizing the temporal dependence between observations by plotting the values + at time `t` on the x-axis and the values at time `t + lag` on the y-axis. + Parameters ---------- series : Series @@ -563,6 +567,13 @@ def lag_plot(series: Series, lag: int = 1, ax: Axes | None = None, **kwds) -> Ax Returns ------- matplotlib.axes.Axes + The matplotlib Axes object containing the lag plot. + + See Also + -------- + plotting.autocorrelation_plot : Autocorrelation plot for time series. + matplotlib.pyplot.scatter : A scatter plot of y vs. x with varying marker size + and/or color in Matplotlib. Examples -------- From 02267e55586c33a4724dd5e9dbaecfe12e3aa8b4 Mon Sep 17 00:00:00 2001 From: Randolf Scholz Date: Mon, 7 Oct 2024 19:22:27 +0200 Subject: [PATCH 125/224] Typing: Added missing methods to `NaTType` stub (#59995) added missing methods to NaTType stub --- pandas/_libs/tslibs/nattype.pyi | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/pandas/_libs/tslibs/nattype.pyi b/pandas/_libs/tslibs/nattype.pyi index f49e894a0bfec..fa1577f033fff 100644 --- a/pandas/_libs/tslibs/nattype.pyi +++ b/pandas/_libs/tslibs/nattype.pyi @@ -9,6 +9,7 @@ from typing import ( Literal, NoReturn, TypeAlias, + overload, ) import numpy as np @@ -159,15 +160,31 @@ class NaTType: # inject Period properties @property def qyear(self) -> float: ... 
+ # comparisons def __eq__(self, other: object) -> bool: ... def __ne__(self, other: object) -> bool: ... __lt__: _NatComparison __le__: _NatComparison __gt__: _NatComparison __ge__: _NatComparison + # unary operators + def __pos__(self) -> Self: ... + def __neg__(self) -> Self: ... + # binary operators def __sub__(self, other: Self | timedelta | datetime) -> Self: ... def __rsub__(self, other: Self | timedelta | datetime) -> Self: ... def __add__(self, other: Self | timedelta | datetime) -> Self: ... def __radd__(self, other: Self | timedelta | datetime) -> Self: ... + def __mul__(self, other: float) -> Self: ... # analogous to timedelta + def __rmul__(self, other: float) -> Self: ... + @overload # analogous to timedelta + def __truediv__(self, other: Self | timedelta) -> float: ... # Literal[NaN] + @overload + def __truediv__(self, other: float) -> Self: ... + @overload # analogous to timedelta + def __floordiv__(self, other: Self | timedelta) -> float: ... # Literal[NaN] + @overload + def __floordiv__(self, other: float) -> Self: ... + # other def __hash__(self) -> int: ... def as_unit(self, unit: str, round_ok: bool = ...) -> NaTType: ... From 37c31afa1be8b51af545a2dc3354acaf42a9c95e Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Mon, 7 Oct 2024 18:30:40 -0400 Subject: [PATCH 126/224] REGR: groupby.value_counts with all NA values (#59999) * REGR: groupby.value_counts with all NA values * Better implementation --- pandas/core/groupby/ops.py | 2 +- .../groupby/methods/test_value_counts.py | 22 +++++++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index a82e77140d274..b32119a2ddbde 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -867,7 +867,7 @@ def _ob_index_and_ids( names=names, verify_integrity=False, ) - if not consistent_sorting: + if not consistent_sorting and len(ob_index) > 0: # Sort by the levels where the corresponding sort argument is True n_levels = len(sorts) drop_levels = [ diff --git a/pandas/tests/groupby/methods/test_value_counts.py b/pandas/tests/groupby/methods/test_value_counts.py index 8f3022fbe551c..8ca6593a19f20 100644 --- a/pandas/tests/groupby/methods/test_value_counts.py +++ b/pandas/tests/groupby/methods/test_value_counts.py @@ -1219,3 +1219,25 @@ def test_value_counts_sort_categorical(sort, vc_sort, normalize): expected = expected.take(taker) tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("groupby_sort", [True, False]) +def test_value_counts_all_na(sort, dropna, groupby_sort): + # GH#59989 + df = DataFrame({"a": [2, 1, 1], "b": np.nan}) + gb = df.groupby("a", sort=groupby_sort) + result = gb.value_counts(sort=sort, dropna=dropna) + + kwargs = {"levels": [[1, 2], [np.nan]], "names": ["a", "b"]} + if dropna: + data = [] + index = MultiIndex(codes=[[], []], **kwargs) + elif not groupby_sort and not sort: + data = [1, 2] + index = MultiIndex(codes=[[1, 0], [0, 0]], **kwargs) + else: + data = [2, 1] + index = MultiIndex(codes=[[0, 1], [0, 0]], **kwargs) + expected = Series(data, index=index, dtype="int64", name="count") + + tm.assert_series_equal(result, expected) From 5126dcaf88167ff869db874be40a520bb86a27ed Mon Sep 17 00:00:00 2001 From: Deepak Saldanha Date: Tue, 8 Oct 2024 08:05:04 +0530 Subject: [PATCH 127/224] Doc: Update docstring for `dummy_na` parameter (#60000) * update docstring for dummy_na parameter * Update pandas/core/reshape/encoding.py Co-authored-by: Matthew 
Roeschke <10647082+mroeschke@users.noreply.github.com> --------- Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/core/reshape/encoding.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/reshape/encoding.py b/pandas/core/reshape/encoding.py index c397c1c2566a5..33ff182f5baee 100644 --- a/pandas/core/reshape/encoding.py +++ b/pandas/core/reshape/encoding.py @@ -68,7 +68,8 @@ def get_dummies( If appending prefix, separator/delimiter to use. Or pass a list or dictionary as with `prefix`. dummy_na : bool, default False - Add a column to indicate NaNs, if False NaNs are ignored. + If True, a NaN indicator column will be added even if no NaN values are present. + If False, NA values are encoded as all zero. columns : list-like, default None Column names in the DataFrame to be encoded. If `columns` is None then all the columns with From 5ea5bd95d5bb93434fb5f1686f50b176c46dbac8 Mon Sep 17 00:00:00 2001 From: Randolf Scholz Date: Tue, 8 Oct 2024 20:36:41 +0200 Subject: [PATCH 128/224] Typing: More precise NaT stub (#60002) * more precise NaT stub * ruff format * updated == and != to return literal --- pandas/_libs/tslibs/nattype.pyi | 40 +++++++++++++++------------------ 1 file changed, 18 insertions(+), 22 deletions(-) diff --git a/pandas/_libs/tslibs/nattype.pyi b/pandas/_libs/tslibs/nattype.pyi index fa1577f033fff..d3b10fbe79cb9 100644 --- a/pandas/_libs/tslibs/nattype.pyi +++ b/pandas/_libs/tslibs/nattype.pyi @@ -25,12 +25,8 @@ NaT: NaTType iNaT: int nat_strings: set[str] -_NaTComparisonTypes: TypeAlias = ( - datetime | timedelta | Period | np.datetime64 | np.timedelta64 -) - -class _NatComparison: - def __call__(self, other: _NaTComparisonTypes) -> bool: ... +_TimeLike: TypeAlias = datetime | timedelta | Period | np.datetime64 | np.timedelta64 +_TimeDelta: TypeAlias = timedelta | np.timedelta64 class NaTType: _value: np.int64 @@ -161,30 +157,30 @@ class NaTType: @property def qyear(self) -> float: ... # comparisons - def __eq__(self, other: object) -> bool: ... - def __ne__(self, other: object) -> bool: ... - __lt__: _NatComparison - __le__: _NatComparison - __gt__: _NatComparison - __ge__: _NatComparison + def __eq__(self, other: object, /) -> Literal[False]: ... + def __ne__(self, other: object, /) -> Literal[True]: ... + def __lt__(self, other: Self | _TimeLike, /) -> Literal[False]: ... + def __le__(self, other: Self | _TimeLike, /) -> Literal[False]: ... + def __gt__(self, other: Self | _TimeLike, /) -> Literal[False]: ... + def __ge__(self, other: Self | _TimeLike, /) -> Literal[False]: ... # unary operators def __pos__(self) -> Self: ... def __neg__(self) -> Self: ... # binary operators - def __sub__(self, other: Self | timedelta | datetime) -> Self: ... - def __rsub__(self, other: Self | timedelta | datetime) -> Self: ... - def __add__(self, other: Self | timedelta | datetime) -> Self: ... - def __radd__(self, other: Self | timedelta | datetime) -> Self: ... - def __mul__(self, other: float) -> Self: ... # analogous to timedelta - def __rmul__(self, other: float) -> Self: ... + def __sub__(self, other: Self | _TimeLike, /) -> Self: ... + def __rsub__(self, other: Self | _TimeLike, /) -> Self: ... + def __add__(self, other: Self | _TimeLike, /) -> Self: ... + def __radd__(self, other: Self | _TimeLike, /) -> Self: ... + def __mul__(self, other: float, /) -> Self: ... # analogous to timedelta + def __rmul__(self, other: float, /) -> Self: ... 
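The division overloads that follow encode the timedelta-like arithmetic called out in the inline comments: scaling NaT by a number keeps NaT, while dividing NaT by a timedelta degrades to a float NaN. A runtime sketch of what the stub promises:

    import math

    import pandas as pd

    assert pd.NaT * 2 is pd.NaT                     # Self, matching __mul__
    assert math.isnan(pd.NaT / pd.Timedelta("1h"))  # float, matching __truediv__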
@overload # analogous to timedelta - def __truediv__(self, other: Self | timedelta) -> float: ... # Literal[NaN] + def __truediv__(self, other: Self | _TimeDelta, /) -> float: ... # Literal[NaN] @overload - def __truediv__(self, other: float) -> Self: ... + def __truediv__(self, other: float, /) -> Self: ... @overload # analogous to timedelta - def __floordiv__(self, other: Self | timedelta) -> float: ... # Literal[NaN] + def __floordiv__(self, other: Self | _TimeDelta, /) -> float: ... # Literal[NaN] @overload - def __floordiv__(self, other: float) -> Self: ... + def __floordiv__(self, other: float, /) -> Self: ... # other def __hash__(self) -> int: ... def as_unit(self, unit: str, round_ok: bool = ...) -> NaTType: ... From f94860e1ce75b57db9eda2c37154c5b22b661121 Mon Sep 17 00:00:00 2001 From: Deepak Saldanha Date: Wed, 9 Oct 2024 00:11:39 +0530 Subject: [PATCH 129/224] DOC: Refactor _create_delegator_method using functools (#59878) * add tag dt.to_timestamp, series.rst * add doc strings for dt.to_timestamp * update datetimes.py * refactor _create_delegator_method to use functools wrap * changes to accessor.py * remove from code_checks.sh * update code_checks.sh * update code_checks.sh * rewrite functools, adjust unit tests * update change log * remove dup entry * update code_checks.sh * update * revert all dt related changes * update series.rst * update imports * format use of functools import --- ci/code_checks.sh | 20 -------------------- pandas/core/accessor.py | 7 ++++--- pandas/core/arrays/categorical.py | 14 ++++++++++++-- 3 files changed, 16 insertions(+), 25 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 6a1b613eccb8b..6fb675069e81d 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -73,27 +73,7 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Period.freq GL08" \ -i "pandas.Period.ordinal GL08" \ -i "pandas.RangeIndex.from_range PR01,SA01" \ - -i "pandas.Series.cat.add_categories PR01,PR02" \ - -i "pandas.Series.cat.as_ordered PR01" \ - -i "pandas.Series.cat.as_unordered PR01" \ - -i "pandas.Series.cat.remove_categories PR01,PR02" \ - -i "pandas.Series.cat.remove_unused_categories PR01" \ - -i "pandas.Series.cat.rename_categories PR01,PR02" \ - -i "pandas.Series.cat.reorder_categories PR01,PR02" \ - -i "pandas.Series.cat.set_categories PR01,PR02" \ - -i "pandas.Series.dt.as_unit PR01,PR02" \ - -i "pandas.Series.dt.ceil PR01,PR02" \ - -i "pandas.Series.dt.day_name PR01,PR02" \ - -i "pandas.Series.dt.floor PR01,PR02" \ -i "pandas.Series.dt.freq GL08" \ - -i "pandas.Series.dt.month_name PR01,PR02" \ - -i "pandas.Series.dt.normalize PR01" \ - -i "pandas.Series.dt.round PR01,PR02" \ - -i "pandas.Series.dt.strftime PR01,PR02" \ - -i "pandas.Series.dt.to_period PR01,PR02" \ - -i "pandas.Series.dt.total_seconds PR01" \ - -i "pandas.Series.dt.tz_convert PR01,PR02" \ - -i "pandas.Series.dt.tz_localize PR01,PR02" \ -i "pandas.Series.dt.unit GL08" \ -i "pandas.Series.pad PR01,SA01" \ -i "pandas.Timedelta.max PR02" \ diff --git a/pandas/core/accessor.py b/pandas/core/accessor.py index d8463fda34caa..78684eacf2d66 100644 --- a/pandas/core/accessor.py +++ b/pandas/core/accessor.py @@ -7,6 +7,7 @@ from __future__ import annotations +import functools from typing import ( TYPE_CHECKING, final, @@ -117,12 +118,12 @@ def _setter(self, new_values): ) def _create_delegator_method(name: str): + method = getattr(delegate, accessor_mapping(name)) + + @functools.wraps(method) def f(self, *args, **kwargs): return self._delegate_method(name, *args, 
**kwargs) - f.__name__ = name - f.__doc__ = getattr(delegate, accessor_mapping(name)).__doc__ - return f for name in accessors: diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index a69e197df851d..0484ef89f61c2 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1155,6 +1155,12 @@ def rename_categories(self, new_categories) -> Self: """ Rename categories. + This method is commonly used to re-label or adjust the + category names in categorical data without changing the + underlying data. It is useful in situations where you want + to modify the labels used for clarity, consistency, + or readability. + Parameters ---------- new_categories : list-like, dict-like or callable @@ -1371,8 +1377,8 @@ def remove_categories(self, removals) -> Self: """ Remove the specified categories. - `removals` must be included in the old categories. Values which were in - the removed categories will be set to NaN + The ``removals`` argument must be a subset of the current categories. + Any values that were part of the removed categories will be set to NaN. Parameters ---------- @@ -1431,6 +1437,10 @@ def remove_unused_categories(self) -> Self: """ Remove categories which are not used. + This method is useful when working with datasets + that undergo dynamic changes where categories may no longer be + relevant, allowing to maintain a clean, efficient data structure. + Returns ------- Categorical From b975191afe1401f13ab5e15d3df83b5d95dffe75 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Tue, 8 Oct 2024 22:00:43 +0200 Subject: [PATCH 130/224] Fix Styler docstring (#60001) * Fix Styler docstring * Remove blankspaces --- pandas/io/formats/style.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index 6e5ae09485951..eb6773310da69 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -222,6 +222,7 @@ class Styler(StylerRenderer): * ``level`` where `k` is the level in a MultiIndex * Column label cells include + * ``col_heading`` * ``col`` where `n` is the numeric position of the column * ``level`` where `k` is the level in a MultiIndex @@ -231,7 +232,7 @@ class Styler(StylerRenderer): * Trimmed cells include ``col_trim`` or ``row_trim``. Any, or all, or these classes can be renamed by using the ``css_class_names`` - argument in ``Styler.set_table_classes``, giving a value such as + argument in ``Styler.set_table_styles``, giving a value such as *{"row": "MY_ROW_CLASS", "col_trim": "", "row_trim": ""}*. 
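As a rough illustration of that renaming hook (a sketch, not part of the patch; the mapping reuses the example values quoted above):

    import pandas as pd

    df = pd.DataFrame({"A": [1, 2], "B": [3, 4]})
    styler = df.style.set_table_styles(
        css_class_names={"row": "MY_ROW_CLASS", "col_trim": "", "row_trim": ""}
    )
    html = styler.to_html()  # data rows are classed MY_ROW_CLASS0, MY_ROW_CLASS1, ...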
Examples From a0f9140b942d9f596889cd26ac395551dcdf3afb Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 8 Oct 2024 14:51:49 -0700 Subject: [PATCH 131/224] [pre-commit.ci] pre-commit autoupdate (#59998) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * [pre-commit.ci] pre-commit autoupdate updates: - [github.com/astral-sh/ruff-pre-commit: v0.5.0 → v0.6.9](https://github.com/astral-sh/ruff-pre-commit/compare/v0.5.0...v0.6.9) - [github.com/jendrikseipp/vulture: v2.11 → v2.13](https://github.com/jendrikseipp/vulture/compare/v2.11...v2.13) - [github.com/pre-commit/pre-commit-hooks: v4.6.0 → v5.0.0](https://github.com/pre-commit/pre-commit-hooks/compare/v4.6.0...v5.0.0) - [github.com/asottile/pyupgrade: v3.16.0 → v3.17.0](https://github.com/asottile/pyupgrade/compare/v3.16.0...v3.17.0) - [github.com/sphinx-contrib/sphinx-lint: v0.9.1 → v1.0.0](https://github.com/sphinx-contrib/sphinx-lint/compare/v0.9.1...v1.0.0) - [github.com/pre-commit/mirrors-clang-format: v18.1.8 → v19.1.1](https://github.com/pre-commit/mirrors-clang-format/compare/v18.1.8...v19.1.1) * Update .pre-commit-config.yaml * fix style.ipynb, ignore some pylint * pyupgrade * Revert "pyupgrade" This reverts commit b539c71009ff15769c501cf170ed9894a49ddcfb. * don't bump pyupgrade * Typo in random call * Delete hidden cell * Undo max/min rule from ruff --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- .pre-commit-config.yaml | 10 +- doc/source/user_guide/style.ipynb | 689 ++++++++++++++------------ pandas/core/arrays/string_.py | 2 +- pandas/tests/indexes/test_old_base.py | 2 +- pyproject.toml | 9 +- 5 files changed, 396 insertions(+), 316 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f6717dd503c9b..7c9ebf7d94173 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -19,7 +19,7 @@ ci: skip: [pyright, mypy] repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.5.0 + rev: v0.6.9 hooks: - id: ruff args: [--exit-non-zero-on-fix] @@ -34,7 +34,7 @@ repos: - id: ruff-format exclude: ^scripts|^pandas/tests/frame/test_query_eval.py - repo: https://github.com/jendrikseipp/vulture - rev: 'v2.11' + rev: 'v2.13' hooks: - id: vulture entry: python scripts/run_vulture.py @@ -52,7 +52,7 @@ repos: - id: cython-lint - id: double-quote-cython-strings - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.6.0 + rev: v5.0.0 hooks: - id: check-case-conflict - id: check-toml @@ -90,12 +90,12 @@ repos: types: [text] # overwrite types: [rst] types_or: [python, rst] - repo: https://github.com/sphinx-contrib/sphinx-lint - rev: v0.9.1 + rev: v1.0.0 hooks: - id: sphinx-lint args: ["--enable", "all", "--disable", "line-too-long"] - repo: https://github.com/pre-commit/mirrors-clang-format - rev: v18.1.8 + rev: v19.1.1 hooks: - id: clang-format files: ^pandas/_libs/src|^pandas/_libs/include diff --git a/doc/source/user_guide/style.ipynb b/doc/source/user_guide/style.ipynb index daecfce6ecebc..abb7181fc8d72 100644 --- a/doc/source/user_guide/style.ipynb +++ b/doc/source/user_guide/style.ipynb @@ -38,19 +38,6 @@ "[concatfunc]: ../reference/api/pandas.io.formats.style.Styler.concat.rst" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "nbsphinx": "hidden" - }, - "outputs": [], - "source": [ - "import matplotlib.pyplot\n", - "# 
We have this here to trigger matplotlib's font cache stuff.\n", - "# This cell is hidden from the output" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -78,17 +65,13 @@ "source": [ "import pandas as pd\n", "import numpy as np\n", - "import matplotlib as mpl\n", "\n", - "df = pd.DataFrame({\n", - " \"strings\": [\"Adam\", \"Mike\"],\n", - " \"ints\": [1, 3],\n", - " \"floats\": [1.123, 1000.23]\n", - "})\n", - "df.style \\\n", - " .format(precision=3, thousands=\".\", decimal=\",\") \\\n", - " .format_index(str.upper, axis=1) \\\n", - " .relabel_index([\"row 1\", \"row 2\"], axis=0)" + "df = pd.DataFrame(\n", + " {\"strings\": [\"Adam\", \"Mike\"], \"ints\": [1, 3], \"floats\": [1.123, 1000.23]}\n", + ")\n", + "df.style.format(precision=3, thousands=\".\", decimal=\",\").format_index(\n", + " str.upper, axis=1\n", + ").relabel_index([\"row 1\", \"row 2\"], axis=0)" ] }, { @@ -104,17 +87,21 @@ "metadata": {}, "outputs": [], "source": [ - "weather_df = pd.DataFrame(np.random.rand(10,2)*5, \n", - " index=pd.date_range(start=\"2021-01-01\", periods=10),\n", - " columns=[\"Tokyo\", \"Beijing\"])\n", + "weather_df = pd.DataFrame(\n", + " np.random.default_rng().random((10, 2)) * 5,\n", + " index=pd.date_range(start=\"2021-01-01\", periods=10),\n", + " columns=[\"Tokyo\", \"Beijing\"],\n", + ")\n", + "\n", "\n", - "def rain_condition(v): \n", + "def rain_condition(v):\n", " if v < 1.75:\n", " return \"Dry\"\n", " elif v < 2.75:\n", " return \"Rain\"\n", " return \"Heavy Rain\"\n", "\n", + "\n", "def make_pretty(styler):\n", " styler.set_caption(\"Weather Conditions\")\n", " styler.format(rain_condition)\n", @@ -122,6 +109,7 @@ " styler.background_gradient(axis=None, vmin=1, vmax=5, cmap=\"YlGnBu\")\n", " return styler\n", "\n", + "\n", "weather_df" ] }, @@ -157,10 +145,8 @@ "metadata": {}, "outputs": [], "source": [ - "df = pd.DataFrame(np.random.randn(5, 5))\n", - "df.style \\\n", - " .hide(subset=[0, 2, 4], axis=0) \\\n", - " .hide(subset=[0, 2, 4], axis=1)" + "df = pd.DataFrame(np.random.default_rng().standard_normal((5, 5)))\n", + "df.style.hide(subset=[0, 2, 4], axis=0).hide(subset=[0, 2, 4], axis=1)" ] }, { @@ -177,9 +163,9 @@ "outputs": [], "source": [ "show = [0, 2, 4]\n", - "df.style \\\n", - " .hide([row for row in df.index if row not in show], axis=0) \\\n", - " .hide([col for col in df.columns if col not in show], axis=1)" + "df.style.hide([row for row in df.index if row not in show], axis=0).hide(\n", + " [col for col in df.columns if col not in show], axis=1\n", + ")" ] }, { @@ -199,9 +185,9 @@ "metadata": {}, "outputs": [], "source": [ - "summary_styler = df.agg([\"sum\", \"mean\"]).style \\\n", - " .format(precision=3) \\\n", - " .relabel_index([\"Sum\", \"Average\"])\n", + "summary_styler = (\n", + " df.agg([\"sum\", \"mean\"]).style.format(precision=3).relabel_index([\"Sum\", \"Average\"])\n", + ")\n", "df.style.format(precision=1).concat(summary_styler)" ] }, @@ -227,9 +213,16 @@ "metadata": {}, "outputs": [], "source": [ - "df = pd.DataFrame([[38.0, 2.0, 18.0, 22.0, 21, np.nan],[19, 439, 6, 452, 226,232]], \n", - " index=pd.Index(['Tumour (Positive)', 'Non-Tumour (Negative)'], name='Actual Label:'), \n", - " columns=pd.MultiIndex.from_product([['Decision Tree', 'Regression', 'Random'],['Tumour', 'Non-Tumour']], names=['Model:', 'Predicted:']))\n", + "idx = pd.Index([\"Tumour (Positive)\", \"Non-Tumour (Negative)\"], name=\"Actual Label:\")\n", + "cols = pd.MultiIndex.from_product(\n", + " [[\"Decision Tree\", \"Regression\", \"Random\"], [\"Tumour\", 
\"Non-Tumour\"]],\n", + " names=[\"Model:\", \"Predicted:\"],\n", + ")\n", + "df = pd.DataFrame(\n", + " [[38.0, 2.0, 18.0, 22.0, 21, np.nan], [19, 439, 6, 452, 226, 232]],\n", + " index=idx,\n", + " columns=cols,\n", + ")\n", "df.style" ] }, @@ -242,63 +235,68 @@ "outputs": [], "source": [ "# Hidden cell to just create the below example: code is covered throughout the guide.\n", - "s = df.style\\\n", - " .hide([('Random', 'Tumour'), ('Random', 'Non-Tumour')], axis='columns')\\\n", - " .format('{:.0f}')\\\n", - " .set_table_styles([{\n", - " 'selector': '',\n", - " 'props': 'border-collapse: separate;'\n", - " },{\n", - " 'selector': 'caption',\n", - " 'props': 'caption-side: bottom; font-size:1.3em;'\n", - " },{\n", - " 'selector': '.index_name',\n", - " 'props': 'font-style: italic; color: darkgrey; font-weight:normal;'\n", - " },{\n", - " 'selector': 'th:not(.index_name)',\n", - " 'props': 'background-color: #000066; color: white;'\n", - " },{\n", - " 'selector': 'th.col_heading',\n", - " 'props': 'text-align: center;'\n", - " },{\n", - " 'selector': 'th.col_heading.level0',\n", - " 'props': 'font-size: 1.5em;'\n", - " },{\n", - " 'selector': 'th.col2',\n", - " 'props': 'border-left: 1px solid white;'\n", - " },{\n", - " 'selector': '.col2',\n", - " 'props': 'border-left: 1px solid #000066;'\n", - " },{\n", - " 'selector': 'td',\n", - " 'props': 'text-align: center; font-weight:bold;'\n", - " },{\n", - " 'selector': '.true',\n", - " 'props': 'background-color: #e6ffe6;'\n", - " },{\n", - " 'selector': '.false',\n", - " 'props': 'background-color: #ffe6e6;'\n", - " },{\n", - " 'selector': '.border-red',\n", - " 'props': 'border: 2px dashed red;'\n", - " },{\n", - " 'selector': '.border-green',\n", - " 'props': 'border: 2px dashed green;'\n", - " },{\n", - " 'selector': 'td:hover',\n", - " 'props': 'background-color: #ffffb3;'\n", - " }])\\\n", - " .set_td_classes(pd.DataFrame([['true border-green', 'false', 'true', 'false border-red', '', ''],\n", - " ['false', 'true', 'false', 'true', '', '']], \n", - " index=df.index, columns=df.columns))\\\n", - " .set_caption(\"Confusion matrix for multiple cancer prediction models.\")\\\n", - " .set_tooltips(pd.DataFrame([['This model has a very strong true positive rate', '', '', \"This model's total number of false negatives is too high\", '', ''],\n", - " ['', '', '', '', '', '']], \n", - " index=df.index, columns=df.columns),\n", - " css_class='pd-tt', props=\n", - " 'visibility: hidden; position: absolute; z-index: 1; border: 1px solid #000066;'\n", - " 'background-color: white; color: #000066; font-size: 0.8em;' \n", - " 'transform: translate(0px, -24px); padding: 0.6em; border-radius: 0.5em;')\n" + "s = (\n", + " df.style.hide([(\"Random\", \"Tumour\"), (\"Random\", \"Non-Tumour\")], axis=\"columns\")\n", + " .format(\"{:.0f}\")\n", + " .set_table_styles(\n", + " [\n", + " {\"selector\": \"\", \"props\": \"border-collapse: separate;\"},\n", + " {\"selector\": \"caption\", \"props\": \"caption-side: bottom; font-size:1.3em;\"},\n", + " {\n", + " \"selector\": \".index_name\",\n", + " \"props\": \"font-style: italic; color: darkgrey; font-weight:normal;\",\n", + " },\n", + " {\n", + " \"selector\": \"th:not(.index_name)\",\n", + " \"props\": \"background-color: #000066; color: white;\",\n", + " },\n", + " {\"selector\": \"th.col_heading\", \"props\": \"text-align: center;\"},\n", + " {\"selector\": \"th.col_heading.level0\", \"props\": \"font-size: 1.5em;\"},\n", + " {\"selector\": \"th.col2\", \"props\": \"border-left: 1px solid white;\"},\n", 
+ " {\"selector\": \".col2\", \"props\": \"border-left: 1px solid #000066;\"},\n", + " {\"selector\": \"td\", \"props\": \"text-align: center; font-weight:bold;\"},\n", + " {\"selector\": \".true\", \"props\": \"background-color: #e6ffe6;\"},\n", + " {\"selector\": \".false\", \"props\": \"background-color: #ffe6e6;\"},\n", + " {\"selector\": \".border-red\", \"props\": \"border: 2px dashed red;\"},\n", + " {\"selector\": \".border-green\", \"props\": \"border: 2px dashed green;\"},\n", + " {\"selector\": \"td:hover\", \"props\": \"background-color: #ffffb3;\"},\n", + " ]\n", + " )\n", + " .set_td_classes(\n", + " pd.DataFrame(\n", + " [\n", + " [\"true border-green\", \"false\", \"true\", \"false border-red\", \"\", \"\"],\n", + " [\"false\", \"true\", \"false\", \"true\", \"\", \"\"],\n", + " ],\n", + " index=df.index,\n", + " columns=df.columns,\n", + " )\n", + " )\n", + " .set_caption(\"Confusion matrix for multiple cancer prediction models.\")\n", + " .set_tooltips(\n", + " pd.DataFrame(\n", + " [\n", + " [\n", + " \"This model has a very strong true positive rate\",\n", + " \"\",\n", + " \"\",\n", + " \"This model's total number of false negatives is too high\",\n", + " \"\",\n", + " \"\",\n", + " ],\n", + " [\"\", \"\", \"\", \"\", \"\", \"\"],\n", + " ],\n", + " index=df.index,\n", + " columns=df.columns,\n", + " ),\n", + " css_class=\"pd-tt\",\n", + " props=\"visibility: hidden; \"\n", + " \"position: absolute; z-index: 1; \"\n", + " \"border: 1px solid #000066;\"\n", + " \"background-color: white; color: #000066; font-size: 0.8em;\"\n", + " \"transform: translate(0px, -24px); padding: 0.6em; border-radius: 0.5em;\",\n", + " )\n", + ")" ] }, { @@ -325,7 +323,9 @@ "metadata": {}, "outputs": [], "source": [ - "s = df.style.format('{:.0f}').hide([('Random', 'Tumour'), ('Random', 'Non-Tumour')], axis=\"columns\")\n", + "s = df.style.format(\"{:.0f}\").hide(\n", + " [(\"Random\", \"Tumour\"), (\"Random\", \"Non-Tumour\")], axis=\"columns\"\n", + ")\n", "s" ] }, @@ -337,8 +337,8 @@ }, "outputs": [], "source": [ - "# Hidden cell to avoid CSS clashes and latter code upcoding previous formatting \n", - "s.set_uuid('after_hide')" + "# Hidden cell to avoid CSS clashes and latter code upcoding previous formatting\n", + "s.set_uuid(\"after_hide\")" ] }, { @@ -395,16 +395,16 @@ "outputs": [], "source": [ "cell_hover = { # for row hover use instead of \n", - " 'selector': 'td:hover',\n", - " 'props': [('background-color', '#ffffb3')]\n", + " \"selector\": \"td:hover\",\n", + " \"props\": [(\"background-color\", \"#ffffb3\")],\n", "}\n", "index_names = {\n", - " 'selector': '.index_name',\n", - " 'props': 'font-style: italic; color: darkgrey; font-weight:normal;'\n", + " \"selector\": \".index_name\",\n", + " \"props\": \"font-style: italic; color: darkgrey; font-weight:normal;\",\n", "}\n", "headers = {\n", - " 'selector': 'th:not(.index_name)',\n", - " 'props': 'background-color: #000066; color: white;'\n", + " \"selector\": \"th:not(.index_name)\",\n", + " \"props\": \"background-color: #000066; color: white;\",\n", "}\n", "s.set_table_styles([cell_hover, index_names, headers])" ] @@ -417,8 +417,8 @@ }, "outputs": [], "source": [ - "# Hidden cell to avoid CSS clashes and latter code upcoding previous formatting \n", - "s.set_uuid('after_tab_styles1')" + "# Hidden cell to avoid CSS clashes and latter code upcoding previous formatting\n", + "s.set_uuid(\"after_tab_styles1\")" ] }, { @@ -434,11 +434,14 @@ "metadata": {}, "outputs": [], "source": [ - "s.set_table_styles([\n", - " {'selector': 
'th.col_heading', 'props': 'text-align: center;'},\n", - " {'selector': 'th.col_heading.level0', 'props': 'font-size: 1.5em;'},\n", - " {'selector': 'td', 'props': 'text-align: center; font-weight: bold;'},\n", - "], overwrite=False)" + "s.set_table_styles(\n", + " [\n", + " {\"selector\": \"th.col_heading\", \"props\": \"text-align: center;\"},\n", + " {\"selector\": \"th.col_heading.level0\", \"props\": \"font-size: 1.5em;\"},\n", + " {\"selector\": \"td\", \"props\": \"text-align: center; font-weight: bold;\"},\n", + " ],\n", + " overwrite=False,\n", + ")" ] }, { @@ -449,8 +452,8 @@ }, "outputs": [], "source": [ - "# Hidden cell to avoid CSS clashes and latter code upcoding previous formatting \n", - "s.set_uuid('after_tab_styles2')" + "# Hidden cell to avoid CSS clashes and latter code upcoding previous formatting\n", + "s.set_uuid(\"after_tab_styles2\")" ] }, { @@ -468,10 +471,16 @@ "metadata": {}, "outputs": [], "source": [ - "s.set_table_styles({\n", - " ('Regression', 'Tumour'): [{'selector': 'th', 'props': 'border-left: 1px solid white'},\n", - " {'selector': 'td', 'props': 'border-left: 1px solid #000066'}]\n", - "}, overwrite=False, axis=0)" + "s.set_table_styles(\n", + " {\n", + " (\"Regression\", \"Tumour\"): [\n", + " {\"selector\": \"th\", \"props\": \"border-left: 1px solid white\"},\n", + " {\"selector\": \"td\", \"props\": \"border-left: 1px solid #000066\"},\n", + " ]\n", + " },\n", + " overwrite=False,\n", + " axis=0,\n", + ")" ] }, { @@ -482,8 +491,8 @@ }, "outputs": [], "source": [ - "# Hidden cell to avoid CSS clashes and latter code upcoding previous formatting \n", - "s.set_uuid('xyz01')" + "# Hidden cell to avoid CSS clashes and latter code upcoding previous formatting\n", + "s.set_uuid(\"xyz01\")" ] }, { @@ -508,7 +517,7 @@ "outputs": [], "source": [ "out = s.set_table_attributes('class=\"my-table-cls\"').to_html()\n", - "print(out[out.find('