diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 3d3ec53948a01..49f9abd99db53 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -354,6 +354,7 @@ of columns could result in a larger Series result. See (:issue:`37799`). *New behavior*: .. ipython:: python + :okwarning: In [5]: df.all(bool_only=True) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 790f52bdcc7ad..851cf201cae6a 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -280,6 +280,7 @@ Other Deprecations - Deprecated the behavior of :meth:`Series.fillna` and :meth:`DataFrame.fillna` with ``timedelta64[ns]`` dtype and incompatible fill value; in a future version this will cast to a common dtype (usually object) instead of raising, matching the behavior of other dtypes (:issue:`45746`) - Deprecated the ``warn`` parameter in :func:`infer_freq` (:issue:`45947`) - Deprecated allowing non-keyword arguments in :meth:`ExtensionArray.argsort` (:issue:`46134`) +- Deprecated treating all-bool ``object``-dtype columns as bool-like in :meth:`DataFrame.any` and :meth:`DataFrame.all` with ``bool_only=True``; explicitly cast to bool dtype instead (:issue:`46188`) - .. 
--------------------------------------------------------------------------- diff --git a/pandas/core/dtypes/inference.py b/pandas/core/dtypes/inference.py index 1f1486b1b29a7..f47aeb16e19f1 100644 --- a/pandas/core/dtypes/inference.py +++ b/pandas/core/dtypes/inference.py @@ -4,11 +4,13 @@ from numbers import Number import re from typing import Pattern +import warnings import numpy as np from pandas._libs import lib from pandas._typing import ArrayLike +from pandas.util._exceptions import find_stack_level is_bool = lib.is_bool @@ -447,5 +449,16 @@ def is_inferred_bool_dtype(arr: ArrayLike) -> bool: if dtype == np.dtype(bool): return True elif dtype == np.dtype("object"): - return lib.is_bool_array(arr) + result = lib.is_bool_array(arr) + if result: + # GH#46188 + warnings.warn( + "In a future version, object-dtype columns with all-bool values " + "will not be included in reductions with bool_only=True. " + "Explicitly cast to bool dtype instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + return result + return False diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index c727a02cb5061..cc0195bf1dff9 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -1219,6 +1219,8 @@ def test_any_all_object(self): assert result is False def test_any_all_object_bool_only(self): + msg = "object-dtype columns with all-bool values" + df = DataFrame({"A": ["foo", 2], "B": [True, False]}).astype(object) df._consolidate_inplace() df["C"] = Series([True, True]) @@ -1228,29 +1230,36 @@ def test_any_all_object_bool_only(self): # The underlying bug is in DataFrame._get_bool_data, so we check # that while we're here - res = df._get_bool_data() + with tm.assert_produces_warning(FutureWarning, match=msg): + res = df._get_bool_data() expected = df[["B", "C"]] tm.assert_frame_equal(res, expected) - res = df.all(bool_only=True, axis=0) + with tm.assert_produces_warning(FutureWarning, match=msg): 
+ res = df.all(bool_only=True, axis=0) expected = Series([False, True], index=["B", "C"]) tm.assert_series_equal(res, expected) # operating on a subset of columns should not produce a _larger_ Series - res = df[["B", "C"]].all(bool_only=True, axis=0) + with tm.assert_produces_warning(FutureWarning, match=msg): + res = df[["B", "C"]].all(bool_only=True, axis=0) tm.assert_series_equal(res, expected) - assert not df.all(bool_only=True, axis=None) + with tm.assert_produces_warning(FutureWarning, match=msg): + assert not df.all(bool_only=True, axis=None) - res = df.any(bool_only=True, axis=0) + with tm.assert_produces_warning(FutureWarning, match=msg): + res = df.any(bool_only=True, axis=0) expected = Series([True, True], index=["B", "C"]) tm.assert_series_equal(res, expected) # operating on a subset of columns should not produce a _larger_ Series - res = df[["B", "C"]].any(bool_only=True, axis=0) + with tm.assert_produces_warning(FutureWarning, match=msg): + res = df[["B", "C"]].any(bool_only=True, axis=0) tm.assert_series_equal(res, expected) - assert df.any(bool_only=True, axis=None) + with tm.assert_produces_warning(FutureWarning, match=msg): + assert df.any(bool_only=True, axis=None) @pytest.mark.parametrize("method", ["any", "all"]) def test_any_all_level_axis_none_raises(self, method): diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index 3113aa1ad8452..f4060c84f533a 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -784,6 +784,7 @@ def test_get_numeric_data(self): ) def test_get_bool_data(self): + msg = "object-dtype columns with all-bool values" mgr = create_mgr( "int: int; float: float; complex: complex;" "str: object; bool: bool; obj: object; dt: datetime", @@ -791,7 +792,8 @@ def test_get_bool_data(self): ) mgr.iset(6, np.array([True, False, True], dtype=np.object_)) - bools = mgr.get_bool_data() + with tm.assert_produces_warning(FutureWarning, match=msg): 
+ bools = mgr.get_bool_data() tm.assert_index_equal(bools.items, Index(["bool", "dt"])) tm.assert_almost_equal( mgr.iget(mgr.items.get_loc("bool")).internal_values(), @@ -805,7 +807,8 @@ def test_get_bool_data(self): ) # Check sharing - bools2 = mgr.get_bool_data(copy=True) + with tm.assert_produces_warning(FutureWarning, match=msg): + bools2 = mgr.get_bool_data(copy=True) bools2.iset(0, np.array([False, True, False])) tm.assert_numpy_array_equal( mgr.iget(mgr.items.get_loc("bool")).internal_values(),