From e564642775afafa8247c0e16205141719625fc4b Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 11 Nov 2024 20:42:28 +0100 Subject: [PATCH 1/2] ENH (string dtype): convert string_view columns to future string dtype instead of object dtype in Parquet/Feather IO (#60235) (cherry picked from commit f307a0a3615d93c2177f6581133bdb541e12a93c) --- pandas/compat/pyarrow.py | 2 ++ pandas/io/_util.py | 9 +++++++-- pandas/tests/io/test_feather.py | 20 ++++++++++++++++++++ 3 files changed, 29 insertions(+), 2 deletions(-) diff --git a/pandas/compat/pyarrow.py b/pandas/compat/pyarrow.py index 7fa197c4a9824..f579b8a45d386 100644 --- a/pandas/compat/pyarrow.py +++ b/pandas/compat/pyarrow.py @@ -17,6 +17,7 @@ pa_version_under15p0 = _palv < Version("15.0.0") pa_version_under16p0 = _palv < Version("16.0.0") pa_version_under17p0 = _palv < Version("17.0.0") + pa_version_under18p0 = _palv < Version("18.0.0") HAS_PYARROW = True except ImportError: pa_version_under10p1 = True @@ -28,4 +29,5 @@ pa_version_under15p0 = True pa_version_under16p0 = True pa_version_under17p0 = True + pa_version_under18p0 = True HAS_PYARROW = False diff --git a/pandas/io/_util.py b/pandas/io/_util.py index 50a97f1059b5c..f3e6dba1391be 100644 --- a/pandas/io/_util.py +++ b/pandas/io/_util.py @@ -4,6 +4,7 @@ import numpy as np +from pandas.compat import pa_version_under18p0 from pandas.compat._optional import import_optional_dependency import pandas as pd @@ -32,7 +33,11 @@ def _arrow_dtype_mapping() -> dict: def arrow_string_types_mapper() -> Callable: pa = import_optional_dependency("pyarrow") - return { + mapping = { pa.string(): pd.StringDtype(na_value=np.nan), pa.large_string(): pd.StringDtype(na_value=np.nan), - }.get + } + if not pa_version_under18p0: + mapping[pa.string_view()] = pd.StringDtype(na_value=np.nan) + + return mapping.get diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index 3b4484e44e155..58a5f78ce3258 100644 --- a/pandas/tests/io/test_feather.py 
+++ b/pandas/tests/io/test_feather.py @@ -2,6 +2,8 @@ import numpy as np import pytest +from pandas.compat.pyarrow import pa_version_under18p0 + import pandas as pd import pandas._testing as tm @@ -250,3 +252,21 @@ def test_string_inference(self, tmp_path): data={"a": ["x", "y"]}, dtype=pd.StringDtype(na_value=np.nan) ) tm.assert_frame_equal(result, expected) + + @pytest.mark.skipif(pa_version_under18p0, reason="not supported before 18.0") + def test_string_inference_string_view_type(self, tmp_path): + # GH#54798 + import pyarrow as pa + from pyarrow import feather + + path = tmp_path / "string_view.parquet" + table = pa.table({"a": pa.array([None, "b", "c"], pa.string_view())}) + feather.write_feather(table, path) + + with pd.option_context("future.infer_string", True): + result = read_feather(path) + + expected = pd.DataFrame( + data={"a": [None, "b", "c"]}, dtype=pd.StringDtype(na_value=np.nan) + ) + tm.assert_frame_equal(result, expected) From 681f02571498fa1a67d7a3bc6a2beda9d43313cb Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 13 Nov 2024 09:30:04 +0100 Subject: [PATCH 2/2] fix import --- pandas/compat/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 38fb0188df5ff..5e82853109015 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -33,6 +33,7 @@ pa_version_under14p1, pa_version_under16p0, pa_version_under17p0, + pa_version_under18p0, ) if TYPE_CHECKING: @@ -191,6 +192,7 @@ def get_bz2_file() -> type[pandas.compat.compressors.BZ2File]: "pa_version_under14p1", "pa_version_under16p0", "pa_version_under17p0", + "pa_version_under18p0", "HAS_PYARROW", "IS64", "ISMUSL",