diff --git a/ci/deps/actions-38-db.yaml b/ci/deps/actions-38-db.yaml index 421eeb7a9c2d0..b4495fa6887f4 100644 --- a/ci/deps/actions-38-db.yaml +++ b/ci/deps/actions-38-db.yaml @@ -15,7 +15,7 @@ dependencies: - beautifulsoup4 - botocore>=1.11 - dask - - fastparquet>=0.4.0, < 0.7.0 + - fastparquet>=0.4.0 - fsspec>=0.7.4, <2021.6.0 - gcsfs>=0.6.0 - geopandas diff --git a/ci/deps/azure-windows-38.yaml b/ci/deps/azure-windows-38.yaml index 2863dc82ded32..c56496bce7d6c 100644 --- a/ci/deps/azure-windows-38.yaml +++ b/ci/deps/azure-windows-38.yaml @@ -15,7 +15,7 @@ dependencies: # pandas dependencies - blosc - bottleneck - - fastparquet>=0.4.0, <0.7.0 + - fastparquet>=0.4.0 - flask - fsspec>=0.8.0, <2021.6.0 - matplotlib=3.3.2 diff --git a/doc/source/whatsnew/v1.3.2.rst b/doc/source/whatsnew/v1.3.2.rst index e68f5d9d6bc56..f54cea744f4d2 100644 --- a/doc/source/whatsnew/v1.3.2.rst +++ b/doc/source/whatsnew/v1.3.2.rst @@ -44,7 +44,7 @@ Bug fixes Other ~~~~~ -- +- :meth:`pandas.read_parquet` now supports reading nullable dtypes with ``fastparquet`` versions above 0.7.1. - .. 
--------------------------------------------------------------------------- diff --git a/environment.yml b/environment.yml index 32b724900510f..ff833be999fcb 100644 --- a/environment.yml +++ b/environment.yml @@ -99,7 +99,7 @@ dependencies: - xlwt - odfpy - - fastparquet>=0.4.0, <0.7.0 # pandas.read_parquet, DataFrame.to_parquet + - fastparquet>=0.4.0 # pandas.read_parquet, DataFrame.to_parquet - pyarrow>=0.17.0 # pandas.read_parquet, DataFrame.to_parquet, pandas.read_feather, DataFrame.to_feather - python-snappy # required by pyarrow diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index b7523fada07d0..f0aeeb3e6c893 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -309,14 +309,21 @@ def write( def read( self, path, columns=None, storage_options: StorageOptions = None, **kwargs ): + parquet_kwargs = {} use_nullable_dtypes = kwargs.pop("use_nullable_dtypes", False) - if use_nullable_dtypes: - raise ValueError( - "The 'use_nullable_dtypes' argument is not supported for the " - "fastparquet engine" - ) + # Technically works with 0.7.0, but was incorrect + # so let's just require 0.7.1 + if Version(self.api.__version__) >= Version("0.7.1"): + # Need to set even for use_nullable_dtypes = False, + # since our defaults differ + parquet_kwargs["pandas_nulls"] = use_nullable_dtypes + else: + if use_nullable_dtypes: + raise ValueError( + "The 'use_nullable_dtypes' argument is not supported for the " + "fastparquet engine for fastparquet versions less than 0.7.1" + ) path = stringify_path(path) - parquet_kwargs = {} handles = None if is_fsspec_url(path): fsspec = import_optional_dependency("fsspec") @@ -337,6 +344,7 @@ def read( path, "rb", is_text=False, storage_options=storage_options ) path = handles.handle + parquet_file = self.api.ParquetFile(path, **parquet_kwargs) result = parquet_file.to_pandas(columns=columns, **kwargs) @@ -470,7 +478,7 @@ def read_parquet( use_nullable_dtypes : bool, default False If True, use dtypes that use ``pd.NA`` as 
missing value indicator - for the resulting DataFrame (only applicable for ``engine="pyarrow"``). + for the resulting DataFrame. As new dtypes are added that support ``pd.NA`` in the future, the output with this option will change to use those dtypes. Note: this is an experimental option, and behaviour (e.g. additional @@ -478,6 +486,10 @@ def read_parquet( .. versionadded:: 1.2.0 + .. versionchanged:: 1.3.2 + ``use_nullable_dtypes`` now works with the ``fastparquet`` engine + if ``fastparquet`` is version 0.7.1 or higher. + **kwargs Any additional kwargs are passed to the engine. diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 58aef2f2844df..b1f7f15dfa99a 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -596,6 +596,46 @@ def test_write_column_index_nonstring(self, pa): msg = r"parquet must have string column names" self.check_error_on_write(df, engine, ValueError, msg) + def test_use_nullable_dtypes(self, engine): + import pyarrow.parquet as pq + + if engine == "fastparquet": + pytest.importorskip( + "fastparquet", + "0.7.1", + reason="fastparquet must be 0.7.1 or higher for nullable dtype support", + ) + + table = pyarrow.table( + { + "a": pyarrow.array([1, 2, 3, None], "int64"), + "b": pyarrow.array([1, 2, 3, None], "uint8"), + "c": pyarrow.array(["a", "b", "c", None]), + "d": pyarrow.array([True, False, True, None]), + } + ) + with tm.ensure_clean() as path: + # write manually with pyarrow to write integers + pq.write_table(table, path) + result1 = read_parquet(path, engine=engine) + result2 = read_parquet(path, engine=engine, use_nullable_dtypes=True) + + assert result1["a"].dtype == np.dtype("float64") + expected = pd.DataFrame( + { + "a": pd.array([1, 2, 3, None], dtype="Int64"), + "b": pd.array([1, 2, 3, None], dtype="UInt8"), + "c": pd.array(["a", "b", "c", None], dtype="string"), + "d": pd.array([True, False, True, None], dtype="boolean"), + } + ) + if engine == "fastparquet": + 
# Fastparquet doesn't support string columns yet + # Only int and boolean + result2 = result2.drop("c", axis=1) + expected = expected.drop("c", axis=1) + tm.assert_frame_equal(result2, expected) + @pytest.mark.filterwarnings("ignore:CategoricalBlock is deprecated:DeprecationWarning") class TestParquetPyArrow(Base): @@ -842,35 +882,6 @@ def test_additional_extension_types(self, pa): ) check_round_trip(df, pa) - @td.skip_if_no("pyarrow") - def test_use_nullable_dtypes(self, pa): - import pyarrow.parquet as pq - - table = pyarrow.table( - { - "a": pyarrow.array([1, 2, 3, None], "int64"), - "b": pyarrow.array([1, 2, 3, None], "uint8"), - "c": pyarrow.array(["a", "b", "c", None]), - "d": pyarrow.array([True, False, True, None]), - } - ) - with tm.ensure_clean() as path: - # write manually with pyarrow to write integers - pq.write_table(table, path) - result1 = read_parquet(path) - result2 = read_parquet(path, use_nullable_dtypes=True) - - assert result1["a"].dtype == np.dtype("float64") - expected = pd.DataFrame( - { - "a": pd.array([1, 2, 3, None], dtype="Int64"), - "b": pd.array([1, 2, 3, None], dtype="UInt8"), - "c": pd.array(["a", "b", "c", None], dtype="string"), - "d": pd.array([True, False, True, None], dtype="boolean"), - } - ) - tm.assert_frame_equal(result2, expected) - def test_timestamp_nanoseconds(self, pa): # with version 2.0, pyarrow defaults to writing the nanoseconds, so # this should work without error @@ -941,7 +952,9 @@ def test_duplicate_columns(self, fp): def test_bool_with_none(self, fp): df = pd.DataFrame({"a": [True, None, False]}) expected = pd.DataFrame({"a": [1.0, np.nan, 0.0]}, dtype="float16") - check_round_trip(df, fp, expected=expected) + # Fastparquet bug in 0.7.1 makes it so that this dtype becomes + # float64 + check_round_trip(df, fp, expected=expected, check_dtype=False) def test_unsupported(self, fp): @@ -1062,9 +1075,14 @@ def test_timezone_aware_index(self, fp, timezone_aware_date_list): expected.index.name = "index" 
check_round_trip(df, fp, expected=expected) - def test_use_nullable_dtypes_not_supported(self, fp): + def test_use_nullable_dtypes_not_supported(self, monkeypatch, fp): df = pd.DataFrame({"a": [1, 2]}) + # This is supported now in fastparquet 0.7.1 and above actually + # Still need to ensure that this raises in all versions below + import fastparquet as fp + + monkeypatch.setattr(fp, "__version__", "0.4") with tm.ensure_clean() as path: df.to_parquet(path) with pytest.raises(ValueError, match="not supported for the fastparquet"): diff --git a/requirements-dev.txt b/requirements-dev.txt index 49e966cc3a1cf..86f04ed77c985 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -64,7 +64,7 @@ xlrd xlsxwriter xlwt odfpy -fastparquet>=0.4.0, <0.7.0 +fastparquet>=0.4.0 pyarrow>=0.17.0 python-snappy tables>=3.6.1