Skip to content

Commit

Permalink
COMPAT: Support fastparquet 0.7.1 (pandas-dev#42919)
Browse files Browse the repository at this point in the history
  • Loading branch information
lithomas1 authored Aug 8, 2021
1 parent 08d296f commit e042219
Show file tree
Hide file tree
Showing 7 changed files with 73 additions and 43 deletions.
2 changes: 1 addition & 1 deletion ci/deps/actions-38-db.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ dependencies:
- beautifulsoup4
- botocore>=1.11
- dask
- fastparquet>=0.4.0, < 0.7.0
- fastparquet>=0.4.0
- fsspec>=0.7.4, <2021.6.0
- gcsfs>=0.6.0
- geopandas
Expand Down
2 changes: 1 addition & 1 deletion ci/deps/azure-windows-38.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ dependencies:
# pandas dependencies
- blosc
- bottleneck
- fastparquet>=0.4.0, <0.7.0
- fastparquet>=0.4.0
- flask
- fsspec>=0.8.0, <2021.6.0
- matplotlib=3.3.2
Expand Down
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v1.3.2.rst
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ Bug fixes

Other
~~~~~
-
- :meth:`pandas.read_parquet` now supports reading nullable dtypes with ``fastparquet`` versions above 0.7.1.
-

.. ---------------------------------------------------------------------------
Expand Down
2 changes: 1 addition & 1 deletion environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ dependencies:
- xlwt
- odfpy

- fastparquet>=0.4.0, <0.7.0 # pandas.read_parquet, DataFrame.to_parquet
- fastparquet>=0.4.0 # pandas.read_parquet, DataFrame.to_parquet
- pyarrow>=0.17.0 # pandas.read_parquet, DataFrame.to_parquet, pandas.read_feather, DataFrame.to_feather
- python-snappy # required by pyarrow

Expand Down
26 changes: 19 additions & 7 deletions pandas/io/parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -309,14 +309,21 @@ def write(
def read(
self, path, columns=None, storage_options: StorageOptions = None, **kwargs
):
parquet_kwargs = {}
use_nullable_dtypes = kwargs.pop("use_nullable_dtypes", False)
if use_nullable_dtypes:
raise ValueError(
"The 'use_nullable_dtypes' argument is not supported for the "
"fastparquet engine"
)
# Technically works with 0.7.0, but was incorrect
# so lets just require 0.7.1
if Version(self.api.__version__) >= Version("0.7.1"):
# Need to set even for use_nullable_dtypes = False,
# since our defaults differ
parquet_kwargs["pandas_nulls"] = use_nullable_dtypes
else:
if use_nullable_dtypes:
raise ValueError(
"The 'use_nullable_dtypes' argument is not supported for the "
"fastparquet engine for fastparquet versions less than 0.7.1"
)
path = stringify_path(path)
parquet_kwargs = {}
handles = None
if is_fsspec_url(path):
fsspec = import_optional_dependency("fsspec")
Expand All @@ -337,6 +344,7 @@ def read(
path, "rb", is_text=False, storage_options=storage_options
)
path = handles.handle

parquet_file = self.api.ParquetFile(path, **parquet_kwargs)

result = parquet_file.to_pandas(columns=columns, **kwargs)
Expand Down Expand Up @@ -470,14 +478,18 @@ def read_parquet(
use_nullable_dtypes : bool, default False
If True, use dtypes that use ``pd.NA`` as missing value indicator
for the resulting DataFrame (only applicable for ``engine="pyarrow"``).
for the resulting DataFrame.
As new dtypes are added that support ``pd.NA`` in the future, the
output with this option will change to use those dtypes.
Note: this is an experimental option, and behaviour (e.g. additional
support dtypes) may change without notice.
.. versionadded:: 1.2.0
.. versionchanged:: 1.3.2
``use_nullable_dtypes`` now works with the ``fastparquet`` engine
if ``fastparquet`` is version 0.7.1 or higher.
**kwargs
Any additional kwargs are passed to the engine.
Expand Down
80 changes: 49 additions & 31 deletions pandas/tests/io/test_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -596,6 +596,46 @@ def test_write_column_index_nonstring(self, pa):
msg = r"parquet must have string column names"
self.check_error_on_write(df, engine, ValueError, msg)

def test_use_nullable_dtypes(self, engine):
    """
    Reading a parquet file containing nulls with ``use_nullable_dtypes=True``
    should produce pd.NA-backed extension dtypes (Int64/UInt8/string/boolean)
    instead of the default NaN-based dtypes, for both engines.
    """
    import pyarrow.parquet as pq

    if engine == "fastparquet":
        # fastparquet only gained nullable-dtype support in 0.7.1
        # (see the version gate in FastParquetImpl.read) — skip on older.
        pytest.importorskip(
            "fastparquet",
            "0.7.1",
            reason="fastparquet must be 0.7.1 or higher for nullable dtype support",
        )

    # Columns chosen to exercise int, unsigned int, string and boolean,
    # each with a trailing null.
    table = pyarrow.table(
        {
            "a": pyarrow.array([1, 2, 3, None], "int64"),
            "b": pyarrow.array([1, 2, 3, None], "uint8"),
            "c": pyarrow.array(["a", "b", "c", None]),
            "d": pyarrow.array([True, False, True, None]),
        }
    )
    with tm.ensure_clean() as path:
        # write manually with pyarrow to write integers
        pq.write_table(table, path)
        result1 = read_parquet(path, engine=engine)
        result2 = read_parquet(path, engine=engine, use_nullable_dtypes=True)

        # Without the flag, the int64 column with a null falls back to float64.
        assert result1["a"].dtype == np.dtype("float64")
        expected = pd.DataFrame(
            {
                "a": pd.array([1, 2, 3, None], dtype="Int64"),
                "b": pd.array([1, 2, 3, None], dtype="UInt8"),
                "c": pd.array(["a", "b", "c", None], dtype="string"),
                "d": pd.array([True, False, True, None], dtype="boolean"),
            }
        )
        if engine == "fastparquet":
            # Fastparquet doesn't support string columns yet
            # Only int and boolean
            result2 = result2.drop("c", axis=1)
            expected = expected.drop("c", axis=1)
        tm.assert_frame_equal(result2, expected)


@pytest.mark.filterwarnings("ignore:CategoricalBlock is deprecated:DeprecationWarning")
class TestParquetPyArrow(Base):
Expand Down Expand Up @@ -842,35 +882,6 @@ def test_additional_extension_types(self, pa):
)
check_round_trip(df, pa)

@td.skip_if_no("pyarrow")
def test_use_nullable_dtypes(self, pa):
    """
    ``use_nullable_dtypes=True`` should read null-containing parquet columns
    into pd.NA-backed extension dtypes (Int64/UInt8/string/boolean) rather
    than the default NaN-based dtypes.

    NOTE(review): this pyarrow-only version is removed by this commit in
    favor of the engine-parametrized test on the shared base class.
    """
    import pyarrow.parquet as pq

    # One null per column to force the nullable/NaN dtype decision.
    table = pyarrow.table(
        {
            "a": pyarrow.array([1, 2, 3, None], "int64"),
            "b": pyarrow.array([1, 2, 3, None], "uint8"),
            "c": pyarrow.array(["a", "b", "c", None]),
            "d": pyarrow.array([True, False, True, None]),
        }
    )
    with tm.ensure_clean() as path:
        # write manually with pyarrow to write integers
        pq.write_table(table, path)
        result1 = read_parquet(path)
        result2 = read_parquet(path, use_nullable_dtypes=True)

        # Default behavior: int64 with a null is upcast to float64.
        assert result1["a"].dtype == np.dtype("float64")
        expected = pd.DataFrame(
            {
                "a": pd.array([1, 2, 3, None], dtype="Int64"),
                "b": pd.array([1, 2, 3, None], dtype="UInt8"),
                "c": pd.array(["a", "b", "c", None], dtype="string"),
                "d": pd.array([True, False, True, None], dtype="boolean"),
            }
        )
        tm.assert_frame_equal(result2, expected)

def test_timestamp_nanoseconds(self, pa):
# with version 2.0, pyarrow defaults to writing the nanoseconds, so
# this should work without error
Expand Down Expand Up @@ -941,7 +952,9 @@ def test_duplicate_columns(self, fp):
def test_bool_with_none(self, fp):
df = pd.DataFrame({"a": [True, None, False]})
expected = pd.DataFrame({"a": [1.0, np.nan, 0.0]}, dtype="float16")
check_round_trip(df, fp, expected=expected)
# Fastparquet bug in 0.7.1 makes it so that this dtype becomes
# float64
check_round_trip(df, fp, expected=expected, check_dtype=False)

def test_unsupported(self, fp):

Expand Down Expand Up @@ -1062,9 +1075,14 @@ def test_timezone_aware_index(self, fp, timezone_aware_date_list):
expected.index.name = "index"
check_round_trip(df, fp, expected=expected)

def test_use_nullable_dtypes_not_supported(self, fp):
def test_use_nullable_dtypes_not_supported(self, monkeypatch, fp):
df = pd.DataFrame({"a": [1, 2]})

# This is supported now in fastparquet 0.7.1 and above actually
# Still need to ensure that this raises in all versions below
import fastparquet as fp

monkeypatch.setattr(fp, "__version__", "0.4")
with tm.ensure_clean() as path:
df.to_parquet(path)
with pytest.raises(ValueError, match="not supported for the fastparquet"):
Expand Down
2 changes: 1 addition & 1 deletion requirements-dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ xlrd
xlsxwriter
xlwt
odfpy
fastparquet>=0.4.0, <0.7.0
fastparquet>=0.4.0
pyarrow>=0.17.0
python-snappy
tables>=3.6.1
Expand Down

0 comments on commit e042219

Please sign in to comment.