Skip to content

Commit

Permalink
COMPAT: Support fastparquet 0.7.1 (pandas-dev#42919)
Browse files Browse the repository at this point in the history
  • Loading branch information
lithomas1 authored Aug 8, 2021
1 parent 08d296f commit e042219
Show file tree
Hide file tree
Showing 7 changed files with 73 additions and 43 deletions.
2 changes: 1 addition & 1 deletion ci/deps/actions-38-db.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ dependencies:
- beautifulsoup4
- botocore>=1.11
- dask
- fastparquet>=0.4.0, < 0.7.0
- fastparquet>=0.4.0
- fsspec>=0.7.4, <2021.6.0
- gcsfs>=0.6.0
- geopandas
Expand Down
2 changes: 1 addition & 1 deletion ci/deps/azure-windows-38.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ dependencies:
# pandas dependencies
- blosc
- bottleneck
- fastparquet>=0.4.0, <0.7.0
- fastparquet>=0.4.0
- flask
- fsspec>=0.8.0, <2021.6.0
- matplotlib=3.3.2
Expand Down
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v1.3.2.rst
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ Bug fixes

Other
~~~~~
-
- :meth:`pandas.read_parquet` now supports reading nullable dtypes with ``fastparquet`` versions above 0.7.1.
-

.. ---------------------------------------------------------------------------
Expand Down
2 changes: 1 addition & 1 deletion environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ dependencies:
- xlwt
- odfpy

- fastparquet>=0.4.0, <0.7.0 # pandas.read_parquet, DataFrame.to_parquet
- fastparquet>=0.4.0 # pandas.read_parquet, DataFrame.to_parquet
- pyarrow>=0.17.0 # pandas.read_parquet, DataFrame.to_parquet, pandas.read_feather, DataFrame.to_feather
- python-snappy # required by pyarrow

Expand Down
26 changes: 19 additions & 7 deletions pandas/io/parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -309,14 +309,21 @@ def write(
def read(
self, path, columns=None, storage_options: StorageOptions = None, **kwargs
):
parquet_kwargs = {}
use_nullable_dtypes = kwargs.pop("use_nullable_dtypes", False)
if use_nullable_dtypes:
raise ValueError(
"The 'use_nullable_dtypes' argument is not supported for the "
"fastparquet engine"
)
# Technically works with 0.7.0, but was incorrect
# so lets just require 0.7.1
if Version(self.api.__version__) >= Version("0.7.1"):
# Need to set even for use_nullable_dtypes = False,
# since our defaults differ
parquet_kwargs["pandas_nulls"] = use_nullable_dtypes
else:
if use_nullable_dtypes:
raise ValueError(
"The 'use_nullable_dtypes' argument is not supported for the "
"fastparquet engine for fastparquet versions less than 0.7.1"
)
path = stringify_path(path)
parquet_kwargs = {}
handles = None
if is_fsspec_url(path):
fsspec = import_optional_dependency("fsspec")
Expand All @@ -337,6 +344,7 @@ def read(
path, "rb", is_text=False, storage_options=storage_options
)
path = handles.handle

parquet_file = self.api.ParquetFile(path, **parquet_kwargs)

result = parquet_file.to_pandas(columns=columns, **kwargs)
Expand Down Expand Up @@ -470,14 +478,18 @@ def read_parquet(
use_nullable_dtypes : bool, default False
If True, use dtypes that use ``pd.NA`` as missing value indicator
for the resulting DataFrame (only applicable for ``engine="pyarrow"``).
for the resulting DataFrame.
As new dtypes are added that support ``pd.NA`` in the future, the
output with this option will change to use those dtypes.
Note: this is an experimental option, and behaviour (e.g. additional
support dtypes) may change without notice.
.. versionadded:: 1.2.0
.. versionchanged:: 1.3.2
``use_nullable_dtypes`` now works with the ``fastparquet`` engine
if ``fastparquet`` is version 0.7.1 or higher.
**kwargs
Any additional kwargs are passed to the engine.
Expand Down
80 changes: 49 additions & 31 deletions pandas/tests/io/test_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -596,6 +596,46 @@ def test_write_column_index_nonstring(self, pa):
msg = r"parquet must have string column names"
self.check_error_on_write(df, engine, ValueError, msg)

def test_use_nullable_dtypes(self, engine):
    """
    Reading a parquet file containing nulls with ``use_nullable_dtypes=True``
    should produce pd.NA-backed extension dtypes (Int64/UInt8/string/boolean)
    instead of the default NaN-based dtypes, for both engines.
    """
    import pyarrow.parquet as pq

    if engine == "fastparquet":
        # fastparquet only gained nullable-dtype support in 0.7.1
        # (see the version gate in FastParquetImpl.read) — skip on older.
        pytest.importorskip(
            "fastparquet",
            "0.7.1",
            reason="fastparquet must be 0.7.1 or higher for nullable dtype support",
        )

    # Columns chosen to exercise int, unsigned int, string and boolean,
    # each with a trailing null.
    table = pyarrow.table(
        {
            "a": pyarrow.array([1, 2, 3, None], "int64"),
            "b": pyarrow.array([1, 2, 3, None], "uint8"),
            "c": pyarrow.array(["a", "b", "c", None]),
            "d": pyarrow.array([True, False, True, None]),
        }
    )
    with tm.ensure_clean() as path:
        # write manually with pyarrow to write integers
        pq.write_table(table, path)
        result1 = read_parquet(path, engine=engine)
        result2 = read_parquet(path, engine=engine, use_nullable_dtypes=True)

        # Without the flag, the int64 column with a null falls back to float64.
        assert result1["a"].dtype == np.dtype("float64")
        expected = pd.DataFrame(
            {
                "a": pd.array([1, 2, 3, None], dtype="Int64"),
                "b": pd.array([1, 2, 3, None], dtype="UInt8"),
                "c": pd.array(["a", "b", "c", None], dtype="string"),
                "d": pd.array([True, False, True, None], dtype="boolean"),
            }
        )
        if engine == "fastparquet":
            # Fastparquet doesn't support string columns yet
            # Only int and boolean
            result2 = result2.drop("c", axis=1)
            expected = expected.drop("c", axis=1)
        tm.assert_frame_equal(result2, expected)


@pytest.mark.filterwarnings("ignore:CategoricalBlock is deprecated:DeprecationWarning")
class TestParquetPyArrow(Base):
Expand Down Expand Up @@ -842,35 +882,6 @@ def test_additional_extension_types(self, pa):
)
check_round_trip(df, pa)

@td.skip_if_no("pyarrow")
def test_use_nullable_dtypes(self, pa):
    """
    ``use_nullable_dtypes=True`` should read null-containing parquet columns
    into pd.NA-backed extension dtypes (Int64/UInt8/string/boolean) rather
    than the default NaN-based dtypes.

    NOTE(review): this pyarrow-only version is removed by this commit in
    favor of the engine-parametrized test on the shared base class.
    """
    import pyarrow.parquet as pq

    # One null per column to force the nullable/NaN dtype decision.
    table = pyarrow.table(
        {
            "a": pyarrow.array([1, 2, 3, None], "int64"),
            "b": pyarrow.array([1, 2, 3, None], "uint8"),
            "c": pyarrow.array(["a", "b", "c", None]),
            "d": pyarrow.array([True, False, True, None]),
        }
    )
    with tm.ensure_clean() as path:
        # write manually with pyarrow to write integers
        pq.write_table(table, path)
        result1 = read_parquet(path)
        result2 = read_parquet(path, use_nullable_dtypes=True)

        # Default behavior: int64 with a null is upcast to float64.
        assert result1["a"].dtype == np.dtype("float64")
        expected = pd.DataFrame(
            {
                "a": pd.array([1, 2, 3, None], dtype="Int64"),
                "b": pd.array([1, 2, 3, None], dtype="UInt8"),
                "c": pd.array(["a", "b", "c", None], dtype="string"),
                "d": pd.array([True, False, True, None], dtype="boolean"),
            }
        )
        tm.assert_frame_equal(result2, expected)

def test_timestamp_nanoseconds(self, pa):
# with version 2.0, pyarrow defaults to writing the nanoseconds, so
# this should work without error
Expand Down Expand Up @@ -941,7 +952,9 @@ def test_duplicate_columns(self, fp):
def test_bool_with_none(self, fp):
df = pd.DataFrame({"a": [True, None, False]})
expected = pd.DataFrame({"a": [1.0, np.nan, 0.0]}, dtype="float16")
check_round_trip(df, fp, expected=expected)
# Fastparquet bug in 0.7.1 makes it so that this dtype becomes
# float64
check_round_trip(df, fp, expected=expected, check_dtype=False)

def test_unsupported(self, fp):

Expand Down Expand Up @@ -1062,9 +1075,14 @@ def test_timezone_aware_index(self, fp, timezone_aware_date_list):
expected.index.name = "index"
check_round_trip(df, fp, expected=expected)

def test_use_nullable_dtypes_not_supported(self, fp):
def test_use_nullable_dtypes_not_supported(self, monkeypatch, fp):
df = pd.DataFrame({"a": [1, 2]})

# This is supported now in fastparquet 0.7.1 and above actually
# Still need to ensure that this raises in all versions below
import fastparquet as fp

monkeypatch.setattr(fp, "__version__", "0.4")
with tm.ensure_clean() as path:
df.to_parquet(path)
with pytest.raises(ValueError, match="not supported for the fastparquet"):
Expand Down
2 changes: 1 addition & 1 deletion requirements-dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ xlrd
xlsxwriter
xlwt
odfpy
fastparquet>=0.4.0, <0.7.0
fastparquet>=0.4.0
pyarrow>=0.17.0
python-snappy
tables>=3.6.1
Expand Down

0 comments on commit e042219

Please sign in to comment.