diff --git a/.github/workflows/database.yml b/.github/workflows/database.yml index a5aef7825c770..69f2e689c0228 100644 --- a/.github/workflows/database.yml +++ b/.github/workflows/database.yml @@ -70,7 +70,7 @@ jobs: - uses: conda-incubator/setup-miniconda@v2 with: activate-environment: pandas-dev - channel-priority: strict + channel-priority: flexible environment-file: ${{ matrix.ENV_FILE }} use-only-tar-bz2: true diff --git a/ci/deps/actions-37-db-min.yaml b/ci/deps/actions-37-db-min.yaml index 1d3794576220a..65c4c5769b1a3 100644 --- a/ci/deps/actions-37-db-min.yaml +++ b/ci/deps/actions-37-db-min.yaml @@ -31,7 +31,8 @@ dependencies: - openpyxl - pandas-gbq - google-cloud-bigquery>=1.27.2 # GH 36436 - - pyarrow=0.17 # GH 38803 + - protobuf>=3.12.4 + - pyarrow=0.17.1 # GH 38803 - pytables>=3.5.1 - scipy - xarray=0.12.3 diff --git a/ci/deps/actions-37-db.yaml b/ci/deps/actions-37-db.yaml index 8755e1a02c3cf..fa58f412cebf4 100644 --- a/ci/deps/actions-37-db.yaml +++ b/ci/deps/actions-37-db.yaml @@ -31,7 +31,7 @@ dependencies: - pandas-gbq - google-cloud-bigquery>=1.27.2 # GH 36436 - psycopg2 - - pyarrow>=0.15.0 + - pyarrow>=0.17.0 - pymysql - pytables - python-snappy diff --git a/ci/deps/actions-37-minimum_versions.yaml b/ci/deps/actions-37-minimum_versions.yaml index 3237cf9770220..aa5284e4f35d1 100644 --- a/ci/deps/actions-37-minimum_versions.yaml +++ b/ci/deps/actions-37-minimum_versions.yaml @@ -23,7 +23,7 @@ dependencies: - pytables=3.5.1 - python-dateutil=2.7.3 - pytz=2017.3 - - pyarrow=0.15 + - pyarrow=0.17.0 - scipy=1.2 - xlrd=1.2.0 - xlsxwriter=1.0.2 diff --git a/ci/deps/actions-37.yaml b/ci/deps/actions-37.yaml index f29830e9b3e79..a209a9099d2bb 100644 --- a/ci/deps/actions-37.yaml +++ b/ci/deps/actions-37.yaml @@ -18,7 +18,7 @@ dependencies: - numpy=1.19 - python-dateutil - nomkl - - pyarrow=0.15.1 + - pyarrow - pytz - s3fs>=0.4.0 - moto>=1.3.14 diff --git a/ci/deps/azure-macos-37.yaml b/ci/deps/azure-macos-37.yaml index 8c8b49ff3df5b..a0b1cdc684d2c 
100644 --- a/ci/deps/azure-macos-37.yaml +++ b/ci/deps/azure-macos-37.yaml @@ -1,6 +1,7 @@ name: pandas-dev channels: - defaults + - conda-forge dependencies: - python=3.7.* @@ -21,7 +22,7 @@ dependencies: - numexpr - numpy=1.17.3 - openpyxl - - pyarrow=0.15.1 + - pyarrow=0.17.0 - pytables - python-dateutil==2.7.3 - pytz diff --git a/ci/deps/azure-windows-37.yaml b/ci/deps/azure-windows-37.yaml index c9d22ffbead45..8266e3bc4d07d 100644 --- a/ci/deps/azure-windows-37.yaml +++ b/ci/deps/azure-windows-37.yaml @@ -26,7 +26,7 @@ dependencies: - numexpr - numpy=1.17.* - openpyxl - - pyarrow=0.15 + - pyarrow=0.17.0 - pytables - python-dateutil - pytz diff --git a/ci/deps/azure-windows-38.yaml b/ci/deps/azure-windows-38.yaml index 661d8813d32d2..200e695a69d1f 100644 --- a/ci/deps/azure-windows-38.yaml +++ b/ci/deps/azure-windows-38.yaml @@ -25,7 +25,7 @@ dependencies: - numpy=1.18.* - openpyxl - jinja2 - - pyarrow>=0.15.0 + - pyarrow>=0.17.0 - pytables - python-dateutil - pytz diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index 16beb00d201b7..ce35e9e15976f 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -358,7 +358,7 @@ PyTables 3.5.1 HDF5-based reading / writing blosc 1.17.0 Compression for HDF5 zlib Compression for HDF5 fastparquet 0.4.0 Parquet reading / writing -pyarrow 0.15.0 Parquet, ORC, and feather reading / writing +pyarrow 0.17.0 Parquet, ORC, and feather reading / writing pyreadstat SPSS files (.sav) reading ========================= ================== ============================================================= diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 9968a103a13bf..f6b5c30635980 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -579,7 +579,7 @@ Optional libraries below the lowest tested version may still work, but are not c +-----------------+-----------------+---------+ | openpyxl | 
3.0.0 | X | +-----------------+-----------------+---------+ -| pyarrow | 0.15.0 | | +| pyarrow | 0.17.0 | X | +-----------------+-----------------+---------+ | pymysql | 0.8.1 | X | +-----------------+-----------------+---------+ diff --git a/environment.yml b/environment.yml index 67b42d545af88..56a36c593a458 100644 --- a/environment.yml +++ b/environment.yml @@ -100,7 +100,7 @@ dependencies: - odfpy - fastparquet>=0.3.2 # pandas.read_parquet, DataFrame.to_parquet - - pyarrow>=0.15.0 # pandas.read_parquet, DataFrame.to_parquet, pandas.read_feather, DataFrame.to_feather + - pyarrow>=0.17.0 # pandas.read_parquet, DataFrame.to_parquet, pandas.read_feather, DataFrame.to_feather - python-snappy # required by pyarrow - pyqt>=5.9.2 # pandas.read_clipboard diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index 0ef6da53191c5..f8eccfeb2c60a 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -21,7 +21,7 @@ "odfpy": "1.3.0", "openpyxl": "3.0.0", "pandas_gbq": "0.12.0", - "pyarrow": "0.15.0", + "pyarrow": "0.17.0", "pytest": "5.0.1", "pyxlsb": "1.0.6", "s3fs": "0.4.0", diff --git a/pandas/tests/arrays/interval/test_interval.py b/pandas/tests/arrays/interval/test_interval.py index 6ae3f75069899..7d27b617c0e6e 100644 --- a/pandas/tests/arrays/interval/test_interval.py +++ b/pandas/tests/arrays/interval/test_interval.py @@ -165,7 +165,7 @@ def test_repr(): # Arrow interaction -pyarrow_skip = td.skip_if_no("pyarrow", min_version="0.16.0") +pyarrow_skip = td.skip_if_no("pyarrow") @pyarrow_skip diff --git a/pandas/tests/arrays/masked/test_arrow_compat.py b/pandas/tests/arrays/masked/test_arrow_compat.py index 193017ddfcadf..9f755412dbf39 100644 --- a/pandas/tests/arrays/masked/test_arrow_compat.py +++ b/pandas/tests/arrays/masked/test_arrow_compat.py @@ -6,7 +6,7 @@ import pandas as pd import pandas._testing as tm -pa = pytest.importorskip("pyarrow", minversion="0.15.0") +pa = pytest.importorskip("pyarrow", minversion="0.17.0") from 
pandas.core.arrays._arrow_utils import pyarrow_array_to_numpy_and_mask @@ -21,8 +21,6 @@ def data(request): def test_arrow_array(data): - # protocol added in 0.15.0 - arr = pa.array(data) expected = pa.array( data.to_numpy(object, na_value=None), @@ -31,10 +29,8 @@ def test_arrow_array(data): assert arr.equals(expected) -@td.skip_if_no("pyarrow", min_version="0.16.0") +@td.skip_if_no("pyarrow") def test_arrow_roundtrip(data): - # roundtrip possible from arrow 0.16.0 - df = pd.DataFrame({"a": data}) table = pa.table(df) assert table.field("a").type == str(data.dtype.numpy_dtype) @@ -43,7 +39,7 @@ def test_arrow_roundtrip(data): tm.assert_frame_equal(result, df) -@td.skip_if_no("pyarrow", min_version="0.16.0") +@td.skip_if_no("pyarrow") def test_arrow_load_from_zero_chunks(data): # GH-41040 @@ -58,7 +54,7 @@ def test_arrow_load_from_zero_chunks(data): tm.assert_frame_equal(result, df) -@td.skip_if_no("pyarrow", min_version="0.16.0") +@td.skip_if_no("pyarrow") def test_arrow_from_arrow_uint(): # https://github.com/pandas-dev/pandas/issues/31896 # possible mismatch in types @@ -70,7 +66,7 @@ def test_arrow_from_arrow_uint(): tm.assert_extension_array_equal(result, expected) -@td.skip_if_no("pyarrow", min_version="0.16.0") +@td.skip_if_no("pyarrow") def test_arrow_sliced(data): # https://github.com/pandas-dev/pandas/issues/38525 @@ -165,7 +161,7 @@ def test_pyarrow_array_to_numpy_and_mask(np_dtype_to_arrays): tm.assert_numpy_array_equal(mask, mask_expected_empty) -@td.skip_if_no("pyarrow", min_version="0.16.0") +@td.skip_if_no("pyarrow") def test_from_arrow_type_error(request, data): # ensure that __from_arrow__ returns a TypeError when getting a wrong # array type diff --git a/pandas/tests/arrays/period/test_arrow_compat.py b/pandas/tests/arrays/period/test_arrow_compat.py index d7b0704cdfb05..5211397f20c36 100644 --- a/pandas/tests/arrays/period/test_arrow_compat.py +++ b/pandas/tests/arrays/period/test_arrow_compat.py @@ -11,7 +11,7 @@ period_array, ) -pyarrow_skip = 
td.skip_if_no("pyarrow", min_version="0.16.0") +pyarrow_skip = td.skip_if_no("pyarrow", min_version="0.17.0") @pyarrow_skip diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 3205664e7c80a..e3b43c544a477 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -437,7 +437,7 @@ def test_fillna_args(dtype, request): arr.fillna(value=1) -@td.skip_if_no("pyarrow", min_version="0.15.0") +@td.skip_if_no("pyarrow") def test_arrow_array(dtype): # protocol added in 0.15.0 import pyarrow as pa @@ -451,7 +451,7 @@ def test_arrow_array(dtype): assert arr.equals(expected) -@td.skip_if_no("pyarrow", min_version="0.16.0") +@td.skip_if_no("pyarrow") def test_arrow_roundtrip(dtype, dtype_object): # roundtrip possible from arrow 1.0.0 import pyarrow as pa @@ -467,7 +467,7 @@ def test_arrow_roundtrip(dtype, dtype_object): assert result.loc[2, "a"] is pd.NA -@td.skip_if_no("pyarrow", min_version="0.16.0") +@td.skip_if_no("pyarrow") def test_arrow_load_from_zero_chunks(dtype, dtype_object): # GH-41040 import pyarrow as pa diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index a5254f5ff7988..ba8a9ed070236 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -6,14 +6,12 @@ import pandas as pd import pandas._testing as tm -from pandas.util.version import Version from pandas.io.feather_format import read_feather, to_feather # isort:skip pyarrow = pytest.importorskip("pyarrow") -pyarrow_version = Version(pyarrow.__version__) filter_sparse = pytest.mark.filterwarnings("ignore:The Sparse") @@ -89,12 +87,11 @@ def test_basic(self): ), } ) - if pyarrow_version >= Version("0.17.0"): - df["periods"] = pd.period_range("2013", freq="M", periods=3) - df["timedeltas"] = pd.timedelta_range("1 day", periods=3) - # TODO temporary disable due to regression in pyarrow 0.17.1 - # 
https://github.com/pandas-dev/pandas/issues/34255 - # df["intervals"] = pd.interval_range(0, 3, 3) + df["periods"] = pd.period_range("2013", freq="M", periods=3) + df["timedeltas"] = pd.timedelta_range("1 day", periods=3) + # TODO temporary disable due to regression in pyarrow 0.17.1 + # https://github.com/pandas-dev/pandas/issues/34255 + # df["intervals"] = pd.interval_range(0, 3, 3) assert df.dttz.dtype.tz.zone == "US/Eastern" self.check_round_trip(df) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index f66451cd72309..ae6425cd93ac5 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -17,6 +17,10 @@ PY38, is_platform_windows, ) +from pandas.compat.pyarrow import ( + pa_version_under1p0, + pa_version_under2p0, +) import pandas.util._test_decorators as td import pandas as pd @@ -653,8 +657,6 @@ def test_categorical(self, pa): ) def test_s3_roundtrip_explicit_fs(self, df_compat, s3_resource, pa, s3so): s3fs = pytest.importorskip("s3fs") - if Version(pyarrow.__version__) <= Version("0.17.0"): - pytest.skip() s3 = s3fs.S3FileSystem(**s3so) kw = {"filesystem": s3} check_round_trip( @@ -666,8 +668,6 @@ def test_s3_roundtrip_explicit_fs(self, df_compat, s3_resource, pa, s3so): ) def test_s3_roundtrip(self, df_compat, s3_resource, pa, s3so): - if Version(pyarrow.__version__) <= Version("0.17.0"): - pytest.skip() # GH #19134 s3so = {"storage_options": s3so} check_round_trip( @@ -698,14 +698,12 @@ def test_s3_roundtrip_for_dir( # These are added to back of dataframe on read. 
In new API category dtype is # only used if partition field is string, but this changed again to use # category dtype for all types (not only strings) in pyarrow 2.0.0 - pa10 = (Version(pyarrow.__version__) >= Version("1.0.0")) and ( - Version(pyarrow.__version__) < Version("2.0.0") - ) if partition_col: - if pa10: - partition_col_type = "int32" - else: - partition_col_type = "category" + partition_col_type = ( + "int32" + if (not pa_version_under1p0) and pa_version_under2p0 + else "category" + ) expected_df[partition_col] = expected_df[partition_col].astype( partition_col_type @@ -795,7 +793,7 @@ def test_write_with_schema(self, pa): out_df = df.astype(bool) check_round_trip(df, pa, write_kwargs={"schema": schema}, expected=out_df) - @td.skip_if_no("pyarrow", min_version="0.15.0") + @td.skip_if_no("pyarrow") def test_additional_extension_arrays(self, pa): # test additional ExtensionArrays that are supported through the # __arrow_array__ protocol @@ -806,22 +804,10 @@ def test_additional_extension_arrays(self, pa): "c": pd.Series(["a", None, "c"], dtype="string"), } ) - if Version(pyarrow.__version__) >= Version("0.16.0"): - expected = df - else: - # de-serialized as plain int / object - expected = df.assign( - a=df.a.astype("int64"), b=df.b.astype("int64"), c=df.c.astype("object") - ) - check_round_trip(df, pa, expected=expected) + check_round_trip(df, pa) df = pd.DataFrame({"a": pd.Series([1, 2, 3, None], dtype="Int64")}) - if Version(pyarrow.__version__) >= Version("0.16.0"): - expected = df - else: - # if missing values in integer, currently de-serialized as float - expected = df.assign(a=df.a.astype("float64")) - check_round_trip(df, pa, expected=expected) + check_round_trip(df, pa) @td.skip_if_no("pyarrow", min_version="1.0.0") def test_pyarrow_backed_string_array(self, pa): @@ -831,7 +817,7 @@ def test_pyarrow_backed_string_array(self, pa): df = pd.DataFrame({"a": pd.Series(["a", None, "c"], dtype="arrow_string")}) check_round_trip(df, pa, expected=df) - 
@td.skip_if_no("pyarrow", min_version="0.16.0") + @td.skip_if_no("pyarrow") def test_additional_extension_types(self, pa): # test additional ExtensionArrays that are supported through the # __arrow_array__ protocol + by defining a custom ExtensionType @@ -844,7 +830,7 @@ def test_additional_extension_types(self, pa): ) check_round_trip(df, pa) - @td.skip_if_no("pyarrow", min_version="0.16.0") + @td.skip_if_no("pyarrow") def test_use_nullable_dtypes(self, pa): import pyarrow.parquet as pq @@ -880,7 +866,7 @@ def test_timestamp_nanoseconds(self, pa): check_round_trip(df, pa, write_kwargs={"version": "2.0"}) def test_timezone_aware_index(self, pa, timezone_aware_date_list): - if Version(pyarrow.__version__) >= Version("2.0.0"): + if not pa_version_under2p0: # temporary skip this test until it is properly resolved # https://github.com/pandas-dev/pandas/issues/37286 pytest.skip() diff --git a/requirements-dev.txt b/requirements-dev.txt index 35fb6d18a7e81..d1fafbbf9101d 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -65,7 +65,7 @@ xlsxwriter xlwt odfpy fastparquet>=0.3.2 -pyarrow>=0.15.0 +pyarrow>=0.17.0 python-snappy pyqt5>=5.9.2 tables>=3.5.1