Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix pandas datetime decoding with NumPy >= 2.0 for small integer dtypes #9518

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,11 @@ Bug fixes
- Make illegal path-like variable names when constructing a DataTree from a Dataset
(:issue:`9339`, :pull:`9378`)
By `Etienne Schalk <https://github.com/etienneschalk>`_.
- Work around `upstream pandas issue
<https://github.com/pandas-dev/pandas/issues/56996>`_ to ensure that we can
decode times encoded with small integer dtype values (e.g. ``np.int32``) in
environments with NumPy 2.0 or greater without needing to fall back to cftime
(:pull:`9518`). By `Spencer Clark <https://github.com/spencerkclark>`_.
- Fix bug when encoding times with missing values as floats in the case when
the non-missing times could in theory be encoded with integers
(:issue:`9488`, :pull:`9497`). By `Spencer Clark
Expand Down
9 changes: 9 additions & 0 deletions xarray/coding/times.py
Original file line number Diff line number Diff line change
Expand Up @@ -254,6 +254,15 @@ def _decode_datetime_with_pandas(
"pandas."
)

# Work around pandas.to_timedelta issue with dtypes smaller than int64 and
# NumPy 2.0 by casting all int and uint data to int64 and uint64,
# respectively. See https://github.com/pandas-dev/pandas/issues/56996 for
# more details.
if flat_num_dates.dtype.kind == "i":
flat_num_dates = flat_num_dates.astype(np.int64)
elif flat_num_dates.dtype.kind == "u":
flat_num_dates = flat_num_dates.astype(np.uint64)

time_units, ref_date_str = _unpack_netcdf_time_units(units)
time_units = _netcdf_to_numpy_timeunit(time_units)
try:
Expand Down
27 changes: 19 additions & 8 deletions xarray/tests/test_coding_times.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import numpy as np
import pandas as pd
import pytest
from pandas.errors import OutOfBoundsDatetime
from pandas.errors import OutOfBoundsDatetime, OutOfBoundsTimedelta

from xarray import (
DataArray,
Expand Down Expand Up @@ -1136,11 +1136,16 @@ def test_should_cftime_be_used_target_not_npable():
_should_cftime_be_used(src, "noleap", False)


@pytest.mark.parametrize("dtype", [np.uint8, np.uint16, np.uint32, np.uint64])
def test_decode_cf_datetime_uint(dtype):
@pytest.mark.parametrize(
"dtype",
[np.int8, np.int16, np.int32, np.int64, np.uint8, np.uint16, np.uint32, np.uint64],
)
def test_decode_cf_datetime_varied_integer_dtypes(dtype):
units = "seconds since 2018-08-22T03:23:03Z"
num_dates = dtype(50)
result = decode_cf_datetime(num_dates, units)
# Set use_cftime=False to ensure we cannot mask a failure by falling back
# to cftime.
result = decode_cf_datetime(num_dates, units, use_cftime=False)
expected = np.asarray(np.datetime64("2018-08-22T03:23:53", "ns"))
np.testing.assert_equal(result, expected)

Expand All @@ -1154,6 +1159,14 @@ def test_decode_cf_datetime_uint64_with_cftime():
np.testing.assert_equal(result, expected)


def test_decode_cf_datetime_uint64_with_pandas_overflow_error():
units = "nanoseconds since 1970-01-01"
calendar = "standard"
num_dates = np.uint64(1_000_000 * 86_400 * 360 * 500_000)
with pytest.raises(OutOfBoundsTimedelta):
decode_cf_datetime(num_dates, units, calendar, use_cftime=False)


@requires_cftime
def test_decode_cf_datetime_uint64_with_cftime_overflow_error():
units = "microseconds since 1700-01-01"
Expand Down Expand Up @@ -1438,10 +1451,8 @@ def test_roundtrip_float_times(fill_value, times, units, encoded_values) -> None
"days since 1700-01-01",
np.dtype("int32"),
),
"mixed-cftime-pandas-encoding-with-prescribed-units-and-dtype": (
"250YS",
"days since 1700-01-01",
np.dtype("int32"),
"mixed-cftime-pandas-encoding-with-prescribed-units-and-dtype": pytest.param(
"250YS", "days since 1700-01-01", np.dtype("int32"), marks=requires_cftime
Comment on lines +1454 to +1455
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not related to this PR, but I noticed when testing in an environment without cftime that this test was not skipped. I guess we don't have a CI environment that has dask but not cftime, so I figured I would take the opportunity to clean this up.

),
"pandas-encoding-with-default-units-and-dtype": ("250YS", None, None),
}
Expand Down
Loading