Merge branch 'main' into h5netcdf-new-features
kmuehlbauer authored Sep 19, 2024
2 parents b78c42e + 7750c00 commit 074c8c5
Showing 58 changed files with 342 additions and 203 deletions.
8 changes: 4 additions & 4 deletions asv_bench/benchmarks/__init__.py

@@ -18,15 +18,15 @@ def decorator(func):
 def requires_dask():
     try:
         import dask  # noqa: F401
-    except ImportError:
-        raise NotImplementedError()
+    except ImportError as err:
+        raise NotImplementedError() from err


 def requires_sparse():
     try:
         import sparse  # noqa: F401
-    except ImportError:
-        raise NotImplementedError()
+    except ImportError as err:
+        raise NotImplementedError() from err


 def randn(shape, frac_nan=None, chunks=None, seed=0):
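The `raise ... from err` edit above recurs throughout this commit (it satisfies ruff's flake8-bugbear rule B904). A minimal standalone sketch, not taken from the changeset, of what explicit chaining does — the original exception is recorded on `__cause__` and the traceback reads "The above exception was the direct cause of the following exception" instead of the implicit "During handling of the above exception, another exception occurred":

    try:
        try:
            import not_a_real_module  # hypothetical missing optional dependency
        except ImportError as err:
            # Explicit chaining: the ImportError becomes the stated cause.
            raise NotImplementedError("optional dependency missing") from err
    except NotImplementedError as exc:
        assert isinstance(exc.__cause__, ImportError)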
6 changes: 3 additions & 3 deletions asv_bench/benchmarks/accessors.py

@@ -16,10 +16,10 @@ def setup(self, calendar):
         self.da = xr.DataArray(data, dims="time", coords={"time": time})

     def time_dayofyear(self, calendar):
-        self.da.time.dt.dayofyear
+        _ = self.da.time.dt.dayofyear

     def time_year(self, calendar):
-        self.da.time.dt.year
+        _ = self.da.time.dt.year

     def time_floor(self, calendar):
-        self.da.time.dt.floor("D")
+        _ = self.da.time.dt.floor("D")
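The `_ =` assignments above keep the attribute access (which is the thing being timed) while marking its result as intentionally unused, so flake8-bugbear's B018 "useless expression" check passes. A hedged sketch of the same pattern outside xarray:

    class Example:
        @property
        def expensive(self):
            return sum(range(1000))

    def time_expensive():
        # The access itself is the benchmark; the value is deliberately discarded.
        _ = Example().expensive

    time_expensive()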
4 changes: 2 additions & 2 deletions asv_bench/benchmarks/dataset_io.py

@@ -606,8 +606,8 @@ def setup(self):

         try:
             import distributed
-        except ImportError:
-            raise NotImplementedError()
+        except ImportError as err:
+            raise NotImplementedError() from err

         self.client = distributed.Client()
         self.write = create_delayed_write()
4 changes: 2 additions & 2 deletions ci/min_deps_check.py

@@ -68,8 +68,8 @@ def parse_requirements(fname) -> Iterator[tuple[str, int, int, int | None]]:

     try:
         version_tup = tuple(int(x) for x in version.split("."))
-    except ValueError:
-        raise ValueError("non-numerical version: " + row)
+    except ValueError as err:
+        raise ValueError("non-numerical version: " + row) from err

     if len(version_tup) == 2:
         yield (pkg, *version_tup, None)  # type: ignore[misc]
2 changes: 1 addition & 1 deletion doc/user-guide/hierarchical-data.rst

@@ -200,7 +200,7 @@ and even the distinguishing feature of the common ancestor of any two species (t

 .. ipython:: python

-    [node.name for node in primates.ancestors]
+    [node.name for node in reversed(primates.parents)]
     primates.root.name
     primates.find_common_ancestor(dinosaurs).name
7 changes: 7 additions & 0 deletions doc/whats-new.rst

@@ -49,10 +49,17 @@ Bug fixes
 - Make illegal path-like variable names when constructing a DataTree from a Dataset
   (:issue:`9339`, :pull:`9378`)
   By `Etienne Schalk <https://github.com/etienneschalk>`_.
+- Work around `upstream pandas issue
+  <https://github.com/pandas-dev/pandas/issues/56996>`_ to ensure that we can
+  decode times encoded with small integer dtype values (e.g. ``np.int32``) in
+  environments with NumPy 2.0 or greater without needing to fall back to cftime
+  (:pull:`9518`). By `Spencer Clark <https://github.com/spencerkclark>`_.
 - Fix bug when encoding times with missing values as floats in the case when
   the non-missing times could in theory be encoded with integers
   (:issue:`9488`, :pull:`9497`). By `Spencer Clark
   <https://github.com/spencerkclark>`_.
+- Fix a few bugs affecting groupby reductions with `flox`. (:issue:`8090`, :issue:`9398`).
+  By `Deepak Cherian <https://github.com/dcherian>`_.


 Documentation
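As a hedged illustration of the pandas-workaround entry above (constructed for this note, not part of the changeset), times stored with a small integer dtype should now decode directly to datetime64 on NumPy 2.0 or greater:

    import numpy as np
    import xarray as xr

    # Encoded times using np.int32; with the workaround no cftime fallback
    # is needed to decode them.
    ds = xr.Dataset(
        coords={
            "time": (
                "time",
                np.array([0, 1, 2], dtype=np.int32),
                {"units": "days since 2000-01-01"},
            )
        }
    )
    decoded = xr.decode_cf(ds)
    print(decoded["time"].values)  # datetime64 dates starting at 2000-01-01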
3 changes: 2 additions & 1 deletion pyproject.toml

@@ -246,13 +246,14 @@ extend-exclude = [
 extend-safe-fixes = [
   "TID252", # absolute imports
 ]
-extend-ignore = [
+ignore = [
   "E402",
   "E501",
   "E731",
   "UP007",
 ]
 extend-select = [
+  "B", # flake8-bugbear
   "F", # Pyflakes
   "E", # Pycodestyle
   "W",
16 changes: 8 additions & 8 deletions xarray/backends/api.py

@@ -99,11 +99,11 @@ def _get_default_engine_remote_uri() -> Literal["netcdf4", "pydap"]:
             import pydap  # noqa: F401

             engine = "pydap"
-        except ImportError:
+        except ImportError as err:
             raise ValueError(
                 "netCDF4 or pydap is required for accessing "
                 "remote datasets via OPeNDAP"
-            )
+            ) from err
     return engine


@@ -112,8 +112,8 @@ def _get_default_engine_gz() -> Literal["scipy"]:
         import scipy  # noqa: F401

         engine: Final = "scipy"
-    except ImportError:  # pragma: no cover
-        raise ValueError("scipy is required for accessing .gz files")
+    except ImportError as err:  # pragma: no cover
+        raise ValueError("scipy is required for accessing .gz files") from err
     return engine


@@ -128,11 +128,11 @@ def _get_default_engine_netcdf() -> Literal["netcdf4", "scipy"]:
             import scipy.io.netcdf  # noqa: F401

             engine = "scipy"
-        except ImportError:
+        except ImportError as err:
             raise ValueError(
                 "cannot read or write netCDF files without "
                 "netCDF4-python or scipy installed"
-            )
+            ) from err
     return engine


@@ -1382,8 +1382,8 @@ def to_netcdf(

     try:
         store_open = WRITEABLE_STORES[engine]
-    except KeyError:
-        raise ValueError(f"unrecognized engine for to_netcdf: {engine!r}")
+    except KeyError as err:
+        raise ValueError(f"unrecognized engine for to_netcdf: {engine!r}") from err

     if format is not None:
         format = format.upper()  # type: ignore[assignment]
6 changes: 3 additions & 3 deletions xarray/backends/netCDF4_.py

@@ -111,7 +111,7 @@ def _getitem(self, key):
             with self.datastore.lock:
                 original_array = self.get_array(needs_lock=False)
                 array = getitem(original_array, key)
-        except IndexError:
+        except IndexError as err:
             # Catch IndexError in netCDF4 and return a more informative
             # error message. This is most often called when an unsorted
             # indexer is used before the data is loaded from disk.
@@ -120,7 +120,7 @@ def _getitem(self, key):
                 "is not valid on netCDF4.Variable object. Try loading "
                 "your data into memory first by calling .load()."
             )
-            raise IndexError(msg)
+            raise IndexError(msg) from err
         return array


@@ -192,7 +192,7 @@ def _nc4_require_group(ds, group, mode, create_group=_netcdf4_create_group):
                 ds = create_group(ds, key)
             else:
                 # wrap error to provide slightly more helpful message
                raise OSError(f"group not found: {key}", e)
+                raise OSError(f"group not found: {key}", e) from e
     return ds
13 changes: 10 additions & 3 deletions xarray/backends/plugins.py

@@ -40,6 +40,7 @@ def remove_duplicates(entrypoints: EntryPoints) -> list[EntryPoint]:
             f"\n {all_module_names}.\n "
             f"The entrypoint {selected_module_name} will be used.",
             RuntimeWarning,
+            stacklevel=2,
         )
     return unique_entrypoints


@@ -72,7 +73,9 @@ def backends_dict_from_pkg(
             backend = entrypoint.load()
             backend_entrypoints[name] = backend
         except Exception as ex:
-            warnings.warn(f"Engine {name!r} loading failed:\n{ex}", RuntimeWarning)
+            warnings.warn(
+                f"Engine {name!r} loading failed:\n{ex}", RuntimeWarning, stacklevel=2
+            )
     return backend_entrypoints


@@ -146,7 +149,9 @@ def guess_engine(
         except PermissionError:
             raise
         except Exception:
-            warnings.warn(f"{engine!r} fails while guessing", RuntimeWarning)
+            warnings.warn(
+                f"{engine!r} fails while guessing", RuntimeWarning, stacklevel=2
+            )

     compatible_engines = []
     for engine, (_, backend_cls) in BACKEND_ENTRYPOINTS.items():
@@ -155,7 +160,9 @@ def guess_engine(
             if backend.guess_can_open(store_spec):
                 compatible_engines.append(engine)
         except Exception:
-            warnings.warn(f"{engine!r} fails while guessing", RuntimeWarning)
+            warnings.warn(
+                f"{engine!r} fails while guessing", RuntimeWarning, stacklevel=2
+            )

     installed_engines = [k for k in engines if k != "store"]
     if not compatible_engines:
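The `stacklevel=2` additions in this file (and in accessor_dt.py and times.py below) follow flake8-bugbear's B028: the warning is attributed to the caller of the helper rather than to the `warnings.warn(...)` line inside it. A small sketch with illustrative names, not code from the commit:

    import warnings

    def load_engine(name):
        # stacklevel=2 reports the warning against the caller of load_engine,
        # which is the line a user can actually act on.
        warnings.warn(f"Engine {name!r} loading failed", RuntimeWarning, stacklevel=2)

    def user_code():
        load_engine("netcdf4")  # the warning points at this line

    user_code()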
6 changes: 4 additions & 2 deletions xarray/backends/scipy_.py

@@ -114,7 +114,9 @@ def _open_scipy_netcdf(filename, mode, mmap, version):
         # TODO: gzipped loading only works with NetCDF3 files.
         errmsg = e.args[0]
         if "is not a valid NetCDF 3 file" in errmsg:
-            raise ValueError("gzipped file loading only supports NetCDF 3 files.")
+            raise ValueError(
+                "gzipped file loading only supports NetCDF 3 files."
+            ) from e
         else:
             raise

@@ -134,7 +136,7 @@ def _open_scipy_netcdf(filename, mode, mmap, version):
             $ pip install netcdf4
             """
             errmsg += msg
-            raise TypeError(errmsg)
+            raise TypeError(errmsg) from e
         else:
             raise
8 changes: 5 additions & 3 deletions xarray/backends/zarr.py

@@ -577,7 +577,7 @@ def get_attrs(self):
     def get_dimensions(self):
         try_nczarr = self._mode == "r"
         dimensions = {}
-        for k, v in self.zarr_group.arrays():
+        for _k, v in self.zarr_group.arrays():
             dim_names, _ = _get_zarr_dims_and_attrs(v, DIMENSION_KEY, try_nczarr)
             for d, s in zip(dim_names, v.shape, strict=True):
                 if d in dimensions and dimensions[d] != s:
@@ -1374,8 +1374,10 @@ def _get_open_params(
                     RuntimeWarning,
                     stacklevel=stacklevel,
                 )
-        except zarr.errors.GroupNotFoundError:
-            raise FileNotFoundError(f"No such file or directory: '{store}'")
+        except zarr.errors.GroupNotFoundError as err:
+            raise FileNotFoundError(
+                f"No such file or directory: '{store}'"
+            ) from err
     elif consolidated:
         # TODO: an option to pass the metadata_key keyword
         zarr_group = zarr.open_consolidated(store, **open_kwargs)
12 changes: 6 additions & 6 deletions xarray/coding/cftimeindex.py

@@ -439,8 +439,8 @@ def _get_string_slice(self, key):
         parsed, resolution = _parse_iso8601_with_reso(self.date_type, key)
         try:
             loc = self._partial_date_slice(resolution, parsed)
-        except KeyError:
-            raise KeyError(key)
+        except KeyError as err:
+            raise KeyError(key) from err
         return loc

     def _get_nearest_indexer(self, target, limit, tolerance):
@@ -593,21 +593,21 @@ def __sub__(self, other):
         if _contains_cftime_datetimes(np.array(other)):
             try:
                 return pd.TimedeltaIndex(np.array(self) - np.array(other))
-            except OUT_OF_BOUNDS_TIMEDELTA_ERRORS:
+            except OUT_OF_BOUNDS_TIMEDELTA_ERRORS as err:
                 raise ValueError(
                     "The time difference exceeds the range of values "
                     "that can be expressed at the nanosecond resolution."
-                )
+                ) from err
         return NotImplemented

     def __rsub__(self, other):
         try:
             return pd.TimedeltaIndex(other - np.array(self))
-        except OUT_OF_BOUNDS_TIMEDELTA_ERRORS:
+        except OUT_OF_BOUNDS_TIMEDELTA_ERRORS as err:
             raise ValueError(
                 "The time difference exceeds the range of values "
                 "that can be expressed at the nanosecond resolution."
-            )
+            ) from err

     def to_datetimeindex(self, unsafe=False):
         """If possible, convert this index to a pandas.DatetimeIndex.
23 changes: 16 additions & 7 deletions xarray/coding/times.py

@@ -169,7 +169,7 @@ def _ensure_padded_year(ref_date: str) -> str:
         "To remove this message, remove the ambiguity by padding your reference "
         "date strings with zeros."
     )
-    warnings.warn(warning_msg, SerializationWarning)
+    warnings.warn(warning_msg, SerializationWarning, stacklevel=2)

     return ref_date_padded


@@ -216,7 +216,7 @@ def _decode_cf_datetime_dtype(

     try:
         result = decode_cf_datetime(example_value, units, calendar, use_cftime)
-    except Exception:
+    except Exception as err:
         calendar_msg = (
             "the default calendar" if calendar is None else f"calendar {calendar!r}"
         )
@@ -225,7 +225,7 @@ def _decode_cf_datetime_dtype(
             "opening your dataset with decode_times=False or installing cftime "
             "if it is not installed."
         )
-        raise ValueError(msg)
+        raise ValueError(msg) from err
     else:
         dtype = getattr(result, "dtype", np.dtype("object"))

@@ -254,16 +254,25 @@ def _decode_datetime_with_pandas(
             "pandas."
         )

+    # Work around pandas.to_timedelta issue with dtypes smaller than int64 and
+    # NumPy 2.0 by casting all int and uint data to int64 and uint64,
+    # respectively. See https://github.com/pandas-dev/pandas/issues/56996 for
+    # more details.
+    if flat_num_dates.dtype.kind == "i":
+        flat_num_dates = flat_num_dates.astype(np.int64)
+    elif flat_num_dates.dtype.kind == "u":
+        flat_num_dates = flat_num_dates.astype(np.uint64)
+
     time_units, ref_date_str = _unpack_netcdf_time_units(units)
     time_units = _netcdf_to_numpy_timeunit(time_units)
     try:
         # TODO: the strict enforcement of nanosecond precision Timestamps can be
         # relaxed when addressing GitHub issue #7493.
         ref_date = nanosecond_precision_timestamp(ref_date_str)
-    except ValueError:
+    except ValueError as err:
         # ValueError is raised by pd.Timestamp for non-ISO timestamp
         # strings, in which case we fall back to using cftime
-        raise OutOfBoundsDatetime
+        raise OutOfBoundsDatetime from err

     with warnings.catch_warnings():
         warnings.filterwarnings("ignore", "invalid value encountered", RuntimeWarning)
@@ -488,7 +497,7 @@ def cftime_to_nptime(times, raise_on_invalid: bool = True) -> np.ndarray:
                 raise ValueError(
                     f"Cannot convert date {t} to a date in the "
                     f"standard calendar. Reason: {e}."
-                )
+                ) from e
             else:
                 dt = "NaT"
         new[i] = np.datetime64(dt)
@@ -521,7 +530,7 @@ def convert_times(times, date_type, raise_on_invalid: bool = True) -> np.ndarray:
                 raise ValueError(
                     f"Cannot convert date {t} to a date in the "
                     f"{date_type(2000, 1, 1).calendar} calendar. Reason: {e}."
-                )
+                ) from e
             else:
                 dt = np.nan
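The cast added in `_decode_datetime_with_pandas` above can be reproduced in isolation. A standalone sketch of the failure mode it avoids, assuming NumPy 2.0 promotion rules (not the exact xarray code path):

    import numpy as np

    # Under NumPy 2.0 (NEP 50), multiplying an int32 array by a large Python
    # int keeps the int32 dtype, so converting "days since ..." counts to
    # nanoseconds overflows instead of silently promoting to int64.
    num_dates = np.array([0, 1, 2], dtype=np.int32)
    NS_PER_DAY = 86_400_000_000_000  # nanoseconds per day

    # The same dtype-kind check as the patch widens the values first:
    if num_dates.dtype.kind == "i":
        num_dates = num_dates.astype(np.int64)
    elif num_dates.dtype.kind == "u":
        num_dates = num_dates.astype(np.uint64)

    print(num_dates * NS_PER_DAY)  # [0 86400000000000 172800000000000]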
1 change: 1 addition & 0 deletions xarray/core/accessor_dt.py

@@ -454,6 +454,7 @@ def weekofyear(self) -> DataArray:
             "dt.weekofyear and dt.week have been deprecated. Please use "
             "dt.isocalendar().week instead.",
             FutureWarning,
+            stacklevel=2,
         )

         weekofyear = self.isocalendar().week
2 changes: 1 addition & 1 deletion xarray/core/combine.py

@@ -308,7 +308,7 @@ def _combine_1d(
                     "xarray.combine_by_coords, or do it manually "
                     "with xarray.concat, xarray.merge and "
                     "xarray.align"
-                )
+                ) from err
             else:
                 raise
     else:
(42 additional changed files not shown)
