From 2a6212e1255ea56065ec1bfad8d484fbdad33945 Mon Sep 17 00:00:00 2001 From: joseph nowak Date: Sat, 21 Sep 2024 21:30:32 -0400 Subject: [PATCH 01/15] Improve safe chunk validation (#9527) * fix safe chunks validation * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix safe chunks validation * Update xarray/tests/test_backends.py Co-authored-by: Maximilian Roos <5635139+max-sixty@users.noreply.github.com> * The validation of the chunks now is able to detect full or partial chunk and raise a proper error based on the mode selected, it is also possible to use the auto region detection with the mode "a" * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * The test_extract_zarr_variable_encoding does not need to use the region parameter * Inline the code of the allow_partial_chunks and end, document the parameter in order on the extract_zarr_variable_encoding method, raise the correct error if the border size is smaller than the zchunk on mode equal to r+ * Inline the code of the allow_partial_chunks and end, document the parameter in order on the extract_zarr_variable_encoding method, raise the correct error if the border size is smaller than the zchunk on mode equal to r+ * Now the mode r+ is able to update the last chunk of Zarr even if it is not "complete" * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Now the mode r+ is able to update the last chunk of Zarr even if it is not "complete" * Add a typehint to the modes to avoid issues with mypy --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Maximilian Roos <5635139+max-sixty@users.noreply.github.com> --- doc/whats-new.rst | 4 +- xarray/backends/zarr.py | 168 +++++++++++++++++++++++---------- xarray/core/dataarray.py | 8 ++ xarray/core/dataset.py | 8 ++ xarray/tests/test_backends.py | 169 ++++++++++++++++++++++++++++++++-- 5 files changed, 303 insertions(+), 54 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index e4b2a06a3e7..89c8d3b4599 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -58,7 +58,9 @@ Bug fixes `_. - Fix a few bugs affecting groupby reductions with `flox`. (:issue:`8090`, :issue:`9398`). By `Deepak Cherian `_. - +- Fix the safe_chunks validation option on the to_zarr method + (:issue:`5511`, :pull:`9513`). By `Joseph Nowak + `_. Documentation ~~~~~~~~~~~~~ diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 5a6b043eef8..2c6b50b3589 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -112,7 +112,9 @@ def __getitem__(self, key): # could possibly have a work-around for 0d data here -def _determine_zarr_chunks(enc_chunks, var_chunks, ndim, name, safe_chunks): +def _determine_zarr_chunks( + enc_chunks, var_chunks, ndim, name, safe_chunks, region, mode, shape +): """ Given encoding chunks (possibly None or []) and variable chunks (possibly None or []). 
@@ -163,7 +165,9 @@ def _determine_zarr_chunks(enc_chunks, var_chunks, ndim, name, safe_chunks):
 if len(enc_chunks_tuple) != ndim:
 # throw away encoding chunks, start over
- return _determine_zarr_chunks(None, var_chunks, ndim, name, safe_chunks)
+ return _determine_zarr_chunks(
+ None, var_chunks, ndim, name, safe_chunks, region, mode, shape
+ )
 for x in enc_chunks_tuple:
 if not isinstance(x, int):
@@ -189,20 +193,59 @@ def _determine_zarr_chunks(enc_chunks, var_chunks, ndim, name, safe_chunks):
 # TODO: incorporate synchronizer to allow writes from multiple dask
 # threads
 if var_chunks and enc_chunks_tuple:
- for zchunk, dchunks in zip(enc_chunks_tuple, var_chunks, strict=True):
- for dchunk in dchunks[:-1]:
+ # If it is possible to write on partial chunks then it is not necessary to check
+ # the last one contained in the region
+ allow_partial_chunks = mode != "r+"
+
+ base_error = (
+ f"Specified zarr chunks encoding['chunks']={enc_chunks_tuple!r} for "
+ f"variable named {name!r} would overlap multiple dask chunks {var_chunks!r} "
+ f"on the region {region}. "
+ f"Writing this array in parallel with dask could lead to corrupted data. "
+ f"Consider either rechunking using `chunk()`, deleting "
+ f"or modifying `encoding['chunks']`, or specify `safe_chunks=False`."
+ )
+
+ for zchunk, dchunks, interval, size in zip(
+ enc_chunks_tuple, var_chunks, region, shape, strict=True
+ ):
+ if not safe_chunks:
+ continue
+
+ for dchunk in dchunks[1:-1]:
 if dchunk % zchunk:
- base_error = (
- f"Specified zarr chunks encoding['chunks']={enc_chunks_tuple!r} for "
- f"variable named {name!r} would overlap multiple dask chunks {var_chunks!r}. "
- f"Writing this array in parallel with dask could lead to corrupted data."
- )
- if safe_chunks:
- raise ValueError(
- base_error
- + " Consider either rechunking using `chunk()`, deleting "
- "or modifying `encoding['chunks']`, or specify `safe_chunks=False`."
- )
+ raise ValueError(base_error)
+
+ region_start = interval.start if interval.start else 0
+
+ if len(dchunks) > 1:
+ # The first border size is the amount of data that needs to be updated on the
+ # first chunk taking into account the region slice.
+ first_border_size = zchunk
+ if allow_partial_chunks:
+ first_border_size = zchunk - region_start % zchunk
+
+ if (dchunks[0] - first_border_size) % zchunk:
+ raise ValueError(base_error)
+
+ if not allow_partial_chunks:
+ chunk_start = sum(dchunks[:-1]) + region_start
+ if chunk_start % zchunk:
+ # The last chunk which can also be the only one is a partial chunk
+ # if it is not aligned at the beginning
+ raise ValueError(base_error)
+
+ region_stop = interval.stop if interval.stop else size
+
+ if size - region_stop + 1 < zchunk:
+ # If the region is covering the last chunk then check
+ # if the remainder with the default chunk size
+ # is equal to the size of the last chunk
+ if dchunks[-1] % zchunk != size % zchunk:
+ raise ValueError(base_error)
+ elif dchunks[-1] % zchunk:
+ raise ValueError(base_error)
+
 return enc_chunks_tuple
 raise AssertionError("We should never get here. 
Function logic must be wrong.") @@ -243,7 +286,14 @@ def _get_zarr_dims_and_attrs(zarr_obj, dimension_key, try_nczarr): def extract_zarr_variable_encoding( - variable, raise_on_invalid=False, name=None, safe_chunks=True + variable, + raise_on_invalid=False, + name=None, + *, + safe_chunks=True, + region=None, + mode=None, + shape=None, ): """ Extract zarr encoding dictionary from xarray Variable @@ -252,12 +302,18 @@ def extract_zarr_variable_encoding( ---------- variable : Variable raise_on_invalid : bool, optional - + name: str | Hashable, optional + safe_chunks: bool, optional + region: tuple[slice, ...], optional + mode: str, optional + shape: tuple[int, ...], optional Returns ------- encoding : dict Zarr encoding for `variable` """ + + shape = shape if shape else variable.shape encoding = variable.encoding.copy() safe_to_drop = {"source", "original_shape"} @@ -285,7 +341,14 @@ def extract_zarr_variable_encoding( del encoding[k] chunks = _determine_zarr_chunks( - encoding.get("chunks"), variable.chunks, variable.ndim, name, safe_chunks + enc_chunks=encoding.get("chunks"), + var_chunks=variable.chunks, + ndim=variable.ndim, + name=name, + safe_chunks=safe_chunks, + region=region, + mode=mode, + shape=shape, ) encoding["chunks"] = chunks return encoding @@ -762,16 +825,10 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=No if v.encoding == {"_FillValue": None} and fill_value is None: v.encoding = {} - # We need to do this for both new and existing variables to ensure we're not - # writing to a partial chunk, even though we don't use the `encoding` value - # when writing to an existing variable. See - # https://github.com/pydata/xarray/issues/8371 for details. - encoding = extract_zarr_variable_encoding( - v, - raise_on_invalid=vn in check_encoding_set, - name=vn, - safe_chunks=self._safe_chunks, - ) + zarr_array = None + zarr_shape = None + write_region = self._write_region if self._write_region is not None else {} + write_region = {dim: write_region.get(dim, slice(None)) for dim in dims} if name in existing_keys: # existing variable @@ -801,7 +858,40 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=No ) else: zarr_array = self.zarr_group[name] - else: + + if self._append_dim is not None and self._append_dim in dims: + # resize existing variable + append_axis = dims.index(self._append_dim) + assert write_region[self._append_dim] == slice(None) + write_region[self._append_dim] = slice( + zarr_array.shape[append_axis], None + ) + + new_shape = list(zarr_array.shape) + new_shape[append_axis] += v.shape[append_axis] + zarr_array.resize(new_shape) + + zarr_shape = zarr_array.shape + + region = tuple(write_region[dim] for dim in dims) + + # We need to do this for both new and existing variables to ensure we're not + # writing to a partial chunk, even though we don't use the `encoding` value + # when writing to an existing variable. See + # https://github.com/pydata/xarray/issues/8371 for details. + # Note: Ideally there should be two functions, one for validating the chunks and + # another one for extracting the encoding. 
+ encoding = extract_zarr_variable_encoding(
+ v,
+ raise_on_invalid=vn in check_encoding_set,
+ name=vn,
+ safe_chunks=self._safe_chunks,
+ region=region,
+ mode=self._mode,
+ shape=zarr_shape,
+ )
+
+ if name not in existing_keys:
 # new variable
 encoded_attrs = {}
 # the magic for storing the hidden dimension data
@@ -833,22 +923,6 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=No
 )
 zarr_array = _put_attrs(zarr_array, encoded_attrs)
- write_region = self._write_region if self._write_region is not None else {}
- write_region = {dim: write_region.get(dim, slice(None)) for dim in dims}
-
- if self._append_dim is not None and self._append_dim in dims:
- # resize existing variable
- append_axis = dims.index(self._append_dim)
- assert write_region[self._append_dim] == slice(None)
- write_region[self._append_dim] = slice(
- zarr_array.shape[append_axis], None
- )
-
- new_shape = list(zarr_array.shape)
- new_shape[append_axis] += v.shape[append_axis]
- zarr_array.resize(new_shape)
-
- region = tuple(write_region[dim] for dim in dims)
 writer.add(v.data, zarr_array, region)
 def close(self) -> None:
@@ -897,9 +971,9 @@ def _validate_and_autodetect_region(self, ds) -> None:
 if not isinstance(region, dict):
 raise TypeError(f"``region`` must be a dict, got {type(region)}")
 if any(v == "auto" for v in region.values()):
- if self._mode != "r+":
+ if self._mode not in ["r+", "a"]:
 raise ValueError(
- f"``mode`` must be 'r+' when using ``region='auto'``, got {self._mode!r}"
+ f"``mode`` must be 'r+' or 'a' when using ``region='auto'``, got {self._mode!r}"
 )
 region = self._auto_detect_regions(ds, region)
diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py
index b5441fc273a..bcc57acd316 100644
--- a/xarray/core/dataarray.py
+++ b/xarray/core/dataarray.py
@@ -4316,6 +4316,14 @@ def to_zarr(
 if Zarr arrays are written in parallel. This option may be useful in combination
 with ``compute=False`` to initialize a Zarr store from an existing
 DataArray with arbitrary chunk structure.
+ In addition to the many-to-one relationship validation, it also detects partial
+ chunk writes when using the region parameter. These partial chunks are
+ considered unsafe in mode "r+" but safe in mode "a".
+ Note: Even with these validations it can still be unsafe to write
+ two or more chunked arrays to the same location in parallel if they are
+ not writing to independent regions; in those cases it is better to use
+ a synchronizer.
 storage_options : dict, optional
 Any additional parameters for the storage backend (ignored for local paths).
diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py
index 7b9b4819245..b1ce264cbc8 100644
--- a/xarray/core/dataset.py
+++ b/xarray/core/dataset.py
@@ -2509,6 +2509,14 @@ def to_zarr(
 if Zarr arrays are written in parallel. This option may be useful in combination
 with ``compute=False`` to initialize a Zarr from an existing Dataset with arbitrary
 chunk structure.
+ In addition to the many-to-one relationship validation, it also detects partial
+ chunk writes when using the region parameter. These partial chunks are
+ considered unsafe in mode "r+" but safe in mode "a".
+ Note: Even with these validations it can still be unsafe to write
+ two or more chunked arrays to the same location in parallel if they are
+ not writing to independent regions; in those cases it is better to use
+ a synchronizer.
 storage_options : dict, optional
 Any additional parameters for the storage backend (ignored for local paths).
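To make the new behaviour concrete before the tests that follow, here is a minimal usage sketch of how the reworked validation treats a partial last chunk under the two write modes. This is not part of the patch itself; it assumes an environment with this change plus dask and zarr installed, and the store path and array sizes are illustrative, mirroring the tests added below.

```python
import numpy as np
import xarray as xr

store = "example.zarr"

# 11 values with Dask chunks of 3, so the Zarr store is created with chunks of 3.
arr = xr.DataArray(
    np.arange(11), dims=["a"], coords={"a": np.arange(11)}, name="foo"
).chunk(a=3)
arr.to_zarr(store, mode="w")

# Writing a region whose last Dask chunk only partially fills a Zarr chunk is
# accepted in mode "a" (region="auto" is now allowed with "a" as well) ...
arr.isel(a=slice(3, 8)).to_zarr(store, region="auto", mode="a")

# ... but the same write is rejected in mode "r+", where partial chunks are
# considered unsafe when safe_chunks=True (the default).
try:
    arr.isel(a=slice(3, 8)).to_zarr(store, region="auto", mode="r+")
except ValueError as err:
    print("rejected:", err)
```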
diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index cbc0b9e019d..ccf1bc73dd6 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -5989,9 +5989,10 @@ def test_zarr_region_append(self, tmp_path): } ) - # Don't allow auto region detection in append mode due to complexities in - # implementing the overlap logic and lack of safety with parallel writes - with pytest.raises(ValueError): + # Now it is valid to use auto region detection with the append mode, + # but it is still unsafe to modify dimensions or metadata using the region + # parameter. + with pytest.raises(KeyError): ds_new.to_zarr( tmp_path / "test.zarr", mode="a", append_dim="x", region="auto" ) @@ -6096,6 +6097,162 @@ def test_zarr_region_chunk_partial_offset(tmp_path): store, safe_chunks=False, region="auto" ) - # This write is unsafe, and should raise an error, but does not. - # with pytest.raises(ValueError): - # da.isel(x=slice(5, 25)).chunk(x=(10, 10)).to_zarr(store, region="auto") + with pytest.raises(ValueError): + da.isel(x=slice(5, 25)).chunk(x=(10, 10)).to_zarr(store, region="auto") + + +@requires_zarr +@requires_dask +def test_zarr_safe_chunk_append_dim(tmp_path): + store = tmp_path / "foo.zarr" + data = np.ones((20,)) + da = xr.DataArray(data, dims=["x"], coords={"x": range(20)}, name="foo").chunk(x=5) + + da.isel(x=slice(0, 7)).to_zarr(store, safe_chunks=True, mode="w") + with pytest.raises(ValueError): + # If the first chunk is smaller than the border size then raise an error + da.isel(x=slice(7, 11)).chunk(x=(2, 2)).to_zarr( + store, append_dim="x", safe_chunks=True + ) + + da.isel(x=slice(0, 7)).to_zarr(store, safe_chunks=True, mode="w") + # If the first chunk is of the size of the border size then it is valid + da.isel(x=slice(7, 11)).chunk(x=(3, 1)).to_zarr( + store, safe_chunks=True, append_dim="x" + ) + assert xr.open_zarr(store)["foo"].equals(da.isel(x=slice(0, 11))) + + da.isel(x=slice(0, 7)).to_zarr(store, safe_chunks=True, mode="w") + # If the first chunk is of the size of the border size + N * zchunk then it is valid + da.isel(x=slice(7, 17)).chunk(x=(8, 2)).to_zarr( + store, safe_chunks=True, append_dim="x" + ) + assert xr.open_zarr(store)["foo"].equals(da.isel(x=slice(0, 17))) + + da.isel(x=slice(0, 7)).to_zarr(store, safe_chunks=True, mode="w") + with pytest.raises(ValueError): + # If the first chunk is valid but the other are not then raise an error + da.isel(x=slice(7, 14)).chunk(x=(3, 3, 1)).to_zarr( + store, append_dim="x", safe_chunks=True + ) + + da.isel(x=slice(0, 7)).to_zarr(store, safe_chunks=True, mode="w") + with pytest.raises(ValueError): + # If the first chunk have a size bigger than the border size but not enough + # to complete the size of the next chunk then an error must be raised + da.isel(x=slice(7, 14)).chunk(x=(4, 3)).to_zarr( + store, append_dim="x", safe_chunks=True + ) + + da.isel(x=slice(0, 7)).to_zarr(store, safe_chunks=True, mode="w") + # Append with a single chunk it's totally valid, + # and it does not matter the size of the chunk + da.isel(x=slice(7, 19)).chunk(x=-1).to_zarr(store, append_dim="x", safe_chunks=True) + assert xr.open_zarr(store)["foo"].equals(da.isel(x=slice(0, 19))) + + +@requires_zarr +@requires_dask +def test_zarr_safe_chunk_region(tmp_path): + store = tmp_path / "foo.zarr" + + arr = xr.DataArray( + list(range(11)), dims=["a"], coords={"a": list(range(11))}, name="foo" + ).chunk(a=3) + arr.to_zarr(store, mode="w") + + modes: list[Literal["r+", "a"]] = ["r+", "a"] + for mode in modes: + with 
pytest.raises(ValueError): + # There are two Dask chunks on the same Zarr chunk, + # which means that it is unsafe in any mode + arr.isel(a=slice(0, 3)).chunk(a=(2, 1)).to_zarr( + store, region="auto", mode=mode + ) + + with pytest.raises(ValueError): + # the first chunk is covering the border size, but it is not + # completely covering the second chunk, which means that it is + # unsafe in any mode + arr.isel(a=slice(1, 5)).chunk(a=(3, 1)).to_zarr( + store, region="auto", mode=mode + ) + + with pytest.raises(ValueError): + # The first chunk is safe but the other two chunks are overlapping with + # the same Zarr chunk + arr.isel(a=slice(0, 5)).chunk(a=(3, 1, 1)).to_zarr( + store, region="auto", mode=mode + ) + + # Fully update two contiguous chunks is safe in any mode + arr.isel(a=slice(3, 9)).to_zarr(store, region="auto", mode=mode) + + # The last chunk is considered full based on their current size (2) + arr.isel(a=slice(9, 11)).to_zarr(store, region="auto", mode=mode) + arr.isel(a=slice(6, None)).chunk(a=-1).to_zarr(store, region="auto", mode=mode) + + # Write the last chunk of a region partially is safe in "a" mode + arr.isel(a=slice(3, 8)).to_zarr(store, region="auto", mode="a") + with pytest.raises(ValueError): + # with "r+" mode it is invalid to write partial chunk + arr.isel(a=slice(3, 8)).to_zarr(store, region="auto", mode="r+") + + # This is safe with mode "a", the border size is covered by the first chunk of Dask + arr.isel(a=slice(1, 4)).chunk(a=(2, 1)).to_zarr(store, region="auto", mode="a") + with pytest.raises(ValueError): + # This is considered unsafe in mode "r+" because it is writing in a partial chunk + arr.isel(a=slice(1, 4)).chunk(a=(2, 1)).to_zarr(store, region="auto", mode="r+") + + # This is safe on mode "a" because there is a single dask chunk + arr.isel(a=slice(1, 5)).chunk(a=(4,)).to_zarr(store, region="auto", mode="a") + with pytest.raises(ValueError): + # This is unsafe on mode "r+", because the Dask chunk is partially writing + # in the first chunk of Zarr + arr.isel(a=slice(1, 5)).chunk(a=(4,)).to_zarr(store, region="auto", mode="r+") + + # The first chunk is completely covering the first Zarr chunk + # and the last chunk is a partial one + arr.isel(a=slice(0, 5)).chunk(a=(3, 2)).to_zarr(store, region="auto", mode="a") + + with pytest.raises(ValueError): + # The last chunk is partial, so it is considered unsafe on mode "r+" + arr.isel(a=slice(0, 5)).chunk(a=(3, 2)).to_zarr(store, region="auto", mode="r+") + + # The first chunk is covering the border size (2 elements) + # and also the second chunk (3 elements), so it is valid + arr.isel(a=slice(1, 8)).chunk(a=(5, 2)).to_zarr(store, region="auto", mode="a") + + with pytest.raises(ValueError): + # The first chunk is not fully covering the first zarr chunk + arr.isel(a=slice(1, 8)).chunk(a=(5, 2)).to_zarr(store, region="auto", mode="r+") + + with pytest.raises(ValueError): + # Validate that the border condition is not affecting the "r+" mode + arr.isel(a=slice(1, 9)).to_zarr(store, region="auto", mode="r+") + + arr.isel(a=slice(10, 11)).to_zarr(store, region="auto", mode="a") + with pytest.raises(ValueError): + # Validate that even if we write with a single Dask chunk on the last Zarr + # chunk it is still unsafe if it is not fully covering it + # (the last Zarr chunk has size 2) + arr.isel(a=slice(10, 11)).to_zarr(store, region="auto", mode="r+") + + # Validate the same as the above test but in the beginning of the last chunk + arr.isel(a=slice(9, 10)).to_zarr(store, region="auto", mode="a") + with 
pytest.raises(ValueError): + arr.isel(a=slice(9, 10)).to_zarr(store, region="auto", mode="r+") + + arr.isel(a=slice(7, None)).chunk(a=-1).to_zarr(store, region="auto", mode="a") + with pytest.raises(ValueError): + # Test that even a Dask chunk that covers the last Zarr chunk can be unsafe + # if it is partial covering other Zarr chunks + arr.isel(a=slice(7, None)).chunk(a=-1).to_zarr(store, region="auto", mode="r+") + + with pytest.raises(ValueError): + # If the chunk is of size equal to the one in the Zarr encoding, but + # it is partially writing in the first chunk then raise an error + arr.isel(a=slice(8, None)).chunk(a=3).to_zarr(store, region="auto", mode="r+") + + with pytest.raises(ValueError): + arr.isel(a=slice(5, -1)).chunk(a=5).to_zarr(store, region="auto", mode="r+") From ab84e048c2d80c82af692ec52dc37d31fc49f123 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Sun, 22 Sep 2024 15:56:22 -0700 Subject: [PATCH 02/15] Fix DataTree repr to not repeat inherited coordinates (#9532) * Fix DataTree repr to not repeat inherited coordinates Fixes GH9499 * skip failing test on Windows --- xarray/core/formatting.py | 3 +- xarray/tests/test_datatree.py | 107 +++++++++++++++++++++++++++++++++- 2 files changed, 108 insertions(+), 2 deletions(-) diff --git a/xarray/core/formatting.py b/xarray/core/formatting.py index 1cea9a7a28d..646e000a984 100644 --- a/xarray/core/formatting.py +++ b/xarray/core/formatting.py @@ -1102,7 +1102,8 @@ def _datatree_node_repr(node: DataTree, show_inherited: bool) -> str: summary.append(f"{dims_start}({dims_values})") if node._node_coord_variables: - summary.append(coords_repr(node.coords, col_width=col_width, max_rows=max_rows)) + node_coords = node.to_dataset(inherited=False).coords + summary.append(coords_repr(node_coords, col_width=col_width, max_rows=max_rows)) if show_inherited and inherited_coords: summary.append( diff --git a/xarray/tests/test_datatree.py b/xarray/tests/test_datatree.py index 4e22ba57e2e..2adf73fa5c8 100644 --- a/xarray/tests/test_datatree.py +++ b/xarray/tests/test_datatree.py @@ -1,4 +1,5 @@ import re +import sys import typing from copy import copy, deepcopy from textwrap import dedent @@ -15,6 +16,8 @@ from xarray.testing import assert_equal, assert_identical from xarray.tests import assert_array_equal, create_test_data, source_ndarray +ON_WINDOWS = sys.platform == "win32" + class TestTreeCreation: def test_empty(self): @@ -1052,7 +1055,7 @@ def test_repr_two_children(self): { "/": Dataset(coords={"x": [1.0]}), "/first_child": None, - "/second_child": Dataset({"foo": ("x", [0.0])}), + "/second_child": Dataset({"foo": ("x", [0.0])}, coords={"z": 1.0}), } ) @@ -1067,6 +1070,8 @@ def test_repr_two_children(self): ├── Group: /first_child └── Group: /second_child Dimensions: (x: 1) + Coordinates: + z float64 8B 1.0 Data variables: foo (x) float64 8B 0.0 """ @@ -1091,6 +1096,8 @@ def test_repr_two_children(self): Group: /second_child Dimensions: (x: 1) + Coordinates: + z float64 8B 1.0 Inherited coordinates: * x (x) float64 8B 1.0 Data variables: @@ -1138,6 +1145,104 @@ def test_repr_inherited_dims(self): ).strip() assert result == expected + @pytest.mark.skipif( + ON_WINDOWS, reason="windows (pre NumPy2) uses int32 instead of int64" + ) + def test_doc_example(self): + # regression test for https://github.com/pydata/xarray/issues/9499 + time = xr.DataArray(data=["2022-01", "2023-01"], dims="time") + stations = xr.DataArray(data=list("abcdef"), dims="station") + lon = [-100, -80, -60] + lat = [10, 20, 30] + # Set up fake data + wind_speed = 
xr.DataArray(np.ones((2, 6)) * 2, dims=("time", "station")) + pressure = xr.DataArray(np.ones((2, 6)) * 3, dims=("time", "station")) + air_temperature = xr.DataArray(np.ones((2, 6)) * 4, dims=("time", "station")) + dewpoint = xr.DataArray(np.ones((2, 6)) * 5, dims=("time", "station")) + infrared = xr.DataArray(np.ones((2, 3, 3)) * 6, dims=("time", "lon", "lat")) + true_color = xr.DataArray(np.ones((2, 3, 3)) * 7, dims=("time", "lon", "lat")) + tree = xr.DataTree.from_dict( + { + "/": xr.Dataset( + coords={"time": time}, + ), + "/weather": xr.Dataset( + coords={"station": stations}, + data_vars={ + "wind_speed": wind_speed, + "pressure": pressure, + }, + ), + "/weather/temperature": xr.Dataset( + data_vars={ + "air_temperature": air_temperature, + "dewpoint": dewpoint, + }, + ), + "/satellite": xr.Dataset( + coords={"lat": lat, "lon": lon}, + data_vars={ + "infrared": infrared, + "true_color": true_color, + }, + ), + }, + ) + + result = repr(tree) + expected = dedent( + """ + + Group: / + │ Dimensions: (time: 2) + │ Coordinates: + │ * time (time) + Group: /weather + │ Dimensions: (time: 2, station: 6) + │ Coordinates: + │ * station (station) str: return re.escape(dedent(message).strip()) From 01206daa06c33ea1ccf9ce06618f71796d9f3f6d Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 23 Sep 2024 09:44:06 +0200 Subject: [PATCH 03/15] Bump pypa/gh-action-pypi-publish in the actions group (#9537) Bumps the actions group with 1 update: [pypa/gh-action-pypi-publish](https://github.com/pypa/gh-action-pypi-publish). Updates `pypa/gh-action-pypi-publish` from 1.10.1 to 1.10.2 - [Release notes](https://github.com/pypa/gh-action-pypi-publish/releases) - [Commits](https://github.com/pypa/gh-action-pypi-publish/compare/v1.10.1...v1.10.2) --- updated-dependencies: - dependency-name: pypa/gh-action-pypi-publish dependency-type: direct:production update-type: version-update:semver-patch dependency-group: actions ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/pypi-release.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pypi-release.yaml b/.github/workflows/pypi-release.yaml index 2eeabf469b7..5cf48dfd1fb 100644 --- a/.github/workflows/pypi-release.yaml +++ b/.github/workflows/pypi-release.yaml @@ -88,7 +88,7 @@ jobs: path: dist - name: Publish package to TestPyPI if: github.event_name == 'push' - uses: pypa/gh-action-pypi-publish@v1.10.1 + uses: pypa/gh-action-pypi-publish@v1.10.2 with: repository_url: https://test.pypi.org/legacy/ verbose: true @@ -111,6 +111,6 @@ jobs: name: releases path: dist - name: Publish package to PyPI - uses: pypa/gh-action-pypi-publish@v1.10.1 + uses: pypa/gh-action-pypi-publish@v1.10.2 with: verbose: true From d26144db962d9d620c7f7c4e7398c2f6b3bf6d47 Mon Sep 17 00:00:00 2001 From: Justus Magin Date: Mon, 23 Sep 2024 13:59:39 +0200 Subject: [PATCH 04/15] cast `numpy` scalars to arrays in `as_compatible_data` (#9403) * also call `np.asarray` on numpy scalars * check that numpy scalars are properly casted to arrays * don't allow `numpy.ndarray` subclasses * comment on the purpose of the explicit isinstance and `np.asarray` --- xarray/core/variable.py | 6 ++++-- xarray/tests/test_variable.py | 7 ++++++- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 9b9239cc042..d8cf0fe7550 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -320,12 +320,14 @@ def convert_non_numpy_type(data): else: data = np.asarray(data) - if not isinstance(data, np.ndarray) and ( + # immediately return array-like types except `numpy.ndarray` subclasses and `numpy` scalars + if not isinstance(data, np.ndarray | np.generic) and ( hasattr(data, "__array_function__") or hasattr(data, "__array_namespace__") ): return cast("T_DuckArray", data) - # validate whether the data is valid data types. + # validate whether the data is valid data types. 
Also, explicitly cast `numpy` + # subclasses and `numpy` scalars to `numpy.ndarray` data = np.asarray(data) if data.dtype.kind in "OMm": diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index 0a55b42f228..1d430b6b27e 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -2585,7 +2585,12 @@ def test_unchanged_types(self): assert source_ndarray(x) is source_ndarray(as_compatible_data(x)) def test_converted_types(self): - for input_array in [[[0, 1, 2]], pd.DataFrame([[0, 1, 2]])]: + for input_array in [ + [[0, 1, 2]], + pd.DataFrame([[0, 1, 2]]), + np.float64(1.4), + np.str_("abc"), + ]: actual = as_compatible_data(input_array) assert_array_equal(np.asarray(input_array), actual) assert np.ndarray is type(actual) From 52f13d442748fa8d3dbfaf382c649b15162fb4e6 Mon Sep 17 00:00:00 2001 From: Maximilian Roos <5635139+max-sixty@users.noreply.github.com> Date: Mon, 23 Sep 2024 15:14:26 -0700 Subject: [PATCH 05/15] Add `.rolling_exp` onto `.rolling`'s 'See also' (#9534) Also remove the internal object, which is referenced from `Returns:` already --- xarray/core/dataset.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index b1ce264cbc8..dd860b54933 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -10621,7 +10621,7 @@ def rolling( -------- Dataset.cumulative DataArray.rolling - core.rolling.DatasetRolling + DataArray.rolling_exp """ from xarray.core.rolling import DatasetRolling @@ -10651,9 +10651,9 @@ def cumulative( See Also -------- - Dataset.rolling DataArray.cumulative - core.rolling.DatasetRolling + Dataset.rolling + Dataset.rolling_exp """ from xarray.core.rolling import DatasetRolling From ea06c6f2983a31186fe3bb7495f3505dbb56d630 Mon Sep 17 00:00:00 2001 From: Illviljan <14371165+Illviljan@users.noreply.github.com> Date: Wed, 25 Sep 2024 15:24:43 +0200 Subject: [PATCH 06/15] Update __array__ signatures with copy (#9529) * Update __array__ with copy * Update common.py * Update indexing.py * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * copy only available from np2 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Raise if copy=false * Update groupby.py * Update test_namedarray.py * Update pyproject.toml --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- pyproject.toml | 1 - xarray/core/common.py | 2 +- xarray/core/datatree.py | 5 +++- xarray/core/groupby.py | 6 ++++- xarray/core/indexing.py | 43 +++++++++++++++++++++------------ xarray/namedarray/_typing.py | 6 ++--- xarray/tests/arrays.py | 12 ++++++--- xarray/tests/test_assertions.py | 6 +++-- xarray/tests/test_formatting.py | 4 ++- xarray/tests/test_namedarray.py | 11 +++++++-- 10 files changed, 65 insertions(+), 31 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 35522d82edf..0078a346b75 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -323,7 +323,6 @@ filterwarnings = [ "default:Using a non-tuple sequence for multidimensional indexing is deprecated:FutureWarning", "default:Duplicate dimension names present:UserWarning:xarray.namedarray.core", "default:::xarray.tests.test_strategies", # TODO: remove once we know how to deal with a changed signature in protocols - "ignore:__array__ implementation doesn't accept a copy keyword, so passing copy=False failed.", ] log_cli_level = "INFO" diff --git a/xarray/core/common.py 
b/xarray/core/common.py index e4c61a1bc12..9a6807faad2 100644 --- a/xarray/core/common.py +++ b/xarray/core/common.py @@ -163,7 +163,7 @@ def __complex__(self: Any) -> complex: return complex(self.values) def __array__( - self: Any, dtype: DTypeLike | None = None, copy: bool | None = None + self: Any, dtype: np.typing.DTypeLike = None, /, *, copy: bool | None = None ) -> np.ndarray: if not copy: if np.lib.NumpyVersion(np.__version__) >= "2.0.0": diff --git a/xarray/core/datatree.py b/xarray/core/datatree.py index bd583ac86cb..65436be9038 100644 --- a/xarray/core/datatree.py +++ b/xarray/core/datatree.py @@ -55,6 +55,7 @@ from xarray.core.dataset import calculate_dimensions if TYPE_CHECKING: + import numpy as np import pandas as pd from xarray.core.datatree_io import T_DataTreeNetcdfEngine, T_DataTreeNetcdfTypes @@ -737,7 +738,9 @@ def __bool__(self) -> bool: def __iter__(self) -> Iterator[str]: return itertools.chain(self._data_variables, self._children) # type: ignore[arg-type] - def __array__(self, dtype=None, copy=None): + def __array__( + self, dtype: np.typing.DTypeLike = None, /, *, copy: bool | None = None + ) -> np.ndarray: raise TypeError( "cannot directly convert a DataTree into a " "numpy array. Instead, create an xarray.DataArray " diff --git a/xarray/core/groupby.py b/xarray/core/groupby.py index 58971435018..92f0572d37a 100644 --- a/xarray/core/groupby.py +++ b/xarray/core/groupby.py @@ -193,7 +193,11 @@ def values(self) -> range: def data(self) -> range: return range(self.size) - def __array__(self) -> np.ndarray: + def __array__( + self, dtype: np.typing.DTypeLike = None, /, *, copy: bool | None = None + ) -> np.ndarray: + if copy is False: + raise NotImplementedError(f"An array copy is necessary, got {copy = }.") return np.arange(self.size) @property diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index 67912908a2b..08b1d0be290 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -13,6 +13,7 @@ import numpy as np import pandas as pd +from packaging.version import Version from xarray.core import duck_array_ops from xarray.core.nputils import NumpyVIndexAdapter @@ -505,9 +506,14 @@ class ExplicitlyIndexed: __slots__ = () - def __array__(self, dtype: np.typing.DTypeLike = None) -> np.ndarray: + def __array__( + self, dtype: np.typing.DTypeLike = None, /, *, copy: bool | None = None + ) -> np.ndarray: # Leave casting to an array up to the underlying array type. 
- return np.asarray(self.get_duck_array(), dtype=dtype) + if Version(np.__version__) >= Version("2.0.0"): + return np.asarray(self.get_duck_array(), dtype=dtype, copy=copy) + else: + return np.asarray(self.get_duck_array(), dtype=dtype) def get_duck_array(self): return self.array @@ -520,11 +526,6 @@ def get_duck_array(self): key = BasicIndexer((slice(None),) * self.ndim) return self[key] - def __array__(self, dtype: np.typing.DTypeLike = None) -> np.ndarray: - # This is necessary because we apply the indexing key in self.get_duck_array() - # Note this is the base class for all lazy indexing classes - return np.asarray(self.get_duck_array(), dtype=dtype) - def _oindex_get(self, indexer: OuterIndexer): raise NotImplementedError( f"{self.__class__.__name__}._oindex_get method should be overridden" @@ -570,8 +571,13 @@ def __init__(self, array, indexer_cls: type[ExplicitIndexer] = BasicIndexer): self.array = as_indexable(array) self.indexer_cls = indexer_cls - def __array__(self, dtype: np.typing.DTypeLike = None) -> np.ndarray: - return np.asarray(self.get_duck_array(), dtype=dtype) + def __array__( + self, dtype: np.typing.DTypeLike = None, /, *, copy: bool | None = None + ) -> np.ndarray: + if Version(np.__version__) >= Version("2.0.0"): + return np.asarray(self.get_duck_array(), dtype=dtype, copy=copy) + else: + return np.asarray(self.get_duck_array(), dtype=dtype) def get_duck_array(self): return self.array.get_duck_array() @@ -830,9 +836,6 @@ def __init__(self, array): def _ensure_cached(self): self.array = as_indexable(self.array.get_duck_array()) - def __array__(self, dtype: np.typing.DTypeLike = None) -> np.ndarray: - return np.asarray(self.get_duck_array(), dtype=dtype) - def get_duck_array(self): self._ensure_cached() return self.array.get_duck_array() @@ -1674,7 +1677,9 @@ def __init__(self, array: pd.Index, dtype: DTypeLike = None): def dtype(self) -> np.dtype: return self._dtype - def __array__(self, dtype: DTypeLike = None) -> np.ndarray: + def __array__( + self, dtype: np.typing.DTypeLike = None, /, *, copy: bool | None = None + ) -> np.ndarray: if dtype is None: dtype = self.dtype array = self.array @@ -1682,7 +1687,11 @@ def __array__(self, dtype: DTypeLike = None) -> np.ndarray: with suppress(AttributeError): # this might not be public API array = array.astype("object") - return np.asarray(array.values, dtype=dtype) + + if Version(np.__version__) >= Version("2.0.0"): + return np.asarray(array.values, dtype=dtype, copy=copy) + else: + return np.asarray(array.values, dtype=dtype) def get_duck_array(self) -> np.ndarray: return np.asarray(self) @@ -1831,7 +1840,9 @@ def __init__( super().__init__(array, dtype) self.level = level - def __array__(self, dtype: DTypeLike = None) -> np.ndarray: + def __array__( + self, dtype: np.typing.DTypeLike = None, /, *, copy: bool | None = None + ) -> np.ndarray: if dtype is None: dtype = self.dtype if self.level is not None: @@ -1839,7 +1850,7 @@ def __array__(self, dtype: DTypeLike = None) -> np.ndarray: self.array.get_level_values(self.level).values, dtype=dtype ) else: - return super().__array__(dtype) + return super().__array__(dtype, copy=copy) def _convert_scalar(self, item): if isinstance(item, tuple) and self.level is not None: diff --git a/xarray/namedarray/_typing.py b/xarray/namedarray/_typing.py index a7d7ed7994f..90c442d2e1f 100644 --- a/xarray/namedarray/_typing.py +++ b/xarray/namedarray/_typing.py @@ -153,15 +153,15 @@ def __getitem__( @overload def __array__( - self, dtype: None = ..., /, *, copy: None | bool = ... 
+ self, dtype: None = ..., /, *, copy: bool | None = ... ) -> np.ndarray[Any, _DType_co]: ... @overload def __array__( - self, dtype: _DType, /, *, copy: None | bool = ... + self, dtype: _DType, /, *, copy: bool | None = ... ) -> np.ndarray[Any, _DType]: ... def __array__( - self, dtype: _DType | None = ..., /, *, copy: None | bool = ... + self, dtype: _DType | None = ..., /, *, copy: bool | None = ... ) -> np.ndarray[Any, _DType] | np.ndarray[Any, _DType_co]: ... # TODO: Should return the same subclass but with a new dtype generic. diff --git a/xarray/tests/arrays.py b/xarray/tests/arrays.py index 4e1e31cfa49..7373b6c75ab 100644 --- a/xarray/tests/arrays.py +++ b/xarray/tests/arrays.py @@ -24,7 +24,9 @@ def __init__(self, array): def get_duck_array(self): raise UnexpectedDataAccess("Tried accessing data") - def __array__(self, dtype: np.typing.DTypeLike = None): + def __array__( + self, dtype: np.typing.DTypeLike = None, /, *, copy: bool | None = None + ) -> np.ndarray: raise UnexpectedDataAccess("Tried accessing data") def __getitem__(self, key): @@ -49,7 +51,9 @@ def __init__(self, array: np.ndarray): def __getitem__(self, key): return type(self)(self.array[key]) - def __array__(self, dtype: np.typing.DTypeLike = None): + def __array__( + self, dtype: np.typing.DTypeLike = None, /, *, copy: bool | None = None + ) -> np.ndarray: raise UnexpectedDataAccess("Tried accessing data") def __array_namespace__(self): @@ -140,7 +144,9 @@ def __repr__(self: Any) -> str: def get_duck_array(self): raise UnexpectedDataAccess("Tried accessing data") - def __array__(self, dtype: np.typing.DTypeLike = None): + def __array__( + self, dtype: np.typing.DTypeLike = None, /, *, copy: bool | None = None + ) -> np.ndarray: raise UnexpectedDataAccess("Tried accessing data") def __getitem__(self, key) -> "ConcatenatableArray": diff --git a/xarray/tests/test_assertions.py b/xarray/tests/test_assertions.py index 3e1ce0ea266..ec4b39aaab6 100644 --- a/xarray/tests/test_assertions.py +++ b/xarray/tests/test_assertions.py @@ -173,9 +173,11 @@ def dims(self): warnings.warn("warning in test", stacklevel=2) return super().dims - def __array__(self, dtype=None, copy=None): + def __array__( + self, dtype: np.typing.DTypeLike = None, /, *, copy: bool | None = None + ) -> np.ndarray: warnings.warn("warning in test", stacklevel=2) - return super().__array__() + return super().__array__(dtype, copy=copy) a = WarningVariable("x", [1]) b = WarningVariable("x", [2]) diff --git a/xarray/tests/test_formatting.py b/xarray/tests/test_formatting.py index 4123b3e8aee..688f41a7f92 100644 --- a/xarray/tests/test_formatting.py +++ b/xarray/tests/test_formatting.py @@ -942,7 +942,9 @@ def test_lazy_array_wont_compute() -> None: from xarray.core.indexing import LazilyIndexedArray class LazilyIndexedArrayNotComputable(LazilyIndexedArray): - def __array__(self, dtype=None, copy=None): + def __array__( + self, dtype: np.typing.DTypeLike = None, /, *, copy: bool | None = None + ) -> np.ndarray: raise NotImplementedError("Computing this array is not possible.") arr = LazilyIndexedArrayNotComputable(np.array([1, 2])) diff --git a/xarray/tests/test_namedarray.py b/xarray/tests/test_namedarray.py index 8ccf8c541b7..5e4a39ada73 100644 --- a/xarray/tests/test_namedarray.py +++ b/xarray/tests/test_namedarray.py @@ -8,6 +8,7 @@ import numpy as np import pytest +from packaging.version import Version from xarray.core.indexing import ExplicitlyIndexed from xarray.namedarray._typing import ( @@ -53,8 +54,14 @@ def shape(self) -> _Shape: class CustomArray( 
CustomArrayBase[_ShapeType_co, _DType_co], Generic[_ShapeType_co, _DType_co] ): - def __array__(self) -> np.ndarray[Any, np.dtype[np.generic]]: - return np.array(self.array) + def __array__( + self, dtype: np.typing.DTypeLike = None, /, *, copy: bool | None = None + ) -> np.ndarray[Any, np.dtype[np.generic]]: + + if Version(np.__version__) >= Version("2.0.0"): + return np.asarray(self.array, dtype=dtype, copy=copy) + else: + return np.asarray(self.array, dtype=dtype) class CustomArrayIndexable( From e239389611ca27d9131a044eeef7ab31edd083ef Mon Sep 17 00:00:00 2001 From: Holly Mandel Date: Wed, 25 Sep 2024 12:38:54 -0700 Subject: [PATCH 07/15] Support vectorized interpolation with more scipy interpolators (#9526) * vectorize 1d interpolators * whats new * formatting --- doc/whats-new.rst | 3 ++ xarray/core/dataarray.py | 10 ++-- xarray/core/dataset.py | 10 ++-- xarray/core/missing.py | 37 +++++++++----- xarray/core/types.py | 4 +- xarray/tests/__init__.py | 1 + xarray/tests/test_interp.py | 97 +++++++++++++++++++++++++----------- xarray/tests/test_missing.py | 6 ++- 8 files changed, 115 insertions(+), 53 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 89c8d3b4599..9cf630acea1 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -32,6 +32,9 @@ New Features `Tom Nicholas `_. - Added zarr backends for :py:func:`open_groups` (:issue:`9430`, :pull:`9469`). By `Eni Awowale `_. +- Added support for vectorized interpolation using additional interpolators + from the ``scipy.interpolate`` module (:issue:`9049`, :pull:`9526`). + By `Holly Mandel `_. Breaking changes ~~~~~~~~~~~~~~~~ diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index bcc57acd316..2adf862f1fd 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -2224,12 +2224,12 @@ def interp( Performs univariate or multivariate interpolation of a DataArray onto new coordinates using scipy's interpolation routines. If interpolating - along an existing dimension, :py:class:`scipy.interpolate.interp1d` is - called. When interpolating along multiple existing dimensions, an + along an existing dimension, either :py:class:`scipy.interpolate.interp1d` + or a 1-dimensional scipy interpolator (e.g. :py:class:`scipy.interpolate.KroghInterpolator`) + is called. When interpolating along multiple existing dimensions, an attempt is made to decompose the interpolation into multiple - 1-dimensional interpolations. If this is possible, - :py:class:`scipy.interpolate.interp1d` is called. Otherwise, - :py:func:`scipy.interpolate.interpn` is called. + 1-dimensional interpolations. If this is possible, the 1-dimensional interpolator is called. + Otherwise, :py:func:`scipy.interpolate.interpn` is called. Parameters ---------- diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index dd860b54933..82b60d7abc8 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -3885,12 +3885,12 @@ def interp( Performs univariate or multivariate interpolation of a Dataset onto new coordinates using scipy's interpolation routines. If interpolating - along an existing dimension, :py:class:`scipy.interpolate.interp1d` is - called. When interpolating along multiple existing dimensions, an + along an existing dimension, either :py:class:`scipy.interpolate.interp1d` + or a 1-dimensional scipy interpolator (e.g. :py:class:`scipy.interpolate.KroghInterpolator`) + is called. 
When interpolating along multiple existing dimensions, an attempt is made to decompose the interpolation into multiple - 1-dimensional interpolations. If this is possible, - :py:class:`scipy.interpolate.interp1d` is called. Otherwise, - :py:func:`scipy.interpolate.interpn` is called. + 1-dimensional interpolations. If this is possible, the 1-dimensional interpolator + is called. Otherwise, :py:func:`scipy.interpolate.interpn` is called. Parameters ---------- diff --git a/xarray/core/missing.py b/xarray/core/missing.py index 2df53b172f0..6a380f53f0d 100644 --- a/xarray/core/missing.py +++ b/xarray/core/missing.py @@ -138,6 +138,7 @@ def __init__( copy=False, bounds_error=False, order=None, + axis=-1, **kwargs, ): from scipy.interpolate import interp1d @@ -173,6 +174,7 @@ def __init__( bounds_error=bounds_error, assume_sorted=assume_sorted, copy=copy, + axis=axis, **self.cons_kwargs, ) @@ -479,7 +481,8 @@ def _get_interpolator( interp1d_methods = get_args(Interp1dOptions) valid_methods = tuple(vv for v in get_args(InterpOptions) for vv in get_args(v)) - # prioritize scipy.interpolate + # prefer numpy.interp for 1d linear interpolation. This function cannot + # take higher dimensional data but scipy.interp1d can. if ( method == "linear" and not kwargs.get("fill_value", None) == "extrapolate" @@ -492,21 +495,33 @@ def _get_interpolator( if method in interp1d_methods: kwargs.update(method=method) interp_class = ScipyInterpolator - elif vectorizeable_only: - raise ValueError( - f"{method} is not a vectorizeable interpolator. " - f"Available methods are {interp1d_methods}" - ) elif method == "barycentric": + kwargs.update(axis=-1) interp_class = _import_interpolant("BarycentricInterpolator", method) elif method in ["krogh", "krog"]: + kwargs.update(axis=-1) interp_class = _import_interpolant("KroghInterpolator", method) elif method == "pchip": + kwargs.update(axis=-1) interp_class = _import_interpolant("PchipInterpolator", method) elif method == "spline": + utils.emit_user_level_warning( + "The 1d SplineInterpolator class is performing an incorrect calculation and " + "is being deprecated. Please use `method=polynomial` for 1D Spline Interpolation.", + PendingDeprecationWarning, + ) + if vectorizeable_only: + raise ValueError(f"{method} is not a vectorizeable interpolator. 
") kwargs.update(method=method) interp_class = SplineInterpolator elif method == "akima": + kwargs.update(axis=-1) + interp_class = _import_interpolant("Akima1DInterpolator", method) + elif method == "makima": + kwargs.update(method="makima", axis=-1) + interp_class = _import_interpolant("Akima1DInterpolator", method) + elif method == "makima": + kwargs.update(method="makima", axis=-1) interp_class = _import_interpolant("Akima1DInterpolator", method) else: raise ValueError(f"{method} is not a valid scipy interpolator") @@ -525,6 +540,7 @@ def _get_interpolator_nd(method, **kwargs): if method in valid_methods: kwargs.update(method=method) + kwargs.setdefault("bounds_error", False) interp_class = _import_interpolant("interpn", method) else: raise ValueError( @@ -614,9 +630,6 @@ def interp(var, indexes_coords, method: InterpOptions, **kwargs): if not indexes_coords: return var.copy() - # default behavior - kwargs["bounds_error"] = kwargs.get("bounds_error", False) - result = var # decompose the interpolation into a succession of independent interpolation for indep_indexes_coords in decompose_interp(indexes_coords): @@ -663,8 +676,8 @@ def interp_func(var, x, new_x, method: InterpOptions, kwargs): new_x : a list of 1d array New coordinates. Should not contain NaN. method : string - {'linear', 'nearest', 'zero', 'slinear', 'quadratic', 'cubic'} for - 1-dimensional interpolation. + {'linear', 'nearest', 'zero', 'slinear', 'quadratic', 'cubic', 'pchip', 'akima', + 'makima', 'barycentric', 'krogh'} for 1-dimensional interpolation. {'linear', 'nearest'} for multidimensional interpolation **kwargs Optional keyword arguments to be passed to scipy.interpolator @@ -756,7 +769,7 @@ def interp_func(var, x, new_x, method: InterpOptions, kwargs): def _interp1d(var, x, new_x, func, kwargs): # x, new_x are tuples of size 1. 
x, new_x = x[0], new_x[0] - rslt = func(x, var, assume_sorted=True, **kwargs)(np.ravel(new_x)) + rslt = func(x, var, **kwargs)(np.ravel(new_x)) if new_x.ndim > 1: return reshape(rslt, (var.shape[:-1] + new_x.shape)) if new_x.ndim == 0: diff --git a/xarray/core/types.py b/xarray/core/types.py index 34b6029ee15..1d383d550ec 100644 --- a/xarray/core/types.py +++ b/xarray/core/types.py @@ -228,7 +228,9 @@ def copy( Interp1dOptions = Literal[ "linear", "nearest", "zero", "slinear", "quadratic", "cubic", "polynomial" ] -InterpolantOptions = Literal["barycentric", "krogh", "pchip", "spline", "akima"] +InterpolantOptions = Literal[ + "barycentric", "krogh", "pchip", "spline", "akima", "makima" +] InterpOptions = Union[Interp1dOptions, InterpolantOptions] DatetimeUnitOptions = Literal[ diff --git a/xarray/tests/__init__.py b/xarray/tests/__init__.py index 71ae1a7075f..6b54994f311 100644 --- a/xarray/tests/__init__.py +++ b/xarray/tests/__init__.py @@ -87,6 +87,7 @@ def _importorskip( has_matplotlib, requires_matplotlib = _importorskip("matplotlib") has_scipy, requires_scipy = _importorskip("scipy") +has_scipy_ge_1_13, requires_scipy_ge_1_13 = _importorskip("scipy", "1.13") with warnings.catch_warnings(): warnings.filterwarnings( "ignore", diff --git a/xarray/tests/test_interp.py b/xarray/tests/test_interp.py index 5c03881242b..d02e12dd695 100644 --- a/xarray/tests/test_interp.py +++ b/xarray/tests/test_interp.py @@ -16,6 +16,7 @@ assert_identical, has_dask, has_scipy, + has_scipy_ge_1_13, requires_cftime, requires_dask, requires_scipy, @@ -132,29 +133,66 @@ def func(obj, new_x): assert_allclose(actual, expected) -@pytest.mark.parametrize("use_dask", [False, True]) -def test_interpolate_vectorize(use_dask: bool) -> None: - if not has_scipy: - pytest.skip("scipy is not installed.") - - if not has_dask and use_dask: - pytest.skip("dask is not installed in the environment.") - +@requires_scipy +@pytest.mark.parametrize( + "use_dask, method", + ( + (False, "linear"), + (False, "akima"), + pytest.param( + False, + "makima", + marks=pytest.mark.skipif(not has_scipy_ge_1_13, reason="scipy too old"), + ), + pytest.param( + True, + "linear", + marks=pytest.mark.skipif(not has_dask, reason="dask not available"), + ), + pytest.param( + True, + "akima", + marks=pytest.mark.skipif(not has_dask, reason="dask not available"), + ), + ), +) +def test_interpolate_vectorize(use_dask: bool, method: InterpOptions) -> None: # scipy interpolation for the reference - def func(obj, dim, new_x): + def func(obj, dim, new_x, method): + scipy_kwargs = {} + interpolant_options = { + "barycentric": scipy.interpolate.BarycentricInterpolator, + "krogh": scipy.interpolate.KroghInterpolator, + "pchip": scipy.interpolate.PchipInterpolator, + "akima": scipy.interpolate.Akima1DInterpolator, + "makima": scipy.interpolate.Akima1DInterpolator, + } + shape = [s for i, s in enumerate(obj.shape) if i != obj.get_axis_num(dim)] for s in new_x.shape[::-1]: shape.insert(obj.get_axis_num(dim), s) - return scipy.interpolate.interp1d( - da[dim], - obj.data, - axis=obj.get_axis_num(dim), - bounds_error=False, - fill_value=np.nan, - )(new_x).reshape(shape) + if method in interpolant_options: + interpolant = interpolant_options[method] + if method == "makima": + scipy_kwargs["method"] = method + return interpolant( + da[dim], obj.data, axis=obj.get_axis_num(dim), **scipy_kwargs + )(new_x).reshape(shape) + else: + + return scipy.interpolate.interp1d( + da[dim], + obj.data, + axis=obj.get_axis_num(dim), + kind=method, + bounds_error=False, + 
fill_value=np.nan, + **scipy_kwargs, + )(new_x).reshape(shape) da = get_example_data(0) + if use_dask: da = da.chunk({"y": 5}) @@ -165,17 +203,17 @@ def func(obj, dim, new_x): coords={"z": np.random.randn(30), "z2": ("z", np.random.randn(30))}, ) - actual = da.interp(x=xdest, method="linear") + actual = da.interp(x=xdest, method=method) expected = xr.DataArray( - func(da, "x", xdest), + func(da, "x", xdest, method), dims=["z", "y"], coords={ "z": xdest["z"], "z2": xdest["z2"], "y": da["y"], "x": ("z", xdest.values), - "x2": ("z", func(da["x2"], "x", xdest)), + "x2": ("z", func(da["x2"], "x", xdest, method)), }, ) assert_allclose(actual, expected.transpose("z", "y", transpose_coords=True)) @@ -191,10 +229,10 @@ def func(obj, dim, new_x): }, ) - actual = da.interp(x=xdest, method="linear") + actual = da.interp(x=xdest, method=method) expected = xr.DataArray( - func(da, "x", xdest), + func(da, "x", xdest, method), dims=["z", "w", "y"], coords={ "z": xdest["z"], @@ -202,7 +240,7 @@ def func(obj, dim, new_x): "z2": xdest["z2"], "y": da["y"], "x": (("z", "w"), xdest.data), - "x2": (("z", "w"), func(da["x2"], "x", xdest)), + "x2": (("z", "w"), func(da["x2"], "x", xdest, method)), }, ) assert_allclose(actual, expected.transpose("z", "w", "y", transpose_coords=True)) @@ -393,19 +431,17 @@ def test_nans(use_dask: bool) -> None: assert actual.count() > 0 +@requires_scipy @pytest.mark.parametrize("use_dask", [True, False]) def test_errors(use_dask: bool) -> None: - if not has_scipy: - pytest.skip("scipy is not installed.") - - # akima and spline are unavailable + # spline is unavailable da = xr.DataArray([0, 1, np.nan, 2], dims="x", coords={"x": range(4)}) if not has_dask and use_dask: pytest.skip("dask is not installed in the environment.") da = da.chunk() - for method in ["akima", "spline"]: - with pytest.raises(ValueError): + for method in ["spline"]: + with pytest.raises(ValueError), pytest.warns(PendingDeprecationWarning): da.interp(x=[0.5, 1.5], method=method) # type: ignore[arg-type] # not sorted @@ -922,7 +958,10 @@ def test_interp1d_bounds_error() -> None: (("x", np.array([0, 0.5, 1, 2]), dict(unit="s")), False), ], ) -def test_coord_attrs(x, expect_same_attrs: bool) -> None: +def test_coord_attrs( + x, + expect_same_attrs: bool, +) -> None: base_attrs = dict(foo="bar") ds = xr.Dataset( data_vars=dict(a=2 * np.arange(5)), diff --git a/xarray/tests/test_missing.py b/xarray/tests/test_missing.py index bf90074a7cc..58d8a9dcf5d 100644 --- a/xarray/tests/test_missing.py +++ b/xarray/tests/test_missing.py @@ -137,7 +137,11 @@ def test_scipy_methods_function(method) -> None: # Note: Pandas does some wacky things with these methods and the full # integration tests won't work. 
da, _ = make_interpolate_example_data((25, 25), 0.4, non_uniform=True) - actual = da.interpolate_na(method=method, dim="time") + if method == "spline": + with pytest.warns(PendingDeprecationWarning): + actual = da.interpolate_na(method=method, dim="time") + else: + actual = da.interpolate_na(method=method, dim="time") assert (da.count("time") <= actual.count("time")).all() From 378b4acec7d0f1c27297352eda194360dfc78fa3 Mon Sep 17 00:00:00 2001 From: Mark Harfouche Date: Wed, 25 Sep 2024 19:16:56 -0400 Subject: [PATCH 08/15] Add yet an other CI for numpy 2.1 (#9540) --- .github/workflows/ci.yaml | 3 ++ ci/requirements/all-but-numba.yml | 54 +++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+) create mode 100644 ci/requirements/all-but-numba.yml diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 4698e144293..1229dc97925 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -58,6 +58,9 @@ jobs: python-version: "3.10" os: ubuntu-latest # Latest python version: + - env: "all-but-numba" + python-version: "3.12" + os: ubuntu-latest - env: "all-but-dask" # Not 3.12 because of pint python-version: "3.11" diff --git a/ci/requirements/all-but-numba.yml b/ci/requirements/all-but-numba.yml new file mode 100644 index 00000000000..61f64a176af --- /dev/null +++ b/ci/requirements/all-but-numba.yml @@ -0,0 +1,54 @@ +name: xarray-tests +channels: + - conda-forge + - nodefaults +dependencies: + # Pin a "very new numpy" (updated Sept 24, 2024) + - numpy>=2.1.1 + - aiobotocore + - array-api-strict + - boto3 + - bottleneck + - cartopy + - cftime + - dask-core + - dask-expr # dask raises a deprecation warning without this, breaking doctests + - distributed + - flox + - fsspec + - h5netcdf + - h5py + - hdf5 + - hypothesis + - iris + - lxml # Optional dep of pydap + - matplotlib-base + - nc-time-axis + - netcdf4 + # numba, sparse, numbagg, numexpr often conflicts with newer versions of numpy. + # This environment helps us test xarray with the latest versions + # of numpy + # - numba + # - numbagg + # - numexpr + # - sparse + - opt_einsum + - packaging + - pandas + # - pint>=0.22 + - pip + - pooch + - pre-commit + - pyarrow # pandas raises a deprecation warning without this, breaking doctests + - pydap + - pytest + - pytest-cov + - pytest-env + - pytest-xdist + - pytest-timeout + - rasterio + - scipy + - seaborn + - toolz + - typing_extensions + - zarr From 22e93e7116238401353c4f8bb8b6d234e54447c7 Mon Sep 17 00:00:00 2001 From: Martey Dodoo Date: Wed, 25 Sep 2024 21:05:34 -0400 Subject: [PATCH 09/15] Update donation links (#9549) * Update NumFOCUS donation link in README * Update sponsor button to use HTTPS link --- .github/FUNDING.yml | 2 +- README.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml index 30c1e18f33c..2398577db10 100644 --- a/.github/FUNDING.yml +++ b/.github/FUNDING.yml @@ -1,2 +1,2 @@ github: numfocus -custom: http://numfocus.org/donate-to-xarray +custom: https://numfocus.org/donate-to-xarray diff --git a/README.md b/README.md index dcf71217e2c..8fc8ff335d4 100644 --- a/README.md +++ b/README.md @@ -88,7 +88,7 @@ Xarray is a fiscally sponsored project of [NumFOCUS](https://numfocus.org), a nonprofit dedicated to supporting the open source scientific computing community. 
If you like Xarray and want to support our mission, please consider making a -[donation](https://numfocus.salsalabs.org/donate-to-xarray/) to support +[donation](https://numfocus.org/donate-to-xarray) to support our efforts. ## History From fecaa850f08599eaab4dbf2fd69a1fbf221a19df Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Wed, 25 Sep 2024 19:14:02 -0700 Subject: [PATCH 10/15] Remove duplicate coordinates with indexes on sub-trees (#9531) --- xarray/core/datatree.py | 33 +++++++++++++++++++++++++-- xarray/tests/test_datatree.py | 28 +++++++++++++++++++++++ xarray/tests/test_datatree_mapping.py | 13 ++++++++++- 3 files changed, 71 insertions(+), 3 deletions(-) diff --git a/xarray/core/datatree.py b/xarray/core/datatree.py index 65436be9038..52d44bec96f 100644 --- a/xarray/core/datatree.py +++ b/xarray/core/datatree.py @@ -157,6 +157,34 @@ def check_alignment( check_alignment(child_path, child_ds, base_ds, child.children) +def _deduplicate_inherited_coordinates(child: DataTree, parent: DataTree) -> None: + # This method removes repeated indexes (and corresponding coordinates) + # that are repeated between a DataTree and its parents. + # + # TODO(shoyer): Decide how to handle repeated coordinates *without* an + # index. Should these be allowed, in which case we probably want to + # exclude them from inheritance, or should they be automatically + # dropped? + # https://github.com/pydata/xarray/issues/9475#issuecomment-2357004264 + removed_something = False + for name in parent._indexes: + if name in child._node_indexes: + # Indexes on a Dataset always have a corresponding coordinate. + # We already verified that these coordinates match in the + # check_alignment() call from _pre_attach(). + del child._node_indexes[name] + del child._node_coord_variables[name] + removed_something = True + + if removed_something: + child._node_dims = calculate_dimensions( + child._data_variables | child._node_coord_variables + ) + + for grandchild in child._children.values(): + _deduplicate_inherited_coordinates(grandchild, child) + + def _check_for_slashes_in_names(variables: Iterable[Hashable]) -> None: offending_variable_names = [ name for name in variables if isinstance(name, str) and "/" in name @@ -375,7 +403,7 @@ def map( # type: ignore[override] class DataTree( - NamedNode, + NamedNode["DataTree"], MappedDatasetMethodsMixin, MappedDataWithCoords, DataTreeArithmeticMixin, @@ -486,6 +514,7 @@ def _pre_attach(self: DataTree, parent: DataTree, name: str) -> None: node_ds = self.to_dataset(inherited=False) parent_ds = parent._to_dataset_view(rebuild_dims=False, inherited=True) check_alignment(path, node_ds, parent_ds, self.children) + _deduplicate_inherited_coordinates(self, parent) @property def _coord_variables(self) -> ChainMap[Hashable, Variable]: @@ -1353,7 +1382,7 @@ def map_over_subtree( func: Callable, *args: Iterable[Any], **kwargs: Any, - ) -> DataTree | tuple[DataTree]: + ) -> DataTree | tuple[DataTree, ...]: """ Apply a function to every dataset in this subtree, returning a new tree which stores the results. 
diff --git a/xarray/tests/test_datatree.py b/xarray/tests/test_datatree.py index 2adf73fa5c8..30934f83c63 100644 --- a/xarray/tests/test_datatree.py +++ b/xarray/tests/test_datatree.py @@ -1295,6 +1295,22 @@ def test_inherited_coords_override(self): xr.testing.assert_equal(dt["/b/y"], xr.DataArray(2, coords=sub_coords)) xr.testing.assert_equal(dt["/b/z"], xr.DataArray(3, coords=sub_coords)) + def test_inherited_coords_with_index_are_deduplicated(self): + dt = DataTree.from_dict( + { + "/": xr.Dataset(coords={"x": [1, 2]}), + "/b": xr.Dataset(coords={"x": [1, 2]}), + } + ) + child_dataset = dt.children["b"].to_dataset(inherited=False) + expected = xr.Dataset() + assert_identical(child_dataset, expected) + + dt["/c"] = xr.Dataset({"foo": ("x", [4, 5])}, coords={"x": [1, 2]}) + child_dataset = dt.children["c"].to_dataset(inherited=False) + expected = xr.Dataset({"foo": ("x", [4, 5])}) + assert_identical(child_dataset, expected) + def test_inconsistent_dims(self): expected_msg = _exact_match( """ @@ -1639,6 +1655,18 @@ def test_binary_op_on_datatree(self): result = dt * dt # type: ignore[operator] assert_equal(result, expected) + def test_arithmetic_inherited_coords(self): + tree = DataTree(xr.Dataset(coords={"x": [1, 2, 3]})) + tree["/foo"] = DataTree(xr.Dataset({"bar": ("x", [4, 5, 6])})) + actual: DataTree = 2 * tree # type: ignore[assignment,operator] + + actual_dataset = actual.children["foo"].to_dataset(inherited=False) + assert "x" not in actual_dataset.coords + + expected = tree.copy() + expected["/foo/bar"].data = np.array([8, 10, 12]) + assert_identical(actual, expected) + class TestUFuncs: def test_tree(self, create_test_datatree): diff --git a/xarray/tests/test_datatree_mapping.py b/xarray/tests/test_datatree_mapping.py index c7e0e93b89b..9a7d3009c3b 100644 --- a/xarray/tests/test_datatree_mapping.py +++ b/xarray/tests/test_datatree_mapping.py @@ -7,7 +7,7 @@ check_isomorphic, map_over_subtree, ) -from xarray.testing import assert_equal +from xarray.testing import assert_equal, assert_identical empty = xr.Dataset() @@ -306,6 +306,17 @@ def fail_on_specific_node(ds): ): dt.map_over_subtree(fail_on_specific_node) + def test_inherited_coordinates_with_index(self): + root = xr.Dataset(coords={"x": [1, 2]}) + child = xr.Dataset({"foo": ("x", [0, 1])}) # no coordinates + tree = xr.DataTree.from_dict({"/": root, "/child": child}) + actual = tree.map_over_subtree(lambda ds: ds) # identity + assert isinstance(actual, xr.DataTree) + assert_identical(tree, actual) + + actual_child = actual.children["child"].to_dataset(inherited=False) + assert_identical(actual_child, child) + class TestMutableOperations: def test_construct_using_type(self): From 44c7c151717a97208e4e7bc069587b48a76a6e61 Mon Sep 17 00:00:00 2001 From: carschandler <92899389+carschandler@users.noreply.github.com> Date: Thu, 26 Sep 2024 13:01:42 -0500 Subject: [PATCH 11/15] Typos in pandas.rst (#9551) --- doc/user-guide/pandas.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/user-guide/pandas.rst b/doc/user-guide/pandas.rst index 30939cbbd17..5fe5e15fa63 100644 --- a/doc/user-guide/pandas.rst +++ b/doc/user-guide/pandas.rst @@ -103,7 +103,7 @@ DataFrames: xr.DataArray.from_series(s) Both the ``from_series`` and ``from_dataframe`` methods use reindexing, so they -work even if not the hierarchical index is not a full tensor product: +work even if the hierarchical index is not a full tensor product: .. 
ipython:: python @@ -126,7 +126,7 @@ Particularly after a roundtrip, the following deviations are noted: To avoid these problems, the third-party `ntv-pandas `__ library offers lossless and reversible conversions between ``Dataset``/ ``DataArray`` and pandas ``DataFrame`` objects. -This solution is particularly interesting for converting any ``DataFrame`` into a ``Dataset`` (the converter find the multidimensional structure hidden by the tabular structure). +This solution is particularly interesting for converting any ``DataFrame`` into a ``Dataset`` (the converter finds the multidimensional structure hidden by the tabular structure). The `ntv-pandas examples `__ show how to improve the conversion for the previous ``Dataset`` example and for more complex examples. From cde720f21788a1dbff2bdb0775a56b3ee6da6b1a Mon Sep 17 00:00:00 2001 From: Illviljan <14371165+Illviljan@users.noreply.github.com> Date: Fri, 27 Sep 2024 20:39:43 +0200 Subject: [PATCH 12/15] Fix NamedArray html repr crashing (#9553) * Fix namedarray repr crashing * fix mypy --- xarray/core/formatting.py | 2 +- xarray/tests/test_namedarray.py | 12 ++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/xarray/core/formatting.py b/xarray/core/formatting.py index 646e000a984..38dccfa2038 100644 --- a/xarray/core/formatting.py +++ b/xarray/core/formatting.py @@ -303,7 +303,7 @@ def inline_variable_array_repr(var, max_width): """Build a one-line summary of a variable's data.""" if hasattr(var._data, "_repr_inline_"): return var._data._repr_inline_(max_width) - if var._in_memory: + if getattr(var, "_in_memory", False): return format_array_flat(var, max_width) dask_array_type = array_type("dask") if isinstance(var._data, dask_array_type): diff --git a/xarray/tests/test_namedarray.py b/xarray/tests/test_namedarray.py index 5e4a39ada73..39ca08fa06c 100644 --- a/xarray/tests/test_namedarray.py +++ b/xarray/tests/test_namedarray.py @@ -591,3 +591,15 @@ def test_broadcast_to_errors( def test_warn_on_repeated_dimension_names(self) -> None: with pytest.warns(UserWarning, match="Duplicate dimension names"): NamedArray(("x", "x"), np.arange(4).reshape(2, 2)) + + +def test_repr() -> None: + x: NamedArray[Any, np.dtype[np.uint64]] + x = NamedArray(("x",), np.array([0], dtype=np.uint64)) + + # Reprs should not crash: + r = x.__repr__() + x._repr_html_() + + # Basic comparison: + assert r == " Size: 8B\narray([0], dtype=uint64)" From a14d202ed91e5bbc93035b25ffc3d334193c9590 Mon Sep 17 00:00:00 2001 From: Justus Magin Date: Mon, 30 Sep 2024 19:51:58 +0200 Subject: [PATCH 13/15] allow using `__array_function__` as a fallback for missing Array API functions (#9530) * don't pass along `out` for `nanprod` We don't do this anywhere elsewhere, so it doesn't make sense to do this only for `nanprod`. 
* add tests for `as_indexable` * allow using `__array_function__` as a fallback for missing array API funcs * also check dask * don't try to create a `dask` array if `dask` is not installed --- xarray/core/indexing.py | 4 +-- xarray/core/nanops.py | 2 +- xarray/tests/test_indexing.py | 68 +++++++++++++++++++++++++++++++++++ 3 files changed, 71 insertions(+), 3 deletions(-) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index 08b1d0be290..d8727c38c48 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -878,10 +878,10 @@ def as_indexable(array): return PandasIndexingAdapter(array) if is_duck_dask_array(array): return DaskIndexingAdapter(array) - if hasattr(array, "__array_function__"): - return NdArrayLikeIndexingAdapter(array) if hasattr(array, "__array_namespace__"): return ArrayApiIndexingAdapter(array) + if hasattr(array, "__array_function__"): + return NdArrayLikeIndexingAdapter(array) raise TypeError(f"Invalid array type: {type(array)}") diff --git a/xarray/core/nanops.py b/xarray/core/nanops.py index fc7240139aa..7fbb63068c0 100644 --- a/xarray/core/nanops.py +++ b/xarray/core/nanops.py @@ -162,7 +162,7 @@ def nanstd(a, axis=None, dtype=None, out=None, ddof=0): def nanprod(a, axis=None, dtype=None, out=None, min_count=None): mask = isnull(a) - result = nputils.nanprod(a, axis=axis, dtype=dtype, out=out) + result = nputils.nanprod(a, axis=axis, dtype=dtype) if min_count is not None: return _maybe_null_out(result, axis, mask, min_count) else: diff --git a/xarray/tests/test_indexing.py b/xarray/tests/test_indexing.py index 985fb02a87e..92c21cc32fb 100644 --- a/xarray/tests/test_indexing.py +++ b/xarray/tests/test_indexing.py @@ -894,6 +894,74 @@ def test_posify_mask_subindexer(indices, expected) -> None: np.testing.assert_array_equal(expected, actual) +class ArrayWithNamespace: + def __array_namespace__(self, version=None): + pass + + +class ArrayWithArrayFunction: + def __array_function__(self, func, types, args, kwargs): + pass + + +class ArrayWithNamespaceAndArrayFunction: + def __array_namespace__(self, version=None): + pass + + def __array_function__(self, func, types, args, kwargs): + pass + + +def as_dask_array(arr, chunks): + try: + import dask.array as da + except ImportError: + return None + + return da.from_array(arr, chunks=chunks) + + +@pytest.mark.parametrize( + ["array", "expected_type"], + ( + pytest.param( + indexing.CopyOnWriteArray(np.array([1, 2])), + indexing.CopyOnWriteArray, + id="ExplicitlyIndexed", + ), + pytest.param( + np.array([1, 2]), indexing.NumpyIndexingAdapter, id="numpy.ndarray" + ), + pytest.param( + pd.Index([1, 2]), indexing.PandasIndexingAdapter, id="pandas.Index" + ), + pytest.param( + as_dask_array(np.array([1, 2]), chunks=(1,)), + indexing.DaskIndexingAdapter, + id="dask.array", + marks=requires_dask, + ), + pytest.param( + ArrayWithNamespace(), indexing.ArrayApiIndexingAdapter, id="array_api" + ), + pytest.param( + ArrayWithArrayFunction(), + indexing.NdArrayLikeIndexingAdapter, + id="array_like", + ), + pytest.param( + ArrayWithNamespaceAndArrayFunction(), + indexing.ArrayApiIndexingAdapter, + id="array_api_with_fallback", + ), + ), +) +def test_as_indexable(array, expected_type): + actual = indexing.as_indexable(array) + + assert isinstance(actual, expected_type) + + def test_indexing_1d_object_array() -> None: items = (np.arange(3), np.arange(6)) arr = DataArray(np.array(items, dtype=object)) From ece582dd8753f6b4abec420de4c7134c09f070e7 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" 
<49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 30 Sep 2024 19:58:46 +0200 Subject: [PATCH 14/15] Bump scientific-python/upload-nightly-action in the actions group (#9556) Bumps the actions group with 1 update: [scientific-python/upload-nightly-action](https://github.com/scientific-python/upload-nightly-action). Updates `scientific-python/upload-nightly-action` from 0.5.0 to 0.6.0 - [Release notes](https://github.com/scientific-python/upload-nightly-action/releases) - [Commits](https://github.com/scientific-python/upload-nightly-action/compare/b67d7fcc0396e1128a474d1ab2b48aa94680f9fc...ccf29c805b5d0c1dc31fa97fcdb962be074cade3) --- updated-dependencies: - dependency-name: scientific-python/upload-nightly-action dependency-type: direct:production update-type: version-update:semver-minor dependency-group: actions ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/nightly-wheels.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/nightly-wheels.yml b/.github/workflows/nightly-wheels.yml index 9aac7ab8775..c31f2774bd8 100644 --- a/.github/workflows/nightly-wheels.yml +++ b/.github/workflows/nightly-wheels.yml @@ -38,7 +38,7 @@ jobs: fi - name: Upload wheel - uses: scientific-python/upload-nightly-action@b67d7fcc0396e1128a474d1ab2b48aa94680f9fc # 0.5.0 + uses: scientific-python/upload-nightly-action@ccf29c805b5d0c1dc31fa97fcdb962be074cade3 # 0.6.0 with: anaconda_nightly_upload_token: ${{ secrets.ANACONDA_NIGHTLY }} artifacts_path: dist From 7bdc6d4ccc6359143cb6b8d6a7642ca38c3ff550 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Mon, 30 Sep 2024 12:22:48 -0700 Subject: [PATCH 15/15] Revert "Improve safe chunk validation (#9527)" (#9558) This reverts commit 2a6212e1255ea56065ec1bfad8d484fbdad33945. --- doc/whats-new.rst | 4 +- xarray/backends/zarr.py | 168 ++++++++++----------------------- xarray/core/dataarray.py | 8 -- xarray/core/dataset.py | 8 -- xarray/tests/test_backends.py | 169 ++-------------------------------- 5 files changed, 54 insertions(+), 303 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 9cf630acea1..84aa4d96c6b 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -61,9 +61,7 @@ Bug fixes `_. - Fix a few bugs affecting groupby reductions with `flox`. (:issue:`8090`, :issue:`9398`). By `Deepak Cherian `_. -- Fix the safe_chunks validation option on the to_zarr method - (:issue:`5511`, :pull:`9513`). By `Joseph Nowak - `_. + Documentation ~~~~~~~~~~~~~ diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 2c6b50b3589..5a6b043eef8 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -112,9 +112,7 @@ def __getitem__(self, key): # could possibly have a work-around for 0d data here -def _determine_zarr_chunks( - enc_chunks, var_chunks, ndim, name, safe_chunks, region, mode, shape -): +def _determine_zarr_chunks(enc_chunks, var_chunks, ndim, name, safe_chunks): """ Given encoding chunks (possibly None or []) and variable chunks (possibly None or []). 
@@ -165,9 +163,7 @@ def _determine_zarr_chunks( if len(enc_chunks_tuple) != ndim: # throw away encoding chunks, start over - return _determine_zarr_chunks( - None, var_chunks, ndim, name, safe_chunks, region, mode, shape - ) + return _determine_zarr_chunks(None, var_chunks, ndim, name, safe_chunks) for x in enc_chunks_tuple: if not isinstance(x, int): @@ -193,59 +189,20 @@ def _determine_zarr_chunks( # TODO: incorporate synchronizer to allow writes from multiple dask # threads if var_chunks and enc_chunks_tuple: - # If it is possible to write on partial chunks then it is not necessary to check - # the last one contained on the region - allow_partial_chunks = mode != "r+" - - base_error = ( - f"Specified zarr chunks encoding['chunks']={enc_chunks_tuple!r} for " - f"variable named {name!r} would overlap multiple dask chunks {var_chunks!r} " - f"on the region {region}. " - f"Writing this array in parallel with dask could lead to corrupted data." - f"Consider either rechunking using `chunk()`, deleting " - f"or modifying `encoding['chunks']`, or specify `safe_chunks=False`." - ) - - for zchunk, dchunks, interval, size in zip( - enc_chunks_tuple, var_chunks, region, shape, strict=True - ): - if not safe_chunks: - continue - - for dchunk in dchunks[1:-1]: + for zchunk, dchunks in zip(enc_chunks_tuple, var_chunks, strict=True): + for dchunk in dchunks[:-1]: if dchunk % zchunk: - raise ValueError(base_error) - - region_start = interval.start if interval.start else 0 - - if len(dchunks) > 1: - # The first border size is the amount of data that needs to be updated on the - # first chunk taking into account the region slice. - first_border_size = zchunk - if allow_partial_chunks: - first_border_size = zchunk - region_start % zchunk - - if (dchunks[0] - first_border_size) % zchunk: - raise ValueError(base_error) - - if not allow_partial_chunks: - chunk_start = sum(dchunks[:-1]) + region_start - if chunk_start % zchunk: - # The last chunk which can also be the only one is a partial chunk - # if it is not aligned at the beginning - raise ValueError(base_error) - - region_stop = interval.stop if interval.stop else size - - if size - region_stop + 1 < zchunk: - # If the region is covering the last chunk then check - # if the reminder with the default chunk size - # is equal to the size of the last chunk - if dchunks[-1] % zchunk != size % zchunk: - raise ValueError(base_error) - elif dchunks[-1] % zchunk: - raise ValueError(base_error) - + base_error = ( + f"Specified zarr chunks encoding['chunks']={enc_chunks_tuple!r} for " + f"variable named {name!r} would overlap multiple dask chunks {var_chunks!r}. " + f"Writing this array in parallel with dask could lead to corrupted data." + ) + if safe_chunks: + raise ValueError( + base_error + + " Consider either rechunking using `chunk()`, deleting " + "or modifying `encoding['chunks']`, or specify `safe_chunks=False`." + ) return enc_chunks_tuple raise AssertionError("We should never get here. 
Function logic must be wrong.") @@ -286,14 +243,7 @@ def _get_zarr_dims_and_attrs(zarr_obj, dimension_key, try_nczarr): def extract_zarr_variable_encoding( - variable, - raise_on_invalid=False, - name=None, - *, - safe_chunks=True, - region=None, - mode=None, - shape=None, + variable, raise_on_invalid=False, name=None, safe_chunks=True ): """ Extract zarr encoding dictionary from xarray Variable @@ -302,18 +252,12 @@ def extract_zarr_variable_encoding( ---------- variable : Variable raise_on_invalid : bool, optional - name: str | Hashable, optional - safe_chunks: bool, optional - region: tuple[slice, ...], optional - mode: str, optional - shape: tuple[int, ...], optional + Returns ------- encoding : dict Zarr encoding for `variable` """ - - shape = shape if shape else variable.shape encoding = variable.encoding.copy() safe_to_drop = {"source", "original_shape"} @@ -341,14 +285,7 @@ def extract_zarr_variable_encoding( del encoding[k] chunks = _determine_zarr_chunks( - enc_chunks=encoding.get("chunks"), - var_chunks=variable.chunks, - ndim=variable.ndim, - name=name, - safe_chunks=safe_chunks, - region=region, - mode=mode, - shape=shape, + encoding.get("chunks"), variable.chunks, variable.ndim, name, safe_chunks ) encoding["chunks"] = chunks return encoding @@ -825,10 +762,16 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=No if v.encoding == {"_FillValue": None} and fill_value is None: v.encoding = {} - zarr_array = None - zarr_shape = None - write_region = self._write_region if self._write_region is not None else {} - write_region = {dim: write_region.get(dim, slice(None)) for dim in dims} + # We need to do this for both new and existing variables to ensure we're not + # writing to a partial chunk, even though we don't use the `encoding` value + # when writing to an existing variable. See + # https://github.com/pydata/xarray/issues/8371 for details. + encoding = extract_zarr_variable_encoding( + v, + raise_on_invalid=vn in check_encoding_set, + name=vn, + safe_chunks=self._safe_chunks, + ) if name in existing_keys: # existing variable @@ -858,40 +801,7 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=No ) else: zarr_array = self.zarr_group[name] - - if self._append_dim is not None and self._append_dim in dims: - # resize existing variable - append_axis = dims.index(self._append_dim) - assert write_region[self._append_dim] == slice(None) - write_region[self._append_dim] = slice( - zarr_array.shape[append_axis], None - ) - - new_shape = list(zarr_array.shape) - new_shape[append_axis] += v.shape[append_axis] - zarr_array.resize(new_shape) - - zarr_shape = zarr_array.shape - - region = tuple(write_region[dim] for dim in dims) - - # We need to do this for both new and existing variables to ensure we're not - # writing to a partial chunk, even though we don't use the `encoding` value - # when writing to an existing variable. See - # https://github.com/pydata/xarray/issues/8371 for details. - # Note: Ideally there should be two functions, one for validating the chunks and - # another one for extracting the encoding. 
- encoding = extract_zarr_variable_encoding( - v, - raise_on_invalid=vn in check_encoding_set, - name=vn, - safe_chunks=self._safe_chunks, - region=region, - mode=self._mode, - shape=zarr_shape, - ) - - if name not in existing_keys: + else: # new variable encoded_attrs = {} # the magic for storing the hidden dimension data @@ -923,6 +833,22 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=No ) zarr_array = _put_attrs(zarr_array, encoded_attrs) + write_region = self._write_region if self._write_region is not None else {} + write_region = {dim: write_region.get(dim, slice(None)) for dim in dims} + + if self._append_dim is not None and self._append_dim in dims: + # resize existing variable + append_axis = dims.index(self._append_dim) + assert write_region[self._append_dim] == slice(None) + write_region[self._append_dim] = slice( + zarr_array.shape[append_axis], None + ) + + new_shape = list(zarr_array.shape) + new_shape[append_axis] += v.shape[append_axis] + zarr_array.resize(new_shape) + + region = tuple(write_region[dim] for dim in dims) writer.add(v.data, zarr_array, region) def close(self) -> None: @@ -971,9 +897,9 @@ def _validate_and_autodetect_region(self, ds) -> None: if not isinstance(region, dict): raise TypeError(f"``region`` must be a dict, got {type(region)}") if any(v == "auto" for v in region.values()): - if self._mode not in ["r+", "a"]: + if self._mode != "r+": raise ValueError( - f"``mode`` must be 'r+' or 'a' when using ``region='auto'``, got {self._mode!r}" + f"``mode`` must be 'r+' when using ``region='auto'``, got {self._mode!r}" ) region = self._auto_detect_regions(ds, region) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 2adf862f1fd..abf9e8ec643 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -4316,14 +4316,6 @@ def to_zarr( if Zarr arrays are written in parallel. This option may be useful in combination with ``compute=False`` to initialize a Zarr store from an existing DataArray with arbitrary chunk structure. - In addition to the many-to-one relationship validation, it also detects partial - chunks writes when using the region parameter, - these partial chunks are considered unsafe in the mode "r+" but safe in - the mode "a". - Note: Even with these validations it can still be unsafe to write - two or more chunked arrays in the same location in parallel if they are - not writing in independent regions, for those cases it is better to use - a synchronizer. storage_options : dict, optional Any additional parameters for the storage backend (ignored for local paths). diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 82b60d7abc8..b3d84ff7336 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -2509,14 +2509,6 @@ def to_zarr( if Zarr arrays are written in parallel. This option may be useful in combination with ``compute=False`` to initialize a Zarr from an existing Dataset with arbitrary chunk structure. - In addition to the many-to-one relationship validation, it also detects partial - chunks writes when using the region parameter, - these partial chunks are considered unsafe in the mode "r+" but safe in - the mode "a". - Note: Even with these validations it can still be unsafe to write - two or more chunked arrays in the same location in parallel if they are - not writing in independent regions, for those cases it is better to use - a synchronizer. storage_options : dict, optional Any additional parameters for the storage backend (ignored for local paths). 
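The ``compute=False`` workflow that the retained docstring context above refers to is easiest to see in a small sketch. Everything below is illustrative and assumed (the store path, array size, and 25-element chunks are made up, not taken from this patch): the idea is to initialize the store first and then fill chunk-aligned regions separately.

import numpy as np
import xarray as xr

store = "template.zarr"  # assumed path, purely illustrative
ds = xr.Dataset(
    {"foo": ("x", np.zeros(100))}, coords={"x": np.arange(100)}
).chunk(x=25)

# Write metadata and the eagerly-loaded coordinate; dask-backed variables are
# only laid out on disk here, not computed.
ds.to_zarr(store, compute=False, mode="w")

# Fill a region whose dask chunks line up with the 25-element zarr chunks, so
# several such writes can run in parallel without touching the same chunk.
ds.isel(x=slice(0, 50)).to_zarr(store, region={"x": slice(0, 50)}, mode="r+")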
diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index ccf1bc73dd6..cbc0b9e019d 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -5989,10 +5989,9 @@ def test_zarr_region_append(self, tmp_path): } ) - # Now it is valid to use auto region detection with the append mode, - # but it is still unsafe to modify dimensions or metadata using the region - # parameter. - with pytest.raises(KeyError): + # Don't allow auto region detection in append mode due to complexities in + # implementing the overlap logic and lack of safety with parallel writes + with pytest.raises(ValueError): ds_new.to_zarr( tmp_path / "test.zarr", mode="a", append_dim="x", region="auto" ) @@ -6097,162 +6096,6 @@ def test_zarr_region_chunk_partial_offset(tmp_path): store, safe_chunks=False, region="auto" ) - with pytest.raises(ValueError): - da.isel(x=slice(5, 25)).chunk(x=(10, 10)).to_zarr(store, region="auto") - - -@requires_zarr -@requires_dask -def test_zarr_safe_chunk_append_dim(tmp_path): - store = tmp_path / "foo.zarr" - data = np.ones((20,)) - da = xr.DataArray(data, dims=["x"], coords={"x": range(20)}, name="foo").chunk(x=5) - - da.isel(x=slice(0, 7)).to_zarr(store, safe_chunks=True, mode="w") - with pytest.raises(ValueError): - # If the first chunk is smaller than the border size then raise an error - da.isel(x=slice(7, 11)).chunk(x=(2, 2)).to_zarr( - store, append_dim="x", safe_chunks=True - ) - - da.isel(x=slice(0, 7)).to_zarr(store, safe_chunks=True, mode="w") - # If the first chunk is of the size of the border size then it is valid - da.isel(x=slice(7, 11)).chunk(x=(3, 1)).to_zarr( - store, safe_chunks=True, append_dim="x" - ) - assert xr.open_zarr(store)["foo"].equals(da.isel(x=slice(0, 11))) - - da.isel(x=slice(0, 7)).to_zarr(store, safe_chunks=True, mode="w") - # If the first chunk is of the size of the border size + N * zchunk then it is valid - da.isel(x=slice(7, 17)).chunk(x=(8, 2)).to_zarr( - store, safe_chunks=True, append_dim="x" - ) - assert xr.open_zarr(store)["foo"].equals(da.isel(x=slice(0, 17))) - - da.isel(x=slice(0, 7)).to_zarr(store, safe_chunks=True, mode="w") - with pytest.raises(ValueError): - # If the first chunk is valid but the other are not then raise an error - da.isel(x=slice(7, 14)).chunk(x=(3, 3, 1)).to_zarr( - store, append_dim="x", safe_chunks=True - ) - - da.isel(x=slice(0, 7)).to_zarr(store, safe_chunks=True, mode="w") - with pytest.raises(ValueError): - # If the first chunk have a size bigger than the border size but not enough - # to complete the size of the next chunk then an error must be raised - da.isel(x=slice(7, 14)).chunk(x=(4, 3)).to_zarr( - store, append_dim="x", safe_chunks=True - ) - - da.isel(x=slice(0, 7)).to_zarr(store, safe_chunks=True, mode="w") - # Append with a single chunk it's totally valid, - # and it does not matter the size of the chunk - da.isel(x=slice(7, 19)).chunk(x=-1).to_zarr(store, append_dim="x", safe_chunks=True) - assert xr.open_zarr(store)["foo"].equals(da.isel(x=slice(0, 19))) - - -@requires_zarr -@requires_dask -def test_zarr_safe_chunk_region(tmp_path): - store = tmp_path / "foo.zarr" - - arr = xr.DataArray( - list(range(11)), dims=["a"], coords={"a": list(range(11))}, name="foo" - ).chunk(a=3) - arr.to_zarr(store, mode="w") - - modes: list[Literal["r+", "a"]] = ["r+", "a"] - for mode in modes: - with pytest.raises(ValueError): - # There are two Dask chunks on the same Zarr chunk, - # which means that it is unsafe in any mode - arr.isel(a=slice(0, 3)).chunk(a=(2, 1)).to_zarr( - store, 
region="auto", mode=mode - ) - - with pytest.raises(ValueError): - # the first chunk is covering the border size, but it is not - # completely covering the second chunk, which means that it is - # unsafe in any mode - arr.isel(a=slice(1, 5)).chunk(a=(3, 1)).to_zarr( - store, region="auto", mode=mode - ) - - with pytest.raises(ValueError): - # The first chunk is safe but the other two chunks are overlapping with - # the same Zarr chunk - arr.isel(a=slice(0, 5)).chunk(a=(3, 1, 1)).to_zarr( - store, region="auto", mode=mode - ) - - # Fully update two contiguous chunks is safe in any mode - arr.isel(a=slice(3, 9)).to_zarr(store, region="auto", mode=mode) - - # The last chunk is considered full based on their current size (2) - arr.isel(a=slice(9, 11)).to_zarr(store, region="auto", mode=mode) - arr.isel(a=slice(6, None)).chunk(a=-1).to_zarr(store, region="auto", mode=mode) - - # Write the last chunk of a region partially is safe in "a" mode - arr.isel(a=slice(3, 8)).to_zarr(store, region="auto", mode="a") - with pytest.raises(ValueError): - # with "r+" mode it is invalid to write partial chunk - arr.isel(a=slice(3, 8)).to_zarr(store, region="auto", mode="r+") - - # This is safe with mode "a", the border size is covered by the first chunk of Dask - arr.isel(a=slice(1, 4)).chunk(a=(2, 1)).to_zarr(store, region="auto", mode="a") - with pytest.raises(ValueError): - # This is considered unsafe in mode "r+" because it is writing in a partial chunk - arr.isel(a=slice(1, 4)).chunk(a=(2, 1)).to_zarr(store, region="auto", mode="r+") - - # This is safe on mode "a" because there is a single dask chunk - arr.isel(a=slice(1, 5)).chunk(a=(4,)).to_zarr(store, region="auto", mode="a") - with pytest.raises(ValueError): - # This is unsafe on mode "r+", because the Dask chunk is partially writing - # in the first chunk of Zarr - arr.isel(a=slice(1, 5)).chunk(a=(4,)).to_zarr(store, region="auto", mode="r+") - - # The first chunk is completely covering the first Zarr chunk - # and the last chunk is a partial one - arr.isel(a=slice(0, 5)).chunk(a=(3, 2)).to_zarr(store, region="auto", mode="a") - - with pytest.raises(ValueError): - # The last chunk is partial, so it is considered unsafe on mode "r+" - arr.isel(a=slice(0, 5)).chunk(a=(3, 2)).to_zarr(store, region="auto", mode="r+") - - # The first chunk is covering the border size (2 elements) - # and also the second chunk (3 elements), so it is valid - arr.isel(a=slice(1, 8)).chunk(a=(5, 2)).to_zarr(store, region="auto", mode="a") - - with pytest.raises(ValueError): - # The first chunk is not fully covering the first zarr chunk - arr.isel(a=slice(1, 8)).chunk(a=(5, 2)).to_zarr(store, region="auto", mode="r+") - - with pytest.raises(ValueError): - # Validate that the border condition is not affecting the "r+" mode - arr.isel(a=slice(1, 9)).to_zarr(store, region="auto", mode="r+") - - arr.isel(a=slice(10, 11)).to_zarr(store, region="auto", mode="a") - with pytest.raises(ValueError): - # Validate that even if we write with a single Dask chunk on the last Zarr - # chunk it is still unsafe if it is not fully covering it - # (the last Zarr chunk has size 2) - arr.isel(a=slice(10, 11)).to_zarr(store, region="auto", mode="r+") - - # Validate the same as the above test but in the beginning of the last chunk - arr.isel(a=slice(9, 10)).to_zarr(store, region="auto", mode="a") - with pytest.raises(ValueError): - arr.isel(a=slice(9, 10)).to_zarr(store, region="auto", mode="r+") - - arr.isel(a=slice(7, None)).chunk(a=-1).to_zarr(store, region="auto", mode="a") - with 
pytest.raises(ValueError): - # Test that even a Dask chunk that covers the last Zarr chunk can be unsafe - # if it is partial covering other Zarr chunks - arr.isel(a=slice(7, None)).chunk(a=-1).to_zarr(store, region="auto", mode="r+") - - with pytest.raises(ValueError): - # If the chunk is of size equal to the one in the Zarr encoding, but - # it is partially writing in the first chunk then raise an error - arr.isel(a=slice(8, None)).chunk(a=3).to_zarr(store, region="auto", mode="r+") - - with pytest.raises(ValueError): - arr.isel(a=slice(5, -1)).chunk(a=5).to_zarr(store, region="auto", mode="r+") + # This write is unsafe, and should raise an error, but does not. + # with pytest.raises(ValueError): + # da.isel(x=slice(5, 25)).chunk(x=(10, 10)).to_zarr(store, region="auto")
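The commented-out case at the end of the restored test is the overlap the reverted validation was written to catch: two dask chunks landing on one zarr chunk during a region write. A minimal sketch of how a caller can avoid that by hand, with an assumed store path, sizes, and 10-element chunks that are not taken from this patch, is to rechunk the slice to the on-disk chunk size before writing the region.

import numpy as np
import xarray as xr

store = "example.zarr"  # assumed path, purely illustrative
da = xr.DataArray(np.ones(30), dims="x", name="foo").chunk(x=10)
da.to_zarr(store, mode="w")

# Rechunk so each dask chunk maps onto exactly one 10-element zarr chunk and
# the region starts on a chunk boundary; no two dask chunks then write into
# the same zarr chunk, which is the many-to-one overlap described above.
da.isel(x=slice(10, 30)).chunk(x=10).to_zarr(
    store, region={"x": slice(10, 30)}, mode="r+"
)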