diff --git a/doc/api.rst b/doc/api.rst index f41eaa12038..caf9fd8ff37 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -192,6 +192,7 @@ Computation Dataset.map_blocks Dataset.polyfit Dataset.curvefit + Dataset.eval Aggregation ----------- diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 2dd1fbea64c..0743a29316b 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -41,6 +41,9 @@ New Features - :py:meth:`~xarray.DataArray.rank` now operates on dask-backed arrays, assuming the core dim has exactly one chunk. (:pull:`8475`). By `Maximilian Roos `_. +- Add a :py:meth:`Dataset.eval` method, similar to the pandas' method of the + same name. (:pull:`7163`). This is currently marked as experimental and + doesn't yet support the ``numexpr`` engine. - :py:meth:`Dataset.drop_vars` & :py:meth:`DataArray.drop_vars` allow passing a callable, similar to :py:meth:`Dataset.where` & :py:meth:`Dataset.sortby` & others. (:pull:`8511`). diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index cd143d96e7c..a6a3e327cfb 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -98,6 +98,7 @@ Self, T_ChunkDim, T_Chunks, + T_DataArray, T_DataArrayOrSet, T_Dataset, ZarrWriteModes, @@ -9554,6 +9555,68 @@ def argmax(self, dim: Hashable | None = None, **kwargs) -> Self: "Dataset.argmin() with a sequence or ... for dim" ) + def eval( + self, + statement: str, + *, + parser: QueryParserOptions = "pandas", + ) -> Self | T_DataArray: + """ + Calculate an expression supplied as a string in the context of the dataset. + + This is currently experimental; the API may change particularly around + assignments, which currently returnn a ``Dataset`` with the additional variable. + Currently only the ``python`` engine is supported, which has the same + performance as executing in python. + + Parameters + ---------- + statement : str + String containing the Python-like expression to evaluate. + + Returns + ------- + result : Dataset or DataArray, depending on whether ``statement`` contains an + assignment. + + Examples + -------- + >>> ds = xr.Dataset( + ... {"a": ("x", np.arange(0, 5, 1)), "b": ("x", np.linspace(0, 1, 5))} + ... ) + >>> ds + + Dimensions: (x: 5) + Dimensions without coordinates: x + Data variables: + a (x) int64 0 1 2 3 4 + b (x) float64 0.0 0.25 0.5 0.75 1.0 + + >>> ds.eval("a + b") + + array([0. , 1.25, 2.5 , 3.75, 5. ]) + Dimensions without coordinates: x + + >>> ds.eval("c = a + b") + + Dimensions: (x: 5) + Dimensions without coordinates: x + Data variables: + a (x) int64 0 1 2 3 4 + b (x) float64 0.0 0.25 0.5 0.75 1.0 + c (x) float64 0.0 1.25 2.5 3.75 5.0 + """ + + return pd.eval( + statement, + resolvers=[self], + target=self, + parser=parser, + # Because numexpr returns a numpy array, using that engine results in + # different behavior. We'd be very open to a contribution handling this. + engine="python", + ) + def query( self, queries: Mapping[Any, Any] | None = None, diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 17515744a31..664d108b89c 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -6718,6 +6718,23 @@ def test_query(self, backend, engine, parser) -> None: # pytest tests — new tests should go here, rather than in the class. +@pytest.mark.parametrize("parser", ["pandas", "python"]) +def test_eval(ds, parser) -> None: + """Currently much more minimal testing that `query` above, and much of the setup + isn't used. But the risks are fairly low — `query` shares much of the code, and + the method is currently experimental.""" + + actual = ds.eval("z1 + 5", parser=parser) + expect = ds["z1"] + 5 + assert_identical(expect, actual) + + # check pandas query syntax is supported + if parser == "pandas": + actual = ds.eval("(z1 > 5) and (z2 > 0)", parser=parser) + expect = (ds["z1"] > 5) & (ds["z2"] > 0) + assert_identical(expect, actual) + + @pytest.mark.parametrize("test_elements", ([1, 2], np.array([1, 2]), DataArray([1, 2]))) def test_isin(test_elements, backend) -> None: expected = Dataset(