Skip to content

Commit

Permalink
Regrid dataset with dask variables (JiaweiZhuang#122)
Browse files Browse the repository at this point in the history
* Fix JiaweiZhuang#121

* upd changes

* linting

* Fix shapely deprecation - manually cast dask arrays on datasets

* linting

* Add sparse to reqs

* Revert adding sparse to reqs - add to mock import

* Bump rtd to py3.8

* Pin docutils

* Changes after review

* lint

Co-authored-by: David Huard <[email protected]>
  • Loading branch information
aulemahal and huard committed Nov 10, 2021
1 parent 2feddcf commit bfdb40f
Show file tree
Hide file tree
Showing 4 changed files with 45 additions and 3 deletions.
1 change: 1 addition & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ What's new

Bug fixes
~~~~~~~~~
- Regridding datasets with dask-backed variables is fixed. Dtype of the outputs is changed for this specific case. By `Pascal Bourgault <https://github.com/aulemahal>`_
- SpatialAverager did not compute the same weights as Regridder when source cell areas were not uniform (:pull:`128`). By `David Huard <https://github.com/huard>`_


Expand Down
1 change: 1 addition & 0 deletions doc/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@

# To avoid installing xESMF and all its dependencies when building doc
# https://stackoverflow.com/a/15912502/8729698

autodoc_mock_imports = ['numpy', 'xarray', 'scipy', 'ESMF', 'dask', 'sparse', 'numba']

# avoid automatic execution for notebooks
Expand Down
25 changes: 22 additions & 3 deletions xesmf/frontend.py
Original file line number Diff line number Diff line change
Expand Up @@ -432,7 +432,7 @@ def __call__(self, indata, keep_attrs=False, skipna=False, na_thres=1.0):
Returns
-------
outdata : Data type is the same as input data type.
outdata : Data type is the same as input data type, except for datasets.
On the same horizontal grid as ``ds_out``,
with extra dims in ``dr_in``.
Expand All @@ -443,6 +443,10 @@ def __call__(self, indata, keep_attrs=False, skipna=False, na_thres=1.0):
- (n_time, n_lev, n_y_out, n_x_out), if ``dr_in`` has shape
(n_time, n_lev, n_y, n_x)
Datasets with dask-backed variables will have modified dtypes.
If all input variables are 'float32', all outputs will be 'float32';
in any other case, all outputs will be 'float64'.
"""
if isinstance(indata, np.ndarray):
return self.regrid_numpy(indata, skipna=skipna, na_thres=na_thres)
Expand Down Expand Up @@ -560,7 +564,15 @@ def regrid_dataset(self, ds_in, keep_attrs=False, skipna=False, na_thres=1.0):
]
ds_in = ds_in.drop_vars(non_regriddable)

ds_dtypes = [d.dtype for d in ds_in.data_vars.values()]
# Select dtype (only relevant for dask-backed arrays, this has no effect otherwise)
dtypes = set(da.dtype for da in ds_in.data_vars.values())
if len(dtypes) == 1:
out_dtype = dtypes.pop()
else:
size = 1 # init to 1 byte
for d in dtypes:
size = max(size, d.itemsize) # use max number of bytes found in dtypes
out_dtype = np.dtype(f'f{size}')

ds_out = xr.apply_ufunc(
self._regrid_array,
Expand All @@ -570,7 +582,7 @@ def regrid_dataset(self, ds_in, keep_attrs=False, skipna=False, na_thres=1.0):
input_core_dims=[input_horiz_dims, ('out_dim', 'in_dim')],
output_core_dims=[temp_horiz_dims],
dask='parallelized',
output_dtypes=ds_dtypes,
output_dtypes=[out_dtype],
dask_gufunc_kwargs={
'output_sizes': {
temp_horiz_dims[0]: self.shape_out[0],
Expand All @@ -580,6 +592,13 @@ def regrid_dataset(self, ds_in, keep_attrs=False, skipna=False, na_thres=1.0):
keep_attrs=keep_attrs,
)

# For dask-backed, we force the dtype to fit the input's
# (numpy-backed variables already have been handled in `apply_weights`)
for name, data in ds_out.data_vars.items():
indtype = ds_in[name].dtype
if xr.core.common.is_duck_dask_array(data.data) and data.dtype != indtype:
ds_out[name] = data.astype(indtype, keep_attrs=True)

return self._format_xroutput(ds_out, temp_horiz_dims)

def _parse_xrinput(self, dr_in):
Expand Down
21 changes: 21 additions & 0 deletions xesmf/tests/test_frontend.py
Original file line number Diff line number Diff line change
Expand Up @@ -575,6 +575,27 @@ def test_regrid_dataset():
xr.testing.assert_identical(ds_result2, ds_result)


@pytest.mark.parametrize('scheduler', dask_schedulers)
def test_regrid_dataset_dask(request, scheduler):
    """Regridding a dask-backed Dataset stays lazy and preserves variables and dtypes."""
    scheduler = request.getfixturevalue(scheduler)
    # xarray.Dataset containing dask array
    regridder = xe.Regridder(ds_in, ds_out, 'conservative')

    # `ds_out` already refers to output grid object
    result = regridder(ds_in.chunk())

    # output should contain all data variables
    assert set(result.data_vars) == set(ds_in.data_vars)
    assert dask.is_dask_collection(result)
    assert result.data.dtype == ds_in.data.dtype

    # When every input variable is float32, the output stays float32.
    ds_in_f32 = ds_in.copy()
    ds_in_f32['data'] = ds_in_f32.data.astype('float32')
    ds_in_f32['data4D'] = ds_in_f32.data4D.astype('float32')
    result = regridder(ds_in_f32.chunk())
    assert result.data.dtype == 'float32'


def test_regrid_dataset_to_locstream():
# xarray.Dataset containing in-memory numpy array

Expand Down

0 comments on commit bfdb40f

Please sign in to comment.