Commit 74a0a8b

Passing all tests except lint and examples
bdestombe committed Oct 16, 2023
1 parent 1825050 commit 74a0a8b
Showing 5 changed files with 47 additions and 302 deletions.
6 changes: 0 additions & 6 deletions src/dtscalibration/__init__.py
@@ -1,11 +1,8 @@
-from dtscalibration.datastore import DataStore
 from dtscalibration.datastore_utils import check_dims
 from dtscalibration.datastore_utils import get_netcdf_encoding
 from dtscalibration.datastore_utils import merge_double_ended
 from dtscalibration.datastore_utils import shift_double_ended
 from dtscalibration.datastore_utils import suggest_cable_shift_double_ended
-from dtscalibration.io import open_datastore
-from dtscalibration.io import open_mf_datastore
 from dtscalibration.io import read_apsensing_files
 from dtscalibration.io import read_sensornet_files
 from dtscalibration.io import read_sensortran_files
@@ -18,9 +15,6 @@

 __version__ = "2.0.0"
 __all__ = [
-    "DataStore",
-    "open_datastore",
-    "open_mf_datastore",
     "read_apsensing_files",
     "read_sensornet_files",
     "read_sensortran_files",
8 changes: 7 additions & 1 deletion src/dtscalibration/datastore_accessor.py
@@ -46,7 +46,7 @@ def __repr__(self):
         # 'xarray' is prepended. so we remove it and add 'dtscalibration'
         s = xr.core.formatting.dataset_repr(self._obj)
         name_module = type(self._obj).__name__
-        preamble_new = "<dtscalibration.%s>" % name_module
+        preamble_new = f"<dtscalibration.{name_module}>"

         # Add sections to new preamble
         preamble_new += "\nSections:"
@@ -268,6 +268,12 @@ def get_default_encoding(self, time_chunks_from_key=None):

         return encoding

+    def get_timeseries_keys(self):
+        """
+        Returns a list of the keys of the time series variables.
+        """
+        return [k for k, v in self._obj.data_vars.items() if v.dims == ("time",)]
+
     def ufunc_per_section(
         self,
         sections=None,
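For reference, a minimal sketch of how the new get_timeseries_keys() accessor method might be used. The accessor name ("dts") and the example dataset are assumptions for illustration only; the list comprehension at the end mirrors the added code.

import numpy as np
import xarray as xr

# Hypothetical example dataset: one (x, time) variable and one time-only
# variable, loosely mimicking a DTS measurement.
ds = xr.Dataset(
    data_vars={
        "st": (("x", "time"), np.random.rand(100, 5)),
        "userAcquisitionTimeFW": (("time",), np.full(5, 30.0)),
    },
    coords={"x": np.linspace(0.0, 100.0, 100), "time": np.arange(5)},
)

# With the dtscalibration accessor registered (the name "dts" is an assumption),
# the new method returns only the variables defined along ("time",):
#   ds.dts.get_timeseries_keys()  ->  ["userAcquisitionTimeFW"]

# The same selection without the accessor, identical to the added code:
timeseries_keys = [k for k, v in ds.data_vars.items() if v.dims == ("time",)]
print(timeseries_keys)  # ['userAcquisitionTimeFW']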
64 changes: 12 additions & 52 deletions src/dtscalibration/datastore_utils.py
@@ -459,45 +459,8 @@ def check_deprecated_kwargs(kwargs):
     pass


-# def check_timestep_allclose(ds: "DataStore", eps: float = 0.05) -> None:
-#     """
-#     Check if all timesteps are of equal size. For now it is not possible to calibrate
-#     over timesteps if the acquisition time of timesteps varies, as the Stokes variance
-#     would change over time.
-
-#     The acquisition time is stored for single ended measurements in userAcquisitionTime,
-#     for double ended measurements in userAcquisitionTimeFW and userAcquisitionTimeBW.
-
-#     Parameters
-#     ----------
-#     ds : DataStore
-#     eps : float
-#         Default accepts 1% of relative variation between min and max acquisition time.
-
-#     Returns
-#     -------
-#     """
-#     dt = ds["userAcquisitionTimeFW"].data
-#     dtmin = dt.min()
-#     dtmax = dt.max()
-#     dtavg = (dtmin + dtmax) / 2
-#     assert (dtmax - dtmin) / dtavg < eps, (
-#         "Acquisition time is Forward channel not equal for all time steps"
-#     )
-
-#     if "userAcquisitionTimeBW" in ds:
-#         dt = ds["userAcquisitionTimeBW"].data
-#         dtmin = dt.min()
-#         dtmax = dt.max()
-#         dtavg = (dtmin + dtmax) / 2
-#         assert (dtmax - dtmin) / dtavg < eps, (
-#             "Acquisition time Backward channel is not equal for all time steps"
-#         )
-#     pass
-
-
 def get_netcdf_encoding(
-    ds: "DataStore", zlib: bool = True, complevel: int = 5, **kwargs
+    ds: xr.Dataset, zlib: bool = True, complevel: int = 5, **kwargs
 ) -> dict:
     """Get default netcdf compression parameters. The same for each data variable.
@@ -788,12 +751,12 @@ def get_params_from_pval_single_ended(


 def merge_double_ended(
-    ds_fw: "DataStore",
-    ds_bw: "DataStore",
+    ds_fw: xr.Dataset,
+    ds_bw: xr.Dataset,
     cable_length: float,
     plot_result: bool = True,
     verbose: bool = True,
-) -> "DataStore":
+) -> xr.Dataset:
     """
     Some measurements are not set up on the DTS-device as double-ended
     meausurements. This means that the two channels have to be merged manually.
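For context, a sketch of how merge_double_ended might be called on two single-ended datasets after this change. Only the signature above is taken from the diff; the input files and the cable length value are hypothetical.

import xarray as xr
from dtscalibration.datastore_utils import merge_double_ended

# ds_fw and ds_bw are assumed to be xr.Dataset objects holding the forward
# and backward channel measurements (hypothetical file names).
ds_fw = xr.open_dataset("channel_1.nc")
ds_bw = xr.open_dataset("channel_2.nc")

# cable_length is the length of the fiber; see the full docstring for its
# exact meaning.
ds = merge_double_ended(ds_fw, ds_bw, cable_length=100.0, plot_result=False)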
@@ -858,11 +821,11 @@ def merge_double_ended(


 def merge_double_ended_times(
-    ds_fw: "DataStore",
-    ds_bw: "DataStore",
+    ds_fw: xr.Dataset,
+    ds_bw: xr.Dataset,
     verify_timedeltas: bool = True,
     verbose: bool = True,
-) -> tuple["DataStore", "DataStore"]:
+) -> tuple[xr.Dataset, xr.Dataset]:
     """Helper for `merge_double_ended()` to deal with missing measurements. The
     number of measurements of the forward and backward channels might get out
     of sync if the device shuts down before the measurement of the last channel
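A brief sketch of this helper with its updated signature, reusing ds_fw and ds_bw from the previous sketch. Whether end users call it directly rather than via merge_double_ended is not stated in this diff.

from dtscalibration.datastore_utils import merge_double_ended_times

# Align the time axes of the two channels (presumably by dropping
# measurements that have no counterpart in the other channel).
ds_fw_aligned, ds_bw_aligned = merge_double_ended_times(
    ds_fw, ds_bw, verify_timedeltas=True, verbose=True
)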
@@ -997,8 +960,8 @@ def merge_double_ended_times(


 def shift_double_ended(
-    ds: "DataStore", i_shift: int, verbose: bool = True
-) -> "DataStore":
+    ds: xr.Dataset, i_shift: int, verbose: bool = True
+) -> xr.Dataset:
     """
     The cable length was initially configured during the DTS measurement. For double ended
     measurements it is important to enter the correct length so that the forward channel and the
@@ -1031,8 +994,6 @@ def shift_double_ended(
     ds2 : DataStore object
         With a shifted x-axis
     """
-    from dtscalibration import DataStore
-
     assert isinstance(i_shift, (int, np.integer))

     nx = ds.x.size
@@ -1074,11 +1035,11 @@ def shift_double_ended(
     if not_included and verbose:
         print("I dont know what to do with the following data", not_included)

-    return DataStore(data_vars=d2_data, coords=d2_coords, attrs=ds.attrs)
+    return xr.Dataset(data_vars=d2_data, coords=d2_coords, attrs=ds.attrs)


 def suggest_cable_shift_double_ended(
-    ds: "DataStore",
+    ds: xr.Dataset,
     irange: npt.NDArray[np.int_],
     plot_result: bool = True,
     **fig_kwargs,
@@ -1107,8 +1068,7 @@ def suggest_cable_shift_double_ended(
     Parameters
     ----------
-    ds : DataSore object
-        DataStore object that needs to be shifted
+    ds : Xarray Dataset
     irange : array-like
         a numpy array with data of type int. Containing all the shift index
         that are tested.
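A short sketch combining suggest_cable_shift_double_ended and shift_double_ended under the updated signatures. The input file, the shift range, the chosen shift of 3 samples, and the structure of the return value are all assumptions, since none of them is shown in this diff.

import numpy as np
import xarray as xr
from dtscalibration.datastore_utils import (
    shift_double_ended,
    suggest_cable_shift_double_ended,
)

# Hypothetical double-ended measurement stored as netCDF.
ds = xr.open_dataset("double_ended_measurement.nc")

# Evaluate candidate integer shifts between -10 and +10 samples. The return
# value is only printed here because its exact form is not shown above.
suggestion = suggest_cable_shift_double_ended(
    ds, irange=np.arange(-10, 11), plot_result=False
)
print(suggestion)

# Apply a shift picked by hand (3 samples) to line up the two channels.
ds_shifted = shift_double_ended(ds, i_shift=3, verbose=True)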
182 changes: 0 additions & 182 deletions src/dtscalibration/io.py
@@ -20,188 +20,6 @@
from dtscalibration.io_utils import ziphandle_to_filepathlist


def open_datastore(
filename_or_obj,
group=None,
decode_cf=True,
mask_and_scale=None,
decode_times=True,
concat_characters=True,
decode_coords=True,
engine=None,
chunks=None,
lock=None,
cache=None,
drop_variables=None,
backend_kwargs=None,
load_in_memory=False,
**kwargs,
):
"""Load and decode a datastore from a file or file-like object. Most
arguments are passed to xarray.open_dataset().
Parameters
----------
filename_or_obj : str, Path, file or xarray.backends.*Dataset
Strings and Path objects are interpreted as a path to a netCDF file
or an OpenDAP URL and opened with python-netCDF4, unless the filename
ends with .gz, in which case the file is gunzipped and opened with
scipy.io.netcdf (only netCDF3 supported). File-like objects are opened
with scipy.io.netcdf (only netCDF3 supported).
group : str, optional
Path to the netCDF4 group in the given file to open (only works for
netCDF4 files).
decode_cf : bool, optional
Whether to decode these variables, assuming they were saved according
to CF conventions.
mask_and_scale : bool, optional
If True, replace array values equal to `_FillValue` with NA and scale
values according to the formula `original_values * scale_factor +
add_offset`, where `_FillValue`, `scale_factor` and `add_offset` are
taken from variable attributes (if they exist). If the `_FillValue` or
`missing_value` attribute contains multiple values a warning will be
issued and all array values matching one of the multiple values will
be replaced by NA. mask_and_scale defaults to True except for the
pseudonetcdf backend.
decode_times : bool, optional
If True, decode times encoded in the standard NetCDF datetime format
into datetime objects. Otherwise, leave them encoded as numbers.
concat_characters : bool, optional
If True, concatenate along the last dimension of character arrays to
form string arrays. Dimensions will only be concatenated over (and
removed) if they have no corresponding variable and if they are only
used as the last dimension of character arrays.
decode_coords : bool, optional
If True, decode the 'coordinates' attribute to identify coordinates in
the resulting dataset.
engine : {'netcdf4', 'scipy', 'pydap', 'h5netcdf', 'pynio',
'pseudonetcdf'}, optional
Engine to use when reading files. If not provided, the default engine
is chosen based on available dependencies, with a preference for
'netcdf4'.
chunks : int or dict, optional
If chunks is provided, it used to load the new dataset into dask
arrays. ``chunks={}`` loads the dataset with dask using a single
chunk for all arrays.
lock : False, True or threading.Lock, optional
If chunks is provided, this argument is passed on to
:py:func:`dask.array.from_array`. By default, a global lock is
used when reading data from netCDF files with the netcdf4 and h5netcdf
engines to avoid issues with concurrent access when using dask's
multithreaded backend.
cache : bool, optional
If True, cache data loaded from the underlying datastore in memory as
NumPy arrays when accessed to avoid reading from the underlying data-
store multiple times. Defaults to True unless you specify the `chunks`
argument to use dask, in which case it defaults to False. Does not
change the behavior of coordinates corresponding to dimensions, which
always load their data from disk into a ``pandas.Index``.
drop_variables: string or iterable, optional
A variable or list of variables to exclude from being parsed from the
dataset. This may be useful to drop variables with problems or
inconsistent values.
backend_kwargs: dictionary, optional
A dictionary of keyword arguments to pass on to the backend. This
may be useful when backend options would improve performance or
allow user control of dataset processing.
Returns
-------
dataset : Dataset
The newly created dataset.
See Also
--------
xarray.open_dataset
xarray.load_dataset
"""

xr_kws = inspect.signature(xr.open_dataset).parameters.keys()

ds_kwargs = {k: v for k, v in kwargs.items() if k not in xr_kws}

if chunks is None:
chunks = {}

with xr.open_dataset(
filename_or_obj,
group=group,
decode_cf=decode_cf,
mask_and_scale=mask_and_scale,
decode_times=decode_times,
concat_characters=concat_characters,
decode_coords=decode_coords,
engine=engine,
chunks=chunks,
lock=lock,
cache=cache,
drop_variables=drop_variables,
backend_kwargs=backend_kwargs,
) as ds_xr:
ds = Dataset(
data_vars=ds_xr.data_vars,
coords=ds_xr.coords,
attrs=ds_xr.attrs,
**ds_kwargs,
)

# to support deprecated st_labels
ds = ds.rename_labels(assertion=False)

if load_in_memory:
if "cache" in kwargs:
raise TypeError("cache has no effect in this context")
return ds.load()

else:
return ds


def open_mf_datastore(
path=None, paths=None, combine="by_coords", load_in_memory=False, **kwargs
):
"""
Open a datastore from multiple netCDF files. This script assumes the
datastore was split along the time dimension. But only variables with a
time dimension should be concatenated in the time dimension. Other
options from xarray do not support this.
Parameters
----------
combine : {'by_coords', 'nested'}, optional
Leave it at by_coords
path : str
A file path to the stored netcdf files with an asterisk in the
filename to list all. Ensure you have leading zeros in the file
numbering.
paths : list
Define you own list of file paths.
Returns
-------
dataset : Dataset
The newly created dataset.
"""
from xarray.backends.api import open_mfdataset

if paths is None:
paths = sorted(glob.glob(path))
assert paths, "No files match found with: " + path

with open_mfdataset(paths=paths, combine=combine, **kwargs) as xds:
ds = Dataset(data_vars=xds.data_vars, coords=xds.coords, attrs=xds.attrs)

# to support deprecated st_labels
ds = ds.rename_labels(assertion=False)

if load_in_memory:
if "cache" in kwargs:
raise TypeError("cache has no effect in this context")
return ds.load()

else:
return ds


def read_silixa_files(
filepathlist=None,
directory=None,
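Since open_datastore and open_mf_datastore are removed here in favour of plain xarray datasets, the closest xarray-native equivalents would presumably be the following (a sketch; the file paths are hypothetical and the exact migration path is not spelled out in this commit).

import xarray as xr

# Single netCDF file, roughly replacing open_datastore.
ds = xr.open_dataset("measurement.nc")

# Multiple files split along the time dimension, roughly replacing
# open_mf_datastore; combine="by_coords" matches the removed default.
ds_multi = xr.open_mfdataset("measurements_*.nc", combine="by_coords")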