From 74a0a8b072233fc0ca5e33e77ab8c146d1e647fa Mon Sep 17 00:00:00 2001
From: Bas des Tombe
Date: Mon, 16 Oct 2023 14:50:56 +0200
Subject: [PATCH] Passing all tests except lint and examples

---
 src/dtscalibration/__init__.py           |   6 -
 src/dtscalibration/datastore_accessor.py |   8 +-
 src/dtscalibration/datastore_utils.py    |  64 ++------
 src/dtscalibration/io.py                 | 182 -----------------------
 tests/test_datastore.py                  |  89 ++++-------
 5 files changed, 47 insertions(+), 302 deletions(-)

diff --git a/src/dtscalibration/__init__.py b/src/dtscalibration/__init__.py
index 66fdb7ac..856a02b9 100644
--- a/src/dtscalibration/__init__.py
+++ b/src/dtscalibration/__init__.py
@@ -1,11 +1,8 @@
-from dtscalibration.datastore import DataStore
 from dtscalibration.datastore_utils import check_dims
 from dtscalibration.datastore_utils import get_netcdf_encoding
 from dtscalibration.datastore_utils import merge_double_ended
 from dtscalibration.datastore_utils import shift_double_ended
 from dtscalibration.datastore_utils import suggest_cable_shift_double_ended
-from dtscalibration.io import open_datastore
-from dtscalibration.io import open_mf_datastore
 from dtscalibration.io import read_apsensing_files
 from dtscalibration.io import read_sensornet_files
 from dtscalibration.io import read_sensortran_files
@@ -18,9 +15,6 @@
 __version__ = "2.0.0"

 __all__ = [
-    "DataStore",
-    "open_datastore",
-    "open_mf_datastore",
     "read_apsensing_files",
     "read_sensornet_files",
     "read_sensortran_files",
diff --git a/src/dtscalibration/datastore_accessor.py b/src/dtscalibration/datastore_accessor.py
index 255f9b5a..3227ca74 100644
--- a/src/dtscalibration/datastore_accessor.py
+++ b/src/dtscalibration/datastore_accessor.py
@@ -46,7 +46,7 @@ def __repr__(self):
         # 'xarray' is prepended. so we remove it and add 'dtscalibration'
         s = xr.core.formatting.dataset_repr(self._obj)
         name_module = type(self._obj).__name__
-        preamble_new = "<dtscalibration.%s>" % name_module
+        preamble_new = f"<dtscalibration.{name_module}>"

         # Add sections to new preamble
         preamble_new += "\nSections:"
@@ -268,6 +268,12 @@ def get_default_encoding(self, time_chunks_from_key=None):

         return encoding

+    def get_timeseries_keys(self):
+        """
+        Returns a list of the keys of the time series variables.
+        """
+        return [k for k, v in self._obj.data_vars.items() if v.dims == ("time",)]
+
     def ufunc_per_section(
         self,
         sections=None,
diff --git a/src/dtscalibration/datastore_utils.py b/src/dtscalibration/datastore_utils.py
index 41709c32..25c86c9d 100644
--- a/src/dtscalibration/datastore_utils.py
+++ b/src/dtscalibration/datastore_utils.py
@@ -459,45 +459,8 @@ def check_deprecated_kwargs(kwargs):
     pass


-# def check_timestep_allclose(ds: "DataStore", eps: float = 0.05) -> None:
-# """
-# Check if all timesteps are of equal size. For now it is not possible to calibrate
-# over timesteps if the acquisition time of timesteps varies, as the Stokes variance
-# would change over time.
-
-# The acquisition time is stored for single ended measurements in userAcquisitionTime,
-# for double ended measurements in userAcquisitionTimeFW and userAcquisitionTimeBW.
-
-# Parameters
-# ----------
-# ds : DataStore
-# eps : float
-#     Default accepts 1% of relative variation between min and max acquisition time.
- -# Returns -# ------- -# """ -# dt = ds["userAcquisitionTimeFW"].data -# dtmin = dt.min() -# dtmax = dt.max() -# dtavg = (dtmin + dtmax) / 2 -# assert (dtmax - dtmin) / dtavg < eps, ( -# "Acquisition time is Forward channel not equal for all time steps" -# ) - -# if "userAcquisitionTimeBW" in ds: -# dt = ds["userAcquisitionTimeBW"].data -# dtmin = dt.min() -# dtmax = dt.max() -# dtavg = (dtmin + dtmax) / 2 -# assert (dtmax - dtmin) / dtavg < eps, ( -# "Acquisition time Backward channel is not equal for all time steps" -# ) -# pass - - def get_netcdf_encoding( - ds: "DataStore", zlib: bool = True, complevel: int = 5, **kwargs + ds: xr.Dataset, zlib: bool = True, complevel: int = 5, **kwargs ) -> dict: """Get default netcdf compression parameters. The same for each data variable. @@ -788,12 +751,12 @@ def get_params_from_pval_single_ended( def merge_double_ended( - ds_fw: "DataStore", - ds_bw: "DataStore", + ds_fw: xr.Dataset, + ds_bw: xr.Dataset, cable_length: float, plot_result: bool = True, verbose: bool = True, -) -> "DataStore": +) -> xr.Dataset: """ Some measurements are not set up on the DTS-device as double-ended meausurements. This means that the two channels have to be merged manually. @@ -858,11 +821,11 @@ def merge_double_ended( def merge_double_ended_times( - ds_fw: "DataStore", - ds_bw: "DataStore", + ds_fw: xr.Dataset, + ds_bw: xr.Dataset, verify_timedeltas: bool = True, verbose: bool = True, -) -> tuple["DataStore", "DataStore"]: +) -> tuple[xr.Dataset, xr.Dataset]: """Helper for `merge_double_ended()` to deal with missing measurements. The number of measurements of the forward and backward channels might get out of sync if the device shuts down before the measurement of the last channel @@ -997,8 +960,8 @@ def merge_double_ended_times( def shift_double_ended( - ds: "DataStore", i_shift: int, verbose: bool = True -) -> "DataStore": + ds: xr.Dataset, i_shift: int, verbose: bool = True +) -> xr.Dataset: """ The cable length was initially configured during the DTS measurement. For double ended measurements it is important to enter the correct length so that the forward channel and the @@ -1031,8 +994,6 @@ def shift_double_ended( ds2 : DataStore object With a shifted x-axis """ - from dtscalibration import DataStore - assert isinstance(i_shift, (int, np.integer)) nx = ds.x.size @@ -1074,11 +1035,11 @@ def shift_double_ended( if not_included and verbose: print("I dont know what to do with the following data", not_included) - return DataStore(data_vars=d2_data, coords=d2_coords, attrs=ds.attrs) + return xr.Dataset(data_vars=d2_data, coords=d2_coords, attrs=ds.attrs) def suggest_cable_shift_double_ended( - ds: "DataStore", + ds: xr.Dataset, irange: npt.NDArray[np.int_], plot_result: bool = True, **fig_kwargs, @@ -1107,8 +1068,7 @@ def suggest_cable_shift_double_ended( Parameters ---------- - ds : DataSore object - DataStore object that needs to be shifted + ds : Xarray Dataset irange : array-like a numpy array with data of type int. Containing all the shift index that are tested. 
diff --git a/src/dtscalibration/io.py b/src/dtscalibration/io.py index 8bc312e0..f06ca683 100644 --- a/src/dtscalibration/io.py +++ b/src/dtscalibration/io.py @@ -20,188 +20,6 @@ from dtscalibration.io_utils import ziphandle_to_filepathlist -def open_datastore( - filename_or_obj, - group=None, - decode_cf=True, - mask_and_scale=None, - decode_times=True, - concat_characters=True, - decode_coords=True, - engine=None, - chunks=None, - lock=None, - cache=None, - drop_variables=None, - backend_kwargs=None, - load_in_memory=False, - **kwargs, -): - """Load and decode a datastore from a file or file-like object. Most - arguments are passed to xarray.open_dataset(). - - Parameters - ---------- - filename_or_obj : str, Path, file or xarray.backends.*Dataset - Strings and Path objects are interpreted as a path to a netCDF file - or an OpenDAP URL and opened with python-netCDF4, unless the filename - ends with .gz, in which case the file is gunzipped and opened with - scipy.io.netcdf (only netCDF3 supported). File-like objects are opened - with scipy.io.netcdf (only netCDF3 supported). - group : str, optional - Path to the netCDF4 group in the given file to open (only works for - netCDF4 files). - decode_cf : bool, optional - Whether to decode these variables, assuming they were saved according - to CF conventions. - mask_and_scale : bool, optional - If True, replace array values equal to `_FillValue` with NA and scale - values according to the formula `original_values * scale_factor + - add_offset`, where `_FillValue`, `scale_factor` and `add_offset` are - taken from variable attributes (if they exist). If the `_FillValue` or - `missing_value` attribute contains multiple values a warning will be - issued and all array values matching one of the multiple values will - be replaced by NA. mask_and_scale defaults to True except for the - pseudonetcdf backend. - decode_times : bool, optional - If True, decode times encoded in the standard NetCDF datetime format - into datetime objects. Otherwise, leave them encoded as numbers. - concat_characters : bool, optional - If True, concatenate along the last dimension of character arrays to - form string arrays. Dimensions will only be concatenated over (and - removed) if they have no corresponding variable and if they are only - used as the last dimension of character arrays. - decode_coords : bool, optional - If True, decode the 'coordinates' attribute to identify coordinates in - the resulting dataset. - engine : {'netcdf4', 'scipy', 'pydap', 'h5netcdf', 'pynio', - 'pseudonetcdf'}, optional - Engine to use when reading files. If not provided, the default engine - is chosen based on available dependencies, with a preference for - 'netcdf4'. - chunks : int or dict, optional - If chunks is provided, it used to load the new dataset into dask - arrays. ``chunks={}`` loads the dataset with dask using a single - chunk for all arrays. - lock : False, True or threading.Lock, optional - If chunks is provided, this argument is passed on to - :py:func:`dask.array.from_array`. By default, a global lock is - used when reading data from netCDF files with the netcdf4 and h5netcdf - engines to avoid issues with concurrent access when using dask's - multithreaded backend. - cache : bool, optional - If True, cache data loaded from the underlying datastore in memory as - NumPy arrays when accessed to avoid reading from the underlying data- - store multiple times. Defaults to True unless you specify the `chunks` - argument to use dask, in which case it defaults to False. 
Does not - change the behavior of coordinates corresponding to dimensions, which - always load their data from disk into a ``pandas.Index``. - drop_variables: string or iterable, optional - A variable or list of variables to exclude from being parsed from the - dataset. This may be useful to drop variables with problems or - inconsistent values. - backend_kwargs: dictionary, optional - A dictionary of keyword arguments to pass on to the backend. This - may be useful when backend options would improve performance or - allow user control of dataset processing. - - Returns - ------- - dataset : Dataset - The newly created dataset. - - See Also - -------- - xarray.open_dataset - xarray.load_dataset - """ - - xr_kws = inspect.signature(xr.open_dataset).parameters.keys() - - ds_kwargs = {k: v for k, v in kwargs.items() if k not in xr_kws} - - if chunks is None: - chunks = {} - - with xr.open_dataset( - filename_or_obj, - group=group, - decode_cf=decode_cf, - mask_and_scale=mask_and_scale, - decode_times=decode_times, - concat_characters=concat_characters, - decode_coords=decode_coords, - engine=engine, - chunks=chunks, - lock=lock, - cache=cache, - drop_variables=drop_variables, - backend_kwargs=backend_kwargs, - ) as ds_xr: - ds = Dataset( - data_vars=ds_xr.data_vars, - coords=ds_xr.coords, - attrs=ds_xr.attrs, - **ds_kwargs, - ) - - # to support deprecated st_labels - ds = ds.rename_labels(assertion=False) - - if load_in_memory: - if "cache" in kwargs: - raise TypeError("cache has no effect in this context") - return ds.load() - - else: - return ds - - -def open_mf_datastore( - path=None, paths=None, combine="by_coords", load_in_memory=False, **kwargs -): - """ - Open a datastore from multiple netCDF files. This script assumes the - datastore was split along the time dimension. But only variables with a - time dimension should be concatenated in the time dimension. Other - options from xarray do not support this. - - Parameters - ---------- - combine : {'by_coords', 'nested'}, optional - Leave it at by_coords - path : str - A file path to the stored netcdf files with an asterisk in the - filename to list all. Ensure you have leading zeros in the file - numbering. - paths : list - Define you own list of file paths. - Returns - ------- - dataset : Dataset - The newly created dataset. 
- """ - from xarray.backends.api import open_mfdataset - - if paths is None: - paths = sorted(glob.glob(path)) - assert paths, "No files match found with: " + path - - with open_mfdataset(paths=paths, combine=combine, **kwargs) as xds: - ds = Dataset(data_vars=xds.data_vars, coords=xds.coords, attrs=xds.attrs) - - # to support deprecated st_labels - ds = ds.rename_labels(assertion=False) - - if load_in_memory: - if "cache" in kwargs: - raise TypeError("cache has no effect in this context") - return ds.load() - - else: - return ds - - def read_silixa_files( filepathlist=None, directory=None, diff --git a/tests/test_datastore.py b/tests/test_datastore.py index 5620f54b..8b7e5461 100644 --- a/tests/test_datastore.py +++ b/tests/test_datastore.py @@ -8,18 +8,21 @@ import dask.array as da import numpy as np import pytest +from xarray import Dataset +import xarray as xr -from dtscalibration import open_datastore -from dtscalibration import open_mf_datastore from dtscalibration import read_apsensing_files from dtscalibration import read_sensornet_files from dtscalibration import read_sensortran_files from dtscalibration import read_silixa_files from dtscalibration.calibration.section_utils import set_sections +from dtscalibration.calibration.section_utils import set_matching_sections from dtscalibration.datastore_utils import merge_double_ended from dtscalibration.datastore_utils import shift_double_ended from dtscalibration.datastore_utils import suggest_cable_shift_double_ended +from dtscalibration.datastore_accessor import DtsAccessor # noqa: F401 + np.random.seed(0) fn = [ @@ -120,24 +123,23 @@ def test_read_data_from_single_file_single_ended(): assert actual_hash == desired_hash, "The data is not read correctly" -def test_empty_construction(): - ds = DataStore() # noqa: F841 - - def test_repr(): - ds = DataStore() - assert "dtscalibration" in str(ds) - assert "Sections" in str(ds) - + ds = Dataset( + { + "st": (["x", "time"], np.ones((100, 5))), + "ast": (["x", "time"], np.ones((100, 5))), + "probe1Temperature": (["time"], range(5)), + "probe2Temperature": (["time"], range(5)), + }, + coords={"x": range(100), "time": range(5)}, + ) -def test_has_sectionattr_upon_creation(): - ds = DataStore() - assert hasattr(ds, "_sections") - assert isinstance(ds._sections, str) + assert "dtscalibration" in str(ds.dts) + assert "Sections" in str(ds.dts) def test_sections_property(): - ds = DataStore( + ds = Dataset( { "st": (["x", "time"], np.ones((100, 5))), "ast": (["x", "time"], np.ones((100, 5))), @@ -155,15 +157,15 @@ def test_sections_property(): "probe1Temperature": [slice(0.0, 17.0), slice(70.0, 80.0)], # cold bath "probe2Temperature": [slice(24.0, 34.0), slice(85.0, 95.0)], # warm bath } - ds = set_sections(ds, sections1) + set_sections(ds, sections1) assert isinstance(ds.attrs["_sections"], str) - assert ds.sections == sections1 - assert ds.sections != sections2 + assert ds.dts.sections == sections1 + assert ds.dts.sections != sections2 # test if accepts singleton numpy arrays - ds = set_sections( + set_sections( ds, { "probe1Temperature": [ @@ -175,7 +177,7 @@ def test_sections_property(): def test_io_sections_property(): - ds = DataStore( + ds = Dataset( { "st": (["x", "time"], np.ones((100, 5))), "ast": (["x", "time"], np.ones((100, 5))), @@ -191,7 +193,7 @@ def test_io_sections_property(): } ds["x"].attrs["units"] = "m" - ds = set_sections(ds, sections) + set_sections(ds, sections) # Create a temporary file to write data to. 
     # 'with' method is used so the file is closed by tempfile
@@ -203,14 +205,14 @@
     ds.to_netcdf(path=temppath)

     try:
-        ds2 = open_datastore(temppath)
+        ds2 = xr.open_dataset(temppath)
     except ValueError as e:
         if str(e) != "cannot guess the engine, try passing one explicitly":
             raise
         warnings.warn("Could not guess engine, defaulted to netcdf4")
-        ds2 = open_datastore(temppath, engine="netcdf4")
+        ds2 = xr.open_dataset(temppath, engine="netcdf4")

-    assert ds.sections == ds2.sections
+    assert ds.dts.sections == ds2.dts.sections

     # Close the datastore so the temp file can be removed
     ds2.close()

@@ -493,41 +495,6 @@ def test_read_sensortran_files():
     )


-def test_to_mf_netcdf_open_mf_datastore():
-    filepath = data_dir_single_ended
-    ds = read_silixa_files(directory=filepath, file_ext="*.xml")
-
-    with tempfile.TemporaryDirectory() as tmpdirname:
-        print("created temporary directory", tmpdirname)
-
-        # work around the effects of deafault encoding.
-        path = os.path.join(tmpdirname, "ds_merged.nc")
-
-        with read_silixa_files(directory=filepath, file_ext="*.xml") as ds:
-            ds.to_netcdf(path)
-
-        time.sleep(5)  # to ensure all is written on Windows and file released
-
-        with open_datastore(path, load_in_memory=True) as ds1:
-            # Test saving
-            ds1 = ds1.chunk({"time": 1})
-            ds1.to_mf_netcdf(
-                folder_path=tmpdirname,
-                filename_preamble="file_",
-                filename_extension=".nc",
-            )
-            correct_val = float(ds1.st.sum())
-
-        time.sleep(2)  # to ensure all is written on Windows and file released
-
-        # Test loading
-        path = os.path.join(tmpdirname, "file_*.nc")
-
-        with open_mf_datastore(path=path, load_in_memory=True) as ds2:
-            test_val = float(ds2.st.sum())
-            np.testing.assert_equal(correct_val, test_val)
-
-
 def read_data_from_fp_numpy(fp):
     """
     Read the data from a single Silixa xml file. Using a simple approach
@@ -565,7 +532,7 @@ def test_resample_datastore():
     ds = read_silixa_files(directory=filepath, timezone_netcdf="UTC", file_ext="*.xml")
     assert ds.time.size == 3

-    ds_resampled = DataStore(ds.resample(time="47S").mean())
+    ds_resampled = Dataset(ds.resample(time="47S").mean())

     assert ds_resampled.time.size == 2
     assert ds_resampled.st.dims == ("x", "time"), (
@@ -580,7 +547,7 @@ def test_timeseries_keys():
     filepath = data_dir_single_ended
     ds = read_silixa_files(directory=filepath, timezone_netcdf="UTC", file_ext="*.xml")

-    k = ds.timeseries_keys
+    k = ds.dts.get_timeseries_keys()

     # no false positive
     for ki in k:
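
Note for reviewers: below is a minimal usage sketch of the accessor-based API these tests exercise. It mirrors test_repr, test_sections_property and test_timeseries_keys, and assumes that importing DtsAccessor registers the `dts` accessor on xarray Datasets, as the noqa'd import at the top of the test module does. It is illustrative only and not part of the patch.

    import numpy as np
    import xarray as xr

    from dtscalibration.calibration.section_utils import set_sections
    from dtscalibration.datastore_accessor import DtsAccessor  # noqa: F401  # registers ds.dts

    # Build the same kind of synthetic dataset the tests use.
    ds = xr.Dataset(
        {
            "st": (["x", "time"], np.ones((100, 5))),
            "ast": (["x", "time"], np.ones((100, 5))),
            "probe1Temperature": (["time"], np.arange(5.0)),
        },
        coords={"x": np.arange(100.0), "time": np.arange(5)},
    )
    ds["x"].attrs["units"] = "m"

    # Sections are stored on the Dataset itself and read back through the accessor,
    # so they survive a plain netCDF round trip (see test_io_sections_property).
    set_sections(ds, {"probe1Temperature": [slice(0.0, 17.0), slice(70.0, 80.0)]})

    print(ds.dts.sections)               # the dict passed to set_sections above
    print(ds.dts.get_timeseries_keys())  # ["probe1Temperature"]; st/ast have dims (x, time) and are excluded
    print(ds.dts)                        # repr mentions "dtscalibration" and lists the sections

Because the data stays an ordinary xarray.Dataset, standard xarray I/O (xr.open_dataset, Dataset.to_netcdf) is used directly, which is what the updated test_io_sections_property checks.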