Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/issue 189 #190

Merged
merged 5 commits into from
Sep 12, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Removed
### Fixed
- [issue/119](https://github.com/podaac/l2ss-py/issues/119): GPM variable dimensions are renamed from "phony_dim" to the dimension names in the variable attribute "DimensionNames"
- [issue/189](https://github.com/podaac/l2ss-py/issues/189): Fix temporal subsetting for SWOT collections, use mask_and_scale args for opening granule file if we have an overflow in time fill value, use original dataset encoding when writing file.

### Security


Expand Down
57 changes: 40 additions & 17 deletions podaac/subsetter/subset.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
import os
from itertools import zip_longest
from typing import List, Tuple, Union
import traceback
import dateutil
from dateutil import parser

Expand Down Expand Up @@ -1065,6 +1066,33 @@ def decode_cf_datetime(num_dates, units, calendar=None, use_cftime=None):
xarray.coding.times.decode_cf_datetime = decode_cf_datetime


def open_dataset_test(file, args):
    """
    Probe whether a NetCDF file opens cleanly with the given xarray arguments.

    Performs a trial ``xr.open_dataset`` call with *args*. If the attempt
    fails with the known OverflowError signature produced by an undecodable
    'time' fill value, the 'mask_and_scale' entry of *args* is flipped to
    True so that a subsequent real open can succeed.

    Args:
        file (str): Path to the NetCDF file.
        args (dict): Keyword arguments for xr.open_dataset; may be mutated.

    Returns:
        None: The function modifies the 'args' dictionary in place.

    """
    try:
        # Trial open only -- the dataset handle is discarded immediately.
        with xr.open_dataset(file, **args):
            pass
    except Exception:  # pylint: disable=broad-except
        # Inspect the rendered traceback text for the failure signature.
        trace_text = traceback.format_exc()

        overflow_seen = "Python int too large to convert to C long" in trace_text
        time_decode_seen = "Failed to decode variable 'time': unable to decode time units" in trace_text

        # Only the time-variable overflow case triggers the fallback;
        # every other open failure is deliberately ignored here.
        if overflow_seen and time_decode_seen:
            args["mask_and_scale"] = True
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

By setting mask_and_scale to true, the resulting granule will have no scale/offset attributes because those will be applied to the data variables. Ideally we'd maintain the old behavior (where the subsetted granule has the same scale/offset attributes) -- would such a thing be possible? The hard way to do this would be to re-apply the scale/offset to each data variable, but maybe there's an easier way.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That is correct. I'm not sure if we can undo this; I haven't found any other way to open the file using xarray without the mask_and_scale option.



def subset(file_to_subset: str, bbox: np.ndarray, output_file: str,
variables: Union[List[str], str, None] = (),
# pylint: disable=too-many-branches, disable=too-many-statements
Expand Down Expand Up @@ -1162,10 +1190,15 @@ def subset(file_to_subset: str, bbox: np.ndarray, output_file: str,

if min_time or max_time:
args['decode_times'] = True
open_dataset_test(file_to_subset, args)

with xr.open_dataset(
xr.backends.NetCDF4DataStore(nc_dataset),
**args
) as dataset:

original_dataset = dataset

lat_var_names, lon_var_names, time_var_names = get_coordinate_variable_names(
dataset=dataset,
lat_var_names=lat_var_names,
Expand Down Expand Up @@ -1224,28 +1257,18 @@ def subset(file_to_subset: str, bbox: np.ndarray, output_file: str,
lon_var_names=lon_var_names
))
else:
encoding = {}
compression = {"zlib": True, "complevel": 5, "_FillValue": None}

if (min_time or max_time) and not all(
dim_size == 1 for dim_size in dataset.dims.values()):
encoding = {
var_name: {
'units': nc_dataset.variables[var_name].__dict__['units'],
'zlib': True,
"complevel": 5,
"_FillValue": None
} for var_name in time_var_names
if 'units' in nc_dataset.variables[var_name].__dict__
}
for var in dataset.data_vars:
if var not in encoding:
encoding[var] = compression
if dataset[var].dtype == 'S1' and isinstance(dataset[var].attrs.get('_FillValue'), bytes):
dataset[var].attrs['_FillValue'] = dataset[var].attrs['_FillValue'].decode('UTF-8')

var_encoding = {
"zlib": True,
"complevel": 5,
"_FillValue": original_dataset[var].encoding.get('_FillValue')
}

data_var = dataset[var].copy()
data_var.load().to_netcdf(output_file, 'a', encoding={var: encoding.get(var)})
data_var.load().to_netcdf(output_file, 'a', encoding={var: var_encoding})
del data_var

with nc.Dataset(output_file, 'a') as dataset_attr:
Expand Down
18 changes: 18 additions & 0 deletions tests/test_subset.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,24 @@ def test_subset_variables(test_file, data_dir, subset_output_dir, request):
decode_times=False,
decode_coords=False)


nc_in_ds = nc.Dataset(join(data_dir, test_file))
nc_out_ds = nc.Dataset(join(subset_output_dir, output_file))

time_var_name = None
try:
lat_var_name = subset.compute_coordinate_variable_names(in_ds)[0][0]
time_var_name = subset.compute_time_variable_name(in_ds, in_ds[lat_var_name])
except ValueError:
# unable to determine lon lat vars
pass

if time_var_name:
assert nc_in_ds[time_var_name].units == nc_out_ds[time_var_name].units

nc_in_ds.close()
nc_out_ds.close()

for in_var, out_var in zip(in_ds.data_vars.items(), out_ds.data_vars.items()):
# compare names
assert in_var[0] == out_var[0]
Expand Down
Loading