Feature/issue 189 (#190)
* add function that tests opening of granule file, and change fill value encoding

* update changelog, and change exception catching when testing opening of the granule file

* improve on keeping original encoding

* fix pylint

* revert encoding
sliu008 authored Sep 12, 2023
1 parent 4687f98 commit 6b9c957
Showing 3 changed files with 60 additions and 17 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -14,6 +14,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Removed
### Fixed
- [issue/119](https://github.com/podaac/l2ss-py/issues/119): GPM variable dimensions are renamed from "phony_dim" to the dimension names in the variable attribute "DimensionNames"
- [issue/189](https://github.com/podaac/l2ss-py/issues/189): Fix temporal subsetting for SWOT collections; set the `mask_and_scale` argument when opening a granule file whose time fill value overflows during decoding, and write the output file using the original dataset's encoding.

### Security


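For context on the `mask_and_scale` flag this entry refers to: when True, xarray replaces `_FillValue` entries with NaN and applies any `scale_factor`/`add_offset` during decoding, so an oversized time fill value is masked before it can reach the datetime decoder. A minimal illustration, assuming a hypothetical granule.nc rather than a real SWOT product:

    import xarray as xr

    # Raw values, fill values included, come back untouched.
    raw = xr.open_dataset("granule.nc", decode_times=False, mask_and_scale=False)

    # Fill values are masked to NaN (and scale_factor/add_offset applied)
    # before time decoding runs, avoiding the overflow described above.
    decoded = xr.open_dataset("granule.nc", decode_times=True, mask_and_scale=True)
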
57 changes: 40 additions & 17 deletions podaac/subsetter/subset.py
@@ -25,6 +25,7 @@
import os
from itertools import zip_longest
from typing import List, Tuple, Union
import traceback
import dateutil
from dateutil import parser

@@ -1065,6 +1066,33 @@ def decode_cf_datetime(num_dates, units, calendar=None, use_cftime=None):
xarray.coding.times.decode_cf_datetime = decode_cf_datetime


def open_dataset_test(file, args):
    """
    Open a NetCDF dataset using xarray, handling a specific failure mode.

    This function attempts to open a NetCDF dataset with the provided
    arguments. If the traceback shows an OverflowError raised while
    decoding the time variable's units, it sets the 'mask_and_scale'
    argument to True so the caller's subsequent open succeeds.

    Args:
        file (str): Path to the NetCDF file.
        args (dict): Dictionary of arguments to pass to xr.open_dataset.

    Returns:
        None: The function modifies the 'args' dictionary in place.
    """
    try:
        test_xr_open = xr.open_dataset(file, **args)
        test_xr_open.close()
    except Exception:  # pylint: disable=broad-except
        traceback_str = traceback.format_exc()

        # Check for the specific OverflowError message
        if "Python int too large to convert to C long" in traceback_str and "Failed to decode variable 'time': unable to decode time units" in traceback_str:
            args["mask_and_scale"] = True

def subset(file_to_subset: str, bbox: np.ndarray, output_file: str,
variables: Union[List[str], str, None] = (),
# pylint: disable=too-many-branches, disable=too-many-statements
@@ -1162,10 +1190,15 @@ def subset(file_to_subset: str, bbox: np.ndarray, output_file: str,

if min_time or max_time:
args['decode_times'] = True
open_dataset_test(file_to_subset, args)

with xr.open_dataset(
xr.backends.NetCDF4DataStore(nc_dataset),
**args
) as dataset:

original_dataset = dataset

lat_var_names, lon_var_names, time_var_names = get_coordinate_variable_names(
dataset=dataset,
lat_var_names=lat_var_names,
@@ -1224,28 +1257,18 @@ def subset(file_to_subset: str, bbox: np.ndarray, output_file: str,
             lon_var_names=lon_var_names
         ))
     else:
-        encoding = {}
-        compression = {"zlib": True, "complevel": 5, "_FillValue": None}
-
-        if (min_time or max_time) and not all(
-                dim_size == 1 for dim_size in dataset.dims.values()):
-            encoding = {
-                var_name: {
-                    'units': nc_dataset.variables[var_name].__dict__['units'],
-                    'zlib': True,
-                    "complevel": 5,
-                    "_FillValue": None
-                } for var_name in time_var_names
-                if 'units' in nc_dataset.variables[var_name].__dict__
-            }
         for var in dataset.data_vars:
-            if var not in encoding:
-                encoding[var] = compression
             if dataset[var].dtype == 'S1' and isinstance(dataset[var].attrs.get('_FillValue'), bytes):
                 dataset[var].attrs['_FillValue'] = dataset[var].attrs['_FillValue'].decode('UTF-8')
 
+            var_encoding = {
+                "zlib": True,
+                "complevel": 5,
+                "_FillValue": original_dataset[var].encoding.get('_FillValue')
+            }
+
             data_var = dataset[var].copy()
-            data_var.load().to_netcdf(output_file, 'a', encoding={var: encoding.get(var)})
+            data_var.load().to_netcdf(output_file, 'a', encoding={var: var_encoding})
             del data_var
 
         with nc.Dataset(output_file, 'a') as dataset_attr:
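Background on the original_dataset[var].encoding.get('_FillValue') lookup above: when xarray reads a file, per-variable on-disk details such as _FillValue, dtype, and compression settings land in the variable's .encoding rather than .attrs, so reusing that value on write preserves the input file's fill value instead of forcing _FillValue to None. A minimal sketch, assuming a hypothetical input.nc with a variable named sst:

    import xarray as xr

    with xr.open_dataset("input.nc") as ds:
        original_fill = ds["sst"].encoding.get("_FillValue")  # on-disk fill value, if any

        # Re-emit the variable with its original fill value plus compression,
        # mirroring the var_encoding dict in the diff above.
        ds["sst"].to_netcdf(
            "output.nc", mode="w",
            encoding={"sst": {"zlib": True, "complevel": 5, "_FillValue": original_fill}},
        )
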
18 changes: 18 additions & 0 deletions tests/test_subset.py
@@ -158,6 +158,24 @@ def test_subset_variables(test_file, data_dir, subset_output_dir, request):
decode_times=False,
decode_coords=False)


nc_in_ds = nc.Dataset(join(data_dir, test_file))
nc_out_ds = nc.Dataset(join(subset_output_dir, output_file))

time_var_name = None
try:
lat_var_name = subset.compute_coordinate_variable_names(in_ds)[0][0]
time_var_name = subset.compute_time_variable_name(in_ds, in_ds[lat_var_name])
except ValueError:
# unable to determine lon lat vars
pass

if time_var_name:
assert nc_in_ds[time_var_name].units == nc_out_ds[time_var_name].units

nc_in_ds.close()
nc_out_ds.close()

for in_var, out_var in zip(in_ds.data_vars.items(), out_ds.data_vars.items()):
# compare names
assert in_var[0] == out_var[0]
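A likely reason the new assertion reads units through netCDF4 rather than xarray: with decode_times=True, xarray moves a time variable's on-disk 'units' attribute into .encoding, so the raw attribute is easiest to compare via netCDF4. A short illustration, assuming a hypothetical granule.nc:

    import netCDF4 as nc
    import xarray as xr

    with xr.open_dataset("granule.nc", decode_times=True) as ds:
        ds["time"].attrs.get("units")     # None: moved out of attrs on decode
        ds["time"].encoding.get("units")  # e.g. "seconds since 2000-01-01"

    with nc.Dataset("granule.nc") as raw:
        raw["time"].units                 # the raw on-disk attribute
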

1 comment on commit 6b9c957

@jjmcnelis
Member


Thanks so much, Simon!
