Feature/issue 189 #190
Changes from 2 commits
```diff
@@ -25,6 +25,7 @@
 import os
 from itertools import zip_longest
 from typing import List, Tuple, Union
+import traceback
 import dateutil
 from dateutil import parser
```
```diff
@@ -1065,6 +1066,33 @@ def decode_cf_datetime(num_dates, units, calendar=None, use_cftime=None):
 xarray.coding.times.decode_cf_datetime = decode_cf_datetime


+def open_dataset_test(file, args):
+    """
+    Open a NetCDF dataset using xarray, handling specific exceptions.
+
+    This function attempts to open a NetCDF dataset using the provided arguments.
+    If an OverflowError with a specific message is encountered, it modifies the
+    'mask_and_scale' argument to True so the caller can retry opening the dataset.
+
+    Args:
+        file (str): Path to the NetCDF file.
+        args (dict): Dictionary of arguments to pass to xr.open_dataset.
+
+    Returns:
+        None: The function modifies the 'args' dictionary in place.
+
+    """
+    try:
+        test_xr_open = xr.open_dataset(file, **args)
+        test_xr_open.close()
+    except Exception:  # pylint: disable=broad-except
+        traceback_str = traceback.format_exc()
+
+        # Check for the specific OverflowError message
+        if "Python int too large to convert to C long" in traceback_str and "Failed to decode variable 'time': unable to decode time units" in traceback_str:
+            args["mask_and_scale"] = True


 def subset(file_to_subset: str, bbox: np.ndarray, output_file: str,
            variables: Union[List[str], str, None] = (),
            # pylint: disable=too-many-branches, disable=too-many-statements
```
Review comment on `args["mask_and_scale"] = True`:

Reviewer: By setting mask_and_scale to True, the resulting granule will have no scale/offset attributes, because those will be applied to the data variables. Ideally we'd maintain the old behavior (where the subsetted granule has the same scale/offset attributes) -- would such a thing be possible? The hard way to do this would be to re-apply the scale/offset to each data variable, but maybe there's an easier way.

Author: That is correct. I'm not sure if we can undo this; I haven't found any other way to open the file using xarray without the mask_and_scale option.
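For context on the exchange above: when xarray decodes with mask_and_scale=True, it moves scale_factor and add_offset from the variable's attributes into its .encoding, and to_netcdf re-applies them on write. Whether that covers this PR's exact write path is unverified; the sketch below only illustrates the round-trip behavior, with a hypothetical file and variable name.

```python
import xarray as xr

# Decoding with mask_and_scale=True unpacks the data and moves
# scale_factor/add_offset from .attrs into .encoding.
ds = xr.open_dataset("granule.nc", mask_and_scale=True)  # hypothetical file
var = ds["analysed_sst"]  # hypothetical variable name
print(var.encoding.get("scale_factor"), var.encoding.get("add_offset"))

# Because the packing parameters survive in .encoding, to_netcdf
# re-packs the values and writes the scale/offset attributes back out.
ds.to_netcdf("subsetted.nc")
```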
```diff
@@ -1162,10 +1190,15 @@ def subset(file_to_subset: str, bbox: np.ndarray, output_file: str,

     if min_time or max_time:
         args['decode_times'] = True
+        open_dataset_test(file_to_subset, args)
+
     with xr.open_dataset(
         xr.backends.NetCDF4DataStore(nc_dataset),
         **args
     ) as dataset:
+
+        original_dataset = dataset
+
         lat_var_names, lon_var_names, time_var_names = get_coordinate_variable_names(
             dataset=dataset,
             lat_var_names=lat_var_names,
```
```diff
@@ -1225,22 +1258,22 @@ def subset(file_to_subset: str, bbox: np.ndarray, output_file: str,
             ))
     else:
         encoding = {}
-        compression = {"zlib": True, "complevel": 5, "_FillValue": None}
-
+        compression = {"zlib": True, "complevel": 5}
         if (min_time or max_time) and not all(
                 dim_size == 1 for dim_size in dataset.dims.values()):
             encoding = {
                 var_name: {
                     'units': nc_dataset.variables[var_name].__dict__['units'],
                     'zlib': True,
                     "complevel": 5,
-                    "_FillValue": None
+                    "_FillValue": original_dataset[var_name].encoding.get('_FillValue')
                 } for var_name in time_var_names
                 if 'units' in nc_dataset.variables[var_name].__dict__
             }
         for var in dataset.data_vars:
             if var not in encoding:
                 encoding[var] = compression
+                encoding[var]['_FillValue'] = original_dataset[var].encoding.get('_FillValue')
             if dataset[var].dtype == 'S1' and isinstance(dataset[var].attrs.get('_FillValue'), bytes):
                 dataset[var].attrs['_FillValue'] = dataset[var].attrs['_FillValue'].decode('UTF-8')
```
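The pattern in this hunk carries each variable's original _FillValue from the decoded dataset's .encoding into the write-time encoding dict, instead of forcing it to None. A standalone sketch of that pattern (file names hypothetical); note the sketch takes a per-variable copy of the shared compression dict, so assigning _FillValue cannot bleed into other variables:

```python
import xarray as xr

ds = xr.open_dataset("granule.nc")  # hypothetical file

compression = {"zlib": True, "complevel": 5}
encoding = {}
for var in ds.data_vars:
    # Copy per variable: mutating a single shared dict would give
    # every variable the _FillValue of whichever one was set last.
    encoding[var] = dict(compression)
    # Preserve the fill value the variable had in the source file.
    encoding[var]["_FillValue"] = ds[var].encoding.get("_FillValue")

ds.to_netcdf("subsetted.nc", encoding=encoding)
```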
Review comment on the traceback check in `open_dataset_test`:

Reviewer: Hardcoded 'time' variable name.

Author: My intention was to catch the "Python int too large to convert to C long" error only for the time variable, although I can change it to catch it for any variable.
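If the check were widened to any variable, one option (a sketch, not part of the PR; the function name open_dataset_probe is hypothetical) is to match the variable name in the traceback with a regex instead of hardcoding 'time':

```python
import re
import traceback

import xarray as xr


def open_dataset_probe(file, args):
    """Variant of open_dataset_test that matches any variable name."""
    try:
        xr.open_dataset(file, **args).close()
    except Exception:  # pylint: disable=broad-except
        tb = traceback.format_exc()
        # Accept any variable name between the quotes, not just 'time'.
        decode_failure = re.search(
            r"Failed to decode variable '[^']+': unable to decode time units", tb
        )
        if "Python int too large to convert to C long" in tb and decode_failure:
            args["mask_and_scale"] = True
```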