From eef13d8976030871109300cb2479219ee23f3be3 Mon Sep 17 00:00:00 2001
From: Simon Liu
Date: Thu, 31 Aug 2023 09:15:14 -0700
Subject: [PATCH 1/7] add function that tests opening of granule file, and
 change fill value encoding

---
 podaac/subsetter/subset.py | 39 +++++++++++++++++++++++++++++++++++---
 1 file changed, 36 insertions(+), 3 deletions(-)

diff --git a/podaac/subsetter/subset.py b/podaac/subsetter/subset.py
index fc4c4459..52eee246 100644
--- a/podaac/subsetter/subset.py
+++ b/podaac/subsetter/subset.py
@@ -25,6 +25,7 @@ import os
 from itertools import zip_longest
 from typing import List, Tuple, Union
+import traceback
 
 import dateutil
 from dateutil import parser
@@ -1065,6 +1066,33 @@ def decode_cf_datetime(num_dates, units, calendar=None, use_cftime=None):
     xarray.coding.times.decode_cf_datetime = decode_cf_datetime
 
 
+def open_dataset_test(file, args):
+    """
+    Open a NetCDF dataset using xarray, handling specific exceptions.
+
+    This function attempts to open a NetCDF dataset using the provided arguments.
+    If an OverflowError with a specific message is encountered, it sets the
+    'mask_and_scale' argument to True so the caller can reopen the dataset.
+
+    Args:
+        file (str): Path to the NetCDF file.
+        args (dict): Dictionary of arguments to pass to xr.open_dataset.
+
+    Returns:
+        None: The function modifies the 'args' dictionary in place.
+
+    """
+    try:
+        test_xr_open = xr.open_dataset(file, **args)
+        test_xr_open.close()
+    except ValueError:
+        traceback_str = traceback.format_exc()
+
+        # Check for the specific OverflowError message
+        if "Python int too large to convert to C long" in traceback_str and "Failed to decode variable 'time': unable to decode time units" in traceback_str:
+            args["mask_and_scale"] = True
+
+
 def subset(file_to_subset: str, bbox: np.ndarray, output_file: str,
            variables: Union[List[str], str, None] = (),
            # pylint: disable=too-many-branches, disable=too-many-statements
@@ -1162,10 +1190,15 @@ def subset(file_to_subset: str, bbox: np.ndarray, output_file: str,
     if min_time or max_time:
         args['decode_times'] = True
 
+    open_dataset_test(file_to_subset, args)
+
     with xr.open_dataset(
             xr.backends.NetCDF4DataStore(nc_dataset),
             **args
     ) as dataset:
+
+        original_dataset = dataset
+
         lat_var_names, lon_var_names, time_var_names = get_coordinate_variable_names(
             dataset=dataset,
             lat_var_names=lat_var_names,
@@ -1225,8 +1258,7 @@ def subset(file_to_subset: str, bbox: np.ndarray, output_file: str,
             ))
         else:
             encoding = {}
-            compression = {"zlib": True, "complevel": 5, "_FillValue": None}
-
+            compression = {"zlib": True, "complevel": 5}
             if (min_time or max_time) and not all(
                     dim_size == 1 for dim_size in dataset.dims.values()):
                 encoding = {
@@ -1234,13 +1266,14 @@ def subset(file_to_subset: str, bbox: np.ndarray, output_file: str,
                         'units': nc_dataset.variables[var_name].__dict__['units'],
                         'zlib': True,
                         "complevel": 5,
-                        "_FillValue": None
+                        "_FillValue": original_dataset[var_name].encoding.get('_FillValue')
                     } for var_name in time_var_names
                     if 'units' in nc_dataset.variables[var_name].__dict__
                 }
             for var in dataset.data_vars:
                 if var not in encoding:
                     encoding[var] = compression
+                    encoding[var]['_FillValue'] = original_dataset[var].encoding.get('_FillValue')
                 if dataset[var].dtype == 'S1' and isinstance(dataset[var].attrs.get('_FillValue'), bytes):
                     dataset[var].attrs['_FillValue'] = dataset[var].attrs['_FillValue'].decode('UTF-8')
 
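Note: patch 1's probe-then-open flow depends on open_dataset_test mutating args in place
before the real xr.open_dataset call. A minimal sketch of that flow, not part of the patch
itself; the granule path is hypothetical and the argument defaults are assumed from subset():

    import xarray as xr

    from podaac.subsetter.subset import open_dataset_test

    args = {
        'decode_coords': False,
        'mask_and_scale': False,  # assumed starting value; the probe may flip this to True
        'decode_times': True,     # as subset() does when a temporal range is requested
    }

    # First open is a dry run: on the known time-decoding overflow, open_dataset_test
    # sets args['mask_and_scale'] = True instead of propagating the error.
    open_dataset_test('granule.nc', args)

    # The real open then runs with the (possibly adjusted) arguments.
    with xr.open_dataset('granule.nc', **args) as dataset:
        print(list(dataset.variables))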
From c8875089f305253eddb16ae07b982e83db0b39bc Mon Sep 17 00:00:00 2001
From: Simon Liu
Date: Thu, 31 Aug 2023 09:55:32 -0700
Subject: [PATCH 2/7] update changelog, and change exception catching when
 testing opening of granule file

---
 CHANGELOG.md               | 2 ++
 podaac/subsetter/subset.py | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2c7c881c..5178a13c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -13,6 +13,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### Removed
 ### Fixed
 - [issue/119](https://github.com/podaac/l2ss-py/issues/119): GPM variable dimensions are renamed from "phony_dim" to the dimension names in the variable attribute "DimensionNames"
+- [issue/189](https://github.com/podaac/l2ss-py/issues/189): Fix temporal subsetting for SWOT collections: use the mask_and_scale argument when opening the granule file if the time fill value would overflow, and use the original dataset encoding when writing the output file.
+
 
 ### Security
diff --git a/podaac/subsetter/subset.py b/podaac/subsetter/subset.py
index 52eee246..4495e09a 100644
--- a/podaac/subsetter/subset.py
+++ b/podaac/subsetter/subset.py
@@ -1085,7 +1085,7 @@ def open_dataset_test(file, args):
     try:
         test_xr_open = xr.open_dataset(file, **args)
         test_xr_open.close()
-    except ValueError:
+    except Exception:  # pylint: disable=broad-except
         traceback_str = traceback.format_exc()
 
         # Check for the specific OverflowError message
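Note: patch 2 broadens the catch, suggesting the failure does not always surface as a
ValueError; the probe therefore inspects the formatted traceback text rather than the
exception type. A self-contained illustration of that pattern, assuming xarray chains the
underlying OverflowError (the wrapper function here is made up; the messages are the ones
the patch greps for):

    import traceback

    def fail_like_xarray():
        try:
            raise OverflowError("Python int too large to convert to C long")
        except OverflowError as err:
            raise ValueError(
                "Failed to decode variable 'time': unable to decode time units"
            ) from err

    try:
        fail_like_xarray()
    except Exception:  # broad catch, mirroring the patch
        traceback_str = traceback.format_exc()
        # format_exc() includes chained causes, so both messages are searchable
        if "Python int too large to convert to C long" in traceback_str:
            print("detected the time-decoding overflow")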
From 9ec4a06793432782fc6f944a179d0081e4ceb7c0 Mon Sep 17 00:00:00 2001
From: Simon Liu
Date: Fri, 1 Sep 2023 11:54:19 -0700
Subject: [PATCH 3/7] improve on keeping original encoding

---
 podaac/subsetter/subset.py | 34 +++++++++++++++++-----------------
 tests/test_subset.py       | 18 ++++++++++++++++++
 2 files changed, 35 insertions(+), 17 deletions(-)

diff --git a/podaac/subsetter/subset.py b/podaac/subsetter/subset.py
index 4495e09a..3cf34984 100644
--- a/podaac/subsetter/subset.py
+++ b/podaac/subsetter/subset.py
@@ -1257,28 +1257,28 @@ def subset(file_to_subset: str, bbox: np.ndarray, output_file: str,
             lon_var_names=lon_var_names
             ))
         else:
-            encoding = {}
-            compression = {"zlib": True, "complevel": 5}
-            if (min_time or max_time) and not all(
-                    dim_size == 1 for dim_size in dataset.dims.values()):
-                encoding = {
-                    var_name: {
-                        'units': nc_dataset.variables[var_name].__dict__['units'],
-                        'zlib': True,
-                        "complevel": 5,
-                        "_FillValue": original_dataset[var_name].encoding.get('_FillValue')
-                    } for var_name in time_var_names
-                    if 'units' in nc_dataset.variables[var_name].__dict__
-                }
             for var in dataset.data_vars:
-                if var not in encoding:
-                    encoding[var] = compression
-                    encoding[var]['_FillValue'] = original_dataset[var].encoding.get('_FillValue')
                 if dataset[var].dtype == 'S1' and isinstance(dataset[var].attrs.get('_FillValue'), bytes):
                     dataset[var].attrs['_FillValue'] = dataset[var].attrs['_FillValue'].decode('UTF-8')
 
+                # Preserve original encoding as much as possible
+                valid_encodings = [
+                    "fletcher32",
+                    "contiguous",
+                    "shuffle",
+                    "compression"
+                ]
+
+                var_encoding = {
+                    "zlib": True, 
+                    "complevel": 5,
+                    "_FillValue": original_dataset[var].encoding.get('_FillValue')
+                }
+
+                original_encoding = {key: value for key, value in original_dataset[var].encoding.items() if key in valid_encodings}
                 data_var = dataset[var].copy()
-                data_var.load().to_netcdf(output_file, 'a', encoding={var: encoding.get(var)})
+                var_encoding.update(original_encoding)
+                data_var.load().to_netcdf(output_file, 'a', encoding={var: var_encoding})
                 del data_var
 
         with nc.Dataset(output_file, 'a') as dataset_attr:
diff --git a/tests/test_subset.py b/tests/test_subset.py
index 66c508b0..f318cfdc 100644
--- a/tests/test_subset.py
+++ b/tests/test_subset.py
@@ -158,6 +158,24 @@ def test_subset_variables(test_file, data_dir, subset_output_dir, request):
                              decode_times=False,
                              decode_coords=False)
 
+
+    nc_in_ds = nc.Dataset(join(data_dir, test_file))
+    nc_out_ds = nc.Dataset(join(subset_output_dir, output_file))
+
+    time_var_name = None
+    try:
+        lat_var_name = subset.compute_coordinate_variable_names(in_ds)[0][0]
+        time_var_name = subset.compute_time_variable_name(in_ds, in_ds[lat_var_name])
+    except ValueError:
+        # unable to determine lon lat vars
+        pass
+
+    if time_var_name:
+        assert nc_in_ds[time_var_name].units == nc_out_ds[time_var_name].units
+
+    nc_in_ds.close()
+    nc_out_ds.close()
+
     for in_var, out_var in zip(in_ds.data_vars.items(), out_ds.data_vars.items()):
         # compare names
         assert in_var[0] == out_var[0]

From 8fa3452f88fc0ba916595d49d0a7b07714390dde Mon Sep 17 00:00:00 2001
From: Simon Liu
Date: Fri, 1 Sep 2023 12:13:59 -0700
Subject: [PATCH 4/7] fix pylint

---
 podaac/subsetter/subset.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/podaac/subsetter/subset.py b/podaac/subsetter/subset.py
index 3cf34984..55add46f 100644
--- a/podaac/subsetter/subset.py
+++ b/podaac/subsetter/subset.py
@@ -1270,7 +1270,7 @@ def subset(file_to_subset: str, bbox: np.ndarray, output_file: str,
                 ]
 
                 var_encoding = {
-                    "zlib": True, 
+                    "zlib": True,
                     "complevel": 5,
                     "_FillValue": original_dataset[var].encoding.get('_FillValue')
                 }

From 562a538fd9b6faac6cf5a1d721211dd059c30b8c Mon Sep 17 00:00:00 2001
From: Simon Liu
Date: Tue, 5 Sep 2023 09:34:00 -0700
Subject: [PATCH 5/7] revert encoding

---
 podaac/subsetter/subset.py | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/podaac/subsetter/subset.py b/podaac/subsetter/subset.py
index 55add46f..9c82a947 100644
--- a/podaac/subsetter/subset.py
+++ b/podaac/subsetter/subset.py
@@ -1261,23 +1261,13 @@ def subset(file_to_subset: str, bbox: np.ndarray, output_file: str,
                 if dataset[var].dtype == 'S1' and isinstance(dataset[var].attrs.get('_FillValue'), bytes):
                     dataset[var].attrs['_FillValue'] = dataset[var].attrs['_FillValue'].decode('UTF-8')
 
-                # Preserve original encoding as much as possible
-                valid_encodings = [
-                    "fletcher32",
-                    "contiguous",
-                    "shuffle",
-                    "compression"
-                ]
-
                 var_encoding = {
                     "zlib": True,
                     "complevel": 5,
                     "_FillValue": original_dataset[var].encoding.get('_FillValue')
                 }
 
-                original_encoding = {key: value for key, value in original_dataset[var].encoding.items() if key in valid_encodings}
                 data_var = dataset[var].copy()
-                var_encoding.update(original_encoding)
                 data_var.load().to_netcdf(output_file, 'a', encoding={var: var_encoding})
                 del data_var
 
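Note: patches 3-5 converge on pulling the fill value from original_dataset[var].encoding
rather than hard-coding _FillValue: None. This works because xarray moves a decoded
variable's on-disk _FillValue from .attrs into .encoding when a file is opened. A
self-contained demonstration of that behavior; the file name and variable are made up:

    import numpy as np
    import xarray as xr

    ds = xr.Dataset({"sst": ("x", np.array([1.0, np.nan, 3.0]))})
    ds["sst"].encoding["_FillValue"] = -9999.0
    ds.to_netcdf("demo.nc")

    with xr.open_dataset("demo.nc") as reopened:
        # the on-disk fill value is recorded in .encoding, not .attrs
        print(reopened["sst"].encoding.get("_FillValue"))  # -9999.0

Passing that recovered value back through encoding={var: {"_FillValue": ...}} on write is
what lets the subsetted output round-trip the source granule's fill value.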
From 2eccd8feba92274f78ba9db571751c92fe323a56 Mon Sep 17 00:00:00 2001
From: Simon Liu
Date: Thu, 14 Sep 2023 15:13:13 -0700
Subject: [PATCH 6/7] update how we check for time overflow integer

---
 podaac/subsetter/subset.py | 32 +++-----------------------------
 1 file changed, 3 insertions(+), 29 deletions(-)

diff --git a/podaac/subsetter/subset.py b/podaac/subsetter/subset.py
index 9c82a947..df3167de 100644
--- a/podaac/subsetter/subset.py
+++ b/podaac/subsetter/subset.py
@@ -25,7 +25,6 @@ import os
 from itertools import zip_longest
 from typing import List, Tuple, Union
-import traceback
 
 import dateutil
 from dateutil import parser
@@ -1066,33 +1065,6 @@ def decode_cf_datetime(num_dates, units, calendar=None, use_cftime=None):
     xarray.coding.times.decode_cf_datetime = decode_cf_datetime
 
 
-def open_dataset_test(file, args):
-    """
-    Open a NetCDF dataset using xarray, handling specific exceptions.
-
-    This function attempts to open a NetCDF dataset using the provided arguments.
-    If an OverflowError with a specific message is encountered, it sets the
-    'mask_and_scale' argument to True so the caller can reopen the dataset.
-
-    Args:
-        file (str): Path to the NetCDF file.
-        args (dict): Dictionary of arguments to pass to xr.open_dataset.
-
-    Returns:
-        None: The function modifies the 'args' dictionary in place.
-
-    """
-    try:
-        test_xr_open = xr.open_dataset(file, **args)
-        test_xr_open.close()
-    except Exception:  # pylint: disable=broad-except
-        traceback_str = traceback.format_exc()
-
-        # Check for the specific OverflowError message
-        if "Python int too large to convert to C long" in traceback_str and "Failed to decode variable 'time': unable to decode time units" in traceback_str:
-            args["mask_and_scale"] = True
-
-
 def subset(file_to_subset: str, bbox: np.ndarray, output_file: str,
            variables: Union[List[str], str, None] = (),
            # pylint: disable=too-many-branches, disable=too-many-statements
@@ -1190,7 +1162,9 @@ def subset(file_to_subset: str, bbox: np.ndarray, output_file: str,
     if min_time or max_time:
         args['decode_times'] = True
 
-    open_dataset_test(file_to_subset, args)
+    # check fill value and dtype; we know that this will cause an integer overflow with xarray
+    if nc_dataset['time'].getncattr('_FillValue') == nc.default_fillvals.get('f8') and nc_dataset['time'].dtype == 'float64':
+        args['mask_and_scale'] = True
 
     with xr.open_dataset(
             xr.backends.NetCDF4DataStore(nc_dataset),

From cf35bbee72301c82a332193a60fe1329e90b0d77 Mon Sep 17 00:00:00 2001
From: Simon Liu
Date: Thu, 14 Sep 2023 16:04:59 -0700
Subject: [PATCH 7/7] update trying to get attributes

---
 podaac/subsetter/subset.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/podaac/subsetter/subset.py b/podaac/subsetter/subset.py
index df3167de..70c2ba24 100644
--- a/podaac/subsetter/subset.py
+++ b/podaac/subsetter/subset.py
@@ -1163,8 +1163,13 @@ def subset(file_to_subset: str, bbox: np.ndarray, output_file: str,
     if min_time or max_time:
         args['decode_times'] = True
 
     # check fill value and dtype; we know that this will cause an integer overflow with xarray
-    if nc_dataset['time'].getncattr('_FillValue') == nc.default_fillvals.get('f8') and nc_dataset['time'].dtype == 'float64':
-        args['mask_and_scale'] = True
+    if 'time' in nc_dataset.variables.keys():
+        try:
+            if nc_dataset['time'].getncattr('_FillValue') == nc.default_fillvals.get('f8') and \
+                    nc_dataset['time'].dtype == 'float64':
+                args['mask_and_scale'] = True
+        except AttributeError:
+            pass
 
     with xr.open_dataset(
             xr.backends.NetCDF4DataStore(nc_dataset),
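Note: the final check in patches 6-7 keys off a specific combination: a float64 time
variable whose _FillValue equals netCDF4's default f8 fill value (about 9.97e36), which
cannot be represented as a datetime and so breaks time decoding unless the value is masked
first. A sketch reproducing that failure on a synthetic file; the file and values are
illustrative, and the exact exception type can vary with the xarray version:

    import netCDF4 as nc
    import xarray as xr

    with nc.Dataset("time_demo.nc", "w") as ds:
        ds.createDimension("time", 2)
        time_var = ds.createVariable("time", "f8", ("time",),
                                     fill_value=nc.default_fillvals["f8"])
        time_var.units = "seconds since 2000-01-01"
        time_var[0] = 0.0  # the second element stays at the ~9.97e36 fill value

    try:
        xr.open_dataset("time_demo.nc", decode_times=True, mask_and_scale=False)
    except Exception as err:
        print("decoding failed:", type(err).__name__)

    # Masking first turns the fill value into NaT, so decoding succeeds.
    with xr.open_dataset("time_demo.nc", decode_times=True, mask_and_scale=True) as ok:
        print(ok["time"].values)

Checking the _FillValue and dtype directly on the netCDF4 variable, with the AttributeError
guard for granules whose time variable has no _FillValue at all, replaces the earlier
open-and-catch probe with a cheaper test for the same condition.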