From eef13d8976030871109300cb2479219ee23f3be3 Mon Sep 17 00:00:00 2001
From: Simon Liu
Date: Thu, 31 Aug 2023 09:15:14 -0700
Subject: [PATCH 1/7] add function that tests opening of granule file, and
 change fill value encoding

---
 podaac/subsetter/subset.py | 39 +++++++++++++++++++++++++++++++++++---
 1 file changed, 36 insertions(+), 3 deletions(-)

diff --git a/podaac/subsetter/subset.py b/podaac/subsetter/subset.py
index fc4c4459..52eee246 100644
--- a/podaac/subsetter/subset.py
+++ b/podaac/subsetter/subset.py
@@ -25,6 +25,7 @@ import os
 from itertools import zip_longest
 from typing import List, Tuple, Union
+import traceback
 
 import dateutil
 from dateutil import parser
@@ -1065,6 +1066,33 @@ def decode_cf_datetime(num_dates, units, calendar=None, use_cftime=None):
     xarray.coding.times.decode_cf_datetime = decode_cf_datetime
 
 
+def open_dataset_test(file, args):
+    """
+    Open a NetCDF dataset using xarray, handling specific exceptions.
+
+    This function attempts to open a NetCDF dataset using the provided arguments.
+    If an OverflowError with a specific message is encountered, it sets the
+    'mask_and_scale' argument to True so the caller can reopen the dataset.
+
+    Args:
+        file (str): Path to the NetCDF file.
+        args (dict): Dictionary of arguments to pass to xr.open_dataset.
+
+    Returns:
+        None: The function modifies the 'args' dictionary in place.
+
+    """
+    try:
+        test_xr_open = xr.open_dataset(file, **args)
+        test_xr_open.close()
+    except ValueError:
+        traceback_str = traceback.format_exc()
+
+        # Check for the specific OverflowError message
+        if "Python int too large to convert to C long" in traceback_str and "Failed to decode variable 'time': unable to decode time units" in traceback_str:
+            args["mask_and_scale"] = True
+
+
 def subset(file_to_subset: str, bbox: np.ndarray, output_file: str,
            variables: Union[List[str], str, None] = (),
            # pylint: disable=too-many-branches, disable=too-many-statements
@@ -1162,10 +1190,15 @@ def subset(file_to_subset: str, bbox: np.ndarray, output_file: str,
     if min_time or max_time:
         args['decode_times'] = True
 
+    open_dataset_test(file_to_subset, args)
+
     with xr.open_dataset(
             xr.backends.NetCDF4DataStore(nc_dataset),
             **args
     ) as dataset:
+
+        original_dataset = dataset
+
         lat_var_names, lon_var_names, time_var_names = get_coordinate_variable_names(
             dataset=dataset,
             lat_var_names=lat_var_names,
@@ -1225,8 +1258,7 @@ def subset(file_to_subset: str, bbox: np.ndarray, output_file: str,
             ))
         else:
             encoding = {}
-            compression = {"zlib": True, "complevel": 5, "_FillValue": None}
-
+            compression = {"zlib": True, "complevel": 5}
             if (min_time or max_time) and not all(
                     dim_size == 1 for dim_size in dataset.dims.values()):
                 encoding = {
@@ -1234,13 +1266,14 @@ def subset(file_to_subset: str, bbox: np.ndarray, output_file: str,
                         'units': nc_dataset.variables[var_name].__dict__['units'],
                         'zlib': True,
                         "complevel": 5,
-                        "_FillValue": None
+                        "_FillValue": original_dataset[var_name].encoding.get('_FillValue')
                     } for var_name in time_var_names
                     if 'units' in nc_dataset.variables[var_name].__dict__
                 }
             for var in dataset.data_vars:
                 if var not in encoding:
                     encoding[var] = compression
+                    encoding[var]['_FillValue'] = original_dataset[var].encoding.get('_FillValue')
                 if dataset[var].dtype == 'S1' and isinstance(dataset[var].attrs.get('_FillValue'), bytes):
                     dataset[var].attrs['_FillValue'] = dataset[var].attrs['_FillValue'].decode('UTF-8')
 
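Note: patch 1's probe-then-open flow depends on open_dataset_test mutating args in place
before the real xr.open_dataset call. A minimal sketch of that flow, not part of the patch
itself; the granule path is hypothetical and the argument defaults are assumed from subset():

    import xarray as xr

    from podaac.subsetter.subset import open_dataset_test

    args = {
        'decode_coords': False,
        'mask_and_scale': False,  # assumed starting value; the probe may flip this to True
        'decode_times': True,     # as subset() does when a temporal range is requested
    }

    # First open is a dry run: on the known time-decoding overflow, open_dataset_test
    # sets args['mask_and_scale'] = True instead of propagating the error.
    open_dataset_test('granule.nc', args)

    # The real open then runs with the (possibly adjusted) arguments.
    with xr.open_dataset('granule.nc', **args) as dataset:
        print(list(dataset.variables))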
From c8875089f305253eddb16ae07b982e83db0b39bc Mon Sep 17 00:00:00 2001
From: Simon Liu
Date: Thu, 31 Aug 2023 09:55:32 -0700
Subject: [PATCH 2/7] update changelog, and change exception catching when
 testing opening of granule file

---
 CHANGELOG.md               | 2 ++
 podaac/subsetter/subset.py | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2c7c881c..5178a13c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -13,6 +13,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### Removed
 ### Fixed
 - [issue/119](https://github.com/podaac/l2ss-py/issues/119): GPM variable dimensions are renamed from "phony_dim" to the dimension names in the variable attribute "DimensionNames"
+- [issue/189](https://github.com/podaac/l2ss-py/issues/189): Fix temporal subsetting for SWOT collections: use the mask_and_scale argument when opening the granule file if the time fill value would overflow, and use the original dataset encoding when writing the output file.
+
 
 ### Security
diff --git a/podaac/subsetter/subset.py b/podaac/subsetter/subset.py
index 52eee246..4495e09a 100644
--- a/podaac/subsetter/subset.py
+++ b/podaac/subsetter/subset.py
@@ -1085,7 +1085,7 @@ def open_dataset_test(file, args):
     try:
         test_xr_open = xr.open_dataset(file, **args)
         test_xr_open.close()
-    except ValueError:
+    except Exception:  # pylint: disable=broad-except
         traceback_str = traceback.format_exc()
 
         # Check for the specific OverflowError message
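Note: patch 2 broadens the catch, suggesting the failure does not always surface as a
ValueError; the probe therefore inspects the formatted traceback text rather than the
exception type. A self-contained illustration of that pattern, assuming xarray chains the
underlying OverflowError (the wrapper function here is made up; the messages are the ones
the patch greps for):

    import traceback

    def fail_like_xarray():
        try:
            raise OverflowError("Python int too large to convert to C long")
        except OverflowError as err:
            raise ValueError(
                "Failed to decode variable 'time': unable to decode time units"
            ) from err

    try:
        fail_like_xarray()
    except Exception:  # broad catch, mirroring the patch
        traceback_str = traceback.format_exc()
        # format_exc() includes chained causes, so both messages are searchable
        if "Python int too large to convert to C long" in traceback_str:
            print("detected the time-decoding overflow")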
From 9ec4a06793432782fc6f944a179d0081e4ceb7c0 Mon Sep 17 00:00:00 2001
From: Simon Liu
Date: Fri, 1 Sep 2023 11:54:19 -0700
Subject: [PATCH 3/7] improve on keeping original encoding

---
 podaac/subsetter/subset.py | 34 +++++++++++++++++-----------------
 tests/test_subset.py       | 18 ++++++++++++++++++
 2 files changed, 35 insertions(+), 17 deletions(-)

diff --git a/podaac/subsetter/subset.py b/podaac/subsetter/subset.py
index 4495e09a..3cf34984 100644
--- a/podaac/subsetter/subset.py
+++ b/podaac/subsetter/subset.py
@@ -1257,28 +1257,28 @@ def subset(file_to_subset: str, bbox: np.ndarray, output_file: str,
             lon_var_names=lon_var_names
             ))
         else:
-            encoding = {}
-            compression = {"zlib": True, "complevel": 5}
-            if (min_time or max_time) and not all(
-                    dim_size == 1 for dim_size in dataset.dims.values()):
-                encoding = {
-                    var_name: {
-                        'units': nc_dataset.variables[var_name].__dict__['units'],
-                        'zlib': True,
-                        "complevel": 5,
-                        "_FillValue": original_dataset[var_name].encoding.get('_FillValue')
-                    } for var_name in time_var_names
-                    if 'units' in nc_dataset.variables[var_name].__dict__
-                }
             for var in dataset.data_vars:
-                if var not in encoding:
-                    encoding[var] = compression
-                    encoding[var]['_FillValue'] = original_dataset[var].encoding.get('_FillValue')
                 if dataset[var].dtype == 'S1' and isinstance(dataset[var].attrs.get('_FillValue'), bytes):
                     dataset[var].attrs['_FillValue'] = dataset[var].attrs['_FillValue'].decode('UTF-8')
 
+                # Preserve original encoding as much as possible
+                valid_encodings = [
+                    "fletcher32",
+                    "contiguous",
+                    "shuffle",
+                    "compression"
+                ]
+
+                var_encoding = {
+                    "zlib": True, 
+                    "complevel": 5,
+                    "_FillValue": original_dataset[var].encoding.get('_FillValue')
+                }
+
+                original_encoding = {key: value for key, value in original_dataset[var].encoding.items() if key in valid_encodings}
                 data_var = dataset[var].copy()
-                data_var.load().to_netcdf(output_file, 'a', encoding={var: encoding.get(var)})
+                var_encoding.update(original_encoding)
+                data_var.load().to_netcdf(output_file, 'a', encoding={var: var_encoding})
                 del data_var
 
         with nc.Dataset(output_file, 'a') as dataset_attr:
diff --git a/tests/test_subset.py b/tests/test_subset.py
index 66c508b0..f318cfdc 100644
--- a/tests/test_subset.py
+++ b/tests/test_subset.py
@@ -158,6 +158,24 @@ def test_subset_variables(test_file, data_dir, subset_output_dir, request):
                              decode_times=False,
                              decode_coords=False)
 
+
+    nc_in_ds = nc.Dataset(join(data_dir, test_file))
+    nc_out_ds = nc.Dataset(join(subset_output_dir, output_file))
+
+    time_var_name = None
+    try:
+        lat_var_name = subset.compute_coordinate_variable_names(in_ds)[0][0]
+        time_var_name = subset.compute_time_variable_name(in_ds, in_ds[lat_var_name])
+    except ValueError:
+        # unable to determine lon lat vars
+        pass
+
+    if time_var_name:
+        assert nc_in_ds[time_var_name].units == nc_out_ds[time_var_name].units
+
+    nc_in_ds.close()
+    nc_out_ds.close()
+
     for in_var, out_var in zip(in_ds.data_vars.items(), out_ds.data_vars.items()):
         # compare names
         assert in_var[0] == out_var[0]

From 8fa3452f88fc0ba916595d49d0a7b07714390dde Mon Sep 17 00:00:00 2001
From: Simon Liu
Date: Fri, 1 Sep 2023 12:13:59 -0700
Subject: [PATCH 4/7] fix pylint

---
 podaac/subsetter/subset.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/podaac/subsetter/subset.py b/podaac/subsetter/subset.py
index 3cf34984..55add46f 100644
--- a/podaac/subsetter/subset.py
+++ b/podaac/subsetter/subset.py
@@ -1270,7 +1270,7 @@ def subset(file_to_subset: str, bbox: np.ndarray, output_file: str,
                 ]
 
                 var_encoding = {
-                    "zlib": True, 
+                    "zlib": True,
                     "complevel": 5,
                     "_FillValue": original_dataset[var].encoding.get('_FillValue')
                 }

From 562a538fd9b6faac6cf5a1d721211dd059c30b8c Mon Sep 17 00:00:00 2001
From: Simon Liu
Date: Tue, 5 Sep 2023 09:34:00 -0700
Subject: [PATCH 5/7] revert encoding

---
 podaac/subsetter/subset.py | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/podaac/subsetter/subset.py b/podaac/subsetter/subset.py
index 55add46f..9c82a947 100644
--- a/podaac/subsetter/subset.py
+++ b/podaac/subsetter/subset.py
@@ -1261,23 +1261,13 @@ def subset(file_to_subset: str, bbox: np.ndarray, output_file: str,
                 if dataset[var].dtype == 'S1' and isinstance(dataset[var].attrs.get('_FillValue'), bytes):
                     dataset[var].attrs['_FillValue'] = dataset[var].attrs['_FillValue'].decode('UTF-8')
 
-                # Preserve original encoding as much as possible
-                valid_encodings = [
-                    "fletcher32",
-                    "contiguous",
-                    "shuffle",
-                    "compression"
-                ]
-
                 var_encoding = {
                     "zlib": True,
                     "complevel": 5,
                     "_FillValue": original_dataset[var].encoding.get('_FillValue')
                 }
 
-                original_encoding = {key: value for key, value in original_dataset[var].encoding.items() if key in valid_encodings}
                 data_var = dataset[var].copy()
-                var_encoding.update(original_encoding)
                 data_var.load().to_netcdf(output_file, 'a', encoding={var: var_encoding})
                 del data_var
 
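Note: patches 3-5 converge on pulling the fill value from original_dataset[var].encoding
rather than hard-coding _FillValue: None. This works because xarray moves a decoded
variable's on-disk _FillValue from .attrs into .encoding when a file is opened. A
self-contained demonstration of that behavior; the file name and variable are made up:

    import numpy as np
    import xarray as xr

    ds = xr.Dataset({"sst": ("x", np.array([1.0, np.nan, 3.0]))})
    ds["sst"].encoding["_FillValue"] = -9999.0
    ds.to_netcdf("demo.nc")

    with xr.open_dataset("demo.nc") as reopened:
        # the on-disk fill value is recorded in .encoding, not .attrs
        print(reopened["sst"].encoding.get("_FillValue"))  # -9999.0

Passing that recovered value back through encoding={var: {"_FillValue": ...}} on write is
what lets the subsetted output round-trip the source granule's fill value.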
From 2eccd8feba92274f78ba9db571751c92fe323a56 Mon Sep 17 00:00:00 2001
From: Simon Liu
Date: Thu, 14 Sep 2023 15:13:13 -0700
Subject: [PATCH 6/7] update how we check for time overflow integer

---
 podaac/subsetter/subset.py | 32 +++-----------------------------
 1 file changed, 3 insertions(+), 29 deletions(-)

diff --git a/podaac/subsetter/subset.py b/podaac/subsetter/subset.py
index 9c82a947..df3167de 100644
--- a/podaac/subsetter/subset.py
+++ b/podaac/subsetter/subset.py
@@ -25,7 +25,6 @@ import os
 from itertools import zip_longest
 from typing import List, Tuple, Union
-import traceback
 
 import dateutil
 from dateutil import parser
@@ -1066,33 +1065,6 @@ def decode_cf_datetime(num_dates, units, calendar=None, use_cftime=None):
     xarray.coding.times.decode_cf_datetime = decode_cf_datetime
 
 
-def open_dataset_test(file, args):
-    """
-    Open a NetCDF dataset using xarray, handling specific exceptions.
-
-    This function attempts to open a NetCDF dataset using the provided arguments.
-    If an OverflowError with a specific message is encountered, it sets the
-    'mask_and_scale' argument to True so the caller can reopen the dataset.
-
-    Args:
-        file (str): Path to the NetCDF file.
-        args (dict): Dictionary of arguments to pass to xr.open_dataset.
-
-    Returns:
-        None: The function modifies the 'args' dictionary in place.
-
-    """
-    try:
-        test_xr_open = xr.open_dataset(file, **args)
-        test_xr_open.close()
-    except Exception:  # pylint: disable=broad-except
-        traceback_str = traceback.format_exc()
-
-        # Check for the specific OverflowError message
-        if "Python int too large to convert to C long" in traceback_str and "Failed to decode variable 'time': unable to decode time units" in traceback_str:
-            args["mask_and_scale"] = True
-
-
 def subset(file_to_subset: str, bbox: np.ndarray, output_file: str,
            variables: Union[List[str], str, None] = (),
            # pylint: disable=too-many-branches, disable=too-many-statements
@@ -1190,7 +1162,9 @@ def subset(file_to_subset: str, bbox: np.ndarray, output_file: str,
     if min_time or max_time:
         args['decode_times'] = True
 
-    open_dataset_test(file_to_subset, args)
+    # check fill value and dtype; we know that this will cause an integer overflow with xarray
+    if nc_dataset['time'].getncattr('_FillValue') == nc.default_fillvals.get('f8') and nc_dataset['time'].dtype == 'float64':
+        args['mask_and_scale'] = True
 
     with xr.open_dataset(
             xr.backends.NetCDF4DataStore(nc_dataset),

From cf35bbee72301c82a332193a60fe1329e90b0d77 Mon Sep 17 00:00:00 2001
From: Simon Liu
Date: Thu, 14 Sep 2023 16:04:59 -0700
Subject: [PATCH 7/7] update trying to get attributes

---
 podaac/subsetter/subset.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/podaac/subsetter/subset.py b/podaac/subsetter/subset.py
index df3167de..70c2ba24 100644
--- a/podaac/subsetter/subset.py
+++ b/podaac/subsetter/subset.py
@@ -1163,8 +1163,13 @@ def subset(file_to_subset: str, bbox: np.ndarray, output_file: str,
     if min_time or max_time:
         args['decode_times'] = True
 
     # check fill value and dtype; we know that this will cause an integer overflow with xarray
-    if nc_dataset['time'].getncattr('_FillValue') == nc.default_fillvals.get('f8') and nc_dataset['time'].dtype == 'float64':
-        args['mask_and_scale'] = True
+    if 'time' in nc_dataset.variables.keys():
+        try:
+            if nc_dataset['time'].getncattr('_FillValue') == nc.default_fillvals.get('f8') and \
+                    nc_dataset['time'].dtype == 'float64':
+                args['mask_and_scale'] = True
+        except AttributeError:
+            pass
 
     with xr.open_dataset(
             xr.backends.NetCDF4DataStore(nc_dataset),
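Note: the final check in patches 6-7 keys off a specific combination: a float64 time
variable whose _FillValue equals netCDF4's default f8 fill value (about 9.97e36), which
cannot be represented as a datetime and so breaks time decoding unless the value is masked
first. A sketch reproducing that failure on a synthetic file; the file and values are
illustrative, and the exact exception type can vary with the xarray version:

    import netCDF4 as nc
    import xarray as xr

    with nc.Dataset("time_demo.nc", "w") as ds:
        ds.createDimension("time", 2)
        time_var = ds.createVariable("time", "f8", ("time",),
                                     fill_value=nc.default_fillvals["f8"])
        time_var.units = "seconds since 2000-01-01"
        time_var[0] = 0.0  # the second element stays at the ~9.97e36 fill value

    try:
        xr.open_dataset("time_demo.nc", decode_times=True, mask_and_scale=False)
    except Exception as err:
        print("decoding failed:", type(err).__name__)

    # Masking first turns the fill value into NaT, so decoding succeeds.
    with xr.open_dataset("time_demo.nc", decode_times=True, mask_and_scale=True) as ok:
        print(ok["time"].values)

Checking the _FillValue and dtype directly on the netCDF4 variable, with the AttributeError
guard for granules whose time variable has no _FillValue at all, replaces the earlier
open-and-catch probe with a cheaper test for the same condition.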