From b254b5e860e84b6bbfea2c431e7d6231f491496d Mon Sep 17 00:00:00 2001 From: dkaufma3 Date: Mon, 28 Nov 2022 17:51:07 -0500 Subject: [PATCH 01/16] move methods for flattening netcdf and hdf group structures to separate module --- podaac/subsetter/group_handling.py | 231 +++++++++++++++++++++++++++++ podaac/subsetter/subset.py | 228 +--------------------------- tests/test_subset.py | 32 ++-- 3 files changed, 250 insertions(+), 241 deletions(-) create mode 100644 podaac/subsetter/group_handling.py diff --git a/podaac/subsetter/group_handling.py b/podaac/subsetter/group_handling.py new file mode 100644 index 00000000..cf9a648b --- /dev/null +++ b/podaac/subsetter/group_handling.py @@ -0,0 +1,231 @@ +from shutil import copy + +import h5py +import netCDF4 as nc +import numpy as np +import xarray as xr + +GROUP_DELIM = '__' + + +def transform_grouped_dataset(nc_dataset, file_to_subset): + """ + Transform a netCDF4 Dataset that has groups to an xarray compatible + dataset. xarray does not work with groups, so this transformation + will flatten the variables in the dataset and use the group path as + the new variable name. For example, data_01 > km > sst would become + 'data_01__km__sst', where GROUP_DELIM is __. + + This same pattern is applied to dimensions, which are located under + the appropriate group. They are renamed and placed in the root + group. + + Parameters + ---------- + nc_dataset : nc.Dataset + netCDF4 Dataset that contains groups + + Returns + ------- + nc.Dataset + netCDF4 Dataset that does not contain groups and that has been + flattened. + """ + + # Close the existing read-only dataset and reopen in append mode + nc_dataset.close() + nc_dataset = nc.Dataset(file_to_subset, 'r+') + + dimensions = {} + + def walk(group_node, path): + for key, item in group_node.items(): + group_path = f'{path}{GROUP_DELIM}{key}' + + # If there are variables in this group, copy to root group + # and then delete from current group + if item.variables: + # Copy variables to root group with new name + for var_name, var in item.variables.items(): + var_group_name = f'{group_path}{GROUP_DELIM}{var_name}' + nc_dataset.variables[var_group_name] = var + # Delete variables + var_names = list(item.variables.keys()) + for var_name in var_names: + del item.variables[var_name] + + if item.dimensions: + dims = list(item.dimensions.keys()) + for dim_name in dims: + new_dim_name = f'{group_path.replace("/", GROUP_DELIM)}{GROUP_DELIM}{dim_name}' + item.dimensions[new_dim_name] = item.dimensions[dim_name] + dimensions[new_dim_name] = item.dimensions[dim_name] + item.renameDimension(dim_name, new_dim_name) + + # If there are subgroups in this group, call this function + # again on that group. + if item.groups: + walk(item.groups, group_path) + + # Delete non-root groups + group_names = list(group_node.keys()) + for group_name in group_names: + del group_node[group_name] + + for var_name in list(nc_dataset.variables.keys()): + new_var_name = f'{GROUP_DELIM}{var_name}' + nc_dataset.variables[new_var_name] = nc_dataset.variables[var_name] + del nc_dataset.variables[var_name] + + walk(nc_dataset.groups, '') + + # Update the dimensions of the dataset in the root group + nc_dataset.dimensions.update(dimensions) + + return nc_dataset + + +def recombine_grouped_datasets(datasets, output_file, start_date): # pylint: disable=too-many-branches + """ + Given a list of xarray datasets, combine those datasets into a + single netCDF4 Dataset and write to the disk. 
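+    For example, a flattened variable named 'data_01__km__sst' is written back out as
+    variable 'sst' inside group '/data_01/km', reversing the flattening performed by
+    transform_grouped_dataset (names here mirror the illustrative example used in that
+    function's docstring).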
Each dataset has been + transformed using its group path and needs to be un-transformed and + placed in the appropriate group. + + Parameters + ---------- + datasets : list (xr.Dataset) + List of xarray datasets to be combined + output_file : str + Name of the output file to write the resulting NetCDF file to. + """ + + base_dataset = nc.Dataset(output_file, mode='w') + + for dataset in datasets: + group_lst = [] + for var_name in dataset.variables.keys(): # need logic if there is data in the top level not in a group + group_lst.append('/'.join(var_name.split(GROUP_DELIM)[:-1])) + group_lst = ['/' if group == '' else group for group in group_lst] + groups = set(group_lst) + for group in groups: + base_dataset.createGroup(group) + + for dim_name in list(dataset.dims.keys()): + new_dim_name = dim_name.split(GROUP_DELIM)[-1] + dim_group = _get_nested_group(base_dataset, dim_name) + dim_group.createDimension(new_dim_name, dataset.dims[dim_name]) + + # Rename variables + _rename_variables(dataset, base_dataset, start_date) + + # Remove group vars from base dataset + for var_name in list(base_dataset.variables.keys()): + if GROUP_DELIM in var_name: + del base_dataset.variables[var_name] + + # Remove group dims from base dataset + for dim_name in list(base_dataset.dimensions.keys()): + if GROUP_DELIM in dim_name: + del base_dataset.dimensions[dim_name] + + # Copy global attributes + base_dataset.setncatts(datasets[0].attrs) + # Write and close + base_dataset.close() + + +def _get_nested_group(dataset, group_path): + nested_group = dataset + for group in group_path.strip(GROUP_DELIM).split(GROUP_DELIM)[:-1]: + nested_group = nested_group.groups[group] + return nested_group + + +def _rename_variables(dataset, base_dataset, start_date): + for var_name in list(dataset.variables.keys()): + new_var_name = var_name.split(GROUP_DELIM)[-1] + var_group = _get_nested_group(base_dataset, var_name) + variable = dataset.variables[var_name] + var_dims = [x.split(GROUP_DELIM)[-1] for x in dataset.variables[var_name].dims] + if np.issubdtype( + dataset.variables[var_name].dtype, np.dtype(np.datetime64) + ) or np.issubdtype( + dataset.variables[var_name].dtype, np.dtype(np.timedelta64) + ): + if start_date: + dataset.variables[var_name].values = (dataset.variables[var_name].values - np.datetime64(start_date))/np.timedelta64(1, 's') + variable = dataset.variables[var_name] + else: + cf_dt_coder = xr.coding.times.CFDatetimeCoder() + encoded_var = cf_dt_coder.encode(dataset.variables[var_name]) + variable = encoded_var + + var_attrs = variable.attrs + fill_value = var_attrs.get('_FillValue') + var_attrs.pop('_FillValue', None) + comp_args = {"zlib": True, "complevel": 1} + + if variable.dtype == object: + var_group.createVariable(new_var_name, 'S1', var_dims, fill_value=fill_value, **comp_args) + elif variable.dtype == 'timedelta64[ns]': + var_group.createVariable(new_var_name, 'i4', var_dims, fill_value=fill_value, **comp_args) + else: + var_group.createVariable(new_var_name, variable.dtype, var_dims, fill_value=fill_value, **comp_args) + + # Copy attributes + var_group.variables[new_var_name].setncatts(var_attrs) + + # Copy data + var_group.variables[new_var_name].set_auto_maskandscale(False) + var_group.variables[new_var_name][:] = variable.data + + +def h5file_transform(finput): + """ + Transform a h5py Dataset that has groups to an xarray compatible + dataset. xarray does not work with groups, so this transformation + will flatten the variables in the dataset and use the group path as + the new variable name. 
For example, data_01 > km > sst would become + 'data_01__km__sst', where GROUP_DELIM is __. + + Returns + ------- + nc.Dataset + netCDF4 Dataset that does not contain groups and that has been + flattened. + """ + data_new = h5py.File(finput, 'r+') + del_group_list = list(data_new.keys()) + has_groups = bool(data_new['/']) + + def walk_h5py(data_new, group): + # flattens h5py file + for key, item in data_new[group].items(): + group_path = f'{group}{key}' + if isinstance(item, h5py.Dataset): + new_var_name = group_path.replace('/', '__') + + data_new[new_var_name] = data_new[group_path] + del data_new[group_path] + + elif isinstance(item, h5py.Group): + if len(list(item.keys())) == 0: + new_group_name = group_path.replace('/', '__') + data_new[new_group_name] = data_new[group_path] + + walk_h5py(data_new, data_new[group_path].name + '/') + + walk_h5py(data_new, data_new.name) + + for del_group in del_group_list: + del data_new[del_group] + + finputnc = '.'.join(finput.split('.')[:-1]) + '.nc' + + data_new.close() # close the h5py dataset + copy(finput, finputnc) # copy to a nc file + + nc_dataset = nc.Dataset(finputnc, mode='r') + + return nc_dataset, has_groups diff --git a/podaac/subsetter/subset.py b/podaac/subsetter/subset.py index 4d9d7cbe..e3db8de2 100644 --- a/podaac/subsetter/subset.py +++ b/podaac/subsetter/subset.py @@ -23,11 +23,9 @@ import json import operator import os -from shutil import copy import cf_xarray as cfxr import geopandas as gpd -import h5py import importlib_metadata import julian import netCDF4 as nc @@ -39,8 +37,9 @@ from podaac.subsetter import dimension_cleanup as dc from podaac.subsetter import xarray_enhancements as xre +from podaac.subsetter.group_handling import GROUP_DELIM, transform_grouped_dataset, recombine_grouped_datasets, \ + h5file_transform -GROUP_DELIM = '__' SERVICE_NAME = 'l2ss-py' @@ -871,229 +870,6 @@ def in_shape(lon, lat): return xre.where(dataset, boolean_mask, cut) -def transform_grouped_dataset(nc_dataset, file_to_subset): - """ - Transform a netCDF4 Dataset that has groups to an xarray compatible - dataset. xarray does not work with groups, so this transformation - will flatten the variables in the dataset and use the group path as - the new variable name. For example, data_01 > km > sst would become - 'data_01__km__sst', where GROUP_DELIM is __. - - This same pattern is applied to dimensions, which are located under - the appropriate group. They are renamed and placed in the root - group. - - Parameters - ---------- - nc_dataset : nc.Dataset - netCDF4 Dataset that contains groups - - Returns - ------- - nc.Dataset - netCDF4 Dataset that does not contain groups and that has been - flattened. 
- """ - - # Close the existing read-only dataset and reopen in append mode - nc_dataset.close() - nc_dataset = nc.Dataset(file_to_subset, 'r+') - - dimensions = {} - - def walk(group_node, path): - for key, item in group_node.items(): - group_path = f'{path}{GROUP_DELIM}{key}' - - # If there are variables in this group, copy to root group - # and then delete from current group - if item.variables: - # Copy variables to root group with new name - for var_name, var in item.variables.items(): - var_group_name = f'{group_path}{GROUP_DELIM}{var_name}' - nc_dataset.variables[var_group_name] = var - # Delete variables - var_names = list(item.variables.keys()) - for var_name in var_names: - del item.variables[var_name] - - if item.dimensions: - dims = list(item.dimensions.keys()) - for dim_name in dims: - new_dim_name = f'{group_path.replace("/", GROUP_DELIM)}{GROUP_DELIM}{dim_name}' - item.dimensions[new_dim_name] = item.dimensions[dim_name] - dimensions[new_dim_name] = item.dimensions[dim_name] - item.renameDimension(dim_name, new_dim_name) - - # If there are subgroups in this group, call this function - # again on that group. - if item.groups: - walk(item.groups, group_path) - - # Delete non-root groups - group_names = list(group_node.keys()) - for group_name in group_names: - del group_node[group_name] - - for var_name in list(nc_dataset.variables.keys()): - new_var_name = f'{GROUP_DELIM}{var_name}' - nc_dataset.variables[new_var_name] = nc_dataset.variables[var_name] - del nc_dataset.variables[var_name] - - walk(nc_dataset.groups, '') - - # Update the dimensions of the dataset in the root group - nc_dataset.dimensions.update(dimensions) - - return nc_dataset - - -def recombine_grouped_datasets(datasets, output_file, start_date): # pylint: disable=too-many-branches - """ - Given a list of xarray datasets, combine those datasets into a - single netCDF4 Dataset and write to the disk. Each dataset has been - transformed using its group path and needs to be un-transformed and - placed in the appropriate group. - - Parameters - ---------- - datasets : list (xr.Dataset) - List of xarray datasets to be combined - output_file : str - Name of the output file to write the resulting NetCDF file to. 
- """ - - base_dataset = nc.Dataset(output_file, mode='w') - - for dataset in datasets: - group_lst = [] - for var_name in dataset.variables.keys(): # need logic if there is data in the top level not in a group - group_lst.append('/'.join(var_name.split(GROUP_DELIM)[:-1])) - group_lst = ['/' if group == '' else group for group in group_lst] - groups = set(group_lst) - for group in groups: - base_dataset.createGroup(group) - - for dim_name in list(dataset.dims.keys()): - new_dim_name = dim_name.split(GROUP_DELIM)[-1] - dim_group = _get_nested_group(base_dataset, dim_name) - dim_group.createDimension(new_dim_name, dataset.dims[dim_name]) - - # Rename variables - _rename_variables(dataset, base_dataset, start_date) - - # Remove group vars from base dataset - for var_name in list(base_dataset.variables.keys()): - if GROUP_DELIM in var_name: - del base_dataset.variables[var_name] - - # Remove group dims from base dataset - for dim_name in list(base_dataset.dimensions.keys()): - if GROUP_DELIM in dim_name: - del base_dataset.dimensions[dim_name] - - # Copy global attributes - base_dataset.setncatts(datasets[0].attrs) - # Write and close - base_dataset.close() - - -def _get_nested_group(dataset, group_path): - nested_group = dataset - for group in group_path.strip(GROUP_DELIM).split(GROUP_DELIM)[:-1]: - nested_group = nested_group.groups[group] - return nested_group - - -def _rename_variables(dataset, base_dataset, start_date): - for var_name in list(dataset.variables.keys()): - new_var_name = var_name.split(GROUP_DELIM)[-1] - var_group = _get_nested_group(base_dataset, var_name) - variable = dataset.variables[var_name] - var_dims = [x.split(GROUP_DELIM)[-1] for x in dataset.variables[var_name].dims] - if np.issubdtype( - dataset.variables[var_name].dtype, np.dtype(np.datetime64) - ) or np.issubdtype( - dataset.variables[var_name].dtype, np.dtype(np.timedelta64) - ): - if start_date: - dataset.variables[var_name].values = (dataset.variables[var_name].values - np.datetime64(start_date))/np.timedelta64(1, 's') - variable = dataset.variables[var_name] - else: - cf_dt_coder = xr.coding.times.CFDatetimeCoder() - encoded_var = cf_dt_coder.encode(dataset.variables[var_name]) - variable = encoded_var - - var_attrs = variable.attrs - fill_value = var_attrs.get('_FillValue') - var_attrs.pop('_FillValue', None) - comp_args = {"zlib": True, "complevel": 1} - - if variable.dtype == object: - var_group.createVariable(new_var_name, 'S1', var_dims, fill_value=fill_value, **comp_args) - elif variable.dtype == 'timedelta64[ns]': - var_group.createVariable(new_var_name, 'i4', var_dims, fill_value=fill_value, **comp_args) - else: - var_group.createVariable(new_var_name, variable.dtype, var_dims, fill_value=fill_value, **comp_args) - - # Copy attributes - var_group.variables[new_var_name].setncatts(var_attrs) - - # Copy data - var_group.variables[new_var_name].set_auto_maskandscale(False) - var_group.variables[new_var_name][:] = variable.data - - -def h5file_transform(finput): - """ - Transform a h5py Dataset that has groups to an xarray compatible - dataset. xarray does not work with groups, so this transformation - will flatten the variables in the dataset and use the group path as - the new variable name. For example, data_01 > km > sst would become - 'data_01__km__sst', where GROUP_DELIM is __. - - Returns - ------- - nc.Dataset - netCDF4 Dataset that does not contain groups and that has been - flattened. 
- """ - data_new = h5py.File(finput, 'r+') - del_group_list = list(data_new.keys()) - has_groups = bool(data_new['/']) - - def walk_h5py(data_new, group): - # flattens h5py file - for key, item in data_new[group].items(): - group_path = f'{group}{key}' - if isinstance(item, h5py.Dataset): - new_var_name = group_path.replace('/', '__') - - data_new[new_var_name] = data_new[group_path] - del data_new[group_path] - - elif isinstance(item, h5py.Group): - if len(list(item.keys())) == 0: - new_group_name = group_path.replace('/', '__') - data_new[new_group_name] = data_new[group_path] - - walk_h5py(data_new, data_new[group_path].name + '/') - - walk_h5py(data_new, data_new.name) - - for del_group in del_group_list: - del data_new[del_group] - - finputnc = '.'.join(finput.split('.')[:-1]) + '.nc' - - data_new.close() # close the h5py dataset - copy(finput, finputnc) # copy to a nc file - - nc_dataset = nc.Dataset(finputnc, mode='r') - - return nc_dataset, has_groups - - def get_coordinate_variable_names(dataset, lat_var_names=None, lon_var_names=None, time_var_names=None): """ Retrieve coordinate variables for this dataset. If coordinate diff --git a/tests/test_subset.py b/tests/test_subset.py index 763345ec..5c683540 100644 --- a/tests/test_subset.py +++ b/tests/test_subset.py @@ -36,6 +36,7 @@ from jsonschema import validate from shapely.geometry import Point +from podaac.subsetter import group_handling as gh from podaac.subsetter import subset from podaac.subsetter.subset import SERVICE_NAME from podaac.subsetter import xarray_enhancements as xre @@ -886,7 +887,7 @@ def test_transform_grouped_dataset(self): os.path.join(self.subset_output_dir, s6_file_name)) nc_ds = nc.Dataset(os.path.join(self.test_data_dir, 'sentinel_6', s6_file_name)) - nc_ds_transformed = subset.transform_grouped_dataset( + nc_ds_transformed = gh.transform_grouped_dataset( nc.Dataset(os.path.join(self.subset_output_dir, s6_file_name), 'r'), os.path.join(self.subset_output_dir, s6_file_name) ) @@ -1361,16 +1362,16 @@ def test_root_group(self): 'mask_and_scale': False, 'decode_times': False } - nc_dataset = subset.transform_grouped_dataset(nc_dataset, os.path.join(self.subset_output_dir, sndr_file_name)) + nc_dataset = gh.transform_grouped_dataset(nc_dataset, os.path.join(self.subset_output_dir, sndr_file_name)) with xr.open_dataset( xr.backends.NetCDF4DataStore(nc_dataset), **args ) as dataset: var_list = list(dataset.variables) - assert (var_list[0][0:2] == subset.GROUP_DELIM) + assert (var_list[0][0:2] == gh.GROUP_DELIM) group_lst = [] for var_name in dataset.variables.keys(): #need logic if there is data in the top level not in a group - group_lst.append('/'.join(var_name.split(subset.GROUP_DELIM)[:-1])) + group_lst.append('/'.join(var_name.split(gh.GROUP_DELIM)[:-1])) group_lst = ['/' if group=='' else group for group in group_lst] groups = set(group_lst) expected_group = {'/mw', '/ave_kern', '/', '/mol_lay', '/aux'} @@ -1392,7 +1393,7 @@ def test_get_time_squeeze(self): 'mask_and_scale': False, 'decode_times': False } - nc_dataset = subset.transform_grouped_dataset(nc_dataset, os.path.join(self.subset_output_dir, tropomi_file_name)) + nc_dataset = gh.transform_grouped_dataset(nc_dataset, os.path.join(self.subset_output_dir, tropomi_file_name)) with xr.open_dataset( xr.backends.NetCDF4DataStore(nc_dataset), **args @@ -1417,7 +1418,7 @@ def test_get_indexers_nd(self): 'mask_and_scale': False, 'decode_times': False } - nc_dataset = subset.transform_grouped_dataset(nc_dataset, os.path.join(self.subset_output_dir, 
tropomi_file_name)) + nc_dataset = gh.transform_grouped_dataset(nc_dataset, os.path.join(self.subset_output_dir, tropomi_file_name)) with xr.open_dataset( xr.backends.NetCDF4DataStore(nc_dataset), **args @@ -1484,7 +1485,7 @@ def test_transform_h5py_dataset(self): entry_lst.append(entry_str + "/" + group_keys) key_lst.append(entry_str + "/" + group_keys) - nc_dataset, has_groups = subset.h5file_transform(os.path.join(self.subset_output_dir, OMI_file_name)) + nc_dataset, has_groups = gh.h5file_transform(os.path.join(self.subset_output_dir, OMI_file_name)) nc_vars_flattened = list(nc_dataset.variables.keys()) for i in range(len(entry_lst)): # go through all the datasets in h5py file @@ -1511,32 +1512,33 @@ def test_variable_dims_matched_tropomi(self): # Get variable dimensions from input dataset in_var_dims = { - var_name: [dim.split(subset.GROUP_DELIM)[-1] for dim in var.dimensions] + var_name: [dim.split(gh.GROUP_DELIM)[-1] for dim in var.dimensions] for var_name, var in in_nc.groups['PRODUCT'].variables.items() } # Get variables from METADATA group in_var_dims.update( { - var_name: [dim.split(subset.GROUP_DELIM)[-1] for dim in var.dimensions] + var_name: [dim.split(gh.GROUP_DELIM)[-1] for dim in var.dimensions] for var_name, var in in_nc.groups['METADATA'].groups['QA_STATISTICS'].variables.items() } ) # Include PRODUCT>SUPPORT_DATA>GEOLOCATIONS location in_var_dims.update( { - var_name: [dim.split(subset.GROUP_DELIM)[-1] for dim in var.dimensions] + var_name: [dim.split(gh.GROUP_DELIM)[-1] for dim in var.dimensions] for var_name, var in in_nc.groups['PRODUCT'].groups['SUPPORT_DATA'].groups['GEOLOCATIONS'].variables.items() } ) - out_nc = subset.transform_grouped_dataset( + out_nc = gh.transform_grouped_dataset( in_nc, os.path.join(self.subset_output_dir, tropomi_file_name) ) # Get variable dimensions from output dataset out_var_dims = { - var_name.split(subset.GROUP_DELIM)[-1]: [dim.split(subset.GROUP_DELIM)[-1] for dim in var.dimensions] + var_name.split(gh.GROUP_DELIM)[-1]: [dim.split( + gh.GROUP_DELIM)[-1] for dim in var.dimensions] for var_name, var in out_nc.variables.items() } @@ -1602,7 +1604,7 @@ def test_get_time_epoch_var(self): nc_dataset = nc.Dataset(os.path.join(self.subset_output_dir, tropomi_file), mode='r') - nc_dataset = subset.transform_grouped_dataset(nc_dataset, os.path.join(self.subset_output_dir, tropomi_file)) + nc_dataset = gh.transform_grouped_dataset(nc_dataset, os.path.join(self.subset_output_dir, tropomi_file)) args = { 'decode_coords': False, @@ -1696,7 +1698,7 @@ def test_temporal_he5file_subset(self): min_time='2020-01-16T12:30:00Z' max_time='2020-01-16T12:40:00Z' bbox = np.array(((-180, 180), (-90, 90))) - nc_dataset, has_groups = subset.h5file_transform(os.path.join(self.subset_output_dir, OMI_copy_file)) + nc_dataset, has_groups = gh.h5file_transform(os.path.join(self.subset_output_dir, OMI_copy_file)) args = { 'decode_coords': False, @@ -1824,7 +1826,7 @@ def test_get_time_OMI(self): shutil.copyfile(os.path.join(self.test_data_dir, 'OMI', omi_file), os.path.join(self.subset_output_dir, omi_file)) - nc_dataset, has_groups = subset.h5file_transform(os.path.join(self.subset_output_dir, omi_file)) + nc_dataset, has_groups = gh.h5file_transform(os.path.join(self.subset_output_dir, omi_file)) args = { 'decode_coords': False, From 8c2d9e8c545df1d1a48ae5656713be7fa9c8ca14 Mon Sep 17 00:00:00 2001 From: sliu008 <69875423+sliu008@users.noreply.github.com> Date: Thu, 8 Dec 2022 10:13:39 -0800 Subject: [PATCH 02/16] feature/PODAAC-5065 (#129) * fix way xarray open 
granules that have as a time unit * fix pylint * change function to use original function if can parse only change units if we can not parse * make xarray override into its own function * add test for override_decode_cf_datetime function * disable pyline one line instead of global * Update podaac/subsetter/subset.py Co-authored-by: Frank Greguska <89428916+frankinspace@users.noreply.github.com> --- CHANGELOG.md | 1 + podaac/subsetter/subset.py | 29 +++++++++++++++++++++++++++++ tests/test_subset.py | 35 +++++++++++++++++++++++++++++++++++ 3 files changed, 65 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3dcd2a23..69ecce10 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Deprecated ### Removed ### Fixed +- PODAAC-5065: integration with SMAP_RSS_L2_SSS_V5, fix way xarray open granules that have `seconds since 2000-1-1 0:0:0 0` as a time unit. ### Security ## [2.2.0] diff --git a/podaac/subsetter/subset.py b/podaac/subsetter/subset.py index 4d9d7cbe..e5ed21b1 100644 --- a/podaac/subsetter/subset.py +++ b/podaac/subsetter/subset.py @@ -24,8 +24,11 @@ import operator import os from shutil import copy +import dateutil +from dateutil import parser import cf_xarray as cfxr +import cftime import geopandas as gpd import h5py import importlib_metadata @@ -34,6 +37,7 @@ import numpy as np import pandas as pd import xarray as xr +import xarray.coding.times from shapely.geometry import Point from shapely.ops import transform @@ -1154,6 +1158,29 @@ def convert_to_datetime(dataset, time_vars): return dataset, start_date +def override_decode_cf_datetime(): + """ + WARNING !!! REMOVE AT EARLIEST XARRAY FIX, this is a override to xarray override_decode_cf_datetime function. + xarray has problems decoding time units with format `seconds since 2000-1-1 0:0:0 0`, this solves by testing + the unit to see if its parsable, if it is use original function, if not format unit into a parsable format. 
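+    For example, a unit string like 'seconds since 2000-1-1 0:0:0 0' (which dateutil fails
+    to parse) is rewritten using the reference time computed by cftime, yielding a parsable
+    form such as 'seconds since 2000-01-01 00:00:00'.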
+ + https://github.com/pydata/xarray/issues/7210 + """ + + orig_decode_cf_datetime = xarray.coding.times.decode_cf_datetime + + def decode_cf_datetime(num_dates, units, calendar=None, use_cftime=None): + try: + parser.parse(units.split('since')[-1]) + return orig_decode_cf_datetime(num_dates, units, calendar, use_cftime) + except dateutil.parser.ParserError: + reference_time = cftime.num2date(0, units, calendar) + units = f"{units.split('since')[0]} since {reference_time}" + return orig_decode_cf_datetime(num_dates, units, calendar, use_cftime) + + xarray.coding.times.decode_cf_datetime = decode_cf_datetime + + def subset(file_to_subset, bbox, output_file, variables=None, # pylint: disable=too-many-branches, disable=too-many-statements cut=True, shapefile=None, min_time=None, max_time=None, origin_source=None, @@ -1221,6 +1248,8 @@ def subset(file_to_subset, bbox, output_file, variables=None, nc_dataset, rename_vars = dc.remove_duplicate_dims(nc_dataset) + override_decode_cf_datetime() + if variables: variables = [x.replace('/', GROUP_DELIM) for x in variables] diff --git a/tests/test_subset.py b/tests/test_subset.py index 763345ec..c8587c6f 100644 --- a/tests/test_subset.py +++ b/tests/test_subset.py @@ -1945,3 +1945,38 @@ def test_passed_coords(self): assert lats == dummy_lats assert lons == dummy_lons assert times == dummy_times + + def test_bad_time_unit(self): + + fill_val = -99999.0 + time_vals = np.random.rand(10) + time_vals[0] = fill_val + time_vals[-1] = fill_val + + data_vars = { + 'foo': (['x'], np.random.rand(10)), + 'time': ( + ['x'], + time_vals, + { + 'units': 'seconds since 2000-1-1 0:0:0 0', + '_FillValue': fill_val, + 'standard_name': 'time', + 'calendar': 'standard' + } + ), + } + + ds = xr.Dataset( + data_vars=data_vars, + coords={'x': (['x'], np.arange(10))} + ) + + nc_out_location = join(self.subset_output_dir, "bad_time.nc") + ds.to_netcdf(nc_out_location) + + subset.override_decode_cf_datetime() + + ds_test = xr.open_dataset(nc_out_location) + ds_test.close() + From 91297b6d543b214aa0befa3328ed26160165dced Mon Sep 17 00:00:00 2001 From: danielfromearth Date: Thu, 8 Dec 2022 13:19:01 -0500 Subject: [PATCH 03/16] add missing parameter to docstring --- podaac/subsetter/group_handling.py | 1 + 1 file changed, 1 insertion(+) diff --git a/podaac/subsetter/group_handling.py b/podaac/subsetter/group_handling.py index cf9a648b..a8e6b0a5 100644 --- a/podaac/subsetter/group_handling.py +++ b/podaac/subsetter/group_handling.py @@ -24,6 +24,7 @@ def transform_grouped_dataset(nc_dataset, file_to_subset): ---------- nc_dataset : nc.Dataset netCDF4 Dataset that contains groups + file_to_subset : str Returns ------- From 1311835d9d10e536c6283c6b65f03a2c8c367c7a Mon Sep 17 00:00:00 2001 From: danielfromearth Date: Thu, 8 Dec 2022 13:19:39 -0500 Subject: [PATCH 04/16] typo in docstring --- podaac/subsetter/subset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/podaac/subsetter/subset.py b/podaac/subsetter/subset.py index e3db8de2..b1673352 100644 --- a/podaac/subsetter/subset.py +++ b/podaac/subsetter/subset.py @@ -489,7 +489,7 @@ def compute_time_variable_name(dataset, lat_var): Parameters ---------- - dataset : xr.Dataset: + dataset : xr.Dataset xarray dataset to find time variable from lat_var : xr.Variable Lat variable for this dataset From e41dd08030fb077665f8ee7f6c726a4bfe93b220 Mon Sep 17 00:00:00 2001 From: danielfromearth Date: Thu, 8 Dec 2022 13:21:03 -0500 Subject: [PATCH 05/16] extract netcdf opening procedure from beginning of `subset() into a new 
function --- podaac/subsetter/subset.py | 35 +++++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/podaac/subsetter/subset.py b/podaac/subsetter/subset.py index b1673352..92607a88 100644 --- a/podaac/subsetter/subset.py +++ b/podaac/subsetter/subset.py @@ -930,6 +930,26 @@ def convert_to_datetime(dataset, time_vars): return dataset, start_date +def open_as_nc_dataset(filepath: str) -> tuple[nc.Dataset, list, bool]: + """Open netcdf file, and flatten groups if they exist.""" + file_extension = filepath.split('.')[-1] + + if file_extension == 'he5': + nc_dataset, has_groups = h5file_transform(filepath) + else: + # Open dataset with netCDF4 first, so we can get group info + nc_dataset = nc.Dataset(filepath, mode='r') + has_groups = bool(nc_dataset.groups) + + # If dataset has groups, transform to work with xarray + if has_groups: + nc_dataset = transform_grouped_dataset(nc_dataset, filepath) + + nc_dataset, rename_vars = dc.remove_duplicate_dims(nc_dataset) + + return nc_dataset, rename_vars, has_groups + + def subset(file_to_subset, bbox, output_file, variables=None, # pylint: disable=too-many-branches, disable=too-many-statements cut=True, shapefile=None, min_time=None, max_time=None, origin_source=None, @@ -982,20 +1002,7 @@ def subset(file_to_subset, bbox, output_file, variables=None, than one value in the case where there are multiple groups and different coordinate variables for each group. """ - file_extension = file_to_subset.split('.')[-1] - - if file_extension == 'he5': - nc_dataset, has_groups = h5file_transform(file_to_subset) - else: - # Open dataset with netCDF4 first, so we can get group info - nc_dataset = nc.Dataset(file_to_subset, mode='r') - has_groups = bool(nc_dataset.groups) - - # If dataset has groups, transform to work with xarray - if has_groups: - nc_dataset = transform_grouped_dataset(nc_dataset, file_to_subset) - - nc_dataset, rename_vars = dc.remove_duplicate_dims(nc_dataset) + nc_dataset, rename_vars, has_groups = open_as_nc_dataset(file_to_subset) if variables: variables = [x.replace('/', GROUP_DELIM) for x in variables] From b4d51a2753e81b25fdda2d7785bb59c265ddad02 Mon Sep 17 00:00:00 2001 From: danielfromearth Date: Thu, 8 Dec 2022 13:22:04 -0500 Subject: [PATCH 06/16] update tests to use netcdf opening wrapper function, to prevent errors with tempo data --- tests/test_subset.py | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/tests/test_subset.py b/tests/test_subset.py index 5c683540..982d900f 100644 --- a/tests/test_subset.py +++ b/tests/test_subset.py @@ -170,11 +170,19 @@ def test_subset_bbox(self): output_file=join(self.subset_output_dir, output_file) ) - out_ds = xr.open_dataset(join(self.subset_output_dir, output_file), + file_to_subset = join(self.subset_output_dir, output_file) + + out_ds, rename_vars, _ = subset.open_as_nc_dataset(file_to_subset) + out_ds = xr.open_dataset(xr.backends.NetCDF4DataStore(out_ds), decode_times=False, decode_coords=False, mask_and_scale=False) + # out_ds = xr.open_dataset(join(self.subset_output_dir, output_file), + # decode_times=False, + # decode_coords=False, + # mask_and_scale=False) + lat_var_name, lon_var_name = subset.compute_coordinate_variable_names(out_ds) lat_var_name = lat_var_name[0] @@ -555,6 +563,7 @@ def test_specified_variables(self): excluded_variables = list(set(variable[0] for variable in in_ds.data_vars.items()) - set(included_variables)) + in_ds.close() subset.subset( file_to_subset=join(self.test_data_dir, 
file), @@ -563,6 +572,14 @@ def test_specified_variables(self): variables=included_variables ) + in_ds, rename_vars, _ = subset.open_as_nc_dataset(join(self.test_data_dir, file)) + in_ds = xr.open_dataset(xr.backends.NetCDF4DataStore(in_ds), + decode_times=False, + decode_coords=False) + + # in_ds = xr.open_dataset(join(self.test_data_dir, file), + # decode_times=False, + # decode_coords=False) # Get coord variables time_var_name = [] lat_var_names, lon_var_names = subset.compute_coordinate_variable_names(in_ds) @@ -1221,7 +1238,12 @@ def test_get_time_variable_name(self): 'decode_times': True } time_var_names = [] - ds = xr.open_dataset(os.path.join(self.test_data_dir, test_file), **args) + ds, rename_vars, _ = subset.open_as_nc_dataset(os.path.join(self.test_data_dir, test_file)) + ds = xr.open_dataset(xr.backends.NetCDF4DataStore(ds), + decode_times=False, + decode_coords=False, + mask_and_scale=False) + # ds = xr.open_dataset(os.path.join(self.test_data_dir, test_file), **args) lat_var_name = subset.compute_coordinate_variable_names(ds)[0][0] time_var_name = subset.compute_time_variable_name(ds, ds[lat_var_name]) From 8fe0f41e6972fade0ca180b81c33f82bae20c493 Mon Sep 17 00:00:00 2001 From: l2ss-py bot Date: Thu, 8 Dec 2022 18:31:04 +0000 Subject: [PATCH 07/16] /version 2.3.0-alpha.5 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index abd88b9f..b84fe59f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,7 +12,7 @@ [tool.poetry] name = "l2ss-py" -version = "2.3.0-alpha.4" +version = "2.3.0-alpha.5" description = "L2 Subsetter Service" authors = ["podaac-tva "] license = "Apache-2.0" From bc5889905b3190186e2e9e75049ecfed3045f013 Mon Sep 17 00:00:00 2001 From: danielfromearth Date: Thu, 8 Dec 2022 13:57:25 -0500 Subject: [PATCH 08/16] update `test_specified_variables()` to use netcdf opening wrapper function in multiple places to prevent errors with tempo data --- tests/test_subset.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/tests/test_subset.py b/tests/test_subset.py index 982d900f..60421275 100644 --- a/tests/test_subset.py +++ b/tests/test_subset.py @@ -554,9 +554,13 @@ def test_specified_variables(self): for file in self.test_files: output_file = "{}_{}".format(self._testMethodName, file) - in_ds = xr.open_dataset(join(self.test_data_dir, file), - decode_times=False, - decode_coords=False) + in_ds, rename_vars, _ = subset.open_as_nc_dataset(join(self.test_data_dir, file)) + in_ds = xr.open_dataset(xr.backends.NetCDF4DataStore(in_ds), + decode_times=False, + decode_coords=False) + # in_ds = xr.open_dataset(join(self.test_data_dir, file), + # decode_times=False, + # decode_coords=False) included_variables = set([variable[0] for variable in in_ds.data_vars.items()][::2]) included_variables = list(included_variables) @@ -599,9 +603,13 @@ def test_specified_variables(self): if time_var_name in excluded_variables: excluded_variables.remove(time_var_name) - out_ds = xr.open_dataset(join(self.subset_output_dir, output_file), + out_ds, rename_vars, _ = subset.open_as_nc_dataset(join(self.subset_output_dir, output_file)) + out_ds = xr.open_dataset(xr.backends.NetCDF4DataStore(out_ds), decode_times=False, decode_coords=False) + # out_ds = xr.open_dataset(join(self.subset_output_dir, output_file), + # decode_times=False, + # decode_coords=False) out_vars = [out_var for out_var in out_ds.data_vars.keys()] out_vars.extend(out_ds.coords.keys()) From 
0728f9783fdcf6d4f2ace587c8092265d7483135 Mon Sep 17 00:00:00 2001 From: danielfromearth Date: Thu, 8 Dec 2022 14:21:38 -0500 Subject: [PATCH 09/16] cosmetic --- tests/test_subset.py | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) diff --git a/tests/test_subset.py b/tests/test_subset.py index 60421275..93cc5a1d 100644 --- a/tests/test_subset.py +++ b/tests/test_subset.py @@ -178,11 +178,6 @@ def test_subset_bbox(self): decode_coords=False, mask_and_scale=False) - # out_ds = xr.open_dataset(join(self.subset_output_dir, output_file), - # decode_times=False, - # decode_coords=False, - # mask_and_scale=False) - lat_var_name, lon_var_name = subset.compute_coordinate_variable_names(out_ds) lat_var_name = lat_var_name[0] @@ -556,11 +551,8 @@ def test_specified_variables(self): in_ds, rename_vars, _ = subset.open_as_nc_dataset(join(self.test_data_dir, file)) in_ds = xr.open_dataset(xr.backends.NetCDF4DataStore(in_ds), - decode_times=False, - decode_coords=False) - # in_ds = xr.open_dataset(join(self.test_data_dir, file), - # decode_times=False, - # decode_coords=False) + decode_times=False, + decode_coords=False) included_variables = set([variable[0] for variable in in_ds.data_vars.items()][::2]) included_variables = list(included_variables) @@ -578,12 +570,9 @@ def test_specified_variables(self): in_ds, rename_vars, _ = subset.open_as_nc_dataset(join(self.test_data_dir, file)) in_ds = xr.open_dataset(xr.backends.NetCDF4DataStore(in_ds), - decode_times=False, - decode_coords=False) + decode_times=False, + decode_coords=False) - # in_ds = xr.open_dataset(join(self.test_data_dir, file), - # decode_times=False, - # decode_coords=False) # Get coord variables time_var_name = [] lat_var_names, lon_var_names = subset.compute_coordinate_variable_names(in_ds) @@ -607,9 +596,6 @@ def test_specified_variables(self): out_ds = xr.open_dataset(xr.backends.NetCDF4DataStore(out_ds), decode_times=False, decode_coords=False) - # out_ds = xr.open_dataset(join(self.subset_output_dir, output_file), - # decode_times=False, - # decode_coords=False) out_vars = [out_var for out_var in out_ds.data_vars.keys()] out_vars.extend(out_ds.coords.keys()) From 6bf7888818075852f0b799fab847c6068ece08e9 Mon Sep 17 00:00:00 2001 From: danielfromearth Date: Thu, 8 Dec 2022 14:29:00 -0500 Subject: [PATCH 10/16] clean up comment and use 'decode_times'=True for test --- tests/test_subset.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/tests/test_subset.py b/tests/test_subset.py index 93cc5a1d..23327826 100644 --- a/tests/test_subset.py +++ b/tests/test_subset.py @@ -1233,11 +1233,8 @@ def test_get_time_variable_name(self): } time_var_names = [] ds, rename_vars, _ = subset.open_as_nc_dataset(os.path.join(self.test_data_dir, test_file)) - ds = xr.open_dataset(xr.backends.NetCDF4DataStore(ds), - decode_times=False, - decode_coords=False, - mask_and_scale=False) - # ds = xr.open_dataset(os.path.join(self.test_data_dir, test_file), **args) + ds = xr.open_dataset(xr.backends.NetCDF4DataStore(ds), **args) + lat_var_name = subset.compute_coordinate_variable_names(ds)[0][0] time_var_name = subset.compute_time_variable_name(ds, ds[lat_var_name]) From ef5c63632a2847d675f7d45af03a7891ee92ec3f Mon Sep 17 00:00:00 2001 From: Nick Lenssen Date: Mon, 12 Dec 2022 12:24:41 -0500 Subject: [PATCH 11/16] feature/issue 126 (#131) * Add variable leading slash flexibility * Add tests back to test file * changelog added and updated * Update podaac/subsetter/subset.py Co-authored-by: Frank Greguska 
<89428916+frankinspace@users.noreply.github.com> * update Syntax * resolve conflict Co-authored-by: nlensse1 Co-authored-by: Frank Greguska <89428916+frankinspace@users.noreply.github.com> --- CHANGELOG.md | 2 ++ podaac/subsetter/subset.py | 2 ++ tests/test_subset.py | 37 ++++++++++++++++++++++++++++++++++++- 3 files changed, 40 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 69ecce10..1ffbad7c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] ### Added +- [issue/126](https://github.com/podaac/l2ss-py/issues/126): Added flexibility to variable subsetting +for variables to not have leading slash in the front ### Changed ### Deprecated ### Removed diff --git a/podaac/subsetter/subset.py b/podaac/subsetter/subset.py index e5ed21b1..2e1b4627 100644 --- a/podaac/subsetter/subset.py +++ b/podaac/subsetter/subset.py @@ -1252,6 +1252,8 @@ def subset(file_to_subset, bbox, output_file, variables=None, if variables: variables = [x.replace('/', GROUP_DELIM) for x in variables] + if has_groups: + variables = [GROUP_DELIM + x if not x.startswith(GROUP_DELIM) else x for x in variables] args = { 'decode_coords': False, diff --git a/tests/test_subset.py b/tests/test_subset.py index c8587c6f..161bbf36 100644 --- a/tests/test_subset.py +++ b/tests/test_subset.py @@ -850,7 +850,6 @@ def test_variable_subset_oco2(self): var_listout = list(out_nc.groups['Retrieval'].variables.keys()) assert ('water_height' in var_listout) - def test_variable_subset_s6(self): """ multiple variable subset of variables in different groups in oco3 @@ -1946,6 +1945,42 @@ def test_passed_coords(self): assert lons == dummy_lons assert times == dummy_times + def test_var_subsetting_tropomi(self): + """ + Check that variable subsetting is the same if a leading slash is included + """ + TROP_dir = join(self.test_data_dir, 'tropomi') + trop_file = 'S5P_OFFL_L2__CH4____20190319T110835_20190319T125006_07407_01_010202_20190325T125810_subset.nc4' + variable_slash = ['/PRODUCT/methane_mixing_ratio'] + variable_noslash = ['PRODUCT/methane_mixing_ratio'] + bbox = np.array(((-180, 180), (-90, 90))) + output_file_slash = "{}_{}".format(self._testMethodName, trop_file) + output_file_noslash = "{}_noslash_{}".format(self._testMethodName, trop_file) + shutil.copyfile( + os.path.join(TROP_dir, trop_file), + os.path.join(self.subset_output_dir, trop_file) + ) + shutil.copyfile( + os.path.join(TROP_dir, trop_file), + os.path.join(self.subset_output_dir,'slashtest'+trop_file) + ) + slash_test = subset.subset( + file_to_subset=join(self.subset_output_dir, trop_file), + bbox=bbox, + output_file=join(self.subset_output_dir, output_file_slash), + variables = variable_slash + ) + noslash_test = subset.subset( + file_to_subset=join(self.subset_output_dir, 'slashtest'+trop_file), + bbox=bbox, + output_file=join(self.subset_output_dir, output_file_noslash), + variables = variable_noslash + ) + + slash_dataset = nc.Dataset(join(self.subset_output_dir, output_file_slash)) + noslash_dataset = nc.Dataset(join(self.subset_output_dir, output_file_noslash)) + + assert list(slash_dataset.groups['PRODUCT'].variables) == list(noslash_dataset.groups['PRODUCT'].variables) def test_bad_time_unit(self): fill_val = -99999.0 From ddac85c6f80a5550b44cdc95e0ed5bdf324aca34 Mon Sep 17 00:00:00 2001 From: l2ss-py bot Date: Mon, 12 Dec 2022 17:38:45 +0000 Subject: [PATCH 12/16] /version 2.3.0-alpha.6 --- pyproject.toml | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index b84fe59f..da9c5726 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,7 +12,7 @@ [tool.poetry] name = "l2ss-py" -version = "2.3.0-alpha.5" +version = "2.3.0-alpha.6" description = "L2 Subsetter Service" authors = ["podaac-tva "] license = "Apache-2.0" From 98b131a080279f8780aec34de9f5ae7826a494b2 Mon Sep 17 00:00:00 2001 From: Frank Greguska <89428916+frankinspace@users.noreply.github.com> Date: Mon, 12 Dec 2022 17:14:00 -0800 Subject: [PATCH 13/16] Update build-pipeline.yml --- .github/workflows/build-pipeline.yml | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/.github/workflows/build-pipeline.yml b/.github/workflows/build-pipeline.yml index 62348d55..8c964bad 100644 --- a/.github/workflows/build-pipeline.yml +++ b/.github/workflows/build-pipeline.yml @@ -184,7 +184,7 @@ jobs: - name: Extract metadata (tags, labels) for Docker if: ${{ !startsWith(github.ref, 'refs/heads/feature') }} id: meta - uses: docker/metadata-action@v3 + uses: docker/metadata-action@v4 with: images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} tags: | @@ -197,7 +197,7 @@ jobs: ${GITHUB_WORKSPACE}/.github/workflows/wait-for-pypi.py ${{env.pyproject_name}}[harmony]==${{ env.software_version }} - name: Build and push Docker image if: ${{ !startsWith(github.ref, 'refs/heads/feature') }} - uses: docker/build-push-action@v2 + uses: docker/build-push-action@v3 with: context: . file: docker/Dockerfile @@ -216,13 +216,6 @@ jobs: env: SNYK_TOKEN: ${{ secrets.SNYK_TOKEN }} with: - image: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ env.software_version }} + image: ${{ steps.meta.outputs.tags[0] }} args: > --severity-threshold=high - --file=./docker/Dockerfile - --sarif-file-output=docker.sarif - - name: Upload result to GitHub Code Scanning - if: ${{ !startsWith(github.ref, 'refs/heads/feature') }} - uses: github/codeql-action/upload-sarif@v2 - with: - sarif_file: ./ From 13c5eebd931f5f1b98a9c201f621de3cd77030f0 Mon Sep 17 00:00:00 2001 From: l2ss-py bot Date: Tue, 13 Dec 2022 01:27:39 +0000 Subject: [PATCH 14/16] /version 2.3.0-alpha.7 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index da9c5726..d1307f10 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,7 +12,7 @@ [tool.poetry] name = "l2ss-py" -version = "2.3.0-alpha.6" +version = "2.3.0-alpha.7" description = "L2 Subsetter Service" authors = ["podaac-tva "] license = "Apache-2.0" From 1da8f5f94515ca868c269ec2f4edc001acbe6505 Mon Sep 17 00:00:00 2001 From: Frank Greguska <89428916+frankinspace@users.noreply.github.com> Date: Mon, 12 Dec 2022 18:15:47 -0800 Subject: [PATCH 15/16] Merge changes from origin/develop --- podaac/subsetter/subset.py | 22 ++- poetry.lock | 320 +++++++++++++++++-------------------- pyproject.toml | 6 +- tests/test_subset.py | 164 +++++++++---------- 4 files changed, 249 insertions(+), 263 deletions(-) diff --git a/podaac/subsetter/subset.py b/podaac/subsetter/subset.py index b927d7f1..d5f6d91c 100644 --- a/podaac/subsetter/subset.py +++ b/podaac/subsetter/subset.py @@ -978,10 +978,10 @@ def decode_cf_datetime(num_dates, units, calendar=None, use_cftime=None): xarray.coding.times.decode_cf_datetime = decode_cf_datetime -def subset(file_to_subset, bbox, output_file, variables=None, +def subset(file_to_subset, bbox, output_file, variables=(), # pylint: disable=too-many-branches, disable=too-many-statements cut=True, shapefile=None, min_time=None, max_time=None, 
origin_source=None, - lat_var_names=None, lon_var_names=None, time_var_names=None): + lat_var_names=(), lon_var_names=(), time_var_names=()): """ Subset a given NetCDF file given a bounding box @@ -1014,6 +1014,9 @@ def subset(file_to_subset, bbox, output_file, variables=None, ISO timestamp representing the upper bound of the temporal subset to be performed. If this value is not provided, the granule will not be subset temporally on the upper bound. + origin_source : str + The original granule source prior to this subset operation to + be used for provenance information. lat_var_names : list List of variables that represent the latitude coordinate variables for this granule. This list will only contain more @@ -1034,10 +1037,17 @@ def subset(file_to_subset, bbox, output_file, variables=None, override_decode_cf_datetime() - if variables: - variables = [x.replace('/', GROUP_DELIM) for x in variables] - if has_groups: - variables = [GROUP_DELIM + x if not x.startswith(GROUP_DELIM) else x for x in variables] + if has_groups: + # Make sure all variables start with '/' + variables = ['/' + var if not var.startswith('/') else var for var in variables] + lat_var_names = ['/' + var if not var.startswith('/') else var for var in lat_var_names] + lon_var_names = ['/' + var if not var.startswith('/') else var for var in lon_var_names] + time_var_names = ['/' + var if not var.startswith('/') else var for var in time_var_names] + # Replace all '/' with GROUP_DELIM + variables = [var.replace('/', GROUP_DELIM) for var in variables] + lat_var_names = [var.replace('/', GROUP_DELIM) for var in lat_var_names] + lon_var_names = [var.replace('/', GROUP_DELIM) for var in lon_var_names] + time_var_names = [var.replace('/', GROUP_DELIM) for var in time_var_names] args = { 'decode_coords': False, diff --git a/poetry.lock b/poetry.lock index 2d142e97..22d4ddb2 100644 --- a/poetry.lock +++ b/poetry.lock @@ -8,7 +8,7 @@ python-versions = "*" [[package]] name = "astroid" -version = "2.12.9" +version = "2.12.13" description = "An abstract syntax tree for Python with inference support." category = "dev" optional = false @@ -22,14 +22,6 @@ wrapt = [ {version = ">=1.14,<2", markers = "python_version >= \"3.11\""}, ] -[[package]] -name = "atomicwrites" -version = "1.4.1" -description = "Atomic file writes." 
-category = "dev" -optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" - [[package]] name = "attrs" version = "22.1.0" @@ -46,7 +38,7 @@ tests_no_zope = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (> [[package]] name = "aws-sam-translator" -version = "1.50.0" +version = "1.55.0" description = "AWS SAM Translator is a library that transform SAM templates into AWS CloudFormation templates" category = "dev" optional = false @@ -57,11 +49,11 @@ boto3 = ">=1.19.5,<2.0.0" jsonschema = ">=3.2,<4.0" [package.extras] -dev = ["coverage (>=5.3,<6.0)", "flake8 (>=3.8.4,<3.9.0)", "tox (>=3.24,<4.0)", "pytest-cov (>=2.10.1,<2.11.0)", "pytest-xdist (>=2.5,<3.0)", "pytest-env (>=0.6.2,<0.7.0)", "pylint (>=2.9.0,<2.10.0)", "pyyaml (>=5.4,<6.0)", "pytest (>=6.2.5,<6.3.0)", "parameterized (>=0.7.4,<0.8.0)", "click (>=7.1,<8.0)", "dateparser (>=0.7,<1.0)", "boto3 (>=1.23,<2)", "tenacity (>=7.0.0,<7.1.0)", "requests (>=2.24.0,<2.25.0)", "docopt (>=0.6.2,<0.7.0)", "black (==20.8b1)"] +dev = ["coverage (>=5.3,<6.0)", "flake8 (>=3.8.4,<3.9.0)", "tox (>=3.24,<4.0)", "pytest-cov (>=2.10.1,<2.11.0)", "pytest-xdist (>=2.5,<3.0)", "pytest-env (>=0.6.2,<0.7.0)", "pytest-rerunfailures (>=9.1.1,<9.2.0)", "pylint (>=2.15.0,<2.16.0)", "pyyaml (>=5.4,<6.0)", "pytest (>=6.2.5,<6.3.0)", "parameterized (>=0.7.4,<0.8.0)", "click (>=7.1,<8.0)", "dateparser (>=0.7,<1.0)", "boto3 (>=1.23,<2)", "tenacity (>=7.0.0,<7.1.0)", "requests (>=2.24.0,<2.25.0)", "docopt (>=0.6.2,<0.7.0)", "black (==20.8b1)", "ruamel.yaml (==0.17.21)", "mypy (==0.971)", "boto3-stubs[serverlessrepo,appconfig] (>=1.19.5,<2.0.0)", "types-PyYAML (>=5.4,<6.0)", "types-jsonschema (>=3.2,<4.0)"] [[package]] name = "aws-xray-sdk" -version = "2.10.0" +version = "2.11.0" description = "The AWS X-Ray SDK for Python (the SDK) enables Python developers to record and emit information from within their applications to the AWS X-Ray service." category = "dev" optional = false @@ -73,7 +65,7 @@ wrapt = "*" [[package]] name = "babel" -version = "2.10.3" +version = "2.11.0" description = "Internationalization utilities" category = "dev" optional = false @@ -109,14 +101,14 @@ python-versions = "*" [[package]] name = "boto3" -version = "1.24.67" +version = "1.26.28" description = "The AWS SDK for Python" category = "main" optional = false python-versions = ">= 3.7" [package.dependencies] -botocore = ">=1.27.67,<1.28.0" +botocore = ">=1.29.28,<1.30.0" jmespath = ">=0.7.1,<2.0.0" s3transfer = ">=0.6.0,<0.7.0" @@ -125,7 +117,7 @@ crt = ["botocore[crt] (>=1.21.0,<2.0a0)"] [[package]] name = "botocore" -version = "1.27.67" +version = "1.29.28" description = "Low-level, data-driven core of boto 3." category = "main" optional = false @@ -137,11 +129,11 @@ python-dateutil = ">=2.1,<3.0.0" urllib3 = ">=1.25.4,<1.27" [package.extras] -crt = ["awscrt (==0.14.0)"] +crt = ["awscrt (==0.15.3)"] [[package]] name = "certifi" -version = "2022.6.15" +version = "2022.12.7" description = "Python package for providing Mozilla's CA Bundle." 
category = "main" optional = false @@ -149,7 +141,7 @@ python-versions = ">=3.6" [[package]] name = "cf-xarray" -version = "0.7.4" +version = "0.7.6" description = "A lightweight convenience wrapper for using CF attributes on xarray objects" category = "main" optional = false @@ -171,14 +163,14 @@ pycparser = "*" [[package]] name = "cfn-lint" -version = "0.64.1" +version = "0.72.2" description = "Checks CloudFormation templates for practices and behaviour that could potentially be improved" category = "dev" optional = false python-versions = ">=3.7, <=4.0, !=4.0" [package.dependencies] -aws-sam-translator = ">=1.50.0" +aws-sam-translator = ">=1.55.0" jschema-to-python = ">=1.2.3,<1.3.0" jsonpatch = "*" jsonschema = ">=3.0,<5" @@ -189,7 +181,7 @@ sarif-om = ">=1.0.4,<1.1.0" [[package]] name = "cftime" -version = "1.6.1" +version = "1.6.2" description = "Time-handling functionality from netcdf4-python" category = "main" optional = false @@ -258,26 +250,29 @@ python-versions = ">=3.6" [[package]] name = "colorama" -version = "0.4.5" +version = "0.4.6" description = "Cross-platform colored terminal text." category = "main" optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" [[package]] name = "coverage" -version = "6.4.4" +version = "6.5.0" description = "Code coverage measurement for Python" category = "dev" optional = false python-versions = ">=3.7" +[package.dependencies] +tomli = {version = "*", optional = true, markers = "python_full_version <= \"3.11.0a6\" and extra == \"toml\""} + [package.extras] toml = ["tomli"] [[package]] name = "cryptography" -version = "38.0.1" +version = "38.0.4" description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers." 
category = "dev" optional = false @@ -296,16 +291,17 @@ test = ["pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-subtests", [[package]] name = "dask" -version = "2022.9.0" +version = "2022.12.0" description = "Parallel PyData with Task Scheduling" category = "main" optional = false python-versions = ">=3.8" [package.dependencies] -bokeh = {version = ">=2.4.2", optional = true, markers = "extra == \"complete\""} +bokeh = {version = ">=2.4.2,<3", optional = true, markers = "extra == \"complete\""} +click = ">=7.0" cloudpickle = ">=1.1.1" -distributed = {version = "2022.9.0", optional = true, markers = "extra == \"complete\""} +distributed = {version = "2022.12.0", optional = true, markers = "extra == \"complete\""} fsspec = ">=0.6.0" jinja2 = {version = "*", optional = true, markers = "extra == \"complete\""} numpy = {version = ">=1.18", optional = true, markers = "extra == \"complete\""} @@ -317,10 +313,10 @@ toolz = ">=0.8.2" [package.extras] array = ["numpy (>=1.18)"] -complete = ["bokeh (>=2.4.2)", "distributed (==2022.9.0)", "jinja2", "numpy (>=1.18)", "pandas (>=1.0)"] +complete = ["bokeh (>=2.4.2,<3)", "distributed (==2022.12.0)", "jinja2", "numpy (>=1.18)", "pandas (>=1.0)"] dataframe = ["numpy (>=1.18)", "pandas (>=1.0)"] -diagnostics = ["bokeh (>=2.4.2)", "jinja2"] -distributed = ["distributed (==2022.9.0)"] +diagnostics = ["bokeh (>=2.4.2,<3)", "jinja2"] +distributed = ["distributed (==2022.12.0)"] test = ["pandas", "pytest", "pytest-rerunfailures", "pytest-xdist", "pre-commit"] [[package]] @@ -336,27 +332,27 @@ packaging = "*" [[package]] name = "dill" -version = "0.3.5.1" +version = "0.3.6" description = "serialize all of python" category = "dev" optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*, !=3.6.*" +python-versions = ">=3.7" [package.extras] graph = ["objgraph (>=1.7.2)"] [[package]] name = "distributed" -version = "2022.9.0" +version = "2022.12.0" description = "Distributed scheduler for Dask" category = "main" optional = false python-versions = ">=3.8" [package.dependencies] -click = ">=6.6" +click = ">=7.0" cloudpickle = ">=1.5.0" -dask = "2022.9.0" +dask = "2022.12.0" jinja2 = "*" locket = ">=1.0.0" msgpack = ">=0.6.0" @@ -365,14 +361,14 @@ psutil = ">=5.0" pyyaml = "*" sortedcontainers = "<2.0.0 || >2.0.0,<2.0.1 || >2.0.1" tblib = ">=1.6.0" -toolz = ">=0.8.2" -tornado = ">=6.0.3,<6.2" +toolz = ">=0.10.0" +tornado = ">=6.0.3" urllib3 = "*" zict = ">=0.1.3" [[package]] name = "docker" -version = "6.0.0" +version = "6.0.1" description = "A Python library for the Docker Engine API." 
category = "dev" optional = false @@ -411,9 +407,20 @@ six = ">=1.9.0" gmpy = ["gmpy"] gmpy2 = ["gmpy2"] +[[package]] +name = "exceptiongroup" +version = "1.0.4" +description = "Backport of PEP 654 (exception groups)" +category = "dev" +optional = false +python-versions = ">=3.7" + +[package.extras] +test = ["pytest (>=6)"] + [[package]] name = "fiona" -version = "1.8.21" +version = "1.8.22" description = "Fiona reads and writes spatial data files" category = "main" optional = false @@ -429,7 +436,7 @@ munch = "*" six = ">=1.7" [package.extras] -all = ["boto3 (>=1.2.4)", "pytest-cov", "shapely", "pytest (>=3)", "mock"] +all = ["pytest-cov", "shapely", "boto3 (>=1.2.4)", "pytest (>=3)", "mock"] calc = ["shapely"] s3 = ["boto3 (>=1.2.4)"] test = ["pytest (>=3)", "pytest-cov", "boto3 (>=1.2.4)", "mock"] @@ -449,7 +456,7 @@ pyflakes = ">=2.3.0,<2.4.0" [[package]] name = "fsspec" -version = "2022.8.2" +version = "2022.11.0" description = "File-system specification" category = "main" optional = false @@ -505,7 +512,7 @@ numpy = ">=1.14.5" [[package]] name = "harmony-service-lib" -version = "1.0.21" +version = "1.0.22" description = "A library for Python-based Harmony services to parse incoming messages, fetch data, stage data, and call back to Harmony" category = "main" optional = true @@ -546,7 +553,7 @@ python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" [[package]] name = "importlib-metadata" -version = "4.12.0" +version = "4.13.0" description = "Read metadata from Python packages" category = "main" optional = false @@ -556,21 +563,29 @@ python-versions = ">=3.7" zipp = ">=0.5" [package.extras] -docs = ["sphinx", "jaraco.packaging (>=9)", "rst.linker (>=1.9)"] +docs = ["sphinx (>=3.5)", "jaraco.packaging (>=9)", "rst.linker (>=1.9)", "furo", "jaraco.tidelift (>=1.4)"] perf = ["ipython"] -testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-flake8", "pytest-cov", "pytest-enabler (>=1.3)", "packaging", "pyfakefs", "flufl.flake8", "pytest-perf (>=0.9.2)", "pytest-black (>=0.3.7)", "pytest-mypy (>=0.9.1)", "importlib-resources (>=1.3)"] +testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-flake8", "flake8 (<5)", "pytest-cov", "pytest-enabler (>=1.3)", "packaging", "pyfakefs", "flufl.flake8", "pytest-perf (>=0.9.2)", "pytest-black (>=0.3.7)", "pytest-mypy (>=0.9.1)", "importlib-resources (>=1.3)"] + +[[package]] +name = "iniconfig" +version = "1.1.1" +description = "iniconfig: brain-dead simple config-ini parsing" +category = "dev" +optional = false +python-versions = "*" [[package]] name = "isort" -version = "5.10.1" +version = "5.11.1" description = "A Python utility / library to sort Python imports." 
category = "dev" optional = false -python-versions = ">=3.6.1,<4.0" +python-versions = ">=3.7.0" [package.extras] -pipfile_deprecated_finder = ["pipreqs", "requirementslib"] -requirements_deprecated_finder = ["pipreqs", "pip-api"] +pipfile-deprecated-finder = ["pipreqs", "requirementslib"] +requirements-deprecated-finder = ["pipreqs", "pip-api"] colors = ["colorama (>=0.4.3,<0.5.0)"] plugins = ["setuptools"] @@ -630,16 +645,16 @@ jsonpointer = ">=1.9" [[package]] name = "jsonpickle" -version = "2.2.0" +version = "3.0.0" description = "Python library for serializing any arbitrary object graph into JSON" category = "dev" optional = false -python-versions = ">=2.7" +python-versions = ">=3.7" [package.extras] docs = ["sphinx", "jaraco.packaging (>=3.2)", "rst.linker (>=1.9)"] -testing = ["pytest (>=3.5,!=3.7.3)", "pytest-checkdocs (>=1.2.3)", "pytest-black-multipy", "pytest-cov", "ecdsa", "feedparser", "numpy", "pandas", "pymongo", "scikit-learn", "sqlalchemy", "pytest-flake8 (<1.1.0)", "enum34", "jsonlib", "pytest-flake8 (>=1.1.1)"] -"testing.libs" = ["simplejson", "ujson", "yajl"] +testing = ["pytest (>=3.5,!=3.7.3)", "pytest-checkdocs (>=1.2.3)", "pytest-flake8 (>=1.1.1)", "pytest-black-multipy", "pytest-cov", "ecdsa", "feedparser", "gmpy2", "numpy", "pandas", "pymongo", "scikit-learn", "sqlalchemy"] +"testing.libs" = ["simplejson", "ujson"] [[package]] name = "jsonpointer" @@ -687,11 +702,11 @@ six = "*" [[package]] name = "lazy-object-proxy" -version = "1.7.1" +version = "1.8.0" description = "A fast and thorough lazy object proxy." category = "dev" optional = false -python-versions = ">=3.6" +python-versions = ">=3.7" [[package]] name = "locket" @@ -750,14 +765,6 @@ build = ["twine", "wheel", "blurb"] docs = ["sphinx"] test = ["pytest (<5.4)", "pytest-cov"] -[[package]] -name = "more-itertools" -version = "8.14.0" -description = "More routines for operating on iterables, beyond itertools" -category = "dev" -optional = false -python-versions = ">=3.5" - [[package]] name = "moto" version = "1.3.14" @@ -817,11 +824,11 @@ yaml = ["PyYAML (>=5.1.0)"] [[package]] name = "netcdf4" -version = "1.6.0" +version = "1.6.2" description = "Provides an object-oriented python interface to the netCDF version 4 library." category = "main" optional = false -python-versions = "*" +python-versions = ">=3.6" [package.dependencies] cftime = "*" @@ -829,7 +836,7 @@ numpy = ">=1.9" [[package]] name = "networkx" -version = "2.8.6" +version = "2.8.8" description = "Python package for creating and manipulating graphs and networks" category = "dev" optional = false @@ -837,14 +844,14 @@ python-versions = ">=3.8" [package.extras] default = ["numpy (>=1.19)", "scipy (>=1.8)", "matplotlib (>=3.4)", "pandas (>=1.3)"] -developer = ["pre-commit (>=2.20)", "mypy (>=0.961)"] -doc = ["sphinx (>=5)", "pydata-sphinx-theme (>=0.9)", "sphinx-gallery (>=0.10)", "numpydoc (>=1.4)", "pillow (>=9.1)", "nb2plots (>=0.6)", "texext (>=0.6.6)"] +developer = ["pre-commit (>=2.20)", "mypy (>=0.982)"] +doc = ["sphinx (>=5.2)", "pydata-sphinx-theme (>=0.11)", "sphinx-gallery (>=0.11)", "numpydoc (>=1.5)", "pillow (>=9.2)", "nb2plots (>=0.6)", "texext (>=0.6.6)"] extra = ["lxml (>=4.6)", "pygraphviz (>=1.9)", "pydot (>=1.4.2)", "sympy (>=1.10)"] -test = ["pytest (>=7.1)", "pytest-cov (>=3.0)", "codecov (>=2.1)"] +test = ["pytest (>=7.2)", "pytest-cov (>=4.0)", "codecov (>=2.1)"] [[package]] name = "numpy" -version = "1.23.2" +version = "1.23.5" description = "NumPy is the fundamental package for array computing with Python." 
category = "main" optional = false @@ -852,18 +859,15 @@ python-versions = ">=3.8" [[package]] name = "packaging" -version = "21.3" +version = "22.0" description = "Core utilities for Python packages" category = "main" optional = false -python-versions = ">=3.6" - -[package.dependencies] -pyparsing = ">=2.0.2,<3.0.5 || >3.0.5" +python-versions = ">=3.7" [[package]] name = "pandas" -version = "1.4.4" +version = "1.5.2" description = "Powerful data structures for data analysis, time series, and statistics" category = "main" optional = false @@ -871,10 +875,9 @@ python-versions = ">=3.8" [package.dependencies] numpy = [ - {version = ">=1.18.5", markers = "platform_machine != \"aarch64\" and platform_machine != \"arm64\" and python_version < \"3.10\""}, - {version = ">=1.19.2", markers = "platform_machine == \"aarch64\" and python_version < \"3.10\""}, - {version = ">=1.20.0", markers = "platform_machine == \"arm64\" and python_version < \"3.10\""}, + {version = ">=1.20.3", markers = "python_version < \"3.10\""}, {version = ">=1.21.0", markers = "python_version >= \"3.10\""}, + {version = ">=1.23.2", markers = "python_version >= \"3.11\""}, ] python-dateutil = ">=2.8.1" pytz = ">=2020.1" @@ -899,7 +902,7 @@ complete = ["blosc", "pyzmq", "pandas (>=0.19.0)", "numpy (>=1.9.0)"] [[package]] name = "pbr" -version = "5.10.0" +version = "5.11.0" description = "Python Build Reasonableness" category = "dev" optional = false @@ -907,7 +910,7 @@ python-versions = ">=2.6" [[package]] name = "pillow" -version = "9.2.0" +version = "9.3.0" description = "Python Imaging Library (Fork)" category = "main" optional = false @@ -919,30 +922,31 @@ tests = ["check-manifest", "coverage", "defusedxml", "markdown2", "olefile", "pa [[package]] name = "platformdirs" -version = "2.5.2" -description = "A small Python module for determining appropriate platform-specific dirs, e.g. a \"user data dir\"." +version = "2.6.0" +description = "A small Python package for determining appropriate platform-specific dirs, e.g. a \"user data dir\"." category = "dev" optional = false python-versions = ">=3.7" [package.extras] -docs = ["furo (>=2021.7.5b38)", "proselint (>=0.10.2)", "sphinx-autodoc-typehints (>=1.12)", "sphinx (>=4)"] -test = ["appdirs (==1.4.4)", "pytest-cov (>=2.7)", "pytest-mock (>=3.6)", "pytest (>=6)"] +docs = ["furo (>=2022.9.29)", "proselint (>=0.13)", "sphinx-autodoc-typehints (>=1.19.4)", "sphinx (>=5.3)"] +test = ["appdirs (==1.4.4)", "pytest-cov (>=4)", "pytest-mock (>=3.10)", "pytest (>=7.2)"] [[package]] name = "pluggy" -version = "0.13.1" +version = "1.0.0" description = "plugin and hook calling mechanisms for python" category = "dev" optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +python-versions = ">=3.6" [package.extras] dev = ["pre-commit", "tox"] +testing = ["pytest", "pytest-benchmark"] [[package]] name = "psutil" -version = "5.9.2" +version = "5.9.4" description = "Cross-platform lib for process and system monitoring in Python." 
category = "main" optional = false @@ -951,18 +955,10 @@ python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" [package.extras] test = ["ipaddress", "mock", "enum34", "pywin32", "wmi"] -[[package]] -name = "py" -version = "1.11.0" -description = "library with cross-python path, ini-parsing, io, code, log facilities" -category = "dev" -optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" - [[package]] name = "py-cpuinfo" -version = "8.0.0" -description = "Get CPU info with pure Python 2 & 3" +version = "9.0.0" +description = "Get CPU info with pure Python" category = "dev" optional = false python-versions = "*" @@ -1012,14 +1008,14 @@ plugins = ["importlib-metadata"] [[package]] name = "pylint" -version = "2.15.2" +version = "2.15.8" description = "python code static checker" category = "dev" optional = false python-versions = ">=3.7.2" [package.dependencies] -astroid = ">=2.12.9,<=2.14.0-dev0" +astroid = ">=2.12.13,<=2.14.0-dev0" colorama = {version = ">=0.4.5", markers = "sys_platform == \"win32\""} dill = ">=0.2" isort = ">=4.2.5,<6" @@ -1048,20 +1044,9 @@ cffi = ">=1.4.1" docs = ["sphinx (>=1.6.5)", "sphinx-rtd-theme"] tests = ["pytest (>=3.2.1,!=3.3.0)", "hypothesis (>=3.27.0)"] -[[package]] -name = "pyparsing" -version = "3.0.9" -description = "pyparsing module - Classes and methods to define and execute parsing grammars" -category = "main" -optional = false -python-versions = ">=3.6.8" - -[package.extras] -diagrams = ["railroad-diagrams", "jinja2"] - [[package]] name = "pyproj" -version = "3.3.1" +version = "3.4.0" description = "Python interface to PROJ (cartographic projections and coordinate transformations library)" category = "main" optional = false @@ -1072,7 +1057,7 @@ certifi = "*" [[package]] name = "pyrsistent" -version = "0.18.1" +version = "0.19.2" description = "Persistent/Functional/Immutable data structures" category = "dev" optional = false @@ -1094,33 +1079,31 @@ validation = ["jsonschema (==3.2.0)"] [[package]] name = "pytest" -version = "5.4.3" +version = "7.2.0" description = "pytest: simple powerful testing with Python" category = "dev" optional = false -python-versions = ">=3.5" +python-versions = ">=3.7" [package.dependencies] -atomicwrites = {version = ">=1.0", markers = "sys_platform == \"win32\""} -attrs = ">=17.4.0" +attrs = ">=19.2.0" colorama = {version = "*", markers = "sys_platform == \"win32\""} -more-itertools = ">=4.0.0" +exceptiongroup = {version = ">=1.0.0rc8", markers = "python_version < \"3.11\""} +iniconfig = "*" packaging = "*" -pluggy = ">=0.12,<1.0" -py = ">=1.5.0" -wcwidth = "*" +pluggy = ">=0.12,<2.0" +tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""} [package.extras] -checkqa-mypy = ["mypy (==v0.761)"] -testing = ["argcomplete", "hypothesis (>=3.56)", "mock", "nose", "requests", "xmlschema"] +testing = ["argcomplete", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "xmlschema"] [[package]] name = "pytest-benchmark" -version = "3.4.1" +version = "4.0.0" description = "A ``pytest`` fixture for benchmarking code. It will group the tests into rounds that are calibrated to the chosen timer." category = "dev" optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +python-versions = ">=3.7" [package.dependencies] py-cpuinfo = "*" @@ -1133,16 +1116,15 @@ histogram = ["pygal", "pygaljs"] [[package]] name = "pytest-cov" -version = "2.12.1" +version = "4.0.0" description = "Pytest plugin for measuring coverage." 
category = "dev" optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +python-versions = ">=3.6" [package.dependencies] -coverage = ">=5.2.1" +coverage = {version = ">=5.2.1", extras = ["toml"]} pytest = ">=4.6" -toml = "*" [package.extras] testing = ["fields", "hunter", "process-tests", "six", "pytest-xdist", "virtualenv"] @@ -1186,7 +1168,7 @@ python-versions = ">=3.5" [[package]] name = "pytz" -version = "2022.2.1" +version = "2022.6" description = "World timezone definitions, modern and historical" category = "main" optional = false @@ -1194,7 +1176,7 @@ python-versions = "*" [[package]] name = "pywin32" -version = "304" +version = "305" description = "Python for Window Extensions" category = "dev" optional = false @@ -1228,18 +1210,20 @@ use_chardet_on_py3 = ["chardet (>=3.0.2,<6)"] [[package]] name = "responses" -version = "0.21.0" +version = "0.22.0" description = "A utility library for mocking out the `requests` Python library." category = "dev" optional = false python-versions = ">=3.7" [package.dependencies] -requests = ">=2.0,<3.0" +requests = ">=2.22.0,<3.0" +toml = "*" +types-toml = "*" urllib3 = ">=1.25.10" [package.extras] -tests = ["pytest (>=7.0.0)", "coverage (>=6.0.0)", "pytest-cov", "pytest-asyncio", "pytest-localserver", "flake8", "types-mock", "types-requests", "mypy"] +tests = ["pytest (>=7.0.0)", "coverage (>=6.0.0)", "pytest-cov", "pytest-asyncio", "pytest-httpserver", "flake8", "types-requests", "mypy"] [[package]] name = "rsa" @@ -1280,7 +1264,7 @@ pbr = "*" [[package]] name = "shapely" -version = "1.8.4" +version = "1.8.5.post1" description = "Geometric objects, predicates, and operations" category = "main" optional = false @@ -1474,11 +1458,11 @@ python-versions = ">=3.7" [[package]] name = "tomlkit" -version = "0.11.4" +version = "0.11.6" description = "Style preserving TOML library" category = "dev" optional = false -python-versions = ">=3.6,<4.0" +python-versions = ">=3.6" [[package]] name = "toolz" @@ -1490,15 +1474,23 @@ python-versions = ">=3.5" [[package]] name = "tornado" -version = "6.1" +version = "6.2" description = "Tornado is a Python web framework and asynchronous networking library, originally developed at FriendFeed." category = "main" optional = false -python-versions = ">= 3.5" +python-versions = ">= 3.7" + +[[package]] +name = "types-toml" +version = "0.10.8.1" +description = "Typing stubs for toml" +category = "dev" +optional = false +python-versions = "*" [[package]] name = "typing-extensions" -version = "4.3.0" +version = "4.4.0" description = "Backported and Experimental Type Hints for Python 3.7+" category = "main" optional = false @@ -1506,28 +1498,20 @@ python-versions = ">=3.7" [[package]] name = "urllib3" -version = "1.26.12" +version = "1.26.13" description = "HTTP library with thread-safe connection pooling, file post, and more." 
category = "main" optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*, <4" +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*" [package.extras] brotli = ["brotlicffi (>=0.8.0)", "brotli (>=1.0.9)", "brotlipy (>=0.6.0)"] secure = ["pyOpenSSL (>=0.14)", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "certifi", "urllib3-secure-extra", "ipaddress"] socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"] -[[package]] -name = "wcwidth" -version = "0.2.5" -description = "Measures the displayed width of unicode strings in a terminal" -category = "dev" -optional = false -python-versions = "*" - [[package]] name = "websocket-client" -version = "1.4.1" +version = "1.4.2" description = "WebSocket client for Python with low level API options" category = "dev" optional = false @@ -1562,7 +1546,7 @@ python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,>=2.7" [[package]] name = "xarray" -version = "2022.6.0" +version = "2022.12.0" description = "N-D labeled arrays and datasets in Python" category = "main" optional = false @@ -1570,15 +1554,15 @@ python-versions = ">=3.8" [package.dependencies] dask = {version = "*", extras = ["complete"], optional = true, markers = "extra == \"parallel\""} -numpy = ">=1.19" -packaging = ">=20.0" -pandas = ">=1.2" +numpy = ">=1.20" +packaging = ">=21.3" +pandas = ">=1.3" [package.extras] accel = ["scipy", "bottleneck", "numbagg", "flox"] -complete = ["netcdf4", "h5netcdf", "scipy", "pydap", "zarr", "fsspec", "cftime", "rasterio", "cfgrib", "pooch", "bottleneck", "numbagg", "flox", "dask", "matplotlib", "seaborn", "nc-time-axis"] -docs = ["netcdf4", "h5netcdf", "scipy", "pydap", "zarr", "fsspec", "cftime", "rasterio", "cfgrib", "pooch", "bottleneck", "numbagg", "flox", "dask", "matplotlib", "seaborn", "nc-time-axis", "sphinx-autosummary-accessors", "sphinx-rtd-theme", "ipython", "ipykernel", "jupyter-client", "nbsphinx", "scanpydoc"] -io = ["netcdf4", "h5netcdf", "scipy", "pydap", "zarr", "fsspec", "cftime", "rasterio", "cfgrib", "pooch"] +complete = ["netcdf4", "h5netcdf", "scipy", "zarr", "fsspec", "cftime", "rasterio", "cfgrib", "pooch", "bottleneck", "numbagg", "flox", "dask", "matplotlib", "seaborn", "nc-time-axis", "pydap"] +docs = ["netcdf4", "h5netcdf", "scipy", "zarr", "fsspec", "cftime", "rasterio", "cfgrib", "pooch", "bottleneck", "numbagg", "flox", "dask", "matplotlib", "seaborn", "nc-time-axis", "sphinx-autosummary-accessors", "sphinx-rtd-theme", "ipython", "ipykernel", "jupyter-client", "nbsphinx", "scanpydoc", "pydap"] +io = ["netcdf4", "h5netcdf", "scipy", "zarr", "fsspec", "cftime", "rasterio", "cfgrib", "pooch", "pydap"] parallel = ["dask"] viz = ["matplotlib", "seaborn", "nc-time-axis"] @@ -1603,15 +1587,15 @@ heapdict = "*" [[package]] name = "zipp" -version = "3.8.1" +version = "3.11.0" description = "Backport of pathlib-compatible object wrapper for zip files" category = "main" optional = false python-versions = ">=3.7" [package.extras] -docs = ["sphinx", "jaraco.packaging (>=9)", "rst.linker (>=1.9)", "jaraco.tidelift (>=1.4)"] -testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-flake8", "pytest-cov", "pytest-enabler (>=1.3)", "jaraco.itertools", "func-timeout", "pytest-black (>=0.3.7)", "pytest-mypy (>=0.9.1)"] +docs = ["sphinx (>=3.5)", "jaraco.packaging (>=9)", "rst.linker (>=1.9)", "furo", "jaraco.tidelift (>=1.4)"] +testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "flake8 (<5)", "pytest-cov", "pytest-enabler (>=1.3)", "jaraco.itertools", "func-timeout", 
"jaraco.functools", "more-itertools", "pytest-black (>=0.3.7)", "pytest-mypy (>=0.9.1)", "pytest-flake8"] [extras] harmony = ["harmony-service-lib", "pystac"] @@ -1619,12 +1603,11 @@ harmony = ["harmony-service-lib", "pystac"] [metadata] lock-version = "1.1" python-versions = "^3.8" -content-hash = "a812b9c24f128e06197e201439794f7a28bb95055a72f928390326c92111bca9" +content-hash = "ae9d1d8198b4c7d46344bde0f75fa5028fa0f0e31aa97603648636a30e45bed5" [metadata.files] alabaster = [] astroid = [] -atomicwrites = [] attrs = [] aws-sam-translator = [] aws-xray-sdk = [] @@ -1653,6 +1636,7 @@ distributed = [] docker = [] docutils = [] ecdsa = [] +exceptiongroup = [] fiona = [] flake8 = [] fsspec = [] @@ -1663,6 +1647,7 @@ heapdict = [] idna = [] imagesize = [] importlib-metadata = [] +iniconfig = [] isort = [] jinja2 = [] jmespath = [] @@ -1681,7 +1666,6 @@ markupsafe = [] mccabe = [] mistune = [] mock = [] -more-itertools = [] moto = [] msgpack = [] munch = [] @@ -1696,7 +1680,6 @@ pillow = [] platformdirs = [] pluggy = [] psutil = [] -py = [] py-cpuinfo = [] pyasn1 = [] pycodestyle = [] @@ -1705,7 +1688,6 @@ pyflakes = [] pygments = [] pylint = [] pynacl = [] -pyparsing = [] pyproj = [] pyrsistent = [] pystac = [] @@ -1742,9 +1724,9 @@ tomli = [] tomlkit = [] toolz = [] tornado = [] +types-toml = [] typing-extensions = [] urllib3 = [] -wcwidth = [] websocket-client = [] werkzeug = [] wrapt = [] diff --git a/pyproject.toml b/pyproject.toml index d1307f10..f401b8b1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,12 +40,12 @@ h5py = "^3.6.0" cf-xarray = "*" [tool.poetry.dev-dependencies] -pytest = "^5.2" +pytest = "~7" flake8 = "^3.7" -pytest-cov = "^2.8" +pytest-cov = "~4" pylint = "^2.4" sphinx = "^4.4" -pytest-benchmark = "^3.2.3" +pytest-benchmark = "~4" moto = "1.3.14" jsonschema = "^3.2.0" m2r2 = "^0.3.1" diff --git a/tests/test_subset.py b/tests/test_subset.py index e17aa06a..75cb0e5f 100644 --- a/tests/test_subset.py +++ b/tests/test_subset.py @@ -151,7 +151,6 @@ def test_subset_variables(self): in_ds.close() out_ds.close() - def test_subset_bbox(self): """ @@ -340,7 +339,6 @@ def test_subset_empty_bbox(self): assert test_input_dataset.dims.keys() == empty_dataset.dims.keys() - def test_bbox_conversion(self): """ Test that the bounding box conversion returns expected @@ -849,15 +847,15 @@ def test_variable_subset_oco2(self): output_file_name = 'oco2_test_out.nc' shutil.copyfile(os.path.join(self.test_data_dir, 'OCO2', oco2_file_name), os.path.join(self.subset_output_dir, oco2_file_name)) - bbox = np.array(((-180,180),(-90.0,90))) - variables = ['/xco2','/xco2_quality_flag','/Retrieval/water_height','/sounding_id'] + bbox = np.array(((-180, 180), (-90.0, 90))) + variables = ['/xco2', '/xco2_quality_flag', '/Retrieval/water_height', '/sounding_id'] subset.subset( - file_to_subset=join(self.test_data_dir, 'OCO2',oco2_file_name), + file_to_subset=join(self.test_data_dir, 'OCO2', oco2_file_name), bbox=bbox, variables=variables, output_file=join(self.subset_output_dir, output_file_name), ) - + out_nc = nc.Dataset(join(self.subset_output_dir, output_file_name)) var_listout = list(out_nc.groups['Retrieval'].variables.keys()) assert ('water_height' in var_listout) @@ -871,7 +869,7 @@ def test_variable_subset_s6(self): output_file_name = 's6_test_out.nc' shutil.copyfile(os.path.join(self.test_data_dir, 'sentinel_6', s6_file_name), os.path.join(self.subset_output_dir, s6_file_name)) - bbox = np.array(((-180,180),(-90.0,90))) + bbox = np.array(((-180, 180), (-90.0, 90))) variables = 
['/data_01/ku/range_ocean_mle3_rms', '/data_20/ku/range_ocean'] subset.subset( file_to_subset=join(self.subset_output_dir, s6_file_name), @@ -879,14 +877,13 @@ def test_variable_subset_s6(self): variables=variables, output_file=join(self.subset_output_dir, output_file_name), ) - + out_nc = nc.Dataset(join(self.subset_output_dir, output_file_name)) - var_listout =list(out_nc.groups['data_01'].groups['ku'].variables.keys()) + var_listout = list(out_nc.groups['data_01'].groups['ku'].variables.keys()) var_listout.extend(list(out_nc.groups['data_20'].groups['ku'].variables.keys())) assert ('range_ocean_mle3_rms' in var_listout) assert ('range_ocean' in var_listout) - def test_transform_grouped_dataset(self): """ Test that the transformation function results in a correctly @@ -923,7 +920,6 @@ def test_transform_grouped_dataset(self): group = group[g] assert var_name.strip('__').split('__')[-1] in group.variables.keys() - def test_group_subset(self): """ Ensure a subset function can be run on a granule that contains @@ -1327,9 +1323,10 @@ def test_duplicate_dims_tropomi(self): in_nc = nc.Dataset(join(TROP_dir, trop_file)) out_nc = nc.Dataset(join(self.subset_output_dir, output_file)) - for var_name, variable in in_nc.groups['PRODUCT'].groups['SUPPORT_DATA'].groups['DETAILED_RESULTS'].variables.items(): - assert variable.shape == out_nc.groups['PRODUCT'].groups['SUPPORT_DATA'].groups['DETAILED_RESULTS'].variables[var_name].shape - + for var_name, variable in in_nc.groups['PRODUCT'].groups['SUPPORT_DATA'].groups[ + 'DETAILED_RESULTS'].variables.items(): + assert variable.shape == \ + out_nc.groups['PRODUCT'].groups['SUPPORT_DATA'].groups['DETAILED_RESULTS'].variables[var_name].shape def test_omi_novars_subset(self): """ @@ -1355,10 +1352,12 @@ def test_omi_novars_subset(self): in_nc = nc.Dataset(join(omi_dir, omi_file)) out_nc = nc.Dataset(join(self.subset_output_dir, output_file)) - for var_name, variable in in_nc.groups['HDFEOS'].groups['SWATHS'].groups['OMI Total Column Amount SO2'].groups['Geolocation Fields'].variables.items(): - assert in_nc.groups['HDFEOS'].groups['SWATHS'].groups['OMI Total Column Amount SO2'].groups['Geolocation Fields'].variables[var_name].shape == \ - out_nc.groups['HDFEOS'].groups['SWATHS'].groups['OMI Total Column Amount SO2'].groups['Geolocation Fields'].variables[var_name].shape - + for var_name, variable in in_nc.groups['HDFEOS'].groups['SWATHS'].groups['OMI Total Column Amount SO2'].groups[ + 'Geolocation Fields'].variables.items(): + assert in_nc.groups['HDFEOS'].groups['SWATHS'].groups['OMI Total Column Amount SO2'].groups[ + 'Geolocation Fields'].variables[var_name].shape == \ + out_nc.groups['HDFEOS'].groups['SWATHS'].groups['OMI Total Column Amount SO2'].groups[ + 'Geolocation Fields'].variables[var_name].shape def test_root_group(self): """test that the GROUP_DELIM string, '__', is added to variables in the root group""" @@ -1370,21 +1369,21 @@ def test_root_group(self): nc_dataset = nc.Dataset(os.path.join(self.subset_output_dir, sndr_file_name)) args = { - 'decode_coords': False, - 'mask_and_scale': False, - 'decode_times': False - } + 'decode_coords': False, + 'mask_and_scale': False, + 'decode_times': False + } nc_dataset = gh.transform_grouped_dataset(nc_dataset, os.path.join(self.subset_output_dir, sndr_file_name)) with xr.open_dataset( - xr.backends.NetCDF4DataStore(nc_dataset), - **args + xr.backends.NetCDF4DataStore(nc_dataset), + **args ) as dataset: var_list = list(dataset.variables) assert (var_list[0][0:2] == gh.GROUP_DELIM) group_lst = [] - for 
var_name in dataset.variables.keys(): #need logic if there is data in the top level not in a group + for var_name in dataset.variables.keys(): # need logic if there is data in the top level not in a group group_lst.append('/'.join(var_name.split(gh.GROUP_DELIM)[:-1])) - group_lst = ['/' if group=='' else group for group in group_lst] + group_lst = ['/' if group == '' else group for group in group_lst] groups = set(group_lst) expected_group = {'/mw', '/ave_kern', '/', '/mol_lay', '/aux'} assert (groups == expected_group) @@ -1401,14 +1400,14 @@ def test_get_time_squeeze(self): nc_dataset = nc.Dataset(os.path.join(self.subset_output_dir, tropomi_file_name)) args = { - 'decode_coords': False, - 'mask_and_scale': False, - 'decode_times': False - } + 'decode_coords': False, + 'mask_and_scale': False, + 'decode_times': False + } nc_dataset = gh.transform_grouped_dataset(nc_dataset, os.path.join(self.subset_output_dir, tropomi_file_name)) with xr.open_dataset( - xr.backends.NetCDF4DataStore(nc_dataset), - **args + xr.backends.NetCDF4DataStore(nc_dataset), + **args ) as dataset: lat_var_name = subset.compute_coordinate_variable_names(dataset)[0][0] time_var_name = subset.compute_time_variable_name(dataset, dataset[lat_var_name]) @@ -1426,14 +1425,14 @@ def test_get_indexers_nd(self): nc_dataset = nc.Dataset(os.path.join(self.subset_output_dir, tropomi_file_name)) args = { - 'decode_coords': False, - 'mask_and_scale': False, - 'decode_times': False - } + 'decode_coords': False, + 'mask_and_scale': False, + 'decode_times': False + } nc_dataset = gh.transform_grouped_dataset(nc_dataset, os.path.join(self.subset_output_dir, tropomi_file_name)) with xr.open_dataset( - xr.backends.NetCDF4DataStore(nc_dataset), - **args + xr.backends.NetCDF4DataStore(nc_dataset), + **args ) as dataset: time_var_names = [] lat_var_name = subset.compute_coordinate_variable_names(dataset)[0][0] @@ -1444,14 +1443,14 @@ def test_get_indexers_nd(self): cond = oper( (dataset[lon_var_name] >= -180), (dataset[lon_var_name] <= 180) - ) & (dataset[lat_var_name] >= -90) & (dataset[lat_var_name] <= 90) & True + ) & (dataset[lat_var_name] >= -90) & (dataset[lat_var_name] <= 90) & True indexers = xre.get_indexers_from_nd(cond, True) indexed_cond = cond.isel(**indexers) indexed_ds = dataset.isel(**indexers) new_dataset = indexed_ds.where(indexed_cond) - - assert ((time_var_name not in indexers.keys()) == True) #time can't be in the index + + assert ((time_var_name not in indexers.keys()) == True) # time can't be in the index assert (new_dataset.dims == dataset.dims) def test_variable_type_string_oco2(self): @@ -1461,15 +1460,15 @@ def test_variable_type_string_oco2(self): output_file_name = 'oco2_test_out.nc' shutil.copyfile(os.path.join(self.test_data_dir, 'OCO2', oco2_file_name), os.path.join(self.subset_output_dir, oco2_file_name)) - bbox = np.array(((-180,180),(-90.0,90))) + bbox = np.array(((-180, 180), (-90.0, 90))) subset.subset( - file_to_subset=join(self.test_data_dir, 'OCO2',oco2_file_name), + file_to_subset=join(self.test_data_dir, 'OCO2', oco2_file_name), bbox=bbox, output_file=join(self.subset_output_dir, output_file_name), ) - in_nc = xr.open_dataset(join(self.test_data_dir, 'OCO2',oco2_file_name)) + in_nc = xr.open_dataset(join(self.test_data_dir, 'OCO2', oco2_file_name)) out_nc = xr.open_dataset(join(self.subset_output_dir, output_file_name)) assert (in_nc.variables['source_files'].dtype == out_nc.variables['source_files'].dtype) @@ -1487,11 +1486,11 @@ def test_transform_h5py_dataset(self): entry_lst = [] # Get root 
level objects key_lst = list(h5_ds.keys()) - + # Go through every level of the file to fill out the remaining objects for entry_str in key_lst: # If object is a group, add it to the loop list - if (isinstance(h5_ds[entry_str],h5py.Group)): + if (isinstance(h5_ds[entry_str], h5py.Group)): for group_keys in list(h5_ds[entry_str].keys()): if (isinstance(h5_ds[entry_str + "/" + group_keys], h5py.Dataset)): entry_lst.append(entry_str + "/" + group_keys) @@ -1500,15 +1499,14 @@ def test_transform_h5py_dataset(self): nc_dataset, has_groups = gh.h5file_transform(os.path.join(self.subset_output_dir, OMI_file_name)) nc_vars_flattened = list(nc_dataset.variables.keys()) - for i in range(len(entry_lst)): # go through all the datasets in h5py file - input_variable = '__'+entry_lst[i].replace('/', '__') + for i in range(len(entry_lst)): # go through all the datasets in h5py file + input_variable = '__' + entry_lst[i].replace('/', '__') output_variable = nc_vars_flattened[i] assert (input_variable == output_variable) nc_dataset.close() h5_ds.close() - def test_variable_dims_matched_tropomi(self): """ Code must match the dimensions for each variable rather than @@ -1527,7 +1525,7 @@ def test_variable_dims_matched_tropomi(self): var_name: [dim.split(gh.GROUP_DELIM)[-1] for dim in var.dimensions] for var_name, var in in_nc.groups['PRODUCT'].variables.items() } - + # Get variables from METADATA group in_var_dims.update( { @@ -1539,7 +1537,8 @@ def test_variable_dims_matched_tropomi(self): in_var_dims.update( { var_name: [dim.split(gh.GROUP_DELIM)[-1] for dim in var.dimensions] - for var_name, var in in_nc.groups['PRODUCT'].groups['SUPPORT_DATA'].groups['GEOLOCATIONS'].variables.items() + for var_name, var in + in_nc.groups['PRODUCT'].groups['SUPPORT_DATA'].groups['GEOLOCATIONS'].variables.items() } ) @@ -1556,7 +1555,6 @@ def test_variable_dims_matched_tropomi(self): self.assertDictEqual(in_var_dims, out_var_dims) - def test_temporal_merged_topex(self): """ Test that a temporal subset results in a granule that only @@ -1613,7 +1611,6 @@ def test_get_time_epoch_var(self): shutil.copyfile(os.path.join(self.test_data_dir, 'tropomi', tropomi_file), os.path.join(self.subset_output_dir, tropomi_file)) - nc_dataset = nc.Dataset(os.path.join(self.subset_output_dir, tropomi_file), mode='r') nc_dataset = gh.transform_grouped_dataset(nc_dataset, os.path.join(self.subset_output_dir, tropomi_file)) @@ -1628,7 +1625,6 @@ def test_get_time_epoch_var(self): xr.backends.NetCDF4DataStore(nc_dataset), **args ) as dataset: - lat_var_names, lon_var_names = subset.compute_coordinate_variable_names(dataset) time_var_names = [ subset.compute_time_variable_name( @@ -1636,7 +1632,7 @@ def test_get_time_epoch_var(self): ) for lat_var_name in lat_var_names ] epoch_time_var = subset.get_time_epoch_var(dataset, time_var_names[0]) - + assert epoch_time_var.split('__')[-1] == 'time' def test_temporal_variable_subset(self): @@ -1694,21 +1690,20 @@ def test_temporal_variable_subset(self): # Only coordinate variables and variables requested in variable # subset should be present. 
assert set(np.append(['lat', 'lon', 'time'], variables)) == set(out_ds.data_vars.keys()) - def test_temporal_he5file_subset(self): """ Test that the time type changes to datetime for subsetting """ - + OMI_file_names = ['OMI-Aura_L2-OMSO2_2020m0116t1207-o82471_v003-2020m0223t142939.he5', 'OMI-Aura_L2-OMBRO_2020m0116t1207-o82471_v003-2020m0116t182003.he5'] OMI_copy_file = 'OMI_copy_testing_2.he5' for i in OMI_file_names: shutil.copyfile(os.path.join(self.test_data_dir, 'OMI', i), os.path.join(self.subset_output_dir, OMI_copy_file)) - min_time='2020-01-16T12:30:00Z' - max_time='2020-01-16T12:40:00Z' + min_time = '2020-01-16T12:30:00Z' + max_time = '2020-01-16T12:40:00Z' bbox = np.array(((-180, 180), (-90, 90))) nc_dataset, has_groups = gh.h5file_transform(os.path.join(self.subset_output_dir, OMI_copy_file)) @@ -1719,7 +1714,7 @@ def test_temporal_he5file_subset(self): } if min_time or max_time: - args['decode_times'] = True + args['decode_times'] = True with xr.open_dataset( xr.backends.NetCDF4DataStore(nc_dataset), @@ -1737,13 +1732,12 @@ def test_temporal_he5file_subset(self): dataset, start_date = subset.convert_to_datetime(dataset, time_var_names) assert dataset[time_var_names[0]].dtype == 'datetime64[ns]' - def test_he5_timeattrs_output(self): """Test that the time attributes in the output match the attributes of the input for OMI test files""" omi_dir = join(self.test_data_dir, 'OMI') omi_file = 'OMI-Aura_L2-OMBRO_2020m0116t1207-o82471_v003-2020m0116t182003.he5' - omi_file_input = 'input'+omi_file + omi_file_input = 'input' + omi_file bbox = np.array(((-180, 90), (-90, 90))) output_file = "{}_{}".format(self._testMethodName, omi_file) shutil.copyfile( @@ -1754,15 +1748,16 @@ def test_he5_timeattrs_output(self): os.path.join(omi_dir, omi_file), os.path.join(self.subset_output_dir, omi_file_input) ) - - min_time='2020-01-16T12:30:00Z' - max_time='2020-01-16T12:40:00Z' + + min_time = '2020-01-16T12:30:00Z' + max_time = '2020-01-16T12:40:00Z' bbox = np.array(((-180, 180), (-90, 90))) nc_dataset_input = nc.Dataset(os.path.join(self.subset_output_dir, omi_file_input)) - incut_set = nc_dataset_input.groups['HDFEOS'].groups['SWATHS'].groups['OMI Total Column Amount BrO'].groups['Geolocation Fields'] + incut_set = nc_dataset_input.groups['HDFEOS'].groups['SWATHS'].groups['OMI Total Column Amount BrO'].groups[ + 'Geolocation Fields'] xr_dataset_input = xr.open_dataset(xr.backends.NetCDF4DataStore(incut_set)) - inattrs = xr_dataset_input['Time'].attrs - + inattrs = xr_dataset_input['Time'].attrs + subset.subset( file_to_subset=os.path.join(self.subset_output_dir, omi_file), bbox=bbox, @@ -1772,20 +1767,20 @@ def test_he5_timeattrs_output(self): ) output_ncdataset = nc.Dataset(os.path.join(self.subset_output_dir, output_file)) - outcut_set = output_ncdataset.groups['HDFEOS'].groups['SWATHS'].groups['OMI Total Column Amount BrO'].groups['Geolocation Fields'] + outcut_set = output_ncdataset.groups['HDFEOS'].groups['SWATHS'].groups['OMI Total Column Amount BrO'].groups[ + 'Geolocation Fields'] xrout_dataset = xr.open_dataset(xr.backends.NetCDF4DataStore(outcut_set)) outattrs = xrout_dataset['Time'].attrs for key in inattrs.keys(): if isinstance(inattrs[key], np.ndarray): - if np.array_equal(inattrs[key],outattrs[key]): + if np.array_equal(inattrs[key], outattrs[key]): pass else: raise AssertionError('Attributes for {} do not equal each other'.format(key)) else: assert inattrs[key] == outattrs[key] - - + def test_temporal_subset_lines(self): bbox = np.array(((-180, 180), (-90, 90))) file = 
'SWOT_L2_LR_SSH_Expert_368_012_20121111T235910_20121112T005015_DG10_01.nc' @@ -1860,7 +1855,6 @@ def test_get_time_OMI(self): assert "Time" in time_var_names[0] assert "Latitude" in lat_var_names[0] - def test_empty_temporal_subset(self): """ Test the edge case where a subsetted empty granule @@ -1900,8 +1894,8 @@ def test_passed_coords(self): file = 'ascat_20150702_084200_metopa_45145_eps_o_250_2300_ovw.l2.nc' dataset = xr.open_dataset(join(self.test_data_dir, file), - decode_times=False, - decode_coords=False) + decode_times=False, + decode_coords=False) dummy_lats = ['dummy_lat'] dummy_lons = ['dummy_lon'] @@ -1964,7 +1958,7 @@ def test_var_subsetting_tropomi(self): """ Check that variable subsetting is the same if a leading slash is included """ - TROP_dir = join(self.test_data_dir, 'tropomi') + trop_dir = join(self.test_data_dir, 'tropomi') trop_file = 'S5P_OFFL_L2__CH4____20190319T110835_20190319T125006_07407_01_010202_20190325T125810_subset.nc4' variable_slash = ['/PRODUCT/methane_mixing_ratio'] variable_noslash = ['PRODUCT/methane_mixing_ratio'] @@ -1972,30 +1966,31 @@ def test_var_subsetting_tropomi(self): output_file_slash = "{}_{}".format(self._testMethodName, trop_file) output_file_noslash = "{}_noslash_{}".format(self._testMethodName, trop_file) shutil.copyfile( - os.path.join(TROP_dir, trop_file), + os.path.join(trop_dir, trop_file), os.path.join(self.subset_output_dir, trop_file) ) shutil.copyfile( - os.path.join(TROP_dir, trop_file), - os.path.join(self.subset_output_dir,'slashtest'+trop_file) + os.path.join(trop_dir, trop_file), + os.path.join(self.subset_output_dir, 'slashtest' + trop_file) ) - slash_test = subset.subset( + subset.subset( file_to_subset=join(self.subset_output_dir, trop_file), bbox=bbox, output_file=join(self.subset_output_dir, output_file_slash), - variables = variable_slash + variables=variable_slash ) - noslash_test = subset.subset( - file_to_subset=join(self.subset_output_dir, 'slashtest'+trop_file), + subset.subset( + file_to_subset=join(self.subset_output_dir, 'slashtest' + trop_file), bbox=bbox, output_file=join(self.subset_output_dir, output_file_noslash), - variables = variable_noslash + variables=variable_noslash ) slash_dataset = nc.Dataset(join(self.subset_output_dir, output_file_slash)) noslash_dataset = nc.Dataset(join(self.subset_output_dir, output_file_noslash)) assert list(slash_dataset.groups['PRODUCT'].variables) == list(noslash_dataset.groups['PRODUCT'].variables) + def test_bad_time_unit(self): fill_val = -99999.0 @@ -2029,4 +2024,3 @@ def test_bad_time_unit(self): ds_test = xr.open_dataset(nc_out_location) ds_test.close() - From 164b6c69768726646724587599002e409983d704 Mon Sep 17 00:00:00 2001 From: Frank Greguska <89428916+frankinspace@users.noreply.github.com> Date: Mon, 12 Dec 2022 18:26:31 -0800 Subject: [PATCH 16/16] Merge changes from issues/127 --- tests/test_subset.py | 73 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) diff --git a/tests/test_subset.py b/tests/test_subset.py index 74b9d643..9c938213 100644 --- a/tests/test_subset.py +++ b/tests/test_subset.py @@ -1979,3 +1979,76 @@ def test_passed_coords(data_dir, subset_output_dir): assert lats == dummy_lats assert lons == dummy_lons assert times == dummy_times + + +def test_var_subsetting_tropomi(data_dir, subset_output_dir, request): + """ + Check that variable subsetting is the same if a leading slash is included + """ + trop_dir = join(data_dir, 'tropomi') + trop_file = 
'S5P_OFFL_L2__CH4____20190319T110835_20190319T125006_07407_01_010202_20190325T125810_subset.nc4' + variable_slash = ['/PRODUCT/methane_mixing_ratio'] + variable_noslash = ['PRODUCT/methane_mixing_ratio'] + bbox = np.array(((-180, 180), (-90, 90))) + output_file_slash = "{}_{}".format(request.node.name, trop_file) + output_file_noslash = "{}_noslash_{}".format(request.node.name, trop_file) + shutil.copyfile( + os.path.join(trop_dir, trop_file), + os.path.join(subset_output_dir, trop_file) + ) + shutil.copyfile( + os.path.join(trop_dir, trop_file), + os.path.join(subset_output_dir, 'slashtest' + trop_file) + ) + subset.subset( + file_to_subset=join(subset_output_dir, trop_file), + bbox=bbox, + output_file=join(subset_output_dir, output_file_slash), + variables=variable_slash + ) + subset.subset( + file_to_subset=join(subset_output_dir, 'slashtest' + trop_file), + bbox=bbox, + output_file=join(subset_output_dir, output_file_noslash), + variables=variable_noslash + ) + + slash_dataset = nc.Dataset(join(subset_output_dir, output_file_slash)) + noslash_dataset = nc.Dataset(join(subset_output_dir, output_file_noslash)) + + assert list(slash_dataset.groups['PRODUCT'].variables) == list(noslash_dataset.groups['PRODUCT'].variables) + + +def test_bad_time_unit(subset_output_dir): + + fill_val = -99999.0 + time_vals = np.random.rand(10) + time_vals[0] = fill_val + time_vals[-1] = fill_val + + data_vars = { + 'foo': (['x'], np.random.rand(10)), + 'time': ( + ['x'], + time_vals, + { + 'units': 'seconds since 2000-1-1 0:0:0 0', + '_FillValue': fill_val, + 'standard_name': 'time', + 'calendar': 'standard' + } + ), + } + + ds = xr.Dataset( + data_vars=data_vars, + coords={'x': (['x'], np.arange(10))} + ) + + nc_out_location = join(subset_output_dir, "bad_time.nc") + ds.to_netcdf(nc_out_location) + + subset.override_decode_cf_datetime() + + ds_test = xr.open_dataset(nc_out_location) + ds_test.close() \ No newline at end of file