diff --git a/.github/workflows/build-pipeline.yml b/.github/workflows/build-pipeline.yml index 8c964bad..c031a4a4 100644 --- a/.github/workflows/build-pipeline.yml +++ b/.github/workflows/build-pipeline.yml @@ -4,7 +4,7 @@ name: Build on: # Triggers the workflow on push events push: - branches: [ develop, release/**, main, feature/** ] + branches: [ develop, release/**, main, feature/**, issue/**, issues/**, dependabot/** ] # Allows you to run this workflow manually from the Actions tab workflow_dispatch: @@ -36,7 +36,10 @@ jobs: echo "pyproject_name=$(poetry version | awk '{print $1}')" >> $GITHUB_ENV - name: Bump pre-alpha version # If triggered by push to a feature branch - if: ${{ startsWith(github.ref, 'refs/heads/feature/') }} + if: | + ${{ startsWith(github.ref, 'refs/heads/issue') }} || + ${{ startsWith(github.ref, 'refs/heads/dependabot/') }} || + ${{ startsWith(github.ref, 'refs/heads/feature/') }} run: | new_ver="${{ steps.get-version.outputs.current_version }}+$(git rev-parse --short ${GITHUB_SHA})" poetry version $new_ver @@ -160,6 +163,7 @@ jobs: name: python-artifact path: dist/* - name: Publish to test.pypi.org + id: pypi-test-publish if: | github.ref == 'refs/heads/develop' || startsWith(github.ref, 'refs/heads/release') @@ -170,19 +174,24 @@ jobs: poetry publish -r testpypi - name: Publish to pypi.org if: ${{ github.ref == 'refs/heads/main' }} + id: pypi-publish env: POETRY_PYPI_TOKEN_PYPI: ${{secrets.POETRY_PYPI_TOKEN_PYPI}} run: | poetry publish - name: Log in to the Container registry - if: ${{ !startsWith(github.ref, 'refs/heads/feature') }} + if: | + steps.pypi-test-publish.conclusion == 'success' || + steps.pypi-publish.conclusion == 'success' uses: docker/login-action@v1 with: registry: ${{ env.REGISTRY }} username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} - name: Extract metadata (tags, labels) for Docker - if: ${{ !startsWith(github.ref, 'refs/heads/feature') }} + if: | + steps.pypi-test-publish.conclusion == 'success' || + steps.pypi-publish.conclusion == 'success' id: meta uses: docker/metadata-action@v4 with: @@ -191,12 +200,16 @@ jobs: type=semver,pattern={{version}},value=${{ env.software_version }} type=raw,value=${{ env.venue }} - name: Wait for package - if: ${{ !startsWith(github.ref, 'refs/heads/feature') }} + if: | + steps.pypi-test-publish.conclusion == 'success' || + steps.pypi-publish.conclusion == 'success' run: | pip install tenacity ${GITHUB_WORKSPACE}/.github/workflows/wait-for-pypi.py ${{env.pyproject_name}}[harmony]==${{ env.software_version }} - name: Build and push Docker image - if: ${{ !startsWith(github.ref, 'refs/heads/feature') }} + if: | + steps.pypi-test-publish.conclusion == 'success' || + steps.pypi-publish.conclusion == 'success' uses: docker/build-push-action@v3 with: context: . @@ -208,7 +221,9 @@ jobs: tags: ${{ steps.meta.outputs.tags }} labels: ${{ steps.meta.outputs.labels }} - name: Run Snyk on Docker Image - if: ${{ !startsWith(github.ref, 'refs/heads/feature') }} + if: | + steps.pypi-test-publish.conclusion == 'success' || + steps.pypi-publish.conclusion == 'success' # Snyk can be used to break the build when it detects vulnerabilities. 
# In this case we want to upload the issues to GitHub Code Scanning continue-on-error: true diff --git a/CHANGELOG.md b/CHANGELOG.md index 1ffbad7c..afe92a44 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,7 @@ for variables to not have leading slash in the front ### Removed ### Fixed - PODAAC-5065: integration with SMAP_RSS_L2_SSS_V5, fix way xarray open granules that have `seconds since 2000-1-1 0:0:0 0` as a time unit. +- [issue/127](https://github.com/podaac/l2ss-py/issues/127): Fixed bug when subsetting variables in grouped datasets. Variable names passed to `subset` will now have `/` replaced by `GROUP_DELIM` so they can be located in flattened datasets ### Security ## [2.2.0] diff --git a/podaac/subsetter/group_handling.py b/podaac/subsetter/group_handling.py new file mode 100644 index 00000000..052a3a00 --- /dev/null +++ b/podaac/subsetter/group_handling.py @@ -0,0 +1,238 @@ +""" +group_handling.py + +Functions for converting multidimensional data structures + between a group hierarchy and a flat structure +""" +from shutil import copy + +import h5py +import netCDF4 as nc +import numpy as np +import xarray as xr + +GROUP_DELIM = '__' + + +def transform_grouped_dataset(nc_dataset, file_to_subset): + """ + Transform a netCDF4 Dataset that has groups to an xarray compatible + dataset. xarray does not work with groups, so this transformation + will flatten the variables in the dataset and use the group path as + the new variable name. For example, data_01 > km > sst would become + 'data_01__km__sst', where GROUP_DELIM is __. + + This same pattern is applied to dimensions, which are located under + the appropriate group. They are renamed and placed in the root + group. + + Parameters + ---------- + nc_dataset : nc.Dataset + netCDF4 Dataset that contains groups + file_to_subset : str + + Returns + ------- + nc.Dataset + netCDF4 Dataset that does not contain groups and that has been + flattened. + """ + + # Close the existing read-only dataset and reopen in append mode + nc_dataset.close() + nc_dataset = nc.Dataset(file_to_subset, 'r+') + + dimensions = {} + + def walk(group_node, path): + for key, item in group_node.items(): + group_path = f'{path}{GROUP_DELIM}{key}' + + # If there are variables in this group, copy to root group + # and then delete from current group + if item.variables: + # Copy variables to root group with new name + for var_name, var in item.variables.items(): + var_group_name = f'{group_path}{GROUP_DELIM}{var_name}' + nc_dataset.variables[var_group_name] = var + # Delete variables + var_names = list(item.variables.keys()) + for var_name in var_names: + del item.variables[var_name] + + if item.dimensions: + dims = list(item.dimensions.keys()) + for dim_name in dims: + new_dim_name = f'{group_path.replace("/", GROUP_DELIM)}{GROUP_DELIM}{dim_name}' + item.dimensions[new_dim_name] = item.dimensions[dim_name] + dimensions[new_dim_name] = item.dimensions[dim_name] + item.renameDimension(dim_name, new_dim_name) + + # If there are subgroups in this group, call this function + # again on that group. 
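A minimal usage sketch of the flattening implemented here, assuming a hypothetical grouped granule `granule.nc` that contains a `data_01/km/sst` variable (the import path and `GROUP_DELIM` come from this new module; the xarray hand-off is one way a flattened `netCDF4.Dataset` can be consumed):

```python
import netCDF4 as nc
import xarray as xr

from podaac.subsetter.group_handling import GROUP_DELIM, transform_grouped_dataset

# Hypothetical grouped granule; transform_grouped_dataset reopens it in
# append mode and moves group variables/dimensions to the root group.
nc_dataset = nc.Dataset('granule.nc', mode='r')
flat_dataset = transform_grouped_dataset(nc_dataset, 'granule.nc')

# The flattened dataset can now be opened with xarray, which has no group
# support: data_01/km/sst is visible as __data_01__km__sst.
with xr.open_dataset(xr.backends.NetCDF4DataStore(flat_dataset),
                     decode_coords=False, decode_times=False) as dataset:
    print([name for name in dataset.variables
           if name.startswith(GROUP_DELIM + 'data_01')])
```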
+ if item.groups: + walk(item.groups, group_path) + + # Delete non-root groups + group_names = list(group_node.keys()) + for group_name in group_names: + del group_node[group_name] + + for var_name in list(nc_dataset.variables.keys()): + new_var_name = f'{GROUP_DELIM}{var_name}' + nc_dataset.variables[new_var_name] = nc_dataset.variables[var_name] + del nc_dataset.variables[var_name] + + walk(nc_dataset.groups, '') + + # Update the dimensions of the dataset in the root group + nc_dataset.dimensions.update(dimensions) + + return nc_dataset + + +def recombine_grouped_datasets(datasets, output_file, start_date): # pylint: disable=too-many-branches + """ + Given a list of xarray datasets, combine those datasets into a + single netCDF4 Dataset and write to the disk. Each dataset has been + transformed using its group path and needs to be un-transformed and + placed in the appropriate group. + + Parameters + ---------- + datasets : list (xr.Dataset) + List of xarray datasets to be combined + output_file : str + Name of the output file to write the resulting NetCDF file to. + """ + + base_dataset = nc.Dataset(output_file, mode='w') + + for dataset in datasets: + group_lst = [] + for var_name in dataset.variables.keys(): # need logic if there is data in the top level not in a group + group_lst.append('/'.join(var_name.split(GROUP_DELIM)[:-1])) + group_lst = ['/' if group == '' else group for group in group_lst] + groups = set(group_lst) + for group in groups: + base_dataset.createGroup(group) + + for dim_name in list(dataset.dims.keys()): + new_dim_name = dim_name.split(GROUP_DELIM)[-1] + dim_group = _get_nested_group(base_dataset, dim_name) + dim_group.createDimension(new_dim_name, dataset.dims[dim_name]) + + # Rename variables + _rename_variables(dataset, base_dataset, start_date) + + # Remove group vars from base dataset + for var_name in list(base_dataset.variables.keys()): + if GROUP_DELIM in var_name: + del base_dataset.variables[var_name] + + # Remove group dims from base dataset + for dim_name in list(base_dataset.dimensions.keys()): + if GROUP_DELIM in dim_name: + del base_dataset.dimensions[dim_name] + + # Copy global attributes + base_dataset.setncatts(datasets[0].attrs) + # Write and close + base_dataset.close() + + +def _get_nested_group(dataset, group_path): + nested_group = dataset + for group in group_path.strip(GROUP_DELIM).split(GROUP_DELIM)[:-1]: + nested_group = nested_group.groups[group] + return nested_group + + +def _rename_variables(dataset, base_dataset, start_date): + for var_name in list(dataset.variables.keys()): + new_var_name = var_name.split(GROUP_DELIM)[-1] + var_group = _get_nested_group(base_dataset, var_name) + variable = dataset.variables[var_name] + var_dims = [x.split(GROUP_DELIM)[-1] for x in dataset.variables[var_name].dims] + if np.issubdtype( + dataset.variables[var_name].dtype, np.dtype(np.datetime64) + ) or np.issubdtype( + dataset.variables[var_name].dtype, np.dtype(np.timedelta64) + ): + if start_date: + dataset.variables[var_name].values = (dataset.variables[var_name].values - np.datetime64(start_date))/np.timedelta64(1, 's') + variable = dataset.variables[var_name] + else: + cf_dt_coder = xr.coding.times.CFDatetimeCoder() + encoded_var = cf_dt_coder.encode(dataset.variables[var_name]) + variable = encoded_var + + var_attrs = variable.attrs + fill_value = var_attrs.get('_FillValue') + var_attrs.pop('_FillValue', None) + comp_args = {"zlib": True, "complevel": 1} + + if variable.dtype == object: + var_group.createVariable(new_var_name, 'S1', var_dims, 
fill_value=fill_value, **comp_args) + elif variable.dtype == 'timedelta64[ns]': + var_group.createVariable(new_var_name, 'i4', var_dims, fill_value=fill_value, **comp_args) + else: + var_group.createVariable(new_var_name, variable.dtype, var_dims, fill_value=fill_value, **comp_args) + + # Copy attributes + var_group.variables[new_var_name].setncatts(var_attrs) + + # Copy data + var_group.variables[new_var_name].set_auto_maskandscale(False) + var_group.variables[new_var_name][:] = variable.data + + +def h5file_transform(finput): + """ + Transform a h5py Dataset that has groups to an xarray compatible + dataset. xarray does not work with groups, so this transformation + will flatten the variables in the dataset and use the group path as + the new variable name. For example, data_01 > km > sst would become + 'data_01__km__sst', where GROUP_DELIM is __. + + Returns + ------- + nc.Dataset + netCDF4 Dataset that does not contain groups and that has been + flattened. + """ + data_new = h5py.File(finput, 'r+') + del_group_list = list(data_new.keys()) + has_groups = bool(data_new['/']) + + def walk_h5py(data_new, group): + # flattens h5py file + for key, item in data_new[group].items(): + group_path = f'{group}{key}' + if isinstance(item, h5py.Dataset): + new_var_name = group_path.replace('/', '__') + + data_new[new_var_name] = data_new[group_path] + del data_new[group_path] + + elif isinstance(item, h5py.Group): + if len(list(item.keys())) == 0: + new_group_name = group_path.replace('/', '__') + data_new[new_group_name] = data_new[group_path] + + walk_h5py(data_new, data_new[group_path].name + '/') + + walk_h5py(data_new, data_new.name) + + for del_group in del_group_list: + del data_new[del_group] + + finputnc = '.'.join(finput.split('.')[:-1]) + '.nc' + + data_new.close() # close the h5py dataset + copy(finput, finputnc) # copy to a nc file + + nc_dataset = nc.Dataset(finputnc, mode='r') + + return nc_dataset, has_groups diff --git a/podaac/subsetter/subset.py b/podaac/subsetter/subset.py index 2e1b4627..78a19281 100644 --- a/podaac/subsetter/subset.py +++ b/podaac/subsetter/subset.py @@ -23,14 +23,13 @@ import json import operator import os -from shutil import copy +from typing import Tuple import dateutil from dateutil import parser import cf_xarray as cfxr import cftime import geopandas as gpd -import h5py import importlib_metadata import julian import netCDF4 as nc @@ -43,8 +42,9 @@ from podaac.subsetter import dimension_cleanup as dc from podaac.subsetter import xarray_enhancements as xre +from podaac.subsetter.group_handling import GROUP_DELIM, transform_grouped_dataset, recombine_grouped_datasets, \ + h5file_transform -GROUP_DELIM = '__' SERVICE_NAME = 'l2ss-py' @@ -494,7 +494,7 @@ def compute_time_variable_name(dataset, lat_var): Parameters ---------- - dataset : xr.Dataset: + dataset : xr.Dataset xarray dataset to find time variable from lat_var : xr.Variable Lat variable for this dataset @@ -875,229 +875,6 @@ def in_shape(lon, lat): return xre.where(dataset, boolean_mask, cut) -def transform_grouped_dataset(nc_dataset, file_to_subset): - """ - Transform a netCDF4 Dataset that has groups to an xarray compatible - dataset. xarray does not work with groups, so this transformation - will flatten the variables in the dataset and use the group path as - the new variable name. For example, data_01 > km > sst would become - 'data_01__km__sst', where GROUP_DELIM is __. - - This same pattern is applied to dimensions, which are located under - the appropriate group. 
They are renamed and placed in the root - group. - - Parameters - ---------- - nc_dataset : nc.Dataset - netCDF4 Dataset that contains groups - - Returns - ------- - nc.Dataset - netCDF4 Dataset that does not contain groups and that has been - flattened. - """ - - # Close the existing read-only dataset and reopen in append mode - nc_dataset.close() - nc_dataset = nc.Dataset(file_to_subset, 'r+') - - dimensions = {} - - def walk(group_node, path): - for key, item in group_node.items(): - group_path = f'{path}{GROUP_DELIM}{key}' - - # If there are variables in this group, copy to root group - # and then delete from current group - if item.variables: - # Copy variables to root group with new name - for var_name, var in item.variables.items(): - var_group_name = f'{group_path}{GROUP_DELIM}{var_name}' - nc_dataset.variables[var_group_name] = var - # Delete variables - var_names = list(item.variables.keys()) - for var_name in var_names: - del item.variables[var_name] - - if item.dimensions: - dims = list(item.dimensions.keys()) - for dim_name in dims: - new_dim_name = f'{group_path.replace("/", GROUP_DELIM)}{GROUP_DELIM}{dim_name}' - item.dimensions[new_dim_name] = item.dimensions[dim_name] - dimensions[new_dim_name] = item.dimensions[dim_name] - item.renameDimension(dim_name, new_dim_name) - - # If there are subgroups in this group, call this function - # again on that group. - if item.groups: - walk(item.groups, group_path) - - # Delete non-root groups - group_names = list(group_node.keys()) - for group_name in group_names: - del group_node[group_name] - - for var_name in list(nc_dataset.variables.keys()): - new_var_name = f'{GROUP_DELIM}{var_name}' - nc_dataset.variables[new_var_name] = nc_dataset.variables[var_name] - del nc_dataset.variables[var_name] - - walk(nc_dataset.groups, '') - - # Update the dimensions of the dataset in the root group - nc_dataset.dimensions.update(dimensions) - - return nc_dataset - - -def recombine_grouped_datasets(datasets, output_file, start_date): # pylint: disable=too-many-branches - """ - Given a list of xarray datasets, combine those datasets into a - single netCDF4 Dataset and write to the disk. Each dataset has been - transformed using its group path and needs to be un-transformed and - placed in the appropriate group. - - Parameters - ---------- - datasets : list (xr.Dataset) - List of xarray datasets to be combined - output_file : str - Name of the output file to write the resulting NetCDF file to. 
- """ - - base_dataset = nc.Dataset(output_file, mode='w') - - for dataset in datasets: - group_lst = [] - for var_name in dataset.variables.keys(): # need logic if there is data in the top level not in a group - group_lst.append('/'.join(var_name.split(GROUP_DELIM)[:-1])) - group_lst = ['/' if group == '' else group for group in group_lst] - groups = set(group_lst) - for group in groups: - base_dataset.createGroup(group) - - for dim_name in list(dataset.dims.keys()): - new_dim_name = dim_name.split(GROUP_DELIM)[-1] - dim_group = _get_nested_group(base_dataset, dim_name) - dim_group.createDimension(new_dim_name, dataset.dims[dim_name]) - - # Rename variables - _rename_variables(dataset, base_dataset, start_date) - - # Remove group vars from base dataset - for var_name in list(base_dataset.variables.keys()): - if GROUP_DELIM in var_name: - del base_dataset.variables[var_name] - - # Remove group dims from base dataset - for dim_name in list(base_dataset.dimensions.keys()): - if GROUP_DELIM in dim_name: - del base_dataset.dimensions[dim_name] - - # Copy global attributes - base_dataset.setncatts(datasets[0].attrs) - # Write and close - base_dataset.close() - - -def _get_nested_group(dataset, group_path): - nested_group = dataset - for group in group_path.strip(GROUP_DELIM).split(GROUP_DELIM)[:-1]: - nested_group = nested_group.groups[group] - return nested_group - - -def _rename_variables(dataset, base_dataset, start_date): - for var_name in list(dataset.variables.keys()): - new_var_name = var_name.split(GROUP_DELIM)[-1] - var_group = _get_nested_group(base_dataset, var_name) - variable = dataset.variables[var_name] - var_dims = [x.split(GROUP_DELIM)[-1] for x in dataset.variables[var_name].dims] - if np.issubdtype( - dataset.variables[var_name].dtype, np.dtype(np.datetime64) - ) or np.issubdtype( - dataset.variables[var_name].dtype, np.dtype(np.timedelta64) - ): - if start_date: - dataset.variables[var_name].values = (dataset.variables[var_name].values - np.datetime64(start_date))/np.timedelta64(1, 's') - variable = dataset.variables[var_name] - else: - cf_dt_coder = xr.coding.times.CFDatetimeCoder() - encoded_var = cf_dt_coder.encode(dataset.variables[var_name]) - variable = encoded_var - - var_attrs = variable.attrs - fill_value = var_attrs.get('_FillValue') - var_attrs.pop('_FillValue', None) - comp_args = {"zlib": True, "complevel": 1} - - if variable.dtype == object: - var_group.createVariable(new_var_name, 'S1', var_dims, fill_value=fill_value, **comp_args) - elif variable.dtype == 'timedelta64[ns]': - var_group.createVariable(new_var_name, 'i4', var_dims, fill_value=fill_value, **comp_args) - else: - var_group.createVariable(new_var_name, variable.dtype, var_dims, fill_value=fill_value, **comp_args) - - # Copy attributes - var_group.variables[new_var_name].setncatts(var_attrs) - - # Copy data - var_group.variables[new_var_name].set_auto_maskandscale(False) - var_group.variables[new_var_name][:] = variable.data - - -def h5file_transform(finput): - """ - Transform a h5py Dataset that has groups to an xarray compatible - dataset. xarray does not work with groups, so this transformation - will flatten the variables in the dataset and use the group path as - the new variable name. For example, data_01 > km > sst would become - 'data_01__km__sst', where GROUP_DELIM is __. - - Returns - ------- - nc.Dataset - netCDF4 Dataset that does not contain groups and that has been - flattened. 
- """ - data_new = h5py.File(finput, 'r+') - del_group_list = list(data_new.keys()) - has_groups = bool(data_new['/']) - - def walk_h5py(data_new, group): - # flattens h5py file - for key, item in data_new[group].items(): - group_path = f'{group}{key}' - if isinstance(item, h5py.Dataset): - new_var_name = group_path.replace('/', '__') - - data_new[new_var_name] = data_new[group_path] - del data_new[group_path] - - elif isinstance(item, h5py.Group): - if len(list(item.keys())) == 0: - new_group_name = group_path.replace('/', '__') - data_new[new_group_name] = data_new[group_path] - - walk_h5py(data_new, data_new[group_path].name + '/') - - walk_h5py(data_new, data_new.name) - - for del_group in del_group_list: - del data_new[del_group] - - finputnc = '.'.join(finput.split('.')[:-1]) + '.nc' - - data_new.close() # close the h5py dataset - copy(finput, finputnc) # copy to a nc file - - nc_dataset = nc.Dataset(finputnc, mode='r') - - return nc_dataset, has_groups - - def get_coordinate_variable_names(dataset, lat_var_names=None, lon_var_names=None, time_var_names=None): """ Retrieve coordinate variables for this dataset. If coordinate @@ -1158,6 +935,26 @@ def convert_to_datetime(dataset, time_vars): return dataset, start_date +def open_as_nc_dataset(filepath: str) -> Tuple[nc.Dataset, list, bool]: + """Open netcdf file, and flatten groups if they exist.""" + file_extension = filepath.split('.')[-1] + + if file_extension == 'he5': + nc_dataset, has_groups = h5file_transform(filepath) + else: + # Open dataset with netCDF4 first, so we can get group info + nc_dataset = nc.Dataset(filepath, mode='r') + has_groups = bool(nc_dataset.groups) + + # If dataset has groups, transform to work with xarray + if has_groups: + nc_dataset = transform_grouped_dataset(nc_dataset, filepath) + + nc_dataset, rename_vars = dc.remove_duplicate_dims(nc_dataset) + + return nc_dataset, rename_vars, has_groups + + def override_decode_cf_datetime(): """ WARNING !!! REMOVE AT EARLIEST XARRAY FIX, this is a override to xarray override_decode_cf_datetime function. @@ -1181,10 +978,10 @@ def decode_cf_datetime(num_dates, units, calendar=None, use_cftime=None): xarray.coding.times.decode_cf_datetime = decode_cf_datetime -def subset(file_to_subset, bbox, output_file, variables=None, +def subset(file_to_subset, bbox, output_file, variables=(), # pylint: disable=too-many-branches, disable=too-many-statements cut=True, shapefile=None, min_time=None, max_time=None, origin_source=None, - lat_var_names=None, lon_var_names=None, time_var_names=None): + lat_var_names=(), lon_var_names=(), time_var_names=()): """ Subset a given NetCDF file given a bounding box @@ -1217,6 +1014,9 @@ def subset(file_to_subset, bbox, output_file, variables=None, ISO timestamp representing the upper bound of the temporal subset to be performed. If this value is not provided, the granule will not be subset temporally on the upper bound. + origin_source : str + Original location or filename of data to be used in "derived from" + history element. lat_var_names : list List of variables that represent the latitude coordinate variables for this granule. This list will only contain more @@ -1233,27 +1033,21 @@ def subset(file_to_subset, bbox, output_file, variables=None, than one value in the case where there are multiple groups and different coordinate variables for each group. 
""" - file_extension = file_to_subset.split('.')[-1] - - if file_extension == 'he5': - nc_dataset, has_groups = h5file_transform(file_to_subset) - else: - # Open dataset with netCDF4 first, so we can get group info - nc_dataset = nc.Dataset(file_to_subset, mode='r') - has_groups = bool(nc_dataset.groups) - - # If dataset has groups, transform to work with xarray - if has_groups: - nc_dataset = transform_grouped_dataset(nc_dataset, file_to_subset) - - nc_dataset, rename_vars = dc.remove_duplicate_dims(nc_dataset) + nc_dataset, rename_vars, has_groups = open_as_nc_dataset(file_to_subset) override_decode_cf_datetime() - if variables: - variables = [x.replace('/', GROUP_DELIM) for x in variables] - if has_groups: - variables = [GROUP_DELIM + x if not x.startswith(GROUP_DELIM) else x for x in variables] + if has_groups: + # Make sure all variables start with '/' + variables = ['/' + var if not var.startswith('/') else var for var in variables] + lat_var_names = ['/' + var if not var.startswith('/') else var for var in lat_var_names] + lon_var_names = ['/' + var if not var.startswith('/') else var for var in lon_var_names] + time_var_names = ['/' + var if not var.startswith('/') else var for var in time_var_names] + # Replace all '/' with GROUP_DELIM + variables = [var.replace('/', GROUP_DELIM) for var in variables] + lat_var_names = [var.replace('/', GROUP_DELIM) for var in lat_var_names] + lon_var_names = [var.replace('/', GROUP_DELIM) for var in lon_var_names] + time_var_names = [var.replace('/', GROUP_DELIM) for var in time_var_names] args = { 'decode_coords': False, diff --git a/poetry.lock b/poetry.lock index 2d142e97..22d4ddb2 100644 --- a/poetry.lock +++ b/poetry.lock @@ -8,7 +8,7 @@ python-versions = "*" [[package]] name = "astroid" -version = "2.12.9" +version = "2.12.13" description = "An abstract syntax tree for Python with inference support." category = "dev" optional = false @@ -22,14 +22,6 @@ wrapt = [ {version = ">=1.14,<2", markers = "python_version >= \"3.11\""}, ] -[[package]] -name = "atomicwrites" -version = "1.4.1" -description = "Atomic file writes." 
-category = "dev" -optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" - [[package]] name = "attrs" version = "22.1.0" @@ -46,7 +38,7 @@ tests_no_zope = ["coverage[toml] (>=5.0.2)", "hypothesis", "pympler", "pytest (> [[package]] name = "aws-sam-translator" -version = "1.50.0" +version = "1.55.0" description = "AWS SAM Translator is a library that transform SAM templates into AWS CloudFormation templates" category = "dev" optional = false @@ -57,11 +49,11 @@ boto3 = ">=1.19.5,<2.0.0" jsonschema = ">=3.2,<4.0" [package.extras] -dev = ["coverage (>=5.3,<6.0)", "flake8 (>=3.8.4,<3.9.0)", "tox (>=3.24,<4.0)", "pytest-cov (>=2.10.1,<2.11.0)", "pytest-xdist (>=2.5,<3.0)", "pytest-env (>=0.6.2,<0.7.0)", "pylint (>=2.9.0,<2.10.0)", "pyyaml (>=5.4,<6.0)", "pytest (>=6.2.5,<6.3.0)", "parameterized (>=0.7.4,<0.8.0)", "click (>=7.1,<8.0)", "dateparser (>=0.7,<1.0)", "boto3 (>=1.23,<2)", "tenacity (>=7.0.0,<7.1.0)", "requests (>=2.24.0,<2.25.0)", "docopt (>=0.6.2,<0.7.0)", "black (==20.8b1)"] +dev = ["coverage (>=5.3,<6.0)", "flake8 (>=3.8.4,<3.9.0)", "tox (>=3.24,<4.0)", "pytest-cov (>=2.10.1,<2.11.0)", "pytest-xdist (>=2.5,<3.0)", "pytest-env (>=0.6.2,<0.7.0)", "pytest-rerunfailures (>=9.1.1,<9.2.0)", "pylint (>=2.15.0,<2.16.0)", "pyyaml (>=5.4,<6.0)", "pytest (>=6.2.5,<6.3.0)", "parameterized (>=0.7.4,<0.8.0)", "click (>=7.1,<8.0)", "dateparser (>=0.7,<1.0)", "boto3 (>=1.23,<2)", "tenacity (>=7.0.0,<7.1.0)", "requests (>=2.24.0,<2.25.0)", "docopt (>=0.6.2,<0.7.0)", "black (==20.8b1)", "ruamel.yaml (==0.17.21)", "mypy (==0.971)", "boto3-stubs[serverlessrepo,appconfig] (>=1.19.5,<2.0.0)", "types-PyYAML (>=5.4,<6.0)", "types-jsonschema (>=3.2,<4.0)"] [[package]] name = "aws-xray-sdk" -version = "2.10.0" +version = "2.11.0" description = "The AWS X-Ray SDK for Python (the SDK) enables Python developers to record and emit information from within their applications to the AWS X-Ray service." category = "dev" optional = false @@ -73,7 +65,7 @@ wrapt = "*" [[package]] name = "babel" -version = "2.10.3" +version = "2.11.0" description = "Internationalization utilities" category = "dev" optional = false @@ -109,14 +101,14 @@ python-versions = "*" [[package]] name = "boto3" -version = "1.24.67" +version = "1.26.28" description = "The AWS SDK for Python" category = "main" optional = false python-versions = ">= 3.7" [package.dependencies] -botocore = ">=1.27.67,<1.28.0" +botocore = ">=1.29.28,<1.30.0" jmespath = ">=0.7.1,<2.0.0" s3transfer = ">=0.6.0,<0.7.0" @@ -125,7 +117,7 @@ crt = ["botocore[crt] (>=1.21.0,<2.0a0)"] [[package]] name = "botocore" -version = "1.27.67" +version = "1.29.28" description = "Low-level, data-driven core of boto 3." category = "main" optional = false @@ -137,11 +129,11 @@ python-dateutil = ">=2.1,<3.0.0" urllib3 = ">=1.25.4,<1.27" [package.extras] -crt = ["awscrt (==0.14.0)"] +crt = ["awscrt (==0.15.3)"] [[package]] name = "certifi" -version = "2022.6.15" +version = "2022.12.7" description = "Python package for providing Mozilla's CA Bundle." 
category = "main" optional = false @@ -149,7 +141,7 @@ python-versions = ">=3.6" [[package]] name = "cf-xarray" -version = "0.7.4" +version = "0.7.6" description = "A lightweight convenience wrapper for using CF attributes on xarray objects" category = "main" optional = false @@ -171,14 +163,14 @@ pycparser = "*" [[package]] name = "cfn-lint" -version = "0.64.1" +version = "0.72.2" description = "Checks CloudFormation templates for practices and behaviour that could potentially be improved" category = "dev" optional = false python-versions = ">=3.7, <=4.0, !=4.0" [package.dependencies] -aws-sam-translator = ">=1.50.0" +aws-sam-translator = ">=1.55.0" jschema-to-python = ">=1.2.3,<1.3.0" jsonpatch = "*" jsonschema = ">=3.0,<5" @@ -189,7 +181,7 @@ sarif-om = ">=1.0.4,<1.1.0" [[package]] name = "cftime" -version = "1.6.1" +version = "1.6.2" description = "Time-handling functionality from netcdf4-python" category = "main" optional = false @@ -258,26 +250,29 @@ python-versions = ">=3.6" [[package]] name = "colorama" -version = "0.4.5" +version = "0.4.6" description = "Cross-platform colored terminal text." category = "main" optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" [[package]] name = "coverage" -version = "6.4.4" +version = "6.5.0" description = "Code coverage measurement for Python" category = "dev" optional = false python-versions = ">=3.7" +[package.dependencies] +tomli = {version = "*", optional = true, markers = "python_full_version <= \"3.11.0a6\" and extra == \"toml\""} + [package.extras] toml = ["tomli"] [[package]] name = "cryptography" -version = "38.0.1" +version = "38.0.4" description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers." 
category = "dev" optional = false @@ -296,16 +291,17 @@ test = ["pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-subtests", [[package]] name = "dask" -version = "2022.9.0" +version = "2022.12.0" description = "Parallel PyData with Task Scheduling" category = "main" optional = false python-versions = ">=3.8" [package.dependencies] -bokeh = {version = ">=2.4.2", optional = true, markers = "extra == \"complete\""} +bokeh = {version = ">=2.4.2,<3", optional = true, markers = "extra == \"complete\""} +click = ">=7.0" cloudpickle = ">=1.1.1" -distributed = {version = "2022.9.0", optional = true, markers = "extra == \"complete\""} +distributed = {version = "2022.12.0", optional = true, markers = "extra == \"complete\""} fsspec = ">=0.6.0" jinja2 = {version = "*", optional = true, markers = "extra == \"complete\""} numpy = {version = ">=1.18", optional = true, markers = "extra == \"complete\""} @@ -317,10 +313,10 @@ toolz = ">=0.8.2" [package.extras] array = ["numpy (>=1.18)"] -complete = ["bokeh (>=2.4.2)", "distributed (==2022.9.0)", "jinja2", "numpy (>=1.18)", "pandas (>=1.0)"] +complete = ["bokeh (>=2.4.2,<3)", "distributed (==2022.12.0)", "jinja2", "numpy (>=1.18)", "pandas (>=1.0)"] dataframe = ["numpy (>=1.18)", "pandas (>=1.0)"] -diagnostics = ["bokeh (>=2.4.2)", "jinja2"] -distributed = ["distributed (==2022.9.0)"] +diagnostics = ["bokeh (>=2.4.2,<3)", "jinja2"] +distributed = ["distributed (==2022.12.0)"] test = ["pandas", "pytest", "pytest-rerunfailures", "pytest-xdist", "pre-commit"] [[package]] @@ -336,27 +332,27 @@ packaging = "*" [[package]] name = "dill" -version = "0.3.5.1" +version = "0.3.6" description = "serialize all of python" category = "dev" optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*, !=3.6.*" +python-versions = ">=3.7" [package.extras] graph = ["objgraph (>=1.7.2)"] [[package]] name = "distributed" -version = "2022.9.0" +version = "2022.12.0" description = "Distributed scheduler for Dask" category = "main" optional = false python-versions = ">=3.8" [package.dependencies] -click = ">=6.6" +click = ">=7.0" cloudpickle = ">=1.5.0" -dask = "2022.9.0" +dask = "2022.12.0" jinja2 = "*" locket = ">=1.0.0" msgpack = ">=0.6.0" @@ -365,14 +361,14 @@ psutil = ">=5.0" pyyaml = "*" sortedcontainers = "<2.0.0 || >2.0.0,<2.0.1 || >2.0.1" tblib = ">=1.6.0" -toolz = ">=0.8.2" -tornado = ">=6.0.3,<6.2" +toolz = ">=0.10.0" +tornado = ">=6.0.3" urllib3 = "*" zict = ">=0.1.3" [[package]] name = "docker" -version = "6.0.0" +version = "6.0.1" description = "A Python library for the Docker Engine API." 
category = "dev" optional = false @@ -411,9 +407,20 @@ six = ">=1.9.0" gmpy = ["gmpy"] gmpy2 = ["gmpy2"] +[[package]] +name = "exceptiongroup" +version = "1.0.4" +description = "Backport of PEP 654 (exception groups)" +category = "dev" +optional = false +python-versions = ">=3.7" + +[package.extras] +test = ["pytest (>=6)"] + [[package]] name = "fiona" -version = "1.8.21" +version = "1.8.22" description = "Fiona reads and writes spatial data files" category = "main" optional = false @@ -429,7 +436,7 @@ munch = "*" six = ">=1.7" [package.extras] -all = ["boto3 (>=1.2.4)", "pytest-cov", "shapely", "pytest (>=3)", "mock"] +all = ["pytest-cov", "shapely", "boto3 (>=1.2.4)", "pytest (>=3)", "mock"] calc = ["shapely"] s3 = ["boto3 (>=1.2.4)"] test = ["pytest (>=3)", "pytest-cov", "boto3 (>=1.2.4)", "mock"] @@ -449,7 +456,7 @@ pyflakes = ">=2.3.0,<2.4.0" [[package]] name = "fsspec" -version = "2022.8.2" +version = "2022.11.0" description = "File-system specification" category = "main" optional = false @@ -505,7 +512,7 @@ numpy = ">=1.14.5" [[package]] name = "harmony-service-lib" -version = "1.0.21" +version = "1.0.22" description = "A library for Python-based Harmony services to parse incoming messages, fetch data, stage data, and call back to Harmony" category = "main" optional = true @@ -546,7 +553,7 @@ python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" [[package]] name = "importlib-metadata" -version = "4.12.0" +version = "4.13.0" description = "Read metadata from Python packages" category = "main" optional = false @@ -556,21 +563,29 @@ python-versions = ">=3.7" zipp = ">=0.5" [package.extras] -docs = ["sphinx", "jaraco.packaging (>=9)", "rst.linker (>=1.9)"] +docs = ["sphinx (>=3.5)", "jaraco.packaging (>=9)", "rst.linker (>=1.9)", "furo", "jaraco.tidelift (>=1.4)"] perf = ["ipython"] -testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-flake8", "pytest-cov", "pytest-enabler (>=1.3)", "packaging", "pyfakefs", "flufl.flake8", "pytest-perf (>=0.9.2)", "pytest-black (>=0.3.7)", "pytest-mypy (>=0.9.1)", "importlib-resources (>=1.3)"] +testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-flake8", "flake8 (<5)", "pytest-cov", "pytest-enabler (>=1.3)", "packaging", "pyfakefs", "flufl.flake8", "pytest-perf (>=0.9.2)", "pytest-black (>=0.3.7)", "pytest-mypy (>=0.9.1)", "importlib-resources (>=1.3)"] + +[[package]] +name = "iniconfig" +version = "1.1.1" +description = "iniconfig: brain-dead simple config-ini parsing" +category = "dev" +optional = false +python-versions = "*" [[package]] name = "isort" -version = "5.10.1" +version = "5.11.1" description = "A Python utility / library to sort Python imports." 
category = "dev" optional = false -python-versions = ">=3.6.1,<4.0" +python-versions = ">=3.7.0" [package.extras] -pipfile_deprecated_finder = ["pipreqs", "requirementslib"] -requirements_deprecated_finder = ["pipreqs", "pip-api"] +pipfile-deprecated-finder = ["pipreqs", "requirementslib"] +requirements-deprecated-finder = ["pipreqs", "pip-api"] colors = ["colorama (>=0.4.3,<0.5.0)"] plugins = ["setuptools"] @@ -630,16 +645,16 @@ jsonpointer = ">=1.9" [[package]] name = "jsonpickle" -version = "2.2.0" +version = "3.0.0" description = "Python library for serializing any arbitrary object graph into JSON" category = "dev" optional = false -python-versions = ">=2.7" +python-versions = ">=3.7" [package.extras] docs = ["sphinx", "jaraco.packaging (>=3.2)", "rst.linker (>=1.9)"] -testing = ["pytest (>=3.5,!=3.7.3)", "pytest-checkdocs (>=1.2.3)", "pytest-black-multipy", "pytest-cov", "ecdsa", "feedparser", "numpy", "pandas", "pymongo", "scikit-learn", "sqlalchemy", "pytest-flake8 (<1.1.0)", "enum34", "jsonlib", "pytest-flake8 (>=1.1.1)"] -"testing.libs" = ["simplejson", "ujson", "yajl"] +testing = ["pytest (>=3.5,!=3.7.3)", "pytest-checkdocs (>=1.2.3)", "pytest-flake8 (>=1.1.1)", "pytest-black-multipy", "pytest-cov", "ecdsa", "feedparser", "gmpy2", "numpy", "pandas", "pymongo", "scikit-learn", "sqlalchemy"] +"testing.libs" = ["simplejson", "ujson"] [[package]] name = "jsonpointer" @@ -687,11 +702,11 @@ six = "*" [[package]] name = "lazy-object-proxy" -version = "1.7.1" +version = "1.8.0" description = "A fast and thorough lazy object proxy." category = "dev" optional = false -python-versions = ">=3.6" +python-versions = ">=3.7" [[package]] name = "locket" @@ -750,14 +765,6 @@ build = ["twine", "wheel", "blurb"] docs = ["sphinx"] test = ["pytest (<5.4)", "pytest-cov"] -[[package]] -name = "more-itertools" -version = "8.14.0" -description = "More routines for operating on iterables, beyond itertools" -category = "dev" -optional = false -python-versions = ">=3.5" - [[package]] name = "moto" version = "1.3.14" @@ -817,11 +824,11 @@ yaml = ["PyYAML (>=5.1.0)"] [[package]] name = "netcdf4" -version = "1.6.0" +version = "1.6.2" description = "Provides an object-oriented python interface to the netCDF version 4 library." category = "main" optional = false -python-versions = "*" +python-versions = ">=3.6" [package.dependencies] cftime = "*" @@ -829,7 +836,7 @@ numpy = ">=1.9" [[package]] name = "networkx" -version = "2.8.6" +version = "2.8.8" description = "Python package for creating and manipulating graphs and networks" category = "dev" optional = false @@ -837,14 +844,14 @@ python-versions = ">=3.8" [package.extras] default = ["numpy (>=1.19)", "scipy (>=1.8)", "matplotlib (>=3.4)", "pandas (>=1.3)"] -developer = ["pre-commit (>=2.20)", "mypy (>=0.961)"] -doc = ["sphinx (>=5)", "pydata-sphinx-theme (>=0.9)", "sphinx-gallery (>=0.10)", "numpydoc (>=1.4)", "pillow (>=9.1)", "nb2plots (>=0.6)", "texext (>=0.6.6)"] +developer = ["pre-commit (>=2.20)", "mypy (>=0.982)"] +doc = ["sphinx (>=5.2)", "pydata-sphinx-theme (>=0.11)", "sphinx-gallery (>=0.11)", "numpydoc (>=1.5)", "pillow (>=9.2)", "nb2plots (>=0.6)", "texext (>=0.6.6)"] extra = ["lxml (>=4.6)", "pygraphviz (>=1.9)", "pydot (>=1.4.2)", "sympy (>=1.10)"] -test = ["pytest (>=7.1)", "pytest-cov (>=3.0)", "codecov (>=2.1)"] +test = ["pytest (>=7.2)", "pytest-cov (>=4.0)", "codecov (>=2.1)"] [[package]] name = "numpy" -version = "1.23.2" +version = "1.23.5" description = "NumPy is the fundamental package for array computing with Python." 
category = "main" optional = false @@ -852,18 +859,15 @@ python-versions = ">=3.8" [[package]] name = "packaging" -version = "21.3" +version = "22.0" description = "Core utilities for Python packages" category = "main" optional = false -python-versions = ">=3.6" - -[package.dependencies] -pyparsing = ">=2.0.2,<3.0.5 || >3.0.5" +python-versions = ">=3.7" [[package]] name = "pandas" -version = "1.4.4" +version = "1.5.2" description = "Powerful data structures for data analysis, time series, and statistics" category = "main" optional = false @@ -871,10 +875,9 @@ python-versions = ">=3.8" [package.dependencies] numpy = [ - {version = ">=1.18.5", markers = "platform_machine != \"aarch64\" and platform_machine != \"arm64\" and python_version < \"3.10\""}, - {version = ">=1.19.2", markers = "platform_machine == \"aarch64\" and python_version < \"3.10\""}, - {version = ">=1.20.0", markers = "platform_machine == \"arm64\" and python_version < \"3.10\""}, + {version = ">=1.20.3", markers = "python_version < \"3.10\""}, {version = ">=1.21.0", markers = "python_version >= \"3.10\""}, + {version = ">=1.23.2", markers = "python_version >= \"3.11\""}, ] python-dateutil = ">=2.8.1" pytz = ">=2020.1" @@ -899,7 +902,7 @@ complete = ["blosc", "pyzmq", "pandas (>=0.19.0)", "numpy (>=1.9.0)"] [[package]] name = "pbr" -version = "5.10.0" +version = "5.11.0" description = "Python Build Reasonableness" category = "dev" optional = false @@ -907,7 +910,7 @@ python-versions = ">=2.6" [[package]] name = "pillow" -version = "9.2.0" +version = "9.3.0" description = "Python Imaging Library (Fork)" category = "main" optional = false @@ -919,30 +922,31 @@ tests = ["check-manifest", "coverage", "defusedxml", "markdown2", "olefile", "pa [[package]] name = "platformdirs" -version = "2.5.2" -description = "A small Python module for determining appropriate platform-specific dirs, e.g. a \"user data dir\"." +version = "2.6.0" +description = "A small Python package for determining appropriate platform-specific dirs, e.g. a \"user data dir\"." category = "dev" optional = false python-versions = ">=3.7" [package.extras] -docs = ["furo (>=2021.7.5b38)", "proselint (>=0.10.2)", "sphinx-autodoc-typehints (>=1.12)", "sphinx (>=4)"] -test = ["appdirs (==1.4.4)", "pytest-cov (>=2.7)", "pytest-mock (>=3.6)", "pytest (>=6)"] +docs = ["furo (>=2022.9.29)", "proselint (>=0.13)", "sphinx-autodoc-typehints (>=1.19.4)", "sphinx (>=5.3)"] +test = ["appdirs (==1.4.4)", "pytest-cov (>=4)", "pytest-mock (>=3.10)", "pytest (>=7.2)"] [[package]] name = "pluggy" -version = "0.13.1" +version = "1.0.0" description = "plugin and hook calling mechanisms for python" category = "dev" optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +python-versions = ">=3.6" [package.extras] dev = ["pre-commit", "tox"] +testing = ["pytest", "pytest-benchmark"] [[package]] name = "psutil" -version = "5.9.2" +version = "5.9.4" description = "Cross-platform lib for process and system monitoring in Python." 
category = "main" optional = false @@ -951,18 +955,10 @@ python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" [package.extras] test = ["ipaddress", "mock", "enum34", "pywin32", "wmi"] -[[package]] -name = "py" -version = "1.11.0" -description = "library with cross-python path, ini-parsing, io, code, log facilities" -category = "dev" -optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" - [[package]] name = "py-cpuinfo" -version = "8.0.0" -description = "Get CPU info with pure Python 2 & 3" +version = "9.0.0" +description = "Get CPU info with pure Python" category = "dev" optional = false python-versions = "*" @@ -1012,14 +1008,14 @@ plugins = ["importlib-metadata"] [[package]] name = "pylint" -version = "2.15.2" +version = "2.15.8" description = "python code static checker" category = "dev" optional = false python-versions = ">=3.7.2" [package.dependencies] -astroid = ">=2.12.9,<=2.14.0-dev0" +astroid = ">=2.12.13,<=2.14.0-dev0" colorama = {version = ">=0.4.5", markers = "sys_platform == \"win32\""} dill = ">=0.2" isort = ">=4.2.5,<6" @@ -1048,20 +1044,9 @@ cffi = ">=1.4.1" docs = ["sphinx (>=1.6.5)", "sphinx-rtd-theme"] tests = ["pytest (>=3.2.1,!=3.3.0)", "hypothesis (>=3.27.0)"] -[[package]] -name = "pyparsing" -version = "3.0.9" -description = "pyparsing module - Classes and methods to define and execute parsing grammars" -category = "main" -optional = false -python-versions = ">=3.6.8" - -[package.extras] -diagrams = ["railroad-diagrams", "jinja2"] - [[package]] name = "pyproj" -version = "3.3.1" +version = "3.4.0" description = "Python interface to PROJ (cartographic projections and coordinate transformations library)" category = "main" optional = false @@ -1072,7 +1057,7 @@ certifi = "*" [[package]] name = "pyrsistent" -version = "0.18.1" +version = "0.19.2" description = "Persistent/Functional/Immutable data structures" category = "dev" optional = false @@ -1094,33 +1079,31 @@ validation = ["jsonschema (==3.2.0)"] [[package]] name = "pytest" -version = "5.4.3" +version = "7.2.0" description = "pytest: simple powerful testing with Python" category = "dev" optional = false -python-versions = ">=3.5" +python-versions = ">=3.7" [package.dependencies] -atomicwrites = {version = ">=1.0", markers = "sys_platform == \"win32\""} -attrs = ">=17.4.0" +attrs = ">=19.2.0" colorama = {version = "*", markers = "sys_platform == \"win32\""} -more-itertools = ">=4.0.0" +exceptiongroup = {version = ">=1.0.0rc8", markers = "python_version < \"3.11\""} +iniconfig = "*" packaging = "*" -pluggy = ">=0.12,<1.0" -py = ">=1.5.0" -wcwidth = "*" +pluggy = ">=0.12,<2.0" +tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""} [package.extras] -checkqa-mypy = ["mypy (==v0.761)"] -testing = ["argcomplete", "hypothesis (>=3.56)", "mock", "nose", "requests", "xmlschema"] +testing = ["argcomplete", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "xmlschema"] [[package]] name = "pytest-benchmark" -version = "3.4.1" +version = "4.0.0" description = "A ``pytest`` fixture for benchmarking code. It will group the tests into rounds that are calibrated to the chosen timer." category = "dev" optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +python-versions = ">=3.7" [package.dependencies] py-cpuinfo = "*" @@ -1133,16 +1116,15 @@ histogram = ["pygal", "pygaljs"] [[package]] name = "pytest-cov" -version = "2.12.1" +version = "4.0.0" description = "Pytest plugin for measuring coverage." 
category = "dev" optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +python-versions = ">=3.6" [package.dependencies] -coverage = ">=5.2.1" +coverage = {version = ">=5.2.1", extras = ["toml"]} pytest = ">=4.6" -toml = "*" [package.extras] testing = ["fields", "hunter", "process-tests", "six", "pytest-xdist", "virtualenv"] @@ -1186,7 +1168,7 @@ python-versions = ">=3.5" [[package]] name = "pytz" -version = "2022.2.1" +version = "2022.6" description = "World timezone definitions, modern and historical" category = "main" optional = false @@ -1194,7 +1176,7 @@ python-versions = "*" [[package]] name = "pywin32" -version = "304" +version = "305" description = "Python for Window Extensions" category = "dev" optional = false @@ -1228,18 +1210,20 @@ use_chardet_on_py3 = ["chardet (>=3.0.2,<6)"] [[package]] name = "responses" -version = "0.21.0" +version = "0.22.0" description = "A utility library for mocking out the `requests` Python library." category = "dev" optional = false python-versions = ">=3.7" [package.dependencies] -requests = ">=2.0,<3.0" +requests = ">=2.22.0,<3.0" +toml = "*" +types-toml = "*" urllib3 = ">=1.25.10" [package.extras] -tests = ["pytest (>=7.0.0)", "coverage (>=6.0.0)", "pytest-cov", "pytest-asyncio", "pytest-localserver", "flake8", "types-mock", "types-requests", "mypy"] +tests = ["pytest (>=7.0.0)", "coverage (>=6.0.0)", "pytest-cov", "pytest-asyncio", "pytest-httpserver", "flake8", "types-requests", "mypy"] [[package]] name = "rsa" @@ -1280,7 +1264,7 @@ pbr = "*" [[package]] name = "shapely" -version = "1.8.4" +version = "1.8.5.post1" description = "Geometric objects, predicates, and operations" category = "main" optional = false @@ -1474,11 +1458,11 @@ python-versions = ">=3.7" [[package]] name = "tomlkit" -version = "0.11.4" +version = "0.11.6" description = "Style preserving TOML library" category = "dev" optional = false -python-versions = ">=3.6,<4.0" +python-versions = ">=3.6" [[package]] name = "toolz" @@ -1490,15 +1474,23 @@ python-versions = ">=3.5" [[package]] name = "tornado" -version = "6.1" +version = "6.2" description = "Tornado is a Python web framework and asynchronous networking library, originally developed at FriendFeed." category = "main" optional = false -python-versions = ">= 3.5" +python-versions = ">= 3.7" + +[[package]] +name = "types-toml" +version = "0.10.8.1" +description = "Typing stubs for toml" +category = "dev" +optional = false +python-versions = "*" [[package]] name = "typing-extensions" -version = "4.3.0" +version = "4.4.0" description = "Backported and Experimental Type Hints for Python 3.7+" category = "main" optional = false @@ -1506,28 +1498,20 @@ python-versions = ">=3.7" [[package]] name = "urllib3" -version = "1.26.12" +version = "1.26.13" description = "HTTP library with thread-safe connection pooling, file post, and more." 
category = "main" optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*, <4" +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*" [package.extras] brotli = ["brotlicffi (>=0.8.0)", "brotli (>=1.0.9)", "brotlipy (>=0.6.0)"] secure = ["pyOpenSSL (>=0.14)", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "certifi", "urllib3-secure-extra", "ipaddress"] socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"] -[[package]] -name = "wcwidth" -version = "0.2.5" -description = "Measures the displayed width of unicode strings in a terminal" -category = "dev" -optional = false -python-versions = "*" - [[package]] name = "websocket-client" -version = "1.4.1" +version = "1.4.2" description = "WebSocket client for Python with low level API options" category = "dev" optional = false @@ -1562,7 +1546,7 @@ python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,>=2.7" [[package]] name = "xarray" -version = "2022.6.0" +version = "2022.12.0" description = "N-D labeled arrays and datasets in Python" category = "main" optional = false @@ -1570,15 +1554,15 @@ python-versions = ">=3.8" [package.dependencies] dask = {version = "*", extras = ["complete"], optional = true, markers = "extra == \"parallel\""} -numpy = ">=1.19" -packaging = ">=20.0" -pandas = ">=1.2" +numpy = ">=1.20" +packaging = ">=21.3" +pandas = ">=1.3" [package.extras] accel = ["scipy", "bottleneck", "numbagg", "flox"] -complete = ["netcdf4", "h5netcdf", "scipy", "pydap", "zarr", "fsspec", "cftime", "rasterio", "cfgrib", "pooch", "bottleneck", "numbagg", "flox", "dask", "matplotlib", "seaborn", "nc-time-axis"] -docs = ["netcdf4", "h5netcdf", "scipy", "pydap", "zarr", "fsspec", "cftime", "rasterio", "cfgrib", "pooch", "bottleneck", "numbagg", "flox", "dask", "matplotlib", "seaborn", "nc-time-axis", "sphinx-autosummary-accessors", "sphinx-rtd-theme", "ipython", "ipykernel", "jupyter-client", "nbsphinx", "scanpydoc"] -io = ["netcdf4", "h5netcdf", "scipy", "pydap", "zarr", "fsspec", "cftime", "rasterio", "cfgrib", "pooch"] +complete = ["netcdf4", "h5netcdf", "scipy", "zarr", "fsspec", "cftime", "rasterio", "cfgrib", "pooch", "bottleneck", "numbagg", "flox", "dask", "matplotlib", "seaborn", "nc-time-axis", "pydap"] +docs = ["netcdf4", "h5netcdf", "scipy", "zarr", "fsspec", "cftime", "rasterio", "cfgrib", "pooch", "bottleneck", "numbagg", "flox", "dask", "matplotlib", "seaborn", "nc-time-axis", "sphinx-autosummary-accessors", "sphinx-rtd-theme", "ipython", "ipykernel", "jupyter-client", "nbsphinx", "scanpydoc", "pydap"] +io = ["netcdf4", "h5netcdf", "scipy", "zarr", "fsspec", "cftime", "rasterio", "cfgrib", "pooch", "pydap"] parallel = ["dask"] viz = ["matplotlib", "seaborn", "nc-time-axis"] @@ -1603,15 +1587,15 @@ heapdict = "*" [[package]] name = "zipp" -version = "3.8.1" +version = "3.11.0" description = "Backport of pathlib-compatible object wrapper for zip files" category = "main" optional = false python-versions = ">=3.7" [package.extras] -docs = ["sphinx", "jaraco.packaging (>=9)", "rst.linker (>=1.9)", "jaraco.tidelift (>=1.4)"] -testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-flake8", "pytest-cov", "pytest-enabler (>=1.3)", "jaraco.itertools", "func-timeout", "pytest-black (>=0.3.7)", "pytest-mypy (>=0.9.1)"] +docs = ["sphinx (>=3.5)", "jaraco.packaging (>=9)", "rst.linker (>=1.9)", "furo", "jaraco.tidelift (>=1.4)"] +testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "flake8 (<5)", "pytest-cov", "pytest-enabler (>=1.3)", "jaraco.itertools", "func-timeout", 
"jaraco.functools", "more-itertools", "pytest-black (>=0.3.7)", "pytest-mypy (>=0.9.1)", "pytest-flake8"] [extras] harmony = ["harmony-service-lib", "pystac"] @@ -1619,12 +1603,11 @@ harmony = ["harmony-service-lib", "pystac"] [metadata] lock-version = "1.1" python-versions = "^3.8" -content-hash = "a812b9c24f128e06197e201439794f7a28bb95055a72f928390326c92111bca9" +content-hash = "ae9d1d8198b4c7d46344bde0f75fa5028fa0f0e31aa97603648636a30e45bed5" [metadata.files] alabaster = [] astroid = [] -atomicwrites = [] attrs = [] aws-sam-translator = [] aws-xray-sdk = [] @@ -1653,6 +1636,7 @@ distributed = [] docker = [] docutils = [] ecdsa = [] +exceptiongroup = [] fiona = [] flake8 = [] fsspec = [] @@ -1663,6 +1647,7 @@ heapdict = [] idna = [] imagesize = [] importlib-metadata = [] +iniconfig = [] isort = [] jinja2 = [] jmespath = [] @@ -1681,7 +1666,6 @@ markupsafe = [] mccabe = [] mistune = [] mock = [] -more-itertools = [] moto = [] msgpack = [] munch = [] @@ -1696,7 +1680,6 @@ pillow = [] platformdirs = [] pluggy = [] psutil = [] -py = [] py-cpuinfo = [] pyasn1 = [] pycodestyle = [] @@ -1705,7 +1688,6 @@ pyflakes = [] pygments = [] pylint = [] pynacl = [] -pyparsing = [] pyproj = [] pyrsistent = [] pystac = [] @@ -1742,9 +1724,9 @@ tomli = [] tomlkit = [] toolz = [] tornado = [] +types-toml = [] typing-extensions = [] urllib3 = [] -wcwidth = [] websocket-client = [] werkzeug = [] wrapt = [] diff --git a/pyproject.toml b/pyproject.toml index d1307f10..f401b8b1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,12 +40,12 @@ h5py = "^3.6.0" cf-xarray = "*" [tool.poetry.dev-dependencies] -pytest = "^5.2" +pytest = "~7" flake8 = "^3.7" -pytest-cov = "^2.8" +pytest-cov = "~4" pylint = "^2.4" sphinx = "^4.4" -pytest-benchmark = "^3.2.3" +pytest-benchmark = "~4" moto = "1.3.14" jsonschema = "^3.2.0" m2r2 = "^0.3.1" diff --git a/tests/data/TEMPO_NO2-PROXY_L2_V01_20130731T232959Z_S015G06_partial.nc b/tests/data/TEMPO_NO2-PROXY_L2_V01_20130731T232959Z_S015G06_partial.nc new file mode 100644 index 00000000..99c20829 Binary files /dev/null and b/tests/data/TEMPO_NO2-PROXY_L2_V01_20130731T232959Z_S015G06_partial.nc differ diff --git a/tests/test_subset.py b/tests/test_subset.py index 161bbf36..6b1e726f 100644 --- a/tests/test_subset.py +++ b/tests/test_subset.py @@ -15,6 +15,12 @@ test_subset.py ============== Test the subsetter functionality. + +Unit tests for the L2 subsetter. These tests are all related to the +subsetting functionality itself, and should provide coverage on the +following files: + - podaac.subsetter.subset.py + - podaac.subsetter.xarray_enhancements.py """ import json import operator @@ -27,1803 +33,1693 @@ import geopandas as gpd import importlib_metadata +import netCDF4 import netCDF4 as nc import h5py import numpy as np import pandas as pd import pytest import xarray as xr +import urllib.parse from jsonschema import validate from shapely.geometry import Point +from unittest import TestCase from podaac.subsetter import subset +from podaac.subsetter.group_handling import GROUP_DELIM from podaac.subsetter.subset import SERVICE_NAME from podaac.subsetter import xarray_enhancements as xre from podaac.subsetter import dimension_cleanup as dc -class TestSubsetter(unittest.TestCase): - """ - Unit tests for the L2 subsetter. 
These tests are all related to the - subsetting functionality itself, and should provide coverage on the - following files: - - podaac.subsetter.subset.py - - podaac.subsetter.xarray_enhancements.py - """ - - @classmethod - def setUpClass(cls): - cls.test_dir = dirname(realpath(__file__)) - cls.test_data_dir = join(cls.test_dir, 'data') - cls.subset_output_dir = tempfile.mkdtemp(dir=cls.test_data_dir) - cls.test_files = [f for f in listdir(cls.test_data_dir) - if isfile(join(cls.test_data_dir, f)) and f.endswith(".nc")] - - cls.history_json_schema = { - "$schema": "https://json-schema.org/draft/2020-12/schema", - "$id": "https://harmony.earthdata.nasa.gov/history.schema.json", - "title": "Data Processing History", - "description": "A history record of processing that produced a given data file. For more information, see: https://wiki.earthdata.nasa.gov/display/TRT/In-File+Provenance+Metadata+-+TRT-42", - "type": ["array", "object"], - "items": {"$ref": "#/definitions/history_record"}, - - "definitions": { - "history_record": { - "type": "object", - "properties": { - "date_time": { - "description": "A Date/Time stamp in ISO-8601 format, including time-zone, GMT (or Z) preferred", - "type": "string", - "format": "date-time" - }, - "derived_from": { - "description": "List of source data files used in the creation of this data file", - "type": ["array", "string"], - "items": {"type": "string"} - }, - "program": { - "description": "The name of the program which generated this data file", - "type": "string" - }, - "version": { - "description": "The version identification of the program which generated this data file", - "type": "string" - }, - "parameters": { - "description": "The list of parameters to the program when generating this data file", - "type": ["array", "string"], - "items": {"type": "string"} - }, - "program_ref": { - "description": "A URL reference that defines the program, e.g., a UMM-S reference URL", - "type": "string" - }, - "$schema": { - "description": "The URL to this schema", - "type": "string" - } +@pytest.fixture(scope='class') +def data_dir(): + test_dir = dirname(realpath(__file__)) + return join(test_dir, 'data') + + +@pytest.fixture(scope='class') +def subset_output_dir(data_dir): + subset_output_dir = tempfile.mkdtemp(dir=data_dir) + yield subset_output_dir + shutil.rmtree(subset_output_dir) + + +@pytest.fixture(scope='class') +def history_json_schema(): + return { + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://harmony.earthdata.nasa.gov/history.schema.json", + "title": "Data Processing History", + "description": "A history record of processing that produced a given data file. 
For more information, see: https://wiki.earthdata.nasa.gov/display/TRT/In-File+Provenance+Metadata+-+TRT-42", + "type": ["array", "object"], + "items": {"$ref": "#/definitions/history_record"}, + + "definitions": { + "history_record": { + "type": "object", + "properties": { + "date_time": { + "description": "A Date/Time stamp in ISO-8601 format, including time-zone, GMT (or Z) preferred", + "type": "string", + "format": "date-time" + }, + "derived_from": { + "description": "List of source data files used in the creation of this data file", + "type": ["array", "string"], + "items": {"type": "string"} + }, + "program": { + "description": "The name of the program which generated this data file", + "type": "string" }, - "required": ["date_time", "program"], - "additionalProperties": False - } + "version": { + "description": "The version identification of the program which generated this data file", + "type": "string" + }, + "parameters": { + "description": "The list of parameters to the program when generating this data file", + "type": ["array", "string"], + "items": {"type": "string"} + }, + "program_ref": { + "description": "A URL reference that defines the program, e.g., a UMM-S reference URL", + "type": "string" + }, + "$schema": { + "description": "The URL to this schema", + "type": "string" + } + }, + "required": ["date_time", "program"], + "additionalProperties": False } } + } - @classmethod - def tearDownClass(cls): - # Remove the temporary directories used to house subset data - shutil.rmtree(cls.subset_output_dir) - - def test_subset_variables(self): - """ - Test that all variables present in the original NetCDF file - are present after the subset takes place, and with the same - attributes. - """ - - bbox = np.array(((-180, 90), (-90, 90))) - for file in self.test_files: - output_file = "{}_{}".format(self._testMethodName, file) - subset.subset( - file_to_subset=join(self.test_data_dir, file), - bbox=bbox, - output_file=join(self.subset_output_dir, output_file) - ) - - in_ds = xr.open_dataset(join(self.test_data_dir, file), - decode_times=False, - decode_coords=False) - out_ds = xr.open_dataset(join(self.subset_output_dir, output_file), - decode_times=False, - decode_coords=False) - - for in_var, out_var in zip(in_ds.data_vars.items(), out_ds.data_vars.items()): - # compare names - assert in_var[0] == out_var[0] - - # compare attributes - np.testing.assert_equal(in_var[1].attrs, out_var[1].attrs) - - # compare type and dimension names - assert in_var[1].dtype == out_var[1].dtype - assert in_var[1].dims == out_var[1].dims - - in_ds.close() - out_ds.close() - - - def test_subset_bbox(self): - """ - Test that all data present is within the bounding box given, - and that the correct bounding box is used. This test assumed - that the scanline *is* being cut. 
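For reference, the pattern used throughout the rewritten tests is the standard pytest one: scoped fixtures replace setUpClass/tearDownClass, and @pytest.mark.parametrize replaces the old "for file in self.test_files" loops by generating one test case per granule. A minimal, self-contained sketch of that idiom (all names here are hypothetical, not part of this suite):

import shutil
import tempfile

import pytest

INPUT_FILES = ["granule_a.nc", "granule_b.nc"]  # stand-in for TEST_DATA_FILES


@pytest.fixture(scope="module")
def output_dir():
    # Everything before `yield` is setup, everything after is teardown,
    # playing the role of the removed setUpClass/tearDownClass pair.
    out_dir = tempfile.mkdtemp()
    yield out_dir
    shutil.rmtree(out_dir)


@pytest.mark.parametrize("input_file", INPUT_FILES)
def test_per_file(input_file, output_dir, request):
    # request.node.name is unique per parametrized case, so per-test output
    # filenames built from it cannot collide.
    assert input_file in request.node.name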
- """ - - # pylint: disable=too-many-locals - bbox = np.array(((-180, 90), (-90, 90))) - for file in self.test_files: - output_file = "{}_{}".format(self._testMethodName, file) - subset.subset( - file_to_subset=join(self.test_data_dir, file), - bbox=bbox, - output_file=join(self.subset_output_dir, output_file) - ) - - out_ds = xr.open_dataset(join(self.subset_output_dir, output_file), - decode_times=False, - decode_coords=False, - mask_and_scale=False) - - lat_var_name, lon_var_name = subset.compute_coordinate_variable_names(out_ds) - - lat_var_name = lat_var_name[0] - lon_var_name = lon_var_name[0] - lon_bounds, lat_bounds = subset.convert_bbox(bbox, out_ds, lat_var_name, lon_var_name) +def data_files(): + test_dir = dirname(realpath(__file__)) + test_data_dir = join(test_dir, 'data') + return [f for f in listdir(test_data_dir) if isfile(join(test_data_dir, f)) and f.endswith(".nc")] - lats = out_ds[lat_var_name].values - lons = out_ds[lon_var_name].values - np.warnings.filterwarnings('ignore') +TEST_DATA_FILES = data_files() - # Step 1: Get mask of values which aren't in the bounds. - # For lon spatial condition, need to consider the - # lon_min > lon_max case. If that's the case, should do - # an 'or' instead. - oper = operator.and_ if lon_bounds[0] < lon_bounds[1] else operator.or_ +@pytest.mark.parametrize("test_file", TEST_DATA_FILES) +def test_subset_variables(test_file, data_dir, subset_output_dir, request): + """ + Test that all variables present in the original NetCDF file + are present after the subset takes place, and with the same + attributes. + """ - # In these two masks, True == valid and False == invalid - lat_truth = np.ma.masked_where((lats >= lat_bounds[0]) - & (lats <= lat_bounds[1]), lats).mask - lon_truth = np.ma.masked_where(oper((lons >= lon_bounds[0]), - (lons <= lon_bounds[1])), lons).mask + bbox = np.array(((-180, 90), (-90, 90))) + output_file = "{}_{}".format(request.node.name, test_file) + subset.subset( + file_to_subset=join(data_dir, test_file), + bbox=bbox, + output_file=join(subset_output_dir, output_file) + ) + + in_ds = xr.open_dataset(join(data_dir, test_file), + decode_times=False, + decode_coords=False) + out_ds = xr.open_dataset(join(subset_output_dir, output_file), + decode_times=False, + decode_coords=False) - # combine masks - spatial_mask = np.bitwise_and(lat_truth, lon_truth) + for in_var, out_var in zip(in_ds.data_vars.items(), out_ds.data_vars.items()): + # compare names + assert in_var[0] == out_var[0] - # Create a mask which represents the valid matrix bounds of - # the spatial mask. This is used in the case where a var - # has no _FillValue. - if lon_truth.ndim == 1: - bound_mask = spatial_mask - else: - rows = np.any(spatial_mask, axis=1) - cols = np.any(spatial_mask, axis=0) - bound_mask = np.array([[r & c for c in cols] for r in rows]) - - # If all the lat/lon values are valid, the file is valid and - # there is no need to check individual variables. - if np.all(spatial_mask): - continue - - # Step 2: Get mask of values which are NaN or "_FillValue in - # each variable. 
- for var_name, var in out_ds.data_vars.items(): - # remove dimension of '1' if necessary - vals = np.squeeze(var.values) - - # Get the Fill Value - fill_value = var.attrs.get('_FillValue') - - # If _FillValue isn't provided, check that all values - # are in the valid matrix bounds go to the next variable - if fill_value is None: - combined_mask = np.ma.mask_or(spatial_mask, bound_mask) - np.testing.assert_equal(bound_mask, combined_mask) - continue - - # If the shapes of this var doesn't match the mask, - # reshape the var so the comparison can be made. Take - # the first index of the unknown dims. This makes - # assumptions about the ordering of the dimensions. - if vals.shape != out_ds[lat_var_name].shape and vals.shape: - slice_list = [] - for dim in var.dims: - if dim in out_ds[lat_var_name].dims: - slice_list.append(slice(None)) - else: - slice_list.append(slice(0, 1)) - vals = np.squeeze(vals[tuple(slice_list)]) - - # Skip for byte type. - if vals.dtype == 'S1': - continue - - # In this mask, False == NaN and True = valid - var_mask = np.invert(np.ma.masked_invalid(vals).mask) - fill_mask = np.invert(np.ma.masked_values(vals, fill_value).mask) - - var_mask = np.bitwise_and(var_mask, fill_mask) - - if var_mask.shape != spatial_mask.shape: - # This may be a case where the time represents lines, - # or some other case where the variable doesn't share - # a shape with the coordinate variables. - continue - - # Step 3: Combine the spatial and var mask with 'or' - combined_mask = np.ma.mask_or(var_mask, spatial_mask) - - # Step 4: compare the newly combined mask and the - # spatial mask created from the lat/lon masks. They - # should be equal, because the 'or' of the two masks - # where out-of-bounds values are 'False' will leave - # those values assuming there are only NaN values - # in the data at those locations. - np.testing.assert_equal(spatial_mask, combined_mask) - - out_ds.close() - - @pytest.mark.skip(reason="This is being tested currently. Temporarily skipped.") - def test_subset_no_bbox(self): - """ - Test that the subsetted file is identical to the given file - when a 'full' bounding box is given. - """ + # compare attributes + np.testing.assert_equal(in_var[1].attrs, out_var[1].attrs) - bbox = np.array(((-180, 180), (-90, 90))) - for file in self.test_files: - output_file = "{}_{}".format(self._testMethodName, file) - subset.subset( - file_to_subset=join(self.test_data_dir, file), - bbox=bbox, - output_file=join(self.subset_output_dir, output_file) - ) + # compare type and dimension names + assert in_var[1].dtype == out_var[1].dtype + assert in_var[1].dims == out_var[1].dims - # pylint: disable=no-member - in_nc = nc.Dataset(join(self.test_data_dir, file), 'r') - out_nc = nc.Dataset(join(self.subset_output_dir, output_file), 'r') - - # Make sure the output dimensions match the input - # dimensions, which means the full file was returned. - for name, dimension in in_nc.dimensions.items(): - assert dimension.size == out_nc.dimensions[name].size - - in_nc.close() - out_nc.close() - - def test_subset_empty_bbox(self): - """ - Test that an empty file is returned when the bounding box - contains no data. 
- """ - - bbox = np.array(((120, 125), (-90, -85))) - for file in self.test_files: - output_file = "{}_{}".format(self._testMethodName, file) - subset.subset( - file_to_subset=join(self.test_data_dir, file), - bbox=bbox, - output_file=join(self.subset_output_dir, output_file) - ) - test_input_dataset = xr.open_dataset( - join(self.test_data_dir, file), - decode_times=False, - decode_coords=False, - mask_and_scale=False - ) - empty_dataset = xr.open_dataset( - join(self.subset_output_dir, output_file), - decode_times=False, - decode_coords=False, - mask_and_scale=False - ) + in_ds.close() + out_ds.close() - # Ensure all variables are present but empty. - for variable_name, variable in empty_dataset.data_vars.items(): - assert np.all(variable.data == variable.attrs.get('_FillValue', np.nan) or np.isnan(variable.data)) - - assert test_input_dataset.dims.keys() == empty_dataset.dims.keys() - - - def test_bbox_conversion(self): - """ - Test that the bounding box conversion returns expected - results. Expected results are hand-calculated. - """ - - ds_180 = xr.open_dataset(join(self.test_data_dir, - "MODIS_A-JPL-L2P-v2014.0.nc"), - decode_times=False, - decode_coords=False) - - ds_360 = xr.open_dataset(join( - self.test_data_dir, - "ascat_20150702_084200_metopa_45145_eps_o_250_2300_ovw.l2.nc"), - decode_times=False, - decode_coords=False) - - # Elements in each tuple are: - # ds type, lon_range, expected_result - test_bboxes = [ - (ds_180, (-180, 180), (-180, 180)), - (ds_360, (-180, 180), (0, 360)), - (ds_180, (-180, 0), (-180, 0)), - (ds_360, (-180, 0), (180, 360)), - (ds_180, (-80, 80), (-80, 80)), - (ds_360, (-80, 80), (280, 80)), - (ds_180, (0, 180), (0, 180)), - (ds_360, (0, 180), (0, 180)), - (ds_180, (80, -80), (80, -80)), - (ds_360, (80, -80), (80, 280)), - (ds_180, (-80, -80), (-180, 180)), - (ds_360, (-80, -80), (0, 360)) - ] - lat_var = 'lat' - lon_var = 'lon' - - for test_bbox in test_bboxes: - dataset = test_bbox[0] - lon_range = test_bbox[1] - expected_result = test_bbox[2] - actual_result, _ = subset.convert_bbox(np.array([lon_range, [0, 0]]), - dataset, lat_var, lon_var) - - np.testing.assert_equal(actual_result, expected_result) - - def compare_java(self, java_files, cut): - """ - Run the L2 subsetter and compare the result to the equivelant - legacy (Java) subsetter result. - Parameters - ---------- - java_files : list of strings - List of paths to each subsetted Java file. - cut : boolean - True if the subsetter should return compact. - """ - bbox_map = [("ascat_20150702_084200", ((-180, 0), (-90, 0))), - ("ascat_20150702_102400", ((-180, 0), (-90, 0))), - ("MODIS_A-JPL", ((65.8, 86.35), (40.1, 50.15))), - ("MODIS_T-JPL", ((-78.7, -60.7), (-54.8, -44))), - ("VIIRS", ((-172.3, -126.95), (62.3, 70.65))), - ("AMSR2-L2B_v08_r38622", ((-180, 0), (-90, 0)))] - - for file_str, bbox in bbox_map: - java_file = [file for file in java_files if file_str in file][0] - test_file = [file for file in self.test_files if file_str in file][0] - output_file = "{}_{}".format(self._testMethodName, test_file) - subset.subset( - file_to_subset=join(self.test_data_dir, test_file), - bbox=np.array(bbox), - output_file=join(self.subset_output_dir, output_file), - cut=cut - ) +@pytest.mark.parametrize("test_file", TEST_DATA_FILES) +def test_subset_bbox(test_file, data_dir, subset_output_dir, request): + """ + Test that all data present is within the bounding box given, + and that the correct bounding box is used. This test assumed + that the scanline *is* being cut. 
+ """ - j_ds = xr.open_dataset(join(self.test_data_dir, java_file), - decode_times=False, - decode_coords=False, - mask_and_scale=False) - - py_ds = xr.open_dataset(join(self.subset_output_dir, output_file), - decode_times=False, - decode_coords=False, - mask_and_scale=False) - - for var_name, var in j_ds.data_vars.items(): - # Compare shape - np.testing.assert_equal(var.shape, py_ds[var_name].shape) - - # Compare meta - np.testing.assert_equal(var.attrs, py_ds[var_name].attrs) - - # Compare data - np.testing.assert_equal(var.values, py_ds[var_name].values) - - # Compare meta. History will always be different, so remove - # from the headers for comparison. - del j_ds.attrs['history'] - del py_ds.attrs['history'] - del py_ds.attrs['history_json'] - np.testing.assert_equal(j_ds.attrs, py_ds.attrs) - - def test_compare_java_compact(self): - """ - Tests that the results of the subsetting operation is - equivalent to the Java subsetting result on the same bounding - box. For simplicity the subsetted Java granules have been - manually run and copied into this project. This test DOES - cut the scanline. - """ - - java_result_files = [join("java_results", "cut", f) for f in - listdir(join(self.test_data_dir, "java_results", "cut")) if - isfile(join(self.test_data_dir, "java_results", "cut", f)) - and f.endswith(".nc")] - - self.compare_java(java_result_files, cut=True) - - def test_compare_java(self): - """ - Tests that the results of the subsetting operation is - equivalent to the Java subsetting result on the same bounding - box. For simplicity the subsetted Java granules have been - manually run and copied into this project. This runs does NOT - cut the scanline. - """ - - java_result_files = [join("java_results", "uncut", f) for f in - listdir(join(self.test_data_dir, "java_results", "uncut")) if - isfile(join(self.test_data_dir, "java_results", "uncut", f)) - and f.endswith(".nc")] - - self.compare_java(java_result_files, cut=False) - - def test_history_metadata_append(self): - """ - Tests that the history metadata header is appended to when it - already exists. 
- """ - test_file = next(filter( - lambda f: '20180101005944-REMSS-L2P_GHRSST-SSTsubskin-AMSR2-L2B_rt_r29918-v02.0-fv01.0.nc' in f - , self.test_files)) - output_file = "{}_{}".format(self._testMethodName, test_file) - subset.subset( - file_to_subset=join(self.test_data_dir, test_file), - bbox=np.array(((-180, 180), (-90.0, 90))), - output_file=join(self.subset_output_dir, output_file) - ) + # pylint: disable=too-many-locals + bbox = np.array(((-180, 90), (-90, 90))) + output_file = "{}_{}".format(request.node.name, test_file) + subset_output_file = join(subset_output_dir, output_file) + subset.subset( + file_to_subset=join(data_dir, test_file), + bbox=bbox, + output_file=subset_output_file + ) + + out_ds, rename_vars, _ = subset.open_as_nc_dataset(subset_output_file) + out_ds = xr.open_dataset(xr.backends.NetCDF4DataStore(out_ds), + decode_times=False, + decode_coords=False, + mask_and_scale=False) - in_nc = xr.open_dataset(join(self.test_data_dir, test_file)) - out_nc = xr.open_dataset(join(self.subset_output_dir, output_file)) + lat_var_name, lon_var_name = subset.compute_coordinate_variable_names(out_ds) + + lat_var_name = lat_var_name[0] + lon_var_name = lon_var_name[0] + + lon_bounds, lat_bounds = subset.convert_bbox(bbox, out_ds, lat_var_name, lon_var_name) + + lats = out_ds[lat_var_name].values + lons = out_ds[lon_var_name].values + + np.warnings.filterwarnings('ignore') + + # Step 1: Get mask of values which aren't in the bounds. + + # For lon spatial condition, need to consider the + # lon_min > lon_max case. If that's the case, should do + # an 'or' instead. + oper = operator.and_ if lon_bounds[0] < lon_bounds[1] else operator.or_ + + # In these two masks, True == valid and False == invalid + lat_truth = np.ma.masked_where((lats >= lat_bounds[0]) + & (lats <= lat_bounds[1]), lats).mask + lon_truth = np.ma.masked_where(oper((lons >= lon_bounds[0]), + (lons <= lon_bounds[1])), lons).mask + + # combine masks + spatial_mask = np.bitwise_and(lat_truth, lon_truth) + + # Create a mask which represents the valid matrix bounds of + # the spatial mask. This is used in the case where a var + # has no _FillValue. + if lon_truth.ndim == 1: + bound_mask = spatial_mask + else: + rows = np.any(spatial_mask, axis=1) + cols = np.any(spatial_mask, axis=0) + bound_mask = np.array([[r & c for c in cols] for r in rows]) + + # If all the lat/lon values are valid, the file is valid and + # there is no need to check individual variables. + if np.all(spatial_mask): + return + + # Step 2: Get mask of values which are NaN or "_FillValue in + # each variable. + for var_name, var in out_ds.data_vars.items(): + # remove dimension of '1' if necessary + vals = np.squeeze(var.values) + + # Get the Fill Value + fill_value = var.attrs.get('_FillValue') + + # If _FillValue isn't provided, check that all values + # are in the valid matrix bounds go to the next variable + if fill_value is None: + combined_mask = np.ma.mask_or(spatial_mask, bound_mask) + np.testing.assert_equal(bound_mask, combined_mask) + continue + + # If the shapes of this var doesn't match the mask, + # reshape the var so the comparison can be made. Take + # the first index of the unknown dims. This makes + # assumptions about the ordering of the dimensions. 
+ if vals.shape != out_ds[lat_var_name].shape and vals.shape: + slice_list = [] + for dim in var.dims: + if dim in out_ds[lat_var_name].dims: + slice_list.append(slice(None)) + else: + slice_list.append(slice(0, 1)) + vals = np.squeeze(vals[tuple(slice_list)]) - # Assert that the original granule contains history - assert in_nc.attrs.get('history') is not None + # Skip for byte type. + if vals.dtype == 'S1': + continue - # Assert that input and output files have different history - self.assertNotEqual(in_nc.attrs['history'], out_nc.attrs['history']) + # In this mask, False == NaN and True = valid + var_mask = np.invert(np.ma.masked_invalid(vals).mask) + fill_mask = np.invert(np.ma.masked_values(vals, fill_value).mask) - # Assert that last line of history was created by this service - assert SERVICE_NAME in out_nc.attrs['history'].split('\n')[-1] + var_mask = np.bitwise_and(var_mask, fill_mask) - # Assert that the old history is still in the subsetted granule - assert in_nc.attrs['history'] in out_nc.attrs['history'] + if var_mask.shape != spatial_mask.shape: + # This may be a case where the time represents lines, + # or some other case where the variable doesn't share + # a shape with the coordinate variables. + continue - def test_history_metadata_create(self): - """ - Tests that the history metadata header is created when it does - not exist. All test granules contain this header already, so - for this test the header will be removed manually from a granule. - """ - test_file = next(filter( - lambda f: '20180101005944-REMSS-L2P_GHRSST-SSTsubskin-AMSR2-L2B_rt_r29918-v02.0-fv01.0.nc' in f - , self.test_files)) - output_file = "{}_{}".format(self._testMethodName, test_file) + # Step 3: Combine the spatial and var mask with 'or' + combined_mask = np.ma.mask_or(var_mask, spatial_mask) - # Remove the 'history' metadata from the granule - in_nc = xr.open_dataset(join(self.test_data_dir, test_file)) - del in_nc.attrs['history'] - in_nc.to_netcdf(join(self.subset_output_dir, 'int_{}'.format(output_file)), 'w') + # Step 4: compare the newly combined mask and the + # spatial mask created from the lat/lon masks. They + # should be equal, because the 'or' of the two masks + # where out-of-bounds values are 'False' will leave + # those values assuming there are only NaN values + # in the data at those locations. + np.testing.assert_equal(spatial_mask, combined_mask) - subset.subset( - file_to_subset=join(self.subset_output_dir, "int_{}".format(output_file)), - bbox=np.array(((-180, 180), (-90.0, 90))), - output_file=join(self.subset_output_dir, output_file) - ) + out_ds.close() - out_nc = xr.open_dataset(join(self.subset_output_dir, output_file)) - # Assert that the input granule contains no history - assert in_nc.attrs.get('history') is None +@pytest.mark.parametrize("test_file", TEST_DATA_FILES) +def test_subset_empty_bbox(test_file, data_dir, subset_output_dir, request): + """ + Test that an empty file is returned when the bounding box + contains no data. 
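The mask bookkeeping in test_subset_bbox above boils down to one invariant: valid (non-fill, non-NaN) data may only appear where the lat/lon mask says the point is inside the bounding box, so OR-ing the variable mask into the spatial mask must not add any True cells. A tiny stand-alone illustration of that check with toy numpy arrays (values made up, not taken from any granule):

import numpy as np

# True == inside the requested bounding box, for four toy points.
spatial_mask = np.array([True, True, False, False])

# True == valid data in some variable, subset the same way. Valid data
# only occurs inside the bbox; fill values may occur anywhere.
var_mask = np.array([True, False, False, False])

# If the subset worked, OR-ing the two masks changes nothing.
combined_mask = np.ma.mask_or(var_mask, spatial_mask)
np.testing.assert_equal(spatial_mask, combined_mask)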
+ """ - # Assert that the history was created by this service - assert SERVICE_NAME in out_nc.attrs['history'] + bbox = np.array(((120, 125), (-90, -85))) + output_file = "{}_{}".format(request.node.name, test_file) + subset.subset( + file_to_subset=join(data_dir, test_file), + bbox=bbox, + output_file=join(subset_output_dir, output_file) + ) + test_input_dataset = xr.open_dataset( + join(data_dir, test_file), + decode_times=False, + decode_coords=False, + mask_and_scale=False + ) + empty_dataset = xr.open_dataset( + join(subset_output_dir, output_file), + decode_times=False, + decode_coords=False, + mask_and_scale=False + ) + + # Ensure all variables are present but empty. + for variable_name, variable in empty_dataset.data_vars.items(): + assert np.all(variable.data == variable.attrs.get('_FillValue', np.nan) or np.isnan(variable.data)) + + assert test_input_dataset.dims.keys() == empty_dataset.dims.keys() + + +def test_bbox_conversion(data_dir): + """ + Test that the bounding box conversion returns expected + results. Expected results are hand-calculated. + """ - # Assert that the history created by this service is the only - # line present in the history. - assert '\n' not in out_nc.attrs['history'] + ds_180 = xr.open_dataset(join(data_dir, "MODIS_A-JPL-L2P-v2014.0.nc"), + decode_times=False, + decode_coords=False) + + ds_360 = xr.open_dataset(join( + data_dir, + "ascat_20150702_084200_metopa_45145_eps_o_250_2300_ovw.l2.nc"), + decode_times=False, + decode_coords=False) + + # Elements in each tuple are: + # ds type, lon_range, expected_result + test_bboxes = [ + (ds_180, (-180, 180), (-180, 180)), + (ds_360, (-180, 180), (0, 360)), + (ds_180, (-180, 0), (-180, 0)), + (ds_360, (-180, 0), (180, 360)), + (ds_180, (-80, 80), (-80, 80)), + (ds_360, (-80, 80), (280, 80)), + (ds_180, (0, 180), (0, 180)), + (ds_360, (0, 180), (0, 180)), + (ds_180, (80, -80), (80, -80)), + (ds_360, (80, -80), (80, 280)), + (ds_180, (-80, -80), (-180, 180)), + (ds_360, (-80, -80), (0, 360)) + ] + + lat_var = 'lat' + lon_var = 'lon' + + for test_bbox in test_bboxes: + dataset = test_bbox[0] + lon_range = test_bbox[1] + expected_result = test_bbox[2] + actual_result, _ = subset.convert_bbox(np.array([lon_range, [0, 0]]), + dataset, lat_var, lon_var) + + np.testing.assert_equal(actual_result, expected_result) + + +def compare_java(test_file, cut, data_dir, subset_output_dir, request): + """ + Run the L2 subsetter and compare the result to the equivelant + legacy (Java) subsetter result. + Parameters + ---------- + test_file : str + path to test file. + cut : boolean + True if the subsetter should return compact. 
+ """ + bbox_map = [("ascat_20150702_084200", ((-180, 0), (-90, 0))), + ("ascat_20150702_102400", ((-180, 0), (-90, 0))), + ("MODIS_A-JPL", ((65.8, 86.35), (40.1, 50.15))), + ("MODIS_T-JPL", ((-78.7, -60.7), (-54.8, -44))), + ("VIIRS", ((-172.3, -126.95), (62.3, 70.65))), + ("AMSR2-L2B_v08_r38622", ((-180, 0), (-90, 0)))] + + java_files_dir = join(data_dir, "java_results", "cut" if cut else "uncut") + + java_files = [join(java_files_dir, f) for f in listdir(java_files_dir) if + isfile(join(java_files_dir, f)) and f.endswith(".nc")] + + file, bbox = next(iter([b for b in bbox_map if b[0] in test_file])) + java_file = next(iter([f for f in java_files if file in f])) + + output_file = "{}_{}".format(urllib.parse.quote_plus(request.node.name), test_file) + subset.subset( + file_to_subset=join(data_dir, test_file), + bbox=np.array(bbox), + output_file=join(subset_output_dir, output_file), + cut=cut + ) + + j_ds = xr.open_dataset(join(data_dir, java_file), + decode_times=False, + decode_coords=False, + mask_and_scale=False) + + py_ds = xr.open_dataset(join(subset_output_dir, output_file), + decode_times=False, + decode_coords=False, + mask_and_scale=False) + + for var_name, var in j_ds.data_vars.items(): + # Compare shape + np.testing.assert_equal(var.shape, py_ds[var_name].shape) + + # Compare meta + np.testing.assert_equal(var.attrs, py_ds[var_name].attrs) + + # Compare data + np.testing.assert_equal(var.values, py_ds[var_name].values) + + # Compare meta. History will always be different, so remove + # from the headers for comparison. + del j_ds.attrs['history'] + del py_ds.attrs['history'] + del py_ds.attrs['history_json'] + np.testing.assert_equal(j_ds.attrs, py_ds.attrs) + + +@pytest.mark.parametrize("test_file", [ + "ascat_20150702_084200_metopa_45145_eps_o_250_2300_ovw.l2.nc", + "ascat_20150702_102400_metopa_45146_eps_o_250_2300_ovw.l2.nc", + "MODIS_A-JPL-L2P-v2014.0.nc", + "MODIS_T-JPL-L2P-v2014.0.nc", + "VIIRS_NPP-NAVO-L2P-v3.0.nc", + "AMSR2-L2B_v08_r38622-v02.0-fv01.0.nc" +]) +def test_compare_java_compact(test_file, data_dir, subset_output_dir, request): + """ + Tests that the results of the subsetting operation is + equivalent to the Java subsetting result on the same bounding + box. For simplicity the subsetted Java granules have been + manually run and copied into this project. This test DOES + cut the scanline. + """ - def test_specified_variables(self): - """ - Test that the variables which are specified when calling the subset - operation are present in the resulting subsetted data file, - and that the variables which are specified are not present. 
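Note that the output filename in compare_java above is built from request.node.name, which for a parametrized test contains characters such as '[' and ']'; urllib.parse.quote_plus percent-encodes them so the name stays filesystem-friendly. A quick illustration (the node name below is only an example):

import urllib.parse

node_name = "test_compare_java_compact[MODIS_A-JPL-L2P-v2014.0.nc]"
print(urllib.parse.quote_plus(node_name))
# test_compare_java_compact%5BMODIS_A-JPL-L2P-v2014.0.nc%5D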
- """ - bbox = np.array(((-180, 180), (-90, 90))) - for file in self.test_files: - output_file = "{}_{}".format(self._testMethodName, file) + compare_java(test_file, True, data_dir, subset_output_dir, request) - in_ds = xr.open_dataset(join(self.test_data_dir, file), - decode_times=False, - decode_coords=False) - included_variables = set([variable[0] for variable in in_ds.data_vars.items()][::2]) - included_variables = list(included_variables) +@pytest.mark.parametrize("test_file", [ + "ascat_20150702_084200_metopa_45145_eps_o_250_2300_ovw.l2.nc", + "ascat_20150702_102400_metopa_45146_eps_o_250_2300_ovw.l2.nc", + "MODIS_A-JPL-L2P-v2014.0.nc", + "MODIS_T-JPL-L2P-v2014.0.nc", + "VIIRS_NPP-NAVO-L2P-v3.0.nc", + "AMSR2-L2B_v08_r38622-v02.0-fv01.0.nc" +]) +def test_compare_java(test_file, data_dir, subset_output_dir, request): + """ + Tests that the results of the subsetting operation is + equivalent to the Java subsetting result on the same bounding + box. For simplicity the subsetted Java granules have been + manually run and copied into this project. This runs does NOT + cut the scanline. + """ - excluded_variables = list(set(variable[0] for variable in in_ds.data_vars.items()) - - set(included_variables)) + compare_java(test_file, False, data_dir, subset_output_dir, request) - subset.subset( - file_to_subset=join(self.test_data_dir, file), - bbox=bbox, - output_file=join(self.subset_output_dir, output_file), - variables=included_variables - ) - # Get coord variables - time_var_name = [] - lat_var_names, lon_var_names = subset.compute_coordinate_variable_names(in_ds) - lat_var_name = lat_var_names[0] - lon_var_name = lon_var_names[0] - time_var_name = subset.compute_time_variable_name(in_ds, in_ds[lat_var_name]) - - included_variables.append(lat_var_name) - included_variables.append(lon_var_name) - included_variables.append(time_var_name) - included_variables.extend(in_ds.coords.keys()) - - if lat_var_name in excluded_variables: - excluded_variables.remove(lat_var_name) - if lon_var_name in excluded_variables: - excluded_variables.remove(lon_var_name) - if time_var_name in excluded_variables: - excluded_variables.remove(time_var_name) - - out_ds = xr.open_dataset(join(self.subset_output_dir, output_file), - decode_times=False, - decode_coords=False) - - out_vars = [out_var for out_var in out_ds.data_vars.keys()] - out_vars.extend(out_ds.coords.keys()) - - assert set(out_vars) == set(included_variables) - assert set(out_vars).isdisjoint(excluded_variables) - - in_ds.close() - out_ds.close() - - def test_calculate_chunks(self): - """ - Test that the calculate chunks function in the subset module - correctly calculates and returns the chunks dims dictionary. - """ - rs = np.random.RandomState(0) - dataset = xr.DataArray( - rs.randn(2, 4000, 4001), - dims=['x', 'y', 'z'] - ).to_dataset(name='foo') - - chunk_dict = subset.calculate_chunks(dataset) - - assert chunk_dict.get('x') is None - assert chunk_dict.get('y') is None - assert chunk_dict.get('z') == 4000 - - def test_missing_coord_vars(self): - """ - As of right now, the subsetter expects the data to contain lat - and lon variables. If not present, an error is thrown. - """ - file = 'MODIS_T-JPL-L2P-v2014.0.nc' - ds = xr.open_dataset(join(self.test_data_dir, file), - decode_times=False, - decode_coords=False, - mask_and_scale=False) +def test_history_metadata_append(data_dir, subset_output_dir, request): + """ + Tests that the history metadata header is appended to when it + already exists. 
+ """ + test_file = next(filter( + lambda f: '20180101005944-REMSS-L2P_GHRSST-SSTsubskin-AMSR2-L2B_rt_r29918-v02.0-fv01.0.nc' in f + , TEST_DATA_FILES)) + output_file = "{}_{}".format(request.node.name, test_file) + subset.subset( + file_to_subset=join(data_dir, test_file), + bbox=np.array(((-180, 180), (-90.0, 90))), + output_file=join(subset_output_dir, output_file) + ) - # Manually remove var which will cause error when attempting - # to subset. - ds = ds.drop_vars(['lat']) + in_nc = xr.open_dataset(join(data_dir, test_file)) + out_nc = xr.open_dataset(join(subset_output_dir, output_file)) - output_file = '{}_{}'.format('missing_coords', file) - ds.to_netcdf(join(self.subset_output_dir, output_file)) + # Assert that the original granule contains history + assert in_nc.attrs.get('history') is not None - bbox = np.array(((-180, 180), (-90, 90))) + # Assert that input and output files have different history + assert in_nc.attrs['history'] != out_nc.attrs['history'] - with pytest.raises(ValueError): - subset.subset( - file_to_subset=join(self.subset_output_dir, output_file), - bbox=bbox, - output_file='' - ) + # Assert that last line of history was created by this service + assert SERVICE_NAME in out_nc.attrs['history'].split('\n')[-1] - def test_data_1D(self): - """ - Test that subsetting a 1-D granule does not result in failure. - """ - merged_jason_filename = 'JA1_GPN_2PeP001_002_20020115_060706_20020115_070316.nc' - output_file = "{}_{}".format(self._testMethodName, merged_jason_filename) + # Assert that the old history is still in the subsetted granule + assert in_nc.attrs['history'] in out_nc.attrs['history'] - subset.subset( - file_to_subset=join(self.test_data_dir, merged_jason_filename), - bbox=np.array(((-180, 0), (-90, 0))), - output_file=join(self.subset_output_dir, output_file) - ) - xr.open_dataset(join(self.subset_output_dir, output_file)) +def test_history_metadata_create(data_dir, subset_output_dir, request): + """ + Tests that the history metadata header is created when it does + not exist. All test granules contain this header already, so + for this test the header will be removed manually from a granule. 
+ """ + test_file = next(filter( + lambda f: '20180101005944-REMSS-L2P_GHRSST-SSTsubskin-AMSR2-L2B_rt_r29918-v02.0-fv01.0.nc' in f + , TEST_DATA_FILES)) + output_file = "{}_{}".format(request.node.name, test_file) - def test_get_coord_variable_names(self): - """ - Test that the expected coord variable names are returned - """ - file = 'MODIS_T-JPL-L2P-v2014.0.nc' - ds = xr.open_dataset(join(self.test_data_dir, file), - decode_times=False, - decode_coords=False, - mask_and_scale=False) + # Remove the 'history' metadata from the granule + in_nc = xr.open_dataset(join(data_dir, test_file)) + del in_nc.attrs['history'] + in_nc.to_netcdf(join(subset_output_dir, 'int_{}'.format(output_file)), 'w') - old_lat_var_name = 'lat' - old_lon_var_name = 'lon' + subset.subset( + file_to_subset=join(subset_output_dir, "int_{}".format(output_file)), + bbox=np.array(((-180, 180), (-90.0, 90))), + output_file=join(subset_output_dir, output_file) + ) - lat_var_name, lon_var_name = subset.compute_coordinate_variable_names(ds) + out_nc = xr.open_dataset(join(subset_output_dir, output_file)) - assert lat_var_name[0] == old_lat_var_name - assert lon_var_name[0] == old_lon_var_name + # Assert that the input granule contains no history + assert in_nc.attrs.get('history') is None - new_lat_var_name = 'latitude' - new_lon_var_name = 'x' - ds = ds.rename({old_lat_var_name: new_lat_var_name, - old_lon_var_name: new_lon_var_name}) + # Assert that the history was created by this service + assert SERVICE_NAME in out_nc.attrs['history'] - lat_var_name, lon_var_name = subset.compute_coordinate_variable_names(ds) + # Assert that the history created by this service is the only + # line present in the history. + assert '\n' not in out_nc.attrs['history'] - assert lat_var_name[0] == new_lat_var_name - assert lon_var_name[0] == new_lon_var_name - def test_cannot_get_coord_variable_names(self): - """ - Test that, when given a dataset with coord vars which are not - expected, a ValueError is raised. - """ - file = 'MODIS_T-JPL-L2P-v2014.0.nc' - ds = xr.open_dataset(join(self.test_data_dir, file), +@pytest.mark.parametrize("test_file", TEST_DATA_FILES) +def test_specified_variables(test_file, data_dir, subset_output_dir, request): + """ + Test that the variables which are specified when calling the subset + operation are present in the resulting subsetted data file, + and that the variables which are specified are not present. 
+ """ + bbox = np.array(((-180, 180), (-90, 90))) + output_file = "{}_{}".format(request.node.name, test_file) + + in_ds, rename_vars, _ = subset.open_as_nc_dataset(join(data_dir, test_file)) + in_ds = xr.open_dataset(xr.backends.NetCDF4DataStore(in_ds), + decode_times=False, + decode_coords=False) + # Non-data vars are by default included in the result + non_data_vars = set(in_ds.variables.keys()) - set(in_ds.data_vars.keys()) + + # Coordinate variables are always included in the result + lat_var_names, lon_var_names, time_var_names = subset.get_coordinate_variable_names(in_ds) + coordinate_variables = lat_var_names + lon_var_names + time_var_names + + # Pick some variable to include in the result + included_variables = set([variable[0] for variable in in_ds.data_vars.items()][::2]) + included_variables = list(included_variables) + + # All other data variables should be dropped + expected_excluded_variables = list(set(variable[0] for variable in in_ds.data_vars.items()) + - set(included_variables) - set(coordinate_variables)) + + subset.subset( + file_to_subset=join(data_dir, test_file), + bbox=bbox, + output_file=join(subset_output_dir, output_file), + variables=[var.replace(GROUP_DELIM, '/') for var in included_variables] + ) + + out_ds, rename_vars, _ = subset.open_as_nc_dataset(join(subset_output_dir, output_file)) + out_ds = xr.open_dataset(xr.backends.NetCDF4DataStore(out_ds), decode_times=False, - decode_coords=False, - mask_and_scale=False) + decode_coords=False) - old_lat_var_name = 'lat' - new_lat_var_name = 'foo' - - ds = ds.rename({old_lat_var_name: new_lat_var_name}) - # Remove 'coordinates' attribute - for var_name, var in ds.items(): - if 'coordinates' in var.attrs: - del var.attrs['coordinates'] - - self.assertRaises(ValueError, subset.compute_coordinate_variable_names, ds) - - def test_get_spatial_bounds(self): - """ - Test that the get_spatial_bounds function works as expected. - The get_spatial_bounds function should return lat/lon min/max - which is masked and scaled for both variables. The values - should also be adjusted for -180,180/-90,90 coordinate types - """ - ascat_filename = 'ascat_20150702_084200_metopa_45145_eps_o_250_2300_ovw.l2.nc' - ghrsst_filename = '20190927000500-JPL-L2P_GHRSST-SSTskin-MODIS_A-D-v02.0-fv01.0.nc' - - ascat_dataset = xr.open_dataset( - join(self.test_data_dir, ascat_filename), - decode_times=False, - decode_coords=False, - mask_and_scale=False - ) - ghrsst_dataset = xr.open_dataset( - join(self.test_data_dir, ghrsst_filename), - decode_times=False, - decode_coords=False, - mask_and_scale=False - ) + out_vars = [out_var for out_var in out_ds.variables.keys()] - # ascat1 longitude is -0 360, ghrsst modis A is -180 180 - # Both have metadata for valid_min - - # Manually calculated spatial bounds - ascat_expected_lat_min = -89.4 - ascat_expected_lat_max = 89.2 - ascat_expected_lon_min = -180.0 - ascat_expected_lon_max = 180.0 - - ghrsst_expected_lat_min = -77.2 - ghrsst_expected_lat_max = -53.6 - ghrsst_expected_lon_min = -170.5 - ghrsst_expected_lon_max = -101.7 - - min_lon, max_lon, min_lat, max_lat = subset.get_spatial_bounds( - dataset=ascat_dataset, - lat_var_names=['lat'], - lon_var_names=['lon'] - ).flatten() - - assert np.isclose(min_lat, ascat_expected_lat_min) - assert np.isclose(max_lat, ascat_expected_lat_max) - assert np.isclose(min_lon, ascat_expected_lon_min) - assert np.isclose(max_lon, ascat_expected_lon_max) - - # Remove the label from the dataset coordinate variables indicating the valid_min. 
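In test_specified_variables above, the variable names come from the flattened view of the granule, so group members carry the double-underscore GROUP_DELIM instead of '/', and the test swaps the delimiter back before handing the list to subset.subset. A rough illustration of that mapping (the example name is made up, and whether a leading delimiter appears depends on how the flattening walk seeds the root path):

GROUP_DELIM = '__'

flattened_name = '__data_01__ku__range_ocean'  # name as seen in the flattened dataset
group_path = flattened_name.replace(GROUP_DELIM, '/')

print(group_path)  # /data_01/ku/range_ocean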
- del ascat_dataset['lat'].attrs['valid_min'] - del ascat_dataset['lon'].attrs['valid_min'] - - min_lon, max_lon, min_lat, max_lat = subset.get_spatial_bounds( - dataset=ascat_dataset, - lat_var_names=['lat'], - lon_var_names=['lon'] - ).flatten() - - assert np.isclose(min_lat, ascat_expected_lat_min) - assert np.isclose(max_lat, ascat_expected_lat_max) - assert np.isclose(min_lon, ascat_expected_lon_min) - assert np.isclose(max_lon, ascat_expected_lon_max) - - # Repeat test, but with GHRSST granule - - min_lon, max_lon, min_lat, max_lat = subset.get_spatial_bounds( - dataset=ghrsst_dataset, - lat_var_names=['lat'], - lon_var_names=['lon'] - ).flatten() - - assert np.isclose(min_lat, ghrsst_expected_lat_min) - assert np.isclose(max_lat, ghrsst_expected_lat_max) - assert np.isclose(min_lon, ghrsst_expected_lon_min) - assert np.isclose(max_lon, ghrsst_expected_lon_max) - - # Remove the label from the dataset coordinate variables indicating the valid_min. - - del ghrsst_dataset['lat'].attrs['valid_min'] - del ghrsst_dataset['lon'].attrs['valid_min'] - - min_lon, max_lon, min_lat, max_lat = subset.get_spatial_bounds( - dataset=ghrsst_dataset, - lat_var_names=['lat'], - lon_var_names=['lon'] - ).flatten() - - assert np.isclose(min_lat, ghrsst_expected_lat_min) - assert np.isclose(max_lat, ghrsst_expected_lat_max) - assert np.isclose(min_lon, ghrsst_expected_lon_min) - assert np.isclose(max_lon, ghrsst_expected_lon_max) - - def test_shapefile_subset(self): - """ - Test that using a shapefile to subset data instead of a bbox - works as expected - """ - shapefile = 'test.shp' - ascat_filename = 'ascat_20150702_084200_metopa_45145_eps_o_250_2300_ovw.l2.nc' - output_filename = f'{self._testMethodName}_{ascat_filename}' - - shapefile_file_path = join(self.test_data_dir, 'test_shapefile_subset', shapefile) - ascat_file_path = join(self.test_data_dir, ascat_filename) - output_file_path = join(self.subset_output_dir, output_filename) + assert set(out_vars) == set(included_variables + coordinate_variables).union(non_data_vars) + assert set(out_vars).isdisjoint(expected_excluded_variables) - subset.subset( - file_to_subset=ascat_file_path, - bbox=None, - output_file=output_file_path, - shapefile=shapefile_file_path - ) + in_ds.close() + out_ds.close() - # Check that each point of data is within the shapefile - shapefile_df = gpd.read_file(shapefile_file_path) - with xr.open_dataset(output_file_path) as result_dataset: - def in_shape(lon, lat): - if np.isnan(lon) or np.isnan(lat): - return - point = Point(lon, lat) - point_in_shapefile = shapefile_df.contains(point) - assert point_in_shapefile[0] - - in_shape_vec = np.vectorize(in_shape) - in_shape_vec(result_dataset.lon, result_dataset.lat) - - def test_variable_subset_oco2(self): - """ - variable subsets for groups and root group using a '/' - """ - - oco2_file_name = 'oco2_LtCO2_190201_B10206Ar_200729175909s.nc4' - output_file_name = 'oco2_test_out.nc' - shutil.copyfile(os.path.join(self.test_data_dir, 'OCO2', oco2_file_name), - os.path.join(self.subset_output_dir, oco2_file_name)) - bbox = np.array(((-180,180),(-90.0,90))) - variables = ['/xco2','/xco2_quality_flag','/Retrieval/water_height','/sounding_id'] - subset.subset( - file_to_subset=join(self.test_data_dir, 'OCO2',oco2_file_name), - bbox=bbox, - variables=variables, - output_file=join(self.subset_output_dir, output_file_name), - ) - - out_nc = nc.Dataset(join(self.subset_output_dir, output_file_name)) - var_listout = list(out_nc.groups['Retrieval'].variables.keys()) - assert ('water_height' in 
var_listout) - - def test_variable_subset_s6(self): - """ - multiple variable subset of variables in different groups in oco3 - """ - - s6_file_name = 'S6A_P4_2__LR_STD__ST_002_140_20201207T011501_20201207T013023_F00.nc' - output_file_name = 's6_test_out.nc' - shutil.copyfile(os.path.join(self.test_data_dir, 'sentinel_6', s6_file_name), - os.path.join(self.subset_output_dir, s6_file_name)) - bbox = np.array(((-180,180),(-90.0,90))) - variables = ['/data_01/ku/range_ocean_mle3_rms', '/data_20/ku/range_ocean'] - subset.subset( - file_to_subset=join(self.subset_output_dir, s6_file_name), - bbox=bbox, - variables=variables, - output_file=join(self.subset_output_dir, output_file_name), - ) - - out_nc = nc.Dataset(join(self.subset_output_dir, output_file_name)) - var_listout =list(out_nc.groups['data_01'].groups['ku'].variables.keys()) - var_listout.extend(list(out_nc.groups['data_20'].groups['ku'].variables.keys())) - assert ('range_ocean_mle3_rms' in var_listout) - assert ('range_ocean' in var_listout) - - - def test_transform_grouped_dataset(self): - """ - Test that the transformation function results in a correctly - formatted dataset. - """ - s6_file_name = 'S6A_P4_2__LR_STD__ST_002_140_20201207T011501_20201207T013023_F00.nc' - shutil.copyfile(os.path.join(self.test_data_dir, 'sentinel_6', s6_file_name), - os.path.join(self.subset_output_dir, s6_file_name)) - - nc_ds = nc.Dataset(os.path.join(self.test_data_dir, 'sentinel_6', s6_file_name)) - nc_ds_transformed = subset.transform_grouped_dataset( - nc.Dataset(os.path.join(self.subset_output_dir, s6_file_name), 'r'), - os.path.join(self.subset_output_dir, s6_file_name) - ) - # The original ds has groups - assert nc_ds.groups +def test_calculate_chunks(): + """ + Test that the calculate chunks function in the subset module + correctly calculates and returns the chunks dims dictionary. + """ + rs = np.random.RandomState(0) + dataset = xr.DataArray( + rs.randn(2, 4000, 4001), + dims=['x', 'y', 'z'] + ).to_dataset(name='foo') - # There should be no groups in the new ds - assert not nc_ds_transformed.groups + chunk_dict = subset.calculate_chunks(dataset) - # The original ds has no variables in the root group - assert not nc_ds.variables + assert chunk_dict.get('x') is None + assert chunk_dict.get('y') is None + assert chunk_dict.get('z') == 4000 - # The new ds has variables in the root group - assert nc_ds_transformed.variables - # Each var in the new ds should map to a variable in the old ds - for var_name, var in nc_ds_transformed.variables.items(): - path = var_name.strip('__').split('__') +def test_missing_coord_vars(data_dir, subset_output_dir): + """ + As of right now, the subsetter expects the data to contain lat + and lon variables. If not present, an error is thrown. + """ + file = 'MODIS_T-JPL-L2P-v2014.0.nc' + ds = xr.open_dataset(join(data_dir, file), + decode_times=False, + decode_coords=False, + mask_and_scale=False) - group = nc_ds[path[0]] - for g in path[1:-1]: - group = group[g] - assert var_name.strip('__').split('__')[-1] in group.variables.keys() + # Manually remove var which will cause error when attempting + # to subset. + ds = ds.drop_vars(['lat']) + output_file = '{}_{}'.format('missing_coords', file) + ds.to_netcdf(join(subset_output_dir, output_file)) - def test_group_subset(self): - """ - Ensure a subset function can be run on a granule that contains - groups without errors, and that the subsetted data is within - the given spatial bounds. 
- """ - s6_file_name = 'S6A_P4_2__LR_STD__ST_002_140_20201207T011501_20201207T013023_F00.nc' - s6_output_file_name = 'SS_S6A_P4_2__LR_STD__ST_002_140_20201207T011501_20201207T013023_F00.nc' - # Copy S6 file to temp dir - shutil.copyfile( - os.path.join(self.test_data_dir, 'sentinel_6', s6_file_name), - os.path.join(self.subset_output_dir, s6_file_name) - ) + bbox = np.array(((-180, 180), (-90, 90))) - # Make sure it runs without errors - bbox = np.array(((150, 180), (-90, -50))) - bounds = subset.subset( - file_to_subset=os.path.join(self.subset_output_dir, s6_file_name), + with pytest.raises(ValueError): + subset.subset( + file_to_subset=join(subset_output_dir, output_file), bbox=bbox, - output_file=os.path.join(self.subset_output_dir, s6_output_file_name) + output_file='' ) - # Check that bounds are within requested bbox - assert bounds[0][0] >= bbox[0][0] - assert bounds[0][1] <= bbox[0][1] - assert bounds[1][0] >= bbox[1][0] - assert bounds[1][1] <= bbox[1][1] - - def test_json_history_metadata_append(self): - """ - Tests that the json history metadata header is appended to when it - already exists. First we create a fake json_history header for input file. - """ - test_file = next(filter( - lambda f: '20180101005944-REMSS-L2P_GHRSST-SSTsubskin-AMSR2-L2B_rt_r29918-v02.0-fv01.0.nc' in f - , self.test_files)) - output_file = "{}_{}".format(self._testMethodName, test_file) - input_file_subset = join(self.subset_output_dir, "int_{}".format(output_file)) - - fake_history = [ - { - "date_time": "2021-05-10T14:30:24.553263", - "derived_from": basename(input_file_subset), - "program": SERVICE_NAME, - "version": importlib_metadata.distribution(SERVICE_NAME).version, - "parameters": "bbox=[[-180.0, 180.0], [-90.0, 90.0]] cut=True", - "program_ref": "https://cmr.earthdata.nasa.gov:443/search/concepts/S1962070864-POCLOUD", - "$schema": "https://harmony.earthdata.nasa.gov/schemas/history/0.1.0/history-v0.1.0.json" - } - ] - in_nc = xr.open_dataset(join(self.test_data_dir, test_file)) - in_nc.attrs['history_json'] = json.dumps(fake_history) - in_nc.to_netcdf(join(self.subset_output_dir, 'int_{}'.format(output_file)), 'w') +def test_data_1D(data_dir, subset_output_dir, request): + """ + Test that subsetting a 1-D granule does not result in failure. 
+ """ + merged_jason_filename = 'JA1_GPN_2PeP001_002_20020115_060706_20020115_070316.nc' + output_file = "{}_{}".format(request.node.name, merged_jason_filename) - subset.subset( - file_to_subset=input_file_subset, - bbox=np.array(((-180, 180), (-90.0, 90))), - output_file=join(self.subset_output_dir, output_file) - ) + subset.subset( + file_to_subset=join(data_dir, merged_jason_filename), + bbox=np.array(((-180, 0), (-90, 0))), + output_file=join(subset_output_dir, output_file) + ) - out_nc = xr.open_dataset(join(self.subset_output_dir, output_file)) - - history_json = json.loads(out_nc.attrs['history_json']) - assert len(history_json) == 2 - - is_valid_shema = validate(instance=history_json, schema=self.history_json_schema) - assert is_valid_shema is None - - for history in history_json: - assert "date_time" in history - assert history.get('program') == SERVICE_NAME - assert history.get('derived_from') == basename(input_file_subset) - assert history.get('version') == importlib_metadata.distribution(SERVICE_NAME).version - assert history.get('parameters') == 'bbox=[[-180.0, 180.0], [-90.0, 90.0]] cut=True' - assert history.get( - 'program_ref') == "https://cmr.earthdata.nasa.gov:443/search/concepts/S1962070864-POCLOUD" - assert history.get( - '$schema') == "https://harmony.earthdata.nasa.gov/schemas/history/0.1.0/history-v0.1.0.json" - - def test_json_history_metadata_create(self): - """ - Tests that the json history metadata header is created when it does - not exist. All test granules does not contain this header. - """ - test_file = next(filter( - lambda f: '20180101005944-REMSS-L2P_GHRSST-SSTsubskin-AMSR2-L2B_rt_r29918-v02.0-fv01.0.nc' in f - , self.test_files)) - output_file = "{}_{}".format(self._testMethodName, test_file) - - # Remove the 'history' metadata from the granule - in_nc = xr.open_dataset(join(self.test_data_dir, test_file)) - in_nc.to_netcdf(join(self.subset_output_dir, 'int_{}'.format(output_file)), 'w') - - input_file_subset = join(self.subset_output_dir, "int_{}".format(output_file)) - subset.subset( - file_to_subset=input_file_subset, - bbox=np.array(((-180, 180), (-90.0, 90))), - output_file=join(self.subset_output_dir, output_file) - ) + xr.open_dataset(join(subset_output_dir, output_file)) - out_nc = xr.open_dataset(join(self.subset_output_dir, output_file)) - - history_json = json.loads(out_nc.attrs['history_json']) - assert len(history_json) == 1 - - is_valid_shema = validate(instance=history_json, schema=self.history_json_schema) - assert is_valid_shema is None - - for history in history_json: - assert "date_time" in history - assert history.get('program') == SERVICE_NAME - assert history.get('derived_from') == basename(input_file_subset) - assert history.get('version') == importlib_metadata.distribution(SERVICE_NAME).version - assert history.get('parameters') == 'bbox=[[-180.0, 180.0], [-90.0, 90.0]] cut=True' - assert history.get( - 'program_ref') == "https://cmr.earthdata.nasa.gov:443/search/concepts/S1962070864-POCLOUD" - assert history.get( - '$schema') == "https://harmony.earthdata.nasa.gov/schemas/history/0.1.0/history-v0.1.0.json" - - def test_json_history_metadata_create_origin_source(self): - """ - Tests that the json history metadata header is created when it does - not exist. All test granules does not contain this header. 
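The history_json assertions in this suite all go through jsonschema.validate, which returns None when the instance conforms and raises ValidationError otherwise. A minimal, self-contained example against a toy schema shaped like (but much smaller than) the history_json_schema fixture:

from jsonschema import validate

toy_schema = {
    "type": "array",
    "items": {
        "type": "object",
        "required": ["date_time", "program"],
        "properties": {
            "date_time": {"type": "string"},
            "program": {"type": "string"},
        },
    },
}

toy_history = [{"date_time": "2021-05-10T14:30:24", "program": "example-subsetter"}]

# validate() is silent (returns None) on success, which is why the tests
# simply assert that its return value is None.
assert validate(instance=toy_history, schema=toy_schema) is None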
- """ - test_file = next(filter( - lambda f: '20180101005944-REMSS-L2P_GHRSST-SSTsubskin-AMSR2-L2B_rt_r29918-v02.0-fv01.0.nc' in f - , self.test_files)) - output_file = "{}_{}".format(self._testMethodName, test_file) - - # Remove the 'history' metadata from the granule - in_nc = xr.open_dataset(join(self.test_data_dir, test_file)) - in_nc.to_netcdf(join(self.subset_output_dir, 'int_{}'.format(output_file)), 'w') - - input_file_subset = join(self.subset_output_dir, "int_{}".format(output_file)) - subset.subset( - file_to_subset=input_file_subset, - bbox=np.array(((-180, 180), (-90.0, 90))), - output_file=join(self.subset_output_dir, output_file), - origin_source="fake_original_file.nc" - ) - out_nc = xr.open_dataset(join(self.subset_output_dir, output_file)) - - history_json = json.loads(out_nc.attrs['history_json']) - assert len(history_json) == 1 - - is_valid_shema = validate(instance=history_json, schema=self.history_json_schema) - assert is_valid_shema is None - - for history in history_json: - assert "date_time" in history - assert history.get('program') == SERVICE_NAME - assert history.get('derived_from') == "fake_original_file.nc" - assert history.get('version') == importlib_metadata.distribution(SERVICE_NAME).version - assert history.get('parameters') == 'bbox=[[-180.0, 180.0], [-90.0, 90.0]] cut=True' - assert history.get( - 'program_ref') == "https://cmr.earthdata.nasa.gov:443/search/concepts/S1962070864-POCLOUD" - assert history.get( - '$schema') == "https://harmony.earthdata.nasa.gov/schemas/history/0.1.0/history-v0.1.0.json" - - def test_temporal_subset_ascat(self): - """ - Test that a temporal subset results in a granule that only - contains times within the given bounds. - """ - bbox = np.array(((-180, 180), (-90, 90))) - file = 'ascat_20150702_084200_metopa_45145_eps_o_250_2300_ovw.l2.nc' - output_file = "{}_{}".format(self._testMethodName, file) - min_time = '2015-07-02T09:00:00' - max_time = '2015-07-02T10:00:00' +def test_get_coord_variable_names(data_dir): + """ + Test that the expected coord variable names are returned + """ + file = 'MODIS_T-JPL-L2P-v2014.0.nc' + ds = xr.open_dataset(join(data_dir, file), + decode_times=False, + decode_coords=False, + mask_and_scale=False) - subset.subset( - file_to_subset=join(self.test_data_dir, file), - bbox=bbox, - output_file=join(self.subset_output_dir, output_file), - min_time=min_time, - max_time=max_time - ) + old_lat_var_name = 'lat' + old_lon_var_name = 'lon' - in_ds = xr.open_dataset(join(self.test_data_dir, file), - decode_times=False, - decode_coords=False) + lat_var_name, lon_var_name = subset.compute_coordinate_variable_names(ds) - out_ds = xr.open_dataset(join(self.subset_output_dir, output_file), - decode_times=False, - decode_coords=False) + assert lat_var_name[0] == old_lat_var_name + assert lon_var_name[0] == old_lon_var_name - # Check that 'time' types match - assert in_ds.time.dtype == out_ds.time.dtype + new_lat_var_name = 'latitude' + new_lon_var_name = 'x' + ds = ds.rename({old_lat_var_name: new_lat_var_name, + old_lon_var_name: new_lon_var_name}) - in_ds.close() - out_ds.close() + lat_var_name, lon_var_name = subset.compute_coordinate_variable_names(ds) - # Check that all times are within the given bounds. 
Open - # dataset using 'decode_times=True' for auto-conversions to - # datetime - out_ds = xr.open_dataset(join(self.subset_output_dir, output_file), - decode_coords=False) + assert lat_var_name[0] == new_lat_var_name + assert lon_var_name[0] == new_lon_var_name - start_dt = subset.translate_timestamp(min_time) - end_dt = subset.translate_timestamp(max_time) - # All dates should be within the given temporal bounds. - assert (out_ds.time >= pd.to_datetime(start_dt)).all() - assert (out_ds.time <= pd.to_datetime(end_dt)).all() +def test_cannot_get_coord_variable_names(data_dir): + """ + Test that, when given a dataset with coord vars which are not + expected, a ValueError is raised. + """ + file = 'MODIS_T-JPL-L2P-v2014.0.nc' + ds = xr.open_dataset(join(data_dir, file), + decode_times=False, + decode_coords=False, + mask_and_scale=False) - def test_temporal_subset_modis_a(self): - """ - Test that a temporal subset results in a granule that only - contains times within the given bounds. - """ - bbox = np.array(((-180, 180), (-90, 90))) - file = 'MODIS_A-JPL-L2P-v2014.0.nc' - output_file = "{}_{}".format(self._testMethodName, file) - min_time = '2019-08-05T06:57:00' - max_time = '2019-08-05T06:58:00' - # Actual min is 2019-08-05T06:55:01.000000000 - # Actual max is 2019-08-05T06:59:57.000000000 + old_lat_var_name = 'lat' + new_lat_var_name = 'foo' - subset.subset( - file_to_subset=join(self.test_data_dir, file), - bbox=bbox, - output_file=join(self.subset_output_dir, output_file), - min_time=min_time, - max_time=max_time - ) + ds = ds.rename({old_lat_var_name: new_lat_var_name}) + # Remove 'coordinates' attribute + for var_name, var in ds.items(): + if 'coordinates' in var.attrs: + del var.attrs['coordinates'] - in_ds = xr.open_dataset(join(self.test_data_dir, file), - decode_times=False, - decode_coords=False) + with pytest.raises(ValueError) as e_info: + subset.compute_coordinate_variable_names(ds) - out_ds = xr.open_dataset(join(self.subset_output_dir, output_file), - decode_times=False, - decode_coords=False) - # Check that 'time' types match - assert in_ds.time.dtype == out_ds.time.dtype +def test_get_spatial_bounds(data_dir): + """ + Test that the get_spatial_bounds function works as expected. + The get_spatial_bounds function should return lat/lon min/max + which is masked and scaled for both variables. 
The values + should also be adjusted for -180,180/-90,90 coordinate types + """ + ascat_filename = 'ascat_20150702_084200_metopa_45145_eps_o_250_2300_ovw.l2.nc' + ghrsst_filename = '20190927000500-JPL-L2P_GHRSST-SSTskin-MODIS_A-D-v02.0-fv01.0.nc' + + ascat_dataset = xr.open_dataset( + join(data_dir, ascat_filename), + decode_times=False, + decode_coords=False, + mask_and_scale=False + ) + ghrsst_dataset = xr.open_dataset( + join(data_dir, ghrsst_filename), + decode_times=False, + decode_coords=False, + mask_and_scale=False + ) + + # ascat1 longitude is -0 360, ghrsst modis A is -180 180 + # Both have metadata for valid_min + + # Manually calculated spatial bounds + ascat_expected_lat_min = -89.4 + ascat_expected_lat_max = 89.2 + ascat_expected_lon_min = -180.0 + ascat_expected_lon_max = 180.0 + + ghrsst_expected_lat_min = -77.2 + ghrsst_expected_lat_max = -53.6 + ghrsst_expected_lon_min = -170.5 + ghrsst_expected_lon_max = -101.7 + + min_lon, max_lon, min_lat, max_lat = subset.get_spatial_bounds( + dataset=ascat_dataset, + lat_var_names=['lat'], + lon_var_names=['lon'] + ).flatten() + + assert np.isclose(min_lat, ascat_expected_lat_min) + assert np.isclose(max_lat, ascat_expected_lat_max) + assert np.isclose(min_lon, ascat_expected_lon_min) + assert np.isclose(max_lon, ascat_expected_lon_max) + + # Remove the label from the dataset coordinate variables indicating the valid_min. + del ascat_dataset['lat'].attrs['valid_min'] + del ascat_dataset['lon'].attrs['valid_min'] + + min_lon, max_lon, min_lat, max_lat = subset.get_spatial_bounds( + dataset=ascat_dataset, + lat_var_names=['lat'], + lon_var_names=['lon'] + ).flatten() + + assert np.isclose(min_lat, ascat_expected_lat_min) + assert np.isclose(max_lat, ascat_expected_lat_max) + assert np.isclose(min_lon, ascat_expected_lon_min) + assert np.isclose(max_lon, ascat_expected_lon_max) + + # Repeat test, but with GHRSST granule + + min_lon, max_lon, min_lat, max_lat = subset.get_spatial_bounds( + dataset=ghrsst_dataset, + lat_var_names=['lat'], + lon_var_names=['lon'] + ).flatten() + + assert np.isclose(min_lat, ghrsst_expected_lat_min) + assert np.isclose(max_lat, ghrsst_expected_lat_max) + assert np.isclose(min_lon, ghrsst_expected_lon_min) + assert np.isclose(max_lon, ghrsst_expected_lon_max) + + # Remove the label from the dataset coordinate variables indicating the valid_min. 
+ + del ghrsst_dataset['lat'].attrs['valid_min'] + del ghrsst_dataset['lon'].attrs['valid_min'] + + min_lon, max_lon, min_lat, max_lat = subset.get_spatial_bounds( + dataset=ghrsst_dataset, + lat_var_names=['lat'], + lon_var_names=['lon'] + ).flatten() + + assert np.isclose(min_lat, ghrsst_expected_lat_min) + assert np.isclose(max_lat, ghrsst_expected_lat_max) + assert np.isclose(min_lon, ghrsst_expected_lon_min) + assert np.isclose(max_lon, ghrsst_expected_lon_max) + + +def test_shapefile_subset(data_dir, subset_output_dir, request): + """ + Test that using a shapefile to subset data instead of a bbox + works as expected + """ + shapefile = 'test.shp' + ascat_filename = 'ascat_20150702_084200_metopa_45145_eps_o_250_2300_ovw.l2.nc' + output_filename = f'{request.node.name}_{ascat_filename}' + + shapefile_file_path = join(data_dir, 'test_shapefile_subset', shapefile) + ascat_file_path = join(data_dir, ascat_filename) + output_file_path = join(subset_output_dir, output_filename) + + subset.subset( + file_to_subset=ascat_file_path, + bbox=None, + output_file=output_file_path, + shapefile=shapefile_file_path + ) + + # Check that each point of data is within the shapefile + shapefile_df = gpd.read_file(shapefile_file_path) + with xr.open_dataset(output_file_path) as result_dataset: + def in_shape(lon, lat): + if np.isnan(lon) or np.isnan(lat): + return + point = Point(lon, lat) + point_in_shapefile = shapefile_df.contains(point) + assert point_in_shapefile[0] + + in_shape_vec = np.vectorize(in_shape) + in_shape_vec(result_dataset.lon, result_dataset.lat) + + +def test_variable_subset_oco2(data_dir, subset_output_dir): + """ + variable subsets for groups and root group using a '/' + """ - in_ds.close() - out_ds.close() + oco2_file_name = 'oco2_LtCO2_190201_B10206Ar_200729175909s.nc4' + output_file_name = 'oco2_test_out.nc' + shutil.copyfile(os.path.join(data_dir, 'OCO2', oco2_file_name), + os.path.join(subset_output_dir, oco2_file_name)) + bbox = np.array(((-180, 180), (-90.0, 90))) + variables = ['/xco2', '/xco2_quality_flag', '/Retrieval/water_height', '/sounding_id'] + subset.subset( + file_to_subset=join(data_dir, 'OCO2', oco2_file_name), + bbox=bbox, + variables=variables, + output_file=join(subset_output_dir, output_file_name), + ) + + out_nc = nc.Dataset(join(subset_output_dir, output_file_name)) + var_listout = list(out_nc.groups['Retrieval'].variables.keys()) + assert ('water_height' in var_listout) + + +def test_variable_subset_s6(data_dir, subset_output_dir): + """ + multiple variable subset of variables in different groups in oco3 + """ - # Check that all times are within the given bounds. 
Open - # dataset using 'decode_times=True' for auto-conversions to - # datetime - out_ds = xr.open_dataset(join(self.subset_output_dir, output_file), - decode_coords=False) + s6_file_name = 'S6A_P4_2__LR_STD__ST_002_140_20201207T011501_20201207T013023_F00.nc' + output_file_name = 's6_test_out.nc' + shutil.copyfile(os.path.join(data_dir, 'sentinel_6', s6_file_name), + os.path.join(subset_output_dir, s6_file_name)) + bbox = np.array(((-180, 180), (-90.0, 90))) + variables = ['/data_01/ku/range_ocean_mle3_rms', '/data_20/ku/range_ocean'] + subset.subset( + file_to_subset=join(subset_output_dir, s6_file_name), + bbox=bbox, + variables=variables, + output_file=join(subset_output_dir, output_file_name), + ) + + out_nc = nc.Dataset(join(subset_output_dir, output_file_name)) + var_listout = list(out_nc.groups['data_01'].groups['ku'].variables.keys()) + var_listout.extend(list(out_nc.groups['data_20'].groups['ku'].variables.keys())) + assert ('range_ocean_mle3_rms' in var_listout) + assert ('range_ocean' in var_listout) + + +def test_transform_grouped_dataset(data_dir, subset_output_dir): + """ + Test that the transformation function results in a correctly + formatted dataset. + """ + s6_file_name = 'S6A_P4_2__LR_STD__ST_002_140_20201207T011501_20201207T013023_F00.nc' + shutil.copyfile(os.path.join(data_dir, 'sentinel_6', s6_file_name), + os.path.join(subset_output_dir, s6_file_name)) - start_dt = subset.translate_timestamp(min_time) - end_dt = subset.translate_timestamp(max_time) + nc_ds = nc.Dataset(os.path.join(data_dir, 'sentinel_6', s6_file_name)) + nc_ds_transformed = subset.transform_grouped_dataset( + nc.Dataset(os.path.join(subset_output_dir, s6_file_name), 'r'), + os.path.join(subset_output_dir, s6_file_name) + ) - epoch_dt = out_ds['time'].values[0] + # The original ds has groups + assert nc_ds.groups - # All timedelta + epoch should be within the given temporal bounds. - assert out_ds.sst_dtime.min() + epoch_dt >= np.datetime64(start_dt) - assert out_ds.sst_dtime.min() + epoch_dt <= np.datetime64(end_dt) + # There should be no groups in the new ds + assert not nc_ds_transformed.groups - def test_temporal_subset_s6(self): - """ - Test that a temporal subset results in a granule that only - contains times within the given bounds. - """ - bbox = np.array(((-180, 180), (-90, 90))) - file = 'S6A_P4_2__LR_STD__ST_002_140_20201207T011501_20201207T013023_F00.nc' - # Copy S6 file to temp dir - shutil.copyfile( - os.path.join(self.test_data_dir, 'sentinel_6', file), - os.path.join(self.subset_output_dir, file) - ) - output_file = "{}_{}".format(self._testMethodName, file) - min_time = '2020-12-07T01:20:00' - max_time = '2020-12-07T01:25:00' - # Actual min is 2020-12-07T01:15:01.000000000 - # Actual max is 2020-12-07T01:30:23.000000000 + # The original ds has no variables in the root group + assert not nc_ds.variables - subset.subset( - file_to_subset=join(self.subset_output_dir, file), - bbox=bbox, - output_file=join(self.subset_output_dir, output_file), - min_time=min_time, - max_time=max_time - ) + # The new ds has variables in the root group + assert nc_ds_transformed.variables - # Check that all times are within the given bounds. 
Open - # dataset using 'decode_times=True' for auto-conversions to - # datetime - out_ds = xr.open_dataset( - join(self.subset_output_dir, output_file), - decode_coords=False, - group='data_01' - ) + # Each var in the new ds should map to a variable in the old ds + for var_name, var in nc_ds_transformed.variables.items(): + path = var_name.strip('__').split('__') - start_dt = subset.translate_timestamp(min_time) - end_dt = subset.translate_timestamp(max_time) + group = nc_ds[path[0]] + for g in path[1:-1]: + group = group[g] + assert var_name.strip('__').split('__')[-1] in group.variables.keys() - # All dates should be within the given temporal bounds. - assert (out_ds.time >= pd.to_datetime(start_dt)).all() - assert (out_ds.time <= pd.to_datetime(end_dt)).all() - def test_get_time_variable_name(self): - for test_file in self.test_files: - args = { - 'decode_coords': False, - 'mask_and_scale': False, - 'decode_times': True - } - time_var_names = [] - ds = xr.open_dataset(os.path.join(self.test_data_dir, test_file), **args) - lat_var_name = subset.compute_coordinate_variable_names(ds)[0][0] - time_var_name = subset.compute_time_variable_name(ds, ds[lat_var_name]) +def test_group_subset(data_dir, subset_output_dir): + """ + Ensure a subset function can be run on a granule that contains + groups without errors, and that the subsetted data is within + the given spatial bounds. + """ + s6_file_name = 'S6A_P4_2__LR_STD__ST_002_140_20201207T011501_20201207T013023_F00.nc' + s6_output_file_name = 'SS_S6A_P4_2__LR_STD__ST_002_140_20201207T011501_20201207T013023_F00.nc' + # Copy S6 file to temp dir + shutil.copyfile( + os.path.join(data_dir, 'sentinel_6', s6_file_name), + os.path.join(subset_output_dir, s6_file_name) + ) + + # Make sure it runs without errors + bbox = np.array(((150, 180), (-90, -50))) + bounds = subset.subset( + file_to_subset=os.path.join(subset_output_dir, s6_file_name), + bbox=bbox, + output_file=os.path.join(subset_output_dir, s6_output_file_name) + ) + + # Check that bounds are within requested bbox + assert bounds[0][0] >= bbox[0][0] + assert bounds[0][1] <= bbox[0][1] + assert bounds[1][0] >= bbox[1][0] + assert bounds[1][1] <= bbox[1][1] + + +def test_json_history_metadata_append(history_json_schema, data_dir, subset_output_dir, request): + """ + Tests that the json history metadata header is appended to when it + already exists. First we create a fake json_history header for input file. 
+ """ + test_file = next(filter( + lambda f: '20180101005944-REMSS-L2P_GHRSST-SSTsubskin-AMSR2-L2B_rt_r29918-v02.0-fv01.0.nc' in f + , TEST_DATA_FILES)) + output_file = "{}_{}".format(request.node.name, test_file) + input_file_subset = join(subset_output_dir, "int_{}".format(output_file)) + + fake_history = [ + { + "date_time": "2021-05-10T14:30:24.553263", + "derived_from": basename(input_file_subset), + "program": SERVICE_NAME, + "version": importlib_metadata.distribution(SERVICE_NAME).version, + "parameters": "bbox=[[-180.0, 180.0], [-90.0, 90.0]] cut=True", + "program_ref": "https://cmr.earthdata.nasa.gov:443/search/concepts/S1962070864-POCLOUD", + "$schema": "https://harmony.earthdata.nasa.gov/schemas/history/0.1.0/history-v0.1.0.json" + } + ] - assert time_var_name is not None - assert 'time' in time_var_name + in_nc = xr.open_dataset(join(data_dir, test_file)) + in_nc.attrs['history_json'] = json.dumps(fake_history) + in_nc.to_netcdf(join(subset_output_dir, 'int_{}'.format(output_file)), 'w') - def test_subset_jason(self): - bbox = np.array(((-180, 0), (-90, 90))) - file = 'JA1_GPN_2PeP001_002_20020115_060706_20020115_070316.nc' - output_file = "{}_{}".format(self._testMethodName, file) - min_time = "2002-01-15T06:07:06Z" - max_time = "2002-01-15T06:30:16Z" + subset.subset( + file_to_subset=input_file_subset, + bbox=np.array(((-180, 180), (-90.0, 90))), + output_file=join(subset_output_dir, output_file) + ) - subset.subset( - file_to_subset=os.path.join(self.test_data_dir, file), - bbox=bbox, - min_time=min_time, - max_time=max_time, - output_file=os.path.join(self.subset_output_dir, output_file) - ) + out_nc = xr.open_dataset(join(subset_output_dir, output_file)) - def test_subset_size(self): + history_json = json.loads(out_nc.attrs['history_json']) + assert len(history_json) == 2 - for file in self.test_files: - bbox = np.array(((-180, 0), (-30, 90))) - output_file = "{}_{}".format(self._testMethodName, file) - input_file_path = os.path.join(self.test_data_dir, file) - output_file_path = os.path.join(self.subset_output_dir, output_file) + validate(instance=history_json, schema=history_json_schema) - subset.subset( - file_to_subset=input_file_path, - bbox=bbox, - output_file=output_file_path - ) + for history in history_json: + assert "date_time" in history + assert history.get('program') == SERVICE_NAME + assert history.get('derived_from') == basename(input_file_subset) + assert history.get('version') == importlib_metadata.distribution(SERVICE_NAME).version + assert history.get('parameters') == 'bbox=[[-180.0, 180.0], [-90.0, 90.0]] cut=True' + assert history.get( + 'program_ref') == "https://cmr.earthdata.nasa.gov:443/search/concepts/S1962070864-POCLOUD" + assert history.get( + '$schema') == "https://harmony.earthdata.nasa.gov/schemas/history/0.1.0/history-v0.1.0.json" - original_file_size = os.path.getsize(input_file_path) - subset_file_size = os.path.getsize(output_file_path) - assert subset_file_size < original_file_size +def test_json_history_metadata_create(history_json_schema, data_dir, subset_output_dir, request): + """ + Tests that the json history metadata header is created when it does + not exist. All test granules does not contain this header. 
+ """ + test_file = next(filter( + lambda f: '20180101005944-REMSS-L2P_GHRSST-SSTsubskin-AMSR2-L2B_rt_r29918-v02.0-fv01.0.nc' in f + , TEST_DATA_FILES)) + output_file = "{}_{}".format(request.node.name, test_file) + + # Remove the 'history' metadata from the granule + in_nc = xr.open_dataset(join(data_dir, test_file)) + in_nc.to_netcdf(join(subset_output_dir, 'int_{}'.format(output_file)), 'w') + + input_file_subset = join(subset_output_dir, "int_{}".format(output_file)) + subset.subset( + file_to_subset=input_file_subset, + bbox=np.array(((-180, 180), (-90.0, 90))), + output_file=join(subset_output_dir, output_file) + ) + + out_nc = xr.open_dataset(join(subset_output_dir, output_file)) + + history_json = json.loads(out_nc.attrs['history_json']) + assert len(history_json) == 1 + + validate(instance=history_json, schema=history_json_schema) + + for history in history_json: + assert "date_time" in history + assert history.get('program') == SERVICE_NAME + assert history.get('derived_from') == basename(input_file_subset) + assert history.get('version') == importlib_metadata.distribution(SERVICE_NAME).version + assert history.get('parameters') == 'bbox=[[-180.0, 180.0], [-90.0, 90.0]] cut=True' + assert history.get( + 'program_ref') == "https://cmr.earthdata.nasa.gov:443/search/concepts/S1962070864-POCLOUD" + assert history.get( + '$schema') == "https://harmony.earthdata.nasa.gov/schemas/history/0.1.0/history-v0.1.0.json" + + +def test_json_history_metadata_create_origin_source(history_json_schema, data_dir, subset_output_dir, request): + """ + Tests that the json history metadata header is created when it does + not exist. All test granules does not contain this header. + """ + test_file = next(filter( + lambda f: '20180101005944-REMSS-L2P_GHRSST-SSTsubskin-AMSR2-L2B_rt_r29918-v02.0-fv01.0.nc' in f + , TEST_DATA_FILES)) + output_file = "{}_{}".format(request.node.name, test_file) + + # Remove the 'history' metadata from the granule + in_nc = xr.open_dataset(join(data_dir, test_file)) + in_nc.to_netcdf(join(subset_output_dir, 'int_{}'.format(output_file)), 'w') + + input_file_subset = join(subset_output_dir, "int_{}".format(output_file)) + subset.subset( + file_to_subset=input_file_subset, + bbox=np.array(((-180, 180), (-90.0, 90))), + output_file=join(subset_output_dir, output_file), + origin_source="fake_original_file.nc" + ) + + out_nc = xr.open_dataset(join(subset_output_dir, output_file)) + + history_json = json.loads(out_nc.attrs['history_json']) + assert len(history_json) == 1 + + validate(instance=history_json, schema=history_json_schema) + + for history in history_json: + assert "date_time" in history + assert history.get('program') == SERVICE_NAME + assert history.get('derived_from') == "fake_original_file.nc" + assert history.get('version') == importlib_metadata.distribution(SERVICE_NAME).version + assert history.get('parameters') == 'bbox=[[-180.0, 180.0], [-90.0, 90.0]] cut=True' + assert history.get( + 'program_ref') == "https://cmr.earthdata.nasa.gov:443/search/concepts/S1962070864-POCLOUD" + assert history.get( + '$schema') == "https://harmony.earthdata.nasa.gov/schemas/history/0.1.0/history-v0.1.0.json" + + +def test_temporal_subset_ascat(data_dir, subset_output_dir, request): + """ + Test that a temporal subset results in a granule that only + contains times within the given bounds. 
+ """ + bbox = np.array(((-180, 180), (-90, 90))) + file = 'ascat_20150702_084200_metopa_45145_eps_o_250_2300_ovw.l2.nc' + output_file = "{}_{}".format(request.node.name, file) + min_time = '2015-07-02T09:00:00' + max_time = '2015-07-02T10:00:00' + + subset.subset( + file_to_subset=join(data_dir, file), + bbox=bbox, + output_file=join(subset_output_dir, output_file), + min_time=min_time, + max_time=max_time + ) + + in_ds = xr.open_dataset(join(data_dir, file), + decode_times=False, + decode_coords=False) + + out_ds = xr.open_dataset(join(subset_output_dir, output_file), + decode_times=False, + decode_coords=False) - def test_duplicate_dims_sndr(self): - """ - Check if SNDR Climcaps files run successfully even though - these files have variables with duplicate dimensions - """ - SNDR_dir = join(self.test_data_dir, 'SNDR') - sndr_file = 'SNDR.J1.CRIMSS.20210224T0100.m06.g011.L2_CLIMCAPS_RET.std.v02_28.G.210331064430.nc' + # Check that 'time' types match + assert in_ds.time.dtype == out_ds.time.dtype - bbox = np.array(((-180, 90), (-90, 90))) - output_file = "{}_{}".format(self._testMethodName, sndr_file) - shutil.copyfile( - os.path.join(SNDR_dir, sndr_file), - os.path.join(self.subset_output_dir, sndr_file) - ) - box_test = subset.subset( - file_to_subset=join(self.subset_output_dir, sndr_file), - bbox=bbox, - output_file=join(self.subset_output_dir, output_file), - min_time='2021-02-24T00:50:20Z', - max_time='2021-02-24T01:09:55Z' - ) - # check if the box_test is + in_ds.close() + out_ds.close() - in_nc = nc.Dataset(join(SNDR_dir, sndr_file)) - out_nc = nc.Dataset(join(self.subset_output_dir, output_file)) + # Check that all times are within the given bounds. Open + # dataset using 'decode_times=True' for auto-conversions to + # datetime + out_ds = xr.open_dataset(join(subset_output_dir, output_file), + decode_coords=False) - for var_name, variable in in_nc.variables.items(): - assert in_nc[var_name].shape == out_nc[var_name].shape + start_dt = subset.translate_timestamp(min_time) + end_dt = subset.translate_timestamp(max_time) - def test_duplicate_dims_tropomi(self): - """ - Check if SNDR Climcaps files run successfully even though - these files have variables with duplicate dimensions - """ - TROP_dir = join(self.test_data_dir, 'tropomi') - trop_file = 'S5P_OFFL_L2__AER_LH_20210704T005246_20210704T023416_19290_02_020200_20210708T023111.nc' + # All dates should be within the given temporal bounds. 
+ assert (out_ds.time >= pd.to_datetime(start_dt)).all() + assert (out_ds.time <= pd.to_datetime(end_dt)).all() - bbox = np.array(((-180, 180), (-90, 90))) - output_file = "{}_{}".format(self._testMethodName, trop_file) - shutil.copyfile( - os.path.join(TROP_dir, trop_file), - os.path.join(self.subset_output_dir, trop_file) - ) - box_test = subset.subset( - file_to_subset=join(self.subset_output_dir, trop_file), - bbox=bbox, - output_file=join(self.subset_output_dir, output_file) - ) - # check if the box_test is - - in_nc = nc.Dataset(join(TROP_dir, trop_file)) - out_nc = nc.Dataset(join(self.subset_output_dir, output_file)) - - for var_name, variable in in_nc.groups['PRODUCT'].groups['SUPPORT_DATA'].groups['DETAILED_RESULTS'].variables.items(): - assert variable.shape == out_nc.groups['PRODUCT'].groups['SUPPORT_DATA'].groups['DETAILED_RESULTS'].variables[var_name].shape - - - def test_omi_novars_subset(self): - """ - Check that the OMI variables are conserved when no variable are specified - the data field and lat/lon are in different groups - """ - omi_dir = join(self.test_data_dir, 'OMI') - omi_file = 'OMI-Aura_L2-OMSO2_2020m0116t1207-o82471_v003-2020m0223t142939.he5' - - bbox = np.array(((-180, 90), (-90, 90))) - output_file = "{}_{}".format(self._testMethodName, omi_file) - shutil.copyfile( - os.path.join(omi_dir, omi_file), - os.path.join(self.subset_output_dir, omi_file) - ) - box_test = subset.subset( - file_to_subset=join(self.subset_output_dir, omi_file), - bbox=bbox, - output_file=join(self.subset_output_dir, output_file), - ) - # check if the box_test is - in_nc = nc.Dataset(join(omi_dir, omi_file)) - out_nc = nc.Dataset(join(self.subset_output_dir, output_file)) +def test_temporal_subset_modis_a(data_dir, subset_output_dir, request): + """ + Test that a temporal subset results in a granule that only + contains times within the given bounds. + """ + bbox = np.array(((-180, 180), (-90, 90))) + file = 'MODIS_A-JPL-L2P-v2014.0.nc' + output_file = "{}_{}".format(request.node.name, file) + min_time = '2019-08-05T06:57:00' + max_time = '2019-08-05T06:58:00' + # Actual min is 2019-08-05T06:55:01.000000000 + # Actual max is 2019-08-05T06:59:57.000000000 + + subset.subset( + file_to_subset=join(data_dir, file), + bbox=bbox, + output_file=join(subset_output_dir, output_file), + min_time=min_time, + max_time=max_time + ) + + in_ds = xr.open_dataset(join(data_dir, file), + decode_times=False, + decode_coords=False) + + out_ds = xr.open_dataset(join(subset_output_dir, output_file), + decode_times=False, + decode_coords=False) + + # Check that 'time' types match + assert in_ds.time.dtype == out_ds.time.dtype - for var_name, variable in in_nc.groups['HDFEOS'].groups['SWATHS'].groups['OMI Total Column Amount SO2'].groups['Geolocation Fields'].variables.items(): - assert in_nc.groups['HDFEOS'].groups['SWATHS'].groups['OMI Total Column Amount SO2'].groups['Geolocation Fields'].variables[var_name].shape == \ - out_nc.groups['HDFEOS'].groups['SWATHS'].groups['OMI Total Column Amount SO2'].groups['Geolocation Fields'].variables[var_name].shape + in_ds.close() + out_ds.close() + # Check that all times are within the given bounds. 
Open + # dataset using 'decode_times=True' for auto-conversions to + # datetime + out_ds = xr.open_dataset(join(subset_output_dir, output_file), + decode_coords=False) - def test_root_group(self): - """test that the GROUP_DELIM string, '__', is added to variables in the root group""" + start_dt = subset.translate_timestamp(min_time) + end_dt = subset.translate_timestamp(max_time) - sndr_file_name = 'SNDR.SNPP.CRIMSS.20200118T0024.m06.g005.L2_CLIMCAPS_RET.std.v02_28.G.200314032326_subset.nc' - shutil.copyfile(os.path.join(self.test_data_dir, 'SNDR', sndr_file_name), - os.path.join(self.subset_output_dir, sndr_file_name)) + epoch_dt = out_ds['time'].values[0] - nc_dataset = nc.Dataset(os.path.join(self.subset_output_dir, sndr_file_name)) + # All timedelta + epoch should be within the given temporal bounds. + assert out_ds.sst_dtime.min() + epoch_dt >= np.datetime64(start_dt) + assert out_ds.sst_dtime.min() + epoch_dt <= np.datetime64(end_dt) - args = { - 'decode_coords': False, - 'mask_and_scale': False, - 'decode_times': False - } - nc_dataset = subset.transform_grouped_dataset(nc_dataset, os.path.join(self.subset_output_dir, sndr_file_name)) - with xr.open_dataset( + +def test_temporal_subset_s6(data_dir, subset_output_dir, request): + """ + Test that a temporal subset results in a granule that only + contains times within the given bounds. + """ + bbox = np.array(((-180, 180), (-90, 90))) + file = 'S6A_P4_2__LR_STD__ST_002_140_20201207T011501_20201207T013023_F00.nc' + # Copy S6 file to temp dir + shutil.copyfile( + os.path.join(data_dir, 'sentinel_6', file), + os.path.join(subset_output_dir, file) + ) + output_file = "{}_{}".format(request.node.name, file) + min_time = '2020-12-07T01:20:00' + max_time = '2020-12-07T01:25:00' + # Actual min is 2020-12-07T01:15:01.000000000 + # Actual max is 2020-12-07T01:30:23.000000000 + + subset.subset( + file_to_subset=join(subset_output_dir, file), + bbox=bbox, + output_file=join(subset_output_dir, output_file), + min_time=min_time, + max_time=max_time + ) + + # Check that all times are within the given bounds. Open + # dataset using 'decode_times=True' for auto-conversions to + # datetime + out_ds = xr.open_dataset( + join(subset_output_dir, output_file), + decode_coords=False, + group='data_01' + ) + + start_dt = subset.translate_timestamp(min_time) + end_dt = subset.translate_timestamp(max_time) + + # All dates should be within the given temporal bounds. 
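The MODIS check earlier in this hunk reconstructs absolute times by adding the sst_dtime offsets to the first decoded 'time' value before comparing against the requested window. A small numpy sketch of that pattern, with made-up values:

import numpy as np

epoch = np.datetime64('2019-08-05T06:57:10')              # stands in for out_ds['time'].values[0]
sst_dtime = np.array([0, 5, 30], dtype='timedelta64[s]')  # per-pixel offsets from that epoch
absolute_times = epoch + sst_dtime

# Mirror the test: the earliest reconstructed time falls inside the requested bounds.
assert absolute_times.min() >= np.datetime64('2019-08-05T06:57:00')
assert absolute_times.min() <= np.datetime64('2019-08-05T06:58:00')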
+ assert (out_ds.time >= pd.to_datetime(start_dt)).all() + assert (out_ds.time <= pd.to_datetime(end_dt)).all() + + +@pytest.mark.parametrize('test_file', TEST_DATA_FILES) +def test_get_time_variable_name(test_file, data_dir, subset_output_dir): + args = { + 'decode_coords': False, + 'mask_and_scale': False, + 'decode_times': True + } + ds, rename_vars, _ = subset.open_as_nc_dataset(os.path.join(data_dir, test_file)) + ds = xr.open_dataset(xr.backends.NetCDF4DataStore(ds), **args) + + lat_var_name = subset.compute_coordinate_variable_names(ds)[0][0] + time_var_name = subset.compute_time_variable_name(ds, ds[lat_var_name]) + + assert time_var_name is not None + assert 'time' in time_var_name + + +def test_subset_jason(data_dir, subset_output_dir, request): + bbox = np.array(((-180, 0), (-90, 90))) + file = 'JA1_GPN_2PeP001_002_20020115_060706_20020115_070316.nc' + output_file = "{}_{}".format(request.node.name, file) + min_time = "2002-01-15T06:07:06Z" + max_time = "2002-01-15T06:30:16Z" + + subset.subset( + file_to_subset=os.path.join(data_dir, file), + bbox=bbox, + min_time=min_time, + max_time=max_time, + output_file=os.path.join(subset_output_dir, output_file) + ) + + +@pytest.mark.parametrize('test_file', TEST_DATA_FILES) +def test_subset_size(test_file, data_dir, subset_output_dir, request): + bbox = np.array(((-180, 0), (-30, 90))) + output_file = "{}_{}".format(request.node.name, test_file) + input_file_path = os.path.join(data_dir, test_file) + output_file_path = os.path.join(subset_output_dir, output_file) + + subset.subset( + file_to_subset=input_file_path, + bbox=bbox, + output_file=output_file_path + ) + + original_file_size = os.path.getsize(input_file_path) + subset_file_size = os.path.getsize(output_file_path) + + assert subset_file_size < original_file_size + + +def test_duplicate_dims_sndr(data_dir, subset_output_dir, request): + """ + Check if SNDR Climcaps files run successfully even though + these files have variables with duplicate dimensions + """ + SNDR_dir = join(data_dir, 'SNDR') + sndr_file = 'SNDR.J1.CRIMSS.20210224T0100.m06.g011.L2_CLIMCAPS_RET.std.v02_28.G.210331064430.nc' + + bbox = np.array(((-180, 90), (-90, 90))) + output_file = "{}_{}".format(request.node.name, sndr_file) + shutil.copyfile( + os.path.join(SNDR_dir, sndr_file), + os.path.join(subset_output_dir, sndr_file) + ) + box_test = subset.subset( + file_to_subset=join(subset_output_dir, sndr_file), + bbox=bbox, + output_file=join(subset_output_dir, output_file), + min_time='2021-02-24T00:50:20Z', + max_time='2021-02-24T01:09:55Z' + ) + # check if the box_test is + + in_nc = nc.Dataset(join(SNDR_dir, sndr_file)) + out_nc = nc.Dataset(join(subset_output_dir, output_file)) + + for var_name, variable in in_nc.variables.items(): + assert in_nc[var_name].shape == out_nc[var_name].shape + + +def test_duplicate_dims_tropomi(data_dir, subset_output_dir, request): + """ + Check if SNDR Climcaps files run successfully even though + these files have variables with duplicate dimensions + """ + TROP_dir = join(data_dir, 'tropomi') + trop_file = 'S5P_OFFL_L2__AER_LH_20210704T005246_20210704T023416_19290_02_020200_20210708T023111.nc' + + bbox = np.array(((-180, 180), (-90, 90))) + output_file = "{}_{}".format(request.node.name, trop_file) + shutil.copyfile( + os.path.join(TROP_dir, trop_file), + os.path.join(subset_output_dir, trop_file) + ) + box_test = subset.subset( + file_to_subset=join(subset_output_dir, trop_file), + bbox=bbox, + output_file=join(subset_output_dir, output_file) + ) + # check if the box_test is 
+ + in_nc = nc.Dataset(join(TROP_dir, trop_file)) + out_nc = nc.Dataset(join(subset_output_dir, output_file)) + + for var_name, variable in in_nc.groups['PRODUCT'].groups['SUPPORT_DATA'].groups[ + 'DETAILED_RESULTS'].variables.items(): + assert variable.shape == \ + out_nc.groups['PRODUCT'].groups['SUPPORT_DATA'].groups['DETAILED_RESULTS'].variables[var_name].shape + + +def test_omi_novars_subset(data_dir, subset_output_dir, request): + """ + Check that the OMI variables are conserved when no variable are specified + the data field and lat/lon are in different groups + """ + omi_dir = join(data_dir, 'OMI') + omi_file = 'OMI-Aura_L2-OMSO2_2020m0116t1207-o82471_v003-2020m0223t142939.he5' + + bbox = np.array(((-180, 90), (-90, 90))) + output_file = "{}_{}".format(request.node.name, omi_file) + shutil.copyfile( + os.path.join(omi_dir, omi_file), + os.path.join(subset_output_dir, omi_file) + ) + box_test = subset.subset( + file_to_subset=join(subset_output_dir, omi_file), + bbox=bbox, + output_file=join(subset_output_dir, output_file), + ) + # check if the box_test is + + in_nc = nc.Dataset(join(omi_dir, omi_file)) + out_nc = nc.Dataset(join(subset_output_dir, output_file)) + + for var_name, variable in in_nc.groups['HDFEOS'].groups['SWATHS'].groups['OMI Total Column Amount SO2'].groups[ + 'Geolocation Fields'].variables.items(): + assert in_nc.groups['HDFEOS'].groups['SWATHS'].groups['OMI Total Column Amount SO2'].groups[ + 'Geolocation Fields'].variables[var_name].shape == \ + out_nc.groups['HDFEOS'].groups['SWATHS'].groups['OMI Total Column Amount SO2'].groups[ + 'Geolocation Fields'].variables[var_name].shape + + +def test_root_group(data_dir, subset_output_dir): + """test that the GROUP_DELIM string, '__', is added to variables in the root group""" + + sndr_file_name = 'SNDR.SNPP.CRIMSS.20200118T0024.m06.g005.L2_CLIMCAPS_RET.std.v02_28.G.200314032326_subset.nc' + shutil.copyfile(os.path.join(data_dir, 'SNDR', sndr_file_name), + os.path.join(subset_output_dir, sndr_file_name)) + + nc_dataset = nc.Dataset(os.path.join(subset_output_dir, sndr_file_name)) + + args = { + 'decode_coords': False, + 'mask_and_scale': False, + 'decode_times': False + } + nc_dataset = subset.transform_grouped_dataset(nc_dataset, os.path.join(subset_output_dir, sndr_file_name)) + with xr.open_dataset( xr.backends.NetCDF4DataStore(nc_dataset), **args - ) as dataset: - var_list = list(dataset.variables) - assert (var_list[0][0:2] == subset.GROUP_DELIM) - group_lst = [] - for var_name in dataset.variables.keys(): #need logic if there is data in the top level not in a group - group_lst.append('/'.join(var_name.split(subset.GROUP_DELIM)[:-1])) - group_lst = ['/' if group=='' else group for group in group_lst] - groups = set(group_lst) - expected_group = {'/mw', '/ave_kern', '/', '/mol_lay', '/aux'} - assert (groups == expected_group) - - def test_get_time_squeeze(self): - """test builtin squeeze method on the lat and time variables so - when the two have the same shape with a time and delta time in - the tropomi product granuales the get_time_variable_name returns delta time as well""" - - tropomi_file_name = 'S5P_OFFL_L2__SO2____20200713T002730_20200713T020900_14239_01_020103_20200721T191355_subset.nc4' - shutil.copyfile(os.path.join(self.test_data_dir, 'tropomi', tropomi_file_name), - os.path.join(self.subset_output_dir, tropomi_file_name)) - - nc_dataset = nc.Dataset(os.path.join(self.subset_output_dir, tropomi_file_name)) - - args = { - 'decode_coords': False, - 'mask_and_scale': False, - 'decode_times': False - } - 
nc_dataset = subset.transform_grouped_dataset(nc_dataset, os.path.join(self.subset_output_dir, tropomi_file_name)) - with xr.open_dataset( + ) as dataset: + var_list = list(dataset.variables) + assert (var_list[0][0:2] == subset.GROUP_DELIM) + group_lst = [] + for var_name in dataset.variables.keys(): # need logic if there is data in the top level not in a group + group_lst.append('/'.join(var_name.split(subset.GROUP_DELIM)[:-1])) + group_lst = ['/' if group == '' else group for group in group_lst] + groups = set(group_lst) + expected_group = {'/mw', '/ave_kern', '/', '/mol_lay', '/aux'} + assert (groups == expected_group) + + +def test_get_time_squeeze(data_dir, subset_output_dir): + """test builtin squeeze method on the lat and time variables so + when the two have the same shape with a time and delta time in + the tropomi product granuales the get_time_variable_name returns delta time as well""" + + tropomi_file_name = 'S5P_OFFL_L2__SO2____20200713T002730_20200713T020900_14239_01_020103_20200721T191355_subset.nc4' + shutil.copyfile(os.path.join(data_dir, 'tropomi', tropomi_file_name), + os.path.join(subset_output_dir, tropomi_file_name)) + + nc_dataset = nc.Dataset(os.path.join(subset_output_dir, tropomi_file_name)) + + args = { + 'decode_coords': False, + 'mask_and_scale': False, + 'decode_times': False + } + nc_dataset = subset.transform_grouped_dataset(nc_dataset, + os.path.join(subset_output_dir, tropomi_file_name)) + with xr.open_dataset( xr.backends.NetCDF4DataStore(nc_dataset), **args - ) as dataset: - lat_var_name = subset.compute_coordinate_variable_names(dataset)[0][0] - time_var_name = subset.compute_time_variable_name(dataset, dataset[lat_var_name]) - lat_dims = dataset[lat_var_name].squeeze().dims - time_dims = dataset[time_var_name].squeeze().dims - assert (lat_dims == time_dims) + ) as dataset: + lat_var_name = subset.compute_coordinate_variable_names(dataset)[0][0] + time_var_name = subset.compute_time_variable_name(dataset, dataset[lat_var_name]) + lat_dims = dataset[lat_var_name].squeeze().dims + time_dims = dataset[time_var_name].squeeze().dims + assert (lat_dims == time_dims) + + +def test_get_indexers_nd(data_dir, subset_output_dir): + """test that the time coordinate is not included in the indexers. Also test that the dimensions are the same for + a global box subset""" + tropomi_file_name = 'S5P_OFFL_L2__SO2____20200713T002730_20200713T020900_14239_01_020103_20200721T191355_subset.nc4' + shutil.copyfile(os.path.join(data_dir, 'tropomi', tropomi_file_name), + os.path.join(subset_output_dir, tropomi_file_name)) + + nc_dataset = nc.Dataset(os.path.join(subset_output_dir, tropomi_file_name)) + + args = { + 'decode_coords': False, + 'mask_and_scale': False, + 'decode_times': False + } + nc_dataset = subset.transform_grouped_dataset(nc_dataset, + os.path.join(subset_output_dir, tropomi_file_name)) + with xr.open_dataset( + xr.backends.NetCDF4DataStore(nc_dataset), + **args + ) as dataset: + time_var_names = [] + lat_var_name = subset.compute_coordinate_variable_names(dataset)[0][0] + lon_var_name = subset.compute_coordinate_variable_names(dataset)[1][0] + time_var_name = subset.compute_time_variable_name(dataset, dataset[lat_var_name]) + oper = operator.and_ - def test_get_indexers_nd(self): - """test that the time coordinate is not included in the indexers. 
Also test that the dimensions are the same for - a global box subset""" - tropomi_file_name = 'S5P_OFFL_L2__SO2____20200713T002730_20200713T020900_14239_01_020103_20200721T191355_subset.nc4' - shutil.copyfile(os.path.join(self.test_data_dir, 'tropomi', tropomi_file_name), - os.path.join(self.subset_output_dir, tropomi_file_name)) + cond = oper( + (dataset[lon_var_name] >= -180), + (dataset[lon_var_name] <= 180) + ) & (dataset[lat_var_name] >= -90) & (dataset[lat_var_name] <= 90) & True - nc_dataset = nc.Dataset(os.path.join(self.subset_output_dir, tropomi_file_name)) + indexers = xre.get_indexers_from_nd(cond, True) + indexed_cond = cond.isel(**indexers) + indexed_ds = dataset.isel(**indexers) + new_dataset = indexed_ds.where(indexed_cond) - args = { - 'decode_coords': False, - 'mask_and_scale': False, - 'decode_times': False - } - nc_dataset = subset.transform_grouped_dataset(nc_dataset, os.path.join(self.subset_output_dir, tropomi_file_name)) - with xr.open_dataset( - xr.backends.NetCDF4DataStore(nc_dataset), - **args - ) as dataset: - time_var_names = [] - lat_var_name = subset.compute_coordinate_variable_names(dataset)[0][0] - lon_var_name = subset.compute_coordinate_variable_names(dataset)[1][0] - time_var_name = subset.compute_time_variable_name(dataset, dataset[lat_var_name]) - oper = operator.and_ - - cond = oper( - (dataset[lon_var_name] >= -180), - (dataset[lon_var_name] <= 180) - ) & (dataset[lat_var_name] >= -90) & (dataset[lat_var_name] <= 90) & True - - indexers = xre.get_indexers_from_nd(cond, True) - indexed_cond = cond.isel(**indexers) - indexed_ds = dataset.isel(**indexers) - new_dataset = indexed_ds.where(indexed_cond) - - assert ((time_var_name not in indexers.keys()) == True) #time can't be in the index - assert (new_dataset.dims == dataset.dims) - - def test_variable_type_string_oco2(self): - """Code passes a ceating a variable that is type object in oco2 file""" - - oco2_file_name = 'oco2_LtCO2_190201_B10206Ar_200729175909s.nc4' - output_file_name = 'oco2_test_out.nc' - shutil.copyfile(os.path.join(self.test_data_dir, 'OCO2', oco2_file_name), - os.path.join(self.subset_output_dir, oco2_file_name)) - bbox = np.array(((-180,180),(-90.0,90))) + assert ((time_var_name not in indexers.keys()) == True) # time can't be in the index + assert (new_dataset.dims == dataset.dims) - subset.subset( - file_to_subset=join(self.test_data_dir, 'OCO2',oco2_file_name), - bbox=bbox, - output_file=join(self.subset_output_dir, output_file_name), - ) - in_nc = xr.open_dataset(join(self.test_data_dir, 'OCO2',oco2_file_name)) - out_nc = xr.open_dataset(join(self.subset_output_dir, output_file_name)) - assert (in_nc.variables['source_files'].dtype == out_nc.variables['source_files'].dtype) - - def test_transform_h5py_dataset(self): - """ - Test that the transformation function results in a correctly - formatted dataset for h5py files - """ - OMI_file_name = 'OMI-Aura_L2-OMSO2_2020m0116t1207-o82471_v003-2020m0223t142939.he5' - shutil.copyfile(os.path.join(self.test_data_dir, 'OMI', OMI_file_name), - os.path.join(self.subset_output_dir, OMI_file_name)) - - h5_ds = h5py.File(os.path.join(self.test_data_dir, 'OMI', OMI_file_name), 'r') - - entry_lst = [] - # Get root level objects - key_lst = list(h5_ds.keys()) - - # Go through every level of the file to fill out the remaining objects - for entry_str in key_lst: - # If object is a group, add it to the loop list - if (isinstance(h5_ds[entry_str],h5py.Group)): - for group_keys in list(h5_ds[entry_str].keys()): - if (isinstance(h5_ds[entry_str + "/" 
+ group_keys], h5py.Dataset)): - entry_lst.append(entry_str + "/" + group_keys) - key_lst.append(entry_str + "/" + group_keys) - - nc_dataset, has_groups = subset.h5file_transform(os.path.join(self.subset_output_dir, OMI_file_name)) - - nc_vars_flattened = list(nc_dataset.variables.keys()) - for i in range(len(entry_lst)): # go through all the datasets in h5py file - input_variable = '__'+entry_lst[i].replace('/', '__') - output_variable = nc_vars_flattened[i] - assert (input_variable == output_variable) - - nc_dataset.close() - h5_ds.close() - - - def test_variable_dims_matched_tropomi(self): - """ - Code must match the dimensions for each variable rather than - assume all dimensions in a group are the same - """ - - tropomi_file_name = 'S5P_OFFL_L2__SO2____20200713T002730_20200713T020900_14239_01_020103_20200721T191355_subset.nc4' - output_file_name = 'tropomi_test_out.nc' - shutil.copyfile(os.path.join(self.test_data_dir, 'tropomi', tropomi_file_name), - os.path.join(self.subset_output_dir, tropomi_file_name)) - - in_nc = nc.Dataset(os.path.join(self.subset_output_dir, tropomi_file_name)) - - # Get variable dimensions from input dataset - in_var_dims = { - var_name: [dim.split(subset.GROUP_DELIM)[-1] for dim in var.dimensions] - for var_name, var in in_nc.groups['PRODUCT'].variables.items() - } - - # Get variables from METADATA group - in_var_dims.update( - { - var_name: [dim.split(subset.GROUP_DELIM)[-1] for dim in var.dimensions] - for var_name, var in in_nc.groups['METADATA'].groups['QA_STATISTICS'].variables.items() - } - ) - # Include PRODUCT>SUPPORT_DATA>GEOLOCATIONS location - in_var_dims.update( - { - var_name: [dim.split(subset.GROUP_DELIM)[-1] for dim in var.dimensions] - for var_name, var in in_nc.groups['PRODUCT'].groups['SUPPORT_DATA'].groups['GEOLOCATIONS'].variables.items() - } - ) +def test_variable_type_string_oco2(data_dir, subset_output_dir): + """Code passes a ceating a variable that is type object in oco2 file""" - out_nc = subset.transform_grouped_dataset( - in_nc, os.path.join(self.subset_output_dir, tropomi_file_name) - ) + oco2_file_name = 'oco2_LtCO2_190201_B10206Ar_200729175909s.nc4' + output_file_name = 'oco2_test_out.nc' + shutil.copyfile(os.path.join(data_dir, 'OCO2', oco2_file_name), + os.path.join(subset_output_dir, oco2_file_name)) + bbox = np.array(((-180, 180), (-90.0, 90))) - # Get variable dimensions from output dataset - out_var_dims = { - var_name.split(subset.GROUP_DELIM)[-1]: [dim.split(subset.GROUP_DELIM)[-1] for dim in var.dimensions] - for var_name, var in out_nc.variables.items() - } + subset.subset( + file_to_subset=join(data_dir, 'OCO2', oco2_file_name), + bbox=bbox, + output_file=join(subset_output_dir, output_file_name), + ) - self.assertDictEqual(in_var_dims, out_var_dims) + in_nc = xr.open_dataset(join(data_dir, 'OCO2', oco2_file_name)) + out_nc = xr.open_dataset(join(subset_output_dir, output_file_name)) + assert (in_nc.variables['source_files'].dtype == out_nc.variables['source_files'].dtype) - def test_temporal_merged_topex(self): - """ - Test that a temporal subset results in a granule that only - contains times within the given bounds. 
- """ - bbox = np.array(((-180, 180), (-90, 90))) - file = 'Merged_TOPEX_Jason_OSTM_Jason-3_Cycle_002.V4_2.nc' - # Copy S6 file to temp dir - shutil.copyfile( - os.path.join(self.test_data_dir, file), - os.path.join(self.subset_output_dir, file) - ) - output_file = "{}_{}".format(self._testMethodName, file) - min_time = '1992-01-01T00:00:00' - max_time = '1992-11-01T00:00:00' - # Actual min is 2020-12-07T01:15:01.000000000 - # Actual max is 2020-12-07T01:30:23.000000000 +def test_transform_h5py_dataset(data_dir, subset_output_dir): + """ + Test that the transformation function results in a correctly + formatted dataset for h5py files + """ + OMI_file_name = 'OMI-Aura_L2-OMSO2_2020m0116t1207-o82471_v003-2020m0223t142939.he5' + shutil.copyfile(os.path.join(data_dir, 'OMI', OMI_file_name), + os.path.join(subset_output_dir, OMI_file_name)) - subset.subset( - file_to_subset=join(self.subset_output_dir, file), - bbox=bbox, - output_file=join(self.subset_output_dir, output_file), - min_time=min_time, - max_time=max_time - ) + h5_ds = h5py.File(os.path.join(data_dir, 'OMI', OMI_file_name), 'r') - # Check that all times are within the given bounds. Open - # dataset using 'decode_times=True' for auto-conversions to - # datetime - out_ds = xr.open_dataset( - join(self.subset_output_dir, output_file), - decode_coords=False - ) + entry_lst = [] + # Get root level objects + key_lst = list(h5_ds.keys()) - start_dt = subset.translate_timestamp(min_time) - end_dt = subset.translate_timestamp(max_time) + # Go through every level of the file to fill out the remaining objects + for entry_str in key_lst: + # If object is a group, add it to the loop list + if (isinstance(h5_ds[entry_str], h5py.Group)): + for group_keys in list(h5_ds[entry_str].keys()): + if (isinstance(h5_ds[entry_str + "/" + group_keys], h5py.Dataset)): + entry_lst.append(entry_str + "/" + group_keys) + key_lst.append(entry_str + "/" + group_keys) - # delta time from the MJD of this data collection - mjd_dt = np.datetime64("1992-01-01") - start_delta_dt = np.datetime64(start_dt) - mjd_dt - end_delta_dt = np.datetime64(end_dt) - mjd_dt + nc_dataset, has_groups = subset.h5file_transform(os.path.join(subset_output_dir, OMI_file_name)) - # All dates should be within the given temporal bounds. 
- assert (out_ds.time.values >= start_delta_dt).all() - assert (out_ds.time.values <= end_delta_dt).all() + nc_vars_flattened = list(nc_dataset.variables.keys()) + for i in range(len(entry_lst)): # go through all the datasets in h5py file + input_variable = '__' + entry_lst[i].replace('/', '__') + output_variable = nc_vars_flattened[i] + assert (input_variable == output_variable) - def test_get_time_epoch_var(self): - """ - Test that get_time_epoch_var method returns the 'time' variable for the tropomi CH4 granule" - """ - bbox = np.array(((-180, 180), (-90, 90))) - tropomi_file = 'S5P_OFFL_L2__CH4____20190319T110835_20190319T125006_07407_01_010202_20190325T125810_subset.nc4' + nc_dataset.close() + h5_ds.close() - shutil.copyfile(os.path.join(self.test_data_dir, 'tropomi', tropomi_file), - os.path.join(self.subset_output_dir, tropomi_file)) +def test_variable_dims_matched_tropomi(data_dir, subset_output_dir): + """ + Code must match the dimensions for each variable rather than + assume all dimensions in a group are the same + """ + + tropomi_file_name = 'S5P_OFFL_L2__SO2____20200713T002730_20200713T020900_14239_01_020103_20200721T191355_subset.nc4' + output_file_name = 'tropomi_test_out.nc' + shutil.copyfile(os.path.join(data_dir, 'tropomi', tropomi_file_name), + os.path.join(subset_output_dir, tropomi_file_name)) - nc_dataset = nc.Dataset(os.path.join(self.subset_output_dir, tropomi_file), mode='r') + in_nc = nc.Dataset(os.path.join(subset_output_dir, tropomi_file_name)) - nc_dataset = subset.transform_grouped_dataset(nc_dataset, os.path.join(self.subset_output_dir, tropomi_file)) + # Get variable dimensions from input dataset + in_var_dims = { + var_name: [dim.split(subset.GROUP_DELIM)[-1] for dim in var.dimensions] + for var_name, var in in_nc.groups['PRODUCT'].variables.items() + } - args = { - 'decode_coords': False, - 'mask_and_scale': False, - 'decode_times': False + # Get variables from METADATA group + in_var_dims.update( + { + var_name: [dim.split(subset.GROUP_DELIM)[-1] for dim in var.dimensions] + for var_name, var in in_nc.groups['METADATA'].groups['QA_STATISTICS'].variables.items() + } + ) + # Include PRODUCT>SUPPORT_DATA>GEOLOCATIONS location + in_var_dims.update( + { + var_name: [dim.split(subset.GROUP_DELIM)[-1] for dim in var.dimensions] + for var_name, var in + in_nc.groups['PRODUCT'].groups['SUPPORT_DATA'].groups['GEOLOCATIONS'].variables.items() } + ) - with xr.open_dataset( - xr.backends.NetCDF4DataStore(nc_dataset), - **args - ) as dataset: + out_nc = subset.transform_grouped_dataset( + in_nc, os.path.join(subset_output_dir, tropomi_file_name) + ) - lat_var_names, lon_var_names = subset.compute_coordinate_variable_names(dataset) - time_var_names = [ - subset.compute_time_variable_name( - dataset, dataset[lat_var_name] - ) for lat_var_name in lat_var_names - ] - epoch_time_var = subset.get_time_epoch_var(dataset, time_var_names[0]) - - assert epoch_time_var.split('__')[-1] == 'time' - - def test_temporal_variable_subset(self): - """ - Test that both a temporal and variable subset can be executed - on a granule, and that all of the data within that granule is - subsetted as expected. 
- """ - bbox = np.array(((-180, 180), (-90, 90))) - file = 'ascat_20150702_084200_metopa_45145_eps_o_250_2300_ovw.l2.nc' - output_file = "{}_{}".format(self._testMethodName, file) - min_time = '2015-07-02T09:00:00' - max_time = '2015-07-02T10:00:00' - variables = [ - 'wind_speed', - 'wind_dir' - ] + # Get variable dimensions from output dataset + out_var_dims = { + var_name.split(subset.GROUP_DELIM)[-1]: [dim.split(subset.GROUP_DELIM)[-1] for dim in var.dimensions] + for var_name, var in out_nc.variables.items() + } - subset.subset( - file_to_subset=join(self.test_data_dir, file), - bbox=bbox, - output_file=join(self.subset_output_dir, output_file), - min_time=min_time, - max_time=max_time, - variables=variables - ) + TestCase().assertDictEqual(in_var_dims, out_var_dims) - in_ds = xr.open_dataset(join(self.test_data_dir, file), - decode_times=False, - decode_coords=False) - - out_ds = xr.open_dataset(join(self.subset_output_dir, output_file), - decode_times=False, - decode_coords=False) - - # Check that 'time' types match - assert in_ds.time.dtype == out_ds.time.dtype - - in_ds.close() - out_ds.close() - - # Check that all times are within the given bounds. Open - # dataset using 'decode_times=True' for auto-conversions to - # datetime - out_ds = xr.open_dataset(join(self.subset_output_dir, output_file), - decode_coords=False) - - start_dt = subset.translate_timestamp(min_time) - end_dt = subset.translate_timestamp(max_time) - - # All dates should be within the given temporal bounds. - assert (out_ds.time >= pd.to_datetime(start_dt)).all() - assert (out_ds.time <= pd.to_datetime(end_dt)).all() - - # Only coordinate variables and variables requested in variable - # subset should be present. - assert set(np.append(['lat', 'lon', 'time'], variables)) == set(out_ds.data_vars.keys()) - - - def test_temporal_he5file_subset(self): - """ - Test that the time type changes to datetime for subsetting - """ - - OMI_file_names = ['OMI-Aura_L2-OMSO2_2020m0116t1207-o82471_v003-2020m0223t142939.he5', - 'OMI-Aura_L2-OMBRO_2020m0116t1207-o82471_v003-2020m0116t182003.he5'] - OMI_copy_file = 'OMI_copy_testing_2.he5' - for i in OMI_file_names: - shutil.copyfile(os.path.join(self.test_data_dir, 'OMI', i), - os.path.join(self.subset_output_dir, OMI_copy_file)) - min_time='2020-01-16T12:30:00Z' - max_time='2020-01-16T12:40:00Z' - bbox = np.array(((-180, 180), (-90, 90))) - nc_dataset, has_groups = subset.h5file_transform(os.path.join(self.subset_output_dir, OMI_copy_file)) - - args = { - 'decode_coords': False, - 'mask_and_scale': False, - 'decode_times': False - } - if min_time or max_time: - args['decode_times'] = True - - with xr.open_dataset( - xr.backends.NetCDF4DataStore(nc_dataset), - **args - ) as dataset: - lat_var_names, lon_var_names, time_var_names = subset.get_coordinate_variable_names( - dataset=dataset, - lat_var_names=None, - lon_var_names=None, - time_var_names=None - ) - if 'BRO' in i: - assert any('utc' in x.lower() for x in time_var_names) - - dataset, start_date = subset.convert_to_datetime(dataset, time_var_names) - assert dataset[time_var_names[0]].dtype == 'datetime64[ns]' - - - def test_he5_timeattrs_output(self): - """Test that the time attributes in the output match the attributes of the input for OMI test files""" - - omi_dir = join(self.test_data_dir, 'OMI') - omi_file = 'OMI-Aura_L2-OMBRO_2020m0116t1207-o82471_v003-2020m0116t182003.he5' - omi_file_input = 'input'+omi_file - bbox = np.array(((-180, 90), (-90, 90))) - output_file = "{}_{}".format(self._testMethodName, omi_file) - 
shutil.copyfile( - os.path.join(omi_dir, omi_file), - os.path.join(self.subset_output_dir, omi_file) - ) - shutil.copyfile( - os.path.join(omi_dir, omi_file), - os.path.join(self.subset_output_dir, omi_file_input) - ) - - min_time='2020-01-16T12:30:00Z' - max_time='2020-01-16T12:40:00Z' - bbox = np.array(((-180, 180), (-90, 90))) - nc_dataset_input = nc.Dataset(os.path.join(self.subset_output_dir, omi_file_input)) - incut_set = nc_dataset_input.groups['HDFEOS'].groups['SWATHS'].groups['OMI Total Column Amount BrO'].groups['Geolocation Fields'] - xr_dataset_input = xr.open_dataset(xr.backends.NetCDF4DataStore(incut_set)) - inattrs = xr_dataset_input['Time'].attrs - - subset.subset( - file_to_subset=os.path.join(self.subset_output_dir, omi_file), - bbox=bbox, - output_file=os.path.join(self.subset_output_dir, output_file), - min_time=min_time, - max_time=max_time - ) +def test_temporal_merged_topex(data_dir, subset_output_dir, request): + """ + Test that a temporal subset results in a granule that only + contains times within the given bounds. + """ + bbox = np.array(((-180, 180), (-90, 90))) + file = 'Merged_TOPEX_Jason_OSTM_Jason-3_Cycle_002.V4_2.nc' + # Copy S6 file to temp dir + shutil.copyfile( + os.path.join(data_dir, file), + os.path.join(subset_output_dir, file) + ) + output_file = "{}_{}".format(request.node.name, file) + min_time = '1992-01-01T00:00:00' + max_time = '1992-11-01T00:00:00' + # Actual min is 2020-12-07T01:15:01.000000000 + # Actual max is 2020-12-07T01:30:23.000000000 + + subset.subset( + file_to_subset=join(subset_output_dir, file), + bbox=bbox, + output_file=join(subset_output_dir, output_file), + min_time=min_time, + max_time=max_time + ) + + # Check that all times are within the given bounds. Open + # dataset using 'decode_times=True' for auto-conversions to + # datetime + out_ds = xr.open_dataset( + join(subset_output_dir, output_file), + decode_coords=False + ) + + start_dt = subset.translate_timestamp(min_time) + end_dt = subset.translate_timestamp(max_time) + + # delta time from the MJD of this data collection + mjd_dt = np.datetime64("1992-01-01") + start_delta_dt = np.datetime64(start_dt) - mjd_dt + end_delta_dt = np.datetime64(end_dt) - mjd_dt + + # All dates should be within the given temporal bounds. 
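In the merged TOPEX/Jason test above, the granule's time values behave as offsets from the collection's 1992-01-01 reference date, so the requested ISO bounds are shifted onto that same offset basis before the comparison that follows. A numpy sketch of the conversion with illustrative offsets:

import numpy as np

epoch = np.datetime64('1992-01-01')                        # collection reference date
time_offsets = np.array([5, 100], dtype='timedelta64[D]')  # illustrative offsets from the epoch

start_delta = np.datetime64('1992-01-01T00:00:00') - epoch
end_delta = np.datetime64('1992-11-01T00:00:00') - epoch

assert (time_offsets >= start_delta).all()
assert (time_offsets <= end_delta).all()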
+ assert (out_ds.time.values >= start_delta_dt).all() + assert (out_ds.time.values <= end_delta_dt).all() + + +def test_get_time_epoch_var(data_dir, subset_output_dir): + """ + Test that get_time_epoch_var method returns the 'time' variable for the tropomi CH4 granule" + """ + bbox = np.array(((-180, 180), (-90, 90))) + tropomi_file = 'S5P_OFFL_L2__CH4____20190319T110835_20190319T125006_07407_01_010202_20190325T125810_subset.nc4' - output_ncdataset = nc.Dataset(os.path.join(self.subset_output_dir, output_file)) - outcut_set = output_ncdataset.groups['HDFEOS'].groups['SWATHS'].groups['OMI Total Column Amount BrO'].groups['Geolocation Fields'] - xrout_dataset = xr.open_dataset(xr.backends.NetCDF4DataStore(outcut_set)) - outattrs = xrout_dataset['Time'].attrs + shutil.copyfile(os.path.join(data_dir, 'tropomi', tropomi_file), + os.path.join(subset_output_dir, tropomi_file)) - for key in inattrs.keys(): - if isinstance(inattrs[key], np.ndarray): - if np.array_equal(inattrs[key],outattrs[key]): - pass - else: - raise AssertionError('Attributes for {} do not equal each other'.format(key)) - else: - assert inattrs[key] == outattrs[key] - - - def test_temporal_subset_lines(self): - bbox = np.array(((-180, 180), (-90, 90))) - file = 'SWOT_L2_LR_SSH_Expert_368_012_20121111T235910_20121112T005015_DG10_01.nc' - output_file = "{}_{}".format(self._testMethodName, file) - min_time = '2012-11-11T23:59:10' - max_time = '2012-11-12T00:20:10' + nc_dataset = nc.Dataset(os.path.join(subset_output_dir, tropomi_file), mode='r') - subset.subset( - file_to_subset=join(self.test_data_dir, file), - bbox=bbox, - output_file=join(self.subset_output_dir, output_file), - min_time=min_time, - max_time=max_time - ) + nc_dataset = subset.transform_grouped_dataset(nc_dataset, os.path.join(subset_output_dir, tropomi_file)) - ds = xr.open_dataset( - join(self.subset_output_dir, output_file), - decode_times=False, - decode_coords=False - ) + args = { + 'decode_coords': False, + 'mask_and_scale': False, + 'decode_times': False + } + + with xr.open_dataset( + xr.backends.NetCDF4DataStore(nc_dataset), + **args + ) as dataset: + lat_var_names, lon_var_names = subset.compute_coordinate_variable_names(dataset) + time_var_names = [ + subset.compute_time_variable_name( + dataset, dataset[lat_var_name] + ) for lat_var_name in lat_var_names + ] + epoch_time_var = subset.get_time_epoch_var(dataset, time_var_names[0]) - assert ds.time.dims != ds.latitude.dims + assert epoch_time_var.split('__')[-1] == 'time' - def test_grouped_empty_subset(self): - """ - Test that an empty subset of a grouped dataset returns 'None' - spatial bounds. - """ - bbox = np.array(((-10, 10), (-10, 10))) - file = 'S6A_P4_2__LR_STD__ST_002_140_20201207T011501_20201207T013023_F00.nc' - output_file = "{}_{}".format(self._testMethodName, file) - shutil.copyfile(os.path.join(self.test_data_dir, 'sentinel_6', file), - os.path.join(self.subset_output_dir, file)) +def test_temporal_variable_subset(data_dir, subset_output_dir, request): + """ + Test that both a temporal and variable subset can be executed + on a granule, and that all of the data within that granule is + subsetted as expected. 
+ """ + bbox = np.array(((-180, 180), (-90, 90))) + file = 'ascat_20150702_084200_metopa_45145_eps_o_250_2300_ovw.l2.nc' + output_file = "{}_{}".format(request.node.name, file) + min_time = '2015-07-02T09:00:00' + max_time = '2015-07-02T10:00:00' + variables = [ + 'wind_speed', + 'wind_dir' + ] + + subset.subset( + file_to_subset=join(data_dir, file), + bbox=bbox, + output_file=join(subset_output_dir, output_file), + min_time=min_time, + max_time=max_time, + variables=variables + ) + + in_ds = xr.open_dataset(join(data_dir, file), + decode_times=False, + decode_coords=False) + + out_ds = xr.open_dataset(join(subset_output_dir, output_file), + decode_times=False, + decode_coords=False) - spatial_bounds = subset.subset( - file_to_subset=join(self.subset_output_dir, file), - bbox=bbox, - output_file=join(self.subset_output_dir, output_file) - ) + # Check that 'time' types match + assert in_ds.time.dtype == out_ds.time.dtype + + in_ds.close() + out_ds.close() + + # Check that all times are within the given bounds. Open + # dataset using 'decode_times=True' for auto-conversions to + # datetime + out_ds = xr.open_dataset(join(subset_output_dir, output_file), + decode_coords=False) - assert spatial_bounds is None + start_dt = subset.translate_timestamp(min_time) + end_dt = subset.translate_timestamp(max_time) - def test_get_time_OMI(self): - """ - Test that code get time variables for OMI .he5 files" - """ - omi_file = 'OMI-Aura_L2-OMSO2_2020m0116t1207-o82471_v003-2020m0223t142939.he5' + # All dates should be within the given temporal bounds. + assert (out_ds.time >= pd.to_datetime(start_dt)).all() + assert (out_ds.time <= pd.to_datetime(end_dt)).all() - shutil.copyfile(os.path.join(self.test_data_dir, 'OMI', omi_file), - os.path.join(self.subset_output_dir, omi_file)) + # Only coordinate variables and variables requested in variable + # subset should be present. 
+ assert set(np.append(['lat', 'lon', 'time'], variables)) == set(out_ds.data_vars.keys()) - nc_dataset, has_groups = subset.h5file_transform(os.path.join(self.subset_output_dir, omi_file)) + +def test_temporal_he5file_subset(data_dir, subset_output_dir): + """ + Test that the time type changes to datetime for subsetting + """ + + OMI_file_names = ['OMI-Aura_L2-OMSO2_2020m0116t1207-o82471_v003-2020m0223t142939.he5', + 'OMI-Aura_L2-OMBRO_2020m0116t1207-o82471_v003-2020m0116t182003.he5'] + OMI_copy_file = 'OMI_copy_testing_2.he5' + for i in OMI_file_names: + shutil.copyfile(os.path.join(data_dir, 'OMI', i), + os.path.join(subset_output_dir, OMI_copy_file)) + min_time = '2020-01-16T12:30:00Z' + max_time = '2020-01-16T12:40:00Z' + bbox = np.array(((-180, 180), (-90, 90))) + nc_dataset, has_groups = subset.h5file_transform(os.path.join(subset_output_dir, OMI_copy_file)) args = { 'decode_coords': False, @@ -1831,187 +1727,321 @@ def test_get_time_OMI(self): 'decode_times': False } + if min_time or max_time: + args['decode_times'] = True + with xr.open_dataset( xr.backends.NetCDF4DataStore(nc_dataset), **args ) as dataset: - time_var_names = [] - lat_var_names, lon_var_names = subset.compute_coordinate_variable_names(dataset) - time_var_names = [ - subset.compute_time_variable_name( - dataset, dataset[lat_var_name] - ) for lat_var_name in lat_var_names - ] - assert "Time" in time_var_names[0] - assert "Latitude" in lat_var_names[0] - - - def test_empty_temporal_subset(self): - """ - Test the edge case where a subsetted empty granule - (due to bbox) is temporally subset, which causes the encoding - step to fail due to size '1' data for each dimension. - """ - # 37.707:38.484 - bbox = np.array(((37.707, 38.484), (-13.265, -12.812))) - file = '20190927000500-JPL-L2P_GHRSST-SSTskin-MODIS_A-D-v02.0-fv01.0.nc' - output_file = "{}_{}".format(self._testMethodName, file) - min_time = '2019-09-01' - max_time = '2019-09-30' + lat_var_names, lon_var_names, time_var_names = subset.get_coordinate_variable_names( + dataset=dataset, + lat_var_names=None, + lon_var_names=None, + time_var_names=None + ) + if 'BRO' in i: + assert any('utc' in x.lower() for x in time_var_names) + + dataset, start_date = subset.convert_to_datetime(dataset, time_var_names) + assert dataset[time_var_names[0]].dtype == 'datetime64[ns]' + + +def test_he5_timeattrs_output(data_dir, subset_output_dir, request): + """Test that the time attributes in the output match the attributes of the input for OMI test files""" + + omi_dir = join(data_dir, 'OMI') + omi_file = 'OMI-Aura_L2-OMBRO_2020m0116t1207-o82471_v003-2020m0116t182003.he5' + omi_file_input = 'input' + omi_file + bbox = np.array(((-180, 90), (-90, 90))) + output_file = "{}_{}".format(request.node.name, omi_file) + shutil.copyfile( + os.path.join(omi_dir, omi_file), + os.path.join(subset_output_dir, omi_file) + ) + shutil.copyfile( + os.path.join(omi_dir, omi_file), + os.path.join(subset_output_dir, omi_file_input) + ) + + min_time = '2020-01-16T12:30:00Z' + max_time = '2020-01-16T12:40:00Z' + bbox = np.array(((-180, 180), (-90, 90))) + nc_dataset_input = nc.Dataset(os.path.join(subset_output_dir, omi_file_input)) + incut_set = nc_dataset_input.groups['HDFEOS'].groups['SWATHS'].groups['OMI Total Column Amount BrO'].groups[ + 'Geolocation Fields'] + xr_dataset_input = xr.open_dataset(xr.backends.NetCDF4DataStore(incut_set)) + inattrs = xr_dataset_input['Time'].attrs + + subset.subset( + file_to_subset=os.path.join(subset_output_dir, omi_file), + bbox=bbox, + 
output_file=os.path.join(subset_output_dir, output_file), + min_time=min_time, + max_time=max_time + ) + + output_ncdataset = nc.Dataset(os.path.join(subset_output_dir, output_file)) + outcut_set = output_ncdataset.groups['HDFEOS'].groups['SWATHS'].groups['OMI Total Column Amount BrO'].groups[ + 'Geolocation Fields'] + xrout_dataset = xr.open_dataset(xr.backends.NetCDF4DataStore(outcut_set)) + outattrs = xrout_dataset['Time'].attrs + + for key in inattrs.keys(): + if isinstance(inattrs[key], np.ndarray): + if np.array_equal(inattrs[key], outattrs[key]): + pass + else: + raise AssertionError('Attributes for {} do not equal each other'.format(key)) + else: + assert inattrs[key] == outattrs[key] - subset.subset( - file_to_subset=join(self.test_data_dir, file), - bbox=bbox, - output_file=join(self.subset_output_dir, output_file), - min_time=min_time, - max_time=max_time - ) - # Check that all times are within the given bounds. Open - # dataset using 'decode_times=True' for auto-conversions to - # datetime - ds = xr.open_dataset( - join(self.subset_output_dir, output_file), - decode_coords=False - ) +def test_temporal_subset_lines(data_dir, subset_output_dir, request): + bbox = np.array(((-180, 180), (-90, 90))) + file = 'SWOT_L2_LR_SSH_Expert_368_012_20121111T235910_20121112T005015_DG10_01.nc' + output_file = "{}_{}".format(request.node.name, file) + min_time = '2012-11-11T23:59:10' + max_time = '2012-11-12T00:20:10' - assert all(dim_size == 1 for dim_size in ds.dims.values()) - - def test_passed_coords(self): - """ - Ensure the coordinates passed in to the subsetter are - utilized and not manually calculated. - """ - file = 'ascat_20150702_084200_metopa_45145_eps_o_250_2300_ovw.l2.nc' - - dataset = xr.open_dataset(join(self.test_data_dir, file), - decode_times=False, - decode_coords=False) - - dummy_lats = ['dummy_lat'] - dummy_lons = ['dummy_lon'] - dummy_times = ['dummy_time'] - - actual_lats = ['lat'] - actual_lons = ['lon'] - actual_times = ['time'] - - # When none are passed in, variables are computed manually - lats, lons, times = subset.get_coordinate_variable_names( - dataset, - lat_var_names=None, - lon_var_names=None, - time_var_names=None - ) + subset.subset( + file_to_subset=join(data_dir, file), + bbox=bbox, + output_file=join(subset_output_dir, output_file), + min_time=min_time, + max_time=max_time + ) - assert lats == actual_lats - assert lons == actual_lons - assert times == actual_times + ds = xr.open_dataset( + join(subset_output_dir, output_file), + decode_times=False, + decode_coords=False + ) - # When lats or lons are passed in, only time is computed manually - # This case is a bit different because the lat values are used to - # compute the time variable so we can't pass in dummy values. 
+ assert ds.time.dims != ds.latitude.dims - lats, lons, times = subset.get_coordinate_variable_names( - dataset, - lat_var_names=actual_lats, - lon_var_names=dummy_lons, - time_var_names=None, - ) - assert lats == actual_lats - assert lons == dummy_lons - assert times == actual_times - # When only time is passed in, lats and lons are computed manually - lats, lons, times = subset.get_coordinate_variable_names( - dataset, - lat_var_names=None, - lon_var_names=None, - time_var_names=dummy_times - ) - assert lats == actual_lats - assert lons == actual_lons - assert times == dummy_times - - # When time, lats, and lons are passed in, nothing is computed manually - lats, lons, times = subset.get_coordinate_variable_names( - dataset, - lat_var_names=dummy_lats, - lon_var_names=dummy_lons, - time_var_names=dummy_times - ) +def test_grouped_empty_subset(data_dir, subset_output_dir, request): + """ + Test that an empty subset of a grouped dataset returns 'None' + spatial bounds. + """ + bbox = np.array(((-10, 10), (-10, 10))) + file = 'S6A_P4_2__LR_STD__ST_002_140_20201207T011501_20201207T013023_F00.nc' + output_file = "{}_{}".format(request.node.name, file) - assert lats == dummy_lats - assert lons == dummy_lons - assert times == dummy_times - - def test_var_subsetting_tropomi(self): - """ - Check that variable subsetting is the same if a leading slash is included - """ - TROP_dir = join(self.test_data_dir, 'tropomi') - trop_file = 'S5P_OFFL_L2__CH4____20190319T110835_20190319T125006_07407_01_010202_20190325T125810_subset.nc4' - variable_slash = ['/PRODUCT/methane_mixing_ratio'] - variable_noslash = ['PRODUCT/methane_mixing_ratio'] - bbox = np.array(((-180, 180), (-90, 90))) - output_file_slash = "{}_{}".format(self._testMethodName, trop_file) - output_file_noslash = "{}_noslash_{}".format(self._testMethodName, trop_file) - shutil.copyfile( - os.path.join(TROP_dir, trop_file), - os.path.join(self.subset_output_dir, trop_file) - ) - shutil.copyfile( - os.path.join(TROP_dir, trop_file), - os.path.join(self.subset_output_dir,'slashtest'+trop_file) - ) - slash_test = subset.subset( - file_to_subset=join(self.subset_output_dir, trop_file), - bbox=bbox, - output_file=join(self.subset_output_dir, output_file_slash), - variables = variable_slash - ) - noslash_test = subset.subset( - file_to_subset=join(self.subset_output_dir, 'slashtest'+trop_file), - bbox=bbox, - output_file=join(self.subset_output_dir, output_file_noslash), - variables = variable_noslash - ) + shutil.copyfile(os.path.join(data_dir, 'sentinel_6', file), + os.path.join(subset_output_dir, file)) - slash_dataset = nc.Dataset(join(self.subset_output_dir, output_file_slash)) - noslash_dataset = nc.Dataset(join(self.subset_output_dir, output_file_noslash)) - - assert list(slash_dataset.groups['PRODUCT'].variables) == list(noslash_dataset.groups['PRODUCT'].variables) - def test_bad_time_unit(self): - - fill_val = -99999.0 - time_vals = np.random.rand(10) - time_vals[0] = fill_val - time_vals[-1] = fill_val - - data_vars = { - 'foo': (['x'], np.random.rand(10)), - 'time': ( - ['x'], - time_vals, - { - 'units': 'seconds since 2000-1-1 0:0:0 0', - '_FillValue': fill_val, - 'standard_name': 'time', - 'calendar': 'standard' - } - ), - } + spatial_bounds = subset.subset( + file_to_subset=join(subset_output_dir, file), + bbox=bbox, + output_file=join(subset_output_dir, output_file) + ) - ds = xr.Dataset( - data_vars=data_vars, - coords={'x': (['x'], np.arange(10))} - ) + assert spatial_bounds is None + + +def test_get_time_OMI(data_dir, 
subset_output_dir):
+    """
+    Test that the code gets time variables for OMI .he5 files
+    """
+    omi_file = 'OMI-Aura_L2-OMSO2_2020m0116t1207-o82471_v003-2020m0223t142939.he5'
+
+    shutil.copyfile(os.path.join(data_dir, 'OMI', omi_file),
+                    os.path.join(subset_output_dir, omi_file))
+
+    nc_dataset, has_groups = subset.h5file_transform(os.path.join(subset_output_dir, omi_file))
+
+    args = {
+        'decode_coords': False,
+        'mask_and_scale': False,
+        'decode_times': False
+    }
+
+    with xr.open_dataset(
+            xr.backends.NetCDF4DataStore(nc_dataset),
+            **args
+    ) as dataset:
+        time_var_names = []
+        lat_var_names, lon_var_names = subset.compute_coordinate_variable_names(dataset)
+        time_var_names = [
+            subset.compute_time_variable_name(
+                dataset, dataset[lat_var_name]
+            ) for lat_var_name in lat_var_names
+        ]
+        assert "Time" in time_var_names[0]
+        assert "Latitude" in lat_var_names[0]
+
+
+def test_empty_temporal_subset(data_dir, subset_output_dir, request):
+    """
+    Test the edge case where a subsetted empty granule
+    (due to bbox) is temporally subset, which causes the encoding
+    step to fail due to size '1' data for each dimension.
+    """
+    # 37.707:38.484
+    bbox = np.array(((37.707, 38.484), (-13.265, -12.812)))
+    file = '20190927000500-JPL-L2P_GHRSST-SSTskin-MODIS_A-D-v02.0-fv01.0.nc'
+    output_file = "{}_{}".format(request.node.name, file)
+    min_time = '2019-09-01'
+    max_time = '2019-09-30'
+
+    subset.subset(
+        file_to_subset=join(data_dir, file),
+        bbox=bbox,
+        output_file=join(subset_output_dir, output_file),
+        min_time=min_time,
+        max_time=max_time
+    )
+
+    # Check that all times are within the given bounds. Open
+    # dataset using 'decode_times=True' for auto-conversions to
+    # datetime
+    ds = xr.open_dataset(
+        join(subset_output_dir, output_file),
+        decode_coords=False
+    )
+
+    assert all(dim_size == 1 for dim_size in ds.dims.values())
+
+
+def test_passed_coords(data_dir, subset_output_dir):
+    """
+    Ensure the coordinates passed in to the subsetter are
+    utilized and not manually calculated.
+    """
+    file = 'ascat_20150702_084200_metopa_45145_eps_o_250_2300_ovw.l2.nc'
+
+    dataset = xr.open_dataset(join(data_dir, file),
+                              decode_times=False,
+                              decode_coords=False)
+
+    dummy_lats = ['dummy_lat']
+    dummy_lons = ['dummy_lon']
+    dummy_times = ['dummy_time']
+
+    actual_lats = ['lat']
+    actual_lons = ['lon']
+    actual_times = ['time']
+
+    # When none are passed in, variables are computed manually
+    lats, lons, times = subset.get_coordinate_variable_names(
+        dataset,
+        lat_var_names=None,
+        lon_var_names=None,
+        time_var_names=None
+    )
+
+    assert lats == actual_lats
+    assert lons == actual_lons
+    assert times == actual_times
+
+    # When lats or lons are passed in, only time is computed manually
+    # This case is a bit different because the lat values are used to
+    # compute the time variable so we can't pass in dummy values.
+ + lats, lons, times = subset.get_coordinate_variable_names( + dataset, + lat_var_names=actual_lats, + lon_var_names=dummy_lons, + time_var_names=None, + ) + + assert lats == actual_lats + assert lons == dummy_lons + assert times == actual_times + # When only time is passed in, lats and lons are computed manually + lats, lons, times = subset.get_coordinate_variable_names( + dataset, + lat_var_names=None, + lon_var_names=None, + time_var_names=dummy_times + ) + assert lats == actual_lats + assert lons == actual_lons + assert times == dummy_times + + # When time, lats, and lons are passed in, nothing is computed manually + lats, lons, times = subset.get_coordinate_variable_names( + dataset, + lat_var_names=dummy_lats, + lon_var_names=dummy_lons, + time_var_names=dummy_times + ) + + assert lats == dummy_lats + assert lons == dummy_lons + assert times == dummy_times + + +def test_var_subsetting_tropomi(data_dir, subset_output_dir, request): + """ + Check that variable subsetting is the same if a leading slash is included + """ + trop_dir = join(data_dir, 'tropomi') + trop_file = 'S5P_OFFL_L2__CH4____20190319T110835_20190319T125006_07407_01_010202_20190325T125810_subset.nc4' + variable_slash = ['/PRODUCT/methane_mixing_ratio'] + variable_noslash = ['PRODUCT/methane_mixing_ratio'] + bbox = np.array(((-180, 180), (-90, 90))) + output_file_slash = "{}_{}".format(request.node.name, trop_file) + output_file_noslash = "{}_noslash_{}".format(request.node.name, trop_file) + shutil.copyfile( + os.path.join(trop_dir, trop_file), + os.path.join(subset_output_dir, trop_file) + ) + shutil.copyfile( + os.path.join(trop_dir, trop_file), + os.path.join(subset_output_dir, 'slashtest' + trop_file) + ) + subset.subset( + file_to_subset=join(subset_output_dir, trop_file), + bbox=bbox, + output_file=join(subset_output_dir, output_file_slash), + variables=variable_slash + ) + subset.subset( + file_to_subset=join(subset_output_dir, 'slashtest' + trop_file), + bbox=bbox, + output_file=join(subset_output_dir, output_file_noslash), + variables=variable_noslash + ) + + slash_dataset = nc.Dataset(join(subset_output_dir, output_file_slash)) + noslash_dataset = nc.Dataset(join(subset_output_dir, output_file_noslash)) + + assert list(slash_dataset.groups['PRODUCT'].variables) == list(noslash_dataset.groups['PRODUCT'].variables) + + +def test_bad_time_unit(subset_output_dir): + fill_val = -99999.0 + time_vals = np.random.rand(10) + time_vals[0] = fill_val + time_vals[-1] = fill_val + + data_vars = { + 'foo': (['x'], np.random.rand(10)), + 'time': ( + ['x'], + time_vals, + { + 'units': 'seconds since 2000-1-1 0:0:0 0', + '_FillValue': fill_val, + 'standard_name': 'time', + 'calendar': 'standard' + } + ), + } - nc_out_location = join(self.subset_output_dir, "bad_time.nc") - ds.to_netcdf(nc_out_location) + ds = xr.Dataset( + data_vars=data_vars, + coords={'x': (['x'], np.arange(10))} + ) - subset.override_decode_cf_datetime() + nc_out_location = join(subset_output_dir, "bad_time.nc") + ds.to_netcdf(nc_out_location) - ds_test = xr.open_dataset(nc_out_location) - ds_test.close() + subset.override_decode_cf_datetime() + ds_test = xr.open_dataset(nc_out_location) + ds_test.close()
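
Note on the pytest conversion above: the rewritten tests take `data_dir`, `subset_output_dir`, and pytest's built-in `request` fixture in place of the old `self.test_data_dir`, `self.subset_output_dir`, and `self._testMethodName` attributes. The fixture definitions are not part of this hunk; the sketch below is only an illustration of the assumed mechanism, and the fixture bodies (paths, scopes, cleanup) are hypothetical rather than the repository's actual conftest.py.

# conftest.py (illustrative sketch only -- not part of this diff; the fixture
# names match the test signatures above, but these bodies are assumptions)
import shutil
import tempfile
from os.path import dirname, join, realpath

import pytest


@pytest.fixture(scope='session')
def data_dir():
    """Directory containing the test granules (assumed layout)."""
    return join(dirname(realpath(__file__)), 'data')


@pytest.fixture
def subset_output_dir():
    """Scratch directory for subsetter output, one per test, removed afterwards."""
    output_dir = tempfile.mkdtemp()
    yield output_dir
    shutil.rmtree(output_dir, ignore_errors=True)

Each converted test then builds a unique output filename from `request.node.name`, pytest's counterpart to unittest's `self._testMethodName`.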