perform flattening as necessary in test suite (#130)
* move methods for flattening netcdf and hdf group structures to a separate module

* feature/PODAAC-5065 (#129)

* fix the way xarray opens granules that have `seconds since 2000-1-1 0:0:0 0` as a time unit

* fix pylint

* change function to use the original function when the units can be parsed; only change the units when they cannot be parsed

* make xarray override into its own function

* add test for override_decode_cf_datetime function

* disable pylint on one line instead of globally

* Update podaac/subsetter/subset.py

Co-authored-by: Frank Greguska <[email protected]>

* add missing parameter to docstring

* typo in docstring

* extract netcdf opening procedure from the beginning of `subset()` into a new function

* update tests to use netcdf opening wrapper function, to prevent errors with tempo data

* /version 2.3.0-alpha.5

* update `test_specified_variables()` to use netcdf opening wrapper function in multiple places, to prevent errors with tempo data

* cosmetic

* clean up comment and use `decode_times=True` for test

* feature/issue 126 (#131)

* Add variable leading slash flexibility

* Add tests back to test file

* changelog added and updated

* Update podaac/subsetter/subset.py

Co-authored-by: Frank Greguska <[email protected]>

* update Syntax

* resolve conflict

Co-authored-by: nlensse1 <[email protected]>
Co-authored-by: Frank Greguska <[email protected]>

* /version 2.3.0-alpha.6

* Update build-pipeline.yml

* /version 2.3.0-alpha.7

* Merge changes from origin/develop

* Merge changes from issues/127

Co-authored-by: sliu008 <[email protected]>
Co-authored-by: Frank Greguska <[email protected]>
Co-authored-by: l2ss-py bot <[email protected]>
Co-authored-by: Nick Lenssen <[email protected]>
Co-authored-by: nlensse1 <[email protected]>
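
The time-unit fix referenced above (`override_decode_cf_datetime`) wraps xarray's CF datetime decoder so the original parser is used whenever it succeeds. Below is a minimal sketch of that approach, assuming the malformed unit can be repaired by dropping its trailing token; this is illustrative, not the repository's exact implementation.

import xarray as xr

_original_decode = xr.coding.times.decode_cf_datetime

def override_decode_cf_datetime(num_dates, units, calendar=None, use_cftime=None):
    try:
        # Defer to the original xarray decoder whenever the units parse cleanly.
        return _original_decode(num_dates, units, calendar, use_cftime)
    except ValueError:
        # Assumed repair: drop the trailing token, e.g.
        # 'seconds since 2000-1-1 0:0:0 0' -> 'seconds since 2000-1-1 0:0:0'
        return _original_decode(num_dates, units.rsplit(' ', 1)[0], calendar, use_cftime)

xr.coding.times.decode_cf_datetime = override_decode_cf_datetime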
6 people committed Dec 13, 2022
1 parent 3728c7d commit f8122d4
Showing 7 changed files with 528 additions and 427 deletions.
13 changes: 3 additions & 10 deletions .github/workflows/build-pipeline.yml
@@ -184,7 +184,7 @@ jobs:
       - name: Extract metadata (tags, labels) for Docker
         if: ${{ !startsWith(github.ref, 'refs/heads/feature') }}
         id: meta
-        uses: docker/metadata-action@v3
+        uses: docker/metadata-action@v4
         with:
           images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
           tags: |
@@ -197,7 +197,7 @@ jobs:
           ${GITHUB_WORKSPACE}/.github/workflows/wait-for-pypi.py ${{env.pyproject_name}}[harmony]==${{ env.software_version }}
       - name: Build and push Docker image
         if: ${{ !startsWith(github.ref, 'refs/heads/feature') }}
-        uses: docker/build-push-action@v2
+        uses: docker/build-push-action@v3
         with:
           context: .
           file: docker/Dockerfile
@@ -216,13 +216,6 @@ jobs:
         env:
           SNYK_TOKEN: ${{ secrets.SNYK_TOKEN }}
         with:
-          image: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ env.software_version }}
+          image: ${{ steps.meta.outputs.tags[0] }}
           args: >
             --severity-threshold=high
             --file=./docker/Dockerfile
             --sarif-file-output=docker.sarif
-      - name: Upload result to GitHub Code Scanning
-        if: ${{ !startsWith(github.ref, 'refs/heads/feature') }}
-        uses: github/codeql-action/upload-sarif@v2
-        with:
-          sarif_file: ./
3 changes: 3 additions & 0 deletions CHANGELOG.md
@@ -6,10 +6,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html)

## [Unreleased]
### Added
+- [issue/126](https://github.com/podaac/l2ss-py/issues/126): Added flexibility to variable subsetting
+  so that variable names need not have a leading slash
### Changed
### Deprecated
### Removed
### Fixed
+- PODAAC-5065: integration with SMAP_RSS_L2_SSS_V5; fixed the way xarray opens granules that have `seconds since 2000-1-1 0:0:0 0` as a time unit.
### Security

## [2.2.0]
232 changes: 232 additions & 0 deletions podaac/subsetter/group_handling.py
@@ -0,0 +1,232 @@
from shutil import copy

import h5py
import netCDF4 as nc
import numpy as np
import xarray as xr

GROUP_DELIM = '__'


def transform_grouped_dataset(nc_dataset, file_to_subset):
    """
    Transform a netCDF4 Dataset that has groups into an xarray-compatible
    dataset. xarray does not work with groups, so this transformation
    will flatten the variables in the dataset and use the group path as
    the new variable name. For example, data_01 > km > sst would become
    'data_01__km__sst', where GROUP_DELIM is __.

    This same pattern is applied to dimensions, which are located under
    the appropriate group. They are renamed and placed in the root
    group.

    Parameters
    ----------
    nc_dataset : nc.Dataset
        netCDF4 Dataset that contains groups
    file_to_subset : str
        Path to the file that backs nc_dataset; it is reopened in
        append mode so the flattening can modify it in place.

    Returns
    -------
    nc.Dataset
        netCDF4 Dataset that does not contain groups and that has been
        flattened.
    """

    # Close the existing read-only dataset and reopen in append mode
    nc_dataset.close()
    nc_dataset = nc.Dataset(file_to_subset, 'r+')

    dimensions = {}

    def walk(group_node, path):
        for key, item in group_node.items():
            group_path = f'{path}{GROUP_DELIM}{key}'

            # If there are variables in this group, copy to root group
            # and then delete from current group
            if item.variables:
                # Copy variables to root group with new name
                for var_name, var in item.variables.items():
                    var_group_name = f'{group_path}{GROUP_DELIM}{var_name}'
                    nc_dataset.variables[var_group_name] = var
                # Delete variables
                var_names = list(item.variables.keys())
                for var_name in var_names:
                    del item.variables[var_name]

            if item.dimensions:
                dims = list(item.dimensions.keys())
                for dim_name in dims:
                    new_dim_name = f'{group_path.replace("/", GROUP_DELIM)}{GROUP_DELIM}{dim_name}'
                    item.dimensions[new_dim_name] = item.dimensions[dim_name]
                    dimensions[new_dim_name] = item.dimensions[dim_name]
                    item.renameDimension(dim_name, new_dim_name)

            # If there are subgroups in this group, call this function
            # again on that group.
            if item.groups:
                walk(item.groups, group_path)

        # Delete non-root groups
        group_names = list(group_node.keys())
        for group_name in group_names:
            del group_node[group_name]

    for var_name in list(nc_dataset.variables.keys()):
        new_var_name = f'{GROUP_DELIM}{var_name}'
        nc_dataset.variables[new_var_name] = nc_dataset.variables[var_name]
        del nc_dataset.variables[var_name]

    walk(nc_dataset.groups, '')

    # Update the dimensions of the dataset in the root group
    nc_dataset.dimensions.update(dimensions)

    return nc_dataset


def recombine_grouped_datasets(datasets, output_file, start_date):  # pylint: disable=too-many-branches
    """
    Given a list of xarray datasets, combine those datasets into a
    single netCDF4 Dataset and write it to disk. Each dataset has been
    transformed using its group path and needs to be un-transformed and
    placed in the appropriate group.

    Parameters
    ----------
    datasets : list (xr.Dataset)
        List of xarray datasets to be combined
    output_file : str
        Name of the output file to write the resulting NetCDF file to.
    start_date : datetime-like
        If given, datetime variables are written out as seconds since
        this date.
    """

    base_dataset = nc.Dataset(output_file, mode='w')

    for dataset in datasets:
        group_lst = []
        for var_name in dataset.variables.keys():  # need logic if there is data in the top level not in a group
            group_lst.append('/'.join(var_name.split(GROUP_DELIM)[:-1]))
        group_lst = ['/' if group == '' else group for group in group_lst]
        groups = set(group_lst)
        for group in groups:
            base_dataset.createGroup(group)

        for dim_name in list(dataset.dims.keys()):
            new_dim_name = dim_name.split(GROUP_DELIM)[-1]
            dim_group = _get_nested_group(base_dataset, dim_name)
            dim_group.createDimension(new_dim_name, dataset.dims[dim_name])

        # Rename variables
        _rename_variables(dataset, base_dataset, start_date)

    # Remove group vars from base dataset
    for var_name in list(base_dataset.variables.keys()):
        if GROUP_DELIM in var_name:
            del base_dataset.variables[var_name]

    # Remove group dims from base dataset
    for dim_name in list(base_dataset.dimensions.keys()):
        if GROUP_DELIM in dim_name:
            del base_dataset.dimensions[dim_name]

    # Copy global attributes
    base_dataset.setncatts(datasets[0].attrs)
    # Write and close
    base_dataset.close()


def _get_nested_group(dataset, group_path):
    nested_group = dataset
    for group in group_path.strip(GROUP_DELIM).split(GROUP_DELIM)[:-1]:
        nested_group = nested_group.groups[group]
    return nested_group


def _rename_variables(dataset, base_dataset, start_date):
    for var_name in list(dataset.variables.keys()):
        new_var_name = var_name.split(GROUP_DELIM)[-1]
        var_group = _get_nested_group(base_dataset, var_name)
        variable = dataset.variables[var_name]
        var_dims = [x.split(GROUP_DELIM)[-1] for x in dataset.variables[var_name].dims]
        if np.issubdtype(
                dataset.variables[var_name].dtype, np.dtype(np.datetime64)
        ) or np.issubdtype(
                dataset.variables[var_name].dtype, np.dtype(np.timedelta64)
        ):
            if start_date:
                dataset.variables[var_name].values = (dataset.variables[var_name].values - np.datetime64(start_date)) / np.timedelta64(1, 's')
                variable = dataset.variables[var_name]
            else:
                cf_dt_coder = xr.coding.times.CFDatetimeCoder()
                encoded_var = cf_dt_coder.encode(dataset.variables[var_name])
                variable = encoded_var

        var_attrs = variable.attrs
        fill_value = var_attrs.get('_FillValue')
        var_attrs.pop('_FillValue', None)
        comp_args = {"zlib": True, "complevel": 1}

        if variable.dtype == object:
            var_group.createVariable(new_var_name, 'S1', var_dims, fill_value=fill_value, **comp_args)
        elif variable.dtype == 'timedelta64[ns]':
            var_group.createVariable(new_var_name, 'i4', var_dims, fill_value=fill_value, **comp_args)
        else:
            var_group.createVariable(new_var_name, variable.dtype, var_dims, fill_value=fill_value, **comp_args)

        # Copy attributes
        var_group.variables[new_var_name].setncatts(var_attrs)

        # Copy data
        var_group.variables[new_var_name].set_auto_maskandscale(False)
        var_group.variables[new_var_name][:] = variable.data


def h5file_transform(finput):
    """
    Transform an h5py Dataset that has groups into an xarray-compatible
    dataset. xarray does not work with groups, so this transformation
    will flatten the variables in the dataset and use the group path as
    the new variable name. For example, data_01 > km > sst would become
    'data_01__km__sst', where GROUP_DELIM is __.

    Parameters
    ----------
    finput : str
        Path to the HDF5 file to flatten.

    Returns
    -------
    nc.Dataset, bool
        netCDF4 Dataset that does not contain groups and that has been
        flattened, and a flag indicating whether the input had groups.
    """
    data_new = h5py.File(finput, 'r+')
    del_group_list = list(data_new.keys())
    has_groups = bool(data_new['/'])

    def walk_h5py(data_new, group):
        # flattens h5py file
        for key, item in data_new[group].items():
            group_path = f'{group}{key}'
            if isinstance(item, h5py.Dataset):
                new_var_name = group_path.replace('/', '__')

                data_new[new_var_name] = data_new[group_path]
                del data_new[group_path]

            elif isinstance(item, h5py.Group):
                if len(list(item.keys())) == 0:
                    new_group_name = group_path.replace('/', '__')
                    data_new[new_group_name] = data_new[group_path]

                walk_h5py(data_new, data_new[group_path].name + '/')

    walk_h5py(data_new, data_new.name)

    for del_group in del_group_list:
        del data_new[del_group]

    finputnc = '.'.join(finput.split('.')[:-1]) + '.nc'

    data_new.close()  # close the h5py dataset
    copy(finput, finputnc)  # copy to a nc file

    nc_dataset = nc.Dataset(finputnc, mode='r')

    return nc_dataset, has_groups
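
As a rough sketch of how the helpers in this new module fit together (the granule file names below are hypothetical, and opening the flattened dataset through xr.backends.NetCDF4DataStore is an assumption, not necessarily the exact call sequence used in subset.py):

import netCDF4 as nc
import xarray as xr

from podaac.subsetter.group_handling import (
    transform_grouped_dataset,
    recombine_grouped_datasets,
)

# Flatten a grouped granule so xarray can open it; a variable stored at
# data_01/km/sst becomes '__data_01__km__sst' in the root group.
# Note: transform_grouped_dataset modifies granule.nc in place.
nc_dataset = nc.Dataset('granule.nc', 'r')
flattened = transform_grouped_dataset(nc_dataset, 'granule.nc')

with xr.open_dataset(xr.backends.NetCDF4DataStore(flattened), decode_times=False) as dataset:
    # ... subsetting would happen here ...
    # Un-flatten the (possibly subset) dataset back into a grouped file.
    recombine_grouped_datasets([dataset], 'subsetted.nc', None)

h5file_transform plays the same role for HDF5 inputs, converting the flattened file to netCDF and returning the open dataset along with a flag indicating whether the input contained groups.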
