Skip to content

Commit

Permalink
Updates to making QC Summary files (#852)
Browse files Browse the repository at this point in the history
* Not creating QC variable for time variables.

* Updates to make writing more CF compliant.

* Updated how to handle _FillValue. Changed all Xarray Datasets to ds. Improved how history attribute is modified.

* Adding option to suppress adding QC variables. Checking if time is numpy.datetime64. If not will convert to work with method to add DQR information.

* Adding option to remove QC variable attributes. Updated method to add info to history attribute. Removed command that updates original Dataset.

* Adding a method to ensure datatype is datetime64.

* Returning order to correct format for making copy. Returning .update() method to ensure the attribute removal takes hold.

* Changing the default from Internal QC Assessment terms to DQR Assessment terms.

* Adding option to normalize assessment terms used.

* Adding option to set the missing value indicator to be a value other than NaN.

* Changed to use Suspect and Incorrect

* Updated to match new default values for normalized assessments

* Removing commented code.

* Improving the datafilter test. Checking ancillary_variables attribute. Checking rm_assessments keyword.

* Adding more testing to qc_summary.

* Updated to handle flag_assessments outside the standard 4. Correctly skips scalar qc variable.

* Catching warning with pytest to ensure the warning was issued

* Catching warning with pytest to ensure the warning was issued

* Catching warning with pytest to ensure the warning was issued

* Catching warning with pytest to ensure the warning was issued

* Catching warning with pytest to ensure the warning was issued

* Catching warning with pytest to ensure the warning was issued

* Catching warning with pytest to ensure the warning was issued

* DOC: Fix spelling

* DOC: Fix spelling

* DOC: Fix spelling

* MTN: Remove old code

* MTN: Remove old code

* MNT: Remove old code

* MNT: Remove old code

* MNT: Remove old code

---------

Co-authored-by: Zach Sherman <[email protected]>
  • Loading branch information
kenkehoe and zssherman authored Sep 4, 2024
1 parent eed7afd commit 9217936
Show file tree
Hide file tree
Showing 14 changed files with 435 additions and 191 deletions.
151 changes: 77 additions & 74 deletions act/io/arm.py
Original file line number Diff line number Diff line change
Expand Up @@ -549,8 +549,9 @@ def write_netcdf(
make_copy=True,
cf_compliant=False,
delete_global_attrs=['qc_standards_version', 'qc_method', 'qc_comment'],
FillValue=-9999,
FillValue=True,
cf_convention='CF-1.8',
encoding={},
**kwargs,
):
"""
Expand All @@ -573,7 +574,8 @@ def write_netcdf(
white space between words.
join_char : str
The character sting to use for replacing white spaces between words when converting
a list of strings to single character string attributes.
a list of strings to single character string attributes. Main use is with the
flag_meanings attribute.
make_copy : boolean
Make a copy before modifying Dataset to write. For large Datasets this
may add processing time and memory. If modifying the Dataset is OK
Expand All @@ -587,14 +589,18 @@ def write_netcdf(
Optional global attributes to be deleted. Defaults to some standard
QC attributes that are not needed. Can add more or set to None to not
remove the attributes.
FillValue : int, float
The value to use as a _FillValue in output file. This is used to fix
issues with how Xarray handles missing_value upon reading. It's confusing
so not a perfect fix. Set to None to leave Xarray to do what it wants.
Set to a value to be the value used as _FillValue in the file and data
array. This should then remove missing_value attribute from the file as well.
FillValue : boolean
Xarray assumes all float type variables had the missing value indicator converted
to NaN upon reading. to_netcdf() will then write a _FillValue attribute set to NaN.
Set FillValue to False to supress adding the _FillValue=NaN variable attribute to
the written file. Set to True to allow to_netcdf() to add the attribute.
If the Dataset variable already has a _FillValue attribute or a _FillValue key
is provided in the encoding dictionary those will not be changed and a _FillValue
will be written to NetCDF file.
cf_convention : str
The Climate and Forecast convention string to add to Conventions attribute.
encoding : dict
The encoding dictionary used with to_netcdf() method.
**kwargs : keywords
Keywords to pass through to Dataset.to_netcdf()
Expand All @@ -607,114 +613,118 @@ def write_netcdf(
"""

if make_copy:
write_ds = copy.deepcopy(self._ds)
ds = copy.deepcopy(self._ds)
else:
write_ds = self._ds
ds = self._ds

encoding = {}
if cleanup_global_atts:
for attr in list(write_ds.attrs):
for attr in list(ds.attrs):
if attr.startswith('_'):
del write_ds.attrs[attr]
del ds.attrs[attr]

if cleanup_qc_atts:
check_atts = ['flag_meanings', 'flag_assessments']
for var_name in list(write_ds.data_vars):
if 'standard_name' not in write_ds[var_name].attrs.keys():
for var_name in list(ds.data_vars):
if 'standard_name' not in ds[var_name].attrs.keys():
continue

if ds[var_name].attrs['standard_name'] != "quality_flag":
continue

for attr_name in check_atts:
try:
att_values = write_ds[var_name].attrs[attr_name]
att_values = ds[var_name].attrs[attr_name]
if isinstance(att_values, (list, tuple)):
att_values = [
att_value.replace(' ', join_char) for att_value in att_values
]
write_ds[var_name].attrs[attr_name] = ' '.join(att_values)
ds[var_name].attrs[attr_name] = ' '.join(att_values)

except KeyError:
pass

# Tell .to_netcdf() to not add a _FillValue attribute for
# quality control variables.
if FillValue is not None:
encoding[var_name] = {'_FillValue': None}
# Xarray makes an assumption that float type variables were read in and converted
# missing value indicator to NaN. .to_netcdf() will then automatically assign
# _FillValue attribute set to NaN when writing. If requested will set _FillValue
# key in encoding to None which will supress to_netcdf() from adding a _FillValue.
# If _FillValue attribute or _FillValue key in encoding is already set, will not
# override and the _FillValue will be written to the file.
if not FillValue:
all_var_names = list(ds.coords.keys()) + list(ds.data_vars)
for var_name in all_var_names:
if '_FillValue' in ds[var_name].attrs:
continue

# Clean up _FillValue vs missing_value mess by creating an
# encoding dictionary with each variable's _FillValue set to
# requested fill value. May need to improve upon this for data type
# and other issues in the future.
if FillValue is not None:
skip_variables = ['base_time', 'time_offset', 'qc_time'] + list(encoding.keys())
for var_name in list(write_ds.data_vars):
if var_name not in skip_variables:
encoding[var_name] = {'_FillValue': FillValue}
if var_name not in encoding.keys():
encoding[var_name] = {'_FillValue': None}
elif '_FillValue' not in encoding[var_name].keys():
encoding[var_name]['_FillValue'] = None

if delete_global_attrs is not None:
for attr in delete_global_attrs:
try:
del write_ds.attrs[attr]
del ds.attrs[attr]
except KeyError:
pass

for var_name in list(write_ds.keys()):
if 'string' in list(write_ds[var_name].attrs.keys()):
att = write_ds[var_name].attrs['string']
write_ds[var_name].attrs[var_name + '_string'] = att
del write_ds[var_name].attrs['string']
for var_name in list(ds.keys()):
if 'string' in list(ds[var_name].attrs.keys()):
att = ds[var_name].attrs['string']
ds[var_name].attrs[var_name + '_string'] = att
del ds[var_name].attrs['string']

# If requested update global attributes and variables attributes for required
# CF attributes.
if cf_compliant:
# Get variable names and standard name for each variable
var_names = list(write_ds.keys())
var_names = list(ds.keys())
standard_names = []
for var_name in var_names:
try:
standard_names.append(write_ds[var_name].attrs['standard_name'])
standard_names.append(ds[var_name].attrs['standard_name'])
except KeyError:
standard_names.append(None)

# Check if time varible has axis and standard_name attribute
coord_name = 'time'
try:
write_ds[coord_name].attrs['axis']
ds[coord_name].attrs['axis']
except KeyError:
try:
write_ds[coord_name].attrs['axis'] = 'T'
ds[coord_name].attrs['axis'] = 'T'
except KeyError:
pass

try:
write_ds[coord_name].attrs['standard_name']
ds[coord_name].attrs['standard_name']
except KeyError:
try:
write_ds[coord_name].attrs['standard_name'] = 'time'
ds[coord_name].attrs['standard_name'] = 'time'
except KeyError:
pass

# Try to determine type of dataset by coordinate dimention named time
# and other factors
try:
write_ds.attrs['FeatureType']
ds.attrs['FeatureType']
except KeyError:
dim_names = list(write_ds.dims)
dim_names = list(ds.dims)
FeatureType = None
if dim_names == ['time']:
FeatureType = 'timeSeries'
elif len(dim_names) == 2 and 'time' in dim_names and 'bound' in dim_names:
FeatureType = 'timeSeries'
elif len(dim_names) >= 2 and 'time' in dim_names:
for var_name in var_names:
dims = list(write_ds[var_name].dims)
dims = list(ds[var_name].dims)
if len(dims) == 2 and 'time' in dims:
prof_dim = list(set(dims) - {'time'})[0]
if write_ds[prof_dim].values.size > 2:
if ds[prof_dim].values.size > 2:
FeatureType = 'timeSeriesProfile'
break

if FeatureType is not None:
write_ds.attrs['FeatureType'] = FeatureType
ds.attrs['FeatureType'] = FeatureType

# Add axis and positive attributes to variables with standard_name
# equal to 'altitude'
Expand All @@ -723,18 +733,18 @@ def write_netcdf(
]
for var_name in alt_variables:
try:
write_ds[var_name].attrs['axis']
ds[var_name].attrs['axis']
except KeyError:
write_ds[var_name].attrs['axis'] = 'Z'
ds[var_name].attrs['axis'] = 'Z'

try:
write_ds[var_name].attrs['positive']
ds[var_name].attrs['positive']
except KeyError:
write_ds[var_name].attrs['positive'] = 'up'
ds[var_name].attrs['positive'] = 'up'

# Check if the Conventions global attribute lists the CF convention
try:
Conventions = write_ds.attrs['Conventions']
Conventions = ds.attrs['Conventions']
Conventions = Conventions.split()
cf_listed = False
for ii in Conventions:
Expand All @@ -743,37 +753,30 @@ def write_netcdf(
break
if not cf_listed:
Conventions.append(cf_convention)
write_ds.attrs['Conventions'] = ' '.join(Conventions)
ds.attrs['Conventions'] = ' '.join(Conventions)

except KeyError:
write_ds.attrs['Conventions'] = str(cf_convention)
ds.attrs['Conventions'] = str(cf_convention)

# Reorder global attributes to ensure history is last
try:
history = copy.copy(write_ds.attrs['history'])
del write_ds.attrs['history']
write_ds.attrs['history'] = history
history = copy.copy(ds.attrs['history'])
del ds.attrs['history']
ds.attrs['history'] = history
except KeyError:
pass
current_time = dt.datetime.now().replace(microsecond=0)
if 'history' in list(write_ds.attrs.keys()):
write_ds.attrs['history'] += ''.join(
[
'\n',
str(current_time),
' created by ACT ',
str(act.__version__),
' act.io.write.write_netcdf',
]
)

if 'time_bounds' in encoding.keys():
encoding['time_bounds']['dtype'] = 'float64'

if hasattr(write_ds, 'time_bounds') and not write_ds.time.encoding:
write_ds.time.encoding.update(write_ds.time_bounds.encoding)
current_time = dt.datetime.utcnow().replace(microsecond=0)
history_value = (
f'Written to file by ACT-{act.__version__} '
f'with write_netcdf() at {current_time} UTC'
)
if 'history' in list(ds.attrs.keys()):
ds.attrs['history'] += f" ; {history_value}"
else:
ds.attrs['history'] = history_value

write_ds.to_netcdf(encoding=encoding, **kwargs)
ds.to_netcdf(encoding=encoding, **kwargs)


def check_if_tar_gz_file(filenames):
Expand Down
48 changes: 45 additions & 3 deletions act/qc/arm.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import numpy as np
import requests
import json
from dateutil import parser

from act.config import DEFAULT_DATASTREAM_NAME

Expand All @@ -22,6 +23,7 @@ def add_dqr_to_qc(
cleanup_qc=True,
dqr_link=False,
skip_location_vars=False,
create_missing_qc_variables=True,
):
"""
Function to query the ARM DQR web service for reports and
Expand Down Expand Up @@ -68,6 +70,9 @@ def add_dqr_to_qc(
skip_location_vars : boolean
Does not apply DQRs to location variables. This can be useful in the event
the submitter has erroneously selected all variables.
create_missing_qc_variables : boolean
If a quality control variable for the data variable does not exist,
create the quality control varible and apply DQR.
Returns
-------
Expand Down Expand Up @@ -102,8 +107,35 @@ def add_dqr_to_qc(
if cleanup_qc:
ds.clean.cleanup()

start_date = ds['time'].values[0].astype('datetime64[s]').astype(dt.datetime).strftime('%Y%m%d')
end_date = ds['time'].values[-1].astype('datetime64[s]').astype(dt.datetime).strftime('%Y%m%d')
# Get time from Dataset
time = ds['time'].values

# If the time is not a datetime64 because the read routine was not asked to
# convert CF variables, convert the time variable for this routine only.
if not np.issubdtype(time.dtype, np.datetime64):
units_strings = [
'seconds since ',
'minutes since ',
'hours since ',
'days since ',
'milliseconds since ',
'months since ',
'years since ',
]
td64_strings = ['s', 'm', 'h', 'D', 'ms', 'M', 'Y']
units = ds['time'].attrs['units']
for ii, _ in enumerate(units_strings):
if units.startswith(units_strings[ii]):
units = units.replace(units_strings[ii], '')
td64_string = td64_strings[ii]
break

start_time = parser.parse(units)
start_time = np.datetime64(start_time, td64_string)
time = start_time + ds['time'].values.astype('timedelta64[s]')

start_date = time[0].astype('datetime64[s]').astype(dt.datetime).strftime('%Y%m%d')
end_date = time[-1].astype('datetime64[s]').astype(dt.datetime).strftime('%Y%m%d')

# Clean up assessment to ensure it is a string with no spaces.
if isinstance(assessment, (list, tuple)):
Expand Down Expand Up @@ -152,7 +184,7 @@ def add_dqr_to_qc(
for time_range in docs[quality_category][dqr_number]['dates']:
starttime = np.datetime64(time_range['start_date'])
endtime = np.datetime64(time_range['end_date'])
ind = np.where((ds['time'].values >= starttime) & (ds['time'].values <= endtime))
ind = np.where((time >= starttime) & (time <= endtime))
if ind[0].size > 0:
index = np.append(index, ind[0])

Expand Down Expand Up @@ -181,6 +213,10 @@ def add_dqr_to_qc(
if skip_location_vars and var_name in loc_vars:
continue

# Do not process time variables
if var_name in ['time', 'time_offset', 'time_bounds']:
continue

# Only process provided variable names
if variable is not None and var_name not in variable:
continue
Expand All @@ -193,6 +229,12 @@ def add_dqr_to_qc(
except KeyError:
pass

if (
create_missing_qc_variables is False
and ds.qcfilter.check_for_ancillary_qc(var_name, add_if_missing=False) is None
):
continue

try:
ds.qcfilter.add_test(
var_name,
Expand Down
2 changes: 1 addition & 1 deletion act/qc/clean.py
Original file line number Diff line number Diff line change
Expand Up @@ -792,7 +792,7 @@ def normalize_assessment(
self,
variables=None,
exclude_variables=None,
qc_lookup={'Incorrect': 'Bad', 'Suspect': 'Indeterminate'},
qc_lookup={'Bad': 'Incorrect', 'Indeterminate': 'Suspect'},
):
"""
Method to clean up assessment terms used to be consistent between
Expand Down
Loading

0 comments on commit 9217936

Please sign in to comment.