Skip to content

Commit

Permalink
Merge pull request #394 from kenkehoe/qc_processing_time
Browse files Browse the repository at this point in the history
QC processing time
  • Loading branch information
AdamTheisen authored Feb 20, 2022
2 parents eb209ed + 87b2244 commit 3e7f957
Show file tree
Hide file tree
Showing 3 changed files with 65 additions and 29 deletions.
57 changes: 31 additions & 26 deletions act/qc/clean.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,38 +40,43 @@ def matched_qc_variables(self, check_arm_syntax=True):
A list of strings containing the name of each variable.
"""
variables = []

# Will need to find all historical cases and add to list
qc_dict = {'description':
["See global attributes for individual.+bit descriptions.",
("This field contains bit packed integer values, where each "
"bit represents a QC test on the data. Non-zero bits indicate "
"the QC condition given in the description for those bits; "
"a value of 0.+ indicates the data has not "
"failed any QC tests."),
(r"This field contains bit packed values which should be "
r"interpreted as listed..+")
]
}
description_list = [
"See global attributes for individual.+bit descriptions.",
("This field contains bit packed integer values, where each "
"bit represents a QC test on the data. Non-zero bits indicate "
"the QC condition given in the description for those bits; "
"a value of 0.+ indicates the data has not "
"failed any QC tests."),
(r"This field contains bit packed values which should be "
r"interpreted as listed..+")
]

# Loop over each variable and look for a match to an attribute that
# would exist if the variable is a QC variable
# would exist if the variable is a QC variable.
variables = []
for var in self._obj.data_vars:
attributes = self._obj[var].attrs
for att_name in attributes:
if att_name in qc_dict.keys():
for value in qc_dict[att_name]:
if re.match(value, attributes[att_name]) is not None:
variables.append(var)
break
try:
if self._obj[var].attrs['standard_name'] == 'quality_flag':
variables.append(var)
continue
except KeyError:
pass

if check_arm_syntax and var.startswith('qc_'):
variables.append(var)
continue

try:
for desc in description_list:
if re.match(desc, self._obj[var].attrs['description']) is not None:
variables.append(var)
break
except KeyError:
pass

# Check the start of the variable name. If it begins with qc_ assume quality
# control variable from ARM.
if check_arm_syntax:
variables_qc = [var for var in self._obj.data_vars if var.startswith('qc_')]
variables = variables + variables_qc
variables = list(set(variables))
variables = list(set(variables))

return variables

Expand Down
5 changes: 2 additions & 3 deletions act/qc/qcfilter.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def __init__(self, xarray_obj):
self._obj = xarray_obj

def check_for_ancillary_qc(self, var_name, add_if_missing=True,
cleanup=True, flag_type=False):
cleanup=False, flag_type=False):
"""
Method to check if a quality control variable exists in the dataset
and return the quality control variable name.
Expand Down Expand Up @@ -92,8 +92,7 @@ def check_for_ancillary_qc(self, var_name, add_if_missing=True,
self._obj.qcfilter.update_ancillary_variable(var_name, qc_var_name)

# Clean up quality control variables to the required standard in the
# xarray object. If the quality control variables are already cleaned
# the extra work is small since it's just checking.
# xarray object.
if cleanup:
self._obj.clean.cleanup(handle_missing_value=True,
link_qc_variables=False)
Expand Down
32 changes: 32 additions & 0 deletions act/tests/test_qc.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,10 @@
import numpy as np
import pytest
import copy
import pandas as pd
import dask.array as da
import xarray as xr
from datetime import datetime


def test_fft_shading_test():
Expand Down Expand Up @@ -787,3 +790,32 @@ def test_qc_data_type():
assert ds_object[expected_qc_var_name].attrs['flag_masks'][0].dtype == np.uint64

ds_object.qcfilter.add_test(var_name, index=[1], test_meaning='Fourth test', recycle=True)


def test_qc_speed():
    """
    Check that adding QC tests to many variables stays fast.

    Builds a Dataset of ``n_variables`` random-noise variables that share one
    time coordinate, adds a single QC test to every variable through
    ``ds.qcfilter.add_test`` and asserts that the whole loop finishes within
    ``max_seconds``. This guards against changes that significantly slow
    down the QC module's processing.
    """
    # Local import: a monotonic, high-resolution timer is the right tool for
    # measuring elapsed time. Wall-clock datetimes can jump (NTP, DST) and
    # ``timedelta.seconds`` silently drops the microseconds and days parts.
    from time import perf_counter

    n_variables = 100
    n_samples = 100
    max_seconds = 3

    time_index = pd.date_range(
        start="2022-02-17 00:00:00", end="2022-02-18 00:00:00", periods=n_samples
    )

    # Create data variables with random noise. The seed keeps the generated
    # data (and therefore the set of failing samples) reproducible.
    np.random.seed(42)
    noisy_data_mapping = {
        f"data_var_{i}": np.random.random(time_index.shape) for i in range(n_variables)
    }

    ds = xr.Dataset(
        data_vars={name: ("time", data) for name, data in noisy_data_mapping.items()},
        coords={"time": time_index},
    )

    start = perf_counter()
    for name, var in noisy_data_mapping.items():
        # Consider data above 0.75 as bad. Negligible time here.
        failed_qc = var > 0.75
        ds.qcfilter.add_test(name, index=failed_qc, test_meaning="Value above threshold")
    elapsed = perf_counter() - start

    assert elapsed <= max_seconds, (
        f"Adding QC tests to {n_variables} variables took {elapsed:.2f} s "
        f"(limit {max_seconds} s)"
    )

0 comments on commit 3e7f957

Please sign in to comment.