Skip to content

Commit

Permalink
feat(#639): add check for guideline consistency
Browse files Browse the repository at this point in the history
  • Loading branch information
tamslo committed Oct 16, 2024
1 parent 2855cd8 commit 656871c
Show file tree
Hide file tree
Showing 5 changed files with 92 additions and 10 deletions.
4 changes: 3 additions & 1 deletion scripts/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,9 @@ Run `python analyze.py <PATH_TO_BACKUP> [--correct]` to analyze annotations
and optionally correct what can be corrected easily in
`<PATH_TO_BACKUP>_corrected_<TIMESTAMP>.base64.json`.

Also checks which bricks are not used in guidelines.
Also checks whether guidelines with the same implications / recommendations were
annotated consistently (although this check will not catch similar formulations)
and which bricks are not used in guidelines.

### Drug annotation checks

Expand Down
21 changes: 16 additions & 5 deletions scripts/analyze.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from analyze_functions.checks.fully_annotated_staged import check_if_fully_annotated_staged
from analyze_functions.checks.brand_name import check_brand_name_comma, check_brand_name_whitespace
from analyze_functions.checks.guideline_consistency import check_guideline_consistencies
from analyze_functions.checks.metabolization_before_consequence import check_metabolization_before_consequence
from analyze_functions.checks.fallback_guidelines import check_single_any_fallback_guideline, check_single_lookup_fallback_guideline
from analyze_functions.checks.metabolization_type import check_same_metabolization_type
Expand Down Expand Up @@ -112,6 +113,7 @@ def run_analyses():
failed_guideline_annotation_count = 0
log_content = []
used_bricks = []
guideline_check_args_list = []
for drug in data[DRUG_COLLECTION_NAME]:
drug_name = drug['name']
log_content.append(f'* {drug_name}')
Expand Down Expand Up @@ -153,13 +155,15 @@ def run_analyses():
missing_guideline_annotation_count += 1
log_not_annotated(log_content)
continue
check_args = {
'item': guideline,
'annotations': guideline_annotations,
'drug_name': drug_name,
}
guideline_check_args_list = [*guideline_check_args_list, check_args]
guideline_result = analyze_annotations(
GUIDELINE_CHECKS,
{
'item': guideline,
'annotations': guideline_annotations,
'drug_name': drug_name,
},
check_args,
)
if guideline_result == None: continue
if not all(guideline_result.values()):
Expand All @@ -171,12 +175,19 @@ def run_analyses():
failed_guideline_annotation_count += failed
else:
log_all_passed(log_content)

inconsistent_guidelines_count, guideline_inconsistency_log = \
check_guideline_consistencies(guideline_check_args_list)

log_header = [
'# Analyze annotation data\n\n',
f'Correct if possible: {correct_inconsistencies}\n\n',
'**Failed annotation checks** (search for `_some checks failed_`):\n\n',
f'* Drugs: {failed_drug_annotation_count}\n',
f'* Guidelines: {failed_guideline_annotation_count}\n\n',
f'**Inconsistent guidelines**: {inconsistent_guidelines_count}\n\n',
*guideline_inconsistency_log,
'\n',
'Missing annotations (search for `_not annotated_`):\n\n',
f'* Drugs: {missing_drug_annotation_count}\n',
f'* Guidelines: {missing_guideline_annotation_count}\n\n',
Expand Down
69 changes: 69 additions & 0 deletions scripts/analyze_functions/checks/guideline_consistency.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
from analyze_functions.data_helpers import get_guideline_content, joint_implication_text
from common.get_data import get_phenotype

def _group_check_args_by_guideline(guideline_check_args_list):
    """Group guideline check arguments by the external guideline they refer to.

    The grouping key is derived from the guideline URL (read via
    `get_guideline_content(guideline, 'guidelineUrl')`): the CPIC guideline
    URL prefix is removed, the FDA precision medicine URL prefix is replaced
    by 'fda-', and all remaining slashes are dropped.

    Args:
        guideline_check_args_list: Iterable of check argument dicts; each
            must provide the guideline under the 'item' key.

    Returns:
        A dict mapping each guideline key to the list of check argument
        dicts that share it, in input order.
    """
    check_args_per_external_guideline = {}
    for check_args in guideline_check_args_list:
        guideline_url = get_guideline_content(check_args['item'], 'guidelineUrl')
        guideline_key = guideline_url \
            .replace('https://cpicpgx.org/guidelines/', '') \
            .replace('https://www.fda.gov/medical-devices/precision-medicine/', 'fda-') \
            .replace('/', '')
        # Append in place instead of rebuilding the whole list for every
        # entry, which copied all previously collected items each time
        check_args_per_external_guideline \
            .setdefault(guideline_key, []) \
            .append(check_args)
    return check_args_per_external_guideline

def _group_annotations_by_guideline_content(check_args_list):
    """Collect which annotations were given for each guideline text.

    For every entry, the implication text and the recommendation text of the
    guideline are used as (labelled) keys; the drug name in a key is replaced
    by the placeholder '#drug-name' so that equal texts for different drugs
    end up under the same key. 'phenytoin' is replaced as well, regardless of
    the drug (NOTE(review): presumably for drugs whose guideline text refers
    to phenytoin -- confirm). The annotation texts are compared with
    ' still' removed.

    Args:
        check_args_list: Iterable of check argument dicts providing
            'drug_name', 'item' (the guideline), and 'annotations' (with
            'implication' and 'recommendation' entries).

    Returns:
        A nested dict: normalized guideline text -> normalized annotation
        text -> list of '<drug name> <phenotype>' identifiers that use this
        annotation.
    """
    grouped = {}
    for args in check_args_list:
        drug = args['drug_name']
        guideline = args['item']
        annotations = args['annotations']
        labelled_annotations = (
            (
                f'Implication "{joint_implication_text(guideline)}"',
                annotations['implication'],
            ),
            (
                f'Recommendation "{get_guideline_content(guideline, "recommendation")}"',
                annotations['recommendation'],
            ),
        )
        identifier = f'{drug} {get_phenotype(guideline)}'
        for label, annotation in labelled_annotations:
            generic_label = label \
                .replace(drug, '#drug-name') \
                .replace('phenytoin', '#drug-name')
            generic_annotation = annotation.replace(' still', '')
            # TODO: add drug name to structure and log
            grouped \
                .setdefault(generic_label, {}) \
                .setdefault(generic_annotation, []) \
                .append(identifier)
    return grouped

def check_guideline_consistencies(guideline_check_args_list):
    """Check whether equal guideline texts were annotated consistently.

    Check arguments are grouped per external guideline; within each group,
    every (normalized) implication / recommendation text should map to
    exactly one (normalized) annotation text. Texts mapping to several
    different annotations are reported.

    Args:
        guideline_check_args_list: List of check argument dicts as collected
            for the guideline checks ('item', 'annotations', 'drug_name').

    Returns:
        A tuple of the number of external guidelines with at least one
        inconsistency and a list of log lines (bulleted text, each line
        ending with a newline) describing the inconsistencies.
    """
    check_args_per_external_guideline = \
        _group_check_args_by_guideline(guideline_check_args_list)
    inconsistent_guidelines_count = 0
    log_content = []
    for guideline_key, check_args_list in check_args_per_external_guideline.items():
        # A single entry has nothing to be compared with
        if len(check_args_list) < 2:
            continue
        same_guideline_annotations = _group_annotations_by_guideline_content(
            check_args_list,
        )
        inconsistency_log_content = []
        for same_guideline_key, guideline_content in same_guideline_annotations.items():
            # Exactly one annotation text per guideline text is consistent
            if len(guideline_content) == 1:
                continue
            # Append whole lines; `list += str` would extend the list
            # character by character
            inconsistency_log_content.append(f' * {same_guideline_key} maps to:\n')
            # Iterate the dict directly (keys are already unique) to keep the
            # log order stable between runs, which a set does not guarantee
            for content, content_identifiers in guideline_content.items():
                inconsistency_log_content.append(
                    f' * {content} ({"; ".join(content_identifiers)})\n',
                )
        if inconsistency_log_content:
            inconsistent_guidelines_count += 1
            log_content.append(f'* {guideline_key}\n')
            log_content.extend(inconsistency_log_content)
    return inconsistent_guidelines_count, log_content
4 changes: 2 additions & 2 deletions scripts/analyze_functions/checks/normal_side_effect_risk.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
from analyze_functions.constants import IGNORED_PHENOTYPES, NORMAL_RISK_TEXTS
from analyze_functions.data_helpers import joint_annotation_text
from analyze_functions.data_helpers import joint_implication_text

def check_normal_side_effect_risk(args):
guideline = args['item']
annotations = args['annotations']
can_have_normal_risk = any(map(
lambda normal_risk_text: normal_risk_text in \
joint_annotation_text(guideline),
joint_implication_text(guideline),
NORMAL_RISK_TEXTS,
)) or all(map(
lambda gene:
Expand Down
4 changes: 2 additions & 2 deletions scripts/analyze_functions/data_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
def get_guideline_content(guideline, key):
    """Return the value stored under `key` in the guideline's first
    'externalData' entry."""
    first_external_data = guideline['externalData'][0]
    return first_external_data[key]

def joint_annotation_text(guideline):
    """Return all implication texts of the guideline joined with an en dash,
    in lowercase."""
    implication_texts = get_guideline_content(guideline, 'implications').values()
    joined_text = '–'.join(implication_texts)
    return joined_text.lower()
def joint_implication_text(guideline):
    """Return the guideline's distinct implication texts as one lowercase
    string.

    Duplicate texts are dropped, the remaining texts are sorted and joined
    with an en dash, and the result is lowercased.
    """
    implications = get_guideline_content(guideline, 'implications')
    distinct_texts = sorted(set(implications.values()))
    joined_text = '–'.join(distinct_texts)
    return joined_text.lower()

def ensure_unique_item(item_filter, field_name, value):
item = list(item_filter)
Expand Down

0 comments on commit 656871c

Please sign in to comment.