From 656871cb977a28e512c38d32b95b932518043a72 Mon Sep 17 00:00:00 2001 From: Tamara Slosarek Date: Wed, 16 Oct 2024 17:18:39 +0200 Subject: [PATCH] feat(#639): add check for guideline consistency --- scripts/README.md | 4 +- scripts/analyze.py | 21 ++++-- .../checks/guideline_consistency.py | 69 +++++++++++++++++++ .../checks/normal_side_effect_risk.py | 4 +- scripts/analyze_functions/data_helpers.py | 4 +- 5 files changed, 92 insertions(+), 10 deletions(-) create mode 100644 scripts/analyze_functions/checks/guideline_consistency.py diff --git a/scripts/README.md b/scripts/README.md index 3af162fb..f3df3d36 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -89,7 +89,9 @@ Run `python analyze.py [--correct]` to analyze annotations and optionally correct what can be corrected easily in `_corrected_.base64.json`. -Also checks which bricks are not used in guidelines. +Also checks whether guidelines with the same implications / recommendations were +annotated consistently (although this check will not catch similar formulations) +and which bricks are not used in guidelines. 
### Drug annotation checks diff --git a/scripts/analyze.py b/scripts/analyze.py index 413c1448..11ded513 100644 --- a/scripts/analyze.py +++ b/scripts/analyze.py @@ -2,6 +2,7 @@ from analyze_functions.checks.fully_annotated_staged import check_if_fully_annotated_staged from analyze_functions.checks.brand_name import check_brand_name_comma, check_brand_name_whitespace +from analyze_functions.checks.guideline_consistency import check_guideline_consistencies from analyze_functions.checks.metabolization_before_consequence import check_metabolization_before_consequence from analyze_functions.checks.fallback_guidelines import check_single_any_fallback_guideline, check_single_lookup_fallback_guideline from analyze_functions.checks.metabolization_type import check_same_metabolization_type @@ -112,6 +113,7 @@ def run_analyses(): failed_guideline_annotation_count = 0 log_content = [] used_bricks = [] + guideline_check_args_list = [] for drug in data[DRUG_COLLECTION_NAME]: drug_name = drug['name'] log_content.append(f'* {drug_name}') @@ -153,13 +155,15 @@ def run_analyses(): missing_guideline_annotation_count += 1 log_not_annotated(log_content) continue + check_args = { + 'item': guideline, + 'annotations': guideline_annotations, + 'drug_name': drug_name, + } + guideline_check_args_list = [*guideline_check_args_list, check_args] guideline_result = analyze_annotations( GUIDELINE_CHECKS, - { - 'item': guideline, - 'annotations': guideline_annotations, - 'drug_name': drug_name, - }, + check_args, ) if guideline_result == None: continue if not all(guideline_result.values()): @@ -171,12 +175,19 @@ def run_analyses(): failed_guideline_annotation_count += failed else: log_all_passed(log_content) + + inconsistent_guidelines_count, guideline_inconsistency_log = \ + check_guideline_consistencies(guideline_check_args_list) + log_header = [ '# Analyze annotation data\n\n', f'Correct if possible: {correct_inconsistencies}\n\n', '**Failed annotation checks** (search for `_some checks 
failed_`):\n\n', f'* Drugs: {failed_drug_annotation_count}\n', f'* Guidelines: {failed_guideline_annotation_count}\n\n', + f'**Inconsistent guidelines**: {inconsistent_guidelines_count}\n\n', + *guideline_inconsistency_log, + '\n', 'Missing annotations (search for `_not annotated_`):\n\n', f'* Drugs: {missing_drug_annotation_count}\n', f'* Guidelines: {missing_guideline_annotation_count}\n\n', diff --git a/scripts/analyze_functions/checks/guideline_consistency.py b/scripts/analyze_functions/checks/guideline_consistency.py new file mode 100644 index 00000000..d4682b16 --- /dev/null +++ b/scripts/analyze_functions/checks/guideline_consistency.py @@ -0,0 +1,69 @@ +from analyze_functions.data_helpers import get_guideline_content, joint_implication_text +from common.get_data import get_phenotype + +def _group_check_args_by_guideline(guideline_check_args_list): + check_args_per_external_guideline = {} + for check_args in guideline_check_args_list: + guideline = check_args['item'] + guideline_url = get_guideline_content(guideline, 'guidelineUrl') + guideline_key = guideline_url \ + .replace('https://cpicpgx.org/guidelines/', '') \ + .replace('https://www.fda.gov/medical-devices/precision-medicine/', 'fda-') \ + .replace('/', '') + if guideline_key in check_args_per_external_guideline: + check_args_per_external_guideline[guideline_key] = [ + *check_args_per_external_guideline[guideline_key], + check_args, + ] + else: + check_args_per_external_guideline[guideline_key] = [check_args] + return check_args_per_external_guideline + +def _group_annotations_by_guideline_content(check_args_list): + same_guideline_annotations = {} + for check_args in check_args_list: + drug_name = check_args['drug_name'] + guideline = check_args['item'] + annotations = check_args['annotations'] + grouped_content = { + f'Implication "{joint_implication_text(guideline)}"': annotations['implication'], + f'Recommendation "{get_guideline_content(guideline, "recommendation")}"': 
annotations['recommendation'], + } + content_identifier = f'{drug_name} {get_phenotype(guideline)}' + for key, content in grouped_content.items(): + normalized_key = key.replace(drug_name, '#drug-name').replace('phenytoin', '#drug-name') + normalized_content = content.replace(' still', '') + # TODO: add drug name to structure and log + if normalized_key in same_guideline_annotations: + if normalized_content in same_guideline_annotations[normalized_key]: + same_guideline_annotations[normalized_key][normalized_content].append(content_identifier) + else: + same_guideline_annotations[normalized_key][normalized_content] = [content_identifier] + else: + same_guideline_annotations[normalized_key] = {normalized_content: [content_identifier]} + return same_guideline_annotations + +def check_guideline_consistencies(guideline_check_args_list): + check_args_per_external_guideline = \ + _group_check_args_by_guideline(guideline_check_args_list) + inconsistent_guidelines_count = 0 + log_content = [] + for guideline_key, check_args_list in check_args_per_external_guideline.items(): + if (len(check_args_list) < 2): continue + same_guideline_annotations = _group_annotations_by_guideline_content( + check_args_list, + ) + inconsistency_log_content = [] + for same_guideline_key, guideline_content in same_guideline_annotations.items(): + unique_guideline_content = set(guideline_content.keys()) + if len(unique_guideline_content) != 1: + inconsistency_log_content += f' * {same_guideline_key} maps to:\n' + for content in unique_guideline_content: + content_identifier = guideline_content[content] + inconsistency_log_content += f' * {content} ({"; ".join(content_identifier)})\n' + if (len(inconsistency_log_content) > 0): + inconsistent_guidelines_count += 1 + log_content += f'* {guideline_key}\n' + for inconsistency in inconsistency_log_content: + log_content += inconsistency + return inconsistent_guidelines_count, log_content \ No newline at end of file diff --git 
a/scripts/analyze_functions/checks/normal_side_effect_risk.py b/scripts/analyze_functions/checks/normal_side_effect_risk.py index 4d3964f1..815eb81d 100644 --- a/scripts/analyze_functions/checks/normal_side_effect_risk.py +++ b/scripts/analyze_functions/checks/normal_side_effect_risk.py @@ -1,12 +1,12 @@ from analyze_functions.constants import IGNORED_PHENOTYPES, NORMAL_RISK_TEXTS -from analyze_functions.data_helpers import joint_annotation_text +from analyze_functions.data_helpers import joint_implication_text def check_normal_side_effect_risk(args): guideline = args['item'] annotations = args['annotations'] can_have_normal_risk = any(map( lambda normal_risk_text: normal_risk_text in \ - joint_annotation_text(guideline), + joint_implication_text(guideline), NORMAL_RISK_TEXTS, )) or all(map( lambda gene: diff --git a/scripts/analyze_functions/data_helpers.py b/scripts/analyze_functions/data_helpers.py index 3b104a6e..ac358fbd 100644 --- a/scripts/analyze_functions/data_helpers.py +++ b/scripts/analyze_functions/data_helpers.py @@ -3,8 +3,8 @@ def get_guideline_content(guideline, key): return guideline['externalData'][0][key] -def joint_annotation_text(guideline): - return '–'.join(get_guideline_content(guideline, 'implications').values()).lower() +def joint_implication_text(guideline): + return '–'.join(sorted(set(get_guideline_content(guideline, 'implications').values()))).lower() def ensure_unique_item(item_filter, field_name, value): item = list(item_filter)