From 656871cb977a28e512c38d32b95b932518043a72 Mon Sep 17 00:00:00 2001
From: Tamara Slosarek <tamara.slosarek@hpi.de>
Date: Wed, 16 Oct 2024 17:18:39 +0200
Subject: [PATCH] feat(#639): add check for guideline consistency

---
 scripts/README.md                             |  4 +-
 scripts/analyze.py                            | 21 ++++--
 .../checks/guideline_consistency.py           | 69 +++++++++++++++++++
 .../checks/normal_side_effect_risk.py         |  4 +-
 scripts/analyze_functions/data_helpers.py     |  4 +-
 5 files changed, 92 insertions(+), 10 deletions(-)
 create mode 100644 scripts/analyze_functions/checks/guideline_consistency.py
diff --git a/scripts/README.md b/scripts/README.md
index 3af162fb..f3df3d36 100644
--- a/scripts/README.md
+++ b/scripts/README.md
@@ -89,7 +89,9 @@ Run `python analyze.py <PATH_TO_BACKUP> [--correct]` to analyze annotations
 and optionally correct what can be corrected easily in
 `<PATH_TO_BACKUP>_corrected_<TIMESTAMP>.base64.json`.
 
-Also checks which bricks are not used in guidelines.
+Also checks whether guidelines with the same implications / recommendations
+were annotated consistently (texts are matched literally, so similar but not
+identical formulations are not caught) and which bricks are not used in guidelines.
 
 ### Drug annotation checks
 
diff --git a/scripts/analyze.py b/scripts/analyze.py
index 413c1448..11ded513 100644
--- a/scripts/analyze.py
+++ b/scripts/analyze.py
@@ -2,6 +2,7 @@
 
 from analyze_functions.checks.fully_annotated_staged import check_if_fully_annotated_staged
 from analyze_functions.checks.brand_name import check_brand_name_comma, check_brand_name_whitespace
+from analyze_functions.checks.guideline_consistency import check_guideline_consistencies
 from analyze_functions.checks.metabolization_before_consequence import check_metabolization_before_consequence
 from analyze_functions.checks.fallback_guidelines import check_single_any_fallback_guideline, check_single_lookup_fallback_guideline
 from analyze_functions.checks.metabolization_type import check_same_metabolization_type
@@ -112,6 +113,7 @@ def run_analyses():
     failed_guideline_annotation_count = 0
     log_content = []
     used_bricks = []
+    guideline_check_args_list = []
     for drug in data[DRUG_COLLECTION_NAME]:
         drug_name = drug['name']
         log_content.append(f'* {drug_name}')
@@ -153,13 +155,15 @@ def run_analyses():
                 missing_guideline_annotation_count += 1
                 log_not_annotated(log_content)
                 continue
+            check_args = {
+                'item': guideline,
+                'annotations': guideline_annotations,
+                'drug_name': drug_name,
+            }
+            guideline_check_args_list = [*guideline_check_args_list, check_args]
             guideline_result = analyze_annotations(
                 GUIDELINE_CHECKS,
-                {
-                    'item': guideline,
-                    'annotations': guideline_annotations,
-                    'drug_name': drug_name,
-                },
+                check_args,
             )
             if guideline_result == None: continue
             if not all(guideline_result.values()):
@@ -171,12 +175,19 @@ def run_analyses():
                 failed_guideline_annotation_count += failed
             else:
                 log_all_passed(log_content)
+    
+    inconsistent_guidelines_count, guideline_inconsistency_log = \
+        check_guideline_consistencies(guideline_check_args_list)
+    
     log_header = [
         '# Analyze annotation data\n\n',
         f'Correct if possible: {correct_inconsistencies}\n\n',
         '**Failed annotation checks** (search for `_some checks failed_`):\n\n',
         f'* Drugs: {failed_drug_annotation_count}\n',
         f'* Guidelines: {failed_guideline_annotation_count}\n\n',
+        f'**Inconsistent guidelines**: {inconsistent_guidelines_count}\n\n',
+        *guideline_inconsistency_log,
+        '\n',
         'Missing annotations (search for `_not annotated_`):\n\n',
         f'* Drugs: {missing_drug_annotation_count}\n',
         f'* Guidelines: {missing_guideline_annotation_count}\n\n',
diff --git a/scripts/analyze_functions/checks/guideline_consistency.py b/scripts/analyze_functions/checks/guideline_consistency.py
new file mode 100644
index 00000000..d4682b16
--- /dev/null
+++ b/scripts/analyze_functions/checks/guideline_consistency.py
@@ -0,0 +1,69 @@
+from analyze_functions.data_helpers import get_guideline_content, joint_implication_text
+from common.get_data import get_phenotype
+
+def _group_check_args_by_guideline(guideline_check_args_list):
+    """Group check args by a short key derived from each guideline's URL.
+
+    Returns a dict mapping that key to the list of check args sharing it,
+    in input order.
+    """
+    grouped = {}
+    for check_args in guideline_check_args_list:
+        url = get_guideline_content(check_args['item'], 'guidelineUrl')
+        # Shorten the URL: drop the CPIC prefix, turn the FDA prefix into
+        # 'fda-', then remove every remaining slash.
+        key = url \
+            .replace('https://cpicpgx.org/guidelines/', '') \
+            .replace('https://www.fda.gov/medical-devices/precision-medicine/', 'fda-') \
+            .replace('/', '')
+        grouped.setdefault(key, []).append(check_args)
+    return grouped
+
+def _group_annotations_by_guideline_content(check_args_list):
+    """Group annotations by normalized guideline text.
+
+    Returns {guideline text: {annotation text: ['<drug> <phenotype>', ...]}}.
+    """
+    annotations_by_text = {}
+    for check_args in check_args_list:
+        drug_name = check_args['drug_name']
+        guideline = check_args['item']
+        annotations = check_args['annotations']
+        labelled_annotations = {
+            f'Implication "{joint_implication_text(guideline)}"': annotations['implication'],
+            f'Recommendation "{get_guideline_content(guideline, "recommendation")}"': annotations['recommendation'],
+        }
+        identifier = f'{drug_name} {get_phenotype(guideline)}'
+        for label, annotation in labelled_annotations.items():
+            # Mask the drug name (and, always, 'phenytoin') so texts differing only by drug match.
+            normalized_label = label.replace(drug_name, '#drug-name').replace('phenytoin', '#drug-name')
+            normalized_annotation = annotation.replace(' still', '')
+            # TODO: add drug name to structure and log
+            by_annotation = annotations_by_text.setdefault(normalized_label, {})
+            by_annotation.setdefault(normalized_annotation, []).append(identifier)
+    return annotations_by_text
+
+def check_guideline_consistencies(guideline_check_args_list):
+    """Find external guidelines whose identical texts were annotated differently.
+
+    Returns a tuple of the number of inconsistent external guidelines and a
+    list of log lines (one string per line) describing the inconsistencies.
+    """
+    grouped = _group_check_args_by_guideline(guideline_check_args_list)
+    inconsistent_guidelines_count = 0
+    log_content = []
+    for guideline_key, check_args_list in grouped.items():
+        if len(check_args_list) < 2: continue  # nothing to compare against
+        same_guideline_annotations = _group_annotations_by_guideline_content(check_args_list)
+        inconsistency_log_content = []
+        for same_guideline_key, guideline_content in same_guideline_annotations.items():
+            if len(guideline_content) == 1: continue  # all annotations agree
+            # append, not `+=`: `list += str` would add one item per character.
+            inconsistency_log_content.append(f'  * {same_guideline_key} maps to:\n')
+            # Iterate the dict itself (not a set) to keep the log order stable.
+            for content, content_identifiers in guideline_content.items():
+                inconsistency_log_content.append(f'    * {content} ({"; ".join(content_identifiers)})\n')
+        if inconsistency_log_content:
+            inconsistent_guidelines_count += 1
+            log_content += [f'* {guideline_key}\n', *inconsistency_log_content]
+    return inconsistent_guidelines_count, log_content
\ No newline at end of file
diff --git a/scripts/analyze_functions/checks/normal_side_effect_risk.py b/scripts/analyze_functions/checks/normal_side_effect_risk.py
index 4d3964f1..815eb81d 100644
--- a/scripts/analyze_functions/checks/normal_side_effect_risk.py
+++ b/scripts/analyze_functions/checks/normal_side_effect_risk.py
@@ -1,12 +1,12 @@
 from analyze_functions.constants import IGNORED_PHENOTYPES, NORMAL_RISK_TEXTS
-from analyze_functions.data_helpers import joint_annotation_text
+from analyze_functions.data_helpers import joint_implication_text
 
 def check_normal_side_effect_risk(args):
     guideline = args['item']
     annotations = args['annotations']
     can_have_normal_risk = any(map(
         lambda normal_risk_text: normal_risk_text in \
-            joint_annotation_text(guideline),
+            joint_implication_text(guideline),
         NORMAL_RISK_TEXTS,
     )) or all(map(
         lambda gene:
diff --git a/scripts/analyze_functions/data_helpers.py b/scripts/analyze_functions/data_helpers.py
index 3b104a6e..ac358fbd 100644
--- a/scripts/analyze_functions/data_helpers.py
+++ b/scripts/analyze_functions/data_helpers.py
@@ -3,8 +3,8 @@
 def get_guideline_content(guideline, key):
     return guideline['externalData'][0][key]
 
-def joint_annotation_text(guideline):
-    return '–'.join(get_guideline_content(guideline, 'implications').values()).lower()
+def joint_implication_text(guideline):  # distinct implication texts, sorted, joined with '–', then lowercased
+    return '–'.join(sorted(set(get_guideline_content(guideline, 'implications').values()))).lower()
 
 def ensure_unique_item(item_filter, field_name, value):
     item = list(item_filter)