From 2199ece29182cbfb3911c1a8b6d261f9a76bf1f0 Mon Sep 17 00:00:00 2001
From: David Ochoa <ochoa@ebi.ac.uk>
Date: Tue, 24 Sep 2024 17:04:42 +0100
Subject: [PATCH] feat: flag credible sets explained by SuSiE regions (#780)

* feat: flag PICS top hits in studies with credset sumstats

* feat: flag credible sets explained by SuSiE region

* feat: consider unresolved LD cases

* refactor: improve readability of code

* refactor: improve code readability

---------

Co-authored-by: Daniel Suveges <daniel.suveges@protonmail.com>
---
 src/gentropy/dataset/study_locus.py        |  72 ++++++++++
 src/gentropy/study_locus_validation.py     |   1 +
 tests/gentropy/dataset/test_study_locus.py | 160 +++++++++++++++++++++
 3 files changed, 233 insertions(+)

diff --git a/src/gentropy/dataset/study_locus.py b/src/gentropy/dataset/study_locus.py
index c7f9ffc3d..e3706eaf0 100644
--- a/src/gentropy/dataset/study_locus.py
+++ b/src/gentropy/dataset/study_locus.py
@@ -57,6 +57,7 @@ class StudyLocusQualityCheck(Enum):
         TOP_HIT (str): Study locus from curated top hit
         IN_MHC (str): Flagging study loci in the MHC region
         REDUNDANT_PICS_TOP_HIT (str): Flagging study loci in studies with PICS results from summary statistics
+        EXPLAINED_BY_SUSIE (str): Study locus in region explained by a SuSiE credible set
     """
 
     SUBSIGNIFICANT_FLAG = "Subsignificant p-value"
@@ -84,6 +85,7 @@ class StudyLocusQualityCheck(Enum):
         "PICS results from summary statistics available for this same study"
     )
     TOP_HIT = "Study locus from curated top hit"
+    EXPLAINED_BY_SUSIE = "Study locus in region explained by a SuSiE credible set"
 
 
 class CredibleInterval(Enum):
@@ -963,6 +965,76 @@ def qc_redundant_top_hits_from_PICS(self: StudyLocus) -> StudyLocus:
             _schema=StudyLocus.get_schema(),
         )
 
+    def qc_explained_by_SuSiE(self: StudyLocus) -> StudyLocus:
+        """Flag associations that are explained by SuSiE associations.
+
+        Credible sets overlapping in the same region as a SuSiE credible set are flagged as explained by SuSiE.
+
+        Returns:
+            StudyLocus: Updated study locus with SuSiE explained flags.
+        """
+        # unique study-regions covered by SuSie credible sets
+        susie_study_regions = (
+            self.filter(f.col("finemappingMethod") == "SuSiE-inf")
+            .df.select(
+                "studyId",
+                "chromosome",
+                "locusStart",
+                "locusEnd",
+                f.lit(True).alias("inSuSiE"),
+            )
+            .distinct()
+        )
+
+        # non SuSiE credible sets (studyLocusId) overlapping in any variant with SuSiE locus
+        redundant_study_locus = (
+            self.filter(f.col("finemappingMethod") != "SuSiE-inf")
+            .df.withColumn("l", f.explode("locus"))
+            .select(
+                "studyLocusId",
+                "studyId",
+                "chromosome",
+                f.split(f.col("l.variantId"), "_")[1].alias("tag_position"),
+            )
+            .alias("study_locus")
+            .join(
+                susie_study_regions.alias("regions"),
+                how="inner",
+                on=[
+                    (f.col("study_locus.chromosome") == f.col("regions.chromosome"))
+                    & (f.col("study_locus.studyId") == f.col("regions.studyId"))
+                    & (f.col("study_locus.tag_position") >= f.col("regions.locusStart"))
+                    & (f.col("study_locus.tag_position") <= f.col("regions.locusEnd"))
+                ],
+            )
+            .select("studyLocusId", "inSuSiE")
+            .distinct()
+        )
+
+        return StudyLocus(
+            _df=(
+                self.df.join(redundant_study_locus, on="studyLocusId", how="left")
+                .withColumn(
+                    "qualityControls",
+                    self.update_quality_flag(
+                        f.col("qualityControls"),
+                        # credible set in SuSiE overlapping region
+                        f.col("inSuSiE")
+                        # credible set not based on SuSiE
+                        & (f.col("finemappingMethod") != "SuSiE-inf")
+                        # credible set not already flagged as unresolved LD
+                        & ~f.array_contains(
+                            f.col("qualityControls"),
+                            StudyLocusQualityCheck.UNRESOLVED_LD.value,
+                        ),
+                        StudyLocusQualityCheck.EXPLAINED_BY_SUSIE,
+                    ),
+                )
+                .drop("inSuSiE")
+            ),
+            _schema=StudyLocus.get_schema(),
+        )
+
     def _qc_no_population(self: StudyLocus) -> StudyLocus:
         """Flag associations where the study doesn't have population information to resolve LD.
 
diff --git a/src/gentropy/study_locus_validation.py b/src/gentropy/study_locus_validation.py
index 114eb01f7..287cd5645 100644
--- a/src/gentropy/study_locus_validation.py
+++ b/src/gentropy/study_locus_validation.py
@@ -46,6 +46,7 @@ def __init__(
             .validate_study(study_index)  # Flagging studies not in study index
             .annotate_study_type(study_index)  # Add study type to study locus
             .qc_redundant_top_hits_from_PICS()  # Flagging top hits from studies with PICS summary statistics
+            .qc_explained_by_SuSiE()  # Flagging credible sets in regions explained by SuSiE
             # Annotates credible intervals and filter to only keep 99% credible sets
             .filter_credible_set(credible_interval=CredibleInterval.IS99)
         ).persist()  # we will need this for 2 types of outputs
diff --git a/tests/gentropy/dataset/test_study_locus.py b/tests/gentropy/dataset/test_study_locus.py
index 51fc2ed92..29cbffad2 100644
--- a/tests/gentropy/dataset/test_study_locus.py
+++ b/tests/gentropy/dataset/test_study_locus.py
@@ -904,3 +904,163 @@ def test_qc_redundant_top_hits_from_PICS_correctness(
             )
             .count()
         ) == 3
+
+
+class TestStudyLocusSuSiERedundancyFlagging:
+    """Collection of tests related to flagging redundant credible sets."""
+
+    STUDY_LOCUS_DATA: Any = [
+        # to be flagged due to v4
+        (
+            1,
+            "v1",
+            "s1",
+            "X",
+            "pics",
+            1,
+            3,
+            [
+                {"variantId": "X_1_A_A"},
+                {"variantId": "X_2_A_A"},
+                {"variantId": "X_3_A_A"},
+            ],
+            [],
+        ),
+        # to be flagged due to v4
+        (
+            2,
+            "v2",
+            "s1",
+            "X",
+            "pics",
+            4,
+            5,
+            [
+                {"variantId": "X_4_A_A"},
+                {"variantId": "X_5_A_A"},
+            ],
+            [],
+        ),
+        # NOT to be flagged (outside regions)
+        (
+            3,
+            "v3",
+            "s1",
+            "X",
+            "pics",
+            6,
+            7,
+            [
+                {"variantId": "X_6_A_A"},
+                {"variantId": "X_7_A_A"},
+            ],
+            [],
+        ),
+        # NOT to be flagged (SuSie-Inf credible set)
+        (
+            4,
+            "v4",
+            "s1",
+            "X",
+            "SuSiE-inf",
+            3,
+            5,
+            [{"variantId": "X_3_A_A"}, {"variantId": "X_5_A_A"}],
+            [],
+        ),
+        # NOT to be flagged (Unresolved LD)
+        (
+            5,
+            "v5",
+            "s1",
+            "X",
+            "pics",
+            5,
+            5,
+            [
+                {"variantId": "X_5_A_A"},
+            ],
+            [StudyLocusQualityCheck.UNRESOLVED_LD.value],
+        ),
+        # NOT to be flagged (different study)
+        (
+            6,
+            "v6",
+            "s2",
+            "X",
+            "pics",
+            3,
+            5,
+            [
+                {"variantId": "X_3_A_A"},
+                {"variantId": "X_5_A_A"},
+            ],
+            [],
+        ),
+    ]
+
+    STUDY_LOCUS_SCHEMA = t.StructType(
+        [
+            t.StructField("studyLocusId", t.LongType(), False),
+            t.StructField("variantId", t.StringType(), False),
+            t.StructField("studyId", t.StringType(), False),
+            t.StructField("chromosome", t.StringType(), False),
+            t.StructField("finemappingMethod", t.StringType(), False),
+            t.StructField("locusStart", t.IntegerType(), False),
+            t.StructField("locusEnd", t.IntegerType(), False),
+            StructField(
+                "locus",
+                ArrayType(
+                    StructType(
+                        [
+                            StructField("variantId", StringType(), True),
+                        ]
+                    )
+                ),
+                True,
+            ),
+            t.StructField("qualityControls", t.ArrayType(t.StringType()), False),
+        ]
+    )
+
+    @pytest.fixture(autouse=True)
+    def _setup(
+        self: TestStudyLocusSuSiERedundancyFlagging, spark: SparkSession
+    ) -> None:
+        """Setup study locus for testing."""
+        self.study_locus = StudyLocus(
+            _df=spark.createDataFrame(
+                self.STUDY_LOCUS_DATA, schema=self.STUDY_LOCUS_SCHEMA
+            ),
+            _schema=StudyLocus.get_schema(),
+        )
+
+    def test_qc_qc_explained_by_SuSiE_returntype(
+        self: TestStudyLocusSuSiERedundancyFlagging,
+    ) -> None:
+        """Test qc_explained_by_SuSiE."""
+        assert isinstance(self.study_locus.qc_explained_by_SuSiE(), StudyLocus)
+
+    def test_qc_explained_by_SuSiE_no_data_loss(
+        self: TestStudyLocusSuSiERedundancyFlagging,
+    ) -> None:
+        """Test qc_explained_by_SuSiE no data loss."""
+        assert (
+            self.study_locus.qc_explained_by_SuSiE().df.count()
+            == self.study_locus.df.count()
+        )
+
+    def test_qc_explained_by_SuSiE_correctness(
+        self: TestStudyLocusSuSiERedundancyFlagging,
+    ) -> None:
+        """Testing if the study validation flags the right number of studies."""
+        assert (
+            self.study_locus.qc_explained_by_SuSiE()
+            .df.filter(
+                f.array_contains(
+                    f.col("qualityControls"),
+                    StudyLocusQualityCheck.EXPLAINED_BY_SUSIE.value,
+                )
+            )
+            .count()
+        ) == 2