From 2199ece29182cbfb3911c1a8b6d261f9a76bf1f0 Mon Sep 17 00:00:00 2001 From: David Ochoa Date: Tue, 24 Sep 2024 17:04:42 +0100 Subject: [PATCH] feat: flag credible sets explained by SuSiE regions (#780) * feat: flag PICS top hits in studies with credset sumstats * feat: flag credible sets explained by SuSiE region * feat: consider unresolved LD cases * refactor: improve readability of code * refactor: improve code readability --------- Co-authored-by: Daniel Suveges --- src/gentropy/dataset/study_locus.py | 72 ++++++++++ src/gentropy/study_locus_validation.py | 1 + tests/gentropy/dataset/test_study_locus.py | 160 +++++++++++++++++++++ 3 files changed, 233 insertions(+) diff --git a/src/gentropy/dataset/study_locus.py b/src/gentropy/dataset/study_locus.py index c7f9ffc3d..e3706eaf0 100644 --- a/src/gentropy/dataset/study_locus.py +++ b/src/gentropy/dataset/study_locus.py @@ -57,6 +57,7 @@ class StudyLocusQualityCheck(Enum): TOP_HIT (str): Study locus from curated top hit IN_MHC (str): Flagging study loci in the MHC region REDUNDANT_PICS_TOP_HIT (str): Flagging study loci in studies with PICS results from summary statistics + EXPLAINED_BY_SUSIE (str): Study locus in region explained by a SuSiE credible set """ SUBSIGNIFICANT_FLAG = "Subsignificant p-value" @@ -84,6 +85,7 @@ class StudyLocusQualityCheck(Enum): "PICS results from summary statistics available for this same study" ) TOP_HIT = "Study locus from curated top hit" + EXPLAINED_BY_SUSIE = "Study locus in region explained by a SuSiE credible set" class CredibleInterval(Enum): @@ -963,6 +965,76 @@ def qc_redundant_top_hits_from_PICS(self: StudyLocus) -> StudyLocus: _schema=StudyLocus.get_schema(), ) + def qc_explained_by_SuSiE(self: StudyLocus) -> StudyLocus: + """Flag associations that are explained by SuSiE associations. + + Credible sets overlapping in the same region as a SuSiE credible set are flagged as explained by SuSiE. + + Returns: + StudyLocus: Updated study locus with SuSiE explained flags. + """ + # unique study-regions covered by SuSie credible sets + susie_study_regions = ( + self.filter(f.col("finemappingMethod") == "SuSiE-inf") + .df.select( + "studyId", + "chromosome", + "locusStart", + "locusEnd", + f.lit(True).alias("inSuSiE"), + ) + .distinct() + ) + + # non SuSiE credible sets (studyLocusId) overlapping in any variant with SuSiE locus + redundant_study_locus = ( + self.filter(f.col("finemappingMethod") != "SuSiE-inf") + .df.withColumn("l", f.explode("locus")) + .select( + "studyLocusId", + "studyId", + "chromosome", + f.split(f.col("l.variantId"), "_")[1].alias("tag_position"), + ) + .alias("study_locus") + .join( + susie_study_regions.alias("regions"), + how="inner", + on=[ + (f.col("study_locus.chromosome") == f.col("regions.chromosome")) + & (f.col("study_locus.studyId") == f.col("regions.studyId")) + & (f.col("study_locus.tag_position") >= f.col("regions.locusStart")) + & (f.col("study_locus.tag_position") <= f.col("regions.locusEnd")) + ], + ) + .select("studyLocusId", "inSuSiE") + .distinct() + ) + + return StudyLocus( + _df=( + self.df.join(redundant_study_locus, on="studyLocusId", how="left") + .withColumn( + "qualityControls", + self.update_quality_flag( + f.col("qualityControls"), + # credible set in SuSiE overlapping region + f.col("inSuSiE") + # credible set not based on SuSiE + & (f.col("finemappingMethod") != "SuSiE-inf") + # credible set not already flagged as unresolved LD + & ~f.array_contains( + f.col("qualityControls"), + StudyLocusQualityCheck.UNRESOLVED_LD.value, + ), + StudyLocusQualityCheck.EXPLAINED_BY_SUSIE, + ), + ) + .drop("inSuSiE") + ), + _schema=StudyLocus.get_schema(), + ) + def _qc_no_population(self: StudyLocus) -> StudyLocus: """Flag associations where the study doesn't have population information to resolve LD. diff --git a/src/gentropy/study_locus_validation.py b/src/gentropy/study_locus_validation.py index 114eb01f7..287cd5645 100644 --- a/src/gentropy/study_locus_validation.py +++ b/src/gentropy/study_locus_validation.py @@ -46,6 +46,7 @@ def __init__( .validate_study(study_index) # Flagging studies not in study index .annotate_study_type(study_index) # Add study type to study locus .qc_redundant_top_hits_from_PICS() # Flagging top hits from studies with PICS summary statistics + .qc_explained_by_SuSiE() # Flagging credible sets in regions explained by SuSiE # Annotates credible intervals and filter to only keep 99% credible sets .filter_credible_set(credible_interval=CredibleInterval.IS99) ).persist() # we will need this for 2 types of outputs diff --git a/tests/gentropy/dataset/test_study_locus.py b/tests/gentropy/dataset/test_study_locus.py index 51fc2ed92..29cbffad2 100644 --- a/tests/gentropy/dataset/test_study_locus.py +++ b/tests/gentropy/dataset/test_study_locus.py @@ -904,3 +904,163 @@ def test_qc_redundant_top_hits_from_PICS_correctness( ) .count() ) == 3 + + +class TestStudyLocusSuSiERedundancyFlagging: + """Collection of tests related to flagging redundant credible sets.""" + + STUDY_LOCUS_DATA: Any = [ + # to be flagged due to v4 + ( + 1, + "v1", + "s1", + "X", + "pics", + 1, + 3, + [ + {"variantId": "X_1_A_A"}, + {"variantId": "X_2_A_A"}, + {"variantId": "X_3_A_A"}, + ], + [], + ), + # to be flagged due to v4 + ( + 2, + "v2", + "s1", + "X", + "pics", + 4, + 5, + [ + {"variantId": "X_4_A_A"}, + {"variantId": "X_5_A_A"}, + ], + [], + ), + # NOT to be flagged (outside regions) + ( + 3, + "v3", + "s1", + "X", + "pics", + 6, + 7, + [ + {"variantId": "X_6_A_A"}, + {"variantId": "X_7_A_A"}, + ], + [], + ), + # NOT to be flagged (SuSie-Inf credible set) + ( + 4, + "v4", + "s1", + "X", + "SuSiE-inf", + 3, + 5, + [{"variantId": "X_3_A_A"}, {"variantId": "X_5_A_A"}], + [], + ), + # NOT to be flagged (Unresolved LD) + ( + 5, + "v5", + "s1", + "X", + "pics", + 5, + 5, + [ + {"variantId": "X_5_A_A"}, + ], + [StudyLocusQualityCheck.UNRESOLVED_LD.value], + ), + # NOT to be flagged (different study) + ( + 6, + "v6", + "s2", + "X", + "pics", + 3, + 5, + [ + {"variantId": "X_3_A_A"}, + {"variantId": "X_5_A_A"}, + ], + [], + ), + ] + + STUDY_LOCUS_SCHEMA = t.StructType( + [ + t.StructField("studyLocusId", t.LongType(), False), + t.StructField("variantId", t.StringType(), False), + t.StructField("studyId", t.StringType(), False), + t.StructField("chromosome", t.StringType(), False), + t.StructField("finemappingMethod", t.StringType(), False), + t.StructField("locusStart", t.IntegerType(), False), + t.StructField("locusEnd", t.IntegerType(), False), + StructField( + "locus", + ArrayType( + StructType( + [ + StructField("variantId", StringType(), True), + ] + ) + ), + True, + ), + t.StructField("qualityControls", t.ArrayType(t.StringType()), False), + ] + ) + + @pytest.fixture(autouse=True) + def _setup( + self: TestStudyLocusSuSiERedundancyFlagging, spark: SparkSession + ) -> None: + """Setup study locus for testing.""" + self.study_locus = StudyLocus( + _df=spark.createDataFrame( + self.STUDY_LOCUS_DATA, schema=self.STUDY_LOCUS_SCHEMA + ), + _schema=StudyLocus.get_schema(), + ) + + def test_qc_qc_explained_by_SuSiE_returntype( + self: TestStudyLocusSuSiERedundancyFlagging, + ) -> None: + """Test qc_explained_by_SuSiE.""" + assert isinstance(self.study_locus.qc_explained_by_SuSiE(), StudyLocus) + + def test_qc_explained_by_SuSiE_no_data_loss( + self: TestStudyLocusSuSiERedundancyFlagging, + ) -> None: + """Test qc_explained_by_SuSiE no data loss.""" + assert ( + self.study_locus.qc_explained_by_SuSiE().df.count() + == self.study_locus.df.count() + ) + + def test_qc_explained_by_SuSiE_correctness( + self: TestStudyLocusSuSiERedundancyFlagging, + ) -> None: + """Testing if the study validation flags the right number of studies.""" + assert ( + self.study_locus.qc_explained_by_SuSiE() + .df.filter( + f.array_contains( + f.col("qualityControls"), + StudyLocusQualityCheck.EXPLAINED_BY_SUSIE.value, + ) + ) + .count() + ) == 2