Skip to content

Commit

Permalink
feat: flag credible sets explained by SuSiE regions (#780)
Browse files Browse the repository at this point in the history
* feat: flag PICS top hits in studies with credset sumstats

* feat: flag credible sets explained by SuSiE region

* feat: consider unresolved LD cases

* refactor: improve readability of code

* refactor: improve code readability

---------

Co-authored-by: Daniel Suveges <[email protected]>
  • Loading branch information
d0choa and DSuveges committed Sep 24, 2024
1 parent 84d6638 commit 2199ece
Show file tree
Hide file tree
Showing 3 changed files with 233 additions and 0 deletions.
72 changes: 72 additions & 0 deletions src/gentropy/dataset/study_locus.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ class StudyLocusQualityCheck(Enum):
TOP_HIT (str): Study locus from curated top hit
IN_MHC (str): Flagging study loci in the MHC region
REDUNDANT_PICS_TOP_HIT (str): Flagging study loci in studies with PICS results from summary statistics
EXPLAINED_BY_SUSIE (str): Study locus in region explained by a SuSiE credible set
"""

SUBSIGNIFICANT_FLAG = "Subsignificant p-value"
Expand Down Expand Up @@ -84,6 +85,7 @@ class StudyLocusQualityCheck(Enum):
"PICS results from summary statistics available for this same study"
)
TOP_HIT = "Study locus from curated top hit"
EXPLAINED_BY_SUSIE = "Study locus in region explained by a SuSiE credible set"


class CredibleInterval(Enum):
Expand Down Expand Up @@ -963,6 +965,76 @@ def qc_redundant_top_hits_from_PICS(self: StudyLocus) -> StudyLocus:
_schema=StudyLocus.get_schema(),
)

def qc_explained_by_SuSiE(self: StudyLocus) -> StudyLocus:
"""Flag associations that are explained by SuSiE associations.
Credible sets overlapping in the same region as a SuSiE credible set are flagged as explained by SuSiE.
Returns:
StudyLocus: Updated study locus with SuSiE explained flags.
"""
# unique study-regions covered by SuSie credible sets
susie_study_regions = (
self.filter(f.col("finemappingMethod") == "SuSiE-inf")
.df.select(
"studyId",
"chromosome",
"locusStart",
"locusEnd",
f.lit(True).alias("inSuSiE"),
)
.distinct()
)

# non SuSiE credible sets (studyLocusId) overlapping in any variant with SuSiE locus
redundant_study_locus = (
self.filter(f.col("finemappingMethod") != "SuSiE-inf")
.df.withColumn("l", f.explode("locus"))
.select(
"studyLocusId",
"studyId",
"chromosome",
f.split(f.col("l.variantId"), "_")[1].alias("tag_position"),
)
.alias("study_locus")
.join(
susie_study_regions.alias("regions"),
how="inner",
on=[
(f.col("study_locus.chromosome") == f.col("regions.chromosome"))
& (f.col("study_locus.studyId") == f.col("regions.studyId"))
& (f.col("study_locus.tag_position") >= f.col("regions.locusStart"))
& (f.col("study_locus.tag_position") <= f.col("regions.locusEnd"))
],
)
.select("studyLocusId", "inSuSiE")
.distinct()
)

return StudyLocus(
_df=(
self.df.join(redundant_study_locus, on="studyLocusId", how="left")
.withColumn(
"qualityControls",
self.update_quality_flag(
f.col("qualityControls"),
# credible set in SuSiE overlapping region
f.col("inSuSiE")
# credible set not based on SuSiE
& (f.col("finemappingMethod") != "SuSiE-inf")
# credible set not already flagged as unresolved LD
& ~f.array_contains(
f.col("qualityControls"),
StudyLocusQualityCheck.UNRESOLVED_LD.value,
),
StudyLocusQualityCheck.EXPLAINED_BY_SUSIE,
),
)
.drop("inSuSiE")
),
_schema=StudyLocus.get_schema(),
)

def _qc_no_population(self: StudyLocus) -> StudyLocus:
"""Flag associations where the study doesn't have population information to resolve LD.
Expand Down
1 change: 1 addition & 0 deletions src/gentropy/study_locus_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ def __init__(
.validate_study(study_index) # Flagging studies not in study index
.annotate_study_type(study_index) # Add study type to study locus
.qc_redundant_top_hits_from_PICS() # Flagging top hits from studies with PICS summary statistics
.qc_explained_by_SuSiE() # Flagging credible sets in regions explained by SuSiE
# Annotates credible intervals and filter to only keep 99% credible sets
.filter_credible_set(credible_interval=CredibleInterval.IS99)
).persist() # we will need this for 2 types of outputs
Expand Down
160 changes: 160 additions & 0 deletions tests/gentropy/dataset/test_study_locus.py
Original file line number Diff line number Diff line change
Expand Up @@ -904,3 +904,163 @@ def test_qc_redundant_top_hits_from_PICS_correctness(
)
.count()
) == 3


class TestStudyLocusSuSiERedundancyFlagging:
"""Collection of tests related to flagging redundant credible sets."""

STUDY_LOCUS_DATA: Any = [
# to be flagged due to v4
(
1,
"v1",
"s1",
"X",
"pics",
1,
3,
[
{"variantId": "X_1_A_A"},
{"variantId": "X_2_A_A"},
{"variantId": "X_3_A_A"},
],
[],
),
# to be flagged due to v4
(
2,
"v2",
"s1",
"X",
"pics",
4,
5,
[
{"variantId": "X_4_A_A"},
{"variantId": "X_5_A_A"},
],
[],
),
# NOT to be flagged (outside regions)
(
3,
"v3",
"s1",
"X",
"pics",
6,
7,
[
{"variantId": "X_6_A_A"},
{"variantId": "X_7_A_A"},
],
[],
),
# NOT to be flagged (SuSie-Inf credible set)
(
4,
"v4",
"s1",
"X",
"SuSiE-inf",
3,
5,
[{"variantId": "X_3_A_A"}, {"variantId": "X_5_A_A"}],
[],
),
# NOT to be flagged (Unresolved LD)
(
5,
"v5",
"s1",
"X",
"pics",
5,
5,
[
{"variantId": "X_5_A_A"},
],
[StudyLocusQualityCheck.UNRESOLVED_LD.value],
),
# NOT to be flagged (different study)
(
6,
"v6",
"s2",
"X",
"pics",
3,
5,
[
{"variantId": "X_3_A_A"},
{"variantId": "X_5_A_A"},
],
[],
),
]

STUDY_LOCUS_SCHEMA = t.StructType(
[
t.StructField("studyLocusId", t.LongType(), False),
t.StructField("variantId", t.StringType(), False),
t.StructField("studyId", t.StringType(), False),
t.StructField("chromosome", t.StringType(), False),
t.StructField("finemappingMethod", t.StringType(), False),
t.StructField("locusStart", t.IntegerType(), False),
t.StructField("locusEnd", t.IntegerType(), False),
StructField(
"locus",
ArrayType(
StructType(
[
StructField("variantId", StringType(), True),
]
)
),
True,
),
t.StructField("qualityControls", t.ArrayType(t.StringType()), False),
]
)

@pytest.fixture(autouse=True)
def _setup(
self: TestStudyLocusSuSiERedundancyFlagging, spark: SparkSession
) -> None:
"""Setup study locus for testing."""
self.study_locus = StudyLocus(
_df=spark.createDataFrame(
self.STUDY_LOCUS_DATA, schema=self.STUDY_LOCUS_SCHEMA
),
_schema=StudyLocus.get_schema(),
)

def test_qc_qc_explained_by_SuSiE_returntype(
self: TestStudyLocusSuSiERedundancyFlagging,
) -> None:
"""Test qc_explained_by_SuSiE."""
assert isinstance(self.study_locus.qc_explained_by_SuSiE(), StudyLocus)

def test_qc_explained_by_SuSiE_no_data_loss(
self: TestStudyLocusSuSiERedundancyFlagging,
) -> None:
"""Test qc_explained_by_SuSiE no data loss."""
assert (
self.study_locus.qc_explained_by_SuSiE().df.count()
== self.study_locus.df.count()
)

def test_qc_explained_by_SuSiE_correctness(
self: TestStudyLocusSuSiERedundancyFlagging,
) -> None:
"""Testing if the study validation flags the right number of studies."""
assert (
self.study_locus.qc_explained_by_SuSiE()
.df.filter(
f.array_contains(
f.col("qualityControls"),
StudyLocusQualityCheck.EXPLAINED_BY_SUSIE.value,
)
)
.count()
) == 2

0 comments on commit 2199ece

Please sign in to comment.