From 84d663849716a61fa40642959d80300dd99842fc Mon Sep 17 00:00:00 2001 From: David Ochoa Date: Tue, 24 Sep 2024 16:57:29 +0100 Subject: [PATCH] feat: 99% credible set validation during `study_locus_validation` (#765) * feat: study locus validation filters for 95% credible sets * revert: no longer needed to filter for credible set interval * feat: annotate credible sets before filtering them * docs: adding more context here --- src/gentropy/colocalisation.py | 6 ++---- src/gentropy/dataset/study_locus.py | 4 ++-- src/gentropy/pics.py | 6 ++---- src/gentropy/study_locus_validation.py | 4 +++- 4 files changed, 9 insertions(+), 11 deletions(-) diff --git a/src/gentropy/colocalisation.py b/src/gentropy/colocalisation.py index 4f8431b98..0dcdff206 100644 --- a/src/gentropy/colocalisation.py +++ b/src/gentropy/colocalisation.py @@ -8,7 +8,7 @@ from pyspark.sql.functions import col from gentropy.common.session import Session -from gentropy.dataset.study_locus import CredibleInterval, StudyLocus +from gentropy.dataset.study_locus import StudyLocus from gentropy.method.colocalisation import Coloc @@ -46,9 +46,7 @@ def __init__( ) # Transform - overlaps = credible_set.filter_credible_set( - CredibleInterval.IS95 - ).find_overlaps() + overlaps = credible_set.find_overlaps() colocalisation_results = colocalisation_class.colocalise(overlaps) # type: ignore # Load diff --git a/src/gentropy/dataset/study_locus.py b/src/gentropy/dataset/study_locus.py index 2385df984..c7f9ffc3d 100644 --- a/src/gentropy/dataset/study_locus.py +++ b/src/gentropy/dataset/study_locus.py @@ -553,7 +553,7 @@ def filter_credible_set( self: StudyLocus, credible_interval: CredibleInterval, ) -> StudyLocus: - """Filter study-locus tag variants based on given credible interval. + """Annotate and filter study-locus tag variants based on given credible interval. Args: credible_interval (CredibleInterval): Credible interval to filter for. 
@@ -562,7 +562,7 @@ def filter_credible_set( StudyLocus: Filtered study-locus dataset. """ return StudyLocus( - _df=self._df.withColumn( + _df=self.annotate_credible_sets().df.withColumn( "locus", f.filter( f.col("locus"), diff --git a/src/gentropy/pics.py b/src/gentropy/pics.py index 80421b9ae..e80a37eb6 100644 --- a/src/gentropy/pics.py +++ b/src/gentropy/pics.py @@ -28,10 +28,8 @@ def __init__( session, study_locus_ld_annotated_in ) # PICS - picsed_sl = ( - PICS.finemap(study_locus_ld_annotated) - .annotate_credible_sets() - .filter_credible_set(credible_interval=CredibleInterval.IS99) + picsed_sl = PICS.finemap(study_locus_ld_annotated).filter_credible_set( + credible_interval=CredibleInterval.IS99 ) # Write picsed_sl.df.write.mode(session.write_mode).parquet(picsed_study_locus_out) diff --git a/src/gentropy/study_locus_validation.py b/src/gentropy/study_locus_validation.py index 7c853bbcb..114eb01f7 100644 --- a/src/gentropy/study_locus_validation.py +++ b/src/gentropy/study_locus_validation.py @@ -4,7 +4,7 @@ from gentropy.common.session import Session from gentropy.dataset.study_index import StudyIndex -from gentropy.dataset.study_locus import StudyLocus +from gentropy.dataset.study_locus import CredibleInterval, StudyLocus class StudyLocusValidationStep: @@ -46,6 +46,8 @@ def __init__( .validate_study(study_index) # Flagging studies not in study index .annotate_study_type(study_index) # Add study type to study locus .qc_redundant_top_hits_from_PICS() # Flagging top hits from studies with PICS summary statistics + # Annotate credible intervals and filter to keep only 99% credible sets + .filter_credible_set(credible_interval=CredibleInterval.IS99) ).persist() # we will need this for 2 types of outputs study_locus_with_qc.valid_rows(