From dcacaf7d2694818c4d60545944ba6d40f27e45a8 Mon Sep 17 00:00:00 2001 From: Vivien Ho <56025826+vivienho@users.noreply.github.com> Date: Tue, 24 Sep 2024 14:37:11 +0100 Subject: [PATCH 1/2] feat: add `studyType` to `StudyLocus` and `Colocalisation` (and `StudyLocusOverlap`) (#782) * feat: add studyType to StudyLocus schema * feat: add annotate_study_type function to add studyType to StudyLocus * fix: remove lines for retrieving studyType as StudyLocus now contains studyType * fix: add studyType to test input data as StudyLocus now contains studyType * feat: add leftStudyType and rightStudyType to Colocalisation and StudyLocusOverlap schemas * feat: update _convert_to_square_matrix and its test with leftStudyType and rightStudyType * feat: update test_find_overlaps_semantic inputs with leftStudyType and rightStudyType * feat: update tests in test_colocalisation_method.py with leftStudyType and rightStudyType * feat: add leftStudyType and rightStudyType when creating StudyLocusOverlap * feat: add leftStudyType and rightStudyType to Colocalisation results * fix: remove redundant study_index parameter from filter_by_study_type function def and calls * fix: remove redundant study_index parameter from find_overlaps function def and calls * fix: remove leftStudyType from Colocalisation (not needed as always gwas) * fix: remove leftStudyType from StudyLocusOverlap (not needed as always gwas) * fix: missing comma * feat: update tests in test_locus_to_gene.py with studyType and rightStudyType * feat: update tests (colocalisation, l2g, l2g feature matrix) with rightStudyType * fix: remove studyType from metadata_cols in append_study_metadata function call --- .../assets/schemas/colocalisation.json | 6 ++ src/gentropy/assets/schemas/study_locus.json | 6 ++ .../assets/schemas/study_locus_overlap.json | 6 ++ src/gentropy/colocalisation.py | 8 +-- src/gentropy/dataset/colocalisation.py | 2 +- src/gentropy/dataset/study_locus.py | 31 ++++++++-- src/gentropy/dataset/study_locus_overlap.py | 7 +-- src/gentropy/l2g.py | 4 +- src/gentropy/method/colocalisation.py | 4 +- src/gentropy/study_locus_validation.py | 1 + tests/gentropy/dataset/test_colocalisation.py | 3 +- tests/gentropy/dataset/test_l2g.py | 4 +- .../dataset/test_l2g_feature_matrix.py | 3 +- tests/gentropy/dataset/test_study_locus.py | 58 ++++--------------- .../dataset/test_study_locus_overlap.py | 10 ++-- .../dataset/test_study_locus_overlaps.py | 8 +-- .../method/test_colocalisation_method.py | 5 ++ 17 files changed, 83 insertions(+), 83 deletions(-) diff --git a/src/gentropy/assets/schemas/colocalisation.json b/src/gentropy/assets/schemas/colocalisation.json index 7ff7453b9..6e1163cfe 100644 --- a/src/gentropy/assets/schemas/colocalisation.json +++ b/src/gentropy/assets/schemas/colocalisation.json @@ -13,6 +13,12 @@ "type": "long", "metadata": {} }, + { + "name": "rightStudyType", + "nullable": false, + "type": "string", + "metadata": {} + }, { "name": "chromosome", "nullable": false, diff --git a/src/gentropy/assets/schemas/study_locus.json b/src/gentropy/assets/schemas/study_locus.json index 11908f687..a8d15aba6 100644 --- a/src/gentropy/assets/schemas/study_locus.json +++ b/src/gentropy/assets/schemas/study_locus.json @@ -6,6 +6,12 @@ "nullable": false, "type": "long" }, + { + "metadata": {}, + "name": "studyType", + "nullable": true, + "type": "string" + }, { "metadata": {}, "name": "variantId", diff --git a/src/gentropy/assets/schemas/study_locus_overlap.json b/src/gentropy/assets/schemas/study_locus_overlap.json index 9a8e123cd..22ba7705e 100644 --- a/src/gentropy/assets/schemas/study_locus_overlap.json +++ b/src/gentropy/assets/schemas/study_locus_overlap.json @@ -12,6 +12,12 @@ "nullable": false, "type": "long" }, + { + "metadata": {}, + "name": "rightStudyType", + "nullable": false, + "type": "string" + }, { "metadata": {}, "name": "chromosome", diff --git a/src/gentropy/colocalisation.py b/src/gentropy/colocalisation.py index 6b370d426..4f8431b98 100644 --- a/src/gentropy/colocalisation.py +++ b/src/gentropy/colocalisation.py @@ -8,7 +8,6 @@ from pyspark.sql.functions import col from gentropy.common.session import Session -from gentropy.dataset.study_index import StudyIndex from gentropy.dataset.study_locus import CredibleInterval, StudyLocus from gentropy.method.colocalisation import Coloc @@ -23,7 +22,6 @@ def __init__( self, session: Session, credible_set_path: str, - study_index_path: str, coloc_path: str, colocalisation_method: str, ) -> None: @@ -32,7 +30,6 @@ def __init__( Args: session (Session): Session object. credible_set_path (str): Input credible sets path. - study_index_path (str): Input study index path. coloc_path (str): Output Colocalisation path. colocalisation_method (str): Colocalisation method. """ @@ -47,14 +44,11 @@ def __init__( session, credible_set_path, recursiveFileLookup=True ) ) - si = StudyIndex.from_parquet( - session, study_index_path, recursiveFileLookup=True - ) # Transform overlaps = credible_set.filter_credible_set( CredibleInterval.IS95 - ).find_overlaps(si) + ).find_overlaps() colocalisation_results = colocalisation_class.colocalise(overlaps) # type: ignore # Load diff --git a/src/gentropy/dataset/colocalisation.py b/src/gentropy/dataset/colocalisation.py index c0d074ae3..94a4f09dc 100644 --- a/src/gentropy/dataset/colocalisation.py +++ b/src/gentropy/dataset/colocalisation.py @@ -91,7 +91,7 @@ def extract_maximum_coloc_probability_per_region_and_gene( self.append_study_metadata( study_locus, study_index, - metadata_cols=["studyType", "geneId"], + metadata_cols=["geneId"], colocalisation_side="right", ) # it also filters based on method and qtl type diff --git a/src/gentropy/dataset/study_locus.py b/src/gentropy/dataset/study_locus.py index e8363aa4e..57482fda8 100644 --- a/src/gentropy/dataset/study_locus.py +++ b/src/gentropy/dataset/study_locus.py @@ -157,6 +157,24 @@ def validate_study(self: StudyLocus, study_index: StudyIndex) -> StudyLocus: _schema=self.get_schema(), ) + def annotate_study_type(self: StudyLocus, study_index: StudyIndex) -> StudyLocus: + """Gets study type from study index and adds it to study locus. + + Args: + study_index (StudyIndex): Study index to get study type. + + Returns: + StudyLocus: Updated study locus with study type. + """ + return StudyLocus( + _df=( + self.df + .drop("studyType") + .join(study_index.study_type_lut(), on="studyId", how="left") + ), + _schema=self.get_schema(), + ) + def validate_variant_identifiers( self: StudyLocus, variant_index: VariantIndex ) -> StudyLocus: @@ -394,6 +412,7 @@ def _align_overlapping_tags( f.col("chromosome"), f.col("tagVariantId"), f.col("studyLocusId").alias("rightStudyLocusId"), + f.col("studyType").alias("rightStudyType"), *[f.col(col).alias(f"right_{col}") for col in stats_cols], ).join(peak_overlaps, on=["chromosome", "rightStudyLocusId"], how="inner") @@ -410,6 +429,7 @@ def _align_overlapping_tags( ).select( "leftStudyLocusId", "rightStudyLocusId", + "rightStudyType", "chromosome", "tagVariantId", f.struct( @@ -505,13 +525,12 @@ def get_QC_mappings(cls: type[StudyLocus]) -> dict[str, str]: return {member.name: member.value for member in StudyLocusQualityCheck} def filter_by_study_type( - self: StudyLocus, study_type: str, study_index: StudyIndex + self: StudyLocus, study_type: str ) -> StudyLocus: """Creates a new StudyLocus dataset filtered by study type. Args: study_type (str): Study type to filter for. Can be one of `gwas`, `eqtl`, `pqtl`, `eqtl`. - study_index (StudyIndex): Study index to resolve study types. Returns: StudyLocus: Filtered study-locus dataset. @@ -524,7 +543,7 @@ def filter_by_study_type( f"Study type {study_type} not supported. Supported types are: gwas, eqtl, pqtl, sqtl." ) new_df = ( - self.df.join(study_index.study_type_lut(), on="studyId", how="inner") + self.df .filter(f.col("studyType") == study_type) .drop("studyType") ) @@ -576,7 +595,7 @@ def filter_ld_set(ld_set: Column, r2_threshold: float) -> Column: ) def find_overlaps( - self: StudyLocus, study_index: StudyIndex, intra_study_overlap: bool = False + self: StudyLocus, intra_study_overlap: bool = False ) -> StudyLocusOverlap: """Calculate overlapping study-locus. @@ -584,14 +603,14 @@ def find_overlaps( appearing on the right side. Args: - study_index (StudyIndex): Study index to resolve study types. intra_study_overlap (bool): If True, finds intra-study overlaps for credible set deduplication. Default is False. Returns: StudyLocusOverlap: Pairs of overlapping study-locus with aligned tags. """ loci_to_overlap = ( - self.df.join(study_index.study_type_lut(), on="studyId", how="inner") + self.df + .filter(f.col("studyType").isNotNull()) .withColumn("locus", f.explode("locus")) .select( "studyLocusId", diff --git a/src/gentropy/dataset/study_locus_overlap.py b/src/gentropy/dataset/study_locus_overlap.py index 5f839bd9c..d14a2da96 100644 --- a/src/gentropy/dataset/study_locus_overlap.py +++ b/src/gentropy/dataset/study_locus_overlap.py @@ -10,7 +10,6 @@ if TYPE_CHECKING: from pyspark.sql.types import StructType - from gentropy.dataset.study_index import StudyIndex from gentropy.dataset.study_locus import StudyLocus @@ -36,18 +35,17 @@ def get_schema(cls: type[StudyLocusOverlap]) -> StructType: @classmethod def from_associations( - cls: type[StudyLocusOverlap], study_locus: StudyLocus, study_index: StudyIndex + cls: type[StudyLocusOverlap], study_locus: StudyLocus ) -> StudyLocusOverlap: """Find the overlapping signals in a particular set of associations (StudyLocus dataset). Args: study_locus (StudyLocus): Study-locus associations to find the overlapping signals - study_index (StudyIndex): Study index to find the overlapping signals Returns: StudyLocusOverlap: Study-locus overlap dataset """ - return study_locus.find_overlaps(study_index) + return study_locus.find_overlaps() def _convert_to_square_matrix(self: StudyLocusOverlap) -> StudyLocusOverlap: """Convert the dataset to a square matrix. @@ -60,6 +58,7 @@ def _convert_to_square_matrix(self: StudyLocusOverlap) -> StudyLocusOverlap: self.df.selectExpr( "leftStudyLocusId as rightStudyLocusId", "rightStudyLocusId as leftStudyLocusId", + "rightStudyType", "tagVariantId", ) ).distinct(), diff --git a/src/gentropy/l2g.py b/src/gentropy/l2g.py index 832023cd8..13dbb881b 100644 --- a/src/gentropy/l2g.py +++ b/src/gentropy/l2g.py @@ -204,7 +204,7 @@ def _generate_feature_matrix(self, write_feature_matrix: bool) -> L2GFeatureMatr ValueError: If write_feature_matrix is set to True but a path is not provided. ValueError: If dependencies to build features are not set. """ - if self.gs_curation and self.interactions and self.v2g and self.studies: + if self.gs_curation and self.interactions and self.v2g: study_locus_overlap = StudyLocus( _df=self.credible_set.df.join( f.broadcast( @@ -225,7 +225,7 @@ def _generate_feature_matrix(self, write_feature_matrix: bool) -> L2GFeatureMatr "inner", ), _schema=StudyLocus.get_schema(), - ).find_overlaps(self.studies) + ).find_overlaps() gold_standards = L2GGoldStandard.from_otg_curation( gold_standard_curation=self.gs_curation, diff --git a/src/gentropy/method/colocalisation.py b/src/gentropy/method/colocalisation.py index c3320f931..7a3a0d9c5 100644 --- a/src/gentropy/method/colocalisation.py +++ b/src/gentropy/method/colocalisation.py @@ -79,7 +79,7 @@ def colocalise( f.col("statistics.right_posteriorProbability"), ), ) - .groupBy("leftStudyLocusId", "rightStudyLocusId", "chromosome") + .groupBy("leftStudyLocusId", "rightStudyLocusId", "rightStudyType", "chromosome") .agg( f.count("*").alias("numberColocalisingVariants"), f.sum(f.col("clpp")).alias("clpp"), @@ -168,7 +168,7 @@ def colocalise( f.col("left_logBF") + f.col("right_logBF"), ) # Group by overlapping peak and generating dense vectors of log_BF: - .groupBy("chromosome", "leftStudyLocusId", "rightStudyLocusId") + .groupBy("chromosome", "leftStudyLocusId", "rightStudyLocusId", "rightStudyType") .agg( f.count("*").alias("numberColocalisingVariants"), fml.array_to_vector(f.collect_list(f.col("left_logBF"))).alias( diff --git a/src/gentropy/study_locus_validation.py b/src/gentropy/study_locus_validation.py index e3d10f3db..4d1c234dc 100644 --- a/src/gentropy/study_locus_validation.py +++ b/src/gentropy/study_locus_validation.py @@ -46,6 +46,7 @@ def __init__( # Add flag for MHC region .qc_MHC_region() .validate_study(study_index) # Flagging studies not in study index + .annotate_study_type(study_index) # Add study type to study locus .qc_redundant_top_hits_from_PICS() # Flagging top hits from studies with PICS summary statistics .validate_unique_study_locus_id() # Flagging duplicated study locus ids ).persist() # we will need this for 2 types of outputs diff --git a/tests/gentropy/dataset/test_colocalisation.py b/tests/gentropy/dataset/test_colocalisation.py index 5371cf42c..8f2766fb4 100644 --- a/tests/gentropy/dataset/test_colocalisation.py +++ b/tests/gentropy/dataset/test_colocalisation.py @@ -100,10 +100,11 @@ def _setup(self: TestAppendStudyMetadata, spark: SparkSession) -> None: ) self.sample_colocalisation = Colocalisation( _df=spark.createDataFrame( - [(1, 2, "X", "COLOC", 1, 0.9)], + [(1, 2, "eqtl", "X", "COLOC", 1, 0.9)], [ "leftStudyLocusId", "rightStudyLocusId", + "rightStudyType", "chromosome", "colocalisationMethod", "numberColocalisingVariants", diff --git a/tests/gentropy/dataset/test_l2g.py b/tests/gentropy/dataset/test_l2g.py index d37ce5a4a..2523b97dd 100644 --- a/tests/gentropy/dataset/test_l2g.py +++ b/tests/gentropy/dataset/test_l2g.py @@ -70,8 +70,8 @@ def test_filter_unique_associations(spark: SparkSession) -> None: ) mock_sl_overlap_df = spark.createDataFrame( - [(1, 2, "variant2"), (1, 4, "variant4")], - "leftStudyLocusId LONG, rightStudyLocusId LONG, tagVariantId STRING", + [(1, 2, "eqtl", "variant2"), (1, 4, "eqtl", "variant4")], + "leftStudyLocusId LONG, rightStudyLocusId LONG, rightStudyType STRING, tagVariantId STRING", ) expected_df = spark.createDataFrame( diff --git a/tests/gentropy/dataset/test_l2g_feature_matrix.py b/tests/gentropy/dataset/test_l2g_feature_matrix.py index 46384239c..09460ee85 100644 --- a/tests/gentropy/dataset/test_l2g_feature_matrix.py +++ b/tests/gentropy/dataset/test_l2g_feature_matrix.py @@ -136,10 +136,11 @@ def _setup(self: TestFromFeaturesList, spark: SparkSession) -> None: ) self.sample_colocalisation = Colocalisation( _df=spark.createDataFrame( - [(1, 2, "X", "COLOC", 1, 0.9)], + [(1, 2, "eqtl", "X", "COLOC", 1, 0.9)], [ "leftStudyLocusId", "rightStudyLocusId", + "rightStudyType", "chromosome", "colocalisationMethod", "numberColocalisingVariants", diff --git a/tests/gentropy/dataset/test_study_locus.py b/tests/gentropy/dataset/test_study_locus.py index c89521b3c..ffb210d09 100644 --- a/tests/gentropy/dataset/test_study_locus.py +++ b/tests/gentropy/dataset/test_study_locus.py @@ -43,6 +43,7 @@ { "leftStudyLocusId": 1, "rightStudyLocusId": 2, + "rightStudyType": "eqtl", "chromosome": "1", "tagVariantId": "commonTag", "statistics": { @@ -53,6 +54,7 @@ { "leftStudyLocusId": 1, "rightStudyLocusId": 2, + "rightStudyType": "eqtl", "chromosome": "1", "tagVariantId": "nonCommonTag", "statistics": { @@ -79,6 +81,7 @@ def test_find_overlaps_semantic( "studyLocusId": 1, "variantId": "lead1", "studyId": "study1", + "studyType": "gwas", "locus": [ {"variantId": "commonTag", "posteriorProbability": 0.9}, ], @@ -88,6 +91,7 @@ def test_find_overlaps_semantic( "studyLocusId": 2, "variantId": "lead2", "studyId": "study2", + "studyType": "eqtl", "locus": [ {"variantId": "commonTag", "posteriorProbability": 0.6}, {"variantId": "nonCommonTag", "posteriorProbability": 0.6}, @@ -108,6 +112,7 @@ def test_find_overlaps_semantic( "studyLocusId": 1, "variantId": "lead1", "studyId": "study1", + "studyType": "gwas", "locus": [ {"variantId": "var1", "posteriorProbability": 0.9}, ], @@ -117,6 +122,7 @@ def test_find_overlaps_semantic( "studyLocusId": 2, "variantId": "lead2", "studyId": "study2", + "studyType": "eqtl", "locus": None, "chromosome": "1", }, @@ -126,25 +132,6 @@ def test_find_overlaps_semantic( _schema=StudyLocus.get_schema(), ) - studies = StudyIndex( - _df=spark.createDataFrame( - [ - { - "studyId": "study1", - "studyType": "gwas", - "traitFromSource": "trait1", - "projectId": "project1", - }, - { - "studyId": "study2", - "studyType": "eqtl", - "traitFromSource": "trait2", - "projectId": "project2", - }, - ] - ), - _schema=StudyIndex.get_schema(), - ) expected_overlaps_df = spark.createDataFrame( expected, StudyLocusOverlap.get_schema() ) @@ -154,18 +141,14 @@ def test_find_overlaps_semantic( "statistics.right_posteriorProbability", ] assert ( - credset.find_overlaps(studies).df.select(*cols_to_compare).collect() + credset.find_overlaps().df.select(*cols_to_compare).collect() == expected_overlaps_df.select(*cols_to_compare).collect() ), "Overlaps differ from expected." -def test_find_overlaps( - mock_study_locus: StudyLocus, mock_study_index: StudyIndex -) -> None: +def test_find_overlaps(mock_study_locus: StudyLocus) -> None: """Test study locus overlaps.""" - assert isinstance( - mock_study_locus.find_overlaps(mock_study_index), StudyLocusOverlap - ) + assert isinstance(mock_study_locus.find_overlaps(), StudyLocusOverlap) @pytest.mark.parametrize( @@ -184,39 +167,22 @@ def test_filter_by_study_type( "studyLocusId": 1, "variantId": "lead1", "studyId": "study1", + "studyType": "gwas", }, { # from eqtl "studyLocusId": 2, "variantId": "lead2", "studyId": "study2", + "studyType": "eqtl", }, ], StudyLocus.get_schema(), ), _schema=StudyLocus.get_schema(), ) - studies = StudyIndex( - _df=spark.createDataFrame( - [ - { - "studyId": "study1", - "studyType": "gwas", - "traitFromSource": "trait1", - "projectId": "project1", - }, - { - "studyId": "study2", - "studyType": "eqtl", - "traitFromSource": "trait2", - "projectId": "project2", - }, - ] - ), - _schema=StudyIndex.get_schema(), - ) - observed = sl.filter_by_study_type(study_type, studies) + observed = sl.filter_by_study_type(study_type) assert observed.df.count() == expected_sl_count diff --git a/tests/gentropy/dataset/test_study_locus_overlap.py b/tests/gentropy/dataset/test_study_locus_overlap.py index e26b59c30..7e591df30 100644 --- a/tests/gentropy/dataset/test_study_locus_overlap.py +++ b/tests/gentropy/dataset/test_study_locus_overlap.py @@ -19,19 +19,19 @@ def test_convert_to_square_matrix(spark: SparkSession) -> None: mock_sl_overlap = StudyLocusOverlap( _df=spark.createDataFrame( [ - (1, 2, "variant2"), + (1, 2, "eqtl", "variant2"), ], - "leftStudyLocusId LONG, rightStudyLocusId LONG, tagVariantId STRING", + "leftStudyLocusId LONG, rightStudyLocusId LONG, rightStudyType STRING, tagVariantId STRING", ), _schema=StudyLocusOverlap.get_schema(), ) expected_df = spark.createDataFrame( [ - (1, 2, "variant2"), - (2, 1, "variant2"), + (1, 2, "eqtl", "variant2"), + (2, 1, "eqtl", "variant2"), ], - "leftStudyLocusId LONG, rightStudyLocusId LONG, tagVariantId STRING", + "leftStudyLocusId LONG, rightStudyLocusId LONG, rightStudyType STRING, tagVariantId STRING", ) observed_df = mock_sl_overlap._convert_to_square_matrix().df diff --git a/tests/gentropy/dataset/test_study_locus_overlaps.py b/tests/gentropy/dataset/test_study_locus_overlaps.py index bd3415959..745f07ed2 100644 --- a/tests/gentropy/dataset/test_study_locus_overlaps.py +++ b/tests/gentropy/dataset/test_study_locus_overlaps.py @@ -13,8 +13,6 @@ if TYPE_CHECKING: from pyspark.sql import SparkSession - from gentropy.dataset.study_index import StudyIndex - def test_study_locus_overlap_creation( mock_study_locus_overlap: StudyLocusOverlap, @@ -23,11 +21,9 @@ def test_study_locus_overlap_creation( assert isinstance(mock_study_locus_overlap, StudyLocusOverlap) -def test_study_locus_overlap_from_associations( - mock_study_locus: StudyLocus, mock_study_index: StudyIndex -) -> None: +def test_study_locus_overlap_from_associations(mock_study_locus: StudyLocus) -> None: """Test colocalisation creation from mock associations.""" - overlaps = StudyLocusOverlap.from_associations(mock_study_locus, mock_study_index) + overlaps = StudyLocusOverlap.from_associations(mock_study_locus) assert isinstance(overlaps, StudyLocusOverlap) diff --git a/tests/gentropy/method/test_colocalisation_method.py b/tests/gentropy/method/test_colocalisation_method.py index d6798d831..e292784c1 100644 --- a/tests/gentropy/method/test_colocalisation_method.py +++ b/tests/gentropy/method/test_colocalisation_method.py @@ -29,6 +29,7 @@ def test_coloc(mock_study_locus_overlap: StudyLocusOverlap) -> None: { "leftStudyLocusId": 1, "rightStudyLocusId": 2, + "rightStudyType": "eqtl", "chromosome": "1", "tagVariantId": "snp", "statistics": {"left_logBF": 10.3, "right_logBF": 10.5}, @@ -52,6 +53,7 @@ def test_coloc(mock_study_locus_overlap: StudyLocusOverlap) -> None: { "leftStudyLocusId": 1, "rightStudyLocusId": 2, + "rightStudyType": "eqtl", "chromosome": "1", "tagVariantId": "snp1", "statistics": {"left_logBF": 10.3, "right_logBF": 10.5}, @@ -59,6 +61,7 @@ def test_coloc(mock_study_locus_overlap: StudyLocusOverlap) -> None: { "leftStudyLocusId": 1, "rightStudyLocusId": 2, + "rightStudyType": "eqtl", "chromosome": "1", "tagVariantId": "snp2", "statistics": {"left_logBF": 10.3, "right_logBF": 10.5}, @@ -119,6 +122,7 @@ def test_coloc_no_logbf( { "leftStudyLocusId": 1, "rightStudyLocusId": 2, + "rightStudyType": "eqtl", "chromosome": "1", "tagVariantId": "snp", "statistics": { @@ -131,6 +135,7 @@ def test_coloc_no_logbf( [ StructField("leftStudyLocusId", LongType(), False), StructField("rightStudyLocusId", LongType(), False), + StructField("rightStudyType", StringType(), False), StructField("chromosome", StringType(), False), StructField("tagVariantId", StringType(), False), StructField( From df45a6c9a7b925745f919a42f03f11a4c6eaaab5 Mon Sep 17 00:00:00 2001 From: Daniel Suveges Date: Tue, 24 Sep 2024 15:58:42 +0100 Subject: [PATCH 2/2] feat: adding window based clumping to StudyLocus (#779) * feat: adding window based clumping to locus * fix: reverting some changes * chore: pre-commit auto fixes [...] * fix: fixing probem introduced by merge conflict * fix: addressing review comment --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- src/gentropy/dataset/study_locus.py | 42 +++++++----- src/gentropy/dataset/summary_statistics.py | 7 +- src/gentropy/gwas_catalog_ingestion.py | 10 ++- src/gentropy/method/window_based_clumping.py | 51 +++++++++------ tests/gentropy/dataset/test_study_locus.py | 68 ++++++++++++++++++++ 5 files changed, 138 insertions(+), 40 deletions(-) diff --git a/src/gentropy/dataset/study_locus.py b/src/gentropy/dataset/study_locus.py index 57482fda8..2385df984 100644 --- a/src/gentropy/dataset/study_locus.py +++ b/src/gentropy/dataset/study_locus.py @@ -17,6 +17,7 @@ order_array_of_structs_by_field, ) from gentropy.common.utils import get_logsum +from gentropy.config import WindowBasedClumpingStepConfig from gentropy.dataset.dataset import Dataset from gentropy.dataset.study_locus_overlap import StudyLocusOverlap from gentropy.dataset.variant_index import VariantIndex @@ -45,7 +46,8 @@ class StudyLocusQualityCheck(Enum): PALINDROMIC_ALLELE_FLAG (str): Alleles are palindromic - cannot harmonize AMBIGUOUS_STUDY (str): Association with ambiguous study UNRESOLVED_LD (str): Variant not found in LD reference - LD_CLUMPED (str): Explained by a more significant variant in high LD (clumped) + LD_CLUMPED (str): Explained by a more significant variant in high LD + WINDOW_CLUMPED (str): Explained by a more significant variant in the same window NO_POPULATION (str): Study does not have population annotation to resolve LD NOT_QUALIFYING_LD_BLOCK (str): LD block does not contain variants at the required R^2 threshold FAILED_STUDY (str): Flagging study loci if the study has failed QC @@ -65,7 +67,8 @@ class StudyLocusQualityCheck(Enum): PALINDROMIC_ALLELE_FLAG = "Palindrome alleles - cannot harmonize" AMBIGUOUS_STUDY = "Association with ambiguous study" UNRESOLVED_LD = "Variant not found in LD reference" - LD_CLUMPED = "Explained by a more significant variant in high LD (clumped)" + LD_CLUMPED = "Explained by a more significant variant in high LD" + WINDOW_CLUMPED = "Explained by a more significant variant in the same window" NO_POPULATION = "Study does not have population annotation to resolve LD" NOT_QUALIFYING_LD_BLOCK = ( "LD block does not contain variants at the required R^2 threshold" @@ -168,9 +171,9 @@ def annotate_study_type(self: StudyLocus, study_index: StudyIndex) -> StudyLocus """ return StudyLocus( _df=( - self.df - .drop("studyType") - .join(study_index.study_type_lut(), on="studyId", how="left") + self.df.drop("studyType").join( + study_index.study_type_lut(), on="studyId", how="left" + ) ), _schema=self.get_schema(), ) @@ -524,9 +527,7 @@ def get_QC_mappings(cls: type[StudyLocus]) -> dict[str, str]: """ return {member.name: member.value for member in StudyLocusQualityCheck} - def filter_by_study_type( - self: StudyLocus, study_type: str - ) -> StudyLocus: + def filter_by_study_type(self: StudyLocus, study_type: str) -> StudyLocus: """Creates a new StudyLocus dataset filtered by study type. Args: @@ -542,11 +543,7 @@ def filter_by_study_type( raise ValueError( f"Study type {study_type} not supported. Supported types are: gwas, eqtl, pqtl, sqtl." ) - new_df = ( - self.df - .filter(f.col("studyType") == study_type) - .drop("studyType") - ) + new_df = self.df.filter(f.col("studyType") == study_type).drop("studyType") return StudyLocus( _df=new_df, _schema=self._schema, @@ -609,8 +606,7 @@ def find_overlaps( StudyLocusOverlap: Pairs of overlapping study-locus with aligned tags. """ loci_to_overlap = ( - self.df - .filter(f.col("studyType").isNotNull()) + self.df.filter(f.col("studyType").isNotNull()) .withColumn("locus", f.explode("locus")) .select( "studyLocusId", @@ -1051,3 +1047,19 @@ def annotate_locus_statistics_boundaries( ) return self + + def window_based_clumping( + self: StudyLocus, + window_size: int = WindowBasedClumpingStepConfig().distance, + ) -> StudyLocus: + """Clump study locus by window size. + + Args: + window_size (int): Window size for clumping. + + Returns: + StudyLocus: Clumped study locus, where clumped associations are flagged. + """ + from gentropy.method.window_based_clumping import WindowBasedClumping + + return WindowBasedClumping.clump(self, window_size) diff --git a/src/gentropy/dataset/summary_statistics.py b/src/gentropy/dataset/summary_statistics.py index d0875fe85..25edbeca7 100644 --- a/src/gentropy/dataset/summary_statistics.py +++ b/src/gentropy/dataset/summary_statistics.py @@ -77,10 +77,11 @@ def window_based_clumping( from gentropy.method.window_based_clumping import WindowBasedClumping return WindowBasedClumping.clump( - self, + # Before clumping, we filter the summary statistics by p-value: + self.pvalue_filter(gwas_significance), distance=distance, - gwas_significance=gwas_significance, - ) + # After applying the clumping, we filter the clumped loci by the flag: + ).valid_rows(["WINDOW_CLUMPED"]) def locus_breaker_clumping( self: SummaryStatistics, diff --git a/src/gentropy/gwas_catalog_ingestion.py b/src/gentropy/gwas_catalog_ingestion.py index 725f1ca4d..5dab5bf16 100644 --- a/src/gentropy/gwas_catalog_ingestion.py +++ b/src/gentropy/gwas_catalog_ingestion.py @@ -3,6 +3,7 @@ from __future__ import annotations from gentropy.common.session import Session +from gentropy.config import WindowBasedClumpingStepConfig from gentropy.dataset.variant_index import VariantIndex from gentropy.datasource.gwas_catalog.associations import ( GWASCatalogCuratedAssociationsParser, @@ -30,6 +31,7 @@ def __init__( gnomad_variant_path: str, catalog_studies_out: str, catalog_associations_out: str, + distance: int = WindowBasedClumpingStepConfig().distance, gwas_catalog_study_curation_file: str | None = None, inclusion_list_path: str | None = None, ) -> None: @@ -44,6 +46,7 @@ def __init__( gnomad_variant_path (str): Path to GnomAD variants. catalog_studies_out (str): Output GWAS catalog studies path. catalog_associations_out (str): Output GWAS catalog associations path. + distance (int): Distance, within which tagging variants are collected around the semi-index. gwas_catalog_study_curation_file (str | None): file of the curation table. Optional. inclusion_list_path (str | None): optional inclusion list (parquet) """ @@ -86,4 +89,9 @@ def __init__( # Load study_index.df.write.mode(session.write_mode).parquet(catalog_studies_out) - study_locus.df.write.mode(session.write_mode).parquet(catalog_associations_out) + + ( + study_locus.window_based_clumping(distance) + .df.write.mode(session.write_mode) + .parquet(catalog_associations_out) + ) diff --git a/src/gentropy/method/window_based_clumping.py b/src/gentropy/method/window_based_clumping.py index 629fe627e..9ef747abf 100644 --- a/src/gentropy/method/window_based_clumping.py +++ b/src/gentropy/method/window_based_clumping.py @@ -12,7 +12,7 @@ from pyspark.sql.window import Window from gentropy.config import WindowBasedClumpingStepConfig -from gentropy.dataset.study_locus import StudyLocus +from gentropy.dataset.study_locus import StudyLocus, StudyLocusQualityCheck if TYPE_CHECKING: from numpy.typing import NDArray @@ -154,22 +154,38 @@ def _prune_peak(position: NDArray[np.float64], window_size: int) -> DenseVector: @staticmethod def clump( - summary_statistics: SummaryStatistics, + unclumped_associations: SummaryStatistics | StudyLocus, distance: int = WindowBasedClumpingStepConfig().distance, - gwas_significance: float = WindowBasedClumpingStepConfig().gwas_significance, ) -> StudyLocus: - """Clump significant signals from summary statistics based on window. + """Clump single point associations from summary statistics or study locus dataset based on window. Args: - summary_statistics (SummaryStatistics): Summary statistics to be used for clumping. + unclumped_associations (SummaryStatistics | StudyLocus): Input dataset to be used for clumping. Assumes that the input dataset is already filtered for significant variants. distance (int): Distance in base pairs to be used for clumping. Defaults to 500_000. - gwas_significance (float): GWAS significance threshold. Defaults to 5e-8. Returns: - StudyLocus: clumped summary statistics (without locus collection) - - Check WindowBasedClumpingStepConfig object for default values + StudyLocus: clumped associations, where the clumped variants are flagged. """ + # Quality check expression that flags variants that are not considered lead variant: + qc_check = f.col("semiIndices")[f.col("pvRank") - 1] <= 0 + + # The quality control expression will depend on the input dataset, as the column might be already present: + qc_expression = ( + # When the column is already present and the condition is met, the value is appended to the array, otherwise keep as is: + f.when( + qc_check, + f.array_union( + f.col("qualityControls"), + f.array(f.lit(StudyLocusQualityCheck.WINDOW_CLUMPED.value)), + ), + ).otherwise(f.col("qualityControls")) + if "qualityControls" in unclumped_associations.df.columns + # If column is not there yet, initialize it with the flag value, or an empty array: + else f.when( + qc_check, f.array(f.lit(StudyLocusQualityCheck.WINDOW_CLUMPED.value)) + ).otherwise(f.array().cast(t.ArrayType(t.StringType()))) + ) + # Create window for locus clusters # - variants where the distance between subsequent variants is below the defined threshold. # - Variants are sorted by descending significance @@ -179,11 +195,8 @@ def clump( return StudyLocus( _df=( - summary_statistics - # Dropping snps below significance - all subsequent steps are done on significant variants: - .pvalue_filter(gwas_significance) - .df - # Clustering summary variants for efficient windowing (complexity reduction): + unclumped_associations.df + # Clustering variants for efficient windowing (complexity reduction): .withColumn( "cluster_id", WindowBasedClumping._cluster_peaks( @@ -207,7 +220,7 @@ def clump( ), ).otherwise(f.array()), ) - # Get semi indices only ONCE per cluster: + # Collect top loci per cluster: .withColumn( "semiIndices", f.when( @@ -230,9 +243,6 @@ def clump( ), ).otherwise(f.col("semiIndices")), ) - # Keeping semi indices only: - .filter(f.col("semiIndices")[f.col("pvRank") - 1] > 0) - .drop("pvRank", "collectedPositions", "semiIndices", "cluster_id") # Adding study-locus id: .withColumn( "studyLocusId", @@ -241,9 +251,8 @@ def clump( ), ) # Initialize QC column as array of strings: - .withColumn( - "qualityControls", f.array().cast(t.ArrayType(t.StringType())) - ) + .withColumn("qualityControls", qc_expression) + .drop("pvRank", "collectedPositions", "semiIndices", "cluster_id") ), _schema=StudyLocus.get_schema(), ) diff --git a/tests/gentropy/dataset/test_study_locus.py b/tests/gentropy/dataset/test_study_locus.py index ffb210d09..51fc2ed92 100644 --- a/tests/gentropy/dataset/test_study_locus.py +++ b/tests/gentropy/dataset/test_study_locus.py @@ -749,6 +749,74 @@ def test_study_validation_correctness(self: TestStudyLocusValidation) -> None: ) == 1 +class TestStudyLocusWindowClumping: + """Testing window-based clumping on study locus.""" + + TEST_DATASET = [ + ("s1", "c1", 1, -1), + ("s1", "c1", 2, -2), + ("s1", "c1", 3, -3), + ("s2", "c2", 2, -2), + ("s3", "c2", 2, -2), + ] + + TEST_SCHEMA = t.StructType( + [ + t.StructField("studyId", t.StringType(), False), + t.StructField("chromosome", t.StringType(), False), + t.StructField("position", t.IntegerType(), False), + t.StructField("pValueExponent", t.IntegerType(), False), + ] + ) + + @pytest.fixture(autouse=True) + def _setup(self: TestStudyLocusWindowClumping, spark: SparkSession) -> None: + """Setup study locus for testing.""" + self.study_locus = StudyLocus( + _df=( + spark.createDataFrame( + self.TEST_DATASET, schema=self.TEST_SCHEMA + ).withColumns( + { + "studyLocusId": f.monotonically_increasing_id().cast( + t.LongType() + ), + "pValueMantissa": f.lit(1).cast(t.FloatType()), + "variantId": f.concat( + f.lit("v"), + f.monotonically_increasing_id().cast(t.StringType()), + ), + } + ) + ), + _schema=StudyLocus.get_schema(), + ) + + def test_clump_return_type(self: TestStudyLocusWindowClumping) -> None: + """Testing if the clumping returns the right type.""" + assert isinstance(self.study_locus.window_based_clumping(3), StudyLocus) + + def test_clump_no_data_loss(self: TestStudyLocusWindowClumping) -> None: + """Testing if the clumping returns same number of rows.""" + assert ( + self.study_locus.window_based_clumping(3).df.count() + == self.study_locus.df.count() + ) + + def test_correct_flag(self: TestStudyLocusWindowClumping) -> None: + """Testing if the clumping flags are for variants.""" + assert ( + self.study_locus.window_based_clumping(3) + .df.filter( + f.array_contains( + f.col("qualityControls"), + StudyLocusQualityCheck.WINDOW_CLUMPED.value, + ) + ) + .count() + ) == 2 + + def test_build_feature_matrix( mock_study_locus: StudyLocus, mock_colocalisation: Colocalisation,