Skip to content

Commit

Permalink
Merge branch 'dev' into alegbe-biosample_index
Browse files Browse the repository at this point in the history
  • Loading branch information
DSuveges committed Sep 24, 2024
2 parents 73b25da + df45a6c commit c9eada2
Show file tree
Hide file tree
Showing 20 changed files with 214 additions and 116 deletions.
6 changes: 6 additions & 0 deletions src/gentropy/assets/schemas/colocalisation.json
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,12 @@
"type": "long",
"metadata": {}
},
{
"name": "rightStudyType",
"nullable": false,
"type": "string",
"metadata": {}
},
{
"name": "chromosome",
"nullable": false,
Expand Down
6 changes: 6 additions & 0 deletions src/gentropy/assets/schemas/study_locus.json
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,12 @@
"nullable": false,
"type": "long"
},
{
"metadata": {},
"name": "studyType",
"nullable": true,
"type": "string"
},
{
"metadata": {},
"name": "variantId",
Expand Down
6 changes: 6 additions & 0 deletions src/gentropy/assets/schemas/study_locus_overlap.json
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,12 @@
"nullable": false,
"type": "long"
},
{
"metadata": {},
"name": "rightStudyType",
"nullable": false,
"type": "string"
},
{
"metadata": {},
"name": "chromosome",
Expand Down
8 changes: 1 addition & 7 deletions src/gentropy/colocalisation.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
from pyspark.sql.functions import col

from gentropy.common.session import Session
from gentropy.dataset.study_index import StudyIndex
from gentropy.dataset.study_locus import CredibleInterval, StudyLocus
from gentropy.method.colocalisation import Coloc

Expand All @@ -23,7 +22,6 @@ def __init__(
self,
session: Session,
credible_set_path: str,
study_index_path: str,
coloc_path: str,
colocalisation_method: str,
) -> None:
Expand All @@ -32,7 +30,6 @@ def __init__(
Args:
session (Session): Session object.
credible_set_path (str): Input credible sets path.
study_index_path (str): Input study index path.
coloc_path (str): Output Colocalisation path.
colocalisation_method (str): Colocalisation method.
"""
Expand All @@ -47,14 +44,11 @@ def __init__(
session, credible_set_path, recursiveFileLookup=True
)
)
si = StudyIndex.from_parquet(
session, study_index_path, recursiveFileLookup=True
)

# Transform
overlaps = credible_set.filter_credible_set(
CredibleInterval.IS95
).find_overlaps(si)
).find_overlaps()
colocalisation_results = colocalisation_class.colocalise(overlaps) # type: ignore

# Load
Expand Down
2 changes: 1 addition & 1 deletion src/gentropy/dataset/colocalisation.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ def extract_maximum_coloc_probability_per_region_and_gene(
self.append_study_metadata(
study_locus,
study_index,
metadata_cols=["studyType", "geneId"],
metadata_cols=["geneId"],
colocalisation_side="right",
)
# it also filters based on method and qtl type
Expand Down
59 changes: 45 additions & 14 deletions src/gentropy/dataset/study_locus.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
order_array_of_structs_by_field,
)
from gentropy.common.utils import get_logsum
from gentropy.config import WindowBasedClumpingStepConfig
from gentropy.dataset.dataset import Dataset
from gentropy.dataset.study_locus_overlap import StudyLocusOverlap
from gentropy.dataset.variant_index import VariantIndex
Expand Down Expand Up @@ -45,7 +46,8 @@ class StudyLocusQualityCheck(Enum):
PALINDROMIC_ALLELE_FLAG (str): Alleles are palindromic - cannot harmonize
AMBIGUOUS_STUDY (str): Association with ambiguous study
UNRESOLVED_LD (str): Variant not found in LD reference
LD_CLUMPED (str): Explained by a more significant variant in high LD (clumped)
LD_CLUMPED (str): Explained by a more significant variant in high LD
WINDOW_CLUMPED (str): Explained by a more significant variant in the same window
NO_POPULATION (str): Study does not have population annotation to resolve LD
NOT_QUALIFYING_LD_BLOCK (str): LD block does not contain variants at the required R^2 threshold
FAILED_STUDY (str): Flagging study loci if the study has failed QC
Expand All @@ -65,7 +67,8 @@ class StudyLocusQualityCheck(Enum):
PALINDROMIC_ALLELE_FLAG = "Palindrome alleles - cannot harmonize"
AMBIGUOUS_STUDY = "Association with ambiguous study"
UNRESOLVED_LD = "Variant not found in LD reference"
LD_CLUMPED = "Explained by a more significant variant in high LD (clumped)"
LD_CLUMPED = "Explained by a more significant variant in high LD"
WINDOW_CLUMPED = "Explained by a more significant variant in the same window"
NO_POPULATION = "Study does not have population annotation to resolve LD"
NOT_QUALIFYING_LD_BLOCK = (
"LD block does not contain variants at the required R^2 threshold"
Expand Down Expand Up @@ -157,6 +160,24 @@ def validate_study(self: StudyLocus, study_index: StudyIndex) -> StudyLocus:
_schema=self.get_schema(),
)

def annotate_study_type(self: StudyLocus, study_index: StudyIndex) -> StudyLocus:
    """Add the `studyType` column to the study locus, resolved from the study index.

    Any pre-existing `studyType` column is discarded first, then the type is
    looked up by `studyId` with a left join so loci whose study is missing
    from the index are kept (with a null study type).

    Args:
        study_index (StudyIndex): Study index providing the study-type lookup table.

    Returns:
        StudyLocus: Study locus annotated with the `studyType` column.
    """
    # Left join preserves loci without a matching study in the index.
    annotated_df = self.df.drop("studyType").join(
        study_index.study_type_lut(), on="studyId", how="left"
    )
    return StudyLocus(
        _df=annotated_df,
        _schema=self.get_schema(),
    )

def validate_variant_identifiers(
self: StudyLocus, variant_index: VariantIndex
) -> StudyLocus:
Expand Down Expand Up @@ -394,6 +415,7 @@ def _align_overlapping_tags(
f.col("chromosome"),
f.col("tagVariantId"),
f.col("studyLocusId").alias("rightStudyLocusId"),
f.col("studyType").alias("rightStudyType"),
*[f.col(col).alias(f"right_{col}") for col in stats_cols],
).join(peak_overlaps, on=["chromosome", "rightStudyLocusId"], how="inner")

Expand All @@ -410,6 +432,7 @@ def _align_overlapping_tags(
).select(
"leftStudyLocusId",
"rightStudyLocusId",
"rightStudyType",
"chromosome",
"tagVariantId",
f.struct(
Expand Down Expand Up @@ -504,14 +527,11 @@ def get_QC_mappings(cls: type[StudyLocus]) -> dict[str, str]:
"""
return {member.name: member.value for member in StudyLocusQualityCheck}

def filter_by_study_type(
self: StudyLocus, study_type: str, study_index: StudyIndex
) -> StudyLocus:
def filter_by_study_type(self: StudyLocus, study_type: str) -> StudyLocus:
"""Creates a new StudyLocus dataset filtered by study type.
Args:
study_type (str): Study type to filter for. Can be one of `gwas`, `eqtl`, `pqtl`, `sqtl`.
study_index (StudyIndex): Study index to resolve study types.
Returns:
StudyLocus: Filtered study-locus dataset.
Expand All @@ -523,11 +543,7 @@ def filter_by_study_type(
raise ValueError(
f"Study type {study_type} not supported. Supported types are: gwas, eqtl, pqtl, sqtl."
)
new_df = (
self.df.join(study_index.study_type_lut(), on="studyId", how="inner")
.filter(f.col("studyType") == study_type)
.drop("studyType")
)
new_df = self.df.filter(f.col("studyType") == study_type).drop("studyType")
return StudyLocus(
_df=new_df,
_schema=self._schema,
Expand Down Expand Up @@ -576,22 +592,21 @@ def filter_ld_set(ld_set: Column, r2_threshold: float) -> Column:
)

def find_overlaps(
self: StudyLocus, study_index: StudyIndex, intra_study_overlap: bool = False
self: StudyLocus, intra_study_overlap: bool = False
) -> StudyLocusOverlap:
"""Calculate overlapping study-locus.
Find overlapping study-locus that share at least one tagging variant. All GWAS-GWAS and all GWAS-Molecular traits are computed with the Molecular traits always
appearing on the right side.
Args:
study_index (StudyIndex): Study index to resolve study types.
intra_study_overlap (bool): If True, finds intra-study overlaps for credible set deduplication. Default is False.
Returns:
StudyLocusOverlap: Pairs of overlapping study-locus with aligned tags.
"""
loci_to_overlap = (
self.df.join(study_index.study_type_lut(), on="studyId", how="inner")
self.df.filter(f.col("studyType").isNotNull())
.withColumn("locus", f.explode("locus"))
.select(
"studyLocusId",
Expand Down Expand Up @@ -1032,3 +1047,19 @@ def annotate_locus_statistics_boundaries(
)

return self

def window_based_clumping(
    self: StudyLocus,
    window_size: int = WindowBasedClumpingStepConfig().distance,
) -> StudyLocus:
    """Clump the study locus using window-based clumping.

    Args:
        window_size (int): Window size for clumping.

    Returns:
        StudyLocus: Clumped study locus, where clumped associations are flagged.
    """
    # Imported locally to avoid a circular import at module load time.
    from gentropy.method.window_based_clumping import WindowBasedClumping

    clumped = WindowBasedClumping.clump(self, window_size)
    return clumped
7 changes: 3 additions & 4 deletions src/gentropy/dataset/study_locus_overlap.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
if TYPE_CHECKING:
from pyspark.sql.types import StructType

from gentropy.dataset.study_index import StudyIndex
from gentropy.dataset.study_locus import StudyLocus


Expand All @@ -36,18 +35,17 @@ def get_schema(cls: type[StudyLocusOverlap]) -> StructType:

@classmethod
def from_associations(
cls: type[StudyLocusOverlap], study_locus: StudyLocus, study_index: StudyIndex
cls: type[StudyLocusOverlap], study_locus: StudyLocus
) -> StudyLocusOverlap:
"""Find the overlapping signals in a particular set of associations (StudyLocus dataset).
Args:
study_locus (StudyLocus): Study-locus associations to find the overlapping signals
study_index (StudyIndex): Study index to find the overlapping signals
Returns:
StudyLocusOverlap: Study-locus overlap dataset
"""
return study_locus.find_overlaps(study_index)
return study_locus.find_overlaps()

def _convert_to_square_matrix(self: StudyLocusOverlap) -> StudyLocusOverlap:
"""Convert the dataset to a square matrix.
Expand All @@ -60,6 +58,7 @@ def _convert_to_square_matrix(self: StudyLocusOverlap) -> StudyLocusOverlap:
self.df.selectExpr(
"leftStudyLocusId as rightStudyLocusId",
"rightStudyLocusId as leftStudyLocusId",
"rightStudyType",
"tagVariantId",
)
).distinct(),
Expand Down
7 changes: 4 additions & 3 deletions src/gentropy/dataset/summary_statistics.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,10 +77,11 @@ def window_based_clumping(
from gentropy.method.window_based_clumping import WindowBasedClumping

return WindowBasedClumping.clump(
self,
# Before clumping, we filter the summary statistics by p-value:
self.pvalue_filter(gwas_significance),
distance=distance,
gwas_significance=gwas_significance,
)
# After applying the clumping, we filter the clumped loci by the flag:
).valid_rows(["WINDOW_CLUMPED"])

def locus_breaker_clumping(
self: SummaryStatistics,
Expand Down
10 changes: 9 additions & 1 deletion src/gentropy/gwas_catalog_ingestion.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from __future__ import annotations

from gentropy.common.session import Session
from gentropy.config import WindowBasedClumpingStepConfig
from gentropy.dataset.variant_index import VariantIndex
from gentropy.datasource.gwas_catalog.associations import (
GWASCatalogCuratedAssociationsParser,
Expand Down Expand Up @@ -30,6 +31,7 @@ def __init__(
gnomad_variant_path: str,
catalog_studies_out: str,
catalog_associations_out: str,
distance: int = WindowBasedClumpingStepConfig().distance,
gwas_catalog_study_curation_file: str | None = None,
inclusion_list_path: str | None = None,
) -> None:
Expand All @@ -44,6 +46,7 @@ def __init__(
gnomad_variant_path (str): Path to GnomAD variants.
catalog_studies_out (str): Output GWAS catalog studies path.
catalog_associations_out (str): Output GWAS catalog associations path.
distance (int): Distance within which tagging variants are collected around the semi-indices.
gwas_catalog_study_curation_file (str | None): file of the curation table. Optional.
inclusion_list_path (str | None): optional inclusion list (parquet)
"""
Expand Down Expand Up @@ -86,4 +89,9 @@ def __init__(

# Load
study_index.df.write.mode(session.write_mode).parquet(catalog_studies_out)
study_locus.df.write.mode(session.write_mode).parquet(catalog_associations_out)

(
study_locus.window_based_clumping(distance)
.df.write.mode(session.write_mode)
.parquet(catalog_associations_out)
)
4 changes: 2 additions & 2 deletions src/gentropy/l2g.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,7 +204,7 @@ def _generate_feature_matrix(self, write_feature_matrix: bool) -> L2GFeatureMatr
ValueError: If write_feature_matrix is set to True but a path is not provided.
ValueError: If dependencies to build features are not set.
"""
if self.gs_curation and self.interactions and self.v2g and self.studies:
if self.gs_curation and self.interactions and self.v2g:
study_locus_overlap = StudyLocus(
_df=self.credible_set.df.join(
f.broadcast(
Expand All @@ -225,7 +225,7 @@ def _generate_feature_matrix(self, write_feature_matrix: bool) -> L2GFeatureMatr
"inner",
),
_schema=StudyLocus.get_schema(),
).find_overlaps(self.studies)
).find_overlaps()

gold_standards = L2GGoldStandard.from_otg_curation(
gold_standard_curation=self.gs_curation,
Expand Down
4 changes: 2 additions & 2 deletions src/gentropy/method/colocalisation.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ def colocalise(
f.col("statistics.right_posteriorProbability"),
),
)
.groupBy("leftStudyLocusId", "rightStudyLocusId", "chromosome")
.groupBy("leftStudyLocusId", "rightStudyLocusId", "rightStudyType", "chromosome")
.agg(
f.count("*").alias("numberColocalisingVariants"),
f.sum(f.col("clpp")).alias("clpp"),
Expand Down Expand Up @@ -168,7 +168,7 @@ def colocalise(
f.col("left_logBF") + f.col("right_logBF"),
)
# Group by overlapping peak and generating dense vectors of log_BF:
.groupBy("chromosome", "leftStudyLocusId", "rightStudyLocusId")
.groupBy("chromosome", "leftStudyLocusId", "rightStudyLocusId", "rightStudyType")
.agg(
f.count("*").alias("numberColocalisingVariants"),
fml.array_to_vector(f.collect_list(f.col("left_logBF"))).alias(
Expand Down
Loading

0 comments on commit c9eada2

Please sign in to comment.