feat: the FineMapper function for one locus (#564)
* test: adding test for pairwiseLD

* feat: adding ld matrix extraction

* chore: merge from dev

* feat: index and block matrix extraction for studyLocus

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* chore: updating some test files to gentropy

* chore: updating tests

* chore: updating pairwise_ld_schema for tests

* chore: updating pairwise_ld tests

* chore: fix ld_pairwise tests

* chore: fix pairwise_ld tests

* chore: fix tests

* chore: fix tests

* chore: fixing typing for tests

* chore: fixing tests

* chore: fixing ld tests

* Update src/gentropy/dataset/study_index.py

Co-authored-by: Daniel Suveges <[email protected]>

* feat: moving functions to their appropriate locations and improving logic

* fix: optimise conversion of BM to NumPy

* feat: updating get_locus_index to allow for just chromosome and position inputs

* fix: suggested changes

* Update study_index.py

* fix: changes to datasource/gnomad/ld.py

* feat: add the draft of finemapper function

* feat: updated method for ld_index extraction

* fix: changing input

* fix: adding filter by studyId

* fix: sorting idx in hail

* feat: add fine-mapping of one study_locus_row

* fix: small fix in majpop

* fix: small fixes in function

* fix: using more spark before converting to pandas

* fix: fix in test

---------

Co-authored-by: Daniel Suveges <[email protected]>
Co-authored-by: Daniel Considine <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Daniel-Considine <[email protected]>
5 people committed Apr 5, 2024
1 parent 56067e7 commit e1d20f3
Showing 2 changed files with 135 additions and 9 deletions.
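At a glance, the new susie_finemapper_one_studylocus_row entry point takes a single study-locus row, pulls the matching summary statistics and the GnomAD LD matrix for a window around the locus, runs SuSiE-inf, and converts the result into a StudyLocus of credible sets. A minimal usage sketch follows; the session, summary statistics, study index, and locus row are assumed to be already loaded, and the names gwas, study_index, study_locus, and some_locus_id are illustrative, not part of this commit:

import pyspark.sql.functions as f

from gentropy.common.session import Session
from gentropy.susie_finemapper import SusieFineMapperStep

session = Session()  # assumed default local Spark session
# gwas (SummaryStatistics), study_index (StudyIndex) and study_locus (StudyLocus)
# are assumed to be loaded beforehand
study_locus_row = study_locus.df.filter(
    f.col("studyLocusId") == some_locus_id  # hypothetical locus identifier
).collect()[0]

credible_sets = SusieFineMapperStep.susie_finemapper_one_studylocus_row(
    GWAS=gwas,
    session=session,
    study_locus_row=study_locus_row,
    study_index=study_index,
    window=1_000_000,  # 1 Mb window around the locus position
    L=10,  # maximum number of causal variants
)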
139 changes: 132 additions & 7 deletions src/gentropy/susie_finemapper.py
@@ -5,11 +5,17 @@
from typing import Any

import numpy as np
import pandas as pd
import pyspark.sql.functions as f
-from pyspark.sql import DataFrame, Window
+from pyspark.sql import DataFrame, Row, Window
from pyspark.sql.types import IntegerType, StringType, StructField, StructType

from gentropy.common.session import Session
from gentropy.dataset.study_index import StudyIndex
from gentropy.dataset.study_locus import StudyLocus
from gentropy.dataset.summary_statistics import SummaryStatistics
from gentropy.datasource.gnomad.ld import GnomADLDMatrix
from gentropy.method.susie_inf import SUSIE_inf


class SusieFineMapperStep:
@@ -19,12 +25,129 @@ class SusieFineMapperStep:
    In the future this step will be refactored and moved to the methods module.
    """

    @staticmethod
    def susie_finemapper_one_studylocus_row(
        GWAS: SummaryStatistics,
        session: Session,
        study_locus_row: Row,
        study_index: StudyIndex,
        window: int = 1_000_000,
        L: int = 10,
    ) -> StudyLocus:
"""Susie fine-mapper function that uses Summary Statstics, chromosome and position as inputs.
Args:
GWAS (SummaryStatistics): GWAS summary statistics
session (Session): Spark session
study_locus_row (Row): StudyLocus row
study_index (StudyIndex): StudyIndex object
window (int): window size for fine-mapping
L (int): number of causal variants
Returns:
StudyLocus: StudyLocus object with fine-mapped credible sets
"""
        # PLEASE DO NOT REMOVE THIS LINE
        pd.DataFrame.iteritems = pd.DataFrame.items
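        # Compatibility shim: pandas >= 2.0 removed DataFrame.iteritems, while
        # some PySpark versions still call it when converting to pandas, so the
        # toPandas() call below would fail without this re-alias.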

        chromosome = study_locus_row["chromosome"]
        position = study_locus_row["position"]
        studyId = study_locus_row["studyId"]

        study_index_df = study_index._df
        study_index_df = study_index_df.filter(f.col("studyId") == studyId)
        major_population = study_index_df.select(
            "studyId",
            f.array_max(f.col("ldPopulationStructure"))
            .getItem("ldPopulation")
            .alias("majorPopulation"),
        ).collect()[0]["majorPopulation"]

        region = (
            chromosome
            + ":"
            + str(int(position - window / 2))
            + "-"
            + str(int(position + window / 2))
        )
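        # e.g. chromosome "1" and position 1_000_000 with the default 1 Mb
        # window yield the region string "1:500000-1500000"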

        gwas_df = (
            GWAS.df.withColumn("z", f.col("beta") / f.col("standardError"))
            .withColumn("chromosome", f.split(f.col("variantId"), "_")[0])
            .withColumn("position", f.split(f.col("variantId"), "_")[1])
            .filter(f.col("studyId") == studyId)
            .filter(f.col("z").isNotNull())
        )

        ld_index = (
            GnomADLDMatrix()
            .get_locus_index(
                study_locus_row=study_locus_row,
                window_size=window,
                major_population=major_population,
            )
            .withColumn(
                "variantId",
                f.concat(
                    f.lit(chromosome),
                    f.lit("_"),
                    f.col("`locus.position`"),
                    f.lit("_"),
                    f.col("alleles").getItem(0),
                    f.lit("_"),
                    f.col("alleles").getItem(1),
                ).cast("string"),
            )
        )
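        # The assembled variantId follows the same chromosome_position_ref_alt
        # layout that the summary statistics are parsed with above, so the join
        # below can match on it.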

        # Keep only the variants present in the LD matrix; the rest are not needed
        gwas_index = gwas_df.join(
            ld_index.select("variantId", "alleles", "idx"), on="variantId"
        ).sort("idx")

        gnomad_ld = GnomADLDMatrix.get_numpy_matrix(
            gwas_index, gnomad_ancestry=major_population
        )

        pd_df = gwas_index.toPandas()
        z_to_fm = np.array(pd_df["z"])
        ld_to_fm = gnomad_ld

        susie_output = SUSIE_inf.susie_inf(z=z_to_fm, LD=ld_to_fm, L=L)
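        # SuSiE-inf expects the z-score vector and the LD matrix to share one
        # variant order; both are built from gwas_index, which is sorted by idx.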

        schema = StructType(
            [
                StructField("variantId", StringType(), True),
                StructField("chromosome", StringType(), True),
                StructField("position", IntegerType(), True),
            ]
        )
        pd_df["position"] = pd_df["position"].astype(int)
        variant_index = session.spark.createDataFrame(
            pd_df[
                [
                    "variantId",
                    "chromosome",
                    "position",
                ]
            ],
            schema=schema,
        )

        return SusieFineMapperStep.susie_inf_to_studylocus(
            susie_output=susie_output,
            session=session,
            studyId=studyId,
            region=region,
            variant_index=variant_index,
        )

    @staticmethod
    def susie_inf_to_studylocus(
        susie_output: dict[str, Any],
        session: Session,
-        _studyId: str,
-        _region: str,
+        studyId: str,
+        region: str,
        variant_index: DataFrame,
        cs_lbf_thr: float = 2,
    ) -> StudyLocus:
@@ -33,8 +156,8 @@ def susie_inf_to_studylocus(
        Args:
            susie_output (dict[str, Any]): SuSiE-inf output dictionary
            session (Session): Spark session
-            _studyId (str): study ID
-            _region (str): region
+            studyId (str): study ID
+            region (str): region
            variant_index (DataFrame): DataFrame with variant information
            cs_lbf_thr (float): credible set logBF threshold, default is 2
@@ -44,6 +167,7 @@
        variants = np.array(
            [row["variantId"] for row in variant_index.select("variantId").collect()]
        ).reshape(-1, 1)

        PIPs = susie_output["PIP"]
        lbfs = susie_output["lbf_variable"]
        mu = susie_output["mu"]
@@ -74,6 +198,7 @@
win = Window.rowsBetween(
Window.unboundedPreceding, Window.unboundedFollowing
)

cred_set = (
session.spark.createDataFrame(
cred_set.tolist(),
@@ -104,8 +229,8 @@
.limit(1)
.withColumns(
{
"studyId": f.lit(_studyId),
"region": f.lit(_region),
"studyId": f.lit(studyId),
"region": f.lit(region),
"credibleSetIndex": f.lit(counter),
"credibleSetlog10BF": f.lit(cs_lbf_value * 0.4342944819),
"finemappingMethod": f.lit("SuSiE-inf"),
5 changes: 3 additions & 2 deletions tests/gentropy/method/test_susie_inf.py
@@ -68,11 +68,12 @@ def test_SUSIE_inf_convert_to_study_locus(
est_tausq=False,
)
gwas_df = sample_summary_statistics._df.limit(21)

L1 = SusieFineMapperStep.susie_inf_to_studylocus(
susie_output=susie_output,
session=session,
_studyId="sample_id",
_region="sample_region",
studyId="sample_id",
region="sample_region",
variant_index=gwas_df,
cs_lbf_thr=2,
)
