feat: the FineMapper function for one locus (#564)
* test: adding test for pairwiseLD

* feat: adding ld matrix extraction

* chore: merge from dev

* feat: index and block matrix extraction for studyLocus

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* chore: updating some test files to gentropy

* chore: updating tests

* chore: updating pairwise_ld_schema for tests

* chore: updating pairwise_ld tests

* chore: fix ld_pairwise tests

* chore: fix pairwise_ld tests

* chore: fix tests

* chore: fix tests

* chore: fixing typing for tests

* chore: fixing tests

* chore: fixing ld tests

* Update src/gentropy/dataset/study_index.py

Co-authored-by: Daniel Suveges <[email protected]>

* feat: moving functions to their appropriate locations and improving logic

* fix: optimise conversion of BM to NumPy

* feat: updating get_locus_index to allow for just chromosome and position inputs

* fix: suggested changes

* Update study_index.py

* fix: changes to datasource/gnomad/ld.py

* feat: add the draft of finemapper function

* feat: updated method for ld_index extraction

* fix: changing input

* fix: adding filter by studyId

* fix: sorting idx in hail

* feat: add fine-mapping of one study_locus_row

* fix: small fix in majpop

* fix: small fixes in function

* fix: using more spark before converting to pandas

* fix: fix in test

---------

Co-authored-by: Daniel Suveges <[email protected]>
Co-authored-by: Daniel Considine <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Daniel-Considine <[email protected]>
5 people committed Apr 5, 2024
1 parent 56067e7 commit e1d20f3
Showing 2 changed files with 135 additions and 9 deletions.
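At a glance, the new susie_finemapper_one_studylocus_row entry point takes a single study-locus row, pulls the matching summary statistics and the GnomAD LD matrix for a window around the locus, runs SuSiE-inf, and converts the result into a StudyLocus of credible sets. A minimal usage sketch follows; the session, summary statistics, study index, and locus row are assumed to be already loaded, and the names gwas, study_index, study_locus, and some_locus_id are illustrative, not part of this commit:

import pyspark.sql.functions as f

from gentropy.common.session import Session
from gentropy.susie_finemapper import SusieFineMapperStep

session = Session()  # assumed default local Spark session
# gwas (SummaryStatistics), study_index (StudyIndex) and study_locus (StudyLocus)
# are assumed to be loaded beforehand
study_locus_row = study_locus.df.filter(
    f.col("studyLocusId") == some_locus_id  # hypothetical locus identifier
).collect()[0]

credible_sets = SusieFineMapperStep.susie_finemapper_one_studylocus_row(
    GWAS=gwas,
    session=session,
    study_locus_row=study_locus_row,
    study_index=study_index,
    window=1_000_000,  # 1 Mb window around the locus position
    L=10,  # maximum number of causal variants
)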
139 changes: 132 additions & 7 deletions src/gentropy/susie_finemapper.py
@@ -5,11 +5,17 @@
from typing import Any

import numpy as np
import pandas as pd
import pyspark.sql.functions as f
-from pyspark.sql import DataFrame, Window
+from pyspark.sql import DataFrame, Row, Window
from pyspark.sql.types import IntegerType, StringType, StructField, StructType

from gentropy.common.session import Session
from gentropy.dataset.study_index import StudyIndex
from gentropy.dataset.study_locus import StudyLocus
from gentropy.dataset.summary_statistics import SummaryStatistics
from gentropy.datasource.gnomad.ld import GnomADLDMatrix
from gentropy.method.susie_inf import SUSIE_inf


class SusieFineMapperStep:
@@ -19,12 +25,129 @@ class SusieFineMapperStep:
    In the future this step will be refactored and moved to the methods module.
    """

    @staticmethod
    def susie_finemapper_one_studylocus_row(
        GWAS: SummaryStatistics,
        session: Session,
        study_locus_row: Row,
        study_index: StudyIndex,
        window: int = 1_000_000,
        L: int = 10,
    ) -> StudyLocus:
"""Susie fine-mapper function that uses Summary Statstics, chromosome and position as inputs.
Args:
GWAS (SummaryStatistics): GWAS summary statistics
session (Session): Spark session
study_locus_row (Row): StudyLocus row
study_index (StudyIndex): StudyIndex object
window (int): window size for fine-mapping
L (int): number of causal variants
Returns:
StudyLocus: StudyLocus object with fine-mapped credible sets
"""
        # PLEASE DO NOT REMOVE THIS LINE
        pd.DataFrame.iteritems = pd.DataFrame.items
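        # Compatibility shim: pandas >= 2.0 removed DataFrame.iteritems, while
        # some PySpark versions still call it when converting to pandas, so the
        # toPandas() call below would fail without this re-alias.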

        chromosome = study_locus_row["chromosome"]
        position = study_locus_row["position"]
        studyId = study_locus_row["studyId"]

        study_index_df = study_index._df
        study_index_df = study_index_df.filter(f.col("studyId") == studyId)
        major_population = study_index_df.select(
            "studyId",
            f.array_max(f.col("ldPopulationStructure"))
            .getItem("ldPopulation")
            .alias("majorPopulation"),
        ).collect()[0]["majorPopulation"]

        region = (
            chromosome
            + ":"
            + str(int(position - window / 2))
            + "-"
            + str(int(position + window / 2))
        )
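        # e.g. chromosome "1" and position 1_000_000 with the default 1 Mb
        # window yield the region string "1:500000-1500000"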

        gwas_df = (
            GWAS.df.withColumn("z", f.col("beta") / f.col("standardError"))
            .withColumn("chromosome", f.split(f.col("variantId"), "_")[0])
            .withColumn("position", f.split(f.col("variantId"), "_")[1])
            .filter(f.col("studyId") == studyId)
            .filter(f.col("z").isNotNull())
        )

        ld_index = (
            GnomADLDMatrix()
            .get_locus_index(
                study_locus_row=study_locus_row,
                window_size=window,
                major_population=major_population,
            )
            .withColumn(
                "variantId",
                f.concat(
                    f.lit(chromosome),
                    f.lit("_"),
                    f.col("`locus.position`"),
                    f.lit("_"),
                    f.col("alleles").getItem(0),
                    f.lit("_"),
                    f.col("alleles").getItem(1),
                ).cast("string"),
            )
        )
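        # The assembled variantId follows the same chromosome_position_ref_alt
        # layout that the summary statistics are parsed with above, so the join
        # below can match on it.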

        # Keep only the variants present in the LD matrix; the rest are not needed
        gwas_index = gwas_df.join(
            ld_index.select("variantId", "alleles", "idx"), on="variantId"
        ).sort("idx")

        gnomad_ld = GnomADLDMatrix.get_numpy_matrix(
            gwas_index, gnomad_ancestry=major_population
        )

        pd_df = gwas_index.toPandas()
        z_to_fm = np.array(pd_df["z"])
        ld_to_fm = gnomad_ld

        susie_output = SUSIE_inf.susie_inf(z=z_to_fm, LD=ld_to_fm, L=L)
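        # SuSiE-inf expects the z-score vector and the LD matrix to share one
        # variant order; both are built from gwas_index, which is sorted by idx.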

        schema = StructType(
            [
                StructField("variantId", StringType(), True),
                StructField("chromosome", StringType(), True),
                StructField("position", IntegerType(), True),
            ]
        )
        pd_df["position"] = pd_df["position"].astype(int)
        variant_index = session.spark.createDataFrame(
            pd_df[
                [
                    "variantId",
                    "chromosome",
                    "position",
                ]
            ],
            schema=schema,
        )

        return SusieFineMapperStep.susie_inf_to_studylocus(
            susie_output=susie_output,
            session=session,
            studyId=studyId,
            region=region,
            variant_index=variant_index,
        )

    @staticmethod
    def susie_inf_to_studylocus(
        susie_output: dict[str, Any],
        session: Session,
-        _studyId: str,
-        _region: str,
+        studyId: str,
+        region: str,
        variant_index: DataFrame,
        cs_lbf_thr: float = 2,
    ) -> StudyLocus:
@@ -33,8 +156,8 @@ def susie_inf_to_studylocus(
        Args:
            susie_output (dict[str, Any]): SuSiE-inf output dictionary
            session (Session): Spark session
-            _studyId (str): study ID
-            _region (str): region
+            studyId (str): study ID
+            region (str): region
            variant_index (DataFrame): DataFrame with variant information
            cs_lbf_thr (float): credible set logBF threshold, default is 2
@@ -44,6 +167,7 @@
        variants = np.array(
            [row["variantId"] for row in variant_index.select("variantId").collect()]
        ).reshape(-1, 1)

        PIPs = susie_output["PIP"]
        lbfs = susie_output["lbf_variable"]
        mu = susie_output["mu"]
@@ -74,6 +198,7 @@
win = Window.rowsBetween(
Window.unboundedPreceding, Window.unboundedFollowing
)

cred_set = (
session.spark.createDataFrame(
cred_set.tolist(),
@@ -104,8 +229,8 @@
.limit(1)
.withColumns(
{
"studyId": f.lit(_studyId),
"region": f.lit(_region),
"studyId": f.lit(studyId),
"region": f.lit(region),
"credibleSetIndex": f.lit(counter),
"credibleSetlog10BF": f.lit(cs_lbf_value * 0.4342944819),
"finemappingMethod": f.lit("SuSiE-inf"),
5 changes: 3 additions & 2 deletions tests/gentropy/method/test_susie_inf.py
@@ -68,11 +68,12 @@ def test_SUSIE_inf_convert_to_study_locus(
est_tausq=False,
)
gwas_df = sample_summary_statistics._df.limit(21)

L1 = SusieFineMapperStep.susie_inf_to_studylocus(
susie_output=susie_output,
session=session,
_studyId="sample_id",
_region="sample_region",
studyId="sample_id",
region="sample_region",
variant_index=gwas_df,
cs_lbf_thr=2,
)
