Merge branch 'dev' of https://github.com/opentargets/gentropy into il-optimise-l2g-coloc
ireneisdoomed committed Mar 21, 2024
2 parents c1f49e5 + dee3085 commit 475b016
Showing 12 changed files with 1,259 additions and 219 deletions.
2 changes: 1 addition & 1 deletion docs/src_snippets/howto/python_api/c_applying_methods.py
@@ -23,7 +23,7 @@ def apply_class_method_clumping(summary_stats: SummaryStatistics) -> StudyLocus:
    from gentropy.method.window_based_clumping import WindowBasedClumping

    clumped_summary_statistics = WindowBasedClumping.clump(
-        summary_stats, window_length=500_000
+        summary_stats, distance=250_000
    )
    # --8<-- [end:apply_class_method_clumping]
    return clumped_summary_statistics
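To illustrate the renamed argument, a minimal sketch of the updated call (assuming a `SummaryStatistics` dataset has already been loaded into `summary_stats`; the variable name is illustrative):

```python
# Minimal sketch: the clumping window is now passed as `distance`
# (previously `window_length`). `summary_stats` is assumed to be an
# already-loaded gentropy SummaryStatistics dataset.
from gentropy.method.window_based_clumping import WindowBasedClumping

clumped_study_locus = WindowBasedClumping.clump(
    summary_stats,
    distance=250_000,  # distance in base pairs used for clumping
)
clumped_study_locus.df.show(5)  # the result is a StudyLocus backed by a Spark DataFrame
```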
1,103 changes: 1,103 additions & 0 deletions notebooks/Release_QC_metrics.ipynb

Large diffs are not rendered by default.

70 changes: 0 additions & 70 deletions src/gentropy/clump.py

This file was deleted.

5 changes: 3 additions & 2 deletions src/gentropy/config.py
@@ -321,9 +321,10 @@ class WindowBasedClumpingStep(StepConfig):

    summary_statistics_input_path: str = MISSING
    study_locus_output_path: str = MISSING
    distance: int = 500_000
    collect_locus: bool = False
    collect_locus_distance: int = 500_000
    inclusion_list_path: str | None = None
    locus_collect_distance: str | None = None

    _target_: str = "gentropy.window_based_clumping.WindowBasedClumpingStep"


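For illustration only, a sketch of how the updated step configuration might be instantiated directly in Python; gentropy steps are normally driven through Hydra configs, the paths below are placeholders, and only the field names come from the diff above:

```python
# Hypothetical instantiation of the updated WindowBasedClumpingStep config.
# Paths are placeholders; field names mirror the dataclass above.
from gentropy.config import WindowBasedClumpingStep

step_config = WindowBasedClumpingStep(
    summary_statistics_input_path="gs://my-bucket/sumstats/",        # placeholder path
    study_locus_output_path="gs://my-bucket/study_locus_clumped/",   # placeholder path
    distance=500_000,                # clumping distance in base pairs
    collect_locus=True,              # also collect surrounding variants into `locus`
    collect_locus_distance=500_000,  # window used when collecting the locus
)
print(step_config.distance, step_config.collect_locus)
```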
70 changes: 70 additions & 0 deletions src/gentropy/dataset/study_locus.py
@@ -1,4 +1,5 @@
"""Study locus dataset."""

from __future__ import annotations

from dataclasses import dataclass
@@ -24,6 +25,7 @@

from gentropy.dataset.ld_index import LDIndex
from gentropy.dataset.study_index import StudyIndex
+from gentropy.dataset.summary_statistics import SummaryStatistics


class StudyLocusQualityCheck(Enum):
@@ -427,6 +429,74 @@ def annotate_credible_sets(self: StudyLocus) -> StudyLocus:
        )
        return self

    def annotate_locus_statistics(
        self: StudyLocus,
        summary_statistics: SummaryStatistics,
        collect_locus_distance: int,
    ) -> StudyLocus:
        """Annotates study locus with summary statistics within the specified distance around the position.

        Args:
            summary_statistics (SummaryStatistics): Summary statistics to be used for annotation.
            collect_locus_distance (int): Distance from the variant defining the window for inclusion of variants in the locus.

        Returns:
            StudyLocus: Study locus annotated with summary statistics in the `locus` column. If no statistics are found, the `locus` column will be empty.
        """
        # The clumped dataset will be used several times, so persist it
        self.df.persist()
        # Renaming columns:
        sumstats_renamed = summary_statistics.df.selectExpr(
            *[f"{col} as tag_{col}" for col in summary_statistics.df.columns]
        ).alias("sumstat")

        locus_df = (
            sumstats_renamed
            # Joining the two datasets together:
            .join(
                f.broadcast(
                    self.df.alias("clumped").select(
                        "position", "chromosome", "studyId", "studyLocusId"
                    )
                ),
                on=[
                    (f.col("sumstat.tag_studyId") == f.col("clumped.studyId"))
                    & (f.col("sumstat.tag_chromosome") == f.col("clumped.chromosome"))
                    & (
                        f.col("sumstat.tag_position")
                        >= (f.col("clumped.position") - collect_locus_distance)
                    )
                    & (
                        f.col("sumstat.tag_position")
                        <= (f.col("clumped.position") + collect_locus_distance)
                    )
                ],
                how="inner",
            )
            .withColumn(
                "locus",
                f.struct(
                    f.col("tag_variantId").alias("variantId"),
                    f.col("tag_beta").alias("beta"),
                    f.col("tag_pValueMantissa").alias("pValueMantissa"),
                    f.col("tag_pValueExponent").alias("pValueExponent"),
                    f.col("tag_standardError").alias("standardError"),
                ),
            )
            .groupBy("studyLocusId")
            .agg(
                f.collect_list(f.col("locus")).alias("locus"),
            )
        )

        self.df = self.df.drop("locus").join(
            locus_df,
            on="studyLocusId",
            how="left",
        )

        return self

    def annotate_ld(
        self: StudyLocus, study_index: StudyIndex, ld_index: LDIndex
    ) -> StudyLocus:
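A minimal usage sketch of the new method, assuming `study_locus` and `summary_stats` are already-loaded StudyLocus and SummaryStatistics datasets (the variable names are illustrative):

```python
# Attach every summary-statistics row within +/- collect_locus_distance of each
# clumped variant to the `locus` column of the StudyLocus dataset.
annotated_study_locus = study_locus.annotate_locus_statistics(
    summary_statistics=summary_stats,
    collect_locus_distance=250_000,  # window in base pairs around each clumped position
)
annotated_study_locus.df.select("studyLocusId", "locus").show(5, truncate=False)
```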
31 changes: 10 additions & 21 deletions src/gentropy/dataset/summary_statistics.py
@@ -9,7 +9,6 @@
from gentropy.common.schemas import parse_spark_schema
from gentropy.common.utils import parse_region, split_pvalue
from gentropy.dataset.dataset import Dataset
-from gentropy.method.window_based_clumping import WindowBasedClumping

if TYPE_CHECKING:
    from pyspark.sql.types import StructType
@@ -59,34 +58,24 @@ def window_based_clumping(
        self: SummaryStatistics,
        distance: int = 500_000,
        gwas_significance: float = 5e-8,
-        baseline_significance: float = 0.05,
-        locus_collect_distance: int | None = None,
    ) -> StudyLocus:
-        """Generate study-locus from summary statistics by distance based clumping + collect locus.
+        """Generate study-locus from summary statistics using window-based clumping.

        For more info, see [`WindowBasedClumping`][gentropy.method.window_based_clumping.WindowBasedClumping]

        Args:
            distance (int): Distance in base pairs to be used for clumping. Defaults to 500_000.
            gwas_significance (float, optional): GWAS significance threshold. Defaults to 5e-8.
-            baseline_significance (float, optional): Baseline significance threshold for inclusion in the locus. Defaults to 0.05.
-            locus_collect_distance (int | None): The distance to collect locus around semi-indices. If not provided, locus is not collected.

        Returns:
-            StudyLocus: Clumped study-locus containing variants based on window.
+            StudyLocus: Clumped study-locus optionally containing variants based on window.
        """
-        return (
-            WindowBasedClumping.clump_with_locus(
-                self,
-                window_length=distance,
-                p_value_significance=gwas_significance,
-                p_value_baseline=baseline_significance,
-                locus_window_length=locus_collect_distance,
-            )
-            if locus_collect_distance
-            else WindowBasedClumping.clump(
-                self,
-                window_length=distance,
-                p_value_significance=gwas_significance,
-            )
+        from gentropy.method.window_based_clumping import WindowBasedClumping
+
+        return WindowBasedClumping.clump(
+            self,
+            distance=distance,
+            gwas_significance=gwas_significance,
        )

    def exclude_region(self: SummaryStatistics, region: str) -> SummaryStatistics:
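The locus-collection branch has been removed from this method, and the WindowBasedClumping import is now deferred into the method body, presumably to avoid a circular import between SummaryStatistics and the clumping method. A hedged sketch of the resulting two-step flow (variable names are illustrative):

```python
# Sketch of the refactored flow: clumping and locus collection are now two
# explicit steps. `summary_stats` is assumed to be a loaded SummaryStatistics.
study_locus = summary_stats.window_based_clumping(
    distance=500_000,        # clumping window in base pairs
    gwas_significance=5e-8,  # genome-wide significance threshold
)
study_locus = study_locus.annotate_locus_statistics(
    summary_statistics=summary_stats,
    collect_locus_distance=500_000,  # analogous to the old locus_collect_distance argument
)
```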
4 changes: 2 additions & 2 deletions src/gentropy/method/susie_inf.py
@@ -34,7 +34,7 @@ def susie_inf( # noqa: C901
        ssq_range: tuple[float, float] = (0, 1),
        pi0: np.ndarray | None = None,
        est_sigmasq: bool = True,
-        est_tausq: bool = True,
+        est_tausq: bool = False,
        sigmasq: float = 1,
        tausq: float = 0,
        sigmasq_range: tuple[float, float] | None = None,
@@ -399,7 +399,7 @@ def g(x: float) -> float:
    def cred_inf(
        PIP: np.ndarray,
        n: int = 100_000,
-        coverage: float = 0.9,
+        coverage: float = 0.99,
        purity: float = 0.5,
        LD: np.ndarray | None = None,
        V: np.ndarray | None = None,
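The two default changes above (est_tausq now False, credible-set coverage now 0.99) can still be overridden per call. A rough sketch under stated assumptions: SUSIE_inf is assumed to be the class exposing these static methods, `z` and `LD` are assumed to be the z-score vector and LD matrix inputs, and the fit is assumed to be returned as a mapping holding a "PIP" array; only est_tausq, PIP, n, coverage, purity, and LD are taken from the diff itself:

```python
# Rough sketch only: everything not visible in the diff above is an assumption.
import numpy as np

from gentropy.method.susie_inf import SUSIE_inf  # assumed class name

z = np.random.normal(size=200)  # hypothetical z-scores for 200 variants
ld = np.eye(200)                # hypothetical LD matrix

fit = SUSIE_inf.susie_inf(z=z, n=100_000, LD=ld, est_tausq=False)  # new default, made explicit
credible_sets = SUSIE_inf.cred_inf(
    fit["PIP"],     # assumes the fit is returned as a mapping with a PIP array
    n=100_000,
    coverage=0.99,  # new default credible-set coverage
    purity=0.5,
    LD=ld,
)
```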
