
Commit

Merge branch 'dev' into dc_studyLocus_z-score_extraction
Daniel-Considine committed Mar 20, 2024
2 parents 1ad85bd + 650bb2e commit 2a60e9e
Showing 20 changed files with 1,397 additions and 499 deletions.
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
@@ -6,7 +6,7 @@ ci:
skip: [poetry-lock]
repos:
- repo: https://github.com/astral-sh/ruff-pre-commit
- rev: v0.3.2
+ rev: v0.3.3
hooks:
- id: ruff
args:
@@ -104,7 +104,7 @@ repos:
- id: pydoclint

- repo: https://github.com/python-poetry/poetry
rev: "1.8.2"
rev: "1.8.0"
hooks:
- id: poetry-check
- id: poetry-lock
2 changes: 1 addition & 1 deletion docs/src_snippets/howto/python_api/c_applying_methods.py
@@ -23,7 +23,7 @@ def apply_class_method_clumping(summary_stats: SummaryStatistics) -> StudyLocus:
from gentropy.method.window_based_clumping import WindowBasedClumping

clumped_summary_statistics = WindowBasedClumping.clump(
- summary_stats, window_length=500_000
+ summary_stats, distance=250_000
)
# --8<-- [end:apply_class_method_clumping]
return clumped_summary_statistics
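For reference, the change above swaps the old window_length keyword for distance. A minimal sketch of the updated call, assuming a SummaryStatistics instance named summary_stats (the 250_000 bp value is just the snippet's example window):

from gentropy.method.window_based_clumping import WindowBasedClumping

# `distance` replaces the former `window_length` keyword.
clumped = WindowBasedClumping.clump(summary_stats, distance=250_000)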
1,103 changes: 1,103 additions & 0 deletions notebooks/Release_QC_metrics.ipynb

Large diffs are not rendered by default.

131 changes: 5 additions & 126 deletions poetry.lock

Large diffs are not rendered by default.

70 changes: 0 additions & 70 deletions src/gentropy/clump.py

This file was deleted.

6 changes: 4 additions & 2 deletions src/gentropy/config.py
@@ -182,6 +182,7 @@ class LocusToGeneConfig(StepConfig):
"spark.dynamicAllocation.enabled": "false",
"spark.driver.memory": "48g",
"spark.executor.memory": "48g",
+ "spark.sql.shuffle.partitions": "800",
}
}
)
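The added spark.sql.shuffle.partitions setting also works when building a session by hand. A sketch with the values copied from the config above (the builder pattern is standard PySpark, not gentropy-specific):

from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .config("spark.driver.memory", "48g")
    .config("spark.executor.memory", "48g")
    .config("spark.sql.shuffle.partitions", "800")  # setting added in this change
    .getOrCreate()
)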
@@ -320,9 +321,10 @@ class WindowBasedClumpingStep(StepConfig):

summary_statistics_input_path: str = MISSING
study_locus_output_path: str = MISSING
+ distance: int = 500_000
+ collect_locus: bool = False
+ collect_locus_distance: int = 500_000
inclusion_list_path: str | None = None
- locus_collect_distance: str | None = None

_target_: str = "gentropy.window_based_clumping.WindowBasedClumpingStep"


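A sketch of how the reworked step config might be filled in after this change; the field names and defaults come from the diff above, while the paths and the direct-instantiation style are illustrative assumptions (gentropy steps are normally driven through Hydra configs):

from gentropy.config import WindowBasedClumpingStep

step = WindowBasedClumpingStep(
    summary_statistics_input_path="path/to/sumstats",  # placeholder
    study_locus_output_path="path/to/clumps",  # placeholder
    distance=500_000,  # clumping window (new field)
    collect_locus=True,  # opt in to locus collection (new field)
    collect_locus_distance=500_000,  # collection window (new field)
)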
33 changes: 22 additions & 11 deletions src/gentropy/dataset/l2g_feature_matrix.py
@@ -5,6 +5,8 @@
from functools import reduce
from typing import TYPE_CHECKING, Type

+ from pyspark.sql.functions import col

from gentropy.common.schemas import parse_spark_schema
from gentropy.common.spark_helpers import convert_from_long_to_wide
from gentropy.dataset.dataset import Dataset
@@ -40,7 +42,7 @@ def __post_init__(self: L2GFeatureMatrix) -> None:
def generate_features(
cls: Type[L2GFeatureMatrix],
features_list: list[str],
- study_locus: StudyLocus,
+ credible_set: StudyLocus,
study_index: StudyIndex,
variant_gene: V2G,
colocalisation: Colocalisation,
@@ -49,7 +51,7 @@ def generate_features(
Args:
features_list (list[str]): List of features to generate
- study_locus (StudyLocus): Study locus dataset
+ credible_set (StudyLocus): Credible set dataset
study_index (StudyIndex): Study index dataset
variant_gene (V2G): Variant to gene dataset
colocalisation (Colocalisation): Colocalisation dataset
@@ -60,13 +62,24 @@
Raises:
ValueError: If the feature matrix is empty
"""
+ coloc_methods = (
+ colocalisation.df.select("colocalisationMethod")
+ .distinct()
+ .toPandas()["colocalisationMethod"]
+ .tolist()
+ )
if features_dfs := [
# Extract features
- ColocalisationFactory._get_coloc_features(
- study_locus, study_index, colocalisation
- ).df,
- StudyLocusFactory._get_tss_distance_features(study_locus, variant_gene).df,
- StudyLocusFactory._get_vep_features(study_locus, variant_gene).df,
+ ColocalisationFactory._get_max_coloc_per_credible_set(
+ credible_set,
+ study_index,
+ colocalisation.filter(col("colocalisationMethod") == method),
+ method,
+ ).df
+ for method in coloc_methods
+ ] + [
+ StudyLocusFactory._get_tss_distance_features(credible_set, variant_gene).df,
+ StudyLocusFactory._get_vep_features(credible_set, variant_gene).df,
]:
fm = reduce(
lambda x, y: x.unionByName(y),
@@ -162,8 +175,6 @@ def train_test_split(
"""
train, test = self._df.randomSplit([fraction, 1 - fraction], seed=42)
return (
- L2GFeatureMatrix(
- _df=train, _schema=L2GFeatureMatrix.get_schema()
- ).persist(),
- L2GFeatureMatrix(_df=test, _schema=L2GFeatureMatrix.get_schema()).persist(),
+ L2GFeatureMatrix(_df=train, _schema=L2GFeatureMatrix.get_schema()),
+ L2GFeatureMatrix(_df=test, _schema=L2GFeatureMatrix.get_schema()),
)
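The new loop in generate_features follows a common Spark pattern: collect the distinct values of a column to the driver, then build one filtered frame per value. A self-contained sketch of that pattern in plain PySpark (toy data, not the gentropy datasets themselves):

from pyspark.sql import SparkSession
from pyspark.sql.functions import col

spark = SparkSession.builder.getOrCreate()
coloc = spark.createDataFrame(
    [(1, "COLOC"), (2, "eCAVIAR")],
    ["studyLocusId", "colocalisationMethod"],
)
# Distinct method names, collected to the driver as generate_features does.
methods = [
    row["colocalisationMethod"]
    for row in coloc.select("colocalisationMethod").distinct().collect()
]
# One filtered DataFrame per method; each would feed a per-method feature extractor.
per_method = {m: coloc.filter(col("colocalisationMethod") == m) for m in methods}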
8 changes: 4 additions & 4 deletions src/gentropy/dataset/l2g_prediction.py
@@ -44,7 +44,7 @@ def from_credible_set(
cls: Type[L2GPrediction],
model_path: str,
features_list: list[str],
- study_locus: StudyLocus,
+ credible_set: StudyLocus,
study_index: StudyIndex,
v2g: V2G,
coloc: Colocalisation,
@@ -54,7 +54,7 @@
Args:
model_path (str): Path to the fitted model
features_list (list[str]): List of features to use for the model
- study_locus (StudyLocus): Study locus dataset
+ credible_set (StudyLocus): Credible set dataset
study_index (StudyIndex): Study index dataset
v2g (V2G): Variant to gene dataset
coloc (Colocalisation): Colocalisation dataset
@@ -64,7 +64,7 @@
"""
fm = L2GFeatureMatrix.generate_features(
features_list=features_list,
- study_locus=study_locus,
+ credible_set=credible_set,
study_index=study_index,
variant_gene=v2g,
colocalisation=coloc,
@@ -73,7 +73,7 @@
gwas_fm = L2GFeatureMatrix(
_df=(
fm.df.join(
- study_locus.filter_by_study_type("gwas", study_index).df,
+ credible_set.filter_by_study_type("gwas", study_index).df,
on="studyLocusId",
)
),
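Callers only need the keyword rename. A sketch of the updated invocation, assuming the input datasets are already loaded (the model path and feature name are placeholders):

from gentropy.dataset.l2g_prediction import L2GPrediction

predictions = L2GPrediction.from_credible_set(
    model_path="path/to/model",  # placeholder
    features_list=["distanceTssMean"],  # illustrative feature name
    credible_set=credible_set,  # renamed from study_locus
    study_index=study_index,
    v2g=v2g,
    coloc=coloc,
)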
70 changes: 70 additions & 0 deletions src/gentropy/dataset/study_locus.py
@@ -1,4 +1,5 @@
"""Study locus dataset."""

from __future__ import annotations

from dataclasses import dataclass
@@ -24,6 +25,7 @@

from gentropy.dataset.ld_index import LDIndex
from gentropy.dataset.study_index import StudyIndex
+ from gentropy.dataset.summary_statistics import SummaryStatistics


class StudyLocusQualityCheck(Enum):
@@ -427,6 +429,74 @@ def annotate_credible_sets(self: StudyLocus) -> StudyLocus:
)
return self

def annotate_locus_statistics(
self: StudyLocus,
summary_statistics: SummaryStatistics,
collect_locus_distance: int,
) -> StudyLocus:
"""Annotates study locus with summary statistics in the specified distance around the position.

Args:
    summary_statistics (SummaryStatistics): Summary statistics to be used for annotation.
    collect_locus_distance (int): Distance from the lead variant defining the window for inclusion of variants in the locus.

Returns:
    StudyLocus: Study locus annotated with summary statistics in the `locus` column. If no statistics are found, the `locus` column will be empty.
"""
# The clumped loci are reused several times below, so persist the dataframe.
self.df.persist()
# Renaming columns:
sumstats_renamed = summary_statistics.df.selectExpr(
*[f"{col} as tag_{col}" for col in summary_statistics.df.columns]
).alias("sumstat")

locus_df = (
sumstats_renamed
# Joining the two datasets together:
.join(
f.broadcast(
self.df.alias("clumped").select(
"position", "chromosome", "studyId", "studyLocusId"
)
),
on=[
(f.col("sumstat.tag_studyId") == f.col("clumped.studyId"))
& (f.col("sumstat.tag_chromosome") == f.col("clumped.chromosome"))
& (
f.col("sumstat.tag_position")
>= (f.col("clumped.position") - collect_locus_distance)
)
& (
f.col("sumstat.tag_position")
<= (f.col("clumped.position") + collect_locus_distance)
)
],
how="inner",
)
.withColumn(
"locus",
f.struct(
f.col("tag_variantId").alias("variantId"),
f.col("tag_beta").alias("beta"),
f.col("tag_pValueMantissa").alias("pValueMantissa"),
f.col("tag_pValueExponent").alias("pValueExponent"),
f.col("tag_standardError").alias("standardError"),
),
)
.groupBy("studyLocusId")
.agg(
f.collect_list(f.col("locus")).alias("locus"),
)
)

self.df = self.df.drop("locus").join(
locus_df,
on="studyLocusId",
how="left",
)

return self

def annotate_ld(
self: StudyLocus, study_index: StudyIndex, ld_index: LDIndex
) -> StudyLocus:
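A sketch of using the new method, assuming study_locus is a clumped StudyLocus and sumstats the matching SummaryStatistics:

# Attach every summary-statistics variant within ±500 kb of each lead
# variant to the `locus` column of the clumped study loci.
annotated = study_locus.annotate_locus_statistics(
    summary_statistics=sumstats,
    collect_locus_distance=500_000,
)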
31 changes: 10 additions & 21 deletions src/gentropy/dataset/summary_statistics.py
@@ -9,7 +9,6 @@
from gentropy.common.schemas import parse_spark_schema
from gentropy.common.utils import parse_region, split_pvalue
from gentropy.dataset.dataset import Dataset
- from gentropy.method.window_based_clumping import WindowBasedClumping

if TYPE_CHECKING:
from pyspark.sql import DataFrame
@@ -60,34 +59,24 @@ def window_based_clumping(
self: SummaryStatistics,
distance: int = 500_000,
gwas_significance: float = 5e-8,
- baseline_significance: float = 0.05,
- locus_collect_distance: int | None = None,
) -> StudyLocus:
- """Generate study-locus from summary statistics by distance based clumping + collect locus.
+ """Generate study-locus from summary statistics using window-based clumping.
For more info, see [`WindowBasedClumping`][gentropy.method.window_based_clumping.WindowBasedClumping]
Args:
distance (int): Distance in base pairs to be used for clumping. Defaults to 500_000.
gwas_significance (float, optional): GWAS significance threshold. Defaults to 5e-8.
- baseline_significance (float, optional): Baseline significance threshold for inclusion in the locus. Defaults to 0.05.
- locus_collect_distance (int | None): The distance to collect locus around semi-indices. If not provided, locus is not collected.
Returns:
- StudyLocus: Clumped study-locus containing variants based on window.
+ StudyLocus: Clumped study-locus optionally containing variants based on window.
"""
- return (
- WindowBasedClumping.clump_with_locus(
- self,
- window_length=distance,
- p_value_significance=gwas_significance,
- p_value_baseline=baseline_significance,
- locus_window_length=locus_collect_distance,
- )
- if locus_collect_distance
- else WindowBasedClumping.clump(
- self,
- window_length=distance,
- p_value_significance=gwas_significance,
- )
+ from gentropy.method.window_based_clumping import WindowBasedClumping
+
+ return WindowBasedClumping.clump(
+ self,
+ distance=distance,
+ gwas_significance=gwas_significance,
+ )

def exclude_region(self: SummaryStatistics, region: str) -> SummaryStatistics:
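With locus collection moved out of this method, clumping and collection become two explicit steps. A sketch assuming a loaded SummaryStatistics named sumstats (the 250_000 bp windows are illustrative):

# Defaults mirror the signature above: distance=500_000, gwas_significance=5e-8.
clumps = sumstats.window_based_clumping(distance=250_000)
# Locus collection now lives on StudyLocus.annotate_locus_statistics:
clumps = clumps.annotate_locus_statistics(sumstats, collect_locus_distance=250_000)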
