Skip to content

Commit

Permalink
feat(config): extract gwas_significance parameter to step configurati…
Browse files Browse the repository at this point in the history
…on (#628)

* feat(clumping): lower p-value significance threshold for clumping step
* feat(config): extract gwas_significance to config
* feat(config): synced p-value in association parsing

---------

Co-authored-by: Szymon Szyszkowski <[email protected]>
Co-authored-by: Yakov <[email protected]>
  • Loading branch information
3 people authored Jun 4, 2024
1 parent b22951b commit daa8331
Show file tree
Hide file tree
Showing 6 changed files with 35 additions and 12 deletions.
1 change: 1 addition & 0 deletions config/step/ot_window_based_clumping.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@ defaults:
summary_statistics_input_path: ???
study_locus_output_path: ???
inclusion_list_path: ???
gwas_significance: 1e-8
12 changes: 10 additions & 2 deletions src/gentropy/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -355,11 +355,17 @@ class VariantToGeneConfig(StepConfig):


@dataclass
class WindowBasedClumpingStep(StepConfig):
class WindowBasedClumpingStepConfig(StepConfig):
"""Window-based clumping step configuration."""

session: Any = field(
default_factory=lambda: {
"start_hail": True,
}
)
summary_statistics_input_path: str = MISSING
study_locus_output_path: str = MISSING
gwas_significance: float = 5e-8
distance: int = 500_000
collect_locus: bool = False
collect_locus_distance: int = 500_000
Expand Down Expand Up @@ -454,5 +460,7 @@ def register_config() -> None:
cs.store(group="step", name="variant_annotation", node=VariantAnnotationConfig)
cs.store(group="step", name="variant_index", node=VariantIndexConfig)
cs.store(group="step", name="variant_to_gene", node=VariantToGeneConfig)
cs.store(group="step", name="window_based_clumping", node=WindowBasedClumpingStep)
cs.store(
group="step", name="window_based_clumping", node=WindowBasedClumpingStepConfig
)
cs.store(group="step", name="susie_finemapping", node=FinemapperConfig)
6 changes: 4 additions & 2 deletions src/gentropy/dataset/summary_statistics.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

from gentropy.common.schemas import parse_spark_schema
from gentropy.common.utils import parse_region, split_pvalue
from gentropy.config import WindowBasedClumpingStepConfig
from gentropy.dataset.dataset import Dataset

if TYPE_CHECKING:
Expand Down Expand Up @@ -57,8 +58,8 @@ def pvalue_filter(self: SummaryStatistics, pvalue: float) -> SummaryStatistics:

def window_based_clumping(
self: SummaryStatistics,
distance: int = 500_000,
gwas_significance: float = 5e-8,
distance: int = WindowBasedClumpingStepConfig().distance,
gwas_significance: float = WindowBasedClumpingStepConfig().gwas_significance,
) -> StudyLocus:
"""Generate study-locus from summary statistics using window-based clumping.
Expand All @@ -70,6 +71,7 @@ def window_based_clumping(
Returns:
StudyLocus: Clumped study-locus optionally containing variants based on window.
Check WindowBasedClumpingStepConfig object for default values.
"""
from gentropy.method.window_based_clumping import WindowBasedClumping

Expand Down
5 changes: 4 additions & 1 deletion src/gentropy/datasource/gwas_catalog/associations.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
pvalue_to_zscore,
)
from gentropy.common.utils import parse_efos
from gentropy.config import WindowBasedClumpingStepConfig
from gentropy.dataset.study_locus import StudyLocus, StudyLocusQualityCheck

if TYPE_CHECKING:
Expand Down Expand Up @@ -1035,7 +1036,7 @@ def from_source(
cls: type[GWASCatalogCuratedAssociationsParser],
gwas_associations: DataFrame,
variant_annotation: VariantAnnotation,
pvalue_threshold: float = 5e-8,
pvalue_threshold: float = WindowBasedClumpingStepConfig.gwas_significance,
) -> StudyLocusGWASCatalog:
"""Read GWASCatalog associations.
Expand All @@ -1049,6 +1050,8 @@ def from_source(
Returns:
StudyLocusGWASCatalog: GWASCatalogAssociations dataset
The pvalue_threshold default is kept in sync with WindowBasedClumpingStepConfig.gwas_significance.
"""
return StudyLocusGWASCatalog(
_df=gwas_associations.withColumn(
Expand Down
7 changes: 5 additions & 2 deletions src/gentropy/method/window_based_clumping.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from pyspark.ml.linalg import DenseVector, VectorUDT
from pyspark.sql.window import Window

from gentropy.config import WindowBasedClumpingStepConfig
from gentropy.dataset.study_locus import StudyLocus

if TYPE_CHECKING:
Expand Down Expand Up @@ -154,8 +155,8 @@ def _prune_peak(position: NDArray[np.float64], window_size: int) -> DenseVector:
@staticmethod
def clump(
summary_statistics: SummaryStatistics,
distance: int = 500_000,
gwas_significance: float = 5e-8,
distance: int = WindowBasedClumpingStepConfig().distance,
gwas_significance: float = WindowBasedClumpingStepConfig().gwas_significance,
) -> StudyLocus:
"""Clump significant signals from summary statistics based on window.
Expand All @@ -166,6 +167,8 @@ def clump(
Returns:
StudyLocus: clumped summary statistics (without locus collection)
Check WindowBasedClumpingStepConfig object for default values
"""
# Create window for locus clusters
# - variants where the distance between subsequent variants is below the defined threshold.
Expand Down
16 changes: 11 additions & 5 deletions src/gentropy/window_based_clumping.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from __future__ import annotations

from gentropy.common.session import Session
from gentropy.config import WindowBasedClumpingStepConfig
from gentropy.dataset.summary_statistics import SummaryStatistics


Expand All @@ -14,10 +15,12 @@ def __init__(
session: Session,
summary_statistics_input_path: str,
study_locus_output_path: str,
distance: int = 500_000,
collect_locus: bool = False,
collect_locus_distance: int = 500_000,
inclusion_list_path: str | None = None,
distance: int = WindowBasedClumpingStepConfig().distance,
gwas_significance: float = WindowBasedClumpingStepConfig().gwas_significance,
collect_locus: bool = WindowBasedClumpingStepConfig().collect_locus,
collect_locus_distance: int = WindowBasedClumpingStepConfig().collect_locus_distance,
inclusion_list_path: str
| None = WindowBasedClumpingStepConfig().inclusion_list_path,
) -> None:
"""Run window-based clumping step.
Expand All @@ -26,9 +29,12 @@ def __init__(
summary_statistics_input_path (str): Path to the harmonized summary statistics dataset.
study_locus_output_path (str): Output path for the resulting study locus dataset.
distance (int): Distance, within which tagging variants are collected around the semi-index. Optional.
gwas_significance (float): GWAS significance threshold. Defaults to 5e-8.
collect_locus (bool): Whether to collect locus around semi-indices. Optional.
collect_locus_distance (int): Distance, within which tagging variants are collected around the semi-index. Optional.
inclusion_list_path (str | None): Path to the inclusion list (list of white-listed study identifiers). Optional.
Check WindowBasedClumpingStepConfig object for default values.
"""
# If inclusion list path is provided, only these studies will be read:
if inclusion_list_path:
Expand All @@ -48,7 +54,7 @@ def __init__(

# Clumping:
study_locus = ss.window_based_clumping(
distance=distance,
distance=distance, gwas_significance=gwas_significance
)

# Optional locus collection:
Expand Down

0 comments on commit daa8331

Please sign in to comment.