Skip to content

Commit

Permalink
fix: remove n_eff check from qc_step (#785)
Browse files Browse the repository at this point in the history
  • Loading branch information
addramir committed Sep 24, 2024
1 parent 2199ece commit 2010fb6
Show file tree
Hide file tree
Showing 4 changed files with 18 additions and 24 deletions.
1 change: 1 addition & 0 deletions src/gentropy/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -455,6 +455,7 @@ class GWASQCStep(StepConfig):
gwas_path: str = MISSING
output_path: str = MISSING
studyid: str = MISSING
pval_threshold: float = MISSING
_target_: str = "gentropy.sumstat_qc_step.SummaryStatisticsQCStep"


Expand Down
21 changes: 9 additions & 12 deletions src/gentropy/method/sumstat_quality_controls.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Summary statistics quality control methods."""

from __future__ import annotations

import numpy as np
Expand Down Expand Up @@ -225,13 +226,13 @@ def gc_lambda_check(

@staticmethod
def number_of_snps(
gwas_for_qc: SummaryStatistics, pval_threhod: float = 5e-8
gwas_for_qc: SummaryStatistics, pval_threshold: float = 5e-8
) -> DataFrame:
"""The function calculates the number of SNPs and the number of SNPs with a p-value at or below the threshold (5e-8 by default).
Args:
gwas_for_qc (SummaryStatistics): The instance of the SummaryStatistics class.
pval_threhod (float): The threshold for the p-value.
pval_threshold (float): The threshold for the p-value.
Returns:
DataFrame: PySpark DataFrame with the number of SNPs and number of SNPs with p-value less than threshold.
Expand All @@ -243,7 +244,7 @@ def number_of_snps(
f.sum(
(
f.log10(f.col("pValueMantissa")) + f.col("pValueExponent")
<= np.log10(pval_threhod)
<= np.log10(pval_threshold)
).cast("int")
).alias("n_variants_sig"),
)
Expand All @@ -254,30 +255,26 @@ def number_of_snps(
def get_quality_control_metrics(
gwas: SummaryStatistics,
limit: int = 100_000_000,
min_count: int = 100_000,
n_total: int = 100_000,
pval_threshold: float = 5e-8,
) -> DataFrame:
"""The function calculates the quality control metrics for the summary statistics.
Args:
gwas (SummaryStatistics): The instance of the SummaryStatistics class.
limit (int): The limit for the number of variants to be used for the estimation.
min_count (int): The minimum number of variants to be used for the estimation.
n_total (int): The total sample size.
pval_threshold (float): The threshold for the p-value.
Returns:
DataFrame: PySpark DataFrame with the quality control metrics for the summary statistics.
"""
qc1 = SummaryStatisticsQC.sumstat_qc_beta_check(gwas_for_qc=gwas)
qc2 = SummaryStatisticsQC.sumstat_qc_pz_check(gwas_for_qc=gwas, limit=limit)
qc3 = SummaryStatisticsQC.sumstat_n_eff_check(
gwas_for_qc=gwas, n_total=n_total, limit=limit, min_count=min_count
)
qc4 = SummaryStatisticsQC.gc_lambda_check(gwas_for_qc=gwas, limit=limit)
qc5 = SummaryStatisticsQC.number_of_snps(gwas_for_qc=gwas)
qc5 = SummaryStatisticsQC.number_of_snps(
gwas_for_qc=gwas, pval_threshold=pval_threshold
)
df = (
qc1.join(qc2, on="studyId", how="outer")
.join(qc3, on="studyId", how="outer")
.join(qc4, on="studyId", how="outer")
.join(qc5, on="studyId", how="outer")
)
Expand Down
4 changes: 3 additions & 1 deletion src/gentropy/sumstat_qc_step.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ def __init__(
gwas_path: str,
output_path: str,
studyid: str,
pval_threshold: float = 1e-8,
) -> None:
"""Calculating quality control metrics on the provided GWAS study.
Expand All @@ -24,13 +25,14 @@ def __init__(
gwas_path (str): Path to the GWAS summary statistics.
output_path (str): Output path for the QC results.
studyid (str): Study ID for the QC.
pval_threshold (float): P-value threshold for the QC. Default is 1e-8.
"""
gwas = SummaryStatistics.from_parquet(session, path=gwas_path)

(
SummaryStatisticsQC.get_quality_control_metrics(
gwas=gwas, limit=100_000_000, min_count=100, n_total=100000
gwas=gwas, limit=100_000_000, pval_threshold=pval_threshold
)
.write.mode(session.write_mode)
.parquet(output_path + "/qc_results_" + studyid)
Expand Down
16 changes: 5 additions & 11 deletions tests/gentropy/method/test_qc_of_sumstats.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
from __future__ import annotations

import numpy as np
import pandas as pd
import pyspark.sql.functions as f
import pytest
from pyspark.sql.functions import rand, when
Expand All @@ -18,9 +17,7 @@ def test_qc_functions(
) -> None:
"""Test all sumstat qc functions."""
gwas = sample_summary_statistics.sanity_filter()
QC = SummaryStatisticsQC.get_quality_control_metrics(
gwas=gwas, limit=100000, min_count=100, n_total=100000
)
QC = SummaryStatisticsQC.get_quality_control_metrics(gwas=gwas, limit=100000)
QC = QC.toPandas()

assert QC["n_variants"].iloc[0] == 1663
Expand All @@ -29,7 +26,6 @@ def test_qc_functions(
assert np.round(QC["mean_beta"].iloc[0], 4) == 0.0013
assert np.round(QC["mean_diff_pz"].iloc[0], 6) == 0
assert np.round(QC["se_diff_pz"].iloc[0], 6) == 0
assert pd.isna(QC["se_N"].iloc[0])


def test_neff_check_eaf(
Expand All @@ -41,8 +37,8 @@ def test_neff_check_eaf(
gwas_df = gwas_df.withColumn("effectAlleleFrequencyFromSource", f.lit(0.5))
gwas._df = gwas_df

QC = SummaryStatisticsQC.get_quality_control_metrics(
gwas=gwas, limit=100000, min_count=100, n_total=100000
QC = SummaryStatisticsQC.sumstat_n_eff_check(
gwas_for_qc=gwas, limit=100000, min_count=100, n_total=100000
)
QC = QC.toPandas()
assert np.round(QC["se_N"].iloc[0], 4) == 0.5586
Expand All @@ -59,11 +55,9 @@ def test_several_studyid(
)
gwas._df = gwas_df

QC = SummaryStatisticsQC.get_quality_control_metrics(
gwas=gwas, limit=100000, min_count=100, n_total=100000
)
QC = SummaryStatisticsQC.get_quality_control_metrics(gwas=gwas, limit=100000)
QC = QC.toPandas()
assert QC.shape == (2, 8)
assert QC.shape == (2, 7)


def test_sanity_filter_remove_inf_values(
Expand Down

0 comments on commit 2010fb6

Please sign in to comment.