From 4522e0c98a3be475807d27f8d77864f467330afd Mon Sep 17 00:00:00 2001 From: Yakov Tsepilov Date: Wed, 25 Sep 2024 11:01:39 +0100 Subject: [PATCH] feat: adding QC login to fine_mapper --- src/gentropy/config.py | 10 +++ src/gentropy/susie_finemapper.py | 103 +++++++++++++++++++++++-------- 2 files changed, 86 insertions(+), 27 deletions(-) diff --git a/src/gentropy/config.py b/src/gentropy/config.py index 3293a882a..63ba6b54e 100644 --- a/src/gentropy/config.py +++ b/src/gentropy/config.py @@ -453,10 +453,20 @@ class FinemapperConfig(StepConfig): class GWASQCStep(StepConfig): """GWAS QC step configuration.""" + session: Any = field( + default_factory=lambda: { + "start_hail": True, + } + ) gwas_path: str = MISSING output_path: str = MISSING studyid: str = MISSING pval_threshold: float = MISSING + threshold_mean_beta: float = 0.05 + threshold_mean_diff_pz: float = 0.05 + threshold_se_diff_pz: float = 0.05 + threshold_gc_lambda: float = 2 + threshold_min_n_variants_for_susie: int = 2_000_000 _target_: str = "gentropy.sumstat_qc_step.SummaryStatisticsQCStep" diff --git a/src/gentropy/susie_finemapper.py b/src/gentropy/susie_finemapper.py index 254f46d1e..f8aab06fc 100644 --- a/src/gentropy/susie_finemapper.py +++ b/src/gentropy/susie_finemapper.py @@ -4,7 +4,7 @@ import logging import time -from typing import Any +from typing import Any, Optional import numpy as np import pandas as pd @@ -25,6 +25,7 @@ neglog_pvalue_to_mantissa_and_exponent, order_array_of_structs_by_field, ) +from gentropy.config import GWASQCStep from gentropy.dataset.study_index import StudyIndex from gentropy.dataset.study_locus import StudyLocus from gentropy.datasource.gnomad.ld import GnomADLDMatrix @@ -60,6 +61,7 @@ def __init__( carma_tau: float = 0.15, imputed_r2_threshold: float = 0.9, ld_score_threshold: float = 5, + qc_report_path: Optional[str] = None, ) -> None: """Run fine-mapping on a studyLocusId from a collected studyLocus table. @@ -82,6 +84,7 @@ def __init__( carma_tau (float): CARMA tau, shrinkage parameter imputed_r2_threshold (float): imputed R2 threshold, default is 0.9 ld_score_threshold (float): LD score threshold ofr imputation, default is 5 + qc_report_path (Optional[str]): path to the QC report, default is None """ # Read locus manifest. study_locus_manifest = pd.read_csv(study_locus_manifest_path) @@ -101,27 +104,70 @@ def __init__( .collect()[0] ) study_index = StudyIndex.from_parquet(session, study_index_path) - # Run fine-mapping - result_logging = self.susie_finemapper_one_sl_row_v4_ss_gathered_boundaries( - session=session, - study_locus_row=study_locus, - study_index=study_index, - max_causal_snps=max_causal_snps, - primary_signal_pval_threshold=primary_signal_pval_threshold, - secondary_signal_pval_threshold=secondary_signal_pval_threshold, - purity_mean_r2_threshold=purity_mean_r2_threshold, - purity_min_r2_threshold=purity_min_r2_threshold, - cs_lbf_thr=cs_lbf_thr, - sum_pips=sum_pips, - susie_est_tausq=susie_est_tausq, - run_carma=run_carma, - run_sumstat_imputation=run_sumstat_imputation, - carma_tau=carma_tau, - carma_time_limit=carma_time_limit, - imputed_r2_threshold=imputed_r2_threshold, - ld_score_threshold=ld_score_threshold, - ) + # Major ancestry + + pd.DataFrame.iteritems = pd.DataFrame.items + studyId = study_locus["studyId"] + study_index_df = study_index._df + study_index_df = study_index_df.filter(f.col("studyId") == studyId) + major_population = study_index_df.select( + "studyId", + order_array_of_structs_by_field( + "ldPopulationStructure", "relativeSampleSize" + )[0]["ldPopulation"].alias("majorPopulation"), + ).collect()[0]["majorPopulation"] + if major_population in ["nfe", "csa", "afr"]: + condition_1 = True + else: + condition_1 = False + + # Read QC report + if qc_report_path is not None: + qc_report = pd.read_parquet(qc_report_path) + qc_report = qc_report.loc[qc_report["studyId"] == studyId] + + if ( + qc_report.loc[qc_report["studyId"] == studyId, "gc_lambda"] + <= GWASQCStep().threshold_gc_lambda + and abs(qc_report.loc[qc_report["studyId"] == studyId, "mean_beta"]) + <= GWASQCStep().threshold_mean_beta + and abs(qc_report.loc[qc_report["studyId"] == studyId, "mean_diff_pz"]) + <= GWASQCStep().threshold_mean_diff_pz + and abs(qc_report.loc[qc_report["studyId"] == studyId, "se_diff_pz"]) + <= GWASQCStep().threshold_se_diff_pz + and qc_report.loc[qc_report["studyId"] == studyId, "n_variants"] + >= GWASQCStep().threshold_min_n_variants_for_susie + ): + condition_2 = True + else: + condition_2 = False + else: + condition_2 = True + + if condition_1 and condition_2: + # Run fine-mapping + result_logging = self.susie_finemapper_one_sl_row_v4_ss_gathered_boundaries( + session=session, + study_locus_row=study_locus, + study_index=study_index, + max_causal_snps=max_causal_snps, + primary_signal_pval_threshold=primary_signal_pval_threshold, + secondary_signal_pval_threshold=secondary_signal_pval_threshold, + purity_mean_r2_threshold=purity_mean_r2_threshold, + purity_min_r2_threshold=purity_min_r2_threshold, + cs_lbf_thr=cs_lbf_thr, + sum_pips=sum_pips, + susie_est_tausq=susie_est_tausq, + run_carma=run_carma, + run_sumstat_imputation=run_sumstat_imputation, + carma_tau=carma_tau, + carma_time_limit=carma_time_limit, + imputed_r2_threshold=imputed_r2_threshold, + ld_score_threshold=ld_score_threshold, + ) + else: + result_logging = None if result_logging is not None: # Write result @@ -702,12 +748,7 @@ def susie_finemapper_one_sl_row_v4_ss_gathered_boundaries( # noqa: C901 if gwas_index.rdd.isEmpty(): logging.warning("No overlapping variants in the LD Index") return None - # Check if gwas_index has less than 100 variants - if gwas_index.count() < 100: - logging.warning( - "Less than 100 variants after joining GWAS and LD index" - ) - return None + gnomad_ld = GnomADLDMatrix.get_numpy_matrix( gwas_index, gnomad_ancestry=major_population ) @@ -794,6 +835,14 @@ def susie_finemapper_one_sl_row_v4_ss_gathered_boundaries( # noqa: C901 ) np.fill_diagonal(gnomad_ld, 1) + # Check if gwas_index is too small or too big + if gwas_index.count() < 100: + logging.warning("Less than 100 variants after joining GWAS and LD index") + return None + elif gwas_index.count() >= 15_000: + logging.warning("More than 15000 variants after joining GWAS and LD index") + return None + out = SusieFineMapperStep.susie_finemapper_from_prepared_dataframes( GWAS_df=gwas_df, ld_index=gwas_index,