From 73b97a37470d30d51246e521b6ccea59dea1e6dd Mon Sep 17 00:00:00 2001 From: Yakov Tsepilov Date: Tue, 24 Sep 2024 17:24:14 +0100 Subject: [PATCH 1/2] fix: adding data specific fillters --- src/gentropy/config.py | 8 +++++++- src/gentropy/eqtl_catalogue.py | 5 +++++ src/gentropy/finngen_finemapping_ingestion.py | 4 ++++ src/gentropy/pics.py | 7 +++++++ src/gentropy/study_locus_validation.py | 2 -- 5 files changed, 23 insertions(+), 3 deletions(-) diff --git a/src/gentropy/config.py b/src/gentropy/config.py index 86bfc7afe..9ec8d73cd 100644 --- a/src/gentropy/config.py +++ b/src/gentropy/config.py @@ -122,10 +122,16 @@ class GWASCatalogSumstatsPreprocessConfig(StepConfig): class EqtlCatalogueConfig(StepConfig): """eQTL Catalogue step configuration.""" + session: Any = field( + default_factory=lambda: { + "start_hail": True, + } + ) eqtl_catalogue_paths_imported: str = MISSING eqtl_catalogue_study_index_out: str = MISSING eqtl_catalogue_credible_sets_out: str = MISSING mqtl_quantification_methods_blacklist: list[str] = field(default_factory=lambda: []) + eqtl_lead_pvalue_threshold: float = 1e-3 _target_: str = "gentropy.eqtl_catalogue.EqtlCatalogueStep" @@ -168,6 +174,7 @@ class FinngenFinemappingConfig(StepConfig): _target_: str = ( "gentropy.finngen_finemapping_ingestion.FinnGenFinemappingIngestionStep" ) + finngen_finemapping_lead_pvalue_threshold: float = 1e-5 @dataclass @@ -502,7 +509,6 @@ class StudyLocusValidationStepConfig(StepConfig): valid_study_locus_path: str = MISSING invalid_study_locus_path: str = MISSING invalid_qc_reasons: list[str] = MISSING - gwas_significance: float = WindowBasedClumpingStepConfig.gwas_significance _target_: str = "gentropy.study_locus_validation.StudyLocusValidationStep" diff --git a/src/gentropy/eqtl_catalogue.py b/src/gentropy/eqtl_catalogue.py index 7adc5d8a2..ac3b14879 100644 --- a/src/gentropy/eqtl_catalogue.py +++ b/src/gentropy/eqtl_catalogue.py @@ -3,6 +3,7 @@ from __future__ import annotations from 
gentropy.common.session import Session +from gentropy.config import EqtlCatalogueConfig from gentropy.datasource.eqtl_catalogue.finemapping import EqtlCatalogueFinemapping from gentropy.datasource.eqtl_catalogue.study_index import EqtlCatalogueStudyIndex @@ -61,6 +62,10 @@ def __init__( credible_sets = EqtlCatalogueFinemapping.from_susie_results(processed_susie_df) study_index = EqtlCatalogueStudyIndex.from_susie_results(processed_susie_df) + credible_sets = credible_sets.validate_lead_pvalue( + pvalue_cutoff=EqtlCatalogueConfig().eqtl_lead_pvalue_threshold + ) + # Load study_index.df.write.mode(session.write_mode).parquet( eqtl_catalogue_study_index_out diff --git a/src/gentropy/finngen_finemapping_ingestion.py b/src/gentropy/finngen_finemapping_ingestion.py index 80089cf68..4c008d83b 100644 --- a/src/gentropy/finngen_finemapping_ingestion.py +++ b/src/gentropy/finngen_finemapping_ingestion.py @@ -37,6 +37,10 @@ def __init__( finngen_susie_finemapping_cs_summary_files=finngen_susie_finemapping_cs_summary_files, ) + finngen_finemapping_df = finngen_finemapping_df.validate_lead_pvalue( + pvalue_cutoff=FinngenFinemappingConfig().finngen_finemapping_lead_pvalue_threshold + ) + # Write the output. 
finngen_finemapping_df.df.write.mode(session.write_mode).parquet( finngen_finemapping_out diff --git a/src/gentropy/pics.py b/src/gentropy/pics.py index e80a37eb6..a8f3c345b 100644 --- a/src/gentropy/pics.py +++ b/src/gentropy/pics.py @@ -3,6 +3,7 @@ from __future__ import annotations from gentropy.common.session import Session +from gentropy.config import WindowBasedClumpingStepConfig from gentropy.dataset.study_locus import CredibleInterval, StudyLocus from gentropy.method.pics import PICS @@ -31,5 +32,11 @@ def __init__( picsed_sl = PICS.finemap(study_locus_ld_annotated).filter_credible_set( credible_interval=CredibleInterval.IS99 ) + + # Validate lead p-value + picsed_sl = picsed_sl.validate_lead_pvalue( + pvalue_cutoff=WindowBasedClumpingStepConfig().gwas_significance + ) + # Write picsed_sl.df.write.mode(session.write_mode).parquet(picsed_study_locus_out) diff --git a/src/gentropy/study_locus_validation.py b/src/gentropy/study_locus_validation.py index 287cd5645..fc69f6855 100644 --- a/src/gentropy/study_locus_validation.py +++ b/src/gentropy/study_locus_validation.py @@ -19,7 +19,6 @@ def __init__( session: Session, study_index_path: str, study_locus_path: list[str], - gwas_significance: float, valid_study_locus_path: str, invalid_study_locus_path: str, invalid_qc_reasons: list[str] = [], @@ -30,7 +29,6 @@ def __init__( session (Session): Session object. study_index_path (str): Path to study index file. study_locus_path (list[str]): Path to study locus dataset. - gwas_significance (float): GWAS significance threshold. valid_study_locus_path (str): Path to write the valid records. invalid_study_locus_path (str): Path to write the output file. invalid_qc_reasons (list[str]): List of invalid quality check reason names from `StudyLocusQualityCheck` (e.g. ['SUBSIGNIFICANT_FLAG']). 
From 9d661270af602c3d278df7662acdfa36fa77fb87 Mon Sep 17 00:00:00 2001 From: Yakov Tsepilov Date: Wed, 25 Sep 2024 11:55:54 +0100 Subject: [PATCH 2/2] fix: removing variables --- src/gentropy/eqtl_catalogue.py | 24 ++++++++++------- src/gentropy/finngen_finemapping_ingestion.py | 26 +++++++++---------- src/gentropy/pics.py | 20 +++++++------- 3 files changed, 37 insertions(+), 33 deletions(-) diff --git a/src/gentropy/eqtl_catalogue.py b/src/gentropy/eqtl_catalogue.py index ac3b14879..ff01e1e22 100644 --- a/src/gentropy/eqtl_catalogue.py +++ b/src/gentropy/eqtl_catalogue.py @@ -59,17 +59,21 @@ def __init__( processed_susie_df = EqtlCatalogueFinemapping.parse_susie_results( credible_sets_df, lbf_df, studies_metadata ) - credible_sets = EqtlCatalogueFinemapping.from_susie_results(processed_susie_df) - study_index = EqtlCatalogueStudyIndex.from_susie_results(processed_susie_df) - credible_sets = credible_sets.validate_lead_pvalue( - pvalue_cutoff=EqtlCatalogueConfig().eqtl_lead_pvalue_threshold + ( + EqtlCatalogueStudyIndex.from_susie_results(processed_susie_df) + # Writing the output: + .df.write.mode(session.write_mode) + .parquet(eqtl_catalogue_study_index_out) ) - # Load - study_index.df.write.mode(session.write_mode).parquet( - eqtl_catalogue_study_index_out - ) - credible_sets.df.write.mode(session.write_mode).parquet( - eqtl_catalogue_credible_sets_out + ( + EqtlCatalogueFinemapping.from_susie_results(processed_susie_df) + # Flagging sub-significant loci: + .validate_lead_pvalue( + pvalue_cutoff=EqtlCatalogueConfig().eqtl_lead_pvalue_threshold + ) + # Writing the output: + .df.write.mode(session.write_mode) + .parquet(eqtl_catalogue_credible_sets_out) ) diff --git a/src/gentropy/finngen_finemapping_ingestion.py b/src/gentropy/finngen_finemapping_ingestion.py index 4c008d83b..be925d1cc 100644 --- a/src/gentropy/finngen_finemapping_ingestion.py +++ b/src/gentropy/finngen_finemapping_ingestion.py @@ -31,17 +31,17 @@ def __init__( """ # Read finemapping outputs 
from the input paths. - finngen_finemapping_df = FinnGenFinemapping.from_finngen_susie_finemapping( - spark=session.spark, - finngen_susie_finemapping_snp_files=finngen_susie_finemapping_snp_files, - finngen_susie_finemapping_cs_summary_files=finngen_susie_finemapping_cs_summary_files, - ) - - finngen_finemapping_df = finngen_finemapping_df.validate_lead_pvalue( - pvalue_cutoff=FinngenFinemappingConfig().finngen_finemapping_lead_pvalue_threshold - ) - - # Write the output. - finngen_finemapping_df.df.write.mode(session.write_mode).parquet( - finngen_finemapping_out + ( + FinnGenFinemapping.from_finngen_susie_finemapping( + spark=session.spark, + finngen_susie_finemapping_snp_files=finngen_susie_finemapping_snp_files, + finngen_susie_finemapping_cs_summary_files=finngen_susie_finemapping_cs_summary_files, + ) + # Flagging sub-significant loci: + .validate_lead_pvalue( + pvalue_cutoff=FinngenFinemappingConfig().finngen_finemapping_lead_pvalue_threshold + ) + # Writing the output: + .df.write.mode(session.write_mode) + .parquet(finngen_finemapping_out) ) diff --git a/src/gentropy/pics.py b/src/gentropy/pics.py index a8f3c345b..f96f54997 100644 --- a/src/gentropy/pics.py +++ b/src/gentropy/pics.py @@ -29,14 +29,14 @@ def __init__( session, study_locus_ld_annotated_in ) # PICS - picsed_sl = PICS.finemap(study_locus_ld_annotated).filter_credible_set( - credible_interval=CredibleInterval.IS99 + ( + PICS.finemap(study_locus_ld_annotated) + .filter_credible_set(credible_interval=CredibleInterval.IS99) + # Flagging sub-significant loci: + .validate_lead_pvalue( + pvalue_cutoff=WindowBasedClumpingStepConfig().gwas_significance + ) + # Writing the output: + .df.write.mode(session.write_mode) + .parquet(picsed_study_locus_out) ) - - # Validate lead p-value - picsed_sl = picsed_sl.validate_lead_pvalue( - pvalue_cutoff=WindowBasedClumpingStepConfig().gwas_significance - ) - - # Write - picsed_sl.df.write.mode(session.write_mode).parquet(picsed_study_locus_out)