From 632ff147e68c5c90824a989465d670a52be2eb33 Mon Sep 17 00:00:00 2001
From: Daniel Suveges
Date: Mon, 13 Nov 2023 15:19:54 +0000
Subject: [PATCH 01/33] test: adding test for pairwiseLD

---
 src/otg/assets/schemas/pairwise_ld.json   |  23 +++++
 src/otg/dataset/pairwise_ld.py            | 108 ++++++++++++++++++++++
 src/otg/datasource/gnomad/ld.py           |  61 ++++++------
 tests/dataset/test_pairwise_ld.py         | 103 +++++++++++++++++++++
 tests/datasource/gnomad/test_gnomad_ld.py |   9 +-
 5 files changed, 270 insertions(+), 34 deletions(-)
 create mode 100644 src/otg/assets/schemas/pairwise_ld.json
 create mode 100644 src/otg/dataset/pairwise_ld.py
 create mode 100644 tests/dataset/test_pairwise_ld.py

diff --git a/src/otg/assets/schemas/pairwise_ld.json b/src/otg/assets/schemas/pairwise_ld.json
new file mode 100644
index 000000000..2d5351d82
--- /dev/null
+++ b/src/otg/assets/schemas/pairwise_ld.json
@@ -0,0 +1,23 @@
+{
+  "fields": [
+    {
+      "metadata": {},
+      "name": "variantId_i",
+      "nullable": false,
+      "type": "string"
+    },
+    {
+      "metadata": {},
+      "name": "variantId_j",
+      "nullable": false,
+      "type": "string"
+    },
+    {
+      "metadata": {},
+      "name": "r",
+      "nullable": false,
+      "type": "double"
+    }
+  ],
+  "type": "struct"
+}
diff --git a/src/otg/dataset/pairwise_ld.py b/src/otg/dataset/pairwise_ld.py
new file mode 100644
index 000000000..20aac1f6d
--- /dev/null
+++ b/src/otg/dataset/pairwise_ld.py
@@ -0,0 +1,108 @@
+"""Pairwise LD dataset."""
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from math import sqrt
+from typing import TYPE_CHECKING
+
+import numpy as np
+from pyspark.sql import functions as f
+from pyspark.sql import types as t
+
+from otg.common.schemas import parse_spark_schema
+from otg.dataset.dataset import Dataset
+
+if TYPE_CHECKING:
+    from pyspark.sql.types import StructType
+
+
+@dataclass
+class PairwiseLD(Dataset):
+    """Pairwise variant correlation dataset.
+
+    This class captures logic applied to pairwise linkage data,
+    with validation ensuring data quality.
+    """
+
+    dimension: tuple[int, int] = field(init=False)
+
+    def __post_init__(self: PairwiseLD) -> None:
+        """Validating the dataset upon creation.
+
+        - Besides the schema, a pairwise LD table is expected to have a square number of rows.
+        """
+        row_count = self.df.count()
+
+        assert int(sqrt(row_count)) == sqrt(
+            row_count
+        ), f"The number of rows in a pairwise LD table has to be square. Got: {row_count}"
+
+        self.dimension = (int(sqrt(row_count)), int(sqrt(row_count)))
+
+    @classmethod
+    def get_schema(cls: type[PairwiseLD]) -> StructType:
+        """Provide the schema for the PairwiseLD dataset.
+
+        Returns:
+            StructType: The schema of the PairwiseLD dataset.
+        """
+        return parse_spark_schema("pairwise_ld.json")
+
+    def overlap_with_locus(self: PairwiseLD, locus_variants: list[str]) -> PairwiseLD:
+        """Subset pairwise LD table with locus.
+
+        Args:
+            locus_variants (list[str]): List of variants found in the locus.
+
+        Returns:
+            PairwiseLD: Pairwise LD data filtered down to the variants of the locus.
+        """
+        return PairwiseLD(
+            _df=(
+                self.df.filter(
+                    f.col("variantId_i").isin(locus_variants)
+                    & f.col("variantId_j").isin(locus_variants)
+                )
+            ),
+            _schema=PairwiseLD.get_schema(),
+        )
+
+    def r_to_numpy_matrix(self) -> np.ndarray:
+        """Convert pairwise LD to a numpy square matrix.
+
+        Returns:
+            np.ndarray: 2D square matrix with r values.
+        """
+        return np.array(
+            self.df.select(
+                f.split("variantId_i", "_")[1]
+                .cast(t.IntegerType())
+                .alias("position_i"),
+                f.split("variantId_j", "_")[1]
+                .cast(t.IntegerType())
+                .alias("position_j"),
+                "r",
+            )
+            .orderBy(f.col("position_i").asc(), f.col("position_j").asc())
+            .select("r")
+            .collect()
+        ).reshape(self.dimension)
+
+    def get_variant_list(self) -> list[str]:
+        """Return a list of unique variants from the dataset.
+
+        Returns:
+            list[str]: list of variant identifiers sorted by position.
+        """
+        return [
+            row["variantId"]
+            for row in (
+                self.df.select(
+                    f.col("variantId_i").alias("variantId"),
+                    f.split(f.col("variantId_i"), "_")[1]
+                    .cast(t.IntegerType())
+                    .alias("position"),
+                )
+                # Deduplicating: each variant occurs once per row of the square matrix.
+                .distinct()
+                .orderBy(f.col("position").asc())
+                .collect()
+            )
+        ]
diff --git a/src/otg/datasource/gnomad/ld.py b/src/otg/datasource/gnomad/ld.py
index 6b85dabc6..1aa2ac223 100644
--- a/src/otg/datasource/gnomad/ld.py
+++ b/src/otg/datasource/gnomad/ld.py
@@ -14,6 +14,7 @@
 
 from otg.common.utils import _liftover_loci, convert_gnomad_position_to_ensembl
 from otg.dataset.ld_index import LDIndex
+from otg.dataset.pairwise_ld import PairwiseLD
 
 if TYPE_CHECKING:
     from pyspark.sql import DataFrame
@@ -337,7 +338,7 @@ def get_ld_variants(
         chromosome: str,
         start: int,
         end: int,
-    ) -> DataFrame:
+    ) -> PairwiseLD:
         """Return melted LD table with resolved variant id based on ancestry and genomic location.
 
         Args:
@@ -347,7 +348,7 @@ def get_ld_variants(
             gnomad_ancestry (str): GnomAD major ancestry label eg. `nfe`
             chromosome (str): chromosome label
             start (int): window lower bound
             end (int): window upper bound
 
         Returns:
-            DataFrame: square LD matrix resolved to variants.
+            PairwiseLD: Melted representation of the square slice of the LD matrix.
         """
         # Extracting locus:
         ld_index_df = (
@@ -363,9 +364,6 @@ def get_ld_variants(
             .select("chromosome", "position", "variantId", "idx")
             .persist()
         )
-        # +----------+---------+--------------------+--------+
-        # |chromosome| position|           variantId|     idx|
-        # +----------+---------+--------------------+--------+
         start_index = self._get_value_from_row(
             ld_index_df.orderBy(f.col("position").asc()).first(), "idx"
         )
@@ -385,31 +383,34 @@ def get_ld_variants(
             return SparkSession.builder.getOrCreate().createDataFrame([], schema=schema)
 
         # Extract square matrix:
-        return (
-            self.get_ld_matrix_slice(
-                gnomad_ancestry, start_index=start_index, end_index=end_index
-            )
-            .join(
-                (
-                    ld_index_df.select(
-                        f.col("idx").alias("idx_i"),
-                        f.col("variantId").alias("variantId_i"),
-                    )
-                ),
-                on="idx_i",
-                how="inner",
-            )
-            .join(
-                (
-                    ld_index_df.select(
-                        f.col("idx").alias("idx_j"),
-                        f.col("variantId").alias("variantId_j"),
-                    )
-                ),
-                on="idx_j",
-                how="inner",
-            )
-            .select("variantId_i", "variantId_j", "r")
+        return PairwiseLD(
+            _df=(
+                self.get_ld_matrix_slice(
+                    gnomad_ancestry, start_index=start_index, end_index=end_index
+                )
+                .join(
+                    (
+                        ld_index_df.select(
+                            f.col("idx").alias("idx_i"),
+                            f.col("variantId").alias("variantId_i"),
+                        )
+                    ),
+                    on="idx_i",
+                    how="inner",
+                )
+                .join(
+                    (
+                        ld_index_df.select(
+                            f.col("idx").alias("idx_j"),
+                            f.col("variantId").alias("variantId_j"),
+                        )
+                    ),
+                    on="idx_j",
+                    how="inner",
+                )
+                .select("variantId_i", "variantId_j", "r")
+            ),
+            _schema=PairwiseLD.get_schema(),
         )
 
     def get_ld_matrix_slice(
diff --git a/tests/dataset/test_pairwise_ld.py b/tests/dataset/test_pairwise_ld.py
new file mode 100644
index 000000000..b6b7daf46
--- /dev/null
+++ b/tests/dataset/test_pairwise_ld.py
@@ -0,0 +1,103 @@
+"""Testing pairwise LD dataset."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import numpy as np
+import pytest
+from pyspark.sql import functions as f
+from pyspark.sql.window import Window
+
+from otg.dataset.pairwise_ld import PairwiseLD
+
+if TYPE_CHECKING:
+    from pyspark.sql import SparkSession
+
+
+class TestPairwiseLD:
+    """Test suite for pairwise LD dataset and associated methods."""
+
+    variants = [
+        "1_8_A_C",
+        "1_9_A_C",
+        "1_10_A_C",
+        "1_99_A_C",
+    ]
+
+    @pytest.fixture(scope="class")
+    def mock_pairwise_ld(self: TestPairwiseLD, spark: SparkSession) -> PairwiseLD:
+        """Generate a mock pairwise LD dataset.
+
+        Args:
+            spark (SparkSession): Spark session.
+
+        Returns:
+            PairwiseLD: Mock pairwise LD dataset covering the variants above.
+        """
+        spark = spark.builder.getOrCreate()
+
+        data = [(v1, v2) for v1 in self.variants for v2 in self.variants]
+        return PairwiseLD(
+            _df=(
+                spark.createDataFrame(data, ["variantId_i", "variantId_j"])
+                .withColumn(
+                    "r",
+                    f.row_number()
+                    .over(Window.partitionBy(f.lit("x")).orderBy("variantId_i"))
+                    .cast("double"),
+                )
+                .withColumn(
+                    "r",
+                    f.when(f.col("variantId_i") == f.col("variantId_j"), 1.0).otherwise(
+                        f.col("r")
+                    ),
+                )
+                .persist()
+            ),
+            _schema=PairwiseLD.get_schema(),
+        )
+
+    @staticmethod
+    def test_pairwise_ld__type(mock_pairwise_ld: PairwiseLD) -> None:
+        """Testing type."""
+        assert isinstance(mock_pairwise_ld, PairwiseLD)
+
+    def test_pairwise_ld__get_variants(
+        self: TestPairwiseLD, mock_pairwise_ld: PairwiseLD
+    ) -> None:
+        """Testing function that returns list of variants from the LD table.
+
+        Args:
+            mock_pairwise_ld (PairwiseLD): Mock pairwise LD dataset.
+        """
+        variant_set_expected = set(self.variants)
+        variant_set_from_data = set(mock_pairwise_ld.get_variant_list())
+
+        assert variant_set_from_data == variant_set_expected
+
+    def test_pairwise_ld__r_to_numpy_matrix__type(
+        self: TestPairwiseLD, mock_pairwise_ld: PairwiseLD
+    ) -> None:
+        """Testing the type of the returned numpy array."""
+        assert isinstance(mock_pairwise_ld.r_to_numpy_matrix(), np.ndarray)
+
+    def test_pairwise_ld__r_to_numpy_matrix__dimensions(
+        self: TestPairwiseLD, mock_pairwise_ld: PairwiseLD
+    ) -> None:
+        """Testing the dimensions of the returned numpy array."""
+        assert mock_pairwise_ld.r_to_numpy_matrix().shape == (
+            len(self.variants),
+            len(self.variants),
+        )
+
+    def test_pairwise_ld__overlap_with_locus(
+        self: TestPairwiseLD, mock_pairwise_ld: PairwiseLD
+    ) -> None:
+        """Testing subsetting the LD table with a list of locus variants."""
+        variant_subset = self.variants[1:3]
+
+        assert (
+            mock_pairwise_ld.overlap_with_locus(variant_subset).df.count()
+            == len(variant_subset) ** 2
+        )
diff --git a/tests/datasource/gnomad/test_gnomad_ld.py b/tests/datasource/gnomad/test_gnomad_ld.py
index bc2bd6f45..85dfbd0e1 100644
--- a/tests/datasource/gnomad/test_gnomad_ld.py
+++ b/tests/datasource/gnomad/test_gnomad_ld.py
@@ -10,6 +10,7 @@
 from pyspark.sql import DataFrame, SparkSession
 from pyspark.sql import functions as f
 
+from otg.dataset.pairwise_ld import PairwiseLD
 from otg.datasource.gnomad.ld import GnomADLDMatrix
 
 
@@ -91,7 +92,7 @@ def _setup(self: TestGnomADLDMatrixVariants, spark: SparkSession) -> None:
         )
 
     @pytest.fixture(scope="class")
-    def ld_slice(self: TestGnomADLDMatrixVariants) -> DataFrame:
+    def ld_slice(self: TestGnomADLDMatrixVariants) -> PairwiseLD:
         """Generate a resolved LD slice."""
         return self.gnomad_ld_matrix.get_ld_variants(
             gnomad_ancestry="test-pop",  # observed[0],
@@ -102,14 +103,14 @@ def ld_slice(self: TestGnomADLDMatrixVariants) -> DataFrame:
 
     def test_get_ld_variants__type(
         self: TestGnomADLDMatrixVariants,
-        ld_slice: DataFrame
+        ld_slice: PairwiseLD
     ) -> None:
         """Testing if the function returns the right type."""
         # Do we have the right type?
-        assert isinstance(ld_slice, DataFrame)
+        assert isinstance(ld_slice, PairwiseLD)
         # Do we have a real square?
-        assert sqrt(ld_slice.count()) == int(sqrt(ld_slice.count()))
+        assert sqrt(ld_slice.df.count()) == int(sqrt(ld_slice.df.count()))
 
 
 class TestGnomADLDMatrixSlice:

From 557fadedf3aa12d7e15c55fd3ea0a240318fbfef Mon Sep 17 00:00:00 2001
From: Daniel Considine
Date: Thu, 18 Jan 2024 10:57:54 +0000
Subject: [PATCH 02/33] feat: adding ld matrix extraction

---
 src/otg/method/ld.py | 55 +++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 54 insertions(+), 1 deletion(-)

diff --git a/src/otg/method/ld.py b/src/otg/method/ld.py
index c834efb85..fab66eb9a 100644
--- a/src/otg/method/ld.py
+++ b/src/otg/method/ld.py
@@ -132,7 +132,60 @@ def ld_annotate(
                     f.col("ldPopulationStructure").isNotNull(),
                     cls._calculate_weighted_r_overall(f.col("ldSet")),
                 ),
-            ).drop("ldPopulationStructure")
+            )
+            .drop("ldPopulationStructure")
             ),
             _schema=StudyLocus.get_schema(),
         )
         ._qc_no_population()
         ._qc_unresolved_ld()
+
+
+class LDMatrix:
+    """Class to extract LD matrix from GnomAD."""
+
+    @classmethod
+    def matrix_extract(
+        cls: type[LDMatrix],
+        associations: StudyLocus,
+        studies: StudyIndex,
+        ld_index: LDIndex,
+    ) -> StudyLocus:
+        """Extract LD matrix for a set of studyLocus.
+
+        This function:
+        1. Annotates study locus with population structure information from the study index
+        2. Joins the LD index to the StudyLocus
+
+        Args:
+            associations (StudyLocus): Dataset to be LD annotated
+            studies (StudyIndex): Dataset with study information
+            ld_index (LDIndex): Dataset with LD information for every variant present in LD matrix
+
+        Returns:
+            StudyLocus: Associations annotated with population structure and LD information.
+ """ + return ( + StudyLocus( + _df=( + associations.df + # Annotate study locus with population structure from study index + .join( + studies.df.select( + "studyId", f.explode("ldPopulationStructure") + ), + on="studyId", + how="left", + ) + # Bring LD information from LD Index + .join( + ld_index.df, + on=["variantId", "chromosome"], + how="left", + ) ), _schema=StudyLocus.get_schema(), ) From c952898cd9409e01d6e30434b6fd063f62882036 Mon Sep 17 00:00:00 2001 From: Daniel Considine Date: Thu, 18 Jan 2024 12:33:28 +0000 Subject: [PATCH 03/33] chore: merge from dev --- config/datasets/ot_gcp.yaml | 35 ++++-- config/step/ot_gwas_catalog_ingestion.yaml | 14 +-- .../step/ot_gwas_catalog_study_curation.yaml | 6 +- .../step/ot_gwas_catalog_study_inclusion.yaml | 8 +- config/step/ot_ld_based_clumping.yaml | 2 +- docs/python_api/steps/ld_clump.md | 2 +- .../dags/gwas_catalog_harmonisation.py | 21 ++-- src/airflow/dags/gwas_catalog_preprocess.py | 93 ++++++++++------ src/gentropy/config.py | 4 +- src/gentropy/ld_based_clumping.py | 2 +- utils/update_GWAS_Catalog_data.sh | 102 +++++++++++++----- 11 files changed, 193 insertions(+), 96 deletions(-) diff --git a/config/datasets/ot_gcp.yaml b/config/datasets/ot_gcp.yaml index 8e588ec62..93630c3a3 100644 --- a/config/datasets/ot_gcp.yaml +++ b/config/datasets/ot_gcp.yaml @@ -1,10 +1,33 @@ # Release specific configuration: release_version: "24.01" +version: "XX.XX" release_folder: gs://genetics_etl_python_playground/releases/${datasets.release_version} inputs: gs://genetics_etl_python_playground/input outputs: gs://genetics_etl_python_playground/output/python_etl/parquet/${datasets.version} +## Datasets: +gwas_catalog_dataset: gs://gwas_catalog_data +# Ingestion input files: +gwas_catalog_associations: ${datasets.gwas_catalog_dataset}/curated_inputs/gwas_catalog_associations_ontology_annotated.tsv +gwas_catalog_studies: + - ${datasets.gwas_catalog_dataset}/curated_inputs/gwas_catalog_download_studies.tsv + - ${datasets.gwas_catalog_dataset}/curated_inputs/gwas_catalog_unpublished_studies.tsv +gwas_catalog_ancestries: + - ${datasets.gwas_catalog_dataset}/curated_inputs/gwas_catalog_download_ancestries.tsv + - ${datasets.gwas_catalog_dataset}/curated_inputs/gwas_catalog_unpublished_ancestries.tsv +gwas_catalog_sumstats_lut: ${datasets.gwas_catalog_dataset}/curated_inputs/harmonised_list.txt +gwas_catalog_study_curation: ${datasets.gwas_catalog_dataset}/manifests/gwas_catalog_study_curation.tsv +# Harmonised summary statistics list: +gwas_catalog_summary_stats_list: ${datasets.gwas_catalog_dataset}/manifests/gwas_catalog_harmonised_sumstats_list.txt +# Inclusion lists: +gwas_catalog_curated_inclusion_list: ${datasets.gwas_catalog_dataset}/manifests/gwas_catalog_curated_included_studies +gwas_catalog_summary_satistics_inclusion_list: ${datasets.gwas_catalog_dataset}/manifests/gwas_catalog_summary_statistics_included_studies +# Ingestion output folders: +gwas_catalog_study_index: ${datasets.gwas_catalog_dataset}/study_index +gwas_catalog_study_locus_folder: ${datasets.gwas_catalog_dataset}/study_locus_datasets +gwas_catalog_credible_set_folder: ${datasets.gwas_catalog_dataset}/credible_set_datasets + # Input datasets chain_37_38: ${datasets.inputs}/v2g_input/grch37_to_grch38.over.chain target_index: ${datasets.inputs}/v2g_input/targets_correct_tss @@ -13,16 +36,6 @@ anderson: gs://genetics-portal-input/v2g_input/andersson2014/enhancer_tss_associ javierre: gs://genetics-portal-input/v2g_input/javierre_2016_preprocessed.parquet jung: 
gs://genetics-portal-raw/pchic_jung2019/jung2019_pchic_tableS3.csv thurman: gs://genetics-portal-input/v2g_input/thurman2012/genomewideCorrs_above0.7_promoterPlusMinus500kb_withGeneNames_32celltypeCategories.bed8.gz -catalog_associations: ${datasets.inputs}/v2d/gwas_catalog_v1.0.2-associations_e110_r2023-12-21.tsv -catalog_studies: - # To get a complete representation of all GWAS Catalog studies, we need to - # ingest the list of unpublished studies from a different file. - - ${datasets.inputs}/v2d/gwas-catalog-v1.0.3-studies-r2023-12-21.tsv - - ${datasets.inputs}/v2d/gwas-catalog-v1.0.3-unpublished-studies-r2023-12-21.tsv -catalog_ancestries: - - ${datasets.inputs}/v2d/gwas-catalog-v1.0.3-ancestries-r2023-12-21.tsv - - ${datasets.inputs}/v2d/gwas-catalog-v1.0.3-unpublished-ancestries-r2023-12-21.tsv -catalog_sumstats_lut: ${datasets.inputs}/v2d/harmonised_list-r2023-12-21.txt gene_interactions: ${datasets.inputs}/l2g/interaction # 23.09 data eqtl_catalogue_paths_imported: ${datasets.inputs}/preprocess/eqtl_catalogue/tabix_ftp_paths_imported.tsv @@ -37,7 +50,7 @@ study_locus_overlap: ${datasets.outputs}/study_locus_overlap ld_index: ${datasets.outputs}/ld_index catalog_study_index: ${datasets.study_index}/catalog catalog_study_locus: ${datasets.study_locus}/catalog_study_locus -gwas_catalog_study_curation: ${datasets.inputs}/v2d/GWAS_Catalog_study_curation.tsv + finngen_study_index: ${datasets.study_index}/finngen finngen_summary_stats: ${datasets.summary_statistics}/finngen from_sumstats_study_locus: ${datasets.study_locus}/from_sumstats diff --git a/config/step/ot_gwas_catalog_ingestion.yaml b/config/step/ot_gwas_catalog_ingestion.yaml index 65606b7e4..fc82b82c2 100644 --- a/config/step/ot_gwas_catalog_ingestion.yaml +++ b/config/step/ot_gwas_catalog_ingestion.yaml @@ -1,12 +1,12 @@ defaults: - gwas_catalog_ingestion -catalog_study_files: ${datasets.catalog_studies} -catalog_ancestry_files: ${datasets.catalog_ancestries} -catalog_associations_file: ${datasets.catalog_associations} -catalog_sumstats_lut: ${datasets.catalog_sumstats_lut} +catalog_study_files: ${datasets.gwas_catalog_studies} +catalog_ancestry_files: ${datasets.gwas_catalog_ancestries} +catalog_associations_file: ${datasets.gwas_catalog_associations} +catalog_sumstats_lut: ${datasets.gwas_catalog_sumstats_lut} variant_annotation_path: ${datasets.variant_annotation} -catalog_studies_out: ${datasets.catalog_study_index} -catalog_associations_out: ${datasets.catalog_study_locus} +catalog_studies_out: ${datasets.gwas_catalog_study_index} +catalog_associations_out: ${datasets.gwas_catalog_study_locus_folder}/gwas_catalog_curated_associations gwas_catalog_study_curation_file: ${datasets.gwas_catalog_study_curation} -inclusion_list_path: ??? 
+inclusion_list_path: ${datasets.gwas_catalog_curated_inclusion_list} diff --git a/config/step/ot_gwas_catalog_study_curation.yaml b/config/step/ot_gwas_catalog_study_curation.yaml index eb6c0ec78..77c1d7834 100644 --- a/config/step/ot_gwas_catalog_study_curation.yaml +++ b/config/step/ot_gwas_catalog_study_curation.yaml @@ -1,8 +1,8 @@ defaults: - gwas_catalog_study_curation -catalog_study_files: ${datasets.catalog_studies} -catalog_ancestry_files: ${datasets.catalog_ancestries} -catalog_sumstats_lut: ${datasets.catalog_sumstats_lut} +catalog_study_files: ${datasets.gwas_catalog_studies} +catalog_ancestry_files: ${datasets.gwas_catalog_ancestries} +catalog_sumstats_lut: ${datasets.gwas_catalog_sumstats_lut} gwas_catalog_study_curation_file: ${datasets.gwas_catalog_study_curation} gwas_catalog_study_curation_out: ??? diff --git a/config/step/ot_gwas_catalog_study_inclusion.yaml b/config/step/ot_gwas_catalog_study_inclusion.yaml index 8a560127e..7f3bf80b3 100644 --- a/config/step/ot_gwas_catalog_study_inclusion.yaml +++ b/config/step/ot_gwas_catalog_study_inclusion.yaml @@ -1,12 +1,12 @@ defaults: - gwas_catalog_study_inclusion -catalog_study_files: ${datasets.catalog_studies} -catalog_ancestry_files: ${datasets.catalog_ancestries} -catalog_associations_file: ${datasets.catalog_associations} +catalog_study_files: ${datasets.gwas_catalog_studies} +catalog_ancestry_files: ${datasets.gwas_catalog_ancestries} +catalog_associations_file: ${datasets.gwas_catalog_associations} variant_annotation_path: ${datasets.variant_annotation} gwas_catalog_study_curation_file: ${datasets.gwas_catalog_study_curation} -harmonised_study_file: ??? +harmonised_study_file: ${datasets.gwas_catalog_summary_stats_list} criteria: ??? inclusion_list_path: ??? exclusion_list_path: ??? diff --git a/config/step/ot_ld_based_clumping.yaml b/config/step/ot_ld_based_clumping.yaml index d25ca84b7..d02c0acdd 100644 --- a/config/step/ot_ld_based_clumping.yaml +++ b/config/step/ot_ld_based_clumping.yaml @@ -1,7 +1,7 @@ defaults: - ld_based_clumping +ld_index_path: ${datasets.ld_index} study_locus_input_path: ??? -ld_index_path: ??? study_index_path: ??? clumped_study_locus_output_path: ??? 
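
A note on the step configurations above: the `???` values are Hydra's marker for mandatory fields, so they have to be supplied as `step.*` overrides at launch time, while `ld_index_path` now picks up a default from `${datasets.ld_index}`. A minimal sketch of such an override list, mirroring the `other_args` lists the Airflow DAGs later in this patch pass to the cluster (the `bucket` and `ld_clumping_overrides` names are purely illustrative; the paths follow the `gwas_catalog_data` bucket layout introduced here):

    # Illustrative only: Hydra overrides filling the mandatory ("???") fields of
    # the ld_based_clumping step. step.ld_index_path is omitted, as it now
    # defaults to ${datasets.ld_index}.
    bucket = "gs://gwas_catalog_data"

    ld_clumping_overrides = [
        f"step.study_locus_input_path={bucket}/study_locus_datasets/gwas_catalog_summary_stats_window_clumped",
        f"step.study_index_path={bucket}/study_index",
        f"step.clumped_study_locus_output_path={bucket}/study_locus_datasets/gwas_catalog_summary_stats_ld_clumped",
    ]
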
diff --git a/docs/python_api/steps/ld_clump.md b/docs/python_api/steps/ld_clump.md index 75097bd8c..00fc8f2e4 100644 --- a/docs/python_api/steps/ld_clump.md +++ b/docs/python_api/steps/ld_clump.md @@ -2,4 +2,4 @@ title: LD-based clumping --- -::: gentropy.ld_based_clumping.LdBasedClumpingStep +::: gentropy.ld_based_clumping.LDBasedClumpingStep diff --git a/src/airflow/dags/gwas_catalog_harmonisation.py b/src/airflow/dags/gwas_catalog_harmonisation.py index 7e7790224..5713e223d 100644 --- a/src/airflow/dags/gwas_catalog_harmonisation.py +++ b/src/airflow/dags/gwas_catalog_harmonisation.py @@ -14,7 +14,9 @@ CLUSTER_NAME = "otg-gwascatalog-harmonisation" AUTOSCALING = "gwascatalog-harmonisation" -SUMMARY_STATS_BUCKET_NAME = "open-targets-gwas-summary-stats" +SUMMARY_STATS_BUCKET_NAME = "gwas_catalog_data" +RAW_SUMMARY_STATISTICS_PREFIX = "raw_summary_statistics" +HARMONISED_SUMMARY_STATISTICS_PREFIX = "harmonised_summary_statistics" with DAG( dag_id=Path(__file__).stem, @@ -26,14 +28,14 @@ list_inputs = GCSListObjectsOperator( task_id="list_raw_harmonised", bucket=SUMMARY_STATS_BUCKET_NAME, - prefix="raw-harmonised", + prefix=RAW_SUMMARY_STATISTICS_PREFIX, match_glob="**/*.h.tsv.gz", ) # List parquet files that have been previously processed list_outputs = GCSListObjectsOperator( task_id="list_harmonised_parquet", bucket=SUMMARY_STATS_BUCKET_NAME, - prefix="harmonised", + prefix=HARMONISED_SUMMARY_STATISTICS_PREFIX, match_glob="**/_SUCCESS", ) @@ -59,11 +61,15 @@ def create_to_do_list(**kwargs: Any) -> Any: print("Number of parquet files: ", len(parquets)) # noqa: T201 for path in raw_harmonised: match_result = re.search( - r"raw-harmonised/(.*)/(GCST\d+)/harmonised/(.*)\.h\.tsv\.gz", path + rf"{RAW_SUMMARY_STATISTICS_PREFIX}/(.*)/(GCST\d+)/harmonised/(.*)\.h\.tsv\.gz", + path, ) if match_result: study_id = match_result.group(2) - if f"harmonised/{study_id}.parquet/_SUCCESS" not in parquets: + if ( + f"{HARMONISED_SUMMARY_STATISTICS_PREFIX}/{study_id}.parquet/_SUCCESS" + not in parquets + ): to_do_list.append(path) print("Number of jobs to submit: ", len(to_do_list)) # noqa: T201 ti.xcom_push(key="to_do_list", value=to_do_list) @@ -85,7 +91,8 @@ def submit_jobs(**kwargs: Any) -> None: time.sleep(60) input_path = todo[i] match_result = re.search( - r"raw-harmonised/(.*)/(GCST\d+)/harmonised/(.*)\.h\.tsv\.gz", input_path + rf"{RAW_SUMMARY_STATISTICS_PREFIX}/(.*)/(GCST\d+)/harmonised/(.*)\.h\.tsv\.gz", + input_path, ) if match_result: study_id = match_result.group(2) @@ -95,7 +102,7 @@ def submit_jobs(**kwargs: Any) -> None: step_id="ot_gwas_catalog_sumstat_preprocess", other_args=[ f"step.raw_sumstats_path=gs://{SUMMARY_STATS_BUCKET_NAME}/{input_path}", - f"step.out_sumstats_path=gs://{SUMMARY_STATS_BUCKET_NAME}/harmonised/{study_id}.parquet", + f"step.out_sumstats_path=gs://{SUMMARY_STATS_BUCKET_NAME}/{HARMONISED_SUMMARY_STATISTICS_PREFIX}/{study_id}.parquet", ], ) diff --git a/src/airflow/dags/gwas_catalog_preprocess.py b/src/airflow/dags/gwas_catalog_preprocess.py index 33d4062a9..9df22586a 100644 --- a/src/airflow/dags/gwas_catalog_preprocess.py +++ b/src/airflow/dags/gwas_catalog_preprocess.py @@ -13,11 +13,44 @@ CLUSTER_NAME = "otg-preprocess-gwascatalog" AUTOSCALING = "otg-preprocess-gwascatalog" -RELEASEBUCKET = "gs://genetics_etl_python_playground/output/python_etl/parquet/XX.XX" -RELEASEBUCKET_NAME = "genetics_etl_python_playground" -SUMMARY_STATS_BUCKET_NAME = "open-targets-gwas-summary-stats" -SUMSTATS = "gs://open-targets-gwas-summary-stats/harmonised" -MANIFESTS_PATH = 
f"{RELEASEBUCKET}/manifests/" +# Setting up bucket name and output object names: +GWAS_CATALOG_BUCKET_NAME = "gwas_catalog_data" +HARMONISED_SUMSTATS_PREFIX = "harmonised_summary_statistics" + +# Manifest paths: +MANIFESTS_PATH = f"gs://{GWAS_CATALOG_BUCKET_NAME}/manifests/" + +# The name of the manifest files have to be consistent with the config file: +HARMONISED_SUMSTATS_LIST_OBJECT_NAME = ( + "manifests/gwas_catalog_harmonised_sumstats_list.txt" +) +HARMONISED_SUMSTATS_LIST_FULL_NAME = ( + f"gs://{GWAS_CATALOG_BUCKET_NAME}/{HARMONISED_SUMSTATS_LIST_OBJECT_NAME}" +) +CURATION_INCLUSION_NAME = f"{MANIFESTS_PATH}/gwas_catalog_curated_included_studies" +CURATION_EXCLUSION_NAME = f"{MANIFESTS_PATH}/gwas_catalog_curation_excluded_studies" +SUMMARY_STATISTICS_INCLUSION_NAME = ( + f"{MANIFESTS_PATH}/gwas_catalog_summary_statistics_included_studies" +) +SUMMARY_STATISTICS_EXCLUSION_NAME = ( + f"{MANIFESTS_PATH}/gwas_catalog_summary_statistics_excluded_studies" +) + +# Study index: +STUDY_INDEX = f"gs://{GWAS_CATALOG_BUCKET_NAME}/study_index" + +# Study loci: +CURATED_STUDY_LOCI = f"gs://{GWAS_CATALOG_BUCKET_NAME}/study_locus_datasets/gwas_catalog_curated_associations" +CURATED_LD_CLUMPED = f"gs://{GWAS_CATALOG_BUCKET_NAME}/study_locus_datasets/gwas_catalog_curated_associations_ld_clumped" +WINDOW_BASED_CLUMPED = f"gs://{GWAS_CATALOG_BUCKET_NAME}/study_locus_datasets/gwas_catalog_summary_stats_window_clumped" +LD_BASED_CLUMPED = f"gs://{GWAS_CATALOG_BUCKET_NAME}/study_locus_datasets/gwas_catalog_summary_stats_ld_clumped" +# Credible sets: +CURATED_CREDIBLE_SETS = ( + f"gs://{GWAS_CATALOG_BUCKET_NAME}/credible_set_datasets/gwas_catalog_curated" +) +SUMMARY_STATISTICS_CREDIBLE_SETS = ( + f"gs://{GWAS_CATALOG_BUCKET_NAME}/credible_set_datasets/gwas_catalog_summary_stats" +) def upload_harmonized_study_list( @@ -48,8 +81,8 @@ def upload_harmonized_study_list( # Getting list of folders (each a gwas study with summary statistics) list_harmonised_sumstats = GCSListObjectsOperator( task_id="list_harmonised_parquet", - bucket=SUMMARY_STATS_BUCKET_NAME, - prefix="harmonised", + bucket=GWAS_CATALOG_BUCKET_NAME, + prefix=HARMONISED_SUMSTATS_PREFIX, match_glob="**/_SUCCESS", ) @@ -59,8 +92,8 @@ def upload_harmonized_study_list( python_callable=upload_harmonized_study_list, op_kwargs={ "concatenated_studies": '{{ "\n".join(ti.xcom_pull( key="return_value", task_ids="list_harmonised_parquet")) }}', - "bucket_name": RELEASEBUCKET_NAME, - "object_name": "output/python_etl/parquet/XX.XX/manifests/harmonised_sumstats.txt", + "bucket_name": GWAS_CATALOG_BUCKET_NAME, + "object_name": HARMONISED_SUMSTATS_LIST_OBJECT_NAME, }, ) @@ -73,9 +106,9 @@ def upload_harmonized_study_list( task_id="catalog_curation_inclusion_list", other_args=[ "step.criteria=curation", - f"step.inclusion_list_path={MANIFESTS_PATH}manifest_curation", - f"step.exclusion_list_path={MANIFESTS_PATH}exclusion_curation", - f"step.harmonised_study_file={MANIFESTS_PATH}harmonised_sumstats.txt", + f"step.inclusion_list_path={CURATION_INCLUSION_NAME}", + f"step.exclusion_list_path={CURATION_EXCLUSION_NAME}", + f"step.harmonised_study_file={HARMONISED_SUMSTATS_LIST_FULL_NAME}", ], ) @@ -84,7 +117,7 @@ def upload_harmonized_study_list( cluster_name=CLUSTER_NAME, step_id="ot_gwas_catalog_ingestion", task_id="ingest_curated_gwas_catalog_data", - other_args=[f"step.inclusion_list_path={MANIFESTS_PATH}manifest_curation"], + other_args=[f"step.inclusion_list_path={CURATION_INCLUSION_NAME}"], ) # Run LD-annotation and clumping on curated data: @@ -93,10 +126,9 
@@ def upload_harmonized_study_list( step_id="ot_ld_based_clumping", task_id="catalog_curation_ld_clumping", other_args=[ - f"step.study_locus_input_path={RELEASEBUCKET}/study_locus/catalog_curated", - f"step.ld_index_path={RELEASEBUCKET}/ld_index", - f"step.study_index_path={RELEASEBUCKET}/study_index/catalog", - f"step.clumped_study_locus_output_path={RELEASEBUCKET}/study_locus/ld_clumped/catalog_curated", + f"step.study_locus_input_path={CURATED_STUDY_LOCI}", + f"step.study_index_path={STUDY_INDEX}", + f"step.clumped_study_locus_output_path={CURATED_LD_CLUMPED}", ], ) @@ -106,8 +138,8 @@ def upload_harmonized_study_list( step_id="ot_pics", task_id="catalog_curation_pics", other_args=[ - f"step.study_locus_ld_annotated_in={RELEASEBUCKET}/study_locus/ld_clumped/catalog_curated", - f"step.picsed_study_locus_out={RELEASEBUCKET}/credible_set/catalog_curated", + f"step.study_locus_ld_annotated_in={CURATED_LD_CLUMPED}", + f"step.picsed_study_locus_out={CURATED_CREDIBLE_SETS}", ], ) @@ -130,9 +162,9 @@ def upload_harmonized_study_list( task_id="catalog_sumstats_inclusion_list", other_args=[ "step.criteria=summary_stats", - f"step.inclusion_list_path={MANIFESTS_PATH}manifest_sumstats", - f"step.exclusion_list_path={MANIFESTS_PATH}exclusion_sumstats", - f"step.harmonised_study_file={MANIFESTS_PATH}harmonised_sumstats.txt", + f"step.inclusion_list_path={SUMMARY_STATISTICS_INCLUSION_NAME}", + f"step.exclusion_list_path={SUMMARY_STATISTICS_EXCLUSION_NAME}", + f"step.harmonised_study_file={HARMONISED_SUMSTATS_LIST_FULL_NAME}", ], ) @@ -142,9 +174,9 @@ def upload_harmonized_study_list( step_id="ot_window_based_clumping", task_id="catalog_sumstats_window_clumping", other_args=[ - f"step.summary_statistics_input_path={SUMSTATS}", - f"step.study_locus_output_path={RELEASEBUCKET}/study_locus/window_clumped/from_sumstats/catalog", - f"step.inclusion_list_path={MANIFESTS_PATH}manifest_sumstats", + f"step.summary_statistics_input_path=gs://{GWAS_CATALOG_BUCKET_NAME}/{HARMONISED_SUMSTATS_PREFIX}", + f"step.inclusion_list_path={SUMMARY_STATISTICS_INCLUSION_NAME}", + f"step.study_locus_output_path={WINDOW_BASED_CLUMPED}", ], ) @@ -154,10 +186,9 @@ def upload_harmonized_study_list( step_id="ot_ld_based_clumping", task_id="catalog_sumstats_ld_clumping", other_args=[ - f"step.study_locus_input_path={RELEASEBUCKET}/study_locus/window_clumped/from_sumstats/catalog", - f"step.ld_index_path={RELEASEBUCKET}/ld_index", - f"step.study_index_path={RELEASEBUCKET}/study_index/catalog", - f"step.clumped_study_locus_output_path={RELEASEBUCKET}/study_locus/ld_clumped/from_sumstats/catalog", + f"step.study_locus_input_path={WINDOW_BASED_CLUMPED}", + f"step.study_index_path={STUDY_INDEX}", + f"step.clumped_study_locus_output_path={LD_BASED_CLUMPED}", ], ) @@ -167,8 +198,8 @@ def upload_harmonized_study_list( step_id="ot_pics", task_id="catalog_sumstats_pics", other_args=[ - f"step.study_locus_ld_annotated_in={RELEASEBUCKET}/study_locus/ld_clumped/from_sumstats/catalog", - f"step.picsed_study_locus_out={RELEASEBUCKET}/credible_set/from_sumstats/catalog", + f"step.study_locus_ld_annotated_in={LD_BASED_CLUMPED}", + f"step.picsed_study_locus_out={SUMMARY_STATISTICS_CREDIBLE_SETS}", ], ) diff --git a/src/gentropy/config.py b/src/gentropy/config.py index ba213a349..ac9ce821d 100644 --- a/src/gentropy/config.py +++ b/src/gentropy/config.py @@ -78,7 +78,7 @@ class GWASCatalogStudyInclusionConfig(StepConfig): inclusion_list_path: str = MISSING exclusion_list_path: str = MISSING _target_: str = ( - 
"gentropy.gwas_catalog_study_inclusion.GWASCatalogStudyInclusionStep" + "gentropy.gwas_catalog_study_inclusion.GWASCatalogStudyInclusionGenerator" ) @@ -301,7 +301,7 @@ class WindowBasedClumpingStep(StepConfig): inclusion_list_path: str = MISSING locus_collect_distance: str | None = None - _target_: str = "gentropy.clump.WindowBasedClumpingStep" + _target_: str = "gentropy.window_based_clumping.WindowBasedClumpingStep" @dataclass diff --git a/src/gentropy/ld_based_clumping.py b/src/gentropy/ld_based_clumping.py index ea9646806..e6a477a89 100644 --- a/src/gentropy/ld_based_clumping.py +++ b/src/gentropy/ld_based_clumping.py @@ -7,7 +7,7 @@ from gentropy.dataset.study_locus import StudyLocus -class LdBasedClumpingStep: +class LDBasedClumpingStep: """Step to perform LD-based clumping on study locus dataset. As a first step, study locus is enriched with population specific linked-variants. diff --git a/utils/update_GWAS_Catalog_data.sh b/utils/update_GWAS_Catalog_data.sh index 98ac4cc36..1e380d30c 100755 --- a/utils/update_GWAS_Catalog_data.sh +++ b/utils/update_GWAS_Catalog_data.sh @@ -1,6 +1,5 @@ #!/usr/bin/env bash - # Function to get the most recent date: get_most_recent(){ cat $1 | perl -lane 'push @a, $_ if $_ =~ /^\d+$/; END {@a = sort { $a <=> $b} @a; print pop @a }' @@ -21,13 +20,47 @@ get_release_info(){ logging(){ log_prompt="[$(date "+%Y.%m.%d %H:%M")]" - echo "${log_prompt} $@" + echo "${log_prompt} $@" >> ${LOG_FILE} +} + +upload_file_to_gcp(){ + FILENAME=${1} + TARGET=${2} + # Test if file exists: + if [ ! -f ${FILENAME} ]; then + logging "File ${FILENAME} does not exist." + return + fi + + logging "Copying ${FILENAME} to GCP..." + gsutil -mq cp file://$(pwd)/${FILENAME} ${TARGET} + + # Test if file was successfully uploaded: + if [ $? -ne 0 ]; then + logging "File ${FILENAME} failed to upload." + fi } # Resources: export BASE_URL=ftp://ftp.ebi.ac.uk/pub/databases/gwas export RELEASE_INFO_URL=https://www.ebi.ac.uk/gwas/api/search/stats -export GCP_TARGET=gs://genetics_etl_python_playground/input/v2d/ +export GCP_TARGET=gs://gwas_catalog_data +export LOG_FILE=gwas_catalog_data_update.log + +export GWAS_CATALOG_STUDY_CURATION_URL=https://raw.githubusercontent.com/opentargets/curation/master/genetics/GWAS_Catalog_study_curation.tsv + +ASSOCIATION_FILE=gwas_catalog_associations_ontology_annotated.tsv +PUBLISHED_STUDIES_FILE=gwas_catalog_download_studies.tsv +PUBLISHED_ANCESTRIES_FILE=gwas_catalog_download_ancestries.tsv +UNPUBLISHED_STUDIES_FILE=gwas_catalog_unpublished_studies.tsv +UNPUBLISHED_ANCESTRIES_FILE=gwas_catalog_unpublished_ancestries.tsv +HARMONISED_LIST_FILE=harmonised_list.txt +GWAS_CATALOG_STUDY_CURATION_FILE=gwas_catalog_study_curation.tsv + +# Remove log file if exists: +if [ -f ${LOG_FILE} ]; then + rm -rf ${LOG_FILE} +fi logging "Extracing data from: ${BASE_URL}" logging "Release info fetched fom: ${RELEASE_INFO_URL}" @@ -47,36 +80,49 @@ RELEASE_URL=${BASE_URL}/releases/${YEAR}/${MONTH}/${DAY} logging "Datafiles are fetching from ${RELEASE_URL}" # Fetching files while assigning properly dated and annotated names: -wget -q ${RELEASE_URL}/gwas-catalog-associations_ontology-annotated.tsv \ - -O gwas_catalog_v1.0.2-associations_e${ENSEMBL}_r${YEAR}-${MONTH}-${DAY}.tsv -logging "File gwas_catalog_v1.0.2-associations_e${ENSEMBL}_r${YEAR}-${MONTH}-${DAY}.tsv saved." +wget -q ${RELEASE_URL}/gwas-catalog-associations_ontology-annotated.tsv -O ${ASSOCIATION_FILE} +logging "File ${ASSOCIATION_FILE} saved." 
-wget -q ${RELEASE_URL}/gwas-catalog-download-studies-v1.0.3.txt \ - -O gwas-catalog-v1.0.3-studies-r${YEAR}-${MONTH}-${DAY}.tsv -logging "File gwas-catalog-v1.0.3-studies-r${YEAR}-${MONTH}-${DAY}.tsv saved." +wget -q ${RELEASE_URL}/gwas-catalog-download-studies-v1.0.3.txt -O ${PUBLISHED_STUDIES_FILE} +logging "File ${PUBLISHED_STUDIES_FILE} saved." -wget -q ${RELEASE_URL}/gwas-catalog-unpublished-studies-v1.0.3.tsv \ - -O gwas-catalog-v1.0.3-unpublished-studies-r${YEAR}-${MONTH}-${DAY}.tsv -logging "File gwas-catalog-v1.0.3-unpublished-studies-r${YEAR}-${MONTH}-${DAY}.tsv saved." +wget -q ${RELEASE_URL}/gwas-catalog-unpublished-studies-v1.0.3.tsv -O ${UNPUBLISHED_STUDIES_FILE} +logging "File ${UNPUBLISHED_STUDIES_FILE} saved." -wget -q ${RELEASE_URL}/gwas-catalog-download-ancestries-v1.0.3.txt \ - -O gwas-catalog-v1.0.3-ancestries-r${YEAR}-${MONTH}-${DAY}.tsv -logging "File gwas-catalog-v1.0.3-ancestries-r${YEAR}-${MONTH}-${DAY}.tsv saved." +wget -q ${RELEASE_URL}/gwas-catalog-download-ancestries-v1.0.3.txt -O ${PUBLISHED_ANCESTRIES_FILE} +logging "File ${PUBLISHED_ANCESTRIES_FILE} saved." -wget -q ${RELEASE_URL}/gwas-catalog-unpublished-ancestries-v1.0.3.tsv \ - -O gwas-catalog-v1.0.3-unpublished-ancestries-r${YEAR}-${MONTH}-${DAY}.tsv -logging "File gwas-catalog-v1.0.3-unpublished-ancestries-r${YEAR}-${MONTH}-${DAY}.tsv saved." +wget -q ${RELEASE_URL}/gwas-catalog-unpublished-ancestries-v1.0.3.tsv -O ${UNPUBLISHED_ANCESTRIES_FILE} +logging "File ${UNPUBLISHED_ANCESTRIES_FILE} saved." +wget -q ${BASE_URL}/summary_statistics/harmonised_list.txt -O ${HARMONISED_LIST_FILE} +logging "File ${HARMONISED_LIST_FILE} saved." -wget -q ${BASE_URL}/summary_statistics/harmonised_list.txt -O harmonised_list-r${YEAR}-${MONTH}-${DAY}.txt -logging "File harmonised_list-r${YEAR}-${MONTH}-${DAY}.txt saved." +wget -q ${GWAS_CATALOG_STUDY_CURATION_URL} -O ${GWAS_CATALOG_STUDY_CURATION_FILE} +logging "In-house GWAS Catalog study curation file fetched from GitHub." logging "Copying files to GCP..." -gsutil -mq cp file://$(pwd)/gwas_catalog_v1.0.2-associations_e${ENSEMBL}_r${YEAR}-${MONTH}-${DAY}.tsv ${GCP_TARGET}/ -gsutil -mq cp file://$(pwd)/gwas-catalog-v1.0.3-studies-r${YEAR}-${MONTH}-${DAY}.tsv ${GCP_TARGET}/ -gsutil -mq cp file://$(pwd)/gwas-catalog-v1.0.3-ancestries-r${YEAR}-${MONTH}-${DAY}.tsv ${GCP_TARGET}/ -gsutil -mq cp file://$(pwd)/harmonised_list-r${YEAR}-${MONTH}-${DAY}.txt ${GCP_TARGET}/ -gsutil -mq cp file://$(pwd)/gwas-catalog-v1.0.3-unpublished-studies-r${YEAR}-${MONTH}-${DAY}.tsv ${GCP_TARGET}/ -gsutil -mq cp file://$(pwd)/gwas-catalog-v1.0.3-unpublished-ancestries-r${YEAR}-${MONTH}-${DAY}.tsv ${GCP_TARGET}/ - -logging "Done." + +upload_file_to_gcp ${ASSOCIATION_FILE} ${GCP_TARGET}/curated_inputs/ +upload_file_to_gcp ${PUBLISHED_STUDIES_FILE} ${GCP_TARGET}/curated_inputs/ +upload_file_to_gcp ${PUBLISHED_ANCESTRIES_FILE} ${GCP_TARGET}/curated_inputs/ +upload_file_to_gcp ${HARMONISED_LIST_FILE} ${GCP_TARGET}/curated_inputs/ +upload_file_to_gcp ${UNPUBLISHED_STUDIES_FILE} ${GCP_TARGET}/curated_inputs/ +upload_file_to_gcp ${UNPUBLISHED_ANCESTRIES_FILE} ${GCP_TARGET}/curated_inputs/ +upload_file_to_gcp ${GWAS_CATALOG_STUDY_CURATION_FILE} ${GCP_TARGET}/manifests/ + + +logging "Files successfully uploaded." +logging "Removing local files..." 
+rm ${ASSOCIATION_FILE} \ + ${PUBLISHED_STUDIES_FILE} \ + ${PUBLISHED_ANCESTRIES_FILE} \ + ${HARMONISED_LIST_FILE} \ + ${UNPUBLISHED_STUDIES_FILE} \ + ${UNPUBLISHED_ANCESTRIES_FILE} \ + ${GWAS_CATALOG_STUDY_CURATION_FILE} + +# Uploading log file to GCP manifest folder: +logging "Uploading log file to GCP manifest folder..." +upload_file_to_gcp ${LOG_FILE} ${GCP_TARGET}/manifests/ +cat $LOG_FILE From 82107ec28246899c89154a1d3d3238c065475244 Mon Sep 17 00:00:00 2001 From: Daniel Considine Date: Thu, 1 Feb 2024 20:22:53 +0000 Subject: [PATCH 04/33] feat: index and block matrix extraction for studyLocus --- .pre-commit-config.yaml | 4 +- src/gentropy/dataset/study_index.py | 11 ++++++ src/gentropy/dataset/study_locus.py | 58 ++++++++++++++++++++++++++++ src/gentropy/datasource/gnomad/ld.py | 2 + 4 files changed, 73 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index c5330c4fe..c78b367b7 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.1.13 + rev: v0.1.14 hooks: - id: ruff args: @@ -91,6 +91,6 @@ repos: - id: beautysh - repo: https://github.com/jsh9/pydoclint - rev: 0.3.8 + rev: 0.3.9 hooks: - id: pydoclint diff --git a/src/gentropy/dataset/study_index.py b/src/gentropy/dataset/study_index.py index d048157d6..89a914c4b 100644 --- a/src/gentropy/dataset/study_index.py +++ b/src/gentropy/dataset/study_index.py @@ -183,3 +183,14 @@ def has_summarystats(self: StudyIndex) -> Column: Column: True if the study has harmonized summary statistics. """ return self.df.hasSumstats + + def get_major_pop(self: StudyIndex) -> DataFrame: + """Extract major population from ldPopulationStructure rows with multiple ancestries. + + Returns: + DataFrame: Columns studyId and the extracted major population from ldPopulationStructure. + """ + return self.df.select("studyId", "ldPopulationStructure").withColumn( + "ldPopulationStructure", + f.array_max(f.col("ldPopulationStructure")).getItem("ldPopulation"), + ) diff --git a/src/gentropy/dataset/study_locus.py b/src/gentropy/dataset/study_locus.py index 41c099959..050246187 100644 --- a/src/gentropy/dataset/study_locus.py +++ b/src/gentropy/dataset/study_locus.py @@ -5,7 +5,9 @@ from enum import Enum from typing import TYPE_CHECKING +import numpy as np import pyspark.sql.functions as f +from hail.linalg import BlockMatrix from gentropy.common.schemas import parse_spark_schema from gentropy.common.spark_helpers import ( @@ -14,12 +16,14 @@ ) from gentropy.dataset.dataset import Dataset from gentropy.dataset.study_locus_overlap import StudyLocusOverlap +from gentropy.datasource.gnomad.ld import GnomADLDMatrix from gentropy.method.clump import LDclumping if TYPE_CHECKING: from pyspark.sql import Column, DataFrame from pyspark.sql.types import StructType + from gentropy.common.session import Session from gentropy.dataset.ld_index import LDIndex from gentropy.dataset.study_index import StudyIndex @@ -453,6 +457,60 @@ def clump(self: StudyLocus) -> StudyLocus: ) return self + def get_gnomad_matrix( + self: StudyLocus, session: Session, study_index: StudyIndex, window_size: int + ) -> tuple[DataFrame, np.ndarray]: + """Extract hail matrix index and block matrix from StudyLocus rows. 
+ + Args: + session (Session): Spark session + study_index (StudyIndex): Study index + window_size (int): Window size to extract from gnomad matrix + + Returns: + tuple[DataFrame, np.ndarray]: Returns the index of the gnomad matrix and the gnomad block matrix + """ + _df = ( + self.df.join(study_index.get_major_pop(), on="studyId", how="left") + .withColumn("start", f.col("position") - (window_size / 2)) + .withColumn("end", f.col("position") + (window_size / 2)) + .alias("_df") + ) + _pop = _df.collect()[0]["ldPopulationStructure"] + + _matrix_index = session.spark.read.parquet( + GnomADLDMatrix.ld_index_38_template.format(POP=_pop) + ) + _index_joined = ( + _df.alias("df") + .join( + _matrix_index.alias("matrix_index"), + (f.col("df.chromosome") == f.col("matrix_index.chromosome")) + & (f.col("df.start") <= f.col("matrix_index.position")) + & (f.col("df.end") >= f.col("matrix_index.position")), + ) + .select( + "matrix_index.chromosome", + "matrix_index.position", + "referenceAllele", + "alternateAllele", + "idx", + ) + ).persist() + _idx = ( + _index_joined.sort("idx").select("idx").rdd.flatMap(lambda x: x).collect() + ) + _block_matrix = ( + BlockMatrix.read(GnomADLDMatrix.ld_matrix_template.format(POP=_pop)) + .filter(_idx, _idx) + .to_numpy() + ) + _block_matrix = ( + _block_matrix + _block_matrix.T - np.diag(np.diag(_block_matrix)) + ) + + return _index_joined, _block_matrix + def _qc_unresolved_ld( self: StudyLocus, ) -> StudyLocus: diff --git a/src/gentropy/datasource/gnomad/ld.py b/src/gentropy/datasource/gnomad/ld.py index 1d2d6c18f..858751690 100644 --- a/src/gentropy/datasource/gnomad/ld.py +++ b/src/gentropy/datasource/gnomad/ld.py @@ -28,12 +28,14 @@ class GnomADLDMatrix: Attributes: ld_matrix_template (str): Template for the LD matrix path. Defaults to "gs://gcp-public-data--gnomad/release/2.1.1/ld/gnomad.genomes.r2.1.1.{POP}.common.adj.ld.bm". ld_index_raw_template (str): Template for the LD index path. Defaults to "gs://gcp-public-data--gnomad/release/2.1.1/ld/gnomad.genomes.r2.1.1.{POP}.common.ld.variant_indices.ht". + ld_index_38_template (str): Template for the LD index path in build 38. Defaults to "gs://genetics_etl_python_playground/input/ld/gnomad_r2.1.1.{POP}.common.ld.variant_indices.parquet" grch37_to_grch38_chain_path (str): Path to the chain file used to lift over the coordinates. Defaults to "gs://hail-common/references/grch37_to_grch38.over.chain.gz". ld_populations (list[str]): List of populations to use to build the LDIndex. Defaults to ["afr", "amr", "asj", "eas", "fin", "nfe", "nwe", "seu"]. 
""" ld_matrix_template: str = "gs://gcp-public-data--gnomad/release/2.1.1/ld/gnomad.genomes.r2.1.1.{POP}.common.adj.ld.bm" ld_index_raw_template: str = "gs://gcp-public-data--gnomad/release/2.1.1/ld/gnomad.genomes.r2.1.1.{POP}.common.ld.variant_indices.ht" + ld_index_38_template: str = "gs://genetics_etl_python_playground/input/ld/gnomad_r2.1.1.{POP}.common.ld.variant_indices.parquet" grch37_to_grch38_chain_path: str = ( "gs://hail-common/references/grch37_to_grch38.over.chain.gz" ) From bce30d264a82264953758bf7b62cb70ce3182255 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 1 Feb 2024 21:39:10 +0000 Subject: [PATCH 05/33] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/dataset/test_pairwise_ld.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/dataset/test_pairwise_ld.py b/tests/dataset/test_pairwise_ld.py index b6b7daf46..91f7475e8 100644 --- a/tests/dataset/test_pairwise_ld.py +++ b/tests/dataset/test_pairwise_ld.py @@ -6,11 +6,10 @@ import numpy as np import pytest +from otg.dataset.pairwise_ld import PairwiseLD from pyspark.sql import functions as f from pyspark.sql.window import Window -from otg.dataset.pairwise_ld import PairwiseLD - if TYPE_CHECKING: from pyspark.sql import SparkSession From 4d09a601f573d0660d3104cf9f47bdefa8d90156 Mon Sep 17 00:00:00 2001 From: Daniel Considine Date: Thu, 1 Feb 2024 21:49:05 +0000 Subject: [PATCH 06/33] chore: updating some test files to gentropy --- tests/dataset/test_pairwise_ld.py | 2 +- tests/datasource/gnomad/test_gnomad_ld.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/dataset/test_pairwise_ld.py b/tests/dataset/test_pairwise_ld.py index 91f7475e8..bd8ad7068 100644 --- a/tests/dataset/test_pairwise_ld.py +++ b/tests/dataset/test_pairwise_ld.py @@ -6,7 +6,7 @@ import numpy as np import pytest -from otg.dataset.pairwise_ld import PairwiseLD +from gentropy.dataset.pairwise_ld import PairwiseLD from pyspark.sql import functions as f from pyspark.sql.window import Window diff --git a/tests/datasource/gnomad/test_gnomad_ld.py b/tests/datasource/gnomad/test_gnomad_ld.py index bc1b63871..25336e13f 100644 --- a/tests/datasource/gnomad/test_gnomad_ld.py +++ b/tests/datasource/gnomad/test_gnomad_ld.py @@ -7,7 +7,7 @@ import hail as hl import pytest -from genotropy.dataset.pairwise_ld import PairwiseLD +from gentropy.dataset.pairwise_ld import PairwiseLD from gentropy.datasource.gnomad.ld import GnomADLDMatrix from pyspark.sql import DataFrame, SparkSession from pyspark.sql import functions as f From f7dfba176a1b25796d5a75f4ed261fc75c918981 Mon Sep 17 00:00:00 2001 From: Daniel Considine Date: Thu, 1 Feb 2024 23:20:32 +0000 Subject: [PATCH 07/33] chore: updating tests --- tests/datasource/gnomad/test_gnomad_ld.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/datasource/gnomad/test_gnomad_ld.py b/tests/datasource/gnomad/test_gnomad_ld.py index 25336e13f..d75639ec1 100644 --- a/tests/datasource/gnomad/test_gnomad_ld.py +++ b/tests/datasource/gnomad/test_gnomad_ld.py @@ -91,7 +91,7 @@ def _setup(self: TestGnomADLDMatrixVariants, spark: SparkSession) -> None: ) @pytest.fixture(scope="class") - def ld_slice(self: TestGnomADLDMatrixVariants) -> PairwiseLD: + def ld_slice(self: TestGnomADLDMatrixVariants) -> DataFrame | None: """Generate a resolved LD slice.""" return self.gnomad_ld_matrix.get_ld_variants( gnomad_ancestry="test-pop", # observed[0], 
From 3a629ef38b95226353284566236e4a92a7133014 Mon Sep 17 00:00:00 2001 From: Daniel Considine Date: Fri, 2 Feb 2024 13:09:33 +0000 Subject: [PATCH 08/33] chore: updating pairwise_ld_schema for tests --- src/gentropy/assets/schemas/pairwise_ld.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/gentropy/assets/schemas/pairwise_ld.json b/src/gentropy/assets/schemas/pairwise_ld.json index 2d5351d82..bac781ac3 100644 --- a/src/gentropy/assets/schemas/pairwise_ld.json +++ b/src/gentropy/assets/schemas/pairwise_ld.json @@ -2,13 +2,13 @@ "fields": [ { "metadata": {}, - "name": "variantId_i", + "name": "variantIdI", "nullable": false, "type": "string" }, { "metadata": {}, - "name": "variantId_j", + "name": "variantIdJ", "nullable": false, "type": "string" }, From 6ff6b1c03726bc5d461f5b57c015ca42ccc2367b Mon Sep 17 00:00:00 2001 From: Daniel Considine Date: Fri, 2 Feb 2024 13:24:19 +0000 Subject: [PATCH 09/33] chore: updating pairwise_ld tests --- src/gentropy/dataset/pairwise_ld.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/src/gentropy/dataset/pairwise_ld.py b/src/gentropy/dataset/pairwise_ld.py index fc94afa85..221522d0b 100644 --- a/src/gentropy/dataset/pairwise_ld.py +++ b/src/gentropy/dataset/pairwise_ld.py @@ -32,8 +32,8 @@ def __post_init__(self: PairwiseLD) -> None: """ row_count = self.df.count() - assert int(sqrt(row_count)) == sqrt( - row_count + assert ( + int(sqrt(row_count)) == sqrt(row_count) ), f"The number of rows in a pairwise LD table has to be square. Got: {row_count}" self.dimension = (int(sqrt(row_count)), int(sqrt(row_count))) @@ -74,12 +74,8 @@ def r_to_numpy_matrix(self) -> np.ndarray: """ return np.array( self.df.select( - f.split("variantId_i", "_")[1] - .cast(t.IntegerType()) - .alias("position_i"), - f.split("variantId_j", "_")[1] - .cast(t.IntegerType()) - .alias("position_j"), + f.split("variantIdI", "_")[1].cast(t.IntegerType()).alias("position_i"), + f.split("variantIdJ", "_")[1].cast(t.IntegerType()).alias("position_j"), "r", ) .orderBy(f.col("position_i").asc(), f.col("position_j").asc()) @@ -97,8 +93,8 @@ def get_variant_list(self) -> list[str]: row["variantId"] for row in ( self.df.select( - f.col("variantId_i").alias("variantId"), - f.split(f.col("variantId_i"), "_")[1] + f.col("variantIdI").alias("variantId"), + f.split(f.col("variantIdI"), "_")[1] .cast(t.IntegerType()) .alias("position"), ) From 3fc9cfaeb81e3bc93d331b2eae71af93d7616037 Mon Sep 17 00:00:00 2001 From: Daniel Considine Date: Fri, 2 Feb 2024 13:49:57 +0000 Subject: [PATCH 10/33] chore: fix ld_pairwise tests --- tests/dataset/test_pairwise_ld.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/dataset/test_pairwise_ld.py b/tests/dataset/test_pairwise_ld.py index bd8ad7068..11ebf75ca 100644 --- a/tests/dataset/test_pairwise_ld.py +++ b/tests/dataset/test_pairwise_ld.py @@ -39,16 +39,16 @@ def mock_pairwise_ld(self: TestPairwiseLD, spark: SparkSession) -> PairwiseLD: data = [(v1, v2) for v1 in self.variants for v2 in self.variants] return PairwiseLD( _df=( - spark.createDataFrame(data, ["variantId_i", "variantId_j"]) + spark.createDataFrame(data, ["variantIdI", "variantIdJ"]) .withColumn( "r", f.row_number() - .over(Window.partitionBy(f.lit("x")).orderBy("variantId_i")) + .over(Window.partitionBy(f.lit("x")).orderBy("variantIdI")) .cast("double"), ) .withColumn( "r", - f.when(f.col("variantId_i") == f.col("variantId_j"), 1.0).otherwise( + f.when(f.col("variantIdI") == f.col("variantIdJ"), 
1.0).otherwise( f.col("r") ), ) From 0d6802057d373ea84af9b242804eb2a7e4c64317 Mon Sep 17 00:00:00 2001 From: Daniel Considine Date: Fri, 2 Feb 2024 15:28:41 +0000 Subject: [PATCH 11/33] chore: fix pairwise_ld tests --- src/gentropy/dataset/pairwise_ld.py | 4 ++-- tests/datasource/gnomad/test_gnomad_ld.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/gentropy/dataset/pairwise_ld.py b/src/gentropy/dataset/pairwise_ld.py index 221522d0b..9650efa32 100644 --- a/src/gentropy/dataset/pairwise_ld.py +++ b/src/gentropy/dataset/pairwise_ld.py @@ -59,8 +59,8 @@ def overlap_with_locus(self: PairwiseLD, locus_variants: list[str]) -> PairwiseL return PairwiseLD( _df=( self.df.filter( - f.col("variantId_i").isin(locus_variants) - & f.col("variantId_j").isin(locus_variants) + f.col("variantIdI").isin(locus_variants) + & f.col("variantIdJ").isin(locus_variants) ) ), _schema=PairwiseLD.get_schema(), diff --git a/tests/datasource/gnomad/test_gnomad_ld.py b/tests/datasource/gnomad/test_gnomad_ld.py index d75639ec1..eaeb6ebaa 100644 --- a/tests/datasource/gnomad/test_gnomad_ld.py +++ b/tests/datasource/gnomad/test_gnomad_ld.py @@ -64,7 +64,7 @@ def test_resolve_variant_indices( ) expected_df = spark.createDataFrame( expected, - ["r", "variantId_i", "chromosome", "variantId_j"], + ["r", "variantIdI", "chromosome", "variantIdJ"], ) observed_df = GnomADLDMatrix._resolve_variant_indices(ld_index, ld_matrix) assert ( From 07260a7897e450049a02a3d39584ae1648f7322e Mon Sep 17 00:00:00 2001 From: Daniel Considine Date: Fri, 2 Feb 2024 16:11:36 +0000 Subject: [PATCH 12/33] chore: fix tests --- src/gentropy/datasource/gnomad/ld.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/src/gentropy/datasource/gnomad/ld.py b/src/gentropy/datasource/gnomad/ld.py index 858751690..870446623 100644 --- a/src/gentropy/datasource/gnomad/ld.py +++ b/src/gentropy/datasource/gnomad/ld.py @@ -220,9 +220,9 @@ def _resolve_variant_indices( DataFrame: Dataframe with variant IDs instead of `i` and `j` indices """ ld_index_i = ld_index.selectExpr( - "idx as i", "variantId as variantId_i", "chromosome" + "idx as i", "variantId as variantIdI", "chromosome" ) - ld_index_j = ld_index.selectExpr("idx as j", "variantId as variantId_j") + ld_index_j = ld_index.selectExpr("idx as j", "variantId as variantIdJ") return ( ld_matrix.join(ld_index_i, on="i", how="inner") .join(ld_index_j, on="j", how="inner") @@ -246,11 +246,11 @@ def _transpose_ld_matrix(ld_matrix: DataFrame) -> DataFrame: ... (1, 2, 0.5, "1", "AFR"), ... (2, 2, 1.0, "1", "AFR"), ... ], - ... ["variantId_i", "variantId_j", "r", "chromosome", "population"], + ... ["variantIdI", "variantIdJ", "r", "chromosome", "population"], ... 
) >>> GnomADLDMatrix._transpose_ld_matrix(df).show() +-----------+-----------+---+----------+----------+ - |variantId_i|variantId_j| r|chromosome|population| + |variantIdI|variantIdJ| r|chromosome|population| +-----------+-----------+---+----------+----------+ | 1| 2|0.5| 1| AFR| | 1| 1|1.0| 1| AFR| @@ -260,15 +260,15 @@ def _transpose_ld_matrix(ld_matrix: DataFrame) -> DataFrame: """ ld_matrix_transposed = ld_matrix.selectExpr( - "variantId_i as variantId_j", - "variantId_j as variantId_i", + "variantIdI as variantIdJ", + "variantIdJ as variantIdI", "r", "chromosome", "population", ) - return ld_matrix.filter( - f.col("variantId_i") != f.col("variantId_j") - ).unionByName(ld_matrix_transposed) + return ld_matrix.filter(f.col("variantIdI") != f.col("variantIdJ")).unionByName( + ld_matrix_transposed + ) def as_ld_index( self: GnomADLDMatrix, @@ -309,8 +309,8 @@ def as_ld_index( GnomADLDMatrix._transpose_ld_matrix( reduce(lambda df1, df2: df1.unionByName(df2), ld_indices_unaggregated) ) - .withColumnRenamed("variantId_i", "variantId") - .withColumnRenamed("variantId_j", "tagVariantId") + .withColumnRenamed("variantIdI", "variantId") + .withColumnRenamed("variantIdJ", "tagVariantId") ) return LDIndex( _df=self._aggregate_ld_index_across_populations(ld_index_unaggregated), @@ -397,7 +397,7 @@ def _extract_square_matrix( .join( ld_index_df.select( f.col("idx").alias("idx_i"), - f.col("variantId").alias("variantId_i"), + f.col("variantId").alias("variantIdI"), ), on="idx_i", how="inner", @@ -405,12 +405,12 @@ def _extract_square_matrix( .join( ld_index_df.select( f.col("idx").alias("idx_j"), - f.col("variantId").alias("variantId_j"), + f.col("variantId").alias("variantIdJ"), ), on="idx_j", how="inner", ) - .select("variantId_i", "variantId_j", "r") + .select("variantIdI", "variantIdJ", "r") ) def get_ld_matrix_slice( From 65cea4e246413becee1f45f50efc1933ca8b5797 Mon Sep 17 00:00:00 2001 From: Daniel Considine Date: Fri, 2 Feb 2024 16:46:06 +0000 Subject: [PATCH 13/33] chore: fix tests --- src/gentropy/datasource/gnomad/ld.py | 36 ++++++++++++++-------------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/src/gentropy/datasource/gnomad/ld.py b/src/gentropy/datasource/gnomad/ld.py index 870446623..7becf6201 100644 --- a/src/gentropy/datasource/gnomad/ld.py +++ b/src/gentropy/datasource/gnomad/ld.py @@ -240,24 +240,24 @@ def _transpose_ld_matrix(ld_matrix: DataFrame) -> DataFrame: DataFrame: Square LD matrix without diagonal duplicates Examples: - >>> df = spark.createDataFrame( - ... [ - ... (1, 1, 1.0, "1", "AFR"), - ... (1, 2, 0.5, "1", "AFR"), - ... (2, 2, 1.0, "1", "AFR"), - ... ], - ... ["variantIdI", "variantIdJ", "r", "chromosome", "population"], - ... ) - >>> GnomADLDMatrix._transpose_ld_matrix(df).show() - +-----------+-----------+---+----------+----------+ - |variantIdI|variantIdJ| r|chromosome|population| - +-----------+-----------+---+----------+----------+ - | 1| 2|0.5| 1| AFR| - | 1| 1|1.0| 1| AFR| - | 2| 1|0.5| 1| AFR| - | 2| 2|1.0| 1| AFR| - +-----------+-----------+---+----------+----------+ - + >>> df = spark.createDataFrame( + ... [ + ... (1, 1, 1.0, "1", "AFR"), + ... (1, 2, 0.5, "1", "AFR"), + ... (2, 2, 1.0, "1", "AFR"), + ... ], + ... ["variantIdI", "variantIdJ", "r", "chromosome", "population"], + ... 
) + >>> GnomADLDMatrix._transpose_ld_matrix(df).show() + +----------+----------+---+----------+----------+ + |variantIdI|variantIdJ| r|chromosome|population| + +----------+----------+---+----------+----------+ + | 1| 2|0.5| 1| AFR| + | 1| 1|1.0| 1| AFR| + | 2| 1|0.5| 1| AFR| + | 2| 2|1.0| 1| AFR| + +----------+----------+---+----------+----------+ + """ ld_matrix_transposed = ld_matrix.selectExpr( "variantIdI as variantIdJ", From aa19f9f5d50e5b583f2236938e2ada9e883b2d7c Mon Sep 17 00:00:00 2001 From: Daniel Considine Date: Fri, 2 Feb 2024 17:13:11 +0000 Subject: [PATCH 14/33] chore: fixing typing for tests --- tests/datasource/gnomad/test_gnomad_ld.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/datasource/gnomad/test_gnomad_ld.py b/tests/datasource/gnomad/test_gnomad_ld.py index eaeb6ebaa..be2e39aec 100644 --- a/tests/datasource/gnomad/test_gnomad_ld.py +++ b/tests/datasource/gnomad/test_gnomad_ld.py @@ -91,7 +91,7 @@ def _setup(self: TestGnomADLDMatrixVariants, spark: SparkSession) -> None: ) @pytest.fixture(scope="class") - def ld_slice(self: TestGnomADLDMatrixVariants) -> DataFrame | None: + def ld_slice(self: TestGnomADLDMatrixVariants) -> PairwiseLD: """Generate a resolved LD slice.""" return self.gnomad_ld_matrix.get_ld_variants( gnomad_ancestry="test-pop", # observed[0], @@ -107,7 +107,7 @@ def test_get_ld_variants__type( ) -> None: """Testing if the function returns the right type.""" # Do we have the right type? - assert isinstance(ld_slice, PairwiseLD) + assert isinstance(ld_slice, DataFrame) # Do we have a real square? assert sqrt(ld_slice.df.count()) == int(sqrt(ld_slice.df.count())) From e24cac1088e56be127e5a6b81e54bc5d18f7d992 Mon Sep 17 00:00:00 2001 From: Daniel Considine Date: Fri, 2 Feb 2024 17:21:59 +0000 Subject: [PATCH 15/33] chore: fixing tests --- tests/datasource/gnomad/test_gnomad_ld.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/datasource/gnomad/test_gnomad_ld.py b/tests/datasource/gnomad/test_gnomad_ld.py index be2e39aec..cdf530de1 100644 --- a/tests/datasource/gnomad/test_gnomad_ld.py +++ b/tests/datasource/gnomad/test_gnomad_ld.py @@ -109,7 +109,7 @@ def test_get_ld_variants__type( # Do we have the right type? assert isinstance(ld_slice, DataFrame) # Do we have a real square? 
- assert sqrt(ld_slice.df.count()) == int(sqrt(ld_slice.df.count())) + assert sqrt(ld_slice.count()) == int(sqrt(ld_slice.count())) class TestGnomADLDMatrixSlice: From 4ec36b9dbcb47f1c5bed0d3f6a84c2b030c3751c Mon Sep 17 00:00:00 2001 From: Daniel Considine Date: Fri, 2 Feb 2024 17:52:22 +0000 Subject: [PATCH 16/33] chore: fixing ld tests --- tests/datasource/gnomad/test_gnomad_ld.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/datasource/gnomad/test_gnomad_ld.py b/tests/datasource/gnomad/test_gnomad_ld.py index cdf530de1..67961c821 100644 --- a/tests/datasource/gnomad/test_gnomad_ld.py +++ b/tests/datasource/gnomad/test_gnomad_ld.py @@ -91,7 +91,7 @@ def _setup(self: TestGnomADLDMatrixVariants, spark: SparkSession) -> None: ) @pytest.fixture(scope="class") - def ld_slice(self: TestGnomADLDMatrixVariants) -> PairwiseLD: + def ld_slice(self: TestGnomADLDMatrixVariants) -> DataFrame | None: """Generate a resolved LD slice.""" return self.gnomad_ld_matrix.get_ld_variants( gnomad_ancestry="test-pop", # observed[0], From 0e93adb642537f4680d1770889886100e81c5e0c Mon Sep 17 00:00:00 2001 From: Daniel-Considine <113430683+Daniel-Considine@users.noreply.github.com> Date: Mon, 5 Feb 2024 17:34:16 +0000 Subject: [PATCH 17/33] Update src/gentropy/dataset/study_index.py Co-authored-by: Daniel Suveges --- src/gentropy/dataset/study_index.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/gentropy/dataset/study_index.py b/src/gentropy/dataset/study_index.py index 89a914c4b..52b0a8269 100644 --- a/src/gentropy/dataset/study_index.py +++ b/src/gentropy/dataset/study_index.py @@ -190,7 +190,9 @@ def get_major_pop(self: StudyIndex) -> DataFrame: Returns: DataFrame: Columns studyId and the extracted major population from ldPopulationStructure. """ - return self.df.select("studyId", "ldPopulationStructure").withColumn( - "ldPopulationStructure", - f.array_max(f.col("ldPopulationStructure")).getItem("ldPopulation"), + return self.df.select( + "studyId", + f.array_max(f.col("ldPopulationStructure")) + .getItem("ldPopulation") + .alias("majorPopulation"), ) From cd4cddfcee94d7fccc72960ede70e311fbf20ab4 Mon Sep 17 00:00:00 2001 From: Daniel Considine Date: Wed, 7 Feb 2024 18:30:54 +0000 Subject: [PATCH 18/33] feat: moving functions to their appropriate locations and improving logic --- src/gentropy/dataset/study_index.py | 2 +- src/gentropy/dataset/study_locus.py | 58 --------------------- src/gentropy/datasource/gnomad/ld.py | 76 +++++++++++++++++++++++++++- 3 files changed, 76 insertions(+), 60 deletions(-) diff --git a/src/gentropy/dataset/study_index.py b/src/gentropy/dataset/study_index.py index 52b0a8269..9a7c6a2aa 100644 --- a/src/gentropy/dataset/study_index.py +++ b/src/gentropy/dataset/study_index.py @@ -184,7 +184,7 @@ def has_summarystats(self: StudyIndex) -> Column: """ return self.df.hasSumstats - def get_major_pop(self: StudyIndex) -> DataFrame: + def get_major_population(self: StudyIndex) -> DataFrame: """Extract major population from ldPopulationStructure rows with multiple ancestries. 
Returns: diff --git a/src/gentropy/dataset/study_locus.py b/src/gentropy/dataset/study_locus.py index 050246187..41c099959 100644 --- a/src/gentropy/dataset/study_locus.py +++ b/src/gentropy/dataset/study_locus.py @@ -5,9 +5,7 @@ from enum import Enum from typing import TYPE_CHECKING -import numpy as np import pyspark.sql.functions as f -from hail.linalg import BlockMatrix from gentropy.common.schemas import parse_spark_schema from gentropy.common.spark_helpers import ( @@ -16,14 +14,12 @@ ) from gentropy.dataset.dataset import Dataset from gentropy.dataset.study_locus_overlap import StudyLocusOverlap -from gentropy.datasource.gnomad.ld import GnomADLDMatrix from gentropy.method.clump import LDclumping if TYPE_CHECKING: from pyspark.sql import Column, DataFrame from pyspark.sql.types import StructType - from gentropy.common.session import Session from gentropy.dataset.ld_index import LDIndex from gentropy.dataset.study_index import StudyIndex @@ -457,60 +453,6 @@ def clump(self: StudyLocus) -> StudyLocus: ) return self - def get_gnomad_matrix( - self: StudyLocus, session: Session, study_index: StudyIndex, window_size: int - ) -> tuple[DataFrame, np.ndarray]: - """Extract hail matrix index and block matrix from StudyLocus rows. - - Args: - session (Session): Spark session - study_index (StudyIndex): Study index - window_size (int): Window size to extract from gnomad matrix - - Returns: - tuple[DataFrame, np.ndarray]: Returns the index of the gnomad matrix and the gnomad block matrix - """ - _df = ( - self.df.join(study_index.get_major_pop(), on="studyId", how="left") - .withColumn("start", f.col("position") - (window_size / 2)) - .withColumn("end", f.col("position") + (window_size / 2)) - .alias("_df") - ) - _pop = _df.collect()[0]["ldPopulationStructure"] - - _matrix_index = session.spark.read.parquet( - GnomADLDMatrix.ld_index_38_template.format(POP=_pop) - ) - _index_joined = ( - _df.alias("df") - .join( - _matrix_index.alias("matrix_index"), - (f.col("df.chromosome") == f.col("matrix_index.chromosome")) - & (f.col("df.start") <= f.col("matrix_index.position")) - & (f.col("df.end") >= f.col("matrix_index.position")), - ) - .select( - "matrix_index.chromosome", - "matrix_index.position", - "referenceAllele", - "alternateAllele", - "idx", - ) - ).persist() - _idx = ( - _index_joined.sort("idx").select("idx").rdd.flatMap(lambda x: x).collect() - ) - _block_matrix = ( - BlockMatrix.read(GnomADLDMatrix.ld_matrix_template.format(POP=_pop)) - .filter(_idx, _idx) - .to_numpy() - ) - _block_matrix = ( - _block_matrix + _block_matrix.T - np.diag(np.diag(_block_matrix)) - ) - - return _index_joined, _block_matrix - def _qc_unresolved_ld( self: StudyLocus, ) -> StudyLocus: diff --git a/src/gentropy/datasource/gnomad/ld.py b/src/gentropy/datasource/gnomad/ld.py index 7becf6201..a7e5725a6 100644 --- a/src/gentropy/datasource/gnomad/ld.py +++ b/src/gentropy/datasource/gnomad/ld.py @@ -7,10 +7,12 @@ from typing import TYPE_CHECKING import hail as hl +import numpy as np import pyspark.sql.functions as f from hail.linalg import BlockMatrix from pyspark.sql import Window +from gentropy.common.session import Session from gentropy.common.spark_helpers import get_top_ranked_in_window, get_value_from_row from gentropy.common.utils import _liftover_loci, convert_gnomad_position_to_ensembl from gentropy.dataset.ld_index import LDIndex @@ -347,7 +349,6 @@ def get_ld_variants( & (f.col("position") <= end) ) .select("chromosome", "position", "variantId", "idx") - .persist() ) if ld_index_df.limit(1).count() == 0: 
@@ -450,3 +451,76 @@ def get_ld_matrix_slice( .alias("r"), ) ) + + @staticmethod + def get_locus_index( + session: Session, + study_locus_row: DataFrame, + window_size: int, + major_population: str = "nfe", + ld_index_path: str = "gs://genetics_etl_python_playground/input/ld/gnomad_r2.1.1.{POP}.common.ld.variant_indices.parquet", + ) -> DataFrame: + """Extract hail matrix index from StudyLocus rows. + + Args: + session (Session): Spark session + study_locus_row (DataFrame): Study-locus row + window_size (int): Window size to extract from gnomad matrix + major_population (str): Major population to extract from gnomad matrix, default is "nfe" + ld_index_path (str): Optional path to the LD index parquet + Returns: + DataFrame: Returns the index of the gnomad matrix for the locus + """ + _df = ( + study_locus_row.withColumn("start", f.col("position") - (window_size / 2)) + .withColumn("end", f.col("position") + (window_size / 2)) + .alias("_df") + ) + + _matrix_index = session.spark.read.parquet( + ld_index_path.format(POP=major_population) + ) + + _index_joined = ( + _df.alias("df") + .join( + _matrix_index.alias("matrix_index"), + (f.col("df.chromosome") == f.col("matrix_index.chromosome")) + & (f.col("df.start") <= f.col("matrix_index.position")) + & (f.col("df.end") >= f.col("matrix_index.position")), + ) + .select( + "matrix_index.chromosome", + "matrix_index.position", + "referenceAllele", + "alternateAllele", + "idx", + ) + .sort("idx") + ) + + return _index_joined + + @staticmethod + def get_locus_matrix( + locus_index: DataFrame, + gnomad_ancestry: str, + ) -> np.ndarray: + """Extract the LD block matrix for a locus. + + Args: + locus_index (DataFrame): hail matrix variant index table + gnomad_ancestry (str): GnomAD major ancestry label eg. `nfe` + + Returns: + np.ndarray: LD block matrix for the locus + """ + idx = [row["idx"] for row in locus_index.select("idx").collect()] + + half_matrix = BlockMatrix.read( + GnomADLDMatrix.ld_matrix_template.format(POP=gnomad_ancestry) + ).filter(idx, idx) + + return (half_matrix + half_matrix.T).to_numpy() - np.diag( + np.diag(half_matrix.to_numpy()) + ) From 228c66b80117de8fa8a247b8814d0fceef800c3e Mon Sep 17 00:00:00 2001 From: Daniel Considine Date: Wed, 7 Feb 2024 18:57:06 +0000 Subject: [PATCH 19/33] fix: optimise conversion of BM to NumPy --- src/gentropy/datasource/gnomad/ld.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/gentropy/datasource/gnomad/ld.py b/src/gentropy/datasource/gnomad/ld.py index a7e5725a6..eaf7d1b3f 100644 --- a/src/gentropy/datasource/gnomad/ld.py +++ b/src/gentropy/datasource/gnomad/ld.py @@ -517,10 +517,12 @@ def get_locus_matrix( """ idx = [row["idx"] for row in locus_index.select("idx").collect()] - half_matrix = BlockMatrix.read( - GnomADLDMatrix.ld_matrix_template.format(POP=gnomad_ancestry) - ).filter(idx, idx) - - return (half_matrix + half_matrix.T).to_numpy() - np.diag( - np.diag(half_matrix.to_numpy()) + half_matrix = ( + BlockMatrix.read( + GnomADLDMatrix.ld_matrix_template.format(POP=gnomad_ancestry) + ) + .filter(idx, idx) + .to_numpy() ) + + return (half_matrix + half_matrix.T) - np.diag(np.diag(half_matrix)) From 06a9e96191b95899718d516329a69ef28bafc8cf Mon Sep 17 00:00:00 2001 From: Daniel Considine Date: Fri, 9 Feb 2024 18:30:06 +0000 Subject: [PATCH 20/33] feat: updating get_locus_index to allow for just chromosome and position inputs --- src/gentropy/datasource/gnomad/ld.py | 30 +++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff 
--git a/src/gentropy/datasource/gnomad/ld.py b/src/gentropy/datasource/gnomad/ld.py index eaf7d1b3f..5733189c0 100644 --- a/src/gentropy/datasource/gnomad/ld.py +++ b/src/gentropy/datasource/gnomad/ld.py @@ -4,7 +4,7 @@ import sys from dataclasses import dataclass, field from functools import reduce -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Optional import hail as hl import numpy as np @@ -455,8 +455,10 @@ def get_ld_matrix_slice( @staticmethod def get_locus_index( session: Session, - study_locus_row: DataFrame, - window_size: int, + study_locus_row: Optional[DataFrame] = None, + chromosome: Optional[str] = None, + position: Optional[int] = None, + window_size: int = 1_000_000, major_population: str = "nfe", ld_index_path: str = "gs://genetics_etl_python_playground/input/ld/gnomad_r2.1.1.{POP}.common.ld.variant_indices.parquet", ) -> DataFrame: @@ -464,15 +466,33 @@ def get_locus_index( Args: session (Session): Spark session - study_locus_row (DataFrame): Study-locus row + study_locus_row (Optional[DataFrame]): Study-locus row + chromosome (Optional[str]): Chromosome to extract from gnomad matrix + position (Optional[int]): Position to extract from gnomad matrix window_size (int): Window size to extract from gnomad matrix major_population (str): Major population to extract from gnomad matrix, default is "nfe" ld_index_path (str): Optional path to the LD index parquet + Returns: DataFrame: Returns the index of the gnomad matrix for the locus + + Raises: + ValueError: Either a studyLocus or a chromosome and position must be provided """ + if study_locus_row is None: + if chromosome is not None and position is not None: + _df = session.spark.createDataFrame( + [(chromosome, position)], ["chromosome", "position"] + ) + else: + raise ValueError( + "Either a studyLocus or a chromosome and position must be provided." + ) + else: + _df = study_locus_row + _df = ( - study_locus_row.withColumn("start", f.col("position") - (window_size / 2)) + _df.withColumn("start", f.col("position") - (window_size / 2)) .withColumn("end", f.col("position") + (window_size / 2)) .alias("_df") ) From ebdd983c179fe1486ab69018b6180110e5418ef9 Mon Sep 17 00:00:00 2001 From: Daniel Considine Date: Mon, 18 Mar 2024 12:44:19 +0000 Subject: [PATCH 21/33] fix: suggested changes --- src/gentropy/datasource/gnomad/ld.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/gentropy/datasource/gnomad/ld.py b/src/gentropy/datasource/gnomad/ld.py index 5733189c0..364ba0c35 100644 --- a/src/gentropy/datasource/gnomad/ld.py +++ b/src/gentropy/datasource/gnomad/ld.py @@ -455,23 +455,23 @@ def get_ld_matrix_slice( @staticmethod def get_locus_index( session: Session, + ld_index_path: str, study_locus_row: Optional[DataFrame] = None, chromosome: Optional[str] = None, position: Optional[int] = None, window_size: int = 1_000_000, major_population: str = "nfe", - ld_index_path: str = "gs://genetics_etl_python_playground/input/ld/gnomad_r2.1.1.{POP}.common.ld.variant_indices.parquet", ) -> DataFrame: """Extract hail matrix index from StudyLocus rows. 
Args: session (Session): Spark session + ld_index_path (str): Optional path to the LD index parquet study_locus_row (Optional[DataFrame]): Study-locus row chromosome (Optional[str]): Chromosome to extract from gnomad matrix position (Optional[int]): Position to extract from gnomad matrix window_size (int): Window size to extract from gnomad matrix major_population (str): Major population to extract from gnomad matrix, default is "nfe" - ld_index_path (str): Optional path to the LD index parquet Returns: DataFrame: Returns the index of the gnomad matrix for the locus From a2232b596605f5fbde58aa5ec69af0b843354f08 Mon Sep 17 00:00:00 2001 From: Daniel-Considine <113430683+Daniel-Considine@users.noreply.github.com> Date: Thu, 21 Mar 2024 11:31:51 +0000 Subject: [PATCH 22/33] Update study_index.py --- src/gentropy/dataset/study_index.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/src/gentropy/dataset/study_index.py b/src/gentropy/dataset/study_index.py index 9a7c6a2aa..d048157d6 100644 --- a/src/gentropy/dataset/study_index.py +++ b/src/gentropy/dataset/study_index.py @@ -183,16 +183,3 @@ def has_summarystats(self: StudyIndex) -> Column: Column: True if the study has harmonized summary statistics. """ return self.df.hasSumstats - - def get_major_population(self: StudyIndex) -> DataFrame: - """Extract major population from ldPopulationStructure rows with multiple ancestries. - - Returns: - DataFrame: Columns studyId and the extracted major population from ldPopulationStructure. - """ - return self.df.select( - "studyId", - f.array_max(f.col("ldPopulationStructure")) - .getItem("ldPopulation") - .alias("majorPopulation"), - ) From b9b817d979b9964eb2dbf2702612f264612e8a60 Mon Sep 17 00:00:00 2001 From: Daniel Considine Date: Thu, 21 Mar 2024 13:54:48 +0000 Subject: [PATCH 23/33] fix: changes to datasource/gnomad/ld.py --- src/gentropy/datasource/gnomad/ld.py | 30 +++++----------------------- 1 file changed, 5 insertions(+), 25 deletions(-) diff --git a/src/gentropy/datasource/gnomad/ld.py b/src/gentropy/datasource/gnomad/ld.py index 364ba0c35..99b7b7ad2 100644 --- a/src/gentropy/datasource/gnomad/ld.py +++ b/src/gentropy/datasource/gnomad/ld.py @@ -4,7 +4,7 @@ import sys from dataclasses import dataclass, field from functools import reduce -from typing import TYPE_CHECKING, Optional +from typing import TYPE_CHECKING import hail as hl import numpy as np @@ -30,14 +30,12 @@ class GnomADLDMatrix: Attributes: ld_matrix_template (str): Template for the LD matrix path. Defaults to "gs://gcp-public-data--gnomad/release/2.1.1/ld/gnomad.genomes.r2.1.1.{POP}.common.adj.ld.bm". ld_index_raw_template (str): Template for the LD index path. Defaults to "gs://gcp-public-data--gnomad/release/2.1.1/ld/gnomad.genomes.r2.1.1.{POP}.common.ld.variant_indices.ht". - ld_index_38_template (str): Template for the LD index path in build 38. Defaults to "gs://genetics_etl_python_playground/input/ld/gnomad_r2.1.1.{POP}.common.ld.variant_indices.parquet" grch37_to_grch38_chain_path (str): Path to the chain file used to lift over the coordinates. Defaults to "gs://hail-common/references/grch37_to_grch38.over.chain.gz". ld_populations (list[str]): List of populations to use to build the LDIndex. Defaults to ["afr", "amr", "asj", "eas", "fin", "nfe", "nwe", "seu"]. 
""" ld_matrix_template: str = "gs://gcp-public-data--gnomad/release/2.1.1/ld/gnomad.genomes.r2.1.1.{POP}.common.adj.ld.bm" ld_index_raw_template: str = "gs://gcp-public-data--gnomad/release/2.1.1/ld/gnomad.genomes.r2.1.1.{POP}.common.ld.variant_indices.ht" - ld_index_38_template: str = "gs://genetics_etl_python_playground/input/ld/gnomad_r2.1.1.{POP}.common.ld.variant_indices.parquet" grch37_to_grch38_chain_path: str = ( "gs://hail-common/references/grch37_to_grch38.over.chain.gz" ) @@ -455,10 +453,8 @@ def get_ld_matrix_slice( @staticmethod def get_locus_index( session: Session, + study_locus_row: DataFrame, ld_index_path: str, - study_locus_row: Optional[DataFrame] = None, - chromosome: Optional[str] = None, - position: Optional[int] = None, window_size: int = 1_000_000, major_population: str = "nfe", ) -> DataFrame: @@ -466,33 +462,17 @@ def get_locus_index( Args: session (Session): Spark session - ld_index_path (str): Optional path to the LD index parquet - study_locus_row (Optional[DataFrame]): Study-locus row - chromosome (Optional[str]): Chromosome to extract from gnomad matrix - position (Optional[int]): Position to extract from gnomad matrix + study_locus_row (DataFrame): Study-locus row + ld_index_path (str): Path to the hail LD index parquet window_size (int): Window size to extract from gnomad matrix major_population (str): Major population to extract from gnomad matrix, default is "nfe" Returns: DataFrame: Returns the index of the gnomad matrix for the locus - Raises: - ValueError: Either a studyLocus or a chromosome and position must be provided """ - if study_locus_row is None: - if chromosome is not None and position is not None: - _df = session.spark.createDataFrame( - [(chromosome, position)], ["chromosome", "position"] - ) - else: - raise ValueError( - "Either a studyLocus or a chromosome and position must be provided." - ) - else: - _df = study_locus_row - _df = ( - _df.withColumn("start", f.col("position") - (window_size / 2)) + study_locus_row.withColumn("start", f.col("position") - (window_size / 2)) .withColumn("end", f.col("position") + (window_size / 2)) .alias("_df") ) From 37668d14dda2125f66842fd4dbe019fa78069bd6 Mon Sep 17 00:00:00 2001 From: Yakov Tsepilov Date: Tue, 2 Apr 2024 16:40:50 +0100 Subject: [PATCH 24/33] feat: add the draft of finemapper fucntion --- src/gentropy/susie_finemapper.py | 119 +++++++++++++++++++++++++++++++ 1 file changed, 119 insertions(+) diff --git a/src/gentropy/susie_finemapper.py b/src/gentropy/susie_finemapper.py index bd02716c2..760ca73a1 100644 --- a/src/gentropy/susie_finemapper.py +++ b/src/gentropy/susie_finemapper.py @@ -9,7 +9,10 @@ from pyspark.sql import DataFrame, Window from gentropy.common.session import Session +from gentropy.dataset.study_index import StudyIndex from gentropy.dataset.study_locus import StudyLocus +from gentropy.dataset.summary_statistics import SummaryStatistics +from gentropy.method.susie_inf import SUSIE_inf class SusieFineMapperStep: @@ -19,6 +22,122 @@ class SusieFineMapperStep: In the future this step will be refactored and moved to the methods module. """ + @staticmethod + def susie_finemapper_one_locus( + GWAS: SummaryStatistics, + session: Session, + study_locus: StudyLocus, + study_index: StudyIndex, + window: int = 1_000_000, + ) -> StudyLocus: + """Susie fine-mapper for one study locus. 
+ + Args: + GWAS (SummaryStatistics): GWAS summary statistics + session (Session): Spark session + study_locus (StudyLocus): StudyLocus object with one row (the first one will be used) + study_index (StudyIndex): StudyIndex object + window (int): window size for fine-mapping + + Returns: + StudyLocus: StudyLocus object with fine-mapped credible sets + """ + study_locus_df = study_locus._df + first_line = study_locus_df.first() + chromosome = first_line.chromosome + position = first_line.position + _studyId = first_line.studyId + + study_index_df = study_index._df + study_index_df = study_index_df.filter(study_index_df.studyId == _studyId) + _major_population = study_index_df.select( + "studyId", + f.array_max(f.col("ldPopulationStructure")) + .getItem("ldPopulation") + .alias("majorPopulation"), + ) + + _region = ( + first_line.withColumn( + "region", + f.regexp_replace( + f.concat( + f.col("chromosome"), + f.lit(":"), + f.format_number((f.col("position") - (window / 2)), 0), + f.lit("-"), + f.format_number((f.col("position") + (window / 2)), 0), + ), + ",", + "", + ), + ) + .select("region") + .collect()[0] + ) + + GWAS_df = GWAS._df + GWAS_df = GWAS_df.filter( + (f.col("chromosome") == chromosome) + & (f.col("position") >= position - (window / 2)) + & (f.col("position") <= position + (window / 2)) + ).withColumn("z", f.col("beta") / f.col("standardError")) + + _z = np.array([row["z"] for row in GWAS_df.select("z").collect()]) + + # # Extract summary statistics + # _ss = ( + # SummaryStatistics.get_locus_sumstats(session, window, _locus) + # .withColumn("z", f.col("beta") / f.col("standardError")) + # .withColumn("ref", f.split(f.col("variantId"), "_").getItem(2)) + # .withColumn("alt", f.split(f.col("variantId"), "_").getItem(3)) + # .select( + # "variantId", + # f.col("chromosome").alias("chr"), + # f.col("position").alias("pos"), + # "ref", + # "alt", + # "beta", + # "pValueMantissa", + # "pValueExponent", + # "effectAlleleFrequencyFromSource", + # "standardError", + # "z", + # ) + # ) + + # Extract LD index + # _index = GnomADLDMatrix.get_locus_index( + # session, locus, window_size=window, major_population=_major_population + # ) + # _join = ( + # _ss.join( + # _index.alias("_index"), + # on=( + # (_ss["chr"] == _index["chromosome"]) + # & (_ss["pos"] == _index["position"]) + # & (_ss["ref"] == _index["referenceAllele"]) + # & (_ss["alt"] == _index["alternateAllele"]) + # ), + # ) + # .drop("ref", "alt", "chr", "pos") + # .sort("idx") + # ) + + # Extracting z-scores and LD matrix, then running SuSiE-inf + # _ld = GnomADLDMatrix.get_locus_matrix(_join, gnomad_ancestry=_major_population) + + _ld = 1 + susie_output = SUSIE_inf.susie_inf(z=_z, LD=_ld, L=10) + + return SusieFineMapperStep.susie_inf_to_studylocus( + susie_output=susie_output, + session=session, + _studyId=_studyId, + _region=_region, + variant_index=GWAS, + ) + @staticmethod def susie_inf_to_studylocus( susie_output: dict[str, Any], From 33b6a51d1cc2575a7e91536a0c61a7e8a4720f6c Mon Sep 17 00:00:00 2001 From: Daniel Considine Date: Tue, 2 Apr 2024 16:44:37 +0100 Subject: [PATCH 25/33] feat: updated method for ld_index extraction --- src/gentropy/datasource/gnomad/ld.py | 61 ++++++++++++---------------- 1 file changed, 26 insertions(+), 35 deletions(-) diff --git a/src/gentropy/datasource/gnomad/ld.py b/src/gentropy/datasource/gnomad/ld.py index 99b7b7ad2..752a05abb 100644 --- a/src/gentropy/datasource/gnomad/ld.py +++ b/src/gentropy/datasource/gnomad/ld.py @@ -12,13 +12,12 @@ from hail.linalg import BlockMatrix from 
pyspark.sql import Window -from gentropy.common.session import Session from gentropy.common.spark_helpers import get_top_ranked_in_window, get_value_from_row from gentropy.common.utils import _liftover_loci, convert_gnomad_position_to_ensembl from gentropy.dataset.ld_index import LDIndex if TYPE_CHECKING: - from pyspark.sql import DataFrame + from pyspark.sql import DataFrame, Row @dataclass @@ -36,6 +35,7 @@ class GnomADLDMatrix: ld_matrix_template: str = "gs://gcp-public-data--gnomad/release/2.1.1/ld/gnomad.genomes.r2.1.1.{POP}.common.adj.ld.bm" ld_index_raw_template: str = "gs://gcp-public-data--gnomad/release/2.1.1/ld/gnomad.genomes.r2.1.1.{POP}.common.ld.variant_indices.ht" + liftover_ht_path: str = "gs://gcp-public-data--gnomad/release/2.1.1/liftover_grch38/ht/genomes/gnomad.genomes.r2.1.1.sites.liftover_grch38.ht" grch37_to_grch38_chain_path: str = ( "gs://hail-common/references/grch37_to_grch38.over.chain.gz" ) @@ -450,20 +450,16 @@ def get_ld_matrix_slice( ) ) - @staticmethod def get_locus_index( - session: Session, - study_locus_row: DataFrame, - ld_index_path: str, + self: GnomADLDMatrix, + study_locus_row: Row, window_size: int = 1_000_000, major_population: str = "nfe", ) -> DataFrame: """Extract hail matrix index from StudyLocus rows. Args: - session (Session): Spark session - study_locus_row (DataFrame): Study-locus row - ld_index_path (str): Path to the hail LD index parquet + study_locus_row (Row): Study-locus row window_size (int): Window size to extract from gnomad matrix major_population (str): Major population to extract from gnomad matrix, default is "nfe" @@ -471,40 +467,35 @@ def get_locus_index( DataFrame: Returns the index of the gnomad matrix for the locus """ - _df = ( - study_locus_row.withColumn("start", f.col("position") - (window_size / 2)) - .withColumn("end", f.col("position") + (window_size / 2)) - .alias("_df") + chromosome = str("chr" + study_locus_row["chromosome"]) + start = study_locus_row["position"] - window_size // 2 + end = study_locus_row["position"] + window_size // 2 + + liftover_ht = hl.read_table(self.liftover_ht_path) + liftover_ht = ( + liftover_ht.filter( + (liftover_ht.locus.contig == chromosome) + & (liftover_ht.locus.position >= start) + & (liftover_ht.locus.position <= end) + ) + .key_by() + .select("locus", "alleles", "original_locus") + .key_by("original_locus", "alleles") + .naive_coalesce(20) ) - _matrix_index = session.spark.read.parquet( - ld_index_path.format(POP=major_population) + hail_index = hl.read_table( + self.ld_index_raw_template.format(POP=major_population) ) - _index_joined = ( - _df.alias("df") - .join( - _matrix_index.alias("matrix_index"), - (f.col("df.chromosome") == f.col("matrix_index.chromosome")) - & (f.col("df.start") <= f.col("matrix_index.position")) - & (f.col("df.end") >= f.col("matrix_index.position")), - ) - .select( - "matrix_index.chromosome", - "matrix_index.position", - "referenceAllele", - "alternateAllele", - "idx", - ) - .sort("idx") - ) + joined_index = liftover_ht.join(hail_index, how="inner").to_spark().sort("idx") - return _index_joined + return joined_index @staticmethod - def get_locus_matrix( + def get_numpy_matrix( locus_index: DataFrame, - gnomad_ancestry: str, + gnomad_ancestry: str = "nfe", ) -> np.ndarray: """Extract the LD block matrix for a locus. 
From 67f7f36d0a073b0c4eef56927f010892dea0467b Mon Sep 17 00:00:00 2001
From: Yakov Tsepilov
Date: Tue, 2 Apr 2024 16:55:32 +0100
Subject: [PATCH 26/33] fix: changing input

---
 src/gentropy/susie_finemapper.py | 39 +++++++++++---------------------
 1 file changed, 13 insertions(+), 26 deletions(-)

diff --git a/src/gentropy/susie_finemapper.py b/src/gentropy/susie_finemapper.py
index 760ca73a1..e6d577784 100644
--- a/src/gentropy/susie_finemapper.py
+++ b/src/gentropy/susie_finemapper.py
@@ -23,31 +23,29 @@ class SusieFineMapperStep:
     """

     @staticmethod
-    def susie_finemapper_one_locus(
+    def susie_finemapper_chr_pos(
         GWAS: SummaryStatistics,
         session: Session,
-        study_locus: StudyLocus,
+        chromosome: str,
+        position: int,
+        _studyId: str,
         study_index: StudyIndex,
         window: int = 1_000_000,
     ) -> StudyLocus:
-        """Susie fine-mapper for one study locus.
+        """Susie fine-mapper function that uses Summary Statistics, chromosome and position as inputs.

         Args:
             GWAS (SummaryStatistics): GWAS summary statistics
             session (Session): Spark session
-            study_locus (StudyLocus): StudyLocus object with one row (the first one will be used)
+            chromosome (str): chromosome
+            position (int): position
+            _studyId (str): study ID
             study_index (StudyIndex): StudyIndex object
             window (int): window size for fine-mapping

         Returns:
             StudyLocus: StudyLocus object with fine-mapped credible sets
         """
-        study_locus_df = study_locus._df
-        first_line = study_locus_df.first()
-        chromosome = first_line.chromosome
-        position = first_line.position
-        _studyId = first_line.studyId
-
         study_index_df = study_index._df
         study_index_df = study_index_df.filter(study_index_df.studyId == _studyId)
         _major_population = study_index_df.select(
@@ -58,22 +56,11 @@ def susie_finemapper_chr_pos(
         )

         _region = (
-            first_line.withColumn(
-                "region",
-                f.regexp_replace(
-                    f.concat(
-                        f.col("chromosome"),
-                        f.lit(":"),
-                        f.format_number((f.col("position") - (window / 2)), 0),
-                        f.lit("-"),
-                        f.format_number((f.col("position") + (window / 2)), 0),
-                    ),
-                    ",",
-                    "",
-                ),
-            )
-            .select("region")
-            .collect()[0]
+            chromosome
+            + ":"
+            + str(int(position - window / 2))
+            + "-"
+            + str(int(position + window / 2))
         )

         GWAS_df = GWAS._df

From 1dc67013585bcaba5a291ada1cdc05422a59c79a Mon Sep 17 00:00:00 2001
From: Yakov Tsepilov
Date: Tue, 2 Apr 2024 17:01:49 +0100
Subject: [PATCH 27/33] fix: adding filter by studyId

---
 src/gentropy/susie_finemapper.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/gentropy/susie_finemapper.py b/src/gentropy/susie_finemapper.py
index e6d577784..001f34bbf 100644
--- a/src/gentropy/susie_finemapper.py
+++ b/src/gentropy/susie_finemapper.py
@@ -65,7 +65,8 @@ def susie_finemapper_chr_pos(

         GWAS_df = GWAS._df
         GWAS_df = GWAS_df.filter(
-            (f.col("chromosome") == chromosome)
+            (f.col("studyId") == _studyId)
+            & (f.col("chromosome") == chromosome)
             & (f.col("position") >= position - (window / 2))
             & (f.col("position") <= position + (window / 2))
         ).withColumn("z", f.col("beta") / f.col("standardError"))

From 2a2afbe0cf7f681b328ad8f766abf4d387af5bc7 Mon Sep 17 00:00:00 2001
From: Daniel Considine
Date: Tue, 2 Apr 2024 17:02:38 +0100
Subject: [PATCH 28/33] fix: sorting idx in hail

---
 src/gentropy/datasource/gnomad/ld.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/gentropy/datasource/gnomad/ld.py b/src/gentropy/datasource/gnomad/ld.py
index 752a05abb..a19cbb06b 100644
--- a/src/gentropy/datasource/gnomad/ld.py
+++ b/src/gentropy/datasource/gnomad/ld.py
@@ -488,7 +488,9 @@ def get_locus_index(
         hail_index = hl.read_table(
             self.ld_index_raw_template.format(POP=major_population)
         )

-        joined_index = liftover_ht.join(hail_index, how="inner").to_spark().sort("idx")
+        joined_index = (
+            liftover_ht.join(hail_index, how="inner").order_by("idx").to_spark()
+        )

         return joined_index

From 58f3eb95965fe58f5632ffbcec4248e88ac80a1d Mon Sep 17 00:00:00 2001
From: Yakov Tsepilov
Date: Wed, 3 Apr 2024 12:37:36 +0100
Subject: [PATCH 29/33] feat: add fine-mapping of one study_locus_row

---
 src/gentropy/susie_finemapper.py        | 111 ++++++++++++------------
 tests/gentropy/method/test_susie_inf.py |   1 +
 2 files changed, 58 insertions(+), 54 deletions(-)

diff --git a/src/gentropy/susie_finemapper.py b/src/gentropy/susie_finemapper.py
index 001f34bbf..740b6ecb3 100644
--- a/src/gentropy/susie_finemapper.py
+++ b/src/gentropy/susie_finemapper.py
@@ -6,12 +6,13 @@

 import numpy as np
 import pyspark.sql.functions as f
-from pyspark.sql import DataFrame, Window
+from pyspark.sql import DataFrame, Row, Window

 from gentropy.common.session import Session
 from gentropy.dataset.study_index import StudyIndex
 from gentropy.dataset.study_locus import StudyLocus
 from gentropy.dataset.summary_statistics import SummaryStatistics
+from gentropy.datasource.gnomad.ld import GnomADLDMatrix
 from gentropy.method.susie_inf import SUSIE_inf


@@ -26,26 +27,28 @@ class SusieFineMapperStep:
     """

     @staticmethod
-    def susie_finemapper_chr_pos(
+    def susie_finemapper_one_studylocus_row(
         GWAS: SummaryStatistics,
         session: Session,
-        chromosome: str,
-        position: int,
-        _studyId: str,
+        study_locus_row: Row,
         study_index: StudyIndex,
         window: int = 1_000_000,
+        L: int = 10,
     ) -> StudyLocus:
         """Susie fine-mapper function that uses Summary Statistics, chromosome and position as inputs.

         Args:
             GWAS (SummaryStatistics): GWAS summary statistics
             session (Session): Spark session
-            chromosome (str): chromosome
-            position (int): position
-            _studyId (str): study ID
+            study_locus_row (Row): StudyLocus row
             study_index (StudyIndex): StudyIndex object
             window (int): window size for fine-mapping
+            L (int): number of causal variants

         Returns:
             StudyLocus: StudyLocus object with fine-mapped credible sets
         """
+        chromosome = study_locus_row.chromosome
+        position = study_locus_row.position
+        _studyId = study_locus_row.studyId
+
         study_index_df = study_index._df
         study_index_df = study_index_df.filter(study_index_df.studyId == _studyId)
         _major_population = study_index_df.select(
@@ -71,59 +74,57 @@ def susie_finemapper_chr_pos(
             & (f.col("position") <= position + (window / 2))
         ).withColumn("z", f.col("beta") / f.col("standardError"))

-        _z = np.array([row["z"] for row in GWAS_df.select("z").collect()])
-
-        # # Extract summary statistics
-        # _ss = (
-        #     SummaryStatistics.get_locus_sumstats(session, window, _locus)
-        #     .withColumn("z", f.col("beta") / f.col("standardError"))
-        #     .withColumn("ref", f.split(f.col("variantId"), "_").getItem(2))
-        #     .withColumn("alt", f.split(f.col("variantId"), "_").getItem(3))
-        #     .select(
-        #         "variantId",
-        #         f.col("chromosome").alias("chr"),
-        #         f.col("position").alias("pos"),
-        #         "ref",
-        #         "alt",
-        #         "beta",
-        #         "pValueMantissa",
-        #         "pValueExponent",
-        #         "effectAlleleFrequencyFromSource",
-        #         "standardError",
-        #         "z",
-        #     )
-        # )
-
-        # Extract LD index
-        # _index = GnomADLDMatrix.get_locus_index(
-        #     session, locus, window_size=window, major_population=_major_population
-        # )
-        # _join = (
-        #     _ss.join(
-        #         _index.alias("_index"),
-        #         on=(
-        #             (_ss["chr"] == _index["chromosome"])
-        #             & (_ss["pos"] == _index["position"])
-        #             & (_ss["ref"] == _index["referenceAllele"])
-        #             & (_ss["alt"] == _index["alternateAllele"])
-        #         ),
-        #     )
-        #     
.drop("ref", "alt", "chr", "pos") - # .sort("idx") - # ) - - # Extracting z-scores and LD matrix, then running SuSiE-inf - # _ld = GnomADLDMatrix.get_locus_matrix(_join, gnomad_ancestry=_major_population) - - _ld = 1 - susie_output = SUSIE_inf.susie_inf(z=_z, LD=_ld, L=10) + ld_index = ( + GnomADLDMatrix() + .get_locus_index( + study_locus_row=study_locus_row, + window_size=window, + major_population=_major_population, + ) + .withColumn( + "variantId", + f.concat( + f.lit(chromosome), + f.lit("_"), + f.col("`locus.position`"), + f.lit("_"), + f.col("alleles").getItem(0), + f.lit("_"), + f.col("alleles").getItem(1), + ).cast("string"), + ) + ) + + gnomad_ld = GnomADLDMatrix.get_numpy_matrix( + ld_index, gnomad_ancestry=_major_population + ) + + GWAS_df = GWAS_df.toPandas() + ld_index = ld_index.toPandas() + ld_index = ld_index.reset_index() + + # Filtering out the variants that are not in the LD matrix, we don't need them + df_columns = GWAS_df.columns + GWAS_df = GWAS_df.merge(ld_index, on="variantId", how="inner") + GWAS_df = GWAS_df[df_columns].reset_index() + + merged_df = GWAS_df.merge( + ld_index, left_on="variantId", right_on="variantId", how="inner" + ) + indices = merged_df["index_y"].values + + ld_to_fm = gnomad_ld[indices][:, indices] + z_to_fm = GWAS_df["z"].values + + susie_output = SUSIE_inf.susie_inf(z=z_to_fm, LD=ld_to_fm, L=L) + variant_index = session.spark.createDataFrame(GWAS_df) return SusieFineMapperStep.susie_inf_to_studylocus( susie_output=susie_output, session=session, _studyId=_studyId, _region=_region, - variant_index=GWAS, + variant_index=variant_index, ) @staticmethod @@ -151,6 +152,7 @@ def susie_inf_to_studylocus( variants = np.array( [row["variantId"] for row in variant_index.select("variantId").collect()] ).reshape(-1, 1) + PIPs = susie_output["PIP"] lbfs = susie_output["lbf_variable"] mu = susie_output["mu"] @@ -181,6 +183,7 @@ def susie_inf_to_studylocus( win = Window.rowsBetween( Window.unboundedPreceding, Window.unboundedFollowing ) + cred_set = ( session.spark.createDataFrame( cred_set.tolist(), diff --git a/tests/gentropy/method/test_susie_inf.py b/tests/gentropy/method/test_susie_inf.py index 393f786d7..e7a5d219f 100644 --- a/tests/gentropy/method/test_susie_inf.py +++ b/tests/gentropy/method/test_susie_inf.py @@ -68,6 +68,7 @@ def test_SUSIE_inf_convert_to_study_locus( est_tausq=False, ) gwas_df = sample_summary_statistics._df.limit(21) + L1 = SusieFineMapperStep.susie_inf_to_studylocus( susie_output=susie_output, session=session, From 5ee023a29de0ba5ac39fca5b14f83078e6dec6bc Mon Sep 17 00:00:00 2001 From: Yakov Tsepilov Date: Wed, 3 Apr 2024 16:07:57 +0100 Subject: [PATCH 30/33] fix: small fix in majpop --- src/gentropy/susie_finemapper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gentropy/susie_finemapper.py b/src/gentropy/susie_finemapper.py index 740b6ecb3..be5de68c1 100644 --- a/src/gentropy/susie_finemapper.py +++ b/src/gentropy/susie_finemapper.py @@ -56,7 +56,7 @@ def susie_finemapper_chr_pos( f.array_max(f.col("ldPopulationStructure")) .getItem("ldPopulation") .alias("majorPopulation"), - ) + ).collect()[0]["majorPopulation"] _region = ( chromosome From 3b0ddda3fc64772bc3e414c7e9dd41dbf05ffa35 Mon Sep 17 00:00:00 2001 From: Yakov Tsepilov Date: Wed, 3 Apr 2024 17:18:43 +0100 Subject: [PATCH 31/33] fix: small fixes in function --- src/gentropy/susie_finemapper.py | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/src/gentropy/susie_finemapper.py 
b/src/gentropy/susie_finemapper.py index be5de68c1..7117295b3 100644 --- a/src/gentropy/susie_finemapper.py +++ b/src/gentropy/susie_finemapper.py @@ -5,8 +5,10 @@ from typing import Any import numpy as np +import pandas as pd import pyspark.sql.functions as f from pyspark.sql import DataFrame, Row, Window +from pyspark.sql.types import IntegerType, StringType, StructField, StructType from gentropy.common.session import Session from gentropy.dataset.study_index import StudyIndex @@ -24,7 +26,7 @@ class SusieFineMapperStep: """ @staticmethod - def susie_finemapper_chr_pos( + def susie_finemapper_one_studylocus_row( GWAS: SummaryStatistics, session: Session, study_locus_row: Row, @@ -45,6 +47,9 @@ def susie_finemapper_chr_pos( Returns: StudyLocus: StudyLocus object with fine-mapped credible sets """ + # PLEASE DO NOT REMOVE THIS LINE + pd.DataFrame.iteritems = pd.DataFrame.items + chromosome = study_locus_row.chromosome position = study_locus_row.position _studyId = study_locus_row.studyId @@ -117,7 +122,24 @@ def susie_finemapper_chr_pos( z_to_fm = GWAS_df["z"].values susie_output = SUSIE_inf.susie_inf(z=z_to_fm, LD=ld_to_fm, L=L) - variant_index = session.spark.createDataFrame(GWAS_df) + + schema = StructType( + [ + StructField("variantId", StringType(), True), + StructField("chromosome", StringType(), True), + StructField("position", IntegerType(), True), + ] + ) + variant_index = session.spark.createDataFrame( + GWAS_df[ + [ + "variantId", + "chromosome", + "position", + ] + ], + schema=schema, + ) return SusieFineMapperStep.susie_inf_to_studylocus( susie_output=susie_output, From 3f8e20ab10c9485a03d254fc21f8e77e7c4c651a Mon Sep 17 00:00:00 2001 From: Daniel Considine Date: Thu, 4 Apr 2024 21:04:13 +0100 Subject: [PATCH 32/33] fix: using more spark before converting to pandas --- src/gentropy/susie_finemapper.py | 70 ++++++++++++++------------------ 1 file changed, 31 insertions(+), 39 deletions(-) diff --git a/src/gentropy/susie_finemapper.py b/src/gentropy/susie_finemapper.py index 7117295b3..8d25f533e 100644 --- a/src/gentropy/susie_finemapper.py +++ b/src/gentropy/susie_finemapper.py @@ -50,20 +50,20 @@ def susie_finemapper_one_studylocus_row( # PLEASE DO NOT REMOVE THIS LINE pd.DataFrame.iteritems = pd.DataFrame.items - chromosome = study_locus_row.chromosome - position = study_locus_row.position - _studyId = study_locus_row.studyId + chromosome = study_locus_row["chromosome"] + position = study_locus_row["position"] + studyId = study_locus_row["studyId"] study_index_df = study_index._df - study_index_df = study_index_df.filter(study_index_df.studyId == _studyId) - _major_population = study_index_df.select( + study_index_df = study_index_df.filter(f.col("studyId") == studyId) + major_population = study_index_df.select( "studyId", f.array_max(f.col("ldPopulationStructure")) .getItem("ldPopulation") .alias("majorPopulation"), ).collect()[0]["majorPopulation"] - _region = ( + region = ( chromosome + ":" + str(int(position - window / 2)) @@ -71,20 +71,19 @@ def susie_finemapper_one_studylocus_row( + str(int(position + window / 2)) ) - GWAS_df = GWAS._df - GWAS_df = GWAS_df.filter( - (f.col("studyId") == _studyId) - & (f.col("chromosome") == chromosome) - & (f.col("position") >= position - (window / 2)) - & (f.col("position") <= position + (window / 2)) - ).withColumn("z", f.col("beta") / f.col("standardError")) + gwas_df = ( + GWAS.df.withColumn("z", f.col("beta") / f.col("standardError")) + .withColumn("chromosome", f.split(f.col("variantId"), "_")[0]) + .withColumn("position", 
f.split(f.col("variantId"), "_")[1]) + .filter(f.col("z").isNotNull()) + ) ld_index = ( GnomADLDMatrix() .get_locus_index( study_locus_row=study_locus_row, window_size=window, - major_population=_major_population, + major_population=major_population, ) .withColumn( "variantId", @@ -100,26 +99,18 @@ def susie_finemapper_one_studylocus_row( ) ) - gnomad_ld = GnomADLDMatrix.get_numpy_matrix( - ld_index, gnomad_ancestry=_major_population - ) - - GWAS_df = GWAS_df.toPandas() - ld_index = ld_index.toPandas() - ld_index = ld_index.reset_index() - # Filtering out the variants that are not in the LD matrix, we don't need them - df_columns = GWAS_df.columns - GWAS_df = GWAS_df.merge(ld_index, on="variantId", how="inner") - GWAS_df = GWAS_df[df_columns].reset_index() + gwas_index = gwas_df.join( + ld_index.select("variantId", "alleles", "idx"), on="variantId" + ).sort("idx") - merged_df = GWAS_df.merge( - ld_index, left_on="variantId", right_on="variantId", how="inner" + gnomad_ld = GnomADLDMatrix.get_numpy_matrix( + gwas_index, gnomad_ancestry=major_population ) - indices = merged_df["index_y"].values - ld_to_fm = gnomad_ld[indices][:, indices] - z_to_fm = GWAS_df["z"].values + pd_df = gwas_index.toPandas() + z_to_fm = np.array(pd_df["z"]) + ld_to_fm = gnomad_ld susie_output = SUSIE_inf.susie_inf(z=z_to_fm, LD=ld_to_fm, L=L) @@ -130,8 +121,9 @@ def susie_finemapper_one_studylocus_row( StructField("position", IntegerType(), True), ] ) + pd_df["position"] = pd_df["position"].astype(int) variant_index = session.spark.createDataFrame( - GWAS_df[ + pd_df[ [ "variantId", "chromosome", @@ -144,8 +136,8 @@ def susie_finemapper_one_studylocus_row( return SusieFineMapperStep.susie_inf_to_studylocus( susie_output=susie_output, session=session, - _studyId=_studyId, - _region=_region, + studyId=studyId, + region=region, variant_index=variant_index, ) @@ -153,8 +145,8 @@ def susie_finemapper_one_studylocus_row( def susie_inf_to_studylocus( susie_output: dict[str, Any], session: Session, - _studyId: str, - _region: str, + studyId: str, + region: str, variant_index: DataFrame, cs_lbf_thr: float = 2, ) -> StudyLocus: @@ -163,8 +155,8 @@ def susie_inf_to_studylocus( Args: susie_output (dict[str, Any]): SuSiE-inf output dictionary session (Session): Spark session - _studyId (str): study ID - _region (str): region + studyId (str): study ID + region (str): region variant_index (DataFrame): DataFrame with variant information cs_lbf_thr (float): credible set logBF threshold, default is 2 @@ -236,8 +228,8 @@ def susie_inf_to_studylocus( .limit(1) .withColumns( { - "studyId": f.lit(_studyId), - "region": f.lit(_region), + "studyId": f.lit(studyId), + "region": f.lit(region), "credibleSetIndex": f.lit(counter), "credibleSetlog10BF": f.lit(cs_lbf_value * 0.4342944819), "finemappingMethod": f.lit("SuSiE-inf"), From d4623688f388155afb83676e640be1810551fa27 Mon Sep 17 00:00:00 2001 From: Yakov Tsepilov Date: Fri, 5 Apr 2024 12:01:29 +0100 Subject: [PATCH 33/33] fix: fix in test --- src/gentropy/susie_finemapper.py | 1 + tests/gentropy/method/test_susie_inf.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/gentropy/susie_finemapper.py b/src/gentropy/susie_finemapper.py index 8d25f533e..03e1c2a0b 100644 --- a/src/gentropy/susie_finemapper.py +++ b/src/gentropy/susie_finemapper.py @@ -75,6 +75,7 @@ def susie_finemapper_one_studylocus_row( GWAS.df.withColumn("z", f.col("beta") / f.col("standardError")) .withColumn("chromosome", f.split(f.col("variantId"), "_")[0]) .withColumn("position", 
f.split(f.col("variantId"), "_")[1]) + .filter(f.col("studyId") == studyId) .filter(f.col("z").isNotNull()) ) diff --git a/tests/gentropy/method/test_susie_inf.py b/tests/gentropy/method/test_susie_inf.py index e7a5d219f..0129a5934 100644 --- a/tests/gentropy/method/test_susie_inf.py +++ b/tests/gentropy/method/test_susie_inf.py @@ -72,8 +72,8 @@ def test_SUSIE_inf_convert_to_study_locus( L1 = SusieFineMapperStep.susie_inf_to_studylocus( susie_output=susie_output, session=session, - _studyId="sample_id", - _region="sample_region", + studyId="sample_id", + region="sample_region", variant_index=gwas_df, cs_lbf_thr=2, )
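
For reference, a minimal sketch of how the step assembled over patches 24-33 might be driven end to end once the series is applied. This is a hypothetical usage sketch, not part of the diffs: the parquet paths are placeholders, and the bare Session() construction and the from_parquet loaders are assumed to be available with these signatures; only SusieFineMapperStep.susie_finemapper_one_studylocus_row and its arguments come from this series.

    # Hypothetical driver script; paths, the default Session() construction and
    # the from_parquet loaders are assumptions -- only the fine-mapper call at
    # the bottom is defined by the patches above.
    from gentropy.common.session import Session
    from gentropy.dataset.study_index import StudyIndex
    from gentropy.dataset.study_locus import StudyLocus
    from gentropy.dataset.summary_statistics import SummaryStatistics
    from gentropy.susie_finemapper import SusieFineMapperStep

    session = Session()  # assumes default (local) Spark settings

    # Load the three inputs the step needs (paths are placeholders).
    gwas = SummaryStatistics.from_parquet(session, "path/to/summary_statistics")
    study_index = StudyIndex.from_parquet(session, "path/to/study_index")
    study_locus = StudyLocus.from_parquet(session, "path/to/study_locus")

    # The step fine-maps one locus at a time: it takes a single StudyLocus row,
    # resolves the study's major LD population via the study index, slices the
    # gnomAD LD matrix around the lead variant, and runs SuSiE-inf on z-scores.
    row = study_locus.df.first()
    credible_sets = SusieFineMapperStep.susie_finemapper_one_studylocus_row(
        GWAS=gwas,
        session=session,
        study_locus_row=row,
        study_index=study_index,
        window=1_000_000,  # locus window in base pairs around the lead variant
        L=10,              # maximum number of causal signals for SuSiE-inf
    )
    credible_sets.df.show()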