From fd3154ab463c85d4f123f97c11f6fcb2965b2bd7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Irene=20L=C3=B3pez?= <45119610+ireneisdoomed@users.noreply.github.com> Date: Thu, 6 Jun 2024 17:00:03 +0100 Subject: [PATCH] feat(qtl): ingest credible sets from single cell derived QTLs (#630) * chore: prototype ingestion of sceqtls * chore: use credible sets from ftp * feat(study_index): rename tissuefromsourceid to biosamplefromsourceid to accommodate cell type ids * chore: update output paths to final destination --- src/airflow/dags/eqtl_preprocess.py | 8 ++++---- src/gentropy/assets/schemas/study_index.json | 2 +- src/gentropy/datasource/eqtl_catalogue/finemapping.py | 3 ++- src/gentropy/datasource/eqtl_catalogue/study_index.py | 5 +++-- 4 files changed, 10 insertions(+), 8 deletions(-) diff --git a/src/airflow/dags/eqtl_preprocess.py b/src/airflow/dags/eqtl_preprocess.py index 5433d8968..aef70085d 100644 --- a/src/airflow/dags/eqtl_preprocess.py +++ b/src/airflow/dags/eqtl_preprocess.py @@ -15,9 +15,9 @@ AUTOSCALING = "eqtl-preprocess" PROJECT_ID = "open-targets-genetics-dev" -EQTL_CATALOG_SUSIE_LOCATION = "gs://eqtl_catalogue_data/ebi_ftp/susie" -TEMP_DECOMPRESS_LOCATION = "gs://eqtl_catalogue_data/susie_decompressed_tmp" -DECOMPRESS_FAILED_LOG = f"{TEMP_DECOMPRESS_LOCATION}.log" +EQTL_CATALOGUE_SUSIE_LOCATION = "gs://eqtl_catalogue_data/ebi_ftp/susie" +TEMP_DECOMPRESS_LOCATION = f"{EQTL_CATALOGUE_SUSIE_LOCATION}_decompressed_tmp" +DECOMPRESS_FAILED_LOG = f"{TEMP_DECOMPRESS_LOCATION}/logs.log" STUDY_INDEX_PATH = "gs://eqtl_catalogue_data/study_index" CREDIBLE_SET_PATH = "gs://eqtl_catalogue_data/credible_set_datasets/susie" @@ -35,7 +35,7 @@ location="europe-west1", project_id=PROJECT_ID, parameters={ - "inputFilePattern": f"{EQTL_CATALOG_SUSIE_LOCATION}/**/*.gz", + "inputFilePattern": f"{EQTL_CATALOGUE_SUSIE_LOCATION}/**/*.gz", "outputDirectory": TEMP_DECOMPRESS_LOCATION, "outputFailureFile": DECOMPRESS_FAILED_LOG, }, diff --git a/src/gentropy/assets/schemas/study_index.json b/src/gentropy/assets/schemas/study_index.json index e529fafd2..de87fc2fa 100644 --- a/src/gentropy/assets/schemas/study_index.json +++ b/src/gentropy/assets/schemas/study_index.json @@ -42,7 +42,7 @@ "metadata": {} }, { - "name": "tissueFromSourceId", + "name": "biosampleFromSourceId", "type": "string", "nullable": true, "metadata": {} diff --git a/src/gentropy/datasource/eqtl_catalogue/finemapping.py b/src/gentropy/datasource/eqtl_catalogue/finemapping.py index ff55ead72..c6cf58326 100644 --- a/src/gentropy/datasource/eqtl_catalogue/finemapping.py +++ b/src/gentropy/datasource/eqtl_catalogue/finemapping.py @@ -1,4 +1,5 @@ """Process SuSIE finemapping results from eQTL Catalogue.""" + from __future__ import annotations from dataclasses import dataclass @@ -190,7 +191,7 @@ def parse_susie_results( f.col("sample_group"), f.col("molecular_trait_id"), ).alias("studyId"), - f.col("tissue_id").alias("tissueFromSourceId"), + f.col("tissue_id").alias("biosampleFromSourceId"), EqtlCatalogueStudyIndex._identify_study_type( f.col("quant_method") ).alias("studyType"), diff --git a/src/gentropy/datasource/eqtl_catalogue/study_index.py b/src/gentropy/datasource/eqtl_catalogue/study_index.py index 71cec1ec0..628d69a4b 100644 --- a/src/gentropy/datasource/eqtl_catalogue/study_index.py +++ b/src/gentropy/datasource/eqtl_catalogue/study_index.py @@ -43,7 +43,7 @@ class EqtlCatalogueStudyIndex: StructField("quant_method", StringType(), True), ] ) - raw_studies_metadata_path = "https://raw.githubusercontent.com/eQTL-Catalogue/eQTL-Catalogue-resources/19929ff6a99bf402194292a14f96f9615b35f65f/data_tables/dataset_metadata.tsv" + raw_studies_metadata_path = "https://raw.githubusercontent.com/eQTL-Catalogue/eQTL-Catalogue-resources/4c8ca340e3eb2878073b290785cb8ff1a4c788f8/data_tables/dataset_metadata_upcoming.tsv" @classmethod def _identify_study_type( @@ -85,7 +85,8 @@ def _identify_study_type( @classmethod def get_studies_of_interest( - cls: type[EqtlCatalogueStudyIndex], studies_metadata: DataFrame + cls: type[EqtlCatalogueStudyIndex], + studies_metadata: DataFrame, ) -> list[str]: """Filter studies of interest from the raw studies metadata.