feat(data_release): preparation for 24.06 data release (#633)

* feat(airflow): drop finngen PICS results from etl
* feat(l2g): allow to publish feature matrix
* fix(types): typo in types
* chore(data_release): bumped release date

---------

Co-authored-by: Szymon Szyszkowski <[email protected]>
opentargets · Jun 6, 2024 · 95f26d0 · 95f26d0
1 parent daa8331
commit 95f26d0
Show file tree

Hide file tree

Showing 8 changed files with 37 additions and 61 deletions.
diff --git a/config/datasets/ot_gcp.yaml b/config/datasets/ot_gcp.yaml
@@ -1,5 +1,5 @@
 # Release specific configuration:
-release_version: "24.03"
+release_version: "24.06"
 dev_version: XX.XX
 release_folder: gs://genetics_etl_python_playground/releases/${datasets.release_version}
 
@@ -72,6 +72,7 @@ from_sumstats_pics: ${datasets.credible_set}/from_sumstats
 l2g_gold_standard_curation: ${datasets.release_folder}/locus_to_gene_gold_standard.json
 l2g_model: ${datasets.release_folder}/locus_to_gene_model
 l2g_predictions: ${datasets.release_folder}/locus_to_gene_predictions
+l2g_feature_matrix: ${datasets.release_folder}/locus_to_gene_feature_matrix
 colocalisation: ${datasets.release_folder}/colocalisation
 study_index: ${datasets.release_folder}/study_index
 variant_index: ${datasets.release_folder}/variant_index

diff --git a/config/step/ot_locus_to_gene_predict.yaml b/config/step/ot_locus_to_gene_predict.yaml
@@ -4,6 +4,7 @@ defaults:
 run_mode: predict
 model_path: ${datasets.l2g_model}
 predictions_path: ${datasets.l2g_predictions}
+feature_matrix_path: ${datasets.l2g_feature_matrix}
 credible_set_path: ${datasets.credible_set}
 variant_gene_path: ${datasets.v2g}
 colocalisation_path: ${datasets.colocalisation}

diff --git a/src/airflow/dags/finngen_preprocess.py b/src/airflow/dags/finngen_preprocess.py
@@ -1,4 +1,5 @@
 """Airflow DAG for the Preprocess part of the pipeline."""
+
 from __future__ import annotations
 
 from pathlib import Path
@@ -59,39 +60,8 @@
             ],
         )
 
-        window_based_clumping = common.submit_step(
-            cluster_name=CLUSTER_NAME,
-            step_id="window_based_clumping",
-            task_id="finngen_window_based_clumping",
-            other_args=[
-                f"step.summary_statistics_input_path={SUMMARY_STATISTICS}",
-                f"step.study_locus_output_path={WINDOW_BASED_CLUMPED}",
-            ],
-        )
-        ld_clumping = common.submit_step(
-            cluster_name=CLUSTER_NAME,
-            step_id="ot_ld_based_clumping",
-            task_id="finngen_ld_clumping",
-            other_args=[
-                f"step.study_locus_input_path={WINDOW_BASED_CLUMPED}",
-                f"step.study_index_path={STUDY_INDEX}",
-                f"step.clumped_study_locus_output_path={LD_CLUMPED}",
-            ],
-            trigger_rule=TriggerRule.ALL_DONE,
-        )
-        pics = common.submit_step(
-            cluster_name=CLUSTER_NAME,
-            step_id="ot_pics",
-            task_id="finngen_pics",
-            other_args=[
-                f"step.study_locus_ld_annotated_in={LD_CLUMPED}",
-                f"step.picsed_study_locus_out={PICSED_CREDIBLE_SET}",
-            ],
-            # This allows attempting to run the task when the above step fails due to failifexists
-            trigger_rule=TriggerRule.ALL_DONE,
-        )
         # Define order of steps:
-        (study_index >> window_based_clumping >> ld_clumping >> pics)
+        (study_index)
     (
         common.create_cluster(
             CLUSTER_NAME,

diff --git a/src/airflow/dags/genetics_etl.py b/src/airflow/dags/genetics_etl.py
@@ -14,7 +14,7 @@
 SOURCE_CONFIG_FILE_PATH = Path(__file__).parent / "configs" / "dag.yaml"
 
 # Release specific variables:
-RELEASE_VERSION = "24.03"
+RELEASE_VERSION = "24.06"
 RELEASE_BUCKET_NAME = "genetics_etl_python_playground"
 
 # Datasource paths:
@@ -74,13 +74,6 @@
         "destination_bucket": RELEASE_BUCKET_NAME,
         "destination_object": f"releases/{RELEASE_VERSION}/study_index/finngen",
     },
-    # Finngen summary statistics:
-    "finngen_PICS_credible_set": {
-        "source_bucket": FINNGEN_BUCKET_NAME,
-        "source_object": f"{FINNGEN_RELEASE}/credible_set_datasets/finngen_pics",
-        "destination_bucket": RELEASE_BUCKET_NAME,
-        "destination_object": f"releases/{RELEASE_VERSION}/credible_set/finngen_pics",
-    },
     # Finngen SuSiE credible sets:
     "finngen_susie_credible_set": {
         "source_bucket": FINNGEN_BUCKET_NAME,

diff --git a/src/gentropy/common/types.py b/src/gentropy/common/types.py
@@ -9,7 +9,7 @@
 ]
 DataSourceType = Literal[
     "gnomad",
-    "fingenn",
+    "finngen",
     "gwas_catalog",
     "eqtl_catalog",
     "ukbiobank",

diff --git a/src/gentropy/config.py b/src/gentropy/config.py
@@ -215,6 +215,7 @@ class LocusToGeneConfig(StepConfig):
     variant_gene_path: str = MISSING
     colocalisation_path: str = MISSING
     study_index_path: str = MISSING
+    feature_matrix_path: str | None = None
     gold_standard_curation_path: str | None = None
     gene_interactions_path: str | None = None
     features_list: list[str] = field(

diff --git a/src/gentropy/dataset/l2g_prediction.py b/src/gentropy/dataset/l2g_prediction.py
@@ -48,7 +48,7 @@ def from_credible_set(
         study_index: StudyIndex,
         v2g: V2G,
         coloc: Colocalisation,
-    ) -> L2GPrediction:
+    ) -> tuple[L2GPrediction, L2GFeatureMatrix]:
         """Extract L2G predictions for a set of credible sets derived from GWAS.
 
         Args:
@@ -60,7 +60,7 @@ def from_credible_set(
             coloc (Colocalisation): Colocalisation dataset
 
         Returns:
-            L2GPrediction: L2G dataset
+            tuple[L2GPrediction, L2GFeatureMatrix]: L2G predictions and the feature matrix they were computed from, limited to GWAS studies only.
         """
         fm = L2GFeatureMatrix.generate_features(
             features_list=features_list,
@@ -79,21 +79,24 @@ def from_credible_set(
             ),
             _schema=cls.get_schema(),
         )
-        return L2GPrediction(
-            # Load and apply fitted model
-            _df=(
-                LocusToGeneModel.load_from_disk(
-                    model_path,
-                    features_list=features_list,
-                )
-                .predict(gwas_fm)
-                # the probability of the positive class is the second element inside the probability array
-                # - this is selected as the L2G probability
-                .select(
-                    "studyLocusId",
-                    "geneId",
-                    vector_to_array(f.col("probability"))[1].alias("score"),
-                )
+        return (
+            L2GPrediction(
+                # Load and apply fitted model
+                _df=(
+                    LocusToGeneModel.load_from_disk(
+                        model_path,
+                        features_list=features_list,
+                    )
+                    .predict(gwas_fm)
+                    # the probability of the positive class is the second element inside the probability array
+                    # - this is selected as the L2G probability
+                    .select(
+                        "studyLocusId",
+                        "geneId",
+                        vector_to_array(f.col("probability"))[1].alias("score"),
+                    )
+                ),
+                _schema=cls.get_schema(),
             ),
-            _schema=cls.get_schema(),
+            gwas_fm,
         )
diff --git a/src/gentropy/l2g.py b/src/gentropy/l2g.py
@@ -36,6 +36,7 @@ def __init__(
         gene_interactions_path: str,
         features_list: list[str],
         hyperparameters: dict[str, Any],
+        feature_matrix_path: str | None = None,
         wandb_run_name: str | None = None,
         perform_cross_validation: bool = False,
     ) -> None:
@@ -54,6 +55,8 @@ def __init__(
             gene_interactions_path (str): Path to gene interactions Parquet files.
             features_list (list[str]): List of features to use.
             hyperparameters (dict[str, Any]): Hyperparameters for the model.
+            feature_matrix_path (str | None): Optional path where the raw feature matrix should be stored.
+                If None, the feature matrix is not published. The feature matrix is published only when `run_mode` is `predict`.
             wandb_run_name (str | None): Name of the run to be tracked on W&B.
             perform_cross_validation (bool): Whether to perform cross validation.
 
@@ -82,9 +85,13 @@ def __init__(
                 raise ValueError(
                     "model_path and predictions_path must be set for predict mode."
                 )
-            predictions = L2GPrediction.from_credible_set(
+            predictions, feature_matrix = L2GPrediction.from_credible_set(
                 model_path, list(features_list), credible_set, studies, v2g, coloc
             )
+            if feature_matrix_path:
+                feature_matrix.df.write.mode(session.write_mode).parquet(
+                    feature_matrix_path
+                )
             predictions.df.write.mode(session.write_mode).parquet(predictions_path)
             session.logger.info(predictions_path)
         elif (