Skip to content

Commit

Permalink
feat(data_release): preparation for 24.06 data release (#633)
Browse files Browse the repository at this point in the history
* feat(airflow): drop finngen PICS results from etl
* feat(l2g): allow to publish feature matrix
* fix(types): typo in types
* chore(data_release): bumped release date

---------

Co-authored-by: Szymon Szyszkowski <[email protected]>
  • Loading branch information
project-defiant and Szymon Szyszkowski authored Jun 6, 2024
1 parent daa8331 commit 95f26d0
Show file tree
Hide file tree
Showing 8 changed files with 37 additions and 61 deletions.
3 changes: 2 additions & 1 deletion config/datasets/ot_gcp.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Release specific configuration:
release_version: "24.03"
release_version: "24.06"
dev_version: XX.XX
release_folder: gs://genetics_etl_python_playground/releases/${datasets.release_version}

Expand Down Expand Up @@ -72,6 +72,7 @@ from_sumstats_pics: ${datasets.credible_set}/from_sumstats
l2g_gold_standard_curation: ${datasets.release_folder}/locus_to_gene_gold_standard.json
l2g_model: ${datasets.release_folder}/locus_to_gene_model
l2g_predictions: ${datasets.release_folder}/locus_to_gene_predictions
l2g_feature_matrix: ${datasets.release_folder}/locus_to_gene_feature_matrix
colocalisation: ${datasets.release_folder}/colocalisation
study_index: ${datasets.release_folder}/study_index
variant_index: ${datasets.release_folder}/variant_index
Expand Down
1 change: 1 addition & 0 deletions config/step/ot_locus_to_gene_predict.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ defaults:
run_mode: predict
model_path: ${datasets.l2g_model}
predictions_path: ${datasets.l2g_predictions}
feature_matrix_path: ${datasets.l2g_feature_matrix}
credible_set_path: ${datasets.credible_set}
variant_gene_path: ${datasets.v2g}
colocalisation_path: ${datasets.colocalisation}
Expand Down
34 changes: 2 additions & 32 deletions src/airflow/dags/finngen_preprocess.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Airflow DAG for the Preprocess part of the pipeline."""

from __future__ import annotations

from pathlib import Path
Expand Down Expand Up @@ -59,39 +60,8 @@
],
)

window_based_clumping = common.submit_step(
cluster_name=CLUSTER_NAME,
step_id="window_based_clumping",
task_id="finngen_window_based_clumping",
other_args=[
f"step.summary_statistics_input_path={SUMMARY_STATISTICS}",
f"step.study_locus_output_path={WINDOW_BASED_CLUMPED}",
],
)
ld_clumping = common.submit_step(
cluster_name=CLUSTER_NAME,
step_id="ot_ld_based_clumping",
task_id="finngen_ld_clumping",
other_args=[
f"step.study_locus_input_path={WINDOW_BASED_CLUMPED}",
f"step.study_index_path={STUDY_INDEX}",
f"step.clumped_study_locus_output_path={LD_CLUMPED}",
],
trigger_rule=TriggerRule.ALL_DONE,
)
pics = common.submit_step(
cluster_name=CLUSTER_NAME,
step_id="ot_pics",
task_id="finngen_pics",
other_args=[
f"step.study_locus_ld_annotated_in={LD_CLUMPED}",
f"step.picsed_study_locus_out={PICSED_CREDIBLE_SET}",
],
# This allows attempting to run the task even when the above step fails due to failifexists
trigger_rule=TriggerRule.ALL_DONE,
)
# Define order of steps:
(study_index >> window_based_clumping >> ld_clumping >> pics)
(study_index)
(
common.create_cluster(
CLUSTER_NAME,
Expand Down
9 changes: 1 addition & 8 deletions src/airflow/dags/genetics_etl.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
SOURCE_CONFIG_FILE_PATH = Path(__file__).parent / "configs" / "dag.yaml"

# Release specific variables:
RELEASE_VERSION = "24.03"
RELEASE_VERSION = "24.06"
RELEASE_BUCKET_NAME = "genetics_etl_python_playground"

# Datasource paths:
Expand Down Expand Up @@ -74,13 +74,6 @@
"destination_bucket": RELEASE_BUCKET_NAME,
"destination_object": f"releases/{RELEASE_VERSION}/study_index/finngen",
},
# Finngen PICS credible sets:
"finngen_PICS_credible_set": {
"source_bucket": FINNGEN_BUCKET_NAME,
"source_object": f"{FINNGEN_RELEASE}/credible_set_datasets/finngen_pics",
"destination_bucket": RELEASE_BUCKET_NAME,
"destination_object": f"releases/{RELEASE_VERSION}/credible_set/finngen_pics",
},
# Finngen SuSiE credible sets:
"finngen_susie_credible_set": {
"source_bucket": FINNGEN_BUCKET_NAME,
Expand Down
2 changes: 1 addition & 1 deletion src/gentropy/common/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
]
DataSourceType = Literal[
"gnomad",
"fingenn",
"finngen",
"gwas_catalog",
"eqtl_catalog",
"ukbiobank",
Expand Down
1 change: 1 addition & 0 deletions src/gentropy/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,7 @@ class LocusToGeneConfig(StepConfig):
variant_gene_path: str = MISSING
colocalisation_path: str = MISSING
study_index_path: str = MISSING
feature_matrix_path: str | None = None
gold_standard_curation_path: str | None = None
gene_interactions_path: str | None = None
features_list: list[str] = field(
Expand Down
39 changes: 21 additions & 18 deletions src/gentropy/dataset/l2g_prediction.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def from_credible_set(
study_index: StudyIndex,
v2g: V2G,
coloc: Colocalisation,
) -> L2GPrediction:
) -> tuple[L2GPrediction, L2GFeatureMatrix]:
"""Extract L2G predictions for a set of credible sets derived from GWAS.
Args:
Expand All @@ -60,7 +60,7 @@ def from_credible_set(
coloc (Colocalisation): Colocalisation dataset
Returns:
L2GPrediction: L2G dataset
tuple[L2GPrediction, L2GFeatureMatrix]: L2G predictions and the feature matrix they were computed from, limited to GWAS studies only.
"""
fm = L2GFeatureMatrix.generate_features(
features_list=features_list,
Expand All @@ -79,21 +79,24 @@ def from_credible_set(
),
_schema=cls.get_schema(),
)
return L2GPrediction(
# Load and apply fitted model
_df=(
LocusToGeneModel.load_from_disk(
model_path,
features_list=features_list,
)
.predict(gwas_fm)
# the probability of the positive class is the second element inside the probability array
# - this is selected as the L2G probability
.select(
"studyLocusId",
"geneId",
vector_to_array(f.col("probability"))[1].alias("score"),
)
return (
L2GPrediction(
# Load and apply fitted model
_df=(
LocusToGeneModel.load_from_disk(
model_path,
features_list=features_list,
)
.predict(gwas_fm)
# the probability of the positive class is the second element inside the probability array
# - this is selected as the L2G probability
.select(
"studyLocusId",
"geneId",
vector_to_array(f.col("probability"))[1].alias("score"),
)
),
_schema=cls.get_schema(),
),
_schema=cls.get_schema(),
gwas_fm,
)
9 changes: 8 additions & 1 deletion src/gentropy/l2g.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ def __init__(
gene_interactions_path: str,
features_list: list[str],
hyperparameters: dict[str, Any],
feature_matrix_path: str | None = None,
wandb_run_name: str | None = None,
perform_cross_validation: bool = False,
) -> None:
Expand All @@ -54,6 +55,8 @@ def __init__(
gene_interactions_path (str): Path to gene interactions Parquet files.
features_list (list[str]): List of features to use.
hyperparameters (dict[str, Any]): Hyperparameters for the model.
feature_matrix_path (str | None): Optional path where the raw feature matrix should be stored.
If None, the feature matrix is not published. The feature matrix is published only when `run_mode` is `predict`.
wandb_run_name (str | None): Name of the run to be tracked on W&B.
perform_cross_validation (bool): Whether to perform cross validation.
Expand Down Expand Up @@ -82,9 +85,13 @@ def __init__(
raise ValueError(
"model_path and predictions_path must be set for predict mode."
)
predictions = L2GPrediction.from_credible_set(
predictions, feature_matrix = L2GPrediction.from_credible_set(
model_path, list(features_list), credible_set, studies, v2g, coloc
)
if feature_matrix_path:
feature_matrix.df.write.mode(session.write_mode).parquet(
feature_matrix_path
)
predictions.df.write.mode(session.write_mode).parquet(predictions_path)
session.logger.info(predictions_path)
elif (
Expand Down

0 comments on commit 95f26d0

Please sign in to comment.