Skip to content

Commit

Permalink
Merge branch 'dev' into vh-3448
Browse files Browse the repository at this point in the history
  • Loading branch information
DSuveges authored Sep 25, 2024
2 parents e873353 + 6c4bdf5 commit ce125f9
Show file tree
Hide file tree
Showing 5 changed files with 322 additions and 62 deletions.
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,7 @@ exclude = ["dist"]
addopts = "-n auto --doctest-modules --cov=src/ --cov-report=xml"
pythonpath = ["."]
testpaths = ["tests/gentropy", "src/gentropy"]
marks = ["step_test"]

# Semi-strict mode for mypy
[tool.mypy]
Expand Down
1 change: 1 addition & 0 deletions src/gentropy/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,7 @@ class FinngenStudiesConfig(StepConfig):
)
finngen_summary_stats_url_suffix: str = ".gz"
efo_curation_mapping_url: str = "https://raw.githubusercontent.com/opentargets/curation/24.09.1/mappings/disease/manual_string.tsv"
sample_size: int = 453733 # https://www.finngen.fi/en/access_results#:~:text=Total%20sample%20size%3A%C2%A0453%2C733%C2%A0(254%2C618%C2%A0females%20and%C2%A0199%2C115%20males)
_target_: str = "gentropy.finngen_studies.FinnGenStudiesStep"


Expand Down
19 changes: 10 additions & 9 deletions src/gentropy/datasource/finngen/study_index.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""Study Index for Finngen data source."""
"""Study Index for FinnGen data source."""

from __future__ import annotations

Expand All @@ -8,7 +8,6 @@
import pyspark.sql.functions as f
from pyspark.sql import DataFrame, SparkSession

from gentropy.config import FinngenStudiesConfig
from gentropy.dataset.study_index import StudyIndex


Expand All @@ -30,7 +29,7 @@ class FinnGenStudyIndex:
def join_efo_mapping(
study_index: StudyIndex,
efo_curation_mapping: DataFrame,
finngen_release_prefix: str = FinngenStudiesConfig().finngen_release_prefix,
finngen_release_prefix: str,
) -> StudyIndex:
"""Add EFO mapping to the FinnGen study index table.
Expand Down Expand Up @@ -88,10 +87,11 @@ def join_efo_mapping(
def from_source(
cls: type[FinnGenStudyIndex],
spark: SparkSession,
finngen_phenotype_table_url: str = FinngenStudiesConfig().finngen_phenotype_table_url,
finngen_release_prefix: str = FinngenStudiesConfig().finngen_release_prefix,
finngen_summary_stats_url_prefix: str = FinngenStudiesConfig().finngen_summary_stats_url_prefix,
finngen_summary_stats_url_suffix: str = FinngenStudiesConfig().finngen_summary_stats_url_suffix,
finngen_phenotype_table_url: str,
finngen_release_prefix: str,
finngen_summary_stats_url_prefix: str,
finngen_summary_stats_url_suffix: str,
sample_size: int,
) -> StudyIndex:
"""This function ingests study level metadata from FinnGen.
Expand All @@ -101,6 +101,7 @@ def from_source(
finngen_release_prefix (str): FinnGen release prefix.
finngen_summary_stats_url_prefix (str): FinnGen summary stats URL prefix.
finngen_summary_stats_url_suffix (str): FinnGen summary stats URL suffix.
sample_size (int): Number of individuals that participated in sample collection.
Returns:
StudyIndex: Parsed and annotated FinnGen study table.
Expand All @@ -120,12 +121,12 @@ def from_source(
f.lit(finngen_release_prefix).alias("projectId"),
f.lit("gwas").alias("studyType"),
f.lit(True).alias("hasSumstats"),
f.lit("377,277 (210,870 females and 166,407 males)").alias(
f.lit("453,733 (254,618 females and 199,115 males)").alias(
"initialSampleSize"
),
f.array(
f.struct(
f.lit(377277).cast("integer").alias("sampleSize"),
f.lit(sample_size).cast("integer").alias("sampleSize"),
f.lit("Finnish").alias("ancestry"),
)
).alias("discoverySamples"),
Expand Down
4 changes: 4 additions & 0 deletions src/gentropy/finngen_studies.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ def __init__(
finngen_summary_stats_url_prefix: str = FinngenStudiesConfig().finngen_summary_stats_url_prefix,
finngen_summary_stats_url_suffix: str = FinngenStudiesConfig().finngen_summary_stats_url_suffix,
efo_curation_mapping_url: str = FinngenStudiesConfig().efo_curation_mapping_url,
sample_size: int = FinngenStudiesConfig().sample_size,
) -> None:
"""Run FinnGen study index generation step.
Expand All @@ -32,19 +33,22 @@ def __init__(
finngen_summary_stats_url_prefix (str): FinnGen summary stats URL prefix.
finngen_summary_stats_url_suffix (str): FinnGen summary stats URL suffix.
efo_curation_mapping_url (str): URL to the EFO curation mapping file.
sample_size (int): Number of individuals that participated in sample collection, derived from FinnGen release metadata.
"""
study_index = FinnGenStudyIndex.from_source(
session.spark,
finngen_phenotype_table_url,
finngen_release_prefix,
finngen_summary_stats_url_prefix,
finngen_summary_stats_url_suffix,
sample_size,
)

# NOTE: hack to allow spark to read directly from the URL.
csv_data = urlopen(efo_curation_mapping_url).readlines()
csv_rows = [row.decode("utf8") for row in csv_data]
rdd = session.spark.sparkContext.parallelize(csv_rows)
# NOTE: the type annotations for spark.read.csv omit the fact that the first parameter can also be an RDD[str].
efo_curation_mapping = session.spark.read.csv(rdd, header=True, sep="\t")

study_index_with_efo = FinnGenStudyIndex.join_efo_mapping(
Expand Down
Loading

0 comments on commit ce125f9

Please sign in to comment.