Skip to content

Commit

Permalink
fix(sumstats): correct study id for dir of finngen studies
Browse files (browse the repository at this point in the history)
  • Loading branch information
louwenjjr committed Mar 20, 2024
1 parent 650bb2e commit 9475542
Showing 1 changed file with 5 additions and 2 deletions.
7 changes: 5 additions & 2 deletions src/gentropy/datasource/finngen/summary_stats.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -50,7 +50,6 @@ def from_source(
Returns:
SummaryStatistics: Processed summary statistics dataset
"""
study_id = raw_file.split("/")[-1].split(".")[0].upper()
processed_summary_stats_df = (
spark.read.schema(cls.raw_schema)
.option("delimiter", "\t")
Expand All @@ -59,7 +58,11 @@ def from_source(
.filter(f.col("pos").cast(t.IntegerType()).isNotNull())
.select(
# From the full path, extracts just the filename, and converts to upper case to get the study ID.
f.lit(study_id).alias("studyId"),
f.upper(
f.regexp_extract(
f.input_file_name(), r"([^/]+)(\.tsv\.gz|\.gz|\.tsv)", 1
)
).alias("studyId"),
# Add variant information.
f.concat_ws(
"_",
Expand Down

0 comments on commit 9475542

Please sign in to comment.