Skip to content

Commit

Permalink
feat(l2g): distance features based on weighted score (#545)
Browse files Browse the repository at this point in the history
* refactor(l2g): streamline coloc feature factory

* perf(l2g): make joins in _get_vep_features lighter

* fix(l2g): use weighted scores for _get_vep_features

* refactor(l2g): minor improvements

* perf(l2g): adapt session to set partition number for shuffling to 800

* feat(l2g): distance features based on weighted score

* docs: update docs

* refactor(l2g): select features of interest in data outside trainer

* chore(l2g): log annotated gold standards in w&b

* fix: update test_train

* fix(l2g): typo in _get_tss_distance_features with no consequences
  • Loading branch information
ireneisdoomed authored Mar 19, 2024
1 parent 923c622 commit 160051c
Showing 1 changed file with 19 additions and 13 deletions.
32 changes: 19 additions & 13 deletions src/gentropy/method/l2g/feature_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,37 +191,43 @@ class StudyLocusFactory(StudyLocus):
"""Feature extraction in study locus."""

@staticmethod
def _get_tss_distance_features(
credible_set: StudyLocus, distances: V2G
) -> L2GFeature:
"""Joins StudyLocus with the V2G to extract the minimum distance to a gene TSS of all variants in a StudyLocus credible set.
def _get_tss_distance_features(credible_set: StudyLocus, v2g: V2G) -> L2GFeature:
"""Joins StudyLocus with the V2G to extract a score that is based on the distance to a gene TSS of any variant weighted by its posterior probability in a credible set.
Args:
credible_set (StudyLocus): Credible set dataset
distances (V2G): Dataframe containing the distances of all variants to all genes TSS within a region
v2g (V2G): Dataframe containing the distances of all variants to all genes TSS within a region
Returns:
L2GFeature: Stores the features with the minimum distance among all variants in the credible set and a gene TSS.
L2GFeature: Stores the features with the score of weighting the distance to the TSS by the posterior probability of the variant
"""
wide_df = (
credible_set.filter_credible_set(CredibleInterval.IS95)
.df.select(
.df.withColumn("variantInLocus", f.explode_outer("locus"))
.select(
"studyLocusId",
"variantId",
f.explode("locus.variantId").alias("tagVariantId"),
f.col("variantInLocus.variantId").alias("variantInLocusId"),
f.col("variantInLocus.posteriorProbability").alias(
"variantInLocusPosteriorProbability"
),
)
.join(
distances.df.selectExpr(
"variantId as tagVariantId", "geneId", "distance"
v2g.df.filter(f.col("datasourceId") == "canonical_tss").selectExpr(
"variantId as variantInLocusId", "geneId", "score"
),
on="tagVariantId",
on="variantInLocusId",
how="inner",
)
.withColumn(
"weightedScore",
f.col("score") * f.col("variantInLocusPosteriorProbability"),
)
.groupBy("studyLocusId", "geneId")
.agg(
f.min("distance").alias("distanceTssMinimum"),
f.mean("distance").alias("distanceTssMean"),
f.min("weightedScore").alias("distanceTssMinimum"),
f.mean("weightedScore").alias("distanceTssMean"),
)
)

Expand Down

0 comments on commit 160051c

Please sign in to comment.