Skip to content

Commit

Permalink
perf(l2g): more elegant rewrite ColocalisationFactory._get_max_coloc_per_credible_set
Browse files Browse the repository at this point in the history
  • Loading branch information
ireneisdoomed committed Mar 20, 2024
1 parent f788e73 commit 75bafd5
Showing 1 changed file with 9 additions and 28 deletions.
37 changes: 9 additions & 28 deletions src/gentropy/method/l2g/feature_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
import pyspark.sql.functions as f

from gentropy.common.spark_helpers import (
convert_from_long_to_wide,
convert_from_wide_to_long,
get_record_with_maximum_value,
)
Expand Down Expand Up @@ -121,44 +120,26 @@ def _get_max_coloc_per_credible_set(
)
).drop("tmp_nbh_max_score", "local_max_score")

wide_df = convert_from_long_to_wide(
df=(
return L2GFeature(
_df=(
# Combine local and neighborhood metrics
local_max.unionByName(
neighbourhood_max, allowMissingColumns=True
).withColumn(
"featureName",
).select(
"studyLocusId",
"geneId",
# Feature name is a concatenation of the QTL type, colocalisation metric and if it's local or in the vicinity
f.concat_ws(
"",
f.col("right_studyType"),
f.lit("Coloc"),
f.col("colocalisationMetric"),
f.lit("Maximum"),
f.col("score_type"),
),
).alias("featureName"),
f.col("max_score").alias("featureValue"),
)
),
id_vars=["studyLocusId", "geneId"],
var_name="featureName",
value_name="max_score",
)

return L2GFeature(
_df=convert_from_wide_to_long(
wide_df.groupBy("studyLocusId", "geneId").agg(
*(
f.first(f.col(c), ignorenulls=True).alias(c)
for c in wide_df.columns
if c
not in [
"studyLocusId",
"geneId",
]
)
),
id_vars=("studyLocusId", "geneId"),
var_name="featureName",
value_name="featureValue",
).filter(f.col("featureValue").isNotNull()),
_schema=L2GFeature.get_schema(),
)

Expand Down

0 comments on commit 75bafd5

Please sign in to comment.