feat: adding VEP predictor
DSuveges committed Nov 15, 2024 · 1 parent 7654a88 · commit 6f2fd97
Showing 2 changed files with 68 additions and 62 deletions.
87 changes: 38 additions & 49 deletions src/gentropy/dataset/variant_index.py
@@ -303,9 +303,12 @@ def get_loftee(self: VariantIndex) -> DataFrame:
class InSilicoPredictorNormaliser:
    """Class to normalise in silico predictor assessments.
-   Essentially based on the raw scores, it normalises the scores to a range between 0 and 1, and appends the normalised
+   Essentially based on the raw scores, it normalises the scores to a range between -1 and 1, and appends the normalised
    value to the in silico predictor struct.
+   More negative values indicate an increasingly confident prediction of a benign variant,
+   while more positive values indicate an increasingly deleterious predicted effect.
+   The point of these operations is to make the scores comparable across different in silico predictors.
    """

@@ -363,6 +366,7 @@ def resolve_predictor_methods(
            .when(method == "phred scaled CADD", cls._normalise_cadd(score))
            .when(method == "SpliceAI", cls._normalise_splice_ai(score))
            .when(method == "Pangolin", cls._normalise_pangolin(score))
+           .when(method == "VEP", score)
        )
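For context (not part of the diff): a Spark when-chain with no trailing otherwise evaluates to NULL for rows that match no branch, so the new branch is what lets VEP scores, presumably already on the target scale, pass through unchanged instead of being dropped. A minimal sketch of the dispatch pattern on a toy DataFrame:

    from pyspark.sql import SparkSession
    import pyspark.sql.functions as f

    spark = SparkSession.builder.getOrCreate()
    toy = spark.createDataFrame(
        [("VEP", 0.66), ("Pangolin", -0.2), ("unsupported", 0.9)],
        ["method", "score"],
    )
    normalised = (
        f.when(f.col("method") == "VEP", f.col("score"))  # pass-through branch
        .when(f.col("method") == "Pangolin", f.abs(f.col("score")))  # stand-in normaliser
    )
    toy.withColumn("normalisedScore", normalised).show()
    # "unsupported" matches no branch and falls through to NULL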

@staticmethod
@@ -397,10 +401,10 @@ def _normalise_cadd(
        """Normalise CADD scores.
        Logic: CADD scores are divided into four ranges and scaled accordingly:
-       - 0-10 -> 0-0.025 (low impact)
-       - 10-20 -> 0.025-0.5 (weak impact)
-       - 20-30 -> 0.5-0.75 (moderate impact)
-       - 30-81 -> 0.75-1 (high impact)
+       - 0-10 -> -1 to 0 (likely benign, ~2M variants)
+       - 10-20 -> 0 to 0.5 (potentially deleterious, ~300k)
+       - 20-30 -> 0.5 to 0.75 (likely deleterious, ~350k)
+       - 30-81 -> 0.75 to 1 (highly likely deleterious, ~86k)
        Args:
            score (Column): CADD score.
@@ -409,15 +413,9 @@ def _normalise_cadd(
            Column: Normalised CADD score.
        """
        return (
-           f.when(score <= 10, cls._rescaleColumnValue(score, 0, 10, 0, 0.25))
-           .when(
-               (score > 10) & (score <= 20),
-               cls._rescaleColumnValue(score, 10, 20, 0.25, 0.5),
-           )
-           .when(
-               (score > 20) & (score <= 30),
-               cls._rescaleColumnValue(score, 20, 30, 0.5, 0.75),
-           )
+           f.when(score <= 10, cls._rescaleColumnValue(score, 0, 10, -1.0, 0.0))
+           .when(score <= 20, cls._rescaleColumnValue(score, 10, 20, 0.0, 0.5))
+           .when(score <= 30, cls._rescaleColumnValue(score, 20, 30, 0.5, 0.75))
            .when(score > 30, cls._rescaleColumnValue(score, 30, 81, 0.75, 1))
        )
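Every branch above delegates to cls._rescaleColumnValue, a helper outside this diff. Assuming it is a plain linear min-max rescale (an assumption; the project's actual helper may differ in signature or edge-case handling), it would look roughly like:

    from pyspark.sql import Column

    def rescale_column_value(
        column: Column,
        min_value: float,
        max_value: float,
        minimum: float,
        maximum: float,
    ) -> Column:
        # Linearly map [min_value, max_value] onto [minimum, maximum].
        return (column - min_value) / (max_value - min_value) * (maximum - minimum) + minimum

Under that assumption, a raw CADD of 15 lands at (15 - 10) / 10 * 0.5 + 0.0 = 0.25, in the middle of the new "potentially deleterious" band.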

@@ -429,8 +427,8 @@ def _normalise_loftee(
        """Normalise LOFTEE scores.
        Logic: LOFTEE scores are divided into two categories:
-       - HC (high confidence): 1.0
-       - LC (low confidence): 0.85
+       - HC (high confidence): 1.0 (~120k)
+       - LC (low confidence): 0.85 (~18k)
        The normalised score is calculated based on the category the score falls into.
        Args:
@@ -468,22 +466,22 @@ def _normalise_sift(
            f.when(
                (1 - f.round(score.cast(t.DoubleType()), 2) >= 0.95)
                & (assessment == "deleterious"),
-               cls._rescaleColumnValue(1 - score, 0.95, 1, 0.75, 1),
+               cls._rescaleColumnValue(1 - score, 0.95, 1, 0.5, 1),
            )
            .when(
                (1 - f.round(score.cast(t.DoubleType()), 2) >= 0.95)
                & (assessment == "deleterious_low_confidence"),
-               cls._rescaleColumnValue(1 - score, 0.95, 1, 0.5, 0.75),
+               cls._rescaleColumnValue(1 - score, 0.95, 1, 0, 0.5),
            )
            .when(
                (1 - f.round(score.cast(t.DoubleType()), 2) <= 0.95)
                & (assessment == "tolerated_low_confidence"),
-               cls._rescaleColumnValue(1 - score, 0, 0.95, 0.25, 0.5),
+               cls._rescaleColumnValue(1 - score, 0, 0.95, -0.5, 0.0),
            )
            .when(
                (1 - f.round(score.cast(t.DoubleType()), 2) <= 0.95)
                & (assessment == "tolerated"),
-               cls._rescaleColumnValue(1 - score, 0, 0.95, 0, 0.25),
+               cls._rescaleColumnValue(1 - score, 0, 0.95, -1, -0.5),
            )
        )
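A worked example of the SIFT mapping under the same linear-rescale assumption (plain Python mirroring the Column arithmetic above):

    # SIFT reports 0 (deleterious) to 1 (tolerated); the normaliser inverts it.
    sift_score = 0.02                # strongly deleterious raw SIFT value
    inverted = 1 - sift_score        # 0.98, above the 0.95 confidence cut-off
    # The "deleterious" branch rescales [0.95, 1] onto [0.5, 1]:
    normalised = (inverted - 0.95) / (1 - 0.95) * (1 - 0.5) + 0.5
    assert round(normalised, 2) == 0.8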

@@ -496,9 +494,9 @@ def _normalise_polyphen(
        """Normalise PolyPhen scores.
        Logic: PolyPhen scores are divided into three categories:
-       - benign: 0-0.446: 0-0.25
-       - possibly_damaging: 0.446-0.908: 0.25-0.75
-       - probably_damaging: 0.908-1: 0.75-1
+       - benign: 0-0.446: -1 to -0.25
+       - possibly_damaging: 0.446-0.908: -0.25 to 0.25
+       - probably_damaging: 0.908-1: 0.25 to 1
        - if assessment is unknown: None
        Args:
@@ -510,15 +508,12 @@ def _normalise_polyphen(
        """
        return (
            f.when(assessment == "unknown", f.lit(None).cast(t.DoubleType()))
-           .when(
-               score <= 0.446,
-               cls._rescaleColumnValue(score, 0, 0.446, 0, 0.25),
-           )
+           .when(score <= 0.446, cls._rescaleColumnValue(score, 0, 0.446, -1.0, -0.25))
            .when(
                score <= 0.908,
-               cls._rescaleColumnValue(score, 0.446, 0.908, 0.25, 0.75),
+               cls._rescaleColumnValue(score, 0.446, 0.908, -0.25, 0.25),
            )
-           .when(score > 0.908, cls._rescaleColumnValue(score, 0.908, 1, 0.75, 1))
+           .when(score > 0.908, cls._rescaleColumnValue(score, 0.908, 1.0, 0.25, 1.0))
        )
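One detail worth noting in the branch above (context, not part of the change): the unknown assessment returns a typed NULL, because a bare f.lit(None) carries NullType and would not match the DoubleType produced by the numeric branches. The pattern in isolation:

    import pyspark.sql.functions as f
    import pyspark.sql.types as t

    # NULL literal with an explicit DoubleType, keeping the when-chain's
    # output type consistent across all branches:
    typed_null = f.lit(None).cast(t.DoubleType())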

@classmethod
@@ -529,9 +524,9 @@ def _normalise_alpha_missense(
        """Normalise AlphaMissense scores.
        Logic: AlphaMissense scores are divided into three categories:
-       - 0-0.06: 0-0.5
-       - 0.06-0.77: 0.25-0.75
-       - 0.77-1: 0.75-1
+       - 0-0.06: -1.0 to -0.25
+       - 0.06-0.77: -0.25 to 0.25
+       - 0.77-1: 0.25 to 1
        Args:
            score (Column): AlphaMissense score.
@@ -540,12 +535,9 @@ def _normalise_alpha_missense(
            Column: Normalised AlphaMissense score.
        """
        return (
-           f.when(score < 0.06, cls._rescaleColumnValue(score, 0, 0.06, 0, 0.5))
-           .when(
-               (score >= 0.06) & (score < 0.77),
-               cls._rescaleColumnValue(score, 0.06, 0.77, 0.25, 0.75),
-           )
-           .when(score >= 0.77, cls._rescaleColumnValue(score, 0.77, 1, 0.75, 1))
+           f.when(score < 0.06, cls._rescaleColumnValue(score, 0, 0.06, -1.0, -0.25))
+           .when(score < 0.77, cls._rescaleColumnValue(score, 0.06, 0.77, -0.25, 0.25))
+           .when(score >= 0.77, cls._rescaleColumnValue(score, 0.77, 1, 0.25, 1))
        )

@classmethod
@@ -556,9 +548,9 @@ def _normalise_splice_ai(
        """Normalise SpliceAI scores.
        Logic: SpliceAI scores are divided into three categories:
-       - 0-0.2: 0-0.25
-       - 0.2-0.5: 0.5-0.75
-       - 0.5-1: 0.75-1
+       - 0-0.2: -1.0 to -0.25
+       - 0.2-0.5: -0.25 to 0.25
+       - 0.5-1: 0.25 to 1
        Args:
            score (Column): SpliceAI score.
@@ -567,12 +559,9 @@ def _normalise_splice_ai(
            Column: Normalised SpliceAI score.
        """
        return (
-           f.when(score <= 0.2, cls._rescaleColumnValue(score, 0, 0.2, 0.25, 0.5))
-           .when(
-               score <= 0.5,
-               cls._rescaleColumnValue(score, 0.2, 0.5, 0.5, 0.75),
-           )
-           .when(score > 0.5, cls._rescaleColumnValue(score, 0.5, 1, 0.75, 1))
+           f.when(score <= 0.2, cls._rescaleColumnValue(score, 0, 0.2, -1, -0.25))
+           .when(score <= 0.5, cls._rescaleColumnValue(score, 0.2, 0.5, -0.25, 0.25))
+           .when(score > 0.5, cls._rescaleColumnValue(score, 0.5, 1, 0.25, 1))
        )

@classmethod
@@ -593,8 +582,8 @@ def _normalise_pangolin(
            Column: Normalised Pangolin score.
        """
        return f.when(
-           f.abs(score) > 0.14, cls._rescaleColumnValue(f.abs(score), 0.14, 1, 0.75, 1)
+           f.abs(score) > 0.14, cls._rescaleColumnValue(f.abs(score), 0.14, 1, 0.0, 1)
        ).when(
            f.abs(score) <= 0.14,
-           cls._rescaleColumnValue(f.abs(score), 0, 0.14, 0.25, 0.75),
+           cls._rescaleColumnValue(f.abs(score), 0, 0.14, -1, 0.0),
        )
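Pangolin emits signed splice gain/loss deltas, which is why the normaliser works on the absolute value. A worked check under the linear-rescale assumption:

    pangolin_score = -0.5            # splice-loss delta; the sign is dropped
    magnitude = abs(pangolin_score)  # 0.5, above the 0.14 threshold
    # The > 0.14 branch rescales [0.14, 1] onto [0.0, 1]:
    normalised = (magnitude - 0.14) / (1 - 0.14) * (1 - 0.0) + 0.0
    assert round(normalised, 2) == 0.42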
43 changes: 30 additions & 13 deletions src/gentropy/datasource/ensembl/vep_parser.py
@@ -41,6 +41,18 @@ class VariantEffectPredictorParser:
    # Schema for the allele frequency column:
    ALLELE_FREQUENCY_SCHEMA = VariantIndex.get_schema()["alleleFrequencies"].dataType

+   # Consequence label to sequence ontology identifier map:
+   SEQUENCE_ONTOLOGY_MAP = {
+       item["label"]: item["id"]
+       for item in VariantIndexConfig.consequence_to_pathogenicity_score
+   }
+
+   # Consequence label to pathogenicity score map:
+   LABEL_TO_SCORE_MAP = {
+       item["label"]: item["score"]
+       for item in VariantIndexConfig.consequence_to_pathogenicity_score
+   }
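For orientation: both class-level maps are built from the same config list, whose entries couple a consequence label with its Sequence Ontology identifier and a pathogenicity score. An illustrative sketch of the shape (the labels and SO identifiers are real ontology terms, but the scores here are placeholders; consult VariantIndexConfig for the actual values):

    consequence_to_pathogenicity_score = [
        {"id": "SO_0001587", "label": "stop_gained", "score": 1.0},
        {"id": "SO_0001583", "label": "missense_variant", "score": 0.66},
        {"id": "SO_0001819", "label": "synonymous_variant", "score": 0.05},
    ]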

@staticmethod
def get_schema() -> t.StructType:
"""Return the schema of the VEP output.
@@ -328,6 +340,17 @@ def _get_most_severe_transcript(
            lambda transcript: transcript.getItem(score_field_name).isNotNull(),
        )[0]

+   @classmethod
+   @enforce_schema(IN_SILICO_PREDICTOR_SCHEMA)
+   def _get_vep_prediction(cls, most_severe_consequence: Column) -> Column:
+       """Build the VEP in silico predictor struct from the most severe consequence.
+
+       Args:
+           most_severe_consequence (Column): most severe consequence label.
+
+       Returns:
+           Column: struct with method name, assessment and pathogenicity score.
+       """
+       return f.struct(
+           f.lit("VEP").alias("method"),
+           most_severe_consequence.alias("assessment"),
+           map_column_by_dictionary(
+               most_severe_consequence, cls.LABEL_TO_SCORE_MAP
+           ).alias("score"),
+       )
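map_column_by_dictionary is a gentropy utility that sits outside this diff. A minimal sketch of what such a helper presumably does (an assumption, not the project's actual implementation): build a literal Spark map and look the column up in it, yielding NULL for unmapped labels:

    from itertools import chain

    import pyspark.sql.functions as f
    from pyspark.sql import Column

    def map_column_by_dict(column: Column, mapping: dict) -> Column:
        # Flatten {k1: v1, k2: v2} into [lit(k1), lit(v1), lit(k2), lit(v2)]
        # for f.create_map; element_at on a missing key returns NULL.
        literal_map = f.create_map(*[f.lit(x) for x in chain(*mapping.items())])
        return f.element_at(literal_map, column)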

@staticmethod
@enforce_schema(IN_SILICO_PREDICTOR_SCHEMA)
def _get_max_alpha_missense(transcripts: Column) -> Column:
@@ -587,16 +610,6 @@ def process_vep_output(
        Returns:
            DataFrame: processed data in the right shape.
        """
-       # Consequence to sequence ontology map:
-       sequence_ontology_map = {
-           item["label"]: item["id"]
-           for item in VariantIndexConfig.consequence_to_pathogenicity_score
-       }
-       # Sequence ontology to score map:
-       label_to_score_map = {
-           item["label"]: item["score"]
-           for item in VariantIndexConfig.consequence_to_pathogenicity_score
-       }
        # Processing VEP output:
        return (
            vep_output
@@ -659,6 +672,8 @@ def process_vep_output(
                        cls._get_max_alpha_missense(
                            f.col("transcript_consequences")
                        ),
+                       # Extract VEP prediction:
+                       cls._get_vep_prediction(f.col("most_severe_consequence")),
                    ),
                    lambda predictor: predictor.isNotNull(),
                ),
@@ -671,12 +686,14 @@ def process_vep_output(
                        method_name="phred scaled CADD",
                        score_column_name="cadd_phred",
                    ),
+                   # Extract VEP prediction:
+                   cls._get_vep_prediction(f.col("most_severe_consequence")),
                )
            )
            .alias("inSilicoPredictors"),
            # Convert consequence to SO:
            map_column_by_dictionary(
-               f.col("most_severe_consequence"), sequence_ontology_map
+               f.col("most_severe_consequence"), cls.SEQUENCE_ONTOLOGY_MAP
            ).alias("mostSevereConsequenceId"),
            # Propagate most severe consequence:
            "most_severe_consequence",
@@ -701,15 +718,15 @@ def process_vep_output(
                            f.transform(
                                transcript.consequence_terms,
                                lambda y: map_column_by_dictionary(
-                                   y, sequence_ontology_map
+                                   y, cls.SEQUENCE_ONTOLOGY_MAP
                                ),
                            ).alias("variantFunctionalConsequenceIds"),
                            # Convert consequence terms to consequence score:
                            f.array_max(
                                f.transform(
                                    transcript.consequence_terms,
                                    lambda term: map_column_by_dictionary(
-                                       term, label_to_score_map
+                                       term, cls.LABEL_TO_SCORE_MAP
                                    ),
                                )
                            )
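The array_max over transform pattern above collapses each transcript's consequence terms to its single most severe score. The same idea in isolation, with toy scores standing in for the real config:

    import pyspark.sql.functions as f
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()
    df = spark.createDataFrame(
        [(["missense_variant", "splice_region_variant"],)],
        ["consequence_terms"],
    )
    toy_scores = f.create_map(
        f.lit("missense_variant"), f.lit(0.66),
        f.lit("splice_region_variant"), f.lit(0.5),
    )
    df.select(
        f.array_max(
            f.transform("consequence_terms", lambda term: f.element_at(toy_scores, term))
        ).alias("consequenceScore")
    ).show()  # 0.66: the highest score among the mapped terms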
