feat: adding VEP predictor
DSuveges committed Nov 15, 2024 · 1 parent 7654a88 · commit 6f2fd97
Showing 2 changed files with 68 additions and 62 deletions.
87 changes: 38 additions & 49 deletions src/gentropy/dataset/variant_index.py
@@ -303,9 +303,12 @@ def get_loftee(self: VariantIndex) -> DataFrame:
class InSilicoPredictorNormaliser:
    """Class to normalise in silico predictor assessments.
-   Essentially based on the raw scores, it normalises the scores to a range between 0 and 1, and appends the normalised
+   Essentially based on the raw scores, it normalises the scores to a range between -1 and 1, and appends the normalised
    value to the in silico predictor struct.
+   More negative values indicate an increasingly confident prediction of a benign variant,
+   while more positive values indicate an increasingly deleterious predicted effect.
+   The point of these operations is to make the scores comparable across different in silico predictors.
    """

@@ -363,6 +366,7 @@ def resolve_predictor_methods(
            .when(method == "phred scaled CADD", cls._normalise_cadd(score))
            .when(method == "SpliceAI", cls._normalise_splice_ai(score))
            .when(method == "Pangolin", cls._normalise_pangolin(score))
+           .when(method == "VEP", score)
        )
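For context (not part of the diff): a Spark when-chain with no trailing otherwise evaluates to NULL for rows that match no branch, so the new branch is what lets VEP scores, presumably already on the target scale, pass through unchanged instead of being dropped. A minimal sketch of the dispatch pattern on a toy DataFrame:

    from pyspark.sql import SparkSession
    import pyspark.sql.functions as f

    spark = SparkSession.builder.getOrCreate()
    toy = spark.createDataFrame(
        [("VEP", 0.66), ("Pangolin", -0.2), ("unsupported", 0.9)],
        ["method", "score"],
    )
    normalised = (
        f.when(f.col("method") == "VEP", f.col("score"))  # pass-through branch
        .when(f.col("method") == "Pangolin", f.abs(f.col("score")))  # stand-in normaliser
    )
    toy.withColumn("normalisedScore", normalised).show()
    # "unsupported" matches no branch and falls through to NULL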

@staticmethod
@@ -397,10 +401,10 @@ def _normalise_cadd(
        """Normalise CADD scores.
        Logic: CADD scores are divided into four ranges and scaled accordingly:
-       - 0-10 -> 0-0.025 (low impact)
-       - 10-20 -> 0.025-0.5 (weak impact)
-       - 20-30 -> 0.5-0.75 (moderate impact)
-       - 30-81 -> 0.75-1 (high impact)
+       - 0-10 -> -1 to 0 (likely benign, ~2M variants)
+       - 10-20 -> 0 to 0.5 (potentially deleterious, ~300k)
+       - 20-30 -> 0.5 to 0.75 (likely deleterious, ~350k)
+       - 30-81 -> 0.75 to 1 (highly likely deleterious, ~86k)
        Args:
            score (Column): CADD score.
@@ -409,15 +413,9 @@ def _normalise_cadd(
            Column: Normalised CADD score.
        """
        return (
-           f.when(score <= 10, cls._rescaleColumnValue(score, 0, 10, 0, 0.25))
-           .when(
-               (score > 10) & (score <= 20),
-               cls._rescaleColumnValue(score, 10, 20, 0.25, 0.5),
-           )
-           .when(
-               (score > 20) & (score <= 30),
-               cls._rescaleColumnValue(score, 20, 30, 0.5, 0.75),
-           )
+           f.when(score <= 10, cls._rescaleColumnValue(score, 0, 10, -1.0, 0.0))
+           .when(score <= 20, cls._rescaleColumnValue(score, 10, 20, 0.0, 0.5))
+           .when(score <= 30, cls._rescaleColumnValue(score, 20, 30, 0.5, 0.75))
            .when(score > 30, cls._rescaleColumnValue(score, 30, 81, 0.75, 1))
        )
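Every branch above delegates to cls._rescaleColumnValue, a helper outside this diff. Assuming it is a plain linear min-max rescale (an assumption; the project's actual helper may differ in signature or edge-case handling), it would look roughly like:

    from pyspark.sql import Column

    def rescale_column_value(
        column: Column,
        min_value: float,
        max_value: float,
        minimum: float,
        maximum: float,
    ) -> Column:
        # Linearly map [min_value, max_value] onto [minimum, maximum].
        return (column - min_value) / (max_value - min_value) * (maximum - minimum) + minimum

Under that assumption, a raw CADD of 15 lands at (15 - 10) / 10 * 0.5 + 0.0 = 0.25, in the middle of the new "potentially deleterious" band.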

@@ -429,8 +427,8 @@ def _normalise_loftee(
        """Normalise LOFTEE scores.
        Logic: LOFTEE scores are divided into two categories:
-       - HC (high confidence): 1.0
-       - LC (low confidence): 0.85
+       - HC (high confidence): 1.0 (~120k)
+       - LC (low confidence): 0.85 (~18k)
        The normalised score is calculated based on the category the score falls into.
        Args:
@@ -468,22 +466,22 @@ def _normalise_sift(
            f.when(
                (1 - f.round(score.cast(t.DoubleType()), 2) >= 0.95)
                & (assessment == "deleterious"),
-               cls._rescaleColumnValue(1 - score, 0.95, 1, 0.75, 1),
+               cls._rescaleColumnValue(1 - score, 0.95, 1, 0.5, 1),
            )
            .when(
                (1 - f.round(score.cast(t.DoubleType()), 2) >= 0.95)
                & (assessment == "deleterious_low_confidence"),
-               cls._rescaleColumnValue(1 - score, 0.95, 1, 0.5, 0.75),
+               cls._rescaleColumnValue(1 - score, 0.95, 1, 0, 0.5),
            )
            .when(
                (1 - f.round(score.cast(t.DoubleType()), 2) <= 0.95)
                & (assessment == "tolerated_low_confidence"),
-               cls._rescaleColumnValue(1 - score, 0, 0.95, 0.25, 0.5),
+               cls._rescaleColumnValue(1 - score, 0, 0.95, -0.5, 0.0),
            )
            .when(
                (1 - f.round(score.cast(t.DoubleType()), 2) <= 0.95)
                & (assessment == "tolerated"),
-               cls._rescaleColumnValue(1 - score, 0, 0.95, 0, 0.25),
+               cls._rescaleColumnValue(1 - score, 0, 0.95, -1, -0.5),
            )
        )
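A worked example of the SIFT mapping under the same linear-rescale assumption (plain Python mirroring the Column arithmetic above):

    # SIFT reports 0 (deleterious) to 1 (tolerated); the normaliser inverts it.
    sift_score = 0.02                # strongly deleterious raw SIFT value
    inverted = 1 - sift_score        # 0.98, above the 0.95 confidence cut-off
    # The "deleterious" branch rescales [0.95, 1] onto [0.5, 1]:
    normalised = (inverted - 0.95) / (1 - 0.95) * (1 - 0.5) + 0.5
    assert round(normalised, 2) == 0.8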

@@ -496,9 +494,9 @@ def _normalise_polyphen(
        """Normalise PolyPhen scores.
        Logic: PolyPhen scores are divided into three categories:
-       - benign: 0-0.446: 0-0.25
-       - possibly_damaging: 0.446-0.908: 0.25-0.75
-       - probably_damaging: 0.908-1: 0.75-1
+       - benign: 0-0.446: -1 to -0.25
+       - possibly_damaging: 0.446-0.908: -0.25 to 0.25
+       - probably_damaging: 0.908-1: 0.25 to 1
        - if assessment is unknown: None
        Args:
@@ -510,15 +508,12 @@ def _normalise_polyphen(
        """
        return (
            f.when(assessment == "unknown", f.lit(None).cast(t.DoubleType()))
-           .when(
-               score <= 0.446,
-               cls._rescaleColumnValue(score, 0, 0.446, 0, 0.25),
-           )
+           .when(score <= 0.446, cls._rescaleColumnValue(score, 0, 0.446, -1.0, -0.25))
            .when(
                score <= 0.908,
-               cls._rescaleColumnValue(score, 0.446, 0.908, 0.25, 0.75),
+               cls._rescaleColumnValue(score, 0.446, 0.908, -0.25, 0.25),
            )
-           .when(score > 0.908, cls._rescaleColumnValue(score, 0.908, 1, 0.75, 1))
+           .when(score > 0.908, cls._rescaleColumnValue(score, 0.908, 1.0, 0.25, 1.0))
        )
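One detail worth noting in the branch above (context, not part of the change): the unknown assessment returns a typed NULL, because a bare f.lit(None) carries NullType and would not match the DoubleType produced by the numeric branches. The pattern in isolation:

    import pyspark.sql.functions as f
    import pyspark.sql.types as t

    # NULL literal with an explicit DoubleType, keeping the when-chain's
    # output type consistent across all branches:
    typed_null = f.lit(None).cast(t.DoubleType())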

@classmethod
@@ -529,9 +524,9 @@ def _normalise_alpha_missense(
        """Normalise AlphaMissense scores.
        Logic: AlphaMissense scores are divided into three categories:
-       - 0-0.06: 0-0.5
-       - 0.06-0.77: 0.25-0.75
-       - 0.77-1: 0.75-1
+       - 0-0.06: -1.0 to -0.25
+       - 0.06-0.77: -0.25 to 0.25
+       - 0.77-1: 0.25 to 1
        Args:
            score (Column): AlphaMissense score.
@@ -540,12 +535,9 @@ def _normalise_alpha_missense(
            Column: Normalised AlphaMissense score.
        """
        return (
-           f.when(score < 0.06, cls._rescaleColumnValue(score, 0, 0.06, 0, 0.5))
-           .when(
-               (score >= 0.06) & (score < 0.77),
-               cls._rescaleColumnValue(score, 0.06, 0.77, 0.25, 0.75),
-           )
-           .when(score >= 0.77, cls._rescaleColumnValue(score, 0.77, 1, 0.75, 1))
+           f.when(score < 0.06, cls._rescaleColumnValue(score, 0, 0.06, -1.0, -0.25))
+           .when(score < 0.77, cls._rescaleColumnValue(score, 0.06, 0.77, -0.25, 0.25))
+           .when(score >= 0.77, cls._rescaleColumnValue(score, 0.77, 1, 0.25, 1))
        )

@classmethod
@@ -556,9 +548,9 @@ def _normalise_splice_ai(
        """Normalise SpliceAI scores.
        Logic: SpliceAI scores are divided into three categories:
-       - 0-0.2: 0-0.25
-       - 0.2-0.5: 0.5-0.75
-       - 0.5-1: 0.75-1
+       - 0-0.2: -1.0 to -0.25
+       - 0.2-0.5: -0.25 to 0.25
+       - 0.5-1: 0.25 to 1
        Args:
            score (Column): SpliceAI score.
@@ -567,12 +559,9 @@ def _normalise_splice_ai(
            Column: Normalised SpliceAI score.
        """
        return (
-           f.when(score <= 0.2, cls._rescaleColumnValue(score, 0, 0.2, 0.25, 0.5))
-           .when(
-               score <= 0.5,
-               cls._rescaleColumnValue(score, 0.2, 0.5, 0.5, 0.75),
-           )
-           .when(score > 0.5, cls._rescaleColumnValue(score, 0.5, 1, 0.75, 1))
+           f.when(score <= 0.2, cls._rescaleColumnValue(score, 0, 0.2, -1, -0.25))
+           .when(score <= 0.5, cls._rescaleColumnValue(score, 0.2, 0.5, -0.25, 0.25))
+           .when(score > 0.5, cls._rescaleColumnValue(score, 0.5, 1, 0.25, 1))
        )

@classmethod
@@ -593,8 +582,8 @@ def _normalise_pangolin(
            Column: Normalised Pangolin score.
        """
        return f.when(
-           f.abs(score) > 0.14, cls._rescaleColumnValue(f.abs(score), 0.14, 1, 0.75, 1)
+           f.abs(score) > 0.14, cls._rescaleColumnValue(f.abs(score), 0.14, 1, 0.0, 1)
        ).when(
            f.abs(score) <= 0.14,
-           cls._rescaleColumnValue(f.abs(score), 0, 0.14, 0.25, 0.75),
+           cls._rescaleColumnValue(f.abs(score), 0, 0.14, -1, 0.0),
        )
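Pangolin emits signed splice gain/loss deltas, which is why the normaliser works on the absolute value. A worked check under the linear-rescale assumption:

    pangolin_score = -0.5            # splice-loss delta; the sign is dropped
    magnitude = abs(pangolin_score)  # 0.5, above the 0.14 threshold
    # The > 0.14 branch rescales [0.14, 1] onto [0.0, 1]:
    normalised = (magnitude - 0.14) / (1 - 0.14) * (1 - 0.0) + 0.0
    assert round(normalised, 2) == 0.42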
43 changes: 30 additions & 13 deletions src/gentropy/datasource/ensembl/vep_parser.py
@@ -41,6 +41,18 @@ class VariantEffectPredictorParser:
    # Schema for the allele frequency column:
    ALLELE_FREQUENCY_SCHEMA = VariantIndex.get_schema()["alleleFrequencies"].dataType

+   # Consequence label to sequence ontology identifier map:
+   SEQUENCE_ONTOLOGY_MAP = {
+       item["label"]: item["id"]
+       for item in VariantIndexConfig.consequence_to_pathogenicity_score
+   }
+
+   # Consequence label to pathogenicity score map:
+   LABEL_TO_SCORE_MAP = {
+       item["label"]: item["score"]
+       for item in VariantIndexConfig.consequence_to_pathogenicity_score
+   }
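For orientation: both class-level maps are built from the same config list, whose entries couple a consequence label with its Sequence Ontology identifier and a pathogenicity score. An illustrative sketch of the shape (the labels and SO identifiers are real ontology terms, but the scores here are placeholders; consult VariantIndexConfig for the actual values):

    consequence_to_pathogenicity_score = [
        {"id": "SO_0001587", "label": "stop_gained", "score": 1.0},
        {"id": "SO_0001583", "label": "missense_variant", "score": 0.66},
        {"id": "SO_0001819", "label": "synonymous_variant", "score": 0.05},
    ]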

@staticmethod
def get_schema() -> t.StructType:
"""Return the schema of the VEP output.
@@ -328,6 +340,17 @@ def _get_most_severe_transcript(
            lambda transcript: transcript.getItem(score_field_name).isNotNull(),
        )[0]

+   @classmethod
+   @enforce_schema(IN_SILICO_PREDICTOR_SCHEMA)
+   def _get_vep_prediction(cls, most_severe_consequence: Column) -> Column:
+       """Build the VEP in silico predictor struct from the most severe consequence.
+
+       Args:
+           most_severe_consequence (Column): most severe consequence label.
+
+       Returns:
+           Column: struct with method name, assessment and pathogenicity score.
+       """
+       return f.struct(
+           f.lit("VEP").alias("method"),
+           most_severe_consequence.alias("assessment"),
+           map_column_by_dictionary(
+               most_severe_consequence, cls.LABEL_TO_SCORE_MAP
+           ).alias("score"),
+       )
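map_column_by_dictionary is a gentropy utility that sits outside this diff. A minimal sketch of what such a helper presumably does (an assumption, not the project's actual implementation): build a literal Spark map and look the column up in it, yielding NULL for unmapped labels:

    from itertools import chain

    import pyspark.sql.functions as f
    from pyspark.sql import Column

    def map_column_by_dict(column: Column, mapping: dict) -> Column:
        # Flatten {k1: v1, k2: v2} into [lit(k1), lit(v1), lit(k2), lit(v2)]
        # for f.create_map; element_at on a missing key returns NULL.
        literal_map = f.create_map(*[f.lit(x) for x in chain(*mapping.items())])
        return f.element_at(literal_map, column)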

@staticmethod
@enforce_schema(IN_SILICO_PREDICTOR_SCHEMA)
def _get_max_alpha_missense(transcripts: Column) -> Column:
@@ -587,16 +610,6 @@ def process_vep_output(
        Returns:
            DataFrame: processed data in the right shape.
        """
-       # Consequence to sequence ontology map:
-       sequence_ontology_map = {
-           item["label"]: item["id"]
-           for item in VariantIndexConfig.consequence_to_pathogenicity_score
-       }
-       # Sequence ontology to score map:
-       label_to_score_map = {
-           item["label"]: item["score"]
-           for item in VariantIndexConfig.consequence_to_pathogenicity_score
-       }
        # Processing VEP output:
        return (
            vep_output
@@ -659,6 +672,8 @@ def process_vep_output(
                        cls._get_max_alpha_missense(
                            f.col("transcript_consequences")
                        ),
+                       # Extract VEP prediction:
+                       cls._get_vep_prediction(f.col("most_severe_consequence")),
                    ),
                    lambda predictor: predictor.isNotNull(),
                ),
@@ -671,12 +686,14 @@ def process_vep_output(
                        method_name="phred scaled CADD",
                        score_column_name="cadd_phred",
                    ),
+                   # Extract VEP prediction:
+                   cls._get_vep_prediction(f.col("most_severe_consequence")),
                )
            )
            .alias("inSilicoPredictors"),
            # Convert consequence to SO:
            map_column_by_dictionary(
-               f.col("most_severe_consequence"), sequence_ontology_map
+               f.col("most_severe_consequence"), cls.SEQUENCE_ONTOLOGY_MAP
            ).alias("mostSevereConsequenceId"),
            # Propagate most severe consequence:
            "most_severe_consequence",
@@ -701,15 +718,15 @@ def process_vep_output(
                            f.transform(
                                transcript.consequence_terms,
                                lambda y: map_column_by_dictionary(
-                                   y, sequence_ontology_map
+                                   y, cls.SEQUENCE_ONTOLOGY_MAP
                                ),
                            ).alias("variantFunctionalConsequenceIds"),
                            # Convert consequence terms to consequence score:
                            f.array_max(
                                f.transform(
                                    transcript.consequence_terms,
                                    lambda term: map_column_by_dictionary(
-                                       term, label_to_score_map
+                                       term, cls.LABEL_TO_SCORE_MAP
                                    ),
                                )
                            )
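The array_max over transform pattern above collapses each transcript's consequence terms to its single most severe score. The same idea in isolation, with toy scores standing in for the real config:

    import pyspark.sql.functions as f
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()
    df = spark.createDataFrame(
        [(["missense_variant", "splice_region_variant"],)],
        ["consequence_terms"],
    )
    toy_scores = f.create_map(
        f.lit("missense_variant"), f.lit(0.66),
        f.lit("splice_region_variant"), f.lit(0.5),
    )
    df.select(
        f.array_max(
            f.transform("consequence_terms", lambda term: f.element_at(toy_scores, term))
        ).alias("consequenceScore")
    ).show()  # 0.66: the highest score among the mapped terms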
