opentargets · ireneisdoomed · Oct 1, 2024 · Sep 3, 2024 · Sep 3, 2024 · Sep 3, 2024
diff --git a/docs/howto/command_line/run_step_in_cli.md b/docs/howto/command_line/run_step_in_cli.md
@@ -24,7 +24,6 @@ Available options:
         ukbiobank
         variant_annotation
         variant_index
-        variant_to_gene
 
 Set the environment variable HYDRA_FULL_ERROR=1 for a complete stack trace.
 ```

diff --git a/docs/python_api/datasets/l2g_feature.md b/docs/python_api/datasets/l2g_feature.md
diff --git a/docs/python_api/datasets/l2g_features/_l2g_feature.md b/docs/python_api/datasets/l2g_features/_l2g_feature.md
@@ -0,0 +1,11 @@
+---
+title: L2G Feature
+---
+
+## Abstract Class
+
+::: gentropy.dataset.l2g_features.l2g_feature.L2GFeature
+
+## Schema
+
+--8<-- "assets/schemas/l2g_feature.md"
diff --git a/docs/python_api/datasets/l2g_features/colocalisation.md b/docs/python_api/datasets/l2g_features/colocalisation.md
@@ -0,0 +1,27 @@
+---
+title: From colocalisation
+---
+
+## List of features
+
+::: gentropy.dataset.l2g_features.colocalisation.EQtlColocClppMaximumFeature
+::: gentropy.dataset.l2g_features.colocalisation.PQtlColocClppMaximumFeature
+::: gentropy.dataset.l2g_features.colocalisation.SQtlColocClppMaximumFeature
+::: gentropy.dataset.l2g_features.colocalisation.TuQtlColocClppMaximumFeature
+::: gentropy.dataset.l2g_features.colocalisation.EQtlColocH4MaximumFeature
+::: gentropy.dataset.l2g_features.colocalisation.PQtlColocH4MaximumFeature
+::: gentropy.dataset.l2g_features.colocalisation.SQtlColocH4MaximumFeature
+::: gentropy.dataset.l2g_features.colocalisation.TuQtlColocH4MaximumFeature
+::: gentropy.dataset.l2g_features.colocalisation.EQtlColocClppMaximumNeighbourhoodFeature
+::: gentropy.dataset.l2g_features.colocalisation.PQtlColocClppMaximumNeighbourhoodFeature
+::: gentropy.dataset.l2g_features.colocalisation.SQtlColocClppMaximumNeighbourhoodFeature
+::: gentropy.dataset.l2g_features.colocalisation.TuQtlColocClppMaximumNeighbourhoodFeature
+::: gentropy.dataset.l2g_features.colocalisation.EQtlColocH4MaximumNeighbourhoodFeature
+::: gentropy.dataset.l2g_features.colocalisation.PQtlColocH4MaximumNeighbourhoodFeature
+::: gentropy.dataset.l2g_features.colocalisation.SQtlColocH4MaximumNeighbourhoodFeature
+::: gentropy.dataset.l2g_features.colocalisation.TuQtlColocH4MaximumNeighbourhoodFeature
+
+## Common logic
+
+::: gentropy.dataset.l2g_features.colocalisation.common_colocalisation_feature_logic
+::: gentropy.dataset.l2g_features.colocalisation.common_neighbourhood_colocalisation_feature_logic
diff --git a/docs/python_api/datasets/l2g_features/distance.md b/docs/python_api/datasets/l2g_features/distance.md
@@ -0,0 +1,19 @@
+---
+title: From distance
+---
+
+## List of features
+
+::: gentropy.dataset.l2g_features.distance.DistanceSentinelTssFeature
+::: gentropy.dataset.l2g_features.distance.DistanceSentinelTssNeighbourhoodFeature
+::: gentropy.dataset.l2g_features.distance.DistanceTssMeanFeature
+::: gentropy.dataset.l2g_features.distance.DistanceTssMeanNeighbourhoodFeature
+::: gentropy.dataset.l2g_features.distance.DistanceSentinelFootprintFeature
+::: gentropy.dataset.l2g_features.distance.DistanceSentinelFootprintNeighbourhoodFeature
+::: gentropy.dataset.l2g_features.distance.DistanceFootprintMeanFeature
+::: gentropy.dataset.l2g_features.distance.DistanceFootprintMeanNeighbourhoodFeature
+
+## Common logic
+
+::: gentropy.dataset.l2g_features.distance.common_distance_feature_logic
+::: gentropy.dataset.l2g_features.distance.common_neighbourhood_distance_feature_logic
diff --git a/docs/python_api/datasets/variant_to_gene.md b/docs/python_api/datasets/variant_to_gene.md
diff --git a/docs/python_api/methods/l2g/_l2g.md b/docs/python_api/methods/l2g/_l2g.md
@@ -9,13 +9,10 @@ The **“locus-to-gene” (L2G)** model derives features to prioritize likely ca
 - **Chromatin Interaction:** (e.g., promoter-capture Hi-C)
 - **Variant Pathogenicity:** (from VEP)
 
-The L2G model is distinct from the variant-to-gene (V2G) pipeline in that it:
-
-- Uses a machine-learning model to learn the weights of each evidence source based on a gold standard of previously identified causal genes.
-- Relies upon fine-mapping and colocalization data.
-
 Some of the predictive features weight variant-to-gene (or genomic region-to-gene) evidence based on the posterior probability that the variant is causal, determined through fine-mapping of the GWAS association.
 
+For a more detailed description of how each feature is computed, see [the L2G Feature documentation](../../datasets/l2g_features/_l2g_feature.md).
+
 Details of the L2G model are provided in our Nature Genetics publication (ref - [Nature Genetics Publication](https://www.nature.com/articles/s41588-021-00945-5)):
 
 - **Title:** An open approach to systematically prioritize causal variants and genes at all published human GWAS trait-associated loci.

diff --git a/docs/python_api/steps/variant_to_gene_step.md b/docs/python_api/steps/variant_to_gene_step.md
diff --git a/notebooks/Release_QC_metrics.ipynb b/notebooks/Release_QC_metrics.ipynb
@@ -13,21 +13,17 @@
     "1. Import necessary modules and set up the release path and version.\n",
     "2. Load and analyze the variant index data:\n",
     "   - Count the number of unique variants.\n",
-    "3. Load and analyze the variant-to-gene (v2g) data:\n",
-    "   - Count the number of unique variants and total variant-to-gene assignments.\n",
-    "   - Count the number of v2g assignments where the score is > 0.8.\n",
-    "   - Plot a histogram/density plot for the \"score\" column.\n",
-    "4. Load and analyze the study index data for different data sources (FinnGen, GWASCat, eQTLcat):\n",
+    "3. Load and analyze the study index data for different data sources (FinnGen, GWASCat, eQTLcat):\n",
     "   - Count the number of unique studies for each data source.\n",
-    "5. Analyze the credible sets for each datasource (Finngen, gwascat, eqtlcat):\n",
+    "4. Analyze the credible sets for each data source (FinnGen, GWASCat, eQTLcat):\n",
     "   - Analyze the credible sets:\n",
     "     - Count the number of unique credible sets and unique study IDs.\n",
     "     - Plot a scatter plot of the credible set size vs. the top posterior probability.\n",
     "     - Count the number of credible sets with a top SNP posterior probability > 0.9..\n",
-    "6. Analyze colocalization data:\n",
+    "5. Analyze colocalization data:\n",
     "   - Count the total number of colocalizations and the number with clpp > 0.8.\n",
     "   - Calculate the average number of overlaps per credible set.\n",
-    "7. Analyze locus-to-gene (L2G) predictions:\n",
+    "6. Analyze locus-to-gene (L2G) predictions:\n",
     "   - Load the locus-to-gene predictions data.\n",
     "   - How many Studylocus contains a \"good\" l2g prediction? (l2g_score > 0.5)\n",
     "   - How does l2g perform based on different datasource inputs? (impossible to tell)\n",
@@ -126,79 +122,6 @@
     "#variant_index.filter(variant_index[\"alleleFrequencies.populationName\"] > 0.05).show(10, False)"
    ]
   },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "\n",
-    "#### 3. Load and analyze the variant-to-gene (v2g) data:\n",
-    "   - Count the number of unique variants and total variant-to-gene assignments.\n",
-    "   - Count the number of v2g assignments where the score is > 0.8."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                                                                \r"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Unique variants in v2g release:  5090991 , total variant to gene assignments:  105771851 , number of v2g assignments where score > 0.8:  23176515 ( 4.552 %)\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "                                                                                \r"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Summary of v2g_score: Mean:  0.5909395615801637 L.quart:  0.29 Median:  0.62 U.quart:  0.94\n"
-     ]
-    }
-   ],
-   "source": [
-    "#v2g_path='gs://genetics_etl_python_playground/releases/24.03/variant_to_gene'\n",
-    "v2g_path=f\"{release_path}/{release_ver}/variant_to_gene\"\n",
-    "v2g=session.spark.read.parquet(v2g_path, recursiveFileLookup=True)\n",
-    "\n",
-    "#How many variants?\n",
-    "sample_size_quartiles = v2g.stat.approxQuantile(\"score\", [0.25, 0.5, 0.75], 0.01)\n",
-    "#v2g.select().toPandas().plot.hist()\n",
-    "#v2g.show()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "   - Plot a histogram/density plot for the \"score\" column."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "#The histogram/density plot for “score”\n",
-    "# Out of mem error:\n",
-    "#v2g.select(f.col(\"score\")).toPandas().plot.hist(bins=10, alpha=0.5, label=\"v2g scores\")"
-   ]
-  },
   {
    "cell_type": "markdown",
    "metadata": {},

diff --git a/src/gentropy/assets/schemas/v2g.json b/src/gentropy/assets/schemas/v2g.json
diff --git a/src/gentropy/common/spark_helpers.py b/src/gentropy/common/spark_helpers.py
@@ -6,7 +6,7 @@
 import sys
 from functools import reduce, wraps
 from itertools import chain
-from typing import TYPE_CHECKING, Any, Callable, Dict, Iterable, Optional, TypeVar
+from typing import TYPE_CHECKING, Any, Callable, Iterable, Optional, TypeVar
 
 import pyspark.sql.functions as f
 import pyspark.sql.types as t
@@ -447,14 +447,14 @@ def order_array_of_structs_by_two_fields(
     )
 
 
-def map_column_by_dictionary(col: Column, mapping_dict: Dict[str, str]) -> Column:
+def map_column_by_dictionary(col: Column, mapping_dict: dict[str, str]) -> Column:
     """Map column values to dictionary values by key.
 
     Missing consequence label will be converted to None, unmapped consequences will be mapped as None.
 
     Args:
         col (Column): Column containing labels to map.
-        mapping_dict (Dict[str, str]): Dictionary with mapping key/value pairs.
+        mapping_dict (dict[str, str]): Dictionary with mapping key/value pairs.
 
     Returns:
         Column: Column with mapped values.