feat: extract pos and chromosome from variantid (opentargets#909)

Co-authored-by: Szymon Szyszkowski <[email protected]>
thehyve · Nov 11, 2024 · 10b4be0 · 10b4be0
1 parent bb609cb
commit 10b4be0
Showing 1 changed file with 56 additions and 0 deletions.
diff --git a/src/gentropy/common/utils.py b/src/gentropy/common/utils.py
@@ -315,3 +315,59 @@ def copy_to_gcs(source_path: str, destination_blob: str) -> None:
     bucket = client.bucket(bucket_name=urlparse(destination_blob).hostname)
     blob = bucket.blob(blob_name=urlparse(destination_blob).path.lstrip("/"))
     blob.upload_from_filename(source_path)
+
+
+def extract_chromosome(variant_id: Column) -> Column:
+    """Extract chromosome from variant ID.
+
+    This function extracts the chromosome from a variant ID. The variantId is expected to be in the format `chromosome_position_ref_alt`.
+    The function does not convert the GENCODE to Ensembl chromosome notation.
+    See https://genome.ucsc.edu/FAQ/FAQgenes.html#:~:text=maps%20only%20once.-,The%20differences,-Some%20of%20our
+
+    Args:
+        variant_id (Column): Variant ID
+
+    Returns:
+        Column: Chromosome
+
+    Examples:
+        >>> d = [("chr1_12345_A_T",),("15_KI270850v1_alt_48777_C_T",),]
+        >>> df = spark.createDataFrame(d).toDF("variantId")
+        >>> df.withColumn("chromosome", extract_chromosome(f.col("variantId"))).show(truncate=False)
+        +---------------------------+-----------------+
+        |variantId                  |chromosome       |
+        +---------------------------+-----------------+
+        |chr1_12345_A_T             |chr1             |
+        |15_KI270850v1_alt_48777_C_T|15_KI270850v1_alt|
+        +---------------------------+-----------------+
+        <BLANKLINE>
+
+    """
+    return f.regexp_extract(variant_id, r"^(.*)_\d+_.*$", 1)
+
+
+def extract_position(variant_id: Column) -> Column:
+    """Extract position from variant ID.
+
+    This function extracts the position from a variant ID. The variantId is expected to be in the format `chromosome_position_ref_alt`.
+
+    Args:
+        variant_id (Column): Variant ID
+
+    Returns:
+        Column: Position
+
+    Examples:
+        >>> d = [("chr1_12345_A_T",),("15_KI270850v1_alt_48777_C_T",),]
+        >>> df = spark.createDataFrame(d).toDF("variantId")
+        >>> df.withColumn("position", extract_position(f.col("variantId"))).show(truncate=False)
+        +---------------------------+--------+
+        |variantId                  |position|
+        +---------------------------+--------+
+        |chr1_12345_A_T             |12345   |
+        |15_KI270850v1_alt_48777_C_T|48777   |
+        +---------------------------+--------+
+        <BLANKLINE>
+
+    """
+    return f.regexp_extract(variant_id, r"^.*_(\d+)_.*$", 1)