Skip to content

Commit

Permalink
feat: extract pos and chromosome from variantid (opentargets#909)
Browse files Browse the repository at this point in the history
Co-authored-by: Szymon Szyszkowski <[email protected]>
  • Loading branch information
project-defiant and Szymon Szyszkowski authored Nov 11, 2024
1 parent bb609cb commit 10b4be0
Showing 1 changed file with 56 additions and 0 deletions.
56 changes: 56 additions & 0 deletions src/gentropy/common/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -315,3 +315,59 @@ def copy_to_gcs(source_path: str, destination_blob: str) -> None:
bucket = client.bucket(bucket_name=urlparse(destination_blob).hostname)
blob = bucket.blob(blob_name=urlparse(destination_blob).path.lstrip("/"))
blob.upload_from_filename(source_path)


def extract_chromosome(variant_id: Column) -> Column:
    """Pull the chromosome part out of a variant identifier column.

    Variant identifiers are expected to look like `chromosome_position_ref_alt`.
    The chromosome is taken to be everything that precedes the last
    `_<digits>_` segment, so contig names that themselves contain underscores
    (e.g. `15_KI270850v1_alt`) are returned whole.

    The chromosome label is returned exactly as written in the identifier; no
    conversion between GENCODE and Ensembl chromosome notation is applied.
    See https://genome.ucsc.edu/FAQ/FAQgenes.html#:~:text=maps%20only%20once.-,The%20differences,-Some%20of%20our

    Args:
        variant_id (Column): Variant ID

    Returns:
        Column: Chromosome

    Examples:
        >>> d = [("chr1_12345_A_T",),("15_KI270850v1_alt_48777_C_T",),]
        >>> df = spark.createDataFrame(d).toDF("variantId")
        >>> df.withColumn("chromosome", extract_chromosome(f.col("variantId"))).show(truncate=False)
        +---------------------------+-----------------+
        |variantId                  |chromosome       |
        +---------------------------+-----------------+
        |chr1_12345_A_T             |chr1             |
        |15_KI270850v1_alt_48777_C_T|15_KI270850v1_alt|
        +---------------------------+-----------------+
        <BLANKLINE>
    """
    # The leading group is greedy, so it swallows any underscores that belong
    # to the contig name and stops only at the final numeric (position) field.
    chromosome_pattern = r"^(.*)_\d+_.*$"
    capture_group = 1
    return f.regexp_extract(variant_id, chromosome_pattern, capture_group)


def extract_position(variant_id: Column) -> Column:
    """Pull the position part out of a variant identifier column.

    Variant identifiers are expected to look like `chromosome_position_ref_alt`.
    The position is the last run of digits that is enclosed by underscores, so
    digits embedded in the chromosome/contig name are not mistaken for it.
    The value is extracted by regular expression and is not cast to a numeric
    type here.

    Args:
        variant_id (Column): Variant ID

    Returns:
        Column: Position

    Examples:
        >>> d = [("chr1_12345_A_T",),("15_KI270850v1_alt_48777_C_T",),]
        >>> df = spark.createDataFrame(d).toDF("variantId")
        >>> df.withColumn("position", extract_position(f.col("variantId"))).show(truncate=False)
        +---------------------------+--------+
        |variantId                  |position|
        +---------------------------+--------+
        |chr1_12345_A_T             |12345   |
        |15_KI270850v1_alt_48777_C_T|48777   |
        +---------------------------+--------+
        <BLANKLINE>
    """
    # The greedy prefix consumes the chromosome (including any underscores in
    # it); the captured group is the all-digit field that follows.
    position_pattern = r"^.*_(\d+)_.*$"
    capture_group = 1
    return f.regexp_extract(variant_id, position_pattern, capture_group)

0 comments on commit 10b4be0

Please sign in to comment.