Skip to content

Commit

Permalink
feat: updated method for ld_index extraction
Browse files: browse the repository at this point in the history
  • Loading branch information
Daniel-Considine committed Apr 2, 2024
1 parent 170dd09 commit 33b6a51
Showing 1 changed file with 26 additions and 35 deletions.
61 changes: 26 additions & 35 deletions src/gentropy/datasource/gnomad/ld.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -12,13 +12,12 @@
from hail.linalg import BlockMatrix
from pyspark.sql import Window

from gentropy.common.session import Session
from gentropy.common.spark_helpers import get_top_ranked_in_window, get_value_from_row
from gentropy.common.utils import _liftover_loci, convert_gnomad_position_to_ensembl
from gentropy.dataset.ld_index import LDIndex

if TYPE_CHECKING:
from pyspark.sql import DataFrame
from pyspark.sql import DataFrame, Row


@dataclass
Expand All @@ -36,6 +35,7 @@ class GnomADLDMatrix:

ld_matrix_template: str = "gs://gcp-public-data--gnomad/release/2.1.1/ld/gnomad.genomes.r2.1.1.{POP}.common.adj.ld.bm"
ld_index_raw_template: str = "gs://gcp-public-data--gnomad/release/2.1.1/ld/gnomad.genomes.r2.1.1.{POP}.common.ld.variant_indices.ht"
liftover_ht_path: str = "gs://gcp-public-data--gnomad/release/2.1.1/liftover_grch38/ht/genomes/gnomad.genomes.r2.1.1.sites.liftover_grch38.ht"
grch37_to_grch38_chain_path: str = (
"gs://hail-common/references/grch37_to_grch38.over.chain.gz"
)
Expand Down Expand Up @@ -450,61 +450,52 @@ def get_ld_matrix_slice(
)
)

@staticmethod
def get_locus_index(
session: Session,
study_locus_row: DataFrame,
ld_index_path: str,
self: GnomADLDMatrix,
study_locus_row: Row,
window_size: int = 1_000_000,
major_population: str = "nfe",
) -> DataFrame:
"""Extract hail matrix index from StudyLocus rows.
Args:
session (Session): Spark session
study_locus_row (DataFrame): Study-locus row
ld_index_path (str): Path to the hail LD index parquet
study_locus_row (Row): Study-locus row
window_size (int): Window size to extract from gnomad matrix
major_population (str): Major population to extract from gnomad matrix, default is "nfe"
Returns:
DataFrame: Returns the index of the gnomad matrix for the locus
"""
_df = (
study_locus_row.withColumn("start", f.col("position") - (window_size / 2))
.withColumn("end", f.col("position") + (window_size / 2))
.alias("_df")
chromosome = str("chr" + study_locus_row["chromosome"])
start = study_locus_row["position"] - window_size // 2
end = study_locus_row["position"] + window_size // 2

liftover_ht = hl.read_table(self.liftover_ht_path)
liftover_ht = (
liftover_ht.filter(
(liftover_ht.locus.contig == chromosome)
& (liftover_ht.locus.position >= start)
& (liftover_ht.locus.position <= end)
)
.key_by()
.select("locus", "alleles", "original_locus")
.key_by("original_locus", "alleles")
.naive_coalesce(20)
)

_matrix_index = session.spark.read.parquet(
ld_index_path.format(POP=major_population)
hail_index = hl.read_table(
self.ld_index_raw_template.format(POP=major_population)
)

_index_joined = (
_df.alias("df")
.join(
_matrix_index.alias("matrix_index"),
(f.col("df.chromosome") == f.col("matrix_index.chromosome"))
& (f.col("df.start") <= f.col("matrix_index.position"))
& (f.col("df.end") >= f.col("matrix_index.position")),
)
.select(
"matrix_index.chromosome",
"matrix_index.position",
"referenceAllele",
"alternateAllele",
"idx",
)
.sort("idx")
)
joined_index = liftover_ht.join(hail_index, how="inner").to_spark().sort("idx")

return _index_joined
return joined_index

@staticmethod
def get_locus_matrix(
def get_numpy_matrix(
locus_index: DataFrame,
gnomad_ancestry: str,
gnomad_ancestry: str = "nfe",
) -> np.ndarray:
"""Extract the LD block matrix for a locus.
Expand Down

0 comments on commit 33b6a51

Please sign in to comment.