Skip to content

Commit

Permalink
Merge branch 'dev' into alegbe-biosample_index
Browse files Browse the repository at this point in the history
  • Loading branch information
DSuveges committed Sep 24, 2024
2 parents 73b25da + df45a6c commit c9eada2
Show file tree
Hide file tree
Showing 20 changed files with 214 additions and 116 deletions.
6 changes: 6 additions & 0 deletions src/gentropy/assets/schemas/colocalisation.json
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,12 @@
"type": "long",
"metadata": {}
},
{
"name": "rightStudyType",
"nullable": false,
"type": "string",
"metadata": {}
},
{
"name": "chromosome",
"nullable": false,
Expand Down
6 changes: 6 additions & 0 deletions src/gentropy/assets/schemas/study_locus.json
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,12 @@
"nullable": false,
"type": "long"
},
{
"metadata": {},
"name": "studyType",
"nullable": true,
"type": "string"
},
{
"metadata": {},
"name": "variantId",
Expand Down
6 changes: 6 additions & 0 deletions src/gentropy/assets/schemas/study_locus_overlap.json
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,12 @@
"nullable": false,
"type": "long"
},
{
"metadata": {},
"name": "rightStudyType",
"nullable": false,
"type": "string"
},
{
"metadata": {},
"name": "chromosome",
Expand Down
8 changes: 1 addition & 7 deletions src/gentropy/colocalisation.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
from pyspark.sql.functions import col

from gentropy.common.session import Session
from gentropy.dataset.study_index import StudyIndex
from gentropy.dataset.study_locus import CredibleInterval, StudyLocus
from gentropy.method.colocalisation import Coloc

Expand All @@ -23,7 +22,6 @@ def __init__(
self,
session: Session,
credible_set_path: str,
study_index_path: str,
coloc_path: str,
colocalisation_method: str,
) -> None:
Expand All @@ -32,7 +30,6 @@ def __init__(
Args:
session (Session): Session object.
credible_set_path (str): Input credible sets path.
study_index_path (str): Input study index path.
coloc_path (str): Output Colocalisation path.
colocalisation_method (str): Colocalisation method.
"""
Expand All @@ -47,14 +44,11 @@ def __init__(
session, credible_set_path, recursiveFileLookup=True
)
)
si = StudyIndex.from_parquet(
session, study_index_path, recursiveFileLookup=True
)

# Transform
overlaps = credible_set.filter_credible_set(
CredibleInterval.IS95
).find_overlaps(si)
).find_overlaps()
colocalisation_results = colocalisation_class.colocalise(overlaps) # type: ignore

# Load
Expand Down
2 changes: 1 addition & 1 deletion src/gentropy/dataset/colocalisation.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ def extract_maximum_coloc_probability_per_region_and_gene(
self.append_study_metadata(
study_locus,
study_index,
metadata_cols=["studyType", "geneId"],
metadata_cols=["geneId"],
colocalisation_side="right",
)
# it also filters based on method and qtl type
Expand Down
59 changes: 45 additions & 14 deletions src/gentropy/dataset/study_locus.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
order_array_of_structs_by_field,
)
from gentropy.common.utils import get_logsum
from gentropy.config import WindowBasedClumpingStepConfig
from gentropy.dataset.dataset import Dataset
from gentropy.dataset.study_locus_overlap import StudyLocusOverlap
from gentropy.dataset.variant_index import VariantIndex
Expand Down Expand Up @@ -45,7 +46,8 @@ class StudyLocusQualityCheck(Enum):
PALINDROMIC_ALLELE_FLAG (str): Alleles are palindromic - cannot harmonize
AMBIGUOUS_STUDY (str): Association with ambiguous study
UNRESOLVED_LD (str): Variant not found in LD reference
LD_CLUMPED (str): Explained by a more significant variant in high LD (clumped)
LD_CLUMPED (str): Explained by a more significant variant in high LD
WINDOW_CLUMPED (str): Explained by a more significant variant in the same window
NO_POPULATION (str): Study does not have population annotation to resolve LD
NOT_QUALIFYING_LD_BLOCK (str): LD block does not contain variants at the required R^2 threshold
FAILED_STUDY (str): Flagging study loci if the study has failed QC
Expand All @@ -65,7 +67,8 @@ class StudyLocusQualityCheck(Enum):
PALINDROMIC_ALLELE_FLAG = "Palindrome alleles - cannot harmonize"
AMBIGUOUS_STUDY = "Association with ambiguous study"
UNRESOLVED_LD = "Variant not found in LD reference"
LD_CLUMPED = "Explained by a more significant variant in high LD (clumped)"
LD_CLUMPED = "Explained by a more significant variant in high LD"
WINDOW_CLUMPED = "Explained by a more significant variant in the same window"
NO_POPULATION = "Study does not have population annotation to resolve LD"
NOT_QUALIFYING_LD_BLOCK = (
"LD block does not contain variants at the required R^2 threshold"
Expand Down Expand Up @@ -157,6 +160,24 @@ def validate_study(self: StudyLocus, study_index: StudyIndex) -> StudyLocus:
_schema=self.get_schema(),
)

def annotate_study_type(self: StudyLocus, study_index: StudyIndex) -> StudyLocus:
    """Add the `studyType` column to the study locus, resolved from the study index.

    Any pre-existing `studyType` column is discarded first, then the type is
    looked up by `studyId` with a left join so loci whose study is missing
    from the index are kept (with a null study type).

    Args:
        study_index (StudyIndex): Study index providing the study-type lookup table.

    Returns:
        StudyLocus: Study locus annotated with the `studyType` column.
    """
    # Left join preserves loci without a matching study in the index.
    annotated_df = self.df.drop("studyType").join(
        study_index.study_type_lut(), on="studyId", how="left"
    )
    return StudyLocus(
        _df=annotated_df,
        _schema=self.get_schema(),
    )

def validate_variant_identifiers(
self: StudyLocus, variant_index: VariantIndex
) -> StudyLocus:
Expand Down Expand Up @@ -394,6 +415,7 @@ def _align_overlapping_tags(
f.col("chromosome"),
f.col("tagVariantId"),
f.col("studyLocusId").alias("rightStudyLocusId"),
f.col("studyType").alias("rightStudyType"),
*[f.col(col).alias(f"right_{col}") for col in stats_cols],
).join(peak_overlaps, on=["chromosome", "rightStudyLocusId"], how="inner")

Expand All @@ -410,6 +432,7 @@ def _align_overlapping_tags(
).select(
"leftStudyLocusId",
"rightStudyLocusId",
"rightStudyType",
"chromosome",
"tagVariantId",
f.struct(
Expand Down Expand Up @@ -504,14 +527,11 @@ def get_QC_mappings(cls: type[StudyLocus]) -> dict[str, str]:
"""
return {member.name: member.value for member in StudyLocusQualityCheck}

def filter_by_study_type(
self: StudyLocus, study_type: str, study_index: StudyIndex
) -> StudyLocus:
def filter_by_study_type(self: StudyLocus, study_type: str) -> StudyLocus:
"""Creates a new StudyLocus dataset filtered by study type.
Args:
study_type (str): Study type to filter for. Can be one of `gwas`, `eqtl`, `pqtl`, `sqtl`.
study_index (StudyIndex): Study index to resolve study types.
Returns:
StudyLocus: Filtered study-locus dataset.
Expand All @@ -523,11 +543,7 @@ def filter_by_study_type(
raise ValueError(
f"Study type {study_type} not supported. Supported types are: gwas, eqtl, pqtl, sqtl."
)
new_df = (
self.df.join(study_index.study_type_lut(), on="studyId", how="inner")
.filter(f.col("studyType") == study_type)
.drop("studyType")
)
new_df = self.df.filter(f.col("studyType") == study_type).drop("studyType")
return StudyLocus(
_df=new_df,
_schema=self._schema,
Expand Down Expand Up @@ -576,22 +592,21 @@ def filter_ld_set(ld_set: Column, r2_threshold: float) -> Column:
)

def find_overlaps(
self: StudyLocus, study_index: StudyIndex, intra_study_overlap: bool = False
self: StudyLocus, intra_study_overlap: bool = False
) -> StudyLocusOverlap:
"""Calculate overlapping study-locus.
Find overlapping study-locus that share at least one tagging variant. All GWAS-GWAS and all GWAS-Molecular traits are computed with the Molecular traits always
appearing on the right side.
Args:
study_index (StudyIndex): Study index to resolve study types.
intra_study_overlap (bool): If True, finds intra-study overlaps for credible set deduplication. Default is False.
Returns:
StudyLocusOverlap: Pairs of overlapping study-locus with aligned tags.
"""
loci_to_overlap = (
self.df.join(study_index.study_type_lut(), on="studyId", how="inner")
self.df.filter(f.col("studyType").isNotNull())
.withColumn("locus", f.explode("locus"))
.select(
"studyLocusId",
Expand Down Expand Up @@ -1032,3 +1047,19 @@ def annotate_locus_statistics_boundaries(
)

return self

def window_based_clumping(
    self: StudyLocus,
    window_size: int = WindowBasedClumpingStepConfig().distance,
) -> StudyLocus:
    """Clump the study locus using window-based clumping.

    Args:
        window_size (int): Window size for clumping.

    Returns:
        StudyLocus: Clumped study locus, where clumped associations are flagged.
    """
    # Imported locally to avoid a circular import at module load time.
    from gentropy.method.window_based_clumping import WindowBasedClumping

    clumped = WindowBasedClumping.clump(self, window_size)
    return clumped
7 changes: 3 additions & 4 deletions src/gentropy/dataset/study_locus_overlap.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
if TYPE_CHECKING:
from pyspark.sql.types import StructType

from gentropy.dataset.study_index import StudyIndex
from gentropy.dataset.study_locus import StudyLocus


Expand All @@ -36,18 +35,17 @@ def get_schema(cls: type[StudyLocusOverlap]) -> StructType:

@classmethod
def from_associations(
cls: type[StudyLocusOverlap], study_locus: StudyLocus, study_index: StudyIndex
cls: type[StudyLocusOverlap], study_locus: StudyLocus
) -> StudyLocusOverlap:
"""Find the overlapping signals in a particular set of associations (StudyLocus dataset).
Args:
study_locus (StudyLocus): Study-locus associations to find the overlapping signals
study_index (StudyIndex): Study index to find the overlapping signals
Returns:
StudyLocusOverlap: Study-locus overlap dataset
"""
return study_locus.find_overlaps(study_index)
return study_locus.find_overlaps()

def _convert_to_square_matrix(self: StudyLocusOverlap) -> StudyLocusOverlap:
"""Convert the dataset to a square matrix.
Expand All @@ -60,6 +58,7 @@ def _convert_to_square_matrix(self: StudyLocusOverlap) -> StudyLocusOverlap:
self.df.selectExpr(
"leftStudyLocusId as rightStudyLocusId",
"rightStudyLocusId as leftStudyLocusId",
"rightStudyType",
"tagVariantId",
)
).distinct(),
Expand Down
7 changes: 4 additions & 3 deletions src/gentropy/dataset/summary_statistics.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,10 +77,11 @@ def window_based_clumping(
from gentropy.method.window_based_clumping import WindowBasedClumping

return WindowBasedClumping.clump(
self,
# Before clumping, we filter the summary statistics by p-value:
self.pvalue_filter(gwas_significance),
distance=distance,
gwas_significance=gwas_significance,
)
# After applying the clumping, we filter the clumped loci by the flag:
).valid_rows(["WINDOW_CLUMPED"])

def locus_breaker_clumping(
self: SummaryStatistics,
Expand Down
10 changes: 9 additions & 1 deletion src/gentropy/gwas_catalog_ingestion.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from __future__ import annotations

from gentropy.common.session import Session
from gentropy.config import WindowBasedClumpingStepConfig
from gentropy.dataset.variant_index import VariantIndex
from gentropy.datasource.gwas_catalog.associations import (
GWASCatalogCuratedAssociationsParser,
Expand Down Expand Up @@ -30,6 +31,7 @@ def __init__(
gnomad_variant_path: str,
catalog_studies_out: str,
catalog_associations_out: str,
distance: int = WindowBasedClumpingStepConfig().distance,
gwas_catalog_study_curation_file: str | None = None,
inclusion_list_path: str | None = None,
) -> None:
Expand All @@ -44,6 +46,7 @@ def __init__(
gnomad_variant_path (str): Path to GnomAD variants.
catalog_studies_out (str): Output GWAS catalog studies path.
catalog_associations_out (str): Output GWAS catalog associations path.
distance (int): Distance within which tagging variants are collected around the semi-indices.
gwas_catalog_study_curation_file (str | None): file of the curation table. Optional.
inclusion_list_path (str | None): optional inclusion list (parquet)
"""
Expand Down Expand Up @@ -86,4 +89,9 @@ def __init__(

# Load
study_index.df.write.mode(session.write_mode).parquet(catalog_studies_out)
study_locus.df.write.mode(session.write_mode).parquet(catalog_associations_out)

(
study_locus.window_based_clumping(distance)
.df.write.mode(session.write_mode)
.parquet(catalog_associations_out)
)
4 changes: 2 additions & 2 deletions src/gentropy/l2g.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,7 +204,7 @@ def _generate_feature_matrix(self, write_feature_matrix: bool) -> L2GFeatureMatr
ValueError: If write_feature_matrix is set to True but a path is not provided.
ValueError: If dependencies to build features are not set.
"""
if self.gs_curation and self.interactions and self.v2g and self.studies:
if self.gs_curation and self.interactions and self.v2g:
study_locus_overlap = StudyLocus(
_df=self.credible_set.df.join(
f.broadcast(
Expand All @@ -225,7 +225,7 @@ def _generate_feature_matrix(self, write_feature_matrix: bool) -> L2GFeatureMatr
"inner",
),
_schema=StudyLocus.get_schema(),
).find_overlaps(self.studies)
).find_overlaps()

gold_standards = L2GGoldStandard.from_otg_curation(
gold_standard_curation=self.gs_curation,
Expand Down
4 changes: 2 additions & 2 deletions src/gentropy/method/colocalisation.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ def colocalise(
f.col("statistics.right_posteriorProbability"),
),
)
.groupBy("leftStudyLocusId", "rightStudyLocusId", "chromosome")
.groupBy("leftStudyLocusId", "rightStudyLocusId", "rightStudyType", "chromosome")
.agg(
f.count("*").alias("numberColocalisingVariants"),
f.sum(f.col("clpp")).alias("clpp"),
Expand Down Expand Up @@ -168,7 +168,7 @@ def colocalise(
f.col("left_logBF") + f.col("right_logBF"),
)
# Group by overlapping peak and generating dense vectors of log_BF:
.groupBy("chromosome", "leftStudyLocusId", "rightStudyLocusId")
.groupBy("chromosome", "leftStudyLocusId", "rightStudyLocusId", "rightStudyType")
.agg(
f.count("*").alias("numberColocalisingVariants"),
fml.array_to_vector(f.collect_list(f.col("left_logBF"))).alias(
Expand Down
Loading

0 comments on commit c9eada2

Please sign in to comment.