Merge branch 'dev' into dependabot/pip/lxml-5.2.1

opentargets · Apr 29, 2024 · ce3f834 · ce3f834
2 parents 70c0c19 + d163215
commit ce3f834
Show file tree

Hide file tree

Showing 20 changed files with 3,610 additions and 64 deletions.
diff --git a/.github/workflows/artifact.yml b/.github/workflows/artifact.yml
@@ -0,0 +1,39 @@
+name: Build and Push to Artifact Registry
+
+"on":
+  push:
+    branches: ["dev"]  # only pushes to dev trigger this workflow
+
+env:
+  PROJECT_ID: open-targets-genetics-dev  # not referenced by any step below
+  REGION: europe-west1
+  GAR_LOCATION: europe-west1-docker.pkg.dev/open-targets-genetics-dev
+  IMAGE_NAME: gentropy-app
+
+jobs:
+  build-push-artifact:
+    runs-on: ubuntu-latest
+    steps:
+      - name: "Checkout"
+        uses: "actions/checkout@v4"  # v3 runs on the deprecated Node 16 runtime
+
+      - name: "auth"
+        uses: "google-github-actions/auth@v2"
+        with:
+          credentials_json: "${{ secrets.SERVICE_ACCOUNT_KEY }}"
+
+      - name: "Set up Cloud SDK"
+        uses: "google-github-actions/setup-gcloud@v2"
+
+      - name: "Use gcloud CLI"
+        run: "gcloud info"
+
+      - name: "Docker auth"
+        run: |-
+          gcloud auth configure-docker ${{ env.REGION }}-docker.pkg.dev --quiet
+
+      - name: Build image
+        run: docker build . --tag "${{ env.GAR_LOCATION }}/${{ env.IMAGE_NAME }}/gentropy:${{ github.ref_name }}"
+
+      - name: Push image
+        run: docker push "${{ env.GAR_LOCATION }}/${{ env.IMAGE_NAME }}/gentropy:${{ github.ref_name }}"
diff --git a/Dockerfile b/Dockerfile
@@ -0,0 +1,33 @@
+FROM python:3.10-bullseye
+
+# Install a Java 11 JDK, then drop the apt caches and package lists from the layer
+RUN apt-get update && \
+    apt-get install -y openjdk-11-jdk && \
+    apt-get clean && rm -rf /var/lib/apt/lists/*
+
+RUN java -version
+
+# Set environment variables for Java
+ENV JAVA_HOME=/usr/lib/jvm/java-11-openjdk-amd64
+ENV PATH=$PATH:$JAVA_HOME/bin
+
+RUN pip install --no-cache-dir poetry==1.7.1
+
+ENV POETRY_NO_INTERACTION=1 \
+    POETRY_VIRTUALENVS_IN_PROJECT=1 \
+    POETRY_VIRTUALENVS_CREATE=1 \
+    POETRY_CACHE_DIR=/tmp/poetry_cache
+
+WORKDIR /app
+# Copy only the dependency manifests first; sources are copied after the dependency install
+COPY pyproject.toml poetry.lock ./
+RUN touch README.md
+
+RUN poetry config installer.max-workers 10
+RUN poetry install --without dev,docs,tests --no-root --no-interaction --no-ansi -vvv && rm -rf $POETRY_CACHE_DIR
+
+COPY src ./src
+# Second install (no --no-root) adds the project itself now that src is present
+RUN poetry install --without dev,docs,tests
+
+ENTRYPOINT ["poetry", "run", "gentropy"]
diff --git a/config/step/ot_variant_index.yaml b/config/step/ot_variant_index.yaml
@@ -2,5 +2,5 @@ defaults:
   - variant_index
 
 variant_annotation_path: ${datasets.variant_annotation}
-credible_set_path: ${datasets.study_locus}
+credible_set_path: ${datasets.credible_set}
 variant_index_path: ${datasets.variant_index}
diff --git a/docs/python_api/methods/sumstat_quality_controls.md b/docs/python_api/methods/sumstat_quality_controls.md
@@ -0,0 +1,18 @@
+---
+title: QC of GWAS Summary Statistics
+---
+
+This class consists of several general quality control checks for GWAS with full summary statistics.
+There are several checks included:
+
+1. Genomic control lambda (the median of the observed Chi2 statistics divided by the expected median of a Chi2 distribution with df=1). Lambda should be reasonably close to 1, and ideally not bigger than 2.
+
+2. P-Z check: the linear regression between log10 of reported p-values and log10 of p-values inferred from betas and standard errors. Intercept of the regression should be close to 0, slope close to 1.
+
+3. Mean beta check: mean of beta. Should be close to 0.
+
+4. The N_eff check: it estimates the ratio between the effective sample size and the expected one and checks its distribution. This check can only be conducted if the effective allele frequency is provided in the study. The median ratio is always close to 1; the standard error should be close to 0.
+
+5. Number of SNPs and number of significant SNPs.
+
+:::gentropy.method.sumstat_quality_controls.SummaryStatisticsQC
diff --git a/notebooks/FineMapping_AlzheimierDisease.ipynb b/notebooks/FineMapping_AlzheimierDisease.ipynb
diff --git a/notebooks/Mapping_EFO_finngen.ipynb b/notebooks/Mapping_EFO_finngen.ipynb
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -74,7 +74,7 @@ apache-airflow = "^2.8.0"
 apache-airflow-providers-google = "^10.13.1"
 pydoclint = ">=0.3.8,<0.5.0"
 prettier = "^0.0.7"
-deptry = ">=0.12,<0.15"
+deptry = ">=0.12,<0.17"
 python-semantic-release = ">=8.7,<10.0"
 yamllint = "^1.33.0"
 

diff --git a/src/airflow/dags/genetics_etl.py b/src/airflow/dags/genetics_etl.py
@@ -35,16 +35,16 @@
     # PICS credible sets from GWAS Catalog curated associations:
     "gwas_catalog_curated_credible_set": {
         "source_bucket": GWAS_CATALOG_BUCKET_NAME,
-        "source_object": "credible_set_datasets/gwas_catalog_curated",
+        "source_object": "credible_set_datasets/gwas_catalog_PICSed_curated_associations",
         "destination_bucket": RELEASE_BUCKET_NAME,
-        "destination_object": f"releases/{RELEASE_VERSION}/credible_set/gwas_catalog_pics_from_curation",
+        "destination_object": f"releases/{RELEASE_VERSION}/credible_set/gwas_catalog_PICSed_curated_associations",
     },
     # PICS credible sets from GWAS Catalog summary statistics:
     "gwas_catalog_sumstats_credible_set": {
         "source_bucket": GWAS_CATALOG_BUCKET_NAME,
-        "source_object": "credible_set_datasets/gwas_catalog_summary_stats",
+        "source_object": "credible_set_datasets/gwas_catalog_PICSed_summary_statistics",
         "destination_bucket": RELEASE_BUCKET_NAME,
-        "destination_object": f"releases/{RELEASE_VERSION}/credible_set/gwas_catalog_pics_from_summary_statistics",
+        "destination_object": f"releases/{RELEASE_VERSION}/credible_set/gwas_catalog_PICSed_summary_statistics",
     },
     # GWAS Catalog manifest files:
     "gwas_catalog_manifests": {

diff --git a/src/airflow/dags/gwas_catalog_preprocess.py b/src/airflow/dags/gwas_catalog_preprocess.py
@@ -45,12 +45,8 @@
 WINDOW_BASED_CLUMPED = f"gs://{GWAS_CATALOG_BUCKET_NAME}/study_locus_datasets/gwas_catalog_summary_stats_window_clumped"
 LD_BASED_CLUMPED = f"gs://{GWAS_CATALOG_BUCKET_NAME}/study_locus_datasets/gwas_catalog_summary_stats_ld_clumped"
 # Credible sets:
-CURATED_CREDIBLE_SETS = (
-    f"gs://{GWAS_CATALOG_BUCKET_NAME}/credible_set_datasets/gwas_catalog_curated"
-)
-SUMMARY_STATISTICS_CREDIBLE_SETS = (
-    f"gs://{GWAS_CATALOG_BUCKET_NAME}/credible_set_datasets/gwas_catalog_summary_stats"
-)
+CURATED_CREDIBLE_SETS = f"gs://{GWAS_CATALOG_BUCKET_NAME}/credible_set_datasets/gwas_catalog_PICSed_curated_associations"
+SUMMARY_STATISTICS_CREDIBLE_SETS = f"gs://{GWAS_CATALOG_BUCKET_NAME}/credible_set_datasets/gwas_catalog_PICSed_summary_statistics"
 
 
 def upload_harmonized_study_list(

diff --git a/src/gentropy/config.py b/src/gentropy/config.py
@@ -328,6 +328,24 @@ class WindowBasedClumpingStep(StepConfig):
     _target_: str = "gentropy.window_based_clumping.WindowBasedClumpingStep"
 
 
+@dataclass
+class FinemapperConfig(StepConfig):
+    """SuSiE fine-mapper step configuration."""
+
+    session: Any = field(  # default dict is built per instance by the factory below
+        default_factory=lambda: {
+            "start_hail": True,
+        }
+    )
+    study_locus_to_finemap: str = MISSING  # NOTE(review): MISSING presumably marks a mandatory value with no default — confirm
+    study_locus_collected_path: str = MISSING
+    study_index_path: str = MISSING
+    output_path: str = MISSING
+    locus_radius: int = MISSING
+    max_causal_snps: int = MISSING
+    _target_: str = "gentropy.susie_finemapper.SusieFineMapperStep"  # dotted path; presumably the step class to instantiate — confirm
+
+
+
 @dataclass
 class Config:
     """Application configuration."""
@@ -385,3 +403,4 @@ def register_config() -> None:
     cs.store(group="step", name="variant_index", node=VariantIndexConfig)
     cs.store(group="step", name="variant_to_gene", node=VariantToGeneConfig)
     cs.store(group="step", name="window_based_clumping", node=WindowBasedClumpingStep)
+    cs.store(group="step", name="susie_finemapping", node=FinemapperConfig)
diff --git a/src/gentropy/dataset/study_locus.py b/src/gentropy/dataset/study_locus.py
@@ -82,31 +82,52 @@ class StudyLocus(Dataset):
     """
 
     @staticmethod
-    def _overlapping_peaks(credset_to_overlap: DataFrame) -> DataFrame:
+    def _overlapping_peaks(
+        credset_to_overlap: DataFrame, intra_study_overlap: bool = False
+    ) -> DataFrame:
         """Calculate overlapping signals (study-locus) between GWAS-GWAS and GWAS-Molecular trait.
 
         Args:
             credset_to_overlap (DataFrame): DataFrame containing at least `studyLocusId`, `studyType`, `chromosome` and `tagVariantId` columns.
+            intra_study_overlap (bool): When True, finds intra-study overlaps for credible set deduplication. Default is False.
 
         Returns:
             DataFrame: containing `leftStudyLocusId`, `rightStudyLocusId` and `chromosome` columns.
         """
         # Reduce columns to the minimum to reduce the size of the dataframe
         credset_to_overlap = credset_to_overlap.select(
-            "studyLocusId", "studyType", "chromosome", "tagVariantId"
+            "studyLocusId",
+            "studyId",
+            "studyType",
+            "chromosome",
+            "region",
+            "tagVariantId",
         )
+        # Define join condition - if intra_study_overlap is True, finds overlaps within the same study. Otherwise finds gwas vs everything overlaps for coloc.
+        join_condition = (
+            [
+                f.col("left.studyId") == f.col("right.studyId"),
+                f.col("left.chromosome") == f.col("right.chromosome"),
+                f.col("left.tagVariantId") == f.col("right.tagVariantId"),
+                f.col("left.studyLocusId") > f.col("right.studyLocusId"),
+                f.col("left.region") != f.col("right.region"),
+            ]
+            if intra_study_overlap
+            else [
+                f.col("left.chromosome") == f.col("right.chromosome"),
+                f.col("left.tagVariantId") == f.col("right.tagVariantId"),
+                (f.col("right.studyType") != "gwas")
+                | (f.col("left.studyLocusId") > f.col("right.studyLocusId")),
+                f.col("left.studyType") == f.lit("gwas"),
+            ]
+        )
+
         return (
             credset_to_overlap.alias("left")
-            .filter(f.col("studyType") == "gwas")
-            # Self join with complex condition. Left it's all gwas and right can be gwas or molecular trait
+            # Self join with complex condition.
             .join(
                 credset_to_overlap.alias("right"),
-                on=[
-                    f.col("left.chromosome") == f.col("right.chromosome"),
-                    f.col("left.tagVariantId") == f.col("right.tagVariantId"),
-                    (f.col("right.studyType") != "gwas")
-                    | (f.col("left.studyLocusId") > f.col("right.studyLocusId")),
-                ],
+                on=join_condition,
                 how="inner",
             )
             .select(
@@ -305,14 +326,17 @@ def filter_credible_set(
         )
         return self
 
-    def find_overlaps(self: StudyLocus, study_index: StudyIndex) -> StudyLocusOverlap:
+    def find_overlaps(
+        self: StudyLocus, study_index: StudyIndex, intra_study_overlap: bool = False
+    ) -> StudyLocusOverlap:
         """Calculate overlapping study-locus.
 
         Find overlapping study-locus that share at least one tagging variant. All GWAS-GWAS and all GWAS-Molecular traits are computed with the Molecular traits always
         appearing on the right side.
 
         Args:
             study_index (StudyIndex): Study index to resolve study types.
+            intra_study_overlap (bool): If True, finds intra-study overlaps for credible set deduplication. Default is False.
 
         Returns:
             StudyLocusOverlap: Pairs of overlapping study-locus with aligned tags.
@@ -322,8 +346,10 @@ def find_overlaps(self: StudyLocus, study_index: StudyIndex) -> StudyLocusOverla
             .withColumn("locus", f.explode("locus"))
             .select(
                 "studyLocusId",
+                "studyId",
                 "studyType",
                 "chromosome",
+                "region",
                 f.col("locus.variantId").alias("tagVariantId"),
                 f.col("locus.logBF").alias("logBF"),
                 f.col("locus.posteriorProbability").alias("posteriorProbability"),
@@ -335,7 +361,7 @@ def find_overlaps(self: StudyLocus, study_index: StudyIndex) -> StudyLocusOverla
         )
 
         # overlapping study-locus
-        peak_overlaps = self._overlapping_peaks(loci_to_overlap)
+        peak_overlaps = self._overlapping_peaks(loci_to_overlap, intra_study_overlap)
 
         # study-locus overlap by aligning overlapping variants
         return self._align_overlapping_tags(loci_to_overlap, peak_overlaps)

diff --git a/src/gentropy/dataset/summary_statistics.py b/src/gentropy/dataset/summary_statistics.py
@@ -103,3 +103,29 @@ def exclude_region(self: SummaryStatistics, region: str) -> SummaryStatistics:
             ),
             _schema=SummaryStatistics.get_schema(),
         )
+
+    def sanity_filter(self: SummaryStatistics) -> SummaryStatistics:
+        """Filter out summary statistics rows that fail basic sanity checks.
+
+        A row is kept only if all of the following hold:
+            - The p-value (`pValueMantissa * 10 ** pValueExponent`) is not equal to 1.
+            - Neither `beta` nor `standardError` is equal to 0.
+            - None of `beta`, `standardError`, `pValueMantissa` and `pValueExponent` is missing (null/NaN).
+
+        Returns:
+            SummaryStatistics: The filtered summary statistics.
+        """
+        gwas_df = self._df
+        gwas_df = gwas_df.dropna(
+            subset=["beta", "standardError", "pValueMantissa", "pValueExponent"]
+        )
+
+        gwas_df = gwas_df.filter((f.col("beta") != 0) & (f.col("standardError") != 0))
+        gwas_df = gwas_df.filter(
+            f.col("pValueMantissa") * 10 ** f.col("pValueExponent") != 1
+        )
+
+        return SummaryStatistics(
+            _df=gwas_df,
+            _schema=SummaryStatistics.get_schema(),
+        )
diff --git a/src/gentropy/datasource/gnomad/ld.py b/src/gentropy/datasource/gnomad/ld.py
@@ -453,23 +453,23 @@ def get_ld_matrix_slice(
     def get_locus_index(
         self: GnomADLDMatrix,
         study_locus_row: Row,
-        window_size: int = 1_000_000,
+        radius: int = 500_000,
         major_population: str = "nfe",
     ) -> DataFrame:
         """Extract hail matrix index from StudyLocus rows.
 
         Args:
             study_locus_row (Row): Study-locus row
-            window_size (int): Window size to extract from gnomad matrix
+            radius (int): Locus radius to extract from gnomad matrix
             major_population (str): Major population to extract from gnomad matrix, default is "nfe"
 
         Returns:
             DataFrame: Returns the index of the gnomad matrix for the locus
 
         """
         chromosome = str("chr" + study_locus_row["chromosome"])
-        start = study_locus_row["position"] - window_size // 2
-        end = study_locus_row["position"] + window_size // 2
+        start = study_locus_row["position"] - radius
+        end = study_locus_row["position"] + radius
 
         liftover_ht = hl.read_table(self.liftover_ht_path)
         liftover_ht = (