Merge pull request #595 from opentargets/dev

chore: trigger release process
opentargets · May 17, 2024 · b0f8903 · b0f8903
2 parents 18f0fa7 + bb9f9c6
commit b0f8903
Show file tree

Hide file tree

Showing 74 changed files with 7,387 additions and 1,175 deletions.
diff --git a/.github/labeler.yml b/.github/labeler.yml
@@ -1,4 +1,4 @@
-version: "1"
+version: 1
 labels:
   - label: "size-XS"
     size:

diff --git a/.github/workflows/artifact.yml b/.github/workflows/artifact.yml
@@ -0,0 +1,52 @@
+# Builds the gentropy Docker image and pushes it to Google Artifact Registry
+# whenever a commit lands on the `dev` branch.
+name: Build and Push to Artifact Registry
+
+# Quoted so YAML 1.1 parsers do not read the key as the boolean `true`.
+"on":
+  push:
+    branches: ["dev"]
+
+# The job only reads the repository; registry access comes from the GCP
+# service-account key, not from GITHUB_TOKEN.
+permissions:
+  contents: read
+
+env:
+  PROJECT_ID: open-targets-genetics-dev
+  REGION: europe-west1
+  # Registry host plus project; must stay in sync with REGION and PROJECT_ID
+  # because workflow-level env entries cannot reference each other.
+  GAR_LOCATION: europe-west1-docker.pkg.dev/open-targets-genetics-dev
+  IMAGE_NAME: gentropy-app
+
+jobs:
+  build-push-artifact:
+    runs-on: ubuntu-latest
+    steps:
+      - name: "Checkout"
+        uses: "actions/checkout@v4"
+
+      - name: "auth"
+        uses: "google-github-actions/auth@v2"
+        with:
+          credentials_json: "${{ secrets.SERVICE_ACCOUNT_KEY }}"
+
+      - name: "Set up Cloud SDK"
+        uses: "google-github-actions/setup-gcloud@v2"
+
+      # Diagnostic only: prints the active gcloud configuration to the log.
+      - name: "Use gcloud CLI"
+        run: "gcloud info"
+
+      # Shell variables (exported from `env` above) are used instead of
+      # `${{ }}` expressions so values are never spliced into the script text.
+      - name: "Docker auth"
+        run: |-
+          gcloud auth configure-docker "${REGION}-docker.pkg.dev" --quiet
+
+      - name: Build image
+        run: docker build . --tag "${GAR_LOCATION}/${IMAGE_NAME}/gentropy:${GITHUB_REF_NAME}"
+
+      - name: Push image
+        run: docker push "${GAR_LOCATION}/${IMAGE_NAME}/gentropy:${GITHUB_REF_NAME}"
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -6,7 +6,7 @@ ci:
   skip: [poetry-lock]
 repos:
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.3.0
+    rev: v0.4.3
     hooks:
       - id: ruff
         args:
@@ -15,7 +15,7 @@ repos:
         files: ^((gentropy|utils|tests)/.+)?[^/]+\.py$
 
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.5.0
+    rev: v4.6.0
     hooks:
       - id: trailing-whitespace
       - id: end-of-file-fixer
@@ -59,14 +59,14 @@ repos:
         exclude: "CHANGELOG.md"
 
   - repo: https://github.com/alessandrojcm/commitlint-pre-commit-hook
-    rev: v9.11.0
+    rev: v9.16.0
     hooks:
       - id: commitlint
-        additional_dependencies: ["@commitlint/config-conventional"]
+        additional_dependencies: ["@commitlint/config-conventional@18.6.3"]
         stages: [commit-msg]
 
   - repo: https://github.com/pre-commit/mirrors-mypy
-    rev: "v1.8.0"
+    rev: "v1.10.0"
     hooks:
       - id: mypy
         args:
@@ -82,7 +82,7 @@ repos:
           - "--disallow-untyped-defs"
 
   - repo: https://github.com/econchick/interrogate
-    rev: 1.5.0
+    rev: 1.7.0
     hooks:
       - id: interrogate
         args: [--verbose]
@@ -104,7 +104,7 @@ repos:
       - id: pydoclint
 
   - repo: https://github.com/python-poetry/poetry
-    rev: "1.8.2"
+    rev: "1.8.0"
     hooks:
       - id: poetry-check
       - id: poetry-lock

diff --git a/Dockerfile b/Dockerfile
@@ -0,0 +1,44 @@
+# Image that runs the gentropy CLI: Python 3.10 with a Java 11 JDK, with
+# dependencies installed through Poetry.
+FROM python:3.10-bullseye
+
+
+# Install the JDK and drop the apt lists in the same layer to keep it small.
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends openjdk-11-jdk && \
+    apt-get clean && rm -rf /var/lib/apt/lists/*
+
+# Fail the build early if the Java install is broken.
+RUN java -version
+
+# Set environment variables for Java
+# NOTE(review): the path is amd64-specific; confirm before building for other architectures.
+ENV JAVA_HOME=/usr/lib/jvm/java-11-openjdk-amd64
+ENV PATH=$PATH:$JAVA_HOME/bin
+
+RUN pip install --no-cache-dir poetry==1.7.1
+
+# Non-interactive Poetry with an in-project virtualenv and a throwaway cache.
+ENV POETRY_NO_INTERACTION=1 \
+    POETRY_VIRTUALENVS_IN_PROJECT=1 \
+    POETRY_VIRTUALENVS_CREATE=1 \
+    POETRY_CACHE_DIR=/tmp/poetry_cache
+
+WORKDIR /app
+
+# Copy only the dependency manifests first so the dependency layer is cached
+# until they change.
+COPY pyproject.toml poetry.lock ./
+# Placeholder README — presumably pyproject.toml references one; confirm.
+RUN touch README.md
+
+RUN poetry config installer.max-workers 10
+# Install third-party dependencies only (--no-root), then drop the cache.
+RUN poetry install --without dev,docs,tests --no-root --no-interaction --no-ansi -vvv && rm -rf "$POETRY_CACHE_DIR"
+
+COPY src ./src
+
+# Install the project itself now that its sources are present, then drop the cache again.
+RUN poetry install --without dev,docs,tests && rm -rf "$POETRY_CACHE_DIR"
+
+ENTRYPOINT ["poetry", "run", "gentropy"]
diff --git a/config/datasets/ot_gcp.yaml b/config/datasets/ot_gcp.yaml
@@ -1,10 +1,10 @@
 # Release specific configuration:
-release_version: "24.01"
+release_version: "24.03"
 dev_version: XX.XX
 release_folder: gs://genetics_etl_python_playground/releases/${datasets.release_version}
 
 inputs: gs://genetics_etl_python_playground/input
-static_assets: gs://genetics_etl_python_playground/static_assetss
+static_assets: gs://genetics_etl_python_playground/static_assets
 outputs: gs://genetics_etl_python_playground/output/python_etl/parquet/${datasets.dev_version}
 
 ## Datasets:
@@ -36,9 +36,9 @@ anderson: ${datasets.static_assets}/andersson2014/enhancer_tss_associations.bed
 javierre: ${datasets.static_assets}/javierre_2016_preprocessed
 jung: ${datasets.static_assets}/jung2019_pchic_tableS3.csv
 thurman: ${datasets.static_assets}/thurman2012/genomewideCorrs_above0.7_promoterPlusMinus500kb_withGeneNames_32celltypeCategories.bed8.gz
-target_index: ${datasets.release_folder}/targets # OTP 23.12 data
+target_index: ${datasets.static_assets}/targets # OTP 23.12 data
+gene_interactions: ${datasets.static_assets}/interaction # OTP 23.12 data
 
-gene_interactions: ${datasets.release_folder}/interaction # OTP 23.12 data
 finngen_finemapping_results_path: ${datasets.inputs}/Finngen_susie_finemapping_r10/full
 finngen_finemapping_summaries_path: ${datasets.inputs}/Finngen_susie_finemapping_r10/Finngen_susie_credset_summary_r10.tsv
 

diff --git a/config/step/ot_colocalisation.yaml → config/step/ot_colocalisation_coloc.yaml b/config/step/ot_colocalisation.yaml → config/step/ot_colocalisation_coloc.yaml
@@ -4,3 +4,4 @@ defaults:
 credible_set_path: ${datasets.credible_set}
 study_index_path: ${datasets.study_index}
 coloc_path: ${datasets.colocalisation}
+colocalisation_method: Coloc
diff --git a/config/step/ot_colocalisation_ecaviar.yaml b/config/step/ot_colocalisation_ecaviar.yaml
@@ -0,0 +1,7 @@
+defaults:
+  - colocalisation  # presumably the base `colocalisation` step config — confirm
+
+credible_set_path: ${datasets.credible_set}
+study_index_path: ${datasets.study_index}
+coloc_path: ${datasets.colocalisation}
+colocalisation_method: ECaviar  # counterpart config ot_colocalisation_coloc.yaml sets Coloc
diff --git a/config/step/ot_variant_index.yaml b/config/step/ot_variant_index.yaml
@@ -2,5 +2,5 @@ defaults:
   - variant_index
 
 variant_annotation_path: ${datasets.variant_annotation}
-credible_set_path: ${datasets.study_locus}
+credible_set_path: ${datasets.credible_set}
 variant_index_path: ${datasets.variant_index}
diff --git a/docs/python_api/methods/sumstat_imputation.md b/docs/python_api/methods/sumstat_imputation.md
@@ -0,0 +1,28 @@
+---
+title: Summary Statistics Imputation
+---
+
+Summary statistics imputation leverages linkage disequilibrium (LD) information to compute
+the Z-scores of missing SNPs from those of neighbouring observed SNPs.
+
+We implemented the basic model from RAISS (Robust and Accurate Imputation from Summary Statistics) package (see the original [paper](https://academic.oup.com/bioinformatics/article/35/22/4837/5512360)).
+
+The full repository for the RAISS package can be found [here](https://gitlab.pasteur.fr/statistical-genetics/raiss).
+
+The original model was suggested in 2014 by Bogdan Pasaniuc et al. [here](https://pubmed.ncbi.nlm.nih.gov/24990607/).
+
+It represents the following formula:
+
+E(z_i|z_t) = M\_{i,t} \cdot (M\_{t,t})^{-1} \cdot z_t
+
+Where:
+
+- E(z_i|z_t) represents the expected z-score of SNP 'i' given the observed z-scores at known SNP indexes 't'.
+
+- M\_{i,t} represents the LD (Linkage Disequilibrium) matrix between SNP 'i' and the known SNPs at indexes 't'.
+
+- (M\_{t,t})^{-1} represents the inverse of the LD matrix of the known SNPs at indexes 't'.
+
+- z_t represents the vector of observed z-scores at the known SNP indexes 't'.
+
+:::gentropy.method.sumstat_imputation.SummaryStatisticsImputation
diff --git a/docs/python_api/methods/sumstat_quality_controls.md b/docs/python_api/methods/sumstat_quality_controls.md
@@ -0,0 +1,18 @@
+---
+title: QC of GWAS Summary Statistics
+---
+
+This class consists of several general quality control checks for GWAS with full summary statistics.
+There are several checks included:
+
+1. Genomic control lambda (median of the distribution of Chi2 statistics divided by expected for Chi2 with df=1). Lambda should be reasonably close to 1. Ideally not bigger than 2.
+
+2. P-Z check: the linear regression between log10 of reported p-values and log10 of p-values inferred from betas and standard errors. Intercept of the regression should be close to 0, slope close to 1.
+
+3. Mean beta check: mean of beta. Should be close to 0.
+
+4. The N_eff check: It estimates the ratio between effective sample size and the expected one and checks its distribution. This check can only be conducted if the effective allele frequency is provided in the study. The median ratio is always close to 1, and the standard error should be close to 0.
+
+5. Number of SNPs and number of significant SNPs.
+
+:::gentropy.method.sumstat_quality_controls.SummaryStatisticsQC
diff --git a/docs/src_snippets/howto/python_api/c_applying_methods.py b/docs/src_snippets/howto/python_api/c_applying_methods.py
@@ -23,7 +23,7 @@ def apply_class_method_clumping(summary_stats: SummaryStatistics) -> StudyLocus:
     from gentropy.method.window_based_clumping import WindowBasedClumping
 
     clumped_summary_statistics = WindowBasedClumping.clump(
-        summary_stats, window_length=500_000
+        summary_stats, distance=250_000
     )
     # --8<-- [end:apply_class_method_clumping]
     return clumped_summary_statistics