diff --git a/.github/labeler.yml b/.github/labeler.yml index 060922825..fdb0e58b6 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -1,4 +1,4 @@ -version: "1" +version: 1 labels: - label: "size-XS" size: diff --git a/.github/workflows/artifact.yml b/.github/workflows/artifact.yml new file mode 100644 index 000000000..ce7b47b6c --- /dev/null +++ b/.github/workflows/artifact.yml @@ -0,0 +1,39 @@ +name: Build and Push to Artifact Registry + +"on": + push: + branches: ["dev"] + +env: + PROJECT_ID: open-targets-genetics-dev + REGION: europe-west1 + GAR_LOCATION: europe-west1-docker.pkg.dev/open-targets-genetics-dev + IMAGE_NAME: gentropy-app + +jobs: + build-push-artifact: + runs-on: ubuntu-latest + steps: + - name: "Checkout" + uses: "actions/checkout@v3" + + - name: "auth" + uses: "google-github-actions/auth@v2" + with: + credentials_json: "${{ secrets.SERVICE_ACCOUNT_KEY }}" + + - name: "Set up Cloud SDK" + uses: "google-github-actions/setup-gcloud@v2" + + - name: "Use gcloud CLI" + run: "gcloud info" + + - name: "Docker auth" + run: |- + gcloud auth configure-docker ${{ env.REGION }}-docker.pkg.dev --quiet + + - name: Build image + run: docker build . 
--tag "${{ env.GAR_LOCATION }}/${{ env.IMAGE_NAME }}/gentropy:${{ github.ref_name }}" + + - name: Push image + run: docker push "${{ env.GAR_LOCATION }}/${{ env.IMAGE_NAME }}/gentropy:${{ github.ref_name }}" diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 51d4e05a8..00d914dc0 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -6,7 +6,7 @@ ci: skip: [poetry-lock] repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.3.0 + rev: v0.4.3 hooks: - id: ruff args: @@ -15,7 +15,7 @@ repos: files: ^((gentropy|utils|tests)/.+)?[^/]+\.py$ - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.5.0 + rev: v4.6.0 hooks: - id: trailing-whitespace - id: end-of-file-fixer @@ -59,14 +59,14 @@ repos: exclude: "CHANGELOG.md" - repo: https://github.com/alessandrojcm/commitlint-pre-commit-hook - rev: v9.11.0 + rev: v9.16.0 hooks: - id: commitlint - additional_dependencies: ["@commitlint/config-conventional"] + additional_dependencies: ["@commitlint/config-conventional@18.6.3"] stages: [commit-msg] - repo: https://github.com/pre-commit/mirrors-mypy - rev: "v1.8.0" + rev: "v1.10.0" hooks: - id: mypy args: @@ -82,7 +82,7 @@ repos: - "--disallow-untyped-defs" - repo: https://github.com/econchick/interrogate - rev: 1.5.0 + rev: 1.7.0 hooks: - id: interrogate args: [--verbose] @@ -104,7 +104,7 @@ repos: - id: pydoclint - repo: https://github.com/python-poetry/poetry - rev: "1.8.2" + rev: "1.8.0" hooks: - id: poetry-check - id: poetry-lock diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 000000000..deb43bcd8 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,33 @@ +FROM python:3.10-bullseye + + +RUN apt-get update && \ + apt-get install -y openjdk-11-jdk && \ + apt-get clean && rm -rf /var/lib/apt/lists/* + +RUN java -version + +# Set environment variables for Java +ENV JAVA_HOME /usr/lib/jvm/java-11-openjdk-amd64 +ENV PATH=$PATH:$JAVA_HOME/bin + +RUN pip install poetry==1.7.1 + +ENV POETRY_NO_INTERACTION=1 \ + 
POETRY_VIRTUALENVS_IN_PROJECT=1 \ + POETRY_VIRTUALENVS_CREATE=1 \ + POETRY_CACHE_DIR=/tmp/poetry_cache + +WORKDIR /app + +COPY pyproject.toml poetry.lock ./ +RUN touch README.md + +RUN poetry config installer.max-workers 10 +RUN poetry install --without dev,docs,tests --no-root --no-interaction --no-ansi -vvv && rm -rf $POETRY_CACHE_DIR + +COPY src ./src + +RUN poetry install --without dev,docs,tests + +ENTRYPOINT ["poetry", "run", "gentropy"] diff --git a/config/datasets/ot_gcp.yaml b/config/datasets/ot_gcp.yaml index ebc9fc9c2..566e2ec50 100644 --- a/config/datasets/ot_gcp.yaml +++ b/config/datasets/ot_gcp.yaml @@ -1,10 +1,10 @@ # Release specific configuration: -release_version: "24.01" +release_version: "24.03" dev_version: XX.XX release_folder: gs://genetics_etl_python_playground/releases/${datasets.release_version} inputs: gs://genetics_etl_python_playground/input -static_assets: gs://genetics_etl_python_playground/static_assetss +static_assets: gs://genetics_etl_python_playground/static_assets outputs: gs://genetics_etl_python_playground/output/python_etl/parquet/${datasets.dev_version} ## Datasets: @@ -36,9 +36,9 @@ anderson: ${datasets.static_assets}/andersson2014/enhancer_tss_associations.bed javierre: ${datasets.static_assets}/javierre_2016_preprocessed jung: ${datasets.static_assets}/jung2019_pchic_tableS3.csv thurman: ${datasets.static_assets}/thurman2012/genomewideCorrs_above0.7_promoterPlusMinus500kb_withGeneNames_32celltypeCategories.bed8.gz -target_index: ${datasets.release_folder}/targets # OTP 23.12 data +target_index: ${datasets.static_assets}/targets # OTP 23.12 data +gene_interactions: ${datasets.static_assets}/interaction # OTP 23.12 data -gene_interactions: ${datasets.release_folder}/interaction # OTP 23.12 data finngen_finemapping_results_path: ${datasets.inputs}/Finngen_susie_finemapping_r10/full finngen_finemapping_summaries_path: ${datasets.inputs}/Finngen_susie_finemapping_r10/Finngen_susie_credset_summary_r10.tsv diff --git 
a/config/step/ot_colocalisation.yaml b/config/step/ot_colocalisation_coloc.yaml similarity index 84% rename from config/step/ot_colocalisation.yaml rename to config/step/ot_colocalisation_coloc.yaml index 4433595ef..f01335514 100644 --- a/config/step/ot_colocalisation.yaml +++ b/config/step/ot_colocalisation_coloc.yaml @@ -4,3 +4,4 @@ defaults: credible_set_path: ${datasets.credible_set} study_index_path: ${datasets.study_index} coloc_path: ${datasets.colocalisation} +colocalisation_method: Coloc diff --git a/config/step/ot_colocalisation_ecaviar.yaml b/config/step/ot_colocalisation_ecaviar.yaml new file mode 100644 index 000000000..d57887c93 --- /dev/null +++ b/config/step/ot_colocalisation_ecaviar.yaml @@ -0,0 +1,7 @@ +defaults: + - colocalisation + +credible_set_path: ${datasets.credible_set} +study_index_path: ${datasets.study_index} +coloc_path: ${datasets.colocalisation} +colocalisation_method: ECaviar diff --git a/config/step/ot_variant_index.yaml b/config/step/ot_variant_index.yaml index 1625c7126..3834196b2 100644 --- a/config/step/ot_variant_index.yaml +++ b/config/step/ot_variant_index.yaml @@ -2,5 +2,5 @@ defaults: - variant_index variant_annotation_path: ${datasets.variant_annotation} -credible_set_path: ${datasets.study_locus} +credible_set_path: ${datasets.credible_set} variant_index_path: ${datasets.variant_index} diff --git a/docs/python_api/methods/sumstat_imputation.md b/docs/python_api/methods/sumstat_imputation.md new file mode 100644 index 000000000..6e64d35b1 --- /dev/null +++ b/docs/python_api/methods/sumstat_imputation.md @@ -0,0 +1,28 @@ +--- +title: Summary Statistics Imputation +--- + +Summary statistics imputation leverages linkage disequilibrium (LD) information to compute Z-scores of missing SNPs from neighbouring observed SNPs +by taking advantage of the Linkage Disequilibrium. 
+ +We implemented the basic model from RAISS (Robust and Accurate Imputation from Summary Statistics) package (see the original [paper](https://academic.oup.com/bioinformatics/article/35/22/4837/5512360)). + +The full repository for the RAISS package can be found [here](https://gitlab.pasteur.fr/statistical-genetics/raiss). + +The original model was suggested in 2014 by Bogdan Pasaniuc et al. [here](https://pubmed.ncbi.nlm.nih.gov/24990607/). + +It represents the following formula: + +E(z\_i|z_t) = M\_{i,t} \cdot (M\_{t,t})^{-1} \cdot z_t + +Where: + +- E(z_i|z_t) represents the expected z-score of SNP 'i' given the observed z-scores at known SNP indexes 't'. + +- M\_{i,t} represents the LD (Linkage Disequilibrium) matrix between SNP 'i' and the known SNPs at indexes 't'. + +- (M\_{t,t})^{-1} represents the inverse of the LD matrix of the known SNPs at indexes 't'. + +- z_t represents the vector of observed z-scores at the known SNP indexes 't'. + +:::gentropy.method.sumstat_imputation.SummaryStatisticsImputation diff --git a/docs/python_api/methods/sumstat_quality_controls.md b/docs/python_api/methods/sumstat_quality_controls.md new file mode 100644 index 000000000..dfc5c9d16 --- /dev/null +++ b/docs/python_api/methods/sumstat_quality_controls.md @@ -0,0 +1,18 @@ +--- +title: QC of GWAS Summary Statistics +--- + +This class consists of several general quality control checks for GWAS with full summary statistics. +There are several checks included: + +1. Genomic control lambda (median of the distribution of Chi2 statistics divided by expected for Chi2 with df=1). Lambda should be reasonably close to 1. Ideally not bigger than 2. + +2. P-Z check: the linear regression between log10 of reported p-values and log10 of p-values inferred from betas and standard errors. Intercept of the regression should be close to 0, slope close to 1. + +3. Mean beta check: mean of beta. Should be close to 0. + +4. 
The N_eff check: It estimates the ratio between effective sample size and the expected one and checks its distribution. It is possible to conduct only if the effective allele frequency is provided in the study. The median ratio is always close to 1, standard error should be close to 0. + +5. Number of SNPs and number of significant SNPs. + +:::gentropy.method.sumstat_quality_controls.SummaryStatisticsQC diff --git a/docs/src_snippets/howto/python_api/c_applying_methods.py b/docs/src_snippets/howto/python_api/c_applying_methods.py index 12eaf61ac..d0bec9edc 100644 --- a/docs/src_snippets/howto/python_api/c_applying_methods.py +++ b/docs/src_snippets/howto/python_api/c_applying_methods.py @@ -23,7 +23,7 @@ def apply_class_method_clumping(summary_stats: SummaryStatistics) -> StudyLocus: from gentropy.method.window_based_clumping import WindowBasedClumping clumped_summary_statistics = WindowBasedClumping.clump( - summary_stats, window_length=500_000 + summary_stats, distance=250_000 ) # --8<-- [end:apply_class_method_clumping] return clumped_summary_statistics diff --git a/notebooks/FineMapping_AlzheimierDisease.ipynb b/notebooks/FineMapping_AlzheimierDisease.ipynb new file mode 100644 index 000000000..8a785cc3f --- /dev/null +++ b/notebooks/FineMapping_AlzheimierDisease.ipynb @@ -0,0 +1,1734 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Fine-mapping of Alzheimer's disease GWAS summary statistics using GentroPy" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This notebook presents an example of fine-mapping of the GWAS catalog study for Alzheimer's disease ([link to study](https://genetics.opentargets.org/Study/GCST90012877/associations)). The study itself is a good benchmarking example for fine-mapping - relatively large number of SNPs, very strong signal on the 19th chromosome (APOE). 
It's worth noting that usually very strong signals are excluded from fine-mapping due to instability.\n", + "\n", + "Also, we excluded MHC region (6:28M-34M) from fine-mapping because it has a huge density of the variants.\n", + "\n", + "To execute it on your local machine (not dataproc) you need to install https://github.com/broadinstitute/install-gcs-connector." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Initialization" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Your browser has been opened to visit:\n", + "\n", + " https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=764086051850-6qr4p6gpi6hn506pt8ejuq83di341hur.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A8085%2F&scope=openid+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fuserinfo.email+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcloud-platform+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fsqlservice.login+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Faccounts.reauth&state=2Jvk4c7unAsigRvEKhceIxcrpGmeK8&access_type=offline&code_challenge=84guS6MmOY7qgvNpHLxoJbhRDBUAEUS93teMwQboD3Q&code_challenge_method=S256\n", + "\n", + "\n", + "Credentials saved to file: [/Users/yt4/.config/gcloud/application_default_credentials.json]\n", + "\n", + "These credentials will be used by any library that requests Application Default Credentials (ADC).\n", + "\n", + "Quota project \"open-targets-genetics-dev\" was added to ADC which can be used by Google client libraries for billing and quota. Note that some services may still bill the project owning the resource.\n", + "\n", + "\n", + "Updates are available for some Google Cloud CLI components. 
To install them,\n", + "please run:\n", + " $ gcloud components update\n", + "\n" + ] + } + ], + "source": [ + "!gcloud auth application-default login" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " Loading BokehJS ...\n", + "
\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/javascript": "(function(root) {\n function now() {\n return new Date();\n }\n\n const force = true;\n\n if (typeof root._bokeh_onload_callbacks === \"undefined\" || force === true) {\n root._bokeh_onload_callbacks = [];\n root._bokeh_is_loading = undefined;\n }\n\nconst JS_MIME_TYPE = 'application/javascript';\n const HTML_MIME_TYPE = 'text/html';\n const EXEC_MIME_TYPE = 'application/vnd.bokehjs_exec.v0+json';\n const CLASS_NAME = 'output_bokeh rendered_html';\n\n /**\n * Render data to the DOM node\n */\n function render(props, node) {\n const script = document.createElement(\"script\");\n node.appendChild(script);\n }\n\n /**\n * Handle when an output is cleared or removed\n */\n function handleClearOutput(event, handle) {\n function drop(id) {\n const view = Bokeh.index.get_by_id(id)\n if (view != null) {\n view.model.document.clear()\n Bokeh.index.delete(view)\n }\n }\n\n const cell = handle.cell;\n\n const id = cell.output_area._bokeh_element_id;\n const server_id = cell.output_area._bokeh_server_id;\n\n // Clean up Bokeh references\n if (id != null) {\n drop(id)\n }\n\n if (server_id !== undefined) {\n // Clean up Bokeh references\n const cmd_clean = \"from bokeh.io.state import curstate; print(curstate().uuid_to_server['\" + server_id + \"'].get_sessions()[0].document.roots[0]._id)\";\n cell.notebook.kernel.execute(cmd_clean, {\n iopub: {\n output: function(msg) {\n const id = msg.content.text.trim()\n drop(id)\n }\n }\n });\n // Destroy server and session\n const cmd_destroy = \"import bokeh.io.notebook as ion; ion.destroy_server('\" + server_id + \"')\";\n cell.notebook.kernel.execute(cmd_destroy);\n }\n }\n\n /**\n * Handle when a new output is added\n */\n function handleAddOutput(event, handle) {\n const output_area = handle.output_area;\n const output = handle.output;\n\n // limit handleAddOutput to display_data with EXEC_MIME_TYPE content only\n if 
((output.output_type != \"display_data\") || (!Object.prototype.hasOwnProperty.call(output.data, EXEC_MIME_TYPE))) {\n return\n }\n\n const toinsert = output_area.element.find(\".\" + CLASS_NAME.split(' ')[0]);\n\n if (output.metadata[EXEC_MIME_TYPE][\"id\"] !== undefined) {\n toinsert[toinsert.length - 1].firstChild.textContent = output.data[JS_MIME_TYPE];\n // store reference to embed id on output_area\n output_area._bokeh_element_id = output.metadata[EXEC_MIME_TYPE][\"id\"];\n }\n if (output.metadata[EXEC_MIME_TYPE][\"server_id\"] !== undefined) {\n const bk_div = document.createElement(\"div\");\n bk_div.innerHTML = output.data[HTML_MIME_TYPE];\n const script_attrs = bk_div.children[0].attributes;\n for (let i = 0; i < script_attrs.length; i++) {\n toinsert[toinsert.length - 1].firstChild.setAttribute(script_attrs[i].name, script_attrs[i].value);\n toinsert[toinsert.length - 1].firstChild.textContent = bk_div.children[0].textContent\n }\n // store reference to server id on output_area\n output_area._bokeh_server_id = output.metadata[EXEC_MIME_TYPE][\"server_id\"];\n }\n }\n\n function register_renderer(events, OutputArea) {\n\n function append_mime(data, metadata, element) {\n // create a DOM node to render to\n const toinsert = this.create_output_subarea(\n metadata,\n CLASS_NAME,\n EXEC_MIME_TYPE\n );\n this.keyboard_manager.register_events(toinsert);\n // Render to node\n const props = {data: data, metadata: metadata[EXEC_MIME_TYPE]};\n render(props, toinsert[toinsert.length - 1]);\n element.append(toinsert);\n return toinsert\n }\n\n /* Handle when an output is cleared or removed */\n events.on('clear_output.CodeCell', handleClearOutput);\n events.on('delete.Cell', handleClearOutput);\n\n /* Handle when a new output is added */\n events.on('output_added.OutputArea', handleAddOutput);\n\n /**\n * Register the mime type and append_mime function with output_area\n */\n OutputArea.prototype.register_mime_type(EXEC_MIME_TYPE, append_mime, {\n /* Is output safe? 
*/\n safe: true,\n /* Index of renderer in `output_area.display_order` */\n index: 0\n });\n }\n\n // register the mime type if in Jupyter Notebook environment and previously unregistered\n if (root.Jupyter !== undefined) {\n const events = require('base/js/events');\n const OutputArea = require('notebook/js/outputarea').OutputArea;\n\n if (OutputArea.prototype.mime_types().indexOf(EXEC_MIME_TYPE) == -1) {\n register_renderer(events, OutputArea);\n }\n }\n if (typeof (root._bokeh_timeout) === \"undefined\" || force === true) {\n root._bokeh_timeout = Date.now() + 5000;\n root._bokeh_failed_load = false;\n }\n\n const NB_LOAD_WARNING = {'data': {'text/html':\n \"
\\n\"+\n \"

\\n\"+\n \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n \"

\\n\"+\n \"\\n\"+\n \"\\n\"+\n \"from bokeh.resources import INLINE\\n\"+\n \"output_notebook(resources=INLINE)\\n\"+\n \"\\n\"+\n \"
\"}};\n\n function display_loaded() {\n const el = document.getElementById(\"a8e9a6a6-96f7-4efd-a426-b2299499ef03\");\n if (el != null) {\n el.textContent = \"BokehJS is loading...\";\n }\n if (root.Bokeh !== undefined) {\n if (el != null) {\n el.textContent = \"BokehJS \" + root.Bokeh.version + \" successfully loaded.\";\n }\n } else if (Date.now() < root._bokeh_timeout) {\n setTimeout(display_loaded, 100)\n }\n }\n\n function run_callbacks() {\n try {\n root._bokeh_onload_callbacks.forEach(function(callback) {\n if (callback != null)\n callback();\n });\n } finally {\n delete root._bokeh_onload_callbacks\n }\n console.debug(\"Bokeh: all callbacks have finished\");\n }\n\n function load_libs(css_urls, js_urls, callback) {\n if (css_urls == null) css_urls = [];\n if (js_urls == null) js_urls = [];\n\n root._bokeh_onload_callbacks.push(callback);\n if (root._bokeh_is_loading > 0) {\n console.debug(\"Bokeh: BokehJS is being loaded, scheduling callback at\", now());\n return null;\n }\n if (js_urls == null || js_urls.length === 0) {\n run_callbacks();\n return null;\n }\n console.debug(\"Bokeh: BokehJS not loaded, scheduling load and callback at\", now());\n root._bokeh_is_loading = css_urls.length + js_urls.length;\n\n function on_load() {\n root._bokeh_is_loading--;\n if (root._bokeh_is_loading === 0) {\n console.debug(\"Bokeh: all BokehJS libraries/stylesheets loaded\");\n run_callbacks()\n }\n }\n\n function on_error(url) {\n console.error(\"failed to load \" + url);\n }\n\n for (let i = 0; i < css_urls.length; i++) {\n const url = css_urls[i];\n const element = document.createElement(\"link\");\n element.onload = on_load;\n element.onerror = on_error.bind(null, url);\n element.rel = \"stylesheet\";\n element.type = \"text/css\";\n element.href = url;\n console.debug(\"Bokeh: injecting link tag for BokehJS stylesheet: \", url);\n document.body.appendChild(element);\n }\n\n for (let i = 0; i < js_urls.length; i++) {\n const url = js_urls[i];\n const element = 
document.createElement('script');\n element.onload = on_load;\n element.onerror = on_error.bind(null, url);\n element.async = false;\n element.src = url;\n console.debug(\"Bokeh: injecting script tag for BokehJS library: \", url);\n document.head.appendChild(element);\n }\n };\n\n function inject_raw_css(css) {\n const element = document.createElement(\"style\");\n element.appendChild(document.createTextNode(css));\n document.body.appendChild(element);\n }\n\n const js_urls = [\"https://cdn.bokeh.org/bokeh/release/bokeh-3.3.0.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-gl-3.3.0.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-widgets-3.3.0.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-tables-3.3.0.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-mathjax-3.3.0.min.js\"];\n const css_urls = [];\n\n const inline_js = [ function(Bokeh) {\n Bokeh.set_log_level(\"info\");\n },\nfunction(Bokeh) {\n }\n ];\n\n function run_inline_js() {\n if (root.Bokeh !== undefined || force === true) {\n for (let i = 0; i < inline_js.length; i++) {\n inline_js[i].call(root, root.Bokeh);\n }\nif (force === true) {\n display_loaded();\n }} else if (Date.now() < root._bokeh_timeout) {\n setTimeout(run_inline_js, 100);\n } else if (!root._bokeh_failed_load) {\n console.log(\"Bokeh: BokehJS failed to load within specified timeout.\");\n root._bokeh_failed_load = true;\n } else if (force !== true) {\n const cell = $(document.getElementById(\"a8e9a6a6-96f7-4efd-a426-b2299499ef03\")).parents('.cell').data().cell;\n cell.output_area.append_execute_result(NB_LOAD_WARNING)\n }\n }\n\n if (root._bokeh_is_loading === 0) {\n console.debug(\"Bokeh: BokehJS loaded, going straight to plotting\");\n run_inline_js();\n } else {\n load_libs(css_urls, js_urls, function() {\n console.debug(\"Bokeh: BokehJS plotting callback run at\", now());\n run_inline_js();\n });\n }\n}(window));", + "application/vnd.bokehjs_load.v0+json": "" + }, + "metadata": {}, + "output_type": 
"display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "24/04/09 10:40:45 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Setting default log level to \"WARN\".\n", + "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n", + "pip-installed Hail requires additional configuration options in Spark referring\n", + " to the path to the Hail Python module directory HAIL_DIR,\n", + " e.g. /path/to/python/site-packages/hail:\n", + " spark.jars=HAIL_DIR/backend/hail-all-spark.jar\n", + " spark.driver.extraClassPath=HAIL_DIR/backend/hail-all-spark.jar\n", + " spark.executor.extraClassPath=./hail-all-spark.jarRunning on Apache Spark version 3.3.4\n", + "SparkUI available at http://192.168.0.232:4040\n", + "Welcome to\n", + " __ __ <>__\n", + " / /_/ /__ __/ /\n", + " / __ / _ `/ / /\n", + " /_/ /_/\\_,_/_/_/ version 0.2.127-bb535cd096c5\n", + "LOGGING: writing to /dev/null\n" + ] + } + ], + "source": [ + "import os\n", + "import hail as hl\n", + "import pyspark.sql.functions as f\n", + "import pandas as pd\n", + "pd.set_option('display.max_colwidth', None)\n", + "pd.set_option('display.expand_frame_repr', False)\n", + "\n", + "from gentropy.common.session import Session\n", + "from gentropy.dataset.study_index import StudyIndex\n", + "from gentropy.dataset.summary_statistics import SummaryStatistics\n", + "from gentropy.dataset.study_index import StudyIndex\n", + "from gentropy.method.window_based_clumping import WindowBasedClumping\n", + "from gentropy.susie_finemapper import SusieFineMapperStep\n", + "\n", + "hail_dir = os.path.dirname(hl.__file__)\n", + "session = Session(hail_home=hail_dir, start_hail=True, extended_spark_conf={\"spark.driver.memory\": \"12g\",\n", + " \"spark.kryoserializer.buffer.max\": 
\"500m\",\"spark.driver.maxResultSize\":\"3g\"})\n", + "hl.init(sc=session.spark.sparkContext, log=\"/dev/null\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Loading the data and clumping" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of SNPs in GWAS: 10607272\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Stage 7:> (0 + 1) / 1]\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of clumps: 33\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + } + ], + "source": [ + "path_gwas1=\"gs://gwas_catalog_data/harmonised_summary_statistics/GCST90012877.parquet\"\n", + "path_si=\"gs://gwas_catalog_data/study_index\"\n", + "\n", + "gwas1 = SummaryStatistics.from_parquet(session, path_gwas1)\n", + "study_index = StudyIndex.from_parquet(session, path_si)\n", + "\n", + "slt=WindowBasedClumping.clump(gwas1,gwas_significance=5e-8,distance=1e6)\n", + "slt_df=slt._df\n", + "\n", + "print(\"Number of SNPs in GWAS: \",gwas1._df.count())\n", + "print(\"Number of clumps: \",slt_df.count())" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Stage 15:===================================================> (9 + 1) / 10]\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+------------+----------------+----------+---------+----------------+----------+--------------+--------------+-------------------------------+----------------+--------------------+---------------+\n", + "| studyId| variantId|chromosome| position| beta|sampleSize|pValueMantissa|pValueExponent|effectAlleleFrequencyFromSource| standardError| 
studyLocusId|qualityControls|\n", + "+------------+----------------+----------+---------+----------------+----------+--------------+--------------+-------------------------------+----------------+--------------------+---------------+\n", + "|GCST90012877| 1_161185602_G_A| 1|161185602| 0.0609052805639| null| 4.302| -8| 0.23499| 0.0111181765833| 6360456299763482946| []|\n", + "|GCST90012877| 1_207577223_T_C| 1|207577223| -0.122752564739| null| 1.403| -23| 0.822818| 0.0122652043685|-6742466305250328444| []|\n", + "|GCST90012877| 10_11678309_A_G| 10| 11678309| 0.0668997305692| null| 1.085| -11| 0.380517|0.00984571382836| 3672202482976347473| []|\n", + "|GCST90012877| 10_59886075_G_T| 10| 59886075|-0.0523916765294| null| 3.802| -8| 0.480668|0.00952612570169| 760299597568413738| []|\n", + "|GCST90012877| 10_80520381_T_G| 10| 80520381| 0.0701098772587| null| 2.736| -9| 0.793475| 0.0117897597766|-6168361428432361140| []|\n", + "|GCST90012877|11_121564878_T_C| 11|121564878| -0.186386086749| null| 5.586| -14| 0.037005| 0.0247938672944|-7548659272243096830| []|\n", + "|GCST90012877| 11_47370397_G_A| 11| 47370397| 0.0634588530202| null| 6.911| -11| 0.387521| 0.0097291000298| 1916491992423016132| []|\n", + "|GCST90012877| 11_60328267_T_C| 11| 60328267|-0.0892048800109| null| 9.335| -20| 0.371215|0.00980658024905| 3318332793803757311| []|\n", + "|GCST90012877| 11_86156833_A_G| 11| 86156833| 0.103281644827| null| 5.214| -26| 0.629462|0.00979200684254| 3806751464721795080| []|\n", + "|GCST90012877| 14_52924962_A_G| 14| 52924962| 0.102404628268| null| 3.69| -10| 0.092233| 0.0163413709974|-8640267085448358001| []|\n", + "|GCST90012877| 14_92472511_G_A| 14| 92472511|-0.0762776811698| null| 7.454| -14| 0.339674| 0.0101980809801| 8895835730818824947| []|\n", + "|GCST90012877| 15_50707194_C_G| 15| 50707194|-0.0722934881552| null| 1.639| -9| 0.197469| 0.0119888249532|-4585712009512019667| []|\n", + "|GCST90012877| 15_58730416_T_C| 15| 58730416|-0.0675867539589| null| 2.674| -11| 
0.319058| 0.010142839928|-9173595866829505633| []|\n", + "|GCST90012877| 15_63277703_C_T| 15| 63277703| 0.0849598934189| null| 1.052| -8| 0.139487| 0.0148475601067|-6181511576673508209| []|\n", + "|GCST90012877| 16_31115000_C_A| 16| 31115000|-0.0620662164665| null| 4.466| -9| 0.281382| 0.0105807514538|-3612515273077152914| []|\n", + "|GCST90012877| 17_5229833_T_C| 17| 5229833|-0.0849787931131| null| 1.352| -9| 0.875068| 0.0140203927902|-7070596043624425654| []|\n", + "|GCST90012877| 17_63483402_T_C| 17| 63483402| 0.0542810764988| null| 1.215| -8| 0.529632|0.00952697587266| 7171154626284587162| []|\n", + "|GCST90012877| 19_1050875_A_G| 19| 1050875|-0.0772974277902| null| 2.415| -13| 0.674169| 0.0105546077307| 6109438569946056978| []|\n", + "|GCST90012877| 19_44892009_G_A| 19| 44892009| 0.352722374032| null| 1.995| -277| 0.605067|0.00991069396551| 6814727764900576662| []|\n", + "|GCST90012877| 19_51224706_C_A| 19| 51224706|-0.0582180344342| null| 1.295| -8| 0.325551| 0.010237506551|-8288099943480320096| []|\n", + "+------------+----------------+----------+---------+----------------+----------+--------------+--------------+-------------------------------+----------------+--------------------+---------------+\n", + "only showing top 20 rows\n", + "\n", + "None\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + } + ], + "source": [ + "print(slt_df.show())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Fine-mapping without outliers detection and imputation using 2M as window size" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-04-09 10:41:57.354 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n", + " 'freq_index_dict' -> 'freq_index_dict_1'\n", + " 'faf_index_dict' 
-> 'faf_index_dict_1'\n", + " 'rf' -> 'rf_1'\n", + " 'age_index_dict' -> 'age_index_dict_1'\n", + " 'freq_meta' -> 'freq_meta_1'\n", + " 'age_distribution' -> 'age_distribution_1'\n", + " 'popmax_index_dict' -> 'popmax_index_dict_1'\n", + "2024-04-09 10:42:15.499 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 10:42:28.284 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 10:44:41.305 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n", + " 'freq_index_dict' -> 'freq_index_dict_1'\n", + " 'faf_index_dict' -> 'faf_index_dict_1'\n", + " 'rf' -> 'rf_1'\n", + " 'age_index_dict' -> 'age_index_dict_1'\n", + " 'freq_meta' -> 'freq_meta_1'\n", + " 'age_distribution' -> 'age_distribution_1'\n", + " 'popmax_index_dict' -> 'popmax_index_dict_1'\n", + "2024-04-09 10:44:51.854 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 10:45:03.059 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 10:47:04.871 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n", + " 'freq_index_dict' -> 'freq_index_dict_1'\n", + " 'faf_index_dict' -> 'faf_index_dict_1'\n", + " 'rf' -> 'rf_1'\n", + " 'age_index_dict' -> 'age_index_dict_1'\n", + " 'freq_meta' -> 'freq_meta_1'\n", + " 'age_distribution' -> 'age_distribution_1'\n", + " 'popmax_index_dict' -> 'popmax_index_dict_1'\n", + "2024-04-09 10:47:17.310 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 10:47:29.113 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 10:50:32.790 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n", + " 'freq_index_dict' -> 'freq_index_dict_1'\n", + " 'faf_index_dict' -> 'faf_index_dict_1'\n", + " 'rf' -> 'rf_1'\n", + " 'age_index_dict' -> 'age_index_dict_1'\n", + " 'freq_meta' -> 'freq_meta_1'\n", + " 'age_distribution' -> 'age_distribution_1'\n", + " 'popmax_index_dict' -> 'popmax_index_dict_1'\n", + "2024-04-09 10:50:46.191 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 
10:50:57.958 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 10:53:22.698 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n", + " 'freq_index_dict' -> 'freq_index_dict_1'\n", + " 'faf_index_dict' -> 'faf_index_dict_1'\n", + " 'rf' -> 'rf_1'\n", + " 'age_index_dict' -> 'age_index_dict_1'\n", + " 'freq_meta' -> 'freq_meta_1'\n", + " 'age_distribution' -> 'age_distribution_1'\n", + " 'popmax_index_dict' -> 'popmax_index_dict_1'\n", + "2024-04-09 10:53:34.535 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 10:53:45.816 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 10:57:23.189 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n", + " 'freq_index_dict' -> 'freq_index_dict_1'\n", + " 'faf_index_dict' -> 'faf_index_dict_1'\n", + " 'rf' -> 'rf_1'\n", + " 'age_index_dict' -> 'age_index_dict_1'\n", + " 'freq_meta' -> 'freq_meta_1'\n", + " 'age_distribution' -> 'age_distribution_1'\n", + " 'popmax_index_dict' -> 'popmax_index_dict_1'\n", + "2024-04-09 10:57:34.563 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 10:57:43.988 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 10:59:39.834 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n", + " 'freq_index_dict' -> 'freq_index_dict_1'\n", + " 'faf_index_dict' -> 'faf_index_dict_1'\n", + " 'rf' -> 'rf_1'\n", + " 'age_index_dict' -> 'age_index_dict_1'\n", + " 'freq_meta' -> 'freq_meta_1'\n", + " 'age_distribution' -> 'age_distribution_1'\n", + " 'popmax_index_dict' -> 'popmax_index_dict_1'\n", + "2024-04-09 10:59:52.878 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 11:00:06.629 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 11:02:11.433 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n", + " 'freq_index_dict' -> 'freq_index_dict_1'\n", + " 'faf_index_dict' -> 'faf_index_dict_1'\n", + " 'rf' -> 'rf_1'\n", + " 'age_index_dict' -> 
'age_index_dict_1'\n", + " 'freq_meta' -> 'freq_meta_1'\n", + " 'age_distribution' -> 'age_distribution_1'\n", + " 'popmax_index_dict' -> 'popmax_index_dict_1'\n", + "2024-04-09 11:02:22.101 Hail: INFO: Ordering unsorted dataset with network shuffle\n", + "2024-04-09 11:02:32.320 Hail: INFO: wrote table with 175330 rows in 8 partitions to /tmp/__iruid_20813-EyC6kjgQ1hAjFSiH1Xp7sB\n", + "2024-04-09 11:02:35.350 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 11:04:16.225 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n", + " 'freq_index_dict' -> 'freq_index_dict_1'\n", + " 'faf_index_dict' -> 'faf_index_dict_1'\n", + " 'rf' -> 'rf_1'\n", + " 'age_index_dict' -> 'age_index_dict_1'\n", + " 'freq_meta' -> 'freq_meta_1'\n", + " 'age_distribution' -> 'age_distribution_1'\n", + " 'popmax_index_dict' -> 'popmax_index_dict_1'\n", + "2024-04-09 11:04:27.837 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 11:04:41.879 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 11:07:09.950 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n", + " 'freq_index_dict' -> 'freq_index_dict_1'\n", + " 'faf_index_dict' -> 'faf_index_dict_1'\n", + " 'rf' -> 'rf_1'\n", + " 'age_index_dict' -> 'age_index_dict_1'\n", + " 'freq_meta' -> 'freq_meta_1'\n", + " 'age_distribution' -> 'age_distribution_1'\n", + " 'popmax_index_dict' -> 'popmax_index_dict_1'\n", + "2024-04-09 11:07:21.139 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 11:07:33.197 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 11:09:56.240 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n", + " 'freq_index_dict' -> 'freq_index_dict_1'\n", + " 'faf_index_dict' -> 'faf_index_dict_1'\n", + " 'rf' -> 'rf_1'\n", + " 'age_index_dict' -> 'age_index_dict_1'\n", + " 'freq_meta' -> 'freq_meta_1'\n", + " 'age_distribution' -> 'age_distribution_1'\n", + " 'popmax_index_dict' -> 
'popmax_index_dict_1'\n", + "2024-04-09 11:10:08.288 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 11:10:20.802 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 11:14:07.114 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n", + " 'freq_index_dict' -> 'freq_index_dict_1'\n", + " 'faf_index_dict' -> 'faf_index_dict_1'\n", + " 'rf' -> 'rf_1'\n", + " 'age_index_dict' -> 'age_index_dict_1'\n", + " 'freq_meta' -> 'freq_meta_1'\n", + " 'age_distribution' -> 'age_distribution_1'\n", + " 'popmax_index_dict' -> 'popmax_index_dict_1'\n", + "2024-04-09 11:14:20.204 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 11:14:32.464 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 11:16:41.133 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n", + " 'freq_index_dict' -> 'freq_index_dict_1'\n", + " 'faf_index_dict' -> 'faf_index_dict_1'\n", + " 'rf' -> 'rf_1'\n", + " 'age_index_dict' -> 'age_index_dict_1'\n", + " 'freq_meta' -> 'freq_meta_1'\n", + " 'age_distribution' -> 'age_distribution_1'\n", + " 'popmax_index_dict' -> 'popmax_index_dict_1'\n", + "2024-04-09 11:16:53.191 Hail: INFO: Ordering unsorted dataset with network shuffle\n", + "2024-04-09 11:17:03.328 Hail: INFO: wrote table with 211068 rows in 9 partitions to /tmp/__iruid_35318-By6CsozcY2JvH6dhwjdBPU\n", + "2024-04-09 11:17:10.133 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 11:19:10.141 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n", + " 'freq_index_dict' -> 'freq_index_dict_1'\n", + " 'faf_index_dict' -> 'faf_index_dict_1'\n", + " 'rf' -> 'rf_1'\n", + " 'age_index_dict' -> 'age_index_dict_1'\n", + " 'freq_meta' -> 'freq_meta_1'\n", + " 'age_distribution' -> 'age_distribution_1'\n", + " 'popmax_index_dict' -> 'popmax_index_dict_1'\n", + "2024-04-09 11:19:21.964 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 11:19:34.636 Hail: INFO: Coerced sorted dataset\n", + 
"2024-04-09 11:21:47.445 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n", + " 'freq_index_dict' -> 'freq_index_dict_1'\n", + " 'faf_index_dict' -> 'faf_index_dict_1'\n", + " 'rf' -> 'rf_1'\n", + " 'age_index_dict' -> 'age_index_dict_1'\n", + " 'freq_meta' -> 'freq_meta_1'\n", + " 'age_distribution' -> 'age_distribution_1'\n", + " 'popmax_index_dict' -> 'popmax_index_dict_1'\n", + "2024-04-09 11:21:59.127 Hail: INFO: Ordering unsorted dataset with network shuffle\n", + "2024-04-09 11:22:15.902 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 11:23:18.294 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n", + " 'freq_index_dict' -> 'freq_index_dict_1'\n", + " 'faf_index_dict' -> 'faf_index_dict_1'\n", + " 'rf' -> 'rf_1'\n", + " 'age_index_dict' -> 'age_index_dict_1'\n", + " 'freq_meta' -> 'freq_meta_1'\n", + " 'age_distribution' -> 'age_distribution_1'\n", + " 'popmax_index_dict' -> 'popmax_index_dict_1'\n", + "2024-04-09 11:23:32.131 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 11:23:48.719 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 11:26:51.703 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n", + " 'freq_index_dict' -> 'freq_index_dict_1'\n", + " 'faf_index_dict' -> 'faf_index_dict_1'\n", + " 'rf' -> 'rf_1'\n", + " 'age_index_dict' -> 'age_index_dict_1'\n", + " 'freq_meta' -> 'freq_meta_1'\n", + " 'age_distribution' -> 'age_distribution_1'\n", + " 'popmax_index_dict' -> 'popmax_index_dict_1'\n", + "2024-04-09 11:27:02.820 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 11:27:18.412 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 11:29:02.997 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n", + " 'freq_index_dict' -> 'freq_index_dict_1'\n", + " 'faf_index_dict' -> 'faf_index_dict_1'\n", + " 'rf' -> 'rf_1'\n", + " 'age_index_dict' -> 'age_index_dict_1'\n", + " 
'freq_meta' -> 'freq_meta_1'\n", + " 'age_distribution' -> 'age_distribution_1'\n", + " 'popmax_index_dict' -> 'popmax_index_dict_1'\n", + "2024-04-09 11:29:19.516 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 11:29:41.224 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 11:33:19.553 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n", + " 'freq_index_dict' -> 'freq_index_dict_1'\n", + " 'faf_index_dict' -> 'faf_index_dict_1'\n", + " 'rf' -> 'rf_1'\n", + " 'age_index_dict' -> 'age_index_dict_1'\n", + " 'freq_meta' -> 'freq_meta_1'\n", + " 'age_distribution' -> 'age_distribution_1'\n", + " 'popmax_index_dict' -> 'popmax_index_dict_1'\n", + "2024-04-09 11:33:32.903 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 11:33:49.144 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 11:36:16.552 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n", + " 'freq_index_dict' -> 'freq_index_dict_1'\n", + " 'faf_index_dict' -> 'faf_index_dict_1'\n", + " 'rf' -> 'rf_1'\n", + " 'age_index_dict' -> 'age_index_dict_1'\n", + " 'freq_meta' -> 'freq_meta_1'\n", + " 'age_distribution' -> 'age_distribution_1'\n", + " 'popmax_index_dict' -> 'popmax_index_dict_1'\n", + "2024-04-09 11:36:28.952 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 11:36:46.964 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 11:40:40.837 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n", + " 'freq_index_dict' -> 'freq_index_dict_1'\n", + " 'faf_index_dict' -> 'faf_index_dict_1'\n", + " 'rf' -> 'rf_1'\n", + " 'age_index_dict' -> 'age_index_dict_1'\n", + " 'freq_meta' -> 'freq_meta_1'\n", + " 'age_distribution' -> 'age_distribution_1'\n", + " 'popmax_index_dict' -> 'popmax_index_dict_1'\n", + "2024-04-09 11:40:51.976 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 11:41:04.014 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 11:43:54.259 Hail: INFO: 
Table.join: renamed the following fields on the right to avoid name conflicts:\n", + " 'freq_index_dict' -> 'freq_index_dict_1'\n", + " 'faf_index_dict' -> 'faf_index_dict_1'\n", + " 'rf' -> 'rf_1'\n", + " 'age_index_dict' -> 'age_index_dict_1'\n", + " 'freq_meta' -> 'freq_meta_1'\n", + " 'age_distribution' -> 'age_distribution_1'\n", + " 'popmax_index_dict' -> 'popmax_index_dict_1'\n", + "2024-04-09 11:44:05.534 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 11:44:20.087 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 11:46:45.605 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n", + " 'freq_index_dict' -> 'freq_index_dict_1'\n", + " 'faf_index_dict' -> 'faf_index_dict_1'\n", + " 'rf' -> 'rf_1'\n", + " 'age_index_dict' -> 'age_index_dict_1'\n", + " 'freq_meta' -> 'freq_meta_1'\n", + " 'age_distribution' -> 'age_distribution_1'\n", + " 'popmax_index_dict' -> 'popmax_index_dict_1'\n", + "2024-04-09 11:46:59.301 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 11:47:13.181 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 11:49:50.219 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n", + " 'freq_index_dict' -> 'freq_index_dict_1'\n", + " 'faf_index_dict' -> 'faf_index_dict_1'\n", + " 'rf' -> 'rf_1'\n", + " 'age_index_dict' -> 'age_index_dict_1'\n", + " 'freq_meta' -> 'freq_meta_1'\n", + " 'age_distribution' -> 'age_distribution_1'\n", + " 'popmax_index_dict' -> 'popmax_index_dict_1'\n", + "2024-04-09 11:50:02.311 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 11:50:16.072 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 11:52:34.864 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n", + " 'freq_index_dict' -> 'freq_index_dict_1'\n", + " 'faf_index_dict' -> 'faf_index_dict_1'\n", + " 'rf' -> 'rf_1'\n", + " 'age_index_dict' -> 'age_index_dict_1'\n", + " 'freq_meta' -> 'freq_meta_1'\n", + " 'age_distribution' -> 
'age_distribution_1'\n", + " 'popmax_index_dict' -> 'popmax_index_dict_1'\n", + "2024-04-09 11:52:46.513 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 11:53:00.919 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 11:55:50.580 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n", + " 'freq_index_dict' -> 'freq_index_dict_1'\n", + " 'faf_index_dict' -> 'faf_index_dict_1'\n", + " 'rf' -> 'rf_1'\n", + " 'age_index_dict' -> 'age_index_dict_1'\n", + " 'freq_meta' -> 'freq_meta_1'\n", + " 'age_distribution' -> 'age_distribution_1'\n", + " 'popmax_index_dict' -> 'popmax_index_dict_1'\n", + "2024-04-09 11:56:02.124 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 11:56:16.907 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 11:59:15.457 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n", + " 'freq_index_dict' -> 'freq_index_dict_1'\n", + " 'faf_index_dict' -> 'faf_index_dict_1'\n", + " 'rf' -> 'rf_1'\n", + " 'age_index_dict' -> 'age_index_dict_1'\n", + " 'freq_meta' -> 'freq_meta_1'\n", + " 'age_distribution' -> 'age_distribution_1'\n", + " 'popmax_index_dict' -> 'popmax_index_dict_1'\n", + "2024-04-09 11:59:27.380 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 11:59:40.184 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 12:03:31.055 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n", + " 'freq_index_dict' -> 'freq_index_dict_1'\n", + " 'faf_index_dict' -> 'faf_index_dict_1'\n", + " 'rf' -> 'rf_1'\n", + " 'age_index_dict' -> 'age_index_dict_1'\n", + " 'freq_meta' -> 'freq_meta_1'\n", + " 'age_distribution' -> 'age_distribution_1'\n", + " 'popmax_index_dict' -> 'popmax_index_dict_1'\n", + "2024-04-09 12:03:42.554 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 12:03:53.915 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 12:06:33.123 Hail: INFO: Table.join: renamed the following fields on the right to avoid name 
conflicts:\n", + " 'freq_index_dict' -> 'freq_index_dict_1'\n", + " 'faf_index_dict' -> 'faf_index_dict_1'\n", + " 'rf' -> 'rf_1'\n", + " 'age_index_dict' -> 'age_index_dict_1'\n", + " 'freq_meta' -> 'freq_meta_1'\n", + " 'age_distribution' -> 'age_distribution_1'\n", + " 'popmax_index_dict' -> 'popmax_index_dict_1'\n", + "2024-04-09 12:06:45.813 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 12:06:58.340 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 12:09:23.153 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n", + " 'freq_index_dict' -> 'freq_index_dict_1'\n", + " 'faf_index_dict' -> 'faf_index_dict_1'\n", + " 'rf' -> 'rf_1'\n", + " 'age_index_dict' -> 'age_index_dict_1'\n", + " 'freq_meta' -> 'freq_meta_1'\n", + " 'age_distribution' -> 'age_distribution_1'\n", + " 'popmax_index_dict' -> 'popmax_index_dict_1'\n", + "2024-04-09 12:09:33.531 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 12:09:43.693 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 12:11:13.739 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n", + " 'freq_index_dict' -> 'freq_index_dict_1'\n", + " 'faf_index_dict' -> 'faf_index_dict_1'\n", + " 'rf' -> 'rf_1'\n", + " 'age_index_dict' -> 'age_index_dict_1'\n", + " 'freq_meta' -> 'freq_meta_1'\n", + " 'age_distribution' -> 'age_distribution_1'\n", + " 'popmax_index_dict' -> 'popmax_index_dict_1'\n", + "2024-04-09 12:11:26.087 Hail: INFO: Ordering unsorted dataset with network shuffle\n", + "2024-04-09 12:11:38.950 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 12:13:17.020 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n", + " 'freq_index_dict' -> 'freq_index_dict_1'\n", + " 'faf_index_dict' -> 'faf_index_dict_1'\n", + " 'rf' -> 'rf_1'\n", + " 'age_index_dict' -> 'age_index_dict_1'\n", + " 'freq_meta' -> 'freq_meta_1'\n", + " 'age_distribution' -> 'age_distribution_1'\n", + " 'popmax_index_dict' 
-> 'popmax_index_dict_1'\n", + "2024-04-09 12:13:29.727 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 12:13:45.213 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 12:20:03.844 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n", + " 'freq_index_dict' -> 'freq_index_dict_1'\n", + " 'faf_index_dict' -> 'faf_index_dict_1'\n", + " 'rf' -> 'rf_1'\n", + " 'age_index_dict' -> 'age_index_dict_1'\n", + " 'freq_meta' -> 'freq_meta_1'\n", + " 'age_distribution' -> 'age_distribution_1'\n", + " 'popmax_index_dict' -> 'popmax_index_dict_1'\n", + "2024-04-09 12:20:13.834 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 12:20:24.282 Hail: INFO: Coerced sorted dataset\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Region: 1:160185602-162185602 ; number of CSs: 1 ; log:\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/p5/4t9crp1563l792qz8xz_3x5h0000gq/T/ipykernel_46905/1319014212.py:29: FutureWarning:\n", + "\n", + "The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. 
To retain the old behavior, exclude the relevant entries before the concat operation.\n", + "\n", + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Region: 1:206577223-208577223 ; number of CSs: 1 ; log:\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Region: 10:10678309-12678309 ; number of CSs: 1 ; log:\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Region: 10:58886075-60886075 ; number of CSs: 1 ; log:\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Region: 10:79520381-81520381 ; number of CSs: 1 ; log:\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Region: 11:120564878-122564878 ; number of CSs: 2 ; log:\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Region: 11:46370397-48370397 ; number of CSs: 6 ; log:\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Region: 11:59328267-61328267 ; number of CSs: 1 ; log:\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Region: 11:85156833-87156833 ; number of CSs: 1 ; log:\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Region: 14:51924962-53924962 ; number of CSs: 1 ; log:\n" + ] + }, + { + "name": "stderr", + "output_type": 
"stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Region: 14:91472511-93472511 ; number of CSs: 2 ; log:\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Region: 15:49707194-51707194 ; number of CSs: 1 ; log:\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Region: 15:57730416-59730416 ; number of CSs: 1 ; log:\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Region: 15:62277703-64277703 ; number of CSs: 1 ; log:\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Region: 16:30115000-32115000 ; number of CSs: 1 ; log:\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Region: 17:4229833-6229833 ; number of CSs: 1 ; log:\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Region: 17:62483402-64483402 ; number of CSs: 1 ; log:\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Region: 19:50875-2050875 ; number of CSs: 2 ; log:\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Region: 19:43892009-45892009 ; number of CSs: 10 ; log:\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": 
"stream", + "text": [ + "Region: 19:50224706-52224706 ; number of CSs: 1 ; log:\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Region: 2:104749599-106749599 ; number of CSs: 1 ; log:\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Region: 2:126135234-128135234 ; number of CSs: 2 ; log:\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Region: 2:232117202-234117202 ; number of CSs: 1 ; log:\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Region: 2:64381229-66381229 ; number of CSs: 1 ; log:\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Region: 20:55423488-57423488 ; number of CSs: 1 ; log:\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Stage 1199:=====================> (3 + 5) / 8]\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Region: 21:25775872-27775872 ; number of CSs: 2 ; log:\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Region: 4:10025995-12025995 ; number of CSs: 1 ; log:\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Region: 6:39974457-41974457 ; number of CSs: 2 ; log:\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Region: 
6:46627419-48627419 ; number of CSs: 1 ; log:\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Stage 1403:===================================> (5 + 3) / 8]\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Region: 7:142410495-144410495 ; number of CSs: 2 ; log:\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Region: 7:99374211-101374211 ; number of CSs: 1 ; log:\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Stage 1505:> (0 + 8) / 8]\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Region: 8:26610986-28610986 ; number of CSs: 3 ; log:\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + } + ], + "source": [ + "df = slt_df.withColumn(\"row_index\", f.monotonically_increasing_id())\n", + "\n", + "columns = ['N_gwas', 'N_ld', 'N_overlap', 'N_outliers', 'N_imputed', 'N_final_to_fm', 'eleapsed_time']\n", + "logs = pd.DataFrame(columns=columns)\n", + "\n", + "for i in range(0,df.count()):\n", + " if i!=27:\n", + " one_row = df.filter(df.row_index == i).first()\n", + "\n", + " res=SusieFineMapperStep.susie_finemapper_one_studylocus_row_v2_dev(\n", + " GWAS=gwas1,\n", + " session=session,\n", + " study_locus_row=one_row,\n", + " study_index=study_index,\n", + " window= 2_000_000,\n", + " L=10,\n", + " susie_est_tausq=False,\n", + " run_carma=False,\n", + " run_sumstat_imputation=False,\n", + " carma_time_limit=600,\n", + " imputed_r2_threshold=0.8,\n", + " ld_score_threshold=4\n", + " )\n", + "\n", + " sl=res[\"study_locus\"]\n", + " #print(sl._df.withColumn(\"size\", f.size(sl._df[\"locus\"])).show())\n", + " print(\"Region: \",sl._df.collect()[0]['region'], \"; number of CSs: \",sl._df.count(), \"; log:\")\n", + " #print(res[\"log\"])\n", + " logs=pd.concat([logs,res[\"log\"]])" + ] + }, + { + "cell_type": 
"code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " N_gwas N_ld N_overlap N_outliers N_imputed N_final_to_fm eleapsed_time\n", + "0 7120 10431 6456 0 0 6456 56.839336\n", + "0 7128 8657 5769 0 0 5769 46.149004\n", + "0 9203 12106 7930 0 0 7930 93.531924\n", + "0 8351 10014 6995 0 0 6995 74.174323\n", + "0 9388 12551 8337 0 0 8337 120.602071\n", + "0 6560 8729 5758 0 0 5758 45.064894\n", + "0 5005 7701 3954 0 0 3954 55.229344\n", + "0 7012 8940 5815 0 0 5815 38.824251\n", + "0 8661 10303 7291 0 0 7291 68.802810\n", + "0 8081 9966 6771 0 0 6771 64.327746\n", + "0 8375 11213 7467 0 0 7467 141.808555\n", + "0 7377 9622 6369 0 0 6369 51.198955\n", + "0 8181 10864 7116 0 0 7116 49.033224\n", + "0 7976 10135 6704 0 0 6704 58.357743\n", + "0 3369 6542 2972 0 0 2972 17.138722\n", + "0 9006 12776 7969 0 0 7969 83.551872\n", + "0 4565 7018 3887 0 0 3887 37.801989\n", + "0 8278 13741 7852 0 0 7852 105.623508\n", + "0 7582 10448 6100 0 0 6100 58.572944\n", + "0 9145 12706 8242 0 0 8242 129.442009\n", + "0 8795 11311 7584 0 0 7584 86.336201\n", + "0 7852 10028 7041 0 0 7041 67.886754\n", + "0 8393 10850 7195 0 0 7195 72.375809\n", + "0 7639 10031 6520 0 0 6520 61.900982\n", + "0 8899 11509 7922 0 0 7922 86.535298\n", + "0 8908 11309 7889 0 0 7889 93.595320\n", + "0 10654 12663 8990 0 0 8990 133.390712\n", + "0 9073 10228 7398 0 0 7398 79.774280\n", + "0 8033 9785 6822 0 0 6822 63.950340\n", + "0 4570 5516 3162 0 0 3162 27.943313\n", + "0 5716 8785 4760 0 0 4760 30.024706\n", + "0 9243 10989 7869 0 0 7869 108.898056\n" + ] + } + ], + "source": [ + "pd.set_option('display.max_rows', None)\n", + "print(logs)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "6653.3125\n" + ] + } + ], + "source": [ + "summary = logs['N_overlap'].mean()\n", + "print(summary)" + ] + }, + { + "cell_type": 
"markdown", + "metadata": {}, + "source": [ + "# Fine-mapping of APOE locus" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "data": { + "text/plain": [ + "Row(studyId='GCST90012877', variantId='19_44892009_G_A', chromosome='19', position=44892009, beta=0.352722374032, sampleSize=None, pValueMantissa=1.9950000047683716, pValueExponent=-277, effectAlleleFrequencyFromSource=0.6050670146942139, standardError=0.00991069396551, studyLocusId=6814727764900576662, qualityControls=[], row_index=18)" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = slt_df.withColumn(\"row_index\", f.monotonically_increasing_id())\n", + "one_row = df.filter(df.row_index == 18).first()\n", + "one_row" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Without CARMA, without imputation" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-04-08 21:34:03.208 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n", + " 'freq_index_dict' -> 'freq_index_dict_1'\n", + " 'popmax_index_dict' -> 'popmax_index_dict_1'\n", + " 'age_distribution' -> 'age_distribution_1'\n", + " 'freq_meta' -> 'freq_meta_1'\n", + " 'rf' -> 'rf_1'\n", + " 'age_index_dict' -> 'age_index_dict_1'\n", + " 'faf_index_dict' -> 'faf_index_dict_1'\n", + "2024-04-08 21:34:19.253 Hail: INFO: Coerced sorted dataset\n", + "2024-04-08 21:34:34.941 Hail: INFO: Coerced sorted dataset\n", + "2024-04-08 21:37:16.576 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n", + " 'freq_index_dict' -> 'freq_index_dict_1'\n", + " 'popmax_index_dict' -> 
'popmax_index_dict_1'\n", + " 'age_distribution' -> 'age_distribution_1'\n", + " 'freq_meta' -> 'freq_meta_1'\n", + " 'rf' -> 'rf_1'\n", + " 'age_index_dict' -> 'age_index_dict_1'\n", + " 'faf_index_dict' -> 'faf_index_dict_1'\n", + "2024-04-08 21:37:28.867 Hail: INFO: Coerced sorted dataset\n", + "2024-04-08 21:37:44.733 Hail: INFO: Coerced sorted dataset\n", + "2024-04-08 21:52:03.198 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n", + " 'freq_index_dict' -> 'freq_index_dict_1'\n", + " 'popmax_index_dict' -> 'popmax_index_dict_1'\n", + " 'age_distribution' -> 'age_distribution_1'\n", + " 'freq_meta' -> 'freq_meta_1'\n", + " 'rf' -> 'rf_1'\n", + " 'age_index_dict' -> 'age_index_dict_1'\n", + " 'faf_index_dict' -> 'faf_index_dict_1'\n", + "2024-04-08 21:52:15.100 Hail: INFO: Coerced sorted dataset\n", + "2024-04-08 21:52:30.553 Hail: INFO: Coerced sorted dataset\n", + "2024-04-08 22:21:27.877 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n", + " 'freq_index_dict' -> 'freq_index_dict_1'\n", + " 'popmax_index_dict' -> 'popmax_index_dict_1'\n", + " 'age_distribution' -> 'age_distribution_1'\n", + " 'freq_meta' -> 'freq_meta_1'\n", + " 'rf' -> 'rf_1'\n", + " 'age_index_dict' -> 'age_index_dict_1'\n", + " 'faf_index_dict' -> 'faf_index_dict_1'\n", + "2024-04-08 22:21:40.137 Hail: INFO: Coerced sorted dataset\n", + "2024-04-08 22:21:55.249 Hail: INFO: Coerced sorted dataset\n", + "2024-04-08 22:23:21.795 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n", + " 'freq_index_dict' -> 'freq_index_dict_1'\n", + " 'popmax_index_dict' -> 'popmax_index_dict_1'\n", + " 'age_distribution' -> 'age_distribution_1'\n", + " 'freq_meta' -> 'freq_meta_1'\n", + " 'rf' -> 'rf_1'\n", + " 'age_index_dict' -> 'age_index_dict_1'\n", + " 'faf_index_dict' -> 'faf_index_dict_1'\n", + "2024-04-08 22:23:35.031 Hail: INFO: Coerced sorted dataset\n" + ] + }, + { + 
"name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+------------+--------------------+----------------+--------------------+---------------+----------+--------+-----------------+------------------+----+\n", + "| studyLocusId| studyId| region|credibleSetIndex| locus| variantId|chromosome|position|finemappingMethod|credibleSetlog10BF|size|\n", + "+--------------------+------------+--------------------+----------------+--------------------+---------------+----------+--------+-----------------+------------------+----+\n", + "|-6417720984991662128|GCST90012877|19:43892009-45892009| 1|[{19_44908684_T_C...|19_44908684_T_C| 19|44908684| SuSiE-inf| 2135.710824756712| 1|\n", + "|-1158278093713046158|GCST90012877|19:43892009-45892009| 2|[{19_44921094_A_T...|19_44921094_A_T| 19|44921094| SuSiE-inf| 955.4948390766739| 1|\n", + "| 8324745608044585165|GCST90012877|19:43892009-45892009| 3|[{19_44917947_C_T...|19_44917947_C_T| 19|44917947| SuSiE-inf| 690.0307437138443| 1|\n", + "|-1158278093713046158|GCST90012877|19:43892009-45892009| 4|[{19_44921094_A_T...|19_44921094_A_T| 19|44921094| SuSiE-inf|425.33378303492805| 1|\n", + "| 2240477384494621278|GCST90012877|19:43892009-45892009| 5|[{19_44891079_T_C...|19_44891079_T_C| 19|44891079| SuSiE-inf|395.31055398960274| 1|\n", + "| 1029535804909934921|GCST90012877|19:43892009-45892009| 6|[{19_44894695_T_C...|19_44894695_T_C| 19|44894695| SuSiE-inf| 333.9497424582455| 1|\n", + "|-6417720984991662128|GCST90012877|19:43892009-45892009| 7|[{19_44908684_T_C...|19_44908684_T_C| 19|44908684| SuSiE-inf| 261.573648706883| 1|\n", + "|-1158278093713046158|GCST90012877|19:43892009-45892009| 8|[{19_44921094_A_T...|19_44921094_A_T| 19|44921094| SuSiE-inf|186.66554412409607| 1|\n", + "|-7370952295217410456|GCST90012877|19:43892009-45892009| 9|[{19_44922505_T_G...|19_44922505_T_G| 19|44922505| SuSiE-inf| 78.06352464083552| 1|\n", + "| 3925446284512644964|GCST90012877|19:43892009-45892009| 
10|[{19_44913574_T_G...|19_44913574_T_G| 19|44913574| SuSiE-inf|55.346197523194675| 1|\n", + "+--------------------+------------+--------------------+----------------+--------------------+---------------+----------+--------+-----------------+------------------+----+\n", + "\n", + "None\n", + " N_gwas N_ld N_overlap N_outliers N_imputed N_final_to_fm eleapsed_time\n", + "0 7582 10448 6100 0 0 6100 66.112839\n" + ] + } + ], + "source": [ + "res=SusieFineMapperStep.susie_finemapper_one_studylocus_row_v2_dev(\n", + " GWAS=gwas1,\n", + " session=session,\n", + " study_locus_row=one_row,\n", + " study_index=study_index,\n", + " window= 2_000_000,\n", + " L=10,\n", + " susie_est_tausq=False,\n", + " run_carma=False,\n", + " run_sumstat_imputation=False,\n", + " carma_time_limit=1000,\n", + " imputed_r2_threshold=0.8,\n", + " ld_score_threshold=4\n", + ")\n", + "sl=res[\"study_locus\"]\n", + "print(sl._df.withColumn(\"size\", f.size(sl._df[\"locus\"])).show())\n", + "print(res[\"log\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### With CARMA, without imputation" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+------------+--------------------+----------------+--------------------+---------------+----------+--------+-----------------+------------------+----+\n", + "| studyLocusId| studyId| region|credibleSetIndex| locus| variantId|chromosome|position|finemappingMethod|credibleSetlog10BF|size|\n", + "+--------------------+------------+--------------------+----------------+--------------------+---------------+----------+--------+-----------------+------------------+----+\n", + "|-6417720984991662128|GCST90012877|19:43892009-45892009| 1|[{19_44908684_T_C...|19_44908684_T_C| 19|44908684| SuSiE-inf|1995.6574121818223| 1|\n", + 
"|-1158278093713046158|GCST90012877|19:43892009-45892009| 2|[{19_44921094_A_T...|19_44921094_A_T| 19|44921094| SuSiE-inf| 721.2637360279233| 1|\n", + "| 7760477027903907683|GCST90012877|19:43892009-45892009| 3|[{19_44911142_C_A...|19_44911142_C_A| 19|44911142| SuSiE-inf|248.39159334060017| 1|\n", + "|-1172224975892516254|GCST90012877|19:43892009-45892009| 4|[{19_44894255_A_C...|19_44894255_A_C| 19|44894255| SuSiE-inf| 96.16160678286879| 1|\n", + "| 8852802213660052283|GCST90012877|19:43892009-45892009| 5|[{19_44862190_G_A...|19_44862190_G_A| 19|44862190| SuSiE-inf| 55.80518621838019| 1|\n", + "|-1158278093713046158|GCST90012877|19:43892009-45892009| 6|[{19_44921094_A_T...|19_44921094_A_T| 19|44921094| SuSiE-inf| 53.24772075097935| 1|\n", + "|-6417720984991662128|GCST90012877|19:43892009-45892009| 7|[{19_44908684_T_C...|19_44908684_T_C| 19|44908684| SuSiE-inf| 45.65754067281976| 1|\n", + "|-6417720984991662128|GCST90012877|19:43892009-45892009| 8|[{19_44908684_T_C...|19_44908684_T_C| 19|44908684| SuSiE-inf| 39.3840804563262| 1|\n", + "| 6986973025714240626|GCST90012877|19:43892009-45892009| 9|[{19_44873060_C_G...|19_44873060_C_G| 19|44873060| SuSiE-inf| 38.54912041595975| 1|\n", + "| 3640651426400620880|GCST90012877|19:43892009-45892009| 10|[{19_44845920_G_C...|19_44845920_G_C| 19|44845920| SuSiE-inf|35.378479810047224| 2|\n", + "+--------------------+------------+--------------------+----------------+--------------------+---------------+----------+--------+-----------------+------------------+----+\n", + "\n", + "None\n", + " N_gwas N_ld N_overlap N_outliers N_imputed N_final_to_fm eleapsed_time\n", + "0 7582 10448 6100 151 0 5949 783.939477\n" + ] + } + ], + "source": [ + "res=SusieFineMapperStep.susie_finemapper_one_studylocus_row_v2_dev(\n", + " GWAS=gwas1,\n", + " session=session,\n", + " study_locus_row=one_row,\n", + " study_index=study_index,\n", + " window= 2_000_000,\n", + " L=10,\n", + " susie_est_tausq=False,\n", + " run_carma=True,\n", + " 
run_sumstat_imputation=False,\n", + " carma_time_limit=1000,\n", + " imputed_r2_threshold=0.8,\n", + " ld_score_threshold=4\n", + ")\n", + "sl=res[\"study_locus\"]\n", + "print(sl._df.withColumn(\"size\", f.size(sl._df[\"locus\"])).show())\n", + "print(res[\"log\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Without CARMA, with imputation" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-04-08 22:25:15.739 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n", + " 'age_index_dict' -> 'age_index_dict_1'\n", + " 'freq_index_dict' -> 'freq_index_dict_1'\n", + " 'faf_index_dict' -> 'faf_index_dict_1'\n", + " 'freq_meta' -> 'freq_meta_1'\n", + " 'rf' -> 'rf_1'\n", + " 'age_distribution' -> 'age_distribution_1'\n", + " 'popmax_index_dict' -> 'popmax_index_dict_1'\n", + "2024-04-08 22:25:30.625 Hail: INFO: Coerced sorted dataset\n", + "2024-04-08 22:25:46.020 Hail: INFO: Coerced sorted dataset\n", + "2024-04-08 22:32:35.094 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n", + " 'age_index_dict' -> 'age_index_dict_1'\n", + " 'freq_index_dict' -> 'freq_index_dict_1'\n", + " 'faf_index_dict' -> 'faf_index_dict_1'\n", + " 'freq_meta' -> 'freq_meta_1'\n", + " 'rf' -> 'rf_1'\n", + " 'age_distribution' -> 'age_distribution_1'\n", + " 'popmax_index_dict' -> 'popmax_index_dict_1'\n", + "2024-04-08 22:32:47.616 Hail: INFO: Coerced sorted dataset\n", + "2024-04-08 22:33:02.484 Hail: INFO: Coerced sorted dataset\n", + "2024-04-08 22:51:33.149 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n", + " 'age_index_dict' -> 'age_index_dict_1'\n", + " 'freq_index_dict' -> 'freq_index_dict_1'\n", + " 'faf_index_dict' -> 
'faf_index_dict_1'\n", + " 'freq_meta' -> 'freq_meta_1'\n", + " 'rf' -> 'rf_1'\n", + " 'age_distribution' -> 'age_distribution_1'\n", + " 'popmax_index_dict' -> 'popmax_index_dict_1'\n", + "2024-04-08 22:51:45.708 Hail: INFO: Coerced sorted dataset\n", + "2024-04-08 22:52:00.731 Hail: INFO: Coerced sorted dataset\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+------------+--------------------+----------------+--------------------+-----------------+----------+--------+-----------------+------------------+----+\n", + "| studyLocusId| studyId| region|credibleSetIndex| locus| variantId|chromosome|position|finemappingMethod|credibleSetlog10BF|size|\n", + "+--------------------+------------+--------------------+----------------+--------------------+-----------------+----------+--------+-----------------+------------------+----+\n", + "|-1350283509846281677|GCST90012877|19:43892009-45892009| 1|[{19_44909967_TGG...|19_44909967_TGG_T| 19|44909967| SuSiE-inf| 2310.665662473933| 1|\n", + "|-1158278093713046158|GCST90012877|19:43892009-45892009| 2|[{19_44921094_A_T...| 19_44921094_A_T| 19|44921094| SuSiE-inf| 903.6138342773536| 1|\n", + "| 8324745608044585165|GCST90012877|19:43892009-45892009| 3|[{19_44917947_C_T...| 19_44917947_C_T| 19|44917947| SuSiE-inf| 700.3080514793324| 1|\n", + "|-1158278093713046158|GCST90012877|19:43892009-45892009| 4|[{19_44921094_A_T...| 19_44921094_A_T| 19|44921094| SuSiE-inf|431.79459690536703| 1|\n", + "| 1029535804909934921|GCST90012877|19:43892009-45892009| 5|[{19_44894695_T_C...| 19_44894695_T_C| 19|44894695| SuSiE-inf|402.50010763388156| 1|\n", + "|-1158278093713046158|GCST90012877|19:43892009-45892009| 6|[{19_44921094_A_T...| 19_44921094_A_T| 19|44921094| SuSiE-inf|225.93101254172214| 1|\n", + "| -60207296485035224|GCST90012877|19:43892009-45892009| 7|[{19_44888997_C_T...| 19_44888997_C_T| 19|44888997| SuSiE-inf| 191.4947272198485| 1|\n", + 
"|-1350283509846281677|GCST90012877|19:43892009-45892009| 8|[{19_44909967_TGG...|19_44909967_TGG_T| 19|44909967| SuSiE-inf|105.04460057482835| 1|\n", + "|-4078755027603845519|GCST90012877|19:43892009-45892009| 9|[{19_44918393_G_A...| 19_44918393_G_A| 19|44918393| SuSiE-inf| 63.30243818120949| 1|\n", + "| 3925446284512644964|GCST90012877|19:43892009-45892009| 10|[{19_44913574_T_G...| 19_44913574_T_G| 19|44913574| SuSiE-inf|54.079307276192694| 1|\n", + "+--------------------+------------+--------------------+----------------+--------------------+-----------------+----------+--------+-----------------+------------------+----+\n", + "\n", + "None\n", + " N_gwas N_ld N_overlap N_outliers N_imputed N_final_to_fm eleapsed_time\n", + "0 7582 10448 6100 0 681 6781 334.328722\n" + ] + } + ], + "source": [ + "res=SusieFineMapperStep.susie_finemapper_one_studylocus_row_v2_dev(\n", + " GWAS=gwas1,\n", + " session=session,\n", + " study_locus_row=one_row,\n", + " study_index=study_index,\n", + " window= 2_000_000,\n", + " L=10,\n", + " susie_est_tausq=False,\n", + " run_carma=False,\n", + " run_sumstat_imputation=True,\n", + " carma_time_limit=10000,\n", + " imputed_r2_threshold=0.8,\n", + " ld_score_threshold=4\n", + ")\n", + "sl=res[\"study_locus\"]\n", + "print(sl._df.withColumn(\"size\", f.size(sl._df[\"locus\"])).show())\n", + "print(res[\"log\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### With CARMA, with imputation" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+------------+--------------------+----------------+--------------------+---------------+----------+--------+-----------------+------------------+----+\n", + "| studyLocusId| studyId| region|credibleSetIndex| locus| 
variantId|chromosome|position|finemappingMethod|credibleSetlog10BF|size|\n", + "+--------------------+------------+--------------------+----------------+--------------------+---------------+----------+--------+-----------------+------------------+----+\n", + "| 3030414938485808431|GCST90012877|19:43892009-45892009| 1|[{19_44895007_C_T...|19_44895007_C_T| 19|44895007| SuSiE-inf|2680.9099711333456| 1|\n", + "|-2201142982564351776|GCST90012877|19:43892009-45892009| 2|[{19_44900601_A_G...|19_44900601_A_G| 19|44900601| SuSiE-inf| 2103.873956796136| 1|\n", + "|-6417720984991662128|GCST90012877|19:43892009-45892009| 3|[{19_44908684_T_C...|19_44908684_T_C| 19|44908684| SuSiE-inf|1968.8126348567705| 1|\n", + "|-1158278093713046158|GCST90012877|19:43892009-45892009| 4|[{19_44921094_A_T...|19_44921094_A_T| 19|44921094| SuSiE-inf|1089.9033376410644| 1|\n", + "| 7760477027903907683|GCST90012877|19:43892009-45892009| 5|[{19_44911142_C_A...|19_44911142_C_A| 19|44911142| SuSiE-inf|188.55568384844716| 1|\n", + "|-6417720984991662128|GCST90012877|19:43892009-45892009| 6|[{19_44908684_T_C...|19_44908684_T_C| 19|44908684| SuSiE-inf| 83.57344085238768| 1|\n", + "|-1158278093713046158|GCST90012877|19:43892009-45892009| 7|[{19_44921094_A_T...|19_44921094_A_T| 19|44921094| SuSiE-inf| 82.01732099119907| 1|\n", + "| 8852802213660052283|GCST90012877|19:43892009-45892009| 8|[{19_44862190_G_A...|19_44862190_G_A| 19|44862190| SuSiE-inf| 45.92126992319222| 1|\n", + "|-1611304699666037367|GCST90012877|19:43892009-45892009| 9|[{19_44821259_C_T...|19_44821259_C_T| 19|44821259| SuSiE-inf|37.363613067645254| 1|\n", + "| 3556335645959991344|GCST90012877|19:43892009-45892009| 10|[{19_45017701_G_T...|19_45017701_G_T| 19|45017701| SuSiE-inf|30.736039473626658| 4|\n", + "+--------------------+------------+--------------------+----------------+--------------------+---------------+----------+--------+-----------------+------------------+----+\n", + "\n", + "None\n", + " N_gwas N_ld N_overlap N_outliers 
N_imputed N_final_to_fm eleapsed_time\n", + "0 7582 10448 6100 152 715 6663 1036.467428\n" + ] + } + ], + "source": [ + "res=SusieFineMapperStep.susie_finemapper_one_studylocus_row_v2_dev(\n", + " GWAS=gwas1,\n", + " session=session,\n", + " study_locus_row=one_row,\n", + " study_index=study_index,\n", + " window= 2_000_000,\n", + " L=10,\n", + " susie_est_tausq=False,\n", + " run_carma=True,\n", + " run_sumstat_imputation=True,\n", + " carma_time_limit=10000,\n", + " imputed_r2_threshold=0.8,\n", + " ld_score_threshold=4\n", + ")\n", + "sl=res[\"study_locus\"]\n", + "print(sl._df.withColumn(\"size\", f.size(sl._df[\"locus\"])).show())\n", + "print(res[\"log\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### With CARMA, with imputation, with estimation of infinitesimal effects (susie_est_tausq=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+------------+--------------------+----------------+--------------------+---------------+----------+--------+-----------------+------------------+----+\n", + "| studyLocusId| studyId| region|credibleSetIndex| locus| variantId|chromosome|position|finemappingMethod|credibleSetlog10BF|size|\n", + "+--------------------+------------+--------------------+----------------+--------------------+---------------+----------+--------+-----------------+------------------+----+\n", + "|-6417720984991662128|GCST90012877|19:43892009-45892009| 1|[{19_44908684_T_C...|19_44908684_T_C| 19|44908684| SuSiE-inf| 1105.297844890198| 1|\n", + "|-1158278093713046158|GCST90012877|19:43892009-45892009| 2|[{19_44921094_A_T...|19_44921094_A_T| 19|44921094| SuSiE-inf|1042.0949995382389| 1|\n", + "|-2201142982564351776|GCST90012877|19:43892009-45892009| 3|[{19_44900601_A_G...|19_44900601_A_G| 19|44900601|
SuSiE-inf| 760.0654878716481| 1|\n", + "| 3030414938485808431|GCST90012877|19:43892009-45892009| 4|[{19_44895007_C_T...|19_44895007_C_T| 19|44895007| SuSiE-inf| 388.8928142354868| 1|\n", + "| -251577639520141451|GCST90012877|19:43892009-45892009| 5|[{19_44899220_C_T...|19_44899220_C_T| 19|44899220| SuSiE-inf| 259.5645544847559| 1|\n", + "| 7760477027903907683|GCST90012877|19:43892009-45892009| 6|[{19_44911142_C_A...|19_44911142_C_A| 19|44911142| SuSiE-inf|231.66277856324325| 1|\n", + "| 4133344777320628094|GCST90012877|19:43892009-45892009| 7|[{19_44904531_G_A...|19_44904531_G_A| 19|44904531| SuSiE-inf|143.22657752219786| 1|\n", + "|-1764089385585984368|GCST90012877|19:43892009-45892009| 8|[{19_44893642_T_C...|19_44893642_T_C| 19|44893642| SuSiE-inf| 87.72507299242906| 1|\n", + "|-1158278093713046158|GCST90012877|19:43892009-45892009| 9|[{19_44921094_A_T...|19_44921094_A_T| 19|44921094| SuSiE-inf| 71.4171763690986| 1|\n", + "|-6417720984991662128|GCST90012877|19:43892009-45892009| 10|[{19_44908684_T_C...|19_44908684_T_C| 19|44908684| SuSiE-inf| 43.36071977593145| 1|\n", + "+--------------------+------------+--------------------+----------------+--------------------+---------------+----------+--------+-----------------+------------------+----+\n", + "\n", + "None\n", + " N_gwas N_ld N_overlap N_outliers N_imputed N_final_to_fm eleapsed_time\n", + "0 7582 10448 6100 151 720 6669 1229.515921\n" + ] + } + ], + "source": [ + "res=SusieFineMapperStep.susie_finemapper_one_studylocus_row_v2_dev(\n", + " GWAS=gwas1,\n", + " session=session,\n", + " study_locus_row=one_row,\n", + " study_index=study_index,\n", + " window= 2_000_000,\n", + " L=10,\n", + " susie_est_tausq=True,\n", + " run_carma=True,\n", + " run_sumstat_imputation=True,\n", + " carma_time_limit=10000,\n", + " imputed_r2_threshold=0.8,\n", + " ld_score_threshold=4\n", + ")\n", + "sl=res[\"study_locus\"]\n", + "print(sl._df.withColumn(\"size\", f.size(sl._df[\"locus\"])).show())\n", + "print(res[\"log\"])" + ] 
+ }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Fine-mapping of MHC region using 1Mb window" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "data": { + "text/plain": [ + "Row(studyId='GCST90012877', variantId='6_32592248_A_G', chromosome='6', position=32592248, beta=-0.103604380043, sampleSize=None, pValueMantissa=2.877000093460083, pValueExponent=-15, effectAlleleFrequencyFromSource=0.21086899936199188, standardError=0.0131209374957, studyLocusId=5718491981995302674, qualityControls=[], row_index=27)" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = slt_df.withColumn(\"row_index\", f.monotonically_increasing_id())\n", + "one_row = df.filter(df.row_index == 27).first()\n", + "one_row" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Stage 1541:==========================================> (6 + 2) / 8]\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+------------+-------------------+----------------+--------------------+---------------+----------+--------+-----------------+------------------+----+\n", + "| studyLocusId| studyId| region|credibleSetIndex| locus| variantId|chromosome|position|finemappingMethod|credibleSetlog10BF|size|\n", + "+--------------------+------------+-------------------+----------------+--------------------+---------------+----------+--------+-----------------+------------------+----+\n", + "|-3446214959021623473|GCST90012877|6:32092248-33092248| 1|[{6_32557997_G_A,...| 6_32557997_G_A| 6|32557997| SuSiE-inf| 4323.908142062261| 1|\n", + "| -439738150050389281|GCST90012877|6:32092248-33092248| 2|[{6_32558002_G_T,...| 6_32558002_G_T| 6|32558002| 
SuSiE-inf|3428.8321277074765| 1|\n", + "| 5831857384024844796|GCST90012877|6:32092248-33092248| 3|[{6_32557987_C_A,...| 6_32557987_C_A| 6|32557987| SuSiE-inf|1699.8680349563335| 1|\n", + "|-1087057043201011402|GCST90012877|6:32092248-33092248| 4|[{6_32557977_T_C,...| 6_32557977_T_C| 6|32557977| SuSiE-inf| 965.9753305300063| 1|\n", + "| 6919234179916081233|GCST90012877|6:32092248-33092248| 5|[{6_32649735_C_T,...| 6_32649735_C_T| 6|32649735| SuSiE-inf| 369.9698233117616| 1|\n", + "| 7781006900918060896|GCST90012877|6:32092248-33092248| 6|[{6_32652962_C_T,...| 6_32652962_C_T| 6|32652962| SuSiE-inf| 328.6834447478274| 1|\n", + "|-7512794333418509403|GCST90012877|6:32092248-33092248| 7|[{6_32591896_T_G,...| 6_32591896_T_G| 6|32591896| SuSiE-inf|326.52393082050276| 1|\n", + "| 4056478719932360430|GCST90012877|6:32092248-33092248| 8|[{6_32621456_GC_G...|6_32621456_GC_G| 6|32621456| SuSiE-inf|263.48518383939836| 1|\n", + "| 8380896542014789747|GCST90012877|6:32092248-33092248| 9|[{6_32648039_G_A,...| 6_32648039_G_A| 6|32648039| SuSiE-inf|176.62947310155317| 1|\n", + "| 9053545161380162736|GCST90012877|6:32092248-33092248| 10|[{6_32700030_C_T,...| 6_32700030_C_T| 6|32700030| SuSiE-inf| 69.83226092797517| 2|\n", + "+--------------------+------------+-------------------+----------------+--------------------+---------------+----------+--------+-----------------+------------------+----+\n", + "\n", + "None\n", + " N_gwas N_ld N_overlap N_outliers N_imputed N_final_to_fm eleapsed_time\n", + "0 19311 22318 13188 0 0 13188 298.784288\n" + ] + } + ], + "source": [ + "res=SusieFineMapperStep.susie_finemapper_one_studylocus_row_v2_dev(\n", + " GWAS=gwas1,\n", + " session=session,\n", + " study_locus_row=one_row,\n", + " study_index=study_index,\n", + " window= 1_000_000,\n", + " L=10,\n", + " susie_est_tausq=False,\n", + " run_carma=False,\n", + " run_sumstat_imputation=False,\n", + " carma_time_limit=10000,\n", + " imputed_r2_threshold=0.8,\n", + " ld_score_threshold=4\n", + ")\n", 
+ "sl=res[\"study_locus\"]\n", + "print(sl._df.withColumn(\"size\", f.size(sl._df[\"locus\"])).show())\n", + "print(res[\"log\"])" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "gentropy-krNFZEZg-py3.10", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.8" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/Mapping_EFO_finngen.ipynb b/notebooks/Mapping_EFO_finngen.ipynb new file mode 100644 index 000000000..9bd82d8d4 --- /dev/null +++ b/notebooks/Mapping_EFO_finngen.ipynb @@ -0,0 +1,768 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Mapping EFOs for the FinnGen study index using the old study index from the previous prod" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This notebook adds EFOs from the previous prod version of study_index to the new FinnGen study_index using trait name as a matching key.\n", + "\n", + "The resulting study index has 1542 rows with non-null EFOs (out of 2408 rows).\n", + "\n", + "The new study index is saved here:\n", + "\"gs://genetics-portal-dev-analysis/yt4/study_index_finngen_with_efo\"" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Your browser has been opened to visit:\n", + "\n", + "
https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=764086051850-6qr4p6gpi6hn506pt8ejuq83di341hur.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A8085%2F&scope=openid+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fuserinfo.email+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcloud-platform+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fsqlservice.login+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Faccounts.reauth&state=XHb8Uk43SsVjvFRqwgrX4Tgg2tTOHS&access_type=offline&code_challenge=OkiqDAkHXDGEgJQbX8r0ZYKfZ7gcgfXS8mfZc5a913Y&code_challenge_method=S256\n", + "\n", + "\n", + "Credentials saved to file: [/Users/yt4/.config/gcloud/application_default_credentials.json]\n", + "\n", + "These credentials will be used by any library that requests Application Default Credentials (ADC).\n", + "\n", + "Quota project \"open-targets-genetics-dev\" was added to ADC which can be used by Google client libraries for billing and quota. Note that some services may still bill the project owning the resource.\n" + ] + } + ], + "source": [ + "!gcloud auth application-default login" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " Loading BokehJS ...\n", + "
\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/javascript": "(function(root) {\n function now() {\n return new Date();\n }\n\n const force = true;\n\n if (typeof root._bokeh_onload_callbacks === \"undefined\" || force === true) {\n root._bokeh_onload_callbacks = [];\n root._bokeh_is_loading = undefined;\n }\n\nconst JS_MIME_TYPE = 'application/javascript';\n const HTML_MIME_TYPE = 'text/html';\n const EXEC_MIME_TYPE = 'application/vnd.bokehjs_exec.v0+json';\n const CLASS_NAME = 'output_bokeh rendered_html';\n\n /**\n * Render data to the DOM node\n */\n function render(props, node) {\n const script = document.createElement(\"script\");\n node.appendChild(script);\n }\n\n /**\n * Handle when an output is cleared or removed\n */\n function handleClearOutput(event, handle) {\n function drop(id) {\n const view = Bokeh.index.get_by_id(id)\n if (view != null) {\n view.model.document.clear()\n Bokeh.index.delete(view)\n }\n }\n\n const cell = handle.cell;\n\n const id = cell.output_area._bokeh_element_id;\n const server_id = cell.output_area._bokeh_server_id;\n\n // Clean up Bokeh references\n if (id != null) {\n drop(id)\n }\n\n if (server_id !== undefined) {\n // Clean up Bokeh references\n const cmd_clean = \"from bokeh.io.state import curstate; print(curstate().uuid_to_server['\" + server_id + \"'].get_sessions()[0].document.roots[0]._id)\";\n cell.notebook.kernel.execute(cmd_clean, {\n iopub: {\n output: function(msg) {\n const id = msg.content.text.trim()\n drop(id)\n }\n }\n });\n // Destroy server and session\n const cmd_destroy = \"import bokeh.io.notebook as ion; ion.destroy_server('\" + server_id + \"')\";\n cell.notebook.kernel.execute(cmd_destroy);\n }\n }\n\n /**\n * Handle when a new output is added\n */\n function handleAddOutput(event, handle) {\n const output_area = handle.output_area;\n const output = handle.output;\n\n // limit handleAddOutput to display_data with EXEC_MIME_TYPE content only\n if 
((output.output_type != \"display_data\") || (!Object.prototype.hasOwnProperty.call(output.data, EXEC_MIME_TYPE))) {\n return\n }\n\n const toinsert = output_area.element.find(\".\" + CLASS_NAME.split(' ')[0]);\n\n if (output.metadata[EXEC_MIME_TYPE][\"id\"] !== undefined) {\n toinsert[toinsert.length - 1].firstChild.textContent = output.data[JS_MIME_TYPE];\n // store reference to embed id on output_area\n output_area._bokeh_element_id = output.metadata[EXEC_MIME_TYPE][\"id\"];\n }\n if (output.metadata[EXEC_MIME_TYPE][\"server_id\"] !== undefined) {\n const bk_div = document.createElement(\"div\");\n bk_div.innerHTML = output.data[HTML_MIME_TYPE];\n const script_attrs = bk_div.children[0].attributes;\n for (let i = 0; i < script_attrs.length; i++) {\n toinsert[toinsert.length - 1].firstChild.setAttribute(script_attrs[i].name, script_attrs[i].value);\n toinsert[toinsert.length - 1].firstChild.textContent = bk_div.children[0].textContent\n }\n // store reference to server id on output_area\n output_area._bokeh_server_id = output.metadata[EXEC_MIME_TYPE][\"server_id\"];\n }\n }\n\n function register_renderer(events, OutputArea) {\n\n function append_mime(data, metadata, element) {\n // create a DOM node to render to\n const toinsert = this.create_output_subarea(\n metadata,\n CLASS_NAME,\n EXEC_MIME_TYPE\n );\n this.keyboard_manager.register_events(toinsert);\n // Render to node\n const props = {data: data, metadata: metadata[EXEC_MIME_TYPE]};\n render(props, toinsert[toinsert.length - 1]);\n element.append(toinsert);\n return toinsert\n }\n\n /* Handle when an output is cleared or removed */\n events.on('clear_output.CodeCell', handleClearOutput);\n events.on('delete.Cell', handleClearOutput);\n\n /* Handle when a new output is added */\n events.on('output_added.OutputArea', handleAddOutput);\n\n /**\n * Register the mime type and append_mime function with output_area\n */\n OutputArea.prototype.register_mime_type(EXEC_MIME_TYPE, append_mime, {\n /* Is output safe? 
*/\n safe: true,\n /* Index of renderer in `output_area.display_order` */\n index: 0\n });\n }\n\n // register the mime type if in Jupyter Notebook environment and previously unregistered\n if (root.Jupyter !== undefined) {\n const events = require('base/js/events');\n const OutputArea = require('notebook/js/outputarea').OutputArea;\n\n if (OutputArea.prototype.mime_types().indexOf(EXEC_MIME_TYPE) == -1) {\n register_renderer(events, OutputArea);\n }\n }\n if (typeof (root._bokeh_timeout) === \"undefined\" || force === true) {\n root._bokeh_timeout = Date.now() + 5000;\n root._bokeh_failed_load = false;\n }\n\n const NB_LOAD_WARNING = {'data': {'text/html':\n \"
\\n\"+\n \"

\\n\"+\n \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n \"

\\n\"+\n \"\\n\"+\n \"\\n\"+\n \"from bokeh.resources import INLINE\\n\"+\n \"output_notebook(resources=INLINE)\\n\"+\n \"\\n\"+\n \"
\"}};\n\n function display_loaded() {\n const el = document.getElementById(\"c92e22c1-acc6-4a9b-8a5a-529fec6e60ae\");\n if (el != null) {\n el.textContent = \"BokehJS is loading...\";\n }\n if (root.Bokeh !== undefined) {\n if (el != null) {\n el.textContent = \"BokehJS \" + root.Bokeh.version + \" successfully loaded.\";\n }\n } else if (Date.now() < root._bokeh_timeout) {\n setTimeout(display_loaded, 100)\n }\n }\n\n function run_callbacks() {\n try {\n root._bokeh_onload_callbacks.forEach(function(callback) {\n if (callback != null)\n callback();\n });\n } finally {\n delete root._bokeh_onload_callbacks\n }\n console.debug(\"Bokeh: all callbacks have finished\");\n }\n\n function load_libs(css_urls, js_urls, callback) {\n if (css_urls == null) css_urls = [];\n if (js_urls == null) js_urls = [];\n\n root._bokeh_onload_callbacks.push(callback);\n if (root._bokeh_is_loading > 0) {\n console.debug(\"Bokeh: BokehJS is being loaded, scheduling callback at\", now());\n return null;\n }\n if (js_urls == null || js_urls.length === 0) {\n run_callbacks();\n return null;\n }\n console.debug(\"Bokeh: BokehJS not loaded, scheduling load and callback at\", now());\n root._bokeh_is_loading = css_urls.length + js_urls.length;\n\n function on_load() {\n root._bokeh_is_loading--;\n if (root._bokeh_is_loading === 0) {\n console.debug(\"Bokeh: all BokehJS libraries/stylesheets loaded\");\n run_callbacks()\n }\n }\n\n function on_error(url) {\n console.error(\"failed to load \" + url);\n }\n\n for (let i = 0; i < css_urls.length; i++) {\n const url = css_urls[i];\n const element = document.createElement(\"link\");\n element.onload = on_load;\n element.onerror = on_error.bind(null, url);\n element.rel = \"stylesheet\";\n element.type = \"text/css\";\n element.href = url;\n console.debug(\"Bokeh: injecting link tag for BokehJS stylesheet: \", url);\n document.body.appendChild(element);\n }\n\n for (let i = 0; i < js_urls.length; i++) {\n const url = js_urls[i];\n const element = 
document.createElement('script');\n element.onload = on_load;\n element.onerror = on_error.bind(null, url);\n element.async = false;\n element.src = url;\n console.debug(\"Bokeh: injecting script tag for BokehJS library: \", url);\n document.head.appendChild(element);\n }\n };\n\n function inject_raw_css(css) {\n const element = document.createElement(\"style\");\n element.appendChild(document.createTextNode(css));\n document.body.appendChild(element);\n }\n\n const js_urls = [\"https://cdn.bokeh.org/bokeh/release/bokeh-3.3.0.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-gl-3.3.0.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-widgets-3.3.0.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-tables-3.3.0.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-mathjax-3.3.0.min.js\"];\n const css_urls = [];\n\n const inline_js = [ function(Bokeh) {\n Bokeh.set_log_level(\"info\");\n },\nfunction(Bokeh) {\n }\n ];\n\n function run_inline_js() {\n if (root.Bokeh !== undefined || force === true) {\n for (let i = 0; i < inline_js.length; i++) {\n inline_js[i].call(root, root.Bokeh);\n }\nif (force === true) {\n display_loaded();\n }} else if (Date.now() < root._bokeh_timeout) {\n setTimeout(run_inline_js, 100);\n } else if (!root._bokeh_failed_load) {\n console.log(\"Bokeh: BokehJS failed to load within specified timeout.\");\n root._bokeh_failed_load = true;\n } else if (force !== true) {\n const cell = $(document.getElementById(\"c92e22c1-acc6-4a9b-8a5a-529fec6e60ae\")).parents('.cell').data().cell;\n cell.output_area.append_execute_result(NB_LOAD_WARNING)\n }\n }\n\n if (root._bokeh_is_loading === 0) {\n console.debug(\"Bokeh: BokehJS loaded, going straight to plotting\");\n run_inline_js();\n } else {\n load_libs(css_urls, js_urls, function() {\n console.debug(\"Bokeh: BokehJS plotting callback run at\", now());\n run_inline_js();\n });\n }\n}(window));", + "application/vnd.bokehjs_load.v0+json": "" + }, + "metadata": {}, + "output_type": 
"display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "24/04/14 16:03:28 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Setting default log level to \"WARN\".\n", + "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n", + "pip-installed Hail requires additional configuration options in Spark referring\n", + " to the path to the Hail Python module directory HAIL_DIR,\n", + " e.g. /path/to/python/site-packages/hail:\n", + " spark.jars=HAIL_DIR/backend/hail-all-spark.jar\n", + " spark.driver.extraClassPath=HAIL_DIR/backend/hail-all-spark.jar\n", + " spark.executor.extraClassPath=./hail-all-spark.jarRunning on Apache Spark version 3.3.4\n", + "SparkUI available at http://192.168.0.232:4040\n", + "Welcome to\n", + " __ __ <>__\n", + " / /_/ /__ __/ /\n", + " / __ / _ `/ / /\n", + " /_/ /_/\\_,_/_/_/ version 0.2.127-bb535cd096c5\n", + "LOGGING: writing to /dev/null\n" + ] + } + ], + "source": [ + "import os\n", + "import hail as hl\n", + "import pyspark.sql.functions as f\n", + "import pandas as pd\n", + "pd.set_option('display.max_colwidth', None)\n", + "pd.set_option('display.expand_frame_repr', False)\n", + "\n", + "from gentropy.common.session import Session\n", + "from gentropy.dataset.study_index import StudyIndex\n", + "\n", + "\n", + "hail_dir = os.path.dirname(hl.__file__)\n", + "session = Session(hail_home=hail_dir, start_hail=True, extended_spark_conf={\"spark.driver.memory\": \"12g\",\n", + " \"spark.kryoserializer.buffer.max\": \"500m\",\"spark.driver.maxResultSize\":\"3g\"})\n", + "hl.init(sc=session.spark.sparkContext, log=\"/dev/null\")" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + 
"path_si=\"gs://genetics_etl_python_playground/releases/24.03/study_index/finngen/study_index\"\n", + "path_si_old=\"gs://genetics-portal-dev-analysis/yt4/study_index.csv\"" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + } + ], + "source": [ + "si_old=session.spark.read.csv(path_si_old, header=True,sep=\"\\t\")\n", + "si_new=StudyIndex.from_parquet(session=session, path=path_si)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+-------------------+--------------------+-------+---------+-------------+----+----------+----------+-----------+---------+------------+--------------+-------+--------------------+-----------------+--------------------+\n", + "| study_id| ancestry_initial|ancestry_replication|n_cases|n_initial|n_replication|pmid|pub_author| pub_date|pub_journal|pub_title|has_sumstats|num_assoc_loci| source| trait_reported| trait_efos| trait_category|\n", + "+--------------------+-------------------+--------------------+-------+---------+-------------+----+----------+----------+-----------+---------+------------+--------------+-------+--------------------+-----------------+--------------------+\n", + "|FINNGEN_R6_M13_MU...|['European=253458']| []| 108.0| 253458| 0.0|null|FINNGEN_R6|2022-01-24| null| null| True| 0|FINNGEN|Multifocal fibros...|['MONDO_0009230']|immune system dis...|\n", + "|FINNGEN_R6_M13_MU...|['European=199528']| []| 1804.0| 199528| 0.0|null|FINNGEN_R6|2022-01-24| null| null| True| 0|FINNGEN|Disorders of muscles| ['EFO_0002970']|musculoskeletal o...|\n", + "|FINNGEN_R6_M13_MU...|['European=197821']| []| 97.0| 197821| 0.0|null|FINNGEN_R6|2022-01-24| null| null| True| 0|FINNGEN|\"\"\"Muscle wasting...| ['EFO_0009851']| biological process|\n", + "|FINNGEN_R6_M13_MU...|['European=198253']| []| 
529.0| 198253| 0.0|null|FINNGEN_R6|2022-01-24| null| null| True| 0|FINNGEN|Other specified d...| ['EFO_0002970']|musculoskeletal o...|\n", + "|FINNGEN_R6_M13_MU...|['European=198179']| []| 455.0| 198179| 0.0|null|FINNGEN_R6|2022-01-24| null| null| True| 1|FINNGEN| Muscle strain| ['EFO_0010686']|injury, poisoning...|\n", + "+--------------------+-------------------+--------------------+-------+---------+-------------+----+----------+----------+-----------+---------+------------+--------------+-------+--------------------+-----------------+--------------------+\n", + "only showing top 5 rows\n", + "\n" + ] + } + ], + "source": [ + "si_old.show(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+-----------+---------+--------------------+------------------------+------+------------------+--------+----------------+----------------------+---------------+------------------+----------------------------------+--------------------+------+---------+--------+---------+---------------------+-------------------+------------------+---------------+-------------+--------------------+-----------+\n", + "| studyId| projectId|studyType| traitFromSource|traitFromSourceMappedIds|geneId|tissueFromSourceId|pubmedId|publicationTitle|publicationFirstAuthor|publicationDate|publicationJournal|backgroundTraitFromSourceMappedIds| initialSampleSize|nCases|nControls|nSamples| cohorts|ldPopulationStructure| discoverySamples|replicationSamples|qualityControls|analysisFlags|summarystatsLocation|hasSumstats|\n", + 
"+--------------------+-----------+---------+--------------------+------------------------+------+------------------+--------+----------------+----------------------+---------------+------------------+----------------------------------+--------------------+------+---------+--------+---------+---------------------+-------------------+------------------+---------------+-------------+--------------------+-----------+\n", + "|FINNGEN_R10_AB1_A...|FINNGEN_R10| gwas| Actinomycosis| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 101| 363227| 363328|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_A...|FINNGEN_R10| gwas| Amoebiasis| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 160| 367214| 367374|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_A...|FINNGEN_R10| gwas|Anogenital herpes...| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 1986| 400197| 402183|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_A...|FINNGEN_R10| gwas| Aspergillosis| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 211| 403213| 403424|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_A...|FINNGEN_R10| gwas|Atypical virus in...| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 282| 409849| 410131|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + 
"+--------------------+-----------+---------+--------------------+------------------------+------+------------------+--------+----------------+----------------------+---------------+------------------+----------------------------------+--------------------+------+---------+--------+---------+---------------------+-------------------+------------------+---------------+-------------+--------------------+-----------+\n", + "only showing top 5 rows\n", + "\n" + ] + } + ], + "source": [ + "si_new_df=si_new.df\n", + "si_new_df.show(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "57246\n", + "2408\n" + ] + } + ], + "source": [ + "print(si_old.count())\n", + "print(si_new_df.count())" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+-----------------+\n", + "| trait_reported| trait_efos|\n", + "+--------------------+-----------------+\n", + "|Multifocal fibros...|['MONDO_0009230']|\n", + "|Disorders of muscles| ['EFO_0002970']|\n", + "|\"\"\"Muscle wasting...| ['EFO_0009851']|\n", + "|Other specified d...| ['EFO_0002970']|\n", + "| Muscle strain| ['EFO_0010686']|\n", + "+--------------------+-----------------+\n", + "only showing top 5 rows\n", + "\n" + ] + } + ], + "source": [ + "si_old=si_old.select(\"trait_reported\",\"trait_efos\")\n", + "si_old.show(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "from pyspark.sql.functions import lower\n", + "\n", + "si_old = si_old.withColumn(\"trait_reported_low\", lower(si_old[\"trait_reported\"])).select(\"trait_reported_low\",\"trait_efos\")\n", + "si_new_df= si_new_df.withColumn(\"trait_reported_low\", lower(si_new_df[\"traitFromSource\"]))" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "data": { + "text/plain": [ + "2408" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "si_old = si_old.dropDuplicates(['trait_reported_low'])\n", + "joined_df = si_new_df.join(si_old, \"trait_reported_low\", how='left')\n", + "joined_df.count()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+--------------------+-----------+---------+--------------------+------------------------+------+------------------+--------+----------------+----------------------+---------------+------------------+----------------------------------+--------------------+------+---------+--------+---------+---------------------+-------------------+------------------+---------------+-------------+--------------------+-----------+--------------------+\n", + "| trait_reported_low| studyId| projectId|studyType| traitFromSource|traitFromSourceMappedIds|geneId|tissueFromSourceId|pubmedId|publicationTitle|publicationFirstAuthor|publicationDate|publicationJournal|backgroundTraitFromSourceMappedIds| initialSampleSize|nCases|nControls|nSamples| cohorts|ldPopulationStructure| discoverySamples|replicationSamples|qualityControls|analysisFlags|summarystatsLocation|hasSumstats| trait_efos|\n", + 
"+--------------------+--------------------+-----------+---------+--------------------+------------------------+------+------------------+--------+----------------+----------------------+---------------+------------------+----------------------------------+--------------------+------+---------+--------+---------+---------------------+-------------------+------------------+---------------+-------------+--------------------+-----------+--------------------+\n", + "| actinomycosis|FINNGEN_R10_AB1_A...|FINNGEN_R10| gwas| Actinomycosis| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 101| 363227| 363328|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true| null|\n", + "| amoebiasis|FINNGEN_R10_AB1_A...|FINNGEN_R10| gwas| Amoebiasis| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 160| 367214| 367374|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true| ['EFO_0007144']|\n", + "|anogenital herpes...|FINNGEN_R10_AB1_A...|FINNGEN_R10| gwas|Anogenital herpes...| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 1986| 400197| 402183|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true| ['EFO_0007282']|\n", + "| aspergillosis|FINNGEN_R10_AB1_A...|FINNGEN_R10| gwas| Aspergillosis| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 211| 403213| 403424|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true| ['EFO_0007157']|\n", + "|atypical virus in...|FINNGEN_R10_AB1_A...|FINNGEN_R10| gwas|Atypical virus in...| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 282| 409849| 410131|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true| ['MONDO_0024318']|\n", + "|bacterial infecti...|FINNGEN_R10_AB1_B...|FINNGEN_R10| gwas|Bacterial infecti...| null| null| null| null| null| null| 
null| null| null|377,277 (210,870 ...| 20226| 363227| 383453|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true| null|\n", + "|bacterial, viral ...|FINNGEN_R10_AB1_B...|FINNGEN_R10| gwas|Bacterial, viral ...| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 2852| 409329| 412181|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true| null|\n", + "|other bacterial i...|FINNGEN_R10_AB1_B...|FINNGEN_R10| gwas|Other bacterial i...| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 6145| 367214| 373359|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true| ['EFO_0000771']|\n", + "| candidiasis|FINNGEN_R10_AB1_C...|FINNGEN_R10| gwas| Candidiasis| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 4306| 403213| 407519|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true| ['MONDO_0002026']|\n", + "|other sexually tr...|FINNGEN_R10_AB1_C...|FINNGEN_R10| gwas|Other sexually tr...| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 2186| 400197| 402383|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|['MONDO_0021681',...|\n", + "| cholera|FINNGEN_R10_AB1_C...|FINNGEN_R10| gwas| Cholera| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 1385| 367214| 368599|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true| ['EFO_1001235']|\n", + "|dengue fever [cla...|FINNGEN_R10_AB1_D...|FINNGEN_R10| gwas|Dengue fever [cla...| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 53| 409137| 409190|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true| null|\n", + "| dermatophytosis|FINNGEN_R10_AB1_D...|FINNGEN_R10| gwas| Dermatophytosis| null| null| null| null| null| null| null| null| null|377,277 
(210,870 ...| 3921| 403213| 407134|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true| ['MONDO_0004678']|\n", + "| early syphilis|FINNGEN_R10_AB1_E...|FINNGEN_R10| gwas| Early syphilis| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 308| 400197| 400505|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true| ['EFO_0007504']|\n", + "|infectious mononu...| FINNGEN_R10_AB1_EBV|FINNGEN_R10| gwas|Infectious mononu...| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 2979| 400974| 403953|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true| ['EFO_0007326']|\n", + "| enterobiasis|FINNGEN_R10_AB1_E...|FINNGEN_R10| gwas| Enterobiasis| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 112| 411658| 411770|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true| null|\n", + "| erysipelas|FINNGEN_R10_AB1_E...|FINNGEN_R10| gwas| Erysipelas| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 22261| 363227| 385488|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true| ['EFO_1001462']|\n", + "|diarrhoea and gas...|FINNGEN_R10_AB1_G...|FINNGEN_R10| gwas|Diarrhoea and gas...| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 32210| 367214| 399424|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true| ['MONDO_0045031']|\n", + "|gonococcal infection|FINNGEN_R10_AB1_G...|FINNGEN_R10| gwas|Gonococcal infection| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 954| 400197| 401151|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true| ['DOID_7551']|\n", + "| helminthiases|FINNGEN_R10_AB1_H...|FINNGEN_R10| gwas| Helminthiases| null| null| null| null| null| null| null| null| null|377,277 (210,870 
...| 523| 411658| 412181|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|['EFO_0007245', '...|\n", + "+--------------------+--------------------+-----------+---------+--------------------+------------------------+------+------------------+--------+----------------+----------------------+---------------+------------------+----------------------------------+--------------------+------+---------+--------+---------+---------------------+-------------------+------------------+---------------+-------------+--------------------+-----------+--------------------+\n", + "only showing top 20 rows\n", + "\n" + ] + } + ], + "source": [ + "joined_df.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1542\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Stage 22:===========================================> (3 + 1) / 4]\r" + ] + } + ], + "source": [ + "num_non_null_rows = joined_df.filter(joined_df.trait_efos.isNotNull()).count()\n", + "print(num_non_null_rows)" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+------------+---------+---------+--------------------+------------------------+------+------------------+--------+--------------------+----------------------+---------------+--------------------+----------------------------------+--------------------+------+---------+--------+--------------------+---------------------+--------------------+--------------------+---------------+-------------+--------------------+-----------+\n", + "| studyId|projectId|studyType| traitFromSource|traitFromSourceMappedIds|geneId|tissueFromSourceId|pubmedId| publicationTitle|publicationFirstAuthor|publicationDate| 
publicationJournal|backgroundTraitFromSourceMappedIds| initialSampleSize|nCases|nControls|nSamples| cohorts|ldPopulationStructure| discoverySamples| replicationSamples|qualityControls|analysisFlags|summarystatsLocation|hasSumstats|\n", + "+------------+---------+---------+--------------------+------------------------+------+------------------+--------+--------------------+----------------------+---------------+--------------------+----------------------------------+--------------------+------+---------+--------+--------------------+---------------------+--------------------+--------------------+---------------+-------------+--------------------+-----------+\n", + "| GCST000102| GCST| gwas|Endothelial funct...| [EFO_0004298]| null| null|17903301|Genome-wide assoc...| Vasan RS| 2007-09-19| BMC Med Genet| null|Up to 1,238 Europ...| 0| 0| 1238| [null]| [{nfe, 1.0}]| [{1238, European}]| []| null| null| null| false|\n", + "| GCST000272| GCST| gwas| Height| [EFO_0004339]| null| null|19030899|Genome-wide assoc...| Lei SF| 2008-11-23| Hum Genet| null|618 Chinese ances...| 0| 0| 618| [null]| [{eas, 1.0}]| [{618, East Asian}]|[{2953, East Asian}]| null| null| null| false|\n", + "| GCST000436| GCST| gwas|Acenocoumarol mai...| [GO_0061476]| null| null|19578179|A genome-wide ass...| Teichert M| 2009-07-04| Hum Mol Genet| null|1,451 European an...| 0| 0| 1451| [null]| [{nfe, 1.0}]| [{1451, European}]| [{287, NR}]| null| null| null| false|\n", + "| GCST000514| GCST| gwas|Response to antip...| [GO_0097332]| null| null|19875103|Genomewide associ...| Aberg K| 2009-10-27| Biol Psychiatry| null|421 European ance...| 738| 0| 738| [null]| [{afr, 0.28997289...|[{214, African Am...| []| null| null| null| false|\n", + "| GCST000550| GCST| gwas| Metabolite levels| [EFO_0004725]| null| null|20037589|A genome-wide per...| Illig T| 2009-12-27| Nat Genet| null|1,029 European an...| 0| 0| 1029| [null]| [{nfe, 1.0}]| [{1029, European}]| [{1202, European}]| null| null| null| false|\n", + "| 
GCST000708| GCST| gwas| Freckling| [EFO_0003963]| null| null|20585627|Web-based, partic...| Eriksson N| 2010-06-24| PLoS Genet| null|9,126 European an...| 0| 0| 9126| [null]| [{nfe, 1.0}]| [{9126, European}]| []| null| null| null| false|\n", + "| GCST000754| GCST| gwas|Personality dimen...| [EFO_0004365]| null| null|20691247|A genome-wide ass...| Verweij KJ| 2010-08-04| Biol Psychol| null|5,117 European an...| 0| 0| 5117| [null]| [{nfe, 1.0}]| [{5117, European}]| []| null| null| null| false|\n", + "| GCST000880| GCST| gwas|Menarche (age at ...| [EFO_0004703]| null| null|21102462|Thirty new loci f...| Elks CE| 2010-11-21| Nat Genet| null|86,142 European a...| 0| 0| 87802| [null]| [{nfe, 1.0}]| [{87802, European}]| [{14731, European}]| null| null| null| false|\n", + "| GCST001031| GCST| gwas|Large B-cell lymp...| [EFO_0000403]| null| null|21471979|Common variants o...| Kumar V| 2011-04-07| J Hum Genet| null|74 Japanese ances...| 74| 934| 1008| [null]| [{eas, 1.0}]|[{1008, East Asian}]|[{3634, East Asian}]| null| null| null| false|\n", + "| GCST001032| GCST| gwas|Caffeine consumption| [EFO_0004330]| null| null|21490707|Genome-wide meta-...| Cornelis MC| 2011-04-07| PLoS Genet| null|47,431 European a...| 0| 0| 47431| [null]| [{nfe, 1.0}]| [{47431, European}]| []| null| null| null| false|\n", + "| GCST001059| GCST| gwas| Neutrophil count| [EFO_0004833]| null| null|21507922|Duffy-null-associ...| Ramsuran V| 2011-05-01| Clin Infect Dis| null|115 African ances...| 0| 0| 115| [null]| [{afr, 1.0}]|[{115, Sub-Sahara...| []| null| null| null| false|\n", + "| GCST002187| GCST| gwas|Systolic blood pr...| [EFO_0006335]| null| null|24058526|Genome-wide meta-...| Bhatnagar P| 2013-09-13| PLoS One| [MONDO_0011382]|1617 African Amer...| 1617| 0| 1617| [null]| [{afr, 1.0}]|[{1617, African A...| []| null| null| null| false|\n", + "| GCST002623| GCST| gwas| L-arginine levels| [EFO_0006524]| null| null|25245031|Genome-wide assoc...| Luneburg N| 2014-09-21|Circ Cardiovasc G...| null|3,747 
European an...| 0| 0| 6739| [null]| [{nfe, 1.0}]|[{3747, European}...| [{1159, European}]| null| null| null| false|\n", + "| GCST003261| GCST| gwas|Ischemic stroke (...| [HP_0002140]| null| null|26708676|Loci associated w...| Pulit SL| 2015-12-18| Lancet Neurol| null|up to 8,062 Europ...| 9510| 32473| 41983| [null]| [{amr, 0.06647928...|[{2791, Hispanic ...|[{256, African Am...| null| null| null| false|\n", + "| GCST003427| GCST| gwas|Alzheimer disease...| [EFO_0004847, MON...| null| null|26830138|Family-based asso...| Herold C| 2016-02-02| Mol Psychiatry| null|2,478 European an...| 2478| 979| 3457| [null]| [{nfe, 1.0}]| [{3524, European}]| []| null| null| null| false|\n", + "| GCST003665| GCST| gwas|Free cholesterol ...| [EFO_0004611, EFO...| null| null|27005778|Genome-wide study...| Kettunen J| 2016-03-23| Nat Commun| null|21,555 European a...| 0| 0| 21555|[EGCUT, ERF, FTC,...| [{nfe, 1.0}]| [{21555, European}]| []| null| null| null| false|\n", + "| GCST003773| GCST| gwas|Loneliness (multi...| [EFO_0007865]| null| null|27629369|Genome-Wide Assoc...| Gao J| 2016-09-15|Neuropsychopharma...| null|8,490 European an...| 0| 0| 10760| [null]| [{nfe, 0.80529739...|[{8490, European}...| []| null| null| null| false|\n", + "| GCST003791| GCST| gwas|Response to metfo...| [EFO_0006952, GO_...| null| null|28173075|Metformin pharmac...| Niu N| 2016-09-11| Hum Mol Genet| null|up to 96 African ...| 0| 0| 288| [null]| [{afr, 0.33333333...|[{96, African Ame...| []| null| null| null| false|\n", + "| GCST003824| GCST| gwas|Depression in res...| [EFO_0007006, EFO...| null| null|27723809|Genome-Wide Assoc...| Matsunami K| 2016-10-10| PLoS One| [EFO_0004220]|45 Japanese ances...| 45| 179| 224| [null]| [{eas, 1.0}]| [{224, East Asian}]| [{160, East Asian}]| null| null| null| false|\n", + "| GCST003837| GCST| gwas| Chronotype| [EFO_0004354]| null| null|27494321|Genome-Wide Assoc...| Jones SE| 2016-08-05| PLoS Genet| null|127,898 British i...| 0| 0| 127898| [null]| [{nfe, 1.0}]|[{127898, 
European}]| [{89283, NR}]| []| []|ftp://ftp.ebi.ac....| true|\n", + "| GCST004678| GCST| gwas|Psychosis pronene...| [EFO_0008337]| null| null|28525603|Genome-Wide Assoc...| Ortega-Alonso A| 2017-05-19| Schizophr Bull| null|3,967 Finnish anc...| 0| 0| 3967| [null]| [{nfe, 1.0}]| [{3967, European}]| []| null| null| null| false|\n", + "| GCST005189| GCST| gwas| Tanning| [EFO_0004279]| null| null|29195075|An Unexpectedly C...| Martin AR| 2017-11-30| Cell| null|216 Sub-Saharan A...| 0| 0| 216| [null]| [{afr, 1.0}]|[{216, Sub-Sahara...|[{240, Sub-Sahara...| null| null| null| false|\n", + "| GCST005437| GCST| gwas|Random C-peptide ...| [EFO_0005187]| null| null|29404672|Meta-genome-wide ...| Roshandel D| 2018-02-05| Diabetologia| [MONDO_0005147]|1,497 European an...| 0| 0| 1497| [null]| [{nfe, 1.0}]| [{1497, European}]| []| null| null| null| false|\n", + "| GCST005503| GCST| gwas|Medium HDL partic...| [EFO_0004612]| null| null|29084231|Common, low-frequ...| Davis JP| 2017-10-30| PLoS Genet| null|8,372 Finnish anc...| 0| 0| 8372| [null]| [{nfe, 1.0}]| [{8372, European}]| []| null| null| null| false|\n", + "| GCST005669| GCST| gwas|Delta-6 desaturas...| [EFO_0007765, EFO...| null| null|29246731|A common variant ...| de Toro-Martin J| 2017-11-02| J Clin Lipidol| null|81 extreme respon...| 0| 0| 141| [null]| [{nfe, 1.0}]| [{141, NR}]| []| null| null| null| false|\n", + "| GCST005749| GCST| gwas|Digit length rati...| [EFO_0004841]| null| null|29659830|Genome-wide assoc...| Warrington NM| 2018-04-12| Hum Mol Genet| null|14,382 European a...| 0| 0| 15661| [null]| [{nfe, 1.0}]|[{14382, European...| []| null| null| null| false|\n", + "| GCST006420| GCST| gwas|Affective disorde...| [EFO_0004247, EFO...| null| null|30116032|Genetics of suici...| Erlangsen A| 2018-08-16| Mol Psychiatry| null|4,302 European an...| 4302| 13294| 17596| [null]| [{nfe, 1.0}]| [{17596, European}]| []| null| null| null| false|\n", + "| GCST006484| GCST| gwas| Type 2 diabetes| [MONDO_0005148]| null| 
null|30130595|Pilot genome-wide...| Dominguez-Cruz MG| 2018-08-18| Gene| null|45 Maya ancestry ...| 45| 47| 92| [null]| [{amr, 1.0}]|[{92, Native Amer...| []| null| null| null| false|\n", + "| GCST006496| GCST| gwas|Glomerular filtra...| [EFO_0006829, EFO...| null| null|30160337|Genome Wide Assoc...| Asleh R| 2018-08-30| Clin Transplant| null|243 European ance...| 0| 0| 251| [null]| [{nfe, 0.99601593...|[{243, European},...| []| null| null| null| false|\n", + "| GCST006739| GCST| gwas|Proportion of mis...| [EFO_0006923]| null| null|30188897|Detecting past an...| Jeong C| 2018-09-06| PLoS Genet| null|981 Tibetan ances...| 0| 0| 981| [null]| [{nfe, 1.0}]| [{981, NR}]| []| null| null| null| false|\n", + "| GCST006907| GCST| gwas|Ischemic stroke (...| [EFO_0005524]| null| null|29531354|Multiancestry gen...| Malik R| 2018-03-12| Nat Genet| null|4,373 European an...| 4373| 406111| 410484| [null]| [{nfe, 1.0}]|[{150765, European}]| []| []| []|ftp://ftp.ebi.ac....| true|\n", + "| GCST006960| GCST| gwas|Inflammatory bowe...| [EFO_0003767]| null| null|26490195|Inherited determi...| Cleynen I| 2015-10-18| Lancet| null|16,902 European a...| 29838| 0| 29838| [null]| [{nfe, 1.0}]| [{29838, European}]| [{6182, European}]| null| null| null| false|\n", + "| GCST007217| GCST| gwas|RR interval (hear...| [EFO_0004831]| null| null|30679814|Genome-wide assoc...| van Setten J| 2019-01-24| Eur J Hum Genet| null|2,006 Erasmus Ruc...| 0| 0| 28698| [null]| [{nfe, 1.0}]| [{28698, European}]| []| null| null| null| false|\n", + "| GCST008154| GCST| gwas| Trunk fat mass| [EFO_0005409]| null| null|28552196|Whole-Genome Sequ...| Tachmazidou I| 2017-06-01| Am J Hum Genet| null|3,399 whole genom...| 0| 0| 16237| [null]| [{nfe, 1.0}]|[{3538, NR}, {128...| [{10667, European}]| null| null| null| false|\n", + "| GCST008483| GCST| gwas| Ulcerative colitis| [EFO_0000729]| null| null|26398853|Identification of...| Ye BD| 2016-01-01| Inflamm Bowel Dis| null|705 Korean ancest...| 705| 1178| 1883| [null]| 
[{eas, 1.0}]|[{1883, South Asi...|[{3674, South Asi...| null| null| null| false|\n", + "| GCST008671| GCST| gwas|Phlegm x occupati...| [EFO_0007939, EFO...| null| null|30449631|Genome-wide inter...| Zeng X| 2018-11-15| Environ Int| null|1,702 Dutch ances...| 1702| 6274| 7976| [null]| [{nfe, 1.0}]| [{7976, European}]| [{6789, European}]| null| null| null| false|\n", + "| GCST008675| GCST| gwas|Maximum habitual ...| [EFO_0007878]| null| null|31151762|Genome-wide Assoc...| Gelernter J| 2019-04-08| Biol Psychiatry| null|126,936 European ...| 0| 0| 143965| [null]| [{afr, 0.11828569...|[{17029, African ...| []| null| null| null| false|\n", + "| GCST008775| GCST| gwas|Birth weight or w...| [EFO_0004342, EFO...| null| null|30858448|Genetic overlap b...| Tekola-Ayele F| 2019-03-11| Sci Rep| null|153,781 European ...| 0| 0| 378240| [null]| [{nfe, 1.0}]|[{246502, Europea...| []| null| null| null| false|\n", + "| GCST008870| GCST| gwas|Keratinocyte canc...| [EFO_0010176]| null| null|31174203|Combined analysis...| Liyanage UE| 2019-06-07| Hum Mol Genet| null|at least 18,538 E...| 18538| 340302| 358840| [null]| [{nfe, 1.0}]|[{358840, European}]| []| null| null| null| false|\n", + "| GCST009173| GCST| gwas|Response to (pegy...| [EFO_0007859]| null| null|30715261|Genome Wide Assoc...| Brouwer WP| 2019-02-02| Clin Infect Dis| [EFO_0004239]|121 Asian, Europe...| 0| 0| 509| [null]| [{nfe, 0.5}, {afr...|[{127, European},...| []| null| null| null| false|\n", + "| GCST009364| GCST| gwas|Triglyceride leve...| [EFO_0004530, EFO...| null| null|31719535|Multi-ancestry sl...| Noordam R| 2019-11-12| Nat Commun| null|at least 2,926 Af...| 0| 49886| 61990| [null]| [{eas, 0.03837715...|[{2096, East Asia...|[{12579, Hispanic...| null| null| null| false|\n", + "| GCST009391| GCST| gwas|Metabolite levels...| [EFO_0005132]| null| null|23823483|A genome-wide ass...| Rhee EP| 2013-07-02| Cell Metab| null|2,076 European an...| 0| 0| 2076| [null]| [{nfe, 1.0}]| [{2076, European}]| []| null| null| null| 
false|\n", + "|GCST009391_2| GCST| gwas| Metabolite levels| []| null| null|23823483|A genome-wide ass...| Rhee EP| 2013-07-02| Cell Metab| null|2,076 European an...| 0| 0| 2076| [null]| [{nfe, 1.0}]| [{2076, European}]| []| null| null| null| false|\n", + "|GCST009391_3| GCST| gwas| Metabolite levels| [EFO_0004468, EFO...| null| null|23823483|A genome-wide ass...| Rhee EP| 2013-07-02| Cell Metab| null|2,076 European an...| 0| 0| 2076| [null]| [{nfe, 1.0}]| [{2076, European}]| []| null| null| null| false|\n", + "|GCST009391_4| GCST| gwas| Metabolite levels| [EFO_0004518]| null| null|23823483|A genome-wide ass...| Rhee EP| 2013-07-02| Cell Metab| null|2,076 European an...| 0| 0| 2076| [null]| [{nfe, 1.0}]| [{2076, European}]| []| null| null| null| false|\n", + "|GCST009391_5| GCST| gwas| Metabolite levels| [EFO_0004761]| null| null|23823483|A genome-wide ass...| Rhee EP| 2013-07-02| Cell Metab| null|2,076 European an...| 0| 0| 2076| [null]| [{nfe, 1.0}]| [{2076, European}]| []| null| null| null| false|\n", + "|GCST009391_6| GCST| gwas| Metabolite levels| [EFO_0004846]| null| null|23823483|A genome-wide ass...| Rhee EP| 2013-07-02| Cell Metab| null|2,076 European an...| 0| 0| 2076| [null]| [{nfe, 1.0}]| [{2076, European}]| []| null| null| null| false|\n", + "|GCST009391_7| GCST| gwas| Metabolite levels| [EFO_0005001]| null| null|23823483|A genome-wide ass...| Rhee EP| 2013-07-02| Cell Metab| null|2,076 European an...| 0| 0| 2076| [null]| [{nfe, 1.0}]| [{2076, European}]| []| null| null| null| false|\n", + "|GCST009391_8| GCST| gwas| Metabolite levels| [EFO_0005002]| null| null|23823483|A genome-wide ass...| Rhee EP| 2013-07-02| Cell Metab| null|2,076 European an...| 0| 0| 2076| [null]| [{nfe, 1.0}]| [{2076, European}]| []| null| null| null| false|\n", + "|GCST009391_9| GCST| gwas| Metabolite levels| [EFO_0005058]| null| null|23823483|A genome-wide ass...| Rhee EP| 2013-07-02| Cell Metab| null|2,076 European an...| 0| 0| 2076| [null]| [{nfe, 1.0}]| [{2076, European}]| 
[]| null| null| null| false|\n", + "+------------+---------+---------+--------------------+------------------------+------+------------------+--------+--------------------+----------------------+---------------+--------------------+----------------------------------+--------------------+------+---------+--------+--------------------+---------------------+--------------------+--------------------+---------------+-------------+--------------------+-----------+\n", + "only showing top 50 rows\n", + "\n" + ] + } + ], + "source": [ + "path_tmp=\"gs://gwas_catalog_data/study_index\"\n", + "tmp=StudyIndex.from_parquet(session=session, path=path_tmp)\n", + "tmp.df.show(50)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "joined_df=joined_df.withColumn(\"traitFromSourceMappedIds\",joined_df[\"trait_efos\"]).drop(\"trait_efos\",\"trait_reported_low\")" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+-----------+---------+--------------------+------------------------+------+------------------+--------+----------------+----------------------+---------------+------------------+----------------------------------+--------------------+------+---------+--------+---------+---------------------+-------------------+------------------+---------------+-------------+--------------------+-----------+\n", + "| studyId| projectId|studyType| traitFromSource|traitFromSourceMappedIds|geneId|tissueFromSourceId|pubmedId|publicationTitle|publicationFirstAuthor|publicationDate|publicationJournal|backgroundTraitFromSourceMappedIds| initialSampleSize|nCases|nControls|nSamples| cohorts|ldPopulationStructure| discoverySamples|replicationSamples|qualityControls|analysisFlags|summarystatsLocation|hasSumstats|\n", + 
"+--------------------+-----------+---------+--------------------+------------------------+------+------------------+--------+----------------+----------------------+---------------+------------------+----------------------------------+--------------------+------+---------+--------+---------+---------------------+-------------------+------------------+---------------+-------------+--------------------+-----------+\n", + "|FINNGEN_R10_AB1_A...|FINNGEN_R10| gwas| Actinomycosis| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 101| 363227| 363328|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_A...|FINNGEN_R10| gwas| Amoebiasis| ['EFO_0007144']| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 160| 367214| 367374|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_A...|FINNGEN_R10| gwas|Anogenital herpes...| ['EFO_0007282']| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 1986| 400197| 402183|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_A...|FINNGEN_R10| gwas| Aspergillosis| ['EFO_0007157']| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 211| 403213| 403424|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_A...|FINNGEN_R10| gwas|Atypical virus in...| ['MONDO_0024318']| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 282| 409849| 410131|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_B...|FINNGEN_R10| gwas|Bacterial infecti...| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 20226| 363227| 383453|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + 
"|FINNGEN_R10_AB1_B...|FINNGEN_R10| gwas|Bacterial, viral ...| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 2852| 409329| 412181|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_B...|FINNGEN_R10| gwas|Other bacterial i...| ['EFO_0000771']| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 6145| 367214| 373359|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_C...|FINNGEN_R10| gwas| Candidiasis| ['MONDO_0002026']| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 4306| 403213| 407519|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_C...|FINNGEN_R10| gwas|Other sexually tr...| ['MONDO_0021681',...| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 2186| 400197| 402383|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_C...|FINNGEN_R10| gwas| Cholera| ['EFO_1001235']| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 1385| 367214| 368599|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_D...|FINNGEN_R10| gwas|Dengue fever [cla...| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 53| 409137| 409190|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_D...|FINNGEN_R10| gwas| Dermatophytosis| ['MONDO_0004678']| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 3921| 403213| 407134|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_E...|FINNGEN_R10| gwas| Early syphilis| ['EFO_0007504']| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 308| 400197| 400505|[FinnGen]| 
[{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "| FINNGEN_R10_AB1_EBV|FINNGEN_R10| gwas|Infectious mononu...| ['EFO_0007326']| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 2979| 400974| 403953|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_E...|FINNGEN_R10| gwas| Enterobiasis| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 112| 411658| 411770|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_E...|FINNGEN_R10| gwas| Erysipelas| ['EFO_1001462']| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 22261| 363227| 385488|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_G...|FINNGEN_R10| gwas|Diarrhoea and gas...| ['MONDO_0045031']| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 32210| 367214| 399424|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_G...|FINNGEN_R10| gwas|Gonococcal infection| ['DOID_7551']| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 954| 400197| 401151|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_H...|FINNGEN_R10| gwas| Helminthiases| ['EFO_0007245', '...| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 523| 411658| 412181|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + 
"+--------------------+-----------+---------+--------------------+------------------------+------+------------------+--------+----------------+----------------------+---------------+------------------+----------------------------------+--------------------+------+---------+--------+---------+---------------------+-------------------+------------------+---------------+-------------+--------------------+-----------+\n", + "only showing top 20 rows\n", + "\n" + ] + } + ], + "source": [ + "joined_df.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "string\n" + ] + } + ], + "source": [ + "column_type = dict(joined_df.dtypes)[\"traitFromSourceMappedIds\"]\n", + "print(column_type)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "from pyspark.sql.functions import from_json\n", + "from pyspark.sql.types import ArrayType, StringType\n", + "\n", + "# Assuming joined_df is your DataFrame\n", + "joined_df = joined_df.withColumn(\n", + " \"traitFromSourceMappedIds\",\n", + " from_json(\"traitFromSourceMappedIds\", ArrayType(StringType()))\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+-----------+---------+--------------------+------------------------+------+------------------+--------+----------------+----------------------+---------------+------------------+----------------------------------+--------------------+------+---------+--------+---------+---------------------+-------------------+------------------+---------------+-------------+--------------------+-----------+\n", + "| studyId| projectId|studyType| 
traitFromSource|traitFromSourceMappedIds|geneId|tissueFromSourceId|pubmedId|publicationTitle|publicationFirstAuthor|publicationDate|publicationJournal|backgroundTraitFromSourceMappedIds| initialSampleSize|nCases|nControls|nSamples| cohorts|ldPopulationStructure| discoverySamples|replicationSamples|qualityControls|analysisFlags|summarystatsLocation|hasSumstats|\n", + "+--------------------+-----------+---------+--------------------+------------------------+------+------------------+--------+----------------+----------------------+---------------+------------------+----------------------------------+--------------------+------+---------+--------+---------+---------------------+-------------------+------------------+---------------+-------------+--------------------+-----------+\n", + "|FINNGEN_R10_AB1_A...|FINNGEN_R10| gwas| Actinomycosis| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 101| 363227| 363328|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_A...|FINNGEN_R10| gwas| Amoebiasis| [EFO_0007144]| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 160| 367214| 367374|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_A...|FINNGEN_R10| gwas|Anogenital herpes...| [EFO_0007282]| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 1986| 400197| 402183|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_A...|FINNGEN_R10| gwas| Aspergillosis| [EFO_0007157]| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 211| 403213| 403424|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_A...|FINNGEN_R10| gwas|Atypical virus in...| [MONDO_0024318]| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 282| 409849| 410131|[FinnGen]| 
[{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_B...|FINNGEN_R10| gwas|Bacterial infecti...| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 20226| 363227| 383453|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_B...|FINNGEN_R10| gwas|Bacterial, viral ...| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 2852| 409329| 412181|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_B...|FINNGEN_R10| gwas|Other bacterial i...| [EFO_0000771]| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 6145| 367214| 373359|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_C...|FINNGEN_R10| gwas| Candidiasis| [MONDO_0002026]| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 4306| 403213| 407519|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_C...|FINNGEN_R10| gwas|Other sexually tr...| [MONDO_0021681, E...| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 2186| 400197| 402383|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_C...|FINNGEN_R10| gwas| Cholera| [EFO_1001235]| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 1385| 367214| 368599|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_D...|FINNGEN_R10| gwas|Dengue fever [cla...| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 53| 409137| 409190|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_D...|FINNGEN_R10| gwas| Dermatophytosis| [MONDO_0004678]| null| null| null| null| null| 
null| null| null|377,277 (210,870 ...| 3921| 403213| 407134|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_E...|FINNGEN_R10| gwas| Early syphilis| [EFO_0007504]| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 308| 400197| 400505|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "| FINNGEN_R10_AB1_EBV|FINNGEN_R10| gwas|Infectious mononu...| [EFO_0007326]| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 2979| 400974| 403953|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_E...|FINNGEN_R10| gwas| Enterobiasis| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 112| 411658| 411770|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_E...|FINNGEN_R10| gwas| Erysipelas| [EFO_1001462]| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 22261| 363227| 385488|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_G...|FINNGEN_R10| gwas|Diarrhoea and gas...| [MONDO_0045031]| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 32210| 367214| 399424|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_G...|FINNGEN_R10| gwas|Gonococcal infection| [DOID_7551]| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 954| 400197| 401151|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_H...|FINNGEN_R10| gwas| Helminthiases| [EFO_0007245, EFO...| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 523| 411658| 412181|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + 
"+--------------------+-----------+---------+--------------------+------------------------+------+------------------+--------+----------------+----------------------+---------------+------------------+----------------------------------+--------------------+------+---------+--------+---------+---------------------+-------------------+------------------+---------------+-------------+--------------------+-----------+\n", + "only showing top 20 rows\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + } + ], + "source": [ + "joined_df.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "array\n" + ] + } + ], + "source": [ + "column_type = dict(joined_df.dtypes)[\"traitFromSourceMappedIds\"]\n", + "print(column_type)" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "si=StudyIndex(_df=joined_df, _schema=StudyIndex.get_schema())" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+-----------+---------+--------------------+------------------------+------+------------------+--------+----------------+----------------------+---------------+------------------+----------------------------------+--------------------+------+---------+--------+---------+---------------------+-------------------+------------------+---------------+-------------+--------------------+-----------+\n", + "| studyId| projectId|studyType| traitFromSource|traitFromSourceMappedIds|geneId|tissueFromSourceId|pubmedId|publicationTitle|publicationFirstAuthor|publicationDate|publicationJournal|backgroundTraitFromSourceMappedIds| initialSampleSize|nCases|nControls|nSamples| 
cohorts|ldPopulationStructure| discoverySamples|replicationSamples|qualityControls|analysisFlags|summarystatsLocation|hasSumstats|\n", + "+--------------------+-----------+---------+--------------------+------------------------+------+------------------+--------+----------------+----------------------+---------------+------------------+----------------------------------+--------------------+------+---------+--------+---------+---------------------+-------------------+------------------+---------------+-------------+--------------------+-----------+\n", + "|FINNGEN_R10_AB1_A...|FINNGEN_R10| gwas| Actinomycosis| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 101| 363227| 363328|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_A...|FINNGEN_R10| gwas| Amoebiasis| [EFO_0007144]| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 160| 367214| 367374|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_A...|FINNGEN_R10| gwas|Anogenital herpes...| [EFO_0007282]| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 1986| 400197| 402183|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_A...|FINNGEN_R10| gwas| Aspergillosis| [EFO_0007157]| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 211| 403213| 403424|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_A...|FINNGEN_R10| gwas|Atypical virus in...| [MONDO_0024318]| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 282| 409849| 410131|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_B...|FINNGEN_R10| gwas|Bacterial infecti...| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 20226| 
363227| 383453|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_B...|FINNGEN_R10| gwas|Bacterial, viral ...| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 2852| 409329| 412181|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_B...|FINNGEN_R10| gwas|Other bacterial i...| [EFO_0000771]| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 6145| 367214| 373359|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_C...|FINNGEN_R10| gwas| Candidiasis| [MONDO_0002026]| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 4306| 403213| 407519|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_C...|FINNGEN_R10| gwas|Other sexually tr...| [MONDO_0021681, E...| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 2186| 400197| 402383|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_C...|FINNGEN_R10| gwas| Cholera| [EFO_1001235]| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 1385| 367214| 368599|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_D...|FINNGEN_R10| gwas|Dengue fever [cla...| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 53| 409137| 409190|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_D...|FINNGEN_R10| gwas| Dermatophytosis| [MONDO_0004678]| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 3921| 403213| 407134|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_E...|FINNGEN_R10| gwas| Early syphilis| [EFO_0007504]| 
null| null| null| null| null| null| null| null|377,277 (210,870 ...| 308| 400197| 400505|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "| FINNGEN_R10_AB1_EBV|FINNGEN_R10| gwas|Infectious mononu...| [EFO_0007326]| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 2979| 400974| 403953|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_E...|FINNGEN_R10| gwas| Enterobiasis| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 112| 411658| 411770|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_E...|FINNGEN_R10| gwas| Erysipelas| [EFO_1001462]| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 22261| 363227| 385488|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_G...|FINNGEN_R10| gwas|Diarrhoea and gas...| [MONDO_0045031]| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 32210| 367214| 399424|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_G...|FINNGEN_R10| gwas|Gonococcal infection| [DOID_7551]| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 954| 400197| 401151|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_H...|FINNGEN_R10| gwas| Helminthiases| [EFO_0007245, EFO...| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 523| 411658| 412181|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + 
"+--------------------+-----------+---------+--------------------+------------------------+------+------------------+--------+----------------+----------------------+---------------+------------------+----------------------------------+--------------------+------+---------+--------+---------+---------------------+-------------------+------------------+---------------+-------------+--------------------+-----------+\n", + "only showing top 20 rows\n", + "\n" + ] + } + ], + "source": [ + "si.df.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "data": { + "text/plain": [ + "2408" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "si.df.count()" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + } + ], + "source": [ + "si.df.write.parquet(path=\"gs://genetics-portal-dev-analysis/yt4/study_index_finngen_with_efo\")" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [], + "source": [ + "path_to_study_index=\"gs://genetics-portal-dev-analysis/yt4/study_index_finngen_with_efo\"\n", + "si=StudyIndex.from_parquet(session=session, path=path_to_study_index)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "gentropy-krNFZEZg-py3.10", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.8" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git 
a/notebooks/Release_QC_metrics.ipynb b/notebooks/Release_QC_metrics.ipynb new file mode 100644 index 000000000..0052a3cc8 --- /dev/null +++ b/notebooks/Release_QC_metrics.ipynb @@ -0,0 +1,1103 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Notebook Title: Exploratory Data Analysis for Release QC Metrics\n", + "\n", + "## Description:\n", + "This notebook provides an exploratory data analysis for release quality control (QC) metrics. The notebook utilizes PySpark and pandas for data processing and visualization.\n", + "\n", + "## Notebook Workflow:\n", + "1. Import necessary modules and set up the release path and version.\n", + "2. Load and analyze the variant index data:\n", + " - Count the number of unique variants.\n", + "3. Load and analyze the variant-to-gene (v2g) data:\n", + " - Count the number of unique variants and total variant-to-gene assignments.\n", + " - Count the number of v2g assignments where the score is > 0.8.\n", + " - Plot a histogram/density plot for the \"score\" column.\n", + "4. Load and analyze the study index data for different data sources (FinnGen, GWASCat, eQTLcat):\n", + " - Count the number of unique studies for each data source.\n", + "5. Analyze the credible sets for each data source (FinnGen, GWASCat, eQTLcat):\n", + " - Analyze the credible sets:\n", + " - Count the number of unique credible sets and unique study IDs.\n", + " - Plot a scatter plot of the credible set size vs. the top posterior probability.\n", + " - Count the number of credible sets with a top SNP posterior probability > 0.9.\n", + "6. Analyze colocalization data:\n", + " - Count the total number of colocalizations and the number with clpp > 0.8.\n", + " - Calculate the average number of overlaps per credible set.\n", + "7. Analyze locus-to-gene (L2G) predictions:\n", + " - Load the locus-to-gene predictions data.\n", + " - How many StudyLocus entries contain a \"good\" L2G prediction?
(l2g_score > 0.5)\n", + " - How does l2g perform based on different datasource inputs? (impossible to tell)\n", + " \n", + "Note: This notebook assumes that the necessary data files are available in the specified release path and version.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Setup steps\n", + "##### 1. Import necessary modules and set up the release path and version." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Setting default log level to \"WARN\".\n", + "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "24/03/20 13:24:56 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n" + ] + } + ], + "source": [ + "\"\"\"notebook for release qc metrics.\"\"\"\n", + "\n", + "import sys\n", + "from gentropy.common.session import Session\n", + "from pyspark.sql import functions as f\n", + "\n", + "sys.path.append(\"../../gentropy/src/\")\n", + "release_path=\"../../otg_releases\"\n", + "release_ver=\"2403\"\n", + "\n", + "session=Session()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 2. Load and analyze the variant index data:\n", + " - Count the number of unique variants." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Stage 3:> (0 + 8) / 9]\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Variant index contains 5468737 unique variants.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + } + ], + "source": [ + "\n", + "variant_index_path = f\"{release_path}/{release_ver}/variant_index\"\n", + "variant_index=session.spark.read.parquet(variant_index_path, recursiveFileLookup=True)\n", + "\n", + "# How many variants?\n", + "print(\"Variant index contains \", variant_index.select(f.col(\"variantId\")).distinct().count(), \" unique variants.\")\n", + "\n", + "# How many variants with MAF>=0.01 for EUR population?\n", + "#variant_index.filter(variant_index[\"alleleFrequencies.populationName\"] > 0.05).show(10, False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "#### 3. Load and analyze the variant-to-gene (v2g) data:\n", + " - Count the number of unique variants and total variant-to-gene assignments.\n", + " - Count the number of v2g assignments where the score is > 0.8." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Unique variants in v2g release: 5090991 , total variant to gene assignments: 105771851 , number of v2g assignments where score > 0.8: 23176515 ( 4.552 %)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Summary of v2g_score: Mean: 0.5909395615801637 L.quart: 0.29 Median: 0.62 U.quart: 0.94\n" + ] + } + ], + "source": [ + "#v2g_path='gs://genetics_etl_python_playground/releases/24.03/variant_to_gene'\n", + "v2g_path=f\"{release_path}/{release_ver}/variant_to_gene\"\n", + "v2g=session.spark.read.parquet(v2g_path, recursiveFileLookup=True)\n", + "\n", + "#How many variants?\n", + "print(\"Unique variants in v2g release: \", v2g.select(f.col(\"variantId\")).distinct().count(), \", total variant to gene assignments: \", v2g.count(), \", number of v2g assignments where score > 0.8: \", v2g.filter(v2g[\"score\"] > 0.8).count(), \"(\", round( v2g.filter(v2g[\"score\"] > 0.8).count()/v2g.select(f.col(\"variantId\")).distinct().count(), 3), \"%)\")\n", + "sample_size_quartiles = v2g.stat.approxQuantile(\"score\", [0.25, 0.5, 0.75], 0.01)\n", + "print(\"Summary of v2g_score: Mean: \", v2g.select(f.mean(v2g[\"score\"])).collect()[0][0], \"L.quart: \", sample_size_quartiles[0], \"Median: \", sample_size_quartiles[1], \"U.quart: \", sample_size_quartiles[2])\n", + "#v2g.select().toPandas().plot.hist()\n", + "#v2g.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " - Plot a histogram/density plot for the \"score\" column." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "#The histogram/density plot for “score”\n", + "# Out of mem error:\n", + "#v2g.select(f.col(\"score\")).toPandas().plot.hist(bins=10, alpha=0.5, label=\"v2g scores\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### 5. Analyze the credible sets:\n", + "For each data source and fine-mapping method:\n", + "- Count the number of unique credible sets and unique study IDs.\n", + "- Plot a scatter plot of the credible set size vs. the top posterior probability.\n", + "- Count the number of credible sets with a top SNP posterior probability > 0.9.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of unique finngen susie CSs: 13966\n", + "Ingested 13966 Credible sets from 2408 finngen studies\n", + "Summary of finngen sample sizes: Mean: 355917.896179402 L.quart: 316100.0 Median: 392089.0 U.quart: 409683.0\n", + "Number of finngen studies with at least one CS: 1194\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of CS with top SNP PP > 0.9: 1898\n", + "Summary of finngen credible set sizes: L.quart: 4.0 Median: 14.0 U.quart: 41.0\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# study_index='gs://genetics_etl_python_playground/releases/24.03/study_index'\n", + "# Study Index. It has different files for different datasource (FinnGen, GWASCat, eQTLcat).\n", + "# Credible_set. Please use Daniels’ notebook as a reference. For each subfolder:\n", + "\n", + "finngen_index_path=f\"{release_path}/{release_ver}/study_index/finngen\"\n", + "# finngen susie:\n", + "finngen_susie_path=f\"{release_path}/{release_ver}/credible_set/finngen_susie\"\n", + "\n", + "finngen_susie=session.spark.read.parquet(finngen_susie_path, recursiveFileLookup=True)\n", + "print(\"Number of unique finngen susie CSs: \", finngen_susie.select(\"studyId\", \"region\", \"credibleSetIndex\").distinct().count())\n", + "\n", + "# FinnGen:\n", + "finngen_index=session.spark.read.parquet(finngen_index_path, recursiveFileLookup=True)\n", + "# Number of CSs, studies.\n", + "print(\"Ingested \", finngen_susie.select(\"studyId\", \"region\", \"credibleSetIndex\").distinct().count(), \" Credible sets from\", finngen_index.select(f.col(\"studyId\")).distinct().count(), \"finngen studies\")\n", + "sample_size_quartiles = finngen_index.stat.approxQuantile(\"nSamples\", [0.25, 0.5, 0.75], 0.01)\n", + "print(\"Summary of finngen sample sizes: Mean: \", finngen_index.select(f.mean(finngen_index[\"nSamples\"])).collect()[0][0], \"L.quart: \", sample_size_quartiles[0], \"Median: \", sample_size_quartiles[1], \"U.quart: \", sample_size_quartiles[2])\n", + "# Number of unique studyids with at least one CS.\n", + "print(\"Number of finngen studies with at least one CS: \", finngen_susie.select(\"studyId\").distinct().count())\n", + "# Number of CSs with at least one SNP with PIP>0.9\n", + "print(\"Number of CS with top SNP PP > 0.9: \", finngen_susie.select(\"studyId\", \"region\", \"credibleSetIndex\", \"locus.posteriorProbability\").withColumn(\"top_PP\",
f.col(\"posteriorProbability\").getItem(0)).filter(f.col(\"top_PP\") > 0.9).count())\n", + "# The descriptive summary of 99% CS size and histogram/density plot\n", + "\n", + "credset_size_quartiles = finngen_susie.select(\"studyId\", \"region\", \"credibleSetIndex\", \"locus.posteriorProbability\").withColumn(\"credset_size\", f.size(f.col(\"posteriorProbability\"))).stat.approxQuantile(\"credset_size\", [0.25, 0.5, 0.75], 0.01)\n", + "print(\"Summary of finngen credible set sizes: L.quart: \", credset_size_quartiles[0], \"Median: \", credset_size_quartiles[1], \"U.quart: \", credset_size_quartiles[2])\n", + "\n", + "finngen_susie.select(\"studyId\", \"region\", \"credibleSetIndex\", \"locus.posteriorProbability\").withColumn(\"top_PP\", f.col(\"posteriorProbability\").getItem(0)).withColumn(\"credset_size\", f.size(f.col(\"posteriorProbability\"))).toPandas().plot.scatter(x=\"credset_size\", y=\"top_PP\", xlim=[0, 500], alpha=0.05, label=\"finngen susie CSs\", title=\"finngen susie credsets\")\n", + "\n", + "finngen_susie_fm=finngen_susie.select(\"studyId\", \"studyLocusId\", \"locus.posteriorProbability\").withColumn(\"top_PP\", f.col(\"posteriorProbability\").getItem(0)).withColumn(\"credset_size\", f.size(f.col(\"posteriorProbability\")))\n", + "# The histogram/density plot for total sample size\n", + "#finngen_index.select(f.col(\"nSamples\")).toPandas().plot.hist(bins=10, alpha=0.5, label=\"FinnGen sample size\", title=\"FinnGen sample sizes\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of unique finngen pics CSs: 19967 in 1342 studies.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of finngen_pics CS with top SNP PP > 0.9: 7239\n", + "Summary of finngen_pics sample sizes: L.quart: 316100.0 Median: 392089.0 U.quart: 
409683.0\n", + "Summary of finngen_pics credset sizes: Mean: 36.04337156307908 L.quart: 1.0 Median: 7.0 U.quart: 30.0\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "\n", + "# finngen (PICs):\n", + "finngen_index_path=f\"{release_path}/{release_ver}/study_index/finngen\"\n", + "finngen_index=session.spark.read.parquet(finngen_index_path, recursiveFileLookup=True)\n", + "finngen_pics_path=f\"{release_path}/{release_ver}/credible_set/finngen_pics\"\n", + "# Number of CSs.\n", + "finngen_pics=session.spark.read.parquet(finngen_pics_path, recursiveFileLookup=True)\n", + "#gwascat_sumstats.printSchema()\n", + "print(\"Number of unique finngen pics CSs: \", finngen_pics.select(\"studyLocusId\").distinct().count(), \" in \", finngen_pics.select(\"studyId\").distinct().count(), \" studies.\")\n", + "# keep only credible sets snps\n", + "\n", + "finngen_pics_fm=finngen_pics.select(\"studyId\", \"studyLocusId\", \"locus.posteriorProbability\").withColumn(\"top_PP\", f.col(\"posteriorProbability\").getItem(0)).withColumn(\"credset_size\", f.size(f.col(\"posteriorProbability\")))\n", + "finngen_pics_fm.select(\"credset_size\", \"top_PP\").toPandas().plot.scatter(x=\"credset_size\", y=\"top_PP\", alpha=0.05, xlim=[0, 500], label=\"finngen PICS CS\", title=\"finngen_pics CS\")\n", + "print(\"Number of finngen_pics CS with top SNP PP > 0.9: \", finngen_pics_fm.filter(f.col(\"top_PP\") > 0.9).distinct().count())\n", + "sample_size_quartiles = finngen_index.stat.approxQuantile(\"nSamples\", [0.25, 0.5, 0.75], 0.01)\n", + "print(\"Summary of finngen_pics sample sizes: L.quart: \", sample_size_quartiles[0], \"Median: \", sample_size_quartiles[1], \"U.quart: \", sample_size_quartiles[2])\n", + "\n", + "sample_size_quartiles = finngen_pics_fm.stat.approxQuantile(\"credset_size\", [0.25, 0.5, 0.75], 0.01)\n", + "print(\"Summary of finngen_pics credset sizes: Mean: \", finngen_pics_fm.select(f.mean(finngen_pics_fm[\"credset_size\"])).collect()[0][0], \"L.quart: \", sample_size_quartiles[0], \"Median: \", sample_size_quartiles[1], \"U.quart: \", 
sample_size_quartiles[2])" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "There are 9966 common loci between finngen susie and finngen pics\n" + ] + } + ], + "source": [ + "print(\"There are \", finngen_susie.join(finngen_pics, on=[\"studyId\", \"studyLocusId\"], how=\"inner\").count(), \" common loci between finngen susie and finngen pics\")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "correlation coef: 0.6244595901320829\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "finngen_matching=finngen_pics_fm.withColumnRenamed(\"top_PP\", \"pics_PP\").join(finngen_susie_fm.withColumnRenamed(\"top_PP\", \"susie_PP\"), on=[\"studyId\", \"studyLocusId\"], how=\"inner\")\n", + "finngen_matching.select(\"pics_PP\", \"susie_PP\").toPandas().plot.scatter(x=\"susie_PP\", y=\"pics_PP\", alpha=0.05, title=\"finngen_pics vs finngen_susie CS\")\n", + "print(\"correlation coef: \", finngen_matching.stat.corr(\"pics_PP\", \"susie_PP\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of unique eQTLcat studies: 1801315\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of unqiue eQTLcat tissues: 76\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Ingested 2055350 Credible sets from 1801315 eQTL catalog studies\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Summary of eQTL catalog sample sizes: Mean: 336.1455965527037 L.quart: 190.0 Median: 322.0 U.quart: 483.0\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of eQTL catalog studies with at least one CS: 1801315\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of CS with top SNP PP > 0.9: 386227\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": 
[ + "[Stage 146:===============================================> (32 + 5) / 37]\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Summary of eQTL credible set sizes: L.quart: 3.0 Median: 10.0 U.quart: 27.0\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + } + ], + "source": [ + "eqtl_index_path=f\"{release_path}/{release_ver}/study_index/eqtl_catalogue\"\n", + "# eQTLcat:\n", + "# Number of studies\n", + "eqtl_index=session.spark.read.parquet(eqtl_index_path, recursiveFileLookup=True)\n", + "print(\"Number of unique eQTLcat studies: \", eqtl_index.select(f.col(\"studyId\")).distinct().count())\n", + "# Number of tissues, list of tissues\n", + "print(\"Number of unqiue eQTLcat tissues: \", eqtl_index.select(f.col(\"tissueFromSourceId\")).distinct().count())\n", + "#eqtl_index.select(f.col(\"tissueFromSourceId\")).distinct().show(truncate=False)\n", + "\n", + "# Credible_set. Please use Daniels’ notebook as a reference. For each subfolder:\n", + "# eqtl catalog susie:\n", + "eqtlcat_susie_path=f\"{release_path}/{release_ver}/credible_set/eqtl_catalogue_susie\"\n", + "# Number of CSs.\n", + "eqtlcat_susie=session.spark.read.parquet(eqtlcat_susie_path, recursiveFileLookup=True)\n", + "\n", + "\n", + "# Number of CSs, studies.\n", + "print(\"Ingested \", eqtlcat_susie.select(\"studyId\", \"region\", \"credibleSetIndex\").distinct().count(), \" Credible sets from\", eqtl_index.select(f.col(\"studyId\")).distinct().count(), \"eQTL catalog studies\")\n", + "sample_size_quartiles = eqtl_index.stat.approxQuantile(\"nSamples\", [0.25, 0.5, 0.75], 0.01)\n", + "print(\"Summary of eQTL catalog sample sizes: Mean: \", eqtl_index.select(f.mean(eqtl_index[\"nSamples\"])).collect()[0][0], \"L.quart: \", sample_size_quartiles[0], \"Median: \", sample_size_quartiles[1], \"U.quart: \", sample_size_quartiles[2])\n", + "# Number of unique studyids with at leas one CS.\n", + "print(\"Number of eQTL catalog studies with at 
least one CS: \", eqtlcat_susie.select(\"studyId\").distinct().count())\n", + "# Number of CSs with at leas one SNP with PIP>0.9\n", + "print(\"Number of CS with top SNP PP > 0.9: \", eqtlcat_susie.select(\"studyId\", \"region\", \"credibleSetIndex\", \"locus.posteriorProbability\").withColumn(\"top_PP\", f.col(\"posteriorProbability\").getItem(0)).filter(f.col(\"top_PP\") > 0.9).count())\n", + "# The descriptive summary of 99% CS size and histogram/density plot\n", + "\n", + "credset_size_quartiles = eqtlcat_susie.select(\"studyId\", \"region\", \"credibleSetIndex\", \"locus.posteriorProbability\").withColumn(\"credset_size\", f.size(f.col(\"posteriorProbability\"))).stat.approxQuantile(\"credset_size\", [0.25, 0.5, 0.75], 0.01)\n", + "print(\"Summary of eQTL credible set sizes: L.quart: \", credset_size_quartiles[0], \"Median: \", credset_size_quartiles[1], \"U.quart: \", credset_size_quartiles[2])\n", + "\n", + "# Out of mem error:\n", + "#eqtlcat_susie.select(\"studyId\", \"region\", \"credibleSetIndex\", \"locus.posteriorProbability\").withColumn(\"top_PP\", f.col(\"posteriorProbability\").getItem(0)).withColumn(\"credset_size\", f.size(f.col(\"posteriorProbability\"))).toPandas().plot.scatter(x=\"credset_size\", y=\"top_PP\", xlim=[0, 500], alpha=0.05, label=\"finngen susie CSs\", title=\"finngen susie credsets\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of unique gwascat studies: 79858\n", + "Number of unique SUMSTATS gwascat studies: 18435\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of unique gwas catalog sumstats CSs: 247782 in 9679 studies.\n", + "Summary of SUMSTATS gwas sample sizes: L.quart: 88329.0 Median: 357580.0 U.quart: 445573.0\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ 
+ " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of SUMSTATS CS with top SNP PP > 0.9: 77721\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Summary of SUMSTATS gwascat pics credset sizes: Mean: 39.7659733230078 L.quart: 1.0 Median: 8.0 U.quart: 35.0\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of unique gwas catalog curated CSs: 531198 in 35662 studies.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of CURATED CS with top SNP PP > 0.9: 92000\n", + "Summary of CURATED gwas sample sizes: L.quart: 687.0 Median: 4960.0 U.quart: 21282.0\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Summary of CURATED gwascat pics credset sizes: Mean: 35.42119007174911 L.quart: 1.0 Median: 8.0 U.quart: 34.0\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "gwascat_path=f\"{release_path}/{release_ver}/study_index/gwas_catalog\"\n", + "# Gwas Catalog:\n", + "gwascat_index=session.spark.read.parquet(gwascat_path, recursiveFileLookup=True)\n", + "# Number of GWAS curated studies\n", + "print(\"Number of unique gwascat studies: \", gwascat_index.select(f.col(\"studyId\")).distinct().count())\n", + "# Number of studies with full GWAS sumstats\n", + "print(\"Number of unique SUMSTATS gwascat studies: \", gwascat_index.filter(f.col(\"hasSumstats\") == True).select(f.col(\"studyId\")).distinct().count())\n", + "#gwascat_index\n", + "# The histogram/density plot for total sample size separately for curated studies and full GWAS\n", + "#gwascat_index.filter(f.col(\"hasSumstats\") == True).select(f.col(\"nSamples\")).toPandas().plot.hist(bins=25, alpha=0.5, label=\"Sumstats GWAScat sample size\", title=\"Sumstats GWAScat sample size\")\n", + "#gwascat_index.filter(f.col(\"hasSumstats\") == False).select(f.col(\"nSamples\")).toPandas().plot.hist(bins=25, alpha=0.5, label=\"Sumstats GWAScat sample size\", title=\"Curated GWAScat sample size\")\n", + "\n", + "# Credible_set. Please use Daniels’ notebook as a reference. 
For each subfolder:\n", + "# gwas catalog sumstats (PICs):\n", + "gwascat_sumstats_path=f\"{release_path}/{release_ver}/credible_set/gwas_catalog_PICSed_summary_statistics\"\n", + "# Number of CSs.\n", + "gwascat_sumstats=session.spark.read.parquet(gwascat_sumstats_path, recursiveFileLookup=True)\n", + "\n", + "print(\"Number of unique gwas catalog sumstats CSs: \", gwascat_sumstats.select(\"studyLocusId\").distinct().count(), \" in \", gwascat_sumstats.select(\"studyId\").distinct().count(), \" studies.\")\n", + "\n", + "\n", + "sample_size_quartiles = gwascat_index.join(gwascat_sumstats, how=\"inner\", on=\"studyId\").stat.approxQuantile(\"nSamples\", [0.25, 0.5, 0.75], 0.01)\n", + "print(\"Summary of SUMSTATS gwas sample sizes: L.quart: \", sample_size_quartiles[0], \"Median: \", sample_size_quartiles[1], \"U.quart: \", sample_size_quartiles[2])\n", + "#\n", + "\n", + "\n", + "gwascat_sumstats_fm=gwascat_sumstats.select(\"studyId\", \"studyLocusId\", \"locus.posteriorProbability\").withColumn(\"top_PP\", f.col(\"posteriorProbability\").getItem(0)).withColumn(\"credset_size\", f.size(f.col(\"posteriorProbability\")))\n", + "gwascat_sumstats_fm.select(\"credset_size\", \"top_PP\").toPandas().plot.scatter(x=\"credset_size\", y=\"top_PP\", alpha=0.05, label=\"gwascat sumstats PICS CS\", title=\"gwascat sumstats PICS CS\")\n", + "print(\"Number of SUMSTATS CS with top SNP PP > 0.9: \", gwascat_sumstats_fm.filter(f.col(\"top_PP\") > 0.9).distinct().count())\n", + "\n", + "sample_size_quartiles = gwascat_sumstats_fm.stat.approxQuantile(\"credset_size\", [0.25, 0.5, 0.75], 0.01)\n", + "print(\"Summary of SUMSTATS gwascat pics credset sizes: Mean: \", gwascat_sumstats_fm.select(f.mean(gwascat_sumstats_fm[\"credset_size\"])).collect()[0][0], \"L.quart: \", sample_size_quartiles[0], \"Median: \", sample_size_quartiles[1], \"U.quart: \", sample_size_quartiles[2])\n", + "\n", + "\n", + "# gwas catalog curated (PICs):\n", + 
"gwascat_curated_path=f\"{release_path}/{release_ver}/credible_set/gwas_catalog_PICSed_curated_associations\"\n", + "# Number of CSs.\n", + "gwascat_curated=session.spark.read.parquet(gwascat_curated_path, recursiveFileLookup=True)\n", + "#gwascat_sumstats.printSchema()\n", + "print(\"Number of unique gwas catalog curated CSs: \", gwascat_curated.select(\"studyLocusId\").distinct().count(), \" in \", gwascat_curated.select(\"studyId\").distinct().count(), \" studies.\")\n", + "# keep only credible sets snps\n", + "\n", + "gwascat_curated_fm=gwascat_curated.select(\"studyId\", \"studyLocusId\", \"locus.posteriorProbability\").withColumn(\"top_PP\", f.col(\"posteriorProbability\").getItem(0)).withColumn(\"credset_size\", f.size(f.col(\"posteriorProbability\")))\n", + "gwascat_curated_fm.select(\"credset_size\", \"top_PP\").toPandas().plot.scatter(x=\"credset_size\", y=\"top_PP\", alpha=0.05, label=\"gwascat curated PICS CS\", title=\"gwascat curated PICS CS\")\n", + "print(\"Number of CURATED CS with top SNP PP > 0.9: \", gwascat_curated_fm.filter(f.col(\"top_PP\") > 0.9).distinct().count())\n", + "sample_size_quartiles = gwascat_index.join(gwascat_sumstats, how=\"anti\", on=\"studyId\").stat.approxQuantile(\"nSamples\", [0.25, 0.5, 0.75], 0.01)\n", + "print(\"Summary of CURATED gwas sample sizes: L.quart: \", sample_size_quartiles[0], \"Median: \", sample_size_quartiles[1], \"U.quart: \", sample_size_quartiles[2])\n", + "\n", + "sample_size_quartiles = gwascat_curated_fm.stat.approxQuantile(\"credset_size\", [0.25, 0.5, 0.75], 0.01)\n", + "print(\"Summary of CURATED gwascat pics credset sizes: Mean: \", gwascat_curated_fm.select(f.mean(gwascat_curated_fm[\"credset_size\"])).collect()[0][0], \"L.quart: \", sample_size_quartiles[0], \"Median: \", sample_size_quartiles[1], \"U.quart: \", sample_size_quartiles[2])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### 6. 
Analyze colocalization data:\n", + " - Count the total number of colocalizations and the number with clpp > 0.8.\n", + " - Calculate the average number of overlaps per credible set." + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of colocalisations: 46733065 , of which, 4031821 > 0.8 clpp ( 8.6 %)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Stage 360:============================================> (12 + 3) / 15]\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Average number of overlaps per CS: 90.79159130338489\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + } + ], + "source": [ + "coloc_path=f\"{release_path}/{release_ver}/colocalisation\"\n", + "coloc=session.spark.read.parquet(coloc_path, recursiveFileLookup=True)\n", + "\n", + "print(\"Number of colocalisations: \", coloc.count(), \" , of which, \", coloc.filter(f.col(\"clpp\") > 0.8).count(), \" > 0.8 clpp (\", round((coloc.filter(f.col(\"clpp\") > 0.8).count()/coloc.count()), 3)*100, \"%)\")\n", + "Avg_overlaps=coloc.groupBy(\"leftStudyLocusId\").count().agg(f.avg(\"count\")).collect()[0][0]\n", + "print(\"Average number of overlaps per CS: \", Avg_overlaps)\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### 7. 
Analyze locus-to-gene (L2G) predictions:\n", + " - Histogram of l2g score distribution.\n", + " - Number of CS with at least one gene with L2G>=0.5\n", + " - Number of CS with more than one gene with L2G>=0.5" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "A total of 10561869 l2g predictions were computed.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "There are 8591772 UNIQUE locus to gene predictions for 607372 unique studyloci\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Where 357875 studyloci contains at least one gene with score > 0.5\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Of these, 122485 studyloci contains more than one gene with score > 0.5\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "l2g_path=f\"{release_path}/{release_ver}/locus_to_gene_predictions\"\n", + "l2g=session.spark.read.parquet(l2g_path, recursiveFileLookup=True)\n", + "l2g.select(\"score\").toPandas().plot.hist(bins=10, alpha=0.5, title=\"l2g scores\")\n", + "print(\"A total of \", l2g.select(\"studyLocusId\", \"geneId\").count(), \"l2g predictions were computed.\")\n", + "print(\"There are\", l2g.select(\"studyLocusId\", \"geneId\").distinct().count(), \" UNIQUE locus to gene predictions for\", l2g.select(\"studyLocusId\").distinct().count(), \" unique studyloci\")\n", + "print(\"Where \", l2g.filter(f.col(\"score\") > 0.5).select(\"studyLocusId\").distinct().count(), \" studyloci contains at least one gene with score > 0.5\")\n", + "print(\"Of these, \", l2g.filter(f.col(\"score\") > 0.5).groupBy(\"studyLocusId\").count().filter(f.col(\"count\") > 1).count(), \" studyloci contains more than one gene with score > 0.5\")\n", + "\n", + "# There are duplicated l2g predictions studyLocusId with finngen pics and susie" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "There are definitely duplicated studylocusIDs in the l2g predictions, and still around 20% of studyloci contain more than 1 gene with score>0.5. It is not possible to separate out these predictions based on whether they came from pics or susie, as the l2g outputs only contain the studylocusID (duplicated between pics and susie). \n", + "\n", + "If 20% is too high, then it implies that the finngen pics and susie l2g predictions are pointing (confidently) at different genes for the same studylocus.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### 7. 
Analyze locus-to-gene (L2G) predictions:\n", + " - Consider only the top gene assignments from l2g:" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from pyspark.sql import Window\n", + "\n", + "window = Window.partitionBy(l2g['studyLocusId']).orderBy(l2g['score'].desc())\n", + "l2g = l2g.withColumn('rn', f.row_number().over(window))\n", + "l2g_max_scores = l2g.filter(l2g['rn'] == 1).drop('rn')\n", + "l2g_max_scores.select(\"score\").toPandas().plot.hist(bins=10, alpha=0.5, title=\"l2g scores (top gene assignment)\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### 7. Analyze locus-to-gene (L2G) predictions:\n", + " - Consider only the top gene assignments from l2g:\n", + " - How does this look when separated by datasource?" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Summary of finngen PICS l2g scores: mean: 0.5615633023783144 L.quart: 0.2371208220720291 Median: 0.6145811676979065 U.quart: 0.8626531362533569\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "l2g_finngen_pics=l2g_max_scores.join(finngen_pics.select(\"studyLocusId\", \"studyId\"), on=\"studyLocusId\", how=\"inner\")\n", + "l2g_finngen_pics.select(\"score\").toPandas().plot.hist(bins=10, alpha=0.5, title=\"l2g scores (top gene assignment), finngen_pics\")\n", + "\n", + "sample_size_quartiles = l2g_finngen_pics.stat.approxQuantile(\"score\", [0.25, 0.5, 0.75], 0.01)\n", + "print(\"Summary of finngen PICS l2g scores: mean:\", l2g_finngen_pics.select(f.mean(l2g_finngen_pics[\"score\"])).collect()[0][0], \"L.quart: \", sample_size_quartiles[0], \"Median: \", sample_size_quartiles[1], \"U.quart: \", sample_size_quartiles[2])" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Summary of l2g_finngen_susie: mean: 0.5593829938336615 L.quart: 0.2241434007883072 Median: 0.6244931817054749 U.quart: 0.8736903071403503\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "l2g_finngen_susie=l2g_max_scores.join(finngen_susie.select(\"studyLocusId\", \"studyId\"), on=\"studyLocusId\", how=\"inner\")\n", + "l2g_finngen_susie.select(\"score\").toPandas().plot.hist(bins=10, alpha=0.5, title=\"(top gene assignment), l2g_finngen_susie\")\n", + "\n", + "sample_size_quartiles = l2g_finngen_susie.stat.approxQuantile(\"score\", [0.25, 0.5, 0.75], 0.01)\n", + "print(\"Summary of l2g_finngen_susie: mean:\", l2g_finngen_susie.select(f.mean(l2g_finngen_susie[\"score\"])).collect()[0][0], \"L.quart: \", sample_size_quartiles[0], \"Median: \", sample_size_quartiles[1], \"U.quart: \", sample_size_quartiles[2])" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Summary of l2g_gwas_curated PICS l2g scores: mean: 0.5761322645683256 L.quart: 0.22704362869262695 Median: 0.653652012348175 U.quart: 0.8937538862228394\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "l2g_gwas_curated=l2g_max_scores.join(gwascat_curated.select(\"studyLocusId\", \"studyId\"), on=\"studyLocusId\", how=\"inner\")\n", + "l2g_gwas_curated.select(\"score\").toPandas().plot.hist(bins=10, alpha=0.5, title=\"(top gene assignment), l2g_gwas_curated_pics\")\n", + "\n", + "sample_size_quartiles = l2g_gwas_curated.stat.approxQuantile(\"score\", [0.25, 0.5, 0.75], 0.01)\n", + "print(\"Summary of l2g_gwas_curated PICS l2g scores: mean:\", l2g_gwas_curated.select(f.mean(l2g_gwas_curated[\"score\"])).collect()[0][0], \"L.quart: \", sample_size_quartiles[0], \"Median: \", sample_size_quartiles[1], \"U.quart: \", sample_size_quartiles[2])" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Summary of l2g_gwas_sumstats PICS l2g scores: mean: 0.5751686123272477 L.quart: 0.23706066608428955 Median: 0.647024929523468 U.quart: 0.8808161020278931\n" + ] + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "l2g_gwas_sumstats=l2g_max_scores.join(gwascat_sumstats.select(\"studyLocusId\", \"studyId\"), on=\"studyLocusId\", how=\"inner\")\n", + "l2g_gwas_sumstats.select(\"score\").toPandas().plot.hist(bins=10, alpha=0.5, title=\"(top gene assignment), l2g_gwas_sumstats\")\n", + "\n", + "sample_size_quartiles = l2g_gwas_sumstats.stat.approxQuantile(\"score\", [0.25, 0.5, 0.75], 0.01)\n", + "print(\"Summary of l2g_gwas_sumstats PICS l2g scores: mean:\", l2g_gwas_sumstats.select(f.mean(l2g_gwas_sumstats[\"score\"])).collect()[0][0], \"L.quart: \", sample_size_quartiles[0], \"Median: \", sample_size_quartiles[1], \"U.quart: \", sample_size_quartiles[2])" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "gentropy-NMtW8s8F-py3.10", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.8" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/poetry.lock b/poetry.lock index f1b435867..8a3664c64 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. 
[[package]] name = "aiodns" @@ -1203,6 +1203,25 @@ files = [ [package.dependencies] colorama = {version = "*", markers = "platform_system == \"Windows\""} +[[package]] +name = "click-option-group" +version = "0.5.6" +description = "Option groups missing in Click" +optional = false +python-versions = ">=3.6,<4" +files = [ + {file = "click-option-group-0.5.6.tar.gz", hash = "sha256:97d06703873518cc5038509443742b25069a3c7562d1ea72ff08bfadde1ce777"}, + {file = "click_option_group-0.5.6-py3-none-any.whl", hash = "sha256:38a26d963ee3ad93332ddf782f9259c5bdfe405e73408d943ef5e7d0c3767ec7"}, +] + +[package.dependencies] +Click = ">=7.0,<9" + +[package.extras] +docs = ["Pallets-Sphinx-Themes", "m2r2", "sphinx"] +tests = ["pytest"] +tests-cov = ["coverage", "coveralls", "pytest", "pytest-cov"] + [[package]] name = "clickclick" version = "20.10.2" @@ -1627,21 +1646,27 @@ dev = ["PyTest", "PyTest-Cov", "bump2version (<1)", "sphinx (<2)", "tox"] [[package]] name = "deptry" -version = "0.12.0" +version = "0.16.1" description = "A command line utility to check for unused, missing and transitive dependencies in a Python project." 
optional = false -python-versions = ">=3.8,<4.0" +python-versions = ">=3.8" files = [ - {file = "deptry-0.12.0-py3-none-any.whl", hash = "sha256:69c801a6ae1b39c7b8e0daf40dbe8b75f1f161277d206dd8f921f32cd22dad91"}, - {file = "deptry-0.12.0.tar.gz", hash = "sha256:ac3cd32d149c92a9af12f63cd9486ddd1760f0277ed0cf306c6ef0388f57ff0a"}, + {file = "deptry-0.16.1-cp38-abi3-macosx_10_12_x86_64.whl", hash = "sha256:29ed8ae61b8f5664dd484717c79eef7ec66d965940efd828fca0d3c09220a1db"}, + {file = "deptry-0.16.1-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:738a772b538f51e9a7bb8d5cb9a61cfea8794a79371d171919b01cff0dc895bf"}, + {file = "deptry-0.16.1-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:56b78f7c860def8000e93f88345a24809f1b91e2f7836ac9a08285cb405e2762"}, + {file = "deptry-0.16.1-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c3e86a04ea87ddece0f68ba204feb950f588205808c8320e6628300f03ff66dc"}, + {file = "deptry-0.16.1-cp38-abi3-win_amd64.whl", hash = "sha256:01b5098739a56c93f3e1e40efec5f20452f22a9a8436a59809d46201fcb94bcf"}, + {file = "deptry-0.16.1-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:7e29dc4c1bbb933c9482e8cef85fafe2be7f46aeb90a8a07ba5f2b22af60876f"}, + {file = "deptry-0.16.1-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d8dfab68c247566c87a40f55f405be8549ffe4cea0b9b5384b7ae73a6f1d5cd1"}, + {file = "deptry-0.16.1-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1228493926b6e59cd2df7cb6016e10c255553cc31db24edcf7fc8d5474b81be6"}, + {file = "deptry-0.16.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:99c3ac60b78ad1b8fb9844c25393e7ebc969cc950601ce3c050f56d196da5a79"}, + {file = "deptry-0.16.1.tar.gz", hash = "sha256:39fb62da4a8f4d17ed282310f7bcaadec55a95a8c471b01e0fcdf5351a7ac323"}, ] [package.dependencies] -chardet = ">=4.0.0" -click = ">=8.0.0,<9.0.0" +click = ">=8.0.0,<9" colorama = {version = ">=0.4.6", markers = 
"sys_platform == \"win32\""} -pathspec = ">=0.9.0" -tomli = {version = ">=2.0.1,<3.0.0", markers = "python_version < \"3.11\""} +tomli = {version = ">=2.0.1", markers = "python_version < \"3.11\""} [[package]] name = "dill" @@ -1765,13 +1790,13 @@ test = ["pytest (>=6)"] [[package]] name = "execnet" -version = "2.0.2" +version = "2.1.1" description = "execnet: rapid multi-Python deployment" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "execnet-2.0.2-py3-none-any.whl", hash = "sha256:88256416ae766bc9e8895c76a87928c0012183da3cc4fc18016e6f050e025f41"}, - {file = "execnet-2.0.2.tar.gz", hash = "sha256:cc59bc4423742fd71ad227122eb0dd44db51efb3dc4095b45ac9a08c770096af"}, + {file = "execnet-2.1.1-py3-none-any.whl", hash = "sha256:26dee51f1b80cebd6d0ca8e74dd8745419761d3bef34163928cbebbdc4749fdc"}, + {file = "execnet-2.1.1.tar.gz", hash = "sha256:5189b52c6121c24feae288166ab41b32549c7e2348652736540b9e6e7d4e72e3"}, ] [package.extras] @@ -3426,6 +3451,156 @@ files = [ {file = "google_re2-1.1-3-cp39-cp39-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d140c7b9395b4d1e654127aa1c99bcc603ed01000b7bc7e28c52562f1894ec12"}, {file = "google_re2-1.1-3-cp39-cp39-win32.whl", hash = "sha256:80c5fc200f64b2d903eeb07b8d6cefc620a872a0240c7caaa9aca05b20f5568f"}, {file = "google_re2-1.1-3-cp39-cp39-win_amd64.whl", hash = "sha256:9eb6dbcee9b5dc4069bbc0634f2eb039ca524a14bed5868fdf6560aaafcbca06"}, + {file = "google_re2-1.1-4-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:0db114d7e1aa96dbcea452a40136d7d747d60cbb61394965774688ef59cccd4e"}, + {file = "google_re2-1.1-4-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:82133958e003a1344e5b7a791b9a9dd7560b5c8f96936dbe16f294604524a633"}, + {file = "google_re2-1.1-4-cp310-cp310-macosx_13_0_arm64.whl", hash = "sha256:9e74fd441d1f3d917d3303e319f61b82cdbd96b9a5ba919377a6eef1504a1e2b"}, + {file = "google_re2-1.1-4-cp310-cp310-macosx_13_0_x86_64.whl", hash = 
"sha256:734a2e7a4541c57253b5ebee24f3f3366ba3658bcad01da25fb623c78723471a"}, + {file = "google_re2-1.1-4-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:d88d5eecbc908abe16132456fae13690d0508f3ac5777f320ef95cb6cab9a961"}, + {file = "google_re2-1.1-4-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:b91db80b171ecec435a07977a227757dd487356701a32f556fa6fca5d0a40522"}, + {file = "google_re2-1.1-4-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6b23129887a64bb9948af14c84705273ed1a40054e99433b4acccab4dcf6a226"}, + {file = "google_re2-1.1-4-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5dc1a0cc7cd19261dcaf76763e2499305dbb7e51dc69555167cdb8af98782698"}, + {file = "google_re2-1.1-4-cp310-cp310-win32.whl", hash = "sha256:3b2ab1e2420b5dd9743a2d6bc61b64e5f708563702a75b6db86637837eaeaf2f"}, + {file = "google_re2-1.1-4-cp310-cp310-win_amd64.whl", hash = "sha256:92efca1a7ef83b6df012d432a1cbc71d10ff42200640c0f9a5ff5b343a48e633"}, + {file = "google_re2-1.1-4-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:854818fd4ce79787aca5ba459d6e5abe4ca9be2c684a5b06a7f1757452ca3708"}, + {file = "google_re2-1.1-4-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:4ceef51174b6f653b6659a8fdaa9c38960c5228b44b25be2a3bcd8566827554f"}, + {file = "google_re2-1.1-4-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:ee49087c3db7e6f5238105ab5299c09e9b77516fe8cfb0a37e5f1e813d76ecb8"}, + {file = "google_re2-1.1-4-cp311-cp311-macosx_13_0_x86_64.whl", hash = "sha256:dc2312854bdc01410acc5d935f1906a49cb1f28980341c20a68797ad89d8e178"}, + {file = "google_re2-1.1-4-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:0dc0d2e42296fa84a3cb3e1bd667c6969389cd5cdf0786e6b1f911ae2d75375b"}, + {file = "google_re2-1.1-4-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:6bf04ced98453b035f84320f348f67578024f44d2997498def149054eb860ae8"}, + {file = "google_re2-1.1-4-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = 
"sha256:1d6b6ef11dc4ab322fa66c2f3561925f2b5372a879c3ed764d20e939e2fd3e5f"}, + {file = "google_re2-1.1-4-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0dcde6646fa9a97fd3692b3f6ae7daf7f3277d7500b6c253badeefa11db8956a"}, + {file = "google_re2-1.1-4-cp311-cp311-win32.whl", hash = "sha256:5f4f0229deb057348893574d5b0a96d055abebac6debf29d95b0c0e26524c9f6"}, + {file = "google_re2-1.1-4-cp311-cp311-win_amd64.whl", hash = "sha256:4713ddbe48a18875270b36a462b0eada5e84d6826f8df7edd328d8706b6f9d07"}, + {file = "google_re2-1.1-4-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:40a698300b8faddbb325662973f839489c89b960087060bd389c376828978a04"}, + {file = "google_re2-1.1-4-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:103d2d7ac92ba23911a151fd1fc7035cbf6dc92a7f6aea92270ebceb5cd5acd3"}, + {file = "google_re2-1.1-4-cp312-cp312-macosx_13_0_arm64.whl", hash = "sha256:51fb7182bccab05e8258a2b6a63dda1a6b4a9e8dfb9b03ec50e50c49c2827dd4"}, + {file = "google_re2-1.1-4-cp312-cp312-macosx_13_0_x86_64.whl", hash = "sha256:65383022abd63d7b620221eba7935132b53244b8b463d8fdce498c93cf58b7b7"}, + {file = "google_re2-1.1-4-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:396281fc68a9337157b3ffcd9392c6b7fcb8aab43e5bdab496262a81d56a4ecc"}, + {file = "google_re2-1.1-4-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:8198adcfcff1c680e052044124621730fc48d08005f90a75487f5651f1ebfce2"}, + {file = "google_re2-1.1-4-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:81f7bff07c448aec4db9ca453d2126ece8710dbd9278b8bb09642045d3402a96"}, + {file = "google_re2-1.1-4-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b7dacf730fd7d6ec71b11d6404b0b26e230814bfc8e9bb0d3f13bec9b5531f8d"}, + {file = "google_re2-1.1-4-cp312-cp312-win32.whl", hash = "sha256:8c764f62f4b1d89d1ef264853b6dd9fee14a89e9b86a81bc2157fe3531425eb4"}, + {file = "google_re2-1.1-4-cp312-cp312-win_amd64.whl", hash = 
"sha256:0be2666df4bc5381a5d693585f9bbfefb0bfd3c07530d7e403f181f5de47254a"}, + {file = "google_re2-1.1-4-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:5cb1b63a0bfd8dd65d39d2f3b2e5ae0a06ce4b2ce5818a1d1fc78a786a252673"}, + {file = "google_re2-1.1-4-cp38-cp38-macosx_12_0_x86_64.whl", hash = "sha256:e41751ce6b67a95230edd0772226dc94c2952a2909674cd69df9804ed0125307"}, + {file = "google_re2-1.1-4-cp38-cp38-macosx_13_0_arm64.whl", hash = "sha256:b998cfa2d50bf4c063e777c999a7e8645ec7e5d7baf43ad71b1e2e10bb0300c3"}, + {file = "google_re2-1.1-4-cp38-cp38-macosx_13_0_x86_64.whl", hash = "sha256:226ca3b0c2e970f3fc82001ac89e845ecc7a4bb7c68583e7a76cda70b61251a7"}, + {file = "google_re2-1.1-4-cp38-cp38-macosx_14_0_arm64.whl", hash = "sha256:9adec1f734ebad7c72e56c85f205a281d8fe9bf6583bc21020157d3f2812ce89"}, + {file = "google_re2-1.1-4-cp38-cp38-macosx_14_0_x86_64.whl", hash = "sha256:9c34f3c64ba566af967d29e11299560e6fdfacd8ca695120a7062b6ed993b179"}, + {file = "google_re2-1.1-4-cp38-cp38-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e1b85385fe293838e0d0b6e19e6c48ba8c6f739ea92ce2e23b718afe7b343363"}, + {file = "google_re2-1.1-4-cp38-cp38-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4694daa8a8987cfb568847aa872f9990e930c91a68c892ead876411d4b9012c3"}, + {file = "google_re2-1.1-4-cp38-cp38-win32.whl", hash = "sha256:5e671e9be1668187e2995aac378de574fa40df70bb6f04657af4d30a79274ce0"}, + {file = "google_re2-1.1-4-cp38-cp38-win_amd64.whl", hash = "sha256:f66c164d6049a8299f6dfcfa52d1580576b4b9724d6fcdad2f36f8f5da9304b6"}, + {file = "google_re2-1.1-4-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:25cb17ae0993a48c70596f3a3ef5d659638106401cc8193f51c0d7961b3b3eb7"}, + {file = "google_re2-1.1-4-cp39-cp39-macosx_12_0_x86_64.whl", hash = "sha256:5f101f86d14ca94ca4dcf63cceaa73d351f2be2481fcaa29d9e68eeab0dc2a88"}, + {file = "google_re2-1.1-4-cp39-cp39-macosx_13_0_arm64.whl", hash = "sha256:4e82591e85bf262a6d74cff152867e05fc97867c68ba81d6836ff8b0e7e62365"}, 
+ {file = "google_re2-1.1-4-cp39-cp39-macosx_13_0_x86_64.whl", hash = "sha256:1f61c09b93ffd34b1e2557e5a9565039f935407a5786dbad46f64f1a484166e6"}, + {file = "google_re2-1.1-4-cp39-cp39-macosx_14_0_arm64.whl", hash = "sha256:12b390ad8c7e74bab068732f774e75e0680dade6469b249a721f3432f90edfc3"}, + {file = "google_re2-1.1-4-cp39-cp39-macosx_14_0_x86_64.whl", hash = "sha256:1284343eb31c2e82ed2d8159f33ba6842238a56782c881b07845a6d85613b055"}, + {file = "google_re2-1.1-4-cp39-cp39-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6c7b38e0daf2c06e4d3163f4c732ab3ad2521aecfed6605b69e4482c612da303"}, + {file = "google_re2-1.1-4-cp39-cp39-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1f4d4f0823e8b2f6952a145295b1ff25245ce9bb136aff6fe86452e507d4c1dd"}, + {file = "google_re2-1.1-4-cp39-cp39-win32.whl", hash = "sha256:1afae56b2a07bb48cfcfefaa15ed85bae26a68f5dc7f9e128e6e6ea36914e847"}, + {file = "google_re2-1.1-4-cp39-cp39-win_amd64.whl", hash = "sha256:aa7d6d05911ab9c8adbf3c225a7a120ab50fd2784ac48f2f0d140c0b7afc2b55"}, + {file = "google_re2-1.1-5-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:222fc2ee0e40522de0b21ad3bc90ab8983be3bf3cec3d349c80d76c8bb1a4beb"}, + {file = "google_re2-1.1-5-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:d4763b0b9195b72132a4e7de8e5a9bf1f05542f442a9115aa27cfc2a8004f581"}, + {file = "google_re2-1.1-5-cp310-cp310-macosx_13_0_arm64.whl", hash = "sha256:209649da10c9d4a93d8a4d100ecbf9cc3b0252169426bec3e8b4ad7e57d600cf"}, + {file = "google_re2-1.1-5-cp310-cp310-macosx_13_0_x86_64.whl", hash = "sha256:68813aa333c1604a2df4a495b2a6ed065d7c8aebf26cc7e7abb5a6835d08353c"}, + {file = "google_re2-1.1-5-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:370a23ec775ad14e9d1e71474d56f381224dcf3e72b15d8ca7b4ad7dd9cd5853"}, + {file = "google_re2-1.1-5-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:14664a66a3ddf6bc9e56f401bf029db2d169982c53eff3f5876399104df0e9a6"}, + {file = 
"google_re2-1.1-5-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3ea3722cc4932cbcebd553b69dce1b4a73572823cff4e6a244f1c855da21d511"}, + {file = "google_re2-1.1-5-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e14bb264c40fd7c627ef5678e295370cd6ba95ca71d835798b6e37502fc4c690"}, + {file = "google_re2-1.1-5-cp310-cp310-win32.whl", hash = "sha256:39512cd0151ea4b3969c992579c79b423018b464624ae955be685fc07d94556c"}, + {file = "google_re2-1.1-5-cp310-cp310-win_amd64.whl", hash = "sha256:ac66537aa3bc5504320d922b73156909e3c2b6da19739c866502f7827b3f9fdf"}, + {file = "google_re2-1.1-5-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:5b5ea68d54890c9edb1b930dcb2658819354e5d3f2201f811798bbc0a142c2b4"}, + {file = "google_re2-1.1-5-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:33443511b6b83c35242370908efe2e8e1e7cae749c766b2b247bf30e8616066c"}, + {file = "google_re2-1.1-5-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:413d77bdd5ba0bfcada428b4c146e87707452ec50a4091ec8e8ba1413d7e0619"}, + {file = "google_re2-1.1-5-cp311-cp311-macosx_13_0_x86_64.whl", hash = "sha256:5171686e43304996a34baa2abcee6f28b169806d0e583c16d55e5656b092a414"}, + {file = "google_re2-1.1-5-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:3b284db130283771558e31a02d8eb8fb756156ab98ce80035ae2e9e3a5f307c4"}, + {file = "google_re2-1.1-5-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:296e6aed0b169648dc4b870ff47bd34c702a32600adb9926154569ef51033f47"}, + {file = "google_re2-1.1-5-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:38d50e68ead374160b1e656bbb5d101f0b95fb4cc57f4a5c12100155001480c5"}, + {file = "google_re2-1.1-5-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2a0416a35921e5041758948bcb882456916f22845f66a93bc25070ef7262b72a"}, + {file = "google_re2-1.1-5-cp311-cp311-win32.whl", hash = "sha256:a1d59568bbb5de5dd56dd6cdc79907db26cce63eb4429260300c65f43469e3e7"}, + {file = 
"google_re2-1.1-5-cp311-cp311-win_amd64.whl", hash = "sha256:72f5a2f179648b8358737b2b493549370debd7d389884a54d331619b285514e3"}, + {file = "google_re2-1.1-5-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:cbc72c45937b1dc5acac3560eb1720007dccca7c9879138ff874c7f6baf96005"}, + {file = "google_re2-1.1-5-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:5fadd1417fbef7235fa9453dba4eb102e6e7d94b1e4c99d5fa3dd4e288d0d2ae"}, + {file = "google_re2-1.1-5-cp312-cp312-macosx_13_0_arm64.whl", hash = "sha256:040f85c63cc02696485b59b187a5ef044abe2f99b92b4fb399de40b7d2904ccc"}, + {file = "google_re2-1.1-5-cp312-cp312-macosx_13_0_x86_64.whl", hash = "sha256:64e3b975ee6d9bbb2420494e41f929c1a0de4bcc16d86619ab7a87f6ea80d6bd"}, + {file = "google_re2-1.1-5-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:8ee370413e00f4d828eaed0e83b8af84d7a72e8ee4f4bd5d3078bc741dfc430a"}, + {file = "google_re2-1.1-5-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:5b89383001079323f693ba592d7aad789d7a02e75adb5d3368d92b300f5963fd"}, + {file = "google_re2-1.1-5-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:63cb4fdfbbda16ae31b41a6388ea621510db82feb8217a74bf36552ecfcd50ad"}, + {file = "google_re2-1.1-5-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9ebedd84ae8be10b7a71a16162376fd67a2386fe6361ef88c622dcf7fd679daf"}, + {file = "google_re2-1.1-5-cp312-cp312-win32.whl", hash = "sha256:c8e22d1692bc2c81173330c721aff53e47ffd3c4403ff0cd9d91adfd255dd150"}, + {file = "google_re2-1.1-5-cp312-cp312-win_amd64.whl", hash = "sha256:5197a6af438bb8c4abda0bbe9c4fbd6c27c159855b211098b29d51b73e4cbcf6"}, + {file = "google_re2-1.1-5-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:b6727e0b98417e114b92688ad2aa256102ece51f29b743db3d831df53faf1ce3"}, + {file = "google_re2-1.1-5-cp38-cp38-macosx_12_0_x86_64.whl", hash = "sha256:711e2b6417eb579c61a4951029d844f6b95b9b373b213232efd413659889a363"}, + {file = "google_re2-1.1-5-cp38-cp38-macosx_13_0_arm64.whl", hash = 
"sha256:71ae8b3df22c5c154c8af0f0e99d234a450ef1644393bc2d7f53fc8c0a1e111c"}, + {file = "google_re2-1.1-5-cp38-cp38-macosx_13_0_x86_64.whl", hash = "sha256:94a04e214bc521a3807c217d50cf099bbdd0c0a80d2d996c0741dbb995b5f49f"}, + {file = "google_re2-1.1-5-cp38-cp38-macosx_14_0_arm64.whl", hash = "sha256:a770f75358508a9110c81a1257721f70c15d9bb592a2fb5c25ecbd13566e52a5"}, + {file = "google_re2-1.1-5-cp38-cp38-macosx_14_0_x86_64.whl", hash = "sha256:07c9133357f7e0b17c6694d5dcb82e0371f695d7c25faef2ff8117ef375343ff"}, + {file = "google_re2-1.1-5-cp38-cp38-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:204ca6b1cf2021548f4a9c29ac015e0a4ab0a7b6582bf2183d838132b60c8fda"}, + {file = "google_re2-1.1-5-cp38-cp38-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f0b95857c2c654f419ca684ec38c9c3325c24e6ba7d11910a5110775a557bb18"}, + {file = "google_re2-1.1-5-cp38-cp38-win32.whl", hash = "sha256:347ac770e091a0364e822220f8d26ab53e6fdcdeaec635052000845c5a3fb869"}, + {file = "google_re2-1.1-5-cp38-cp38-win_amd64.whl", hash = "sha256:ec32bb6de7ffb112a07d210cf9f797b7600645c2d5910703fa07f456dd2150e0"}, + {file = "google_re2-1.1-5-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:eb5adf89060f81c5ff26c28e261e6b4997530a923a6093c9726b8dec02a9a326"}, + {file = "google_re2-1.1-5-cp39-cp39-macosx_12_0_x86_64.whl", hash = "sha256:a22630c9dd9ceb41ca4316bccba2643a8b1d5c198f21c00ed5b50a94313aaf10"}, + {file = "google_re2-1.1-5-cp39-cp39-macosx_13_0_arm64.whl", hash = "sha256:544dc17fcc2d43ec05f317366375796351dec44058e1164e03c3f7d050284d58"}, + {file = "google_re2-1.1-5-cp39-cp39-macosx_13_0_x86_64.whl", hash = "sha256:19710af5ea88751c7768575b23765ce0dfef7324d2539de576f75cdc319d6654"}, + {file = "google_re2-1.1-5-cp39-cp39-macosx_14_0_arm64.whl", hash = "sha256:f82995a205e08ad896f4bd5ce4847c834fab877e1772a44e5f262a647d8a1dec"}, + {file = "google_re2-1.1-5-cp39-cp39-macosx_14_0_x86_64.whl", hash = "sha256:63533c4d58da9dc4bc040250f1f52b089911699f0368e0e6e15f996387a984ed"}, 
+ {file = "google_re2-1.1-5-cp39-cp39-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:79e00fcf0cb04ea35a22b9014712d448725ce4ddc9f08cc818322566176ca4b0"}, + {file = "google_re2-1.1-5-cp39-cp39-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bc41afcefee2da6c4ed883a93d7f527c4b960cd1d26bbb0020a7b8c2d341a60a"}, + {file = "google_re2-1.1-5-cp39-cp39-win32.whl", hash = "sha256:486730b5e1f1c31b0abc6d80abe174ce4f1188fe17d1b50698f2bf79dc6e44be"}, + {file = "google_re2-1.1-5-cp39-cp39-win_amd64.whl", hash = "sha256:4de637ca328f1d23209e80967d1b987d6b352cd01b3a52a84b4d742c69c3da6c"}, + {file = "google_re2-1.1-6-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:621e9c199d1ff0fdb2a068ad450111a84b3bf14f96dfe5a8a7a0deae5f3f4cce"}, + {file = "google_re2-1.1-6-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:220acd31e7dde95373f97c3d1f3b3bd2532b38936af28b1917ee265d25bebbf4"}, + {file = "google_re2-1.1-6-cp310-cp310-macosx_13_0_arm64.whl", hash = "sha256:db34e1098d164f76251a6ece30e8f0ddfd65bb658619f48613ce71acb3f9cbdb"}, + {file = "google_re2-1.1-6-cp310-cp310-macosx_13_0_x86_64.whl", hash = "sha256:5152bac41d8073977582f06257219541d0fc46ad99b0bbf30e8f60198a43b08c"}, + {file = "google_re2-1.1-6-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:6191294799e373ee1735af91f55abd23b786bdfd270768a690d9d55af9ea1b0d"}, + {file = "google_re2-1.1-6-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:070cbafbb4fecbb02e98feb28a1eb292fb880f434d531f38cc33ee314b521f1f"}, + {file = "google_re2-1.1-6-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8437d078b405a59a576cbed544490fe041140f64411f2d91012e8ec05ab8bf86"}, + {file = "google_re2-1.1-6-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f00f9a9af8896040e37896d9b9fc409ad4979f1ddd85bb188694a7d95ddd1164"}, + {file = "google_re2-1.1-6-cp310-cp310-win32.whl", hash = "sha256:df26345f229a898b4fd3cafd5f82259869388cee6268fc35af16a8e2293dd4e5"}, + {file = 
"google_re2-1.1-6-cp310-cp310-win_amd64.whl", hash = "sha256:3665d08262c57c9b28a5bdeb88632ad792c4e5f417e5645901695ab2624f5059"}, + {file = "google_re2-1.1-6-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:b26b869d8aa1d8fe67c42836bf3416bb72f444528ee2431cfb59c0d3e02c6ce3"}, + {file = "google_re2-1.1-6-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:41fd4486c57dea4f222a6bb7f1ff79accf76676a73bdb8da0fcbd5ba73f8da71"}, + {file = "google_re2-1.1-6-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:0ee378e2e74e25960070c338c28192377c4dd41e7f4608f2688064bd2badc41e"}, + {file = "google_re2-1.1-6-cp311-cp311-macosx_13_0_x86_64.whl", hash = "sha256:a00cdbf662693367b36d075b29feb649fd7ee1b617cf84f85f2deebeda25fc64"}, + {file = "google_re2-1.1-6-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:4c09455014217a41499432b8c8f792f25f3df0ea2982203c3a8c8ca0e7895e69"}, + {file = "google_re2-1.1-6-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:6501717909185327935c7945e23bb5aa8fc7b6f237b45fe3647fa36148662158"}, + {file = "google_re2-1.1-6-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3510b04790355f199e7861c29234081900e1e1cbf2d1484da48aa0ba6d7356ab"}, + {file = "google_re2-1.1-6-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8c0e64c187ca406764f9e9ad6e750d62e69ed8f75bf2e865d0bfbc03b642361c"}, + {file = "google_re2-1.1-6-cp311-cp311-win32.whl", hash = "sha256:2a199132350542b0de0f31acbb3ca87c3a90895d1d6e5235f7792bb0af02e523"}, + {file = "google_re2-1.1-6-cp311-cp311-win_amd64.whl", hash = "sha256:83bdac8ceaece8a6db082ea3a8ba6a99a2a1ee7e9f01a9d6d50f79c6f251a01d"}, + {file = "google_re2-1.1-6-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:81985ff894cd45ab5a73025922ac28c0707759db8171dd2f2cc7a0e856b6b5ad"}, + {file = "google_re2-1.1-6-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:5635af26065e6b45456ccbea08674ae2ab62494008d9202df628df3b267bc095"}, + {file = "google_re2-1.1-6-cp312-cp312-macosx_13_0_arm64.whl", hash 
= "sha256:813b6f04de79f4a8fdfe05e2cb33e0ccb40fe75d30ba441d519168f9d958bd54"}, + {file = "google_re2-1.1-6-cp312-cp312-macosx_13_0_x86_64.whl", hash = "sha256:5ec2f5332ad4fd232c3f2d6748c2c7845ccb66156a87df73abcc07f895d62ead"}, + {file = "google_re2-1.1-6-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:5a687b3b32a6cbb731647393b7c4e3fde244aa557f647df124ff83fb9b93e170"}, + {file = "google_re2-1.1-6-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:39a62f9b3db5d3021a09a47f5b91708b64a0580193e5352751eb0c689e4ad3d7"}, + {file = "google_re2-1.1-6-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ca0f0b45d4a1709cbf5d21f355e5809ac238f1ee594625a1e5ffa9ff7a09eb2b"}, + {file = "google_re2-1.1-6-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a64b3796a7a616c7861247bd061c9a836b5caf0d5963e5ea8022125601cf7b09"}, + {file = "google_re2-1.1-6-cp312-cp312-win32.whl", hash = "sha256:32783b9cb88469ba4cd9472d459fe4865280a6b1acdad4480a7b5081144c4eb7"}, + {file = "google_re2-1.1-6-cp312-cp312-win_amd64.whl", hash = "sha256:259ff3fd2d39035b9cbcbf375995f83fa5d9e6a0c5b94406ff1cc168ed41d6c6"}, + {file = "google_re2-1.1-6-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:e4711bcffe190acd29104d8ecfea0c0e42b754837de3fb8aad96e6cc3c613cdc"}, + {file = "google_re2-1.1-6-cp38-cp38-macosx_12_0_x86_64.whl", hash = "sha256:4d081cce43f39c2e813fe5990e1e378cbdb579d3f66ded5bade96130269ffd75"}, + {file = "google_re2-1.1-6-cp38-cp38-macosx_13_0_arm64.whl", hash = "sha256:4f123b54d48450d2d6b14d8fad38e930fb65b5b84f1b022c10f2913bd956f5b5"}, + {file = "google_re2-1.1-6-cp38-cp38-macosx_13_0_x86_64.whl", hash = "sha256:e1928b304a2b591a28eb3175f9db7f17c40c12cf2d4ec2a85fdf1cc9c073ff91"}, + {file = "google_re2-1.1-6-cp38-cp38-macosx_14_0_arm64.whl", hash = "sha256:3a69f76146166aec1173003c1f547931bdf288c6b135fda0020468492ac4149f"}, + {file = "google_re2-1.1-6-cp38-cp38-macosx_14_0_x86_64.whl", hash = 
"sha256:fc08c388f4ebbbca345e84a0c56362180d33d11cbe9ccfae663e4db88e13751e"}, + {file = "google_re2-1.1-6-cp38-cp38-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b057adf38ce4e616486922f2f47fc7d19c827ba0a7f69d540a3664eba2269325"}, + {file = "google_re2-1.1-6-cp38-cp38-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4138c0b933ab099e96f5d8defce4486f7dfd480ecaf7f221f2409f28022ccbc5"}, + {file = "google_re2-1.1-6-cp38-cp38-win32.whl", hash = "sha256:9693e45b37b504634b1abbf1ee979471ac6a70a0035954592af616306ab05dd6"}, + {file = "google_re2-1.1-6-cp38-cp38-win_amd64.whl", hash = "sha256:5674d437baba0ea287a5a7f8f81f24265d6ae8f8c09384e2ef7b6f84b40a7826"}, + {file = "google_re2-1.1-6-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:7783137cb2e04f458a530c6d0ee9ef114815c1d48b9102f023998c371a3b060e"}, + {file = "google_re2-1.1-6-cp39-cp39-macosx_12_0_x86_64.whl", hash = "sha256:a49b7153935e7a303675f4deb5f5d02ab1305adefc436071348706d147c889e0"}, + {file = "google_re2-1.1-6-cp39-cp39-macosx_13_0_arm64.whl", hash = "sha256:a96a8bb309182090704593c60bdb369a2756b38fe358bbf0d40ddeb99c71769f"}, + {file = "google_re2-1.1-6-cp39-cp39-macosx_13_0_x86_64.whl", hash = "sha256:dff3d4be9f27ef8ec3705eed54f19ef4ab096f5876c15fe011628c69ba3b561c"}, + {file = "google_re2-1.1-6-cp39-cp39-macosx_14_0_arm64.whl", hash = "sha256:40f818b0b39e26811fa677978112a8108269977fdab2ba0453ac4363c35d9e66"}, + {file = "google_re2-1.1-6-cp39-cp39-macosx_14_0_x86_64.whl", hash = "sha256:8a7e53538cdb40ef4296017acfbb05cab0c19998be7552db1cfb85ba40b171b9"}, + {file = "google_re2-1.1-6-cp39-cp39-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6ee18e7569fb714e5bb8c42809bf8160738637a5e71ed5a4797757a1fb4dc4de"}, + {file = "google_re2-1.1-6-cp39-cp39-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1cda4f6d1a7d5b43ea92bc395f23853fba0caf8b1e1efa6e8c48685f912fcb89"}, + {file = "google_re2-1.1-6-cp39-cp39-win32.whl", hash = 
"sha256:6a9cdbdc36a2bf24f897be6a6c85125876dc26fea9eb4247234aec0decbdccfd"}, + {file = "google_re2-1.1-6-cp39-cp39-win_amd64.whl", hash = "sha256:73f646cecfad7cc5b4330b4192c25f2e29730a3b8408e089ffd2078094208196"}, ] [[package]] @@ -3552,13 +3727,13 @@ test = ["objgraph", "psutil"] [[package]] name = "griffe" -version = "0.38.0" +version = "0.44.0" description = "Signatures for entire Python programs. Extract the structure, the frame, the skeleton of your project, to generate API documentation or find breaking changes in your API." optional = false python-versions = ">=3.8" files = [ - {file = "griffe-0.38.0-py3-none-any.whl", hash = "sha256:6a5bc457320e8e199006aa5fbb03e162f5e21abe31aa6221f7a5c37ea0724c71"}, - {file = "griffe-0.38.0.tar.gz", hash = "sha256:9b97487b583042b543d1e28196caee638ecd766c8c4c98135071806cb5333ac2"}, + {file = "griffe-0.44.0-py3-none-any.whl", hash = "sha256:8a4471c469ba980b87c843f1168850ce39d0c1d0c7be140dca2480f76c8e5446"}, + {file = "griffe-0.44.0.tar.gz", hash = "sha256:34aee1571042f9bf00529bc715de4516fb6f482b164e90d030300601009e0223"}, ] [package.dependencies] @@ -3925,13 +4100,13 @@ files = [ [[package]] name = "interrogate" -version = "1.5.0" +version = "1.7.0" description = "Interrogate a codebase for docstring coverage." 
optional = false -python-versions = ">=3.6" +python-versions = ">=3.8" files = [ - {file = "interrogate-1.5.0-py3-none-any.whl", hash = "sha256:a4ccc5cbd727c74acc98dee6f5e79ef264c0bcfa66b68d4e123069b2af89091a"}, - {file = "interrogate-1.5.0.tar.gz", hash = "sha256:b6f325f0aa84ac3ac6779d8708264d366102226c5af7d69058cecffcff7a6d6c"}, + {file = "interrogate-1.7.0-py3-none-any.whl", hash = "sha256:b13ff4dd8403369670e2efe684066de9fcb868ad9d7f2b4095d8112142dc9d12"}, + {file = "interrogate-1.7.0.tar.gz", hash = "sha256:a320d6ec644dfd887cc58247a345054fc4d9f981100c45184470068f4b3719b0"}, ] [package.dependencies] @@ -3940,13 +4115,13 @@ click = ">=7.1" colorama = "*" py = "*" tabulate = "*" -toml = "*" +tomli = {version = "*", markers = "python_version < \"3.11\""} [package.extras] -dev = ["cairosvg", "pre-commit", "pytest", "pytest-cov", "pytest-mock", "sphinx", "sphinx-autobuild", "wheel"] +dev = ["cairosvg", "coverage[toml]", "pre-commit", "pytest", "pytest-cov", "pytest-mock", "sphinx", "sphinx-autobuild", "wheel"] docs = ["sphinx", "sphinx-autobuild"] png = ["cairosvg"] -tests = ["pytest", "pytest-cov", "pytest-mock"] +tests = ["coverage[toml]", "pytest", "pytest-cov", "pytest-mock"] [[package]] name = "ipykernel" @@ -3983,13 +4158,13 @@ test = ["flaky", "ipyparallel", "pre-commit", "pytest (>=7.0)", "pytest-asyncio [[package]] name = "ipython" -version = "8.22.1" +version = "8.24.0" description = "IPython: Productive Interactive Computing" optional = false python-versions = ">=3.10" files = [ - {file = "ipython-8.22.1-py3-none-any.whl", hash = "sha256:869335e8cded62ffb6fac8928e5287a05433d6462e3ebaac25f4216474dd6bc4"}, - {file = "ipython-8.22.1.tar.gz", hash = "sha256:39c6f9efc079fb19bfb0f17eee903978fe9a290b1b82d68196c641cecb76ea22"}, + {file = "ipython-8.24.0-py3-none-any.whl", hash = "sha256:d7bf2f6c4314984e3e02393213bab8703cf163ede39672ce5918c51fe253a2a3"}, + {file = "ipython-8.24.0.tar.gz", hash = 
"sha256:010db3f8a728a578bb641fdd06c063b9fb8e96a9464c63aec6310fbcb5e80501"}, ] [package.dependencies] @@ -4003,18 +4178,20 @@ prompt-toolkit = ">=3.0.41,<3.1.0" pygments = ">=2.4.0" stack-data = "*" traitlets = ">=5.13.0" +typing-extensions = {version = ">=4.6", markers = "python_version < \"3.12\""} [package.extras] -all = ["ipython[black,doc,kernel,nbconvert,nbformat,notebook,parallel,qtconsole,terminal]", "ipython[test,test-extra]"] +all = ["ipython[black,doc,kernel,matplotlib,nbconvert,nbformat,notebook,parallel,qtconsole]", "ipython[test,test-extra]"] black = ["black"] doc = ["docrepr", "exceptiongroup", "ipykernel", "ipython[test]", "matplotlib", "setuptools (>=18.5)", "sphinx (>=1.3)", "sphinx-rtd-theme", "sphinxcontrib-jquery", "stack-data", "typing-extensions"] kernel = ["ipykernel"] +matplotlib = ["matplotlib"] nbconvert = ["nbconvert"] nbformat = ["nbformat"] notebook = ["ipywidgets", "notebook"] parallel = ["ipyparallel"] qtconsole = ["qtconsole"] -test = ["pickleshare", "pytest (<8)", "pytest-asyncio (<0.22)", "testpath"] +test = ["pickleshare", "pytest", "pytest-asyncio (<0.22)", "testpath"] test-extra = ["curio", "ipython[test]", "matplotlib (!=3.2.0)", "nbformat", "numpy (>=1.23)", "pandas", "trio"] [[package]] @@ -4353,96 +4530,173 @@ typing-extensions = ">=4.1.1" [[package]] name = "lxml" -version = "5.1.0" +version = "5.2.1" description = "Powerful and Pythonic XML processing library combining libxml2/libxslt with the ElementTree API." 
optional = false python-versions = ">=3.6" files = [ - {file = "lxml-5.1.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:704f5572ff473a5f897745abebc6df40f22d4133c1e0a1f124e4f2bd3330ff7e"}, - {file = "lxml-5.1.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:9d3c0f8567ffe7502d969c2c1b809892dc793b5d0665f602aad19895f8d508da"}, - {file = "lxml-5.1.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:5fcfbebdb0c5d8d18b84118842f31965d59ee3e66996ac842e21f957eb76138c"}, - {file = "lxml-5.1.0-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2f37c6d7106a9d6f0708d4e164b707037b7380fcd0b04c5bd9cae1fb46a856fb"}, - {file = "lxml-5.1.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2befa20a13f1a75c751f47e00929fb3433d67eb9923c2c0b364de449121f447c"}, - {file = "lxml-5.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:22b7ee4c35f374e2c20337a95502057964d7e35b996b1c667b5c65c567d2252a"}, - {file = "lxml-5.1.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:bf8443781533b8d37b295016a4b53c1494fa9a03573c09ca5104550c138d5c05"}, - {file = "lxml-5.1.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:82bddf0e72cb2af3cbba7cec1d2fd11fda0de6be8f4492223d4a268713ef2147"}, - {file = "lxml-5.1.0-cp310-cp310-win32.whl", hash = "sha256:b66aa6357b265670bb574f050ffceefb98549c721cf28351b748be1ef9577d93"}, - {file = "lxml-5.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:4946e7f59b7b6a9e27bef34422f645e9a368cb2be11bf1ef3cafc39a1f6ba68d"}, - {file = "lxml-5.1.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:14deca1460b4b0f6b01f1ddc9557704e8b365f55c63070463f6c18619ebf964f"}, - {file = "lxml-5.1.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ed8c3d2cd329bf779b7ed38db176738f3f8be637bb395ce9629fc76f78afe3d4"}, - {file = "lxml-5.1.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:436a943c2900bb98123b06437cdd30580a61340fbdb7b28aaf345a459c19046a"}, - 
{file = "lxml-5.1.0-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:acb6b2f96f60f70e7f34efe0c3ea34ca63f19ca63ce90019c6cbca6b676e81fa"}, - {file = "lxml-5.1.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:af8920ce4a55ff41167ddbc20077f5698c2e710ad3353d32a07d3264f3a2021e"}, - {file = "lxml-5.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7cfced4a069003d8913408e10ca8ed092c49a7f6cefee9bb74b6b3e860683b45"}, - {file = "lxml-5.1.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:9e5ac3437746189a9b4121db2a7b86056ac8786b12e88838696899328fc44bb2"}, - {file = "lxml-5.1.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:f4c9bda132ad108b387c33fabfea47866af87f4ea6ffb79418004f0521e63204"}, - {file = "lxml-5.1.0-cp311-cp311-win32.whl", hash = "sha256:bc64d1b1dab08f679fb89c368f4c05693f58a9faf744c4d390d7ed1d8223869b"}, - {file = "lxml-5.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:a5ab722ae5a873d8dcee1f5f45ddd93c34210aed44ff2dc643b5025981908cda"}, - {file = "lxml-5.1.0-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:9aa543980ab1fbf1720969af1d99095a548ea42e00361e727c58a40832439114"}, - {file = "lxml-5.1.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:6f11b77ec0979f7e4dc5ae081325a2946f1fe424148d3945f943ceaede98adb8"}, - {file = "lxml-5.1.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:a36c506e5f8aeb40680491d39ed94670487ce6614b9d27cabe45d94cd5d63e1e"}, - {file = "lxml-5.1.0-cp312-cp312-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f643ffd2669ffd4b5a3e9b41c909b72b2a1d5e4915da90a77e119b8d48ce867a"}, - {file = "lxml-5.1.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:16dd953fb719f0ffc5bc067428fc9e88f599e15723a85618c45847c96f11f431"}, - {file = "lxml-5.1.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:16018f7099245157564d7148165132c70adb272fb5a17c048ba70d9cc542a1a1"}, - {file = "lxml-5.1.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:82cd34f1081ae4ea2ede3d52f71b7be313756e99b4b5f829f89b12da552d3aa3"}, - {file = "lxml-5.1.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:19a1bc898ae9f06bccb7c3e1dfd73897ecbbd2c96afe9095a6026016e5ca97b8"}, - {file = "lxml-5.1.0-cp312-cp312-win32.whl", hash = "sha256:13521a321a25c641b9ea127ef478b580b5ec82aa2e9fc076c86169d161798b01"}, - {file = "lxml-5.1.0-cp312-cp312-win_amd64.whl", hash = "sha256:1ad17c20e3666c035db502c78b86e58ff6b5991906e55bdbef94977700c72623"}, - {file = "lxml-5.1.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:24ef5a4631c0b6cceaf2dbca21687e29725b7c4e171f33a8f8ce23c12558ded1"}, - {file = "lxml-5.1.0-cp36-cp36m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8d2900b7f5318bc7ad8631d3d40190b95ef2aa8cc59473b73b294e4a55e9f30f"}, - {file = "lxml-5.1.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:601f4a75797d7a770daed8b42b97cd1bb1ba18bd51a9382077a6a247a12aa38d"}, - {file = "lxml-5.1.0-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b4b68c961b5cc402cbd99cca5eb2547e46ce77260eb705f4d117fd9c3f932b95"}, - {file = "lxml-5.1.0-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:afd825e30f8d1f521713a5669b63657bcfe5980a916c95855060048b88e1adb7"}, - {file = "lxml-5.1.0-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:262bc5f512a66b527d026518507e78c2f9c2bd9eb5c8aeeb9f0eb43fcb69dc67"}, - {file = "lxml-5.1.0-cp36-cp36m-win32.whl", hash = "sha256:e856c1c7255c739434489ec9c8aa9cdf5179785d10ff20add308b5d673bed5cd"}, - {file = "lxml-5.1.0-cp36-cp36m-win_amd64.whl", hash = "sha256:c7257171bb8d4432fe9d6fdde4d55fdbe663a63636a17f7f9aaba9bcb3153ad7"}, - {file = "lxml-5.1.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:b9e240ae0ba96477682aa87899d94ddec1cc7926f9df29b1dd57b39e797d5ab5"}, - {file = 
"lxml-5.1.0-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a96f02ba1bcd330807fc060ed91d1f7a20853da6dd449e5da4b09bfcc08fdcf5"}, - {file = "lxml-5.1.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3e3898ae2b58eeafedfe99e542a17859017d72d7f6a63de0f04f99c2cb125936"}, - {file = "lxml-5.1.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:61c5a7edbd7c695e54fca029ceb351fc45cd8860119a0f83e48be44e1c464862"}, - {file = "lxml-5.1.0-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:3aeca824b38ca78d9ee2ab82bd9883083d0492d9d17df065ba3b94e88e4d7ee6"}, - {file = "lxml-5.1.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:8f52fe6859b9db71ee609b0c0a70fea5f1e71c3462ecf144ca800d3f434f0764"}, - {file = "lxml-5.1.0-cp37-cp37m-win32.whl", hash = "sha256:d42e3a3fc18acc88b838efded0e6ec3edf3e328a58c68fbd36a7263a874906c8"}, - {file = "lxml-5.1.0-cp37-cp37m-win_amd64.whl", hash = "sha256:eac68f96539b32fce2c9b47eb7c25bb2582bdaf1bbb360d25f564ee9e04c542b"}, - {file = "lxml-5.1.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:ae15347a88cf8af0949a9872b57a320d2605ae069bcdf047677318bc0bba45b1"}, - {file = "lxml-5.1.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:c26aab6ea9c54d3bed716b8851c8bfc40cb249b8e9880e250d1eddde9f709bf5"}, - {file = "lxml-5.1.0-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:342e95bddec3a698ac24378d61996b3ee5ba9acfeb253986002ac53c9a5f6f84"}, - {file = "lxml-5.1.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:725e171e0b99a66ec8605ac77fa12239dbe061482ac854d25720e2294652eeaa"}, - {file = "lxml-5.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3d184e0d5c918cff04cdde9dbdf9600e960161d773666958c9d7b565ccc60c45"}, - {file = "lxml-5.1.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = 
"sha256:98f3f020a2b736566c707c8e034945c02aa94e124c24f77ca097c446f81b01f1"}, - {file = "lxml-5.1.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:6d48fc57e7c1e3df57be5ae8614bab6d4e7b60f65c5457915c26892c41afc59e"}, - {file = "lxml-5.1.0-cp38-cp38-win32.whl", hash = "sha256:7ec465e6549ed97e9f1e5ed51c657c9ede767bc1c11552f7f4d022c4df4a977a"}, - {file = "lxml-5.1.0-cp38-cp38-win_amd64.whl", hash = "sha256:b21b4031b53d25b0858d4e124f2f9131ffc1530431c6d1321805c90da78388d1"}, - {file = "lxml-5.1.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:52427a7eadc98f9e62cb1368a5079ae826f94f05755d2d567d93ee1bc3ceb354"}, - {file = "lxml-5.1.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:6a2a2c724d97c1eb8cf966b16ca2915566a4904b9aad2ed9a09c748ffe14f969"}, - {file = "lxml-5.1.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:843b9c835580d52828d8f69ea4302537337a21e6b4f1ec711a52241ba4a824f3"}, - {file = "lxml-5.1.0-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9b99f564659cfa704a2dd82d0684207b1aadf7d02d33e54845f9fc78e06b7581"}, - {file = "lxml-5.1.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4f8b0c78e7aac24979ef09b7f50da871c2de2def043d468c4b41f512d831e912"}, - {file = "lxml-5.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9bcf86dfc8ff3e992fed847c077bd875d9e0ba2fa25d859c3a0f0f76f07f0c8d"}, - {file = "lxml-5.1.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:49a9b4af45e8b925e1cd6f3b15bbba2c81e7dba6dce170c677c9cda547411e14"}, - {file = "lxml-5.1.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:280f3edf15c2a967d923bcfb1f8f15337ad36f93525828b40a0f9d6c2ad24890"}, - {file = "lxml-5.1.0-cp39-cp39-win32.whl", hash = "sha256:ed7326563024b6e91fef6b6c7a1a2ff0a71b97793ac33dbbcf38f6005e51ff6e"}, - {file = "lxml-5.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:8d7b4beebb178e9183138f552238f7e6613162a42164233e2bda00cb3afac58f"}, - {file = 
"lxml-5.1.0-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:9bd0ae7cc2b85320abd5e0abad5ccee5564ed5f0cc90245d2f9a8ef330a8deae"}, - {file = "lxml-5.1.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d8c1d679df4361408b628f42b26a5d62bd3e9ba7f0c0e7969f925021554755aa"}, - {file = "lxml-5.1.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:2ad3a8ce9e8a767131061a22cd28fdffa3cd2dc193f399ff7b81777f3520e372"}, - {file = "lxml-5.1.0-pp37-pypy37_pp73-macosx_10_9_x86_64.whl", hash = "sha256:304128394c9c22b6569eba2a6d98392b56fbdfbad58f83ea702530be80d0f9df"}, - {file = "lxml-5.1.0-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d74fcaf87132ffc0447b3c685a9f862ffb5b43e70ea6beec2fb8057d5d2a1fea"}, - {file = "lxml-5.1.0-pp37-pypy37_pp73-win_amd64.whl", hash = "sha256:8cf5877f7ed384dabfdcc37922c3191bf27e55b498fecece9fd5c2c7aaa34c33"}, - {file = "lxml-5.1.0-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:877efb968c3d7eb2dad540b6cabf2f1d3c0fbf4b2d309a3c141f79c7e0061324"}, - {file = "lxml-5.1.0-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3f14a4fb1c1c402a22e6a341a24c1341b4a3def81b41cd354386dcb795f83897"}, - {file = "lxml-5.1.0-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:25663d6e99659544ee8fe1b89b1a8c0aaa5e34b103fab124b17fa958c4a324a6"}, - {file = "lxml-5.1.0-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:8b9f19df998761babaa7f09e6bc169294eefafd6149aaa272081cbddc7ba4ca3"}, - {file = "lxml-5.1.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5e53d7e6a98b64fe54775d23a7c669763451340c3d44ad5e3a3b48a1efbdc96f"}, - {file = "lxml-5.1.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:c3cd1fc1dc7c376c54440aeaaa0dcc803d2126732ff5c6b68ccd619f2e64be4f"}, - {file = "lxml-5.1.0.tar.gz", hash = "sha256:3eea6ed6e6c918e468e693c41ef07f3c3acc310b70ddd9cc72d9ef84bc9564ca"}, + {file = "lxml-5.2.1-cp310-cp310-macosx_10_9_universal2.whl", 
hash = "sha256:1f7785f4f789fdb522729ae465adcaa099e2a3441519df750ebdccc481d961a1"}, + {file = "lxml-5.2.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:6cc6ee342fb7fa2471bd9b6d6fdfc78925a697bf5c2bcd0a302e98b0d35bfad3"}, + {file = "lxml-5.2.1-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:794f04eec78f1d0e35d9e0c36cbbb22e42d370dda1609fb03bcd7aeb458c6377"}, + {file = "lxml-5.2.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c817d420c60a5183953c783b0547d9eb43b7b344a2c46f69513d5952a78cddf3"}, + {file = "lxml-5.2.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2213afee476546a7f37c7a9b4ad4d74b1e112a6fafffc9185d6d21f043128c81"}, + {file = "lxml-5.2.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b070bbe8d3f0f6147689bed981d19bbb33070225373338df755a46893528104a"}, + {file = "lxml-5.2.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e02c5175f63effbd7c5e590399c118d5db6183bbfe8e0d118bdb5c2d1b48d937"}, + {file = "lxml-5.2.1-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:3dc773b2861b37b41a6136e0b72a1a44689a9c4c101e0cddb6b854016acc0aa8"}, + {file = "lxml-5.2.1-cp310-cp310-manylinux_2_28_ppc64le.whl", hash = "sha256:d7520db34088c96cc0e0a3ad51a4fd5b401f279ee112aa2b7f8f976d8582606d"}, + {file = "lxml-5.2.1-cp310-cp310-manylinux_2_28_s390x.whl", hash = "sha256:bcbf4af004f98793a95355980764b3d80d47117678118a44a80b721c9913436a"}, + {file = "lxml-5.2.1-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:a2b44bec7adf3e9305ce6cbfa47a4395667e744097faed97abb4728748ba7d47"}, + {file = "lxml-5.2.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:1c5bb205e9212d0ebddf946bc07e73fa245c864a5f90f341d11ce7b0b854475d"}, + {file = "lxml-5.2.1-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:2c9d147f754b1b0e723e6afb7ba1566ecb162fe4ea657f53d2139bbf894d050a"}, + {file = 
"lxml-5.2.1-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:3545039fa4779be2df51d6395e91a810f57122290864918b172d5dc7ca5bb433"}, + {file = "lxml-5.2.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:a91481dbcddf1736c98a80b122afa0f7296eeb80b72344d7f45dc9f781551f56"}, + {file = "lxml-5.2.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:2ddfe41ddc81f29a4c44c8ce239eda5ade4e7fc305fb7311759dd6229a080052"}, + {file = "lxml-5.2.1-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:a7baf9ffc238e4bf401299f50e971a45bfcc10a785522541a6e3179c83eabf0a"}, + {file = "lxml-5.2.1-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:31e9a882013c2f6bd2f2c974241bf4ba68c85eba943648ce88936d23209a2e01"}, + {file = "lxml-5.2.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:0a15438253b34e6362b2dc41475e7f80de76320f335e70c5528b7148cac253a1"}, + {file = "lxml-5.2.1-cp310-cp310-win32.whl", hash = "sha256:6992030d43b916407c9aa52e9673612ff39a575523c5f4cf72cdef75365709a5"}, + {file = "lxml-5.2.1-cp310-cp310-win_amd64.whl", hash = "sha256:da052e7962ea2d5e5ef5bc0355d55007407087392cf465b7ad84ce5f3e25fe0f"}, + {file = "lxml-5.2.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:70ac664a48aa64e5e635ae5566f5227f2ab7f66a3990d67566d9907edcbbf867"}, + {file = "lxml-5.2.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:1ae67b4e737cddc96c99461d2f75d218bdf7a0c3d3ad5604d1f5e7464a2f9ffe"}, + {file = "lxml-5.2.1-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f18a5a84e16886898e51ab4b1d43acb3083c39b14c8caeb3589aabff0ee0b270"}, + {file = "lxml-5.2.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c6f2c8372b98208ce609c9e1d707f6918cc118fea4e2c754c9f0812c04ca116d"}, + {file = "lxml-5.2.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:394ed3924d7a01b5bd9a0d9d946136e1c2f7b3dc337196d99e61740ed4bc6fe1"}, + {file = 
"lxml-5.2.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5d077bc40a1fe984e1a9931e801e42959a1e6598edc8a3223b061d30fbd26bbc"}, + {file = "lxml-5.2.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:764b521b75701f60683500d8621841bec41a65eb739b8466000c6fdbc256c240"}, + {file = "lxml-5.2.1-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:3a6b45da02336895da82b9d472cd274b22dc27a5cea1d4b793874eead23dd14f"}, + {file = "lxml-5.2.1-cp311-cp311-manylinux_2_28_ppc64le.whl", hash = "sha256:5ea7b6766ac2dfe4bcac8b8595107665a18ef01f8c8343f00710b85096d1b53a"}, + {file = "lxml-5.2.1-cp311-cp311-manylinux_2_28_s390x.whl", hash = "sha256:e196a4ff48310ba62e53a8e0f97ca2bca83cdd2fe2934d8b5cb0df0a841b193a"}, + {file = "lxml-5.2.1-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:200e63525948e325d6a13a76ba2911f927ad399ef64f57898cf7c74e69b71095"}, + {file = "lxml-5.2.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:dae0ed02f6b075426accbf6b2863c3d0a7eacc1b41fb40f2251d931e50188dad"}, + {file = "lxml-5.2.1-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:ab31a88a651039a07a3ae327d68ebdd8bc589b16938c09ef3f32a4b809dc96ef"}, + {file = "lxml-5.2.1-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:df2e6f546c4df14bc81f9498bbc007fbb87669f1bb707c6138878c46b06f6510"}, + {file = "lxml-5.2.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:5dd1537e7cc06efd81371f5d1a992bd5ab156b2b4f88834ca852de4a8ea523fa"}, + {file = "lxml-5.2.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:9b9ec9c9978b708d488bec36b9e4c94d88fd12ccac3e62134a9d17ddba910ea9"}, + {file = "lxml-5.2.1-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:8e77c69d5892cb5ba71703c4057091e31ccf534bd7f129307a4d084d90d014b8"}, + {file = "lxml-5.2.1-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:a8d5c70e04aac1eda5c829a26d1f75c6e5286c74743133d9f742cda8e53b9c2f"}, + {file = "lxml-5.2.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = 
"sha256:c94e75445b00319c1fad60f3c98b09cd63fe1134a8a953dcd48989ef42318534"}, + {file = "lxml-5.2.1-cp311-cp311-win32.whl", hash = "sha256:4951e4f7a5680a2db62f7f4ab2f84617674d36d2d76a729b9a8be4b59b3659be"}, + {file = "lxml-5.2.1-cp311-cp311-win_amd64.whl", hash = "sha256:5c670c0406bdc845b474b680b9a5456c561c65cf366f8db5a60154088c92d102"}, + {file = "lxml-5.2.1-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:abc25c3cab9ec7fcd299b9bcb3b8d4a1231877e425c650fa1c7576c5107ab851"}, + {file = "lxml-5.2.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:6935bbf153f9a965f1e07c2649c0849d29832487c52bb4a5c5066031d8b44fd5"}, + {file = "lxml-5.2.1-cp312-cp312-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d793bebb202a6000390a5390078e945bbb49855c29c7e4d56a85901326c3b5d9"}, + {file = "lxml-5.2.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:afd5562927cdef7c4f5550374acbc117fd4ecc05b5007bdfa57cc5355864e0a4"}, + {file = "lxml-5.2.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:0e7259016bc4345a31af861fdce942b77c99049d6c2107ca07dc2bba2435c1d9"}, + {file = "lxml-5.2.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:530e7c04f72002d2f334d5257c8a51bf409db0316feee7c87e4385043be136af"}, + {file = "lxml-5.2.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:59689a75ba8d7ffca577aefd017d08d659d86ad4585ccc73e43edbfc7476781a"}, + {file = "lxml-5.2.1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:f9737bf36262046213a28e789cc82d82c6ef19c85a0cf05e75c670a33342ac2c"}, + {file = "lxml-5.2.1-cp312-cp312-manylinux_2_28_ppc64le.whl", hash = "sha256:3a74c4f27167cb95c1d4af1c0b59e88b7f3e0182138db2501c353555f7ec57f4"}, + {file = "lxml-5.2.1-cp312-cp312-manylinux_2_28_s390x.whl", hash = "sha256:68a2610dbe138fa8c5826b3f6d98a7cfc29707b850ddcc3e21910a6fe51f6ca0"}, + {file = "lxml-5.2.1-cp312-cp312-manylinux_2_28_x86_64.whl", hash = 
"sha256:f0a1bc63a465b6d72569a9bba9f2ef0334c4e03958e043da1920299100bc7c08"}, + {file = "lxml-5.2.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:c2d35a1d047efd68027817b32ab1586c1169e60ca02c65d428ae815b593e65d4"}, + {file = "lxml-5.2.1-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:79bd05260359170f78b181b59ce871673ed01ba048deef4bf49a36ab3e72e80b"}, + {file = "lxml-5.2.1-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:865bad62df277c04beed9478fe665b9ef63eb28fe026d5dedcb89b537d2e2ea6"}, + {file = "lxml-5.2.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:44f6c7caff88d988db017b9b0e4ab04934f11e3e72d478031efc7edcac6c622f"}, + {file = "lxml-5.2.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:71e97313406ccf55d32cc98a533ee05c61e15d11b99215b237346171c179c0b0"}, + {file = "lxml-5.2.1-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:057cdc6b86ab732cf361f8b4d8af87cf195a1f6dc5b0ff3de2dced242c2015e0"}, + {file = "lxml-5.2.1-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:f3bbbc998d42f8e561f347e798b85513ba4da324c2b3f9b7969e9c45b10f6169"}, + {file = "lxml-5.2.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:491755202eb21a5e350dae00c6d9a17247769c64dcf62d8c788b5c135e179dc4"}, + {file = "lxml-5.2.1-cp312-cp312-win32.whl", hash = "sha256:8de8f9d6caa7f25b204fc861718815d41cbcf27ee8f028c89c882a0cf4ae4134"}, + {file = "lxml-5.2.1-cp312-cp312-win_amd64.whl", hash = "sha256:f2a9efc53d5b714b8df2b4b3e992accf8ce5bbdfe544d74d5c6766c9e1146a3a"}, + {file = "lxml-5.2.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:70a9768e1b9d79edca17890175ba915654ee1725975d69ab64813dd785a2bd5c"}, + {file = "lxml-5.2.1-cp36-cp36m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c38d7b9a690b090de999835f0443d8aa93ce5f2064035dfc48f27f02b4afc3d0"}, + {file = "lxml-5.2.1-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5670fb70a828663cc37552a2a85bf2ac38475572b0e9b91283dc09efb52c41d1"}, + 
{file = "lxml-5.2.1-cp36-cp36m-manylinux_2_28_x86_64.whl", hash = "sha256:958244ad566c3ffc385f47dddde4145088a0ab893504b54b52c041987a8c1863"}, + {file = "lxml-5.2.1-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:b6241d4eee5f89453307c2f2bfa03b50362052ca0af1efecf9fef9a41a22bb4f"}, + {file = "lxml-5.2.1-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:2a66bf12fbd4666dd023b6f51223aed3d9f3b40fef06ce404cb75bafd3d89536"}, + {file = "lxml-5.2.1-cp36-cp36m-musllinux_1_1_ppc64le.whl", hash = "sha256:9123716666e25b7b71c4e1789ec829ed18663152008b58544d95b008ed9e21e9"}, + {file = "lxml-5.2.1-cp36-cp36m-musllinux_1_1_s390x.whl", hash = "sha256:0c3f67e2aeda739d1cc0b1102c9a9129f7dc83901226cc24dd72ba275ced4218"}, + {file = "lxml-5.2.1-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:5d5792e9b3fb8d16a19f46aa8208987cfeafe082363ee2745ea8b643d9cc5b45"}, + {file = "lxml-5.2.1-cp36-cp36m-musllinux_1_2_aarch64.whl", hash = "sha256:88e22fc0a6684337d25c994381ed8a1580a6f5ebebd5ad41f89f663ff4ec2885"}, + {file = "lxml-5.2.1-cp36-cp36m-musllinux_1_2_ppc64le.whl", hash = "sha256:21c2e6b09565ba5b45ae161b438e033a86ad1736b8c838c766146eff8ceffff9"}, + {file = "lxml-5.2.1-cp36-cp36m-musllinux_1_2_s390x.whl", hash = "sha256:afbbdb120d1e78d2ba8064a68058001b871154cc57787031b645c9142b937a62"}, + {file = "lxml-5.2.1-cp36-cp36m-musllinux_1_2_x86_64.whl", hash = "sha256:627402ad8dea044dde2eccde4370560a2b750ef894c9578e1d4f8ffd54000461"}, + {file = "lxml-5.2.1-cp36-cp36m-win32.whl", hash = "sha256:e89580a581bf478d8dcb97d9cd011d567768e8bc4095f8557b21c4d4c5fea7d0"}, + {file = "lxml-5.2.1-cp36-cp36m-win_amd64.whl", hash = "sha256:59565f10607c244bc4c05c0c5fa0c190c990996e0c719d05deec7030c2aa8289"}, + {file = "lxml-5.2.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:857500f88b17a6479202ff5fe5f580fc3404922cd02ab3716197adf1ef628029"}, + {file = "lxml-5.2.1-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:56c22432809085b3f3ae04e6e7bdd36883d7258fcd90e53ba7b2e463efc7a6af"}, + {file = "lxml-5.2.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a55ee573116ba208932e2d1a037cc4b10d2c1cb264ced2184d00b18ce585b2c0"}, + {file = "lxml-5.2.1-cp37-cp37m-manylinux_2_28_x86_64.whl", hash = "sha256:6cf58416653c5901e12624e4013708b6e11142956e7f35e7a83f1ab02f3fe456"}, + {file = "lxml-5.2.1-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:64c2baa7774bc22dd4474248ba16fe1a7f611c13ac6123408694d4cc93d66dbd"}, + {file = "lxml-5.2.1-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:74b28c6334cca4dd704e8004cba1955af0b778cf449142e581e404bd211fb619"}, + {file = "lxml-5.2.1-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:7221d49259aa1e5a8f00d3d28b1e0b76031655ca74bb287123ef56c3db92f213"}, + {file = "lxml-5.2.1-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:3dbe858ee582cbb2c6294dc85f55b5f19c918c2597855e950f34b660f1a5ede6"}, + {file = "lxml-5.2.1-cp37-cp37m-musllinux_1_2_aarch64.whl", hash = "sha256:04ab5415bf6c86e0518d57240a96c4d1fcfc3cb370bb2ac2a732b67f579e5a04"}, + {file = "lxml-5.2.1-cp37-cp37m-musllinux_1_2_ppc64le.whl", hash = "sha256:6ab833e4735a7e5533711a6ea2df26459b96f9eec36d23f74cafe03631647c41"}, + {file = "lxml-5.2.1-cp37-cp37m-musllinux_1_2_s390x.whl", hash = "sha256:f443cdef978430887ed55112b491f670bba6462cea7a7742ff8f14b7abb98d75"}, + {file = "lxml-5.2.1-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:9e2addd2d1866fe112bc6f80117bcc6bc25191c5ed1bfbcf9f1386a884252ae8"}, + {file = "lxml-5.2.1-cp37-cp37m-win32.whl", hash = "sha256:f51969bac61441fd31f028d7b3b45962f3ecebf691a510495e5d2cd8c8092dbd"}, + {file = "lxml-5.2.1-cp37-cp37m-win_amd64.whl", hash = "sha256:b0b58fbfa1bf7367dde8a557994e3b1637294be6cf2169810375caf8571a085c"}, + {file = "lxml-5.2.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:804f74efe22b6a227306dd890eecc4f8c59ff25ca35f1f14e7482bbce96ef10b"}, + {file = 
"lxml-5.2.1-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:08802f0c56ed150cc6885ae0788a321b73505d2263ee56dad84d200cab11c07a"}, + {file = "lxml-5.2.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0f8c09ed18ecb4ebf23e02b8e7a22a05d6411911e6fabef3a36e4f371f4f2585"}, + {file = "lxml-5.2.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e3d30321949861404323c50aebeb1943461a67cd51d4200ab02babc58bd06a86"}, + {file = "lxml-5.2.1-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:b560e3aa4b1d49e0e6c847d72665384db35b2f5d45f8e6a5c0072e0283430533"}, + {file = "lxml-5.2.1-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:058a1308914f20784c9f4674036527e7c04f7be6fb60f5d61353545aa7fcb739"}, + {file = "lxml-5.2.1-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:adfb84ca6b87e06bc6b146dc7da7623395db1e31621c4785ad0658c5028b37d7"}, + {file = "lxml-5.2.1-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:417d14450f06d51f363e41cace6488519038f940676ce9664b34ebf5653433a5"}, + {file = "lxml-5.2.1-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:a2dfe7e2473f9b59496247aad6e23b405ddf2e12ef0765677b0081c02d6c2c0b"}, + {file = "lxml-5.2.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:bf2e2458345d9bffb0d9ec16557d8858c9c88d2d11fed53998512504cd9df49b"}, + {file = "lxml-5.2.1-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:58278b29cb89f3e43ff3e0c756abbd1518f3ee6adad9e35b51fb101c1c1daaec"}, + {file = "lxml-5.2.1-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:64641a6068a16201366476731301441ce93457eb8452056f570133a6ceb15fca"}, + {file = "lxml-5.2.1-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:78bfa756eab503673991bdcf464917ef7845a964903d3302c5f68417ecdc948c"}, + {file = "lxml-5.2.1-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:11a04306fcba10cd9637e669fd73aa274c1c09ca64af79c041aa820ea992b637"}, + {file = "lxml-5.2.1-cp38-cp38-win32.whl", hash = 
"sha256:66bc5eb8a323ed9894f8fa0ee6cb3e3fb2403d99aee635078fd19a8bc7a5a5da"}, + {file = "lxml-5.2.1-cp38-cp38-win_amd64.whl", hash = "sha256:9676bfc686fa6a3fa10cd4ae6b76cae8be26eb5ec6811d2a325636c460da1806"}, + {file = "lxml-5.2.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:cf22b41fdae514ee2f1691b6c3cdeae666d8b7fa9434de445f12bbeee0cf48dd"}, + {file = "lxml-5.2.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:ec42088248c596dbd61d4ae8a5b004f97a4d91a9fd286f632e42e60b706718d7"}, + {file = "lxml-5.2.1-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cd53553ddad4a9c2f1f022756ae64abe16da1feb497edf4d9f87f99ec7cf86bd"}, + {file = "lxml-5.2.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:feaa45c0eae424d3e90d78823f3828e7dc42a42f21ed420db98da2c4ecf0a2cb"}, + {file = "lxml-5.2.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ddc678fb4c7e30cf830a2b5a8d869538bc55b28d6c68544d09c7d0d8f17694dc"}, + {file = "lxml-5.2.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:853e074d4931dbcba7480d4dcab23d5c56bd9607f92825ab80ee2bd916edea53"}, + {file = "lxml-5.2.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cc4691d60512798304acb9207987e7b2b7c44627ea88b9d77489bbe3e6cc3bd4"}, + {file = "lxml-5.2.1-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:beb72935a941965c52990f3a32d7f07ce869fe21c6af8b34bf6a277b33a345d3"}, + {file = "lxml-5.2.1-cp39-cp39-manylinux_2_28_ppc64le.whl", hash = "sha256:6588c459c5627fefa30139be4d2e28a2c2a1d0d1c265aad2ba1935a7863a4913"}, + {file = "lxml-5.2.1-cp39-cp39-manylinux_2_28_s390x.whl", hash = "sha256:588008b8497667f1ddca7c99f2f85ce8511f8f7871b4a06ceede68ab62dff64b"}, + {file = "lxml-5.2.1-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:b6787b643356111dfd4032b5bffe26d2f8331556ecb79e15dacb9275da02866e"}, + {file = "lxml-5.2.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = 
"sha256:7c17b64b0a6ef4e5affae6a3724010a7a66bda48a62cfe0674dabd46642e8b54"}, + {file = "lxml-5.2.1-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:27aa20d45c2e0b8cd05da6d4759649170e8dfc4f4e5ef33a34d06f2d79075d57"}, + {file = "lxml-5.2.1-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:d4f2cc7060dc3646632d7f15fe68e2fa98f58e35dd5666cd525f3b35d3fed7f8"}, + {file = "lxml-5.2.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:ff46d772d5f6f73564979cd77a4fffe55c916a05f3cb70e7c9c0590059fb29ef"}, + {file = "lxml-5.2.1-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:96323338e6c14e958d775700ec8a88346014a85e5de73ac7967db0367582049b"}, + {file = "lxml-5.2.1-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:52421b41ac99e9d91934e4d0d0fe7da9f02bfa7536bb4431b4c05c906c8c6919"}, + {file = "lxml-5.2.1-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:7a7efd5b6d3e30d81ec68ab8a88252d7c7c6f13aaa875009fe3097eb4e30b84c"}, + {file = "lxml-5.2.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:0ed777c1e8c99b63037b91f9d73a6aad20fd035d77ac84afcc205225f8f41188"}, + {file = "lxml-5.2.1-cp39-cp39-win32.whl", hash = "sha256:644df54d729ef810dcd0f7732e50e5ad1bd0a135278ed8d6bcb06f33b6b6f708"}, + {file = "lxml-5.2.1-cp39-cp39-win_amd64.whl", hash = "sha256:9ca66b8e90daca431b7ca1408cae085d025326570e57749695d6a01454790e95"}, + {file = "lxml-5.2.1-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:9b0ff53900566bc6325ecde9181d89afadc59c5ffa39bddf084aaedfe3b06a11"}, + {file = "lxml-5.2.1-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fd6037392f2d57793ab98d9e26798f44b8b4da2f2464388588f48ac52c489ea1"}, + {file = "lxml-5.2.1-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8b9c07e7a45bb64e21df4b6aa623cb8ba214dfb47d2027d90eac197329bb5e94"}, + {file = "lxml-5.2.1-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:3249cc2989d9090eeac5467e50e9ec2d40704fea9ab72f36b034ea34ee65ca98"}, + {file = 
"lxml-5.2.1-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:f42038016852ae51b4088b2862126535cc4fc85802bfe30dea3500fdfaf1864e"}, + {file = "lxml-5.2.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:533658f8fbf056b70e434dff7e7aa611bcacb33e01f75de7f821810e48d1bb66"}, + {file = "lxml-5.2.1-pp37-pypy37_pp73-macosx_10_9_x86_64.whl", hash = "sha256:622020d4521e22fb371e15f580d153134bfb68d6a429d1342a25f051ec72df1c"}, + {file = "lxml-5.2.1-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:efa7b51824aa0ee957ccd5a741c73e6851de55f40d807f08069eb4c5a26b2baa"}, + {file = "lxml-5.2.1-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9c6ad0fbf105f6bcc9300c00010a2ffa44ea6f555df1a2ad95c88f5656104817"}, + {file = "lxml-5.2.1-pp37-pypy37_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:e233db59c8f76630c512ab4a4daf5a5986da5c3d5b44b8e9fc742f2a24dbd460"}, + {file = "lxml-5.2.1-pp37-pypy37_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:6a014510830df1475176466b6087fc0c08b47a36714823e58d8b8d7709132a96"}, + {file = "lxml-5.2.1-pp37-pypy37_pp73-win_amd64.whl", hash = "sha256:d38c8f50ecf57f0463399569aa388b232cf1a2ffb8f0a9a5412d0db57e054860"}, + {file = "lxml-5.2.1-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:5aea8212fb823e006b995c4dda533edcf98a893d941f173f6c9506126188860d"}, + {file = "lxml-5.2.1-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ff097ae562e637409b429a7ac958a20aab237a0378c42dabaa1e3abf2f896e5f"}, + {file = "lxml-5.2.1-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0f5d65c39f16717a47c36c756af0fb36144069c4718824b7533f803ecdf91138"}, + {file = "lxml-5.2.1-pp38-pypy38_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:3d0c3dd24bb4605439bf91068598d00c6370684f8de4a67c2992683f6c309d6b"}, + {file = "lxml-5.2.1-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = 
"sha256:e32be23d538753a8adb6c85bd539f5fd3b15cb987404327c569dfc5fd8366e85"}, + {file = "lxml-5.2.1-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:cc518cea79fd1e2f6c90baafa28906d4309d24f3a63e801d855e7424c5b34144"}, + {file = "lxml-5.2.1-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:a0af35bd8ebf84888373630f73f24e86bf016642fb8576fba49d3d6b560b7cbc"}, + {file = "lxml-5.2.1-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f8aca2e3a72f37bfc7b14ba96d4056244001ddcc18382bd0daa087fd2e68a354"}, + {file = "lxml-5.2.1-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5ca1e8188b26a819387b29c3895c47a5e618708fe6f787f3b1a471de2c4a94d9"}, + {file = "lxml-5.2.1-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:c8ba129e6d3b0136a0f50345b2cb3db53f6bda5dd8c7f5d83fbccba97fb5dcb5"}, + {file = "lxml-5.2.1-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:e998e304036198b4f6914e6a1e2b6f925208a20e2042563d9734881150c6c246"}, + {file = "lxml-5.2.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:d3be9b2076112e51b323bdf6d5a7f8a798de55fb8d95fcb64bd179460cdc0704"}, + {file = "lxml-5.2.1.tar.gz", hash = "sha256:3f7765e69bbce0906a7c74d5fe46d2c7a7596147318dbc08e4a2431f3060e306"}, ] [package.extras] cssselect = ["cssselect (>=0.7)"] +html-clean = ["lxml-html-clean"] html5 = ["html5lib"] htmlsoup = ["BeautifulSoup4"] -source = ["Cython (>=3.0.7)"] +source = ["Cython (>=3.0.10)"] [[package]] name = "mako" @@ -4465,13 +4719,13 @@ testing = ["pytest"] [[package]] name = "markdown" -version = "3.5.1" +version = "3.6" description = "Python implementation of John Gruber's Markdown." 
optional = false python-versions = ">=3.8" files = [ - {file = "Markdown-3.5.1-py3-none-any.whl", hash = "sha256:5874b47d4ee3f0b14d764324d2c94c03ea66bee56f2d929da9f2508d65e722dc"}, - {file = "Markdown-3.5.1.tar.gz", hash = "sha256:b65d7beb248dc22f2e8a31fb706d93798093c308dc1aba295aedeb9d41a813bd"}, + {file = "Markdown-3.6-py3-none-any.whl", hash = "sha256:48f276f4d8cfb8ce6527c8f79e2ee29708508bf4d40aa410fbc3b4ee832c850f"}, + {file = "Markdown-3.6.tar.gz", hash = "sha256:ed4f41f6daecbeeb96e576ce414c41d2d876daa9a16cb35fa8ed8c2ddfad0224"}, ] [package.extras] @@ -4874,21 +5128,23 @@ mkdocs = ">=1.2" [[package]] name = "mkdocstrings" -version = "0.23.0" +version = "0.25.1" description = "Automatic documentation from sources, for MkDocs." optional = false python-versions = ">=3.8" files = [ - {file = "mkdocstrings-0.23.0-py3-none-any.whl", hash = "sha256:051fa4014dfcd9ed90254ae91de2dbb4f24e166347dae7be9a997fe16316c65e"}, - {file = "mkdocstrings-0.23.0.tar.gz", hash = "sha256:d9c6a37ffbe7c14a7a54ef1258c70b8d394e6a33a1c80832bce40b9567138d1c"}, + {file = "mkdocstrings-0.25.1-py3-none-any.whl", hash = "sha256:da01fcc2670ad61888e8fe5b60afe9fee5781017d67431996832d63e887c2e51"}, + {file = "mkdocstrings-0.25.1.tar.gz", hash = "sha256:c3a2515f31577f311a9ee58d089e4c51fc6046dbd9e9b4c3de4c3194667fe9bf"}, ] [package.dependencies] +click = ">=7.0" Jinja2 = ">=2.11.1" Markdown = ">=3.3" MarkupSafe = ">=1.1" -mkdocs = ">=1.2" +mkdocs = ">=1.4" mkdocs-autorefs = ">=0.3.1" +platformdirs = ">=2.2.0" pymdown-extensions = ">=6.3" [package.extras] @@ -4898,18 +5154,18 @@ python-legacy = ["mkdocstrings-python-legacy (>=0.2.1)"] [[package]] name = "mkdocstrings-python" -version = "1.8.0" +version = "1.10.0" description = "A Python handler for mkdocstrings." 
optional = false python-versions = ">=3.8" files = [ - {file = "mkdocstrings_python-1.8.0-py3-none-any.whl", hash = "sha256:4209970cc90bec194568682a535848a8d8489516c6ed4adbe58bbc67b699ca9d"}, - {file = "mkdocstrings_python-1.8.0.tar.gz", hash = "sha256:1488bddf50ee42c07d9a488dddc197f8e8999c2899687043ec5dd1643d057192"}, + {file = "mkdocstrings_python-1.10.0-py3-none-any.whl", hash = "sha256:ba833fbd9d178a4b9d5cb2553a4df06e51dc1f51e41559a4d2398c16a6f69ecc"}, + {file = "mkdocstrings_python-1.10.0.tar.gz", hash = "sha256:71678fac657d4d2bb301eed4e4d2d91499c095fd1f8a90fa76422a87a5693828"}, ] [package.dependencies] -griffe = ">=0.37" -mkdocstrings = ">=0.20" +griffe = ">=0.44" +mkdocstrings = ">=0.24.2" [[package]] name = "msal" @@ -5054,38 +5310,38 @@ files = [ [[package]] name = "mypy" -version = "1.8.0" +version = "1.10.0" description = "Optional static typing for Python" optional = false python-versions = ">=3.8" files = [ - {file = "mypy-1.8.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:485a8942f671120f76afffff70f259e1cd0f0cfe08f81c05d8816d958d4577d3"}, - {file = "mypy-1.8.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:df9824ac11deaf007443e7ed2a4a26bebff98d2bc43c6da21b2b64185da011c4"}, - {file = "mypy-1.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2afecd6354bbfb6e0160f4e4ad9ba6e4e003b767dd80d85516e71f2e955ab50d"}, - {file = "mypy-1.8.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8963b83d53ee733a6e4196954502b33567ad07dfd74851f32be18eb932fb1cb9"}, - {file = "mypy-1.8.0-cp310-cp310-win_amd64.whl", hash = "sha256:e46f44b54ebddbeedbd3d5b289a893219065ef805d95094d16a0af6630f5d410"}, - {file = "mypy-1.8.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:855fe27b80375e5c5878492f0729540db47b186509c98dae341254c8f45f42ae"}, - {file = "mypy-1.8.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:4c886c6cce2d070bd7df4ec4a05a13ee20c0aa60cb587e8d1265b6c03cf91da3"}, - {file = 
"mypy-1.8.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d19c413b3c07cbecf1f991e2221746b0d2a9410b59cb3f4fb9557f0365a1a817"}, - {file = "mypy-1.8.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:9261ed810972061388918c83c3f5cd46079d875026ba97380f3e3978a72f503d"}, - {file = "mypy-1.8.0-cp311-cp311-win_amd64.whl", hash = "sha256:51720c776d148bad2372ca21ca29256ed483aa9a4cdefefcef49006dff2a6835"}, - {file = "mypy-1.8.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:52825b01f5c4c1c4eb0db253ec09c7aa17e1a7304d247c48b6f3599ef40db8bd"}, - {file = "mypy-1.8.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f5ac9a4eeb1ec0f1ccdc6f326bcdb464de5f80eb07fb38b5ddd7b0de6bc61e55"}, - {file = "mypy-1.8.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:afe3fe972c645b4632c563d3f3eff1cdca2fa058f730df2b93a35e3b0c538218"}, - {file = "mypy-1.8.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:42c6680d256ab35637ef88891c6bd02514ccb7e1122133ac96055ff458f93fc3"}, - {file = "mypy-1.8.0-cp312-cp312-win_amd64.whl", hash = "sha256:720a5ca70e136b675af3af63db533c1c8c9181314d207568bbe79051f122669e"}, - {file = "mypy-1.8.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:028cf9f2cae89e202d7b6593cd98db6759379f17a319b5faf4f9978d7084cdc6"}, - {file = "mypy-1.8.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:4e6d97288757e1ddba10dd9549ac27982e3e74a49d8d0179fc14d4365c7add66"}, - {file = "mypy-1.8.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7f1478736fcebb90f97e40aff11a5f253af890c845ee0c850fe80aa060a267c6"}, - {file = "mypy-1.8.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:42419861b43e6962a649068a61f4a4839205a3ef525b858377a960b9e2de6e0d"}, - {file = "mypy-1.8.0-cp38-cp38-win_amd64.whl", hash = "sha256:2b5b6c721bd4aabaadead3a5e6fa85c11c6c795e0c81a7215776ef8afc66de02"}, - {file = "mypy-1.8.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = 
"sha256:5c1538c38584029352878a0466f03a8ee7547d7bd9f641f57a0f3017a7c905b8"}, - {file = "mypy-1.8.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:4ef4be7baf08a203170f29e89d79064463b7fc7a0908b9d0d5114e8009c3a259"}, - {file = "mypy-1.8.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7178def594014aa6c35a8ff411cf37d682f428b3b5617ca79029d8ae72f5402b"}, - {file = "mypy-1.8.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:ab3c84fa13c04aeeeabb2a7f67a25ef5d77ac9d6486ff33ded762ef353aa5592"}, - {file = "mypy-1.8.0-cp39-cp39-win_amd64.whl", hash = "sha256:99b00bc72855812a60d253420d8a2eae839b0afa4938f09f4d2aa9bb4654263a"}, - {file = "mypy-1.8.0-py3-none-any.whl", hash = "sha256:538fd81bb5e430cc1381a443971c0475582ff9f434c16cd46d2c66763ce85d9d"}, - {file = "mypy-1.8.0.tar.gz", hash = "sha256:6ff8b244d7085a0b425b56d327b480c3b29cafbd2eff27316a004f9a7391ae07"}, + {file = "mypy-1.10.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:da1cbf08fb3b851ab3b9523a884c232774008267b1f83371ace57f412fe308c2"}, + {file = "mypy-1.10.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:12b6bfc1b1a66095ab413160a6e520e1dc076a28f3e22f7fb25ba3b000b4ef99"}, + {file = "mypy-1.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9e36fb078cce9904c7989b9693e41cb9711e0600139ce3970c6ef814b6ebc2b2"}, + {file = "mypy-1.10.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:2b0695d605ddcd3eb2f736cd8b4e388288c21e7de85001e9f85df9187f2b50f9"}, + {file = "mypy-1.10.0-cp310-cp310-win_amd64.whl", hash = "sha256:cd777b780312ddb135bceb9bc8722a73ec95e042f911cc279e2ec3c667076051"}, + {file = "mypy-1.10.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:3be66771aa5c97602f382230165b856c231d1277c511c9a8dd058be4784472e1"}, + {file = "mypy-1.10.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:8b2cbaca148d0754a54d44121b5825ae71868c7592a53b7292eeb0f3fdae95ee"}, + {file = "mypy-1.10.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:1ec404a7cbe9fc0e92cb0e67f55ce0c025014e26d33e54d9e506a0f2d07fe5de"}, + {file = "mypy-1.10.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e22e1527dc3d4aa94311d246b59e47f6455b8729f4968765ac1eacf9a4760bc7"}, + {file = "mypy-1.10.0-cp311-cp311-win_amd64.whl", hash = "sha256:a87dbfa85971e8d59c9cc1fcf534efe664d8949e4c0b6b44e8ca548e746a8d53"}, + {file = "mypy-1.10.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:a781f6ad4bab20eef8b65174a57e5203f4be627b46291f4589879bf4e257b97b"}, + {file = "mypy-1.10.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b808e12113505b97d9023b0b5e0c0705a90571c6feefc6f215c1df9381256e30"}, + {file = "mypy-1.10.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8f55583b12156c399dce2df7d16f8a5095291354f1e839c252ec6c0611e86e2e"}, + {file = "mypy-1.10.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:4cf18f9d0efa1b16478c4c129eabec36148032575391095f73cae2e722fcf9d5"}, + {file = "mypy-1.10.0-cp312-cp312-win_amd64.whl", hash = "sha256:bc6ac273b23c6b82da3bb25f4136c4fd42665f17f2cd850771cb600bdd2ebeda"}, + {file = "mypy-1.10.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9fd50226364cd2737351c79807775136b0abe084433b55b2e29181a4c3c878c0"}, + {file = "mypy-1.10.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:f90cff89eea89273727d8783fef5d4a934be2fdca11b47def50cf5d311aff727"}, + {file = "mypy-1.10.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fcfc70599efde5c67862a07a1aaf50e55bce629ace26bb19dc17cece5dd31ca4"}, + {file = "mypy-1.10.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:075cbf81f3e134eadaf247de187bd604748171d6b79736fa9b6c9685b4083061"}, + {file = "mypy-1.10.0-cp38-cp38-win_amd64.whl", hash = "sha256:3f298531bca95ff615b6e9f2fc0333aae27fa48052903a0ac90215021cdcfa4f"}, + {file = "mypy-1.10.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:fa7ef5244615a2523b56c034becde4e9e3f9b034854c93639adb667ec9ec2976"}, + {file = 
"mypy-1.10.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3236a4c8f535a0631f85f5fcdffba71c7feeef76a6002fcba7c1a8e57c8be1ec"}, + {file = "mypy-1.10.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4a2b5cdbb5dd35aa08ea9114436e0d79aceb2f38e32c21684dcf8e24e1e92821"}, + {file = "mypy-1.10.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:92f93b21c0fe73dc00abf91022234c79d793318b8a96faac147cd579c1671746"}, + {file = "mypy-1.10.0-cp39-cp39-win_amd64.whl", hash = "sha256:28d0e038361b45f099cc086d9dd99c15ff14d0188f44ac883010e172ce86c38a"}, + {file = "mypy-1.10.0-py3-none-any.whl", hash = "sha256:f8c083976eb530019175aabadb60921e73b4f45736760826aa1689dda8208aee"}, + {file = "mypy-1.10.0.tar.gz", hash = "sha256:3d087fcbec056c4ee34974da493a826ce316947485cef3901f511848e687c131"}, ] [package.dependencies] @@ -5750,13 +6006,13 @@ tenacity = ">=6.2.0" [[package]] name = "pluggy" -version = "1.4.0" +version = "1.5.0" description = "plugin and hook calling mechanisms for python" optional = false python-versions = ">=3.8" files = [ - {file = "pluggy-1.4.0-py3-none-any.whl", hash = "sha256:7db9f7b503d67d1c5b95f59773ebb58a8c1c288129a88665838012cfb07b8981"}, - {file = "pluggy-1.4.0.tar.gz", hash = "sha256:8c85c2876142a764e5b7548e7d9a0e0ddb46f5185161049a79b7e974454223be"}, + {file = "pluggy-1.5.0-py3-none-any.whl", hash = "sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669"}, + {file = "pluggy-1.5.0.tar.gz", hash = "sha256:2cffa88e94fdc978c4c574f15f9e59b7f4201d439195c3715ca9e2486f1d0cf1"}, ] [package.extras] @@ -5784,13 +6040,13 @@ tests = ["pytest (>=5.4.1)", "pytest-cov (>=2.8.1)", "pytest-mypy (>=0.8.0)", "p [[package]] name = "pre-commit" -version = "3.6.0" +version = "3.7.0" description = "A framework for managing and maintaining multi-language pre-commit hooks." 
optional = false python-versions = ">=3.9" files = [ - {file = "pre_commit-3.6.0-py2.py3-none-any.whl", hash = "sha256:c255039ef399049a5544b6ce13d135caba8f2c28c3b4033277a788f434308376"}, - {file = "pre_commit-3.6.0.tar.gz", hash = "sha256:d30bad9abf165f7785c15a21a1f46da7d0677cb00ee7ff4c579fd38922efe15d"}, + {file = "pre_commit-3.7.0-py2.py3-none-any.whl", hash = "sha256:5eae9e10c2b5ac51577c3452ec0a490455c45a0533f7960f993a0d01e59decab"}, + {file = "pre_commit-3.7.0.tar.gz", hash = "sha256:e209d61b8acdcf742404408531f0c37d49d2c734fd7cff2d6076083d191cb060"}, ] [package.dependencies] @@ -6355,17 +6611,17 @@ files = [ [[package]] name = "pymdown-extensions" -version = "10.7" +version = "10.8.1" description = "Extension pack for Python Markdown." optional = false python-versions = ">=3.8" files = [ - {file = "pymdown_extensions-10.7-py3-none-any.whl", hash = "sha256:6ca215bc57bc12bf32b414887a68b810637d039124ed9b2e5bd3325cbb2c050c"}, - {file = "pymdown_extensions-10.7.tar.gz", hash = "sha256:c0d64d5cf62566f59e6b2b690a4095c931107c250a8c8e1351c1de5f6b036deb"}, + {file = "pymdown_extensions-10.8.1-py3-none-any.whl", hash = "sha256:f938326115884f48c6059c67377c46cf631c733ef3629b6eed1349989d1b30cb"}, + {file = "pymdown_extensions-10.8.1.tar.gz", hash = "sha256:3ab1db5c9e21728dabf75192d71471f8e50f216627e9a1fa9535ecb0231b9940"}, ] [package.dependencies] -markdown = ">=3.5" +markdown = ">=3.6" pyyaml = "*" [package.extras] @@ -6424,13 +6680,13 @@ sql = ["pandas (>=1.0.5)", "pyarrow (>=1.0.0)"] [[package]] name = "pytest" -version = "8.1.0" +version = "8.2.0" description = "pytest: simple powerful testing with Python" optional = false python-versions = ">=3.8" files = [ - {file = "pytest-8.1.0-py3-none-any.whl", hash = "sha256:ee32db7af8de4629a455806befa90559f307424c07b8413ccfc30bf5b221dd7e"}, - {file = "pytest-8.1.0.tar.gz", hash = "sha256:f8fa04ab8f98d185113ae60ea6d79c22f8143b14bc1caeced44a0ab844928323"}, + {file = "pytest-8.2.0-py3-none-any.whl", hash = 
"sha256:1733f0620f6cda4095bbf0d9ff8022486e91892245bb9e7d5542c018f612f233"}, + {file = "pytest-8.2.0.tar.gz", hash = "sha256:d507d4482197eac0ba2bae2e9babf0672eb333017bcedaa5fb1a3d42c1174b3f"}, ] [package.dependencies] @@ -6438,21 +6694,21 @@ colorama = {version = "*", markers = "sys_platform == \"win32\""} exceptiongroup = {version = ">=1.0.0rc8", markers = "python_version < \"3.11\""} iniconfig = "*" packaging = "*" -pluggy = ">=1.4,<2.0" +pluggy = ">=1.5,<2.0" tomli = {version = ">=1", markers = "python_version < \"3.11\""} [package.extras] -testing = ["argcomplete", "attrs (>=19.2)", "hypothesis (>=3.56)", "mock", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] +dev = ["argcomplete", "attrs (>=19.2)", "hypothesis (>=3.56)", "mock", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] [[package]] name = "pytest-cov" -version = "4.1.0" +version = "5.0.0" description = "Pytest plugin for measuring coverage." optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "pytest-cov-4.1.0.tar.gz", hash = "sha256:3904b13dfbfec47f003b8e77fd5b589cd11904a21ddf1ab38a64f204d6a10ef6"}, - {file = "pytest_cov-4.1.0-py3-none-any.whl", hash = "sha256:6ba70b9e97e69fcc3fb45bfeab2d0a138fb65c4d0d6a41ef33983ad114be8c3a"}, + {file = "pytest-cov-5.0.0.tar.gz", hash = "sha256:5837b58e9f6ebd335b0f8060eecce69b662415b16dc503883a02f45dfeb14857"}, + {file = "pytest_cov-5.0.0-py3-none-any.whl", hash = "sha256:4f0764a1219df53214206bf1feea4633c3b558a2925c8b59f144f682861ce652"}, ] [package.dependencies] @@ -6460,7 +6716,7 @@ coverage = {version = ">=5.2.1", extras = ["toml"]} pytest = ">=4.6" [package.extras] -testing = ["fields", "hunter", "process-tests", "pytest-xdist", "six", "virtualenv"] +testing = ["fields", "hunter", "process-tests", "pytest-xdist", "virtualenv"] [[package]] name = "pytest-sugar" @@ -6483,18 +6739,18 @@ dev = ["black", "flake8", "pre-commit"] [[package]] name = "pytest-xdist" -version = "3.5.0" +version = "3.6.1" 
description = "pytest xdist plugin for distributed testing, most importantly across multiple CPUs" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "pytest-xdist-3.5.0.tar.gz", hash = "sha256:cbb36f3d67e0c478baa57fa4edc8843887e0f6cfc42d677530a36d7472b32d8a"}, - {file = "pytest_xdist-3.5.0-py3-none-any.whl", hash = "sha256:d075629c7e00b611df89f490a5063944bee7a4362a5ff11c7cc7824a03dfce24"}, + {file = "pytest_xdist-3.6.1-py3-none-any.whl", hash = "sha256:9ed4adfb68a016610848639bb7e02c9352d5d9f03d04809919e2dafc3be4cca7"}, + {file = "pytest_xdist-3.6.1.tar.gz", hash = "sha256:ead156a4db231eec769737f57668ef58a2084a34b2e55c4a8fa20d861107300d"}, ] [package.dependencies] -execnet = ">=1.1" -pytest = ">=6.2.0" +execnet = ">=2.1" +pytest = ">=7.0.0" [package.extras] psutil = ["psutil (>=3.0)"] @@ -6537,13 +6793,13 @@ six = ">=1.5" [[package]] name = "python-gitlab" -version = "3.15.0" -description = "Interact with GitLab API" +version = "4.4.0" +description = "A python wrapper for the GitLab API" optional = false -python-versions = ">=3.7.0" +python-versions = ">=3.8.0" files = [ - {file = "python-gitlab-3.15.0.tar.gz", hash = "sha256:c9e65eb7612a9fbb8abf0339972eca7fd7a73d4da66c9b446ffe528930aff534"}, - {file = "python_gitlab-3.15.0-py3-none-any.whl", hash = "sha256:8f8d1c0d387f642eb1ac7bf5e8e0cd8b3dd49c6f34170cee3c7deb7d384611f3"}, + {file = "python-gitlab-4.4.0.tar.gz", hash = "sha256:1d117bf7b433ae8255e5d74e72c660978f50ee85eb62248c9fb52ef43c3e3814"}, + {file = "python_gitlab-4.4.0-py3-none-any.whl", hash = "sha256:cdad39d016f59664cdaad0f878f194c79cb4357630776caa9a92c1da25c8d986"}, ] [package.dependencies] @@ -6552,7 +6808,7 @@ requests-toolbelt = ">=0.10.1" [package.extras] autocompletion = ["argcomplete (>=1.10.0,<3)"] -yaml = ["PyYaml (>=5.2)"] +yaml = ["PyYaml (>=6.0.1)"] [[package]] name = "python-json-logger" @@ -6581,33 +6837,34 @@ python-slugify = ">=1.2.5" [[package]] name = "python-semantic-release" -version = "9.1.0" 
+version = "9.4.1" description = "Automatic Semantic Versioning for Python projects" optional = false python-versions = ">=3.8" files = [ - {file = "python-semantic-release-9.1.0.tar.gz", hash = "sha256:0317e72a940c5080c8aa6ff56cce6230f045b1a8f91f0a58fdc8f80745912369"}, - {file = "python_semantic_release-9.1.0-py3-none-any.whl", hash = "sha256:7ca5fb1ea4d5215182db477fbc53d9f4c907b7030520c4ac6640b9aaa71dbce7"}, + {file = "python-semantic-release-9.4.1.tar.gz", hash = "sha256:78d8a8674edbcc8f389799c27d1c5a604f5bc0f061ce14951a6b061e68b25895"}, + {file = "python_semantic_release-9.4.1-py3-none-any.whl", hash = "sha256:efe419bfb2e9eb8459d67109a725f00f1a93f487999ca02d757cf442c7f72679"}, ] [package.dependencies] -click = ">=8,<9" -dotty-dict = ">=1.3.0,<2" -gitpython = ">=3.0.8,<4" -importlib-resources = ">=5.7,<7" -jinja2 = ">=3.1.2,<4" -pydantic = ">=2,<3" -python-gitlab = ">=2,<5" -requests = ">=2.25,<3" -rich = ">=12.5.1" -shellingham = ">=1.5.0.post1" +click = ">=8.0,<9.0" +click-option-group = ">=0.5,<1.0" +dotty-dict = ">=1.3,<2.0" +gitpython = ">=3.0,<4.0" +importlib-resources = ">=6.0,<7.0" +jinja2 = ">=3.1,<4.0" +pydantic = ">=2.0,<3.0" +python-gitlab = ">=4.0,<5.0" +requests = ">=2.25,<3.0" +rich = ">=12.5,<13.0" +shellingham = ">=1.5,<2.0" tomlkit = ">=0.11,<1.0" [package.extras] -dev = ["pre-commit", "ruff (==0.1.11)", "tox"] -docs = ["Sphinx (<=6.0.0)", "furo (>=2023.3.27)", "sphinx-autobuild (==2021.03.14)", "sphinxcontrib-apidoc (==0.3.0)"] -mypy = ["mypy", "types-requests"] -test = ["coverage[toml] (>=6,<8)", "pytest (>=7,<8)", "pytest-clarity (>=1.0.1)", "pytest-cov (>=4,<5)", "pytest-env (>=1.0,<2.0)", "pytest-lazy-fixture (>=0.6.3,<0.7.0)", "pytest-mock (>=3,<4)", "pytest-pretty (>=1.2.0,<2)", "pytest-xdist (>=2,<4)", "requests-mock (>=1.10.0,<2)", "responses (==0.23.3)", "types-pytest-lazy-fixture (>=0.6.3.3)"] +dev = ["pre-commit (>=3.5,<4.0)", "ruff (==0.3.5)", "tox (>=4.11,<5.0)"] +docs = ["Sphinx (>=6.0,<7.0)", "furo (>=2023.3,<2024.0)", 
"sphinx-autobuild (==2024.2.4)", "sphinxcontrib-apidoc (==0.5.0)"] +mypy = ["mypy (==1.9.0)", "types-requests (>=2.31.0,<2.32.0)"] +test = ["coverage[toml] (>=7.0,<8.0)", "pytest (>=7.0,<8.0)", "pytest-clarity (>=1.0,<2.0)", "pytest-cov (>=5.0,<6.0)", "pytest-env (>=1.0,<2.0)", "pytest-lazy-fixture (>=0.6.3,<0.7.0)", "pytest-mock (>=3.0,<4.0)", "pytest-pretty (>=1.2,<2.0)", "pytest-xdist (>=3.0,<4.0)", "requests-mock (>=1.10,<2.0)", "responses (>=0.25.0,<0.26.0)", "types-pytest-lazy-fixture (>=0.6.3,<0.7.0)"] [[package]] name = "python-slugify" @@ -6696,6 +6953,7 @@ files = [ {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, + {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"}, {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, @@ -7184,28 +7442,28 @@ pyasn1 = ">=0.1.3" [[package]] name = "ruff" -version = "0.2.0" +version = "0.4.3" description = "An extremely fast Python linter and code formatter, written in Rust." 
optional = false python-versions = ">=3.7" files = [ - {file = "ruff-0.2.0-py3-none-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:638ea3294f800d18bae84a492cb5a245c8d29c90d19a91d8e338937a4c27fca0"}, - {file = "ruff-0.2.0-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:3ff35433fcf4dff6d610738712152df6b7d92351a1bde8e00bd405b08b3d5759"}, - {file = "ruff-0.2.0-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bf9faafbdcf4f53917019f2c230766da437d4fd5caecd12ddb68bb6a17d74399"}, - {file = "ruff-0.2.0-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:8153a3e4128ed770871c47545f1ae7b055023e0c222ff72a759f5a341ee06483"}, - {file = "ruff-0.2.0-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e8a75a98ae989a27090e9c51f763990ad5bbc92d20626d54e9701c7fe597f399"}, - {file = "ruff-0.2.0-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:87057dd2fdde297130ff99553be8549ca38a2965871462a97394c22ed2dfc19d"}, - {file = "ruff-0.2.0-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6d232f99d3ab00094ebaf88e0fb7a8ccacaa54cc7fa3b8993d9627a11e6aed7a"}, - {file = "ruff-0.2.0-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3d3c641f95f435fc6754b05591774a17df41648f0daf3de0d75ad3d9f099ab92"}, - {file = "ruff-0.2.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3826fb34c144ef1e171b323ed6ae9146ab76d109960addca730756dc19dc7b22"}, - {file = "ruff-0.2.0-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:eceab7d85d09321b4de18b62d38710cf296cb49e98979960a59c6b9307c18cfe"}, - {file = "ruff-0.2.0-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:30ad74687e1f4a9ff8e513b20b82ccadb6bd796fe5697f1e417189c5cde6be3e"}, - {file = "ruff-0.2.0-py3-none-musllinux_1_2_i686.whl", hash = "sha256:a7e3818698f8460bd0f8d4322bbe99db8327e9bc2c93c789d3159f5b335f47da"}, - {file = "ruff-0.2.0-py3-none-musllinux_1_2_x86_64.whl", hash = 
"sha256:edf23041242c48b0d8295214783ef543847ef29e8226d9f69bf96592dba82a83"}, - {file = "ruff-0.2.0-py3-none-win32.whl", hash = "sha256:e155147199c2714ff52385b760fe242bb99ea64b240a9ffbd6a5918eb1268843"}, - {file = "ruff-0.2.0-py3-none-win_amd64.whl", hash = "sha256:ba918e01cdd21e81b07555564f40d307b0caafa9a7a65742e98ff244f5035c59"}, - {file = "ruff-0.2.0-py3-none-win_arm64.whl", hash = "sha256:3fbaff1ba9564a2c5943f8f38bc221f04bac687cc7485e45237579fee7ccda79"}, - {file = "ruff-0.2.0.tar.gz", hash = "sha256:63856b91837606c673537d2889989733d7dffde553828d3b0f0bacfa6def54be"}, + {file = "ruff-0.4.3-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:b70800c290f14ae6fcbb41bbe201cf62dfca024d124a1f373e76371a007454ce"}, + {file = "ruff-0.4.3-py3-none-macosx_11_0_arm64.whl", hash = "sha256:08a0d6a22918ab2552ace96adeaca308833873a4d7d1d587bb1d37bae8728eb3"}, + {file = "ruff-0.4.3-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:eba1f14df3c758dd7de5b55fbae7e1c8af238597961e5fb628f3de446c3c40c5"}, + {file = "ruff-0.4.3-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:819fb06d535cc76dfddbfe8d3068ff602ddeb40e3eacbc90e0d1272bb8d97113"}, + {file = "ruff-0.4.3-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0bfc9e955e6dc6359eb6f82ea150c4f4e82b660e5b58d9a20a0e42ec3bb6342b"}, + {file = "ruff-0.4.3-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:510a67d232d2ebe983fddea324dbf9d69b71c4d2dfeb8a862f4a127536dd4cfb"}, + {file = "ruff-0.4.3-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:dc9ff11cd9a092ee7680a56d21f302bdda14327772cd870d806610a3503d001f"}, + {file = "ruff-0.4.3-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:29efff25bf9ee685c2c8390563a5b5c006a3fee5230d28ea39f4f75f9d0b6f2f"}, + {file = "ruff-0.4.3-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:18b00e0bcccf0fc8d7186ed21e311dffd19761cb632241a6e4fe4477cc80ef6e"}, + {file 
= "ruff-0.4.3-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:262f5635e2c74d80b7507fbc2fac28fe0d4fef26373bbc62039526f7722bca1b"}, + {file = "ruff-0.4.3-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:7363691198719c26459e08cc17c6a3dac6f592e9ea3d2fa772f4e561b5fe82a3"}, + {file = "ruff-0.4.3-py3-none-musllinux_1_2_i686.whl", hash = "sha256:eeb039f8428fcb6725bb63cbae92ad67b0559e68b5d80f840f11914afd8ddf7f"}, + {file = "ruff-0.4.3-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:927b11c1e4d0727ce1a729eace61cee88a334623ec424c0b1c8fe3e5f9d3c865"}, + {file = "ruff-0.4.3-py3-none-win32.whl", hash = "sha256:25cacda2155778beb0d064e0ec5a3944dcca9c12715f7c4634fd9d93ac33fd30"}, + {file = "ruff-0.4.3-py3-none-win_amd64.whl", hash = "sha256:7a1c3a450bc6539ef00da6c819fb1b76b6b065dec585f91456e7c0d6a0bbc725"}, + {file = "ruff-0.4.3-py3-none-win_arm64.whl", hash = "sha256:71ca5f8ccf1121b95a59649482470c5601c60a416bf189d553955b0338e34614"}, + {file = "ruff-0.4.3.tar.gz", hash = "sha256:ff0a3ef2e3c4b6d133fbedcf9586abfbe38d076041f2dc18ffb2c7e0485d5a07"}, ] [[package]] @@ -7620,30 +7878,51 @@ description = "Database Abstraction Library" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,>=2.7" files = [ + {file = "SQLAlchemy-1.4.50-cp310-cp310-macosx_11_0_x86_64.whl", hash = "sha256:54138aa80d2dedd364f4e8220eef284c364d3270aaef621570aa2bd99902e2e8"}, {file = "SQLAlchemy-1.4.50-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d00665725063692c42badfd521d0c4392e83c6c826795d38eb88fb108e5660e5"}, {file = "SQLAlchemy-1.4.50-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:85292ff52ddf85a39367057c3d7968a12ee1fb84565331a36a8fead346f08796"}, {file = "SQLAlchemy-1.4.50-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = 
"sha256:d0fed0f791d78e7767c2db28d34068649dfeea027b83ed18c45a423f741425cb"}, {file = "SQLAlchemy-1.4.50-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:db4db3c08ffbb18582f856545f058a7a5e4ab6f17f75795ca90b3c38ee0a8ba4"}, + {file = "SQLAlchemy-1.4.50-cp310-cp310-win32.whl", hash = "sha256:6c78e3fb4a58e900ec433b6b5f4efe1a0bf81bbb366ae7761c6e0051dd310ee3"}, + {file = "SQLAlchemy-1.4.50-cp310-cp310-win_amd64.whl", hash = "sha256:d55f7a33e8631e15af1b9e67c9387c894fedf6deb1a19f94be8731263c51d515"}, + {file = "SQLAlchemy-1.4.50-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:324b1fdd50e960a93a231abb11d7e0f227989a371e3b9bd4f1259920f15d0304"}, {file = "SQLAlchemy-1.4.50-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:14b0cacdc8a4759a1e1bd47dc3ee3f5db997129eb091330beda1da5a0e9e5bd7"}, {file = "SQLAlchemy-1.4.50-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1fb9cb60e0f33040e4f4681e6658a7eb03b5cb4643284172f91410d8c493dace"}, + {file = "SQLAlchemy-1.4.50-cp311-cp311-win32.whl", hash = "sha256:8bdab03ff34fc91bfab005e96f672ae207d87e0ac7ee716d74e87e7046079d8b"}, + {file = "SQLAlchemy-1.4.50-cp311-cp311-win_amd64.whl", hash = "sha256:52e01d60b06f03b0a5fc303c8aada405729cbc91a56a64cead8cb7c0b9b13c1a"}, + {file = "SQLAlchemy-1.4.50-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:77fde9bf74f4659864c8e26ac08add8b084e479b9a18388e7db377afc391f926"}, {file = "SQLAlchemy-1.4.50-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c4cb501d585aa74a0f86d0ea6263b9c5e1d1463f8f9071392477fd401bd3c7cc"}, {file = "SQLAlchemy-1.4.50-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8a7a66297e46f85a04d68981917c75723e377d2e0599d15fbe7a56abed5e2d75"}, + {file = "SQLAlchemy-1.4.50-cp312-cp312-win32.whl", hash = 
"sha256:e86c920b7d362cfa078c8b40e7765cbc34efb44c1007d7557920be9ddf138ec7"}, + {file = "SQLAlchemy-1.4.50-cp312-cp312-win_amd64.whl", hash = "sha256:6b3df20fbbcbcd1c1d43f49ccf3eefb370499088ca251ded632b8cbaee1d497d"}, + {file = "SQLAlchemy-1.4.50-cp36-cp36m-macosx_10_14_x86_64.whl", hash = "sha256:fb9adc4c6752d62c6078c107d23327aa3023ef737938d0135ece8ffb67d07030"}, {file = "SQLAlchemy-1.4.50-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c1db0221cb26d66294f4ca18c533e427211673ab86c1fbaca8d6d9ff78654293"}, {file = "SQLAlchemy-1.4.50-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b7dbe6369677a2bea68fe9812c6e4bbca06ebfa4b5cde257b2b0bf208709131"}, {file = "SQLAlchemy-1.4.50-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:a9bddb60566dc45c57fd0a5e14dd2d9e5f106d2241e0a2dc0c1da144f9444516"}, {file = "SQLAlchemy-1.4.50-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:82dd4131d88395df7c318eeeef367ec768c2a6fe5bd69423f7720c4edb79473c"}, + {file = "SQLAlchemy-1.4.50-cp36-cp36m-win32.whl", hash = "sha256:1b9c4359d3198f341480e57494471201e736de459452caaacf6faa1aca852bd8"}, + {file = "SQLAlchemy-1.4.50-cp36-cp36m-win_amd64.whl", hash = "sha256:35e4520f7c33c77f2636a1e860e4f8cafaac84b0b44abe5de4c6c8890b6aaa6d"}, + {file = "SQLAlchemy-1.4.50-cp37-cp37m-macosx_11_0_x86_64.whl", hash = "sha256:f5b1fb2943d13aba17795a770d22a2ec2214fc65cff46c487790192dda3a3ee7"}, {file = "SQLAlchemy-1.4.50-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:273505fcad22e58cc67329cefab2e436006fc68e3c5423056ee0513e6523268a"}, {file = "SQLAlchemy-1.4.50-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:a3257a6e09626d32b28a0c5b4f1a97bced585e319cfa90b417f9ab0f6145c33c"}, {file = "SQLAlchemy-1.4.50-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:d69738d582e3a24125f0c246ed8d712b03bd21e148268421e4a4d09c34f521a5"}, {file = "SQLAlchemy-1.4.50-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:34e1c5d9cd3e6bf3d1ce56971c62a40c06bfc02861728f368dcfec8aeedb2814"}, + {file = "SQLAlchemy-1.4.50-cp37-cp37m-win32.whl", hash = "sha256:7b4396452273aedda447e5aebe68077aa7516abf3b3f48408793e771d696f397"}, + {file = "SQLAlchemy-1.4.50-cp37-cp37m-win_amd64.whl", hash = "sha256:752f9df3dddbacb5f42d8405b2d5885675a93501eb5f86b88f2e47a839cf6337"}, + {file = "SQLAlchemy-1.4.50-cp38-cp38-macosx_11_0_x86_64.whl", hash = "sha256:35c7ed095a4b17dbc8813a2bfb38b5998318439da8e6db10a804df855e3a9e3a"}, {file = "SQLAlchemy-1.4.50-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f1fcee5a2c859eecb4ed179edac5ffbc7c84ab09a5420219078ccc6edda45436"}, {file = "SQLAlchemy-1.4.50-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fbaf6643a604aa17e7a7afd74f665f9db882df5c297bdd86c38368f2c471f37d"}, {file = "SQLAlchemy-1.4.50-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:2e70e0673d7d12fa6cd363453a0d22dac0d9978500aa6b46aa96e22690a55eab"}, {file = "SQLAlchemy-1.4.50-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8b881ac07d15fb3e4f68c5a67aa5cdaf9eb8f09eb5545aaf4b0a5f5f4659be18"}, + {file = "SQLAlchemy-1.4.50-cp38-cp38-win32.whl", hash = "sha256:8a219688297ee5e887a93ce4679c87a60da4a5ce62b7cb4ee03d47e9e767f558"}, + {file = "SQLAlchemy-1.4.50-cp38-cp38-win_amd64.whl", hash = 
"sha256:a648770db002452703b729bdcf7d194e904aa4092b9a4d6ab185b48d13252f63"}, + {file = "SQLAlchemy-1.4.50-cp39-cp39-macosx_11_0_x86_64.whl", hash = "sha256:4be4da121d297ce81e1ba745a0a0521c6cf8704634d7b520e350dce5964c71ac"}, {file = "SQLAlchemy-1.4.50-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3f6997da81114daef9203d30aabfa6b218a577fc2bd797c795c9c88c9eb78d49"}, {file = "SQLAlchemy-1.4.50-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bdb77e1789e7596b77fd48d99ec1d2108c3349abd20227eea0d48d3f8cf398d9"}, {file = "SQLAlchemy-1.4.50-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:128a948bd40780667114b0297e2cc6d657b71effa942e0a368d8cc24293febb3"}, {file = "SQLAlchemy-1.4.50-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f2d526aeea1bd6a442abc7c9b4b00386fd70253b80d54a0930c0a216230a35be"}, + {file = "SQLAlchemy-1.4.50-cp39-cp39-win32.whl", hash = "sha256:a7c9b9dca64036008962dd6b0d9fdab2dfdbf96c82f74dbd5d86006d8d24a30f"}, + {file = "SQLAlchemy-1.4.50-cp39-cp39-win_amd64.whl", hash = "sha256:df200762efbd672f7621b253721644642ff04a6ff957236e0e2fe56d9ca34d2c"}, {file = "SQLAlchemy-1.4.50.tar.gz", hash = "sha256:3b97ddf509fc21e10b09403b5219b06c5b558b27fc2453150274fa4e70707dbf"}, ] @@ -7856,17 +8135,6 @@ files = [ {file = "threadpoolctl-3.2.0.tar.gz", hash = "sha256:c96a0ba3bdddeaca37dc4cc7344aafad41cdb8c313f74fdfe387a867bba93355"}, ] -[[package]] -name = "toml" -version = "0.10.2" -description = "Python Library for Tom's Obvious, Minimal Language" -optional = false -python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" -files = [ - {file = "toml-0.10.2-py2.py3-none-any.whl", hash = "sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b"}, - {file = "toml-0.10.2.tar.gz", hash = 
"sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"}, -] - [[package]] name = "tomli" version = "2.0.1" @@ -7947,13 +8215,13 @@ test = ["black (>=22.3.0,<23.0.0)", "coverage (>=6.2,<7.0)", "isort (>=5.0.6,<6. [[package]] name = "typing-extensions" -version = "4.10.0" +version = "4.11.0" description = "Backported and Experimental Type Hints for Python 3.8+" optional = false python-versions = ">=3.8" files = [ - {file = "typing_extensions-4.10.0-py3-none-any.whl", hash = "sha256:69b1a937c3a517342112fb4c6df7e72fc39a38e7891a5730ed4985b5214b5475"}, - {file = "typing_extensions-4.10.0.tar.gz", hash = "sha256:b0abd7c89e8fb96f98db18d86106ff1d90ab692004eb746cf6eda2682f91b3cb"}, + {file = "typing_extensions-4.11.0-py3-none-any.whl", hash = "sha256:c1f94d72897edaf4ce775bb7558d5b79d8126906a14ea5ed1635921406c0387a"}, + {file = "typing_extensions-4.11.0.tar.gz", hash = "sha256:83f085bd5ca59c80295fc2a82ab5dac679cbe02b9f33f7d83af68e241bea51b0"}, ] [[package]] @@ -8480,4 +8748,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p [metadata] lock-version = "2.0" python-versions = "^3.10, <3.11" -content-hash = "8d85933db4bf8f4ca92ef227e9a767d26f4f7d2b360083449cfafc54878885fc" +content-hash = "f3118a34b55508e9d5876d1aefaa7754a8307277591c650fcd8c119033290552" diff --git a/pyproject.toml b/pyproject.toml index d94d8412b..818d0685a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,13 +35,13 @@ scikit-learn = "^1.3.2" pandas = "^2.1.4" [tool.poetry.dev-dependencies] -pre-commit = "^3.6.0" -mypy = "^1.8" +pre-commit = "^3.7.0" +mypy = "^1.10" pep8-naming = "^0.13.2" -interrogate = "^1.5.0" +interrogate = "^1.7.0" isort = "^5.13.2" darglint = "^1.8.1" -ruff = "^0.2.0" +ruff = "^0.4.3" [tool.poetry.group.docs.dependencies] mkdocs = "^1.5.3" @@ -58,7 +58,7 @@ pymdown-extensions = "^10.7" [tool.poetry.group.tests.dependencies] -pytest-cov = "^4.1.0" +pytest-cov = ">=4.1,<6.0" pytest-sugar = ">=0.9.5,<1.1.0" dbldatagen = "^0.3.1" 
pyparsing = "^3.1.1" @@ -74,7 +74,7 @@ apache-airflow = "^2.8.0" apache-airflow-providers-google = "^10.13.1" pydoclint = ">=0.3.8,<0.5.0" prettier = "^0.0.7" -deptry = "^0.12.0" +deptry = ">=0.12,<0.17" python-semantic-release = ">=8.7,<10.0" yamllint = "^1.33.0" diff --git a/src/airflow/dags/common_airflow.py b/src/airflow/dags/common_airflow.py index e3dc56ccb..f3ec89bb2 100644 --- a/src/airflow/dags/common_airflow.py +++ b/src/airflow/dags/common_airflow.py @@ -13,7 +13,7 @@ DataprocSubmitJobOperator, ) from airflow.utils.trigger_rule import TriggerRule -from google.cloud import dataproc_v1 +from google.cloud import dataproc_v1, storage if TYPE_CHECKING: from pathlib import Path @@ -59,6 +59,22 @@ } +def check_gcp_folder_exists(bucket_name: str, folder_path: str) -> bool: + """Check if a folder exists in a Google Cloud bucket. + + Args: + bucket_name (str): The name of the Google Cloud bucket. + folder_path (str): The path of the folder to check. + + Returns: + bool: True if the folder exists, False otherwise. + """ + client = storage.Client() + bucket = client.get_bucket(bucket_name) + blobs = bucket.list_blobs(prefix=folder_path) + return any(blobs) + + def create_cluster( cluster_name: str, master_machine_type: str = "n1-highmem-16", @@ -111,9 +127,9 @@ def create_cluster( # Create a disk config section if it does not exist. cluster_config[worker_section].setdefault("disk_config", {}) # Specify the number of local SSDs. - cluster_config[worker_section]["disk_config"][ - "num_local_ssds" - ] = num_local_ssds + cluster_config[worker_section]["disk_config"]["num_local_ssds"] = ( + num_local_ssds + ) # Return the cluster creation operator. 
return DataprocCreateClusterOperator( diff --git a/src/airflow/dags/configs/dag.yaml b/src/airflow/dags/configs/dag.yaml index 0b634caaa..b6d343343 100644 --- a/src/airflow/dags/configs/dag.yaml +++ b/src/airflow/dags/configs/dag.yaml @@ -4,15 +4,18 @@ prerequisites: - "ot_variant_index" - "ot_gene_index" -- id: "ot_colocalisation" +- id: "ot_colocalisation_ecaviar" +- id: "ot_colocalisation_coloc" - id: "ot_locus_to_gene_train" prerequisites: - "ot_variant_index" - "ot_variant_to_gene" - - "ot_colocalisation" + - "ot_colocalisation_ecaviar" + - "ot_colocalisation_coloc" - id: "ot_locus_to_gene_predict" prerequisites: - "ot_locus_to_gene_train" - "ot_variant_index" - "ot_variant_to_gene" - - "ot_colocalisation" + - "ot_colocalisation_ecaviar" + - "ot_colocalisation_coloc" diff --git a/src/airflow/dags/dag_genetics_etl.py b/src/airflow/dags/dag_genetics_etl.py deleted file mode 100644 index 1d27b8bba..000000000 --- a/src/airflow/dags/dag_genetics_etl.py +++ /dev/null @@ -1,35 +0,0 @@ -"""Airflow DAG for the ETL part of the pipeline.""" -from __future__ import annotations - -from pathlib import Path - -import common_airflow as common -from airflow.models.dag import DAG - -CLUSTER_NAME = "otg-etl" -SOURCE_CONFIG_FILE_PATH = Path(__file__).parent / "configs" / "dag.yaml" - - -with DAG( - dag_id=Path(__file__).stem, - description="Open Targets Genetics ETL workflow", - default_args=common.shared_dag_args, - **common.shared_dag_kwargs, -): - # Parse and define all steps and their prerequisites. - tasks = {} - steps = common.read_yaml_config(SOURCE_CONFIG_FILE_PATH) - for step in steps: - # Define task for the current step. - step_id = step["id"] - this_task = common.submit_step( - cluster_name=CLUSTER_NAME, - step_id=step_id, - task_id=step_id, - ) - # Chain prerequisites. - tasks[step_id] = this_task - for prerequisite in step.get("prerequisites", []): - this_task.set_upstream(tasks[prerequisite]) - # Construct the DAG with all tasks. 
- dag = common.generate_dag(cluster_name=CLUSTER_NAME, tasks=list(tasks.values())) diff --git a/src/airflow/dags/eqtl_preprocess.py b/src/airflow/dags/eqtl_preprocess.py index ec346d137..5433d8968 100644 --- a/src/airflow/dags/eqtl_preprocess.py +++ b/src/airflow/dags/eqtl_preprocess.py @@ -15,11 +15,11 @@ AUTOSCALING = "eqtl-preprocess" PROJECT_ID = "open-targets-genetics-dev" -EQTL_CATALOG_SUSIE_LOCATION = "gs://eqtl_catalog_data/ebi_ftp/susie" -TEMP_DECOMPRESS_LOCATION = "gs://eqtl_catalog_data/susie_decompressed_tmp" +EQTL_CATALOG_SUSIE_LOCATION = "gs://eqtl_catalogue_data/ebi_ftp/susie" +TEMP_DECOMPRESS_LOCATION = "gs://eqtl_catalogue_data/susie_decompressed_tmp" DECOMPRESS_FAILED_LOG = f"{TEMP_DECOMPRESS_LOCATION}.log" -STUDY_INDEX_PATH = "gs://eqtl_catalog_data/study_index" -CREDIBLE_SET_PATH = "gs://eqtl_catalog_data/credible_set_datasets/susie" +STUDY_INDEX_PATH = "gs://eqtl_catalogue_data/study_index" +CREDIBLE_SET_PATH = "gs://eqtl_catalogue_data/credible_set_datasets/susie" with DAG( dag_id=Path(__file__).stem, diff --git a/src/airflow/dags/finngen_preprocess.py b/src/airflow/dags/finngen_preprocess.py index 38aeb2510..d4fc9446d 100644 --- a/src/airflow/dags/finngen_preprocess.py +++ b/src/airflow/dags/finngen_preprocess.py @@ -26,7 +26,7 @@ ) FINNGEN_FM_SUMMARIES = "gs://genetics_etl_python_playground/input/Finngen_susie_finemapping_r10/Finngen_susie_credset_summary_r10.tsv" FINNGEN_PREFIX = "FINNGEN_R10_" -FINNGEN_FM_OUT = "gs://genetics_etl_python_playground/output/python_etl/parquet/XX.XX/finngen_susie_processed" +FINNGEN_FM_OUT = f"{FINNGEN_BUCKET}/credible_set_datasets/finngen_susie" with DAG( dag_id=Path(__file__).stem, diff --git a/src/airflow/dags/genetics_etl.py b/src/airflow/dags/genetics_etl.py new file mode 100644 index 000000000..ae510131c --- /dev/null +++ b/src/airflow/dags/genetics_etl.py @@ -0,0 +1,160 @@ +"""Test DAG to prototype data transfer.""" + +from __future__ import annotations + +from pathlib import Path + +import 
common_airflow as common +from airflow.models.dag import DAG +from airflow.operators.python import ShortCircuitOperator +from airflow.providers.google.cloud.transfers.gcs_to_gcs import GCSToGCSOperator +from airflow.utils.task_group import TaskGroup + +CLUSTER_NAME = "otg-etl" +SOURCE_CONFIG_FILE_PATH = Path(__file__).parent / "configs" / "dag.yaml" + +# Release specific variables: +RELEASE_VERSION = "24.03" +RELEASE_BUCKET_NAME = "genetics_etl_python_playground" + +# Datasource paths: +GWAS_CATALOG_BUCKET_NAME = "gwas_catalog_data" +EQTL_BUCKET_NAME = "eqtl_catalogue_data" +FINNGEN_BUCKET_NAME = "finngen_data" +FINNGEN_RELEASE = "r10" + +# Files to move: +DATA_TO_MOVE = { + # GWAS Catalog summary study index: + "gwas_catalog_study_index": { + "source_bucket": GWAS_CATALOG_BUCKET_NAME, + "source_object": "study_index", + "destination_bucket": RELEASE_BUCKET_NAME, + "destination_object": f"releases/{RELEASE_VERSION}/study_index/gwas_catalog", + }, + # PICS credible sets from GWAS Catalog curated associations: + "gwas_catalog_curated_credible_set": { + "source_bucket": GWAS_CATALOG_BUCKET_NAME, + "source_object": "credible_set_datasets/gwas_catalog_PICSed_curated_associations", + "destination_bucket": RELEASE_BUCKET_NAME, + "destination_object": f"releases/{RELEASE_VERSION}/credible_set/gwas_catalog_PICSed_curated_associations", + }, + # PICS credible sets from GWAS Catalog summary statistics: + "gwas_catalog_sumstats_credible_set": { + "source_bucket": GWAS_CATALOG_BUCKET_NAME, + "source_object": "credible_set_datasets/gwas_catalog_PICSed_summary_statistics", + "destination_bucket": RELEASE_BUCKET_NAME, + "destination_object": f"releases/{RELEASE_VERSION}/credible_set/gwas_catalog_PICSed_summary_statistics", + }, + # GWAS Catalog manifest files: + "gwas_catalog_manifests": { + "source_bucket": GWAS_CATALOG_BUCKET_NAME, + "source_object": "manifests", + "destination_bucket": RELEASE_BUCKET_NAME, + "destination_object": f"releases/{RELEASE_VERSION}/manifests", + }, + 
# eQTL Catalog study index: + "eqtl_catalogue_study_index": { + "source_bucket": EQTL_BUCKET_NAME, + "source_object": "study_index", + "destination_bucket": RELEASE_BUCKET_NAME, + "destination_object": f"releases/{RELEASE_VERSION}/study_index/eqtl_catalogue", + }, + # eQTL Catalog SuSiE credible sets: + "eqtl_catalogue_susie_credible_set": { + "source_bucket": EQTL_BUCKET_NAME, + "source_object": "credible_set_datasets/susie", + "destination_bucket": RELEASE_BUCKET_NAME, + "destination_object": f"releases/{RELEASE_VERSION}/credible_set/eqtl_catalogue_susie", + }, + # Finngen study index: + "finngen_study_index": { + "source_bucket": FINNGEN_BUCKET_NAME, + "source_object": f"{FINNGEN_RELEASE}/study_index", + "destination_bucket": RELEASE_BUCKET_NAME, + "destination_object": f"releases/{RELEASE_VERSION}/study_index/finngen", + }, + # Finngen PICS credible sets: + "finngen_PICS_credible_set": { + "source_bucket": FINNGEN_BUCKET_NAME, + "source_object": f"{FINNGEN_RELEASE}/credible_set_datasets/finngen_pics", + "destination_bucket": RELEASE_BUCKET_NAME, + "destination_object": f"releases/{RELEASE_VERSION}/credible_set/finngen_pics", + }, + # Finngen SuSiE credible sets: + "finngen_susie_credible_set": { + "source_bucket": FINNGEN_BUCKET_NAME, + "source_object": f"{FINNGEN_RELEASE}/credible_set_datasets/finngen_susie_processed", + "destination_bucket": RELEASE_BUCKET_NAME, + "destination_object": f"releases/{RELEASE_VERSION}/credible_set/finngen_susie", + }, + # L2G gold standard: + "gold_standard": { + "source_bucket": "genetics_etl_python_playground", + "source_object": "input/l2g/gold_standard/curation.json", + "destination_bucket": RELEASE_BUCKET_NAME, + "destination_object": f"releases/{RELEASE_VERSION}/locus_to_gene_gold_standard.json", + }, +} + + +# This operator is meant to stop the DAG (skip all downstream tasks) if the release folder already exists: +ensure_release_folder_not_exists = ShortCircuitOperator( + task_id="test_release_folder_exists", + python_callable=lambda bucket, path: not 
common.check_gcp_folder_exists( + bucket, path + ), + op_kwargs={ + "bucket": RELEASE_BUCKET_NAME, + "path": f"releases/{RELEASE_VERSION}", + }, +) + +with DAG( + dag_id=Path(__file__).stem, + description="Open Targets Genetics ETL workflow", + default_args=common.shared_dag_args, + **common.shared_dag_kwargs, +): + # Compiling tasks for moving data to the right place: + with TaskGroup(group_id="data_transfer") as data_transfer: + # Defining the tasks to execute in the task group: + [ + GCSToGCSOperator( + task_id=f"move_{data_name}", + source_bucket=data["source_bucket"], + source_object=data["source_object"], + destination_bucket=data["destination_bucket"], + destination_object=data["destination_object"], + ) + for data_name, data in DATA_TO_MOVE.items() + ] + + with TaskGroup(group_id="genetics_etl") as genetics_etl: + # Parse and define all steps and their prerequisites. + tasks = {} + steps = common.read_yaml_config(SOURCE_CONFIG_FILE_PATH) + for step in steps: + # Define task for the current step. + step_id = step["id"] + this_task = common.submit_step( + cluster_name=CLUSTER_NAME, + step_id=step_id, + task_id=step_id, + ) + # Chain prerequisites. 
+ tasks[step_id] = this_task + for prerequisite in step.get("prerequisites", []): + this_task.set_upstream(tasks[prerequisite]) + + common.generate_dag(cluster_name=CLUSTER_NAME, tasks=list(tasks.values())) + + # DAG description: + ( + # Test that the release folder doesn't exist: + ensure_release_folder_not_exists + # Run data transfer: + >> data_transfer + # Once datasets are transferred, run the rest of the steps: + >> genetics_etl + ) diff --git a/src/airflow/dags/dag_preprocess.py b/src/airflow/dags/gnomad_preprocess.py similarity index 94% rename from src/airflow/dags/dag_preprocess.py rename to src/airflow/dags/gnomad_preprocess.py index 4439914c5..22e7fa056 100644 --- a/src/airflow/dags/dag_preprocess.py +++ b/src/airflow/dags/gnomad_preprocess.py @@ -1,4 +1,5 @@ """Airflow DAG for the Preprocess part of the pipeline.""" + from __future__ import annotations from pathlib import Path @@ -6,7 +7,7 @@ import common_airflow as common from airflow.models.dag import DAG -CLUSTER_NAME = "otg-preprocess" +CLUSTER_NAME = "gnomad-preprocess" ALL_STEPS = [ "ot_ld_index", diff --git a/src/airflow/dags/gwas_catalog_preprocess.py b/src/airflow/dags/gwas_catalog_preprocess.py index 36130c87e..1814ddf2d 100644 --- a/src/airflow/dags/gwas_catalog_preprocess.py +++ b/src/airflow/dags/gwas_catalog_preprocess.py @@ -45,12 +45,8 @@ WINDOW_BASED_CLUMPED = f"gs://{GWAS_CATALOG_BUCKET_NAME}/study_locus_datasets/gwas_catalog_summary_stats_window_clumped" LD_BASED_CLUMPED = f"gs://{GWAS_CATALOG_BUCKET_NAME}/study_locus_datasets/gwas_catalog_summary_stats_ld_clumped" # Credible sets: -CURATED_CREDIBLE_SETS = ( - f"gs://{GWAS_CATALOG_BUCKET_NAME}/credible_set_datasets/gwas_catalog_curated" -) -SUMMARY_STATISTICS_CREDIBLE_SETS = ( - f"gs://{GWAS_CATALOG_BUCKET_NAME}/credible_set_datasets/gwas_catalog_summary_stats" -) +CURATED_CREDIBLE_SETS = f"gs://{GWAS_CATALOG_BUCKET_NAME}/credible_set_datasets/gwas_catalog_PICSed_curated_associations" +SUMMARY_STATISTICS_CREDIBLE_SETS = 
f"gs://{GWAS_CATALOG_BUCKET_NAME}/credible_set_datasets/gwas_catalog_PICSed_summary_statistics" def upload_harmonized_study_list( diff --git a/src/airflow/dags/gwas_curation_update.py b/src/airflow/dags/gwas_curation_update.py index 1ef0f39f9..830007e6d 100644 --- a/src/airflow/dags/gwas_curation_update.py +++ b/src/airflow/dags/gwas_curation_update.py @@ -18,7 +18,7 @@ ): update_gwas_curation = common.submit_step( cluster_name=CLUSTER_NAME, - step_id="gwas_catalog_curation_update", + step_id="ot_gwas_catalog_study_curation", task_id="gwas_catalog_curation_update", other_args=[ f"step.gwas_catalog_study_curation_out=gs://genetics_etl_python_playground/input/v2d/GWAS_Catalog_study_curation_{RUN_DATE}.tsv", diff --git a/src/gentropy/assets/schemas/l2g_feature_matrix.json b/src/gentropy/assets/schemas/l2g_feature_matrix.json index a1b58040b..114936168 100644 --- a/src/gentropy/assets/schemas/l2g_feature_matrix.json +++ b/src/gentropy/assets/schemas/l2g_feature_matrix.json @@ -125,6 +125,30 @@ "name": "sqtlColocLlrMaximumNeighborhood", "nullable": true, "type": "float" + }, + { + "metadata": {}, + "name": "tuqtlColocClppMaximum", + "nullable": true, + "type": "float" + }, + { + "metadata": {}, + "name": "tuqtlColocClppMaximumNeighborhood", + "nullable": true, + "type": "float" + }, + { + "metadata": {}, + "name": "tuqtlColocLlrMaximum", + "nullable": true, + "type": "float" + }, + { + "metadata": {}, + "name": "tuqtlColocLlrMaximumNeighborhood", + "nullable": true, + "type": "float" } ], "type": "struct" diff --git a/src/gentropy/assets/schemas/pairwise_ld.json b/src/gentropy/assets/schemas/pairwise_ld.json new file mode 100644 index 000000000..bac781ac3 --- /dev/null +++ b/src/gentropy/assets/schemas/pairwise_ld.json @@ -0,0 +1,23 @@ +{ + "fields": [ + { + "metadata": {}, + "name": "variantIdI", + "nullable": false, + "type": "string" + }, + { + "metadata": {}, + "name": "variantIdJ", + "nullable": false, + "type": "string" + }, + { + "metadata": {}, + "name": "r", + 
"nullable": false, + "type": "double" + } + ], + "type": "struct" +} diff --git a/src/gentropy/assets/schemas/study_locus.json b/src/gentropy/assets/schemas/study_locus.json index e0f366d68..bf18a73b8 100644 --- a/src/gentropy/assets/schemas/study_locus.json +++ b/src/gentropy/assets/schemas/study_locus.json @@ -42,6 +42,12 @@ "nullable": true, "type": "double" }, + { + "metadata": {}, + "name": "zScore", + "nullable": true, + "type": "double" + }, { "metadata": {}, "name": "pValueMantissa", @@ -100,6 +106,18 @@ "nullable": true, "type": "double" }, + { + "metadata": {}, + "name": "purityMeanR2", + "nullable": true, + "type": "double" + }, + { + "metadata": {}, + "name": "purityMinR2", + "nullable": true, + "type": "double" + }, { "metadata": {}, "name": "sampleSize", diff --git a/src/gentropy/assets/schemas/variant_annotation.json b/src/gentropy/assets/schemas/variant_annotation.json index 826eabaf9..ab8767389 100644 --- a/src/gentropy/assets/schemas/variant_annotation.json +++ b/src/gentropy/assets/schemas/variant_annotation.json @@ -19,12 +19,6 @@ "nullable": false, "metadata": {} }, - { - "name": "gnomadVariantId", - "type": "string", - "nullable": false, - "metadata": {} - }, { "name": "referenceAllele", "type": "string", diff --git a/src/gentropy/clump.py b/src/gentropy/clump.py deleted file mode 100644 index 9ea3306ac..000000000 --- a/src/gentropy/clump.py +++ /dev/null @@ -1,70 +0,0 @@ -"""Step to run clump associations from summary statistics or study locus.""" -from __future__ import annotations - -from typing import Optional - -from gentropy.common.session import Session -from gentropy.dataset.ld_index import LDIndex -from gentropy.dataset.study_index import StudyIndex -from gentropy.dataset.study_locus import StudyLocus -from gentropy.dataset.summary_statistics import SummaryStatistics - - -class ClumpStep: - """Perform clumping of an association dataset to identify independent signals. 
- - Two types of clumping are supported and are applied based on the input dataset: - - Clumping of summary statistics based on a window-based approach. - - Clumping of study locus based on LD. - - Both approaches yield a StudyLocus dataset. - """ - - def __init__( - self, - session: Session, - input_path: str, - clumped_study_locus_path: str, - study_index_path: Optional[str] = None, - ld_index_path: Optional[str] = None, - locus_collect_distance: Optional[int] = None, - ) -> None: - """Run the clumping step. - - Args: - session (Session): Session object. - input_path (str): Input path for the study locus or summary statistics files. - clumped_study_locus_path (str): Output path for the clumped study locus dataset. - study_index_path (Optional[str]): Input path for the study index dataset. - ld_index_path (Optional[str]): Input path for the LD index dataset. - locus_collect_distance (Optional[int]): Distance in base pairs to collect variants around the study locus. - - Raises: - ValueError: If study index and LD index paths are not provided for study locus. - """ - input_cols = session.spark.read.parquet( - input_path, recursiveFileLookup=True - ).columns - if "studyLocusId" in input_cols: - if study_index_path is None or ld_index_path is None: - raise ValueError( - "Study index and LD index paths are required for clumping study locus." 
- ) - study_locus = StudyLocus.from_parquet(session, input_path) - ld_index = LDIndex.from_parquet(session, ld_index_path) - study_index = StudyIndex.from_parquet(session, study_index_path) - - clumped_study_locus = study_locus.annotate_ld( - study_index=study_index, ld_index=ld_index - ).clump() - else: - sumstats = SummaryStatistics.from_parquet( - session, input_path, recursiveFileLookup=True - ).coalesce(4000) - clumped_study_locus = sumstats.window_based_clumping( - locus_collect_distance=locus_collect_distance - ) - - clumped_study_locus.df.write.mode(session.write_mode).parquet( - clumped_study_locus_path - ) diff --git a/src/gentropy/colocalisation.py b/src/gentropy/colocalisation.py index 010827713..1d71ed447 100644 --- a/src/gentropy/colocalisation.py +++ b/src/gentropy/colocalisation.py @@ -1,10 +1,15 @@ """Step to generate colocalisation results.""" from __future__ import annotations +import inspect +from importlib import import_module + +from pyspark.sql.functions import col + from gentropy.common.session import Session from gentropy.dataset.study_index import StudyIndex from gentropy.dataset.study_locus import CredibleInterval, StudyLocus -from gentropy.method.colocalisation import ECaviar +from gentropy.method.colocalisation import Coloc class ColocalisationStep: @@ -19,6 +24,7 @@ def __init__( credible_set_path: str, study_index_path: str, coloc_path: str, + colocalisation_method: str, ) -> None: """Run Colocalisation step. @@ -27,10 +33,18 @@ def __init__( credible_set_path (str): Input credible sets path. study_index_path (str): Input study index path. coloc_path (str): Output Colocalisation path. + colocalisation_method (str): Colocalisation method. 
""" + colocalisation_class = self._get_colocalisation_class(colocalisation_method) # Extract - credible_set = StudyLocus.from_parquet( - session, credible_set_path, recursiveFileLookup=True + credible_set = ( + StudyLocus.from_parquet( + session, credible_set_path, recursiveFileLookup=True + ).filter(col("finemappingMethod") == "SuSie") + if colocalisation_class is Coloc + else StudyLocus.from_parquet( + session, credible_set_path, recursiveFileLookup=True + ) ) si = StudyIndex.from_parquet( session, study_index_path, recursiveFileLookup=True @@ -40,7 +54,39 @@ def __init__( overlaps = credible_set.filter_credible_set( CredibleInterval.IS95 ).find_overlaps(si) - ecaviar_results = ECaviar.colocalise(overlaps) + colocalisation_results = colocalisation_class.colocalise(overlaps) # type: ignore # Load - ecaviar_results.df.write.mode(session.write_mode).parquet(coloc_path) + colocalisation_results.df.write.mode(session.write_mode).parquet( + f"{coloc_path}/{colocalisation_method.lower()}" + ) + + @classmethod + def _get_colocalisation_class(cls: type[ColocalisationStep], method: str) -> type: + """Get colocalisation class. + + Args: + method (str): Colocalisation method. + + Returns: + type: Colocalisation class. + + Raises: + ValueError: if method not available. + + Examples: + >>> ColocalisationStep._get_colocalisation_class("ECaviar") + + """ + module_name = "gentropy.method.colocalisation" + module = import_module(module_name) + + available_methods = [] + for class_name, class_obj in inspect.getmembers(module, inspect.isclass): + if class_obj.__module__ == module_name: + available_methods.append(class_name) + if class_name == method: + return class_obj + raise ValueError( + f"Method {method} is not supported. 
Available: {(', ').join(available_methods)}" + ) diff --git a/src/gentropy/common/utils.py b/src/gentropy/common/utils.py index 4cdda9ad2..3ec361a50 100644 --- a/src/gentropy/common/utils.py +++ b/src/gentropy/common/utils.py @@ -1,4 +1,5 @@ """Common functions in the Genetics datasets.""" + from __future__ import annotations import sys @@ -208,41 +209,6 @@ def parse_pvalue(pv: Column) -> list[Column]: ] -def convert_gnomad_position_to_ensembl( - position: Column, reference: Column, alternate: Column -) -> Column: - """Convert GnomAD variant position to Ensembl variant position. - - For indels (the reference or alternate allele is longer than 1), then adding 1 to the position, for SNPs, - the position is unchanged. More info about the problem: https://www.biostars.org/p/84686/ - - Args: - position (Column): Position of the variant in GnomAD's coordinates system. - reference (Column): The reference allele in GnomAD's coordinates system. - alternate (Column): The alternate allele in GnomAD's coordinates system. - - Returns: - Column: The position of the variant in the Ensembl genome. 
- - Examples: - >>> d = [(1, "A", "C"), (2, "AA", "C"), (3, "A", "AA")] - >>> df = spark.createDataFrame(d).toDF("position", "reference", "alternate") - >>> df.withColumn("new_position", convert_gnomad_position_to_ensembl(f.col("position"), f.col("reference"), f.col("alternate"))).show() - +--------+---------+---------+------------+ - |position|reference|alternate|new_position| - +--------+---------+---------+------------+ - | 1| A| C| 1| - | 2| AA| C| 3| - | 3| A| AA| 4| - +--------+---------+---------+------------+ - - - """ - return f.when( - (f.length(reference) > 1) | (f.length(alternate) > 1), position + 1 - ).otherwise(position) - - def _liftover_loci( variant_index: Table, chain_path: str, dest_reference_genome: str ) -> Table: diff --git a/src/gentropy/config.py b/src/gentropy/config.py index 7202ee578..59452fed8 100644 --- a/src/gentropy/config.py +++ b/src/gentropy/config.py @@ -37,6 +37,7 @@ class ColocalisationConfig(StepConfig): credible_set_path: str = MISSING study_index_path: str = MISSING coloc_path: str = MISSING + colocalisation_method: str = MISSING _target_: str = "gentropy.colocalisation.ColocalisationStep" @@ -182,6 +183,7 @@ class LocusToGeneConfig(StepConfig): "spark.dynamicAllocation.enabled": "false", "spark.driver.memory": "48g", "spark.executor.memory": "48g", + "spark.sql.shuffle.partitions": "800", } } ) @@ -213,13 +215,17 @@ class LocusToGeneConfig(StepConfig): # max clpp for each (study, locus) aggregating over all eQTLs "eqtlColocClppMaximumNeighborhood", # max clpp for each (study, locus, gene) aggregating over all pQTLs - # "pqtlColocClppMaximum", + "pqtlColocClppMaximum", # max clpp for each (study, locus) aggregating over all pQTLs - # "pqtlColocClppMaximumNeighborhood", + "pqtlColocClppMaximumNeighborhood", # max clpp for each (study, locus, gene) aggregating over all sQTLs - # "sqtlColocClppMaximum", + "sqtlColocClppMaximum", # max clpp for each (study, locus) aggregating over all sQTLs - # 
"sqtlColocClppMaximumNeighborhood", + "sqtlColocClppMaximumNeighborhood", + # max clpp for each (study, locus, gene) aggregating over all tuQTLs + "tuqtlColocClppMaximum", + # max clpp for each (study, locus) aggregating over all tuQTLs + "tuqtlColocClppMaximumNeighborhood", # # max log-likelihood ratio value for each (study, locus, gene) aggregating over all eQTLs # "eqtlColocLlrLocalMaximum", # # max log-likelihood ratio value for each (study, locus) aggregating over all eQTLs @@ -316,12 +322,45 @@ class WindowBasedClumpingStep(StepConfig): summary_statistics_input_path: str = MISSING study_locus_output_path: str = MISSING + distance: int = 500_000 + collect_locus: bool = False + collect_locus_distance: int = 500_000 inclusion_list_path: str | None = None - locus_collect_distance: str | None = None - _target_: str = "gentropy.window_based_clumping.WindowBasedClumpingStep" +@dataclass +class FinemapperConfig(StepConfig): + """SuSiE fine-mapper step configuration.""" + + session: Any = field( + default_factory=lambda: { + "start_hail": True, + } + ) + study_locus_to_finemap: str = MISSING + study_locus_collected_path: str = MISSING + study_index_path: str = MISSING + output_path: str = MISSING + locus_radius: int = MISSING + max_causal_snps: int = MISSING + primary_signal_pval_threshold: float = MISSING + secondary_signal_pval_threshold: float = MISSING + purity_mean_r2_threshold: float = MISSING + purity_min_r2_threshold: float = MISSING + cs_lbf_th: float = MISSING + sum_pips: float = MISSING + logging: bool = MISSING + susie_est_tausq: bool = MISSING + run_carma: bool = MISSING + run_sumstat_imputation: bool = MISSING + carma_time_limit: int = MISSING + imputed_r2_threshold: float = MISSING + ld_score_threshold: float = MISSING + output_path_log: str = MISSING + _target_: str = "gentropy.susie_finemapper.SusieFineMapperStep" + + @dataclass class Config: """Application configuration.""" @@ -379,3 +418,4 @@ def register_config() -> None: cs.store(group="step", 
name="variant_index", node=VariantIndexConfig) cs.store(group="step", name="variant_to_gene", node=VariantToGeneConfig) cs.store(group="step", name="window_based_clumping", node=WindowBasedClumpingStep) + cs.store(group="step", name="susie_finemapping", node=FinemapperConfig) diff --git a/src/gentropy/dataset/l2g_feature_matrix.py b/src/gentropy/dataset/l2g_feature_matrix.py index fa84499dc..e5be1a019 100644 --- a/src/gentropy/dataset/l2g_feature_matrix.py +++ b/src/gentropy/dataset/l2g_feature_matrix.py @@ -40,7 +40,7 @@ def __post_init__(self: L2GFeatureMatrix) -> None: def generate_features( cls: Type[L2GFeatureMatrix], features_list: list[str], - study_locus: StudyLocus, + credible_set: StudyLocus, study_index: StudyIndex, variant_gene: V2G, colocalisation: Colocalisation, @@ -49,7 +49,7 @@ def generate_features( Args: features_list (list[str]): List of features to generate - study_locus (StudyLocus): Study locus dataset + credible_set (StudyLocus): Credible set dataset study_index (StudyIndex): Study index dataset variant_gene (V2G): Variant to gene dataset colocalisation (Colocalisation): Colocalisation dataset @@ -62,11 +62,13 @@ def generate_features( """ if features_dfs := [ # Extract features - ColocalisationFactory._get_coloc_features( - study_locus, study_index, colocalisation + ColocalisationFactory._get_max_coloc_per_credible_set( + colocalisation, + credible_set, + study_index, ).df, - StudyLocusFactory._get_tss_distance_features(study_locus, variant_gene).df, - StudyLocusFactory._get_vep_features(study_locus, variant_gene).df, + StudyLocusFactory._get_tss_distance_features(credible_set, variant_gene).df, + StudyLocusFactory._get_vep_features(credible_set, variant_gene).df, ]: fm = reduce( lambda x, y: x.unionByName(y), @@ -162,8 +164,6 @@ def train_test_split( """ train, test = self._df.randomSplit([fraction, 1 - fraction], seed=42) return ( - L2GFeatureMatrix( - _df=train, _schema=L2GFeatureMatrix.get_schema() - ).persist(), - 
L2GFeatureMatrix(_df=test, _schema=L2GFeatureMatrix.get_schema()).persist(), + L2GFeatureMatrix(_df=train, _schema=L2GFeatureMatrix.get_schema()), + L2GFeatureMatrix(_df=test, _schema=L2GFeatureMatrix.get_schema()), ) diff --git a/src/gentropy/dataset/l2g_prediction.py b/src/gentropy/dataset/l2g_prediction.py index e24688da3..c4eccdf45 100644 --- a/src/gentropy/dataset/l2g_prediction.py +++ b/src/gentropy/dataset/l2g_prediction.py @@ -44,7 +44,7 @@ def from_credible_set( cls: Type[L2GPrediction], model_path: str, features_list: list[str], - study_locus: StudyLocus, + credible_set: StudyLocus, study_index: StudyIndex, v2g: V2G, coloc: Colocalisation, @@ -54,7 +54,7 @@ def from_credible_set( Args: model_path (str): Path to the fitted model features_list (list[str]): List of features to use for the model - study_locus (StudyLocus): Study locus dataset + credible_set (StudyLocus): Credible set dataset study_index (StudyIndex): Study index dataset v2g (V2G): Variant to gene dataset coloc (Colocalisation): Colocalisation dataset @@ -64,7 +64,7 @@ def from_credible_set( """ fm = L2GFeatureMatrix.generate_features( features_list=features_list, - study_locus=study_locus, + credible_set=credible_set, study_index=study_index, variant_gene=v2g, colocalisation=coloc, @@ -73,7 +73,7 @@ def from_credible_set( gwas_fm = L2GFeatureMatrix( _df=( fm.df.join( - study_locus.filter_by_study_type("gwas", study_index).df, + credible_set.filter_by_study_type("gwas", study_index).df, on="studyLocusId", ) ), diff --git a/src/gentropy/dataset/pairwise_ld.py b/src/gentropy/dataset/pairwise_ld.py new file mode 100644 index 000000000..9650efa32 --- /dev/null +++ b/src/gentropy/dataset/pairwise_ld.py @@ -0,0 +1,104 @@ +"""Pairwise LD dataset.""" +from __future__ import annotations + +from dataclasses import dataclass, field +from math import sqrt +from typing import TYPE_CHECKING + +import numpy as np +from pyspark.sql import functions as f +from pyspark.sql import types as t + +from 
gentropy.common.schemas import parse_spark_schema +from gentropy.dataset.dataset import Dataset + +if TYPE_CHECKING: + from pyspark.sql.types import StructType + + +@dataclass +class PairwiseLD(Dataset): + """Pairwise variant correlation dataset. + + This class captures logic applied on pairwise linkage data + by validation ensuring data quality. + """ + + dimension: tuple[int, int] = field(init=False) + + def __post_init__(self: PairwiseLD) -> None: + """Validating the dataset upon creation. + + - Besides the schema, a pairwise LD table is expected to have a square number of rows. + """ + row_count = self.df.count() + + assert ( + int(sqrt(row_count)) == sqrt(row_count) + ), f"The number of rows in a pairwise LD table has to be square. Got: {row_count}" + + self.dimension = (int(sqrt(row_count)), int(sqrt(row_count))) + + @classmethod + def get_schema(cls: type[PairwiseLD]) -> StructType: + """Provide the schema for the PairwiseLD dataset. + + Returns: + StructType: The schema of the PairwiseLD dataset. + """ + return parse_spark_schema("pairwise_ld.json") + + def overlap_with_locus(self: PairwiseLD, locus_variants: list[str]) -> PairwiseLD: + """Subset pairwise LD table with locus. + + Args: + locus_variants (list[str]): List of variants found in the locus. + + Returns: + PairwiseLD: Pairwise LD table restricted to rows where both variants are in the locus. + """ + return PairwiseLD( + _df=( + self.df.filter( + f.col("variantIdI").isin(locus_variants) + & f.col("variantIdJ").isin(locus_variants) + ) + ), + _schema=PairwiseLD.get_schema(), + ) + + def r_to_numpy_matrix(self) -> np.ndarray: + """Convert pairwise LD to a numpy square matrix. + + Returns: + np.ndarray: 2D square matrix with r values. 
+ """ + return np.array( + self.df.select( + f.split("variantIdI", "_")[1].cast(t.IntegerType()).alias("position_i"), + f.split("variantIdJ", "_")[1].cast(t.IntegerType()).alias("position_j"), + "r", + ) + .orderBy(f.col("position_i").asc(), f.col("position_j").asc()) + .select("r") + .collect() + ).reshape(self.dimension) + + def get_variant_list(self) -> list[str]: + """Return a list of unique variants from the dataset. + + Returns: + list[str]: list of variant identifiers sorted by position. + """ + return [ + row["variantId"] + for row in ( + self.df.select( + f.col("variantIdI").alias("variantId"), + f.split(f.col("variantIdI"), "_")[1] + .cast(t.IntegerType()) + .alias("position"), + ) + .orderBy(f.col("position").asc()) + .collect() + ) + ] diff --git a/src/gentropy/dataset/study_locus.py b/src/gentropy/dataset/study_locus.py index 4ff9f0172..77c663800 100644 --- a/src/gentropy/dataset/study_locus.py +++ b/src/gentropy/dataset/study_locus.py @@ -1,4 +1,5 @@ """Study locus dataset.""" + from __future__ import annotations from dataclasses import dataclass @@ -24,6 +25,7 @@ from gentropy.dataset.ld_index import LDIndex from gentropy.dataset.study_index import StudyIndex + from gentropy.dataset.summary_statistics import SummaryStatistics class StudyLocusQualityCheck(Enum): @@ -80,31 +82,52 @@ class StudyLocus(Dataset): """ @staticmethod - def _overlapping_peaks(credset_to_overlap: DataFrame) -> DataFrame: + def _overlapping_peaks( + credset_to_overlap: DataFrame, intra_study_overlap: bool = False + ) -> DataFrame: """Calculate overlapping signals (study-locus) between GWAS-GWAS and GWAS-Molecular trait. Args: credset_to_overlap (DataFrame): DataFrame containing at least `studyLocusId`, `studyType`, `chromosome` and `tagVariantId` columns. + intra_study_overlap (bool): When True, finds intra-study overlaps for credible set deduplication. Default is False. Returns: DataFrame: containing `leftStudyLocusId`, `rightStudyLocusId` and `chromosome` columns. 
""" # Reduce columns to the minimum to reduce the size of the dataframe credset_to_overlap = credset_to_overlap.select( - "studyLocusId", "studyType", "chromosome", "tagVariantId" + "studyLocusId", + "studyId", + "studyType", + "chromosome", + "region", + "tagVariantId", ) + # Define join condition - if intra_study_overlap is True, finds overlaps within the same study. Otherwise finds gwas vs everything overlaps for coloc. + join_condition = ( + [ + f.col("left.studyId") == f.col("right.studyId"), + f.col("left.chromosome") == f.col("right.chromosome"), + f.col("left.tagVariantId") == f.col("right.tagVariantId"), + f.col("left.studyLocusId") > f.col("right.studyLocusId"), + f.col("left.region") != f.col("right.region"), + ] + if intra_study_overlap + else [ + f.col("left.chromosome") == f.col("right.chromosome"), + f.col("left.tagVariantId") == f.col("right.tagVariantId"), + (f.col("right.studyType") != "gwas") + | (f.col("left.studyLocusId") > f.col("right.studyLocusId")), + f.col("left.studyType") == f.lit("gwas"), + ] + ) + return ( credset_to_overlap.alias("left") - .filter(f.col("studyType") == "gwas") - # Self join with complex condition. Left it's all gwas and right can be gwas or molecular trait + # Self join with complex condition. .join( credset_to_overlap.alias("right"), - on=[ - f.col("left.chromosome") == f.col("right.chromosome"), - f.col("left.tagVariantId") == f.col("right.tagVariantId"), - (f.col("right.studyType") != "gwas") - | (f.col("left.studyLocusId") > f.col("right.studyLocusId")), - ], + on=join_condition, how="inner", ) .select( @@ -303,7 +326,9 @@ def filter_credible_set( ) return self - def find_overlaps(self: StudyLocus, study_index: StudyIndex) -> StudyLocusOverlap: + def find_overlaps( + self: StudyLocus, study_index: StudyIndex, intra_study_overlap: bool = False + ) -> StudyLocusOverlap: """Calculate overlapping study-locus. Find overlapping study-locus that share at least one tagging variant. 
All GWAS-GWAS and all GWAS-Molecular traits are computed with the Molecular traits always @@ -311,6 +336,7 @@ def find_overlaps(self: StudyLocus, study_index: StudyIndex) -> StudyLocusOverla Args: study_index (StudyIndex): Study index to resolve study types. + intra_study_overlap (bool): If True, finds intra-study overlaps for credible set deduplication. Default is False. Returns: StudyLocusOverlap: Pairs of overlapping study-locus with aligned tags. @@ -320,8 +346,10 @@ def find_overlaps(self: StudyLocus, study_index: StudyIndex) -> StudyLocusOverla .withColumn("locus", f.explode("locus")) .select( "studyLocusId", + "studyId", "studyType", "chromosome", + "region", f.col("locus.variantId").alias("tagVariantId"), f.col("locus.logBF").alias("logBF"), f.col("locus.posteriorProbability").alias("posteriorProbability"), @@ -333,7 +361,7 @@ def find_overlaps(self: StudyLocus, study_index: StudyIndex) -> StudyLocusOverla ) # overlapping study-locus - peak_overlaps = self._overlapping_peaks(loci_to_overlap) + peak_overlaps = self._overlapping_peaks(loci_to_overlap, intra_study_overlap) # study-locus overlap by aligning overlapping variants return self._align_overlapping_tags(loci_to_overlap, peak_overlaps) @@ -427,6 +455,74 @@ def annotate_credible_sets(self: StudyLocus) -> StudyLocus: ) return self + def annotate_locus_statistics( + self: StudyLocus, + summary_statistics: SummaryStatistics, + collect_locus_distance: int, + ) -> StudyLocus: + """Annotates study locus with summary statistics in the specified distance around the position. + + Args: + summary_statistics (SummaryStatistics): Summary statistics to be used for annotation. + collect_locus_distance (int): distance from variant defining window for inclusion of variants in locus. + + Returns: + StudyLocus: Study locus annotated with summary statistics in `locus` column. If no statistics are found, the `locus` column will be empty. 
+ """ + # The clumps will be used several times (persisting) + self.df.persist() + # Renaming columns: + sumstats_renamed = summary_statistics.df.selectExpr( + *[f"{col} as tag_{col}" for col in summary_statistics.df.columns] + ).alias("sumstat") + + locus_df = ( + sumstats_renamed + # Joining the two datasets together: + .join( + f.broadcast( + self.df.alias("clumped").select( + "position", "chromosome", "studyId", "studyLocusId" + ) + ), + on=[ + (f.col("sumstat.tag_studyId") == f.col("clumped.studyId")) + & (f.col("sumstat.tag_chromosome") == f.col("clumped.chromosome")) + & ( + f.col("sumstat.tag_position") + >= (f.col("clumped.position") - collect_locus_distance) + ) + & ( + f.col("sumstat.tag_position") + <= (f.col("clumped.position") + collect_locus_distance) + ) + ], + how="inner", + ) + .withColumn( + "locus", + f.struct( + f.col("tag_variantId").alias("variantId"), + f.col("tag_beta").alias("beta"), + f.col("tag_pValueMantissa").alias("pValueMantissa"), + f.col("tag_pValueExponent").alias("pValueExponent"), + f.col("tag_standardError").alias("standardError"), + ), + ) + .groupBy("studyLocusId") + .agg( + f.collect_list(f.col("locus")).alias("locus"), + ) + ) + + self.df = self.df.drop("locus").join( + locus_df, + on="studyLocusId", + how="left", + ) + + return self + def annotate_ld( self: StudyLocus, study_index: StudyIndex, ld_index: LDIndex ) -> StudyLocus: diff --git a/src/gentropy/dataset/summary_statistics.py b/src/gentropy/dataset/summary_statistics.py index 442672c58..6244d5879 100644 --- a/src/gentropy/dataset/summary_statistics.py +++ b/src/gentropy/dataset/summary_statistics.py @@ -9,7 +9,6 @@ from gentropy.common.schemas import parse_spark_schema from gentropy.common.utils import parse_region, split_pvalue from gentropy.dataset.dataset import Dataset -from gentropy.method.window_based_clumping import WindowBasedClumping if TYPE_CHECKING: from pyspark.sql.types import StructType @@ -59,34 +58,24 @@ def window_based_clumping( self: 
SummaryStatistics, distance: int = 500_000, gwas_significance: float = 5e-8, - baseline_significance: float = 0.05, - locus_collect_distance: int | None = None, ) -> StudyLocus: - """Generate study-locus from summary statistics by distance based clumping + collect locus. + """Generate study-locus from summary statistics using window-based clumping. + + For more info, see [`WindowBasedClumping`][gentropy.method.window_based_clumping.WindowBasedClumping] Args: distance (int): Distance in base pairs to be used for clumping. Defaults to 500_000. gwas_significance (float, optional): GWAS significance threshold. Defaults to 5e-8. - baseline_significance (float, optional): Baseline significance threshold for inclusion in the locus. Defaults to 0.05. - locus_collect_distance (int | None): The distance to collect locus around semi-indices. If not provided, locus is not collected. Returns: - StudyLocus: Clumped study-locus containing variants based on window. + StudyLocus: Clumped study-locus optionally containing variants based on window. """ - return ( - WindowBasedClumping.clump_with_locus( - self, - window_length=distance, - p_value_significance=gwas_significance, - p_value_baseline=baseline_significance, - locus_window_length=locus_collect_distance, - ) - if locus_collect_distance - else WindowBasedClumping.clump( - self, - window_length=distance, - p_value_significance=gwas_significance, - ) + from gentropy.method.window_based_clumping import WindowBasedClumping + + return WindowBasedClumping.clump( + self, + distance=distance, + gwas_significance=gwas_significance, ) def exclude_region(self: SummaryStatistics, region: str) -> SummaryStatistics: @@ -114,3 +103,29 @@ def exclude_region(self: SummaryStatistics, region: str) -> SummaryStatistics: ), _schema=SummaryStatistics.get_schema(), ) + + def sanity_filter(self: SummaryStatistics) -> SummaryStatistics: + """The function filters the summary statistics by sanity filters. 
+ + The function filters the summary statistics by the following filters: + - The p-value should not be eqaul 1. + - The beta and se should not be equal 0. + - The p-value, beta and se should not be NaN. + + Returns: + SummaryStatistics: The filtered summary statistics. + """ + gwas_df = self._df + gwas_df = gwas_df.dropna( + subset=["beta", "standardError", "pValueMantissa", "pValueExponent"] + ) + + gwas_df = gwas_df.filter((f.col("beta") != 0) & (f.col("standardError") != 0)) + gwas_df = gwas_df.filter( + f.col("pValueMantissa") * 10 ** f.col("pValueExponent") != 1 + ) + + return SummaryStatistics( + _df=gwas_df, + _schema=SummaryStatistics.get_schema(), + ) diff --git a/src/gentropy/datasource/finngen/summary_stats.py b/src/gentropy/datasource/finngen/summary_stats.py index 0d77f7d5c..08403bed5 100644 --- a/src/gentropy/datasource/finngen/summary_stats.py +++ b/src/gentropy/datasource/finngen/summary_stats.py @@ -50,7 +50,6 @@ def from_source( Returns: SummaryStatistics: Processed summary statistics dataset """ - study_id = raw_file.split("/")[-1].split(".")[0].upper() processed_summary_stats_df = ( spark.read.schema(cls.raw_schema) .option("delimiter", "\t") @@ -59,7 +58,11 @@ def from_source( .filter(f.col("pos").cast(t.IntegerType()).isNotNull()) .select( # From the full path, extracts just the filename, and converts to upper case to get the study ID. - f.lit(study_id).alias("studyId"), + f.upper( + f.regexp_extract( + f.input_file_name(), r"([^/]+)(\.tsv\.gz|\.gz|\.tsv)", 1 + ) + ).alias("studyId"), # Add variant information. 
f.concat_ws( "_", diff --git a/src/gentropy/datasource/gnomad/ld.py b/src/gentropy/datasource/gnomad/ld.py index 1d2d6c18f..471b87cae 100644 --- a/src/gentropy/datasource/gnomad/ld.py +++ b/src/gentropy/datasource/gnomad/ld.py @@ -1,4 +1,5 @@ """Step to import filtered version of a LD matrix (block matrix).""" + from __future__ import annotations import sys @@ -7,16 +8,17 @@ from typing import TYPE_CHECKING import hail as hl +import numpy as np import pyspark.sql.functions as f from hail.linalg import BlockMatrix from pyspark.sql import Window from gentropy.common.spark_helpers import get_top_ranked_in_window, get_value_from_row -from gentropy.common.utils import _liftover_loci, convert_gnomad_position_to_ensembl +from gentropy.common.utils import _liftover_loci from gentropy.dataset.ld_index import LDIndex if TYPE_CHECKING: - from pyspark.sql import DataFrame + from pyspark.sql import DataFrame, Row @dataclass @@ -34,6 +36,7 @@ class GnomADLDMatrix: ld_matrix_template: str = "gs://gcp-public-data--gnomad/release/2.1.1/ld/gnomad.genomes.r2.1.1.{POP}.common.adj.ld.bm" ld_index_raw_template: str = "gs://gcp-public-data--gnomad/release/2.1.1/ld/gnomad.genomes.r2.1.1.{POP}.common.ld.variant_indices.ht" + liftover_ht_path: str = "gs://gcp-public-data--gnomad/release/2.1.1/liftover_grch38/ht/genomes/gnomad.genomes.r2.1.1.sites.liftover_grch38.ht" grch37_to_grch38_chain_path: str = ( "gs://hail-common/references/grch37_to_grch38.over.chain.gz" ) @@ -175,24 +178,15 @@ def _process_variant_indices( ld_index_38.to_spark() # Filter out variants where the liftover failed .filter(f.col("`locus_GRCh38.position`").isNotNull()) - .withColumn( - "chromosome", f.regexp_replace("`locus_GRCh38.contig`", "chr", "") - ) - .withColumn( - "position", - convert_gnomad_position_to_ensembl( - f.col("`locus_GRCh38.position`"), - f.col("`alleles`").getItem(0), - f.col("`alleles`").getItem(1), - ), - ) .select( - "chromosome", - "position", + f.regexp_replace("`locus_GRCh38.contig`", "chr", 
"").alias( + "chromosome" + ), + f.col("`locus_GRCh38.position`").alias("position"), f.concat_ws( "_", - f.col("chromosome"), - f.col("position"), + f.regexp_replace("`locus_GRCh38.contig`", "chr", ""), + f.col("`locus_GRCh38.position`"), f.col("`alleles`").getItem(0), f.col("`alleles`").getItem(1), ).alias("variantId"), @@ -218,9 +212,9 @@ def _resolve_variant_indices( DataFrame: Dataframe with variant IDs instead of `i` and `j` indices """ ld_index_i = ld_index.selectExpr( - "idx as i", "variantId as variantId_i", "chromosome" + "idx as i", "variantId as variantIdI", "chromosome" ) - ld_index_j = ld_index.selectExpr("idx as j", "variantId as variantId_j") + ld_index_j = ld_index.selectExpr("idx as j", "variantId as variantIdJ") return ( ld_matrix.join(ld_index_i, on="i", how="inner") .join(ld_index_j, on="j", how="inner") @@ -238,35 +232,35 @@ def _transpose_ld_matrix(ld_matrix: DataFrame) -> DataFrame: DataFrame: Square LD matrix without diagonal duplicates Examples: - >>> df = spark.createDataFrame( - ... [ - ... (1, 1, 1.0, "1", "AFR"), - ... (1, 2, 0.5, "1", "AFR"), - ... (2, 2, 1.0, "1", "AFR"), - ... ], - ... ["variantId_i", "variantId_j", "r", "chromosome", "population"], - ... ) - >>> GnomADLDMatrix._transpose_ld_matrix(df).show() - +-----------+-----------+---+----------+----------+ - |variantId_i|variantId_j| r|chromosome|population| - +-----------+-----------+---+----------+----------+ - | 1| 2|0.5| 1| AFR| - | 1| 1|1.0| 1| AFR| - | 2| 1|0.5| 1| AFR| - | 2| 2|1.0| 1| AFR| - +-----------+-----------+---+----------+----------+ - + >>> df = spark.createDataFrame( + ... [ + ... (1, 1, 1.0, "1", "AFR"), + ... (1, 2, 0.5, "1", "AFR"), + ... (2, 2, 1.0, "1", "AFR"), + ... ], + ... ["variantIdI", "variantIdJ", "r", "chromosome", "population"], + ... 
) + >>> GnomADLDMatrix._transpose_ld_matrix(df).show() + +----------+----------+---+----------+----------+ + |variantIdI|variantIdJ| r|chromosome|population| + +----------+----------+---+----------+----------+ + | 1| 2|0.5| 1| AFR| + | 1| 1|1.0| 1| AFR| + | 2| 1|0.5| 1| AFR| + | 2| 2|1.0| 1| AFR| + +----------+----------+---+----------+----------+ + """ ld_matrix_transposed = ld_matrix.selectExpr( - "variantId_i as variantId_j", - "variantId_j as variantId_i", + "variantIdI as variantIdJ", + "variantIdJ as variantIdI", "r", "chromosome", "population", ) - return ld_matrix.filter( - f.col("variantId_i") != f.col("variantId_j") - ).unionByName(ld_matrix_transposed) + return ld_matrix.filter(f.col("variantIdI") != f.col("variantIdJ")).unionByName( + ld_matrix_transposed + ) def as_ld_index( self: GnomADLDMatrix, @@ -307,8 +301,8 @@ def as_ld_index( GnomADLDMatrix._transpose_ld_matrix( reduce(lambda df1, df2: df1.unionByName(df2), ld_indices_unaggregated) ) - .withColumnRenamed("variantId_i", "variantId") - .withColumnRenamed("variantId_j", "tagVariantId") + .withColumnRenamed("variantIdI", "variantId") + .withColumnRenamed("variantIdJ", "tagVariantId") ) return LDIndex( _df=self._aggregate_ld_index_across_populations(ld_index_unaggregated), @@ -345,7 +339,6 @@ def get_ld_variants( & (f.col("position") <= end) ) .select("chromosome", "position", "variantId", "idx") - .persist() ) if ld_index_df.limit(1).count() == 0: @@ -395,7 +388,7 @@ def _extract_square_matrix( .join( ld_index_df.select( f.col("idx").alias("idx_i"), - f.col("variantId").alias("variantId_i"), + f.col("variantId").alias("variantIdI"), ), on="idx_i", how="inner", @@ -403,12 +396,12 @@ def _extract_square_matrix( .join( ld_index_df.select( f.col("idx").alias("idx_j"), - f.col("variantId").alias("variantId_j"), + f.col("variantId").alias("variantIdJ"), ), on="idx_j", how="inner", ) - .select("variantId_i", "variantId_j", "r") + .select("variantIdI", "variantIdJ", "r") ) def get_ld_matrix_slice( @@ -448,3 
+441,73 @@ def get_ld_matrix_slice( .alias("r"), ) ) + + def get_locus_index( + self: GnomADLDMatrix, + study_locus_row: Row, + radius: int = 500_000, + major_population: str = "nfe", + ) -> DataFrame: + """Extract hail matrix index from StudyLocus rows. + + Args: + study_locus_row (Row): Study-locus row + radius (int): Locus radius to extract from gnomad matrix + major_population (str): Major population to extract from gnomad matrix, default is "nfe" + + Returns: + DataFrame: Returns the index of the gnomad matrix for the locus + + """ + chromosome = str("chr" + study_locus_row["chromosome"]) + start = study_locus_row["position"] - radius + end = study_locus_row["position"] + radius + + liftover_ht = hl.read_table(self.liftover_ht_path) + liftover_ht = ( + liftover_ht.filter( + (liftover_ht.locus.contig == chromosome) + & (liftover_ht.locus.position >= start) + & (liftover_ht.locus.position <= end) + ) + .key_by() + .select("locus", "alleles", "original_locus") + .key_by("original_locus", "alleles") + .naive_coalesce(20) + ) + + hail_index = hl.read_table( + self.ld_index_raw_template.format(POP=major_population) + ) + + joined_index = ( + liftover_ht.join(hail_index, how="inner").order_by("idx").to_spark() + ) + + return joined_index + + @staticmethod + def get_numpy_matrix( + locus_index: DataFrame, + gnomad_ancestry: str = "nfe", + ) -> np.ndarray: + """Extract the LD block matrix for a locus. + + Args: + locus_index (DataFrame): hail matrix variant index table + gnomad_ancestry (str): GnomAD major ancestry label eg. 
`nfe` + + Returns: + np.ndarray: LD block matrix for the locus + """ + idx = [row["idx"] for row in locus_index.select("idx").collect()] + + half_matrix = ( + BlockMatrix.read( + GnomADLDMatrix.ld_matrix_template.format(POP=gnomad_ancestry) + ) + .filter(idx, idx) + .to_numpy() + ) + + return (half_matrix + half_matrix.T) - np.diag(np.diag(half_matrix)) diff --git a/src/gentropy/datasource/gnomad/variants.py b/src/gentropy/datasource/gnomad/variants.py index 58f5f8093..b06b4ba6c 100644 --- a/src/gentropy/datasource/gnomad/variants.py +++ b/src/gentropy/datasource/gnomad/variants.py @@ -1,4 +1,5 @@ """Import gnomAD variants dataset.""" + from __future__ import annotations from dataclasses import dataclass, field @@ -9,7 +10,7 @@ from gentropy.dataset.variant_annotation import VariantAnnotation if TYPE_CHECKING: - from hail.expr.expressions import Int32Expression, StringExpression + pass @dataclass @@ -39,29 +40,6 @@ class GnomADVariants: ] ) - @staticmethod - def _convert_gnomad_position_to_ensembl_hail( - position: Int32Expression, - reference: StringExpression, - alternate: StringExpression, - ) -> Int32Expression: - """Convert GnomAD variant position to Ensembl variant position in hail table. - - For indels (the reference or alternate allele is longer than 1), then adding 1 to the position, for SNPs, the position is unchanged. - More info about the problem: https://www.biostars.org/p/84686/ - - Args: - position (Int32Expression): Position of the variant in the GnomAD genome. - reference (StringExpression): The reference allele. - alternate (StringExpression): The alternate allele - - Returns: - Int32Expression: The position of the variant according to Ensembl genome. - """ - return hl.if_else( - (reference.length() > 1) | (alternate.length() > 1), position + 1, position - ) - def as_variant_annotation(self: GnomADVariants) -> VariantAnnotation: """Generate variant annotation dataset from gnomAD. 
@@ -93,7 +71,7 @@ def as_variant_annotation(self: GnomADVariants) -> VariantAnnotation: return VariantAnnotation( _df=( ht.select( - gnomadVariantId=hl.str("-").join( + variantId=hl.str("_").join( [ ht.locus.contig.replace("chr", ""), hl.str(ht.locus.position), @@ -102,21 +80,7 @@ def as_variant_annotation(self: GnomADVariants) -> VariantAnnotation: ] ), chromosome=ht.locus.contig.replace("chr", ""), - position=GnomADVariants._convert_gnomad_position_to_ensembl_hail( - ht.locus.position, ht.alleles[0], ht.alleles[1] - ), - variantId=hl.str("_").join( - [ - ht.locus.contig.replace("chr", ""), - hl.str( - GnomADVariants._convert_gnomad_position_to_ensembl_hail( - ht.locus.position, ht.alleles[0], ht.alleles[1] - ) - ), - ht.alleles[0], - ht.alleles[1], - ] - ), + position=ht.locus.position, chromosomeB37=ht.locus_GRCh37.contig.replace("chr", ""), positionB37=ht.locus_GRCh37.position, referenceAllele=ht.alleles[0], diff --git a/src/gentropy/datasource/gwas_catalog/associations.py b/src/gentropy/datasource/gwas_catalog/associations.py index 5ff499f2c..658facea7 100644 --- a/src/gentropy/datasource/gwas_catalog/associations.py +++ b/src/gentropy/datasource/gwas_catalog/associations.py @@ -1,4 +1,5 @@ """Study Locus for GWAS Catalog data source.""" + from __future__ import annotations import importlib.resources as pkg_resources @@ -31,6 +32,40 @@ class GWASCatalogCuratedAssociationsParser: """GWAS Catalog curated associations parser.""" + @staticmethod + def convert_gnomad_position_to_ensembl( + position: Column, reference: Column, alternate: Column + ) -> Column: + """Convert GnomAD variant position to Ensembl variant position. + + For indels (the reference or alternate allele is longer than 1), then adding 1 to the position, for SNPs, + the position is unchanged. More info about the problem: https://www.biostars.org/p/84686/ + + Args: + position (Column): Position of the variant in GnomAD's coordinates system. 
+ reference (Column): The reference allele in GnomAD's coordinates system. + alternate (Column): The alternate allele in GnomAD's coordinates system. + + Returns: + Column: The position of the variant in the Ensembl genome. + + Examples: + >>> d = [(1, "A", "C"), (2, "AA", "C"), (3, "A", "AA")] + >>> df = spark.createDataFrame(d).toDF("position", "reference", "alternate") + >>> df.withColumn("new_position", GWASCatalogCuratedAssociationsParser.convert_gnomad_position_to_ensembl(f.col("position"), f.col("reference"), f.col("alternate"))).show() + +--------+---------+---------+------------+ + |position|reference|alternate|new_position| + +--------+---------+---------+------------+ + | 1| A| C| 1| + | 2| AA| C| 3| + | 3| A| AA| 4| + +--------+---------+---------+------------+ + + """ + return f.when( + (f.length(reference) > 1) | (f.length(alternate) > 1), position + 1 + ).otherwise(position) + @staticmethod def _parse_pvalue(pvalue: Column) -> tuple[Column, Column]: """Parse p-value column. 
@@ -178,7 +213,8 @@ def _map_to_variant_annotation_variants( gwas_associations_subset = gwas_associations.select( "studyLocusId", f.col("CHR_ID").alias("chromosome"), - f.col("CHR_POS").cast(IntegerType()).alias("position"), + # The positions from GWAS Catalog are from ensembl that causes discrepancy for indels: + f.col("CHR_POS").cast(IntegerType()).alias("ensemblPosition"), # List of all SNPs associated with the variant GWASCatalogCuratedAssociationsParser._collect_rsids( f.split(f.col("SNPS"), "; ").getItem(0), @@ -194,6 +230,11 @@ def _map_to_variant_annotation_variants( va_subset = variant_annotation.df.select( "variantId", "chromosome", + # Calculate the position in Ensembl coordinates for indels: + GWASCatalogCuratedAssociationsParser.convert_gnomad_position_to_ensembl( + f.col("position"), f.col("referenceAllele"), f.col("alternateAllele") + ).alias("ensemblPosition"), + # Keeping GnomAD position: "position", f.col("rsIds").alias("rsIdsGnomad"), "referenceAllele", @@ -202,9 +243,11 @@ def _map_to_variant_annotation_variants( variant_annotation.max_maf().alias("maxMaf"), ).join( f.broadcast( - gwas_associations_subset.select("chromosome", "position").distinct() + gwas_associations_subset.select( + "chromosome", "ensemblPosition" + ).distinct() ), - on=["chromosome", "position"], + on=["chromosome", "ensemblPosition"], how="inner", ) @@ -213,7 +256,7 @@ def _map_to_variant_annotation_variants( filtered_associations = ( gwas_associations_subset.join( f.broadcast(va_subset), - on=["chromosome", "position"], + on=["chromosome", "ensemblPosition"], how="left", ) .withColumn( diff --git a/src/gentropy/datasource/gwas_catalog/study_index.py b/src/gentropy/datasource/gwas_catalog/study_index.py index cb6d3338a..e8b49b0da 100644 --- a/src/gentropy/datasource/gwas_catalog/study_index.py +++ b/src/gentropy/datasource/gwas_catalog/study_index.py @@ -305,6 +305,7 @@ def _parse_study_table( parse_efos(f.col("MAPPED BACKGROUND TRAIT URI")).alias( 
"backgroundTraitFromSourceMappedIds" ), + cls.parse_cohorts(f.col("COHORT")).alias("cohorts"), ), _schema=StudyIndexGWASCatalog.get_schema(), ) @@ -548,14 +549,6 @@ def annotate_ancestries( ) # studyId has not been split yet ) - # Parsing cohort information: - cohorts = ancestry_lut.select( - f.col("STUDY ACCESSION").alias("studyId"), - GWASCatalogStudyIndexParser.parse_cohorts(f.col("COHORT(S)")).alias( - "cohorts" - ), - ).distinct() - # Get a high resolution dataset on experimental stage: ancestry_stages = ( ancestry.groupBy("studyId") @@ -644,9 +637,7 @@ def annotate_ancestries( ).select( "studyId", "discoverySamples", "ldPopulationStructure", "replicationSamples" ) - self.df = self.df.join(parsed_ancestry_lut, on="studyId", how="left").join( - cohorts, on="studyId", how="left" - ) + self.df = self.df.join(parsed_ancestry_lut, on="studyId", how="left") return self def annotate_sumstats_info( diff --git a/src/gentropy/datasource/intervals/thurman.py b/src/gentropy/datasource/intervals/thurman.py index 88ed5b6ad..a8113e5a6 100644 --- a/src/gentropy/datasource/intervals/thurman.py +++ b/src/gentropy/datasource/intervals/thurman.py @@ -41,7 +41,7 @@ def read(spark: SparkSession, path: str) -> DataFrame: t.StructField("score", t.FloatType(), False), ] ) - return spark.read.csv(path, sep="\t", header=True, schema=thurman_schema) + return spark.read.csv(path, sep="\t", header=False, schema=thurman_schema) @classmethod def parse( diff --git a/src/gentropy/l2g.py b/src/gentropy/l2g.py index d00a91596..d0acd2b95 100644 --- a/src/gentropy/l2g.py +++ b/src/gentropy/l2g.py @@ -73,7 +73,9 @@ def __init__( session, study_index_path, recursiveFileLookup=True ) v2g = V2G.from_parquet(session, variant_gene_path) - coloc = Colocalisation.from_parquet(session, colocalisation_path) + coloc = Colocalisation.from_parquet( + session, colocalisation_path, recursiveFileLookup=True + ) if run_mode == "predict": if not model_path or not predictions_path: @@ -91,7 +93,7 @@ def __init__( 
and gene_interactions_path ): # Process gold standard and L2G features - gs_curation = session.spark.read.json(gold_standard_curation_path).persist() + gs_curation = session.spark.read.json(gold_standard_curation_path) interactions = session.spark.read.parquet(gene_interactions_path) study_locus_overlap = StudyLocus( # We just extract overlaps of associations in the gold standard. This parsing is a duplication of the one in the gold standard curation, @@ -126,23 +128,27 @@ def __init__( fm = L2GFeatureMatrix.generate_features( features_list=features_list, - study_locus=credible_set, + credible_set=credible_set, study_index=studies, variant_gene=v2g, colocalisation=coloc, ) - # Join and fill null values with 0 - data = L2GFeatureMatrix( - _df=fm.df.join( - f.broadcast( - gold_standards.df.drop("variantId", "studyId", "sources") + data = ( + # Annotate gold standards with features + L2GFeatureMatrix( + _df=fm.df.join( + f.broadcast( + gold_standards.df.drop("variantId", "studyId", "sources") + ), + on=["studyLocusId", "geneId"], + how="inner", ), - on=["studyLocusId", "geneId"], - how="inner", - ), - _schema=L2GFeatureMatrix.get_schema(), - ).fill_na() + _schema=L2GFeatureMatrix.get_schema(), + ) + .fill_na() + .select_features(list(features_list)) + ) # Instantiate classifier estimator = SparkXGBClassifier( @@ -165,9 +171,8 @@ def __init__( else: # Train model LocusToGeneTrainer.train( - data=data, + gold_standard_data=data, l2g_model=l2g_model, - features_list=list(features_list), model_path=model_path, evaluate=True, wandb_run_name=wandb_run_name, diff --git a/src/gentropy/method/carma.py b/src/gentropy/method/carma.py index 75cb32c79..af8816706 100644 --- a/src/gentropy/method/carma.py +++ b/src/gentropy/method/carma.py @@ -2,6 +2,7 @@ from __future__ import annotations import concurrent.futures +import warnings from itertools import combinations from math import floor, lgamma from typing import Any @@ -32,6 +33,8 @@ def time_limited_CARMA_spike_slab_noEM( - 
B_list: A dataframe containing the marginal likelihoods and the corresponding model space or None. - Outliers: A list of outlier SNPs or None. """ + # Ignore pandas future warnings + warnings.simplefilter(action="ignore", category=FutureWarning) try: # Execute CARMA.CARMA_spike_slab_noEM with a timeout with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor: @@ -854,9 +857,19 @@ def _MCS_modified( # noqa: C901 sec_sample = np.random.choice( range(0, 3), 1, p=np.exp(aa) / np.sum(np.exp(aa)) ) - S = set_gamma[sec_sample[0]][ - int(set_star["gamma_set_index"][sec_sample[0]]) - ].tolist() + if set_gamma[sec_sample[0]] is not None: + S = set_gamma[sec_sample[0]][ + int(set_star["gamma_set_index"][sec_sample[0]]) + ].tolist() + else: + sec_sample = np.random.choice( + range(1, 3), + 1, + p=np.exp(aa)[[1, 2]] / np.sum(np.exp(aa)[[1, 2]]), + ) + S = set_gamma[sec_sample[0]][ + int(set_star["gamma_set_index"][sec_sample[0]]) + ].tolist() for item in conditional_S: if item not in S: diff --git a/src/gentropy/method/colocalisation.py b/src/gentropy/method/colocalisation.py index 3e4e91c74..18d97fdf8 100644 --- a/src/gentropy/method/colocalisation.py +++ b/src/gentropy/method/colocalisation.py @@ -26,6 +26,9 @@ class ECaviar: It extends [CAVIAR](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5142122/#bib18) framework to explicitly estimate the posterior probability that the same variant is causal in 2 studies while accounting for the uncertainty of LD. eCAVIAR computes the colocalization posterior probability (**CLPP**) by utilizing the marginal posterior probabilities. This framework allows for **multiple variants to be causal** in a single locus. """ + METHOD_NAME: str = "eCAVIAR" + METHOD_METRIC: str = "clpp" + @staticmethod def _get_clpp(left_pp: Column, right_pp: Column) -> Column: """Calculate the colocalisation posterior probability (CLPP). 
@@ -81,7 +84,7 @@ def colocalise( f.count("*").alias("numberColocalisingVariants"), f.sum(f.col("clpp")).alias("clpp"), ) - .withColumn("colocalisationMethod", f.lit("eCAVIAR")) + .withColumn("colocalisationMethod", f.lit(cls.METHOD_NAME)) ), _schema=Colocalisation.get_schema(), ) @@ -108,6 +111,8 @@ class Coloc: PSEUDOCOUNT (float): Pseudocount to avoid log(0). Defaults to 1e-10. """ + METHOD_NAME: str = "COLOC" + METHOD_METRIC: str = "llr" PSEUDOCOUNT: float = 1e-10 @staticmethod @@ -154,24 +159,24 @@ def colocalise( posteriors = f.udf(Coloc._get_posteriors, VectorUDT()) return Colocalisation( _df=( - overlapping_signals.df + overlapping_signals.df.select("*", "statistics.*") # Before summing log_BF columns nulls need to be filled with 0: - .fillna(0, subset=["statistics.left_logBF", "statistics.right_logBF"]) + .fillna(0, subset=["left_logBF", "right_logBF"]) # Sum of log_BFs for each pair of signals .withColumn( "sum_log_bf", - f.col("statistics.left_logBF") + f.col("statistics.right_logBF"), + f.col("left_logBF") + f.col("right_logBF"), ) # Group by overlapping peak and generating dense vectors of log_BF: .groupBy("chromosome", "leftStudyLocusId", "rightStudyLocusId") .agg( f.count("*").alias("numberColocalisingVariants"), - fml.array_to_vector( - f.collect_list(f.col("statistics.left_logBF")) - ).alias("left_logBF"), - fml.array_to_vector( - f.collect_list(f.col("statistics.right_logBF")) - ).alias("right_logBF"), + fml.array_to_vector(f.collect_list(f.col("left_logBF"))).alias( + "left_logBF" + ), + fml.array_to_vector(f.collect_list(f.col("right_logBF"))).alias( + "right_logBF" + ), fml.array_to_vector(f.collect_list(f.col("sum_log_bf"))).alias( "sum_log_bf" ), @@ -253,7 +258,7 @@ def colocalise( "lH3bf", "lH4bf", ) - .withColumn("colocalisationMethod", f.lit("COLOC")) + .withColumn("colocalisationMethod", f.lit(cls.METHOD_NAME)) ), _schema=Colocalisation.get_schema(), ) diff --git a/src/gentropy/method/l2g/feature_factory.py 
b/src/gentropy/method/l2g/feature_factory.py index a3dba6688..f037b57b4 100644 --- a/src/gentropy/method/l2g/feature_factory.py +++ b/src/gentropy/method/l2g/feature_factory.py @@ -2,6 +2,7 @@ from __future__ import annotations from functools import reduce +from itertools import chain from typing import TYPE_CHECKING import pyspark.sql.functions as f @@ -12,6 +13,7 @@ ) from gentropy.dataset.l2g_feature import L2GFeature from gentropy.dataset.study_locus import CredibleInterval, StudyLocus +from gentropy.method.colocalisation import Coloc, ECaviar if TYPE_CHECKING: from pyspark.sql import Column, DataFrame @@ -24,197 +26,136 @@ class ColocalisationFactory: """Feature extraction in colocalisation.""" + @classmethod + def _add_colocalisation_metric(cls: type[ColocalisationFactory]) -> Column: + """Expression that adds a `colocalisationMetric` column to the colocalisation dataframe in preparation for feature extraction. + + Returns: + Column: The expression that adds a `colocalisationMetric` column with the derived metric + """ + method_metric_map = { + ECaviar.METHOD_NAME: ECaviar.METHOD_METRIC, + Coloc.METHOD_NAME: Coloc.METHOD_METRIC, + } + map_expr = f.create_map(*[f.lit(x) for x in chain(*method_metric_map.items())]) + return map_expr[f.col("colocalisationMethod")].alias("colocalisationMetric") + @staticmethod - def _get_max_coloc_per_study_locus( - study_locus: StudyLocus, - studies: StudyIndex, + def _get_max_coloc_per_credible_set( colocalisation: Colocalisation, - colocalisation_method: str, + credible_set: StudyLocus, + studies: StudyIndex, ) -> L2GFeature: """Get the maximum colocalisation posterior probability for each pair of overlapping study-locus per type of colocalisation method and QTL type. 
Args: - study_locus (StudyLocus): Study locus dataset - studies (StudyIndex): Study index dataset colocalisation (Colocalisation): Colocalisation dataset - colocalisation_method (str): Colocalisation method to extract the max from + credible_set (StudyLocus): Study locus dataset + studies (StudyIndex): Study index dataset Returns: L2GFeature: Stores the features with the max coloc probabilities for each pair of study-locus - - Raises: - ValueError: If the colocalisation method is not supported """ - if colocalisation_method not in ["COLOC", "eCAVIAR"]: - raise ValueError( - f"Colocalisation method {colocalisation_method} not supported" - ) - if colocalisation_method == "COLOC": - coloc_score_col_name = "log2h4h3" - coloc_feature_col_template = "ColocLlrMaximum" - - elif colocalisation_method == "eCAVIAR": - coloc_score_col_name = "clpp" - coloc_feature_col_template = "ColocClppMaximum" + colocalisation_df = colocalisation.df.select( + f.col("leftStudyLocusId").alias("studyLocusId"), + "rightStudyLocusId", + f.coalesce("log2h4h3", "clpp").alias("score"), + ColocalisationFactory._add_colocalisation_metric(), + ) - colocalising_study_locus = ( - study_locus.df.select("studyLocusId", "studyId") + colocalising_credible_sets = ( + credible_set.df.select("studyLocusId", "studyId") # annotate studyLoci with overlapping IDs on the left - to just keep GWAS associations .join( - colocalisation.df.selectExpr( - "leftStudyLocusId as studyLocusId", - "rightStudyLocusId", - "colocalisationMethod", - f"{coloc_score_col_name} as coloc_score", - ), + colocalisation_df, on="studyLocusId", how="inner", ) # bring study metadata to just keep QTL studies on the right .join( - study_locus.df.selectExpr( - "studyLocusId as rightStudyLocusId", "studyId as right_studyId" + credible_set.df.join( + studies.df.select("studyId", "studyType", "geneId"), "studyId" + ).selectExpr( + "studyLocusId as rightStudyLocusId", + "studyType as right_studyType", + "geneId", ), on="rightStudyLocusId", 
how="inner", ) - .join( - f.broadcast( - studies.df.selectExpr( - "studyId as right_studyId", - "studyType as right_studyType", - "geneId", - ) - ), - on="right_studyId", - how="inner", - ) - .filter( - (f.col("colocalisationMethod") == colocalisation_method) - & (f.col("right_studyType") != "gwas") + .filter(f.col("right_studyType") != "gwas") + .select( + "studyLocusId", + "right_studyType", + "geneId", + "score", + "colocalisationMetric", ) - .select("studyLocusId", "right_studyType", "geneId", "coloc_score") ) - # Max PP calculation per studyLocus AND type of QTL - local_max = get_record_with_maximum_value( - colocalising_study_locus, - ["studyLocusId", "right_studyType", "geneId"], - "coloc_score", - ).persist() - - intercept = 0.0001 - neighbourhood_max = ( - ( - local_max.selectExpr( - "studyLocusId", "coloc_score as coloc_local_max", "geneId" - ) - .join( - # Add maximum in the neighborhood - get_record_with_maximum_value( - colocalising_study_locus.withColumnRenamed( - "coloc_score", "coloc_neighborhood_max" - ), - ["studyLocusId", "right_studyType"], - "coloc_neighborhood_max", - ).drop("geneId"), - on="studyLocusId", - ) - .withColumn( - f"{coloc_feature_col_template}Neighborhood", - f.log10( - f.abs( - f.col("coloc_local_max") - - f.col("coloc_neighborhood_max") - + f.lit(intercept) - ) - ), - ) + # Max PP calculation per credible set AND type of QTL AND colocalisation method + local_max = ( + get_record_with_maximum_value( + colocalising_credible_sets, + ["studyLocusId", "right_studyType", "geneId", "colocalisationMetric"], + "score", ) - .drop("coloc_neighborhood_max", "coloc_local_max") - .persist() + .select( + "*", + f.col("score").alias("max_score"), + f.lit("Local").alias("score_type"), + ) + .drop("score") ) - # Split feature per molQTL - local_dfs = [] - nbh_dfs = [] - qtl_types: list[str] = ( - colocalising_study_locus.select("right_studyType") - .distinct() - .toPandas()["right_studyType"] - .tolist() - ) or ["eqtl", "pqtl", "sqtl"] - for 
qtl_type in qtl_types: - filtered_local_max = ( - local_max.filter(f.col("right_studyType") == qtl_type) - .withColumnRenamed( - "coloc_score", - f"{qtl_type}{coloc_feature_col_template}", - ) - .drop("right_studyType") + neighbourhood_max = ( + local_max.selectExpr( + "studyLocusId", "max_score as local_max_score", "geneId" ) - local_dfs.append(filtered_local_max) - - filtered_neighbourhood_max = ( - neighbourhood_max.filter(f.col("right_studyType") == qtl_type) - .withColumnRenamed( - f"{coloc_feature_col_template}Neighborhood", - f"{qtl_type}{coloc_feature_col_template}Neighborhood", - ) - .drop("right_studyType") + .join( + # Add maximum in the neighborhood + get_record_with_maximum_value( + colocalising_credible_sets.withColumnRenamed( + "score", "tmp_nbh_max_score" + ), + ["studyLocusId", "right_studyType", "colocalisationMetric"], + "tmp_nbh_max_score", + ).drop("geneId"), + on="studyLocusId", ) - nbh_dfs.append(filtered_neighbourhood_max) - - wide_dfs = reduce( - lambda x, y: x.unionByName(y, allowMissingColumns=True), - local_dfs + nbh_dfs, - ) - - return L2GFeature( - _df=convert_from_wide_to_long( - wide_dfs.groupBy("studyLocusId", "geneId").agg( - *( - f.first(f.col(c), ignorenulls=True).alias(c) - for c in wide_dfs.columns - if c - not in [ - "studyLocusId", - "geneId", - ] + .withColumn("score_type", f.lit("Neighborhood")) + .withColumn( + "max_score", + f.log10( + f.abs( + f.col("local_max_score") + - f.col("tmp_nbh_max_score") + + f.lit(0.0001) # intercept ) ), - id_vars=("studyLocusId", "geneId"), - var_name="featureName", - value_name="featureValue", - ), - _schema=L2GFeature.get_schema(), - ) - - @staticmethod - def _get_coloc_features( - study_locus: StudyLocus, studies: StudyIndex, colocalisation: Colocalisation - ) -> L2GFeature: - """Calls _get_max_coloc_per_study_locus for both methods and concatenates the results. - - !!! note "Colocalisation features are only available for the eCAVIAR results for now." 
- - Args: - study_locus (StudyLocus): Study locus dataset - studies (StudyIndex): Study index dataset - colocalisation (Colocalisation): Colocalisation dataset - - Returns: - L2GFeature: Stores the features with the max coloc probabilities for each pair of study-locus - """ - coloc_clpp = ColocalisationFactory._get_max_coloc_per_study_locus( - study_locus, - studies, - colocalisation, - "eCAVIAR", - ) + ) + ).drop("tmp_nbh_max_score", "local_max_score") return L2GFeature( - _df=coloc_clpp.df, + _df=( + # Combine local and neighborhood metrics + local_max.unionByName( + neighbourhood_max, allowMissingColumns=True + ).select( + "studyLocusId", + "geneId", + # Feature name is a concatenation of the QTL type, colocalisation metric and if it's local or in the vicinity + f.concat_ws( + "", + f.col("right_studyType"), + f.lit("Coloc"), + f.initcap(f.col("colocalisationMetric")), + f.lit("Maximum"), + f.regexp_replace(f.col("score_type"), "Local", ""), + ).alias("featureName"), + f.col("max_score").cast("float").alias("featureValue"), + ) + ), _schema=L2GFeature.get_schema(), ) @@ -223,37 +164,43 @@ class StudyLocusFactory(StudyLocus): """Feature extraction in study locus.""" @staticmethod - def _get_tss_distance_features( - study_locus: StudyLocus, distances: V2G - ) -> L2GFeature: - """Joins StudyLocus with the V2G to extract the minimum distance to a gene TSS of all variants in a StudyLocus credible set. + def _get_tss_distance_features(credible_set: StudyLocus, v2g: V2G) -> L2GFeature: + """Joins StudyLocus with the V2G to extract a score that is based on the distance to a gene TSS of any variant weighted by its posterior probability in a credible set. 
Args: - study_locus (StudyLocus): Study locus dataset - distances (V2G): Dataframe containing the distances of all variants to all genes TSS within a region + credible_set (StudyLocus): Credible set dataset + v2g (V2G): Dataframe containing the distances of all variants to all genes TSS within a region Returns: - L2GFeature: Stores the features with the minimum distance among all variants in the credible set and a gene TSS. + L2GFeature: Stores the features with the score of weighting the distance to the TSS by the posterior probability of the variant """ wide_df = ( - study_locus.filter_credible_set(CredibleInterval.IS95) - .df.select( + credible_set.filter_credible_set(CredibleInterval.IS95) + .df.withColumn("variantInLocus", f.explode_outer("locus")) + .select( "studyLocusId", "variantId", - f.explode("locus.variantId").alias("tagVariantId"), + f.col("variantInLocus.variantId").alias("variantInLocusId"), + f.col("variantInLocus.posteriorProbability").alias( + "variantInLocusPosteriorProbability" + ), ) .join( - distances.df.selectExpr( - "variantId as tagVariantId", "geneId", "distance" + v2g.df.filter(f.col("datasourceId") == "canonical_tss").selectExpr( + "variantId as variantInLocusId", "geneId", "score" ), - on="tagVariantId", + on="variantInLocusId", how="inner", ) + .withColumn( + "weightedScore", + f.col("score") * f.col("variantInLocusPosteriorProbability"), + ) .groupBy("studyLocusId", "geneId") .agg( - f.min("distance").alias("distanceTssMinimum"), - f.mean("distance").alias("distanceTssMean"), + f.min("weightedScore").alias("distanceTssMinimum"), + f.mean("weightedScore").alias("distanceTssMean"), ) ) @@ -321,32 +268,33 @@ def _aggregate_vep_feature( credible_set_w_variant_consequences = ( credible_set.filter_credible_set(CredibleInterval.IS95) - .df.withColumn("variantInLocusId", f.explode(f.col("locus.variantId"))) - .withColumn( - "variantInLocusPosteriorProbability", - f.explode(f.col("locus.posteriorProbability")), + 
.df.withColumn("variantInLocus", f.explode_outer("locus")) + .select( + f.col("studyLocusId"), + f.col("variantId"), + f.col("studyId"), + f.col("variantInLocus.variantId").alias("variantInLocusId"), + f.col("variantInLocus.posteriorProbability").alias( + "variantInLocusPosteriorProbability" + ), ) .join( # Join with V2G to get variant consequences - v2g.df.filter( - f.col("datasourceId") == "variantConsequence" - ).withColumnRenamed("variantId", "variantInLocusId"), + v2g.df.filter(f.col("datasourceId") == "variantConsequence").selectExpr( + "variantId as variantInLocusId", "geneId", "score" + ), on="variantInLocusId", ) - .withColumn( - "weightedScore", - f.col("score") * f.col("variantInLocusPosteriorProbability"), - ) .select( "studyLocusId", "variantId", "studyId", "geneId", - "score", - "weightedScore", + (f.col("score") * f.col("variantInLocusPosteriorProbability")).alias( + "weightedScore" + ), ) .distinct() - .persist() ) return L2GFeature( @@ -357,14 +305,14 @@ def _aggregate_vep_feature( # Calculate overall max VEP score for all genes in the vicinity credible_set_w_variant_consequences.transform( _aggregate_vep_feature, - f.max("score"), + f.max("weightedScore"), ["studyLocusId"], "vepMaximumNeighborhood", ), # Calculate overall max VEP score per gene credible_set_w_variant_consequences.transform( _aggregate_vep_feature, - f.max("score"), + f.max("weightedScore"), ["studyLocusId", "geneId"], "vepMaximum", ), diff --git a/src/gentropy/method/l2g/model.py b/src/gentropy/method/l2g/model.py index 5df189d33..f8d892e07 100644 --- a/src/gentropy/method/l2g/model.py +++ b/src/gentropy/method/l2g/model.py @@ -207,7 +207,7 @@ def evaluate( results: DataFrame, hyperparameters: dict[str, Any], wandb_run_name: str | None, - training_data: L2GFeatureMatrix | None = None, + gold_standard_data: L2GFeatureMatrix | None = None, ) -> None: """Perform evaluation of the model predictions for the test set and track the results with W&B. 
@@ -215,7 +215,7 @@ def evaluate( results (DataFrame): Dataframe containing the predictions hyperparameters (dict[str, Any]): Hyperparameters used for the model wandb_run_name (str | None): Descriptive name for the run to be tracked with W&B - training_data (L2GFeatureMatrix | None): Training data used for the model. If provided, the ratio of positive to negative labels will be logged to W&B + gold_standard_data (L2GFeatureMatrix | None): Feature matrix for the associations in the gold standard. If provided, the ratio of positive to negative labels will be logged to W&B """ binary_evaluator = BinaryClassificationEvaluator( rawPredictionCol="rawPrediction", labelCol="label" @@ -224,7 +224,7 @@ def evaluate( labelCol="label", predictionCol="prediction" ) - if wandb_run_name and training_data: + if wandb_run_name and gold_standard_data: run = wandb_init( project=self.wandb_l2g_project_name, config=hyperparameters, @@ -232,7 +232,10 @@ def evaluate( ) if isinstance(run, Run): self.log_to_wandb( - results, training_data, [binary_evaluator, multi_evaluator], run + results, + gold_standard_data, + [binary_evaluator, multi_evaluator], + run, ) run.finish() diff --git a/src/gentropy/method/l2g/trainer.py b/src/gentropy/method/l2g/trainer.py index 8d2b3d7aa..1638a0417 100644 --- a/src/gentropy/method/l2g/trainer.py +++ b/src/gentropy/method/l2g/trainer.py @@ -22,9 +22,8 @@ class LocusToGeneTrainer: @classmethod def train( cls: type[LocusToGeneTrainer], - data: L2GFeatureMatrix, + gold_standard_data: L2GFeatureMatrix, l2g_model: LocusToGeneModel, - features_list: list[str], evaluate: bool, wandb_run_name: str | None = None, model_path: str | None = None, @@ -33,9 +32,8 @@ def train( """Train the Locus to Gene model. 
Args: - data (L2GFeatureMatrix): Feature matrix containing the data + gold_standard_data (L2GFeatureMatrix): Feature matrix for the associations in the gold standard l2g_model (LocusToGeneModel): Model to fit to the data on - features_list (list[str]): List of features to use for the model evaluate (bool): Whether to evaluate the model on a test set wandb_run_name (str | None): Descriptive name for the run to be tracked with W&B model_path (str | None): Path to save the model to @@ -44,7 +42,7 @@ def train( Returns: LocusToGeneModel: Trained model """ - train, test = data.select_features(features_list).train_test_split(fraction=0.8) + train, test = gold_standard_data.train_test_split(fraction=0.8) model = l2g_model.add_pipeline_stage(l2g_model.estimator).fit(train) @@ -53,7 +51,7 @@ def train( results=model.predict(test), hyperparameters=hyperparams, wandb_run_name=wandb_run_name, - training_data=train, + gold_standard_data=gold_standard_data, ) if model_path: l2g_model.save(model_path) diff --git a/src/gentropy/method/sumstat_imputation.py b/src/gentropy/method/sumstat_imputation.py new file mode 100644 index 000000000..8375fa84f --- /dev/null +++ b/src/gentropy/method/sumstat_imputation.py @@ -0,0 +1,172 @@ +"""RAISS summary statstics imputation model.""" + +from __future__ import annotations + +from typing import Any + +import numpy as np +import scipy.linalg + + +class SummaryStatisticsImputation: + """Implementation of RAISS summary statstics imputation model.""" + + @staticmethod + def raiss_model( + z_scores_known: np.ndarray, + ld_matrix_known: np.ndarray, + ld_matrix_known_missing: np.ndarray, + lamb: float = 0.01, + rtol: float = 0.01, + ) -> dict[str, Any]: + """Compute the imputation of the z-score using the RAISS model. 
+ + Args: + z_scores_known (np.ndarray): the vector of known Z scores + ld_matrix_known (np.ndarray) : the matrix of known LD correlations + ld_matrix_known_missing (np.ndarray): LD matrix of known SNPs with other unknown SNPs in large matrix (similar to ld[unknowns, :][:,known]) + lamb (float): size of the small value added to the diagonal of the covariance matrix before inversion. Defaults to 0.01. + rtol (float): threshold to filter eigenvectos by its eigenvalue. It makes an inversion biased but much more numerically robust. Default to 0.01. + + Returns: + dict[str, Any]: + - var (np.ndarray): variance of the imputed SNPs + - mu (np.ndarray): the estimation of the zscore of the imputed SNPs + - ld_score (np.ndarray): the linkage disequilibrium score of the imputed SNPs + - condition_number (np.ndarray): the condition number of the correlation matrix + - correct_inversion (np.ndarray): a boolean array indicating if the inversion was successful + - imputation_r2 (np.ndarray): the R2 of the imputation + """ + sig_t_inv = SummaryStatisticsImputation._invert_sig_t( + ld_matrix_known, lamb, rtol + ) + if sig_t_inv is None: + return { + "var": None, + "mu": None, + "ld_score": None, + "condition_number": None, + "correct_inversion": None, + "imputation_r2": None, + } + else: + condition_number = np.array( + [np.linalg.cond(ld_matrix_known)] * ld_matrix_known_missing.shape[0] + ) + correct_inversion = np.array( + [ + SummaryStatisticsImputation._check_inversion( + ld_matrix_known, sig_t_inv + ) + ] + * ld_matrix_known_missing.shape[0] + ) + + var, ld_score = SummaryStatisticsImputation._compute_var( + ld_matrix_known_missing, sig_t_inv, lamb + ) + + mu = SummaryStatisticsImputation._compute_mu( + ld_matrix_known_missing, sig_t_inv, z_scores_known + ) + var_norm = SummaryStatisticsImputation._var_in_boundaries(var, lamb) + + R2 = (1 + lamb) - var_norm + + mu = mu / np.sqrt(R2) + return { + "var": var, + "mu": mu, + "ld_score": ld_score, + "condition_number": 
condition_number, + "correct_inversion": correct_inversion, + "imputation_r2": 1 - var, + } + + @staticmethod + def _compute_mu( + sig_i_t: np.ndarray, sig_t_inv: np.ndarray, zt: np.ndarray + ) -> np.ndarray: + """Compute the estimation of z-score from neighborring snp. + + Args: + sig_i_t (np.ndarray) : correlation matrix with line corresponding to unknown Snp (snp to impute) and column to known SNPs + sig_t_inv (np.ndarray): inverse of the correlation matrix of known matrix + zt (np.ndarray): Zscores of known snp + Returns: + np.ndarray: a vector of length i containing the estimate of zscore + + """ + return np.dot(sig_i_t, np.dot(sig_t_inv, zt)) + + @staticmethod + def _compute_var( + sig_i_t: np.ndarray, sig_t_inv: np.ndarray, lamb: float + ) -> tuple[np.ndarray, np.ndarray]: + """Compute the expected variance of the imputed SNPs. + + Args: + sig_i_t (np.ndarray) : correlation matrix with line corresponding to unknown Snp (snp to impute) and column to known SNPs + sig_t_inv (np.ndarray): inverse of the correlation matrix of known matrix + lamb (float): regularization term added to matrix + + Returns: + tuple[np.ndarray, np.ndarray]: a tuple containing the variance and the ld score + """ + var = (1 + lamb) - np.einsum( + "ij,jk,ki->i", sig_i_t, sig_t_inv, sig_i_t.transpose() + ) + ld_score = (sig_i_t**2).sum(1) + + return var, ld_score + + @staticmethod + def _check_inversion(sig_t: np.ndarray, sig_t_inv: np.ndarray) -> bool: + """Check if the inversion is correct. + + Args: + sig_t (np.ndarray): the correlation matrix + sig_t_inv (np.ndarray): the inverse of the correlation matrix + Returns: + bool: True if the inversion is correct, False otherwise + """ + return np.allclose(sig_t, np.dot(sig_t, np.dot(sig_t_inv, sig_t))) + + @staticmethod + def _var_in_boundaries(var: np.ndarray, lamb: float) -> np.ndarray: + """Forces the variance to be in the 0 to 1+lambda boundary. Theoritically we shouldn't have to do that. 
+ + Args: + var (np.ndarray): the variance of the imputed SNPs + lamb (float): regularization term added to the diagonal of the sig_t matrix + + Returns: + np.ndarray: the variance of the imputed SNPs + """ + id_neg = np.where(var < 0) + var[id_neg] = 0 + id_inf = np.where(var > (0.99999 + lamb)) + var[id_inf] = 1 + + return var + + @staticmethod + def _invert_sig_t(sig_t: np.ndarray, lamb: float, rtol: float) -> np.ndarray: + """Invert the correlation matrix. If the provided regularization values are not enough to stabilize the inversion process for the given matrix, the function calls itself recursively, increasing lamb and rtol by 10%. + + Args: + sig_t (np.ndarray): the correlation matrix + lamb (float): regularization term added to the diagonal of the sig_t matrix + rtol (float): threshold to filter eigenvector with a eigenvalue under rtol make inversion biased but much more numerically robust + + Returns: + np.ndarray: the inverse of the correlation matrix + """ + try: + np.fill_diagonal(sig_t, (1 + lamb)) + sig_t_inv = scipy.linalg.pinv(sig_t, rtol=rtol, atol=0) + return sig_t_inv + except np.linalg.LinAlgError: + return SummaryStatisticsImputation._invert_sig_t( + sig_t, lamb * 1.1, rtol * 1.1 + ) diff --git a/src/gentropy/method/sumstat_quality_controls.py b/src/gentropy/method/sumstat_quality_controls.py new file mode 100644 index 000000000..2858f4813 --- /dev/null +++ b/src/gentropy/method/sumstat_quality_controls.py @@ -0,0 +1,285 @@ +"""Summary statistics qulity control methods.""" +from __future__ import annotations + +import numpy as np +import pyspark.sql.functions as f +import pyspark.sql.types as t +import scipy as sc +from pyspark.sql import DataFrame +from pyspark.sql.functions import expr, log10, row_number +from pyspark.sql.window import Window +from scipy.stats import chi2 + +from gentropy.dataset.summary_statistics import SummaryStatistics + + +class SummaryStatisticsQC: + """Summary statistics QC methods. 
+ + This module contains methods for quality control of GWAS summary statistics. + The list of methods includes: + + - sumstat_qc_beta_check: This is the mean beta check. The mean beta should be close to 0. + + - sumstat_qc_pz_check: This is the PZ check. It runs a linear regression between reported p-values and p-values inferred from z-scores. + + - sumstat_n_eff_check: This is the effective sample size check. It estimates the ratio between the effective sample size and the expected one and checks its distribution. + + - gc_lambda_check: This is the genomic control lambda check. + + - number_of_snps: This function calculates the number of SNPs and the number of SNPs with a p-value less than 5e-8. + """ + + @staticmethod + def sumstat_qc_beta_check( + gwas_for_qc: SummaryStatistics, + ) -> DataFrame: + """The mean beta check for QC of GWAS summary statstics. + + Args: + gwas_for_qc (SummaryStatistics): The instance of the SummaryStatistics class. + + Returns: + DataFrame: PySpark DataFrame with the mean beta for each study. + """ + gwas_df = gwas_for_qc._df + qc_c = gwas_df.groupBy("studyId").agg( + f.mean("beta").alias("mean_beta"), + ) + return qc_c + + @staticmethod + def _calculate_logpval(z2: float) -> float: + """Calculate negative log10-pval from Z-score. + + Args: + z2 (float): Z-score squared. + + Returns: + float: log10-pval. + + Examples: + >>> SummaryStatisticsQC._calculate_logpval(1.0) + 0.49851554582799334 + """ + logpval = -np.log10(sc.stats.chi2.sf((z2), 1)) + return float(logpval) + + @staticmethod + def sumstat_qc_pz_check( + gwas_for_qc: SummaryStatistics, + limit: int = 10_000_000, + ) -> DataFrame: + """The PZ check for QC of GWAS summary statstics. It runs linear regression between reported p-values and p-values infered from z-scores. + + Args: + gwas_for_qc (SummaryStatistics): The instance of the SummaryStatistics class. + limit (int): The limit for the number of variants to be used for the estimation. 
+ + Returns: + DataFrame: PySpark DataFrame with the results of the linear regression for each study. + """ + gwas_df = gwas_for_qc._df + + calculate_logpval_udf = f.udf( + SummaryStatisticsQC._calculate_logpval, t.DoubleType() + ) + + window = Window.partitionBy("studyId").orderBy("studyId") + + gwas_df = ( + gwas_df.withColumn("row_num", row_number().over(window)) + .filter(f.col("row_num") <= limit) + .drop("row_num") + ) + + qc_c = ( + gwas_df.withColumn("zscore", f.col("beta") / f.col("standardError")) + .withColumn("new_logpval", calculate_logpval_udf(f.col("zscore") ** 2)) + .withColumn("log_mantissa", log10("pValueMantissa")) + .withColumn( + "diffpval", + -f.col("log_mantissa") - f.col("pValueExponent") - f.col("new_logpval"), + ) + .groupBy("studyId") + .agg( + f.mean("diffpval").alias("mean_diff_pz"), + f.stddev("diffpval").alias("se_diff_pz"), + ) + .select("studyId", "mean_diff_pz", "se_diff_pz") + ) + + return qc_c + + @staticmethod + def sumstat_n_eff_check( + gwas_for_qc: SummaryStatistics, + n_total: int = 100_000, + limit: int = 10_000_000, + min_count: int = 100, + ) -> DataFrame: + """The effective sample size check for QC of GWAS summary statstics. + + It estiamtes the ratio between effective sample size and the expected one and checks it's distribution. + It is possible to conduct only if the effective allele frequency is provided in the study. + The median rartio is always close to 1, but standard error could be inflated. + + Args: + gwas_for_qc (SummaryStatistics): The instance of the SummaryStatistics class. + n_total (int): The reported sample size of the study. The QC metrics is robust toward the sample size. + limit (int): The limit for the number of variants to be used for the estimation. + min_count (int): The minimum number of variants to be used for the estimation. + + Returns: + DataFrame: PySpark DataFrame with the effective sample size ratio for each study. 
+ """ + gwas_df = gwas_for_qc._df + + gwas_df = gwas_df.dropna(subset=["effectAlleleFrequencyFromSource"]) + + counts_df = gwas_df.groupBy("studyId").count() + + # Join the original DataFrame with the counts DataFrame + df_with_counts = gwas_df.join(counts_df, on="studyId") + + # Filter the DataFrame to keep only the groups with count greater than or equal to min_count + filtered_df = df_with_counts.filter(f.col("count") >= min_count).drop("count") + + window = Window.partitionBy("studyId").orderBy("studyId") + gwas_df = ( + filtered_df.withColumn("row_num", row_number().over(window)) + .filter(f.col("row_num") <= limit) + .drop("row_num") + ) + + gwas_df = gwas_df.withColumn( + "var_af", + 2 + * ( + f.col("effectAlleleFrequencyFromSource") + * (1 - f.col("effectAlleleFrequencyFromSource")) + ), + ).withColumn( + "pheno_var", + ((f.col("standardError") ** 2) * n_total * f.col("var_af")) + + ((f.col("beta") ** 2) * f.col("var_af")), + ) + + window = Window.partitionBy("studyId").orderBy("studyId") + + # Calculate the median of 'pheno_var' for each 'studyId' and add it as a new column + gwas_df = gwas_df.withColumn( + "pheno_median", expr("percentile_approx(pheno_var, 0.5)").over(window) + ) + + gwas_df = gwas_df.withColumn( + "N_hat_ratio", + ( + (f.col("pheno_median") - ((f.col("beta") ** 2) * f.col("var_af"))) + / ((f.col("standardError") ** 2) * f.col("var_af") * n_total) + ), + ) + + qc_c = ( + gwas_df.groupBy("studyId") + .agg( + f.stddev("N_hat_ratio").alias("se_N"), + ) + .select("studyId", "se_N") + ) + + return qc_c + + @staticmethod + def gc_lambda_check( + gwas_for_qc: SummaryStatistics, + limit: int = 10_000_000, + ) -> DataFrame: + """The genomic control lambda check for QC of GWAS summary statstics. + + Args: + gwas_for_qc (SummaryStatistics): The instance of the SummaryStatistics class. + limit (int): The limit for the number of variants to be used for the estimation. 
+ + Returns: + DataFrame: PySpark DataFrame with the genomic control lambda for each study. + """ + gwas_df = gwas_for_qc._df + window = Window.partitionBy("studyId").orderBy("studyId") + gwas_df = ( + gwas_df.withColumn("row_num", row_number().over(window)) + .filter(f.col("row_num") <= limit) + .drop("row_num") + ) + + qc_c = ( + gwas_df.select("studyId", "beta", "standardError") + .withColumn("Z2", (f.col("beta") / f.col("standardError")) ** 2) + .groupBy("studyId") + .agg(f.expr("percentile_approx(Z2, 0.5)").alias("gc_lambda")) + .withColumn("gc_lambda", f.col("gc_lambda") / chi2.ppf(0.5, df=1)) + .select("studyId", "gc_lambda") + ) + + return qc_c + + @staticmethod + def number_of_snps( + gwas_for_qc: SummaryStatistics, pval_threhod: float = 5e-8 + ) -> DataFrame: + """The function caluates number of SNPs and number of SNPs with p-value less than 5e-8. + + Args: + gwas_for_qc (SummaryStatistics): The instance of the SummaryStatistics class. + pval_threhod (float): The threshold for the p-value. + + Returns: + DataFrame: PySpark DataFrame with the number of SNPs and number of SNPs with p-value less than threshold. + """ + gwas_df = gwas_for_qc._df + + snp_counts = gwas_df.groupBy("studyId").agg( + f.count("*").alias("n_variants"), + f.sum( + ( + f.log10(f.col("pValueMantissa")) + f.col("pValueExponent") + <= np.log10(pval_threhod) + ).cast("int") + ).alias("n_variants_sig"), + ) + + return snp_counts + + @staticmethod + def get_quality_control_metrics( + gwas: SummaryStatistics, + limit: int = 100_000_000, + min_count: int = 100_000, + n_total: int = 100_000, + ) -> DataFrame: + """The function calculates the quality control metrics for the summary statistics. + + Args: + gwas (SummaryStatistics): The instance of the SummaryStatistics class. + limit (int): The limit for the number of variants to be used for the estimation. + min_count (int): The minimum number of variants to be used for the estimation. + n_total (int): The total sample size. 
+ + Returns: + DataFrame: PySpark DataFrame with the quality control metrics for the summary statistics. + """ + qc1 = SummaryStatisticsQC.sumstat_qc_beta_check(gwas_for_qc=gwas) + qc2 = SummaryStatisticsQC.sumstat_qc_pz_check(gwas_for_qc=gwas, limit=limit) + qc3 = SummaryStatisticsQC.sumstat_n_eff_check( + gwas_for_qc=gwas, n_total=n_total, limit=limit, min_count=min_count + ) + qc4 = SummaryStatisticsQC.gc_lambda_check(gwas_for_qc=gwas, limit=limit) + qc5 = SummaryStatisticsQC.number_of_snps(gwas_for_qc=gwas) + df = ( + qc1.join(qc2, on="studyId", how="outer") + .join(qc3, on="studyId", how="outer") + .join(qc4, on="studyId", how="outer") + .join(qc5, on="studyId", how="outer") + ) + + return df diff --git a/src/gentropy/method/susie_inf.py b/src/gentropy/method/susie_inf.py index d493285a0..522f77193 100644 --- a/src/gentropy/method/susie_inf.py +++ b/src/gentropy/method/susie_inf.py @@ -34,7 +34,7 @@ def susie_inf( # noqa: C901 ssq_range: tuple[float, float] = (0, 1), pi0: np.ndarray | None = None, est_sigmasq: bool = True, - est_tausq: bool = True, + est_tausq: bool = False, sigmasq: float = 1, tausq: float = 0, sigmasq_range: tuple[float, float] | None = None, @@ -399,7 +399,7 @@ def g(x: float) -> float: def cred_inf( PIP: np.ndarray, n: int = 100_000, - coverage: float = 0.9, + coverage: float = 0.99, purity: float = 0.5, LD: np.ndarray | None = None, V: np.ndarray | None = None, diff --git a/src/gentropy/method/window_based_clumping.py b/src/gentropy/method/window_based_clumping.py index a2ae12419..57a24c559 100644 --- a/src/gentropy/method/window_based_clumping.py +++ b/src/gentropy/method/window_based_clumping.py @@ -151,22 +151,21 @@ def _prune_peak(position: NDArray[np.float64], window_size: int) -> DenseVector: return DenseVector(is_lead) - @classmethod + @staticmethod def clump( - cls: type[WindowBasedClumping], - summary_stats: SummaryStatistics, - window_length: int, - p_value_significance: float = 5e-8, + summary_statistics: SummaryStatistics, + 
distance: int = 500_000, + gwas_significance: float = 5e-8, ) -> StudyLocus: - """Clump summary statistics by distance. + """Clump significant signals from summary statistics based on window. Args: - summary_stats (SummaryStatistics): summary statistics to clump - window_length (int): window length in basepair - p_value_significance (float): only more significant variants are considered + summary_statistics (SummaryStatistics): Summary statistics to be used for clumping. + distance (int): Distance in base pairs to be used for clumping. Defaults to 500_000. + gwas_significance (float): GWAS significance threshold. Defaults to 5e-8. Returns: - StudyLocus: clumped summary statistics + StudyLocus: clumped summary statistics (without locus collection) """ # Create window for locus clusters # - variants where the distance between subsequent variants is below the defined threshold. @@ -177,9 +176,9 @@ def clump( return StudyLocus( _df=( - summary_stats + summary_statistics # Dropping snps below significance - all subsequent steps are done on significant variants: - .pvalue_filter(p_value_significance) + .pvalue_filter(gwas_significance) .df # Clustering summary variants for efficient windowing (complexity reduction): .withColumn( @@ -188,7 +187,7 @@ def clump( f.col("studyId"), f.col("chromosome"), f.col("position"), - window_length, + distance, ), ) # Within each cluster variants are ranked by significance: @@ -213,7 +212,7 @@ def clump( fml.vector_to_array( f.udf(WindowBasedClumping._prune_peak, VectorUDT())( fml.array_to_vector(f.col("collectedPositions")), - f.lit(window_length), + f.lit(distance), ) ), ), @@ -245,91 +244,3 @@ def clump( ), _schema=StudyLocus.get_schema(), ) - - @classmethod - def clump_with_locus( - cls: type[WindowBasedClumping], - summary_stats: SummaryStatistics, - window_length: int, - p_value_significance: float = 5e-8, - p_value_baseline: float = 0.05, - locus_window_length: int | None = None, - ) -> StudyLocus: - """Clump significant 
associations while collecting locus around them. - - Args: - summary_stats (SummaryStatistics): Input summary statistics dataset - window_length (int): Window size in bp, used for distance based clumping. - p_value_significance (float): GWAS significance threshold used to filter peaks. Defaults to 5e-8. - p_value_baseline (float): Least significant threshold. Below this, all snps are dropped. Defaults to 0.05. - locus_window_length (int | None): The distance for collecting locus around the semi indices. Defaults to None. - - Returns: - StudyLocus: StudyLocus after clumping with information about the `locus` - """ - # If no locus window provided, using the same value: - if locus_window_length is None: - locus_window_length = window_length - - # Run distance based clumping on the summary stats: - clumped_dataframe = WindowBasedClumping.clump( - summary_stats, - window_length=window_length, - p_value_significance=p_value_significance, - ).df.alias("clumped") - - # Get list of columns from clumped dataset for further propagation: - clumped_columns = clumped_dataframe.columns - - # Dropping variants not meeting the baseline criteria: - sumstats_baseline = summary_stats.pvalue_filter(p_value_baseline).df - - # Renaming columns: - sumstats_baseline_renamed = sumstats_baseline.selectExpr( - *[f"{col} as tag_{col}" for col in sumstats_baseline.columns] - ).alias("sumstat") - - study_locus_df = ( - sumstats_baseline_renamed - # Joining the two datasets together: - .join( - f.broadcast(clumped_dataframe), - on=[ - (f.col("sumstat.tag_studyId") == f.col("clumped.studyId")) - & (f.col("sumstat.tag_chromosome") == f.col("clumped.chromosome")) - & ( - f.col("sumstat.tag_position") - >= (f.col("clumped.position") - locus_window_length) - ) - & ( - f.col("sumstat.tag_position") - <= (f.col("clumped.position") + locus_window_length) - ) - ], - how="right", - ) - .withColumn( - "locus", - f.struct( - f.col("tag_variantId").alias("variantId"), - f.col("tag_beta").alias("beta"), - 
f.col("tag_pValueMantissa").alias("pValueMantissa"), - f.col("tag_pValueExponent").alias("pValueExponent"), - f.col("tag_standardError").alias("standardError"), - ), - ) - .groupby("studyLocusId") - .agg( - *[ - f.first(col).alias(col) - for col in clumped_columns - if col != "studyLocusId" - ], - f.collect_list(f.col("locus")).alias("locus"), - ) - ) - - return StudyLocus( - _df=study_locus_df, - _schema=StudyLocus.get_schema(), - ) diff --git a/src/gentropy/pics.py b/src/gentropy/pics.py index c2ed9bf66..80421b9ae 100644 --- a/src/gentropy/pics.py +++ b/src/gentropy/pics.py @@ -3,7 +3,7 @@ from __future__ import annotations from gentropy.common.session import Session -from gentropy.dataset.study_locus import StudyLocus +from gentropy.dataset.study_locus import CredibleInterval, StudyLocus from gentropy.method.pics import PICS @@ -28,6 +28,10 @@ def __init__( session, study_locus_ld_annotated_in ) # PICS - picsed_sl = PICS.finemap(study_locus_ld_annotated).annotate_credible_sets() + picsed_sl = ( + PICS.finemap(study_locus_ld_annotated) + .annotate_credible_sets() + .filter_credible_set(credible_interval=CredibleInterval.IS99) + ) # Write picsed_sl.df.write.mode(session.write_mode).parquet(picsed_study_locus_out) diff --git a/src/gentropy/susie_finemapper.py b/src/gentropy/susie_finemapper.py new file mode 100644 index 000000000..cc4c7a6a1 --- /dev/null +++ b/src/gentropy/susie_finemapper.py @@ -0,0 +1,1125 @@ +"""Step to run a finemapping using.""" + +from __future__ import annotations + +import logging +import time +from typing import Any + +import hail as hl +import numpy as np +import pandas as pd +import pyspark.sql.functions as f +import scipy as sc +from pyspark.sql import DataFrame, Row, Window +from pyspark.sql.functions import row_number +from pyspark.sql.types import ( + DoubleType, + IntegerType, + StringType, + StructField, + StructType, +) + +from gentropy.common.session import Session +from gentropy.common.spark_helpers import 
def __init__(
    self,
    session: Session,
    study_locus_to_finemap: str,
    study_locus_collected_path: str,
    study_index_path: str,
    output_path: str,
    locus_radius: int = 500_000,
    max_causal_snps: int = 10,
    primary_signal_pval_threshold: float = 1,
    secondary_signal_pval_threshold: float = 1,
    purity_mean_r2_threshold: float = 0,
    purity_min_r2_threshold: float = 0,
    cs_lbf_thr: float = 2,
    sum_pips: float = 0.99,
    logging: bool = False,
    susie_est_tausq: bool = False,
    run_carma: bool = False,
    run_sumstat_imputation: bool = False,
    carma_time_limit: int = 600,
    imputed_r2_threshold: float = 0.9,
    ld_score_threshold: float = 5,
    output_path_log: str = "~/",
) -> None:
    """Run fine-mapping on a studyLocusId from a collected studyLocus table.

    The fine-mapped credible sets are written as parquet to
    `<output_path>/<study_locus_to_finemap>`. When `logging` is enabled the log
    table is additionally written to `<output_path_log>/<study_locus_to_finemap>`.

    Args:
        session (Session): Spark session
        study_locus_to_finemap (str): studyLocusId of the row to fine-map; also used as the output sub-directory name
        study_locus_collected_path (str): path to the collected study locus
        study_index_path (str): path to the study index
        output_path (str): path to the output
        locus_radius (int): Radius of base-pair window around the locus, default is 500_000
        max_causal_snps (int): Maximum number of causal variants in locus, default is 10
        primary_signal_pval_threshold (float): p-value threshold for the lead variant from the primary signal (credibleSetIndex==1), default is 1
        secondary_signal_pval_threshold (float): p-value threshold for the lead variant from the secondary signals, default is 1
        purity_mean_r2_threshold (float): threshold for purity mean r2 qc metrics for filtering credible sets, default is 0
        purity_min_r2_threshold (float): threshold for purity min r2 qc metrics for filtering credible sets, default is 0
        cs_lbf_thr (float): credible set logBF threshold for filtering credible sets, default is 2
        sum_pips (float): the expected sum of posterior probabilities in the locus, default is 0.99 (99% credible set)
        logging (bool): enable logging, default is False; when True a different fine-mapping wrapper is run
        susie_est_tausq (bool): estimate tau squared, default is False (only forwarded when `logging` is True)
        run_carma (bool): run CARMA, default is False (only forwarded when `logging` is True)
        run_sumstat_imputation (bool): run summary statistics imputation, default is False (only forwarded when `logging` is True)
        carma_time_limit (int): CARMA time limit, default is 600 seconds (only forwarded when `logging` is True)
        imputed_r2_threshold (float): imputed R2 threshold, default is 0.9 (only forwarded when `logging` is True)
        ld_score_threshold (float): LD score threshold for imputation, default is 5 (only forwarded when `logging` is True)
        output_path_log (str): path to the output log, default is "~/"

    Raises:
        IndexError: if no row of the collected study locus has the requested studyLocusId
    """
    # NOTE: the `logging` argument shadows the `logging` module inside this
    # method, so the module must not be used here.

    # Initialise Hail
    hl.init(sc=session.spark.sparkContext, log="/dev/null")

    # Read studyLocus
    matching_rows = (
        StudyLocus.from_parquet(session, study_locus_collected_path)
        .df.filter(f.col("studyLocusId") == study_locus_to_finemap)
        .collect()
    )
    if not matching_rows:
        # Same exception type as the former bare `.collect()[0]`, but with
        # enough context to diagnose a wrong id or a wrong input path.
        raise IndexError(
            f"studyLocusId {study_locus_to_finemap!r} not found in "
            f"{study_locus_collected_path!r}"
        )
    study_locus = matching_rows[0]
    study_index = StudyIndex.from_parquet(session, study_index_path)

    result_path = output_path + "/" + study_locus_to_finemap

    # Run fine-mapping
    if logging:
        result_logging = self.susie_finemapper_one_studylocus_row_v3_dev_ss_gathered(
            session=session,
            study_locus_row=study_locus,
            study_index=study_index,
            radius=locus_radius,
            max_causal_snps=max_causal_snps,
            primary_signal_pval_threshold=primary_signal_pval_threshold,
            secondary_signal_pval_threshold=secondary_signal_pval_threshold,
            purity_mean_r2_threshold=purity_mean_r2_threshold,
            purity_min_r2_threshold=purity_min_r2_threshold,
            cs_lbf_thr=cs_lbf_thr,
            sum_pips=sum_pips,
            susie_est_tausq=susie_est_tausq,
            run_carma=run_carma,
            run_sumstat_imputation=run_sumstat_imputation,
            carma_time_limit=carma_time_limit,
            imputed_r2_threshold=imputed_r2_threshold,
            ld_score_threshold=ld_score_threshold,
        )
        # Write result
        result_logging["study_locus"].df.write.mode(session.write_mode).parquet(
            result_path
        )
        # Write log
        result_logging["log"].df.write.mode(session.write_mode).parquet(
            output_path_log + "/" + study_locus_to_finemap
        )
    else:
        result = self.susie_finemapper_ss_gathered(
            session=session,
            study_locus_row=study_locus,
            study_index=study_index,
            radius=locus_radius,
            max_causal_snps=max_causal_snps,
            primary_signal_pval_threshold=primary_signal_pval_threshold,
            secondary_signal_pval_threshold=secondary_signal_pval_threshold,
            purity_mean_r2_threshold=purity_mean_r2_threshold,
            purity_min_r2_threshold=purity_min_r2_threshold,
            cs_lbf_thr=cs_lbf_thr,
            sum_pips=sum_pips,
        )
        # The wrapper returns None when nothing could be fine-mapped; in that
        # case no output is written.
        if result is not None:
            result.df.write.mode(session.write_mode).parquet(result_path)
secondary_signal_pval_threshold: float = 1e-7, + purity_mean_r2_threshold: float = 0, + purity_min_r2_threshold: float = 0.25, + sum_pips: float = 0.99, + cs_lbf_thr: float = 2, + ) -> StudyLocus: + """Susie fine-mapper for StudyLocus row with SummaryStatistics object. + + Args: + GWAS (SummaryStatistics): GWAS summary statistics + session (Session): Spark session + study_locus_row (Row): StudyLocus row + study_index (StudyIndex): StudyIndex object + radius (int): window size for fine-mapping + max_causal_snps (int): number of causal variants + primary_signal_pval_threshold (float): p-value threshold for the lead variant from the primary signal (credibleSetIndex==1) + secondary_signal_pval_threshold (float): p-value threshold for the lead variant from the secondary signals + purity_mean_r2_threshold (float): thrshold for purity mean r2 qc metrics for filtering credible sets + purity_min_r2_threshold (float): thrshold for purity min r2 qc metrics for filtering credible sets + sum_pips (float): the expected sum of posterior probabilities in the locus, default is 0.99 (99% credible set) + cs_lbf_thr (float): credible set logBF threshold for filtering credible sets, default is 2 + + Returns: + StudyLocus: StudyLocus object with fine-mapped credible sets + """ + # PLEASE DO NOT REMOVE THIS LINE + pd.DataFrame.iteritems = pd.DataFrame.items + + chromosome = study_locus_row["chromosome"] + position = study_locus_row["position"] + studyId = study_locus_row["studyId"] + + study_index_df = study_index._df + study_index_df = study_index_df.filter(f.col("studyId") == studyId) + major_population = study_index_df.select( + "studyId", + f.array_max(f.col("ldPopulationStructure")) + .getItem("ldPopulation") + .alias("majorPopulation"), + ).collect()[0]["majorPopulation"] + + region = ( + chromosome + + ":" + + str(int(position - radius)) + + "-" + + str(int(position + radius)) + ) + + gwas_df = ( + GWAS.df.withColumn("z", f.col("beta") / f.col("standardError")) + 
.withColumn("chromosome", f.split(f.col("variantId"), "_")[0]) + .withColumn("position", f.split(f.col("variantId"), "_")[1]) + .filter(f.col("studyId") == studyId) + .filter(f.col("z").isNotNull()) + ) + # Remove ALL duplicated variants from GWAS DataFrame - we don't know which is correct + variant_counts = gwas_df.groupBy("variantId").count() + unique_variants = variant_counts.filter(f.col("count") == 1) + gwas_df = gwas_df.join(unique_variants, on="variantId", how="left_semi") + + ld_index = ( + GnomADLDMatrix() + .get_locus_index( + study_locus_row=study_locus_row, + radius=radius, + major_population=major_population, + ) + .withColumn( + "variantId", + f.concat( + f.lit(chromosome), + f.lit("_"), + f.col("`locus.position`"), + f.lit("_"), + f.col("alleles").getItem(0), + f.lit("_"), + f.col("alleles").getItem(1), + ).cast("string"), + ) + ) + + # Filtering out the variants that are not in the LD matrix, we don't need them + gwas_index = gwas_df.join( + ld_index.select("variantId", "alleles", "idx"), on="variantId" + ).sort("idx") + + gnomad_ld = GnomADLDMatrix.get_numpy_matrix( + gwas_index, gnomad_ancestry=major_population + ) + + pd_df = gwas_index.toPandas() + z_to_fm = np.array(pd_df["z"]) + ld_to_fm = gnomad_ld + + susie_output = SUSIE_inf.susie_inf(z=z_to_fm, LD=ld_to_fm, L=max_causal_snps) + + schema = StructType( + [ + StructField("variantId", StringType(), True), + StructField("chromosome", StringType(), True), + StructField("position", IntegerType(), True), + StructField("z", DoubleType(), True), + ] + ) + pd_df["position"] = pd_df["position"].astype(int) + variant_index = session.spark.createDataFrame( + pd_df[["variantId", "chromosome", "position", "z"]], + schema=schema, + ) + + return SusieFineMapperStep.susie_inf_to_studylocus( + susie_output=susie_output, + session=session, + studyId=studyId, + region=region, + variant_index=variant_index, + ld_matrix=ld_to_fm, + primary_signal_pval_threshold=primary_signal_pval_threshold, + 
@staticmethod
def susie_inf_to_studylocus(
    susie_output: dict[str, Any],
    session: Session,
    studyId: str,
    region: str,
    variant_index: DataFrame,
    ld_matrix: np.ndarray,
    cs_lbf_thr: float = 2,
    sum_pips: float = 0.99,
    primary_signal_pval_threshold: float = 1,
    secondary_signal_pval_threshold: float = 1,
    purity_mean_r2_threshold: float = 0,
    purity_min_r2_threshold: float = 0,
) -> StudyLocus:
    """Convert SuSiE-inf output to StudyLocus DataFrame.

    Credible sets are built in decreasing order of their logBF. For each one the
    variants are ranked by posterior probability and kept until the cumulative
    probability reaches `sum_pips`. Each credible set is then annotated with its
    purity (mean and min squared LD between its variants) and with the z-score
    and p-value of its lead variant, and finally filtered on the thresholds.

    Args:
        susie_output (dict[str, Any]): SuSiE-inf output dictionary; the keys "PIP", "lbf_variable", "mu" and "lbf" are read
        session (Session): Spark session
        studyId (str): study ID
        region (str): region
        variant_index (DataFrame): DataFrame with variant information (variantId, chromosome, position, z)
        ld_matrix (np.ndarray): LD matrix used for fine-mapping
        cs_lbf_thr (float): credible set logBF threshold for filtering credible sets, default is 2
        sum_pips (float): the expected sum of posterior probabilities in the locus, default is 0.99 (99% credible set)
        primary_signal_pval_threshold (float): p-value threshold for the lead variant from the primary signal (credibleSetIndex==1)
        secondary_signal_pval_threshold (float): p-value threshold for the lead variant from the secondary signals
        purity_mean_r2_threshold (float): threshold for purity mean r2 qc metrics for filtering credible sets
        purity_min_r2_threshold (float): threshold for purity min r2 qc metrics for filtering credible sets

    Returns:
        StudyLocus: StudyLocus object with fine-mapped credible sets

    Raises:
        ValueError: if the SuSiE-inf output does not contain any credible set
    """
    # PLEASE DO NOT REMOVE THIS LINE
    pd.DataFrame.iteritems = pd.DataFrame.items

    # Conversion factor from natural-log to log10 Bayes factors (log10(e)).
    ln_to_log10 = 0.4342944819

    variants = np.array(
        [row["variantId"] for row in variant_index.select("variantId").collect()]
    ).reshape(-1, 1)

    PIPs = susie_output["PIP"]
    lbfs = susie_output["lbf_variable"]
    mu = susie_output["mu"]
    # Column layout: [variantId | PIP per effect | lbf per effect | mu per effect].
    # Stacking with the id column turns every value into a string, hence the
    # explicit float conversions below.
    susie_result = np.hstack((variants, PIPs, lbfs, mu))

    L_snps = PIPs.shape[1]

    # Extracting credible sets, strongest (highest logBF) first
    order_creds = list(enumerate(susie_output["lbf"]))
    order_creds.sort(key=lambda x: x[1], reverse=True)

    win = Window.rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)

    cred_sets = None
    for counter, (i, cs_lbf_value) in enumerate(order_creds, start=1):
        # The top credible set is always kept; weaker ones must pass the
        # logBF threshold. `counter` still advances so that credibleSetIndex
        # reflects the rank among all effects.
        if counter > 1 and cs_lbf_value < cs_lbf_thr:
            continue

        pip_column = susie_result[:, i + 1].astype(float)
        sorted_arr = susie_result[pip_column.argsort()[::-1]]
        cumsum_arr = np.cumsum(sorted_arr[:, i + 1].astype(float))

        # Keep variants up to and including the one reaching `sum_pips`;
        # keep everything when the target coverage is never reached.
        reached = cumsum_arr >= sum_pips
        n_keep = int(np.argmax(reached)) + 1 if reached.any() else len(cumsum_arr)
        cred_set_arr = sorted_arr[:n_keep][
            :, [0, i + 1, i + L_snps + 1, i + 2 * L_snps + 1]
        ]

        cred_set = (
            session.spark.createDataFrame(
                cred_set_arr.tolist(),
                ["variantId", "posteriorProbability", "logBF", "beta"],
            )
            .join(
                variant_index.select(
                    "variantId",
                    "chromosome",
                    "position",
                ),
                "variantId",
            )
            # The values arrive as strings: cast before sorting, otherwise the
            # order is lexicographic (e.g. "9e-05" would rank above "0.8")
            # and the wrong lead variant could be selected.
            .sort(f.col("posteriorProbability").cast("double").desc())
            .withColumn(
                "locus",
                f.collect_list(
                    f.struct(
                        f.col("variantId").cast("string").alias("variantId"),
                        f.col("posteriorProbability")
                        .cast("double")
                        .alias("posteriorProbability"),
                        f.col("logBF").cast("double").alias("logBF"),
                        f.col("beta").cast("double").alias("beta"),
                    )
                ).over(win),
            )
            # Only the top row (the lead variant) represents the credible set
            .limit(1)
            .withColumns(
                {
                    "studyId": f.lit(studyId),
                    "region": f.lit(region),
                    "credibleSetIndex": f.lit(counter),
                    "credibleSetlog10BF": f.lit(cs_lbf_value * ln_to_log10),
                    "finemappingMethod": f.lit("SuSiE-inf"),
                }
            )
            .withColumn(
                "studyLocusId",
                StudyLocus.assign_study_locus_id(f.col("studyId"), f.col("variantId")),
            )
            .select(
                "studyLocusId",
                "studyId",
                "region",
                "credibleSetIndex",
                "locus",
                "variantId",
                "chromosome",
                "position",
                "finemappingMethod",
                "credibleSetlog10BF",
            )
        )
        cred_sets = cred_set if cred_sets is None else cred_sets.unionByName(cred_set)

    if cred_sets is None:
        raise ValueError("SuSiE-inf output does not contain any credible set.")

    # Calculating purity and lead-variant statistics.
    # All per-credible-set columns are collected in ONE action so that the
    # index, lead variant and locus members are guaranteed to be row-aligned.
    variant_index_df = variant_index.toPandas()
    cred_sets_pd = cred_sets.select(
        "credibleSetIndex",
        "variantId",
        f.col("locus.variantId").alias("locusVariantIds"),
    ).toPandas()

    # Maps a variantId to its row position in `variant_index_df`, which is
    # also used as its row/column position in `ld_matrix`.
    variant_lookup = variant_index_df.set_index("variantId").index

    cred_set_index = cred_sets_pd["credibleSetIndex"].tolist()
    lead_positions = [
        variant_lookup.get_loc(variant_id) for variant_id in cred_sets_pd["variantId"]
    ]
    z_values = variant_index_df.iloc[lead_positions]["z"].tolist()
    pval = sc.stats.chi2.sf(np.array(z_values) ** 2, 1)

    # sometimes pval is 0, we need to avoid it
    pval[pval < 1e-322] = 1e-322

    neglogpval = (-np.log10(pval)).tolist()

    list_purity_mean_r2 = []
    list_purity_min_r2 = []
    for locus_variant_ids in cred_sets_pd["locusVariantIds"]:
        positions = [
            variant_lookup.get_loc(variant_id) for variant_id in locus_variant_ids
        ]
        squared_matrix = ld_matrix[positions, :][:, positions] ** 2
        list_purity_mean_r2.append(float(np.mean(squared_matrix)))
        list_purity_min_r2.append(float(np.min(squared_matrix)))

    # One purity value PER credible set (previously the value of the last
    # credible set was broadcast to all of them).
    df = pd.DataFrame(
        {
            "credibleSetIndex": cred_set_index,
            "purityMeanR2": list_purity_mean_r2,
            "purityMinR2": list_purity_min_r2,
            "zScore": z_values,
            "neglogpval": neglogpval,
        }
    )
    schema = StructType(
        [
            StructField("credibleSetIndex", IntegerType(), True),
            StructField("purityMeanR2", DoubleType(), True),
            StructField("purityMinR2", DoubleType(), True),
            StructField("zScore", DoubleType(), True),
            StructField("neglogpval", DoubleType(), True),
        ]
    )

    df_spark = session.spark.createDataFrame(df, schema=schema)

    cred_sets = cred_sets.join(df_spark, on="credibleSetIndex")

    mantissa, exponent = neglog_pvalue_to_mantissa_and_exponent(cred_sets.neglogpval)

    cred_sets = cred_sets.withColumn("pValueMantissa", mantissa)
    cred_sets = cred_sets.withColumn("pValueExponent", exponent)

    cred_sets = cred_sets.withColumn(
        "pValueMantissa", f.col("pValueMantissa").cast("float")
    )

    # Secondary signals must pass the secondary p-value threshold ...
    cred_sets = cred_sets.filter(
        (f.col("neglogpval") >= -np.log10(secondary_signal_pval_threshold))
        | (f.col("credibleSetIndex") == 1)
    )

    # ... and the primary signal must pass the primary p-value threshold.
    cred_sets = cred_sets.filter(
        (f.col("neglogpval") >= -np.log10(primary_signal_pval_threshold))
        | (f.col("credibleSetIndex") > 1)
    )

    cred_sets = cred_sets.drop("neglogpval")

    cred_sets = cred_sets.filter(
        (f.col("credibleSetlog10BF") >= cs_lbf_thr * ln_to_log10)
        | (f.col("credibleSetIndex") == 1)
    )

    cred_sets = cred_sets.filter(f.col("purityMeanR2") >= purity_mean_r2_threshold)
    cred_sets = cred_sets.filter(f.col("purityMinR2") >= purity_min_r2_threshold)

    # Several credible sets can share a lead variant (same studyLocusId);
    # keep only the strongest one (lowest credibleSetIndex).
    window = Window.partitionBy("studyLocusId").orderBy("credibleSetIndex")
    cred_sets = cred_sets.withColumn("rank", row_number().over(window))
    cred_sets = cred_sets.filter(cred_sets["rank"] == 1).drop("rank")

    return StudyLocus(
        _df=cred_sets,
        _schema=StudyLocus.get_schema(),
    )
+ + Args: + session (Session): Spark session + study_locus_row (Row): StudyLocus row + study_index (StudyIndex): StudyIndex object + radius (int): window size for fine-mapping + max_causal_snps (int): number of causal variants + primary_signal_pval_threshold (float): p-value threshold for the lead variant from the primary signal (credibleSetIndex==1) + secondary_signal_pval_threshold (float): p-value threshold for the lead variant from the secondary signals + purity_mean_r2_threshold (float): thrshold for purity mean r2 qc metrics for filtering credible sets + purity_min_r2_threshold (float): thrshold for purity min r2 qc metrics for filtering credible sets + cs_lbf_thr (float): credible set logBF threshold for filtering credible sets + sum_pips (float): the expected sum of posterior probabilities in the locus, default is 0.99 (99% credible set) + + Returns: + StudyLocus | None: StudyLocus object with fine-mapped credible sets, or None + """ + # PLEASE DO NOT REMOVE THIS LINE + pd.DataFrame.iteritems = pd.DataFrame.items + + chromosome = study_locus_row["chromosome"] + position = study_locus_row["position"] + studyId = study_locus_row["studyId"] + + study_index_df = study_index._df + study_index_df = study_index_df.filter(f.col("studyId") == studyId) + major_population = study_index_df.select( + "studyId", + f.array_max(f.col("ldPopulationStructure")) + .getItem("ldPopulation") + .alias("majorPopulation"), + ).collect()[0]["majorPopulation"] + + region = ( + chromosome + + ":" + + str(int(position - radius)) + + "-" + + str(int(position + radius)) + ) + + schema = StudyLocus.get_schema() + gwas_df = session.spark.createDataFrame([study_locus_row], schema=schema) + exploded_df = gwas_df.select(f.explode("locus").alias("locus")) + + result_df = exploded_df.select( + "locus.variantId", "locus.beta", "locus.standardError" + ) + gwas_df = ( + result_df.withColumn("z", f.col("beta") / f.col("standardError")) + .withColumn("chromosome", f.split(f.col("variantId"), 
"_")[0]) + .withColumn("position", f.split(f.col("variantId"), "_")[1]) + .filter(f.col("z").isNotNull()) + ) + # Remove ALL duplicated variants from GWAS DataFrame - we don't know which is correct + variant_counts = gwas_df.groupBy("variantId").count() + unique_variants = variant_counts.filter(f.col("count") == 1) + gwas_df = gwas_df.join(unique_variants, on="variantId", how="left_semi") + + ld_index = ( + GnomADLDMatrix() + .get_locus_index( + study_locus_row=study_locus_row, + radius=radius, + major_population=major_population, + ) + .withColumn( + "variantId", + f.concat( + f.lit(chromosome), + f.lit("_"), + f.col("`locus.position`"), + f.lit("_"), + f.col("alleles").getItem(0), + f.lit("_"), + f.col("alleles").getItem(1), + ).cast("string"), + ) + ) + + # Filtering out the variants that are not in the LD matrix, we don't need them + gwas_index = gwas_df.join( + ld_index.select("variantId", "alleles", "idx"), on="variantId" + ).sort("idx") + if gwas_index.rdd.isEmpty(): + logging.warning("No overlapping variants in the LD Index") + return None + gnomad_ld = GnomADLDMatrix.get_numpy_matrix( + gwas_index, gnomad_ancestry=major_population + ) + + pd_df = gwas_index.toPandas() + z_to_fm = np.array(pd_df["z"]) + ld_to_fm = gnomad_ld + + susie_output = SUSIE_inf.susie_inf(z=z_to_fm, LD=ld_to_fm, L=max_causal_snps) + + schema = StructType( + [ + StructField("variantId", StringType(), True), + StructField("chromosome", StringType(), True), + StructField("position", IntegerType(), True), + StructField("z", DoubleType(), True), + ] + ) + pd_df["position"] = pd_df["position"].astype(int) + variant_index = session.spark.createDataFrame( + pd_df[["variantId", "chromosome", "position", "z"]], + schema=schema, + ) + + return SusieFineMapperStep.susie_inf_to_studylocus( + susie_output=susie_output, + session=session, + studyId=studyId, + region=region, + variant_index=variant_index, + ld_matrix=ld_to_fm, + primary_signal_pval_threshold=primary_signal_pval_threshold, + 
@staticmethod
def susie_finemapper_from_prepared_dataframes(
    GWAS_df: DataFrame,
    ld_index: DataFrame,
    gnomad_ld: np.ndarray,
    L: int,
    session: Session,
    studyId: str,
    region: str,
    susie_est_tausq: bool = False,
    run_carma: bool = False,
    run_sumstat_imputation: bool = False,
    carma_time_limit: int = 600,
    imputed_r2_threshold: float = 0.8,
    ld_score_threshold: float = 4,
    sum_pips: float = 0.99,
    primary_signal_pval_threshold: float = 5e-8,
    secondary_signal_pval_threshold: float = 1e-7,
    purity_mean_r2_threshold: float = 0,
    purity_min_r2_threshold: float = 0.25,
    cs_lbf_thr: float = 2,
) -> dict[str, Any]:
    """Run SuSiE-inf fine-mapping on an already prepared GWAS table, LD index and LD matrix.

    The GWAS table is de-duplicated on `variantId` (every copy of a duplicated variant is
    dropped), restricted to the variants present in the LD index, optionally cleaned from
    CARMA outliers, optionally extended with imputed z-scores, and then passed to SuSiE-inf.

    Args:
        GWAS_df (DataFrame): GWAS DataFrame with mandatory columns: z, variantId
        ld_index (DataFrame): LD index DataFrame; must contain a variantId column and its row order must match the rows/columns of `gnomad_ld`
        gnomad_ld (np.ndarray): GnomAD LD matrix
        L (int): number of causal variants
        session (Session): Spark session
        studyId (str): study ID
        region (str): region
        susie_est_tausq (bool): estimate tau squared, default is False
        run_carma (bool): run CARMA, default is False
        run_sumstat_imputation (bool): run summary statistics imputation, default is False
        carma_time_limit (int): CARMA time limit, default is 600 seconds
        imputed_r2_threshold (float): imputed R2 threshold, default is 0.8
        ld_score_threshold (float): LD score threshold for imputation, default is 4
        sum_pips (float): the expected sum of posterior probabilities in the locus, default is 0.99 (99% credible set)
        primary_signal_pval_threshold (float): p-value threshold for the lead variant from the primary signal (credibleSetIndex==1)
        secondary_signal_pval_threshold (float): p-value threshold for the lead variant from the secondary signals
        purity_mean_r2_threshold (float): threshold for purity mean r2 qc metrics for filtering credible sets
        purity_min_r2_threshold (float): threshold for purity min r2 qc metrics for filtering credible sets
        cs_lbf_thr (float): credible set logBF threshold for filtering credible sets, default is 2

    Returns:
        dict[str, Any]: dictionary with two keys: `study_locus` (the value returned by `SusieFineMapperStep.susie_inf_to_studylocus`) and `log` (a one-row pandas DataFrame with the variant counts at each stage and the elapsed time in seconds)
    """
    # PLEASE DO NOT REMOVE THIS LINE
    # NOTE(review): presumably a shim for code paths that still call the removed
    # `DataFrame.iteritems` when converting pandas <-> Spark - confirm before touching.
    pd.DataFrame.iteritems = pd.DataFrame.items

    start_time = time.time()

    gwas_pdf = GWAS_df.toPandas()
    N_gwas_before_dedupl = len(gwas_pdf)

    # keep=False drops *every* copy of a duplicated variant.
    gwas_pdf = gwas_pdf.drop_duplicates(subset="variantId", keep=False)

    ld_pdf = ld_index.toPandas()

    N_gwas = len(gwas_pdf)
    N_ld = len(ld_pdf)

    # Explicit row position of each LD-index variant inside `gnomad_ld`; this replaces the
    # implicit "index"/"index_y" columns produced by repeated reset_index() + merge.
    ld_lookup = pd.DataFrame(
        {
            "variantId": ld_pdf["variantId"].to_numpy(),
            "ld_position": np.arange(len(ld_pdf)),
        }
    )

    # Filtering out the variants that are not in the LD matrix, we don't need them.
    # A single inner merge yields z-scores and LD positions in the same row order, so the
    # z vector and the LD sub-matrix always stay aligned.
    gwas_pdf = gwas_pdf[["variantId", "z"]].merge(
        ld_lookup, on="variantId", how="inner"
    )
    N_after_merge = len(gwas_pdf)

    indices = gwas_pdf["ld_position"].to_numpy()
    ld_to_fm = gnomad_ld[indices][:, indices]
    z_to_fm = gwas_pdf["z"].to_numpy()

    N_outliers = 0
    if run_carma:
        carma_output = CARMA.time_limited_CARMA_spike_slab_noEM(
            z=z_to_fm, ld=ld_to_fm, sec_threshold=carma_time_limit
        )
        outliers = carma_output["Outliers"]
        # Test for None first and use len(): this is safe for None, lists and arrays alike.
        if outliers is not None and len(outliers) > 0:
            # Outliers are row labels of the merged table, which has a fresh 0..n-1 index.
            gwas_pdf = gwas_pdf.drop(outliers).reset_index(drop=True)
            indices = gwas_pdf["ld_position"].to_numpy()
            ld_to_fm = gnomad_ld[indices][:, indices]
            z_to_fm = gwas_pdf["z"].to_numpy()
            N_outliers = len(outliers)

    N_imputed = 0
    if run_sumstat_imputation:
        known = indices
        # Set membership keeps this linear instead of scanning the array per element.
        known_positions = set(known.tolist())
        unknown = [
            position
            for position in range(len(gnomad_ld))
            if position not in known_positions
        ]
        sig_t = gnomad_ld[known, :][:, known]
        sig_i_t = gnomad_ld[unknown, :][:, known]

        sumstat_imp_res = SummaryStatisticsImputation.raiss_model(
            z_scores_known=z_to_fm,
            ld_matrix_known=sig_t,
            ld_matrix_known_missing=sig_i_t,
            lamb=0.01,
            rtol=0.01,
        )

        # A variant is accepted only if it passes both the R2 and the LD-score threshold.
        accepted = (sumstat_imp_res["imputation_r2"] >= imputed_r2_threshold) * (
            sumstat_imp_res["ld_score"] >= ld_score_threshold
        )
        if np.any(accepted):
            imputed = np.where(accepted)[0]
            index_to_add = [unknown[i] for i in imputed]
            index_to_fm = np.concatenate((known, index_to_add))

            ld_to_fm = gnomad_ld[index_to_fm][:, index_to_fm]

            snp_info_to_add = pd.DataFrame(
                {
                    "variantId": ld_pdf.iloc[index_to_add, :]["variantId"].to_numpy(),
                    "z": sumstat_imp_res["mu"][imputed],
                    "ld_position": index_to_add,
                }
            )
            gwas_pdf = pd.concat([gwas_pdf, snp_info_to_add], ignore_index=True)
            z_to_fm = gwas_pdf["z"].to_numpy()

            N_imputed = len(imputed)

    susie_output = SUSIE_inf.susie_inf(
        z=z_to_fm, LD=ld_to_fm, L=L, est_tausq=susie_est_tausq
    )

    schema = StructType(
        [
            StructField("variantId", StringType(), True),
            StructField("z", DoubleType(), True),
        ]
    )
    # chromosome and position are parsed back out of the "chr_pos_ref_alt" style variantId.
    variant_index = (
        session.spark.createDataFrame(
            gwas_pdf[["variantId", "z"]],
            schema=schema,
        )
        .withColumn("chromosome", f.split(f.col("variantId"), "_")[0].cast("string"))
        .withColumn("position", f.split(f.col("variantId"), "_")[1].cast("int"))
    )

    study_locus = SusieFineMapperStep.susie_inf_to_studylocus(
        susie_output=susie_output,
        session=session,
        studyId=studyId,
        region=region,
        variant_index=variant_index,
        sum_pips=sum_pips,
        ld_matrix=ld_to_fm,
        primary_signal_pval_threshold=primary_signal_pval_threshold,
        secondary_signal_pval_threshold=secondary_signal_pval_threshold,
        purity_mean_r2_threshold=purity_mean_r2_threshold,
        purity_min_r2_threshold=purity_min_r2_threshold,
        cs_lbf_thr=cs_lbf_thr,
    )

    end_time = time.time()

    # The "eleapsed_time" spelling is kept on purpose: it is a column name consumers may read.
    log_df = pd.DataFrame(
        {
            "N_gwas_before_dedupl": N_gwas_before_dedupl,
            "N_gwas": N_gwas,
            "N_ld": N_ld,
            "N_overlap": N_after_merge,
            "N_outliers": N_outliers,
            "N_imputed": N_imputed,
            "N_final_to_fm": len(ld_to_fm),
            "eleapsed_time": end_time - start_time,
        },
        index=[0],
    )

    return {
        "study_locus": study_locus,
        "log": log_df,
    }
@staticmethod
def susie_finemapper_one_studylocus_row_v2_dev(
    GWAS: SummaryStatistics,
    session: Session,
    study_locus_row: Row,
    study_index: StudyIndex,
    radius: int = 1_000_000,
    max_causal_snps: int = 10,
    susie_est_tausq: bool = False,
    run_carma: bool = False,
    run_sumstat_imputation: bool = False,
    carma_time_limit: int = 600,
    imputed_r2_threshold: float = 0.9,
    ld_score_threshold: float = 5,
    sum_pips: float = 0.99,
    primary_signal_pval_threshold: float = 5e-8,
    secondary_signal_pval_threshold: float = 1e-7,
    purity_mean_r2_threshold: float = 0,
    purity_min_r2_threshold: float = 0.25,
    cs_lbf_thr: float = 2,
) -> dict[str, Any]:
    """Susie fine-mapper function that uses Summary Statistics, chromosome and position as inputs.

    Builds the GWAS window and the LD index/matrix around the study-locus lead variant and
    delegates the fine-mapping to `susie_finemapper_from_prepared_dataframes`.

    Args:
        GWAS (SummaryStatistics): GWAS summary statistics
        session (Session): Spark session
        study_locus_row (Row): StudyLocus row
        study_index (StudyIndex): StudyIndex object
        radius (int): Radius in base-pairs of window for fine-mapping
        max_causal_snps (int): maximum number of causal variants
        susie_est_tausq (bool): estimate tau squared, default is False
        run_carma (bool): run CARMA, default is False
        run_sumstat_imputation (bool): run summary statistics imputation, default is False
        carma_time_limit (int): CARMA time limit, default is 600 seconds
        imputed_r2_threshold (float): imputed R2 threshold, default is 0.9
        ld_score_threshold (float): LD score threshold for imputation, default is 5
        sum_pips (float): the expected sum of posterior probabilities in the locus, default is 0.99 (99% credible set)
        primary_signal_pval_threshold (float): p-value threshold for the lead variant from the primary signal (credibleSetIndex==1)
        secondary_signal_pval_threshold (float): p-value threshold for the lead variant from the secondary signals
        purity_mean_r2_threshold (float): threshold for purity mean r2 qc metrics for filtering credible sets
        purity_min_r2_threshold (float): threshold for purity min r2 qc metrics for filtering credible sets
        cs_lbf_thr (float): credible set logBF threshold for filtering credible sets, default is 2

    Returns:
        dict[str, Any]: the dictionary returned by `SusieFineMapperStep.susie_finemapper_from_prepared_dataframes`

    Raises:
        IndexError: if the studyId of the study-locus row is not present in the study index
    """
    # PLEASE DO NOT REMOVE THIS LINE
    pd.DataFrame.iteritems = pd.DataFrame.items

    chromosome = study_locus_row["chromosome"]
    position = study_locus_row["position"]
    studyId = study_locus_row["studyId"]

    # NOTE(review): array_max on an array of structs orders the structs field by field,
    # starting with the first one - confirm that this really selects the population with
    # the largest sample share for the ldPopulationStructure schema.
    population_rows = (
        study_index._df.filter(f.col("studyId") == studyId)
        .select(
            "studyId",
            f.array_max(f.col("ldPopulationStructure"))
            .getItem("ldPopulation")
            .alias("majorPopulation"),
        )
        .collect()
    )
    if not population_rows:
        # Same exception type as the former bare `.collect()[0]`, but with a usable message.
        raise IndexError(f"studyId {studyId} was not found in the study index")
    major_population = population_rows[0]["majorPopulation"]

    window_start = position - radius
    window_end = position + radius
    region = chromosome + ":" + str(int(window_start)) + "-" + str(int(window_end))

    # z-score, chromosome and position are derived per variant; the table is then limited
    # to this study and to the +/- radius window around the lead variant.
    gwas_df = (
        GWAS.df.withColumn("z", f.col("beta") / f.col("standardError"))
        .withColumn("chromosome", f.split(f.col("variantId"), "_")[0].cast("string"))
        .withColumn("position", f.split(f.col("variantId"), "_")[1].cast("int"))
        .filter(f.col("studyId") == studyId)
        .filter(f.col("z").isNotNull())
        .filter(f.col("chromosome") == chromosome)
        .filter(f.col("position") >= window_start)
        .filter(f.col("position") <= window_end)
    )

    # The LD index gets a variantId of the form chromosome_position_allele0_allele1 so it
    # can be joined with the GWAS table.
    ld_index = (
        GnomADLDMatrix()
        .get_locus_index(
            study_locus_row=study_locus_row,
            radius=radius,
            major_population=major_population,
        )
        .withColumn(
            "variantId",
            f.concat(
                f.lit(chromosome),
                f.lit("_"),
                f.col("`locus.position`"),
                f.lit("_"),
                f.col("alleles").getItem(0),
                f.lit("_"),
                f.col("alleles").getItem(1),
            ).cast("string"),
        )
    )

    gnomad_ld = GnomADLDMatrix.get_numpy_matrix(
        ld_index, gnomad_ancestry=major_population
    )

    return SusieFineMapperStep.susie_finemapper_from_prepared_dataframes(
        GWAS_df=gwas_df,
        ld_index=ld_index,
        gnomad_ld=gnomad_ld,
        L=max_causal_snps,
        session=session,
        studyId=studyId,
        region=region,
        susie_est_tausq=susie_est_tausq,
        run_carma=run_carma,
        run_sumstat_imputation=run_sumstat_imputation,
        carma_time_limit=carma_time_limit,
        imputed_r2_threshold=imputed_r2_threshold,
        ld_score_threshold=ld_score_threshold,
        sum_pips=sum_pips,
        primary_signal_pval_threshold=primary_signal_pval_threshold,
        secondary_signal_pval_threshold=secondary_signal_pval_threshold,
        purity_mean_r2_threshold=purity_mean_r2_threshold,
        purity_min_r2_threshold=purity_min_r2_threshold,
        cs_lbf_thr=cs_lbf_thr,
    )
@staticmethod
def susie_finemapper_one_studylocus_row_v3_dev_ss_gathered(
    session: Session,
    study_locus_row: Row,
    study_index: StudyIndex,
    radius: int = 1_000_000,
    max_causal_snps: int = 10,
    susie_est_tausq: bool = False,
    run_carma: bool = False,
    run_sumstat_imputation: bool = False,
    carma_time_limit: int = 600,
    imputed_r2_threshold: float = 0.9,
    ld_score_threshold: float = 5,
    sum_pips: float = 0.99,
    primary_signal_pval_threshold: float = 5e-8,
    secondary_signal_pval_threshold: float = 1e-7,
    purity_mean_r2_threshold: float = 0,
    purity_min_r2_threshold: float = 0.25,
    cs_lbf_thr: float = 2,
) -> dict[str, Any]:
    """Susie fine-mapper function that uses study-locus row with collected locus, chromosome and position as inputs.

    The summary statistics are taken from the `locus` array of the study-locus row itself
    (variantId, beta, standardError); the fine-mapping is delegated to
    `susie_finemapper_from_prepared_dataframes`.

    Args:
        session (Session): Spark session
        study_locus_row (Row): StudyLocus row with collected locus
        study_index (StudyIndex): StudyIndex object
        radius (int): Radius in base-pairs of window for fine-mapping
        max_causal_snps (int): maximum number of causal variants
        susie_est_tausq (bool): estimate tau squared, default is False
        run_carma (bool): run CARMA, default is False
        run_sumstat_imputation (bool): run summary statistics imputation, default is False
        carma_time_limit (int): CARMA time limit, default is 600 seconds
        imputed_r2_threshold (float): imputed R2 threshold, default is 0.9
        ld_score_threshold (float): LD score threshold for imputation, default is 5
        sum_pips (float): the expected sum of posterior probabilities in the locus, default is 0.99 (99% credible set)
        primary_signal_pval_threshold (float): p-value threshold for the lead variant from the primary signal (credibleSetIndex==1)
        secondary_signal_pval_threshold (float): p-value threshold for the lead variant from the secondary signals
        purity_mean_r2_threshold (float): threshold for purity mean r2 qc metrics for filtering credible sets
        purity_min_r2_threshold (float): threshold for purity min r2 qc metrics for filtering credible sets
        cs_lbf_thr (float): credible set logBF threshold for filtering credible sets, default is 2

    Returns:
        dict[str, Any]: the dictionary returned by `SusieFineMapperStep.susie_finemapper_from_prepared_dataframes`

    Raises:
        IndexError: if the studyId of the study-locus row is not present in the study index
    """
    # PLEASE DO NOT REMOVE THIS LINE
    pd.DataFrame.iteritems = pd.DataFrame.items

    chromosome = study_locus_row["chromosome"]
    position = study_locus_row["position"]
    studyId = study_locus_row["studyId"]

    # NOTE(review): array_max on an array of structs orders the structs field by field,
    # starting with the first one - confirm that this really selects the population with
    # the largest sample share for the ldPopulationStructure schema.
    population_rows = (
        study_index._df.filter(f.col("studyId") == studyId)
        .select(
            "studyId",
            f.array_max(f.col("ldPopulationStructure"))
            .getItem("ldPopulation")
            .alias("majorPopulation"),
        )
        .collect()
    )
    if not population_rows:
        # Same exception type as the former bare `.collect()[0]`, but with a usable message.
        raise IndexError(f"studyId {studyId} was not found in the study index")
    major_population = population_rows[0]["majorPopulation"]

    window_start = position - radius
    window_end = position + radius
    region = chromosome + ":" + str(int(window_start)) + "-" + str(int(window_end))

    # Rebuild a one-row StudyLocus DataFrame and flatten its collected `locus` array into
    # one row per tagging variant.
    locus_df = session.spark.createDataFrame(
        [study_locus_row], schema=StudyLocus.get_schema()
    ).select(f.explode("locus").alias("locus"))
    sumstats_df = locus_df.select(
        "locus.variantId", "locus.beta", "locus.standardError"
    )

    gwas_df = (
        sumstats_df.withColumn("z", f.col("beta") / f.col("standardError"))
        .withColumn("chromosome", f.split(f.col("variantId"), "_")[0].cast("string"))
        .withColumn("position", f.split(f.col("variantId"), "_")[1].cast("int"))
        .filter(f.col("chromosome") == chromosome)
        .filter(f.col("position") >= window_start)
        .filter(f.col("position") <= window_end)
        .filter(f.col("z").isNotNull())
    )

    # The LD index gets a variantId of the form chromosome_position_allele0_allele1 so it
    # can be joined with the GWAS table.
    ld_index = (
        GnomADLDMatrix()
        .get_locus_index(
            study_locus_row=study_locus_row,
            radius=radius,
            major_population=major_population,
        )
        .withColumn(
            "variantId",
            f.concat(
                f.lit(chromosome),
                f.lit("_"),
                f.col("`locus.position`"),
                f.lit("_"),
                f.col("alleles").getItem(0),
                f.lit("_"),
                f.col("alleles").getItem(1),
            ).cast("string"),
        )
    )

    gnomad_ld = GnomADLDMatrix.get_numpy_matrix(
        ld_index, gnomad_ancestry=major_population
    )

    return SusieFineMapperStep.susie_finemapper_from_prepared_dataframes(
        GWAS_df=gwas_df,
        ld_index=ld_index,
        gnomad_ld=gnomad_ld,
        L=max_causal_snps,
        session=session,
        studyId=studyId,
        region=region,
        susie_est_tausq=susie_est_tausq,
        run_carma=run_carma,
        run_sumstat_imputation=run_sumstat_imputation,
        carma_time_limit=carma_time_limit,
        imputed_r2_threshold=imputed_r2_threshold,
        ld_score_threshold=ld_score_threshold,
        sum_pips=sum_pips,
        primary_signal_pval_threshold=primary_signal_pval_threshold,
        secondary_signal_pval_threshold=secondary_signal_pval_threshold,
        purity_mean_r2_threshold=purity_mean_r2_threshold,
        purity_min_r2_threshold=purity_min_r2_threshold,
        cs_lbf_thr=cs_lbf_thr,
    )
+ collect_locus (bool): Whether to collect locus around semi-indices. Optional. + collect_locus_distance (int): Distance, within which tagging variants are collected around the semi-index. Optional. inclusion_list_path (str | None): Path to the inclusion list (list of white-listed study identifier). Optional. - locus_collect_distance (int | None): Distance, within which tagging variants are collected around the semi-index. Optional. """ # If inclusion list path is provided, only these studies will be read: if inclusion_list_path: @@ -35,16 +40,22 @@ def __init__( # If no inclusion list is provided, read all summary stats in folder: study_ids_to_ingest = [summary_statistics_input_path] - ( - SummaryStatistics.from_parquet( - session, - study_ids_to_ingest, - recursiveFileLookup=True, - ) - .coalesce(4000) - # Applying window based clumping: - .window_based_clumping(locus_collect_distance=locus_collect_distance) - # Save resulting study locus dataset: - .df.write.mode(session.write_mode) - .parquet(study_locus_output_path) + ss = SummaryStatistics.from_parquet( + session, + study_ids_to_ingest, + recursiveFileLookup=True, + ) + + # Clumping: + study_locus = ss.window_based_clumping( + distance=distance, ) + + # Optional locus collection: + if collect_locus: + # Collecting locus around semi-indices: + study_locus = study_locus.annotate_locus_statistics( + ss, collect_locus_distance=collect_locus_distance + ) + + study_locus.df.write.mode(session.write_mode).parquet(study_locus_output_path) diff --git a/tests/gentropy/conftest.py b/tests/gentropy/conftest.py index 13409ac16..ec106f975 100644 --- a/tests/gentropy/conftest.py +++ b/tests/gentropy/conftest.py @@ -84,6 +84,11 @@ def mock_colocalisation(spark: SparkSession) -> Colocalisation: .withColumnSpec("h4", percentNulls=0.1) .withColumnSpec("log2h4h3", percentNulls=0.1) .withColumnSpec("clpp", percentNulls=0.1) + .withColumnSpec( + "colocalisationMethod", + percentNulls=0.0, + values=["COLOC", "eCAVIAR"], + ) ) 
return Colocalisation(_df=data_spec.build(), _schema=coloc_schema) @@ -414,7 +419,7 @@ def mock_ld_index(spark: SparkSession) -> LDIndex: def sample_gwas_catalog_studies(spark: SparkSession) -> DataFrame: """Sample GWAS Catalog studies.""" return spark.read.csv( - "tests/gentropy/data_samples/gwas_catalog_studies_sample-r2022-11-29.tsv", + "tests/gentropy/data_samples/gwas_catalog_studies.tsv", sep="\t", header=True, ) @@ -424,7 +429,7 @@ def sample_gwas_catalog_studies(spark: SparkSession) -> DataFrame: def sample_gwas_catalog_ancestries_lut(spark: SparkSession) -> DataFrame: """Sample GWAS ancestries sample data.""" return spark.read.csv( - "tests/gentropy/data_samples/gwas_catalog_ancestries_sample_v1.0.3-r2022-11-29.tsv", + "tests/gentropy/data_samples/gwas_catalog_ancestries.tsv", sep="\t", header=True, ) @@ -444,7 +449,7 @@ def sample_gwas_catalog_harmonised_sumstats_list(spark: SparkSession) -> DataFra def sample_gwas_catalog_associations(spark: SparkSession) -> DataFrame: """Sample GWAS raw associations sample data.""" return spark.read.csv( - "tests/gentropy/data_samples/gwas_catalog_associations_sample_e107_r2022-11-29.tsv", + "tests/gentropy/data_samples/gwas_catalog_associations.tsv", sep="\t", header=True, ) diff --git a/tests/gentropy/data_samples/coloc_test_data.snappy.parquet b/tests/gentropy/data_samples/coloc_test_data.snappy.parquet deleted file mode 100644 index 71b3913eb..000000000 Binary files a/tests/gentropy/data_samples/coloc_test_data.snappy.parquet and /dev/null differ diff --git a/tests/gentropy/data_samples/gwas_catalog_ancestries.tsv b/tests/gentropy/data_samples/gwas_catalog_ancestries.tsv new file mode 100644 index 000000000..48423eeee --- /dev/null +++ b/tests/gentropy/data_samples/gwas_catalog_ancestries.tsv @@ -0,0 +1,20 @@ +STUDY ACCESSION PUBMED ID FIRST AUTHOR DATE INITIAL SAMPLE DESCRIPTION REPLICATION SAMPLE DESCRIPTION STAGE NUMBER OF INDIVIDUALS BROAD ANCESTRAL CATEGORY COUNTRY OF ORIGIN COUNTRY OF RECRUITMENT ADDITIONAL 
ANCESTRY DESCRIPTION ANCESTRY DESCRIPTOR FOUNDER/GENETICALLY ISOLATED POPULATION NUMBER OF CASES NUMBER OF CONTROLS SAMPLE DESCRIPTION +GCST004795 28763065 Xia K 2017-08-01 295 European, African American, Asian, Native American and other admixed ancestry infants, 17 European, African American, Asian, Native American and other admixed ancestry sibling pairs, 116 European, African American, Asian, Native American and other admixed ancestry twin pairs. NA initial 205 Asian unspecified, African American or Afro-Caribbean, Native American, Other admixed ancestry NR U.S. +GCST004795 28763065 Xia K 2017-08-01 295 European, African American, Asian, Native American and other admixed ancestry infants, 17 European, African American, Asian, Native American and other admixed ancestry sibling pairs, 116 European, African American, Asian, Native American and other admixed ancestry twin pairs. NA initial 356 European NR U.S. +GCST004796 28763065 Xia K 2017-08-01 295 European, African American, Asian, Native American and other admixed ancestry infants, 17 European, African American, Asian, Native American and other admixed ancestry sibling pairs, 116 European, African American, Asian, Native American and other admixed ancestry twin pairs. NA initial 356 European NR U.S. +GCST004796 28763065 Xia K 2017-08-01 295 European, African American, Asian, Native American and other admixed ancestry infants, 17 European, African American, Asian, Native American and other admixed ancestry sibling pairs, 116 European, African American, Asian, Native American and other admixed ancestry twin pairs. NA initial 205 Asian unspecified, African American or Afro-Caribbean, Native American, Other admixed ancestry NR U.S. 
+GCST004797 28763065 Xia K 2017-08-01 295 European, African American, Asian, Native American and other admixed ancestry infants, 17 European, African American, Asian, Native American and other admixed ancestry sibling pairs, 116 European, African American, Asian, Native American and other admixed ancestry twin pairs. NA initial 205 Asian unspecified, African American or Afro-Caribbean, Native American, Other admixed ancestry NR U.S. +GCST004797 28763065 Xia K 2017-08-01 295 European, African American, Asian, Native American and other admixed ancestry infants, 17 European, African American, Asian, Native American and other admixed ancestry sibling pairs, 116 European, African American, Asian, Native American and other admixed ancestry twin pairs. NA initial 356 European NR U.S. +GCST004794 28763065 Xia K 2017-08-01 295 European, African American, Asian, Native American and other admixed ancestry infants, 17 European, African American, Asian, Native American and other admixed ancestry sibling pairs, 116 European, African American, Asian, Native American and other admixed ancestry twin pairs. NA initial 356 European NR U.S. +GCST004794 28763065 Xia K 2017-08-01 295 European, African American, Asian, Native American and other admixed ancestry infants, 17 European, African American, Asian, Native American and other admixed ancestry sibling pairs, 116 European, African American, Asian, Native American and other admixed ancestry twin pairs. NA initial 205 Asian unspecified, African American or Afro-Caribbean, Native American, Other admixed ancestry NR U.S. 
+GCST005522 23459209 Faraco J 2013-01-01 1,886 European ancestry cases, 10,421 European ancestry controls NA initial 12307 European NR Canada, U.S., Australia, Austria, France, Germany, Netherlands, Switzerland, Argentina, Israel, Turkey, Czech Republic, Poland, Slovakia, Denmark, Finland, Norway, U.K., Italy, Portugal, Spain +GCST004692 27455348 van Rheenen W 2016-07-25 12,577 European ancestry cases, 23,475 European ancestry controls 2,579 European ancestry cases, 2,767 European ancestry controls initial 36052 European NR U.S., Belgium, France, Germany, Netherlands, Switzerland, Finland, Republic of Ireland, Sweden, U.K., Italy, Portugal, Spain +GCST004692 27455348 van Rheenen W 2016-07-25 12,577 European ancestry cases, 23,475 European ancestry controls 2,579 European ancestry cases, 2,767 European ancestry controls replication 5346 European NR Australia, Belgium, France, Germany, Netherlands, Turkey, Republic of Ireland, Italy +GCST005134 28800628 Salem JE 2017-08-11 448 European ancestry individuals, 47 North African ancestry individuals 431 European ancestry individuals, 64 North African ancestry individuals replication 64 Greater Middle Eastern (Middle Eastern, North African or Persian) NR France +GCST005134 28800628 Salem JE 2017-08-11 448 European ancestry individuals, 47 North African ancestry individuals 431 European ancestry individuals, 64 North African ancestry individuals replication 431 European NR France +GCST005134 28800628 Salem JE 2017-08-11 448 European ancestry individuals, 47 North African ancestry individuals 431 European ancestry individuals, 64 North African ancestry individuals initial 448 European NR France +GCST005134 28800628 Salem JE 2017-08-11 448 European ancestry individuals, 47 North African ancestry individuals 431 European ancestry individuals, 64 North African ancestry individuals initial 47 Greater Middle Eastern (Middle Eastern, North African or Persian) NR France +GCST005135 28800628 Salem JE 2017-08-11 448 European ancestry 
individuals, 47 North African ancestry individuals 431 European ancestry individuals, 64 North African ancestry individuals initial 448 European NR France +GCST005135 28800628 Salem JE 2017-08-11 448 European ancestry individuals, 47 North African ancestry individuals 431 European ancestry individuals, 64 North African ancestry individuals initial 47 Greater Middle Eastern (Middle Eastern, North African or Persian) NR France +GCST005135 28800628 Salem JE 2017-08-11 448 European ancestry individuals, 47 North African ancestry individuals 431 European ancestry individuals, 64 North African ancestry individuals replication 431 European NR France +GCST005135 28800628 Salem JE 2017-08-11 448 European ancestry individuals, 47 North African ancestry individuals 431 European ancestry individuals, 64 North African ancestry individuals replication 64 Greater Middle Eastern (Middle Eastern, North African or Persian) NR France diff --git a/tests/gentropy/data_samples/gwas_catalog_ancestries_sample_v1.0.3-r2022-11-29.tsv b/tests/gentropy/data_samples/gwas_catalog_ancestries_sample_v1.0.3-r2022-11-29.tsv deleted file mode 100644 index 92d87d92b..000000000 --- a/tests/gentropy/data_samples/gwas_catalog_ancestries_sample_v1.0.3-r2022-11-29.tsv +++ /dev/null @@ -1,20 +0,0 @@ -STUDY ACCESSION PUBMED ID FIRST AUTHOR DATE INITIAL SAMPLE DESCRIPTION REPLICATION SAMPLE DESCRIPTION STAGE NUMBER OF INDIVIDUALS BROAD ANCESTRAL CATEGORY COUNTRY OF ORIGIN COUNTRY OF RECRUITMENT ADDITIONAL ANCESTRY DESCRIPTION ANCESTRY DESCRIPTOR FOUNDER/GENETICALLY ISOLATED POPULATION NUMBER OF CASES NUMBER OF CONTROLS SAMPLE DESCRIPTION COHORT(S) COHORT-SPECIFIC REFERENCE -GCST008644 26546613 Gutierrez-Achury J 2016-01-01 371 South Asian ancestry celiac disease cases, 3,138 European ancestry celiac disease cases, 4,418 European ancestry rheumatoid arthritis cases, 509 South Asian ancestry celiac disease controls, 2,473 European ancestry celiac disease controls, 3,300 European ancestry rheumatoid arthritis 
controls, 8,872 celiac disease cases, 9,401 rheumatoid arthritis cases, 4,845 celiac disease controls, 9,627 rheumatoid arthritis controls NA initial 32475 NR NR U.S., Netherlands, U.K. -GCST008644 26546613 Gutierrez-Achury J 2016-01-01 371 South Asian ancestry celiac disease cases, 3,138 European ancestry celiac disease cases, 4,418 European ancestry rheumatoid arthritis cases, 509 South Asian ancestry celiac disease controls, 2,473 European ancestry celiac disease controls, 3,300 European ancestry rheumatoid arthritis controls, 8,872 celiac disease cases, 9,401 rheumatoid arthritis cases, 4,845 celiac disease controls, 9,627 rheumatoid arthritis controls NA initial 13329 European NR Sweden, Poland, Italy, Spain -GCST008644 26546613 Gutierrez-Achury J 2016-01-01 371 South Asian ancestry celiac disease cases, 3,138 European ancestry celiac disease cases, 4,418 European ancestry rheumatoid arthritis cases, 509 South Asian ancestry celiac disease controls, 2,473 European ancestry celiac disease controls, 3,300 European ancestry rheumatoid arthritis controls, 8,872 celiac disease cases, 9,401 rheumatoid arthritis cases, 4,845 celiac disease controls, 9,627 rheumatoid arthritis controls NA initial 880 South Asian NR India -GCST004026 27911795 Schumann G 2016-11-28 up to 70,460 European ancestry drinker individuals up to 35,438 European ancestry drinker individuals replication 35438 European NR Finland, Sweden, Italy, Netherlands, U.K., Austria, France, Republic of Ireland -GCST004026 27911795 Schumann G 2016-11-28 up to 70,460 European ancestry drinker individuals up to 35,438 European ancestry drinker individuals initial 70460 European NR Finland, U.S., Australia, Iceland, Netherlands, Germany, U.K., Switzerland, Estonia, NR -GCST004027 27911795 Schumann G 2016-11-28 up to 74,711 European ancestry heavy and light/non-drinker individuals up to 31,021 European ancestry heavy and light/non-drinker individuals initial 74711 European NR Finland, U.S., Australia, Iceland, 
Netherlands, Germany, U.K., Switzerland, Estonia, NR, France -GCST004027 27911795 Schumann G 2016-11-28 up to 74,711 European ancestry heavy and light/non-drinker individuals up to 31,021 European ancestry heavy and light/non-drinker individuals replication 31021 European NR Finland, U.S., Italy, Netherlands, U.K., Austria, Republic of Ireland -GCST004281 28235828 Traglia M 2017-04-03 329 Hispanic mothers, 273 European ancestry mothers, 122 Asian ancestry mothers, 23 South Asian ancestry mothers, 21 African American mothers, 22 other ancestry mothers NA initial 23 South Asian NR U.S. -GCST004281 28235828 Traglia M 2017-04-03 329 Hispanic mothers, 273 European ancestry mothers, 122 Asian ancestry mothers, 23 South Asian ancestry mothers, 21 African American mothers, 22 other ancestry mothers NA initial 329 Hispanic or Latin American NR U.S. -GCST004281 28235828 Traglia M 2017-04-03 329 Hispanic mothers, 273 European ancestry mothers, 122 Asian ancestry mothers, 23 South Asian ancestry mothers, 21 African American mothers, 22 other ancestry mothers NA initial 21 African American or Afro-Caribbean NR U.S. -GCST004281 28235828 Traglia M 2017-04-03 329 Hispanic mothers, 273 European ancestry mothers, 122 Asian ancestry mothers, 23 South Asian ancestry mothers, 21 African American mothers, 22 other ancestry mothers NA initial 273 European NR U.S. -GCST004281 28235828 Traglia M 2017-04-03 329 Hispanic mothers, 273 European ancestry mothers, 122 Asian ancestry mothers, 23 South Asian ancestry mothers, 21 African American mothers, 22 other ancestry mothers NA initial 22 Other NR U.S. -GCST004281 28235828 Traglia M 2017-04-03 329 Hispanic mothers, 273 European ancestry mothers, 122 Asian ancestry mothers, 23 South Asian ancestry mothers, 21 African American mothers, 22 other ancestry mothers NA initial 122 Asian unspecified NR U.S. 
-GCST004284 28235828 Traglia M 2017-04-03 764 fetuses NA initial 764 European, South Asian, Asian unspecified, African American or Afro-Caribbean, Hispanic or Latin American, Other NR U.S. -GCST004285 28235828 Traglia M 2017-04-03 329 Hispanic mothers, 273 European ancestry mothers, 122 Asian ancestry mothers, 23 South Asian ancestry mothers, 21 African American mothers, 22 other ancestry mothers NA initial 329 Hispanic or Latin American NR U.S. -GCST004285 28235828 Traglia M 2017-04-03 329 Hispanic mothers, 273 European ancestry mothers, 122 Asian ancestry mothers, 23 South Asian ancestry mothers, 21 African American mothers, 22 other ancestry mothers NA initial 273 European NR U.S. -GCST004285 28235828 Traglia M 2017-04-03 329 Hispanic mothers, 273 European ancestry mothers, 122 Asian ancestry mothers, 23 South Asian ancestry mothers, 21 African American mothers, 22 other ancestry mothers NA initial 23 South Asian NR U.S. -GCST004285 28235828 Traglia M 2017-04-03 329 Hispanic mothers, 273 European ancestry mothers, 122 Asian ancestry mothers, 23 South Asian ancestry mothers, 21 African American mothers, 22 other ancestry mothers NA initial 21 African American or Afro-Caribbean NR U.S. -GCST004285 28235828 Traglia M 2017-04-03 329 Hispanic mothers, 273 European ancestry mothers, 122 Asian ancestry mothers, 23 South Asian ancestry mothers, 21 African American mothers, 22 other ancestry mothers NA initial 22 Other NR U.S. 
diff --git a/tests/gentropy/data_samples/gwas_catalog_associations_sample_e107_r2022-11-29.tsv b/tests/gentropy/data_samples/gwas_catalog_associations.tsv similarity index 100% rename from tests/gentropy/data_samples/gwas_catalog_associations_sample_e107_r2022-11-29.tsv rename to tests/gentropy/data_samples/gwas_catalog_associations.tsv diff --git a/tests/gentropy/data_samples/gwas_catalog_studies.tsv b/tests/gentropy/data_samples/gwas_catalog_studies.tsv new file mode 100644 index 000000000..aca9ed666 --- /dev/null +++ b/tests/gentropy/data_samples/gwas_catalog_studies.tsv @@ -0,0 +1,20 @@ +DATE ADDED TO CATALOG PUBMED ID FIRST AUTHOR DATE JOURNAL LINK STUDY DISEASE/TRAIT INITIAL SAMPLE SIZE REPLICATION SAMPLE SIZE PLATFORM [SNPS PASSING QC] ASSOCIATION COUNT MAPPED_TRAIT MAPPED_TRAIT_URI STUDY ACCESSION GENOTYPING TECHNOLOGY SUBMISSION DATE STATISTICAL MODEL BACKGROUND TRAIT MAPPED BACKGROUND TRAIT MAPPED BACKGROUND TRAIT URI COHORT FULL SUMMARY STATISTICS SUMMARY STATS LOCATION +2017-09-11 28604731 Hammerschlag AR 2017-06-12 Nat Genet www.ncbi.nlm.nih.gov/pubmed/28604731 Genome-wide association analysis of insomnia complaints identifies risk genes and genetic overlap with psychiatric and metabolic traits. 
Insomnia complaints 12,863 European ancestry male cases, 19,521 European ancestry female cases, 40,776 European ancestry male controls, 39,846 European ancestry female controls 1,983 Icelandic ancestry male cases, 1,791 Icelandic ancestry female cases, 2,064 Icelandic ancestry male controls, 1,727 Icelandic ancestry female controls Affymetrix [at least 12428592] (imputed) 2 insomnia http://www.ebi.ac.uk/efo/EFO_0004698 GCST004695 Genome-wide genotyping array yes http://ftp.ebi.ac.uk/pub/databases/gwas/summary_statistics/GCST004001-GCST005000/GCST004695 +2017-09-11 28604731 Hammerschlag AR 2017-06-12 Nat Genet www.ncbi.nlm.nih.gov/pubmed/28604731 Genome-wide association analysis of insomnia complaints identifies risk genes and genetic overlap with psychiatric and metabolic traits. Sleep duration 112,411 European ancestry male individuals NA Affymetrix [at least 12428592] (imputed) 3 sleep duration http://www.ebi.ac.uk/efo/EFO_0005271 GCST004694 Genome-wide genotyping array no NA +2017-09-11 28604731 Hammerschlag AR 2017-06-12 Nat Genet www.ncbi.nlm.nih.gov/pubmed/28604731 Genome-wide association analysis of insomnia complaints identifies risk genes and genetic overlap with psychiatric and metabolic traits. Insomnia complaints (sex interaction) 12,863 European ancestry male cases, 19,521 European ancestry female cases, 40,776 European ancestry male controls, 39,846 European ancestry female controls NA Affymetrix [at least 12428592] (imputed) 0 insomnia http://www.ebi.ac.uk/efo/EFO_0004698 GCST004700 Genome-wide genotyping array no NA +2017-09-11 28604731 Hammerschlag AR 2017-06-12 Nat Genet www.ncbi.nlm.nih.gov/pubmed/28604731 Genome-wide association analysis of insomnia complaints identifies risk genes and genetic overlap with psychiatric and metabolic traits. 
Insomnia complaints (continuous) 12,863 European ancestry male cases, 19,521 European ancestry female cases, 40,776 European ancestry male controls, 39,846 European ancestry female controls NA Affymetrix [at least 12428592] (imputed) 1 insomnia http://www.ebi.ac.uk/efo/EFO_0004698 GCST004701 Genome-wide genotyping array no NA +2017-09-11 28604731 Hammerschlag AR 2017-06-12 Nat Genet www.ncbi.nlm.nih.gov/pubmed/28604731 Genome-wide association analysis of insomnia complaints identifies risk genes and genetic overlap with psychiatric and metabolic traits. Insomnia complaints (dichotomous) 32,384 European ancestry cases, 27,128 European ancestry controls NA Affymetrix [at least 12428592] (imputed) 1 insomnia http://www.ebi.ac.uk/efo/EFO_0004698 GCST004702 Genome-wide genotyping array no NA +2017-09-11 28604731 Hammerschlag AR 2017-06-12 Nat Genet www.ncbi.nlm.nih.gov/pubmed/28604731 Genome-wide association analysis of insomnia complaints identifies risk genes and genetic overlap with psychiatric and metabolic traits. Chronotype 101,185 European ancestry individuals NA Affymetrix [at least 12428592] (imputed) 9 circadian rhythm http://www.ebi.ac.uk/efo/EFO_0004354 GCST004696 Genome-wide genotyping array no NA +2017-09-11 28604731 Hammerschlag AR 2017-06-12 Nat Genet www.ncbi.nlm.nih.gov/pubmed/28604731 Genome-wide association analysis of insomnia complaints identifies risk genes and genetic overlap with psychiatric and metabolic traits. Ease of getting up in the morning 112,866 European ancestry individuals NA Affymetrix [at least 12428592] (imputed) 5 chronotype measurement http://www.ebi.ac.uk/efo/EFO_0008328 GCST004697 Genome-wide genotyping array no NA +2017-09-11 28604731 Hammerschlag AR 2017-06-12 Nat Genet www.ncbi.nlm.nih.gov/pubmed/28604731 Genome-wide association analysis of insomnia complaints identifies risk genes and genetic overlap with psychiatric and metabolic traits. 
Daytime nap 113,054 European ancestry cases and controls NA Affymetrix [at least 12428592] (imputed) 4 daytime rest measurement http://www.ebi.ac.uk/efo/EFO_0007828 GCST004693 Genome-wide genotyping array no NA +2017-09-11 28604731 Hammerschlag AR 2017-06-12 Nat Genet www.ncbi.nlm.nih.gov/pubmed/28604731 Genome-wide association analysis of insomnia complaints identifies risk genes and genetic overlap with psychiatric and metabolic traits. Snoring 105,377 European ancestry cases and controls NA Affymetrix [at least 12428592] (imputed) 1 snoring measurement http://www.ebi.ac.uk/efo/EFO_0008341 GCST004698 Genome-wide genotyping array no NA +2017-09-11 28604731 Hammerschlag AR 2017-06-12 Nat Genet www.ncbi.nlm.nih.gov/pubmed/28604731 Genome-wide association analysis of insomnia complaints identifies risk genes and genetic overlap with psychiatric and metabolic traits. Daytime sleepiness 112,717 European ancestry cases and controls NA Affymetrix [at least 12428592] (imputed) 6 excessive daytime sleepiness measurement http://www.ebi.ac.uk/efo/EFO_0007875 GCST004699 Genome-wide genotyping array no NA +2018-11-02 28604731 Hammerschlag AR 2017-06-12 Nat Genet www.ncbi.nlm.nih.gov/pubmed/28604731 Genome-wide association analysis of insomnia complaints identifies risk genes and genetic overlap with psychiatric and metabolic traits. 
Insomnia complaints 19,521 European ancestry female cases, 39,846 European ancestry female controls 1,983 Icelandic ancestry male cases, 1,791 Icelandic ancestry female cases, 2,064 Icelandic ancestry male controls, 1,727 Icelandic ancestry female controls Affymetrix [at least 12428592] (imputed) 1 insomnia http://www.ebi.ac.uk/efo/EFO_0004698 GCST006487 Genome-wide genotyping array yes http://ftp.ebi.ac.uk/pub/databases/gwas/summary_statistics/GCST006001-GCST007000/GCST006487 +2018-11-02 28604731 Hammerschlag AR 2017-06-12 Nat Genet www.ncbi.nlm.nih.gov/pubmed/28604731 Genome-wide association analysis of insomnia complaints identifies risk genes and genetic overlap with psychiatric and metabolic traits. Insomnia complaints 12,863 European ancestry male cases, 40,776 European ancestry male controls 1,983 Icelandic ancestry male cases, 1,791 Icelandic ancestry female cases, 2,064 Icelandic ancestry male controls, 1,727 Icelandic ancestry female controls Affymetrix [at least 12428592] (imputed) 2 insomnia http://www.ebi.ac.uk/efo/EFO_0004698 GCST006488 Genome-wide genotyping array yes http://ftp.ebi.ac.uk/pub/databases/gwas/summary_statistics/GCST006001-GCST007000/GCST006488 +2017-09-14 28610988 Kerr KF 2017-06-10 Heart Rhythm www.ncbi.nlm.nih.gov/pubmed/28610988 Genome-wide association study of heart rate and its variability in Hispanic/Latino cohorts. Heart rate 13,184 Hispanic/Latino individuals 7,073 European ancestry individuals, 4,771 African American individuals Affymetrix, Illumina [16967914] (imputed) 2 heart rate http://www.ebi.ac.uk/efo/EFO_0004326 GCST004715 Genome-wide genotyping array no NA +2017-09-14 28610988 Kerr KF 2017-06-10 Heart Rhythm www.ncbi.nlm.nih.gov/pubmed/28610988 Genome-wide association study of heart rate and its variability in Hispanic/Latino cohorts. 
Heart rate variability traits (RMSSD) 13,767 Hispanic/Latino individuals 4,730 European ancestry individuals, 2,908 African American individuals Affymetrix, Illumina [17209892] (imputed) 2 heart rate variability measurement http://www.ebi.ac.uk/efo/EFO_0008003 GCST004716 Genome-wide genotyping array no NA +2017-09-14 28610988 Kerr KF 2017-06-10 Heart Rhythm www.ncbi.nlm.nih.gov/pubmed/28610988 Genome-wide association study of heart rate and its variability in Hispanic/Latino cohorts. Heart rate variability traits (SDNN) 13,184 Hispanic/Latino individuals 7,073 European ancestry individuals, 2,908 African American individuals Affymetrix, Illumina [17209740] (imputed) 3 heart rate variability measurement http://www.ebi.ac.uk/efo/EFO_0008003 GCST004714 Genome-wide genotyping array no NA +2017-09-18 28604730 McKay JD 2017-06-12 Nat Genet www.ncbi.nlm.nih.gov/pubmed/28604730 Large-scale association analysis identifies new lung cancer susceptibility loci and heterogeneity in genetic susceptibility across histological subtypes. Lung cancer 29,266 European ancestry cases, 56,450 European ancestry controls NA Illumina [10439017] (imputed) 135 lung carcinoma http://www.ebi.ac.uk/efo/EFO_0001071 GCST004748 Genome-wide genotyping array yes http://ftp.ebi.ac.uk/pub/databases/gwas/summary_statistics/GCST004001-GCST005000/GCST004748 +2017-09-18 28604730 McKay JD 2017-06-12 Nat Genet www.ncbi.nlm.nih.gov/pubmed/28604730 Large-scale association analysis identifies new lung cancer susceptibility loci and heterogeneity in genetic susceptibility across histological subtypes. 
Lung adenocarcinoma 11,273 European ancestry cases, 55,483 European ancestry controls NA Illumina [10439017] (imputed) 79 lung adenocarcinoma http://www.ebi.ac.uk/efo/EFO_0000571 GCST004744 Genome-wide genotyping array yes http://ftp.ebi.ac.uk/pub/databases/gwas/summary_statistics/GCST004001-GCST005000/GCST004744 +2017-09-18 28604730 McKay JD 2017-06-12 Nat Genet www.ncbi.nlm.nih.gov/pubmed/28604730 Large-scale association analysis identifies new lung cancer susceptibility loci and heterogeneity in genetic susceptibility across histological subtypes. Squamous cell lung carcinoma 7,426 European ancestry cases, 55,627 European ancestry controls NA Illumina [10439017] (imputed) 101 squamous cell lung carcinoma http://www.ebi.ac.uk/efo/EFO_0000708 GCST004750 Genome-wide genotyping array yes http://ftp.ebi.ac.uk/pub/databases/gwas/summary_statistics/GCST004001-GCST005000/GCST004750 +2017-09-18 28604730 McKay JD 2017-06-12 Nat Genet www.ncbi.nlm.nih.gov/pubmed/28604730 Large-scale association analysis identifies new lung cancer susceptibility loci and heterogeneity in genetic susceptibility across histological subtypes. 
Small cell lung carcinoma 2,664 European ancestry cases, 21,444 European ancestry controls NA Illumina [10439017] (imputed) 50 small cell lung carcinoma http://www.ebi.ac.uk/efo/EFO_0000702 GCST004746 Genome-wide genotyping array yes http://ftp.ebi.ac.uk/pub/databases/gwas/summary_statistics/GCST004001-GCST005000/GCST004746 diff --git a/tests/gentropy/data_samples/gwas_catalog_studies_sample-r2022-11-29.tsv b/tests/gentropy/data_samples/gwas_catalog_studies_sample-r2022-11-29.tsv deleted file mode 100644 index 7db8f5302..000000000 --- a/tests/gentropy/data_samples/gwas_catalog_studies_sample-r2022-11-29.tsv +++ /dev/null @@ -1,20 +0,0 @@ -DATE ADDED TO CATALOG PUBMED ID FIRST AUTHOR DATE JOURNAL LINK STUDY DISEASE/TRAIT INITIAL SAMPLE SIZE REPLICATION SAMPLE SIZE PLATFORM [SNPS PASSING QC] ASSOCIATION COUNT MAPPED_TRAIT MAPPED_TRAIT_URI STUDY ACCESSION GENOTYPING TECHNOLOGY SUMMARY STATS LOCATION SUBMISSION DATE STATISTICAL MODEL BACKGROUND TRAIT MAPPED BACKGROUND TRAIT MAPPED BACKGROUND TRAIT URI -2019-09-11 26546613 Gutierrez-Achury J 2016-01-01 Hum Mol Genet www.ncbi.nlm.nih.gov/pubmed/26546613 Functional implications of disease-specific variants in loci jointly associated with coeliac disease and rheumatoid arthritis. 
Celiac disease and Rheumatoid arthritis 371 South Asian ancestry celiac disease cases, 3,138 European ancestry celiac disease cases, 4,418 European ancestry rheumatoid arthritis cases, 509 South Asian ancestry celiac disease controls, 2,473 European ancestry celiac disease controls, 3,300 European ancestry rheumatoid arthritis controls, 8,872 celiac disease cases, 9,401 rheumatoid arthritis cases, 4,845 celiac disease controls, 9,627 rheumatoid arthritis controls NA Illumina [109572] 24 rheumatoid arthritis, celiac disease http://www.ebi.ac.uk/efo/EFO_0000685, http://www.ebi.ac.uk/efo/EFO_0001060 GCST008644 Targeted genotyping array [Immunochip] -2017-05-12 27911795 Schumann G 2016-11-28 Proc Natl Acad Sci U S A www.ncbi.nlm.nih.gov/pubmed/27911795 KLB is associated with alcohol drinking, and its gene product β-Klotho is necessary for FGF21 regulation of alcohol preference. Alcohol consumption up to 70,460 European ancestry drinker individuals up to 35,438 European ancestry drinker individuals Affymetrix, Illumina, Perlegen [at least 316407] (imputed) 3 alcohol consumption measurement http://www.ebi.ac.uk/efo/EFO_0007878 GCST004026 Genome-wide genotyping array -2017-05-12 27911795 Schumann G 2016-11-28 Proc Natl Acad Sci U S A www.ncbi.nlm.nih.gov/pubmed/27911795 KLB is associated with alcohol drinking, and its gene product β-Klotho is necessary for FGF21 regulation of alcohol preference. Alcohol consumption (heavy vs. light/non-drinkers) up to 74,711 European ancestry heavy and light/non-drinker individuals up to 31,021 European ancestry heavy and light/non-drinker individuals Affymetrix, Illumina, Perlegen [at least 316407] (imputed) 2 alcohol consumption measurement http://www.ebi.ac.uk/efo/EFO_0007878 GCST004027 Genome-wide genotyping array -2017-07-03 28235828 Traglia M 2017-04-03 G3 (Bethesda) www.ncbi.nlm.nih.gov/pubmed/28235828 Independent Maternal and Fetal Genetic Effects on Midgestational Circulating Levels of Environmental Pollutants. 
Midgestational circulating levels of organochlorine pesticides 329 Hispanic mothers, 273 European ancestry mothers, 122 Asian ancestry mothers, 23 South Asian ancestry mothers, 21 African American mothers, 22 other ancestry mothers NA Affymetrix [629686] 2 gestational serum measurement, organochlorine pesticide measurement http://www.ebi.ac.uk/efo/EFO_0007964, http://www.ebi.ac.uk/efo/EFO_0007960 GCST004281 Genome-wide genotyping array -2017-07-03 28235828 Traglia M 2017-04-03 G3 (Bethesda) www.ncbi.nlm.nih.gov/pubmed/28235828 Independent Maternal and Fetal Genetic Effects on Midgestational Circulating Levels of Environmental Pollutants. Midgestational circulating levels of PCBs (fetal genetic effect) 764 fetuses NA Affymetrix [622716] 11 polychlorinated biphenyls measurement, gestational serum measurement, fetal genotype effect measurement http://www.ebi.ac.uk/efo/EFO_0007042, http://www.ebi.ac.uk/efo/EFO_0007964, http://www.ebi.ac.uk/efo/EFO_0007959 GCST004284 Genome-wide genotyping array -2017-07-03 28235828 Traglia M 2017-04-03 G3 (Bethesda) www.ncbi.nlm.nih.gov/pubmed/28235828 Independent Maternal and Fetal Genetic Effects on Midgestational Circulating Levels of Environmental Pollutants. Midgestational circulating levels of PBDEs 329 Hispanic mothers, 273 European ancestry mothers, 122 Asian ancestry mothers, 23 South Asian ancestry mothers, 21 African American mothers, 22 other ancestry mothers NA Affymetrix [629686] 9 polybrominated biphenyl measurement, gestational serum measurement, polybrominated diphenyl ether measurement http://www.ebi.ac.uk/efo/EFO_0007961, http://www.ebi.ac.uk/efo/EFO_0007964, http://www.ebi.ac.uk/efo/EFO_0007962 GCST004285 Genome-wide genotyping array -2017-07-03 28235828 Traglia M 2017-04-03 G3 (Bethesda) www.ncbi.nlm.nih.gov/pubmed/28235828 Independent Maternal and Fetal Genetic Effects on Midgestational Circulating Levels of Environmental Pollutants. 
Midgestational circulating levels of PBDEs (fetal genetic effect) 764 fetuses NA Affymetrix [629686] 11 polybrominated biphenyl measurement, gestational serum measurement, fetal genotype effect measurement, polybrominated diphenyl ether measurement http://www.ebi.ac.uk/efo/EFO_0007961, http://www.ebi.ac.uk/efo/EFO_0007964, http://www.ebi.ac.uk/efo/EFO_0007959, http://www.ebi.ac.uk/efo/EFO_0007962 GCST004286 Genome-wide genotyping array -2017-07-03 28235828 Traglia M 2017-04-03 G3 (Bethesda) www.ncbi.nlm.nih.gov/pubmed/28235828 Independent Maternal and Fetal Genetic Effects on Midgestational Circulating Levels of Environmental Pollutants. Midgestational circulating levels of organochlorine pesticides (fetal genetic effect) 764 fetuses NA Affymetrix [629686] 0 gestational serum measurement, fetal genotype effect measurement, organochlorine pesticide measurement http://www.ebi.ac.uk/efo/EFO_0007964, http://www.ebi.ac.uk/efo/EFO_0007959, http://www.ebi.ac.uk/efo/EFO_0007960 GCST004282 Genome-wide genotyping array -2017-07-03 28235828 Traglia M 2017-04-03 G3 (Bethesda) www.ncbi.nlm.nih.gov/pubmed/28235828 Independent Maternal and Fetal Genetic Effects on Midgestational Circulating Levels of Environmental Pollutants. Midgestational circulating levels of PCBs 329 Hispanic mothers, 273 European ancestry mothers, 122 Asian ancestry mothers, 23 South Asian ancestry mothers, 21 African American mothers, 22 other ancestry mothers NA Affymetrix [629686] 25 polychlorinated biphenyls measurement, gestational serum measurement http://www.ebi.ac.uk/efo/EFO_0007042, http://www.ebi.ac.uk/efo/EFO_0007964 GCST004283 Genome-wide genotyping array -2016-09-12 26325155 Brehm JM 2015-09-01 Am J Respir Crit Care Med www.ncbi.nlm.nih.gov/pubmed/26325155 A Genome-Wide Association Study of Post-bronchodilator Lung Function in Children with Asthma. 
Post-bronchodilator lung function in asthma (FEV1) 447 Puerto Rican ancestry cases 568 European, black or Hispanic cases, 2,414 Hispanic cases Illumina [NR] 0 pulmonary function measurement, forced expiratory volume, response to bronchodilator http://www.ebi.ac.uk/efo/EFO_0003892, http://www.ebi.ac.uk/efo/EFO_0004314, http://purl.obolibrary.org/obo/GO_0097366 GCST003110 Genome-wide genotyping array asthma http://purl.obolibrary.org/obo/MONDO_0004979 -2016-09-12 26325155 Brehm JM 2015-09-01 Am J Respir Crit Care Med www.ncbi.nlm.nih.gov/pubmed/26325155 A Genome-Wide Association Study of Post-bronchodilator Lung Function in Children with Asthma. Post-bronchodilator lung function in asthma (FEV1/FVC) 447 Puerto Rican ancestry cases 568 European, black or Hispanic cases, 2,414 Hispanic cases Illumina [NR] 0 pulmonary function measurement, response to bronchodilator, FEV/FEC ratio http://www.ebi.ac.uk/efo/EFO_0003892, http://purl.obolibrary.org/obo/GO_0097366, http://www.ebi.ac.uk/efo/EFO_0004713 GCST003107 Genome-wide genotyping array asthma http://purl.obolibrary.org/obo/MONDO_0004979 -2016-07-27 26237429 Aminkeng F 2015-08-03 Nat Genet www.ncbi.nlm.nih.gov/pubmed/26237429 A coding variant in RARG confers susceptibility to anthracycline-induced cardiotoxicity in childhood cancer. 
Anthracycline-induced cardiotoxicity in childhood cancer 32 European ancestry cases with cardiotoxicity, 248 European ancestry cases without cardiotoxicity 22 European ancestry cases with cardiotoxicity, 74 European ancestry cases without cardiotoxicity, 2 African cases with cardiotoxicity, 9 African cases without cardiotoxicity, 5 Hispanic cases with cardiotoxicity, 18 Hispanic cases without cardiotoxicity, 8 East Asian cases with cardiotoxicity, 23 East Asian cases without cardiotoxicity, 4 Aboriginal Canadian cases with cardiotoxicity, 11 Aboriginal Canadian cases without cardiotoxicity Illumina [657694] 2 cardiotoxicity, response to anthracycline-based chemotherapy http://www.ebi.ac.uk/efo/EFO_1001482, http://www.ebi.ac.uk/efo/EFO_0005257 GCST003062 Genome-wide genotyping array childhood cancer http://www.ebi.ac.uk/efo/EFO_1000654 -2017-02-25 27387956 Murk W 2016-07-07 BMC Genet www.ncbi.nlm.nih.gov/pubmed/27387956 Genome-wide search identifies a gene-gene interaction between 20p13 and 2q14 in asthma. Asthma (SNP x SNP interaction) 802 European ancestry cases, 823 European ancestry controls 754 European ancestry cases, 57 European and unknown ancestry cases, 2,573 cases, 880 European ancestry controls, 68 European and unknown ancestry controls, 2,145 controls Illumina [301547] 0 asthma http://purl.obolibrary.org/obo/MONDO_0004979 GCST003682 Genome-wide genotyping array -2016-12-09 27008869 Chen MM 2016-03-23 Hum Mol Genet www.ncbi.nlm.nih.gov/pubmed/27008869 GWAS meta-analysis of 16 852 women identifies new susceptibility locus for endometrial cancer. Endometrial cancer 4,907 European ancestry cases, 11,945 European ancestry controls NA Illumina [9486271] (imputed) 4 endometrial carcinoma http://www.ebi.ac.uk/efo/EFO_1001512 GCST003436 Genome-wide genotyping array -2016-12-01 27016271 Kornilov SA 2016-03-25 Pediatrics www.ncbi.nlm.nih.gov/pubmed/27016271 Genome-Wide Association and Exome Sequencing Study of Language Disorder in an Isolated Population. 
Developmental language disorder 149 isolated population cases, 210 isolated population controls NA Illumina [223580] 4 specific language impairment http://www.ebi.ac.uk/efo/EFO_1001510 GCST003396 Genome-wide genotyping array -2016-12-01 27016271 Kornilov SA 2016-03-25 Pediatrics www.ncbi.nlm.nih.gov/pubmed/27016271 Genome-Wide Association and Exome Sequencing Study of Language Disorder in an Isolated Population. Developmental language disorder (linguistic errors) 149 isolated population cases, 210 isolated population controls NA Illumina [223580] 9 linguistic error measurement, specific language impairment http://www.ebi.ac.uk/efo/EFO_0007798, http://www.ebi.ac.uk/efo/EFO_1001510 GCST003397 Genome-wide genotyping array -2016-12-01 27016271 Kornilov SA 2016-03-25 Pediatrics www.ncbi.nlm.nih.gov/pubmed/27016271 Genome-Wide Association and Exome Sequencing Study of Language Disorder in an Isolated Population. Developmental language disorder (syntactic complexity) 149 isolated population cases, 210 isolated population controls NA Illumina [223580] 3 syntactic complexity measurement, specific language impairment http://www.ebi.ac.uk/efo/EFO_0007799, http://www.ebi.ac.uk/efo/EFO_1001510 GCST003398 Genome-wide genotyping array -2016-12-13 27005419 Borne Y 2016-03-22 Hum Mol Genet www.ncbi.nlm.nih.gov/pubmed/27005419 Genome wide association study identifies two loci associated with cadmium in erythrocytes among never-smokers. Erythrocyte cadmium concentration 2,704 ever-smoker individuals, 1,728 never-smoker individuals NA Illumina [658884] 5 erythrocyte cadmium measurement http://www.ebi.ac.uk/efo/EFO_0007807 GCST003449 Genome-wide genotyping array -2016-12-13 27005419 Borne Y 2016-03-22 Hum Mol Genet www.ncbi.nlm.nih.gov/pubmed/27005419 Genome wide association study identifies two loci associated with cadmium in erythrocytes among never-smokers. 
Erythrocyte cadmium concentration in never smokers 1,728 individuals NA Illumina [658884] 7 erythrocyte cadmium measurement http://www.ebi.ac.uk/efo/EFO_0007807 GCST003448 Genome-wide genotyping array diff --git a/tests/gentropy/dataset/test_pairwise_ld.py b/tests/gentropy/dataset/test_pairwise_ld.py new file mode 100644 index 000000000..11ebf75ca --- /dev/null +++ b/tests/gentropy/dataset/test_pairwise_ld.py @@ -0,0 +1,102 @@ +"""Testing pairwise LD dataset.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import numpy as np +import pytest +from gentropy.dataset.pairwise_ld import PairwiseLD +from pyspark.sql import functions as f +from pyspark.sql.window import Window + +if TYPE_CHECKING: + from pyspark.sql import SparkSession + + +class TestPairwiseLD: + """Test suit for pairwise LD dataset and associated methods.""" + + variants = [ + "1_8_A_C", + "1_9_A_C", + "1_10_A_C", + "1_99_A_C", + ] + + @pytest.fixture(scope="class") + def mock_pairwise_ld(self: TestPairwiseLD, spark: SparkSession) -> PairwiseLD: + """Generate a mock pairwise LD dataset. 
+ + Args: + spark (SparkSession): _description_ + + Returns: + PairwiseLD: _description_ + """ + spark = spark.builder.getOrCreate() + + data = [(v1, v2) for v1 in self.variants for v2 in self.variants] + return PairwiseLD( + _df=( + spark.createDataFrame(data, ["variantIdI", "variantIdJ"]) + .withColumn( + "r", + f.row_number() + .over(Window.partitionBy(f.lit("x")).orderBy("variantIdI")) + .cast("double"), + ) + .withColumn( + "r", + f.when(f.col("variantIdI") == f.col("variantIdJ"), 1.0).otherwise( + f.col("r") + ), + ) + .persist() + ), + _schema=PairwiseLD.get_schema(), + ) + + @staticmethod + def test_pairwise_ld__type(mock_pairwise_ld: PairwiseLD) -> None: + """Testing type.""" + assert isinstance(mock_pairwise_ld, PairwiseLD) + + def test_pariwise_ld__get_variants( + self: TestPairwiseLD, mock_pairwise_ld: PairwiseLD + ) -> None: + """Testing function that returns list of variants from the LD table. + + Args: + mock_pairwise_ld (PairwiseLD): _description_ + """ + variant_set_expected = set(self.variants) + variant_set_from_data = set(mock_pairwise_ld.get_variant_list()) + + assert variant_set_from_data == variant_set_expected + + def test_pairwise_ld__r_to_numpy_matrix__type( + self: TestPairwiseLD, mock_pairwise_ld: PairwiseLD + ) -> None: + """Testing the returned numpy array.""" + assert isinstance(mock_pairwise_ld.r_to_numpy_matrix(), np.ndarray) + + def test_pairwise_ld__r_to_numpy_matrix__dimensions( + self: TestPairwiseLD, mock_pairwise_ld: PairwiseLD + ) -> None: + """Testing the returned numpy array.""" + assert mock_pairwise_ld.r_to_numpy_matrix().shape == ( + len(self.variants), + len(self.variants), + ) + + def test_pairwise_ld__overlap_with_locus( + self: TestPairwiseLD, mock_pairwise_ld: PairwiseLD + ) -> None: + """Testing the returned numpy array.""" + variant_subset = self.variants[1:3] + + assert ( + mock_pairwise_ld.overlap_with_locus(variant_subset).df.count() + == len(variant_subset) ** 2 + ) diff --git 
a/tests/gentropy/dataset/test_study_locus.py b/tests/gentropy/dataset/test_study_locus.py index c12597d54..1401b9dd3 100644 --- a/tests/gentropy/dataset/test_study_locus.py +++ b/tests/gentropy/dataset/test_study_locus.py @@ -10,6 +10,7 @@ from gentropy.dataset.study_index import StudyIndex from gentropy.dataset.study_locus import CredibleInterval, StudyLocus from gentropy.dataset.study_locus_overlap import StudyLocusOverlap +from gentropy.dataset.summary_statistics import SummaryStatistics from pyspark.sql import Column, SparkSession from pyspark.sql.types import ( ArrayType, @@ -214,6 +215,16 @@ def test_filter_by_study_type( assert observed.df.count() == expected_sl_count +def test_annotate_locus_statistics( + mock_study_locus: StudyLocus, mock_summary_statistics: SummaryStatistics +) -> None: + """Test annotate locus statistics returns a StudyLocus.""" + assert isinstance( + mock_study_locus.annotate_locus_statistics(mock_summary_statistics, 100), + StudyLocus, + ) + + def test_filter_credible_set(mock_study_locus: StudyLocus) -> None: """Test credible interval filter.""" assert isinstance( diff --git a/tests/gentropy/dataset/test_study_locus_overlaps.py b/tests/gentropy/dataset/test_study_locus_overlaps.py index ee89eec84..8e732fc5c 100644 --- a/tests/gentropy/dataset/test_study_locus_overlaps.py +++ b/tests/gentropy/dataset/test_study_locus_overlaps.py @@ -30,46 +30,89 @@ def test_study_locus_overlap_from_associations( @pytest.mark.parametrize( - ("observed", "expected"), + ("observed", "intrastudy", "expected"), [ ( # observed - input DataFrame representing gwas and nongwas data to find overlapping signals [ { "studyLocusId": 1, + "studyId": "A", "studyType": "gwas", "chromosome": "1", "tagVariantId": "A", }, { "studyLocusId": 2, + "studyId": "B", "studyType": "eqtl", "chromosome": "1", "tagVariantId": "A", }, { "studyLocusId": 3, + "studyId": "C", "studyType": "gwas", "chromosome": "1", "tagVariantId": "B", }, ], + # intrastudy - bool of whether or not to 
use inter-study or intra-study logic + False, # expected - output DataFrame with overlapping signals [ {"leftStudyLocusId": 1, "rightStudyLocusId": 2, "chromosome": "1"}, ], ), + ( + # observed - input DataFrame representing intra-study data to find overlapping signals in the same study + [ + { + "studyLocusId": 1, + "studyId": "A", + "studyType": "gwas", + "chromosome": "1", + "region": "X", + "tagVariantId": "A", + }, + { + "studyLocusId": 2, + "studyId": "A", + "studyType": "gwas", + "chromosome": "1", + "region": "Y", + "tagVariantId": "A", + }, + { + "studyLocusId": 3, + "studyId": "B", + "studyType": "gwas", + "chromosome": "1", + "region": "X", + "tagVariantId": "A", + }, + ], + # intrastudy - bool of whether or not to use inter-study or intra-study logic + True, + # expected - output DataFrame with overlapping signals + [{"leftStudyLocusId": 2, "rightStudyLocusId": 1, "chromosome": "1"}], + ), ], ) def test_overlapping_peaks( - spark: SparkSession, observed: list[dict[str, Any]], expected: list[dict[str, Any]] + spark: SparkSession, + observed: list[dict[str, Any]], + intrastudy: bool, + expected: list[dict[str, Any]], ) -> None: """Test overlapping signals between GWAS-GWAS and GWAS-Molecular trait to make sure that mQTLs are always on the right.""" mock_schema = t.StructType( [ t.StructField("studyLocusId", t.LongType()), + t.StructField("studyId", t.StringType()), t.StructField("studyType", t.StringType()), t.StructField("chromosome", t.StringType()), + t.StructField("region", t.StringType()), t.StructField("tagVariantId", t.StringType()), ] ) @@ -81,6 +124,6 @@ def test_overlapping_peaks( ] ) observed_df = spark.createDataFrame(observed, mock_schema) - result_df = StudyLocus._overlapping_peaks(observed_df) + result_df = StudyLocus._overlapping_peaks(observed_df, intrastudy) expected_df = spark.createDataFrame(expected, expected_schema) assert result_df.collect() == expected_df.collect() diff --git a/tests/gentropy/method/test_colocalisation_method.py 
b/tests/gentropy/method/test_colocalisation_method.py index f90d54b3f..e58b0e562 100644 --- a/tests/gentropy/method/test_colocalisation_method.py +++ b/tests/gentropy/method/test_colocalisation_method.py @@ -2,11 +2,15 @@ from __future__ import annotations +from typing import Any + +import pytest from gentropy.dataset.colocalisation import Colocalisation from gentropy.dataset.study_locus_overlap import StudyLocusOverlap from gentropy.method.colocalisation import Coloc, ECaviar +from pandas.testing import assert_frame_equal from pyspark.sql import SparkSession -from pyspark.sql import functions as f +from pyspark.sql.types import DoubleType, LongType, StringType, StructField, StructType def test_coloc(mock_study_locus_overlap: StudyLocusOverlap) -> None: @@ -14,165 +18,142 @@ def test_coloc(mock_study_locus_overlap: StudyLocusOverlap) -> None: assert isinstance(Coloc.colocalise(mock_study_locus_overlap), Colocalisation) -def test_coloc_colocalise( - spark: SparkSession, - threshold: float = 1e-5, -) -> None: - """Compare COLOC results with R implementation, using provided sample dataset from R package (StudyLocusOverlap).""" - test_overlap_df = spark.read.parquet( - "tests/gentropy/data_samples/coloc_test_data.snappy.parquet", header=True - ) - test_overlap = StudyLocusOverlap(test_overlap_df, StudyLocusOverlap.get_schema()) - test_result = Coloc.colocalise(test_overlap) - - expected = spark.createDataFrame( - [ - { - "h0": 1.3769995397857477e-18, - "h1": 2.937336451601565e-10, - "h2": 8.593226431647826e-12, - "h3": 8.338916748775843e-4, - "h4": 0.9991661080227981, - } - ] - ) - difference = test_result.df.select("h0", "h1", "h2", "h3", "h4").subtract(expected) - for col in difference.columns: - assert difference.filter(f.abs(f.col(col)) > threshold).count() == 0 - - -def test_single_snp_coloc( +@pytest.mark.parametrize( + "observed_data, expected_data", + [ + # associations with a single overlapping SNP + ( + # observed overlap + [ + { + "leftStudyLocusId": 1, + 
"rightStudyLocusId": 2, + "chromosome": "1", + "tagVariantId": "snp", + "statistics": {"left_logBF": 10.3, "right_logBF": 10.5}, + }, + ], + # expected coloc + [ + { + "h0": 9.254841951638903e-5, + "h1": 2.7517068829182966e-4, + "h2": 3.3609423764447284e-4, + "h3": 9.254841952564387e-13, + "h4": 0.9992961866536217, + }, + ], + ), + # associations with multiple overlapping SNPs + ( + # observed overlap + [ + { + "leftStudyLocusId": 1, + "rightStudyLocusId": 2, + "chromosome": "1", + "tagVariantId": "snp1", + "statistics": {"left_logBF": 10.3, "right_logBF": 10.5}, + }, + { + "leftStudyLocusId": 1, + "rightStudyLocusId": 2, + "chromosome": "1", + "tagVariantId": "snp2", + "statistics": {"left_logBF": 10.3, "right_logBF": 10.5}, + }, + ], + # expected coloc + [ + { + "h0": 4.6230151407950416e-5, + "h1": 2.749086942648107e-4, + "h2": 3.357742374172504e-4, + "h3": 9.983447421747411e-4, + "h4": 0.9983447421747356, + }, + ], + ), + ], +) +def test_coloc_semantic( spark: SparkSession, - threshold: float = 1e-5, + observed_data: list[Any], + expected_data: list[Any], ) -> None: - """Test edge case of coloc where only one causal SNP is present in the StudyLocusOverlap.""" - test_overlap_df = spark.createDataFrame( - [ - { - "leftStudyLocusId": 1, - "rightStudyLocusId": 2, - "chromosome": "1", - "tagVariantId": "snp", - "left_logBF": 10.3, - "right_logBF": 10.5, - } - ] + """Test our COLOC with the implementation in R.""" + observed_overlap = StudyLocusOverlap( + _df=spark.createDataFrame(observed_data, schema=StudyLocusOverlap.get_schema()), + _schema=StudyLocusOverlap.get_schema(), ) - test_overlap = StudyLocusOverlap( - test_overlap_df.select( - "leftStudyLocusId", - "rightStudyLocusId", - "chromosome", - "tagVariantId", - f.struct(f.col("left_logBF"), f.col("right_logBF")).alias("statistics"), - ), - StudyLocusOverlap.get_schema(), + observed_coloc_pdf = ( + Coloc.colocalise(observed_overlap) + .df.select("h0", "h1", "h2", "h3", "h4") + .toPandas() ) - test_result = 
Coloc.colocalise(test_overlap) - - expected = spark.createDataFrame( - [ - { - "h0": 9.254841951638903e-5, - "h1": 2.7517068829182966e-4, - "h2": 3.3609423764447284e-4, - "h3": 9.254841952564387e-13, - "h4": 0.9992961866536217, - } - ] + expected_coloc_pdf = ( + spark.createDataFrame(expected_data) + .select("h0", "h1", "h2", "h3", "h4") + .toPandas() ) - difference = test_result.df.select("h0", "h1", "h2", "h3", "h4").subtract(expected) - for col in difference.columns: - assert difference.filter(f.abs(f.col(col)) > threshold).count() == 0 - -def test_single_snp_coloc_one_negative( - spark: SparkSession, - threshold: float = 1e-5, -) -> None: - """Test edge case of coloc where only one causal SNP is present (On one side!) in the StudyLocusOverlap.""" - test_overlap_df = spark.createDataFrame( - [ - { - "leftStudyLocusId": 1, - "rightStudyLocusId": 2, - "chromosome": "1", - "tagVariantId": "snp", - "left_logBF": 18.3, - "right_logBF": 0.01, - } - ] + assert_frame_equal( + observed_coloc_pdf, + expected_coloc_pdf, + check_exact=False, + check_dtype=True, ) - test_overlap = StudyLocusOverlap( - test_overlap_df.select( - "leftStudyLocusId", - "rightStudyLocusId", - "chromosome", - "tagVariantId", - f.struct(f.col("left_logBF"), f.col("right_logBF")).alias("statistics"), - ), - StudyLocusOverlap.get_schema(), - ) - test_result = Coloc.colocalise(test_overlap) - test_result.df.show(1, False) - expected = spark.createDataFrame( - [ - { - "h0": 1.0246538505087709e-4, - "h1": 0.9081680002273896, - "h2": 1.0349517929098209e-8, - "h3": 1.0246538506112363e-12, - "h4": 0.09172952403701702, - } - ] - ) - difference = test_result.df.select("h0", "h1", "h2", "h3", "h4").subtract(expected) - for col in difference.columns: - assert difference.filter(f.abs(f.col(col)) > threshold).count() == 0 -def test_single_snp_coloc_both_negative( +def test_coloc_no_logbf( spark: SparkSession, - threshold: float = 1e-5, + minimum_expected_h0: float = 0.99, + maximum_expected_h4: float = 1e-5, ) 
-> None: - """Test edge case of coloc where only one non-causal SNP overlaps in the StudyLocusOverlap.""" - test_overlap_df = spark.createDataFrame( - [ - { - "leftStudyLocusId": 1, - "rightStudyLocusId": 2, - "chromosome": "1", - "tagVariantId": "snp", - "left_logBF": 0.03, - "right_logBF": 0.01, - } - ] - ) - test_overlap = StudyLocusOverlap( - test_overlap_df.select( - "leftStudyLocusId", - "rightStudyLocusId", - "chromosome", - "tagVariantId", - f.struct(f.col("left_logBF"), f.col("right_logBF")).alias("statistics"), + """Test COLOC output when the input data has irrelevant logBF.""" + observed_overlap = StudyLocusOverlap( + ( + spark.createDataFrame( + [ + { + "leftStudyLocusId": 1, + "rightStudyLocusId": 2, + "chromosome": "1", + "tagVariantId": "snp", + "statistics": { + "left_logBF": None, + "right_logBF": None, + }, # irrelevant for COLOC + } + ], + schema=StructType( + [ + StructField("leftStudyLocusId", LongType(), False), + StructField("rightStudyLocusId", LongType(), False), + StructField("chromosome", StringType(), False), + StructField("tagVariantId", StringType(), False), + StructField( + "statistics", + StructType( + [ + StructField("left_logBF", DoubleType(), True), + StructField("right_logBF", DoubleType(), True), + ] + ), + ), + ] + ), + ) ), StudyLocusOverlap.get_schema(), ) - test_result = Coloc.colocalise(test_overlap) - expected = spark.createDataFrame( - [ - { - "h0": 0.9997855774090624, - "h1": 1.0302335812225042e-4, - "h2": 1.0098335895103664e-4, - "h3": 9.9978557750904e-9, - "h4": 1.0405876008495098e-5, - } - ] - ) - difference = test_result.df.select("h0", "h1", "h2", "h3", "h4").subtract(expected) - for col in difference.columns: - assert difference.filter(f.abs(f.col(col)) > threshold).count() == 0 + observed_coloc_df = Coloc.colocalise(observed_overlap).df + assert ( + observed_coloc_df.select("h0").collect()[0]["h0"] > minimum_expected_h0 + ), "COLOC should return a high h0 (no association) when the input data has irrelevant logBF." 
+ assert ( + observed_coloc_df.select("h4").collect()[0]["h4"] < maximum_expected_h4 + ), "COLOC should return a low h4 (traits are associated) when the input data has irrelevant logBF." def test_ecaviar(mock_study_locus_overlap: StudyLocusOverlap) -> None: diff --git a/tests/gentropy/method/test_locus_to_gene.py b/tests/gentropy/method/test_locus_to_gene.py index d97f4dc4d..898252f9f 100644 --- a/tests/gentropy/method/test_locus_to_gene.py +++ b/tests/gentropy/method/test_locus_to_gene.py @@ -67,9 +67,8 @@ def test_train( ) -> None: """Test the training function.""" trained_model = LocusToGeneTrainer.train( - mock_l2g_feature_matrix.fill_na(), + mock_l2g_feature_matrix.fill_na().select_features(["distanceTssMean"]), model, - features_list=["distanceTssMean"], evaluate=False, ) # Check that `model` is a PipelineModel object and not None @@ -81,32 +80,23 @@ def test_train( class TestColocalisationFactory: """Test the ColocalisationFactory methods.""" - @pytest.mark.parametrize( - "colocalisation_method", - [ - "COLOC", - "eCAVIAR", - ], - ) - def test_get_max_coloc_per_study_locus( + def test_get_max_coloc_per_credible_set( self: TestColocalisationFactory, mock_study_locus: StudyLocus, mock_study_index: StudyIndex, mock_colocalisation: Colocalisation, - colocalisation_method: str, ) -> None: """Test the function that extracts the maximum log likelihood ratio for each pair of overlapping study-locus returns the right data type.""" - coloc_features = ColocalisationFactory._get_max_coloc_per_study_locus( + coloc_features = ColocalisationFactory._get_max_coloc_per_credible_set( + mock_colocalisation, mock_study_locus, mock_study_index, - mock_colocalisation, - colocalisation_method, ) assert isinstance( coloc_features, L2GFeature - ), "Unexpected model type returned from _get_max_coloc_per_study_locus" + ), "Unexpected type returned from _get_max_coloc_per_credible_set" - def test_get_max_coloc_per_study_locus_semantic( + def 
test_get_max_coloc_per_credible_set_semantic( self: TestColocalisationFactory, spark: SparkSession, ) -> None: @@ -170,8 +160,10 @@ def test_get_max_coloc_per_study_locus_semantic( "colocalisationMethod": "eCAVIAR", "numberColocalisingVariants": 1, "clpp": 0.81, # 0.9*0.9 + "log2h4h3": None, } - ] + ], + schema=Colocalisation.get_schema(), ), _schema=Colocalisation.get_schema(), ) @@ -183,28 +175,13 @@ def test_get_max_coloc_per_study_locus_semantic( L2GFeature.get_schema(), ) # Test - coloc_features = ColocalisationFactory._get_max_coloc_per_study_locus( + coloc_features = ColocalisationFactory._get_max_coloc_per_credible_set( + coloc, credset, studies, - coloc, - "eCAVIAR", ) assert coloc_features.df.collect() == expected_coloc_features_df.collect() - def test_get_coloc_features( - self: TestColocalisationFactory, - mock_study_locus: StudyLocus, - mock_study_index: StudyIndex, - mock_colocalisation: Colocalisation, - ) -> None: - """Test the function that calls all the methods to produce colocalisation features.""" - coloc_features = ColocalisationFactory._get_coloc_features( - mock_study_locus, mock_study_index, mock_colocalisation - ) - assert isinstance( - coloc_features, L2GFeature - ), "Unexpected model type returned from _get_coloc_features" - class TestStudyLocusFactory: """Test the StudyLocusFactory methods.""" diff --git a/tests/gentropy/method/test_qc_of_sumstats.py b/tests/gentropy/method/test_qc_of_sumstats.py new file mode 100644 index 000000000..8480fce8d --- /dev/null +++ b/tests/gentropy/method/test_qc_of_sumstats.py @@ -0,0 +1,63 @@ +"""Test of the qc of summary statistics.""" + +from __future__ import annotations + +import numpy as np +import pandas as pd +import pyspark.sql.functions as f +from gentropy.dataset.summary_statistics import SummaryStatistics +from gentropy.method.sumstat_quality_controls import SummaryStatisticsQC +from pyspark.sql.functions import rand, when + + +def test_qc_functions( + sample_summary_statistics: 
SummaryStatistics, +) -> None: + """Test all sumstat qc functions.""" + gwas = sample_summary_statistics.sanity_filter() + QC = SummaryStatisticsQC.get_quality_control_metrics( + gwas=gwas, limit=100000, min_count=100, n_total=100000 + ) + QC = QC.toPandas() + + assert QC["n_variants"].iloc[0] == 1663 + assert QC["n_variants_sig"].iloc[0] == 29 + assert np.round(QC["gc_lambda"].iloc[0], 4) == 1.916 + assert np.round(QC["mean_beta"].iloc[0], 4) == 0.0013 + assert np.round(QC["mean_diff_pz"].iloc[0], 6) == 0 + assert np.round(QC["se_diff_pz"].iloc[0], 6) == 0 + assert pd.isna(QC["se_N"].iloc[0]) + + +def test_neff_check_eaf( + sample_summary_statistics: SummaryStatistics, +) -> None: + """Test N_eff check using mock EAFs.""" + gwas = sample_summary_statistics.sanity_filter() + gwas_df = gwas._df + gwas_df = gwas_df.withColumn("effectAlleleFrequencyFromSource", f.lit(0.5)) + gwas._df = gwas_df + + QC = SummaryStatisticsQC.get_quality_control_metrics( + gwas=gwas, limit=100000, min_count=100, n_total=100000 + ) + QC = QC.toPandas() + assert np.round(QC["se_N"].iloc[0], 4) == 0.5586 + + +def test_several_studyid( + sample_summary_statistics: SummaryStatistics, +) -> None: + """Test stability when several studyIds are present.""" + gwas = sample_summary_statistics.sanity_filter() + gwas_df = gwas._df + gwas_df = gwas_df.withColumn( + "studyId", when(rand() < 0.5, "new_value").otherwise(gwas_df["studyId"]) + ) + gwas._df = gwas_df + + QC = SummaryStatisticsQC.get_quality_control_metrics( + gwas=gwas, limit=100000, min_count=100, n_total=100000 + ) + QC = QC.toPandas() + assert QC.shape == (2, 8) diff --git a/tests/gentropy/method/test_sumstat_imputation.py b/tests/gentropy/method/test_sumstat_imputation.py new file mode 100644 index 000000000..aea59f76b --- /dev/null +++ b/tests/gentropy/method/test_sumstat_imputation.py @@ -0,0 +1,31 @@ +"""Test of sumstat imputation functions.""" + +from __future__ import annotations + +import numpy as np +from 
gentropy.method.sumstat_imputation import SummaryStatisticsImputation + + +class TestSSImp: + """Test of RAISS sumstat imputation main function.""" + + def test_sumstat_imputation( + self: TestSSImp, sample_data_for_carma: list[np.ndarray] + ) -> None: + """Test of RAISS.""" + ld = sample_data_for_carma[0] + z = sample_data_for_carma[1] + + unknowns = [5] + known = [index for index in list(range(21)) if index not in unknowns] + sig_t = ld[known, :][:, known] + sig_i_t = ld[unknowns, :][:, known] + zt = z[known] + + _l = SummaryStatisticsImputation.raiss_model( + zt, sig_t, sig_i_t, lamb=0.01, rtol=0.01 + ) + assert ( + np.round(_l["imputation_r2"][0], decimals=4) == 0.9304 + and np.round(_l["mu"][0], decimals=4) == 9.7215 + ) diff --git a/tests/gentropy/method/test_susie_inf.py b/tests/gentropy/method/test_susie_inf.py index d8e855a5b..45d79bcae 100644 --- a/tests/gentropy/method/test_susie_inf.py +++ b/tests/gentropy/method/test_susie_inf.py @@ -3,7 +3,12 @@ from __future__ import annotations import numpy as np +import pyspark.sql.functions as f +from gentropy.common.session import Session +from gentropy.dataset.study_locus import StudyLocus +from gentropy.dataset.summary_statistics import SummaryStatistics from gentropy.method.susie_inf import SUSIE_inf +from gentropy.susie_finemapper import SusieFineMapperStep class TestSUSIE_inf: @@ -16,7 +21,7 @@ def test_SUSIE_inf_lbf_moments( ld = sample_data_for_susie_inf[0] z = sample_data_for_susie_inf[1] lbf_moments = sample_data_for_susie_inf[2] - susie_output = SUSIE_inf.susie_inf(z=z, LD=ld, method="moments") + susie_output = SUSIE_inf.susie_inf(z=z, LD=ld, est_tausq=True, method="moments") lbf_calc = susie_output["lbf_variable"][:, 0] assert np.allclose( lbf_calc, lbf_moments @@ -29,7 +34,7 @@ def test_SUSIE_inf_lbf_mle( ld = sample_data_for_susie_inf[0] z = sample_data_for_susie_inf[1] lbf_mle = sample_data_for_susie_inf[3] - susie_output = SUSIE_inf.susie_inf(z=z, LD=ld, method="MLE") + susie_output = 
SUSIE_inf.susie_inf(z=z, LD=ld, est_tausq=True, method="MLE") lbf_calc = susie_output["lbf_variable"][:, 0] assert np.allclose( lbf_calc, lbf_mle, atol=1e-1 @@ -41,6 +46,45 @@ def test_SUSIE_inf_cred( """Test of SuSiE-inf credible set generator.""" ld = sample_data_for_susie_inf[0] z = sample_data_for_susie_inf[1] - susie_output = SUSIE_inf.susie_inf(z=z, LD=ld) + susie_output = SUSIE_inf.susie_inf( + z=z, + LD=ld, + est_tausq=True, + ) cred = SUSIE_inf.cred_inf(susie_output["PIP"], LD=ld) assert cred[0] == [5] + + def test_SUSIE_inf_convert_to_study_locus( + self: TestSUSIE_inf, + sample_data_for_susie_inf: list[np.ndarray], + sample_summary_statistics: SummaryStatistics, + session: Session, + ) -> None: + """Test of SuSiE-inf credible set generator.""" + ld = sample_data_for_susie_inf[0] + z = sample_data_for_susie_inf[1] + susie_output = SUSIE_inf.susie_inf( + z=z, + LD=ld, + est_tausq=False, + ) + gwas_df = sample_summary_statistics._df.withColumn( + "z", f.col("beta") / f.col("standardError") + ).filter(f.col("z").isNotNull()) + gwas_df = gwas_df.limit(21) + + L1 = SusieFineMapperStep.susie_inf_to_studylocus( + susie_output=susie_output, + session=session, + studyId="sample_id", + region="sample_region", + variant_index=gwas_df, + cs_lbf_thr=2, + ld_matrix=ld, + primary_signal_pval_threshold=1, + secondary_signal_pval_threshold=1, + purity_mean_r2_threshold=0, + purity_min_r2_threshold=0, + sum_pips=0.99, + ) + assert isinstance(L1, StudyLocus), "L1 is not an instance of StudyLocus" diff --git a/tests/gentropy/method/test_window_based_clumping.py b/tests/gentropy/method/test_window_based_clumping.py index 03546df9d..cd583bac2 100644 --- a/tests/gentropy/method/test_window_based_clumping.py +++ b/tests/gentropy/method/test_window_based_clumping.py @@ -8,12 +8,12 @@ from gentropy.method.window_based_clumping import WindowBasedClumping from pyspark.ml import functions as fml from pyspark.ml.linalg import VectorUDT +from pyspark.sql import SparkSession from 
pyspark.sql import functions as f from pyspark.sql.window import Window if TYPE_CHECKING: from gentropy.dataset.summary_statistics import SummaryStatistics - from pyspark.sql import SparkSession def test_window_based_clump__return_type( @@ -21,7 +21,14 @@ def test_window_based_clump__return_type( ) -> None: """Test window-based clumping.""" assert isinstance( - WindowBasedClumping.clump_with_locus(mock_summary_statistics, 250_000), + WindowBasedClumping.clump(mock_summary_statistics, distance=250_000), + StudyLocus, + ) + assert isinstance( + WindowBasedClumping.clump( + mock_summary_statistics, + distance=250_000, + ), StudyLocus, ) @@ -44,7 +51,10 @@ def test_window_based_clump_with_locus__correctness( ) -> None: """Test window-based clumping.""" clumped = sample_summary_statistics.window_based_clumping( - distance=250_000, locus_collect_distance=250_000 + distance=250_000, + ) + clumped = clumped.annotate_locus_statistics( + sample_summary_statistics, collect_locus_distance=250_000 ) # Asserting the presence of locus key: @@ -57,7 +67,7 @@ def test_window_based_clump_with_locus__correctness( assert (clumped.df.filter(f.col("variantId") == "18_12843138_T_C").count()) == 1 # Assert the number of variants in the locus: - assert (clumped.df.select(f.explode_outer("locus").alias("loci")).count()) == 132 + assert (clumped.df.select(f.explode_outer("locus").alias("loci")).count()) == 218 def test_prune_peak(spark: SparkSession) -> None: diff --git a/utils/update_GWAS_Catalog_data.sh b/utils/update_GWAS_Catalog_data.sh index 1e380d30c..00ee44d8f 100755 --- a/utils/update_GWAS_Catalog_data.sh +++ b/utils/update_GWAS_Catalog_data.sh @@ -15,7 +15,7 @@ get_release_url(){ # Function to get the Ensembl and EFO version which used to ground GWAS data: get_release_info(){ - curl -s https://www.ebi.ac.uk/gwas/api/search/stats | jq -r '"\(.ensemblbuild) \(.efoversion)"' + curl -s "${1}" | jq -r '"\(.ensemblbuild) \(.efoversion)"' } logging(){ @@ -41,6 +41,18 @@ 
upload_file_to_gcp(){ fi } +fetch_from_ftp(){ + URL=${1} + TARGET=${2} + wget -q ${URL} -O ${TARGET} + if [ $? -ne 0 ]; then + logging "Failed to fetch ${URL}" + return + else + logging "File ${TARGET} saved." + fi +} + # Resources: export BASE_URL=ftp://ftp.ebi.ac.uk/pub/databases/gwas export RELEASE_INFO_URL=https://www.ebi.ac.uk/gwas/api/search/stats @@ -71,7 +83,7 @@ read YEAR MONTH DAY < <(get_release_url) logging "Most recent GWAS Catalog release: ${YEAR}/${MONTH}/${DAY}" # Capturing release metadata: -read ENSEMBL EFO < <(get_release_info) +read ENSEMBL EFO < <(get_release_info ${RELEASE_INFO_URL}) logging "Genes were mapped to v${ENSEMBL} Ensembl release." logging "Diseases were mapped to ${EFO} EFO release." @@ -80,26 +92,19 @@ RELEASE_URL=${BASE_URL}/releases/${YEAR}/${MONTH}/${DAY} logging "Datafiles are fetching from ${RELEASE_URL}" # Fetching files while assigning properly dated and annotated names: -wget -q ${RELEASE_URL}/gwas-catalog-associations_ontology-annotated.tsv -O ${ASSOCIATION_FILE} -logging "File ${ASSOCIATION_FILE} saved." +fetch_from_ftp ${RELEASE_URL}/gwas-catalog-associations_ontology-annotated.tsv ${ASSOCIATION_FILE} -wget -q ${RELEASE_URL}/gwas-catalog-download-studies-v1.0.3.txt -O ${PUBLISHED_STUDIES_FILE} -logging "File ${PUBLISHED_STUDIES_FILE} saved." +fetch_from_ftp ${RELEASE_URL}/gwas-catalog-download-studies-v1.0.3.1.txt ${PUBLISHED_STUDIES_FILE} -wget -q ${RELEASE_URL}/gwas-catalog-unpublished-studies-v1.0.3.tsv -O ${UNPUBLISHED_STUDIES_FILE} -logging "File ${UNPUBLISHED_STUDIES_FILE} saved." +fetch_from_ftp ${RELEASE_URL}/gwas-catalog-unpublished-studies-v1.0.3.1.tsv ${UNPUBLISHED_STUDIES_FILE} -wget -q ${RELEASE_URL}/gwas-catalog-download-ancestries-v1.0.3.txt -O ${PUBLISHED_ANCESTRIES_FILE} -logging "File ${PUBLISHED_ANCESTRIES_FILE} saved." 
+fetch_from_ftp ${RELEASE_URL}/gwas-catalog-download-ancestries-v1.0.3.1.txt ${PUBLISHED_ANCESTRIES_FILE} -wget -q ${RELEASE_URL}/gwas-catalog-unpublished-ancestries-v1.0.3.tsv -O ${UNPUBLISHED_ANCESTRIES_FILE} -logging "File ${UNPUBLISHED_ANCESTRIES_FILE} saved." +fetch_from_ftp ${RELEASE_URL}/gwas-catalog-unpublished-ancestries-v1.0.3.1.tsv ${UNPUBLISHED_ANCESTRIES_FILE} -wget -q ${BASE_URL}/summary_statistics/harmonised_list.txt -O ${HARMONISED_LIST_FILE} -logging "File ${HARMONISED_LIST_FILE} saved." +fetch_from_ftp ${BASE_URL}/summary_statistics/harmonised_list.txt ${HARMONISED_LIST_FILE} -wget -q ${GWAS_CATALOG_STUDY_CURATION_URL} -O ${GWAS_CATALOG_STUDY_CURATION_FILE} -logging "In-house GWAS Catalog study curation file fetched from GitHub." +fetch_from_ftp ${GWAS_CATALOG_STUDY_CURATION_URL} ${GWAS_CATALOG_STUDY_CURATION_FILE} logging "Copying files to GCP..."