From dc4e3674d30baf736fd540871c1fc00179777290 Mon Sep 17 00:00:00 2001 From: Yakov Date: Wed, 10 Apr 2024 13:23:44 +0100 Subject: [PATCH 1/7] fix: adding deduplication for GWAS in locus (#573) --- src/gentropy/susie_finemapper.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/gentropy/susie_finemapper.py b/src/gentropy/susie_finemapper.py index aba5b4c40..db62fe359 100644 --- a/src/gentropy/susie_finemapper.py +++ b/src/gentropy/susie_finemapper.py @@ -427,6 +427,11 @@ def susie_finemapper_from_prepared_dataframes( start_time = time.time() GWAS_df = GWAS_df.toPandas() + N_gwas_before_dedupl = len(GWAS_df) + + GWAS_df = GWAS_df.drop_duplicates(subset="variantId", keep=False) + GWAS_df = GWAS_df.reset_index() + ld_index = ld_index.toPandas() ld_index = ld_index.reset_index() @@ -539,6 +544,7 @@ def susie_finemapper_from_prepared_dataframes( log_df = pd.DataFrame( { + "N_gwas_before_dedupl": N_gwas_before_dedupl, "N_gwas": N_gwas, "N_ld": N_ld, "N_overlap": N_after_merge, From a5b62f2359aa8285ce638329400c39a55101d622 Mon Sep 17 00:00:00 2001 From: Yakov Date: Wed, 10 Apr 2024 14:02:51 +0100 Subject: [PATCH 2/7] feat: add benchmarking for fine-mapping using Alzheimer as example (#572) * feat: add benchmarking for fine-mapping using Alzheimer as example * fix: small fix in notebook --- notebooks/FineMapping_AlzheimierDisease.ipynb | 1734 +++++++++++++++++ 1 file changed, 1734 insertions(+) create mode 100644 notebooks/FineMapping_AlzheimierDisease.ipynb diff --git a/notebooks/FineMapping_AlzheimierDisease.ipynb b/notebooks/FineMapping_AlzheimierDisease.ipynb new file mode 100644 index 000000000..8a785cc3f --- /dev/null +++ b/notebooks/FineMapping_AlzheimierDisease.ipynb @@ -0,0 +1,1734 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Fine-mapping of Alzheimer's disease GWAS summary statistics using GentroPy" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This notebook presents an example of 
fine-mapping of the GWAS Catalog study for Alzheimer's disease ([link to study](https://genetics.opentargets.org/Study/GCST90012877/associations)). The study is a good benchmarking example for fine-mapping: it has a relatively large number of SNPs and a very strong signal on chromosome 19 (APOE). Note that very strong signals are usually excluded from fine-mapping due to instability.\n", + "\n", + "We also excluded the MHC region (6:28M-34M) from fine-mapping because of its very high variant density.\n", + "\n", + "To run this notebook on your local machine (rather than on Dataproc), you need to install the GCS connector: https://github.com/broadinstitute/install-gcs-connector." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Initialization" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Your browser has been opened to visit:\n", + "\n", + " https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=764086051850-6qr4p6gpi6hn506pt8ejuq83di341hur.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A8085%2F&scope=openid+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fuserinfo.email+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcloud-platform+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fsqlservice.login+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Faccounts.reauth&state=2Jvk4c7unAsigRvEKhceIxcrpGmeK8&access_type=offline&code_challenge=84guS6MmOY7qgvNpHLxoJbhRDBUAEUS93teMwQboD3Q&code_challenge_method=S256\n", + "\n", + "\n", + "Credentials saved to file: [/Users/yt4/.config/gcloud/application_default_credentials.json]\n", + "\n", + "These credentials will be used by any library that requests Application Default Credentials (ADC).\n", + "\n", + "Quota project \"open-targets-genetics-dev\" was added to ADC which can be used by Google client libraries for billing and quota.
Note that some services may still bill the project owning the resource.\n", + "\n", + "\n", + "Updates are available for some Google Cloud CLI components. To install them,\n", + "please run:\n", + " $ gcloud components update\n", + "\n" + ] + } + ], + "source": [ + "!gcloud auth application-default login" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " Loading BokehJS ...\n", + "
\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/javascript": "(function(root) {\n function now() {\n return new Date();\n }\n\n const force = true;\n\n if (typeof root._bokeh_onload_callbacks === \"undefined\" || force === true) {\n root._bokeh_onload_callbacks = [];\n root._bokeh_is_loading = undefined;\n }\n\nconst JS_MIME_TYPE = 'application/javascript';\n const HTML_MIME_TYPE = 'text/html';\n const EXEC_MIME_TYPE = 'application/vnd.bokehjs_exec.v0+json';\n const CLASS_NAME = 'output_bokeh rendered_html';\n\n /**\n * Render data to the DOM node\n */\n function render(props, node) {\n const script = document.createElement(\"script\");\n node.appendChild(script);\n }\n\n /**\n * Handle when an output is cleared or removed\n */\n function handleClearOutput(event, handle) {\n function drop(id) {\n const view = Bokeh.index.get_by_id(id)\n if (view != null) {\n view.model.document.clear()\n Bokeh.index.delete(view)\n }\n }\n\n const cell = handle.cell;\n\n const id = cell.output_area._bokeh_element_id;\n const server_id = cell.output_area._bokeh_server_id;\n\n // Clean up Bokeh references\n if (id != null) {\n drop(id)\n }\n\n if (server_id !== undefined) {\n // Clean up Bokeh references\n const cmd_clean = \"from bokeh.io.state import curstate; print(curstate().uuid_to_server['\" + server_id + \"'].get_sessions()[0].document.roots[0]._id)\";\n cell.notebook.kernel.execute(cmd_clean, {\n iopub: {\n output: function(msg) {\n const id = msg.content.text.trim()\n drop(id)\n }\n }\n });\n // Destroy server and session\n const cmd_destroy = \"import bokeh.io.notebook as ion; ion.destroy_server('\" + server_id + \"')\";\n cell.notebook.kernel.execute(cmd_destroy);\n }\n }\n\n /**\n * Handle when a new output is added\n */\n function handleAddOutput(event, handle) {\n const output_area = handle.output_area;\n const output = handle.output;\n\n // limit handleAddOutput to display_data with EXEC_MIME_TYPE content only\n if 
((output.output_type != \"display_data\") || (!Object.prototype.hasOwnProperty.call(output.data, EXEC_MIME_TYPE))) {\n return\n }\n\n const toinsert = output_area.element.find(\".\" + CLASS_NAME.split(' ')[0]);\n\n if (output.metadata[EXEC_MIME_TYPE][\"id\"] !== undefined) {\n toinsert[toinsert.length - 1].firstChild.textContent = output.data[JS_MIME_TYPE];\n // store reference to embed id on output_area\n output_area._bokeh_element_id = output.metadata[EXEC_MIME_TYPE][\"id\"];\n }\n if (output.metadata[EXEC_MIME_TYPE][\"server_id\"] !== undefined) {\n const bk_div = document.createElement(\"div\");\n bk_div.innerHTML = output.data[HTML_MIME_TYPE];\n const script_attrs = bk_div.children[0].attributes;\n for (let i = 0; i < script_attrs.length; i++) {\n toinsert[toinsert.length - 1].firstChild.setAttribute(script_attrs[i].name, script_attrs[i].value);\n toinsert[toinsert.length - 1].firstChild.textContent = bk_div.children[0].textContent\n }\n // store reference to server id on output_area\n output_area._bokeh_server_id = output.metadata[EXEC_MIME_TYPE][\"server_id\"];\n }\n }\n\n function register_renderer(events, OutputArea) {\n\n function append_mime(data, metadata, element) {\n // create a DOM node to render to\n const toinsert = this.create_output_subarea(\n metadata,\n CLASS_NAME,\n EXEC_MIME_TYPE\n );\n this.keyboard_manager.register_events(toinsert);\n // Render to node\n const props = {data: data, metadata: metadata[EXEC_MIME_TYPE]};\n render(props, toinsert[toinsert.length - 1]);\n element.append(toinsert);\n return toinsert\n }\n\n /* Handle when an output is cleared or removed */\n events.on('clear_output.CodeCell', handleClearOutput);\n events.on('delete.Cell', handleClearOutput);\n\n /* Handle when a new output is added */\n events.on('output_added.OutputArea', handleAddOutput);\n\n /**\n * Register the mime type and append_mime function with output_area\n */\n OutputArea.prototype.register_mime_type(EXEC_MIME_TYPE, append_mime, {\n /* Is output safe? 
*/\n safe: true,\n /* Index of renderer in `output_area.display_order` */\n index: 0\n });\n }\n\n // register the mime type if in Jupyter Notebook environment and previously unregistered\n if (root.Jupyter !== undefined) {\n const events = require('base/js/events');\n const OutputArea = require('notebook/js/outputarea').OutputArea;\n\n if (OutputArea.prototype.mime_types().indexOf(EXEC_MIME_TYPE) == -1) {\n register_renderer(events, OutputArea);\n }\n }\n if (typeof (root._bokeh_timeout) === \"undefined\" || force === true) {\n root._bokeh_timeout = Date.now() + 5000;\n root._bokeh_failed_load = false;\n }\n\n const NB_LOAD_WARNING = {'data': {'text/html':\n \"
\\n\"+\n \"

\\n\"+\n \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n \"

\\n\"+\n \"
    \\n\"+\n \"
  • re-rerun `output_notebook()` to attempt to load from CDN again, or
  • \\n\"+\n \"
  • use INLINE resources instead, as so:
  • \\n\"+\n \"
\\n\"+\n \"\\n\"+\n \"from bokeh.resources import INLINE\\n\"+\n \"output_notebook(resources=INLINE)\\n\"+\n \"\\n\"+\n \"
\"}};\n\n function display_loaded() {\n const el = document.getElementById(\"a8e9a6a6-96f7-4efd-a426-b2299499ef03\");\n if (el != null) {\n el.textContent = \"BokehJS is loading...\";\n }\n if (root.Bokeh !== undefined) {\n if (el != null) {\n el.textContent = \"BokehJS \" + root.Bokeh.version + \" successfully loaded.\";\n }\n } else if (Date.now() < root._bokeh_timeout) {\n setTimeout(display_loaded, 100)\n }\n }\n\n function run_callbacks() {\n try {\n root._bokeh_onload_callbacks.forEach(function(callback) {\n if (callback != null)\n callback();\n });\n } finally {\n delete root._bokeh_onload_callbacks\n }\n console.debug(\"Bokeh: all callbacks have finished\");\n }\n\n function load_libs(css_urls, js_urls, callback) {\n if (css_urls == null) css_urls = [];\n if (js_urls == null) js_urls = [];\n\n root._bokeh_onload_callbacks.push(callback);\n if (root._bokeh_is_loading > 0) {\n console.debug(\"Bokeh: BokehJS is being loaded, scheduling callback at\", now());\n return null;\n }\n if (js_urls == null || js_urls.length === 0) {\n run_callbacks();\n return null;\n }\n console.debug(\"Bokeh: BokehJS not loaded, scheduling load and callback at\", now());\n root._bokeh_is_loading = css_urls.length + js_urls.length;\n\n function on_load() {\n root._bokeh_is_loading--;\n if (root._bokeh_is_loading === 0) {\n console.debug(\"Bokeh: all BokehJS libraries/stylesheets loaded\");\n run_callbacks()\n }\n }\n\n function on_error(url) {\n console.error(\"failed to load \" + url);\n }\n\n for (let i = 0; i < css_urls.length; i++) {\n const url = css_urls[i];\n const element = document.createElement(\"link\");\n element.onload = on_load;\n element.onerror = on_error.bind(null, url);\n element.rel = \"stylesheet\";\n element.type = \"text/css\";\n element.href = url;\n console.debug(\"Bokeh: injecting link tag for BokehJS stylesheet: \", url);\n document.body.appendChild(element);\n }\n\n for (let i = 0; i < js_urls.length; i++) {\n const url = js_urls[i];\n const element = 
document.createElement('script');\n element.onload = on_load;\n element.onerror = on_error.bind(null, url);\n element.async = false;\n element.src = url;\n console.debug(\"Bokeh: injecting script tag for BokehJS library: \", url);\n document.head.appendChild(element);\n }\n };\n\n function inject_raw_css(css) {\n const element = document.createElement(\"style\");\n element.appendChild(document.createTextNode(css));\n document.body.appendChild(element);\n }\n\n const js_urls = [\"https://cdn.bokeh.org/bokeh/release/bokeh-3.3.0.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-gl-3.3.0.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-widgets-3.3.0.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-tables-3.3.0.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-mathjax-3.3.0.min.js\"];\n const css_urls = [];\n\n const inline_js = [ function(Bokeh) {\n Bokeh.set_log_level(\"info\");\n },\nfunction(Bokeh) {\n }\n ];\n\n function run_inline_js() {\n if (root.Bokeh !== undefined || force === true) {\n for (let i = 0; i < inline_js.length; i++) {\n inline_js[i].call(root, root.Bokeh);\n }\nif (force === true) {\n display_loaded();\n }} else if (Date.now() < root._bokeh_timeout) {\n setTimeout(run_inline_js, 100);\n } else if (!root._bokeh_failed_load) {\n console.log(\"Bokeh: BokehJS failed to load within specified timeout.\");\n root._bokeh_failed_load = true;\n } else if (force !== true) {\n const cell = $(document.getElementById(\"a8e9a6a6-96f7-4efd-a426-b2299499ef03\")).parents('.cell').data().cell;\n cell.output_area.append_execute_result(NB_LOAD_WARNING)\n }\n }\n\n if (root._bokeh_is_loading === 0) {\n console.debug(\"Bokeh: BokehJS loaded, going straight to plotting\");\n run_inline_js();\n } else {\n load_libs(css_urls, js_urls, function() {\n console.debug(\"Bokeh: BokehJS plotting callback run at\", now());\n run_inline_js();\n });\n }\n}(window));", + "application/vnd.bokehjs_load.v0+json": "" + }, + "metadata": {}, + "output_type": 
"display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "24/04/09 10:40:45 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Setting default log level to \"WARN\".\n", + "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n", + "pip-installed Hail requires additional configuration options in Spark referring\n", + " to the path to the Hail Python module directory HAIL_DIR,\n", + " e.g. /path/to/python/site-packages/hail:\n", + " spark.jars=HAIL_DIR/backend/hail-all-spark.jar\n", + " spark.driver.extraClassPath=HAIL_DIR/backend/hail-all-spark.jar\n", + " spark.executor.extraClassPath=./hail-all-spark.jarRunning on Apache Spark version 3.3.4\n", + "SparkUI available at http://192.168.0.232:4040\n", + "Welcome to\n", + " __ __ <>__\n", + " / /_/ /__ __/ /\n", + " / __ / _ `/ / /\n", + " /_/ /_/\\_,_/_/_/ version 0.2.127-bb535cd096c5\n", + "LOGGING: writing to /dev/null\n" + ] + } + ], + "source": [ + "import os\n", + "import hail as hl\n", + "import pyspark.sql.functions as f\n", + "import pandas as pd\n", + "pd.set_option('display.max_colwidth', None)\n", + "pd.set_option('display.expand_frame_repr', False)\n", + "\n", + "from gentropy.common.session import Session\n", + "from gentropy.dataset.study_index import StudyIndex\n", + "from gentropy.dataset.summary_statistics import SummaryStatistics\n", + "from gentropy.dataset.study_index import StudyIndex\n", + "from gentropy.method.window_based_clumping import WindowBasedClumping\n", + "from gentropy.susie_finemapper import SusieFineMapperStep\n", + "\n", + "hail_dir = os.path.dirname(hl.__file__)\n", + "session = Session(hail_home=hail_dir, start_hail=True, extended_spark_conf={\"spark.driver.memory\": \"12g\",\n", + " \"spark.kryoserializer.buffer.max\": 
\"500m\",\"spark.driver.maxResultSize\":\"3g\"})\n", + "hl.init(sc=session.spark.sparkContext, log=\"/dev/null\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Loading the data and clumping" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of SNPs in GWAS: 10607272\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Stage 7:> (0 + 1) / 1]\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of clumps: 33\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + } + ], + "source": [ + "path_gwas1=\"gs://gwas_catalog_data/harmonised_summary_statistics/GCST90012877.parquet\"\n", + "path_si=\"gs://gwas_catalog_data/study_index\"\n", + "\n", + "gwas1 = SummaryStatistics.from_parquet(session, path_gwas1)\n", + "study_index = StudyIndex.from_parquet(session, path_si)\n", + "\n", + "slt=WindowBasedClumping.clump(gwas1,gwas_significance=5e-8,distance=1e6)\n", + "slt_df=slt._df\n", + "\n", + "print(\"Number of SNPs in GWAS: \",gwas1._df.count())\n", + "print(\"Number of clumps: \",slt_df.count())" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Stage 15:===================================================> (9 + 1) / 10]\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+------------+----------------+----------+---------+----------------+----------+--------------+--------------+-------------------------------+----------------+--------------------+---------------+\n", + "| studyId| variantId|chromosome| position| beta|sampleSize|pValueMantissa|pValueExponent|effectAlleleFrequencyFromSource| standardError| 
studyLocusId|qualityControls|\n", + "+------------+----------------+----------+---------+----------------+----------+--------------+--------------+-------------------------------+----------------+--------------------+---------------+\n", + "|GCST90012877| 1_161185602_G_A| 1|161185602| 0.0609052805639| null| 4.302| -8| 0.23499| 0.0111181765833| 6360456299763482946| []|\n", + "|GCST90012877| 1_207577223_T_C| 1|207577223| -0.122752564739| null| 1.403| -23| 0.822818| 0.0122652043685|-6742466305250328444| []|\n", + "|GCST90012877| 10_11678309_A_G| 10| 11678309| 0.0668997305692| null| 1.085| -11| 0.380517|0.00984571382836| 3672202482976347473| []|\n", + "|GCST90012877| 10_59886075_G_T| 10| 59886075|-0.0523916765294| null| 3.802| -8| 0.480668|0.00952612570169| 760299597568413738| []|\n", + "|GCST90012877| 10_80520381_T_G| 10| 80520381| 0.0701098772587| null| 2.736| -9| 0.793475| 0.0117897597766|-6168361428432361140| []|\n", + "|GCST90012877|11_121564878_T_C| 11|121564878| -0.186386086749| null| 5.586| -14| 0.037005| 0.0247938672944|-7548659272243096830| []|\n", + "|GCST90012877| 11_47370397_G_A| 11| 47370397| 0.0634588530202| null| 6.911| -11| 0.387521| 0.0097291000298| 1916491992423016132| []|\n", + "|GCST90012877| 11_60328267_T_C| 11| 60328267|-0.0892048800109| null| 9.335| -20| 0.371215|0.00980658024905| 3318332793803757311| []|\n", + "|GCST90012877| 11_86156833_A_G| 11| 86156833| 0.103281644827| null| 5.214| -26| 0.629462|0.00979200684254| 3806751464721795080| []|\n", + "|GCST90012877| 14_52924962_A_G| 14| 52924962| 0.102404628268| null| 3.69| -10| 0.092233| 0.0163413709974|-8640267085448358001| []|\n", + "|GCST90012877| 14_92472511_G_A| 14| 92472511|-0.0762776811698| null| 7.454| -14| 0.339674| 0.0101980809801| 8895835730818824947| []|\n", + "|GCST90012877| 15_50707194_C_G| 15| 50707194|-0.0722934881552| null| 1.639| -9| 0.197469| 0.0119888249532|-4585712009512019667| []|\n", + "|GCST90012877| 15_58730416_T_C| 15| 58730416|-0.0675867539589| null| 2.674| -11| 
0.319058| 0.010142839928|-9173595866829505633| []|\n", + "|GCST90012877| 15_63277703_C_T| 15| 63277703| 0.0849598934189| null| 1.052| -8| 0.139487| 0.0148475601067|-6181511576673508209| []|\n", + "|GCST90012877| 16_31115000_C_A| 16| 31115000|-0.0620662164665| null| 4.466| -9| 0.281382| 0.0105807514538|-3612515273077152914| []|\n", + "|GCST90012877| 17_5229833_T_C| 17| 5229833|-0.0849787931131| null| 1.352| -9| 0.875068| 0.0140203927902|-7070596043624425654| []|\n", + "|GCST90012877| 17_63483402_T_C| 17| 63483402| 0.0542810764988| null| 1.215| -8| 0.529632|0.00952697587266| 7171154626284587162| []|\n", + "|GCST90012877| 19_1050875_A_G| 19| 1050875|-0.0772974277902| null| 2.415| -13| 0.674169| 0.0105546077307| 6109438569946056978| []|\n", + "|GCST90012877| 19_44892009_G_A| 19| 44892009| 0.352722374032| null| 1.995| -277| 0.605067|0.00991069396551| 6814727764900576662| []|\n", + "|GCST90012877| 19_51224706_C_A| 19| 51224706|-0.0582180344342| null| 1.295| -8| 0.325551| 0.010237506551|-8288099943480320096| []|\n", + "+------------+----------------+----------+---------+----------------+----------+--------------+--------------+-------------------------------+----------------+--------------------+---------------+\n", + "only showing top 20 rows\n", + "\n", + "None\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + } + ], + "source": [ + "print(slt_df.show())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Fine-mapping without outliers detection and imputation using 2M as window size" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-04-09 10:41:57.354 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n", + " 'freq_index_dict' -> 'freq_index_dict_1'\n", + " 'faf_index_dict' 
-> 'faf_index_dict_1'\n", + " 'rf' -> 'rf_1'\n", + " 'age_index_dict' -> 'age_index_dict_1'\n", + " 'freq_meta' -> 'freq_meta_1'\n", + " 'age_distribution' -> 'age_distribution_1'\n", + " 'popmax_index_dict' -> 'popmax_index_dict_1'\n", + "2024-04-09 10:42:15.499 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 10:42:28.284 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 10:44:41.305 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n", + " 'freq_index_dict' -> 'freq_index_dict_1'\n", + " 'faf_index_dict' -> 'faf_index_dict_1'\n", + " 'rf' -> 'rf_1'\n", + " 'age_index_dict' -> 'age_index_dict_1'\n", + " 'freq_meta' -> 'freq_meta_1'\n", + " 'age_distribution' -> 'age_distribution_1'\n", + " 'popmax_index_dict' -> 'popmax_index_dict_1'\n", + "2024-04-09 10:44:51.854 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 10:45:03.059 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 10:47:04.871 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n", + " 'freq_index_dict' -> 'freq_index_dict_1'\n", + " 'faf_index_dict' -> 'faf_index_dict_1'\n", + " 'rf' -> 'rf_1'\n", + " 'age_index_dict' -> 'age_index_dict_1'\n", + " 'freq_meta' -> 'freq_meta_1'\n", + " 'age_distribution' -> 'age_distribution_1'\n", + " 'popmax_index_dict' -> 'popmax_index_dict_1'\n", + "2024-04-09 10:47:17.310 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 10:47:29.113 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 10:50:32.790 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n", + " 'freq_index_dict' -> 'freq_index_dict_1'\n", + " 'faf_index_dict' -> 'faf_index_dict_1'\n", + " 'rf' -> 'rf_1'\n", + " 'age_index_dict' -> 'age_index_dict_1'\n", + " 'freq_meta' -> 'freq_meta_1'\n", + " 'age_distribution' -> 'age_distribution_1'\n", + " 'popmax_index_dict' -> 'popmax_index_dict_1'\n", + "2024-04-09 10:50:46.191 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 
10:50:57.958 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 10:53:22.698 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n", + " 'freq_index_dict' -> 'freq_index_dict_1'\n", + " 'faf_index_dict' -> 'faf_index_dict_1'\n", + " 'rf' -> 'rf_1'\n", + " 'age_index_dict' -> 'age_index_dict_1'\n", + " 'freq_meta' -> 'freq_meta_1'\n", + " 'age_distribution' -> 'age_distribution_1'\n", + " 'popmax_index_dict' -> 'popmax_index_dict_1'\n", + "2024-04-09 10:53:34.535 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 10:53:45.816 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 10:57:23.189 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n", + " 'freq_index_dict' -> 'freq_index_dict_1'\n", + " 'faf_index_dict' -> 'faf_index_dict_1'\n", + " 'rf' -> 'rf_1'\n", + " 'age_index_dict' -> 'age_index_dict_1'\n", + " 'freq_meta' -> 'freq_meta_1'\n", + " 'age_distribution' -> 'age_distribution_1'\n", + " 'popmax_index_dict' -> 'popmax_index_dict_1'\n", + "2024-04-09 10:57:34.563 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 10:57:43.988 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 10:59:39.834 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n", + " 'freq_index_dict' -> 'freq_index_dict_1'\n", + " 'faf_index_dict' -> 'faf_index_dict_1'\n", + " 'rf' -> 'rf_1'\n", + " 'age_index_dict' -> 'age_index_dict_1'\n", + " 'freq_meta' -> 'freq_meta_1'\n", + " 'age_distribution' -> 'age_distribution_1'\n", + " 'popmax_index_dict' -> 'popmax_index_dict_1'\n", + "2024-04-09 10:59:52.878 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 11:00:06.629 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 11:02:11.433 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n", + " 'freq_index_dict' -> 'freq_index_dict_1'\n", + " 'faf_index_dict' -> 'faf_index_dict_1'\n", + " 'rf' -> 'rf_1'\n", + " 'age_index_dict' -> 
'age_index_dict_1'\n", + " 'freq_meta' -> 'freq_meta_1'\n", + " 'age_distribution' -> 'age_distribution_1'\n", + " 'popmax_index_dict' -> 'popmax_index_dict_1'\n", + "2024-04-09 11:02:22.101 Hail: INFO: Ordering unsorted dataset with network shuffle\n", + "2024-04-09 11:02:32.320 Hail: INFO: wrote table with 175330 rows in 8 partitions to /tmp/__iruid_20813-EyC6kjgQ1hAjFSiH1Xp7sB\n", + "2024-04-09 11:02:35.350 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 11:04:16.225 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n", + " 'freq_index_dict' -> 'freq_index_dict_1'\n", + " 'faf_index_dict' -> 'faf_index_dict_1'\n", + " 'rf' -> 'rf_1'\n", + " 'age_index_dict' -> 'age_index_dict_1'\n", + " 'freq_meta' -> 'freq_meta_1'\n", + " 'age_distribution' -> 'age_distribution_1'\n", + " 'popmax_index_dict' -> 'popmax_index_dict_1'\n", + "2024-04-09 11:04:27.837 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 11:04:41.879 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 11:07:09.950 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n", + " 'freq_index_dict' -> 'freq_index_dict_1'\n", + " 'faf_index_dict' -> 'faf_index_dict_1'\n", + " 'rf' -> 'rf_1'\n", + " 'age_index_dict' -> 'age_index_dict_1'\n", + " 'freq_meta' -> 'freq_meta_1'\n", + " 'age_distribution' -> 'age_distribution_1'\n", + " 'popmax_index_dict' -> 'popmax_index_dict_1'\n", + "2024-04-09 11:07:21.139 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 11:07:33.197 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 11:09:56.240 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n", + " 'freq_index_dict' -> 'freq_index_dict_1'\n", + " 'faf_index_dict' -> 'faf_index_dict_1'\n", + " 'rf' -> 'rf_1'\n", + " 'age_index_dict' -> 'age_index_dict_1'\n", + " 'freq_meta' -> 'freq_meta_1'\n", + " 'age_distribution' -> 'age_distribution_1'\n", + " 'popmax_index_dict' -> 
'popmax_index_dict_1'\n", + "2024-04-09 11:10:08.288 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 11:10:20.802 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 11:14:07.114 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n", + " 'freq_index_dict' -> 'freq_index_dict_1'\n", + " 'faf_index_dict' -> 'faf_index_dict_1'\n", + " 'rf' -> 'rf_1'\n", + " 'age_index_dict' -> 'age_index_dict_1'\n", + " 'freq_meta' -> 'freq_meta_1'\n", + " 'age_distribution' -> 'age_distribution_1'\n", + " 'popmax_index_dict' -> 'popmax_index_dict_1'\n", + "2024-04-09 11:14:20.204 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 11:14:32.464 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 11:16:41.133 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n", + " 'freq_index_dict' -> 'freq_index_dict_1'\n", + " 'faf_index_dict' -> 'faf_index_dict_1'\n", + " 'rf' -> 'rf_1'\n", + " 'age_index_dict' -> 'age_index_dict_1'\n", + " 'freq_meta' -> 'freq_meta_1'\n", + " 'age_distribution' -> 'age_distribution_1'\n", + " 'popmax_index_dict' -> 'popmax_index_dict_1'\n", + "2024-04-09 11:16:53.191 Hail: INFO: Ordering unsorted dataset with network shuffle\n", + "2024-04-09 11:17:03.328 Hail: INFO: wrote table with 211068 rows in 9 partitions to /tmp/__iruid_35318-By6CsozcY2JvH6dhwjdBPU\n", + "2024-04-09 11:17:10.133 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 11:19:10.141 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n", + " 'freq_index_dict' -> 'freq_index_dict_1'\n", + " 'faf_index_dict' -> 'faf_index_dict_1'\n", + " 'rf' -> 'rf_1'\n", + " 'age_index_dict' -> 'age_index_dict_1'\n", + " 'freq_meta' -> 'freq_meta_1'\n", + " 'age_distribution' -> 'age_distribution_1'\n", + " 'popmax_index_dict' -> 'popmax_index_dict_1'\n", + "2024-04-09 11:19:21.964 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 11:19:34.636 Hail: INFO: Coerced sorted dataset\n", + 
"2024-04-09 11:21:47.445 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n", + " 'freq_index_dict' -> 'freq_index_dict_1'\n", + " 'faf_index_dict' -> 'faf_index_dict_1'\n", + " 'rf' -> 'rf_1'\n", + " 'age_index_dict' -> 'age_index_dict_1'\n", + " 'freq_meta' -> 'freq_meta_1'\n", + " 'age_distribution' -> 'age_distribution_1'\n", + " 'popmax_index_dict' -> 'popmax_index_dict_1'\n", + "2024-04-09 11:21:59.127 Hail: INFO: Ordering unsorted dataset with network shuffle\n", + "2024-04-09 11:22:15.902 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 11:23:18.294 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n", + " 'freq_index_dict' -> 'freq_index_dict_1'\n", + " 'faf_index_dict' -> 'faf_index_dict_1'\n", + " 'rf' -> 'rf_1'\n", + " 'age_index_dict' -> 'age_index_dict_1'\n", + " 'freq_meta' -> 'freq_meta_1'\n", + " 'age_distribution' -> 'age_distribution_1'\n", + " 'popmax_index_dict' -> 'popmax_index_dict_1'\n", + "2024-04-09 11:23:32.131 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 11:23:48.719 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 11:26:51.703 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n", + " 'freq_index_dict' -> 'freq_index_dict_1'\n", + " 'faf_index_dict' -> 'faf_index_dict_1'\n", + " 'rf' -> 'rf_1'\n", + " 'age_index_dict' -> 'age_index_dict_1'\n", + " 'freq_meta' -> 'freq_meta_1'\n", + " 'age_distribution' -> 'age_distribution_1'\n", + " 'popmax_index_dict' -> 'popmax_index_dict_1'\n", + "2024-04-09 11:27:02.820 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 11:27:18.412 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 11:29:02.997 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n", + " 'freq_index_dict' -> 'freq_index_dict_1'\n", + " 'faf_index_dict' -> 'faf_index_dict_1'\n", + " 'rf' -> 'rf_1'\n", + " 'age_index_dict' -> 'age_index_dict_1'\n", + " 
'freq_meta' -> 'freq_meta_1'\n", + " 'age_distribution' -> 'age_distribution_1'\n", + " 'popmax_index_dict' -> 'popmax_index_dict_1'\n", + "2024-04-09 11:29:19.516 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 11:29:41.224 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 11:33:19.553 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n", + " 'freq_index_dict' -> 'freq_index_dict_1'\n", + " 'faf_index_dict' -> 'faf_index_dict_1'\n", + " 'rf' -> 'rf_1'\n", + " 'age_index_dict' -> 'age_index_dict_1'\n", + " 'freq_meta' -> 'freq_meta_1'\n", + " 'age_distribution' -> 'age_distribution_1'\n", + " 'popmax_index_dict' -> 'popmax_index_dict_1'\n", + "2024-04-09 11:33:32.903 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 11:33:49.144 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 11:36:16.552 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n", + " 'freq_index_dict' -> 'freq_index_dict_1'\n", + " 'faf_index_dict' -> 'faf_index_dict_1'\n", + " 'rf' -> 'rf_1'\n", + " 'age_index_dict' -> 'age_index_dict_1'\n", + " 'freq_meta' -> 'freq_meta_1'\n", + " 'age_distribution' -> 'age_distribution_1'\n", + " 'popmax_index_dict' -> 'popmax_index_dict_1'\n", + "2024-04-09 11:36:28.952 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 11:36:46.964 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 11:40:40.837 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n", + " 'freq_index_dict' -> 'freq_index_dict_1'\n", + " 'faf_index_dict' -> 'faf_index_dict_1'\n", + " 'rf' -> 'rf_1'\n", + " 'age_index_dict' -> 'age_index_dict_1'\n", + " 'freq_meta' -> 'freq_meta_1'\n", + " 'age_distribution' -> 'age_distribution_1'\n", + " 'popmax_index_dict' -> 'popmax_index_dict_1'\n", + "2024-04-09 11:40:51.976 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 11:41:04.014 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 11:43:54.259 Hail: INFO: 
Table.join: renamed the following fields on the right to avoid name conflicts:\n", + " 'freq_index_dict' -> 'freq_index_dict_1'\n", + " 'faf_index_dict' -> 'faf_index_dict_1'\n", + " 'rf' -> 'rf_1'\n", + " 'age_index_dict' -> 'age_index_dict_1'\n", + " 'freq_meta' -> 'freq_meta_1'\n", + " 'age_distribution' -> 'age_distribution_1'\n", + " 'popmax_index_dict' -> 'popmax_index_dict_1'\n", + "2024-04-09 11:44:05.534 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 11:44:20.087 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 11:46:45.605 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n", + " 'freq_index_dict' -> 'freq_index_dict_1'\n", + " 'faf_index_dict' -> 'faf_index_dict_1'\n", + " 'rf' -> 'rf_1'\n", + " 'age_index_dict' -> 'age_index_dict_1'\n", + " 'freq_meta' -> 'freq_meta_1'\n", + " 'age_distribution' -> 'age_distribution_1'\n", + " 'popmax_index_dict' -> 'popmax_index_dict_1'\n", + "2024-04-09 11:46:59.301 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 11:47:13.181 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 11:49:50.219 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n", + " 'freq_index_dict' -> 'freq_index_dict_1'\n", + " 'faf_index_dict' -> 'faf_index_dict_1'\n", + " 'rf' -> 'rf_1'\n", + " 'age_index_dict' -> 'age_index_dict_1'\n", + " 'freq_meta' -> 'freq_meta_1'\n", + " 'age_distribution' -> 'age_distribution_1'\n", + " 'popmax_index_dict' -> 'popmax_index_dict_1'\n", + "2024-04-09 11:50:02.311 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 11:50:16.072 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 11:52:34.864 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n", + " 'freq_index_dict' -> 'freq_index_dict_1'\n", + " 'faf_index_dict' -> 'faf_index_dict_1'\n", + " 'rf' -> 'rf_1'\n", + " 'age_index_dict' -> 'age_index_dict_1'\n", + " 'freq_meta' -> 'freq_meta_1'\n", + " 'age_distribution' -> 
'age_distribution_1'\n", + " 'popmax_index_dict' -> 'popmax_index_dict_1'\n", + "2024-04-09 11:52:46.513 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 11:53:00.919 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 11:55:50.580 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n", + " 'freq_index_dict' -> 'freq_index_dict_1'\n", + " 'faf_index_dict' -> 'faf_index_dict_1'\n", + " 'rf' -> 'rf_1'\n", + " 'age_index_dict' -> 'age_index_dict_1'\n", + " 'freq_meta' -> 'freq_meta_1'\n", + " 'age_distribution' -> 'age_distribution_1'\n", + " 'popmax_index_dict' -> 'popmax_index_dict_1'\n", + "2024-04-09 11:56:02.124 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 11:56:16.907 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 11:59:15.457 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n", + " 'freq_index_dict' -> 'freq_index_dict_1'\n", + " 'faf_index_dict' -> 'faf_index_dict_1'\n", + " 'rf' -> 'rf_1'\n", + " 'age_index_dict' -> 'age_index_dict_1'\n", + " 'freq_meta' -> 'freq_meta_1'\n", + " 'age_distribution' -> 'age_distribution_1'\n", + " 'popmax_index_dict' -> 'popmax_index_dict_1'\n", + "2024-04-09 11:59:27.380 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 11:59:40.184 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 12:03:31.055 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n", + " 'freq_index_dict' -> 'freq_index_dict_1'\n", + " 'faf_index_dict' -> 'faf_index_dict_1'\n", + " 'rf' -> 'rf_1'\n", + " 'age_index_dict' -> 'age_index_dict_1'\n", + " 'freq_meta' -> 'freq_meta_1'\n", + " 'age_distribution' -> 'age_distribution_1'\n", + " 'popmax_index_dict' -> 'popmax_index_dict_1'\n", + "2024-04-09 12:03:42.554 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 12:03:53.915 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 12:06:33.123 Hail: INFO: Table.join: renamed the following fields on the right to avoid name 
conflicts:\n", + " 'freq_index_dict' -> 'freq_index_dict_1'\n", + " 'faf_index_dict' -> 'faf_index_dict_1'\n", + " 'rf' -> 'rf_1'\n", + " 'age_index_dict' -> 'age_index_dict_1'\n", + " 'freq_meta' -> 'freq_meta_1'\n", + " 'age_distribution' -> 'age_distribution_1'\n", + " 'popmax_index_dict' -> 'popmax_index_dict_1'\n", + "2024-04-09 12:06:45.813 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 12:06:58.340 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 12:09:23.153 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n", + " 'freq_index_dict' -> 'freq_index_dict_1'\n", + " 'faf_index_dict' -> 'faf_index_dict_1'\n", + " 'rf' -> 'rf_1'\n", + " 'age_index_dict' -> 'age_index_dict_1'\n", + " 'freq_meta' -> 'freq_meta_1'\n", + " 'age_distribution' -> 'age_distribution_1'\n", + " 'popmax_index_dict' -> 'popmax_index_dict_1'\n", + "2024-04-09 12:09:33.531 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 12:09:43.693 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 12:11:13.739 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n", + " 'freq_index_dict' -> 'freq_index_dict_1'\n", + " 'faf_index_dict' -> 'faf_index_dict_1'\n", + " 'rf' -> 'rf_1'\n", + " 'age_index_dict' -> 'age_index_dict_1'\n", + " 'freq_meta' -> 'freq_meta_1'\n", + " 'age_distribution' -> 'age_distribution_1'\n", + " 'popmax_index_dict' -> 'popmax_index_dict_1'\n", + "2024-04-09 12:11:26.087 Hail: INFO: Ordering unsorted dataset with network shuffle\n", + "2024-04-09 12:11:38.950 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 12:13:17.020 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n", + " 'freq_index_dict' -> 'freq_index_dict_1'\n", + " 'faf_index_dict' -> 'faf_index_dict_1'\n", + " 'rf' -> 'rf_1'\n", + " 'age_index_dict' -> 'age_index_dict_1'\n", + " 'freq_meta' -> 'freq_meta_1'\n", + " 'age_distribution' -> 'age_distribution_1'\n", + " 'popmax_index_dict' 
-> 'popmax_index_dict_1'\n", + "2024-04-09 12:13:29.727 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 12:13:45.213 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 12:20:03.844 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n", + " 'freq_index_dict' -> 'freq_index_dict_1'\n", + " 'faf_index_dict' -> 'faf_index_dict_1'\n", + " 'rf' -> 'rf_1'\n", + " 'age_index_dict' -> 'age_index_dict_1'\n", + " 'freq_meta' -> 'freq_meta_1'\n", + " 'age_distribution' -> 'age_distribution_1'\n", + " 'popmax_index_dict' -> 'popmax_index_dict_1'\n", + "2024-04-09 12:20:13.834 Hail: INFO: Coerced sorted dataset\n", + "2024-04-09 12:20:24.282 Hail: INFO: Coerced sorted dataset\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Region: 1:160185602-162185602 ; number of CSs: 1 ; log:\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/p5/4t9crp1563l792qz8xz_3x5h0000gq/T/ipykernel_46905/1319014212.py:29: FutureWarning:\n", + "\n", + "The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. 
To retain the old behavior, exclude the relevant entries before the concat operation.\n", + "\n", + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Region: 1:206577223-208577223 ; number of CSs: 1 ; log:\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Region: 10:10678309-12678309 ; number of CSs: 1 ; log:\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Region: 10:58886075-60886075 ; number of CSs: 1 ; log:\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Region: 10:79520381-81520381 ; number of CSs: 1 ; log:\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Region: 11:120564878-122564878 ; number of CSs: 2 ; log:\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Region: 11:46370397-48370397 ; number of CSs: 6 ; log:\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Region: 11:59328267-61328267 ; number of CSs: 1 ; log:\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Region: 11:85156833-87156833 ; number of CSs: 1 ; log:\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Region: 14:51924962-53924962 ; number of CSs: 1 ; log:\n" + ] + }, + { + "name": "stderr", + "output_type": 
"stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Region: 14:91472511-93472511 ; number of CSs: 2 ; log:\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Region: 15:49707194-51707194 ; number of CSs: 1 ; log:\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Region: 15:57730416-59730416 ; number of CSs: 1 ; log:\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Region: 15:62277703-64277703 ; number of CSs: 1 ; log:\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Region: 16:30115000-32115000 ; number of CSs: 1 ; log:\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Region: 17:4229833-6229833 ; number of CSs: 1 ; log:\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Region: 17:62483402-64483402 ; number of CSs: 1 ; log:\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Region: 19:50875-2050875 ; number of CSs: 2 ; log:\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Region: 19:43892009-45892009 ; number of CSs: 10 ; log:\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": 
"stream", + "text": [ + "Region: 19:50224706-52224706 ; number of CSs: 1 ; log:\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Region: 2:104749599-106749599 ; number of CSs: 1 ; log:\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Region: 2:126135234-128135234 ; number of CSs: 2 ; log:\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Region: 2:232117202-234117202 ; number of CSs: 1 ; log:\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Region: 2:64381229-66381229 ; number of CSs: 1 ; log:\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Region: 20:55423488-57423488 ; number of CSs: 1 ; log:\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Stage 1199:=====================> (3 + 5) / 8]\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Region: 21:25775872-27775872 ; number of CSs: 2 ; log:\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Region: 4:10025995-12025995 ; number of CSs: 1 ; log:\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Region: 6:39974457-41974457 ; number of CSs: 2 ; log:\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Region: 
6:46627419-48627419 ; number of CSs: 1 ; log:\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Stage 1403:===================================> (5 + 3) / 8]\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Region: 7:142410495-144410495 ; number of CSs: 2 ; log:\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Region: 7:99374211-101374211 ; number of CSs: 1 ; log:\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Stage 1505:> (0 + 8) / 8]\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Region: 8:26610986-28610986 ; number of CSs: 3 ; log:\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + } + ], + "source": [ + "df = slt_df.withColumn(\"row_index\", f.monotonically_increasing_id())\n", + "\n", + "columns = ['N_gwas', 'N_ld', 'N_overlap', 'N_outliers', 'N_imputed', 'N_final_to_fm', 'eleapsed_time']\n", + "logs = pd.DataFrame(columns=columns)\n", + "\n", + "for i in range(0,df.count()):\n", + " if i!=27:\n", + " one_row = df.filter(df.row_index == i).first()\n", + "\n", + " res=SusieFineMapperStep.susie_finemapper_one_studylocus_row_v2_dev(\n", + " GWAS=gwas1,\n", + " session=session,\n", + " study_locus_row=one_row,\n", + " study_index=study_index,\n", + " window= 2_000_000,\n", + " L=10,\n", + " susie_est_tausq=False,\n", + " run_carma=False,\n", + " run_sumstat_imputation=False,\n", + " carma_time_limit=600,\n", + " imputed_r2_threshold=0.8,\n", + " ld_score_threshold=4\n", + " )\n", + "\n", + " sl=res[\"study_locus\"]\n", + " #print(sl._df.withColumn(\"size\", f.size(sl._df[\"locus\"])).show())\n", + " print(\"Region: \",sl._df.collect()[0]['region'], \"; number of CSs: \",sl._df.count(), \"; log:\")\n", + " #print(res[\"log\"])\n", + " logs=pd.concat([logs,res[\"log\"]])" + ] + }, + { + "cell_type": 
"code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " N_gwas N_ld N_overlap N_outliers N_imputed N_final_to_fm eleapsed_time\n", + "0 7120 10431 6456 0 0 6456 56.839336\n", + "0 7128 8657 5769 0 0 5769 46.149004\n", + "0 9203 12106 7930 0 0 7930 93.531924\n", + "0 8351 10014 6995 0 0 6995 74.174323\n", + "0 9388 12551 8337 0 0 8337 120.602071\n", + "0 6560 8729 5758 0 0 5758 45.064894\n", + "0 5005 7701 3954 0 0 3954 55.229344\n", + "0 7012 8940 5815 0 0 5815 38.824251\n", + "0 8661 10303 7291 0 0 7291 68.802810\n", + "0 8081 9966 6771 0 0 6771 64.327746\n", + "0 8375 11213 7467 0 0 7467 141.808555\n", + "0 7377 9622 6369 0 0 6369 51.198955\n", + "0 8181 10864 7116 0 0 7116 49.033224\n", + "0 7976 10135 6704 0 0 6704 58.357743\n", + "0 3369 6542 2972 0 0 2972 17.138722\n", + "0 9006 12776 7969 0 0 7969 83.551872\n", + "0 4565 7018 3887 0 0 3887 37.801989\n", + "0 8278 13741 7852 0 0 7852 105.623508\n", + "0 7582 10448 6100 0 0 6100 58.572944\n", + "0 9145 12706 8242 0 0 8242 129.442009\n", + "0 8795 11311 7584 0 0 7584 86.336201\n", + "0 7852 10028 7041 0 0 7041 67.886754\n", + "0 8393 10850 7195 0 0 7195 72.375809\n", + "0 7639 10031 6520 0 0 6520 61.900982\n", + "0 8899 11509 7922 0 0 7922 86.535298\n", + "0 8908 11309 7889 0 0 7889 93.595320\n", + "0 10654 12663 8990 0 0 8990 133.390712\n", + "0 9073 10228 7398 0 0 7398 79.774280\n", + "0 8033 9785 6822 0 0 6822 63.950340\n", + "0 4570 5516 3162 0 0 3162 27.943313\n", + "0 5716 8785 4760 0 0 4760 30.024706\n", + "0 9243 10989 7869 0 0 7869 108.898056\n" + ] + } + ], + "source": [ + "pd.set_option('display.max_rows', None)\n", + "print(logs)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "6653.3125\n" + ] + } + ], + "source": [ + "summary = logs['N_overlap'].mean()\n", + "print(summary)" + ] + }, + { + "cell_type": 
"markdown", + "metadata": {}, + "source": [ + "# Fine-mapping of APOE locus" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "data": { + "text/plain": [ + "Row(studyId='GCST90012877', variantId='19_44892009_G_A', chromosome='19', position=44892009, beta=0.352722374032, sampleSize=None, pValueMantissa=1.9950000047683716, pValueExponent=-277, effectAlleleFrequencyFromSource=0.6050670146942139, standardError=0.00991069396551, studyLocusId=6814727764900576662, qualityControls=[], row_index=18)" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = slt_df.withColumn(\"row_index\", f.monotonically_increasing_id())\n", + "one_row = df.filter(df.row_index == 18).first()\n", + "one_row" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Without CARMA, without imputation" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-04-08 21:34:03.208 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n", + " 'freq_index_dict' -> 'freq_index_dict_1'\n", + " 'popmax_index_dict' -> 'popmax_index_dict_1'\n", + " 'age_distribution' -> 'age_distribution_1'\n", + " 'freq_meta' -> 'freq_meta_1'\n", + " 'rf' -> 'rf_1'\n", + " 'age_index_dict' -> 'age_index_dict_1'\n", + " 'faf_index_dict' -> 'faf_index_dict_1'\n", + "2024-04-08 21:34:19.253 Hail: INFO: Coerced sorted dataset\n", + "2024-04-08 21:34:34.941 Hail: INFO: Coerced sorted dataset\n", + "2024-04-08 21:37:16.576 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n", + " 'freq_index_dict' -> 'freq_index_dict_1'\n", + " 'popmax_index_dict' -> 
'popmax_index_dict_1'\n", + " 'age_distribution' -> 'age_distribution_1'\n", + " 'freq_meta' -> 'freq_meta_1'\n", + " 'rf' -> 'rf_1'\n", + " 'age_index_dict' -> 'age_index_dict_1'\n", + " 'faf_index_dict' -> 'faf_index_dict_1'\n", + "2024-04-08 21:37:28.867 Hail: INFO: Coerced sorted dataset\n", + "2024-04-08 21:37:44.733 Hail: INFO: Coerced sorted dataset\n", + "2024-04-08 21:52:03.198 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n", + " 'freq_index_dict' -> 'freq_index_dict_1'\n", + " 'popmax_index_dict' -> 'popmax_index_dict_1'\n", + " 'age_distribution' -> 'age_distribution_1'\n", + " 'freq_meta' -> 'freq_meta_1'\n", + " 'rf' -> 'rf_1'\n", + " 'age_index_dict' -> 'age_index_dict_1'\n", + " 'faf_index_dict' -> 'faf_index_dict_1'\n", + "2024-04-08 21:52:15.100 Hail: INFO: Coerced sorted dataset\n", + "2024-04-08 21:52:30.553 Hail: INFO: Coerced sorted dataset\n", + "2024-04-08 22:21:27.877 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n", + " 'freq_index_dict' -> 'freq_index_dict_1'\n", + " 'popmax_index_dict' -> 'popmax_index_dict_1'\n", + " 'age_distribution' -> 'age_distribution_1'\n", + " 'freq_meta' -> 'freq_meta_1'\n", + " 'rf' -> 'rf_1'\n", + " 'age_index_dict' -> 'age_index_dict_1'\n", + " 'faf_index_dict' -> 'faf_index_dict_1'\n", + "2024-04-08 22:21:40.137 Hail: INFO: Coerced sorted dataset\n", + "2024-04-08 22:21:55.249 Hail: INFO: Coerced sorted dataset\n", + "2024-04-08 22:23:21.795 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n", + " 'freq_index_dict' -> 'freq_index_dict_1'\n", + " 'popmax_index_dict' -> 'popmax_index_dict_1'\n", + " 'age_distribution' -> 'age_distribution_1'\n", + " 'freq_meta' -> 'freq_meta_1'\n", + " 'rf' -> 'rf_1'\n", + " 'age_index_dict' -> 'age_index_dict_1'\n", + " 'faf_index_dict' -> 'faf_index_dict_1'\n", + "2024-04-08 22:23:35.031 Hail: INFO: Coerced sorted dataset\n" + ] + }, + { + 
"name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+------------+--------------------+----------------+--------------------+---------------+----------+--------+-----------------+------------------+----+\n", + "| studyLocusId| studyId| region|credibleSetIndex| locus| variantId|chromosome|position|finemappingMethod|credibleSetlog10BF|size|\n", + "+--------------------+------------+--------------------+----------------+--------------------+---------------+----------+--------+-----------------+------------------+----+\n", + "|-6417720984991662128|GCST90012877|19:43892009-45892009| 1|[{19_44908684_T_C...|19_44908684_T_C| 19|44908684| SuSiE-inf| 2135.710824756712| 1|\n", + "|-1158278093713046158|GCST90012877|19:43892009-45892009| 2|[{19_44921094_A_T...|19_44921094_A_T| 19|44921094| SuSiE-inf| 955.4948390766739| 1|\n", + "| 8324745608044585165|GCST90012877|19:43892009-45892009| 3|[{19_44917947_C_T...|19_44917947_C_T| 19|44917947| SuSiE-inf| 690.0307437138443| 1|\n", + "|-1158278093713046158|GCST90012877|19:43892009-45892009| 4|[{19_44921094_A_T...|19_44921094_A_T| 19|44921094| SuSiE-inf|425.33378303492805| 1|\n", + "| 2240477384494621278|GCST90012877|19:43892009-45892009| 5|[{19_44891079_T_C...|19_44891079_T_C| 19|44891079| SuSiE-inf|395.31055398960274| 1|\n", + "| 1029535804909934921|GCST90012877|19:43892009-45892009| 6|[{19_44894695_T_C...|19_44894695_T_C| 19|44894695| SuSiE-inf| 333.9497424582455| 1|\n", + "|-6417720984991662128|GCST90012877|19:43892009-45892009| 7|[{19_44908684_T_C...|19_44908684_T_C| 19|44908684| SuSiE-inf| 261.573648706883| 1|\n", + "|-1158278093713046158|GCST90012877|19:43892009-45892009| 8|[{19_44921094_A_T...|19_44921094_A_T| 19|44921094| SuSiE-inf|186.66554412409607| 1|\n", + "|-7370952295217410456|GCST90012877|19:43892009-45892009| 9|[{19_44922505_T_G...|19_44922505_T_G| 19|44922505| SuSiE-inf| 78.06352464083552| 1|\n", + "| 3925446284512644964|GCST90012877|19:43892009-45892009| 
10|[{19_44913574_T_G...|19_44913574_T_G| 19|44913574| SuSiE-inf|55.346197523194675| 1|\n", + "+--------------------+------------+--------------------+----------------+--------------------+---------------+----------+--------+-----------------+------------------+----+\n", + "\n", + "None\n", + " N_gwas N_ld N_overlap N_outliers N_imputed N_final_to_fm eleapsed_time\n", + "0 7582 10448 6100 0 0 6100 66.112839\n" + ] + } + ], + "source": [ + "res=SusieFineMapperStep.susie_finemapper_one_studylocus_row_v2_dev(\n", + " GWAS=gwas1,\n", + " session=session,\n", + " study_locus_row=one_row,\n", + " study_index=study_index,\n", + " window= 2_000_000,\n", + " L=10,\n", + " susie_est_tausq=False,\n", + " run_carma=False,\n", + " run_sumstat_imputation=False,\n", + " carma_time_limit=1000,\n", + " imputed_r2_threshold=0.8,\n", + " ld_score_threshold=4\n", + ")\n", + "sl=res[\"study_locus\"]\n", + "print(sl._df.withColumn(\"size\", f.size(sl._df[\"locus\"])).show())\n", + "print(res[\"log\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### With CARMA, without imputation" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+------------+--------------------+----------------+--------------------+---------------+----------+--------+-----------------+------------------+----+\n", + "| studyLocusId| studyId| region|credibleSetIndex| locus| variantId|chromosome|position|finemappingMethod|credibleSetlog10BF|size|\n", + "+--------------------+------------+--------------------+----------------+--------------------+---------------+----------+--------+-----------------+------------------+----+\n", + "|-6417720984991662128|GCST90012877|19:43892009-45892009| 1|[{19_44908684_T_C...|19_44908684_T_C| 19|44908684| SuSiE-inf|1995.6574121818223| 1|\n", + 
"|-1158278093713046158|GCST90012877|19:43892009-45892009| 2|[{19_44921094_A_T...|19_44921094_A_T| 19|44921094| SuSiE-inf| 721.2637360279233| 1|\n", + "| 7760477027903907683|GCST90012877|19:43892009-45892009| 3|[{19_44911142_C_A...|19_44911142_C_A| 19|44911142| SuSiE-inf|248.39159334060017| 1|\n", + "|-1172224975892516254|GCST90012877|19:43892009-45892009| 4|[{19_44894255_A_C...|19_44894255_A_C| 19|44894255| SuSiE-inf| 96.16160678286879| 1|\n", + "| 8852802213660052283|GCST90012877|19:43892009-45892009| 5|[{19_44862190_G_A...|19_44862190_G_A| 19|44862190| SuSiE-inf| 55.80518621838019| 1|\n", + "|-1158278093713046158|GCST90012877|19:43892009-45892009| 6|[{19_44921094_A_T...|19_44921094_A_T| 19|44921094| SuSiE-inf| 53.24772075097935| 1|\n", + "|-6417720984991662128|GCST90012877|19:43892009-45892009| 7|[{19_44908684_T_C...|19_44908684_T_C| 19|44908684| SuSiE-inf| 45.65754067281976| 1|\n", + "|-6417720984991662128|GCST90012877|19:43892009-45892009| 8|[{19_44908684_T_C...|19_44908684_T_C| 19|44908684| SuSiE-inf| 39.3840804563262| 1|\n", + "| 6986973025714240626|GCST90012877|19:43892009-45892009| 9|[{19_44873060_C_G...|19_44873060_C_G| 19|44873060| SuSiE-inf| 38.54912041595975| 1|\n", + "| 3640651426400620880|GCST90012877|19:43892009-45892009| 10|[{19_44845920_G_C...|19_44845920_G_C| 19|44845920| SuSiE-inf|35.378479810047224| 2|\n", + "+--------------------+------------+--------------------+----------------+--------------------+---------------+----------+--------+-----------------+------------------+----+\n", + "\n", + "None\n", + " N_gwas N_ld N_overlap N_outliers N_imputed N_final_to_fm eleapsed_time\n", + "0 7582 10448 6100 151 0 5949 783.939477\n" + ] + } + ], + "source": [ + "res=SusieFineMapperStep.susie_finemapper_one_studylocus_row_v2_dev(\n", + " GWAS=gwas1,\n", + " session=session,\n", + " study_locus_row=one_row,\n", + " study_index=study_index,\n", + " window= 2_000_000,\n", + " L=10,\n", + " susie_est_tausq=False,\n", + " run_carma=True,\n", + " 
run_sumstat_imputation=False,\n", + " carma_time_limit=1000,\n", + " imputed_r2_threshold=0.8,\n", + " ld_score_threshold=4\n", + ")\n", + "sl=res[\"study_locus\"]\n", + "print(sl._df.withColumn(\"size\", f.size(sl._df[\"locus\"])).show())\n", + "print(res[\"log\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Without CARMA, with imputation" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-04-08 22:25:15.739 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n", + " 'age_index_dict' -> 'age_index_dict_1'\n", + " 'freq_index_dict' -> 'freq_index_dict_1'\n", + " 'faf_index_dict' -> 'faf_index_dict_1'\n", + " 'freq_meta' -> 'freq_meta_1'\n", + " 'rf' -> 'rf_1'\n", + " 'age_distribution' -> 'age_distribution_1'\n", + " 'popmax_index_dict' -> 'popmax_index_dict_1'\n", + "2024-04-08 22:25:30.625 Hail: INFO: Coerced sorted dataset\n", + "2024-04-08 22:25:46.020 Hail: INFO: Coerced sorted dataset\n", + "2024-04-08 22:32:35.094 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n", + " 'age_index_dict' -> 'age_index_dict_1'\n", + " 'freq_index_dict' -> 'freq_index_dict_1'\n", + " 'faf_index_dict' -> 'faf_index_dict_1'\n", + " 'freq_meta' -> 'freq_meta_1'\n", + " 'rf' -> 'rf_1'\n", + " 'age_distribution' -> 'age_distribution_1'\n", + " 'popmax_index_dict' -> 'popmax_index_dict_1'\n", + "2024-04-08 22:32:47.616 Hail: INFO: Coerced sorted dataset\n", + "2024-04-08 22:33:02.484 Hail: INFO: Coerced sorted dataset\n", + "2024-04-08 22:51:33.149 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n", + " 'age_index_dict' -> 'age_index_dict_1'\n", + " 'freq_index_dict' -> 'freq_index_dict_1'\n", + " 'faf_index_dict' -> 
'faf_index_dict_1'\n", + " 'freq_meta' -> 'freq_meta_1'\n", + " 'rf' -> 'rf_1'\n", + " 'age_distribution' -> 'age_distribution_1'\n", + " 'popmax_index_dict' -> 'popmax_index_dict_1'\n", + "2024-04-08 22:51:45.708 Hail: INFO: Coerced sorted dataset\n", + "2024-04-08 22:52:00.731 Hail: INFO: Coerced sorted dataset\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+------------+--------------------+----------------+--------------------+-----------------+----------+--------+-----------------+------------------+----+\n", + "| studyLocusId| studyId| region|credibleSetIndex| locus| variantId|chromosome|position|finemappingMethod|credibleSetlog10BF|size|\n", + "+--------------------+------------+--------------------+----------------+--------------------+-----------------+----------+--------+-----------------+------------------+----+\n", + "|-1350283509846281677|GCST90012877|19:43892009-45892009| 1|[{19_44909967_TGG...|19_44909967_TGG_T| 19|44909967| SuSiE-inf| 2310.665662473933| 1|\n", + "|-1158278093713046158|GCST90012877|19:43892009-45892009| 2|[{19_44921094_A_T...| 19_44921094_A_T| 19|44921094| SuSiE-inf| 903.6138342773536| 1|\n", + "| 8324745608044585165|GCST90012877|19:43892009-45892009| 3|[{19_44917947_C_T...| 19_44917947_C_T| 19|44917947| SuSiE-inf| 700.3080514793324| 1|\n", + "|-1158278093713046158|GCST90012877|19:43892009-45892009| 4|[{19_44921094_A_T...| 19_44921094_A_T| 19|44921094| SuSiE-inf|431.79459690536703| 1|\n", + "| 1029535804909934921|GCST90012877|19:43892009-45892009| 5|[{19_44894695_T_C...| 19_44894695_T_C| 19|44894695| SuSiE-inf|402.50010763388156| 1|\n", + "|-1158278093713046158|GCST90012877|19:43892009-45892009| 6|[{19_44921094_A_T...| 19_44921094_A_T| 19|44921094| SuSiE-inf|225.93101254172214| 1|\n", + "| -60207296485035224|GCST90012877|19:43892009-45892009| 7|[{19_44888997_C_T...| 19_44888997_C_T| 19|44888997| SuSiE-inf| 191.4947272198485| 1|\n", + 
"|-1350283509846281677|GCST90012877|19:43892009-45892009| 8|[{19_44909967_TGG...|19_44909967_TGG_T| 19|44909967| SuSiE-inf|105.04460057482835| 1|\n", + "|-4078755027603845519|GCST90012877|19:43892009-45892009| 9|[{19_44918393_G_A...| 19_44918393_G_A| 19|44918393| SuSiE-inf| 63.30243818120949| 1|\n", + "| 3925446284512644964|GCST90012877|19:43892009-45892009| 10|[{19_44913574_T_G...| 19_44913574_T_G| 19|44913574| SuSiE-inf|54.079307276192694| 1|\n", + "+--------------------+------------+--------------------+----------------+--------------------+-----------------+----------+--------+-----------------+------------------+----+\n", + "\n", + "None\n", + " N_gwas N_ld N_overlap N_outliers N_imputed N_final_to_fm eleapsed_time\n", + "0 7582 10448 6100 0 681 6781 334.328722\n" + ] + } + ], + "source": [ + "res=SusieFineMapperStep.susie_finemapper_one_studylocus_row_v2_dev(\n", + " GWAS=gwas1,\n", + " session=session,\n", + " study_locus_row=one_row,\n", + " study_index=study_index,\n", + " window= 2_000_000,\n", + " L=10,\n", + " susie_est_tausq=False,\n", + " run_carma=False,\n", + " run_sumstat_imputation=True,\n", + " carma_time_limit=10000,\n", + " imputed_r2_threshold=0.8,\n", + " ld_score_threshold=4\n", + ")\n", + "sl=res[\"study_locus\"]\n", + "print(sl._df.withColumn(\"size\", f.size(sl._df[\"locus\"])).show())\n", + "print(res[\"log\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### With CARMA, with imputation" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+------------+--------------------+----------------+--------------------+---------------+----------+--------+-----------------+------------------+----+\n", + "| studyLocusId| studyId| region|credibleSetIndex| locus| 
variantId|chromosome|position|finemappingMethod|credibleSetlog10BF|size|\n", + "+--------------------+------------+--------------------+----------------+--------------------+---------------+----------+--------+-----------------+------------------+----+\n", + "| 3030414938485808431|GCST90012877|19:43892009-45892009| 1|[{19_44895007_C_T...|19_44895007_C_T| 19|44895007| SuSiE-inf|2680.9099711333456| 1|\n", + "|-2201142982564351776|GCST90012877|19:43892009-45892009| 2|[{19_44900601_A_G...|19_44900601_A_G| 19|44900601| SuSiE-inf| 2103.873956796136| 1|\n", + "|-6417720984991662128|GCST90012877|19:43892009-45892009| 3|[{19_44908684_T_C...|19_44908684_T_C| 19|44908684| SuSiE-inf|1968.8126348567705| 1|\n", + "|-1158278093713046158|GCST90012877|19:43892009-45892009| 4|[{19_44921094_A_T...|19_44921094_A_T| 19|44921094| SuSiE-inf|1089.9033376410644| 1|\n", + "| 7760477027903907683|GCST90012877|19:43892009-45892009| 5|[{19_44911142_C_A...|19_44911142_C_A| 19|44911142| SuSiE-inf|188.55568384844716| 1|\n", + "|-6417720984991662128|GCST90012877|19:43892009-45892009| 6|[{19_44908684_T_C...|19_44908684_T_C| 19|44908684| SuSiE-inf| 83.57344085238768| 1|\n", + "|-1158278093713046158|GCST90012877|19:43892009-45892009| 7|[{19_44921094_A_T...|19_44921094_A_T| 19|44921094| SuSiE-inf| 82.01732099119907| 1|\n", + "| 8852802213660052283|GCST90012877|19:43892009-45892009| 8|[{19_44862190_G_A...|19_44862190_G_A| 19|44862190| SuSiE-inf| 45.92126992319222| 1|\n", + "|-1611304699666037367|GCST90012877|19:43892009-45892009| 9|[{19_44821259_C_T...|19_44821259_C_T| 19|44821259| SuSiE-inf|37.363613067645254| 1|\n", + "| 3556335645959991344|GCST90012877|19:43892009-45892009| 10|[{19_45017701_G_T...|19_45017701_G_T| 19|45017701| SuSiE-inf|30.736039473626658| 4|\n", + "+--------------------+------------+--------------------+----------------+--------------------+---------------+----------+--------+-----------------+------------------+----+\n", + "\n", + "None\n", + " N_gwas N_ld N_overlap N_outliers 
N_imputed N_final_to_fm eleapsed_time\n", + "0 7582 10448 6100 152 715 6663 1036.467428\n" + ] + } + ], + "source": [ + "res=SusieFineMapperStep.susie_finemapper_one_studylocus_row_v2_dev(\n", + " GWAS=gwas1,\n", + " session=session,\n", + " study_locus_row=one_row,\n", + " study_index=study_index,\n", + " window= 2_000_000,\n", + " L=10,\n", + " susie_est_tausq=False,\n", + " run_carma=True,\n", + " run_sumstat_imputation=True,\n", + " carma_time_limit=10000,\n", + " imputed_r2_threshold=0.8,\n", + " ld_score_threshold=4\n", + ")\n", + "sl=res[\"study_locus\"]\n", + "print(sl._df.withColumn(\"size\", f.size(sl._df[\"locus\"])).show())\n", + "print(res[\"log\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### With CARMA, with imputation, with estimation of infinitesimal effects (susie_est_tausq=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+------------+--------------------+----------------+--------------------+---------------+----------+--------+-----------------+------------------+----+\n", + "| studyLocusId| studyId| region|credibleSetIndex| locus| variantId|chromosome|position|finemappingMethod|credibleSetlog10BF|size|\n", + "+--------------------+------------+--------------------+----------------+--------------------+---------------+----------+--------+-----------------+------------------+----+\n", + "|-6417720984991662128|GCST90012877|19:43892009-45892009| 1|[{19_44908684_T_C...|19_44908684_T_C| 19|44908684| SuSiE-inf| 1105.297844890198| 1|\n", + "|-1158278093713046158|GCST90012877|19:43892009-45892009| 2|[{19_44921094_A_T...|19_44921094_A_T| 19|44921094| SuSiE-inf|1042.0949995382389| 1|\n", + "|-2201142982564351776|GCST90012877|19:43892009-45892009| 3|[{19_44900601_A_G...|19_44900601_A_G| 19|44900601| 
SuSiE-inf| 760.0654878716481| 1|\n", + "| 3030414938485808431|GCST90012877|19:43892009-45892009| 4|[{19_44895007_C_T...|19_44895007_C_T| 19|44895007| SuSiE-inf| 388.8928142354868| 1|\n", + "| -251577639520141451|GCST90012877|19:43892009-45892009| 5|[{19_44899220_C_T...|19_44899220_C_T| 19|44899220| SuSiE-inf| 259.5645544847559| 1|\n", + "| 7760477027903907683|GCST90012877|19:43892009-45892009| 6|[{19_44911142_C_A...|19_44911142_C_A| 19|44911142| SuSiE-inf|231.66277856324325| 1|\n", + "| 4133344777320628094|GCST90012877|19:43892009-45892009| 7|[{19_44904531_G_A...|19_44904531_G_A| 19|44904531| SuSiE-inf|143.22657752219786| 1|\n", + "|-1764089385585984368|GCST90012877|19:43892009-45892009| 8|[{19_44893642_T_C...|19_44893642_T_C| 19|44893642| SuSiE-inf| 87.72507299242906| 1|\n", + "|-1158278093713046158|GCST90012877|19:43892009-45892009| 9|[{19_44921094_A_T...|19_44921094_A_T| 19|44921094| SuSiE-inf| 71.4171763690986| 1|\n", + "|-6417720984991662128|GCST90012877|19:43892009-45892009| 10|[{19_44908684_T_C...|19_44908684_T_C| 19|44908684| SuSiE-inf| 43.36071977593145| 1|\n", + "+--------------------+------------+--------------------+----------------+--------------------+---------------+----------+--------+-----------------+------------------+----+\n", + "\n", + "None\n", + " N_gwas N_ld N_overlap N_outliers N_imputed N_final_to_fm eleapsed_time\n", + "0 7582 10448 6100 151 720 6669 1229.515921\n" + ] + } + ], + "source": [ + "res=SusieFineMapperStep.susie_finemapper_one_studylocus_row_v2_dev(\n", + " GWAS=gwas1,\n", + " session=session,\n", + " study_locus_row=one_row,\n", + " study_index=study_index,\n", + " window= 2_000_000,\n", + " L=10,\n", + " susie_est_tausq=True,\n", + " run_carma=True,\n", + " run_sumstat_imputation=True,\n", + " carma_time_limit=10000,\n", + " imputed_r2_threshold=0.8,\n", + " ld_score_threshold=4\n", + ")\n", + "sl=res[\"study_locus\"]\n", + "print(sl._df.withColumn(\"size\", f.size(sl._df[\"locus\"])).show())\n", + "print(res[\"log\"])" + ] 
+ }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Fine-mapping of MHC region using 1Mb window" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "data": { + "text/plain": [ + "Row(studyId='GCST90012877', variantId='6_32592248_A_G', chromosome='6', position=32592248, beta=-0.103604380043, sampleSize=None, pValueMantissa=2.877000093460083, pValueExponent=-15, effectAlleleFrequencyFromSource=0.21086899936199188, standardError=0.0131209374957, studyLocusId=5718491981995302674, qualityControls=[], row_index=27)" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = slt_df.withColumn(\"row_index\", f.monotonically_increasing_id())\n", + "one_row = df.filter(df.row_index == 27).first()\n", + "one_row" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Stage 1541:==========================================> (6 + 2) / 8]\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+------------+-------------------+----------------+--------------------+---------------+----------+--------+-----------------+------------------+----+\n", + "| studyLocusId| studyId| region|credibleSetIndex| locus| variantId|chromosome|position|finemappingMethod|credibleSetlog10BF|size|\n", + "+--------------------+------------+-------------------+----------------+--------------------+---------------+----------+--------+-----------------+------------------+----+\n", + "|-3446214959021623473|GCST90012877|6:32092248-33092248| 1|[{6_32557997_G_A,...| 6_32557997_G_A| 6|32557997| SuSiE-inf| 4323.908142062261| 1|\n", + "| -439738150050389281|GCST90012877|6:32092248-33092248| 2|[{6_32558002_G_T,...| 6_32558002_G_T| 6|32558002| 
SuSiE-inf|3428.8321277074765| 1|\n", + "| 5831857384024844796|GCST90012877|6:32092248-33092248| 3|[{6_32557987_C_A,...| 6_32557987_C_A| 6|32557987| SuSiE-inf|1699.8680349563335| 1|\n", + "|-1087057043201011402|GCST90012877|6:32092248-33092248| 4|[{6_32557977_T_C,...| 6_32557977_T_C| 6|32557977| SuSiE-inf| 965.9753305300063| 1|\n", + "| 6919234179916081233|GCST90012877|6:32092248-33092248| 5|[{6_32649735_C_T,...| 6_32649735_C_T| 6|32649735| SuSiE-inf| 369.9698233117616| 1|\n", + "| 7781006900918060896|GCST90012877|6:32092248-33092248| 6|[{6_32652962_C_T,...| 6_32652962_C_T| 6|32652962| SuSiE-inf| 328.6834447478274| 1|\n", + "|-7512794333418509403|GCST90012877|6:32092248-33092248| 7|[{6_32591896_T_G,...| 6_32591896_T_G| 6|32591896| SuSiE-inf|326.52393082050276| 1|\n", + "| 4056478719932360430|GCST90012877|6:32092248-33092248| 8|[{6_32621456_GC_G...|6_32621456_GC_G| 6|32621456| SuSiE-inf|263.48518383939836| 1|\n", + "| 8380896542014789747|GCST90012877|6:32092248-33092248| 9|[{6_32648039_G_A,...| 6_32648039_G_A| 6|32648039| SuSiE-inf|176.62947310155317| 1|\n", + "| 9053545161380162736|GCST90012877|6:32092248-33092248| 10|[{6_32700030_C_T,...| 6_32700030_C_T| 6|32700030| SuSiE-inf| 69.83226092797517| 2|\n", + "+--------------------+------------+-------------------+----------------+--------------------+---------------+----------+--------+-----------------+------------------+----+\n", + "\n", + "None\n", + " N_gwas N_ld N_overlap N_outliers N_imputed N_final_to_fm eleapsed_time\n", + "0 19311 22318 13188 0 0 13188 298.784288\n" + ] + } + ], + "source": [ + "res=SusieFineMapperStep.susie_finemapper_one_studylocus_row_v2_dev(\n", + " GWAS=gwas1,\n", + " session=session,\n", + " study_locus_row=one_row,\n", + " study_index=study_index,\n", + " window= 1_000_000,\n", + " L=10,\n", + " susie_est_tausq=False,\n", + " run_carma=False,\n", + " run_sumstat_imputation=False,\n", + " carma_time_limit=10000,\n", + " imputed_r2_threshold=0.8,\n", + " ld_score_threshold=4\n", + ")\n", 
+ "sl=res[\"study_locus\"]\n", + "print(sl._df.withColumn(\"size\", f.size(sl._df[\"locus\"])).show())\n", + "print(res[\"log\"])" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "gentropy-krNFZEZg-py3.10", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.8" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From ecd806331813bb28eada4d4a56c983e5a419b833 Mon Sep 17 00:00:00 2001 From: Daniel-Considine <113430683+Daniel-Considine@users.noreply.github.com> Date: Wed, 10 Apr 2024 14:41:07 +0100 Subject: [PATCH 3/7] fix: removing all duplicated variants in sumstats for finemapping functions (#574) --- src/gentropy/susie_finemapper.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/gentropy/susie_finemapper.py b/src/gentropy/susie_finemapper.py index db62fe359..d37298436 100644 --- a/src/gentropy/susie_finemapper.py +++ b/src/gentropy/susie_finemapper.py @@ -81,6 +81,10 @@ def susie_finemapper_one_studylocus_row( .filter(f.col("studyId") == studyId) .filter(f.col("z").isNotNull()) ) + # Remove ALL duplicated variants from GWAS DataFrame - we don't know which is correct + variant_counts = gwas_df.groupBy("variantId").count() + unique_variants = variant_counts.filter(f.col("count") == 1) + gwas_df = gwas_df.join(unique_variants, on="variantId", how="left_semi") ld_index = ( GnomADLDMatrix() @@ -320,6 +324,10 @@ def susie_finemapper_ss_gathered( .withColumn("position", f.split(f.col("variantId"), "_")[1]) .filter(f.col("z").isNotNull()) ) + # Remove ALL duplicated variants from GWAS DataFrame - we don't know which is correct + variant_counts = gwas_df.groupBy("variantId").count() + unique_variants = variant_counts.filter(f.col("count") == 1) + gwas_df = gwas_df.join(unique_variants, on="variantId", 
how="left_semi") ld_index = ( GnomADLDMatrix() From 7ed4703fcb3589af82f9fc76425f0691a080dee4 Mon Sep 17 00:00:00 2001 From: Yakov Date: Sun, 14 Apr 2024 16:42:34 +0100 Subject: [PATCH 4/7] feat: adding notebook for mapping EFOs for the FinnGen study index (#575) --- notebooks/Mapping_EFO_finngen.ipynb | 768 ++++++++++++++++++++++++++++ 1 file changed, 768 insertions(+) create mode 100644 notebooks/Mapping_EFO_finngen.ipynb diff --git a/notebooks/Mapping_EFO_finngen.ipynb b/notebooks/Mapping_EFO_finngen.ipynb new file mode 100644 index 000000000..9bd82d8d4 --- /dev/null +++ b/notebooks/Mapping_EFO_finngen.ipynb @@ -0,0 +1,768 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Mapping EFOs for the FinnGen study index using the old study index from the previous prod" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This notebook adds EFOs from the previous prod version of study_index to the new FinnGen study_index using trait name as a matching key.\n", + "\n", + "The resulting study index has 1542 rows with non-null EFOs (out of 2408 rows).\n", + "\n", + "The new study index is saved here:\n", + "\"gs://genetics-portal-dev-analysis/yt4/study_index_finngen_with_efo\"" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Your browser has been opened to visit:\n", + "\n", + " 
https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=764086051850-6qr4p6gpi6hn506pt8ejuq83di341hur.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A8085%2F&scope=openid+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fuserinfo.email+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcloud-platform+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fsqlservice.login+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Faccounts.reauth&state=XHb8Uk43SsVjvFRqwgrX4Tgg2tTOHS&access_type=offline&code_challenge=OkiqDAkHXDGEgJQbX8r0ZYKfZ7gcgfXS8mfZc5a913Y&code_challenge_method=S256\n", + "\n", + "\n", + "Credentials saved to file: [/Users/yt4/.config/gcloud/application_default_credentials.json]\n", + "\n", + "These credentials will be used by any library that requests Application Default Credentials (ADC).\n", + "\n", + "Quota project \"open-targets-genetics-dev\" was added to ADC which can be used by Google client libraries for billing and quota. Note that some services may still bill the project owning the resource.\n" + ] + } + ], + "source": [ + "!gcloud auth application-default login" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " Loading BokehJS ...\n", + "
\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/javascript": "(function(root) {\n function now() {\n return new Date();\n }\n\n const force = true;\n\n if (typeof root._bokeh_onload_callbacks === \"undefined\" || force === true) {\n root._bokeh_onload_callbacks = [];\n root._bokeh_is_loading = undefined;\n }\n\nconst JS_MIME_TYPE = 'application/javascript';\n const HTML_MIME_TYPE = 'text/html';\n const EXEC_MIME_TYPE = 'application/vnd.bokehjs_exec.v0+json';\n const CLASS_NAME = 'output_bokeh rendered_html';\n\n /**\n * Render data to the DOM node\n */\n function render(props, node) {\n const script = document.createElement(\"script\");\n node.appendChild(script);\n }\n\n /**\n * Handle when an output is cleared or removed\n */\n function handleClearOutput(event, handle) {\n function drop(id) {\n const view = Bokeh.index.get_by_id(id)\n if (view != null) {\n view.model.document.clear()\n Bokeh.index.delete(view)\n }\n }\n\n const cell = handle.cell;\n\n const id = cell.output_area._bokeh_element_id;\n const server_id = cell.output_area._bokeh_server_id;\n\n // Clean up Bokeh references\n if (id != null) {\n drop(id)\n }\n\n if (server_id !== undefined) {\n // Clean up Bokeh references\n const cmd_clean = \"from bokeh.io.state import curstate; print(curstate().uuid_to_server['\" + server_id + \"'].get_sessions()[0].document.roots[0]._id)\";\n cell.notebook.kernel.execute(cmd_clean, {\n iopub: {\n output: function(msg) {\n const id = msg.content.text.trim()\n drop(id)\n }\n }\n });\n // Destroy server and session\n const cmd_destroy = \"import bokeh.io.notebook as ion; ion.destroy_server('\" + server_id + \"')\";\n cell.notebook.kernel.execute(cmd_destroy);\n }\n }\n\n /**\n * Handle when a new output is added\n */\n function handleAddOutput(event, handle) {\n const output_area = handle.output_area;\n const output = handle.output;\n\n // limit handleAddOutput to display_data with EXEC_MIME_TYPE content only\n if 
((output.output_type != \"display_data\") || (!Object.prototype.hasOwnProperty.call(output.data, EXEC_MIME_TYPE))) {\n return\n }\n\n const toinsert = output_area.element.find(\".\" + CLASS_NAME.split(' ')[0]);\n\n if (output.metadata[EXEC_MIME_TYPE][\"id\"] !== undefined) {\n toinsert[toinsert.length - 1].firstChild.textContent = output.data[JS_MIME_TYPE];\n // store reference to embed id on output_area\n output_area._bokeh_element_id = output.metadata[EXEC_MIME_TYPE][\"id\"];\n }\n if (output.metadata[EXEC_MIME_TYPE][\"server_id\"] !== undefined) {\n const bk_div = document.createElement(\"div\");\n bk_div.innerHTML = output.data[HTML_MIME_TYPE];\n const script_attrs = bk_div.children[0].attributes;\n for (let i = 0; i < script_attrs.length; i++) {\n toinsert[toinsert.length - 1].firstChild.setAttribute(script_attrs[i].name, script_attrs[i].value);\n toinsert[toinsert.length - 1].firstChild.textContent = bk_div.children[0].textContent\n }\n // store reference to server id on output_area\n output_area._bokeh_server_id = output.metadata[EXEC_MIME_TYPE][\"server_id\"];\n }\n }\n\n function register_renderer(events, OutputArea) {\n\n function append_mime(data, metadata, element) {\n // create a DOM node to render to\n const toinsert = this.create_output_subarea(\n metadata,\n CLASS_NAME,\n EXEC_MIME_TYPE\n );\n this.keyboard_manager.register_events(toinsert);\n // Render to node\n const props = {data: data, metadata: metadata[EXEC_MIME_TYPE]};\n render(props, toinsert[toinsert.length - 1]);\n element.append(toinsert);\n return toinsert\n }\n\n /* Handle when an output is cleared or removed */\n events.on('clear_output.CodeCell', handleClearOutput);\n events.on('delete.Cell', handleClearOutput);\n\n /* Handle when a new output is added */\n events.on('output_added.OutputArea', handleAddOutput);\n\n /**\n * Register the mime type and append_mime function with output_area\n */\n OutputArea.prototype.register_mime_type(EXEC_MIME_TYPE, append_mime, {\n /* Is output safe? 
*/\n safe: true,\n /* Index of renderer in `output_area.display_order` */\n index: 0\n });\n }\n\n // register the mime type if in Jupyter Notebook environment and previously unregistered\n if (root.Jupyter !== undefined) {\n const events = require('base/js/events');\n const OutputArea = require('notebook/js/outputarea').OutputArea;\n\n if (OutputArea.prototype.mime_types().indexOf(EXEC_MIME_TYPE) == -1) {\n register_renderer(events, OutputArea);\n }\n }\n if (typeof (root._bokeh_timeout) === \"undefined\" || force === true) {\n root._bokeh_timeout = Date.now() + 5000;\n root._bokeh_failed_load = false;\n }\n\n const NB_LOAD_WARNING = {'data': {'text/html':\n \"
\\n\"+\n \"

\\n\"+\n \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n \"

\\n\"+\n \"
    \\n\"+\n \"
  • re-rerun `output_notebook()` to attempt to load from CDN again, or
  • \\n\"+\n \"
  • use INLINE resources instead, as so:
  • \\n\"+\n \"
\\n\"+\n \"\\n\"+\n \"from bokeh.resources import INLINE\\n\"+\n \"output_notebook(resources=INLINE)\\n\"+\n \"\\n\"+\n \"
\"}};\n\n function display_loaded() {\n const el = document.getElementById(\"c92e22c1-acc6-4a9b-8a5a-529fec6e60ae\");\n if (el != null) {\n el.textContent = \"BokehJS is loading...\";\n }\n if (root.Bokeh !== undefined) {\n if (el != null) {\n el.textContent = \"BokehJS \" + root.Bokeh.version + \" successfully loaded.\";\n }\n } else if (Date.now() < root._bokeh_timeout) {\n setTimeout(display_loaded, 100)\n }\n }\n\n function run_callbacks() {\n try {\n root._bokeh_onload_callbacks.forEach(function(callback) {\n if (callback != null)\n callback();\n });\n } finally {\n delete root._bokeh_onload_callbacks\n }\n console.debug(\"Bokeh: all callbacks have finished\");\n }\n\n function load_libs(css_urls, js_urls, callback) {\n if (css_urls == null) css_urls = [];\n if (js_urls == null) js_urls = [];\n\n root._bokeh_onload_callbacks.push(callback);\n if (root._bokeh_is_loading > 0) {\n console.debug(\"Bokeh: BokehJS is being loaded, scheduling callback at\", now());\n return null;\n }\n if (js_urls == null || js_urls.length === 0) {\n run_callbacks();\n return null;\n }\n console.debug(\"Bokeh: BokehJS not loaded, scheduling load and callback at\", now());\n root._bokeh_is_loading = css_urls.length + js_urls.length;\n\n function on_load() {\n root._bokeh_is_loading--;\n if (root._bokeh_is_loading === 0) {\n console.debug(\"Bokeh: all BokehJS libraries/stylesheets loaded\");\n run_callbacks()\n }\n }\n\n function on_error(url) {\n console.error(\"failed to load \" + url);\n }\n\n for (let i = 0; i < css_urls.length; i++) {\n const url = css_urls[i];\n const element = document.createElement(\"link\");\n element.onload = on_load;\n element.onerror = on_error.bind(null, url);\n element.rel = \"stylesheet\";\n element.type = \"text/css\";\n element.href = url;\n console.debug(\"Bokeh: injecting link tag for BokehJS stylesheet: \", url);\n document.body.appendChild(element);\n }\n\n for (let i = 0; i < js_urls.length; i++) {\n const url = js_urls[i];\n const element = 
document.createElement('script');\n element.onload = on_load;\n element.onerror = on_error.bind(null, url);\n element.async = false;\n element.src = url;\n console.debug(\"Bokeh: injecting script tag for BokehJS library: \", url);\n document.head.appendChild(element);\n }\n };\n\n function inject_raw_css(css) {\n const element = document.createElement(\"style\");\n element.appendChild(document.createTextNode(css));\n document.body.appendChild(element);\n }\n\n const js_urls = [\"https://cdn.bokeh.org/bokeh/release/bokeh-3.3.0.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-gl-3.3.0.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-widgets-3.3.0.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-tables-3.3.0.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-mathjax-3.3.0.min.js\"];\n const css_urls = [];\n\n const inline_js = [ function(Bokeh) {\n Bokeh.set_log_level(\"info\");\n },\nfunction(Bokeh) {\n }\n ];\n\n function run_inline_js() {\n if (root.Bokeh !== undefined || force === true) {\n for (let i = 0; i < inline_js.length; i++) {\n inline_js[i].call(root, root.Bokeh);\n }\nif (force === true) {\n display_loaded();\n }} else if (Date.now() < root._bokeh_timeout) {\n setTimeout(run_inline_js, 100);\n } else if (!root._bokeh_failed_load) {\n console.log(\"Bokeh: BokehJS failed to load within specified timeout.\");\n root._bokeh_failed_load = true;\n } else if (force !== true) {\n const cell = $(document.getElementById(\"c92e22c1-acc6-4a9b-8a5a-529fec6e60ae\")).parents('.cell').data().cell;\n cell.output_area.append_execute_result(NB_LOAD_WARNING)\n }\n }\n\n if (root._bokeh_is_loading === 0) {\n console.debug(\"Bokeh: BokehJS loaded, going straight to plotting\");\n run_inline_js();\n } else {\n load_libs(css_urls, js_urls, function() {\n console.debug(\"Bokeh: BokehJS plotting callback run at\", now());\n run_inline_js();\n });\n }\n}(window));", + "application/vnd.bokehjs_load.v0+json": "" + }, + "metadata": {}, + "output_type": 
"display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "24/04/14 16:03:28 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Setting default log level to \"WARN\".\n", + "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n", + "pip-installed Hail requires additional configuration options in Spark referring\n", + " to the path to the Hail Python module directory HAIL_DIR,\n", + " e.g. /path/to/python/site-packages/hail:\n", + " spark.jars=HAIL_DIR/backend/hail-all-spark.jar\n", + " spark.driver.extraClassPath=HAIL_DIR/backend/hail-all-spark.jar\n", + " spark.executor.extraClassPath=./hail-all-spark.jarRunning on Apache Spark version 3.3.4\n", + "SparkUI available at http://192.168.0.232:4040\n", + "Welcome to\n", + " __ __ <>__\n", + " / /_/ /__ __/ /\n", + " / __ / _ `/ / /\n", + " /_/ /_/\\_,_/_/_/ version 0.2.127-bb535cd096c5\n", + "LOGGING: writing to /dev/null\n" + ] + } + ], + "source": [ + "import os\n", + "import hail as hl\n", + "import pyspark.sql.functions as f\n", + "import pandas as pd\n", + "pd.set_option('display.max_colwidth', None)\n", + "pd.set_option('display.expand_frame_repr', False)\n", + "\n", + "from gentropy.common.session import Session\n", + "from gentropy.dataset.study_index import StudyIndex\n", + "\n", + "\n", + "hail_dir = os.path.dirname(hl.__file__)\n", + "session = Session(hail_home=hail_dir, start_hail=True, extended_spark_conf={\"spark.driver.memory\": \"12g\",\n", + " \"spark.kryoserializer.buffer.max\": \"500m\",\"spark.driver.maxResultSize\":\"3g\"})\n", + "hl.init(sc=session.spark.sparkContext, log=\"/dev/null\")" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + 
"path_si=\"gs://genetics_etl_python_playground/releases/24.03/study_index/finngen/study_index\"\n", + "path_si_old=\"gs://genetics-portal-dev-analysis/yt4/study_index.csv\"" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + } + ], + "source": [ + "si_old=session.spark.read.csv(path_si_old, header=True,sep=\"\\t\")\n", + "si_new=StudyIndex.from_parquet(session=session, path=path_si)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+-------------------+--------------------+-------+---------+-------------+----+----------+----------+-----------+---------+------------+--------------+-------+--------------------+-----------------+--------------------+\n", + "| study_id| ancestry_initial|ancestry_replication|n_cases|n_initial|n_replication|pmid|pub_author| pub_date|pub_journal|pub_title|has_sumstats|num_assoc_loci| source| trait_reported| trait_efos| trait_category|\n", + "+--------------------+-------------------+--------------------+-------+---------+-------------+----+----------+----------+-----------+---------+------------+--------------+-------+--------------------+-----------------+--------------------+\n", + "|FINNGEN_R6_M13_MU...|['European=253458']| []| 108.0| 253458| 0.0|null|FINNGEN_R6|2022-01-24| null| null| True| 0|FINNGEN|Multifocal fibros...|['MONDO_0009230']|immune system dis...|\n", + "|FINNGEN_R6_M13_MU...|['European=199528']| []| 1804.0| 199528| 0.0|null|FINNGEN_R6|2022-01-24| null| null| True| 0|FINNGEN|Disorders of muscles| ['EFO_0002970']|musculoskeletal o...|\n", + "|FINNGEN_R6_M13_MU...|['European=197821']| []| 97.0| 197821| 0.0|null|FINNGEN_R6|2022-01-24| null| null| True| 0|FINNGEN|\"\"\"Muscle wasting...| ['EFO_0009851']| biological process|\n", + "|FINNGEN_R6_M13_MU...|['European=198253']| []| 
529.0| 198253| 0.0|null|FINNGEN_R6|2022-01-24| null| null| True| 0|FINNGEN|Other specified d...| ['EFO_0002970']|musculoskeletal o...|\n", + "|FINNGEN_R6_M13_MU...|['European=198179']| []| 455.0| 198179| 0.0|null|FINNGEN_R6|2022-01-24| null| null| True| 1|FINNGEN| Muscle strain| ['EFO_0010686']|injury, poisoning...|\n", + "+--------------------+-------------------+--------------------+-------+---------+-------------+----+----------+----------+-----------+---------+------------+--------------+-------+--------------------+-----------------+--------------------+\n", + "only showing top 5 rows\n", + "\n" + ] + } + ], + "source": [ + "si_old.show(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+-----------+---------+--------------------+------------------------+------+------------------+--------+----------------+----------------------+---------------+------------------+----------------------------------+--------------------+------+---------+--------+---------+---------------------+-------------------+------------------+---------------+-------------+--------------------+-----------+\n", + "| studyId| projectId|studyType| traitFromSource|traitFromSourceMappedIds|geneId|tissueFromSourceId|pubmedId|publicationTitle|publicationFirstAuthor|publicationDate|publicationJournal|backgroundTraitFromSourceMappedIds| initialSampleSize|nCases|nControls|nSamples| cohorts|ldPopulationStructure| discoverySamples|replicationSamples|qualityControls|analysisFlags|summarystatsLocation|hasSumstats|\n", + 
"+--------------------+-----------+---------+--------------------+------------------------+------+------------------+--------+----------------+----------------------+---------------+------------------+----------------------------------+--------------------+------+---------+--------+---------+---------------------+-------------------+------------------+---------------+-------------+--------------------+-----------+\n", + "|FINNGEN_R10_AB1_A...|FINNGEN_R10| gwas| Actinomycosis| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 101| 363227| 363328|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_A...|FINNGEN_R10| gwas| Amoebiasis| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 160| 367214| 367374|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_A...|FINNGEN_R10| gwas|Anogenital herpes...| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 1986| 400197| 402183|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_A...|FINNGEN_R10| gwas| Aspergillosis| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 211| 403213| 403424|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_A...|FINNGEN_R10| gwas|Atypical virus in...| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 282| 409849| 410131|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + 
"+--------------------+-----------+---------+--------------------+------------------------+------+------------------+--------+----------------+----------------------+---------------+------------------+----------------------------------+--------------------+------+---------+--------+---------+---------------------+-------------------+------------------+---------------+-------------+--------------------+-----------+\n", + "only showing top 5 rows\n", + "\n" + ] + } + ], + "source": [ + "si_new_df=si_new.df\n", + "si_new_df.show(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "57246\n", + "2408\n" + ] + } + ], + "source": [ + "print(si_old.count())\n", + "print(si_new_df.count())" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+-----------------+\n", + "| trait_reported| trait_efos|\n", + "+--------------------+-----------------+\n", + "|Multifocal fibros...|['MONDO_0009230']|\n", + "|Disorders of muscles| ['EFO_0002970']|\n", + "|\"\"\"Muscle wasting...| ['EFO_0009851']|\n", + "|Other specified d...| ['EFO_0002970']|\n", + "| Muscle strain| ['EFO_0010686']|\n", + "+--------------------+-----------------+\n", + "only showing top 5 rows\n", + "\n" + ] + } + ], + "source": [ + "si_old=si_old.select(\"trait_reported\",\"trait_efos\")\n", + "si_old.show(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "from pyspark.sql.functions import lower\n", + "\n", + "si_old = si_old.withColumn(\"trait_reported_low\", lower(si_old[\"trait_reported\"])).select(\"trait_reported_low\",\"trait_efos\")\n", + "si_new_df= si_new_df.withColumn(\"trait_reported_low\", lower(si_new_df[\"traitFromSource\"]))" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "data": { + "text/plain": [ + "2408" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "si_old = si_old.dropDuplicates(['trait_reported_low'])\n", + "joined_df = si_new_df.join(si_old, \"trait_reported_low\", how='left')\n", + "joined_df.count()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+--------------------+-----------+---------+--------------------+------------------------+------+------------------+--------+----------------+----------------------+---------------+------------------+----------------------------------+--------------------+------+---------+--------+---------+---------------------+-------------------+------------------+---------------+-------------+--------------------+-----------+--------------------+\n", + "| trait_reported_low| studyId| projectId|studyType| traitFromSource|traitFromSourceMappedIds|geneId|tissueFromSourceId|pubmedId|publicationTitle|publicationFirstAuthor|publicationDate|publicationJournal|backgroundTraitFromSourceMappedIds| initialSampleSize|nCases|nControls|nSamples| cohorts|ldPopulationStructure| discoverySamples|replicationSamples|qualityControls|analysisFlags|summarystatsLocation|hasSumstats| trait_efos|\n", + 
"+--------------------+--------------------+-----------+---------+--------------------+------------------------+------+------------------+--------+----------------+----------------------+---------------+------------------+----------------------------------+--------------------+------+---------+--------+---------+---------------------+-------------------+------------------+---------------+-------------+--------------------+-----------+--------------------+\n", + "| actinomycosis|FINNGEN_R10_AB1_A...|FINNGEN_R10| gwas| Actinomycosis| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 101| 363227| 363328|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true| null|\n", + "| amoebiasis|FINNGEN_R10_AB1_A...|FINNGEN_R10| gwas| Amoebiasis| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 160| 367214| 367374|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true| ['EFO_0007144']|\n", + "|anogenital herpes...|FINNGEN_R10_AB1_A...|FINNGEN_R10| gwas|Anogenital herpes...| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 1986| 400197| 402183|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true| ['EFO_0007282']|\n", + "| aspergillosis|FINNGEN_R10_AB1_A...|FINNGEN_R10| gwas| Aspergillosis| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 211| 403213| 403424|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true| ['EFO_0007157']|\n", + "|atypical virus in...|FINNGEN_R10_AB1_A...|FINNGEN_R10| gwas|Atypical virus in...| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 282| 409849| 410131|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true| ['MONDO_0024318']|\n", + "|bacterial infecti...|FINNGEN_R10_AB1_B...|FINNGEN_R10| gwas|Bacterial infecti...| null| null| null| null| null| null| 
null| null| null|377,277 (210,870 ...| 20226| 363227| 383453|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true| null|\n", + "|bacterial, viral ...|FINNGEN_R10_AB1_B...|FINNGEN_R10| gwas|Bacterial, viral ...| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 2852| 409329| 412181|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true| null|\n", + "|other bacterial i...|FINNGEN_R10_AB1_B...|FINNGEN_R10| gwas|Other bacterial i...| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 6145| 367214| 373359|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true| ['EFO_0000771']|\n", + "| candidiasis|FINNGEN_R10_AB1_C...|FINNGEN_R10| gwas| Candidiasis| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 4306| 403213| 407519|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true| ['MONDO_0002026']|\n", + "|other sexually tr...|FINNGEN_R10_AB1_C...|FINNGEN_R10| gwas|Other sexually tr...| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 2186| 400197| 402383|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|['MONDO_0021681',...|\n", + "| cholera|FINNGEN_R10_AB1_C...|FINNGEN_R10| gwas| Cholera| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 1385| 367214| 368599|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true| ['EFO_1001235']|\n", + "|dengue fever [cla...|FINNGEN_R10_AB1_D...|FINNGEN_R10| gwas|Dengue fever [cla...| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 53| 409137| 409190|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true| null|\n", + "| dermatophytosis|FINNGEN_R10_AB1_D...|FINNGEN_R10| gwas| Dermatophytosis| null| null| null| null| null| null| null| null| null|377,277 
(210,870 ...| 3921| 403213| 407134|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true| ['MONDO_0004678']|\n", + "| early syphilis|FINNGEN_R10_AB1_E...|FINNGEN_R10| gwas| Early syphilis| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 308| 400197| 400505|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true| ['EFO_0007504']|\n", + "|infectious mononu...| FINNGEN_R10_AB1_EBV|FINNGEN_R10| gwas|Infectious mononu...| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 2979| 400974| 403953|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true| ['EFO_0007326']|\n", + "| enterobiasis|FINNGEN_R10_AB1_E...|FINNGEN_R10| gwas| Enterobiasis| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 112| 411658| 411770|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true| null|\n", + "| erysipelas|FINNGEN_R10_AB1_E...|FINNGEN_R10| gwas| Erysipelas| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 22261| 363227| 385488|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true| ['EFO_1001462']|\n", + "|diarrhoea and gas...|FINNGEN_R10_AB1_G...|FINNGEN_R10| gwas|Diarrhoea and gas...| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 32210| 367214| 399424|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true| ['MONDO_0045031']|\n", + "|gonococcal infection|FINNGEN_R10_AB1_G...|FINNGEN_R10| gwas|Gonococcal infection| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 954| 400197| 401151|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true| ['DOID_7551']|\n", + "| helminthiases|FINNGEN_R10_AB1_H...|FINNGEN_R10| gwas| Helminthiases| null| null| null| null| null| null| null| null| null|377,277 (210,870 
...| 523| 411658| 412181|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|['EFO_0007245', '...|\n", + "+--------------------+--------------------+-----------+---------+--------------------+------------------------+------+------------------+--------+----------------+----------------------+---------------+------------------+----------------------------------+--------------------+------+---------+--------+---------+---------------------+-------------------+------------------+---------------+-------------+--------------------+-----------+--------------------+\n", + "only showing top 20 rows\n", + "\n" + ] + } + ], + "source": [ + "joined_df.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1542\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Stage 22:===========================================> (3 + 1) / 4]\r" + ] + } + ], + "source": [ + "num_non_null_rows = joined_df.filter(joined_df.trait_efos.isNotNull()).count()\n", + "print(num_non_null_rows)" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+------------+---------+---------+--------------------+------------------------+------+------------------+--------+--------------------+----------------------+---------------+--------------------+----------------------------------+--------------------+------+---------+--------+--------------------+---------------------+--------------------+--------------------+---------------+-------------+--------------------+-----------+\n", + "| studyId|projectId|studyType| traitFromSource|traitFromSourceMappedIds|geneId|tissueFromSourceId|pubmedId| publicationTitle|publicationFirstAuthor|publicationDate| 
publicationJournal|backgroundTraitFromSourceMappedIds| initialSampleSize|nCases|nControls|nSamples| cohorts|ldPopulationStructure| discoverySamples| replicationSamples|qualityControls|analysisFlags|summarystatsLocation|hasSumstats|\n", + "+------------+---------+---------+--------------------+------------------------+------+------------------+--------+--------------------+----------------------+---------------+--------------------+----------------------------------+--------------------+------+---------+--------+--------------------+---------------------+--------------------+--------------------+---------------+-------------+--------------------+-----------+\n", + "| GCST000102| GCST| gwas|Endothelial funct...| [EFO_0004298]| null| null|17903301|Genome-wide assoc...| Vasan RS| 2007-09-19| BMC Med Genet| null|Up to 1,238 Europ...| 0| 0| 1238| [null]| [{nfe, 1.0}]| [{1238, European}]| []| null| null| null| false|\n", + "| GCST000272| GCST| gwas| Height| [EFO_0004339]| null| null|19030899|Genome-wide assoc...| Lei SF| 2008-11-23| Hum Genet| null|618 Chinese ances...| 0| 0| 618| [null]| [{eas, 1.0}]| [{618, East Asian}]|[{2953, East Asian}]| null| null| null| false|\n", + "| GCST000436| GCST| gwas|Acenocoumarol mai...| [GO_0061476]| null| null|19578179|A genome-wide ass...| Teichert M| 2009-07-04| Hum Mol Genet| null|1,451 European an...| 0| 0| 1451| [null]| [{nfe, 1.0}]| [{1451, European}]| [{287, NR}]| null| null| null| false|\n", + "| GCST000514| GCST| gwas|Response to antip...| [GO_0097332]| null| null|19875103|Genomewide associ...| Aberg K| 2009-10-27| Biol Psychiatry| null|421 European ance...| 738| 0| 738| [null]| [{afr, 0.28997289...|[{214, African Am...| []| null| null| null| false|\n", + "| GCST000550| GCST| gwas| Metabolite levels| [EFO_0004725]| null| null|20037589|A genome-wide per...| Illig T| 2009-12-27| Nat Genet| null|1,029 European an...| 0| 0| 1029| [null]| [{nfe, 1.0}]| [{1029, European}]| [{1202, European}]| null| null| null| false|\n", + "| 
GCST000708| GCST| gwas| Freckling| [EFO_0003963]| null| null|20585627|Web-based, partic...| Eriksson N| 2010-06-24| PLoS Genet| null|9,126 European an...| 0| 0| 9126| [null]| [{nfe, 1.0}]| [{9126, European}]| []| null| null| null| false|\n", + "| GCST000754| GCST| gwas|Personality dimen...| [EFO_0004365]| null| null|20691247|A genome-wide ass...| Verweij KJ| 2010-08-04| Biol Psychol| null|5,117 European an...| 0| 0| 5117| [null]| [{nfe, 1.0}]| [{5117, European}]| []| null| null| null| false|\n", + "| GCST000880| GCST| gwas|Menarche (age at ...| [EFO_0004703]| null| null|21102462|Thirty new loci f...| Elks CE| 2010-11-21| Nat Genet| null|86,142 European a...| 0| 0| 87802| [null]| [{nfe, 1.0}]| [{87802, European}]| [{14731, European}]| null| null| null| false|\n", + "| GCST001031| GCST| gwas|Large B-cell lymp...| [EFO_0000403]| null| null|21471979|Common variants o...| Kumar V| 2011-04-07| J Hum Genet| null|74 Japanese ances...| 74| 934| 1008| [null]| [{eas, 1.0}]|[{1008, East Asian}]|[{3634, East Asian}]| null| null| null| false|\n", + "| GCST001032| GCST| gwas|Caffeine consumption| [EFO_0004330]| null| null|21490707|Genome-wide meta-...| Cornelis MC| 2011-04-07| PLoS Genet| null|47,431 European a...| 0| 0| 47431| [null]| [{nfe, 1.0}]| [{47431, European}]| []| null| null| null| false|\n", + "| GCST001059| GCST| gwas| Neutrophil count| [EFO_0004833]| null| null|21507922|Duffy-null-associ...| Ramsuran V| 2011-05-01| Clin Infect Dis| null|115 African ances...| 0| 0| 115| [null]| [{afr, 1.0}]|[{115, Sub-Sahara...| []| null| null| null| false|\n", + "| GCST002187| GCST| gwas|Systolic blood pr...| [EFO_0006335]| null| null|24058526|Genome-wide meta-...| Bhatnagar P| 2013-09-13| PLoS One| [MONDO_0011382]|1617 African Amer...| 1617| 0| 1617| [null]| [{afr, 1.0}]|[{1617, African A...| []| null| null| null| false|\n", + "| GCST002623| GCST| gwas| L-arginine levels| [EFO_0006524]| null| null|25245031|Genome-wide assoc...| Luneburg N| 2014-09-21|Circ Cardiovasc G...| null|3,747 
European an...| 0| 0| 6739| [null]| [{nfe, 1.0}]|[{3747, European}...| [{1159, European}]| null| null| null| false|\n", + "| GCST003261| GCST| gwas|Ischemic stroke (...| [HP_0002140]| null| null|26708676|Loci associated w...| Pulit SL| 2015-12-18| Lancet Neurol| null|up to 8,062 Europ...| 9510| 32473| 41983| [null]| [{amr, 0.06647928...|[{2791, Hispanic ...|[{256, African Am...| null| null| null| false|\n", + "| GCST003427| GCST| gwas|Alzheimer disease...| [EFO_0004847, MON...| null| null|26830138|Family-based asso...| Herold C| 2016-02-02| Mol Psychiatry| null|2,478 European an...| 2478| 979| 3457| [null]| [{nfe, 1.0}]| [{3524, European}]| []| null| null| null| false|\n", + "| GCST003665| GCST| gwas|Free cholesterol ...| [EFO_0004611, EFO...| null| null|27005778|Genome-wide study...| Kettunen J| 2016-03-23| Nat Commun| null|21,555 European a...| 0| 0| 21555|[EGCUT, ERF, FTC,...| [{nfe, 1.0}]| [{21555, European}]| []| null| null| null| false|\n", + "| GCST003773| GCST| gwas|Loneliness (multi...| [EFO_0007865]| null| null|27629369|Genome-Wide Assoc...| Gao J| 2016-09-15|Neuropsychopharma...| null|8,490 European an...| 0| 0| 10760| [null]| [{nfe, 0.80529739...|[{8490, European}...| []| null| null| null| false|\n", + "| GCST003791| GCST| gwas|Response to metfo...| [EFO_0006952, GO_...| null| null|28173075|Metformin pharmac...| Niu N| 2016-09-11| Hum Mol Genet| null|up to 96 African ...| 0| 0| 288| [null]| [{afr, 0.33333333...|[{96, African Ame...| []| null| null| null| false|\n", + "| GCST003824| GCST| gwas|Depression in res...| [EFO_0007006, EFO...| null| null|27723809|Genome-Wide Assoc...| Matsunami K| 2016-10-10| PLoS One| [EFO_0004220]|45 Japanese ances...| 45| 179| 224| [null]| [{eas, 1.0}]| [{224, East Asian}]| [{160, East Asian}]| null| null| null| false|\n", + "| GCST003837| GCST| gwas| Chronotype| [EFO_0004354]| null| null|27494321|Genome-Wide Assoc...| Jones SE| 2016-08-05| PLoS Genet| null|127,898 British i...| 0| 0| 127898| [null]| [{nfe, 1.0}]|[{127898, 
European}]| [{89283, NR}]| []| []|ftp://ftp.ebi.ac....| true|\n", + "| GCST004678| GCST| gwas|Psychosis pronene...| [EFO_0008337]| null| null|28525603|Genome-Wide Assoc...| Ortega-Alonso A| 2017-05-19| Schizophr Bull| null|3,967 Finnish anc...| 0| 0| 3967| [null]| [{nfe, 1.0}]| [{3967, European}]| []| null| null| null| false|\n", + "| GCST005189| GCST| gwas| Tanning| [EFO_0004279]| null| null|29195075|An Unexpectedly C...| Martin AR| 2017-11-30| Cell| null|216 Sub-Saharan A...| 0| 0| 216| [null]| [{afr, 1.0}]|[{216, Sub-Sahara...|[{240, Sub-Sahara...| null| null| null| false|\n", + "| GCST005437| GCST| gwas|Random C-peptide ...| [EFO_0005187]| null| null|29404672|Meta-genome-wide ...| Roshandel D| 2018-02-05| Diabetologia| [MONDO_0005147]|1,497 European an...| 0| 0| 1497| [null]| [{nfe, 1.0}]| [{1497, European}]| []| null| null| null| false|\n", + "| GCST005503| GCST| gwas|Medium HDL partic...| [EFO_0004612]| null| null|29084231|Common, low-frequ...| Davis JP| 2017-10-30| PLoS Genet| null|8,372 Finnish anc...| 0| 0| 8372| [null]| [{nfe, 1.0}]| [{8372, European}]| []| null| null| null| false|\n", + "| GCST005669| GCST| gwas|Delta-6 desaturas...| [EFO_0007765, EFO...| null| null|29246731|A common variant ...| de Toro-Martin J| 2017-11-02| J Clin Lipidol| null|81 extreme respon...| 0| 0| 141| [null]| [{nfe, 1.0}]| [{141, NR}]| []| null| null| null| false|\n", + "| GCST005749| GCST| gwas|Digit length rati...| [EFO_0004841]| null| null|29659830|Genome-wide assoc...| Warrington NM| 2018-04-12| Hum Mol Genet| null|14,382 European a...| 0| 0| 15661| [null]| [{nfe, 1.0}]|[{14382, European...| []| null| null| null| false|\n", + "| GCST006420| GCST| gwas|Affective disorde...| [EFO_0004247, EFO...| null| null|30116032|Genetics of suici...| Erlangsen A| 2018-08-16| Mol Psychiatry| null|4,302 European an...| 4302| 13294| 17596| [null]| [{nfe, 1.0}]| [{17596, European}]| []| null| null| null| false|\n", + "| GCST006484| GCST| gwas| Type 2 diabetes| [MONDO_0005148]| null| 
null|30130595|Pilot genome-wide...| Dominguez-Cruz MG| 2018-08-18| Gene| null|45 Maya ancestry ...| 45| 47| 92| [null]| [{amr, 1.0}]|[{92, Native Amer...| []| null| null| null| false|\n", + "| GCST006496| GCST| gwas|Glomerular filtra...| [EFO_0006829, EFO...| null| null|30160337|Genome Wide Assoc...| Asleh R| 2018-08-30| Clin Transplant| null|243 European ance...| 0| 0| 251| [null]| [{nfe, 0.99601593...|[{243, European},...| []| null| null| null| false|\n", + "| GCST006739| GCST| gwas|Proportion of mis...| [EFO_0006923]| null| null|30188897|Detecting past an...| Jeong C| 2018-09-06| PLoS Genet| null|981 Tibetan ances...| 0| 0| 981| [null]| [{nfe, 1.0}]| [{981, NR}]| []| null| null| null| false|\n", + "| GCST006907| GCST| gwas|Ischemic stroke (...| [EFO_0005524]| null| null|29531354|Multiancestry gen...| Malik R| 2018-03-12| Nat Genet| null|4,373 European an...| 4373| 406111| 410484| [null]| [{nfe, 1.0}]|[{150765, European}]| []| []| []|ftp://ftp.ebi.ac....| true|\n", + "| GCST006960| GCST| gwas|Inflammatory bowe...| [EFO_0003767]| null| null|26490195|Inherited determi...| Cleynen I| 2015-10-18| Lancet| null|16,902 European a...| 29838| 0| 29838| [null]| [{nfe, 1.0}]| [{29838, European}]| [{6182, European}]| null| null| null| false|\n", + "| GCST007217| GCST| gwas|RR interval (hear...| [EFO_0004831]| null| null|30679814|Genome-wide assoc...| van Setten J| 2019-01-24| Eur J Hum Genet| null|2,006 Erasmus Ruc...| 0| 0| 28698| [null]| [{nfe, 1.0}]| [{28698, European}]| []| null| null| null| false|\n", + "| GCST008154| GCST| gwas| Trunk fat mass| [EFO_0005409]| null| null|28552196|Whole-Genome Sequ...| Tachmazidou I| 2017-06-01| Am J Hum Genet| null|3,399 whole genom...| 0| 0| 16237| [null]| [{nfe, 1.0}]|[{3538, NR}, {128...| [{10667, European}]| null| null| null| false|\n", + "| GCST008483| GCST| gwas| Ulcerative colitis| [EFO_0000729]| null| null|26398853|Identification of...| Ye BD| 2016-01-01| Inflamm Bowel Dis| null|705 Korean ancest...| 705| 1178| 1883| [null]| 
[{eas, 1.0}]|[{1883, South Asi...|[{3674, South Asi...| null| null| null| false|\n", + "| GCST008671| GCST| gwas|Phlegm x occupati...| [EFO_0007939, EFO...| null| null|30449631|Genome-wide inter...| Zeng X| 2018-11-15| Environ Int| null|1,702 Dutch ances...| 1702| 6274| 7976| [null]| [{nfe, 1.0}]| [{7976, European}]| [{6789, European}]| null| null| null| false|\n", + "| GCST008675| GCST| gwas|Maximum habitual ...| [EFO_0007878]| null| null|31151762|Genome-wide Assoc...| Gelernter J| 2019-04-08| Biol Psychiatry| null|126,936 European ...| 0| 0| 143965| [null]| [{afr, 0.11828569...|[{17029, African ...| []| null| null| null| false|\n", + "| GCST008775| GCST| gwas|Birth weight or w...| [EFO_0004342, EFO...| null| null|30858448|Genetic overlap b...| Tekola-Ayele F| 2019-03-11| Sci Rep| null|153,781 European ...| 0| 0| 378240| [null]| [{nfe, 1.0}]|[{246502, Europea...| []| null| null| null| false|\n", + "| GCST008870| GCST| gwas|Keratinocyte canc...| [EFO_0010176]| null| null|31174203|Combined analysis...| Liyanage UE| 2019-06-07| Hum Mol Genet| null|at least 18,538 E...| 18538| 340302| 358840| [null]| [{nfe, 1.0}]|[{358840, European}]| []| null| null| null| false|\n", + "| GCST009173| GCST| gwas|Response to (pegy...| [EFO_0007859]| null| null|30715261|Genome Wide Assoc...| Brouwer WP| 2019-02-02| Clin Infect Dis| [EFO_0004239]|121 Asian, Europe...| 0| 0| 509| [null]| [{nfe, 0.5}, {afr...|[{127, European},...| []| null| null| null| false|\n", + "| GCST009364| GCST| gwas|Triglyceride leve...| [EFO_0004530, EFO...| null| null|31719535|Multi-ancestry sl...| Noordam R| 2019-11-12| Nat Commun| null|at least 2,926 Af...| 0| 49886| 61990| [null]| [{eas, 0.03837715...|[{2096, East Asia...|[{12579, Hispanic...| null| null| null| false|\n", + "| GCST009391| GCST| gwas|Metabolite levels...| [EFO_0005132]| null| null|23823483|A genome-wide ass...| Rhee EP| 2013-07-02| Cell Metab| null|2,076 European an...| 0| 0| 2076| [null]| [{nfe, 1.0}]| [{2076, European}]| []| null| null| null| 
false|\n", + "|GCST009391_2| GCST| gwas| Metabolite levels| []| null| null|23823483|A genome-wide ass...| Rhee EP| 2013-07-02| Cell Metab| null|2,076 European an...| 0| 0| 2076| [null]| [{nfe, 1.0}]| [{2076, European}]| []| null| null| null| false|\n", + "|GCST009391_3| GCST| gwas| Metabolite levels| [EFO_0004468, EFO...| null| null|23823483|A genome-wide ass...| Rhee EP| 2013-07-02| Cell Metab| null|2,076 European an...| 0| 0| 2076| [null]| [{nfe, 1.0}]| [{2076, European}]| []| null| null| null| false|\n", + "|GCST009391_4| GCST| gwas| Metabolite levels| [EFO_0004518]| null| null|23823483|A genome-wide ass...| Rhee EP| 2013-07-02| Cell Metab| null|2,076 European an...| 0| 0| 2076| [null]| [{nfe, 1.0}]| [{2076, European}]| []| null| null| null| false|\n", + "|GCST009391_5| GCST| gwas| Metabolite levels| [EFO_0004761]| null| null|23823483|A genome-wide ass...| Rhee EP| 2013-07-02| Cell Metab| null|2,076 European an...| 0| 0| 2076| [null]| [{nfe, 1.0}]| [{2076, European}]| []| null| null| null| false|\n", + "|GCST009391_6| GCST| gwas| Metabolite levels| [EFO_0004846]| null| null|23823483|A genome-wide ass...| Rhee EP| 2013-07-02| Cell Metab| null|2,076 European an...| 0| 0| 2076| [null]| [{nfe, 1.0}]| [{2076, European}]| []| null| null| null| false|\n", + "|GCST009391_7| GCST| gwas| Metabolite levels| [EFO_0005001]| null| null|23823483|A genome-wide ass...| Rhee EP| 2013-07-02| Cell Metab| null|2,076 European an...| 0| 0| 2076| [null]| [{nfe, 1.0}]| [{2076, European}]| []| null| null| null| false|\n", + "|GCST009391_8| GCST| gwas| Metabolite levels| [EFO_0005002]| null| null|23823483|A genome-wide ass...| Rhee EP| 2013-07-02| Cell Metab| null|2,076 European an...| 0| 0| 2076| [null]| [{nfe, 1.0}]| [{2076, European}]| []| null| null| null| false|\n", + "|GCST009391_9| GCST| gwas| Metabolite levels| [EFO_0005058]| null| null|23823483|A genome-wide ass...| Rhee EP| 2013-07-02| Cell Metab| null|2,076 European an...| 0| 0| 2076| [null]| [{nfe, 1.0}]| [{2076, European}]| 
[]| null| null| null| false|\n", + "+------------+---------+---------+--------------------+------------------------+------+------------------+--------+--------------------+----------------------+---------------+--------------------+----------------------------------+--------------------+------+---------+--------+--------------------+---------------------+--------------------+--------------------+---------------+-------------+--------------------+-----------+\n", + "only showing top 50 rows\n", + "\n" + ] + } + ], + "source": [ + "path_tmp=\"gs://gwas_catalog_data/study_index\"\n", + "tmp=StudyIndex.from_parquet(session=session, path=path_tmp)\n", + "tmp.df.show(50)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "joined_df=joined_df.withColumn(\"traitFromSourceMappedIds\",joined_df[\"trait_efos\"]).drop(\"trait_efos\",\"trait_reported_low\")" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+-----------+---------+--------------------+------------------------+------+------------------+--------+----------------+----------------------+---------------+------------------+----------------------------------+--------------------+------+---------+--------+---------+---------------------+-------------------+------------------+---------------+-------------+--------------------+-----------+\n", + "| studyId| projectId|studyType| traitFromSource|traitFromSourceMappedIds|geneId|tissueFromSourceId|pubmedId|publicationTitle|publicationFirstAuthor|publicationDate|publicationJournal|backgroundTraitFromSourceMappedIds| initialSampleSize|nCases|nControls|nSamples| cohorts|ldPopulationStructure| discoverySamples|replicationSamples|qualityControls|analysisFlags|summarystatsLocation|hasSumstats|\n", + 
"+--------------------+-----------+---------+--------------------+------------------------+------+------------------+--------+----------------+----------------------+---------------+------------------+----------------------------------+--------------------+------+---------+--------+---------+---------------------+-------------------+------------------+---------------+-------------+--------------------+-----------+\n", + "|FINNGEN_R10_AB1_A...|FINNGEN_R10| gwas| Actinomycosis| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 101| 363227| 363328|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_A...|FINNGEN_R10| gwas| Amoebiasis| ['EFO_0007144']| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 160| 367214| 367374|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_A...|FINNGEN_R10| gwas|Anogenital herpes...| ['EFO_0007282']| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 1986| 400197| 402183|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_A...|FINNGEN_R10| gwas| Aspergillosis| ['EFO_0007157']| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 211| 403213| 403424|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_A...|FINNGEN_R10| gwas|Atypical virus in...| ['MONDO_0024318']| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 282| 409849| 410131|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_B...|FINNGEN_R10| gwas|Bacterial infecti...| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 20226| 363227| 383453|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + 
"|FINNGEN_R10_AB1_B...|FINNGEN_R10| gwas|Bacterial, viral ...| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 2852| 409329| 412181|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_B...|FINNGEN_R10| gwas|Other bacterial i...| ['EFO_0000771']| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 6145| 367214| 373359|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_C...|FINNGEN_R10| gwas| Candidiasis| ['MONDO_0002026']| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 4306| 403213| 407519|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_C...|FINNGEN_R10| gwas|Other sexually tr...| ['MONDO_0021681',...| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 2186| 400197| 402383|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_C...|FINNGEN_R10| gwas| Cholera| ['EFO_1001235']| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 1385| 367214| 368599|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_D...|FINNGEN_R10| gwas|Dengue fever [cla...| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 53| 409137| 409190|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_D...|FINNGEN_R10| gwas| Dermatophytosis| ['MONDO_0004678']| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 3921| 403213| 407134|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_E...|FINNGEN_R10| gwas| Early syphilis| ['EFO_0007504']| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 308| 400197| 400505|[FinnGen]| 
[{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "| FINNGEN_R10_AB1_EBV|FINNGEN_R10| gwas|Infectious mononu...| ['EFO_0007326']| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 2979| 400974| 403953|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_E...|FINNGEN_R10| gwas| Enterobiasis| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 112| 411658| 411770|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_E...|FINNGEN_R10| gwas| Erysipelas| ['EFO_1001462']| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 22261| 363227| 385488|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_G...|FINNGEN_R10| gwas|Diarrhoea and gas...| ['MONDO_0045031']| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 32210| 367214| 399424|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_G...|FINNGEN_R10| gwas|Gonococcal infection| ['DOID_7551']| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 954| 400197| 401151|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_H...|FINNGEN_R10| gwas| Helminthiases| ['EFO_0007245', '...| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 523| 411658| 412181|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + 
"+--------------------+-----------+---------+--------------------+------------------------+------+------------------+--------+----------------+----------------------+---------------+------------------+----------------------------------+--------------------+------+---------+--------+---------+---------------------+-------------------+------------------+---------------+-------------+--------------------+-----------+\n", + "only showing top 20 rows\n", + "\n" + ] + } + ], + "source": [ + "joined_df.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "string\n" + ] + } + ], + "source": [ + "column_type = dict(joined_df.dtypes)[\"traitFromSourceMappedIds\"]\n", + "print(column_type)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "from pyspark.sql.functions import from_json\n", + "from pyspark.sql.types import ArrayType, StringType\n", + "\n", + "# Assuming joined_df is your DataFrame\n", + "joined_df = joined_df.withColumn(\n", + " \"traitFromSourceMappedIds\",\n", + " from_json(\"traitFromSourceMappedIds\", ArrayType(StringType()))\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+-----------+---------+--------------------+------------------------+------+------------------+--------+----------------+----------------------+---------------+------------------+----------------------------------+--------------------+------+---------+--------+---------+---------------------+-------------------+------------------+---------------+-------------+--------------------+-----------+\n", + "| studyId| projectId|studyType| 
traitFromSource|traitFromSourceMappedIds|geneId|tissueFromSourceId|pubmedId|publicationTitle|publicationFirstAuthor|publicationDate|publicationJournal|backgroundTraitFromSourceMappedIds| initialSampleSize|nCases|nControls|nSamples| cohorts|ldPopulationStructure| discoverySamples|replicationSamples|qualityControls|analysisFlags|summarystatsLocation|hasSumstats|\n", + "+--------------------+-----------+---------+--------------------+------------------------+------+------------------+--------+----------------+----------------------+---------------+------------------+----------------------------------+--------------------+------+---------+--------+---------+---------------------+-------------------+------------------+---------------+-------------+--------------------+-----------+\n", + "|FINNGEN_R10_AB1_A...|FINNGEN_R10| gwas| Actinomycosis| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 101| 363227| 363328|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_A...|FINNGEN_R10| gwas| Amoebiasis| [EFO_0007144]| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 160| 367214| 367374|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_A...|FINNGEN_R10| gwas|Anogenital herpes...| [EFO_0007282]| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 1986| 400197| 402183|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_A...|FINNGEN_R10| gwas| Aspergillosis| [EFO_0007157]| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 211| 403213| 403424|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_A...|FINNGEN_R10| gwas|Atypical virus in...| [MONDO_0024318]| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 282| 409849| 410131|[FinnGen]| 
[{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_B...|FINNGEN_R10| gwas|Bacterial infecti...| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 20226| 363227| 383453|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_B...|FINNGEN_R10| gwas|Bacterial, viral ...| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 2852| 409329| 412181|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_B...|FINNGEN_R10| gwas|Other bacterial i...| [EFO_0000771]| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 6145| 367214| 373359|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_C...|FINNGEN_R10| gwas| Candidiasis| [MONDO_0002026]| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 4306| 403213| 407519|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_C...|FINNGEN_R10| gwas|Other sexually tr...| [MONDO_0021681, E...| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 2186| 400197| 402383|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_C...|FINNGEN_R10| gwas| Cholera| [EFO_1001235]| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 1385| 367214| 368599|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_D...|FINNGEN_R10| gwas|Dengue fever [cla...| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 53| 409137| 409190|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_D...|FINNGEN_R10| gwas| Dermatophytosis| [MONDO_0004678]| null| null| null| null| null| 
null| null| null|377,277 (210,870 ...| 3921| 403213| 407134|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_E...|FINNGEN_R10| gwas| Early syphilis| [EFO_0007504]| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 308| 400197| 400505|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "| FINNGEN_R10_AB1_EBV|FINNGEN_R10| gwas|Infectious mononu...| [EFO_0007326]| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 2979| 400974| 403953|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_E...|FINNGEN_R10| gwas| Enterobiasis| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 112| 411658| 411770|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_E...|FINNGEN_R10| gwas| Erysipelas| [EFO_1001462]| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 22261| 363227| 385488|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_G...|FINNGEN_R10| gwas|Diarrhoea and gas...| [MONDO_0045031]| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 32210| 367214| 399424|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_G...|FINNGEN_R10| gwas|Gonococcal infection| [DOID_7551]| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 954| 400197| 401151|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_H...|FINNGEN_R10| gwas| Helminthiases| [EFO_0007245, EFO...| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 523| 411658| 412181|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + 
"+--------------------+-----------+---------+--------------------+------------------------+------+------------------+--------+----------------+----------------------+---------------+------------------+----------------------------------+--------------------+------+---------+--------+---------+---------------------+-------------------+------------------+---------------+-------------+--------------------+-----------+\n", + "only showing top 20 rows\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + } + ], + "source": [ + "joined_df.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "array\n" + ] + } + ], + "source": [ + "column_type = dict(joined_df.dtypes)[\"traitFromSourceMappedIds\"]\n", + "print(column_type)" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "si=StudyIndex(_df=joined_df, _schema=StudyIndex.get_schema())" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+-----------+---------+--------------------+------------------------+------+------------------+--------+----------------+----------------------+---------------+------------------+----------------------------------+--------------------+------+---------+--------+---------+---------------------+-------------------+------------------+---------------+-------------+--------------------+-----------+\n", + "| studyId| projectId|studyType| traitFromSource|traitFromSourceMappedIds|geneId|tissueFromSourceId|pubmedId|publicationTitle|publicationFirstAuthor|publicationDate|publicationJournal|backgroundTraitFromSourceMappedIds| initialSampleSize|nCases|nControls|nSamples| 
cohorts|ldPopulationStructure| discoverySamples|replicationSamples|qualityControls|analysisFlags|summarystatsLocation|hasSumstats|\n", + "+--------------------+-----------+---------+--------------------+------------------------+------+------------------+--------+----------------+----------------------+---------------+------------------+----------------------------------+--------------------+------+---------+--------+---------+---------------------+-------------------+------------------+---------------+-------------+--------------------+-----------+\n", + "|FINNGEN_R10_AB1_A...|FINNGEN_R10| gwas| Actinomycosis| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 101| 363227| 363328|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_A...|FINNGEN_R10| gwas| Amoebiasis| [EFO_0007144]| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 160| 367214| 367374|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_A...|FINNGEN_R10| gwas|Anogenital herpes...| [EFO_0007282]| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 1986| 400197| 402183|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_A...|FINNGEN_R10| gwas| Aspergillosis| [EFO_0007157]| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 211| 403213| 403424|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_A...|FINNGEN_R10| gwas|Atypical virus in...| [MONDO_0024318]| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 282| 409849| 410131|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_B...|FINNGEN_R10| gwas|Bacterial infecti...| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 20226| 
363227| 383453|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_B...|FINNGEN_R10| gwas|Bacterial, viral ...| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 2852| 409329| 412181|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_B...|FINNGEN_R10| gwas|Other bacterial i...| [EFO_0000771]| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 6145| 367214| 373359|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_C...|FINNGEN_R10| gwas| Candidiasis| [MONDO_0002026]| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 4306| 403213| 407519|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_C...|FINNGEN_R10| gwas|Other sexually tr...| [MONDO_0021681, E...| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 2186| 400197| 402383|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_C...|FINNGEN_R10| gwas| Cholera| [EFO_1001235]| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 1385| 367214| 368599|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_D...|FINNGEN_R10| gwas|Dengue fever [cla...| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 53| 409137| 409190|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_D...|FINNGEN_R10| gwas| Dermatophytosis| [MONDO_0004678]| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 3921| 403213| 407134|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_E...|FINNGEN_R10| gwas| Early syphilis| [EFO_0007504]| 
null| null| null| null| null| null| null| null|377,277 (210,870 ...| 308| 400197| 400505|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "| FINNGEN_R10_AB1_EBV|FINNGEN_R10| gwas|Infectious mononu...| [EFO_0007326]| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 2979| 400974| 403953|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_E...|FINNGEN_R10| gwas| Enterobiasis| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 112| 411658| 411770|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_E...|FINNGEN_R10| gwas| Erysipelas| [EFO_1001462]| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 22261| 363227| 385488|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_G...|FINNGEN_R10| gwas|Diarrhoea and gas...| [MONDO_0045031]| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 32210| 367214| 399424|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_G...|FINNGEN_R10| gwas|Gonococcal infection| [DOID_7551]| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 954| 400197| 401151|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_H...|FINNGEN_R10| gwas| Helminthiases| [EFO_0007245, EFO...| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 523| 411658| 412181|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + 
"+--------------------+-----------+---------+--------------------+------------------------+------+------------------+--------+----------------+----------------------+---------------+------------------+----------------------------------+--------------------+------+---------+--------+---------+---------------------+-------------------+------------------+---------------+-------------+--------------------+-----------+\n", + "only showing top 20 rows\n", + "\n" + ] + } + ], + "source": [ + "si.df.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "data": { + "text/plain": [ + "2408" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "si.df.count()" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + } + ], + "source": [ + "si.df.write.parquet(path=\"gs://genetics-portal-dev-analysis/yt4/study_index_finngen_with_efo\")" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [], + "source": [ + "path_to_study_index=\"gs://genetics-portal-dev-analysis/yt4/study_index_finngen_with_efo\"\n", + "si=StudyIndex.from_parquet(session=session, path=path_to_study_index)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "gentropy-krNFZEZg-py3.10", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.8" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 
900dd649779c2727e44c30b11671b0e3c7261036 Mon Sep 17 00:00:00 2001 From: Daniel-Considine <113430683+Daniel-Considine@users.noreply.github.com> Date: Tue, 23 Apr 2024 13:33:16 +0100 Subject: [PATCH 5/7] feat: adding init to finemapping step (#577) * feat: adding init to finemapping step * fix: removing some commented lines * chore: fixing indents * fix: schema * feat: changing output path to include studyLocusId mapped --------- Co-authored-by: Yakov --- src/gentropy/susie_finemapper.py | 51 ++++++++++++++++++++++++++++++-- 1 file changed, 49 insertions(+), 2 deletions(-) diff --git a/src/gentropy/susie_finemapper.py b/src/gentropy/susie_finemapper.py index d37298436..7b6f81b3a 100644 --- a/src/gentropy/susie_finemapper.py +++ b/src/gentropy/susie_finemapper.py @@ -28,6 +28,47 @@ class SusieFineMapperStep: In the future this step will be refactored and moved to the methods module. """ + def __init__( + self, + session: Session, + study_locus_to_finemap: str, + study_locus_collected_path: str, + study_index_path: str, + output_path: str, + locus_radius: int = 500_000, + locus_L: int = 10, + ) -> None: + """Run fine-mapping on a studyLocusId from a collected studyLocus table. 
+ + Args: + session (Session): Spark session + study_locus_to_finemap (str): studyLocusId of the study locus to fine-map + study_locus_collected_path (str): path to the collected study locus + study_index_path (str): path to the study index + output_path (str): path to the output + locus_radius (int): Radius of base-pair window around the locus, default is 500_000 + locus_L (int): Maximum number of causal variants in locus, default is 10 + """ + # Read studyLocus + study_locus = ( + StudyLocus.from_parquet(session, study_locus_collected_path) + .df.filter(f.col("studyLocusId") == study_locus_to_finemap) + .collect()[0] + ) + study_index = StudyIndex.from_parquet(session, study_index_path) + # Run fine-mapping + result = self.susie_finemapper_ss_gathered( + session, + study_locus, + study_index, + locus_radius * 2, + locus_L, + ) + # Write result + result.df.write.mode(session.write_mode).parquet( + output_path + "/" + study_locus_to_finemap + ) + @staticmethod def susie_finemapper_one_studylocus_row( GWAS: SummaryStatistics, @@ -317,9 +358,15 @@ def susie_finemapper_ss_gathered( + str(int(position + window / 2)) ) + schema = StudyLocus.get_schema() + gwas_df = session.spark.createDataFrame([study_locus_row], schema=schema) + exploded_df = gwas_df.select(f.explode("locus").alias("locus")) + + result_df = exploded_df.select( + "locus.variantId", "locus.beta", "locus.standardError" + ) gwas_df = ( - session.spark.createDataFrame(study_locus_row.locus) - .withColumn("z", f.col("beta") / f.col("standardError")) + result_df.withColumn("z", f.col("beta") / f.col("standardError")) .withColumn("chromosome", f.split(f.col("variantId"), "_")[0]) .withColumn("position", f.split(f.col("variantId"), "_")[1]) .filter(f.col("z").isNotNull()) From 78fcf1b85a72751fb6a7a006b3769d8170348aed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Irene=20L=C3=B3pez?= <45119610+ireneisdoomed@users.noreply.github.com> Date: Tue, 23 Apr 2024 13:55:35 +0100 Subject: [PATCH 6/7] feat: dockerise gentropy python
package (#579) Co-authored-by: David Ochoa --- Dockerfile | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 Dockerfile diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 000000000..1221ec637 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,8 @@ +FROM python:3.10 + +RUN pip install poetry==1.7.1 + +COPY . . +RUN poetry install --without dev,docs,tests + +ENTRYPOINT ["poetry", "run", "gentropy"] From 82b8a7c539d6ee669a73c09db1d30783eec41d69 Mon Sep 17 00:00:00 2001 From: Daniel-Considine <113430683+Daniel-Considine@users.noreply.github.com> Date: Tue, 23 Apr 2024 14:27:50 +0100 Subject: [PATCH 7/7] feat: updating step config file (#580) --- src/gentropy/config.py | 19 +++++++++++++++++++ src/gentropy/susie_finemapper.py | 6 +++--- 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/src/gentropy/config.py b/src/gentropy/config.py index 127d90844..0a76b2c84 100644 --- a/src/gentropy/config.py +++ b/src/gentropy/config.py @@ -328,6 +328,24 @@ class WindowBasedClumpingStep(StepConfig): _target_: str = "gentropy.window_based_clumping.WindowBasedClumpingStep" +@dataclass +class FinemapperConfig(StepConfig): + """SuSiE fine-mapper step configuration.""" + + session: Any = field( + default_factory=lambda: { + "start_hail": True, + } + ) + study_locus_to_finemap: str = MISSING + study_locus_collected_path: str = MISSING + study_index_path: str = MISSING + output_path: str = MISSING + locus_radius: int = MISSING + locus_l: int = MISSING + _target_: str = "gentropy.susie_finemapper.SusieFineMapperStep" + + @dataclass class Config: """Application configuration.""" @@ -385,3 +403,4 @@ def register_config() -> None: cs.store(group="step", name="variant_index", node=VariantIndexConfig) cs.store(group="step", name="variant_to_gene", node=VariantToGeneConfig) cs.store(group="step", name="window_based_clumping", node=WindowBasedClumpingStep) + cs.store(group="step", name="susie_finemapping", node=FinemapperConfig) diff --git
a/src/gentropy/susie_finemapper.py b/src/gentropy/susie_finemapper.py index 7b6f81b3a..5a0fa31f3 100644 --- a/src/gentropy/susie_finemapper.py +++ b/src/gentropy/susie_finemapper.py @@ -36,7 +36,7 @@ def __init__( study_index_path: str, output_path: str, locus_radius: int = 500_000, - locus_L: int = 10, + locus_l: int = 10, ) -> None: """Run fine-mapping on a studyLocusId from a collected studyLocus table. @@ -47,7 +47,7 @@ def __init__( study_index_path (str): path to the study index output_path (str): path to the output locus_radius (int): Radius of base-pair window around the locus, default is 500_000 - locus_L (int): Maximum number of causal variants in locus, default is 10 + locus_l (int): Maximum number of causal variants in locus, default is 10 """ # Read studyLocus study_locus = ( @@ -62,7 +62,7 @@ def __init__( study_locus, study_index, locus_radius * 2, - locus_L, + locus_l, ) # Write result result.df.write.mode(session.write_mode).parquet(