diff --git a/notebooks/Mapping_EFO_finngen.ipynb b/notebooks/Mapping_EFO_finngen.ipynb new file mode 100644 index 000000000..9bd82d8d4 --- /dev/null +++ b/notebooks/Mapping_EFO_finngen.ipynb @@ -0,0 +1,768 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Mapping EFOs for the FinnGen study index using the old study index from the previous prod" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This notebook adds EFOs from the previous prod version of study_index to the new FinnGen study_index using the trait name as a matching key.\n", + "\n", + "The resulting study index has 1542 rows with non-null EFOs (out of 2408 rows).\n", + "\n", + "The new study index is saved here:\n", + "\"gs://genetics-portal-dev-analysis/yt4/study_index_finngen_with_efo\"" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Your browser has been opened to visit:\n", + "\n", + " https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=764086051850-6qr4p6gpi6hn506pt8ejuq83di341hur.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A8085%2F&scope=openid+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fuserinfo.email+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcloud-platform+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fsqlservice.login+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Faccounts.reauth&state=XHb8Uk43SsVjvFRqwgrX4Tgg2tTOHS&access_type=offline&code_challenge=OkiqDAkHXDGEgJQbX8r0ZYKfZ7gcgfXS8mfZc5a913Y&code_challenge_method=S256\n", + "\n", + "\n", + "Credentials saved to file: [/Users/yt4/.config/gcloud/application_default_credentials.json]\n", + "\n", + "These credentials will be used by any library that requests Application Default Credentials (ADC).\n", + "\n", + "Quota project \"open-targets-genetics-dev\" was added to ADC which can be used by Google client libraries for billing and quota. 
Note that some services may still bill the project owning the resource.\n" + ] + } + ], + "source": [ + "!gcloud auth application-default login" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " Loading BokehJS ...\n", + "
\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/javascript": "(function(root) {\n function now() {\n return new Date();\n }\n\n const force = true;\n\n if (typeof root._bokeh_onload_callbacks === \"undefined\" || force === true) {\n root._bokeh_onload_callbacks = [];\n root._bokeh_is_loading = undefined;\n }\n\nconst JS_MIME_TYPE = 'application/javascript';\n const HTML_MIME_TYPE = 'text/html';\n const EXEC_MIME_TYPE = 'application/vnd.bokehjs_exec.v0+json';\n const CLASS_NAME = 'output_bokeh rendered_html';\n\n /**\n * Render data to the DOM node\n */\n function render(props, node) {\n const script = document.createElement(\"script\");\n node.appendChild(script);\n }\n\n /**\n * Handle when an output is cleared or removed\n */\n function handleClearOutput(event, handle) {\n function drop(id) {\n const view = Bokeh.index.get_by_id(id)\n if (view != null) {\n view.model.document.clear()\n Bokeh.index.delete(view)\n }\n }\n\n const cell = handle.cell;\n\n const id = cell.output_area._bokeh_element_id;\n const server_id = cell.output_area._bokeh_server_id;\n\n // Clean up Bokeh references\n if (id != null) {\n drop(id)\n }\n\n if (server_id !== undefined) {\n // Clean up Bokeh references\n const cmd_clean = \"from bokeh.io.state import curstate; print(curstate().uuid_to_server['\" + server_id + \"'].get_sessions()[0].document.roots[0]._id)\";\n cell.notebook.kernel.execute(cmd_clean, {\n iopub: {\n output: function(msg) {\n const id = msg.content.text.trim()\n drop(id)\n }\n }\n });\n // Destroy server and session\n const cmd_destroy = \"import bokeh.io.notebook as ion; ion.destroy_server('\" + server_id + \"')\";\n cell.notebook.kernel.execute(cmd_destroy);\n }\n }\n\n /**\n * Handle when a new output is added\n */\n function handleAddOutput(event, handle) {\n const output_area = handle.output_area;\n const output = handle.output;\n\n // limit handleAddOutput to display_data with EXEC_MIME_TYPE content only\n if 
((output.output_type != \"display_data\") || (!Object.prototype.hasOwnProperty.call(output.data, EXEC_MIME_TYPE))) {\n return\n }\n\n const toinsert = output_area.element.find(\".\" + CLASS_NAME.split(' ')[0]);\n\n if (output.metadata[EXEC_MIME_TYPE][\"id\"] !== undefined) {\n toinsert[toinsert.length - 1].firstChild.textContent = output.data[JS_MIME_TYPE];\n // store reference to embed id on output_area\n output_area._bokeh_element_id = output.metadata[EXEC_MIME_TYPE][\"id\"];\n }\n if (output.metadata[EXEC_MIME_TYPE][\"server_id\"] !== undefined) {\n const bk_div = document.createElement(\"div\");\n bk_div.innerHTML = output.data[HTML_MIME_TYPE];\n const script_attrs = bk_div.children[0].attributes;\n for (let i = 0; i < script_attrs.length; i++) {\n toinsert[toinsert.length - 1].firstChild.setAttribute(script_attrs[i].name, script_attrs[i].value);\n toinsert[toinsert.length - 1].firstChild.textContent = bk_div.children[0].textContent\n }\n // store reference to server id on output_area\n output_area._bokeh_server_id = output.metadata[EXEC_MIME_TYPE][\"server_id\"];\n }\n }\n\n function register_renderer(events, OutputArea) {\n\n function append_mime(data, metadata, element) {\n // create a DOM node to render to\n const toinsert = this.create_output_subarea(\n metadata,\n CLASS_NAME,\n EXEC_MIME_TYPE\n );\n this.keyboard_manager.register_events(toinsert);\n // Render to node\n const props = {data: data, metadata: metadata[EXEC_MIME_TYPE]};\n render(props, toinsert[toinsert.length - 1]);\n element.append(toinsert);\n return toinsert\n }\n\n /* Handle when an output is cleared or removed */\n events.on('clear_output.CodeCell', handleClearOutput);\n events.on('delete.Cell', handleClearOutput);\n\n /* Handle when a new output is added */\n events.on('output_added.OutputArea', handleAddOutput);\n\n /**\n * Register the mime type and append_mime function with output_area\n */\n OutputArea.prototype.register_mime_type(EXEC_MIME_TYPE, append_mime, {\n /* Is output safe? 
*/\n safe: true,\n /* Index of renderer in `output_area.display_order` */\n index: 0\n });\n }\n\n // register the mime type if in Jupyter Notebook environment and previously unregistered\n if (root.Jupyter !== undefined) {\n const events = require('base/js/events');\n const OutputArea = require('notebook/js/outputarea').OutputArea;\n\n if (OutputArea.prototype.mime_types().indexOf(EXEC_MIME_TYPE) == -1) {\n register_renderer(events, OutputArea);\n }\n }\n if (typeof (root._bokeh_timeout) === \"undefined\" || force === true) {\n root._bokeh_timeout = Date.now() + 5000;\n root._bokeh_failed_load = false;\n }\n\n const NB_LOAD_WARNING = {'data': {'text/html':\n \"
\\n\"+\n \"

\\n\"+\n \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n \"

\\n\"+\n \"\\n\"+\n \"\\n\"+\n \"from bokeh.resources import INLINE\\n\"+\n \"output_notebook(resources=INLINE)\\n\"+\n \"\\n\"+\n \"
\"}};\n\n function display_loaded() {\n const el = document.getElementById(\"c92e22c1-acc6-4a9b-8a5a-529fec6e60ae\");\n if (el != null) {\n el.textContent = \"BokehJS is loading...\";\n }\n if (root.Bokeh !== undefined) {\n if (el != null) {\n el.textContent = \"BokehJS \" + root.Bokeh.version + \" successfully loaded.\";\n }\n } else if (Date.now() < root._bokeh_timeout) {\n setTimeout(display_loaded, 100)\n }\n }\n\n function run_callbacks() {\n try {\n root._bokeh_onload_callbacks.forEach(function(callback) {\n if (callback != null)\n callback();\n });\n } finally {\n delete root._bokeh_onload_callbacks\n }\n console.debug(\"Bokeh: all callbacks have finished\");\n }\n\n function load_libs(css_urls, js_urls, callback) {\n if (css_urls == null) css_urls = [];\n if (js_urls == null) js_urls = [];\n\n root._bokeh_onload_callbacks.push(callback);\n if (root._bokeh_is_loading > 0) {\n console.debug(\"Bokeh: BokehJS is being loaded, scheduling callback at\", now());\n return null;\n }\n if (js_urls == null || js_urls.length === 0) {\n run_callbacks();\n return null;\n }\n console.debug(\"Bokeh: BokehJS not loaded, scheduling load and callback at\", now());\n root._bokeh_is_loading = css_urls.length + js_urls.length;\n\n function on_load() {\n root._bokeh_is_loading--;\n if (root._bokeh_is_loading === 0) {\n console.debug(\"Bokeh: all BokehJS libraries/stylesheets loaded\");\n run_callbacks()\n }\n }\n\n function on_error(url) {\n console.error(\"failed to load \" + url);\n }\n\n for (let i = 0; i < css_urls.length; i++) {\n const url = css_urls[i];\n const element = document.createElement(\"link\");\n element.onload = on_load;\n element.onerror = on_error.bind(null, url);\n element.rel = \"stylesheet\";\n element.type = \"text/css\";\n element.href = url;\n console.debug(\"Bokeh: injecting link tag for BokehJS stylesheet: \", url);\n document.body.appendChild(element);\n }\n\n for (let i = 0; i < js_urls.length; i++) {\n const url = js_urls[i];\n const element = 
document.createElement('script');\n element.onload = on_load;\n element.onerror = on_error.bind(null, url);\n element.async = false;\n element.src = url;\n console.debug(\"Bokeh: injecting script tag for BokehJS library: \", url);\n document.head.appendChild(element);\n }\n };\n\n function inject_raw_css(css) {\n const element = document.createElement(\"style\");\n element.appendChild(document.createTextNode(css));\n document.body.appendChild(element);\n }\n\n const js_urls = [\"https://cdn.bokeh.org/bokeh/release/bokeh-3.3.0.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-gl-3.3.0.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-widgets-3.3.0.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-tables-3.3.0.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-mathjax-3.3.0.min.js\"];\n const css_urls = [];\n\n const inline_js = [ function(Bokeh) {\n Bokeh.set_log_level(\"info\");\n },\nfunction(Bokeh) {\n }\n ];\n\n function run_inline_js() {\n if (root.Bokeh !== undefined || force === true) {\n for (let i = 0; i < inline_js.length; i++) {\n inline_js[i].call(root, root.Bokeh);\n }\nif (force === true) {\n display_loaded();\n }} else if (Date.now() < root._bokeh_timeout) {\n setTimeout(run_inline_js, 100);\n } else if (!root._bokeh_failed_load) {\n console.log(\"Bokeh: BokehJS failed to load within specified timeout.\");\n root._bokeh_failed_load = true;\n } else if (force !== true) {\n const cell = $(document.getElementById(\"c92e22c1-acc6-4a9b-8a5a-529fec6e60ae\")).parents('.cell').data().cell;\n cell.output_area.append_execute_result(NB_LOAD_WARNING)\n }\n }\n\n if (root._bokeh_is_loading === 0) {\n console.debug(\"Bokeh: BokehJS loaded, going straight to plotting\");\n run_inline_js();\n } else {\n load_libs(css_urls, js_urls, function() {\n console.debug(\"Bokeh: BokehJS plotting callback run at\", now());\n run_inline_js();\n });\n }\n}(window));", + "application/vnd.bokehjs_load.v0+json": "" + }, + "metadata": {}, + "output_type": 
"display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "24/04/14 16:03:28 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Setting default log level to \"WARN\".\n", + "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n", + "pip-installed Hail requires additional configuration options in Spark referring\n", + " to the path to the Hail Python module directory HAIL_DIR,\n", + " e.g. /path/to/python/site-packages/hail:\n", + " spark.jars=HAIL_DIR/backend/hail-all-spark.jar\n", + " spark.driver.extraClassPath=HAIL_DIR/backend/hail-all-spark.jar\n", + " spark.executor.extraClassPath=./hail-all-spark.jarRunning on Apache Spark version 3.3.4\n", + "SparkUI available at http://192.168.0.232:4040\n", + "Welcome to\n", + " __ __ <>__\n", + " / /_/ /__ __/ /\n", + " / __ / _ `/ / /\n", + " /_/ /_/\\_,_/_/_/ version 0.2.127-bb535cd096c5\n", + "LOGGING: writing to /dev/null\n" + ] + } + ], + "source": [ + "import os\n", + "import hail as hl\n", + "import pyspark.sql.functions as f\n", + "import pandas as pd\n", + "pd.set_option('display.max_colwidth', None)\n", + "pd.set_option('display.expand_frame_repr', False)\n", + "\n", + "from gentropy.common.session import Session\n", + "from gentropy.dataset.study_index import StudyIndex\n", + "\n", + "\n", + "hail_dir = os.path.dirname(hl.__file__)\n", + "session = Session(hail_home=hail_dir, start_hail=True, extended_spark_conf={\"spark.driver.memory\": \"12g\",\n", + " \"spark.kryoserializer.buffer.max\": \"500m\",\"spark.driver.maxResultSize\":\"3g\"})\n", + "hl.init(sc=session.spark.sparkContext, log=\"/dev/null\")" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + 
"path_si=\"gs://genetics_etl_python_playground/releases/24.03/study_index/finngen/study_index\"\n", + "path_si_old=\"gs://genetics-portal-dev-analysis/yt4/study_index.csv\"" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + } + ], + "source": [ + "si_old=session.spark.read.csv(path_si_old, header=True,sep=\"\\t\")\n", + "si_new=StudyIndex.from_parquet(session=session, path=path_si)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+-------------------+--------------------+-------+---------+-------------+----+----------+----------+-----------+---------+------------+--------------+-------+--------------------+-----------------+--------------------+\n", + "| study_id| ancestry_initial|ancestry_replication|n_cases|n_initial|n_replication|pmid|pub_author| pub_date|pub_journal|pub_title|has_sumstats|num_assoc_loci| source| trait_reported| trait_efos| trait_category|\n", + "+--------------------+-------------------+--------------------+-------+---------+-------------+----+----------+----------+-----------+---------+------------+--------------+-------+--------------------+-----------------+--------------------+\n", + "|FINNGEN_R6_M13_MU...|['European=253458']| []| 108.0| 253458| 0.0|null|FINNGEN_R6|2022-01-24| null| null| True| 0|FINNGEN|Multifocal fibros...|['MONDO_0009230']|immune system dis...|\n", + "|FINNGEN_R6_M13_MU...|['European=199528']| []| 1804.0| 199528| 0.0|null|FINNGEN_R6|2022-01-24| null| null| True| 0|FINNGEN|Disorders of muscles| ['EFO_0002970']|musculoskeletal o...|\n", + "|FINNGEN_R6_M13_MU...|['European=197821']| []| 97.0| 197821| 0.0|null|FINNGEN_R6|2022-01-24| null| null| True| 0|FINNGEN|\"\"\"Muscle wasting...| ['EFO_0009851']| biological process|\n", + "|FINNGEN_R6_M13_MU...|['European=198253']| []| 
529.0| 198253| 0.0|null|FINNGEN_R6|2022-01-24| null| null| True| 0|FINNGEN|Other specified d...| ['EFO_0002970']|musculoskeletal o...|\n", + "|FINNGEN_R6_M13_MU...|['European=198179']| []| 455.0| 198179| 0.0|null|FINNGEN_R6|2022-01-24| null| null| True| 1|FINNGEN| Muscle strain| ['EFO_0010686']|injury, poisoning...|\n", + "+--------------------+-------------------+--------------------+-------+---------+-------------+----+----------+----------+-----------+---------+------------+--------------+-------+--------------------+-----------------+--------------------+\n", + "only showing top 5 rows\n", + "\n" + ] + } + ], + "source": [ + "si_old.show(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+-----------+---------+--------------------+------------------------+------+------------------+--------+----------------+----------------------+---------------+------------------+----------------------------------+--------------------+------+---------+--------+---------+---------------------+-------------------+------------------+---------------+-------------+--------------------+-----------+\n", + "| studyId| projectId|studyType| traitFromSource|traitFromSourceMappedIds|geneId|tissueFromSourceId|pubmedId|publicationTitle|publicationFirstAuthor|publicationDate|publicationJournal|backgroundTraitFromSourceMappedIds| initialSampleSize|nCases|nControls|nSamples| cohorts|ldPopulationStructure| discoverySamples|replicationSamples|qualityControls|analysisFlags|summarystatsLocation|hasSumstats|\n", + 
"+--------------------+-----------+---------+--------------------+------------------------+------+------------------+--------+----------------+----------------------+---------------+------------------+----------------------------------+--------------------+------+---------+--------+---------+---------------------+-------------------+------------------+---------------+-------------+--------------------+-----------+\n", + "|FINNGEN_R10_AB1_A...|FINNGEN_R10| gwas| Actinomycosis| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 101| 363227| 363328|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_A...|FINNGEN_R10| gwas| Amoebiasis| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 160| 367214| 367374|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_A...|FINNGEN_R10| gwas|Anogenital herpes...| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 1986| 400197| 402183|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_A...|FINNGEN_R10| gwas| Aspergillosis| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 211| 403213| 403424|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_A...|FINNGEN_R10| gwas|Atypical virus in...| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 282| 409849| 410131|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + 
"+--------------------+-----------+---------+--------------------+------------------------+------+------------------+--------+----------------+----------------------+---------------+------------------+----------------------------------+--------------------+------+---------+--------+---------+---------------------+-------------------+------------------+---------------+-------------+--------------------+-----------+\n", + "only showing top 5 rows\n", + "\n" + ] + } + ], + "source": [ + "si_new_df=si_new.df\n", + "si_new_df.show(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "57246\n", + "2408\n" + ] + } + ], + "source": [ + "print(si_old.count())\n", + "print(si_new_df.count())" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+-----------------+\n", + "| trait_reported| trait_efos|\n", + "+--------------------+-----------------+\n", + "|Multifocal fibros...|['MONDO_0009230']|\n", + "|Disorders of muscles| ['EFO_0002970']|\n", + "|\"\"\"Muscle wasting...| ['EFO_0009851']|\n", + "|Other specified d...| ['EFO_0002970']|\n", + "| Muscle strain| ['EFO_0010686']|\n", + "+--------------------+-----------------+\n", + "only showing top 5 rows\n", + "\n" + ] + } + ], + "source": [ + "si_old=si_old.select(\"trait_reported\",\"trait_efos\")\n", + "si_old.show(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "from pyspark.sql.functions import lower\n", + "\n", + "si_old = si_old.withColumn(\"trait_reported_low\", lower(si_old[\"trait_reported\"])).select(\"trait_reported_low\",\"trait_efos\")\n", + "si_new_df= si_new_df.withColumn(\"trait_reported_low\", lower(si_new_df[\"traitFromSource\"]))" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "data": { + "text/plain": [ + "2408" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "si_old = si_old.dropDuplicates(['trait_reported_low'])\n", + "joined_df = si_new_df.join(si_old, \"trait_reported_low\", how='left')\n", + "joined_df.count()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+--------------------+-----------+---------+--------------------+------------------------+------+------------------+--------+----------------+----------------------+---------------+------------------+----------------------------------+--------------------+------+---------+--------+---------+---------------------+-------------------+------------------+---------------+-------------+--------------------+-----------+--------------------+\n", + "| trait_reported_low| studyId| projectId|studyType| traitFromSource|traitFromSourceMappedIds|geneId|tissueFromSourceId|pubmedId|publicationTitle|publicationFirstAuthor|publicationDate|publicationJournal|backgroundTraitFromSourceMappedIds| initialSampleSize|nCases|nControls|nSamples| cohorts|ldPopulationStructure| discoverySamples|replicationSamples|qualityControls|analysisFlags|summarystatsLocation|hasSumstats| trait_efos|\n", + 
"+--------------------+--------------------+-----------+---------+--------------------+------------------------+------+------------------+--------+----------------+----------------------+---------------+------------------+----------------------------------+--------------------+------+---------+--------+---------+---------------------+-------------------+------------------+---------------+-------------+--------------------+-----------+--------------------+\n", + "| actinomycosis|FINNGEN_R10_AB1_A...|FINNGEN_R10| gwas| Actinomycosis| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 101| 363227| 363328|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true| null|\n", + "| amoebiasis|FINNGEN_R10_AB1_A...|FINNGEN_R10| gwas| Amoebiasis| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 160| 367214| 367374|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true| ['EFO_0007144']|\n", + "|anogenital herpes...|FINNGEN_R10_AB1_A...|FINNGEN_R10| gwas|Anogenital herpes...| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 1986| 400197| 402183|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true| ['EFO_0007282']|\n", + "| aspergillosis|FINNGEN_R10_AB1_A...|FINNGEN_R10| gwas| Aspergillosis| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 211| 403213| 403424|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true| ['EFO_0007157']|\n", + "|atypical virus in...|FINNGEN_R10_AB1_A...|FINNGEN_R10| gwas|Atypical virus in...| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 282| 409849| 410131|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true| ['MONDO_0024318']|\n", + "|bacterial infecti...|FINNGEN_R10_AB1_B...|FINNGEN_R10| gwas|Bacterial infecti...| null| null| null| null| null| null| 
null| null| null|377,277 (210,870 ...| 20226| 363227| 383453|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true| null|\n", + "|bacterial, viral ...|FINNGEN_R10_AB1_B...|FINNGEN_R10| gwas|Bacterial, viral ...| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 2852| 409329| 412181|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true| null|\n", + "|other bacterial i...|FINNGEN_R10_AB1_B...|FINNGEN_R10| gwas|Other bacterial i...| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 6145| 367214| 373359|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true| ['EFO_0000771']|\n", + "| candidiasis|FINNGEN_R10_AB1_C...|FINNGEN_R10| gwas| Candidiasis| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 4306| 403213| 407519|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true| ['MONDO_0002026']|\n", + "|other sexually tr...|FINNGEN_R10_AB1_C...|FINNGEN_R10| gwas|Other sexually tr...| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 2186| 400197| 402383|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|['MONDO_0021681',...|\n", + "| cholera|FINNGEN_R10_AB1_C...|FINNGEN_R10| gwas| Cholera| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 1385| 367214| 368599|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true| ['EFO_1001235']|\n", + "|dengue fever [cla...|FINNGEN_R10_AB1_D...|FINNGEN_R10| gwas|Dengue fever [cla...| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 53| 409137| 409190|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true| null|\n", + "| dermatophytosis|FINNGEN_R10_AB1_D...|FINNGEN_R10| gwas| Dermatophytosis| null| null| null| null| null| null| null| null| null|377,277 
(210,870 ...| 3921| 403213| 407134|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true| ['MONDO_0004678']|\n", + "| early syphilis|FINNGEN_R10_AB1_E...|FINNGEN_R10| gwas| Early syphilis| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 308| 400197| 400505|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true| ['EFO_0007504']|\n", + "|infectious mononu...| FINNGEN_R10_AB1_EBV|FINNGEN_R10| gwas|Infectious mononu...| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 2979| 400974| 403953|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true| ['EFO_0007326']|\n", + "| enterobiasis|FINNGEN_R10_AB1_E...|FINNGEN_R10| gwas| Enterobiasis| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 112| 411658| 411770|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true| null|\n", + "| erysipelas|FINNGEN_R10_AB1_E...|FINNGEN_R10| gwas| Erysipelas| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 22261| 363227| 385488|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true| ['EFO_1001462']|\n", + "|diarrhoea and gas...|FINNGEN_R10_AB1_G...|FINNGEN_R10| gwas|Diarrhoea and gas...| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 32210| 367214| 399424|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true| ['MONDO_0045031']|\n", + "|gonococcal infection|FINNGEN_R10_AB1_G...|FINNGEN_R10| gwas|Gonococcal infection| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 954| 400197| 401151|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true| ['DOID_7551']|\n", + "| helminthiases|FINNGEN_R10_AB1_H...|FINNGEN_R10| gwas| Helminthiases| null| null| null| null| null| null| null| null| null|377,277 (210,870 
...| 523| 411658| 412181|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|['EFO_0007245', '...|\n", + "+--------------------+--------------------+-----------+---------+--------------------+------------------------+------+------------------+--------+----------------+----------------------+---------------+------------------+----------------------------------+--------------------+------+---------+--------+---------+---------------------+-------------------+------------------+---------------+-------------+--------------------+-----------+--------------------+\n", + "only showing top 20 rows\n", + "\n" + ] + } + ], + "source": [ + "joined_df.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1542\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Stage 22:===========================================> (3 + 1) / 4]\r" + ] + } + ], + "source": [ + "num_non_null_rows = joined_df.filter(joined_df.trait_efos.isNotNull()).count()\n", + "print(num_non_null_rows)" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+------------+---------+---------+--------------------+------------------------+------+------------------+--------+--------------------+----------------------+---------------+--------------------+----------------------------------+--------------------+------+---------+--------+--------------------+---------------------+--------------------+--------------------+---------------+-------------+--------------------+-----------+\n", + "| studyId|projectId|studyType| traitFromSource|traitFromSourceMappedIds|geneId|tissueFromSourceId|pubmedId| publicationTitle|publicationFirstAuthor|publicationDate| 
publicationJournal|backgroundTraitFromSourceMappedIds| initialSampleSize|nCases|nControls|nSamples| cohorts|ldPopulationStructure| discoverySamples| replicationSamples|qualityControls|analysisFlags|summarystatsLocation|hasSumstats|\n", + "+------------+---------+---------+--------------------+------------------------+------+------------------+--------+--------------------+----------------------+---------------+--------------------+----------------------------------+--------------------+------+---------+--------+--------------------+---------------------+--------------------+--------------------+---------------+-------------+--------------------+-----------+\n", + "| GCST000102| GCST| gwas|Endothelial funct...| [EFO_0004298]| null| null|17903301|Genome-wide assoc...| Vasan RS| 2007-09-19| BMC Med Genet| null|Up to 1,238 Europ...| 0| 0| 1238| [null]| [{nfe, 1.0}]| [{1238, European}]| []| null| null| null| false|\n", + "| GCST000272| GCST| gwas| Height| [EFO_0004339]| null| null|19030899|Genome-wide assoc...| Lei SF| 2008-11-23| Hum Genet| null|618 Chinese ances...| 0| 0| 618| [null]| [{eas, 1.0}]| [{618, East Asian}]|[{2953, East Asian}]| null| null| null| false|\n", + "| GCST000436| GCST| gwas|Acenocoumarol mai...| [GO_0061476]| null| null|19578179|A genome-wide ass...| Teichert M| 2009-07-04| Hum Mol Genet| null|1,451 European an...| 0| 0| 1451| [null]| [{nfe, 1.0}]| [{1451, European}]| [{287, NR}]| null| null| null| false|\n", + "| GCST000514| GCST| gwas|Response to antip...| [GO_0097332]| null| null|19875103|Genomewide associ...| Aberg K| 2009-10-27| Biol Psychiatry| null|421 European ance...| 738| 0| 738| [null]| [{afr, 0.28997289...|[{214, African Am...| []| null| null| null| false|\n", + "| GCST000550| GCST| gwas| Metabolite levels| [EFO_0004725]| null| null|20037589|A genome-wide per...| Illig T| 2009-12-27| Nat Genet| null|1,029 European an...| 0| 0| 1029| [null]| [{nfe, 1.0}]| [{1029, European}]| [{1202, European}]| null| null| null| false|\n", + "| 
GCST000708| GCST| gwas| Freckling| [EFO_0003963]| null| null|20585627|Web-based, partic...| Eriksson N| 2010-06-24| PLoS Genet| null|9,126 European an...| 0| 0| 9126| [null]| [{nfe, 1.0}]| [{9126, European}]| []| null| null| null| false|\n", + "| GCST000754| GCST| gwas|Personality dimen...| [EFO_0004365]| null| null|20691247|A genome-wide ass...| Verweij KJ| 2010-08-04| Biol Psychol| null|5,117 European an...| 0| 0| 5117| [null]| [{nfe, 1.0}]| [{5117, European}]| []| null| null| null| false|\n", + "| GCST000880| GCST| gwas|Menarche (age at ...| [EFO_0004703]| null| null|21102462|Thirty new loci f...| Elks CE| 2010-11-21| Nat Genet| null|86,142 European a...| 0| 0| 87802| [null]| [{nfe, 1.0}]| [{87802, European}]| [{14731, European}]| null| null| null| false|\n", + "| GCST001031| GCST| gwas|Large B-cell lymp...| [EFO_0000403]| null| null|21471979|Common variants o...| Kumar V| 2011-04-07| J Hum Genet| null|74 Japanese ances...| 74| 934| 1008| [null]| [{eas, 1.0}]|[{1008, East Asian}]|[{3634, East Asian}]| null| null| null| false|\n", + "| GCST001032| GCST| gwas|Caffeine consumption| [EFO_0004330]| null| null|21490707|Genome-wide meta-...| Cornelis MC| 2011-04-07| PLoS Genet| null|47,431 European a...| 0| 0| 47431| [null]| [{nfe, 1.0}]| [{47431, European}]| []| null| null| null| false|\n", + "| GCST001059| GCST| gwas| Neutrophil count| [EFO_0004833]| null| null|21507922|Duffy-null-associ...| Ramsuran V| 2011-05-01| Clin Infect Dis| null|115 African ances...| 0| 0| 115| [null]| [{afr, 1.0}]|[{115, Sub-Sahara...| []| null| null| null| false|\n", + "| GCST002187| GCST| gwas|Systolic blood pr...| [EFO_0006335]| null| null|24058526|Genome-wide meta-...| Bhatnagar P| 2013-09-13| PLoS One| [MONDO_0011382]|1617 African Amer...| 1617| 0| 1617| [null]| [{afr, 1.0}]|[{1617, African A...| []| null| null| null| false|\n", + "| GCST002623| GCST| gwas| L-arginine levels| [EFO_0006524]| null| null|25245031|Genome-wide assoc...| Luneburg N| 2014-09-21|Circ Cardiovasc G...| null|3,747 
European an...| 0| 0| 6739| [null]| [{nfe, 1.0}]|[{3747, European}...| [{1159, European}]| null| null| null| false|\n", + "| GCST003261| GCST| gwas|Ischemic stroke (...| [HP_0002140]| null| null|26708676|Loci associated w...| Pulit SL| 2015-12-18| Lancet Neurol| null|up to 8,062 Europ...| 9510| 32473| 41983| [null]| [{amr, 0.06647928...|[{2791, Hispanic ...|[{256, African Am...| null| null| null| false|\n", + "| GCST003427| GCST| gwas|Alzheimer disease...| [EFO_0004847, MON...| null| null|26830138|Family-based asso...| Herold C| 2016-02-02| Mol Psychiatry| null|2,478 European an...| 2478| 979| 3457| [null]| [{nfe, 1.0}]| [{3524, European}]| []| null| null| null| false|\n", + "| GCST003665| GCST| gwas|Free cholesterol ...| [EFO_0004611, EFO...| null| null|27005778|Genome-wide study...| Kettunen J| 2016-03-23| Nat Commun| null|21,555 European a...| 0| 0| 21555|[EGCUT, ERF, FTC,...| [{nfe, 1.0}]| [{21555, European}]| []| null| null| null| false|\n", + "| GCST003773| GCST| gwas|Loneliness (multi...| [EFO_0007865]| null| null|27629369|Genome-Wide Assoc...| Gao J| 2016-09-15|Neuropsychopharma...| null|8,490 European an...| 0| 0| 10760| [null]| [{nfe, 0.80529739...|[{8490, European}...| []| null| null| null| false|\n", + "| GCST003791| GCST| gwas|Response to metfo...| [EFO_0006952, GO_...| null| null|28173075|Metformin pharmac...| Niu N| 2016-09-11| Hum Mol Genet| null|up to 96 African ...| 0| 0| 288| [null]| [{afr, 0.33333333...|[{96, African Ame...| []| null| null| null| false|\n", + "| GCST003824| GCST| gwas|Depression in res...| [EFO_0007006, EFO...| null| null|27723809|Genome-Wide Assoc...| Matsunami K| 2016-10-10| PLoS One| [EFO_0004220]|45 Japanese ances...| 45| 179| 224| [null]| [{eas, 1.0}]| [{224, East Asian}]| [{160, East Asian}]| null| null| null| false|\n", + "| GCST003837| GCST| gwas| Chronotype| [EFO_0004354]| null| null|27494321|Genome-Wide Assoc...| Jones SE| 2016-08-05| PLoS Genet| null|127,898 British i...| 0| 0| 127898| [null]| [{nfe, 1.0}]|[{127898, 
European}]| [{89283, NR}]| []| []|ftp://ftp.ebi.ac....| true|\n", + "| GCST004678| GCST| gwas|Psychosis pronene...| [EFO_0008337]| null| null|28525603|Genome-Wide Assoc...| Ortega-Alonso A| 2017-05-19| Schizophr Bull| null|3,967 Finnish anc...| 0| 0| 3967| [null]| [{nfe, 1.0}]| [{3967, European}]| []| null| null| null| false|\n", + "| GCST005189| GCST| gwas| Tanning| [EFO_0004279]| null| null|29195075|An Unexpectedly C...| Martin AR| 2017-11-30| Cell| null|216 Sub-Saharan A...| 0| 0| 216| [null]| [{afr, 1.0}]|[{216, Sub-Sahara...|[{240, Sub-Sahara...| null| null| null| false|\n", + "| GCST005437| GCST| gwas|Random C-peptide ...| [EFO_0005187]| null| null|29404672|Meta-genome-wide ...| Roshandel D| 2018-02-05| Diabetologia| [MONDO_0005147]|1,497 European an...| 0| 0| 1497| [null]| [{nfe, 1.0}]| [{1497, European}]| []| null| null| null| false|\n", + "| GCST005503| GCST| gwas|Medium HDL partic...| [EFO_0004612]| null| null|29084231|Common, low-frequ...| Davis JP| 2017-10-30| PLoS Genet| null|8,372 Finnish anc...| 0| 0| 8372| [null]| [{nfe, 1.0}]| [{8372, European}]| []| null| null| null| false|\n", + "| GCST005669| GCST| gwas|Delta-6 desaturas...| [EFO_0007765, EFO...| null| null|29246731|A common variant ...| de Toro-Martin J| 2017-11-02| J Clin Lipidol| null|81 extreme respon...| 0| 0| 141| [null]| [{nfe, 1.0}]| [{141, NR}]| []| null| null| null| false|\n", + "| GCST005749| GCST| gwas|Digit length rati...| [EFO_0004841]| null| null|29659830|Genome-wide assoc...| Warrington NM| 2018-04-12| Hum Mol Genet| null|14,382 European a...| 0| 0| 15661| [null]| [{nfe, 1.0}]|[{14382, European...| []| null| null| null| false|\n", + "| GCST006420| GCST| gwas|Affective disorde...| [EFO_0004247, EFO...| null| null|30116032|Genetics of suici...| Erlangsen A| 2018-08-16| Mol Psychiatry| null|4,302 European an...| 4302| 13294| 17596| [null]| [{nfe, 1.0}]| [{17596, European}]| []| null| null| null| false|\n", + "| GCST006484| GCST| gwas| Type 2 diabetes| [MONDO_0005148]| null| 
null|30130595|Pilot genome-wide...| Dominguez-Cruz MG| 2018-08-18| Gene| null|45 Maya ancestry ...| 45| 47| 92| [null]| [{amr, 1.0}]|[{92, Native Amer...| []| null| null| null| false|\n", + "| GCST006496| GCST| gwas|Glomerular filtra...| [EFO_0006829, EFO...| null| null|30160337|Genome Wide Assoc...| Asleh R| 2018-08-30| Clin Transplant| null|243 European ance...| 0| 0| 251| [null]| [{nfe, 0.99601593...|[{243, European},...| []| null| null| null| false|\n", + "| GCST006739| GCST| gwas|Proportion of mis...| [EFO_0006923]| null| null|30188897|Detecting past an...| Jeong C| 2018-09-06| PLoS Genet| null|981 Tibetan ances...| 0| 0| 981| [null]| [{nfe, 1.0}]| [{981, NR}]| []| null| null| null| false|\n", + "| GCST006907| GCST| gwas|Ischemic stroke (...| [EFO_0005524]| null| null|29531354|Multiancestry gen...| Malik R| 2018-03-12| Nat Genet| null|4,373 European an...| 4373| 406111| 410484| [null]| [{nfe, 1.0}]|[{150765, European}]| []| []| []|ftp://ftp.ebi.ac....| true|\n", + "| GCST006960| GCST| gwas|Inflammatory bowe...| [EFO_0003767]| null| null|26490195|Inherited determi...| Cleynen I| 2015-10-18| Lancet| null|16,902 European a...| 29838| 0| 29838| [null]| [{nfe, 1.0}]| [{29838, European}]| [{6182, European}]| null| null| null| false|\n", + "| GCST007217| GCST| gwas|RR interval (hear...| [EFO_0004831]| null| null|30679814|Genome-wide assoc...| van Setten J| 2019-01-24| Eur J Hum Genet| null|2,006 Erasmus Ruc...| 0| 0| 28698| [null]| [{nfe, 1.0}]| [{28698, European}]| []| null| null| null| false|\n", + "| GCST008154| GCST| gwas| Trunk fat mass| [EFO_0005409]| null| null|28552196|Whole-Genome Sequ...| Tachmazidou I| 2017-06-01| Am J Hum Genet| null|3,399 whole genom...| 0| 0| 16237| [null]| [{nfe, 1.0}]|[{3538, NR}, {128...| [{10667, European}]| null| null| null| false|\n", + "| GCST008483| GCST| gwas| Ulcerative colitis| [EFO_0000729]| null| null|26398853|Identification of...| Ye BD| 2016-01-01| Inflamm Bowel Dis| null|705 Korean ancest...| 705| 1178| 1883| [null]| 
[{eas, 1.0}]|[{1883, South Asi...|[{3674, South Asi...| null| null| null| false|\n", + "| GCST008671| GCST| gwas|Phlegm x occupati...| [EFO_0007939, EFO...| null| null|30449631|Genome-wide inter...| Zeng X| 2018-11-15| Environ Int| null|1,702 Dutch ances...| 1702| 6274| 7976| [null]| [{nfe, 1.0}]| [{7976, European}]| [{6789, European}]| null| null| null| false|\n", + "| GCST008675| GCST| gwas|Maximum habitual ...| [EFO_0007878]| null| null|31151762|Genome-wide Assoc...| Gelernter J| 2019-04-08| Biol Psychiatry| null|126,936 European ...| 0| 0| 143965| [null]| [{afr, 0.11828569...|[{17029, African ...| []| null| null| null| false|\n", + "| GCST008775| GCST| gwas|Birth weight or w...| [EFO_0004342, EFO...| null| null|30858448|Genetic overlap b...| Tekola-Ayele F| 2019-03-11| Sci Rep| null|153,781 European ...| 0| 0| 378240| [null]| [{nfe, 1.0}]|[{246502, Europea...| []| null| null| null| false|\n", + "| GCST008870| GCST| gwas|Keratinocyte canc...| [EFO_0010176]| null| null|31174203|Combined analysis...| Liyanage UE| 2019-06-07| Hum Mol Genet| null|at least 18,538 E...| 18538| 340302| 358840| [null]| [{nfe, 1.0}]|[{358840, European}]| []| null| null| null| false|\n", + "| GCST009173| GCST| gwas|Response to (pegy...| [EFO_0007859]| null| null|30715261|Genome Wide Assoc...| Brouwer WP| 2019-02-02| Clin Infect Dis| [EFO_0004239]|121 Asian, Europe...| 0| 0| 509| [null]| [{nfe, 0.5}, {afr...|[{127, European},...| []| null| null| null| false|\n", + "| GCST009364| GCST| gwas|Triglyceride leve...| [EFO_0004530, EFO...| null| null|31719535|Multi-ancestry sl...| Noordam R| 2019-11-12| Nat Commun| null|at least 2,926 Af...| 0| 49886| 61990| [null]| [{eas, 0.03837715...|[{2096, East Asia...|[{12579, Hispanic...| null| null| null| false|\n", + "| GCST009391| GCST| gwas|Metabolite levels...| [EFO_0005132]| null| null|23823483|A genome-wide ass...| Rhee EP| 2013-07-02| Cell Metab| null|2,076 European an...| 0| 0| 2076| [null]| [{nfe, 1.0}]| [{2076, European}]| []| null| null| null| 
false|\n", + "|GCST009391_2| GCST| gwas| Metabolite levels| []| null| null|23823483|A genome-wide ass...| Rhee EP| 2013-07-02| Cell Metab| null|2,076 European an...| 0| 0| 2076| [null]| [{nfe, 1.0}]| [{2076, European}]| []| null| null| null| false|\n", + "|GCST009391_3| GCST| gwas| Metabolite levels| [EFO_0004468, EFO...| null| null|23823483|A genome-wide ass...| Rhee EP| 2013-07-02| Cell Metab| null|2,076 European an...| 0| 0| 2076| [null]| [{nfe, 1.0}]| [{2076, European}]| []| null| null| null| false|\n", + "|GCST009391_4| GCST| gwas| Metabolite levels| [EFO_0004518]| null| null|23823483|A genome-wide ass...| Rhee EP| 2013-07-02| Cell Metab| null|2,076 European an...| 0| 0| 2076| [null]| [{nfe, 1.0}]| [{2076, European}]| []| null| null| null| false|\n", + "|GCST009391_5| GCST| gwas| Metabolite levels| [EFO_0004761]| null| null|23823483|A genome-wide ass...| Rhee EP| 2013-07-02| Cell Metab| null|2,076 European an...| 0| 0| 2076| [null]| [{nfe, 1.0}]| [{2076, European}]| []| null| null| null| false|\n", + "|GCST009391_6| GCST| gwas| Metabolite levels| [EFO_0004846]| null| null|23823483|A genome-wide ass...| Rhee EP| 2013-07-02| Cell Metab| null|2,076 European an...| 0| 0| 2076| [null]| [{nfe, 1.0}]| [{2076, European}]| []| null| null| null| false|\n", + "|GCST009391_7| GCST| gwas| Metabolite levels| [EFO_0005001]| null| null|23823483|A genome-wide ass...| Rhee EP| 2013-07-02| Cell Metab| null|2,076 European an...| 0| 0| 2076| [null]| [{nfe, 1.0}]| [{2076, European}]| []| null| null| null| false|\n", + "|GCST009391_8| GCST| gwas| Metabolite levels| [EFO_0005002]| null| null|23823483|A genome-wide ass...| Rhee EP| 2013-07-02| Cell Metab| null|2,076 European an...| 0| 0| 2076| [null]| [{nfe, 1.0}]| [{2076, European}]| []| null| null| null| false|\n", + "|GCST009391_9| GCST| gwas| Metabolite levels| [EFO_0005058]| null| null|23823483|A genome-wide ass...| Rhee EP| 2013-07-02| Cell Metab| null|2,076 European an...| 0| 0| 2076| [null]| [{nfe, 1.0}]| [{2076, European}]| 
[]| null| null| null| false|\n", + "+------------+---------+---------+--------------------+------------------------+------+------------------+--------+--------------------+----------------------+---------------+--------------------+----------------------------------+--------------------+------+---------+--------+--------------------+---------------------+--------------------+--------------------+---------------+-------------+--------------------+-----------+\n", + "only showing top 50 rows\n", + "\n" + ] + } + ], + "source": [ + "path_tmp=\"gs://gwas_catalog_data/study_index\"\n", + "tmp=StudyIndex.from_parquet(session=session, path=path_tmp)\n", + "tmp.df.show(50)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "joined_df=joined_df.withColumn(\"traitFromSourceMappedIds\",joined_df[\"trait_efos\"]).drop(\"trait_efos\",\"trait_reported_low\")" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+-----------+---------+--------------------+------------------------+------+------------------+--------+----------------+----------------------+---------------+------------------+----------------------------------+--------------------+------+---------+--------+---------+---------------------+-------------------+------------------+---------------+-------------+--------------------+-----------+\n", + "| studyId| projectId|studyType| traitFromSource|traitFromSourceMappedIds|geneId|tissueFromSourceId|pubmedId|publicationTitle|publicationFirstAuthor|publicationDate|publicationJournal|backgroundTraitFromSourceMappedIds| initialSampleSize|nCases|nControls|nSamples| cohorts|ldPopulationStructure| discoverySamples|replicationSamples|qualityControls|analysisFlags|summarystatsLocation|hasSumstats|\n", + 
"+--------------------+-----------+---------+--------------------+------------------------+------+------------------+--------+----------------+----------------------+---------------+------------------+----------------------------------+--------------------+------+---------+--------+---------+---------------------+-------------------+------------------+---------------+-------------+--------------------+-----------+\n", + "|FINNGEN_R10_AB1_A...|FINNGEN_R10| gwas| Actinomycosis| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 101| 363227| 363328|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_A...|FINNGEN_R10| gwas| Amoebiasis| ['EFO_0007144']| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 160| 367214| 367374|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_A...|FINNGEN_R10| gwas|Anogenital herpes...| ['EFO_0007282']| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 1986| 400197| 402183|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_A...|FINNGEN_R10| gwas| Aspergillosis| ['EFO_0007157']| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 211| 403213| 403424|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_A...|FINNGEN_R10| gwas|Atypical virus in...| ['MONDO_0024318']| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 282| 409849| 410131|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_B...|FINNGEN_R10| gwas|Bacterial infecti...| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 20226| 363227| 383453|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + 
"|FINNGEN_R10_AB1_B...|FINNGEN_R10| gwas|Bacterial, viral ...| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 2852| 409329| 412181|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_B...|FINNGEN_R10| gwas|Other bacterial i...| ['EFO_0000771']| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 6145| 367214| 373359|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_C...|FINNGEN_R10| gwas| Candidiasis| ['MONDO_0002026']| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 4306| 403213| 407519|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_C...|FINNGEN_R10| gwas|Other sexually tr...| ['MONDO_0021681',...| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 2186| 400197| 402383|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_C...|FINNGEN_R10| gwas| Cholera| ['EFO_1001235']| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 1385| 367214| 368599|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_D...|FINNGEN_R10| gwas|Dengue fever [cla...| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 53| 409137| 409190|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_D...|FINNGEN_R10| gwas| Dermatophytosis| ['MONDO_0004678']| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 3921| 403213| 407134|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_E...|FINNGEN_R10| gwas| Early syphilis| ['EFO_0007504']| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 308| 400197| 400505|[FinnGen]| 
[{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "| FINNGEN_R10_AB1_EBV|FINNGEN_R10| gwas|Infectious mononu...| ['EFO_0007326']| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 2979| 400974| 403953|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_E...|FINNGEN_R10| gwas| Enterobiasis| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 112| 411658| 411770|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_E...|FINNGEN_R10| gwas| Erysipelas| ['EFO_1001462']| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 22261| 363227| 385488|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_G...|FINNGEN_R10| gwas|Diarrhoea and gas...| ['MONDO_0045031']| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 32210| 367214| 399424|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_G...|FINNGEN_R10| gwas|Gonococcal infection| ['DOID_7551']| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 954| 400197| 401151|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_H...|FINNGEN_R10| gwas| Helminthiases| ['EFO_0007245', '...| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 523| 411658| 412181|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + 
"+--------------------+-----------+---------+--------------------+------------------------+------+------------------+--------+----------------+----------------------+---------------+------------------+----------------------------------+--------------------+------+---------+--------+---------+---------------------+-------------------+------------------+---------------+-------------+--------------------+-----------+\n", + "only showing top 20 rows\n", + "\n" + ] + } + ], + "source": [ + "joined_df.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "string\n" + ] + } + ], + "source": [ + "column_type = dict(joined_df.dtypes)[\"traitFromSourceMappedIds\"]\n", + "print(column_type)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "from pyspark.sql.functions import from_json\n", + "from pyspark.sql.types import ArrayType, StringType\n", + "\n", + "# Assuming joined_df is your DataFrame\n", + "joined_df = joined_df.withColumn(\n", + " \"traitFromSourceMappedIds\",\n", + " from_json(\"traitFromSourceMappedIds\", ArrayType(StringType()))\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+-----------+---------+--------------------+------------------------+------+------------------+--------+----------------+----------------------+---------------+------------------+----------------------------------+--------------------+------+---------+--------+---------+---------------------+-------------------+------------------+---------------+-------------+--------------------+-----------+\n", + "| studyId| projectId|studyType| 
traitFromSource|traitFromSourceMappedIds|geneId|tissueFromSourceId|pubmedId|publicationTitle|publicationFirstAuthor|publicationDate|publicationJournal|backgroundTraitFromSourceMappedIds| initialSampleSize|nCases|nControls|nSamples| cohorts|ldPopulationStructure| discoverySamples|replicationSamples|qualityControls|analysisFlags|summarystatsLocation|hasSumstats|\n", + "+--------------------+-----------+---------+--------------------+------------------------+------+------------------+--------+----------------+----------------------+---------------+------------------+----------------------------------+--------------------+------+---------+--------+---------+---------------------+-------------------+------------------+---------------+-------------+--------------------+-----------+\n", + "|FINNGEN_R10_AB1_A...|FINNGEN_R10| gwas| Actinomycosis| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 101| 363227| 363328|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_A...|FINNGEN_R10| gwas| Amoebiasis| [EFO_0007144]| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 160| 367214| 367374|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_A...|FINNGEN_R10| gwas|Anogenital herpes...| [EFO_0007282]| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 1986| 400197| 402183|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_A...|FINNGEN_R10| gwas| Aspergillosis| [EFO_0007157]| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 211| 403213| 403424|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_A...|FINNGEN_R10| gwas|Atypical virus in...| [MONDO_0024318]| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 282| 409849| 410131|[FinnGen]| 
[{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_B...|FINNGEN_R10| gwas|Bacterial infecti...| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 20226| 363227| 383453|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_B...|FINNGEN_R10| gwas|Bacterial, viral ...| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 2852| 409329| 412181|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_B...|FINNGEN_R10| gwas|Other bacterial i...| [EFO_0000771]| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 6145| 367214| 373359|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_C...|FINNGEN_R10| gwas| Candidiasis| [MONDO_0002026]| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 4306| 403213| 407519|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_C...|FINNGEN_R10| gwas|Other sexually tr...| [MONDO_0021681, E...| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 2186| 400197| 402383|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_C...|FINNGEN_R10| gwas| Cholera| [EFO_1001235]| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 1385| 367214| 368599|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_D...|FINNGEN_R10| gwas|Dengue fever [cla...| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 53| 409137| 409190|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_D...|FINNGEN_R10| gwas| Dermatophytosis| [MONDO_0004678]| null| null| null| null| null| 
null| null| null|377,277 (210,870 ...| 3921| 403213| 407134|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_E...|FINNGEN_R10| gwas| Early syphilis| [EFO_0007504]| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 308| 400197| 400505|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "| FINNGEN_R10_AB1_EBV|FINNGEN_R10| gwas|Infectious mononu...| [EFO_0007326]| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 2979| 400974| 403953|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_E...|FINNGEN_R10| gwas| Enterobiasis| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 112| 411658| 411770|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_E...|FINNGEN_R10| gwas| Erysipelas| [EFO_1001462]| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 22261| 363227| 385488|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_G...|FINNGEN_R10| gwas|Diarrhoea and gas...| [MONDO_0045031]| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 32210| 367214| 399424|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_G...|FINNGEN_R10| gwas|Gonococcal infection| [DOID_7551]| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 954| 400197| 401151|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_H...|FINNGEN_R10| gwas| Helminthiases| [EFO_0007245, EFO...| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 523| 411658| 412181|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + 
"+--------------------+-----------+---------+--------------------+------------------------+------+------------------+--------+----------------+----------------------+---------------+------------------+----------------------------------+--------------------+------+---------+--------+---------+---------------------+-------------------+------------------+---------------+-------------+--------------------+-----------+\n", + "only showing top 20 rows\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + } + ], + "source": [ + "joined_df.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "array\n" + ] + } + ], + "source": [ + "column_type = dict(joined_df.dtypes)[\"traitFromSourceMappedIds\"]\n", + "print(column_type)" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "si=StudyIndex(_df=joined_df, _schema=StudyIndex.get_schema())" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+-----------+---------+--------------------+------------------------+------+------------------+--------+----------------+----------------------+---------------+------------------+----------------------------------+--------------------+------+---------+--------+---------+---------------------+-------------------+------------------+---------------+-------------+--------------------+-----------+\n", + "| studyId| projectId|studyType| traitFromSource|traitFromSourceMappedIds|geneId|tissueFromSourceId|pubmedId|publicationTitle|publicationFirstAuthor|publicationDate|publicationJournal|backgroundTraitFromSourceMappedIds| initialSampleSize|nCases|nControls|nSamples| 
cohorts|ldPopulationStructure| discoverySamples|replicationSamples|qualityControls|analysisFlags|summarystatsLocation|hasSumstats|\n", + "+--------------------+-----------+---------+--------------------+------------------------+------+------------------+--------+----------------+----------------------+---------------+------------------+----------------------------------+--------------------+------+---------+--------+---------+---------------------+-------------------+------------------+---------------+-------------+--------------------+-----------+\n", + "|FINNGEN_R10_AB1_A...|FINNGEN_R10| gwas| Actinomycosis| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 101| 363227| 363328|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_A...|FINNGEN_R10| gwas| Amoebiasis| [EFO_0007144]| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 160| 367214| 367374|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_A...|FINNGEN_R10| gwas|Anogenital herpes...| [EFO_0007282]| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 1986| 400197| 402183|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_A...|FINNGEN_R10| gwas| Aspergillosis| [EFO_0007157]| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 211| 403213| 403424|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_A...|FINNGEN_R10| gwas|Atypical virus in...| [MONDO_0024318]| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 282| 409849| 410131|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_B...|FINNGEN_R10| gwas|Bacterial infecti...| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 20226| 
363227| 383453|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_B...|FINNGEN_R10| gwas|Bacterial, viral ...| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 2852| 409329| 412181|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_B...|FINNGEN_R10| gwas|Other bacterial i...| [EFO_0000771]| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 6145| 367214| 373359|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_C...|FINNGEN_R10| gwas| Candidiasis| [MONDO_0002026]| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 4306| 403213| 407519|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_C...|FINNGEN_R10| gwas|Other sexually tr...| [MONDO_0021681, E...| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 2186| 400197| 402383|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_C...|FINNGEN_R10| gwas| Cholera| [EFO_1001235]| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 1385| 367214| 368599|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_D...|FINNGEN_R10| gwas|Dengue fever [cla...| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 53| 409137| 409190|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_D...|FINNGEN_R10| gwas| Dermatophytosis| [MONDO_0004678]| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 3921| 403213| 407134|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_E...|FINNGEN_R10| gwas| Early syphilis| [EFO_0007504]| 
null| null| null| null| null| null| null| null|377,277 (210,870 ...| 308| 400197| 400505|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "| FINNGEN_R10_AB1_EBV|FINNGEN_R10| gwas|Infectious mononu...| [EFO_0007326]| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 2979| 400974| 403953|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_E...|FINNGEN_R10| gwas| Enterobiasis| null| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 112| 411658| 411770|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_E...|FINNGEN_R10| gwas| Erysipelas| [EFO_1001462]| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 22261| 363227| 385488|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_G...|FINNGEN_R10| gwas|Diarrhoea and gas...| [MONDO_0045031]| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 32210| 367214| 399424|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_G...|FINNGEN_R10| gwas|Gonococcal infection| [DOID_7551]| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 954| 400197| 401151|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + "|FINNGEN_R10_AB1_H...|FINNGEN_R10| gwas| Helminthiases| [EFO_0007245, EFO...| null| null| null| null| null| null| null| null|377,277 (210,870 ...| 523| 411658| 412181|[FinnGen]| [{fin, 1.0}]|[{377277, Finnish}]| null| null| null|gs://finngen-publ...| true|\n", + 
"+--------------------+-----------+---------+--------------------+------------------------+------+------------------+--------+----------------+----------------------+---------------+------------------+----------------------------------+--------------------+------+---------+--------+---------+---------------------+-------------------+------------------+---------------+-------------+--------------------+-----------+\n", + "only showing top 20 rows\n", + "\n" + ] + } + ], + "source": [ + "si.df.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "data": { + "text/plain": [ + "2408" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "si.df.count()" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + } + ], + "source": [ + "si.df.write.parquet(path=\"gs://genetics-portal-dev-analysis/yt4/study_index_finngen_with_efo\")" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [], + "source": [ + "path_to_study_index=\"gs://genetics-portal-dev-analysis/yt4/study_index_finngen_with_efo\"\n", + "si=StudyIndex.from_parquet(session=session, path=path_to_study_index)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "gentropy-krNFZEZg-py3.10", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.8" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}