From d8286384266937d4d8b3a33670fb262574118a9b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergio=20Rold=C3=A1n?= Date: Tue, 15 Jan 2019 20:21:09 +0000 Subject: [PATCH] spark: change oskar-spark/src/main/python structure and biodata dependency version #13 This way we will be able to work from this folder's notebook using the shell scripts. Also we've modified POM's Biodata version to 1.5.0-SNAPSHOT. --- .../python/notebooks/conf/spark-defaults.conf | 30 + .../src/main/python/notebooks/facets.ipynb | 651 ------------ .../plotly trials-checkpoint.ipynb | 997 ++++++++++++++++++ .../notebooks/{ => notebooks}/GWAS.ipynb | 0 .../python/notebooks/notebooks/facets.ipynb | 651 ++++++++++++ .../notebooks/notebooks/my_notebook.ipynb | 480 +++++++++ .../notebooks/notebooks/plotly trials.ipynb | 997 ++++++++++++++++++ .../{ => notebooks/pyoskar}/__init__.py | 0 .../notebooks/notebooks/pyoskar/analysis.py | 469 ++++++++ .../notebooks/notebooks/pyoskar/core.py | 369 +++++++ .../python/notebooks/notebooks/pyoskar/sql.py | 175 +++ .../notebooks/{ => notebooks}/stats.ipynb | 0 .../{ => notebooks}/variant_filtering.ipynb | 2 +- .../variant_filtering_advanced.ipynb | 156 +++ .../variant_filtering_advanced.ipynb | 123 --- pom.xml | 2 +- 16 files changed, 4326 insertions(+), 776 deletions(-) create mode 100644 oskar-spark/src/main/python/notebooks/conf/spark-defaults.conf delete mode 100644 oskar-spark/src/main/python/notebooks/facets.ipynb create mode 100644 oskar-spark/src/main/python/notebooks/notebooks/.ipynb_checkpoints/plotly trials-checkpoint.ipynb rename oskar-spark/src/main/python/notebooks/{ => notebooks}/GWAS.ipynb (100%) create mode 100644 oskar-spark/src/main/python/notebooks/notebooks/facets.ipynb create mode 100644 oskar-spark/src/main/python/notebooks/notebooks/my_notebook.ipynb create mode 100644 oskar-spark/src/main/python/notebooks/notebooks/plotly trials.ipynb rename oskar-spark/src/main/python/notebooks/{ => notebooks/pyoskar}/__init__.py (100%) create mode 100644 
oskar-spark/src/main/python/notebooks/notebooks/pyoskar/analysis.py create mode 100644 oskar-spark/src/main/python/notebooks/notebooks/pyoskar/core.py create mode 100644 oskar-spark/src/main/python/notebooks/notebooks/pyoskar/sql.py rename oskar-spark/src/main/python/notebooks/{ => notebooks}/stats.ipynb (100%) rename oskar-spark/src/main/python/notebooks/{ => notebooks}/variant_filtering.ipynb (99%) create mode 100644 oskar-spark/src/main/python/notebooks/notebooks/variant_filtering_advanced.ipynb delete mode 100644 oskar-spark/src/main/python/notebooks/variant_filtering_advanced.ipynb diff --git a/oskar-spark/src/main/python/notebooks/conf/spark-defaults.conf b/oskar-spark/src/main/python/notebooks/conf/spark-defaults.conf new file mode 100644 index 0000000..46aa2bf --- /dev/null +++ b/oskar-spark/src/main/python/notebooks/conf/spark-defaults.conf @@ -0,0 +1,30 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Default system properties included when running spark-submit. +# This is useful for setting default environmental settings. 
+ +# Example: +# spark.master spark://master:7077 +# spark.eventLog.enabled true +# spark.eventLog.dir hdfs://namenode:8021/directory +# spark.serializer org.apache.spark.serializer.KryoSerializer +# spark.driver.memory 5g +# spark.executor.extraJavaOptions -XX:+PrintGCDetails -Dkey=value -Dnumbers="one two three" + + +spark.jars ../libs/oskar-spark-0.1.0.jar,../libs/oskar-spark-0.1.0-jar-with-dependencies.jar \ No newline at end of file diff --git a/oskar-spark/src/main/python/notebooks/facets.ipynb b/oskar-spark/src/main/python/notebooks/facets.ipynb deleted file mode 100644 index cc05560..0000000 --- a/oskar-spark/src/main/python/notebooks/facets.ipynb +++ /dev/null @@ -1,651 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# **Facets tutorial**\n", - "One of the most sofisticated transformations that Pyoskar provides us is [ **facet** ]. As every transformation it pertains to Oskar class and we can acces to it through our Oskar instance.\n", - "
\n", - "Usage:\n", - "```\n", - "facet(df[DataFrame], facet[str])\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We have to to import both Spark and Oskar APIs, as well as loading our data into a spark DataFrame:" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "from pyoskar.core import Oskar\n", - "from pyoskar.sql import *\n", - "from pyoskar.analysis import *\n", - "from pyspark.sql.functions import col, udf, count, explode, concat, when, expr\n", - "from pyspark.sql.functions import *\n", - "\n", - "oskar = Oskar(spark)\n", - "df = oskar.load(\"/home/roldanx/appl/oskar/oskar-spark/src/test/resources/platinum_chr22.small.parquet\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Simple facets\n", - "Now that we have loaded our data, we start with an easy facet. This example executes the classics \"groupBy\" and \"count\" upon our dataframe. The next format was designed to be applied on categorical or discrete quantitative variants. 
That could be any among these: \n", - " - Chromosome [ **chromosome** ]\n", - " - Variant type [ **type** ]\n", - " - Studies [ **studies** ]\n", - " - Biotype [ **biotype** ]\n", - " - Consequence type [ **ct** ]\n", - " - Gene [ **gene** ]\n", - " - Ensemble gene ID [ **ensemblGeneId** ]\n", - " - Ensemble gene transcript [ **ensemblTranscriptId** ]" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+-----+-----+\n", - "| type|count|\n", - "+-----+-----+\n", - "|INDEL| 106|\n", - "| SNV| 894|\n", - "+-----+-----+\n", - "\n" - ] - } - ], - "source": [ - "oskar.facet(df, \"type\").show()" - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+----------------------------------+-----+\n", - "|ct |count|\n", - "+----------------------------------+-----+\n", - "|2KB_downstream_variant |146 |\n", - "|2KB_upstream_variant |146 |\n", - "|3_prime_UTR_variant |9 |\n", - "|5_prime_UTR_variant |1 |\n", - "|NMD_transcript_variant |140 |\n", - "|TF_binding_site_variant |108 |\n", - "|downstream_gene_variant |163 |\n", - "|intergenic_variant |222 |\n", - "|intron_variant |543 |\n", - "|missense_variant |4 |\n", - "|non_coding_transcript_exon_variant|45 |\n", - "|non_coding_transcript_variant |385 |\n", - "|regulatory_region_variant |764 |\n", - "|splice_donor_variant |1 |\n", - "|splice_region_variant |2 |\n", - "|synonymous_variant |6 |\n", - "|upstream_gene_variant |200 |\n", - "+----------------------------------+-----+\n", - "\n" - ] - } - ], - "source": [ - "oskar.facet(df, \"ct\").show(truncate=False)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Include facets\n", - "We can also applies a filtering based on the values we explicit in the function:" - ] - }, - { - "cell_type": "code", - "execution_count": 41, - "metadata": {}, - 
"outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+-------+-----+\n", - "| gene|count|\n", - "+-------+-----+\n", - "|BCL2L13| 8|\n", - "| CECR2| 11|\n", - "+-------+-----+\n", - "\n" - ] - } - ], - "source": [ - "oskar.facet(df, \"gene[BCL2L13,CECR2]\").show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Range facets\n", - "Using a similar syntax as with \"Include facets\" but dealing with continuous quantitative we find that we can apply facets by range, where we can determine both upper and downer thresholds as well as the step e.g. [start..end]:step. Available range fields:\n", - " - Conservation scores: grep [ **grep** ], phylop [ **phylop** ] or phastCons [ **phastCons** ]\n", - " - Functional scores: cadd_scaled [ **cadd_scaled** ] or cadd_raw [ **cadd_raw** ]\n", - " - Substitution scores: sift [ **sift** ] or polyphen [ **polyphen** ]" - ] - }, - { - "cell_type": "code", - "execution_count": 44, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+-----------+-----+\n", - "|phylopRange|count|\n", - "+-----------+-----+\n", - "| -4.0| 3|\n", - "| -3.0| 12|\n", - "| -2.0| 55|\n", - "| -1.0| 171|\n", - "| 0.0| 681|\n", - "+-----------+-----+\n", - "\n" - ] - } - ], - "source": [ - "oskar.facet(df, \"phylop[-5..0]:1\").show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Furthermore, we dispose two more rangeable fields where we could use our facets. These ones have the peculiarity that they need extra inputs to be fully defined; we will need to explicit the study which they pertain and the cohort delimited by two underscores. These fields are:\n", - " - Global alternate population frequency [ **popFreq** ] \n", - " - Dataframe alternate population frequency [ **stats** ], which is included in the dataframe stats field." 
- ] - }, - { - "cell_type": "code", - "execution_count": 123, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+---------------------------------+-----+\n", - "|popFreq__GNOMAD_GENOMES__ALLRange|count|\n", - "+---------------------------------+-----+\n", - "| 0.0| 514|\n", - "| 0.1| 112|\n", - "| 0.2| 75|\n", - "| 0.30000000000000004| 97|\n", - "| 0.4| 77|\n", - "| 0.5| 26|\n", - "| 0.6000000000000001| 35|\n", - "| 0.7000000000000001| 23|\n", - "| 0.8| 18|\n", - "| 0.9| 15|\n", - "| 1.0| 8|\n", - "+---------------------------------+-----+\n", - "\n" - ] - } - ], - "source": [ - "oskar.facet(df, \"popFreq__GNOMAD_GENOMES__ALL[0..1]:0.1\").show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In case we want to get the dataframe alternate population frequency, first we will need to fill the stats field as explained in the \"stats\" tutorial:" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+----------------------------------------------------+-----+\n", - "|stats__hgvauser@platinum:illumina_platinum__ALLRange|count|\n", - "+----------------------------------------------------+-----+\n", - "| 0.0| 41|\n", - "| 0.1| 15|\n", - "| 0.2| 14|\n", - "| 0.30000000000000004| 10|\n", - "| 0.4| 7|\n", - "| 0.5| 714|\n", - "| 0.6000000000000001| 68|\n", - "| 0.7000000000000001| 49|\n", - "| 0.8| 12|\n", - "| 0.9| 21|\n", - "| 1.0| 48|\n", - "+----------------------------------------------------+-----+\n", - "\n" - ] - } - ], - "source": [ - "samples = oskar.metadata.samples(df)[\"hgvauser@platinum:illumina_platinum\"]\n", - "df2 = oskar.stats(df,studyId=\"hgvauser@platinum:illumina_platinum\",cohort=\"ALL\",samples=samples)\n", - "oskar.facet(df2, \"stats__hgvauser@platinum:illumina_platinum__ALL[0..1]:0.1\").show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 
Aggregation facets\n", - "We may want to check whether the compounds of all variants have historically been well conservated or otherways have notably evolved. For this task we could use the aggregation facets, with substitutes the default \"count\" function for another one we decide among this ones:\n", - " - Average [ **avg** ]\n", - " - Maximum [ **max** ]\n", - " - Minimum [ **min** ]\n", - " - Sumatory [ **sum** ]\n", - " - Squared sumatory [ **sumsq** ]\n", - " - Standard deviation [ **stddev** ]\n", - " - Variance [ **var** ]\n", - " - Percentile values [ **percentile** ]\n", - " - Set of values [ **unique** ]" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+-------------------+-----+\n", - "| avg(gerp)|count|\n", - "+-------------------+-----+\n", - "|-0.3518712293113349| 1000|\n", - "+-------------------+-----+\n", - "\n" - ] - } - ], - "source": [ - "oskar.facet(df, \"avg(gerp)\").show(truncate=False)" - ] - }, - { - "cell_type": "code", - "execution_count": 162, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+---------------------------------------------------------------------------------------+-----+\n", - "|percentile(gerp) |count|\n", - "+---------------------------------------------------------------------------------------+-----+\n", - "|[-2.152000093460083, -0.6257500052452087, 0.0, 0.14900000393390656, 0.7430999755859375]|1000 |\n", - "+---------------------------------------------------------------------------------------+-----+\n", - "\n" - ] - } - ], - "source": [ - "oskar.facet(df, \"percentile(gerp)\").show(truncate=False)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Nested facets\n", - "The last feature we find available for our facet queries is nesting, which allows us to concatenate gruops and reach complex studies by using \">>\" separator." 
- ] - }, - { - "cell_type": "code", - "execution_count": 52, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+-----------------------+--------------------+-----+\n", - "|biotype |ct |count|\n", - "+-----------------------+--------------------+-----+\n", - "|nonsense_mediated_decay|splice_donor_variant|1 |\n", - "|processed_transcript |splice_donor_variant|1 |\n", - "|protein_coding |splice_donor_variant|1 |\n", - "|retained_intron |splice_donor_variant|1 |\n", - "+-----------------------+--------------------+-----+\n", - "\n" - ] - } - ], - "source": [ - "oskar.facet(df, \"biotype>>ct[splice_donor_variant]\").show(truncate=False)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Final facets\n", - "Now it is up to us to mix all these ingredients:" - ] - }, - { - "cell_type": "code", - "execution_count": 122, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+-----+-----------------------+-------------+----------------+--------------------+-----+\n", - "|gene |biotype |cadd_rawRange|cadd_scaledRange|min(phylop) |count|\n", - "+-----+-----------------------+-------------+----------------+--------------------+-----+\n", - "|AIFM3|lincRNA |0.2 |6.0 |-0.2809999883174896 |1 |\n", - "|AIFM3|nonsense_mediated_decay|0.2 |6.0 |-0.2809999883174896 |1 |\n", - "|AIFM3|processed_transcript |0.2 |6.0 |-0.2809999883174896 |1 |\n", - "|AIFM3|protein_coding |0.2 |6.0 |-0.2809999883174896 |1 |\n", - "|AIFM3|retained_intron |0.2 |6.0 |-0.2809999883174896 |1 |\n", - "|GGT1 |nonsense_mediated_decay|-0.2 |0.0 |-1.1380000114440918 |2 |\n", - "|GGT1 |nonsense_mediated_decay|0.0 |1.0 |0.10199999809265137 |2 |\n", - "|GGT1 |nonsense_mediated_decay|0.0 |2.0 |0.054999999701976776|1 |\n", - "|GGT1 |nonsense_mediated_decay|0.0 |4.0 |0.2809999883174896 |1 |\n", - "|GGT1 |nonsense_mediated_decay|0.4 |7.0 |-1.50600004196167 |2 |\n", - "|GGT1 |processed_transcript |-0.2 
|0.0 |0.10199999809265137 |1 |\n", - "|GGT1 |processed_transcript |0.0 |1.0 |0.10199999809265137 |1 |\n", - "|GGT1 |processed_transcript |0.0 |2.0 |0.054999999701976776|1 |\n", - "|GGT1 |processed_transcript |0.0 |4.0 |0.2809999883174896 |1 |\n", - "|GGT1 |processed_transcript |0.4 |7.0 |-1.50600004196167 |2 |\n", - "|GGT1 |protein_coding |-0.2 |0.0 |-1.1380000114440918 |2 |\n", - "|GGT1 |protein_coding |0.0 |1.0 |0.10199999809265137 |2 |\n", - "|GGT1 |protein_coding |0.0 |2.0 |0.054999999701976776|1 |\n", - "|GGT1 |protein_coding |0.0 |4.0 |0.2809999883174896 |1 |\n", - "|GGT1 |protein_coding |0.4 |7.0 |-1.50600004196167 |2 |\n", - "|GGT1 |retained_intron |-0.2 |0.0 |0.10199999809265137 |1 |\n", - "|GGT1 |retained_intron |0.0 |1.0 |0.10199999809265137 |2 |\n", - "|GGT1 |retained_intron |0.4 |7.0 |-0.7080000042915344 |1 |\n", - "|GGT1 |sense_intronic |-0.2 |0.0 |-1.1380000114440918 |1 |\n", - "|GGT1 |unprocessed_pseudogene |0.4 |7.0 |-1.50600004196167 |1 |\n", - "+-----+-----------------------+-------------+----------------+--------------------+-----+\n", - "\n" - ] - } - ], - "source": [ - "oskar.facet(df, \"gene[AIFM3,GGT1]>>biotype>>cadd_raw[-10..10]:0.2>>cadd_scaled[-10..10]:1>>min(phylop)\").show(25, truncate=False)" - ] - }, - { - "cell_type": "code", - "execution_count": 96, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+----+---------------------------------+-----+\n", - "|type|popFreq__GNOMAD_GENOMES__ALLRange|count|\n", - "+----+---------------------------------+-----+\n", - "| SNV| 0.0| 478|\n", - "| SNV| 0.1| 95|\n", - "| SNV| 0.2| 57|\n", - "| SNV| 0.30000000000000004| 90|\n", - "| SNV| 0.4| 66|\n", - "| SNV| 0.5| 21|\n", - "+----+---------------------------------+-----+\n", - "\n", - "+-----+---------------------------------+-----+\n", - "| type|popFreq__GNOMAD_GENOMES__ALLRange|count|\n", - "+-----+---------------------------------+-----+\n", - "|INDEL| 0.0| 36|\n", - "|INDEL| 0.1| 17|\n", - 
"|INDEL| 0.2| 18|\n", - "|INDEL| 0.30000000000000004| 7|\n", - "|INDEL| 0.4| 11|\n", - "|INDEL| 0.5| 5|\n", - "+-----+---------------------------------+-----+\n", - "\n" - ] - } - ], - "source": [ - "snvPandas = oskar.facet(df, \"type[SNV]>>popFreq__GNOMAD_GENOMES__ALL[0..0.5]:0.1\").show()\n", - "indelPandas = oskar.facet(df, \"type[INDEL]>>popFreq__GNOMAD_GENOMES__ALL[0..0.5]:0.1\").show()" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "" - ], - "text/vnd.plotly.v1+html": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.plotly.v1+json": { - "config": { - "linkText": "Export to plot.ly", - "plotlyServerURL": "https://plot.ly", - "showLink": true - }, - "data": [ - { - "domain": { - "x": [ - 0, - 0.48 - ] - }, - "hole": 0.4, - "labels": [ - 0, - 0.1, - 0.2, - 0.30000000000000004, - 0.4, - 0.5 - ], - "type": "pie", - "uid": "935ed220-f713-4035-8a30-1c79dee5aa7d", - "values": [ - 478, - 95, - 57, - 90, - 66, - 21 - ] - }, - { - "domain": { - "x": [ - 0.51, - 1 - ] - }, - "hole": 0.4, - "labels": [ - 0, - 0.1, - 0.2, - 0.30000000000000004, - 0.4, - 0.5 - ], - "type": "pie", - "uid": "fcf24555-7061-411c-9ed7-ac58ebb3c67c", - "values": [ - 36, - 17, - 18, - 7, - 11, - 5 - ] - } - ], - "layout": { - "annotations": [ - { - "font": { - "size": 18 - }, - "showarrow": false, - "text": "SNV", - "x": 0.22, - "y": 0.5 - }, - { - "font": { - "size": 18 - }, - "showarrow": false, - "text": "INDEL", - "x": 0.78, - "y": 0.5 - } - ], - "autosize": false, - "height": 600, - "title": "Global alternate population frequencies [ALL]", - "width": 1500 - } - }, - "image/png": "", - "text/html": [ - "
" - ], - "text/vnd.plotly.v1+html": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "import plotly.offline as py\n", - "import plotly.graph_objs as go\n", - "from plotly.offline import get_plotlyjs, init_notebook_mode\n", - "\n", - "snvPandas = oskar.facet(df, \"type[SNV]>>popFreq__GNOMAD_GENOMES__ALL[0..0.5]:0.1\").toPandas()\n", - "indelPandas = oskar.facet(df, \"type[INDEL]>>popFreq__GNOMAD_GENOMES__ALL[0..0.5]:0.1\").toPandas()\n", - "\n", - "init_notebook_mode(connected=True)\n", - "\n", - "fig = {\n", - " \"data\": [\n", - " {\n", - " \"values\": snvPandas[\"count\"],\n", - " \"labels\": snvPandas[\"popFreq__GNOMAD_GENOMES__ALLRange\"],\n", - " \"domain\": {\"x\": [0, .48]},\n", - " \"hole\": .4,\n", - " \"type\": \"pie\"\n", - " },\n", - " {\n", - " \"values\": indelPandas[\"count\"],\n", - " \"labels\": indelPandas[\"popFreq__GNOMAD_GENOMES__ALLRange\"],\n", - " \"domain\": {\"x\": [.51, 1]},\n", - " \"hole\": .4,\n", - " \"type\": \"pie\"\n", - " }],\n", - " \"layout\": {\n", - " \"title\":\"Global alternate population frequencies [ALL]\",\n", - " \"autosize\":False,\n", - " \"width\":1500,\n", - " \"height\":600,\n", - " \"annotations\": [\n", - " {\n", - " \"font\": {\n", - " \"size\": 18\n", - " },\n", - " \"showarrow\": False,\n", - " \"text\": \"SNV\",\n", - " \"x\": 0.22,\n", - " \"y\": 0.5\n", - " },\n", - " {\n", - " \"font\": {\n", - " \"size\": 18\n", - " },\n", - " \"showarrow\": False,\n", - " \"text\": \"INDEL\",\n", - " \"x\": 0.78,\n", - " \"y\": 0.5\n", - " }\n", - " ]\n", - " }\n", - "}\n", - "py.iplot(fig, filename='donut')" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.7" - } - }, - "nbformat": 4, - 
"nbformat_minor": 2 -} diff --git a/oskar-spark/src/main/python/notebooks/notebooks/.ipynb_checkpoints/plotly trials-checkpoint.ipynb b/oskar-spark/src/main/python/notebooks/notebooks/.ipynb_checkpoints/plotly trials-checkpoint.ipynb new file mode 100644 index 0000000..22c3221 --- /dev/null +++ b/oskar-spark/src/main/python/notebooks/notebooks/.ipynb_checkpoints/plotly trials-checkpoint.ipynb @@ -0,0 +1,997 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "from pyoskar.core import Oskar\n", + "from pyoskar.sql import *\n", + "from pyoskar.analysis import *\n", + "from pyspark.sql.functions import col, udf, count, explode, concat, when, expr\n", + "from pyspark.sql.functions import *\n", + "\n", + "oskar = Oskar(spark)\n", + "df = oskar.load(\"/home/roldanx/appl/oskar/oskar-spark/src/test/resources/platinum_chr22.small.parquet\")\n", + "df.createOrReplaceTempView(\"platinum\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Variant histogram" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " start count\n", + "0 16000000 101\n", + "1 17000000 100\n", + "2 18000000 85\n", + "3 19000000 53\n", + "4 20000000 46\n", + "5 21000000 85\n", + "6 22000000 52\n", + "7 23000000 49\n", + "8 24000000 44\n", + "9 25000000 62\n", + "10 26000000 42\n", + "11 27000000 54\n", + "12 28000000 21\n", + "13 29000000 29\n", + "14 30000000 38\n", + "15 31000000 46\n", + "16 32000000 44\n", + "17 33000000 34\n", + "18 34000000 15\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "\n", + "pandasDF = oskar.histogram(df,\"start\",1000000).toPandas()\n", + "print(pandasDF)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[Text(0, 0.5, 'Counts'), Text(0.5, 0, 'Region')]" + ] + }, + 
"execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAABXQAAAEyCAYAAABau1igAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvOIA7rQAAIABJREFUeJzt3XuYZWdZJ+zfkzQQOZiQgxHSYEdBEx0gSDgow0AS0CAqfA4Q0A8ConEGMaKMEgY/SWZ0Jo7KBBgBUQ6JwxAwHoKCHISAohxygDGGQIIQtJFATAjiOAghz/fH3t2p7l6r06l01apd+76va19Ve61dVU//rvdda/dTq95V3R0AAAAAADa+A6YuAAAAAACAfaOhCwAAAACwIDR0AQAAAAAWhIYuAAAAAMCC0NAFAAAAAFgQGroAAAAAAAtCQxcAAAAAYEFo6AIAAAAALAgNXQAAAACABbFl6gJuj8MPP7y3bds2dRkAAAAAALfLpZde+g/dfcStvW6hG7rbtm3LJZdcMnUZAAAAAAC3S1V9el9eZ8kFAAAAAIAFoaELAAAAALAgNHQBAAAAABbEQq+hCwAAAAAsnq9+9avZvn17vvzlL09dyro76KCDsnXr1tzhDndY1ddr6AIAAAAA62r79u25293ulm3btqWqpi5n3XR3rr/++mzfvj1HH330qr6HJRcAAAAAgHX15S9/OYcddthSNXOTpKpy2GGH3a4rkzV0AQAAAIB1t2zN3B1u7797zRq6VfWaqvp8Vf31im2HVtU7q+rq+ce7z7dXVb20qj5RVX9VVd+5VnUBAAAAACyqtVxD93VJ/keS81ZsOyPJu7r77Ko6Y/78+Ukem+S+88dDk7xi/nFdbDvjLev1o27VNWc/buoSAAAAAGBd7e/+3EbosZ1zzjk57bTTcuc733m/ft81u0K3u/8syQ27bX58knPnn5+b5Akrtp/XMx9IckhV3WOtagMAAAAAWEvnnHNO/vmf/3m/f9/1XkP3yO7+7Pzza5McOf/8qCR/t+J12+fbAAAAAADWxHnnnZf73//+ecADHpCnPe1pueaaa3LiiSfm/ve/f0466aT87d/+bZLkGc94Ri644IKdX3fXu941SfKe97wnj3rUo/LEJz4xxxxzTH7kR34k3Z2XvvSl+fu///uccMIJOeGEE/ZrzWu55MJedXdXVd/Wr6uq05KcliT3vve993tdrHDmwVNXcIszvzh1BQAAAABsIldccUV+6Zd+KX/5l3+Zww8/PDfccENOPfXUnY/XvOY1Of300/OHf/iHe/0+H/7wh3PFFVfknve8Zx7+8IfnL/7iL3L66afnxS9+cS666KIcfvjh+7Xu9b5C93M7llKYf/z8fPtnktxrxeu2zrftobtf1d3Hd/fxRxxxxJoWCwAAAABsTu9+97vzpCc9aWfD9dBDD8373//+/PAP/3CS5GlPe1re97733er3echDHpKtW7fmgAMOyHHHHZdrrrlmLcte94bum5OcOv/81CQXrtj+9Jp5WJIvrliaAQAAAABgMlu2bMnNN9+cJLn55pvzla98Zee+O93pTjs/P/DAA3PTTTetaS1r1tCtqjckeX+Sb6uq7VX1rCRnJ3lMVV2d5NHz50ny1iSfTPKJJL+V5NlrVRcAAAAAwIknnpjf/d3fzfXXX58kueGGG/Ld3/3dOf/885Mkr3/96/OIRzwiSbJt27ZceumlSZI3v/nN+epXv3qr3/9ud7tbvvSlL+33utdsDd3ufurIrpMGXttJfnKtagEAAAAANq5rzn7cuv/M7/iO78gLX/jCPPKRj8yBBx6YBz7wgXnZy16WZz7zmfnVX/3VHHHEEXnta1+bJPnxH//xPP7xj88D
HvCAnHzyybnLXe5yq9//tNNOy8knn5x73vOeueiii/Zb3TXrpS6m448/vi+55JLb/X22nfGW/VDN/jHF4B3lpmgAAAAArIErr7wyxx577NRlTGbo319Vl3b38bf2teu9hi4AAAAAAKukoQsAAAAAsCA0dAEAAACAdbfIS8HeHrf3362hCwAAAACsq4MOOijXX3/90jV1uzvXX399DjrooFV/jy37sR4AAAAAgFu1devWbN++Pdddd93Upay7gw46KFu3bl3112voAgAAAADr6g53uEOOPvroqctYSJZcAAAAAABYEBq6AAAAAAALQkMXAAAAAGBBaOgCAAAAACwIDV0AAAAAgAWhoQsAAAAAsCA0dAEAAAAAFoSGLgAAAADAgtDQBQAAAABYEBq6AAAAAAALQkMXAAAAAGBBaOgCAAAAACwIDV0AAAAAgAWhoQsAAAAAsCA0dAEAAAAAFoSGLgAAAADAgtDQBQAAAABYEBq6AAAAAAALYsvUBcAiut+595u6hJ0uP/XyqUuAVfv1U75/6hJ2et4b/3jqEgAAAOBWuUIXAAAAAGBBaOgCAAAAACwIDV0AAAAAgAWhoQsAAAAAsCA0dAEAAAAAFoSGLgAAAADAgpikoVtVP1NVV1TVX1fVG6rqoKo6uqo+WFWfqKo3VtUdp6gNAAAAAGCjWveGblUdleT0JMd3979KcmCSpyT5lST/vbvvk+QLSZ613rUBAAAAAGxkUy25sCXJ11XVliR3TvLZJCcmuWC+/9wkT5ioNgAAAACADWndG7rd/Zkkv5bkbzNr5H4xyaVJbuzum+Yv257kqPWuDQAAAABgI5tiyYW7J3l8kqOT3DPJXZKcfBu+/rSquqSqLrnuuuvWqEoAAAAAgI1niiUXHp3kU919XXd/NcnvJ3l4kkPmSzAkydYknxn64u5+VXcf393HH3HEEetTMQAAAADABjBFQ/dvkzysqu5cVZXkpCQfTXJRkifOX3NqkgsnqA0AAAAAYMOaYg3dD2Z287PLklw+r+FVSZ6f5Ger6hNJDkvy6vWuDQAAAABgI9ty6y/Z/7r7RUletNvmTyZ5yATlAAAAAAAshCmWXAAAAAAAYBU0dAEAAAAAFoSGLgAAAADAgtDQBQAAAABYEBq6AAAAAAALQkMXAAAAAGBBaOgCAAAAACwIDV0AAAAAgAWhoQsAAAAAsCA0dAEAAAAAFoSGLgAAAADAgtDQBQAAAABYEBq6AAAAAAALQkMXAAAAAGBBaOgCAAAAACwIDV0AAAAAgAWhoQsAAAAAsCA0dAEAAAAAFoSGLgAAAADAgtDQBQAAAABYEBq6AAAAAAALQkMXAAAAAGBBaOgCAAAAACwIDV0AAAAAgAWhoQsAAAAAsCA0dAEAAAAAFoSGLgAAAADAgtDQBQAAAABYEBq6AAAAAAALQkMXAAAAAGBBaOgCAAAAACyILVMXAAAAa+UbL/rI1CXsdO0Jx01dAgAAm8AkV+hW1SFVdUFVfayqrqyq76qqQ6vqnVV19fzj3aeoDQAAAABgo5pqyYWXJHlbdx+T5AFJrkxyRpJ3dfd9k7xr/hwAAAAAgLl1b+hW1cFJ/k2SVydJd3+lu29M8vgk585fdm6SJ6x3bQAAAAAAG9ltbuhW1d2r6v6342ceneS6JK+tqg9X1W9X1V2SHNndn52/5tokR96OnwEAAAAAsOnsU0O3qt5TVV9fVYcmuSzJb1XVi1f5M7ck+c4kr+juByb5P9lteYXu7iQ9UstpVXVJVV1y3XXXrbIEAAAAAIDFs69X6B7c3f+Y5IeSnNfdD03y6FX+zO1Jtnf3B+fPL8iswfu5qrpHksw/fn7oi7v7Vd19fHcff8QRR6yyBAAAAACAxbOvDd0t8ybrk5P88e35gd19bZK/q6pvm286KclHk7w5yanzbacmufD2/BwAAAAAgM1myz6+7qwkb0/yvu6+uKq+OcnVt+Pn/lSS11fVHZN8MskzM2suv6mqnpXk05k1
jwEAAAAAmNvXhu5nu3vnjdC6+5O3Yw3ddPdHkhw/sOuk1X5PAAAAAIDNbl+XXHjZPm4DAAAAAGCN7PUK3ar6riTfneSIqvrZFbu+PsmBa1kYAAAAAAC7urUlF+6Y5K7z191txfZ/TPLEtSoKAAAAAIA97bWh293vTfLeqnpdd396nWoCAAAAAGDAvt4U7U5V9aok21Z+TXefuBZFAQAAAACwp31t6P5uklcm+e0kX1u7cgAAAAAAGLOvDd2buvsVa1oJAAAAAAB7ta8N3T+qqmcn+YMk/7JjY3ffsCZVAQAATGDbGW+ZuoSdrjn7cVOXAABsQPva0D11/vHnVmzrJN+8f8sBAAAAAGDMPjV0u/votS4EWHxXHnPs1CXsdOzHrpy6BAAAAID9bp8aulX19KHt3X3e/i0HAAAAAIAx+7rkwoNXfH5QkpOSXJZEQxcA1sD2M/586hJ22nr2I6YuAQAAgLl9XXLhp1Y+r6pDkpy/JhUBAAAAADDogFV+3f9JYl1dAAAAAIB1tK9r6P5Rkp4/PTDJsUnetFZFAQAAAACwp31dQ/fXVnx+U5JPd/f2NagHAAAAAIAR+7TkQne/N8nHktwtyd2TfGUtiwIAAAAAYE/7uuTCk5P8apL3JKkkL6uqn+vuC9awNgCAXZx55plTl7DTRqoFAABYHvu65MILkzy4uz+fJFV1RJI/TaKhCwAAAACwTvZpyYUkB+xo5s5dfxu+FgAAAACA/WBfr9B9W1W9Pckb5s9PSfLWtSkJAAAAAIAhe23oVtV9khzZ3T9XVT+U5F/Pd70/yevXujgAAAAAAG5xa1fonpPkBUnS3b+f5PeTpKruN9/3A2taHQAAAAAAO91aQ/fI7r58943dfXlVbVuTigA2od/4d++euoSdfvKVJ05dAgAALC3/NwBur1u7sdkhe9n3dfuzEAAAAAAA9u7WGrqXVNWP776xqn4syaVrUxIAAAAAAENubcmF5yb5g6r6kdzSwD0+yR2T/D9rWRgAAAAAALvaa0O3uz+X5Lur6oQk/2q++S3dvXEWfAEAIO9697dMXcJOJ534N1OXAAAsmF8/5funLmEXz3vjH09dAoy6tSt0kyTdfVGSi9a4FgAAAAAA9uLW1tAFAAAAAGCD0NAFAAAAAFgQ+7TkAgAAsLlsO+MtU5ew0zVnP27qEgAAFsZkV+hW1YFV9eGq+uP586Or6oNV9YmqemNV3XGq2gAAAAAANqIpl1z46SRXrnj+K0n+e3ffJ8kXkjxrkqoAAAAAADaoSRq6VbU1yeOS/Pb8eSU5MckF85ecm+QJU9QGAAAAALBRTXWF7jlJfj7JzfPnhyW5sbtvmj/fnuSoKQoDAAAAANio1r2hW1Xfn+Tz3X3pKr/+tKq6pKouue666/ZzdQAAAAAAG9cUV+g+PMkPVtU1Sc7PbKmFlyQ5pKq2zF+zNclnhr64u1/V3cd39/FHHHHEetQLAAAAALAhrHtDt7tf0N1bu3tbkqckeXd3/0iSi5I8cf6yU5NcuN61AQAAAABsZFOtoTvk+Ul+tqo+kdmauq+euB4AAAAAgA1ly62/ZO1093uSvGf++SeTPGTKegAAAIDb78pjjp26hF0c+7Erpy4BYL/ZSFfoAgAAAACwFxq6AAAAAAALQkMXAAAAAGBBaOgCAAAAACwIDV0AAAAAgAWhoQsAAAAAsCA0dAEAAAAAFsSWqQsAAAAAAIZtP+PPpy5hp61nP2LqEogrdAEAAAAAFoaGLgAAAADAgtDQBQAAAABYEBq6AAAAAAALQkMXAAAAAGBBaOgCAAAAACwIDV0AAAAAgAWhoQsAAAAAsCA0dAEAAAAAFsSWqQsAAACARXW/c+83dQk7XX7q5VOXAMA6cIUuAAAAAMCC0NAFAAAAAFgQGroAAAAAAAtCQxcAAAAAYEFo6AIAAAAALAgNXQAAAACABaGhCwAAAACwIDR0AQAAAAAWhIYuAAAAAMCC0NAFAAAAAFgQGroAAAAA
AAtCQxcAAAAAYEFo6AIAAAAALAgNXQAAAACABbHuDd2quldVXVRVH62qK6rqp+fbD62qd1bV1fOPd1/v2gAAAAAANrItE/zMm5I8r7svq6q7Jbm0qt6Z5BlJ3tXdZ1fVGUnOSPL8CeoDAABgd2cePHUFtzjzi1NXAACTWfcrdLv7s9192fzzLyW5MslRSR6f5Nz5y85N8oT1rg0AAAAAYCObdA3dqtqW5IFJPpjkyO7+7HzXtUmOnKgsAAAAAIANabKGblXdNcnvJXlud//jyn3d3Ul65OtOq6pLquqS6667bh0qBQAAAADYGCZp6FbVHTJr5r6+u39/vvlzVXWP+f57JPn80Nd296u6+/juPv6II45Yn4IBAAAAADaAdW/oVlUleXWSK7v7xSt2vTnJqfPPT01y4XrXBgAAAACwkW2Z4Gc+PMnTklxeVR+Zb/uPSc5O8qaqelaSTyd58gS1AQAAAABsWOve0O3u9yWpkd0nrWctAAAAAACLZLKbogEAAAAAcNto6AIAAAAALAgNXQAAAACABaGhCwAAAACwIDR0AQAAAAAWxJapCwAAAAAAuC3OPPPMqUvYxXrW4wpdAAAAAIAFoaELAAAAALAgNHQBAAAAABaEhi4AAAAAwILQ0AUAAAAAWBAaugAAAAAAC0JDFwAAAABgQWjoAgAAAAAsCA1dAAAAAIAFoaELAAAAALAgNHQBAAAAABaEhi4AAAAAwILQ0AUAAAAAWBAaugAAAAAAC0JDFwAAAABgQWjoAgAAAAAsCA1dAAAAAIAFoaELAAAAALAgNHQBAAAAABaEhi4AAAAAwILQ0AUAAAAAWBAaugAAAAAAC0JDFwAAAABgQWjoAgAAAAAsCA1dAAAAAIAFoaELAAAAALAgNlRDt6pOrqqPV9UnquqMqesBAAAAANhINkxDt6oOTPIbSR6b5NuTPLWqvn3aqgAAAAAANo4N09BN8pAkn+juT3b3V5Kcn+TxE9cEAAAAALBhbKSG7lFJ/m7F8+3zbQAAAAAAJKnunrqGJElVPTHJyd39Y/PnT0vy0O5+zm6vOy3JafOn35bk4+ta6LjDk/zD1EVsULIZJpdxshkml3GyGSaXcbIZJpdxshkml3GyGSaXcbIZJpdxshkml3GyGbaRcvmm7j7i1l60ZT0q2UefSXKvFc+3zrftortfleRV61XUvqqqS7r7+Knr2IhkM0wu42QzTC7jZDNMLuNkM0wu42QzTC7jZDNMLuNkM0wu42QzTC7jZDNsEXPZSEsuXJzkvlV1dFXdMclTkrx54poAAAAAADaMDXOFbnffVFXPSfL2JAcmeU13XzFxWQAAAAAAG8aGaegmSXe/Nclbp65jlTbcMhAbiGyGyWWcbIbJZZxshsllnGyGyWWcbIbJZZxshsllnGyGyWWcbIbJZZxshi1cLhvmpmgAAAAAAOzdRlpDFwAAAACAvdDQBQAAAABYEBq6AAAAAAALYkPdFA2AW1TVwUlOTnLUfNNnkry9u2+crqrpVdUxSR6fXXN5c3dfOV1VG4NshplL42QzzFwaJ5thchnnODPMmBlXVZXkIdk1mw/1kt8AyFwaJ5th5tK4zZCNK3RXqaqOqarnV9VL54/nV9WxU9e1EchmWFUdXFWnVNXPzh+nVNUhU9e1ERgze6qqpye5LMmjktx5/jghyaXzfUupqp6f5PwkleRD80cleUNVnTFlbVOTzTBzaZxshplL42QzTC7jHGeGGTPjqup7klyd5Mwk3zd/nJXk6vm+pWQujZPNMHNp3GbJphao+bxhzE/AT83sJLx9vnlrkqckOb+7z56qtqnJZtj8RPKiJO/I7Dc/ySyXxyQ5q7vPm6q2qRkzw6rq40keuvtvlavq7kk+2N3fOk1l06qqq5J8R3d/dbftd0xyRXffd5rKpiebYebSONkMM5fGyWaYXMY5zgwzZsZV1ZVJHtvd1+y2/egkb+3upbzow1waJ5th5tK4zZKNJRdW51kZPgG/
OMkVSZayATUnm2EvTPKgsZNMkqVt6MaYGVNJhn7jdvN837K6Ock9k3x6t+33mO9bZrIZZi6Nk80wc2mcbIbJZZzjzDBjZtyW3HKRx0qfSXKHda5lIzGXxslmmLk0blNko6G7Ok7A42QzzElmnDEz7JeTXFZV70jyd/Nt987squ7/PFlV03tukndV1dXZNZf7JHnOZFVtDLIZZi6Nk80wc2mcbIbJZZzjzDBjZtxrklxcVefnlmzuldlf7716sqqmZy6Nk80wc2ncpsjGkgurUFUnJ/kfma25sccJuLvfNlVtU5PNsKo6NckvZrbkwh4nme5+3USlTc6YGTe/gvt7s+fi/l+YrqrpVdUB2XMB+4u7+2vTVbUxyGaYuTRONsPMpXGyGSaXcY4zw4yZcVX17Ul+MHveMO6j01U1PXNpnGyGmUvjNkM2Grqr5AQ8TjbDnGTGGTPjqurIrMiluz83ZT0bQdXi35F0rchmnLk0TjZ7MpfGyWaYXPbOcWZPxsytq6pDk6S7b5i6lo3CXBonm3Hm0rhFzsaSC6vXKx47ni/zn4evJJsB3f2Fqroou55klr6ZO2fM7KaqjkvyyiQHZ7a+TyXZWlU3Jnl2d182ZX1Tmd919OWZXdG98gaD96mqZ3f3OyYrbmKyGWYujZPNMHNpnGyGyWWc48wwY2ZcVd07yX9LcmKSL8421dcneXeSM3a/idGyMJfGyWaYuTRus2TjCt1V2NsJOLMDxjKfgGUzYOwkk2SpTzKJMTOmqj6S5Ce6+4O7bX9Ykt/s7gdMU9m0NssdSdeCbIaZS+NkM8xcGiebYXIZ5zgzzJgZV1XvT3JOkgt2/LVeVR2Y5ElJntvdD5uyvqmYS+NkM8xcGrdZsnGF7uq8JMmjx07ASZb2BBzZjHldxk8yr02ylCeZOWNm2F12Hy9J0t0fqKq7TFHQBrEp7ki6RmQzzFwaJ5th5tI42QyTyzjHmWHGzLjDu/uNKzfMGy7nV9Uy3+DKXBonm2Hm0rhNkY2G7uo4AY+TzTAnmXHGzLA/qaq3JDkvu9558+lJlvZGcdkkdyRdI7IZZi6Nk80wc2mcbIbJZZzjzDBjZtylVfXyJOdm12xOTfLhyaqanrk0TjbDzKVxmyIbSy6sQlW9IMmTkwydgN/U3f91qtqmJpthVfXSJN+S4ZPMp7r7OVPVNjVjZlxVPTbJ47PnnTffOl1V09sMdyRdK7IZZi6Nk80wc2mcbIbJZZzjzDBjZlhV3THJszIwZpK8urv/ZarapmYujZPNnsylcZslGw3dVXICHiebYU4y44wZVmOR70i61mQD+4e5NE42w+TCbWXMALAaGrq3kxPwONlwWxkzt6iqg5O8ILNfAhyZpJN8PsmFSc7u7hsnLG8yQ3ckTbJwdyRdC7IZZi6Nk80wc2mcbIbJZZzjzDBjZlxVbcnsyrknZNeLPS7M7Mq5r05V25TMpXGyGWYujdss2RwwdQGLqKruXVXnV9Xnk3wwyYeq6vPzbdumrW5ashlWVQdX1dlVdWVV3VBV188/P7uqDpm6vikZM6PelOQLSU7o7kO7+7AkJyS5cb5vWb0xyR8kuUd337e775PkHkn+MLNlO5aZbIaZS+NkM8xcGiebYXIZ5zgzzJgZ9ztJjktyVpLvmz/Oyuwm0v9zwrqmZi6Nk80wc2ncpsjGFbqrUFXvT3JOkgvmd8JLVR2Y5ElJntvdD5uyvinJZlhVvT2z37if293Xzrd9Y5JnJDmxu79nwvImZcwMq6qPd/e33dZ9m11VXd3d972t+5aBbIaZS+NkM8xcGiebYXIZ5zgzzJgZV1VXdfe33tZ9m525NE42w8ylcZslG1fors7h3f3GHc2nJOnur3X3+UkOm7CujUA2w7Z196/saOYmSXdf291nJ/mmCevaCIyZYZ+uqp+vqiN3bKiqI6vq+bnl5nHL6NKqenlVPbSq7jl/PHR+l9KFuSPpGpHNMHNpnGyGmUvjZDNMLuMcZ4YZM+Nu
qKonVdXOXkVVHVBVp2R2FeayMpfGyWaYuTRuU2TjCt1VqKrzk9yQ5NzccoC4V5JTM2tOPXmq2qYmm2FV9Y4kf5rZFbqfm287MrMrdB/T3Y+esLxJGTPDquruSc7ILWtBJcm1md1581eWdZ3h2iR3JF0LshlmLo2TzTBzaZxshsllnOPMMGNmXM2WXPuVzNYX/kJm6wsfklvWF/7UZMVNyFwaJ5th5tK4zZKNhu4qOAGPk80wJ5lxxgwAALC7qjosSbr7+qlrgUVmLo1b5Gw0dAE2qKr63gzcebO73zZdVdOqTXJH0rUgm3Hm0jjZ7MlcGiebYXLZO8eZPRkze1dVx2TPiz0u7O6PTVfV9MylcbIZZi6N2wzZaOiughPwONmMc5IZZswMq6pzknxrkvOSbJ9v3prk6Umu7u6fnqq2KVXVGzK7Y+252TWXU5Mc2t2nTFXb1GQzzFwaJ5th5tI42QyTyzjHmWHGzLj5uqdPTXJ+ds3mKUnOn9+DZOmYS+NkM8xcGrdZstHQXQUn4HGyGeYkM86YGVYjd9esqkpy1bLe/Xgsl1vbtwxkM8xcGiebYebSONkMk8s4x5lhxsy4qroqyXfsflHHfJm2K4yZPbYv9VxKZDPGXBq3WbLZMnUBC+pBAweM7Uk+MB8Yy0w2w75v5CTzxiRXJVnahm6MmTFfrqoHd/fFu21/cJIvT1HQBnFDVT0pye91983J7I6kSZ6UBboj6RqRzTBzaZxshplL42QzTC7jHGeGGTPjbk5yzySf3m37Peb7lpW5NE42w8ylcZsiGw3d1XECHiebYU4y44yZYc9I8oqqultuuXL5Xkm+ON+3rJ6S2R1JX15Vu9+R9ClTFrYByGbYM2IujXlGZDPEXBonm2FyGfeMOM4MMWbGPTfJu6rq6iR/N9927yT3SfKcyaqa3jNiLo15RmQzxFwatymyseTCKlTVtsxOwCdm1nBaeQI+o7s/NVlxE5PNsKr6ziSvSDJ0kvnJ7r50qtqmZszsXVV9Y1asLdzd105Zz0ayyHckXWuy2ZO5NE4248ylcbIZJpdhjjPjjJk9zS/ueEh2vb/Gxd39temq2hjMpXGy2ZO5NG4zZOMK3VXo7muSnJI4Ae9ONsO6+7IkD3WS2ZMxM66qDk7yyKwYM1X19u6+ccKyJle73ZG0qhbujqRrRTbDzKVxshlmLo2TzTC5jHOcGWbM7FWveOx4vjB/Br1WzKVxshkI3II2AAARK0lEQVRlLo1b+GwOmLqARVVVx9TszngvSvKiqnr+/KS89GQzbMVJZuejqg6ZtqqNwZjZU1U9PcllSR6V5M7zxwlJLp3vW0rzcXJ+Zldyf2j+qCTnV9UZU9Y2NdkMM5fGyWaYuTRONsPkMs5xZpgxM66qvifJ1UnOTPJ988dZSa6e71tK5tI42Qwzl8ZtlmwsubAK8xPwUzM7Ce/48/mtma13dH53nz1VbVOTzbD5ieRFSd6R2aX8ySyXxyQ5q7vPm6q2qRkzw6rq40keuvtvlavq7kk+uKx3P65NckfStSCbYebSONkMM5fGyWaYXMY5zgwzZsZV1ZVJHjv/K76V249O8tbuPnaSwiZmLo2TzTBzadxmycaSC6vzrAyfgF+c5IokS9mAmpPNsBcmedDYSSbJ0jZ0Y8yMqdzy5x8r3Tzft6w2xR1J14hshplL42QzzFwaJ5thchnnODPMmBm3Jbdc5LHSZ5LcYZ1r2UjMpXGyGWYujdsU2Wjoro4T8DjZDHOSGWfMDPvlJJdV1Tuy6503H5PkP09W1fQ2xR1J14hshplL42QzzFwaJ5thchnnODPMmBn3miQXV9X5uSWbe2X213uvnqyq6ZlL42QzzFwatymyseTCKlTVyUn+R2ZrbuxxAu7ut01V29RkM6yqTk3yi5ktubDHSaa7XzdRaZMzZsbNr+D+3ux65823d/cXpqtqerUJ7ki6VmQzzFwaJ5th5tI42QyTyzjHmWHGzLiq+vYkP5hds3lz
d390uqqmZy6Nk80wc2ncZshGQ3eVnIDHyWaYk8w4Y2ZcVR2ZFbl09+emrGcjqKrKnuPlQ+2EJpu9MJfGyWZP5tI42QyTy945zuzJmLl1VXVoknT3DVPXslGYS+NkM85cGrfI2VhyYfV6xWPH82X+8/CVZDOgu79QVRdl15PM0jdz54yZ3VTVcUlemeTgzNb3qSRbq+rGJM/u7sumrG8q87uOvjyzK7pX3mDwPlX17O5+x2TFTUw2w8ylcbIZZi6Nk80wuYxznBlmzIyrqnsn+W9JTkzyxdmm+vok705yxu43MVoW5tI42Qwzl8ZtlmxcobsKezsBZ3bAWOYTsGwGjJ1kkiz1SSYxZsZU1UeS/ER3f3C37Q9L8pvd/YBpKpvWZrkj6VqQzTBzaZxshplL42QzTC7jHGeGGTPjqur9Sc5JcsGOv9arqgOTPCnJc7v7YVPWNxVzaZxshplL4zZLNq7QXZ2XJHn02Ak4ydKegCObMa/L+EnmtUmW8iQzZ8wMu8vu4yVJuvsDVXWXKQraIDbFHUnXiGyGmUvjZDPMXBonm2FyGec4M8yYGXd4d79x5YZ5w+X8qlrmG1yZS+NkM8xcGrcpstHQXR0n4HGyGeYkM86YGfYnVfWWJOdl1ztvPj3J0t4oLpvkjqRrRDbDzKVxshlmLo0byubeSU7JcmdjzIxznBlmzIy7tKpenuTc7JrNqUk+PFlV0zOXxslmmLk0blNkY8mFVaiqFyR5cpKhN7Nv6u7/OlVtUxvJZsebk6XNpqpemuRbMnyS+VR3P2eq2qZmzIyrqscmeXz2vPPmW6eranpVdWyGc1mYO5KuldoEd2tdC1X1fRnOZannUuI4M8ZxZpxshsllnGPwMOfsYVV1xyTPysB8SvLq7v6XqWqbmnP2ONnsyVwat1my0dBdJW/axslmmDez47yhBdgYquobuvvzU9fB4qiqw7r7+qnrAABYJhq6+4k3s8D+VFUHJ3lBZr8cOTJJJ/l8kguTnN3dN05Y3oZUVX/S3Y+duo6pzO/M+oLMbir41u5+w4p9L+/uZ09W3ISq6huTvCjJzUl+MclPJfmhJB9L8tPd/dkJy5tUVR06sPmyJA/M7D3iDetc0oZQVSd399vmnx+c5NeTPCTJXyf5me7+3JT1Tamqzk7ya939D1X1oCS/m+RrSe6Y5Ond/d5JC5xIVV2W5PeT/K/u/uTU9WwkVfXgzO4k/pnMzlGvSfLgzG6Ge1p3L8yftu5PVXXXJD+f5N9mdt7+SpK/SfLK7n7dhKVNrqq2ZHbl3BOy68UeF2Z25dxXp6pto6qqV3X3aVPXMZX5zax+LLO59Cfd/Zcr9v1Cd//SZMVNqKrunOQ5mf0/8mWZ/UX5v83sPfB/6u5/mrC8Daeqrurub526jtvigKkLWERVdXZVHT7//EFV9ckkH6iqT1fVIycub1JVdVlV/UJVffPUtWwkVfXgqrqoqv5nVd2rqt5ZVTdW1cVV9cCp65tSVd21qv5TVV1RVV+squuq6gNV9Yypa5vYm5J8IckJ3X1odx+W5IQkN873LaWq+s6Rx4OSHDd1fRN7bZJK8ntJnlpVv1dVd5rvW4g7ta6R1yX5aGZLulyU5P8meVySP0/yyunK2hD+Icmluz2Oyqype8mEdU3tv6z4/NeTXJvkB5JcnOQ3J6lo43hcd//D/PNfS3JKd983yWMyy2pZ3T3JIUneU1Ufqqqfqap7Tl3UBvEbmTV035LkLzO74/whSc5I8vIpC5vY65N8Msn3JjkryUuTPC3JCVX1X/b2hUvgdzJ7T3dWku+bP87K7CbS/3PCuiZVVYeOPA7LLKNl9ptJHpnk+iQvq6oXr9j3Q9OUtCG8LrMLg47O7Bj84CS/mtn/F14xXVnTq6ovVdU/zj9+qaq+lORbdmyfur595QrdVaiqy7v7fvPPL0ry8919cVV9a2a/mT9+2gqnU1WfyqyZ8OTM/gP0hiRv7O6/n7SwiVXVhzK7QuyQzN7U
/kx3X1BVJyX5pe7+rkkLnFBVXZjkD5L8aWbj5i6Zraf7C0k+093/ccLyJlNVH+/ub7ut+za7qvpakvdm9kZkdw/r7q9b55I2jKr6SHcft+L5CzN7g/+DSd7Z3d85WXETqqoPd/cD55//bXffe8W+XTJbNlX1vMwacT/X3ZfPt32qu4+etrJpVdVlO+bLwLxa9jFzZZL7dfdNVfWB7n7Yin073x8vm93GzCOSPDWzJsKVSd7Q3a+asr4p3coxeOe+ZVNV/7u7H7Di+cXd/eCqOiDJR7v7mAnLm9TerpJbxCvo9pf5e+BPZ9f3wD1/flR333GSwjaAqvqr7r7//PMtmf2y6PDMjsUfWOLjzEe6+7iqqiSfTXKP7u758/+9I7NlVLN7HB2S2Xvgz823Ldx7YFfors6W+YEiSb6uuy9Oku6+Ksmdxr9sKXyhu//D/M3a85LcN8ll86tTl/bPQJLcobv/ZP4n0N3dF2T2ybuSHDRtaZPb1t2v6+7t3f3iJD/Y3VcneWaW+zeqn66qn6+qI3dsqKojq+r5ueXmccvoyiQ/0d0n7P7I7GrDZXan+X8EkyTd/ctJfivJnyU5bLKqprfyvc55u+07cD0L2Wi6+9cz+xPFX6yqF1fV3TL7z+Gy+4aq+tl5w/vr5//x2WHZ3zu/PMlbq+rEJG+rqpdU1SOr6qwkH5m4tg2hu/98vsTNUUl+JcnS/tJ+7stV9T1V9aQkXVVPSJL5XzV+bdrSJvV/qupfJ0lV/WCSG5Kku2/O8C+tl8kNVfWkle9pquqAqjols79eW1afTPKo7j56xeOb5w2opV0KaG5nM7u7b5ovP/G/k7w7yV0nq2qD6NlVnG+df9zxfKnf73X36UlekuQNVXX6/HizcJks+5vS1fJmdh94Q7sLb2bHeUM77JTMmnDvraovVNUNSd6T5NDMrmReVmdm/Nz1U+tYx0b0R0lOXLlhvg7f8zJbm29ZXViztQrT3b+wY2NV3SfJxyeraoOY/zLtSZkdX96Z5M7TVrQh/FaSu2X2n8BzM7vKZ8d6zEv9Pq+7X5bZkhQ/kdka7ycmeX5m61v+6ISlTe2q3Td099e6+23d/cwpCtpA/l1m56EfzWx5gROq6sbM/j91+pSFTezfJ3lxVX0hs7V0T0+Sqjois2UqltlTkjwxybVVdVVVXZXZX37+0Hzfsjons+Vdhvy39SxkA7qkqk5euaG7z8psObJtk1S0MVyy4j3wznN0VX1Lki9NVtUG0d2XJnn0/Ol7s4AX2llyYZWq6lGZnYi/NcmWzK6Y+8Mkr+numyYsbVJVdX53L/OJdlBVPSCzE+3NSX4ms7Fzamb/AfrxlQu3L5uqun+S387sau4rkvxod181f0P71O5+6aQFTqiqjslscf8PrFy0vlbcsGcZzXM5KskH5bKrvWTz2O7+k+kqm5YxM25lNpn9gvFbuvuvlz0bY2acbIbJZVxVHZvknpHNLua5HBXv8/ZQVQ/N7Gq5v0lyTGYXBn20u986aWETq6qHZHaB5cVV9e1JTk7ysWXPJZHNmJFcPp4VV+wuq92yeURm96u5ZJHGjIbuflZVz+zu105dx0Ykm2FyGbfM2VTV6Ul+MrMlBo5L8tPdfeF83861+paNXMZV1U9ldidb2awgl3Hm0zBjZpwxM8yYGTcfM8/O7K7qspmTy7iqelGSx2Z20dQ7kzwks78ieUySt8+XlFo6A7k8NLObvS51Lolsxshl3GY5zmjo7me122L/3EI2w+QybpmzqarLk3xXd/9TVW1LckGS3+nul9Ry30RELiNkM0wu42QzTC7jZDNMLuNkM0wu4+bZHJfZvWmuTbK1u/+xqr4us6u8l/JGTnIZJ5thchm3WbLZcusvYXdV9Vdju5IcObJvKchmmFzGyWbUATv+/K67r5kv83JBVX1TlnttYbmMk80wuYyTzTC5jJPNMLmMk80wuYy7qbu/luSfq+pvuvsfk6S7/29V3TxxbVOSyzjZDJPL
uE2RjYbu6hyZ2aL+u99ls5Is7Vqoc7IZJpdxshn2uao6rrs/kiTzKzi+P8lrktxv2tImJZdxshkml3GyGSaXcbIZJpdxshkml3Ffqao7d/c/J3nQjo1VdXBm9yNZVnIZJ5thchm3KbKx5MIqVNWrk7y2u983sO9/dfcPT1DWhiCbYXIZJ5thVbU1s98cXjuw7+Hd/RcTlDU5uYyTzTC5jJPNMLmMk80wuYyTzTC5jKuqO3X3vwxsPzzJPbr78gnKmpxcxslmmFzGbZZsNHQBAAAAABbEAVMXAAAAAADAvtHQBQAAAABYEBq6AABsalX1tar6SFX9dVX9UVUdcju+13+qqkfvz/oAAOC2sIYuAACbWlX9U3ffdf75uUmu6u5fnrgsAABYFVfoAgCwTN6f5KgdT6rq56rq4qr6q6o6a8X2/6+qPl5V76uqN1TVf5hvf11VPXH++UlV9eGquryqXlNVd5pvv6aqzqqqy+b7jlnnfyMAAJuYhi4AAEuhqg5MclKSN8+ff0+S+yZ5SJLjkjyoqv5NVT04yb9N8oAkj01y/MD3OijJ65Kc0t33S7Ilyb9f8ZJ/6O7vTPKKJP9hrf5NAAAsHw1dAAA2u6+rqo8kuTbJkUneOd/+PfPHh5NcluSYzBq8D09yYXd/ubu/lOSPBr7ntyX5VHdfNX9+bpJ/s2L/788/Xppk2/77pwAAsOw0dAEA2Oz+b3cfl+SbklSSn5xvryT/tbuPmz/u092v3k8/81/mH7+W2dW7AACwX2joAgCwFLr7n5OcnuR5VbUlyduT/GhV7bhh2lFV9Q1J/iLJD1TVQfN93z/w7T6eZFtV3Wf+/GlJ3rvm/wgAAJaeqwUAAFga3f3hqvqrJE/t7t+pqmOTvL+qkuSfkvy/3X1xVb05yV8l+VySy5N8cbfv8+WqemaS3503hy9O8sr1/LcAALCcqrunrgEAADaUqrprd/9TVd05yZ8lOa27L5u6LgAAcIUuAADs6VVV9e1JDkpyrmYuAAAbhSt0AQAAAAAWhJuiAQAAAAAsCA1dAAAAAIAFoaELAAAAALAgNHQBAAAAABaEhi4AAAAAwILQ0AUAAAAAWBD/Pwh24m5qDXjPAAAAAElFTkSuQmCC\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "import pandas as pd\n", + "\n", + "pandasDF = oskar.histogram(df,\"start\",1000000).toPandas()\n", + "histogram = pandasDF.plot(x = \"start\", y = \"count\", kind = \"bar\", figsize=(24,4))\n", + "histogram.set(xlabel=\"Region\", ylabel=\"Counts\")" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "" + ], + "text/vnd.plotly.v1+html": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.plotly.v1+json": { + "config": { + "linkText": "Export to plot.ly", + "plotlyServerURL": "https://plot.ly", + "showLink": true + }, + "data": [ + { + "type": "bar", + "uid": "75513143-1b57-4b64-9da2-b4cd1339e703", + "x": [ + 16000000, + 17000000, + 18000000, + 19000000, + 20000000, + 21000000, + 22000000, + 23000000, + 24000000, + 25000000, + 26000000, + 27000000, + 28000000, + 29000000, + 30000000, + 31000000, + 32000000, + 33000000, + 34000000 + ], + "y": [ + 101, + 100, + 85, + 53, + 46, + 85, + 52, + 49, + 44, + 62, + 42, + 54, + 21, + 29, + 38, + 46, + 44, + 34, + 15 + ] + } + ], + "layout": { + "autosize": true, + "xaxis": { + "autorange": true, + "range": [ + 15500000, + 34500000 + ], + "type": "linear" + }, + "yaxis": { + "autorange": true, + "range": [ + 0, + 106.3157894736842 + ], + "type": "linear" + } + } + }, + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAABQkAAAFoCAYAAAD0GX0ZAAAgAElEQVR4nO3d33Pd9X3n8f1PerU3nV7tpDd7se3OnPH6KHZaj7QsAc+E2Okwtae7GjODmVWcyFXiMWZhi7ODEaGlwdk6iQm0NALGMWQhJpFLAQMBTOQ4iSmk/IgsYyODbOu9F6lPZKzvVz6Wvzl6n8/jMfO6kI585vv11zJfPa0j/l0AAAAAAEX7d70+AAAAAACgt0RCAAAAACicSAgAAAAAhRMJAQAAAKBwIiEAAAAAFE4kBAAAAIDCiYQAAAAAUDiREAAAAAAKJxICAAAAQOFEQgAAAAAonEgIAAAAAIUTCQEAAACgcCIhAAAAABROJAQAAACAwomEAAAAAFA4kRAAAAAACicSAgAAAEDhREIAAAAAKJxICAAAAACFEwkBAAAAoHAiIQAAAAAUTiQEAAAAgMKJhAAAAABQOJEQAAAAAAonEgIAAABA4URCAAAAACicSAgAAAAAhRMJAQAAAKBwIiEAAAAAFE4kBAAAAIDCiYQAAAAAUDiREAAAAAAKJxICAAAAQOFEQgAAAAAonEgIAAAAAIUTCQEAAACgcCIhAAAAABROJAQAAACAwomEAAAAAFA4kRAAAAAACicSAgAAAEDhREIAAAAAKJxICAAAAACFEwkBAAAAoHAiIQAAAAAUTiQEAAAAgMKJhAAAAABQOJEQAAAAAAonEgIAAABA4URCAAAAACicSAgAAAAAhRMJAQAAAKBwIiEAAAAAFE4kBAAAAIDCiYQAAAAAUDiREAAAAAAKJxICAAAAQOFEQgAAAAAonEgIAAAAAIUTCQEAAACgcCIhAAAAABROJAQAAACAwomEAAAAAFA4kRAAAAAACicSAgAAAEDhREIAAAAAKJxICAAAAACFEwkBAAAAoHAiIQAAAAAUTiQEAAAAgMKJhAAAAABQOJEQAAAAAAonEgIAAABA4URCAAAAACicSAgAAAAAhRMJAQAAAKBwIiEAAAAAFE4kBAAAAIDCiYQAAAAAUDiREAAAAAAKJxICAAAAQOFEQgAAAAAonEgIAAAAAIUTCQEAAACgcCIhAAAAABROJAQAAACAwomEAAAAAFA4kRAAAAAACicSAgAAAEDhREIAAAAAKJxICAAAAACFEwkBAAAAoHAiIQAAAAAUTiQEAAAAgMKJhAAAAABQOJEQAAAAAAonEgIAAABA4URCAAAAACicSAgAAAAAhRMJAQAAAKBwIiEAAAAAFE4kBAAAAIDCiYQAAAAAUDiREAAAAAAKJxICAAAAQOGKiITnzp2Lu+66K3bv3n3ZYzMzM7Ft27YYGBiIoaGh2LdvX1ePAwAAAEB2fR8Jn3322bjhhhvi05/+9KKRcGxsLEZHR2N6ejqOHj0ag4ODMTk5ecWPAwAAAEB2fR8JL7r33nsvi4Rzc3PRbrfj2LFjnfeNj4/Hzp07r+hxAAAAAOgHRUfCEydORKvVirNnz3beNzExEZs3b76ix6ucOnsufm/r99Lua09OXcPfeQAAAABWuqIj4dTUVLRarZifn++87+DBg7Fhw4Yrejwi4tSpU5ftzXd+3fPQt5z9r8d+suh5Ve3Fn/0qDr5yIuWeefXNrs7VrBc7ffp0z4/BzPp7Z86c6fkxmFl/z98zZva7GMtTdCS8+J2Cc3NznfdNTEzEpk2brujxiIjz589ftukzH/U89C1nd3//jUXPq2r/fd8LPT/mq90ffuX7XZ2rWS924cKFnh+DmfX3/D1jZk3P3zNm9rsYy1N0JJybm4vVq1df8jMH9+zZEzt27Liix6uU9nLj//GtF3t+zMuJhAAAAAClKzoSRkSMjo7G9u3b4+TJkzE1NRVDQ0Nx6NChK358MSJhnomEAAAAAAVEwqeeeirWrVsXAwMD0W63Y926dfH00093Hp+eno6RkZFot9sxODgYe/fuveTXL/X4YkTCPBMJAQAAAAqIhL0gEuaZSAgAAAAgEjZCJMw
zkRAAAABAJGyESJhnIiEAAACASNgIkTDPREIAAAAAkbARImGeiYQAAAAAImEjRMI8EwkBAAAARMJGiIR5JhICAAAAiISNEAnzTCQEAAAAEAkbIRLmmUgIAAAAIBI2QiTMM5EQAAAAQCRshEiYZyIhAAAAgEjYCJEwz0RCAAAAAJGwESJhnomEAAAAACJhI0TCPBMJAQAAAETCRoiEeSYSAgAAAIiEjRAJ80wkBAAAABAJGyES5plICAAAACASNkIkzDOREAAAAEAkbIRImGciIQAAAIBI2AiRMM9EQgAAAACRsBEiYZ6JhAAAAAAiYSNEwjzrNhI+/8uTPT/m5ezxV37V1fnSv35/2+M9//N4tdv29z/p9W8fAABA3xEJGyAS5plISKlEQgAAABYSCRsgEuaZSEipREIAAAAWEgkbIBLmmUhIqURCAAAAFio+Er755puxdevWuO666+L666+P3bt3x9zcXOfxmZmZ2LZtWwwMDMTQ0FDs27dvyecUCfNMJKRUIiEAAAALFR8JN27cGPv374+IiDNnzsTw8HA8+OCDncfHxsZidHQ0pqen4+jRozE4OBiTk5O1zykS5plISKlEQgAAABYqOhJeuHAhVq1aFUeOHOm87957741du3ZFRMTc3Fy02+04duxY5/Hx8fHYuXNn7fOKhHkmElIqkRAAAICFio6EEb+JfoODg/Hoo4/GsWPHYsOGDZ0oeOLEiWi1WnH27NnOx09MTMTmzZtrn1MkzDORkFKJhAAAACxUfCR8+eWX4+abb46dO3fGmjVrYnR0NGZnZyMiYmpqKlqtVszPz3c+/uDBg7Fhw4bO29PT05ftl796r+dfRC9nu7738qLnVbU/f/Bwz4/5avepsQNdnesPXvllz495Odv/46muztdWxmZmZq75c2aOhLd++/meXxOzftvp06d7fgxm1t87depUz4/BzPp/LE/RkfDUqVPxmc98Jt56662IiPjggw/i1ltvjdHR0Yj47XcSLvwfmUxMTMSmTZs6b8/Pz1+2mdm5nn8RvZztPvjTRc+ratm/k7Cbc/3nX0z3/JiXs8defrur87WVs2stcyT0nYQAkE+v76XMrIyxPEVHwldeeSXWrFlzyfsOHjwY69evj4jf/EzC1atXX/IzCffs2RM7duyofV4vN84zLzemVCIhAAAACxUdCc+cORPr1q2L/fv3x/z8fMzOzsbIyEjcfffdnY8ZHR2N7du3x8mTJ2NqaiqGhobi0KFDtc8rEuaZSEipREIAAAAWKjoSRkS8/vrrMTw8HENDQ3HDDTfEPffcEx999FHn8enp6RgZGYl2ux2Dg4Oxd+/eJZ9TJMwzkZBSiYQAAAAsVHwkbIJImGciIaUSCQEAAFhIJGyASJhnIiGlEgkBAABYSCRsgEiYZyIhpRIJAQAAWEgkbIBImGciIaUSCQEAAFhIJGyASJhnIiGlEgkBAABYSCRsgEiYZyIhpRIJAQAAWEgkbIBImGciIaUSCQEAAFhIJGyASJhnIiGlEgkBAABYSCRsgEiYZyIhpRIJAQAAWEgkbIBImGciIaUSCQEAAFhIJGyASJhnIiGlEgkBAABYSCRsgEiYZyIhpRIJAQAAWEgkbIBImGciIaUSCQEAAFhIJGyASJhnIiGlEgkBAABYSCRsgEiYZyIhpRIJAQAAWEgkbIBImGciIaUSCQEAAFhIJGyASJhnIiGlEgkBAABYSCRsgEiYZyIhpRIJAQAAWEgkbIBImGciIaUSCQEAAFhIJGyASJhnIiGlEgkBAABYSCRsgEiYZyIhpRIJAQAAWEgkbIBImGciIaUSCQEAAFhIJGyASJhnIiGlEgkBAABYSCRsgEiYZyIhpRIJAQAAWEgkbIBImGciIaUSCQEAAFhIJIyI1157LbZu3Rpr166Ndrsd7733XuexmZmZ2LZtWwwMDMTQ0FDs27dvyecTCfNMJKRUIiEAAAALFR8JX3311RgcHIyJiYl4991345133om
5ubnO42NjYzE6OhrT09Nx9OjRGBwcjMnJydrnFAnzTCSkVCIh5PLMT9/r+efecvbMT99b+iQBAOip4iPh8PBwPPbYY4s+Njc3F+12O44dO9Z53/j4eOzcubP2OUXCPBMJKZVICLmIhAAANK3oSHjmzJlotVoxNjYWQ0NDsWbNmti+fXucOXMmIiJOnDgRrVYrzp492/k1ExMTsXnz5s7b8/Pzl21mdq7nN+PL2e6DP130vKqWPRJ2c67//Ivpnh/zcvbYy293db7Wv8seCXv9+2f2u97Tb7zb88+95ezpN97t+e+hmZmZ9f9YnqIj4fHjx6PVasXk5GTMzs7Ge++9F7fcckuMjY1FRMTU1FS0Wq1L/qAdPHgwNmzY0Hn75MmTl+2Xv8r9r/13TLyy6HlV7c8fPNzzY77afWrsQFfn+v9+cqLnx7ycPTR5rKvztZWxU6dOXfPnzBwJb/328z2/Jma/6z32ws97/rm3nD32ws97/ntoZr3dBx980PNjMLP+H8sjErZaceHChc77nnvuuVi7dm1E/PY7CRf+jMKJiYnYtGlT7fN6uXGeebkxpcocCb3cmBJ5uTEAAE0rOhLOzs7GqlWr4vjx4533TU5Oxo033hgRv/mZhKtXr77kZxLu2bMnduzYUfu8ImGeiYSUSiSEXERCAACaVnQkjIj46le/Glu2bInp6el4//33Y3h4OB544IHO46Ojo7F9+/Y4efJkTE1NxdDQUBw6dKj2OUXCPBMJKZVICLmIhAAANK34SPjhhx/GHXfcEevWrYvrrrsu7r///jh37lzn8enp6RgZGYl2ux2Dg4Oxd+/eJZ9TJMwzkZBSiYSQi0gIAEDTio+ETRAJ80wkpFQiIeQiEgIA0DSRsAEiYZ6JhJRKJIRcREIAAJomEjZAJMwzkZBSiYSQi0gIAEDTRMIGiIR5JhJSKpEQchEJAQBomkjYAJEwz0RCSiUSQi4iIQAATRMJGyAS5plISKlEQshFJOxfL5w4GXcdeCPtPjh7rte/hQDANSISNkAkzDORkFKJhJCLSNi//ubQz3t+fZazt2fO9vq3EAC4RkTCBoiEeSYS1vufD78c/238xyk39r1XuzrX0oiEkItI2L9EQgBgpRAJGyAS5plIWO8zXzvU82O+2n32vh93da6lEQkhF5Gwf4mEAMBKIRI2QCTMM5GwnkjYv0qKhDsff73nx7ycnb8w39CfAjIRCfuXSAgArBQiYQNEwjwTCeuJhP1LJMwzkZAIkbCfiYQAwEohEjZAJMwzkbCeSNi/RMI8EwmJEAn7mUgIAKwUImEDRMI8EwnriYT9SyTMM5GQCJGwn4mEAMBKIRI2QCTMM5GwnkjYv0TCPBMJiRAJ+5lICACsFCJhA0TCPBMJ64mE/UskzLNuIuHbM2d7frzL2d8c+nmXf5LLIRL2L5EQAFgpRMIGiIR5JhLWEwn7l0iYZyIhESJhPxMJAYCVQiRsgEiYZyJhPZGwf4mEeSYSEiES9jOREABYKUTCBoiEeSYS1hMJ+5dImGciIREiYT8TCQGAlUIkbIBImGciYT2RsH+JhHkmEhIhEvYzkRAAWClEwgaIhHkmEtYrKRLu/+c3e37My9nUO6e7Ol+RMM9EQiJEwn4mEkI+n73vxz3/3LvafeZrh3r92wesYCJhA0TCPBMJ64mEeSYSVhMJ80wkrCYS9i+REPIRCYF+JRI2QCTMM5GwnkiYZyJhNZEwz0TCaiJh/xIJIR+REOhXImEDRMI8EwnriYR5JhJWEwnzrNtI+A9H3oq7DryRcg882925ioT9SySEfEqKhFPvnO75fzOXM39HQXdEwgaIhHkmEtYTCfNMJKwmEuZZt5HwC994rufHfLX74zt+0NW5ioT9SySEfEqKhI+/8queH/Ny9vwvTzb0pwD6k0j4bz7++OPYsmVLbNmy5ZL3z8zMxLZt22JgYCCGhoZi3759Sz6XSJhnImE9kTDPRMJqImGeiYTVRML+JRJCPiJ
hnomE0B2RMCIuXLgQX/7yl2Pz5s2XRcKxsbEYHR2N6enpOHr0aAwODsbk5GTt84mEeSYS1hMJ80wkrCYS5plIWE0k7F8iIeQjEuaZSAjdEQkj4s4774zx8fF44oknLomEc3Nz0W6349ixY533jY+Px86dO2ufTyTMM5GwnkiYZyJhNZEwz0TCaqVFwj++4wc9P+ar3Re+8VxX5yoSQj4iYZ6JhNCd4iPh/fffH7fffnvMz89fFglPnDgRrVYrzp797c3PxMREbN68ufY5RcI8EwnriYR5JhJWEwnzTCSsJhLmmUgI/U8kzDORELpTdCQ8cOBAjIyMxPnz5yMiLouEU1NT0Wq1Yn7+t1+gHTx4MDZs2NB5+9y5c5ft16dzf5F29/ffWPS8qvbf973Q82O+2v3hV74fc3NzV7zDP8v9Rdo/vvhmV+e7dvcPe37MV7vrx3/U1bl+6/Aven7My9nrb53s6vM2cyQcefjlrs51x8SrPT/m5eyjj+eu+FxPvH+658e7nN3/zM+6urYb//afen7MV7s/vuMHXZ3rD17/154f83L2g9f/tavzzRwJN/7tP3V1rvc/87OeH/NyduL9012dr5W9bu7NVvKuH/9Rzz/3rnZrd/+wq3P9xxdz/0P64Z+91/M/L/ab/a7+nmF5io6E9913X6xevTra7Xa02+1YtWpVtFqtaLfbcfbs2c53Es7NzXV+zcTERGzatKnz9qlTpy7bm+/8uud/GS5n/+uxnyx6XlXbtDfvF2mfGjsQH3zwwRXvh6+/1fNjXs4e/qfjXZ3vp//q6Z4f89Xuuj2HujrXvYemen7My9mR4//a1flmjoS37X+xq3P9y394qefHvJydnDl1xef603/J/Q8Ze5482tW1vemv834nx3+6/cmuzvWJI7/s+TEvZ08c+WVX5/ufbn+y58d8tbvpr3/c1bnuefJoz495Ofvpv7zX1flauTt9+nTPj+Fa7bo9eV9t8+m/erqrc334n473/JiXsx++/lbP/7zYb9ZNY1jOWJ6iI+EnLfYzCVevXn3JzyTcs2dP7Nixo/Z5vNw4z7zcuJ6XG+eZlxtX83LjPPNy42pebpxnXm4M/c/LjfPMy42hOyLhAp+MhBERo6OjsX379jh58mRMTU3F0NBQHDpU/xerSJhnImE9kTDPRMJqImGeiYTVRMI8Ewmh/4mEeSYSQndEwgUWi4TT09MxMjIS7XY7BgcHY+/evUs+j0iYZyJhPZEwz0TCaiJhnomE1UTCPBMJKdHs3Pn40c/eT7t3P/ioq/MVCfNMJITuiIQNEAnzTCSsJxLmmUhYTSTMM5GwmkiYZyIhJZp6J/f/OGv/P7/Z1fmKhHkmEkJ3RMIGiIR5JhLWEwnzTCSsJhLmmUhYTSTMM5GQEomEeSYSAnVEwgaIhHkmEtYTCfNMJKwmEuaZSFhNJMwzkZASiYR5JhICdUTCBoiEeSYS1hMJ80wkrCYS5plIWE0kzDORkBKJhHkmEgJ1RMIGiIR5JhLWEwnzTCSsJhLmmUhYTSTMM5GQEomEeSYSAnVEwgaIhHkmEtYTCfNMJKwmEuaZSFhNJMwzkZASiYR5JhICdUTCBoiEeSYS1hMJ80wkrCYS5plIWE0kzDORkBKJhHkmEta77+mfxV0H3ki5br/Wg8WIhA0QCfNMJKwnEuaZSFhNJMwzkbCaSJhnIiElEgnzTCSs94df+X7Pj/lq9z++9WJX5wqLEQkbIBLmmUhYTyTMM5GwmkiYZyJhNZEwz0RCSiQS5plIWE8kpHQiYQNEwjwTCeuJhHkmElYTCfNMJKwmEuaZSEiJRMI8EwnriYSUTiRsgEiYZyJhPZEwz0TCaiJhnomE1UTCPBMJKZFImGciYT2RkNKJhA0QCfNMJKwnEuaZSFhNJMwzkbCaSJhnIiElEgnzTCSsJxJSOpGwASJhnomE9UTCPBMJq4mEeSYSVhMJ80wkpEQiYZ6JhPVEQkonEjZAJMwzkbCeSJhnImE
1kTDPRMJqImGeiYSUSCTMM5GwnkhI6UTCBoiEeSYS1hMJ80wkrCYS5plIWE0kzDORkBKJhHkmEtYTCSmdSNgAkTDPRMJ6ImGeiYTVRMI8EwmriYR5JhJy0ba//0nPr8/V7ve3Pd7VuYqEeSYS1hMJKZ1I2ACRMM9EwnoiYZ6JhNVEwjwTCauJhHkmEnKRSJhnImE1kTDPREKuBZGwASJhnomE9UTCPBMJq4mEeSYSVhMJ80wk5CKRMM9EwmoiYZ6JhFwLImEDRMI8EwnriYR5JhJWEwnzTCSsJhLmmUjIRSJhnomE1UTCPBMJuRZEwgaIhHkmEtYTCfNMJKwmEuaZSFhNJMwzkZCLRMI8EwmriYR5JhJyLYiEDRAJ80wkrCcS5plIWE0kzDORsJpImGciIReJhHkmElYTCfNMJORaEAkbIBLmmUhYTyTMM5GwmkiYZyJhNZEwz0RCLhIJ80wkrCYS5plIyLUgEjZAJMwzkbCeSJhnImE1kTDPRMJqImGeiYRcJBLmmUhYTSTMM5GQa0EkbIBImGciYT2RMM9EwmoiYZ6JhNVEwjwTCblIJMwzkbCaSJhnIiHXgkjYAJEwz0TCeiJhnomE1UTCPBMJq4mEeSYScpFImGciYTWRMM9EQq6F4iPhAw88EJ/73Oei3W7H+vXr4+GHH77k8ZmZmdi2bVsMDAzE0NBQ7Nu3b8nnFAnzTCSsJxLmmUhYTSTMM5GwmkiYZyIhF4mEeSYSVhMJ80wk5FooPhKOj4/HkSNH4uTJk3H48OEYGBiIF154ofP42NhYjI6OxvT0dBw9ejQGBwdjcnKy9jlFwjwTCeuJhHkmElYTCfNMJKwmEuaZSMhFImGeiYTVRMI8Ewm5FoqPhJ80PDwcDz30UEREzM3NRbvdjmPHjnUeHx8fj507d9Y+h0iYZyJhPZEwz0TCaiJhnomE1UTCPBMJuUgkzDORsJpImGciIdeCSLjA2bNnY926dfH8889HRMSJEyei1WrF2bO/vfmZmJiIzZs31z6PSJhnImE9kTDPRMJqImGeiYTVRMI8Ewnr3XXgjbT74VR3f45FwjwTCauJhHkmEnItiIQL7Nq1K7Zu3Rrz87/5gmxqaiparVbn7YiIgwcPxoYNGzpvf/zxx5ftvVMf9vwviOXsrw4cXfS8qvYXf/d8z4/5aveHX/l+V+c6eezdnh/zcvaPL77Z1fmu3f3Dnh/z1e768R91da77JnN/kfbav0x3db6ZI+HIwy93da5f/d5Pen7My9ns2Y+u+Fx/8e6pnh/vcvb1p491dW03PHC458d8tfujXU91da5Pvvp2z495OXvy1be7Ot8/2vVUz4/5arfhgcNdnevXnz7W82Nezn7x7qkrPtfZsx/1/HiXs69+7yddXduRh1/u+TFf7X5/2+Ndnetr/zLd82NezvZN/ryr871+/Ec9P+ar3drdP+zqXP/xxdz/kD557N2uzjdzJPyLv3u+q3Pt17E8IuG/ueeee+Lmm2+OM2fOdN538TsJ5+bmOu+bmJiITZs2dd4+c+bMZXv7/Zme/wWxnN35+KuLnlfVNn8z73dyfGrsQFfneuho7i/SHnnu512d75q7n+n5MV/trttzqKtz/eazub9Ie+nn73R1vpkj4W37X+zqXMcezftF2u9t/V6c+uD0FZ/rsbd/3fPjXc7ufeqNrq7t5/9msufHfLX7o11PdXWuB17O/UXagZff7Op8M0fCz//NZFfneu9Tb/T8mJezY2//+orP9dQHub/bbOzRl7u6trftz/tqm9/f9nhX5/rSz9/p+TEvZ9989lhX53vdnryvtllz9zNdnesjz+X+h/RDR9/u6nw/NXag58d8tdv8zee6Otd+HctTfCQ8f/587Nq1K4aHhy/7AzU3NxerV6++5GcS7tmzJ3bs2FH7nF5unGdeblzPy43zzMuNq3m5cZ55uXE1LzfOMy83rnb+wnzPj3c52/n4611dWy83zjMvN67m5cZ55uXGXAt
FR8Lz58/HrbfeGl/84hdjZmYmZmdnY3Z29pKfQTg6Ohrbt2+PkydPxtTUVAwNDcWhQ/V/sYqEeSYS1hMJ80wkrCYS5plIWE0kzDORsJpImGciYT2RMM9EQuhO0ZHw9OnT0Wq1LttNN93U+Zjp6ekYGRmJdrsdg4ODsXfv3iWfVyTMM5GwnkiYZyJhNZEwz0TCaiJhnomE1UTCPBMJ64mEeSYSQneKjoRNEQnzTCSsJxLmmUhYTSTMM5GwmkiYZyJhNZEwz0TCeiJhnomE0B2RsAEiYZ6JhPVEwjwTCauJhHkmElYTCfNMJKwmEuaZSFhPJMwzkRC6IxI2QCTMM5GwnkiYZyJhNZEwz0TCaiJhnomE1UTCPBMJ64mEeSYSQndEwgaIhHkmEtYTCfNMJKwmEuaZSFhNJMwzkbCaSJhnImE9kTDPRELojkjYAJEwz0TCeiJhnomE1UTCPBMJq4mEeSYSVhMJ80wkrCcS5plIWO3w8V/HXQfeSDuaIRI2QCTMM5GwnkiYZyJhNZEwz0TCaiJhnomE1UTCPBMJ64mEeSYSVvvak1M9P+bl7NTZc12dL1dGJGyASJhnImE9kTDPRMJqImGeiYTVRMI8EwmriYR5JhLWEwnzTCSsJhKyGJGwASJhnomE9UTCPBMJq4mEeSYSVhMJ80wkrCYS5plIWE8kzDORsJpIyGJEwgaIhHkmEtYTCfNMJKwmEuaZSFhNJMwzkbCaSJhnImE9kTDPRMJqIiGLEQkbIBLmmUhYTyTMM5GwmkiYZyJhNZEwz0TCaiJhnomE9UTCPBMJq4mELEYkbIBImGciYT2RMM9EwmoiYZ6JhNVEwjwTCauJhHkmEtYTCfNMJKwmErIYkbABImGeiYT1RMI8EwmriYR5JhJWEwnzTCSsJhLmmUhYTyTMM5GwmkjIYkTCBoiEeSYS1hMJ80wkrCYS5plIWE0kzDORsJpImGciYT2RMM9EwpxC9+IAAA34SURBVGoiIYsRCRsgEuaZSFhPJMwzkbCaSJhnImE1kTDPRMJqImGeiYT1RMI8EwmriYQsRiRsgEiYZyJhPZEwz0TCaiJhnomE1UTCPBMJq4mEeSYS1hMJ80wkrCYSshiRsAEiYZ6JhPVEwjwTCauJhHkmElYTCfNMJKwmEuaZSFhPJMwzkbCaSMhiRMIGiIR5JhLWEwnzTCSsJhLmmUhYTSTMM5GwmkiYZyJhPZEwz0TCaiIhixEJGyAS5plIWE8kzDORsJpImGciYTWRMM9EwmoiYZ6JhPVEwjwTCauJhCxGJGyASJhnImE9kTDPRMJqImGeiYTVRMI8EwmriYR5JhLWEwnzTCSsJhKyGJGwASJhnomE9UTCPBMJq4mEeSYSVhMJ80wkrCYS5plIWE8kzDORsJpIyGJEwgaIhHkmEtYTCfNMJKwmEuaZSFhNJMwzkbCaSJhnImE9kTDPRMJqIiGLEQkbIBLmmUhYTyTMM5GwmkiYZyJhNZEwz0TCaiJhnomE9UTCPBMJq4mELEYkbIBImGciYT2RMM9EwmoiYZ6JhNVEwjwTCauJhHkmEtYTCfNMJKwmErIYkbABImGeiYT1RMI8EwmriYR5JhJWEwnzTCSsJhLmmUhYTyTMM5GwmkjIYkTCBoiEeSYS1hMJ80wkrCYS5plIWE0kzDORsJpImGciYT2RMM9EwmoiIYsRCZcwMzMT27Zti4GBgRgaGop9+/Yt+WtEwjwTCeuJhHkmElYTCfNMJKwmEuaZSFhNJMwzkbCeSJhnImE1kZDFiIRLGBsbi9HR0Zieno6jR4/G4OBgTE5O1v4akTDPRMJ6ImGeiYTVRMI8EwmriYR5JhJWEwnzTCSsJxLmmUhYTSRkMSJhjbm5uWi323Hs2LHO+8bHx2Pnzp21v04kzDORsJ5ImGciYTWRMM9EwmoiYZ6JhNVEwjwTCeuJhHkmElYTCVmMSFjjxIkT0Wq14uzZ3978TExMxObNm2t/nUiYZyJhPZEwz0T
CaiJhnomE1UTCPBMJq4mEeSYS1hMJ80wkrCYSshiRsMbU1FS0Wq2Yn//tF2gHDx6MDRs2dN5+8MEHL9v4A3t7/gmznG28c/+i51W1tTu+2/Njvtr9wcijXZ3rV+79u54f83J22//5Vlfn+x+//Pc9P+ar3X/+y0e6Otfhu7/V82Nezu78+v/t6nz//W2P9vyYr3b/dedDXZ3r+jv29/yYl7O//caVn+vX/vqbPT/e5ezP//d3urq2/+UrD/f8mK92/2HbP3R1rl+6Z1/Pj3k5+9I9+7o63/+w7R96fsxXu//ylYe7Otc//9/f6fkxL2df++tvXvG5/u03Huz58S5n6+/o7h75v+58qOfHfLX797d1d49859f/b8+PeTkbvru7e+T//JeP9PyYr3b/8ct/39W53vZ/ct8jf+Xev+vqfP9gJO898tod3+3qXDfemfseefyBvYueF8sjEta4+J2Ec3NznfdNTEzEpk2bOm9/4xvfMDNrdN/+9rd7fgxm1t/77ne/2/NjMLP+nvsZM/tdjOURCWvMzc3F6tWrL/mZhHv27IkdO3b08KgAAAAA4NoSCZcwOjoa27dvj5MnT8bU1FQMDQ3FoUPd/RwHAAAAAFjJRMIlTE9Px8jISLTb7RgcHIy9e/f2+pAAAAAA4JoSCVeQc+fOxV133RW7d+9e9PHXXnsttm7dGmvXro12ux3vvffb/1Pgxo0b40//9E9jdna2877z58/HDTfcEH/yJ3/S+LFTr+7avvnmm7F169a47rrr4vrrr4/du3df8nMwXduV6YEHHojPfe5z0W63Y/369fHwww9f8vjMzExs27YtBgYGYmhoKPbt23fJ467ryrXUtV3qcdd25Vrq2l308ccfx5YtW2LLli2XvN+1Xbmu5Nq6j8ppqWvrPiqv73znO/GFL3whBgYGYt26dXH77bfHhx9+2HncvVROS11X91F5LXVtL3IflZtIuEI8++yzccMNN8SnP/3pRUPSq6++GoODgzExMRHvvvtuvPPOO5fdAN14442xf//+zvuefPLJWL9+vU+4Hlvq2m7cuLFz3c6cORPDw8OX/F+ZXNuVaXx8PI4cORInT56Mw4cPx8DAQLzwwgudx8fGxmJ0dDSmp6fj6NGjMTg4GJOTk53HXdeVa6lru9Tjru3KtdS1i4i4cOFCfPnLX47NmzcvenPr2q5MS11b91F5Xcnfue6jcnriiSfi9ddfj1OnTsUvfvGL+PznPx/f/va3O4+7l8ppqevqPiqvpa5thPuofiASrjD33nvvoiFpeHg4Hnvsscpft3HjxnjkkUfis5/9bJw7dy4iIjZt2hSPPPKIT7gVYrFre+HChVi1alUcOXLkko/btWtX523XNofh4eF46KGHIuI3/9Ojdrt9yf/0aHx8PHbu3Nl523XNY+G1vZLHXds8Fru2d955Z4yPj8cTTzyx6M2ta5vDJ6+t+6j+sfDauo/qDxcuXIjjx4/HTTfdFC+++GJEuJfqB4td18W4j8qn7tq6j8pPJFxhFgtJZ86ciVarFWNjYzE0NBRr1qyJ7du3x5kzZzofs3Hjxjhy5Ejcdttt8cQTT8RLL70Ut9xyS7z22ms+4VaIqgA8Pj4eg4OD8eijj8axY8diw4YNl9wQubYr39mzZ2PdunXx/PPPR0TEiRMnotVqxdmzZzsfMzExEZs3b+687brm8MlreyWPu7Y5LHbt7r///rj99ttjfn6+8ubWtV35Pnlt3Uf1j8U+b91H5ddqtWL16tWXhHz3Uvktdl0/yX1UTlXX1n1UfxAJV5jFQtLx48ej1WrF5ORkzM7OxnvvvRe33HJLjI2NdT7m4ifcCy+8EBs2bIiRkZE4fPiwT7gVpCoSvvzyy3HzzTfHzp07Y82aNTE6OnrJz2lwbVe+Xbt2xdatW2N+fj4iIqampqLVanXejog4ePBgbNiwofO265rDJ6/tlTzu2ubwyWt34MCBGBkZifPnz0dE1N7curYr2yevrfu
o/rHY37nuo/K7+F1JN954YzzzzDMR4V6qHyx2XT/JfVROi11b91H9QyRcYeoi4YULFzrve+6552Lt2rWdty9+wkX85lt2/+zP/iwiwifcCrLYtT116lR85jOfibfeeisiIj744IO49dZbY3R0tPMxru3Kds8998TNN998yXekXPzX74U/72piYiI2bdrUedt1XfkWu7ZX8rhru/Itdu3uu+++WL16dbTb7Wi327Fq1apotVrRbrc738ni2q58i11b91H9YbFr6z6qv3z961/vxHv3Uv1j4XVdyH1Ufguvrfuo/iESrjCLhaTZ2dlYtWpVHD9+vPO+ycnJuPHGGztvL/yEe+WVV+Kll16KCJ9wK8li1/aVV16JNWvWXPK+gwcPxvr16ztvu7Yr0/nz52PXrl0xPDx82c3N3NxcrF69+pKXO+3Zsyd27NjRedt1Xbnqru2VPO7arlxLXbuF6v4FPMK1XWnqrq37qNzqrq37qP5y9913x+233x4R7qX6ycLrGuE+qp988tou5D4qL5Fwhal6SepXv/rV2LJlS0xPT8f7778fw8PD8cADD3QeX/gJt5BPuJWj6udNrlu3Lvbv3x/z8/MxOzsbIyMjcffdd3c+xrVdec6fPx+33nprfPGLX4yZmZmYnZ2N2dnZS35uzujoaGzfvj1OnjwZU1NTMTQ0FIcOHeo87rquTEtd2yu59q7tynQl126hpW5uF3Jte+tKrq37qJyWurbuo/K6cOFCfOlLX4oXX3wxpqen45lnnok1a9bE4cOHOx/jXiqfpa6r+6i8ruRzdiH3UXmJhCvEU089FevWrYuBgYFot9uxbt26ePrppzuPf/jhh3HHHXfEunXr4rrrrov777+/838EivAJt5ItdW1ff/31GB4ejqGhobjhhhvinnvuiY8++qjzuGu78pw+fTpardZlu+mmmzofMz09HSMjI9Fut2NwcDD27t17yXO4rivTUtf2Sq69a7syXcm1W8jNbR5Xcm3dR+V0JdfWfVRO8/Pzcccdd8T69euj3W7Hhg0b4sCBA5d8jHupfJa6ru6j8rqSz9mF3EflJRICAAAAQOFEQgAAAAAonEgIAAAAAIUTCQEAAACgcCIhAAAAABROJAQAAACAwomEAAAAAFA4kRAAAAAACicSAgAAAEDhREIAAAAAKJxICAAAAACFEwkBAAAAoHAiIQAAAAAUTiQEAAAAgMKJhAAAAABQOJEQAAAAAAonEgIAAABA4URCAAAAACicSAgAAAAAhRMJAQAAAKBwIiEAAAAAFE4kBAAAAIDCiYQAAAAAUDiREAAAAAAKJxICAAAAQOFEQgAAAAAonEgIAAAAAIUTCQEAAACgcCIhAAAAABROJAQAAACAwomEAAAAAFA4kRAAAAAACicSAgAAAEDhREIAAAAAKJxICAAAAACFEwkBAAAAoHAiIQAAAAAUTiQEAAAAgMKJhAAAAABQOJEQAAAAAAonEgIAAABA4URCAAAAACicSAgAAAAAhRMJAQAAAKBwIiEAAAAAFE4kBAAAAIDCiYQAAAAAUDiREAAAAAAKJxICAAAAQOFEQgAAAAAonEgIAAAAAIUTCQEAAACgcCIhAAAAABROJAQAAACAwomEAAAAAFA4kRAAAAAACicSAgAAAEDhREIAAAAAKJxICAAAAACFEwkBAAAAoHAiIQAAAAAUTiQEAAAAgMKJhAAAAABQOJEQAAAAAAonEgIAAABA4URCAAAAACicSAgAAAAAhfv/yhwjfdCwcaIAAAAASUVORK5CYII=", + "text/html": [ + "
" + ], + "text/vnd.plotly.v1+html": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import scipy as sp\n", + "import plotly.offline as py\n", + "import plotly.figure_factory as ff\n", + "import plotly.graph_objs as go\n", + "from plotly.offline import get_plotlyjs, init_notebook_mode\n", + "\n", + "pandasDF = oskar.histogram(df,\"start\",1000000).toPandas()\n", + "\n", + "get_plotlyjs()\n", + "init_notebook_mode(connected=True)\n", + "\n", + "data = [go.Bar(x=pandasDF[\"start\"], y=pandasDF[\"count\"])]\n", + "py.iplot(data)" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "" + ], + "text/vnd.plotly.v1+html": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.plotly.v1+json": { + "config": { + "linkText": "Export to plot.ly", + "plotlyServerURL": "https://plot.ly", + "showLink": true + }, + "data": [ + { + "hoverinfo": "text", + "marker": { + "color": "rgb(61,153,112)" + }, + "mode": "lines", + "type": "scatter", + "uid": "33df76e2-a40b-46f6-85b6-eaa89342b8f1", + "x": [ + 15, + 15, + 25, + 25 + ], + "xaxis": "x", + "y": [ + 0, + 14, + 14, + 0 + ], + "yaxis": "y" + }, + { + "hoverinfo": "text", + "marker": { + "color": "rgb(61,153,112)" + }, + "mode": "lines", + "type": "scatter", + "uid": "f0e4718c-8a94-43a3-b7b2-f2aead5e43cc", + "x": [ + 45, + 45, + 55, + 55 + ], + "xaxis": "x", + "y": [ + 0, + 7.937253933193772, + 7.937253933193772, + 0 + ], + "yaxis": "y" + }, + { + "hoverinfo": "text", + "marker": { + "color": "rgb(61,153,112)" + }, + "mode": "lines", + "type": "scatter", + "uid": "964ac19f-f803-4502-9e62-94537c374184", + "x": [ + 35, + 35, + 50, + 50 + ], + "xaxis": "x", + "y": [ + 0, + 22.538855339169288, + 22.538855339169288, + 7.937253933193772 + ], + "yaxis": "y" + }, + { + "hoverinfo": "text", + "marker": { + "color": "rgb(61,153,112)" + }, + "mode": "lines", + 
"type": "scatter", + "uid": "612b220c-63a5-49ce-9902-97cd98ee444c", + "x": [ + 20, + 20, + 42.5, + 42.5 + ], + "xaxis": "x", + "y": [ + 14, + 40.22437072223753, + 40.22437072223753, + 22.538855339169288 + ], + "yaxis": "y" + }, + { + "hoverinfo": "text", + "marker": { + "color": "rgb(61,153,112)" + }, + "mode": "lines", + "type": "scatter", + "uid": "99d81210-eac1-4fb5-9d1c-75019cd9ba9d", + "x": [ + 5, + 5, + 31.25, + 31.25 + ], + "xaxis": "x", + "y": [ + 0, + 49.01020301937138, + 49.01020301937138, + 40.22437072223753 + ], + "yaxis": "y" + }, + { + "hoverinfo": "text", + "marker": { + "color": "rgb(61,153,112)" + }, + "mode": "lines", + "type": "scatter", + "uid": "1eb57830-1275-4d76-81ee-a26134239240", + "x": [ + 85, + 85, + 95, + 95 + ], + "xaxis": "x", + "y": [ + 0, + 1, + 1, + 0 + ], + "yaxis": "y" + }, + { + "hoverinfo": "text", + "marker": { + "color": "rgb(61,153,112)" + }, + "mode": "lines", + "type": "scatter", + "uid": "f5f976b6-0ca1-4397-8133-9545231b441c", + "x": [ + 75, + 75, + 90, + 90 + ], + "xaxis": "x", + "y": [ + 0, + 21.93171219946131, + 21.93171219946131, + 1 + ], + "yaxis": "y" + }, + { + "hoverinfo": "text", + "marker": { + "color": "rgb(61,153,112)" + }, + "mode": "lines", + "type": "scatter", + "uid": "05b2e115-a852-4a79-adc2-9e68134c67c2", + "x": [ + 65, + 65, + 82.5, + 82.5 + ], + "xaxis": "x", + "y": [ + 0, + 55.00909015790027, + 55.00909015790027, + 21.93171219946131 + ], + "yaxis": "y" + }, + { + "hoverinfo": "text", + "marker": { + "color": "rgb(61,153,112)" + }, + "mode": "lines", + "type": "scatter", + "uid": "af396f60-958e-4763-91c7-b1417617e588", + "x": [ + 18.125, + 18.125, + 73.75, + 73.75 + ], + "xaxis": "x", + "y": [ + 49.01020301937138, + 89.05616205518852, + 89.05616205518852, + 55.00909015790027 + ], + "yaxis": "y" + }, + { + "hoverinfo": "text", + "marker": { + "color": "rgb(255,65,54)" + }, + "mode": "lines", + "type": "scatter", + "uid": "ac6eb5b2-1794-4981-a763-36535141c8cd", + "x": [ + 105, + 105, + 115, + 115 + ], + 
"xaxis": "x", + "y": [ + 0, + 22, + 22, + 0 + ], + "yaxis": "y" + }, + { + "hoverinfo": "text", + "marker": { + "color": "rgb(255,65,54)" + }, + "mode": "lines", + "type": "scatter", + "uid": "72fa80a4-dca7-4223-b98f-948230261967", + "x": [ + 125, + 125, + 135, + 135 + ], + "xaxis": "x", + "y": [ + 0, + 8, + 8, + 0 + ], + "yaxis": "y" + }, + { + "hoverinfo": "text", + "marker": { + "color": "rgb(255,65,54)" + }, + "mode": "lines", + "type": "scatter", + "uid": "97a53470-398c-4472-a1be-dc9fd9a272ec", + "x": [ + 145, + 145, + 155, + 155 + ], + "xaxis": "x", + "y": [ + 0, + 20.8806130178211, + 20.8806130178211, + 0 + ], + "yaxis": "y" + }, + { + "hoverinfo": "text", + "marker": { + "color": "rgb(255,65,54)" + }, + "mode": "lines", + "type": "scatter", + "uid": "7502d126-c5b2-4e46-9dc4-fa0a20bff326", + "x": [ + 130, + 130, + 150, + 150 + ], + "xaxis": "x", + "y": [ + 8, + 36.742346141747674, + 36.742346141747674, + 20.8806130178211 + ], + "yaxis": "y" + }, + { + "hoverinfo": "text", + "marker": { + "color": "rgb(255,65,54)" + }, + "mode": "lines", + "type": "scatter", + "uid": "d1f3cbe0-0915-46f6-81e7-71cd2aa4b1ae", + "x": [ + 110, + 110, + 140, + 140 + ], + "xaxis": "x", + "y": [ + 22, + 51.0098029794274, + 51.0098029794274, + 36.742346141747674 + ], + "yaxis": "y" + }, + { + "hoverinfo": "text", + "marker": { + "color": "rgb(255,65,54)" + }, + "mode": "lines", + "type": "scatter", + "uid": "8942934b-1325-4855-b6df-51c5ab3a5605", + "x": [ + 175, + 175, + 185, + 185 + ], + "xaxis": "x", + "y": [ + 0, + 28.844410203711913, + 28.844410203711913, + 0 + ], + "yaxis": "y" + }, + { + "hoverinfo": "text", + "marker": { + "color": "rgb(255,65,54)" + }, + "mode": "lines", + "type": "scatter", + "uid": "fd1c7e73-f0d9-4e55-9b05-9143c1585f6a", + "x": [ + 165, + 165, + 180, + 180 + ], + "xaxis": "x", + "y": [ + 0, + 73.72923436466705, + 73.72923436466705, + 28.844410203711913 + ], + "yaxis": "y" + }, + { + "hoverinfo": "text", + "marker": { + "color": "rgb(255,65,54)" + }, + 
"mode": "lines", + "type": "scatter", + "uid": "07311b97-5403-43f4-ae38-42b9da358e23", + "x": [ + 125, + 125, + 172.5, + 172.5 + ], + "xaxis": "x", + "y": [ + 51.0098029794274, + 128.7555824032496, + 128.7555824032496, + 73.72923436466705 + ], + "yaxis": "y" + }, + { + "hoverinfo": "text", + "marker": { + "color": "rgb(0,116,217)" + }, + "mode": "lines", + "type": "scatter", + "uid": "29f1e126-b9c0-4758-a313-c8830bb76b8d", + "x": [ + 45.9375, + 45.9375, + 148.75, + 148.75 + ], + "xaxis": "x", + "y": [ + 89.05616205518852, + 200.37215375395854, + 200.37215375395854, + 128.7555824032496 + ], + "yaxis": "y" + } + ], + "layout": { + "autosize": false, + "height": 400, + "hovermode": "closest", + "showlegend": false, + "width": 1440, + "xaxis": { + "autorange": true, + "mirror": "allticks", + "range": [ + 0, + 185 + ], + "rangemode": "tozero", + "showgrid": false, + "showline": true, + "showticklabels": true, + "tickmode": "array", + "ticks": "outside", + "ticktext": [ + "9", + "3", + "4", + "11", + "6", + "7", + "5", + "2", + "0", + "1", + "14", + "17", + "15", + "16", + "8", + "10", + "18", + "12", + "13" + ], + "tickvals": [ + 5, + 15, + 25, + 35, + 45, + 55, + 65, + 75, + 85, + 95, + 105, + 115, + 125, + 135, + 145, + 155, + 165, + 175, + 185 + ], + "type": "linear", + "zeroline": false + }, + "yaxis": { + "autorange": true, + "mirror": "allticks", + "range": [ + 0, + 210.91805658311426 + ], + "rangemode": "tozero", + "showgrid": false, + "showline": true, + "showticklabels": true, + "ticks": "outside", + "type": "linear", + "zeroline": false + } + } + }, + "image/png": "", + "text/html": [ + "
" + ], + "text/vnd.plotly.v1+html": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import plotly.offline as py\n", + "import plotly.figure_factory as ff\n", + "from plotly.offline import get_plotlyjs, init_notebook_mode\n", + "\n", + "import numpy as np\n", + "\n", + "init_notebook_mode(connected=True)\n", + "\n", + "pandasDF = oskar.histogram(df,\"start\",1000000).toPandas()\n", + "# print(pandasDF)\n", + "dim = len(pandasDF[\"count\"])\n", + "X = np.zeros((dim, dim))\n", + "for i in range(dim):\n", + " for j in range(i,dim):\n", + " X[j, i] = pandasDF[\"count\"][i] - pandasDF[\"count\"][j]\n", + "# print(X)\n", + "# labels = [pandasDF[\"start\"]]\n", + "fig = ff.create_dendrogram(X)\n", + "fig['layout'].update({'width':1440, 'height':400})\n", + "py.iplot(fig, filename='dendrogram_with_labels')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Con extensión renderizando con Plotly Python" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.plotly.v1+json": { + "config": { + "linkText": "Export to plot.ly", + "plotlyServerURL": "https://plot.ly", + "showLink": true + }, + "data": [ + { + "type": "heatmap", + "uid": "f1070ce3-43dc-472c-8e1f-9b6b4431f77b", + "z": [ + [ + 1, + 20, + 30 + ], + [ + 20, + 1, + 60 + ], + [ + 30, + 60, + 1 + ] + ], + "zauto": true, + "zmax": 60, + "zmin": 1 + } + ], + "layout": { + "autosize": true, + "xaxis": { + "autorange": true, + "range": [ + -0.5, + 2.5 + ] + }, + "yaxis": { + "autorange": true, + "range": [ + -0.5, + 2.5 + ] + } + } + }, + "image/png": "", + "text/html": [ + "
" + ], + "text/vnd.plotly.v1+html": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import plotly\n", + "from plotly.offline import iplot\n", + "\n", + "trace = plotly.graph_objs.Heatmap(z=[[1, 20, 30],\n", + " [20, 1, 60],\n", + " [30, 60, 1]])\n", + "\n", + "fig = dict(data=[trace])\n", + "\n", + "iplot(fig)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Con extensión renderizando con Plotly JSON" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.plotly.v1+json": { + "data": [ + { + "type": "scatter", + "x": [ + 1999, + 2000, + 2001, + 2002 + ], + "y": [ + 10, + 15, + 13, + 17 + ] + }, + { + "type": "scatter", + "x": [ + 1999, + 2000, + 2001, + 2002 + ], + "y": [ + 16, + 5, + 11, + 9 + ] + } + ], + "layout": { + "autosize": true, + "title": "Sales Growth", + "xaxis": { + "autorange": true, + "range": [ + 1998.820445406743, + 2002.179554593257 + ], + "showgrid": false, + "title": "Year", + "type": "linear", + "zeroline": false + }, + "yaxis": { + "autorange": true, + "range": [ + 4.009708737864078, + 17.990291262135923 + ], + "showline": false, + "title": "Percent", + "type": "linear" + } + } + }, + "image/png": "" + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from IPython.display import display\n", + "\n", + "def Plotly(data=[], layout={}):\n", + " bundle = {}\n", + " bundle['application/vnd.plotly.v1+json'] = {\n", + " 'data': data,\n", + " 'layout': layout,\n", + " }\n", + " display(bundle, raw=True)\n", + "\n", + "data = [\n", + " {'x': [1999, 2000, 2001, 2002], 'y': [10, 15, 13, 17], 'type': 'scatter'},\n", + " {'x': [1999, 2000, 2001, 2002], 'y': [16, 5, 11, 9], 'type': 'scatter'}\n", + "]\n", + "\n", + "layout = {\n", + " 'title': 'Sales Growth',\n", + " 'xaxis': {'title': 'Year', 'showgrid': False, 'zeroline': False},\n", + " 'yaxis': {'title': 'Percent', 'showline': False}\n", + "}\n", + "\n", + "Plotly(data, 
layout)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/oskar-spark/src/main/python/notebooks/GWAS.ipynb b/oskar-spark/src/main/python/notebooks/notebooks/GWAS.ipynb similarity index 100% rename from oskar-spark/src/main/python/notebooks/GWAS.ipynb rename to oskar-spark/src/main/python/notebooks/notebooks/GWAS.ipynb diff --git a/oskar-spark/src/main/python/notebooks/notebooks/facets.ipynb b/oskar-spark/src/main/python/notebooks/notebooks/facets.ipynb new file mode 100644 index 0000000..4e4e3dd --- /dev/null +++ b/oskar-spark/src/main/python/notebooks/notebooks/facets.ipynb @@ -0,0 +1,651 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# **Facets tutorial**\n", + "One of the most sophisticated transformations that Pyoskar provides is [ **facet** ]. Like every transformation, it belongs to the Oskar class and we can access it through our Oskar instance.\n", + "
\n", + "Usage:\n", + "```\n", + "facet(df[DataFrame], facet[str])\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We have to import both Spark and Oskar APIs, as well as load our data into a Spark DataFrame:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from pyoskar.core import Oskar\n", + "from pyoskar.sql import *\n", + "from pyoskar.analysis import *\n", + "from pyspark.sql.functions import col, udf, count, explode, concat, when, expr\n", + "from pyspark.sql.functions import *\n", + "\n", + "oskar = Oskar(spark)\n", + "df = oskar.load(\"/home/roldanx/appl/oskar/oskar-spark/src/test/resources/platinum_chr22.small.parquet\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Simple facets\n", + "Now that we have loaded our data, we start with an easy facet. This example executes the classic \"groupBy\" and \"count\" on our dataframe. The following format is designed to be applied to categorical or discrete quantitative variables. 
That could be any of these: \n", + " - Chromosome [ **chromosome** ]\n", + " - Variant type [ **type** ]\n", + " - Studies [ **studies** ]\n", + " - Biotype [ **biotype** ]\n", + " - Consequence type [ **ct** ]\n", + " - Gene [ **gene** ]\n", + " - Ensembl gene ID [ **ensemblGeneId** ]\n", + " - Ensembl gene transcript [ **ensemblTranscriptId** ]" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-----+-----+\n", + "| type|count|\n", + "+-----+-----+\n", + "|INDEL| 106|\n", + "| SNV| 894|\n", + "+-----+-----+\n", + "\n" + ] + } + ], + "source": [ + "oskar.facet(df, \"type\").show()" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+----------------------------------+-----+\n", + "|ct |count|\n", + "+----------------------------------+-----+\n", + "|2KB_downstream_variant |146 |\n", + "|2KB_upstream_variant |146 |\n", + "|3_prime_UTR_variant |9 |\n", + "|5_prime_UTR_variant |1 |\n", + "|NMD_transcript_variant |140 |\n", + "|TF_binding_site_variant |108 |\n", + "|downstream_gene_variant |163 |\n", + "|intergenic_variant |222 |\n", + "|intron_variant |543 |\n", + "|missense_variant |4 |\n", + "|non_coding_transcript_exon_variant|45 |\n", + "|non_coding_transcript_variant |385 |\n", + "|regulatory_region_variant |764 |\n", + "|splice_donor_variant |1 |\n", + "|splice_region_variant |2 |\n", + "|synonymous_variant |6 |\n", + "|upstream_gene_variant |200 |\n", + "+----------------------------------+-----+\n", + "\n" + ] + } + ], + "source": [ + "oskar.facet(df, \"ct\").show(truncate=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Include facets\n", + "We can also apply a filter based on the values we specify in the function:" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + 
"outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-------+-----+\n", + "| gene|count|\n", + "+-------+-----+\n", + "|BCL2L13| 8|\n", + "| CECR2| 11|\n", + "+-------+-----+\n", + "\n" + ] + } + ], + "source": [ + "oskar.facet(df, \"gene[BCL2L13,CECR2]\").show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Range facets\n", + "Using a syntax similar to \"Include facets\", but dealing with continuous quantitative variables, we can apply facets by range, setting both the upper and lower thresholds as well as the step, e.g. [start..end]:step. Available range fields:\n", + " - Conservation scores: gerp [ **gerp** ], phylop [ **phylop** ] or phastCons [ **phastCons** ]\n", + " - Functional scores: cadd_scaled [ **cadd_scaled** ] or cadd_raw [ **cadd_raw** ]\n", + " - Substitution scores: sift [ **sift** ] or polyphen [ **polyphen** ]" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-----------+-----+\n", + "|phylopRange|count|\n", + "+-----------+-----+\n", + "| -4.0| 3|\n", + "| -3.0| 12|\n", + "| -2.0| 55|\n", + "| -1.0| 171|\n", + "| 0.0| 681|\n", + "+-----------+-----+\n", + "\n" + ] + } + ], + "source": [ + "oskar.facet(df, \"phylop[-5..0]:1\").show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Furthermore, we have two more range fields where we can use our facets. These have the peculiarity that they need extra inputs to be fully defined: we need to specify the study they belong to and the cohort, separated by two underscores. These fields are:\n", + " - Global alternate population frequency [ **popFreq** ] \n", + " - Dataframe alternate population frequency [ **stats** ], which is included in the dataframe stats field." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 123, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+---------------------------------+-----+\n", + "|popFreq__GNOMAD_GENOMES__ALLRange|count|\n", + "+---------------------------------+-----+\n", + "| 0.0| 514|\n", + "| 0.1| 112|\n", + "| 0.2| 75|\n", + "| 0.30000000000000004| 97|\n", + "| 0.4| 77|\n", + "| 0.5| 26|\n", + "| 0.6000000000000001| 35|\n", + "| 0.7000000000000001| 23|\n", + "| 0.8| 18|\n", + "| 0.9| 15|\n", + "| 1.0| 8|\n", + "+---------------------------------+-----+\n", + "\n" + ] + } + ], + "source": [ + "oskar.facet(df, \"popFreq__GNOMAD_GENOMES__ALL[0..1]:0.1\").show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In case we want to get the dataframe alternate population frequency, first we will need to fill the stats field as explained in the \"stats\" tutorial:" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+----------------------------------------------------+-----+\n", + "|stats__hgvauser@platinum:illumina_platinum__ALLRange|count|\n", + "+----------------------------------------------------+-----+\n", + "| 0.0| 41|\n", + "| 0.1| 15|\n", + "| 0.2| 14|\n", + "| 0.30000000000000004| 10|\n", + "| 0.4| 7|\n", + "| 0.5| 714|\n", + "| 0.6000000000000001| 68|\n", + "| 0.7000000000000001| 49|\n", + "| 0.8| 12|\n", + "| 0.9| 21|\n", + "| 1.0| 48|\n", + "+----------------------------------------------------+-----+\n", + "\n" + ] + } + ], + "source": [ + "samples = oskar.metadata.samples(df)[\"hgvauser@platinum:illumina_platinum\"]\n", + "df2 = oskar.stats(df,studyId=\"hgvauser@platinum:illumina_platinum\",cohort=\"ALL\",samples=samples)\n", + "oskar.facet(df2, \"stats__hgvauser@platinum:illumina_platinum__ALL[0..1]:0.1\").show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 
Aggregation facets\n", + "We may want to check whether the compounds of all variants have historically been well conserved or, otherwise, have notably evolved. For this task we can use aggregation facets, which substitute the default \"count\" function with another one chosen from the following:\n", + " - Average [ **avg** ]\n", + " - Maximum [ **max** ]\n", + " - Minimum [ **min** ]\n", + " - Sum [ **sum** ]\n", + " - Sum of squares [ **sumsq** ]\n", + " - Standard deviation [ **stddev** ]\n", + " - Variance [ **var** ]\n", + " - Percentile values [ **percentile** ]\n", + " - Set of values [ **unique** ]" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-------------------+-----+\n", + "| avg(gerp)|count|\n", + "+-------------------+-----+\n", + "|-0.3518712293113349| 1000|\n", + "+-------------------+-----+\n", + "\n" + ] + } + ], + "source": [ + "oskar.facet(df, \"avg(gerp)\").show(truncate=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 162, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+---------------------------------------------------------------------------------------+-----+\n", + "|percentile(gerp) |count|\n", + "+---------------------------------------------------------------------------------------+-----+\n", + "|[-2.152000093460083, -0.6257500052452087, 0.0, 0.14900000393390656, 0.7430999755859375]|1000 |\n", + "+---------------------------------------------------------------------------------------+-----+\n", + "\n" + ] + } + ], + "source": [ + "oskar.facet(df, \"percentile(gerp)\").show(truncate=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Nested facets\n", + "The last feature available for our facet queries is nesting, which allows us to concatenate groups and build complex studies by using the \">>\" separator." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-----------------------+--------------------+-----+\n", + "|biotype |ct |count|\n", + "+-----------------------+--------------------+-----+\n", + "|nonsense_mediated_decay|splice_donor_variant|1 |\n", + "|processed_transcript |splice_donor_variant|1 |\n", + "|protein_coding |splice_donor_variant|1 |\n", + "|retained_intron |splice_donor_variant|1 |\n", + "+-----------------------+--------------------+-----+\n", + "\n" + ] + } + ], + "source": [ + "oskar.facet(df, \"biotype>>ct[splice_donor_variant]\").show(truncate=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Final facets\n", + "Now it is up to us to mix all these ingredients:" + ] + }, + { + "cell_type": "code", + "execution_count": 122, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-----+-----------------------+-------------+----------------+--------------------+-----+\n", + "|gene |biotype |cadd_rawRange|cadd_scaledRange|min(phylop) |count|\n", + "+-----+-----------------------+-------------+----------------+--------------------+-----+\n", + "|AIFM3|lincRNA |0.2 |6.0 |-0.2809999883174896 |1 |\n", + "|AIFM3|nonsense_mediated_decay|0.2 |6.0 |-0.2809999883174896 |1 |\n", + "|AIFM3|processed_transcript |0.2 |6.0 |-0.2809999883174896 |1 |\n", + "|AIFM3|protein_coding |0.2 |6.0 |-0.2809999883174896 |1 |\n", + "|AIFM3|retained_intron |0.2 |6.0 |-0.2809999883174896 |1 |\n", + "|GGT1 |nonsense_mediated_decay|-0.2 |0.0 |-1.1380000114440918 |2 |\n", + "|GGT1 |nonsense_mediated_decay|0.0 |1.0 |0.10199999809265137 |2 |\n", + "|GGT1 |nonsense_mediated_decay|0.0 |2.0 |0.054999999701976776|1 |\n", + "|GGT1 |nonsense_mediated_decay|0.0 |4.0 |0.2809999883174896 |1 |\n", + "|GGT1 |nonsense_mediated_decay|0.4 |7.0 |-1.50600004196167 |2 |\n", + "|GGT1 |processed_transcript |-0.2 
|0.0 |0.10199999809265137 |1 |\n", + "|GGT1 |processed_transcript |0.0 |1.0 |0.10199999809265137 |1 |\n", + "|GGT1 |processed_transcript |0.0 |2.0 |0.054999999701976776|1 |\n", + "|GGT1 |processed_transcript |0.0 |4.0 |0.2809999883174896 |1 |\n", + "|GGT1 |processed_transcript |0.4 |7.0 |-1.50600004196167 |2 |\n", + "|GGT1 |protein_coding |-0.2 |0.0 |-1.1380000114440918 |2 |\n", + "|GGT1 |protein_coding |0.0 |1.0 |0.10199999809265137 |2 |\n", + "|GGT1 |protein_coding |0.0 |2.0 |0.054999999701976776|1 |\n", + "|GGT1 |protein_coding |0.0 |4.0 |0.2809999883174896 |1 |\n", + "|GGT1 |protein_coding |0.4 |7.0 |-1.50600004196167 |2 |\n", + "|GGT1 |retained_intron |-0.2 |0.0 |0.10199999809265137 |1 |\n", + "|GGT1 |retained_intron |0.0 |1.0 |0.10199999809265137 |2 |\n", + "|GGT1 |retained_intron |0.4 |7.0 |-0.7080000042915344 |1 |\n", + "|GGT1 |sense_intronic |-0.2 |0.0 |-1.1380000114440918 |1 |\n", + "|GGT1 |unprocessed_pseudogene |0.4 |7.0 |-1.50600004196167 |1 |\n", + "+-----+-----------------------+-------------+----------------+--------------------+-----+\n", + "\n" + ] + } + ], + "source": [ + "oskar.facet(df, \"gene[AIFM3,GGT1]>>biotype>>cadd_raw[-10..10]:0.2>>cadd_scaled[-10..10]:1>>min(phylop)\").show(25, truncate=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 96, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+----+---------------------------------+-----+\n", + "|type|popFreq__GNOMAD_GENOMES__ALLRange|count|\n", + "+----+---------------------------------+-----+\n", + "| SNV| 0.0| 478|\n", + "| SNV| 0.1| 95|\n", + "| SNV| 0.2| 57|\n", + "| SNV| 0.30000000000000004| 90|\n", + "| SNV| 0.4| 66|\n", + "| SNV| 0.5| 21|\n", + "+----+---------------------------------+-----+\n", + "\n", + "+-----+---------------------------------+-----+\n", + "| type|popFreq__GNOMAD_GENOMES__ALLRange|count|\n", + "+-----+---------------------------------+-----+\n", + "|INDEL| 0.0| 36|\n", + "|INDEL| 0.1| 17|\n", + 
"|INDEL| 0.2| 18|\n", + "|INDEL| 0.30000000000000004| 7|\n", + "|INDEL| 0.4| 11|\n", + "|INDEL| 0.5| 5|\n", + "+-----+---------------------------------+-----+\n", + "\n" + ] + } + ], + "source": [ + "snvPandas = oskar.facet(df, \"type[SNV]>>popFreq__GNOMAD_GENOMES__ALL[0..0.5]:0.1\").show()\n", + "indelPandas = oskar.facet(df, \"type[INDEL]>>popFreq__GNOMAD_GENOMES__ALL[0..0.5]:0.1\").show()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "" + ], + "text/vnd.plotly.v1+html": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.plotly.v1+json": { + "config": { + "linkText": "Export to plot.ly", + "plotlyServerURL": "https://plot.ly", + "showLink": true + }, + "data": [ + { + "domain": { + "x": [ + 0, + 0.48 + ] + }, + "hole": 0.4, + "labels": [ + 0, + 0.1, + 0.2, + 0.30000000000000004, + 0.4, + 0.5 + ], + "type": "pie", + "uid": "935ed220-f713-4035-8a30-1c79dee5aa7d", + "values": [ + 478, + 95, + 57, + 90, + 66, + 21 + ] + }, + { + "domain": { + "x": [ + 0.51, + 1 + ] + }, + "hole": 0.4, + "labels": [ + 0, + 0.1, + 0.2, + 0.30000000000000004, + 0.4, + 0.5 + ], + "type": "pie", + "uid": "fcf24555-7061-411c-9ed7-ac58ebb3c67c", + "values": [ + 36, + 17, + 18, + 7, + 11, + 5 + ] + } + ], + "layout": { + "annotations": [ + { + "font": { + "size": 18 + }, + "showarrow": false, + "text": "SNV", + "x": 0.22, + "y": 0.5 + }, + { + "font": { + "size": 18 + }, + "showarrow": false, + "text": "INDEL", + "x": 0.78, + "y": 0.5 + } + ], + "autosize": false, + "height": 600, + "title": "Global alternate population frequencies [ALL]", + "width": 1500 + } + }, + "image/png": "", + "text/html": [ + "
" + ], + "text/vnd.plotly.v1+html": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import plotly.offline as py\n", + "import plotly.graph_objs as go\n", + "from plotly.offline import get_plotlyjs, init_notebook_mode\n", + "\n", + "snvPandas = oskar.facet(df, \"type[SNV]>>popFreq__GNOMAD_GENOMES__ALL[0..0.5]:0.1\").toPandas()\n", + "indelPandas = oskar.facet(df, \"type[INDEL]>>popFreq__GNOMAD_GENOMES__ALL[0..0.5]:0.1\").toPandas()\n", + "\n", + "init_notebook_mode(connected=True)\n", + "\n", + "fig = {\n", + " \"data\": [\n", + " {\n", + " \"values\": snvPandas[\"count\"],\n", + " \"labels\": snvPandas[\"popFreq__GNOMAD_GENOMES__ALLRange\"],\n", + " \"domain\": {\"x\": [0, .48]},\n", + " \"hole\": .4,\n", + " \"type\": \"pie\"\n", + " },\n", + " {\n", + " \"values\": indelPandas[\"count\"],\n", + " \"labels\": indelPandas[\"popFreq__GNOMAD_GENOMES__ALLRange\"],\n", + " \"domain\": {\"x\": [.51, 1]},\n", + " \"hole\": .4,\n", + " \"type\": \"pie\"\n", + " }],\n", + " \"layout\": {\n", + " \"title\":\"Global alternate population frequencies [ALL]\",\n", + " \"autosize\":False,\n", + " \"width\":1500,\n", + " \"height\":600,\n", + " \"annotations\": [\n", + " {\n", + " \"font\": {\n", + " \"size\": 18\n", + " },\n", + " \"showarrow\": False,\n", + " \"text\": \"SNV\",\n", + " \"x\": 0.22,\n", + " \"y\": 0.5\n", + " },\n", + " {\n", + " \"font\": {\n", + " \"size\": 18\n", + " },\n", + " \"showarrow\": False,\n", + " \"text\": \"INDEL\",\n", + " \"x\": 0.78,\n", + " \"y\": 0.5\n", + " }\n", + " ]\n", + " }\n", + "}\n", + "py.iplot(fig, filename='donut')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.7" + } + }, + "nbformat": 4, + 
"nbformat_minor": 2 +} diff --git a/oskar-spark/src/main/python/notebooks/notebooks/my_notebook.ipynb b/oskar-spark/src/main/python/notebooks/notebooks/my_notebook.ipynb new file mode 100644 index 0000000..259542d --- /dev/null +++ b/oskar-spark/src/main/python/notebooks/notebooks/my_notebook.ipynb @@ -0,0 +1,480 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
\n", + "

SparkSession - hive

\n", + " \n", + "
\n", + "

SparkContext

\n", + "\n", + "

Spark UI

\n", + "\n", + "
\n", + "
Version
\n", + "
v2.4.0
\n", + "
Master
\n", + "
local[*]
\n", + "
AppName
\n", + "
PySparkShell
\n", + "
\n", + "
\n", + " \n", + "
\n", + " " + ], + "text/plain": [ + "" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "spark" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'Oskar' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mOskar\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;31mNameError\u001b[0m: name 'Oskar' is not defined" + ] + } + ], + "source": [ + "Oskar()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "ename": "TypeError", + "evalue": "'JavaPackage' object is not callable", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mpyspark\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msql\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfunctions\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 7\u001b[0;31m \u001b[0moskar\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mOskar\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mspark\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 8\u001b[0m \u001b[0mdf\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0moskar\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"/home/roldanx/appl/oskar/oskar-spark/src/test/resources/platinum_chr22.small.parquet\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/appl/oskar/oskar-spark/src/main/python/pyoskar/core.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, spark)\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__init__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mspark\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 14\u001b[0m \u001b[0msuper\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mOskar\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__init__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 15\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_java_obj\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_new_java_obj\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"org.opencb.oskar.spark.variant.Oskar\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mspark\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_jsparkSession\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 16\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mspark\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mspark\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 17\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmetadata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mVariantMetadataManager\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/soft/spark-2.4.0-bin-hadoop2.7/python/pyspark/ml/wrapper.py\u001b[0m in 
\u001b[0;36m_new_java_obj\u001b[0;34m(java_class, *args)\u001b[0m\n\u001b[1;32m 65\u001b[0m \u001b[0mjava_obj\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mgetattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mjava_obj\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 66\u001b[0m \u001b[0mjava_args\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0m_py2java\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0marg\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0marg\u001b[0m \u001b[0;32min\u001b[0m \u001b[0margs\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 67\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mjava_obj\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0mjava_args\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 68\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 69\u001b[0m \u001b[0;34m@\u001b[0m\u001b[0mstaticmethod\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mTypeError\u001b[0m: 'JavaPackage' object is not callable" + ] + } + ], + "source": [ + "from pyoskar.core import Oskar\n", + "from pyoskar.sql import *\n", + "from pyoskar.analysis import *\n", + "from pyspark.sql.functions import col, udf, count, explode, concat, when, expr\n", + "from pyspark.sql.functions import *\n", + "\n", + "oskar = Oskar(spark)\n", + "df = oskar.load(\"/home/roldanx/appl/oskar/oskar-spark/src/test/resources/platinum_chr22.small.parquet\")" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "pandaDF = oskar.histogram(df,\"start\",100000).toPandas()\n", + "pandaDF.plot(x = \"start\", y = \"count\")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'hgvauser@platinum:illumina_platinum': ['NA12877', 'NA12878', 'NA12879', 'NA12880', 'NA12881', 'NA12882', 'NA12883', 'NA12884', 'NA12885', 'NA12886', 'NA12887', 'NA12888', 'NA12889', 'NA12890', 'NA12891', 'NA12892', 'NA12893']}\n", + "+---------------+--------------------+\n", + "| id| HWE|\n", + "+---------------+--------------------+\n", + "|22:16054454:C:T| 1.0|\n", + "|22:16065809:T:C| 1.0|\n", + "|22:16077310:T:A| 0.9254727474972191|\n", + "|22:16080499:A:G| 1.0|\n", + "|22:16084621:T:C| 1.0|\n", + "|22:16091610:G:T| 1.0|\n", + "|22:16096040:G:A| 0.4746014089729329|\n", + "|22:16099957:C:T|0.016007636455477054|\n", + "|22:16100462:A:G|0.001011008618240...|\n", + "|22:16105660:G:A| 0.3037449017426771|\n", + "|22:16112391:G:A| 0.17718452601656157|\n", + "|22:16114913:A:T| 0.6855353685164587|\n", + "|22:16127471:A:-| 1.0|\n", + "|22:16134019:G:T| 0.17718452601656157|\n", + "|22:16138943:C:G| 0.6855353685164587|\n", + "|22:16144239:T:C| 0.18181818181818182|\n", + "|22:16147398:G:A|0.001011224592982...|\n", + "|22:16149692:G:T| 0.08884238232789762|\n", + "|22:16195955:G:A|0.007751066073178...|\n", + "|22:16196041:C:T| 0.6855353685164587|\n", + "+---------------+--------------------+\n", + "only showing top 20 rows\n", + "\n", + "+-----+-----+\n", + "| type|count|\n", + "+-----+-----+\n", + "|INDEL| 106|\n", + "| SNV| 894|\n", + "+-----+-----+\n", + "\n", + "+-----+--------+\n", + "| type|count(1)|\n", + "+-----+--------+\n", + "|INDEL| 106|\n", + "| SNV| 894|\n", + "+-----+--------+\n", + "\n", + "+---------+---------+-----+\n", + "|reference|alternate|count|\n", + 
"+---------+---------+-----+\n", + "| C| T| 142|\n", + "| G| A| 138|\n", + "| A| G| 120|\n", + "| T| C| 111|\n", + "| C| A| 63|\n", + "| G| T| 53|\n", + "| A| C| 53|\n", + "| T| A| 52|\n", + "| T| G| 48|\n", + "| A| T| 47|\n", + "| C| G| 34|\n", + "| G| C| 33|\n", + "+---------+---------+-----+\n", + "\n", + "+---------+---------+-----+\n", + "|reference|alternate|count|\n", + "+---------+---------+-----+\n", + "| C| T| 142|\n", + "| G| A| 138|\n", + "| A| G| 120|\n", + "| T| C| 111|\n", + "| C| A| 63|\n", + "| A| C| 53|\n", + "| G| T| 53|\n", + "| T| A| 52|\n", + "| T| G| 48|\n", + "| A| T| 47|\n", + "| C| G| 34|\n", + "| G| C| 33|\n", + "+---------+---------+-----+\n", + "\n", + "+---------+---------+-----+\n", + "|reference|alternate|count|\n", + "+---------+---------+-----+\n", + "| C| T| 142|\n", + "| G| A| 138|\n", + "| A| G| 120|\n", + "| T| C| 111|\n", + "| C| A| 63|\n", + "| A| C| 53|\n", + "| G| T| 53|\n", + "| T| A| 52|\n", + "| T| G| 48|\n", + "| A| T| 47|\n", + "| C| G| 34|\n", + "| G| C| 33|\n", + "+---------+---------+-----+\n", + "\n", + "+---------+---------+--------+\n", + "|reference|alternate|count(1)|\n", + "+---------+---------+--------+\n", + "| C| T| 142|\n", + "| G| A| 138|\n", + "| A| G| 120|\n", + "| T| C| 111|\n", + "| C| A| 63|\n", + "| A| C| 53|\n", + "| G| T| 53|\n", + "| T| A| 52|\n", + "| T| G| 48|\n", + "| A| T| 47|\n", + "| C| G| 34|\n", + "| G| C| 33|\n", + "+---------+---------+--------+\n", + "\n" + ] + } + ], + "source": [ + "df.createOrReplaceTempView(\"chr22\")\n", + "print(oskar.metadata.samples(df))\n", + "oskar.hardy_weinberg(df, \"hgvauser@platinum:illumina_platinum\").select(\"id\", \"HWE\").show()\n", + "\n", + "# Group by type\n", + "df.groupBy(\"type\").count().show()\n", + "spark.sql(\"SELECT type, count(*) FROM chr22 GROUP BY type\").show()\n", + "\n", + "## Group by variant\n", + "# 1)\n", + "df.where(\"type = 'SNV'\").select(\"reference\", \"alternate\").groupBy(\"reference\", 
\"alternate\").count().sort(\"count\", ascending=False).show()\n", + "# 2)\n", + "df.where(col(\"type\").isin(\"SNV\", \"SNP\")).select(\"reference\", \"alternate\").groupBy(\"reference\", \"alternate\").count().sort(\"count\", ascending=False).show()\n", + "# 3)\n", + "df.where(df.type.isin(\"SNV\", \"SNP\")).select(df.reference, df.alternate).groupBy(df.reference, df.alternate).count().sort(\"count\", ascending=False).show()\n", + "# 4)\n", + "spark.sql(\"SELECT reference, alternate, count(*) FROM chr22 WHERE type = 'SNV' GROUP BY reference,alternate ORDER BY count(*) DESC\").show()" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+---------------+-----------+--------------+--------------+-------------+-------------+--------------------+------------------------------------+------------------+--------------------+----+----+---------+-----------+\n", + "|id |alleleCount|refAlleleCount|altAlleleCount|refAlleleFreq|altAlleleFreq|genotypeCount |genotypeFreq |missingAlleleCount|missingGenotypeCount|maf |mgf |mafAllele|mgfGenotype|\n", + "+---------------+-----------+--------------+--------------+-------------+-------------+--------------------+------------------------------------+------------------+--------------------+----+----+---------+-----------+\n", + "|22:16054454:C:T|-1 |0 |0 |-1.0 |-1.0 |[./. -> 2] |[] |3 |1 |-1.0|-1.0|null |null |\n", + "|22:16065809:T:C|2 |1 |1 |0.5 |0.5 |[0/1 -> 1, ./. -> 1]|[0/0 -> 0.0, 0/1 -> 0.0, 1/1 -> 0.0]|1 |0 |0.5 |0.0 |T |1/1 |\n", + "|22:16077310:T:A|2 |1 |1 |0.5 |0.5 |[0/1 -> 1, ./. -> 1]|[0/0 -> 0.0, 0/1 -> 0.0, 1/1 -> 0.0]|1 |0 |0.5 |0.0 |T |1/1 |\n", + "|22:16080499:A:G|2 |1 |1 |0.5 |0.5 |[0/1 -> 1, ./. -> 1]|[0/0 -> 0.0, 0/1 -> 0.0, 1/1 -> 0.0]|1 |0 |0.5 |0.0 |A |1/1 |\n", + "|22:16084621:T:C|2 |1 |1 |0.5 |0.5 |[0/1 -> 1, ./. 
-> 1]|[0/0 -> 0.0, 0/1 -> 0.0, 1/1 -> 0.0]|1 |0 |0.5 |0.0 |T |1/1 |\n", + "|22:16091610:G:T|-1 |0 |0 |-1.0 |-1.0 |[./. -> 2] |[] |3 |1 |-1.0|-1.0|null |null |\n", + "|22:16096040:G:A|-1 |0 |0 |-1.0 |-1.0 |[./. -> 2] |[] |3 |1 |-1.0|-1.0|null |null |\n", + "|22:16099957:C:T|4 |2 |2 |0.5 |0.5 |[0/1 -> 2] |[0/0 -> 0.0, 0/1 -> 0.0, 1/1 -> 0.0]|-1 |-1 |0.5 |0.0 |C |1/1 |\n", + "|22:16100462:A:G|4 |2 |2 |0.5 |0.5 |[0/1 -> 2] |[0/0 -> 0.0, 0/1 -> 0.0, 1/1 -> 0.0]|-1 |-1 |0.5 |0.0 |A |1/1 |\n", + "|22:16105660:G:A|2 |1 |1 |0.5 |0.5 |[0/1 -> 1, ./. -> 1]|[0/0 -> 0.0, 0/1 -> 0.0, 1/1 -> 0.0]|1 |0 |0.5 |0.0 |G |1/1 |\n", + "+---------------+-----------+--------------+--------------+-------------+-------------+--------------------+------------------------------------+------------------+--------------------+----+----+---------+-----------+\n", + "only showing top 10 rows\n", + "\n" + ] + } + ], + "source": [ + "oskar.stats(df, cohort=\"AFR\", samples=['NA12877', 'NA12878']).selectExpr(\"id\", \"studies[0].stats['AFR'] as stats\").selectExpr(\"id\", \"stats.*\").show(10,False)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+---------------+----------+--------+--------+\n", + "| id|chromosome| start| end|\n", + "+---------------+----------+--------+--------+\n", + "|22:17001352:C:G| 22|17001352|17001352|\n", + "|22:17002352:C:A| 22|17002352|17002352|\n", + "|22:17004097:G:A| 22|17004097|17004097|\n", + "|22:17011943:G:C| 22|17011943|17011943|\n", + "|22:17012760:G:A| 22|17012760|17012760|\n", + "+---------------+----------+--------+--------+\n", + "only showing top 5 rows\n", + "\n", + "+---------------+----------+--------+--------+\n", + "| id|chromosome| start| end|\n", + "+---------------+----------+--------+--------+\n", + "|22:17001352:C:G| 22|17001352|17001352|\n", + "|22:17002352:C:A| 22|17002352|17002352|\n", + "|22:17004097:G:A| 22|17004097|17004097|\n", + 
"|22:17011943:G:C| 22|17011943|17011943|\n", + "|22:17012760:G:A| 22|17012760|17012760|\n", + "+---------------+----------+--------+--------+\n", + "only showing top 5 rows\n", + "\n", + "+---------------+----------+--------+--------+\n", + "| id|chromosome| start| end|\n", + "+---------------+----------+--------+--------+\n", + "|22:17001352:C:G| 22|17001352|17001352|\n", + "|22:17002352:C:A| 22|17002352|17002352|\n", + "|22:17004097:G:A| 22|17004097|17004097|\n", + "|22:17011943:G:C| 22|17011943|17011943|\n", + "|22:17012760:G:A| 22|17012760|17012760|\n", + "+---------------+----------+--------+--------+\n", + "only showing top 5 rows\n", + "\n", + "+---------------+----------+--------+--------+\n", + "| id|chromosome| start| end|\n", + "+---------------+----------+--------+--------+\n", + "|22:17001352:C:G| 22|17001352|17001352|\n", + "|22:17002352:C:A| 22|17002352|17002352|\n", + "|22:17004097:G:A| 22|17004097|17004097|\n", + "|22:17011943:G:C| 22|17011943|17011943|\n", + "|22:17012760:G:A| 22|17012760|17012760|\n", + "+---------------+----------+--------+--------+\n", + "only showing top 5 rows\n", + "\n", + "+---------------+----------+--------+--------+\n", + "| id|chromosome| start| end|\n", + "+---------------+----------+--------+--------+\n", + "|22:17001352:C:G| 22|17001352|17001352|\n", + "|22:17002352:C:A| 22|17002352|17002352|\n", + "|22:17004097:G:A| 22|17004097|17004097|\n", + "|22:17011943:G:C| 22|17011943|17011943|\n", + "|22:17012760:G:A| 22|17012760|17012760|\n", + "+---------------+----------+--------+--------+\n", + "only showing top 5 rows\n", + "\n" + ] + } + ], + "source": [ + "df.select(\"id\", \"chromosome\", \"start\", \"end\").filter(df.chromosome == 22).filter(df.start > 17000000).filter(df.end < 17500000).show(5)\n", + "df.select(\"id\", \"chromosome\", \"start\", \"end\").filter(col(\"chromosome\") == 22).filter(col(\"start\") > 17000000).filter(col(\"end\") < 17500000).show(5)\n", + "df.select(\"id\", \"chromosome\", \"start\", 
\"end\").filter((col(\"chromosome\") == 22) & (col(\"start\") > 17000000) & (col(\"end\") < 17500000)).show(5)\n", + "df.select(\"id\", \"chromosome\", \"start\", \"end\").filter(expr(\"chromosome =='22' AND start > 17000000 AND end < 17500000\")).show(5)\n", + "df.select(\"id\", \"chromosome\", col(\"start\").alias(\"start\"), \"end\").filter(col(\"start\") > 17000000).show(5)\n", + "# spark.sql(\"SELECT id,chromosome,start,end FROM platinum WHERE chromosome =='22' AND start > 17000000 AND end < 17500000\").show(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+---------------+--------------------+\n", + "| id| gene|\n", + "+---------------+--------------------+\n", + "|22:16096040:G:A| [NBEAP3]|\n", + "|22:16099957:C:T| [NBEAP3]|\n", + "|22:16100462:A:G| [NBEAP3]|\n", + "|22:16105660:G:A| [NBEAP3]|\n", + "|22:16112391:G:A| [NBEAP3]|\n", + "|22:16114913:A:T| [NBEAP3]|\n", + "|22:16127471:A:-|[LA16c-60H5.7, NB...|\n", + "+---------------+--------------------+\n", + "\n", + "+---------------+--------------------+\n", + "| id| gene|\n", + "+---------------+--------------------+\n", + "|22:16096040:G:A| [NBEAP3]|\n", + "|22:16099957:C:T| [NBEAP3]|\n", + "|22:16100462:A:G| [NBEAP3]|\n", + "|22:16105660:G:A| [NBEAP3]|\n", + "|22:16112391:G:A| [NBEAP3]|\n", + "|22:16114913:A:T| [NBEAP3]|\n", + "|22:16127471:A:-|[LA16c-60H5.7, NB...|\n", + "+---------------+--------------------+\n", + "\n" + ] + } + ], + "source": [ + "df.select(df.id, genes(\"annotation\").alias(\"gene\")).filter(array_contains(\"gene\", \"NBEAP3\")).show()\n", + "df.selectExpr(\"id\", \"genes(annotation) AS gene\").filter(array_contains(\"gene\", \"NBEAP3\")).show()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-------+-----+\n", + "|variant|count|\n", + "+-------+-----+\n", + 
"| C -> T| 142|\n", + "| G -> A| 138|\n", + "| A -> G| 120|\n", + "| T -> C| 111|\n", + "| C -> A| 63|\n", + "| A -> C| 53|\n", + "| G -> T| 53|\n", + "| T -> A| 52|\n", + "| T -> G| 48|\n", + "| A -> T| 47|\n", + "| C -> G| 34|\n", + "| G -> C| 33|\n", + "+-------+-----+\n", + "\n" + ] + } + ], + "source": [ + "pandas = df.where(\"type = 'SNV'\") \\\n", + " .select(\"reference\", \"alternate\") \\\n", + " .groupBy(\"reference\", \"alternate\") \\\n", + " .count() \\\n", + " .sort(\"count\", ascending=False) \\\n", + " .select(concat(col(\"reference\"), lit(\" -> \"), col(\"alternate\")).alias(\"variant\"), col(\"count\")).show()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/oskar-spark/src/main/python/notebooks/notebooks/plotly trials.ipynb b/oskar-spark/src/main/python/notebooks/notebooks/plotly trials.ipynb new file mode 100644 index 0000000..f3519b9 --- /dev/null +++ b/oskar-spark/src/main/python/notebooks/notebooks/plotly trials.ipynb @@ -0,0 +1,997 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from pyoskar.core import Oskar\n", + "from pyoskar.sql import *\n", + "from pyoskar.analysis import *\n", + "from pyspark.sql.functions import col, udf, count, explode, concat, when, expr\n", + "from pyspark.sql.functions import *\n", + "\n", + "oskar = Oskar(spark)\n", + "df = oskar.load(\"/home/roldanx/appl/oskar/oskar-spark/src/test/resources/platinum_chr22.small.parquet\")\n", + "df.createOrReplaceTempView(\"platinum\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Variant 
histogram" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " start count\n", + "0 16000000 101\n", + "1 17000000 100\n", + "2 18000000 85\n", + "3 19000000 53\n", + "4 20000000 46\n", + "5 21000000 85\n", + "6 22000000 52\n", + "7 23000000 49\n", + "8 24000000 44\n", + "9 25000000 62\n", + "10 26000000 42\n", + "11 27000000 54\n", + "12 28000000 21\n", + "13 29000000 29\n", + "14 30000000 38\n", + "15 31000000 46\n", + "16 32000000 44\n", + "17 33000000 34\n", + "18 34000000 15\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "\n", + "pandasDF = oskar.histogram(df,\"start\",1000000).toPandas()\n", + "print(pandasDF)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[Text(0, 0.5, 'Counts'), Text(0.5, 0, 'Region')]" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "import pandas as pd\n", + "\n", + "pandasDF = oskar.histogram(df,\"start\",1000000).toPandas()\n", + "histogram = pandasDF.plot(x = \"start\", y = \"count\", kind = \"bar\", figsize=(24,4))\n", + "histogram.set(xlabel=\"Region\", ylabel=\"Counts\")" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "" + ], + "text/vnd.plotly.v1+html": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.plotly.v1+json": { + "config": { + "linkText": "Export to plot.ly", + "plotlyServerURL": "https://plot.ly", + "showLink": true + }, + "data": [ + { + "type": "bar", + "uid": "75513143-1b57-4b64-9da2-b4cd1339e703", + "x": [ + 16000000, + 17000000, + 18000000, + 19000000, + 20000000, + 21000000, + 22000000, + 23000000, + 24000000, + 25000000, + 26000000, + 27000000, + 28000000, + 29000000, + 30000000, + 31000000, + 32000000, + 33000000, + 34000000 + ], + "y": [ + 101, + 100, + 85, + 53, + 46, + 85, + 52, + 49, + 44, + 62, + 42, + 54, + 21, + 29, + 38, + 46, + 44, + 34, + 15 + ] + } + ], + "layout": { + "autosize": true, + "xaxis": { + "autorange": true, + "range": [ + 15500000, + 34500000 + ], + "type": "linear" + }, + "yaxis": { + "autorange": true, + "range": [ + 0, + 106.3157894736842 + ], + "type": "linear" + } + } + }, + "image/png": "", + "text/html": [ + "
" + ], + "text/vnd.plotly.v1+html": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import scipy as sp\n", + "import plotly.offline as py\n", + "import plotly.figure_factory as ff\n", + "import plotly.graph_objs as go\n", + "from plotly.offline import get_plotlyjs, init_notebook_mode\n", + "\n", + "pandasDF = oskar.histogram(df,\"start\",1000000).toPandas()\n", + "\n", + "get_plotlyjs()\n", + "init_notebook_mode(connected=True)\n", + "\n", + "data = [go.Bar(x=pandasDF[\"start\"], y=pandasDF[\"count\"])]\n", + "py.iplot(data)" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "" + ], + "text/vnd.plotly.v1+html": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.plotly.v1+json": { + "config": { + "linkText": "Export to plot.ly", + "plotlyServerURL": "https://plot.ly", + "showLink": true + }, + "data": [ + { + "hoverinfo": "text", + "marker": { + "color": "rgb(61,153,112)" + }, + "mode": "lines", + "type": "scatter", + "uid": "33df76e2-a40b-46f6-85b6-eaa89342b8f1", + "x": [ + 15, + 15, + 25, + 25 + ], + "xaxis": "x", + "y": [ + 0, + 14, + 14, + 0 + ], + "yaxis": "y" + }, + { + "hoverinfo": "text", + "marker": { + "color": "rgb(61,153,112)" + }, + "mode": "lines", + "type": "scatter", + "uid": "f0e4718c-8a94-43a3-b7b2-f2aead5e43cc", + "x": [ + 45, + 45, + 55, + 55 + ], + "xaxis": "x", + "y": [ + 0, + 7.937253933193772, + 7.937253933193772, + 0 + ], + "yaxis": "y" + }, + { + "hoverinfo": "text", + "marker": { + "color": "rgb(61,153,112)" + }, + "mode": "lines", + "type": "scatter", + "uid": "964ac19f-f803-4502-9e62-94537c374184", + "x": [ + 35, + 35, + 50, + 50 + ], + "xaxis": "x", + "y": [ + 0, + 22.538855339169288, + 22.538855339169288, + 7.937253933193772 + ], + "yaxis": "y" + }, + { + "hoverinfo": "text", + "marker": { + "color": "rgb(61,153,112)" + }, + "mode": "lines", + 
"type": "scatter", + "uid": "612b220c-63a5-49ce-9902-97cd98ee444c", + "x": [ + 20, + 20, + 42.5, + 42.5 + ], + "xaxis": "x", + "y": [ + 14, + 40.22437072223753, + 40.22437072223753, + 22.538855339169288 + ], + "yaxis": "y" + }, + { + "hoverinfo": "text", + "marker": { + "color": "rgb(61,153,112)" + }, + "mode": "lines", + "type": "scatter", + "uid": "99d81210-eac1-4fb5-9d1c-75019cd9ba9d", + "x": [ + 5, + 5, + 31.25, + 31.25 + ], + "xaxis": "x", + "y": [ + 0, + 49.01020301937138, + 49.01020301937138, + 40.22437072223753 + ], + "yaxis": "y" + }, + { + "hoverinfo": "text", + "marker": { + "color": "rgb(61,153,112)" + }, + "mode": "lines", + "type": "scatter", + "uid": "1eb57830-1275-4d76-81ee-a26134239240", + "x": [ + 85, + 85, + 95, + 95 + ], + "xaxis": "x", + "y": [ + 0, + 1, + 1, + 0 + ], + "yaxis": "y" + }, + { + "hoverinfo": "text", + "marker": { + "color": "rgb(61,153,112)" + }, + "mode": "lines", + "type": "scatter", + "uid": "f5f976b6-0ca1-4397-8133-9545231b441c", + "x": [ + 75, + 75, + 90, + 90 + ], + "xaxis": "x", + "y": [ + 0, + 21.93171219946131, + 21.93171219946131, + 1 + ], + "yaxis": "y" + }, + { + "hoverinfo": "text", + "marker": { + "color": "rgb(61,153,112)" + }, + "mode": "lines", + "type": "scatter", + "uid": "05b2e115-a852-4a79-adc2-9e68134c67c2", + "x": [ + 65, + 65, + 82.5, + 82.5 + ], + "xaxis": "x", + "y": [ + 0, + 55.00909015790027, + 55.00909015790027, + 21.93171219946131 + ], + "yaxis": "y" + }, + { + "hoverinfo": "text", + "marker": { + "color": "rgb(61,153,112)" + }, + "mode": "lines", + "type": "scatter", + "uid": "af396f60-958e-4763-91c7-b1417617e588", + "x": [ + 18.125, + 18.125, + 73.75, + 73.75 + ], + "xaxis": "x", + "y": [ + 49.01020301937138, + 89.05616205518852, + 89.05616205518852, + 55.00909015790027 + ], + "yaxis": "y" + }, + { + "hoverinfo": "text", + "marker": { + "color": "rgb(255,65,54)" + }, + "mode": "lines", + "type": "scatter", + "uid": "ac6eb5b2-1794-4981-a763-36535141c8cd", + "x": [ + 105, + 105, + 115, + 115 + ], + 
"xaxis": "x", + "y": [ + 0, + 22, + 22, + 0 + ], + "yaxis": "y" + }, + { + "hoverinfo": "text", + "marker": { + "color": "rgb(255,65,54)" + }, + "mode": "lines", + "type": "scatter", + "uid": "72fa80a4-dca7-4223-b98f-948230261967", + "x": [ + 125, + 125, + 135, + 135 + ], + "xaxis": "x", + "y": [ + 0, + 8, + 8, + 0 + ], + "yaxis": "y" + }, + { + "hoverinfo": "text", + "marker": { + "color": "rgb(255,65,54)" + }, + "mode": "lines", + "type": "scatter", + "uid": "97a53470-398c-4472-a1be-dc9fd9a272ec", + "x": [ + 145, + 145, + 155, + 155 + ], + "xaxis": "x", + "y": [ + 0, + 20.8806130178211, + 20.8806130178211, + 0 + ], + "yaxis": "y" + }, + { + "hoverinfo": "text", + "marker": { + "color": "rgb(255,65,54)" + }, + "mode": "lines", + "type": "scatter", + "uid": "7502d126-c5b2-4e46-9dc4-fa0a20bff326", + "x": [ + 130, + 130, + 150, + 150 + ], + "xaxis": "x", + "y": [ + 8, + 36.742346141747674, + 36.742346141747674, + 20.8806130178211 + ], + "yaxis": "y" + }, + { + "hoverinfo": "text", + "marker": { + "color": "rgb(255,65,54)" + }, + "mode": "lines", + "type": "scatter", + "uid": "d1f3cbe0-0915-46f6-81e7-71cd2aa4b1ae", + "x": [ + 110, + 110, + 140, + 140 + ], + "xaxis": "x", + "y": [ + 22, + 51.0098029794274, + 51.0098029794274, + 36.742346141747674 + ], + "yaxis": "y" + }, + { + "hoverinfo": "text", + "marker": { + "color": "rgb(255,65,54)" + }, + "mode": "lines", + "type": "scatter", + "uid": "8942934b-1325-4855-b6df-51c5ab3a5605", + "x": [ + 175, + 175, + 185, + 185 + ], + "xaxis": "x", + "y": [ + 0, + 28.844410203711913, + 28.844410203711913, + 0 + ], + "yaxis": "y" + }, + { + "hoverinfo": "text", + "marker": { + "color": "rgb(255,65,54)" + }, + "mode": "lines", + "type": "scatter", + "uid": "fd1c7e73-f0d9-4e55-9b05-9143c1585f6a", + "x": [ + 165, + 165, + 180, + 180 + ], + "xaxis": "x", + "y": [ + 0, + 73.72923436466705, + 73.72923436466705, + 28.844410203711913 + ], + "yaxis": "y" + }, + { + "hoverinfo": "text", + "marker": { + "color": "rgb(255,65,54)" + }, + 
"mode": "lines", + "type": "scatter", + "uid": "07311b97-5403-43f4-ae38-42b9da358e23", + "x": [ + 125, + 125, + 172.5, + 172.5 + ], + "xaxis": "x", + "y": [ + 51.0098029794274, + 128.7555824032496, + 128.7555824032496, + 73.72923436466705 + ], + "yaxis": "y" + }, + { + "hoverinfo": "text", + "marker": { + "color": "rgb(0,116,217)" + }, + "mode": "lines", + "type": "scatter", + "uid": "29f1e126-b9c0-4758-a313-c8830bb76b8d", + "x": [ + 45.9375, + 45.9375, + 148.75, + 148.75 + ], + "xaxis": "x", + "y": [ + 89.05616205518852, + 200.37215375395854, + 200.37215375395854, + 128.7555824032496 + ], + "yaxis": "y" + } + ], + "layout": { + "autosize": false, + "height": 400, + "hovermode": "closest", + "showlegend": false, + "width": 1440, + "xaxis": { + "autorange": true, + "mirror": "allticks", + "range": [ + 0, + 185 + ], + "rangemode": "tozero", + "showgrid": false, + "showline": true, + "showticklabels": true, + "tickmode": "array", + "ticks": "outside", + "ticktext": [ + "9", + "3", + "4", + "11", + "6", + "7", + "5", + "2", + "0", + "1", + "14", + "17", + "15", + "16", + "8", + "10", + "18", + "12", + "13" + ], + "tickvals": [ + 5, + 15, + 25, + 35, + 45, + 55, + 65, + 75, + 85, + 95, + 105, + 115, + 125, + 135, + 145, + 155, + 165, + 175, + 185 + ], + "type": "linear", + "zeroline": false + }, + "yaxis": { + "autorange": true, + "mirror": "allticks", + "range": [ + 0, + 210.91805658311426 + ], + "rangemode": "tozero", + "showgrid": false, + "showline": true, + "showticklabels": true, + "ticks": "outside", + "type": "linear", + "zeroline": false + } + } + }, + "image/png": "", + "text/html": [ + "
" + ], + "text/vnd.plotly.v1+html": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import plotly.offline as py\n", + "import plotly.figure_factory as ff\n", + "from plotly.offline import get_plotlyjs, init_notebook_mode\n", + "\n", + "import numpy as np\n", + "\n", + "init_notebook_mode(connected=True)\n", + "\n", + "pandasDF = oskar.histogram(df,\"start\",1000000).toPandas()\n", + "# print(pandasDF)\n", + "dim = len(pandasDF[\"count\"])\n", + "X = np.zeros((dim, dim))\n", + "for i in range(dim):\n", + " for j in range(i,dim):\n", + " X[j, i] = pandasDF[\"count\"][i] - pandasDF[\"count\"][j]\n", + "# print(X)\n", + "# labels = [pandasDF[\"start\"]]\n", + "fig = ff.create_dendrogram(X)\n", + "fig['layout'].update({'width':1440, 'height':400})\n", + "py.iplot(fig, filename='dendrogram_with_labels')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Con extensión renderizando con Plotly Python" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.plotly.v1+json": { + "config": { + "linkText": "Export to plot.ly", + "plotlyServerURL": "https://plot.ly", + "showLink": true + }, + "data": [ + { + "type": "heatmap", + "uid": "f1070ce3-43dc-472c-8e1f-9b6b4431f77b", + "z": [ + [ + 1, + 20, + 30 + ], + [ + 20, + 1, + 60 + ], + [ + 30, + 60, + 1 + ] + ], + "zauto": true, + "zmax": 60, + "zmin": 1 + } + ], + "layout": { + "autosize": true, + "xaxis": { + "autorange": true, + "range": [ + -0.5, + 2.5 + ] + }, + "yaxis": { + "autorange": true, + "range": [ + -0.5, + 2.5 + ] + } + } + }, + "image/png": "", + "text/html": [ + "
" + ], + "text/vnd.plotly.v1+html": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import plotly\n", + "from plotly.offline import iplot\n", + "\n", + "trace = plotly.graph_objs.Heatmap(z=[[1, 20, 30],\n", + " [20, 1, 60],\n", + " [30, 60, 1]])\n", + "\n", + "fig = dict(data=[trace])\n", + "\n", + "iplot(fig)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Con extensión renderizando con Plotly JSON" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.plotly.v1+json": { + "data": [ + { + "type": "scatter", + "x": [ + 1999, + 2000, + 2001, + 2002 + ], + "y": [ + 10, + 15, + 13, + 17 + ] + }, + { + "type": "scatter", + "x": [ + 1999, + 2000, + 2001, + 2002 + ], + "y": [ + 16, + 5, + 11, + 9 + ] + } + ], + "layout": { + "autosize": true, + "title": "Sales Growth", + "xaxis": { + "autorange": true, + "range": [ + 1998.820445406743, + 2002.179554593257 + ], + "showgrid": false, + "title": "Year", + "type": "linear", + "zeroline": false + }, + "yaxis": { + "autorange": true, + "range": [ + 4.009708737864078, + 17.990291262135923 + ], + "showline": false, + "title": "Percent", + "type": "linear" + } + } + }, + "image/png": "" + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from IPython.display import display\n", + "\n", + "def Plotly(data=[], layout={}):\n", + " bundle = {}\n", + " bundle['application/vnd.plotly.v1+json'] = {\n", + " 'data': data,\n", + " 'layout': layout,\n", + " }\n", + " display(bundle, raw=True)\n", + "\n", + "data = [\n", + " {'x': [1999, 2000, 2001, 2002], 'y': [10, 15, 13, 17], 'type': 'scatter'},\n", + " {'x': [1999, 2000, 2001, 2002], 'y': [16, 5, 11, 9], 'type': 'scatter'}\n", + "]\n", + "\n", + "layout = {\n", + " 'title': 'Sales Growth',\n", + " 'xaxis': {'title': 'Year', 'showgrid': False, 'zeroline': False},\n", + " 'yaxis': {'title': 'Percent', 'showline': False}\n", + "}\n", + "\n", + "Plotly(data, 
layout)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/oskar-spark/src/main/python/notebooks/__init__.py b/oskar-spark/src/main/python/notebooks/notebooks/pyoskar/__init__.py similarity index 100% rename from oskar-spark/src/main/python/notebooks/__init__.py rename to oskar-spark/src/main/python/notebooks/notebooks/pyoskar/__init__.py diff --git a/oskar-spark/src/main/python/notebooks/notebooks/pyoskar/analysis.py b/oskar-spark/src/main/python/notebooks/notebooks/pyoskar/analysis.py new file mode 100644 index 0000000..8619591 --- /dev/null +++ b/oskar-spark/src/main/python/notebooks/notebooks/pyoskar/analysis.py @@ -0,0 +1,469 @@ + +import sys +from pyspark import keyword_only +from pyspark.ml.param.shared import * +from pyspark.ml.util import JavaMLReadable, JavaMLWritable +from pyspark.ml.wrapper import JavaTransformer + +if sys.version > '3': + basestring = str + +DEFAULT_COHORT = "ALL" + + +class AbstractTransformer(JavaTransformer, JavaMLReadable, JavaMLWritable): + + def setParams(self, **kwargs): + filtered = {k: v for k, v in kwargs.items() if v is not None} + return self._set(**filtered) + + +class VariantStatsTransformer(AbstractTransformer): + cohort = Param(Params._dummy(), "cohort", "Name of the cohort to calculate stats from. By default, " + DEFAULT_COHORT, + typeConverter=TypeConverters.toString) + samples = Param(Params._dummy(), "samples", "Samples belonging to the cohort. If empty, will try to read from metadata. 
" + + "If missing, will use all samples from the dataset.", typeConverter=TypeConverters.toListString) + studyId = Param(Params._dummy(), "studyId", "Id of the study to calculate the stats from.", typeConverter=TypeConverters.toString) + missingAsReference = Param(Params._dummy(), "missingAsReference", "Count missing alleles as reference alleles.", + typeConverter=TypeConverters.toBoolean) + + @keyword_only + def __init__(self, cohort=None, samples=None, studyId=None, missingAsReference=None): + super(VariantStatsTransformer, self).__init__() + self._java_obj = self._new_java_obj("org.opencb.oskar.spark.variant.analysis.VariantStatsTransformer", self.uid) + self.setParams(**self._input_kwargs) + + def getCohort(self): + return self.getOrDefault(self.cohort) + + def setCohort(self, value): + return self._set(cohort=value) + + def getSamples(self): + return self.getOrDefault(self.samples) + + def setSamples(self, value): + return self._set(samples=value) + + def getStudyId(self): + return self.getOrDefault(self.studyId) + + def setStudyId(self, value): + return self._set(studyId=value) + + def getMissingAsReference(self): + return self.getOrDefault(self.missingAsReference) + + def setMissingAsReference(self, value): + return self._set(missingAsReference=value) + + def transformDataframe(self, df): + return self._call_java(df) + + +class VariantSetStatsTransformer(AbstractTransformer): + studyId = Param(Params._dummy(), "studyId", "", typeConverter=TypeConverters.toString) + fileId = Param(Params._dummy(), "fileId", "", typeConverter=TypeConverters.toString) + + @keyword_only + def __init__(self, studyId=None, fileId=None): + super(VariantSetStatsTransformer, self).__init__() + self._java_obj = self._new_java_obj("org.opencb.oskar.spark.variant.analysis.VariantSetStatsTransformer", self.uid) + self.setParams(**self._input_kwargs) + + def getStudyId(self): + return self.getOrDefault(self.studyId) + + def setStudyId(self, value): + return self._set(studyId=value) + + 
def getFileId(self): + return self.getOrDefault(self.fileId) + + def setFileId(self, value): + return self._set(fileId=value) + + +class HistogramTransformer(AbstractTransformer): + step = Param(Params._dummy(), "step", "", typeConverter=TypeConverters.toFloat) + inputCol = Param(Params._dummy(), "inputCol", "", typeConverter=TypeConverters.toString) + + @keyword_only + def __init__(self, step=None, inputCol=None): + super(HistogramTransformer, self).__init__() + self._java_obj = self._new_java_obj("org.opencb.oskar.spark.variant.analysis.HistogramTransformer", self.uid) + self.setParams(**self._input_kwargs) + + def getStep(self): + return self.getOrDefault(self.step) + + def setStep(self, value): + return self._set(step=value) + + def getInputCol(self): + return self.getOrDefault(self.inputCol) + + def setInputCol(self, value): + return self._set(inputCol=value) + + +class HardyWeinbergTransformer(AbstractTransformer): + studyId = Param(Params._dummy(), "studyId", "", typeConverter=TypeConverters.toString) + + @keyword_only + def __init__(self, studyId=None): + super(HardyWeinbergTransformer, self).__init__() + self._java_obj = self._new_java_obj("org.opencb.oskar.spark.variant.analysis.HardyWeinbergTransformer", self.uid) + self.setParams(**self._input_kwargs) + + def getStudyId(self): + return self.getOrDefault(self.studyId) + + def setStudyId(self, value): + return self._set(studyId=value) + + +class IBSTransformer(AbstractTransformer): + samples = Param(Params._dummy(), "samples", "List of samples to use for calculating the IBS", + typeConverter=TypeConverters.toListString) + skipMultiAllelic = Param(Params._dummy(), "skipMultiAllelic", "Skip variants where any of the samples has a secondary alternate", + typeConverter=TypeConverters.toBoolean) + skipReference = Param(Params._dummy(), "skipReference", "Skip variants where both samples of the pair are HOM_REF", + typeConverter=TypeConverters.toBoolean) + numPairs = Param(Params._dummy(), "numPairs", "", 
typeConverter=TypeConverters.toInt) + + @keyword_only + def __init__(self, samples=None, skipMultiAllelic=None, skipReference=None, numPairs=None): + super(IBSTransformer, self).__init__() + self._java_obj = self._new_java_obj("org.opencb.oskar.spark.variant.analysis.IBSTransformer", self.uid) + self.setParams(**self._input_kwargs) + + def getSamples(self): + return self.getOrDefault(self.samples) + + def setSamples(self, value): + return self._set(samples=value) + + def getSkipMultiAllelic(self): + return self.getOrDefault(self.skipMultiAllelic) + + def setSkipMultiAllelic(self, value): + return self._set(skipMultiAllelic=value) + + def getSkipReference(self): + return self.getOrDefault(self.skipReference) + + def setSkipReference(self, value): + return self._set(skipReference=value) + + def getNumPairs(self): + return self.getOrDefault(self.numPairs) + + def setNumPairs(self, value): + return self._set(numPairs=value) + + +class ChiSquareTransformer(AbstractTransformer): + studyId = Param(Params._dummy(), "studyId", "", typeConverter=TypeConverters.toString) + phenotype = Param(Params._dummy(), "phenotype", "", typeConverter=TypeConverters.toString) + + @keyword_only + def __init__(self, studyId=None, phenotype=None): + super(ChiSquareTransformer, self).__init__() + self._java_obj = self._new_java_obj("org.opencb.oskar.spark.variant.analysis.ChiSquareTransformer", self.uid) + self.setParams(**self._input_kwargs) + + def getStudyId(self): + return self.getOrDefault(self.studyId) + + def setStudyId(self, value): + return self._set(studyId=value) + + def getPhenotype(self): + return self.getOrDefault(self.phenotype) + + def setPhenotype(self, value): + return self._set(phenotype=value) + + +class CompoundHeterozigoteTransformer(AbstractTransformer): + father = Param(Params._dummy(), "father", "", typeConverter=TypeConverters.toString) + mother = Param(Params._dummy(), "mother", "", typeConverter=TypeConverters.toString) + child = Param(Params._dummy(), "child", "", 
typeConverter=TypeConverters.toString) + studyId = Param(Params._dummy(), "studyId", "", typeConverter=TypeConverters.toString) + missingGenotypeAsReference = Param(Params._dummy(), "missingGenotypeAsReference", "", typeConverter=TypeConverters.toBoolean) + + @keyword_only + def __init__(self, father=None, mother=None, child=None, studyId=None, missingGenotypeAsReference=None): + super(CompoundHeterozigoteTransformer, self).__init__() + self._java_obj = self._new_java_obj("org.opencb.oskar.spark.variant.analysis.CompoundHeterozigoteTransformer", self.uid) + self.setParams(**self._input_kwargs) + + def getFather(self): + return self.getOrDefault(self.father) + + def setFather(self, value): + return self._set(father=value) + + def getMother(self): + return self.getOrDefault(self.mother) + + def setMother(self, value): + return self._set(mother=value) + + def getChild(self): + return self.getOrDefault(self.child) + + def setChild(self, value): + return self._set(child=value) + + def getStudyId(self): + return self.getOrDefault(self.studyId) + + def setStudyId(self, value): + return self._set(studyId=value) + + def getMissingGenotypeAsReference(self): + return self.getOrDefault(self.missingGenotypeAsReference) + + def setMissingGenotypeAsReference(self, value): + return self._set(missingGenotypeAsReference=value) + + +class FacetTransformer(AbstractTransformer): + facet = Param(Params._dummy(), "facet", "", typeConverter=TypeConverters.toString) + + @keyword_only + def __init__(self, facet=None): + super(FacetTransformer, self).__init__() + self._java_obj = self._new_java_obj("org.opencb.oskar.spark.variant.analysis.FacetTransformer", self.uid) + self.setParams(**self._input_kwargs) + + def getFacet(self): + return self.getOrDefault(self.facet) + + def setFacet(self, value): + return self._set(facet=value) + + +class FisherTransformer(AbstractTransformer): + studyId = Param(Params._dummy(), "studyId", "", typeConverter=TypeConverters.toString) + phenotype = 
Param(Params._dummy(), "phenotype", "", typeConverter=TypeConverters.toString) + + @keyword_only + def __init__(self, studyId=None, phenotype=None): + super(FisherTransformer, self).__init__() + self._java_obj = self._new_java_obj("org.opencb.oskar.spark.variant.analysis.FisherTransformer", self.uid) + self.setParams(**self._input_kwargs) + + def getStudyId(self): + return self.getOrDefault(self.studyId) + + def setStudyId(self, value): + return self._set(studyId=value) + + def getPhenotype(self): + return self.getOrDefault(self.phenotype) + + def setPhenotype(self, value): + return self._set(phenotype=value) + + +class ImputeSexTransformer(AbstractTransformer): + lowerThreshold = Param(Params._dummy(), "lowerThreshold", "", typeConverter=TypeConverters.toFloat) + upperThreshold = Param(Params._dummy(), "upperThreshold", "", typeConverter=TypeConverters.toFloat) + chromosomeX = Param(Params._dummy(), "chromosomeX", "", typeConverter=TypeConverters.toString) + includePseudoautosomalRegions = Param(Params._dummy(), "includePseudoautosomalRegions", "", typeConverter=TypeConverters.toBoolean) + par1chrX = Param(Params._dummy(), "par1chrX", "", typeConverter=TypeConverters.toString) + par2chrX = Param(Params._dummy(), "par2chrX", "", typeConverter=TypeConverters.toString) + + @keyword_only + def __init__(self, lowerThreshold=None, upperThreshold=None, chromosomeX=None, includePseudoautosomalRegions=None, par1chrX=None, + par2chrX=None): + super(ImputeSexTransformer, self).__init__() + self._java_obj = self._new_java_obj("org.opencb.oskar.spark.variant.analysis.ImputeSexTransformer", self.uid) + self.setParams(**self._input_kwargs) + + def getLowerThreshold(self): + return self.getOrDefault(self.lowerThreshold) + + def setLowerThreshold(self, value): + return self._set(lowerThreshold=value) + + def getUpperThreshold(self): + return self.getOrDefault(self.upperThreshold) + + def setUpperThreshold(self, value): + return self._set(upperThreshold=value) + + def 
getChromosomeX(self): + return self.getOrDefault(self.chromosomeX) + + def setChromosomeX(self, value): + return self._set(chromosomeX=value) + + def getIncludePseudoautosomalRegions(self): + return self.getOrDefault(self.includePseudoautosomalRegions) + + def setIncludePseudoautosomalRegions(self, value): + return self._set(includePseudoautosomalRegions=value) + + def getPar1chrX(self): + return self.getOrDefault(self.par1chrX) + + def setPar1chrX(self, value): + return self._set(par1chrX=value) + + def getPar2chrX(self): + return self.getOrDefault(self.par2chrX) + + def setPar2chrX(self, value): + return self._set(par2chrX=value) + + +class InbreedingCoefficientTransformer(AbstractTransformer): + missingGenotypesAsHomRef = Param(Params._dummy(), "missingGenotypesAsHomRef", "Treat missing genotypes as HomRef genotypes", + typeConverter=TypeConverters.toBoolean) + includeMultiAllelicGenotypes = Param(Params._dummy(), "includeMultiAllelicGenotypes", "Include multi-allelic variants in the calculation", + typeConverter=TypeConverters.toBoolean) + mafThreshold = Param(Params._dummy(), "mafThreshold", "Include multi-allelic variants in the calculation", + typeConverter=TypeConverters.toFloat) + + @keyword_only + def __init__(self, missingGenotypesAsHomRef=None, includeMultiAllelicGenotypes=None, mafThreshold=None): + super(InbreedingCoefficientTransformer, self).__init__() + self._java_obj = self._new_java_obj("org.opencb.oskar.spark.variant.analysis.InbreedingCoefficientTransformer", self.uid) + self.setParams(**self._input_kwargs) + + def getMissingGenotypesAsHomRef(self): + return self.getOrDefault(self.missingGenotypesAsHomRef) + + def setMissingGenotypesAsHomRef(self, value): + return self._set(missingGenotypesAsHomRef=value) + + def getIncludeMultiAllelicGenotypes(self): + return self.getOrDefault(self.includeMultiAllelicGenotypes) + + def setIncludeMultiAllelicGenotypes(self, value): + return self._set(includeMultiAllelicGenotypes=value) + + def 
getMafThreshold(self): + return self.getOrDefault(self.mafThreshold) + + def setMafThreshold(self, value): + return self._set(mafThreshold=value) + + +class MendelianErrorTransformer(AbstractTransformer): + studyId = Param(Params._dummy(), "studyId", "", typeConverter=TypeConverters.toString) + father = Param(Params._dummy(), "father", "", typeConverter=TypeConverters.toString) + mother = Param(Params._dummy(), "mother", "", typeConverter=TypeConverters.toString) + child = Param(Params._dummy(), "child", "", typeConverter=TypeConverters.toString) + + @keyword_only + def __init__(self, studyId=None, father=None, mother=None, child=None): + super(MendelianErrorTransformer, self).__init__() + self._java_obj = self._new_java_obj("org.opencb.oskar.spark.variant.analysis.MendelianErrorTransformer", self.uid) + self.setParams(**self._input_kwargs) + + def getStudyId(self): + return self.getOrDefault(self.studyId) + + def setStudyId(self, value): + return self._set(studyId=value) + + def getFather(self): + return self.getOrDefault(self.father) + + def setFather(self, value): + return self._set(father=value) + + def getMother(self): + return self.getOrDefault(self.mother) + + def setMother(self, value): + return self._set(mother=value) + + def getChild(self): + return self.getOrDefault(self.child) + + def setChild(self, value): + return self._set(child=value) + + +class ModeOfInheritanceTransformer(AbstractTransformer): + family = Param(Params._dummy(), "family", "Select family to apply the filter", typeConverter=TypeConverters.toString) + modeOfInheritance = Param(Params._dummy(), "modeOfInheritance", "Filter by mode of inheritance from a given family. 
Accepted values: " + + "monoallelic (dominant), biallelic (recessive), xLinkedMonoallelic, xLinkedBiallelic, yLinked", + typeConverter=TypeConverters.toString) + studyId = Param(Params._dummy(), "studyId", "", typeConverter=TypeConverters.toString) + phenotype = Param(Params._dummy(), "phenotype", "", typeConverter=TypeConverters.toString) + incompletePenetrance = Param(Params._dummy(), "incompletePenetrance", "Allow variants with an incomplete penetrance mode of inheritance", + typeConverter=TypeConverters.toBoolean) + missingAsReference = Param(Params._dummy(), "missingAsReference", "Select family to apply the filter", + typeConverter=TypeConverters.toBoolean) + + @keyword_only + def __init__(self, family=None, modeOfInheritance=None, studyId=None, phenotype=None, incompletePenetrance=None, missingAsReference=None): + super(ModeOfInheritanceTransformer, self).__init__() + self._java_obj = self._new_java_obj("org.opencb.oskar.spark.variant.analysis.ModeOfInheritanceTransformer", self.uid) + self.setParams(**self._input_kwargs) + + def getFamily(self): + return self.getOrDefault(self.family) + + def setFamily(self, value): + return self._set(family=value) + + def getModeOfInheritance(self): + return self.getOrDefault(self.modeOfInheritance) + + def setModeOfInheritance(self, value): + return self._set(modeOfInheritance=value) + + def getStudyId(self): + return self.getOrDefault(self.studyId) + + def setStudyId(self, value): + return self._set(studyId=value) + + def getPhenotype(self): + return self.getOrDefault(self.phenotype) + + def setPhenotype(self, value): + return self._set(phenotype=value) + + def getIncompletePenetrance(self): + return self.getOrDefault(self.incompletePenetrance) + + def setIncompletePenetrance(self, value): + return self._set(incompletePenetrance=value) + + def getMissingAsReference(self): + return self.getOrDefault(self.missingAsReference) + + def setMissingAsReference(self, value): + return self._set(missingAsReference=value) + + +class 
TdtTransformer(AbstractTransformer): + studyId = Param(Params._dummy(), "studyId", "", typeConverter=TypeConverters.toString) + phenotype = Param(Params._dummy(), "phenotype", "", typeConverter=TypeConverters.toString) + + @keyword_only + def __init__(self, studyId=None, phenotype=None): + super(TdtTransformer, self).__init__() + self._java_obj = self._new_java_obj("org.opencb.oskar.spark.variant.analysis.TdtTransformer", self.uid) + self.setParams(**self._input_kwargs) + + def getStudyId(self): + return self.getOrDefault(self.studyId) + + def setStudyId(self, value): + return self._set(studyId=value) + + def getPhenotype(self): + return self.getOrDefault(self.phenotype) + + def setPhenotype(self, value): + return self._set(phenotype=value) diff --git a/oskar-spark/src/main/python/notebooks/notebooks/pyoskar/core.py b/oskar-spark/src/main/python/notebooks/notebooks/pyoskar/core.py new file mode 100644 index 0000000..09d12d3 --- /dev/null +++ b/oskar-spark/src/main/python/notebooks/notebooks/pyoskar/core.py @@ -0,0 +1,369 @@ +import json + +from pyspark.ml.wrapper import JavaWrapper +from pyspark.sql.dataframe import DataFrame + +from pyoskar.analysis import * + +__all__ = ['Oskar'] + + +class Oskar(JavaWrapper): + + def __init__(self, spark): + super(Oskar, self).__init__() + self._java_obj = self._new_java_obj("org.opencb.oskar.spark.variant.Oskar", spark._jsparkSession) + self.spark = spark + self.metadata = VariantMetadataManager() + self.python_utils = PythonUtils() + + def load(self, file_path): + """ + + :param file_path: + :return: + """ + df = self._call_java("load", file_path) + return df + + def chiSquare(self, df, studyId, phenotype): + """ + + :param df: + :param studyId: + :param phenotype: + :return: + """ + return ChiSquareTransformer(studyId=studyId, phenotype=phenotype).transform(df) + + def compoundHeterozygote(self, df, father, mother, child, studyId=None, missingGenotypeAsReference=None): + """ + + :param df: + :param father: + :param mother: + 
:param child: + :param studyId: + :param missingGenotypeAsReference: + :return: + """ + return CompoundHeterozigoteTransformer(father=father, mother=mother, child=child, studyId=studyId, + missingGenotypeAsReference=missingGenotypeAsReference).transform(df) + + def facet(self, df, facet): + """ + + :param df: + :param facet: + :return: + """ + return FacetTransformer(facet=facet).transform(df) + + def fisher(self, df, studyId, phenotype): + """ + + :param df: + :param studyId: + :param phenotype: + :return: + """ + return FisherTransformer(studyId=studyId, phenotype=phenotype).transform(df) + + def hardyWeinberg(self, df, studyId=None): + """ + + :type df: DataFrame + :param df: Original dataframe + + :type studyId: str + :param studyId: + + :rtype: DataFrame + :return: Transformed dataframe + """ + return HardyWeinbergTransformer(studyId=studyId).transform(df) + + def histogram(self, df, inputCol, step=None): + """ + + :param df: + :param inputCol: + :param step: + :return: + """ + return HistogramTransformer(inputCol=inputCol, step=step).transform(df) + + def ibs(self, df, samples=None, skipMultiAllelic=None, skipReference=None, numPairs=None): + """ + Calculates the Identity By State. 
+ + :type df: DataFrame + :param df: Original dataframe + + :type samples: list + :param samples: List of samples to use for calculating the IBS + + :type skipMultiAllelic: bool + :param skipMultiAllelic: Skip variants where any of the samples has a secondary alternate + + :type skipReference: bool + :param skipReference: Skip variants where both samples of the pair are HOM_REF + + :type numPairs: int + :param numPairs: + + :rtype: DataFrame + :return: Transformed dataframe + """ + return IBSTransformer(samples=samples, skipReference=skipReference, skipMultiAllelic=skipMultiAllelic, + numPairs=numPairs).transform(df) + + def imputeSex(self, df, lowerThreshold=None, upperThreshold=None, chromosomeX=None, includePseudoautosomalRegions=None, par1chrX=None, par2chrX=None): + """ + Estimate sex of the individuals calculating the inbreeding coefficients F on the chromosome X. + + :type df: DataFrame + :param df: Original dataframe + + :type lowerThreshold: float + :param lowerThreshold: + + :type upperThreshold: float + :param upperThreshold: + + :type chromosomeX: str + :param chromosomeX: + + :type includePseudoautosomalRegions: bool + :param includePseudoautosomalRegions: + + :type par1chrX: str + :param par1chrX: + + :type par2chrX: str + :param par2chrX: + + :rtype: DataFrame + :return: Transformed dataframe + """ + return ImputeSexTransformer(lowerThreshold=lowerThreshold, upperThreshold=upperThreshold, chromosomeX=chromosomeX, + includePseudoautosomalRegions=includePseudoautosomalRegions, par1chrX=par1chrX, par2chrX=par2chrX).transform(df) + + def inbreedingCoefficient(self, df, missingGenotypesAsHomRef=None, includeMultiAllelicGenotypes=None, mafThreshold=None): + """ + Count observed and expected autosomal homozygous genotype for each sample, and report method-of-moments F coefficient estimates. (Ritland, Kermit. 
1996) + Values: + - Total genotypes Count : Total count of genotypes for sample + - Observed homozygotes : Count of observed homozygote genotypes for each sample, in each variant + - Expected homozygotes : Count of expected homozygote genotypes for each sample, in each variant. + Calculated with the MAF of the cohort ALL. 1.0-(2.0*maf*(1.0-maf)) + - F : Inbreeding coefficient. Calculated as: + ([observed hom. count] - [expected count]) / ([total genotypes count] - [expected count]) + Unless otherwise specified, the genotype counts will exclude the missing and multi-allelic genotypes. + + :type df: DataFrame + :param df: Original dataframe + + :type missingGenotypesAsHomRef: bool + :param missingGenotypesAsHomRef: Treat missing genotypes as HomRef genotypes + + :type includeMultiAllelicGenotypes: bool + :param includeMultiAllelicGenotypes: Include multi-allelic variants in the calculation + + :type mafThreshold: float + :param mafThreshold: Include multi-allelic variants in the calculation + + :rtype: DataFrame + :return: Transformed dataframe + """ + return InbreedingCoefficientTransformer(missingGenotypesAsHomRef=missingGenotypesAsHomRef, + includeMultiAllelicGenotypes=includeMultiAllelicGenotypes, mafThreshold=mafThreshold).transform(df) + + def mendel(self, df, father, mother, child, studyId=None): + """ + Using Plink Mendel error codes + https://www.cog-genomics.org/plink2/basic_stats#mendel + + :type df: DataFrame + :param df: Original dataframe + + :type father: str + :param father: + + :type mother: str + :param mother: + + :type child: str + :param child: + + :type studyId: str + :param studyId: + + + :rtype: DataFrame + :return: Transformed dataframe + """ + return MendelianErrorTransformer(father=father, mother=mother, child=child, studyId=studyId).transform(df) + + # def de_novo(self, df): + # def ld_matrix(self, df): + # def impute_sex(self, df): + # def hwe_normalized_pca(self, df): + # def concordance(self, df): + # def cancer_signature(self, df): 
#https://cancer.sanger.ac.uk/cosmic/signatures + + def modeOfInheritance(self, df, family, modeOfInheritance, phenotype, studyId=None, incompletePenetrance=None, missingAsReference=None): + """ + Filter variants that match a given Mode Of Inheritance pattern. + + Accepted patterns: + - monoallelic, also known as dominant + - biallelic, also known as recessive + - xLinked + - yLinked + + :type df: DataFrame + :param df: Original dataframe + + :type family: str + :param family: Select family to apply the filter + + :type modeOfInheritance: str + :param modeOfInheritance: Filter by mode of inheritance from a given family. Accepted values: monoallelic (dominant), + biallelic (recessive), xLinkedMonoallelic, xLinkedBiallelic, yLinked + + :type phenotype: str + :param phenotype: + + :type studyId: str + :param studyId: + + :type incompletePenetrance: bool + :param incompletePenetrance: Allow variants with an incomplete penetrance mode of inheritance + + :type missingAsReference: bool + :param missingAsReference: + + :rtype: DataFrame + :return: Transformed dataframe + """ + return ModeOfInheritanceTransformer(family=family, modeOfInheritance=modeOfInheritance, phenotype=phenotype, studyId=studyId, + incompletePenetrance=incompletePenetrance, missingAsReference=missingAsReference).transform(df) + + def tdt(self, df, studyId, phenotype): + """ + + :type df: DataFrame + """ + return TdtTransformer(studyId=studyId, phenotype=phenotype).transform(df) + + def stats(self, df, studyId=None, cohort=None, samples=None, missingAsReference=None): + """ + + :type df: DataFrame + :param df: Original dataframe + + :type studyId: str + :param studyId: + + :type cohort: str + :param cohort: Name of the cohort to calculate stats from. By default, 'ALL' + + :type samples: list + :param samples: Samples belonging to the cohort. If empty, will try to read from metadata.
If missing, will use all samples + from the dataset + + :type missingAsReference: bool + :param missingAsReference: Count missing alleles as reference alleles + + :rtype: DataFrame + :return: Transformed dataframe + """ + return VariantStatsTransformer(studyId=studyId, cohort=cohort, samples=samples, missingAsReference=missingAsReference).transform(df) + + def globalStats(self, df, studyId=None, fileId=None): + """ + + :type df: DataFrame + """ + return VariantSetStatsTransformer(studyId=studyId, fileId=fileId).transform(df) + + +class VariantMetadataManager(JavaWrapper): + + def __init__(self): + super(VariantMetadataManager, self).__init__() + self._java_obj = self._new_java_obj("org.opencb.oskar.spark.variant.VariantMetadataManager") + self.python_utils = PythonUtils() + + def getMetadataPath(self, path): + return path + ".meta.json.gz" + + def readMetadata(self, meta_path): + """ + Reads the VariantMetadata from the given metadata file. + + :type meta_path: str + :param meta_path: Path to the metadata file + + :rtype: dict + :return: An instance of VariantMetadata + """ + java_vm = self._call_java("readMetadata", meta_path) + return self.python_utils.toPythonDict(java_vm) + + def setVariantMetadata(self, df, variant_metadata): + """ + Writes the VariantMetadata into the schema metadata from the given dataframe.
+ + :type df: DataFrame + :param df: DataFrame to modify + + :type variant_metadata: VariantMetadata + :param variant_metadata: VariantMetadata to set + + :rtype: DataFrame + :return: Modified DataFrame + """ + java_object = self.python_utils.toJavaObject(variant_metadata, + "org.opencb.biodata.models.variant.metadata.VariantMetadata") + return self._call_java("setVariantMetadata", df, java_object) + + def variantMetadata(self, df): + java_vm = self._call_java("variantMetadata", df) + return self.python_utils.toPythonDict(java_vm) + + def samples(self, df, studyId=None): + if studyId is None: + return self.python_utils.toPythonDict(self._call_java("samples", df)) + else: + return self.python_utils.toPythonDict(self._call_java("samples", df, studyId)) + + def pedigrees(self, df, studyId=None): + if studyId is None: + java_vm = self._call_java("pedigrees", df) + return self.python_utils.toPythonDict(java_vm) + else: + java_vm = self._call_java("pedigrees", df, studyId) + return self.python_utils.toPythonDict(java_vm) + + +class OskarException(Exception): + + def __init__(self, *args, **kwargs): + super(OskarException, self).__init__(*args, **kwargs) + + +class PythonUtils(JavaWrapper): + + def __init__(self): + super(PythonUtils, self).__init__() + self._java_obj = self._new_java_obj("org.opencb.oskar.spark.commons.PythonUtils") + + def toJavaObject(self, python_dict, class_name): + js = json.dumps(python_dict, ensure_ascii=False) + return self._call_java("toJavaObject", js, class_name) + + def toPythonDict(self, java_object): + js = self._call_java("toJsonString", java_object) + return json.loads(js) diff --git a/oskar-spark/src/main/python/notebooks/notebooks/pyoskar/sql.py b/oskar-spark/src/main/python/notebooks/notebooks/pyoskar/sql.py new file mode 100644 index 0000000..ab8016c --- /dev/null +++ b/oskar-spark/src/main/python/notebooks/notebooks/pyoskar/sql.py @@ -0,0 +1,175 @@ + +import sys + +from pyspark.ml.util import _jvm + +if sys.version > '3': + 
basestring = str + +from pyspark.ml.wrapper import JavaWrapper +from pyspark.sql.column import Column, _to_java_column + + +class VariantUdfManager(JavaWrapper): + __java_class = None + + @classmethod + def _java_class(cls): + if cls.__java_class is None: + cls.__java_class = _jvm().org.opencb.oskar.spark.variant.udf.VariantUdfManager + return cls.__java_class + + def __init__(self): + super(VariantUdfManager, self).__init__() + self._java_obj = self._new_java_obj("org.opencb.oskar.spark.variant.udf.VariantUdfManager") + + def loadVariantUdfs(self, spark): + self._call_java("loadVariantUdfs", spark._jsparkSession) + + +def revcomp(allele): + """ + Reverse and complementary. + + :param allele: + :return: + """ + jc = VariantUdfManager._java_class().revcomp(_to_java_column(allele)) + return Column(jc) + + +def include(column, include): + jc = VariantUdfManager._java_class().include(_to_java_column(column), include) + return Column(jc) + + +def include_studies(column, include): + jc = VariantUdfManager._java_class().include_studies(_to_java_column(column), include) + return Column(jc) + + +def study(studies, studyId): + jc = VariantUdfManager._java_class().study(_to_java_column(studies), studyId) + return Column(jc) + + +def file(studies, fileId): + jc = VariantUdfManager._java_class().file(_to_java_column(studies), fileId) + return Column(jc) + + +def file_attribute(studies, fileId, info): + jc = VariantUdfManager._java_class().file_attribute(_to_java_column(studies), fileId, info) + return Column(jc) + + +def file_filter(studies, fileId): + jc = VariantUdfManager._java_class().file_filter(_to_java_column(studies), fileId) + return Column(jc) + + +def file_qual(studies, fileId): + jc = VariantUdfManager._java_class().file_qual(_to_java_column(studies), fileId) + return Column(jc) + + +def genotype(studies, sample): + jc = VariantUdfManager._java_class().genotype(_to_java_column(studies), sample) + return Column(jc) + + +def sample_data(studies, sample): + jc = 
VariantUdfManager._java_class().sample_data(_to_java_column(studies), sample) + return Column(jc) + + +def sample_data_field(studies, sample, formatField): + jc = VariantUdfManager._java_class().sample_data_field(_to_java_column(studies), sample, formatField) + return Column(jc) + + +def genes(annotation): + jc = VariantUdfManager._java_class().genes(_to_java_column(annotation)) + return Column(jc) + + +def consequence_types(annotation): + jc = VariantUdfManager._java_class().consequence_types(_to_java_column(annotation)) + return Column(jc) + + +def consequence_types_by_gene(annotation, gene): + jc = VariantUdfManager._java_class().consequence_types_by_gene(_to_java_column(annotation), gene) + return Column(jc) + + +def protein_substitution(annotation, score): + """ + Returns an array with the MIN and the MAX value of the given ProteinSubstitutionScore. Empty array if not found. + + :type annotation: str + :param annotation: Annotation field + + :type score: str + :param score: + + :rtype: + :return: + """ + jc = VariantUdfManager._java_class().protein_substitution(_to_java_column(annotation), score) + return Column(jc) + + +def population_frequency(annotation, study, population): + jc = VariantUdfManager._java_class().population_frequency(_to_java_column(annotation), study, population) + return Column(jc) + + +def population_frequency_as_map(annotation): + jc = VariantUdfManager._java_class().population_frequency_as_map(_to_java_column(annotation)) + return Column(jc) + + +def biotypes(annotation): + jc = VariantUdfManager._java_class().biotypes(_to_java_column(annotation)) + return Column(jc) + + +def functional(annotation, source): + """ + Read the value for the Functional Score. Null if none. Main functional scores are: cadd_scaled and cadd_raw. 
+ + :type annotation: str + :param annotation: Annotation field + + :type source: str + :param source: Study source + + :rtype: + :return: Functional score + """ + + jc = VariantUdfManager._java_class().functional(_to_java_column(annotation), source) + return Column(jc) + + +def ensembl_genes(annotation): + jc = VariantUdfManager._java_class().ensembl_genes(_to_java_column(annotation)) + return Column(jc) + + +def conservation(annotation, source): + """ + Read the value for the Conservation Score. Null if none. Main conservation scores are: gerp, phastCons and phylop + + :type annotation: str + :param annotation: Annotation field + + :type source: str + :param source: Study source + + :rtype: + :return: Conservation score + """ + jc = VariantUdfManager._java_class().conservation(_to_java_column(annotation), source) + return Column(jc) diff --git a/oskar-spark/src/main/python/notebooks/stats.ipynb b/oskar-spark/src/main/python/notebooks/notebooks/stats.ipynb similarity index 100% rename from oskar-spark/src/main/python/notebooks/stats.ipynb rename to oskar-spark/src/main/python/notebooks/notebooks/stats.ipynb diff --git a/oskar-spark/src/main/python/notebooks/variant_filtering.ipynb b/oskar-spark/src/main/python/notebooks/notebooks/variant_filtering.ipynb similarity index 99% rename from oskar-spark/src/main/python/notebooks/variant_filtering.ipynb rename to oskar-spark/src/main/python/notebooks/notebooks/variant_filtering.ipynb index 8de1afc..fda498f 100644 --- a/oskar-spark/src/main/python/notebooks/variant_filtering.ipynb +++ b/oskar-spark/src/main/python/notebooks/notebooks/variant_filtering.ipynb @@ -48,7 +48,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 2, "metadata": {}, "outputs": [ { diff --git a/oskar-spark/src/main/python/notebooks/notebooks/variant_filtering_advanced.ipynb b/oskar-spark/src/main/python/notebooks/notebooks/variant_filtering_advanced.ipynb new file mode 100644 index 0000000..01e7169 --- /dev/null +++ 
b/oskar-spark/src/main/python/notebooks/notebooks/variant_filtering_advanced.ipynb @@ -0,0 +1,156 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# **Advanced variant filtering**" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from pyoskar.core import Oskar\n", + "from pyoskar.sql import *\n", + "from pyoskar.analysis import *\n", + "from pyspark.sql.functions import col, udf, count, explode, concat, when, expr\n", + "from pyspark.sql.functions import *\n", + "\n", + "oskar = Oskar(spark)\n", + "df = oskar.load(\"/home/roldanx/appl/oskar/oskar-spark/src/test/resources/platinum_chr22.small.parquet\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Hardy Weinberg" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+---------------+--------------------+\n", + "| id| HWE|\n", + "+---------------+--------------------+\n", + "|22:16054454:C:T| 1.0|\n", + "|22:16065809:T:C| 1.0|\n", + "|22:16077310:T:A| 0.9254727474972191|\n", + "|22:16080499:A:G| 1.0|\n", + "|22:16084621:T:C| 1.0|\n", + "|22:16091610:G:T| 1.0|\n", + "|22:16096040:G:A| 0.4746014089729329|\n", + "|22:16099957:C:T|0.016007636455477054|\n", + "|22:16100462:A:G|0.001011008618240...|\n", + "|22:16105660:G:A| 0.3037449017426771|\n", + "+---------------+--------------------+\n", + "only showing top 10 rows\n", + "\n" + ] + } + ], + "source": [ + "oskar.hardyWeinberg(df,\"hgvauser@platinum:illumina_platinum\").select(\"id\", \"HWE\").show(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Inbreeding coefficient" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + 
"+--------+-------------------+-----------+------------------+--------------+\n", + "|SampleId| F|ObservedHom| ExpectedHom|GenotypesCount|\n", + "+--------+-------------------+-----------+------------------+--------------+\n", + "| NA12877|-1.0857581722788996| 70|233.97577702999115| 385|\n", + "| NA12878|-1.1024114888695444| 69|244.65916746854782| 404|\n", + "| NA12879|-1.1890914293957586| 69| 247.7093403339386| 398|\n", + "| NA12880|-1.1013660394101679| 71|248.15224742889404| 409|\n", + "| NA12881|-1.1560267972581504| 65| 252.6643579006195| 415|\n", + "| NA12882|-1.0112382612189488| 76| 224.8269881606102| 372|\n", + "| NA12883|-1.0602574055431329| 67|229.62110525369644| 383|\n", + "| NA12884|-1.0340014363992485| 74|224.47404664754868| 370|\n", + "| NA12885|-1.1105665251221366| 78| 254.8010356426239| 414|\n", + "| NA12886| -1.067867784696387| 72|244.48096668720245| 406|\n", + "+--------+-------------------+-----------+------------------+--------------+\n", + "only showing top 10 rows\n", + "\n" + ] + } + ], + "source": [ + "df2 = oskar.stats(df, studyId=\"hgvauser@platinum:illumina_platinum\", missingAsReference=True)\n", + "oskar.inbreedingCoefficient(df2).show(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Mendelian error" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+------------------+--------------+\n", + "| id|mendelianError|\n", + "+------------------+--------------+\n", + "|22:19748211:CCCC:-| 1|\n", + "+------------------+--------------+\n", + "\n" + ] + } + ], + "source": [ + "oskar.mendel(df, \"NA12877\", \"NA12878\", \"NA12879\").select(\"id\", \"mendelianError\").filter(col(\"mendelianError\") != \"0\").show()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + 
"version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/oskar-spark/src/main/python/notebooks/variant_filtering_advanced.ipynb b/oskar-spark/src/main/python/notebooks/variant_filtering_advanced.ipynb deleted file mode 100644 index cf6719d..0000000 --- a/oskar-spark/src/main/python/notebooks/variant_filtering_advanced.ipynb +++ /dev/null @@ -1,123 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# **Advanced variant filtering**" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "ename": "TypeError", - "evalue": "'JavaPackage' object is not callable", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mpyspark\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msql\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfunctions\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 7\u001b[0;31m \u001b[0moskar\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mOskar\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mspark\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 8\u001b[0m \u001b[0mdf\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0moskar\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"/home/roldanx/appl/oskar/oskar-spark/src/test/resources/platinum_chr22.small.parquet\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/appl/oskar/oskar-spark/src/main/python/pyoskar/core.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, spark)\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__init__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mspark\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 14\u001b[0m \u001b[0msuper\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mOskar\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__init__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 15\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_java_obj\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_new_java_obj\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"org.opencb.oskar.spark.variant.Oskar\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mspark\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_jsparkSession\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 16\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mspark\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mspark\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 17\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmetadata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mVariantMetadataManager\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/soft/spark-2.4.0-bin-hadoop2.7/python/pyspark/ml/wrapper.py\u001b[0m in 
\u001b[0;36m_new_java_obj\u001b[0;34m(java_class, *args)\u001b[0m\n\u001b[1;32m 65\u001b[0m \u001b[0mjava_obj\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mgetattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mjava_obj\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 66\u001b[0m \u001b[0mjava_args\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0m_py2java\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0marg\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0marg\u001b[0m \u001b[0;32min\u001b[0m \u001b[0margs\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 67\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mjava_obj\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0mjava_args\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 68\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 69\u001b[0m \u001b[0;34m@\u001b[0m\u001b[0mstaticmethod\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mTypeError\u001b[0m: 'JavaPackage' object is not callable" - ] - } - ], - "source": [ - "from pyoskar.core import Oskar\n", - "from pyoskar.sql import *\n", - "from pyoskar.analysis import *\n", - "from pyspark.sql.functions import col, udf, count, explode, concat, when, expr\n", - "from pyspark.sql.functions import *\n", - "\n", - "oskar = Oskar(spark)\n", - "df = oskar.load(\"/home/roldanx/appl/oskar/oskar-spark/src/test/resources/platinum_chr22.small.parquet\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Hardy Weinberg" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "ename": "NameError", - "evalue": "name 'oskar' is not defined", - "output_type": "error", - "traceback": [ - 
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0moskar\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhardyWeinberg\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\"hgvauser@platinum:illumina_platinum\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mselect\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"id\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"HWE\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshow\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m10\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;31mNameError\u001b[0m: name 'oskar' is not defined" - ] - } - ], - "source": [ - "oskar.hardyWeinberg(df,\"hgvauser@platinum:illumina_platinum\").select(\"id\", \"HWE\").show(10)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Inbreeding coefficient" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df2 = oskar.stats(df, studyId=\"hgvauser@platinum:illumina_platinum\", missingAsReference=True)\n", - "oskar.inbreedingCoefficient(df2).show(10)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Mendelian error" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "oskar.mendel(df, \"NA12877\", \"NA12878\", \"NA12879\").select(\"id\", \"mendelianError\").filter(col(\"mendelianError\") != \"0\").show()" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": 
"text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.7" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/pom.xml b/pom.xml index 68378f8..0364bad 100644 --- a/pom.xml +++ b/pom.xml @@ -19,7 +19,7 @@ 0.1.0 4.6.1 - 1.4.3-SNAPSHOT + 1.5.0-SNAPSHOT 3.8.0-SNAPSHOT 2.3.2 2.7.3