\ No newline at end of file
diff --git a/objects.inv b/objects.inv
index 836b896d1..6dcc88f9d 100644
Binary files a/objects.inv and b/objects.inv differ
diff --git a/search/search_index.json b/search/search_index.json
index 268949d28..31167e36c 100644
--- a/search/search_index.json
+++ b/search/search_index.json
@@ -1 +1 @@
-{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"","title":"Home","text":"
Ingestion and analysis of genetic and functional genomic data for the identification and prioritisation of drug targets.
This project is still in an experimental phase. Please refer to the roadmap section for more information.
For information on how to configure the development environment, run the code, or contribute changes, see the contributing section. For known technical issues and solutions to them, see the troubleshooting section.
"},{"location":"contributing/","title":"Environment configuration and contributing changes","text":""},{"location":"contributing/#one-time-configuration","title":"One-time configuration","text":"
The steps in this section only ever need to be done once on any particular system.
Google Cloud configuration: 1. Install Google Cloud SDK: https://cloud.google.com/sdk/docs/install. 2. Log in to your work Google Account: run gcloud auth login and follow instructions. 3. Obtain Google application credentials: run gcloud auth application-default login and follow instructions.
Check that you have the make utility installed, and if not (which is unlikely), install it using your system package manager.
Run make setup-dev to install/update the necessary packages and activate the development environment. You need to do this every time you open a new shell.
It is recommended to use VS Code as an IDE for development.
"},{"location":"contributing/#how-to-run-the-code","title":"How to run the code","text":"
All pipelines in this repository are intended to be run in Google Dataproc. Running them locally is not currently supported.
In order to run the code:
Manually edit your local workflow/dag.yaml file and comment out the steps you do not want to run.
Manually edit your local pyproject.toml file and modify the version of the code.
This must be different from the version used by any other people working on the repository to avoid any deployment conflicts, so it's a good idea to use your name, for example: 1.2.3+jdoe.
You can also add a brief branch description, for example: 1.2.3+jdoe.myfeature.
Note that the version must comply with PEP440 conventions, otherwise Poetry will not allow it to be deployed.
Do not use underscores or hyphens in your version name. When building the WHL file, they will be automatically converted to dots, which means the file name will no longer match the version and the build will fail. Use dots instead.
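To see why underscores and hyphens break the build, here is a minimal sketch using the packaging library (an assumption: it is available in your environment, as it ships with most Poetry setups) showing how PEP 440 normalises local version labels:

```python
from packaging.version import Version

# PEP 440 normalises '-' and '_' in the local version segment to '.',
# so the wheel's file name stops matching the version you declared.
print(Version("1.2.3+jdoe_myfeature"))  # 1.2.3+jdoe.myfeature
print(Version("1.2.3+jdoe.myfeature"))  # 1.2.3+jdoe.myfeature (already canonical)
```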
Run make build.
This will create a bundle containing the necessary code, configuration and dependencies to run the ETL pipeline, and then upload this bundle to Google Cloud.
A version-specific subpath is used, so uploading the code will not affect any branches but your own.
If there was already a code bundle uploaded with the same version number, it will be replaced.
Submit the Dataproc job with poetry run python workflow/workflow_template.py.
You will need to specify additional parameters, some mandatory and some optional. Run with --help to see usage.
The script will provision the cluster and submit the job.
The cluster will take a few minutes to be provisioned and start running, during which the script will not output anything; this is normal.
Once submitted, you can monitor the progress of your job on this page: https://console.cloud.google.com/dataproc/jobs?project=open-targets-genetics-dev.
On completion (whether successful or a failure), the cluster will be automatically removed, so you don't have to worry about shutting it down to avoid incurring charges.
"},{"location":"contributing/#how-to-generate-a-local-copy-of-the-documentation","title":"How to generate a local copy of the documentation","text":"
Run poetry run mkdocs serve. This will generate the local copy of the documentation and will start a local server to browse it (URL will be printed, usually http://127.0.0.1:8000/).
"},{"location":"contributing/#how-to-run-the-tests","title":"How to run the tests","text":"
When making changes, and especially when implementing a new module or feature, it's essential to ensure that all relevant sections of the code base are modified.
If during development you had a question which wasn't covered in the documentation, and someone explained it to you, add it to the documentation. The same applies if you encountered any instructions in the documentation which were obsolete or incorrect.
Documentation autogeneration expressions start with :::. They will automatically generate sections of the documentation based on class and method docstrings. Be sure to update them for:
Dataset definitions in docs/reference/dataset (example: docs/reference/dataset/study_index/study_index_finngen.md)
Step definitions in docs/reference/step (example: docs/reference/step/finngen.md)
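For illustration, a dataset reference page is typically just a heading plus the autogeneration expression; the exact module path below is an assumption based on the src/otg layout:

```markdown
# Study index FinnGen

::: otg.dataset.study_index.StudyIndexFinnGen
```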
If you see errors related to BLAS/LAPACK libraries, see this StackOverflow post for guidance.
"},{"location":"troubleshooting/#pyenv-and-poetry","title":"Pyenv and Poetry","text":"
Various errors thrown by Pyenv or Poetry can be hard to specifically diagnose and resolve. In this case, it often helps to remove those tools from the system completely. Follow these steps:
Close your currently activated environment, if any: exit
Officially, PySpark requires Java version 8 (a.k.a. 1.8) or above to work. However, if you have a very recent version of Java, you may experience issues, as it may introduce breaking changes that PySpark hasn't had time to integrate. For example, as of May 2023, PySpark did not work with Java 20.
If you are encountering problems with initialising a Spark session, try using Java 11.
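For example, a minimal sketch (the installation path is an assumption; adjust it for your system) that points PySpark at a Java 11 installation before the session is created:

```python
import os

from pyspark.sql import SparkSession

# PySpark launches the JVM via JAVA_HOME, so set it before building the session.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"  # example path

spark = SparkSession.builder.master("local[*]").appName("java-check").getOrCreate()
print(spark.version)
```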
If you see an error message thrown by pre-commit that looks like SyntaxError: Unexpected token '?', followed by a JavaScript traceback, the issue is likely with your system NodeJS version.
One solution which can help in this case is to upgrade your system NodeJS version. However, this may not always be possible. For example, the Ubuntu repository was several major versions behind the latest release as of July 2023.
Another solution which helps is to remove Node, NodeJS, and npm from your system entirely. In this case, pre-commit will not try to rely on a system version of NodeJS and will install its own, suitable one.
On Ubuntu, this can be done using sudo apt remove node nodejs npm, followed by sudo apt autoremove. But in some cases, depending on your existing installation, you may need to also manually remove some files. See this StackOverflow answer for guidance.
After running these commands, you are advised to open a fresh shell, and then also reinstall Pyenv and Poetry to make sure they pick up the changes (see relevant section above).
Dataset is a wrapper around a Spark DataFrame with a predefined schema. Schemas for each child dataset are described in the json.schemas module.
Source code in src/otg/dataset/dataset.py
@dataclass\nclass Dataset:\n\"\"\"Open Targets Genetics Dataset.\n\n `Dataset` is a wrapper around a Spark DataFrame with a predefined schema. Schemas for each child dataset are described in the `json.schemas` module.\n \"\"\"\n\n _df: DataFrame\n _schema: StructType\n\n def __post_init__(self: Dataset) -> None:\n\"\"\"Post init.\"\"\"\n self.validate_schema()\n\n @property\n def df(self: Dataset) -> DataFrame:\n\"\"\"Dataframe included in the Dataset.\"\"\"\n return self._df\n\n @df.setter\n def df(self: Dataset, new_df: DataFrame) -> None: # noqa: CCE001\n\"\"\"Dataframe setter.\"\"\"\n self._df = new_df\n self.validate_schema()\n\n @property\n def schema(self: Dataset) -> StructType:\n\"\"\"Dataframe expected schema.\"\"\"\n return self._schema\n\n @classmethod\n def from_parquet(\n cls: type[Dataset], session: Session, path: str, schema: StructType\n ) -> Dataset:\n\"\"\"Reads a parquet file into a Dataset with a given schema.\n\n Args:\n session (Session): ETL session\n path (str): Path to parquet file\n schema (StructType): Schema to use\n\n Returns:\n Dataset: Dataset with given schema\n \"\"\"\n df = session.read_parquet(path=path, schema=schema)\n return cls(_df=df, _schema=schema)\n\n def validate_schema(self: Dataset) -> None: # sourcery skip: invert-any-all\n\"\"\"Validate DataFrame schema against expected class schema.\n\n Raises:\n ValueError: DataFrame schema is not valid\n \"\"\"\n expected_schema = self._schema\n expected_fields = flatten_schema(expected_schema)\n observed_schema = self._df.schema\n observed_fields = flatten_schema(observed_schema)\n\n # Unexpected fields in dataset\n if unexpected_struct_fields := [\n x for x in observed_fields if x not in expected_fields\n ]:\n raise ValueError(\n f\"The {unexpected_struct_fields} fields are not included in DataFrame schema: {expected_fields}\"\n )\n\n # Required fields not in dataset\n required_fields = [x.name for x in expected_schema if not x.nullable]\n if missing_required_fields := [\n req\n for req in required_fields\n if not any(field.name == req for field in observed_fields)\n ]:\n raise ValueError(\n f\"The {missing_required_fields} fields are required but missing: {required_fields}\"\n )\n\n # Fields with duplicated names\n if duplicated_fields := [\n x for x in set(observed_fields) if observed_fields.count(x) > 1\n ]:\n raise ValueError(\n f\"The following fields are duplicated in DataFrame schema: {duplicated_fields}\"\n )\n\n # Fields with different datatype\n if fields_with_different_observed_datatype := [\n field\n for field in set(observed_fields)\n if observed_fields.count(field) != expected_fields.count(field)\n ]:\n raise ValueError(\n f\"The following fields present differences in their datatypes: {fields_with_different_observed_datatype}.\"\n )\n
Validate DataFrame schema against expected class schema.
Raises:
ValueError: DataFrame schema is not valid
Source code in src/otg/dataset/dataset.py
def validate_schema(self: Dataset) -> None: # sourcery skip: invert-any-all\n\"\"\"Validate DataFrame schema against expected class schema.\n\n Raises:\n ValueError: DataFrame schema is not valid\n \"\"\"\n expected_schema = self._schema\n expected_fields = flatten_schema(expected_schema)\n observed_schema = self._df.schema\n observed_fields = flatten_schema(observed_schema)\n\n # Unexpected fields in dataset\n if unexpected_struct_fields := [\n x for x in observed_fields if x not in expected_fields\n ]:\n raise ValueError(\n f\"The {unexpected_struct_fields} fields are not included in DataFrame schema: {expected_fields}\"\n )\n\n # Required fields not in dataset\n required_fields = [x.name for x in expected_schema if not x.nullable]\n if missing_required_fields := [\n req\n for req in required_fields\n if not any(field.name == req for field in observed_fields)\n ]:\n raise ValueError(\n f\"The {missing_required_fields} fields are required but missing: {required_fields}\"\n )\n\n # Fields with duplicated names\n if duplicated_fields := [\n x for x in set(observed_fields) if observed_fields.count(x) > 1\n ]:\n raise ValueError(\n f\"The following fields are duplicated in DataFrame schema: {duplicated_fields}\"\n )\n\n # Fields with different datatype\n if fields_with_different_observed_datatype := [\n field\n for field in set(observed_fields)\n if observed_fields.count(field) != expected_fields.count(field)\n ]:\n raise ValueError(\n f\"The following fields present differences in their datatypes: {fields_with_different_observed_datatype}.\"\n )\n
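As a plain-Python illustration of the duplicate-field check above (toy field names, no Spark required):

```python
# Mirrors the `duplicated_fields` comprehension in validate_schema.
observed_fields = ["variantId", "chromosome", "variantId"]
duplicated_fields = [x for x in set(observed_fields) if observed_fields.count(x) > 1]
print(duplicated_fields)  # ['variantId']
```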
Convert intervals into V2G by intersecting with a variant index.
Parameters:
variant_index (VariantIndex): Variant index dataset [required]
Returns:
V2G: Variant-to-gene evidence dataset
Source code in src/otg/dataset/intervals.py
def v2g(self: Intervals, variant_index: VariantIndex) -> V2G:\n\"\"\"Convert intervals into V2G by intersecting with a variant index.\n\n Args:\n variant_index (VariantIndex): Variant index dataset\n\n Returns:\n V2G: Variant-to-gene evidence dataset\n \"\"\"\n return V2G(\n _df=(\n # TODO: We can include the start and end position as part of the `on` clause in the join\n self.df.alias(\"interval\")\n .join(\n variant_index.df.selectExpr(\n \"chromosome as vi_chromosome\", \"variantId\", \"position\"\n ).alias(\"vi\"),\n on=[\n f.col(\"vi.vi_chromosome\") == f.col(\"interval.chromosome\"),\n f.col(\"vi.position\").between(\n f.col(\"interval.start\"), f.col(\"interval.end\")\n ),\n ],\n how=\"inner\",\n )\n .drop(\"start\", \"end\", \"vi_chromosome\")\n )\n )\n
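A toy sketch of the join condition used above, with made-up rows and a local Spark session (both assumptions):

```python
import pyspark.sql.functions as f
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[*]").getOrCreate()
intervals = spark.createDataFrame(
    [("1", 100, 200, "ENSG00000000001")], ["chromosome", "start", "end", "geneId"]
)
variants = spark.createDataFrame(
    [("1", 150, "1_150_A_G")], ["vi_chromosome", "position", "variantId"]
)

# A variant joins an interval when chromosomes match and its position
# falls between the interval's start and end.
intervals.join(
    variants,
    on=[
        f.col("vi_chromosome") == f.col("chromosome"),
        f.col("position").between(f.col("start"), f.col("end")),
    ],
    how="inner",
).show()
```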
Annotate LD index with indices starting and stopping at a given interval.
Parameters:
ld_radius (int): radius around each position [required]
Returns:
LDIndex: including start_idx and stop_idx columns
Source code in src/otg/dataset/ld_index.py
def annotate_index_intervals(self: LDIndex, ld_radius: int) -> LDIndex:\n\"\"\"Annotate LD index with indices starting and stopping at a given interval.\n\n Args:\n ld_radius (int): radius around each position\n\n Returns:\n LDIndex: including `start_idx` and `stop_idx` columns\n \"\"\"\n index_with_positions = (\n self._df.drop(\"start_idx\", \"stop_idx\")\n .select(\n \"*\",\n LDIndex._interval_start(\n contig=f.col(\"chromosome\"),\n position=f.col(\"position\"),\n ld_radius=ld_radius,\n ).alias(\"start_pos\"),\n LDIndex._interval_stop(\n contig=f.col(\"chromosome\"),\n position=f.col(\"position\"),\n ld_radius=ld_radius,\n ).alias(\"stop_pos\"),\n )\n .persist()\n )\n\n self.df = (\n index_with_positions.join(\n (\n index_with_positions\n # Given the multiple variants with the same chromosome/position can have different indices, filter for the lowest index:\n .transform(\n lambda df: get_record_with_minimum_value(\n df, [\"chromosome\", \"position\"], \"idx\"\n )\n ).select(\n \"chromosome\",\n f.col(\"position\").alias(\"start_pos\"),\n f.col(\"idx\").alias(\"start_idx\"),\n )\n ),\n on=[\"chromosome\", \"start_pos\"],\n )\n .join(\n (\n index_with_positions\n # Given the multiple variants with the same chromosome/position can have different indices, filter for the highest index:\n .transform(\n lambda df: get_record_with_maximum_value(\n df, [\"chromosome\", \"position\"], \"idx\"\n )\n ).select(\n \"chromosome\",\n f.col(\"position\").alias(\"stop_pos\"),\n f.col(\"idx\").alias(\"stop_idx\"),\n )\n ),\n on=[\"chromosome\", \"stop_pos\"],\n )\n # Filter out variants for which start idx > stop idx due to liftover\n .filter(f.col(\"start_idx\") < f.col(\"stop_idx\"))\n .drop(\"start_pos\", \"stop_pos\")\n )\n\n return self\n
Filter summary statistics based on the provided p-value threshold.
Parameters:
pvalue (float): upper limit of the p-value to be filtered upon [required]
Returns:
SummaryStatistics: summary statistics object containing single-point associations with p-values at least as significant as the provided threshold
Source code in src/otg/dataset/summary_statistics.py
def pvalue_filter(self: SummaryStatistics, pvalue: float) -> SummaryStatistics:\n\"\"\"Filter summary statistics based on the provided p-value threshold.\n\n Args:\n pvalue (float): upper limit of the p-value to be filtered upon.\n\n Returns:\n SummaryStatistics: summary statistics object containing single point associations with p-values at least as significant as the provided threshold.\n \"\"\"\n # Converting p-value to mantissa and exponent:\n (mantissa, exponent) = split_pvalue(pvalue)\n\n # Applying filter:\n df = self._df.filter(\n (f.col(\"pValueExponent\") < exponent)\n | (\n (f.col(\"pValueExponent\") == exponent)\n & (f.col(\"pValueMantissa\") <= mantissa)\n )\n )\n return SummaryStatistics(_df=df)\n
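The mantissa/exponent comparison can be illustrated in plain Python (assuming split_pvalue(5e-8) yields (5.0, -8)):

```python
# A p-value m x 10^e passes the threshold M x 10^E when e < E,
# or when e == E and m <= M; this is exactly the filter applied above.
def passes_threshold(mantissa: float, exponent: int,
                     thr_mantissa: float = 5.0, thr_exponent: int = -8) -> bool:
    return exponent < thr_exponent or (
        exponent == thr_exponent and mantissa <= thr_mantissa
    )

print(passes_threshold(4.9, -8))  # True: 4.9e-8 is at least as significant as 5e-8
print(passes_threshold(1.0, -7))  # False: 1e-7 is less significant than 5e-8
```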
Dataset with variant-level annotations derived from GnomAD.
Source code in src/otg/dataset/variant_annotation.py
@dataclass\nclass VariantAnnotation(Dataset):\n\"\"\"Dataset with variant-level annotations derived from GnomAD.\"\"\"\n\n _schema: StructType = parse_spark_schema(\"variant_annotation.json\")\n\n @classmethod\n def from_parquet(\n cls: type[VariantAnnotation], session: Session, path: str\n ) -> VariantAnnotation:\n\"\"\"Initialise VariantAnnotation from parquet file.\n\n Args:\n session (Session): ETL session\n path (str): Path to parquet file\n\n Returns:\n VariantAnnotation: VariantAnnotation dataset\n \"\"\"\n df = session.read_parquet(path=path, schema=cls._schema)\n return cls(_df=df, _schema=cls._schema)\n\n @classmethod\n def from_gnomad(\n cls: type[VariantAnnotation],\n gnomad_file: str,\n grch38_to_grch37_chain: str,\n populations: list,\n ) -> VariantAnnotation:\n\"\"\"Generate variant annotation dataset from gnomAD.\n\n Some relevant modifications to the original dataset are:\n\n 1. The transcript consequences features provided by VEP are filtered to only refer to the Ensembl canonical transcript.\n 2. Genome coordinates are liftovered from GRCh38 to GRCh37 to keep as annotation.\n 3. Field names are converted to camel case to follow the convention.\n\n Args:\n gnomad_file (str): Path to `gnomad.genomes.vX.X.X.sites.ht` gnomAD dataset\n grch38_to_grch37_chain (str): Path to chain file for liftover\n populations (list): List of populations to include in the dataset\n\n Returns:\n VariantAnnotation: Variant annotation dataset\n \"\"\"\n # Load variants dataset\n ht = hl.read_table(\n gnomad_file,\n _load_refs=False,\n )\n\n # Liftover\n grch37 = hl.get_reference(\"GRCh37\")\n grch38 = hl.get_reference(\"GRCh38\")\n grch38.add_liftover(grch38_to_grch37_chain, grch37)\n\n # Drop non biallelic variants\n ht = ht.filter(ht.alleles.length() == 2)\n # Liftover\n ht = ht.annotate(locus_GRCh37=hl.liftover(ht.locus, \"GRCh37\"))\n # Select relevant fields and nested records to create class\n return cls(\n _df=(\n ht.select(\n gnomad3VariantId=hl.str(\"-\").join(\n [\n ht.locus.contig.replace(\"chr\", \"\"),\n hl.str(ht.locus.position),\n ht.alleles[0],\n ht.alleles[1],\n ]\n ),\n chromosome=ht.locus.contig.replace(\"chr\", \"\"),\n position=convert_gnomad_position_to_ensembl_hail(\n ht.locus.position, ht.alleles[0], ht.alleles[1]\n ),\n variantId=hl.str(\"_\").join(\n [\n ht.locus.contig.replace(\"chr\", \"\"),\n hl.str(\n convert_gnomad_position_to_ensembl_hail(\n ht.locus.position, ht.alleles[0], ht.alleles[1]\n )\n ),\n ht.alleles[0],\n ht.alleles[1],\n ]\n ),\n chromosomeB37=ht.locus_GRCh37.contig.replace(\"chr\", \"\"),\n positionB37=ht.locus_GRCh37.position,\n referenceAllele=ht.alleles[0],\n alternateAllele=ht.alleles[1],\n rsIds=ht.rsid,\n alleleType=ht.allele_info.allele_type,\n cadd=hl.struct(\n phred=ht.cadd.phred,\n raw=ht.cadd.raw_score,\n ),\n alleleFrequencies=hl.set([f\"{pop}-adj\" for pop in populations]).map(\n lambda p: hl.struct(\n populationName=p,\n alleleFrequency=ht.freq[ht.globals.freq_index_dict[p]].AF,\n )\n ),\n vep=hl.struct(\n mostSevereConsequence=ht.vep.most_severe_consequence,\n transcriptConsequences=hl.map(\n lambda x: hl.struct(\n aminoAcids=x.amino_acids,\n consequenceTerms=x.consequence_terms,\n geneId=x.gene_id,\n lof=x.lof,\n polyphenScore=x.polyphen_score,\n polyphenPrediction=x.polyphen_prediction,\n siftScore=x.sift_score,\n siftPrediction=x.sift_prediction,\n ),\n # Only keeping canonical transcripts\n ht.vep.transcript_consequences.filter(\n lambda x: (x.canonical == 1)\n & (x.gene_symbol_source == \"HGNC\")\n ),\n ),\n ),\n )\n 
.key_by(\"chromosome\", \"position\")\n .drop(\"locus\", \"alleles\")\n .select_globals()\n .to_spark(flatten=False)\n )\n )\n\n def persist(self: VariantAnnotation) -> VariantAnnotation:\n\"\"\"Persist DataFrame included in the Dataset.\"\"\"\n self.df = self._df.persist()\n return self\n\n def max_maf(self: VariantAnnotation) -> Column:\n\"\"\"Maximum minor allele frequency accross all populations.\n\n Returns:\n Column: Maximum minor allele frequency accross all populations.\n \"\"\"\n return f.array_max(\n f.transform(\n self.df.alleleFrequencies,\n lambda af: f.when(\n af.alleleFrequency > 0.5, 1 - af.alleleFrequency\n ).otherwise(af.alleleFrequency),\n )\n )\n\n def filter_by_variant_df(\n self: VariantAnnotation, df: DataFrame, cols: list[str]\n ) -> VariantAnnotation:\n\"\"\"Filter variant annotation dataset by a variant dataframe.\n\n Args:\n df (DataFrame): A dataframe of variants\n cols (List[str]): A list of columns to join on\n\n Returns:\n VariantAnnotation: A filtered variant annotation dataset\n \"\"\"\n self.df = self._df.join(f.broadcast(df.select(cols)), on=cols, how=\"inner\")\n return self\n\n def get_transcript_consequence_df(\n self: VariantAnnotation, filter_by: Optional[GeneIndex] = None\n ) -> DataFrame:\n\"\"\"Dataframe of exploded transcript consequences.\n\n Optionally the trancript consequences can be reduced to the universe of a gene index.\n\n Args:\n filter_by (GeneIndex): A gene index. Defaults to None.\n\n Returns:\n DataFrame: A dataframe exploded by transcript consequences with the columns variantId, chromosome, transcriptConsequence\n \"\"\"\n # exploding the array removes records without VEP annotation\n transript_consequences = self.df.withColumn(\n \"transcriptConsequence\", f.explode(\"vep.transcriptConsequences\")\n ).select(\n \"variantId\",\n \"chromosome\",\n \"position\",\n \"transcriptConsequence\",\n f.col(\"transcriptConsequence.geneId\").alias(\"geneId\"),\n )\n if filter_by:\n transript_consequences = transript_consequences.join(\n f.broadcast(filter_by.df),\n on=[\"chromosome\", \"geneId\"],\n )\n return transript_consequences.persist()\n\n def get_most_severe_vep_v2g(\n self: VariantAnnotation,\n vep_consequences: DataFrame,\n filter_by: GeneIndex,\n ) -> V2G:\n\"\"\"Creates a dataset with variant to gene assignments based on VEP's predicted consequence on the transcript.\n\n Optionally the trancript consequences can be reduced to the universe of a gene index.\n\n Args:\n vep_consequences (DataFrame): A dataframe of VEP consequences\n filter_by (GeneIndex): A gene index to filter by. 
Defaults to None.\n\n Returns:\n V2G: High and medium severity variant to gene assignments\n \"\"\"\n vep_lut = vep_consequences.select(\n f.element_at(f.split(\"Accession\", r\"/\"), -1).alias(\n \"variantFunctionalConsequenceId\"\n ),\n f.col(\"Term\").alias(\"label\"),\n f.col(\"v2g_score\").cast(\"double\").alias(\"score\"),\n )\n\n return V2G(\n _df=self.get_transcript_consequence_df(filter_by)\n .select(\n \"variantId\",\n \"chromosome\",\n \"position\",\n f.col(\"transcriptConsequence.geneId\").alias(\"geneId\"),\n f.explode(\"transcriptConsequence.consequenceTerms\").alias(\"label\"),\n f.lit(\"vep\").alias(\"datatypeId\"),\n f.lit(\"variantConsequence\").alias(\"datasourceId\"),\n )\n # A variant can have multiple predicted consequences on a transcript, the most severe one is selected\n .join(\n f.broadcast(vep_lut),\n on=\"label\",\n how=\"inner\",\n )\n .filter(f.col(\"score\") != 0)\n .transform(\n lambda df: get_record_with_maximum_value(\n df, [\"variantId\", \"geneId\"], \"score\"\n )\n )\n )\n\n def get_polyphen_v2g(\n self: VariantAnnotation, filter_by: Optional[GeneIndex] = None\n ) -> V2G:\n\"\"\"Creates a dataset with variant to gene assignments with a PolyPhen's predicted score on the transcript.\n\n Polyphen informs about the probability that a substitution is damaging. Optionally the trancript consequences can be reduced to the universe of a gene index.\n\n Args:\n filter_by (GeneIndex): A gene index to filter by. Defaults to None.\n\n Returns:\n V2G: variant to gene assignments with their polyphen scores\n \"\"\"\n return V2G(\n _df=self.get_transcript_consequence_df(filter_by)\n .filter(f.col(\"transcriptConsequence.polyphenScore\").isNotNull())\n .select(\n \"variantId\",\n \"chromosome\",\n \"position\",\n \"geneId\",\n f.col(\"transcriptConsequence.polyphenScore\").alias(\"score\"),\n f.col(\"transcriptConsequence.polyphenPrediction\").alias(\"label\"),\n f.lit(\"vep\").alias(\"datatypeId\"),\n f.lit(\"polyphen\").alias(\"datasourceId\"),\n )\n )\n\n def get_sift_v2g(self: VariantAnnotation, filter_by: GeneIndex) -> V2G:\n\"\"\"Creates a dataset with variant to gene assignments with a SIFT's predicted score on the transcript.\n\n SIFT informs about the probability that a substitution is tolerated so scores nearer zero are more likely to be deleterious.\n Optionally the trancript consequences can be reduced to the universe of a gene index.\n\n Args:\n filter_by (GeneIndex): A gene index to filter by.\n\n Returns:\n V2G: variant to gene assignments with their SIFT scores\n \"\"\"\n return V2G(\n _df=self.get_transcript_consequence_df(filter_by)\n .filter(f.col(\"transcriptConsequence.siftScore\").isNotNull())\n .select(\n \"variantId\",\n \"chromosome\",\n \"position\",\n \"geneId\",\n f.expr(\"1 - transcriptConsequence.siftScore\").alias(\"score\"),\n f.col(\"transcriptConsequence.siftPrediction\").alias(\"label\"),\n f.lit(\"vep\").alias(\"datatypeId\"),\n f.lit(\"sift\").alias(\"datasourceId\"),\n )\n )\n\n def get_plof_v2g(self: VariantAnnotation, filter_by: GeneIndex) -> V2G:\n\"\"\"Creates a dataset with variant to gene assignments with a flag indicating if the variant is predicted to be a loss-of-function variant by the LOFTEE algorithm.\n\n Optionally the trancript consequences can be reduced to the universe of a gene index.\n\n Args:\n filter_by (GeneIndex): A gene index to filter by.\n\n Returns:\n V2G: variant to gene assignments from the LOFTEE algorithm\n \"\"\"\n return V2G(\n _df=self.get_transcript_consequence_df(filter_by)\n 
.filter(f.col(\"transcriptConsequence.lof\").isNotNull())\n .withColumn(\n \"isHighQualityPlof\",\n f.when(f.col(\"transcriptConsequence.lof\") == \"HC\", True).when(\n f.col(\"transcriptConsequence.lof\") == \"LC\", False\n ),\n )\n .withColumn(\n \"score\",\n f.when(f.col(\"isHighQualityPlof\"), 1.0).when(\n ~f.col(\"isHighQualityPlof\"), 0\n ),\n )\n .select(\n \"variantId\",\n \"chromosome\",\n \"position\",\n \"geneId\",\n \"isHighQualityPlof\",\n f.col(\"score\"),\n f.lit(\"vep\").alias(\"datatypeId\"),\n f.lit(\"loftee\").alias(\"datasourceId\"),\n )\n )\n\n def get_distance_to_tss(\n self: VariantAnnotation,\n filter_by: GeneIndex,\n max_distance: int = 500_000,\n ) -> V2G:\n\"\"\"Extracts variant to gene assignments for variants falling within a window of a gene's TSS.\n\n Args:\n filter_by (GeneIndex): A gene index to filter by.\n max_distance (int): The maximum distance from the TSS to consider. Defaults to 500_000.\n\n Returns:\n V2G: variant to gene assignments with their distance to the TSS\n \"\"\"\n return V2G(\n _df=self.df.alias(\"variant\")\n .join(\n f.broadcast(filter_by.locations_lut()).alias(\"gene\"),\n on=[\n f.col(\"variant.chromosome\") == f.col(\"gene.chromosome\"),\n f.abs(f.col(\"variant.position\") - f.col(\"gene.tss\"))\n <= max_distance,\n ],\n how=\"inner\",\n )\n .withColumn(\n \"inverse_distance\",\n max_distance - f.abs(f.col(\"variant.position\") - f.col(\"gene.tss\")),\n )\n .transform(lambda df: normalise_column(df, \"inverse_distance\", \"score\"))\n .select(\n \"variantId\",\n f.col(\"variant.chromosome\").alias(\"chromosome\"),\n \"position\",\n \"geneId\",\n \"score\",\n f.lit(\"distance\").alias(\"datatypeId\"),\n f.lit(\"canonical_tss\").alias(\"datasourceId\"),\n )\n )\n
Extracts variant to gene assignments for variants falling within a window of a gene's TSS.
Parameters:
filter_by (GeneIndex): A gene index to filter by [required]
max_distance (int): The maximum distance from the TSS to consider [default: 500_000]
Returns:
V2G: variant to gene assignments with their distance to the TSS
Source code in src/otg/dataset/variant_annotation.py
def get_distance_to_tss(\n self: VariantAnnotation,\n filter_by: GeneIndex,\n max_distance: int = 500_000,\n) -> V2G:\n\"\"\"Extracts variant to gene assignments for variants falling within a window of a gene's TSS.\n\n Args:\n filter_by (GeneIndex): A gene index to filter by.\n max_distance (int): The maximum distance from the TSS to consider. Defaults to 500_000.\n\n Returns:\n V2G: variant to gene assignments with their distance to the TSS\n \"\"\"\n return V2G(\n _df=self.df.alias(\"variant\")\n .join(\n f.broadcast(filter_by.locations_lut()).alias(\"gene\"),\n on=[\n f.col(\"variant.chromosome\") == f.col(\"gene.chromosome\"),\n f.abs(f.col(\"variant.position\") - f.col(\"gene.tss\"))\n <= max_distance,\n ],\n how=\"inner\",\n )\n .withColumn(\n \"inverse_distance\",\n max_distance - f.abs(f.col(\"variant.position\") - f.col(\"gene.tss\")),\n )\n .transform(lambda df: normalise_column(df, \"inverse_distance\", \"score\"))\n .select(\n \"variantId\",\n f.col(\"variant.chromosome\").alias(\"chromosome\"),\n \"position\",\n \"geneId\",\n \"score\",\n f.lit(\"distance\").alias(\"datatypeId\"),\n f.lit(\"canonical_tss\").alias(\"datasourceId\"),\n )\n )\n
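To make the scoring concrete, a small sketch of the inverse-distance transform (normalise_column then rescales these values; the numbers are illustrative):

```python
max_distance = 500_000

# Variants closer to the TSS get a larger inverse distance, and therefore
# a higher score after normalisation.
for distance in (0, 100_000, 499_999):
    inverse_distance = max_distance - distance
    print(distance, "->", inverse_distance)
```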
Creates a dataset with variant to gene assignments based on VEP's predicted consequence on the transcript.
Optionally, the transcript consequences can be reduced to the universe of a gene index.
Parameters:
vep_consequences (DataFrame): A dataframe of VEP consequences [required]
filter_by (GeneIndex): A gene index to filter by [required]
Returns:
V2G: High and medium severity variant to gene assignments
Source code in src/otg/dataset/variant_annotation.py
def get_most_severe_vep_v2g(\n self: VariantAnnotation,\n vep_consequences: DataFrame,\n filter_by: GeneIndex,\n) -> V2G:\n\"\"\"Creates a dataset with variant to gene assignments based on VEP's predicted consequence on the transcript.\n\n Optionally the trancript consequences can be reduced to the universe of a gene index.\n\n Args:\n vep_consequences (DataFrame): A dataframe of VEP consequences\n filter_by (GeneIndex): A gene index to filter by. Defaults to None.\n\n Returns:\n V2G: High and medium severity variant to gene assignments\n \"\"\"\n vep_lut = vep_consequences.select(\n f.element_at(f.split(\"Accession\", r\"/\"), -1).alias(\n \"variantFunctionalConsequenceId\"\n ),\n f.col(\"Term\").alias(\"label\"),\n f.col(\"v2g_score\").cast(\"double\").alias(\"score\"),\n )\n\n return V2G(\n _df=self.get_transcript_consequence_df(filter_by)\n .select(\n \"variantId\",\n \"chromosome\",\n \"position\",\n f.col(\"transcriptConsequence.geneId\").alias(\"geneId\"),\n f.explode(\"transcriptConsequence.consequenceTerms\").alias(\"label\"),\n f.lit(\"vep\").alias(\"datatypeId\"),\n f.lit(\"variantConsequence\").alias(\"datasourceId\"),\n )\n # A variant can have multiple predicted consequences on a transcript, the most severe one is selected\n .join(\n f.broadcast(vep_lut),\n on=\"label\",\n how=\"inner\",\n )\n .filter(f.col(\"score\") != 0)\n .transform(\n lambda df: get_record_with_maximum_value(\n df, [\"variantId\", \"geneId\"], \"score\"\n )\n )\n )\n
Creates a dataset with variant to gene assignments with a flag indicating if the variant is predicted to be a loss-of-function variant by the LOFTEE algorithm.
Optionally, the transcript consequences can be reduced to the universe of a gene index.
Parameters:
filter_by (GeneIndex): A gene index to filter by [required]
Returns:
V2G: variant to gene assignments from the LOFTEE algorithm
Source code in src/otg/dataset/variant_annotation.py
def get_plof_v2g(self: VariantAnnotation, filter_by: GeneIndex) -> V2G:\n\"\"\"Creates a dataset with variant to gene assignments with a flag indicating if the variant is predicted to be a loss-of-function variant by the LOFTEE algorithm.\n\n Optionally the trancript consequences can be reduced to the universe of a gene index.\n\n Args:\n filter_by (GeneIndex): A gene index to filter by.\n\n Returns:\n V2G: variant to gene assignments from the LOFTEE algorithm\n \"\"\"\n return V2G(\n _df=self.get_transcript_consequence_df(filter_by)\n .filter(f.col(\"transcriptConsequence.lof\").isNotNull())\n .withColumn(\n \"isHighQualityPlof\",\n f.when(f.col(\"transcriptConsequence.lof\") == \"HC\", True).when(\n f.col(\"transcriptConsequence.lof\") == \"LC\", False\n ),\n )\n .withColumn(\n \"score\",\n f.when(f.col(\"isHighQualityPlof\"), 1.0).when(\n ~f.col(\"isHighQualityPlof\"), 0\n ),\n )\n .select(\n \"variantId\",\n \"chromosome\",\n \"position\",\n \"geneId\",\n \"isHighQualityPlof\",\n f.col(\"score\"),\n f.lit(\"vep\").alias(\"datatypeId\"),\n f.lit(\"loftee\").alias(\"datasourceId\"),\n )\n )\n
Creates a dataset with variant to gene assignments with PolyPhen's predicted score on the transcript.
PolyPhen informs about the probability that a substitution is damaging. Optionally, the transcript consequences can be reduced to the universe of a gene index.
Parameters:
filter_by (GeneIndex): A gene index to filter by [default: None]
Returns:
V2G: variant to gene assignments with their polyphen scores
Source code in src/otg/dataset/variant_annotation.py
def get_polyphen_v2g(\n self: VariantAnnotation, filter_by: Optional[GeneIndex] = None\n) -> V2G:\n\"\"\"Creates a dataset with variant to gene assignments with a PolyPhen's predicted score on the transcript.\n\n Polyphen informs about the probability that a substitution is damaging. Optionally the trancript consequences can be reduced to the universe of a gene index.\n\n Args:\n filter_by (GeneIndex): A gene index to filter by. Defaults to None.\n\n Returns:\n V2G: variant to gene assignments with their polyphen scores\n \"\"\"\n return V2G(\n _df=self.get_transcript_consequence_df(filter_by)\n .filter(f.col(\"transcriptConsequence.polyphenScore\").isNotNull())\n .select(\n \"variantId\",\n \"chromosome\",\n \"position\",\n \"geneId\",\n f.col(\"transcriptConsequence.polyphenScore\").alias(\"score\"),\n f.col(\"transcriptConsequence.polyphenPrediction\").alias(\"label\"),\n f.lit(\"vep\").alias(\"datatypeId\"),\n f.lit(\"polyphen\").alias(\"datasourceId\"),\n )\n )\n
Creates a dataset with variant to gene assignments with SIFT's predicted score on the transcript.
SIFT informs about the probability that a substitution is tolerated, so scores nearer zero are more likely to be deleterious. Optionally, the transcript consequences can be reduced to the universe of a gene index.
Parameters:
filter_by (GeneIndex): A gene index to filter by [required]
Returns:
V2G: variant to gene assignments with their SIFT scores
Source code in src/otg/dataset/variant_annotation.py
def get_sift_v2g(self: VariantAnnotation, filter_by: GeneIndex) -> V2G:\n\"\"\"Creates a dataset with variant to gene assignments with a SIFT's predicted score on the transcript.\n\n SIFT informs about the probability that a substitution is tolerated so scores nearer zero are more likely to be deleterious.\n Optionally the trancript consequences can be reduced to the universe of a gene index.\n\n Args:\n filter_by (GeneIndex): A gene index to filter by.\n\n Returns:\n V2G: variant to gene assignments with their SIFT scores\n \"\"\"\n return V2G(\n _df=self.get_transcript_consequence_df(filter_by)\n .filter(f.col(\"transcriptConsequence.siftScore\").isNotNull())\n .select(\n \"variantId\",\n \"chromosome\",\n \"position\",\n \"geneId\",\n f.expr(\"1 - transcriptConsequence.siftScore\").alias(\"score\"),\n f.col(\"transcriptConsequence.siftPrediction\").alias(\"label\"),\n f.lit(\"vep\").alias(\"datatypeId\"),\n f.lit(\"sift\").alias(\"datasourceId\"),\n )\n )\n
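Because low SIFT scores mean deleterious, the expression 1 - siftScore above flips the scale so that higher V2G scores mean more damaging; a trivial sketch:

```python
# SIFT score near 0 = likely deleterious; the inversion makes it a high V2G score.
for sift_score in (0.0, 0.05, 0.5, 1.0):
    print(sift_score, "->", 1 - sift_score)
```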
Optionally, the transcript consequences can be reduced to the universe of a gene index.
Parameters:
filter_by (GeneIndex): A gene index [default: None]
Returns:
DataFrame: A dataframe exploded by transcript consequences with the columns variantId, chromosome, transcriptConsequence
Source code in src/otg/dataset/variant_annotation.py
def get_transcript_consequence_df(\n self: VariantAnnotation, filter_by: Optional[GeneIndex] = None\n) -> DataFrame:\n\"\"\"Dataframe of exploded transcript consequences.\n\n Optionally the trancript consequences can be reduced to the universe of a gene index.\n\n Args:\n filter_by (GeneIndex): A gene index. Defaults to None.\n\n Returns:\n DataFrame: A dataframe exploded by transcript consequences with the columns variantId, chromosome, transcriptConsequence\n \"\"\"\n # exploding the array removes records without VEP annotation\n transript_consequences = self.df.withColumn(\n \"transcriptConsequence\", f.explode(\"vep.transcriptConsequences\")\n ).select(\n \"variantId\",\n \"chromosome\",\n \"position\",\n \"transcriptConsequence\",\n f.col(\"transcriptConsequence.geneId\").alias(\"geneId\"),\n )\n if filter_by:\n transript_consequences = transript_consequences.join(\n f.broadcast(filter_by.df),\n on=[\"chromosome\", \"geneId\"],\n )\n return transript_consequences.persist()\n
Source code in src/otg/dataset/variant_annotation.py
def persist(self: VariantAnnotation) -> VariantAnnotation:\n\"\"\"Persist DataFrame included in the Dataset.\"\"\"\n self.df = self._df.persist()\n return self\n
def persist(self: VariantIndex) -> VariantIndex:\n\"\"\"Persist DataFrame included in the Dataset.\"\"\"\n self.df = self._df.persist()\n return self\n
"},{"location":"components/dataset/variant_to_gene/","title":"Variant to gene","text":"
Bases: Dataset
Variant-to-gene (V2G) evidence dataset.
Variant-to-gene (V2G) evidence is understood as any piece of evidence that supports the association of a variant with a likely causal gene. The evidence can sometimes be context-specific and refer to specific biofeatures (e.g. cell types).
Source code in src/otg/dataset/v2g.py
@dataclass\nclass V2G(Dataset):\n\"\"\"Variant-to-gene (V2G) evidence dataset.\n\n A variant-to-gene (V2G) evidence is understood as any piece of evidence that supports the association of a variant with a likely causal gene. The evidence can sometimes be context-specific and refer to specific `biofeatures` (e.g. cell types)\n \"\"\"\n\n _schema: StructType = parse_spark_schema(\"v2g.json\")\n\n @classmethod\n def from_parquet(cls: type[V2G], session: Session, path: str) -> V2G:\n\"\"\"Initialise V2G from parquet file.\n\n Args:\n session (Session): ETL session\n path (str): Path to parquet file\n\n Returns:\n V2G: V2G dataset\n \"\"\"\n df = session.read_parquet(path=path, schema=cls._schema)\n return cls(_df=df, _schema=cls._schema)\n\n def filter_by_genes(self: V2G, genes: GeneIndex) -> V2G:\n\"\"\"Filter by V2G dataset by genes.\n\n Args:\n genes (GeneIndex): Gene index dataset to filter by\n\n Returns:\n V2G: V2G dataset filtered by genes\n \"\"\"\n self.df = self._df.join(genes.df.select(\"geneId\"), on=\"geneId\", how=\"inner\")\n return self\n
A study index dataset captures all the metadata for all studies, including GWAS and molecular QTL studies.
Source code in src/otg/dataset/study_index.py
@dataclass\nclass StudyIndex(Dataset):\n\"\"\"Study index dataset.\n\n A study index dataset captures all the metadata for all studies including GWAS and Molecular QTL.\n \"\"\"\n\n _schema: StructType = parse_spark_schema(\"studies.json\")\n\n @classmethod\n def from_parquet(cls: type[StudyIndex], session: Session, path: str) -> StudyIndex:\n\"\"\"Initialise StudyIndex from parquet file.\n\n Args:\n session (Session): ETL session\n path (str): Path to parquet file\n\n Returns:\n StudyIndex: Study index dataset\n \"\"\"\n df = session.read_parquet(path=path, schema=cls._schema)\n return cls(_df=df, _schema=cls._schema)\n\n def study_type_lut(self: StudyIndex) -> DataFrame:\n\"\"\"Return a lookup table of study type.\n\n Returns:\n DataFrame: A dataframe containing `studyId` and `studyType` columns.\n \"\"\"\n return self.df.select(\"studyId\", \"studyType\")\n
A dataframe containing studyId and studyType columns.
Source code in src/otg/dataset/study_index.py
def study_type_lut(self: StudyIndex) -> DataFrame:\n\"\"\"Return a lookup table of study type.\n\n Returns:\n DataFrame: A dataframe containing `studyId` and `studyType` columns.\n \"\"\"\n return self.df.select(\"studyId\", \"studyType\")\n
"},{"location":"components/dataset/study_index/study_index_finngen/","title":"Study index finngen","text":"
Bases: StudyIndex
Study index dataset from FinnGen.
The following information is aggregated/extracted:
Study ID in the special format (FINNGEN_R9_*)
Trait name (for example, Amoebiasis)
Number of cases and controls
Link to the summary statistics location
Some fields are also populated as constants, such as study type and the initial sample size.
Source code in src/otg/dataset/study_index.py
@dataclass\nclass StudyIndexFinnGen(StudyIndex):\n\"\"\"Study index dataset from FinnGen.\n\n The following information is aggregated/extracted:\n\n - Study ID in the special format (FINNGEN_R9_*)\n - Trait name (for example, Amoebiasis)\n - Number of cases and controls\n - Link to the summary statistics location\n\n Some fields are also populated as constants, such as study type and the initial sample size.\n \"\"\"\n\n @classmethod\n def from_source(\n cls: type[StudyIndexFinnGen],\n finngen_studies: DataFrame,\n finngen_release_prefix: str,\n finngen_sumstat_url_prefix: str,\n finngen_sumstat_url_suffix: str,\n ) -> StudyIndexFinnGen:\n\"\"\"This function ingests study level metadata from FinnGen.\n\n Args:\n finngen_studies (DataFrame): FinnGen raw study table\n finngen_release_prefix (str): Release prefix pattern.\n finngen_sumstat_url_prefix (str): URL prefix for summary statistics location.\n finngen_sumstat_url_suffix (str): URL prefix suffix for summary statistics location.\n\n Returns:\n StudyIndexFinnGen: Parsed and annotated FinnGen study table.\n \"\"\"\n return cls(\n _df=(\n # Read FinnGen raw data.\n finngen_studies.select(\n # Select the desired columns.\n f.concat(\n f.lit(finngen_release_prefix + \"_\"), f.col(\"phenocode\")\n ).alias(\"studyId\"),\n f.col(\"phenostring\").alias(\"traitFromSource\"),\n f.col(\"num_cases\").alias(\"nCases\"),\n f.col(\"num_controls\").alias(\"nControls\"),\n # Set constant value columns.\n f.lit(finngen_release_prefix).alias(\"projectId\"),\n f.lit(\"gwas\").alias(\"studyType\"),\n f.lit(True).alias(\"hasSumstats\"),\n f.lit(\"377,277 (210,870 females and 166,407 males)\").alias(\n \"initialSampleSize\"\n ),\n )\n .withColumn(\"nSamples\", f.col(\"nCases\") + f.col(\"nControls\"))\n .withColumn(\n \"summarystatsLocation\",\n f.concat(\n f.lit(finngen_sumstat_url_prefix),\n f.col(\"studyId\"),\n f.lit(finngen_sumstat_url_suffix),\n ),\n )\n )\n )\n
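A toy re-creation of the column logic in from_source, on a single made-up row (the local session and example values are assumptions, not real FinnGen data):

```python
import pyspark.sql.functions as f
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[*]").getOrCreate()
raw = spark.createDataFrame(
    [("I9_HYPTENS", "Hypertension", 55917, 162837)],
    ["phenocode", "phenostring", "num_cases", "num_controls"],
)

raw.select(
    # release prefix + "_" + phenocode -> FINNGEN_R9_I9_HYPTENS
    f.concat(f.lit("FINNGEN_R9_"), f.col("phenocode")).alias("studyId"),
    f.col("phenostring").alias("traitFromSource"),
    (f.col("num_cases") + f.col("num_controls")).alias("nSamples"),
).show(truncate=False)
```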
This dataset captures associations between studies/traits and genetic loci, as provided by fine-mapping methods.
Source code in src/otg/dataset/study_locus.py
@dataclass\nclass StudyLocus(Dataset):\n\"\"\"Study-Locus dataset.\n\n This dataset captures associations between study/traits and a genetic loci as provided by finemapping methods.\n \"\"\"\n\n _schema: StructType = parse_spark_schema(\"study_locus.json\")\n\n @staticmethod\n def _overlapping_peaks(credset_to_overlap: DataFrame) -> DataFrame:\n\"\"\"Calculate overlapping signals (study-locus) between GWAS-GWAS and GWAS-Molecular trait.\n\n Args:\n credset_to_overlap (DataFrame): DataFrame containing at least `studyLocusId`, `studyType`, `chromosome` and `tagVariantId` columns.\n\n Returns:\n DataFrame: containing `left_studyLocusId`, `right_studyLocusId` and `chromosome` columns.\n \"\"\"\n # Reduce columns to the minimum to reduce the size of the dataframe\n credset_to_overlap = credset_to_overlap.select(\n \"studyLocusId\", \"studyType\", \"chromosome\", \"tagVariantId\"\n )\n return (\n credset_to_overlap.alias(\"left\")\n .filter(f.col(\"studyType\") == \"gwas\")\n # Self join with complex condition. Left it's all gwas and right can be gwas or molecular trait\n .join(\n credset_to_overlap.alias(\"right\"),\n on=[\n f.col(\"left.chromosome\") == f.col(\"right.chromosome\"),\n f.col(\"left.tagVariantId\") == f.col(\"right.tagVariantId\"),\n (f.col(\"right.studyType\") != \"gwas\")\n | (f.col(\"left.studyLocusId\") > f.col(\"right.studyLocusId\")),\n ],\n how=\"inner\",\n )\n .select(\n f.col(\"left.studyLocusId\").alias(\"left_studyLocusId\"),\n f.col(\"right.studyLocusId\").alias(\"right_studyLocusId\"),\n f.col(\"left.chromosome\").alias(\"chromosome\"),\n )\n .distinct()\n .repartition(\"chromosome\")\n .persist()\n )\n\n @staticmethod\n def _align_overlapping_tags(\n credset_to_overlap: DataFrame, peak_overlaps: DataFrame\n ) -> StudyLocusOverlap:\n\"\"\"Align overlapping tags in pairs of overlapping study-locus, keeping all tags in both loci.\n\n Args:\n credset_to_overlap (DataFrame): containing `studyLocusId`, `studyType`, `chromosome`, `tagVariantId`, `logABF` and `posteriorProbability` columns.\n peak_overlaps (DataFrame): containing `left_studyLocusId`, `right_studyLocusId` and `chromosome` columns.\n\n Returns:\n StudyLocusOverlap: Pairs of overlapping study-locus with aligned tags.\n \"\"\"\n # Complete information about all tags in the left study-locus of the overlap\n overlapping_left = credset_to_overlap.select(\n f.col(\"chromosome\"),\n f.col(\"tagVariantId\"),\n f.col(\"studyLocusId\").alias(\"left_studyLocusId\"),\n f.col(\"logABF\").alias(\"left_logABF\"),\n f.col(\"posteriorProbability\").alias(\"left_posteriorProbability\"),\n ).join(peak_overlaps, on=[\"chromosome\", \"left_studyLocusId\"], how=\"inner\")\n\n # Complete information about all tags in the right study-locus of the overlap\n overlapping_right = credset_to_overlap.select(\n f.col(\"chromosome\"),\n f.col(\"tagVariantId\"),\n f.col(\"studyLocusId\").alias(\"right_studyLocusId\"),\n f.col(\"logABF\").alias(\"right_logABF\"),\n f.col(\"posteriorProbability\").alias(\"right_posteriorProbability\"),\n ).join(peak_overlaps, on=[\"chromosome\", \"right_studyLocusId\"], how=\"inner\")\n\n # Include information about all tag variants in both study-locus aligned by tag variant id\n return StudyLocusOverlap(\n _df=overlapping_left.join(\n overlapping_right,\n on=[\n \"chromosome\",\n \"right_studyLocusId\",\n \"left_studyLocusId\",\n \"tagVariantId\",\n ],\n how=\"outer\",\n )\n # ensures nullable=false for following columns\n .fillna(\n value=\"unknown\",\n subset=[\n \"chromosome\",\n \"right_studyLocusId\",\n 
\"left_studyLocusId\",\n \"tagVariantId\",\n ],\n )\n )\n\n @staticmethod\n def _update_quality_flag(\n qc: Column, flag_condition: Column, flag_text: StudyLocusQualityCheck\n ) -> Column:\n\"\"\"Update the provided quality control list with a new flag if condition is met.\n\n Args:\n qc (Column): Array column with the current list of qc flags.\n flag_condition (Column): This is a column of booleans, signing which row should be flagged\n flag_text (StudyLocusQualityCheck): Text for the new quality control flag\n\n Returns:\n Column: Array column with the updated list of qc flags.\n \"\"\"\n qc = f.when(qc.isNull(), f.array()).otherwise(qc)\n return f.when(\n flag_condition,\n f.array_union(qc, f.array(f.lit(flag_text.value))),\n ).otherwise(qc)\n\n @classmethod\n def from_parquet(cls: type[StudyLocus], session: Session, path: str) -> StudyLocus:\n\"\"\"Initialise StudyLocus from parquet file.\n\n Args:\n session (Session): spark session\n path (str): Path to parquet file\n\n Returns:\n StudyLocus: Study-locus dataset\n \"\"\"\n df = session.read_parquet(path=path, schema=cls._schema)\n return cls(_df=df, _schema=cls._schema)\n\n def credible_set(\n self: StudyLocus,\n credible_interval: CredibleInterval,\n ) -> StudyLocus:\n\"\"\"Filter study-locus tag variants based on given credible interval.\n\n Args:\n credible_interval (CredibleInterval): Credible interval to filter for.\n\n Returns:\n StudyLocus: Filtered study-locus dataset.\n \"\"\"\n self.df = self._df.withColumn(\n \"credibleSet\",\n f.expr(f\"filter(credibleSet, tag -> (tag.{credible_interval.value}))\"),\n )\n return self\n\n def overlaps(self: StudyLocus, study_index: StudyIndex) -> StudyLocusOverlap:\n\"\"\"Calculate overlapping study-locus.\n\n Find overlapping study-locus that share at least one tagging variant. 
All GWAS-GWAS and all GWAS-Molecular traits are computed with the Molecular traits always\n appearing on the right side.\n\n Args:\n study_index (StudyIndex): Study index to resolve study types.\n\n Returns:\n StudyLocusOverlap: Pairs of overlapping study-locus with aligned tags.\n \"\"\"\n credset_to_overlap = (\n self.df.join(study_index.study_type_lut(), on=\"studyId\", how=\"inner\")\n .withColumn(\"credibleSet\", f.explode(\"credibleSet\"))\n .select(\n \"studyLocusId\",\n \"studyType\",\n \"chromosome\",\n f.col(\"credibleSet.tagVariantId\").alias(\"tagVariantId\"),\n f.col(\"credibleSet.logABF\").alias(\"logABF\"),\n f.col(\"credibleSet.posteriorProbability\").alias(\"posteriorProbability\"),\n )\n .persist()\n )\n\n # overlapping study-locus\n peak_overlaps = self._overlapping_peaks(credset_to_overlap)\n\n # study-locus overlap by aligning overlapping variants\n return self._align_overlapping_tags(credset_to_overlap, peak_overlaps)\n\n def unique_lead_tag_variants(self: StudyLocus) -> DataFrame:\n\"\"\"All unique lead and tag variants contained in the `StudyLocus` dataframe.\n\n Returns:\n DataFrame: A dataframe containing `variantId` and `chromosome` columns.\n \"\"\"\n lead_tags = (\n self.df.select(\n f.col(\"variantId\"),\n f.col(\"chromosome\"),\n f.explode(\"credibleSet.tagVariantId\").alias(\"tagVariantId\"),\n )\n .repartition(\"chromosome\")\n .persist()\n )\n return (\n lead_tags.select(\"variantId\", \"chromosome\")\n .union(\n lead_tags.select(f.col(\"tagVariantId\").alias(\"variantId\"), \"chromosome\")\n )\n .distinct()\n )\n\n def unique_study_locus_ancestries(\n self: StudyLocus, studies: StudyIndexGWASCatalog\n ) -> DataFrame:\n\"\"\"All unique lead variant and ancestries contained in the `StudyLocus`.\n\n Args:\n studies (StudyIndexGWASCatalog): Metadata about studies in the `StudyLocus`.\n\n Returns:\n DataFrame: unique [\"variantId\", \"studyId\", \"gnomadPopulation\", \"chromosome\", \"relativeSampleSize\"]\n\n Note:\n This method is only available for GWAS Catalog studies.\n \"\"\"\n return (\n self.df.join(\n studies.get_gnomad_ancestry_sample_sizes(), on=\"studyId\", how=\"left\"\n )\n .filter(f.col(\"position\").isNotNull())\n .select(\n \"variantId\",\n \"chromosome\",\n \"studyId\",\n \"gnomadPopulation\",\n \"relativeSampleSize\",\n )\n .distinct()\n )\n\n def neglog_pvalue(self: StudyLocus) -> Column:\n\"\"\"Returns the negative log p-value.\n\n Returns:\n Column: Negative log p-value\n \"\"\"\n return calculate_neglog_pvalue(\n self.df.pValueMantissa,\n self.df.pValueExponent,\n )\n\n def annotate_credible_sets(self: StudyLocus) -> StudyLocus:\n\"\"\"Annotate study-locus dataset with credible set flags.\n\n Sorts the array in the `credibleSet` column elements by their `posteriorProbability` values in descending order and adds\n `is95CredibleSet` and `is99CredibleSet` fields to the elements, indicating which are the tagging variants whose cumulative sum\n of their `posteriorProbability` values is below 0.95 and 0.99, respectively.\n\n Returns:\n StudyLocus: including annotation on `is95CredibleSet` and `is99CredibleSet`.\n \"\"\"\n self.df = self.df.withColumn(\n # Sort credible set by posterior probability in descending order\n \"credibleSet\",\n f.when(\n f.size(f.col(\"credibleSet\")) > 0,\n order_array_of_structs_by_field(\"credibleSet\", \"posteriorProbability\"),\n ).when(f.size(f.col(\"credibleSet\")) == 0, f.col(\"credibleSet\")),\n ).withColumn(\n # Calculate array of cumulative sums of posterior probabilities to determine which variants are in 
the 95% and 99% credible sets\n # and zip the cumulative sums array with the credible set array to add the flags\n \"credibleSet\",\n f.when(\n f.size(f.col(\"credibleSet\")) > 0,\n f.zip_with(\n f.col(\"credibleSet\"),\n f.transform(\n f.sequence(f.lit(1), f.size(f.col(\"credibleSet\"))),\n lambda index: f.aggregate(\n f.slice(\n # By using `index - 1` we introduce a value of `0.0` in the cumulative sums array. to ensure that the last variant\n # that exceeds the 0.95 threshold is included in the cumulative sum, as its probability is necessary to satisfy the threshold.\n f.col(\"credibleSet.posteriorProbability\"),\n 1,\n index - 1,\n ),\n f.lit(0.0),\n lambda acc, el: acc + el,\n ),\n ),\n lambda struct_e, acc: struct_e.withField(\n CredibleInterval.IS95.value, acc < 0.95\n ).withField(CredibleInterval.IS99.value, acc < 0.99),\n ),\n ).when(f.size(f.col(\"credibleSet\")) == 0, f.col(\"credibleSet\")),\n )\n return self\n\n def clump(self: StudyLocus) -> StudyLocus:\n\"\"\"Perform LD clumping of the studyLocus.\n\n Evaluates whether a lead variant is linked to a tag (with lowest p-value) in the same studyLocus dataset.\n\n Returns:\n StudyLocus: with empty credible sets for linked variants and QC flag.\n \"\"\"\n self.df = (\n self.df.withColumn(\n \"is_lead_linked\",\n LDclumping._is_lead_linked(\n self.df.studyId,\n self.df.variantId,\n self.df.pValueExponent,\n self.df.pValueMantissa,\n self.df.credibleSet,\n ),\n )\n .withColumn(\n \"credibleSet\",\n f.when(f.col(\"is_lead_linked\"), f.array()).otherwise(\n f.col(\"credibleSet\")\n ),\n )\n .withColumn(\n \"qualityControls\",\n StudyLocus._update_quality_flag(\n f.col(\"qualityControls\"),\n f.col(\"is_lead_linked\"),\n StudyLocusQualityCheck.LD_CLUMPED,\n ),\n )\n .drop(\"is_lead_linked\")\n )\n return self\n
Annotate study-locus dataset with credible set flags.
Sorts the array in the credibleSet column elements by their posteriorProbability values in descending order and adds is95CredibleSet and is99CredibleSet fields to the elements, indicating which are the tagging variants whose cumulative sum of their posteriorProbability values is below 0.95 and 0.99, respectively.
Returns:
StudyLocus: including annotation on is95CredibleSet and is99CredibleSet.
Source code in src/otg/dataset/study_locus.py
def annotate_credible_sets(self: StudyLocus) -> StudyLocus:\n\"\"\"Annotate study-locus dataset with credible set flags.\n\n Sorts the array in the `credibleSet` column elements by their `posteriorProbability` values in descending order and adds\n `is95CredibleSet` and `is99CredibleSet` fields to the elements, indicating which are the tagging variants whose cumulative sum\n of their `posteriorProbability` values is below 0.95 and 0.99, respectively.\n\n Returns:\n StudyLocus: including annotation on `is95CredibleSet` and `is99CredibleSet`.\n \"\"\"\n self.df = self.df.withColumn(\n # Sort credible set by posterior probability in descending order\n \"credibleSet\",\n f.when(\n f.size(f.col(\"credibleSet\")) > 0,\n order_array_of_structs_by_field(\"credibleSet\", \"posteriorProbability\"),\n ).when(f.size(f.col(\"credibleSet\")) == 0, f.col(\"credibleSet\")),\n ).withColumn(\n # Calculate array of cumulative sums of posterior probabilities to determine which variants are in the 95% and 99% credible sets\n # and zip the cumulative sums array with the credible set array to add the flags\n \"credibleSet\",\n f.when(\n f.size(f.col(\"credibleSet\")) > 0,\n f.zip_with(\n f.col(\"credibleSet\"),\n f.transform(\n f.sequence(f.lit(1), f.size(f.col(\"credibleSet\"))),\n lambda index: f.aggregate(\n f.slice(\n # By using `index - 1` we introduce a value of `0.0` in the cumulative sums array. to ensure that the last variant\n # that exceeds the 0.95 threshold is included in the cumulative sum, as its probability is necessary to satisfy the threshold.\n f.col(\"credibleSet.posteriorProbability\"),\n 1,\n index - 1,\n ),\n f.lit(0.0),\n lambda acc, el: acc + el,\n ),\n ),\n lambda struct_e, acc: struct_e.withField(\n CredibleInterval.IS95.value, acc < 0.95\n ).withField(CredibleInterval.IS99.value, acc < 0.99),\n ),\n ).when(f.size(f.col(\"credibleSet\")) == 0, f.col(\"credibleSet\")),\n )\n return self\n
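The index - 1 trick above can be illustrated in plain Python: each tag's flag is decided by the cumulative probability of the tags before it, so the tag that crosses the threshold is still included (toy probabilities):

```python
# Tags sorted by posterior probability, descending.
probs = [0.6, 0.25, 0.1, 0.05]

cumulative = 0.0  # sum over the *preceding* tags, i.e. the `index - 1` slice
for p in probs:
    print(p, "is95CredibleSet:", cumulative < 0.95)
    cumulative += p
# 0.1 is still flagged True because the preceding sum (0.85) is below 0.95;
# its probability is needed to satisfy the threshold.
```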
Find overlapping study-loci that share at least one tagging variant. All GWAS-GWAS and GWAS-molecular trait overlaps are computed, with the molecular traits always appearing on the right side.
Parameters:
study_index (StudyIndex): Study index to resolve study types [required]
Returns:
StudyLocusOverlap: Pairs of overlapping study-locus with aligned tags
Source code in src/otg/dataset/study_locus.py
def overlaps(self: StudyLocus, study_index: StudyIndex) -> StudyLocusOverlap:\n\"\"\"Calculate overlapping study-locus.\n\n Find overlapping study-locus that share at least one tagging variant. All GWAS-GWAS and all GWAS-Molecular traits are computed with the Molecular traits always\n appearing on the right side.\n\n Args:\n study_index (StudyIndex): Study index to resolve study types.\n\n Returns:\n StudyLocusOverlap: Pairs of overlapping study-locus with aligned tags.\n \"\"\"\n credset_to_overlap = (\n self.df.join(study_index.study_type_lut(), on=\"studyId\", how=\"inner\")\n .withColumn(\"credibleSet\", f.explode(\"credibleSet\"))\n .select(\n \"studyLocusId\",\n \"studyType\",\n \"chromosome\",\n f.col(\"credibleSet.tagVariantId\").alias(\"tagVariantId\"),\n f.col(\"credibleSet.logABF\").alias(\"logABF\"),\n f.col(\"credibleSet.posteriorProbability\").alias(\"posteriorProbability\"),\n )\n .persist()\n )\n\n # overlapping study-locus\n peak_overlaps = self._overlapping_peaks(credset_to_overlap)\n\n # study-locus overlap by aligning overlapping variants\n return self._align_overlapping_tags(credset_to_overlap, peak_overlaps)\n
Study-locus quality control options, listing concerns about the quality of the association.
Attributes:
| Name | Type | Description |
| --- | --- | --- |
| `SUBSIGNIFICANT_FLAG` | `str` | p-value does not reach the significance threshold |
| `NO_GENOMIC_LOCATION_FLAG` | `str` | Incomplete genomic mapping |
| `COMPOSITE_FLAG` | `str` | Composite association due to variant x variant interactions |
| `VARIANT_INCONSISTENCY_FLAG` | `str` | Inconsistencies in the reported variants |
| `NON_MAPPED_VARIANT_FLAG` | `str` | Variant not mapped to gnomAD |
| `PALINDROMIC_ALLELE_FLAG` | `str` | Alleles are palindromic - cannot harmonize |
| `AMBIGUOUS_STUDY` | `str` | Association with an ambiguous study |
| `UNRESOLVED_LD` | `str` | Variant not found in the LD reference |
| `LD_CLUMPED` | `str` | Explained by a more significant variant in high LD (clumped) |
Source code in src/otg/dataset/study_locus.py
class StudyLocusQualityCheck(Enum):\n\"\"\"Study-Locus quality control options listing concerns on the quality of the association.\n\n Attributes:\n SUBSIGNIFICANT_FLAG (str): p-value below significance threshold\n NO_GENOMIC_LOCATION_FLAG (str): Incomplete genomic mapping\n COMPOSITE_FLAG (str): Composite association due to variant x variant interactions\n VARIANT_INCONSISTENCY_FLAG (str): Inconsistencies in the reported variants\n NON_MAPPED_VARIANT_FLAG (str): Variant not mapped to GnomAd\n PALINDROMIC_ALLELE_FLAG (str): Alleles are palindromic - cannot harmonize\n AMBIGUOUS_STUDY (str): Association with ambiguous study\n UNRESOLVED_LD (str): Variant not found in LD reference\n LD_CLUMPED (str): Explained by a more significant variant in high LD (clumped)\n \"\"\"\n\n SUBSIGNIFICANT_FLAG = \"Subsignificant p-value\"\n NO_GENOMIC_LOCATION_FLAG = \"Incomplete genomic mapping\"\n COMPOSITE_FLAG = \"Composite association\"\n INCONSISTENCY_FLAG = \"Variant inconsistency\"\n NON_MAPPED_VARIANT_FLAG = \"No mapping in GnomAd\"\n PALINDROMIC_ALLELE_FLAG = \"Palindrome alleles - cannot harmonize\"\n AMBIGUOUS_STUDY = \"Association with ambiguous study\"\n UNRESOLVED_LD = \"Variant not found in LD reference\"\n LD_CLUMPED = \"Explained by a more significant variant in high LD (clumped)\"\n
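As a sketch of how these flags are typically consumed, the snippet below appends an enum's string value to a `qualityControls` array column when a condition holds. The actual helper used by the pipeline (`_update_quality_flag`) is defined elsewhere; this is an assumed, simplified equivalent:

```python
from pyspark.sql import SparkSession
import pyspark.sql.functions as f
import pyspark.sql.types as t

spark = SparkSession.builder.getOrCreate()

schema = t.StructType(
    [
        t.StructField("qualityControls", t.ArrayType(t.StringType())),
        t.StructField("failsCheck", t.BooleanType()),
    ]
)
df = spark.createDataFrame([([], True), ([], False)], schema)

# Append the flag's string value only where the QC condition is met.
flag = StudyLocusQualityCheck.COMPOSITE_FLAG.value  # assumes the enum is imported
df.withColumn(
    "qualityControls",
    f.when(
        f.col("failsCheck"),
        f.array_union(f.col("qualityControls"), f.array(f.lit(flag))),
    ).otherwise(f.col("qualityControls")),
).show(truncate=False)
```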
Interval within which an unobserved parameter value falls with a particular probability.
Attributes:
| Name | Type | Description |
| --- | --- | --- |
| `IS95` | `str` | 95% credible interval |
| `IS99` | `str` | 99% credible interval |
Source code in src/otg/dataset/study_locus.py
class CredibleInterval(Enum):\n\"\"\"Credible interval enum.\n\n Interval within which an unobserved parameter value falls with a particular probability.\n\n Attributes:\n IS95 (str): 95% credible interval\n IS99 (str): 99% credible interval\n \"\"\"\n\n IS95 = \"is95CredibleSet\"\n IS99 = \"is99CredibleSet\"\n
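A small illustrative sketch (assuming a `study_locus` dataset on which `annotate_credible_sets` has already been run) of how the enum values serve as struct field names when filtering a credible set:

```python
import pyspark.sql.functions as f

# Keep only the tagging variants flagged as part of the 95% credible set.
filtered = study_locus.df.withColumn(
    "credibleSet",
    f.filter(f.col("credibleSet"), lambda tag: tag[CredibleInterval.IS95.value]),
)
```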
"},{"location":"components/dataset/study_locus/study_locus_gwas_catalog/","title":"Study locus gwas catalog","text":"
Bases: StudyLocus
Study-locus dataset derived from GWAS Catalog.
Source code in src/otg/dataset/study_locus.py
class StudyLocusGWASCatalog(StudyLocus):\n\"\"\"Study-locus dataset derived from GWAS Catalog.\"\"\"\n\n @staticmethod\n def _parse_pvalue(pvalue: Column) -> tuple[Column, Column]:\n\"\"\"Parse p-value column.\n\n Args:\n pvalue (Column): p-value [string]\n\n Returns:\n tuple[Column, Column]: p-value mantissa and exponent\n\n Example:\n >>> import pyspark.sql.types as t\n >>> d = [(\"1.0\"), (\"0.5\"), (\"1E-20\"), (\"3E-3\"), (\"1E-1000\")]\n >>> df = spark.createDataFrame(d, t.StringType())\n >>> df.select('value',*StudyLocusGWASCatalog._parse_pvalue(f.col('value'))).show()\n +-------+--------------+--------------+\n | value|pValueMantissa|pValueExponent|\n +-------+--------------+--------------+\n | 1.0| 1.0| 1|\n | 0.5| 0.5| 1|\n | 1E-20| 1.0| -20|\n | 3E-3| 3.0| -3|\n |1E-1000| 1.0| -1000|\n +-------+--------------+--------------+\n <BLANKLINE>\n\n \"\"\"\n split = f.split(pvalue, \"E\")\n return split.getItem(0).cast(\"float\").alias(\"pValueMantissa\"), f.coalesce(\n split.getItem(1).cast(\"integer\"), f.lit(1)\n ).alias(\"pValueExponent\")\n\n @staticmethod\n def _normalise_pvaluetext(p_value_text: Column) -> Column:\n\"\"\"Normalised p-value text column to a standardised format.\n\n For cases where there is no mapping, the value is set to null.\n\n Args:\n p_value_text (Column): `pValueText` column from GWASCatalog\n\n Returns:\n Column: Array column after using GWAS Catalog mappings. There might be multiple mappings for a single p-value text.\n\n Example:\n >>> import pyspark.sql.types as t\n >>> d = [(\"European Ancestry\"), (\"African ancestry\"), (\"Alzheimer\u2019s Disease\"), (\"(progression)\"), (\"\"), (None)]\n >>> df = spark.createDataFrame(d, t.StringType())\n >>> df.withColumn('normalised', StudyLocusGWASCatalog._normalise_pvaluetext(f.col('value'))).show()\n +-------------------+----------+\n | value|normalised|\n +-------------------+----------+\n | European Ancestry| [EA]|\n | African ancestry| [AA]|\n |Alzheimer\u2019s Disease| [AD]|\n | (progression)| null|\n | | null|\n | null| null|\n +-------------------+----------+\n <BLANKLINE>\n\n \"\"\"\n # GWAS Catalog to p-value mapping\n json_dict = json.loads(\n pkg_resources.read_text(data, \"gwas_pValueText_map.json\", encoding=\"utf-8\")\n )\n map_expr = f.create_map(*[f.lit(x) for x in chain(*json_dict.items())])\n\n splitted_col = f.split(f.regexp_replace(p_value_text, r\"[\\(\\)]\", \"\"), \",\")\n mapped_col = f.transform(splitted_col, lambda x: map_expr[x])\n return f.when(f.forall(mapped_col, lambda x: x.isNull()), None).otherwise(\n mapped_col\n )\n\n @staticmethod\n def _normalise_risk_allele(risk_allele: Column) -> Column:\n\"\"\"Normalised risk allele column to a standardised format.\n\n If multiple risk alleles are present, the first one is returned.\n\n Args:\n risk_allele (Column): `riskAllele` column from GWASCatalog\n\n Returns:\n Column: mapped using GWAS Catalog mapping\n\n Example:\n >>> import pyspark.sql.types as t\n >>> d = [(\"rs1234-A-G\"), (\"rs1234-A\"), (\"rs1234-A; rs1235-G\")]\n >>> df = spark.createDataFrame(d, t.StringType())\n >>> df.withColumn('normalised', StudyLocusGWASCatalog._normalise_risk_allele(f.col('value'))).show()\n +------------------+----------+\n | value|normalised|\n +------------------+----------+\n | rs1234-A-G| A|\n | rs1234-A| A|\n |rs1234-A; rs1235-G| A|\n +------------------+----------+\n <BLANKLINE>\n\n \"\"\"\n # GWAS Catalog to risk allele mapping\n return f.split(f.split(risk_allele, \"; \").getItem(0), \"-\").getItem(1)\n\n @staticmethod\n def 
_collect_rsids(\n snp_id: Column, snp_id_current: Column, risk_allele: Column\n ) -> Column:\n\"\"\"It takes three columns, and returns an array of distinct values from those columns.\n\n Args:\n snp_id (Column): The original snp id from the GWAS catalog.\n snp_id_current (Column): The current snp id field is just a number at the moment (stored as a string). Adding 'rs' prefix if looks good.\n risk_allele (Column): The risk allele for the SNP.\n\n Returns:\n An array of distinct values.\n \"\"\"\n # The current snp id field is just a number at the moment (stored as a string). Adding 'rs' prefix if looks good.\n snp_id_current = f.when(\n snp_id_current.rlike(\"^[0-9]*$\"),\n f.format_string(\"rs%s\", snp_id_current),\n )\n # Cleaning risk allele:\n risk_allele = f.split(risk_allele, \"-\").getItem(0)\n\n # Collecting all values:\n return f.array_distinct(f.array(snp_id, snp_id_current, risk_allele))\n\n @staticmethod\n def _map_to_variant_annotation_variants(\n gwas_associations: DataFrame, variant_annotation: VariantAnnotation\n ) -> DataFrame:\n\"\"\"Add variant metadata in associations.\n\n Args:\n gwas_associations (DataFrame): raw GWAS Catalog associations\n variant_annotation (VariantAnnotation): variant annotation dataset\n\n Returns:\n DataFrame: GWAS Catalog associations data including `variantId`, `referenceAllele`,\n `alternateAllele`, `chromosome`, `position` with variant metadata\n \"\"\"\n # Subset of GWAS Catalog associations required for resolving variant IDs:\n gwas_associations_subset = gwas_associations.select(\n \"studyLocusId\",\n f.col(\"CHR_ID\").alias(\"chromosome\"),\n f.col(\"CHR_POS\").cast(IntegerType()).alias(\"position\"),\n # List of all SNPs associated with the variant\n StudyLocusGWASCatalog._collect_rsids(\n f.split(f.col(\"SNPS\"), \"; \").getItem(0),\n f.col(\"SNP_ID_CURRENT\"),\n f.split(f.col(\"STRONGEST SNP-RISK ALLELE\"), \"; \").getItem(0),\n ).alias(\"rsIdsGwasCatalog\"),\n StudyLocusGWASCatalog._normalise_risk_allele(\n f.col(\"STRONGEST SNP-RISK ALLELE\")\n ).alias(\"riskAllele\"),\n )\n\n # Subset of variant annotation required for GWAS Catalog annotations:\n va_subset = variant_annotation.df.select(\n \"variantId\",\n \"chromosome\",\n \"position\",\n f.col(\"rsIds\").alias(\"rsIdsGnomad\"),\n \"referenceAllele\",\n \"alternateAllele\",\n \"alleleFrequencies\",\n variant_annotation.max_maf().alias(\"maxMaf\"),\n ).join(\n f.broadcast(\n gwas_associations_subset.select(\"chromosome\", \"position\").distinct()\n ),\n on=[\"chromosome\", \"position\"],\n how=\"inner\",\n )\n\n # Semi-resolved ids (still contains duplicates when conclusion was not possible to make\n # based on rsIds or allele concordance)\n filtered_associations = (\n gwas_associations_subset.join(\n f.broadcast(va_subset),\n on=[\"chromosome\", \"position\"],\n how=\"left\",\n )\n .withColumn(\n \"rsIdFilter\",\n StudyLocusGWASCatalog._flag_mappings_to_retain(\n f.col(\"studyLocusId\"),\n StudyLocusGWASCatalog._compare_rsids(\n f.col(\"rsIdsGnomad\"), f.col(\"rsIdsGwasCatalog\")\n ),\n ),\n )\n .withColumn(\n \"concordanceFilter\",\n StudyLocusGWASCatalog._flag_mappings_to_retain(\n f.col(\"studyLocusId\"),\n StudyLocusGWASCatalog._check_concordance(\n f.col(\"riskAllele\"),\n f.col(\"referenceAllele\"),\n f.col(\"alternateAllele\"),\n ),\n ),\n )\n .filter(\n # Filter out rows where GWAS Catalog rsId does not match with GnomAD rsId,\n # but there is corresponding variant for the same association\n f.col(\"rsIdFilter\")\n # or filter out rows where GWAS Catalog alleles are not 
concordant with GnomAD alleles,\n # but there is corresponding variant for the same association\n | f.col(\"concordanceFilter\")\n )\n )\n\n # Keep only highest maxMaf variant per studyLocusId\n fully_mapped_associations = get_record_with_maximum_value(\n filtered_associations, grouping_col=\"studyLocusId\", sorting_col=\"maxMaf\"\n ).select(\n \"studyLocusId\",\n \"variantId\",\n \"referenceAllele\",\n \"alternateAllele\",\n \"chromosome\",\n \"position\",\n )\n\n return gwas_associations.join(\n fully_mapped_associations, on=\"studyLocusId\", how=\"left\"\n )\n\n @staticmethod\n def _compare_rsids(gnomad: Column, gwas: Column) -> Column:\n\"\"\"If the intersection of the two arrays is greater than 0, return True, otherwise return False.\n\n Args:\n gnomad (Column): rsids from gnomad\n gwas (Column): rsids from the GWAS Catalog\n\n Returns:\n A boolean column that is true if the GnomAD rsIDs can be found in the GWAS rsIDs.\n\n Examples:\n >>> d = [\n ... (1, [\"rs123\", \"rs523\"], [\"rs123\"]),\n ... (2, [], [\"rs123\"]),\n ... (3, [\"rs123\", \"rs523\"], []),\n ... (4, [], []),\n ... ]\n >>> df = spark.createDataFrame(d, ['associationId', 'gnomad', 'gwas'])\n >>> df.withColumn(\"rsid_matches\", StudyLocusGWASCatalog._compare_rsids(f.col(\"gnomad\"),f.col('gwas'))).show()\n +-------------+--------------+-------+------------+\n |associationId| gnomad| gwas|rsid_matches|\n +-------------+--------------+-------+------------+\n | 1|[rs123, rs523]|[rs123]| true|\n | 2| []|[rs123]| false|\n | 3|[rs123, rs523]| []| false|\n | 4| []| []| false|\n +-------------+--------------+-------+------------+\n <BLANKLINE>\n\n \"\"\"\n return f.when(f.size(f.array_intersect(gnomad, gwas)) > 0, True).otherwise(\n False\n )\n\n @staticmethod\n def _flag_mappings_to_retain(\n association_id: Column, filter_column: Column\n ) -> Column:\n\"\"\"Flagging mappings to drop for each association.\n\n Some associations have multiple mappings. Some has matching rsId others don't. We only\n want to drop the non-matching mappings, when a matching is available for the given association.\n This logic can be generalised for other measures eg. allele concordance.\n\n Args:\n association_id (Column): association identifier column\n filter_column (Column): boolean col indicating to keep a mapping\n\n Returns:\n A column with a boolean value.\n\n Examples:\n >>> d = [\n ... (1, False),\n ... (1, False),\n ... (2, False),\n ... (2, True),\n ... (3, True),\n ... (3, True),\n ... 
]\n >>> df = spark.createDataFrame(d, ['associationId', 'filter'])\n >>> df.withColumn(\"isConcordant\", StudyLocusGWASCatalog._flag_mappings_to_retain(f.col(\"associationId\"),f.col('filter'))).show()\n +-------------+------+------------+\n |associationId|filter|isConcordant|\n +-------------+------+------------+\n | 1| false| true|\n | 1| false| true|\n | 2| false| false|\n | 2| true| true|\n | 3| true| true|\n | 3| true| true|\n +-------------+------+------------+\n <BLANKLINE>\n\n \"\"\"\n w = Window.partitionBy(association_id)\n\n # Generating a boolean column informing if the filter column contains true anywhere for the association:\n aggregated_filter = f.when(\n f.array_contains(f.collect_set(filter_column).over(w), True), True\n ).otherwise(False)\n\n # Generate a filter column:\n return f.when(aggregated_filter & (~filter_column), False).otherwise(True)\n\n @staticmethod\n def _check_concordance(\n risk_allele: Column, reference_allele: Column, alternate_allele: Column\n ) -> Column:\n\"\"\"A function to check if the risk allele is concordant with the alt or ref allele.\n\n If the risk allele is the same as the reference or alternate allele, or if the reverse complement of\n the risk allele is the same as the reference or alternate allele, then the allele is concordant.\n If no mapping is available (ref/alt is null), the function returns True.\n\n Args:\n risk_allele (Column): The allele that is associated with the risk of the disease.\n reference_allele (Column): The reference allele from the GWAS catalog\n alternate_allele (Column): The alternate allele of the variant.\n\n Returns:\n A boolean column that is True if the risk allele is the same as the reference or alternate allele,\n or if the reverse complement of the risk allele is the same as the reference or alternate allele.\n\n Examples:\n >>> d = [\n ... ('A', 'A', 'G'),\n ... ('A', 'T', 'G'),\n ... ('A', 'C', 'G'),\n ... ('A', 'A', '?'),\n ... (None, None, 'A'),\n ... 
]\n >>> df = spark.createDataFrame(d, ['riskAllele', 'referenceAllele', 'alternateAllele'])\n >>> df.withColumn(\"isConcordant\", StudyLocusGWASCatalog._check_concordance(f.col(\"riskAllele\"),f.col('referenceAllele'), f.col('alternateAllele'))).show()\n +----------+---------------+---------------+------------+\n |riskAllele|referenceAllele|alternateAllele|isConcordant|\n +----------+---------------+---------------+------------+\n | A| A| G| true|\n | A| T| G| true|\n | A| C| G| false|\n | A| A| ?| true|\n | null| null| A| true|\n +----------+---------------+---------------+------------+\n <BLANKLINE>\n\n \"\"\"\n # Calculating the reverse complement of the risk allele:\n risk_allele_reverse_complement = f.when(\n risk_allele.rlike(r\"^[ACTG]+$\"),\n f.reverse(f.translate(risk_allele, \"ACTG\", \"TGAC\")),\n ).otherwise(risk_allele)\n\n # OK, is the risk allele or the reverse complent is the same as the mapped alleles:\n return (\n f.when(\n (risk_allele == reference_allele) | (risk_allele == alternate_allele),\n True,\n )\n # If risk allele is found on the negative strand:\n .when(\n (risk_allele_reverse_complement == reference_allele)\n | (risk_allele_reverse_complement == alternate_allele),\n True,\n )\n # If risk allele is ambiguous, still accepted: < This condition could be reconsidered\n .when(risk_allele == \"?\", True)\n # If the association could not be mapped we keep it:\n .when(reference_allele.isNull(), True)\n # Allele is discordant:\n .otherwise(False)\n )\n\n @staticmethod\n def _get_reverse_complement(allele_col: Column) -> Column:\n\"\"\"A function to return the reverse complement of an allele column.\n\n It takes a string and returns the reverse complement of that string if it's a DNA sequence,\n otherwise it returns the original string. 
Assumes alleles in upper case.\n\n Args:\n allele_col (Column): The column containing the allele to reverse complement.\n\n Returns:\n A column that is the reverse complement of the allele column.\n\n Examples:\n >>> d = [{\"allele\": 'A'}, {\"allele\": 'T'},{\"allele\": 'G'}, {\"allele\": 'C'},{\"allele\": 'AC'}, {\"allele\": 'GTaatc'},{\"allele\": '?'}, {\"allele\": None}]\n >>> df = spark.createDataFrame(d)\n >>> df.withColumn(\"revcom_allele\", StudyLocusGWASCatalog._get_reverse_complement(f.col(\"allele\"))).show()\n +------+-------------+\n |allele|revcom_allele|\n +------+-------------+\n | A| T|\n | T| A|\n | G| C|\n | C| G|\n | AC| GT|\n |GTaatc| GATTAC|\n | ?| ?|\n | null| null|\n +------+-------------+\n <BLANKLINE>\n\n \"\"\"\n allele_col = f.upper(allele_col)\n return f.when(\n allele_col.rlike(\"[ACTG]+\"),\n f.reverse(f.translate(allele_col, \"ACTG\", \"TGAC\")),\n ).otherwise(allele_col)\n\n @staticmethod\n def _effect_needs_harmonisation(\n risk_allele: Column, reference_allele: Column\n ) -> Column:\n\"\"\"A function to check if the effect allele needs to be harmonised.\n\n Args:\n risk_allele (Column): Risk allele column\n reference_allele (Column): Effect allele column\n\n Returns:\n A boolean column indicating if the effect allele needs to be harmonised.\n\n Examples:\n >>> d = [{\"risk\": 'A', \"reference\": 'A'}, {\"risk\": 'A', \"reference\": 'T'}, {\"risk\": 'AT', \"reference\": 'TA'}, {\"risk\": 'AT', \"reference\": 'AT'}]\n >>> df = spark.createDataFrame(d)\n >>> df.withColumn(\"needs_harmonisation\", StudyLocusGWASCatalog._effect_needs_harmonisation(f.col(\"risk\"), f.col(\"reference\"))).show()\n +---------+----+-------------------+\n |reference|risk|needs_harmonisation|\n +---------+----+-------------------+\n | A| A| true|\n | T| A| true|\n | TA| AT| false|\n | AT| AT| true|\n +---------+----+-------------------+\n <BLANKLINE>\n\n \"\"\"\n return (risk_allele == reference_allele) | (\n risk_allele\n == StudyLocusGWASCatalog._get_reverse_complement(reference_allele)\n )\n\n @staticmethod\n def _are_alleles_palindromic(\n reference_allele: Column, alternate_allele: Column\n ) -> Column:\n\"\"\"A function to check if the alleles are palindromic.\n\n Args:\n reference_allele (Column): Reference allele column\n alternate_allele (Column): Alternate allele column\n\n Returns:\n A boolean column indicating if the alleles are palindromic.\n\n Examples:\n >>> d = [{\"reference\": 'A', \"alternate\": 'T'}, {\"reference\": 'AT', \"alternate\": 'AG'}, {\"reference\": 'AT', \"alternate\": 'AT'}, {\"reference\": 'CATATG', \"alternate\": 'CATATG'}, {\"reference\": '-', \"alternate\": None}]\n >>> df = spark.createDataFrame(d)\n >>> df.withColumn(\"is_palindromic\", StudyLocusGWASCatalog._are_alleles_palindromic(f.col(\"reference\"), f.col(\"alternate\"))).show()\n +---------+---------+--------------+\n |alternate|reference|is_palindromic|\n +---------+---------+--------------+\n | T| A| true|\n | AG| AT| false|\n | AT| AT| true|\n | CATATG| CATATG| true|\n | null| -| false|\n +---------+---------+--------------+\n <BLANKLINE>\n\n \"\"\"\n revcomp = StudyLocusGWASCatalog._get_reverse_complement(alternate_allele)\n return (\n f.when(reference_allele == revcomp, True)\n .when(revcomp.isNull(), False)\n .otherwise(False)\n )\n\n @staticmethod\n def _harmonise_beta(\n risk_allele: Column,\n reference_allele: Column,\n alternate_allele: Column,\n effect_size: Column,\n confidence_interval: Column,\n ) -> Column:\n\"\"\"A function to extract the beta value from the effect size and 
confidence interval.\n\n If the confidence interval contains the word \"increase\" or \"decrease\" it indicates, we are dealing with betas.\n If it's \"increase\" and the effect size needs to be harmonized, then multiply the effect size by -1\n\n Args:\n risk_allele (Column): Risk allele column\n reference_allele (Column): Reference allele column\n alternate_allele (Column): Alternate allele column\n effect_size (Column): GWAS Catalog effect size column\n confidence_interval (Column): GWAS Catalog confidence interval column\n\n Returns:\n A column containing the beta value.\n \"\"\"\n return (\n f.when(\n StudyLocusGWASCatalog._are_alleles_palindromic(\n reference_allele, alternate_allele\n ),\n None,\n )\n .when(\n (\n StudyLocusGWASCatalog._effect_needs_harmonisation(\n risk_allele, reference_allele\n )\n & confidence_interval.contains(\"increase\")\n )\n | (\n ~StudyLocusGWASCatalog._effect_needs_harmonisation(\n risk_allele, reference_allele\n )\n & confidence_interval.contains(\"decrease\")\n ),\n -effect_size,\n )\n .otherwise(effect_size)\n .cast(DoubleType())\n )\n\n @staticmethod\n def _harmonise_beta_ci(\n risk_allele: Column,\n reference_allele: Column,\n alternate_allele: Column,\n effect_size: Column,\n confidence_interval: Column,\n p_value: Column,\n direction: str,\n ) -> Column:\n\"\"\"Calculating confidence intervals for beta values.\n\n Args:\n risk_allele (Column): Risk allele column\n reference_allele (Column): Reference allele column\n alternate_allele (Column): Alternate allele column\n effect_size (Column): GWAS Catalog effect size column\n confidence_interval (Column): GWAS Catalog confidence interval column\n p_value (Column): GWAS Catalog p-value column\n direction (str): This is the direction of the confidence interval. It can be either \"upper\" or \"lower\".\n\n Returns:\n The upper and lower bounds of the confidence interval for the beta coefficient.\n \"\"\"\n zscore_95 = f.lit(1.96)\n beta = StudyLocusGWASCatalog._harmonise_beta(\n risk_allele,\n reference_allele,\n alternate_allele,\n effect_size,\n confidence_interval,\n )\n zscore = pvalue_to_zscore(p_value)\n return (\n f.when(f.lit(direction) == \"upper\", beta + f.abs(zscore_95 * beta) / zscore)\n .when(f.lit(direction) == \"lower\", beta - f.abs(zscore_95 * beta) / zscore)\n .otherwise(None)\n )\n\n @staticmethod\n def _harmonise_odds_ratio(\n risk_allele: Column,\n reference_allele: Column,\n alternate_allele: Column,\n effect_size: Column,\n confidence_interval: Column,\n ) -> Column:\n\"\"\"Harmonizing odds ratio.\n\n Args:\n risk_allele (Column): Risk allele column\n reference_allele (Column): Reference allele column\n alternate_allele (Column): Alternate allele column\n effect_size (Column): GWAS Catalog effect size column\n confidence_interval (Column): GWAS Catalog confidence interval column\n\n Returns:\n A column with the odds ratio, or 1/odds_ratio if harmonization required.\n \"\"\"\n return (\n f.when(\n StudyLocusGWASCatalog._are_alleles_palindromic(\n reference_allele, alternate_allele\n ),\n None,\n )\n .when(\n (\n StudyLocusGWASCatalog._effect_needs_harmonisation(\n risk_allele, reference_allele\n )\n & ~confidence_interval.rlike(\"|\".join([\"decrease\", \"increase\"]))\n ),\n 1 / effect_size,\n )\n .otherwise(effect_size)\n .cast(DoubleType())\n )\n\n @staticmethod\n def _harmonise_odds_ratio_ci(\n risk_allele: Column,\n reference_allele: Column,\n alternate_allele: Column,\n effect_size: Column,\n confidence_interval: Column,\n p_value: Column,\n direction: str,\n ) -> 
Column:\n\"\"\"Calculating confidence intervals for beta values.\n\n Args:\n risk_allele (Column): Risk allele column\n reference_allele (Column): Reference allele column\n alternate_allele (Column): Alternate allele column\n effect_size (Column): GWAS Catalog effect size column\n confidence_interval (Column): GWAS Catalog confidence interval column\n p_value (Column): GWAS Catalog p-value column\n direction (str): This is the direction of the confidence interval. It can be either \"upper\" or \"lower\".\n\n Returns:\n The upper and lower bounds of the 95% confidence interval for the odds ratio.\n \"\"\"\n zscore_95 = f.lit(1.96)\n odds_ratio = StudyLocusGWASCatalog._harmonise_odds_ratio(\n risk_allele,\n reference_allele,\n alternate_allele,\n effect_size,\n confidence_interval,\n )\n odds_ratio_estimate = f.log(odds_ratio)\n zscore = pvalue_to_zscore(p_value)\n odds_ratio_se = odds_ratio_estimate / zscore\n return f.when(\n f.lit(direction) == \"upper\",\n f.exp(odds_ratio_estimate + f.abs(zscore_95 * odds_ratio_se)),\n ).when(\n f.lit(direction) == \"lower\",\n f.exp(odds_ratio_estimate - f.abs(zscore_95 * odds_ratio_se)),\n )\n\n @staticmethod\n def _concatenate_substudy_description(\n association_trait: Column, pvalue_text: Column, mapped_trait_uri: Column\n ) -> Column:\n\"\"\"Substudy description parsing. Complex string containing metadata about the substudy (e.g. QTL, specific EFO, etc.).\n\n Args:\n association_trait (Column): GWAS Catalog association trait column\n pvalue_text (Column): GWAS Catalog p-value text column\n mapped_trait_uri (Column): GWAS Catalog mapped trait URI column\n\n Returns:\n A column with the substudy description in the shape trait|pvaluetext1_pvaluetext2|EFO1_EFO2.\n\n Examples:\n >>> df = spark.createDataFrame([\n ... (\"Height\", \"http://www.ebi.ac.uk/efo/EFO_0000001,http://www.ebi.ac.uk/efo/EFO_0000002\", \"European Ancestry\"),\n ... (\"Schizophrenia\", \"http://www.ebi.ac.uk/efo/MONDO_0005090\", None)],\n ... [\"association_trait\", \"mapped_trait_uri\", \"pvalue_text\"]\n ... 
)\n >>> df.withColumn('substudy_description', StudyLocusGWASCatalog._concatenate_substudy_description(df.association_trait, df.pvalue_text, df.mapped_trait_uri)).show(truncate=False)\n +-----------------+-------------------------------------------------------------------------+-----------------+------------------------------------------+\n |association_trait|mapped_trait_uri |pvalue_text |substudy_description |\n +-----------------+-------------------------------------------------------------------------+-----------------+------------------------------------------+\n |Height |http://www.ebi.ac.uk/efo/EFO_0000001,http://www.ebi.ac.uk/efo/EFO_0000002|European Ancestry|Height|EA|EFO_0000001/EFO_0000002 |\n |Schizophrenia |http://www.ebi.ac.uk/efo/MONDO_0005090 |null |Schizophrenia|no_pvalue_text|MONDO_0005090|\n +-----------------+-------------------------------------------------------------------------+-----------------+------------------------------------------+\n <BLANKLINE>\n \"\"\"\n p_value_text = f.coalesce(\n StudyLocusGWASCatalog._normalise_pvaluetext(pvalue_text),\n f.array(f.lit(\"no_pvalue_text\")),\n )\n return f.concat_ws(\n \"|\",\n association_trait,\n f.concat_ws(\n \"/\",\n p_value_text,\n ),\n f.concat_ws(\n \"/\",\n parse_efos(mapped_trait_uri),\n ),\n )\n\n @staticmethod\n def _qc_all(\n qc: Column,\n chromosome: Column,\n position: Column,\n reference_allele: Column,\n alternate_allele: Column,\n strongest_snp_risk_allele: Column,\n p_value_mantissa: Column,\n p_value_exponent: Column,\n p_value_cutoff: float,\n ) -> Column:\n\"\"\"Flag associations that fail any QC.\n\n Args:\n qc (Column): QC column\n chromosome (Column): Chromosome column\n position (Column): Position column\n reference_allele (Column): Reference allele column\n alternate_allele (Column): Alternate allele column\n strongest_snp_risk_allele (Column): Strongest SNP risk allele column\n p_value_mantissa (Column): P-value mantissa column\n p_value_exponent (Column): P-value exponent column\n p_value_cutoff (float): P-value cutoff\n\n Returns:\n Column: Updated QC column with flag.\n \"\"\"\n qc = StudyLocusGWASCatalog._qc_variant_interactions(\n qc, strongest_snp_risk_allele\n )\n qc = StudyLocusGWASCatalog._qc_subsignificant_associations(\n qc, p_value_mantissa, p_value_exponent, p_value_cutoff\n )\n qc = StudyLocusGWASCatalog._qc_genomic_location(qc, chromosome, position)\n qc = StudyLocusGWASCatalog._qc_variant_inconsistencies(\n qc, chromosome, position, strongest_snp_risk_allele\n )\n qc = StudyLocusGWASCatalog._qc_unmapped_variants(qc, alternate_allele)\n qc = StudyLocusGWASCatalog._qc_palindromic_alleles(\n qc, reference_allele, alternate_allele\n )\n return qc\n\n @staticmethod\n def _qc_variant_interactions(\n qc: Column, strongest_snp_risk_allele: Column\n ) -> Column:\n\"\"\"Flag associations based on variant x variant interactions.\n\n Args:\n qc (Column): QC column\n strongest_snp_risk_allele (Column): Column with the strongest SNP risk allele\n\n Returns:\n Column: Updated QC column with flag.\n \"\"\"\n return StudyLocusGWASCatalog._update_quality_flag(\n qc,\n strongest_snp_risk_allele.contains(\";\"),\n StudyLocusQualityCheck.COMPOSITE_FLAG,\n )\n\n @staticmethod\n def _qc_subsignificant_associations(\n qc: Column,\n p_value_mantissa: Column,\n p_value_exponent: Column,\n pvalue_cutoff: float,\n ) -> Column:\n\"\"\"Flag associations below significant threshold.\n\n Args:\n qc (Column): QC column\n p_value_mantissa (Column): P-value mantissa column\n p_value_exponent (Column): P-value 
exponent column\n pvalue_cutoff (float): association p-value cut-off\n\n Returns:\n Column: Updated QC column with flag.\n\n Examples:\n >>> import pyspark.sql.types as t\n >>> d = [{'qc': None, 'p_value_mantissa': 1, 'p_value_exponent': -7}, {'qc': None, 'p_value_mantissa': 1, 'p_value_exponent': -8}, {'qc': None, 'p_value_mantissa': 5, 'p_value_exponent': -8}, {'qc': None, 'p_value_mantissa': 1, 'p_value_exponent': -9}]\n >>> df = spark.createDataFrame(d, t.StructType([t.StructField('qc', t.ArrayType(t.StringType()), True), t.StructField('p_value_mantissa', t.IntegerType()), t.StructField('p_value_exponent', t.IntegerType())]))\n >>> df.withColumn('qc', StudyLocusGWASCatalog._qc_subsignificant_associations(f.col(\"qc\"), f.col(\"p_value_mantissa\"), f.col(\"p_value_exponent\"), 5e-8)).show(truncate = False)\n +------------------------+----------------+----------------+\n |qc |p_value_mantissa|p_value_exponent|\n +------------------------+----------------+----------------+\n |[Subsignificant p-value]|1 |-7 |\n |[] |1 |-8 |\n |[] |5 |-8 |\n |[] |1 |-9 |\n +------------------------+----------------+----------------+\n <BLANKLINE>\n\n \"\"\"\n return StudyLocus._update_quality_flag(\n qc,\n calculate_neglog_pvalue(p_value_mantissa, p_value_exponent)\n < f.lit(-np.log10(pvalue_cutoff)),\n StudyLocusQualityCheck.SUBSIGNIFICANT_FLAG,\n )\n\n @staticmethod\n def _qc_genomic_location(\n qc: Column, chromosome: Column, position: Column\n ) -> Column:\n\"\"\"Flag associations without genomic location in GWAS Catalog.\n\n Args:\n qc (Column): QC column\n chromosome (Column): Chromosome column in GWAS Catalog\n position (Column): Position column in GWAS Catalog\n\n Returns:\n Column: Updated QC column with flag.\n\n Examples:\n >>> import pyspark.sql.types as t\n >>> d = [{'qc': None, 'chromosome': None, 'position': None}, {'qc': None, 'chromosome': '1', 'position': None}, {'qc': None, 'chromosome': None, 'position': 1}, {'qc': None, 'chromosome': '1', 'position': 1}]\n >>> df = spark.createDataFrame(d, schema=t.StructType([t.StructField('qc', t.ArrayType(t.StringType()), True), t.StructField('chromosome', t.StringType()), t.StructField('position', t.IntegerType())]))\n >>> df.withColumn('qc', StudyLocusGWASCatalog._qc_genomic_location(df.qc, df.chromosome, df.position)).show(truncate=False)\n +----------------------------+----------+--------+\n |qc |chromosome|position|\n +----------------------------+----------+--------+\n |[Incomplete genomic mapping]|null |null |\n |[Incomplete genomic mapping]|1 |null |\n |[Incomplete genomic mapping]|null |1 |\n |[] |1 |1 |\n +----------------------------+----------+--------+\n <BLANKLINE>\n\n \"\"\"\n return StudyLocus._update_quality_flag(\n qc,\n position.isNull() | chromosome.isNull(),\n StudyLocusQualityCheck.NO_GENOMIC_LOCATION_FLAG,\n )\n\n @staticmethod\n def _qc_variant_inconsistencies(\n qc: Column,\n chromosome: Column,\n position: Column,\n strongest_snp_risk_allele: Column,\n ) -> Column:\n\"\"\"Flag associations with inconsistencies in the variant annotation.\n\n Args:\n qc (Column): QC column\n chromosome (Column): Chromosome column in GWAS Catalog\n position (Column): Position column in GWAS Catalog\n strongest_snp_risk_allele (Column): Strongest SNP risk allele column in GWAS Catalog\n\n Returns:\n Column: Updated QC column with flag.\n \"\"\"\n return StudyLocusGWASCatalog._update_quality_flag(\n qc,\n # Number of chromosomes does not correspond to the number of positions:\n (f.size(f.split(chromosome, \";\")) != f.size(f.split(position, 
\";\")))\n # Number of chromosome values different from riskAllele values:\n | (\n f.size(f.split(chromosome, \";\"))\n != f.size(f.split(strongest_snp_risk_allele, \";\"))\n ),\n StudyLocusQualityCheck.INCONSISTENCY_FLAG,\n )\n\n @staticmethod\n def _qc_unmapped_variants(qc: Column, alternate_allele: Column) -> Column:\n\"\"\"Flag associations with variants not mapped to variantAnnotation.\n\n Args:\n qc (Column): QC column\n alternate_allele (Column): alternate allele\n\n Returns:\n Column: Updated QC column with flag.\n\n Example:\n >>> import pyspark.sql.types as t\n >>> d = [{'alternate_allele': 'A', 'qc': None}, {'alternate_allele': None, 'qc': None}]\n >>> schema = t.StructType([t.StructField('alternate_allele', t.StringType(), True), t.StructField('qc', t.ArrayType(t.StringType()), True)])\n >>> df = spark.createDataFrame(data=d, schema=schema)\n >>> df.withColumn(\"new_qc\", StudyLocusGWASCatalog._qc_unmapped_variants(f.col(\"qc\"), f.col(\"alternate_allele\"))).show()\n +----------------+----+--------------------+\n |alternate_allele| qc| new_qc|\n +----------------+----+--------------------+\n | A|null| []|\n | null|null|[No mapping in Gn...|\n +----------------+----+--------------------+\n <BLANKLINE>\n\n \"\"\"\n return StudyLocus._update_quality_flag(\n qc,\n alternate_allele.isNull(),\n StudyLocusQualityCheck.NON_MAPPED_VARIANT_FLAG,\n )\n\n @staticmethod\n def _qc_palindromic_alleles(\n qc: Column, reference_allele: Column, alternate_allele: Column\n ) -> Column:\n\"\"\"Flag associations with palindromic variants which effects can not be harmonised.\n\n Args:\n qc (Column): QC column\n reference_allele (Column): reference allele\n alternate_allele (Column): alternate allele\n\n Returns:\n Column: Updated QC column with flag.\n\n Example:\n >>> import pyspark.sql.types as t\n >>> schema = t.StructType([t.StructField('reference_allele', t.StringType(), True), t.StructField('alternate_allele', t.StringType(), True), t.StructField('qc', t.ArrayType(t.StringType()), True)])\n >>> d = [{'reference_allele': 'A', 'alternate_allele': 'T', 'qc': None}, {'reference_allele': 'AT', 'alternate_allele': 'TA', 'qc': None}, {'reference_allele': 'AT', 'alternate_allele': 'AT', 'qc': None}]\n >>> df = spark.createDataFrame(data=d, schema=schema)\n >>> df.withColumn(\"qc\", StudyLocusGWASCatalog._qc_palindromic_alleles(f.col(\"qc\"), f.col(\"reference_allele\"), f.col(\"alternate_allele\"))).show(truncate=False)\n +----------------+----------------+---------------------------------------+\n |reference_allele|alternate_allele|qc |\n +----------------+----------------+---------------------------------------+\n |A |T |[Palindrome alleles - cannot harmonize]|\n |AT |TA |[] |\n |AT |AT |[Palindrome alleles - cannot harmonize]|\n +----------------+----------------+---------------------------------------+\n <BLANKLINE>\n\n \"\"\"\n return StudyLocus._update_quality_flag(\n qc,\n StudyLocusGWASCatalog._are_alleles_palindromic(\n reference_allele, alternate_allele\n ),\n StudyLocusQualityCheck.PALINDROMIC_ALLELE_FLAG,\n )\n\n @classmethod\n def from_source(\n cls: type[StudyLocusGWASCatalog],\n gwas_associations: DataFrame,\n variant_annotation: VariantAnnotation,\n pvalue_threshold: float = 5e-8,\n ) -> StudyLocusGWASCatalog:\n\"\"\"Read GWASCatalog associations.\n\n It reads the GWAS Catalog association dataset, selects and renames columns, casts columns, and\n applies some pre-defined filters on the data:\n\n Args:\n gwas_associations (DataFrame): GWAS Catalog raw associations dataset\n 
variant_annotation (VariantAnnotation): Variant annotation dataset\n pvalue_threshold (float): P-value threshold for flagging associations\n\n Returns:\n StudyLocusGWASCatalog: StudyLocusGWASCatalog dataset\n \"\"\"\n return cls(\n _df=gwas_associations.withColumn(\n \"studyLocusId\", f.monotonically_increasing_id().cast(LongType())\n )\n .transform(\n # Map/harmonise variants to variant annotation dataset:\n # This function adds columns: variantId, referenceAllele, alternateAllele, chromosome, position\n lambda df: StudyLocusGWASCatalog._map_to_variant_annotation_variants(\n df, variant_annotation\n )\n )\n .withColumn(\n # Perform all quality control checks:\n \"qualityControls\",\n StudyLocusGWASCatalog._qc_all(\n f.array().alias(\"qualityControls\"),\n f.col(\"CHR_ID\"),\n f.col(\"CHR_POS\").cast(IntegerType()),\n f.col(\"referenceAllele\"),\n f.col(\"alternateAllele\"),\n f.col(\"STRONGEST SNP-RISK ALLELE\"),\n *StudyLocusGWASCatalog._parse_pvalue(f.col(\"P-VALUE\")),\n pvalue_threshold,\n ),\n )\n .select(\n # INSIDE STUDY-LOCUS SCHEMA:\n \"studyLocusId\",\n \"variantId\",\n # Mapped genomic location of the variant (; separated list)\n \"chromosome\",\n \"position\",\n f.col(\"STUDY ACCESSION\").alias(\"studyId\"),\n # beta value of the association\n StudyLocusGWASCatalog._harmonise_beta(\n StudyLocusGWASCatalog._normalise_risk_allele(\n f.col(\"STRONGEST SNP-RISK ALLELE\")\n ),\n f.col(\"referenceAllele\"),\n f.col(\"alternateAllele\"),\n f.col(\"OR or BETA\"),\n f.col(\"95% CI (TEXT)\"),\n ).alias(\"beta\"),\n # odds ratio of the association\n StudyLocusGWASCatalog._harmonise_odds_ratio(\n StudyLocusGWASCatalog._normalise_risk_allele(\n f.col(\"STRONGEST SNP-RISK ALLELE\")\n ),\n f.col(\"referenceAllele\"),\n f.col(\"alternateAllele\"),\n f.col(\"OR or BETA\"),\n f.col(\"95% CI (TEXT)\"),\n ).alias(\"oddsRatio\"),\n # CI lower of the beta value\n StudyLocusGWASCatalog._harmonise_beta_ci(\n StudyLocusGWASCatalog._normalise_risk_allele(\n f.col(\"STRONGEST SNP-RISK ALLELE\")\n ),\n f.col(\"referenceAllele\"),\n f.col(\"alternateAllele\"),\n f.col(\"OR or BETA\"),\n f.col(\"95% CI (TEXT)\"),\n f.col(\"P-VALUE\"),\n \"lower\",\n ).alias(\"betaConfidenceIntervalLower\"),\n # CI upper for the beta value\n StudyLocusGWASCatalog._harmonise_beta_ci(\n StudyLocusGWASCatalog._normalise_risk_allele(\n f.col(\"STRONGEST SNP-RISK ALLELE\")\n ),\n f.col(\"referenceAllele\"),\n f.col(\"alternateAllele\"),\n f.col(\"OR or BETA\"),\n f.col(\"95% CI (TEXT)\"),\n f.col(\"P-VALUE\"),\n \"upper\",\n ).alias(\"betaConfidenceIntervalUpper\"),\n # CI lower of the odds ratio value\n StudyLocusGWASCatalog._harmonise_odds_ratio_ci(\n StudyLocusGWASCatalog._normalise_risk_allele(\n f.col(\"STRONGEST SNP-RISK ALLELE\")\n ),\n f.col(\"referenceAllele\"),\n f.col(\"alternateAllele\"),\n f.col(\"OR or BETA\"),\n f.col(\"95% CI (TEXT)\"),\n f.col(\"P-VALUE\"),\n \"lower\",\n ).alias(\"oddsRatioConfidenceIntervalLower\"),\n # CI upper of the odds ratio value\n StudyLocusGWASCatalog._harmonise_odds_ratio_ci(\n StudyLocusGWASCatalog._normalise_risk_allele(\n f.col(\"STRONGEST SNP-RISK ALLELE\")\n ),\n f.col(\"referenceAllele\"),\n f.col(\"alternateAllele\"),\n f.col(\"OR or BETA\"),\n f.col(\"95% CI (TEXT)\"),\n f.col(\"P-VALUE\"),\n \"upper\",\n ).alias(\"oddsRatioConfidenceIntervalUpper\"),\n # p-value of the association, string: split into exponent and mantissa.\n *StudyLocusGWASCatalog._parse_pvalue(f.col(\"P-VALUE\")),\n # Capturing phenotype granularity at the association level\n 
StudyLocusGWASCatalog._concatenate_substudy_description(\n f.col(\"DISEASE/TRAIT\"),\n f.col(\"P-VALUE (TEXT)\"),\n f.col(\"MAPPED_TRAIT_URI\"),\n ).alias(\"subStudyDescription\"),\n # Quality controls (array of strings)\n \"qualityControls\",\n )\n )\n\n def update_study_id(\n self: StudyLocusGWASCatalog, study_annotation: DataFrame\n ) -> StudyLocusGWASCatalog:\n\"\"\"Update studyId with a dataframe containing study.\n\n Args:\n study_annotation (DataFrame): Dataframe containing `updatedStudyId` and key columns `studyId` and `subStudyDescription`.\n\n Returns:\n StudyLocusGWASCatalog: Updated study locus.\n \"\"\"\n self.df = (\n self._df.join(\n study_annotation, on=[\"studyId\", \"subStudyDescription\"], how=\"left\"\n )\n .withColumn(\"studyId\", f.coalesce(\"updatedStudyId\", \"studyId\"))\n .drop(\"subStudyDescription\", \"updatedStudyId\")\n )\n return self\n\n def annotate_ld(\n self: StudyLocusGWASCatalog,\n session: Session,\n studies: StudyIndexGWASCatalog,\n ld_populations: list[str],\n ld_index_template: str,\n ld_matrix_template: str,\n min_r2: float,\n ) -> StudyLocus:\n\"\"\"Annotate LD set for every studyLocus using gnomAD.\n\n Args:\n session (Session): Session\n studies (StudyIndexGWASCatalog): Study index containing ancestry information\n ld_populations (list[str]): List of populations to annotate\n ld_index_template (str): Template path of the LD matrix index containing `{POP}` where the population is expected\n ld_matrix_template (str): Template path of the LD matrix containing `{POP}` where the population is expected\n min_r2 (float): Minimum r2 to include in the LD set\n\n Returns:\n StudyLocus: Study-locus with an annotated credible set.\n \"\"\"\n # TODO: call unique_study_locus_ancestries here so that it is not duplicated with ld_annotation_by_locus_ancestry\n # LD annotation for all unique lead variants in all populations (study independent).\n ld_r = LDAnnotatorGnomad.ld_annotation_by_locus_ancestry(\n session,\n self,\n studies,\n ld_populations,\n ld_index_template,\n ld_matrix_template,\n min_r2,\n ).coalesce(400)\n\n ld_set = (\n self.unique_study_locus_ancestries(studies)\n .join(ld_r, on=[\"chromosome\", \"variantId\", \"gnomadPopulation\"], how=\"left\")\n .withColumn(\"r2\", f.pow(f.col(\"r\"), f.lit(2)))\n .withColumn(\n \"r2Overall\",\n LDAnnotatorGnomad.weighted_r_overall(\n f.col(\"chromosome\"),\n f.col(\"studyId\"),\n f.col(\"variantId\"),\n f.col(\"tagVariantId\"),\n f.col(\"relativeSampleSize\"),\n f.col(\"r2\"),\n ),\n )\n .groupBy(\"chromosome\", \"studyId\", \"variantId\")\n .agg(\n f.collect_set(\n f.when(\n f.col(\"tagVariantId\").isNotNull(),\n f.struct(\"tagVariantId\", \"r2Overall\"),\n )\n ).alias(\"credibleSet\")\n )\n )\n\n self.df = self.df.join(\n ld_set, on=[\"chromosome\", \"studyId\", \"variantId\"], how=\"left\"\n )\n\n return self._qc_unresolved_ld()\n\n def _qc_ambiguous_study(self: StudyLocusGWASCatalog) -> StudyLocusGWASCatalog:\n\"\"\"Flag associations with variants that can not be unambiguously associated with one study.\n\n Returns:\n StudyLocusGWASCatalog: Updated study locus.\n \"\"\"\n assoc_ambiguity_window = Window.partitionBy(\n f.col(\"studyId\"), f.col(\"variantId\")\n )\n\n self._df.withColumn(\n \"qualityControls\",\n StudyLocus._update_quality_flag(\n f.col(\"qualityControls\"),\n f.count(f.col(\"variantId\")).over(assoc_ambiguity_window) > 1,\n StudyLocusQualityCheck.AMBIGUOUS_STUDY,\n ),\n )\n return self\n\n def _qc_unresolved_ld(self: StudyLocusGWASCatalog) -> StudyLocusGWASCatalog:\n\"\"\"Flag 
associations with variants that are not found in the LD reference.\n\n Returns:\n StudyLocusGWASCatalog: Updated study locus.\n \"\"\"\n self._df.withColumn(\n \"qualityControls\",\n StudyLocus._update_quality_flag(\n f.col(\"qualityControls\"),\n f.col(\"credibleSet\").isNull(),\n StudyLocusQualityCheck.UNRESOLVED_LD,\n ),\n )\n return self\n
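To summarise the harmonisation rules implemented by `_are_alleles_palindromic`, `_effect_needs_harmonisation` and `_harmonise_beta`, here is a toy pure-Python restatement (illustrative only; the real logic runs as Spark column expressions):

```python
def revcomp(allele: str) -> str:
    """Reverse complement of a simple ACGT allele (toy helper)."""
    return allele.upper()[::-1].translate(str.maketrans("ACTG", "TGAC"))

def harmonise_beta_py(risk, ref, alt, beta, ci_text):
    """Pure-Python restatement of the _harmonise_beta decision rules."""
    if ref == revcomp(alt):  # palindromic alleles: direction is ambiguous
        return None
    needs_flip = risk == ref or risk == revcomp(ref)
    if (needs_flip and "increase" in ci_text) or (
        not needs_flip and "decrease" in ci_text
    ):
        return -beta
    return beta

# Risk allele equal to the reference allele, with an "increase" unit text:
print(harmonise_beta_py("A", "A", "G", 0.12, "[0.1-0.14] unit increase"))  # -0.12
```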
Annotate LD set for every studyLocus using gnomAD.
Parameters:
| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `session` | `Session` | Session | required |
| `studies` | `StudyIndexGWASCatalog` | Study index containing ancestry information | required |
| `ld_populations` | `list[str]` | List of populations to annotate | required |
| `ld_index_template` | `str` | Template path of the LD matrix index containing `{POP}` where the population is expected | required |
| `ld_matrix_template` | `str` | Template path of the LD matrix containing `{POP}` where the population is expected | required |
| `min_r2` | `float` | Minimum r2 to include in the LD set | required |
Returns:
| Name | Type | Description |
| --- | --- | --- |
| `StudyLocus` | `StudyLocus` | Study-locus with an annotated credible set. |
Source code in src/otg/dataset/study_locus.py
def annotate_ld(\n self: StudyLocusGWASCatalog,\n session: Session,\n studies: StudyIndexGWASCatalog,\n ld_populations: list[str],\n ld_index_template: str,\n ld_matrix_template: str,\n min_r2: float,\n) -> StudyLocus:\n\"\"\"Annotate LD set for every studyLocus using gnomAD.\n\n Args:\n session (Session): Session\n studies (StudyIndexGWASCatalog): Study index containing ancestry information\n ld_populations (list[str]): List of populations to annotate\n ld_index_template (str): Template path of the LD matrix index containing `{POP}` where the population is expected\n ld_matrix_template (str): Template path of the LD matrix containing `{POP}` where the population is expected\n min_r2 (float): Minimum r2 to include in the LD set\n\n Returns:\n StudyLocus: Study-locus with an annotated credible set.\n \"\"\"\n # TODO: call unique_study_locus_ancestries here so that it is not duplicated with ld_annotation_by_locus_ancestry\n # LD annotation for all unique lead variants in all populations (study independent).\n ld_r = LDAnnotatorGnomad.ld_annotation_by_locus_ancestry(\n session,\n self,\n studies,\n ld_populations,\n ld_index_template,\n ld_matrix_template,\n min_r2,\n ).coalesce(400)\n\n ld_set = (\n self.unique_study_locus_ancestries(studies)\n .join(ld_r, on=[\"chromosome\", \"variantId\", \"gnomadPopulation\"], how=\"left\")\n .withColumn(\"r2\", f.pow(f.col(\"r\"), f.lit(2)))\n .withColumn(\n \"r2Overall\",\n LDAnnotatorGnomad.weighted_r_overall(\n f.col(\"chromosome\"),\n f.col(\"studyId\"),\n f.col(\"variantId\"),\n f.col(\"tagVariantId\"),\n f.col(\"relativeSampleSize\"),\n f.col(\"r2\"),\n ),\n )\n .groupBy(\"chromosome\", \"studyId\", \"variantId\")\n .agg(\n f.collect_set(\n f.when(\n f.col(\"tagVariantId\").isNotNull(),\n f.struct(\"tagVariantId\", \"r2Overall\"),\n )\n ).alias(\"credibleSet\")\n )\n )\n\n self.df = self.df.join(\n ld_set, on=[\"chromosome\", \"studyId\", \"variantId\"], how=\"left\"\n )\n\n return self._qc_unresolved_ld()\n
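A hedged usage sketch; the bucket paths, population codes and r2 cut-off below are illustrative assumptions, not values from the source (`study_locus` is assumed to be a `StudyLocusGWASCatalog` and `study_index` a `StudyIndexGWASCatalog`):

```python
annotated = study_locus.annotate_ld(
    session=session,
    studies=study_index,
    ld_populations=["nfe", "afr"],  # assumed population codes
    ld_index_template="gs://my-bucket/ld_index/{POP}.parquet",  # assumed path
    ld_matrix_template="gs://my-bucket/ld_matrix/{POP}.bm",  # assumed path
    min_r2=0.5,  # assumed cut-off
)
```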
Clumping is a commonly used post-processing step that identifies independent association signals from GWAS summary statistics and curated associations. This step is critical because of the complex linkage disequilibrium (LD) structure of human populations, which can produce multiple statistically significant associations within the same genomic region. Clumping reduces redundancy in GWAS results and ensures that each reported association represents an independent signal.
We have implemented two clumping methods:
"},{"location":"components/method/clumping/#clumping-based-on-linkage-disequilibrium-ld","title":"Clumping based on Linkage Disequilibrium (LD)","text":"
LD clumping reports the most significant genetic associations in a region in terms of a smaller number of \u201cclumps\u201d of genetically linked SNPs.
Source code in src/otg/method/clump.py
class LDclumping:\n\"\"\"LD clumping reports the most significant genetic associations in a region in terms of a smaller number of \u201cclumps\u201d of genetically linked SNPs.\"\"\"\n\n @staticmethod\n def _is_lead_linked(\n study_id: Column,\n variant_id: Column,\n p_value_exponent: Column,\n p_value_mantissa: Column,\n credible_set: Column,\n ) -> Column:\n\"\"\"Evaluates whether a lead variant is linked to a tag (with lowest p-value) in the same studyLocus dataset.\n\n Args:\n study_id (Column): studyId\n variant_id (Column): Lead variant id\n p_value_exponent (Column): p-value exponent\n p_value_mantissa (Column): p-value mantissa\n credible_set (Column): Credible set <array of structs>\n\n Returns:\n Column: Boolean in which True indicates that the lead is linked to another tag in the same dataset.\n \"\"\"\n leads_in_study = f.collect_set(variant_id).over(Window.partitionBy(study_id))\n tags_in_studylocus = f.array_union(\n # Get all tag variants from the credible set per studyLocusId\n f.transform(credible_set, lambda x: x.tagVariantId),\n # And append the lead variant so that the intersection is the same for all studyLocusIds in a study\n f.array(f.col(\"variantId\")),\n )\n intersect_lead_tags = f.array_sort(\n f.array_intersect(leads_in_study, tags_in_studylocus)\n )\n return (\n # If the lead is in the credible set, we rank the peaks by p-value\n f.when(\n f.size(intersect_lead_tags) > 0,\n f.row_number().over(\n Window.partitionBy(study_id, intersect_lead_tags).orderBy(\n p_value_exponent, p_value_mantissa\n )\n )\n > 1,\n )\n # If the intersection is empty (lead is not in the credible set or cred set is empty), the association is not linked\n .otherwise(f.lit(False))\n )\n\n @classmethod\n def clump(cls: type[LDclumping], associations: StudyLocus) -> StudyLocus:\n\"\"\"Perform clumping on studyLocus dataset.\n\n Args:\n associations (StudyLocus): StudyLocus dataset\n\n Returns:\n StudyLocus: including flag and removing credibleSet information for LD clumped loci.\n \"\"\"\n return associations.clump()\n
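A short usage sketch (`associations` is assumed to be a `StudyLocus` dataset whose `credibleSet` column has already been LD-annotated):

```python
clumped = LDclumping.clump(associations)

# Per the docstring above, clumped loci are flagged (see
# StudyLocusQualityCheck.LD_CLUMPED) and their credibleSet information removed,
# rather than the rows being dropped.
clumped.df.select("studyLocusId", "qualityControls").show(5, truncate=False)
```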
Calculate Bayesian colocalisation based on overlapping signals from credible sets.
Based on the R COLOC package, which uses the Bayes factors from the credible set to estimate the posterior probability of colocalisation. This method makes the simplifying assumption that only one single causal variant exists for any given trait in any genomic region.
| Hypothesis | Description |
| --- | --- |
| H0 | no association with either trait in the region |
| H1 | association with trait 1 only |
| H2 | association with trait 2 only |
| H3 | both traits are associated, but have different single causal variants |
| H4 | both traits are associated and share the same single causal variant |
Approximate Bayes factors required
Coloc requires the availability of approximate Bayes factors (ABF) for each variant in the credible set (logABF column).
Source code in src/otg/method/colocalisation.py
class Coloc:\n\"\"\"Calculate bayesian colocalisation based on overlapping signals from credible sets.\n\n Based on the [R COLOC package](https://github.com/chr1swallace/coloc/blob/main/R/claudia.R), which uses the Bayes factors from the credible set to estimate the posterior probability of colocalisation. This method makes the simplifying assumption that **only one single causal variant** exists for any given trait in any genomic region.\n\n | Hypothesis | Description |\n | ------------- | --------------------------------------------------------------------- |\n | H<sub>0</sub> | no association with either trait in the region |\n | H<sub>1</sub> | association with trait 1 only |\n | H<sub>2</sub> | association with trait 2 only |\n | H<sub>3</sub> | both traits are associated, but have different single causal variants |\n | H<sub>4</sub> | both traits are associated and share the same single causal variant |\n\n !!! warning \"Approximate Bayes factors required\"\n Coloc requires the availability of approximate Bayes factors (ABF) for each variant in the credible set (`logABF` column).\n\n \"\"\"\n\n @staticmethod\n def _get_logsum(log_abf: ndarray) -> float:\n\"\"\"Calculates logsum of vector.\n\n This function calculates the log of the sum of the exponentiated\n logs taking out the max, i.e. insuring that the sum is not Inf\n\n Args:\n log_abf (ndarray): log approximate bayes factor\n\n Returns:\n float: logsum\n\n Example:\n >>> l = [0.2, 0.1, 0.05, 0]\n >>> round(Coloc._get_logsum(l), 6)\n 1.476557\n \"\"\"\n themax = np.max(log_abf)\n result = themax + np.log(np.sum(np.exp(log_abf - themax)))\n return float(result)\n\n @staticmethod\n def _get_posteriors(all_abfs: ndarray) -> DenseVector:\n\"\"\"Calculate posterior probabilities for each hypothesis.\n\n Args:\n all_abfs (ndarray): h0-h4 bayes factors\n\n Returns:\n DenseVector: Posterior\n\n Example:\n >>> l = np.array([0.2, 0.1, 0.05, 0])\n >>> Coloc._get_posteriors(l)\n DenseVector([0.279, 0.2524, 0.2401, 0.2284])\n \"\"\"\n diff = all_abfs - Coloc._get_logsum(all_abfs)\n abfs_posteriors = np.exp(diff)\n return Vectors.dense(abfs_posteriors)\n\n @classmethod\n def colocalise(\n cls: type[Coloc],\n overlapping_signals: StudyLocusOverlap,\n priorc1: float = 1e-4,\n priorc2: float = 1e-4,\n priorc12: float = 1e-5,\n ) -> Colocalisation:\n\"\"\"Calculate bayesian colocalisation based on overlapping signals.\n\n Args:\n overlapping_signals (StudyLocusOverlap): overlapping peaks\n priorc1 (float): Prior on variant being causal for trait 1. Defaults to 1e-4.\n priorc2 (float): Prior on variant being causal for trait 2. Defaults to 1e-4.\n priorc12 (float): Prior on variant being causal for traits 1 and 2. 
Defaults to 1e-5.\n\n Returns:\n Colocalisation: Colocalisation results\n \"\"\"\n # register udfs\n logsum = f.udf(Coloc._get_logsum, DoubleType())\n posteriors = f.udf(Coloc._get_posteriors, VectorUDT())\n return Colocalisation(\n _df=(\n overlapping_signals.df\n # Before summing log_abf columns nulls need to be filled with 0:\n .fillna(0, subset=[\"left_logABF\", \"right_logABF\"])\n # Sum of log_abfs for each pair of signals\n .withColumn(\"sum_log_abf\", f.col(\"left_logABF\") + f.col(\"right_logABF\"))\n # Group by overlapping peak and generating dense vectors of log_abf:\n .groupBy(\"chromosome\", \"left_studyLocusId\", \"right_studyLocusId\")\n .agg(\n f.count(\"*\").alias(\"coloc_n_vars\"),\n fml.array_to_vector(f.collect_list(f.col(\"left_logABF\"))).alias(\n \"left_logABF\"\n ),\n fml.array_to_vector(f.collect_list(f.col(\"right_logABF\"))).alias(\n \"right_logABF\"\n ),\n fml.array_to_vector(f.collect_list(f.col(\"sum_log_abf\"))).alias(\n \"sum_log_abf\"\n ),\n )\n .withColumn(\"logsum1\", logsum(f.col(\"left_logABF\")))\n .withColumn(\"logsum2\", logsum(f.col(\"right_logABF\")))\n .withColumn(\"logsum12\", logsum(f.col(\"sum_log_abf\")))\n .drop(\"left_logABF\", \"right_logABF\", \"sum_log_abf\")\n # Add priors\n # priorc1 Prior on variant being causal for trait 1\n .withColumn(\"priorc1\", f.lit(priorc1))\n # priorc2 Prior on variant being causal for trait 2\n .withColumn(\"priorc2\", f.lit(priorc2))\n # priorc12 Prior on variant being causal for traits 1 and 2\n .withColumn(\"priorc12\", f.lit(priorc12))\n # h0-h2\n .withColumn(\"lH0abf\", f.lit(0))\n .withColumn(\"lH1abf\", f.log(f.col(\"priorc1\")) + f.col(\"logsum1\"))\n .withColumn(\"lH2abf\", f.log(f.col(\"priorc2\")) + f.col(\"logsum2\"))\n # h3\n .withColumn(\"sumlogsum\", f.col(\"logsum1\") + f.col(\"logsum2\"))\n # exclude null H3/H4s: due to sumlogsum == logsum12\n .filter(f.col(\"sumlogsum\") != f.col(\"logsum12\"))\n .withColumn(\"max\", f.greatest(\"sumlogsum\", \"logsum12\"))\n .withColumn(\n \"logdiff\",\n (\n f.col(\"max\")\n + f.log(\n f.exp(f.col(\"sumlogsum\") - f.col(\"max\"))\n - f.exp(f.col(\"logsum12\") - f.col(\"max\"))\n )\n ),\n )\n .withColumn(\n \"lH3abf\",\n f.log(f.col(\"priorc1\"))\n + f.log(f.col(\"priorc2\"))\n + f.col(\"logdiff\"),\n )\n .drop(\"right_logsum\", \"left_logsum\", \"sumlogsum\", \"max\", \"logdiff\")\n # h4\n .withColumn(\"lH4abf\", f.log(f.col(\"priorc12\")) + f.col(\"logsum12\"))\n # cleaning\n .drop(\n \"priorc1\", \"priorc2\", \"priorc12\", \"logsum1\", \"logsum2\", \"logsum12\"\n )\n # posteriors\n .withColumn(\n \"allABF\",\n fml.array_to_vector(\n f.array(\n f.col(\"lH0abf\"),\n f.col(\"lH1abf\"),\n f.col(\"lH2abf\"),\n f.col(\"lH3abf\"),\n f.col(\"lH4abf\"),\n )\n ),\n )\n .withColumn(\n \"posteriors\", fml.vector_to_array(posteriors(f.col(\"allABF\")))\n )\n .withColumn(\"coloc_h0\", f.col(\"posteriors\").getItem(0))\n .withColumn(\"coloc_h1\", f.col(\"posteriors\").getItem(1))\n .withColumn(\"coloc_h2\", f.col(\"posteriors\").getItem(2))\n .withColumn(\"coloc_h3\", f.col(\"posteriors\").getItem(3))\n .withColumn(\"coloc_h4\", f.col(\"posteriors\").getItem(4))\n .withColumn(\"coloc_h4_h3\", f.col(\"coloc_h4\") / f.col(\"coloc_h3\"))\n .withColumn(\"coloc_log2_h4_h3\", f.log2(f.col(\"coloc_h4_h3\")))\n # clean up\n .drop(\n \"posteriors\",\n \"allABF\",\n \"coloc_h4_h3\",\n \"lH0abf\",\n \"lH1abf\",\n \"lH2abf\",\n \"lH3abf\",\n \"lH4abf\",\n )\n .withColumn(\"colocalisationMethod\", f.lit(\"COLOC\"))\n )\n )\n
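To make the hypothesis arithmetic concrete, here is a minimal NumPy restatement with toy logABF vectors (the numbers are invented; the formulas follow the source above):

```python
import numpy as np

def logsum(log_abf):
    """Stable log-sum-exp, as in Coloc._get_logsum."""
    m = np.max(log_abf)
    return m + np.log(np.sum(np.exp(log_abf - m)))

# Toy logABF vectors for two traits over the same four overlapping variants.
left = np.array([4.0, 1.0, 0.5, 0.1])
right = np.array([3.5, 0.8, 0.6, 0.2])

p1, p2, p12 = 1e-4, 1e-4, 1e-5  # default priors from Coloc.colocalise

logsum1, logsum2 = logsum(left), logsum(right)
logsum12 = logsum(left + right)
sumlogsum = logsum1 + logsum2

# H3 needs log(exp(sumlogsum) - exp(logsum12)), computed stably; the Spark
# implementation filters out pairs where the two terms are equal.
m = max(sumlogsum, logsum12)
logdiff = m + np.log(np.exp(sumlogsum - m) - np.exp(logsum12 - m))

l_h = np.array([
    0.0,                                # lH0abf
    np.log(p1) + logsum1,               # lH1abf
    np.log(p2) + logsum2,               # lH2abf
    np.log(p1) + np.log(p2) + logdiff,  # lH3abf
    np.log(p12) + logsum12,             # lH4abf
])
posteriors = np.exp(l_h - logsum(l_h))  # mirrors Coloc._get_posteriors
print(dict(zip(["h0", "h1", "h2", "h3", "h4"], posteriors.round(4))))
```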
It extends the CAVIAR framework to explicitly estimate the posterior probability that the same variant is causal in two studies while accounting for the uncertainty of LD. eCAVIAR computes the colocalization posterior probability (CLPP) from the marginal posterior probabilities. This framework allows multiple variants to be causal in a single locus.
Source code in src/otg/method/colocalisation.py
class ECaviar:\n\"\"\"ECaviar-based colocalisation analysis.\n\n It extends [CAVIAR](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5142122/#bib18)\u00a0framework to explicitly estimate the posterior probability that the same variant is causal in 2 studies while accounting for the uncertainty of LD. eCAVIAR computes the colocalization posterior probability (**CLPP**) by utilizing the marginal posterior probabilities. This framework allows for **multiple variants to be causal** in a single locus.\n \"\"\"\n\n @staticmethod\n def _get_clpp(left_pp: Column, right_pp: Column) -> Column:\n\"\"\"Calculate the colocalisation posterior probability (CLPP).\n\n If the fact that the same variant is found causal for two studies are independent events,\n CLPP is defined as the product of posterior porbabilities that a variant is causal in both studies.\n\n Args:\n left_pp (Column): left posterior probability\n right_pp (Column): right posterior probability\n\n Returns:\n Column: CLPP\n\n Examples:\n >>> d = [{\"left_pp\": 0.5, \"right_pp\": 0.5}, {\"left_pp\": 0.25, \"right_pp\": 0.75}]\n >>> df = spark.createDataFrame(d)\n >>> df.withColumn(\"clpp\", ECaviar._get_clpp(f.col(\"left_pp\"), f.col(\"right_pp\"))).show()\n +-------+--------+------+\n |left_pp|right_pp| clpp|\n +-------+--------+------+\n | 0.5| 0.5| 0.25|\n | 0.25| 0.75|0.1875|\n +-------+--------+------+\n <BLANKLINE>\n\n \"\"\"\n return left_pp * right_pp\n\n @classmethod\n def colocalise(\n cls: type[ECaviar], overlapping_signals: StudyLocusOverlap\n ) -> Colocalisation:\n\"\"\"Calculate bayesian colocalisation based on overlapping signals.\n\n Args:\n overlapping_signals (StudyLocusOverlap): overlapping signals.\n\n Returns:\n Colocalisation: colocalisation results based on eCAVIAR.\n \"\"\"\n return Colocalisation(\n _df=(\n overlapping_signals.df.withColumn(\n \"clpp\",\n ECaviar._get_clpp(\n f.col(\"left_posteriorProbability\"),\n f.col(\"right_posteriorProbability\"),\n ),\n )\n .groupBy(\"left_studyLocusId\", \"right_studyLocusId\", \"chromosome\")\n .agg(\n f.count(\"*\").alias(\"coloc_n_vars\"),\n f.sum(f.col(\"clpp\")).alias(\"clpp\"),\n )\n .withColumn(\"colocalisationMethod\", f.lit(\"eCAVIAR\"))\n )\n )\n
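The CLPP aggregation itself is simple enough to show with toy numbers:

```python
# Per-variant posterior probabilities over an overlapping credible set.
left_pp = [0.50, 0.25, 0.10]
right_pp = [0.40, 0.30, 0.05]

# CLPP: product of the two posteriors per variant, summed over the locus pair,
# mirroring ECaviar._get_clpp followed by the groupBy/sum aggregation.
clpp = sum(l * r for l, r in zip(left_pp, right_pp))
print(round(clpp, 4))  # 0.28
```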
@classmethod\ndef ld_annotation_by_locus_ancestry(\n cls: type[LDAnnotatorGnomad],\n session: Session,\n associations: StudyLocusGWASCatalog,\n studies: StudyIndexGWASCatalog,\n ld_populations: list[str],\n ld_index_template: str,\n ld_matrix_template: str,\n min_r2: float,\n) -> DataFrame:\n\"\"\"LD information for all locus and ancestries.\n\n Args:\n session (Session): Session\n associations (StudyLocusGWASCatalog): GWAS associations\n studies (StudyIndexGWASCatalog): study metadata of the associations\n ld_populations (list[str]): List of populations to annotate\n ld_index_template (str): Template path of the LD matrix index containing `{POP}` where the population is expected\n ld_matrix_template (str): Template path of the LD matrix containing `{POP}` where the population is expected\n min_r2 (float): minimum r2 to keep\n\n Returns:\n DataFrame: LD annotation [\"variantId\", \"chromosome\", \"gnomadPopulation\", \"tagVariantId\", \"r\"]\n \"\"\"\n # Unique lead - population pairs:\n locus_ancestry = (\n associations.unique_study_locus_ancestries(studies)\n # Ignoring study information / relativeSampleSize to get unique lead-ancestry pairs\n .drop(\"studyId\", \"relativeSampleSize\")\n .distinct()\n .persist()\n )\n\n # All gnomad populations captured in associations:\n assoc_populations = locus_ancestry.rdd.map(\n lambda x: x.gnomadPopulation\n ).collect()\n\n # Retrieve LD information from gnomAD\n ld_annotated_assocs = []\n for population in ld_populations:\n if population in assoc_populations:\n pop_parsed_ldindex_path = ld_index_template.format(POP=population)\n pop_matrix_path = ld_matrix_template.format(POP=population)\n ld_index = LDIndex.from_parquet(session, pop_parsed_ldindex_path)\n ld_matrix = BlockMatrix.read(pop_matrix_path)\n ld_annotated_assocs.append(\n LDAnnotatorGnomad.get_ld_annotated_assocs_for_population(\n population,\n ld_index,\n ld_matrix,\n locus_ancestry,\n min_r2,\n ).coalesce(400)\n )\n return reduce(DataFrame.unionByName, ld_annotated_assocs)\n
Probabilistic Identification of Causal SNPs (PICS), an algorithm estimating the probability that an individual variant is causal considering the haplotype structure and observed pattern of association at the genetic locus.
Source code in src/otg/method/pics.py
class PICS:\n\"\"\"Probabilistic Identification of Causal SNPs (PICS), an algorithm estimating the probability that an individual variant is causal considering the haplotype structure and observed pattern of association at the genetic locus.\"\"\"\n\n @staticmethod\n def _pics_relative_posterior_probability(\n neglog_p: float, pics_snp_mu: float, pics_snp_std: float\n ) -> float:\n\"\"\"Compute the PICS posterior probability for a given SNP.\n\n !!! info \"This probability needs to be scaled to take into account the probabilities of the other variants in the locus.\"\n\n Args:\n neglog_p (float): Negative log p-value of the lead variant\n pics_snp_mu (float): Mean P value of the association between a SNP and a trait\n pics_snp_std (float): Standard deviation for the P value of the association between a SNP and a trait\n\n Returns:\n Relative posterior probability of a SNP being causal in a locus\n\n Examples:\n >>> rel_prob = PICS._pics_relative_posterior_probability(neglog_p=10.0, pics_snp_mu=1.0, pics_snp_std=10.0)\n >>> round(rel_prob, 3)\n 0.368\n \"\"\"\n return float(norm(pics_snp_mu, pics_snp_std).sf(neglog_p) * 2)\n\n @staticmethod\n def _pics_standard_deviation(neglog_p: float, r2: float, k: float) -> float | None:\n\"\"\"Compute the PICS standard deviation.\n\n This distribution is obtained after a series of permutation tests described in the PICS method, and it is only\n valid when the SNP is highly linked with the lead (r2 > 0.5).\n\n Args:\n neglog_p (float): Negative log p-value of the lead variant\n r2 (float): LD score between a given SNP and the lead variant\n k (float): Empiric constant that can be adjusted to fit the curve, 6.4 recommended.\n\n Returns:\n Standard deviation for the P value of the association between a SNP and a trait\n\n Examples:\n >>> PICS._pics_standard_deviation(neglog_p=1.0, r2=1.0, k=6.4)\n 0.0\n >>> round(PICS._pics_standard_deviation(neglog_p=10.0, r2=0.5, k=6.4), 3)\n 0.143\n >>> print(PICS._pics_standard_deviation(neglog_p=1.0, r2=0.0, k=6.4))\n None\n \"\"\"\n return (\n (1 - abs(r2) ** 0.5**k) ** 0.5 * (neglog_p) ** 0.5 / 2\n if r2 >= 0.5\n else None\n )\n\n @staticmethod\n def _pics_mu(neglog_p: float, r2: float) -> float | None:\n\"\"\"Compute the PICS mu that estimates the probability of association between a given SNP and the trait.\n\n This distribution is obtained after a series of permutation tests described in the PICS method, and it is only\n valid when the SNP is highly linked with the lead (r2 > 0.5).\n\n Args:\n neglog_p (float): Negative log p-value of the lead variant\n r2 (float): LD score between a given SNP and the lead variant\n\n Returns:\n Mean P value of the association between a SNP and a trait\n\n Examples:\n >>> PICS._pics_mu(neglog_p=1.0, r2=1.0)\n 1.0\n >>> PICS._pics_mu(neglog_p=10.0, r2=0.5)\n 5.0\n >>> print(PICS._pics_mu(neglog_p=10.0, r2=0.3))\n None\n \"\"\"\n return neglog_p * r2 if r2 >= 0.5 else None\n\n @staticmethod\n def _finemap(\n credible_set: list[Row], lead_neglog_p: float, k: float\n ) -> list | None:\n\"\"\"Calculates the probability of a variant being causal in a study-locus context by applying the PICS method.\n\n It is intended to be applied as an UDF in `PICS.finemap`, where each row is a StudyLocus association.\n The function iterates over every SNP in the `credibleSet` array, and it returns an updated credibleSet with\n its association signal and causality probability as of PICS.\n\n Args:\n credible_set (list): list of tagging variants after expanding the locus\n lead_neglog_p (float): P 
value of the association signal between the lead variant and the study in the form of -log10.\n k (float): Empiric constant that can be adjusted to fit the curve, 6.4 recommended.\n\n Returns:\n List of tagging variants with an estimation of the association signal and their posterior probability as of PICS.\n \"\"\"\n if credible_set is None:\n return None\n elif not credible_set:\n return []\n\n tmp_credible_set = []\n new_credible_set = []\n # First iteration: calculation of mu, standard deviation, and the relative posterior probability\n for tag_struct in credible_set:\n tag_dict = (\n tag_struct.asDict()\n ) # tag_struct is of type pyspark.Row, we'll represent it as a dict\n if (\n not tag_dict[\"r2Overall\"]\n or tag_dict[\"r2Overall\"] < 0.5\n or not lead_neglog_p\n ):\n # If PICS cannot be calculated, we'll return the original credible set\n new_credible_set.append(tag_dict)\n continue\n pics_snp_mu = PICS._pics_mu(lead_neglog_p, tag_dict[\"r2Overall\"])\n pics_snp_std = PICS._pics_standard_deviation(\n lead_neglog_p, tag_dict[\"r2Overall\"], k\n )\n pics_snp_std = 0.001 if pics_snp_std == 0 else pics_snp_std\n if pics_snp_mu is not None and pics_snp_std is not None:\n posterior_probability = PICS._pics_relative_posterior_probability(\n lead_neglog_p, pics_snp_mu, pics_snp_std\n )\n tag_dict[\"tagPValue\"] = 10**-pics_snp_mu\n tag_dict[\"tagStandardError\"] = 10**-pics_snp_std\n tag_dict[\"relativePosteriorProbability\"] = posterior_probability\n\n tmp_credible_set.append(tag_dict)\n\n # Second iteration: calculation of the sum of all the posteriors in each study-locus, so that we scale them between 0-1\n total_posteriors = sum(\n tag_dict.get(\"relativePosteriorProbability\", 0)\n for tag_dict in tmp_credible_set\n )\n\n # Third iteration: calculation of the final posteriorProbability\n for tag_dict in tmp_credible_set:\n if total_posteriors != 0:\n tag_dict[\"posteriorProbability\"] = float(\n tag_dict.get(\"relativePosteriorProbability\", 0) / total_posteriors\n )\n tag_dict.pop(\"relativePosteriorProbability\")\n new_credible_set.append(tag_dict)\n return new_credible_set\n\n @classmethod\n def finemap(\n cls: type[PICS], associations: StudyLocus, k: float = 6.4\n ) -> StudyLocus:\n\"\"\"Run PICS on a study locus.\n\n !!! info \"Study locus needs to be LD annotated\"\n The study locus needs to be LD annotated before PICS can be calculated.\n\n Args:\n associations (StudyLocus): Study locus to finemap using PICS\n k (float): Empiric constant that can be adjusted to fit the curve, 6.4 recommended.\n\n Returns:\n StudyLocus: Study locus with PICS results\n \"\"\"\n # Register UDF by defining the structure of the output credibleSet array of structs\n credset_schema = t.ArrayType(\n [field.dataType.elementType for field in associations.schema if field.name == \"credibleSet\"][0] # type: ignore\n )\n _finemap_udf = f.udf(\n lambda credible_set, neglog_p: PICS._finemap(credible_set, neglog_p, k),\n credset_schema,\n )\n\n associations.df = (\n associations.df.withColumn(\"neglog_pvalue\", associations.neglog_pvalue())\n .withColumn(\n \"credibleSet\",\n f.when(\n f.col(\"credibleSet\").isNotNull(),\n _finemap_udf(f.col(\"credibleSet\"), f.col(\"neglog_pvalue\")),\n ),\n )\n .drop(\"neglog_pvalue\")\n )\n return associations\n
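A worked toy example of the formulas above, scaling the relative posteriors of a lead variant (r2 = 1.0) and one tag (r2 = 0.8) so that they sum to one (scipy is assumed, as in the source; toy values only):
from scipy.stats import norm\n\ndef rel_pp(neglog_p: float, r2: float, k: float = 6.4) -> float:\n    # PICS mu and standard deviation, valid for r2 >= 0.5\n    mu = neglog_p * r2\n    std = (1 - abs(r2) ** 0.5**k) ** 0.5 * neglog_p**0.5 / 2\n    std = 0.001 if std == 0 else std  # floor applied in _finemap above\n    return float(norm(mu, std).sf(neglog_p) * 2)\n\nlead, tag = rel_pp(10.0, 1.0), rel_pp(10.0, 0.8)\ntotal = lead + tag\nprint(round(lead / total, 3), round(tag / total, 3))  # 1.0 0.0: the lead dominates\n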
The study locus needs to be LD annotated before PICS can be calculated.
Parameters:
associations (StudyLocus): Study locus to finemap using PICS (required)
k (float): Empiric constant that can be adjusted to fit the curve, 6.4 recommended (default: 6.4)
Returns:
StudyLocus: Study locus with PICS results
Source code in src/otg/method/pics.py
@classmethod\ndef finemap(\n cls: type[PICS], associations: StudyLocus, k: float = 6.4\n) -> StudyLocus:\n\"\"\"Run PICS on a study locus.\n\n !!! info \"Study locus needs to be LD annotated\"\n The study locus needs to be LD annotated before PICS can be calculated.\n\n Args:\n associations (StudyLocus): Study locus to finemap using PICS\n k (float): Empiric constant that can be adjusted to fit the curve, 6.4 recommended.\n\n Returns:\n StudyLocus: Study locus with PICS results\n \"\"\"\n # Register UDF by defining the structure of the output credibleSet array of structs\n credset_schema = t.ArrayType(\n [field.dataType.elementType for field in associations.schema if field.name == \"credibleSet\"][0] # type: ignore\n )\n _finemap_udf = f.udf(\n lambda credible_set, neglog_p: PICS._finemap(credible_set, neglog_p, k),\n credset_schema,\n )\n\n associations.df = (\n associations.df.withColumn(\"neglog_pvalue\", associations.neglog_pvalue())\n .withColumn(\n \"credibleSet\",\n f.when(\n f.col(\"credibleSet\").isNotNull(),\n _finemap_udf(f.col(\"credibleSet\"), f.col(\"neglog_pvalue\")),\n ),\n )\n .drop(\"neglog_pvalue\")\n )\n return associations\n
This workflow runs colocalization analyses that assess the degree to which independent signals of the association share the same causal variant in a region of the genome, typically limited by linkage disequilibrium (LD).
Source code in src/otg/colocalisation.py
@dataclass\nclass ColocalisationStep(ColocalisationStepConfig):\n\"\"\"Colocalisation step.\n\n This workflow runs colocalization analyses that assess the degree to which independent signals of the association share the same causal variant in a region of the genome, typically limited by linkage disequilibrium (LD).\n \"\"\"\n\n session: Session = Session()\n\n def run(self: ColocalisationStep) -> None:\n\"\"\"Run colocalisation step.\"\"\"\n # Study-locus information\n sl = StudyLocus.from_parquet(self.session, self.study_locus_path)\n si = StudyIndex.from_parquet(self.session, self.study_index_path)\n\n # Study-locus overlaps for 95% credible sets\n sl_overlaps = sl.credible_set(CredibleInterval.IS95).overlaps(si)\n\n coloc_results = Coloc.colocalise(\n sl_overlaps, self.priorc1, self.priorc2, self.priorc12\n )\n ecaviar_results = ECaviar.colocalise(sl_overlaps)\n\n # Combine COLOC and eCAVIAR results before writing them out\n combined_results = coloc_results.df.unionByName(\n ecaviar_results.df, allowMissingColumns=True\n )\n\n combined_results.write.mode(self.session.write_mode).parquet(self.coloc_path)\n
Colocalisation step requirements.
Attributes:
study_locus_path (DictConfig): Input Study-locus path.
study_index_path (DictConfig): Input Study-index path.
coloc_path (DictConfig): Output Colocalisation path.
priorc1 (float): Prior on variant being causal for trait 1.
priorc2 (float): Prior on variant being causal for trait 2.
priorc12 (float): Prior on variant being causal for traits 1 and 2.
Source code in src/otg/config.py
@dataclass\nclass ColocalisationStepConfig:\n\"\"\"Colocalisation step requirements.\n\n Attributes:\n study_locus_path (DictConfig): Input Study-locus path.\n study_index_path (DictConfig): Input Study-index path.\n coloc_path (DictConfig): Output Colocalisation path.\n priorc1 (float): Prior on variant being causal for trait 1.\n priorc2 (float): Prior on variant being causal for trait 2.\n priorc12 (float): Prior on variant being causal for traits 1 and 2.\n \"\"\"\n\n _target_: str = \"otg.colocalisation.ColocalisationStep\"\n study_locus_path: str = MISSING\n study_index_path: str = MISSING\n coloc_path: str = MISSING\n priorc1: float = 1e-4\n priorc2: float = 1e-4\n priorc12: float = 1e-5\n
Variant annotation step produces a dataset of the type VariantAnnotation derived from gnomAD's gnomad.genomes.vX.X.X.sites.ht Hail table. This dataset is used to validate variants and as a source of annotation.
Source code in src/otg/variant_annotation.py
@dataclass\nclass VariantAnnotationStep(VariantAnnotationStepConfig):\n\"\"\"Variant annotation step.\n\n Variant annotation step produces a dataset of the type `VariantAnnotation` derived from gnomAD's `gnomad.genomes.vX.X.X.sites.ht` Hail table. This dataset is used to validate variants and as a source of annotation.\n \"\"\"\n\n session: Session = Session()\n\n def run(self: VariantAnnotationStep) -> None:\n\"\"\"Run variant annotation step.\"\"\"\n # init hail session\n hl.init(sc=self.session.spark.sparkContext, log=\"/dev/null\")\n\n variant_annotation = VariantAnnotation.from_gnomad(\n self.gnomad_genomes,\n self.chain_38_to_37,\n self.populations,\n )\n # Writing data partitioned by chromosome and position:\n (\n variant_annotation.df.repartition(400, \"chromosome\")\n .sortWithinPartitions(\"chromosome\", \"position\")\n .write.partitionBy(\"chromosome\")\n .mode(self.session.write_mode)\n .parquet(self.variant_annotation_path)\n )\n
Using a VariantAnnotation dataset as a reference, this step creates and writes a dataset of the type VariantIndex that includes only variants that have disease-association data, with a reduced set of annotations.
Source code in src/otg/variant_index.py
@dataclass\nclass VariantIndexStep(VariantIndexStepConfig):\n\"\"\"Variant index step.\n\n Using a `VariantAnnotation` dataset as a reference, this step creates and writes a dataset of the type `VariantIndex` that includes only variants that have disease-association data, with a reduced set of annotations.\n \"\"\"\n\n session: Session = Session()\n\n def run(self: VariantIndexStep) -> None:\n\"\"\"Run variant index step.\"\"\"\n # Variant annotation dataset\n va = VariantAnnotation.from_parquet(self.session, self.variant_annotation_path)\n\n # Study-locus dataset\n study_locus = StudyLocus.from_parquet(self.session, self.study_locus_path)\n\n # Reduce scope of variant annotation dataset to only variants in study-locus sets:\n va_slimmed = va.filter_by_variant_df(\n study_locus.unique_lead_tag_variants(), [\"id\", \"chromosome\"]\n )\n\n # Generate variant index using a subset of the variant annotation dataset\n vi = VariantIndex.from_variant_annotation(va_slimmed)\n\n # Write data:\n self.session.logger.info(f\"Writing variant index to: {self.variant_index_path}\")\n (\n vi.df.write.partitionBy(\"chromosome\")\n .mode(self.session.write_mode)\n .parquet(self.variant_index_path)\n )\n
This step aims to generate a dataset that contains multiple pieces of evidence supporting the functional association of specific variants with genes. Some of the evidence types include:
Chromatin interaction experiments, e.g. Promoter Capture Hi-C (PCHi-C).
In silico functional predictions, e.g. Variant Effect Predictor (VEP) from Ensembl.
Distance between the variant and each gene's canonical transcription start site (TSS).
Source code in src/otg/v2g.py
@dataclass\nclass V2GStep(V2GStepConfig):\n\"\"\"Variant-to-gene (V2G) step.\n\n This step aims to generate a dataset that contains multiple pieces of evidence supporting the functional association of specific variants with genes. Some of the evidence types include:\n\n 1. Chromatin interaction experiments, e.g. Promoter Capture Hi-C (PCHi-C).\n 2. In silico functional predictions, e.g. Variant Effect Predictor (VEP) from Ensembl.\n 3. Distance between the variant and each gene's canonical transcription start site (TSS).\n\n \"\"\"\n\n session: Session = Session()\n\n def run(self: V2GStep) -> None:\n\"\"\"Run V2G dataset generation.\"\"\"\n # Filter gene index by approved biotypes to define V2G gene universe\n gene_index_filtered = GeneIndex.from_parquet(\n self.session, self.gene_index_path\n ).filter_by_biotypes(self.approved_biotypes)\n\n vi = VariantIndex.from_parquet(self.session, self.variant_index_path).persist()\n va = VariantAnnotation.from_parquet(self.session, self.variant_annotation_path)\n vep_consequences = self.session.spark.read.csv(\n self.vep_consequences_path, sep=\"\\t\", header=True\n )\n\n # Variant annotation reduced to the variant index to define V2G variant universe\n va_slimmed = va.filter_by_variant_df(vi.df, [\"id\", \"chromosome\"]).persist()\n\n # lift over variants to hg38\n lift = LiftOverSpark(\n self.liftover_chain_file_path, self.liftover_max_length_difference\n )\n\n v2g_datasets = [\n va_slimmed.get_distance_to_tss(gene_index_filtered, self.max_distance),\n # variant effects\n va_slimmed.get_most_severe_vep_v2g(vep_consequences, gene_index_filtered),\n va_slimmed.get_polyphen_v2g(gene_index_filtered),\n va_slimmed.get_sift_v2g(gene_index_filtered),\n va_slimmed.get_plof_v2g(gene_index_filtered),\n # intervals\n Intervals.parse_andersson(\n self.session, self.anderson_path, gene_index_filtered, lift\n ).v2g(vi),\n Intervals.parse_javierre(\n self.session, self.javierre_path, gene_index_filtered, lift\n ).v2g(vi),\n Intervals.parse_jung(\n self.session, self.jung_path, gene_index_filtered, lift\n ).v2g(vi),\n Intervals.parse_thurman(\n self.session, self.thurnman_path, gene_index_filtered, lift\n ).v2g(vi),\n ]\n\n # merge all V2G datasets\n v2g = V2G(\n _df=reduce(\n lambda x, y: x.unionByName(y, allowMissingColumns=True),\n [dataset.df for dataset in v2g_datasets],\n ).repartition(\"chromosome\")\n )\n # write V2G dataset\n (\n v2g.df.write.partitionBy(\"chromosome\")\n .mode(self.session.write_mode)\n .parquet(self.v2g_path)\n )\n
"}]}
\ No newline at end of file
+{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"]},"docs":[{"location":"","title":"Home","text":"
Ingestion and analysis of genetic and functional genomic data for the identification and prioritisation of drug targets.
This project is still in experimental phase. Please refer to the roadmap section for more information.
For information on how to configure the development environment, run the code, or contribute changes, see the contributing section. For known technical issues and solutions to them, see the troubleshooting section.
"},{"location":"contributing/","title":"Environment configuration and contributing changes","text":""},{"location":"contributing/#one-time-configuration","title":"One-time configuration","text":"
The steps in this section only ever need to be done once on any particular system.
Google Cloud configuration: 1. Install Google Cloud SDK: https://cloud.google.com/sdk/docs/install. 1. Log in to your work Google Account: run gcloud auth login and follow instructions. 1. Obtain Google application credentials: run gcloud auth application-default login and follow instructions.
Check that you have the make utility installed, and if not (which is unlikely), install it using your system package manager.
Run make setup-dev to install/update the necessary packages and activate the development environment. You need to do this every time you open a new shell.
It is recommended to use VS Code as an IDE for development.
"},{"location":"contributing/#how-to-run-the-code","title":"How to run the code","text":"
All pipelines in this repository are intended to be run in Google Dataproc. Running them locally is not currently supported.
In order to run the code:
Manually edit your local workflow/dag.yaml file and comment out the steps you do not want to run.
Manually edit your local pyproject.toml file and modify the version of the code.
This must be different from the version used by any other people working on the repository to avoid any deployment conflicts, so it's a good idea to use your name, for example: 1.2.3+jdoe.
You can also add a brief branch description, for example: 1.2.3+jdoe.myfeature.
Note that the version must comply with PEP440 conventions, otherwise Poetry will not allow it to be deployed.
Do not use underscores or hyphens in your version name. When building the WHL file, they will be automatically converted to dots, which means the file name will no longer match the version and the build will fail. Use dots instead (see the version normalisation sketch at the end of this section).
Run make build.
This will create a bundle containing the necessary code, configuration and dependencies to run the ETL pipeline, and then upload this bundle to Google Cloud.
A version specific subpath is used, so uploading the code will not affect any branches but your own.
If there was already a code bundle uploaded with the same version number, it will be replaced.
Submit the Dataproc job with poetry run python workflow/workflow_template.py
You will need to specify additional parameters, some are mandatory and some are optional. Run with --help to see usage.
The script will provision the cluster and submit the job.
The cluster will take a few minutes to get provisioned and running, during which the script will not output anything, this is normal.
Once submitted, you can monitor the progress of your job on this page: https://console.cloud.google.com/dataproc/jobs?project=open-targets-genetics-dev.
On completion (whether successful or a failure), the cluster will be automatically removed, so you don't have to worry about shutting it down to avoid incurring charges.
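A quick way to preview how a candidate version string will be normalised is the packaging library (an assumption here, although it implements the same PEP440 rules Poetry applies); note how underscores in the local segment are silently converted to dots, which is exactly what breaks the WHL file name match described above:
from packaging.version import Version\n\nv = Version(\"1.2.3+jdoe_myfeature\")\nprint(v)  # 1.2.3+jdoe.myfeature - local separators are normalised to dots\nprint(str(v) == \"1.2.3+jdoe_myfeature\")  # False: the built file name will not match\n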
"},{"location":"contributing/#how-to-generate-a-local-copy-of-the-documentation","title":"How to generate a local copy of the documentation","text":"
Run poetry run mkdocs serve. This will generate the local copy of the documentation and will start a local server to browse it (URL will be printed, usually http://127.0.0.1:8000/).
"},{"location":"contributing/#how-to-run-the-tests","title":"How to run the tests","text":"
When making changes, and especially when implementing a new module or feature, it's essential to ensure that all relevant sections of the code base are modified.
If during development you had a question which wasn't covered in the documentation, and someone explained it to you, add it to the documentation. The same applies if you encountered any instructions in the documentation which were obsolete or incorrect.
Documentation autogeneration expressions start with :::. They will automatically generate sections of the documentation based on class and method docstrings. Be sure to update them for:
Dataset definitions in docs/reference/dataset (example: docs/reference/dataset/study_index/study_index_finngen.md)
Step definitions in docs/reference/step (example: docs/reference/step/finngen.md)
If you see errors related to BLAS/LAPACK libraries, see this StackOverflow post for guidance.
"},{"location":"troubleshooting/#pyenv-and-poetry","title":"Pyenv and Poetry","text":"
If you see various errors thrown by Pyenv or Poetry, they can be hard to specifically diagnose and resolve. In this case, it often helps to remove those tools from the system completely. Follow these steps:
Close your currently activated environment, if any: exit
Officially, PySpark requires Java version 8 (a.k.a. 1.8) or above to work. However, if you have a very recent version of Java, you may experience issues, as it may introduce breaking changes that PySpark hasn't had time to integrate. For example, as of May 2023, PySpark did not work with Java 20.
If you are encountering problems with initialising a Spark session, try using Java 11.
If you see an error message thrown by pre-commit, which looks like this (SyntaxError: Unexpected token '?'), followed by a JavaScript traceback, the issue is likely with your system NodeJS version.
One solution which can help in this case is to upgrade your system NodeJS version. However, this may not always be possible. For example, as of July 2023, the Ubuntu repository is several major versions behind the latest release.
Another solution which helps is to remove Node, NodeJS, and npm from your system entirely. In this case, pre-commit will not try to rely on a system version of NodeJS and will install its own, suitable one.
On Ubuntu, this can be done using sudo apt remove node nodejs npm, followed by sudo apt autoremove. But in some cases, depending on your existing installation, you may need to also manually remove some files. See this StackOverflow answer for guidance.
After running these commands, you are advised to open a fresh shell, and then also reinstall Pyenv and Poetry to make sure they pick up the changes (see relevant section above).
Dataset is a wrapper around a Spark DataFrame with a predefined schema. Schemas for each child dataset are described in the json.schemas module.
Source code in src/otg/dataset/dataset.py
@dataclass\nclass Dataset:\n\"\"\"Open Targets Genetics Dataset.\n\n `Dataset` is a wrapper around a Spark DataFrame with a predefined schema. Schemas for each child dataset are described in the `json.schemas` module.\n \"\"\"\n\n _df: DataFrame\n _schema: StructType\n\n def __post_init__(self: Dataset) -> None:\n\"\"\"Post init.\"\"\"\n self.validate_schema()\n\n @property\n def df(self: Dataset) -> DataFrame:\n\"\"\"Dataframe included in the Dataset.\"\"\"\n return self._df\n\n @df.setter\n def df(self: Dataset, new_df: DataFrame) -> None: # noqa: CCE001\n\"\"\"Dataframe setter.\"\"\"\n self._df = new_df\n self.validate_schema()\n\n @property\n def schema(self: Dataset) -> StructType:\n\"\"\"Dataframe expected schema.\"\"\"\n return self._schema\n\n @classmethod\n def from_parquet(\n cls: type[Dataset], session: Session, path: str, schema: StructType\n ) -> Dataset:\n\"\"\"Reads a parquet file into a Dataset with a given schema.\n\n Args:\n session (Session): ETL session\n path (str): Path to parquet file\n schema (StructType): Schema to use\n\n Returns:\n Dataset: Dataset with given schema\n \"\"\"\n df = session.read_parquet(path=path, schema=schema)\n return cls(_df=df, _schema=schema)\n\n def validate_schema(self: Dataset) -> None: # sourcery skip: invert-any-all\n\"\"\"Validate DataFrame schema against expected class schema.\n\n Raises:\n ValueError: DataFrame schema is not valid\n \"\"\"\n expected_schema = self._schema\n expected_fields = flatten_schema(expected_schema)\n observed_schema = self._df.schema\n observed_fields = flatten_schema(observed_schema)\n\n # Unexpected fields in dataset\n if unexpected_struct_fields := [\n x for x in observed_fields if x not in expected_fields\n ]:\n raise ValueError(\n f\"The {unexpected_struct_fields} fields are not included in DataFrame schema: {expected_fields}\"\n )\n\n # Required fields not in dataset\n required_fields = [x.name for x in expected_schema if not x.nullable]\n if missing_required_fields := [\n req\n for req in required_fields\n if not any(field.name == req for field in observed_fields)\n ]:\n raise ValueError(\n f\"The {missing_required_fields} fields are required but missing: {required_fields}\"\n )\n\n # Fields with duplicated names\n if duplicated_fields := [\n x for x in set(observed_fields) if observed_fields.count(x) > 1\n ]:\n raise ValueError(\n f\"The following fields are duplicated in DataFrame schema: {duplicated_fields}\"\n )\n\n # Fields with different datatype\n if fields_with_different_observed_datatype := [\n field\n for field in set(observed_fields)\n if observed_fields.count(field) != expected_fields.count(field)\n ]:\n raise ValueError(\n f\"The following fields present differences in their datatypes: {fields_with_different_observed_datatype}.\"\n )\n
Validate DataFrame schema against expected class schema.
Raises:
ValueError: DataFrame schema is not valid
Source code in src/otg/dataset/dataset.py
def validate_schema(self: Dataset) -> None: # sourcery skip: invert-any-all\n\"\"\"Validate DataFrame schema against expected class schema.\n\n Raises:\n ValueError: DataFrame schema is not valid\n \"\"\"\n expected_schema = self._schema\n expected_fields = flatten_schema(expected_schema)\n observed_schema = self._df.schema\n observed_fields = flatten_schema(observed_schema)\n\n # Unexpected fields in dataset\n if unexpected_struct_fields := [\n x for x in observed_fields if x not in expected_fields\n ]:\n raise ValueError(\n f\"The {unexpected_struct_fields} fields are not included in DataFrame schema: {expected_fields}\"\n )\n\n # Required fields not in dataset\n required_fields = [x.name for x in expected_schema if not x.nullable]\n if missing_required_fields := [\n req\n for req in required_fields\n if not any(field.name == req for field in observed_fields)\n ]:\n raise ValueError(\n f\"The {missing_required_fields} fields are required but missing: {required_fields}\"\n )\n\n # Fields with duplicated names\n if duplicated_fields := [\n x for x in set(observed_fields) if observed_fields.count(x) > 1\n ]:\n raise ValueError(\n f\"The following fields are duplicated in DataFrame schema: {duplicated_fields}\"\n )\n\n # Fields with different datatype\n if fields_with_different_observed_datatype := [\n field\n for field in set(observed_fields)\n if observed_fields.count(field) != expected_fields.count(field)\n ]:\n raise ValueError(\n f\"The following fields present differences in their datatypes: {fields_with_different_observed_datatype}.\"\n )\n
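A simplified, top-level-only sketch of the same validation idea (the real implementation flattens nested schemas via the internal flatten_schema helper; the schema and data here are toy values):
from pyspark.sql import SparkSession\nfrom pyspark.sql import types as t\n\nspark = SparkSession.builder.getOrCreate()\n\nexpected = t.StructType(\n    [\n        t.StructField(\"variantId\", t.StringType(), nullable=False),\n        t.StructField(\"chromosome\", t.StringType(), nullable=False),\n    ]\n)\ndf = spark.createDataFrame([(\"1_100_A_T\", \"1\")], [\"variantId\", \"chromosome\"])\n\n# Required (non-nullable) fields must be present in the observed schema\nobserved = {field.name for field in df.schema}\nrequired = {field.name for field in expected if not field.nullable}\nif missing := required - observed:\n    raise ValueError(f\"Missing required fields: {missing}\")\nprint(\"schema OK\")\n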
Convert intervals into V2G by intersecting with a variant index.
Parameters:
variant_index (VariantIndex): Variant index dataset (required)
Returns:
V2G: Variant-to-gene evidence dataset
Source code in src/otg/dataset/intervals.py
def v2g(self: Intervals, variant_index: VariantIndex) -> V2G:\n\"\"\"Convert intervals into V2G by intersecting with a variant index.\n\n Args:\n variant_index (VariantIndex): Variant index dataset\n\n Returns:\n V2G: Variant-to-gene evidence dataset\n \"\"\"\n return V2G(\n _df=(\n # TODO: We can include the start and end position as part of the `on` clause in the join\n self.df.alias(\"interval\")\n .join(\n variant_index.df.selectExpr(\n \"chromosome as vi_chromosome\", \"variantId\", \"position\"\n ).alias(\"vi\"),\n on=[\n f.col(\"vi.vi_chromosome\") == f.col(\"interval.chromosome\"),\n f.col(\"vi.position\").between(\n f.col(\"interval.start\"), f.col(\"interval.end\")\n ),\n ],\n how=\"inner\",\n )\n .drop(\"start\", \"end\", \"vi_chromosome\")\n )\n )\n
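A toy end-to-end run of the join above, with one interval and two variants; only the variant whose position falls between start and end survives:
from pyspark.sql import SparkSession\nfrom pyspark.sql import functions as f\n\nspark = SparkSession.builder.getOrCreate()\n\nintervals = spark.createDataFrame(\n    [(\"1\", 100, 200, \"ENSG01\")], [\"chromosome\", \"start\", \"end\", \"geneId\"]\n)\nvariants = spark.createDataFrame(\n    [(\"1\", 150, \"1_150_A_T\"), (\"1\", 300, \"1_300_G_C\")],\n    [\"vi_chromosome\", \"position\", \"variantId\"],\n)\n\nv2g = (\n    intervals.alias(\"interval\")\n    .join(\n        variants.alias(\"vi\"),\n        on=[\n            f.col(\"vi.vi_chromosome\") == f.col(\"interval.chromosome\"),\n            f.col(\"vi.position\").between(f.col(\"interval.start\"), f.col(\"interval.end\")),\n        ],\n        how=\"inner\",\n    )\n    .drop(\"start\", \"end\", \"vi_chromosome\")\n)\nv2g.show()  # only 1_150_A_T falls inside the interval\n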
Annotate LD index with indices starting and stopping at a given interval.
Parameters:
ld_radius (int): radius around each position (required)
Returns:
LDIndex: including start_idx and stop_idx columns
Source code in src/otg/dataset/ld_index.py
def annotate_index_intervals(self: LDIndex, ld_radius: int) -> LDIndex:\n\"\"\"Annotate LD index with indices starting and stopping at a given interval.\n\n Args:\n ld_radius (int): radius around each position\n\n Returns:\n LDIndex: including `start_idx` and `stop_idx` columns\n \"\"\"\n index_with_positions = (\n self._df.drop(\"start_idx\", \"stop_idx\")\n .select(\n \"*\",\n LDIndex._interval_start(\n contig=f.col(\"chromosome\"),\n position=f.col(\"position\"),\n ld_radius=ld_radius,\n ).alias(\"start_pos\"),\n LDIndex._interval_stop(\n contig=f.col(\"chromosome\"),\n position=f.col(\"position\"),\n ld_radius=ld_radius,\n ).alias(\"stop_pos\"),\n )\n .persist()\n )\n\n self.df = (\n index_with_positions.join(\n (\n index_with_positions\n # Given the multiple variants with the same chromosome/position can have different indices, filter for the lowest index:\n .transform(\n lambda df: get_record_with_minimum_value(\n df, [\"chromosome\", \"position\"], \"idx\"\n )\n ).select(\n \"chromosome\",\n f.col(\"position\").alias(\"start_pos\"),\n f.col(\"idx\").alias(\"start_idx\"),\n )\n ),\n on=[\"chromosome\", \"start_pos\"],\n )\n .join(\n (\n index_with_positions\n # Given the multiple variants with the same chromosome/position can have different indices, filter for the highest index:\n .transform(\n lambda df: get_record_with_maximum_value(\n df, [\"chromosome\", \"position\"], \"idx\"\n )\n ).select(\n \"chromosome\",\n f.col(\"position\").alias(\"stop_pos\"),\n f.col(\"idx\").alias(\"stop_idx\"),\n )\n ),\n on=[\"chromosome\", \"stop_pos\"],\n )\n # Filter out variants for which start idx > stop idx due to liftover\n .filter(f.col(\"start_idx\") < f.col(\"stop_idx\"))\n .drop(\"start_pos\", \"stop_pos\")\n )\n\n return self\n
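The get_record_with_minimum_value and get_record_with_maximum_value helpers are internal; a window-function sketch of the "lowest idx per chromosome/position" filter they perform, on toy data:
from pyspark.sql import SparkSession, Window\nfrom pyspark.sql import functions as f\n\nspark = SparkSession.builder.getOrCreate()\n\ndf = spark.createDataFrame(\n    [(\"1\", 100, 7), (\"1\", 100, 3), (\"1\", 200, 9)], [\"chromosome\", \"position\", \"idx\"]\n)\nw = Window.partitionBy(\"chromosome\", \"position\").orderBy(f.col(\"idx\").asc())\nlowest_idx = (\n    df.withColumn(\"rank\", f.row_number().over(w))\n    .filter(f.col(\"rank\") == 1)\n    .drop(\"rank\")\n)\nlowest_idx.show()  # keeps idx 3 for position 100; flip the ordering for the maximum\n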
Filter summary statistics based on the provided p-value threshold.
Parameters:
pvalue (float): upper limit of the p-value to be filtered upon (required)
Returns:
SummaryStatistics: summary statistics object containing single point associations with p-values at least as significant as the provided threshold.
Source code in src/otg/dataset/summary_statistics.py
def pvalue_filter(self: SummaryStatistics, pvalue: float) -> SummaryStatistics:\n\"\"\"Filter summary statistics based on the provided p-value threshold.\n\n Args:\n pvalue (float): upper limit of the p-value to be filtered upon.\n\n Returns:\n SummaryStatistics: summary statistics object containing single point associations with p-values at least as significant as the provided threshold.\n \"\"\"\n # Converting p-value to mantissa and exponent:\n (mantissa, exponent) = split_pvalue(pvalue)\n\n # Applying filter:\n df = self._df.filter(\n (f.col(\"pValueExponent\") < exponent)\n | (\n (f.col(\"pValueExponent\") == exponent)\n & (f.col(\"pValueMantissa\") <= mantissa)\n )\n )\n return SummaryStatistics(_df=df)\n
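The split_pvalue helper is internal; a plausible sketch of the mantissa/exponent split it performs, and how the two-part comparison above then behaves for a 5e-8 threshold:
import math\n\ndef split_pvalue(pvalue: float) -> tuple[float, int]:\n    # e.g. 5e-8 -> mantissa 5.0, exponent -8\n    exponent = math.floor(math.log10(pvalue))\n    return pvalue / 10**exponent, exponent\n\nmantissa, exponent = split_pvalue(5e-8)\nprint(round(mantissa, 6), exponent)  # 5.0 -8\n# The filter keeps rows with pValueExponent < -8, or\n# pValueExponent == -8 and pValueMantissa <= 5.0.\n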
Dataset with variant-level annotations derived from gnomAD.
Source code in src/otg/dataset/variant_annotation.py
@dataclass\nclass VariantAnnotation(Dataset):\n\"\"\"Dataset with variant-level annotations derived from GnomAD.\"\"\"\n\n _schema: StructType = parse_spark_schema(\"variant_annotation.json\")\n\n @classmethod\n def from_parquet(\n cls: type[VariantAnnotation], session: Session, path: str\n ) -> VariantAnnotation:\n\"\"\"Initialise VariantAnnotation from parquet file.\n\n Args:\n session (Session): ETL session\n path (str): Path to parquet file\n\n Returns:\n VariantAnnotation: VariantAnnotation dataset\n \"\"\"\n df = session.read_parquet(path=path, schema=cls._schema)\n return cls(_df=df, _schema=cls._schema)\n\n @classmethod\n def from_gnomad(\n cls: type[VariantAnnotation],\n gnomad_file: str,\n grch38_to_grch37_chain: str,\n populations: list,\n ) -> VariantAnnotation:\n\"\"\"Generate variant annotation dataset from gnomAD.\n\n Some relevant modifications to the original dataset are:\n\n 1. The transcript consequences features provided by VEP are filtered to only refer to the Ensembl canonical transcript.\n 2. Genome coordinates are liftovered from GRCh38 to GRCh37 to keep as annotation.\n 3. Field names are converted to camel case to follow the convention.\n\n Args:\n gnomad_file (str): Path to `gnomad.genomes.vX.X.X.sites.ht` gnomAD dataset\n grch38_to_grch37_chain (str): Path to chain file for liftover\n populations (list): List of populations to include in the dataset\n\n Returns:\n VariantAnnotation: Variant annotation dataset\n \"\"\"\n # Load variants dataset\n ht = hl.read_table(\n gnomad_file,\n _load_refs=False,\n )\n\n # Liftover\n grch37 = hl.get_reference(\"GRCh37\")\n grch38 = hl.get_reference(\"GRCh38\")\n grch38.add_liftover(grch38_to_grch37_chain, grch37)\n\n # Drop non biallelic variants\n ht = ht.filter(ht.alleles.length() == 2)\n # Liftover\n ht = ht.annotate(locus_GRCh37=hl.liftover(ht.locus, \"GRCh37\"))\n # Select relevant fields and nested records to create class\n return cls(\n _df=(\n ht.select(\n gnomad3VariantId=hl.str(\"-\").join(\n [\n ht.locus.contig.replace(\"chr\", \"\"),\n hl.str(ht.locus.position),\n ht.alleles[0],\n ht.alleles[1],\n ]\n ),\n chromosome=ht.locus.contig.replace(\"chr\", \"\"),\n position=convert_gnomad_position_to_ensembl_hail(\n ht.locus.position, ht.alleles[0], ht.alleles[1]\n ),\n variantId=hl.str(\"_\").join(\n [\n ht.locus.contig.replace(\"chr\", \"\"),\n hl.str(\n convert_gnomad_position_to_ensembl_hail(\n ht.locus.position, ht.alleles[0], ht.alleles[1]\n )\n ),\n ht.alleles[0],\n ht.alleles[1],\n ]\n ),\n chromosomeB37=ht.locus_GRCh37.contig.replace(\"chr\", \"\"),\n positionB37=ht.locus_GRCh37.position,\n referenceAllele=ht.alleles[0],\n alternateAllele=ht.alleles[1],\n rsIds=ht.rsid,\n alleleType=ht.allele_info.allele_type,\n cadd=hl.struct(\n phred=ht.cadd.phred,\n raw=ht.cadd.raw_score,\n ),\n alleleFrequencies=hl.set([f\"{pop}-adj\" for pop in populations]).map(\n lambda p: hl.struct(\n populationName=p,\n alleleFrequency=ht.freq[ht.globals.freq_index_dict[p]].AF,\n )\n ),\n vep=hl.struct(\n mostSevereConsequence=ht.vep.most_severe_consequence,\n transcriptConsequences=hl.map(\n lambda x: hl.struct(\n aminoAcids=x.amino_acids,\n consequenceTerms=x.consequence_terms,\n geneId=x.gene_id,\n lof=x.lof,\n polyphenScore=x.polyphen_score,\n polyphenPrediction=x.polyphen_prediction,\n siftScore=x.sift_score,\n siftPrediction=x.sift_prediction,\n ),\n # Only keeping canonical transcripts\n ht.vep.transcript_consequences.filter(\n lambda x: (x.canonical == 1)\n & (x.gene_symbol_source == \"HGNC\")\n ),\n ),\n ),\n )\n 
.key_by(\"chromosome\", \"position\")\n .drop(\"locus\", \"alleles\")\n .select_globals()\n .to_spark(flatten=False)\n )\n )\n\n def persist(self: VariantAnnotation) -> VariantAnnotation:\n\"\"\"Persist DataFrame included in the Dataset.\"\"\"\n self.df = self._df.persist()\n return self\n\n def max_maf(self: VariantAnnotation) -> Column:\n\"\"\"Maximum minor allele frequency accross all populations.\n\n Returns:\n Column: Maximum minor allele frequency accross all populations.\n \"\"\"\n return f.array_max(\n f.transform(\n self.df.alleleFrequencies,\n lambda af: f.when(\n af.alleleFrequency > 0.5, 1 - af.alleleFrequency\n ).otherwise(af.alleleFrequency),\n )\n )\n\n def filter_by_variant_df(\n self: VariantAnnotation, df: DataFrame, cols: list[str]\n ) -> VariantAnnotation:\n\"\"\"Filter variant annotation dataset by a variant dataframe.\n\n Args:\n df (DataFrame): A dataframe of variants\n cols (List[str]): A list of columns to join on\n\n Returns:\n VariantAnnotation: A filtered variant annotation dataset\n \"\"\"\n self.df = self._df.join(f.broadcast(df.select(cols)), on=cols, how=\"inner\")\n return self\n\n def get_transcript_consequence_df(\n self: VariantAnnotation, filter_by: Optional[GeneIndex] = None\n ) -> DataFrame:\n\"\"\"Dataframe of exploded transcript consequences.\n\n Optionally the trancript consequences can be reduced to the universe of a gene index.\n\n Args:\n filter_by (GeneIndex): A gene index. Defaults to None.\n\n Returns:\n DataFrame: A dataframe exploded by transcript consequences with the columns variantId, chromosome, transcriptConsequence\n \"\"\"\n # exploding the array removes records without VEP annotation\n transript_consequences = self.df.withColumn(\n \"transcriptConsequence\", f.explode(\"vep.transcriptConsequences\")\n ).select(\n \"variantId\",\n \"chromosome\",\n \"position\",\n \"transcriptConsequence\",\n f.col(\"transcriptConsequence.geneId\").alias(\"geneId\"),\n )\n if filter_by:\n transript_consequences = transript_consequences.join(\n f.broadcast(filter_by.df),\n on=[\"chromosome\", \"geneId\"],\n )\n return transript_consequences.persist()\n\n def get_most_severe_vep_v2g(\n self: VariantAnnotation,\n vep_consequences: DataFrame,\n filter_by: GeneIndex,\n ) -> V2G:\n\"\"\"Creates a dataset with variant to gene assignments based on VEP's predicted consequence on the transcript.\n\n Optionally the trancript consequences can be reduced to the universe of a gene index.\n\n Args:\n vep_consequences (DataFrame): A dataframe of VEP consequences\n filter_by (GeneIndex): A gene index to filter by. 
\n\n Returns:\n V2G: High and medium severity variant to gene assignments\n \"\"\"\n vep_lut = vep_consequences.select(\n f.element_at(f.split(\"Accession\", r\"/\"), -1).alias(\n \"variantFunctionalConsequenceId\"\n ),\n f.col(\"Term\").alias(\"label\"),\n f.col(\"v2g_score\").cast(\"double\").alias(\"score\"),\n )\n\n return V2G(\n _df=self.get_transcript_consequence_df(filter_by)\n .select(\n \"variantId\",\n \"chromosome\",\n \"position\",\n f.col(\"transcriptConsequence.geneId\").alias(\"geneId\"),\n f.explode(\"transcriptConsequence.consequenceTerms\").alias(\"label\"),\n f.lit(\"vep\").alias(\"datatypeId\"),\n f.lit(\"variantConsequence\").alias(\"datasourceId\"),\n )\n # A variant can have multiple predicted consequences on a transcript, the most severe one is selected\n .join(\n f.broadcast(vep_lut),\n on=\"label\",\n how=\"inner\",\n )\n .filter(f.col(\"score\") != 0)\n .transform(\n lambda df: get_record_with_maximum_value(\n df, [\"variantId\", \"geneId\"], \"score\"\n )\n )\n )\n\n def get_polyphen_v2g(\n self: VariantAnnotation, filter_by: Optional[GeneIndex] = None\n ) -> V2G:\n\"\"\"Creates a dataset with variant to gene assignments with PolyPhen's predicted score on the transcript.\n\n PolyPhen informs about the probability that a substitution is damaging. Optionally the transcript consequences can be reduced to the universe of a gene index.\n\n Args:\n filter_by (GeneIndex): A gene index to filter by. Defaults to None.\n\n Returns:\n V2G: variant to gene assignments with their polyphen scores\n \"\"\"\n return V2G(\n _df=self.get_transcript_consequence_df(filter_by)\n .filter(f.col(\"transcriptConsequence.polyphenScore\").isNotNull())\n .select(\n \"variantId\",\n \"chromosome\",\n \"position\",\n \"geneId\",\n f.col(\"transcriptConsequence.polyphenScore\").alias(\"score\"),\n f.col(\"transcriptConsequence.polyphenPrediction\").alias(\"label\"),\n f.lit(\"vep\").alias(\"datatypeId\"),\n f.lit(\"polyphen\").alias(\"datasourceId\"),\n )\n )\n\n def get_sift_v2g(self: VariantAnnotation, filter_by: GeneIndex) -> V2G:\n\"\"\"Creates a dataset with variant to gene assignments with SIFT's predicted score on the transcript.\n\n SIFT informs about the probability that a substitution is tolerated, so scores nearer zero are more likely to be deleterious.\n Optionally the transcript consequences can be reduced to the universe of a gene index.\n\n Args:\n filter_by (GeneIndex): A gene index to filter by.\n\n Returns:\n V2G: variant to gene assignments with their SIFT scores\n \"\"\"\n return V2G(\n _df=self.get_transcript_consequence_df(filter_by)\n .filter(f.col(\"transcriptConsequence.siftScore\").isNotNull())\n .select(\n \"variantId\",\n \"chromosome\",\n \"position\",\n \"geneId\",\n f.expr(\"1 - transcriptConsequence.siftScore\").alias(\"score\"),\n f.col(\"transcriptConsequence.siftPrediction\").alias(\"label\"),\n f.lit(\"vep\").alias(\"datatypeId\"),\n f.lit(\"sift\").alias(\"datasourceId\"),\n )\n )\n\n def get_plof_v2g(self: VariantAnnotation, filter_by: GeneIndex) -> V2G:\n\"\"\"Creates a dataset with variant to gene assignments with a flag indicating if the variant is predicted to be a loss-of-function variant by the LOFTEE algorithm.\n\n Optionally the transcript consequences can be reduced to the universe of a gene index.\n\n Args:\n filter_by (GeneIndex): A gene index to filter by.\n\n Returns:\n V2G: variant to gene assignments from the LOFTEE algorithm\n \"\"\"\n return V2G(\n _df=self.get_transcript_consequence_df(filter_by)\n 
.filter(f.col(\"transcriptConsequence.lof\").isNotNull())\n .withColumn(\n \"isHighQualityPlof\",\n f.when(f.col(\"transcriptConsequence.lof\") == \"HC\", True).when(\n f.col(\"transcriptConsequence.lof\") == \"LC\", False\n ),\n )\n .withColumn(\n \"score\",\n f.when(f.col(\"isHighQualityPlof\"), 1.0).when(\n ~f.col(\"isHighQualityPlof\"), 0\n ),\n )\n .select(\n \"variantId\",\n \"chromosome\",\n \"position\",\n \"geneId\",\n \"isHighQualityPlof\",\n f.col(\"score\"),\n f.lit(\"vep\").alias(\"datatypeId\"),\n f.lit(\"loftee\").alias(\"datasourceId\"),\n )\n )\n\n def get_distance_to_tss(\n self: VariantAnnotation,\n filter_by: GeneIndex,\n max_distance: int = 500_000,\n ) -> V2G:\n\"\"\"Extracts variant to gene assignments for variants falling within a window of a gene's TSS.\n\n Args:\n filter_by (GeneIndex): A gene index to filter by.\n max_distance (int): The maximum distance from the TSS to consider. Defaults to 500_000.\n\n Returns:\n V2G: variant to gene assignments with their distance to the TSS\n \"\"\"\n return V2G(\n _df=self.df.alias(\"variant\")\n .join(\n f.broadcast(filter_by.locations_lut()).alias(\"gene\"),\n on=[\n f.col(\"variant.chromosome\") == f.col(\"gene.chromosome\"),\n f.abs(f.col(\"variant.position\") - f.col(\"gene.tss\"))\n <= max_distance,\n ],\n how=\"inner\",\n )\n .withColumn(\n \"inverse_distance\",\n max_distance - f.abs(f.col(\"variant.position\") - f.col(\"gene.tss\")),\n )\n .transform(lambda df: normalise_column(df, \"inverse_distance\", \"score\"))\n .select(\n \"variantId\",\n f.col(\"variant.chromosome\").alias(\"chromosome\"),\n \"position\",\n \"geneId\",\n \"score\",\n f.lit(\"distance\").alias(\"datatypeId\"),\n f.lit(\"canonical_tss\").alias(\"datasourceId\"),\n )\n )\n
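A toy illustration of the max_maf column defined in this class: frequencies above 0.5 are folded to their minor-allele complement before taking the maximum (hand-built toy data, no Hail needed):
from pyspark.sql import SparkSession\nfrom pyspark.sql import functions as f\n\nspark = SparkSession.builder.getOrCreate()\n\ndf = spark.createDataFrame(\n    [(\"1_100_A_T\", [(\"nfe-adj\", 0.9), (\"afr-adj\", 0.2)])],\n    \"variantId string, alleleFrequencies array<struct<populationName:string,alleleFrequency:double>>\",\n)\nmax_maf = f.array_max(\n    f.transform(\n        f.col(\"alleleFrequencies\"),\n        lambda af: f.when(af.alleleFrequency > 0.5, 1 - af.alleleFrequency).otherwise(\n            af.alleleFrequency\n        ),\n    )\n)\ndf.select(\"variantId\", max_maf.alias(\"maxMaf\")).show()  # 0.9 folds to 0.1; result is 0.2\n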
Extracts variant to gene assignments for variants falling within a window of a gene's TSS.
Parameters:
filter_by (GeneIndex): A gene index to filter by (required)
max_distance (int): The maximum distance from the TSS to consider. Defaults to 500_000 (default: 500000)
Returns:
V2G: variant to gene assignments with their distance to the TSS
Source code in src/otg/dataset/variant_annotation.py
def get_distance_to_tss(\n self: VariantAnnotation,\n filter_by: GeneIndex,\n max_distance: int = 500_000,\n) -> V2G:\n\"\"\"Extracts variant to gene assignments for variants falling within a window of a gene's TSS.\n\n Args:\n filter_by (GeneIndex): A gene index to filter by.\n max_distance (int): The maximum distance from the TSS to consider. Defaults to 500_000.\n\n Returns:\n V2G: variant to gene assignments with their distance to the TSS\n \"\"\"\n return V2G(\n _df=self.df.alias(\"variant\")\n .join(\n f.broadcast(filter_by.locations_lut()).alias(\"gene\"),\n on=[\n f.col(\"variant.chromosome\") == f.col(\"gene.chromosome\"),\n f.abs(f.col(\"variant.position\") - f.col(\"gene.tss\"))\n <= max_distance,\n ],\n how=\"inner\",\n )\n .withColumn(\n \"inverse_distance\",\n max_distance - f.abs(f.col(\"variant.position\") - f.col(\"gene.tss\")),\n )\n .transform(lambda df: normalise_column(df, \"inverse_distance\", \"score\"))\n .select(\n \"variantId\",\n f.col(\"variant.chromosome\").alias(\"chromosome\"),\n \"position\",\n \"geneId\",\n \"score\",\n f.lit(\"distance\").alias(\"datatypeId\"),\n f.lit(\"canonical_tss\").alias(\"datasourceId\"),\n )\n )\n
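The normalise_column helper is internal; min-max scaling of the inverted distance is one plausible reading of it, sketched here on two toy variants around a single TSS:
from pyspark.sql import SparkSession\nfrom pyspark.sql import functions as f\n\nspark = SparkSession.builder.getOrCreate()\nmax_distance = 500_000\n\nscored = spark.createDataFrame(\n    [(\"v1\", 10_000), (\"v2\", 400_000)], [\"variantId\", \"distance\"]\n).withColumn(\"inverse_distance\", max_distance - f.col(\"distance\"))\nbounds = scored.agg(\n    f.min(\"inverse_distance\").alias(\"lo\"), f.max(\"inverse_distance\").alias(\"hi\")\n).first()\nscored = scored.withColumn(\n    \"score\", (f.col(\"inverse_distance\") - bounds.lo) / (bounds.hi - bounds.lo)\n)\nscored.show()  # v1, closest to the TSS, scores 1.0; v2 scores 0.0\n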
Creates a dataset with variant to gene assignments based on VEP's predicted consequence on the transcript.
Optionally the transcript consequences can be reduced to the universe of a gene index.
Parameters:
vep_consequences (DataFrame): A dataframe of VEP consequences (required)
filter_by (GeneIndex): A gene index to filter by (required)
Returns:
V2G: High and medium severity variant to gene assignments
Source code in src/otg/dataset/variant_annotation.py
def get_most_severe_vep_v2g(\n self: VariantAnnotation,\n vep_consequences: DataFrame,\n filter_by: GeneIndex,\n) -> V2G:\n\"\"\"Creates a dataset with variant to gene assignments based on VEP's predicted consequence on the transcript.\n\n Optionally the transcript consequences can be reduced to the universe of a gene index.\n\n Args:\n vep_consequences (DataFrame): A dataframe of VEP consequences\n filter_by (GeneIndex): A gene index to filter by.\n\n Returns:\n V2G: High and medium severity variant to gene assignments\n \"\"\"\n vep_lut = vep_consequences.select(\n f.element_at(f.split(\"Accession\", r\"/\"), -1).alias(\n \"variantFunctionalConsequenceId\"\n ),\n f.col(\"Term\").alias(\"label\"),\n f.col(\"v2g_score\").cast(\"double\").alias(\"score\"),\n )\n\n return V2G(\n _df=self.get_transcript_consequence_df(filter_by)\n .select(\n \"variantId\",\n \"chromosome\",\n \"position\",\n f.col(\"transcriptConsequence.geneId\").alias(\"geneId\"),\n f.explode(\"transcriptConsequence.consequenceTerms\").alias(\"label\"),\n f.lit(\"vep\").alias(\"datatypeId\"),\n f.lit(\"variantConsequence\").alias(\"datasourceId\"),\n )\n # A variant can have multiple predicted consequences on a transcript, the most severe one is selected\n .join(\n f.broadcast(vep_lut),\n on=\"label\",\n how=\"inner\",\n )\n .filter(f.col(\"score\") != 0)\n .transform(\n lambda df: get_record_with_maximum_value(\n df, [\"variantId\", \"geneId\"], \"score\"\n )\n )\n )\n
Creates a dataset with variant to gene assignments with a flag indicating if the variant is predicted to be a loss-of-function variant by the LOFTEE algorithm.
Optionally the transcript consequences can be reduced to the universe of a gene index.
Parameters:
filter_by (GeneIndex): A gene index to filter by (required)
Returns:
V2G: variant to gene assignments from the LOFTEE algorithm
Source code in src/otg/dataset/variant_annotation.py
def get_plof_v2g(self: VariantAnnotation, filter_by: GeneIndex) -> V2G:\n\"\"\"Creates a dataset with variant to gene assignments with a flag indicating if the variant is predicted to be a loss-of-function variant by the LOFTEE algorithm.\n\n Optionally the transcript consequences can be reduced to the universe of a gene index.\n\n Args:\n filter_by (GeneIndex): A gene index to filter by.\n\n Returns:\n V2G: variant to gene assignments from the LOFTEE algorithm\n \"\"\"\n return V2G(\n _df=self.get_transcript_consequence_df(filter_by)\n .filter(f.col(\"transcriptConsequence.lof\").isNotNull())\n .withColumn(\n \"isHighQualityPlof\",\n f.when(f.col(\"transcriptConsequence.lof\") == \"HC\", True).when(\n f.col(\"transcriptConsequence.lof\") == \"LC\", False\n ),\n )\n .withColumn(\n \"score\",\n f.when(f.col(\"isHighQualityPlof\"), 1.0).when(\n ~f.col(\"isHighQualityPlof\"), 0\n ),\n )\n .select(\n \"variantId\",\n \"chromosome\",\n \"position\",\n \"geneId\",\n \"isHighQualityPlof\",\n f.col(\"score\"),\n f.lit(\"vep\").alias(\"datatypeId\"),\n f.lit(\"loftee\").alias(\"datasourceId\"),\n )\n )\n
Creates a dataset with variant to gene assignments with PolyPhen's predicted score on the transcript.
PolyPhen informs about the probability that a substitution is damaging. Optionally the transcript consequences can be reduced to the universe of a gene index.
Parameters:
filter_by (GeneIndex): A gene index to filter by (default: None)
Returns:
V2G: variant to gene assignments with their polyphen scores
Source code in src/otg/dataset/variant_annotation.py
def get_polyphen_v2g(\n self: VariantAnnotation, filter_by: Optional[GeneIndex] = None\n) -> V2G:\n\"\"\"Creates a dataset with variant to gene assignments with PolyPhen's predicted score on the transcript.\n\n PolyPhen informs about the probability that a substitution is damaging. Optionally the transcript consequences can be reduced to the universe of a gene index.\n\n Args:\n filter_by (GeneIndex): A gene index to filter by. Defaults to None.\n\n Returns:\n V2G: variant to gene assignments with their polyphen scores\n \"\"\"\n return V2G(\n _df=self.get_transcript_consequence_df(filter_by)\n .filter(f.col(\"transcriptConsequence.polyphenScore\").isNotNull())\n .select(\n \"variantId\",\n \"chromosome\",\n \"position\",\n \"geneId\",\n f.col(\"transcriptConsequence.polyphenScore\").alias(\"score\"),\n f.col(\"transcriptConsequence.polyphenPrediction\").alias(\"label\"),\n f.lit(\"vep\").alias(\"datatypeId\"),\n f.lit(\"polyphen\").alias(\"datasourceId\"),\n )\n )\n
Creates a dataset with variant to gene assignments with SIFT's predicted score on the transcript.
SIFT informs about the probability that a substitution is tolerated, so scores nearer zero are more likely to be deleterious. Optionally the transcript consequences can be reduced to the universe of a gene index.
Parameters:
filter_by (GeneIndex): A gene index to filter by (required)
Returns:
V2G: variant to gene assignments with their SIFT scores
Source code in src/otg/dataset/variant_annotation.py
def get_sift_v2g(self: VariantAnnotation, filter_by: GeneIndex) -> V2G:\n\"\"\"Creates a dataset with variant to gene assignments with SIFT's predicted score on the transcript.\n\n SIFT informs about the probability that a substitution is tolerated, so scores nearer zero are more likely to be deleterious.\n Optionally the transcript consequences can be reduced to the universe of a gene index.\n\n Args:\n filter_by (GeneIndex): A gene index to filter by.\n\n Returns:\n V2G: variant to gene assignments with their SIFT scores\n \"\"\"\n return V2G(\n _df=self.get_transcript_consequence_df(filter_by)\n .filter(f.col(\"transcriptConsequence.siftScore\").isNotNull())\n .select(\n \"variantId\",\n \"chromosome\",\n \"position\",\n \"geneId\",\n f.expr(\"1 - transcriptConsequence.siftScore\").alias(\"score\"),\n f.col(\"transcriptConsequence.siftPrediction\").alias(\"label\"),\n f.lit(\"vep\").alias(\"datatypeId\"),\n f.lit(\"sift\").alias(\"datasourceId\"),\n )\n )\n
Optionally the transcript consequences can be reduced to the universe of a gene index.
Parameters:
filter_by (GeneIndex): A gene index (default: None)
Returns:
DataFrame: A dataframe exploded by transcript consequences with the columns variantId, chromosome, transcriptConsequence
Source code in src/otg/dataset/variant_annotation.py
def get_transcript_consequence_df(\n self: VariantAnnotation, filter_by: Optional[GeneIndex] = None\n) -> DataFrame:\n\"\"\"Dataframe of exploded transcript consequences.\n\n Optionally the transcript consequences can be reduced to the universe of a gene index.\n\n Args:\n filter_by (GeneIndex): A gene index. Defaults to None.\n\n Returns:\n DataFrame: A dataframe exploded by transcript consequences with the columns variantId, chromosome, transcriptConsequence\n \"\"\"\n # exploding the array removes records without VEP annotation\n transcript_consequences = self.df.withColumn(\n \"transcriptConsequence\", f.explode(\"vep.transcriptConsequences\")\n ).select(\n \"variantId\",\n \"chromosome\",\n \"position\",\n \"transcriptConsequence\",\n f.col(\"transcriptConsequence.geneId\").alias(\"geneId\"),\n )\n if filter_by:\n transcript_consequences = transcript_consequences.join(\n f.broadcast(filter_by.df),\n on=[\"chromosome\", \"geneId\"],\n )\n return transcript_consequences.persist()\n
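A toy run of the explode pattern above on a hand-built nested column; one input row fans out to one row per transcript consequence, and rows with no consequences would disappear:
from pyspark.sql import SparkSession\nfrom pyspark.sql import functions as f\n\nspark = SparkSession.builder.getOrCreate()\n\ndf = spark.createDataFrame(\n    [(\"1_100_A_T\", \"1\", [(\"ENSG01\",), (\"ENSG02\",)])],\n    \"variantId string, chromosome string, transcriptConsequences array<struct<geneId:string>>\",\n)\nexploded = df.withColumn(\n    \"transcriptConsequence\", f.explode(\"transcriptConsequences\")\n).select(\n    \"variantId\",\n    \"chromosome\",\n    f.col(\"transcriptConsequence.geneId\").alias(\"geneId\"),\n)\nexploded.show()  # two rows: ENSG01 and ENSG02\n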
Source code in src/otg/dataset/variant_annotation.py
def persist(self: VariantAnnotation) -> VariantAnnotation:\n\"\"\"Persist DataFrame included in the Dataset.\"\"\"\n self.df = self._df.persist()\n return self\n
def persist(self: VariantIndex) -> VariantIndex:\n\"\"\"Persist DataFrame included in the Dataset.\"\"\"\n self.df = self._df.persist()\n return self\n
"},{"location":"components/dataset/variant_to_gene/","title":"Variant to gene","text":"
Bases: Dataset
Variant-to-gene (V2G) evidence dataset.
A variant-to-gene (V2G) evidence is understood as any piece of evidence that supports the association of a variant with a likely causal gene. The evidence can sometimes be context-specific and refer to specific biofeatures (e.g. cell types).
Source code in src/otg/dataset/v2g.py
@dataclass\nclass V2G(Dataset):\n\"\"\"Variant-to-gene (V2G) evidence dataset.\n\n A variant-to-gene (V2G) evidence is understood as any piece of evidence that supports the association of a variant with a likely causal gene. The evidence can sometimes be context-specific and refer to specific `biofeatures` (e.g. cell types)\n \"\"\"\n\n _schema: StructType = parse_spark_schema(\"v2g.json\")\n\n @classmethod\n def from_parquet(cls: type[V2G], session: Session, path: str) -> V2G:\n\"\"\"Initialise V2G from parquet file.\n\n Args:\n session (Session): ETL session\n path (str): Path to parquet file\n\n Returns:\n V2G: V2G dataset\n \"\"\"\n df = session.read_parquet(path=path, schema=cls._schema)\n return cls(_df=df, _schema=cls._schema)\n\n def filter_by_genes(self: V2G, genes: GeneIndex) -> V2G:\n\"\"\"Filter by V2G dataset by genes.\n\n Args:\n genes (GeneIndex): Gene index dataset to filter by\n\n Returns:\n V2G: V2G dataset filtered by genes\n \"\"\"\n self.df = self._df.join(genes.df.select(\"geneId\"), on=\"geneId\", how=\"inner\")\n return self\n
A study index dataset captures all the metadata for all studies including GWAS and Molecular QTL.
Source code in src/otg/dataset/study_index.py
@dataclass\nclass StudyIndex(Dataset):\n\"\"\"Study index dataset.\n\n A study index dataset captures all the metadata for all studies including GWAS and Molecular QTL.\n \"\"\"\n\n _schema: StructType = parse_spark_schema(\"studies.json\")\n\n @classmethod\n def from_parquet(cls: type[StudyIndex], session: Session, path: str) -> StudyIndex:\n\"\"\"Initialise StudyIndex from parquet file.\n\n Args:\n session (Session): ETL session\n path (str): Path to parquet file\n\n Returns:\n StudyIndex: Study index dataset\n \"\"\"\n df = session.read_parquet(path=path, schema=cls._schema)\n return cls(_df=df, _schema=cls._schema)\n\n def study_type_lut(self: StudyIndex) -> DataFrame:\n\"\"\"Return a lookup table of study type.\n\n Returns:\n DataFrame: A dataframe containing `studyId` and `studyType` columns.\n \"\"\"\n return self.df.select(\"studyId\", \"studyType\")\n
A dataframe containing studyId and studyType columns.
Source code in src/otg/dataset/study_index.py
def study_type_lut(self: StudyIndex) -> DataFrame:\n\"\"\"Return a lookup table of study type.\n\n Returns:\n DataFrame: A dataframe containing `studyId` and `studyType` columns.\n \"\"\"\n return self.df.select(\"studyId\", \"studyType\")\n
"},{"location":"components/dataset/study_index/study_index_finngen/","title":"Study index finngen","text":"
Bases: StudyIndex
Study index dataset from FinnGen.
The following information is aggregated/extracted:
Study ID in the special format (FINNGEN_R9_*)
Trait name (for example, Amoebiasis)
Number of cases and controls
Link to the summary statistics location
Some fields are also populated as constants, such as study type and the initial sample size.
Source code in src/otg/dataset/study_index.py
@dataclass\nclass StudyIndexFinnGen(StudyIndex):\n\"\"\"Study index dataset from FinnGen.\n\n The following information is aggregated/extracted:\n\n - Study ID in the special format (FINNGEN_R9_*)\n - Trait name (for example, Amoebiasis)\n - Number of cases and controls\n - Link to the summary statistics location\n\n Some fields are also populated as constants, such as study type and the initial sample size.\n \"\"\"\n\n @classmethod\n def from_source(\n cls: type[StudyIndexFinnGen],\n finngen_studies: DataFrame,\n finngen_release_prefix: str,\n finngen_sumstat_url_prefix: str,\n finngen_sumstat_url_suffix: str,\n ) -> StudyIndexFinnGen:\n\"\"\"This function ingests study-level metadata from FinnGen.\n\n Args:\n finngen_studies (DataFrame): FinnGen raw study table\n finngen_release_prefix (str): Release prefix pattern.\n finngen_sumstat_url_prefix (str): URL prefix for summary statistics location.\n finngen_sumstat_url_suffix (str): URL suffix for summary statistics location.\n\n Returns:\n StudyIndexFinnGen: Parsed and annotated FinnGen study table.\n \"\"\"\n return cls(\n _df=(\n # Read FinnGen raw data.\n finngen_studies.select(\n # Select the desired columns.\n f.concat(\n f.lit(finngen_release_prefix + \"_\"), f.col(\"phenocode\")\n ).alias(\"studyId\"),\n f.col(\"phenostring\").alias(\"traitFromSource\"),\n f.col(\"num_cases\").alias(\"nCases\"),\n f.col(\"num_controls\").alias(\"nControls\"),\n # Set constant value columns.\n f.lit(finngen_release_prefix).alias(\"projectId\"),\n f.lit(\"gwas\").alias(\"studyType\"),\n f.lit(True).alias(\"hasSumstats\"),\n f.lit(\"377,277 (210,870 females and 166,407 males)\").alias(\n \"initialSampleSize\"\n ),\n )\n .withColumn(\"nSamples\", f.col(\"nCases\") + f.col(\"nControls\"))\n .withColumn(\n \"summarystatsLocation\",\n f.concat(\n f.lit(finngen_sumstat_url_prefix),\n f.col(\"studyId\"),\n f.lit(finngen_sumstat_url_suffix),\n ),\n )\n )\n )\n
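A standalone sketch of the core of this transformation on a single made-up manifest row (constant columns trimmed for brevity); the release prefix mirrors the FINNGEN_R9_* format mentioned above, while the URL prefix and suffix are placeholder values, not real locations.

from pyspark.sql import SparkSession
import pyspark.sql.functions as f

spark = SparkSession.builder.getOrCreate()

# Toy FinnGen manifest row; the real input has many more fields.
raw = spark.createDataFrame(
    [("AB1_AMOEBIASIS", "Amoebiasis", 119, 341340)],
    ["phenocode", "phenostring", "num_cases", "num_controls"],
)

release_prefix = "FINNGEN_R9"        # mirrors the studyId format above
url_prefix = "gs://example-bucket/"  # hypothetical location
url_suffix = ".gz"                   # hypothetical suffix

(raw.select(
    f.concat(f.lit(release_prefix + "_"), f.col("phenocode")).alias("studyId"),
    f.col("phenostring").alias("traitFromSource"),
    f.col("num_cases").alias("nCases"),
    f.col("num_controls").alias("nControls"),
 )
 .withColumn("nSamples", f.col("nCases") + f.col("nControls"))
 .withColumn(
     "summarystatsLocation",
     f.concat(f.lit(url_prefix), f.col("studyId"), f.lit(url_suffix)),
 )
 .show(truncate=False))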
This dataset captures associations between studies/traits and genetic loci, as provided by fine-mapping methods.
Source code in src/otg/dataset/study_locus.py
@dataclass\nclass StudyLocus(Dataset):\n\"\"\"Study-Locus dataset.\n\n This dataset captures associations between studies/traits and genetic loci, as provided by fine-mapping methods.\n \"\"\"\n\n _schema: StructType = parse_spark_schema(\"study_locus.json\")\n\n @staticmethod\n def _overlapping_peaks(credset_to_overlap: DataFrame) -> DataFrame:\n\"\"\"Calculate overlapping signals (study-locus) for GWAS-GWAS and GWAS-molecular trait pairs.\n\n Args:\n credset_to_overlap (DataFrame): DataFrame containing at least `studyLocusId`, `studyType`, `chromosome` and `tagVariantId` columns.\n\n Returns:\n DataFrame: containing `left_studyLocusId`, `right_studyLocusId` and `chromosome` columns.\n \"\"\"\n # Reduce columns to the minimum to reduce the size of the dataframe\n credset_to_overlap = credset_to_overlap.select(\n \"studyLocusId\", \"studyType\", \"chromosome\", \"tagVariantId\"\n )\n return (\n credset_to_overlap.alias(\"left\")\n .filter(f.col(\"studyType\") == \"gwas\")\n # Self-join with complex condition: the left side is all GWAS; the right side can be GWAS or molecular trait\n .join(\n credset_to_overlap.alias(\"right\"),\n on=[\n f.col(\"left.chromosome\") == f.col(\"right.chromosome\"),\n f.col(\"left.tagVariantId\") == f.col(\"right.tagVariantId\"),\n (f.col(\"right.studyType\") != \"gwas\")\n | (f.col(\"left.studyLocusId\") > f.col(\"right.studyLocusId\")),\n ],\n how=\"inner\",\n )\n .select(\n f.col(\"left.studyLocusId\").alias(\"left_studyLocusId\"),\n f.col(\"right.studyLocusId\").alias(\"right_studyLocusId\"),\n f.col(\"left.chromosome\").alias(\"chromosome\"),\n )\n .distinct()\n .repartition(\"chromosome\")\n .persist()\n )\n\n @staticmethod\n def _align_overlapping_tags(\n credset_to_overlap: DataFrame, peak_overlaps: DataFrame\n ) -> StudyLocusOverlap:\n\"\"\"Align overlapping tags in pairs of overlapping study-locus, keeping all tags in both loci.\n\n Args:\n credset_to_overlap (DataFrame): containing `studyLocusId`, `studyType`, `chromosome`, `tagVariantId`, `logABF` and `posteriorProbability` columns.\n peak_overlaps (DataFrame): containing `left_studyLocusId`, `right_studyLocusId` and `chromosome` columns.\n\n Returns:\n StudyLocusOverlap: Pairs of overlapping study-locus with aligned tags.\n \"\"\"\n # Complete information about all tags in the left study-locus of the overlap\n overlapping_left = credset_to_overlap.select(\n f.col(\"chromosome\"),\n f.col(\"tagVariantId\"),\n f.col(\"studyLocusId\").alias(\"left_studyLocusId\"),\n f.col(\"logABF\").alias(\"left_logABF\"),\n f.col(\"posteriorProbability\").alias(\"left_posteriorProbability\"),\n ).join(peak_overlaps, on=[\"chromosome\", \"left_studyLocusId\"], how=\"inner\")\n\n # Complete information about all tags in the right study-locus of the overlap\n overlapping_right = credset_to_overlap.select(\n f.col(\"chromosome\"),\n f.col(\"tagVariantId\"),\n f.col(\"studyLocusId\").alias(\"right_studyLocusId\"),\n f.col(\"logABF\").alias(\"right_logABF\"),\n f.col(\"posteriorProbability\").alias(\"right_posteriorProbability\"),\n ).join(peak_overlaps, on=[\"chromosome\", \"right_studyLocusId\"], how=\"inner\")\n\n # Include information about all tag variants in both study-locus aligned by tag variant id\n return StudyLocusOverlap(\n _df=overlapping_left.join(\n overlapping_right,\n on=[\n \"chromosome\",\n \"right_studyLocusId\",\n \"left_studyLocusId\",\n \"tagVariantId\",\n ],\n how=\"outer\",\n )\n # ensures nullable=false for following columns\n .fillna(\n value=\"unknown\",\n subset=[\n \"chromosome\",\n \"right_studyLocusId\",\n 
\"left_studyLocusId\",\n \"tagVariantId\",\n ],\n )\n )\n\n @staticmethod\n def _update_quality_flag(\n qc: Column, flag_condition: Column, flag_text: StudyLocusQualityCheck\n ) -> Column:\n\"\"\"Update the provided quality control list with a new flag if condition is met.\n\n Args:\n qc (Column): Array column with the current list of qc flags.\n flag_condition (Column): This is a column of booleans, signing which row should be flagged\n flag_text (StudyLocusQualityCheck): Text for the new quality control flag\n\n Returns:\n Column: Array column with the updated list of qc flags.\n \"\"\"\n qc = f.when(qc.isNull(), f.array()).otherwise(qc)\n return f.when(\n flag_condition,\n f.array_union(qc, f.array(f.lit(flag_text.value))),\n ).otherwise(qc)\n\n @classmethod\n def from_parquet(cls: type[StudyLocus], session: Session, path: str) -> StudyLocus:\n\"\"\"Initialise StudyLocus from parquet file.\n\n Args:\n session (Session): spark session\n path (str): Path to parquet file\n\n Returns:\n StudyLocus: Study-locus dataset\n \"\"\"\n df = session.read_parquet(path=path, schema=cls._schema)\n return cls(_df=df, _schema=cls._schema)\n\n def credible_set(\n self: StudyLocus,\n credible_interval: CredibleInterval,\n ) -> StudyLocus:\n\"\"\"Filter study-locus tag variants based on given credible interval.\n\n Args:\n credible_interval (CredibleInterval): Credible interval to filter for.\n\n Returns:\n StudyLocus: Filtered study-locus dataset.\n \"\"\"\n self.df = self._df.withColumn(\n \"credibleSet\",\n f.expr(f\"filter(credibleSet, tag -> (tag.{credible_interval.value}))\"),\n )\n return self\n\n def overlaps(self: StudyLocus, study_index: StudyIndex) -> StudyLocusOverlap:\n\"\"\"Calculate overlapping study-locus.\n\n Find overlapping study-locus that share at least one tagging variant. 
All GWAS-GWAS and GWAS-molecular trait overlaps are computed, with the molecular traits always\n appearing on the right side.\n\n Args:\n study_index (StudyIndex): Study index to resolve study types.\n\n Returns:\n StudyLocusOverlap: Pairs of overlapping study-locus with aligned tags.\n \"\"\"\n credset_to_overlap = (\n self.df.join(study_index.study_type_lut(), on=\"studyId\", how=\"inner\")\n .withColumn(\"credibleSet\", f.explode(\"credibleSet\"))\n .select(\n \"studyLocusId\",\n \"studyType\",\n \"chromosome\",\n f.col(\"credibleSet.tagVariantId\").alias(\"tagVariantId\"),\n f.col(\"credibleSet.logABF\").alias(\"logABF\"),\n f.col(\"credibleSet.posteriorProbability\").alias(\"posteriorProbability\"),\n )\n .persist()\n )\n\n # overlapping study-locus\n peak_overlaps = self._overlapping_peaks(credset_to_overlap)\n\n # study-locus overlap by aligning overlapping variants\n return self._align_overlapping_tags(credset_to_overlap, peak_overlaps)\n\n def unique_lead_tag_variants(self: StudyLocus) -> DataFrame:\n\"\"\"All unique lead and tag variants contained in the `StudyLocus` dataframe.\n\n Returns:\n DataFrame: A dataframe containing `variantId` and `chromosome` columns.\n \"\"\"\n lead_tags = (\n self.df.select(\n f.col(\"variantId\"),\n f.col(\"chromosome\"),\n f.explode(\"credibleSet.tagVariantId\").alias(\"tagVariantId\"),\n )\n .repartition(\"chromosome\")\n .persist()\n )\n return (\n lead_tags.select(\"variantId\", \"chromosome\")\n .union(\n lead_tags.select(f.col(\"tagVariantId\").alias(\"variantId\"), \"chromosome\")\n )\n .distinct()\n )\n\n def unique_study_locus_ancestries(\n self: StudyLocus, studies: StudyIndexGWASCatalog\n ) -> DataFrame:\n\"\"\"All unique lead variants and ancestries contained in the `StudyLocus`.\n\n Args:\n studies (StudyIndexGWASCatalog): Metadata about studies in the `StudyLocus`.\n\n Returns:\n DataFrame: unique [\"variantId\", \"studyId\", \"gnomadPopulation\", \"chromosome\", \"relativeSampleSize\"]\n\n Note:\n This method is only available for GWAS Catalog studies.\n \"\"\"\n return (\n self.df.join(\n studies.get_gnomad_ancestry_sample_sizes(), on=\"studyId\", how=\"left\"\n )\n .filter(f.col(\"position\").isNotNull())\n .select(\n \"variantId\",\n \"chromosome\",\n \"studyId\",\n \"gnomadPopulation\",\n \"relativeSampleSize\",\n )\n .distinct()\n )\n\n def neglog_pvalue(self: StudyLocus) -> Column:\n\"\"\"Returns the negative log p-value.\n\n Returns:\n Column: Negative log p-value\n \"\"\"\n return calculate_neglog_pvalue(\n self.df.pValueMantissa,\n self.df.pValueExponent,\n )\n\n def annotate_credible_sets(self: StudyLocus) -> StudyLocus:\n\"\"\"Annotate study-locus dataset with credible set flags.\n\n Sorts the array in the `credibleSet` column elements by their `posteriorProbability` values in descending order and adds\n `is95CredibleSet` and `is99CredibleSet` fields to the elements, indicating which are the tagging variants whose cumulative sum\n of their `posteriorProbability` values is below 0.95 and 0.99, respectively.\n\n Returns:\n StudyLocus: including annotation on `is95CredibleSet` and `is99CredibleSet`.\n \"\"\"\n self.df = self.df.withColumn(\n # Sort credible set by posterior probability in descending order\n \"credibleSet\",\n f.when(\n f.size(f.col(\"credibleSet\")) > 0,\n order_array_of_structs_by_field(\"credibleSet\", \"posteriorProbability\"),\n ).when(f.size(f.col(\"credibleSet\")) == 0, f.col(\"credibleSet\")),\n ).withColumn(\n # Calculate array of cumulative sums of posterior probabilities to determine which variants are in 
the 95% and 99% credible sets\n # and zip the cumulative sums array with the credible set array to add the flags\n \"credibleSet\",\n f.when(\n f.size(f.col(\"credibleSet\")) > 0,\n f.zip_with(\n f.col(\"credibleSet\"),\n f.transform(\n f.sequence(f.lit(1), f.size(f.col(\"credibleSet\"))),\n lambda index: f.aggregate(\n f.slice(\n # By using `index - 1` we introduce a value of `0.0` in the cumulative sums array, so that the variant\n # that pushes the sum over the 0.95 threshold is still included in the credible set.\n f.col(\"credibleSet.posteriorProbability\"),\n 1,\n index - 1,\n ),\n f.lit(0.0),\n lambda acc, el: acc + el,\n ),\n ),\n lambda struct_e, acc: struct_e.withField(\n CredibleInterval.IS95.value, acc < 0.95\n ).withField(CredibleInterval.IS99.value, acc < 0.99),\n ),\n ).when(f.size(f.col(\"credibleSet\")) == 0, f.col(\"credibleSet\")),\n )\n return self\n\n def clump(self: StudyLocus) -> StudyLocus:\n\"\"\"Perform LD clumping of the studyLocus.\n\n Evaluates whether a lead variant is linked to a tag (with lowest p-value) in the same studyLocus dataset.\n\n Returns:\n StudyLocus: with empty credible sets for linked variants and QC flag.\n \"\"\"\n self.df = (\n self.df.withColumn(\n \"is_lead_linked\",\n LDclumping._is_lead_linked(\n self.df.studyId,\n self.df.variantId,\n self.df.pValueExponent,\n self.df.pValueMantissa,\n self.df.credibleSet,\n ),\n )\n .withColumn(\n \"credibleSet\",\n f.when(f.col(\"is_lead_linked\"), f.array()).otherwise(\n f.col(\"credibleSet\")\n ),\n )\n .withColumn(\n \"qualityControls\",\n StudyLocus._update_quality_flag(\n f.col(\"qualityControls\"),\n f.col(\"is_lead_linked\"),\n StudyLocusQualityCheck.LD_CLUMPED,\n ),\n )\n .drop(\"is_lead_linked\")\n )\n return self\n
Annotate study-locus dataset with credible set flags.
Sorts the array in the credibleSet column elements by their posteriorProbability values in descending order and adds is95CredibleSet and is99CredibleSet fields to the elements, indicating which are the tagging variants whose cumulative sum of their posteriorProbability values is below 0.95 and 0.99, respectively.
Returns:
StudyLocus: including annotation on is95CredibleSet and is99CredibleSet.
Source code in src/otg/dataset/study_locus.py
def annotate_credible_sets(self: StudyLocus) -> StudyLocus:\n\"\"\"Annotate study-locus dataset with credible set flags.\n\n Sorts the array in the `credibleSet` column elements by their `posteriorProbability` values in descending order and adds\n `is95CredibleSet` and `is99CredibleSet` fields to the elements, indicating which are the tagging variants whose cumulative sum\n of their `posteriorProbability` values is below 0.95 and 0.99, respectively.\n\n Returns:\n StudyLocus: including annotation on `is95CredibleSet` and `is99CredibleSet`.\n \"\"\"\n self.df = self.df.withColumn(\n # Sort credible set by posterior probability in descending order\n \"credibleSet\",\n f.when(\n f.size(f.col(\"credibleSet\")) > 0,\n order_array_of_structs_by_field(\"credibleSet\", \"posteriorProbability\"),\n ).when(f.size(f.col(\"credibleSet\")) == 0, f.col(\"credibleSet\")),\n ).withColumn(\n # Calculate array of cumulative sums of posterior probabilities to determine which variants are in the 95% and 99% credible sets\n # and zip the cumulative sums array with the credible set array to add the flags\n \"credibleSet\",\n f.when(\n f.size(f.col(\"credibleSet\")) > 0,\n f.zip_with(\n f.col(\"credibleSet\"),\n f.transform(\n f.sequence(f.lit(1), f.size(f.col(\"credibleSet\"))),\n lambda index: f.aggregate(\n f.slice(\n # By using `index - 1` we introduce a value of `0.0` in the cumulative sums array, so that the variant\n # that pushes the sum over the 0.95 threshold is still included in the credible set.\n f.col(\"credibleSet.posteriorProbability\"),\n 1,\n index - 1,\n ),\n f.lit(0.0),\n lambda acc, el: acc + el,\n ),\n ),\n lambda struct_e, acc: struct_e.withField(\n CredibleInterval.IS95.value, acc < 0.95\n ).withField(CredibleInterval.IS99.value, acc < 0.99),\n ),\n ).when(f.size(f.col(\"credibleSet\")) == 0, f.col(\"credibleSet\")),\n )\n return self\n
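The index - 1 slicing trick is easier to see in plain Python. A minimal sketch with invented posterior probabilities: the cumulative sum is taken over the elements before each variant, so the variant that pushes the sum over the threshold is itself still flagged.

# Invented posterior probabilities for the tags of one locus, unsorted.
posteriors = [0.07, 0.6, 0.3, 0.025, 0.005]

# Step 1: sort descending, as order_array_of_structs_by_field does.
sorted_pp = sorted(posteriors, reverse=True)

# Step 2: cumulative sum over the *preceding* elements (the `index - 1`
# slice), so the tag that crosses the threshold is still included.
flags, cumsum = [], 0.0
for pp in sorted_pp:
    flags.append((pp, cumsum < 0.95, cumsum < 0.99))
    cumsum += pp

for pp, is95, is99 in flags:
    print(f"pp={pp:.3f} is95CredibleSet={is95} is99CredibleSet={is99}")
# The first three tags (0.6, 0.3, 0.07) form the 95% credible set;
# 0.025 is additionally in the 99% set; 0.005 is in neither.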
Find overlapping study-locus that share at least one tagging variant. All GWAS-GWAS and GWAS-molecular trait overlaps are computed, with the molecular traits always appearing on the right side.
Parameters:
study_index (StudyIndex): Study index to resolve study types. Required.
Returns:
StudyLocusOverlap: Pairs of overlapping study-locus with aligned tags.
Source code in src/otg/dataset/study_locus.py
def overlaps(self: StudyLocus, study_index: StudyIndex) -> StudyLocusOverlap:\n\"\"\"Calculate overlapping study-locus.\n\n Find overlapping study-locus that share at least one tagging variant. All GWAS-GWAS and GWAS-molecular trait overlaps are computed, with the molecular traits always\n appearing on the right side.\n\n Args:\n study_index (StudyIndex): Study index to resolve study types.\n\n Returns:\n StudyLocusOverlap: Pairs of overlapping study-locus with aligned tags.\n \"\"\"\n credset_to_overlap = (\n self.df.join(study_index.study_type_lut(), on=\"studyId\", how=\"inner\")\n .withColumn(\"credibleSet\", f.explode(\"credibleSet\"))\n .select(\n \"studyLocusId\",\n \"studyType\",\n \"chromosome\",\n f.col(\"credibleSet.tagVariantId\").alias(\"tagVariantId\"),\n f.col(\"credibleSet.logABF\").alias(\"logABF\"),\n f.col(\"credibleSet.posteriorProbability\").alias(\"posteriorProbability\"),\n )\n .persist()\n )\n\n # overlapping study-locus\n peak_overlaps = self._overlapping_peaks(credset_to_overlap)\n\n # study-locus overlap by aligning overlapping variants\n return self._align_overlapping_tags(credset_to_overlap, peak_overlaps)\n
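The self-join condition is the subtle part: molecular-trait loci always land on the right, and GWAS-GWAS pairs are deduplicated by requiring left.studyLocusId > right.studyLocusId. A minimal sketch on three invented loci sharing one tag variant:

from pyspark.sql import SparkSession
import pyspark.sql.functions as f

spark = SparkSession.builder.getOrCreate()

credset = spark.createDataFrame(
    [(1, "gwas", "1", "1_100_A_T"),
     (2, "gwas", "1", "1_100_A_T"),
     (3, "eqtl", "1", "1_100_A_T")],
    ["studyLocusId", "studyType", "chromosome", "tagVariantId"],
)

overlaps = (
    credset.alias("left")
    .filter(f.col("studyType") == "gwas")
    .join(
        credset.alias("right"),
        on=[
            f.col("left.chromosome") == f.col("right.chromosome"),
            f.col("left.tagVariantId") == f.col("right.tagVariantId"),
            # molecular traits always land on the right; GWAS-GWAS pairs
            # are deduplicated by requiring left id > right id
            (f.col("right.studyType") != "gwas")
            | (f.col("left.studyLocusId") > f.col("right.studyLocusId")),
        ],
        how="inner",
    )
    .select(
        f.col("left.studyLocusId").alias("left_studyLocusId"),
        f.col("right.studyLocusId").alias("right_studyLocusId"),
    )
    .distinct()
)
overlaps.show()
# Expected pairs: (2, 1) for GWAS-GWAS and (1, 3), (2, 3) for GWAS-eQTL.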
Study-Locus quality control options listing concerns about the quality of the association.
Attributes:
SUBSIGNIFICANT_FLAG (str): p-value does not reach the significance threshold
NO_GENOMIC_LOCATION_FLAG (str): Incomplete genomic mapping
COMPOSITE_FLAG (str): Composite association due to variant x variant interactions
INCONSISTENCY_FLAG (str): Inconsistencies in the reported variants
NON_MAPPED_VARIANT_FLAG (str): Variant not mapped to gnomAD
PALINDROMIC_ALLELE_FLAG (str): Alleles are palindromic - cannot harmonize
AMBIGUOUS_STUDY (str): Association with ambiguous study
UNRESOLVED_LD (str): Variant not found in LD reference
LD_CLUMPED (str): Explained by a more significant variant in high LD (clumped)
Source code in src/otg/dataset/study_locus.py
class StudyLocusQualityCheck(Enum):\n\"\"\"Study-Locus quality control options listing concerns about the quality of the association.\n\n Attributes:\n SUBSIGNIFICANT_FLAG (str): p-value does not reach the significance threshold\n NO_GENOMIC_LOCATION_FLAG (str): Incomplete genomic mapping\n COMPOSITE_FLAG (str): Composite association due to variant x variant interactions\n INCONSISTENCY_FLAG (str): Inconsistencies in the reported variants\n NON_MAPPED_VARIANT_FLAG (str): Variant not mapped to gnomAD\n PALINDROMIC_ALLELE_FLAG (str): Alleles are palindromic - cannot harmonize\n AMBIGUOUS_STUDY (str): Association with ambiguous study\n UNRESOLVED_LD (str): Variant not found in LD reference\n LD_CLUMPED (str): Explained by a more significant variant in high LD (clumped)\n \"\"\"\n\n SUBSIGNIFICANT_FLAG = \"Subsignificant p-value\"\n NO_GENOMIC_LOCATION_FLAG = \"Incomplete genomic mapping\"\n COMPOSITE_FLAG = \"Composite association\"\n INCONSISTENCY_FLAG = \"Variant inconsistency\"\n NON_MAPPED_VARIANT_FLAG = \"No mapping in GnomAd\"\n PALINDROMIC_ALLELE_FLAG = \"Palindrome alleles - cannot harmonize\"\n AMBIGUOUS_STUDY = \"Association with ambiguous study\"\n UNRESOLVED_LD = \"Variant not found in LD reference\"\n LD_CLUMPED = \"Explained by a more significant variant in high LD (clumped)\"\n
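A minimal sketch of the _update_quality_flag pattern on toy data: a null QC array is first normalised to an empty array, then the flag text is appended only where the failure condition holds. The flag value reuses StudyLocusQualityCheck.COMPOSITE_FLAG from above; the failed column is invented.

from pyspark.sql import SparkSession
import pyspark.sql.functions as f

spark = SparkSession.builder.getOrCreate()

df = spark.createDataFrame(
    [(None, True), (["Subsignificant p-value"], True), (None, False)],
    "qc array<string>, failed boolean",
)

flag = "Composite association"  # StudyLocusQualityCheck.COMPOSITE_FLAG.value

# Normalise null QC arrays to empty arrays, then append the flag where needed.
qc = f.when(f.col("qc").isNull(), f.array()).otherwise(f.col("qc"))
df.withColumn(
    "qc",
    f.when(f.col("failed"), f.array_union(qc, f.array(f.lit(flag)))).otherwise(qc),
).show(truncate=False)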
Interval within which an unobserved parameter value falls with a particular probability.
Attributes:
IS95 (str): 95% credible interval
IS99 (str): 99% credible interval
Source code in src/otg/dataset/study_locus.py
class CredibleInterval(Enum):\n\"\"\"Credible interval enum.\n\n Interval within which an unobserved parameter value falls with a particular probability.\n\n Attributes:\n IS95 (str): 95% credible interval\n IS99 (str): 99% credible interval\n \"\"\"\n\n IS95 = \"is95CredibleSet\"\n IS99 = \"is99CredibleSet\"\n
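A minimal sketch of how these enum values drive StudyLocus.credible_set: the member value is the struct field name, so filtering is a higher-order filter over the credibleSet array (toy single-row data, invented variant ids).

from pyspark.sql import SparkSession
import pyspark.sql.functions as f

spark = SparkSession.builder.getOrCreate()

df = spark.createDataFrame(
    [([("1_100_A_T", True, True), ("1_105_G_C", False, True)],)],
    "credibleSet array<struct<tagVariantId:string,is95CredibleSet:boolean,is99CredibleSet:boolean>>",
)

# Keep only tags inside the chosen interval, as StudyLocus.credible_set does
# with CredibleInterval.IS95.value == "is95CredibleSet".
df.withColumn(
    "credibleSet",
    f.expr("filter(credibleSet, tag -> tag.is95CredibleSet)"),
).show(truncate=False)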
"},{"location":"components/dataset/study_locus/study_locus_gwas_catalog/","title":"Study locus gwas catalog","text":"
Bases: StudyLocus
Study-locus dataset derived from GWAS Catalog.
Source code in src/otg/dataset/study_locus.py
class StudyLocusGWASCatalog(StudyLocus):\n\"\"\"Study-locus dataset derived from GWAS Catalog.\"\"\"\n\n @staticmethod\n def _parse_pvalue(pvalue: Column) -> tuple[Column, Column]:\n\"\"\"Parse p-value column.\n\n Args:\n pvalue (Column): p-value [string]\n\n Returns:\n tuple[Column, Column]: p-value mantissa and exponent\n\n Example:\n >>> import pyspark.sql.types as t\n >>> d = [(\"1.0\"), (\"0.5\"), (\"1E-20\"), (\"3E-3\"), (\"1E-1000\")]\n >>> df = spark.createDataFrame(d, t.StringType())\n >>> df.select('value',*StudyLocusGWASCatalog._parse_pvalue(f.col('value'))).show()\n +-------+--------------+--------------+\n | value|pValueMantissa|pValueExponent|\n +-------+--------------+--------------+\n | 1.0| 1.0| 1|\n | 0.5| 0.5| 1|\n | 1E-20| 1.0| -20|\n | 3E-3| 3.0| -3|\n |1E-1000| 1.0| -1000|\n +-------+--------------+--------------+\n <BLANKLINE>\n\n \"\"\"\n split = f.split(pvalue, \"E\")\n return split.getItem(0).cast(\"float\").alias(\"pValueMantissa\"), f.coalesce(\n split.getItem(1).cast(\"integer\"), f.lit(1)\n ).alias(\"pValueExponent\")\n\n @staticmethod\n def _normalise_pvaluetext(p_value_text: Column) -> Column:\n\"\"\"Normalised p-value text column to a standardised format.\n\n For cases where there is no mapping, the value is set to null.\n\n Args:\n p_value_text (Column): `pValueText` column from GWASCatalog\n\n Returns:\n Column: Array column after using GWAS Catalog mappings. There might be multiple mappings for a single p-value text.\n\n Example:\n >>> import pyspark.sql.types as t\n >>> d = [(\"European Ancestry\"), (\"African ancestry\"), (\"Alzheimer\u2019s Disease\"), (\"(progression)\"), (\"\"), (None)]\n >>> df = spark.createDataFrame(d, t.StringType())\n >>> df.withColumn('normalised', StudyLocusGWASCatalog._normalise_pvaluetext(f.col('value'))).show()\n +-------------------+----------+\n | value|normalised|\n +-------------------+----------+\n | European Ancestry| [EA]|\n | African ancestry| [AA]|\n |Alzheimer\u2019s Disease| [AD]|\n | (progression)| null|\n | | null|\n | null| null|\n +-------------------+----------+\n <BLANKLINE>\n\n \"\"\"\n # GWAS Catalog to p-value mapping\n json_dict = json.loads(\n pkg_resources.read_text(data, \"gwas_pValueText_map.json\", encoding=\"utf-8\")\n )\n map_expr = f.create_map(*[f.lit(x) for x in chain(*json_dict.items())])\n\n splitted_col = f.split(f.regexp_replace(p_value_text, r\"[\\(\\)]\", \"\"), \",\")\n mapped_col = f.transform(splitted_col, lambda x: map_expr[x])\n return f.when(f.forall(mapped_col, lambda x: x.isNull()), None).otherwise(\n mapped_col\n )\n\n @staticmethod\n def _normalise_risk_allele(risk_allele: Column) -> Column:\n\"\"\"Normalised risk allele column to a standardised format.\n\n If multiple risk alleles are present, the first one is returned.\n\n Args:\n risk_allele (Column): `riskAllele` column from GWASCatalog\n\n Returns:\n Column: mapped using GWAS Catalog mapping\n\n Example:\n >>> import pyspark.sql.types as t\n >>> d = [(\"rs1234-A-G\"), (\"rs1234-A\"), (\"rs1234-A; rs1235-G\")]\n >>> df = spark.createDataFrame(d, t.StringType())\n >>> df.withColumn('normalised', StudyLocusGWASCatalog._normalise_risk_allele(f.col('value'))).show()\n +------------------+----------+\n | value|normalised|\n +------------------+----------+\n | rs1234-A-G| A|\n | rs1234-A| A|\n |rs1234-A; rs1235-G| A|\n +------------------+----------+\n <BLANKLINE>\n\n \"\"\"\n # GWAS Catalog to risk allele mapping\n return f.split(f.split(risk_allele, \"; \").getItem(0), \"-\").getItem(1)\n\n @staticmethod\n def 
_collect_rsids(\n snp_id: Column, snp_id_current: Column, risk_allele: Column\n ) -> Column:\n\"\"\"It takes three columns, and returns an array of distinct values from those columns.\n\n Args:\n snp_id (Column): The original snp id from the GWAS catalog.\n snp_id_current (Column): The current snp id field is just a number at the moment (stored as a string). Adding the 'rs' prefix if it looks valid.\n risk_allele (Column): The risk allele for the SNP.\n\n Returns:\n An array of distinct values.\n \"\"\"\n # The current snp id field is just a number at the moment (stored as a string). Adding the 'rs' prefix if it looks valid.\n snp_id_current = f.when(\n snp_id_current.rlike(\"^[0-9]*$\"),\n f.format_string(\"rs%s\", snp_id_current),\n )\n # Cleaning risk allele:\n risk_allele = f.split(risk_allele, \"-\").getItem(0)\n\n # Collecting all values:\n return f.array_distinct(f.array(snp_id, snp_id_current, risk_allele))\n\n @staticmethod\n def _map_to_variant_annotation_variants(\n gwas_associations: DataFrame, variant_annotation: VariantAnnotation\n ) -> DataFrame:\n\"\"\"Add variant metadata in associations.\n\n Args:\n gwas_associations (DataFrame): raw GWAS Catalog associations\n variant_annotation (VariantAnnotation): variant annotation dataset\n\n Returns:\n DataFrame: GWAS Catalog associations data including `variantId`, `referenceAllele`,\n `alternateAllele`, `chromosome`, `position` with variant metadata\n \"\"\"\n # Subset of GWAS Catalog associations required for resolving variant IDs:\n gwas_associations_subset = gwas_associations.select(\n \"studyLocusId\",\n f.col(\"CHR_ID\").alias(\"chromosome\"),\n f.col(\"CHR_POS\").cast(IntegerType()).alias(\"position\"),\n # List of all SNPs associated with the variant\n StudyLocusGWASCatalog._collect_rsids(\n f.split(f.col(\"SNPS\"), \"; \").getItem(0),\n f.col(\"SNP_ID_CURRENT\"),\n f.split(f.col(\"STRONGEST SNP-RISK ALLELE\"), \"; \").getItem(0),\n ).alias(\"rsIdsGwasCatalog\"),\n StudyLocusGWASCatalog._normalise_risk_allele(\n f.col(\"STRONGEST SNP-RISK ALLELE\")\n ).alias(\"riskAllele\"),\n )\n\n # Subset of variant annotation required for GWAS Catalog annotations:\n va_subset = variant_annotation.df.select(\n \"variantId\",\n \"chromosome\",\n \"position\",\n f.col(\"rsIds\").alias(\"rsIdsGnomad\"),\n \"referenceAllele\",\n \"alternateAllele\",\n \"alleleFrequencies\",\n variant_annotation.max_maf().alias(\"maxMaf\"),\n ).join(\n f.broadcast(\n gwas_associations_subset.select(\"chromosome\", \"position\").distinct()\n ),\n on=[\"chromosome\", \"position\"],\n how=\"inner\",\n )\n\n # Semi-resolved ids (still contains duplicates when a conclusion could not be made\n # based on rsIds or allele concordance)\n filtered_associations = (\n gwas_associations_subset.join(\n f.broadcast(va_subset),\n on=[\"chromosome\", \"position\"],\n how=\"left\",\n )\n .withColumn(\n \"rsIdFilter\",\n StudyLocusGWASCatalog._flag_mappings_to_retain(\n f.col(\"studyLocusId\"),\n StudyLocusGWASCatalog._compare_rsids(\n f.col(\"rsIdsGnomad\"), f.col(\"rsIdsGwasCatalog\")\n ),\n ),\n )\n .withColumn(\n \"concordanceFilter\",\n StudyLocusGWASCatalog._flag_mappings_to_retain(\n f.col(\"studyLocusId\"),\n StudyLocusGWASCatalog._check_concordance(\n f.col(\"riskAllele\"),\n f.col(\"referenceAllele\"),\n f.col(\"alternateAllele\"),\n ),\n ),\n )\n .filter(\n # Filter out rows where the GWAS Catalog rsId does not match the GnomAD rsId,\n # but there is a corresponding variant for the same association\n f.col(\"rsIdFilter\")\n # or filter out rows where GWAS Catalog alleles are not 
concordant with GnomAD alleles,\n # but there is a corresponding variant for the same association\n | f.col(\"concordanceFilter\")\n )\n )\n\n # Keep only highest maxMaf variant per studyLocusId\n fully_mapped_associations = get_record_with_maximum_value(\n filtered_associations, grouping_col=\"studyLocusId\", sorting_col=\"maxMaf\"\n ).select(\n \"studyLocusId\",\n \"variantId\",\n \"referenceAllele\",\n \"alternateAllele\",\n \"chromosome\",\n \"position\",\n )\n\n return gwas_associations.join(\n fully_mapped_associations, on=\"studyLocusId\", how=\"left\"\n )\n\n @staticmethod\n def _compare_rsids(gnomad: Column, gwas: Column) -> Column:\n\"\"\"If the intersection of the two arrays is non-empty, return True, otherwise return False.\n\n Args:\n gnomad (Column): rsids from gnomad\n gwas (Column): rsids from the GWAS Catalog\n\n Returns:\n A boolean column that is true if the GnomAD rsIDs can be found in the GWAS rsIDs.\n\n Examples:\n >>> d = [\n ... (1, [\"rs123\", \"rs523\"], [\"rs123\"]),\n ... (2, [], [\"rs123\"]),\n ... (3, [\"rs123\", \"rs523\"], []),\n ... (4, [], []),\n ... ]\n >>> df = spark.createDataFrame(d, ['associationId', 'gnomad', 'gwas'])\n >>> df.withColumn(\"rsid_matches\", StudyLocusGWASCatalog._compare_rsids(f.col(\"gnomad\"),f.col('gwas'))).show()\n +-------------+--------------+-------+------------+\n |associationId| gnomad| gwas|rsid_matches|\n +-------------+--------------+-------+------------+\n | 1|[rs123, rs523]|[rs123]| true|\n | 2| []|[rs123]| false|\n | 3|[rs123, rs523]| []| false|\n | 4| []| []| false|\n +-------------+--------------+-------+------------+\n <BLANKLINE>\n\n \"\"\"\n return f.when(f.size(f.array_intersect(gnomad, gwas)) > 0, True).otherwise(\n False\n )\n\n @staticmethod\n def _flag_mappings_to_retain(\n association_id: Column, filter_column: Column\n ) -> Column:\n\"\"\"Flagging which mappings to retain for each association.\n\n Some associations have multiple mappings. Some have a matching rsId, others don't. We only\n want to drop the non-matching mappings when a match is available for the given association.\n This logic can be generalised to other measures, e.g. allele concordance.\n\n Args:\n association_id (Column): association identifier column\n filter_column (Column): boolean col indicating to keep a mapping\n\n Returns:\n A column with a boolean value.\n\n Examples:\n >>> d = [\n ... (1, False),\n ... (1, False),\n ... (2, False),\n ... (2, True),\n ... (3, True),\n ... (3, True),\n ... 
]\n >>> df = spark.createDataFrame(d, ['associationId', 'filter'])\n >>> df.withColumn(\"isConcordant\", StudyLocusGWASCatalog._flag_mappings_to_retain(f.col(\"associationId\"),f.col('filter'))).show()\n +-------------+------+------------+\n |associationId|filter|isConcordant|\n +-------------+------+------------+\n | 1| false| true|\n | 1| false| true|\n | 2| false| false|\n | 2| true| true|\n | 3| true| true|\n | 3| true| true|\n +-------------+------+------------+\n <BLANKLINE>\n\n \"\"\"\n w = Window.partitionBy(association_id)\n\n # Generating a boolean column informing if the filter column contains true anywhere for the association:\n aggregated_filter = f.when(\n f.array_contains(f.collect_set(filter_column).over(w), True), True\n ).otherwise(False)\n\n # Generate a filter column:\n return f.when(aggregated_filter & (~filter_column), False).otherwise(True)\n\n @staticmethod\n def _check_concordance(\n risk_allele: Column, reference_allele: Column, alternate_allele: Column\n ) -> Column:\n\"\"\"A function to check if the risk allele is concordant with the alt or ref allele.\n\n If the risk allele is the same as the reference or alternate allele, or if the reverse complement of\n the risk allele is the same as the reference or alternate allele, then the allele is concordant.\n If no mapping is available (ref/alt is null), the function returns True.\n\n Args:\n risk_allele (Column): The allele that is associated with the risk of the disease.\n reference_allele (Column): The reference allele from the GWAS catalog\n alternate_allele (Column): The alternate allele of the variant.\n\n Returns:\n A boolean column that is True if the risk allele is the same as the reference or alternate allele,\n or if the reverse complement of the risk allele is the same as the reference or alternate allele.\n\n Examples:\n >>> d = [\n ... ('A', 'A', 'G'),\n ... ('A', 'T', 'G'),\n ... ('A', 'C', 'G'),\n ... ('A', 'A', '?'),\n ... (None, None, 'A'),\n ... 
]\n >>> df = spark.createDataFrame(d, ['riskAllele', 'referenceAllele', 'alternateAllele'])\n >>> df.withColumn(\"isConcordant\", StudyLocusGWASCatalog._check_concordance(f.col(\"riskAllele\"),f.col('referenceAllele'), f.col('alternateAllele'))).show()\n +----------+---------------+---------------+------------+\n |riskAllele|referenceAllele|alternateAllele|isConcordant|\n +----------+---------------+---------------+------------+\n | A| A| G| true|\n | A| T| G| true|\n | A| C| G| false|\n | A| A| ?| true|\n | null| null| A| true|\n +----------+---------------+---------------+------------+\n <BLANKLINE>\n\n \"\"\"\n # Calculating the reverse complement of the risk allele:\n risk_allele_reverse_complement = f.when(\n risk_allele.rlike(r\"^[ACTG]+$\"),\n f.reverse(f.translate(risk_allele, \"ACTG\", \"TGAC\")),\n ).otherwise(risk_allele)\n\n # Check whether the risk allele or its reverse complement matches the mapped alleles:\n return (\n f.when(\n (risk_allele == reference_allele) | (risk_allele == alternate_allele),\n True,\n )\n # If risk allele is found on the negative strand:\n .when(\n (risk_allele_reverse_complement == reference_allele)\n | (risk_allele_reverse_complement == alternate_allele),\n True,\n )\n # If the risk allele is ambiguous, it is still accepted (this condition could be reconsidered):\n .when(risk_allele == \"?\", True)\n # If the association could not be mapped we keep it:\n .when(reference_allele.isNull(), True)\n # Allele is discordant:\n .otherwise(False)\n )\n\n @staticmethod\n def _get_reverse_complement(allele_col: Column) -> Column:\n\"\"\"A function to return the reverse complement of an allele column.\n\n It takes a string and returns the reverse complement of that string if it's a DNA sequence,\n otherwise it returns the original string. 
The allele is upper-cased before complementing.\n\n Args:\n allele_col (Column): The column containing the allele to reverse complement.\n\n Returns:\n A column that is the reverse complement of the allele column.\n\n Examples:\n >>> d = [{\"allele\": 'A'}, {\"allele\": 'T'},{\"allele\": 'G'}, {\"allele\": 'C'},{\"allele\": 'AC'}, {\"allele\": 'GTaatc'},{\"allele\": '?'}, {\"allele\": None}]\n >>> df = spark.createDataFrame(d)\n >>> df.withColumn(\"revcom_allele\", StudyLocusGWASCatalog._get_reverse_complement(f.col(\"allele\"))).show()\n +------+-------------+\n |allele|revcom_allele|\n +------+-------------+\n | A| T|\n | T| A|\n | G| C|\n | C| G|\n | AC| GT|\n |GTaatc| GATTAC|\n | ?| ?|\n | null| null|\n +------+-------------+\n <BLANKLINE>\n\n \"\"\"\n allele_col = f.upper(allele_col)\n return f.when(\n allele_col.rlike(\"[ACTG]+\"),\n f.reverse(f.translate(allele_col, \"ACTG\", \"TGAC\")),\n ).otherwise(allele_col)\n\n @staticmethod\n def _effect_needs_harmonisation(\n risk_allele: Column, reference_allele: Column\n ) -> Column:\n\"\"\"A function to check if the effect allele needs to be harmonised.\n\n Args:\n risk_allele (Column): Risk allele column\n reference_allele (Column): Reference allele column\n\n Returns:\n A boolean column indicating if the effect allele needs to be harmonised.\n\n Examples:\n >>> d = [{\"risk\": 'A', \"reference\": 'A'}, {\"risk\": 'A', \"reference\": 'T'}, {\"risk\": 'AT', \"reference\": 'TA'}, {\"risk\": 'AT', \"reference\": 'AT'}]\n >>> df = spark.createDataFrame(d)\n >>> df.withColumn(\"needs_harmonisation\", StudyLocusGWASCatalog._effect_needs_harmonisation(f.col(\"risk\"), f.col(\"reference\"))).show()\n +---------+----+-------------------+\n |reference|risk|needs_harmonisation|\n +---------+----+-------------------+\n | A| A| true|\n | T| A| true|\n | TA| AT| false|\n | AT| AT| true|\n +---------+----+-------------------+\n <BLANKLINE>\n\n \"\"\"\n return (risk_allele == reference_allele) | (\n risk_allele\n == StudyLocusGWASCatalog._get_reverse_complement(reference_allele)\n )\n\n @staticmethod\n def _are_alleles_palindromic(\n reference_allele: Column, alternate_allele: Column\n ) -> Column:\n\"\"\"A function to check if the alleles are palindromic.\n\n Args:\n reference_allele (Column): Reference allele column\n alternate_allele (Column): Alternate allele column\n\n Returns:\n A boolean column indicating if the alleles are palindromic.\n\n Examples:\n >>> d = [{\"reference\": 'A', \"alternate\": 'T'}, {\"reference\": 'AT', \"alternate\": 'AG'}, {\"reference\": 'AT', \"alternate\": 'AT'}, {\"reference\": 'CATATG', \"alternate\": 'CATATG'}, {\"reference\": '-', \"alternate\": None}]\n >>> df = spark.createDataFrame(d)\n >>> df.withColumn(\"is_palindromic\", StudyLocusGWASCatalog._are_alleles_palindromic(f.col(\"reference\"), f.col(\"alternate\"))).show()\n +---------+---------+--------------+\n |alternate|reference|is_palindromic|\n +---------+---------+--------------+\n | T| A| true|\n | AG| AT| false|\n | AT| AT| true|\n | CATATG| CATATG| true|\n | null| -| false|\n +---------+---------+--------------+\n <BLANKLINE>\n\n \"\"\"\n revcomp = StudyLocusGWASCatalog._get_reverse_complement(alternate_allele)\n return (\n f.when(reference_allele == revcomp, True)\n .when(revcomp.isNull(), False)\n .otherwise(False)\n )\n\n @staticmethod\n def _harmonise_beta(\n risk_allele: Column,\n reference_allele: Column,\n alternate_allele: Column,\n effect_size: Column,\n confidence_interval: Column,\n ) -> Column:\n\"\"\"A function to extract the beta value from the effect size and 
confidence interval.\n\n If the confidence interval contains the word \"increase\" or \"decrease\" it indicates that we are dealing with betas.\n If it's \"increase\" and the effect size needs to be harmonised, then multiply the effect size by -1.\n\n Args:\n risk_allele (Column): Risk allele column\n reference_allele (Column): Reference allele column\n alternate_allele (Column): Alternate allele column\n effect_size (Column): GWAS Catalog effect size column\n confidence_interval (Column): GWAS Catalog confidence interval column\n\n Returns:\n A column containing the beta value.\n \"\"\"\n return (\n f.when(\n StudyLocusGWASCatalog._are_alleles_palindromic(\n reference_allele, alternate_allele\n ),\n None,\n )\n .when(\n (\n StudyLocusGWASCatalog._effect_needs_harmonisation(\n risk_allele, reference_allele\n )\n & confidence_interval.contains(\"increase\")\n )\n | (\n ~StudyLocusGWASCatalog._effect_needs_harmonisation(\n risk_allele, reference_allele\n )\n & confidence_interval.contains(\"decrease\")\n ),\n -effect_size,\n )\n .otherwise(effect_size)\n .cast(DoubleType())\n )\n\n @staticmethod\n def _harmonise_beta_ci(\n risk_allele: Column,\n reference_allele: Column,\n alternate_allele: Column,\n effect_size: Column,\n confidence_interval: Column,\n p_value: Column,\n direction: str,\n ) -> Column:\n\"\"\"Calculating confidence intervals for beta values.\n\n Args:\n risk_allele (Column): Risk allele column\n reference_allele (Column): Reference allele column\n alternate_allele (Column): Alternate allele column\n effect_size (Column): GWAS Catalog effect size column\n confidence_interval (Column): GWAS Catalog confidence interval column\n p_value (Column): GWAS Catalog p-value column\n direction (str): This is the direction of the confidence interval. It can be either \"upper\" or \"lower\".\n\n Returns:\n The upper and lower bounds of the confidence interval for the beta coefficient.\n \"\"\"\n zscore_95 = f.lit(1.96)\n beta = StudyLocusGWASCatalog._harmonise_beta(\n risk_allele,\n reference_allele,\n alternate_allele,\n effect_size,\n confidence_interval,\n )\n zscore = pvalue_to_zscore(p_value)\n return (\n f.when(f.lit(direction) == \"upper\", beta + f.abs(zscore_95 * beta) / zscore)\n .when(f.lit(direction) == \"lower\", beta - f.abs(zscore_95 * beta) / zscore)\n .otherwise(None)\n )\n\n @staticmethod\n def _harmonise_odds_ratio(\n risk_allele: Column,\n reference_allele: Column,\n alternate_allele: Column,\n effect_size: Column,\n confidence_interval: Column,\n ) -> Column:\n\"\"\"Harmonizing odds ratio.\n\n Args:\n risk_allele (Column): Risk allele column\n reference_allele (Column): Reference allele column\n alternate_allele (Column): Alternate allele column\n effect_size (Column): GWAS Catalog effect size column\n confidence_interval (Column): GWAS Catalog confidence interval column\n\n Returns:\n A column with the odds ratio, or 1/odds_ratio if harmonisation is required.\n \"\"\"\n return (\n f.when(\n StudyLocusGWASCatalog._are_alleles_palindromic(\n reference_allele, alternate_allele\n ),\n None,\n )\n .when(\n (\n StudyLocusGWASCatalog._effect_needs_harmonisation(\n risk_allele, reference_allele\n )\n & ~confidence_interval.rlike(\"|\".join([\"decrease\", \"increase\"]))\n ),\n 1 / effect_size,\n )\n .otherwise(effect_size)\n .cast(DoubleType())\n )\n\n @staticmethod\n def _harmonise_odds_ratio_ci(\n risk_allele: Column,\n reference_allele: Column,\n alternate_allele: Column,\n effect_size: Column,\n confidence_interval: Column,\n p_value: Column,\n direction: str,\n ) -> 
Column:\n\"\"\"Calculating confidence intervals for odds ratio values.\n\n Args:\n risk_allele (Column): Risk allele column\n reference_allele (Column): Reference allele column\n alternate_allele (Column): Alternate allele column\n effect_size (Column): GWAS Catalog effect size column\n confidence_interval (Column): GWAS Catalog confidence interval column\n p_value (Column): GWAS Catalog p-value column\n direction (str): This is the direction of the confidence interval. It can be either \"upper\" or \"lower\".\n\n Returns:\n The upper and lower bounds of the 95% confidence interval for the odds ratio.\n \"\"\"\n zscore_95 = f.lit(1.96)\n odds_ratio = StudyLocusGWASCatalog._harmonise_odds_ratio(\n risk_allele,\n reference_allele,\n alternate_allele,\n effect_size,\n confidence_interval,\n )\n odds_ratio_estimate = f.log(odds_ratio)\n zscore = pvalue_to_zscore(p_value)\n odds_ratio_se = odds_ratio_estimate / zscore\n return f.when(\n f.lit(direction) == \"upper\",\n f.exp(odds_ratio_estimate + f.abs(zscore_95 * odds_ratio_se)),\n ).when(\n f.lit(direction) == \"lower\",\n f.exp(odds_ratio_estimate - f.abs(zscore_95 * odds_ratio_se)),\n )\n\n @staticmethod\n def _concatenate_substudy_description(\n association_trait: Column, pvalue_text: Column, mapped_trait_uri: Column\n ) -> Column:\n\"\"\"Substudy description parsing. Complex string containing metadata about the substudy (e.g. QTL, specific EFO, etc.).\n\n Args:\n association_trait (Column): GWAS Catalog association trait column\n pvalue_text (Column): GWAS Catalog p-value text column\n mapped_trait_uri (Column): GWAS Catalog mapped trait URI column\n\n Returns:\n A column with the substudy description in the shape trait|pvaluetext1_pvaluetext2|EFO1_EFO2.\n\n Examples:\n >>> df = spark.createDataFrame([\n ... (\"Height\", \"http://www.ebi.ac.uk/efo/EFO_0000001,http://www.ebi.ac.uk/efo/EFO_0000002\", \"European Ancestry\"),\n ... (\"Schizophrenia\", \"http://www.ebi.ac.uk/efo/MONDO_0005090\", None)],\n ... [\"association_trait\", \"mapped_trait_uri\", \"pvalue_text\"]\n ... 
)\n >>> df.withColumn('substudy_description', StudyLocusGWASCatalog._concatenate_substudy_description(df.association_trait, df.pvalue_text, df.mapped_trait_uri)).show(truncate=False)\n +-----------------+-------------------------------------------------------------------------+-----------------+------------------------------------------+\n |association_trait|mapped_trait_uri |pvalue_text |substudy_description |\n +-----------------+-------------------------------------------------------------------------+-----------------+------------------------------------------+\n |Height |http://www.ebi.ac.uk/efo/EFO_0000001,http://www.ebi.ac.uk/efo/EFO_0000002|European Ancestry|Height|EA|EFO_0000001/EFO_0000002 |\n |Schizophrenia |http://www.ebi.ac.uk/efo/MONDO_0005090 |null |Schizophrenia|no_pvalue_text|MONDO_0005090|\n +-----------------+-------------------------------------------------------------------------+-----------------+------------------------------------------+\n <BLANKLINE>\n \"\"\"\n p_value_text = f.coalesce(\n StudyLocusGWASCatalog._normalise_pvaluetext(pvalue_text),\n f.array(f.lit(\"no_pvalue_text\")),\n )\n return f.concat_ws(\n \"|\",\n association_trait,\n f.concat_ws(\n \"/\",\n p_value_text,\n ),\n f.concat_ws(\n \"/\",\n parse_efos(mapped_trait_uri),\n ),\n )\n\n @staticmethod\n def _qc_all(\n qc: Column,\n chromosome: Column,\n position: Column,\n reference_allele: Column,\n alternate_allele: Column,\n strongest_snp_risk_allele: Column,\n p_value_mantissa: Column,\n p_value_exponent: Column,\n p_value_cutoff: float,\n ) -> Column:\n\"\"\"Flag associations that fail any QC.\n\n Args:\n qc (Column): QC column\n chromosome (Column): Chromosome column\n position (Column): Position column\n reference_allele (Column): Reference allele column\n alternate_allele (Column): Alternate allele column\n strongest_snp_risk_allele (Column): Strongest SNP risk allele column\n p_value_mantissa (Column): P-value mantissa column\n p_value_exponent (Column): P-value exponent column\n p_value_cutoff (float): P-value cutoff\n\n Returns:\n Column: Updated QC column with flag.\n \"\"\"\n qc = StudyLocusGWASCatalog._qc_variant_interactions(\n qc, strongest_snp_risk_allele\n )\n qc = StudyLocusGWASCatalog._qc_subsignificant_associations(\n qc, p_value_mantissa, p_value_exponent, p_value_cutoff\n )\n qc = StudyLocusGWASCatalog._qc_genomic_location(qc, chromosome, position)\n qc = StudyLocusGWASCatalog._qc_variant_inconsistencies(\n qc, chromosome, position, strongest_snp_risk_allele\n )\n qc = StudyLocusGWASCatalog._qc_unmapped_variants(qc, alternate_allele)\n qc = StudyLocusGWASCatalog._qc_palindromic_alleles(\n qc, reference_allele, alternate_allele\n )\n return qc\n\n @staticmethod\n def _qc_variant_interactions(\n qc: Column, strongest_snp_risk_allele: Column\n ) -> Column:\n\"\"\"Flag associations based on variant x variant interactions.\n\n Args:\n qc (Column): QC column\n strongest_snp_risk_allele (Column): Column with the strongest SNP risk allele\n\n Returns:\n Column: Updated QC column with flag.\n \"\"\"\n return StudyLocusGWASCatalog._update_quality_flag(\n qc,\n strongest_snp_risk_allele.contains(\";\"),\n StudyLocusQualityCheck.COMPOSITE_FLAG,\n )\n\n @staticmethod\n def _qc_subsignificant_associations(\n qc: Column,\n p_value_mantissa: Column,\n p_value_exponent: Column,\n pvalue_cutoff: float,\n ) -> Column:\n\"\"\"Flag associations that do not reach the significance threshold.\n\n Args:\n qc (Column): QC column\n p_value_mantissa (Column): P-value mantissa column\n p_value_exponent (Column): P-value 
exponent column\n pvalue_cutoff (float): association p-value cut-off\n\n Returns:\n Column: Updated QC column with flag.\n\n Examples:\n >>> import pyspark.sql.types as t\n >>> d = [{'qc': None, 'p_value_mantissa': 1, 'p_value_exponent': -7}, {'qc': None, 'p_value_mantissa': 1, 'p_value_exponent': -8}, {'qc': None, 'p_value_mantissa': 5, 'p_value_exponent': -8}, {'qc': None, 'p_value_mantissa': 1, 'p_value_exponent': -9}]\n >>> df = spark.createDataFrame(d, t.StructType([t.StructField('qc', t.ArrayType(t.StringType()), True), t.StructField('p_value_mantissa', t.IntegerType()), t.StructField('p_value_exponent', t.IntegerType())]))\n >>> df.withColumn('qc', StudyLocusGWASCatalog._qc_subsignificant_associations(f.col(\"qc\"), f.col(\"p_value_mantissa\"), f.col(\"p_value_exponent\"), 5e-8)).show(truncate = False)\n +------------------------+----------------+----------------+\n |qc |p_value_mantissa|p_value_exponent|\n +------------------------+----------------+----------------+\n |[Subsignificant p-value]|1 |-7 |\n |[] |1 |-8 |\n |[] |5 |-8 |\n |[] |1 |-9 |\n +------------------------+----------------+----------------+\n <BLANKLINE>\n\n \"\"\"\n return StudyLocus._update_quality_flag(\n qc,\n calculate_neglog_pvalue(p_value_mantissa, p_value_exponent)\n < f.lit(-np.log10(pvalue_cutoff)),\n StudyLocusQualityCheck.SUBSIGNIFICANT_FLAG,\n )\n\n @staticmethod\n def _qc_genomic_location(\n qc: Column, chromosome: Column, position: Column\n ) -> Column:\n\"\"\"Flag associations without genomic location in GWAS Catalog.\n\n Args:\n qc (Column): QC column\n chromosome (Column): Chromosome column in GWAS Catalog\n position (Column): Position column in GWAS Catalog\n\n Returns:\n Column: Updated QC column with flag.\n\n Examples:\n >>> import pyspark.sql.types as t\n >>> d = [{'qc': None, 'chromosome': None, 'position': None}, {'qc': None, 'chromosome': '1', 'position': None}, {'qc': None, 'chromosome': None, 'position': 1}, {'qc': None, 'chromosome': '1', 'position': 1}]\n >>> df = spark.createDataFrame(d, schema=t.StructType([t.StructField('qc', t.ArrayType(t.StringType()), True), t.StructField('chromosome', t.StringType()), t.StructField('position', t.IntegerType())]))\n >>> df.withColumn('qc', StudyLocusGWASCatalog._qc_genomic_location(df.qc, df.chromosome, df.position)).show(truncate=False)\n +----------------------------+----------+--------+\n |qc |chromosome|position|\n +----------------------------+----------+--------+\n |[Incomplete genomic mapping]|null |null |\n |[Incomplete genomic mapping]|1 |null |\n |[Incomplete genomic mapping]|null |1 |\n |[] |1 |1 |\n +----------------------------+----------+--------+\n <BLANKLINE>\n\n \"\"\"\n return StudyLocus._update_quality_flag(\n qc,\n position.isNull() | chromosome.isNull(),\n StudyLocusQualityCheck.NO_GENOMIC_LOCATION_FLAG,\n )\n\n @staticmethod\n def _qc_variant_inconsistencies(\n qc: Column,\n chromosome: Column,\n position: Column,\n strongest_snp_risk_allele: Column,\n ) -> Column:\n\"\"\"Flag associations with inconsistencies in the variant annotation.\n\n Args:\n qc (Column): QC column\n chromosome (Column): Chromosome column in GWAS Catalog\n position (Column): Position column in GWAS Catalog\n strongest_snp_risk_allele (Column): Strongest SNP risk allele column in GWAS Catalog\n\n Returns:\n Column: Updated QC column with flag.\n \"\"\"\n return StudyLocusGWASCatalog._update_quality_flag(\n qc,\n # Number of chromosomes does not correspond to the number of positions:\n (f.size(f.split(chromosome, \";\")) != f.size(f.split(position, 
\";\")))\n # Number of chromosome values different from riskAllele values:\n | (\n f.size(f.split(chromosome, \";\"))\n != f.size(f.split(strongest_snp_risk_allele, \";\"))\n ),\n StudyLocusQualityCheck.INCONSISTENCY_FLAG,\n )\n\n @staticmethod\n def _qc_unmapped_variants(qc: Column, alternate_allele: Column) -> Column:\n\"\"\"Flag associations with variants not mapped to variantAnnotation.\n\n Args:\n qc (Column): QC column\n alternate_allele (Column): alternate allele\n\n Returns:\n Column: Updated QC column with flag.\n\n Example:\n >>> import pyspark.sql.types as t\n >>> d = [{'alternate_allele': 'A', 'qc': None}, {'alternate_allele': None, 'qc': None}]\n >>> schema = t.StructType([t.StructField('alternate_allele', t.StringType(), True), t.StructField('qc', t.ArrayType(t.StringType()), True)])\n >>> df = spark.createDataFrame(data=d, schema=schema)\n >>> df.withColumn(\"new_qc\", StudyLocusGWASCatalog._qc_unmapped_variants(f.col(\"qc\"), f.col(\"alternate_allele\"))).show()\n +----------------+----+--------------------+\n |alternate_allele| qc| new_qc|\n +----------------+----+--------------------+\n | A|null| []|\n | null|null|[No mapping in Gn...|\n +----------------+----+--------------------+\n <BLANKLINE>\n\n \"\"\"\n return StudyLocus._update_quality_flag(\n qc,\n alternate_allele.isNull(),\n StudyLocusQualityCheck.NON_MAPPED_VARIANT_FLAG,\n )\n\n @staticmethod\n def _qc_palindromic_alleles(\n qc: Column, reference_allele: Column, alternate_allele: Column\n ) -> Column:\n\"\"\"Flag associations with palindromic variants which effects can not be harmonised.\n\n Args:\n qc (Column): QC column\n reference_allele (Column): reference allele\n alternate_allele (Column): alternate allele\n\n Returns:\n Column: Updated QC column with flag.\n\n Example:\n >>> import pyspark.sql.types as t\n >>> schema = t.StructType([t.StructField('reference_allele', t.StringType(), True), t.StructField('alternate_allele', t.StringType(), True), t.StructField('qc', t.ArrayType(t.StringType()), True)])\n >>> d = [{'reference_allele': 'A', 'alternate_allele': 'T', 'qc': None}, {'reference_allele': 'AT', 'alternate_allele': 'TA', 'qc': None}, {'reference_allele': 'AT', 'alternate_allele': 'AT', 'qc': None}]\n >>> df = spark.createDataFrame(data=d, schema=schema)\n >>> df.withColumn(\"qc\", StudyLocusGWASCatalog._qc_palindromic_alleles(f.col(\"qc\"), f.col(\"reference_allele\"), f.col(\"alternate_allele\"))).show(truncate=False)\n +----------------+----------------+---------------------------------------+\n |reference_allele|alternate_allele|qc |\n +----------------+----------------+---------------------------------------+\n |A |T |[Palindrome alleles - cannot harmonize]|\n |AT |TA |[] |\n |AT |AT |[Palindrome alleles - cannot harmonize]|\n +----------------+----------------+---------------------------------------+\n <BLANKLINE>\n\n \"\"\"\n return StudyLocus._update_quality_flag(\n qc,\n StudyLocusGWASCatalog._are_alleles_palindromic(\n reference_allele, alternate_allele\n ),\n StudyLocusQualityCheck.PALINDROMIC_ALLELE_FLAG,\n )\n\n @classmethod\n def from_source(\n cls: type[StudyLocusGWASCatalog],\n gwas_associations: DataFrame,\n variant_annotation: VariantAnnotation,\n pvalue_threshold: float = 5e-8,\n ) -> StudyLocusGWASCatalog:\n\"\"\"Read GWASCatalog associations.\n\n It reads the GWAS Catalog association dataset, selects and renames columns, casts columns, and\n applies some pre-defined filters on the data:\n\n Args:\n gwas_associations (DataFrame): GWAS Catalog raw associations dataset\n 
variant_annotation (VariantAnnotation): Variant annotation dataset\n pvalue_threshold (float): P-value threshold for flagging associations\n\n Returns:\n StudyLocusGWASCatalog: StudyLocusGWASCatalog dataset\n \"\"\"\n return cls(\n _df=gwas_associations.withColumn(\n \"studyLocusId\", f.monotonically_increasing_id().cast(LongType())\n )\n .transform(\n # Map/harmonise variants to variant annotation dataset:\n # This function adds columns: variantId, referenceAllele, alternateAllele, chromosome, position\n lambda df: StudyLocusGWASCatalog._map_to_variant_annotation_variants(\n df, variant_annotation\n )\n )\n .withColumn(\n # Perform all quality control checks:\n \"qualityControls\",\n StudyLocusGWASCatalog._qc_all(\n f.array().alias(\"qualityControls\"),\n f.col(\"CHR_ID\"),\n f.col(\"CHR_POS\").cast(IntegerType()),\n f.col(\"referenceAllele\"),\n f.col(\"alternateAllele\"),\n f.col(\"STRONGEST SNP-RISK ALLELE\"),\n *StudyLocusGWASCatalog._parse_pvalue(f.col(\"P-VALUE\")),\n pvalue_threshold,\n ),\n )\n .select(\n # INSIDE STUDY-LOCUS SCHEMA:\n \"studyLocusId\",\n \"variantId\",\n # Mapped genomic location of the variant (; separated list)\n \"chromosome\",\n \"position\",\n f.col(\"STUDY ACCESSION\").alias(\"studyId\"),\n # beta value of the association\n StudyLocusGWASCatalog._harmonise_beta(\n StudyLocusGWASCatalog._normalise_risk_allele(\n f.col(\"STRONGEST SNP-RISK ALLELE\")\n ),\n f.col(\"referenceAllele\"),\n f.col(\"alternateAllele\"),\n f.col(\"OR or BETA\"),\n f.col(\"95% CI (TEXT)\"),\n ).alias(\"beta\"),\n # odds ratio of the association\n StudyLocusGWASCatalog._harmonise_odds_ratio(\n StudyLocusGWASCatalog._normalise_risk_allele(\n f.col(\"STRONGEST SNP-RISK ALLELE\")\n ),\n f.col(\"referenceAllele\"),\n f.col(\"alternateAllele\"),\n f.col(\"OR or BETA\"),\n f.col(\"95% CI (TEXT)\"),\n ).alias(\"oddsRatio\"),\n # CI lower of the beta value\n StudyLocusGWASCatalog._harmonise_beta_ci(\n StudyLocusGWASCatalog._normalise_risk_allele(\n f.col(\"STRONGEST SNP-RISK ALLELE\")\n ),\n f.col(\"referenceAllele\"),\n f.col(\"alternateAllele\"),\n f.col(\"OR or BETA\"),\n f.col(\"95% CI (TEXT)\"),\n f.col(\"P-VALUE\"),\n \"lower\",\n ).alias(\"betaConfidenceIntervalLower\"),\n # CI upper for the beta value\n StudyLocusGWASCatalog._harmonise_beta_ci(\n StudyLocusGWASCatalog._normalise_risk_allele(\n f.col(\"STRONGEST SNP-RISK ALLELE\")\n ),\n f.col(\"referenceAllele\"),\n f.col(\"alternateAllele\"),\n f.col(\"OR or BETA\"),\n f.col(\"95% CI (TEXT)\"),\n f.col(\"P-VALUE\"),\n \"upper\",\n ).alias(\"betaConfidenceIntervalUpper\"),\n # CI lower of the odds ratio value\n StudyLocusGWASCatalog._harmonise_odds_ratio_ci(\n StudyLocusGWASCatalog._normalise_risk_allele(\n f.col(\"STRONGEST SNP-RISK ALLELE\")\n ),\n f.col(\"referenceAllele\"),\n f.col(\"alternateAllele\"),\n f.col(\"OR or BETA\"),\n f.col(\"95% CI (TEXT)\"),\n f.col(\"P-VALUE\"),\n \"lower\",\n ).alias(\"oddsRatioConfidenceIntervalLower\"),\n # CI upper of the odds ratio value\n StudyLocusGWASCatalog._harmonise_odds_ratio_ci(\n StudyLocusGWASCatalog._normalise_risk_allele(\n f.col(\"STRONGEST SNP-RISK ALLELE\")\n ),\n f.col(\"referenceAllele\"),\n f.col(\"alternateAllele\"),\n f.col(\"OR or BETA\"),\n f.col(\"95% CI (TEXT)\"),\n f.col(\"P-VALUE\"),\n \"upper\",\n ).alias(\"oddsRatioConfidenceIntervalUpper\"),\n # p-value of the association, string: split into exponent and mantissa.\n *StudyLocusGWASCatalog._parse_pvalue(f.col(\"P-VALUE\")),\n # Capturing phenotype granularity at the association level\n 
StudyLocusGWASCatalog._concatenate_substudy_description(\n f.col(\"DISEASE/TRAIT\"),\n f.col(\"P-VALUE (TEXT)\"),\n f.col(\"MAPPED_TRAIT_URI\"),\n ).alias(\"subStudyDescription\"),\n # Quality controls (array of strings)\n \"qualityControls\",\n )\n )\n\n def update_study_id(\n self: StudyLocusGWASCatalog, study_annotation: DataFrame\n ) -> StudyLocusGWASCatalog:\n\"\"\"Update studyId with a dataframe containing study.\n\n Args:\n study_annotation (DataFrame): Dataframe containing `updatedStudyId` and key columns `studyId` and `subStudyDescription`.\n\n Returns:\n StudyLocusGWASCatalog: Updated study locus.\n \"\"\"\n self.df = (\n self._df.join(\n study_annotation, on=[\"studyId\", \"subStudyDescription\"], how=\"left\"\n )\n .withColumn(\"studyId\", f.coalesce(\"updatedStudyId\", \"studyId\"))\n .drop(\"subStudyDescription\", \"updatedStudyId\")\n )\n return self\n\n def annotate_ld(\n self: StudyLocusGWASCatalog,\n session: Session,\n studies: StudyIndexGWASCatalog,\n ld_populations: list[str],\n ld_index_template: str,\n ld_matrix_template: str,\n min_r2: float,\n ) -> StudyLocus:\n\"\"\"Annotate LD set for every studyLocus using gnomAD.\n\n Args:\n session (Session): Session\n studies (StudyIndexGWASCatalog): Study index containing ancestry information\n ld_populations (list[str]): List of populations to annotate\n ld_index_template (str): Template path of the LD matrix index containing `{POP}` where the population is expected\n ld_matrix_template (str): Template path of the LD matrix containing `{POP}` where the population is expected\n min_r2 (float): Minimum r2 to include in the LD set\n\n Returns:\n StudyLocus: Study-locus with an annotated credible set.\n \"\"\"\n # TODO: call unique_study_locus_ancestries here so that it is not duplicated with ld_annotation_by_locus_ancestry\n # LD annotation for all unique lead variants in all populations (study independent).\n ld_r = LDAnnotatorGnomad.ld_annotation_by_locus_ancestry(\n session,\n self,\n studies,\n ld_populations,\n ld_index_template,\n ld_matrix_template,\n min_r2,\n ).coalesce(400)\n\n ld_set = (\n self.unique_study_locus_ancestries(studies)\n .join(ld_r, on=[\"chromosome\", \"variantId\", \"gnomadPopulation\"], how=\"left\")\n .withColumn(\"r2\", f.pow(f.col(\"r\"), f.lit(2)))\n .withColumn(\n \"r2Overall\",\n LDAnnotatorGnomad.weighted_r_overall(\n f.col(\"chromosome\"),\n f.col(\"studyId\"),\n f.col(\"variantId\"),\n f.col(\"tagVariantId\"),\n f.col(\"relativeSampleSize\"),\n f.col(\"r2\"),\n ),\n )\n .groupBy(\"chromosome\", \"studyId\", \"variantId\")\n .agg(\n f.collect_set(\n f.when(\n f.col(\"tagVariantId\").isNotNull(),\n f.struct(\"tagVariantId\", \"r2Overall\"),\n )\n ).alias(\"credibleSet\")\n )\n )\n\n self.df = self.df.join(\n ld_set, on=[\"chromosome\", \"studyId\", \"variantId\"], how=\"left\"\n )\n\n return self._qc_unresolved_ld()\n\n def _qc_ambiguous_study(self: StudyLocusGWASCatalog) -> StudyLocusGWASCatalog:\n\"\"\"Flag associations with variants that can not be unambiguously associated with one study.\n\n Returns:\n StudyLocusGWASCatalog: Updated study locus.\n \"\"\"\n assoc_ambiguity_window = Window.partitionBy(\n f.col(\"studyId\"), f.col(\"variantId\")\n )\n\n self._df.withColumn(\n \"qualityControls\",\n StudyLocus._update_quality_flag(\n f.col(\"qualityControls\"),\n f.count(f.col(\"variantId\")).over(assoc_ambiguity_window) > 1,\n StudyLocusQualityCheck.AMBIGUOUS_STUDY,\n ),\n )\n return self\n\n def _qc_unresolved_ld(self: StudyLocusGWASCatalog) -> StudyLocusGWASCatalog:\n\"\"\"Flag 
associations with variants that are not found in the LD reference.\n\n Returns:\n StudyLocusGWASCatalog: Updated study locus.\n \"\"\"\n # Assign the flagged dataframe back so the QC flag persists on the dataset:\n self.df = self._df.withColumn(\n \"qualityControls\",\n StudyLocus._update_quality_flag(\n f.col(\"qualityControls\"),\n f.col(\"credibleSet\").isNull(),\n StudyLocusQualityCheck.UNRESOLVED_LD,\n ),\n )\n return self\n
Annotate LD set for every studyLocus using gnomAD.
Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `session` | `Session` | Session | required |
| `studies` | `StudyIndexGWASCatalog` | Study index containing ancestry information | required |
| `ld_populations` | `list[str]` | List of populations to annotate | required |
| `ld_index_template` | `str` | Template path of the LD matrix index containing `{POP}` where the population is expected | required |
| `ld_matrix_template` | `str` | Template path of the LD matrix containing `{POP}` where the population is expected | required |
| `min_r2` | `float` | Minimum r2 to include in the LD set | required |
Returns:

| Name | Type | Description |
| --- | --- | --- |
| `StudyLocus` | `StudyLocus` | Study-locus with an annotated credible set. |
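As a usage sketch (the variable names, bucket paths, population codes and the `min_r2` cutoff below are illustrative assumptions, not project defaults), the `{POP}` placeholder in both template paths is substituted once per population:

```python
# Assumed pre-existing objects: a Session `session`, a StudyLocusGWASCatalog
# `study_locus`, and a StudyIndexGWASCatalog `study_index`; paths are placeholders.
annotated = study_locus.annotate_ld(
    session=session,
    studies=study_index,
    ld_populations=["nfe", "afr"],  # gnomAD population labels (assumed)
    ld_index_template="gs://my-bucket/ld_index/{POP}",  # "{POP}" replaced per population
    ld_matrix_template="gs://my-bucket/ld_matrix/{POP}.bm",
    min_r2=0.5,  # illustrative cutoff
)
```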
Source code in src/otg/dataset/study_locus.py
def annotate_ld(\n self: StudyLocusGWASCatalog,\n session: Session,\n studies: StudyIndexGWASCatalog,\n ld_populations: list[str],\n ld_index_template: str,\n ld_matrix_template: str,\n min_r2: float,\n) -> StudyLocus:\n\"\"\"Annotate LD set for every studyLocus using gnomAD.\n\n Args:\n session (Session): Session\n studies (StudyIndexGWASCatalog): Study index containing ancestry information\n ld_populations (list[str]): List of populations to annotate\n ld_index_template (str): Template path of the LD matrix index containing `{POP}` where the population is expected\n ld_matrix_template (str): Template path of the LD matrix containing `{POP}` where the population is expected\n min_r2 (float): Minimum r2 to include in the LD set\n\n Returns:\n StudyLocus: Study-locus with an annotated credible set.\n \"\"\"\n # TODO: call unique_study_locus_ancestries here so that it is not duplicated with ld_annotation_by_locus_ancestry\n # LD annotation for all unique lead variants in all populations (study independent).\n ld_r = LDAnnotatorGnomad.ld_annotation_by_locus_ancestry(\n session,\n self,\n studies,\n ld_populations,\n ld_index_template,\n ld_matrix_template,\n min_r2,\n ).coalesce(400)\n\n ld_set = (\n self.unique_study_locus_ancestries(studies)\n .join(ld_r, on=[\"chromosome\", \"variantId\", \"gnomadPopulation\"], how=\"left\")\n .withColumn(\"r2\", f.pow(f.col(\"r\"), f.lit(2)))\n .withColumn(\n \"r2Overall\",\n LDAnnotatorGnomad.weighted_r_overall(\n f.col(\"chromosome\"),\n f.col(\"studyId\"),\n f.col(\"variantId\"),\n f.col(\"tagVariantId\"),\n f.col(\"relativeSampleSize\"),\n f.col(\"r2\"),\n ),\n )\n .groupBy(\"chromosome\", \"studyId\", \"variantId\")\n .agg(\n f.collect_set(\n f.when(\n f.col(\"tagVariantId\").isNotNull(),\n f.struct(\"tagVariantId\", \"r2Overall\"),\n )\n ).alias(\"credibleSet\")\n )\n )\n\n self.df = self.df.join(\n ld_set, on=[\"chromosome\", \"studyId\", \"variantId\"], how=\"left\"\n )\n\n return self._qc_unresolved_ld()\n
Clumping is a commonly used post-processing method that allows for the identification of independent association signals from GWAS summary statistics and curated associations. This process is critical because of the complex linkage disequilibrium (LD) structure in human populations, which can result in multiple statistically significant associations within the same genomic region. Clumping reduces redundancy in GWAS results and ensures that each reported association represents an independent signal.
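As a toy, self-contained illustration of the idea (made-up data; the project's actual implementations operate on the StudyLocus dataset and are shown below), clumping keeps only the most significant association per group of LD-linked variants:

```python
from pyspark.sql import SparkSession, Window
from pyspark.sql import functions as f

spark = SparkSession.builder.getOrCreate()

# Three significant hits: rs1 and rs2 tag the same LD clump, rs3 is independent.
df = spark.createDataFrame(
    [("rs1", "clump_A", 1e-12), ("rs2", "clump_A", 1e-9), ("rs3", "clump_B", 1e-10)],
    ["variantId", "clump", "pValue"],
)

# Rank variants within each clump by p-value and keep only the top hit.
w = Window.partitionBy("clump").orderBy("pValue")
df.withColumn("rank", f.row_number().over(w)).filter(f.col("rank") == 1).show()
```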
We have implemented two clumping methods:
"},{"location":"components/method/clumping/#clumping-based-on-linkage-disequilibrium-ld","title":"Clumping based on Linkage Disequilibrium (LD)","text":"
LD clumping reports the most significant genetic associations in a region in terms of a smaller number of \u201cclumps\u201d of genetically linked SNPs.
Source code in src/otg/method/clump.py
class LDclumping:\n\"\"\"LD clumping reports the most significant genetic associations in a region in terms of a smaller number of \u201cclumps\u201d of genetically linked SNPs.\"\"\"\n\n @staticmethod\n def _is_lead_linked(\n study_id: Column,\n variant_id: Column,\n p_value_exponent: Column,\n p_value_mantissa: Column,\n credible_set: Column,\n ) -> Column:\n\"\"\"Evaluates whether a lead variant is linked to a tag (with lowest p-value) in the same studyLocus dataset.\n\n Args:\n study_id (Column): studyId\n variant_id (Column): Lead variant id\n p_value_exponent (Column): p-value exponent\n p_value_mantissa (Column): p-value mantissa\n credible_set (Column): Credible set <array of structs>\n\n Returns:\n Column: Boolean in which True indicates that the lead is linked to another tag in the same dataset.\n \"\"\"\n leads_in_study = f.collect_set(variant_id).over(Window.partitionBy(study_id))\n tags_in_studylocus = f.array_union(\n # Get all tag variants from the credible set per studyLocusId\n f.transform(credible_set, lambda x: x.tagVariantId),\n # And append the lead variant so that the intersection is the same for all studyLocusIds in a study\n f.array(f.col(\"variantId\")),\n )\n intersect_lead_tags = f.array_sort(\n f.array_intersect(leads_in_study, tags_in_studylocus)\n )\n return (\n # If the lead is in the credible set, we rank the peaks by p-value\n f.when(\n f.size(intersect_lead_tags) > 0,\n f.row_number().over(\n Window.partitionBy(study_id, intersect_lead_tags).orderBy(\n p_value_exponent, p_value_mantissa\n )\n )\n > 1,\n )\n # If the intersection is empty (lead is not in the credible set or cred set is empty), the association is not linked\n .otherwise(f.lit(False))\n )\n\n @classmethod\n def clump(cls: type[LDclumping], associations: StudyLocus) -> StudyLocus:\n\"\"\"Perform clumping on studyLocus dataset.\n\n Args:\n associations (StudyLocus): StudyLocus dataset\n\n Returns:\n StudyLocus: including flag and removing credibleSet information for LD clumped loci.\n \"\"\"\n return associations.clump()\n
Calculate Bayesian colocalisation based on overlapping signals from credible sets.
Based on the R COLOC package, which uses the Bayes factors from the credible set to estimate the posterior probability of colocalisation. This method makes the simplifying assumption that only one single causal variant exists for any given trait in any genomic region.
| Hypothesis | Description |
| --- | --- |
| H0 | no association with either trait in the region |
| H1 | association with trait 1 only |
| H2 | association with trait 2 only |
| H3 | both traits are associated, but have different single causal variants |
| H4 | both traits are associated and share the same single causal variant |
Approximate Bayes factors required
Coloc requires the availability of approximate Bayes factors (ABF) for each variant in the credible set (logABF column).
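The per-hypothesis posteriors are the normalised exponentials of the summed log Bayes factors; a minimal sketch with made-up values, mirroring the `_get_logsum` and `_get_posteriors` helpers shown below:

```python
import numpy as np

def get_logsum(log_abf: np.ndarray) -> float:
    # Log of the sum of exponentiated logs, subtracting the max for numerical stability.
    themax = np.max(log_abf)
    return float(themax + np.log(np.sum(np.exp(log_abf - themax))))

# Illustrative log ABFs for hypotheses H0..H4 (not real data):
l_abf = np.array([0.0, 1.2, 0.8, 2.1, 4.5])

posteriors = np.exp(l_abf - get_logsum(l_abf))  # coloc_h0 .. coloc_h4
print(posteriors.round(3), posteriors.sum())    # posteriors sum to 1
```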
Source code in src/otg/method/colocalisation.py
class Coloc:\n\"\"\"Calculate bayesian colocalisation based on overlapping signals from credible sets.\n\n Based on the [R COLOC package](https://github.com/chr1swallace/coloc/blob/main/R/claudia.R), which uses the Bayes factors from the credible set to estimate the posterior probability of colocalisation. This method makes the simplifying assumption that **only one single causal variant** exists for any given trait in any genomic region.\n\n | Hypothesis | Description |\n | ------------- | --------------------------------------------------------------------- |\n | H<sub>0</sub> | no association with either trait in the region |\n | H<sub>1</sub> | association with trait 1 only |\n | H<sub>2</sub> | association with trait 2 only |\n | H<sub>3</sub> | both traits are associated, but have different single causal variants |\n | H<sub>4</sub> | both traits are associated and share the same single causal variant |\n\n !!! warning \"Approximate Bayes factors required\"\n Coloc requires the availability of approximate Bayes factors (ABF) for each variant in the credible set (`logABF` column).\n\n \"\"\"\n\n @staticmethod\n def _get_logsum(log_abf: ndarray) -> float:\n\"\"\"Calculates logsum of vector.\n\n This function calculates the log of the sum of the exponentiated\n logs taking out the max, i.e. insuring that the sum is not Inf\n\n Args:\n log_abf (ndarray): log approximate bayes factor\n\n Returns:\n float: logsum\n\n Example:\n >>> l = [0.2, 0.1, 0.05, 0]\n >>> round(Coloc._get_logsum(l), 6)\n 1.476557\n \"\"\"\n themax = np.max(log_abf)\n result = themax + np.log(np.sum(np.exp(log_abf - themax)))\n return float(result)\n\n @staticmethod\n def _get_posteriors(all_abfs: ndarray) -> DenseVector:\n\"\"\"Calculate posterior probabilities for each hypothesis.\n\n Args:\n all_abfs (ndarray): h0-h4 bayes factors\n\n Returns:\n DenseVector: Posterior\n\n Example:\n >>> l = np.array([0.2, 0.1, 0.05, 0])\n >>> Coloc._get_posteriors(l)\n DenseVector([0.279, 0.2524, 0.2401, 0.2284])\n \"\"\"\n diff = all_abfs - Coloc._get_logsum(all_abfs)\n abfs_posteriors = np.exp(diff)\n return Vectors.dense(abfs_posteriors)\n\n @classmethod\n def colocalise(\n cls: type[Coloc],\n overlapping_signals: StudyLocusOverlap,\n priorc1: float = 1e-4,\n priorc2: float = 1e-4,\n priorc12: float = 1e-5,\n ) -> Colocalisation:\n\"\"\"Calculate bayesian colocalisation based on overlapping signals.\n\n Args:\n overlapping_signals (StudyLocusOverlap): overlapping peaks\n priorc1 (float): Prior on variant being causal for trait 1. Defaults to 1e-4.\n priorc2 (float): Prior on variant being causal for trait 2. Defaults to 1e-4.\n priorc12 (float): Prior on variant being causal for traits 1 and 2. 
Defaults to 1e-5.\n\n Returns:\n Colocalisation: Colocalisation results\n \"\"\"\n # register udfs\n logsum = f.udf(Coloc._get_logsum, DoubleType())\n posteriors = f.udf(Coloc._get_posteriors, VectorUDT())\n return Colocalisation(\n _df=(\n overlapping_signals.df\n # Before summing log_abf columns nulls need to be filled with 0:\n .fillna(0, subset=[\"left_logABF\", \"right_logABF\"])\n # Sum of log_abfs for each pair of signals\n .withColumn(\"sum_log_abf\", f.col(\"left_logABF\") + f.col(\"right_logABF\"))\n # Group by overlapping peak and generating dense vectors of log_abf:\n .groupBy(\"chromosome\", \"left_studyLocusId\", \"right_studyLocusId\")\n .agg(\n f.count(\"*\").alias(\"coloc_n_vars\"),\n fml.array_to_vector(f.collect_list(f.col(\"left_logABF\"))).alias(\n \"left_logABF\"\n ),\n fml.array_to_vector(f.collect_list(f.col(\"right_logABF\"))).alias(\n \"right_logABF\"\n ),\n fml.array_to_vector(f.collect_list(f.col(\"sum_log_abf\"))).alias(\n \"sum_log_abf\"\n ),\n )\n .withColumn(\"logsum1\", logsum(f.col(\"left_logABF\")))\n .withColumn(\"logsum2\", logsum(f.col(\"right_logABF\")))\n .withColumn(\"logsum12\", logsum(f.col(\"sum_log_abf\")))\n .drop(\"left_logABF\", \"right_logABF\", \"sum_log_abf\")\n # Add priors\n # priorc1 Prior on variant being causal for trait 1\n .withColumn(\"priorc1\", f.lit(priorc1))\n # priorc2 Prior on variant being causal for trait 2\n .withColumn(\"priorc2\", f.lit(priorc2))\n # priorc12 Prior on variant being causal for traits 1 and 2\n .withColumn(\"priorc12\", f.lit(priorc12))\n # h0-h2\n .withColumn(\"lH0abf\", f.lit(0))\n .withColumn(\"lH1abf\", f.log(f.col(\"priorc1\")) + f.col(\"logsum1\"))\n .withColumn(\"lH2abf\", f.log(f.col(\"priorc2\")) + f.col(\"logsum2\"))\n # h3\n .withColumn(\"sumlogsum\", f.col(\"logsum1\") + f.col(\"logsum2\"))\n # exclude null H3/H4s: due to sumlogsum == logsum12\n .filter(f.col(\"sumlogsum\") != f.col(\"logsum12\"))\n .withColumn(\"max\", f.greatest(\"sumlogsum\", \"logsum12\"))\n .withColumn(\n \"logdiff\",\n (\n f.col(\"max\")\n + f.log(\n f.exp(f.col(\"sumlogsum\") - f.col(\"max\"))\n - f.exp(f.col(\"logsum12\") - f.col(\"max\"))\n )\n ),\n )\n .withColumn(\n \"lH3abf\",\n f.log(f.col(\"priorc1\"))\n + f.log(f.col(\"priorc2\"))\n + f.col(\"logdiff\"),\n )\n .drop(\"right_logsum\", \"left_logsum\", \"sumlogsum\", \"max\", \"logdiff\")\n # h4\n .withColumn(\"lH4abf\", f.log(f.col(\"priorc12\")) + f.col(\"logsum12\"))\n # cleaning\n .drop(\n \"priorc1\", \"priorc2\", \"priorc12\", \"logsum1\", \"logsum2\", \"logsum12\"\n )\n # posteriors\n .withColumn(\n \"allABF\",\n fml.array_to_vector(\n f.array(\n f.col(\"lH0abf\"),\n f.col(\"lH1abf\"),\n f.col(\"lH2abf\"),\n f.col(\"lH3abf\"),\n f.col(\"lH4abf\"),\n )\n ),\n )\n .withColumn(\n \"posteriors\", fml.vector_to_array(posteriors(f.col(\"allABF\")))\n )\n .withColumn(\"coloc_h0\", f.col(\"posteriors\").getItem(0))\n .withColumn(\"coloc_h1\", f.col(\"posteriors\").getItem(1))\n .withColumn(\"coloc_h2\", f.col(\"posteriors\").getItem(2))\n .withColumn(\"coloc_h3\", f.col(\"posteriors\").getItem(3))\n .withColumn(\"coloc_h4\", f.col(\"posteriors\").getItem(4))\n .withColumn(\"coloc_h4_h3\", f.col(\"coloc_h4\") / f.col(\"coloc_h3\"))\n .withColumn(\"coloc_log2_h4_h3\", f.log2(f.col(\"coloc_h4_h3\")))\n # clean up\n .drop(\n \"posteriors\",\n \"allABF\",\n \"coloc_h4_h3\",\n \"lH0abf\",\n \"lH1abf\",\n \"lH2abf\",\n \"lH3abf\",\n \"lH4abf\",\n )\n .withColumn(\"colocalisationMethod\", f.lit(\"COLOC\"))\n )\n )\n
It extends the CAVIAR framework to explicitly estimate the posterior probability that the same variant is causal in two studies while accounting for the uncertainty of LD. eCAVIAR computes the colocalization posterior probability (CLPP) from the marginal posterior probabilities. This framework allows multiple variants to be causal in a single locus.
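At the locus level, the CLPP is the sum over overlapping variants of the product of the two studies' posterior probabilities; a toy sketch with made-up numbers:

```python
# (left posterior probability, right posterior probability) per overlapping variant:
overlap = [(0.60, 0.55), (0.25, 0.30), (0.10, 0.05)]

# Per-variant CLPP is the product; the locus-level CLPP sums them.
clpp = sum(left_pp * right_pp for left_pp, right_pp in overlap)
print(round(clpp, 2))  # 0.41
```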
Source code in src/otg/method/colocalisation.py
class ECaviar:\n\"\"\"ECaviar-based colocalisation analysis.\n\n It extends [CAVIAR](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5142122/#bib18)\u00a0framework to explicitly estimate the posterior probability that the same variant is causal in 2 studies while accounting for the uncertainty of LD. eCAVIAR computes the colocalization posterior probability (**CLPP**) by utilizing the marginal posterior probabilities. This framework allows for **multiple variants to be causal** in a single locus.\n \"\"\"\n\n @staticmethod\n def _get_clpp(left_pp: Column, right_pp: Column) -> Column:\n\"\"\"Calculate the colocalisation posterior probability (CLPP).\n\n If the fact that the same variant is found causal for two studies are independent events,\n CLPP is defined as the product of posterior porbabilities that a variant is causal in both studies.\n\n Args:\n left_pp (Column): left posterior probability\n right_pp (Column): right posterior probability\n\n Returns:\n Column: CLPP\n\n Examples:\n >>> d = [{\"left_pp\": 0.5, \"right_pp\": 0.5}, {\"left_pp\": 0.25, \"right_pp\": 0.75}]\n >>> df = spark.createDataFrame(d)\n >>> df.withColumn(\"clpp\", ECaviar._get_clpp(f.col(\"left_pp\"), f.col(\"right_pp\"))).show()\n +-------+--------+------+\n |left_pp|right_pp| clpp|\n +-------+--------+------+\n | 0.5| 0.5| 0.25|\n | 0.25| 0.75|0.1875|\n +-------+--------+------+\n <BLANKLINE>\n\n \"\"\"\n return left_pp * right_pp\n\n @classmethod\n def colocalise(\n cls: type[ECaviar], overlapping_signals: StudyLocusOverlap\n ) -> Colocalisation:\n\"\"\"Calculate bayesian colocalisation based on overlapping signals.\n\n Args:\n overlapping_signals (StudyLocusOverlap): overlapping signals.\n\n Returns:\n Colocalisation: colocalisation results based on eCAVIAR.\n \"\"\"\n return Colocalisation(\n _df=(\n overlapping_signals.df.withColumn(\n \"clpp\",\n ECaviar._get_clpp(\n f.col(\"left_posteriorProbability\"),\n f.col(\"right_posteriorProbability\"),\n ),\n )\n .groupBy(\"left_studyLocusId\", \"right_studyLocusId\", \"chromosome\")\n .agg(\n f.count(\"*\").alias(\"coloc_n_vars\"),\n f.sum(f.col(\"clpp\")).alias(\"clpp\"),\n )\n .withColumn(\"colocalisationMethod\", f.lit(\"eCAVIAR\"))\n )\n )\n
@classmethod\ndef ld_annotation_by_locus_ancestry(\n cls: type[LDAnnotatorGnomad],\n session: Session,\n associations: StudyLocusGWASCatalog,\n studies: StudyIndexGWASCatalog,\n ld_populations: list[str],\n ld_index_template: str,\n ld_matrix_template: str,\n min_r2: float,\n) -> DataFrame:\n\"\"\"LD information for all locus and ancestries.\n\n Args:\n session (Session): Session\n associations (StudyLocusGWASCatalog): GWAS associations\n studies (StudyIndexGWASCatalog): study metadata of the associations\n ld_populations (list[str]): List of populations to annotate\n ld_index_template (str): Template path of the LD matrix index containing `{POP}` where the population is expected\n ld_matrix_template (str): Template path of the LD matrix containing `{POP}` where the population is expected\n min_r2 (float): minimum r2 to keep\n\n Returns:\n DataFrame: LD annotation [\"variantId\", \"chromosome\", \"gnomadPopulation\", \"tagVariantId\", \"r\"]\n \"\"\"\n # Unique lead - population pairs:\n locus_ancestry = (\n associations.unique_study_locus_ancestries(studies)\n # Ignoring study information / relativeSampleSize to get unique lead-ancestry pairs\n .drop(\"studyId\", \"relativeSampleSize\")\n .distinct()\n .persist()\n )\n\n # All gnomad populations captured in associations:\n assoc_populations = locus_ancestry.rdd.map(\n lambda x: x.gnomadPopulation\n ).collect()\n\n # Retrieve LD information from gnomAD\n ld_annotated_assocs = []\n for population in ld_populations:\n if population in assoc_populations:\n pop_parsed_ldindex_path = ld_index_template.format(POP=population)\n pop_matrix_path = ld_matrix_template.format(POP=population)\n ld_index = LDIndex.from_parquet(session, pop_parsed_ldindex_path)\n ld_matrix = BlockMatrix.read(pop_matrix_path)\n ld_annotated_assocs.append(\n LDAnnotatorGnomad.get_ld_annotated_assocs_for_population(\n population,\n ld_index,\n ld_matrix,\n locus_ancestry,\n min_r2,\n ).coalesce(400)\n )\n return reduce(DataFrame.unionByName, ld_annotated_assocs)\n
Probabilistic Identification of Causal SNPs (PICS) is an algorithm that estimates the probability that an individual variant is causal, considering the haplotype structure and the observed pattern of association at the genetic locus.
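A worked sketch of the core quantity, using the same values as the project's doctest: the relative posterior probability of a variant is twice the upper tail of a normal distribution, parameterised by the PICS mean and standard deviation, evaluated at the lead's -log10 p-value. These relative probabilities are afterwards rescaled so that they sum to one across the locus.

```python
from scipy.stats import norm

# Values taken from the _pics_relative_posterior_probability doctest:
neglog_p, pics_snp_mu, pics_snp_std = 10.0, 1.0, 10.0

rel_prob = norm(pics_snp_mu, pics_snp_std).sf(neglog_p) * 2  # two-sided tail probability
print(round(rel_prob, 3))  # 0.368
```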
Source code in src/otg/method/pics.py
class PICS:\n\"\"\"Probabilistic Identification of Causal SNPs (PICS), an algorithm estimating the probability that an individual variant is causal considering the haplotype structure and observed pattern of association at the genetic locus.\"\"\"\n\n @staticmethod\n def _pics_relative_posterior_probability(\n neglog_p: float, pics_snp_mu: float, pics_snp_std: float\n ) -> float:\n\"\"\"Compute the PICS posterior probability for a given SNP.\n\n !!! info \"This probability needs to be scaled to take into account the probabilities of the other variants in the locus.\"\n\n Args:\n neglog_p (float): Negative log p-value of the lead variant\n pics_snp_mu (float): Mean P value of the association between a SNP and a trait\n pics_snp_std (float): Standard deviation for the P value of the association between a SNP and a trait\n\n Returns:\n Relative posterior probability of a SNP being causal in a locus\n\n Examples:\n >>> rel_prob = PICS._pics_relative_posterior_probability(neglog_p=10.0, pics_snp_mu=1.0, pics_snp_std=10.0)\n >>> round(rel_prob, 3)\n 0.368\n \"\"\"\n return float(norm(pics_snp_mu, pics_snp_std).sf(neglog_p) * 2)\n\n @staticmethod\n def _pics_standard_deviation(neglog_p: float, r2: float, k: float) -> float | None:\n\"\"\"Compute the PICS standard deviation.\n\n This distribution is obtained after a series of permutation tests described in the PICS method, and it is only\n valid when the SNP is highly linked with the lead (r2 > 0.5).\n\n Args:\n neglog_p (float): Negative log p-value of the lead variant\n r2 (float): LD score between a given SNP and the lead variant\n k (float): Empiric constant that can be adjusted to fit the curve, 6.4 recommended.\n\n Returns:\n Standard deviation for the P value of the association between a SNP and a trait\n\n Examples:\n >>> PICS._pics_standard_deviation(neglog_p=1.0, r2=1.0, k=6.4)\n 0.0\n >>> round(PICS._pics_standard_deviation(neglog_p=10.0, r2=0.5, k=6.4), 3)\n 0.143\n >>> print(PICS._pics_standard_deviation(neglog_p=1.0, r2=0.0, k=6.4))\n None\n \"\"\"\n return (\n (1 - abs(r2) ** 0.5**k) ** 0.5 * (neglog_p) ** 0.5 / 2\n if r2 >= 0.5\n else None\n )\n\n @staticmethod\n def _pics_mu(neglog_p: float, r2: float) -> float | None:\n\"\"\"Compute the PICS mu that estimates the probability of association between a given SNP and the trait.\n\n This distribution is obtained after a series of permutation tests described in the PICS method, and it is only\n valid when the SNP is highly linked with the lead (r2 > 0.5).\n\n Args:\n neglog_p (float): Negative log p-value of the lead variant\n r2 (float): LD score between a given SNP and the lead variant\n\n Returns:\n Mean P value of the association between a SNP and a trait\n\n Examples:\n >>> PICS._pics_mu(neglog_p=1.0, r2=1.0)\n 1.0\n >>> PICS._pics_mu(neglog_p=10.0, r2=0.5)\n 5.0\n >>> print(PICS._pics_mu(neglog_p=10.0, r2=0.3))\n None\n \"\"\"\n return neglog_p * r2 if r2 >= 0.5 else None\n\n @staticmethod\n def _finemap(\n credible_set: list[Row], lead_neglog_p: float, k: float\n ) -> list | None:\n\"\"\"Calculates the probability of a variant being causal in a study-locus context by applying the PICS method.\n\n It is intended to be applied as an UDF in `PICS.finemap`, where each row is a StudyLocus association.\n The function iterates over every SNP in the `credibleSet` array, and it returns an updated credibleSet with\n its association signal and causality probability as of PICS.\n\n Args:\n credible_set (list): list of tagging variants after expanding the locus\n lead_neglog_p (float): P 
value of the association signal between the lead variant and the study in the form of -log10.\n k (float): Empiric constant that can be adjusted to fit the curve, 6.4 recommended.\n\n Returns:\n List of tagging variants with an estimation of the association signal and their posterior probability as of PICS.\n \"\"\"\n if credible_set is None:\n return None\n elif not credible_set:\n return []\n\n tmp_credible_set = []\n new_credible_set = []\n # First iteration: calculation of mu, standard deviation, and the relative posterior probability\n for tag_struct in credible_set:\n tag_dict = (\n tag_struct.asDict()\n ) # tag_struct is of type pyspark.Row, we'll represent it as a dict\n if (\n not tag_dict[\"r2Overall\"]\n or tag_dict[\"r2Overall\"] < 0.5\n or not lead_neglog_p\n ):\n # If PICS cannot be calculated, we'll return the original credible set\n new_credible_set.append(tag_dict)\n continue\n pics_snp_mu = PICS._pics_mu(lead_neglog_p, tag_dict[\"r2Overall\"])\n pics_snp_std = PICS._pics_standard_deviation(\n lead_neglog_p, tag_dict[\"r2Overall\"], k\n )\n pics_snp_std = 0.001 if pics_snp_std == 0 else pics_snp_std\n if pics_snp_mu is not None and pics_snp_std is not None:\n posterior_probability = PICS._pics_relative_posterior_probability(\n lead_neglog_p, pics_snp_mu, pics_snp_std\n )\n tag_dict[\"tagPValue\"] = 10**-pics_snp_mu\n tag_dict[\"tagStandardError\"] = 10**-pics_snp_std\n tag_dict[\"relativePosteriorProbability\"] = posterior_probability\n\n tmp_credible_set.append(tag_dict)\n\n # Second iteration: calculation of the sum of all the posteriors in each study-locus, so that we scale them between 0-1\n total_posteriors = sum(\n tag_dict.get(\"relativePosteriorProbability\", 0)\n for tag_dict in tmp_credible_set\n )\n\n # Third iteration: calculation of the final posteriorProbability\n for tag_dict in tmp_credible_set:\n if total_posteriors != 0:\n tag_dict[\"posteriorProbability\"] = float(\n tag_dict.get(\"relativePosteriorProbability\", 0) / total_posteriors\n )\n tag_dict.pop(\"relativePosteriorProbability\")\n new_credible_set.append(tag_dict)\n return new_credible_set\n\n @classmethod\n def finemap(\n cls: type[PICS], associations: StudyLocus, k: float = 6.4\n ) -> StudyLocus:\n\"\"\"Run PICS on a study locus.\n\n !!! info \"Study locus needs to be LD annotated\"\n The study locus needs to be LD annotated before PICS can be calculated.\n\n Args:\n associations (StudyLocus): Study locus to finemap using PICS\n k (float): Empiric constant that can be adjusted to fit the curve, 6.4 recommended.\n\n Returns:\n StudyLocus: Study locus with PICS results\n \"\"\"\n # Register UDF by defining the structure of the output credibleSet array of structs\n credset_schema = t.ArrayType(\n [field.dataType.elementType for field in associations.schema if field.name == \"credibleSet\"][0] # type: ignore\n )\n _finemap_udf = f.udf(\n lambda credible_set, neglog_p: PICS._finemap(credible_set, neglog_p, k),\n credset_schema,\n )\n\n associations.df = (\n associations.df.withColumn(\"neglog_pvalue\", associations.neglog_pvalue())\n .withColumn(\n \"credibleSet\",\n f.when(\n f.col(\"credibleSet\").isNotNull(),\n _finemap_udf(f.col(\"credibleSet\"), f.col(\"neglog_pvalue\")),\n ),\n )\n .drop(\"neglog_pvalue\")\n )\n return associations\n
The study locus needs to be LD annotated before PICS can be calculated.
Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `associations` | `StudyLocus` | Study locus to finemap using PICS | required |
| `k` | `float` | Empirical constant that can be adjusted to fit the curve, 6.4 recommended. | `6.4` |
Returns:

| Name | Type | Description |
| --- | --- | --- |
| `StudyLocus` | `StudyLocus` | Study locus with PICS results |
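A hypothetical invocation (it assumes `associations` is an LD-annotated StudyLocus; the column selection is illustrative):

```python
finemapped = PICS.finemap(associations, k=6.4)
finemapped.df.select("studyLocusId", "credibleSet").show(truncate=False)
```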
Source code in src/otg/method/pics.py
@classmethod\ndef finemap(\n cls: type[PICS], associations: StudyLocus, k: float = 6.4\n) -> StudyLocus:\n\"\"\"Run PICS on a study locus.\n\n !!! info \"Study locus needs to be LD annotated\"\n The study locus needs to be LD annotated before PICS can be calculated.\n\n Args:\n associations (StudyLocus): Study locus to finemap using PICS\n k (float): Empiric constant that can be adjusted to fit the curve, 6.4 recommended.\n\n Returns:\n StudyLocus: Study locus with PICS results\n \"\"\"\n # Register UDF by defining the structure of the output credibleSet array of structs\n credset_schema = t.ArrayType(\n [field.dataType.elementType for field in associations.schema if field.name == \"credibleSet\"][0] # type: ignore\n )\n _finemap_udf = f.udf(\n lambda credible_set, neglog_p: PICS._finemap(credible_set, neglog_p, k),\n credset_schema,\n )\n\n associations.df = (\n associations.df.withColumn(\"neglog_pvalue\", associations.neglog_pvalue())\n .withColumn(\n \"credibleSet\",\n f.when(\n f.col(\"credibleSet\").isNotNull(),\n _finemap_udf(f.col(\"credibleSet\"), f.col(\"neglog_pvalue\")),\n ),\n )\n .drop(\"neglog_pvalue\")\n )\n return associations\n
This workflow runs colocalisation analyses that assess the degree to which independent association signals share the same causal variant in a genomic region, typically delimited by linkage disequilibrium (LD).
Source code in src/otg/colocalisation.py
@dataclass\nclass ColocalisationStep(ColocalisationStepConfig):\n\"\"\"Colocalisation step.\n\n This workflow runs colocalisation analyses that assess the degree to which independent association signals share the same causal variant in a genomic region, typically delimited by linkage disequilibrium (LD).\n \"\"\"\n\n session: Session = Session()\n\n def run(self: ColocalisationStep) -> None:\n\"\"\"Run colocalisation step.\"\"\"\n # Study-locus information\n sl = StudyLocus.from_parquet(self.session, self.study_locus_path)\n si = StudyIndex.from_parquet(self.session, self.study_index_path)\n\n # Study-locus overlaps for 95% credible sets\n sl_overlaps = sl.credible_set(CredibleInterval.IS95).overlaps(si)\n\n coloc_results = Coloc.colocalise(\n sl_overlaps, self.priorc1, self.priorc2, self.priorc12\n )\n ecaviar_results = ECaviar.colocalise(sl_overlaps)\n\n # Combine COLOC and eCAVIAR results and write the union:\n coloc_all = coloc_results.df.unionByName(ecaviar_results.df, allowMissingColumns=True)\n\n coloc_all.write.mode(self.session.write_mode).parquet(self.coloc_path)\n
Colocalisation step requirements.
Attributes:

| Name | Type | Description |
| --- | --- | --- |
| `study_locus_path` | `DictConfig` | Input Study-locus path. |
| `study_index_path` | `DictConfig` | Input Study-index path. |
| `coloc_path` | `DictConfig` | Output Colocalisation path. |
| `priorc1` | `float` | Prior on variant being causal for trait 1. |
| `priorc2` | `float` | Prior on variant being causal for trait 2. |
| `priorc12` | `float` | Prior on variant being causal for traits 1 and 2. |
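A hedged sketch of instantiating the step directly (in practice Hydra resolves `_target_` and fills the fields; all paths below are placeholders, not real locations):

```python
step = ColocalisationStep(
    study_locus_path="gs://my-bucket/study_locus",  # placeholder input path
    study_index_path="gs://my-bucket/study_index",  # placeholder input path
    coloc_path="gs://my-bucket/colocalisation",     # placeholder output path
    priorc1=1e-4,
    priorc2=1e-4,
    priorc12=1e-5,
)
step.run()
```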
Source code in src/otg/config.py
@dataclass\nclass ColocalisationStepConfig:\n\"\"\"Colocalisation step requirements.\n\n Attributes:\n study_locus_path (DictConfig): Input Study-locus path.\n coloc_path (DictConfig): Output Colocalisation path.\n priorc1 (float): Prior on variant being causal for trait 1.\n priorc2 (float): Prior on variant being causal for trait 2.\n priorc12 (float): Prior on variant being causal for traits 1 and 2.\n \"\"\"\n\n _target_: str = \"otg.colocalisation.ColocalisationStep\"\n study_locus_path: str = MISSING\n study_index_path: str = MISSING\n coloc_path: str = MISSING\n priorc1: float = 1e-4\n priorc2: float = 1e-4\n priorc12: float = 1e-5\n
The variant annotation step produces a dataset of the type VariantAnnotation, derived from gnomAD's gnomad.genomes.vX.X.X.sites.ht Hail table. This dataset is used to validate variants and as a source of annotation.
Source code in src/otg/variant_annotation.py
@dataclass\nclass VariantAnnotationStep(VariantAnnotationStepConfig):\n\"\"\"Variant annotation step.\n\n The variant annotation step produces a dataset of the type `VariantAnnotation`, derived from gnomAD's `gnomad.genomes.vX.X.X.sites.ht` Hail table. This dataset is used to validate variants and as a source of annotation.\n \"\"\"\n\n session: Session = Session()\n\n def run(self: VariantAnnotationStep) -> None:\n\"\"\"Run variant annotation step.\"\"\"\n # init hail session\n hl.init(sc=self.session.spark.sparkContext, log=\"/dev/null\")\n\n variant_annotation = VariantAnnotation.from_gnomad(\n self.gnomad_genomes,\n self.chain_38_to_37,\n self.populations,\n )\n # Writing data partitioned by chromosome and position:\n (\n variant_annotation.df.repartition(400, \"chromosome\")\n .sortWithinPartitions(\"chromosome\", \"position\")\n .write.partitionBy(\"chromosome\")\n .mode(self.session.write_mode)\n .parquet(self.variant_annotation_path)\n )\n
Using a VariantAnnotation dataset as a reference, this step creates and writes a dataset of the type VariantIndex that includes only variants with disease-association data, carrying a reduced set of annotations.
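Conceptually, this scope reduction is a filter-only join; a toy, self-contained sketch of the pattern (column names follow the docs; the project itself performs this via `filter_by_variant_df`, shown in the source below):

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Toy stand-ins: `va_df` for variant annotation, `sl_variants` for study-locus variants.
va_df = spark.createDataFrame(
    [("v1", "1", "A", "T"), ("v2", "2", "G", "C")],
    ["id", "chromosome", "referenceAllele", "alternateAllele"],
)
sl_variants = spark.createDataFrame([("v1", "1")], ["id", "chromosome"])

# left_semi keeps only matching annotation rows; no right-side columns are added.
va_slimmed = va_df.join(sl_variants, on=["id", "chromosome"], how="left_semi")
va_slimmed.show()
```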
Source code in src/otg/variant_index.py
@dataclass\nclass VariantIndexStep(VariantIndexStepConfig):\n\"\"\"Variant index step.\n\n Using a `VariantAnnotation` dataset as a reference, this step creates and writes a dataset of the type `VariantIndex` that includes only variants that have disease-association data with a reduced set of annotations.\n \"\"\"\n\n session: Session = Session()\n\n def run(self: VariantIndexStep) -> None:\n\"\"\"Run variant index step.\"\"\"\n # Variant annotation dataset\n va = VariantAnnotation.from_parquet(self.session, self.variant_annotation_path)\n\n # Study-locus dataset\n study_locus = StudyLocus.from_parquet(self.session, self.study_locus_path)\n\n # Reduce scope of variant annotation dataset to only variants in study-locus sets:\n va_slimmed = va.filter_by_variant_df(\n study_locus.unique_lead_tag_variants(), [\"id\", \"chromosome\"]\n )\n\n # Generate variant index ussing a subset of the variant annotation dataset\n vi = VariantIndex.from_variant_annotation(va_slimmed)\n\n # Write data:\n # self.etl.logger.info(\n # f\"Writing invalid variants from the credible set to: {self.variant_invalid}\"\n # )\n # vi.invalid_variants.write.mode(self.etl.write_mode).parquet(\n # self.variant_invalid\n # )\n\n self.session.logger.info(f\"Writing variant index to: {self.variant_index_path}\")\n (\n vi.df.write.partitionBy(\"chromosome\")\n .mode(self.session.write_mode)\n .parquet(self.variant_index_path)\n )\n
This step aims to generate a dataset that contains multiple pieces of evidence supporting the functional association of specific variants with genes. Some of the evidence types include (a sketch of how the per-source results are merged follows this list):
1. Chromatin interaction experiments, e.g. Promoter Capture Hi-C (PCHi-C).
2. In silico functional predictions, e.g. Variant Effect Predictor (VEP) from Ensembl.
3. Distance between the variant and each gene's canonical transcription start site (TSS).
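Because the per-source results share a target schema only partially, they are combined with a name-based union that null-fills missing columns, as in the source below. A toy, self-contained sketch of the pattern (the two DataFrames stand in for real evidence sources):

```python
from functools import reduce
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Toy stand-ins for two evidence sources with partially overlapping schemas:
distance_df = spark.createDataFrame([("v1", "g1", 1000)], ["variantId", "geneId", "distance"])
vep_df = spark.createDataFrame([("v1", "g1", 0.9)], ["variantId", "geneId", "score"])

# unionByName with allowMissingColumns=True null-fills columns absent on either side.
v2g = reduce(
    lambda left, right: left.unionByName(right, allowMissingColumns=True),
    [distance_df, vep_df],
)
v2g.show()
```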
Source code in src/otg/v2g.py
@dataclass\nclass V2GStep(V2GStepConfig):\n\"\"\"Variant-to-gene (V2G) step.\n\n This step aims to generate a dataset that contains multiple pieces of evidence supporting the functional association of specific variants with genes. Some of the evidence types include:\n\n 1. Chromatin interaction experiments, e.g. Promoter Capture Hi-C (PCHi-C).\n 2. In silico functional predictions, e.g. Variant Effect Predictor (VEP) from Ensembl.\n 3. Distance between the variant and each gene's canonical transcription start site (TSS).\n\n \"\"\"\n\n session: Session = Session()\n\n def run(self: V2GStep) -> None:\n\"\"\"Run V2G dataset generation.\"\"\"\n # Filter gene index by approved biotypes to define V2G gene universe\n gene_index_filtered = GeneIndex.from_parquet(\n self.session, self.gene_index_path\n ).filter_by_biotypes(self.approved_biotypes)\n\n vi = VariantIndex.from_parquet(self.session, self.variant_index_path).persist()\n va = VariantAnnotation.from_parquet(self.session, self.variant_annotation_path)\n vep_consequences = self.session.spark.read.csv(\n self.vep_consequences_path, sep=\"\\t\", header=True\n )\n\n # Variant annotation reduced to the variant index to define V2G variant universe\n va_slimmed = va.filter_by_variant_df(vi.df, [\"id\", \"chromosome\"]).persist()\n\n # lift over variants to hg38\n lift = LiftOverSpark(\n self.liftover_chain_file_path, self.liftover_max_length_difference\n )\n\n v2g_datasets = [\n va_slimmed.get_distance_to_tss(gene_index_filtered, self.max_distance),\n # variant effects\n va_slimmed.get_most_severe_vep_v2g(vep_consequences, gene_index_filtered),\n va_slimmed.get_polyphen_v2g(gene_index_filtered),\n va_slimmed.get_sift_v2g(gene_index_filtered),\n va_slimmed.get_plof_v2g(gene_index_filtered),\n # intervals\n Intervals.parse_andersson(\n self.session, self.anderson_path, gene_index_filtered, lift\n ).v2g(vi),\n Intervals.parse_javierre(\n self.session, self.javierre_path, gene_index_filtered, lift\n ).v2g(vi),\n Intervals.parse_jung(\n self.session, self.jung_path, gene_index_filtered, lift\n ).v2g(vi),\n Intervals.parse_thurman(\n self.session, self.thurnman_path, gene_index_filtered, lift\n ).v2g(vi),\n ]\n\n # merge all V2G datasets\n v2g = V2G(\n _df=reduce(\n lambda x, y: x.unionByName(y, allowMissingColumns=True),\n [dataset.df for dataset in v2g_datasets],\n ).repartition(\"chromosome\")\n )\n # write V2G dataset\n (\n v2g.df.write.partitionBy(\"chromosome\")\n .mode(self.session.write_mode)\n .parquet(self.v2g_path)\n )\n
"}]}
\ No newline at end of file
diff --git a/sitemap.xml.gz b/sitemap.xml.gz
index 1057ad0e2..e9db2912a 100644
Binary files a/sitemap.xml.gz and b/sitemap.xml.gz differ