From 86600b0df666f4ed2874f446c3831fe2baa8ff13 Mon Sep 17 00:00:00 2001
From: Yakov <yt4@sanger.ac.uk>
Date: Tue, 9 Apr 2024 12:24:09 +0100
Subject: [PATCH 01/21] feat: add FM step with carma and sumstat imputation
 (#568)

* feat: add fm step with carma and sumstat imputation

* fix: adding log

* fix: fixing carma

* fix: resolving conflict

* fix: resolve conflicts with dev v2

* fix: silencing FutureWarning in Carma
---
 src/gentropy/method/carma.py     |  19 +-
 src/gentropy/susie_finemapper.py | 296 ++++++++++++++++++++++++++++++-
 2 files changed, 310 insertions(+), 5 deletions(-)

diff --git a/src/gentropy/method/carma.py b/src/gentropy/method/carma.py
index 75cb32c79..af8816706 100644
--- a/src/gentropy/method/carma.py
+++ b/src/gentropy/method/carma.py
@@ -2,6 +2,7 @@
 from __future__ import annotations
 
 import concurrent.futures
+import warnings
 from itertools import combinations
 from math import floor, lgamma
 from typing import Any
@@ -32,6 +33,8 @@ def time_limited_CARMA_spike_slab_noEM(
                 - B_list: A dataframe containing the marginal likelihoods and the corresponding model space or None.
                 - Outliers: A list of outlier SNPs or None.
         """
+        # Ignore pandas future warnings
+        warnings.simplefilter(action="ignore", category=FutureWarning)
         try:
             # Execute CARMA.CARMA_spike_slab_noEM with a timeout
             with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
@@ -854,9 +857,19 @@ def _MCS_modified(  # noqa: C901
                         sec_sample = np.random.choice(
                             range(0, 3), 1, p=np.exp(aa) / np.sum(np.exp(aa))
                         )
-                        S = set_gamma[sec_sample[0]][
-                            int(set_star["gamma_set_index"][sec_sample[0]])
-                        ].tolist()
+                        if set_gamma[sec_sample[0]] is not None:
+                            S = set_gamma[sec_sample[0]][
+                                int(set_star["gamma_set_index"][sec_sample[0]])
+                            ].tolist()
+                        else:
+                            sec_sample = np.random.choice(
+                                range(1, 3),
+                                1,
+                                p=np.exp(aa)[[1, 2]] / np.sum(np.exp(aa)[[1, 2]]),
+                            )
+                            S = set_gamma[sec_sample[0]][
+                                int(set_star["gamma_set_index"][sec_sample[0]])
+                            ].tolist()
 
                 for item in conditional_S:
                     if item not in S:
diff --git a/src/gentropy/susie_finemapper.py b/src/gentropy/susie_finemapper.py
index c5d86b1a8..aba5b4c40 100644
--- a/src/gentropy/susie_finemapper.py
+++ b/src/gentropy/susie_finemapper.py
@@ -2,6 +2,7 @@
 
 from __future__ import annotations
 
+import time
 from typing import Any
 
 import numpy as np
@@ -15,6 +16,8 @@
 from gentropy.dataset.study_locus import StudyLocus
 from gentropy.dataset.summary_statistics import SummaryStatistics
 from gentropy.datasource.gnomad.ld import GnomADLDMatrix
+from gentropy.method.carma import CARMA
+from gentropy.method.sumstat_imputation import SummaryStatisticsImputation
 from gentropy.method.susie_inf import SUSIE_inf
 
 
@@ -150,6 +153,7 @@ def susie_inf_to_studylocus(
         region: str,
         variant_index: DataFrame,
         cs_lbf_thr: float = 2,
+        sum_pips: float = 0.99,
     ) -> StudyLocus:
         """Convert SuSiE-inf output to StudyLocus DataFrame.
 
@@ -160,6 +164,7 @@ def susie_inf_to_studylocus(
             region (str): region
             variant_index (DataFrame): DataFrame with variant information
             cs_lbf_thr (float): credible set logBF threshold, default is 2
+            sum_pips (float): the expected sum of posterior probabilities in the locus, default is 0.99 (99% credible set)
 
         Returns:
             StudyLocus: StudyLocus object with fine-mapped credible sets
@@ -189,8 +194,8 @@ def susie_inf_to_studylocus(
                 susie_result[:, i + 1].astype(float).argsort()[::-1]
             ]
             cumsum_arr = np.cumsum(sorted_arr[:, i + 1].astype(float))
-            filter_row = np.argmax(cumsum_arr >= 0.99)
-            if filter_row == 0 and cumsum_arr[0] < 0.99:
+            filter_row = np.argmax(cumsum_arr >= sum_pips)
+            if filter_row == 0 and cumsum_arr[0] < sum_pips:
                 filter_row = len(cumsum_arr)
             filter_row += 1
             filtered_arr = sorted_arr[:filter_row]
@@ -378,3 +383,290 @@ def susie_finemapper_ss_gathered(
             region=region,
             variant_index=variant_index,
         )
+
+    @staticmethod
+    def susie_finemapper_from_prepared_dataframes(
+        GWAS_df: DataFrame,
+        ld_index: DataFrame,
+        gnomad_ld: np.ndarray,
+        L: int,
+        session: Session,
+        studyId: str,
+        region: str,
+        susie_est_tausq: bool = False,
+        run_carma: bool = False,
+        run_sumstat_imputation: bool = False,
+        carma_time_limit: int = 600,
+        imputed_r2_threshold: float = 0.8,
+        ld_score_threshold: float = 4,
+        sum_pips: float = 0.99,
+    ) -> dict[str, Any]:
+        """Susie fine-mapper function that uses LD, z-scores, variant info and other options for Fine-Mapping.
+
+        Args:
+            GWAS_df (DataFrame): GWAS DataFrame with mandatory columns: z, variantId
+            ld_index (DataFrame): LD index DataFrame
+            gnomad_ld (np.ndarray): GnomAD LD matrix
+            L (int): number of causal variants
+            session (Session): Spark session
+            studyId (str): study ID
+            region (str): region
+            susie_est_tausq (bool): estimate tau squared, default is False
+            run_carma (bool): run CARMA, default is False
+            run_sumstat_imputation (bool): run summary statistics imputation, default is False
+            carma_time_limit (int): CARMA time limit, default is 600 seconds
+            imputed_r2_threshold (float): imputed R2 threshold, default is 0.8
+            ld_score_threshold (float): LD score threshold for imputation, default is 4
+            sum_pips (float): the expected sum of posterior probabilities in the locus, default is 0.99 (99% credible set)
+
+        Returns:
+            dict[str, Any]: dictionary with study locus, number of GWAS variants, number of LD variants, number of variants after merge, number of outliers, number of imputed variants, number of variants to fine-map
+        """
+        # PLEASE DO NOT REMOVE THIS LINE
+        pd.DataFrame.iteritems = pd.DataFrame.items
+
+        start_time = time.time()
+        GWAS_df = GWAS_df.toPandas()
+        ld_index = ld_index.toPandas()
+        ld_index = ld_index.reset_index()
+
+        N_gwas = len(GWAS_df)
+        N_ld = len(ld_index)
+
+        # Filtering out the variants that are not in the LD matrix, we don't need them
+        df_columns = ["variantId", "z"]
+        GWAS_df = GWAS_df.merge(ld_index, on="variantId", how="inner")
+        GWAS_df = GWAS_df[df_columns].reset_index()
+        N_after_merge = len(GWAS_df)
+
+        merged_df = GWAS_df.merge(
+            ld_index, left_on="variantId", right_on="variantId", how="inner"
+        )
+        indices = merged_df["index_y"].values
+
+        ld_to_fm = gnomad_ld[indices][:, indices]
+        z_to_fm = GWAS_df["z"].values
+
+        if run_carma:
+            carma_output = CARMA.time_limited_CARMA_spike_slab_noEM(
+                z=z_to_fm, ld=ld_to_fm, sec_threshold=carma_time_limit
+            )
+            if carma_output["Outliers"] != [] and carma_output["Outliers"] is not None:
+                GWAS_df.drop(carma_output["Outliers"], inplace=True)
+                GWAS_df = GWAS_df.reset_index()
+                ld_index = ld_index.reset_index()
+                merged_df = GWAS_df.merge(
+                    ld_index, left_on="variantId", right_on="variantId", how="inner"
+                )
+                indices = merged_df["index_y"].values
+
+                ld_to_fm = gnomad_ld[indices][:, indices]
+                z_to_fm = GWAS_df["z"].values
+                N_outliers = len(carma_output["Outliers"])
+            else:
+                N_outliers = 0
+        else:
+            N_outliers = 0
+
+        if run_sumstat_imputation:
+            known = indices
+            unknown = [
+                index for index in list(range(len(gnomad_ld))) if index not in known
+            ]
+            sig_t = gnomad_ld[known, :][:, known]
+            sig_i_t = gnomad_ld[unknown, :][:, known]
+            zt = z_to_fm
+
+            sumstat_imp_res = SummaryStatisticsImputation.raiss_model(
+                z_scores_known=zt,
+                ld_matrix_known=sig_t,
+                ld_matrix_known_missing=sig_i_t,
+                lamb=0.01,
+                rtol=0.01,
+            )
+
+            bool_index = (sumstat_imp_res["imputation_r2"] >= imputed_r2_threshold) * (
+                sumstat_imp_res["ld_score"] >= ld_score_threshold
+            )
+            if sum(bool_index) >= 1:
+                indices = np.where(bool_index)[0]
+                index_to_add = [unknown[i] for i in indices]
+                index_to_fm = np.concatenate((known, index_to_add))
+
+                ld_to_fm = gnomad_ld[index_to_fm][:, index_to_fm]
+
+                snp_info_to_add = pd.DataFrame(
+                    {
+                        "variantId": ld_index.iloc[index_to_add, :]["variantId"],
+                        "z": sumstat_imp_res["mu"][indices],
+                    }
+                )
+                GWAS_df = pd.concat([GWAS_df, snp_info_to_add], ignore_index=True)
+                z_to_fm = GWAS_df["z"].values
+
+                N_imputed = len(indices)
+            else:
+                N_imputed = 0
+        else:
+            N_imputed = 0
+
+        susie_output = SUSIE_inf.susie_inf(
+            z=z_to_fm, LD=ld_to_fm, L=L, est_tausq=susie_est_tausq
+        )
+
+        schema = StructType([StructField("variantId", StringType(), True)])
+        variant_index = (
+            session.spark.createDataFrame(
+                GWAS_df[["variantId"]],
+                schema=schema,
+            )
+            .withColumn(
+                "chromosome", f.split(f.col("variantId"), "_")[0].cast("string")
+            )
+            .withColumn("position", f.split(f.col("variantId"), "_")[1].cast("int"))
+        )
+
+        study_locus = SusieFineMapperStep.susie_inf_to_studylocus(
+            susie_output=susie_output,
+            session=session,
+            studyId=studyId,
+            region=region,
+            variant_index=variant_index,
+            sum_pips=sum_pips,
+        )
+
+        end_time = time.time()
+
+        log_df = pd.DataFrame(
+            {
+                "N_gwas": N_gwas,
+                "N_ld": N_ld,
+                "N_overlap": N_after_merge,
+                "N_outliers": N_outliers,
+                "N_imputed": N_imputed,
+                "N_final_to_fm": len(ld_to_fm),
+                "eleapsed_time": end_time - start_time,
+            },
+            index=[0],
+        )
+
+        return {
+            "study_locus": study_locus,
+            "log": log_df,
+        }
+
+    @staticmethod
+    def susie_finemapper_one_studylocus_row_v2_dev(
+        GWAS: SummaryStatistics,
+        session: Session,
+        study_locus_row: Row,
+        study_index: StudyIndex,
+        window: int = 1_000_000,
+        L: int = 10,
+        susie_est_tausq: bool = False,
+        run_carma: bool = False,
+        run_sumstat_imputation: bool = False,
+        carma_time_limit: int = 600,
+        imputed_r2_threshold: float = 0.8,
+        ld_score_threshold: float = 4,
+        sum_pips: float = 0.99,
+    ) -> dict[str, Any]:
+        """Susie fine-mapper function that uses Summary Statistics, chromosome and position as inputs.
+
+        Args:
+            GWAS (SummaryStatistics): GWAS summary statistics
+            session (Session): Spark session
+            study_locus_row (Row): StudyLocus row
+            study_index (StudyIndex): StudyIndex object
+            window (int): window size for fine-mapping
+            L (int): number of causal variants
+            susie_est_tausq (bool): estimate tau squared, default is False
+            run_carma (bool): run CARMA, default is False
+            run_sumstat_imputation (bool): run summary statistics imputation, default is False
+            carma_time_limit (int): CARMA time limit, default is 600 seconds
+            imputed_r2_threshold (float): imputed R2 threshold, default is 0.8
+            ld_score_threshold (float): LD score threshold for imputation, default is 4
+            sum_pips (float): the expected sum of posterior probabilities in the locus, default is 0.99 (99% credible set)
+
+        Returns:
+            dict[str, Any]: dictionary with study locus, number of GWAS variants, number of LD variants, number of variants after merge, number of outliers, number of imputed variants, number of variants to fine-map
+        """
+        # PLEASE DO NOT REMOVE THIS LINE
+        pd.DataFrame.iteritems = pd.DataFrame.items
+
+        chromosome = study_locus_row["chromosome"]
+        position = study_locus_row["position"]
+        studyId = study_locus_row["studyId"]
+
+        study_index_df = study_index._df
+        study_index_df = study_index_df.filter(f.col("studyId") == studyId)
+        major_population = study_index_df.select(
+            "studyId",
+            f.array_max(f.col("ldPopulationStructure"))
+            .getItem("ldPopulation")
+            .alias("majorPopulation"),
+        ).collect()[0]["majorPopulation"]
+
+        region = (
+            chromosome
+            + ":"
+            + str(int(position - window / 2))
+            + "-"
+            + str(int(position + window / 2))
+        )
+        gwas_df = (
+            GWAS.df.withColumn("z", f.col("beta") / f.col("standardError"))
+            .withColumn(
+                "chromosome", f.split(f.col("variantId"), "_")[0].cast("string")
+            )
+            .withColumn("position", f.split(f.col("variantId"), "_")[1].cast("int"))
+            .filter(f.col("studyId") == studyId)
+            .filter(f.col("z").isNotNull())
+            .filter(f.col("chromosome") == chromosome)
+            .filter(f.col("position") >= position - window / 2)
+            .filter(f.col("position") <= position + window / 2)
+        )
+
+        ld_index = (
+            GnomADLDMatrix()
+            .get_locus_index(
+                study_locus_row=study_locus_row,
+                window_size=window,
+                major_population=major_population,
+            )
+            .withColumn(
+                "variantId",
+                f.concat(
+                    f.lit(chromosome),
+                    f.lit("_"),
+                    f.col("`locus.position`"),
+                    f.lit("_"),
+                    f.col("alleles").getItem(0),
+                    f.lit("_"),
+                    f.col("alleles").getItem(1),
+                ).cast("string"),
+            )
+        )
+
+        gnomad_ld = GnomADLDMatrix.get_numpy_matrix(
+            ld_index, gnomad_ancestry=major_population
+        )
+
+        out = SusieFineMapperStep.susie_finemapper_from_prepared_dataframes(
+            GWAS_df=gwas_df,
+            ld_index=ld_index,
+            gnomad_ld=gnomad_ld,
+            L=L,
+            session=session,
+            studyId=studyId,
+            region=region,
+            susie_est_tausq=susie_est_tausq,
+            run_carma=run_carma,
+            run_sumstat_imputation=run_sumstat_imputation,
+            carma_time_limit=carma_time_limit,
+            imputed_r2_threshold=imputed_r2_threshold,
+            ld_score_threshold=ld_score_threshold,
+            sum_pips=sum_pips,
+        )
+
+        return out

From dc4e3674d30baf736fd540871c1fc00179777290 Mon Sep 17 00:00:00 2001
From: Yakov <yt4@sanger.ac.uk>
Date: Wed, 10 Apr 2024 13:23:44 +0100
Subject: [PATCH 02/21] fix: adding deduplication for GWAS in locus (#573)

---
 src/gentropy/susie_finemapper.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/gentropy/susie_finemapper.py b/src/gentropy/susie_finemapper.py
index aba5b4c40..db62fe359 100644
--- a/src/gentropy/susie_finemapper.py
+++ b/src/gentropy/susie_finemapper.py
@@ -427,6 +427,11 @@ def susie_finemapper_from_prepared_dataframes(
 
         start_time = time.time()
         GWAS_df = GWAS_df.toPandas()
+        N_gwas_before_dedupl = len(GWAS_df)
+
+        GWAS_df = GWAS_df.drop_duplicates(subset="variantId", keep=False)
+        GWAS_df = GWAS_df.reset_index()
+
         ld_index = ld_index.toPandas()
         ld_index = ld_index.reset_index()
 
@@ -539,6 +544,7 @@ def susie_finemapper_from_prepared_dataframes(
 
         log_df = pd.DataFrame(
             {
+                "N_gwas_before_dedupl": N_gwas_before_dedupl,
                 "N_gwas": N_gwas,
                 "N_ld": N_ld,
                 "N_overlap": N_after_merge,

From a5b62f2359aa8285ce638329400c39a55101d622 Mon Sep 17 00:00:00 2001
From: Yakov <yt4@sanger.ac.uk>
Date: Wed, 10 Apr 2024 14:02:51 +0100
Subject: [PATCH 03/21] feat: add benchmarking for fine-mapping using Alzheimer
 as example (#572)

* feat: add benchmarking for fine-mapping using Alzheimer as example

* fix: small fix in notebook
---
 notebooks/FineMapping_AlzheimierDisease.ipynb | 1734 +++++++++++++++++
 1 file changed, 1734 insertions(+)
 create mode 100644 notebooks/FineMapping_AlzheimierDisease.ipynb

diff --git a/notebooks/FineMapping_AlzheimierDisease.ipynb b/notebooks/FineMapping_AlzheimierDisease.ipynb
new file mode 100644
index 000000000..8a785cc3f
--- /dev/null
+++ b/notebooks/FineMapping_AlzheimierDisease.ipynb
@@ -0,0 +1,1734 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Fine-mapping of Alzheimer's disease GWAS summary statistics using GentroPy"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This notebook presents an example of fine-mapping a GWAS Catalog study of Alzheimer's disease ([link to study](https://genetics.opentargets.org/Study/GCST90012877/associations)). The study is a good benchmarking example for fine-mapping: it has a relatively large number of SNPs and a very strong signal on chromosome 19 (APOE). Note that very strong signals are usually excluded from fine-mapping due to instability.\n",
+    "\n",
+    "We also excluded the MHC region (6:28M-34M) from fine-mapping because of its very high variant density.\n",
+    "\n",
+    "To run this notebook on your local machine (rather than on Dataproc), you need to install the GCS connector: https://github.com/broadinstitute/install-gcs-connector."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Initialization"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Your browser has been opened to visit:\n",
+      "\n",
+      "    https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=764086051850-6qr4p6gpi6hn506pt8ejuq83di341hur.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A8085%2F&scope=openid+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fuserinfo.email+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcloud-platform+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fsqlservice.login+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Faccounts.reauth&state=2Jvk4c7unAsigRvEKhceIxcrpGmeK8&access_type=offline&code_challenge=84guS6MmOY7qgvNpHLxoJbhRDBUAEUS93teMwQboD3Q&code_challenge_method=S256\n",
+      "\n",
+      "\n",
+      "Credentials saved to file: [/Users/yt4/.config/gcloud/application_default_credentials.json]\n",
+      "\n",
+      "These credentials will be used by any library that requests Application Default Credentials (ADC).\n",
+      "\n",
+      "Quota project \"open-targets-genetics-dev\" was added to ADC which can be used by Google client libraries for billing and quota. Note that some services may still bill the project owning the resource.\n",
+      "\n",
+      "\n",
+      "Updates are available for some Google Cloud CLI components.  To install them,\n",
+      "please run:\n",
+      "  $ gcloud components update\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "!gcloud auth application-default login"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<style>\n",
+       "        .bk-notebook-logo {\n",
+       "            display: block;\n",
+       "            width: 20px;\n",
+       "            height: 20px;\n",
+       "            background-image: url(data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABQAAAAUCAYAAACNiR0NAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAABx0RVh0U29mdHdhcmUAQWRvYmUgRmlyZXdvcmtzIENTNui8sowAAAOkSURBVDiNjZRtaJVlGMd/1/08zzln5zjP1LWcU9N0NkN8m2CYjpgQYQXqSs0I84OLIC0hkEKoPtiH3gmKoiJDU7QpLgoLjLIQCpEsNJ1vqUOdO7ppbuec5+V+rj4ctwzd8IIbbi6u+8f1539dt3A78eXC7QizUF7gyV1fD1Yqg4JWz84yffhm0qkFqBogB9rM8tZdtwVsPUhWhGcFJngGeWrPzHm5oaMmkfEg1usvLFyc8jLRqDOMru7AyC8saQr7GG7f5fvDeH7Ej8CM66nIF+8yngt6HWaKh7k49Soy9nXurCi1o3qUbS3zWfrYeQDTB/Qj6kX6Ybhw4B+bOYoLKCC9H3Nu/leUTZ1JdRWkkn2ldcCamzrcf47KKXdAJllSlxAOkRgyHsGC/zRday5Qld9DyoM4/q/rUoy/CXh3jzOu3bHUVZeU+DEn8FInkPBFlu3+nW3Nw0mk6vCDiWg8CeJaxEwuHS3+z5RgY+YBR6V1Z1nxSOfoaPa4LASWxxdNp+VWTk7+4vzaou8v8PN+xo+KY2xsw6une2frhw05CTYOmQvsEhjhWjn0bmXPjpE1+kplmmkP3suftwTubK9Vq22qKmrBhpY4jvd5afdRA3wGjFAgcnTK2s4hY0/GPNIb0nErGMCRxWOOX64Z8RAC4oCXdklmEvcL8o0BfkNK4lUg9HTl+oPlQxdNo3Mg4Nv175e/1LDGzZen30MEjRUtmXSfiTVu1kK8W4txyV6BMKlbgk3lMwYCiusNy9fVfvvwMxv8Ynl6vxoByANLTWplvuj/nF9m2+PDtt1eiHPBr1oIfhCChQMBw6Aw0UulqTKZdfVvfG7VcfIqLG9bcldL/+pdWTLxLUy8Qq38heUIjh4XlzZxzQm19lLFlr8vdQ97rjZVOLf8nclzckbcD4wxXMidpX30sFd37Fv/GtwwhzhxGVAprjbg0gCAEeIgwCZyTV2Z1REEW8O4py0wsjeloKoMr6iCY6dP92H6Vw/oTyICIthibxjm/DfN9lVz8IqtqKYLUXfoKVMVQVVJOElGjrnnUt9T9wbgp8AyYKaGlqingHZU/uG2NTZSVqwHQTWkx9hxjkpWDaCg6Ckj5qebgBVbT3V3NNXMSiWSDdGV3hrtzla7J+duwPOToIg42ChPQOQjspnSlp1V+Gjdged7+8UN5CRAV7a5EdFNwCjEaBR27b3W890TE7g24NAP/mMDXRWrGoFPQI9ls/MWO2dWFAar/xcOIImbbpA3zgAAAABJRU5ErkJggg==);\n",
+       "        }\n",
+       "    </style>\n",
+       "    <div>\n",
+       "        <a href=\"https://bokeh.org\" target=\"_blank\" class=\"bk-notebook-logo\"></a>\n",
+       "        <span id=\"a8e9a6a6-96f7-4efd-a426-b2299499ef03\">Loading BokehJS ...</span>\n",
+       "    </div>\n"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/javascript": "(function(root) {\n  function now() {\n    return new Date();\n  }\n\n  const force = true;\n\n  if (typeof root._bokeh_onload_callbacks === \"undefined\" || force === true) {\n    root._bokeh_onload_callbacks = [];\n    root._bokeh_is_loading = undefined;\n  }\n\nconst JS_MIME_TYPE = 'application/javascript';\n  const HTML_MIME_TYPE = 'text/html';\n  const EXEC_MIME_TYPE = 'application/vnd.bokehjs_exec.v0+json';\n  const CLASS_NAME = 'output_bokeh rendered_html';\n\n  /**\n   * Render data to the DOM node\n   */\n  function render(props, node) {\n    const script = document.createElement(\"script\");\n    node.appendChild(script);\n  }\n\n  /**\n   * Handle when an output is cleared or removed\n   */\n  function handleClearOutput(event, handle) {\n    function drop(id) {\n      const view = Bokeh.index.get_by_id(id)\n      if (view != null) {\n        view.model.document.clear()\n        Bokeh.index.delete(view)\n      }\n    }\n\n    const cell = handle.cell;\n\n    const id = cell.output_area._bokeh_element_id;\n    const server_id = cell.output_area._bokeh_server_id;\n\n    // Clean up Bokeh references\n    if (id != null) {\n      drop(id)\n    }\n\n    if (server_id !== undefined) {\n      // Clean up Bokeh references\n      const cmd_clean = \"from bokeh.io.state import curstate; print(curstate().uuid_to_server['\" + server_id + \"'].get_sessions()[0].document.roots[0]._id)\";\n      cell.notebook.kernel.execute(cmd_clean, {\n        iopub: {\n          output: function(msg) {\n            const id = msg.content.text.trim()\n            drop(id)\n          }\n        }\n      });\n      // Destroy server and session\n      const cmd_destroy = \"import bokeh.io.notebook as ion; ion.destroy_server('\" + server_id + \"')\";\n      cell.notebook.kernel.execute(cmd_destroy);\n    }\n  }\n\n  /**\n   * Handle when a new output is added\n   */\n  function handleAddOutput(event, handle) {\n    const output_area = 
handle.output_area;\n    const output = handle.output;\n\n    // limit handleAddOutput to display_data with EXEC_MIME_TYPE content only\n    if ((output.output_type != \"display_data\") || (!Object.prototype.hasOwnProperty.call(output.data, EXEC_MIME_TYPE))) {\n      return\n    }\n\n    const toinsert = output_area.element.find(\".\" + CLASS_NAME.split(' ')[0]);\n\n    if (output.metadata[EXEC_MIME_TYPE][\"id\"] !== undefined) {\n      toinsert[toinsert.length - 1].firstChild.textContent = output.data[JS_MIME_TYPE];\n      // store reference to embed id on output_area\n      output_area._bokeh_element_id = output.metadata[EXEC_MIME_TYPE][\"id\"];\n    }\n    if (output.metadata[EXEC_MIME_TYPE][\"server_id\"] !== undefined) {\n      const bk_div = document.createElement(\"div\");\n      bk_div.innerHTML = output.data[HTML_MIME_TYPE];\n      const script_attrs = bk_div.children[0].attributes;\n      for (let i = 0; i < script_attrs.length; i++) {\n        toinsert[toinsert.length - 1].firstChild.setAttribute(script_attrs[i].name, script_attrs[i].value);\n        toinsert[toinsert.length - 1].firstChild.textContent = bk_div.children[0].textContent\n      }\n      // store reference to server id on output_area\n      output_area._bokeh_server_id = output.metadata[EXEC_MIME_TYPE][\"server_id\"];\n    }\n  }\n\n  function register_renderer(events, OutputArea) {\n\n    function append_mime(data, metadata, element) {\n      // create a DOM node to render to\n      const toinsert = this.create_output_subarea(\n        metadata,\n        CLASS_NAME,\n        EXEC_MIME_TYPE\n      );\n      this.keyboard_manager.register_events(toinsert);\n      // Render to node\n      const props = {data: data, metadata: metadata[EXEC_MIME_TYPE]};\n      render(props, toinsert[toinsert.length - 1]);\n      element.append(toinsert);\n      return toinsert\n    }\n\n    /* Handle when an output is cleared or removed */\n    events.on('clear_output.CodeCell', handleClearOutput);\n    
events.on('delete.Cell', handleClearOutput);\n\n    /* Handle when a new output is added */\n    events.on('output_added.OutputArea', handleAddOutput);\n\n    /**\n     * Register the mime type and append_mime function with output_area\n     */\n    OutputArea.prototype.register_mime_type(EXEC_MIME_TYPE, append_mime, {\n      /* Is output safe? */\n      safe: true,\n      /* Index of renderer in `output_area.display_order` */\n      index: 0\n    });\n  }\n\n  // register the mime type if in Jupyter Notebook environment and previously unregistered\n  if (root.Jupyter !== undefined) {\n    const events = require('base/js/events');\n    const OutputArea = require('notebook/js/outputarea').OutputArea;\n\n    if (OutputArea.prototype.mime_types().indexOf(EXEC_MIME_TYPE) == -1) {\n      register_renderer(events, OutputArea);\n    }\n  }\n  if (typeof (root._bokeh_timeout) === \"undefined\" || force === true) {\n    root._bokeh_timeout = Date.now() + 5000;\n    root._bokeh_failed_load = false;\n  }\n\n  const NB_LOAD_WARNING = {'data': {'text/html':\n     \"<div style='background-color: #fdd'>\\n\"+\n     \"<p>\\n\"+\n     \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n     \"may be due to a slow or bad network connection. 
Possible fixes:\\n\"+\n     \"</p>\\n\"+\n     \"<ul>\\n\"+\n     \"<li>re-rerun `output_notebook()` to attempt to load from CDN again, or</li>\\n\"+\n     \"<li>use INLINE resources instead, as so:</li>\\n\"+\n     \"</ul>\\n\"+\n     \"<code>\\n\"+\n     \"from bokeh.resources import INLINE\\n\"+\n     \"output_notebook(resources=INLINE)\\n\"+\n     \"</code>\\n\"+\n     \"</div>\"}};\n\n  function display_loaded() {\n    const el = document.getElementById(\"a8e9a6a6-96f7-4efd-a426-b2299499ef03\");\n    if (el != null) {\n      el.textContent = \"BokehJS is loading...\";\n    }\n    if (root.Bokeh !== undefined) {\n      if (el != null) {\n        el.textContent = \"BokehJS \" + root.Bokeh.version + \" successfully loaded.\";\n      }\n    } else if (Date.now() < root._bokeh_timeout) {\n      setTimeout(display_loaded, 100)\n    }\n  }\n\n  function run_callbacks() {\n    try {\n      root._bokeh_onload_callbacks.forEach(function(callback) {\n        if (callback != null)\n          callback();\n      });\n    } finally {\n      delete root._bokeh_onload_callbacks\n    }\n    console.debug(\"Bokeh: all callbacks have finished\");\n  }\n\n  function load_libs(css_urls, js_urls, callback) {\n    if (css_urls == null) css_urls = [];\n    if (js_urls == null) js_urls = [];\n\n    root._bokeh_onload_callbacks.push(callback);\n    if (root._bokeh_is_loading > 0) {\n      console.debug(\"Bokeh: BokehJS is being loaded, scheduling callback at\", now());\n      return null;\n    }\n    if (js_urls == null || js_urls.length === 0) {\n      run_callbacks();\n      return null;\n    }\n    console.debug(\"Bokeh: BokehJS not loaded, scheduling load and callback at\", now());\n    root._bokeh_is_loading = css_urls.length + js_urls.length;\n\n    function on_load() {\n      root._bokeh_is_loading--;\n      if (root._bokeh_is_loading === 0) {\n        console.debug(\"Bokeh: all BokehJS libraries/stylesheets loaded\");\n        run_callbacks()\n      }\n    }\n\n    function 
on_error(url) {\n      console.error(\"failed to load \" + url);\n    }\n\n    for (let i = 0; i < css_urls.length; i++) {\n      const url = css_urls[i];\n      const element = document.createElement(\"link\");\n      element.onload = on_load;\n      element.onerror = on_error.bind(null, url);\n      element.rel = \"stylesheet\";\n      element.type = \"text/css\";\n      element.href = url;\n      console.debug(\"Bokeh: injecting link tag for BokehJS stylesheet: \", url);\n      document.body.appendChild(element);\n    }\n\n    for (let i = 0; i < js_urls.length; i++) {\n      const url = js_urls[i];\n      const element = document.createElement('script');\n      element.onload = on_load;\n      element.onerror = on_error.bind(null, url);\n      element.async = false;\n      element.src = url;\n      console.debug(\"Bokeh: injecting script tag for BokehJS library: \", url);\n      document.head.appendChild(element);\n    }\n  };\n\n  function inject_raw_css(css) {\n    const element = document.createElement(\"style\");\n    element.appendChild(document.createTextNode(css));\n    document.body.appendChild(element);\n  }\n\n  const js_urls = [\"https://cdn.bokeh.org/bokeh/release/bokeh-3.3.0.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-gl-3.3.0.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-widgets-3.3.0.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-tables-3.3.0.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-mathjax-3.3.0.min.js\"];\n  const css_urls = [];\n\n  const inline_js = [    function(Bokeh) {\n      Bokeh.set_log_level(\"info\");\n    },\nfunction(Bokeh) {\n    }\n  ];\n\n  function run_inline_js() {\n    if (root.Bokeh !== undefined || force === true) {\n          for (let i = 0; i < inline_js.length; i++) {\n      inline_js[i].call(root, root.Bokeh);\n    }\nif (force === true) {\n        display_loaded();\n      }} else if (Date.now() < root._bokeh_timeout) {\n      setTimeout(run_inline_js, 100);\n    } else if 
(!root._bokeh_failed_load) {\n      console.log(\"Bokeh: BokehJS failed to load within specified timeout.\");\n      root._bokeh_failed_load = true;\n    } else if (force !== true) {\n      const cell = $(document.getElementById(\"a8e9a6a6-96f7-4efd-a426-b2299499ef03\")).parents('.cell').data().cell;\n      cell.output_area.append_execute_result(NB_LOAD_WARNING)\n    }\n  }\n\n  if (root._bokeh_is_loading === 0) {\n    console.debug(\"Bokeh: BokehJS loaded, going straight to plotting\");\n    run_inline_js();\n  } else {\n    load_libs(css_urls, js_urls, function() {\n      console.debug(\"Bokeh: BokehJS plotting callback run at\", now());\n      run_inline_js();\n    });\n  }\n}(window));",
+      "application/vnd.bokehjs_load.v0+json": ""
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "24/04/09 10:40:45 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Setting default log level to \"WARN\".\n",
+      "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n",
+      "pip-installed Hail requires additional configuration options in Spark referring\n",
+      "  to the path to the Hail Python module directory HAIL_DIR,\n",
+      "  e.g. /path/to/python/site-packages/hail:\n",
+      "    spark.jars=HAIL_DIR/backend/hail-all-spark.jar\n",
+      "    spark.driver.extraClassPath=HAIL_DIR/backend/hail-all-spark.jar\n",
+      "    spark.executor.extraClassPath=./hail-all-spark.jarRunning on Apache Spark version 3.3.4\n",
+      "SparkUI available at http://192.168.0.232:4040\n",
+      "Welcome to\n",
+      "     __  __     <>__\n",
+      "    / /_/ /__  __/ /\n",
+      "   / __  / _ `/ / /\n",
+      "  /_/ /_/\\_,_/_/_/   version 0.2.127-bb535cd096c5\n",
+      "LOGGING: writing to /dev/null\n"
+     ]
+    }
+   ],
+   "source": [
+    "import os\n",
+    "import hail as hl\n",
+    "import pyspark.sql.functions as f\n",
+    "import pandas as pd\n",
+    "pd.set_option('display.max_colwidth', None)\n",
+    "pd.set_option('display.expand_frame_repr', False)\n",
+    "\n",
+    "from gentropy.common.session import Session\n",
+    "from gentropy.dataset.study_index import StudyIndex\n",
+    "from gentropy.dataset.summary_statistics import SummaryStatistics\n",
+    "from gentropy.dataset.study_index import StudyIndex\n",
+    "from gentropy.method.window_based_clumping import WindowBasedClumping\n",
+    "from gentropy.susie_finemapper import SusieFineMapperStep\n",
+    "\n",
+    "hail_dir = os.path.dirname(hl.__file__)\n",
+    "session = Session(hail_home=hail_dir, start_hail=True, extended_spark_conf={\"spark.driver.memory\": \"12g\",\n",
+    "    \"spark.kryoserializer.buffer.max\": \"500m\",\"spark.driver.maxResultSize\":\"3g\"})\n",
+    "hl.init(sc=session.spark.sparkContext, log=\"/dev/null\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Loading the data and clumping"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "                                                                                \r"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Number of SNPs in GWAS:  10607272\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[Stage 7:>                                                          (0 + 1) / 1]\r"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Number of clumps:  33\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "                                                                                \r"
+     ]
+    }
+   ],
+   "source": [
+    "path_gwas1=\"gs://gwas_catalog_data/harmonised_summary_statistics/GCST90012877.parquet\"\n",
+    "path_si=\"gs://gwas_catalog_data/study_index\"\n",
+    "\n",
+    "gwas1 = SummaryStatistics.from_parquet(session, path_gwas1)\n",
+    "study_index = StudyIndex.from_parquet(session, path_si)\n",
+    "\n",
+    "slt=WindowBasedClumping.clump(gwas1,gwas_significance=5e-8,distance=1e6)\n",
+    "slt_df=slt._df\n",
+    "\n",
+    "print(\"Number of SNPs in GWAS: \",gwas1._df.count())\n",
+    "print(\"Number of clumps: \",slt_df.count())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[Stage 15:===================================================>     (9 + 1) / 10]\r"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "+------------+----------------+----------+---------+----------------+----------+--------------+--------------+-------------------------------+----------------+--------------------+---------------+\n",
+      "|     studyId|       variantId|chromosome| position|            beta|sampleSize|pValueMantissa|pValueExponent|effectAlleleFrequencyFromSource|   standardError|        studyLocusId|qualityControls|\n",
+      "+------------+----------------+----------+---------+----------------+----------+--------------+--------------+-------------------------------+----------------+--------------------+---------------+\n",
+      "|GCST90012877| 1_161185602_G_A|         1|161185602| 0.0609052805639|      null|         4.302|            -8|                        0.23499| 0.0111181765833| 6360456299763482946|             []|\n",
+      "|GCST90012877| 1_207577223_T_C|         1|207577223| -0.122752564739|      null|         1.403|           -23|                       0.822818| 0.0122652043685|-6742466305250328444|             []|\n",
+      "|GCST90012877| 10_11678309_A_G|        10| 11678309| 0.0668997305692|      null|         1.085|           -11|                       0.380517|0.00984571382836| 3672202482976347473|             []|\n",
+      "|GCST90012877| 10_59886075_G_T|        10| 59886075|-0.0523916765294|      null|         3.802|            -8|                       0.480668|0.00952612570169|  760299597568413738|             []|\n",
+      "|GCST90012877| 10_80520381_T_G|        10| 80520381| 0.0701098772587|      null|         2.736|            -9|                       0.793475| 0.0117897597766|-6168361428432361140|             []|\n",
+      "|GCST90012877|11_121564878_T_C|        11|121564878| -0.186386086749|      null|         5.586|           -14|                       0.037005| 0.0247938672944|-7548659272243096830|             []|\n",
+      "|GCST90012877| 11_47370397_G_A|        11| 47370397| 0.0634588530202|      null|         6.911|           -11|                       0.387521| 0.0097291000298| 1916491992423016132|             []|\n",
+      "|GCST90012877| 11_60328267_T_C|        11| 60328267|-0.0892048800109|      null|         9.335|           -20|                       0.371215|0.00980658024905| 3318332793803757311|             []|\n",
+      "|GCST90012877| 11_86156833_A_G|        11| 86156833|  0.103281644827|      null|         5.214|           -26|                       0.629462|0.00979200684254| 3806751464721795080|             []|\n",
+      "|GCST90012877| 14_52924962_A_G|        14| 52924962|  0.102404628268|      null|          3.69|           -10|                       0.092233| 0.0163413709974|-8640267085448358001|             []|\n",
+      "|GCST90012877| 14_92472511_G_A|        14| 92472511|-0.0762776811698|      null|         7.454|           -14|                       0.339674| 0.0101980809801| 8895835730818824947|             []|\n",
+      "|GCST90012877| 15_50707194_C_G|        15| 50707194|-0.0722934881552|      null|         1.639|            -9|                       0.197469| 0.0119888249532|-4585712009512019667|             []|\n",
+      "|GCST90012877| 15_58730416_T_C|        15| 58730416|-0.0675867539589|      null|         2.674|           -11|                       0.319058|  0.010142839928|-9173595866829505633|             []|\n",
+      "|GCST90012877| 15_63277703_C_T|        15| 63277703| 0.0849598934189|      null|         1.052|            -8|                       0.139487| 0.0148475601067|-6181511576673508209|             []|\n",
+      "|GCST90012877| 16_31115000_C_A|        16| 31115000|-0.0620662164665|      null|         4.466|            -9|                       0.281382| 0.0105807514538|-3612515273077152914|             []|\n",
+      "|GCST90012877|  17_5229833_T_C|        17|  5229833|-0.0849787931131|      null|         1.352|            -9|                       0.875068| 0.0140203927902|-7070596043624425654|             []|\n",
+      "|GCST90012877| 17_63483402_T_C|        17| 63483402| 0.0542810764988|      null|         1.215|            -8|                       0.529632|0.00952697587266| 7171154626284587162|             []|\n",
+      "|GCST90012877|  19_1050875_A_G|        19|  1050875|-0.0772974277902|      null|         2.415|           -13|                       0.674169| 0.0105546077307| 6109438569946056978|             []|\n",
+      "|GCST90012877| 19_44892009_G_A|        19| 44892009|  0.352722374032|      null|         1.995|          -277|                       0.605067|0.00991069396551| 6814727764900576662|             []|\n",
+      "|GCST90012877| 19_51224706_C_A|        19| 51224706|-0.0582180344342|      null|         1.295|            -8|                       0.325551|  0.010237506551|-8288099943480320096|             []|\n",
+      "+------------+----------------+----------+---------+----------------+----------+--------------+--------------+-------------------------------+----------------+--------------------+---------------+\n",
+      "only showing top 20 rows\n",
+      "\n",
+      "None\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "                                                                                \r"
+     ]
+    }
+   ],
+   "source": [
+    "print(slt_df.show())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Fine-mapping without outlier detection or imputation, using a 2M window size"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "                                                                                \r"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2024-04-09 10:41:57.354 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n",
+      "    'freq_index_dict' -> 'freq_index_dict_1'\n",
+      "    'faf_index_dict' -> 'faf_index_dict_1'\n",
+      "    'rf' -> 'rf_1'\n",
+      "    'age_index_dict' -> 'age_index_dict_1'\n",
+      "    'freq_meta' -> 'freq_meta_1'\n",
+      "    'age_distribution' -> 'age_distribution_1'\n",
+      "    'popmax_index_dict' -> 'popmax_index_dict_1'\n",
+      "2024-04-09 10:42:15.499 Hail: INFO: Coerced sorted dataset\n",
+      "2024-04-09 10:42:28.284 Hail: INFO: Coerced sorted dataset\n",
+      "2024-04-09 10:44:41.305 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n",
+      "    'freq_index_dict' -> 'freq_index_dict_1'\n",
+      "    'faf_index_dict' -> 'faf_index_dict_1'\n",
+      "    'rf' -> 'rf_1'\n",
+      "    'age_index_dict' -> 'age_index_dict_1'\n",
+      "    'freq_meta' -> 'freq_meta_1'\n",
+      "    'age_distribution' -> 'age_distribution_1'\n",
+      "    'popmax_index_dict' -> 'popmax_index_dict_1'\n",
+      "2024-04-09 10:44:51.854 Hail: INFO: Coerced sorted dataset\n",
+      "2024-04-09 10:45:03.059 Hail: INFO: Coerced sorted dataset\n",
+      "2024-04-09 10:47:04.871 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n",
+      "    'freq_index_dict' -> 'freq_index_dict_1'\n",
+      "    'faf_index_dict' -> 'faf_index_dict_1'\n",
+      "    'rf' -> 'rf_1'\n",
+      "    'age_index_dict' -> 'age_index_dict_1'\n",
+      "    'freq_meta' -> 'freq_meta_1'\n",
+      "    'age_distribution' -> 'age_distribution_1'\n",
+      "    'popmax_index_dict' -> 'popmax_index_dict_1'\n",
+      "2024-04-09 10:47:17.310 Hail: INFO: Coerced sorted dataset\n",
+      "2024-04-09 10:47:29.113 Hail: INFO: Coerced sorted dataset\n",
+      "2024-04-09 10:50:32.790 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n",
+      "    'freq_index_dict' -> 'freq_index_dict_1'\n",
+      "    'faf_index_dict' -> 'faf_index_dict_1'\n",
+      "    'rf' -> 'rf_1'\n",
+      "    'age_index_dict' -> 'age_index_dict_1'\n",
+      "    'freq_meta' -> 'freq_meta_1'\n",
+      "    'age_distribution' -> 'age_distribution_1'\n",
+      "    'popmax_index_dict' -> 'popmax_index_dict_1'\n",
+      "2024-04-09 10:50:46.191 Hail: INFO: Coerced sorted dataset\n",
+      "2024-04-09 10:50:57.958 Hail: INFO: Coerced sorted dataset\n",
+      "2024-04-09 10:53:22.698 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n",
+      "    'freq_index_dict' -> 'freq_index_dict_1'\n",
+      "    'faf_index_dict' -> 'faf_index_dict_1'\n",
+      "    'rf' -> 'rf_1'\n",
+      "    'age_index_dict' -> 'age_index_dict_1'\n",
+      "    'freq_meta' -> 'freq_meta_1'\n",
+      "    'age_distribution' -> 'age_distribution_1'\n",
+      "    'popmax_index_dict' -> 'popmax_index_dict_1'\n",
+      "2024-04-09 10:53:34.535 Hail: INFO: Coerced sorted dataset\n",
+      "2024-04-09 10:53:45.816 Hail: INFO: Coerced sorted dataset\n",
+      "2024-04-09 10:57:23.189 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n",
+      "    'freq_index_dict' -> 'freq_index_dict_1'\n",
+      "    'faf_index_dict' -> 'faf_index_dict_1'\n",
+      "    'rf' -> 'rf_1'\n",
+      "    'age_index_dict' -> 'age_index_dict_1'\n",
+      "    'freq_meta' -> 'freq_meta_1'\n",
+      "    'age_distribution' -> 'age_distribution_1'\n",
+      "    'popmax_index_dict' -> 'popmax_index_dict_1'\n",
+      "2024-04-09 10:57:34.563 Hail: INFO: Coerced sorted dataset\n",
+      "2024-04-09 10:57:43.988 Hail: INFO: Coerced sorted dataset\n",
+      "2024-04-09 10:59:39.834 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n",
+      "    'freq_index_dict' -> 'freq_index_dict_1'\n",
+      "    'faf_index_dict' -> 'faf_index_dict_1'\n",
+      "    'rf' -> 'rf_1'\n",
+      "    'age_index_dict' -> 'age_index_dict_1'\n",
+      "    'freq_meta' -> 'freq_meta_1'\n",
+      "    'age_distribution' -> 'age_distribution_1'\n",
+      "    'popmax_index_dict' -> 'popmax_index_dict_1'\n",
+      "2024-04-09 10:59:52.878 Hail: INFO: Coerced sorted dataset\n",
+      "2024-04-09 11:00:06.629 Hail: INFO: Coerced sorted dataset\n",
+      "2024-04-09 11:02:11.433 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n",
+      "    'freq_index_dict' -> 'freq_index_dict_1'\n",
+      "    'faf_index_dict' -> 'faf_index_dict_1'\n",
+      "    'rf' -> 'rf_1'\n",
+      "    'age_index_dict' -> 'age_index_dict_1'\n",
+      "    'freq_meta' -> 'freq_meta_1'\n",
+      "    'age_distribution' -> 'age_distribution_1'\n",
+      "    'popmax_index_dict' -> 'popmax_index_dict_1'\n",
+      "2024-04-09 11:02:22.101 Hail: INFO: Ordering unsorted dataset with network shuffle\n",
+      "2024-04-09 11:02:32.320 Hail: INFO: wrote table with 175330 rows in 8 partitions to /tmp/__iruid_20813-EyC6kjgQ1hAjFSiH1Xp7sB\n",
+      "2024-04-09 11:02:35.350 Hail: INFO: Coerced sorted dataset\n",
+      "2024-04-09 11:04:16.225 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n",
+      "    'freq_index_dict' -> 'freq_index_dict_1'\n",
+      "    'faf_index_dict' -> 'faf_index_dict_1'\n",
+      "    'rf' -> 'rf_1'\n",
+      "    'age_index_dict' -> 'age_index_dict_1'\n",
+      "    'freq_meta' -> 'freq_meta_1'\n",
+      "    'age_distribution' -> 'age_distribution_1'\n",
+      "    'popmax_index_dict' -> 'popmax_index_dict_1'\n",
+      "2024-04-09 11:04:27.837 Hail: INFO: Coerced sorted dataset\n",
+      "2024-04-09 11:04:41.879 Hail: INFO: Coerced sorted dataset\n",
+      "2024-04-09 11:07:09.950 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n",
+      "    'freq_index_dict' -> 'freq_index_dict_1'\n",
+      "    'faf_index_dict' -> 'faf_index_dict_1'\n",
+      "    'rf' -> 'rf_1'\n",
+      "    'age_index_dict' -> 'age_index_dict_1'\n",
+      "    'freq_meta' -> 'freq_meta_1'\n",
+      "    'age_distribution' -> 'age_distribution_1'\n",
+      "    'popmax_index_dict' -> 'popmax_index_dict_1'\n",
+      "2024-04-09 11:07:21.139 Hail: INFO: Coerced sorted dataset\n",
+      "2024-04-09 11:07:33.197 Hail: INFO: Coerced sorted dataset\n",
+      "2024-04-09 11:09:56.240 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n",
+      "    'freq_index_dict' -> 'freq_index_dict_1'\n",
+      "    'faf_index_dict' -> 'faf_index_dict_1'\n",
+      "    'rf' -> 'rf_1'\n",
+      "    'age_index_dict' -> 'age_index_dict_1'\n",
+      "    'freq_meta' -> 'freq_meta_1'\n",
+      "    'age_distribution' -> 'age_distribution_1'\n",
+      "    'popmax_index_dict' -> 'popmax_index_dict_1'\n",
+      "2024-04-09 11:10:08.288 Hail: INFO: Coerced sorted dataset\n",
+      "2024-04-09 11:10:20.802 Hail: INFO: Coerced sorted dataset\n",
+      "2024-04-09 11:14:07.114 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n",
+      "    'freq_index_dict' -> 'freq_index_dict_1'\n",
+      "    'faf_index_dict' -> 'faf_index_dict_1'\n",
+      "    'rf' -> 'rf_1'\n",
+      "    'age_index_dict' -> 'age_index_dict_1'\n",
+      "    'freq_meta' -> 'freq_meta_1'\n",
+      "    'age_distribution' -> 'age_distribution_1'\n",
+      "    'popmax_index_dict' -> 'popmax_index_dict_1'\n",
+      "2024-04-09 11:14:20.204 Hail: INFO: Coerced sorted dataset\n",
+      "2024-04-09 11:14:32.464 Hail: INFO: Coerced sorted dataset\n",
+      "2024-04-09 11:16:41.133 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n",
+      "    'freq_index_dict' -> 'freq_index_dict_1'\n",
+      "    'faf_index_dict' -> 'faf_index_dict_1'\n",
+      "    'rf' -> 'rf_1'\n",
+      "    'age_index_dict' -> 'age_index_dict_1'\n",
+      "    'freq_meta' -> 'freq_meta_1'\n",
+      "    'age_distribution' -> 'age_distribution_1'\n",
+      "    'popmax_index_dict' -> 'popmax_index_dict_1'\n",
+      "2024-04-09 11:16:53.191 Hail: INFO: Ordering unsorted dataset with network shuffle\n",
+      "2024-04-09 11:17:03.328 Hail: INFO: wrote table with 211068 rows in 9 partitions to /tmp/__iruid_35318-By6CsozcY2JvH6dhwjdBPU\n",
+      "2024-04-09 11:17:10.133 Hail: INFO: Coerced sorted dataset\n",
+      "2024-04-09 11:19:10.141 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n",
+      "    'freq_index_dict' -> 'freq_index_dict_1'\n",
+      "    'faf_index_dict' -> 'faf_index_dict_1'\n",
+      "    'rf' -> 'rf_1'\n",
+      "    'age_index_dict' -> 'age_index_dict_1'\n",
+      "    'freq_meta' -> 'freq_meta_1'\n",
+      "    'age_distribution' -> 'age_distribution_1'\n",
+      "    'popmax_index_dict' -> 'popmax_index_dict_1'\n",
+      "2024-04-09 11:19:21.964 Hail: INFO: Coerced sorted dataset\n",
+      "2024-04-09 11:19:34.636 Hail: INFO: Coerced sorted dataset\n",
+      "2024-04-09 11:21:47.445 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n",
+      "    'freq_index_dict' -> 'freq_index_dict_1'\n",
+      "    'faf_index_dict' -> 'faf_index_dict_1'\n",
+      "    'rf' -> 'rf_1'\n",
+      "    'age_index_dict' -> 'age_index_dict_1'\n",
+      "    'freq_meta' -> 'freq_meta_1'\n",
+      "    'age_distribution' -> 'age_distribution_1'\n",
+      "    'popmax_index_dict' -> 'popmax_index_dict_1'\n",
+      "2024-04-09 11:21:59.127 Hail: INFO: Ordering unsorted dataset with network shuffle\n",
+      "2024-04-09 11:22:15.902 Hail: INFO: Coerced sorted dataset\n",
+      "2024-04-09 11:23:18.294 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n",
+      "    'freq_index_dict' -> 'freq_index_dict_1'\n",
+      "    'faf_index_dict' -> 'faf_index_dict_1'\n",
+      "    'rf' -> 'rf_1'\n",
+      "    'age_index_dict' -> 'age_index_dict_1'\n",
+      "    'freq_meta' -> 'freq_meta_1'\n",
+      "    'age_distribution' -> 'age_distribution_1'\n",
+      "    'popmax_index_dict' -> 'popmax_index_dict_1'\n",
+      "2024-04-09 11:23:32.131 Hail: INFO: Coerced sorted dataset\n",
+      "2024-04-09 11:23:48.719 Hail: INFO: Coerced sorted dataset\n",
+      "2024-04-09 11:26:51.703 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n",
+      "    'freq_index_dict' -> 'freq_index_dict_1'\n",
+      "    'faf_index_dict' -> 'faf_index_dict_1'\n",
+      "    'rf' -> 'rf_1'\n",
+      "    'age_index_dict' -> 'age_index_dict_1'\n",
+      "    'freq_meta' -> 'freq_meta_1'\n",
+      "    'age_distribution' -> 'age_distribution_1'\n",
+      "    'popmax_index_dict' -> 'popmax_index_dict_1'\n",
+      "2024-04-09 11:27:02.820 Hail: INFO: Coerced sorted dataset\n",
+      "2024-04-09 11:27:18.412 Hail: INFO: Coerced sorted dataset\n",
+      "2024-04-09 11:29:02.997 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n",
+      "    'freq_index_dict' -> 'freq_index_dict_1'\n",
+      "    'faf_index_dict' -> 'faf_index_dict_1'\n",
+      "    'rf' -> 'rf_1'\n",
+      "    'age_index_dict' -> 'age_index_dict_1'\n",
+      "    'freq_meta' -> 'freq_meta_1'\n",
+      "    'age_distribution' -> 'age_distribution_1'\n",
+      "    'popmax_index_dict' -> 'popmax_index_dict_1'\n",
+      "2024-04-09 11:29:19.516 Hail: INFO: Coerced sorted dataset\n",
+      "2024-04-09 11:29:41.224 Hail: INFO: Coerced sorted dataset\n",
+      "2024-04-09 11:33:19.553 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n",
+      "    'freq_index_dict' -> 'freq_index_dict_1'\n",
+      "    'faf_index_dict' -> 'faf_index_dict_1'\n",
+      "    'rf' -> 'rf_1'\n",
+      "    'age_index_dict' -> 'age_index_dict_1'\n",
+      "    'freq_meta' -> 'freq_meta_1'\n",
+      "    'age_distribution' -> 'age_distribution_1'\n",
+      "    'popmax_index_dict' -> 'popmax_index_dict_1'\n",
+      "2024-04-09 11:33:32.903 Hail: INFO: Coerced sorted dataset\n",
+      "2024-04-09 11:33:49.144 Hail: INFO: Coerced sorted dataset\n",
+      "2024-04-09 11:36:16.552 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n",
+      "    'freq_index_dict' -> 'freq_index_dict_1'\n",
+      "    'faf_index_dict' -> 'faf_index_dict_1'\n",
+      "    'rf' -> 'rf_1'\n",
+      "    'age_index_dict' -> 'age_index_dict_1'\n",
+      "    'freq_meta' -> 'freq_meta_1'\n",
+      "    'age_distribution' -> 'age_distribution_1'\n",
+      "    'popmax_index_dict' -> 'popmax_index_dict_1'\n",
+      "2024-04-09 11:36:28.952 Hail: INFO: Coerced sorted dataset\n",
+      "2024-04-09 11:36:46.964 Hail: INFO: Coerced sorted dataset\n",
+      "2024-04-09 11:40:40.837 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n",
+      "    'freq_index_dict' -> 'freq_index_dict_1'\n",
+      "    'faf_index_dict' -> 'faf_index_dict_1'\n",
+      "    'rf' -> 'rf_1'\n",
+      "    'age_index_dict' -> 'age_index_dict_1'\n",
+      "    'freq_meta' -> 'freq_meta_1'\n",
+      "    'age_distribution' -> 'age_distribution_1'\n",
+      "    'popmax_index_dict' -> 'popmax_index_dict_1'\n",
+      "2024-04-09 11:40:51.976 Hail: INFO: Coerced sorted dataset\n",
+      "2024-04-09 11:41:04.014 Hail: INFO: Coerced sorted dataset\n",
+      "2024-04-09 11:43:54.259 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n",
+      "    'freq_index_dict' -> 'freq_index_dict_1'\n",
+      "    'faf_index_dict' -> 'faf_index_dict_1'\n",
+      "    'rf' -> 'rf_1'\n",
+      "    'age_index_dict' -> 'age_index_dict_1'\n",
+      "    'freq_meta' -> 'freq_meta_1'\n",
+      "    'age_distribution' -> 'age_distribution_1'\n",
+      "    'popmax_index_dict' -> 'popmax_index_dict_1'\n",
+      "2024-04-09 11:44:05.534 Hail: INFO: Coerced sorted dataset\n",
+      "2024-04-09 11:44:20.087 Hail: INFO: Coerced sorted dataset\n",
+      "2024-04-09 11:46:45.605 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n",
+      "    'freq_index_dict' -> 'freq_index_dict_1'\n",
+      "    'faf_index_dict' -> 'faf_index_dict_1'\n",
+      "    'rf' -> 'rf_1'\n",
+      "    'age_index_dict' -> 'age_index_dict_1'\n",
+      "    'freq_meta' -> 'freq_meta_1'\n",
+      "    'age_distribution' -> 'age_distribution_1'\n",
+      "    'popmax_index_dict' -> 'popmax_index_dict_1'\n",
+      "2024-04-09 11:46:59.301 Hail: INFO: Coerced sorted dataset\n",
+      "2024-04-09 11:47:13.181 Hail: INFO: Coerced sorted dataset\n",
+      "2024-04-09 11:49:50.219 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n",
+      "    'freq_index_dict' -> 'freq_index_dict_1'\n",
+      "    'faf_index_dict' -> 'faf_index_dict_1'\n",
+      "    'rf' -> 'rf_1'\n",
+      "    'age_index_dict' -> 'age_index_dict_1'\n",
+      "    'freq_meta' -> 'freq_meta_1'\n",
+      "    'age_distribution' -> 'age_distribution_1'\n",
+      "    'popmax_index_dict' -> 'popmax_index_dict_1'\n",
+      "2024-04-09 11:50:02.311 Hail: INFO: Coerced sorted dataset\n",
+      "2024-04-09 11:50:16.072 Hail: INFO: Coerced sorted dataset\n",
+      "2024-04-09 11:52:34.864 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n",
+      "    'freq_index_dict' -> 'freq_index_dict_1'\n",
+      "    'faf_index_dict' -> 'faf_index_dict_1'\n",
+      "    'rf' -> 'rf_1'\n",
+      "    'age_index_dict' -> 'age_index_dict_1'\n",
+      "    'freq_meta' -> 'freq_meta_1'\n",
+      "    'age_distribution' -> 'age_distribution_1'\n",
+      "    'popmax_index_dict' -> 'popmax_index_dict_1'\n",
+      "2024-04-09 11:52:46.513 Hail: INFO: Coerced sorted dataset\n",
+      "2024-04-09 11:53:00.919 Hail: INFO: Coerced sorted dataset\n",
+      "2024-04-09 11:55:50.580 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n",
+      "    'freq_index_dict' -> 'freq_index_dict_1'\n",
+      "    'faf_index_dict' -> 'faf_index_dict_1'\n",
+      "    'rf' -> 'rf_1'\n",
+      "    'age_index_dict' -> 'age_index_dict_1'\n",
+      "    'freq_meta' -> 'freq_meta_1'\n",
+      "    'age_distribution' -> 'age_distribution_1'\n",
+      "    'popmax_index_dict' -> 'popmax_index_dict_1'\n",
+      "2024-04-09 11:56:02.124 Hail: INFO: Coerced sorted dataset\n",
+      "2024-04-09 11:56:16.907 Hail: INFO: Coerced sorted dataset\n",
+      "2024-04-09 11:59:15.457 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n",
+      "    'freq_index_dict' -> 'freq_index_dict_1'\n",
+      "    'faf_index_dict' -> 'faf_index_dict_1'\n",
+      "    'rf' -> 'rf_1'\n",
+      "    'age_index_dict' -> 'age_index_dict_1'\n",
+      "    'freq_meta' -> 'freq_meta_1'\n",
+      "    'age_distribution' -> 'age_distribution_1'\n",
+      "    'popmax_index_dict' -> 'popmax_index_dict_1'\n",
+      "2024-04-09 11:59:27.380 Hail: INFO: Coerced sorted dataset\n",
+      "2024-04-09 11:59:40.184 Hail: INFO: Coerced sorted dataset\n",
+      "2024-04-09 12:03:31.055 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n",
+      "    'freq_index_dict' -> 'freq_index_dict_1'\n",
+      "    'faf_index_dict' -> 'faf_index_dict_1'\n",
+      "    'rf' -> 'rf_1'\n",
+      "    'age_index_dict' -> 'age_index_dict_1'\n",
+      "    'freq_meta' -> 'freq_meta_1'\n",
+      "    'age_distribution' -> 'age_distribution_1'\n",
+      "    'popmax_index_dict' -> 'popmax_index_dict_1'\n",
+      "2024-04-09 12:03:42.554 Hail: INFO: Coerced sorted dataset\n",
+      "2024-04-09 12:03:53.915 Hail: INFO: Coerced sorted dataset\n",
+      "2024-04-09 12:06:33.123 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n",
+      "    'freq_index_dict' -> 'freq_index_dict_1'\n",
+      "    'faf_index_dict' -> 'faf_index_dict_1'\n",
+      "    'rf' -> 'rf_1'\n",
+      "    'age_index_dict' -> 'age_index_dict_1'\n",
+      "    'freq_meta' -> 'freq_meta_1'\n",
+      "    'age_distribution' -> 'age_distribution_1'\n",
+      "    'popmax_index_dict' -> 'popmax_index_dict_1'\n",
+      "2024-04-09 12:06:45.813 Hail: INFO: Coerced sorted dataset\n",
+      "2024-04-09 12:06:58.340 Hail: INFO: Coerced sorted dataset\n",
+      "2024-04-09 12:09:23.153 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n",
+      "    'freq_index_dict' -> 'freq_index_dict_1'\n",
+      "    'faf_index_dict' -> 'faf_index_dict_1'\n",
+      "    'rf' -> 'rf_1'\n",
+      "    'age_index_dict' -> 'age_index_dict_1'\n",
+      "    'freq_meta' -> 'freq_meta_1'\n",
+      "    'age_distribution' -> 'age_distribution_1'\n",
+      "    'popmax_index_dict' -> 'popmax_index_dict_1'\n",
+      "2024-04-09 12:09:33.531 Hail: INFO: Coerced sorted dataset\n",
+      "2024-04-09 12:09:43.693 Hail: INFO: Coerced sorted dataset\n",
+      "2024-04-09 12:11:13.739 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n",
+      "    'freq_index_dict' -> 'freq_index_dict_1'\n",
+      "    'faf_index_dict' -> 'faf_index_dict_1'\n",
+      "    'rf' -> 'rf_1'\n",
+      "    'age_index_dict' -> 'age_index_dict_1'\n",
+      "    'freq_meta' -> 'freq_meta_1'\n",
+      "    'age_distribution' -> 'age_distribution_1'\n",
+      "    'popmax_index_dict' -> 'popmax_index_dict_1'\n",
+      "2024-04-09 12:11:26.087 Hail: INFO: Ordering unsorted dataset with network shuffle\n",
+      "2024-04-09 12:11:38.950 Hail: INFO: Coerced sorted dataset\n",
+      "2024-04-09 12:13:17.020 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n",
+      "    'freq_index_dict' -> 'freq_index_dict_1'\n",
+      "    'faf_index_dict' -> 'faf_index_dict_1'\n",
+      "    'rf' -> 'rf_1'\n",
+      "    'age_index_dict' -> 'age_index_dict_1'\n",
+      "    'freq_meta' -> 'freq_meta_1'\n",
+      "    'age_distribution' -> 'age_distribution_1'\n",
+      "    'popmax_index_dict' -> 'popmax_index_dict_1'\n",
+      "2024-04-09 12:13:29.727 Hail: INFO: Coerced sorted dataset\n",
+      "2024-04-09 12:13:45.213 Hail: INFO: Coerced sorted dataset\n",
+      "2024-04-09 12:20:03.844 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n",
+      "    'freq_index_dict' -> 'freq_index_dict_1'\n",
+      "    'faf_index_dict' -> 'faf_index_dict_1'\n",
+      "    'rf' -> 'rf_1'\n",
+      "    'age_index_dict' -> 'age_index_dict_1'\n",
+      "    'freq_meta' -> 'freq_meta_1'\n",
+      "    'age_distribution' -> 'age_distribution_1'\n",
+      "    'popmax_index_dict' -> 'popmax_index_dict_1'\n",
+      "2024-04-09 12:20:13.834 Hail: INFO: Coerced sorted dataset\n",
+      "2024-04-09 12:20:24.282 Hail: INFO: Coerced sorted dataset\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Region:  1:160185602-162185602 ; number of CSs:  1 ; log:\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/var/folders/p5/4t9crp1563l792qz8xz_3x5h0000gq/T/ipykernel_46905/1319014212.py:29: FutureWarning:\n",
+      "\n",
+      "The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
+      "\n",
+      "                                                                                \r"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Region:  1:206577223-208577223 ; number of CSs:  1 ; log:\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "                                                                                \r"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Region:  10:10678309-12678309 ; number of CSs:  1 ; log:\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "                                                                                \r"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Region:  10:58886075-60886075 ; number of CSs:  1 ; log:\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "                                                                                \r"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Region:  10:79520381-81520381 ; number of CSs:  1 ; log:\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "                                                                                \r"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Region:  11:120564878-122564878 ; number of CSs:  2 ; log:\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "                                                                                \r"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Region:  11:46370397-48370397 ; number of CSs:  6 ; log:\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "                                                                                \r"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Region:  11:59328267-61328267 ; number of CSs:  1 ; log:\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "                                                                                \r"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Region:  11:85156833-87156833 ; number of CSs:  1 ; log:\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "                                                                                \r"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Region:  14:51924962-53924962 ; number of CSs:  1 ; log:\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "                                                                                \r"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Region:  14:91472511-93472511 ; number of CSs:  2 ; log:\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "                                                                                \r"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Region:  15:49707194-51707194 ; number of CSs:  1 ; log:\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "                                                                                \r"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Region:  15:57730416-59730416 ; number of CSs:  1 ; log:\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "                                                                                \r"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Region:  15:62277703-64277703 ; number of CSs:  1 ; log:\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "                                                                                \r"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Region:  16:30115000-32115000 ; number of CSs:  1 ; log:\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "                                                                                \r"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Region:  17:4229833-6229833 ; number of CSs:  1 ; log:\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "                                                                                \r"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Region:  17:62483402-64483402 ; number of CSs:  1 ; log:\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "                                                                                \r"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Region:  19:50875-2050875 ; number of CSs:  2 ; log:\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "                                                                                \r"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Region:  19:43892009-45892009 ; number of CSs:  10 ; log:\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "                                                                                \r"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Region:  19:50224706-52224706 ; number of CSs:  1 ; log:\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "                                                                                \r"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Region:  2:104749599-106749599 ; number of CSs:  1 ; log:\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "                                                                                \r"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Region:  2:126135234-128135234 ; number of CSs:  2 ; log:\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "                                                                                \r"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Region:  2:232117202-234117202 ; number of CSs:  1 ; log:\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "                                                                                \r"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Region:  2:64381229-66381229 ; number of CSs:  1 ; log:\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "                                                                                \r"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Region:  20:55423488-57423488 ; number of CSs:  1 ; log:\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[Stage 1199:=====================>                                  (3 + 5) / 8]\r"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Region:  21:25775872-27775872 ; number of CSs:  2 ; log:\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "                                                                                \r"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Region:  4:10025995-12025995 ; number of CSs:  1 ; log:\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "                                                                                \r"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Region:  6:39974457-41974457 ; number of CSs:  2 ; log:\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "                                                                                \r"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Region:  6:46627419-48627419 ; number of CSs:  1 ; log:\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[Stage 1403:===================================>                    (5 + 3) / 8]\r"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Region:  7:142410495-144410495 ; number of CSs:  2 ; log:\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "                                                                                \r"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Region:  7:99374211-101374211 ; number of CSs:  1 ; log:\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[Stage 1505:>                                                       (0 + 8) / 8]\r"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Region:  8:26610986-28610986 ; number of CSs:  3 ; log:\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "                                                                                \r"
+     ]
+    }
+   ],
+   "source": [
+    "df = slt_df.withColumn(\"row_index\", f.monotonically_increasing_id())\n",
+    "\n",
+    "columns = ['N_gwas', 'N_ld', 'N_overlap', 'N_outliers', 'N_imputed', 'N_final_to_fm', 'eleapsed_time']\n",
+    "logs = pd.DataFrame(columns=columns)\n",
+    "\n",
+    "for i in range(0,df.count()):\n",
+    "    if i!=27:\n",
+    "        one_row = df.filter(df.row_index == i).first()\n",
+    "\n",
+    "        res=SusieFineMapperStep.susie_finemapper_one_studylocus_row_v2_dev(\n",
+    "            GWAS=gwas1,\n",
+    "            session=session,\n",
+    "            study_locus_row=one_row,\n",
+    "            study_index=study_index,\n",
+    "            window= 2_000_000,\n",
+    "            L=10,\n",
+    "            susie_est_tausq=False,\n",
+    "            run_carma=False,\n",
+    "            run_sumstat_imputation=False,\n",
+    "            carma_time_limit=600,\n",
+    "            imputed_r2_threshold=0.8,\n",
+    "            ld_score_threshold=4\n",
+    "        )\n",
+    "\n",
+    "        sl=res[\"study_locus\"]\n",
+    "        #print(sl._df.withColumn(\"size\", f.size(sl._df[\"locus\"])).show())\n",
+    "        print(\"Region: \",sl._df.collect()[0]['region'], \"; number of CSs: \",sl._df.count(), \"; log:\")\n",
+    "        #print(res[\"log\"])\n",
+    "        logs=pd.concat([logs,res[\"log\"]])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  N_gwas   N_ld N_overlap N_outliers N_imputed N_final_to_fm  eleapsed_time\n",
+      "0   7120  10431      6456          0         0          6456      56.839336\n",
+      "0   7128   8657      5769          0         0          5769      46.149004\n",
+      "0   9203  12106      7930          0         0          7930      93.531924\n",
+      "0   8351  10014      6995          0         0          6995      74.174323\n",
+      "0   9388  12551      8337          0         0          8337     120.602071\n",
+      "0   6560   8729      5758          0         0          5758      45.064894\n",
+      "0   5005   7701      3954          0         0          3954      55.229344\n",
+      "0   7012   8940      5815          0         0          5815      38.824251\n",
+      "0   8661  10303      7291          0         0          7291      68.802810\n",
+      "0   8081   9966      6771          0         0          6771      64.327746\n",
+      "0   8375  11213      7467          0         0          7467     141.808555\n",
+      "0   7377   9622      6369          0         0          6369      51.198955\n",
+      "0   8181  10864      7116          0         0          7116      49.033224\n",
+      "0   7976  10135      6704          0         0          6704      58.357743\n",
+      "0   3369   6542      2972          0         0          2972      17.138722\n",
+      "0   9006  12776      7969          0         0          7969      83.551872\n",
+      "0   4565   7018      3887          0         0          3887      37.801989\n",
+      "0   8278  13741      7852          0         0          7852     105.623508\n",
+      "0   7582  10448      6100          0         0          6100      58.572944\n",
+      "0   9145  12706      8242          0         0          8242     129.442009\n",
+      "0   8795  11311      7584          0         0          7584      86.336201\n",
+      "0   7852  10028      7041          0         0          7041      67.886754\n",
+      "0   8393  10850      7195          0         0          7195      72.375809\n",
+      "0   7639  10031      6520          0         0          6520      61.900982\n",
+      "0   8899  11509      7922          0         0          7922      86.535298\n",
+      "0   8908  11309      7889          0         0          7889      93.595320\n",
+      "0  10654  12663      8990          0         0          8990     133.390712\n",
+      "0   9073  10228      7398          0         0          7398      79.774280\n",
+      "0   8033   9785      6822          0         0          6822      63.950340\n",
+      "0   4570   5516      3162          0         0          3162      27.943313\n",
+      "0   5716   8785      4760          0         0          4760      30.024706\n",
+      "0   9243  10989      7869          0         0          7869     108.898056\n"
+     ]
+    }
+   ],
+   "source": [
+    "pd.set_option('display.max_rows', None)\n",
+    "print(logs)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "6653.3125\n"
+     ]
+    }
+   ],
+   "source": [
+    "summary = logs['N_overlap'].mean()\n",
+    "print(summary)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Fine-mapping of APOE locus"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "                                                                                \r"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "Row(studyId='GCST90012877', variantId='19_44892009_G_A', chromosome='19', position=44892009, beta=0.352722374032, sampleSize=None, pValueMantissa=1.9950000047683716, pValueExponent=-277, effectAlleleFrequencyFromSource=0.6050670146942139, standardError=0.00991069396551, studyLocusId=6814727764900576662, qualityControls=[], row_index=18)"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df = slt_df.withColumn(\"row_index\", f.monotonically_increasing_id())\n",
+    "one_row = df.filter(df.row_index == 18).first()\n",
+    "one_row"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Without CARMA, without imputation"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "                                                                                \r"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2024-04-08 21:34:03.208 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n",
+      "    'freq_index_dict' -> 'freq_index_dict_1'\n",
+      "    'popmax_index_dict' -> 'popmax_index_dict_1'\n",
+      "    'age_distribution' -> 'age_distribution_1'\n",
+      "    'freq_meta' -> 'freq_meta_1'\n",
+      "    'rf' -> 'rf_1'\n",
+      "    'age_index_dict' -> 'age_index_dict_1'\n",
+      "    'faf_index_dict' -> 'faf_index_dict_1'\n",
+      "2024-04-08 21:34:19.253 Hail: INFO: Coerced sorted dataset\n",
+      "2024-04-08 21:34:34.941 Hail: INFO: Coerced sorted dataset\n",
+      "2024-04-08 21:37:16.576 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n",
+      "    'freq_index_dict' -> 'freq_index_dict_1'\n",
+      "    'popmax_index_dict' -> 'popmax_index_dict_1'\n",
+      "    'age_distribution' -> 'age_distribution_1'\n",
+      "    'freq_meta' -> 'freq_meta_1'\n",
+      "    'rf' -> 'rf_1'\n",
+      "    'age_index_dict' -> 'age_index_dict_1'\n",
+      "    'faf_index_dict' -> 'faf_index_dict_1'\n",
+      "2024-04-08 21:37:28.867 Hail: INFO: Coerced sorted dataset\n",
+      "2024-04-08 21:37:44.733 Hail: INFO: Coerced sorted dataset\n",
+      "2024-04-08 21:52:03.198 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n",
+      "    'freq_index_dict' -> 'freq_index_dict_1'\n",
+      "    'popmax_index_dict' -> 'popmax_index_dict_1'\n",
+      "    'age_distribution' -> 'age_distribution_1'\n",
+      "    'freq_meta' -> 'freq_meta_1'\n",
+      "    'rf' -> 'rf_1'\n",
+      "    'age_index_dict' -> 'age_index_dict_1'\n",
+      "    'faf_index_dict' -> 'faf_index_dict_1'\n",
+      "2024-04-08 21:52:15.100 Hail: INFO: Coerced sorted dataset\n",
+      "2024-04-08 21:52:30.553 Hail: INFO: Coerced sorted dataset\n",
+      "2024-04-08 22:21:27.877 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n",
+      "    'freq_index_dict' -> 'freq_index_dict_1'\n",
+      "    'popmax_index_dict' -> 'popmax_index_dict_1'\n",
+      "    'age_distribution' -> 'age_distribution_1'\n",
+      "    'freq_meta' -> 'freq_meta_1'\n",
+      "    'rf' -> 'rf_1'\n",
+      "    'age_index_dict' -> 'age_index_dict_1'\n",
+      "    'faf_index_dict' -> 'faf_index_dict_1'\n",
+      "2024-04-08 22:21:40.137 Hail: INFO: Coerced sorted dataset\n",
+      "2024-04-08 22:21:55.249 Hail: INFO: Coerced sorted dataset\n",
+      "2024-04-08 22:23:21.795 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n",
+      "    'freq_index_dict' -> 'freq_index_dict_1'\n",
+      "    'popmax_index_dict' -> 'popmax_index_dict_1'\n",
+      "    'age_distribution' -> 'age_distribution_1'\n",
+      "    'freq_meta' -> 'freq_meta_1'\n",
+      "    'rf' -> 'rf_1'\n",
+      "    'age_index_dict' -> 'age_index_dict_1'\n",
+      "    'faf_index_dict' -> 'faf_index_dict_1'\n",
+      "2024-04-08 22:23:35.031 Hail: INFO: Coerced sorted dataset\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "+--------------------+------------+--------------------+----------------+--------------------+---------------+----------+--------+-----------------+------------------+----+\n",
+      "|        studyLocusId|     studyId|              region|credibleSetIndex|               locus|      variantId|chromosome|position|finemappingMethod|credibleSetlog10BF|size|\n",
+      "+--------------------+------------+--------------------+----------------+--------------------+---------------+----------+--------+-----------------+------------------+----+\n",
+      "|-6417720984991662128|GCST90012877|19:43892009-45892009|               1|[{19_44908684_T_C...|19_44908684_T_C|        19|44908684|        SuSiE-inf| 2135.710824756712|   1|\n",
+      "|-1158278093713046158|GCST90012877|19:43892009-45892009|               2|[{19_44921094_A_T...|19_44921094_A_T|        19|44921094|        SuSiE-inf| 955.4948390766739|   1|\n",
+      "| 8324745608044585165|GCST90012877|19:43892009-45892009|               3|[{19_44917947_C_T...|19_44917947_C_T|        19|44917947|        SuSiE-inf| 690.0307437138443|   1|\n",
+      "|-1158278093713046158|GCST90012877|19:43892009-45892009|               4|[{19_44921094_A_T...|19_44921094_A_T|        19|44921094|        SuSiE-inf|425.33378303492805|   1|\n",
+      "| 2240477384494621278|GCST90012877|19:43892009-45892009|               5|[{19_44891079_T_C...|19_44891079_T_C|        19|44891079|        SuSiE-inf|395.31055398960274|   1|\n",
+      "| 1029535804909934921|GCST90012877|19:43892009-45892009|               6|[{19_44894695_T_C...|19_44894695_T_C|        19|44894695|        SuSiE-inf| 333.9497424582455|   1|\n",
+      "|-6417720984991662128|GCST90012877|19:43892009-45892009|               7|[{19_44908684_T_C...|19_44908684_T_C|        19|44908684|        SuSiE-inf|  261.573648706883|   1|\n",
+      "|-1158278093713046158|GCST90012877|19:43892009-45892009|               8|[{19_44921094_A_T...|19_44921094_A_T|        19|44921094|        SuSiE-inf|186.66554412409607|   1|\n",
+      "|-7370952295217410456|GCST90012877|19:43892009-45892009|               9|[{19_44922505_T_G...|19_44922505_T_G|        19|44922505|        SuSiE-inf| 78.06352464083552|   1|\n",
+      "| 3925446284512644964|GCST90012877|19:43892009-45892009|              10|[{19_44913574_T_G...|19_44913574_T_G|        19|44913574|        SuSiE-inf|55.346197523194675|   1|\n",
+      "+--------------------+------------+--------------------+----------------+--------------------+---------------+----------+--------+-----------------+------------------+----+\n",
+      "\n",
+      "None\n",
+      "   N_gwas   N_ld  N_overlap  N_outliers  N_imputed  N_final_to_fm  eleapsed_time\n",
+      "0    7582  10448       6100           0          0           6100      66.112839\n"
+     ]
+    }
+   ],
+   "source": [
+    "res=SusieFineMapperStep.susie_finemapper_one_studylocus_row_v2_dev(\n",
+    "    GWAS=gwas1,\n",
+    "    session=session,\n",
+    "    study_locus_row=one_row,\n",
+    "    study_index=study_index,\n",
+    "    window= 2_000_000,\n",
+    "    L=10,\n",
+    "    susie_est_tausq=False,\n",
+    "    run_carma=False,\n",
+    "    run_sumstat_imputation=False,\n",
+    "    carma_time_limit=1000,\n",
+    "    imputed_r2_threshold=0.8,\n",
+    "    ld_score_threshold=4\n",
+    ")\n",
+    "sl=res[\"study_locus\"]\n",
+    "print(sl._df.withColumn(\"size\", f.size(sl._df[\"locus\"])).show())\n",
+    "print(res[\"log\"])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### With CARMA, without imputation"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "                                                                                \r"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "+--------------------+------------+--------------------+----------------+--------------------+---------------+----------+--------+-----------------+------------------+----+\n",
+      "|        studyLocusId|     studyId|              region|credibleSetIndex|               locus|      variantId|chromosome|position|finemappingMethod|credibleSetlog10BF|size|\n",
+      "+--------------------+------------+--------------------+----------------+--------------------+---------------+----------+--------+-----------------+------------------+----+\n",
+      "|-6417720984991662128|GCST90012877|19:43892009-45892009|               1|[{19_44908684_T_C...|19_44908684_T_C|        19|44908684|        SuSiE-inf|1995.6574121818223|   1|\n",
+      "|-1158278093713046158|GCST90012877|19:43892009-45892009|               2|[{19_44921094_A_T...|19_44921094_A_T|        19|44921094|        SuSiE-inf| 721.2637360279233|   1|\n",
+      "| 7760477027903907683|GCST90012877|19:43892009-45892009|               3|[{19_44911142_C_A...|19_44911142_C_A|        19|44911142|        SuSiE-inf|248.39159334060017|   1|\n",
+      "|-1172224975892516254|GCST90012877|19:43892009-45892009|               4|[{19_44894255_A_C...|19_44894255_A_C|        19|44894255|        SuSiE-inf| 96.16160678286879|   1|\n",
+      "| 8852802213660052283|GCST90012877|19:43892009-45892009|               5|[{19_44862190_G_A...|19_44862190_G_A|        19|44862190|        SuSiE-inf| 55.80518621838019|   1|\n",
+      "|-1158278093713046158|GCST90012877|19:43892009-45892009|               6|[{19_44921094_A_T...|19_44921094_A_T|        19|44921094|        SuSiE-inf| 53.24772075097935|   1|\n",
+      "|-6417720984991662128|GCST90012877|19:43892009-45892009|               7|[{19_44908684_T_C...|19_44908684_T_C|        19|44908684|        SuSiE-inf| 45.65754067281976|   1|\n",
+      "|-6417720984991662128|GCST90012877|19:43892009-45892009|               8|[{19_44908684_T_C...|19_44908684_T_C|        19|44908684|        SuSiE-inf|  39.3840804563262|   1|\n",
+      "| 6986973025714240626|GCST90012877|19:43892009-45892009|               9|[{19_44873060_C_G...|19_44873060_C_G|        19|44873060|        SuSiE-inf| 38.54912041595975|   1|\n",
+      "| 3640651426400620880|GCST90012877|19:43892009-45892009|              10|[{19_44845920_G_C...|19_44845920_G_C|        19|44845920|        SuSiE-inf|35.378479810047224|   2|\n",
+      "+--------------------+------------+--------------------+----------------+--------------------+---------------+----------+--------+-----------------+------------------+----+\n",
+      "\n",
+      "None\n",
+      "   N_gwas   N_ld  N_overlap  N_outliers  N_imputed  N_final_to_fm  eleapsed_time\n",
+      "0    7582  10448       6100         151          0           5949     783.939477\n"
+     ]
+    }
+   ],
+   "source": [
+    "res=SusieFineMapperStep.susie_finemapper_one_studylocus_row_v2_dev(\n",
+    "    GWAS=gwas1,\n",
+    "    session=session,\n",
+    "    study_locus_row=one_row,\n",
+    "    study_index=study_index,\n",
+    "    window= 2_000_000,\n",
+    "    L=10,\n",
+    "    susie_est_tausq=False,\n",
+    "    run_carma=True,\n",
+    "    run_sumstat_imputation=False,\n",
+    "    carma_time_limit=1000,\n",
+    "    imputed_r2_threshold=0.8,\n",
+    "    ld_score_threshold=4\n",
+    ")\n",
+    "sl=res[\"study_locus\"]\n",
+    "print(sl._df.withColumn(\"size\", f.size(sl._df[\"locus\"])).show())\n",
+    "print(res[\"log\"])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Without CARMA, with imputation"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "                                                                                \r"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2024-04-08 22:25:15.739 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n",
+      "    'age_index_dict' -> 'age_index_dict_1'\n",
+      "    'freq_index_dict' -> 'freq_index_dict_1'\n",
+      "    'faf_index_dict' -> 'faf_index_dict_1'\n",
+      "    'freq_meta' -> 'freq_meta_1'\n",
+      "    'rf' -> 'rf_1'\n",
+      "    'age_distribution' -> 'age_distribution_1'\n",
+      "    'popmax_index_dict' -> 'popmax_index_dict_1'\n",
+      "2024-04-08 22:25:30.625 Hail: INFO: Coerced sorted dataset\n",
+      "2024-04-08 22:25:46.020 Hail: INFO: Coerced sorted dataset\n",
+      "2024-04-08 22:32:35.094 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n",
+      "    'age_index_dict' -> 'age_index_dict_1'\n",
+      "    'freq_index_dict' -> 'freq_index_dict_1'\n",
+      "    'faf_index_dict' -> 'faf_index_dict_1'\n",
+      "    'freq_meta' -> 'freq_meta_1'\n",
+      "    'rf' -> 'rf_1'\n",
+      "    'age_distribution' -> 'age_distribution_1'\n",
+      "    'popmax_index_dict' -> 'popmax_index_dict_1'\n",
+      "2024-04-08 22:32:47.616 Hail: INFO: Coerced sorted dataset\n",
+      "2024-04-08 22:33:02.484 Hail: INFO: Coerced sorted dataset\n",
+      "2024-04-08 22:51:33.149 Hail: INFO: Table.join: renamed the following fields on the right to avoid name conflicts:\n",
+      "    'age_index_dict' -> 'age_index_dict_1'\n",
+      "    'freq_index_dict' -> 'freq_index_dict_1'\n",
+      "    'faf_index_dict' -> 'faf_index_dict_1'\n",
+      "    'freq_meta' -> 'freq_meta_1'\n",
+      "    'rf' -> 'rf_1'\n",
+      "    'age_distribution' -> 'age_distribution_1'\n",
+      "    'popmax_index_dict' -> 'popmax_index_dict_1'\n",
+      "2024-04-08 22:51:45.708 Hail: INFO: Coerced sorted dataset\n",
+      "2024-04-08 22:52:00.731 Hail: INFO: Coerced sorted dataset\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "+--------------------+------------+--------------------+----------------+--------------------+-----------------+----------+--------+-----------------+------------------+----+\n",
+      "|        studyLocusId|     studyId|              region|credibleSetIndex|               locus|        variantId|chromosome|position|finemappingMethod|credibleSetlog10BF|size|\n",
+      "+--------------------+------------+--------------------+----------------+--------------------+-----------------+----------+--------+-----------------+------------------+----+\n",
+      "|-1350283509846281677|GCST90012877|19:43892009-45892009|               1|[{19_44909967_TGG...|19_44909967_TGG_T|        19|44909967|        SuSiE-inf| 2310.665662473933|   1|\n",
+      "|-1158278093713046158|GCST90012877|19:43892009-45892009|               2|[{19_44921094_A_T...|  19_44921094_A_T|        19|44921094|        SuSiE-inf| 903.6138342773536|   1|\n",
+      "| 8324745608044585165|GCST90012877|19:43892009-45892009|               3|[{19_44917947_C_T...|  19_44917947_C_T|        19|44917947|        SuSiE-inf| 700.3080514793324|   1|\n",
+      "|-1158278093713046158|GCST90012877|19:43892009-45892009|               4|[{19_44921094_A_T...|  19_44921094_A_T|        19|44921094|        SuSiE-inf|431.79459690536703|   1|\n",
+      "| 1029535804909934921|GCST90012877|19:43892009-45892009|               5|[{19_44894695_T_C...|  19_44894695_T_C|        19|44894695|        SuSiE-inf|402.50010763388156|   1|\n",
+      "|-1158278093713046158|GCST90012877|19:43892009-45892009|               6|[{19_44921094_A_T...|  19_44921094_A_T|        19|44921094|        SuSiE-inf|225.93101254172214|   1|\n",
+      "|  -60207296485035224|GCST90012877|19:43892009-45892009|               7|[{19_44888997_C_T...|  19_44888997_C_T|        19|44888997|        SuSiE-inf| 191.4947272198485|   1|\n",
+      "|-1350283509846281677|GCST90012877|19:43892009-45892009|               8|[{19_44909967_TGG...|19_44909967_TGG_T|        19|44909967|        SuSiE-inf|105.04460057482835|   1|\n",
+      "|-4078755027603845519|GCST90012877|19:43892009-45892009|               9|[{19_44918393_G_A...|  19_44918393_G_A|        19|44918393|        SuSiE-inf| 63.30243818120949|   1|\n",
+      "| 3925446284512644964|GCST90012877|19:43892009-45892009|              10|[{19_44913574_T_G...|  19_44913574_T_G|        19|44913574|        SuSiE-inf|54.079307276192694|   1|\n",
+      "+--------------------+------------+--------------------+----------------+--------------------+-----------------+----------+--------+-----------------+------------------+----+\n",
+      "\n",
+      "None\n",
+      "   N_gwas   N_ld  N_overlap  N_outliers  N_imputed  N_final_to_fm  eleapsed_time\n",
+      "0    7582  10448       6100           0        681           6781     334.328722\n"
+     ]
+    }
+   ],
+   "source": [
+    "res=SusieFineMapperStep.susie_finemapper_one_studylocus_row_v2_dev(\n",
+    "    GWAS=gwas1,\n",
+    "    session=session,\n",
+    "    study_locus_row=one_row,\n",
+    "    study_index=study_index,\n",
+    "    window= 2_000_000,\n",
+    "    L=10,\n",
+    "    susie_est_tausq=False,\n",
+    "    run_carma=False,\n",
+    "    run_sumstat_imputation=True,\n",
+    "    carma_time_limit=10000,\n",
+    "    imputed_r2_threshold=0.8,\n",
+    "    ld_score_threshold=4\n",
+    ")\n",
+    "sl=res[\"study_locus\"]\n",
+    "print(sl._df.withColumn(\"size\", f.size(sl._df[\"locus\"])).show())\n",
+    "print(res[\"log\"])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### With CARMA, with imputation"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "                                                                                \r"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "+--------------------+------------+--------------------+----------------+--------------------+---------------+----------+--------+-----------------+------------------+----+\n",
+      "|        studyLocusId|     studyId|              region|credibleSetIndex|               locus|      variantId|chromosome|position|finemappingMethod|credibleSetlog10BF|size|\n",
+      "+--------------------+------------+--------------------+----------------+--------------------+---------------+----------+--------+-----------------+------------------+----+\n",
+      "| 3030414938485808431|GCST90012877|19:43892009-45892009|               1|[{19_44895007_C_T...|19_44895007_C_T|        19|44895007|        SuSiE-inf|2680.9099711333456|   1|\n",
+      "|-2201142982564351776|GCST90012877|19:43892009-45892009|               2|[{19_44900601_A_G...|19_44900601_A_G|        19|44900601|        SuSiE-inf| 2103.873956796136|   1|\n",
+      "|-6417720984991662128|GCST90012877|19:43892009-45892009|               3|[{19_44908684_T_C...|19_44908684_T_C|        19|44908684|        SuSiE-inf|1968.8126348567705|   1|\n",
+      "|-1158278093713046158|GCST90012877|19:43892009-45892009|               4|[{19_44921094_A_T...|19_44921094_A_T|        19|44921094|        SuSiE-inf|1089.9033376410644|   1|\n",
+      "| 7760477027903907683|GCST90012877|19:43892009-45892009|               5|[{19_44911142_C_A...|19_44911142_C_A|        19|44911142|        SuSiE-inf|188.55568384844716|   1|\n",
+      "|-6417720984991662128|GCST90012877|19:43892009-45892009|               6|[{19_44908684_T_C...|19_44908684_T_C|        19|44908684|        SuSiE-inf| 83.57344085238768|   1|\n",
+      "|-1158278093713046158|GCST90012877|19:43892009-45892009|               7|[{19_44921094_A_T...|19_44921094_A_T|        19|44921094|        SuSiE-inf| 82.01732099119907|   1|\n",
+      "| 8852802213660052283|GCST90012877|19:43892009-45892009|               8|[{19_44862190_G_A...|19_44862190_G_A|        19|44862190|        SuSiE-inf| 45.92126992319222|   1|\n",
+      "|-1611304699666037367|GCST90012877|19:43892009-45892009|               9|[{19_44821259_C_T...|19_44821259_C_T|        19|44821259|        SuSiE-inf|37.363613067645254|   1|\n",
+      "| 3556335645959991344|GCST90012877|19:43892009-45892009|              10|[{19_45017701_G_T...|19_45017701_G_T|        19|45017701|        SuSiE-inf|30.736039473626658|   4|\n",
+      "+--------------------+------------+--------------------+----------------+--------------------+---------------+----------+--------+-----------------+------------------+----+\n",
+      "\n",
+      "None\n",
+      "   N_gwas   N_ld  N_overlap  N_outliers  N_imputed  N_final_to_fm  eleapsed_time\n",
+      "0    7582  10448       6100         152        715           6663    1036.467428\n"
+     ]
+    }
+   ],
+   "source": [
+    "res=SusieFineMapperStep.susie_finemapper_one_studylocus_row_v2_dev(\n",
+    "    GWAS=gwas1,\n",
+    "    session=session,\n",
+    "    study_locus_row=one_row,\n",
+    "    study_index=study_index,\n",
+    "    window= 2_000_000,\n",
+    "    L=10,\n",
+    "    susie_est_tausq=False,\n",
+    "    run_carma=True,\n",
+    "    run_sumstat_imputation=True,\n",
+    "    carma_time_limit=10000,\n",
+    "    imputed_r2_threshold=0.8,\n",
+    "    ld_score_threshold=4\n",
+    ")\n",
+    "sl=res[\"study_locus\"]\n",
+    "print(sl._df.withColumn(\"size\", f.size(sl._df[\"locus\"])).show())\n",
+    "print(res[\"log\"])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### With CARMA, with imputation, with estimation of infinitesimal effects (susie_est_tausq=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "                                                                                \r"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "+--------------------+------------+--------------------+----------------+--------------------+---------------+----------+--------+-----------------+------------------+----+\n",
+      "|        studyLocusId|     studyId|              region|credibleSetIndex|               locus|      variantId|chromosome|position|finemappingMethod|credibleSetlog10BF|size|\n",
+      "+--------------------+------------+--------------------+----------------+--------------------+---------------+----------+--------+-----------------+------------------+----+\n",
+      "|-6417720984991662128|GCST90012877|19:43892009-45892009|               1|[{19_44908684_T_C...|19_44908684_T_C|        19|44908684|        SuSiE-inf| 1105.297844890198|   1|\n",
+      "|-1158278093713046158|GCST90012877|19:43892009-45892009|               2|[{19_44921094_A_T...|19_44921094_A_T|        19|44921094|        SuSiE-inf|1042.0949995382389|   1|\n",
+      "|-2201142982564351776|GCST90012877|19:43892009-45892009|               3|[{19_44900601_A_G...|19_44900601_A_G|        19|44900601|        SuSiE-inf| 760.0654878716481|   1|\n",
+      "| 3030414938485808431|GCST90012877|19:43892009-45892009|               4|[{19_44895007_C_T...|19_44895007_C_T|        19|44895007|        SuSiE-inf| 388.8928142354868|   1|\n",
+      "| -251577639520141451|GCST90012877|19:43892009-45892009|               5|[{19_44899220_C_T...|19_44899220_C_T|        19|44899220|        SuSiE-inf| 259.5645544847559|   1|\n",
+      "| 7760477027903907683|GCST90012877|19:43892009-45892009|               6|[{19_44911142_C_A...|19_44911142_C_A|        19|44911142|        SuSiE-inf|231.66277856324325|   1|\n",
+      "| 4133344777320628094|GCST90012877|19:43892009-45892009|               7|[{19_44904531_G_A...|19_44904531_G_A|        19|44904531|        SuSiE-inf|143.22657752219786|   1|\n",
+      "|-1764089385585984368|GCST90012877|19:43892009-45892009|               8|[{19_44893642_T_C...|19_44893642_T_C|        19|44893642|        SuSiE-inf| 87.72507299242906|   1|\n",
+      "|-1158278093713046158|GCST90012877|19:43892009-45892009|               9|[{19_44921094_A_T...|19_44921094_A_T|        19|44921094|        SuSiE-inf|  71.4171763690986|   1|\n",
+      "|-6417720984991662128|GCST90012877|19:43892009-45892009|              10|[{19_44908684_T_C...|19_44908684_T_C|        19|44908684|        SuSiE-inf| 43.36071977593145|   1|\n",
+      "+--------------------+------------+--------------------+----------------+--------------------+---------------+----------+--------+-----------------+------------------+----+\n",
+      "\n",
+      "None\n",
+      "   N_gwas   N_ld  N_overlap  N_outliers  N_imputed  N_final_to_fm  eleapsed_time\n",
+      "0    7582  10448       6100         151        720           6669    1229.515921\n"
+     ]
+    }
+   ],
+   "source": [
+    "res=SusieFineMapperStep.susie_finemapper_one_studylocus_row_v2_dev(\n",
+    "    GWAS=gwas1,\n",
+    "    session=session,\n",
+    "    study_locus_row=one_row,\n",
+    "    study_index=study_index,\n",
+    "    window= 2_000_000,\n",
+    "    L=10,\n",
+    "    susie_est_tausq=True,\n",
+    "    run_carma=True,\n",
+    "    run_sumstat_imputation=True,\n",
+    "    carma_time_limit=10000,\n",
+    "    imputed_r2_threshold=0.8,\n",
+    "    ld_score_threshold=4\n",
+    ")\n",
+    "sl=res[\"study_locus\"]\n",
+    "print(sl._df.withColumn(\"size\", f.size(sl._df[\"locus\"])).show())\n",
+    "print(res[\"log\"])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Fine-mapping of MHC region using 1Mb window"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "                                                                                \r"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "Row(studyId='GCST90012877', variantId='6_32592248_A_G', chromosome='6', position=32592248, beta=-0.103604380043, sampleSize=None, pValueMantissa=2.877000093460083, pValueExponent=-15, effectAlleleFrequencyFromSource=0.21086899936199188, standardError=0.0131209374957, studyLocusId=5718491981995302674, qualityControls=[], row_index=27)"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df = slt_df.withColumn(\"row_index\", f.monotonically_increasing_id())\n",
+    "one_row = df.filter(df.row_index == 27).first()\n",
+    "one_row"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[Stage 1541:==========================================>             (6 + 2) / 8]\r"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "+--------------------+------------+-------------------+----------------+--------------------+---------------+----------+--------+-----------------+------------------+----+\n",
+      "|        studyLocusId|     studyId|             region|credibleSetIndex|               locus|      variantId|chromosome|position|finemappingMethod|credibleSetlog10BF|size|\n",
+      "+--------------------+------------+-------------------+----------------+--------------------+---------------+----------+--------+-----------------+------------------+----+\n",
+      "|-3446214959021623473|GCST90012877|6:32092248-33092248|               1|[{6_32557997_G_A,...| 6_32557997_G_A|         6|32557997|        SuSiE-inf| 4323.908142062261|   1|\n",
+      "| -439738150050389281|GCST90012877|6:32092248-33092248|               2|[{6_32558002_G_T,...| 6_32558002_G_T|         6|32558002|        SuSiE-inf|3428.8321277074765|   1|\n",
+      "| 5831857384024844796|GCST90012877|6:32092248-33092248|               3|[{6_32557987_C_A,...| 6_32557987_C_A|         6|32557987|        SuSiE-inf|1699.8680349563335|   1|\n",
+      "|-1087057043201011402|GCST90012877|6:32092248-33092248|               4|[{6_32557977_T_C,...| 6_32557977_T_C|         6|32557977|        SuSiE-inf| 965.9753305300063|   1|\n",
+      "| 6919234179916081233|GCST90012877|6:32092248-33092248|               5|[{6_32649735_C_T,...| 6_32649735_C_T|         6|32649735|        SuSiE-inf| 369.9698233117616|   1|\n",
+      "| 7781006900918060896|GCST90012877|6:32092248-33092248|               6|[{6_32652962_C_T,...| 6_32652962_C_T|         6|32652962|        SuSiE-inf| 328.6834447478274|   1|\n",
+      "|-7512794333418509403|GCST90012877|6:32092248-33092248|               7|[{6_32591896_T_G,...| 6_32591896_T_G|         6|32591896|        SuSiE-inf|326.52393082050276|   1|\n",
+      "| 4056478719932360430|GCST90012877|6:32092248-33092248|               8|[{6_32621456_GC_G...|6_32621456_GC_G|         6|32621456|        SuSiE-inf|263.48518383939836|   1|\n",
+      "| 8380896542014789747|GCST90012877|6:32092248-33092248|               9|[{6_32648039_G_A,...| 6_32648039_G_A|         6|32648039|        SuSiE-inf|176.62947310155317|   1|\n",
+      "| 9053545161380162736|GCST90012877|6:32092248-33092248|              10|[{6_32700030_C_T,...| 6_32700030_C_T|         6|32700030|        SuSiE-inf| 69.83226092797517|   2|\n",
+      "+--------------------+------------+-------------------+----------------+--------------------+---------------+----------+--------+-----------------+------------------+----+\n",
+      "\n",
+      "None\n",
+      "   N_gwas   N_ld  N_overlap  N_outliers  N_imputed  N_final_to_fm  eleapsed_time\n",
+      "0   19311  22318      13188           0          0          13188     298.784288\n"
+     ]
+    }
+   ],
+   "source": [
+    "res=SusieFineMapperStep.susie_finemapper_one_studylocus_row_v2_dev(\n",
+    "    GWAS=gwas1,\n",
+    "    session=session,\n",
+    "    study_locus_row=one_row,\n",
+    "    study_index=study_index,\n",
+    "    window= 1_000_000,\n",
+    "    L=10,\n",
+    "    susie_est_tausq=False,\n",
+    "    run_carma=False,\n",
+    "    run_sumstat_imputation=False,\n",
+    "    carma_time_limit=10000,\n",
+    "    imputed_r2_threshold=0.8,\n",
+    "    ld_score_threshold=4\n",
+    ")\n",
+    "sl=res[\"study_locus\"]\n",
+    "print(sl._df.withColumn(\"size\", f.size(sl._df[\"locus\"])).show())\n",
+    "print(res[\"log\"])"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "gentropy-krNFZEZg-py3.10",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

From ecd806331813bb28eada4d4a56c983e5a419b833 Mon Sep 17 00:00:00 2001
From: Daniel-Considine <113430683+Daniel-Considine@users.noreply.github.com>
Date: Wed, 10 Apr 2024 14:41:07 +0100
Subject: [PATCH 04/21] fix: removing all duplicated variants in sumstats for
 finemapping functions (#574)

---
 src/gentropy/susie_finemapper.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/src/gentropy/susie_finemapper.py b/src/gentropy/susie_finemapper.py
index db62fe359..d37298436 100644
--- a/src/gentropy/susie_finemapper.py
+++ b/src/gentropy/susie_finemapper.py
@@ -81,6 +81,10 @@ def susie_finemapper_one_studylocus_row(
             .filter(f.col("studyId") == studyId)
             .filter(f.col("z").isNotNull())
         )
+        # Remove ALL duplicated variants from GWAS DataFrame - we don't know which is correct
+        variant_counts = gwas_df.groupBy("variantId").count()
+        unique_variants = variant_counts.filter(f.col("count") == 1)
+        gwas_df = gwas_df.join(unique_variants, on="variantId", how="left_semi")
 
         ld_index = (
             GnomADLDMatrix()
@@ -320,6 +324,10 @@ def susie_finemapper_ss_gathered(
             .withColumn("position", f.split(f.col("variantId"), "_")[1])
             .filter(f.col("z").isNotNull())
         )
+        # Remove ALL duplicated variants from GWAS DataFrame - we don't know which is correct
+        variant_counts = gwas_df.groupBy("variantId").count()
+        unique_variants = variant_counts.filter(f.col("count") == 1)
+        gwas_df = gwas_df.join(unique_variants, on="variantId", how="left_semi")
 
         ld_index = (
             GnomADLDMatrix()

From 7ed4703fcb3589af82f9fc76425f0691a080dee4 Mon Sep 17 00:00:00 2001
From: Yakov <yt4@sanger.ac.uk>
Date: Sun, 14 Apr 2024 16:42:34 +0100
Subject: [PATCH 05/21] feat: adding notebook for mapping EFOs for the FinnGen
 study index (#575)

---
 notebooks/Mapping_EFO_finngen.ipynb | 768 ++++++++++++++++++++++++++++
 1 file changed, 768 insertions(+)
 create mode 100644 notebooks/Mapping_EFO_finngen.ipynb

diff --git a/notebooks/Mapping_EFO_finngen.ipynb b/notebooks/Mapping_EFO_finngen.ipynb
new file mode 100644
index 000000000..9bd82d8d4
--- /dev/null
+++ b/notebooks/Mapping_EFO_finngen.ipynb
@@ -0,0 +1,768 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Mapping EFOs for the FinnGen study index using the old study index from the previous prod"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This notebook adds EFOs from the previous prod version of study_index to the new FinnGen study_index, using the trait name as the matching key.\n",
+    "\n",
+    "The resulting study index has 1542 rows with non-null EFOs (out of 2408 rows).\n",
+    "\n",
+    "The new study index is saved here:\n",
+    "\"gs://genetics-portal-dev-analysis/yt4/study_index_finngen_with_efo\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 37,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Your browser has been opened to visit:\n",
+      "\n",
+      "    https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=764086051850-6qr4p6gpi6hn506pt8ejuq83di341hur.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A8085%2F&scope=openid+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fuserinfo.email+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcloud-platform+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fsqlservice.login+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Faccounts.reauth&state=XHb8Uk43SsVjvFRqwgrX4Tgg2tTOHS&access_type=offline&code_challenge=OkiqDAkHXDGEgJQbX8r0ZYKfZ7gcgfXS8mfZc5a913Y&code_challenge_method=S256\n",
+      "\n",
+      "\n",
+      "Credentials saved to file: [/Users/yt4/.config/gcloud/application_default_credentials.json]\n",
+      "\n",
+      "These credentials will be used by any library that requests Application Default Credentials (ADC).\n",
+      "\n",
+      "Quota project \"open-targets-genetics-dev\" was added to ADC which can be used by Google client libraries for billing and quota. Note that some services may still bill the project owning the resource.\n"
+     ]
+    }
+   ],
+   "source": [
+    "!gcloud auth application-default login"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<style>\n",
+       "        .bk-notebook-logo {\n",
+       "            display: block;\n",
+       "            width: 20px;\n",
+       "            height: 20px;\n",
+       "            background-image: url(data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABQAAAAUCAYAAACNiR0NAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAABx0RVh0U29mdHdhcmUAQWRvYmUgRmlyZXdvcmtzIENTNui8sowAAAOkSURBVDiNjZRtaJVlGMd/1/08zzln5zjP1LWcU9N0NkN8m2CYjpgQYQXqSs0I84OLIC0hkEKoPtiH3gmKoiJDU7QpLgoLjLIQCpEsNJ1vqUOdO7ppbuec5+V+rj4ctwzd8IIbbi6u+8f1539dt3A78eXC7QizUF7gyV1fD1Yqg4JWz84yffhm0qkFqBogB9rM8tZdtwVsPUhWhGcFJngGeWrPzHm5oaMmkfEg1usvLFyc8jLRqDOMru7AyC8saQr7GG7f5fvDeH7Ej8CM66nIF+8yngt6HWaKh7k49Soy9nXurCi1o3qUbS3zWfrYeQDTB/Qj6kX6Ybhw4B+bOYoLKCC9H3Nu/leUTZ1JdRWkkn2ldcCamzrcf47KKXdAJllSlxAOkRgyHsGC/zRday5Qld9DyoM4/q/rUoy/CXh3jzOu3bHUVZeU+DEn8FInkPBFlu3+nW3Nw0mk6vCDiWg8CeJaxEwuHS3+z5RgY+YBR6V1Z1nxSOfoaPa4LASWxxdNp+VWTk7+4vzaou8v8PN+xo+KY2xsw6une2frhw05CTYOmQvsEhjhWjn0bmXPjpE1+kplmmkP3suftwTubK9Vq22qKmrBhpY4jvd5afdRA3wGjFAgcnTK2s4hY0/GPNIb0nErGMCRxWOOX64Z8RAC4oCXdklmEvcL8o0BfkNK4lUg9HTl+oPlQxdNo3Mg4Nv175e/1LDGzZen30MEjRUtmXSfiTVu1kK8W4txyV6BMKlbgk3lMwYCiusNy9fVfvvwMxv8Ynl6vxoByANLTWplvuj/nF9m2+PDtt1eiHPBr1oIfhCChQMBw6Aw0UulqTKZdfVvfG7VcfIqLG9bcldL/+pdWTLxLUy8Qq38heUIjh4XlzZxzQm19lLFlr8vdQ97rjZVOLf8nclzckbcD4wxXMidpX30sFd37Fv/GtwwhzhxGVAprjbg0gCAEeIgwCZyTV2Z1REEW8O4py0wsjeloKoMr6iCY6dP92H6Vw/oTyICIthibxjm/DfN9lVz8IqtqKYLUXfoKVMVQVVJOElGjrnnUt9T9wbgp8AyYKaGlqingHZU/uG2NTZSVqwHQTWkx9hxjkpWDaCg6Ckj5qebgBVbT3V3NNXMSiWSDdGV3hrtzla7J+duwPOToIg42ChPQOQjspnSlp1V+Gjdged7+8UN5CRAV7a5EdFNwCjEaBR27b3W890TE7g24NAP/mMDXRWrGoFPQI9ls/MWO2dWFAar/xcOIImbbpA3zgAAAABJRU5ErkJggg==);\n",
+       "        }\n",
+       "    </style>\n",
+       "    <div>\n",
+       "        <a href=\"https://bokeh.org\" target=\"_blank\" class=\"bk-notebook-logo\"></a>\n",
+       "        <span id=\"c92e22c1-acc6-4a9b-8a5a-529fec6e60ae\">Loading BokehJS ...</span>\n",
+       "    </div>\n"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/javascript": "(function(root) {\n  function now() {\n    return new Date();\n  }\n\n  const force = true;\n\n  if (typeof root._bokeh_onload_callbacks === \"undefined\" || force === true) {\n    root._bokeh_onload_callbacks = [];\n    root._bokeh_is_loading = undefined;\n  }\n\nconst JS_MIME_TYPE = 'application/javascript';\n  const HTML_MIME_TYPE = 'text/html';\n  const EXEC_MIME_TYPE = 'application/vnd.bokehjs_exec.v0+json';\n  const CLASS_NAME = 'output_bokeh rendered_html';\n\n  /**\n   * Render data to the DOM node\n   */\n  function render(props, node) {\n    const script = document.createElement(\"script\");\n    node.appendChild(script);\n  }\n\n  /**\n   * Handle when an output is cleared or removed\n   */\n  function handleClearOutput(event, handle) {\n    function drop(id) {\n      const view = Bokeh.index.get_by_id(id)\n      if (view != null) {\n        view.model.document.clear()\n        Bokeh.index.delete(view)\n      }\n    }\n\n    const cell = handle.cell;\n\n    const id = cell.output_area._bokeh_element_id;\n    const server_id = cell.output_area._bokeh_server_id;\n\n    // Clean up Bokeh references\n    if (id != null) {\n      drop(id)\n    }\n\n    if (server_id !== undefined) {\n      // Clean up Bokeh references\n      const cmd_clean = \"from bokeh.io.state import curstate; print(curstate().uuid_to_server['\" + server_id + \"'].get_sessions()[0].document.roots[0]._id)\";\n      cell.notebook.kernel.execute(cmd_clean, {\n        iopub: {\n          output: function(msg) {\n            const id = msg.content.text.trim()\n            drop(id)\n          }\n        }\n      });\n      // Destroy server and session\n      const cmd_destroy = \"import bokeh.io.notebook as ion; ion.destroy_server('\" + server_id + \"')\";\n      cell.notebook.kernel.execute(cmd_destroy);\n    }\n  }\n\n  /**\n   * Handle when a new output is added\n   */\n  function handleAddOutput(event, handle) {\n    const output_area = 
handle.output_area;\n    const output = handle.output;\n\n    // limit handleAddOutput to display_data with EXEC_MIME_TYPE content only\n    if ((output.output_type != \"display_data\") || (!Object.prototype.hasOwnProperty.call(output.data, EXEC_MIME_TYPE))) {\n      return\n    }\n\n    const toinsert = output_area.element.find(\".\" + CLASS_NAME.split(' ')[0]);\n\n    if (output.metadata[EXEC_MIME_TYPE][\"id\"] !== undefined) {\n      toinsert[toinsert.length - 1].firstChild.textContent = output.data[JS_MIME_TYPE];\n      // store reference to embed id on output_area\n      output_area._bokeh_element_id = output.metadata[EXEC_MIME_TYPE][\"id\"];\n    }\n    if (output.metadata[EXEC_MIME_TYPE][\"server_id\"] !== undefined) {\n      const bk_div = document.createElement(\"div\");\n      bk_div.innerHTML = output.data[HTML_MIME_TYPE];\n      const script_attrs = bk_div.children[0].attributes;\n      for (let i = 0; i < script_attrs.length; i++) {\n        toinsert[toinsert.length - 1].firstChild.setAttribute(script_attrs[i].name, script_attrs[i].value);\n        toinsert[toinsert.length - 1].firstChild.textContent = bk_div.children[0].textContent\n      }\n      // store reference to server id on output_area\n      output_area._bokeh_server_id = output.metadata[EXEC_MIME_TYPE][\"server_id\"];\n    }\n  }\n\n  function register_renderer(events, OutputArea) {\n\n    function append_mime(data, metadata, element) {\n      // create a DOM node to render to\n      const toinsert = this.create_output_subarea(\n        metadata,\n        CLASS_NAME,\n        EXEC_MIME_TYPE\n      );\n      this.keyboard_manager.register_events(toinsert);\n      // Render to node\n      const props = {data: data, metadata: metadata[EXEC_MIME_TYPE]};\n      render(props, toinsert[toinsert.length - 1]);\n      element.append(toinsert);\n      return toinsert\n    }\n\n    /* Handle when an output is cleared or removed */\n    events.on('clear_output.CodeCell', handleClearOutput);\n    
events.on('delete.Cell', handleClearOutput);\n\n    /* Handle when a new output is added */\n    events.on('output_added.OutputArea', handleAddOutput);\n\n    /**\n     * Register the mime type and append_mime function with output_area\n     */\n    OutputArea.prototype.register_mime_type(EXEC_MIME_TYPE, append_mime, {\n      /* Is output safe? */\n      safe: true,\n      /* Index of renderer in `output_area.display_order` */\n      index: 0\n    });\n  }\n\n  // register the mime type if in Jupyter Notebook environment and previously unregistered\n  if (root.Jupyter !== undefined) {\n    const events = require('base/js/events');\n    const OutputArea = require('notebook/js/outputarea').OutputArea;\n\n    if (OutputArea.prototype.mime_types().indexOf(EXEC_MIME_TYPE) == -1) {\n      register_renderer(events, OutputArea);\n    }\n  }\n  if (typeof (root._bokeh_timeout) === \"undefined\" || force === true) {\n    root._bokeh_timeout = Date.now() + 5000;\n    root._bokeh_failed_load = false;\n  }\n\n  const NB_LOAD_WARNING = {'data': {'text/html':\n     \"<div style='background-color: #fdd'>\\n\"+\n     \"<p>\\n\"+\n     \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n     \"may be due to a slow or bad network connection. 
Possible fixes:\\n\"+\n     \"</p>\\n\"+\n     \"<ul>\\n\"+\n     \"<li>re-rerun `output_notebook()` to attempt to load from CDN again, or</li>\\n\"+\n     \"<li>use INLINE resources instead, as so:</li>\\n\"+\n     \"</ul>\\n\"+\n     \"<code>\\n\"+\n     \"from bokeh.resources import INLINE\\n\"+\n     \"output_notebook(resources=INLINE)\\n\"+\n     \"</code>\\n\"+\n     \"</div>\"}};\n\n  function display_loaded() {\n    const el = document.getElementById(\"c92e22c1-acc6-4a9b-8a5a-529fec6e60ae\");\n    if (el != null) {\n      el.textContent = \"BokehJS is loading...\";\n    }\n    if (root.Bokeh !== undefined) {\n      if (el != null) {\n        el.textContent = \"BokehJS \" + root.Bokeh.version + \" successfully loaded.\";\n      }\n    } else if (Date.now() < root._bokeh_timeout) {\n      setTimeout(display_loaded, 100)\n    }\n  }\n\n  function run_callbacks() {\n    try {\n      root._bokeh_onload_callbacks.forEach(function(callback) {\n        if (callback != null)\n          callback();\n      });\n    } finally {\n      delete root._bokeh_onload_callbacks\n    }\n    console.debug(\"Bokeh: all callbacks have finished\");\n  }\n\n  function load_libs(css_urls, js_urls, callback) {\n    if (css_urls == null) css_urls = [];\n    if (js_urls == null) js_urls = [];\n\n    root._bokeh_onload_callbacks.push(callback);\n    if (root._bokeh_is_loading > 0) {\n      console.debug(\"Bokeh: BokehJS is being loaded, scheduling callback at\", now());\n      return null;\n    }\n    if (js_urls == null || js_urls.length === 0) {\n      run_callbacks();\n      return null;\n    }\n    console.debug(\"Bokeh: BokehJS not loaded, scheduling load and callback at\", now());\n    root._bokeh_is_loading = css_urls.length + js_urls.length;\n\n    function on_load() {\n      root._bokeh_is_loading--;\n      if (root._bokeh_is_loading === 0) {\n        console.debug(\"Bokeh: all BokehJS libraries/stylesheets loaded\");\n        run_callbacks()\n      }\n    }\n\n    function 
on_error(url) {\n      console.error(\"failed to load \" + url);\n    }\n\n    for (let i = 0; i < css_urls.length; i++) {\n      const url = css_urls[i];\n      const element = document.createElement(\"link\");\n      element.onload = on_load;\n      element.onerror = on_error.bind(null, url);\n      element.rel = \"stylesheet\";\n      element.type = \"text/css\";\n      element.href = url;\n      console.debug(\"Bokeh: injecting link tag for BokehJS stylesheet: \", url);\n      document.body.appendChild(element);\n    }\n\n    for (let i = 0; i < js_urls.length; i++) {\n      const url = js_urls[i];\n      const element = document.createElement('script');\n      element.onload = on_load;\n      element.onerror = on_error.bind(null, url);\n      element.async = false;\n      element.src = url;\n      console.debug(\"Bokeh: injecting script tag for BokehJS library: \", url);\n      document.head.appendChild(element);\n    }\n  };\n\n  function inject_raw_css(css) {\n    const element = document.createElement(\"style\");\n    element.appendChild(document.createTextNode(css));\n    document.body.appendChild(element);\n  }\n\n  const js_urls = [\"https://cdn.bokeh.org/bokeh/release/bokeh-3.3.0.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-gl-3.3.0.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-widgets-3.3.0.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-tables-3.3.0.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-mathjax-3.3.0.min.js\"];\n  const css_urls = [];\n\n  const inline_js = [    function(Bokeh) {\n      Bokeh.set_log_level(\"info\");\n    },\nfunction(Bokeh) {\n    }\n  ];\n\n  function run_inline_js() {\n    if (root.Bokeh !== undefined || force === true) {\n          for (let i = 0; i < inline_js.length; i++) {\n      inline_js[i].call(root, root.Bokeh);\n    }\nif (force === true) {\n        display_loaded();\n      }} else if (Date.now() < root._bokeh_timeout) {\n      setTimeout(run_inline_js, 100);\n    } else if 
(!root._bokeh_failed_load) {\n      console.log(\"Bokeh: BokehJS failed to load within specified timeout.\");\n      root._bokeh_failed_load = true;\n    } else if (force !== true) {\n      const cell = $(document.getElementById(\"c92e22c1-acc6-4a9b-8a5a-529fec6e60ae\")).parents('.cell').data().cell;\n      cell.output_area.append_execute_result(NB_LOAD_WARNING)\n    }\n  }\n\n  if (root._bokeh_is_loading === 0) {\n    console.debug(\"Bokeh: BokehJS loaded, going straight to plotting\");\n    run_inline_js();\n  } else {\n    load_libs(css_urls, js_urls, function() {\n      console.debug(\"Bokeh: BokehJS plotting callback run at\", now());\n      run_inline_js();\n    });\n  }\n}(window));",
+      "application/vnd.bokehjs_load.v0+json": ""
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "24/04/14 16:03:28 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Setting default log level to \"WARN\".\n",
+      "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n",
+      "pip-installed Hail requires additional configuration options in Spark referring\n",
+      "  to the path to the Hail Python module directory HAIL_DIR,\n",
+      "  e.g. /path/to/python/site-packages/hail:\n",
+      "    spark.jars=HAIL_DIR/backend/hail-all-spark.jar\n",
+      "    spark.driver.extraClassPath=HAIL_DIR/backend/hail-all-spark.jar\n",
+      "    spark.executor.extraClassPath=./hail-all-spark.jarRunning on Apache Spark version 3.3.4\n",
+      "SparkUI available at http://192.168.0.232:4040\n",
+      "Welcome to\n",
+      "     __  __     <>__\n",
+      "    / /_/ /__  __/ /\n",
+      "   / __  / _ `/ / /\n",
+      "  /_/ /_/\\_,_/_/_/   version 0.2.127-bb535cd096c5\n",
+      "LOGGING: writing to /dev/null\n"
+     ]
+    }
+   ],
+   "source": [
+    "import os\n",
+    "import hail as hl\n",
+    "import pyspark.sql.functions as f\n",
+    "import pandas as pd\n",
+    "pd.set_option('display.max_colwidth', None)\n",
+    "pd.set_option('display.expand_frame_repr', False)\n",
+    "\n",
+    "from gentropy.common.session import Session\n",
+    "from gentropy.dataset.study_index import StudyIndex\n",
+    "\n",
+    "\n",
+    "hail_dir = os.path.dirname(hl.__file__)\n",
+    "session = Session(hail_home=hail_dir, start_hail=True, extended_spark_conf={\"spark.driver.memory\": \"12g\",\n",
+    "    \"spark.kryoserializer.buffer.max\": \"500m\",\"spark.driver.maxResultSize\":\"3g\"})\n",
+    "hl.init(sc=session.spark.sparkContext, log=\"/dev/null\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "path_si=\"gs://genetics_etl_python_playground/releases/24.03/study_index/finngen/study_index\"\n",
+    "path_si_old=\"gs://genetics-portal-dev-analysis/yt4/study_index.csv\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "                                                                                \r"
+     ]
+    }
+   ],
+   "source": [
+    "si_old=session.spark.read.csv(path_si_old, header=True,sep=\"\\t\")\n",
+    "si_new=StudyIndex.from_parquet(session=session, path=path_si)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "+--------------------+-------------------+--------------------+-------+---------+-------------+----+----------+----------+-----------+---------+------------+--------------+-------+--------------------+-----------------+--------------------+\n",
+      "|            study_id|   ancestry_initial|ancestry_replication|n_cases|n_initial|n_replication|pmid|pub_author|  pub_date|pub_journal|pub_title|has_sumstats|num_assoc_loci| source|      trait_reported|       trait_efos|      trait_category|\n",
+      "+--------------------+-------------------+--------------------+-------+---------+-------------+----+----------+----------+-----------+---------+------------+--------------+-------+--------------------+-----------------+--------------------+\n",
+      "|FINNGEN_R6_M13_MU...|['European=253458']|                  []|  108.0|   253458|          0.0|null|FINNGEN_R6|2022-01-24|       null|     null|        True|             0|FINNGEN|Multifocal fibros...|['MONDO_0009230']|immune system dis...|\n",
+      "|FINNGEN_R6_M13_MU...|['European=199528']|                  []| 1804.0|   199528|          0.0|null|FINNGEN_R6|2022-01-24|       null|     null|        True|             0|FINNGEN|Disorders of muscles|  ['EFO_0002970']|musculoskeletal o...|\n",
+      "|FINNGEN_R6_M13_MU...|['European=197821']|                  []|   97.0|   197821|          0.0|null|FINNGEN_R6|2022-01-24|       null|     null|        True|             0|FINNGEN|\"\"\"Muscle wasting...|  ['EFO_0009851']|  biological process|\n",
+      "|FINNGEN_R6_M13_MU...|['European=198253']|                  []|  529.0|   198253|          0.0|null|FINNGEN_R6|2022-01-24|       null|     null|        True|             0|FINNGEN|Other specified d...|  ['EFO_0002970']|musculoskeletal o...|\n",
+      "|FINNGEN_R6_M13_MU...|['European=198179']|                  []|  455.0|   198179|          0.0|null|FINNGEN_R6|2022-01-24|       null|     null|        True|             1|FINNGEN|       Muscle strain|  ['EFO_0010686']|injury, poisoning...|\n",
+      "+--------------------+-------------------+--------------------+-------+---------+-------------+----+----------+----------+-----------+---------+------------+--------------+-------+--------------------+-----------------+--------------------+\n",
+      "only showing top 5 rows\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "si_old.show(5)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "+--------------------+-----------+---------+--------------------+------------------------+------+------------------+--------+----------------+----------------------+---------------+------------------+----------------------------------+--------------------+------+---------+--------+---------+---------------------+-------------------+------------------+---------------+-------------+--------------------+-----------+\n",
+      "|             studyId|  projectId|studyType|     traitFromSource|traitFromSourceMappedIds|geneId|tissueFromSourceId|pubmedId|publicationTitle|publicationFirstAuthor|publicationDate|publicationJournal|backgroundTraitFromSourceMappedIds|   initialSampleSize|nCases|nControls|nSamples|  cohorts|ldPopulationStructure|   discoverySamples|replicationSamples|qualityControls|analysisFlags|summarystatsLocation|hasSumstats|\n",
+      "+--------------------+-----------+---------+--------------------+------------------------+------+------------------+--------+----------------+----------------------+---------------+------------------+----------------------------------+--------------------+------+---------+--------+---------+---------------------+-------------------+------------------+---------------+-------------+--------------------+-----------+\n",
+      "|FINNGEN_R10_AB1_A...|FINNGEN_R10|     gwas|       Actinomycosis|                    null|  null|              null|    null|            null|                  null|           null|              null|                              null|377,277 (210,870 ...|   101|   363227|  363328|[FinnGen]|         [{fin, 1.0}]|[{377277, Finnish}]|              null|           null|         null|gs://finngen-publ...|       true|\n",
+      "|FINNGEN_R10_AB1_A...|FINNGEN_R10|     gwas|          Amoebiasis|                    null|  null|              null|    null|            null|                  null|           null|              null|                              null|377,277 (210,870 ...|   160|   367214|  367374|[FinnGen]|         [{fin, 1.0}]|[{377277, Finnish}]|              null|           null|         null|gs://finngen-publ...|       true|\n",
+      "|FINNGEN_R10_AB1_A...|FINNGEN_R10|     gwas|Anogenital herpes...|                    null|  null|              null|    null|            null|                  null|           null|              null|                              null|377,277 (210,870 ...|  1986|   400197|  402183|[FinnGen]|         [{fin, 1.0}]|[{377277, Finnish}]|              null|           null|         null|gs://finngen-publ...|       true|\n",
+      "|FINNGEN_R10_AB1_A...|FINNGEN_R10|     gwas|       Aspergillosis|                    null|  null|              null|    null|            null|                  null|           null|              null|                              null|377,277 (210,870 ...|   211|   403213|  403424|[FinnGen]|         [{fin, 1.0}]|[{377277, Finnish}]|              null|           null|         null|gs://finngen-publ...|       true|\n",
+      "|FINNGEN_R10_AB1_A...|FINNGEN_R10|     gwas|Atypical virus in...|                    null|  null|              null|    null|            null|                  null|           null|              null|                              null|377,277 (210,870 ...|   282|   409849|  410131|[FinnGen]|         [{fin, 1.0}]|[{377277, Finnish}]|              null|           null|         null|gs://finngen-publ...|       true|\n",
+      "+--------------------+-----------+---------+--------------------+------------------------+------+------------------+--------+----------------+----------------------+---------------+------------------+----------------------------------+--------------------+------+---------+--------+---------+---------------------+-------------------+------------------+---------------+-------------+--------------------+-----------+\n",
+      "only showing top 5 rows\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "si_new_df=si_new.df\n",
+    "si_new_df.show(5)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "                                                                                \r"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "57246\n",
+      "2408\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(si_old.count())\n",
+    "print(si_new_df.count())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "+--------------------+-----------------+\n",
+      "|      trait_reported|       trait_efos|\n",
+      "+--------------------+-----------------+\n",
+      "|Multifocal fibros...|['MONDO_0009230']|\n",
+      "|Disorders of muscles|  ['EFO_0002970']|\n",
+      "|\"\"\"Muscle wasting...|  ['EFO_0009851']|\n",
+      "|Other specified d...|  ['EFO_0002970']|\n",
+      "|       Muscle strain|  ['EFO_0010686']|\n",
+      "+--------------------+-----------------+\n",
+      "only showing top 5 rows\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "si_old=si_old.select(\"trait_reported\",\"trait_efos\")\n",
+    "si_old.show(5)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pyspark.sql.functions import lower\n",
+    "\n",
+    "si_old = si_old.withColumn(\"trait_reported_low\", lower(si_old[\"trait_reported\"])).select(\"trait_reported_low\",\"trait_efos\")\n",
+    "si_new_df= si_new_df.withColumn(\"trait_reported_low\", lower(si_new_df[\"traitFromSource\"]))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "                                                                                \r"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "2408"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "si_old = si_old.dropDuplicates(['trait_reported_low'])\n",
+    "joined_df = si_new_df.join(si_old, \"trait_reported_low\", how='left')\n",
+    "joined_df.count()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "                                                                                \r"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "+--------------------+--------------------+-----------+---------+--------------------+------------------------+------+------------------+--------+----------------+----------------------+---------------+------------------+----------------------------------+--------------------+------+---------+--------+---------+---------------------+-------------------+------------------+---------------+-------------+--------------------+-----------+--------------------+\n",
+      "|  trait_reported_low|             studyId|  projectId|studyType|     traitFromSource|traitFromSourceMappedIds|geneId|tissueFromSourceId|pubmedId|publicationTitle|publicationFirstAuthor|publicationDate|publicationJournal|backgroundTraitFromSourceMappedIds|   initialSampleSize|nCases|nControls|nSamples|  cohorts|ldPopulationStructure|   discoverySamples|replicationSamples|qualityControls|analysisFlags|summarystatsLocation|hasSumstats|          trait_efos|\n",
+      "+--------------------+--------------------+-----------+---------+--------------------+------------------------+------+------------------+--------+----------------+----------------------+---------------+------------------+----------------------------------+--------------------+------+---------+--------+---------+---------------------+-------------------+------------------+---------------+-------------+--------------------+-----------+--------------------+\n",
+      "|       actinomycosis|FINNGEN_R10_AB1_A...|FINNGEN_R10|     gwas|       Actinomycosis|                    null|  null|              null|    null|            null|                  null|           null|              null|                              null|377,277 (210,870 ...|   101|   363227|  363328|[FinnGen]|         [{fin, 1.0}]|[{377277, Finnish}]|              null|           null|         null|gs://finngen-publ...|       true|                null|\n",
+      "|          amoebiasis|FINNGEN_R10_AB1_A...|FINNGEN_R10|     gwas|          Amoebiasis|                    null|  null|              null|    null|            null|                  null|           null|              null|                              null|377,277 (210,870 ...|   160|   367214|  367374|[FinnGen]|         [{fin, 1.0}]|[{377277, Finnish}]|              null|           null|         null|gs://finngen-publ...|       true|     ['EFO_0007144']|\n",
+      "|anogenital herpes...|FINNGEN_R10_AB1_A...|FINNGEN_R10|     gwas|Anogenital herpes...|                    null|  null|              null|    null|            null|                  null|           null|              null|                              null|377,277 (210,870 ...|  1986|   400197|  402183|[FinnGen]|         [{fin, 1.0}]|[{377277, Finnish}]|              null|           null|         null|gs://finngen-publ...|       true|     ['EFO_0007282']|\n",
+      "|       aspergillosis|FINNGEN_R10_AB1_A...|FINNGEN_R10|     gwas|       Aspergillosis|                    null|  null|              null|    null|            null|                  null|           null|              null|                              null|377,277 (210,870 ...|   211|   403213|  403424|[FinnGen]|         [{fin, 1.0}]|[{377277, Finnish}]|              null|           null|         null|gs://finngen-publ...|       true|     ['EFO_0007157']|\n",
+      "|atypical virus in...|FINNGEN_R10_AB1_A...|FINNGEN_R10|     gwas|Atypical virus in...|                    null|  null|              null|    null|            null|                  null|           null|              null|                              null|377,277 (210,870 ...|   282|   409849|  410131|[FinnGen]|         [{fin, 1.0}]|[{377277, Finnish}]|              null|           null|         null|gs://finngen-publ...|       true|   ['MONDO_0024318']|\n",
+      "|bacterial infecti...|FINNGEN_R10_AB1_B...|FINNGEN_R10|     gwas|Bacterial infecti...|                    null|  null|              null|    null|            null|                  null|           null|              null|                              null|377,277 (210,870 ...| 20226|   363227|  383453|[FinnGen]|         [{fin, 1.0}]|[{377277, Finnish}]|              null|           null|         null|gs://finngen-publ...|       true|                null|\n",
+      "|bacterial, viral ...|FINNGEN_R10_AB1_B...|FINNGEN_R10|     gwas|Bacterial, viral ...|                    null|  null|              null|    null|            null|                  null|           null|              null|                              null|377,277 (210,870 ...|  2852|   409329|  412181|[FinnGen]|         [{fin, 1.0}]|[{377277, Finnish}]|              null|           null|         null|gs://finngen-publ...|       true|                null|\n",
+      "|other bacterial i...|FINNGEN_R10_AB1_B...|FINNGEN_R10|     gwas|Other bacterial i...|                    null|  null|              null|    null|            null|                  null|           null|              null|                              null|377,277 (210,870 ...|  6145|   367214|  373359|[FinnGen]|         [{fin, 1.0}]|[{377277, Finnish}]|              null|           null|         null|gs://finngen-publ...|       true|     ['EFO_0000771']|\n",
+      "|         candidiasis|FINNGEN_R10_AB1_C...|FINNGEN_R10|     gwas|         Candidiasis|                    null|  null|              null|    null|            null|                  null|           null|              null|                              null|377,277 (210,870 ...|  4306|   403213|  407519|[FinnGen]|         [{fin, 1.0}]|[{377277, Finnish}]|              null|           null|         null|gs://finngen-publ...|       true|   ['MONDO_0002026']|\n",
+      "|other sexually tr...|FINNGEN_R10_AB1_C...|FINNGEN_R10|     gwas|Other sexually tr...|                    null|  null|              null|    null|            null|                  null|           null|              null|                              null|377,277 (210,870 ...|  2186|   400197|  402383|[FinnGen]|         [{fin, 1.0}]|[{377277, Finnish}]|              null|           null|         null|gs://finngen-publ...|       true|['MONDO_0021681',...|\n",
+      "|             cholera|FINNGEN_R10_AB1_C...|FINNGEN_R10|     gwas|             Cholera|                    null|  null|              null|    null|            null|                  null|           null|              null|                              null|377,277 (210,870 ...|  1385|   367214|  368599|[FinnGen]|         [{fin, 1.0}]|[{377277, Finnish}]|              null|           null|         null|gs://finngen-publ...|       true|     ['EFO_1001235']|\n",
+      "|dengue fever [cla...|FINNGEN_R10_AB1_D...|FINNGEN_R10|     gwas|Dengue fever [cla...|                    null|  null|              null|    null|            null|                  null|           null|              null|                              null|377,277 (210,870 ...|    53|   409137|  409190|[FinnGen]|         [{fin, 1.0}]|[{377277, Finnish}]|              null|           null|         null|gs://finngen-publ...|       true|                null|\n",
+      "|     dermatophytosis|FINNGEN_R10_AB1_D...|FINNGEN_R10|     gwas|     Dermatophytosis|                    null|  null|              null|    null|            null|                  null|           null|              null|                              null|377,277 (210,870 ...|  3921|   403213|  407134|[FinnGen]|         [{fin, 1.0}]|[{377277, Finnish}]|              null|           null|         null|gs://finngen-publ...|       true|   ['MONDO_0004678']|\n",
+      "|      early syphilis|FINNGEN_R10_AB1_E...|FINNGEN_R10|     gwas|      Early syphilis|                    null|  null|              null|    null|            null|                  null|           null|              null|                              null|377,277 (210,870 ...|   308|   400197|  400505|[FinnGen]|         [{fin, 1.0}]|[{377277, Finnish}]|              null|           null|         null|gs://finngen-publ...|       true|     ['EFO_0007504']|\n",
+      "|infectious mononu...| FINNGEN_R10_AB1_EBV|FINNGEN_R10|     gwas|Infectious mononu...|                    null|  null|              null|    null|            null|                  null|           null|              null|                              null|377,277 (210,870 ...|  2979|   400974|  403953|[FinnGen]|         [{fin, 1.0}]|[{377277, Finnish}]|              null|           null|         null|gs://finngen-publ...|       true|     ['EFO_0007326']|\n",
+      "|        enterobiasis|FINNGEN_R10_AB1_E...|FINNGEN_R10|     gwas|        Enterobiasis|                    null|  null|              null|    null|            null|                  null|           null|              null|                              null|377,277 (210,870 ...|   112|   411658|  411770|[FinnGen]|         [{fin, 1.0}]|[{377277, Finnish}]|              null|           null|         null|gs://finngen-publ...|       true|                null|\n",
+      "|          erysipelas|FINNGEN_R10_AB1_E...|FINNGEN_R10|     gwas|          Erysipelas|                    null|  null|              null|    null|            null|                  null|           null|              null|                              null|377,277 (210,870 ...| 22261|   363227|  385488|[FinnGen]|         [{fin, 1.0}]|[{377277, Finnish}]|              null|           null|         null|gs://finngen-publ...|       true|     ['EFO_1001462']|\n",
+      "|diarrhoea and gas...|FINNGEN_R10_AB1_G...|FINNGEN_R10|     gwas|Diarrhoea and gas...|                    null|  null|              null|    null|            null|                  null|           null|              null|                              null|377,277 (210,870 ...| 32210|   367214|  399424|[FinnGen]|         [{fin, 1.0}]|[{377277, Finnish}]|              null|           null|         null|gs://finngen-publ...|       true|   ['MONDO_0045031']|\n",
+      "|gonococcal infection|FINNGEN_R10_AB1_G...|FINNGEN_R10|     gwas|Gonococcal infection|                    null|  null|              null|    null|            null|                  null|           null|              null|                              null|377,277 (210,870 ...|   954|   400197|  401151|[FinnGen]|         [{fin, 1.0}]|[{377277, Finnish}]|              null|           null|         null|gs://finngen-publ...|       true|       ['DOID_7551']|\n",
+      "|       helminthiases|FINNGEN_R10_AB1_H...|FINNGEN_R10|     gwas|       Helminthiases|                    null|  null|              null|    null|            null|                  null|           null|              null|                              null|377,277 (210,870 ...|   523|   411658|  412181|[FinnGen]|         [{fin, 1.0}]|[{377277, Finnish}]|              null|           null|         null|gs://finngen-publ...|       true|['EFO_0007245', '...|\n",
+      "+--------------------+--------------------+-----------+---------+--------------------+------------------------+------+------------------+--------+----------------+----------------------+---------------+------------------+----------------------------------+--------------------+------+---------+--------+---------+---------------------+-------------------+------------------+---------------+-------------+--------------------+-----------+--------------------+\n",
+      "only showing top 20 rows\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "joined_df.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "                                                                                \r"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "1542\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[Stage 22:===========================================>              (3 + 1) / 4]\r"
+     ]
+    }
+   ],
+   "source": [
+    "num_non_null_rows = joined_df.filter(joined_df.trait_efos.isNotNull()).count()\n",
+    "print(num_non_null_rows)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "+------------+---------+---------+--------------------+------------------------+------+------------------+--------+--------------------+----------------------+---------------+--------------------+----------------------------------+--------------------+------+---------+--------+--------------------+---------------------+--------------------+--------------------+---------------+-------------+--------------------+-----------+\n",
+      "|     studyId|projectId|studyType|     traitFromSource|traitFromSourceMappedIds|geneId|tissueFromSourceId|pubmedId|    publicationTitle|publicationFirstAuthor|publicationDate|  publicationJournal|backgroundTraitFromSourceMappedIds|   initialSampleSize|nCases|nControls|nSamples|             cohorts|ldPopulationStructure|    discoverySamples|  replicationSamples|qualityControls|analysisFlags|summarystatsLocation|hasSumstats|\n",
+      "+------------+---------+---------+--------------------+------------------------+------+------------------+--------+--------------------+----------------------+---------------+--------------------+----------------------------------+--------------------+------+---------+--------+--------------------+---------------------+--------------------+--------------------+---------------+-------------+--------------------+-----------+\n",
+      "|  GCST000102|     GCST|     gwas|Endothelial funct...|           [EFO_0004298]|  null|              null|17903301|Genome-wide assoc...|              Vasan RS|     2007-09-19|       BMC Med Genet|                              null|Up to 1,238 Europ...|     0|        0|    1238|              [null]|         [{nfe, 1.0}]|  [{1238, European}]|                  []|           null|         null|                null|      false|\n",
+      "|  GCST000272|     GCST|     gwas|              Height|           [EFO_0004339]|  null|              null|19030899|Genome-wide assoc...|                Lei SF|     2008-11-23|           Hum Genet|                              null|618 Chinese ances...|     0|        0|     618|              [null]|         [{eas, 1.0}]| [{618, East Asian}]|[{2953, East Asian}]|           null|         null|                null|      false|\n",
+      "|  GCST000436|     GCST|     gwas|Acenocoumarol mai...|            [GO_0061476]|  null|              null|19578179|A genome-wide ass...|            Teichert M|     2009-07-04|       Hum Mol Genet|                              null|1,451 European an...|     0|        0|    1451|              [null]|         [{nfe, 1.0}]|  [{1451, European}]|         [{287, NR}]|           null|         null|                null|      false|\n",
+      "|  GCST000514|     GCST|     gwas|Response to antip...|            [GO_0097332]|  null|              null|19875103|Genomewide associ...|               Aberg K|     2009-10-27|     Biol Psychiatry|                              null|421 European ance...|   738|        0|     738|              [null]| [{afr, 0.28997289...|[{214, African Am...|                  []|           null|         null|                null|      false|\n",
+      "|  GCST000550|     GCST|     gwas|   Metabolite levels|           [EFO_0004725]|  null|              null|20037589|A genome-wide per...|               Illig T|     2009-12-27|           Nat Genet|                              null|1,029 European an...|     0|        0|    1029|              [null]|         [{nfe, 1.0}]|  [{1029, European}]|  [{1202, European}]|           null|         null|                null|      false|\n",
+      "|  GCST000708|     GCST|     gwas|           Freckling|           [EFO_0003963]|  null|              null|20585627|Web-based, partic...|            Eriksson N|     2010-06-24|          PLoS Genet|                              null|9,126 European an...|     0|        0|    9126|              [null]|         [{nfe, 1.0}]|  [{9126, European}]|                  []|           null|         null|                null|      false|\n",
+      "|  GCST000754|     GCST|     gwas|Personality dimen...|           [EFO_0004365]|  null|              null|20691247|A genome-wide ass...|            Verweij KJ|     2010-08-04|        Biol Psychol|                              null|5,117 European an...|     0|        0|    5117|              [null]|         [{nfe, 1.0}]|  [{5117, European}]|                  []|           null|         null|                null|      false|\n",
+      "|  GCST000880|     GCST|     gwas|Menarche (age at ...|           [EFO_0004703]|  null|              null|21102462|Thirty new loci f...|               Elks CE|     2010-11-21|           Nat Genet|                              null|86,142 European a...|     0|        0|   87802|              [null]|         [{nfe, 1.0}]| [{87802, European}]| [{14731, European}]|           null|         null|                null|      false|\n",
+      "|  GCST001031|     GCST|     gwas|Large B-cell lymp...|           [EFO_0000403]|  null|              null|21471979|Common variants o...|               Kumar V|     2011-04-07|         J Hum Genet|                              null|74 Japanese ances...|    74|      934|    1008|              [null]|         [{eas, 1.0}]|[{1008, East Asian}]|[{3634, East Asian}]|           null|         null|                null|      false|\n",
+      "|  GCST001032|     GCST|     gwas|Caffeine consumption|           [EFO_0004330]|  null|              null|21490707|Genome-wide meta-...|           Cornelis MC|     2011-04-07|          PLoS Genet|                              null|47,431 European a...|     0|        0|   47431|              [null]|         [{nfe, 1.0}]| [{47431, European}]|                  []|           null|         null|                null|      false|\n",
+      "|  GCST001059|     GCST|     gwas|    Neutrophil count|           [EFO_0004833]|  null|              null|21507922|Duffy-null-associ...|            Ramsuran V|     2011-05-01|     Clin Infect Dis|                              null|115 African ances...|     0|        0|     115|              [null]|         [{afr, 1.0}]|[{115, Sub-Sahara...|                  []|           null|         null|                null|      false|\n",
+      "|  GCST002187|     GCST|     gwas|Systolic blood pr...|           [EFO_0006335]|  null|              null|24058526|Genome-wide meta-...|           Bhatnagar P|     2013-09-13|            PLoS One|                   [MONDO_0011382]|1617 African Amer...|  1617|        0|    1617|              [null]|         [{afr, 1.0}]|[{1617, African A...|                  []|           null|         null|                null|      false|\n",
+      "|  GCST002623|     GCST|     gwas|   L-arginine levels|           [EFO_0006524]|  null|              null|25245031|Genome-wide assoc...|            Luneburg N|     2014-09-21|Circ Cardiovasc G...|                              null|3,747 European an...|     0|        0|    6739|              [null]|         [{nfe, 1.0}]|[{3747, European}...|  [{1159, European}]|           null|         null|                null|      false|\n",
+      "|  GCST003261|     GCST|     gwas|Ischemic stroke (...|            [HP_0002140]|  null|              null|26708676|Loci associated w...|              Pulit SL|     2015-12-18|       Lancet Neurol|                              null|up to 8,062 Europ...|  9510|    32473|   41983|              [null]| [{amr, 0.06647928...|[{2791, Hispanic ...|[{256, African Am...|           null|         null|                null|      false|\n",
+      "|  GCST003427|     GCST|     gwas|Alzheimer disease...|    [EFO_0004847, MON...|  null|              null|26830138|Family-based asso...|              Herold C|     2016-02-02|      Mol Psychiatry|                              null|2,478 European an...|  2478|      979|    3457|              [null]|         [{nfe, 1.0}]|  [{3524, European}]|                  []|           null|         null|                null|      false|\n",
+      "|  GCST003665|     GCST|     gwas|Free cholesterol ...|    [EFO_0004611, EFO...|  null|              null|27005778|Genome-wide study...|            Kettunen J|     2016-03-23|          Nat Commun|                              null|21,555 European a...|     0|        0|   21555|[EGCUT, ERF, FTC,...|         [{nfe, 1.0}]| [{21555, European}]|                  []|           null|         null|                null|      false|\n",
+      "|  GCST003773|     GCST|     gwas|Loneliness (multi...|           [EFO_0007865]|  null|              null|27629369|Genome-Wide Assoc...|                 Gao J|     2016-09-15|Neuropsychopharma...|                              null|8,490 European an...|     0|        0|   10760|              [null]| [{nfe, 0.80529739...|[{8490, European}...|                  []|           null|         null|                null|      false|\n",
+      "|  GCST003791|     GCST|     gwas|Response to metfo...|    [EFO_0006952, GO_...|  null|              null|28173075|Metformin pharmac...|                 Niu N|     2016-09-11|       Hum Mol Genet|                              null|up to 96 African ...|     0|        0|     288|              [null]| [{afr, 0.33333333...|[{96, African Ame...|                  []|           null|         null|                null|      false|\n",
+      "|  GCST003824|     GCST|     gwas|Depression in res...|    [EFO_0007006, EFO...|  null|              null|27723809|Genome-Wide Assoc...|           Matsunami K|     2016-10-10|            PLoS One|                     [EFO_0004220]|45 Japanese ances...|    45|      179|     224|              [null]|         [{eas, 1.0}]| [{224, East Asian}]| [{160, East Asian}]|           null|         null|                null|      false|\n",
+      "|  GCST003837|     GCST|     gwas|          Chronotype|           [EFO_0004354]|  null|              null|27494321|Genome-Wide Assoc...|              Jones SE|     2016-08-05|          PLoS Genet|                              null|127,898 British i...|     0|        0|  127898|              [null]|         [{nfe, 1.0}]|[{127898, European}]|       [{89283, NR}]|             []|           []|ftp://ftp.ebi.ac....|       true|\n",
+      "|  GCST004678|     GCST|     gwas|Psychosis pronene...|           [EFO_0008337]|  null|              null|28525603|Genome-Wide Assoc...|       Ortega-Alonso A|     2017-05-19|      Schizophr Bull|                              null|3,967 Finnish anc...|     0|        0|    3967|              [null]|         [{nfe, 1.0}]|  [{3967, European}]|                  []|           null|         null|                null|      false|\n",
+      "|  GCST005189|     GCST|     gwas|             Tanning|           [EFO_0004279]|  null|              null|29195075|An Unexpectedly C...|             Martin AR|     2017-11-30|                Cell|                              null|216 Sub-Saharan A...|     0|        0|     216|              [null]|         [{afr, 1.0}]|[{216, Sub-Sahara...|[{240, Sub-Sahara...|           null|         null|                null|      false|\n",
+      "|  GCST005437|     GCST|     gwas|Random C-peptide ...|           [EFO_0005187]|  null|              null|29404672|Meta-genome-wide ...|           Roshandel D|     2018-02-05|        Diabetologia|                   [MONDO_0005147]|1,497 European an...|     0|        0|    1497|              [null]|         [{nfe, 1.0}]|  [{1497, European}]|                  []|           null|         null|                null|      false|\n",
+      "|  GCST005503|     GCST|     gwas|Medium HDL partic...|           [EFO_0004612]|  null|              null|29084231|Common, low-frequ...|              Davis JP|     2017-10-30|          PLoS Genet|                              null|8,372 Finnish anc...|     0|        0|    8372|              [null]|         [{nfe, 1.0}]|  [{8372, European}]|                  []|           null|         null|                null|      false|\n",
+      "|  GCST005669|     GCST|     gwas|Delta-6 desaturas...|    [EFO_0007765, EFO...|  null|              null|29246731|A common variant ...|      de Toro-Martin J|     2017-11-02|      J Clin Lipidol|                              null|81 extreme respon...|     0|        0|     141|              [null]|         [{nfe, 1.0}]|         [{141, NR}]|                  []|           null|         null|                null|      false|\n",
+      "|  GCST005749|     GCST|     gwas|Digit length rati...|           [EFO_0004841]|  null|              null|29659830|Genome-wide assoc...|         Warrington NM|     2018-04-12|       Hum Mol Genet|                              null|14,382 European a...|     0|        0|   15661|              [null]|         [{nfe, 1.0}]|[{14382, European...|                  []|           null|         null|                null|      false|\n",
+      "|  GCST006420|     GCST|     gwas|Affective disorde...|    [EFO_0004247, EFO...|  null|              null|30116032|Genetics of suici...|           Erlangsen A|     2018-08-16|      Mol Psychiatry|                              null|4,302 European an...|  4302|    13294|   17596|              [null]|         [{nfe, 1.0}]| [{17596, European}]|                  []|           null|         null|                null|      false|\n",
+      "|  GCST006484|     GCST|     gwas|     Type 2 diabetes|         [MONDO_0005148]|  null|              null|30130595|Pilot genome-wide...|     Dominguez-Cruz MG|     2018-08-18|                Gene|                              null|45 Maya ancestry ...|    45|       47|      92|              [null]|         [{amr, 1.0}]|[{92, Native Amer...|                  []|           null|         null|                null|      false|\n",
+      "|  GCST006496|     GCST|     gwas|Glomerular filtra...|    [EFO_0006829, EFO...|  null|              null|30160337|Genome Wide Assoc...|               Asleh R|     2018-08-30|     Clin Transplant|                              null|243 European ance...|     0|        0|     251|              [null]| [{nfe, 0.99601593...|[{243, European},...|                  []|           null|         null|                null|      false|\n",
+      "|  GCST006739|     GCST|     gwas|Proportion of mis...|           [EFO_0006923]|  null|              null|30188897|Detecting past an...|               Jeong C|     2018-09-06|          PLoS Genet|                              null|981 Tibetan ances...|     0|        0|     981|              [null]|         [{nfe, 1.0}]|         [{981, NR}]|                  []|           null|         null|                null|      false|\n",
+      "|  GCST006907|     GCST|     gwas|Ischemic stroke (...|           [EFO_0005524]|  null|              null|29531354|Multiancestry gen...|               Malik R|     2018-03-12|           Nat Genet|                              null|4,373 European an...|  4373|   406111|  410484|              [null]|         [{nfe, 1.0}]|[{150765, European}]|                  []|             []|           []|ftp://ftp.ebi.ac....|       true|\n",
+      "|  GCST006960|     GCST|     gwas|Inflammatory bowe...|           [EFO_0003767]|  null|              null|26490195|Inherited determi...|             Cleynen I|     2015-10-18|              Lancet|                              null|16,902 European a...| 29838|        0|   29838|              [null]|         [{nfe, 1.0}]| [{29838, European}]|  [{6182, European}]|           null|         null|                null|      false|\n",
+      "|  GCST007217|     GCST|     gwas|RR interval (hear...|           [EFO_0004831]|  null|              null|30679814|Genome-wide assoc...|          van Setten J|     2019-01-24|     Eur J Hum Genet|                              null|2,006 Erasmus Ruc...|     0|        0|   28698|              [null]|         [{nfe, 1.0}]| [{28698, European}]|                  []|           null|         null|                null|      false|\n",
+      "|  GCST008154|     GCST|     gwas|      Trunk fat mass|           [EFO_0005409]|  null|              null|28552196|Whole-Genome Sequ...|         Tachmazidou I|     2017-06-01|      Am J Hum Genet|                              null|3,399 whole genom...|     0|        0|   16237|              [null]|         [{nfe, 1.0}]|[{3538, NR}, {128...| [{10667, European}]|           null|         null|                null|      false|\n",
+      "|  GCST008483|     GCST|     gwas|  Ulcerative colitis|           [EFO_0000729]|  null|              null|26398853|Identification of...|                 Ye BD|     2016-01-01|   Inflamm Bowel Dis|                              null|705 Korean ancest...|   705|     1178|    1883|              [null]|         [{eas, 1.0}]|[{1883, South Asi...|[{3674, South Asi...|           null|         null|                null|      false|\n",
+      "|  GCST008671|     GCST|     gwas|Phlegm x occupati...|    [EFO_0007939, EFO...|  null|              null|30449631|Genome-wide inter...|                Zeng X|     2018-11-15|         Environ Int|                              null|1,702 Dutch ances...|  1702|     6274|    7976|              [null]|         [{nfe, 1.0}]|  [{7976, European}]|  [{6789, European}]|           null|         null|                null|      false|\n",
+      "|  GCST008675|     GCST|     gwas|Maximum habitual ...|           [EFO_0007878]|  null|              null|31151762|Genome-wide Assoc...|           Gelernter J|     2019-04-08|     Biol Psychiatry|                              null|126,936 European ...|     0|        0|  143965|              [null]| [{afr, 0.11828569...|[{17029, African ...|                  []|           null|         null|                null|      false|\n",
+      "|  GCST008775|     GCST|     gwas|Birth weight or w...|    [EFO_0004342, EFO...|  null|              null|30858448|Genetic overlap b...|        Tekola-Ayele F|     2019-03-11|             Sci Rep|                              null|153,781 European ...|     0|        0|  378240|              [null]|         [{nfe, 1.0}]|[{246502, Europea...|                  []|           null|         null|                null|      false|\n",
+      "|  GCST008870|     GCST|     gwas|Keratinocyte canc...|           [EFO_0010176]|  null|              null|31174203|Combined analysis...|           Liyanage UE|     2019-06-07|       Hum Mol Genet|                              null|at least 18,538 E...| 18538|   340302|  358840|              [null]|         [{nfe, 1.0}]|[{358840, European}]|                  []|           null|         null|                null|      false|\n",
+      "|  GCST009173|     GCST|     gwas|Response to (pegy...|           [EFO_0007859]|  null|              null|30715261|Genome Wide Assoc...|            Brouwer WP|     2019-02-02|     Clin Infect Dis|                     [EFO_0004239]|121 Asian, Europe...|     0|        0|     509|              [null]| [{nfe, 0.5}, {afr...|[{127, European},...|                  []|           null|         null|                null|      false|\n",
+      "|  GCST009364|     GCST|     gwas|Triglyceride leve...|    [EFO_0004530, EFO...|  null|              null|31719535|Multi-ancestry sl...|             Noordam R|     2019-11-12|          Nat Commun|                              null|at least 2,926 Af...|     0|    49886|   61990|              [null]| [{eas, 0.03837715...|[{2096, East Asia...|[{12579, Hispanic...|           null|         null|                null|      false|\n",
+      "|  GCST009391|     GCST|     gwas|Metabolite levels...|           [EFO_0005132]|  null|              null|23823483|A genome-wide ass...|               Rhee EP|     2013-07-02|          Cell Metab|                              null|2,076 European an...|     0|        0|    2076|              [null]|         [{nfe, 1.0}]|  [{2076, European}]|                  []|           null|         null|                null|      false|\n",
+      "|GCST009391_2|     GCST|     gwas|   Metabolite levels|                      []|  null|              null|23823483|A genome-wide ass...|               Rhee EP|     2013-07-02|          Cell Metab|                              null|2,076 European an...|     0|        0|    2076|              [null]|         [{nfe, 1.0}]|  [{2076, European}]|                  []|           null|         null|                null|      false|\n",
+      "|GCST009391_3|     GCST|     gwas|   Metabolite levels|    [EFO_0004468, EFO...|  null|              null|23823483|A genome-wide ass...|               Rhee EP|     2013-07-02|          Cell Metab|                              null|2,076 European an...|     0|        0|    2076|              [null]|         [{nfe, 1.0}]|  [{2076, European}]|                  []|           null|         null|                null|      false|\n",
+      "|GCST009391_4|     GCST|     gwas|   Metabolite levels|           [EFO_0004518]|  null|              null|23823483|A genome-wide ass...|               Rhee EP|     2013-07-02|          Cell Metab|                              null|2,076 European an...|     0|        0|    2076|              [null]|         [{nfe, 1.0}]|  [{2076, European}]|                  []|           null|         null|                null|      false|\n",
+      "|GCST009391_5|     GCST|     gwas|   Metabolite levels|           [EFO_0004761]|  null|              null|23823483|A genome-wide ass...|               Rhee EP|     2013-07-02|          Cell Metab|                              null|2,076 European an...|     0|        0|    2076|              [null]|         [{nfe, 1.0}]|  [{2076, European}]|                  []|           null|         null|                null|      false|\n",
+      "|GCST009391_6|     GCST|     gwas|   Metabolite levels|           [EFO_0004846]|  null|              null|23823483|A genome-wide ass...|               Rhee EP|     2013-07-02|          Cell Metab|                              null|2,076 European an...|     0|        0|    2076|              [null]|         [{nfe, 1.0}]|  [{2076, European}]|                  []|           null|         null|                null|      false|\n",
+      "|GCST009391_7|     GCST|     gwas|   Metabolite levels|           [EFO_0005001]|  null|              null|23823483|A genome-wide ass...|               Rhee EP|     2013-07-02|          Cell Metab|                              null|2,076 European an...|     0|        0|    2076|              [null]|         [{nfe, 1.0}]|  [{2076, European}]|                  []|           null|         null|                null|      false|\n",
+      "|GCST009391_8|     GCST|     gwas|   Metabolite levels|           [EFO_0005002]|  null|              null|23823483|A genome-wide ass...|               Rhee EP|     2013-07-02|          Cell Metab|                              null|2,076 European an...|     0|        0|    2076|              [null]|         [{nfe, 1.0}]|  [{2076, European}]|                  []|           null|         null|                null|      false|\n",
+      "|GCST009391_9|     GCST|     gwas|   Metabolite levels|           [EFO_0005058]|  null|              null|23823483|A genome-wide ass...|               Rhee EP|     2013-07-02|          Cell Metab|                              null|2,076 European an...|     0|        0|    2076|              [null]|         [{nfe, 1.0}]|  [{2076, European}]|                  []|           null|         null|                null|      false|\n",
+      "+------------+---------+---------+--------------------+------------------------+------+------------------+--------+--------------------+----------------------+---------------+--------------------+----------------------------------+--------------------+------+---------+--------+--------------------+---------------------+--------------------+--------------------+---------------+-------------+--------------------+-----------+\n",
+      "only showing top 50 rows\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "path_tmp=\"gs://gwas_catalog_data/study_index\"\n",
+    "tmp=StudyIndex.from_parquet(session=session, path=path_tmp)\n",
+    "tmp.df.show(50)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "joined_df=joined_df.withColumn(\"traitFromSourceMappedIds\",joined_df[\"trait_efos\"]).drop(\"trait_efos\",\"trait_reported_low\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "                                                                                \r"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "+--------------------+-----------+---------+--------------------+------------------------+------+------------------+--------+----------------+----------------------+---------------+------------------+----------------------------------+--------------------+------+---------+--------+---------+---------------------+-------------------+------------------+---------------+-------------+--------------------+-----------+\n",
+      "|             studyId|  projectId|studyType|     traitFromSource|traitFromSourceMappedIds|geneId|tissueFromSourceId|pubmedId|publicationTitle|publicationFirstAuthor|publicationDate|publicationJournal|backgroundTraitFromSourceMappedIds|   initialSampleSize|nCases|nControls|nSamples|  cohorts|ldPopulationStructure|   discoverySamples|replicationSamples|qualityControls|analysisFlags|summarystatsLocation|hasSumstats|\n",
+      "+--------------------+-----------+---------+--------------------+------------------------+------+------------------+--------+----------------+----------------------+---------------+------------------+----------------------------------+--------------------+------+---------+--------+---------+---------------------+-------------------+------------------+---------------+-------------+--------------------+-----------+\n",
+      "|FINNGEN_R10_AB1_A...|FINNGEN_R10|     gwas|       Actinomycosis|                    null|  null|              null|    null|            null|                  null|           null|              null|                              null|377,277 (210,870 ...|   101|   363227|  363328|[FinnGen]|         [{fin, 1.0}]|[{377277, Finnish}]|              null|           null|         null|gs://finngen-publ...|       true|\n",
+      "|FINNGEN_R10_AB1_A...|FINNGEN_R10|     gwas|          Amoebiasis|         ['EFO_0007144']|  null|              null|    null|            null|                  null|           null|              null|                              null|377,277 (210,870 ...|   160|   367214|  367374|[FinnGen]|         [{fin, 1.0}]|[{377277, Finnish}]|              null|           null|         null|gs://finngen-publ...|       true|\n",
+      "|FINNGEN_R10_AB1_A...|FINNGEN_R10|     gwas|Anogenital herpes...|         ['EFO_0007282']|  null|              null|    null|            null|                  null|           null|              null|                              null|377,277 (210,870 ...|  1986|   400197|  402183|[FinnGen]|         [{fin, 1.0}]|[{377277, Finnish}]|              null|           null|         null|gs://finngen-publ...|       true|\n",
+      "|FINNGEN_R10_AB1_A...|FINNGEN_R10|     gwas|       Aspergillosis|         ['EFO_0007157']|  null|              null|    null|            null|                  null|           null|              null|                              null|377,277 (210,870 ...|   211|   403213|  403424|[FinnGen]|         [{fin, 1.0}]|[{377277, Finnish}]|              null|           null|         null|gs://finngen-publ...|       true|\n",
+      "|FINNGEN_R10_AB1_A...|FINNGEN_R10|     gwas|Atypical virus in...|       ['MONDO_0024318']|  null|              null|    null|            null|                  null|           null|              null|                              null|377,277 (210,870 ...|   282|   409849|  410131|[FinnGen]|         [{fin, 1.0}]|[{377277, Finnish}]|              null|           null|         null|gs://finngen-publ...|       true|\n",
+      "|FINNGEN_R10_AB1_B...|FINNGEN_R10|     gwas|Bacterial infecti...|                    null|  null|              null|    null|            null|                  null|           null|              null|                              null|377,277 (210,870 ...| 20226|   363227|  383453|[FinnGen]|         [{fin, 1.0}]|[{377277, Finnish}]|              null|           null|         null|gs://finngen-publ...|       true|\n",
+      "|FINNGEN_R10_AB1_B...|FINNGEN_R10|     gwas|Bacterial, viral ...|                    null|  null|              null|    null|            null|                  null|           null|              null|                              null|377,277 (210,870 ...|  2852|   409329|  412181|[FinnGen]|         [{fin, 1.0}]|[{377277, Finnish}]|              null|           null|         null|gs://finngen-publ...|       true|\n",
+      "|FINNGEN_R10_AB1_B...|FINNGEN_R10|     gwas|Other bacterial i...|         ['EFO_0000771']|  null|              null|    null|            null|                  null|           null|              null|                              null|377,277 (210,870 ...|  6145|   367214|  373359|[FinnGen]|         [{fin, 1.0}]|[{377277, Finnish}]|              null|           null|         null|gs://finngen-publ...|       true|\n",
+      "|FINNGEN_R10_AB1_C...|FINNGEN_R10|     gwas|         Candidiasis|       ['MONDO_0002026']|  null|              null|    null|            null|                  null|           null|              null|                              null|377,277 (210,870 ...|  4306|   403213|  407519|[FinnGen]|         [{fin, 1.0}]|[{377277, Finnish}]|              null|           null|         null|gs://finngen-publ...|       true|\n",
+      "|FINNGEN_R10_AB1_C...|FINNGEN_R10|     gwas|Other sexually tr...|    ['MONDO_0021681',...|  null|              null|    null|            null|                  null|           null|              null|                              null|377,277 (210,870 ...|  2186|   400197|  402383|[FinnGen]|         [{fin, 1.0}]|[{377277, Finnish}]|              null|           null|         null|gs://finngen-publ...|       true|\n",
+      "|FINNGEN_R10_AB1_C...|FINNGEN_R10|     gwas|             Cholera|         ['EFO_1001235']|  null|              null|    null|            null|                  null|           null|              null|                              null|377,277 (210,870 ...|  1385|   367214|  368599|[FinnGen]|         [{fin, 1.0}]|[{377277, Finnish}]|              null|           null|         null|gs://finngen-publ...|       true|\n",
+      "|FINNGEN_R10_AB1_D...|FINNGEN_R10|     gwas|Dengue fever [cla...|                    null|  null|              null|    null|            null|                  null|           null|              null|                              null|377,277 (210,870 ...|    53|   409137|  409190|[FinnGen]|         [{fin, 1.0}]|[{377277, Finnish}]|              null|           null|         null|gs://finngen-publ...|       true|\n",
+      "|FINNGEN_R10_AB1_D...|FINNGEN_R10|     gwas|     Dermatophytosis|       ['MONDO_0004678']|  null|              null|    null|            null|                  null|           null|              null|                              null|377,277 (210,870 ...|  3921|   403213|  407134|[FinnGen]|         [{fin, 1.0}]|[{377277, Finnish}]|              null|           null|         null|gs://finngen-publ...|       true|\n",
+      "|FINNGEN_R10_AB1_E...|FINNGEN_R10|     gwas|      Early syphilis|         ['EFO_0007504']|  null|              null|    null|            null|                  null|           null|              null|                              null|377,277 (210,870 ...|   308|   400197|  400505|[FinnGen]|         [{fin, 1.0}]|[{377277, Finnish}]|              null|           null|         null|gs://finngen-publ...|       true|\n",
+      "| FINNGEN_R10_AB1_EBV|FINNGEN_R10|     gwas|Infectious mononu...|         ['EFO_0007326']|  null|              null|    null|            null|                  null|           null|              null|                              null|377,277 (210,870 ...|  2979|   400974|  403953|[FinnGen]|         [{fin, 1.0}]|[{377277, Finnish}]|              null|           null|         null|gs://finngen-publ...|       true|\n",
+      "|FINNGEN_R10_AB1_E...|FINNGEN_R10|     gwas|        Enterobiasis|                    null|  null|              null|    null|            null|                  null|           null|              null|                              null|377,277 (210,870 ...|   112|   411658|  411770|[FinnGen]|         [{fin, 1.0}]|[{377277, Finnish}]|              null|           null|         null|gs://finngen-publ...|       true|\n",
+      "|FINNGEN_R10_AB1_E...|FINNGEN_R10|     gwas|          Erysipelas|         ['EFO_1001462']|  null|              null|    null|            null|                  null|           null|              null|                              null|377,277 (210,870 ...| 22261|   363227|  385488|[FinnGen]|         [{fin, 1.0}]|[{377277, Finnish}]|              null|           null|         null|gs://finngen-publ...|       true|\n",
+      "|FINNGEN_R10_AB1_G...|FINNGEN_R10|     gwas|Diarrhoea and gas...|       ['MONDO_0045031']|  null|              null|    null|            null|                  null|           null|              null|                              null|377,277 (210,870 ...| 32210|   367214|  399424|[FinnGen]|         [{fin, 1.0}]|[{377277, Finnish}]|              null|           null|         null|gs://finngen-publ...|       true|\n",
+      "|FINNGEN_R10_AB1_G...|FINNGEN_R10|     gwas|Gonococcal infection|           ['DOID_7551']|  null|              null|    null|            null|                  null|           null|              null|                              null|377,277 (210,870 ...|   954|   400197|  401151|[FinnGen]|         [{fin, 1.0}]|[{377277, Finnish}]|              null|           null|         null|gs://finngen-publ...|       true|\n",
+      "|FINNGEN_R10_AB1_H...|FINNGEN_R10|     gwas|       Helminthiases|    ['EFO_0007245', '...|  null|              null|    null|            null|                  null|           null|              null|                              null|377,277 (210,870 ...|   523|   411658|  412181|[FinnGen]|         [{fin, 1.0}]|[{377277, Finnish}]|              null|           null|         null|gs://finngen-publ...|       true|\n",
+      "+--------------------+-----------+---------+--------------------+------------------------+------+------------------+--------+----------------+----------------------+---------------+------------------+----------------------------------+--------------------+------+---------+--------+---------+---------------------+-------------------+------------------+---------------+-------------+--------------------+-----------+\n",
+      "only showing top 20 rows\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "joined_df.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "string\n"
+     ]
+    }
+   ],
+   "source": [
+    "column_type = dict(joined_df.dtypes)[\"traitFromSourceMappedIds\"]\n",
+    "print(column_type)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pyspark.sql.functions import from_json\n",
+    "from pyspark.sql.types import ArrayType, StringType\n",
+    "\n",
+    "# Parse the stringified list of mapped trait IDs into an array<string> column\n",
+    "joined_df = joined_df.withColumn(\n",
+    "    \"traitFromSourceMappedIds\",\n",
+    "    from_json(\"traitFromSourceMappedIds\", ArrayType(StringType()))\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "                                                                                \r"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "+--------------------+-----------+---------+--------------------+------------------------+------+------------------+--------+----------------+----------------------+---------------+------------------+----------------------------------+--------------------+------+---------+--------+---------+---------------------+-------------------+------------------+---------------+-------------+--------------------+-----------+\n",
+      "|             studyId|  projectId|studyType|     traitFromSource|traitFromSourceMappedIds|geneId|tissueFromSourceId|pubmedId|publicationTitle|publicationFirstAuthor|publicationDate|publicationJournal|backgroundTraitFromSourceMappedIds|   initialSampleSize|nCases|nControls|nSamples|  cohorts|ldPopulationStructure|   discoverySamples|replicationSamples|qualityControls|analysisFlags|summarystatsLocation|hasSumstats|\n",
+      "+--------------------+-----------+---------+--------------------+------------------------+------+------------------+--------+----------------+----------------------+---------------+------------------+----------------------------------+--------------------+------+---------+--------+---------+---------------------+-------------------+------------------+---------------+-------------+--------------------+-----------+\n",
+      "|FINNGEN_R10_AB1_A...|FINNGEN_R10|     gwas|       Actinomycosis|                    null|  null|              null|    null|            null|                  null|           null|              null|                              null|377,277 (210,870 ...|   101|   363227|  363328|[FinnGen]|         [{fin, 1.0}]|[{377277, Finnish}]|              null|           null|         null|gs://finngen-publ...|       true|\n",
+      "|FINNGEN_R10_AB1_A...|FINNGEN_R10|     gwas|          Amoebiasis|           [EFO_0007144]|  null|              null|    null|            null|                  null|           null|              null|                              null|377,277 (210,870 ...|   160|   367214|  367374|[FinnGen]|         [{fin, 1.0}]|[{377277, Finnish}]|              null|           null|         null|gs://finngen-publ...|       true|\n",
+      "|FINNGEN_R10_AB1_A...|FINNGEN_R10|     gwas|Anogenital herpes...|           [EFO_0007282]|  null|              null|    null|            null|                  null|           null|              null|                              null|377,277 (210,870 ...|  1986|   400197|  402183|[FinnGen]|         [{fin, 1.0}]|[{377277, Finnish}]|              null|           null|         null|gs://finngen-publ...|       true|\n",
+      "|FINNGEN_R10_AB1_A...|FINNGEN_R10|     gwas|       Aspergillosis|           [EFO_0007157]|  null|              null|    null|            null|                  null|           null|              null|                              null|377,277 (210,870 ...|   211|   403213|  403424|[FinnGen]|         [{fin, 1.0}]|[{377277, Finnish}]|              null|           null|         null|gs://finngen-publ...|       true|\n",
+      "|FINNGEN_R10_AB1_A...|FINNGEN_R10|     gwas|Atypical virus in...|         [MONDO_0024318]|  null|              null|    null|            null|                  null|           null|              null|                              null|377,277 (210,870 ...|   282|   409849|  410131|[FinnGen]|         [{fin, 1.0}]|[{377277, Finnish}]|              null|           null|         null|gs://finngen-publ...|       true|\n",
+      "|FINNGEN_R10_AB1_B...|FINNGEN_R10|     gwas|Bacterial infecti...|                    null|  null|              null|    null|            null|                  null|           null|              null|                              null|377,277 (210,870 ...| 20226|   363227|  383453|[FinnGen]|         [{fin, 1.0}]|[{377277, Finnish}]|              null|           null|         null|gs://finngen-publ...|       true|\n",
+      "|FINNGEN_R10_AB1_B...|FINNGEN_R10|     gwas|Bacterial, viral ...|                    null|  null|              null|    null|            null|                  null|           null|              null|                              null|377,277 (210,870 ...|  2852|   409329|  412181|[FinnGen]|         [{fin, 1.0}]|[{377277, Finnish}]|              null|           null|         null|gs://finngen-publ...|       true|\n",
+      "|FINNGEN_R10_AB1_B...|FINNGEN_R10|     gwas|Other bacterial i...|           [EFO_0000771]|  null|              null|    null|            null|                  null|           null|              null|                              null|377,277 (210,870 ...|  6145|   367214|  373359|[FinnGen]|         [{fin, 1.0}]|[{377277, Finnish}]|              null|           null|         null|gs://finngen-publ...|       true|\n",
+      "|FINNGEN_R10_AB1_C...|FINNGEN_R10|     gwas|         Candidiasis|         [MONDO_0002026]|  null|              null|    null|            null|                  null|           null|              null|                              null|377,277 (210,870 ...|  4306|   403213|  407519|[FinnGen]|         [{fin, 1.0}]|[{377277, Finnish}]|              null|           null|         null|gs://finngen-publ...|       true|\n",
+      "|FINNGEN_R10_AB1_C...|FINNGEN_R10|     gwas|Other sexually tr...|    [MONDO_0021681, E...|  null|              null|    null|            null|                  null|           null|              null|                              null|377,277 (210,870 ...|  2186|   400197|  402383|[FinnGen]|         [{fin, 1.0}]|[{377277, Finnish}]|              null|           null|         null|gs://finngen-publ...|       true|\n",
+      "|FINNGEN_R10_AB1_C...|FINNGEN_R10|     gwas|             Cholera|           [EFO_1001235]|  null|              null|    null|            null|                  null|           null|              null|                              null|377,277 (210,870 ...|  1385|   367214|  368599|[FinnGen]|         [{fin, 1.0}]|[{377277, Finnish}]|              null|           null|         null|gs://finngen-publ...|       true|\n",
+      "|FINNGEN_R10_AB1_D...|FINNGEN_R10|     gwas|Dengue fever [cla...|                    null|  null|              null|    null|            null|                  null|           null|              null|                              null|377,277 (210,870 ...|    53|   409137|  409190|[FinnGen]|         [{fin, 1.0}]|[{377277, Finnish}]|              null|           null|         null|gs://finngen-publ...|       true|\n",
+      "|FINNGEN_R10_AB1_D...|FINNGEN_R10|     gwas|     Dermatophytosis|         [MONDO_0004678]|  null|              null|    null|            null|                  null|           null|              null|                              null|377,277 (210,870 ...|  3921|   403213|  407134|[FinnGen]|         [{fin, 1.0}]|[{377277, Finnish}]|              null|           null|         null|gs://finngen-publ...|       true|\n",
+      "|FINNGEN_R10_AB1_E...|FINNGEN_R10|     gwas|      Early syphilis|           [EFO_0007504]|  null|              null|    null|            null|                  null|           null|              null|                              null|377,277 (210,870 ...|   308|   400197|  400505|[FinnGen]|         [{fin, 1.0}]|[{377277, Finnish}]|              null|           null|         null|gs://finngen-publ...|       true|\n",
+      "| FINNGEN_R10_AB1_EBV|FINNGEN_R10|     gwas|Infectious mononu...|           [EFO_0007326]|  null|              null|    null|            null|                  null|           null|              null|                              null|377,277 (210,870 ...|  2979|   400974|  403953|[FinnGen]|         [{fin, 1.0}]|[{377277, Finnish}]|              null|           null|         null|gs://finngen-publ...|       true|\n",
+      "|FINNGEN_R10_AB1_E...|FINNGEN_R10|     gwas|        Enterobiasis|                    null|  null|              null|    null|            null|                  null|           null|              null|                              null|377,277 (210,870 ...|   112|   411658|  411770|[FinnGen]|         [{fin, 1.0}]|[{377277, Finnish}]|              null|           null|         null|gs://finngen-publ...|       true|\n",
+      "|FINNGEN_R10_AB1_E...|FINNGEN_R10|     gwas|          Erysipelas|           [EFO_1001462]|  null|              null|    null|            null|                  null|           null|              null|                              null|377,277 (210,870 ...| 22261|   363227|  385488|[FinnGen]|         [{fin, 1.0}]|[{377277, Finnish}]|              null|           null|         null|gs://finngen-publ...|       true|\n",
+      "|FINNGEN_R10_AB1_G...|FINNGEN_R10|     gwas|Diarrhoea and gas...|         [MONDO_0045031]|  null|              null|    null|            null|                  null|           null|              null|                              null|377,277 (210,870 ...| 32210|   367214|  399424|[FinnGen]|         [{fin, 1.0}]|[{377277, Finnish}]|              null|           null|         null|gs://finngen-publ...|       true|\n",
+      "|FINNGEN_R10_AB1_G...|FINNGEN_R10|     gwas|Gonococcal infection|             [DOID_7551]|  null|              null|    null|            null|                  null|           null|              null|                              null|377,277 (210,870 ...|   954|   400197|  401151|[FinnGen]|         [{fin, 1.0}]|[{377277, Finnish}]|              null|           null|         null|gs://finngen-publ...|       true|\n",
+      "|FINNGEN_R10_AB1_H...|FINNGEN_R10|     gwas|       Helminthiases|    [EFO_0007245, EFO...|  null|              null|    null|            null|                  null|           null|              null|                              null|377,277 (210,870 ...|   523|   411658|  412181|[FinnGen]|         [{fin, 1.0}]|[{377277, Finnish}]|              null|           null|         null|gs://finngen-publ...|       true|\n",
+      "+--------------------+-----------+---------+--------------------+------------------------+------+------------------+--------+----------------+----------------------+---------------+------------------+----------------------------------+--------------------+------+---------+--------+---------+---------------------+-------------------+------------------+---------------+-------------+--------------------+-----------+\n",
+      "only showing top 20 rows\n",
+      "\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "                                                                                \r"
+     ]
+    }
+   ],
+   "source": [
+    "joined_df.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "array<string>\n"
+     ]
+    }
+   ],
+   "source": [
+    "column_type = dict(joined_df.dtypes)[\"traitFromSourceMappedIds\"]\n",
+    "print(column_type)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "si=StudyIndex(_df=joined_df, _schema=StudyIndex.get_schema())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "                                                                                \r"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "+--------------------+-----------+---------+--------------------+------------------------+------+------------------+--------+----------------+----------------------+---------------+------------------+----------------------------------+--------------------+------+---------+--------+---------+---------------------+-------------------+------------------+---------------+-------------+--------------------+-----------+\n",
+      "|             studyId|  projectId|studyType|     traitFromSource|traitFromSourceMappedIds|geneId|tissueFromSourceId|pubmedId|publicationTitle|publicationFirstAuthor|publicationDate|publicationJournal|backgroundTraitFromSourceMappedIds|   initialSampleSize|nCases|nControls|nSamples|  cohorts|ldPopulationStructure|   discoverySamples|replicationSamples|qualityControls|analysisFlags|summarystatsLocation|hasSumstats|\n",
+      "+--------------------+-----------+---------+--------------------+------------------------+------+------------------+--------+----------------+----------------------+---------------+------------------+----------------------------------+--------------------+------+---------+--------+---------+---------------------+-------------------+------------------+---------------+-------------+--------------------+-----------+\n",
+      "|FINNGEN_R10_AB1_A...|FINNGEN_R10|     gwas|       Actinomycosis|                    null|  null|              null|    null|            null|                  null|           null|              null|                              null|377,277 (210,870 ...|   101|   363227|  363328|[FinnGen]|         [{fin, 1.0}]|[{377277, Finnish}]|              null|           null|         null|gs://finngen-publ...|       true|\n",
+      "|FINNGEN_R10_AB1_A...|FINNGEN_R10|     gwas|          Amoebiasis|           [EFO_0007144]|  null|              null|    null|            null|                  null|           null|              null|                              null|377,277 (210,870 ...|   160|   367214|  367374|[FinnGen]|         [{fin, 1.0}]|[{377277, Finnish}]|              null|           null|         null|gs://finngen-publ...|       true|\n",
+      "|FINNGEN_R10_AB1_A...|FINNGEN_R10|     gwas|Anogenital herpes...|           [EFO_0007282]|  null|              null|    null|            null|                  null|           null|              null|                              null|377,277 (210,870 ...|  1986|   400197|  402183|[FinnGen]|         [{fin, 1.0}]|[{377277, Finnish}]|              null|           null|         null|gs://finngen-publ...|       true|\n",
+      "|FINNGEN_R10_AB1_A...|FINNGEN_R10|     gwas|       Aspergillosis|           [EFO_0007157]|  null|              null|    null|            null|                  null|           null|              null|                              null|377,277 (210,870 ...|   211|   403213|  403424|[FinnGen]|         [{fin, 1.0}]|[{377277, Finnish}]|              null|           null|         null|gs://finngen-publ...|       true|\n",
+      "|FINNGEN_R10_AB1_A...|FINNGEN_R10|     gwas|Atypical virus in...|         [MONDO_0024318]|  null|              null|    null|            null|                  null|           null|              null|                              null|377,277 (210,870 ...|   282|   409849|  410131|[FinnGen]|         [{fin, 1.0}]|[{377277, Finnish}]|              null|           null|         null|gs://finngen-publ...|       true|\n",
+      "|FINNGEN_R10_AB1_B...|FINNGEN_R10|     gwas|Bacterial infecti...|                    null|  null|              null|    null|            null|                  null|           null|              null|                              null|377,277 (210,870 ...| 20226|   363227|  383453|[FinnGen]|         [{fin, 1.0}]|[{377277, Finnish}]|              null|           null|         null|gs://finngen-publ...|       true|\n",
+      "|FINNGEN_R10_AB1_B...|FINNGEN_R10|     gwas|Bacterial, viral ...|                    null|  null|              null|    null|            null|                  null|           null|              null|                              null|377,277 (210,870 ...|  2852|   409329|  412181|[FinnGen]|         [{fin, 1.0}]|[{377277, Finnish}]|              null|           null|         null|gs://finngen-publ...|       true|\n",
+      "|FINNGEN_R10_AB1_B...|FINNGEN_R10|     gwas|Other bacterial i...|           [EFO_0000771]|  null|              null|    null|            null|                  null|           null|              null|                              null|377,277 (210,870 ...|  6145|   367214|  373359|[FinnGen]|         [{fin, 1.0}]|[{377277, Finnish}]|              null|           null|         null|gs://finngen-publ...|       true|\n",
+      "|FINNGEN_R10_AB1_C...|FINNGEN_R10|     gwas|         Candidiasis|         [MONDO_0002026]|  null|              null|    null|            null|                  null|           null|              null|                              null|377,277 (210,870 ...|  4306|   403213|  407519|[FinnGen]|         [{fin, 1.0}]|[{377277, Finnish}]|              null|           null|         null|gs://finngen-publ...|       true|\n",
+      "|FINNGEN_R10_AB1_C...|FINNGEN_R10|     gwas|Other sexually tr...|    [MONDO_0021681, E...|  null|              null|    null|            null|                  null|           null|              null|                              null|377,277 (210,870 ...|  2186|   400197|  402383|[FinnGen]|         [{fin, 1.0}]|[{377277, Finnish}]|              null|           null|         null|gs://finngen-publ...|       true|\n",
+      "|FINNGEN_R10_AB1_C...|FINNGEN_R10|     gwas|             Cholera|           [EFO_1001235]|  null|              null|    null|            null|                  null|           null|              null|                              null|377,277 (210,870 ...|  1385|   367214|  368599|[FinnGen]|         [{fin, 1.0}]|[{377277, Finnish}]|              null|           null|         null|gs://finngen-publ...|       true|\n",
+      "|FINNGEN_R10_AB1_D...|FINNGEN_R10|     gwas|Dengue fever [cla...|                    null|  null|              null|    null|            null|                  null|           null|              null|                              null|377,277 (210,870 ...|    53|   409137|  409190|[FinnGen]|         [{fin, 1.0}]|[{377277, Finnish}]|              null|           null|         null|gs://finngen-publ...|       true|\n",
+      "|FINNGEN_R10_AB1_D...|FINNGEN_R10|     gwas|     Dermatophytosis|         [MONDO_0004678]|  null|              null|    null|            null|                  null|           null|              null|                              null|377,277 (210,870 ...|  3921|   403213|  407134|[FinnGen]|         [{fin, 1.0}]|[{377277, Finnish}]|              null|           null|         null|gs://finngen-publ...|       true|\n",
+      "|FINNGEN_R10_AB1_E...|FINNGEN_R10|     gwas|      Early syphilis|           [EFO_0007504]|  null|              null|    null|            null|                  null|           null|              null|                              null|377,277 (210,870 ...|   308|   400197|  400505|[FinnGen]|         [{fin, 1.0}]|[{377277, Finnish}]|              null|           null|         null|gs://finngen-publ...|       true|\n",
+      "| FINNGEN_R10_AB1_EBV|FINNGEN_R10|     gwas|Infectious mononu...|           [EFO_0007326]|  null|              null|    null|            null|                  null|           null|              null|                              null|377,277 (210,870 ...|  2979|   400974|  403953|[FinnGen]|         [{fin, 1.0}]|[{377277, Finnish}]|              null|           null|         null|gs://finngen-publ...|       true|\n",
+      "|FINNGEN_R10_AB1_E...|FINNGEN_R10|     gwas|        Enterobiasis|                    null|  null|              null|    null|            null|                  null|           null|              null|                              null|377,277 (210,870 ...|   112|   411658|  411770|[FinnGen]|         [{fin, 1.0}]|[{377277, Finnish}]|              null|           null|         null|gs://finngen-publ...|       true|\n",
+      "|FINNGEN_R10_AB1_E...|FINNGEN_R10|     gwas|          Erysipelas|           [EFO_1001462]|  null|              null|    null|            null|                  null|           null|              null|                              null|377,277 (210,870 ...| 22261|   363227|  385488|[FinnGen]|         [{fin, 1.0}]|[{377277, Finnish}]|              null|           null|         null|gs://finngen-publ...|       true|\n",
+      "|FINNGEN_R10_AB1_G...|FINNGEN_R10|     gwas|Diarrhoea and gas...|         [MONDO_0045031]|  null|              null|    null|            null|                  null|           null|              null|                              null|377,277 (210,870 ...| 32210|   367214|  399424|[FinnGen]|         [{fin, 1.0}]|[{377277, Finnish}]|              null|           null|         null|gs://finngen-publ...|       true|\n",
+      "|FINNGEN_R10_AB1_G...|FINNGEN_R10|     gwas|Gonococcal infection|             [DOID_7551]|  null|              null|    null|            null|                  null|           null|              null|                              null|377,277 (210,870 ...|   954|   400197|  401151|[FinnGen]|         [{fin, 1.0}]|[{377277, Finnish}]|              null|           null|         null|gs://finngen-publ...|       true|\n",
+      "|FINNGEN_R10_AB1_H...|FINNGEN_R10|     gwas|       Helminthiases|    [EFO_0007245, EFO...|  null|              null|    null|            null|                  null|           null|              null|                              null|377,277 (210,870 ...|   523|   411658|  412181|[FinnGen]|         [{fin, 1.0}]|[{377277, Finnish}]|              null|           null|         null|gs://finngen-publ...|       true|\n",
+      "+--------------------+-----------+---------+--------------------+------------------------+------+------------------+--------+----------------+----------------------+---------------+------------------+----------------------------------+--------------------+------+---------+--------+---------+---------------------+-------------------+------------------+---------------+-------------+--------------------+-----------+\n",
+      "only showing top 20 rows\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "si.df.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 31,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "                                                                                \r"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "2408"
+      ]
+     },
+     "execution_count": 31,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "si.df.count()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 34,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "                                                                                \r"
+     ]
+    }
+   ],
+   "source": [
+    "si.df.write.parquet(path=\"gs://genetics-portal-dev-analysis/yt4/study_index_finngen_with_efo\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 35,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "path_to_study_index=\"gs://genetics-portal-dev-analysis/yt4/study_index_finngen_with_efo\"\n",
+    "si=StudyIndex.from_parquet(session=session, path=path_to_study_index)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "gentropy-krNFZEZg-py3.10",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

From 900dd649779c2727e44c30b11671b0e3c7261036 Mon Sep 17 00:00:00 2001
From: Daniel-Considine <113430683+Daniel-Considine@users.noreply.github.com>
Date: Tue, 23 Apr 2024 13:33:16 +0100
Subject: [PATCH 06/21] feat: adding init to finemapping step (#577)

* feat: adding init to finemapping step

* fix: removing some commented lines

* chore: fixing indents

* fix: schema

* feat: changing output path to include studyLocusId mapped

---------

Co-authored-by: Yakov <yt4@sanger.ac.uk>
---
 src/gentropy/susie_finemapper.py | 51 ++++++++++++++++++++++++++++++--
 1 file changed, 49 insertions(+), 2 deletions(-)

diff --git a/src/gentropy/susie_finemapper.py b/src/gentropy/susie_finemapper.py
index d37298436..7b6f81b3a 100644
--- a/src/gentropy/susie_finemapper.py
+++ b/src/gentropy/susie_finemapper.py
@@ -28,6 +28,47 @@ class SusieFineMapperStep:
     In the future this step will be refactored and moved to the methods module.
     """
 
+    def __init__(
+        self,
+        session: Session,
+        study_locus_to_finemap: str,
+        study_locus_collected_path: str,
+        study_index_path: str,
+        output_path: str,
+        locus_radius: int = 500_000,
+        locus_L: int = 10,
+    ) -> None:
+        """Run fine-mapping on a studyLocusId from a collected studyLocus table.
+
+        Args:
+            session (Session): Spark session
+            study_locus_to_finemap (str): path to the study locus to fine-map
+            study_locus_collected_path (str): path to the collected study locus
+            study_index_path (str): path to the study index
+            output_path (str): path to the output
+            locus_radius (int): Radius of base-pair window around the locus, default is 500_000
+            locus_L (int): Maximum number of causal variants in locus, default is 10
+        """
+        # Read studyLocus
+        study_locus = (
+            StudyLocus.from_parquet(session, study_locus_collected_path)
+            .df.filter(f.col("studyLocusId") == study_locus_to_finemap)
+            .collect()[0]
+        )
+        study_index = StudyIndex.from_parquet(session, study_index_path)
+        # Run fine-mapping
+        result = self.susie_finemapper_ss_gathered(
+            session,
+            study_locus,
+            study_index,
+            locus_radius * 2,
+            locus_L,
+        )
+        # Write result
+        result.df.write.mode(session.write_mode).parquet(
+            output_path + "/" + study_locus_to_finemap
+        )
+
     @staticmethod
     def susie_finemapper_one_studylocus_row(
         GWAS: SummaryStatistics,
@@ -317,9 +358,15 @@ def susie_finemapper_ss_gathered(
             + str(int(position + window / 2))
         )
 
+        schema = StudyLocus.get_schema()
+        gwas_df = session.spark.createDataFrame([study_locus_row], schema=schema)
+        exploded_df = gwas_df.select(f.explode("locus").alias("locus"))
+
+        result_df = exploded_df.select(
+            "locus.variantId", "locus.beta", "locus.standardError"
+        )
         gwas_df = (
-            session.spark.createDataFrame(study_locus_row.locus)
-            .withColumn("z", f.col("beta") / f.col("standardError"))
+            result_df.withColumn("z", f.col("beta") / f.col("standardError"))
             .withColumn("chromosome", f.split(f.col("variantId"), "_")[0])
             .withColumn("position", f.split(f.col("variantId"), "_")[1])
             .filter(f.col("z").isNotNull())

From 78fcf1b85a72751fb6a7a006b3769d8170348aed Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Irene=20L=C3=B3pez?=
 <45119610+ireneisdoomed@users.noreply.github.com>
Date: Tue, 23 Apr 2024 13:55:35 +0100
Subject: [PATCH 07/21] feat: dockerise gentropy python package (#579)

Co-authored-by: David Ochoa <ochoa@ebi.ac.uk>
---
 Dockerfile | 8 ++++++++
 1 file changed, 8 insertions(+)
 create mode 100644 Dockerfile

diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 000000000..1221ec637
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,8 @@
+FROM python:3.10
+
+RUN pip install poetry==1.7.1
+
+COPY . .
+RUN poetry install --without dev,docs,tests
+
+ENTRYPOINT ["poetry", "run", "gentropy"]

From 82b8a7c539d6ee669a73c09db1d30783eec41d69 Mon Sep 17 00:00:00 2001
From: Daniel-Considine <113430683+Daniel-Considine@users.noreply.github.com>
Date: Tue, 23 Apr 2024 14:27:50 +0100
Subject: [PATCH 08/21] feat: updating step config file (#580)

---
 src/gentropy/config.py           | 19 +++++++++++++++++++
 src/gentropy/susie_finemapper.py |  6 +++---
 2 files changed, 22 insertions(+), 3 deletions(-)

diff --git a/src/gentropy/config.py b/src/gentropy/config.py
index 127d90844..0a76b2c84 100644
--- a/src/gentropy/config.py
+++ b/src/gentropy/config.py
@@ -328,6 +328,24 @@ class WindowBasedClumpingStep(StepConfig):
     _target_: str = "gentropy.window_based_clumping.WindowBasedClumpingStep"
 
 
+@dataclass
+class FinemapperConfig(StepConfig):
+    """SuSiE fine-mapper step configuration."""
+
+    session: Any = field(
+        default_factory=lambda: {
+            "start_hail": True,
+        }
+    )
+    study_locus_to_finemap: str = MISSING
+    study_locus_collected_path: str = MISSING
+    study_index_path: str = MISSING
+    output_path: str = MISSING
+    locus_radius: int = MISSING
+    locus_l: int = MISSING
+    _target_: str = "gentropy.susie_finemapping.SusieFineMapperStep"
+
+
 @dataclass
 class Config:
     """Application configuration."""
@@ -385,3 +403,4 @@ def register_config() -> None:
     cs.store(group="step", name="variant_index", node=VariantIndexConfig)
     cs.store(group="step", name="variant_to_gene", node=VariantToGeneConfig)
     cs.store(group="step", name="window_based_clumping", node=WindowBasedClumpingStep)
+    cs.store(group="step", name="susie_finemapping", node=FinemapperConfig)
diff --git a/src/gentropy/susie_finemapper.py b/src/gentropy/susie_finemapper.py
index 7b6f81b3a..5a0fa31f3 100644
--- a/src/gentropy/susie_finemapper.py
+++ b/src/gentropy/susie_finemapper.py
@@ -36,7 +36,7 @@ def __init__(
         study_index_path: str,
         output_path: str,
         locus_radius: int = 500_000,
-        locus_L: int = 10,
+        locus_l: int = 10,
     ) -> None:
         """Run fine-mapping on a studyLocusId from a collected studyLocus table.
 
@@ -47,7 +47,7 @@ def __init__(
             study_index_path (str): path to the study index
             output_path (str): path to the output
             locus_radius (int): Radius of base-pair window around the locus, default is 500_000
-            locus_L (int): Maximum number of causal variants in locus, default is 10
+            locus_l (int): Maximum number of causal variants in locus, default is 10
         """
         # Read studyLocus
         study_locus = (
@@ -62,7 +62,7 @@ def __init__(
             study_locus,
             study_index,
             locus_radius * 2,
-            locus_L,
+            locus_l,
         )
         # Write result
         result.df.write.mode(session.write_mode).parquet(

From b0c4530f8925a3d8123d678623d91e5743368ac2 Mon Sep 17 00:00:00 2001
From: Daniel-Considine <113430683+Daniel-Considine@users.noreply.github.com>
Date: Tue, 23 Apr 2024 16:22:38 +0100
Subject: [PATCH 09/21] fix: update error in config.py (#583)

---
 src/gentropy/config.py           | 2 +-
 src/gentropy/susie_finemapper.py | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/gentropy/config.py b/src/gentropy/config.py
index 0a76b2c84..8c876bb6d 100644
--- a/src/gentropy/config.py
+++ b/src/gentropy/config.py
@@ -343,7 +343,7 @@ class FinemapperConfig(StepConfig):
     output_path: str = MISSING
     locus_radius: int = MISSING
     locus_l: int = MISSING
-    _target_: str = "gentropy.susie_finemapping.SusieFineMapperStep"
+    _target_: str = "gentropy.susie_finemapper.SusieFineMapperStep"
 
 
 @dataclass
diff --git a/src/gentropy/susie_finemapper.py b/src/gentropy/susie_finemapper.py
index 5a0fa31f3..ceb5fbb34 100644
--- a/src/gentropy/susie_finemapper.py
+++ b/src/gentropy/susie_finemapper.py
@@ -5,6 +5,7 @@
 import time
 from typing import Any
 
+import hail as hl
 import numpy as np
 import pandas as pd
 import pyspark.sql.functions as f
@@ -49,6 +50,7 @@ def __init__(
             locus_radius (int): Radius of base-pair window around the locus, default is 500_000
             locus_l (int): Maximum number of causal variants in locus, default is 10
         """
+        hl.init(sc=session.spark.sparkContext, log="/dev/null")
         # Read studyLocus
         study_locus = (
             StudyLocus.from_parquet(session, study_locus_collected_path)

From 28a067cd19573eaae49078df777fd72dbcd4dcc4 Mon Sep 17 00:00:00 2001
From: Daniel-Considine <113430683+Daniel-Considine@users.noreply.github.com>
Date: Tue, 23 Apr 2024 18:01:36 +0100
Subject: [PATCH 10/21] feat: changing locus window to locus radius to be
 consistent with other functions (#582)

---
 src/gentropy/datasource/gnomad/ld.py |  8 ++--
 src/gentropy/susie_finemapper.py     | 55 ++++++++++++++--------------
 2 files changed, 32 insertions(+), 31 deletions(-)

diff --git a/src/gentropy/datasource/gnomad/ld.py b/src/gentropy/datasource/gnomad/ld.py
index a19cbb06b..02cb15c7f 100644
--- a/src/gentropy/datasource/gnomad/ld.py
+++ b/src/gentropy/datasource/gnomad/ld.py
@@ -453,14 +453,14 @@ def get_ld_matrix_slice(
     def get_locus_index(
         self: GnomADLDMatrix,
         study_locus_row: Row,
-        window_size: int = 1_000_000,
+        radius: int = 500_000,
         major_population: str = "nfe",
     ) -> DataFrame:
         """Extract hail matrix index from StudyLocus rows.
 
         Args:
             study_locus_row (Row): Study-locus row
-            window_size (int): Window size to extract from gnomad matrix
+            radius (int): Locus radius to extract from gnomad matrix
             major_population (str): Major population to extract from gnomad matrix, default is "nfe"
 
         Returns:
@@ -468,8 +468,8 @@ def get_locus_index(
 
         """
         chromosome = str("chr" + study_locus_row["chromosome"])
-        start = study_locus_row["position"] - window_size // 2
-        end = study_locus_row["position"] + window_size // 2
+        start = study_locus_row["position"] - radius
+        end = study_locus_row["position"] + radius
 
         liftover_ht = hl.read_table(self.liftover_ht_path)
         liftover_ht = (
diff --git a/src/gentropy/susie_finemapper.py b/src/gentropy/susie_finemapper.py
index ceb5fbb34..24635934e 100644
--- a/src/gentropy/susie_finemapper.py
+++ b/src/gentropy/susie_finemapper.py
@@ -37,7 +37,7 @@ def __init__(
         study_index_path: str,
         output_path: str,
         locus_radius: int = 500_000,
-        locus_l: int = 10,
+        max_causal_snps: int = 10,
     ) -> None:
         """Run fine-mapping on a studyLocusId from a collected studyLocus table.
 
@@ -48,8 +48,9 @@ def __init__(
             study_index_path (str): path to the study index
             output_path (str): path to the output
             locus_radius (int): Radius of base-pair window around the locus, default is 500_000
-            locus_l (int): Maximum number of causal variants in locus, default is 10
+            max_causal_snps (int): Maximum number of causal variants in locus, default is 10
         """
+        # Initialise Hail
         hl.init(sc=session.spark.sparkContext, log="/dev/null")
         # Read studyLocus
         study_locus = (
@@ -63,8 +64,8 @@ def __init__(
             session,
             study_locus,
             study_index,
-            locus_radius * 2,
-            locus_l,
+            locus_radius,
+            max_causal_snps,
         )
         # Write result
         result.df.write.mode(session.write_mode).parquet(
@@ -77,7 +78,7 @@ def susie_finemapper_one_studylocus_row(
         session: Session,
         study_locus_row: Row,
         study_index: StudyIndex,
-        window: int = 1_000_000,
+        radius: int = 1_000_000,
         L: int = 10,
     ) -> StudyLocus:
         """Susie fine-mapper for StudyLocus row with SummaryStatistics object.
@@ -87,7 +88,7 @@ def susie_finemapper_one_studylocus_row(
             session (Session): Spark session
             study_locus_row (Row): StudyLocus row
             study_index (StudyIndex): StudyIndex object
-            window (int): window size for fine-mapping
+            radius (int): Radius in base-pairs of window for fine-mapping
             L (int): number of causal variants
 
         Returns:
@@ -112,9 +113,9 @@ def susie_finemapper_one_studylocus_row(
         region = (
             chromosome
             + ":"
-            + str(int(position - window / 2))
+            + str(int(position - radius))
             + "-"
-            + str(int(position + window / 2))
+            + str(int(position + radius))
         )
 
         gwas_df = (
@@ -133,7 +134,7 @@ def susie_finemapper_one_studylocus_row(
             GnomADLDMatrix()
             .get_locus_index(
                 study_locus_row=study_locus_row,
-                window_size=window,
+                radius=radius,
                 major_population=major_population,
             )
             .withColumn(
@@ -321,8 +322,8 @@ def susie_finemapper_ss_gathered(
         session: Session,
         study_locus_row: Row,
         study_index: StudyIndex,
-        window: int = 1_000_000,
-        L: int = 10,
+        radius: int = 1_000_000,
+        max_causal_snps: int = 10,
     ) -> StudyLocus:
         """Susie fine-mapper for StudyLocus row with locus annotated summary statistics.
 
@@ -330,8 +331,8 @@ def susie_finemapper_ss_gathered(
             session (Session): Spark session
             study_locus_row (Row): StudyLocus row
             study_index (StudyIndex): StudyIndex object
-            window (int): window size for fine-mapping
-            L (int): number of causal variants
+            radius (int): Radius in base-pairs of window for fine-mapping
+            max_causal_snps (int): number of causal variants
 
         Returns:
             StudyLocus: StudyLocus object with fine-mapped credible sets
@@ -355,9 +356,9 @@ def susie_finemapper_ss_gathered(
         region = (
             chromosome
             + ":"
-            + str(int(position - window / 2))
+            + str(int(position - radius))
             + "-"
-            + str(int(position + window / 2))
+            + str(int(position + radius))
         )
 
         schema = StudyLocus.get_schema()
@@ -382,7 +383,7 @@ def susie_finemapper_ss_gathered(
             GnomADLDMatrix()
             .get_locus_index(
                 study_locus_row=study_locus_row,
-                window_size=window,
+                radius=radius,
                 major_population=major_population,
             )
             .withColumn(
@@ -412,7 +413,7 @@ def susie_finemapper_ss_gathered(
         z_to_fm = np.array(pd_df["z"])
         ld_to_fm = gnomad_ld
 
-        susie_output = SUSIE_inf.susie_inf(z=z_to_fm, LD=ld_to_fm, L=L)
+        susie_output = SUSIE_inf.susie_inf(z=z_to_fm, LD=ld_to_fm, L=max_causal_snps)
 
         schema = StructType(
             [
@@ -624,8 +625,8 @@ def susie_finemapper_one_studylocus_row_v2_dev(
         session: Session,
         study_locus_row: Row,
         study_index: StudyIndex,
-        window: int = 1_000_000,
-        L: int = 10,
+        radius: int = 1_000_000,
+        max_causal_snps: int = 10,
         susie_est_tausq: bool = False,
         run_carma: bool = False,
         run_sumstat_imputation: bool = False,
@@ -641,8 +642,8 @@ def susie_finemapper_one_studylocus_row_v2_dev(
             session (Session): Spark session
             study_locus_row (Row): StudyLocus row
             study_index (StudyIndex): StudyIndex object
-            window (int): window size for fine-mapping
-            L (int): number of causal variants
+            radius (int): Radius in base-pairs of window for fine-mapping
+            max_causal_snps (int): maximum number of causal variants
             susie_est_tausq (bool): estimate tau squared, default is False
             run_carma (bool): run CARMA, default is False
             run_sumstat_imputation (bool): run summary statistics imputation, default is False
@@ -673,9 +674,9 @@ def susie_finemapper_one_studylocus_row_v2_dev(
         region = (
             chromosome
             + ":"
-            + str(int(position - window / 2))
+            + str(int(position - radius))
             + "-"
-            + str(int(position + window / 2))
+            + str(int(position + radius))
         )
         gwas_df = (
             GWAS.df.withColumn("z", f.col("beta") / f.col("standardError"))
@@ -686,15 +687,15 @@ def susie_finemapper_one_studylocus_row_v2_dev(
             .filter(f.col("studyId") == studyId)
             .filter(f.col("z").isNotNull())
             .filter(f.col("chromosome") == chromosome)
-            .filter(f.col("position") >= position - window / 2)
-            .filter(f.col("position") <= position + window / 2)
+            .filter(f.col("position") >= position - radius)
+            .filter(f.col("position") <= position + radius)
         )
 
         ld_index = (
             GnomADLDMatrix()
             .get_locus_index(
                 study_locus_row=study_locus_row,
-                window_size=window,
+                radius=radius,
                 major_population=major_population,
             )
             .withColumn(
@@ -719,7 +720,7 @@ def susie_finemapper_one_studylocus_row_v2_dev(
             GWAS_df=gwas_df,
             ld_index=ld_index,
             gnomad_ld=gnomad_ld,
-            L=L,
+            L=max_causal_snps,
             session=session,
             studyId=studyId,
             region=region,

From e2f8e870725b282d430b5e7f2fb140d30d66ecc1 Mon Sep 17 00:00:00 2001
From: Daniel Suveges <daniel.suveges@protonmail.com>
Date: Wed, 24 Apr 2024 10:29:22 +0200
Subject: [PATCH 11/21] fix: minor updates and bug fixes (#543)

---
 config/step/ot_variant_index.yaml           | 2 +-
 src/airflow/dags/genetics_etl.py            | 8 ++++----
 src/airflow/dags/gwas_catalog_preprocess.py | 8 ++------
 src/gentropy/pics.py                        | 8 ++++++--
 4 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/config/step/ot_variant_index.yaml b/config/step/ot_variant_index.yaml
index 1625c7126..3834196b2 100644
--- a/config/step/ot_variant_index.yaml
+++ b/config/step/ot_variant_index.yaml
@@ -2,5 +2,5 @@ defaults:
   - variant_index
 
 variant_annotation_path: ${datasets.variant_annotation}
-credible_set_path: ${datasets.study_locus}
+credible_set_path: ${datasets.credible_set}
 variant_index_path: ${datasets.variant_index}
diff --git a/src/airflow/dags/genetics_etl.py b/src/airflow/dags/genetics_etl.py
index 55f343648..ae510131c 100644
--- a/src/airflow/dags/genetics_etl.py
+++ b/src/airflow/dags/genetics_etl.py
@@ -35,16 +35,16 @@
     # PICS credible sets from GWAS Catalog curated associations:
     "gwas_catalog_curated_credible_set": {
         "source_bucket": GWAS_CATALOG_BUCKET_NAME,
-        "source_object": "credible_set_datasets/gwas_catalog_curated",
+        "source_object": "credible_set_datasets/gwas_catalog_PICSed_curated_associations",
         "destination_bucket": RELEASE_BUCKET_NAME,
-        "destination_object": f"releases/{RELEASE_VERSION}/credible_set/gwas_catalog_pics_from_curation",
+        "destination_object": f"releases/{RELEASE_VERSION}/credible_set/gwas_catalog_PICSed_curated_associations",
     },
     # PICS credible sets from GWAS Catalog summary statistics:
     "gwas_catalog_sumstats_credible_set": {
         "source_bucket": GWAS_CATALOG_BUCKET_NAME,
-        "source_object": "credible_set_datasets/gwas_catalog_summary_stats",
+        "source_object": "credible_set_datasets/gwas_catalog_PICSed_summary_statistics",
         "destination_bucket": RELEASE_BUCKET_NAME,
-        "destination_object": f"releases/{RELEASE_VERSION}/credible_set/gwas_catalog_pics_from_summary_statistics",
+        "destination_object": f"releases/{RELEASE_VERSION}/credible_set/gwas_catalog_PICSed_summary_statistics",
     },
     # GWAS Catalog manifest files:
     "gwas_catalog_manifests": {
diff --git a/src/airflow/dags/gwas_catalog_preprocess.py b/src/airflow/dags/gwas_catalog_preprocess.py
index 36130c87e..1814ddf2d 100644
--- a/src/airflow/dags/gwas_catalog_preprocess.py
+++ b/src/airflow/dags/gwas_catalog_preprocess.py
@@ -45,12 +45,8 @@
 WINDOW_BASED_CLUMPED = f"gs://{GWAS_CATALOG_BUCKET_NAME}/study_locus_datasets/gwas_catalog_summary_stats_window_clumped"
 LD_BASED_CLUMPED = f"gs://{GWAS_CATALOG_BUCKET_NAME}/study_locus_datasets/gwas_catalog_summary_stats_ld_clumped"
 # Credible sets:
-CURATED_CREDIBLE_SETS = (
-    f"gs://{GWAS_CATALOG_BUCKET_NAME}/credible_set_datasets/gwas_catalog_curated"
-)
-SUMMARY_STATISTICS_CREDIBLE_SETS = (
-    f"gs://{GWAS_CATALOG_BUCKET_NAME}/credible_set_datasets/gwas_catalog_summary_stats"
-)
+CURATED_CREDIBLE_SETS = f"gs://{GWAS_CATALOG_BUCKET_NAME}/credible_set_datasets/gwas_catalog_PICSed_curated_associations"
+SUMMARY_STATISTICS_CREDIBLE_SETS = f"gs://{GWAS_CATALOG_BUCKET_NAME}/credible_set_datasets/gwas_catalog_PICSed_summary_statistics"
 
 
 def upload_harmonized_study_list(
diff --git a/src/gentropy/pics.py b/src/gentropy/pics.py
index c2ed9bf66..80421b9ae 100644
--- a/src/gentropy/pics.py
+++ b/src/gentropy/pics.py
@@ -3,7 +3,7 @@
 from __future__ import annotations
 
 from gentropy.common.session import Session
-from gentropy.dataset.study_locus import StudyLocus
+from gentropy.dataset.study_locus import CredibleInterval, StudyLocus
 from gentropy.method.pics import PICS
 
 
@@ -28,6 +28,10 @@ def __init__(
             session, study_locus_ld_annotated_in
         )
         # PICS
-        picsed_sl = PICS.finemap(study_locus_ld_annotated).annotate_credible_sets()
+        picsed_sl = (
+            PICS.finemap(study_locus_ld_annotated)
+            .annotate_credible_sets()
+            .filter_credible_set(credible_interval=CredibleInterval.IS99)
+        )
         # Write
         picsed_sl.df.write.mode(session.write_mode).parquet(picsed_study_locus_out)

From cf184f8d8a3d88c02e056995e74549f141cc6348 Mon Sep 17 00:00:00 2001
From: Daniel-Considine <113430683+Daniel-Considine@users.noreply.github.com>
Date: Wed, 24 Apr 2024 09:43:46 +0100
Subject: [PATCH 12/21] fix: updating config.py argument for finemapper (#584)

---
 src/gentropy/config.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gentropy/config.py b/src/gentropy/config.py
index 8c876bb6d..068bd42bf 100644
--- a/src/gentropy/config.py
+++ b/src/gentropy/config.py
@@ -342,7 +342,7 @@ class FinemapperConfig(StepConfig):
     study_index_path: str = MISSING
     output_path: str = MISSING
     locus_radius: int = MISSING
-    locus_l: int = MISSING
+    max_causal_snps: int = MISSING
     _target_: str = "gentropy.susie_finemapper.SusieFineMapperStep"
 
 

From bcc9a364a475136dbd7350bc6c9a7ca4da252971 Mon Sep 17 00:00:00 2001
From: Yakov <yt4@sanger.ac.uk>
Date: Wed, 24 Apr 2024 09:55:51 +0100
Subject: [PATCH 13/21] feat(sumstat qc): adding methods for QC of summary
 statistics (#455)

* feat(sumstat qc): class for GWAS sumstat QC

* feat: small fixes

* feat: modifying sumarry_statistics class

* feat: adding qc_pz_check

* fix: test fix

* feat: adding se_n check

* chore: fixing functions

* feat: adding gc lambda check

* test: adding tests

* fix: fixing tests

* fix: fixing tests

* fix: fixing tests and adding rounding

* chore: pre-commit auto fixes [...]

* feat(sumstat qc): adding new method class

* test: fix tests

* fix(sumstat qc): fixing tests

* fix: remove number_of_snps from susmtat class

* test: add multiple studyId test

* docs: adding documentation for sumstat QC

* fix: fixes according to comments

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 .../methods/sumstat_quality_controls.md       |  18 ++
 src/gentropy/dataset/summary_statistics.py    |  26 ++
 .../method/sumstat_quality_controls.py        | 285 ++++++++++++++++++
 tests/gentropy/method/test_qc_of_sumstats.py  |  63 ++++
 4 files changed, 392 insertions(+)
 create mode 100644 docs/python_api/methods/sumstat_quality_controls.md
 create mode 100644 src/gentropy/method/sumstat_quality_controls.py
 create mode 100644 tests/gentropy/method/test_qc_of_sumstats.py

diff --git a/docs/python_api/methods/sumstat_quality_controls.md b/docs/python_api/methods/sumstat_quality_controls.md
new file mode 100644
index 000000000..dfc5c9d16
--- /dev/null
+++ b/docs/python_api/methods/sumstat_quality_controls.md
@@ -0,0 +1,18 @@
+---
+title: QC of GWAS Summary Statistics
+---
+
+This class consists of several general quality control checks for GWAS with full summary statistics.
+There are several checks included:
+
+1. Genomic control lambda (median of the distribution of Chi2 statistics divided by expected for Chi2 with df=1). Lambda should be reasonably close to 1. Ideally not bigger than 2.
+
+2. P-Z check: the linear regression between log10 of reported p-values and log10 of p-values inferred from betas and standard errors. Intercept of the regression should be close to 0, slope close to 1.
+
+3. Mean beta check: mean of beta. Should be close to 0.
+
+4. The N_eff check: It estimates the ratio between the effective sample size and the expected one and checks its distribution. It can only be conducted if the effective allele frequency is provided in the study. The median ratio is always close to 1, standard error should be close to 0.
+
+5. Number of SNPs and number of significant SNPs.
+
+:::gentropy.method.sumstat_quality_controls.SummaryStatisticsQC
diff --git a/src/gentropy/dataset/summary_statistics.py b/src/gentropy/dataset/summary_statistics.py
index 6cde03988..6244d5879 100644
--- a/src/gentropy/dataset/summary_statistics.py
+++ b/src/gentropy/dataset/summary_statistics.py
@@ -103,3 +103,29 @@ def exclude_region(self: SummaryStatistics, region: str) -> SummaryStatistics:
             ),
             _schema=SummaryStatistics.get_schema(),
         )
+
+    def sanity_filter(self: SummaryStatistics) -> SummaryStatistics:
+        """The function filters the summary statistics by sanity filters.
+
+        The function filters the summary statistics by the following filters:
+            - The p-value should not be equal to 1.
+            - The beta and se should not be equal to 0.
+            - The p-value, beta and se should not be NaN.
+
+        Returns:
+            SummaryStatistics: The filtered summary statistics.
+        """
+        gwas_df = self._df
+        gwas_df = gwas_df.dropna(
+            subset=["beta", "standardError", "pValueMantissa", "pValueExponent"]
+        )
+
+        gwas_df = gwas_df.filter((f.col("beta") != 0) & (f.col("standardError") != 0))
+        gwas_df = gwas_df.filter(
+            f.col("pValueMantissa") * 10 ** f.col("pValueExponent") != 1
+        )
+
+        return SummaryStatistics(
+            _df=gwas_df,
+            _schema=SummaryStatistics.get_schema(),
+        )
diff --git a/src/gentropy/method/sumstat_quality_controls.py b/src/gentropy/method/sumstat_quality_controls.py
new file mode 100644
index 000000000..2858f4813
--- /dev/null
+++ b/src/gentropy/method/sumstat_quality_controls.py
@@ -0,0 +1,285 @@
+"""Summary statistics quality control methods."""
+from __future__ import annotations
+
+import numpy as np
+import pyspark.sql.functions as f
+import pyspark.sql.types as t
+import scipy as sc
+from pyspark.sql import DataFrame
+from pyspark.sql.functions import expr, log10, row_number
+from pyspark.sql.window import Window
+from scipy.stats import chi2
+
+from gentropy.dataset.summary_statistics import SummaryStatistics
+
+
+class SummaryStatisticsQC:
+    """Summary statistics QC methods.
+
+    This module contains methods for quality control of GWAS summary statistics.
+    The list of methods includes:
+
+        - sumstat_qc_beta_check: This is the mean beta check. The mean beta should be close to 0.
+
+        - sumstat_qc_pz_check: This is the PZ check. It runs a linear regression between reported p-values and p-values inferred from z-scores.
+
+        - sumstat_n_eff_check: This is the effective sample size check. It estimates the ratio between the effective sample size and the expected one and checks its distribution.
+
+        - gc_lambda_check: This is the genomic control lambda check.
+
+        - number_of_snps: This function calculates the number of SNPs and the number of SNPs with a p-value less than 5e-8.
+    """
+
+    @staticmethod
+    def sumstat_qc_beta_check(
+        gwas_for_qc: SummaryStatistics,
+    ) -> DataFrame:
+        """The mean beta check for QC of GWAS summary statistics.
+
+        Args:
+            gwas_for_qc (SummaryStatistics): The instance of the SummaryStatistics class.
+
+        Returns:
+            DataFrame: PySpark DataFrame with the mean beta for each study.
+        """
+        gwas_df = gwas_for_qc._df
+        qc_c = gwas_df.groupBy("studyId").agg(
+            f.mean("beta").alias("mean_beta"),
+        )
+        return qc_c
+
+    @staticmethod
+    def _calculate_logpval(z2: float) -> float:
+        """Calculate negative log10-pval from Z-score.
+
+        Args:
+            z2 (float): Z-score squared.
+
+        Returns:
+            float: Negative log10-pval.
+
+        Examples:
+            >>> SummaryStatisticsQC._calculate_logpval(1.0)
+            0.49851554582799334
+        """
+        logpval = -np.log10(sc.stats.chi2.sf((z2), 1))
+        return float(logpval)
+
+    @staticmethod
+    def sumstat_qc_pz_check(
+        gwas_for_qc: SummaryStatistics,
+        limit: int = 10_000_000,
+    ) -> DataFrame:
+        """The PZ check for QC of GWAS summary statistics. It runs linear regression between reported p-values and p-values inferred from z-scores.
+
+        Args:
+            gwas_for_qc (SummaryStatistics): The instance of the SummaryStatistics class.
+            limit (int): The limit for the number of variants to be used for the estimation.
+
+        Returns:
+            DataFrame: PySpark DataFrame with the results of the linear regression for each study.
+        """
+        gwas_df = gwas_for_qc._df
+
+        calculate_logpval_udf = f.udf(
+            SummaryStatisticsQC._calculate_logpval, t.DoubleType()
+        )
+
+        window = Window.partitionBy("studyId").orderBy("studyId")
+
+        gwas_df = (
+            gwas_df.withColumn("row_num", row_number().over(window))
+            .filter(f.col("row_num") <= limit)
+            .drop("row_num")
+        )
+
+        qc_c = (
+            gwas_df.withColumn("zscore", f.col("beta") / f.col("standardError"))
+            .withColumn("new_logpval", calculate_logpval_udf(f.col("zscore") ** 2))
+            .withColumn("log_mantissa", log10("pValueMantissa"))
+            .withColumn(
+                "diffpval",
+                -f.col("log_mantissa") - f.col("pValueExponent") - f.col("new_logpval"),
+            )
+            .groupBy("studyId")
+            .agg(
+                f.mean("diffpval").alias("mean_diff_pz"),
+                f.stddev("diffpval").alias("se_diff_pz"),
+            )
+            .select("studyId", "mean_diff_pz", "se_diff_pz")
+        )
+
+        return qc_c
+
+    @staticmethod
+    def sumstat_n_eff_check(
+        gwas_for_qc: SummaryStatistics,
+        n_total: int = 100_000,
+        limit: int = 10_000_000,
+        min_count: int = 100,
+    ) -> DataFrame:
+        """The effective sample size check for QC of GWAS summary statistics.
+
+        It estimates the ratio between effective sample size and the expected one and checks its distribution.
+        It is possible to conduct only if the effective allele frequency is provided in the study.
+        The median ratio is always close to 1, but standard error could be inflated.
+
+        Args:
+            gwas_for_qc (SummaryStatistics): The instance of the SummaryStatistics class.
+            n_total (int): The reported sample size of the study. The QC metric is robust to the sample size.
+            limit (int): The limit for the number of variants to be used for the estimation.
+            min_count (int): The minimum number of variants to be used for the estimation.
+
+        Returns:
+            DataFrame: PySpark DataFrame with the effective sample size ratio for each study.
+        """
+        gwas_df = gwas_for_qc._df
+
+        gwas_df = gwas_df.dropna(subset=["effectAlleleFrequencyFromSource"])
+
+        counts_df = gwas_df.groupBy("studyId").count()
+
+        # Join the original DataFrame with the counts DataFrame
+        df_with_counts = gwas_df.join(counts_df, on="studyId")
+
+        # Filter the DataFrame to keep only the groups with count greater than or equal to min_count
+        filtered_df = df_with_counts.filter(f.col("count") >= min_count).drop("count")
+
+        window = Window.partitionBy("studyId").orderBy("studyId")
+        gwas_df = (
+            filtered_df.withColumn("row_num", row_number().over(window))
+            .filter(f.col("row_num") <= limit)
+            .drop("row_num")
+        )
+
+        gwas_df = gwas_df.withColumn(
+            "var_af",
+            2
+            * (
+                f.col("effectAlleleFrequencyFromSource")
+                * (1 - f.col("effectAlleleFrequencyFromSource"))
+            ),
+        ).withColumn(
+            "pheno_var",
+            ((f.col("standardError") ** 2) * n_total * f.col("var_af"))
+            + ((f.col("beta") ** 2) * f.col("var_af")),
+        )
+
+        window = Window.partitionBy("studyId").orderBy("studyId")
+
+        # Calculate the median of 'pheno_var' for each 'studyId' and add it as a new column
+        gwas_df = gwas_df.withColumn(
+            "pheno_median", expr("percentile_approx(pheno_var, 0.5)").over(window)
+        )
+
+        gwas_df = gwas_df.withColumn(
+            "N_hat_ratio",
+            (
+                (f.col("pheno_median") - ((f.col("beta") ** 2) * f.col("var_af")))
+                / ((f.col("standardError") ** 2) * f.col("var_af") * n_total)
+            ),
+        )
+
+        qc_c = (
+            gwas_df.groupBy("studyId")
+            .agg(
+                f.stddev("N_hat_ratio").alias("se_N"),
+            )
+            .select("studyId", "se_N")
+        )
+
+        return qc_c
+
+    @staticmethod
+    def gc_lambda_check(
+        gwas_for_qc: SummaryStatistics,
+        limit: int = 10_000_000,
+    ) -> DataFrame:
+        """The genomic control lambda check for QC of GWAS summary statistics.
+
+        Args:
+            gwas_for_qc (SummaryStatistics): The instance of the SummaryStatistics class.
+            limit (int): The limit for the number of variants to be used for the estimation.
+
+        Returns:
+            DataFrame: PySpark DataFrame with the genomic control lambda for each study.
+        """
+        gwas_df = gwas_for_qc._df
+        window = Window.partitionBy("studyId").orderBy("studyId")
+        gwas_df = (
+            gwas_df.withColumn("row_num", row_number().over(window))
+            .filter(f.col("row_num") <= limit)
+            .drop("row_num")
+        )
+
+        qc_c = (
+            gwas_df.select("studyId", "beta", "standardError")
+            .withColumn("Z2", (f.col("beta") / f.col("standardError")) ** 2)
+            .groupBy("studyId")
+            .agg(f.expr("percentile_approx(Z2, 0.5)").alias("gc_lambda"))
+            .withColumn("gc_lambda", f.col("gc_lambda") / chi2.ppf(0.5, df=1))
+            .select("studyId", "gc_lambda")
+        )
+
+        return qc_c
+
+    @staticmethod
+    def number_of_snps(
+        gwas_for_qc: SummaryStatistics, pval_threhod: float = 5e-8
+    ) -> DataFrame:
+        """The function calculates the number of SNPs and the number of SNPs with a p-value below the threshold (5e-8 by default).
+
+        Args:
+            gwas_for_qc (SummaryStatistics): The instance of the SummaryStatistics class.
+            pval_threhod (float): The threshold for the p-value.
+
+        Returns:
+            DataFrame: PySpark DataFrame with the number of SNPs and number of SNPs with p-value less than threshold.
+        """
+        gwas_df = gwas_for_qc._df
+
+        snp_counts = gwas_df.groupBy("studyId").agg(
+            f.count("*").alias("n_variants"),
+            f.sum(
+                (
+                    f.log10(f.col("pValueMantissa")) + f.col("pValueExponent")
+                    <= np.log10(pval_threhod)
+                ).cast("int")
+            ).alias("n_variants_sig"),
+        )
+
+        return snp_counts
+
+    @staticmethod
+    def get_quality_control_metrics(
+        gwas: SummaryStatistics,
+        limit: int = 100_000_000,
+        min_count: int = 100_000,
+        n_total: int = 100_000,
+    ) -> DataFrame:
+        """The function calculates the quality control metrics for the summary statistics.
+
+        Args:
+            gwas (SummaryStatistics): The instance of the SummaryStatistics class.
+            limit (int): The limit for the number of variants to be used for the estimation.
+            min_count (int): The minimum number of variants to be used for the estimation.
+            n_total (int): The total sample size.
+
+        Returns:
+            DataFrame: PySpark DataFrame with the quality control metrics for the summary statistics.
+        """
+        qc1 = SummaryStatisticsQC.sumstat_qc_beta_check(gwas_for_qc=gwas)
+        qc2 = SummaryStatisticsQC.sumstat_qc_pz_check(gwas_for_qc=gwas, limit=limit)
+        qc3 = SummaryStatisticsQC.sumstat_n_eff_check(
+            gwas_for_qc=gwas, n_total=n_total, limit=limit, min_count=min_count
+        )
+        qc4 = SummaryStatisticsQC.gc_lambda_check(gwas_for_qc=gwas, limit=limit)
+        qc5 = SummaryStatisticsQC.number_of_snps(gwas_for_qc=gwas)
+        df = (
+            qc1.join(qc2, on="studyId", how="outer")
+            .join(qc3, on="studyId", how="outer")
+            .join(qc4, on="studyId", how="outer")
+            .join(qc5, on="studyId", how="outer")
+        )
+
+        return df
diff --git a/tests/gentropy/method/test_qc_of_sumstats.py b/tests/gentropy/method/test_qc_of_sumstats.py
new file mode 100644
index 000000000..8480fce8d
--- /dev/null
+++ b/tests/gentropy/method/test_qc_of_sumstats.py
@@ -0,0 +1,63 @@
+"""Test of the qc of summary statistics."""
+
+from __future__ import annotations
+
+import numpy as np
+import pandas as pd
+import pyspark.sql.functions as f
+from gentropy.dataset.summary_statistics import SummaryStatistics
+from gentropy.method.sumstat_quality_controls import SummaryStatisticsQC
+from pyspark.sql.functions import rand, when
+
+
+def test_qc_functions(
+    sample_summary_statistics: SummaryStatistics,
+) -> None:
+    """Test all sumstat qc functions."""
+    gwas = sample_summary_statistics.sanity_filter()
+    QC = SummaryStatisticsQC.get_quality_control_metrics(
+        gwas=gwas, limit=100000, min_count=100, n_total=100000
+    )
+    QC = QC.toPandas()
+
+    assert QC["n_variants"].iloc[0] == 1663
+    assert QC["n_variants_sig"].iloc[0] == 29
+    assert np.round(QC["gc_lambda"].iloc[0], 4) == 1.916
+    assert np.round(QC["mean_beta"].iloc[0], 4) == 0.0013
+    assert np.round(QC["mean_diff_pz"].iloc[0], 6) == 0
+    assert np.round(QC["se_diff_pz"].iloc[0], 6) == 0
+    assert pd.isna(QC["se_N"].iloc[0])
+
+
+def test_neff_check_eaf(
+    sample_summary_statistics: SummaryStatistics,
+) -> None:
+    """Test N_eff check using mock EAFs."""
+    gwas = sample_summary_statistics.sanity_filter()
+    gwas_df = gwas._df
+    gwas_df = gwas_df.withColumn("effectAlleleFrequencyFromSource", f.lit(0.5))
+    gwas._df = gwas_df
+
+    QC = SummaryStatisticsQC.get_quality_control_metrics(
+        gwas=gwas, limit=100000, min_count=100, n_total=100000
+    )
+    QC = QC.toPandas()
+    assert np.round(QC["se_N"].iloc[0], 4) == 0.5586
+
+
+def test_several_studyid(
+    sample_summary_statistics: SummaryStatistics,
+) -> None:
+    """Test stability when several studyIds are present."""
+    gwas = sample_summary_statistics.sanity_filter()
+    gwas_df = gwas._df
+    gwas_df = gwas_df.withColumn(
+        "studyId", when(rand() < 0.5, "new_value").otherwise(gwas_df["studyId"])
+    )
+    gwas._df = gwas_df
+
+    QC = SummaryStatisticsQC.get_quality_control_metrics(
+        gwas=gwas, limit=100000, min_count=100, n_total=100000
+    )
+    QC = QC.toPandas()
+    assert QC.shape == (2, 8)

From 05d21bc69331d7408dbfbacda54eab73ed67295b Mon Sep 17 00:00:00 2001
From: Yakov <yt4@sanger.ac.uk>
Date: Thu, 25 Apr 2024 14:05:10 +0100
Subject: [PATCH 14/21] feat:
 susie_finemapper_one_studylocus_row_v3_dev_ss_gathered (#586)

---
 src/gentropy/susie_finemapper.py | 121 +++++++++++++++++++++++++++++++
 1 file changed, 121 insertions(+)

diff --git a/src/gentropy/susie_finemapper.py b/src/gentropy/susie_finemapper.py
index 24635934e..41e1ddcfe 100644
--- a/src/gentropy/susie_finemapper.py
+++ b/src/gentropy/susie_finemapper.py
@@ -734,3 +734,124 @@ def susie_finemapper_one_studylocus_row_v2_dev(
         )
 
         return out
+
+    @staticmethod
+    def susie_finemapper_one_studylocus_row_v3_dev_ss_gathered(
+        session: Session,
+        study_locus_row: Row,
+        study_index: StudyIndex,
+        radius: int = 1_000_000,
+        max_causal_snps: int = 10,
+        susie_est_tausq: bool = False,
+        run_carma: bool = False,
+        run_sumstat_imputation: bool = False,
+        carma_time_limit: int = 600,
+        imputed_r2_threshold: float = 0.8,
+        ld_score_threshold: float = 4,
+        sum_pips: float = 0.99,
+    ) -> dict[str, Any]:
+        """Susie fine-mapper function that uses study-locus row with collected locus, chromosome and position as inputs.
+
+        Args:
+            session (Session): Spark session
+            study_locus_row (Row): StudyLocus row with collected locus
+            study_index (StudyIndex): StudyIndex object
+            radius (int): Radius in base-pairs of window for fine-mapping
+            max_causal_snps (int): maximum number of causal variants
+            susie_est_tausq (bool): estimate tau squared, default is False
+            run_carma (bool): run CARMA, default is False
+            run_sumstat_imputation (bool): run summary statistics imputation, default is False
+            carma_time_limit (int): CARMA time limit, default is 600 seconds
+            imputed_r2_threshold (float): imputed R2 threshold, default is 0.8
+            ld_score_threshold (float): LD score threshold for imputation, default is 4
+            sum_pips (float): the expected sum of posterior probabilities in the locus, default is 0.99 (99% credible set)
+
+        Returns:
+            dict[str, Any]: dictionary with study locus, number of GWAS variants, number of LD variants, number of variants after merge, number of outliers, number of imputed variants, number of variants to fine-map
+        """
+        # PLEASE DO NOT REMOVE THIS LINE
+        pd.DataFrame.iteritems = pd.DataFrame.items
+
+        chromosome = study_locus_row["chromosome"]
+        position = study_locus_row["position"]
+        studyId = study_locus_row["studyId"]
+
+        study_index_df = study_index._df
+        study_index_df = study_index_df.filter(f.col("studyId") == studyId)
+        major_population = study_index_df.select(
+            "studyId",
+            f.array_max(f.col("ldPopulationStructure"))
+            .getItem("ldPopulation")
+            .alias("majorPopulation"),
+        ).collect()[0]["majorPopulation"]
+
+        region = (
+            chromosome
+            + ":"
+            + str(int(position - radius))
+            + "-"
+            + str(int(position + radius))
+        )
+
+        schema = StudyLocus.get_schema()
+        gwas_df = session.spark.createDataFrame([study_locus_row], schema=schema)
+        exploded_df = gwas_df.select(f.explode("locus").alias("locus"))
+
+        result_df = exploded_df.select(
+            "locus.variantId", "locus.beta", "locus.standardError"
+        )
+        gwas_df = (
+            result_df.withColumn("z", f.col("beta") / f.col("standardError"))
+            .withColumn(
+                "chromosome", f.split(f.col("variantId"), "_")[0].cast("string")
+            )
+            .withColumn("position", f.split(f.col("variantId"), "_")[1].cast("int"))
+            .filter(f.col("chromosome") == chromosome)
+            .filter(f.col("position") >= position - radius)
+            .filter(f.col("position") <= position + radius)
+            .filter(f.col("z").isNotNull())
+        )
+
+        ld_index = (
+            GnomADLDMatrix()
+            .get_locus_index(
+                study_locus_row=study_locus_row,
+                radius=radius,
+                major_population=major_population,
+            )
+            .withColumn(
+                "variantId",
+                f.concat(
+                    f.lit(chromosome),
+                    f.lit("_"),
+                    f.col("`locus.position`"),
+                    f.lit("_"),
+                    f.col("alleles").getItem(0),
+                    f.lit("_"),
+                    f.col("alleles").getItem(1),
+                ).cast("string"),
+            )
+        )
+
+        gnomad_ld = GnomADLDMatrix.get_numpy_matrix(
+            ld_index, gnomad_ancestry=major_population
+        )
+
+        out = SusieFineMapperStep.susie_finemapper_from_prepared_dataframes(
+            GWAS_df=gwas_df,
+            ld_index=ld_index,
+            gnomad_ld=gnomad_ld,
+            L=max_causal_snps,
+            session=session,
+            studyId=studyId,
+            region=region,
+            susie_est_tausq=susie_est_tausq,
+            run_carma=run_carma,
+            run_sumstat_imputation=run_sumstat_imputation,
+            carma_time_limit=carma_time_limit,
+            imputed_r2_threshold=imputed_r2_threshold,
+            ld_score_threshold=ld_score_threshold,
+            sum_pips=sum_pips,
+        )
+
+        return out

From a88f16caab044e2104063e2144b370fbaf61b520 Mon Sep 17 00:00:00 2001
From: Daniel-Considine <113430683+Daniel-Considine@users.noreply.github.com>
Date: Fri, 26 Apr 2024 11:14:54 +0100
Subject: [PATCH 15/21] feat: functionality added to StudyLocus.find_overlaps()
 for finding within-study overlaps (#587)

* feat: functionality added to StudyLocus.find_overlaps() for finding within-study overlaps

* feat: removal of secondary credible sets at the same region as overlaps

* fix: defining the join condition to make the code tidier
---
 src/gentropy/dataset/study_locus.py           | 50 ++++++++++++++-----
 .../dataset/test_study_locus_overlaps.py      | 49 ++++++++++++++++--
 2 files changed, 84 insertions(+), 15 deletions(-)

diff --git a/src/gentropy/dataset/study_locus.py b/src/gentropy/dataset/study_locus.py
index 185aeb569..77c663800 100644
--- a/src/gentropy/dataset/study_locus.py
+++ b/src/gentropy/dataset/study_locus.py
@@ -82,31 +82,52 @@ class StudyLocus(Dataset):
     """
 
     @staticmethod
-    def _overlapping_peaks(credset_to_overlap: DataFrame) -> DataFrame:
+    def _overlapping_peaks(
+        credset_to_overlap: DataFrame, intra_study_overlap: bool = False
+    ) -> DataFrame:
         """Calculate overlapping signals (study-locus) between GWAS-GWAS and GWAS-Molecular trait.
 
         Args:
             credset_to_overlap (DataFrame): DataFrame containing at least `studyLocusId`, `studyType`, `chromosome` and `tagVariantId` columns.
+            intra_study_overlap (bool): When True, finds intra-study overlaps for credible set deduplication. Default is False.
 
         Returns:
             DataFrame: containing `leftStudyLocusId`, `rightStudyLocusId` and `chromosome` columns.
         """
         # Reduce columns to the minimum to reduce the size of the dataframe
         credset_to_overlap = credset_to_overlap.select(
-            "studyLocusId", "studyType", "chromosome", "tagVariantId"
+            "studyLocusId",
+            "studyId",
+            "studyType",
+            "chromosome",
+            "region",
+            "tagVariantId",
         )
+        # Define join condition - if intra_study_overlap is True, finds overlaps within the same study. Otherwise finds gwas vs everything overlaps for coloc.
+        join_condition = (
+            [
+                f.col("left.studyId") == f.col("right.studyId"),
+                f.col("left.chromosome") == f.col("right.chromosome"),
+                f.col("left.tagVariantId") == f.col("right.tagVariantId"),
+                f.col("left.studyLocusId") > f.col("right.studyLocusId"),
+                f.col("left.region") != f.col("right.region"),
+            ]
+            if intra_study_overlap
+            else [
+                f.col("left.chromosome") == f.col("right.chromosome"),
+                f.col("left.tagVariantId") == f.col("right.tagVariantId"),
+                (f.col("right.studyType") != "gwas")
+                | (f.col("left.studyLocusId") > f.col("right.studyLocusId")),
+                f.col("left.studyType") == f.lit("gwas"),
+            ]
+        )
+
         return (
             credset_to_overlap.alias("left")
-            .filter(f.col("studyType") == "gwas")
-            # Self join with complex condition. Left it's all gwas and right can be gwas or molecular trait
+            # Self join with complex condition.
             .join(
                 credset_to_overlap.alias("right"),
-                on=[
-                    f.col("left.chromosome") == f.col("right.chromosome"),
-                    f.col("left.tagVariantId") == f.col("right.tagVariantId"),
-                    (f.col("right.studyType") != "gwas")
-                    | (f.col("left.studyLocusId") > f.col("right.studyLocusId")),
-                ],
+                on=join_condition,
                 how="inner",
             )
             .select(
@@ -305,7 +326,9 @@ def filter_credible_set(
         )
         return self
 
-    def find_overlaps(self: StudyLocus, study_index: StudyIndex) -> StudyLocusOverlap:
+    def find_overlaps(
+        self: StudyLocus, study_index: StudyIndex, intra_study_overlap: bool = False
+    ) -> StudyLocusOverlap:
         """Calculate overlapping study-locus.
 
         Find overlapping study-locus that share at least one tagging variant. All GWAS-GWAS and all GWAS-Molecular traits are computed with the Molecular traits always
@@ -313,6 +336,7 @@ def find_overlaps(self: StudyLocus, study_index: StudyIndex) -> StudyLocusOverla
 
         Args:
             study_index (StudyIndex): Study index to resolve study types.
+            intra_study_overlap (bool): If True, finds intra-study overlaps for credible set deduplication. Default is False.
 
         Returns:
             StudyLocusOverlap: Pairs of overlapping study-locus with aligned tags.
@@ -322,8 +346,10 @@ def find_overlaps(self: StudyLocus, study_index: StudyIndex) -> StudyLocusOverla
             .withColumn("locus", f.explode("locus"))
             .select(
                 "studyLocusId",
+                "studyId",
                 "studyType",
                 "chromosome",
+                "region",
                 f.col("locus.variantId").alias("tagVariantId"),
                 f.col("locus.logBF").alias("logBF"),
                 f.col("locus.posteriorProbability").alias("posteriorProbability"),
@@ -335,7 +361,7 @@ def find_overlaps(self: StudyLocus, study_index: StudyIndex) -> StudyLocusOverla
         )
 
         # overlapping study-locus
-        peak_overlaps = self._overlapping_peaks(loci_to_overlap)
+        peak_overlaps = self._overlapping_peaks(loci_to_overlap, intra_study_overlap)
 
         # study-locus overlap by aligning overlapping variants
         return self._align_overlapping_tags(loci_to_overlap, peak_overlaps)
diff --git a/tests/gentropy/dataset/test_study_locus_overlaps.py b/tests/gentropy/dataset/test_study_locus_overlaps.py
index ee89eec84..8e732fc5c 100644
--- a/tests/gentropy/dataset/test_study_locus_overlaps.py
+++ b/tests/gentropy/dataset/test_study_locus_overlaps.py
@@ -30,46 +30,89 @@ def test_study_locus_overlap_from_associations(
 
 
 @pytest.mark.parametrize(
-    ("observed", "expected"),
+    ("observed", "intrastudy", "expected"),
     [
         (
             # observed - input DataFrame representing gwas and nongwas data to find overlapping signals
             [
                 {
                     "studyLocusId": 1,
+                    "studyId": "A",
                     "studyType": "gwas",
                     "chromosome": "1",
                     "tagVariantId": "A",
                 },
                 {
                     "studyLocusId": 2,
+                    "studyId": "B",
                     "studyType": "eqtl",
                     "chromosome": "1",
                     "tagVariantId": "A",
                 },
                 {
                     "studyLocusId": 3,
+                    "studyId": "C",
                     "studyType": "gwas",
                     "chromosome": "1",
                     "tagVariantId": "B",
                 },
             ],
+            # intrastudy - bool of whether or not to use inter-study or intra-study logic
+            False,
             # expected - output DataFrame with overlapping signals
             [
                 {"leftStudyLocusId": 1, "rightStudyLocusId": 2, "chromosome": "1"},
             ],
         ),
+        (
+            # observed - input DataFrame representing intra-study data to find overlapping signals in the same study
+            [
+                {
+                    "studyLocusId": 1,
+                    "studyId": "A",
+                    "studyType": "gwas",
+                    "chromosome": "1",
+                    "region": "X",
+                    "tagVariantId": "A",
+                },
+                {
+                    "studyLocusId": 2,
+                    "studyId": "A",
+                    "studyType": "gwas",
+                    "chromosome": "1",
+                    "region": "Y",
+                    "tagVariantId": "A",
+                },
+                {
+                    "studyLocusId": 3,
+                    "studyId": "B",
+                    "studyType": "gwas",
+                    "chromosome": "1",
+                    "region": "X",
+                    "tagVariantId": "A",
+                },
+            ],
+            # intrastudy - bool of whether or not to use inter-study or intra-study logic
+            True,
+            # expected - output DataFrame with overlapping signals
+            [{"leftStudyLocusId": 2, "rightStudyLocusId": 1, "chromosome": "1"}],
+        ),
     ],
 )
 def test_overlapping_peaks(
-    spark: SparkSession, observed: list[dict[str, Any]], expected: list[dict[str, Any]]
+    spark: SparkSession,
+    observed: list[dict[str, Any]],
+    intrastudy: bool,
+    expected: list[dict[str, Any]],
 ) -> None:
     """Test overlapping signals between GWAS-GWAS and GWAS-Molecular trait to make sure that mQTLs are always on the right."""
     mock_schema = t.StructType(
         [
             t.StructField("studyLocusId", t.LongType()),
+            t.StructField("studyId", t.StringType()),
             t.StructField("studyType", t.StringType()),
             t.StructField("chromosome", t.StringType()),
+            t.StructField("region", t.StringType()),
             t.StructField("tagVariantId", t.StringType()),
         ]
     )
@@ -81,6 +124,6 @@ def test_overlapping_peaks(
         ]
     )
     observed_df = spark.createDataFrame(observed, mock_schema)
-    result_df = StudyLocus._overlapping_peaks(observed_df)
+    result_df = StudyLocus._overlapping_peaks(observed_df, intrastudy)
     expected_df = spark.createDataFrame(expected, expected_schema)
     assert result_df.collect() == expected_df.collect()

From da2c75d5ab582109e9dd9072bf2f0d4b3038f39b Mon Sep 17 00:00:00 2001
From: David Ochoa <ochoa@ebi.ac.uk>
Date: Fri, 26 Apr 2024 17:21:12 +0100
Subject: [PATCH 16/21] feat: github action to upload docker image to registry
 (#588)

---
 .github/workflows/artifact.yml | 39 ++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)
 create mode 100644 .github/workflows/artifact.yml

diff --git a/.github/workflows/artifact.yml b/.github/workflows/artifact.yml
new file mode 100644
index 000000000..a33bd1e55
--- /dev/null
+++ b/.github/workflows/artifact.yml
@@ -0,0 +1,39 @@
+name: Build and Push to Artifact Registry
+
+"on":
+  push:
+    branches: ["dev"]
+
+env:
+  PROJECT_ID: open-targets-genetics-dev
+  REGION: europe-west1
+  GAR_LOCATION: europe-west1-docker.pkg.dev/open-targets-genetics-dev/gentropy-app-dev/
+  IMAGE_NAME: gentropy-app
+
+jobs:
+  build-push-artifact:
+    runs-on: ubuntu-latest
+    steps:
+      - name: "Checkout"
+        uses: "actions/checkout@v3"
+
+      - id: "auth"
+        uses: "google-github-actions/auth@v1"
+        with:
+          credentials_json: "${{ secrets.SERVICE_ACCOUNT_KEY }}"
+
+      - name: "Set up Cloud SDK"
+        uses: "google-github-actions/setup-gcloud@v1"
+
+      - name: "Use gcloud CLI"
+        run: "gcloud info"
+
+      - name: "Docker auth"
+        run: |-
+          gcloud auth configure-docker ${{ env.REGION }}-docker.pkg.dev --quiet
+
+      - name: Build image
+        run: docker build . --tag "${{ env.GAR_LOCATION }}/${{ env.IMAGE_NAME }}:${{ github.github.ref_name }}"
+
+      - name: Push image
+        run: docker push "${{ env.GAR_LOCATION }}/${{ env.IMAGE_NAME }}:${{ github.github.ref_name }}"

From df75870a9b902a4b6b9f842797d5183b87a83e26 Mon Sep 17 00:00:00 2001
From: David Ochoa <ochoa@ebi.ac.uk>
Date: Fri, 26 Apr 2024 17:31:16 +0100
Subject: [PATCH 17/21] feat: lighter dockerfile (#585)

---
 Dockerfile | 29 +++++++++++++++++++++++++++--
 1 file changed, 27 insertions(+), 2 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 1221ec637..deb43bcd8 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,8 +1,33 @@
-FROM python:3.10
+FROM python:3.10-bullseye
+
+
+RUN apt-get update && \
+    apt-get install -y openjdk-11-jdk && \
+    apt-get clean && rm -rf /var/lib/apt/lists/*
+
+RUN java -version
+
+# Set environment variables for Java
+ENV JAVA_HOME /usr/lib/jvm/java-11-openjdk-amd64
+ENV PATH=$PATH:$JAVA_HOME/bin
 
 RUN pip install poetry==1.7.1
 
-COPY . .
+ENV POETRY_NO_INTERACTION=1 \
+    POETRY_VIRTUALENVS_IN_PROJECT=1 \
+    POETRY_VIRTUALENVS_CREATE=1 \
+    POETRY_CACHE_DIR=/tmp/poetry_cache
+
+WORKDIR /app
+
+COPY pyproject.toml poetry.lock ./
+RUN touch README.md
+
+RUN poetry config installer.max-workers 10
+RUN poetry install --without dev,docs,tests --no-root --no-interaction --no-ansi -vvv && rm -rf $POETRY_CACHE_DIR
+
+COPY src ./src
+
 RUN poetry install --without dev,docs,tests
 
 ENTRYPOINT ["poetry", "run", "gentropy"]

From b83a8aa8fba4521a7465ecb9f4889112094e0950 Mon Sep 17 00:00:00 2001
From: David Ochoa <ochoa@ebi.ac.uk>
Date: Fri, 26 Apr 2024 17:44:21 +0100
Subject: [PATCH 18/21] fix: docker action fixes (#589)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Irene López <45119610+ireneisdoomed@users.noreply.github.com>
---
 .github/workflows/artifact.yml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/artifact.yml b/.github/workflows/artifact.yml
index a33bd1e55..849aa8268 100644
--- a/.github/workflows/artifact.yml
+++ b/.github/workflows/artifact.yml
@@ -7,7 +7,7 @@ name: Build and Push to Artifact Registry
 env:
   PROJECT_ID: open-targets-genetics-dev
   REGION: europe-west1
-  GAR_LOCATION: europe-west1-docker.pkg.dev/open-targets-genetics-dev/gentropy-app-dev/
+  GAR_LOCATION: europe-west1-docker.pkg.dev/open-targets-genetics-dev
   IMAGE_NAME: gentropy-app
 
 jobs:
@@ -23,7 +23,7 @@ jobs:
           credentials_json: "${{ secrets.SERVICE_ACCOUNT_KEY }}"
 
       - name: "Set up Cloud SDK"
-        uses: "google-github-actions/setup-gcloud@v1"
+        uses: "google-github-actions/setup-gcloud"
 
       - name: "Use gcloud CLI"
         run: "gcloud info"
@@ -33,7 +33,7 @@ jobs:
           gcloud auth configure-docker ${{ env.REGION }}-docker.pkg.dev --quiet
 
       - name: Build image
-        run: docker build . --tag "${{ env.GAR_LOCATION }}/${{ env.IMAGE_NAME }}:${{ github.github.ref_name }}"
+        run: docker build . --tag "${{ env.GAR_LOCATION }}/${{ env.IMAGE_NAME }}:${{ github.ref_name }}"
 
       - name: Push image
-        run: docker push "${{ env.GAR_LOCATION }}/${{ env.IMAGE_NAME }}:${{ github.github.ref_name }}"
+        run: docker push "${{ env.GAR_LOCATION }}/${{ env.IMAGE_NAME }}:${{ github.ref_name }}"

From c89cd57315a3431d0ad9072eb36aef954ddc32aa Mon Sep 17 00:00:00 2001
From: David Ochoa <ochoa@ebi.ac.uk>
Date: Fri, 26 Apr 2024 17:50:21 +0100
Subject: [PATCH 19/21] fix: docker action fixes (#590)

---
 .github/workflows/artifact.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/artifact.yml b/.github/workflows/artifact.yml
index 849aa8268..e9ab0157d 100644
--- a/.github/workflows/artifact.yml
+++ b/.github/workflows/artifact.yml
@@ -17,13 +17,13 @@ jobs:
       - name: "Checkout"
         uses: "actions/checkout@v3"
 
-      - id: "auth"
-        uses: "google-github-actions/auth@v1"
+      - name: "auth"
+        uses: "google-github-actions/auth@v2"
         with:
           credentials_json: "${{ secrets.SERVICE_ACCOUNT_KEY }}"
 
       - name: "Set up Cloud SDK"
-        uses: "google-github-actions/setup-gcloud"
+        uses: "google-github-actions/setup-gcloud@v2"
 
       - name: "Use gcloud CLI"
         run: "gcloud info"

From 70e5e2615700518324d414d11165efe025798284 Mon Sep 17 00:00:00 2001
From: David Ochoa <ochoa@ebi.ac.uk>
Date: Fri, 26 Apr 2024 18:09:52 +0100
Subject: [PATCH 20/21] fix: docker action fixes v3 (#591)

---
 .github/workflows/artifact.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/artifact.yml b/.github/workflows/artifact.yml
index e9ab0157d..ce7b47b6c 100644
--- a/.github/workflows/artifact.yml
+++ b/.github/workflows/artifact.yml
@@ -33,7 +33,7 @@ jobs:
           gcloud auth configure-docker ${{ env.REGION }}-docker.pkg.dev --quiet
 
       - name: Build image
-        run: docker build . --tag "${{ env.GAR_LOCATION }}/${{ env.IMAGE_NAME }}:${{ github.ref_name }}"
+        run: docker build . --tag "${{ env.GAR_LOCATION }}/${{ env.IMAGE_NAME }}/gentropy:${{ github.ref_name }}"
 
       - name: Push image
-        run: docker push "${{ env.GAR_LOCATION }}/${{ env.IMAGE_NAME }}:${{ github.ref_name }}"
+        run: docker push "${{ env.GAR_LOCATION }}/${{ env.IMAGE_NAME }}/gentropy:${{ github.ref_name }}"

From d16321522cf8cfd5c7a1a9ac5eada527f73c67a1 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Mon, 29 Apr 2024 16:34:50 +0100
Subject: [PATCH 21/21] build(deps-dev): bump deptry from 0.14.0 to 0.16.1
 (#570)

Bumps [deptry](https://github.com/fpgmaas/deptry) from 0.14.0 to 0.16.1.
- [Release notes](https://github.com/fpgmaas/deptry/releases)
- [Changelog](https://github.com/fpgmaas/deptry/blob/main/CHANGELOG.md)
- [Commits](https://github.com/fpgmaas/deptry/compare/0.14.0...0.16.1)

---
updated-dependencies:
- dependency-name: deptry
  dependency-type: direct:development
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: David Ochoa <ochoa@ebi.ac.uk>
---
 poetry.lock    | 198 ++++++++++++++++++++++++++++++++++++++++++++++---
 pyproject.toml |   2 +-
 2 files changed, 187 insertions(+), 13 deletions(-)

diff --git a/poetry.lock b/poetry.lock
index 01cafdeec..40c3283b4 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.8.0 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand.
 
 [[package]]
 name = "aiodns"
@@ -1627,24 +1627,26 @@ dev = ["PyTest", "PyTest-Cov", "bump2version (<1)", "sphinx (<2)", "tox"]
 
 [[package]]
 name = "deptry"
-version = "0.14.0"
+version = "0.16.1"
 description = "A command line utility to check for unused, missing and transitive dependencies in a Python project."
 optional = false
-python-versions = ">=3.8, <4.0"
+python-versions = ">=3.8"
 files = [
-    {file = "deptry-0.14.0-cp38-abi3-macosx_10_12_x86_64.whl", hash = "sha256:951d40e40cca6b538d8b1992e9532c082757598c73249469d3f6f90cf3344c9f"},
-    {file = "deptry-0.14.0-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:59e0808507ffb57a0ce9b8c40ecda7b4a235a0627f00485d8e6104bf71dc99f8"},
-    {file = "deptry-0.14.0-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b5e6b7485c3ece10eb94b2787258282275687a101587c6f88f65a6eadc65c4a4"},
-    {file = "deptry-0.14.0-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9d5a8299b9627dc32352caf99c5e8fdff0227539ccbe1535a6145649b5caa394"},
-    {file = "deptry-0.14.0-cp38-abi3-win_amd64.whl", hash = "sha256:d55654025b567739a57f9b6b8467a21c65a30c21e834b2d8cb225618b320874c"},
-    {file = "deptry-0.14.0.tar.gz", hash = "sha256:3415c65c0734f6dfafa1321c77c1c9fafb6d6f88d0f8441e5b2cfb5b3b41f71b"},
+    {file = "deptry-0.16.1-cp38-abi3-macosx_10_12_x86_64.whl", hash = "sha256:29ed8ae61b8f5664dd484717c79eef7ec66d965940efd828fca0d3c09220a1db"},
+    {file = "deptry-0.16.1-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:738a772b538f51e9a7bb8d5cb9a61cfea8794a79371d171919b01cff0dc895bf"},
+    {file = "deptry-0.16.1-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:56b78f7c860def8000e93f88345a24809f1b91e2f7836ac9a08285cb405e2762"},
+    {file = "deptry-0.16.1-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c3e86a04ea87ddece0f68ba204feb950f588205808c8320e6628300f03ff66dc"},
+    {file = "deptry-0.16.1-cp38-abi3-win_amd64.whl", hash = "sha256:01b5098739a56c93f3e1e40efec5f20452f22a9a8436a59809d46201fcb94bcf"},
+    {file = "deptry-0.16.1-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:7e29dc4c1bbb933c9482e8cef85fafe2be7f46aeb90a8a07ba5f2b22af60876f"},
+    {file = "deptry-0.16.1-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d8dfab68c247566c87a40f55f405be8549ffe4cea0b9b5384b7ae73a6f1d5cd1"},
+    {file = "deptry-0.16.1-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1228493926b6e59cd2df7cb6016e10c255553cc31db24edcf7fc8d5474b81be6"},
+    {file = "deptry-0.16.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:99c3ac60b78ad1b8fb9844c25393e7ebc969cc950601ce3c050f56d196da5a79"},
+    {file = "deptry-0.16.1.tar.gz", hash = "sha256:39fb62da4a8f4d17ed282310f7bcaadec55a95a8c471b01e0fcdf5351a7ac323"},
 ]
 
 [package.dependencies]
-chardet = ">=4.0.0"
 click = ">=8.0.0,<9"
 colorama = {version = ">=0.4.6", markers = "sys_platform == \"win32\""}
-pathspec = ">=0.9.0"
 tomli = {version = ">=2.0.1", markers = "python_version < \"3.11\""}
 
 [[package]]
@@ -3430,6 +3432,156 @@ files = [
     {file = "google_re2-1.1-3-cp39-cp39-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d140c7b9395b4d1e654127aa1c99bcc603ed01000b7bc7e28c52562f1894ec12"},
     {file = "google_re2-1.1-3-cp39-cp39-win32.whl", hash = "sha256:80c5fc200f64b2d903eeb07b8d6cefc620a872a0240c7caaa9aca05b20f5568f"},
     {file = "google_re2-1.1-3-cp39-cp39-win_amd64.whl", hash = "sha256:9eb6dbcee9b5dc4069bbc0634f2eb039ca524a14bed5868fdf6560aaafcbca06"},
+    {file = "google_re2-1.1-4-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:0db114d7e1aa96dbcea452a40136d7d747d60cbb61394965774688ef59cccd4e"},
+    {file = "google_re2-1.1-4-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:82133958e003a1344e5b7a791b9a9dd7560b5c8f96936dbe16f294604524a633"},
+    {file = "google_re2-1.1-4-cp310-cp310-macosx_13_0_arm64.whl", hash = "sha256:9e74fd441d1f3d917d3303e319f61b82cdbd96b9a5ba919377a6eef1504a1e2b"},
+    {file = "google_re2-1.1-4-cp310-cp310-macosx_13_0_x86_64.whl", hash = "sha256:734a2e7a4541c57253b5ebee24f3f3366ba3658bcad01da25fb623c78723471a"},
+    {file = "google_re2-1.1-4-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:d88d5eecbc908abe16132456fae13690d0508f3ac5777f320ef95cb6cab9a961"},
+    {file = "google_re2-1.1-4-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:b91db80b171ecec435a07977a227757dd487356701a32f556fa6fca5d0a40522"},
+    {file = "google_re2-1.1-4-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6b23129887a64bb9948af14c84705273ed1a40054e99433b4acccab4dcf6a226"},
+    {file = "google_re2-1.1-4-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5dc1a0cc7cd19261dcaf76763e2499305dbb7e51dc69555167cdb8af98782698"},
+    {file = "google_re2-1.1-4-cp310-cp310-win32.whl", hash = "sha256:3b2ab1e2420b5dd9743a2d6bc61b64e5f708563702a75b6db86637837eaeaf2f"},
+    {file = "google_re2-1.1-4-cp310-cp310-win_amd64.whl", hash = "sha256:92efca1a7ef83b6df012d432a1cbc71d10ff42200640c0f9a5ff5b343a48e633"},
+    {file = "google_re2-1.1-4-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:854818fd4ce79787aca5ba459d6e5abe4ca9be2c684a5b06a7f1757452ca3708"},
+    {file = "google_re2-1.1-4-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:4ceef51174b6f653b6659a8fdaa9c38960c5228b44b25be2a3bcd8566827554f"},
+    {file = "google_re2-1.1-4-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:ee49087c3db7e6f5238105ab5299c09e9b77516fe8cfb0a37e5f1e813d76ecb8"},
+    {file = "google_re2-1.1-4-cp311-cp311-macosx_13_0_x86_64.whl", hash = "sha256:dc2312854bdc01410acc5d935f1906a49cb1f28980341c20a68797ad89d8e178"},
+    {file = "google_re2-1.1-4-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:0dc0d2e42296fa84a3cb3e1bd667c6969389cd5cdf0786e6b1f911ae2d75375b"},
+    {file = "google_re2-1.1-4-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:6bf04ced98453b035f84320f348f67578024f44d2997498def149054eb860ae8"},
+    {file = "google_re2-1.1-4-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1d6b6ef11dc4ab322fa66c2f3561925f2b5372a879c3ed764d20e939e2fd3e5f"},
+    {file = "google_re2-1.1-4-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0dcde6646fa9a97fd3692b3f6ae7daf7f3277d7500b6c253badeefa11db8956a"},
+    {file = "google_re2-1.1-4-cp311-cp311-win32.whl", hash = "sha256:5f4f0229deb057348893574d5b0a96d055abebac6debf29d95b0c0e26524c9f6"},
+    {file = "google_re2-1.1-4-cp311-cp311-win_amd64.whl", hash = "sha256:4713ddbe48a18875270b36a462b0eada5e84d6826f8df7edd328d8706b6f9d07"},
+    {file = "google_re2-1.1-4-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:40a698300b8faddbb325662973f839489c89b960087060bd389c376828978a04"},
+    {file = "google_re2-1.1-4-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:103d2d7ac92ba23911a151fd1fc7035cbf6dc92a7f6aea92270ebceb5cd5acd3"},
+    {file = "google_re2-1.1-4-cp312-cp312-macosx_13_0_arm64.whl", hash = "sha256:51fb7182bccab05e8258a2b6a63dda1a6b4a9e8dfb9b03ec50e50c49c2827dd4"},
+    {file = "google_re2-1.1-4-cp312-cp312-macosx_13_0_x86_64.whl", hash = "sha256:65383022abd63d7b620221eba7935132b53244b8b463d8fdce498c93cf58b7b7"},
+    {file = "google_re2-1.1-4-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:396281fc68a9337157b3ffcd9392c6b7fcb8aab43e5bdab496262a81d56a4ecc"},
+    {file = "google_re2-1.1-4-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:8198adcfcff1c680e052044124621730fc48d08005f90a75487f5651f1ebfce2"},
+    {file = "google_re2-1.1-4-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:81f7bff07c448aec4db9ca453d2126ece8710dbd9278b8bb09642045d3402a96"},
+    {file = "google_re2-1.1-4-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b7dacf730fd7d6ec71b11d6404b0b26e230814bfc8e9bb0d3f13bec9b5531f8d"},
+    {file = "google_re2-1.1-4-cp312-cp312-win32.whl", hash = "sha256:8c764f62f4b1d89d1ef264853b6dd9fee14a89e9b86a81bc2157fe3531425eb4"},
+    {file = "google_re2-1.1-4-cp312-cp312-win_amd64.whl", hash = "sha256:0be2666df4bc5381a5d693585f9bbfefb0bfd3c07530d7e403f181f5de47254a"},
+    {file = "google_re2-1.1-4-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:5cb1b63a0bfd8dd65d39d2f3b2e5ae0a06ce4b2ce5818a1d1fc78a786a252673"},
+    {file = "google_re2-1.1-4-cp38-cp38-macosx_12_0_x86_64.whl", hash = "sha256:e41751ce6b67a95230edd0772226dc94c2952a2909674cd69df9804ed0125307"},
+    {file = "google_re2-1.1-4-cp38-cp38-macosx_13_0_arm64.whl", hash = "sha256:b998cfa2d50bf4c063e777c999a7e8645ec7e5d7baf43ad71b1e2e10bb0300c3"},
+    {file = "google_re2-1.1-4-cp38-cp38-macosx_13_0_x86_64.whl", hash = "sha256:226ca3b0c2e970f3fc82001ac89e845ecc7a4bb7c68583e7a76cda70b61251a7"},
+    {file = "google_re2-1.1-4-cp38-cp38-macosx_14_0_arm64.whl", hash = "sha256:9adec1f734ebad7c72e56c85f205a281d8fe9bf6583bc21020157d3f2812ce89"},
+    {file = "google_re2-1.1-4-cp38-cp38-macosx_14_0_x86_64.whl", hash = "sha256:9c34f3c64ba566af967d29e11299560e6fdfacd8ca695120a7062b6ed993b179"},
+    {file = "google_re2-1.1-4-cp38-cp38-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e1b85385fe293838e0d0b6e19e6c48ba8c6f739ea92ce2e23b718afe7b343363"},
+    {file = "google_re2-1.1-4-cp38-cp38-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4694daa8a8987cfb568847aa872f9990e930c91a68c892ead876411d4b9012c3"},
+    {file = "google_re2-1.1-4-cp38-cp38-win32.whl", hash = "sha256:5e671e9be1668187e2995aac378de574fa40df70bb6f04657af4d30a79274ce0"},
+    {file = "google_re2-1.1-4-cp38-cp38-win_amd64.whl", hash = "sha256:f66c164d6049a8299f6dfcfa52d1580576b4b9724d6fcdad2f36f8f5da9304b6"},
+    {file = "google_re2-1.1-4-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:25cb17ae0993a48c70596f3a3ef5d659638106401cc8193f51c0d7961b3b3eb7"},
+    {file = "google_re2-1.1-4-cp39-cp39-macosx_12_0_x86_64.whl", hash = "sha256:5f101f86d14ca94ca4dcf63cceaa73d351f2be2481fcaa29d9e68eeab0dc2a88"},
+    {file = "google_re2-1.1-4-cp39-cp39-macosx_13_0_arm64.whl", hash = "sha256:4e82591e85bf262a6d74cff152867e05fc97867c68ba81d6836ff8b0e7e62365"},
+    {file = "google_re2-1.1-4-cp39-cp39-macosx_13_0_x86_64.whl", hash = "sha256:1f61c09b93ffd34b1e2557e5a9565039f935407a5786dbad46f64f1a484166e6"},
+    {file = "google_re2-1.1-4-cp39-cp39-macosx_14_0_arm64.whl", hash = "sha256:12b390ad8c7e74bab068732f774e75e0680dade6469b249a721f3432f90edfc3"},
+    {file = "google_re2-1.1-4-cp39-cp39-macosx_14_0_x86_64.whl", hash = "sha256:1284343eb31c2e82ed2d8159f33ba6842238a56782c881b07845a6d85613b055"},
+    {file = "google_re2-1.1-4-cp39-cp39-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6c7b38e0daf2c06e4d3163f4c732ab3ad2521aecfed6605b69e4482c612da303"},
+    {file = "google_re2-1.1-4-cp39-cp39-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1f4d4f0823e8b2f6952a145295b1ff25245ce9bb136aff6fe86452e507d4c1dd"},
+    {file = "google_re2-1.1-4-cp39-cp39-win32.whl", hash = "sha256:1afae56b2a07bb48cfcfefaa15ed85bae26a68f5dc7f9e128e6e6ea36914e847"},
+    {file = "google_re2-1.1-4-cp39-cp39-win_amd64.whl", hash = "sha256:aa7d6d05911ab9c8adbf3c225a7a120ab50fd2784ac48f2f0d140c0b7afc2b55"},
+    {file = "google_re2-1.1-5-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:222fc2ee0e40522de0b21ad3bc90ab8983be3bf3cec3d349c80d76c8bb1a4beb"},
+    {file = "google_re2-1.1-5-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:d4763b0b9195b72132a4e7de8e5a9bf1f05542f442a9115aa27cfc2a8004f581"},
+    {file = "google_re2-1.1-5-cp310-cp310-macosx_13_0_arm64.whl", hash = "sha256:209649da10c9d4a93d8a4d100ecbf9cc3b0252169426bec3e8b4ad7e57d600cf"},
+    {file = "google_re2-1.1-5-cp310-cp310-macosx_13_0_x86_64.whl", hash = "sha256:68813aa333c1604a2df4a495b2a6ed065d7c8aebf26cc7e7abb5a6835d08353c"},
+    {file = "google_re2-1.1-5-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:370a23ec775ad14e9d1e71474d56f381224dcf3e72b15d8ca7b4ad7dd9cd5853"},
+    {file = "google_re2-1.1-5-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:14664a66a3ddf6bc9e56f401bf029db2d169982c53eff3f5876399104df0e9a6"},
+    {file = "google_re2-1.1-5-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3ea3722cc4932cbcebd553b69dce1b4a73572823cff4e6a244f1c855da21d511"},
+    {file = "google_re2-1.1-5-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e14bb264c40fd7c627ef5678e295370cd6ba95ca71d835798b6e37502fc4c690"},
+    {file = "google_re2-1.1-5-cp310-cp310-win32.whl", hash = "sha256:39512cd0151ea4b3969c992579c79b423018b464624ae955be685fc07d94556c"},
+    {file = "google_re2-1.1-5-cp310-cp310-win_amd64.whl", hash = "sha256:ac66537aa3bc5504320d922b73156909e3c2b6da19739c866502f7827b3f9fdf"},
+    {file = "google_re2-1.1-5-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:5b5ea68d54890c9edb1b930dcb2658819354e5d3f2201f811798bbc0a142c2b4"},
+    {file = "google_re2-1.1-5-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:33443511b6b83c35242370908efe2e8e1e7cae749c766b2b247bf30e8616066c"},
+    {file = "google_re2-1.1-5-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:413d77bdd5ba0bfcada428b4c146e87707452ec50a4091ec8e8ba1413d7e0619"},
+    {file = "google_re2-1.1-5-cp311-cp311-macosx_13_0_x86_64.whl", hash = "sha256:5171686e43304996a34baa2abcee6f28b169806d0e583c16d55e5656b092a414"},
+    {file = "google_re2-1.1-5-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:3b284db130283771558e31a02d8eb8fb756156ab98ce80035ae2e9e3a5f307c4"},
+    {file = "google_re2-1.1-5-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:296e6aed0b169648dc4b870ff47bd34c702a32600adb9926154569ef51033f47"},
+    {file = "google_re2-1.1-5-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:38d50e68ead374160b1e656bbb5d101f0b95fb4cc57f4a5c12100155001480c5"},
+    {file = "google_re2-1.1-5-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2a0416a35921e5041758948bcb882456916f22845f66a93bc25070ef7262b72a"},
+    {file = "google_re2-1.1-5-cp311-cp311-win32.whl", hash = "sha256:a1d59568bbb5de5dd56dd6cdc79907db26cce63eb4429260300c65f43469e3e7"},
+    {file = "google_re2-1.1-5-cp311-cp311-win_amd64.whl", hash = "sha256:72f5a2f179648b8358737b2b493549370debd7d389884a54d331619b285514e3"},
+    {file = "google_re2-1.1-5-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:cbc72c45937b1dc5acac3560eb1720007dccca7c9879138ff874c7f6baf96005"},
+    {file = "google_re2-1.1-5-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:5fadd1417fbef7235fa9453dba4eb102e6e7d94b1e4c99d5fa3dd4e288d0d2ae"},
+    {file = "google_re2-1.1-5-cp312-cp312-macosx_13_0_arm64.whl", hash = "sha256:040f85c63cc02696485b59b187a5ef044abe2f99b92b4fb399de40b7d2904ccc"},
+    {file = "google_re2-1.1-5-cp312-cp312-macosx_13_0_x86_64.whl", hash = "sha256:64e3b975ee6d9bbb2420494e41f929c1a0de4bcc16d86619ab7a87f6ea80d6bd"},
+    {file = "google_re2-1.1-5-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:8ee370413e00f4d828eaed0e83b8af84d7a72e8ee4f4bd5d3078bc741dfc430a"},
+    {file = "google_re2-1.1-5-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:5b89383001079323f693ba592d7aad789d7a02e75adb5d3368d92b300f5963fd"},
+    {file = "google_re2-1.1-5-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:63cb4fdfbbda16ae31b41a6388ea621510db82feb8217a74bf36552ecfcd50ad"},
+    {file = "google_re2-1.1-5-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9ebedd84ae8be10b7a71a16162376fd67a2386fe6361ef88c622dcf7fd679daf"},
+    {file = "google_re2-1.1-5-cp312-cp312-win32.whl", hash = "sha256:c8e22d1692bc2c81173330c721aff53e47ffd3c4403ff0cd9d91adfd255dd150"},
+    {file = "google_re2-1.1-5-cp312-cp312-win_amd64.whl", hash = "sha256:5197a6af438bb8c4abda0bbe9c4fbd6c27c159855b211098b29d51b73e4cbcf6"},
+    {file = "google_re2-1.1-5-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:b6727e0b98417e114b92688ad2aa256102ece51f29b743db3d831df53faf1ce3"},
+    {file = "google_re2-1.1-5-cp38-cp38-macosx_12_0_x86_64.whl", hash = "sha256:711e2b6417eb579c61a4951029d844f6b95b9b373b213232efd413659889a363"},
+    {file = "google_re2-1.1-5-cp38-cp38-macosx_13_0_arm64.whl", hash = "sha256:71ae8b3df22c5c154c8af0f0e99d234a450ef1644393bc2d7f53fc8c0a1e111c"},
+    {file = "google_re2-1.1-5-cp38-cp38-macosx_13_0_x86_64.whl", hash = "sha256:94a04e214bc521a3807c217d50cf099bbdd0c0a80d2d996c0741dbb995b5f49f"},
+    {file = "google_re2-1.1-5-cp38-cp38-macosx_14_0_arm64.whl", hash = "sha256:a770f75358508a9110c81a1257721f70c15d9bb592a2fb5c25ecbd13566e52a5"},
+    {file = "google_re2-1.1-5-cp38-cp38-macosx_14_0_x86_64.whl", hash = "sha256:07c9133357f7e0b17c6694d5dcb82e0371f695d7c25faef2ff8117ef375343ff"},
+    {file = "google_re2-1.1-5-cp38-cp38-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:204ca6b1cf2021548f4a9c29ac015e0a4ab0a7b6582bf2183d838132b60c8fda"},
+    {file = "google_re2-1.1-5-cp38-cp38-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f0b95857c2c654f419ca684ec38c9c3325c24e6ba7d11910a5110775a557bb18"},
+    {file = "google_re2-1.1-5-cp38-cp38-win32.whl", hash = "sha256:347ac770e091a0364e822220f8d26ab53e6fdcdeaec635052000845c5a3fb869"},
+    {file = "google_re2-1.1-5-cp38-cp38-win_amd64.whl", hash = "sha256:ec32bb6de7ffb112a07d210cf9f797b7600645c2d5910703fa07f456dd2150e0"},
+    {file = "google_re2-1.1-5-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:eb5adf89060f81c5ff26c28e261e6b4997530a923a6093c9726b8dec02a9a326"},
+    {file = "google_re2-1.1-5-cp39-cp39-macosx_12_0_x86_64.whl", hash = "sha256:a22630c9dd9ceb41ca4316bccba2643a8b1d5c198f21c00ed5b50a94313aaf10"},
+    {file = "google_re2-1.1-5-cp39-cp39-macosx_13_0_arm64.whl", hash = "sha256:544dc17fcc2d43ec05f317366375796351dec44058e1164e03c3f7d050284d58"},
+    {file = "google_re2-1.1-5-cp39-cp39-macosx_13_0_x86_64.whl", hash = "sha256:19710af5ea88751c7768575b23765ce0dfef7324d2539de576f75cdc319d6654"},
+    {file = "google_re2-1.1-5-cp39-cp39-macosx_14_0_arm64.whl", hash = "sha256:f82995a205e08ad896f4bd5ce4847c834fab877e1772a44e5f262a647d8a1dec"},
+    {file = "google_re2-1.1-5-cp39-cp39-macosx_14_0_x86_64.whl", hash = "sha256:63533c4d58da9dc4bc040250f1f52b089911699f0368e0e6e15f996387a984ed"},
+    {file = "google_re2-1.1-5-cp39-cp39-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:79e00fcf0cb04ea35a22b9014712d448725ce4ddc9f08cc818322566176ca4b0"},
+    {file = "google_re2-1.1-5-cp39-cp39-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bc41afcefee2da6c4ed883a93d7f527c4b960cd1d26bbb0020a7b8c2d341a60a"},
+    {file = "google_re2-1.1-5-cp39-cp39-win32.whl", hash = "sha256:486730b5e1f1c31b0abc6d80abe174ce4f1188fe17d1b50698f2bf79dc6e44be"},
+    {file = "google_re2-1.1-5-cp39-cp39-win_amd64.whl", hash = "sha256:4de637ca328f1d23209e80967d1b987d6b352cd01b3a52a84b4d742c69c3da6c"},
+    {file = "google_re2-1.1-6-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:621e9c199d1ff0fdb2a068ad450111a84b3bf14f96dfe5a8a7a0deae5f3f4cce"},
+    {file = "google_re2-1.1-6-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:220acd31e7dde95373f97c3d1f3b3bd2532b38936af28b1917ee265d25bebbf4"},
+    {file = "google_re2-1.1-6-cp310-cp310-macosx_13_0_arm64.whl", hash = "sha256:db34e1098d164f76251a6ece30e8f0ddfd65bb658619f48613ce71acb3f9cbdb"},
+    {file = "google_re2-1.1-6-cp310-cp310-macosx_13_0_x86_64.whl", hash = "sha256:5152bac41d8073977582f06257219541d0fc46ad99b0bbf30e8f60198a43b08c"},
+    {file = "google_re2-1.1-6-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:6191294799e373ee1735af91f55abd23b786bdfd270768a690d9d55af9ea1b0d"},
+    {file = "google_re2-1.1-6-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:070cbafbb4fecbb02e98feb28a1eb292fb880f434d531f38cc33ee314b521f1f"},
+    {file = "google_re2-1.1-6-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8437d078b405a59a576cbed544490fe041140f64411f2d91012e8ec05ab8bf86"},
+    {file = "google_re2-1.1-6-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f00f9a9af8896040e37896d9b9fc409ad4979f1ddd85bb188694a7d95ddd1164"},
+    {file = "google_re2-1.1-6-cp310-cp310-win32.whl", hash = "sha256:df26345f229a898b4fd3cafd5f82259869388cee6268fc35af16a8e2293dd4e5"},
+    {file = "google_re2-1.1-6-cp310-cp310-win_amd64.whl", hash = "sha256:3665d08262c57c9b28a5bdeb88632ad792c4e5f417e5645901695ab2624f5059"},
+    {file = "google_re2-1.1-6-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:b26b869d8aa1d8fe67c42836bf3416bb72f444528ee2431cfb59c0d3e02c6ce3"},
+    {file = "google_re2-1.1-6-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:41fd4486c57dea4f222a6bb7f1ff79accf76676a73bdb8da0fcbd5ba73f8da71"},
+    {file = "google_re2-1.1-6-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:0ee378e2e74e25960070c338c28192377c4dd41e7f4608f2688064bd2badc41e"},
+    {file = "google_re2-1.1-6-cp311-cp311-macosx_13_0_x86_64.whl", hash = "sha256:a00cdbf662693367b36d075b29feb649fd7ee1b617cf84f85f2deebeda25fc64"},
+    {file = "google_re2-1.1-6-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:4c09455014217a41499432b8c8f792f25f3df0ea2982203c3a8c8ca0e7895e69"},
+    {file = "google_re2-1.1-6-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:6501717909185327935c7945e23bb5aa8fc7b6f237b45fe3647fa36148662158"},
+    {file = "google_re2-1.1-6-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3510b04790355f199e7861c29234081900e1e1cbf2d1484da48aa0ba6d7356ab"},
+    {file = "google_re2-1.1-6-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8c0e64c187ca406764f9e9ad6e750d62e69ed8f75bf2e865d0bfbc03b642361c"},
+    {file = "google_re2-1.1-6-cp311-cp311-win32.whl", hash = "sha256:2a199132350542b0de0f31acbb3ca87c3a90895d1d6e5235f7792bb0af02e523"},
+    {file = "google_re2-1.1-6-cp311-cp311-win_amd64.whl", hash = "sha256:83bdac8ceaece8a6db082ea3a8ba6a99a2a1ee7e9f01a9d6d50f79c6f251a01d"},
+    {file = "google_re2-1.1-6-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:81985ff894cd45ab5a73025922ac28c0707759db8171dd2f2cc7a0e856b6b5ad"},
+    {file = "google_re2-1.1-6-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:5635af26065e6b45456ccbea08674ae2ab62494008d9202df628df3b267bc095"},
+    {file = "google_re2-1.1-6-cp312-cp312-macosx_13_0_arm64.whl", hash = "sha256:813b6f04de79f4a8fdfe05e2cb33e0ccb40fe75d30ba441d519168f9d958bd54"},
+    {file = "google_re2-1.1-6-cp312-cp312-macosx_13_0_x86_64.whl", hash = "sha256:5ec2f5332ad4fd232c3f2d6748c2c7845ccb66156a87df73abcc07f895d62ead"},
+    {file = "google_re2-1.1-6-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:5a687b3b32a6cbb731647393b7c4e3fde244aa557f647df124ff83fb9b93e170"},
+    {file = "google_re2-1.1-6-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:39a62f9b3db5d3021a09a47f5b91708b64a0580193e5352751eb0c689e4ad3d7"},
+    {file = "google_re2-1.1-6-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ca0f0b45d4a1709cbf5d21f355e5809ac238f1ee594625a1e5ffa9ff7a09eb2b"},
+    {file = "google_re2-1.1-6-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a64b3796a7a616c7861247bd061c9a836b5caf0d5963e5ea8022125601cf7b09"},
+    {file = "google_re2-1.1-6-cp312-cp312-win32.whl", hash = "sha256:32783b9cb88469ba4cd9472d459fe4865280a6b1acdad4480a7b5081144c4eb7"},
+    {file = "google_re2-1.1-6-cp312-cp312-win_amd64.whl", hash = "sha256:259ff3fd2d39035b9cbcbf375995f83fa5d9e6a0c5b94406ff1cc168ed41d6c6"},
+    {file = "google_re2-1.1-6-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:e4711bcffe190acd29104d8ecfea0c0e42b754837de3fb8aad96e6cc3c613cdc"},
+    {file = "google_re2-1.1-6-cp38-cp38-macosx_12_0_x86_64.whl", hash = "sha256:4d081cce43f39c2e813fe5990e1e378cbdb579d3f66ded5bade96130269ffd75"},
+    {file = "google_re2-1.1-6-cp38-cp38-macosx_13_0_arm64.whl", hash = "sha256:4f123b54d48450d2d6b14d8fad38e930fb65b5b84f1b022c10f2913bd956f5b5"},
+    {file = "google_re2-1.1-6-cp38-cp38-macosx_13_0_x86_64.whl", hash = "sha256:e1928b304a2b591a28eb3175f9db7f17c40c12cf2d4ec2a85fdf1cc9c073ff91"},
+    {file = "google_re2-1.1-6-cp38-cp38-macosx_14_0_arm64.whl", hash = "sha256:3a69f76146166aec1173003c1f547931bdf288c6b135fda0020468492ac4149f"},
+    {file = "google_re2-1.1-6-cp38-cp38-macosx_14_0_x86_64.whl", hash = "sha256:fc08c388f4ebbbca345e84a0c56362180d33d11cbe9ccfae663e4db88e13751e"},
+    {file = "google_re2-1.1-6-cp38-cp38-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b057adf38ce4e616486922f2f47fc7d19c827ba0a7f69d540a3664eba2269325"},
+    {file = "google_re2-1.1-6-cp38-cp38-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4138c0b933ab099e96f5d8defce4486f7dfd480ecaf7f221f2409f28022ccbc5"},
+    {file = "google_re2-1.1-6-cp38-cp38-win32.whl", hash = "sha256:9693e45b37b504634b1abbf1ee979471ac6a70a0035954592af616306ab05dd6"},
+    {file = "google_re2-1.1-6-cp38-cp38-win_amd64.whl", hash = "sha256:5674d437baba0ea287a5a7f8f81f24265d6ae8f8c09384e2ef7b6f84b40a7826"},
+    {file = "google_re2-1.1-6-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:7783137cb2e04f458a530c6d0ee9ef114815c1d48b9102f023998c371a3b060e"},
+    {file = "google_re2-1.1-6-cp39-cp39-macosx_12_0_x86_64.whl", hash = "sha256:a49b7153935e7a303675f4deb5f5d02ab1305adefc436071348706d147c889e0"},
+    {file = "google_re2-1.1-6-cp39-cp39-macosx_13_0_arm64.whl", hash = "sha256:a96a8bb309182090704593c60bdb369a2756b38fe358bbf0d40ddeb99c71769f"},
+    {file = "google_re2-1.1-6-cp39-cp39-macosx_13_0_x86_64.whl", hash = "sha256:dff3d4be9f27ef8ec3705eed54f19ef4ab096f5876c15fe011628c69ba3b561c"},
+    {file = "google_re2-1.1-6-cp39-cp39-macosx_14_0_arm64.whl", hash = "sha256:40f818b0b39e26811fa677978112a8108269977fdab2ba0453ac4363c35d9e66"},
+    {file = "google_re2-1.1-6-cp39-cp39-macosx_14_0_x86_64.whl", hash = "sha256:8a7e53538cdb40ef4296017acfbb05cab0c19998be7552db1cfb85ba40b171b9"},
+    {file = "google_re2-1.1-6-cp39-cp39-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6ee18e7569fb714e5bb8c42809bf8160738637a5e71ed5a4797757a1fb4dc4de"},
+    {file = "google_re2-1.1-6-cp39-cp39-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1cda4f6d1a7d5b43ea92bc395f23853fba0caf8b1e1efa6e8c48685f912fcb89"},
+    {file = "google_re2-1.1-6-cp39-cp39-win32.whl", hash = "sha256:6a9cdbdc36a2bf24f897be6a6c85125876dc26fea9eb4247234aec0decbdccfd"},
+    {file = "google_re2-1.1-6-cp39-cp39-win_amd64.whl", hash = "sha256:73f646cecfad7cc5b4330b4192c25f2e29730a3b8408e089ffd2078094208196"},
 ]
 
 [[package]]
@@ -6701,6 +6853,7 @@ files = [
     {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"},
     {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"},
     {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"},
+    {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"},
     {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"},
     {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"},
     {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"},
@@ -7625,30 +7778,51 @@ description = "Database Abstraction Library"
 optional = false
 python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,>=2.7"
 files = [
+    {file = "SQLAlchemy-1.4.50-cp310-cp310-macosx_11_0_x86_64.whl", hash = "sha256:54138aa80d2dedd364f4e8220eef284c364d3270aaef621570aa2bd99902e2e8"},
     {file = "SQLAlchemy-1.4.50-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d00665725063692c42badfd521d0c4392e83c6c826795d38eb88fb108e5660e5"},
     {file = "SQLAlchemy-1.4.50-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:85292ff52ddf85a39367057c3d7968a12ee1fb84565331a36a8fead346f08796"},
     {file = "SQLAlchemy-1.4.50-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:d0fed0f791d78e7767c2db28d34068649dfeea027b83ed18c45a423f741425cb"},
     {file = "SQLAlchemy-1.4.50-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:db4db3c08ffbb18582f856545f058a7a5e4ab6f17f75795ca90b3c38ee0a8ba4"},
+    {file = "SQLAlchemy-1.4.50-cp310-cp310-win32.whl", hash = "sha256:6c78e3fb4a58e900ec433b6b5f4efe1a0bf81bbb366ae7761c6e0051dd310ee3"},
+    {file = "SQLAlchemy-1.4.50-cp310-cp310-win_amd64.whl", hash = "sha256:d55f7a33e8631e15af1b9e67c9387c894fedf6deb1a19f94be8731263c51d515"},
+    {file = "SQLAlchemy-1.4.50-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:324b1fdd50e960a93a231abb11d7e0f227989a371e3b9bd4f1259920f15d0304"},
     {file = "SQLAlchemy-1.4.50-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:14b0cacdc8a4759a1e1bd47dc3ee3f5db997129eb091330beda1da5a0e9e5bd7"},
     {file = "SQLAlchemy-1.4.50-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1fb9cb60e0f33040e4f4681e6658a7eb03b5cb4643284172f91410d8c493dace"},
+    {file = "SQLAlchemy-1.4.50-cp311-cp311-win32.whl", hash = "sha256:8bdab03ff34fc91bfab005e96f672ae207d87e0ac7ee716d74e87e7046079d8b"},
+    {file = "SQLAlchemy-1.4.50-cp311-cp311-win_amd64.whl", hash = "sha256:52e01d60b06f03b0a5fc303c8aada405729cbc91a56a64cead8cb7c0b9b13c1a"},
+    {file = "SQLAlchemy-1.4.50-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:77fde9bf74f4659864c8e26ac08add8b084e479b9a18388e7db377afc391f926"},
     {file = "SQLAlchemy-1.4.50-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c4cb501d585aa74a0f86d0ea6263b9c5e1d1463f8f9071392477fd401bd3c7cc"},
     {file = "SQLAlchemy-1.4.50-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8a7a66297e46f85a04d68981917c75723e377d2e0599d15fbe7a56abed5e2d75"},
+    {file = "SQLAlchemy-1.4.50-cp312-cp312-win32.whl", hash = "sha256:e86c920b7d362cfa078c8b40e7765cbc34efb44c1007d7557920be9ddf138ec7"},
+    {file = "SQLAlchemy-1.4.50-cp312-cp312-win_amd64.whl", hash = "sha256:6b3df20fbbcbcd1c1d43f49ccf3eefb370499088ca251ded632b8cbaee1d497d"},
+    {file = "SQLAlchemy-1.4.50-cp36-cp36m-macosx_10_14_x86_64.whl", hash = "sha256:fb9adc4c6752d62c6078c107d23327aa3023ef737938d0135ece8ffb67d07030"},
     {file = "SQLAlchemy-1.4.50-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c1db0221cb26d66294f4ca18c533e427211673ab86c1fbaca8d6d9ff78654293"},
     {file = "SQLAlchemy-1.4.50-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b7dbe6369677a2bea68fe9812c6e4bbca06ebfa4b5cde257b2b0bf208709131"},
     {file = "SQLAlchemy-1.4.50-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:a9bddb60566dc45c57fd0a5e14dd2d9e5f106d2241e0a2dc0c1da144f9444516"},
     {file = "SQLAlchemy-1.4.50-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:82dd4131d88395df7c318eeeef367ec768c2a6fe5bd69423f7720c4edb79473c"},
+    {file = "SQLAlchemy-1.4.50-cp36-cp36m-win32.whl", hash = "sha256:1b9c4359d3198f341480e57494471201e736de459452caaacf6faa1aca852bd8"},
+    {file = "SQLAlchemy-1.4.50-cp36-cp36m-win_amd64.whl", hash = "sha256:35e4520f7c33c77f2636a1e860e4f8cafaac84b0b44abe5de4c6c8890b6aaa6d"},
+    {file = "SQLAlchemy-1.4.50-cp37-cp37m-macosx_11_0_x86_64.whl", hash = "sha256:f5b1fb2943d13aba17795a770d22a2ec2214fc65cff46c487790192dda3a3ee7"},
     {file = "SQLAlchemy-1.4.50-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:273505fcad22e58cc67329cefab2e436006fc68e3c5423056ee0513e6523268a"},
     {file = "SQLAlchemy-1.4.50-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a3257a6e09626d32b28a0c5b4f1a97bced585e319cfa90b417f9ab0f6145c33c"},
     {file = "SQLAlchemy-1.4.50-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:d69738d582e3a24125f0c246ed8d712b03bd21e148268421e4a4d09c34f521a5"},
     {file = "SQLAlchemy-1.4.50-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:34e1c5d9cd3e6bf3d1ce56971c62a40c06bfc02861728f368dcfec8aeedb2814"},
+    {file = "SQLAlchemy-1.4.50-cp37-cp37m-win32.whl", hash = "sha256:7b4396452273aedda447e5aebe68077aa7516abf3b3f48408793e771d696f397"},
+    {file = "SQLAlchemy-1.4.50-cp37-cp37m-win_amd64.whl", hash = "sha256:752f9df3dddbacb5f42d8405b2d5885675a93501eb5f86b88f2e47a839cf6337"},
+    {file = "SQLAlchemy-1.4.50-cp38-cp38-macosx_11_0_x86_64.whl", hash = "sha256:35c7ed095a4b17dbc8813a2bfb38b5998318439da8e6db10a804df855e3a9e3a"},
     {file = "SQLAlchemy-1.4.50-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f1fcee5a2c859eecb4ed179edac5ffbc7c84ab09a5420219078ccc6edda45436"},
     {file = "SQLAlchemy-1.4.50-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fbaf6643a604aa17e7a7afd74f665f9db882df5c297bdd86c38368f2c471f37d"},
     {file = "SQLAlchemy-1.4.50-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:2e70e0673d7d12fa6cd363453a0d22dac0d9978500aa6b46aa96e22690a55eab"},
     {file = "SQLAlchemy-1.4.50-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8b881ac07d15fb3e4f68c5a67aa5cdaf9eb8f09eb5545aaf4b0a5f5f4659be18"},
+    {file = "SQLAlchemy-1.4.50-cp38-cp38-win32.whl", hash = "sha256:8a219688297ee5e887a93ce4679c87a60da4a5ce62b7cb4ee03d47e9e767f558"},
+    {file = "SQLAlchemy-1.4.50-cp38-cp38-win_amd64.whl", hash = "sha256:a648770db002452703b729bdcf7d194e904aa4092b9a4d6ab185b48d13252f63"},
+    {file = "SQLAlchemy-1.4.50-cp39-cp39-macosx_11_0_x86_64.whl", hash = "sha256:4be4da121d297ce81e1ba745a0a0521c6cf8704634d7b520e350dce5964c71ac"},
     {file = "SQLAlchemy-1.4.50-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3f6997da81114daef9203d30aabfa6b218a577fc2bd797c795c9c88c9eb78d49"},
     {file = "SQLAlchemy-1.4.50-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bdb77e1789e7596b77fd48d99ec1d2108c3349abd20227eea0d48d3f8cf398d9"},
     {file = "SQLAlchemy-1.4.50-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:128a948bd40780667114b0297e2cc6d657b71effa942e0a368d8cc24293febb3"},
     {file = "SQLAlchemy-1.4.50-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f2d526aeea1bd6a442abc7c9b4b00386fd70253b80d54a0930c0a216230a35be"},
+    {file = "SQLAlchemy-1.4.50-cp39-cp39-win32.whl", hash = "sha256:a7c9b9dca64036008962dd6b0d9fdab2dfdbf96c82f74dbd5d86006d8d24a30f"},
+    {file = "SQLAlchemy-1.4.50-cp39-cp39-win_amd64.whl", hash = "sha256:df200762efbd672f7621b253721644642ff04a6ff957236e0e2fe56d9ca34d2c"},
     {file = "SQLAlchemy-1.4.50.tar.gz", hash = "sha256:3b97ddf509fc21e10b09403b5219b06c5b558b27fc2453150274fa4e70707dbf"},
 ]
 
@@ -8485,4 +8659,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.10, <3.11"
-content-hash = "7e49947f0842c795031610eba766ccca04acb4944d62c6bc2dc522fb4c702cc4"
+content-hash = "0219fa88667d94b340e7a707f86a732ddbb5a6c1e2ac91b4f0b4b97f134b33a4"
diff --git a/pyproject.toml b/pyproject.toml
index e83666536..cd9d6ffad 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -74,7 +74,7 @@ apache-airflow = "^2.8.0"
 apache-airflow-providers-google = "^10.13.1"
 pydoclint = ">=0.3.8,<0.5.0"
 prettier = "^0.0.7"
-deptry = ">=0.12,<0.15"
+deptry = ">=0.12,<0.17"
 python-semantic-release = ">=8.7,<10.0"
 yamllint = "^1.33.0"