From 632ff147e68c5c90824a989465d670a52be2eb33 Mon Sep 17 00:00:00 2001
From: Daniel Suveges
Date: Mon, 13 Nov 2023 15:19:54 +0000
Subject: [PATCH 01/33] test: adding test for pairwiseLD

---
 src/otg/assets/schemas/pairwise_ld.json   |  23 +++++
 src/otg/dataset/pairwise_ld.py            | 108 ++++++++++++++++++++++
 src/otg/datasource/gnomad/ld.py           |  61 ++++++------
 tests/dataset/test_pairwise_ld.py         | 103 +++++++++++++++++++++
 tests/datasource/gnomad/test_gnomad_ld.py |   9 +-
 5 files changed, 270 insertions(+), 34 deletions(-)
 create mode 100644 src/otg/assets/schemas/pairwise_ld.json
 create mode 100644 src/otg/dataset/pairwise_ld.py
 create mode 100644 tests/dataset/test_pairwise_ld.py

diff --git a/src/otg/assets/schemas/pairwise_ld.json b/src/otg/assets/schemas/pairwise_ld.json
new file mode 100644
index 000000000..2d5351d82
--- /dev/null
+++ b/src/otg/assets/schemas/pairwise_ld.json
@@ -0,0 +1,23 @@
+{
+  "fields": [
+    {
+      "metadata": {},
+      "name": "variantId_i",
+      "nullable": false,
+      "type": "string"
+    },
+    {
+      "metadata": {},
+      "name": "variantId_j",
+      "nullable": false,
+      "type": "string"
+    },
+    {
+      "metadata": {},
+      "name": "r",
+      "nullable": false,
+      "type": "double"
+    }
+  ],
+  "type": "struct"
+}
diff --git a/src/otg/dataset/pairwise_ld.py b/src/otg/dataset/pairwise_ld.py
new file mode 100644
index 000000000..20aac1f6d
--- /dev/null
+++ b/src/otg/dataset/pairwise_ld.py
@@ -0,0 +1,108 @@
+"""Pairwise LD dataset."""
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from math import sqrt
+from typing import TYPE_CHECKING
+
+import numpy as np
+from pyspark.sql import functions as f
+from pyspark.sql import types as t
+
+from otg.common.schemas import parse_spark_schema
+from otg.dataset.dataset import Dataset
+
+if TYPE_CHECKING:
+    from pyspark.sql.types import StructType
+
+
+@dataclass
+class PairwiseLD(Dataset):
+    """Pairwise variant correlation dataset.
+
+    This class captures logic applied to pairwise linkage data,
+    with validation ensuring data quality.
+    """
+
+    dimension: tuple[int, int] = field(init=False)
+
+    def __post_init__(self: PairwiseLD) -> None:
+        """Validating the dataset upon creation.
+
+        - Besides the schema, a pairwise LD table is expected to have a square number of rows.
+        """
+        row_count = self.df.count()
+
+        assert int(sqrt(row_count)) == sqrt(
+            row_count
+        ), f"The number of rows in a pairwise LD table has to be square. Got: {row_count}"
+
+        self.dimension = (int(sqrt(row_count)), int(sqrt(row_count)))
+
+    @classmethod
+    def get_schema(cls: type[PairwiseLD]) -> StructType:
+        """Provide the schema for the PairwiseLD dataset.
+
+        Returns:
+            StructType: The schema of the PairwiseLD dataset.
+        """
+        return parse_spark_schema("pairwise_ld.json")
+
+    def overlap_with_locus(self: PairwiseLD, locus_variants: list[str]) -> PairwiseLD:
+        """Subset pairwise LD table with locus.
+
+        Args:
+            locus_variants (list[str]): List of variants found in the locus.
+
+        Returns:
+            PairwiseLD: Pairwise LD data filtered down to the variants of the locus.
+        """
+        return PairwiseLD(
+            _df=(
+                self.df.filter(
+                    f.col("variantId_i").isin(locus_variants)
+                    & f.col("variantId_j").isin(locus_variants)
+                )
+            ),
+            _schema=PairwiseLD.get_schema(),
+        )
+
+    def r_to_numpy_matrix(self) -> np.ndarray:
+        """Convert pairwise LD to a numpy square matrix.
+
+        Returns:
+            np.ndarray: 2D square matrix with r values.
+        """
+        return np.array(
+            self.df.select(
+                f.split("variantId_i", "_")[1]
+                .cast(t.IntegerType())
+                .alias("position_i"),
+                f.split("variantId_j", "_")[1]
+                .cast(t.IntegerType())
+                .alias("position_j"),
+                "r",
+            )
+            .orderBy(f.col("position_i").asc(), f.col("position_j").asc())
+            .select("r")
+            .collect()
+        ).reshape(self.dimension)
+
+    def get_variant_list(self) -> list[str]:
+        """Return a list of unique variants from the dataset.
+
+        Returns:
+            list[str]: list of variant identifiers sorted by position.
+        """
+        return [
+            row["variantId"]
+            for row in (
+                self.df.select(
+                    f.col("variantId_i").alias("variantId"),
+                    f.split(f.col("variantId_i"), "_")[1]
+                    .cast(t.IntegerType())
+                    .alias("position"),
+                )
+                # Deduplicating: each variant occurs once per row of the square matrix.
+                .distinct()
+                .orderBy(f.col("position").asc())
+                .collect()
+            )
+        ]
diff --git a/src/otg/datasource/gnomad/ld.py b/src/otg/datasource/gnomad/ld.py
index 6b85dabc6..1aa2ac223 100644
--- a/src/otg/datasource/gnomad/ld.py
+++ b/src/otg/datasource/gnomad/ld.py
@@ -14,6 +14,7 @@
 
 from otg.common.utils import _liftover_loci, convert_gnomad_position_to_ensembl
 from otg.dataset.ld_index import LDIndex
+from otg.dataset.pairwise_ld import PairwiseLD
 
 if TYPE_CHECKING:
     from pyspark.sql import DataFrame
@@ -337,7 +338,7 @@ def get_ld_variants(
         chromosome: str,
         start: int,
         end: int,
-    ) -> DataFrame:
+    ) -> PairwiseLD:
         """Return melted LD table with resolved variant id based on ancestry and genomic location.
 
         Args:
@@ -347,7 +348,7 @@ def get_ld_variants(
             gnomad_ancestry (str): GnomAD major ancestry label eg. `nfe`
             chromosome (str): chromosome label
             start (int): window lower bound
             end (int): window upper bound
 
         Returns:
-            DataFrame: square LD matrix resolved to variants.
+            PairwiseLD: Melted representation of the square slice of the LD matrix.
         """
         # Extracting locus:
         ld_index_df = (
@@ -363,9 +364,6 @@ def get_ld_variants(
             .select("chromosome", "position", "variantId", "idx")
             .persist()
         )
-        # +----------+---------+--------------------+--------+
-        # |chromosome| position|           variantId|     idx|
-        # +----------+---------+--------------------+--------+
         start_index = self._get_value_from_row(
             ld_index_df.orderBy(f.col("position").asc()).first(), "idx"
         )
@@ -385,31 +383,34 @@ def get_ld_variants(
             return SparkSession.builder.getOrCreate().createDataFrame([], schema=schema)
 
         # Extract square matrix:
-        return (
-            self.get_ld_matrix_slice(
-                gnomad_ancestry, start_index=start_index, end_index=end_index
-            )
-            .join(
-                (
-                    ld_index_df.select(
-                        f.col("idx").alias("idx_i"),
-                        f.col("variantId").alias("variantId_i"),
-                    )
-                ),
-                on="idx_i",
-                how="inner",
-            )
-            .join(
-                (
-                    ld_index_df.select(
-                        f.col("idx").alias("idx_j"),
-                        f.col("variantId").alias("variantId_j"),
-                    )
-                ),
-                on="idx_j",
-                how="inner",
-            )
-            .select("variantId_i", "variantId_j", "r")
+        return PairwiseLD(
+            _df=(
+                self.get_ld_matrix_slice(
+                    gnomad_ancestry, start_index=start_index, end_index=end_index
+                )
+                .join(
+                    (
+                        ld_index_df.select(
+                            f.col("idx").alias("idx_i"),
+                            f.col("variantId").alias("variantId_i"),
+                        )
+                    ),
+                    on="idx_i",
+                    how="inner",
+                )
+                .join(
+                    (
+                        ld_index_df.select(
+                            f.col("idx").alias("idx_j"),
+                            f.col("variantId").alias("variantId_j"),
+                        )
+                    ),
+                    on="idx_j",
+                    how="inner",
+                )
+                .select("variantId_i", "variantId_j", "r")
+            ),
+            _schema=PairwiseLD.get_schema(),
         )
 
     def get_ld_matrix_slice(
diff --git a/tests/dataset/test_pairwise_ld.py b/tests/dataset/test_pairwise_ld.py
new file mode 100644
index 000000000..b6b7daf46
--- /dev/null
+++ b/tests/dataset/test_pairwise_ld.py
@@ -0,0 +1,103 @@
+"""Testing pairwise LD dataset."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import numpy as np
+import pytest
+from pyspark.sql import functions as f
+from pyspark.sql.window import Window
+
+from otg.dataset.pairwise_ld import PairwiseLD
+
+if TYPE_CHECKING:
+    from pyspark.sql import SparkSession
+
+
+class TestPairwiseLD:
+    """Test suite for pairwise LD dataset and associated methods."""
+
+    variants = [
+        "1_8_A_C",
+        "1_9_A_C",
+        "1_10_A_C",
+        "1_99_A_C",
+    ]
+
+    @pytest.fixture(scope="class")
+    def mock_pairwise_ld(self: TestPairwiseLD, spark: SparkSession) -> PairwiseLD:
+        """Generate a mock pairwise LD dataset.
+
+        Args:
+            spark (SparkSession): Spark session.
+
+        Returns:
+            PairwiseLD: Mock pairwise LD dataset covering the variants above.
+        """
+        spark = spark.builder.getOrCreate()
+
+        data = [(v1, v2) for v1 in self.variants for v2 in self.variants]
+        return PairwiseLD(
+            _df=(
+                spark.createDataFrame(data, ["variantId_i", "variantId_j"])
+                .withColumn(
+                    "r",
+                    f.row_number()
+                    .over(Window.partitionBy(f.lit("x")).orderBy("variantId_i"))
+                    .cast("double"),
+                )
+                .withColumn(
+                    "r",
+                    f.when(f.col("variantId_i") == f.col("variantId_j"), 1.0).otherwise(
+                        f.col("r")
+                    ),
+                )
+                .persist()
+            ),
+            _schema=PairwiseLD.get_schema(),
+        )
+
+    @staticmethod
+    def test_pairwise_ld__type(mock_pairwise_ld: PairwiseLD) -> None:
+        """Testing type."""
+        assert isinstance(mock_pairwise_ld, PairwiseLD)
+
+    def test_pairwise_ld__get_variants(
+        self: TestPairwiseLD, mock_pairwise_ld: PairwiseLD
+    ) -> None:
+        """Testing function that returns list of variants from the LD table.
+
+        Args:
+            mock_pairwise_ld (PairwiseLD): Mock pairwise LD dataset.
+        """
+        variant_set_expected = set(self.variants)
+        variant_set_from_data = set(mock_pairwise_ld.get_variant_list())
+
+        assert variant_set_from_data == variant_set_expected
+
+    def test_pairwise_ld__r_to_numpy_matrix__type(
+        self: TestPairwiseLD, mock_pairwise_ld: PairwiseLD
+    ) -> None:
+        """Testing the type of the returned numpy array."""
+        assert isinstance(mock_pairwise_ld.r_to_numpy_matrix(), np.ndarray)
+
+    def test_pairwise_ld__r_to_numpy_matrix__dimensions(
+        self: TestPairwiseLD, mock_pairwise_ld: PairwiseLD
+    ) -> None:
+        """Testing the dimensions of the returned numpy array."""
+        assert mock_pairwise_ld.r_to_numpy_matrix().shape == (
+            len(self.variants),
+            len(self.variants),
+        )
+
+    def test_pairwise_ld__overlap_with_locus(
+        self: TestPairwiseLD, mock_pairwise_ld: PairwiseLD
+    ) -> None:
+        """Testing subsetting the LD table with a list of locus variants."""
+        variant_subset = self.variants[1:3]
+
+        assert (
+            mock_pairwise_ld.overlap_with_locus(variant_subset).df.count()
+            == len(variant_subset) ** 2
+        )
diff --git a/tests/datasource/gnomad/test_gnomad_ld.py b/tests/datasource/gnomad/test_gnomad_ld.py
index bc2bd6f45..85dfbd0e1 100644
--- a/tests/datasource/gnomad/test_gnomad_ld.py
+++ b/tests/datasource/gnomad/test_gnomad_ld.py
@@ -10,6 +10,7 @@
 from pyspark.sql import DataFrame, SparkSession
 from pyspark.sql import functions as f
 
+from otg.dataset.pairwise_ld import PairwiseLD
 from otg.datasource.gnomad.ld import GnomADLDMatrix
 
 
@@ -91,7 +92,7 @@ def _setup(self: TestGnomADLDMatrixVariants, spark: SparkSession) -> None:
         )
 
     @pytest.fixture(scope="class")
-    def ld_slice(self: TestGnomADLDMatrixVariants) -> DataFrame:
+    def ld_slice(self: TestGnomADLDMatrixVariants) -> PairwiseLD:
         """Generate a resolved LD slice."""
         return self.gnomad_ld_matrix.get_ld_variants(
             gnomad_ancestry="test-pop",  # observed[0],
@@ -102,14 +103,14 @@ def ld_slice(self: TestGnomADLDMatrixVariants) -> DataFrame:
 
     def test_get_ld_variants__type(
         self: TestGnomADLDMatrixVariants,
-        ld_slice: DataFrame
+        ld_slice: PairwiseLD
     ) -> None:
         """Testing if the function returns the right type."""
         # Do we have the right type?
-        assert isinstance(ld_slice, DataFrame)
+        assert isinstance(ld_slice, PairwiseLD)
         # Do we have a real square?
-        assert sqrt(ld_slice.count()) == int(sqrt(ld_slice.count()))
+        assert sqrt(ld_slice.df.count()) == int(sqrt(ld_slice.df.count()))
 
 
 class TestGnomADLDMatrixSlice:

From 557fadedf3aa12d7e15c55fd3ea0a240318fbfef Mon Sep 17 00:00:00 2001
From: Daniel Considine
Date: Thu, 18 Jan 2024 10:57:54 +0000
Subject: [PATCH 02/33] feat: adding ld matrix extraction

---
 src/otg/method/ld.py | 55 +++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 54 insertions(+), 1 deletion(-)

diff --git a/src/otg/method/ld.py b/src/otg/method/ld.py
index c834efb85..fab66eb9a 100644
--- a/src/otg/method/ld.py
+++ b/src/otg/method/ld.py
@@ -132,7 +132,60 @@ def ld_annotate(
                     f.col("ldPopulationStructure").isNotNull(),
                     cls._calculate_weighted_r_overall(f.col("ldSet")),
                 ),
-            ).drop("ldPopulationStructure")
+            )
+            .drop("ldPopulationStructure")
             ),
             _schema=StudyLocus.get_schema(),
         )
         ._qc_no_population()
         ._qc_unresolved_ld()
+
+
+class LDMatrix:
+    """Class to extract LD matrix from GnomAD."""
+
+    @classmethod
+    def matrix_extract(
+        cls: type[LDMatrix],
+        associations: StudyLocus,
+        studies: StudyIndex,
+        ld_index: LDIndex,
+    ) -> StudyLocus:
+        """Extract LD matrix for a set of studyLocus.
+
+        This function:
+        1. Annotates study locus with population structure information from the study index
+        2. Joins the LD index to the StudyLocus
+
+        Args:
+            associations (StudyLocus): Dataset to be LD annotated
+            studies (StudyIndex): Dataset with study information
+            ld_index (LDIndex): Dataset with LD information for every variant present in LD matrix
+
+        Returns:
+            StudyLocus: Associations annotated with population structure and LD information.
+ """ + return ( + StudyLocus( + _df=( + associations.df + # Annotate study locus with population structure from study index + .join( + studies.df.select( + "studyId", f.explode("ldPopulationStructure") + ), + on="studyId", + how="left", + ) + # Bring LD information from LD Index + .join( + ld_index.df, + on=["variantId", "chromosome"], + how="left", + ) ), _schema=StudyLocus.get_schema(), ) From c952898cd9409e01d6e30434b6fd063f62882036 Mon Sep 17 00:00:00 2001 From: Daniel Considine Date: Thu, 18 Jan 2024 12:33:28 +0000 Subject: [PATCH 03/33] chore: merge from dev --- config/datasets/ot_gcp.yaml | 35 ++++-- config/step/ot_gwas_catalog_ingestion.yaml | 14 +-- .../step/ot_gwas_catalog_study_curation.yaml | 6 +- .../step/ot_gwas_catalog_study_inclusion.yaml | 8 +- config/step/ot_ld_based_clumping.yaml | 2 +- docs/python_api/steps/ld_clump.md | 2 +- .../dags/gwas_catalog_harmonisation.py | 21 ++-- src/airflow/dags/gwas_catalog_preprocess.py | 93 ++++++++++------ src/gentropy/config.py | 4 +- src/gentropy/ld_based_clumping.py | 2 +- utils/update_GWAS_Catalog_data.sh | 102 +++++++++++++----- 11 files changed, 193 insertions(+), 96 deletions(-) diff --git a/config/datasets/ot_gcp.yaml b/config/datasets/ot_gcp.yaml index 8e588ec62..93630c3a3 100644 --- a/config/datasets/ot_gcp.yaml +++ b/config/datasets/ot_gcp.yaml @@ -1,10 +1,33 @@ # Release specific configuration: release_version: "24.01" +version: "XX.XX" release_folder: gs://genetics_etl_python_playground/releases/${datasets.release_version} inputs: gs://genetics_etl_python_playground/input outputs: gs://genetics_etl_python_playground/output/python_etl/parquet/${datasets.version} +## Datasets: +gwas_catalog_dataset: gs://gwas_catalog_data +# Ingestion input files: +gwas_catalog_associations: ${datasets.gwas_catalog_dataset}/curated_inputs/gwas_catalog_associations_ontology_annotated.tsv +gwas_catalog_studies: + - ${datasets.gwas_catalog_dataset}/curated_inputs/gwas_catalog_download_studies.tsv + - ${datasets.gwas_catalog_dataset}/curated_inputs/gwas_catalog_unpublished_studies.tsv +gwas_catalog_ancestries: + - ${datasets.gwas_catalog_dataset}/curated_inputs/gwas_catalog_download_ancestries.tsv + - ${datasets.gwas_catalog_dataset}/curated_inputs/gwas_catalog_unpublished_ancestries.tsv +gwas_catalog_sumstats_lut: ${datasets.gwas_catalog_dataset}/curated_inputs/harmonised_list.txt +gwas_catalog_study_curation: ${datasets.gwas_catalog_dataset}/manifests/gwas_catalog_study_curation.tsv +# Harmonised summary statistics list: +gwas_catalog_summary_stats_list: ${datasets.gwas_catalog_dataset}/manifests/gwas_catalog_harmonised_sumstats_list.txt +# Inclusion lists: +gwas_catalog_curated_inclusion_list: ${datasets.gwas_catalog_dataset}/manifests/gwas_catalog_curated_included_studies +gwas_catalog_summary_satistics_inclusion_list: ${datasets.gwas_catalog_dataset}/manifests/gwas_catalog_summary_statistics_included_studies +# Ingestion output folders: +gwas_catalog_study_index: ${datasets.gwas_catalog_dataset}/study_index +gwas_catalog_study_locus_folder: ${datasets.gwas_catalog_dataset}/study_locus_datasets +gwas_catalog_credible_set_folder: ${datasets.gwas_catalog_dataset}/credible_set_datasets + # Input datasets chain_37_38: ${datasets.inputs}/v2g_input/grch37_to_grch38.over.chain target_index: ${datasets.inputs}/v2g_input/targets_correct_tss @@ -13,16 +36,6 @@ anderson: gs://genetics-portal-input/v2g_input/andersson2014/enhancer_tss_associ javierre: gs://genetics-portal-input/v2g_input/javierre_2016_preprocessed.parquet jung: 
gs://genetics-portal-raw/pchic_jung2019/jung2019_pchic_tableS3.csv thurman: gs://genetics-portal-input/v2g_input/thurman2012/genomewideCorrs_above0.7_promoterPlusMinus500kb_withGeneNames_32celltypeCategories.bed8.gz -catalog_associations: ${datasets.inputs}/v2d/gwas_catalog_v1.0.2-associations_e110_r2023-12-21.tsv -catalog_studies: - # To get a complete representation of all GWAS Catalog studies, we need to - # ingest the list of unpublished studies from a different file. - - ${datasets.inputs}/v2d/gwas-catalog-v1.0.3-studies-r2023-12-21.tsv - - ${datasets.inputs}/v2d/gwas-catalog-v1.0.3-unpublished-studies-r2023-12-21.tsv -catalog_ancestries: - - ${datasets.inputs}/v2d/gwas-catalog-v1.0.3-ancestries-r2023-12-21.tsv - - ${datasets.inputs}/v2d/gwas-catalog-v1.0.3-unpublished-ancestries-r2023-12-21.tsv -catalog_sumstats_lut: ${datasets.inputs}/v2d/harmonised_list-r2023-12-21.txt gene_interactions: ${datasets.inputs}/l2g/interaction # 23.09 data eqtl_catalogue_paths_imported: ${datasets.inputs}/preprocess/eqtl_catalogue/tabix_ftp_paths_imported.tsv @@ -37,7 +50,7 @@ study_locus_overlap: ${datasets.outputs}/study_locus_overlap ld_index: ${datasets.outputs}/ld_index catalog_study_index: ${datasets.study_index}/catalog catalog_study_locus: ${datasets.study_locus}/catalog_study_locus -gwas_catalog_study_curation: ${datasets.inputs}/v2d/GWAS_Catalog_study_curation.tsv + finngen_study_index: ${datasets.study_index}/finngen finngen_summary_stats: ${datasets.summary_statistics}/finngen from_sumstats_study_locus: ${datasets.study_locus}/from_sumstats diff --git a/config/step/ot_gwas_catalog_ingestion.yaml b/config/step/ot_gwas_catalog_ingestion.yaml index 65606b7e4..fc82b82c2 100644 --- a/config/step/ot_gwas_catalog_ingestion.yaml +++ b/config/step/ot_gwas_catalog_ingestion.yaml @@ -1,12 +1,12 @@ defaults: - gwas_catalog_ingestion -catalog_study_files: ${datasets.catalog_studies} -catalog_ancestry_files: ${datasets.catalog_ancestries} -catalog_associations_file: ${datasets.catalog_associations} -catalog_sumstats_lut: ${datasets.catalog_sumstats_lut} +catalog_study_files: ${datasets.gwas_catalog_studies} +catalog_ancestry_files: ${datasets.gwas_catalog_ancestries} +catalog_associations_file: ${datasets.gwas_catalog_associations} +catalog_sumstats_lut: ${datasets.gwas_catalog_sumstats_lut} variant_annotation_path: ${datasets.variant_annotation} -catalog_studies_out: ${datasets.catalog_study_index} -catalog_associations_out: ${datasets.catalog_study_locus} +catalog_studies_out: ${datasets.gwas_catalog_study_index} +catalog_associations_out: ${datasets.gwas_catalog_study_locus_folder}/gwas_catalog_curated_associations gwas_catalog_study_curation_file: ${datasets.gwas_catalog_study_curation} -inclusion_list_path: ??? 
+inclusion_list_path: ${datasets.gwas_catalog_curated_inclusion_list} diff --git a/config/step/ot_gwas_catalog_study_curation.yaml b/config/step/ot_gwas_catalog_study_curation.yaml index eb6c0ec78..77c1d7834 100644 --- a/config/step/ot_gwas_catalog_study_curation.yaml +++ b/config/step/ot_gwas_catalog_study_curation.yaml @@ -1,8 +1,8 @@ defaults: - gwas_catalog_study_curation -catalog_study_files: ${datasets.catalog_studies} -catalog_ancestry_files: ${datasets.catalog_ancestries} -catalog_sumstats_lut: ${datasets.catalog_sumstats_lut} +catalog_study_files: ${datasets.gwas_catalog_studies} +catalog_ancestry_files: ${datasets.gwas_catalog_ancestries} +catalog_sumstats_lut: ${datasets.gwas_catalog_sumstats_lut} gwas_catalog_study_curation_file: ${datasets.gwas_catalog_study_curation} gwas_catalog_study_curation_out: ??? diff --git a/config/step/ot_gwas_catalog_study_inclusion.yaml b/config/step/ot_gwas_catalog_study_inclusion.yaml index 8a560127e..7f3bf80b3 100644 --- a/config/step/ot_gwas_catalog_study_inclusion.yaml +++ b/config/step/ot_gwas_catalog_study_inclusion.yaml @@ -1,12 +1,12 @@ defaults: - gwas_catalog_study_inclusion -catalog_study_files: ${datasets.catalog_studies} -catalog_ancestry_files: ${datasets.catalog_ancestries} -catalog_associations_file: ${datasets.catalog_associations} +catalog_study_files: ${datasets.gwas_catalog_studies} +catalog_ancestry_files: ${datasets.gwas_catalog_ancestries} +catalog_associations_file: ${datasets.gwas_catalog_associations} variant_annotation_path: ${datasets.variant_annotation} gwas_catalog_study_curation_file: ${datasets.gwas_catalog_study_curation} -harmonised_study_file: ??? +harmonised_study_file: ${datasets.gwas_catalog_summary_stats_list} criteria: ??? inclusion_list_path: ??? exclusion_list_path: ??? diff --git a/config/step/ot_ld_based_clumping.yaml b/config/step/ot_ld_based_clumping.yaml index d25ca84b7..d02c0acdd 100644 --- a/config/step/ot_ld_based_clumping.yaml +++ b/config/step/ot_ld_based_clumping.yaml @@ -1,7 +1,7 @@ defaults: - ld_based_clumping +ld_index_path: ${datasets.ld_index} study_locus_input_path: ??? -ld_index_path: ??? study_index_path: ??? clumped_study_locus_output_path: ??? 
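
A note on the step configurations above: the `???` values are Hydra's marker for mandatory fields, so they have to be supplied as `step.*` overrides at launch time, while `ld_index_path` now picks up a default from `${datasets.ld_index}`. A minimal sketch of such an override list, mirroring the `other_args` lists the Airflow DAGs later in this patch pass to the cluster (the `bucket` and `ld_clumping_overrides` names are purely illustrative; the paths follow the `gwas_catalog_data` bucket layout introduced here):

    # Illustrative only: Hydra overrides filling the mandatory ("???") fields of
    # the ld_based_clumping step. step.ld_index_path is omitted, as it now
    # defaults to ${datasets.ld_index}.
    bucket = "gs://gwas_catalog_data"

    ld_clumping_overrides = [
        f"step.study_locus_input_path={bucket}/study_locus_datasets/gwas_catalog_summary_stats_window_clumped",
        f"step.study_index_path={bucket}/study_index",
        f"step.clumped_study_locus_output_path={bucket}/study_locus_datasets/gwas_catalog_summary_stats_ld_clumped",
    ]
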
diff --git a/docs/python_api/steps/ld_clump.md b/docs/python_api/steps/ld_clump.md index 75097bd8c..00fc8f2e4 100644 --- a/docs/python_api/steps/ld_clump.md +++ b/docs/python_api/steps/ld_clump.md @@ -2,4 +2,4 @@ title: LD-based clumping --- -::: gentropy.ld_based_clumping.LdBasedClumpingStep +::: gentropy.ld_based_clumping.LDBasedClumpingStep diff --git a/src/airflow/dags/gwas_catalog_harmonisation.py b/src/airflow/dags/gwas_catalog_harmonisation.py index 7e7790224..5713e223d 100644 --- a/src/airflow/dags/gwas_catalog_harmonisation.py +++ b/src/airflow/dags/gwas_catalog_harmonisation.py @@ -14,7 +14,9 @@ CLUSTER_NAME = "otg-gwascatalog-harmonisation" AUTOSCALING = "gwascatalog-harmonisation" -SUMMARY_STATS_BUCKET_NAME = "open-targets-gwas-summary-stats" +SUMMARY_STATS_BUCKET_NAME = "gwas_catalog_data" +RAW_SUMMARY_STATISTICS_PREFIX = "raw_summary_statistics" +HARMONISED_SUMMARY_STATISTICS_PREFIX = "harmonised_summary_statistics" with DAG( dag_id=Path(__file__).stem, @@ -26,14 +28,14 @@ list_inputs = GCSListObjectsOperator( task_id="list_raw_harmonised", bucket=SUMMARY_STATS_BUCKET_NAME, - prefix="raw-harmonised", + prefix=RAW_SUMMARY_STATISTICS_PREFIX, match_glob="**/*.h.tsv.gz", ) # List parquet files that have been previously processed list_outputs = GCSListObjectsOperator( task_id="list_harmonised_parquet", bucket=SUMMARY_STATS_BUCKET_NAME, - prefix="harmonised", + prefix=HARMONISED_SUMMARY_STATISTICS_PREFIX, match_glob="**/_SUCCESS", ) @@ -59,11 +61,15 @@ def create_to_do_list(**kwargs: Any) -> Any: print("Number of parquet files: ", len(parquets)) # noqa: T201 for path in raw_harmonised: match_result = re.search( - r"raw-harmonised/(.*)/(GCST\d+)/harmonised/(.*)\.h\.tsv\.gz", path + rf"{RAW_SUMMARY_STATISTICS_PREFIX}/(.*)/(GCST\d+)/harmonised/(.*)\.h\.tsv\.gz", + path, ) if match_result: study_id = match_result.group(2) - if f"harmonised/{study_id}.parquet/_SUCCESS" not in parquets: + if ( + f"{HARMONISED_SUMMARY_STATISTICS_PREFIX}/{study_id}.parquet/_SUCCESS" + not in parquets + ): to_do_list.append(path) print("Number of jobs to submit: ", len(to_do_list)) # noqa: T201 ti.xcom_push(key="to_do_list", value=to_do_list) @@ -85,7 +91,8 @@ def submit_jobs(**kwargs: Any) -> None: time.sleep(60) input_path = todo[i] match_result = re.search( - r"raw-harmonised/(.*)/(GCST\d+)/harmonised/(.*)\.h\.tsv\.gz", input_path + rf"{RAW_SUMMARY_STATISTICS_PREFIX}/(.*)/(GCST\d+)/harmonised/(.*)\.h\.tsv\.gz", + input_path, ) if match_result: study_id = match_result.group(2) @@ -95,7 +102,7 @@ def submit_jobs(**kwargs: Any) -> None: step_id="ot_gwas_catalog_sumstat_preprocess", other_args=[ f"step.raw_sumstats_path=gs://{SUMMARY_STATS_BUCKET_NAME}/{input_path}", - f"step.out_sumstats_path=gs://{SUMMARY_STATS_BUCKET_NAME}/harmonised/{study_id}.parquet", + f"step.out_sumstats_path=gs://{SUMMARY_STATS_BUCKET_NAME}/{HARMONISED_SUMMARY_STATISTICS_PREFIX}/{study_id}.parquet", ], ) diff --git a/src/airflow/dags/gwas_catalog_preprocess.py b/src/airflow/dags/gwas_catalog_preprocess.py index 33d4062a9..9df22586a 100644 --- a/src/airflow/dags/gwas_catalog_preprocess.py +++ b/src/airflow/dags/gwas_catalog_preprocess.py @@ -13,11 +13,44 @@ CLUSTER_NAME = "otg-preprocess-gwascatalog" AUTOSCALING = "otg-preprocess-gwascatalog" -RELEASEBUCKET = "gs://genetics_etl_python_playground/output/python_etl/parquet/XX.XX" -RELEASEBUCKET_NAME = "genetics_etl_python_playground" -SUMMARY_STATS_BUCKET_NAME = "open-targets-gwas-summary-stats" -SUMSTATS = "gs://open-targets-gwas-summary-stats/harmonised" -MANIFESTS_PATH = 
f"{RELEASEBUCKET}/manifests/" +# Setting up bucket name and output object names: +GWAS_CATALOG_BUCKET_NAME = "gwas_catalog_data" +HARMONISED_SUMSTATS_PREFIX = "harmonised_summary_statistics" + +# Manifest paths: +MANIFESTS_PATH = f"gs://{GWAS_CATALOG_BUCKET_NAME}/manifests/" + +# The name of the manifest files have to be consistent with the config file: +HARMONISED_SUMSTATS_LIST_OBJECT_NAME = ( + "manifests/gwas_catalog_harmonised_sumstats_list.txt" +) +HARMONISED_SUMSTATS_LIST_FULL_NAME = ( + f"gs://{GWAS_CATALOG_BUCKET_NAME}/{HARMONISED_SUMSTATS_LIST_OBJECT_NAME}" +) +CURATION_INCLUSION_NAME = f"{MANIFESTS_PATH}/gwas_catalog_curated_included_studies" +CURATION_EXCLUSION_NAME = f"{MANIFESTS_PATH}/gwas_catalog_curation_excluded_studies" +SUMMARY_STATISTICS_INCLUSION_NAME = ( + f"{MANIFESTS_PATH}/gwas_catalog_summary_statistics_included_studies" +) +SUMMARY_STATISTICS_EXCLUSION_NAME = ( + f"{MANIFESTS_PATH}/gwas_catalog_summary_statistics_excluded_studies" +) + +# Study index: +STUDY_INDEX = f"gs://{GWAS_CATALOG_BUCKET_NAME}/study_index" + +# Study loci: +CURATED_STUDY_LOCI = f"gs://{GWAS_CATALOG_BUCKET_NAME}/study_locus_datasets/gwas_catalog_curated_associations" +CURATED_LD_CLUMPED = f"gs://{GWAS_CATALOG_BUCKET_NAME}/study_locus_datasets/gwas_catalog_curated_associations_ld_clumped" +WINDOW_BASED_CLUMPED = f"gs://{GWAS_CATALOG_BUCKET_NAME}/study_locus_datasets/gwas_catalog_summary_stats_window_clumped" +LD_BASED_CLUMPED = f"gs://{GWAS_CATALOG_BUCKET_NAME}/study_locus_datasets/gwas_catalog_summary_stats_ld_clumped" +# Credible sets: +CURATED_CREDIBLE_SETS = ( + f"gs://{GWAS_CATALOG_BUCKET_NAME}/credible_set_datasets/gwas_catalog_curated" +) +SUMMARY_STATISTICS_CREDIBLE_SETS = ( + f"gs://{GWAS_CATALOG_BUCKET_NAME}/credible_set_datasets/gwas_catalog_summary_stats" +) def upload_harmonized_study_list( @@ -48,8 +81,8 @@ def upload_harmonized_study_list( # Getting list of folders (each a gwas study with summary statistics) list_harmonised_sumstats = GCSListObjectsOperator( task_id="list_harmonised_parquet", - bucket=SUMMARY_STATS_BUCKET_NAME, - prefix="harmonised", + bucket=GWAS_CATALOG_BUCKET_NAME, + prefix=HARMONISED_SUMSTATS_PREFIX, match_glob="**/_SUCCESS", ) @@ -59,8 +92,8 @@ def upload_harmonized_study_list( python_callable=upload_harmonized_study_list, op_kwargs={ "concatenated_studies": '{{ "\n".join(ti.xcom_pull( key="return_value", task_ids="list_harmonised_parquet")) }}', - "bucket_name": RELEASEBUCKET_NAME, - "object_name": "output/python_etl/parquet/XX.XX/manifests/harmonised_sumstats.txt", + "bucket_name": GWAS_CATALOG_BUCKET_NAME, + "object_name": HARMONISED_SUMSTATS_LIST_OBJECT_NAME, }, ) @@ -73,9 +106,9 @@ def upload_harmonized_study_list( task_id="catalog_curation_inclusion_list", other_args=[ "step.criteria=curation", - f"step.inclusion_list_path={MANIFESTS_PATH}manifest_curation", - f"step.exclusion_list_path={MANIFESTS_PATH}exclusion_curation", - f"step.harmonised_study_file={MANIFESTS_PATH}harmonised_sumstats.txt", + f"step.inclusion_list_path={CURATION_INCLUSION_NAME}", + f"step.exclusion_list_path={CURATION_EXCLUSION_NAME}", + f"step.harmonised_study_file={HARMONISED_SUMSTATS_LIST_FULL_NAME}", ], ) @@ -84,7 +117,7 @@ def upload_harmonized_study_list( cluster_name=CLUSTER_NAME, step_id="ot_gwas_catalog_ingestion", task_id="ingest_curated_gwas_catalog_data", - other_args=[f"step.inclusion_list_path={MANIFESTS_PATH}manifest_curation"], + other_args=[f"step.inclusion_list_path={CURATION_INCLUSION_NAME}"], ) # Run LD-annotation and clumping on curated data: @@ -93,10 +126,9 
@@ def upload_harmonized_study_list( step_id="ot_ld_based_clumping", task_id="catalog_curation_ld_clumping", other_args=[ - f"step.study_locus_input_path={RELEASEBUCKET}/study_locus/catalog_curated", - f"step.ld_index_path={RELEASEBUCKET}/ld_index", - f"step.study_index_path={RELEASEBUCKET}/study_index/catalog", - f"step.clumped_study_locus_output_path={RELEASEBUCKET}/study_locus/ld_clumped/catalog_curated", + f"step.study_locus_input_path={CURATED_STUDY_LOCI}", + f"step.study_index_path={STUDY_INDEX}", + f"step.clumped_study_locus_output_path={CURATED_LD_CLUMPED}", ], ) @@ -106,8 +138,8 @@ def upload_harmonized_study_list( step_id="ot_pics", task_id="catalog_curation_pics", other_args=[ - f"step.study_locus_ld_annotated_in={RELEASEBUCKET}/study_locus/ld_clumped/catalog_curated", - f"step.picsed_study_locus_out={RELEASEBUCKET}/credible_set/catalog_curated", + f"step.study_locus_ld_annotated_in={CURATED_LD_CLUMPED}", + f"step.picsed_study_locus_out={CURATED_CREDIBLE_SETS}", ], ) @@ -130,9 +162,9 @@ def upload_harmonized_study_list( task_id="catalog_sumstats_inclusion_list", other_args=[ "step.criteria=summary_stats", - f"step.inclusion_list_path={MANIFESTS_PATH}manifest_sumstats", - f"step.exclusion_list_path={MANIFESTS_PATH}exclusion_sumstats", - f"step.harmonised_study_file={MANIFESTS_PATH}harmonised_sumstats.txt", + f"step.inclusion_list_path={SUMMARY_STATISTICS_INCLUSION_NAME}", + f"step.exclusion_list_path={SUMMARY_STATISTICS_EXCLUSION_NAME}", + f"step.harmonised_study_file={HARMONISED_SUMSTATS_LIST_FULL_NAME}", ], ) @@ -142,9 +174,9 @@ def upload_harmonized_study_list( step_id="ot_window_based_clumping", task_id="catalog_sumstats_window_clumping", other_args=[ - f"step.summary_statistics_input_path={SUMSTATS}", - f"step.study_locus_output_path={RELEASEBUCKET}/study_locus/window_clumped/from_sumstats/catalog", - f"step.inclusion_list_path={MANIFESTS_PATH}manifest_sumstats", + f"step.summary_statistics_input_path=gs://{GWAS_CATALOG_BUCKET_NAME}/{HARMONISED_SUMSTATS_PREFIX}", + f"step.inclusion_list_path={SUMMARY_STATISTICS_INCLUSION_NAME}", + f"step.study_locus_output_path={WINDOW_BASED_CLUMPED}", ], ) @@ -154,10 +186,9 @@ def upload_harmonized_study_list( step_id="ot_ld_based_clumping", task_id="catalog_sumstats_ld_clumping", other_args=[ - f"step.study_locus_input_path={RELEASEBUCKET}/study_locus/window_clumped/from_sumstats/catalog", - f"step.ld_index_path={RELEASEBUCKET}/ld_index", - f"step.study_index_path={RELEASEBUCKET}/study_index/catalog", - f"step.clumped_study_locus_output_path={RELEASEBUCKET}/study_locus/ld_clumped/from_sumstats/catalog", + f"step.study_locus_input_path={WINDOW_BASED_CLUMPED}", + f"step.study_index_path={STUDY_INDEX}", + f"step.clumped_study_locus_output_path={LD_BASED_CLUMPED}", ], ) @@ -167,8 +198,8 @@ def upload_harmonized_study_list( step_id="ot_pics", task_id="catalog_sumstats_pics", other_args=[ - f"step.study_locus_ld_annotated_in={RELEASEBUCKET}/study_locus/ld_clumped/from_sumstats/catalog", - f"step.picsed_study_locus_out={RELEASEBUCKET}/credible_set/from_sumstats/catalog", + f"step.study_locus_ld_annotated_in={LD_BASED_CLUMPED}", + f"step.picsed_study_locus_out={SUMMARY_STATISTICS_CREDIBLE_SETS}", ], ) diff --git a/src/gentropy/config.py b/src/gentropy/config.py index ba213a349..ac9ce821d 100644 --- a/src/gentropy/config.py +++ b/src/gentropy/config.py @@ -78,7 +78,7 @@ class GWASCatalogStudyInclusionConfig(StepConfig): inclusion_list_path: str = MISSING exclusion_list_path: str = MISSING _target_: str = ( - 
"gentropy.gwas_catalog_study_inclusion.GWASCatalogStudyInclusionStep" + "gentropy.gwas_catalog_study_inclusion.GWASCatalogStudyInclusionGenerator" ) @@ -301,7 +301,7 @@ class WindowBasedClumpingStep(StepConfig): inclusion_list_path: str = MISSING locus_collect_distance: str | None = None - _target_: str = "gentropy.clump.WindowBasedClumpingStep" + _target_: str = "gentropy.window_based_clumping.WindowBasedClumpingStep" @dataclass diff --git a/src/gentropy/ld_based_clumping.py b/src/gentropy/ld_based_clumping.py index ea9646806..e6a477a89 100644 --- a/src/gentropy/ld_based_clumping.py +++ b/src/gentropy/ld_based_clumping.py @@ -7,7 +7,7 @@ from gentropy.dataset.study_locus import StudyLocus -class LdBasedClumpingStep: +class LDBasedClumpingStep: """Step to perform LD-based clumping on study locus dataset. As a first step, study locus is enriched with population specific linked-variants. diff --git a/utils/update_GWAS_Catalog_data.sh b/utils/update_GWAS_Catalog_data.sh index 98ac4cc36..1e380d30c 100755 --- a/utils/update_GWAS_Catalog_data.sh +++ b/utils/update_GWAS_Catalog_data.sh @@ -1,6 +1,5 @@ #!/usr/bin/env bash - # Function to get the most recent date: get_most_recent(){ cat $1 | perl -lane 'push @a, $_ if $_ =~ /^\d+$/; END {@a = sort { $a <=> $b} @a; print pop @a }' @@ -21,13 +20,47 @@ get_release_info(){ logging(){ log_prompt="[$(date "+%Y.%m.%d %H:%M")]" - echo "${log_prompt} $@" + echo "${log_prompt} $@" >> ${LOG_FILE} +} + +upload_file_to_gcp(){ + FILENAME=${1} + TARGET=${2} + # Test if file exists: + if [ ! -f ${FILENAME} ]; then + logging "File ${FILENAME} does not exist." + return + fi + + logging "Copying ${FILENAME} to GCP..." + gsutil -mq cp file://$(pwd)/${FILENAME} ${TARGET} + + # Test if file was successfully uploaded: + if [ $? -ne 0 ]; then + logging "File ${FILENAME} failed to upload." + fi } # Resources: export BASE_URL=ftp://ftp.ebi.ac.uk/pub/databases/gwas export RELEASE_INFO_URL=https://www.ebi.ac.uk/gwas/api/search/stats -export GCP_TARGET=gs://genetics_etl_python_playground/input/v2d/ +export GCP_TARGET=gs://gwas_catalog_data +export LOG_FILE=gwas_catalog_data_update.log + +export GWAS_CATALOG_STUDY_CURATION_URL=https://raw.githubusercontent.com/opentargets/curation/master/genetics/GWAS_Catalog_study_curation.tsv + +ASSOCIATION_FILE=gwas_catalog_associations_ontology_annotated.tsv +PUBLISHED_STUDIES_FILE=gwas_catalog_download_studies.tsv +PUBLISHED_ANCESTRIES_FILE=gwas_catalog_download_ancestries.tsv +UNPUBLISHED_STUDIES_FILE=gwas_catalog_unpublished_studies.tsv +UNPUBLISHED_ANCESTRIES_FILE=gwas_catalog_unpublished_ancestries.tsv +HARMONISED_LIST_FILE=harmonised_list.txt +GWAS_CATALOG_STUDY_CURATION_FILE=gwas_catalog_study_curation.tsv + +# Remove log file if exists: +if [ -f ${LOG_FILE} ]; then + rm -rf ${LOG_FILE} +fi logging "Extracing data from: ${BASE_URL}" logging "Release info fetched fom: ${RELEASE_INFO_URL}" @@ -47,36 +80,49 @@ RELEASE_URL=${BASE_URL}/releases/${YEAR}/${MONTH}/${DAY} logging "Datafiles are fetching from ${RELEASE_URL}" # Fetching files while assigning properly dated and annotated names: -wget -q ${RELEASE_URL}/gwas-catalog-associations_ontology-annotated.tsv \ - -O gwas_catalog_v1.0.2-associations_e${ENSEMBL}_r${YEAR}-${MONTH}-${DAY}.tsv -logging "File gwas_catalog_v1.0.2-associations_e${ENSEMBL}_r${YEAR}-${MONTH}-${DAY}.tsv saved." +wget -q ${RELEASE_URL}/gwas-catalog-associations_ontology-annotated.tsv -O ${ASSOCIATION_FILE} +logging "File ${ASSOCIATION_FILE} saved." 
-wget -q ${RELEASE_URL}/gwas-catalog-download-studies-v1.0.3.txt \ - -O gwas-catalog-v1.0.3-studies-r${YEAR}-${MONTH}-${DAY}.tsv -logging "File gwas-catalog-v1.0.3-studies-r${YEAR}-${MONTH}-${DAY}.tsv saved." +wget -q ${RELEASE_URL}/gwas-catalog-download-studies-v1.0.3.txt -O ${PUBLISHED_STUDIES_FILE} +logging "File ${PUBLISHED_STUDIES_FILE} saved." -wget -q ${RELEASE_URL}/gwas-catalog-unpublished-studies-v1.0.3.tsv \ - -O gwas-catalog-v1.0.3-unpublished-studies-r${YEAR}-${MONTH}-${DAY}.tsv -logging "File gwas-catalog-v1.0.3-unpublished-studies-r${YEAR}-${MONTH}-${DAY}.tsv saved." +wget -q ${RELEASE_URL}/gwas-catalog-unpublished-studies-v1.0.3.tsv -O ${UNPUBLISHED_STUDIES_FILE} +logging "File ${UNPUBLISHED_STUDIES_FILE} saved." -wget -q ${RELEASE_URL}/gwas-catalog-download-ancestries-v1.0.3.txt \ - -O gwas-catalog-v1.0.3-ancestries-r${YEAR}-${MONTH}-${DAY}.tsv -logging "File gwas-catalog-v1.0.3-ancestries-r${YEAR}-${MONTH}-${DAY}.tsv saved." +wget -q ${RELEASE_URL}/gwas-catalog-download-ancestries-v1.0.3.txt -O ${PUBLISHED_ANCESTRIES_FILE} +logging "File ${PUBLISHED_ANCESTRIES_FILE} saved." -wget -q ${RELEASE_URL}/gwas-catalog-unpublished-ancestries-v1.0.3.tsv \ - -O gwas-catalog-v1.0.3-unpublished-ancestries-r${YEAR}-${MONTH}-${DAY}.tsv -logging "File gwas-catalog-v1.0.3-unpublished-ancestries-r${YEAR}-${MONTH}-${DAY}.tsv saved." +wget -q ${RELEASE_URL}/gwas-catalog-unpublished-ancestries-v1.0.3.tsv -O ${UNPUBLISHED_ANCESTRIES_FILE} +logging "File ${UNPUBLISHED_ANCESTRIES_FILE} saved." +wget -q ${BASE_URL}/summary_statistics/harmonised_list.txt -O ${HARMONISED_LIST_FILE} +logging "File ${HARMONISED_LIST_FILE} saved." -wget -q ${BASE_URL}/summary_statistics/harmonised_list.txt -O harmonised_list-r${YEAR}-${MONTH}-${DAY}.txt -logging "File harmonised_list-r${YEAR}-${MONTH}-${DAY}.txt saved." +wget -q ${GWAS_CATALOG_STUDY_CURATION_URL} -O ${GWAS_CATALOG_STUDY_CURATION_FILE} +logging "In-house GWAS Catalog study curation file fetched from GitHub." logging "Copying files to GCP..." -gsutil -mq cp file://$(pwd)/gwas_catalog_v1.0.2-associations_e${ENSEMBL}_r${YEAR}-${MONTH}-${DAY}.tsv ${GCP_TARGET}/ -gsutil -mq cp file://$(pwd)/gwas-catalog-v1.0.3-studies-r${YEAR}-${MONTH}-${DAY}.tsv ${GCP_TARGET}/ -gsutil -mq cp file://$(pwd)/gwas-catalog-v1.0.3-ancestries-r${YEAR}-${MONTH}-${DAY}.tsv ${GCP_TARGET}/ -gsutil -mq cp file://$(pwd)/harmonised_list-r${YEAR}-${MONTH}-${DAY}.txt ${GCP_TARGET}/ -gsutil -mq cp file://$(pwd)/gwas-catalog-v1.0.3-unpublished-studies-r${YEAR}-${MONTH}-${DAY}.tsv ${GCP_TARGET}/ -gsutil -mq cp file://$(pwd)/gwas-catalog-v1.0.3-unpublished-ancestries-r${YEAR}-${MONTH}-${DAY}.tsv ${GCP_TARGET}/ - -logging "Done." + +upload_file_to_gcp ${ASSOCIATION_FILE} ${GCP_TARGET}/curated_inputs/ +upload_file_to_gcp ${PUBLISHED_STUDIES_FILE} ${GCP_TARGET}/curated_inputs/ +upload_file_to_gcp ${PUBLISHED_ANCESTRIES_FILE} ${GCP_TARGET}/curated_inputs/ +upload_file_to_gcp ${HARMONISED_LIST_FILE} ${GCP_TARGET}/curated_inputs/ +upload_file_to_gcp ${UNPUBLISHED_STUDIES_FILE} ${GCP_TARGET}/curated_inputs/ +upload_file_to_gcp ${UNPUBLISHED_ANCESTRIES_FILE} ${GCP_TARGET}/curated_inputs/ +upload_file_to_gcp ${GWAS_CATALOG_STUDY_CURATION_FILE} ${GCP_TARGET}/manifests/ + + +logging "Files successfully uploaded." +logging "Removing local files..." 
+rm ${ASSOCIATION_FILE} \ + ${PUBLISHED_STUDIES_FILE} \ + ${PUBLISHED_ANCESTRIES_FILE} \ + ${HARMONISED_LIST_FILE} \ + ${UNPUBLISHED_STUDIES_FILE} \ + ${UNPUBLISHED_ANCESTRIES_FILE} \ + ${GWAS_CATALOG_STUDY_CURATION_FILE} + +# Uploading log file to GCP manifest folder: +logging "Uploading log file to GCP manifest folder..." +upload_file_to_gcp ${LOG_FILE} ${GCP_TARGET}/manifests/ +cat $LOG_FILE From 82107ec28246899c89154a1d3d3238c065475244 Mon Sep 17 00:00:00 2001 From: Daniel Considine Date: Thu, 1 Feb 2024 20:22:53 +0000 Subject: [PATCH 04/33] feat: index and block matrix extraction for studyLocus --- .pre-commit-config.yaml | 4 +- src/gentropy/dataset/study_index.py | 11 ++++++ src/gentropy/dataset/study_locus.py | 58 ++++++++++++++++++++++++++++ src/gentropy/datasource/gnomad/ld.py | 2 + 4 files changed, 73 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index c5330c4fe..c78b367b7 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.1.13 + rev: v0.1.14 hooks: - id: ruff args: @@ -91,6 +91,6 @@ repos: - id: beautysh - repo: https://github.com/jsh9/pydoclint - rev: 0.3.8 + rev: 0.3.9 hooks: - id: pydoclint diff --git a/src/gentropy/dataset/study_index.py b/src/gentropy/dataset/study_index.py index d048157d6..89a914c4b 100644 --- a/src/gentropy/dataset/study_index.py +++ b/src/gentropy/dataset/study_index.py @@ -183,3 +183,14 @@ def has_summarystats(self: StudyIndex) -> Column: Column: True if the study has harmonized summary statistics. """ return self.df.hasSumstats + + def get_major_pop(self: StudyIndex) -> DataFrame: + """Extract major population from ldPopulationStructure rows with multiple ancestries. + + Returns: + DataFrame: Columns studyId and the extracted major population from ldPopulationStructure. + """ + return self.df.select("studyId", "ldPopulationStructure").withColumn( + "ldPopulationStructure", + f.array_max(f.col("ldPopulationStructure")).getItem("ldPopulation"), + ) diff --git a/src/gentropy/dataset/study_locus.py b/src/gentropy/dataset/study_locus.py index 41c099959..050246187 100644 --- a/src/gentropy/dataset/study_locus.py +++ b/src/gentropy/dataset/study_locus.py @@ -5,7 +5,9 @@ from enum import Enum from typing import TYPE_CHECKING +import numpy as np import pyspark.sql.functions as f +from hail.linalg import BlockMatrix from gentropy.common.schemas import parse_spark_schema from gentropy.common.spark_helpers import ( @@ -14,12 +16,14 @@ ) from gentropy.dataset.dataset import Dataset from gentropy.dataset.study_locus_overlap import StudyLocusOverlap +from gentropy.datasource.gnomad.ld import GnomADLDMatrix from gentropy.method.clump import LDclumping if TYPE_CHECKING: from pyspark.sql import Column, DataFrame from pyspark.sql.types import StructType + from gentropy.common.session import Session from gentropy.dataset.ld_index import LDIndex from gentropy.dataset.study_index import StudyIndex @@ -453,6 +457,60 @@ def clump(self: StudyLocus) -> StudyLocus: ) return self + def get_gnomad_matrix( + self: StudyLocus, session: Session, study_index: StudyIndex, window_size: int + ) -> tuple[DataFrame, np.ndarray]: + """Extract hail matrix index and block matrix from StudyLocus rows. 
+ + Args: + session (Session): Spark session + study_index (StudyIndex): Study index + window_size (int): Window size to extract from gnomad matrix + + Returns: + tuple[DataFrame, np.ndarray]: Returns the index of the gnomad matrix and the gnomad block matrix + """ + _df = ( + self.df.join(study_index.get_major_pop(), on="studyId", how="left") + .withColumn("start", f.col("position") - (window_size / 2)) + .withColumn("end", f.col("position") + (window_size / 2)) + .alias("_df") + ) + _pop = _df.collect()[0]["ldPopulationStructure"] + + _matrix_index = session.spark.read.parquet( + GnomADLDMatrix.ld_index_38_template.format(POP=_pop) + ) + _index_joined = ( + _df.alias("df") + .join( + _matrix_index.alias("matrix_index"), + (f.col("df.chromosome") == f.col("matrix_index.chromosome")) + & (f.col("df.start") <= f.col("matrix_index.position")) + & (f.col("df.end") >= f.col("matrix_index.position")), + ) + .select( + "matrix_index.chromosome", + "matrix_index.position", + "referenceAllele", + "alternateAllele", + "idx", + ) + ).persist() + _idx = ( + _index_joined.sort("idx").select("idx").rdd.flatMap(lambda x: x).collect() + ) + _block_matrix = ( + BlockMatrix.read(GnomADLDMatrix.ld_matrix_template.format(POP=_pop)) + .filter(_idx, _idx) + .to_numpy() + ) + _block_matrix = ( + _block_matrix + _block_matrix.T - np.diag(np.diag(_block_matrix)) + ) + + return _index_joined, _block_matrix + def _qc_unresolved_ld( self: StudyLocus, ) -> StudyLocus: diff --git a/src/gentropy/datasource/gnomad/ld.py b/src/gentropy/datasource/gnomad/ld.py index 1d2d6c18f..858751690 100644 --- a/src/gentropy/datasource/gnomad/ld.py +++ b/src/gentropy/datasource/gnomad/ld.py @@ -28,12 +28,14 @@ class GnomADLDMatrix: Attributes: ld_matrix_template (str): Template for the LD matrix path. Defaults to "gs://gcp-public-data--gnomad/release/2.1.1/ld/gnomad.genomes.r2.1.1.{POP}.common.adj.ld.bm". ld_index_raw_template (str): Template for the LD index path. Defaults to "gs://gcp-public-data--gnomad/release/2.1.1/ld/gnomad.genomes.r2.1.1.{POP}.common.ld.variant_indices.ht". + ld_index_38_template (str): Template for the LD index path in build 38. Defaults to "gs://genetics_etl_python_playground/input/ld/gnomad_r2.1.1.{POP}.common.ld.variant_indices.parquet" grch37_to_grch38_chain_path (str): Path to the chain file used to lift over the coordinates. Defaults to "gs://hail-common/references/grch37_to_grch38.over.chain.gz". ld_populations (list[str]): List of populations to use to build the LDIndex. Defaults to ["afr", "amr", "asj", "eas", "fin", "nfe", "nwe", "seu"]. 
""" ld_matrix_template: str = "gs://gcp-public-data--gnomad/release/2.1.1/ld/gnomad.genomes.r2.1.1.{POP}.common.adj.ld.bm" ld_index_raw_template: str = "gs://gcp-public-data--gnomad/release/2.1.1/ld/gnomad.genomes.r2.1.1.{POP}.common.ld.variant_indices.ht" + ld_index_38_template: str = "gs://genetics_etl_python_playground/input/ld/gnomad_r2.1.1.{POP}.common.ld.variant_indices.parquet" grch37_to_grch38_chain_path: str = ( "gs://hail-common/references/grch37_to_grch38.over.chain.gz" ) From bce30d264a82264953758bf7b62cb70ce3182255 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 1 Feb 2024 21:39:10 +0000 Subject: [PATCH 05/33] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/dataset/test_pairwise_ld.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/dataset/test_pairwise_ld.py b/tests/dataset/test_pairwise_ld.py index b6b7daf46..91f7475e8 100644 --- a/tests/dataset/test_pairwise_ld.py +++ b/tests/dataset/test_pairwise_ld.py @@ -6,11 +6,10 @@ import numpy as np import pytest +from otg.dataset.pairwise_ld import PairwiseLD from pyspark.sql import functions as f from pyspark.sql.window import Window -from otg.dataset.pairwise_ld import PairwiseLD - if TYPE_CHECKING: from pyspark.sql import SparkSession From 4d09a601f573d0660d3104cf9f47bdefa8d90156 Mon Sep 17 00:00:00 2001 From: Daniel Considine Date: Thu, 1 Feb 2024 21:49:05 +0000 Subject: [PATCH 06/33] chore: updating some test files to gentropy --- tests/dataset/test_pairwise_ld.py | 2 +- tests/datasource/gnomad/test_gnomad_ld.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/dataset/test_pairwise_ld.py b/tests/dataset/test_pairwise_ld.py index 91f7475e8..bd8ad7068 100644 --- a/tests/dataset/test_pairwise_ld.py +++ b/tests/dataset/test_pairwise_ld.py @@ -6,7 +6,7 @@ import numpy as np import pytest -from otg.dataset.pairwise_ld import PairwiseLD +from gentropy.dataset.pairwise_ld import PairwiseLD from pyspark.sql import functions as f from pyspark.sql.window import Window diff --git a/tests/datasource/gnomad/test_gnomad_ld.py b/tests/datasource/gnomad/test_gnomad_ld.py index bc1b63871..25336e13f 100644 --- a/tests/datasource/gnomad/test_gnomad_ld.py +++ b/tests/datasource/gnomad/test_gnomad_ld.py @@ -7,7 +7,7 @@ import hail as hl import pytest -from genotropy.dataset.pairwise_ld import PairwiseLD +from gentropy.dataset.pairwise_ld import PairwiseLD from gentropy.datasource.gnomad.ld import GnomADLDMatrix from pyspark.sql import DataFrame, SparkSession from pyspark.sql import functions as f From f7dfba176a1b25796d5a75f4ed261fc75c918981 Mon Sep 17 00:00:00 2001 From: Daniel Considine Date: Thu, 1 Feb 2024 23:20:32 +0000 Subject: [PATCH 07/33] chore: updating tests --- tests/datasource/gnomad/test_gnomad_ld.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/datasource/gnomad/test_gnomad_ld.py b/tests/datasource/gnomad/test_gnomad_ld.py index 25336e13f..d75639ec1 100644 --- a/tests/datasource/gnomad/test_gnomad_ld.py +++ b/tests/datasource/gnomad/test_gnomad_ld.py @@ -91,7 +91,7 @@ def _setup(self: TestGnomADLDMatrixVariants, spark: SparkSession) -> None: ) @pytest.fixture(scope="class") - def ld_slice(self: TestGnomADLDMatrixVariants) -> PairwiseLD: + def ld_slice(self: TestGnomADLDMatrixVariants) -> DataFrame | None: """Generate a resolved LD slice.""" return self.gnomad_ld_matrix.get_ld_variants( gnomad_ancestry="test-pop", # observed[0], 
From 3a629ef38b95226353284566236e4a92a7133014 Mon Sep 17 00:00:00 2001 From: Daniel Considine Date: Fri, 2 Feb 2024 13:09:33 +0000 Subject: [PATCH 08/33] chore: updating pairwise_ld_schema for tests --- src/gentropy/assets/schemas/pairwise_ld.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/gentropy/assets/schemas/pairwise_ld.json b/src/gentropy/assets/schemas/pairwise_ld.json index 2d5351d82..bac781ac3 100644 --- a/src/gentropy/assets/schemas/pairwise_ld.json +++ b/src/gentropy/assets/schemas/pairwise_ld.json @@ -2,13 +2,13 @@ "fields": [ { "metadata": {}, - "name": "variantId_i", + "name": "variantIdI", "nullable": false, "type": "string" }, { "metadata": {}, - "name": "variantId_j", + "name": "variantIdJ", "nullable": false, "type": "string" }, From 6ff6b1c03726bc5d461f5b57c015ca42ccc2367b Mon Sep 17 00:00:00 2001 From: Daniel Considine Date: Fri, 2 Feb 2024 13:24:19 +0000 Subject: [PATCH 09/33] chore: updating pairwise_ld tests --- src/gentropy/dataset/pairwise_ld.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/src/gentropy/dataset/pairwise_ld.py b/src/gentropy/dataset/pairwise_ld.py index fc94afa85..221522d0b 100644 --- a/src/gentropy/dataset/pairwise_ld.py +++ b/src/gentropy/dataset/pairwise_ld.py @@ -32,8 +32,8 @@ def __post_init__(self: PairwiseLD) -> None: """ row_count = self.df.count() - assert int(sqrt(row_count)) == sqrt( - row_count + assert ( + int(sqrt(row_count)) == sqrt(row_count) ), f"The number of rows in a pairwise LD table has to be square. Got: {row_count}" self.dimension = (int(sqrt(row_count)), int(sqrt(row_count))) @@ -74,12 +74,8 @@ def r_to_numpy_matrix(self) -> np.ndarray: """ return np.array( self.df.select( - f.split("variantId_i", "_")[1] - .cast(t.IntegerType()) - .alias("position_i"), - f.split("variantId_j", "_")[1] - .cast(t.IntegerType()) - .alias("position_j"), + f.split("variantIdI", "_")[1].cast(t.IntegerType()).alias("position_i"), + f.split("variantIdJ", "_")[1].cast(t.IntegerType()).alias("position_j"), "r", ) .orderBy(f.col("position_i").asc(), f.col("position_j").asc()) @@ -97,8 +93,8 @@ def get_variant_list(self) -> list[str]: row["variantId"] for row in ( self.df.select( - f.col("variantId_i").alias("variantId"), - f.split(f.col("variantId_i"), "_")[1] + f.col("variantIdI").alias("variantId"), + f.split(f.col("variantIdI"), "_")[1] .cast(t.IntegerType()) .alias("position"), ) From 3fc9cfaeb81e3bc93d331b2eae71af93d7616037 Mon Sep 17 00:00:00 2001 From: Daniel Considine Date: Fri, 2 Feb 2024 13:49:57 +0000 Subject: [PATCH 10/33] chore: fix ld_pairwise tests --- tests/dataset/test_pairwise_ld.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/dataset/test_pairwise_ld.py b/tests/dataset/test_pairwise_ld.py index bd8ad7068..11ebf75ca 100644 --- a/tests/dataset/test_pairwise_ld.py +++ b/tests/dataset/test_pairwise_ld.py @@ -39,16 +39,16 @@ def mock_pairwise_ld(self: TestPairwiseLD, spark: SparkSession) -> PairwiseLD: data = [(v1, v2) for v1 in self.variants for v2 in self.variants] return PairwiseLD( _df=( - spark.createDataFrame(data, ["variantId_i", "variantId_j"]) + spark.createDataFrame(data, ["variantIdI", "variantIdJ"]) .withColumn( "r", f.row_number() - .over(Window.partitionBy(f.lit("x")).orderBy("variantId_i")) + .over(Window.partitionBy(f.lit("x")).orderBy("variantIdI")) .cast("double"), ) .withColumn( "r", - f.when(f.col("variantId_i") == f.col("variantId_j"), 1.0).otherwise( + f.when(f.col("variantIdI") == f.col("variantIdJ"), 
1.0).otherwise( f.col("r") ), ) From 0d6802057d373ea84af9b242804eb2a7e4c64317 Mon Sep 17 00:00:00 2001 From: Daniel Considine Date: Fri, 2 Feb 2024 15:28:41 +0000 Subject: [PATCH 11/33] chore: fix pairwise_ld tests --- src/gentropy/dataset/pairwise_ld.py | 4 ++-- tests/datasource/gnomad/test_gnomad_ld.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/gentropy/dataset/pairwise_ld.py b/src/gentropy/dataset/pairwise_ld.py index 221522d0b..9650efa32 100644 --- a/src/gentropy/dataset/pairwise_ld.py +++ b/src/gentropy/dataset/pairwise_ld.py @@ -59,8 +59,8 @@ def overlap_with_locus(self: PairwiseLD, locus_variants: list[str]) -> PairwiseL return PairwiseLD( _df=( self.df.filter( - f.col("variantId_i").isin(locus_variants) - & f.col("variantId_j").isin(locus_variants) + f.col("variantIdI").isin(locus_variants) + & f.col("variantIdJ").isin(locus_variants) ) ), _schema=PairwiseLD.get_schema(), diff --git a/tests/datasource/gnomad/test_gnomad_ld.py b/tests/datasource/gnomad/test_gnomad_ld.py index d75639ec1..eaeb6ebaa 100644 --- a/tests/datasource/gnomad/test_gnomad_ld.py +++ b/tests/datasource/gnomad/test_gnomad_ld.py @@ -64,7 +64,7 @@ def test_resolve_variant_indices( ) expected_df = spark.createDataFrame( expected, - ["r", "variantId_i", "chromosome", "variantId_j"], + ["r", "variantIdI", "chromosome", "variantIdJ"], ) observed_df = GnomADLDMatrix._resolve_variant_indices(ld_index, ld_matrix) assert ( From 07260a7897e450049a02a3d39584ae1648f7322e Mon Sep 17 00:00:00 2001 From: Daniel Considine Date: Fri, 2 Feb 2024 16:11:36 +0000 Subject: [PATCH 12/33] chore: fix tests --- src/gentropy/datasource/gnomad/ld.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/src/gentropy/datasource/gnomad/ld.py b/src/gentropy/datasource/gnomad/ld.py index 858751690..870446623 100644 --- a/src/gentropy/datasource/gnomad/ld.py +++ b/src/gentropy/datasource/gnomad/ld.py @@ -220,9 +220,9 @@ def _resolve_variant_indices( DataFrame: Dataframe with variant IDs instead of `i` and `j` indices """ ld_index_i = ld_index.selectExpr( - "idx as i", "variantId as variantId_i", "chromosome" + "idx as i", "variantId as variantIdI", "chromosome" ) - ld_index_j = ld_index.selectExpr("idx as j", "variantId as variantId_j") + ld_index_j = ld_index.selectExpr("idx as j", "variantId as variantIdJ") return ( ld_matrix.join(ld_index_i, on="i", how="inner") .join(ld_index_j, on="j", how="inner") @@ -246,11 +246,11 @@ def _transpose_ld_matrix(ld_matrix: DataFrame) -> DataFrame: ... (1, 2, 0.5, "1", "AFR"), ... (2, 2, 1.0, "1", "AFR"), ... ], - ... ["variantId_i", "variantId_j", "r", "chromosome", "population"], + ... ["variantIdI", "variantIdJ", "r", "chromosome", "population"], ... 
) >>> GnomADLDMatrix._transpose_ld_matrix(df).show() +-----------+-----------+---+----------+----------+ - |variantId_i|variantId_j| r|chromosome|population| + |variantIdI|variantIdJ| r|chromosome|population| +-----------+-----------+---+----------+----------+ | 1| 2|0.5| 1| AFR| | 1| 1|1.0| 1| AFR| @@ -260,15 +260,15 @@ def _transpose_ld_matrix(ld_matrix: DataFrame) -> DataFrame: """ ld_matrix_transposed = ld_matrix.selectExpr( - "variantId_i as variantId_j", - "variantId_j as variantId_i", + "variantIdI as variantIdJ", + "variantIdJ as variantIdI", "r", "chromosome", "population", ) - return ld_matrix.filter( - f.col("variantId_i") != f.col("variantId_j") - ).unionByName(ld_matrix_transposed) + return ld_matrix.filter(f.col("variantIdI") != f.col("variantIdJ")).unionByName( + ld_matrix_transposed + ) def as_ld_index( self: GnomADLDMatrix, @@ -309,8 +309,8 @@ def as_ld_index( GnomADLDMatrix._transpose_ld_matrix( reduce(lambda df1, df2: df1.unionByName(df2), ld_indices_unaggregated) ) - .withColumnRenamed("variantId_i", "variantId") - .withColumnRenamed("variantId_j", "tagVariantId") + .withColumnRenamed("variantIdI", "variantId") + .withColumnRenamed("variantIdJ", "tagVariantId") ) return LDIndex( _df=self._aggregate_ld_index_across_populations(ld_index_unaggregated), @@ -397,7 +397,7 @@ def _extract_square_matrix( .join( ld_index_df.select( f.col("idx").alias("idx_i"), - f.col("variantId").alias("variantId_i"), + f.col("variantId").alias("variantIdI"), ), on="idx_i", how="inner", @@ -405,12 +405,12 @@ def _extract_square_matrix( .join( ld_index_df.select( f.col("idx").alias("idx_j"), - f.col("variantId").alias("variantId_j"), + f.col("variantId").alias("variantIdJ"), ), on="idx_j", how="inner", ) - .select("variantId_i", "variantId_j", "r") + .select("variantIdI", "variantIdJ", "r") ) def get_ld_matrix_slice( From 65cea4e246413becee1f45f50efc1933ca8b5797 Mon Sep 17 00:00:00 2001 From: Daniel Considine Date: Fri, 2 Feb 2024 16:46:06 +0000 Subject: [PATCH 13/33] chore: fix tests --- src/gentropy/datasource/gnomad/ld.py | 36 ++++++++++++++-------------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/src/gentropy/datasource/gnomad/ld.py b/src/gentropy/datasource/gnomad/ld.py index 870446623..7becf6201 100644 --- a/src/gentropy/datasource/gnomad/ld.py +++ b/src/gentropy/datasource/gnomad/ld.py @@ -240,24 +240,24 @@ def _transpose_ld_matrix(ld_matrix: DataFrame) -> DataFrame: DataFrame: Square LD matrix without diagonal duplicates Examples: - >>> df = spark.createDataFrame( - ... [ - ... (1, 1, 1.0, "1", "AFR"), - ... (1, 2, 0.5, "1", "AFR"), - ... (2, 2, 1.0, "1", "AFR"), - ... ], - ... ["variantIdI", "variantIdJ", "r", "chromosome", "population"], - ... ) - >>> GnomADLDMatrix._transpose_ld_matrix(df).show() - +-----------+-----------+---+----------+----------+ - |variantIdI|variantIdJ| r|chromosome|population| - +-----------+-----------+---+----------+----------+ - | 1| 2|0.5| 1| AFR| - | 1| 1|1.0| 1| AFR| - | 2| 1|0.5| 1| AFR| - | 2| 2|1.0| 1| AFR| - +-----------+-----------+---+----------+----------+ - + >>> df = spark.createDataFrame( + ... [ + ... (1, 1, 1.0, "1", "AFR"), + ... (1, 2, 0.5, "1", "AFR"), + ... (2, 2, 1.0, "1", "AFR"), + ... ], + ... ["variantIdI", "variantIdJ", "r", "chromosome", "population"], + ... 
) + >>> GnomADLDMatrix._transpose_ld_matrix(df).show() + +----------+----------+---+----------+----------+ + |variantIdI|variantIdJ| r|chromosome|population| + +----------+----------+---+----------+----------+ + | 1| 2|0.5| 1| AFR| + | 1| 1|1.0| 1| AFR| + | 2| 1|0.5| 1| AFR| + | 2| 2|1.0| 1| AFR| + +----------+----------+---+----------+----------+ + """ ld_matrix_transposed = ld_matrix.selectExpr( "variantIdI as variantIdJ", From aa19f9f5d50e5b583f2236938e2ada9e883b2d7c Mon Sep 17 00:00:00 2001 From: Daniel Considine Date: Fri, 2 Feb 2024 17:13:11 +0000 Subject: [PATCH 14/33] chore: fixing typing for tests --- tests/datasource/gnomad/test_gnomad_ld.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/datasource/gnomad/test_gnomad_ld.py b/tests/datasource/gnomad/test_gnomad_ld.py index eaeb6ebaa..be2e39aec 100644 --- a/tests/datasource/gnomad/test_gnomad_ld.py +++ b/tests/datasource/gnomad/test_gnomad_ld.py @@ -91,7 +91,7 @@ def _setup(self: TestGnomADLDMatrixVariants, spark: SparkSession) -> None: ) @pytest.fixture(scope="class") - def ld_slice(self: TestGnomADLDMatrixVariants) -> DataFrame | None: + def ld_slice(self: TestGnomADLDMatrixVariants) -> PairwiseLD: """Generate a resolved LD slice.""" return self.gnomad_ld_matrix.get_ld_variants( gnomad_ancestry="test-pop", # observed[0], @@ -107,7 +107,7 @@ def test_get_ld_variants__type( ) -> None: """Testing if the function returns the right type.""" # Do we have the right type? - assert isinstance(ld_slice, PairwiseLD) + assert isinstance(ld_slice, DataFrame) # Do we have a real square? assert sqrt(ld_slice.df.count()) == int(sqrt(ld_slice.df.count())) From e24cac1088e56be127e5a6b81e54bc5d18f7d992 Mon Sep 17 00:00:00 2001 From: Daniel Considine Date: Fri, 2 Feb 2024 17:21:59 +0000 Subject: [PATCH 15/33] chore: fixing tests --- tests/datasource/gnomad/test_gnomad_ld.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/datasource/gnomad/test_gnomad_ld.py b/tests/datasource/gnomad/test_gnomad_ld.py index be2e39aec..cdf530de1 100644 --- a/tests/datasource/gnomad/test_gnomad_ld.py +++ b/tests/datasource/gnomad/test_gnomad_ld.py @@ -109,7 +109,7 @@ def test_get_ld_variants__type( # Do we have the right type? assert isinstance(ld_slice, DataFrame) # Do we have a real square? 
- assert sqrt(ld_slice.df.count()) == int(sqrt(ld_slice.df.count())) + assert sqrt(ld_slice.count()) == int(sqrt(ld_slice.count())) class TestGnomADLDMatrixSlice: From 4ec36b9dbcb47f1c5bed0d3f6a84c2b030c3751c Mon Sep 17 00:00:00 2001 From: Daniel Considine Date: Fri, 2 Feb 2024 17:52:22 +0000 Subject: [PATCH 16/33] chore: fixing ld tests --- tests/datasource/gnomad/test_gnomad_ld.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/datasource/gnomad/test_gnomad_ld.py b/tests/datasource/gnomad/test_gnomad_ld.py index cdf530de1..67961c821 100644 --- a/tests/datasource/gnomad/test_gnomad_ld.py +++ b/tests/datasource/gnomad/test_gnomad_ld.py @@ -91,7 +91,7 @@ def _setup(self: TestGnomADLDMatrixVariants, spark: SparkSession) -> None: ) @pytest.fixture(scope="class") - def ld_slice(self: TestGnomADLDMatrixVariants) -> PairwiseLD: + def ld_slice(self: TestGnomADLDMatrixVariants) -> DataFrame | None: """Generate a resolved LD slice.""" return self.gnomad_ld_matrix.get_ld_variants( gnomad_ancestry="test-pop", # observed[0], From 0e93adb642537f4680d1770889886100e81c5e0c Mon Sep 17 00:00:00 2001 From: Daniel-Considine <113430683+Daniel-Considine@users.noreply.github.com> Date: Mon, 5 Feb 2024 17:34:16 +0000 Subject: [PATCH 17/33] Update src/gentropy/dataset/study_index.py Co-authored-by: Daniel Suveges --- src/gentropy/dataset/study_index.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/gentropy/dataset/study_index.py b/src/gentropy/dataset/study_index.py index 89a914c4b..52b0a8269 100644 --- a/src/gentropy/dataset/study_index.py +++ b/src/gentropy/dataset/study_index.py @@ -190,7 +190,9 @@ def get_major_pop(self: StudyIndex) -> DataFrame: Returns: DataFrame: Columns studyId and the extracted major population from ldPopulationStructure. """ - return self.df.select("studyId", "ldPopulationStructure").withColumn( - "ldPopulationStructure", - f.array_max(f.col("ldPopulationStructure")).getItem("ldPopulation"), + return self.df.select( + "studyId", + f.array_max(f.col("ldPopulationStructure")) + .getItem("ldPopulation") + .alias("majorPopulation"), ) From cd4cddfcee94d7fccc72960ede70e311fbf20ab4 Mon Sep 17 00:00:00 2001 From: Daniel Considine Date: Wed, 7 Feb 2024 18:30:54 +0000 Subject: [PATCH 18/33] feat: moving functions to their appropriate locations and improving logic --- src/gentropy/dataset/study_index.py | 2 +- src/gentropy/dataset/study_locus.py | 58 --------------------- src/gentropy/datasource/gnomad/ld.py | 76 +++++++++++++++++++++++++++- 3 files changed, 76 insertions(+), 60 deletions(-) diff --git a/src/gentropy/dataset/study_index.py b/src/gentropy/dataset/study_index.py index 52b0a8269..9a7c6a2aa 100644 --- a/src/gentropy/dataset/study_index.py +++ b/src/gentropy/dataset/study_index.py @@ -184,7 +184,7 @@ def has_summarystats(self: StudyIndex) -> Column: """ return self.df.hasSumstats - def get_major_pop(self: StudyIndex) -> DataFrame: + def get_major_population(self: StudyIndex) -> DataFrame: """Extract major population from ldPopulationStructure rows with multiple ancestries. 
Returns: diff --git a/src/gentropy/dataset/study_locus.py b/src/gentropy/dataset/study_locus.py index 050246187..41c099959 100644 --- a/src/gentropy/dataset/study_locus.py +++ b/src/gentropy/dataset/study_locus.py @@ -5,9 +5,7 @@ from enum import Enum from typing import TYPE_CHECKING -import numpy as np import pyspark.sql.functions as f -from hail.linalg import BlockMatrix from gentropy.common.schemas import parse_spark_schema from gentropy.common.spark_helpers import ( @@ -16,14 +14,12 @@ ) from gentropy.dataset.dataset import Dataset from gentropy.dataset.study_locus_overlap import StudyLocusOverlap -from gentropy.datasource.gnomad.ld import GnomADLDMatrix from gentropy.method.clump import LDclumping if TYPE_CHECKING: from pyspark.sql import Column, DataFrame from pyspark.sql.types import StructType - from gentropy.common.session import Session from gentropy.dataset.ld_index import LDIndex from gentropy.dataset.study_index import StudyIndex @@ -457,60 +453,6 @@ def clump(self: StudyLocus) -> StudyLocus: ) return self - def get_gnomad_matrix( - self: StudyLocus, session: Session, study_index: StudyIndex, window_size: int - ) -> tuple[DataFrame, np.ndarray]: - """Extract hail matrix index and block matrix from StudyLocus rows. - - Args: - session (Session): Spark session - study_index (StudyIndex): Study index - window_size (int): Window size to extract from gnomad matrix - - Returns: - tuple[DataFrame, np.ndarray]: Returns the index of the gnomad matrix and the gnomad block matrix - """ - _df = ( - self.df.join(study_index.get_major_pop(), on="studyId", how="left") - .withColumn("start", f.col("position") - (window_size / 2)) - .withColumn("end", f.col("position") + (window_size / 2)) - .alias("_df") - ) - _pop = _df.collect()[0]["ldPopulationStructure"] - - _matrix_index = session.spark.read.parquet( - GnomADLDMatrix.ld_index_38_template.format(POP=_pop) - ) - _index_joined = ( - _df.alias("df") - .join( - _matrix_index.alias("matrix_index"), - (f.col("df.chromosome") == f.col("matrix_index.chromosome")) - & (f.col("df.start") <= f.col("matrix_index.position")) - & (f.col("df.end") >= f.col("matrix_index.position")), - ) - .select( - "matrix_index.chromosome", - "matrix_index.position", - "referenceAllele", - "alternateAllele", - "idx", - ) - ).persist() - _idx = ( - _index_joined.sort("idx").select("idx").rdd.flatMap(lambda x: x).collect() - ) - _block_matrix = ( - BlockMatrix.read(GnomADLDMatrix.ld_matrix_template.format(POP=_pop)) - .filter(_idx, _idx) - .to_numpy() - ) - _block_matrix = ( - _block_matrix + _block_matrix.T - np.diag(np.diag(_block_matrix)) - ) - - return _index_joined, _block_matrix - def _qc_unresolved_ld( self: StudyLocus, ) -> StudyLocus: diff --git a/src/gentropy/datasource/gnomad/ld.py b/src/gentropy/datasource/gnomad/ld.py index 7becf6201..a7e5725a6 100644 --- a/src/gentropy/datasource/gnomad/ld.py +++ b/src/gentropy/datasource/gnomad/ld.py @@ -7,10 +7,12 @@ from typing import TYPE_CHECKING import hail as hl +import numpy as np import pyspark.sql.functions as f from hail.linalg import BlockMatrix from pyspark.sql import Window +from gentropy.common.session import Session from gentropy.common.spark_helpers import get_top_ranked_in_window, get_value_from_row from gentropy.common.utils import _liftover_loci, convert_gnomad_position_to_ensembl from gentropy.dataset.ld_index import LDIndex @@ -347,7 +349,6 @@ def get_ld_variants( & (f.col("position") <= end) ) .select("chromosome", "position", "variantId", "idx") - .persist() ) if ld_index_df.limit(1).count() == 0: 
@@ -450,3 +451,76 @@ def get_ld_matrix_slice( .alias("r"), ) ) + + @staticmethod + def get_locus_index( + session: Session, + study_locus_row: DataFrame, + window_size: int, + major_population: str = "nfe", + ld_index_path: str = "gs://genetics_etl_python_playground/input/ld/gnomad_r2.1.1.{POP}.common.ld.variant_indices.parquet", + ) -> DataFrame: + """Extract hail matrix index from StudyLocus rows. + + Args: + session (Session): Spark session + study_locus_row (DataFrame): Study-locus row + window_size (int): Window size to extract from gnomad matrix + major_population (str): Major population to extract from gnomad matrix, default is "nfe" + ld_index_path (str): Optional path to the LD index parquet + Returns: + DataFrame: Returns the index of the gnomad matrix for the locus + """ + _df = ( + study_locus_row.withColumn("start", f.col("position") - (window_size / 2)) + .withColumn("end", f.col("position") + (window_size / 2)) + .alias("_df") + ) + + _matrix_index = session.spark.read.parquet( + ld_index_path.format(POP=major_population) + ) + + _index_joined = ( + _df.alias("df") + .join( + _matrix_index.alias("matrix_index"), + (f.col("df.chromosome") == f.col("matrix_index.chromosome")) + & (f.col("df.start") <= f.col("matrix_index.position")) + & (f.col("df.end") >= f.col("matrix_index.position")), + ) + .select( + "matrix_index.chromosome", + "matrix_index.position", + "referenceAllele", + "alternateAllele", + "idx", + ) + .sort("idx") + ) + + return _index_joined + + @staticmethod + def get_locus_matrix( + locus_index: DataFrame, + gnomad_ancestry: str, + ) -> np.ndarray: + """Extract the LD block matrix for a locus. + + Args: + locus_index (DataFrame): hail matrix variant index table + gnomad_ancestry (str): GnomAD major ancestry label eg. `nfe` + + Returns: + np.ndarray: LD block matrix for the locus + """ + idx = [row["idx"] for row in locus_index.select("idx").collect()] + + half_matrix = BlockMatrix.read( + GnomADLDMatrix.ld_matrix_template.format(POP=gnomad_ancestry) + ).filter(idx, idx) + + return (half_matrix + half_matrix.T).to_numpy() - np.diag( + np.diag(half_matrix.to_numpy()) + ) From 228c66b80117de8fa8a247b8814d0fceef800c3e Mon Sep 17 00:00:00 2001 From: Daniel Considine Date: Wed, 7 Feb 2024 18:57:06 +0000 Subject: [PATCH 19/33] fix: optimise conversion of BM to NumPy --- src/gentropy/datasource/gnomad/ld.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/gentropy/datasource/gnomad/ld.py b/src/gentropy/datasource/gnomad/ld.py index a7e5725a6..eaf7d1b3f 100644 --- a/src/gentropy/datasource/gnomad/ld.py +++ b/src/gentropy/datasource/gnomad/ld.py @@ -517,10 +517,12 @@ def get_locus_matrix( """ idx = [row["idx"] for row in locus_index.select("idx").collect()] - half_matrix = BlockMatrix.read( - GnomADLDMatrix.ld_matrix_template.format(POP=gnomad_ancestry) - ).filter(idx, idx) - - return (half_matrix + half_matrix.T).to_numpy() - np.diag( - np.diag(half_matrix.to_numpy()) + half_matrix = ( + BlockMatrix.read( + GnomADLDMatrix.ld_matrix_template.format(POP=gnomad_ancestry) + ) + .filter(idx, idx) + .to_numpy() ) + + return (half_matrix + half_matrix.T) - np.diag(np.diag(half_matrix)) From 06a9e96191b95899718d516329a69ef28bafc8cf Mon Sep 17 00:00:00 2001 From: Daniel Considine Date: Fri, 9 Feb 2024 18:30:06 +0000 Subject: [PATCH 20/33] feat: updating get_locus_index to allow for just chromosome and position inputs --- src/gentropy/datasource/gnomad/ld.py | 30 +++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff 
--git a/src/gentropy/datasource/gnomad/ld.py b/src/gentropy/datasource/gnomad/ld.py index eaf7d1b3f..5733189c0 100644 --- a/src/gentropy/datasource/gnomad/ld.py +++ b/src/gentropy/datasource/gnomad/ld.py @@ -4,7 +4,7 @@ import sys from dataclasses import dataclass, field from functools import reduce -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Optional import hail as hl import numpy as np @@ -455,8 +455,10 @@ def get_ld_matrix_slice( @staticmethod def get_locus_index( session: Session, - study_locus_row: DataFrame, - window_size: int, + study_locus_row: Optional[DataFrame] = None, + chromosome: Optional[str] = None, + position: Optional[int] = None, + window_size: int = 1_000_000, major_population: str = "nfe", ld_index_path: str = "gs://genetics_etl_python_playground/input/ld/gnomad_r2.1.1.{POP}.common.ld.variant_indices.parquet", ) -> DataFrame: @@ -464,15 +466,33 @@ def get_locus_index( Args: session (Session): Spark session - study_locus_row (DataFrame): Study-locus row + study_locus_row (Optional[DataFrame]): Study-locus row + chromosome (Optional[str]): Chromosome to extract from gnomad matrix + position (Optional[int]): Position to extract from gnomad matrix window_size (int): Window size to extract from gnomad matrix major_population (str): Major population to extract from gnomad matrix, default is "nfe" ld_index_path (str): Optional path to the LD index parquet + Returns: DataFrame: Returns the index of the gnomad matrix for the locus + + Raises: + ValueError: Either a studyLocus or a chromosome and position must be provided """ + if study_locus_row is None: + if chromosome is not None and position is not None: + _df = session.spark.createDataFrame( + [(chromosome, position)], ["chromosome", "position"] + ) + else: + raise ValueError( + "Either a studyLocus or a chromosome and position must be provided." + ) + else: + _df = study_locus_row + _df = ( - study_locus_row.withColumn("start", f.col("position") - (window_size / 2)) + _df.withColumn("start", f.col("position") - (window_size / 2)) .withColumn("end", f.col("position") + (window_size / 2)) .alias("_df") ) From ebdd983c179fe1486ab69018b6180110e5418ef9 Mon Sep 17 00:00:00 2001 From: Daniel Considine Date: Mon, 18 Mar 2024 12:44:19 +0000 Subject: [PATCH 21/33] fix: suggested changes --- src/gentropy/datasource/gnomad/ld.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/gentropy/datasource/gnomad/ld.py b/src/gentropy/datasource/gnomad/ld.py index 5733189c0..364ba0c35 100644 --- a/src/gentropy/datasource/gnomad/ld.py +++ b/src/gentropy/datasource/gnomad/ld.py @@ -455,23 +455,23 @@ def get_ld_matrix_slice( @staticmethod def get_locus_index( session: Session, + ld_index_path: str, study_locus_row: Optional[DataFrame] = None, chromosome: Optional[str] = None, position: Optional[int] = None, window_size: int = 1_000_000, major_population: str = "nfe", - ld_index_path: str = "gs://genetics_etl_python_playground/input/ld/gnomad_r2.1.1.{POP}.common.ld.variant_indices.parquet", ) -> DataFrame: """Extract hail matrix index from StudyLocus rows. 
Args: session (Session): Spark session + ld_index_path (str): Optional path to the LD index parquet study_locus_row (Optional[DataFrame]): Study-locus row chromosome (Optional[str]): Chromosome to extract from gnomad matrix position (Optional[int]): Position to extract from gnomad matrix window_size (int): Window size to extract from gnomad matrix major_population (str): Major population to extract from gnomad matrix, default is "nfe" - ld_index_path (str): Optional path to the LD index parquet Returns: DataFrame: Returns the index of the gnomad matrix for the locus From a2232b596605f5fbde58aa5ec69af0b843354f08 Mon Sep 17 00:00:00 2001 From: Daniel-Considine <113430683+Daniel-Considine@users.noreply.github.com> Date: Thu, 21 Mar 2024 11:31:51 +0000 Subject: [PATCH 22/33] Update study_index.py --- src/gentropy/dataset/study_index.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/src/gentropy/dataset/study_index.py b/src/gentropy/dataset/study_index.py index 9a7c6a2aa..d048157d6 100644 --- a/src/gentropy/dataset/study_index.py +++ b/src/gentropy/dataset/study_index.py @@ -183,16 +183,3 @@ def has_summarystats(self: StudyIndex) -> Column: Column: True if the study has harmonized summary statistics. """ return self.df.hasSumstats - - def get_major_population(self: StudyIndex) -> DataFrame: - """Extract major population from ldPopulationStructure rows with multiple ancestries. - - Returns: - DataFrame: Columns studyId and the extracted major population from ldPopulationStructure. - """ - return self.df.select( - "studyId", - f.array_max(f.col("ldPopulationStructure")) - .getItem("ldPopulation") - .alias("majorPopulation"), - ) From b9b817d979b9964eb2dbf2702612f264612e8a60 Mon Sep 17 00:00:00 2001 From: Daniel Considine Date: Thu, 21 Mar 2024 13:54:48 +0000 Subject: [PATCH 23/33] fix: changes to datasource/gnomad/ld.py --- src/gentropy/datasource/gnomad/ld.py | 30 +++++----------------------- 1 file changed, 5 insertions(+), 25 deletions(-) diff --git a/src/gentropy/datasource/gnomad/ld.py b/src/gentropy/datasource/gnomad/ld.py index 364ba0c35..99b7b7ad2 100644 --- a/src/gentropy/datasource/gnomad/ld.py +++ b/src/gentropy/datasource/gnomad/ld.py @@ -4,7 +4,7 @@ import sys from dataclasses import dataclass, field from functools import reduce -from typing import TYPE_CHECKING, Optional +from typing import TYPE_CHECKING import hail as hl import numpy as np @@ -30,14 +30,12 @@ class GnomADLDMatrix: Attributes: ld_matrix_template (str): Template for the LD matrix path. Defaults to "gs://gcp-public-data--gnomad/release/2.1.1/ld/gnomad.genomes.r2.1.1.{POP}.common.adj.ld.bm". ld_index_raw_template (str): Template for the LD index path. Defaults to "gs://gcp-public-data--gnomad/release/2.1.1/ld/gnomad.genomes.r2.1.1.{POP}.common.ld.variant_indices.ht". - ld_index_38_template (str): Template for the LD index path in build 38. Defaults to "gs://genetics_etl_python_playground/input/ld/gnomad_r2.1.1.{POP}.common.ld.variant_indices.parquet" grch37_to_grch38_chain_path (str): Path to the chain file used to lift over the coordinates. Defaults to "gs://hail-common/references/grch37_to_grch38.over.chain.gz". ld_populations (list[str]): List of populations to use to build the LDIndex. Defaults to ["afr", "amr", "asj", "eas", "fin", "nfe", "nwe", "seu"]. 
""" ld_matrix_template: str = "gs://gcp-public-data--gnomad/release/2.1.1/ld/gnomad.genomes.r2.1.1.{POP}.common.adj.ld.bm" ld_index_raw_template: str = "gs://gcp-public-data--gnomad/release/2.1.1/ld/gnomad.genomes.r2.1.1.{POP}.common.ld.variant_indices.ht" - ld_index_38_template: str = "gs://genetics_etl_python_playground/input/ld/gnomad_r2.1.1.{POP}.common.ld.variant_indices.parquet" grch37_to_grch38_chain_path: str = ( "gs://hail-common/references/grch37_to_grch38.over.chain.gz" ) @@ -455,10 +453,8 @@ def get_ld_matrix_slice( @staticmethod def get_locus_index( session: Session, + study_locus_row: DataFrame, ld_index_path: str, - study_locus_row: Optional[DataFrame] = None, - chromosome: Optional[str] = None, - position: Optional[int] = None, window_size: int = 1_000_000, major_population: str = "nfe", ) -> DataFrame: @@ -466,33 +462,17 @@ def get_locus_index( Args: session (Session): Spark session - ld_index_path (str): Optional path to the LD index parquet - study_locus_row (Optional[DataFrame]): Study-locus row - chromosome (Optional[str]): Chromosome to extract from gnomad matrix - position (Optional[int]): Position to extract from gnomad matrix + study_locus_row (DataFrame): Study-locus row + ld_index_path (str): Path to the hail LD index parquet window_size (int): Window size to extract from gnomad matrix major_population (str): Major population to extract from gnomad matrix, default is "nfe" Returns: DataFrame: Returns the index of the gnomad matrix for the locus - Raises: - ValueError: Either a studyLocus or a chromosome and position must be provided """ - if study_locus_row is None: - if chromosome is not None and position is not None: - _df = session.spark.createDataFrame( - [(chromosome, position)], ["chromosome", "position"] - ) - else: - raise ValueError( - "Either a studyLocus or a chromosome and position must be provided." - ) - else: - _df = study_locus_row - _df = ( - _df.withColumn("start", f.col("position") - (window_size / 2)) + study_locus_row.withColumn("start", f.col("position") - (window_size / 2)) .withColumn("end", f.col("position") + (window_size / 2)) .alias("_df") ) From 37668d14dda2125f66842fd4dbe019fa78069bd6 Mon Sep 17 00:00:00 2001 From: Yakov Tsepilov Date: Tue, 2 Apr 2024 16:40:50 +0100 Subject: [PATCH 24/33] feat: add the draft of finemapper fucntion --- src/gentropy/susie_finemapper.py | 119 +++++++++++++++++++++++++++++++ 1 file changed, 119 insertions(+) diff --git a/src/gentropy/susie_finemapper.py b/src/gentropy/susie_finemapper.py index bd02716c2..760ca73a1 100644 --- a/src/gentropy/susie_finemapper.py +++ b/src/gentropy/susie_finemapper.py @@ -9,7 +9,10 @@ from pyspark.sql import DataFrame, Window from gentropy.common.session import Session +from gentropy.dataset.study_index import StudyIndex from gentropy.dataset.study_locus import StudyLocus +from gentropy.dataset.summary_statistics import SummaryStatistics +from gentropy.method.susie_inf import SUSIE_inf class SusieFineMapperStep: @@ -19,6 +22,122 @@ class SusieFineMapperStep: In the future this step will be refactored and moved to the methods module. """ + @staticmethod + def susie_finemapper_one_locus( + GWAS: SummaryStatistics, + session: Session, + study_locus: StudyLocus, + study_index: StudyIndex, + window: int = 1_000_000, + ) -> StudyLocus: + """Susie fine-mapper for one study locus. 
+ + Args: + GWAS (SummaryStatistics): GWAS summary statistics + session (Session): Spark session + study_locus (StudyLocus): StudyLocus object with one row (the first one will be used) + study_index (StudyIndex): StudyIndex object + window (int): window size for fine-mapping + + Returns: + StudyLocus: StudyLocus object with fine-mapped credible sets + """ + study_locus_df = study_locus._df + first_line = study_locus_df.first() + chromosome = first_line.chromosome + position = first_line.position + _studyId = first_line.studyId + + study_index_df = study_index._df + study_index_df = study_index_df.filter(study_index_df.studyId == _studyId) + _major_population = study_index_df.select( + "studyId", + f.array_max(f.col("ldPopulationStructure")) + .getItem("ldPopulation") + .alias("majorPopulation"), + ) + + _region = ( + first_line.withColumn( + "region", + f.regexp_replace( + f.concat( + f.col("chromosome"), + f.lit(":"), + f.format_number((f.col("position") - (window / 2)), 0), + f.lit("-"), + f.format_number((f.col("position") + (window / 2)), 0), + ), + ",", + "", + ), + ) + .select("region") + .collect()[0] + ) + + GWAS_df = GWAS._df + GWAS_df = GWAS_df.filter( + (f.col("chromosome") == chromosome) + & (f.col("position") >= position - (window / 2)) + & (f.col("position") <= position + (window / 2)) + ).withColumn("z", f.col("beta") / f.col("standardError")) + + _z = np.array([row["z"] for row in GWAS_df.select("z").collect()]) + + # # Extract summary statistics + # _ss = ( + # SummaryStatistics.get_locus_sumstats(session, window, _locus) + # .withColumn("z", f.col("beta") / f.col("standardError")) + # .withColumn("ref", f.split(f.col("variantId"), "_").getItem(2)) + # .withColumn("alt", f.split(f.col("variantId"), "_").getItem(3)) + # .select( + # "variantId", + # f.col("chromosome").alias("chr"), + # f.col("position").alias("pos"), + # "ref", + # "alt", + # "beta", + # "pValueMantissa", + # "pValueExponent", + # "effectAlleleFrequencyFromSource", + # "standardError", + # "z", + # ) + # ) + + # Extract LD index + # _index = GnomADLDMatrix.get_locus_index( + # session, locus, window_size=window, major_population=_major_population + # ) + # _join = ( + # _ss.join( + # _index.alias("_index"), + # on=( + # (_ss["chr"] == _index["chromosome"]) + # & (_ss["pos"] == _index["position"]) + # & (_ss["ref"] == _index["referenceAllele"]) + # & (_ss["alt"] == _index["alternateAllele"]) + # ), + # ) + # .drop("ref", "alt", "chr", "pos") + # .sort("idx") + # ) + + # Extracting z-scores and LD matrix, then running SuSiE-inf + # _ld = GnomADLDMatrix.get_locus_matrix(_join, gnomad_ancestry=_major_population) + + _ld = 1 + susie_output = SUSIE_inf.susie_inf(z=_z, LD=_ld, L=10) + + return SusieFineMapperStep.susie_inf_to_studylocus( + susie_output=susie_output, + session=session, + _studyId=_studyId, + _region=_region, + variant_index=GWAS, + ) + @staticmethod def susie_inf_to_studylocus( susie_output: dict[str, Any], From 33b6a51d1cc2575a7e91536a0c61a7e8a4720f6c Mon Sep 17 00:00:00 2001 From: Daniel Considine Date: Tue, 2 Apr 2024 16:44:37 +0100 Subject: [PATCH 25/33] feat: updated method for ld_index extraction --- src/gentropy/datasource/gnomad/ld.py | 61 ++++++++++++---------------- 1 file changed, 26 insertions(+), 35 deletions(-) diff --git a/src/gentropy/datasource/gnomad/ld.py b/src/gentropy/datasource/gnomad/ld.py index 99b7b7ad2..752a05abb 100644 --- a/src/gentropy/datasource/gnomad/ld.py +++ b/src/gentropy/datasource/gnomad/ld.py @@ -12,13 +12,12 @@ from hail.linalg import BlockMatrix from 
pyspark.sql import Window -from gentropy.common.session import Session from gentropy.common.spark_helpers import get_top_ranked_in_window, get_value_from_row from gentropy.common.utils import _liftover_loci, convert_gnomad_position_to_ensembl from gentropy.dataset.ld_index import LDIndex if TYPE_CHECKING: - from pyspark.sql import DataFrame + from pyspark.sql import DataFrame, Row @dataclass @@ -36,6 +35,7 @@ class GnomADLDMatrix: ld_matrix_template: str = "gs://gcp-public-data--gnomad/release/2.1.1/ld/gnomad.genomes.r2.1.1.{POP}.common.adj.ld.bm" ld_index_raw_template: str = "gs://gcp-public-data--gnomad/release/2.1.1/ld/gnomad.genomes.r2.1.1.{POP}.common.ld.variant_indices.ht" + liftover_ht_path: str = "gs://gcp-public-data--gnomad/release/2.1.1/liftover_grch38/ht/genomes/gnomad.genomes.r2.1.1.sites.liftover_grch38.ht" grch37_to_grch38_chain_path: str = ( "gs://hail-common/references/grch37_to_grch38.over.chain.gz" ) @@ -450,20 +450,16 @@ def get_ld_matrix_slice( ) ) - @staticmethod def get_locus_index( - session: Session, - study_locus_row: DataFrame, - ld_index_path: str, + self: GnomADLDMatrix, + study_locus_row: Row, window_size: int = 1_000_000, major_population: str = "nfe", ) -> DataFrame: """Extract hail matrix index from StudyLocus rows. Args: - session (Session): Spark session - study_locus_row (DataFrame): Study-locus row - ld_index_path (str): Path to the hail LD index parquet + study_locus_row (Row): Study-locus row window_size (int): Window size to extract from gnomad matrix major_population (str): Major population to extract from gnomad matrix, default is "nfe" @@ -471,40 +467,35 @@ def get_locus_index( DataFrame: Returns the index of the gnomad matrix for the locus """ - _df = ( - study_locus_row.withColumn("start", f.col("position") - (window_size / 2)) - .withColumn("end", f.col("position") + (window_size / 2)) - .alias("_df") + chromosome = str("chr" + study_locus_row["chromosome"]) + start = study_locus_row["position"] - window_size // 2 + end = study_locus_row["position"] + window_size // 2 + + liftover_ht = hl.read_table(self.liftover_ht_path) + liftover_ht = ( + liftover_ht.filter( + (liftover_ht.locus.contig == chromosome) + & (liftover_ht.locus.position >= start) + & (liftover_ht.locus.position <= end) + ) + .key_by() + .select("locus", "alleles", "original_locus") + .key_by("original_locus", "alleles") + .naive_coalesce(20) ) - _matrix_index = session.spark.read.parquet( - ld_index_path.format(POP=major_population) + hail_index = hl.read_table( + self.ld_index_raw_template.format(POP=major_population) ) - _index_joined = ( - _df.alias("df") - .join( - _matrix_index.alias("matrix_index"), - (f.col("df.chromosome") == f.col("matrix_index.chromosome")) - & (f.col("df.start") <= f.col("matrix_index.position")) - & (f.col("df.end") >= f.col("matrix_index.position")), - ) - .select( - "matrix_index.chromosome", - "matrix_index.position", - "referenceAllele", - "alternateAllele", - "idx", - ) - .sort("idx") - ) + joined_index = liftover_ht.join(hail_index, how="inner").to_spark().sort("idx") - return _index_joined + return joined_index @staticmethod - def get_locus_matrix( + def get_numpy_matrix( locus_index: DataFrame, - gnomad_ancestry: str, + gnomad_ancestry: str = "nfe", ) -> np.ndarray: """Extract the LD block matrix for a locus. 
From 67f7f36d0a073b0c4eef56927f010892dea0467b Mon Sep 17 00:00:00 2001
From: Yakov Tsepilov
Date: Tue, 2 Apr 2024 16:55:32 +0100
Subject: [PATCH 26/33] fix: changing input

---
 src/gentropy/susie_finemapper.py | 39 +++++++++++---------------------
 1 file changed, 13 insertions(+), 26 deletions(-)

diff --git a/src/gentropy/susie_finemapper.py b/src/gentropy/susie_finemapper.py
index 760ca73a1..e6d577784 100644
--- a/src/gentropy/susie_finemapper.py
+++ b/src/gentropy/susie_finemapper.py
@@ -23,31 +23,29 @@ class SusieFineMapperStep:
     """

     @staticmethod
-    def susie_finemapper_one_locus(
+    def susie_finemapper_chr_pos(
         GWAS: SummaryStatistics,
         session: Session,
-        study_locus: StudyLocus,
+        chromosome: str,
+        position: int,
+        _studyId: str,
         study_index: StudyIndex,
         window: int = 1_000_000,
     ) -> StudyLocus:
-        """Susie fine-mapper for one study locus.
+        """Susie fine-mapper function that uses Summary Statistics, chromosome and position as inputs.

         Args:
             GWAS (SummaryStatistics): GWAS summary statistics
             session (Session): Spark session
-            study_locus (StudyLocus): StudyLocus object with one row (the first one will be used)
+            chromosome (str): chromosome
+            position (int): position
+            _studyId (str): study ID
             study_index (StudyIndex): StudyIndex object
             window (int): window size for fine-mapping

         Returns:
             StudyLocus: StudyLocus object with fine-mapped credible sets
         """
-        study_locus_df = study_locus._df
-        first_line = study_locus_df.first()
-        chromosome = first_line.chromosome
-        position = first_line.position
-        _studyId = first_line.studyId
-
         study_index_df = study_index._df
         study_index_df = study_index_df.filter(study_index_df.studyId == _studyId)
         _major_population = study_index_df.select(
@@ -58,22 +56,11 @@ def susie_finemapper_chr_pos(
         )

         _region = (
-            first_line.withColumn(
-                "region",
-                f.regexp_replace(
-                    f.concat(
-                        f.col("chromosome"),
-                        f.lit(":"),
-                        f.format_number((f.col("position") - (window / 2)), 0),
-                        f.lit("-"),
-                        f.format_number((f.col("position") + (window / 2)), 0),
-                    ),
-                    ",",
-                    "",
-                ),
-            )
-            .select("region")
-            .collect()[0]
+            chromosome
+            + ":"
+            + str(int(position - window / 2))
+            + "-"
+            + str(int(position + window / 2))
         )

         GWAS_df = GWAS._df

From 1dc67013585bcaba5a291ada1cdc05422a59c79a Mon Sep 17 00:00:00 2001
From: Yakov Tsepilov
Date: Tue, 2 Apr 2024 17:01:49 +0100
Subject: [PATCH 27/33] fix: adding filter by studyId

---
 src/gentropy/susie_finemapper.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/gentropy/susie_finemapper.py b/src/gentropy/susie_finemapper.py
index e6d577784..001f34bbf 100644
--- a/src/gentropy/susie_finemapper.py
+++ b/src/gentropy/susie_finemapper.py
@@ -65,7 +65,8 @@ def susie_finemapper_chr_pos(

         GWAS_df = GWAS._df
         GWAS_df = GWAS_df.filter(
-            (f.col("chromosome") == chromosome)
+            (f.col("studyId") == _studyId)
+            & (f.col("chromosome") == chromosome)
             & (f.col("position") >= position - (window / 2))
             & (f.col("position") <= position + (window / 2))
         ).withColumn("z", f.col("beta") / f.col("standardError"))

From 2a2afbe0cf7f681b328ad8f766abf4d387af5bc7 Mon Sep 17 00:00:00 2001
From: Daniel Considine
Date: Tue, 2 Apr 2024 17:02:38 +0100
Subject: [PATCH 28/33] fix: sorting idx in hail

---
 src/gentropy/datasource/gnomad/ld.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/gentropy/datasource/gnomad/ld.py b/src/gentropy/datasource/gnomad/ld.py
index 752a05abb..a19cbb06b 100644
--- a/src/gentropy/datasource/gnomad/ld.py
+++ b/src/gentropy/datasource/gnomad/ld.py
@@ -488,7 +488,9 @@ def get_locus_index(
         hail_index = hl.read_table(
             self.ld_index_raw_template.format(POP=major_population)
         )

-        joined_index = liftover_ht.join(hail_index, how="inner").to_spark().sort("idx")
+        joined_index = (
+            liftover_ht.join(hail_index, how="inner").order_by("idx").to_spark()
+        )

         return joined_index

From 58f3eb95965fe58f5632ffbcec4248e88ac80a1d Mon Sep 17 00:00:00 2001
From: Yakov Tsepilov
Date: Wed, 3 Apr 2024 12:37:36 +0100
Subject: [PATCH 29/33] feat: add fine-mapping of one study_locus_row

---
 src/gentropy/susie_finemapper.py        | 111 ++++++++++++------------
 tests/gentropy/method/test_susie_inf.py |   1 +
 2 files changed, 58 insertions(+), 54 deletions(-)

diff --git a/src/gentropy/susie_finemapper.py b/src/gentropy/susie_finemapper.py
index 001f34bbf..740b6ecb3 100644
--- a/src/gentropy/susie_finemapper.py
+++ b/src/gentropy/susie_finemapper.py
@@ -6,12 +6,13 @@

 import numpy as np
 import pyspark.sql.functions as f
-from pyspark.sql import DataFrame, Window
+from pyspark.sql import DataFrame, Row, Window

 from gentropy.common.session import Session
 from gentropy.dataset.study_index import StudyIndex
 from gentropy.dataset.study_locus import StudyLocus
 from gentropy.dataset.summary_statistics import SummaryStatistics
+from gentropy.datasource.gnomad.ld import GnomADLDMatrix
 from gentropy.method.susie_inf import SUSIE_inf


@@ -26,26 +27,28 @@ class SusieFineMapperStep:
     """

     @staticmethod
-    def susie_finemapper_chr_pos(
+    def susie_finemapper_one_studylocus_row(
         GWAS: SummaryStatistics,
         session: Session,
-        chromosome: str,
-        position: int,
-        _studyId: str,
+        study_locus_row: Row,
         study_index: StudyIndex,
         window: int = 1_000_000,
+        L: int = 10,
     ) -> StudyLocus:
         """Susie fine-mapper function that uses Summary Statistics, chromosome and position as inputs.

         Args:
             GWAS (SummaryStatistics): GWAS summary statistics
             session (Session): Spark session
-            chromosome (str): chromosome
-            position (int): position
-            _studyId (str): study ID
+            study_locus_row (Row): StudyLocus row
             study_index (StudyIndex): StudyIndex object
             window (int): window size for fine-mapping
+            L (int): number of causal variants

         Returns:
             StudyLocus: StudyLocus object with fine-mapped credible sets
         """
+        chromosome = study_locus_row.chromosome
+        position = study_locus_row.position
+        _studyId = study_locus_row.studyId
+
         study_index_df = study_index._df
         study_index_df = study_index_df.filter(study_index_df.studyId == _studyId)
         _major_population = study_index_df.select(
@@ -71,59 +74,57 @@ def susie_finemapper_chr_pos(
             & (f.col("position") <= position + (window / 2))
         ).withColumn("z", f.col("beta") / f.col("standardError"))

-        _z = np.array([row["z"] for row in GWAS_df.select("z").collect()])
-
-        # # Extract summary statistics
-        # _ss = (
-        #     SummaryStatistics.get_locus_sumstats(session, window, _locus)
-        #     .withColumn("z", f.col("beta") / f.col("standardError"))
-        #     .withColumn("ref", f.split(f.col("variantId"), "_").getItem(2))
-        #     .withColumn("alt", f.split(f.col("variantId"), "_").getItem(3))
-        #     .select(
-        #         "variantId",
-        #         f.col("chromosome").alias("chr"),
-        #         f.col("position").alias("pos"),
-        #         "ref",
-        #         "alt",
-        #         "beta",
-        #         "pValueMantissa",
-        #         "pValueExponent",
-        #         "effectAlleleFrequencyFromSource",
-        #         "standardError",
-        #         "z",
-        #     )
-        # )
-
-        # Extract LD index
-        # _index = GnomADLDMatrix.get_locus_index(
-        #     session, locus, window_size=window, major_population=_major_population
-        # )
-        # _join = (
-        #     _ss.join(
-        #         _index.alias("_index"),
-        #         on=(
-        #             (_ss["chr"] == _index["chromosome"])
-        #             & (_ss["pos"] == _index["position"])
-        #             & (_ss["ref"] == _index["referenceAllele"])
-        #             & (_ss["alt"] == _index["alternateAllele"])
-        #         ),
-        #     )
-        #     
.drop("ref", "alt", "chr", "pos") - # .sort("idx") - # ) - - # Extracting z-scores and LD matrix, then running SuSiE-inf - # _ld = GnomADLDMatrix.get_locus_matrix(_join, gnomad_ancestry=_major_population) - - _ld = 1 - susie_output = SUSIE_inf.susie_inf(z=_z, LD=_ld, L=10) + ld_index = ( + GnomADLDMatrix() + .get_locus_index( + study_locus_row=study_locus_row, + window_size=window, + major_population=_major_population, + ) + .withColumn( + "variantId", + f.concat( + f.lit(chromosome), + f.lit("_"), + f.col("`locus.position`"), + f.lit("_"), + f.col("alleles").getItem(0), + f.lit("_"), + f.col("alleles").getItem(1), + ).cast("string"), + ) + ) + + gnomad_ld = GnomADLDMatrix.get_numpy_matrix( + ld_index, gnomad_ancestry=_major_population + ) + + GWAS_df = GWAS_df.toPandas() + ld_index = ld_index.toPandas() + ld_index = ld_index.reset_index() + + # Filtering out the variants that are not in the LD matrix, we don't need them + df_columns = GWAS_df.columns + GWAS_df = GWAS_df.merge(ld_index, on="variantId", how="inner") + GWAS_df = GWAS_df[df_columns].reset_index() + + merged_df = GWAS_df.merge( + ld_index, left_on="variantId", right_on="variantId", how="inner" + ) + indices = merged_df["index_y"].values + + ld_to_fm = gnomad_ld[indices][:, indices] + z_to_fm = GWAS_df["z"].values + + susie_output = SUSIE_inf.susie_inf(z=z_to_fm, LD=ld_to_fm, L=L) + variant_index = session.spark.createDataFrame(GWAS_df) return SusieFineMapperStep.susie_inf_to_studylocus( susie_output=susie_output, session=session, _studyId=_studyId, _region=_region, - variant_index=GWAS, + variant_index=variant_index, ) @staticmethod @@ -151,6 +152,7 @@ def susie_inf_to_studylocus( variants = np.array( [row["variantId"] for row in variant_index.select("variantId").collect()] ).reshape(-1, 1) + PIPs = susie_output["PIP"] lbfs = susie_output["lbf_variable"] mu = susie_output["mu"] @@ -181,6 +183,7 @@ def susie_inf_to_studylocus( win = Window.rowsBetween( Window.unboundedPreceding, Window.unboundedFollowing ) + cred_set = ( session.spark.createDataFrame( cred_set.tolist(), diff --git a/tests/gentropy/method/test_susie_inf.py b/tests/gentropy/method/test_susie_inf.py index 393f786d7..e7a5d219f 100644 --- a/tests/gentropy/method/test_susie_inf.py +++ b/tests/gentropy/method/test_susie_inf.py @@ -68,6 +68,7 @@ def test_SUSIE_inf_convert_to_study_locus( est_tausq=False, ) gwas_df = sample_summary_statistics._df.limit(21) + L1 = SusieFineMapperStep.susie_inf_to_studylocus( susie_output=susie_output, session=session, From 5ee023a29de0ba5ac39fca5b14f83078e6dec6bc Mon Sep 17 00:00:00 2001 From: Yakov Tsepilov Date: Wed, 3 Apr 2024 16:07:57 +0100 Subject: [PATCH 30/33] fix: small fix in majpop --- src/gentropy/susie_finemapper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gentropy/susie_finemapper.py b/src/gentropy/susie_finemapper.py index 740b6ecb3..be5de68c1 100644 --- a/src/gentropy/susie_finemapper.py +++ b/src/gentropy/susie_finemapper.py @@ -56,7 +56,7 @@ def susie_finemapper_chr_pos( f.array_max(f.col("ldPopulationStructure")) .getItem("ldPopulation") .alias("majorPopulation"), - ) + ).collect()[0]["majorPopulation"] _region = ( chromosome From 3b0ddda3fc64772bc3e414c7e9dd41dbf05ffa35 Mon Sep 17 00:00:00 2001 From: Yakov Tsepilov Date: Wed, 3 Apr 2024 17:18:43 +0100 Subject: [PATCH 31/33] fix: small fixes in function --- src/gentropy/susie_finemapper.py | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/src/gentropy/susie_finemapper.py 
b/src/gentropy/susie_finemapper.py index be5de68c1..7117295b3 100644 --- a/src/gentropy/susie_finemapper.py +++ b/src/gentropy/susie_finemapper.py @@ -5,8 +5,10 @@ from typing import Any import numpy as np +import pandas as pd import pyspark.sql.functions as f from pyspark.sql import DataFrame, Row, Window +from pyspark.sql.types import IntegerType, StringType, StructField, StructType from gentropy.common.session import Session from gentropy.dataset.study_index import StudyIndex @@ -24,7 +26,7 @@ class SusieFineMapperStep: """ @staticmethod - def susie_finemapper_chr_pos( + def susie_finemapper_one_studylocus_row( GWAS: SummaryStatistics, session: Session, study_locus_row: Row, @@ -45,6 +47,9 @@ def susie_finemapper_chr_pos( Returns: StudyLocus: StudyLocus object with fine-mapped credible sets """ + # PLEASE DO NOT REMOVE THIS LINE + pd.DataFrame.iteritems = pd.DataFrame.items + chromosome = study_locus_row.chromosome position = study_locus_row.position _studyId = study_locus_row.studyId @@ -117,7 +122,24 @@ def susie_finemapper_chr_pos( z_to_fm = GWAS_df["z"].values susie_output = SUSIE_inf.susie_inf(z=z_to_fm, LD=ld_to_fm, L=L) - variant_index = session.spark.createDataFrame(GWAS_df) + + schema = StructType( + [ + StructField("variantId", StringType(), True), + StructField("chromosome", StringType(), True), + StructField("position", IntegerType(), True), + ] + ) + variant_index = session.spark.createDataFrame( + GWAS_df[ + [ + "variantId", + "chromosome", + "position", + ] + ], + schema=schema, + ) return SusieFineMapperStep.susie_inf_to_studylocus( susie_output=susie_output, From 3f8e20ab10c9485a03d254fc21f8e77e7c4c651a Mon Sep 17 00:00:00 2001 From: Daniel Considine Date: Thu, 4 Apr 2024 21:04:13 +0100 Subject: [PATCH 32/33] fix: using more spark before converting to pandas --- src/gentropy/susie_finemapper.py | 70 ++++++++++++++------------------ 1 file changed, 31 insertions(+), 39 deletions(-) diff --git a/src/gentropy/susie_finemapper.py b/src/gentropy/susie_finemapper.py index 7117295b3..8d25f533e 100644 --- a/src/gentropy/susie_finemapper.py +++ b/src/gentropy/susie_finemapper.py @@ -50,20 +50,20 @@ def susie_finemapper_one_studylocus_row( # PLEASE DO NOT REMOVE THIS LINE pd.DataFrame.iteritems = pd.DataFrame.items - chromosome = study_locus_row.chromosome - position = study_locus_row.position - _studyId = study_locus_row.studyId + chromosome = study_locus_row["chromosome"] + position = study_locus_row["position"] + studyId = study_locus_row["studyId"] study_index_df = study_index._df - study_index_df = study_index_df.filter(study_index_df.studyId == _studyId) - _major_population = study_index_df.select( + study_index_df = study_index_df.filter(f.col("studyId") == studyId) + major_population = study_index_df.select( "studyId", f.array_max(f.col("ldPopulationStructure")) .getItem("ldPopulation") .alias("majorPopulation"), ).collect()[0]["majorPopulation"] - _region = ( + region = ( chromosome + ":" + str(int(position - window / 2)) @@ -71,20 +71,19 @@ def susie_finemapper_one_studylocus_row( + str(int(position + window / 2)) ) - GWAS_df = GWAS._df - GWAS_df = GWAS_df.filter( - (f.col("studyId") == _studyId) - & (f.col("chromosome") == chromosome) - & (f.col("position") >= position - (window / 2)) - & (f.col("position") <= position + (window / 2)) - ).withColumn("z", f.col("beta") / f.col("standardError")) + gwas_df = ( + GWAS.df.withColumn("z", f.col("beta") / f.col("standardError")) + .withColumn("chromosome", f.split(f.col("variantId"), "_")[0]) + .withColumn("position", 
f.split(f.col("variantId"), "_")[1]) + .filter(f.col("z").isNotNull()) + ) ld_index = ( GnomADLDMatrix() .get_locus_index( study_locus_row=study_locus_row, window_size=window, - major_population=_major_population, + major_population=major_population, ) .withColumn( "variantId", @@ -100,26 +99,18 @@ def susie_finemapper_one_studylocus_row( ) ) - gnomad_ld = GnomADLDMatrix.get_numpy_matrix( - ld_index, gnomad_ancestry=_major_population - ) - - GWAS_df = GWAS_df.toPandas() - ld_index = ld_index.toPandas() - ld_index = ld_index.reset_index() - # Filtering out the variants that are not in the LD matrix, we don't need them - df_columns = GWAS_df.columns - GWAS_df = GWAS_df.merge(ld_index, on="variantId", how="inner") - GWAS_df = GWAS_df[df_columns].reset_index() + gwas_index = gwas_df.join( + ld_index.select("variantId", "alleles", "idx"), on="variantId" + ).sort("idx") - merged_df = GWAS_df.merge( - ld_index, left_on="variantId", right_on="variantId", how="inner" + gnomad_ld = GnomADLDMatrix.get_numpy_matrix( + gwas_index, gnomad_ancestry=major_population ) - indices = merged_df["index_y"].values - ld_to_fm = gnomad_ld[indices][:, indices] - z_to_fm = GWAS_df["z"].values + pd_df = gwas_index.toPandas() + z_to_fm = np.array(pd_df["z"]) + ld_to_fm = gnomad_ld susie_output = SUSIE_inf.susie_inf(z=z_to_fm, LD=ld_to_fm, L=L) @@ -130,8 +121,9 @@ def susie_finemapper_one_studylocus_row( StructField("position", IntegerType(), True), ] ) + pd_df["position"] = pd_df["position"].astype(int) variant_index = session.spark.createDataFrame( - GWAS_df[ + pd_df[ [ "variantId", "chromosome", @@ -144,8 +136,8 @@ def susie_finemapper_one_studylocus_row( return SusieFineMapperStep.susie_inf_to_studylocus( susie_output=susie_output, session=session, - _studyId=_studyId, - _region=_region, + studyId=studyId, + region=region, variant_index=variant_index, ) @@ -153,8 +145,8 @@ def susie_finemapper_one_studylocus_row( def susie_inf_to_studylocus( susie_output: dict[str, Any], session: Session, - _studyId: str, - _region: str, + studyId: str, + region: str, variant_index: DataFrame, cs_lbf_thr: float = 2, ) -> StudyLocus: @@ -163,8 +155,8 @@ def susie_inf_to_studylocus( Args: susie_output (dict[str, Any]): SuSiE-inf output dictionary session (Session): Spark session - _studyId (str): study ID - _region (str): region + studyId (str): study ID + region (str): region variant_index (DataFrame): DataFrame with variant information cs_lbf_thr (float): credible set logBF threshold, default is 2 @@ -236,8 +228,8 @@ def susie_inf_to_studylocus( .limit(1) .withColumns( { - "studyId": f.lit(_studyId), - "region": f.lit(_region), + "studyId": f.lit(studyId), + "region": f.lit(region), "credibleSetIndex": f.lit(counter), "credibleSetlog10BF": f.lit(cs_lbf_value * 0.4342944819), "finemappingMethod": f.lit("SuSiE-inf"), From d4623688f388155afb83676e640be1810551fa27 Mon Sep 17 00:00:00 2001 From: Yakov Tsepilov Date: Fri, 5 Apr 2024 12:01:29 +0100 Subject: [PATCH 33/33] fix: fix in test --- src/gentropy/susie_finemapper.py | 1 + tests/gentropy/method/test_susie_inf.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/gentropy/susie_finemapper.py b/src/gentropy/susie_finemapper.py index 8d25f533e..03e1c2a0b 100644 --- a/src/gentropy/susie_finemapper.py +++ b/src/gentropy/susie_finemapper.py @@ -75,6 +75,7 @@ def susie_finemapper_one_studylocus_row( GWAS.df.withColumn("z", f.col("beta") / f.col("standardError")) .withColumn("chromosome", f.split(f.col("variantId"), "_")[0]) .withColumn("position", 
f.split(f.col("variantId"), "_")[1]) + .filter(f.col("studyId") == studyId) .filter(f.col("z").isNotNull()) ) diff --git a/tests/gentropy/method/test_susie_inf.py b/tests/gentropy/method/test_susie_inf.py index e7a5d219f..0129a5934 100644 --- a/tests/gentropy/method/test_susie_inf.py +++ b/tests/gentropy/method/test_susie_inf.py @@ -72,8 +72,8 @@ def test_SUSIE_inf_convert_to_study_locus( L1 = SusieFineMapperStep.susie_inf_to_studylocus( susie_output=susie_output, session=session, - _studyId="sample_id", - _region="sample_region", + studyId="sample_id", + region="sample_region", variant_index=gwas_df, cs_lbf_thr=2, )
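
For reference, a minimal sketch of how the step assembled over patches 24-33 might be driven end to end once the series is applied. This is a hypothetical usage sketch, not part of the diffs: the parquet paths are placeholders, and the bare Session() construction and the from_parquet loaders are assumed to be available with these signatures; only SusieFineMapperStep.susie_finemapper_one_studylocus_row and its arguments come from this series.

    # Hypothetical driver script; paths, the default Session() construction and
    # the from_parquet loaders are assumptions -- only the fine-mapper call at
    # the bottom is defined by the patches above.
    from gentropy.common.session import Session
    from gentropy.dataset.study_index import StudyIndex
    from gentropy.dataset.study_locus import StudyLocus
    from gentropy.dataset.summary_statistics import SummaryStatistics
    from gentropy.susie_finemapper import SusieFineMapperStep

    session = Session()  # assumes default (local) Spark settings

    # Load the three inputs the step needs (paths are placeholders).
    gwas = SummaryStatistics.from_parquet(session, "path/to/summary_statistics")
    study_index = StudyIndex.from_parquet(session, "path/to/study_index")
    study_locus = StudyLocus.from_parquet(session, "path/to/study_locus")

    # The step fine-maps one locus at a time: it takes a single StudyLocus row,
    # resolves the study's major LD population via the study index, slices the
    # gnomAD LD matrix around the lead variant, and runs SuSiE-inf on z-scores.
    row = study_locus.df.first()
    credible_sets = SusieFineMapperStep.susie_finemapper_one_studylocus_row(
        GWAS=gwas,
        session=session,
        study_locus_row=row,
        study_index=study_index,
        window=1_000_000,  # locus window in base pairs around the lead variant
        L=10,              # maximum number of causal signals for SuSiE-inf
    )
    credible_sets.df.show()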