feat: add biosample index (#769)

* Initial commit of biosample index * Make minimal class * Tidy up first draft of adding biosample index * Add beginning of logic for checking if biosample from a studyindex is in biosample index * Make early file for merging multiple biosample indices into one * Finish adding basic iteration of biosample index, needs debugging * Tweak slightly * Modified the parser to accept JSON files * Update biosample index * Tests and docs * Updating tests * Revert GWAS catalog file * fix(biosample index): update to match pre-commit standards * fix(biosample index): merging indices fix * fix(biosample index): update study index qc logic * fix(biosample index): fix missing mock_biosample_index * chore(biosample index): change datasource name from ontologies * fix(biosample index): add dataset doc * fix(biosample index): change dbXrefs to xrefs * chore (biosample index): better commenting Co-authored-by: Daniel Suveges <[email protected]> * fix(biosample index): various minor tweaks to biosample index * fix(biosample index): minor bug * fix(biosample index): fix merge shift to method * feat(biosample index): make biosampleName not nullable --------- Co-authored-by: Daniel Suveges <[email protected]>
opentargets · Sep 24, 2024 · ccdb1f2 · ccdb1f2
1 parent 148e26e
commit ccdb1f2
Show file tree

Hide file tree

Showing 19 changed files with 1,735 additions and 2 deletions.
diff --git a/docs/python_api/datasets/biosample_index.md b/docs/python_api/datasets/biosample_index.md
@@ -0,0 +1,9 @@
+---
+title: Biosample index
+---
+
+::: gentropy.dataset.biosample_index.BiosampleIndex
+
+## Schema
+
+--8<-- "assets/schemas/biosample_index.md"
diff --git a/docs/python_api/datasources/_datasources.md b/docs/python_api/datasources/_datasources.md
@@ -26,7 +26,7 @@ This section contains information about the data source harmonisation tools avai
 2. GWAS catalog's [harmonisation pipeline](https://www.ebi.ac.uk/gwas/docs/methods/summary-statistics#_harmonised_summary_statistics_data)
 3. Ensembl's [Variant Effect Predictor](https://www.ensembl.org/info/docs/tools/vep/index.html)
 
-## Linkage desiquilibrium
+## Linkage disequilibrium
 
 1. [GnomAD](gnomad/_gnomad.md) v2.1.1 LD matrixes (7 ancestries)
 
@@ -37,3 +37,8 @@ This section contains information about the data source harmonisation tools avai
 ## Gene annotation
 
 1. [Open Targets Platform Target Dataset](open_targets/target.md) (derived from Ensembl)
+
+## Biological samples
+
+1. [Uberon](biosample_ontologies/_uberon.md)
+2. [Cell Ontology](biosample_ontologies/_cell_ontology.md)
diff --git a/docs/python_api/datasources/biosample_ontologies/_cell_ontology.md b/docs/python_api/datasources/biosample_ontologies/_cell_ontology.md
@@ -0,0 +1,5 @@
+---
+title: Cell Ontology
+---
+
+The [Cell Ontology](http://www.obofoundry.org/ontology/cl.html) is a structured controlled vocabulary for cell types. It is used to annotate cell types in single-cell RNA-seq data and other omics data.
diff --git a/docs/python_api/datasources/biosample_ontologies/_uberon.md b/docs/python_api/datasources/biosample_ontologies/_uberon.md
@@ -0,0 +1,5 @@
+---
+title: Uberon
+---
+
+The [Uberon](http://uberon.github.io/) ontology is a multi-species anatomy ontology that integrates species-specific anatomy ontologies into a single cross-species ontology.
diff --git a/docs/python_api/steps/biosample_index_step.md b/docs/python_api/steps/biosample_index_step.md
@@ -0,0 +1,5 @@
+---
+title: biosample_index
+---
+
+::: gentropy.biosample_index.BiosampleIndexStep
diff --git a/poetry.lock b/poetry.lock
diff --git a/src/gentropy/assets/schemas/biosample_index.json b/src/gentropy/assets/schemas/biosample_index.json
@@ -0,0 +1,83 @@
+{
+  "type": "struct",
+  "fields": [
+    {
+      "name": "biosampleId",
+      "type": "string",
+      "nullable": false,
+      "metadata": {}
+    },
+    {
+      "name": "biosampleName",
+      "type": "string",
+      "nullable": false,
+      "metadata": {}
+    },
+    {
+      "name": "description",
+      "type": "string",
+      "nullable": true,
+      "metadata": {}
+    },
+    {
+      "name": "xrefs",
+      "type": {
+        "type": "array",
+        "elementType": "string",
+        "containsNull": true
+      },
+      "nullable": true,
+      "metadata": {}
+    },
+    {
+      "name": "synonyms",
+      "type": {
+        "type": "array",
+        "elementType": "string",
+        "containsNull": true
+      },
+      "nullable": true,
+      "metadata": {}
+    },
+    {
+      "name": "parents",
+      "type": {
+        "type": "array",
+        "elementType": "string",
+        "containsNull": true
+      },
+      "nullable": true,
+      "metadata": {}
+    },
+    {
+      "name": "ancestors",
+      "type": {
+        "type": "array",
+        "elementType": "string",
+        "containsNull": true
+      },
+      "nullable": true,
+      "metadata": {}
+    },
+    {
+      "name": "descendants",
+      "type": {
+        "type": "array",
+        "elementType": "string",
+        "containsNull": true
+      },
+      "nullable": true,
+      "metadata": {}
+    },
+    {
+      "name": "children",
+      "type": {
+        "type": "array",
+        "elementType": "string",
+        "containsNull": true
+      },
+      "nullable": true,
+      "metadata": {}
+    }
+  ]
+}
diff --git a/src/gentropy/biosample_index.py b/src/gentropy/biosample_index.py
@@ -0,0 +1,34 @@
+"""Step to generate biosample index dataset."""
+from __future__ import annotations
+
+from gentropy.common.session import Session
+from gentropy.datasource.biosample_ontologies.utils import extract_ontology_from_json
+
+
+class BiosampleIndexStep:
+    """Step building the biosample index from ontology sources.
+
+    Each supported ontology (currently Cell Ontology and Uberon) is parsed into its own index; the indices are then merged and the result is written as parquet.
+    """
+
+    def __init__(
+        self,
+        session: Session,
+        cell_ontology_input_path: str,
+        uberon_input_path: str,
+        biosample_index_path: str,
+    ) -> None:
+        """Generate the biosample index and write it to disk.
+
+        Args:
+            session (Session): Session object providing the Spark session and the write mode.
+            cell_ontology_input_path (str): Input cell ontology dataset path.
+            uberon_input_path (str): Input uberon dataset path.
+            biosample_index_path (str): Output biosample index dataset path.
+        """
+        spark = session.spark
+        # Uberon is merged into the Cell Ontology index, not the other way round:
+        merged_index = extract_ontology_from_json(cell_ontology_input_path, spark).merge_indices(
+            [extract_ontology_from_json(uberon_input_path, spark)]
+        )
+        merged_index.df.write.mode(session.write_mode).parquet(biosample_index_path)
diff --git a/src/gentropy/config.py b/src/gentropy/config.py
@@ -51,6 +51,16 @@ class GeneIndexConfig(StepConfig):
     _target_: str = "gentropy.gene_index.GeneIndexStep"
 
 
+@dataclass
+class BiosampleIndexConfig(StepConfig):
+    """Biosample index step configuration."""
+
+    cell_ontology_input_path: str = MISSING  # Cell Ontology input path, passed to BiosampleIndexStep
+    uberon_input_path: str = MISSING  # Uberon input path, passed to BiosampleIndexStep
+    biosample_index_path: str = MISSING  # Path the generated biosample index is written to
+    _target_: str = "gentropy.biosample_index.BiosampleIndexStep"  # Dotted path of the step class
+
+
 @dataclass
 class GWASCatalogStudyCurationConfig(StepConfig):
     """GWAS Catalog study curation step configuration."""
@@ -472,6 +482,7 @@ class StudyValidationStepConfig(StepConfig):
     study_index_path: list[str] = MISSING
     target_index_path: str = MISSING
     disease_index_path: str = MISSING
+    biosample_index_path: str = MISSING
     valid_study_index_path: str = MISSING
     invalid_study_index_path: str = MISSING
     invalid_qc_reasons: list[str] = MISSING
@@ -512,6 +523,7 @@ def register_config() -> None:
     cs.store(group="step", name="colocalisation", node=ColocalisationConfig)
     cs.store(group="step", name="eqtl_catalogue", node=EqtlCatalogueConfig)
     cs.store(group="step", name="gene_index", node=GeneIndexConfig)
+    cs.store(group="step", name="biosample_index", node=BiosampleIndexConfig)
     cs.store(
         group="step",
         name="gwas_catalog_study_curation",

diff --git a/src/gentropy/dataset/biosample_index.py b/src/gentropy/dataset/biosample_index.py
@@ -0,0 +1,72 @@
+"""Biosample index dataset."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from functools import reduce
+from typing import TYPE_CHECKING
+
+import pyspark.sql.functions as f
+from pyspark.sql import DataFrame
+from pyspark.sql.types import ArrayType, StringType
+
+from gentropy.common.schemas import parse_spark_schema
+from gentropy.dataset.dataset import Dataset
+
+if TYPE_CHECKING:
+    from pyspark.sql.types import StructType
+
+
+@dataclass
+class BiosampleIndex(Dataset):
+    """Biosample index dataset.
+
+    A Biosample index dataset captures the metadata of the biosamples (e.g. tissues, cell types, cell lines, etc) such as alternate names and relationships with other biosamples.
+    """
+
+    @classmethod
+    def get_schema(cls: type[BiosampleIndex]) -> StructType:
+        """Provide the schema for the BiosampleIndex dataset.
+
+        Returns:
+            StructType: The schema of the BiosampleIndex dataset.
+        """
+        return parse_spark_schema("biosample_index.json")
+
+    def merge_indices(
+        self: BiosampleIndex,
+        biosample_indices: list[BiosampleIndex],
+    ) -> BiosampleIndex:
+        """Merge this biosample index with a list of other biosample indices.
+
+        Rows are grouped by `biosampleId`. String-array columns become the de-duplicated union of all values; any other column keeps one non-null value per identifier (`first` over unordered groups, so which source wins is not guaranteed).
+
+        Args:
+            biosample_indices (list[BiosampleIndex]): Biosample indices to merge into this one.
+
+        Returns:
+            BiosampleIndex: Merged biosample index.
+        """
+        # This index goes first, the others follow in the order given. Columns are matched by name,
+        # because a positional union silently mixes up columns when their order differs between indices:
+        merged_df = reduce(
+            DataFrame.unionByName,
+            [self.df, *(biosample_index.df for biosample_index in biosample_indices)],
+        )
+
+        # One aggregation per non-key column:
+        agg_funcs = []
+        for field in merged_df.schema.fields:
+            if field.name == "biosampleId":  # Grouping column, not aggregated
+                continue
+            # Any array of strings is a list column, regardless of its `containsNull` flag:
+            is_list = isinstance(field.dataType, ArrayType) and isinstance(field.dataType.elementType, StringType)
+            if is_list:
+                agg_funcs.append(f.array_distinct(f.flatten(f.collect_list(field.name))).alias(field.name))
+            else:
+                agg_funcs.append(f.first(f.col(field.name), ignorenulls=True).alias(field.name))
+
+        return BiosampleIndex(
+            _df=merged_df.groupBy("biosampleId").agg(*agg_funcs),
+            _schema=BiosampleIndex.get_schema(),
+        )
diff --git a/src/gentropy/dataset/study_index.py b/src/gentropy/dataset/study_index.py
@@ -19,6 +19,7 @@
     from pyspark.sql import Column, DataFrame
     from pyspark.sql.types import StructType
 
+    from gentropy.dataset.biosample_index import BiosampleIndex
     from gentropy.dataset.gene_index import GeneIndex
 
 
@@ -29,12 +30,14 @@ class StudyQualityCheck(Enum):
         UNRESOLVED_TARGET (str): Target/gene identifier could not match to reference - Labelling failing target.
         UNRESOLVED_DISEASE (str): Disease identifier could not match to referece or retired identifier - labelling failing disease
         UNKNOWN_STUDY_TYPE (str): Indicating the provided type of study is not supported.
+        UNKNOWN_BIOSAMPLE (str): Flagging if a biosample identifier is not found in the reference.
         DUPLICATED_STUDY (str): Flagging if a study identifier is not unique.
     """
 
     UNRESOLVED_TARGET = "Target/gene identifier could not match to reference."
     UNRESOLVED_DISEASE = "No valid disease identifier found."
     UNKNOWN_STUDY_TYPE = "This type of study is not supported."
+    UNKNOWN_BIOSAMPLE = "Biosample identifier was not found in the reference."
     DUPLICATED_STUDY = "The identifier of this study is not unique."
 
 
@@ -406,3 +409,36 @@ def validate_target(self: StudyIndex, target_index: GeneIndex) -> StudyIndex:
         )
 
         return StudyIndex(_df=validated_df, _schema=StudyIndex.get_schema())
+
+    def validate_biosample(self: StudyIndex, biosample_index: BiosampleIndex) -> StudyIndex:
+        """Validate biosample identifiers in the study index against the provided biosample index.
+
+        A study is flagged only if it has a `biosampleFromSourceId` that is absent from the reference; studies without a biosample identifier are left unflagged.
+
+        Args:
+            biosample_index (BiosampleIndex): Biosample index containing a reference of biosample identifiers e.g. cell types, tissues, cell lines, etc.
+
+        Returns:
+            StudyIndex: with flagged studies if the biosample identifier could not be validated.
+        """
+        # Distinct, renamed reference ids: the left join can neither multiply studies nor clash with study index columns.
+        reference_ids = biosample_index.df.select(f.col("biosampleId").alias("referenceBiosampleId")).distinct()
+        validated_df = (
+            self.df.join(
+                reference_ids,
+                f.col("biosampleFromSourceId") == f.col("referenceBiosampleId"),
+                how="left",
+            )
+            .withColumn(
+                "qualityControls",
+                StudyIndex.update_quality_flag(
+                    f.col("qualityControls"),
+                    # A null identifier has nothing to resolve, so only unmatched non-null ids are flagged:
+                    f.col("biosampleFromSourceId").isNotNull() & f.col("referenceBiosampleId").isNull(),
+                    StudyQualityCheck.UNKNOWN_BIOSAMPLE,
+                ),
+            )
+            .drop("referenceBiosampleId")
+        )
+
+        return StudyIndex(_df=validated_df, _schema=StudyIndex.get_schema())
diff --git a/src/gentropy/datasource/biosample_ontologies/__init__.py b/src/gentropy/datasource/biosample_ontologies/__init__.py
@@ -0,0 +1,3 @@
+"""Biosample index data source."""
+
+from __future__ import annotations