Merge pull request #78 from bcgsc/release/v1.9.0_test_updates_helper_funcs

Release/v1.9.0 test updates helper funcs
bcgsc · Feb 22, 2023 · 09b33b2 · 09b33b2
2 parents 101b616 + d5a9932
commit 09b33b2
Show file tree

Hide file tree

Showing 11 changed files with 563 additions and 47 deletions.
diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml
@@ -10,13 +10,14 @@ jobs:
 
     runs-on: ubuntu-latest
     strategy:
+      max-parallel: 4
       matrix:
-        python-version: ['3.6', '3.7', '3.8', '3.9', '3.10']
+        python-version: ['3.7', '3.8', '3.9', '3.10']
 
     steps:
-    - uses: actions/checkout@v2
+    - uses: actions/checkout@v3
     - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v2
+      uses: actions/setup-python@v3
       with:
         python-version: ${{ matrix.python-version }}
     - name: Install dependencies
@@ -33,10 +34,13 @@ jobs:
         pip install black
         black --check -S -l 100 graphkb tests
     - name: Test with pytest
-      run: pytest --junitxml=junit/test-results-${{ matrix.python-version }}.xml --cov graphkb --cov-report term --cov-report xml
+      run: |
+        pip list
+        pytest --junitxml=junit/test-results-${{ matrix.python-version }}.xml --cov graphkb --cov-report term --cov-report xml --durations 10 -vv
       env:
         GRAPHKB_USER: ${{ secrets.GKB_TEST_USER }}
         GRAPHKB_PASS: ${{ secrets.GKB_TEST_PASS }}
+        EXCLUDE_INTEGRATION_TESTS: 0
     - name: Upload pytest test results
       uses: actions/upload-artifact@master
       with:
@@ -45,12 +49,12 @@ jobs:
         # Use always() to always run this step to publish test results when there are test failures
       if: always()
     - name: Update code coverage report to CodeCov
-      uses: codecov/codecov-action@v1
+      uses: codecov/codecov-action@v3
       with:
         token: ${{ secrets.CODECOV_TOKEN }}
         file: ./coverage.xml
         flags: unittests
         env_vars: OS,PYTHON
         name: codecov-umbrella
         fail_ci_if_error: true
-      if: matrix.python-version == 3.8
+      if: matrix.python-version == 3.9
diff --git a/graphkb/constants.py b/graphkb/constants.py
@@ -3,12 +3,12 @@
 from .types import CategoryBaseTermMapping
 
 DEFAULT_LIMIT = 1000
-
 GKB_BASE_URL = "https://graphkb-api.bcgsc.ca/api"
 GKB_STAGING_URL = "https://graphkbstaging-api.bcgsc.ca/api"
 GKB_DEV_URL = "https://graphkbdev-api.bcgsc.ca/api"
 DEFAULT_URL = GKB_BASE_URL
 
+PREFERRED_GENE_SOURCE = "#39:5"  # HGNC
 
 BASE_RETURN_PROPERTIES = ['@rid', '@class']
 
@@ -61,8 +61,10 @@
 ONCOKB_SOURCE_NAME = 'oncokb'
 ONCOGENE = 'oncogenic'
 TUMOUR_SUPPRESSIVE = 'tumour suppressive'
-
 FUSION_NAMES = ['structural variant', 'fusion']
+
+PHARMACOGENOMIC_SOURCE_EXCLUDE_LIST = ["cancer genome interpreter", "civic"]
+
 BASE_THERAPEUTIC_TERMS = ['therapeutic efficacy', 'eligibility']
 # the order here is the order these are applied, the first category matched is returned
 RELEVANCE_BASE_TERMS: CategoryBaseTermMapping = [
@@ -73,6 +75,11 @@
     ('cancer predisposition', ['pathogenic']),
     ('biological', ['functional effect', 'tumourigenesis', 'predisposing']),
 ]
+FAILED_REVIEW_STATUS = 'failed'
+
+CHROMOSOMES_HG38 = [f"chr{i}" for i in range(1, 23)] + ['chrX', 'chrY', 'chrM']
+CHROMOSOMES_HG19 = [str(i) for i in range(1, 23)] + ['x', 'y', 'mt']
+CHROMOSOMES = CHROMOSOMES_HG38 + CHROMOSOMES_HG19
 
 AMBIGUOUS_AA = ['x', '?', 'X']
 AA_3to1_MAPPING = {

diff --git a/graphkb/genes.py b/graphkb/genes.py
@@ -1,17 +1,22 @@
 """
 Methods for retrieving gene annotation lists from GraphKB
 """
-from typing import Any, Dict, List, cast
+from typing import Any, Dict, List, Tuple, cast
 
 from . import GraphKBConnection
 from .constants import (
     BASE_THERAPEUTIC_TERMS,
+    CHROMOSOMES,
     GENE_RETURN_PROPERTIES,
     ONCOGENE,
     ONCOKB_SOURCE_NAME,
+    PHARMACOGENOMIC_SOURCE_EXCLUDE_LIST,
+    PREFERRED_GENE_SOURCE,
     TUMOUR_SUPPRESSIVE,
 )
+from .match import get_equivalent_features
 from .types import Ontology, Statement, Variant
+from .util import get_rid, logger
 from .vocab import get_terms_set
 
 
@@ -45,8 +50,7 @@ def _get_oncokb_gene_list(
 
 
 def get_oncokb_oncogenes(conn: GraphKBConnection) -> List[Ontology]:
-    """
-    Gets the list of oncogenes stored in GraphKB derived from OncoKB
+    """Gets the list of oncogenes stored in GraphKB derived from OncoKB.
 
     Args:
         conn: the graphkb connection object
@@ -58,8 +62,7 @@ def get_oncokb_oncogenes(conn: GraphKBConnection) -> List[Ontology]:
 
 
 def get_oncokb_tumour_supressors(conn: GraphKBConnection) -> List[Ontology]:
-    """
-    Gets the list of tumour supressor genes stored in GraphKB derived from OncoKB
+    """Gets the list of tumour suppressor genes stored in GraphKB derived from OncoKB.
 
     Args:
         conn: the graphkb connection object
@@ -161,3 +164,201 @@ def get_genes_from_variant_types(
         ),
     )
     return result
+
+
+def get_preferred_gene_name(
+    conn: GraphKBConnection, gene_name: str, source: str = PREFERRED_GENE_SOURCE
+) -> str:
+    """Preferred gene symbol of a gene or transcript.
+
+    Args:
+        conn: the graphkb connection object
+        gene_name: the gene name to search features by
+        source: id of the preferred gene symbol source
+    Returns:
+        preferred displayName symbol.
+
+    Example:
+        return KRAS for get_preferred_gene_name(conn, 'NM_033360')
+        return KRAS for get_preferred_gene_name(conn, 'ENSG00000133703.11')
+    """
+    if gene_name in CHROMOSOMES:
+        logger.error(f"{gene_name} assumed to be a chromosome, not gene")
+        return ''
+    eq = get_equivalent_features(conn=conn, gene_name=gene_name)
+    genes = [m for m in eq if m.get('biotype') == 'gene' and not m.get('deprecated')]
+    if not genes:
+        logger.error(f"No genes found for: {gene_name}")
+        return ''
+    if source:
+        source_filtered_genes = [m for m in genes if m.get('source') == source]
+        if not source_filtered_genes:
+            logger.error(f"No data from source {source} for {gene_name}")
+        else:
+            genes = source_filtered_genes
+
+    gene_names = [g['displayName'] for g in genes if g]
+    if len(gene_names) > 1:
+        logger.error(
+            f"Multiple gene names found for: {gene_name} - using {gene_names[0]}, ignoring {gene_names[1:]}"
+        )
+    return gene_names[0]
+
+
+def get_cancer_predisposition_info(conn: GraphKBConnection) -> Tuple[List[str], Dict[str, str]]:
+    """
+    Return a list of cancer predisposition genes and a dictionary of associated variants from GraphKB.
+
+    GERO-272 - criteria for what counts as a "cancer predisposition" variant
+
+    In short:
+    * Statement 'evidence' is the 'CGL' source
+    * Statement 'relevance' is 'pathogenic'
+    * gene is gotten from any associated 'PositionalVariant' records
+
+    Example: https://graphkb.bcgsc.ca/view/Statement/155:11616
+
+    Returns:
+        genes: list of cancer predisposition genes
+        variants: dictionary mapping cancer predisposition variant IDs to variant display names
+    """
+    genes = set()
+    non_genes = set()
+    infer_genes = set()
+    variants = {}
+
+    relevance_rids = list(get_terms_set(conn, "cancer predisposition"))
+
+    for record in conn.query(
+        {
+            "target": "Statement",
+            "filters": [
+                {
+                    "evidence": {
+                        "target": "Source",
+                        "filters": {"@rid": get_rid(conn, "Source", "CGL")},
+                    },
+                    "relevance": {
+                        "target": "Vocabulary",
+                        "filters": {"@rid": relevance_rids},
+                    },
+                }
+            ],
+            "returnProperties": [
+                "conditions.@class",
+                "conditions.@rid",
+                "conditions.displayName",
+                "conditions.reference1.biotype",
+                "conditions.reference1.displayName",
+                "conditions.reference2.biotype",
+                "conditions.reference2.displayName",
+            ],
+        },
+        ignore_cache=False,
+    ):
+        for condition in record["conditions"]:  # type: ignore
+            if condition["@class"] == "PositionalVariant":
+                variants[condition["@rid"]] = condition["displayName"]
+                for reference in ["reference1", "reference2"]:
+                    name = (condition.get(reference) or {}).get("displayName", "")
+                    biotype = (condition.get(reference) or {}).get("biotype", "")
+                    if name and biotype == "gene":
+                        genes.add(name)
+                    elif name:
+                        gene = get_preferred_gene_name(conn, name)
+                        if gene:
+                            infer_genes.add((gene, name, biotype))
+                        else:
+                            non_genes.add((name, biotype))
+                            logger.error(
+                                f"Non-gene cancer predisposition {biotype}: {name} for {condition['displayName']}"
+                            )
+
+    for gene, name, biotype in infer_genes:
+        logger.debug(f"Found gene '{gene}' for '{name}' ({biotype})")
+        genes.add(gene)
+
+    for name, biotype in non_genes:
+        logger.error(f"Unable to find gene for '{name}' ({biotype})")
+
+    return sorted(genes), variants
+
+
+def get_pharmacogenomic_info(conn: GraphKBConnection) -> Tuple[List[str], Dict[str, str]]:
+    """
+    Return a list of pharmacogenomic genes and a dictionary of associated variants from GraphKB.
+
+    SDEV-2733 - criteria for what counts as a "pharmacogenomic" variant
+
+    In short:
+    * Statement 'source' is not 'CGI' or 'CIViC'
+    * Statement 'relevance' is 'increased toxicity' or 'decreased toxicity'
+    * gene is gotten from any associated 'PositionalVariant' records
+
+    Example: https://graphkb.bcgsc.ca/view/Statement/154:9574
+
+    Returns:
+        genes: list of pharmacogenomic genes
+        variants: dictionary mapping pharmacogenomic variant IDs to variant display names
+    """
+    genes = set()
+    non_genes = set()
+    infer_genes = set()
+    variants = {}
+
+    relevance_rids = list(get_terms_set(conn, "pharmacogenomic"))
+
+    for record in conn.query(
+        {
+            "target": "Statement",
+            "filters": [
+                {
+                    "relevance": {
+                        "target": "Vocabulary",
+                        "filters": {"@rid": relevance_rids},
+                    },
+                }
+            ],
+            "returnProperties": [
+                "conditions.@class",
+                "conditions.@rid",
+                "conditions.displayName",
+                "conditions.reference1.biotype",
+                "conditions.reference1.displayName",
+                "conditions.reference2.biotype",
+                "conditions.reference2.displayName",
+                "source.name",
+            ],
+        },
+        ignore_cache=False,
+    ):
+        if record["source"]:  # type: ignore
+            if record["source"]["name"].lower() in PHARMACOGENOMIC_SOURCE_EXCLUDE_LIST:  # type: ignore
+                continue
+
+        for condition in record["conditions"]:  # type: ignore
+            if condition["@class"] == "PositionalVariant":
+                variants[condition["@rid"]] = condition["displayName"]
+                for reference in ["reference1", "reference2"]:
+                    name = (condition.get(reference) or {}).get("displayName", "")
+                    biotype = (condition.get(reference) or {}).get("biotype", "")
+                    if name and biotype == "gene":
+                        genes.add(name)
+                    elif name:
+                        gene = get_preferred_gene_name(conn, name)
+                        if gene:
+                            infer_genes.add((gene, name, biotype))
+                        else:
+                            non_genes.add((name, biotype))
+                            logger.error(
+                                f"Non-gene pharmacogenomic {biotype}: {name} for {condition['displayName']}"
+                            )
+
+    for gene, name, biotype in infer_genes:
+        logger.debug(f"Found gene '{gene}' for '{name}' ({biotype})")
+        genes.add(gene)
+
+    for name, biotype in non_genes:
+        logger.error(f"Unable to find gene for '{name}' ({biotype})")
+
+    return sorted(genes), variants
diff --git a/graphkb/match.py b/graphkb/match.py
@@ -12,7 +12,7 @@
     VARIANT_RETURN_PROPERTIES,
 )
 from .types import BasicPosition, Ontology, ParsedVariant, PositionalVariant, Record, Variant
-from .util import FeatureNotFoundError, convert_to_rid_list, looks_like_rid
+from .util import FeatureNotFoundError, convert_to_rid_list, logger, looks_like_rid
 from .vocab import get_term_tree
 
 FEATURES_CACHE: Set[str] = set()
@@ -26,8 +26,7 @@ def get_equivalent_features(
     source: str = '',
     source_id_version: str = '',
 ) -> List[Ontology]:
-    """
-    Match an equivalent list of features given some input feature name (or ID)
+    """Match an equivalent list of features given some input feature name (or ID).
 
     Args:
         gene_name: the gene name to search features by
@@ -62,14 +61,19 @@ def get_equivalent_features(
     if source:
         filters.append({'source': {'target': 'Source', 'filters': {'name': source}}})
 
+    if gene_name.count('.') == 1 and gene_name.split('.')[-1].isnumeric():
+        # eg. ENSG00000133703.11 or NM_033360.4
+        logger.debug(
+            f"Assuming {gene_name} has a .version_format - ignoring the version for equivalent features"
+        )
+        gene_name = gene_name.split('.')[0]
+
     if is_source_id or source_id_version:
         filters.append({'sourceId': gene_name})
-
         if source_id_version:
             filters.append(
                 {'OR': [{'sourceIdVersion': source_id_version}, {'sourceIdVersion': None}]}
             )
-
     elif FEATURES_CACHE and gene_name.lower() not in FEATURES_CACHE and not ignore_cache:
         return []
     else: