Skip to content

Commit

Permalink
Merge pull request #78 from bcgsc/release/v1.9.0_test_updates_helper_…
Browse files Browse the repository at this point in the history
…funcs

Release/v1.9.0 test updates helper funcs
  • Loading branch information
dustinbleile authored Feb 22, 2023
2 parents 101b616 + d5a9932 commit 09b33b2
Show file tree
Hide file tree
Showing 11 changed files with 563 additions and 47 deletions.
16 changes: 10 additions & 6 deletions .github/workflows/pytest.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,14 @@ jobs:

runs-on: ubuntu-latest
strategy:
max-parallel: 4
matrix:
python-version: ['3.6', '3.7', '3.8', '3.9', '3.10']
python-version: ['3.7', '3.8', '3.9', '3.10']

steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
uses: actions/setup-python@v3
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
Expand All @@ -33,10 +34,13 @@ jobs:
pip install black
black --check -S -l 100 graphkb tests
- name: Test with pytest
run: pytest --junitxml=junit/test-results-${{ matrix.python-version }}.xml --cov graphkb --cov-report term --cov-report xml
run: |
pip list
pytest --junitxml=junit/test-results-${{ matrix.python-version }}.xml --cov graphkb --cov-report term --cov-report xml --durations 10 -vv
env:
GRAPHKB_USER: ${{ secrets.GKB_TEST_USER }}
GRAPHKB_PASS: ${{ secrets.GKB_TEST_PASS }}
EXCLUDE_INTEGRATION_TESTS: 0
- name: Upload pytest test results
uses: actions/upload-artifact@master
with:
Expand All @@ -45,12 +49,12 @@ jobs:
# Use always() to always run this step to publish test results when there are test failures
if: always()
- name: Update code coverage report to CodeCov
uses: codecov/codecov-action@v1
uses: codecov/codecov-action@v3
with:
token: ${{ secrets.CODECOV_TOKEN }}
file: ./coverage.xml
flags: unittests
env_vars: OS,PYTHON
name: codecov-umbrella
fail_ci_if_error: true
if: matrix.python-version == 3.8
if: matrix.python-version == 3.9
11 changes: 9 additions & 2 deletions graphkb/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,12 @@
from .types import CategoryBaseTermMapping

DEFAULT_LIMIT = 1000

GKB_BASE_URL = "https://graphkb-api.bcgsc.ca/api"
GKB_STAGING_URL = "https://graphkbstaging-api.bcgsc.ca/api"
GKB_DEV_URL = "https://graphkbdev-api.bcgsc.ca/api"
DEFAULT_URL = GKB_BASE_URL

PREFERRED_GENE_SOURCE = "#39:5" # HGNC

BASE_RETURN_PROPERTIES = ['@rid', '@class']

Expand Down Expand Up @@ -61,8 +61,10 @@
ONCOKB_SOURCE_NAME = 'oncokb'
ONCOGENE = 'oncogenic'
TUMOUR_SUPPRESSIVE = 'tumour suppressive'

FUSION_NAMES = ['structural variant', 'fusion']

PHARMACOGENOMIC_SOURCE_EXCLUDE_LIST = ["cancer genome interpreter", "civic"]

BASE_THERAPEUTIC_TERMS = ['therapeutic efficacy', 'eligibility']
# the order here is the order these are applied, the first category matched is returned
RELEVANCE_BASE_TERMS: CategoryBaseTermMapping = [
Expand All @@ -73,6 +75,11 @@
('cancer predisposition', ['pathogenic']),
('biological', ['functional effect', 'tumourigenesis', 'predisposing']),
]
FAILED_REVIEW_STATUS = 'failed'

CHROMOSOMES_HG38 = [f"chr{i}" for i in range(1, 23)] + ['chrX', 'chrY', 'chrM']
CHROMOSOMES_HG19 = [str(i) for i in range(1, 23)] + ['x', 'y', 'mt']
CHROMOSOMES = CHROMOSOMES_HG38 + CHROMOSOMES_HG19

AMBIGUOUS_AA = ['x', '?', 'X']
AA_3to1_MAPPING = {
Expand Down
211 changes: 206 additions & 5 deletions graphkb/genes.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,22 @@
"""
Methods for retrieving gene annotation lists from GraphKB
"""
from typing import Any, Dict, List, cast
from typing import Any, Dict, List, Tuple, cast

from . import GraphKBConnection
from .constants import (
BASE_THERAPEUTIC_TERMS,
CHROMOSOMES,
GENE_RETURN_PROPERTIES,
ONCOGENE,
ONCOKB_SOURCE_NAME,
PHARMACOGENOMIC_SOURCE_EXCLUDE_LIST,
PREFERRED_GENE_SOURCE,
TUMOUR_SUPPRESSIVE,
)
from .match import get_equivalent_features
from .types import Ontology, Statement, Variant
from .util import get_rid, logger
from .vocab import get_terms_set


Expand Down Expand Up @@ -45,8 +50,7 @@ def _get_oncokb_gene_list(


def get_oncokb_oncogenes(conn: GraphKBConnection) -> List[Ontology]:
"""
Gets the list of oncogenes stored in GraphKB derived from OncoKB
"""Gets the list of oncogenes stored in GraphKB derived from OncoKB.
Args:
conn: the graphkb connection object
Expand All @@ -58,8 +62,7 @@ def get_oncokb_oncogenes(conn: GraphKBConnection) -> List[Ontology]:


def get_oncokb_tumour_supressors(conn: GraphKBConnection) -> List[Ontology]:
"""
Gets the list of tumour supressor genes stored in GraphKB derived from OncoKB
"""Gets the list of tumour supressor genes stored in GraphKB derived from OncoKB.
Args:
conn: the graphkb connection object
Expand Down Expand Up @@ -161,3 +164,201 @@ def get_genes_from_variant_types(
),
)
return result


def get_preferred_gene_name(
    conn: GraphKBConnection, gene_name: str, source: str = PREFERRED_GENE_SOURCE
) -> str:
    """Preferred gene symbol of a gene or transcript.

    Args:
        conn: the graphkb connection object
        gene_name: the gene name to search features by
        source: id of the preferred gene symbol source; when falsy, or when no
            equivalent gene comes from this source, all equivalent genes are considered

    Returns:
        preferred displayName symbol, or '' when gene_name is a chromosome name
        or no non-deprecated gene feature is equivalent to it.

    Example:
        return KRAS for get_preferred_gene_name(conn, 'NM_033360')
        return KRAS for get_preferred_gene_name(conn, 'ENSG00000133703.11')
    """
    # CHROMOSOMES mixes cases ('chrX' but 'x', 'mt'), so compare case-insensitively
    # to also catch spellings such as 'X', 'MT' or 'chrx' before querying GraphKB.
    if gene_name.lower() in {chrom.lower() for chrom in CHROMOSOMES}:
        logger.error(f"{gene_name} assumed to be a chromosome, not gene")
        return ''

    eq = get_equivalent_features(conn=conn, gene_name=gene_name)
    genes = [m for m in eq if m.get('biotype') == 'gene' and not m.get('deprecated')]
    if not genes:
        logger.error(f"No genes found for: {gene_name}")
        return ''

    if source:
        source_filtered_genes = [m for m in genes if m.get('source') == source]
        if not source_filtered_genes:
            # fall back to the unfiltered genes rather than returning nothing
            logger.error(f"No data from source {source} for {gene_name}")
        else:
            genes = source_filtered_genes

    gene_names = [g['displayName'] for g in genes if g]
    if len(gene_names) > 1:
        # ambiguous match: report it, but still return the first candidate
        logger.error(
            f"Multiple gene names found for: {gene_name} - using {gene_names[0]}, ignoring {gene_names[1:]}"
        )
    return gene_names[0]


def get_cancer_predisposition_info(conn: GraphKBConnection) -> Tuple[List[str], Dict[str, str]]:
    """
    Return the cancer predisposition genes and their associated variants from GraphKB.

    GERO-272 - criteria for what counts as a "cancer predisposition" variant

    In short:
    * Statement 'evidence' is the 'CGL' source
    * Statement 'relevance' is in the "cancer predisposition" term set
      (described in GERO-272 as 'pathogenic')
    * gene is gotten from any associated 'PositionalVariant' records

    Example: https://graphkb.bcgsc.ca/view/Statement/155:11616

    Args:
        conn: the graphkb connection object

    Returns:
        genes: sorted list of cancer predisposition gene names
        variants: dictionary mapping cancer predisposition variant IDs to variant display names
    """
    genes = set()
    non_genes = set()
    infer_genes = set()
    variants = {}

    relevance_rids = list(get_terms_set(conn, "cancer predisposition"))

    for record in conn.query(
        {
            "target": "Statement",
            "filters": [
                {
                    "evidence": {
                        "target": "Source",
                        "filters": {"@rid": get_rid(conn, "Source", "CGL")},
                    },
                    "relevance": {
                        "target": "Vocabulary",
                        "filters": {"@rid": relevance_rids},
                    },
                }
            ],
            "returnProperties": [
                "conditions.@class",
                "conditions.@rid",
                "conditions.displayName",
                "conditions.reference1.biotype",
                "conditions.reference1.displayName",
                "conditions.reference2.biotype",
                "conditions.reference2.displayName",
            ],
        },
        ignore_cache=False,
    ):
        for condition in record["conditions"]:  # type: ignore
            if condition["@class"] == "PositionalVariant":
                variants[condition["@rid"]] = condition["displayName"]
                for reference in ["reference1", "reference2"]:
                    # a reference may be absent or null; treat both as "no feature"
                    feature = condition.get(reference) or {}
                    name = feature.get("displayName", "")
                    biotype = feature.get("biotype", "")
                    if name and biotype == "gene":
                        genes.add(name)
                    elif name:
                        # non-gene feature (e.g. a transcript): try to resolve its gene
                        gene = get_preferred_gene_name(conn, name)
                        if gene:
                            infer_genes.add((gene, name, biotype))
                        else:
                            non_genes.add((name, biotype))
                            logger.error(
                                f"Non-gene cancer predisposition {biotype}: {name} for {condition['displayName']}"
                            )

    for gene, name, biotype in infer_genes:
        logger.debug(f"Found gene '{gene}' for '{name}' ({biotype})")
        genes.add(gene)

    for name, biotype in non_genes:
        logger.error(f"Unable to find gene for '{name}' ({biotype})")

    return sorted(genes), variants


def get_pharmacogenomic_info(conn: GraphKBConnection) -> Tuple[List[str], Dict[str, str]]:
    """
    Collect the pharmacogenomic genes and their associated variants from GraphKB.

    SDEV-2733 - criteria for what counts as a "pharmacogenomic" variant

    In short:
    * Statement 'source' name is not in PHARMACOGENOMIC_SOURCE_EXCLUDE_LIST (CGI, CIViC)
    * Statement 'relevance' is in the "pharmacogenomic" term set
      (described in SDEV-2733 as 'increased toxicity' or 'decreased toxicity')
    * gene is gotten from any associated 'PositionalVariant' records

    Example: https://graphkb.bcgsc.ca/view/Statement/154:9574

    Args:
        conn: the graphkb connection object

    Returns:
        genes: sorted list of pharmacogenomic gene names
        variants: dictionary mapping pharmacogenomic variant IDs to variant display names
    """
    genes = set()
    non_genes = set()
    infer_genes = set()
    variants = {}

    relevance_rids = list(get_terms_set(conn, "pharmacogenomic"))

    return_properties = [
        "conditions.@class",
        "conditions.@rid",
        "conditions.displayName",
        "conditions.reference1.biotype",
        "conditions.reference1.displayName",
        "conditions.reference2.biotype",
        "conditions.reference2.displayName",
        "source.name",
    ]
    relevance_filter = {
        "relevance": {
            "target": "Vocabulary",
            "filters": {"@rid": relevance_rids},
        },
    }
    payload = {
        "target": "Statement",
        "filters": [relevance_filter],
        "returnProperties": return_properties,
    }

    for statement in conn.query(payload, ignore_cache=False):
        # statements coming from an excluded source are dropped entirely
        statement_source = statement["source"]  # type: ignore
        if (
            statement_source
            and statement_source["name"].lower() in PHARMACOGENOMIC_SOURCE_EXCLUDE_LIST
        ):
            continue

        for condition in statement["conditions"]:  # type: ignore
            if condition["@class"] != "PositionalVariant":
                continue
            variants[condition["@rid"]] = condition["displayName"]

            for reference in ("reference1", "reference2"):
                name = (condition.get(reference) or {}).get("displayName", "")
                biotype = (condition.get(reference) or {}).get("biotype", "")
                if not name:
                    continue
                if biotype == "gene":
                    genes.add(name)
                    continue

                # non-gene feature: try to resolve it to its preferred gene symbol
                gene = get_preferred_gene_name(conn, name)
                if gene:
                    infer_genes.add((gene, name, biotype))
                    continue
                non_genes.add((name, biotype))
                logger.error(
                    f"Non-gene pharmacogenomic {biotype}: {name} for {condition['displayName']}"
                )

    for gene, name, biotype in infer_genes:
        logger.debug(f"Found gene '{gene}' for '{name}' ({biotype})")
        genes.add(gene)

    for name, biotype in non_genes:
        logger.error(f"Unable to find gene for '{name}' ({biotype})")

    return sorted(genes), variants
14 changes: 9 additions & 5 deletions graphkb/match.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
VARIANT_RETURN_PROPERTIES,
)
from .types import BasicPosition, Ontology, ParsedVariant, PositionalVariant, Record, Variant
from .util import FeatureNotFoundError, convert_to_rid_list, looks_like_rid
from .util import FeatureNotFoundError, convert_to_rid_list, logger, looks_like_rid
from .vocab import get_term_tree

FEATURES_CACHE: Set[str] = set()
Expand All @@ -26,8 +26,7 @@ def get_equivalent_features(
source: str = '',
source_id_version: str = '',
) -> List[Ontology]:
"""
Match an equivalent list of features given some input feature name (or ID)
"""Match an equivalent list of features given some input feature name (or ID).
Args:
gene_name: the gene name to search features by
Expand Down Expand Up @@ -62,14 +61,19 @@ def get_equivalent_features(
if source:
filters.append({'source': {'target': 'Source', 'filters': {'name': source}}})

if gene_name.count('.') == 1 and gene_name.split('.')[-1].isnumeric():
# eg. ENSG00000133703.11 or NM_033360.4
logger.debug(
f"Assuming {gene_name} has a .version_format - ignoring the version for equivalent features"
)
gene_name = gene_name.split('.')[0]

if is_source_id or source_id_version:
filters.append({'sourceId': gene_name})

if source_id_version:
filters.append(
{'OR': [{'sourceIdVersion': source_id_version}, {'sourceIdVersion': None}]}
)

elif FEATURES_CACHE and gene_name.lower() not in FEATURES_CACHE and not ignore_cache:
return []
else:
Expand Down
Loading

0 comments on commit 09b33b2

Please sign in to comment.