Skip to content

Commit

Permalink
Merge pull request #94 from bcgsc/release/v1.11.0_gene_information
Browse files Browse the repository at this point in the history
Release/v1.11.0 gene information
  • Loading branch information
dustinbleile authored Jun 14, 2023
2 parents d79c449 + 4223785 commit 59d6ca2
Show file tree
Hide file tree
Showing 12 changed files with 192 additions and 124 deletions.
4 changes: 2 additions & 2 deletions graphkb/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
from .constants import DEFAULT_URL
from .util import GraphKBConnection, logger
from .constants import DEFAULT_URL # noqa: F401
from .util import GraphKBConnection, logger # noqa: F401
1 change: 0 additions & 1 deletion graphkb/constants.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import argparse

from typing import Dict

from .types import CategoryBaseTermMapping
Expand Down
129 changes: 103 additions & 26 deletions graphkb/genes.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
"""
Methods for retrieving gene annotation lists from GraphKB
"""
from typing import Any, Dict, List, Tuple, cast
"""Methods for retrieving gene annotation lists from GraphKB."""
from typing import Any, Dict, List, Sequence, Set, Tuple, cast

from . import GraphKBConnection
from .constants import (
BASE_THERAPEUTIC_TERMS,
CHROMOSOMES,
FAILED_REVIEW_STATUS,
GENERIC_RETURN_PROPERTIES,
GENE_RETURN_PROPERTIES,
ONCOGENE,
ONCOKB_SOURCE_NAME,
Expand Down Expand Up @@ -50,7 +50,7 @@ def _get_oncokb_gene_list(


def get_oncokb_oncogenes(conn: GraphKBConnection) -> List[Ontology]:
"""Gets the list of oncogenes stored in GraphKB derived from OncoKB.
"""Get the list of oncogenes stored in GraphKB derived from OncoKB.
Args:
conn: the graphkb connection object
Expand All @@ -62,7 +62,7 @@ def get_oncokb_oncogenes(conn: GraphKBConnection) -> List[Ontology]:


def get_oncokb_tumour_supressors(conn: GraphKBConnection) -> List[Ontology]:
"""Gets the list of tumour supressor genes stored in GraphKB derived from OncoKB.
"""Get the list of tumour supressor genes stored in GraphKB derived from OncoKB.
Args:
conn: the graphkb connection object
Expand Down Expand Up @@ -116,8 +116,7 @@ def get_genes_from_variant_types(
source_record_ids: List[str] = [],
ignore_cache: bool = False,
) -> List[Ontology]:
"""
Retrieve a list of Genes which are found in variants on the given types
"""Retrieve a list of Genes which are found in variants on the given types.
Args:
conn: the graphkb connection object
Expand All @@ -127,35 +126,36 @@ def get_genes_from_variant_types(
Returns:
List.<dict>: gene (Feature) records
"""
filters: List[Dict[str, Any]] = []
if types:
filters.append(
{'type': {'target': 'Vocabulary', 'filters': {'name': types, 'operator': 'IN'}}}
)

variants = cast(
List[Variant],
conn.query(
{
'target': 'Variant',
'filters': [
{'type': {'target': 'Vocabulary', 'filters': {'name': types, 'operator': 'IN'}}}
],
'filters': filters,
'returnProperties': ['reference1', 'reference2'],
},
ignore_cache=ignore_cache,
),
)

genes = set()

for variant in variants:
genes.add(variant['reference1'])

if variant['reference2']:
genes.add(variant['reference2'])
if not genes:
return []

filters: List[Dict[str, Any]] = [{'biotype': 'gene'}]

if source_record_ids:
filters.append({'source': source_record_ids, 'operator': 'IN'})

if not genes:
return []
result = cast(
List[Ontology],
conn.query(
Expand Down Expand Up @@ -238,10 +238,7 @@ def get_cancer_predisposition_info(conn: GraphKBConnection) -> Tuple[List[str],
"target": "Source",
"filters": {"@rid": get_rid(conn, "Source", "CGL")},
},
"relevance": {
"target": "Vocabulary",
"filters": {"@rid": relevance_rids},
},
"relevance": {"target": "Vocabulary", "filters": {"@rid": relevance_rids}},
}
],
"returnProperties": [
Expand Down Expand Up @@ -312,12 +309,7 @@ def get_pharmacogenomic_info(conn: GraphKBConnection) -> Tuple[List[str], Dict[s
{
"target": "Statement",
"filters": [
{
"relevance": {
"target": "Vocabulary",
"filters": {"@rid": relevance_rids},
},
}
{"relevance": {"target": "Vocabulary", "filters": {"@rid": relevance_rids}}}
],
"returnProperties": [
"conditions.@class",
Expand Down Expand Up @@ -362,3 +354,88 @@ def get_pharmacogenomic_info(conn: GraphKBConnection) -> Tuple[List[str], Dict[s
logger.error(f"Unable to find gene for '{name}' ({biotype})")

return sorted(genes), variants


def convert_to_rid_set(records: Sequence[Dict]) -> Set[str]:
    """Collect the unique record IDs of the given records.

    Args:
        records: record dicts; each one must contain an '@rid' key

    Returns:
        the set of '@rid' values found in ``records``

    Raises:
        KeyError: if any record has no '@rid' key
    """
    rids: Set[str] = set()
    for record in records:
        rids.add(record['@rid'])
    return rids


def get_gene_information(
    graphkb_conn: GraphKBConnection, gene_names: Sequence[str]
) -> List[Dict[str, bool]]:
    """Create a list of gene_info flag dicts for IPR report upload.

    Function is originally from pori_ipr_python::annotate.py

    Gene flags (categories) are: ['cancerRelated', 'knownFusionPartner', 'knownSmallMutation',
    'oncogene', 'therapeuticAssociated', 'tumourSuppressor']

    Args:
        graphkb_conn: the graphkb connection object used for every query
        gene_names: names of the genes to flag; duplicates are ignored and the
            names are processed in sorted order

    Returns:
        List of gene_info dicts of form [{'name':<gene_str>, <flag>: True}]
        Keys of False values are simply omitted from ipr upload to reduce info transfer.
        Genes for which no flag is set are left out of the list entirely.
        Note that the 'name' value is a str, not a bool.

        eg. [{'cancerRelated': True,
              'knownFusionPartner': True,
              'knownSmallMutation': True,
              'name': 'TERT',
              'oncogene': True}]
    """
    unique_names = sorted(set(gene_names))
    if not unique_names:
        # Nothing to flag: the result is necessarily empty, so skip the
        # statement and gene list queries altogether.
        return []

    logger.info('fetching variant related genes list')
    # For query speed, only fetch the minimum needed details
    ret_props = [
        'conditions.@rid',
        'conditions.@class',
        'conditions.reference1',
        'conditions.reference2',
        'reviewStatus',
    ]
    body: Dict[str, Any] = {'target': 'Statement', 'returnProperties': ret_props}

    statements = graphkb_conn.query(body)
    statements = [s for s in statements if s.get('reviewStatus') != FAILED_REVIEW_STATUS]

    # flag name -> record IDs of the genes (features) carrying that flag
    gene_flags: Dict[str, Set[str]] = {
        'cancerRelated': set(),
        'knownFusionPartner': set(),
        'knownSmallMutation': set(),
    }

    for statement in statements:
        for condition in statement['conditions']:
            reference1 = condition.get('reference1')
            if not reference1:
                # conditions without reference1 carry no gene to flag
                continue
            # Read reference2 with .get as well: a condition record that lacks
            # the key must not abort the whole annotation with a KeyError.
            reference2 = condition.get('reference2')
            gene_flags['cancerRelated'].add(reference1)
            if reference2:
                # two references on one condition -> both are fusion partners
                gene_flags['cancerRelated'].add(reference2)
                gene_flags['knownFusionPartner'].add(reference1)
                gene_flags['knownFusionPartner'].add(reference2)
            elif condition.get('@class') == 'PositionalVariant':
                gene_flags['knownSmallMutation'].add(reference1)

    logger.info('fetching oncogenes list')
    gene_flags['oncogene'] = convert_to_rid_set(get_oncokb_oncogenes(graphkb_conn))
    logger.info('fetching tumour supressors list')
    gene_flags['tumourSuppressor'] = convert_to_rid_set(get_oncokb_tumour_supressors(graphkb_conn))

    logger.info('fetching therapeutic associated genes lists')
    gene_flags['therapeuticAssociated'] = convert_to_rid_set(
        get_therapeutic_associated_genes(graphkb_conn)
    )

    logger.info(f"Setting gene_info flags on {len(unique_names)} genes")
    result = []
    for gene_name in unique_names:
        equivalent = convert_to_rid_set(get_equivalent_features(graphkb_conn, gene_name))
        row: Dict[str, Any] = {'name': gene_name}
        for flag, flagged_rids in gene_flags.items():
            # make smaller JSON to upload since all default to false already
            if not equivalent.isdisjoint(flagged_rids):
                row[flag] = True
        # a row holding only its 'name' key has no flag set and is not reported
        if len(row) > 1:
            result.append(row)

    return result
16 changes: 5 additions & 11 deletions graphkb/match.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@
)
from .types import BasicPosition, Ontology, ParsedVariant, PositionalVariant, Record, Variant
from .util import (
convert_to_rid_list,
FeatureNotFoundError,
convert_to_rid_list,
logger,
looks_like_rid,
stringifyVariant,
Expand Down Expand Up @@ -431,11 +431,7 @@ def match_positional_variant(
gene1 = parsed['reference1']

gene1_features = get_equivalent_features(
conn,
gene1,
source=gene_source,
is_source_id=gene_is_source_id,
ignore_cache=ignore_cache,
conn, gene1, source=gene_source, is_source_id=gene_is_source_id, ignore_cache=ignore_cache
)
features = convert_to_rid_list(gene1_features)

Expand Down Expand Up @@ -496,9 +492,7 @@ def match_positional_variant(
):
# TODO: Check if variant and reference_variant should be interchanged
if compare_positional_variants(
variant=parsed,
reference_variant=cast(PositionalVariant, row),
generic=True,
variant=parsed, reference_variant=cast(PositionalVariant, row), generic=True
):
filtered_similarAndGeneric.append(row)
if compare_positional_variants(
Expand All @@ -521,7 +515,7 @@ def match_positional_variant(
'returnProperties': POS_VARIANT_RETURN_PROPERTIES,
},
ignore_cache=ignore_cache,
),
)
)

# disambiguate the variant type
Expand Down Expand Up @@ -597,7 +591,7 @@ def cat_variant_query(
'returnProperties': POS_VARIANT_RETURN_PROPERTIES,
},
ignore_cache=ignore_cache,
),
)
)

result: Dict[str, Variant] = {}
Expand Down
4 changes: 1 addition & 3 deletions graphkb/statement.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,7 @@ def categorize_relevance(


def get_statements_from_variants(
graphkb_conn: GraphKBConnection,
variants: List[Variant],
failed_review: bool = False,
graphkb_conn: GraphKBConnection, variants: List[Variant], failed_review: bool = False
) -> List[Statement]:
"""Given a list of variant records from GraphKB, return related statements.
Expand Down
32 changes: 7 additions & 25 deletions graphkb/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

from .constants import DEFAULT_LIMIT, DEFAULT_URL, AA_3to1_MAPPING, TYPES_TO_NOTATION
from .constants import DEFAULT_LIMIT, DEFAULT_URL, TYPES_TO_NOTATION, AA_3to1_MAPPING
from .types import OntologyTerm, ParsedVariant, PositionalVariant, Record

QUERY_CACHE: Dict[Any, Any] = {}
Expand Down Expand Up @@ -130,12 +130,7 @@ def load(self) -> Optional[float]:
)
return None

def request(
self,
endpoint: str,
method: str = 'GET',
**kwargs,
) -> Dict:
def request(self, endpoint: str, method: str = 'GET', **kwargs) -> Dict:
"""Request wrapper to handle adding common headers and logging.
Args:
Expand Down Expand Up @@ -174,11 +169,7 @@ def request(
self.refresh_login()
self.request_count += 1
resp = requests.request(
method,
url,
headers=self.headers,
timeout=timeout,
**kwargs,
method, url, headers=self.headers, timeout=timeout, **kwargs
)
if resp.status_code == 401 or resp.status_code == 403:
logger.debug(f'/{endpoint} - {resp.status_code} - retrying')
Expand Down Expand Up @@ -276,10 +267,7 @@ def query(
return self.cache[hash_code]

while True:
content = self.post(
'query',
data={**request_body, 'limit': limit, 'skip': len(result)},
)
content = self.post('query', data={**request_body, 'limit': limit, 'skip': len(result)})
records = content['result']
result.extend(records)
if len(records) < limit or not paginate:
Expand Down Expand Up @@ -371,11 +359,7 @@ def stripRefSeq(breakRepr: str) -> str:
return breakRepr


def stripDisplayName(
displayName: str,
withRef: bool = True,
withRefSeq: bool = True,
) -> str:
def stripDisplayName(displayName: str, withRef: bool = True, withRefSeq: bool = True) -> str:
match: object = re.search(r"^(.*)(\:)(.*)$", displayName)
if match and not withRef:
if withRefSeq:
Expand Down Expand Up @@ -409,9 +393,7 @@ def stripDisplayName(


def stringifyVariant(
variant: Union[PositionalVariant, ParsedVariant],
withRef: bool = True,
withRefSeq: bool = True,
variant: Union[PositionalVariant, ParsedVariant], withRef: bool = True, withRefSeq: bool = True
) -> str:
"""
Convert variant record to a string representation (displayName/hgvs)
Expand Down Expand Up @@ -516,7 +498,7 @@ def stringifyVariant(
if withRefSeq:
result.append(f"del{refSeq}ins")
else:
result.append(f"delins")
result.append("delins")
else:
result.append(notationType)
if truncation and truncation != 1:
Expand Down
15 changes: 3 additions & 12 deletions graphkb/vocab.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,7 @@


def query_by_name(ontology_class: str, base_term_name: str) -> Dict:
return {
'target': ontology_class,
'filters': {'name': base_term_name},
}
return {'target': ontology_class, 'filters': {'name': base_term_name}}


def get_equivalent_terms(
Expand Down Expand Up @@ -53,10 +50,7 @@ def get_equivalent_terms(
convert_to_rid_list(
conn.query(
{
'target': {
'target': root_records,
'queryType': 'descendants',
},
'target': {'target': root_records, 'queryType': 'descendants'},
'queryType': 'similarTo',
'treeEdges': [],
'returnProperties': [
Expand Down Expand Up @@ -107,10 +101,7 @@ def get_term_tree(
List[Ontology],
conn.query(
{
'target': {
'target': base_records,
'queryType': 'ancestors',
},
'target': {'target': base_records, 'queryType': 'ancestors'},
'queryType': 'similarTo',
'treeEdges': [],
'returnProperties': ['sourceId', 'sourceIdVersion', 'deprecated', 'name', '@rid'],
Expand Down
Loading

0 comments on commit 59d6ca2

Please sign in to comment.