This repository has been archived by the owner on Aug 29, 2020. It is now read-only.

Commit bc27f83

Merge remote-tracking branch 'upstream/develop' into develop

zhlu9890 committed Jul 21, 2020
2 parents 43109eb + 53eecfe

Showing 45 changed files with 1,765 additions and 33 deletions.
6 changes: 3 additions & 3 deletions README.md
@@ -2,14 +2,14 @@

This repo holds the [stored queries](stored_queries), [schemas](schemas), and [migrations](migrations) for the relation engine graph database service.

-These specifications are used by the [Relation Engine API]()
+These specifications are used by the [Relation Engine API](https://github.com/kbase/relation_engine_api).

-* **Stored queries** are stored [AQL queries](https://docs.arangodb.com/3.3/AQL/index.html) that can be used
+* **Stored queries** are stored [AQL queries](https://docs.arangodb.com/3.5/AQL/index.html) that can be used
by KBase apps to fetch data from the database.
* **Schemas** are [JSON schemas](https://json-schema.org/) that define what form of data can be stored in
the database's collections.
* **Migrations** are Python modules that connect to the database and transition the data in a collection from an old schema to a newer one.
* **Data sources** (in `data_sources/`) contain general information about where some of our imported data comes from.
* **Views** (in `views/`) are raw ArangoSearch view configuration files.

## Development

2 changes: 1 addition & 1 deletion data_sources/rdp_taxonomy.yaml
@@ -3,4 +3,4 @@ category: taxonomy
title: Ribosomal Database Project
home_url: http://rdp.cme.msu.edu/taxomatic/main.spr
data_url: http://rdp.cme.msu.edu/misc/resources.jsp
-logo_url: /images/third-party-data-sources/ncbi/logo-51-64.png
+logo_path: /images/third-party-data-sources/ncbi/logo-51-64.png
19 changes: 19 additions & 0 deletions importers/README.md
@@ -0,0 +1,19 @@
# RE Importers

This directory holds Python modules that import data into ArangoDB.

## Running importers

Configure importers through environment variables with the `RES_` prefix (which stands for Relation Engine Spec).

Global env vars:

* `RES_AUTH_TOKEN` - auth token to use when making requests to the RE API; defaults to a test value
* `RES_API_URL` - URL of the RE API; defaults to a test value

### djornl

```sh
RES_ROOT_DATA_PATH=/path/to/djornl_data \
python -m importers.djornl.main
```
11 changes: 11 additions & 0 deletions importers/djornl/main.py
@@ -0,0 +1,11 @@
"""
Loads the Dan Jacobson/ORNL group's gene and phenotype network data into
arangodb.
Running this requires a set of source files provided by the ORNL group.
"""
from importers.djornl.parser import DJORNL_Parser

if __name__ == '__main__':
parser = DJORNL_Parser()
parser.load_data()
214 changes: 214 additions & 0 deletions importers/djornl/parser.py
@@ -0,0 +1,214 @@
"""
Loads the Dan Jacobson/ORNL group's gene and phenotype network data into
arangodb.
Running this requires a set of source files provided by the ORNL group.
"""
import json
import requests
import os
import csv

import importers.utils.config as config


class DJORNL_Parser(object):

def config(self):
if not hasattr(self, '_config'):
return self._configure()

return self._config

def _configure(self):

configuration = config.load_from_env(extra_required=['ROOT_DATA_PATH'])

# Collection name config
configuration['_NODE_NAME'] = 'djornl_node'
configuration['_EDGE_NAME'] = 'djornl_edge'

# Path config
configuration['_NODE_PATH'] = os.path.join(
configuration['ROOT_DATA_PATH'],
'aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv'
)
configuration['_NODE_FILE_COL_COUNT'] = 20

configuration['_EDGE_PATH'] = os.path.join(
configuration['ROOT_DATA_PATH'],
'merged_edges-AMW-060820_AF.tsv'
)
configuration['_EDGE_FILE_COL_COUNT'] = 5

_CLUSTER_BASE = os.path.join(configuration['ROOT_DATA_PATH'], 'cluster_data')
configuration['_CLUSTER_PATHS'] = {
'cluster_I2': os.path.join(
_CLUSTER_BASE,
'out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I2_named.tsv'
),
'cluster_I4': os.path.join(
_CLUSTER_BASE,
'out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I4_named.tsv'
),
'cluster_I6': os.path.join(
_CLUSTER_BASE,
'out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I6_named.tsv'
),
}
self._config = configuration
return self._config


def load_edges(self):
# Headers and sample row:
# node1 node2 edge edge_descrip layer_descrip
# AT1G01370 AT1G57820 4.40001558779779 AraNetv2_log-likelihood-score AraNetv2-LC_lit-curated-ppi
edge_remap = {
'AraGWAS-Phenotype_Associations': 'pheno_assn',
'AraNetv2-CX_pairwise-gene-coexpression': 'gene_coexpr',
'AraNetv2-DC_domain-co-occurrence': 'domain_co_occur',
'AraNetv2-HT_high-throughput-ppi': 'ppi_hithru',
'AraNetv2-LC_lit-curated-ppi': 'ppi_liter',
}

# dict of nodes, indexed by node ID (node1 and node2 from the file)
node_ix = {}
edges = []
node_name = self.config()['_NODE_NAME']
expected_col_count = self.config()['_EDGE_FILE_COL_COUNT']

with open(self.config()['_EDGE_PATH']) as fd:
csv_reader = csv.reader(fd, delimiter='\t')
next(csv_reader, None) # skip headers
line_no = 1
for row in csv_reader:
line_no += 1

cols = [c.strip() for c in row]
if len(cols) != expected_col_count:
n_cols = len(cols)
raise RuntimeError(f"line {line_no}: expected {expected_col_count} cols, found {n_cols}")

node_ix[cols[0]] = 1
node_ix[cols[1]] = 1
edge_type = cols[4]
if edge_type not in edge_remap:
raise RuntimeError(f"line {line_no}: invalid edge type: {edge_type}")

edges.append({
'_key': f'{cols[0]}__{cols[1]}__{edge_remap[edge_type]}__{cols[2]}',
'_from': f'{node_name}/{cols[0]}',
'_to': f'{node_name}/{cols[1]}',
'score': float(cols[2]),
'edge_type': edge_remap[edge_type],
})

return {
'nodes': [{'_key': n} for n in node_ix.keys()],
'edges': edges,
}


def load_node_metadata(self):
"""Load node metadata"""

nodes = []
expected_col_count = self.config()['_NODE_FILE_COL_COUNT']
with open(self.config()['_NODE_PATH']) as fd:
csv_reader = csv.reader(fd, delimiter=',')
next(csv_reader, None) # skip headers
line_no = 1
for row in csv_reader:
line_no += 1

cols = [c.strip() for c in row]
if len(cols) != expected_col_count:
n_cols = len(cols)
raise RuntimeError(f"line {line_no}: expected {expected_col_count} cols, found {n_cols}")

_key = cols[0]
node_type = cols[1]
if node_type != 'gene' and node_type != 'pheno':
raise RuntimeError(f"line {line_no}: invalid node type: {node_type}")

go_terms = [c.strip() for c in cols[10].split(',')] if len(cols[10]) else []

doc = {
'_key': _key,
'node_type': node_type,
'transcript': cols[2],
'gene_symbol': cols[3],
'gene_full_name': cols[4],
'gene_model_type': cols[5],
'tair_computational_desc': cols[6],
'tair_curator_summary': cols[7],
'tair_short_desc': cols[8],
'go_descr': cols[9],
'go_terms': go_terms,
'mapman_bin': cols[11],
'mapman_name': cols[12],
'mapman_desc': cols[13],
'pheno_aragwas_id': cols[14],
'pheno_desc1': cols[15],
'pheno_desc2': cols[16],
'pheno_desc3': cols[17],
'pheno_ref': cols[18],
'user_notes': cols[19],
}
nodes.append(doc)

return {'nodes': nodes}


def load_cluster_data(self):
"""Annotate genes with cluster ID fields."""
nodes = []
cluster_paths = self.config()['_CLUSTER_PATHS']
for (cluster_label, path) in cluster_paths.items():
with open(path) as fd:
csv_reader = csv.reader(fd, delimiter='\t')
for row in csv_reader:
if len(row) > 1:
# remove the 'Cluster' text
cluster_id = row[0].replace('Cluster','')
gene_keys = row[1:]
nodes += [
{'_key': key, cluster_label: int(cluster_id)}
for key in gene_keys
]

return {'nodes': nodes}


def save_dataset(self, dataset):

if 'nodes' in dataset and len(dataset['nodes']) > 0:
self.save_docs(self.config()['_NODE_NAME'], dataset['nodes'])

if 'edges' in dataset and len(dataset['edges']) > 0:
self.save_docs(self.config()['_EDGE_NAME'], dataset['edges'])


def save_docs(self, coll_name, docs, on_dupe='update'):

resp = requests.put(
self.config()['API_URL'] + '/api/v1/documents',
params={'collection': coll_name, 'on_duplicate': on_dupe},
headers={'Authorization': self.config()['AUTH_TOKEN']},
data='\n'.join(json.dumps(d) for d in docs)
)
if not resp.ok:
raise RuntimeError(resp.text)

print(f"Saved docs to collection {coll_name}!")
print(resp.text)
print('=' * 80)
return resp


def load_data(self):
self.save_dataset(self.load_edges())
self.save_dataset(self.load_node_metadata())
self.save_dataset(self.load_cluster_data())
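A hedged sketch (not part of the commit) of the row-to-document mapping that `load_edges` performs, using the sample row quoted in that method's comment; `edge_remap` is trimmed to the one entry the sample needs.

```python
# Map the raw layer label to the short edge_type, then build the edge doc,
# mirroring the _key / _from / _to / score construction in load_edges.
edge_remap = {'AraNetv2-LC_lit-curated-ppi': 'ppi_liter'}
node_name = 'djornl_node'

cols = ['AT1G01370', 'AT1G57820', '4.40001558779779',
        'AraNetv2_log-likelihood-score', 'AraNetv2-LC_lit-curated-ppi']
edge_type = edge_remap[cols[4]]
edge = {
    '_key': f'{cols[0]}__{cols[1]}__{edge_type}__{cols[2]}',
    '_from': f'{node_name}/{cols[0]}',
    '_to': f'{node_name}/{cols[1]}',
    'score': float(cols[2]),
    'edge_type': edge_type,
}
print(edge['_key'])  # AT1G01370__AT1G57820__ppi_liter__4.40001558779779
```

Embedding the raw score string in `_key` keeps edges of the same type between the same node pair distinct when their scores differ.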

3 changes: 3 additions & 0 deletions importers/tox.ini
@@ -0,0 +1,3 @@
[flake8]
; ignore line length
ignore = E501
28 changes: 28 additions & 0 deletions importers/utils/config.py
@@ -0,0 +1,28 @@
"""
Loads and initializes configuration data for importers using environment
variables and a set of default values.
"""
import os


REQUIRED = []
OPTIONAL = ['AUTH_TOKEN', 'API_URL']
DEFAULTS = {
'AUTH_TOKEN': 'admin_token', # test default
'API_URL': 'http://localhost:5000', # test default
}


def load_from_env(extra_required=None, extra_optional=None, prefix='RES_'):
"""Load all configuration vars from environment variables"""
conf = dict(DEFAULTS)
required = list(REQUIRED) + (extra_required or [])
optional = list(OPTIONAL) + (extra_optional or [])
for field in required:
if (prefix + field) not in os.environ:
print(f"Missing required env var: {prefix + field}")
exit(1)
for field in required + optional:
if (prefix + field) in os.environ:
conf[field] = os.environ[prefix + field]
return conf
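A standalone sketch (not part of the commit) of the precedence `load_from_env` implements: defaults are applied first, then any `RES_`-prefixed environment variable overrides them. This simplified version inlines the defaults and drops the missing-required-var exit check.

```python
# Minimal re-statement of the load_from_env lookup order for illustration.
import os

DEFAULTS = {
    'AUTH_TOKEN': 'admin_token',          # test default
    'API_URL': 'http://localhost:5000',   # test default
}

def load_from_env(extra_required=None, prefix='RES_'):
    conf = dict(DEFAULTS)
    for field in list(DEFAULTS) + (extra_required or []):
        if (prefix + field) in os.environ:
            conf[field] = os.environ[prefix + field]
    return conf

os.environ['RES_ROOT_DATA_PATH'] = '/tmp/djornl_data'
conf = load_from_env(extra_required=['ROOT_DATA_PATH'])
print(conf['API_URL'])         # default kept when no env override is set
print(conf['ROOT_DATA_PATH'])  # taken from the environment
```

This matches how the djornl importer asks for `ROOT_DATA_PATH` via `extra_required`.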
9 changes: 0 additions & 9 deletions migrations/README.md

This file was deleted.

4 changes: 0 additions & 4 deletions migrations/example.py

This file was deleted.

2 changes: 1 addition & 1 deletion schemas/ENVO/ENVO_terms.yaml
@@ -144,4 +144,4 @@ schema:
- comments
- subsets
- synonyms
- - xrefs
+ - xrefs
47 changes: 47 additions & 0 deletions schemas/djornl/djornl_edge.yaml
@@ -0,0 +1,47 @@
name: djornl_edge
type: edge
delta: false

indexes:
- type: hash
fields: [edge_type]
- type: persistent
fields: [score]

schema:
"$schema": http://json-schema.org/draft-07/schema#
title: Arabidopsis gene-gene or gene-phenotype edge
description: Generic gene-to-gene or gene-to-phenotype edge for Dan Jacobson Arabidopsis data
type: object
required: [score, edge_type, _from, _to, _key]
properties:
_key:
type: string
title: Key
_from:
type: string
title: Gene ID
_to:
type: string
title: Gene or Phenotype ID
score:
title: Edge Score (Weight)
# (float)
type: number
edge_type:
title: Edge Type
type: string
oneOf:
- const: domain_co_occur
description: A layer of protein domain co-occurrence values from the Arabidopsis AraNetv2 database. The LLS scores that serve as edge values were calculated from weighted mutual information scores to normalize the data for comparison across studies and different types of data layers (Lee et al, 2015).
- const: gene_coexpr
description: A subset of pairwise gene coexpression values from the Arabidopsis AraNetv2 database. The LLS scores that serve as edge values were calculated from Pearson correlation coefficients to normalize the data for comparison across studies and different types of data layers (Lee et al, 2015).
- const: pheno_assn
description: GWAS associations produced by analyzing a subset of phenotypes and SNPs in the Arabidopsis 1001 Genomes database. Edge values are significant association scores after FDR correction.
- const: ppi_hithru
description: Log likelihood score. A layer of protein-protein interaction values derived from four high-throughput PPI screening experiments; from the Arabidopsis AraNetv2 database. The LLS scores that serve as edge values were calculated to normalize the data for comparison across studies and different types of data layers (Lee et al, 2015).
- const: ppi_liter
description: A layer of protein-protein interaction values from literature-curated small- to medium-scale experimental data; from the Arabidopsis AraNetv2 database. The LLS scores that serve as edge values were calculated to normalize the data for comparison across studies and different types of data layers (Lee et al, 2015).
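A sample document matching the `djornl_edge` schema above, with a minimal hand-rolled check of the required fields and the `edge_type` enum (no `jsonschema` dependency); the sample values are illustrative.

```python
# Illustrative edge document and a sketch of the schema's constraints.
doc = {
    '_key': 'AT1G01370__AT1G57820__ppi_liter__4.40001558779779',
    '_from': 'djornl_node/AT1G01370',
    '_to': 'djornl_node/AT1G57820',
    'score': 4.40001558779779,
    'edge_type': 'ppi_liter',
}

required = ['score', 'edge_type', '_from', '_to', '_key']
valid_edge_types = {'domain_co_occur', 'gene_coexpr', 'pheno_assn',
                    'ppi_hithru', 'ppi_liter'}

assert all(k in doc for k in required)
assert isinstance(doc['score'], (int, float))   # schema type: number
assert doc['edge_type'] in valid_edge_types     # schema oneOf consts
print('sample document satisfies the sketch of the schema')
```

The real service validates against the full JSON schema; this only spot-checks the fields the schema marks required.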