From 08bfb5818f2e1d9fc6e1fe7e2c442bcfbe549286 Mon Sep 17 00:00:00 2001 From: ialarmedalien Date: Wed, 15 Jul 2020 20:59:21 -0700 Subject: [PATCH 1/4] Reorganising cluster data to be a single field with an array of clusters in the form clustering_system_name:cluster_id. The current set of clusters have been renamed from 'cluster_i2' to 'markov_i2' as they were created using Markov clustering with inflation set to 2. --- Makefile | 2 + importers/djornl/parser.py | 47 +- schemas/djornl/djornl_node.yaml | 23 +- .../djornl/djornl_fetch_clusters.yaml | 26 +- test/djornl/results.json | 457 ++++++++++-------- ...p10percent_anno_AF_082919.abc.I2_named.tsv | 2 + ...p10percent_anno_AF_082919.abc.I4_named.tsv | 2 + ...p10percent_anno_AF_082919.abc.I6_named.tsv | 2 + test/stored_queries/test_djornl.py | 92 ++-- test/stored_queries/test_djornl_parser.py | 1 - 10 files changed, 339 insertions(+), 315 deletions(-) diff --git a/Makefile b/Makefile index c3baa81..8c1c446 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,6 @@ .PHONY: test test: + docker-compose down docker-compose run spec sh /app/test/run_tests.sh + docker-compose down \ No newline at end of file diff --git a/importers/djornl/parser.py b/importers/djornl/parser.py index 9eb5c87..c6051e8 100644 --- a/importers/djornl/parser.py +++ b/importers/djornl/parser.py @@ -43,15 +43,15 @@ def _configure(self): _CLUSTER_BASE = os.path.join(configuration['ROOT_DATA_PATH'], 'cluster_data') configuration['_CLUSTER_PATHS'] = { - 'cluster_I2': os.path.join( + 'markov_i2': os.path.join( _CLUSTER_BASE, 'out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I2_named.tsv' ), - 'cluster_I4': os.path.join( + 'markov_i4': os.path.join( _CLUSTER_BASE, 'out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I4_named.tsv' ), - 'cluster_I6': os.path.join( + 'markov_i6': os.path.join( _CLUSTER_BASE, 'out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I6_named.tsv' ), @@ -163,24 +163,49 @@ def load_node_metadata(self): def load_cluster_data(self): """Annotate genes
with cluster ID fields.""" - nodes = [] + + # index of nodes + node_ix = {} cluster_paths = self.config()['_CLUSTER_PATHS'] for (cluster_label, path) in cluster_paths.items(): with open(path) as fd: csv_reader = csv.reader(fd, delimiter='\t') for row in csv_reader: if len(row) > 1: - # remove the 'Cluster' text - cluster_id = row[0].replace('Cluster','') - gene_keys = row[1:] - nodes += [ - {'_key': key, cluster_label: int(cluster_id)} - for key in gene_keys - ] + self._parse_cluster_row(row, cluster_label, node_ix) + + + # gather a list of cluster IDs for each node + nodes = [] + for (key, cluster_data) in node_ix.items(): + clusters = [] + for (cluster_label, id_list) in cluster_data.items(): + clusters += [cluster_label + ":" + id for id in id_list] + + nodes += [{ + '_key': key, + 'clusters': clusters + }] return {'nodes': nodes} + def _parse_cluster_row(self, row, cluster_label, node_ix): + # metadata rows start with '#' + if row[0] != '#': + # remove the 'Cluster' text + cluster_id = row[0].replace('Cluster','') + node_keys = row[1:] + + for key in node_keys: + if key not in node_ix: + node_ix[key] = {} + if cluster_label not in node_ix[key]: + node_ix[key][cluster_label] = [] + + node_ix[key][cluster_label].append(cluster_id) + + def save_dataset(self, dataset): if 'nodes' in dataset and len(dataset['nodes']) > 0: diff --git a/schemas/djornl/djornl_node.yaml b/schemas/djornl/djornl_node.yaml index 9248f1c..f200931 100644 --- a/schemas/djornl/djornl_node.yaml +++ b/schemas/djornl/djornl_node.yaml @@ -13,21 +13,14 @@ schema: type: string title: Key examples: ["AT1G01010"] - cluster_I2: - type: integer - title: Cluster 2 ID - description: Iterative random forest cluster group ID - examples: [1] - cluster_I4: - type: integer - title: Cluster 4 ID - description: Iterative random forest cluster group ID - examples: [13] - cluster_I6: - type: integer - title: Cluster 6 ID - description: Iterative random forest cluster group ID - examples: [27] + clusters: + type: 
array + title: Clusters + description: Clusters to which the node has been assigned + items: + type: string +# pattern: "^\w+:\d+$" + examples: [["markov_i2:1", "markov_i4:5"], ["markov_i6:3"]] node_type: type: string title: Node type diff --git a/stored_queries/djornl/djornl_fetch_clusters.yaml b/stored_queries/djornl/djornl_fetch_clusters.yaml index 4c6b8c5..1fadca3 100644 --- a/stored_queries/djornl/djornl_fetch_clusters.yaml +++ b/stored_queries/djornl/djornl_fetch_clusters.yaml @@ -2,25 +2,13 @@ name: djornl_fetch_clusters description: Fetch all nodes that are members of the specified cluster(s), and the edges and nodes within the specified distance (number of hops) of those nodes. params: type: object + required: [cluster_ids] properties: - cluster_i2_ids: - title: Cluster I2 IDs - description: Cluster I2 IDs to locate - items: {type: integer} - default: [] - examples: [[1], [3, 5]] - cluster_i4_ids: - title: Cluster I4 IDs - description: Cluster I4 IDs to locate - items: {type: integer} - examples: [[2], [4, 6]] - default: [] - cluster_i6_ids: - title: Cluster I6 IDs - description: Cluster I6 IDs to locate - items: {type: integer} - examples: [[666], [999, 333]] - default: [] + cluster_ids: + title: Cluster IDs + description: Cluster IDs, in the form "clustering_system_name:cluster_id" + items: {type: string} + examples: [['markov_i2:5', 'markov_i6:2'],['markov_i6:1']] distance: type: integer title: Traversal Distance @@ -31,7 +19,7 @@ params: query: | LET node_ids = ( FOR n IN djornl_node - FILTER n.cluster_I2 IN @cluster_i2_ids OR n.cluster_I4 IN @cluster_i4_ids OR n.cluster_I6 IN @cluster_i6_ids + FILTER n.clusters ANY IN @cluster_ids FOR node IN 0..@distance ANY n djornl_edge OPTIONS {bfs: true, uniqueVertices: "global"} RETURN DISTINCT node._id diff --git a/test/djornl/results.json b/test/djornl/results.json index a844c2c..7fd3a4d 100644 --- a/test/djornl/results.json +++ b/test/djornl/results.json @@ -27,21 +27,15 @@ }, "load_cluster_data": { "nodes": 
[ - {"_key": "AT1G01010", "cluster_I2": 1}, - {"_key": "AT1G01030", "cluster_I2": 1}, - {"_key": "AT1G01040", "cluster_I2": 1}, - {"_key": "AT1G01050", "cluster_I2": 2}, - {"_key": "AT1G01060", "cluster_I2": 2}, - {"_key": "AT1G01070", "cluster_I2": 2}, - {"_key": "AT1G01080", "cluster_I2": 3}, - {"_key": "AT1G01090", "cluster_I2": 3}, - {"_key": "AT1G01020", "cluster_I2": 5}, - {"_key": "AT1G01040", "cluster_I6": 1}, - {"_key": "AT1G01090", "cluster_I6": 1}, - {"_key": "AT1G01070", "cluster_I6": 2}, - {"_key": "AT1G01010", "cluster_I6": 3}, - {"_key": "AT1G01020", "cluster_I6": 3}, - {"_key": "AT1G01030", "cluster_I6": 3} + {"_key": "AT1G01010", "clusters": ["markov_i2:1", "markov_i6:3"]}, + {"_key": "AT1G01030", "clusters": ["markov_i2:1", "markov_i6:3"]}, + {"_key": "AT1G01040", "clusters": ["markov_i2:1", "markov_i6:1"]}, + {"_key": "AT1G01050", "clusters": ["markov_i2:2"]}, + {"_key": "AT1G01060", "clusters": ["markov_i2:2"]}, + {"_key": "AT1G01070", "clusters": ["markov_i2:2", "markov_i6:2"]}, + {"_key": "AT1G01080", "clusters": ["markov_i2:3"]}, + {"_key": "AT1G01090", "clusters": ["markov_i2:3", "markov_i6:1"]}, + {"_key": "AT1G01020", "clusters": ["markov_i2:5", "markov_i6:3"]} ] }, "load_node_metadata": { @@ -93,220 +87,265 @@ ] }, "fetch_genes": { - "AT1G01010": { - "0": { - "nodes": ["AT1G01010"], - "edges": [] + "keys": { + "Mary Poppins": { + "distance": { + "0": {"nodes": [], "edges": []}, + "1": {"nodes": [], "edges": []}, + "5": {"nodes": [], "edges": []} + } }, - "1": { - "nodes": [ - "AT1G01010", - "AT1G01020", - "AT1G01030", - "AT1G01040" - ], - "edges": [ - "AT1G01010__AT1G01020__ppi_hithru__2.3", - "AT1G01010__AT1G01030__ppi_hithru__2.4", - "AT1G01010__AT1G01040__domain_co_occur__2.5", - "AT1G01010__AT1G01040__ppi_liter__170.5" - ] + "AT1G01010": { + "distance": { + "0": { + "nodes": ["AT1G01010"], + "edges": [] + }, + "1": { + "nodes": [ + "AT1G01010", + "AT1G01020", + "AT1G01030", + "AT1G01040" + ], + "edges": [ + 
"AT1G01010__AT1G01020__ppi_hithru__2.3", + "AT1G01010__AT1G01030__ppi_hithru__2.4", + "AT1G01010__AT1G01040__domain_co_occur__2.5", + "AT1G01010__AT1G01040__ppi_liter__170.5" + ] + }, + "5": { + "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01030", "AT1G01040", "AT1G01050", "AT1G01060"], + "edges": [ + "As2__AT1G01020__pheno_assn__8.4", + "As2__AT1G01040__pheno_assn__5.4", + "As75__AT1G01020__pheno_assn__39.9", + "AT1G01010__AT1G01020__ppi_hithru__2.3", + "AT1G01010__AT1G01030__ppi_hithru__2.4", + "AT1G01010__AT1G01040__domain_co_occur__2.5", + "AT1G01010__AT1G01040__ppi_liter__170.5", + "AT1G01030__AT1G01050__gene_coexpr__2.6", + "AT1G01050__AT1G01060__ppi_liter__2.7" + ] + } + } }, - "5": { - "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01030", "AT1G01040", "AT1G01050", "AT1G01060"], - "edges": [ - "As2__AT1G01020__pheno_assn__8.4", - "As2__AT1G01040__pheno_assn__5.4", - "As75__AT1G01020__pheno_assn__39.9", - "AT1G01010__AT1G01020__ppi_hithru__2.3", - "AT1G01010__AT1G01030__ppi_hithru__2.4", - "AT1G01010__AT1G01040__domain_co_occur__2.5", - "AT1G01010__AT1G01040__ppi_liter__170.5", - "AT1G01030__AT1G01050__gene_coexpr__2.6", - "AT1G01050__AT1G01060__ppi_liter__2.7" - ] - } - }, - "AT1G01020__AT1G01070": { - "0": { - "nodes": ["AT1G01020", "AT1G01070"], - "edges": [] - }, - "1": { - "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01070"], - "edges": [ - "As2__AT1G01020__pheno_assn__8.4", - "As75__AT1G01020__pheno_assn__39.9", - "AT1G01010__AT1G01020__ppi_hithru__2.3" - ] - }, - "5": { - "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01030", "AT1G01040", "AT1G01050", "AT1G01060", "AT1G01070"], - "edges": [ - "As2__AT1G01020__pheno_assn__8.4", - "As2__AT1G01040__pheno_assn__5.4", - "As75__AT1G01020__pheno_assn__39.9", - "AT1G01010__AT1G01020__ppi_hithru__2.3", - "AT1G01010__AT1G01030__ppi_hithru__2.4", - "AT1G01010__AT1G01040__domain_co_occur__2.5", - "AT1G01010__AT1G01040__ppi_liter__170.5", - 
"AT1G01030__AT1G01050__gene_coexpr__2.6", - "AT1G01050__AT1G01060__ppi_liter__2.7" - ] + "AT1G01020__AT1G01070": { + "distance": { + "0": { + "nodes": ["AT1G01020", "AT1G01070"], + "edges": [] + }, + "1": { + "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01070"], + "edges": [ + "As2__AT1G01020__pheno_assn__8.4", + "As75__AT1G01020__pheno_assn__39.9", + "AT1G01010__AT1G01020__ppi_hithru__2.3" + ] + }, + "5": { + "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01030", "AT1G01040", "AT1G01050", "AT1G01060", "AT1G01070"], + "edges": [ + "As2__AT1G01020__pheno_assn__8.4", + "As2__AT1G01040__pheno_assn__5.4", + "As75__AT1G01020__pheno_assn__39.9", + "AT1G01010__AT1G01020__ppi_hithru__2.3", + "AT1G01010__AT1G01030__ppi_hithru__2.4", + "AT1G01010__AT1G01040__domain_co_occur__2.5", + "AT1G01010__AT1G01040__ppi_liter__170.5", + "AT1G01030__AT1G01050__gene_coexpr__2.6", + "AT1G01050__AT1G01060__ppi_liter__2.7" + ] + } + } } } }, "fetch_phenotypes": { - "As2": { - "0": { - "nodes": ["As2"], - "edges": [] - }, - "1": { - "nodes": ["As2", "AT1G01020", "AT1G01040"], - "edges": [ - "As2__AT1G01020__pheno_assn__8.4", - "As2__AT1G01040__pheno_assn__5.4" - ] - }, - "5": { - "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01030", "AT1G01040", "AT1G01050", "AT1G01060"], - "edges": [ - "As2__AT1G01020__pheno_assn__8.4", - "As2__AT1G01040__pheno_assn__5.4", - "As75__AT1G01020__pheno_assn__39.9", - "AT1G01010__AT1G01020__ppi_hithru__2.3", - "AT1G01010__AT1G01030__ppi_hithru__2.4", - "AT1G01010__AT1G01040__domain_co_occur__2.5", - "AT1G01010__AT1G01040__ppi_liter__170.5", - "AT1G01030__AT1G01050__gene_coexpr__2.6", - "AT1G01050__AT1G01060__ppi_liter__2.7" - ] - } - }, - "As2__Na23": { - "0": { - "nodes": ["As2", "Na23"], - "edges": [] + "keys": { + "Mary Poppins": { + "distance": { + "0": {"nodes": [], "edges": []}, + "1": {"nodes": [], "edges": []}, + "5": {"nodes": [], "edges": []} + } }, - "1": { - "nodes": ["As2", "Na23", "AT1G01020", "AT1G01040"], - "edges": 
[ - "As2__AT1G01020__pheno_assn__8.4", - "As2__AT1G01040__pheno_assn__5.4" - ] + "As2": { + "distance": { + "0": { + "nodes": ["As2"], + "edges": [] + }, + "1": { + "nodes": ["As2", "AT1G01020", "AT1G01040"], + "edges": [ + "As2__AT1G01020__pheno_assn__8.4", + "As2__AT1G01040__pheno_assn__5.4" + ] + }, + "5": { + "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01030", "AT1G01040", "AT1G01050", "AT1G01060"], + "edges": [ + "As2__AT1G01020__pheno_assn__8.4", + "As2__AT1G01040__pheno_assn__5.4", + "As75__AT1G01020__pheno_assn__39.9", + "AT1G01010__AT1G01020__ppi_hithru__2.3", + "AT1G01010__AT1G01030__ppi_hithru__2.4", + "AT1G01010__AT1G01040__domain_co_occur__2.5", + "AT1G01010__AT1G01040__ppi_liter__170.5", + "AT1G01030__AT1G01050__gene_coexpr__2.6", + "AT1G01050__AT1G01060__ppi_liter__2.7" + ] + } + } }, - "5": { - "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01030", "AT1G01040", "AT1G01050", "AT1G01060", "Na23"], - "edges": [ - "As2__AT1G01020__pheno_assn__8.4", - "As2__AT1G01040__pheno_assn__5.4", - "As75__AT1G01020__pheno_assn__39.9", - "AT1G01010__AT1G01020__ppi_hithru__2.3", - "AT1G01010__AT1G01030__ppi_hithru__2.4", - "AT1G01010__AT1G01040__domain_co_occur__2.5", - "AT1G01010__AT1G01040__ppi_liter__170.5", - "AT1G01030__AT1G01050__gene_coexpr__2.6", - "AT1G01050__AT1G01060__ppi_liter__2.7" - ] + "As2__Na23": { + "distance": { + "0": { + "nodes": ["As2", "Na23"], + "edges": [] + }, + "1": { + "nodes": ["As2", "Na23", "AT1G01020", "AT1G01040"], + "edges": [ + "As2__AT1G01020__pheno_assn__8.4", + "As2__AT1G01040__pheno_assn__5.4" + ] + }, + "5": { + "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01030", "AT1G01040", "AT1G01050", "AT1G01060", "Na23"], + "edges": [ + "As2__AT1G01020__pheno_assn__8.4", + "As2__AT1G01040__pheno_assn__5.4", + "As75__AT1G01020__pheno_assn__39.9", + "AT1G01010__AT1G01020__ppi_hithru__2.3", + "AT1G01010__AT1G01030__ppi_hithru__2.4", + "AT1G01010__AT1G01040__domain_co_occur__2.5", + 
"AT1G01010__AT1G01040__ppi_liter__170.5", + "AT1G01030__AT1G01050__gene_coexpr__2.6", + "AT1G01050__AT1G01060__ppi_liter__2.7" + ] + } + } } } }, "search_nodes": { - "Mary Poppins": { - "0": {"nodes": [], "edges": []}, - "1": {"nodes": [], "edges": []}, - "5": {"nodes": [], "edges": []} - }, - "GO:0005515": { - "0": { - "nodes": ["AT1G01040", "AT1G01090"], - "edges": [] - }, - "1": { - "nodes": ["As2", "AT1G01010", "AT1G01040", "AT1G01080", "AT1G01090"], - "edges": [ - "As2__AT1G01040__pheno_assn__5.4", - "AT1G01010__AT1G01040__domain_co_occur__2.5", - "AT1G01010__AT1G01040__ppi_liter__170.5", - "AT1G01080__AT1G01090__ppi_liter__2.8" - ] + "search_text": { + "Mary Poppins": { + "distance": { + "0": {"nodes": [], "edges": []}, + "1": {"nodes": [], "edges": []}, + "5": {"nodes": [], "edges": []} + } }, - "5": { - "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01030", "AT1G01040", "AT1G01050", "AT1G01060", "AT1G01080", "AT1G01090"], - "edges": [ - "As2__AT1G01020__pheno_assn__8.4", - "As2__AT1G01040__pheno_assn__5.4", - "As75__AT1G01020__pheno_assn__39.9", - "AT1G01010__AT1G01020__ppi_hithru__2.3", - "AT1G01010__AT1G01030__ppi_hithru__2.4", - "AT1G01010__AT1G01040__domain_co_occur__2.5", - "AT1G01010__AT1G01040__ppi_liter__170.5", - "AT1G01030__AT1G01050__gene_coexpr__2.6", - "AT1G01050__AT1G01060__ppi_liter__2.7", - "AT1G01080__AT1G01090__ppi_liter__2.8" - ] + "GO:0005515": { + "distance": { + "0": { + "nodes": ["AT1G01040", "AT1G01090"], + "edges": [] + }, + "1": { + "nodes": ["As2", "AT1G01010", "AT1G01040", "AT1G01080", "AT1G01090"], + "edges": [ + "As2__AT1G01040__pheno_assn__5.4", + "AT1G01010__AT1G01040__domain_co_occur__2.5", + "AT1G01010__AT1G01040__ppi_liter__170.5", + "AT1G01080__AT1G01090__ppi_liter__2.8" + ] + }, + "5": { + "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01030", "AT1G01040", "AT1G01050", "AT1G01060", "AT1G01080", "AT1G01090"], + "edges": [ + "As2__AT1G01020__pheno_assn__8.4", + "As2__AT1G01040__pheno_assn__5.4", + 
"As75__AT1G01020__pheno_assn__39.9", + "AT1G01010__AT1G01020__ppi_hithru__2.3", + "AT1G01010__AT1G01030__ppi_hithru__2.4", + "AT1G01010__AT1G01040__domain_co_occur__2.5", + "AT1G01010__AT1G01040__ppi_liter__170.5", + "AT1G01030__AT1G01050__gene_coexpr__2.6", + "AT1G01050__AT1G01060__ppi_liter__2.7", + "AT1G01080__AT1G01090__ppi_liter__2.8" + ] + } + } } } }, "fetch_clusters": { - "i6-1": { - "0": { - "nodes": ["AT1G01040", "AT1G01090"], - "edges": [] - }, - "1": { - "nodes": ["As2", "AT1G01010", "AT1G01040", "AT1G01080", "AT1G01090"], - "edges": [ - "As2__AT1G01040__pheno_assn__5.4", - "AT1G01010__AT1G01040__domain_co_occur__2.5", - "AT1G01010__AT1G01040__ppi_liter__170.5", - "AT1G01080__AT1G01090__ppi_liter__2.8" - ] - }, - "5": { - "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01030", "AT1G01040", "AT1G01050", "AT1G01060", "AT1G01080", "AT1G01090"], - "edges": [ - "As2__AT1G01020__pheno_assn__8.4", - "As2__AT1G01040__pheno_assn__5.4", - "As75__AT1G01020__pheno_assn__39.9", - "AT1G01010__AT1G01020__ppi_hithru__2.3", - "AT1G01010__AT1G01030__ppi_hithru__2.4", - "AT1G01010__AT1G01040__domain_co_occur__2.5", - "AT1G01010__AT1G01040__ppi_liter__170.5", - "AT1G01030__AT1G01050__gene_coexpr__2.6", - "AT1G01050__AT1G01060__ppi_liter__2.7", - "AT1G01080__AT1G01090__ppi_liter__2.8" - ] - } - }, - "i2-5__i6-2": { - "0": { - "nodes": ["AT1G01020", "AT1G01070"], - "edges": [] + "cluster_ids": { + "Mary Poppins": { + "distance": { + "0": {"nodes": [], "edges": []}, + "1": {"nodes": [], "edges": []}, + "5": {"nodes": [], "edges": []} + } }, - "1": { - "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01070"], - "edges": [ - "As2__AT1G01020__pheno_assn__8.4", - "As75__AT1G01020__pheno_assn__39.9", - "AT1G01010__AT1G01020__ppi_hithru__2.3" - ] + "markov_i6:1": { + "distance": { + "0": { + "nodes": ["AT1G01040", "AT1G01090"], + "edges": [] + }, + "1": { + "nodes": ["As2", "AT1G01010", "AT1G01040", "AT1G01080", "AT1G01090"], + "edges": [ + 
"As2__AT1G01040__pheno_assn__5.4", + "AT1G01010__AT1G01040__domain_co_occur__2.5", + "AT1G01010__AT1G01040__ppi_liter__170.5", + "AT1G01080__AT1G01090__ppi_liter__2.8" + ] + }, + "5": { + "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01030", "AT1G01040", "AT1G01050", "AT1G01060", "AT1G01080", "AT1G01090"], + "edges": [ + "As2__AT1G01020__pheno_assn__8.4", + "As2__AT1G01040__pheno_assn__5.4", + "As75__AT1G01020__pheno_assn__39.9", + "AT1G01010__AT1G01020__ppi_hithru__2.3", + "AT1G01010__AT1G01030__ppi_hithru__2.4", + "AT1G01010__AT1G01040__domain_co_occur__2.5", + "AT1G01010__AT1G01040__ppi_liter__170.5", + "AT1G01030__AT1G01050__gene_coexpr__2.6", + "AT1G01050__AT1G01060__ppi_liter__2.7", + "AT1G01080__AT1G01090__ppi_liter__2.8" + ] + } + } }, - "5": { - "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01030", "AT1G01040", "AT1G01050", "AT1G01060", "AT1G01070"], - "edges": [ - "As2__AT1G01020__pheno_assn__8.4", - "As2__AT1G01040__pheno_assn__5.4", - "As75__AT1G01020__pheno_assn__39.9", - "AT1G01010__AT1G01020__ppi_hithru__2.3", - "AT1G01010__AT1G01030__ppi_hithru__2.4", - "AT1G01010__AT1G01040__domain_co_occur__2.5", - "AT1G01010__AT1G01040__ppi_liter__170.5", - "AT1G01030__AT1G01050__gene_coexpr__2.6", - "AT1G01050__AT1G01060__ppi_liter__2.7" - ] + "markov_i2:5__markov_i6:2": { + "distance": { + "0": { + "nodes": ["AT1G01020", "AT1G01070"], + "edges": [] + }, + "1": { + "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01070"], + "edges": [ + "As2__AT1G01020__pheno_assn__8.4", + "As75__AT1G01020__pheno_assn__39.9", + "AT1G01010__AT1G01020__ppi_hithru__2.3" + ] + }, + "5": { + "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01030", "AT1G01040", "AT1G01050", "AT1G01060", "AT1G01070"], + "edges": [ + "As2__AT1G01020__pheno_assn__8.4", + "As2__AT1G01040__pheno_assn__5.4", + "As75__AT1G01020__pheno_assn__39.9", + "AT1G01010__AT1G01020__ppi_hithru__2.3", + "AT1G01010__AT1G01030__ppi_hithru__2.4", + 
"AT1G01010__AT1G01040__domain_co_occur__2.5", + "AT1G01010__AT1G01040__ppi_liter__170.5", + "AT1G01030__AT1G01050__gene_coexpr__2.6", + "AT1G01050__AT1G01060__ppi_liter__2.7" + ] + } + } } } } diff --git a/test/djornl/test_data/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I2_named.tsv b/test/djornl/test_data/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I2_named.tsv index 086a920..585e0a5 100644 --- a/test/djornl/test_data/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I2_named.tsv +++ b/test/djornl/test_data/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I2_named.tsv @@ -1,3 +1,5 @@ +# prefix: markov_i2 +# title: Markov clustering, inflation = 2 Cluster1 AT1G01010 AT1G01030 AT1G01040 Cluster2 AT1G01050 AT1G01060 AT1G01070 Cluster3 AT1G01080 AT1G01090 diff --git a/test/djornl/test_data/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I4_named.tsv b/test/djornl/test_data/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I4_named.tsv index 8b13789..3cb18ea 100644 --- a/test/djornl/test_data/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I4_named.tsv +++ b/test/djornl/test_data/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I4_named.tsv @@ -1 +1,3 @@ +# prefix: markov_i4 +# title: Markov clustering, inflation = 4 diff --git a/test/djornl/test_data/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I6_named.tsv b/test/djornl/test_data/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I6_named.tsv index 389cae2..d6a1b07 100644 --- a/test/djornl/test_data/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I6_named.tsv +++ b/test/djornl/test_data/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I6_named.tsv @@ -1,3 +1,5 @@ +# prefix: markov_i6 +# title: Markov clustering, inflation = 6 Cluster1 AT1G01040 
AT1G01090 Cluster2 AT1G01070 Cluster3 AT1G01010 AT1G01020 AT1G01030 diff --git a/test/stored_queries/test_djornl.py b/test/stored_queries/test_djornl.py index df2a7e5..e61835b 100644 --- a/test/stored_queries/test_djornl.py +++ b/test/stored_queries/test_djornl.py @@ -79,6 +79,7 @@ def check_expected_results(self, description, response, expected): if _VERBOSE: print("Running test " + description) + results = response['results'][0] self.assertEqual( set([n["_key"] for n in results['nodes']]), @@ -112,25 +113,29 @@ def test_fetch_all(self): # indexing schema in results.json - # self.json_data[query][primary_param][distance_param] - # if primary_param is an array, join the array entities with "__" + # self.json_data[query_name][param_name][param_value]["distance"][distance_param] + # e.g. for fetch_clusters data: + # "fetch_clusters": { + # "cluster_ids": { + # "markov_i2:6__markov_i4:3": { + # "distance": { + # 1: { + # "nodes": [ node IDs ], + # "edges": [ edge data ], + # } + # } + # } + # } + # } + # if param_value is an array, join the array entities with "__" # results are in the form {"nodes": [...], "edges": [...]} # nodes are represented as a list of node[_key] # edges are objects with keys _to, _from, edge_type and score - def test_fetch_phenotypes_no_results(self): - - resp = self.submit_query('djornl_fetch_phenotypes', { - # gene node - "keys": ["AT1G01010"], - }) - self.assertEqual(resp['results'][0], self.no_results) - - def test_fetch_phenotypes(self): - for fetch_args in self.json_data['fetch_phenotypes'].keys(): - for distance in self.json_data['fetch_phenotypes'][fetch_args].keys(): + for (fetch_args, key_data) in self.json_data['fetch_phenotypes']['keys'].items(): + for (distance, distance_data) in key_data['distance'].items(): resp = self.submit_query('djornl_fetch_phenotypes', { "keys": fetch_args.split('__'), "distance": int(distance), @@ -138,22 +143,14 @@ def test_fetch_phenotypes(self): self.check_expected_results( "fetch phenotypes with args 
" + fetch_args + " and distance " + distance, resp, - self.json_data['fetch_phenotypes'][fetch_args][distance] + distance_data ) - def test_fetch_genes_no_results(self): - resp = self.submit_query('djornl_fetch_genes', { - # phenotype node - "keys": ["As2"], - }) - self.assertEqual(resp['results'][0], self.no_results) - - def test_fetch_genes(self): - for fetch_args in self.json_data['fetch_genes'].keys(): - for distance in self.json_data['fetch_genes'][fetch_args].keys(): + for (fetch_args, key_data) in self.json_data['fetch_genes']['keys'].items(): + for (distance, distance_data) in key_data['distance'].items(): resp = self.submit_query('djornl_fetch_genes', { "keys": fetch_args.split('__'), "distance": int(distance), @@ -161,54 +158,29 @@ def test_fetch_genes(self): self.check_expected_results( "fetch genes with args " + fetch_args + " and distance " + distance, resp, - self.json_data['fetch_genes'][fetch_args][distance] + distance_data ) - def test_fetch_clusters_no_results(self): - - resp = self.submit_query('djornl_fetch_clusters', { - 'cluster_i2_ids': [666], - 'cluster_i4_ids': [666], - 'cluster_i6_ids': [666], - }) - self.assertEqual(resp['results'][0], self.no_results) - - def test_fetch_clusters(self): - for fetch_args in self.json_data['fetch_clusters'].keys(): - cluster_args = {} - for arg in fetch_args.split('__'): - [c_name, c_id] = arg.split('-', maxsplit=1) - if "cluster_" + c_name + "_ids" in cluster_args: - cluster_args["cluster_" + c_name + "_ids"] += int(c_id) - else: - cluster_args["cluster_" + c_name + "_ids"] = [int(c_id)] - - for distance in self.json_data['fetch_clusters'][fetch_args].keys(): - cluster_args['distance'] = int(distance) - resp = self.submit_query('djornl_fetch_clusters', cluster_args) + for (fetch_args, cluster_data) in self.json_data['fetch_clusters']['cluster_ids'].items(): + for (distance, distance_data) in cluster_data['distance'].items(): + resp = self.submit_query('djornl_fetch_clusters', { + "cluster_ids": 
fetch_args.split('__'), + "distance": int(distance), + }) self.check_expected_results( "fetch clusters with args " + fetch_args + " and distance " + distance, resp, - self.json_data['fetch_clusters'][fetch_args][distance] + distance_data ) - @unittest.skip('This test is disabled until automated view loading is possible') - def test_search_nodes_no_results(self): - - resp = self.submit_query('djornl_search_nodes', { - "search_text": "Mary Poppins", - }) - self.assertEqual(resp['results'][0], self.no_results) - - @unittest.skip('This test is disabled until automated view loading is possible') def test_search_nodes(self): - for search_text in self.json_data['search_nodes'].keys(): - for distance in self.json_data['search_nodes'][search_text].keys(): + for (search_text, search_data) in self.json_data['search_nodes']['search_text'].items(): + for (distance, distance_data) in search_data['distance'].items(): resp = self.submit_query('djornl_search_nodes', { "search_text": search_text, "distance": int(distance), @@ -216,5 +188,5 @@ def test_search_nodes(self): self.check_expected_results( "search nodes with args " + search_text + " and distance " + distance, resp, - self.json_data['search_nodes'][search_text][distance] + distance_data ) diff --git a/test/stored_queries/test_djornl_parser.py b/test/stored_queries/test_djornl_parser.py index b2043b9..8d38761 100644 --- a/test/stored_queries/test_djornl_parser.py +++ b/test/stored_queries/test_djornl_parser.py @@ -142,4 +142,3 @@ def test_load_valid_cluster_data(self): cluster_data, self.json_data["load_cluster_data"] ) - From 93205a274600b6233e6c67f74684aa786d17fbf4 Mon Sep 17 00:00:00 2001 From: ialarmedalien Date: Mon, 20 Jul 2020 15:09:16 -0700 Subject: [PATCH 2/4] Adding in manifest and manifest schema for indicating the list of files that make up the data, plus code to validate the manifest. 
Created manifests for all test files and updated tests accordingly Added djornl data source (github repo) --- data_sources/djornl.yaml | 5 + importers/djornl/manifest.schema.json | 52 +++ importers/djornl/parser.py | 318 +++++++++++------- schemas/deltaloader/delta_load_registry.yaml | 2 +- schemas/djornl/djornl_node.yaml | 4 + test/djornl/col_count_errors/manifest.yaml | 5 + test/djornl/empty_files/manifest.yaml | 17 + test/djornl/invalid_file/manifest.yaml | 9 + test/djornl/invalid_manifest/manifest.yaml | 10 + ...aragwas-MERGED-AMW-v2_091319_nodeTable.csv | 1 + test/djornl/invalid_types/manifest.yaml | 5 + .../merged_edges-AMW-060820_AF.tsv | 1 + test/djornl/missing_files/manifest.yaml | 9 + ...F_082919.abc.I2_named.tsv => I2_named.tsv} | 1 + ...F_082919.abc.I4_named.tsv => I4_named.tsv} | 2 +- ...F_082919.abc.I6_named.tsv => I6_named.tsv} | 1 + ...rged_edges-AMW-060820_AF.tsv => edges.tsv} | 0 test/djornl/test_data/manifest.yaml | 19 ++ ...-AMW-v2_091319_nodeTable.csv => nodes.csv} | 1 + test/stored_queries/test_djornl.py | 24 +- test/stored_queries/test_djornl_parser.py | 89 +++-- 21 files changed, 405 insertions(+), 170 deletions(-) create mode 100644 data_sources/djornl.yaml create mode 100644 importers/djornl/manifest.schema.json create mode 100644 test/djornl/col_count_errors/manifest.yaml create mode 100644 test/djornl/empty_files/manifest.yaml create mode 100644 test/djornl/invalid_file/manifest.yaml create mode 100644 test/djornl/invalid_manifest/manifest.yaml create mode 100644 test/djornl/invalid_types/manifest.yaml create mode 100644 test/djornl/missing_files/manifest.yaml rename test/djornl/test_data/{cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I2_named.tsv => I2_named.tsv} (90%) rename test/djornl/test_data/{cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I4_named.tsv => I4_named.tsv} (74%) rename 
test/djornl/test_data/{cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I6_named.tsv => I6_named.tsv} (88%) rename test/djornl/test_data/{merged_edges-AMW-060820_AF.tsv => edges.tsv} (100%) create mode 100644 test/djornl/test_data/manifest.yaml rename test/djornl/test_data/{aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv => nodes.csv} (99%) diff --git a/data_sources/djornl.yaml b/data_sources/djornl.yaml new file mode 100644 index 0000000..495aa8a --- /dev/null +++ b/data_sources/djornl.yaml @@ -0,0 +1,5 @@ +name: djornl +category: network +title: Jacobson Lab Exascale Networking data +home_url: https://github.com/kbase/exascale_data +data_url: https://github.com/kbase/exascale_data/releases/latest diff --git a/importers/djornl/manifest.schema.json b/importers/djornl/manifest.schema.json new file mode 100644 index 0000000..e29ab28 --- /dev/null +++ b/importers/djornl/manifest.schema.json @@ -0,0 +1,52 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "title": "Exascale parser file manifest", + "type": "array", + "items": { + "type": "object", + "required": ["data_type", "path"], + "oneOf": [ + { + "properties": { + "data_type": { "enum": ["cluster"] } + }, + "required": [ "prefix" ] + }, + { + "properties": { + "data_type": { "enum": [ "node", "edge" ] } + } + } + ], + "properties": { + "data_type": { + "title": "Data type", + "type": "string", + "enum": ["node", "edge", "cluster"] + }, + "creation_date": { + "title": "File creation date", + "description": "date of file creation in the format YYYY-MM-DD", + "type": "string", + "format": "date" + }, + "description": { + "title": "Description of the cluster set", + "type": "string" + }, + "path": { + "title": "File path", + "type": "string" + }, + "prefix": { + "title": "Prefix", + "type": "string", + "description": "The prefix to be used for clusters, e.g. markov_i2:4. 
Required for cluster data, not used for node or edge data" + }, + "title": { + "title": "Name of the cluster set", + "type": "string" + } + } + } +} diff --git a/importers/djornl/parser.py b/importers/djornl/parser.py index c6051e8..ebf7ed1 100644 --- a/importers/djornl/parser.py +++ b/importers/djornl/parser.py @@ -8,6 +8,12 @@ import requests import os import csv +import yaml +import json +import jsonschema + +from jsonschema.validators import Draft7Validator +from jsonschema.exceptions import ValidationError import importers.utils.config as config @@ -28,34 +34,50 @@ def _configure(self): configuration['_NODE_NAME'] = 'djornl_node' configuration['_EDGE_NAME'] = 'djornl_edge' - # Path config - configuration['_NODE_PATH'] = os.path.join( - configuration['ROOT_DATA_PATH'], - 'aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv' - ) - configuration['_NODE_FILE_COL_COUNT'] = 20 + # read the manifest file, which contains path and file type info + manifest_file = os.path.join(configuration['ROOT_DATA_PATH'], 'manifest.yaml') + + try: + with open(manifest_file) as fd: + manifest = yaml.safe_load(fd) + except FileNotFoundError: + raise RuntimeError( + f"No manifest file found at {manifest_file}.\n" + + "Please ensure that you have created a manifest that lists the files " + + "in the release" + ) + + # load the schema for the manifest and ensure that it is valid + schema_file = os.path.join(os.path.dirname(__file__), 'manifest.schema.json') + with open(schema_file) as fd: + manifest_schema = json.load(fd) + + validator = Draft7Validator(manifest_schema) + if not validator.is_valid(manifest): + error_list = [] + raise RuntimeError( + "The manifest file failed validation with the following errors:\n" + + "\n".join(e.message for e in sorted(validator.iter_errors(manifest), key=str)) + + "\nPlease recheck the file and try again." 
+ ) + + # make sure all the files listed actually exist + for type in ['node', 'edge', 'cluster']: + configuration[type + '_files'] = [] + + for file in manifest: + file_path = os.path.join(configuration['ROOT_DATA_PATH'], file['path']) + + if not os.path.exists(file_path): + raise RuntimeError(f"{file_path}: file does not exist") + + if not os.path.isfile(file_path): + raise RuntimeError(f"{file_path}: not a file") + + # add the file to the appropriate list + file['file_path'] = file_path + configuration[file['data_type'] + '_files'].append(file) - configuration['_EDGE_PATH'] = os.path.join( - configuration['ROOT_DATA_PATH'], - 'merged_edges-AMW-060820_AF.tsv' - ) - configuration['_EDGE_FILE_COL_COUNT'] = 5 - - _CLUSTER_BASE = os.path.join(configuration['ROOT_DATA_PATH'], 'cluster_data') - configuration['_CLUSTER_PATHS'] = { - 'markov_i2': os.path.join( - _CLUSTER_BASE, - 'out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I2_named.tsv' - ), - 'markov_i4': os.path.join( - _CLUSTER_BASE, - 'out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I4_named.tsv' - ), - 'markov_i6': os.path.join( - _CLUSTER_BASE, - 'out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I6_named.tsv' - ), - } self._config = configuration return self._config @@ -76,33 +98,44 @@ def load_edges(self): node_ix = {} edges = [] node_name = self.config()['_NODE_NAME'] - expected_col_count = self.config()['_EDGE_FILE_COL_COUNT'] - - with open(self.config()['_EDGE_PATH']) as fd: - csv_reader = csv.reader(fd, delimiter='\t') - next(csv_reader, None) # skip headers - line_no = 1 - for row in csv_reader: - line_no += 1 - - cols = [c.strip() for c in row] - if len(cols) != expected_col_count: - n_cols = len(cols) - raise RuntimeError(f"line {line_no}: expected {expected_col_count} cols, found {n_cols}") - - node_ix[cols[0]] = 1 - node_ix[cols[1]] = 1 - edge_type = cols[4] - if edge_type not in edge_remap: - raise RuntimeError(f"line {line_no}: invalid edge type: {edge_type}") - - 
edges.append({ - '_key': f'{cols[0]}__{cols[1]}__{edge_remap[edge_type]}__{cols[2]}', - '_from': f'{node_name}/{cols[0]}', - '_to': f'{node_name}/{cols[1]}', - 'score': float(cols[2]), - 'edge_type': edge_remap[edge_type], - }) + expected_col_count = 0 + headers = [] + + for file in self.config()['edge_files']: + with open(file['file_path']) as fd: + csv_reader = csv.reader(fd, delimiter='\t') + line_no = 0 + for row in csv_reader: + line_no += 1 + if len(row) <= 1 or row[0][0] == '#': + # comment / metadata + continue + + cols = [c.strip() for c in row] + + if len(cols) != expected_col_count: + n_cols = len(cols) + + if len(headers) == 0: + expected_col_count = len(cols) + headers = cols + continue + + raise RuntimeError(f"{file['path']} line {line_no}: expected {expected_col_count} cols, found {n_cols}") + + node_ix[cols[0]] = 1 + node_ix[cols[1]] = 1 + edge_type = cols[4] + if edge_type not in edge_remap: + raise RuntimeError(f"{file['path']} line {line_no}: invalid edge type: {edge_type}") + + edges.append({ + '_key': f'{cols[0]}__{cols[1]}__{edge_remap[edge_type]}__{cols[2]}', + '_from': f'{node_name}/{cols[0]}', + '_to': f'{node_name}/{cols[1]}', + 'score': float(cols[2]), + 'edge_type': edge_remap[edge_type], + }) return { 'nodes': [{'_key': n} for n in node_ix.keys()], @@ -114,49 +147,62 @@ def load_node_metadata(self): """Load node metadata""" nodes = [] - expected_col_count = self.config()['_NODE_FILE_COL_COUNT'] - with open(self.config()['_NODE_PATH']) as fd: - csv_reader = csv.reader(fd, delimiter=',') - next(csv_reader, None) # skip headers - line_no = 1 - for row in csv_reader: - line_no += 1 - - cols = [c.strip() for c in row] - if len(cols) != expected_col_count: - n_cols = len(cols) - raise RuntimeError(f"line {line_no}: expected {expected_col_count} cols, found {n_cols}") - - _key = cols[0] - node_type = cols[1] - if node_type != 'gene' and node_type != 'pheno': - raise RuntimeError(f"line {line_no}: invalid node type: {node_type}") - - go_terms = 
[c.strip() for c in cols[10].split(',')] if len(cols[10]) else [] - - doc = { - '_key': _key, - 'node_type': node_type, - 'transcript': cols[2], - 'gene_symbol': cols[3], - 'gene_full_name': cols[4], - 'gene_model_type': cols[5], - 'tair_computational_desc': cols[6], - 'tair_curator_summary': cols[7], - 'tair_short_desc': cols[8], - 'go_descr': cols[9], - 'go_terms': go_terms, - 'mapman_bin': cols[11], - 'mapman_name': cols[12], - 'mapman_desc': cols[13], - 'pheno_aragwas_id': cols[14], - 'pheno_desc1': cols[15], - 'pheno_desc2': cols[16], - 'pheno_desc3': cols[17], - 'pheno_ref': cols[18], - 'user_notes': cols[19], - } - nodes.append(doc) + headers = [] + expected_col_count = 0 + valid_node_types = ['gene', 'pheno'] + for file in self.config()['node_files']: + with open(file['file_path']) as fd: + csv_reader = csv.reader(fd, delimiter=',') + line_no = 0 + for row in csv_reader: + line_no += 1 + if len(row) <= 1 or row[0][0] == '#': + # comment / metadata + continue + + cols = [c.strip() for c in row] + if len(cols) != expected_col_count: + + if len(headers) == 0: + # this is the header row; set up the expected column count + expected_col_count = len(cols) + headers = cols + continue + + # otherwise, this row does not have the correct number of columns + n_cols = len(cols) + raise RuntimeError(f"{file['path']} line {line_no}: expected {expected_col_count} cols, found {n_cols}") + + _key = cols[0] + node_type = cols[1] + if node_type not in valid_node_types: + raise RuntimeError(f"{file['path']} line {line_no}: invalid node type: {node_type}") + + go_terms = [c.strip() for c in cols[10].split(',')] if len(cols[10]) else [] + + doc = { + '_key': _key, + 'node_type': node_type, + 'transcript': cols[2], + 'gene_symbol': cols[3], + 'gene_full_name': cols[4], + 'gene_model_type': cols[5], + 'tair_computational_desc': cols[6], + 'tair_curator_summary': cols[7], + 'tair_short_desc': cols[8], + 'go_descr': cols[9], + 'go_terms': go_terms, + 'mapman_bin': cols[11], + 
'mapman_name': cols[12], + 'mapman_desc': cols[13], + 'pheno_aragwas_id': cols[14], + 'pheno_desc1': cols[15], + 'pheno_desc2': cols[16], + 'pheno_desc3': cols[17], + 'pheno_ref': cols[18], + 'user_notes': cols[19], + } + nodes.append(doc) return {'nodes': nodes} @@ -166,44 +212,41 @@ def load_cluster_data(self): # index of nodes node_ix = {} - cluster_paths = self.config()['_CLUSTER_PATHS'] - for (cluster_label, path) in cluster_paths.items(): - with open(path) as fd: + for file in self.config()['cluster_files']: + cluster_label = file['prefix'] + with open(file['file_path']) as fd: csv_reader = csv.reader(fd, delimiter='\t') + line_no = 0 for row in csv_reader: - if len(row) > 1: - self._parse_cluster_row(row, cluster_label, node_ix) + line_no += 1 + if len(row) <= 1 or row[0][0] == '#': + # comment / metadata + continue + self._parse_cluster_row(row, cluster_label, node_ix) # gather a list of cluster IDs for each node - nodes = [] - for (key, cluster_data) in node_ix.items(): - clusters = [] - for (cluster_label, id_list) in cluster_data.items(): - clusters += [cluster_label + ":" + id for id in id_list] - - nodes += [{ - '_key': key, - 'clusters': clusters - }] + nodes = [{ + '_key': key, + 'clusters': cluster_data + } for (key, cluster_data) in node_ix.items()] return {'nodes': nodes} def _parse_cluster_row(self, row, cluster_label, node_ix): - # metadata rows start with '#' - if row[0] != '#': - # remove the 'Cluster' text - cluster_id = row[0].replace('Cluster','') - node_keys = row[1:] - for key in node_keys: - if key not in node_ix: - node_ix[key] = {} - if cluster_label not in node_ix[key]: - node_ix[key][cluster_label] = [] + # remove the 'Cluster' text + id = row[0].replace('Cluster','') + node_keys = row[1:] + + for key in node_keys: + if key not in node_ix: + node_ix[key] = [] - node_ix[key][cluster_label].append(cluster_id) + cluster_id = cluster_label + ':' + id + if cluster_id not in node_ix[key]: + node_ix[key].append(cluster_id) def 
save_dataset(self, dataset): @@ -237,3 +280,32 @@ def load_data(self): self.save_dataset(self.load_node_metadata()) self.save_dataset(self.load_cluster_data()) + + def check_data_delta(self): + edge_data = self.load_edges() + node_metadata = self.load_node_metadata() + clusters = self.load_cluster_data() + + self.check_deltas(edge_data=edge_data, node_metadata=node_metadata, cluster_data=clusters) + + def check_deltas(self, edge_data={}, node_metadata={}, cluster_data={}): + + edge_nodes = set([e['_key'] for e in edge_data['nodes']]) + node_metadata_nodes = set([e['_key'] for e in node_metadata['nodes']]) + cluster_nodes = set([e['_key'] for e in cluster_data['nodes']]) + all_nodes = edge_nodes.union(node_metadata_nodes).union(cluster_nodes) + + # check all nodes in cluster_data have node_metadata + clstr_no_node_md_set = cluster_nodes.difference(node_metadata_nodes) + if clstr_no_node_md_set: + print({'clusters with no node metadata': clstr_no_node_md_set}) + + # check all nodes in the edge_data have node_metadata + edge_no_node_md_set = edge_nodes.difference(node_metadata_nodes) + if edge_no_node_md_set: + print({'edges with no node metadata': edge_no_node_md_set}) + + # count all edges + print("Dataset contains " + str(len(edge_data['edges'])) + " edges") + # count all nodes + print("Dataset contains " + str(len(all_nodes)) + " nodes") diff --git a/schemas/deltaloader/delta_load_registry.yaml b/schemas/deltaloader/delta_load_registry.yaml index dc9c7f8..419b2a7 100644 --- a/schemas/deltaloader/delta_load_registry.yaml +++ b/schemas/deltaloader/delta_load_registry.yaml @@ -5,7 +5,7 @@ schema: "$schema": http://json-schema.org/draft-07/schema# title: delta_load_registry type: object - description: Don't touch this. It's for the exlusive use of delta loaders. + description: Don't touch this. It's for the exclusive use of delta loaders. 
properties: _key: type: string diff --git a/schemas/djornl/djornl_node.yaml b/schemas/djornl/djornl_node.yaml index f200931..a7b44a8 100644 --- a/schemas/djornl/djornl_node.yaml +++ b/schemas/djornl/djornl_node.yaml @@ -2,6 +2,10 @@ name: djornl_node type: vertex delta: false +indexes: + - type: hash + fields: ["clusters[*]"] + schema: "$schema": http://json-schema.org/draft-07/schema# title: Gene and Phenotype Vertices diff --git a/test/djornl/col_count_errors/manifest.yaml b/test/djornl/col_count_errors/manifest.yaml new file mode 100644 index 0000000..88ab96d --- /dev/null +++ b/test/djornl/col_count_errors/manifest.yaml @@ -0,0 +1,5 @@ +- data_type: edge + path: merged_edges-AMW-060820_AF.tsv + +- data_type: node + path: aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv diff --git a/test/djornl/empty_files/manifest.yaml b/test/djornl/empty_files/manifest.yaml new file mode 100644 index 0000000..7d42ff6 --- /dev/null +++ b/test/djornl/empty_files/manifest.yaml @@ -0,0 +1,17 @@ +- data_type: edge + path: merged_edges-AMW-060820_AF.tsv + +- data_type: node + path: aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv + +- data_type: cluster + prefix: markov_i2 + path: cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I2_named.tsv + +- data_type: cluster + prefix: markov_i4 + path: cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I4_named.tsv + +- data_type: cluster + prefix: markov_i6 + path: cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I6_named.tsv diff --git a/test/djornl/invalid_file/manifest.yaml b/test/djornl/invalid_file/manifest.yaml new file mode 100644 index 0000000..3a12de5 --- /dev/null +++ b/test/djornl/invalid_file/manifest.yaml @@ -0,0 +1,9 @@ +- data_type: edge + path: edges.tsv + +- data_type: node + path: nodes.csv + +- data_type: cluster + prefix: markov_i2 + path: clusters.tsv diff --git a/test/djornl/invalid_manifest/manifest.yaml b/test/djornl/invalid_manifest/manifest.yaml new 
file mode 100644 index 0000000..e7fa88e --- /dev/null +++ b/test/djornl/invalid_manifest/manifest.yaml @@ -0,0 +1,10 @@ +- data_type: edge + path: edges.tsv + +- data_type: node + +- data_type: cluster + path: clusters.tsv + +- data_type: ping-pong balls + path: where? \ No newline at end of file diff --git a/test/djornl/invalid_types/aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv b/test/djornl/invalid_types/aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv index af5fa6c..543dd99 100644 --- a/test/djornl/invalid_types/aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv +++ b/test/djornl/invalid_types/aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv @@ -1,4 +1,5 @@ node_id,node_type,transcript,gene_symbol,gene_full_name,gene_model_type,TAIR_Computational_description,TAIR_Curator_summary,TAIR_short_description,GO_descr,GO_terms,MapMan_bin,MapMan_name,MapMan_descr,pheno_AraGWAS_ID,pheno_descrip1,pheno_descrip2,pheno_descrip3,pheno_ref,UserNotes +# data_type: node As2,pheno,,,,,,,,,,,,,10.21958/phenotype:103,,bacterial disease resistance,The resistance exhibited by a plant or a group of plants (population) in response to the disease caused by a bacterial pathogen infection as compared to the susceptible and/or the reference plants of the same species. [GR:pj],"Atwell et. al, Nature 2010", As75,pheno,,,,,,,,,,,,,10.21958/phenotype:67,"Arsenic concentrations in leaves, grown in soil. Elemental analysis was performed with an ICP-MS (PerkinElmer). Sample normalized to calculated weights as described in Baxter et al., 2008",arsenic concentration,A mineral and ion content related trait (TO:0000465) which is the concentration of arsenic (CHEBI:22632) in some plant structure (PO:0009011). [GR:Karthik],"Atwell et. 
al, Nature 2010", AT1G01010,Monkey,AT1G01010.1,NTL10,NAC domain containing protein 1,protein_coding,NAC domain containing protein 1;(source:Araport11),,NAC domain containing protein 1,"DNA-binding transcription factor activity, DNA binding","GO:0003700, GO:0003677",15.5.17,.RNA biosynthesis.transcriptional regulation.transcription factor (NAC),transcription factor (NAC) (original description: pep chromosome:TAIR10:1:3631:5899:1 gene:AT1G01010 transcript:AT1G01010.1 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:NAC001 description:NAC domain-containing protein 1 [Source:UniProtKB/Swiss-Prot;Acc:Q0WV96]),,,,,, diff --git a/test/djornl/invalid_types/manifest.yaml b/test/djornl/invalid_types/manifest.yaml new file mode 100644 index 0000000..88ab96d --- /dev/null +++ b/test/djornl/invalid_types/manifest.yaml @@ -0,0 +1,5 @@ +- data_type: edge + path: merged_edges-AMW-060820_AF.tsv + +- data_type: node + path: aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv diff --git a/test/djornl/invalid_types/merged_edges-AMW-060820_AF.tsv b/test/djornl/invalid_types/merged_edges-AMW-060820_AF.tsv index f9857bd..a98f49f 100644 --- a/test/djornl/invalid_types/merged_edges-AMW-060820_AF.tsv +++ b/test/djornl/invalid_types/merged_edges-AMW-060820_AF.tsv @@ -1,3 +1,4 @@ +# data_type: edge node1 node2 edge edge_descrip layer_descrip As2 AT1G01020 8.422046084731258 AraGWAS-Association_score AraGWAS-Some-Old-Rubbish-I-Made-Up As2 AT1G01040 5.422046084731258 AraGWAS-Association_score AraGWAS-Phenotype_Associations diff --git a/test/djornl/missing_files/manifest.yaml b/test/djornl/missing_files/manifest.yaml new file mode 100644 index 0000000..3a12de5 --- /dev/null +++ b/test/djornl/missing_files/manifest.yaml @@ -0,0 +1,9 @@ +- data_type: edge + path: edges.tsv + +- data_type: node + path: nodes.csv + +- data_type: cluster + prefix: markov_i2 + path: clusters.tsv diff --git 
a/test/djornl/test_data/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I2_named.tsv b/test/djornl/test_data/I2_named.tsv similarity index 90% rename from test/djornl/test_data/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I2_named.tsv rename to test/djornl/test_data/I2_named.tsv index 585e0a5..46f4498 100644 --- a/test/djornl/test_data/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I2_named.tsv +++ b/test/djornl/test_data/I2_named.tsv @@ -1,3 +1,4 @@ +# data_type: cluster # prefix: markov_i2 # title: Markov clustering, inflation = 2 Cluster1 AT1G01010 AT1G01030 AT1G01040 diff --git a/test/djornl/test_data/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I4_named.tsv b/test/djornl/test_data/I4_named.tsv similarity index 74% rename from test/djornl/test_data/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I4_named.tsv rename to test/djornl/test_data/I4_named.tsv index 3cb18ea..147831e 100644 --- a/test/djornl/test_data/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I4_named.tsv +++ b/test/djornl/test_data/I4_named.tsv @@ -1,3 +1,3 @@ # prefix: markov_i4 # title: Markov clustering, inflation = 4 - +# data_type: cluster diff --git a/test/djornl/test_data/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I6_named.tsv b/test/djornl/test_data/I6_named.tsv similarity index 88% rename from test/djornl/test_data/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I6_named.tsv rename to test/djornl/test_data/I6_named.tsv index d6a1b07..b4680eb 100644 --- a/test/djornl/test_data/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I6_named.tsv +++ b/test/djornl/test_data/I6_named.tsv @@ -1,3 +1,4 @@ +# data_type: cluster # prefix: markov_i6 # title: Markov clustering, inflation = 6 Cluster1 AT1G01040 AT1G01090 diff --git a/test/djornl/test_data/merged_edges-AMW-060820_AF.tsv 
b/test/djornl/test_data/edges.tsv similarity index 100% rename from test/djornl/test_data/merged_edges-AMW-060820_AF.tsv rename to test/djornl/test_data/edges.tsv diff --git a/test/djornl/test_data/manifest.yaml b/test/djornl/test_data/manifest.yaml new file mode 100644 index 0000000..2eb28e3 --- /dev/null +++ b/test/djornl/test_data/manifest.yaml @@ -0,0 +1,19 @@ +- data_type: edge + path: edges.tsv + date_created: 2020-12-25 + +- data_type: node + path: nodes.csv + date_created: 2019-01-01 + +- data_type: cluster + prefix: markov_i2 + path: I2_named.tsv + +- data_type: cluster + prefix: markov_i4 + path: I4_named.tsv + +- data_type: cluster + prefix: markov_i6 + path: I6_named.tsv diff --git a/test/djornl/test_data/aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv b/test/djornl/test_data/nodes.csv similarity index 99% rename from test/djornl/test_data/aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv rename to test/djornl/test_data/nodes.csv index 5bc0e1d..a032142 100644 --- a/test/djornl/test_data/aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv +++ b/test/djornl/test_data/nodes.csv @@ -1,3 +1,4 @@ +# data_type: node node_id,node_type,transcript,gene_symbol,gene_full_name,gene_model_type,TAIR_Computational_description,TAIR_Curator_summary,TAIR_short_description,GO_descr,GO_terms,MapMan_bin,MapMan_name,MapMan_descr,pheno_AraGWAS_ID,pheno_descrip1,pheno_descrip2,pheno_descrip3,pheno_ref,UserNotes As2,pheno,,,,,,,,,,,,,10.21958/phenotype:103,,bacterial disease resistance,The resistance exhibited by a plant or a group of plants (population) in response to the disease caused by a bacterial pathogen infection as compared to the susceptible and/or the reference plants of the same species. [GR:pj],"Atwell et. al, Nature 2010", As75,pheno,,,,,,,,,,,,,10.21958/phenotype:67,"Arsenic concentrations in leaves, grown in soil. Elemental analysis was performed with an ICP-MS (PerkinElmer). 
Sample normalized to calculated weights as described in Baxter et al., 2008",arsenic concentration,A mineral and ion content related trait (TO:0000465) which is the concentration of arsenic (CHEBI:22632) in some plant structure (PO:0009011). [GR:Karthik],"Atwell et. al, Nature 2010", diff --git a/test/stored_queries/test_djornl.py b/test/stored_queries/test_djornl.py index e61835b..b6468ba 100644 --- a/test/stored_queries/test_djornl.py +++ b/test/stored_queries/test_djornl.py @@ -94,23 +94,23 @@ def check_expected_results(self, description, response, expected): def test_fetch_all(self): - # expect all the nodes from load_node_metadata and all the edges from load_edges - expected = { - "nodes": [n["_key"] for n in self.json_data['load_node_metadata']['nodes']], - "edges": [ { - "_to": e["_to"], - "_from": e["_from"], - "score": e["score"], - "edge_type": e["edge_type"] } for e in self.json_data['load_edges']['edges'] - ] - } - + response = self.submit_query('djornl_fetch_all') self.check_expected_results( "djornl_fetch_all", - self.submit_query('djornl_fetch_all'), + response, self.json_data['fetch_all'] ) + # ensure that all the cluster data is returned OK + node_data = response['results'][0]['nodes'] + nodes_with_clusters = [json.dumps({ + '_key': n['_key'], + 'clusters': n['clusters'] + }) for n in node_data if 'clusters' in n] + self.assertEqual( + set(nodes_with_clusters), + set([json.dumps(this) for this in self.json_data['load_cluster_data']['nodes']]) + ) # indexing schema in results.json # self.json_data[query_name][param_name][param_value]["distance"][distance_param] diff --git a/test/stored_queries/test_djornl_parser.py b/test/stored_queries/test_djornl_parser.py index 8d38761..347e17d 100644 --- a/test/stored_queries/test_djornl_parser.py +++ b/test/stored_queries/test_djornl_parser.py @@ -10,6 +10,7 @@ import requests import os import contextlib +from jsonschema.exceptions import ValidationError from importers.djornl.parser import DJORNL_Parser @@ 
-40,69 +41,86 @@ def init_parser_with_path(self, root_path): return parser - def test_load_empty_files(self): - """ test loading files containing no data """ + def test_load_no_manifest(self): + """ test loading when the manifest does not exist """ + RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'no_manifest') + err_str = 'No manifest file found at ' + os.path.join(RES_ROOT_DATA_PATH, 'manifest.yaml') + with self.assertRaisesRegex(RuntimeError, err_str): + self.init_parser_with_path(RES_ROOT_DATA_PATH) - # path: test/djornl/empty_files - RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'empty_files') - parser = self.init_parser_with_path(RES_ROOT_DATA_PATH) - self.assertEqual(parser.load_edges(), {"nodes": [], "edges": []}) - self.assertEqual(parser.load_node_metadata(), {"nodes": []}) - self.assertEqual(parser.load_cluster_data(), {"nodes": []}) + def test_load_invalid_manifest(self): + """ test an invalid manifest file """ + RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'invalid_manifest') + err_str = "The manifest file failed validation with the following errors:" + with self.assertRaisesRegex(RuntimeError, err_str): + self.init_parser_with_path(RES_ROOT_DATA_PATH) + + + def test_load_invalid_file(self): + """ test loading when a file specified in the manifest is a directory """ + + RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'invalid_file') + + # edges: directory, not a file + err_str = os.path.join(RES_ROOT_DATA_PATH, "edges.tsv") + ": not a file" + with self.assertRaisesRegex(RuntimeError, err_str): + self.init_parser_with_path(RES_ROOT_DATA_PATH) def test_load_missing_files(self): """ test loading when files cannot be found """ - # this dir does not contain the correct file structure - # path: test/djornl/empty_files/cluster_data - RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'empty_files', 'cluster_data') - parser = self.init_parser_with_path(RES_ROOT_DATA_PATH) + RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 
'djornl', 'missing_files') + # not found + err_str = os.path.join(RES_ROOT_DATA_PATH, "edges.tsv") + ': file does not exist' + with self.assertRaisesRegex(RuntimeError, err_str): + self.init_parser_with_path(RES_ROOT_DATA_PATH) - err_str = "No such file or directory: '" + RES_ROOT_DATA_PATH - with self.assertRaisesRegex(FileNotFoundError, err_str): - parser.load_edges() - with self.assertRaisesRegex(FileNotFoundError, err_str): - parser.load_node_metadata() + def test_load_empty_files(self): + """ test loading files containing no data """ - with self.assertRaisesRegex(FileNotFoundError, err_str): - parser.load_cluster_data() + # path: test/djornl/empty_files + RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'empty_files') + parser = self.init_parser_with_path(RES_ROOT_DATA_PATH) + self.assertEqual(parser.load_edges(), {"nodes": [], "edges": []}) + self.assertEqual(parser.load_node_metadata(), {"nodes": []}) + self.assertEqual(parser.load_cluster_data(), {"nodes": []}) - def test_load_invalid_types(self): - """ test file format errors """ + def test_load_col_count_errors(self): + """ test files with invalid numbers of columns """ - # path: test/djornl/invalid_types - RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'invalid_types') + # path: test/djornl/col_count_errors + RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'col_count_errors') parser = self.init_parser_with_path(RES_ROOT_DATA_PATH) # invalid edge type - edge_err_msg = 'line 2: invalid edge type: AraGWAS-Some-Old-Rubbish-I-Made-Up' + edge_err_msg = 'line 6: expected 5 cols, found 3' with self.assertRaisesRegex(RuntimeError, edge_err_msg): parser.load_edges() # invalid node type - node_err_msg = 'line 4: invalid node type: Monkey' + node_err_msg = 'line 3: expected 20 cols, found 22' with self.assertRaisesRegex(RuntimeError, node_err_msg): parser.load_node_metadata() - def test_load_col_count_errors(self): - """ test files with invalid numbers of columns """ + def 
test_load_invalid_types(self): + """ test file format errors """ - # path: test/djornl/col_count_errors - RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'col_count_errors') + # path: test/djornl/invalid_types + RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'invalid_types') parser = self.init_parser_with_path(RES_ROOT_DATA_PATH) # invalid edge type - edge_err_msg = 'line 6: expected 5 cols, found 3' + edge_err_msg = 'merged_edges-AMW-060820_AF.tsv line 3: invalid edge type: AraGWAS-Some-Old-Rubbish-I-Made-Up' with self.assertRaisesRegex(RuntimeError, edge_err_msg): parser.load_edges() # invalid node type - node_err_msg = 'line 3: expected 20 cols, found 22' + node_err_msg = 'aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv line 5: invalid node type: Monkey' with self.assertRaisesRegex(RuntimeError, node_err_msg): parser.load_node_metadata() @@ -112,8 +130,6 @@ def test_load_valid_edge_data(self): RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'test_data') parser = self.init_parser_with_path(RES_ROOT_DATA_PATH) - self.maxDiff = None - edge_data = parser.load_edges() self.assertEqual( edge_data, @@ -142,3 +158,10 @@ def test_load_valid_cluster_data(self): cluster_data, self.json_data["load_cluster_data"] ) + + def test_load_valid_node_metadata(self): + + RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'test_data') + parser = self.init_parser_with_path(RES_ROOT_DATA_PATH) + + parser.check_data_delta() From 9892d70ab982b09b3489a1f3329222beaee156e1 Mon Sep 17 00:00:00 2001 From: ialarmedalien Date: Mon, 20 Jul 2020 15:41:23 -0700 Subject: [PATCH 3/4] fixing LGTM errors --- importers/djornl/parser.py | 5 ----- test/stored_queries/test_djornl.py | 4 +--- test/stored_queries/test_djornl_parser.py | 8 +------- 3 files changed, 2 insertions(+), 15 deletions(-) diff --git a/importers/djornl/parser.py b/importers/djornl/parser.py index ebf7ed1..5295fdb 100644 --- a/importers/djornl/parser.py +++ b/importers/djornl/parser.py @@ -9,11 +9,7 @@ 
import os import csv import yaml -import json -import jsonschema - from jsonschema.validators import Draft7Validator -from jsonschema.exceptions import ValidationError import importers.utils.config as config @@ -54,7 +50,6 @@ def _configure(self): validator = Draft7Validator(manifest_schema) if not validator.is_valid(manifest): - error_list = [] raise RuntimeError( "The manifest file failed validation with the following errors:\n" + "\n".join(e.message for e in sorted(validator.iter_errors(manifest), key=str)) diff --git a/test/stored_queries/test_djornl.py b/test/stored_queries/test_djornl.py index b6468ba..7c502a3 100644 --- a/test/stored_queries/test_djornl.py +++ b/test/stored_queries/test_djornl.py @@ -6,10 +6,8 @@ import unittest import requests import os -import glob -import yaml -from test.helpers import get_config, assert_subset, modified_environ +from test.helpers import get_config, modified_environ from test.stored_queries.helpers import create_test_docs from importers.djornl.parser import DJORNL_Parser diff --git a/test/stored_queries/test_djornl_parser.py b/test/stored_queries/test_djornl_parser.py index 347e17d..91dfdb5 100644 --- a/test/stored_queries/test_djornl_parser.py +++ b/test/stored_queries/test_djornl_parser.py @@ -10,8 +10,6 @@ import requests import os import contextlib -from jsonschema.exceptions import ValidationError - from importers.djornl.parser import DJORNL_Parser from test.helpers import get_config, assert_subset, modified_environ @@ -159,9 +157,5 @@ def test_load_valid_cluster_data(self): self.json_data["load_cluster_data"] ) - def test_load_valid_node_metadata(self): - - RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'test_data') - parser = self.init_parser_with_path(RES_ROOT_DATA_PATH) - parser.check_data_delta() + From e293889c7828ae444e7fce4ca8fdcb700501f37c Mon Sep 17 00:00:00 2001 From: ialarmedalien Date: Mon, 20 Jul 2020 17:37:54 -0700 Subject: [PATCH 4/4] Adding fake file to get dir to show up in git --- 
test/djornl/invalid_file/edges.tsv/touch | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 test/djornl/invalid_file/edges.tsv/touch diff --git a/test/djornl/invalid_file/edges.tsv/touch b/test/djornl/invalid_file/edges.tsv/touch new file mode 100644 index 0000000..e69de29