From 08bfb5818f2e1d9fc6e1fe7e2c442bcfbe549286 Mon Sep 17 00:00:00 2001
From: ialarmedalien <ialarmedalien@gmail.com>
Date: Wed, 15 Jul 2020 20:59:21 -0700
Subject: [PATCH 1/4] Reorganising cluster data to be a single field with an
 array of clusters in the form <cluster_name>:<cluster_id>. The current set of
 clusters have been renamed from 'cluster_i2' to 'markov_i2' as they were
 created using Markov clustering with inflation set to 2.

---
 Makefile                                      |   2 +
 importers/djornl/parser.py                    |  47 +-
 schemas/djornl/djornl_node.yaml               |  23 +-
 .../djornl/djornl_fetch_clusters.yaml         |  26 +-
 test/djornl/results.json                      | 457 ++++++++++--------
 ...p10percent_anno_AF_082919.abc.I2_named.tsv |   2 +
 ...p10percent_anno_AF_082919.abc.I4_named.tsv |   2 +
 ...p10percent_anno_AF_082919.abc.I6_named.tsv |   2 +
 test/stored_queries/test_djornl.py            |  92 ++--
 test/stored_queries/test_djornl_parser.py     |   1 -
 10 files changed, 339 insertions(+), 315 deletions(-)

diff --git a/Makefile b/Makefile
index c3baa81..8c1c446 100644
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,6 @@
 .PHONY: test
 
 test:
+	docker-compose down
 	docker-compose run spec sh /app/test/run_tests.sh
+	docker-compose down
\ No newline at end of file
diff --git a/importers/djornl/parser.py b/importers/djornl/parser.py
index 9eb5c87..c6051e8 100644
--- a/importers/djornl/parser.py
+++ b/importers/djornl/parser.py
@@ -43,15 +43,15 @@ def _configure(self):
 
         _CLUSTER_BASE = os.path.join(configuration['ROOT_DATA_PATH'], 'cluster_data')
         configuration['_CLUSTER_PATHS'] = {
-            'cluster_I2': os.path.join(
+            'markov_i2': os.path.join(
                 _CLUSTER_BASE,
                 'out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I2_named.tsv'
             ),
-            'cluster_I4': os.path.join(
+            'markov_i4': os.path.join(
                 _CLUSTER_BASE,
                 'out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I4_named.tsv'
             ),
-            'cluster_I6': os.path.join(
+            'markov_i6': os.path.join(
                 _CLUSTER_BASE,
                 'out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I6_named.tsv'
             ),
@@ -163,24 +163,49 @@ def load_node_metadata(self):
 
     def load_cluster_data(self):
         """Annotate genes with cluster ID fields."""
-        nodes = []
+
+        # index of nodes: node key -> {cluster label: [cluster IDs]}
+        node_ix = {}
         cluster_paths = self.config()['_CLUSTER_PATHS']
         for (cluster_label, path) in cluster_paths.items():
             with open(path) as fd:
                 csv_reader = csv.reader(fd, delimiter='\t')
                 for row in csv_reader:
                     if len(row) > 1:
-                        # remove the 'Cluster' text
-                        cluster_id = row[0].replace('Cluster','')
-                        gene_keys = row[1:]
-                        nodes += [
-                            {'_key': key, cluster_label: int(cluster_id)}
-                            for key in gene_keys
-                        ]
+                        self._parse_cluster_row(row, cluster_label, node_ix)
+
+
+        # build each node's list of "<cluster_label>:<cluster_id>" strings
+        nodes = []
+        for (key, cluster_data) in node_ix.items():
+            clusters = []
+            for (cluster_label, id_list) in cluster_data.items():
+                clusters += [cluster_label + ":" + id for id in id_list]
+
+            nodes += [{
+                '_key': key,
+                'clusters': clusters
+            }]
 
         return {'nodes': nodes}
 
 
+    def _parse_cluster_row(self, row, cluster_label, node_ix):
+        """Add the node-to-cluster assignments from one TSV row to node_ix.
+
+        row[0] is the cluster name and row[1:] are the keys of its nodes;
+        node_ix maps node key -> {cluster_label: [cluster IDs]} and is
+        updated in place. Rows whose first cell starts with '#' are ignored.
+        """
+        # metadata rows start with '#', e.g. '# prefix: markov_i2'
+        if row[0].startswith('#'):
+            return
+        # remove the 'Cluster' text, leaving the ID as a string
+        cluster_id = row[0].replace('Cluster', '')
+        for key in row[1:]:
+            node_ix.setdefault(key, {}).setdefault(cluster_label, []).append(cluster_id)
+
+
     def save_dataset(self, dataset):
 
         if 'nodes' in dataset and len(dataset['nodes']) > 0:
diff --git a/schemas/djornl/djornl_node.yaml b/schemas/djornl/djornl_node.yaml
index 9248f1c..f200931 100644
--- a/schemas/djornl/djornl_node.yaml
+++ b/schemas/djornl/djornl_node.yaml
@@ -13,21 +13,14 @@ schema:
       type: string
       title: Key
       examples: ["AT1G01010"]
-    cluster_I2:
-      type: integer
-      title: Cluster 2 ID
-      description: Iterative random forest cluster group ID
-      examples: [1]
-    cluster_I4:
-      type: integer
-      title: Cluster 4 ID
-      description: Iterative random forest cluster group ID
-      examples: [13]
-    cluster_I6:
-      type: integer
-      title: Cluster 6 ID
-      description: Iterative random forest cluster group ID
-      examples: [27]
+    clusters:
+      type: array
+      title: Clusters
+      description: Clusters to which the node has been assigned
+      items:
+        type: string
+#        pattern: '^\w+:\d+$'  # NOTE(review): disabled; single-quoted as "\w" is an invalid escape in double-quoted YAML
+      examples: [["markov_i2:1", "markov_i4:5"], ["markov_i6:3"]]
     node_type:
       type: string
       title: Node type
diff --git a/stored_queries/djornl/djornl_fetch_clusters.yaml b/stored_queries/djornl/djornl_fetch_clusters.yaml
index 4c6b8c5..1fadca3 100644
--- a/stored_queries/djornl/djornl_fetch_clusters.yaml
+++ b/stored_queries/djornl/djornl_fetch_clusters.yaml
@@ -2,25 +2,13 @@ name: djornl_fetch_clusters
 description: Fetch all nodes that are members of the specified cluster(s), and the edges and nodes within the specified distance (number of hops) of those nodes.
 params:
   type: object
+  required: [cluster_ids]
   properties:
-    cluster_i2_ids:
-      title: Cluster I2 IDs
-      description: Cluster I2 IDs to locate
-      items: {type: integer}
-      default: []
-      examples: [[1], [3, 5]]
-    cluster_i4_ids:
-      title: Cluster I4 IDs
-      description: Cluster I4 IDs to locate
-      items: {type: integer}
-      examples: [[2], [4, 6]]
-      default: []
-    cluster_i6_ids:
-      title: Cluster I6 IDs
-      description: Cluster I6 IDs to locate
-      items: {type: integer}
-      examples: [[666], [999, 333]]
-      default: []
+    cluster_ids:
+      title: Cluster IDs
+      description: Cluster IDs, in the form "clustering_system_name:cluster_id"
+      items: {type: string}
+      examples: [['markov_i2:5', 'markov_i6:2'],['markov_i6:1']]
     distance:
       type: integer
       title: Traversal Distance
@@ -31,7 +19,7 @@ params:
 query: |
   LET node_ids = (
     FOR n IN djornl_node
-      FILTER n.cluster_I2 IN @cluster_i2_ids OR n.cluster_I4 IN @cluster_i4_ids OR n.cluster_I6 IN @cluster_i6_ids
+      FILTER n.clusters ANY IN @cluster_ids
       FOR node IN 0..@distance ANY n djornl_edge
         OPTIONS {bfs: true, uniqueVertices: "global"}
         RETURN DISTINCT node._id
diff --git a/test/djornl/results.json b/test/djornl/results.json
index a844c2c..7fd3a4d 100644
--- a/test/djornl/results.json
+++ b/test/djornl/results.json
@@ -27,21 +27,15 @@
   },
   "load_cluster_data": {
     "nodes": [
-      {"_key": "AT1G01010", "cluster_I2": 1},
-      {"_key": "AT1G01030", "cluster_I2": 1},
-      {"_key": "AT1G01040", "cluster_I2": 1},
-      {"_key": "AT1G01050", "cluster_I2": 2},
-      {"_key": "AT1G01060", "cluster_I2": 2},
-      {"_key": "AT1G01070", "cluster_I2": 2},
-      {"_key": "AT1G01080", "cluster_I2": 3},
-      {"_key": "AT1G01090", "cluster_I2": 3},
-      {"_key": "AT1G01020", "cluster_I2": 5},
-      {"_key": "AT1G01040", "cluster_I6": 1},
-      {"_key": "AT1G01090", "cluster_I6": 1},
-      {"_key": "AT1G01070", "cluster_I6": 2},
-      {"_key": "AT1G01010", "cluster_I6": 3},
-      {"_key": "AT1G01020", "cluster_I6": 3},
-      {"_key": "AT1G01030", "cluster_I6": 3}
+      {"_key": "AT1G01010", "clusters": ["markov_i2:1", "markov_i6:3"]},
+      {"_key": "AT1G01030", "clusters": ["markov_i2:1", "markov_i6:3"]},
+      {"_key": "AT1G01040", "clusters": ["markov_i2:1", "markov_i6:1"]},
+      {"_key": "AT1G01050", "clusters": ["markov_i2:2"]},
+      {"_key": "AT1G01060", "clusters": ["markov_i2:2"]},
+      {"_key": "AT1G01070", "clusters": ["markov_i2:2", "markov_i6:2"]},
+      {"_key": "AT1G01080", "clusters": ["markov_i2:3"]},
+      {"_key": "AT1G01090", "clusters": ["markov_i2:3", "markov_i6:1"]},
+      {"_key": "AT1G01020", "clusters": ["markov_i2:5", "markov_i6:3"]}
     ]
   },
   "load_node_metadata": {
@@ -93,220 +87,265 @@
     ]
   },
   "fetch_genes": {
-    "AT1G01010": {
-      "0": {
-        "nodes": ["AT1G01010"],
-        "edges": []
+    "keys": {
+      "Mary Poppins": {
+        "distance": {
+          "0": {"nodes": [], "edges": []},
+          "1": {"nodes": [], "edges": []},
+          "5": {"nodes": [], "edges": []}
+        }
       },
-      "1": {
-        "nodes": [
-          "AT1G01010",
-          "AT1G01020",
-          "AT1G01030",
-          "AT1G01040"
-        ],
-        "edges": [
-          "AT1G01010__AT1G01020__ppi_hithru__2.3",
-          "AT1G01010__AT1G01030__ppi_hithru__2.4",
-          "AT1G01010__AT1G01040__domain_co_occur__2.5",
-          "AT1G01010__AT1G01040__ppi_liter__170.5"
-        ]
+      "AT1G01010": {
+        "distance": {
+          "0": {
+            "nodes": ["AT1G01010"],
+            "edges": []
+          },
+          "1": {
+            "nodes": [
+              "AT1G01010",
+              "AT1G01020",
+              "AT1G01030",
+              "AT1G01040"
+            ],
+            "edges": [
+              "AT1G01010__AT1G01020__ppi_hithru__2.3",
+              "AT1G01010__AT1G01030__ppi_hithru__2.4",
+              "AT1G01010__AT1G01040__domain_co_occur__2.5",
+              "AT1G01010__AT1G01040__ppi_liter__170.5"
+            ]
+          },
+          "5": {
+            "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01030", "AT1G01040", "AT1G01050", "AT1G01060"],
+            "edges": [
+              "As2__AT1G01020__pheno_assn__8.4",
+              "As2__AT1G01040__pheno_assn__5.4",
+              "As75__AT1G01020__pheno_assn__39.9",
+              "AT1G01010__AT1G01020__ppi_hithru__2.3",
+              "AT1G01010__AT1G01030__ppi_hithru__2.4",
+              "AT1G01010__AT1G01040__domain_co_occur__2.5",
+              "AT1G01010__AT1G01040__ppi_liter__170.5",
+              "AT1G01030__AT1G01050__gene_coexpr__2.6",
+              "AT1G01050__AT1G01060__ppi_liter__2.7"
+            ]
+          }
+        }
       },
-      "5": {
-        "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01030", "AT1G01040", "AT1G01050", "AT1G01060"],
-        "edges": [
-          "As2__AT1G01020__pheno_assn__8.4",
-          "As2__AT1G01040__pheno_assn__5.4",
-          "As75__AT1G01020__pheno_assn__39.9",
-          "AT1G01010__AT1G01020__ppi_hithru__2.3",
-          "AT1G01010__AT1G01030__ppi_hithru__2.4",
-          "AT1G01010__AT1G01040__domain_co_occur__2.5",
-          "AT1G01010__AT1G01040__ppi_liter__170.5",
-          "AT1G01030__AT1G01050__gene_coexpr__2.6",
-          "AT1G01050__AT1G01060__ppi_liter__2.7"
-        ]
-      }
-    },
-    "AT1G01020__AT1G01070": {
-      "0": {
-        "nodes": ["AT1G01020", "AT1G01070"],
-        "edges": []
-      },
-      "1": {
-        "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01070"],
-        "edges": [
-          "As2__AT1G01020__pheno_assn__8.4",
-          "As75__AT1G01020__pheno_assn__39.9",
-          "AT1G01010__AT1G01020__ppi_hithru__2.3"
-        ]
-      },
-      "5": {
-        "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01030", "AT1G01040", "AT1G01050", "AT1G01060", "AT1G01070"],
-        "edges": [
-          "As2__AT1G01020__pheno_assn__8.4",
-          "As2__AT1G01040__pheno_assn__5.4",
-          "As75__AT1G01020__pheno_assn__39.9",
-          "AT1G01010__AT1G01020__ppi_hithru__2.3",
-          "AT1G01010__AT1G01030__ppi_hithru__2.4",
-          "AT1G01010__AT1G01040__domain_co_occur__2.5",
-          "AT1G01010__AT1G01040__ppi_liter__170.5",
-          "AT1G01030__AT1G01050__gene_coexpr__2.6",
-          "AT1G01050__AT1G01060__ppi_liter__2.7"
-        ]
+      "AT1G01020__AT1G01070": {
+        "distance": {
+          "0": {
+            "nodes": ["AT1G01020", "AT1G01070"],
+            "edges": []
+          },
+          "1": {
+            "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01070"],
+            "edges": [
+              "As2__AT1G01020__pheno_assn__8.4",
+              "As75__AT1G01020__pheno_assn__39.9",
+              "AT1G01010__AT1G01020__ppi_hithru__2.3"
+            ]
+          },
+          "5": {
+            "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01030", "AT1G01040", "AT1G01050", "AT1G01060", "AT1G01070"],
+            "edges": [
+              "As2__AT1G01020__pheno_assn__8.4",
+              "As2__AT1G01040__pheno_assn__5.4",
+              "As75__AT1G01020__pheno_assn__39.9",
+              "AT1G01010__AT1G01020__ppi_hithru__2.3",
+              "AT1G01010__AT1G01030__ppi_hithru__2.4",
+              "AT1G01010__AT1G01040__domain_co_occur__2.5",
+              "AT1G01010__AT1G01040__ppi_liter__170.5",
+              "AT1G01030__AT1G01050__gene_coexpr__2.6",
+              "AT1G01050__AT1G01060__ppi_liter__2.7"
+            ]
+          }
+        }
       }
     }
   },
   "fetch_phenotypes": {
-    "As2": {
-      "0": {
-        "nodes": ["As2"],
-        "edges": []
-      },
-      "1": {
-        "nodes": ["As2", "AT1G01020", "AT1G01040"],
-        "edges": [
-          "As2__AT1G01020__pheno_assn__8.4",
-          "As2__AT1G01040__pheno_assn__5.4"
-        ]
-      },
-      "5": {
-        "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01030", "AT1G01040", "AT1G01050", "AT1G01060"],
-        "edges": [
-          "As2__AT1G01020__pheno_assn__8.4",
-          "As2__AT1G01040__pheno_assn__5.4",
-          "As75__AT1G01020__pheno_assn__39.9",
-          "AT1G01010__AT1G01020__ppi_hithru__2.3",
-          "AT1G01010__AT1G01030__ppi_hithru__2.4",
-          "AT1G01010__AT1G01040__domain_co_occur__2.5",
-          "AT1G01010__AT1G01040__ppi_liter__170.5",
-          "AT1G01030__AT1G01050__gene_coexpr__2.6",
-          "AT1G01050__AT1G01060__ppi_liter__2.7"
-        ]
-      }
-    },
-    "As2__Na23": {
-      "0": {
-        "nodes": ["As2", "Na23"],
-        "edges": []
+    "keys": {
+      "Mary Poppins": {
+        "distance": {
+          "0": {"nodes": [], "edges": []},
+          "1": {"nodes": [], "edges": []},
+          "5": {"nodes": [], "edges": []}
+        }
       },
-      "1": {
-        "nodes": ["As2", "Na23", "AT1G01020", "AT1G01040"],
-        "edges": [
-          "As2__AT1G01020__pheno_assn__8.4",
-          "As2__AT1G01040__pheno_assn__5.4"
-        ]
+      "As2": {
+        "distance": {
+          "0": {
+            "nodes": ["As2"],
+            "edges": []
+          },
+          "1": {
+            "nodes": ["As2", "AT1G01020", "AT1G01040"],
+            "edges": [
+              "As2__AT1G01020__pheno_assn__8.4",
+              "As2__AT1G01040__pheno_assn__5.4"
+            ]
+          },
+          "5": {
+            "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01030", "AT1G01040", "AT1G01050", "AT1G01060"],
+            "edges": [
+              "As2__AT1G01020__pheno_assn__8.4",
+              "As2__AT1G01040__pheno_assn__5.4",
+              "As75__AT1G01020__pheno_assn__39.9",
+              "AT1G01010__AT1G01020__ppi_hithru__2.3",
+              "AT1G01010__AT1G01030__ppi_hithru__2.4",
+              "AT1G01010__AT1G01040__domain_co_occur__2.5",
+              "AT1G01010__AT1G01040__ppi_liter__170.5",
+              "AT1G01030__AT1G01050__gene_coexpr__2.6",
+              "AT1G01050__AT1G01060__ppi_liter__2.7"
+            ]
+          }
+        }
       },
-      "5": {
-        "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01030", "AT1G01040", "AT1G01050", "AT1G01060", "Na23"],
-        "edges": [
-          "As2__AT1G01020__pheno_assn__8.4",
-          "As2__AT1G01040__pheno_assn__5.4",
-          "As75__AT1G01020__pheno_assn__39.9",
-          "AT1G01010__AT1G01020__ppi_hithru__2.3",
-          "AT1G01010__AT1G01030__ppi_hithru__2.4",
-          "AT1G01010__AT1G01040__domain_co_occur__2.5",
-          "AT1G01010__AT1G01040__ppi_liter__170.5",
-          "AT1G01030__AT1G01050__gene_coexpr__2.6",
-          "AT1G01050__AT1G01060__ppi_liter__2.7"
-        ]
+      "As2__Na23": {
+        "distance": {
+          "0": {
+            "nodes": ["As2", "Na23"],
+            "edges": []
+          },
+          "1": {
+            "nodes": ["As2", "Na23", "AT1G01020", "AT1G01040"],
+            "edges": [
+              "As2__AT1G01020__pheno_assn__8.4",
+              "As2__AT1G01040__pheno_assn__5.4"
+            ]
+          },
+          "5": {
+            "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01030", "AT1G01040", "AT1G01050", "AT1G01060", "Na23"],
+            "edges": [
+              "As2__AT1G01020__pheno_assn__8.4",
+              "As2__AT1G01040__pheno_assn__5.4",
+              "As75__AT1G01020__pheno_assn__39.9",
+              "AT1G01010__AT1G01020__ppi_hithru__2.3",
+              "AT1G01010__AT1G01030__ppi_hithru__2.4",
+              "AT1G01010__AT1G01040__domain_co_occur__2.5",
+              "AT1G01010__AT1G01040__ppi_liter__170.5",
+              "AT1G01030__AT1G01050__gene_coexpr__2.6",
+              "AT1G01050__AT1G01060__ppi_liter__2.7"
+            ]
+          }
+        }
       }
     }
   },
   "search_nodes": {
-    "Mary Poppins": {
-      "0": {"nodes": [], "edges": []},
-      "1": {"nodes": [], "edges": []},
-      "5": {"nodes": [], "edges": []}
-    },
-    "GO:0005515": {
-      "0": {
-        "nodes": ["AT1G01040", "AT1G01090"],
-        "edges": []
-      },
-      "1": {
-        "nodes": ["As2", "AT1G01010", "AT1G01040", "AT1G01080", "AT1G01090"],
-        "edges": [
-          "As2__AT1G01040__pheno_assn__5.4",
-          "AT1G01010__AT1G01040__domain_co_occur__2.5",
-          "AT1G01010__AT1G01040__ppi_liter__170.5",
-          "AT1G01080__AT1G01090__ppi_liter__2.8"
-        ]
+    "search_text": {
+      "Mary Poppins": {
+        "distance": {
+          "0": {"nodes": [], "edges": []},
+          "1": {"nodes": [], "edges": []},
+          "5": {"nodes": [], "edges": []}
+        }
       },
-      "5": {
-        "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01030", "AT1G01040", "AT1G01050", "AT1G01060", "AT1G01080",  "AT1G01090"],
-        "edges": [
-          "As2__AT1G01020__pheno_assn__8.4",
-          "As2__AT1G01040__pheno_assn__5.4",
-          "As75__AT1G01020__pheno_assn__39.9",
-          "AT1G01010__AT1G01020__ppi_hithru__2.3",
-          "AT1G01010__AT1G01030__ppi_hithru__2.4",
-          "AT1G01010__AT1G01040__domain_co_occur__2.5",
-          "AT1G01010__AT1G01040__ppi_liter__170.5",
-          "AT1G01030__AT1G01050__gene_coexpr__2.6",
-          "AT1G01050__AT1G01060__ppi_liter__2.7",
-          "AT1G01080__AT1G01090__ppi_liter__2.8"
-        ]
+      "GO:0005515": {
+        "distance": {
+          "0": {
+            "nodes": ["AT1G01040", "AT1G01090"],
+            "edges": []
+          },
+          "1": {
+            "nodes": ["As2", "AT1G01010", "AT1G01040", "AT1G01080", "AT1G01090"],
+            "edges": [
+              "As2__AT1G01040__pheno_assn__5.4",
+              "AT1G01010__AT1G01040__domain_co_occur__2.5",
+              "AT1G01010__AT1G01040__ppi_liter__170.5",
+              "AT1G01080__AT1G01090__ppi_liter__2.8"
+            ]
+          },
+          "5": {
+            "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01030", "AT1G01040", "AT1G01050", "AT1G01060", "AT1G01080",  "AT1G01090"],
+            "edges": [
+              "As2__AT1G01020__pheno_assn__8.4",
+              "As2__AT1G01040__pheno_assn__5.4",
+              "As75__AT1G01020__pheno_assn__39.9",
+              "AT1G01010__AT1G01020__ppi_hithru__2.3",
+              "AT1G01010__AT1G01030__ppi_hithru__2.4",
+              "AT1G01010__AT1G01040__domain_co_occur__2.5",
+              "AT1G01010__AT1G01040__ppi_liter__170.5",
+              "AT1G01030__AT1G01050__gene_coexpr__2.6",
+              "AT1G01050__AT1G01060__ppi_liter__2.7",
+              "AT1G01080__AT1G01090__ppi_liter__2.8"
+            ]
+          }
+        }
       }
     }
   },
 
   "fetch_clusters": {
-    "i6-1": {
-      "0": {
-        "nodes": ["AT1G01040", "AT1G01090"],
-        "edges": []
-      },
-      "1": {
-        "nodes": ["As2", "AT1G01010", "AT1G01040", "AT1G01080", "AT1G01090"],
-        "edges": [
-          "As2__AT1G01040__pheno_assn__5.4",
-          "AT1G01010__AT1G01040__domain_co_occur__2.5",
-          "AT1G01010__AT1G01040__ppi_liter__170.5",
-          "AT1G01080__AT1G01090__ppi_liter__2.8"
-        ]
-      },
-      "5": {
-        "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01030", "AT1G01040", "AT1G01050", "AT1G01060", "AT1G01080",  "AT1G01090"],
-        "edges": [
-          "As2__AT1G01020__pheno_assn__8.4",
-          "As2__AT1G01040__pheno_assn__5.4",
-          "As75__AT1G01020__pheno_assn__39.9",
-          "AT1G01010__AT1G01020__ppi_hithru__2.3",
-          "AT1G01010__AT1G01030__ppi_hithru__2.4",
-          "AT1G01010__AT1G01040__domain_co_occur__2.5",
-          "AT1G01010__AT1G01040__ppi_liter__170.5",
-          "AT1G01030__AT1G01050__gene_coexpr__2.6",
-          "AT1G01050__AT1G01060__ppi_liter__2.7",
-          "AT1G01080__AT1G01090__ppi_liter__2.8"
-        ]
-      }
-    },
-    "i2-5__i6-2": {
-      "0": {
-        "nodes": ["AT1G01020", "AT1G01070"],
-        "edges": []
+    "cluster_ids": {
+      "Mary Poppins": {
+        "distance": {
+          "0": {"nodes": [], "edges": []},
+          "1": {"nodes": [], "edges": []},
+          "5": {"nodes": [], "edges": []}
+        }
       },
-      "1": {
-        "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01070"],
-        "edges": [
-          "As2__AT1G01020__pheno_assn__8.4",
-          "As75__AT1G01020__pheno_assn__39.9",
-          "AT1G01010__AT1G01020__ppi_hithru__2.3"
-        ]
+      "markov_i6:1": {
+        "distance": {
+          "0": {
+            "nodes": ["AT1G01040", "AT1G01090"],
+            "edges": []
+          },
+          "1": {
+            "nodes": ["As2", "AT1G01010", "AT1G01040", "AT1G01080", "AT1G01090"],
+            "edges": [
+              "As2__AT1G01040__pheno_assn__5.4",
+              "AT1G01010__AT1G01040__domain_co_occur__2.5",
+              "AT1G01010__AT1G01040__ppi_liter__170.5",
+              "AT1G01080__AT1G01090__ppi_liter__2.8"
+            ]
+          },
+          "5": {
+            "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01030", "AT1G01040", "AT1G01050", "AT1G01060", "AT1G01080",  "AT1G01090"],
+            "edges": [
+              "As2__AT1G01020__pheno_assn__8.4",
+              "As2__AT1G01040__pheno_assn__5.4",
+              "As75__AT1G01020__pheno_assn__39.9",
+              "AT1G01010__AT1G01020__ppi_hithru__2.3",
+              "AT1G01010__AT1G01030__ppi_hithru__2.4",
+              "AT1G01010__AT1G01040__domain_co_occur__2.5",
+              "AT1G01010__AT1G01040__ppi_liter__170.5",
+              "AT1G01030__AT1G01050__gene_coexpr__2.6",
+              "AT1G01050__AT1G01060__ppi_liter__2.7",
+              "AT1G01080__AT1G01090__ppi_liter__2.8"
+            ]
+          }
+        }
       },
-      "5": {
-        "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01030", "AT1G01040", "AT1G01050", "AT1G01060", "AT1G01070"],
-        "edges": [
-          "As2__AT1G01020__pheno_assn__8.4",
-          "As2__AT1G01040__pheno_assn__5.4",
-          "As75__AT1G01020__pheno_assn__39.9",
-          "AT1G01010__AT1G01020__ppi_hithru__2.3",
-          "AT1G01010__AT1G01030__ppi_hithru__2.4",
-          "AT1G01010__AT1G01040__domain_co_occur__2.5",
-          "AT1G01010__AT1G01040__ppi_liter__170.5",
-          "AT1G01030__AT1G01050__gene_coexpr__2.6",
-          "AT1G01050__AT1G01060__ppi_liter__2.7"
-        ]
+      "markov_i2:5__markov_i6:2": {
+        "distance": {
+          "0": {
+            "nodes": ["AT1G01020", "AT1G01070"],
+            "edges": []
+          },
+          "1": {
+            "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01070"],
+            "edges": [
+              "As2__AT1G01020__pheno_assn__8.4",
+              "As75__AT1G01020__pheno_assn__39.9",
+              "AT1G01010__AT1G01020__ppi_hithru__2.3"
+            ]
+          },
+          "5": {
+            "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01030", "AT1G01040", "AT1G01050", "AT1G01060", "AT1G01070"],
+            "edges": [
+              "As2__AT1G01020__pheno_assn__8.4",
+              "As2__AT1G01040__pheno_assn__5.4",
+              "As75__AT1G01020__pheno_assn__39.9",
+              "AT1G01010__AT1G01020__ppi_hithru__2.3",
+              "AT1G01010__AT1G01030__ppi_hithru__2.4",
+              "AT1G01010__AT1G01040__domain_co_occur__2.5",
+              "AT1G01010__AT1G01040__ppi_liter__170.5",
+              "AT1G01030__AT1G01050__gene_coexpr__2.6",
+              "AT1G01050__AT1G01060__ppi_liter__2.7"
+            ]
+          }
+        }
       }
     }
   }
diff --git a/test/djornl/test_data/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I2_named.tsv b/test/djornl/test_data/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I2_named.tsv
index 086a920..585e0a5 100644
--- a/test/djornl/test_data/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I2_named.tsv
+++ b/test/djornl/test_data/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I2_named.tsv
@@ -1,3 +1,5 @@
+# prefix: markov_i2
+# title: Markov clustering, inflation = 2
 Cluster1	AT1G01010	AT1G01030	AT1G01040
 Cluster2	AT1G01050	AT1G01060	AT1G01070
 Cluster3	AT1G01080	AT1G01090
diff --git a/test/djornl/test_data/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I4_named.tsv b/test/djornl/test_data/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I4_named.tsv
index 8b13789..3cb18ea 100644
--- a/test/djornl/test_data/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I4_named.tsv
+++ b/test/djornl/test_data/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I4_named.tsv
@@ -1 +1,3 @@
+# prefix: markov_i4
+# title: Markov clustering, inflation = 4
 
diff --git a/test/djornl/test_data/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I6_named.tsv b/test/djornl/test_data/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I6_named.tsv
index 389cae2..d6a1b07 100644
--- a/test/djornl/test_data/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I6_named.tsv
+++ b/test/djornl/test_data/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I6_named.tsv
@@ -1,3 +1,5 @@
+# prefix: markov_i6
+# title: Markov clustering, inflation = 6
 Cluster1	AT1G01040	AT1G01090
 Cluster2	AT1G01070
 Cluster3	AT1G01010	AT1G01020	AT1G01030
diff --git a/test/stored_queries/test_djornl.py b/test/stored_queries/test_djornl.py
index df2a7e5..e61835b 100644
--- a/test/stored_queries/test_djornl.py
+++ b/test/stored_queries/test_djornl.py
@@ -79,6 +79,7 @@ def check_expected_results(self, description, response, expected):
 
         if _VERBOSE:
             print("Running test " + description)
+
         results = response['results'][0]
         self.assertEqual(
             set([n["_key"] for n in results['nodes']]),
@@ -112,25 +113,29 @@ def test_fetch_all(self):
 
 
     # indexing schema in results.json
-    # self.json_data[query][primary_param][distance_param]
-    # if primary_param is an array, join the array entities with "__"
+    # self.json_data[query_name][param_name][param_value]["distance"][distance_param]
+    # e.g. for fetch_clusters data:
+    # "fetch_clusters": {
+    #   "cluster_ids": {
+    #     "markov_i2:6__markov_i4:3": {
+    #       "distance": {
+    #         "1": {
+    #           "nodes": [ node IDs ],
+    #           "edges": [ edge data ],
+    #         }
+    #       }
+    #     }
+    #   }
+    # }
+    # if param_value is an array, join the array entities with "__"
     # results are in the form {"nodes": [...], "edges": [...]}
     # nodes are represented as a list of node[_key]
     # edges are objects with keys _to, _from, edge_type and score
 
-    def test_fetch_phenotypes_no_results(self):
-
-        resp = self.submit_query('djornl_fetch_phenotypes', {
-            # gene node
-            "keys": ["AT1G01010"],
-        })
-        self.assertEqual(resp['results'][0], self.no_results)
-
-
     def test_fetch_phenotypes(self):
 
-        for fetch_args in self.json_data['fetch_phenotypes'].keys():
-            for distance in self.json_data['fetch_phenotypes'][fetch_args].keys():
+        for (fetch_args, key_data) in self.json_data['fetch_phenotypes']['keys'].items():
+            for (distance, distance_data) in key_data['distance'].items():
                 resp = self.submit_query('djornl_fetch_phenotypes', {
                     "keys": fetch_args.split('__'),
                     "distance": int(distance),
@@ -138,22 +143,14 @@ def test_fetch_phenotypes(self):
                 self.check_expected_results(
                     "fetch phenotypes with args " + fetch_args + " and distance " + distance,
                     resp,
-                    self.json_data['fetch_phenotypes'][fetch_args][distance]
+                    distance_data
                 )
 
 
-    def test_fetch_genes_no_results(self):
-        resp = self.submit_query('djornl_fetch_genes', {
-            # phenotype node
-            "keys": ["As2"],
-        })
-        self.assertEqual(resp['results'][0], self.no_results)
-
-
     def test_fetch_genes(self):
 
-        for fetch_args in self.json_data['fetch_genes'].keys():
-            for distance in self.json_data['fetch_genes'][fetch_args].keys():
+        for (fetch_args, key_data) in self.json_data['fetch_genes']['keys'].items():
+            for (distance, distance_data) in key_data['distance'].items():
                 resp = self.submit_query('djornl_fetch_genes', {
                     "keys": fetch_args.split('__'),
                     "distance": int(distance),
@@ -161,54 +158,29 @@ def test_fetch_genes(self):
                 self.check_expected_results(
                     "fetch genes with args " + fetch_args + " and distance " + distance,
                     resp,
-                    self.json_data['fetch_genes'][fetch_args][distance]
+                    distance_data
                 )
 
 
-    def test_fetch_clusters_no_results(self):
-
-        resp = self.submit_query('djornl_fetch_clusters', {
-            'cluster_i2_ids': [666],
-            'cluster_i4_ids': [666],
-            'cluster_i6_ids': [666],
-        })
-        self.assertEqual(resp['results'][0], self.no_results)
-
-
     def test_fetch_clusters(self):
 
-        for fetch_args in self.json_data['fetch_clusters'].keys():
-            cluster_args = {}
-            for arg in fetch_args.split('__'):
-                [c_name, c_id] = arg.split('-', maxsplit=1)
-                if "cluster_" + c_name + "_ids" in cluster_args:
-                    cluster_args["cluster_" + c_name + "_ids"] += int(c_id)
-                else:
-                    cluster_args["cluster_" + c_name + "_ids"] = [int(c_id)]
-
-            for distance in self.json_data['fetch_clusters'][fetch_args].keys():
-                cluster_args['distance'] = int(distance)
-                resp = self.submit_query('djornl_fetch_clusters', cluster_args)
+        for (fetch_args, cluster_data) in self.json_data['fetch_clusters']['cluster_ids'].items():
+            for (distance, distance_data) in cluster_data['distance'].items():
+                resp = self.submit_query('djornl_fetch_clusters', {
+                    "cluster_ids": fetch_args.split('__'),
+                    "distance": int(distance),
+                })
                 self.check_expected_results(
                     "fetch clusters with args " + fetch_args + " and distance " + distance,
                     resp,
-                    self.json_data['fetch_clusters'][fetch_args][distance]
+                    distance_data
                 )
 
-    @unittest.skip('This test is disabled until automated view loading is possible')
-    def test_search_nodes_no_results(self):
-
-        resp = self.submit_query('djornl_search_nodes', {
-            "search_text": "Mary Poppins",
-        })
-        self.assertEqual(resp['results'][0], self.no_results)
-
 
-    @unittest.skip('This test is disabled until automated view loading is possible')
     def test_search_nodes(self):
 
-        for search_text in self.json_data['search_nodes'].keys():
-            for distance in self.json_data['search_nodes'][search_text].keys():
+        for (search_text, search_data) in self.json_data['search_nodes']['search_text'].items():
+            for (distance, distance_data) in search_data['distance'].items():
                 resp = self.submit_query('djornl_search_nodes', {
                     "search_text": search_text,
                     "distance": int(distance),
@@ -216,5 +188,5 @@ def test_search_nodes(self):
                 self.check_expected_results(
                     "search nodes with args " + search_text + " and distance " + distance,
                     resp,
-                    self.json_data['search_nodes'][search_text][distance]
+                    distance_data
                 )
diff --git a/test/stored_queries/test_djornl_parser.py b/test/stored_queries/test_djornl_parser.py
index b2043b9..8d38761 100644
--- a/test/stored_queries/test_djornl_parser.py
+++ b/test/stored_queries/test_djornl_parser.py
@@ -142,4 +142,3 @@ def test_load_valid_cluster_data(self):
             cluster_data,
             self.json_data["load_cluster_data"]
         )
-

From 93205a274600b6233e6c67f74684aa786d17fbf4 Mon Sep 17 00:00:00 2001
From: ialarmedalien <ialarmedalien@gmail.com>
Date: Mon, 20 Jul 2020 15:09:16 -0700
Subject: [PATCH 2/4] Adding in manifest and manifest schema for indicating the
 list of files that make up the data, plus code to validate the manifest.
 Created manifests for all test files and updated tests accordingly. Added
 djornl data source (GitHub repo).

---
 data_sources/djornl.yaml                      |   5 +
 importers/djornl/manifest.schema.json         |  52 +++
 importers/djornl/parser.py                    | 318 +++++++++++-------
 schemas/deltaloader/delta_load_registry.yaml  |   2 +-
 schemas/djornl/djornl_node.yaml               |   4 +
 test/djornl/col_count_errors/manifest.yaml    |   5 +
 test/djornl/empty_files/manifest.yaml         |  17 +
 test/djornl/invalid_file/manifest.yaml        |   9 +
 test/djornl/invalid_manifest/manifest.yaml    |  10 +
 ...aragwas-MERGED-AMW-v2_091319_nodeTable.csv |   1 +
 test/djornl/invalid_types/manifest.yaml       |   5 +
 .../merged_edges-AMW-060820_AF.tsv            |   1 +
 test/djornl/missing_files/manifest.yaml       |   9 +
 ...F_082919.abc.I2_named.tsv => I2_named.tsv} |   1 +
 ...F_082919.abc.I4_named.tsv => I4_named.tsv} |   2 +-
 ...F_082919.abc.I6_named.tsv => I6_named.tsv} |   1 +
 ...rged_edges-AMW-060820_AF.tsv => edges.tsv} |   0
 test/djornl/test_data/manifest.yaml           |  19 ++
 ...-AMW-v2_091319_nodeTable.csv => nodes.csv} |   1 +
 test/stored_queries/test_djornl.py            |  24 +-
 test/stored_queries/test_djornl_parser.py     |  89 +++--
 21 files changed, 405 insertions(+), 170 deletions(-)
 create mode 100644 data_sources/djornl.yaml
 create mode 100644 importers/djornl/manifest.schema.json
 create mode 100644 test/djornl/col_count_errors/manifest.yaml
 create mode 100644 test/djornl/empty_files/manifest.yaml
 create mode 100644 test/djornl/invalid_file/manifest.yaml
 create mode 100644 test/djornl/invalid_manifest/manifest.yaml
 create mode 100644 test/djornl/invalid_types/manifest.yaml
 create mode 100644 test/djornl/missing_files/manifest.yaml
 rename test/djornl/test_data/{cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I2_named.tsv => I2_named.tsv} (90%)
 rename test/djornl/test_data/{cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I4_named.tsv => I4_named.tsv} (74%)
 rename test/djornl/test_data/{cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I6_named.tsv => I6_named.tsv} (88%)
 rename test/djornl/test_data/{merged_edges-AMW-060820_AF.tsv => edges.tsv} (100%)
 create mode 100644 test/djornl/test_data/manifest.yaml
 rename test/djornl/test_data/{aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv => nodes.csv} (99%)

diff --git a/data_sources/djornl.yaml b/data_sources/djornl.yaml
new file mode 100644
index 0000000..495aa8a
--- /dev/null
+++ b/data_sources/djornl.yaml
@@ -0,0 +1,5 @@
+name: djornl
+category: network
+title: Jacobson Lab Exascale Networking data
+home_url: https://github.com/kbase/exascale_data
+data_url: https://github.com/kbase/exascale_data/releases/latest
diff --git a/importers/djornl/manifest.schema.json b/importers/djornl/manifest.schema.json
new file mode 100644
index 0000000..e29ab28
--- /dev/null
+++ b/importers/djornl/manifest.schema.json
@@ -0,0 +1,52 @@
+{
+ "$schema": "http://json-schema.org/draft-07/schema#",
+  "title": "Exascale parser file manifest",
+  "type": "array",
+  "items": {
+    "type": "object",
+    "required": ["data_type", "path"],
+    "oneOf": [
+      {
+        "properties": {
+          "data_type": { "enum": ["cluster"] }
+        },
+        "required": [ "prefix" ]
+      },
+      {
+        "properties": {
+          "data_type": { "enum": [ "node", "edge" ] }
+        }
+      }
+    ],
+    "properties": {
+      "data_type": {
+        "title": "Data type",
+        "type": "string",
+        "enum": ["node", "edge", "cluster"]
+      },
+      "creation_date": {
+        "title": "File creation date",
+        "description": "date of file creation in the format YYYY-MM-DD",
+        "type": "string",
+        "format": "date"
+      },
+      "description": {
+        "title": "Description of the cluster set",
+        "type": "string"
+      },
+      "path": {
+        "title": "File path",
+        "type": "string"
+      },
+      "prefix": {
+        "title": "Prefix",
+        "type": "string",
+        "description": "The prefix used to build cluster IDs of the form <prefix>:<id>, e.g. the prefix markov_i2 gives the cluster ID markov_i2:4. Required for cluster data; not used for node or edge data."
+      },
+      "title": {
+        "title": "Name of the cluster set",
+        "type": "string"
+      }
+    }
+  }
+}
diff --git a/importers/djornl/parser.py b/importers/djornl/parser.py
index c6051e8..ebf7ed1 100644
--- a/importers/djornl/parser.py
+++ b/importers/djornl/parser.py
@@ -8,6 +8,12 @@
 import requests
 import os
 import csv
+import yaml
+import json
+import jsonschema
+
+from jsonschema.validators import Draft7Validator
+from jsonschema.exceptions import ValidationError
 
 import importers.utils.config as config
 
@@ -28,34 +34,50 @@ def _configure(self):
         configuration['_NODE_NAME'] = 'djornl_node'
         configuration['_EDGE_NAME'] = 'djornl_edge'
 
-        # Path config
-        configuration['_NODE_PATH'] = os.path.join(
-            configuration['ROOT_DATA_PATH'],
-            'aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv'
-        )
-        configuration['_NODE_FILE_COL_COUNT'] = 20
+        # read the manifest file, which contains path and file type info
+        manifest_file = os.path.join(configuration['ROOT_DATA_PATH'], 'manifest.yaml')
+
+        try:
+            with open(manifest_file) as fd:
+                manifest = yaml.safe_load(fd)
+        except FileNotFoundError:
+            raise RuntimeError(
+                f"No manifest file found at {manifest_file}.\n"
+                + "Please ensure that you have created a manifest that lists the files "
+                + "in the release"
+            )
+
+        # load the schema for the manifest and ensure that it is valid
+        schema_file = os.path.join(os.path.dirname(__file__), 'manifest.schema.json')
+        with open(schema_file) as fd:
+            manifest_schema = json.load(fd)
+
+        validator = Draft7Validator(manifest_schema)
+        if not validator.is_valid(manifest):
+            error_list = []
+            raise RuntimeError(
+                "The manifest file failed validation with the following errors:\n"
+                + "\n".join(e.message for e in sorted(validator.iter_errors(manifest), key=str))
+                + "\nPlease recheck the file and try again."
+            )
+
+        # make sure all the files listed actually exist
+        for type in ['node', 'edge', 'cluster']:
+            configuration[type + '_files'] = []
+
+        for file in manifest:
+            file_path = os.path.join(configuration['ROOT_DATA_PATH'], file['path'])
+
+            if not os.path.exists(file_path):
+                raise RuntimeError(f"{file_path}: file does not exist")
+
+            if not os.path.isfile(file_path):
+                raise RuntimeError(f"{file_path}: not a file")
+
+            # add the file to the appropriate list
+            file['file_path'] = file_path
+            configuration[file['data_type'] + '_files'].append(file)
 
-        configuration['_EDGE_PATH'] = os.path.join(
-            configuration['ROOT_DATA_PATH'],
-            'merged_edges-AMW-060820_AF.tsv'
-        )
-        configuration['_EDGE_FILE_COL_COUNT'] = 5
-
-        _CLUSTER_BASE = os.path.join(configuration['ROOT_DATA_PATH'], 'cluster_data')
-        configuration['_CLUSTER_PATHS'] = {
-            'markov_i2': os.path.join(
-                _CLUSTER_BASE,
-                'out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I2_named.tsv'
-            ),
-            'markov_i4': os.path.join(
-                _CLUSTER_BASE,
-                'out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I4_named.tsv'
-            ),
-            'markov_i6': os.path.join(
-                _CLUSTER_BASE,
-                'out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I6_named.tsv'
-            ),
-        }
         self._config = configuration
         return self._config
 
@@ -76,33 +98,44 @@ def load_edges(self):
         node_ix = {}
         edges = []
         node_name = self.config()['_NODE_NAME']
-        expected_col_count = self.config()['_EDGE_FILE_COL_COUNT']
-
-        with open(self.config()['_EDGE_PATH']) as fd:
-            csv_reader = csv.reader(fd, delimiter='\t')
-            next(csv_reader, None)  # skip headers
-            line_no = 1
-            for row in csv_reader:
-                line_no += 1
-
-                cols = [c.strip() for c in row]
-                if len(cols) != expected_col_count:
-                    n_cols = len(cols)
-                    raise RuntimeError(f"line {line_no}: expected {expected_col_count} cols, found {n_cols}")
-
-                node_ix[cols[0]] = 1
-                node_ix[cols[1]] = 1
-                edge_type = cols[4]
-                if edge_type not in edge_remap:
-                    raise RuntimeError(f"line {line_no}: invalid edge type: {edge_type}")
-
-                edges.append({
-                    '_key': f'{cols[0]}__{cols[1]}__{edge_remap[edge_type]}__{cols[2]}',
-                    '_from': f'{node_name}/{cols[0]}',
-                    '_to': f'{node_name}/{cols[1]}',
-                    'score': float(cols[2]),
-                    'edge_type': edge_remap[edge_type],
-                })
+        expected_col_count = 0
+        headers = []
+
+        for file in self.config()['edge_files']:
+            with open(file['file_path']) as fd:
+                csv_reader = csv.reader(fd, delimiter='\t')
+                line_no = 0
+                for row in csv_reader:
+                    line_no += 1
+                    if len(row) <= 1 or row[0][0] == '#':
+                        # comment / metadata
+                        continue
+
+                    cols = [c.strip() for c in row]
+
+                    if len(cols) != expected_col_count:
+                        n_cols = len(cols)
+
+                        if len(headers) == 0:
+                            expected_col_count = len(cols)
+                            headers = cols
+                            continue
+
+                        raise RuntimeError(f"{file['path']} line {line_no}: expected {expected_col_count} cols, found {n_cols}")
+
+                    node_ix[cols[0]] = 1
+                    node_ix[cols[1]] = 1
+                    edge_type = cols[4]
+                    if edge_type not in edge_remap:
+                        raise RuntimeError(f"{file['path']} line {line_no}: invalid edge type: {edge_type}")
+
+                    edges.append({
+                        '_key': f'{cols[0]}__{cols[1]}__{edge_remap[edge_type]}__{cols[2]}',
+                        '_from': f'{node_name}/{cols[0]}',
+                        '_to': f'{node_name}/{cols[1]}',
+                        'score': float(cols[2]),
+                        'edge_type': edge_remap[edge_type],
+                    })
 
         return {
             'nodes': [{'_key': n} for n in node_ix.keys()],
@@ -114,49 +147,62 @@ def load_node_metadata(self):
         """Load node metadata"""
 
         nodes = []
-        expected_col_count = self.config()['_NODE_FILE_COL_COUNT']
-        with open(self.config()['_NODE_PATH']) as fd:
-            csv_reader = csv.reader(fd, delimiter=',')
-            next(csv_reader, None)  # skip headers
-            line_no = 1
-            for row in csv_reader:
-                line_no += 1
-
-                cols = [c.strip() for c in row]
-                if len(cols) != expected_col_count:
-                    n_cols = len(cols)
-                    raise RuntimeError(f"line {line_no}: expected {expected_col_count} cols, found {n_cols}")
-
-                _key = cols[0]
-                node_type = cols[1]
-                if node_type != 'gene' and node_type != 'pheno':
-                    raise RuntimeError(f"line {line_no}: invalid node type: {node_type}")
-
-                go_terms = [c.strip() for c in cols[10].split(',')] if len(cols[10]) else []
-
-                doc = {
-                    '_key': _key,
-                    'node_type': node_type,
-                    'transcript': cols[2],
-                    'gene_symbol': cols[3],
-                    'gene_full_name': cols[4],
-                    'gene_model_type': cols[5],
-                    'tair_computational_desc': cols[6],
-                    'tair_curator_summary': cols[7],
-                    'tair_short_desc': cols[8],
-                    'go_descr': cols[9],
-                    'go_terms': go_terms,
-                    'mapman_bin': cols[11],
-                    'mapman_name': cols[12],
-                    'mapman_desc': cols[13],
-                    'pheno_aragwas_id': cols[14],
-                    'pheno_desc1': cols[15],
-                    'pheno_desc2': cols[16],
-                    'pheno_desc3': cols[17],
-                    'pheno_ref': cols[18],
-                    'user_notes': cols[19],
-                }
-                nodes.append(doc)
+        headers = []
+        expected_col_count = 0
+        valid_node_types = ['gene', 'pheno']
+        for file in self.config()['node_files']:
+            with open(file['file_path']) as fd:
+                csv_reader = csv.reader(fd, delimiter=',')
+                line_no = 0
+                for row in csv_reader:
+                    line_no += 1
+                    if len(row) <= 1 or row[0][0] == '#':
+                        # comment / metadata
+                        continue
+
+                    cols = [c.strip() for c in row]
+                    if len(cols) != expected_col_count:
+
+                        if len(headers) == 0:
+                            # this is the header row; set up the expected column count
+                            expected_col_count = len(cols)
+                            headers = cols
+                            continue
+
+                        # otherwise, this row does not have the correct number of columns
+                        n_cols = len(cols)
+                        raise RuntimeError(f"{file['path']} line {line_no}: expected {expected_col_count} cols, found {n_cols}")
+
+                    _key = cols[0]
+                    node_type = cols[1]
+                    if node_type not in valid_node_types:
+                        raise RuntimeError(f"{file['path']} line {line_no}: invalid node type: {node_type}")
+
+                    go_terms = [c.strip() for c in cols[10].split(',')] if len(cols[10]) else []
+
+                    doc = {
+                        '_key': _key,
+                        'node_type': node_type,
+                        'transcript': cols[2],
+                        'gene_symbol': cols[3],
+                        'gene_full_name': cols[4],
+                        'gene_model_type': cols[5],
+                        'tair_computational_desc': cols[6],
+                        'tair_curator_summary': cols[7],
+                        'tair_short_desc': cols[8],
+                        'go_descr': cols[9],
+                        'go_terms': go_terms,
+                        'mapman_bin': cols[11],
+                        'mapman_name': cols[12],
+                        'mapman_desc': cols[13],
+                        'pheno_aragwas_id': cols[14],
+                        'pheno_desc1': cols[15],
+                        'pheno_desc2': cols[16],
+                        'pheno_desc3': cols[17],
+                        'pheno_ref': cols[18],
+                        'user_notes': cols[19],
+                    }
+                    nodes.append(doc)
 
         return {'nodes': nodes}
 
@@ -166,44 +212,41 @@ def load_cluster_data(self):
 
         # index of nodes
         node_ix = {}
-        cluster_paths = self.config()['_CLUSTER_PATHS']
-        for (cluster_label, path) in cluster_paths.items():
-            with open(path) as fd:
+        for file in self.config()['cluster_files']:
+            cluster_label = file['prefix']
+            with open(file['file_path']) as fd:
                 csv_reader = csv.reader(fd, delimiter='\t')
+                line_no = 0
                 for row in csv_reader:
-                    if len(row) > 1:
-                        self._parse_cluster_row(row, cluster_label, node_ix)
+                    line_no += 1
+                    if len(row) <= 1 or row[0][0] == '#':
+                        # comment / metadata
+                        continue
 
+                    self._parse_cluster_row(row, cluster_label, node_ix)
 
         # gather a list of cluster IDs for each node
-        nodes = []
-        for (key, cluster_data) in node_ix.items():
-            clusters = []
-            for (cluster_label, id_list) in cluster_data.items():
-                clusters += [cluster_label + ":" + id for id in id_list]
-
-            nodes += [{
-                '_key': key,
-                'clusters': clusters
-            }]
+        nodes = [{
+            '_key': key,
+            'clusters': cluster_data
+        } for (key, cluster_data) in node_ix.items()]
 
         return {'nodes': nodes}
 
 
     def _parse_cluster_row(self, row, cluster_label, node_ix):
-        # metadata rows start with '#'
-        if row[0] != '#':
-            # remove the 'Cluster' text
-            cluster_id = row[0].replace('Cluster','')
-            node_keys = row[1:]
 
-            for key in node_keys:
-                if key not in node_ix:
-                    node_ix[key] = {}
-                if cluster_label not in node_ix[key]:
-                    node_ix[key][cluster_label] = []
+        # remove the 'Cluster' text
+        id = row[0].replace('Cluster','')
+        node_keys = row[1:]
+
+        for key in node_keys:
+            if key not in node_ix:
+                node_ix[key] = []
 
-                node_ix[key][cluster_label].append(cluster_id)
+            cluster_id = cluster_label + ':' + id
+            if cluster_id not in node_ix[key]:
+                node_ix[key].append(cluster_id)
 
 
     def save_dataset(self, dataset):
@@ -237,3 +280,32 @@ def load_data(self):
         self.save_dataset(self.load_node_metadata())
         self.save_dataset(self.load_cluster_data())
 
+
+    def check_data_delta(self):
+        edge_data = self.load_edges()
+        node_metadata = self.load_node_metadata()
+        clusters = self.load_cluster_data()
+
+        self.check_deltas(edge_data=edge_data, node_metadata=node_metadata, cluster_data=clusters)
+
+    def check_deltas(self, edge_data={}, node_metadata={}, cluster_data={}):
+
+        edge_nodes = set([e['_key'] for e in edge_data['nodes']])
+        node_metadata_nodes = set([e['_key'] for e in node_metadata['nodes']])
+        cluster_nodes = set([e['_key'] for e in cluster_data['nodes']])
+        all_nodes = edge_nodes.union(node_metadata_nodes).union(cluster_nodes)
+
+        # check all nodes in cluster_data have node_metadata
+        clstr_no_node_md_set = cluster_nodes.difference(node_metadata_nodes)
+        if clstr_no_node_md_set:
+           print({'clusters with no node metadata': clstr_no_node_md_set})
+
+        # check all nodes in the edge_data have node_metadata
+        edge_no_node_md_set = edge_nodes.difference(node_metadata_nodes)
+        if edge_no_node_md_set:
+            print({'edges with no node metadata': edge_no_node_md_set})
+
+        # count all edges
+        print("Dataset contains " + str(len(edge_data['edges'])) + " edges")
+        # count all nodes
+        print("Dataset contains " + str(len(all_nodes)) + " nodes")
diff --git a/schemas/deltaloader/delta_load_registry.yaml b/schemas/deltaloader/delta_load_registry.yaml
index dc9c7f8..419b2a7 100644
--- a/schemas/deltaloader/delta_load_registry.yaml
+++ b/schemas/deltaloader/delta_load_registry.yaml
@@ -5,7 +5,7 @@ schema:
   "$schema": http://json-schema.org/draft-07/schema#
   title: delta_load_registry
   type: object
-  description: Don't touch this. It's for the exlusive use of delta loaders.
+  description: Don't touch this. It's for the exclusive use of delta loaders.
   properties:
     _key:
       type: string
diff --git a/schemas/djornl/djornl_node.yaml b/schemas/djornl/djornl_node.yaml
index f200931..a7b44a8 100644
--- a/schemas/djornl/djornl_node.yaml
+++ b/schemas/djornl/djornl_node.yaml
@@ -2,6 +2,10 @@ name: djornl_node
 type: vertex
 delta: false
 
+indexes:
+ - type: hash
+   fields: ["clusters[*]"]
+
 schema:
   "$schema": http://json-schema.org/draft-07/schema#
   title: Gene and Phenotype Vertices
diff --git a/test/djornl/col_count_errors/manifest.yaml b/test/djornl/col_count_errors/manifest.yaml
new file mode 100644
index 0000000..88ab96d
--- /dev/null
+++ b/test/djornl/col_count_errors/manifest.yaml
@@ -0,0 +1,5 @@
+- data_type: edge
+  path: merged_edges-AMW-060820_AF.tsv
+
+- data_type: node
+  path: aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv
diff --git a/test/djornl/empty_files/manifest.yaml b/test/djornl/empty_files/manifest.yaml
new file mode 100644
index 0000000..7d42ff6
--- /dev/null
+++ b/test/djornl/empty_files/manifest.yaml
@@ -0,0 +1,17 @@
+- data_type: edge
+  path: merged_edges-AMW-060820_AF.tsv
+
+- data_type: node
+  path: aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv
+
+- data_type: cluster
+  prefix: markov_i2
+  path: cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I2_named.tsv
+
+- data_type: cluster
+  prefix: markov_i4
+  path: cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I4_named.tsv
+
+- data_type: cluster
+  prefix: markov_i6
+  path: cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I6_named.tsv
diff --git a/test/djornl/invalid_file/manifest.yaml b/test/djornl/invalid_file/manifest.yaml
new file mode 100644
index 0000000..3a12de5
--- /dev/null
+++ b/test/djornl/invalid_file/manifest.yaml
@@ -0,0 +1,9 @@
+- data_type: edge
+  path: edges.tsv
+
+- data_type: node
+  path: nodes.csv
+
+- data_type: cluster
+  prefix: markov_i2
+  path: clusters.tsv
diff --git a/test/djornl/invalid_manifest/manifest.yaml b/test/djornl/invalid_manifest/manifest.yaml
new file mode 100644
index 0000000..e7fa88e
--- /dev/null
+++ b/test/djornl/invalid_manifest/manifest.yaml
@@ -0,0 +1,10 @@
+- data_type: edge
+  path: edges.tsv
+
+- data_type: node
+
+- data_type: cluster
+  path: clusters.tsv
+
+- data_type: ping-pong balls
+  path: where?
\ No newline at end of file
diff --git a/test/djornl/invalid_types/aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv b/test/djornl/invalid_types/aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv
index af5fa6c..543dd99 100644
--- a/test/djornl/invalid_types/aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv
+++ b/test/djornl/invalid_types/aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv
@@ -1,4 +1,5 @@
 node_id,node_type,transcript,gene_symbol,gene_full_name,gene_model_type,TAIR_Computational_description,TAIR_Curator_summary,TAIR_short_description,GO_descr,GO_terms,MapMan_bin,MapMan_name,MapMan_descr,pheno_AraGWAS_ID,pheno_descrip1,pheno_descrip2,pheno_descrip3,pheno_ref,UserNotes
+# data_type: node
 As2,pheno,,,,,,,,,,,,,10.21958/phenotype:103,,bacterial disease resistance,The resistance exhibited by a plant or a group of plants (population) in response to the disease caused by a bacterial pathogen infection as compared to the susceptible and/or the reference plants of the same species. [GR:pj],"Atwell et. al, Nature 2010",
 As75,pheno,,,,,,,,,,,,,10.21958/phenotype:67,"Arsenic concentrations in leaves, grown in soil. Elemental analysis was performed with an ICP-MS (PerkinElmer). Sample normalized to calculated weights as described in Baxter et al., 2008",arsenic concentration,A mineral and ion content related trait (TO:0000465) which is the concentration of arsenic (CHEBI:22632) in some plant structure (PO:0009011). [GR:Karthik],"Atwell et. al, Nature 2010",
 AT1G01010,Monkey,AT1G01010.1,NTL10,NAC domain containing protein 1,protein_coding,NAC domain containing protein 1;(source:Araport11),,NAC domain containing protein 1,"DNA-binding transcription factor activity, DNA binding","GO:0003700, GO:0003677",15.5.17,.RNA biosynthesis.transcriptional regulation.transcription factor (NAC),transcription factor (NAC) (original description: pep chromosome:TAIR10:1:3631:5899:1 gene:AT1G01010 transcript:AT1G01010.1 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:NAC001 description:NAC domain-containing protein 1 [Source:UniProtKB/Swiss-Prot;Acc:Q0WV96]),,,,,,
diff --git a/test/djornl/invalid_types/manifest.yaml b/test/djornl/invalid_types/manifest.yaml
new file mode 100644
index 0000000..88ab96d
--- /dev/null
+++ b/test/djornl/invalid_types/manifest.yaml
@@ -0,0 +1,5 @@
+- data_type: edge
+  path: merged_edges-AMW-060820_AF.tsv
+
+- data_type: node
+  path: aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv
diff --git a/test/djornl/invalid_types/merged_edges-AMW-060820_AF.tsv b/test/djornl/invalid_types/merged_edges-AMW-060820_AF.tsv
index f9857bd..a98f49f 100644
--- a/test/djornl/invalid_types/merged_edges-AMW-060820_AF.tsv
+++ b/test/djornl/invalid_types/merged_edges-AMW-060820_AF.tsv
@@ -1,3 +1,4 @@
+# data_type: edge
 node1	node2	edge	edge_descrip	layer_descrip
 As2	AT1G01020	8.422046084731258	AraGWAS-Association_score	AraGWAS-Some-Old-Rubbish-I-Made-Up
 As2	AT1G01040	5.422046084731258	AraGWAS-Association_score	AraGWAS-Phenotype_Associations
diff --git a/test/djornl/missing_files/manifest.yaml b/test/djornl/missing_files/manifest.yaml
new file mode 100644
index 0000000..3a12de5
--- /dev/null
+++ b/test/djornl/missing_files/manifest.yaml
@@ -0,0 +1,9 @@
+- data_type: edge
+  path: edges.tsv
+
+- data_type: node
+  path: nodes.csv
+
+- data_type: cluster
+  prefix: markov_i2
+  path: clusters.tsv
diff --git a/test/djornl/test_data/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I2_named.tsv b/test/djornl/test_data/I2_named.tsv
similarity index 90%
rename from test/djornl/test_data/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I2_named.tsv
rename to test/djornl/test_data/I2_named.tsv
index 585e0a5..46f4498 100644
--- a/test/djornl/test_data/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I2_named.tsv
+++ b/test/djornl/test_data/I2_named.tsv
@@ -1,3 +1,4 @@
+# data_type: cluster
 # prefix: markov_i2
 # title: Markov clustering, inflation = 2
 Cluster1	AT1G01010	AT1G01030	AT1G01040
diff --git a/test/djornl/test_data/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I4_named.tsv b/test/djornl/test_data/I4_named.tsv
similarity index 74%
rename from test/djornl/test_data/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I4_named.tsv
rename to test/djornl/test_data/I4_named.tsv
index 3cb18ea..147831e 100644
--- a/test/djornl/test_data/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I4_named.tsv
+++ b/test/djornl/test_data/I4_named.tsv
@@ -1,3 +1,3 @@
 # prefix: markov_i4
 # title: Markov clustering, inflation = 4
-
+# data_type: cluster
diff --git a/test/djornl/test_data/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I6_named.tsv b/test/djornl/test_data/I6_named.tsv
similarity index 88%
rename from test/djornl/test_data/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I6_named.tsv
rename to test/djornl/test_data/I6_named.tsv
index d6a1b07..b4680eb 100644
--- a/test/djornl/test_data/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I6_named.tsv
+++ b/test/djornl/test_data/I6_named.tsv
@@ -1,3 +1,4 @@
+# data_type: cluster
 # prefix: markov_i6
 # title: Markov clustering, inflation = 6
 Cluster1	AT1G01040	AT1G01090
diff --git a/test/djornl/test_data/merged_edges-AMW-060820_AF.tsv b/test/djornl/test_data/edges.tsv
similarity index 100%
rename from test/djornl/test_data/merged_edges-AMW-060820_AF.tsv
rename to test/djornl/test_data/edges.tsv
diff --git a/test/djornl/test_data/manifest.yaml b/test/djornl/test_data/manifest.yaml
new file mode 100644
index 0000000..2eb28e3
--- /dev/null
+++ b/test/djornl/test_data/manifest.yaml
@@ -0,0 +1,19 @@
+- data_type: edge
+  path: edges.tsv
+  date_created: 2020-12-25
+
+- data_type: node
+  path: nodes.csv
+  date_created: 2019-01-01
+
+- data_type: cluster
+  prefix: markov_i2
+  path: I2_named.tsv
+
+- data_type: cluster
+  prefix: markov_i4
+  path: I4_named.tsv
+
+- data_type: cluster
+  prefix: markov_i6
+  path: I6_named.tsv
diff --git a/test/djornl/test_data/aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv b/test/djornl/test_data/nodes.csv
similarity index 99%
rename from test/djornl/test_data/aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv
rename to test/djornl/test_data/nodes.csv
index 5bc0e1d..a032142 100644
--- a/test/djornl/test_data/aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv
+++ b/test/djornl/test_data/nodes.csv
@@ -1,3 +1,4 @@
+# data_type: node
 node_id,node_type,transcript,gene_symbol,gene_full_name,gene_model_type,TAIR_Computational_description,TAIR_Curator_summary,TAIR_short_description,GO_descr,GO_terms,MapMan_bin,MapMan_name,MapMan_descr,pheno_AraGWAS_ID,pheno_descrip1,pheno_descrip2,pheno_descrip3,pheno_ref,UserNotes
 As2,pheno,,,,,,,,,,,,,10.21958/phenotype:103,,bacterial disease resistance,The resistance exhibited by a plant or a group of plants (population) in response to the disease caused by a bacterial pathogen infection as compared to the susceptible and/or the reference plants of the same species. [GR:pj],"Atwell et. al, Nature 2010",
 As75,pheno,,,,,,,,,,,,,10.21958/phenotype:67,"Arsenic concentrations in leaves, grown in soil. Elemental analysis was performed with an ICP-MS (PerkinElmer). Sample normalized to calculated weights as described in Baxter et al., 2008",arsenic concentration,A mineral and ion content related trait (TO:0000465) which is the concentration of arsenic (CHEBI:22632) in some plant structure (PO:0009011). [GR:Karthik],"Atwell et. al, Nature 2010",
diff --git a/test/stored_queries/test_djornl.py b/test/stored_queries/test_djornl.py
index e61835b..b6468ba 100644
--- a/test/stored_queries/test_djornl.py
+++ b/test/stored_queries/test_djornl.py
@@ -94,23 +94,23 @@ def check_expected_results(self, description, response, expected):
 
     def test_fetch_all(self):
 
-        # expect all the nodes from load_node_metadata and all the edges from load_edges
-        expected = {
-            "nodes": [n["_key"] for n in self.json_data['load_node_metadata']['nodes']],
-            "edges": [ {
-              "_to":        e["_to"],
-              "_from":      e["_from"],
-              "score":      e["score"],
-              "edge_type":  e["edge_type"] } for e in self.json_data['load_edges']['edges']
-            ]
-        }
-
+        response = self.submit_query('djornl_fetch_all')
         self.check_expected_results(
             "djornl_fetch_all",
-            self.submit_query('djornl_fetch_all'),
+            response,
             self.json_data['fetch_all']
         )
 
+        # ensure that all the cluster data is returned OK
+        node_data = response['results'][0]['nodes']
+        nodes_with_clusters = [json.dumps({
+            '_key':     n['_key'],
+            'clusters': n['clusters']
+        }) for n in node_data if 'clusters' in n]
+        self.assertEqual(
+            set(nodes_with_clusters),
+            set([json.dumps(this) for this in self.json_data['load_cluster_data']['nodes']])
+        )
 
     # indexing schema in results.json
     # self.json_data[query_name][param_name][param_value]["distance"][distance_param]
diff --git a/test/stored_queries/test_djornl_parser.py b/test/stored_queries/test_djornl_parser.py
index 8d38761..347e17d 100644
--- a/test/stored_queries/test_djornl_parser.py
+++ b/test/stored_queries/test_djornl_parser.py
@@ -10,6 +10,7 @@
 import requests
 import os
 import contextlib
+from jsonschema.exceptions import ValidationError
 
 from importers.djornl.parser import DJORNL_Parser
 
@@ -40,69 +41,86 @@ def init_parser_with_path(self, root_path):
             return parser
 
 
-    def test_load_empty_files(self):
-        """ test loading files containing no data """
+    def test_load_no_manifest(self):
+        """ test loading when the manifest does not exist """
+        RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'no_manifest')
+        err_str = 'No manifest file found at ' + os.path.join(RES_ROOT_DATA_PATH, 'manifest.yaml')
+        with self.assertRaisesRegex(RuntimeError, err_str):
+            self.init_parser_with_path(RES_ROOT_DATA_PATH)
 
-        # path: test/djornl/empty_files
-        RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'empty_files')
-        parser = self.init_parser_with_path(RES_ROOT_DATA_PATH)
 
-        self.assertEqual(parser.load_edges(), {"nodes": [], "edges": []})
-        self.assertEqual(parser.load_node_metadata(), {"nodes": []})
-        self.assertEqual(parser.load_cluster_data(), {"nodes": []})
+    def test_load_invalid_manifest(self):
+        """ test an invalid manifest file """
+        RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'invalid_manifest')
+        err_str = "The manifest file failed validation with the following errors:"
+        with self.assertRaisesRegex(RuntimeError, err_str):
+            self.init_parser_with_path(RES_ROOT_DATA_PATH)
+
+
+    def test_load_invalid_file(self):
+        """ test loading when a file specified in the manifest is a directory """
+
+        RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'invalid_file')
+
+        # edges: directory, not a file
+        err_str = os.path.join(RES_ROOT_DATA_PATH, "edges.tsv") + ": not a file"
+        with self.assertRaisesRegex(RuntimeError, err_str):
+            self.init_parser_with_path(RES_ROOT_DATA_PATH)
 
 
     def test_load_missing_files(self):
         """ test loading when files cannot be found """
 
-        # this dir does not contain the correct file structure
-        # path: test/djornl/empty_files/cluster_data
-        RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'empty_files', 'cluster_data')
-        parser = self.init_parser_with_path(RES_ROOT_DATA_PATH)
+        RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'missing_files')
+        # edges: file does not exist
+        err_str = os.path.join(RES_ROOT_DATA_PATH, "edges.tsv") + ': file does not exist'
+        with self.assertRaisesRegex(RuntimeError, err_str):
+            self.init_parser_with_path(RES_ROOT_DATA_PATH)
 
-        err_str = "No such file or directory: '" + RES_ROOT_DATA_PATH
-        with self.assertRaisesRegex(FileNotFoundError, err_str):
-            parser.load_edges()
 
-        with self.assertRaisesRegex(FileNotFoundError, err_str):
-            parser.load_node_metadata()
+    def test_load_empty_files(self):
+        """ test loading files containing no data """
 
-        with self.assertRaisesRegex(FileNotFoundError, err_str):
-            parser.load_cluster_data()
+        # path: test/djornl/empty_files
+        RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'empty_files')
+        parser = self.init_parser_with_path(RES_ROOT_DATA_PATH)
+        self.assertEqual(parser.load_edges(), {"nodes": [], "edges": []})
+        self.assertEqual(parser.load_node_metadata(), {"nodes": []})
+        self.assertEqual(parser.load_cluster_data(), {"nodes": []})
 
 
-    def test_load_invalid_types(self):
-        """ test file format errors """
+    def test_load_col_count_errors(self):
+        """ test files with invalid numbers of columns """
 
-        # path: test/djornl/invalid_types
-        RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'invalid_types')
+        # path: test/djornl/col_count_errors
+        RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'col_count_errors')
         parser = self.init_parser_with_path(RES_ROOT_DATA_PATH)
 
         # invalid edge type
-        edge_err_msg = 'line 2: invalid edge type: AraGWAS-Some-Old-Rubbish-I-Made-Up'
+        edge_err_msg = 'line 6: expected 5 cols, found 3'
         with self.assertRaisesRegex(RuntimeError, edge_err_msg):
             parser.load_edges()
 
         # invalid node type
-        node_err_msg = 'line 4: invalid node type: Monkey'
+        node_err_msg = 'line 3: expected 20 cols, found 22'
         with self.assertRaisesRegex(RuntimeError, node_err_msg):
             parser.load_node_metadata()
 
 
-    def test_load_col_count_errors(self):
-        """ test files with invalid numbers of columns """
+    def test_load_invalid_types(self):
+        """ test file format errors """
 
-        # path: test/djornl/col_count_errors
-        RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'col_count_errors')
+        # path: test/djornl/invalid_types
+        RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'invalid_types')
         parser = self.init_parser_with_path(RES_ROOT_DATA_PATH)
 
         # invalid edge type
-        edge_err_msg = 'line 6: expected 5 cols, found 3'
+        edge_err_msg = 'merged_edges-AMW-060820_AF.tsv line 3: invalid edge type: AraGWAS-Some-Old-Rubbish-I-Made-Up'
         with self.assertRaisesRegex(RuntimeError, edge_err_msg):
             parser.load_edges()
 
         # invalid node type
-        node_err_msg = 'line 3: expected 20 cols, found 22'
+        node_err_msg = 'aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv line 5: invalid node type: Monkey'
         with self.assertRaisesRegex(RuntimeError, node_err_msg):
             parser.load_node_metadata()
 
@@ -112,8 +130,6 @@ def test_load_valid_edge_data(self):
         RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'test_data')
         parser = self.init_parser_with_path(RES_ROOT_DATA_PATH)
 
-        self.maxDiff = None
-
         edge_data = parser.load_edges()
         self.assertEqual(
             edge_data,
@@ -142,3 +158,10 @@ def test_load_valid_cluster_data(self):
             cluster_data,
             self.json_data["load_cluster_data"]
         )
+
+    def test_load_valid_node_metadata(self):
+
+        RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'test_data')
+        parser = self.init_parser_with_path(RES_ROOT_DATA_PATH)
+
+        parser.check_data_delta()

From 9892d70ab982b09b3489a1f3329222beaee156e1 Mon Sep 17 00:00:00 2001
From: ialarmedalien <ialarmedalien@gmail.com>
Date: Mon, 20 Jul 2020 15:41:23 -0700
Subject: [PATCH 3/4] fixing LGTM errors

---
 importers/djornl/parser.py                | 5 -----
 test/stored_queries/test_djornl.py        | 4 +---
 test/stored_queries/test_djornl_parser.py | 8 +-------
 3 files changed, 2 insertions(+), 15 deletions(-)

diff --git a/importers/djornl/parser.py b/importers/djornl/parser.py
index ebf7ed1..5295fdb 100644
--- a/importers/djornl/parser.py
+++ b/importers/djornl/parser.py
@@ -9,11 +9,7 @@
 import os
 import csv
 import yaml
-import json
-import jsonschema
-
 from jsonschema.validators import Draft7Validator
-from jsonschema.exceptions import ValidationError
 
 import importers.utils.config as config
 
@@ -54,7 +50,6 @@ def _configure(self):
 
         validator = Draft7Validator(manifest_schema)
         if not validator.is_valid(manifest):
-            error_list = []
             raise RuntimeError(
                 "The manifest file failed validation with the following errors:\n"
                 + "\n".join(e.message for e in sorted(validator.iter_errors(manifest), key=str))
diff --git a/test/stored_queries/test_djornl.py b/test/stored_queries/test_djornl.py
index b6468ba..7c502a3 100644
--- a/test/stored_queries/test_djornl.py
+++ b/test/stored_queries/test_djornl.py
@@ -6,10 +6,8 @@
 import unittest
 import requests
 import os
-import glob
-import yaml
 
-from test.helpers import get_config, assert_subset, modified_environ
+from test.helpers import get_config, modified_environ
 from test.stored_queries.helpers import create_test_docs
 from importers.djornl.parser import DJORNL_Parser
 
diff --git a/test/stored_queries/test_djornl_parser.py b/test/stored_queries/test_djornl_parser.py
index 347e17d..91dfdb5 100644
--- a/test/stored_queries/test_djornl_parser.py
+++ b/test/stored_queries/test_djornl_parser.py
@@ -10,8 +10,6 @@
 import requests
 import os
 import contextlib
-from jsonschema.exceptions import ValidationError
-
 from importers.djornl.parser import DJORNL_Parser
 
 from test.helpers import get_config, assert_subset, modified_environ
@@ -159,9 +157,5 @@ def test_load_valid_cluster_data(self):
             self.json_data["load_cluster_data"]
         )
 
-    def test_load_valid_node_metadata(self):
-
-        RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'test_data')
-        parser = self.init_parser_with_path(RES_ROOT_DATA_PATH)
-
         parser.check_data_delta()
+

From e293889c7828ae444e7fce4ca8fdcb700501f37c Mon Sep 17 00:00:00 2001
From: ialarmedalien <ialarmedalien@gmail.com>
Date: Mon, 20 Jul 2020 17:37:54 -0700
Subject: [PATCH 4/4] Adding empty placeholder file so the invalid_file/edges.tsv directory shows up in git

---
 test/djornl/invalid_file/edges.tsv/touch | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 test/djornl/invalid_file/edges.tsv/touch

diff --git a/test/djornl/invalid_file/edges.tsv/touch b/test/djornl/invalid_file/edges.tsv/touch
new file mode 100644
index 0000000..e69de29