This repository has been archived by the owner on Aug 29, 2020. It is now read-only.

Reorganising cluster data #144

Open
wants to merge 4 commits into develop
Changes from 1 commit
2 changes: 2 additions & 0 deletions Makefile
@@ -1,4 +1,6 @@
.PHONY: test

test:
	docker-compose down
	docker-compose run spec sh /app/test/run_tests.sh
	docker-compose down
Comment on lines +4 to +6
Collaborator Author

Make sure that any Docker containers left hanging around don't accidentally contaminate test runs.

47 changes: 36 additions & 11 deletions importers/djornl/parser.py
@@ -43,15 +43,15 @@ def _configure(self):

_CLUSTER_BASE = os.path.join(configuration['ROOT_DATA_PATH'], 'cluster_data')
configuration['_CLUSTER_PATHS'] = {
'cluster_I2': os.path.join(
'markov_i2': os.path.join(
_CLUSTER_BASE,
'out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I2_named.tsv'
),
'cluster_I4': os.path.join(
'markov_i4': os.path.join(
_CLUSTER_BASE,
'out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I4_named.tsv'
),
'cluster_I6': os.path.join(
'markov_i6': os.path.join(
_CLUSTER_BASE,
'out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I6_named.tsv'
),
@@ -163,24 +163,49 @@ def load_node_metadata(self):

def load_cluster_data(self):
"""Annotate genes with cluster ID fields."""
nodes = []

# index of nodes
node_ix = {}
cluster_paths = self.config()['_CLUSTER_PATHS']
for (cluster_label, path) in cluster_paths.items():
with open(path) as fd:
csv_reader = csv.reader(fd, delimiter='\t')
for row in csv_reader:
if len(row) > 1:
# remove the 'Cluster' text
cluster_id = row[0].replace('Cluster','')
gene_keys = row[1:]
nodes += [
{'_key': key, cluster_label: int(cluster_id)}
for key in gene_keys
]
self._parse_cluster_row(row, cluster_label, node_ix)


# gather a list of cluster IDs for each node
nodes = []
for (key, cluster_data) in node_ix.items():
clusters = []
for (cluster_label, id_list) in cluster_data.items():
clusters += [cluster_label + ":" + id for id in id_list]

nodes += [{
'_key': key,
'clusters': clusters
}]

return {'nodes': nodes}


def _parse_cluster_row(self, row, cluster_label, node_ix):
# metadata rows start with '#'
if row[0] != '#':
# remove the 'Cluster' text
cluster_id = row[0].replace('Cluster','')
node_keys = row[1:]

for key in node_keys:
if key not in node_ix:
node_ix[key] = {}
if cluster_label not in node_ix[key]:
node_ix[key][cluster_label] = []

node_ix[key][cluster_label].append(cluster_id)


def save_dataset(self, dataset):

if 'nodes' in dataset and len(dataset['nodes']) > 0:
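For reference, here is a minimal standalone sketch of the data flow the reworked parser implements: each cluster file row is folded into a per-node index, then flattened into `clusters` lists of the form `label:id`. The sample file contents, labels, and gene keys below are invented for illustration and are not part of the diff.

```python
import csv
import io

# Invented sample cluster files, keyed by cluster label (real files are TSVs
# where each data row looks like "Cluster<N>\tgene1\tgene2...").
SAMPLE_FILES = {
    'markov_i2': "Cluster1\tAT1G01010\tAT1G01020\n",
    'markov_i6': "#\tmetadata row, skipped\nCluster3\tAT1G01010\n",
}

def parse_cluster_row(row, cluster_label, node_ix):
    """Mirror of _parse_cluster_row: skip metadata rows, strip the 'Cluster'
    prefix, and record the cluster ID under each node key."""
    if row[0] != '#':
        cluster_id = row[0].replace('Cluster', '')
        for key in row[1:]:
            node_ix.setdefault(key, {}).setdefault(cluster_label, []).append(cluster_id)

node_ix = {}
for cluster_label, content in SAMPLE_FILES.items():
    for row in csv.reader(io.StringIO(content), delimiter='\t'):
        parse_cluster_row(row, cluster_label, node_ix)

# Flatten the index into node documents, as load_cluster_data now does.
nodes = [
    {'_key': key,
     'clusters': [f'{label}:{cid}' for label, ids in data.items() for cid in ids]}
    for key, data in node_ix.items()
]
print(nodes)
# [{'_key': 'AT1G01010', 'clusters': ['markov_i2:1', 'markov_i6:3']},
#  {'_key': 'AT1G01020', 'clusters': ['markov_i2:1']}]
```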
23 changes: 8 additions & 15 deletions schemas/djornl/djornl_node.yaml
@@ -13,21 +13,14 @@ schema:
type: string
title: Key
examples: ["AT1G01010"]
cluster_I2:
type: integer
title: Cluster 2 ID
description: Iterative random forest cluster group ID
examples: [1]
cluster_I4:
type: integer
title: Cluster 4 ID
description: Iterative random forest cluster group ID
examples: [13]
cluster_I6:
type: integer
title: Cluster 6 ID
description: Iterative random forest cluster group ID
examples: [27]
clusters:
type: array
title: Clusters
description: Clusters to which the node has been assigned
items:
type: string
# pattern: "^\w+:\d+$"
examples: [["markov_i2:1", "markov_i4:5"], ["markov_i6:3"]]
node_type:
type: string
title: Node type
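For context, a minimal sketch of a node document that matches the revised schema, checked with the third-party `jsonschema` package. The inline schema fragment below is a trimmed, hand-copied version of djornl_node.yaml (with the commented-out `pattern` enabled), not the schema the importer actually loads.

```python
import jsonschema  # assumed available for this sketch

# Trimmed copy of the revised node schema, for illustration only.
node_schema = {
    'type': 'object',
    'properties': {
        '_key': {'type': 'string'},
        'clusters': {
            'type': 'array',
            'items': {'type': 'string', 'pattern': r'^\w+:\d+$'},
        },
    },
}

# A node annotated by two clustering runs, as the parser now emits.
node = {'_key': 'AT1G01010', 'clusters': ['markov_i2:1', 'markov_i6:3']}
jsonschema.validate(instance=node, schema=node_schema)  # raises ValidationError on mismatch
```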
26 changes: 7 additions & 19 deletions stored_queries/djornl/djornl_fetch_clusters.yaml
@@ -2,25 +2,13 @@ name: djornl_fetch_clusters
description: Fetch all nodes that are members of the specified cluster(s), and the edges and nodes within the specified distance (number of hops) of those nodes.
params:
type: object
required: [cluster_ids]
properties:
cluster_i2_ids:
title: Cluster I2 IDs
description: Cluster I2 IDs to locate
items: {type: integer}
default: []
examples: [[1], [3, 5]]
cluster_i4_ids:
title: Cluster I4 IDs
description: Cluster I4 IDs to locate
items: {type: integer}
examples: [[2], [4, 6]]
default: []
cluster_i6_ids:
title: Cluster I6 IDs
description: Cluster I6 IDs to locate
items: {type: integer}
examples: [[666], [999, 333]]
default: []
cluster_ids:
title: Cluster IDs
description: Cluster IDs, in the form "clustering_system_name:cluster_id"
items: {type: string}
examples: [['markov_i2:5', 'markov_i6:2'],['markov_i6:1']]
distance:
type: integer
title: Traversal Distance
@@ -31,7 +19,7 @@ params:
query: |
LET node_ids = (
FOR n IN djornl_node
FILTER n.cluster_I2 IN @cluster_i2_ids OR n.cluster_I4 IN @cluster_i4_ids OR n.cluster_I6 IN @cluster_i6_ids
FILTER n.clusters ANY IN @cluster_ids
FOR node IN 0..@distance ANY n djornl_edge
OPTIONS {bfs: true, uniqueVertices: "global"}
RETURN DISTINCT node._id
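A hedged sketch of how a client might now call this stored query: cluster IDs are passed as a single list of `clustering_system_name:cluster_id` strings rather than three separate integer lists. The endpoint URL, query-string parameter name, and HTTP client below are assumptions for illustration, not something defined in this PR.

```python
import requests  # assumed HTTP client for this sketch

# Hypothetical relation engine endpoint; adjust to the real deployment.
RE_API_URL = 'http://localhost:5000/api/v1/query_results'

resp = requests.post(
    RE_API_URL,
    params={'stored_query': 'djornl_fetch_clusters'},
    json={
        # cluster IDs now use the "clustering_system_name:cluster_id" form
        'cluster_ids': ['markov_i2:5', 'markov_i6:2'],
        'distance': 1,
    },
)
resp.raise_for_status()
print(resp.json())
```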