This repository has been archived by the owner on Aug 29, 2020. It is now read-only.

Reorganising cluster data #144

Open
wants to merge 4 commits into develop
Changes from 1 commit
2 changes: 2 additions & 0 deletions Makefile
@@ -1,4 +1,6 @@
.PHONY: test

test:
	docker-compose down
	docker-compose run spec sh /app/test/run_tests.sh
	docker-compose down
Comment on lines +4 to +6
Collaborator Author

Make sure that any Docker containers left hanging around don't accidentally contaminate test runs.

47 changes: 36 additions & 11 deletions importers/djornl/parser.py
@@ -43,15 +43,15 @@ def _configure(self):

_CLUSTER_BASE = os.path.join(configuration['ROOT_DATA_PATH'], 'cluster_data')
configuration['_CLUSTER_PATHS'] = {
'cluster_I2': os.path.join(
'markov_i2': os.path.join(
_CLUSTER_BASE,
'out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I2_named.tsv'
),
'cluster_I4': os.path.join(
'markov_i4': os.path.join(
_CLUSTER_BASE,
'out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I4_named.tsv'
),
'cluster_I6': os.path.join(
'markov_i6': os.path.join(
_CLUSTER_BASE,
'out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I6_named.tsv'
),
@@ -163,24 +163,49 @@ def load_node_metadata(self):

def load_cluster_data(self):
"""Annotate genes with cluster ID fields."""
nodes = []

# index of nodes
node_ix = {}
cluster_paths = self.config()['_CLUSTER_PATHS']
for (cluster_label, path) in cluster_paths.items():
with open(path) as fd:
csv_reader = csv.reader(fd, delimiter='\t')
for row in csv_reader:
if len(row) > 1:
# remove the 'Cluster' text
cluster_id = row[0].replace('Cluster','')
gene_keys = row[1:]
nodes += [
{'_key': key, cluster_label: int(cluster_id)}
for key in gene_keys
]
self._parse_cluster_row(row, cluster_label, node_ix)


# gather a list of cluster IDs for each node
nodes = []
for (key, cluster_data) in node_ix.items():
clusters = []
for (cluster_label, id_list) in cluster_data.items():
clusters += [cluster_label + ":" + id for id in id_list]

nodes += [{
'_key': key,
'clusters': clusters
}]

return {'nodes': nodes}


def _parse_cluster_row(self, row, cluster_label, node_ix):
# metadata rows start with '#'
if row[0] != '#':
# remove the 'Cluster' text
cluster_id = row[0].replace('Cluster','')
node_keys = row[1:]

for key in node_keys:
if key not in node_ix:
node_ix[key] = {}
if cluster_label not in node_ix[key]:
node_ix[key][cluster_label] = []

node_ix[key][cluster_label].append(cluster_id)


def save_dataset(self, dataset):

if 'nodes' in dataset and len(dataset['nodes']) > 0:
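For reference, here is a minimal standalone sketch of the data flow the reworked parser implements: each cluster file row is folded into a per-node index, then flattened into `clusters` lists of the form `label:id`. The sample file contents, labels, and gene keys below are invented for illustration and are not part of the diff.

```python
import csv
import io

# Invented sample cluster files, keyed by cluster label (real files are TSVs
# where each data row looks like "Cluster<N>\tgene1\tgene2...").
SAMPLE_FILES = {
    'markov_i2': "Cluster1\tAT1G01010\tAT1G01020\n",
    'markov_i6': "#\tmetadata row, skipped\nCluster3\tAT1G01010\n",
}

def parse_cluster_row(row, cluster_label, node_ix):
    """Mirror of _parse_cluster_row: skip metadata rows, strip the 'Cluster'
    prefix, and record the cluster ID under each node key."""
    if row[0] != '#':
        cluster_id = row[0].replace('Cluster', '')
        for key in row[1:]:
            node_ix.setdefault(key, {}).setdefault(cluster_label, []).append(cluster_id)

node_ix = {}
for cluster_label, content in SAMPLE_FILES.items():
    for row in csv.reader(io.StringIO(content), delimiter='\t'):
        parse_cluster_row(row, cluster_label, node_ix)

# Flatten the index into node documents, as load_cluster_data now does.
nodes = [
    {'_key': key,
     'clusters': [f'{label}:{cid}' for label, ids in data.items() for cid in ids]}
    for key, data in node_ix.items()
]
print(nodes)
# [{'_key': 'AT1G01010', 'clusters': ['markov_i2:1', 'markov_i6:3']},
#  {'_key': 'AT1G01020', 'clusters': ['markov_i2:1']}]
```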
23 changes: 8 additions & 15 deletions schemas/djornl/djornl_node.yaml
@@ -13,21 +13,14 @@ schema:
type: string
title: Key
examples: ["AT1G01010"]
cluster_I2:
type: integer
title: Cluster 2 ID
description: Iterative random forest cluster group ID
examples: [1]
cluster_I4:
type: integer
title: Cluster 4 ID
description: Iterative random forest cluster group ID
examples: [13]
cluster_I6:
type: integer
title: Cluster 6 ID
description: Iterative random forest cluster group ID
examples: [27]
clusters:
type: array
title: Clusters
description: Clusters to which the node has been assigned
items:
type: string
# pattern: "^\w+:\d+$"
examples: [["markov_i2:1", "markov_i4:5"], ["markov_i6:3"]]
node_type:
type: string
title: Node type
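For context, a minimal sketch of a node document that matches the revised schema, checked with the third-party `jsonschema` package. The inline schema fragment below is a trimmed, hand-copied version of djornl_node.yaml (with the commented-out `pattern` enabled), not the schema the importer actually loads.

```python
import jsonschema  # assumed available for this sketch

# Trimmed copy of the revised node schema, for illustration only.
node_schema = {
    'type': 'object',
    'properties': {
        '_key': {'type': 'string'},
        'clusters': {
            'type': 'array',
            'items': {'type': 'string', 'pattern': r'^\w+:\d+$'},
        },
    },
}

# A node annotated by two clustering runs, as the parser now emits.
node = {'_key': 'AT1G01010', 'clusters': ['markov_i2:1', 'markov_i6:3']}
jsonschema.validate(instance=node, schema=node_schema)  # raises ValidationError on mismatch
```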
26 changes: 7 additions & 19 deletions stored_queries/djornl/djornl_fetch_clusters.yaml
@@ -2,25 +2,13 @@ name: djornl_fetch_clusters
description: Fetch all nodes that are members of the specified cluster(s), and the edges and nodes within the specified distance (number of hops) of those nodes.
params:
type: object
required: [cluster_ids]
properties:
cluster_i2_ids:
title: Cluster I2 IDs
description: Cluster I2 IDs to locate
items: {type: integer}
default: []
examples: [[1], [3, 5]]
cluster_i4_ids:
title: Cluster I4 IDs
description: Cluster I4 IDs to locate
items: {type: integer}
examples: [[2], [4, 6]]
default: []
cluster_i6_ids:
title: Cluster I6 IDs
description: Cluster I6 IDs to locate
items: {type: integer}
examples: [[666], [999, 333]]
default: []
cluster_ids:
title: Cluster IDs
description: Cluster IDs, in the form "clustering_system_name:cluster_id"
items: {type: string}
examples: [['markov_i2:5', 'markov_i6:2'],['markov_i6:1']]
distance:
type: integer
title: Traversal Distance
@@ -31,7 +19,7 @@ params:
query: |
LET node_ids = (
FOR n IN djornl_node
FILTER n.cluster_I2 IN @cluster_i2_ids OR n.cluster_I4 IN @cluster_i4_ids OR n.cluster_I6 IN @cluster_i6_ids
FILTER n.clusters ANY IN @cluster_ids
FOR node IN 0..@distance ANY n djornl_edge
OPTIONS {bfs: true, uniqueVertices: "global"}
RETURN DISTINCT node._id
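A hedged sketch of how a client might now call this stored query: cluster IDs are passed as a single list of `clustering_system_name:cluster_id` strings rather than three separate integer lists. The endpoint URL, query-string parameter name, and HTTP client below are assumptions for illustration, not something defined in this PR.

```python
import requests  # assumed HTTP client for this sketch

# Hypothetical relation engine endpoint; adjust to the real deployment.
RE_API_URL = 'http://localhost:5000/api/v1/query_results'

resp = requests.post(
    RE_API_URL,
    params={'stored_query': 'djornl_fetch_clusters'},
    json={
        # cluster IDs now use the "clustering_system_name:cluster_id" form
        'cluster_ids': ['markov_i2:5', 'markov_i6:2'],
        'distance': 1,
    },
)
resp.raise_for_status()
print(resp.json())
```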