-
Notifications
You must be signed in to change notification settings - Fork 0
/
accuracy.py
164 lines (117 loc) · 5.99 KB
/
accuracy.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
from itertools import product
from scipy import sparse as sp
from munkres import Munkres
from math import sqrt
import numpy as np
import sys
def clustering_error(predicted_biclustering, reference_biclustering, num_rows, num_cols):
    """The Clustering Error (CE) external evaluation measure.

    CE computes the similarity between two subspace clusterings. This measure was originally
    introduced in (Patrikainen and Meila, 2006) as a dissimilarity measure. In this package, it
    was implemented as a similarity measure as presented in (Horta and Campello, 2014). This measure
    lies in the interval [0, 1], where values close to 1 indicate better biclustering solutions.

    The score is ``dmax / union_size``, where ``union_size`` comes from
    ``_calculate_size(..., 'union')`` and ``dmax`` from ``_calculate_dmax``.

    Reference
    ---------
    Patrikainen, A., & Meila, M. (2006). Comparing subspace clusterings. IEEE Transactions on
    Knowledge and Data Engineering, 18(7), 902-916.

    Horta, D., & Campello, R. J. G. B. (2014). Similarity measures for comparing biclusterings.
    IEEE/ACM Transactions on Computational Biology and Bioinformatics, 11(5), 942-954.

    Parameters
    ----------
    predicted_biclustering : biclustlib.model.Biclustering
        Predicted biclustering solution.

    reference_biclustering : biclustlib.model.Biclustering
        Reference biclustering solution.

    num_rows : int
        Number of rows of the dataset.

    num_cols : int
        Number of columns of the dataset.

    Returns
    -------
    ce : float
        Similarity score between 0.0 and 1.0. When the union size is zero (neither solution
        covers any cell of the data matrix), the two solutions are treated as identical and
        1.0 is returned.
    """
    union_size = _calculate_size(predicted_biclustering, reference_biclustering, num_rows, num_cols, 'union')

    # An empty union would make the division below produce nan (numpy integer
    # division by zero) instead of a score in [0, 1]. Two solutions that both
    # cover nothing are indistinguishable, so report perfect agreement and skip
    # the assignment problem entirely.
    if union_size == 0:
        return 1.0

    dmax = _calculate_dmax(predicted_biclustering, reference_biclustering)
    return float(dmax) / union_size
def _calculate_size(predicted_biclustering, reference_biclustering, num_rows, num_cols, operation):
    """Return the size of the union or intersection of two biclusterings.

    Each biclustering is first turned into a per-cell coverage count matrix
    (see ``_count_biclusters``). The union size is the sum of the element-wise
    maximum of both matrices; the intersection size is the sum of the
    element-wise minimum.

    Parameters
    ----------
    predicted_biclustering, reference_biclustering
        Objects exposing a ``biclusters`` collection.
    num_rows, num_cols : int
        Dimensions of the data matrix.
    operation : str
        Either ``'union'`` or ``'intersection'``.

    Raises
    ------
    ValueError
        If ``operation`` is neither ``'union'`` nor ``'intersection'``.
    """
    predicted_cover = _count_biclusters(predicted_biclustering, num_rows, num_cols)
    reference_cover = _count_biclusters(reference_biclustering, num_rows, num_cols)

    # Pick the element-wise combiner for the requested operation.
    if operation == 'union':
        combine = np.maximum
    elif operation == 'intersection':
        combine = np.minimum
    else:
        valid_operations = ('union', 'intersection')
        raise ValueError("operation must be one of {0}, got {1}".format(valid_operations, operation))

    return np.sum(combine(predicted_cover, reference_cover))
def _calculate_dmax(predicted_biclustering, reference_biclustering):
    """Return the total cell overlap of the best one-to-one bicluster matching.

    Every predicted bicluster is compared with every reference bicluster by
    the number of (row, col) cells they share. ``Munkres`` then picks the
    assignment, and the shared-cell counts of the chosen pairs are summed.
    """
    predicted_cells = _bic2sets(predicted_biclustering)
    reference_cells = _bic2sets(reference_biclustering)

    # overlaps[i][j] = number of cells shared by predicted i and reference j.
    overlaps = [
        [len(pred.intersection(ref)) for ref in reference_cells]
        for pred in predicted_cells
    ]

    # The overlaps are flipped into costs by subtracting them from a large
    # constant, so that the lowest-cost assignment is the largest-overlap one.
    costs = [[sys.maxsize - shared for shared in row] for row in overlaps]
    assignment = Munkres().compute(costs)

    return sum(overlaps[i][j] for i, j in assignment)
def _count_biclusters(biclustering, num_rows, num_cols):
    """Count, for each cell of the data matrix, how many biclusters contain it.

    Parameters
    ----------
    biclustering
        Object exposing ``biclusters``; each bicluster exposes ``rows`` and
        ``cols``, which are used as index sequences into the matrix.
    num_rows, num_cols : int
        Dimensions of the data matrix.

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(num_rows, num_cols)``.
    """
    coverage = np.zeros(shape=(num_rows, num_cols), dtype=int)

    for bicluster in biclustering.biclusters:
        # Open mesh selecting the rows x cols sub-matrix of this bicluster.
        cell_block = np.ix_(bicluster.rows, bicluster.cols)
        coverage[cell_block] += 1

    return coverage
def _bic2sets(biclust):
    """Convert each bicluster into the set of (row, col) cells it covers.

    Returns a list with one set per bicluster, in the order of
    ``biclust.biclusters``.
    """
    cell_sets = []
    for bicluster in biclust.biclusters:
        # Cartesian product of the bicluster's rows and columns.
        cells = set(product(bicluster.rows, bicluster.cols))
        cell_sets.append(cells)
    return cell_sets
def liu_wang_match_score(predicted_biclustering, reference_biclustering):
    """Liu & Wang match score.

    For every predicted bicluster, the best similarity to any reference bicluster is taken,
    where the similarity of a pair is

        (|rows_p & rows_r| + |cols_p & cols_r|) / (|rows_p | rows_r| + |cols_p | cols_r|)

    The score is the average of these best similarities over the predicted biclusters.

    Reference
    ---------
    Liu, X., & Wang, L. (2006). Computing the maximum similarity bi-clusters of gene expression data.
    Bioinformatics, 23(1), 50-56.

    Horta, D., & Campello, R. J. G. B. (2014). Similarity measures for comparing biclusterings.
    IEEE/ACM Transactions on Computational Biology and Bioinformatics, 11(5), 942-954.

    Parameters
    ----------
    predicted_biclustering : biclustlib.model.Biclustering
        Predicted biclustering solution.

    reference_biclustering : biclustlib.model.Biclustering
        Reference biclustering solution.

    Returns
    -------
    lw_match_score : float
        Liu and Wang match score between 0.0 and 1.0. Degenerate inputs are scored by
        convention instead of raising: 1.0 when both solutions contain no biclusters,
        0.0 when exactly one of them contains no biclusters.
    """
    predicted = predicted_biclustering.biclusters
    reference = reference_biclustering.biclusters
    num_predicted = len(predicted)
    num_reference = len(reference)

    # Without this guard an empty predicted solution divides by zero and an
    # empty reference solution makes max() fail on an empty sequence.
    if num_predicted == 0 or num_reference == 0:
        return 1.0 if num_predicted == num_reference else 0.0

    def similarity(bp, br):
        shared = len(np.intersect1d(bp.rows, br.rows)) + len(np.intersect1d(bp.cols, br.cols))
        total = len(np.union1d(bp.rows, br.rows)) + len(np.union1d(bp.cols, br.cols))
        # Two biclusters with no rows and no columns have an empty union;
        # score the pair 0.0 rather than dividing by zero.
        return shared / total if total else 0.0

    best_matches = (max(similarity(bp, br) for br in reference) for bp in predicted)
    return sum(best_matches) / num_predicted
def prelic_relevance(predicted_biclustering, reference_biclustering):
    """The overall relevance match score defined in the supplementary material of Prelic et al. (2006).

    This score reflects how well the predicted biclusters represent the reference biclusters in both
    dimensions (rows and columns). It is the geometric mean of the row match score and the column
    match score, each obtained from ``_match_score``. This measure lies in the interval [0, 1], where
    values close to 1 indicate better biclustering solutions.

    Reference
    ---------
    Prelic, A., Bleuler, S., Zimmermann, P., Wille, A., Buhlmann, P., Gruissem, W., Hennig, L., Thiele, L. &
    Zitzler, E. (2006). A systematic comparison and evaluation of biclustering methods for gene expression data.
    Bioinformatics, 22(9), 1122-1129.

    Horta, D., & Campello, R. J. G. B. (2014). Similarity measures for comparing biclusterings.
    IEEE/ACM Transactions on Computational Biology and Bioinformatics, 11(5), 942-954.

    Parameters
    ----------
    predicted_biclustering : biclustlib.model.Biclustering
        Predicted biclustering solution.

    reference_biclustering : biclustlib.model.Biclustering
        Reference biclustering solution.

    Returns
    -------
    prel : float
        Similarity score between 0.0 and 1.0.
    """
    # Score the row dimension first, then the column dimension.
    row_score, col_score = (
        _match_score(predicted_biclustering, reference_biclustering, dimension)
        for dimension in ('rows', 'cols')
    )
    return sqrt(row_score * col_score)
def _match_score(predicted_biclustering, reference_biclustering, bicluster_attr):
    """Average best Jaccard match of predicted biclusters along one dimension.

    For every predicted bicluster, the highest Jaccard index between its
    ``bicluster_attr`` values and those of any reference bicluster is taken;
    the result is the mean of these maxima over the predicted biclusters.

    Parameters
    ----------
    predicted_biclustering, reference_biclustering
        Objects exposing a ``biclusters`` collection.
    bicluster_attr : str
        Name of the bicluster attribute to compare (the callers in this
        module pass ``'rows'`` or ``'cols'``).

    Returns
    -------
    float
        Score between 0.0 and 1.0. Degenerate inputs are scored by convention
        instead of raising: 1.0 when both solutions contain no biclusters,
        0.0 when exactly one of them contains no biclusters.
    """
    predicted = predicted_biclustering.biclusters
    reference = reference_biclustering.biclusters
    num_predicted = len(predicted)
    num_reference = len(reference)

    # Without this guard an empty predicted solution divides by zero and an
    # empty reference solution makes max() fail on an empty sequence.
    if num_predicted == 0 or num_reference == 0:
        return 1.0 if num_predicted == num_reference else 0.0

    def jaccard(bp, bt):
        pred_items = getattr(bp, bicluster_attr)
        true_items = getattr(bt, bicluster_attr)
        union_size = len(np.union1d(pred_items, true_items))
        # Both sides empty along this dimension: score 0.0 rather than
        # dividing by zero.
        if union_size == 0:
            return 0.0
        return len(np.intersect1d(pred_items, true_items)) / union_size

    best_matches = (max(jaccard(bp, bt) for bt in reference) for bp in predicted)
    return sum(best_matches) / num_predicted