evaluate_prototypes.py
__author__ = 'rwechsler'
import datetime
import time
import cPickle as pickle
from annoy import AnnoyIndex
import gensim
import argparse
import numpy as np
import sys
import random
from scipy import spatial
import multiprocessing as mp
from collections import defaultdict
import codecs
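
# Evaluation script for prefix/prototype candidates: for every (prefix, prototype)
# pair, the prototype's difference vector is added to the tail-word vector of each
# evidence pair, and the prediction is scored against the true compound vector via
# k-NN rank (Annoy tree or word2vec) and cosine similarity.
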
def timestamp():
    return datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')

def load_candidate_dump(file_name):
    return pickle.load(open(file_name, "rb"))

def load_annoy_tree(model_file_name, vector_dims):
    tree = AnnoyIndex(vector_dims)
    tree.load(model_file_name)
    return tree

def load_prototype_dump(file_name):
    return pickle.load(open(file_name, "rb"))

def load_word2vecmodel(file_name):
    return gensim.models.Word2Vec.load_word2vec_format(file_name, binary=True)
def get_rank_annoy_knn(annoy_tree, vector, true_index, k=100):
    # 1-based rank of the true word among the k nearest neighbours in the Annoy tree; 0 if it is not found.
    neighbours = annoy_tree.get_nns_by_vector(list(vector), k)
    try:
        return neighbours.index(true_index) + 1
    except ValueError:
        return 0

def get_rank_word2vec_knn(word2vec_model, vector, true_index, k=100):
    # Same as above, but searching with the word2vec model's most_similar.
    neighbours, _ = zip(*word2vec_model.most_similar(positive=[vector], topn=k))
    try:
        return neighbours.index(word2vec_model.index2word[true_index]) + 1
    except ValueError:
        return 0

def candidate_generator(evaluation_set, rank_threshold, sim_threshold):
    # Yields one argument tuple per (prefix, prototype) pair for evaluate_set.
    for prefix_prototype_pair in evaluation_set:
        yield (prefix_prototype_pair, evaluation_set[prefix_prototype_pair], rank_threshold, sim_threshold)

def mp_wrapper_evaluate_set(argument):
    return evaluate_set(*argument)
def get_nn_hitrate(ranks):
    # Fraction of evidence pairs whose true compound appeared among the k nearest neighbours at all.
    return (len(ranks) - ranks.count(0)) / float(len(ranks))

def get_sim_hitrate(similarities, threshold):
    # Fraction of evidence pairs whose predicted vector reaches the cosine similarity threshold.
    return np.sum([1 for s in similarities if s >= threshold]) / float(len(similarities))

def get_average_rank(ranks):
    # Average rank over the pairs that were found (rank > 0); 0.0 if none were found.
    return np.mean([r for r in ranks if r > 0] or 0)

def get_average_similarity(similarities):
    return np.mean(similarities)

def get_hitrate(ranks, similarities, threshold):
    # Fraction of evidence pairs that satisfy both the rank and the similarity criterion.
    count = 0
    for i, r in enumerate(ranks):
        if r > 0 and similarities[i] >= threshold:
            count += 1
    return count / float(len(ranks))
def get_word_representation(prefix, comp_index, tail_index, word2vec_model):
    # Render a compound as "[linking element]tail": the prefix is stripped and any
    # material between prefix and tail is shown in brackets.
    comp = word2vec_model.index2word[comp_index]
    tail = word2vec_model.index2word[tail_index]
    fl = comp[len(prefix):-len(tail)]
    if fl:
        fl = "[" + fl + "]"
    return fl + tail

if __name__ == "__main__":
    #### Default Parameters-------------------------------------------####
    rank_threshold = 30
    vector_dims = 500
    sim_threshold = 0.5
    sample_set_size = np.inf  # no sampling unless -s is given
    n_processes = 2
    ####End-Parameters------------------------------------------------####
parser = argparse.ArgumentParser(description='Evaluate candidates')
parser.add_argument('-w', action='store', dest="word2vec_file", required=True)
parser.add_argument('-v', action="store", dest="prototypes_file", required=True)
parser.add_argument('-d', action="store", dest="vector_dims", type=int, default=vector_dims)
parser.add_argument('-t', action="store", dest="annoy_tree_file")
parser.add_argument('-c', action="store", dest="candidates_index_file")
parser.add_argument('-o', action="store", dest="result_output_file", required=True)
parser.add_argument('-p', action="store", dest="n_processes", type=int, default=n_processes)
parser.add_argument('-s', action="store", dest="sample_set_size", type=int, default=sample_set_size)
parser.add_argument('-r', action="store", dest="rank_threshold", type=int, default=rank_threshold)
parser.add_argument('-z', action="store", dest="sim_threshold", type=float, default=sim_threshold)
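    # Example invocation (file names below are placeholders, not files shipped with the repo):
    #   python evaluate_prototypes.py -w vectors.bin -v prototypes.p -t tree.ann -d 500 \
    #       -o results.txt -p 4 -s 500 -r 30 -z 0.5
    # Without -t the word2vec model itself is used for the k-NN search; without -c the
    # evidence sets stored with the prototypes are evaluated instead of external candidates.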
arguments = parser.parse_args(sys.argv[1:])
print timestamp(), "loading word2vec model"
word2vec_model = load_word2vecmodel(arguments.word2vec_file)
print timestamp(), "loading prototypes"
prototypes = load_prototype_dump(arguments.prototypes_file)
if arguments.candidates_index_file:
print timestamp(), "loading candidates"
candidates = load_candidate_dump(arguments.candidates_index_file)
evaluation_set = dict()
    # keys are (prefix, prototype) pairs; values are the index pairs to evaluate
    # (the prefix's candidate set if -c was given, otherwise the prototype's own evidence set)
for prefix in prototypes:
for prototype, evidence_set in prototypes[prefix]:
if arguments.candidates_index_file:
evaluation_set[(prefix, prototype)] = candidates[prefix]
else:
evaluation_set[(prefix, prototype)] = evidence_set
print timestamp(), "preprocess candidates"
    # Only keep the vectors we actually need, and sample each set down to sample_set_size first.
word2vec_vectors = dict()
for prototype_tup in evaluation_set:
if len(evaluation_set[prototype_tup]) > arguments.sample_set_size:
evaluation_set[prototype_tup] = set(random.sample(evaluation_set[prototype_tup], arguments.sample_set_size))
for (i,j) in evaluation_set[prototype_tup]:
word2vec_vectors[i] = np.array(word2vec_model.syn0[i])
word2vec_vectors[j] = np.array(word2vec_model.syn0[j])
word2vec_vectors[prototype_tup[1][0]] = np.array(word2vec_model.syn0[prototype_tup[1][0]])
word2vec_vectors[prototype_tup[1][1]] = np.array(word2vec_model.syn0[prototype_tup[1][1]])
print timestamp(), "number of vectors: ", len(word2vec_vectors)
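    # Choose the k-NN backend: the Annoy index if one was supplied, otherwise the word2vec model itself.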
if arguments.annoy_tree_file and arguments.vector_dims:
del word2vec_model
print timestamp(), "loading annoy tree"
model = load_annoy_tree(arguments.annoy_tree_file, arguments.vector_dims)
knn_method = get_rank_annoy_knn
else:
print timestamp(), "using word2vec model"
model = word2vec_model
knn_method = get_rank_word2vec_knn

    def evaluate_set(prefix_prototype_pair, evidence_set, rank_threshold=100, sim_threshold=0.5):
        global model
        global word2vec_vectors
        ranks = []
        similarities = []
        prefix, vector_pair = prefix_prototype_pair
        # difference vector of the prototype pair, added to each tail vector as the prediction
        diff = word2vec_vectors[vector_pair[0]] - word2vec_vectors[vector_pair[1]]
        for comp, tail in evidence_set:
            predicted = word2vec_vectors[tail] + diff
            true_vector = word2vec_vectors[comp]
            rank = knn_method(model, predicted, comp, rank_threshold)
            ranks.append(rank)
            # scipy's cosine() is a distance; convert it to a similarity so the
            # thresholds and averages below are cosine similarities as intended
            sim = 1 - spatial.distance.cosine(predicted, true_vector)
            similarities.append(sim)
        # returns hitrate, hitrate_nn, hitrate_sim, average_rank_if_found, average_similarity_if_found
        results = (get_hitrate(ranks, similarities, threshold=sim_threshold),
                   get_nn_hitrate(ranks),
                   get_sim_hitrate(similarities, threshold=sim_threshold),
                   get_average_rank(ranks),
                   get_average_similarity(similarities))
        return (prefix_prototype_pair, results)
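    # Evaluate all (prefix, prototype) pairs in parallel. The workers rely on a
    # fork-based Pool (Unix) inheriting model, knn_method and word2vec_vectors.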
print timestamp(), "evaluating candidates"
pool = mp.Pool(processes=arguments.n_processes)
params = candidate_generator(evaluation_set, arguments.rank_threshold, arguments.sim_threshold)
results = pool.map(mp_wrapper_evaluate_set, params)
pool.close()
pool.join()
del pool
print timestamp(), "pickling"
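    # Raw results are pickled first; note that the same output file is overwritten
    # with the readable tab-separated table further below.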
pickle.dump(results, open(arguments.result_output_file, "wb"))
if arguments.annoy_tree_file:
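        # the word2vec model was deleted when the Annoy tree was loaded above;
        # reload it to map vocabulary indices back to words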
print timestamp(), "loading word2vec model"
word2vec_model = load_word2vecmodel(arguments.word2vec_file)
else:
word2vec_model = model
print timestamp(), "mapping indices to word"
scores = defaultdict(dict)
for ((prefix, vector), eval_scores) in results:
vector_repr = get_word_representation(prefix, vector[0], vector[1], word2vec_model)
scores[prefix][vector_repr] = eval_scores
print timestamp(), "writing result file"
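    # One line per prototype: prefix, rendered prototype, then the five scores, tab-separated.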
outfile = codecs.open(arguments.result_output_file, "w", "utf-8")
for prefix in scores:
for vector in scores[prefix]:
outfile.write("\t".join([prefix, vector] + map(str, scores[prefix][vector])) + "\n")
outfile.close()
print timestamp(), "done"