-
Notifications
You must be signed in to change notification settings - Fork 2
/
util.py
311 lines (252 loc) · 7.45 KB
/
util.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
import operator
import itertools
import math
import collections
def all_combinations_with_len(lst, min_len, max_len):
    """
    Yields every ordered selection (with repetition) of items from `lst`
    whose length lies between `min_len` and `max_len`, inclusive.
    Shorter tuples are produced before longer ones.
    :param lst: iterable
    :param min_len: int
    :param max_len: int
    :return: generator of tuples
    """
    # Snapshot the input once so that one-shot iterators behave like lists.
    pool = tuple(lst)
    # `range` works on both Python 2 and 3 (`xrange` is Python 2 only).
    for length in range(min_len, max_len + 1):
        # Iterate the product lazily; there is no need to build a list first.
        for combo in itertools.product(*([pool] * length)):
            yield combo
def powerset(iterable):
    """
    Returns a lazy iterator over every subset of `iterable`, as tuples,
    ordered by increasing size.
    powerset([1,2,3]) --> () (1,) (2,) (3,) (1,2) (1,3) (2,3) (1,2,3)
    :param iterable: iterable
    :return: iterator of tuples
    """
    items = list(iterable)
    subsets_by_size = (
        itertools.combinations(items, size) for size in range(len(items) + 1)
    )
    return itertools.chain.from_iterable(subsets_by_size)
def flatten(lst):
    """
    Flattens one level of nesting.
    A list of lists becomes a single list; a list of sets becomes the union
    of those sets. The kind of result is chosen from the first element.
    An empty input is returned unchanged.
    :param lst: list[list] or list[set]
    :return: list or set
    :raises TypeError: if the first element is neither a list nor a set
    """
    if not lst:
        return lst
    first = lst[0]
    if isinstance(first, list):
        # Linear-time concatenation; also avoids the Python 2-only builtin `reduce`.
        return list(itertools.chain.from_iterable(lst))
    if isinstance(first, set):
        return set().union(*lst)
    # A bare `raise` here had no active exception to re-raise.
    raise TypeError("flatten expects a list of lists or a list of sets, "
                    "not a list of %s" % type(first).__name__)
def remove_duplicates(lst):
    """
    Returns a new list holding the first occurrence of each item of `lst`,
    in the original order. Items must be hashable.
    :param lst: iterable
    :return: list
    """
    unique = []
    already_seen = set()
    for item in lst:
        if item not in already_seen:
            already_seen.add(item)
            unique.append(item)
    return unique
def union_list_without_duplicates(lst1, lst2):
    """
    Returns a copy of `lst1` followed by those items of `lst2` that are not
    already present, each appended at most once. Duplicates that already
    exist inside `lst1` are kept as they are.
    :param lst1: list
    :param lst2: iterable
    :return: list
    """
    known = set(lst1)
    merged = list(lst1)
    for candidate in lst2:
        if candidate not in known:
            known.add(candidate)
            merged.append(candidate)
    return merged
def remove_duplicates_and_intersect(lst, target_set):
    """
    Returns the items of `lst` that also belong to `target_set`, keeping the
    first occurrence of each and preserving the order of `lst`.
    :param lst: iterable
    :param target_set: set
    :return: list
    """
    kept = []
    already_seen = set()
    for candidate in lst:
        if candidate not in already_seen and candidate in target_set:
            already_seen.add(candidate)
            kept.append(candidate)
    return kept
def clamp(minval, maxval, val):
    """
    Restricts `val` to the range [minval, maxval].
    The lower bound is checked first, so it wins if the bounds are inverted
    and `val` is below `minval`.
    :param minval: lower bound
    :param maxval: upper bound
    :param val: value to restrict
    :return: `minval`, `maxval` or `val`
    """
    if val < minval:
        result = minval
    elif val > maxval:
        result = maxval
    else:
        result = val
    return result
def sum_lists(a, b):
    """
    Adds two sequences element by element.
    The result is as long as the shorter input.
    :param a: list
    :param b: list
    :return: list
    """
    totals = []
    for left, right in zip(a, b):
        totals.append(left + right)
    return totals
def average(lst):
    """
    Returns the arithmetic mean of `lst`, or 0.0 when `lst` is empty.
    :param lst: list
    :return: float
    """
    if lst:
        total = sum(lst)
        return 1.0 * total / len(lst)
    return 0.0
def count_iter(iter):
    """
    Counts the items produced by `iter`. The iterator is consumed.
    :param iter: iterator
    :return: int
    """
    total = 0
    for _ in iter:
        total += 1
    return total
def all_subclasses(cls, _seen=None):
    """
    Generates every direct and indirect subclass of `cls`, depth first,
    yielding each class at most once.
    Because this is a generator, the TypeError for a non-class argument is
    raised when iteration starts, not when the function is called.
    :param cls: type
    :param _seen: set of classes already yielded (used by the recursion)
    :return: generator of types
    """
    if not isinstance(cls, type):
        raise TypeError("all_subclasses must be called with "
                        "new-style classes, not %.100r" % cls)
    visited = set() if _seen is None else _seen
    try:
        direct = cls.__subclasses__()
    except TypeError:  # fails only when cls is type
        direct = cls.__subclasses__(cls)
    for child in direct:
        if child in visited:
            continue
        visited.add(child)
        yield child
        # Descend into the child, sharing `visited` so nothing repeats.
        for descendant in all_subclasses(child, visited):
            yield descendant
def chunks(l, n):
    """
    Yields consecutive slices of `l`, each of length `n`.
    The last slice is shorter when len(l) is not a multiple of `n`.
    :param l: sequence supporting len() and slicing
    :param n: int, chunk size (must be non-zero)
    :return: generator of slices of `l`
    """
    # `range` works on both Python 2 and 3 (`xrange` is Python 2 only).
    for start in range(0, len(l), n):
        yield l[start:start + n]
def is_any_a_in_b(a, b):
    """
    Tells whether `a` and `b` share at least one item, i.e. len(a&b)>0,
    stopping at the first match.
    :param a: set
    :param b: set
    :return: bool
    """
    return any(item in b for item in a)
def describe_distribution(data, verbose=False):
    """
    Summarises a collection of numbers.
    The standard deviation is the population one (the squared deviations are
    averaged over len(data)).
    With `verbose`, the result also carries the minimum, maximum, the element
    at index len(data)//2 of the sorted data (the upper middle one for an
    even length) and the output of Counter(data).most_common(1), which is a
    list holding one (value, count) pair.
    :param data: list of numbers
    :param verbose: bool
    :return: (mean, std) or (mean, std, min, max, median, mode)
    """
    mean = average(data)
    variance = average([(value - mean) ** 2 for value in data])
    std = math.sqrt(variance)
    if verbose:
        ordered = sorted(data)
        median = ordered[len(data) // 2]
        mode = collections.Counter(data).most_common(1)
        return mean, std, min(data), max(data), median, mode
    return mean, std
from collections import deque
def sliding_window(seq, n=2):
    """
    Yields overlapping windows of `n` consecutive items from `seq`, each as
    a tuple. Every window, including the first, is an independent snapshot.
    If `seq` holds fewer than `n` items, a single window is yielded with the
    missing positions filled with None.
    :param seq: iterable
    :param n: int, window width
    :return: generator of tuples
    """
    it = iter(seq)
    # `range` works on both Python 2 and 3 (`xrange` is Python 2 only).
    win = deque((next(it, None) for _ in range(n)), maxlen=n)
    # Yield a snapshot: handing out the live deque would let later appends
    # change a window the caller already received.
    yield tuple(win)
    for item in it:
        win.append(item)  # maxlen drops the oldest item automatically
        yield tuple(win)
def partition_data(dataset, test_set, id_column=0):
    """
    Splits a dataset of indexable rows into two lists according to whether
    the value at position `id_column` of each row belongs to `test_set`.
    Delegates to partition_dataset and returns its result unchanged.
    :param dataset: list[object]
    :param test_set: set
    :param id_column: int
    :return: whatever partition_dataset returns
    """
    # this interface is left for legacy compatibility
    id_getter = operator.itemgetter(id_column)
    return partition_dataset(dataset, test_set, id_getter)
def partition_dataset(dataset, test_set, key):
    """
    Splits `dataset` into two lists. An item goes to the second list (test)
    when key(item) is in `test_set`, otherwise to the first (train).
    The relative order of the items is preserved in both lists.
    :param dataset: list[object]
    :param test_set: collection of ids to move to the test list
    :param key: callable extracting the id, e.g. operator.itemgetter or attrgetter
    :return: tuple(list, list) as (train, test)
    """
    train, test = [], []
    for record in dataset:
        bucket = test if key(record) in test_set else train
        bucket.append(record)
    return train, test
def most_common(lst):
    """
    Returns the item that occurs most often in `lst`.
    `lst` must provide a count() method (e.g. a list) and must not be empty.
    :param lst: list
    :return: the most frequent item
    """
    distinct = set(lst)
    return max(distinct, key=lst.count)
def return_one(*args, **kwargs):
    """
    Constant factory: ignores every argument and returns 1.
    Intended as a default factory, e.g. for defaultdict.
    :return: int
    """
    return 1
def return_zero(*args, **kwargs):
    """
    Constant factory: ignores every argument and returns 0.
    Intended as a default factory, e.g. for defaultdict.
    :return: int
    """
    return 0
def count_bits(n):
    """
    Counts the number of set bits in an integer or flags in a bitmask.
    Non-negative integers of any size are supported. A negative integer is
    interpreted as its 64-bit two's-complement representation.
    :param n: int
    :return: int
    """
    if n < 0:
        # Keep the historical result for negatives: count over the low 64 bits.
        n &= 0xFFFFFFFFFFFFFFFF
    # bin() has no width limit, so bits above bit 63 are counted too.
    return bin(n).count('1')
def compute_set_intersection(a, b):
    """
    Compares two collections as sets.
    :param a: iterable of hashable items
    :param b: iterable of hashable items
    :return: tuple(int,int,int) with the number of items in both, only in `a`, only in `b`
    """
    left, right = set(a), set(b)
    shared = left & right
    only_left = left - right
    only_right = right - left
    return len(shared), len(only_left), len(only_right)
def compute_abcprf(a, b, c):
    """
    Computes P, R, F-value given counts for true positives, false positives
    and false negatives, and returns them together with the counts.
    Precision is a/(a+b), recall is a/(a+c) and F is their harmonic mean.
    Without true positives all three scores are 0.0; the `b` and `c` counts
    are still returned so that callers can keep accumulating them.
    :param a: int, true positives
    :param b: int, false positives
    :param c: int, false negatives
    :return: tuple(int,int,int,float,float,float)
    """
    if not a:
        # No true positives: the scores are zero by definition, and dividing
        # would fail when b or c is also zero.
        return 0, b, c, 0.0, 0.0, 0.0
    p = 1.0 * a / (a + b)
    r = 1.0 * a / (a + c)
    f = 2 * p * r / (p + r)
    return a, b, c, p, r, f
def compute_f(p, r):
    """
    Computes the F-value (harmonic mean) of precision and recall.
    Returns 0.0 when both are zero instead of dividing by zero.
    :param p: float, precision
    :param r: float, recall
    :return: float
    """
    denominator = p + r
    if not denominator:
        return 0.0
    # The 2.0 forces true division for integer inputs on Python 2.
    return 2.0 * p * r / denominator
def string_as_print(*argv, **kargv):
    """
    Builds a string from the str() of every positional argument, joined by
    the keyword argument `glue` (a single space when it is not given).
    Other keyword arguments are ignored.
    :return: str
    """
    separator = kargv.get('glue', ' ')
    pieces = map(str, argv)
    return separator.join(pieces)
def is_numeric_int(s):
    """
    Tells whether int(s) succeeds.
    Values that int() rejects for any reason (malformed strings, None,
    containers, infinities) give False instead of raising.
    :param s: value to test, usually a str
    :return: bool
    """
    try:
        int(s)
    except (TypeError, ValueError, OverflowError):
        # TypeError: None or containers; OverflowError: float infinity.
        return False
    return True
def is_numeric_float(s):
    """
    Tells whether float(s) succeeds.
    Values that float() rejects because of their type (None, containers)
    give False instead of raising, just like malformed strings.
    :param s: value to test, usually a str
    :return: bool
    """
    try:
        float(s)
    except (TypeError, ValueError):
        # TypeError covers None and containers.
        return False
    return True
def object_list_to_dict(lst, key='id'):
    """
    Indexes objects by one of their attributes.
    When several objects share the same attribute value, the last one wins.
    :param lst: iterable of objects
    :param key: str, name of the attribute used as dictionary key
    :return: dict
    """
    return {getattr(obj, key): obj for obj in lst}
def get_from_list(lst, query, key='id'):
    """
    Finds the first object in `lst` whose attribute `key` equals `query`.
    :param lst: iterable of objects
    :param query: value to look for
    :param key: str, name of the attribute to compare
    :return: the matching object, or None when there is none
    """
    matches = (obj for obj in lst if getattr(obj, key) == query)
    return next(matches, None)
class AttrValueStorePrinter(object):
    """
    Mixin giving subclasses a repr built from their instance dictionary.
    """

    def __repr__(self):
        # Attributes whose name starts with an underscore are left out;
        # the remaining ones are listed in alphabetical order.
        public_keys = [key for key in sorted(self.__dict__) if not key.startswith('_')]
        fields = ['%s: %s' % (key, str(getattr(self, key))) for key in public_keys]
        prefix = 'id: %s, ' % str(id(self))
        return prefix + ','.join(fields)
class AttrValueStore(AttrValueStorePrinter):
    """
    Key-Value store in an object.
    Attributes are stored the normal way whenever possible. Names that
    cannot be set normally are kept in a private fallback dictionary, which
    is consulted when normal lookup fails.
    """

    def __init__(self):
        # Name-mangled to `_AttrValueStore__dict__data`.
        self.__dict__data = {}

    def __setattr__(self, name, value):
        try:
            super(AttrValueStore, self).__setattr__(name, value)
        except Exception:
            # Best effort: keep the value even if normal assignment is
            # refused, without swallowing KeyboardInterrupt or SystemExit.
            self.__dict__data[name] = value

    def __getattr__(self, name):
        # Only called when normal lookup fails. Read the fallback through
        # __dict__ so that an instance whose __init__ has not run (e.g.
        # during copying) does not recurse into __getattr__ forever.
        fallback = self.__dict__.get('_AttrValueStore__dict__data', {})
        try:
            return fallback[name]
        except KeyError:
            # AttributeError is what hasattr() and getattr() with a default
            # expect for a missing attribute.
            raise AttributeError(name)

    def __delattr__(self, name):
        try:
            # Attributes normally live in the instance dictionary.
            super(AttrValueStore, self).__delattr__(name)
        except AttributeError:
            fallback = self.__dict__.get('_AttrValueStore__dict__data', {})
            if name not in fallback:
                raise
            del fallback[name]
class KnownException(Exception):
    """
    Exception type defined by this module; adds nothing to Exception.
    """
def ping(host):
    """
    Returns True if host responds to a ping request.
    Runs the system `ping` command once and reports whether it exited with
    status 0. The output of the command goes to the standard streams.
    :param host: str, host name or address
    :return: bool
    """
    import platform
    import subprocess
    # The option that sets the packet count depends on the OS.
    count_flag = "-n" if platform.system().lower() == "windows" else "-c"
    # An argument list (no shell) keeps `host` from being interpreted as
    # shell syntax.
    command = ["ping", count_flag, "1", host]
    try:
        return subprocess.call(command) == 0
    except OSError:
        # The ping executable could not be started.
        return False
class SentinelValue(object):
    """
    Empty marker class with no attributes or behaviour of its own.
    """
def format_list(lst, glue=' ', options=None):
    """
    Joins the str() of every item of `lst` with `glue`.
    :param lst: iterable
    :param glue: str, separator placed between items
    :param options: currently unused; defaults to None rather than a shared
        mutable dictionary
    :return: str
    """
    return glue.join(str(item) for item in lst)
def bool_str(str):
    """
    Interprets a string as a boolean, ignoring case.
    :param str: str
    :return: bool, True for "yes", "true", "t" or "1", False for anything else
    """
    truthy = ("yes", "true", "t", "1")
    normalized = str.lower()
    return normalized in truthy