-
Notifications
You must be signed in to change notification settings - Fork 1
/
utilities.py
117 lines (80 loc) · 3.57 KB
/
utilities.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from pandas_confusion import ConfusionMatrix,BinaryConfusionMatrix
import os,errno,re,codecs,unidecode
#functions for fixing unicode issues
def remove_non_ascii(text):
    """Return an ASCII transliteration of ``text``.

    Args:
        text: UTF-8 encoded bytes, or already-decoded text.

    Returns:
        The result of ``unidecode.unidecode`` applied to the decoded text.
    """
    # Byte strings are decoded first; already-decoded text is passed through
    # (the previous ``unicode(text, encoding=...)`` rejected decoded text and
    # does not exist on Python 3).
    if isinstance(text, bytes):
        text = text.decode("utf-8")
    # ``import unidecode`` binds the *module*; the callable lives inside it.
    return unidecode.unidecode(text)
def fixUnicode(text):
    """Return ``text`` as a text string with every non-ASCII character dropped.

    Args:
        text: Bytes, text, or any object convertible with ``str()``.

    Returns:
        A text (unicode) string containing only the ASCII characters of
        the input, in their original order.
    """
    text_type = type(u"")
    # Arbitrary objects (numbers, etc.) are stringified first.
    if not isinstance(text, (bytes, text_type)):
        text = str(text)
    # Decoded text is narrowed to ASCII bytes, silently discarding anything
    # that does not fit, so the final decode below cannot fail.
    if isinstance(text, text_type):
        text = text.encode("ascii", "ignore")
    return text.decode("ascii", "ignore")
def writeListToFile(filename, lst):
    """Write every string in ``lst`` to ``filename`` as UTF-8, replacing old contents.

    Items are written back-to-back exactly as given; no separator or
    newline is inserted between them.
    """
    out = codecs.open(filename, mode='w', encoding="utf-8")
    try:
        for chunk in lst:
            out.write(chunk)
    finally:
        # Always release the handle, even if a write fails part-way.
        out.close()
def appendListToFile(filename, lst):
    """Append every string in ``lst`` to the end of ``filename`` as UTF-8.

    Items are written back-to-back exactly as given; no separator or
    newline is inserted between them. Existing contents are kept.
    """
    out = codecs.open(filename, mode='a', encoding="utf-8")
    try:
        for chunk in lst:
            out.write(chunk)
    finally:
        # Always release the handle, even if a write fails part-way.
        out.close()
def fileToList(file):
    """Group the rows of ``file`` into documents delimited by ``#doc`` rows.

    A row whose first field contains ``'#doc'`` marks a document boundary
    and is not itself included in any document. All other rows are
    collected, in order, into the current document.

    Args:
        file: A sequence of rows (e.g. a 2-D array or a list of lists);
            only ``row[0]`` is inspected.

    Returns:
        A list of documents, each a non-empty list of rows.
    """
    list_docs = []
    doc = []
    for row in file:
        if '#doc' in row[0]:
            # Boundary marker: close the document collected so far, if any.
            if doc:
                list_docs.append(doc)
                doc = []
        else:
            doc.append(row)
    # Flush the trailing document after the loop. The previous version
    # flushed *instead of* appending when it reached the final row, which
    # silently dropped the last row of the last document.
    if doc:
        list_docs.append(doc)
    return list_docs
def classify(X_train, y_train, c, algorithm):
    """Fit a linear classifier on the training data and return it.

    Args:
        X_train: Training features, passed straight to ``fit``.
        y_train: Training labels, passed straight to ``fit``.
        c: Value forwarded as the estimator's ``C`` parameter.
        algorithm: ``"lr"`` for ``LogisticRegression`` or ``"svm"`` for
            ``svm.LinearSVC``.

    Returns:
        The fitted classifier (no predictions are made).

    Raises:
        ValueError: If ``algorithm`` is neither ``"lr"`` nor ``"svm"``.
    """
    if algorithm == "lr":
        # NOTE(review): unlike classification(), no random_state is passed
        # here; kept as-is so existing results do not change.
        clf = LogisticRegression(C=c)
    elif algorithm == "svm":
        clf = svm.LinearSVC(random_state=42, C=c)
    else:
        # An unknown name used to fall through and fail later on the
        # unbound ``clf`` with an unhelpful UnboundLocalError.
        raise ValueError(
            'Unknown algorithm %r; expected "lr" or "svm"' % (algorithm,))
    clf.fit(X_train, y_train)
    return clf
def classification(X_train, y_train, X_test, c, algorithm):
    """Fit a linear classifier and predict labels for the test data.

    Args:
        X_train: Training features, passed straight to ``fit``.
        y_train: Training labels, passed straight to ``fit``.
        X_test: Features to predict on, passed straight to ``predict``.
        c: Value forwarded as the estimator's ``C`` parameter.
        algorithm: ``"lr"`` for ``LogisticRegression`` or ``"svm"`` for
            ``svm.LinearSVC`` (both created with ``random_state=42``).

    Returns:
        A ``(clf, y_pred)`` tuple: the fitted classifier and its
        predictions for ``X_test``.

    Raises:
        ValueError: If ``algorithm`` is neither ``"lr"`` nor ``"svm"``.
    """
    if algorithm == "lr":
        clf = LogisticRegression(random_state=42, C=c)
    elif algorithm == "svm":
        clf = svm.LinearSVC(random_state=42, C=c)
    else:
        # An unknown name used to fall through and fail later on the
        # unbound ``clf`` with an unhelpful UnboundLocalError.
        raise ValueError(
            'Unknown algorithm %r; expected "lr" or "svm"' % (algorithm,))
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    return clf, y_pred
def splitbyDel(string, delimeter):
    """Split ``string`` on ``delimeter`` and strip whitespace from each piece.

    Returns the pieces as a list, in order; empty pieces are kept.
    """
    pieces = []
    for chunk in string.split(delimeter):
        pieces.append(chunk.strip())
    return pieces
def find_sub_list(sl, l):
    """Find every occurrence of the sub-list ``sl`` inside the list ``l``.

    Matches are detected by comparing slices of ``l`` with ``sl``, so both
    should be the same sequence type. Overlapping matches are all reported.

    Args:
        sl: The sub-list to look for.
        l: The list to search in.

    Returns:
        A list of ``(start, end)`` index pairs, ``end`` inclusive, one per
        occurrence, in ascending order. Empty if ``sl`` is empty or does
        not occur.
    """
    sll = len(sl)
    # An empty pattern has no first element to anchor on; this used to
    # raise IndexError for any non-empty ``l``.
    if sll == 0:
        return []
    first = sl[0]
    return [
        (start, start + sll - 1)
        for start, item in enumerate(l)
        # Cheap first-element check before paying for the slice comparison.
        if item == first and l[start:start + sll] == sl
    ]
def replaceWhiteSpaces(string):
    """Replace each run of whitespace in ``string`` with a single underscore."""
    whitespace_run = re.compile(r"\s+")
    return whitespace_run.sub('_', string)
def delFileIfExists(filename):
    """Delete ``filename``; do nothing if no such file exists.

    Args:
        filename: Path of the file to remove.

    Raises:
        OSError: If removal fails for any reason other than the file
            being absent (e.g. permission denied, path is a directory).
    """
    try:
        os.remove(filename)
    except OSError as exception:
        # Only "there is nothing to delete" is expected. Other failures
        # were previously swallowed too, hiding real problems such as a
        # stale file that could not be removed.
        if exception.errno not in (errno.ENOENT, errno.ENOTDIR):
            raise
def make_sure_path_exists(path):
    """Create directory ``path`` (including parents) unless it already exists.

    Args:
        path: Directory path to create.

    Raises:
        OSError: If the directory cannot be created, or if ``path``
            already exists but is not a directory.
    """
    try:
        os.makedirs(path)
    except OSError as exception:
        # EEXIST is also reported when ``path`` is an existing regular
        # file; only tolerate it when a directory is really there.
        if exception.errno != errno.EEXIST or not os.path.isdir(path):
            raise
def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def printResultsToFile(y_test, y_pred, filename, c, model, random_seed):
    """Append a precision/recall/F1 report for binary predictions to a file.

    Builds a ``BinaryConfusionMatrix`` from the gold labels (converted to
    ``int``) and the predictions, derives precision, recall and F1 from its
    TP/FP/FN counts, and appends a small fixed-width table to ``filename``
    via ``appendListToFile``.

    Args:
        y_test: Gold labels; each is converted with ``int``.
        y_pred: Predicted labels, passed through unchanged.
        filename: Report file to append to.
        c: Regularisation value; written as ``int(c)``.
            NOTE(review): fractional values are truncated (0.1 prints as
            "0") -- kept to preserve the existing report format.
        model: Model name written in the header row (must be a string).
        random_seed: Seed value written in the header row.
    """
    # list(...) so a real sequence is passed even where map() is lazy.
    binary_confusion_matrix = BinaryConfusionMatrix(list(map(int, y_test)), y_pred)
    tp = float(binary_confusion_matrix.TP)
    fp = binary_confusion_matrix.FP
    fn = binary_confusion_matrix.FN

    # Metrics with an empty denominator (no predicted or no actual
    # positives) are reported as 0.0 instead of raising ZeroDivisionError.
    precision = _safe_ratio(tp, tp + fp)
    recall = _safe_ratio(tp, tp + fn)
    f1 = _safe_ratio(2 * tp, 2 * tp + fp + fn)

    write_list = []
    write_list.append("{0:15s} {1:12s} {2:10s}".format("C", "model", "seed") + "\n")
    write_list.append("{0:15s} {1:12s} {2:10s}".format(str(int(c)), model, str(random_seed)) + "\n")
    write_list.append("{0:15s} {1:12s} {2:12s} {3:12s} {4:12s} {5:12s} {6:12s}".format("Type", "TP", "FP", "FN", "Pr", "Re", "F1") + "\n")
    # The extra trailing newline leaves a blank line between reports.
    write_list.append(("{0:15s} {1:12s} {2:12s} {3:12s} {4:12s} {5:12s} {6:12s}".format("part-of", str(int(tp)), str(fp), str(fn), str(round(precision, 4)), str(round(recall, 4)), str(round(f1, 4))) + "\n") + "\n")
    appendListToFile(filename, write_list)