# src.py
import numpy as np
import pandas as pd
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn import tree
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
def hasNumbers(inputString):  # check if the string contains a digit
    return any(char.isdigit() for char in inputString)
def prepare_data(df):  # strip quotation marks and dots from string cells
    for x in range(len(df)):
        for y in range(22):
            if type(df[x][y]) is str:
                df[x][y] = df[x][y].replace('"', "")
                df[x][y] = df[x][y].replace('.', "")
    for x in range(len(df)):  # cast numbers stored as strings to float
        for y in range(22):
            # assumes any digit-bearing string is fully numeric once cleaned
            if type(df[x][y]) is str and hasNumbers(df[x][y]):
                df[x][y] = float(df[x][y])
    df = pd.DataFrame(df)
    df.dropna(inplace=True, subset=[21])  # remove instances where the label is empty
    df = df.drop(columns=[19])  # remove feature with a high frequency of NaN occurrences
    df = df.to_numpy()
    for x in range(len(df)):  # map 'yes' to 1 and 'no' to 0 in the label column (now at position 20)
        if df[x][20] == 'yes':
            df[x][20] = 1
        elif df[x][20] == 'no':
            df[x][20] = 0
    df = pd.DataFrame(df)
    return df
training_df = pd.read_csv('training_new.csv')
validation_df = pd.read_csv('validation_new.csv')
training_df = training_df.to_numpy()
validation_df = validation_df.to_numpy()
df1 = prepare_data(training_df)  # clean the training set
df2 = prepare_data(validation_df)  # clean the validation set
df = pd.concat([df1, df2], axis=0, ignore_index=True)  # stack the datasets vertically; reindex to avoid duplicate row labels
df = pd.get_dummies(df, columns=[0, 5, 6, 7, 8, 11, 12, 14, 15])  # one-hot encode categorical features
df.fillna(df.mean(), inplace=True)  # fill remaining gaps with the column mean
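# Optional sanity check (not part of the original run): verify that the mean
# imputation left no NaNs behind; uncomment to inspect before modelling.
# print(df.isnull().sum()[df.isnull().sum() > 0])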
label = df[20].astype(int)  # labels are 0/1 after prepare_data; cast so the metrics below see integer classes
### separate dataset label ###
training_label = label.iloc[:df1.shape[0]]
validation_label = label.iloc[df1.shape[0]:]  # validation rows start at df1.shape[0]; the original +1 skipped one sample
features = df.drop(columns=[20])
### normalization ###
min_max_scaler = preprocessing.MinMaxScaler()
features = min_max_scaler.fit_transform(features)
features = pd.DataFrame(features)
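# Note: fitting the scaler on the combined train+validation matrix lets
# validation statistics influence the scaling (a mild form of leakage).
# A minimal sketch of the stricter alternative, replacing the two lines
# above, left commented out to preserve the original behaviour:
# scaler = preprocessing.MinMaxScaler().fit(features.iloc[:df1.shape[0]])
# features = pd.DataFrame(scaler.transform(features))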
### separate dataset features ###
training_features = features.iloc[:df1.shape[0]]
validation_features = features.iloc[df1.shape[0]:]  # aligned with validation_label above
#clf = MultinomialNB()
#clf = LogisticRegression(random_state=0, solver='lbfgs')
clf = MLPClassifier(solver='sgd', alpha=1e-5, hidden_layer_sizes=(10, 10), random_state=1, max_iter=200)  # model used
#clf = SVC(gamma='auto')
#clf = tree.DecisionTreeClassifier()
clf.fit(training_features, training_label)
# The commented x_train/x_test lines below refer to an earlier experiment that
# split the training set with train_test_split instead of using the separate
# validation file.
#clf.fit(x_train, y_train)
y_predicted = clf.predict(validation_features)
#y_predicted = clf.predict(x_test)
accuracy = accuracy_score(validation_label, y_predicted)
precision = precision_score(validation_label, y_predicted)
recall = recall_score(validation_label, y_predicted)
# accuracy = accuracy_score(y_test, y_predicted)  # excellent results for all metrics when splitting and testing on the training set
# precision = precision_score(y_test, y_predicted)
# recall = recall_score(y_test, y_predicted)
print("accuracy: " + str(accuracy))
print("precision: " + str(precision))
print("recall: " + str(recall))
print("f1-score: " + str(f1_score(validation_label, y_predicted)))  # equivalent to 2*precision*recall/(precision+recall)
print(classification_report(validation_label, y_predicted))
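# --- Optional: compare all candidate models in one pass ---
# A minimal sketch (not part of the original run) that fits each classifier
# tried above on the same split and prints its validation accuracy.
# Hyperparameters mirror the commented-out lines above.
candidate_models = {
    'MultinomialNB': MultinomialNB(),
    'LogisticRegression': LogisticRegression(random_state=0, solver='lbfgs'),
    'MLPClassifier': MLPClassifier(solver='sgd', alpha=1e-5, hidden_layer_sizes=(10, 10), random_state=1, max_iter=200),
    'SVC': SVC(gamma='auto'),
    'DecisionTree': tree.DecisionTreeClassifier(),
}
for name, model in candidate_models.items():
    model.fit(training_features, training_label)
    preds = model.predict(validation_features)
    print(name, "validation accuracy:", accuracy_score(validation_label, preds))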