-
Notifications
You must be signed in to change notification settings - Fork 1
/
train_cnn_model.py
98 lines (84 loc) · 3.61 KB
/
train_cnn_model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
#DATA: 6,000 labeled tweets from before the 2020 election; labels: Biden=0, Trump=1.
#VECTORIZER: tokenizer. vocab size 19,415.
#MODEL: 1-d CNN with a global max-pool layer.
def load_labeled_tweets(file_path):
    """Read the labeled tweets into a DataFrame.

    Args:
        file_path: Path to a tab-separated text file without a header row;
            the first field of each row is the tweet, the second its label.

    Returns:
        A pandas DataFrame with the columns ``sentence`` and ``label``.
    """
    # Imported lazily so that importing this module does not need pandas.
    import pandas as pd

    return pd.read_csv(file_path, names=['sentence', 'label'], sep='\t')


def encode_sentences(tokenizer, sentences, maxlen):
    """Convert raw sentences into post-padded integer sequences.

    Args:
        tokenizer: An already fitted Keras ``Tokenizer``.
        sentences: Iterable of strings to encode.
        maxlen: Value passed to ``pad_sequences`` as ``maxlen``.

    Returns:
        Whatever ``pad_sequences`` returns for the tokenized sentences
        (padding is applied at the end of each sequence).
    """
    from keras.preprocessing.sequence import pad_sequences

    sequences = tokenizer.texts_to_sequences(sentences)
    return pad_sequences(sequences, padding='post', maxlen=maxlen)


def plot_history(history):
    """Draw accuracy and loss curves for training and validation.

    Creates one 12x5 figure with two panels: accuracy on the left, loss on
    the right. Training curves are blue, validation curves are red.

    Args:
        history: Object returned by ``model.fit``. Its ``history`` dict must
            contain 'accuracy', 'val_accuracy', 'loss' and 'val_loss'.

    NOTE(review): the figure is only created here; nothing in this file
    calls ``plt.show()`` or ``savefig``, so a plain ``python`` run will not
    display it. Confirm whether the script is meant to run interactively.
    """
    import matplotlib.pyplot as plt

    plt.style.use('ggplot')

    metrics = history.history
    epochs = range(1, len(metrics['accuracy']) + 1)

    plt.figure(figsize=(12, 5))
    panels = (
        (1, 'accuracy', 'acc', 'Training and validation accuracy'),
        (2, 'loss', 'loss', 'Training and validation loss'),
    )
    for position, key, short_name, title in panels:
        plt.subplot(1, 2, position)
        plt.plot(epochs, metrics[key], 'b', label='Training ' + short_name)
        plt.plot(epochs, metrics['val_' + key], 'r',
                 label='Validation ' + short_name)
        plt.title(title)
        plt.legend()


def build_cnn_model(vocab_size, embedding_dim=100, maxlen=100):
    """Build and compile the 1-D CNN binary classifier.

    Layers, in order: Embedding -> Conv1D (128 filters, kernel size 5, ReLU)
    -> GlobalMaxPooling1D -> Dense(10, ReLU) -> Dense(1, sigmoid).
    Compiled with the Adam optimizer, binary cross-entropy loss and the
    accuracy metric.

    Args:
        vocab_size: Input dimension of the embedding layer.
        embedding_dim: Output dimension of the embedding layer.
        maxlen: Length of the input sequences.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    from keras import Sequential
    from keras import layers

    model = Sequential()
    model.add(layers.Embedding(vocab_size, embedding_dim, input_length=maxlen))
    model.add(layers.Conv1D(128, 5, activation='relu'))
    model.add(layers.GlobalMaxPooling1D())
    model.add(layers.Dense(10, activation='relu'))
    model.add(layers.Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model


if __name__ == "__main__":
    import os

    from keras.preprocessing.text import Tokenizer
    from sklearn.model_selection import train_test_split

    # load the data
    project_path = '/Users/jacksonwalters/Documents/GitHub/scotus-v-public/'
    data_path = 'data/training/all_labeled_tweets.txt'
    # os.path.join yields the same path as plain concatenation here, but
    # keeps working if project_path ever loses its trailing slash.
    file_path = os.path.join(project_path, data_path)
    df = load_labeled_tweets(file_path)
    print(df.iloc[0])

    # split the sentences into training data and test data
    sentences = df['sentence'].values
    # labels follow the file header: 0 = Biden, 1 = Trump
    y = df['label'].values
    sentences_train, sentences_test, y_train, y_test = train_test_split(
        sentences, y, test_size=0.25, random_state=1000)

    # vectorize the sentences; the tokenizer is fitted on the training
    # split only, so the test split does not influence the vocabulary
    tokenizer = Tokenizer(num_words=5000)
    tokenizer.fit_on_texts(sentences_train)
    maxlen = 100
    X_train = encode_sentences(tokenizer, sentences_train, maxlen)
    X_test = encode_sentences(tokenizer, sentences_test, maxlen)

    # Adding 1 because of reserved 0 index.
    # NOTE(review): word_index looks like it covers every word seen while
    # fitting, not only the num_words=5000 that are kept, so the embedding
    # may be larger than the ids actually produced - confirm before changing.
    vocab_size = len(tokenizer.word_index) + 1
    print(vocab_size)

    # construct the model
    embedding_dim = 100
    model = build_cnn_model(vocab_size, embedding_dim, maxlen)
    model.summary()

    # train the model; the held-out test split doubles as validation data
    history = model.fit(X_train, y_train,
                        epochs=2,
                        verbose=True,
                        validation_data=(X_test, y_test),
                        batch_size=10)
    loss, accuracy = model.evaluate(X_train, y_train, verbose=False)
    print(f"Training Accuracy: {accuracy:.4f}")
    loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
    print(f"Testing Accuracy: {accuracy:.4f}")
    plot_history(history)

    # test examples
    ex_sent = "conservative feeling"
    X_ex_sent = encode_sentences(tokenizer, [ex_sent], maxlen)
    print(model.predict(X_ex_sent))

    # save the model
    model_path = 'cnn_model'
    model.save(os.path.join(project_path, model_path))