-
Notifications
You must be signed in to change notification settings - Fork 27
/
train_ffw.py
136 lines (111 loc) · 4.93 KB
/
train_ffw.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
import argparse
import pandas as pd
from scipy.sparse import load_npz, csr_matrix
from sklearn.metrics import roc_auc_score
import torch.nn as nn
from torch.optim import Adam
from model_ffw import FeedForward
from utils import *
def get_tensors(sparse):
    """Split a sparse batch into model inputs and binary labels.

    The first 5 columns carry the original dataset fields, with the
    label stored in column 3; everything from column 5 onward is the
    encoded feature vector.

    Arguments:
        sparse (scipy sparse matrix): one batch of rows from encode.py output

    Returns:
        (inputs, labels): float tensors of shape (batch, n_features) and (batch,)
    """
    batch = torch.tensor(sparse.toarray())
    features = batch[:, 5:].float()
    targets = batch[:, 3].float()
    return features, targets
def train_ffw(train, val, model, optimizer, logger, saver, num_epochs, batch_size):
    """Train feedforward baseline.

    Arguments:
        train (sparse matrix): output by encode.py
        val (sparse matrix): output by encode.py
        model (torch Module): moved to CUDA by the caller
        optimizer (torch optimizer)
        logger: wrapper for TensorboardX logger
        saver: wrapper for torch saving; save() returns True to early-stop
        num_epochs (int): number of epochs to train for
        batch_size (int)
    """
    criterion = nn.BCEWithLogitsLoss()
    metrics = Metrics()
    train_idxs = np.arange(train.shape[0])
    val_idxs = np.arange(val.shape[0])
    step = 0
    for epoch in range(num_epochs):
        np.random.shuffle(train_idxs)
        np.random.shuffle(val_idxs)
        # Training
        for k in range(0, len(train_idxs), batch_size):
            inputs, labels = get_tensors(train[train_idxs[k:k + batch_size]])
            inputs = inputs.cuda()
            preds = model(inputs).flatten()
            loss = criterion(preds, labels.cuda())
            # roc_auc_score raises ValueError when a batch contains only one
            # class (common for small trailing batches); skip the metric for
            # that batch instead of crashing the whole run.
            try:
                train_auc = roc_auc_score(labels, torch.sigmoid(preds).detach().cpu())
                metrics.store({'auc/train': train_auc})
            except ValueError:
                pass
            # zero_grad on the model is equivalent to optimizer.zero_grad()
            # here because the optimizer holds all of the model's parameters.
            model.zero_grad()
            loss.backward()
            optimizer.step()
            step += 1
            metrics.store({'loss/train': loss.item()})
            # Logging
            if step % 20 == 0:
                logger.log_scalars(metrics.average(), step)
        # Validation
        model.eval()
        for k in range(0, len(val_idxs), batch_size):
            inputs, labels = get_tensors(val[val_idxs[k:k + batch_size]])
            inputs = inputs.cuda()
            with torch.no_grad():
                preds = model(inputs).flatten()
            # Same single-class guard as in the training loop.
            try:
                val_auc = roc_auc_score(labels, torch.sigmoid(preds).cpu())
                metrics.store({'auc/val': val_auc})
            except ValueError:
                pass
        model.train()
        # Save model; saver signals early stopping on its return value.
        average_metrics = metrics.average()
        logger.log_scalars(average_metrics, step)
        stop = saver.save(average_metrics['auc/val'], model)
        if stop:
            break
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Train feedforward neural network on sparse feature matrix.')
    parser.add_argument('--X_file', type=str)
    parser.add_argument('--dataset', type=str)
    parser.add_argument('--logdir', type=str, default='runs/ffw')
    parser.add_argument('--savedir', type=str, default='save/ffw')
    parser.add_argument('--hid_size', type=int, default=500)
    parser.add_argument('--drop_prob', type=float, default=0.5)
    parser.add_argument('--batch_size', type=int, default=500)
    parser.add_argument('--lr', type=float, default=1e-2)
    parser.add_argument('--num_epochs', type=int, default=30)
    args = parser.parse_args()

    # Suffix naming the feature set, taken from the X file name
    # (e.g. "X-iswf.npz" -> "iswf"); used to tag runs and predictions.
    features_suffix = args.X_file.split("-")[-1].split(".")[0]

    # Load sparse dataset
    X = csr_matrix(load_npz(args.X_file))
    full_train_df = pd.read_csv(f'data/{args.dataset}/preprocessed_data_train.csv', sep="\t")
    test_df = pd.read_csv(f'data/{args.dataset}/preprocessed_data_test.csv', sep="\t")

    # Student-wise train-val-test split; column 0 of X holds the user id.
    user_ids = X[:, 0].toarray().flatten()

    def rows_of(users):
        # Rows of X that belong to the given collection of user ids.
        return X[np.where(np.isin(user_ids, users))]

    users_train_val = full_train_df["user_id"].unique()
    users_test = test_df["user_id"].unique()
    cut = int(0.8 * len(users_train_val))
    users_train, users_val = users_train_val[:cut], users_train_val[cut:]
    train = rows_of(users_train)
    val = rows_of(users_val)
    test = rows_of(users_test)

    # First 5 columns of X are dataset fields, not features.
    model = FeedForward(train.shape[1] - 5, args.hid_size, args.drop_prob).cuda()
    optimizer = Adam(model.parameters(), lr=args.lr)

    # Train
    param_str = f'{args.dataset}, features={features_suffix}'
    logger = Logger(os.path.join(args.logdir, param_str))
    saver = Saver(args.savedir, param_str)
    train_ffw(train, val, model, optimizer, logger, saver, args.num_epochs, args.batch_size)
    logger.close()

    # Predict on the test split, one batch at a time.
    model.eval()
    pred_test = np.zeros(len(test_df))
    for start in range(0, test.shape[0], args.batch_size):
        batch_inputs, _ = get_tensors(test[start:start + args.batch_size])
        with torch.no_grad():
            probs = torch.sigmoid(model(batch_inputs.cuda())).flatten()
        pred_test[start:start + args.batch_size] = probs.cpu().numpy()

    # Write predictions to csv
    test_df[f"FFW_{features_suffix}"] = pred_test
    test_df.to_csv(f'data/{args.dataset}/preprocessed_data_test.csv', sep="\t", index=False)
    print("auc_test = ", roc_auc_score(test_df["correct"], pred_test))