-
Notifications
You must be signed in to change notification settings - Fork 2
/
train.py
172 lines (131 loc) · 5.47 KB
/
train.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
# Suppress warnings
import warnings
warnings.filterwarnings('ignore')
import argparse
import datetime
import gc
import os
import random
import sys
import dateutil.relativedelta
import lightgbm as lgb
import numpy as np
import pandas as pd
import torch
from features import feature_engineering1, generate_label
from pytorch_tabnet.pretraining import TabNetPretrainer
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
# Machine learning
from sklearn.preprocessing import LabelEncoder
from model import clf, unsupervised_model
# Custom library
from utils import print_score, seed_everything_for_torch
# Module-level configuration: constants, global seeding and directory paths.
TOTAL_THRES = 300 # purchase-amount threshold (not referenced in the visible code of this file)
SEED = 42 # random seed
seed_everything_for_torch(SEED) # fix the seed via the helper imported from utils
# Directories used by this script (model_dir is not referenced in the visible code).
data_dir = './code/input'
model_dir = './code/model'
output_dir = './code/output'
######tabnet training######
def make_tabnet_oof_prediction(train, y, test, features, categorical_features='auto', folds=10):
    """Train a TabNet classifier with stratified K-fold CV and log the run to MLflow.

    Args:
        train: DataFrame holding the training rows; only ``features`` columns are used.
        y: Binary labels aligned row-by-row with ``train`` (array-like or Series).
        test: DataFrame holding the test rows; only ``features`` columns are used.
        features: List of column names fed to the model.
        categorical_features: Currently unused; kept for signature compatibility.
        folds: Number of stratified folds.

    Returns:
        A tuple ``(y_oof, test_preds, fi)`` where ``y_oof`` is the out-of-fold
        class-1 probability for every training row, ``test_preds`` is the
        fold-averaged class-1 probability for every test row, and ``fi`` is a
        DataFrame with one ``fold_<k>`` importance column per fold plus their
        row-wise mean in ``importance``.
    """
    # Function-local import, as in the original: mlflow is only needed here.
    import mlflow

    x_train = train[features]
    x_test = test[features]

    # StratifiedKFold yields positional indices, so index plain arrays instead
    # of relying on label-based lookups (which break on a non-default index).
    x_train_values = x_train.values
    x_test_values = x_test.values
    y_values = np.asarray(y)

    # Hyper-parameters live in one dict so the exact same values are used to
    # build the classifier and to log the run (the original logged an
    # undefined ``model_params`` and crashed with NameError after training).
    model_params = dict(
        n_d=64, n_a=64, n_steps=5,
        optimizer_fn=torch.optim.Adam,
        optimizer_params=dict(lr=2e-2),
        scheduler_params={"step_size": 20,
                          "gamma": 0.95},
        scheduler_fn=torch.optim.lr_scheduler.StepLR,
        mask_type='entmax',
        lambda_sparse=1e-4,
        device_name='auto',
    )

    HOST = "http://localhost"
    mlflow.set_tracking_uri(HOST+":6006/")

    # The context manager closes the run even if training raises.
    with mlflow.start_run():
        # Self-supervised pretraining on the training features, evaluated on
        # the test features.
        # NOTE(review): the pretrained model is never handed to the classifier
        # below (e.g. through a ``from_unsupervised`` argument of ``fit``), so
        # as written it does not appear to influence the results - confirm intent.
        unsupervised_model.fit(
            X_train=x_train_values,
            eval_set=[x_test_values],
            max_epochs=1000, patience=50,
            batch_size=2048, virtual_batch_size=128,
            drop_last=False,
            pretraining_ratio=0.8,
        )

        # NOTE(review): one classifier object is refit in every fold; presumably
        # ``fit`` re-initialises the network each time - confirm against the
        # pytorch-tabnet ``warm_start`` default.
        clf = TabNetClassifier(**model_params)

        # Fold-averaged class-1 probability for each test row.
        test_preds = np.zeros((x_test.shape[0]))
        # Out-of-fold class-1 probability for each training row.
        y_oof = np.zeros((x_train.shape[0]))
        # Running mean of the per-fold validation AUC.
        score = 0
        # Per-fold feature importances, one column per fold.
        fi = pd.DataFrame()
        fi['feature'] = features

        skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=SEED)
        for fold, (tr_idx, val_idx) in enumerate(skf.split(x_train, y)):
            # Split features and labels by position for this fold.
            x_tr, x_val = x_train_values[tr_idx], x_train_values[val_idx]
            y_tr, y_val = y_values[tr_idx], y_values[val_idx]

            clf.fit(
                x_tr, y_tr,
                eval_set=[(x_val, y_val)],
                max_epochs=1000, patience=20,
                batch_size=1024, virtual_batch_size=128,
            )
            print(f'fold: {fold+1}, x_tr.shape: {x_tr.shape}, x_val.shape: {x_val.shape}')

            # Predict the held-out part and store it at its original positions.
            val_preds = clf.predict_proba(x_val)[:,1]
            y_oof[val_idx] = val_preds

            # Compute the fold AUC once and reuse it for printing and averaging.
            fold_auc = roc_auc_score(y_val, val_preds)
            print(f"Fold {fold + 1} | AUC: {fold_auc}")
            print('-'*80)
            score += fold_auc / folds

            # Accumulate the fold's share of the averaged test prediction.
            test_preds += clf.predict_proba(x_test_values)[:,1] / folds

            fi[f'fold_{fold+1}'] = clf.feature_importances_

            # Free the fold arrays before the next iteration.
            del x_tr, x_val, y_tr, y_val
            gc.collect()

        oof_auc = roc_auc_score(y_values, y_oof)
        print(f"\nMean AUC = {score}")  # mean of the per-fold validation AUCs
        print(f"OOF AUC = {oof_auc}")  # AUC over all out-of-fold predictions

        # Record the configuration and the final metrics for this run.
        mlflow.log_param("folds", folds)
        for k, v in model_params.items():
            mlflow.log_param(k, v)
        mlflow.log_metric("Mean AUC", score)
        mlflow.log_metric("OOF AUC", oof_auc)

    # Average the per-fold importances into a single column.
    fi_cols = [col for col in fi.columns if 'fold_' in col]
    fi['importance'] = fi[fi_cols].mean(axis=1)
    return y_oof, test_preds, fi
if __name__ == '__main__':
    # Script entry point: load data, build features, train TabNet with
    # cross-validation, and write the submission file.
    # The unused use_cuda/device locals were dropped; nothing in this file
    # read them (the classifier is built with device_name='auto').

    # Read the training data, parsing order_date as a datetime column.
    data = pd.read_csv(os.path.join(data_dir, 'train.csv'), parse_dates=['order_date'])

    # Year-month to predict.
    year_month = '2011-12'

    # Run feature engineering.
    train, test, y, features = feature_engineering1(data, year_month)

    # Train TabNet with cross-validated out-of-fold prediction.
    y_oof, test_preds, fi = make_tabnet_oof_prediction(train, y, test, features)

    # Read the sample submission and fill in the test predictions.
    sub = pd.read_csv(os.path.join(data_dir, 'sample_submission.csv'))
    sub['probability'] = test_preds
    print(sub['probability'].head())

    # Write the submission file, creating the output directory if needed.
    os.makedirs(output_dir, exist_ok=True)
    sub.to_csv(os.path.join(output_dir, 'output.csv'), index=False)