forked from h2oai/driverlessai-recipes
-
Notifications
You must be signed in to change notification settings - Fork 1
/
knearestneighbour.py
207 lines (181 loc) · 9.74 KB
/
knearestneighbour.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
"""K-Nearest Neighbor implementation by sklearn. For small data (< 200k rows)."""
import os
import datatable as dt
import numpy as np
import pandas as pd
from h2oaicore.models_main import MainModel
from h2oaicore.systemutils import config, physical_cores_count, loggerinfo
from sklearn.preprocessing import LabelEncoder
from h2oaicore.models import CustomModel
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
class KNearestNeighbourModel(CustomModel):
    """K-nearest-neighbour custom model wrapping sklearn's KNeighborsClassifier/KNeighborsRegressor.

    Supports regression, binary, and multiclass problems. Per the module docstring,
    intended only for small data (< 200k rows) — KNN fit/predict cost grows quickly
    with row count.
    """
    _regression = True
    _binary = True
    _multiclass = True
    _parallel_task = True
    _testing_can_skip_failure = False  # ensure tested as if shouldn't fail
    _display_name = "KNearestNeighbour"
    _description = "K Nearest Neighbour Model based on sklearn. Not adviced if the data is larger than 200K rows"

    @staticmethod
    def can_use(accuracy, interpretability, train_shape=None, test_shape=None, valid_shape=None, n_gpus=0,
                num_classes=None, **kwargs):
        """Gate whether this recipe may run for the given data shapes.

        Under ``config.hard_asserts`` (testing mode) only small, narrow frames are
        allowed because KNN is too slow to exercise otherwise; in normal operation
        the model is always considered usable.

        :return: bool, whether the model can be used
        """
        if config.hard_asserts:
            # for bigger data, too slow to test even with 1 iteration
            # NOTE: `and` binds tighter than `or`, so this reads as
            # (train small enough) or (valid small enough)
            use = train_shape is not None and train_shape[0] * train_shape[1] < 1024 * 1024 or \
                  valid_shape is not None and valid_shape[0] * valid_shape[1] < 1024 * 1024
            # too slow for walmart with only 421k x 15 even with 10 neighbors
            use &= train_shape is not None and train_shape[1] < 10
            return use
        else:
            return True

    def set_default_params(self, accuracy=None, time_tolerance=None, interpretability=None, **kwargs):
        """Set default hyperparameters by delegating to mutate_params with get_best=True."""
        # drop any caller-provided get_best so it doesn't collide with the explicit one below
        kwargs.pop('get_best', None)
        self.mutate_params(accuracy=accuracy, time_tolerance=time_tolerance, interpretability=interpretability,
                           get_best=True, **kwargs)

    def mutate_params(self, accuracy=10, time_tolerance=10, interpretability=1, get_best=False, **kwargs):
        """Populate self.params with (possibly mutated) hyperparameters for tuning.

        Each value is drawn via MainModel.get_one, which honors an Optuna ``trial``
        (if passed in kwargs) and user overrides taken from ``config.recipe_dict``.
        With get_best=True the first/best listed value is chosen deterministically.
        """
        # Modify certain parameters for tuning
        user_choice = config.recipe_dict.copy()
        self.params = dict()
        trial = kwargs.get('trial')
        list_of_neibs = [1, 2, 5, 10, 50, 100, 150, 200]
        if config.recipe == 'kaggle':
            list_of_neibs.extend([250, 300])
        if 'GIT_HASH' in os.environ and config.hard_asserts:
            # CI/testing build: keep the search space minimal so tests stay fast
            list_of_neibs = [1]
        self.params['n_neighbors'] = MainModel.get_one(list_of_neibs, get_best=get_best,
                                                       best_type="first", name="n_neighbors",
                                                       trial=trial,
                                                       user_choice=user_choice)
        self.params['metric'] = MainModel.get_one(["minkowski", "cityblock"], get_best=get_best,
                                                  best_type="first", name="metric",
                                                  trial=trial,
                                                  user_choice=user_choice)
        self.params['algorithm'] = MainModel.get_one(['auto', 'ball_tree', 'kd_tree', 'brute'], get_best=get_best,
                                                     best_type="first", name="algorithm",
                                                     trial=trial,
                                                     user_choice=user_choice)
        self.params['leaf_size'] = MainModel.get_one([30, 4, 100], get_best=get_best,
                                                     best_type="first", name="leaf_size",
                                                     trial=trial,
                                                     user_choice=user_choice)
        self.params['p'] = MainModel.get_one([2, 1], get_best=get_best,
                                             best_type="first", name="p",
                                             trial=trial,
                                             user_choice=user_choice)
        self.params['weights'] = MainModel.get_one(['uniform', 'distance'], get_best=get_best,
                                                   best_type="first", name="weights",
                                                   trial=trial,
                                                   user_choice=user_choice)
        # 'standardize' is a recipe-internal flag (consumed in fit), not an sklearn kwarg;
        # transcribe_params pops it before the params reach the sklearn estimator.
        self.params['standardize'] = MainModel.get_one([False, True], get_best=get_best,
                                                       best_type="first", name="standardize",
                                                       trial=trial,
                                                       user_choice=user_choice)

    def transcribe_params(self, params=None, **kwargs):
        """
        Fixups of params to avoid any conflicts not expressible easily for Optuna
        Or system things only need to set at fit time
        :param params: dict or None; when None, self.params is edited in place (by reference)
        :return: the fixed-up params dict, safe to pass to the sklearn estimator
        """
        params_was_None = False
        if params is None:
            params = self.params  # reference, so goes back into self.params
            params_was_None = True
        # add anything that needs fixing
        # leaf_size only matters for tree-based algorithms; drop it otherwise
        if params.get('algorithm', "") not in ['ball_tree', 'kd_tree']:
            params.pop('leaf_size', None)
        # sklearn requires n_neighbors <= number of training samples
        if isinstance(kwargs.get('train_shape'), (tuple, list)) and kwargs.get('train_shape')[0] > 1:
            params['n_neighbors'] = min(params['n_neighbors'], kwargs.get('train_shape')[0])
        params.pop('standardize', None)  # internal parameter, not for actual underlying sklearn model
        if params_was_None:
            # in case some function didn't preserve reference
            self.params = params
        return params  # default is no transcription

    def fit(self, X, y, sample_weight=None, eval_set=None, sample_weight_eval_set=None, **kwargs):
        """Fit a KNN classifier/regressor on X, y.

        Imputes missing values (basic_impute), optionally standardizes features,
        label-encodes classification targets, and persists (model, scaler, imputation
        mins) via set_model_properties so predict() can replay the preprocessing.

        Note: sample_weight, eval_set and sample_weight_eval_set are accepted for
        interface compatibility but are not used in this body.
        """
        # system thing, doesn't need to be set in default or mutate, just at runtime in fit, into self.params so can see
        # NOTE(review): physical_cores_count is used as a value here, not called —
        # confirm against h2oaicore.systemutils whether it is a constant or a callable.
        self.params["n_jobs"] = self.params_base.get('n_jobs', max(1, physical_cores_count))
        # work on a copy so transcribe_params' pops don't strip keys from self.params
        params = self.params.copy()
        params = self.transcribe_params(params, train_shape=X.shape)
        loggerinfo(self.get_logger(**kwargs), "%s fit params: %s" % (self.display_name, dict(params)))
        loggerinfo(self.get_logger(**kwargs), "%s data: %s %s" % (self.display_name, X.shape, y.shape))
        X = dt.Frame(X)
        orig_cols = list(X.names)
        if self.num_classes >= 2:
            model = KNeighborsClassifier(**params)
            # encode labels to 0..n_classes-1 as sklearn expects
            lb = LabelEncoder()
            lb.fit(self.labels)
            y = lb.transform(y)
        else:
            model = KNeighborsRegressor(**params)
        X = self.basic_impute(X)
        X = X.to_numpy()
        if self.params.get('standardize', False):  # self.params since params has it popped out
            standard_scaler = StandardScaler()
            X = standard_scaler.fit_transform(X)
        else:
            standard_scaler = None
        model.fit(X, y)
        importances = self.get_basic_importances(X, y)
        # persist everything predict() needs: the fitted model, the scaler (or None),
        # and the per-column imputation values computed by basic_impute
        self.set_model_properties(model=(model, standard_scaler, self.min),
                                  features=orig_cols,
                                  importances=importances.tolist(),  # abs(model.coef_[0])
                                  iterations=0)

    def basic_impute(self, X):
        """Impute missing/infinite values with a per-column value below all observed values.

        Imputation values are cached in self.min so that predict() (after restoring
        self.min from the stored model tuple) applies identical imputation.

        :param X: datatable Frame
        :return: datatable Frame with no remaining NAs
        """
        # scikit extra trees internally converts to np.float32 during all operations,
        # so if float64 datatable, need to cast first, in case will be nan for float32
        from h2oaicore.systemutils import update_precision
        X = update_precision(X, data_type=np.float32, override_with_data_type=True, fixup_almost_numeric=True)
        # Replace missing values with a value smaller than all observed values
        if not hasattr(self, 'min') or not isinstance(self.min, dict):
            self.min = dict()
        for col in X.names:
            XX = X[:, col]
            if col not in self.min:
                self.min[col] = XX.min1()
                if self.min[col] is None or np.isnan(self.min[col]) or np.isinf(self.min[col]):
                    # fully-missing/degenerate column: fall back to a large negative sentinel
                    self.min[col] = -1e10
                else:
                    # strictly below the observed minimum so imputed rows are distinguishable
                    self.min[col] -= 1
            XX.replace([None, np.inf, -np.inf], self.min[col])
            X[:, col] = XX
            assert X[dt.isna(dt.f[col]), col].nrows == 0
        return X

    def get_basic_importances(self, X, y):
        """Derive surrogate feature importances from a Ridge fit on standardized X.

        KNN has no native importances; |Ridge coefficients| give the genetic
        algorithm something to consume.

        :return: np.ndarray of non-negative importances, one per feature
        """
        # Just get feature importance using basic model so at least something to consume by genetic algorithm
        if isinstance(X, dt.Frame):
            X = X.to_numpy()
        elif isinstance(X, pd.DataFrame):
            X = X.values
        # not in self, just for importances regardless of real model behavior
        standard_scaler = StandardScaler()
        X = standard_scaler.fit_transform(X)
        from sklearn.linear_model import Ridge  # will be used to derive feature importances
        feature_model = Ridge(alpha=1., random_state=self.random_state)
        feature_model.fit(X, y)
        return np.array(abs(feature_model.coef_))

    def predict(self, X, **kwargs):
        """Predict on X: class probabilities for classification, values for regression.

        Restores the fitted model, optional scaler, and imputation mins saved in fit,
        then replays the same impute/scale preprocessing. Shapley contributions
        (pred_contribs) are not supported.
        """
        model_tuple, _, _, _ = self.get_model_properties()
        if len(model_tuple) == 3:
            model, standard_scaler, self.min = model_tuple
        else:
            # migration for old recipe version
            model = model_tuple
            standard_scaler = None
            self.min = dict()
        X = dt.Frame(X)
        X = self.basic_impute(X)
        X = X.to_numpy()
        if standard_scaler is not None:
            X = standard_scaler.transform(X)
        pred_contribs = kwargs.get('pred_contribs', None)
        output_margin = kwargs.get('output_margin', None)
        if not pred_contribs:
            if self.num_classes == 1:
                preds = model.predict(X)
            else:
                preds = model.predict_proba(X)
                # preds = (prob_pos - prob_pos.min()) / (prob_pos.max() - prob_pos.min())
            return preds
        else:
            raise NotImplementedError("No Shapley for K-nearest model")