forked from h2oai/driverlessai-recipes
-
Notifications
You must be signed in to change notification settings - Fork 1
/
adaboost.py
146 lines (135 loc) · 4.86 KB
/
adaboost.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
"""Adaboost model from sklearn"""
import datatable as dt
import numpy as np
from h2oaicore.models import CustomModel
from sklearn.ensemble import AdaBoostClassifier, AdaBoostRegressor
from sklearn.preprocessing import LabelEncoder
from h2oaicore.systemutils import physical_cores_count, config
class AdaBoostModel(CustomModel):
    """Driverless AI custom model wrapping sklearn's AdaBoost estimators.

    Dispatches on the problem type: ``AdaBoostClassifier`` for binary and
    multiclass problems (``self.num_classes >= 2``), ``AdaBoostRegressor``
    for regression (``self.num_classes == 1``).
    """

    # Problem types supported by this recipe.
    _regression = True
    _binary = True
    _multiclass = True
    _display_name = "AdaBoost"
    _description = "AdaBoost Model based on sklearn"
    _testing_can_skip_failure = False  # ensure tested as if shouldn't fail
    _parallel_task = True
    _use_single_core_if_many = True

    @staticmethod
    def can_use(
        accuracy,
        interpretability,
        train_shape=None,
        test_shape=None,
        valid_shape=None,
        n_gpus=0,
        num_classes=None,
        **kwargs
    ):
        """Decide whether this model may be used for the given experiment.

        Always usable in normal operation; under ``config.hard_asserts``
        (testing mode) it is restricted to small datasets because AdaBoost
        is too slow to test on large ones.
        """
        if config.hard_asserts:
            # for bigger data, too slow to test even with 1 iteration.
            # Note operator precedence: (train small) OR (valid small).
            use = (
                train_shape is not None
                and train_shape[0] * train_shape[1] < 1024 * 1024
                or valid_shape is not None
                and valid_shape[0] * valid_shape[1] < 1024 * 1024
            )
            # too slow for walmart with only 421k x 15; also require few columns
            use &= train_shape is not None and train_shape[1] < 10
            return use
        else:
            return True

    def set_default_params(
        self, accuracy=None, time_tolerance=None, interpretability=None, **kwargs
    ):
        """Initialize ``self.params`` with defaults for the sklearn estimator."""
        # Fill up parameters we care about; cap n_estimators at 1000.
        n_estimators = min(kwargs.get("n_estimators", 100), 1000)
        if config.hard_asserts:
            # for testing avoid too many trees
            n_estimators = 10
        self.params = dict(
            random_state=kwargs.get("random_state", 1234), n_estimators=n_estimators
        )

    def mutate_params(self, accuracy=10, **kwargs):
        """Randomly mutate hyperparameters, scaling search space by accuracy dial."""
        # Higher accuracy settings allow larger ensembles.
        if accuracy > 8:
            estimators_list = [100, 200, 300, 500, 1000, 2000]
        elif accuracy >= 5:
            estimators_list = [50, 100, 200, 300, 400, 500]
        elif accuracy >= 3:
            estimators_list = [10, 50, 100]
        elif accuracy >= 2:
            estimators_list = [10, 50]
        else:
            estimators_list = [10]
        if config.hard_asserts:
            # for testing avoid too many trees
            estimators_list = [10]
        # Modify certain parameters for tuning
        self.params["n_estimators"] = int(np.random.choice(estimators_list))
        if self.num_classes == 1:
            # Regression only: AdaBoostRegressor's loss parameter.
            self.params["loss"] = np.random.choice(["linear", "square", "exponential"])

    def fit(
        self,
        X,
        y,
        sample_weight=None,
        eval_set=None,
        sample_weight_eval_set=None,
        **kwargs
    ):
        """Fit the AdaBoost estimator on a datatable Frame ``X`` and target ``y``.

        Stores the fitted model, feature names and importances via
        ``set_model_properties``.  ``sample_weight`` and eval sets are
        accepted for interface compatibility but not used here.
        """
        orig_cols = list(X.names)
        if self.num_classes >= 2:
            # Fit the encoder on the full known label set so transform
            # covers labels possibly absent from this particular y.
            lb = LabelEncoder()
            lb.fit(self.labels)
            y = lb.transform(y)
            model = AdaBoostClassifier(**self.params)
        else:
            model = AdaBoostRegressor(**self.params)
        # Impute missing values before converting to a dense numpy array.
        X = self.basic_impute(X)
        X = X.to_numpy()
        model.fit(X, y)
        importances = np.array(model.feature_importances_)
        self.set_model_properties(
            model=model,
            features=orig_cols,
            importances=importances.tolist(),
            iterations=self.params["n_estimators"],
        )

    def basic_impute(self, X):
        """Replace missing/inf values per column with a below-minimum sentinel.

        Per-column sentinels are computed on first call (fit) and cached in
        ``self.min`` so the same values are reused at predict time.
        Mutates and returns the datatable Frame ``X``.
        """
        # scikit extra trees internally converts to np.float32 during all operations,
        # so if float64 datatable, need to cast first, in case will be nan for float32
        from h2oaicore.systemutils import update_precision
        X = update_precision(
            X,
            data_type=np.float32,
            override_with_data_type=True,
            fixup_almost_numeric=True,
        )
        # Replace missing values with a value smaller than all observed values
        if not hasattr(self, "min"):
            self.min = dict()
        for col in X.names:
            XX = X[:, col]
            if col not in self.min:
                # First sight of this column: derive and cache its sentinel.
                self.min[col] = XX.min1()
                if (
                    self.min[col] is None
                    or np.isnan(self.min[col])
                    or np.isinf(self.min[col])
                ):
                    # No usable minimum (all-missing or non-finite column).
                    self.min[col] = -1e10
                else:
                    # Strictly below every observed value.
                    self.min[col] -= 1
            XX.replace([None, np.inf, -np.inf], self.min[col])
            X[:, col] = XX
            # Sanity check: no NAs may remain after imputation.
            assert X[dt.isna(dt.f[col]), col].nrows == 0
        return X

    def predict(self, X, **kwargs):
        """Predict with the fitted model.

        Returns class probabilities for classification, raw predictions for
        regression.  Applies the same imputation (cached sentinels) as fit.
        """
        X = dt.Frame(X)
        X = self.basic_impute(X)
        X = X.to_numpy()
        model, _, _, _ = self.get_model_properties()
        if self.num_classes == 1:
            preds = model.predict(X)
        else:
            preds = model.predict_proba(X)
        return preds