# daal_trees.py -- forked from h2oai/driverlessai-recipes
"""Binary Classification and Regression for Decision Forest and Gradient Boosting based on Intel DAAL"""
import datatable as dt
from h2oaicore.models import CustomModel
import numpy as np
from h2oaicore.systemutils_more import arch_type
from sklearn.preprocessing import LabelEncoder
# Module-level switch for the daal4py backend. The architecture test excludes
# ppc64le, and the trailing ``and False`` force-disables DAAL everywhere, so
# this is currently always False and daal4py is never imported.
enable_daal = arch_type != 'ppc64le' and False  # WIP until figure out how to support on py38
if enable_daal:
    import daal4py as d4p
class DaalBaseModel(object):
    """Shared fit/predict plumbing for the daal4py-backed recipe models.

    This class is a mixin.  It reads ``self.params``, ``self.num_classes`` and
    ``self.labels`` and calls ``self.set_model_properties`` /
    ``self.get_model_properties``, none of which are defined here.  Concrete
    classes must also provide the four algorithm factories
    ``_train_func_class``, ``_predict_func_class``, ``_train_func_regress``
    and ``_predict_func_regress``.
    """

    _regression = True
    _binary = True  # FIXME: but returns class, not probabilities
    _multiclass = False  # FIXME: shape issue
    _can_use_gpu = False
    _is_reproducible = False

    @staticmethod
    def is_enabled():
        """Return whether this recipe may be used; currently always ``False``."""
        # The import is unused while the recipe is disabled; it is kept so the
        # architecture check below can be restored unchanged.
        from h2oaicore.systemutils_more import arch_type
        # return not (arch_type == "ppc64le")
        return False  # WIP until figure out how to support on py38

    def dt_to_numpy(self, X, y=None):
        """Convert a datatable frame and optional target to contiguous numpy arrays.

        The element type is ``np.float32`` when ``self.params['fptype']`` is
        ``'float'`` and ``np.float64`` otherwise.

        :param X: feature data; must be a ``dt.Frame``
        :param y: optional target values; reshaped to ``(X.shape[0], 1)``
        :return: tuple ``(X, y)``; ``y`` stays ``None`` when it was not given
        :raises TypeError: if ``X`` is not a ``dt.Frame``
        """
        if not isinstance(X, dt.Frame):
            # A bare ``raise`` here would fail with "No active exception to
            # reraise" (or re-raise some unrelated in-flight exception), so
            # report the real problem explicitly.
            raise TypeError(
                "Expected X to be a datatable Frame, got %s" % type(X).__name__)

        dtype = np.float32 if self.params['fptype'] == 'float' else np.float64
        X = np.ascontiguousarray(X.to_numpy(), dtype=dtype)
        if y is not None:
            y = np.ascontiguousarray(y, dtype=dtype).reshape(X.shape[0], 1)
        return X, y

    def fit(self, X, y, sample_weight=None, eval_set=None, sample_weight_eval_set=None, **kwargs):
        """Train the configured algorithm on ``X``/``y`` and store the result.

        ``sample_weight``, ``eval_set``, ``sample_weight_eval_set`` and
        ``kwargs`` are accepted but not used.

        :param X: training features as a ``dt.Frame``
        :param y: training target
        """
        if self.num_classes > 1:
            # Classification: encode the target with an encoder fitted on the
            # known label set.
            lb = LabelEncoder()
            lb.fit(self.labels)
            y = lb.transform(y)

        # Column names must be captured before X is turned into a numpy array.
        X_features = list(X.names)
        X, y = self.dt_to_numpy(X, y)

        if self.num_classes == 1:
            train_func = self._train_func_regress
        else:
            train_func = self._train_func_class
        train_result = train_func(**self.params).compute(X, y)

        # predict() needs the class count and float type again, so they are
        # stored next to the training result.
        model_tuple = (train_result, self.num_classes, self.params['fptype'])

        if hasattr(train_result, 'variableImportance'):
            importances = train_result.variableImportance.tolist()[0]
        else:
            # No importances reported by the algorithm: weight all features equally.
            importances = np.ones(len(X_features))

        self.set_model_properties(model=model_tuple,
                                  features=X_features,
                                  importances=importances,
                                  iterations=self.params.get('nTrees', self.params.get('maxIterations', 100)))

    def predict(self, X, **kwargs):
        """Score ``X`` with the model stored by :meth:`fit`.

        :param X: features as a ``dt.Frame``
        :return: the algorithm's ``prediction`` array, flattened with
            ``ravel()`` when ``self.num_classes <= 2``
        """
        model_tuple, _, _, _ = self.get_model_properties()
        train_result, n_classes, fptype = model_tuple

        if self.num_classes == 1:
            predict_func = self._predict_func_regress
            other_kwargs = {}
        else:
            predict_func = self._predict_func_class
            other_kwargs = {'nClasses': n_classes}
        predict_algo = predict_func(fptype=fptype, **other_kwargs)

        X, _ = self.dt_to_numpy(X, None)

        # This is not optimal at the moment because it returns the 0/1 label and not a probability.
        # So the ROC curve in DAI looks very jagged. A future version of DAAL Decision Forest will
        # support predicting probabilities as well as the label.
        result = predict_algo.compute(X, train_result.model).prediction
        if self.num_classes <= 2:
            result = result.ravel()
        return result
class DaalTreeModel(DaalBaseModel, CustomModel):
    """Recipe model wired to daal4py's ``gbt_*`` training/prediction algorithms."""

    _display_name = "DaalTree"
    _description = "Decision Tree Model based on Intel DAAL (https://intelpython.github.io/daal4py/algorithms.html)"

    # ``d4p`` is only imported when ``enable_daal`` is true; otherwise the
    # factories are left as ``None`` so the class body can still be evaluated.
    if enable_daal:
        _train_func_class = d4p.gbt_classification_training
        _predict_func_class = d4p.gbt_classification_prediction
        _train_func_regress = d4p.gbt_regression_training
        _predict_func_regress = d4p.gbt_regression_prediction
    else:
        _train_func_class = None
        _predict_func_class = None
        _train_func_regress = None
        _predict_func_regress = None

    def set_default_params(self, accuracy=None, time_tolerance=None, interpretability=None, **kwargs):
        """Populate ``self.params`` with the default training parameters.

        ``accuracy``, ``time_tolerance``, ``interpretability`` and ``kwargs``
        are accepted but not used.  ``DaalBaseModel.fit`` passes the resulting
        dict as keyword arguments to the training algorithm.
        """
        self.params = {
            'nClasses': self.num_classes,
            'fptype': 'float',
            'maxIterations': 200,
            'maxTreeDepth': 6,
            'minSplitLoss': 0.1,
            'shrinkage': 0.3,
            'observationsPerTreeFraction': 1,
            'lambda_': 1,
            'maxBins': 256,
            'featuresPerNode': 0,
            'minBinSize': 5,
            'memorySavingMode': False,
            'minObservationsInLeafNode': 1,
        }
        if self.num_classes == 1:
            # Regression: these keys are removed before training.
            # NOTE(review): the extent of this branch is reconstructed (all
            # three pops assumed conditional) -- confirm against upstream.
            for key in ('nClasses', 'nTrees', 'maxIterations'):
                self.params.pop(key, None)
class DaalForestModel(DaalBaseModel, CustomModel):
    """Recipe model wired to daal4py's ``decision_forest_*`` algorithms."""

    _display_name = "DaalForest"
    _description = "Decision Forest Model based on Intel DAAL (https://intelpython.github.io/daal4py/algorithms.html)"

    # ``d4p`` is only imported when ``enable_daal`` is true; otherwise the
    # factories are left as ``None`` so the class body can still be evaluated.
    if enable_daal:
        _train_func_class = d4p.decision_forest_classification_training
        _predict_func_class = d4p.decision_forest_classification_prediction
        _train_func_regress = d4p.decision_forest_regression_training
        _predict_func_regress = d4p.decision_forest_regression_prediction
    else:
        _train_func_class = None
        _predict_func_class = None
        _train_func_regress = None
        _predict_func_regress = None

    def set_default_params(self, accuracy=None, time_tolerance=None, interpretability=None, **kwargs):
        """Populate ``self.params`` with the default training parameters.

        ``accuracy``, ``time_tolerance``, ``interpretability`` and ``kwargs``
        are accepted but not used.  ``DaalBaseModel.fit`` passes the resulting
        dict as keyword arguments to the training algorithm.
        """
        self.params = dict(nClasses=self.num_classes,
                           fptype='float',
                           varImportance='MDI',
                           nTrees=100)
        if self.num_classes == 1:
            # Regression: these keys are removed before training.
            # NOTE(review): the extent of this branch is reconstructed (all
            # three pops assumed conditional) -- confirm against upstream.
            for key in ('nClasses', 'nTrees', 'maxIterations'):
                self.params.pop(key, None)
def _setup_recipe():
    """Download and unpack the daal4py binaries into the contrib environment.

    The function currently returns ``True`` right after its imports, so the
    installation steps below that point are deliberately unreachable while
    the recipe is disabled.

    :return: ``True``
    :raises RuntimeError: on ppc64le when ``config.hard_asserts`` is false
        (only once the early return is removed)
    """
    # for DAI 1.7.0 one is required to run this function manually
    # in DAI >=1.7.1, this function will be run by DAI itself
    #
    # Keep every import and all logic inside this function: per the note on
    # the arch_type import it is parsed out and run separately, so
    # module-level names cannot be relied on here.
    import glob
    import os
    import shutil
    from h2oaicore.systemutils_more import extract, download
    from h2oaicore.systemutils import config, remove
    from h2oaicore.systemutils import user_dir
    from h2oaicore.systemutils_more import arch_type  # don't remove this import, setup_recipe parsed-out separately

    return True  # WIP: Disable daal for now in general, just leave recipe floating there for migration purposes

    if arch_type == "ppc64le":
        if config.hard_asserts:
            # in CI testing just ignore
            return True
        # for user use, raise
        raise RuntimeError("Cannot use daal on PPC")

    env_path = os.path.join(user_dir(), config.contrib_env_relative_directory)
    daal_is_installed_path = os.path.join(env_path, "daal")
    daal_is_installed_file = os.path.join(daal_is_installed_path, "daal_is_installed")
    if os.path.isfile(daal_is_installed_file):
        # Marker file written by an earlier successful run: nothing to do.
        return True

    daal_temp_path = os.path.join(user_dir(), config.contrib_relative_directory, "daal")
    os.makedirs(daal_temp_path, exist_ok=True)

    prefix = "https://anaconda.org/intel"
    primary_urls = (
        "%s/daal4py/2021.2.0/download/linux-64/daal4py-2021.2.0-py38_intel_358.tar.bz2" % prefix,
        "%s/impi_rt/2021.2.0/download/linux-64/impi_rt-2021.2.0-intel_215.tar.bz2" % prefix,
        "%s/daal/2021.2.0/download/linux-64/daal-2021.2.0-intel_358.tar.bz2" % prefix,
        "https://github.com/intel/daal/releases/download/2019_u4/l_daal_oss_p_2019.4.007.tgz",
    )
    fallback_urls = (
        "https://0xdata-public.s3.amazonaws.com/daal4py-2019.4-py36h7b7c402_6.tar.bz2",
        "https://0xdata-public.s3.amazonaws.com/impi_rt-2019.4-intel_243.tar.bz2",
        "https://0xdata-public.s3.amazonaws.com/daal-2019.4-intel_243.tar.bz2",
        "https://0xdata-public.s3.amazonaws.com/l_daal_oss_p_2019.4.007.tgz",
    )
    try:
        archives = [download(url, dest_path=daal_temp_path) for url in primary_urls]
    except Exception:
        # Any failure on the primary set falls back to the mirrored archives.
        # ``Exception`` rather than a bare ``except`` so that
        # KeyboardInterrupt/SystemExit still abort the setup.
        archives = [download(url, dest_path=daal_temp_path) for url in fallback_urls]

    os.makedirs(os.path.join(env_path, "info"), exist_ok=True)

    # The first three archives are extracted with the default mode; the last
    # one is a .tgz and is extracted with the "gz" argument.
    for archive in archives[:3]:
        extract(archive, env_path)
    extract(archives[3], env_path, "gz")

    # Copy the shared libraries from both locations into <env>/lib, leaving
    # any file that is already present there untouched.
    lib_path = os.path.join(env_path, "lib")
    shared_lib_dirs = (
        os.path.join(env_path, "lib/libfabric/"),
        os.path.join(env_path,
                     "l_daal_oss_p_2019.4.007/daal_prebuild/linux/tbb/lib/intel64_lin/gcc4.4/"),
    )
    for shared_lib_dir in shared_lib_dirs:
        for lib_file in glob.glob(os.path.join(shared_lib_dir, "*.so*")):
            new_file = os.path.join(lib_path, os.path.basename(lib_file))
            if not os.path.isfile(new_file):
                shutil.copy(lib_file, new_file)

    # Write the marker last so a partially completed install is retried.
    os.makedirs(daal_is_installed_path, exist_ok=True)
    with open(daal_is_installed_file, "wt") as f:
        f.write("DONE")

    for archive in archives:
        remove(archive)
    return True