# feature_selection.py (forked from h2oai/driverlessai-recipes)
"""Perform feature selection by using target perturbation technique"""
# Article: https://academic.oup.com/bioinformatics/article/26/10/1340/193348
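# Null importance ("target permutation") feature selection, per the article
# above: fit a model on the real target to obtain actual importances, then
# refit many times on a shuffled target to build a per-feature null
# distribution of importances; features whose actual importance does not
# clearly exceed a high percentile of their null distribution are pruned.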
from typing import Union, List
from h2oaicore.data import CustomData
import datatable as dt
import numpy as np
import pandas as pd
import os
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder

##### Global variables #####
cols2ignore = [
    "ID"
]  # these columns will be ignored by feature selection and remain in the dataset
target = "target"  # target column to use
is_regression = False  # problem type: regression or classification
number_of_iterations = 100  # how many target perturbations to perform
threshold = 75  # percentile of the null importance score distribution to use (higher number - more aggressive feature pruning)
importance = "gain"  # which importance measure to use: 'gain' or 'split'
lgbm_params = {
    "n_estimators": 250,
    "num_leaves": 256,
    "boosting_type": "rf",
    "colsample_bytree": 0.75,
    "subsample": 0.632,  # standard RF bagging fraction
    "subsample_freq": 1,
    "n_jobs": -1,
}
##### End of global variables #####
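
# Note on lgbm_params: boosting_type="rf" switches LightGBM into random forest
# mode, which requires bagging to be enabled; that is why subsample is set
# below 1.0 and subsample_freq to 1, while colsample_bytree adds per-tree
# feature subsampling on top.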


def get_feature_importances(data, shuffle, cats=[], seed=None):
    # Gather real features
    train_features = [f for f in data if f not in [target] + cols2ignore]
    # Shuffle target if required
    y = data[target].copy()
    if shuffle:
        y = data[target].copy().sample(frac=1.0, random_state=seed + 4)

    from h2oaicore.lightgbm_dynamic import import_lightgbm

    lgbm = import_lightgbm()  # ensure the dynamic LightGBM build is loaded first
    import lightgbm as lgbm

    if is_regression:
        model = lgbm.LGBMRegressor(
            random_state=seed, importance_type=importance, **lgbm_params
        )
    else:
        model = lgbm.LGBMClassifier(
            random_state=seed, importance_type=importance, **lgbm_params
        )
        y = LabelEncoder().fit_transform(y)
    # Fit LightGBM in RF mode; it is quicker than sklearn's RandomForest
    model.fit(data[train_features], y, categorical_feature=cats)
    # Get feature importances
    imp_df = pd.DataFrame()
    imp_df["feature"] = list(train_features)
    imp_df["importance"] = model.feature_importances_
    return imp_df
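
# Hypothetical usage sketch for the helper above (assumes a pandas DataFrame
# `df` containing the configured `target` column; `df` and the seeds here are
# illustrative, not part of the recipe):
#     actual_imp = get_feature_importances(data=df, shuffle=False, seed=42)
#     one_null_run = get_feature_importances(data=df, shuffle=True, seed=7)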


class FeatureSelection(CustomData):
    @staticmethod
    def do_acceptance_test():
        return False

    @staticmethod
    def create_data(X: dt.Frame = None):
        if X is None:
            return []
        data = X.to_pandas().copy()
        # Identify categorical columns and transform them
        cats = [
            x
            for x in data.select_dtypes(exclude=np.number).columns
            if x not in [target] + cols2ignore
        ]
        for c in cats:
            data[c] = OrdinalEncoder().fit_transform(
                data[c].astype(str).values.reshape(-1, 1)
            )
        # Get the actual importance, i.e. without shuffling
        actual_imp_df = get_feature_importances(
            data=data, cats=cats, shuffle=False, seed=42
        )
        # Seed the unexpected randomness of this world
        np.random.seed(123)
        seeds = np.random.randint(0, 2 ** 30, size=number_of_iterations)
        null_imp_df = pd.DataFrame()
        for i, s in enumerate(seeds):
            # Get current run importances
            imp_df = get_feature_importances(data=data, cats=cats, shuffle=True, seed=s)
            imp_df["run"] = i + 1
            # Concat the latest importances with the old ones
            null_imp_df = pd.concat([null_imp_df, imp_df], axis=0)
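
        # Worked example of the score computed below (hypothetical numbers):
        # with an actual gain importance of 120 and a 75th-percentile null
        # importance of 30, score = log(1e-10 + 120 / (1 + 30)) ≈ 1.35 > 0, so
        # the feature is kept; with an actual importance of 20 instead, the
        # score is ≈ -0.44 and the feature is pruned.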
        feature_scores = []
        for _f in actual_imp_df["feature"].unique():
            f_null_imps_gain = null_imp_df.loc[
                null_imp_df["feature"] == _f, "importance"
            ].values
            f_act_imps_gain = actual_imp_df.loc[
                actual_imp_df["feature"] == _f, "importance"
            ].mean()
            # Score: log ratio of actual importance to a high percentile of the
            # null distribution (threshold is clamped to the [75, 99] range)
            _score = np.log(
                1e-10
                + f_act_imps_gain
                / (1 + np.percentile(f_null_imps_gain, max(75, min(99, threshold))))
            )
            feature_scores.append((_f, _score))
        scores_df = pd.DataFrame(feature_scores, columns=["feature", "score"])
        # Final feature selection: keep features with a positive score
        selected_features = scores_df[scores_df["score"] > 0]["feature"].values.tolist()
        selected_features = np.unique(selected_features).tolist()
        # Return the original (untransformed) data restricted to the selection
        data = X.to_pandas().copy()
        return data[cols2ignore + selected_features + [target]]
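

# Minimal standalone sketch, not part of the Driverless AI recipe API
# (assumptions: h2oaicore is importable outside Driverless AI, and "train.csv"
# is a placeholder path to a dataset containing the configured target and ID
# columns):
if __name__ == "__main__":
    frame = dt.fread("train.csv")  # hypothetical input file
    selected = FeatureSelection.create_data(frame)  # returns a pandas DataFrame
    print(selected.columns.tolist())  # columns that survived selection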