From 5ada8a95b5575f2d8274b182347ded4ed5dbede5 Mon Sep 17 00:00:00 2001 From: Chandan Singh Date: Fri, 12 Feb 2021 09:51:42 -0800 Subject: [PATCH] update docs --- docs/index.html | 36 +- .../bayesian_rule_list.html | 333 +++---------- docs/rule_set/fplasso.html | 399 ++++++++++++++++ docs/rule_set/fpskope.html | 307 ++++++++++++ docs/rule_set/index.html | 10 + docs/rule_set/rule_fit.html | 226 +++------ docs/rule_set/rule_set.html | 28 +- docs/rule_set/skope_rules.html | 290 ++++-------- docs/util/convert.html | 74 ++- docs/util/discretization/index.html | 8 +- docs/util/discretization/mdlp.html | 385 ++++++++++++++- docs/util/extract.html | 442 ++++++++++++++++++ docs/util/index.html | 5 + docs/util/prune.html | 46 +- docs/util/rule.html | 98 ++-- docs/util/score.html | 6 +- readme.md | 20 +- 17 files changed, 1968 insertions(+), 745 deletions(-) create mode 100644 docs/rule_set/fplasso.html create mode 100644 docs/rule_set/fpskope.html create mode 100644 docs/util/extract.html diff --git a/docs/index.html b/docs/index.html index 37cf0238..2d995473 100644 --- a/docs/index.html +++ b/docs/index.html @@ -27,15 +27,17 @@

Interpretable machine-learning models (imodels) 🔍

Python package for concise, transparent, and accurate predictive modeling. All sklearn-compatible and easily customizable.

-Github • +docsimodels overview • -Demo notebooks +demo notebooks


imodels overview

Implementations of popular interpretable models that can be easily installed and used:

@@ -46,6 +48,12 @@

imodels overview

model.fit(X_train, y_train)   # fit model
preds = model.predict(X_test) # discrete predictions: shape is (n_test, 1)
preds_proba = model.predict_proba(X_test) # predicted probabilities: shape is (n_test, n_classes)
+print(model) # print the rule-based model
+
+-----------------------------
+# if X1 > 5: then 80.5% risk
+# else if X2 > 5: then 40% risk
+# else: 10% risk

Install with pip install imodels (see here for help). Contains the following models:

@@ -134,10 +142,18 @@

imodels overview

Different models and algorithms vary not only in their final form but also in the choices made during modeling. In particular, many models differ in the 3 steps given by the table below.

- +
+ex. RuleFit and SkopeRules +RuleFit and SkopeRules differ only in the way they prune rules: RuleFit uses a linear model whereas SkopeRules heuristically deduplicates overlapping rules.
+
+ex. Bayesian rule lists and greedy rule lists +Bayesian rule lists and greedy rule lists differ in how they select rules: Bayesian rule lists perform a global optimization over possible rule lists, while greedy rule lists pick splits sequentially to maximize a given criterion.
+
+ex. FPSkope and SkopeRules +FPSkope and SkopeRules differ only in the way they generate candidate rules: FPSkope uses FPgrowth whereas SkopeRules extracts rules from decision trees. +

See the docs for individual models for further descriptions.
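Because every model above exposes the same sklearn-style fit/predict interface, swapping between the alternatives compared in the table is a one-line change. A minimal sketch of this (X_train, y_train, and X_test are assumed to be a small binary-classification dataset; they are not part of this patch):

from imodels import GreedyRuleListClassifier, RuleFitClassifier, SkopeRulesClassifier

# any of the compared models can be dropped into the same pipeline
for Model in (GreedyRuleListClassifier, RuleFitClassifier, SkopeRulesClassifier):
    model = Model()
    model.fit(X_train, y_train)
    preds = model.predict(X_test)  # identical call for every model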

@@ -279,6 +295,8 @@

References

from .rule_list.greedy_rule_list import GreedyRuleListClassifier from .rule_list.one_r import OneRClassifier from .rule_set.rule_fit import RuleFitRegressor, RuleFitClassifier +from .rule_set.fplasso import FPLassoRegressor, FPLassoClassifier +from .rule_set.fpskope import FPSkopeClassifier from .rule_set.skope_rules import SkopeRulesClassifier from .rule_set.boosted_rules import BoostedRulesClassifier # from .tree.iterative_random_forest.iterative_random_forest import IRFClassifier diff --git a/docs/rule_list/bayesian_rule_list/bayesian_rule_list.html b/docs/rule_list/bayesian_rule_list/bayesian_rule_list.html index 344b5b57..07aa3c38 100644 --- a/docs/rule_list/bayesian_rule_list/bayesian_rule_list.html +++ b/docs/rule_list/bayesian_rule_list/bayesian_rule_list.html @@ -34,8 +34,9 @@

Module imodels.rule_list.bayesian_rule_list.bayesian_rul from sklearn.base import BaseEstimator from imodels.rule_list.bayesian_rule_list.brl_util import * -from imodels.util.discretization.mdlp import MDLP_Discretizer +from imodels.util.discretization.mdlp import MDLP_Discretizer, BRLDiscretizer from imodels.rule_list.rule_list import RuleList +from imodels.util.extract import extract_fpgrowth class BayesianRuleListClassifier(BaseEstimator, RuleList): @@ -112,34 +113,6 @@

Module imodels.rule_list.bayesian_rule_list.bayesian_rul feature_labels = ["ft" + str(i + 1) for i in range(len(X[0]))] self.feature_labels = feature_labels - def _discretize_mixed_data(self, X, y, undiscretized_features=[]): - if type(X) != list: - X = np.array(X).tolist() - - # check which features are numeric (to be discretized) - self.discretized_features = [] - for fi in range(len(X[0])): - # if not string, and not specified as undiscretized - if isinstance(X[0][fi], numbers.Number) \ - and (len(self.feature_labels) == 0 or \ - len(undiscretized_features) == 0 or \ - self.feature_labels[fi] not in undiscretized_features): - self.discretized_features.append(self.feature_labels[fi]) - - if len(self.discretized_features) > 0: - if self.verbose: - print( - "Warning: non-categorical data found. Trying to discretize. (Please convert categorical values to " - "strings, and/or specify the argument 'undiscretized_features', to avoid this.)") - X = self.discretize(X, y) - - return X - - def _setdata(self, X, y, feature_labels=[], undiscretized_features=[]): - self._setlabels(X, feature_labels) - X = self._discretize_mixed_data(X, y, undiscretized_features) - return X, y - def fit(self, X, y, feature_labels: list=None, undiscretized_features=[], verbose=False): """Fit rule lists to data @@ -171,50 +144,25 @@

Module imodels.rule_list.bayesian_rule_list.bayesian_rul if len(set(y)) != 2: raise Exception("Only binary classification is supported at this time!") - # deal with pandas data - if type(X) in [pd.DataFrame, pd.Series]: - if feature_labels is None: - feature_labels = X.columns - X = X.values - if type(y) in [pd.DataFrame, pd.Series]: - y = y.values - - if feature_labels is None: - feature_labels = [f'X{i}' for i in range(X.shape[1])] - - X, y = self._setdata(X, y, feature_labels, undiscretized_features) - permsdic = defaultdict(default_permsdic) # We will store here the MCMC results - data = list(X[:]) - - # Now find frequent itemsets - - X_colname_removed = data.copy() - for i in range(len(data)): - X_colname_removed[i] = list(map(lambda s: s.split(' : ')[1], X_colname_removed[i])) - - X_df_categorical = pd.DataFrame(X_colname_removed, columns=feature_labels) - X_df_onehot = pd.get_dummies(X_df_categorical) - onehot_features = X_df_onehot.columns - - itemsets_df = fpgrowth(X_df_onehot, min_support=self.minsupport, max_len=self.maxcardinality) - itemsets_indices = [tuple(s[1]) for s in itemsets_df.values] - itemsets = [np.array(onehot_features)[list(inds)] for inds in itemsets_indices] - itemsets = list(map(tuple, itemsets)) - if self.verbose: - print(len(itemsets), 'rules mined') - - + itemsets, self.discretizer = extract_fpgrowth(X, y, + feature_labels=feature_labels, + minsupport=self.minsupport, + maxcardinality=self.maxcardinality, + undiscretized_features=undiscretized_features, + verbose=verbose) + + self.feature_labels = self.discretizer.feature_labels + X_df_onehot = self.discretizer.onehot_df + # Now form the data-vs.-lhs set # X[j] is the set of data points that contain itemset j (that is, satisfy rule j) for c in X_df_onehot.columns: X_df_onehot[c] = [c if x == 1 else '' for x in list(X_df_onehot[c])] X = [{}] * (len(itemsets) + 1) - X[0] = set(range(len(data))) # the default rule satisfies all data + X[0] = set(range(len(X_df_onehot))) # the default rule satisfies all data for (j, lhs) in enumerate(itemsets): X[j + 1] = set([i for (i, xi) in enumerate(X_df_onehot.values) if set(lhs).issubset(xi)]) - - # now form lhs_len lhs_len = [0] for lhs in itemsets: @@ -227,7 +175,8 @@
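As an aside, the extract_fpgrowth call introduced above performs the same frequent-itemset mining that the removed inline code did with fpgrowth over a one-hot dataframe. A standalone sketch of that underlying step, assuming fpgrowth comes from mlxtend and using made-up toy columns:

import pandas as pd
from mlxtend.frequent_patterns import fpgrowth

# toy one-hot encoding of two discretized features
X_onehot = pd.DataFrame({
    'age_<30':  [1, 1, 0, 0],
    'age_>=30': [0, 0, 1, 1],
    'smoker':   [1, 0, 1, 1],
}).astype(bool)

# itemsets appearing in >= 40% of rows, with at most 2 items each
itemsets = fpgrowth(X_onehot, min_support=0.4, max_len=2, use_colnames=True)
print(itemsets)  # one row per frequent itemset: its support and its columns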

Module imodels.rule_list.bayesian_rule_list.bayesian_rul Xtrain, Ytrain, nruleslen, lhs_len, self.itemsets = ( X, np.vstack((1 - np.array(y), y)).T.astype(int), nruleslen, lhs_len, itemsets_all ) - + + permsdic = defaultdict(default_permsdic) # We will store here the MCMC results # Do MCMC res, Rhat = run_bdl_multichain_serial(self.max_iter, self.thinning, self.alpha, self.listlengthprior, self.listwidthprior, Xtrain, Ytrain, nruleslen, lhs_len, @@ -248,34 +197,6 @@

Module imodels.rule_list.bayesian_rule_list.bayesian_rul return self - def discretize(self, X, y): - '''Discretize the features specified in self.discretized_features - ''' - if self.verbose: - print("Discretizing ", self.discretized_features, "...") - D = pd.DataFrame(np.hstack((X, np.array(y).reshape((len(y), 1)))), columns=list(self.feature_labels) + ["y"]) - self.discretizer = MDLP_Discretizer(dataset=D, class_label="y", features=self.discretized_features) - - cat_data = pd.DataFrame(np.zeros_like(X)) - for i in range(len(self.feature_labels)): - label = self.feature_labels[i] - if label in self.discretized_features: - column = [] - for j in range(len(self.discretizer._data[label])): - column += [label + " : " + self.discretizer._data[label][j]] - cat_data.iloc[:, i] = np.array(column) - else: - cat_data.iloc[:, i] = D[label] - - return np.array(cat_data).tolist() - - def _prepend_feature_labels(self, X): - Xl = np.copy(X).astype(str).tolist() - for i in range(len(Xl)): - for j in range(len(Xl[0])): - Xl[i][j] = self.feature_labels[j] + " : " + Xl[i][j] - return Xl - def __str__(self, decimals=1): if self.d_star: detect = "" @@ -329,17 +250,15 @@

Module imodels.rule_list.bayesian_rule_list.bayesian_rul the model. The columns correspond to the classes in sorted order, as they appear in the attribute `classes_`. """ - # deal with pandas data - if type(X) in [pd.DataFrame, pd.Series]: - X = X.values - if self.discretizer: - self.discretizer._data = pd.DataFrame(X, columns=self.feature_labels) - self.discretizer.apply_cutpoints() - D = self._prepend_feature_labels(np.array(self.discretizer._data)) + D = self.discretizer.apply_discretization(X) else: D = X + # deal with pandas data + if type(D) in [pd.DataFrame, pd.Series]: + D = D.values + N = len(D) X2 = self._to_itemset_indices(D[:]) P = preds_d_t(X2, np.zeros((N, 1), dtype=int), self.d_star, self.theta) @@ -484,34 +403,6 @@

Parameters

feature_labels = ["ft" + str(i + 1) for i in range(len(X[0]))] self.feature_labels = feature_labels - def _discretize_mixed_data(self, X, y, undiscretized_features=[]): - if type(X) != list: - X = np.array(X).tolist() - - # check which features are numeric (to be discretized) - self.discretized_features = [] - for fi in range(len(X[0])): - # if not string, and not specified as undiscretized - if isinstance(X[0][fi], numbers.Number) \ - and (len(self.feature_labels) == 0 or \ - len(undiscretized_features) == 0 or \ - self.feature_labels[fi] not in undiscretized_features): - self.discretized_features.append(self.feature_labels[fi]) - - if len(self.discretized_features) > 0: - if self.verbose: - print( - "Warning: non-categorical data found. Trying to discretize. (Please convert categorical values to " - "strings, and/or specify the argument 'undiscretized_features', to avoid this.)") - X = self.discretize(X, y) - - return X - - def _setdata(self, X, y, feature_labels=[], undiscretized_features=[]): - self._setlabels(X, feature_labels) - X = self._discretize_mixed_data(X, y, undiscretized_features) - return X, y - def fit(self, X, y, feature_labels: list=None, undiscretized_features=[], verbose=False): """Fit rule lists to data @@ -543,50 +434,25 @@

Parameters

if len(set(y)) != 2: raise Exception("Only binary classification is supported at this time!") - # deal with pandas data - if type(X) in [pd.DataFrame, pd.Series]: - if feature_labels is None: - feature_labels = X.columns - X = X.values - if type(y) in [pd.DataFrame, pd.Series]: - y = y.values - - if feature_labels is None: - feature_labels = [f'X{i}' for i in range(X.shape[1])] - - X, y = self._setdata(X, y, feature_labels, undiscretized_features) - permsdic = defaultdict(default_permsdic) # We will store here the MCMC results - data = list(X[:]) - - # Now find frequent itemsets - - X_colname_removed = data.copy() - for i in range(len(data)): - X_colname_removed[i] = list(map(lambda s: s.split(' : ')[1], X_colname_removed[i])) - - X_df_categorical = pd.DataFrame(X_colname_removed, columns=feature_labels) - X_df_onehot = pd.get_dummies(X_df_categorical) - onehot_features = X_df_onehot.columns - - itemsets_df = fpgrowth(X_df_onehot, min_support=self.minsupport, max_len=self.maxcardinality) - itemsets_indices = [tuple(s[1]) for s in itemsets_df.values] - itemsets = [np.array(onehot_features)[list(inds)] for inds in itemsets_indices] - itemsets = list(map(tuple, itemsets)) - if self.verbose: - print(len(itemsets), 'rules mined') - - + itemsets, self.discretizer = extract_fpgrowth(X, y, + feature_labels=feature_labels, + minsupport=self.minsupport, + maxcardinality=self.maxcardinality, + undiscretized_features=undiscretized_features, + verbose=verbose) + + self.feature_labels = self.discretizer.feature_labels + X_df_onehot = self.discretizer.onehot_df + # Now form the data-vs.-lhs set # X[j] is the set of data points that contain itemset j (that is, satisfy rule j) for c in X_df_onehot.columns: X_df_onehot[c] = [c if x == 1 else '' for x in list(X_df_onehot[c])] X = [{}] * (len(itemsets) + 1) - X[0] = set(range(len(data))) # the default rule satisfies all data + X[0] = set(range(len(X_df_onehot))) # the default rule satisfies all data for (j, lhs) in enumerate(itemsets): X[j + 1] = set([i for (i, xi) in enumerate(X_df_onehot.values) if set(lhs).issubset(xi)]) - - # now form lhs_len lhs_len = [0] for lhs in itemsets: @@ -599,7 +465,8 @@

Parameters

Xtrain, Ytrain, nruleslen, lhs_len, self.itemsets = ( X, np.vstack((1 - np.array(y), y)).T.astype(int), nruleslen, lhs_len, itemsets_all ) - + + permsdic = defaultdict(default_permsdic) # We will store here the MCMC results # Do MCMC res, Rhat = run_bdl_multichain_serial(self.max_iter, self.thinning, self.alpha, self.listlengthprior, self.listwidthprior, Xtrain, Ytrain, nruleslen, lhs_len, @@ -620,34 +487,6 @@

Parameters

return self - def discretize(self, X, y): - '''Discretize the features specified in self.discretized_features - ''' - if self.verbose: - print("Discretizing ", self.discretized_features, "...") - D = pd.DataFrame(np.hstack((X, np.array(y).reshape((len(y), 1)))), columns=list(self.feature_labels) + ["y"]) - self.discretizer = MDLP_Discretizer(dataset=D, class_label="y", features=self.discretized_features) - - cat_data = pd.DataFrame(np.zeros_like(X)) - for i in range(len(self.feature_labels)): - label = self.feature_labels[i] - if label in self.discretized_features: - column = [] - for j in range(len(self.discretizer._data[label])): - column += [label + " : " + self.discretizer._data[label][j]] - cat_data.iloc[:, i] = np.array(column) - else: - cat_data.iloc[:, i] = D[label] - - return np.array(cat_data).tolist() - - def _prepend_feature_labels(self, X): - Xl = np.copy(X).astype(str).tolist() - for i in range(len(Xl)): - for j in range(len(Xl[0])): - Xl[i][j] = self.feature_labels[j] + " : " + Xl[i][j] - return Xl - def __str__(self, decimals=1): if self.d_star: detect = "" @@ -701,17 +540,15 @@

Parameters

the model. The columns correspond to the classes in sorted order, as they appear in the attribute `classes_`. """ - # deal with pandas data - if type(X) in [pd.DataFrame, pd.Series]: - X = X.values - if self.discretizer: - self.discretizer._data = pd.DataFrame(X, columns=self.feature_labels) - self.discretizer.apply_cutpoints() - D = self._prepend_feature_labels(np.array(self.discretizer._data)) + D = self.discretizer.apply_discretization(X) else: D = X + # deal with pandas data + if type(D) in [pd.DataFrame, pd.Series]: + D = D.values + N = len(D) X2 = self._to_itemset_indices(D[:]) P = preds_d_t(X2, np.zeros((N, 1), dtype=int), self.d_star, self.theta) @@ -743,37 +580,6 @@

Ancestors

Methods

-
-def discretize(self, X, y) -
-
-

Discretize the features specified in self.discretized_features

-
- -Expand source code - -
def discretize(self, X, y):
-    '''Discretize the features specified in self.discretized_features
-    '''
-    if self.verbose:
-        print("Discretizing ", self.discretized_features, "...")
-    D = pd.DataFrame(np.hstack((X, np.array(y).reshape((len(y), 1)))), columns=list(self.feature_labels) + ["y"])
-    self.discretizer = MDLP_Discretizer(dataset=D, class_label="y", features=self.discretized_features)
-
-    cat_data = pd.DataFrame(np.zeros_like(X))
-    for i in range(len(self.feature_labels)):
-        label = self.feature_labels[i]
-        if label in self.discretized_features:
-            column = []
-            for j in range(len(self.discretizer._data[label])):
-                column += [label + " : " + self.discretizer._data[label][j]]
-            cat_data.iloc[:, i] = np.array(column)
-        else:
-            cat_data.iloc[:, i] = D[label]
-
-    return np.array(cat_data).tolist()
-
-
def fit(self, X, y, feature_labels=None, undiscretized_features=[], verbose=False)
@@ -831,50 +637,25 @@

Returns

if len(set(y)) != 2: raise Exception("Only binary classification is supported at this time!") - # deal with pandas data - if type(X) in [pd.DataFrame, pd.Series]: - if feature_labels is None: - feature_labels = X.columns - X = X.values - if type(y) in [pd.DataFrame, pd.Series]: - y = y.values - - if feature_labels is None: - feature_labels = [f'X{i}' for i in range(X.shape[1])] - - X, y = self._setdata(X, y, feature_labels, undiscretized_features) - permsdic = defaultdict(default_permsdic) # We will store here the MCMC results - data = list(X[:]) - - # Now find frequent itemsets - - X_colname_removed = data.copy() - for i in range(len(data)): - X_colname_removed[i] = list(map(lambda s: s.split(' : ')[1], X_colname_removed[i])) - - X_df_categorical = pd.DataFrame(X_colname_removed, columns=feature_labels) - X_df_onehot = pd.get_dummies(X_df_categorical) - onehot_features = X_df_onehot.columns - - itemsets_df = fpgrowth(X_df_onehot, min_support=self.minsupport, max_len=self.maxcardinality) - itemsets_indices = [tuple(s[1]) for s in itemsets_df.values] - itemsets = [np.array(onehot_features)[list(inds)] for inds in itemsets_indices] - itemsets = list(map(tuple, itemsets)) - if self.verbose: - print(len(itemsets), 'rules mined') - - + itemsets, self.discretizer = extract_fpgrowth(X, y, + feature_labels=feature_labels, + minsupport=self.minsupport, + maxcardinality=self.maxcardinality, + undiscretized_features=undiscretized_features, + verbose=verbose) + + self.feature_labels = self.discretizer.feature_labels + X_df_onehot = self.discretizer.onehot_df + # Now form the data-vs.-lhs set # X[j] is the set of data points that contain itemset j (that is, satisfy rule j) for c in X_df_onehot.columns: X_df_onehot[c] = [c if x == 1 else '' for x in list(X_df_onehot[c])] X = [{}] * (len(itemsets) + 1) - X[0] = set(range(len(data))) # the default rule satisfies all data + X[0] = set(range(len(X_df_onehot))) # the default rule satisfies all data for (j, lhs) in enumerate(itemsets): X[j + 1] = set([i for (i, xi) in enumerate(X_df_onehot.values) if set(lhs).issubset(xi)]) - - # now form lhs_len lhs_len = [0] for lhs in itemsets: @@ -887,7 +668,8 @@

Returns

Xtrain, Ytrain, nruleslen, lhs_len, self.itemsets = ( X, np.vstack((1 - np.array(y), y)).T.astype(int), nruleslen, lhs_len, itemsets_all ) - + + permsdic = defaultdict(default_permsdic) # We will store here the MCMC results # Do MCMC res, Rhat = run_bdl_multichain_serial(self.max_iter, self.thinning, self.alpha, self.listlengthprior, self.listwidthprior, Xtrain, Ytrain, nruleslen, lhs_len, @@ -983,17 +765,15 @@

Returns

the model. The columns correspond to the classes in sorted order, as they appear in the attribute `classes_`. """ - # deal with pandas data - if type(X) in [pd.DataFrame, pd.Series]: - X = X.values - if self.discretizer: - self.discretizer._data = pd.DataFrame(X, columns=self.feature_labels) - self.discretizer.apply_cutpoints() - D = self._prepend_feature_labels(np.array(self.discretizer._data)) + D = self.discretizer.apply_discretization(X) else: D = X + # deal with pandas data + if type(D) in [pd.DataFrame, pd.Series]: + D = D.values + N = len(D) X2 = self._to_itemset_indices(D[:]) P = preds_d_t(X2, np.zeros((N, 1), dtype=int), self.d_star, self.theta) @@ -1044,7 +824,6 @@

Index

  • BayesianRuleListClassifier

      -
    • discretize
    • fit
    • predict
    • predict_proba
    • diff --git a/docs/rule_set/fplasso.html b/docs/rule_set/fplasso.html new file mode 100644 index 00000000..02ddb898 --- /dev/null +++ b/docs/rule_set/fplasso.html @@ -0,0 +1,399 @@ + + + + + + +imodels.rule_set.fplasso API documentation + + + + + + + + + +
      +
      +
      +

      Module imodels.rule_set.fplasso

      +
      +
      +
      + +Expand source code + +
      from typing import List
      +
      +from imodels.rule_set.rule_fit import RuleFit
      +from imodels.util.extract import extract_fpgrowth
      +from imodels.util.convert import itemsets_to_rules
      +
      +class FPLasso(RuleFit):
      +
      +    def __init__(self, 
      +                 minsupport=0.1,
      +                 maxcardinality=2,
      +                 verbose=False,
      +                 tree_size=4,
      +                 sample_fract='default',
      +                 max_rules=2000,
      +                 memory_par=0.01,
      +                 tree_generator=None,
      +                 lin_trim_quantile=0.025,
      +                 lin_standardise=True,
      +                 exp_rand_tree_size=True,
      +                 include_linear=True,
      +                 alphas=None,
      +                 cv=3,
      +                 random_state=None):
      +        super().__init__(tree_size,
      +                         sample_fract,
      +                         max_rules,
      +                         memory_par,
      +                         tree_generator,
      +                         lin_trim_quantile,
      +                         lin_standardise,
      +                         exp_rand_tree_size,
      +                         include_linear,
      +                         alphas,
      +                         cv,
      +                         random_state)
      +        self.minsupport = minsupport
      +        self.maxcardinality = maxcardinality
      +        self.verbose = verbose
      +
      +    def fit(self, X, y=None, feature_names=None, undiscretized_features=[]):
      +        self.undiscretized_features = undiscretized_features
      +        super().fit(X, y, feature_names=feature_names)
      +        return self
      +    
      +    def _extract_rules(self, X, y) -> List[str]:
      +        itemsets = extract_fpgrowth(X, y,
      +                                    feature_labels=self.feature_placeholders,
      +                                    minsupport=self.minsupport,
      +                                    maxcardinality=self.maxcardinality,
      +                                    undiscretized_features=self.undiscretized_features,
      +                                    verbose=self.verbose)[0]
      +        return itemsets_to_rules(itemsets)
      +
      +class FPLassoRegressor(FPLasso):        
      +    def _init_prediction_task(self):
      +        self.prediction_task = 'regression'
      +        
      +class FPLassoClassifier(FPLasso):
      +    def _init_prediction_task(self):
      +        self.prediction_task = 'classification'
      +
      +
      +
      +
      +
      +
      +
      +
      +
      +

      Classes

      +
      +
      +class FPLasso +(minsupport=0.1, maxcardinality=2, verbose=False, tree_size=4, sample_fract='default', max_rules=2000, memory_par=0.01, tree_generator=None, lin_trim_quantile=0.025, lin_standardise=True, exp_rand_tree_size=True, include_linear=True, alphas=None, cv=3, random_state=None) +
      +
      +

Rulefit class. Rather than using this class directly, use RuleFitRegressor or RuleFitClassifier

      +

      Parameters

      +
      +
      tree_size :  +Number of terminal nodes in generated trees. If exp_rand_tree_size=True,
      +
      this will be the mean number of terminal nodes.
      +
      sample_fract :  +fraction of randomly chosen training observations used to produce each tree.
      +
      FP 2004 (Sec. 2)
      +
      max_rules :  +total number of terms included in the final model (both linear and rules)
      +
      approximate total number of rules generated for fitting also is based on this +Note that actual number of rules will usually be lower than this due to duplicates.
      +
      memory_par :  +scale multiplier (shrinkage factor) applied to each new tree when
      +
      sequentially induced. FP 2004 (Sec. 2)
      +
      lin_standardise : If True, the linear terms will be standardised as per Friedman Sec 3.2
      +
      by multiplying the winsorised variable by 0.4/stdev.
      +
      lin_trim_quantile : If lin_standardise is True, this quantile will be used to trim linear
      +
      terms before standardisation.
      +
      exp_rand_tree_size : If True, each boosted tree will have a different maximum number of
      +
      terminal nodes based on an exponential distribution about tree_size. +(Friedman Sec 3.3)
      +
      include_linear : Include linear terms as opposed to only rules
      +
       
      +
      random_state: +Integer to initialise random objects and provide repeatability.
      +
      tree_generator : Optional: this object will be used as provided to generate the rules.
      +
      This will override almost all the other properties above. +Must be GradientBoostingRegressor or GradientBoostingClassifier, optional (default=None)
      +
      +

      Attributes

      +
      +
      rule_ensemble : RuleEnsemble
      +
      The rule ensemble
      +
      feature_names : list of strings, optional (default=None)
      +
      The names of the features (columns)
      +
      +
      + +Expand source code + +
      class FPLasso(RuleFit):
      +
      +    def __init__(self, 
      +                 minsupport=0.1,
      +                 maxcardinality=2,
      +                 verbose=False,
      +                 tree_size=4,
      +                 sample_fract='default',
      +                 max_rules=2000,
      +                 memory_par=0.01,
      +                 tree_generator=None,
      +                 lin_trim_quantile=0.025,
      +                 lin_standardise=True,
      +                 exp_rand_tree_size=True,
      +                 include_linear=True,
      +                 alphas=None,
      +                 cv=3,
      +                 random_state=None):
      +        super().__init__(tree_size,
      +                         sample_fract,
      +                         max_rules,
      +                         memory_par,
      +                         tree_generator,
      +                         lin_trim_quantile,
      +                         lin_standardise,
      +                         exp_rand_tree_size,
      +                         include_linear,
      +                         alphas,
      +                         cv,
      +                         random_state)
      +        self.minsupport = minsupport
      +        self.maxcardinality = maxcardinality
      +        self.verbose = verbose
      +
      +    def fit(self, X, y=None, feature_names=None, undiscretized_features=[]):
      +        self.undiscretized_features = undiscretized_features
      +        super().fit(X, y, feature_names=feature_names)
      +        return self
      +    
      +    def _extract_rules(self, X, y) -> List[str]:
      +        itemsets = extract_fpgrowth(X, y,
      +                                    feature_labels=self.feature_placeholders,
      +                                    minsupport=self.minsupport,
      +                                    maxcardinality=self.maxcardinality,
      +                                    undiscretized_features=self.undiscretized_features,
      +                                    verbose=self.verbose)[0]
      +        return itemsets_to_rules(itemsets)
      +
      +
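A minimal usage sketch for the new class (X_train, y_train, X_test, and feat_names are illustrative assumptions, not part of this documentation):

from imodels import FPLassoClassifier

# rules are mined with FP-growth (support >= 10%, at most 2 conditions each),
# then weighted by the L1-regularized linear model inherited from RuleFit
model = FPLassoClassifier(minsupport=0.1, maxcardinality=2)
model.fit(X_train, y_train, feature_names=feat_names)
preds = model.predict(X_test)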

      Ancestors

      +
        +
      • RuleFit
      • +
      • sklearn.base.BaseEstimator
      • +
      • sklearn.base.TransformerMixin
      • +
      • RuleSet
      • +
      +

      Subclasses

      + +

      Inherited members

      + +
      +
      +class FPLassoClassifier +(minsupport=0.1, maxcardinality=2, verbose=False, tree_size=4, sample_fract='default', max_rules=2000, memory_par=0.01, tree_generator=None, lin_trim_quantile=0.025, lin_standardise=True, exp_rand_tree_size=True, include_linear=True, alphas=None, cv=3, random_state=None) +
      +
      +

Rulefit class. Rather than using this class directly, use RuleFitRegressor or RuleFitClassifier

      +

      Parameters

      +
      +
      tree_size :  +Number of terminal nodes in generated trees. If exp_rand_tree_size=True,
      +
      this will be the mean number of terminal nodes.
      +
      sample_fract :  +fraction of randomly chosen training observations used to produce each tree.
      +
      FP 2004 (Sec. 2)
      +
      max_rules :  +total number of terms included in the final model (both linear and rules)
      +
      approximate total number of rules generated for fitting also is based on this +Note that actual number of rules will usually be lower than this due to duplicates.
      +
      memory_par :  +scale multiplier (shrinkage factor) applied to each new tree when
      +
      sequentially induced. FP 2004 (Sec. 2)
      +
      lin_standardise : If True, the linear terms will be standardised as per Friedman Sec 3.2
      +
      by multiplying the winsorised variable by 0.4/stdev.
      +
      lin_trim_quantile : If lin_standardise is True, this quantile will be used to trim linear
      +
      terms before standardisation.
      +
      exp_rand_tree_size : If True, each boosted tree will have a different maximum number of
      +
      terminal nodes based on an exponential distribution about tree_size. +(Friedman Sec 3.3)
      +
      include_linear : Include linear terms as opposed to only rules
      +
       
      +
      random_state: +Integer to initialise random objects and provide repeatability.
      +
      tree_generator : Optional: this object will be used as provided to generate the rules.
      +
      This will override almost all the other properties above. +Must be GradientBoostingRegressor or GradientBoostingClassifier, optional (default=None)
      +
      +

      Attributes

      +
      +
      rule_ensemble : RuleEnsemble
      +
      The rule ensemble
      +
      feature_names : list of strings, optional (default=None)
      +
      The names of the features (columns)
      +
      +
      + +Expand source code + +
      class FPLassoClassifier(FPLasso):
      +    def _init_prediction_task(self):
      +        self.prediction_task = 'classification'
      +
      +

      Ancestors

      + +

      Inherited members

      + +
      +
      +class FPLassoRegressor +(minsupport=0.1, maxcardinality=2, verbose=False, tree_size=4, sample_fract='default', max_rules=2000, memory_par=0.01, tree_generator=None, lin_trim_quantile=0.025, lin_standardise=True, exp_rand_tree_size=True, include_linear=True, alphas=None, cv=3, random_state=None) +
      +
      +

Rulefit class. Rather than using this class directly, use RuleFitRegressor or RuleFitClassifier

      +

      Parameters

      +
      +
      tree_size :  +Number of terminal nodes in generated trees. If exp_rand_tree_size=True,
      +
      this will be the mean number of terminal nodes.
      +
      sample_fract :  +fraction of randomly chosen training observations used to produce each tree.
      +
      FP 2004 (Sec. 2)
      +
      max_rules :  +total number of terms included in the final model (both linear and rules)
      +
      approximate total number of rules generated for fitting also is based on this +Note that actual number of rules will usually be lower than this due to duplicates.
      +
      memory_par :  +scale multiplier (shrinkage factor) applied to each new tree when
      +
      sequentially induced. FP 2004 (Sec. 2)
      +
      lin_standardise : If True, the linear terms will be standardised as per Friedman Sec 3.2
      +
      by multiplying the winsorised variable by 0.4/stdev.
      +
      lin_trim_quantile : If lin_standardise is True, this quantile will be used to trim linear
      +
      terms before standardisation.
      +
      exp_rand_tree_size : If True, each boosted tree will have a different maximum number of
      +
      terminal nodes based on an exponential distribution about tree_size. +(Friedman Sec 3.3)
      +
      include_linear : Include linear terms as opposed to only rules
      +
       
      +
      random_state: +Integer to initialise random objects and provide repeatability.
      +
      tree_generator : Optional: this object will be used as provided to generate the rules.
      +
      This will override almost all the other properties above. +Must be GradientBoostingRegressor or GradientBoostingClassifier, optional (default=None)
      +
      +

      Attributes

      +
      +
      rule_ensemble : RuleEnsemble
      +
      The rule ensemble
      +
      feature_names : list of strings, optional (default=None)
      +
      The names of the features (columns)
      +
      +
      + +Expand source code + +
      class FPLassoRegressor(FPLasso):        
      +    def _init_prediction_task(self):
      +        self.prediction_task = 'regression'
      +
      +

      Ancestors

      + +

      Inherited members

      + +
      +
      +
      +
      + +
      + + + + + \ No newline at end of file diff --git a/docs/rule_set/fpskope.html b/docs/rule_set/fpskope.html new file mode 100644 index 00000000..f7fdf972 --- /dev/null +++ b/docs/rule_set/fpskope.html @@ -0,0 +1,307 @@ + + + + + + +imodels.rule_set.fpskope API documentation + + + + + + + + + +
      +
      +
      +

      Module imodels.rule_set.fpskope

      +
      +
      +
      + +Expand source code + +
      from typing import List
      +
      +import numpy as np
      +
      +from imodels.rule_set.skope_rules import SkopeRulesClassifier
      +from imodels.util.extract import extract_fpgrowth
      +from imodels.util.convert import itemsets_to_rules
      +
      +class FPSkopeClassifier(SkopeRulesClassifier):
      +
      +    def __init__(self,
      +                 minsupport=0.1,
      +                 maxcardinality=2,
      +                 verbose=False,
      +                 precision_min=0.5,
      +                 recall_min=0.01,
      +                 n_estimators=10,
      +                 max_samples=.8,
      +                 max_samples_features=1.,
      +                 bootstrap=False,
      +                 bootstrap_features=False,
      +                 max_depth=3,
      +                 max_depth_duplication=None,
      +                 max_features=1.,
      +                 min_samples_split=2,
      +                 n_jobs=1,
      +                 random_state=None):
      +        super().__init__(precision_min,
      +                         recall_min,
      +                         n_estimators,
      +                         max_samples,
      +                         max_samples_features,
      +                         bootstrap,
      +                         bootstrap_features,
      +                         max_depth,
      +                         max_depth_duplication,
      +                         max_features,
      +                         min_samples_split,
      +                         n_jobs,
      +                         random_state,
      +                         verbose)
      +        self.minsupport = minsupport
      +        self.maxcardinality = maxcardinality
      +        self.verbose = verbose
      +
      +    def fit(self, X, y=None, feature_names=None, undiscretized_features=[], sample_weight=None):
      +        self.undiscretized_features = undiscretized_features
      +        super().fit(X, y, feature_names=feature_names, sample_weight=sample_weight)
      +        return self
      +
      +    def _extract_rules(self, X, y) -> List[str]:
      +        itemsets = extract_fpgrowth(X, y,
      +                                    feature_labels=self.feature_placeholders,
      +                                    minsupport=self.minsupport,
      +                                    maxcardinality=self.maxcardinality,
      +                                    undiscretized_features=self.undiscretized_features,
      +                                    verbose=self.verbose)[0]
      +        return [itemsets_to_rules(itemsets)], [np.arange(X.shape[0])], [np.arange(len(self.feature_names))]
      +
      +
      +
      +
      +
      +
      +
      +
      +
      +

      Classes

      +
      +
      +class FPSkopeClassifier +(minsupport=0.1, maxcardinality=2, verbose=False, precision_min=0.5, recall_min=0.01, n_estimators=10, max_samples=0.8, max_samples_features=1.0, bootstrap=False, bootstrap_features=False, max_depth=3, max_depth_duplication=None, max_features=1.0, min_samples_split=2, n_jobs=1, random_state=None) +
      +
      +

An easily interpretable classifier optimizing simple logical rules.

      +

      Parameters

      +
      +
      feature_names : list of str, optional
      +
      The names of each feature to be used for returning rules in string +format.
      +
      precision_min : float, optional (default=0.5)
      +
      The minimal precision of a rule to be selected.
      +
      recall_min : float, optional (default=0.01)
      +
      The minimal recall of a rule to be selected.
      +
      n_estimators : int, optional (default=10)
      +
      The number of base estimators (rules) to use for prediction. More are +built before selection. All are available in the estimators_ attribute.
      +
      max_samples : int or float, optional (default=.8)
      +
      The number of samples to draw from X to train each decision tree, from +which rules are generated and selected. +- If int, then draw max_samples samples. +- If float, then draw max_samples * X.shape[0] samples. +If max_samples is larger than the number of samples provided, +all samples will be used for all trees (no sampling).
      +
      max_samples_features : int or float, optional (default=1.0)
      +
      The number of features to draw from X to train each decision tree, from +which rules are generated and selected. +- If int, then draw max_features features. +- If float, then draw max_features * X.shape[1] features.
      +
      bootstrap : boolean, optional (default=False)
      +
      Whether samples are drawn with replacement.
      +
      bootstrap_features : boolean, optional (default=False)
      +
      Whether features are drawn with replacement.
      +
      max_depth : integer or List or None, optional (default=3)
      +
      The maximum depth of the decision trees. If None, then nodes are +expanded until all leaves are pure or until all leaves contain less +than min_samples_split samples. +If an iterable is passed, you will train n_estimators +for each tree depth. It allows you to create and compare +rules of different length.
      +
      max_depth_duplication : integer, optional (default=None)
      +
      The maximum depth of the decision tree for rule deduplication, +if None then no deduplication occurs.
      +
      max_features : int, float, string or None, optional (default="auto")
      +
      +

      The number of features considered (by each decision tree) when looking +for the best split:

      +
        +
      • If int, then consider max_features features at each split.
      • +
      • If float, then max_features is a percentage and +int(max_features * n_features) features are considered at each +split.
      • +
      • If "auto", then max_features=sqrt(n_features).
      • +
      • If "sqrt", then max_features=sqrt(n_features) (same as "auto").
      • +
      • If "log2", then max_features=log2(n_features).
      • +
      • If None, then max_features=n_features.
      • +
      +

      Note: the search for a split does not stop until at least one +valid partition of the node samples is found, even if it requires to +effectively inspect more than max_features features.

      +
      +
      min_samples_split : int, float, optional (default=2)
      +
      The minimum number of samples required to split an internal node for +each decision tree. +- If int, then consider min_samples_split as the minimum number. +- If float, then min_samples_split is a percentage and +ceil(min_samples_split * n_samples) are the minimum +number of samples for each split.
      +
      n_jobs : integer, optional (default=1)
      +
      The number of jobs to run in parallel for both fit and predict. +If -1, then the number of jobs is set to the number of cores.
      +
      random_state : int, RandomState instance or None, optional
      +
      +
        +
      • If int, random_state is the seed used by the random number generator.
      • +
      • If RandomState instance, random_state is the random number generator.
      • +
      • If None, the random number generator is the RandomState instance used +by np.random.
      • +
      +
      +
      verbose : int, optional (default=0)
      +
      Controls the verbosity of the tree building process.
      +
      +

      Attributes

      +

      rules_ : dict of tuples (rule, precision, recall, nb). +The collection of n_estimators rules used in the predict method. +The rules are generated by fitted sub-estimators (decision trees). Each +rule satisfies recall_min and precision_min conditions. The selection +is done according to OOB precisions.

      +
      +
      estimators_ : list of DecisionTreeClassifier
      +
      The collection of fitted sub-estimators used to generate candidate +rules.
      +
      estimators_samples_ : list of arrays
      +
      The subset of drawn samples (i.e., the in-bag samples) for each base +estimator.
      +
      estimators_features_ : list of arrays
      +
      The subset of drawn features for each base estimator.
      +
      max_samples_ : integer
      +
      The actual number of samples
      +
      n_features_ : integer
      +
      The number of features when fit is performed.
      +
      classes_ : array, shape (n_classes,)
      +
      The classes labels.
      +
      +
      + +Expand source code + +
      class FPSkopeClassifier(SkopeRulesClassifier):
      +
      +    def __init__(self,
      +                 minsupport=0.1,
      +                 maxcardinality=2,
      +                 verbose=False,
      +                 precision_min=0.5,
      +                 recall_min=0.01,
      +                 n_estimators=10,
      +                 max_samples=.8,
      +                 max_samples_features=1.,
      +                 bootstrap=False,
      +                 bootstrap_features=False,
      +                 max_depth=3,
      +                 max_depth_duplication=None,
      +                 max_features=1.,
      +                 min_samples_split=2,
      +                 n_jobs=1,
      +                 random_state=None):
      +        super().__init__(precision_min,
      +                         recall_min,
      +                         n_estimators,
      +                         max_samples,
      +                         max_samples_features,
      +                         bootstrap,
      +                         bootstrap_features,
      +                         max_depth,
      +                         max_depth_duplication,
      +                         max_features,
      +                         min_samples_split,
      +                         n_jobs,
      +                         random_state,
      +                         verbose)
      +        self.minsupport = minsupport
      +        self.maxcardinality = maxcardinality
      +        self.verbose = verbose
      +
      +    def fit(self, X, y=None, feature_names=None, undiscretized_features=[], sample_weight=None):
      +        self.undiscretized_features = undiscretized_features
      +        super().fit(X, y, feature_names=feature_names, sample_weight=sample_weight)
      +        return self
      +
      +    def _extract_rules(self, X, y) -> List[str]:
      +        itemsets = extract_fpgrowth(X, y,
      +                                    feature_labels=self.feature_placeholders,
      +                                    minsupport=self.minsupport,
      +                                    maxcardinality=self.maxcardinality,
      +                                    undiscretized_features=self.undiscretized_features,
      +                                    verbose=self.verbose)[0]
      +        return [itemsets_to_rules(itemsets)], [np.arange(X.shape[0])], [np.arange(len(self.feature_names))]
      +
      +
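A corresponding usage sketch for FPSkopeClassifier (again, X_train, y_train, and X_test are assumed, not part of this documentation):

from imodels import FPSkopeClassifier

# candidate rules come from FP-growth; only rules meeting the precision and
# recall thresholds inherited from SkopeRulesClassifier are kept
model = FPSkopeClassifier(minsupport=0.1, maxcardinality=2,
                          precision_min=0.5, recall_min=0.01)
model.fit(X_train, y_train)
preds = model.predict(X_test)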

      Ancestors

      + +

      Inherited members

      + +
      +
      +
      +
      + +
      + + + + + \ No newline at end of file diff --git a/docs/rule_set/index.html b/docs/rule_set/index.html index 42c5d936..a3df99af 100644 --- a/docs/rule_set/index.html +++ b/docs/rule_set/index.html @@ -36,6 +36,14 @@

      Sub-modules

      +
      imodels.rule_set.fplasso
      +
      +
      +
      +
      imodels.rule_set.fpskope
      +
      +
      +
      imodels.rule_set.rule_fit

      Linear model of tree-based decision rules based on the rulefit algorithm from Friedman and Popescu …

      @@ -72,6 +80,8 @@

      Index

    • Sub-modules

      • imodels.rule_set.boosted_rules
      • +
      • imodels.rule_set.fplasso
      • +
      • imodels.rule_set.fpskope
      • imodels.rule_set.rule_fit
      • imodels.rule_set.rule_set
      • imodels.rule_set.skope_rules
      • diff --git a/docs/rule_set/rule_fit.html b/docs/rule_set/rule_fit.html index 5f51388e..1a7c737c 100644 --- a/docs/rule_set/rule_fit.html +++ b/docs/rule_set/rule_fit.html @@ -40,6 +40,8 @@

        Module imodels.rule_set.rule_fit

        L1-regularized linear model, also called Lasso, which estimates the effects of each rule on the output target but at the same time estimating many of those effects to zero. """ +from typing import List, Tuple + import numpy as np import pandas as pd from sklearn.base import BaseEstimator @@ -48,10 +50,11 @@
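To make the docstring's claim concrete, here is a standalone illustration of the idea using sklearn's Lasso on synthetic data (a sketch of the concept only, not the package's internal code):

import numpy as np
from sklearn.linear_model import Lasso

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 2))
y = 3.0 * (X[:, 0] > 0.5) + rng.normal(scale=0.1, size=200)

# encode two candidate rules as binary features
rules = np.column_stack([X[:, 0] > 0.5, X[:, 1] > 0.5]).astype(float)

lasso = Lasso(alpha=0.05).fit(rules, y)
print(lasso.coef_)  # the coefficient of the uninformative rule is shrunk toward zero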

        Module imodels.rule_set.rule_fit

        from scipy.special import softmax from imodels.rule_set.rule_set import RuleSet -from imodels.util.rule import enum_features +from imodels.util.rule import get_feature_dict, replace_feature_name, Rule from imodels.util.transforms import Winsorizer, FriedScale from imodels.util.score import score_lasso from imodels.util.convert import tree_to_rules +from imodels.util.extract import extract_rulefit class RuleFit(BaseEstimator, TransformerMixin, RuleSet): """Rulefit class. Rather than using this class directly, should use RuleFitRegressor or RuleFitClassifier @@ -139,15 +142,16 @@

        Module imodels.rule_set.rule_fit

        if type(y) in [pd.DataFrame, pd.Series]: y = y.values - self.n_obs = X.shape[0] self.n_features_ = X.shape[1] - self.feature_names_, self.feature_dict_ = enum_features(X, feature_names) - - self.tree_generator = self._get_tree_ensemble(classify=False) - self._fit_tree_ensemble(X, y) + self.feature_dict_ = get_feature_dict(X.shape[1], feature_names) + self.feature_placeholders = list(self.feature_dict_.keys()) + self.feature_names = list(self.feature_dict_.values()) - extracted_rules = self._extract_rules() + extracted_rules = self._extract_rules(X, y) self.rules_without_feature_names_, self.coef, self.intercept = self._score_rules(X, y, extracted_rules) + self.rules_ = [ + replace_feature_name(rule, self.feature_dict_) for rule in self.rules_without_feature_names_ + ] return self @@ -196,7 +200,7 @@

        Module imodels.rule_set.rule_fit

        X_transformed: matrix, shape=(n_samples, n_out) Transformed data set """ - df = pd.DataFrame(X, columns=self.feature_names_) + df = pd.DataFrame(X, columns=self.feature_placeholders) X_transformed = np.zeros([X.shape[0], 0]) for r in rules: @@ -224,8 +228,7 @@

        Module imodels.rule_set.rule_fit

        the coefficients and 'support' the support of the rule in the training data set (X) """ - - n_features = len(self.coef) - len(self.rules_without_feature_names_) + n_features = len(self.coef) - len(self.rules_) rule_ensemble = list(self.rules_without_feature_names_) output_rules = [] ## Add coefficients for linear effects @@ -240,10 +243,10 @@

        Module imodels.rule_set.rule_fit

        subregion = np.array(subregion) importance = sum(abs(coef) * abs([x[i] for x in self.winsorizer.trim(subregion)] - self.mean[i])) / len( subregion) - output_rules += [(self.feature_names_[i], 'linear', coef, 1, importance)] + output_rules += [(self.feature_names[i], 'linear', coef, 1, importance)] ## Add rules - for i in range(0, len(self.rules_without_feature_names_)): + for i in range(0, len(self.rules_)): rule = rule_ensemble[i] coef = self.coef[i + n_features] @@ -253,7 +256,7 @@

        Module imodels.rule_set.rule_fit

        rkx = self.transform(subregion, [rule])[:, -1] importance = sum(abs(coef) * abs(rkx - rule.support)) / len(subregion) - output_rules += [(rule.__str__(), 'rule', coef, rule.support, importance)] + output_rules += [(self.rules_[i].rule, 'rule', coef, rule.support, importance)] rules = pd.DataFrame(output_rules, columns=["rule", "type", "coef", "support", "importance"]) if exclude_zero_coef: rules = rules.ix[rules.coef != 0] @@ -264,70 +267,18 @@

        Module imodels.rule_set.rule_fit

        rules = rules[rules.coef != 0].sort_values("support", ascending=False) pd.set_option('display.max_colwidth', -1) return rules[['rule', 'coef']].round(3) - - def _get_tree_ensemble(self, classify=False): - - if self.tree_generator is None: - n_estimators_default = int(np.ceil(self.max_rules / self.tree_size)) - self.sample_fract_ = min(0.5, (100 + 6 * np.sqrt(self.n_obs)) / self.n_obs) - - tree_generator = GradientBoostingRegressor(n_estimators=n_estimators_default, - max_leaf_nodes=self.tree_size, - learning_rate=self.memory_par, - subsample=self.sample_fract_, - random_state=self.random_state, - max_depth=100) - - if type(tree_generator) not in [GradientBoostingRegressor, RandomForestRegressor]: - raise ValueError("RuleFit only works with RandomForest and BoostingRegressor") - - return tree_generator - - def _fit_tree_ensemble(self, X, y): - ## fit tree generator - if not self.exp_rand_tree_size: # simply fit with constant tree size - self.tree_generator.fit(X, y) - else: # randomise tree size as per Friedman 2005 Sec 3.3 - np.random.seed(self.random_state) - tree_sizes = np.random.exponential(scale=self.tree_size - 2, - size=int(np.ceil(self.max_rules * 2 / self.tree_size))) - tree_sizes = np.asarray([2 + np.floor(tree_sizes[i_]) for i_ in np.arange(len(tree_sizes))], dtype=int) - i = int(len(tree_sizes) / 4) - while np.sum(tree_sizes[0:i]) < self.max_rules: - i = i + 1 - tree_sizes = tree_sizes[0:i] - self.tree_generator.set_params(warm_start=True) - curr_est_ = 0 - for i_size in np.arange(len(tree_sizes)): - size = tree_sizes[i_size] - self.tree_generator.set_params(n_estimators=curr_est_ + 1) - self.tree_generator.set_params(max_leaf_nodes=size) - random_state_add = self.random_state if self.random_state else 0 - self.tree_generator.set_params( - random_state=i_size + random_state_add) # warm_state=True seems to reset random_state, such that the trees are highly correlated, unless we manually change the random_sate here. - self.tree_generator.fit(np.copy(X, order='C'), np.copy(y, order='C')) - curr_est_ = curr_est_ + 1 - self.tree_generator.set_params(warm_start=False) - - if isinstance(self.tree_generator, RandomForestRegressor): - self.estimators_ = [[x] for x in self.tree_generator.estimators_] - else: - self.estimators_ = self.tree_generator.estimators_ - def _extract_rules(self): - seen_antecedents = set() - extracted_rules = [] - for estimator in self.estimators_: - for rule_value_pair in tree_to_rules(estimator[0], np.array(self.feature_names_), prediction_values=True): - if rule_value_pair[0] not in seen_antecedents: - extracted_rules.append(rule_value_pair) - seen_antecedents.add(rule_value_pair[0]) - - extracted_rules = sorted(extracted_rules, key=lambda x: x[1]) - extracted_rules = list(map(lambda x: x[0], extracted_rules)) - return extracted_rules - - def _score_rules(self, X, y, rules): + def _extract_rules(self, X, y) -> List[Rule]: + return extract_rulefit(X, y, + feature_names=self.feature_placeholders, + tree_size=self.tree_size, + max_rules=self.max_rules, + memory_par=self.memory_par, + tree_generator=self.tree_generator, + exp_rand_tree_size=self.exp_rand_tree_size, + random_state=self.random_state) + + def _score_rules(self, X, y, rules) -> Tuple[List[Rule], List[float], float]: X_concat = np.zeros([X.shape[0], 0]) # standardise linear variables if requested (for regression model only) @@ -505,15 +456,16 @@

        Attributes

        if type(y) in [pd.DataFrame, pd.Series]: y = y.values - self.n_obs = X.shape[0] self.n_features_ = X.shape[1] - self.feature_names_, self.feature_dict_ = enum_features(X, feature_names) - - self.tree_generator = self._get_tree_ensemble(classify=False) - self._fit_tree_ensemble(X, y) + self.feature_dict_ = get_feature_dict(X.shape[1], feature_names) + self.feature_placeholders = list(self.feature_dict_.keys()) + self.feature_names = list(self.feature_dict_.values()) - extracted_rules = self._extract_rules() + extracted_rules = self._extract_rules(X, y) self.rules_without_feature_names_, self.coef, self.intercept = self._score_rules(X, y, extracted_rules) + self.rules_ = [ + replace_feature_name(rule, self.feature_dict_) for rule in self.rules_without_feature_names_ + ] return self @@ -562,7 +514,7 @@

        Attributes

        X_transformed: matrix, shape=(n_samples, n_out) Transformed data set """ - df = pd.DataFrame(X, columns=self.feature_names_) + df = pd.DataFrame(X, columns=self.feature_placeholders) X_transformed = np.zeros([X.shape[0], 0]) for r in rules: @@ -590,8 +542,7 @@

        Attributes

        the coefficients and 'support' the support of the rule in the training data set (X) """ - - n_features = len(self.coef) - len(self.rules_without_feature_names_) + n_features = len(self.coef) - len(self.rules_) rule_ensemble = list(self.rules_without_feature_names_) output_rules = [] ## Add coefficients for linear effects @@ -606,10 +557,10 @@

        Attributes

        subregion = np.array(subregion) importance = sum(abs(coef) * abs([x[i] for x in self.winsorizer.trim(subregion)] - self.mean[i])) / len( subregion) - output_rules += [(self.feature_names_[i], 'linear', coef, 1, importance)] + output_rules += [(self.feature_names[i], 'linear', coef, 1, importance)] ## Add rules - for i in range(0, len(self.rules_without_feature_names_)): + for i in range(0, len(self.rules_)): rule = rule_ensemble[i] coef = self.coef[i + n_features] @@ -619,7 +570,7 @@

        Attributes

        rkx = self.transform(subregion, [rule])[:, -1] importance = sum(abs(coef) * abs(rkx - rule.support)) / len(subregion) - output_rules += [(rule.__str__(), 'rule', coef, rule.support, importance)] + output_rules += [(self.rules_[i].rule, 'rule', coef, rule.support, importance)] rules = pd.DataFrame(output_rules, columns=["rule", "type", "coef", "support", "importance"]) if exclude_zero_coef: rules = rules.ix[rules.coef != 0] @@ -630,70 +581,18 @@

        Attributes

        rules = rules[rules.coef != 0].sort_values("support", ascending=False) pd.set_option('display.max_colwidth', -1) return rules[['rule', 'coef']].round(3) - - def _get_tree_ensemble(self, classify=False): - - if self.tree_generator is None: - n_estimators_default = int(np.ceil(self.max_rules / self.tree_size)) - self.sample_fract_ = min(0.5, (100 + 6 * np.sqrt(self.n_obs)) / self.n_obs) - - tree_generator = GradientBoostingRegressor(n_estimators=n_estimators_default, - max_leaf_nodes=self.tree_size, - learning_rate=self.memory_par, - subsample=self.sample_fract_, - random_state=self.random_state, - max_depth=100) - - if type(tree_generator) not in [GradientBoostingRegressor, RandomForestRegressor]: - raise ValueError("RuleFit only works with RandomForest and BoostingRegressor") - - return tree_generator - - def _fit_tree_ensemble(self, X, y): - ## fit tree generator - if not self.exp_rand_tree_size: # simply fit with constant tree size - self.tree_generator.fit(X, y) - else: # randomise tree size as per Friedman 2005 Sec 3.3 - np.random.seed(self.random_state) - tree_sizes = np.random.exponential(scale=self.tree_size - 2, - size=int(np.ceil(self.max_rules * 2 / self.tree_size))) - tree_sizes = np.asarray([2 + np.floor(tree_sizes[i_]) for i_ in np.arange(len(tree_sizes))], dtype=int) - i = int(len(tree_sizes) / 4) - while np.sum(tree_sizes[0:i]) < self.max_rules: - i = i + 1 - tree_sizes = tree_sizes[0:i] - self.tree_generator.set_params(warm_start=True) - curr_est_ = 0 - for i_size in np.arange(len(tree_sizes)): - size = tree_sizes[i_size] - self.tree_generator.set_params(n_estimators=curr_est_ + 1) - self.tree_generator.set_params(max_leaf_nodes=size) - random_state_add = self.random_state if self.random_state else 0 - self.tree_generator.set_params( - random_state=i_size + random_state_add) # warm_state=True seems to reset random_state, such that the trees are highly correlated, unless we manually change the random_sate here. - self.tree_generator.fit(np.copy(X, order='C'), np.copy(y, order='C')) - curr_est_ = curr_est_ + 1 - self.tree_generator.set_params(warm_start=False) - - if isinstance(self.tree_generator, RandomForestRegressor): - self.estimators_ = [[x] for x in self.tree_generator.estimators_] - else: - self.estimators_ = self.tree_generator.estimators_ - def _extract_rules(self): - seen_antecedents = set() - extracted_rules = [] - for estimator in self.estimators_: - for rule_value_pair in tree_to_rules(estimator[0], np.array(self.feature_names_), prediction_values=True): - if rule_value_pair[0] not in seen_antecedents: - extracted_rules.append(rule_value_pair) - seen_antecedents.add(rule_value_pair[0]) - - extracted_rules = sorted(extracted_rules, key=lambda x: x[1]) - extracted_rules = list(map(lambda x: x[0], extracted_rules)) - return extracted_rules - - def _score_rules(self, X, y, rules): + def _extract_rules(self, X, y) -> List[Rule]: + return extract_rulefit(X, y, + feature_names=self.feature_placeholders, + tree_size=self.tree_size, + max_rules=self.max_rules, + memory_par=self.memory_par, + tree_generator=self.tree_generator, + exp_rand_tree_size=self.exp_rand_tree_size, + random_state=self.random_state) + + def _score_rules(self, X, y, rules) -> Tuple[List[Rule], List[float], float]: X_concat = np.zeros([X.shape[0], 0]) # standardise linear variables if requested (for regression model only) @@ -730,6 +629,7 @@

        Subclasses

        Methods

        @@ -751,15 +651,16 @@

        Methods

        if type(y) in [pd.DataFrame, pd.Series]: y = y.values - self.n_obs = X.shape[0] self.n_features_ = X.shape[1] - self.feature_names_, self.feature_dict_ = enum_features(X, feature_names) - - self.tree_generator = self._get_tree_ensemble(classify=False) - self._fit_tree_ensemble(X, y) + self.feature_dict_ = get_feature_dict(X.shape[1], feature_names) + self.feature_placeholders = list(self.feature_dict_.keys()) + self.feature_names = list(self.feature_dict_.values()) - extracted_rules = self._extract_rules() + extracted_rules = self._extract_rules(X, y) self.rules_without_feature_names_, self.coef, self.intercept = self._score_rules(X, y, extracted_rules) + self.rules_ = [ + replace_feature_name(rule, self.feature_dict_) for rule in self.rules_without_feature_names_ + ] return self
        @@ -804,8 +705,7 @@

        Returns

        the coefficients and 'support' the support of the rule in the training data set (X) """ - - n_features = len(self.coef) - len(self.rules_without_feature_names_) + n_features = len(self.coef) - len(self.rules_) rule_ensemble = list(self.rules_without_feature_names_) output_rules = [] ## Add coefficients for linear effects @@ -820,10 +720,10 @@

        Returns

        subregion = np.array(subregion) importance = sum(abs(coef) * abs([x[i] for x in self.winsorizer.trim(subregion)] - self.mean[i])) / len( subregion) - output_rules += [(self.feature_names_[i], 'linear', coef, 1, importance)] + output_rules += [(self.feature_names[i], 'linear', coef, 1, importance)] ## Add rules - for i in range(0, len(self.rules_without_feature_names_)): + for i in range(0, len(self.rules_)): rule = rule_ensemble[i] coef = self.coef[i + n_features] @@ -833,7 +733,7 @@

        Returns

        rkx = self.transform(subregion, [rule])[:, -1] importance = sum(abs(coef) * abs(rkx - rule.support)) / len(subregion) - output_rules += [(rule.__str__(), 'rule', coef, rule.support, importance)] + output_rules += [(self.rules_[i].rule, 'rule', coef, rule.support, importance)] rules = pd.DataFrame(output_rules, columns=["rule", "type", "coef", "support", "importance"]) if exclude_zero_coef: rules = rules.ix[rules.coef != 0] @@ -934,7 +834,7 @@

        Returns

        X_transformed: matrix, shape=(n_samples, n_out) Transformed data set """ - df = pd.DataFrame(X, columns=self.feature_names_) + df = pd.DataFrame(X, columns=self.feature_placeholders) X_transformed = np.zeros([X.shape[0], 0]) for r in rules: diff --git a/docs/rule_set/rule_set.html b/docs/rule_set/rule_set.html index 9b73f8bb..c06d8de4 100644 --- a/docs/rule_set/rule_set.html +++ b/docs/rule_set/rule_set.html @@ -33,13 +33,7 @@

        Module imodels.rule_set.rule_set

        class RuleSet: - def _get_tree_ensemble(self): - pass - - def _fit_tree_ensemble(self, X, y): - pass - - def _extract_rules(self): + def _extract_rules(self, X, y): pass def _score_rules(self, X, y, rules): @@ -50,7 +44,7 @@

        Module imodels.rule_set.rule_set

        def eval_weighted_rule_sum(self, X) -> np.ndarray: - check_is_fitted(self, ['rules_without_feature_names_', 'n_features_', 'feature_names_']) + check_is_fitted(self, ['rules_without_feature_names_', 'n_features_', 'feature_placeholders']) X = check_array(X) if X.shape[1] != self.n_features_: @@ -58,7 +52,7 @@

        Module imodels.rule_set.rule_set

        " Please reshape your data." % (X.shape[1], self.n_features_)) - df = pd.DataFrame(X, columns=self.feature_names_) + df = pd.DataFrame(X, columns=self.feature_placeholders) selected_rules = self.rules_without_feature_names_ scores = np.zeros(X.shape[0]) @@ -89,13 +83,7 @@

        Classes

        class RuleSet:
         
        -    def _get_tree_ensemble(self):
        -        pass
        -
        -    def _fit_tree_ensemble(self, X, y):
        -        pass
        -
        -    def _extract_rules(self):
        +    def _extract_rules(self, X, y):
                 pass
         
             def _score_rules(self, X, y, rules):
        @@ -106,7 +94,7 @@ 

        Classes

        def eval_weighted_rule_sum(self, X) -> np.ndarray: - check_is_fitted(self, ['rules_without_feature_names_', 'n_features_', 'feature_names_']) + check_is_fitted(self, ['rules_without_feature_names_', 'n_features_', 'feature_placeholders']) X = check_array(X) if X.shape[1] != self.n_features_: @@ -114,7 +102,7 @@

        Classes

        " Please reshape your data." % (X.shape[1], self.n_features_)) - df = pd.DataFrame(X, columns=self.feature_names_) + df = pd.DataFrame(X, columns=self.feature_placeholders) selected_rules = self.rules_without_feature_names_ scores = np.zeros(X.shape[0]) @@ -142,7 +130,7 @@

        Methods

        def eval_weighted_rule_sum(self, X) -> np.ndarray:
         
        -    check_is_fitted(self, ['rules_without_feature_names_', 'n_features_', 'feature_names_'])
        +    check_is_fitted(self, ['rules_without_feature_names_', 'n_features_', 'feature_placeholders'])
             X = check_array(X)
         
             if X.shape[1] != self.n_features_:
        @@ -150,7 +138,7 @@ 

        Methods

        " Please reshape your data." % (X.shape[1], self.n_features_)) - df = pd.DataFrame(X, columns=self.feature_names_) + df = pd.DataFrame(X, columns=self.feature_placeholders) selected_rules = self.rules_without_feature_names_ scores = np.zeros(X.shape[0]) diff --git a/docs/rule_set/skope_rules.html b/docs/rule_set/skope_rules.html index 38a94d20..b690c69a 100644 --- a/docs/rule_set/skope_rules.html +++ b/docs/rule_set/skope_rules.html @@ -169,7 +169,8 @@ from imodels.rule_set.rule_set import RuleSet from imodels.util.convert import tree_to_rules -from imodels.util.rule import replace_feature_name, enum_features +from imodels.util.rule import replace_feature_name, get_feature_dict, Rule +from imodels.util.extract import extract_skope from imodels.util.score import score_oob from imodels.util.prune import prune_mins, deduplicate @@ -326,7 +327,7 @@ self.random_state = random_state self.verbose = verbose - def fit(self, X, y, feature_names=None, sample_weight=None) -> 'SkopeRulesClassifier': + def fit(self, X, y, feature_names=None, sample_weight=None): """Fit the model according to the given training data. Parameters @@ -398,20 +399,18 @@ raise ValueError("max_samples must be in (0, 1], got %r" % self.max_samples) max_samples = int(self.max_samples * X.shape[0]) self.max_samples_ = max_samples - self._max_depths = self.max_depth if isinstance(self.max_depth, Iterable) else [self.max_depth] - self.feature_names_, self.feature_dict_ = enum_features(X, feature_names) + self.feature_dict_ = get_feature_dict(X.shape[1], feature_names) + self.feature_placeholders = list(self.feature_dict_.keys()) + self.feature_names = list(self.feature_dict_.values()) - self.tree_generators = self._get_tree_ensemble() - self._fit_tree_ensemble(X, y) - - extracted_rules = self._extract_rules() + extracted_rules, self.estimators_samples_, self.estimators_features_ = self._extract_rules(X, y) scored_rules = self._score_rules(X, y, extracted_rules) self.rules_ = self._prune_rules(scored_rules) self.rules_without_feature_names_ = self.rules_ self.rules_ = [ - (replace_feature_name(rule, self.feature_dict_), perf) for rule, perf in self.rules_ + replace_feature_name(rule, self.feature_dict_) for rule in self.rules_ ] return self @@ -431,13 +430,16 @@ be considered as an outlier according to the selected rules. """ - return np.array((self.eval_weighted_rule_sum(X) > 0), dtype=int) + return np.argmax(self.predict_proba(X), axis=1) def predict_proba(self, X) -> np.ndarray: '''Predict probability of a particular sample being an outlier or not ''' - y = self.rules_vote(X) / len(self.rules_without_feature_names_) + weight_sum = np.sum([w[0] for (r, w) in self.rules_without_feature_names_]) + if weight_sum == 0: + return np.vstack((np.ones(X.shape[0]), np.zeros(X.shape[0]))).transpose() + y = self.eval_weighted_rule_sum(X) / weight_sum return np.vstack((1 - y, y)).transpose() def rules_vote(self, X) -> np.ndarray: @@ -460,8 +462,7 @@ """ # Check if fit had been called - check_is_fitted(self, ['rules_', 'estimators_', 'estimators_samples_', - 'max_samples_']) + check_is_fitted(self, ['rules_', 'estimators_samples_', 'max_samples_']) # Input validation X = check_array(X) @@ -472,8 +473,8 @@ " Please reshape your data." 
% (X.shape[1], self.n_features_)) - df = pandas.DataFrame(X, columns=self.feature_names_) - selected_rules = self.rules_ + df = pandas.DataFrame(X, columns=self.feature_placeholders) + selected_rules = self.rules_without_feature_names_ scores = np.zeros(X.shape[0]) for (r, _) in selected_rules: @@ -502,8 +503,7 @@ """ # Check if fit had been called - check_is_fitted(self, ['rules_', 'estimators_', 'estimators_samples_', - 'max_samples_']) + check_is_fitted(self, ['rules_', 'estimators_samples_', 'max_samples_']) # Input validation X = check_array(X) @@ -514,14 +514,14 @@ " Please reshape your data." % (X.shape[1], self.n_features_)) - df = pandas.DataFrame(X, columns=self.feature_names_) + df = pandas.DataFrame(X, columns=self.feature_placeholders) selected_rules = self.rules_without_feature_names_ scores = np.zeros(X.shape[0]) for (k, r) in enumerate(list((selected_rules))): - scores[list(df.query(r[0]).index)] = np.maximum( + scores[list(df.query(r.rule).index)] = np.maximum( len(selected_rules) - k, - scores[list(df.query(r[0]).index)]) + scores[list(df.query(r.rule).index)]) return scores @@ -550,71 +550,27 @@ return np.array((self.score_top_rules(X) > len(self.rules_) - n_rules), dtype=int) - def _get_tree_ensemble(self) -> Union[List[BaggingClassifier], List[BaggingRegressor]]: - - for ensemble_class, tree_class in [ - (BaggingClassifier, DecisionTreeClassifier), (BaggingRegressor, DecisionTreeRegressor) - ]: - - ensembles = [] - - for max_depth in self._max_depths: - bagging_clf = ensemble_class( - base_estimator=tree_class( - max_depth=max_depth, - max_features=self.max_features, - min_samples_split=self.min_samples_split - ), - n_estimators=self.n_estimators, - max_samples=self.max_samples_, - max_features=self.max_samples_features, - bootstrap=self.bootstrap, - bootstrap_features=self.bootstrap_features, - # oob_score=... XXX may be added - # if selection on tree perf needed. - # warm_start=... XXX may be added to increase computation perf. - n_jobs=self.n_jobs, - random_state=self.random_state, - verbose=self.verbose - ) - ensembles.append(bagging_clf) - - return ensembles - - def _fit_tree_ensemble(self, X, y) -> None: - y_reg = y - if self.sample_weight is not None: - sample_weight = check_array(self.sample_weight, ensure_2d=False) - weights = sample_weight - sample_weight.min() - contamination = float(sum(y)) / len(y) - y_reg = ( - pow(weights, 0.5) * 0.5 / contamination * (y > 0) - - pow((weights).mean(), 0.5) * (y == 0) - ) - y_reg = 1. 
/ (1 + np.exp(-y_reg)) # sigmoid - - for e in self.tree_generators[:len(self.tree_generators) // 2]: - e.fit(X, y) - - for e in self.tree_generators[len(self.tree_generators) // 2:]: - e.fit(X, y_reg) - - def _extract_rules(self): - self.estimators_, self.estimators_samples_, self.estimators_features_ = [], [], [] - for ensemble in self.tree_generators: - self.estimators_ += ensemble.estimators_ - self.estimators_samples_ += ensemble.estimators_samples_ - self.estimators_features_ += ensemble.estimators_features_ - - extracted_rules = [] - for estimator, features in zip(self.estimators_, self.estimators_features_): - extracted_rules.append(tree_to_rules(estimator, np.array(self.feature_names_)[features])) - return extracted_rules - - def _score_rules(self, X, y, rules): - return score_oob(X, y, rules, self.estimators_samples_, self.estimators_features_, self.feature_names_) - - def _prune_rules(self, rules): + def _extract_rules(self, X, y) -> Tuple[List[str], List[np.array], List[np.array]]: + return extract_skope(X, y, + feature_names=self.feature_placeholders, + sample_weight=self.sample_weight, + n_estimators=self.n_estimators, + max_samples=self.max_samples_, + max_samples_features=self.max_samples_features, + bootstrap=self.bootstrap, + bootstrap_features=self.bootstrap_features, + max_depths=self.max_depth, + max_depth_duplication=self.max_depth_duplication, + max_features=self.max_features, + min_samples_split=self.min_samples_split, + n_jobs=self.n_jobs, + random_state=self.random_state, + verbose=self.verbose) + + def _score_rules(self, X, y, rules) -> List[Rule]: + return score_oob(X, y, rules, self.estimators_samples_, self.estimators_features_, self.feature_placeholders) + + def _prune_rules(self, rules) -> List[Rule]: return deduplicate( prune_mins(rules, self.precision_min, self.recall_min), self.max_depth_duplication @@ -889,7 +845,7 @@

        Attributes

        self.random_state = random_state self.verbose = verbose - def fit(self, X, y, feature_names=None, sample_weight=None) -> 'SkopeRulesClassifier': + def fit(self, X, y, feature_names=None, sample_weight=None): """Fit the model according to the given training data. Parameters @@ -961,20 +917,18 @@

        Attributes

        raise ValueError("max_samples must be in (0, 1], got %r" % self.max_samples) max_samples = int(self.max_samples * X.shape[0]) self.max_samples_ = max_samples - self._max_depths = self.max_depth if isinstance(self.max_depth, Iterable) else [self.max_depth] - - self.feature_names_, self.feature_dict_ = enum_features(X, feature_names) - self.tree_generators = self._get_tree_ensemble() - self._fit_tree_ensemble(X, y) + self.feature_dict_ = get_feature_dict(X.shape[1], feature_names) + self.feature_placeholders = list(self.feature_dict_.keys()) + self.feature_names = list(self.feature_dict_.values()) - extracted_rules = self._extract_rules() + extracted_rules, self.estimators_samples_, self.estimators_features_ = self._extract_rules(X, y) scored_rules = self._score_rules(X, y, extracted_rules) self.rules_ = self._prune_rules(scored_rules) self.rules_without_feature_names_ = self.rules_ self.rules_ = [ - (replace_feature_name(rule, self.feature_dict_), perf) for rule, perf in self.rules_ + replace_feature_name(rule, self.feature_dict_) for rule in self.rules_ ] return self @@ -994,13 +948,16 @@

        Attributes

        be considered as an outlier according to the selected rules. """ - return np.array((self.eval_weighted_rule_sum(X) > 0), dtype=int) + return np.argmax(self.predict_proba(X), axis=1) def predict_proba(self, X) -> np.ndarray: '''Predict probability of a particular sample being an outlier or not ''' - y = self.rules_vote(X) / len(self.rules_without_feature_names_) + weight_sum = np.sum([w[0] for (r, w) in self.rules_without_feature_names_]) + if weight_sum == 0: + return np.vstack((np.ones(X.shape[0]), np.zeros(X.shape[0]))).transpose() + y = self.eval_weighted_rule_sum(X) / weight_sum return np.vstack((1 - y, y)).transpose() def rules_vote(self, X) -> np.ndarray: @@ -1023,8 +980,7 @@

        Attributes

        """ # Check if fit had been called - check_is_fitted(self, ['rules_', 'estimators_', 'estimators_samples_', - 'max_samples_']) + check_is_fitted(self, ['rules_', 'estimators_samples_', 'max_samples_']) # Input validation X = check_array(X) @@ -1035,8 +991,8 @@

        Attributes

        " Please reshape your data." % (X.shape[1], self.n_features_)) - df = pandas.DataFrame(X, columns=self.feature_names_) - selected_rules = self.rules_ + df = pandas.DataFrame(X, columns=self.feature_placeholders) + selected_rules = self.rules_without_feature_names_ scores = np.zeros(X.shape[0]) for (r, _) in selected_rules: @@ -1065,8 +1021,7 @@

        Attributes

        """ # Check if fit had been called - check_is_fitted(self, ['rules_', 'estimators_', 'estimators_samples_', - 'max_samples_']) + check_is_fitted(self, ['rules_', 'estimators_samples_', 'max_samples_']) # Input validation X = check_array(X) @@ -1077,14 +1032,14 @@

        Attributes

        " Please reshape your data." % (X.shape[1], self.n_features_)) - df = pandas.DataFrame(X, columns=self.feature_names_) + df = pandas.DataFrame(X, columns=self.feature_placeholders) selected_rules = self.rules_without_feature_names_ scores = np.zeros(X.shape[0]) for (k, r) in enumerate(list((selected_rules))): - scores[list(df.query(r[0]).index)] = np.maximum( + scores[list(df.query(r.rule).index)] = np.maximum( len(selected_rules) - k, - scores[list(df.query(r[0]).index)]) + scores[list(df.query(r.rule).index)]) return scores @@ -1113,71 +1068,27 @@

        Attributes

        return np.array((self.score_top_rules(X) > len(self.rules_) - n_rules), dtype=int) - def _get_tree_ensemble(self) -> Union[List[BaggingClassifier], List[BaggingRegressor]]: - - for ensemble_class, tree_class in [ - (BaggingClassifier, DecisionTreeClassifier), (BaggingRegressor, DecisionTreeRegressor) - ]: - - ensembles = [] - - for max_depth in self._max_depths: - bagging_clf = ensemble_class( - base_estimator=tree_class( - max_depth=max_depth, - max_features=self.max_features, - min_samples_split=self.min_samples_split - ), - n_estimators=self.n_estimators, - max_samples=self.max_samples_, - max_features=self.max_samples_features, - bootstrap=self.bootstrap, - bootstrap_features=self.bootstrap_features, - # oob_score=... XXX may be added - # if selection on tree perf needed. - # warm_start=... XXX may be added to increase computation perf. - n_jobs=self.n_jobs, - random_state=self.random_state, - verbose=self.verbose - ) - ensembles.append(bagging_clf) - - return ensembles - - def _fit_tree_ensemble(self, X, y) -> None: - y_reg = y - if self.sample_weight is not None: - sample_weight = check_array(self.sample_weight, ensure_2d=False) - weights = sample_weight - sample_weight.min() - contamination = float(sum(y)) / len(y) - y_reg = ( - pow(weights, 0.5) * 0.5 / contamination * (y > 0) - - pow((weights).mean(), 0.5) * (y == 0) - ) - y_reg = 1. / (1 + np.exp(-y_reg)) # sigmoid - - for e in self.tree_generators[:len(self.tree_generators) // 2]: - e.fit(X, y) - - for e in self.tree_generators[len(self.tree_generators) // 2:]: - e.fit(X, y_reg) - - def _extract_rules(self): - self.estimators_, self.estimators_samples_, self.estimators_features_ = [], [], [] - for ensemble in self.tree_generators: - self.estimators_ += ensemble.estimators_ - self.estimators_samples_ += ensemble.estimators_samples_ - self.estimators_features_ += ensemble.estimators_features_ - - extracted_rules = [] - for estimator, features in zip(self.estimators_, self.estimators_features_): - extracted_rules.append(tree_to_rules(estimator, np.array(self.feature_names_)[features])) - return extracted_rules - - def _score_rules(self, X, y, rules): - return score_oob(X, y, rules, self.estimators_samples_, self.estimators_features_, self.feature_names_) - - def _prune_rules(self, rules): + def _extract_rules(self, X, y) -> Tuple[List[str], List[np.array], List[np.array]]: + return extract_skope(X, y, + feature_names=self.feature_placeholders, + sample_weight=self.sample_weight, + n_estimators=self.n_estimators, + max_samples=self.max_samples_, + max_samples_features=self.max_samples_features, + bootstrap=self.bootstrap, + bootstrap_features=self.bootstrap_features, + max_depths=self.max_depth, + max_depth_duplication=self.max_depth_duplication, + max_features=self.max_features, + min_samples_split=self.min_samples_split, + n_jobs=self.n_jobs, + random_state=self.random_state, + verbose=self.verbose) + + def _score_rules(self, X, y, rules) -> List[Rule]: + return score_oob(X, y, rules, self.estimators_samples_, self.estimators_features_, self.feature_placeholders) + + def _prune_rules(self, rules) -> List[Rule]: return deduplicate( prune_mins(rules, self.precision_min, self.recall_min), self.max_depth_duplication @@ -1188,6 +1099,10 @@

        Ancestors

      • sklearn.base.BaseEstimator
      • RuleSet
      +

      Subclasses

      +

      Methods

      @@ -1218,7 +1133,7 @@

      Returns

      Expand source code -
      def fit(self, X, y, feature_names=None, sample_weight=None) -> 'SkopeRulesClassifier':
      +
      def fit(self, X, y, feature_names=None, sample_weight=None):
           """Fit the model according to the given training data.
       
           Parameters
      @@ -1290,20 +1205,18 @@ 

      Returns

      raise ValueError("max_samples must be in (0, 1], got %r" % self.max_samples) max_samples = int(self.max_samples * X.shape[0]) self.max_samples_ = max_samples - self._max_depths = self.max_depth if isinstance(self.max_depth, Iterable) else [self.max_depth] - - self.feature_names_, self.feature_dict_ = enum_features(X, feature_names) - self.tree_generators = self._get_tree_ensemble() - self._fit_tree_ensemble(X, y) + self.feature_dict_ = get_feature_dict(X.shape[1], feature_names) + self.feature_placeholders = list(self.feature_dict_.keys()) + self.feature_names = list(self.feature_dict_.values()) - extracted_rules = self._extract_rules() + extracted_rules, self.estimators_samples_, self.estimators_features_ = self._extract_rules(X, y) scored_rules = self._score_rules(X, y, extracted_rules) self.rules_ = self._prune_rules(scored_rules) self.rules_without_feature_names_ = self.rules_ self.rules_ = [ - (replace_feature_name(rule, self.feature_dict_), perf) for rule, perf in self.rules_ + replace_feature_name(rule, self.feature_dict_) for rule in self.rules_ ] return self
      @@ -1345,7 +1258,7 @@

      Returns

      be considered as an outlier according to the selected rules. """ - return np.array((self.eval_weighted_rule_sum(X) > 0), dtype=int)
      + return np.argmax(self.predict_proba(X), axis=1)
    • @@ -1361,7 +1274,10 @@

      Returns

      '''Predict probability of a particular sample being an outlier or not ''' - y = self.rules_vote(X) / len(self.rules_without_feature_names_) + weight_sum = np.sum([w[0] for (r, w) in self.rules_without_feature_names_]) + if weight_sum == 0: + return np.vstack((np.ones(X.shape[0]), np.zeros(X.shape[0]))).transpose() + y = self.eval_weighted_rule_sum(X) / weight_sum return np.vstack((1 - y, y)).transpose()
      @@ -1460,8 +1376,7 @@

      Returns

      """ # Check if fit had been called - check_is_fitted(self, ['rules_', 'estimators_', 'estimators_samples_', - 'max_samples_']) + check_is_fitted(self, ['rules_', 'estimators_samples_', 'max_samples_']) # Input validation X = check_array(X) @@ -1472,8 +1387,8 @@

      Returns

      " Please reshape your data." % (X.shape[1], self.n_features_)) - df = pandas.DataFrame(X, columns=self.feature_names_) - selected_rules = self.rules_ + df = pandas.DataFrame(X, columns=self.feature_placeholders) + selected_rules = self.rules_without_feature_names_ scores = np.zeros(X.shape[0]) for (r, _) in selected_rules: @@ -1527,8 +1442,7 @@

      Returns

      """ # Check if fit had been called - check_is_fitted(self, ['rules_', 'estimators_', 'estimators_samples_', - 'max_samples_']) + check_is_fitted(self, ['rules_', 'estimators_samples_', 'max_samples_']) # Input validation X = check_array(X) @@ -1539,14 +1453,14 @@

      Returns

      " Please reshape your data." % (X.shape[1], self.n_features_)) - df = pandas.DataFrame(X, columns=self.feature_names_) + df = pandas.DataFrame(X, columns=self.feature_placeholders) selected_rules = self.rules_without_feature_names_ scores = np.zeros(X.shape[0]) for (k, r) in enumerate(list((selected_rules))): - scores[list(df.query(r[0]).index)] = np.maximum( + scores[list(df.query(r.rule).index)] = np.maximum( len(selected_rules) - k, - scores[list(df.query(r[0]).index)]) + scores[list(df.query(r.rule).index)]) return scores
      diff --git a/docs/util/convert.html b/docs/util/convert.html index 8975e293..394e8ff1 100644 --- a/docs/util/convert.html +++ b/docs/util/convert.html @@ -28,7 +28,7 @@

      Module imodels.util.convert

      from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor from sklearn.tree import _tree from sklearn.tree import _tree -from typing import Union, List +from typing import Union, List, Tuple def tree_to_rules(tree: Union[DecisionTreeClassifier, DecisionTreeRegressor], @@ -123,7 +123,32 @@

      Module imodels.util.convert

      # space=node_depth[i] * "\t", node=i)) else: s += f"{feature_names[feature[i]]} <= {threshold[i]}" - return s + '\n'
      + return s + '\n' + + +def itemsets_to_rules(itemsets: List[Tuple]) -> List[str]: + itemsets_without_all = [itemset for itemset in itemsets if 'All' not in ''.join(itemset)] + f = lambda itemset: ' and '.join([single_discretized_feature_to_rule(item) for item in itemset]) + return list(map(f, itemsets_without_all)) + + +def single_discretized_feature_to_rule(feat: str) -> str: + + feat_split = feat.split('_to_') + upper_value = feat_split[-1] + lower_value = feat_split[-2].split('_')[-1] + + lower_to_upper_len = 1 + len(lower_value) + 4 + len(upper_value) + feature_name = feat[:-lower_to_upper_len] + + if lower_value == '-inf': + rule = f'{feature_name} <= {upper_value}' + elif upper_value == 'inf': + rule = f'{feature_name} > {lower_value}' + else: + rule = f'{feature_name} > {lower_value} and {feature_name} <= {upper_value}' + + return rule
      @@ -133,6 +158,49 @@

      Module imodels.util.convert

      Functions

      +
      +def itemsets_to_rules(itemsets) +
      +
      +
      +
      + +Expand source code + +
      def itemsets_to_rules(itemsets: List[Tuple]) -> List[str]:
      +    itemsets_without_all = [itemset for itemset in itemsets if 'All' not in ''.join(itemset)]
      +    f = lambda itemset: ' and '.join([single_discretized_feature_to_rule(item) for item in itemset])
      +    return list(map(f, itemsets_without_all))
      +
      +
      +
      +def single_discretized_feature_to_rule(feat) +
      +
      +
      +
      + +Expand source code + +
      def single_discretized_feature_to_rule(feat: str) -> str:
      +    
      +    feat_split = feat.split('_to_')
      +    upper_value = feat_split[-1]
      +    lower_value = feat_split[-2].split('_')[-1]
      +    
      +    lower_to_upper_len = 1 + len(lower_value) + 4 + len(upper_value)
      +    feature_name = feat[:-lower_to_upper_len]
      +    
      +    if lower_value == '-inf':
      +        rule = f'{feature_name} <= {upper_value}'
      +    elif upper_value == 'inf':
      +        rule = f'{feature_name} > {lower_value}'
      +    else:
      +        rule = f'{feature_name} > {lower_value} and {feature_name} <= {upper_value}'
      +    
      +    return rule
      +
      +
      def tree_to_code(clf, feature_names)
      @@ -270,6 +338,8 @@

      Index

    • Functions

      diff --git a/docs/util/discretization/index.html b/docs/util/discretization/index.html index 0e3fe17b..0412e6f8 100644 --- a/docs/util/discretization/index.html +++ b/docs/util/discretization/index.html @@ -33,10 +33,10 @@

      Discretization MDLP

python MDLPC.py --options=…
• script options: - in_path (required): Path to dataset in .csv format (must include header) - out_path (required): Path where the discretized dataset will be saved - features (optional): comma-separated list of attribute names to be discretized, e.g., features=attr1,attr2,attr3 - class_label (required): label of class column in .csv dataset +* in_path (required): Path to dataset in .csv format (must include header) +* out_path (required): Path where the discretized dataset will be saved +* features (optional): comma-separated list of attribute names to be discretized, e.g., features=attr1,attr2,attr3 +* class_label (required): label of class column in .csv dataset * return_bins (optional): Takes no value. If specified (--return_bins), a text file will be saved in the same directory as out_path. This file will include the description of the bins computed by the algorithm.
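For example, a typical invocation (the file names and class label here are purely illustrative) might look like:

python MDLPC.py --in_path=data.csv --out_path=data_discretized.csv --class_label=target --features=attr1,attr2 --return_bins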

      Dependencies:

        diff --git a/docs/util/discretization/mdlp.html b/docs/util/discretization/mdlp.html index 655498a1..867bc803 100644 --- a/docs/util/discretization/mdlp.html +++ b/docs/util/discretization/mdlp.html @@ -29,6 +29,7 @@

        Module imodels.util.discretization.mdlp

        __author__ = 'Victor Ruiz, vmr11@pitt.edu' from math import log +import numbers import numpy as np import pandas as pd @@ -283,7 +284,106 @@

        Module imodels.util.discretization.mdlp

        # print(>>bins_file, 'Description of bins in file: %s' % out_data_path) for attr in self._features: print('attr: %s\n\t%s' % (attr, ', '.join([bin_label for bin_label in bin_label_collection[attr]])), - file=bins_file) + file=bins_file) + +class BRLDiscretizer: + + def __init__(self, X, y, feature_labels, verbose=False): + self.feature_labels = feature_labels + self.verbose = verbose + + def discretize_mixed_data(self, X, y, undiscretized_features=[]): + if type(X) != list: + X = np.array(X).tolist() + + # check which features are numeric (to be discretized) + self.discretized_features = [] + for fi in range(len(X[0])): + # if not string, and not specified as undiscretized + if isinstance(X[0][fi], numbers.Number) \ + and (len(self.feature_labels) == 0 or \ + len(undiscretized_features) == 0 or \ + self.feature_labels[fi] not in undiscretized_features): + self.discretized_features.append(self.feature_labels[fi]) + + if len(self.discretized_features) > 0: + if self.verbose: + print( + "Warning: non-categorical data found. Trying to discretize. (Please convert categorical values to " + "strings, and/or specify the argument 'undiscretized_features', to avoid this.)") + X = self.discretize(X, y) + + self.discretized_X = X + return X + + def discretize(self, X, y): + '''Discretize the features specified in self.discretized_features + ''' + if self.verbose: + print("Discretizing ", self.discretized_features, "...") + D = pd.DataFrame(np.hstack((X, np.array(y).reshape((len(y), 1)))), columns=list(self.feature_labels) + ["y"]) + self.discretizer = MDLP_Discretizer(dataset=D, class_label="y", features=self.discretized_features) + + cat_data = pd.DataFrame(np.zeros_like(X)) + for i in range(len(self.feature_labels)): + label = self.feature_labels[i] + if label in self.discretized_features: + column = [] + for j in range(len(self.discretizer._data[label])): + column += [label + " : " + self.discretizer._data[label][j]] + cat_data.iloc[:, i] = np.array(column) + else: + cat_data.iloc[:, i] = D[label] + + return np.array(cat_data).tolist() + + def apply_discretization(self, X, return_onehot=False): + + if type(X) in [pd.DataFrame, pd.Series]: + X = X.values + + self.data = pd.DataFrame(X, columns=self.feature_labels) + self.apply_cutpoints() + D = np.array(self.data) + + # prepend feature labels + Dl = np.copy(D).astype(str).tolist() + for i in range(len(Dl)): + for j in range(len(Dl[0])): + Dl[i][j] = self.feature_labels[j] + " : " + Dl[i][j] + + if not return_onehot: + return Dl + else: + return self.get_onehot_df(Dl) + + @property + def onehot_df(self): + return self.get_onehot_df(self.discretized_X) + + def get_onehot_df(self, discretized_X): + '''Create readable one-hot encoded DataFrame from discretized features + ''' + data = list(discretized_X[:]) + + X_colname_removed = data.copy() + for i in range(len(data)): + X_colname_removed[i] = list(map(lambda s: s.split(' : ')[1], X_colname_removed[i])) + + X_df_categorical = pd.DataFrame(X_colname_removed, columns=self.feature_labels) + X_df_onehot = pd.get_dummies(X_df_categorical) + return X_df_onehot + + @property + def data(self): + return self.discretizer._data + + @data.setter + def data(self, value): + self.discretizer._data = value + + def apply_cutpoints(self): + return self.discretizer.apply_cutpoints()
      @@ -295,6 +395,277 @@

      Module imodels.util.discretization.mdlp

      Classes

      +
      +class BRLDiscretizer +(X, y, feature_labels, verbose=False) +
      +
      +
      +
      + +Expand source code + +
      class BRLDiscretizer:
      +    
      +    def __init__(self, X, y, feature_labels, verbose=False):
      +        self.feature_labels = feature_labels
      +        self.verbose = verbose
      + 
      +    def discretize_mixed_data(self, X, y, undiscretized_features=[]):
      +        if type(X) != list:
      +            X = np.array(X).tolist()
      +
      +        # check which features are numeric (to be discretized)
      +        self.discretized_features = []
      +        for fi in range(len(X[0])):
      +            # if not string, and not specified as undiscretized
      +            if isinstance(X[0][fi], numbers.Number) \
      +                    and (len(self.feature_labels) == 0 or \
      +                         len(undiscretized_features) == 0 or \
      +                         self.feature_labels[fi] not in undiscretized_features):
      +                self.discretized_features.append(self.feature_labels[fi])
      +
      +        if len(self.discretized_features) > 0:
      +            if self.verbose:
      +                print(
      +                    "Warning: non-categorical data found. Trying to discretize. (Please convert categorical values to "
      +                    "strings, and/or specify the argument 'undiscretized_features', to avoid this.)")
      +            X = self.discretize(X, y)
      +        
      +        self.discretized_X = X
      +        return X
      +    
      +    def discretize(self, X, y):
      +        '''Discretize the features specified in self.discretized_features
      +        '''
      +        if self.verbose:
      +            print("Discretizing ", self.discretized_features, "...")
      +        D = pd.DataFrame(np.hstack((X, np.array(y).reshape((len(y), 1)))), columns=list(self.feature_labels) + ["y"])
      +        self.discretizer = MDLP_Discretizer(dataset=D, class_label="y", features=self.discretized_features)
      +
      +        cat_data = pd.DataFrame(np.zeros_like(X))
      +        for i in range(len(self.feature_labels)):
      +            label = self.feature_labels[i]
      +            if label in self.discretized_features:
      +                column = []
      +                for j in range(len(self.discretizer._data[label])):
      +                    column += [label + " : " + self.discretizer._data[label][j]]
      +                cat_data.iloc[:, i] = np.array(column)
      +            else:
      +                cat_data.iloc[:, i] = D[label]
      +
      +        return np.array(cat_data).tolist()
      +
      +    def apply_discretization(self, X, return_onehot=False):
      +        
      +        if type(X) in [pd.DataFrame, pd.Series]:
      +            X = X.values
      +        
      +        self.data = pd.DataFrame(X, columns=self.feature_labels)
      +        self.apply_cutpoints()
      +        D = np.array(self.data)
      +
      +        # prepend feature labels
      +        Dl = np.copy(D).astype(str).tolist()
      +        for i in range(len(Dl)):
      +            for j in range(len(Dl[0])):
      +                Dl[i][j] = self.feature_labels[j] + " : " + Dl[i][j]
      +        
      +        if not return_onehot:
      +            return Dl
      +        else:
      +            return self.get_onehot_df(Dl)
      +    
      +    @property
      +    def onehot_df(self):
      +        return self.get_onehot_df(self.discretized_X)
      +
      +    def get_onehot_df(self, discretized_X):
      +        '''Create readable one-hot encoded DataFrame from discretized features
      +        '''
      +        data = list(discretized_X[:])
      +
      +        X_colname_removed = data.copy()
      +        for i in range(len(data)):
      +            X_colname_removed[i] = list(map(lambda s: s.split(' : ')[1], X_colname_removed[i]))
      +
      +        X_df_categorical = pd.DataFrame(X_colname_removed, columns=self.feature_labels)
      +        X_df_onehot = pd.get_dummies(X_df_categorical)
      +        return X_df_onehot
      +    
      +    @property
      +    def data(self):
      +        return self.discretizer._data
      +
      +    @data.setter
      +    def data(self, value):
      +        self.discretizer._data = value
      +    
      +    def apply_cutpoints(self):
      +        return self.discretizer.apply_cutpoints()
      +
      +

      Instance variables

      +
      +
      var data
      +
      +
      +
      + +Expand source code + +
      @property
      +def data(self):
      +    return self.discretizer._data
      +
      +
      +
      var onehot_df
      +
      +
      +
      + +Expand source code + +
      @property
      +def onehot_df(self):
      +    return self.get_onehot_df(self.discretized_X)
      +
      +
      +
      +

      Methods

      +
      +
      +def apply_cutpoints(self) +
      +
      +
      +
      + +Expand source code + +
      def apply_cutpoints(self):
      +    return self.discretizer.apply_cutpoints()
      +
      +
      +
      +def apply_discretization(self, X, return_onehot=False) +
      +
      +
      +
      + +Expand source code + +
      def apply_discretization(self, X, return_onehot=False):
      +    
      +    if type(X) in [pd.DataFrame, pd.Series]:
      +        X = X.values
      +    
      +    self.data = pd.DataFrame(X, columns=self.feature_labels)
      +    self.apply_cutpoints()
      +    D = np.array(self.data)
      +
      +    # prepend feature labels
      +    Dl = np.copy(D).astype(str).tolist()
      +    for i in range(len(Dl)):
      +        for j in range(len(Dl[0])):
      +            Dl[i][j] = self.feature_labels[j] + " : " + Dl[i][j]
      +    
      +    if not return_onehot:
      +        return Dl
      +    else:
      +        return self.get_onehot_df(Dl)
      +
      +
      +
      +def discretize(self, X, y) +
      +
      +

      Discretize the features specified in self.discretized_features

      +
      + +Expand source code + +
      def discretize(self, X, y):
      +    '''Discretize the features specified in self.discretized_features
      +    '''
      +    if self.verbose:
      +        print("Discretizing ", self.discretized_features, "...")
      +    D = pd.DataFrame(np.hstack((X, np.array(y).reshape((len(y), 1)))), columns=list(self.feature_labels) + ["y"])
      +    self.discretizer = MDLP_Discretizer(dataset=D, class_label="y", features=self.discretized_features)
      +
      +    cat_data = pd.DataFrame(np.zeros_like(X))
      +    for i in range(len(self.feature_labels)):
      +        label = self.feature_labels[i]
      +        if label in self.discretized_features:
      +            column = []
      +            for j in range(len(self.discretizer._data[label])):
      +                column += [label + " : " + self.discretizer._data[label][j]]
      +            cat_data.iloc[:, i] = np.array(column)
      +        else:
      +            cat_data.iloc[:, i] = D[label]
      +
      +    return np.array(cat_data).tolist()
      +
      +
      +
      +def discretize_mixed_data(self, X, y, undiscretized_features=[]) +
      +
      +
      +
      + +Expand source code + +
      def discretize_mixed_data(self, X, y, undiscretized_features=[]):
      +    if type(X) != list:
      +        X = np.array(X).tolist()
      +
      +    # check which features are numeric (to be discretized)
      +    self.discretized_features = []
      +    for fi in range(len(X[0])):
      +        # if not string, and not specified as undiscretized
      +        if isinstance(X[0][fi], numbers.Number) \
      +                and (len(self.feature_labels) == 0 or \
      +                     len(undiscretized_features) == 0 or \
      +                     self.feature_labels[fi] not in undiscretized_features):
      +            self.discretized_features.append(self.feature_labels[fi])
      +
      +    if len(self.discretized_features) > 0:
      +        if self.verbose:
      +            print(
      +                "Warning: non-categorical data found. Trying to discretize. (Please convert categorical values to "
      +                "strings, and/or specify the argument 'undiscretized_features', to avoid this.)")
      +        X = self.discretize(X, y)
      +    
      +    self.discretized_X = X
      +    return X
      +
      +
      +
      +def get_onehot_df(self, discretized_X) +
      +
      +

      Create readable one-hot encoded DataFrame from discretized features

      +
      + +Expand source code + +
      def get_onehot_df(self, discretized_X):
      +    '''Create readable one-hot encoded DataFrame from discretized features
      +    '''
      +    data = list(discretized_X[:])
      +
      +    X_colname_removed = data.copy()
      +    for i in range(len(data)):
      +        X_colname_removed[i] = list(map(lambda s: s.split(' : ')[1], X_colname_removed[i]))
      +
      +    X_df_categorical = pd.DataFrame(X_colname_removed, columns=self.feature_labels)
      +    X_df_onehot = pd.get_dummies(X_df_categorical)
      +    return X_df_onehot
      +
      +
      +
      +
      class MDLP_Discretizer (dataset, class_label, out_path_data=None, out_path_bins=None, features=None) @@ -887,6 +1258,18 @@

      Index

    • Classes

      • +

        BRLDiscretizer

        + +
      • +
      • MDLP_Discretizer

        • MDLPC_criterion
        • diff --git a/docs/util/extract.html b/docs/util/extract.html new file mode 100644 index 00000000..0540843a --- /dev/null +++ b/docs/util/extract.html @@ -0,0 +1,442 @@ + + + + + + +imodels.util.extract API documentation + + + + + + + + + +
          +
          +
          +

          Module imodels.util.extract

          +
          +
          +
          + +Expand source code + +
          from typing import Iterable, Tuple, List
          +
          +import numpy as np
          +import pandas as pd
          +from sklearn.ensemble import BaggingClassifier, BaggingRegressor, GradientBoostingRegressor, RandomForestRegressor
          +from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
          +from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
          +from mlxtend.frequent_patterns import fpgrowth
          +
          +from imodels.util.convert import tree_to_rules
          +from imodels.util.discretization.mdlp import BRLDiscretizer
          +
          +
          +def extract_fpgrowth(X, y,
          +                     feature_labels=None,
          +                     minsupport=0.1,
          +                     maxcardinality=2,
          +                     undiscretized_features=[],
          +                     verbose=False) -> Tuple[List[Tuple], BRLDiscretizer]:
          +
          +    # deal with pandas data
          +    if type(X) in [pd.DataFrame, pd.Series]:
          +        if feature_labels is None:
          +            feature_labels = X.columns
          +        X = X.values
          +    if type(y) in [pd.DataFrame, pd.Series]:
          +        y = y.values
          +
          +    if feature_labels is None:
          +        feature_labels = [f'feature_{i}' for i in range(X.shape[1])]
          +    
          +    discretizer = BRLDiscretizer(X, y, feature_labels=feature_labels, verbose=verbose)
          +    X = discretizer.discretize_mixed_data(X, y, undiscretized_features)
          +    X_df_onehot = discretizer.onehot_df
          +    
          +    # Now find frequent itemsets
          +    itemsets_df = fpgrowth(X_df_onehot, min_support=minsupport, max_len=maxcardinality)
          +    itemsets_indices = [tuple(s[1]) for s in itemsets_df.values]
          +    itemsets = [np.array(X_df_onehot.columns)[list(inds)] for inds in itemsets_indices]
          +    itemsets = list(map(tuple, itemsets))
          +    if verbose:
          +        print(len(itemsets), 'rules mined')
          +
          +    return itemsets, discretizer
          +
          +
          +def extract_rulefit(X, y, feature_names,
          +                    tree_size=4,
          +                    max_rules=2000,
          +                    memory_par=0.01,
          +                    tree_generator=None,
          +                    exp_rand_tree_size=True,
          +                    random_state=None) -> List[str]:
          +
          +    if tree_generator is None:
          +        n_estimators_default = int(np.ceil(max_rules / tree_size))
          +        sample_fract_ = min(0.5, (100 + 6 * np.sqrt(X.shape[0])) / X.shape[0])
          +
          +        tree_generator = GradientBoostingRegressor(n_estimators=n_estimators_default,
          +                                                    max_leaf_nodes=tree_size,
          +                                                    learning_rate=memory_par,
          +                                                    subsample=sample_fract_,
          +                                                    random_state=random_state,
          +                                                    max_depth=100)
          +
          +    if type(tree_generator) not in [GradientBoostingRegressor, RandomForestRegressor]:
          +        raise ValueError("RuleFit only works with RandomForest and BoostingRegressor")
          +
          +    ## fit tree generator
          +    if not exp_rand_tree_size:  # simply fit with constant tree size
          +        tree_generator.fit(X, y)
          +    else:  # randomise tree size as per Friedman 2005 Sec 3.3
          +        np.random.seed(random_state)
          +        tree_sizes = np.random.exponential(scale=tree_size - 2,
          +                                            size=int(np.ceil(max_rules * 2 / tree_size)))
          +        tree_sizes = np.asarray([2 + np.floor(tree_sizes[i_]) for i_ in np.arange(len(tree_sizes))], dtype=int)
          +        i = int(len(tree_sizes) / 4)
          +        while np.sum(tree_sizes[0:i]) < max_rules:
          +            i = i + 1
          +        tree_sizes = tree_sizes[0:i]
          +        tree_generator.set_params(warm_start=True)
          +        curr_est_ = 0
          +        for i_size in np.arange(len(tree_sizes)):
          +            size = tree_sizes[i_size]
          +            tree_generator.set_params(n_estimators=curr_est_ + 1)
          +            tree_generator.set_params(max_leaf_nodes=size)
          +            random_state_add = random_state if random_state else 0
          +            tree_generator.set_params(
+                random_state=i_size + random_state_add)  # warm_start=True seems to reset random_state, such that the trees are highly correlated, unless we manually change the random_state here.
          +            tree_generator.fit(np.copy(X, order='C'), np.copy(y, order='C'))
          +            curr_est_ = curr_est_ + 1
          +        tree_generator.set_params(warm_start=False)
          +
          +    if isinstance(tree_generator, RandomForestRegressor):
          +        estimators_ = [[x] for x in tree_generator.estimators_]
          +    else:
          +        estimators_ = tree_generator.estimators_
          +
          +    seen_antecedents = set()
          +    extracted_rules = [] 
          +    for estimator in estimators_:
          +        for rule_value_pair in tree_to_rules(estimator[0], np.array(feature_names), prediction_values=True):
          +            if rule_value_pair[0] not in seen_antecedents:
          +                extracted_rules.append(rule_value_pair)
          +                seen_antecedents.add(rule_value_pair[0])
          +    
          +    extracted_rules = sorted(extracted_rules, key=lambda x: x[1])
          +    extracted_rules = list(map(lambda x: x[0], extracted_rules))
          +    return extracted_rules
          +
          +
          +def extract_skope(X, y, feature_names, 
          +                  sample_weight=None,
          +                  n_estimators=10,
          +                  max_samples=.8,
          +                  max_samples_features=1.,
          +                  bootstrap=False,
          +                  bootstrap_features=False,
          +                  max_depths=[3], 
          +                  max_depth_duplication=None,
          +                  max_features=1.,
          +                  min_samples_split=2,
          +                  n_jobs=1,
          +                  random_state=None,
          +                  verbose=0) -> Tuple[List[str], List[np.array], List[np.array]]:
          +    
          +    ensembles = []
          +    if not isinstance(max_depths, Iterable):
          +        max_depths = [max_depths]
          +
          +    for max_depth in max_depths:
          +        bagging_clf = BaggingRegressor(
          +            base_estimator= DecisionTreeRegressor(
          +                max_depth=max_depth,
          +                max_features=max_features,
          +                min_samples_split=min_samples_split
          +            ),
          +            n_estimators=n_estimators,
          +            max_samples=max_samples,
          +            max_features=max_samples_features,
          +            bootstrap=bootstrap,
          +            bootstrap_features=bootstrap_features,
          +            # oob_score=... XXX may be added
          +            # if selection on tree perf needed.
          +            # warm_start=... XXX may be added to increase computation perf.
          +            n_jobs=n_jobs,
          +            random_state=random_state,
          +            verbose=verbose
          +        )
          +        ensembles.append(bagging_clf)
          +
          +    y_reg = y
          +    if sample_weight is not None:
          +        sample_weight = check_array(sample_weight, ensure_2d=False)
          +        weights = sample_weight - sample_weight.min()
          +        contamination = float(sum(y)) / len(y)
          +        y_reg = (
          +                pow(weights, 0.5) * 0.5 / contamination * (y > 0) -
          +                pow((weights).mean(), 0.5) * (y == 0)
          +        )
          +        y_reg = 1. / (1 + np.exp(-y_reg))  # sigmoid
          +
          +    for e in ensembles[:len(ensembles) // 2]:
          +        e.fit(X, y)
          +
          +    for e in ensembles[len(ensembles) // 2:]:
          +        e.fit(X, y_reg)
          +
          +    estimators_, estimators_samples_, estimators_features_ = [], [], []
          +    for ensemble in ensembles:
          +        estimators_ += ensemble.estimators_
          +        estimators_samples_ += ensemble.estimators_samples_
          +        estimators_features_ += ensemble.estimators_features_
          +
          +    extracted_rules = []
          +    for estimator, features in zip(estimators_, estimators_features_):
          +        extracted_rules.append(tree_to_rules(estimator, np.array(feature_names)[features]))
          +    
          +    return extracted_rules, estimators_samples_, estimators_features_
          +
          +
          +
          +
          +
          +
          +
          +
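A hedged usage sketch of extract_skope; the training data and feature names below are placeholders:

rules, samples, features = extract_skope(X_train, y_train,
                                         feature_names=['feature_0', 'feature_1'],
                                         n_estimators=10,
                                         max_samples=0.8,
                                         max_depths=[3, 4],
                                         random_state=0)
# rules holds one sub-list of pandas-query-style rule strings per fitted tree;
# samples and features record which rows/columns each tree saw, for later OOB scoring.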

          Functions

          +
          +
          +def extract_fpgrowth(X, y, feature_labels=None, minsupport=0.1, maxcardinality=2, undiscretized_features=[], verbose=False) +
          +
          +
          +
          + +Expand source code + +
          def extract_fpgrowth(X, y,
          +                     feature_labels=None,
          +                     minsupport=0.1,
          +                     maxcardinality=2,
          +                     undiscretized_features=[],
          +                     verbose=False) -> Tuple[List[Tuple], BRLDiscretizer]:
          +
          +    # deal with pandas data
          +    if type(X) in [pd.DataFrame, pd.Series]:
          +        if feature_labels is None:
          +            feature_labels = X.columns
          +        X = X.values
          +    if type(y) in [pd.DataFrame, pd.Series]:
          +        y = y.values
          +
          +    if feature_labels is None:
          +        feature_labels = [f'feature_{i}' for i in range(X.shape[1])]
          +    
          +    discretizer = BRLDiscretizer(X, y, feature_labels=feature_labels, verbose=verbose)
          +    X = discretizer.discretize_mixed_data(X, y, undiscretized_features)
          +    X_df_onehot = discretizer.onehot_df
          +    
          +    # Now find frequent itemsets
          +    itemsets_df = fpgrowth(X_df_onehot, min_support=minsupport, max_len=maxcardinality)
          +    itemsets_indices = [tuple(s[1]) for s in itemsets_df.values]
          +    itemsets = [np.array(X_df_onehot.columns)[list(inds)] for inds in itemsets_indices]
          +    itemsets = list(map(tuple, itemsets))
          +    if verbose:
          +        print(len(itemsets), 'rules mined')
          +
          +    return itemsets, discretizer
          +
          +
          +
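A hedged usage sketch; X, y, and the feature labels are placeholders:

itemsets, discretizer = extract_fpgrowth(X, y,
                                         feature_labels=['age', 'bmi'],
                                         minsupport=0.1,
                                         maxcardinality=2,
                                         verbose=True)
rules = itemsets_to_rules(itemsets)  # imodels.util.convert: itemsets -> query-style rule strings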
          +def extract_rulefit(X, y, feature_names, tree_size=4, max_rules=2000, memory_par=0.01, tree_generator=None, exp_rand_tree_size=True, random_state=None) +
          +
          +
          +
          + +Expand source code + +
          def extract_rulefit(X, y, feature_names,
          +                    tree_size=4,
          +                    max_rules=2000,
          +                    memory_par=0.01,
          +                    tree_generator=None,
          +                    exp_rand_tree_size=True,
          +                    random_state=None) -> List[str]:
          +
          +    if tree_generator is None:
          +        n_estimators_default = int(np.ceil(max_rules / tree_size))
          +        sample_fract_ = min(0.5, (100 + 6 * np.sqrt(X.shape[0])) / X.shape[0])
          +
          +        tree_generator = GradientBoostingRegressor(n_estimators=n_estimators_default,
          +                                                    max_leaf_nodes=tree_size,
          +                                                    learning_rate=memory_par,
          +                                                    subsample=sample_fract_,
          +                                                    random_state=random_state,
          +                                                    max_depth=100)
          +
          +    if type(tree_generator) not in [GradientBoostingRegressor, RandomForestRegressor]:
          +        raise ValueError("RuleFit only works with RandomForest and BoostingRegressor")
          +
          +    ## fit tree generator
          +    if not exp_rand_tree_size:  # simply fit with constant tree size
          +        tree_generator.fit(X, y)
          +    else:  # randomise tree size as per Friedman 2005 Sec 3.3
          +        np.random.seed(random_state)
          +        tree_sizes = np.random.exponential(scale=tree_size - 2,
          +                                            size=int(np.ceil(max_rules * 2 / tree_size)))
          +        tree_sizes = np.asarray([2 + np.floor(tree_sizes[i_]) for i_ in np.arange(len(tree_sizes))], dtype=int)
          +        i = int(len(tree_sizes) / 4)
          +        while np.sum(tree_sizes[0:i]) < max_rules:
          +            i = i + 1
          +        tree_sizes = tree_sizes[0:i]
          +        tree_generator.set_params(warm_start=True)
          +        curr_est_ = 0
          +        for i_size in np.arange(len(tree_sizes)):
          +            size = tree_sizes[i_size]
          +            tree_generator.set_params(n_estimators=curr_est_ + 1)
          +            tree_generator.set_params(max_leaf_nodes=size)
          +            random_state_add = random_state if random_state else 0
          +            tree_generator.set_params(
+                random_state=i_size + random_state_add)  # warm_start=True seems to reset random_state, such that the trees are highly correlated, unless we manually change the random_state here.
          +            tree_generator.fit(np.copy(X, order='C'), np.copy(y, order='C'))
          +            curr_est_ = curr_est_ + 1
          +        tree_generator.set_params(warm_start=False)
          +
          +    if isinstance(tree_generator, RandomForestRegressor):
          +        estimators_ = [[x] for x in tree_generator.estimators_]
          +    else:
          +        estimators_ = tree_generator.estimators_
          +
          +    seen_antecedents = set()
          +    extracted_rules = [] 
          +    for estimator in estimators_:
          +        for rule_value_pair in tree_to_rules(estimator[0], np.array(feature_names), prediction_values=True):
          +            if rule_value_pair[0] not in seen_antecedents:
          +                extracted_rules.append(rule_value_pair)
          +                seen_antecedents.add(rule_value_pair[0])
          +    
          +    extracted_rules = sorted(extracted_rules, key=lambda x: x[1])
          +    extracted_rules = list(map(lambda x: x[0], extracted_rules))
          +    return extracted_rules
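As a rough usage sketch (toy data; the column names below are made up, and in practice this helper is called internally by the RuleFit estimators rather than directly):

import numpy as np
from imodels.util.extract import extract_rulefit

X = np.random.rand(200, 3)
y = (X[:, 0] > 0.5).astype(int)
feature_names = ['age', 'height', 'weight']

# candidate rules extracted from a small gradient-boosted ensemble,
# returned as strings sorted by the value predicted in each leaf
rules = extract_rulefit(X, y, feature_names,
                        tree_size=4, max_rules=50, random_state=0)
print(rules[:3])  # e.g. ['age <= 0.49', 'age > 0.49 and weight <= 0.21', ...]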
          +
          +
          +
          +def extract_skope(X, y, feature_names, sample_weight=None, n_estimators=10, max_samples=0.8, max_samples_features=1.0, bootstrap=False, bootstrap_features=False, max_depths=[3], max_depth_duplication=None, max_features=1.0, min_samples_split=2, n_jobs=1, random_state=None, verbose=0) +
          +
          +
          +
          + +Expand source code + +
          def extract_skope(X, y, feature_names, 
          +                  sample_weight=None,
          +                  n_estimators=10,
          +                  max_samples=.8,
          +                  max_samples_features=1.,
          +                  bootstrap=False,
          +                  bootstrap_features=False,
          +                  max_depths=[3], 
          +                  max_depth_duplication=None,
          +                  max_features=1.,
          +                  min_samples_split=2,
          +                  n_jobs=1,
          +                  random_state=None,
          +                  verbose=0) -> Tuple[List[str], List[np.array], List[np.array]]:
          +    
          +    ensembles = []
          +    if not isinstance(max_depths, Iterable):
          +        max_depths = [max_depths]
          +
          +    for max_depth in max_depths:
          +        bagging_clf = BaggingRegressor(
          +            base_estimator= DecisionTreeRegressor(
          +                max_depth=max_depth,
          +                max_features=max_features,
          +                min_samples_split=min_samples_split
          +            ),
          +            n_estimators=n_estimators,
          +            max_samples=max_samples,
          +            max_features=max_samples_features,
          +            bootstrap=bootstrap,
          +            bootstrap_features=bootstrap_features,
          +            # oob_score=... XXX may be added
          +            # if selection on tree perf needed.
          +            # warm_start=... XXX may be added to increase computation perf.
          +            n_jobs=n_jobs,
          +            random_state=random_state,
          +            verbose=verbose
          +        )
          +        ensembles.append(bagging_clf)
          +
          +    y_reg = y
          +    if sample_weight is not None:
          +        sample_weight = check_array(sample_weight, ensure_2d=False)
          +        weights = sample_weight - sample_weight.min()
          +        contamination = float(sum(y)) / len(y)
          +        y_reg = (
          +                pow(weights, 0.5) * 0.5 / contamination * (y > 0) -
          +                pow((weights).mean(), 0.5) * (y == 0)
          +        )
          +        y_reg = 1. / (1 + np.exp(-y_reg))  # sigmoid
          +
          +    for e in ensembles[:len(ensembles) // 2]:
          +        e.fit(X, y)
          +
          +    for e in ensembles[len(ensembles) // 2:]:
          +        e.fit(X, y_reg)
          +
          +    estimators_, estimators_samples_, estimators_features_ = [], [], []
          +    for ensemble in ensembles:
          +        estimators_ += ensemble.estimators_
          +        estimators_samples_ += ensemble.estimators_samples_
          +        estimators_features_ += ensemble.estimators_features_
          +
          +    extracted_rules = []
          +    for estimator, features in zip(estimators_, estimators_features_):
          +        extracted_rules.append(tree_to_rules(estimator, np.array(feature_names)[features]))
          +    
          +    return extracted_rules, estimators_samples_, estimators_features_
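A similarly hedged sketch for extract_skope (toy data again): one bagged ensemble of depth-limited regression trees is fit per entry of max_depths, and the rules come back grouped per fitted tree, alongside the samples and features each tree was trained on.

import numpy as np
from imodels.util.extract import extract_skope

X = np.random.rand(300, 3)
y = (X[:, 0] + X[:, 2] > 1).astype(int)
feature_names = ['age', 'height', 'weight']

rule_lists, est_samples, est_features = extract_skope(
    X, y, feature_names, n_estimators=5, max_depths=[2, 3], random_state=0)

# flatten the per-tree rule lists into one candidate pool for later scoring and pruning
candidate_rules = [r for rules in rule_lists for r in rules]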
          +
          +
          +
          +
          +
          +
          +
          + +
          + + + + + \ No newline at end of file diff --git a/docs/util/index.html b/docs/util/index.html index 27deb1fe..5442d84a 100644 --- a/docs/util/index.html +++ b/docs/util/index.html @@ -45,6 +45,10 @@

          Sub-modules

          +
          imodels.util.extract
          +
          +
          +
          imodels.util.metrics
          @@ -95,6 +99,7 @@

          Index

        • imodels.util.convert
        • imodels.util.discretization
        • imodels.util.evaluate
        • +
        • imodels.util.extract
        • imodels.util.metrics
        • imodels.util.neural_nets
        • imodels.util.prune
        • diff --git a/docs/util/prune.html b/docs/util/prune.html index 5b6d486b..eeed3328 100644 --- a/docs/util/prune.html +++ b/docs/util/prune.html @@ -27,8 +27,10 @@

          Module imodels.util.prune

          from typing import List
           from collections import Counter
           
          +from imodels.util.rule import Rule
           
          -def prune_mins(rules: List[str], precision_min: float, recall_min: float):
          +
          +def prune_mins(rules: List[Rule], precision_min: float, recall_min: float) -> List[Rule]:
               # Factorize rules before semantic tree filtering
               rules_ = [tuple(rule) for rule in rules]
               rules_dict = {}
          @@ -48,23 +50,22 @@ 

          Module imodels.util.prune

          else: rules_dict[rule] = (score[0], score[1], 1) - rules_dict = sorted(rules_dict.items(), key=lambda x: (x[1][0], x[1][1]), reverse=True) - - return rules_dict + rule_tuple_list = sorted(rules_dict.items(), key=lambda x: (x[1][0], x[1][1]), reverse=True) + return [Rule(rule, args=scores) for rule, scores in rule_tuple_list] -def deduplicate(rules, max_depth_dup): +def deduplicate(rules: List[Rule], max_depth_dup: int) -> List[Rule]: if max_depth_dup is not None: rules = [max(rules_set, key=f1_score) for rules_set in find_similar_rulesets(rules, max_depth_dup)] return sorted(rules, key=lambda x: - f1_score(x)) -def f1_score(x) -> float: - return 2 * x[1][0] * x[1][1] / \ - (x[1][0] + x[1][1]) if (x[1][0] + x[1][1]) > 0 else 0 +def f1_score(rule: Rule) -> float: + return 2 * rule.args[0] * rule.args[1] / \ + (rule.args[0] + rule.args[1]) if (rule.args[0] + rule.args[1]) > 0 else 0 -def find_similar_rulesets(rules, max_depth_duplication=None): +def find_similar_rulesets(rules: List[Rule], max_depth_duplication: int = None) -> List[List[Rule]]: """Create clusters of rules using a decision tree based on the terms of the rules @@ -102,9 +103,9 @@

          Module imodels.util.prune

          # Proceed to split rules_splitted = [[], [], []] for rule in rules: - if (most_represented_term + ' <=') in rule[0]: + if (most_represented_term + ' <=') in rule.rule: rules_splitted[0].append(rule) - elif (most_represented_term + ' >') in rule[0]: + elif (most_represented_term + ' >') in rule.rule: rules_splitted[1].append(rule) else: rules_splitted[2].append(rule) @@ -147,14 +148,14 @@

          Functions

          Expand source code -
          def deduplicate(rules, max_depth_dup):
          +
          def deduplicate(rules: List[Rule], max_depth_dup: int) -> List[Rule]:
               if max_depth_dup is not None:
                   rules = [max(rules_set, key=f1_score) for rules_set in find_similar_rulesets(rules, max_depth_dup)]
               return sorted(rules, key=lambda x: - f1_score(x))
          -def f1_score(x) +def f1_score(rule)
          @@ -162,9 +163,9 @@

          Functions

          Expand source code -
          def f1_score(x) -> float:
          -    return 2 * x[1][0] * x[1][1] / \
          -           (x[1][0] + x[1][1]) if (x[1][0] + x[1][1]) > 0 else 0
          +
          def f1_score(rule: Rule) -> float:
          +    return 2 * rule.args[0] * rule.args[1] / \
          +           (rule.args[0] + rule.args[1]) if (rule.args[0] + rule.args[1]) > 0 else 0
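For intuition, f1_score is just the harmonic mean of the precision and recall stored in a rule's args; a quick worked example with made-up scores:

precision, recall = 0.8, 0.5
f1 = 2 * precision * recall / (precision + recall)  # 0.8 / 1.3 ≈ 0.615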
          @@ -187,7 +188,7 @@

          Returns

          Expand source code -
          def find_similar_rulesets(rules, max_depth_duplication=None):
          +
          def find_similar_rulesets(rules: List[Rule], max_depth_duplication: int = None) -> List[List[Rule]]:
               """Create clusters of rules using a decision tree based
               on the terms of the rules
           
          @@ -225,9 +226,9 @@ 

          Returns

          # Proceed to split rules_splitted = [[], [], []] for rule in rules: - if (most_represented_term + ' <=') in rule[0]: + if (most_represented_term + ' <=') in rule.rule: rules_splitted[0].append(rule) - elif (most_represented_term + ' >') in rule[0]: + elif (most_represented_term + ' >') in rule.rule: rules_splitted[1].append(rule) else: rules_splitted[2].append(rule) @@ -263,7 +264,7 @@

          Returns

          Expand source code -
          def prune_mins(rules: List[str], precision_min: float, recall_min: float):
          +
          def prune_mins(rules: List[Rule], precision_min: float, recall_min: float) -> List[Rule]:
               # Factorize rules before semantic tree filtering
               rules_ = [tuple(rule) for rule in rules]
               rules_dict = {}
          @@ -283,9 +284,8 @@ 

          Returns

          else: rules_dict[rule] = (score[0], score[1], 1) - rules_dict = sorted(rules_dict.items(), key=lambda x: (x[1][0], x[1][1]), reverse=True) - - return rules_dict
          + rule_tuple_list = sorted(rules_dict.items(), key=lambda x: (x[1][0], x[1][1]), reverse=True) + return [Rule(rule, args=scores) for rule, scores in rule_tuple_list]
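A rough sketch of how these helpers compose, assuming (as in prune_mins above) that each Rule carries its (precision, recall) pair in args; the rule strings and scores below are made up:

from imodels.util.rule import Rule
from imodels.util.prune import deduplicate, f1_score

rules = [Rule('age <= 50.0', args=(0.9, 0.4)),
         Rule('age <= 50.0 and weight > 70.0', args=(0.7, 0.6)),
         Rule('height > 1.8', args=(0.6, 0.5))]

# within each cluster of overlapping rules, keep the rule with the best F1
pruned = deduplicate(rules, max_depth_dup=2)
print([f1_score(r) for r in pruned])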
    • diff --git a/docs/util/rule.html b/docs/util/rule.html index a540759a..3aa7639c 100644 --- a/docs/util/rule.html +++ b/docs/util/rule.html @@ -25,31 +25,9 @@

      Module imodels.util.rule

      Expand source code
      import re
      -from typing import List, Tuple, Dict
      -
      -
      -def replace_feature_name(rule, replace_dict):
      -    def replace(match):
      -        return replace_dict[match.group(0)]
      -
      -    rule = re.sub('|'.join(r'\b%s\b' % re.escape(s) for s in replace_dict),
      -                  replace, rule)
      -    return rule
      -
      -def enum_features(X, feature_names: List[str]) -> Tuple[List[str], Dict[str, str]]:
      -    """ Removes problematic characters in features; if none provided, 
      -    returns placeholder feature names
      -    """
      -
      -    enum_feature_names = [f'feature_{i}' for i in range(X.shape[1])]
      -    if feature_names is None:
      -        feature_names = enum_feature_names
      -    else:
      -        feature_clean_fn = lambda f: f.replace(' ', '_').replace('/', '_').replace('<', '_under_')
      -        feature_names = list(map(feature_clean_fn, feature_names))
      -    feature_dict = {k: v for k, v in zip(enum_feature_names, feature_names)}
      -
      -    return feature_names, feature_dict
      +from collections import OrderedDict
      +import copy
      +from typing import Dict, Iterable
       
       
       class Rule:
      @@ -83,7 +61,7 @@ 

      Module imodels.util.rule

      # FIXME : Easier method ? return hash(tuple(sorted(((i, j) for i, j in self.agg_dict.items())))) - def factorize(self): + def factorize(self) -> None: for feature, symbol, value in self.terms: if (feature, symbol) not in self.agg_dict: if symbol != '==': @@ -110,7 +88,29 @@

      Module imodels.util.rule

      return ' and '.join([' '.join( [feature, symbol, str(self.agg_dict[(feature, symbol)])]) for feature, symbol in sorted(self.agg_dict.keys()) - ])
      + ]) + + +def replace_feature_name(rule: Rule, replace_dict: Dict[str, str]) -> Rule: + def replace(match): + return replace_dict[match.group(0)] + + rule_replaced = Rule( + re.sub('|'.join(r'\b%s\b' % re.escape(s) for s in replace_dict), replace, rule.rule), + args=rule.args + ) + return rule_replaced + + +def get_feature_dict(num_features: int, feature_names: Iterable[str] = None) -> Dict[str, str]: + feature_dict = OrderedDict() + if feature_names is not None: + for i in range(num_features): + feature_dict[f'feature_{i}'] = feature_names[i] + else: + for i in range(num_features): + feature_dict[f'feature_{i}'] = f'feature_{i}' + return feature_dict
      @@ -120,30 +120,24 @@

      Module imodels.util.rule

      Functions

      -
      -def enum_features(X, feature_names) +
      +def get_feature_dict(num_features, feature_names=None)
      -

      Removes problematic characters in features; if none provided, -returns placeholder feature names

      +
      Expand source code -
      def enum_features(X, feature_names: List[str]) -> Tuple[List[str], Dict[str, str]]:
      -    """ Removes problematic characters in features; if none provided, 
      -    returns placeholder feature names
      -    """
      -
      -    enum_feature_names = [f'feature_{i}' for i in range(X.shape[1])]
      -    if feature_names is None:
      -        feature_names = enum_feature_names
      +
      def get_feature_dict(num_features: int, feature_names: Iterable[str] = None) -> Dict[str, str]:
      +    feature_dict = OrderedDict()
      +    if feature_names is not None:
      +        for i in range(num_features):
      +            feature_dict[f'feature_{i}'] = feature_names[i]
           else:
      -        feature_clean_fn = lambda f: f.replace(' ', '_').replace('/', '_').replace('<', '_under_')
      -        feature_names = list(map(feature_clean_fn, feature_names))
      -    feature_dict = {k: v for k, v in zip(enum_feature_names, feature_names)}
      -
      -    return feature_names, feature_dict
      + for i in range(num_features): + feature_dict[f'feature_{i}'] = f'feature_{i}' + return feature_dict
      @@ -155,13 +149,15 @@

      Functions

      Expand source code -
      def replace_feature_name(rule, replace_dict):
      +
      def replace_feature_name(rule: Rule, replace_dict: Dict[str, str]) -> Rule:
           def replace(match):
               return replace_dict[match.group(0)]
       
      -    rule = re.sub('|'.join(r'\b%s\b' % re.escape(s) for s in replace_dict),
      -                  replace, rule)
      -    return rule
      + rule_replaced = Rule( + re.sub('|'.join(r'\b%s\b' % re.escape(s) for s in replace_dict), replace, rule.rule), + args=rule.args + ) + return rule_replaced
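A small illustration of the two helpers above (the column names are invented): get_feature_dict maps the placeholder names used during rule extraction back to the original column names, and replace_feature_name rewrites a Rule's string accordingly.

from imodels.util.rule import Rule, get_feature_dict, replace_feature_name

feature_dict = get_feature_dict(num_features=3,
                                feature_names=['age', 'height', 'weight'])
# OrderedDict mapping 'feature_0' -> 'age', 'feature_1' -> 'height', 'feature_2' -> 'weight'

r = Rule('feature_0 <= 50.0 and feature_2 > 70.0', args=(0.8, 0.5))
readable = replace_feature_name(r, feature_dict)
print(readable.rule)  # roughly: 'age <= 50.0 and weight > 70.0'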
      @@ -219,7 +215,7 @@

      Parameters

      # FIXME : Easier method ? return hash(tuple(sorted(((i, j) for i, j in self.agg_dict.items())))) - def factorize(self): + def factorize(self) -> None: for feature, symbol, value in self.terms: if (feature, symbol) not in self.agg_dict: if symbol != '==': @@ -259,7 +255,7 @@

      Methods

      Expand source code -
      def factorize(self):
      +
      def factorize(self) -> None:
           for feature, symbol, value in self.terms:
               if (feature, symbol) not in self.agg_dict:
                   if symbol != '==':
      @@ -297,7 +293,7 @@ 

      Index

    • Functions

    • diff --git a/docs/util/score.html b/docs/util/score.html index 346560a4..65a550f5 100644 --- a/docs/util/score.html +++ b/docs/util/score.html @@ -79,7 +79,7 @@

      Module imodels.util.score

      return scored_rules -def _eval_rule_perf(rule, X, y) -> Tuple[float, float]: +def _eval_rule_perf(rule: str, X, y) -> Tuple[float, float]: detected_index = list(X.query(rule).index) if len(detected_index) <= 1: return (0, 0) @@ -93,7 +93,7 @@
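Since _eval_rule_perf evaluates a rule by selecting the rows of a pandas DataFrame that satisfy it, a rule string is anything DataFrame.query understands; a tiny illustration with made-up data:

import pandas as pd

X_df = pd.DataFrame({'age': [30, 55, 62], 'weight': [60, 85, 90]})
detected = X_df.query('age <= 50.0 and weight > 55.0')
print(list(detected.index))  # [0]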

      Module imodels.util.score

      def score_lasso(X, y, rules: List[str], alphas=None, cv=3, prediction_task='regression', - max_rules=2000, random_state=None) -> Tuple[List[Rule], Lasso]: + max_rules=2000, random_state=None) -> Tuple[List[Rule], List[float], float]: if alphas is None: if prediction_task == 'regression': alphas = _alpha_grid(X, y) @@ -167,7 +167,7 @@

      Functions

      def score_lasso(X, y, rules: List[str], alphas=None, cv=3,
                       prediction_task='regression',
      -                max_rules=2000, random_state=None) -> Tuple[List[Rule], Lasso]:
      +                max_rules=2000, random_state=None) -> Tuple[List[Rule], List[float], float]:
           if alphas is None:
               if prediction_task == 'regression':
                   alphas = _alpha_grid(X, y)
      diff --git a/readme.md b/readme.md
      index 2b3e37bf..6bf39f0d 100644
      --- a/readme.md
      +++ b/readme.md
      @@ -8,17 +8,17 @@
         imodels overviewdemo notebooks
       

      -

      - +

      + ## imodels overview Implementations of different popular interpretable models can be easily used and installed: @@ -68,8 +68,20 @@ The final form of the above models takes one of the following forms, which aim t Different models and algorithms vary not only in their final form but also in different choices made during modeling. In particular, many models differ in the 3 steps given by the table below. -- ex. RuleFit and SkopeRules differ only in the way they prune rules: RuleFit uses a linear model whereas SkopeRules heuristically deduplicates rules sharing overlap. -- ex. Bayesian rule lists and greedy rule lists differ in how they select rules; bayesian rule lists perform a global optimization over possible rule lists while Greedy rule lists pick splits sequentially to maximize a given criterion. +
      +ex. RuleFit and SkopeRules +RuleFit and SkopeRules differ only in the way they prune rules: RuleFit uses a linear model whereas SkopeRules heuristically deduplicates rules sharing overlap. +
      + +
+ex. Bayesian rule lists and greedy rule lists +Bayesian rule lists and greedy rule lists differ in how they select rules; Bayesian rule lists perform a global optimization over possible rule lists while greedy rule lists pick splits sequentially to maximize a given criterion. +
      + +
      +ex. FPSkope and SkopeRules +FPSkope and SkopeRules differ only in the way they generate candidate rules: FPSkope uses FPgrowth whereas SkopeRules extracts rules from decision trees. +
See the docs for individual models for further descriptions.
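Because the estimators share the standard sklearn fit/predict interface, comparing two models that differ in a single step is usually a one-line change. A rough sketch (assuming both classifiers are importable from the top-level imodels package; the dataset is only for illustration):

from sklearn.datasets import load_breast_cancer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from imodels import RuleFitClassifier, SkopeRulesClassifier

X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# same data, two rule sets that differ mainly in how rules are pruned
for Model in [RuleFitClassifier, SkopeRulesClassifier]:
    m = Model()
    m.fit(X_train, y_train)
    print(type(m).__name__, accuracy_score(y_test, m.predict(X_test)))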