diff --git a/.gitignore b/.gitignore
index 2281b8db..aa2a974b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -24,3 +24,4 @@
 env
 /*.egg-info
 build
+.gitmodules
diff --git a/.gitmodules b/.gitmodules
deleted file mode 100644
index 73d5e1b9..00000000
--- a/.gitmodules
+++ /dev/null
@@ -1,3 +0,0 @@
-[submodule "models/rulefit"]
-	path = models/rulefit
-	url = https://github.com/christophM/rulefit.git
diff --git a/imodels/bayesian_rule_lists/BigDataRuleListClassifier.py b/imodels/bayesian_rule_lists/BigDataRuleListClassifier.py
deleted file mode 100644
index 3ea7d270..00000000
--- a/imodels/bayesian_rule_lists/BigDataRuleListClassifier.py
+++ /dev/null
@@ -1,118 +0,0 @@
-import numpy as np
-import pandas as pd
-import numbers
-from sklearn.ensemble import RandomForestClassifier
-from RuleListClassifier import RuleListClassifier
-
-class BigDataRuleListClassifier(RuleListClassifier):
-    """
-    A scikit-learn compatible wrapper for the Bayesian Rule List
-    classifier by Benjamin Letham, adapted to work on large datasets. It
-    trains a linear SVM first, takes the subset of the training data closest
-    to the decision boundary (specified by the parameter training_subset),
-    which is most critical to learning a classifier, and then uses this subset
-    to learn a rule list.
-
-    It produces a highly interpretable model (a list of decision rules) of
-    the same form as an expert system.
-
-    Parameters
-    ----------
-    training_subset : float, optional (default=0.1)
-        Determines the fraction of the data to use for training the Bayesian
-        Rule List classifier (the data points closest to a linear decision
-        boundary are selected).
-
-    subset_estimator: BaseEstimator, optional (default=RandomForestClassifier)
-        An Estimator which is able to produce probabilities, used for finding
-        the subset of the data which is closest to the decision boundary
-
-    listlengthprior : int, optional (default=3)
-        Prior hyperparameter for expected list length (excluding null rule)
-
-    listwidthprior : int, optional (default=1)
-        Prior hyperparameter for expected list length (excluding null rule)
-
-    maxcardinality : int, optional (default=1)
-        Maximum cardinality of an itemset
-
-    minsupport : int, optional (default=10)
-        Minimum support (%) of an itemset
-
-    alpha : array_like, shape = [n_classes]
-        prior hyperparameter for multinomial pseudocounts
-
-    n_chains : int, optional (default=3)
-        Number of MCMC chains for inference
-
-    max_iter : int, optional (default=50000)
-        Maximum number of iterations
-
-    class1label: str, optional (default="class 1")
-        Label or description of what the positive class (with y=1) means
-
-    verbose: bool, optional (default=True)
-        Verbose output
-    """
-
-    def __init__(self, training_subset=0.1, subset_estimator=RandomForestClassifier(), listlengthprior=3, listwidthprior=1, maxcardinality=2, minsupport=10, alpha = np.array([1.,1.]), n_chains=3, max_iter=50000, class1label="class 1", verbose=True):
-        self.training_subset = training_subset
-        self.subset_estimator = subset_estimator
-
-        self.listlengthprior = listlengthprior
-        self.listwidthprior = listwidthprior
-        self.maxcardinality = maxcardinality
-        self.minsupport = minsupport
-        self.alpha = alpha
-        self.n_chains = n_chains
-        self.max_iter = max_iter
-        self.class1label = class1label
-        self.verbose = verbose
-        self._zmin = 1
-
-        self.thinning = 1 #The thinning rate
-        self.burnin = self.max_iter//2 #the number of samples to drop as burn-in in-simulation
-
-        self.discretizer = None
-        self.d_star = None
-
-    def _setdata(self, X, y, feature_labels=[], undiscretized_features = []):
-        self._setlabels(X, feature_labels)
-
-        for fi in range(len(X[0])):
-            if not isinstance(X[0][fi], numbers.Number):
-                raise Exception("Sorry, only numeric data is supported by BigDataRuleListClassifier at this time")
-
-        Xn = np.array(X)
-        # train subset estimator if necessary
-        try:
-            self.subset_estimator.predict_proba(Xn[0])
-        except:
-            self.subset_estimator.fit(X, y)
-        # calculate distances from decision boundary for each point
-        dist = np.abs(0.5-self.subset_estimator.predict_proba(Xn)[:, 1])
-        ones_idx = np.where(y==1)[0]
-        zeros_idx = np.where(y==0)[0]
-        dist_ones = dist[ones_idx]
-        dist_zeros = dist[zeros_idx]
-
-        # take closest training_subset portion of data, preserving class imbalance
-        if self.verbose:
-            print("Reduced from", len(X))
-        n = int(len(y)*self.training_subset)
-        bestidx_ones = np.argsort(-dist_ones)
-        bestidx_zeros = np.argsort(-dist_zeros)
-        one_fraction = len(np.where(y==1)[0])/float(len(y))
-        keep_idx = ones_idx[bestidx_ones[:(int(n*one_fraction)+1)]]
-        keep_idx = np.hstack((keep_idx, zeros_idx[bestidx_zeros[:(int(n*(1-one_fraction))+1)]]))
-
-        if type(X) == pd.DataFrame:
-            X = X.iloc[keep_idx, :]
-        else:
-            X = np.array(X)[keep_idx, :]
-        y = np.array(y)[keep_idx].astype(int)
-        if self.verbose:
-            print("...to", len(X), " data points")
-
-        X = self._discretize_mixed_data(X, y, undiscretized_features)
-        return X, y
\ No newline at end of file
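Editor's note on the deletion above: the docstring promised the subset "closest to the decision boundary" selected by "a linear SVM", but the code used any probabilistic subset_estimator and ranked points with np.argsort(-dist) on dist = |0.5 - predict_proba|, which actually keeps the points *farthest* from the boundary. A minimal sketch of the selection heuristic as the docstring describes it (names here are illustrative, not library API):

```python
import numpy as np
from sklearn.ensemble import RandomForestClassifier

def boundary_subset_idx(X, y, fraction=0.1, estimator=None):
    """Indices of the `fraction` of samples nearest the decision boundary,
    drawn per class so the original class ratio is preserved."""
    y = np.asarray(y)
    est = estimator if estimator is not None else RandomForestClassifier().fit(X, y)
    dist = np.abs(0.5 - est.predict_proba(X)[:, 1])  # small = near the boundary
    n = int(len(y) * fraction)
    one_fraction = np.mean(y == 1)
    keep = []
    for cls, share in ((1, one_fraction), (0, 1.0 - one_fraction)):
        idx = np.where(y == cls)[0]
        nearest_first = idx[np.argsort(dist[idx])]  # ascending distance
        keep.append(nearest_first[:int(n * share) + 1])
    return np.concatenate(keep)
```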
diff --git a/imodels/bayesian_rule_lists/RuleListClassifier.py b/imodels/bayesian_rule_lists/RuleListClassifier.py
index 36487a27..f5678798 100644
--- a/imodels/bayesian_rule_lists/RuleListClassifier.py
+++ b/imodels/bayesian_rule_lists/RuleListClassifier.py
@@ -3,8 +3,8 @@
 import sys
 import numpy as np
 import pandas as pd
-from brl import *
-from discretization.MDLP import *
+from .brl import *
+from .discretization.MDLP import *
 import numbers
 
 class RuleListClassifier(BaseEstimator):
diff --git a/imodels/bayesian_rule_lists/SVMBigDataRuleListClassifier.py b/imodels/bayesian_rule_lists/SVMBigDataRuleListClassifier.py
deleted file mode 100644
index 4126b699..00000000
--- a/imodels/bayesian_rule_lists/SVMBigDataRuleListClassifier.py
+++ /dev/null
@@ -1,116 +0,0 @@
-import numpy as np
-import pandas as pd
-import numbers
-from sklearn.svm import LinearSVC
-from RuleListClassifier import RuleListClassifier
-
-class SVMBigDataRuleListClassifier(RuleListClassifier):
-    """
-    A scikit-learn compatible wrapper for the Bayesian Rule List
-    classifier by Benjamin Letham, adapted to work on large datasets. It
-    trains a linear SVM first, takes the subset of the training data closest
-    to the decision boundary (specified by the parameter training_subset),
-    which is most critical to learning a classifier, and then uses this subset
-    to learn a rule list.
-
-    It produces a highly interpretable model (a list of decision rules) of
-    the same form as an expert system.
-
-    Parameters
-    ----------
-    training_subset : float, optional (default=0.1)
-        Determines the fraction of the data to use for training the Bayesian
-        Rule List classifier (the data points closest to a linear decision
-        boundary are selected).
-
-    subsetSVM_C : float, optional (default=1)
-        Regularization parameter for the SVM which is used to determine which
-        fraction of the data is most important (i.e. closest to the decision
-        boundary) to use for training the Bayesian Rule List classifier
-
-    listlengthprior : int, optional (default=3)
-        Prior hyperparameter for expected list length (excluding null rule)
-
-    listwidthprior : int, optional (default=1)
-        Prior hyperparameter for expected list length (excluding null rule)
-
-    maxcardinality : int, optional (default=1)
-        Maximum cardinality of an itemset
-
-    minsupport : int, optional (default=10)
-        Minimum support (%) of an itemset
-
-    alpha : array_like, shape = [n_classes]
-        prior hyperparameter for multinomial pseudocounts
-
-    n_chains : int, optional (default=3)
-        Number of MCMC chains for inference
-
-    max_iter : int, optional (default=50000)
-        Maximum number of iterations
-
-    class1label: str, optional (default="class 1")
-        Label or description of what the positive class (with y=1) means
-
-    verbose: bool, optional (default=True)
-        Verbose output
-    """
-
-    def __init__(self, training_subset=0.1, subsetSVM_C=1, listlengthprior=3, listwidthprior=1, maxcardinality=2, minsupport=10, alpha = np.array([1.,1.]), n_chains=3, max_iter=50000, class1label="class 1", verbose=True):
-        self.training_subset = training_subset
-        self.subsetSVM_C = subsetSVM_C
-
-        self.listlengthprior = listlengthprior
-        self.listwidthprior = listwidthprior
-        self.maxcardinality = maxcardinality
-        self.minsupport = minsupport
-        self.alpha = alpha
-        self.n_chains = n_chains
-        self.max_iter = max_iter
-        self.class1label = class1label
-        self.verbose = verbose
-        self._zmin = 1
-
-        self.thinning = 1 #The thinning rate
-        self.burnin = self.max_iter//2 #the number of samples to drop as burn-in in-simulation
-
-        self.discretizer = None
-        self.d_star = None
-
-    def _setdata(self, X, y, feature_labels=[], undiscretized_features = []):
-        self._setlabels(X, feature_labels)
-
-        for fi in range(len(X[0])):
-            if not isinstance(X[0][fi], numbers.Number):
-                raise Exception("Sorry, only numeric data is supported by BigDataRuleListClassifier at this time")
-
-        # train linear SVM
-        self.svm = LinearSVC(C=self.subsetSVM_C)
-        self.svm.fit(X, y)
-        # calculate distances from decision boundary for each point
-        Xn = np.array(X)
-        dfun_ones = self.svm.decision_function(Xn[np.where(y==1)[0], :])
-        dist_ones = dfun_ones / np.linalg.norm(self.svm.coef_)
-        dfun_zeros = self.svm.decision_function(Xn[np.where(y==0)[0], :])
-        dist_zeros = dfun_zeros / np.linalg.norm(self.svm.coef_)
-
-        # take closest training_subset portion of data, preserving class imbalance
-        if self.verbose:
-            print "Reduced from", len(X)
-        n = int(len(y)*self.training_subset)
-        bestidx_ones = np.argsort(dist_ones)
-        bestidx_zeros = np.argsort(dist_zeros)
-        one_fraction = len(np.where(y==1)[0])/float(len(y))
-        keep_idx = bestidx_ones[:(int(n*one_fraction)+1)]
-        keep_idx = np.hstack((keep_idx, bestidx_zeros[:(int(n*(1-one_fraction))+1)]))
-
-        if type(X) == pd.DataFrame:
-            X = X.iloc[keep_idx, :]
-        else:
-            X = np.array(X)[keep_idx, :]
-        y = np.array(y)[keep_idx].astype(int)
-        if self.verbose:
-            print "...to", len(X), " data points"
-
-        X = self._discretize_mixed_data(X, y, undiscretized_features)
-        return X, y
\ No newline at end of file
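Both deleted "big data" variants were Python 2-only (bare print statements) and imported RuleListClassifier without a package prefix, so neither could load under the new imodels package layout; deleting them rather than porting them is consistent with the rest of this PR. For reference, the quantity the SVM variant ranked by is just the decision function scaled by the weight norm — a self-contained sketch on toy data, not library API (note the deleted code sorted the *signed* distances per class, so it favored points deep on the negative side of the hyperplane rather than points near it):

```python
import numpy as np
from sklearn.datasets import make_classification
from sklearn.svm import LinearSVC

X, y = make_classification(n_samples=200, n_features=5, random_state=0)
svm = LinearSVC(C=1.0).fit(X, y)

# decision_function returns w.x + b; dividing by ||w|| converts the signed
# score into a signed Euclidean distance from the separating hyperplane
dist = svm.decision_function(X) / np.linalg.norm(svm.coef_)
nearest_20 = np.argsort(np.abs(dist))[:20]  # the 20 samples closest to the boundary
```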
diff --git a/imodels/bayesian_rule_lists/discretization/MDLP.py b/imodels/bayesian_rule_lists/discretization/MDLP.py
index 7ceb2491..2e39eae6 100644
--- a/imodels/bayesian_rule_lists/discretization/MDLP.py
+++ b/imodels/bayesian_rule_lists/discretization/MDLP.py
@@ -2,7 +2,7 @@
 __author__ = 'Victor Ruiz, vmr11@pitt.edu'
 import pandas as pd
 import numpy as np
-from entropy import entropy, cut_point_information_gain
+from .entropy import entropy, cut_point_information_gain
 from math import log
 import sys
 import getopt
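The two import hunks above are the same Python 3 fix: Python 3 dropped implicit relative imports, so intra-package modules must be named with an explicit leading dot. A sketch of the rule, using the discretization package from this diff:

```python
# imodels/bayesian_rule_lists/discretization/MDLP.py
# Python 2 resolved a bare module name against the current package; Python 3
# treats it as an absolute import, so the sibling must be named explicitly:
# from entropy import entropy, cut_point_information_gain    # Python 2 only
from .entropy import entropy, cut_point_information_gain     # Python 3 (and
# Python 2 with `from __future__ import absolute_import`); this also requires
# an __init__.py in the directory, which this PR adds below
```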
diff --git a/imodels/bayesian_rule_lists/discretization/__init__.py b/imodels/bayesian_rule_lists/discretization/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/imodels/bayesian_rule_lists/examples/diabetes_bigdata_demo.py b/imodels/bayesian_rule_lists/examples/diabetes_bigdata_demo.py
deleted file mode 100644
index a36d5a5a..00000000
--- a/imodels/bayesian_rule_lists/examples/diabetes_bigdata_demo.py
+++ /dev/null
@@ -1,44 +0,0 @@
-from sklearn.cross_validation import train_test_split
-from sklearn.datasets.mldata import fetch_mldata
-from BigDataRuleListClassifier import *
-from SVMBigDataRuleListClassifier import *
-from sklearn.ensemble import RandomForestClassifier
-import time
-
-feature_labels = ["#Pregnant","Glucose concentration test","Blood pressure(mmHg)","Triceps skin fold thickness(mm)","2-Hour serum insulin (mu U/ml)","Body mass index","Diabetes pedigree function","Age (years)"]
-
-data = fetch_mldata("diabetes") # get dataset
-y = -(data.target-1)/2 # target labels (0: healthy, or 1: diabetes) - the original dataset contains -1 for diabetes and +1 for healthy
-
-###############################################################################
-
-Xtrain, Xtest, ytrain, ytest = train_test_split(data.data, y) # split
-
-t0 = time.time()
-# train classifier (allow more iterations for better accuracy)
-clf = RuleListClassifier(max_iter=10000, class1label="diabetes", verbose=False)
-clf.fit(Xtrain, ytrain, feature_labels=feature_labels)
-print "RuleListClassifier Accuracy:", clf.score(Xtest, ytest), "Learned interpretable model:\n", clf
-t1 = time.time()
-
-# train classifier (allow more iterations for better accuracy)
-bclf = BigDataRuleListClassifier(training_subset=0.1, subset_estimator=RandomForestClassifier(n_estimators=100).fit(Xtrain, ytrain), max_iter=10000, class1label="diabetes", verbose=False)
-bclf.fit(Xtrain, ytrain, feature_labels=feature_labels)
-print "BigDataRuleListClassifier Accuracy:", bclf.score(Xtest, ytest), "Learned interpretable model:\n", bclf
-t2 = time.time()
-
-# train classifier (allow more iterations for better accuracy)
-sclf = SVMBigDataRuleListClassifier(training_subset=0.1, subsetSVM_C=0.01, max_iter=10000, class1label="diabetes", verbose=False)
-sclf.fit(Xtrain, ytrain, feature_labels=feature_labels)
-print "SVMBigDataRuleListClassifier Accuracy:", bclf.score(Xtest, ytest), "Learned interpretable model:\n", sclf
-t3 = time.time()
-
-print "Comparison\n========="
-print "Time taken for RuleListClassifier: ", t1-t0, "Score achieved:", clf.score(Xtest, ytest)
-print "Time taken for BigDataRuleListClassifier: ", t2-t1, "Score achieved:", bclf.score(Xtest, ytest)
-print "Time taken for SVMBigDataRuleListClassifier: ", t3-t2, "Score achieved:", sclf.score(Xtest, ytest)
-print "========"
-
-###############################################################################
-
-print "RandomForestClassifier Accuracy:", RandomForestClassifier().fit(Xtrain, ytrain).score(Xtest, ytest)
\ No newline at end of file
diff --git a/imodels/bayesian_rule_lists/examples/hepatitis_mixeddata_demo.py b/imodels/bayesian_rule_lists/examples/hepatitis_mixeddata_demo.py
deleted file mode 100644
index 32bbf714..00000000
--- a/imodels/bayesian_rule_lists/examples/hepatitis_mixeddata_demo.py
+++ /dev/null
@@ -1,93 +0,0 @@
-from sklearn.cross_validation import train_test_split
-from sklearn.datasets.mldata import fetch_mldata
-from RuleListClassifier import *
-from sklearn.ensemble import RandomForestClassifier
-import pandas as pd
-
-"""
-https://archive.ics.uci.edu/ml/datasets/Hepatitis
-1. Class: DIE, LIVE
-2. AGE: 10, 20, 30, 40, 50, 60, 70, 80
-3. SEX: male, female
-4. STEROID: no, yes
-5. ANTIVIRALS: no, yes
-6. FATIGUE: no, yes
-7. MALAISE: no, yes
-8. ANOREXIA: no, yes
-9. LIVER BIG: no, yes
-10. LIVER FIRM: no, yes
-11. SPLEEN PALPABLE: no, yes
-12. SPIDERS: no, yes
-13. ASCITES: no, yes
-14. VARICES: no, yes
-15. BILIRUBIN: 0.39, 0.80, 1.20, 2.00, 3.00, 4.00 -- see the note below
-16. ALK PHOSPHATE: 33, 80, 120, 160, 200, 250
-17. SGOT: 13, 100, 200, 300, 400, 500,
-18. ALBUMIN: 2.1, 3.0, 3.8, 4.5, 5.0, 6.0
-19. PROTIME: 10, 20, 30, 40, 50, 60, 70, 80, 90
-20. HISTOLOGY: no, yes
-"""
-data = fetch_mldata("datasets-UCI hepatitis") # get dataset
-
-#some data cleaning (due to horrible mldata format)
-# target
-y = [1 if 'live' in v[0].lower() else 0 for v in data['Class'][0]]
-# categorical variables
-data['SEX'] = data.data
-feature_labels = [col for col in data['COL_NAMES'] if col == col.upper()]
-columns = {}
-for label in feature_labels:
-    column = data[label] if len(data[label])>1 else data[label][0]
-    while type(column[0]) == list or type(column[0]) == np.ndarray:
-        column = [c[0] for c in column]
-    columns[label] = pd.Series(column)
-# numeric variables
-columns['AGE'] = data.target
-columns['BILIRUBIN'] = pd.Series(data['double1'][0])
-columns['ALK PHOSPHATE'] = pd.Series(data['int2'][0])
-columns['ALK PHOSPHATE'][columns['ALK PHOSPHATE']<0] = np.nan
-columns['SGOT'] = pd.Series(data['int2'][1])
-columns['SGOT'][columns['SGOT']<0] = np.nan
-columns['ALBUMIN'] = pd.Series(data['double3'][0])
-columns['PROTIME'] = pd.Series(data['int4'][0])
-columns['PROTIME'][columns['PROTIME']<0] = np.nan
-# convert to dataframe
-hepatitis_df = pd.DataFrame(columns)
-# deal with missing values
-for c in hepatitis_df.columns:
-    if hepatitis_df[c].dtype != np.object:
-        hepatitis_df[c] = hepatitis_df[c].fillna(hepatitis_df[c][~np.isnan(hepatitis_df[c])].mean())
-
-print hepatitis_df.head()
-
-###############################################################################
-
-Xtrain, Xtest, ytrain, ytest = train_test_split(hepatitis_df, y) # split
-
-# train classifier (allow more iterations for better accuracy)
-clf = RuleListClassifier(max_iter=10000, class1label="survival", verbose=False)
-clf.fit(Xtrain, ytrain)
-
-print "RuleListClassifier Accuracy:", clf.score(Xtest, ytest), "Learned interpretable model:\n", clf
-
-###############################################################################
-
-try:
-    from category_encoders import HashingEncoder
-except:
-    raise Exception("Please install category_encoders (pip install category_encoders) for comparing mixed data with Random Forests!")
-from sklearn import pipeline
-
-ppl = pipeline.Pipeline([
-    ('encoder', HashingEncoder(cols=['LIVER_BIG', 'ANTIVIRALS', 'HISTOLOGY', 'SEX', 'STEROID', 'MALAISE', 'FATIGUE', 'SPIDERS', 'VARICES', 'LIVER_FIRM', 'SPLEEN_PALPABLE', 'ASCITES', 'ANOREXIA'])),
-    ('clf', RandomForestClassifier())
-])
-
-# back to dataframes (for HashingEncoder)
-Xtrain = pd.DataFrame(Xtrain)
-Xtrain.columns = hepatitis_df.columns
-Xtest = pd.DataFrame(Xtest)
-Xtest.columns = hepatitis_df.columns
-
-print "RandomForestClassifier Accuracy:", ppl.fit(Xtrain, ytrain).score(Xtest, ytest)
\ No newline at end of file
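The two deleted demos relied on long-removed APIs (sklearn.cross_validation, fetch_mldata from the defunct mldata.org, Python 2 print), so they no longer ran. If an equivalent example is wanted later, a rough Python 3 sketch of the diabetes demo might look like the following — this assumes the OpenML mirror of the Pima diabetes dataset, scikit-learn >= 0.22, and the package-qualified import path introduced by this PR; it is untested:

```python
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from imodels.bayesian_rule_lists.RuleListClassifier import RuleListClassifier

data = fetch_openml("diabetes", version=1, as_frame=False)
y = (data.target == "tested_positive").astype(int)  # 1 = diabetes
X_train, X_test, y_train, y_test = train_test_split(data.data, y, random_state=0)

# allow more iterations for better accuracy, as the original demo did
clf = RuleListClassifier(max_iter=10000, class1label="diabetes", verbose=False)
clf.fit(X_train, y_train, feature_labels=list(data.feature_names))
print("RuleListClassifier accuracy:", clf.score(X_test, y_test))
```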
diff --git a/imodels/optimal_classification_tree/__init__.py b/imodels/optimal_classification_tree/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/imodels/slim/__init__.py b/imodels/slim/__init__.py
new file mode 100644
index 00000000..e69de29b
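The empty __init__.py files added here (and the discretization one earlier in this diff) mark those directories as importable packages, which is also what lets setuptools.find_packages() in setup.py discover them. A quick check from the repo root (output is a sketch of what to expect, assuming the parent packages also have __init__.py):

```python
import setuptools

# should now list the subpackages gaining an __init__.py in this diff,
# e.g. 'imodels.slim' and 'imodels.optimal_classification_tree'
print(setuptools.find_packages())
```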
diff --git a/setup.py b/setup.py
index d3425ba2..5f5806c9 100644
--- a/setup.py
+++ b/setup.py
@@ -13,6 +13,17 @@
     long_description_content_type="text/markdown",
     url="https://github.com/csinva/interpretability-implementations-demos",
     packages=setuptools.find_packages(),
+    install_requires=[
+        'fim',
+        'numpy',
+        'scipy',
+        'matplotlib',
+        'pandas',
+        'scikit-learn',
+    ],
+    dependency_links=[
+        'https://github.com/csinva/pyfim-clone/tarball/master#egg=fim-6.28'
+    ],
     classifiers=[
         "Programming Language :: Python :: 3",
         "License :: OSI Approved :: MIT License",
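One caveat on the packaging hunk: pip only honors dependency_links behind the long-deprecated --process-dependency-links flag, and pip 19 removed that flag entirely, so the fim pin above will be ignored on modern installs. If the pyfim clone must come from GitHub, a PEP 508 direct reference (supported by pip 18.1+) is the more durable spelling, though PyPI rejects uploads that use it. An illustrative fragment, not the version in this PR:

```python
import setuptools

setuptools.setup(
    name="imodels",   # illustrative; keep the real metadata from setup.py above
    version="0.0.1",  # illustrative
    install_requires=[
        # PEP 508 direct reference replaces dependency_links on modern pip:
        "fim @ https://github.com/csinva/pyfim-clone/tarball/master",
        "numpy",
        "scipy",
        "matplotlib",
        "pandas",
        "scikit-learn",
    ],
)
```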