diff --git a/.gitignore b/.gitignore
index 2281b8db..aa2a974b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -24,3 +24,4 @@
 env
 /*.egg-info
 build
+.gitmodules
diff --git a/.gitmodules b/.gitmodules
deleted file mode 100644
index 73d5e1b9..00000000
--- a/.gitmodules
+++ /dev/null
@@ -1,3 +0,0 @@
-[submodule "models/rulefit"]
-	path = models/rulefit
-	url = https://github.com/christophM/rulefit.git
diff --git a/imodels/bayesian_rule_lists/BigDataRuleListClassifier.py b/imodels/bayesian_rule_lists/BigDataRuleListClassifier.py
deleted file mode 100644
index 3ea7d270..00000000
--- a/imodels/bayesian_rule_lists/BigDataRuleListClassifier.py
+++ /dev/null
@@ -1,118 +0,0 @@
-import numpy as np
-import pandas as pd
-import numbers
-from sklearn.ensemble import RandomForestClassifier
-from RuleListClassifier import RuleListClassifier
-
-class BigDataRuleListClassifier(RuleListClassifier):
-    """
-    A scikit-learn compatible wrapper for the Bayesian Rule List
-    classifier by Benjamin Letham, adapted to work on large datasets. It
-    trains a linear SVM first, takes the subset of the training data closest
-    to the decision boundary (specified by the parameter training_subset),
-    which is most critical to learning a classifier, and then uses this subset
-    to learn a rule list.
-
-    It produces a highly interpretable model (a list of decision rules) of
-    the same form as an expert system.
-
-    Parameters
-    ----------
-    training_subset : float, optional (default=0.1)
-        Determines the fraction of the data to use for training the Bayesian
-        Rule List classifier (the data points closest to a linear decision
-        boundary are selected).
-
-    subset_estimator: BaseEstimator, optional (default=RandomForestClassifier)
-        An Estimator which is able to produce probabilities, used for finding
-        the subset of the data which is closest to the decision boundary
-
-    listlengthprior : int, optional (default=3)
-        Prior hyperparameter for expected list length (excluding null rule)
-
-    listwidthprior : int, optional (default=1)
-        Prior hyperparameter for expected list length (excluding null rule)
-
-    maxcardinality : int, optional (default=1)
-        Maximum cardinality of an itemset
-
-    minsupport : int, optional (default=10)
-        Minimum support (%) of an itemset
-
-    alpha : array_like, shape = [n_classes]
-        prior hyperparameter for multinomial pseudocounts
-
-    n_chains : int, optional (default=3)
-        Number of MCMC chains for inference
-
-    max_iter : int, optional (default=50000)
-        Maximum number of iterations
-
-    class1label: str, optional (default="class 1")
-        Label or description of what the positive class (with y=1) means
-
-    verbose: bool, optional (default=True)
-        Verbose output
-    """
-
-    def __init__(self, training_subset=0.1, subset_estimator=RandomForestClassifier(), listlengthprior=3, listwidthprior=1, maxcardinality=2, minsupport=10, alpha = np.array([1.,1.]), n_chains=3, max_iter=50000, class1label="class 1", verbose=True):
-        self.training_subset = training_subset
-        self.subset_estimator = subset_estimator
-
-        self.listlengthprior = listlengthprior
-        self.listwidthprior = listwidthprior
-        self.maxcardinality = maxcardinality
-        self.minsupport = minsupport
-        self.alpha = alpha
-        self.n_chains = n_chains
-        self.max_iter = max_iter
-        self.class1label = class1label
-        self.verbose = verbose
-        self._zmin = 1
-
-        self.thinning = 1 #The thinning rate
-        self.burnin = self.max_iter//2 #the number of samples to drop as burn-in in-simulation
-
-        self.discretizer = None
-        self.d_star = None
-
-    def _setdata(self, X, y, feature_labels=[], undiscretized_features = []):
-        self._setlabels(X, feature_labels)
-
-        for fi in range(len(X[0])):
-            if not isinstance(X[0][fi], numbers.Number):
-                raise Exception("Sorry, only numeric data is supported by BigDataRuleListClassifier at this time")
-
-        Xn = np.array(X)
-        # train subset estimator if necessary
-        try:
-            self.subset_estimator.predict_proba(Xn[0])
-        except:
-            self.subset_estimator.fit(X, y)
-        # calculate distances from decision boundary for each point
-        dist = np.abs(0.5-self.subset_estimator.predict_proba(Xn)[:, 1])
-        ones_idx = np.where(y==1)[0]
-        zeros_idx = np.where(y==0)[0]
-        dist_ones = dist[ones_idx]
-        dist_zeros = dist[zeros_idx]
-
-        # take closest training_subset portion of data, preserving class imbalance
-        if self.verbose:
-            print("Reduced from", len(X))
-        n = int(len(y)*self.training_subset)
-        bestidx_ones = np.argsort(-dist_ones)
-        bestidx_zeros = np.argsort(-dist_zeros)
-        one_fraction = len(np.where(y==1)[0])/float(len(y))
-        keep_idx = ones_idx[bestidx_ones[:(int(n*one_fraction)+1)]]
-        keep_idx = np.hstack((keep_idx, zeros_idx[bestidx_zeros[:(int(n*(1-one_fraction))+1)]]))
-
-        if type(X) == pd.DataFrame:
-            X = X.iloc[keep_idx, :]
-        else:
-            X = np.array(X)[keep_idx, :]
-        y = np.array(y)[keep_idx].astype(int)
-        if self.verbose:
-            print("...to", len(X), " data points")
-
-        X = self._discretize_mixed_data(X, y, undiscretized_features)
-        return X, y
\ No newline at end of file
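Editor's note on the deletion above: the docstring promised the subset "closest to the decision boundary" selected by "a linear SVM", but the code used any probabilistic subset_estimator and ranked points with np.argsort(-dist) on dist = |0.5 - predict_proba|, which actually keeps the points *farthest* from the boundary. A minimal sketch of the selection heuristic as the docstring describes it (names here are illustrative, not library API):

```python
import numpy as np
from sklearn.ensemble import RandomForestClassifier

def boundary_subset_idx(X, y, fraction=0.1, estimator=None):
    """Indices of the `fraction` of samples nearest the decision boundary,
    drawn per class so the original class ratio is preserved."""
    y = np.asarray(y)
    est = estimator if estimator is not None else RandomForestClassifier().fit(X, y)
    dist = np.abs(0.5 - est.predict_proba(X)[:, 1])  # small = near the boundary
    n = int(len(y) * fraction)
    one_fraction = np.mean(y == 1)
    keep = []
    for cls, share in ((1, one_fraction), (0, 1.0 - one_fraction)):
        idx = np.where(y == cls)[0]
        nearest_first = idx[np.argsort(dist[idx])]  # ascending distance
        keep.append(nearest_first[:int(n * share) + 1])
    return np.concatenate(keep)
```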
diff --git a/imodels/bayesian_rule_lists/RuleListClassifier.py b/imodels/bayesian_rule_lists/RuleListClassifier.py
index 36487a27..f5678798 100644
--- a/imodels/bayesian_rule_lists/RuleListClassifier.py
+++ b/imodels/bayesian_rule_lists/RuleListClassifier.py
@@ -3,8 +3,8 @@
 import sys
 import numpy as np
 import pandas as pd
-from brl import *
-from discretization.MDLP import *
+from .brl import *
+from .discretization.MDLP import *
 import numbers
 
 class RuleListClassifier(BaseEstimator):
diff --git a/imodels/bayesian_rule_lists/SVMBigDataRuleListClassifier.py b/imodels/bayesian_rule_lists/SVMBigDataRuleListClassifier.py
deleted file mode 100644
index 4126b699..00000000
--- a/imodels/bayesian_rule_lists/SVMBigDataRuleListClassifier.py
+++ /dev/null
@@ -1,116 +0,0 @@
-import numpy as np
-import pandas as pd
-import numbers
-from sklearn.svm import LinearSVC
-from RuleListClassifier import RuleListClassifier
-
-class SVMBigDataRuleListClassifier(RuleListClassifier):
-    """
-    A scikit-learn compatible wrapper for the Bayesian Rule List
-    classifier by Benjamin Letham, adapted to work on large datasets. It
-    trains a linear SVM first, takes the subset of the training data closest
-    to the decision boundary (specified by the parameter training_subset),
-    which is most critical to learning a classifier, and then uses this subset
-    to learn a rule list.
-
-    It produces a highly interpretable model (a list of decision rules) of
-    the same form as an expert system.
-
-    Parameters
-    ----------
-    training_subset : float, optional (default=0.1)
-        Determines the fraction of the data to use for training the Bayesian
-        Rule List classifier (the data points closest to a linear decision
-        boundary are selected).
-
-    subsetSVM_C : float, optional (default=1)
-        Regularization parameter for the SVM which is used to determine which
-        fraction of the data is most important (i.e. closest to the decision
-        boundary) to use for training the Bayesian Rule List classifier
-
-    listlengthprior : int, optional (default=3)
-        Prior hyperparameter for expected list length (excluding null rule)
-
-    listwidthprior : int, optional (default=1)
-        Prior hyperparameter for expected list length (excluding null rule)
-
-    maxcardinality : int, optional (default=1)
-        Maximum cardinality of an itemset
-
-    minsupport : int, optional (default=10)
-        Minimum support (%) of an itemset
-
-    alpha : array_like, shape = [n_classes]
-        prior hyperparameter for multinomial pseudocounts
-
-    n_chains : int, optional (default=3)
-        Number of MCMC chains for inference
-
-    max_iter : int, optional (default=50000)
-        Maximum number of iterations
-
-    class1label: str, optional (default="class 1")
-        Label or description of what the positive class (with y=1) means
-
-    verbose: bool, optional (default=True)
-        Verbose output
-    """
-
-    def __init__(self, training_subset=0.1, subsetSVM_C=1, listlengthprior=3, listwidthprior=1, maxcardinality=2, minsupport=10, alpha = np.array([1.,1.]), n_chains=3, max_iter=50000, class1label="class 1", verbose=True):
-        self.training_subset = training_subset
-        self.subsetSVM_C = subsetSVM_C
-
-        self.listlengthprior = listlengthprior
-        self.listwidthprior = listwidthprior
-        self.maxcardinality = maxcardinality
-        self.minsupport = minsupport
-        self.alpha = alpha
-        self.n_chains = n_chains
-        self.max_iter = max_iter
-        self.class1label = class1label
-        self.verbose = verbose
-        self._zmin = 1
-
-        self.thinning = 1 #The thinning rate
-        self.burnin = self.max_iter//2 #the number of samples to drop as burn-in in-simulation
-
-        self.discretizer = None
-        self.d_star = None
-
-    def _setdata(self, X, y, feature_labels=[], undiscretized_features = []):
-        self._setlabels(X, feature_labels)
-
-        for fi in range(len(X[0])):
-            if not isinstance(X[0][fi], numbers.Number):
-                raise Exception("Sorry, only numeric data is supported by BigDataRuleListClassifier at this time")
-
-        # train linear SVM
-        self.svm = LinearSVC(C=self.subsetSVM_C)
-        self.svm.fit(X, y)
-        # calculate distances from decision boundary for each point
-        Xn = np.array(X)
-        dfun_ones = self.svm.decision_function(Xn[np.where(y==1)[0], :])
-        dist_ones = dfun_ones / np.linalg.norm(self.svm.coef_)
-        dfun_zeros = self.svm.decision_function(Xn[np.where(y==0)[0], :])
-        dist_zeros = dfun_zeros / np.linalg.norm(self.svm.coef_)
-
-        # take closest training_subset portion of data, preserving class imbalance
-        if self.verbose:
-            print "Reduced from", len(X)
-        n = int(len(y)*self.training_subset)
-        bestidx_ones = np.argsort(dist_ones)
-        bestidx_zeros = np.argsort(dist_zeros)
-        one_fraction = len(np.where(y==1)[0])/float(len(y))
-        keep_idx = bestidx_ones[:(int(n*one_fraction)+1)]
-        keep_idx = np.hstack((keep_idx, bestidx_zeros[:(int(n*(1-one_fraction))+1)]))
-
-        if type(X) == pd.DataFrame:
-            X = X.iloc[keep_idx, :]
-        else:
-            X = np.array(X)[keep_idx, :]
-        y = np.array(y)[keep_idx].astype(int)
-        if self.verbose:
-            print "...to", len(X), " data points"
-
-        X = self._discretize_mixed_data(X, y, undiscretized_features)
-        return X, y
\ No newline at end of file
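Both deleted "big data" variants were Python 2-only (bare print statements) and imported RuleListClassifier without a package prefix, so neither could load under the new imodels package layout; deleting them rather than porting them is consistent with the rest of this PR. For reference, the quantity the SVM variant ranked by is just the decision function scaled by the weight norm — a self-contained sketch on toy data, not library API (note the deleted code sorted the *signed* distances per class, so it favored points deep on the negative side of the hyperplane rather than points near it):

```python
import numpy as np
from sklearn.datasets import make_classification
from sklearn.svm import LinearSVC

X, y = make_classification(n_samples=200, n_features=5, random_state=0)
svm = LinearSVC(C=1.0).fit(X, y)

# decision_function returns w.x + b; dividing by ||w|| converts the signed
# score into a signed Euclidean distance from the separating hyperplane
dist = svm.decision_function(X) / np.linalg.norm(svm.coef_)
nearest_20 = np.argsort(np.abs(dist))[:20]  # the 20 samples closest to the boundary
```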
diff --git a/imodels/bayesian_rule_lists/discretization/MDLP.py b/imodels/bayesian_rule_lists/discretization/MDLP.py
index 7ceb2491..2e39eae6 100644
--- a/imodels/bayesian_rule_lists/discretization/MDLP.py
+++ b/imodels/bayesian_rule_lists/discretization/MDLP.py
@@ -2,7 +2,7 @@
 __author__ = 'Victor Ruiz, vmr11@pitt.edu'
 import pandas as pd
 import numpy as np
-from entropy import entropy, cut_point_information_gain
+from .entropy import entropy, cut_point_information_gain
 from math import log
 import sys
 import getopt
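The two import hunks above are the same Python 3 fix: Python 3 dropped implicit relative imports, so intra-package modules must be named with an explicit leading dot. A sketch of the rule, using the discretization package from this diff:

```python
# imodels/bayesian_rule_lists/discretization/MDLP.py
# Python 2 resolved a bare module name against the current package; Python 3
# treats it as an absolute import, so the sibling must be named explicitly:
# from entropy import entropy, cut_point_information_gain    # Python 2 only
from .entropy import entropy, cut_point_information_gain     # Python 3 (and
# Python 2 with `from __future__ import absolute_import`); this also requires
# an __init__.py in the directory, which this PR adds below
```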
diff --git a/imodels/bayesian_rule_lists/discretization/__init__.py b/imodels/bayesian_rule_lists/discretization/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/imodels/bayesian_rule_lists/examples/diabetes_bigdata_demo.py b/imodels/bayesian_rule_lists/examples/diabetes_bigdata_demo.py
deleted file mode 100644
index a36d5a5a..00000000
--- a/imodels/bayesian_rule_lists/examples/diabetes_bigdata_demo.py
+++ /dev/null
@@ -1,44 +0,0 @@
-from sklearn.cross_validation import train_test_split
-from sklearn.datasets.mldata import fetch_mldata
-from BigDataRuleListClassifier import *
-from SVMBigDataRuleListClassifier import *
-from sklearn.ensemble import RandomForestClassifier
-import time
-
-feature_labels = ["#Pregnant","Glucose concentration test","Blood pressure(mmHg)","Triceps skin fold thickness(mm)","2-Hour serum insulin (mu U/ml)","Body mass index","Diabetes pedigree function","Age (years)"]
-
-data = fetch_mldata("diabetes") # get dataset
-y = -(data.target-1)/2 # target labels (0: healthy, or 1: diabetes) - the original dataset contains -1 for diabetes and +1 for healthy
-
-###############################################################################
-
-Xtrain, Xtest, ytrain, ytest = train_test_split(data.data, y) # split
-
-t0 = time.time()
-# train classifier (allow more iterations for better accuracy)
-clf = RuleListClassifier(max_iter=10000, class1label="diabetes", verbose=False)
-clf.fit(Xtrain, ytrain, feature_labels=feature_labels)
-print "RuleListClassifier Accuracy:", clf.score(Xtest, ytest), "Learned interpretable model:\n", clf
-t1 = time.time()
-
-# train classifier (allow more iterations for better accuracy)
-bclf = BigDataRuleListClassifier(training_subset=0.1, subset_estimator=RandomForestClassifier(n_estimators=100).fit(Xtrain, ytrain), max_iter=10000, class1label="diabetes", verbose=False)
-bclf.fit(Xtrain, ytrain, feature_labels=feature_labels)
-print "BigDataRuleListClassifier Accuracy:", bclf.score(Xtest, ytest), "Learned interpretable model:\n", bclf
-t2 = time.time()
-
-# train classifier (allow more iterations for better accuracy)
-sclf = SVMBigDataRuleListClassifier(training_subset=0.1, subsetSVM_C=0.01, max_iter=10000, class1label="diabetes", verbose=False)
-sclf.fit(Xtrain, ytrain, feature_labels=feature_labels)
-print "SVMBigDataRuleListClassifier Accuracy:", bclf.score(Xtest, ytest), "Learned interpretable model:\n", sclf
-t3 = time.time()
-
-print "Comparison\n========="
-print "Time taken for RuleListClassifier: ", t1-t0, "Score achieved:", clf.score(Xtest, ytest)
-print "Time taken for BigDataRuleListClassifier: ", t2-t1, "Score achieved:", bclf.score(Xtest, ytest)
-print "Time taken for SVMBigDataRuleListClassifier: ", t3-t2, "Score achieved:", sclf.score(Xtest, ytest)
-print "========"
-
-###############################################################################
-
-print "RandomForestClassifier Accuracy:", RandomForestClassifier().fit(Xtrain, ytrain).score(Xtest, ytest)
\ No newline at end of file
diff --git a/imodels/bayesian_rule_lists/examples/hepatitis_mixeddata_demo.py b/imodels/bayesian_rule_lists/examples/hepatitis_mixeddata_demo.py
deleted file mode 100644
index 32bbf714..00000000
--- a/imodels/bayesian_rule_lists/examples/hepatitis_mixeddata_demo.py
+++ /dev/null
@@ -1,93 +0,0 @@
-from sklearn.cross_validation import train_test_split
-from sklearn.datasets.mldata import fetch_mldata
-from RuleListClassifier import *
-from sklearn.ensemble import RandomForestClassifier
-import pandas as pd
-
-"""
-https://archive.ics.uci.edu/ml/datasets/Hepatitis
-1. Class: DIE, LIVE
-2. AGE: 10, 20, 30, 40, 50, 60, 70, 80
-3. SEX: male, female
-4. STEROID: no, yes
-5. ANTIVIRALS: no, yes
-6. FATIGUE: no, yes
-7. MALAISE: no, yes
-8. ANOREXIA: no, yes
-9. LIVER BIG: no, yes
-10. LIVER FIRM: no, yes
-11. SPLEEN PALPABLE: no, yes
-12. SPIDERS: no, yes
-13. ASCITES: no, yes
-14. VARICES: no, yes
-15. BILIRUBIN: 0.39, 0.80, 1.20, 2.00, 3.00, 4.00 -- see the note below
-16. ALK PHOSPHATE: 33, 80, 120, 160, 200, 250
-17. SGOT: 13, 100, 200, 300, 400, 500,
-18. ALBUMIN: 2.1, 3.0, 3.8, 4.5, 5.0, 6.0
-19. PROTIME: 10, 20, 30, 40, 50, 60, 70, 80, 90
-20. HISTOLOGY: no, yes
-"""
-data = fetch_mldata("datasets-UCI hepatitis") # get dataset
-
-#some data cleaning (due to horrible mldata format)
-# target
-y = [1 if 'live' in v[0].lower() else 0 for v in data['Class'][0]]
-# categorical variables
-data['SEX'] = data.data
-feature_labels = [col for col in data['COL_NAMES'] if col == col.upper()]
-columns = {}
-for label in feature_labels:
-    column = data[label] if len(data[label])>1 else data[label][0]
-    while type(column[0]) == list or type(column[0]) == np.ndarray:
-        column = [c[0] for c in column]
-    columns[label] = pd.Series(column)
-# numeric variables
-columns['AGE'] = data.target
-columns['BILIRUBIN'] = pd.Series(data['double1'][0])
-columns['ALK PHOSPHATE'] = pd.Series(data['int2'][0])
-columns['ALK PHOSPHATE'][columns['ALK PHOSPHATE']<0] = np.nan
-columns['SGOT'] = pd.Series(data['int2'][1])
-columns['SGOT'][columns['SGOT']<0] = np.nan
-columns['ALBUMIN'] = pd.Series(data['double3'][0])
-columns['PROTIME'] = pd.Series(data['int4'][0])
-columns['PROTIME'][columns['PROTIME']<0] = np.nan
-# convert to dataframe
-hepatitis_df = pd.DataFrame(columns)
-# deal with missing values
-for c in hepatitis_df.columns:
-    if hepatitis_df[c].dtype != np.object:
-        hepatitis_df[c] = hepatitis_df[c].fillna(hepatitis_df[c][~np.isnan(hepatitis_df[c])].mean())
-
-print hepatitis_df.head()
-
-###############################################################################
-
-Xtrain, Xtest, ytrain, ytest = train_test_split(hepatitis_df, y) # split
-
-# train classifier (allow more iterations for better accuracy)
-clf = RuleListClassifier(max_iter=10000, class1label="survival", verbose=False)
-clf.fit(Xtrain, ytrain)
-
-print "RuleListClassifier Accuracy:", clf.score(Xtest, ytest), "Learned interpretable model:\n", clf
-
-###############################################################################
-
-try:
-    from category_encoders import HashingEncoder
-except:
-    raise Exception("Please install category_encoders (pip install category_encoders) for comparing mixed data with Random Forests!")
-from sklearn import pipeline
-
-ppl = pipeline.Pipeline([
-    ('encoder', HashingEncoder(cols=['LIVER_BIG', 'ANTIVIRALS', 'HISTOLOGY', 'SEX', 'STEROID', 'MALAISE', 'FATIGUE', 'SPIDERS', 'VARICES', 'LIVER_FIRM', 'SPLEEN_PALPABLE', 'ASCITES', 'ANOREXIA'])),
-    ('clf', RandomForestClassifier())
-])
-
-# back to dataframes (for HashingEncoder)
-Xtrain = pd.DataFrame(Xtrain)
-Xtrain.columns = hepatitis_df.columns
-Xtest = pd.DataFrame(Xtest)
-Xtest.columns = hepatitis_df.columns
-
-print "RandomForestClassifier Accuracy:", ppl.fit(Xtrain, ytrain).score(Xtest, ytest)
\ No newline at end of file
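The two deleted demos relied on long-removed APIs (sklearn.cross_validation, fetch_mldata from the defunct mldata.org, Python 2 print), so they no longer ran. If an equivalent example is wanted later, a rough Python 3 sketch of the diabetes demo might look like the following — this assumes the OpenML mirror of the Pima diabetes dataset, scikit-learn >= 0.22, and the package-qualified import path introduced by this PR; it is untested:

```python
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from imodels.bayesian_rule_lists.RuleListClassifier import RuleListClassifier

data = fetch_openml("diabetes", version=1, as_frame=False)
y = (data.target == "tested_positive").astype(int)  # 1 = diabetes
X_train, X_test, y_train, y_test = train_test_split(data.data, y, random_state=0)

# allow more iterations for better accuracy, as the original demo did
clf = RuleListClassifier(max_iter=10000, class1label="diabetes", verbose=False)
clf.fit(X_train, y_train, feature_labels=list(data.feature_names))
print("RuleListClassifier accuracy:", clf.score(X_test, y_test))
```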
diff --git a/imodels/optimal_classification_tree/__init__.py b/imodels/optimal_classification_tree/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/imodels/slim/__init__.py b/imodels/slim/__init__.py
new file mode 100644
index 00000000..e69de29b
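The empty __init__.py files added here (and the discretization one earlier in this diff) mark those directories as importable packages, which is also what lets setuptools.find_packages() in setup.py discover them. A quick check from the repo root (output is a sketch of what to expect, assuming the parent packages also have __init__.py):

```python
import setuptools

# should now list the subpackages gaining an __init__.py in this diff,
# e.g. 'imodels.slim' and 'imodels.optimal_classification_tree'
print(setuptools.find_packages())
```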
diff --git a/setup.py b/setup.py
index d3425ba2..5f5806c9 100644
--- a/setup.py
+++ b/setup.py
@@ -13,6 +13,17 @@
     long_description_content_type="text/markdown",
     url="https://github.com/csinva/interpretability-implementations-demos",
     packages=setuptools.find_packages(),
+    install_requires=[
+        'fim',
+        'numpy',
+        'scipy',
+        'matplotlib',
+        'pandas',
+        'scikit-learn',
+    ],
+    dependency_links=[
+        'https://github.com/csinva/pyfim-clone/tarball/master#egg=fim-6.28'
+    ],
     classifiers=[
         "Programming Language :: Python :: 3",
         "License :: OSI Approved :: MIT License",
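One caveat on the packaging hunk: pip only honors dependency_links behind the long-deprecated --process-dependency-links flag, and pip 19 removed that flag entirely, so the fim pin above will be ignored on modern installs. If the pyfim clone must come from GitHub, a PEP 508 direct reference (supported by pip 18.1+) is the more durable spelling, though PyPI rejects uploads that use it. An illustrative fragment, not the version in this PR:

```python
import setuptools

setuptools.setup(
    name="imodels",   # illustrative; keep the real metadata from setup.py above
    version="0.0.1",  # illustrative
    install_requires=[
        # PEP 508 direct reference replaces dependency_links on modern pip:
        "fim @ https://github.com/csinva/pyfim-clone/tarball/master",
        "numpy",
        "scipy",
        "matplotlib",
        "pandas",
        "scikit-learn",
    ],
)
```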