
Commit

bump version
csinva committed Nov 15, 2020
1 parent 87b75b4 commit a68e2e5
Showing 5 changed files with 90 additions and 70 deletions.
4 changes: 2 additions & 2 deletions docs/index.html
@@ -46,7 +46,7 @@ <h2 id="popular-interpretable-models">Popular interpretable models</h2>
 preds = model.predict(X_test)  # discrete predictions: shape is (n_test, 1)
 preds_proba = model.predict_proba(X_test)  # predicted probabilities: shape is (n_test, n_classes)
 </code></pre>
-<p>Install with <code>pip install git+https://github.com/csinva/imodels</code> (see <a href="https://github.com/csinva/imodels/blob/master/docs/troubleshooting.md">here</a> for help). Contains the following models:</p>
+<p>Install with <code>pip install imodels</code> (see <a href="https://github.com/csinva/imodels/blob/master/docs/troubleshooting.md">here</a> for help). Contains the following models:</p>
 <table>
 <thead>
 <tr>
@@ -139,7 +139,7 @@ <h2 id="references">References</h2>
 <li>Review on evaluating interpretability: doshi-velez & kim 2017, <a href="https://arxiv.org/pdf/1702.08608.pdf">pdf</a></li>
 </ul>
 </li>
-<li>Reference implementations (also linked above): the code here heavily derives from (and in some cases is just a wrapper for) the wonderful work of previous projects. We seek to extract out, combine, and maintain select relevant parts of these projects.<ul>
+<li>Reference implementations (also linked above): the code here heavily derives from the wonderful work of previous projects. We seek to extract out, unify, and maintain key parts of these projects.<ul>
 <li><a href="https://github.com/tmadl/sklearn-expertsys">sklearn-expertsys</a> - by <a href="https://github.com/tmadl">@tmadl</a> and <a href="https://github.com/kenben">@kenben</a> based on original code by <a href="http://lethalletham.com/">Ben Letham</a></li>
 <li><a href="https://github.com/christophM/rulefit">rulefit</a> - by <a href="https://github.com/christophM">@christophM</a></li>
 <li><a href="https://github.com/scikit-learn-contrib/skope-rules">skope-rules</a> - by the <a href="https://github.com/scikit-learn-contrib/skope-rules/blob/master/AUTHORS.rst">skope-rules team</a> (including <a href="https://github.com/ngoix">@ngoix</a>, <a href="https://github.com/floriangardin">@floriangardin</a>, <a href="https://github.com/datajms">@datajms</a>, <a href="">Bibi Ndiaye</a>, <a href="">Ronan Gautier</a>)</li>
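
The install hunk above switches the docs from a git-based install to the published PyPI package. For orientation, here is a minimal sketch of the fit/predict pattern the page documents; the dataset and the stand-in estimator below are illustrative assumptions, not part of this commit.

# Minimal sketch of the documented scikit-learn-style API. DecisionTreeClassifier
# stands in for any imodels estimator (an assumption for illustration only).
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

model = DecisionTreeClassifier(max_depth=3)
model.fit(X_train, y_train)
preds = model.predict(X_test)              # discrete predictions: shape (n_test,)
preds_proba = model.predict_proba(X_test)  # probabilities: shape (n_test, n_classes)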
116 changes: 66 additions & 50 deletions docs/rule_list/bayesian_rule_list/bayesian_rule_list.html
@@ -178,34 +178,36 @@ <h1 class="title">Module <code>imodels.rule_list.bayesian_rule_list.bayesian_rul
         y = y.values

         X, y = self._setdata(X, y, feature_labels, undiscretized_features)

         permsdic = defaultdict(default_permsdic)  # We will store here the MCMC results

         data = list(X[:])

         # Now find frequent itemsets
         # Mine separately for each class
         data_pos = [x for i, x in enumerate(data) if y[i] == 0]
         data_neg = [x for i, x in enumerate(data) if y[i] == 1]
         assert len(data_pos) + len(data_neg) == len(data)

-        X_df = pd.DataFrame(X, columns=feature_labels)
-        itemsets_df = fpgrowth(X_df, min_support=(self.minsupport / len(X)), max_len=self.maxcardinality)
+        X_colname_removed = data.copy()
+        for i in range(len(data)):
+            X_colname_removed[i] = list(map(lambda s: s.split(' : ')[1], X_colname_removed[i]))
+
+        X_df_categorical = pd.DataFrame(X_colname_removed, columns=feature_labels)
+        X_df_onehot = pd.get_dummies(X_df_categorical)
+        onehot_features = X_df_onehot.columns
+
+        itemsets_df = fpgrowth(X_df_onehot, min_support=(self.minsupport / len(X)), max_len=self.maxcardinality)
         itemsets_indices = [tuple(s[1]) for s in itemsets_df.values]
-        itemsets = [np.array(feature_labels)[list(inds)] for inds in itemsets_indices]
+        itemsets = [np.array(onehot_features)[list(inds)] for inds in itemsets_indices]
         itemsets = list(map(tuple, itemsets))
         if self.verbose:
             print(len(itemsets), 'rules mined')

         # Now form the data-vs.-lhs set
         # X[j] is the set of data points that contain itemset j (that is, satisfy rule j)
-        for c in X_df.columns:
-            X_df[c] = [c if x == 1 else '' for x in list(X_df[c])]
+        for c in X_df_onehot.columns:
+            X_df_onehot[c] = [c if x == 1 else '' for x in list(X_df_onehot[c])]
         X = [{}] * (len(itemsets) + 1)
         X[0] = set(range(len(data)))  # the default rule satisfies all data
         for (j, lhs) in enumerate(itemsets):
-            X[j + 1] = set([i for (i, xi) in enumerate(X_df.values) if set(lhs).issubset(xi)])
+            X[j + 1] = set([i for (i, xi) in enumerate(X_df_onehot.values) if set(lhs).issubset(xi)])

         # now form lhs_len
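
This hunk changes the mining step: instead of running FP-growth directly on the raw "feature : value" string matrix, the code now strips the feature-name prefix, one-hot encodes the values with pd.get_dummies, and mines the resulting indicator matrix, so mined itemset indices refer to one-hot columns. A toy sketch of that pattern, with invented data (mlxtend supplies fpgrowth, as in the diff):

import pandas as pd
from mlxtend.frequent_patterns import fpgrowth

# Toy stand-in for the discretized data: each cell is a 'feature : value' string
data = [['age : <30', 'income : high'],
        ['age : <30', 'income : low'],
        ['age : 30+', 'income : high']]
feature_labels = ['age', 'income']

# Keep only the value part, then one-hot encode the categorical columns
values = [[s.split(' : ')[1] for s in row] for row in data]
X_df_onehot = pd.get_dummies(pd.DataFrame(values, columns=feature_labels)).astype(bool)

# Mine frequent itemsets; fpgrowth returns frozensets of column positions,
# which map back to one-hot column names exactly as the new code does
itemsets_df = fpgrowth(X_df_onehot, min_support=0.5, max_len=2)
itemsets = [tuple(X_df_onehot.columns[list(inds)]) for inds in itemsets_df['itemsets']]
print(itemsets)  # e.g. [('age_<30',), ('income_high',)]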
@@ -290,15 +292,20 @@ <h1 class="title">Module <code>imodels.rule_list.bayesian_rule_list.bayesian_rul
         return "(Untrained RuleListClassifier)"

     def _to_itemset_indices(self, data):
+        X_colname_removed = data.copy()
+        for i in range(len(data)):
+            X_colname_removed[i] = list(map(lambda s: s.split(' : ')[1], X_colname_removed[i]))
+        X_df_categorical = pd.DataFrame(X_colname_removed, columns=self.feature_labels)
+        X_df_onehot = pd.get_dummies(X_df_categorical)
+
         # X[j] is the set of data points that contain itemset j (that is, satisfy rule j)
-        X_df = pd.DataFrame(data, columns=self.feature_labels)
-        for c in X_df.columns:
-            X_df[c] = [c if x == 1 else '' for x in list(X_df[c])]
+        for c in X_df_onehot.columns:
+            X_df_onehot[c] = [c if x == 1 else '' for x in list(X_df_onehot[c])]
         X = [set() for j in range(len(self.itemsets))]
         X[0] = set(range(len(data)))  # the default rule satisfies all data
         for (j, lhs) in enumerate(self.itemsets):
             if j > 0:
-                X[j] = set([i for (i, xi) in enumerate(X_df.values) if set(lhs).issubset(xi)])
+                X[j] = set([i for (i, xi) in enumerate(X_df_onehot.values) if set(lhs).issubset(xi)])
         return X

     def predict_proba(self, X):
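
In _to_itemset_indices, the same one-hot frame is then used to find which rows satisfy each rule: every 1 in the indicator matrix is replaced by its column name, so a row becomes the set of conditions it meets, and a rule fires exactly when its left-hand side is a subset of that set. A self-contained toy illustration of the trick (data invented for the example):

import pandas as pd

X_df_onehot = pd.DataFrame({'age_<30': [1, 1, 0], 'income_high': [1, 0, 1]})
itemsets = [('age_<30',), ('age_<30', 'income_high')]

# Replace each 1 with its column name: a row becomes the conditions it satisfies
for c in X_df_onehot.columns:
    X_df_onehot[c] = [c if x == 1 else '' for x in X_df_onehot[c]]

X = [set(range(len(X_df_onehot)))]  # the default rule matches every row
for lhs in itemsets:
    X.append({i for i, xi in enumerate(X_df_onehot.values) if set(lhs).issubset(xi)})
print(X)  # [{0, 1, 2}, {0, 1}, {0}]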
@@ -322,7 +329,7 @@ <h1 class="title">Module <code>imodels.rule_list.bayesian_rule_list.bayesian_rul
         if self.discretizer:
             self.discretizer._data = pd.DataFrame(X, columns=self.feature_labels)
             self.discretizer.apply_cutpoints()
-            D = self._prepend_feature_labels(np.array(self.discretizer._data)[:, :-1])
+            D = self._prepend_feature_labels(np.array(self.discretizer._data))
         else:
             D = X
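
The predict_proba fix stops slicing off the last column of the discretized data before prepending feature labels. _prepend_feature_labels itself is not shown in this diff; judging from the s.split(' : ')[1] parsing in fit, it plausibly turns each cell into a "feature : value" string, roughly like this hypothetical sketch:

import numpy as np

def prepend_feature_labels(D, feature_labels):
    # Hypothetical reconstruction, not the library's actual helper
    return [[f'{lab} : {val}' for lab, val in zip(feature_labels, row)] for row in D]

D = np.array([['<30', 'high'], ['30+', 'low']])
print(prepend_feature_labels(D, ['age', 'income']))
# [['age : <30', 'income : high'], ['age : 30+', 'income : low']]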

@@ -536,34 +543,36 @@ <h2 id="parameters">Parameters</h2>
         y = y.values

         X, y = self._setdata(X, y, feature_labels, undiscretized_features)

         permsdic = defaultdict(default_permsdic)  # We will store here the MCMC results

         data = list(X[:])

         # Now find frequent itemsets
         # Mine separately for each class
         data_pos = [x for i, x in enumerate(data) if y[i] == 0]
         data_neg = [x for i, x in enumerate(data) if y[i] == 1]
         assert len(data_pos) + len(data_neg) == len(data)

-        X_df = pd.DataFrame(X, columns=feature_labels)
-        itemsets_df = fpgrowth(X_df, min_support=(self.minsupport / len(X)), max_len=self.maxcardinality)
+        X_colname_removed = data.copy()
+        for i in range(len(data)):
+            X_colname_removed[i] = list(map(lambda s: s.split(' : ')[1], X_colname_removed[i]))
+
+        X_df_categorical = pd.DataFrame(X_colname_removed, columns=feature_labels)
+        X_df_onehot = pd.get_dummies(X_df_categorical)
+        onehot_features = X_df_onehot.columns
+
+        itemsets_df = fpgrowth(X_df_onehot, min_support=(self.minsupport / len(X)), max_len=self.maxcardinality)
         itemsets_indices = [tuple(s[1]) for s in itemsets_df.values]
-        itemsets = [np.array(feature_labels)[list(inds)] for inds in itemsets_indices]
+        itemsets = [np.array(onehot_features)[list(inds)] for inds in itemsets_indices]
         itemsets = list(map(tuple, itemsets))
         if self.verbose:
             print(len(itemsets), 'rules mined')

         # Now form the data-vs.-lhs set
         # X[j] is the set of data points that contain itemset j (that is, satisfy rule j)
-        for c in X_df.columns:
-            X_df[c] = [c if x == 1 else '' for x in list(X_df[c])]
+        for c in X_df_onehot.columns:
+            X_df_onehot[c] = [c if x == 1 else '' for x in list(X_df_onehot[c])]
         X = [{}] * (len(itemsets) + 1)
         X[0] = set(range(len(data)))  # the default rule satisfies all data
         for (j, lhs) in enumerate(itemsets):
-            X[j + 1] = set([i for (i, xi) in enumerate(X_df.values) if set(lhs).issubset(xi)])
+            X[j + 1] = set([i for (i, xi) in enumerate(X_df_onehot.values) if set(lhs).issubset(xi)])

         # now form lhs_len
@@ -648,15 +657,20 @@ <h2 id="parameters">Parameters</h2>
         return "(Untrained RuleListClassifier)"

     def _to_itemset_indices(self, data):
+        X_colname_removed = data.copy()
+        for i in range(len(data)):
+            X_colname_removed[i] = list(map(lambda s: s.split(' : ')[1], X_colname_removed[i]))
+        X_df_categorical = pd.DataFrame(X_colname_removed, columns=self.feature_labels)
+        X_df_onehot = pd.get_dummies(X_df_categorical)
+
         # X[j] is the set of data points that contain itemset j (that is, satisfy rule j)
-        X_df = pd.DataFrame(data, columns=self.feature_labels)
-        for c in X_df.columns:
-            X_df[c] = [c if x == 1 else '' for x in list(X_df[c])]
+        for c in X_df_onehot.columns:
+            X_df_onehot[c] = [c if x == 1 else '' for x in list(X_df_onehot[c])]
         X = [set() for j in range(len(self.itemsets))]
         X[0] = set(range(len(data)))  # the default rule satisfies all data
         for (j, lhs) in enumerate(self.itemsets):
             if j > 0:
-                X[j] = set([i for (i, xi) in enumerate(X_df.values) if set(lhs).issubset(xi)])
+                X[j] = set([i for (i, xi) in enumerate(X_df_onehot.values) if set(lhs).issubset(xi)])
         return X

     def predict_proba(self, X):
@@ -680,7 +694,7 @@ <h2 id="parameters">Parameters</h2>
         if self.discretizer:
             self.discretizer._data = pd.DataFrame(X, columns=self.feature_labels)
             self.discretizer.apply_cutpoints()
-            D = self._prepend_feature_labels(np.array(self.discretizer._data)[:, :-1])
+            D = self._prepend_feature_labels(np.array(self.discretizer._data))
         else:
             D = X

@@ -808,34 +822,36 @@ <h2 id="returns">Returns</h2>
         y = y.values

         X, y = self._setdata(X, y, feature_labels, undiscretized_features)

         permsdic = defaultdict(default_permsdic)  # We will store here the MCMC results

         data = list(X[:])

         # Now find frequent itemsets
         # Mine separately for each class
         data_pos = [x for i, x in enumerate(data) if y[i] == 0]
         data_neg = [x for i, x in enumerate(data) if y[i] == 1]
         assert len(data_pos) + len(data_neg) == len(data)

-        X_df = pd.DataFrame(X, columns=feature_labels)
-        itemsets_df = fpgrowth(X_df, min_support=(self.minsupport / len(X)), max_len=self.maxcardinality)
+        X_colname_removed = data.copy()
+        for i in range(len(data)):
+            X_colname_removed[i] = list(map(lambda s: s.split(' : ')[1], X_colname_removed[i]))
+
+        X_df_categorical = pd.DataFrame(X_colname_removed, columns=feature_labels)
+        X_df_onehot = pd.get_dummies(X_df_categorical)
+        onehot_features = X_df_onehot.columns
+
+        itemsets_df = fpgrowth(X_df_onehot, min_support=(self.minsupport / len(X)), max_len=self.maxcardinality)
         itemsets_indices = [tuple(s[1]) for s in itemsets_df.values]
-        itemsets = [np.array(feature_labels)[list(inds)] for inds in itemsets_indices]
+        itemsets = [np.array(onehot_features)[list(inds)] for inds in itemsets_indices]
         itemsets = list(map(tuple, itemsets))
         if self.verbose:
             print(len(itemsets), 'rules mined')

         # Now form the data-vs.-lhs set
         # X[j] is the set of data points that contain itemset j (that is, satisfy rule j)
-        for c in X_df.columns:
-            X_df[c] = [c if x == 1 else '' for x in list(X_df[c])]
+        for c in X_df_onehot.columns:
+            X_df_onehot[c] = [c if x == 1 else '' for x in list(X_df_onehot[c])]
         X = [{}] * (len(itemsets) + 1)
         X[0] = set(range(len(data)))  # the default rule satisfies all data
         for (j, lhs) in enumerate(itemsets):
-            X[j + 1] = set([i for (i, xi) in enumerate(X_df.values) if set(lhs).issubset(xi)])
+            X[j + 1] = set([i for (i, xi) in enumerate(X_df_onehot.values) if set(lhs).issubset(xi)])

         # now form lhs_len
@@ -953,7 +969,7 @@ <h2 id="returns">Returns</h2>
         if self.discretizer:
             self.discretizer._data = pd.DataFrame(X, columns=self.feature_labels)
             self.discretizer.apply_cutpoints()
-            D = self._prepend_feature_labels(np.array(self.discretizer._data)[:, :-1])
+            D = self._prepend_feature_labels(np.array(self.discretizer._data))
         else:
             D = X

