STAT 215A Submission for Jimmy Butler, Andrej Leban, Ian Shen, and Xin Zhou #7

Open

wants to merge 149 commits into master

Commits (149)
5031830
Init commit after fork: copied templates, ignored data/tbi_pecarn/
andleb Nov 16, 2021
f37d84e
added data files and my notebook
Nov 16, 2021
ba41f2e
updated notebook
Nov 17, 2021
64d2966
Meeting 1 notes, technical tips & helpers
andleb Nov 18, 2021
6b01bf0
Fixed initial notebooks
andleb Nov 18, 2021
4f4b5e2
typo
andleb Nov 18, 2021
dd39c87
Merge branch 'Yu-Group:master' into master
andleb Nov 22, 2021
3fa2396
Merge branch 'master' of github.com:Yu-Group/rule-vetting
andleb Nov 22, 2021
248aecb
Merge branch 'master' of github.com:andleb/rule-vetting
andleb Nov 22, 2021
cf19362
did some cleaning
Nov 24, 2021
bd392f6
Removed temp file
andleb Nov 24, 2021
b705577
baseline
Nov 26, 2021
e2ba85f
baseline wd
Nov 27, 2021
3581818
part of dataset
Nov 27, 2021
8d46288
Added documentation to dataset.py, small bugfixes
andleb Nov 27, 2021
6fea0ed
Jimmy's EDA nb: compatibility, comments
andleb Nov 27, 2021
f828d8b
documented cleaning process more clearly
Nov 27, 2021
993e885
dataset.py
Nov 27, 2021
053e515
commenting/documenting cleaning steps
Nov 27, 2021
697d19e
Merge branch 'master' of https://github.com/andleb/rule-vetting
Nov 27, 2021
3baee94
dataset.py modified
Nov 28, 2021
09387b3
edited some data cleaning notebooks/changed paths
Nov 28, 2021
217bd9d
Merge branch 'master' of https://github.com/andleb/rule-vetting
Nov 28, 2021
e1553df
Put doc back into dataset
andleb Nov 28, 2021
af50731
Union vars, dataset judgement calls wip
andleb Nov 28, 2021
ced16bb
deleting notebook
Nov 29, 2021
b925fdb
Merge branch 'master' of https://github.com/andleb/rule-vetting
Nov 29, 2021
3af2e28
HEMA JCs, WIP
andleb Nov 29, 2021
0510a11
Hema logic
andleb Nov 29, 2021
4465656
preprocess_data: re-worked, judgement calls implemented
andleb Nov 29, 2021
a3bde81
Added ActNorm judgement call
andleb Nov 30, 2021
13ca500
extract_features
andleb Nov 30, 2021
d925bb1
Overloaded get_data
andleb Nov 30, 2021
227f5df
Merge branch 'judg_calls'
andleb Nov 30, 2021
af88b5f
documenting preprocessing and small changes to dataset.py
Dec 1, 2021
7b291c7
started postprocessing EDA
Dec 1, 2021
34f7556
baseline original
Dec 1, 2021
dc6594f
described the dataset and umbrella flattening procedure
Dec 2, 2021
64d3532
added feature extraction judgement call for AMs variables
Dec 2, 2021
11f2f6f
updated readme.md
jbbutler Dec 2, 2021
91bcefe
updated readme.md
jbbutler Dec 2, 2021
399c870
updated again
jbbutler Dec 2, 2021
e62e77f
did some postprocessing EDA
Dec 2, 2021
c30620c
Merge branch 'master' of https://github.com/andleb/rule-vetting
Dec 2, 2021
bbeb188
Add files via upload
ias5 Dec 2, 2021
c907080
models_Xin.ipynb
Dec 3, 2021
27e38e6
interpret Models_Xin
Dec 3, 2021
b310778
Merge branch 'judg_calls'
andleb Dec 3, 2021
4f18671
added the writeup markdown file
Dec 3, 2021
1f711b3
worked on some models and EDA
Dec 4, 2021
f6fecaf
did some EDA and added GCS recoding in dataset.py
Dec 5, 2021
e6e54c6
stash merge
andleb Dec 5, 2021
2afd3ab
updates to dataset.py
andleb Dec 5, 2021
e1aa828
Unioning of umbrellas implemented
andleb Dec 5, 2021
f9fb709
Dataset - work on perturbations
andleb Dec 5, 2021
45dc394
Update data_dictionary.md
jbbutler Dec 6, 2021
8a02716
Update data_dictionary.md
jbbutler Dec 6, 2021
eb29b5a
Update data_dictionary.md
jbbutler Dec 6, 2021
2286023
Update data_dictionary.md
jbbutler Dec 6, 2021
e65d8e1
Update data_dictionary.md
jbbutler Dec 6, 2021
9be3a03
Update data_dictionary.md
jbbutler Dec 6, 2021
538c672
Update data_dictionary.md
jbbutler Dec 6, 2021
d113629
Update data_dictionary.md
jbbutler Dec 6, 2021
4ef7da5
Update data_dictionary.md
jbbutler Dec 6, 2021
58666f9
Update data_dictionary.md
jbbutler Dec 6, 2021
da6f798
wrote intro
Dec 6, 2021
7376d2c
Merge branch 'master' of https://github.com/andleb/rule-vetting
Dec 6, 2021
d396122
Age splitting
andleb Dec 6, 2021
441e98c
did some more writing
Dec 6, 2021
ae28302
Merge branch 'master' of https://github.com/andleb/rule-vetting
Dec 6, 2021
a9f447d
OSI umbrella, touched up data_dictionary
andleb Dec 6, 2021
6e7cc90
Merge remote-tracking branch 'origin/master'
andleb Dec 6, 2021
211f9e5
In-place documentation for the judgment calls
andleb Dec 6, 2021
900ab05
dataset - removed all *GCS* from the age invariant split
andleb Dec 6, 2021
dac2c4b
ageinvariant GCS removed
Dec 6, 2021
5cca768
model exploration
andleb Dec 6, 2021
58d565f
rulefit_Xin
Dec 6, 2021
b047442
Revert "ageinvariant GCS removed"
andleb Dec 6, 2021
ff40895
Merge branch 'master' into test
andleb Dec 6, 2021
e966f30
grl_xin
Dec 7, 2021
e8af84d
Changed intro figures to be specific to children; added source
ias5 Dec 7, 2021
df146d6
Added CDC citation about TBI incidence
ias5 Dec 7, 2021
3d6b33c
minor tweak in dataset
andleb Dec 7, 2021
42c90c5
work on rulefit
andleb Dec 7, 2021
6e6f594
Merge branch 'master' of github.com:andleb/rule-vetting
andleb Dec 7, 2021
1f5aca9
RF RMD with hyperparameters and ROC plots
ias5 Dec 7, 2021
a8e6a19
did some writing
Dec 7, 2021
82aa619
fixing writing merge conflict
Dec 7, 2021
0abfca0
baseline model modified
Dec 7, 2021
46c08bc
wrote up EDA
Dec 7, 2021
dacc71e
Merge branch 'master' of https://github.com/andleb/rule-vetting
Dec 7, 2021
17078fe
Minor updates, largely to EDA
ias5 Dec 8, 2021
cdd105c
fixed some EDA plots
Dec 8, 2021
c398a8c
baseline writeup
Dec 8, 2021
0238fa4
edited the writeup a bit
Dec 9, 2021
0fbfee4
baseline.py
Dec 9, 2021
0e75524
writeup classifier
Dec 9, 2021
177d497
proofreading
Dec 9, 2021
e15f447
proofreading
Dec 9, 2021
ff97ce7
references
andleb Dec 9, 2021
cb5f608
Some rulefit writeup, basic model results
andleb Dec 9, 2021
859d3c9
added some notes for writeup
Dec 9, 2021
8e988db
Unignored some pickles, added articles and the pdf
andleb Dec 9, 2021
41c703e
Merge remote-tracking branch 'origin/master'
andleb Dec 9, 2021
af758a1
Added the stats - ROC curve - for the basic and the maximal model
andleb Dec 9, 2021
3ad8042
model_bset.py
Dec 10, 2021
76cab48
Intro to rulefit writeup
andleb Dec 10, 2021
1187163
Replaced "judgement" with "judgment"
andleb Dec 10, 2021
be77fd7
Added RF writeup in stability analysis
ias5 Dec 10, 2021
19f2477
Make age invariant
ias5 Dec 10, 2021
8df6c51
Delete tbi_rf.pdf
ias5 Dec 10, 2021
e7b5460
Made age invariant
ias5 Dec 10, 2021
95447e0
Add files via upload
ias5 Dec 10, 2021
e51b40b
Added importance image
ias5 Dec 10, 2021
5dcaedc
Randomness writeup
andleb Dec 10, 2021
7c1c333
Merge remote-tracking branch 'origin/master'
andleb Dec 10, 2021
dabcdf7
bugfix
andleb Dec 10, 2021
c840e49
writeup+fig
Dec 10, 2021
77c4621
did perturbations and posthoc, worked on dataset.py
Dec 10, 2021
1ab7496
writeup
Dec 10, 2021
b925fd2
fixed merge conflicts in writeup
Dec 10, 2021
cbee80b
Proofreading
ias5 Dec 10, 2021
aed8701
writeup
Dec 11, 2021
bb34e8e
greedy rule list
Dec 11, 2021
22eb08e
comment for py models
Dec 11, 2021
662a66d
comment ipynb Xin
Dec 11, 2021
472d833
proofreading Xin
Dec 11, 2021
aa31431
Virtual env check not required
andleb Dec 11, 2021
0627582
Merge remote-tracking branch 'origin/master'
andleb Dec 11, 2021
138cb18
Complete documentation for dataset.py
andleb Dec 11, 2021
56ff808
Some proofreading
andleb Dec 11, 2021
275a81e
dataset autoformat mess fix
andleb Dec 11, 2021
6242cbe
test compatibility, temporarily modified test_models for checking
andleb Dec 11, 2021
be2978e
figures size
Dec 11, 2021
c93d5ef
editing the writeup
Dec 11, 2021
c0d0013
forgot to push changes from yesterday
Dec 11, 2021
e672cd0
did some more proofreading/reorganizing
Dec 11, 2021
9051f8e
some more editing
Dec 11, 2021
c797a2b
Update readme.md
jbbutler Dec 12, 2021
8bee5d5
reorganizing notebooks
Dec 12, 2021
135a7d7
Merge branch 'master' of https://github.com/andleb/rule-vetting
Dec 12, 2021
27c6ad7
Create readme.md
jbbutler Dec 12, 2021
761473d
Create contributions.md
jbbutler Dec 12, 2021
7e98501
greedy rule list
Dec 12, 2021
c67b29a
table 2
Dec 12, 2021
f1fc838
Fixed broken refs
andleb Dec 12, 2021
47748ac
Removed the modifications to the tests
andleb Dec 12, 2021
0d4d55d
Cleanup
andleb Dec 12, 2021
57330c4
Don't push feedback
andleb Apr 1, 2022
Files changed
22 changes: 19 additions & 3 deletions .gitignore
@@ -1,7 +1,7 @@
**.ipynb_checkpoints
**.pkl
# **.pkl
**cache*
**.pdf
# **.pdf
**.zip
**.npy
**AutogluonModels*
@@ -29,7 +29,6 @@ venv

build
.gitmodules
build
dist

**.gslides
@@ -39,3 +38,20 @@ dist
experiments/data/
!experiments/data/get_datasets.ipynb
.hypothesis

# directories
/rule-env/
/data/tbi_pecarn/
/docs/projects/tbi_pecarn/~$glis Meeting Notes.docx
/docs/projects/tbi_pecarn/.~lock.Inglis Meeting Notes.docx\#
/activate.sh
/rulevetting/projects/tbi_pecarn/.Rhistory
!/rulevetting/projects/tbi_pecarn/notebooks/models
/models/decision_tree.pkl

/rulevetting/projects/tbi_pecarn/notebooks/models/decision_tree.pkl
/rulevetting/projects/tbi_pecarn/notebooks/models/grl.pkl
/rulevetting/projects/tbi_pecarn/notebooks/models/rulefit.pkl
*.RData
/Feedback.pdf
/writeup-annotated.pdf
Binary file not shown.
Binary file not shown.
Binary file not shown.
194 changes: 194 additions & 0 deletions rulevetting/projects/tbi_pecarn/ModelFitting.py
@@ -0,0 +1,194 @@
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import warnings
import pickle as pkl
from sklearn import ensemble
from sklearn import tree
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier,plot_tree
from sklearn.ensemble import RandomForestClassifier
from tqdm import tqdm
from os.path import join as oj
import imodels
MODELS_DIR = './models'
os.makedirs(MODELS_DIR, exist_ok=True)

# Update this directory so the csv can be read locally
os.chdir("C://Users/ianan/Downloads")
tbi_data = pd.read_csv("clean_dataset_11_30.csv")

# Train/tune split (methods adapted from fit-interpretable-models).
# The csv is a single cleaned dataset, so we split it here; the random split below
# is an assumption, and the project's Dataset.get_data() splits could be used instead.
from sklearn.model_selection import train_test_split

df_train, df_tune = train_test_split(tbi_data, test_size=0.2, random_state=42)
X_train = df_train.drop(columns="outcome")
y_train = df_train["outcome"].values
X_tune = df_tune.drop(columns="outcome")
y_tune = df_tune["outcome"].values
processed_feats = X_train.columns.tolist()


def rename(x):
    RENAME_DICT = {}
    x = (
        RENAME_DICT
        .get(x, x.replace('_yes', '')
             .replace('_', ' '))
        .replace('ThoracicTrauma', 'Thoracic Trauma')
        .replace('VomitWretch', 'Vomit/wretch')
        .replace('AbdDistention or AbdomenPain', 'Abdominal distention / Abdomen pain')
        .replace('DecrBreathSounds', 'Decreased breath sounds')
        .replace('AbdTenderDegree None', 'No Abdominal Tenderness')
    )
    return x
feature_names = [rename(x) for x in list(X_train)]

def all_stats_curve(y_test, preds_proba, plot=False, thresholds=None):
    '''preds_proba should be 1d
    '''
    if thresholds is None:
        thresholds = sorted(np.unique(preds_proba))
    all_stats = {
        s: [] for s in ['sens', 'spec', 'ppv', 'npv', 'lr+', 'lr-', 'f1']
    }
    for threshold in tqdm(thresholds):
        preds = preds_proba > threshold
        tn, fp, fn, tp = metrics.confusion_matrix(y_test, preds).ravel()
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            sens = tp / (tp + fn)
            spec = tn / (tn + fp)
            all_stats['sens'].append(sens)
            all_stats['spec'].append(spec)
            all_stats['ppv'].append(tp / (tp + fp))
            all_stats['npv'].append(tn / (tn + fn))
            all_stats['lr+'].append(sens / (1 - spec))
            all_stats['lr-'].append((1 - sens) / spec)
            all_stats['f1'].append(tp / (tp + 0.5 * (fp + fn)))

    if plot:
        plt.plot(all_stats['sens'], all_stats['spec'], '.-')
        plt.xlabel('sensitivity')
        plt.ylabel('specificity')
        plt.grid()
    return all_stats, thresholds

def predict_and_save(model, model_name='decision_tree'):
    '''Plots cv and returns cv, saves all stats
    '''
    results = {'model': model}
    for x, y, suffix in zip([X_train, X_tune],
                            [y_train, y_tune],
                            ['_train', '_tune']):
        stats, threshes = all_stats_curve(y, model.predict_proba(x)[:, 1],
                                          plot=suffix == '_tune')
        for stat in stats.keys():
            results[stat + suffix] = stats[stat]
        results['threshes' + suffix] = threshes
    pkl.dump(results, open(oj(MODELS_DIR, model_name + '.pkl'), 'wb'))
    return stats, threshes

# fit decision tree
dt = DecisionTreeClassifier(max_depth=4, class_weight={0: 1, 1: 1e3})
dt.fit(X_train, y_train)
stats, threshes = predict_and_save(dt, model_name='decision_tree')
# plt.xlim((0.8, 1.0))
# plt.ylim((0.5, 1.0))
plt.show()

fig = plt.figure(figsize=(50, 40))
plot_tree(dt, feature_names=feature_names, filled=True)
plt.show()

# fit random forest

# specify a random forest with a maximum depth
rf = RandomForestClassifier(n_estimators=200, max_depth=5)
rf.fit(X_train, y_train)
stats, threshes = predict_and_save(rf, model_name='rf')  # returns (stats, thresholds)
imps = rf.feature_importances_.round(3)
args = np.argsort(imps)
imps = imps[args]
feats = np.array(processed_feats)[args]
for imp, feat in zip(imps, feats):
    print(feat, imp)

np.random.seed(13)
# train classifier (allow more iterations for better accuracy; use BigDataRuleListClassifier for large datasets)
print('training bayesian_rule_list...')
brl = imodels.BayesianRuleListClassifier(listlengthprior=2, max_iter=10000, class1label="IwI", verbose=False)
brl.fit(X_train, y_train, feature_names=feature_names)
stats, threshes = predict_and_save(brl, model_name='bayesian_rule_list')
print(brl)

# fit a rulefit model
np.random.seed(13)
# note: predict_and_save below calls predict_proba; a classifier variant
# (e.g. imodels.RuleFitClassifier) may be needed for that call to succeed
rulefit = imodels.RuleFitRegressor(max_rules=4)
rulefit.fit(X_train, y_train, feature_names=feature_names)

# preds = rulefit.predict(X_test)
stats, threshes = predict_and_save(rulefit, model_name='rulefit')
rulefit.visualize()

class_weight = {0: 1, 1: 100}
d = imodels.GreedyRuleListClassifier(max_depth=9, class_weight=class_weight, criterion='neg_corr')
d.fit(X_train, y_train, feature_names=feature_names, verbose=False)
stats, threshes = predict_and_save(d, model_name='grl')
# d.print_list()
print(d)

def plot_metrics(suffix, title=None, fs=15):
    for fname in sorted(os.listdir(MODELS_DIR)):
        if 'pkl' in fname:
            if not fname[:-4] == 'rf':
                r = pkl.load(open(oj(MODELS_DIR, fname), 'rb'))
                # print(r)
                # print(r.keys())

                threshes = np.array(r['threshes' + suffix])
                sens = np.array(r['sens' + suffix])
                spec = np.array(r['spec' + suffix])
                plt.plot(100 * sens, 100 * spec, 'o-', label=fname[:-4], alpha=0.6, markersize=3)
                plt.xlabel('Sensitivity (%)', fontsize=fs)
                plt.ylabel('Specificity (%)', fontsize=fs)
                s = suffix[1:]
                if title is None:
                    # data_sizes is only needed when no title is passed (all calls below pass one)
                    plt.title(f'{s}\n{data_sizes[s][0]} IAI-I / {data_sizes[s][1]}')
                else:
                    plt.title(title, fontsize=fs)

                # print best results
                if suffix == '_test2':
                    idxs = (sens > 0.95) & (spec > 0.43)
                    if np.sum(idxs) > 0:
                        idx_max = np.argmax(spec[idxs])
                        print(fname, f'{100 * sens[idxs][idx_max]:0.2f} {100 * spec[idxs][idx_max]:0.2f}')

    if suffix == '_test2':
        plt.plot(96.77, 43.98, 'o', color='black', label='Original CDR', ms=4)
    else:
        plt.plot(97.0, 42.5, 'o', color='black', label='Original CDR', ms=4)
    plt.grid()


suffixes = ['_train', '_tune']  # _train, _test1, _test2, _cv
titles = ['Train (PECARN)', 'Tune (PECARN)']
R, C = 1, len(suffixes)
plt.figure(dpi=200, figsize=(C * 2.5, R * 3), facecolor='w')
fs = 10
for i, suffix in enumerate(suffixes):
    ax = plt.subplot(R, C, i + 1)
    plot_metrics(suffix, title=titles[i], fs=fs)
    if i > 0:
        plt.ylabel('')
        plt.yticks([0, 25, 50, 75, 100], labels=[''] * 5)
        # ax.yaxis.set_visible(False)
    plt.xlim((50, 101))
    plt.ylim((0, 101))
plt.tight_layout()
# plt.subplot(R, C, 1)
# plt.legend(fontsize=20)
plt.legend(bbox_to_anchor=(1.1, 1), fontsize=fs, frameon=False)
plt.savefig('figs/metrics_3_splits')
plt.show()
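
For reference, a minimal sketch of reading back one of the result pickles written by predict_and_save above. The decision_tree.pkl file name and the 95% sensitivity cutoff are illustrative assumptions; the key layout ('sens_tune', 'spec_tune', 'threshes_tune') matches the dictionary saved by that function.

import pickle as pkl
from os.path import join as oj

import numpy as np

MODELS_DIR = './models'

# load the saved results dictionary for the decision tree (file name is an assumption)
with open(oj(MODELS_DIR, 'decision_tree.pkl'), 'rb') as f:
    results = pkl.load(f)

# per-threshold sensitivity/specificity on the tuning split, as saved by predict_and_save
sens = np.array(results['sens_tune'])
spec = np.array(results['spec_tune'])
threshes = np.array(results['threshes_tune'])

# report the most specific operating point that keeps sensitivity above 95% (illustrative cutoff)
ok = sens > 0.95
if ok.any():
    best = np.argmax(spec[ok])
    print(f'threshold={threshes[ok][best]:.3f}  '
          f'sens={100 * sens[ok][best]:.1f}%  spec={100 * spec[ok][best]:.1f}%')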
105 changes: 105 additions & 0 deletions rulevetting/projects/tbi_pecarn/baseline.py
@@ -0,0 +1,105 @@
import numpy as np  # needed by predict_proba below
import pandas as pd

from rulevetting.templates.model import ModelTemplate


class Baseline(ModelTemplate):

    def __init__(self, agegroup: str = "old"):
        # query for each rule + resulting predicted probability
        self.agegroup = agegroup
        # Kuppermann et al. derive two classifiers, for children aged < 2 and >= 2
        if self.agegroup == 'young':
            self.rules = [
                ('AMS == 1', 4.1),
                ('HemaLoc == [2, 3]', 1.9),
                ('LocLen == [2, 3, 4]', 2.0),
                ('High_impact_InjSev == 3', 0.5),
                ('SFxPalp == 1', 33.3),
                ('ActNorm == 0', 0.4),

                # final condition is just something that is always true
                ('GCSTotal >= 0', 0.03),
            ]
        if self.agegroup == 'old':
            self.rules = [
                ('AMS == 1', 4.1),
                ('LOCSeparate == [1, 2]', 1.2),
                ('Vomit == 1', 0.9),
                ('High_impact_InjSev == 3', 0.5),
                ('SFxBas == 1', 9.0),
                ('HASeverity == 3', 1.3),

                # final condition is just something that is always true
                ('GCSTotal >= 0', 0.05),
            ]

    def _traverse_rule(self, df_features: pd.DataFrame):
        str_print = f''
        predicted_probabilities = pd.Series(index=df_features.index, dtype=float)
        df = df_features.copy()
        o = 'outcome'  # outcome variable name
        str_print += f'{df[o].sum()} / {df.shape[0]} (positive class / total)\n\t\u2193 \n'
        for j, rule in enumerate(self.rules):
            query, prob = rule
            df_rhs = df.query(query)
            idxs_satisfying_rule = df_rhs.index
            # the prob used in the rule should approximate computed_prob (the observed frequency)
            predicted_probabilities.loc[idxs_satisfying_rule] = prob
            # drop the rows we just assigned a prob
            df.drop(index=idxs_satisfying_rule, inplace=True)
            # compute the frequency in percent
            computed_prob = 100 * df_rhs[o].sum() / df_rhs.shape[0]
            query_print = query.replace(' == 1', '')  # for printing purposes
            if j < len(self.rules) - 1:
                str_print += f'\033[96mIf {query_print:<35}\033[00m \u2192 {df_rhs[o].sum():>3} / {df_rhs.shape[0]:>4} ({computed_prob:0.1f}%)\n\t\u2193 \n {df[o].sum():>3} / {df.shape[0]:>5}\t \n'
        # by now every patient has been assigned a probability
        predicted_probabilities = predicted_probabilities.values
        self.str_print = str_print
        return predicted_probabilities

    def predict(self, df_features: pd.DataFrame):
        predicted_probabilities = self._traverse_rule(df_features)
        # for each age group, use a different threshold
        # (based on the prob from the final, always-true condition)
        if self.agegroup == "young":
            return (predicted_probabilities > 0.031).astype(int)
        if self.agegroup == "old":
            return (predicted_probabilities > 0.051).astype(int)

    def predict_proba(self, df_features: pd.DataFrame):
        # convert from percent to proportion
        predicted_probabilities = self._traverse_rule(df_features) / 100
        return np.vstack((1 - predicted_probabilities, predicted_probabilities)).transpose()

    def print_model(self, df_features):
        self._traverse_rule(df_features)
        return self.str_print


if __name__ == '__main__':
    import numpy as np
    import pandas as pd
    from rulevetting.projects.tbi_pecarn.dataset import Dataset

    # use original data
    tbi_df = Dataset().clean_data()
    tbi_df.index = tbi_df.PatNum.copy()

    # data processing - same as Kuppermann
    tbi_df = tbi_df[tbi_df['GCSGroup'] == 2]
    tbi_df.drop(tbi_df[tbi_df.outcome.isnull()].index,
                inplace=True)

    # divided by age - same as Kuppermann
    tbi_df_young = tbi_df[tbi_df['AgeinYears'] < 2]
    tbi_df_old = tbi_df[tbi_df['AgeinYears'] >= 2]

    # baseline for age < 2
    model_young = Baseline("young")
    preds_proba = model_young.predict_proba(tbi_df_young)
    print(model_young.print_model(tbi_df_young))

    # baseline for age >= 2
    model_old = Baseline('old')
    print(model_old.print_model(tbi_df_old))
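
As a usage note, a minimal sketch of scoring the age >= 2 baseline against the observed outcomes, assuming (as in the __main__ block above) a dataframe tbi_df_old with an 'outcome' column and the features referenced by the rules:

import numpy as np
from sklearn.metrics import confusion_matrix

# hard labels from Baseline.predict() vs. observed outcomes
preds = model_old.predict(tbi_df_old)
tn, fp, fn, tp = confusion_matrix(tbi_df_old['outcome'], preds).ravel()
print(f'sensitivity = {tp / (tp + fn):.3f}, specificity = {tn / (tn + fp):.3f}')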
37 changes: 37 additions & 0 deletions rulevetting/projects/tbi_pecarn/contributions.md
@@ -0,0 +1,37 @@
# Who Contributed What to the Project

## Repo Files:

+ readme.md: Jimmy
+ data_dictionary.md: Jimmy
+ dataset.py: Andrej coded the preprocessing steps from the preprocessing notebook, implemented the judgment calls (a complex task that required a great deal of time and effort), and wrote the final version; Jimmy did the data cleaning and preprocessing; Xin helped by modifying other functions
+ helper.py: Andrej
+ baseline.py: Xin
+ model_best.py (and model_best.pkl): Xin
+ notebooks for EDA, BestModelStabilityCheck, PostHocAnalysis, preprocessing: Jimmy
+ notebooks for Fitmodels: Andrej
+ notebooks for Fitmodels, RuleFit (and its CV), Grl (and its CV): Xin
+ Rmd for rf (random forest): Ian

## Write-up (who wrote what)

+ Introduction and problem motivation: Jimmy
+ Data collection: Jimmy
+ Data cleaning and preprocessing: Jimmy
+ Exploratory Data Analysis: Jimmy
+ Baseline model: Xin
+ Proposed model Rulefit: Introduction and Basic Illustrative Implementation: Andrej
+ Proposed model Procedure and Performance: Xin
+ Randomness Assessment: Andrej
+ Stability Analysis: Jimmy wrote the intro paragraph, the baseline-model paragraphs in Feature Stability, and Classification Result Stability; Ian wrote the random-forest paragraphs in Feature Stability.
+ Discussion: Jimmy wrote the first paragraph, on false negatives; Xin wrote the remaining parts.
+ Conclusion: Xin

+ Proofreading: all

## Meeting with Physician:
+ 1st meeting: EDA powerpoint prepared by Jimmy
+ 2nd meeting: notebooks of results/EDA prepared by Jimmy and Xin

## Technical Support:
+ Andrej