diff --git a/feature_importance/01_run_ablation_classification.py b/feature_importance/01_run_ablation_classification.py index adf1199..989f5cc 100644 --- a/feature_importance/01_run_ablation_classification.py +++ b/feature_importance/01_run_ablation_classification.py @@ -22,7 +22,7 @@ from sklearn.linear_model import LogisticRegressionCV from sklearn.svm import SVC import xgboost as xgb -from imodels.importance import RandomForestPlusRegressor, RandomForestPlusClassifier +from imodels.tree.rf_plus.rf_plus.rf_plus_models import RandomForestPlusRegressor, RandomForestPlusClassifier sys.path.append(".") sys.path.append("..") sys.path.append("../..") @@ -62,11 +62,11 @@ def ablation_to_mean(train, data, feature_importance, mode, num_features): """ - Replace the top num_features max feature importance data with random shuffle for each sample + Replace the top num_features max feature importance data with mean value for each sample """ train_mean = np.mean(train, axis=0) assert mode in ["max", "min"] - fi = feature_importance.to_numpy() + fi = feature_importance if mode == "max": indices = np.argsort(-fi) else: @@ -77,17 +77,18 @@ def ablation_to_mean(train, data, feature_importance, mode, num_features): data_copy[i, indices[i,j]] = train_mean[indices[i,j]] return data_copy -def ablation_by_addition(data, feature_importance, mode, num_features): +def ablation_by_addition(train, data, feature_importance, mode, num_features): """ - Initialize the data with zeros and add the top num_features max feature importance data for each sample + Initialize the data with mean values and add the top num_features max feature importance data for each sample """ assert mode in ["max", "min"] - fi = feature_importance.to_numpy() + fi = feature_importance if mode == "max": indices = np.argsort(-fi) else: indices = np.argsort(fi) - data_copy = np.zeros(data.shape) + row_values = np.mean(train, axis=0).tolist() + data_copy = np.array([row_values] * data.shape[0]) for i in range(data.shape[0]): for j in range(num_features): data_copy[i, indices[i,j]] = data[i, indices[i,j]] @@ -137,21 +138,20 @@ def compare_estimators(estimators: List[ModelConfig], y_tune = y y_test = y - normalizer = preprocessing.Normalizer() - if splitting_strategy == "train-test": - X_train = normalizer.fit_transform(X_train) - X_test = normalizer.transform(X_test) - else: - X = normalizer.fit_transform(X) - X_train = normalizer.transform(X_train) - X_test = normalizer.transform(X_test) - - - # fit model + # fit RF model est.fit(X_train, y_train) - test_all_auc = roc_auc_score(y_test, est.predict_proba(X_test)[:, 1]) - test_all_auprc = average_precision_score(y_test, est.predict_proba(X_test)[:, 1]) - test_all_f1 = f1_score(y_test, est.predict_proba(X_test)[:, 1] > 0.5) + test_all_auc_rf = roc_auc_score(y_test, est.predict_proba(X_test)[:, 1]) + test_all_auprc_rf = average_precision_score(y_test, est.predict_proba(X_test)[:, 1]) + test_all_f1_rf = f1_score(y_test, est.predict_proba(X_test)[:, 1] > 0.5) + + # fit RF_plus model + start = time.time() + rf_plus_base = RandomForestPlusClassifier(rf_model=est) + rf_plus_base.fit(X_train, y_train) + end = time.time() + test_all_auc_rf_plus = roc_auc_score(y_test, rf_plus_base.predict_proba(X_test)[:, 1]) + test_all_auprc_rf_plus = average_precision_score(y_test, rf_plus_base.predict_proba(X_test)[:, 1]) + test_all_f1_rf_plus = f1_score(y_test, rf_plus_base.predict_proba(X_test)[:, 1] > 0.5) np.random.seed(42) indices_train = np.random.choice(X_train.shape[0], 100, replace=False) @@ -161,46 +161,57 @@ 
def compare_estimators(estimators: List[ModelConfig], X_test_subset = X_test[indices_test] y_test_subset = y_test[indices_test] - # loop over fi estimators - rng = np.random.RandomState() - number_of_ablations = 1 - seeds = rng.randint(0, 10000, number_of_ablations) for fi_est in tqdm(fi_ests): metric_results = { 'model': model.name, 'fi': fi_est.name, 'train_size': X_train.shape[0], + 'train_subset_size': X_train_subset.shape[0], 'test_size': X_test.shape[0], + 'test_subset_size': X_test_subset.shape[0], 'num_features': X_train.shape[1], 'data_split_seed': args.split_seed, - 'test_all_auc': test_all_auc, - 'test_all_auprc': test_all_auprc, - 'test_all_f1': test_all_f1 + 'test_all_auc_rf': test_all_auc_rf, + 'test_all_auprc_rf': test_all_auprc_rf, + 'test_all_f1_rf': test_all_f1_rf, + 'test_all_auc_rf_plus': test_all_auc_rf_plus, + 'test_all_auprc_rf_plus': test_all_auprc_rf_plus, + 'test_all_f1_rf_plus': test_all_f1_rf_plus, + 'rf_plus_fit_time': end - start, } for i in range(100): metric_results[f'sample_train_{i}'] = indices_train[i] metric_results[f'sample_test_{i}'] = indices_test[i] - for i in range(len(seeds)): - metric_results[f'ablation_seed_{i}'] = seeds[i] + + print("Compute feature importance") start = time.time() - local_fi_score_train_subset = fi_est.cls(X_train=X_train, y_train=y_train, - X_train_subset = X_train_subset, y_train_subset=y_train_subset, - X_test=X_test, y_test=y_test, - fit=copy.deepcopy(est), data_fit_on="train_subset", **fi_est.kwargs) - if fi_est.name not in ["LIME_RF_plus", "Kernel_SHAP_RF_plus"]: - local_fi_score_test = fi_est.cls(X_train=X_train, y_train=y_train, - X_train_subset = X_train_subset, y_train_subset=y_train_subset, - X_test=X_test, y_test=y_test, - fit=copy.deepcopy(est), data_fit_on="test", **fi_est.kwargs) - else: - local_fi_score_test = None - local_fi_score_test_subset = fi_est.cls(X_train=X_train, y_train=y_train, - X_train_subset = X_train_subset, y_train_subset=y_train_subset, - X_test=X_test_subset, y_test=y_test_subset, - fit=copy.deepcopy(est), data_fit_on="test", **fi_est.kwargs) + if fi_est.name == "LFI_evaluate_on_all_RF_plus" or fi_est.name == "LFI_evaluate_on_oob_RF_plus": + local_fi_score_train, local_parital_pred_train, local_fi_score_test, local_partial_pred_test, local_fi_score_test_subset, local_partial_pred_test_subset = fi_est.cls(X_train=X_train, y_train=y_train, + X_train_subset = X_train_subset, y_train_subset=y_train_subset, + X_test_subset=X_test_subset, X_test=X_test, + fit=rf_plus_base, **fi_est.kwargs) + local_fi_score_train_subset = local_fi_score_train[indices_train] + local_partial_pred_train_subset = local_parital_pred_train[indices_train] + elif fi_est.name == "LFI_fit_on_inbag_RF" or fi_est.name == "LFI_fit_on_inbag_RF": + local_fi_score_train, local_parital_pred_train, local_fi_score_test, local_partial_pred_test, local_fi_score_test_subset, local_partial_pred_test_subset = fi_est.cls(X_train=X_train, y_train=y_train, + X_train_subset = X_train_subset, y_train_subset=y_train_subset, + X_test_subset=X_test_subset, X_test=X_test, + fit=copy.deepcopy(est), **fi_est.kwargs) + local_fi_score_train_subset = local_fi_score_train[indices_train] + local_partial_pred_train_subset = local_parital_pred_train[indices_train] + elif fi_est.name == "TreeSHAP_RF": + local_fi_score_train_subset, local_fi_score_test, local_fi_score_test_subset = fi_est.cls(X_train=X_train, y_train=y_train, + X_train_subset = X_train_subset, y_train_subset=y_train_subset, + X_test_subset=X_test_subset, X_test=X_test, + 
fit=copy.deepcopy(est), **fi_est.kwargs) + elif fi_est.name == "Kernel_SHAP_RF_plus" or fi_est.name == "LIME_RF_plus": + local_fi_score_train_subset, local_fi_score_test, local_fi_score_test_subset = fi_est.cls(X_train=X_train, y_train=y_train, + X_train_subset = X_train_subset, y_train_subset=y_train_subset, + X_test_subset=X_test_subset, X_test=X_test, + fit=rf_plus_base, **fi_est.kwargs) end = time.time() metric_results['fi_time'] = end - start - # feature_importance_list.append(local_fi_score_train_subset) + feature_importance_list.append(local_fi_score_train_subset) feature_importance_list.append(local_fi_score_test) feature_importance_list.append(local_fi_score_test_subset) @@ -208,13 +219,21 @@ def compare_estimators(estimators: List[ModelConfig], "LogisticCV": LogisticRegressionCV(random_state=42), "SVM": SVC(random_state=42, probability=True), "XGBoost_Classifier": xgb.XGBClassifier(random_state=42), - "RF_Plus_Classifier": RandomForestPlusClassifier(rf_model=RandomForestClassifier(n_estimators=100, min_samples_leaf=1, max_features='sqrt', random_state=42))} + "RF_Plus_Classifier": rf_plus_base} + start = time.time() + for a_model in ablation_models: + if a_model != "RF_Plus_Classifier": + ablation_models[a_model].fit(X_train, y_train) + end = time.time() + metric_results['ablation_model_fit_time'] = end - start + print("start ablation") # Subset Train data ablation for all FI methods start = time.time() for a_model in ablation_models: ablation_est = ablation_models[a_model] - ablation_est.fit(X_train, y_train) + if a_model != "RF_Plus_Classifier": + ablation_est.fit(X_train, y_train) y_pred = ablation_est.predict_proba(X_train_subset)[:, 1] metric_results[a_model+'_train_subset_AUROC_before_ablation'] = roc_auc_score(y_train_subset, y_pred) metric_results[a_model+'_train_subset_AUPRC_before_ablation'] = average_precision_score(y_train_subset, y_pred) @@ -225,31 +244,30 @@ def compare_estimators(estimators: List[ModelConfig], ablation_results_auroc_list = [0] * X_train_subset.shape[1] ablation_results_auprc_list = [0] * X_train_subset.shape[1] ablation_results_f1_list = [0] * X_train_subset.shape[1] - for seed in seeds: - for i in range(X_train_subset.shape[1]): - if fi_est.ascending: - ablation_X_train_subset = ablation_to_mean(X_train, X_train_subset, imp_vals, "max", i+1) - else: - ablation_X_train_subset = ablation_to_mean(X_train, X_train_subset, imp_vals, "min", i+1) - ablation_results_auroc_list[i] += roc_auc_score(y_train_subset, ablation_est.predict_proba(ablation_X_train_subset)[:, 1]) - ablation_results_auprc_list[i] += average_precision_score(y_train_subset, ablation_est.predict_proba(ablation_X_train_subset)[:, 1]) - ablation_results_f1_list[i] += f1_score(y_train_subset, ablation_est.predict_proba(ablation_X_train_subset)[:, 1] > 0.5) - ablation_results_f1_list = [x / number_of_ablations for x in ablation_results_f1_list] - ablation_results_auroc_list = [x / number_of_ablations for x in ablation_results_auroc_list] - ablation_results_auprc_list = [x / number_of_ablations for x in ablation_results_auprc_list] + for i in range(X_train_subset.shape[1]): + if fi_est.ascending: + ablation_X_train_subset = ablation_to_mean(X_train, X_train_subset, imp_vals, "max", i+1) + else: + ablation_X_train_subset = ablation_to_mean(X_train, X_train_subset, imp_vals, "min", i+1) + ablation_results_auroc_list[i] += roc_auc_score(y_train_subset, ablation_est.predict_proba(ablation_X_train_subset)[:, 1]) + ablation_results_auprc_list[i] += average_precision_score(y_train_subset, 
ablation_est.predict_proba(ablation_X_train_subset)[:, 1]) + ablation_results_f1_list[i] += f1_score(y_train_subset, ablation_est.predict_proba(ablation_X_train_subset)[:, 1] > 0.5) for i in range(X_train_subset.shape[1]): metric_results[f'{a_model}_train_subset_AUROC_after_ablation_{i+1}'] = ablation_results_auroc_list[i] metric_results[f'{a_model}_train_subset_AUPRC_after_ablation_{i+1}'] = ablation_results_auprc_list[i] metric_results[f'{a_model}_train_subset_F1_after_ablation_{i+1}'] = ablation_results_f1_list[i] end = time.time() - metric_results['train_subset_data_ablation_time'] = end - start + print(f"done with ablation train subset {end - start}") + metric_results['train_subset_ablation_time'] = end - start + # Test data ablation # Subset test data ablation for all FI methods - removal start = time.time() for a_model in ablation_models: ablation_est = ablation_models[a_model] - ablation_est.fit(X_train, y_train) + if a_model != "RF_Plus_Classifier": + ablation_est.fit(X_train, y_train) y_pred_subset = ablation_est.predict_proba(X_test_subset)[:, 1] metric_results[a_model+'_test_subset_AUROC_before_ablation'] = roc_auc_score(y_test_subset, y_pred_subset) metric_results[a_model+'_test_subset_AUPRC_before_ablation'] = average_precision_score(y_test_subset, y_pred_subset) @@ -260,31 +278,29 @@ def compare_estimators(estimators: List[ModelConfig], ablation_results_auroc_list = [0] * X_test_subset.shape[1] ablation_results_auprc_list = [0] * X_test_subset.shape[1] ablation_results_f1_list = [0] * X_test_subset.shape[1] - for seed in seeds: - for i in range(X_test_subset.shape[1]): - if fi_est.ascending: - ablation_X_test_subset = ablation_to_mean(X_train, X_test_subset, imp_vals, "max", i+1) - else: - ablation_X_test_subset = ablation_to_mean(X_train, X_test_subset, imp_vals, "min", i+1) - ablation_results_auroc_list[i] += roc_auc_score(y_test_subset, ablation_est.predict_proba(ablation_X_test_subset)[:, 1]) - ablation_results_auprc_list[i] += average_precision_score(y_test_subset, ablation_est.predict_proba(ablation_X_test_subset)[:, 1]) - ablation_results_f1_list[i] += f1_score(y_test_subset, ablation_est.predict_proba(ablation_X_test_subset)[:, 1] > 0.5) - ablation_results_f1_list = [x / number_of_ablations for x in ablation_results_f1_list] - ablation_results_auroc_list = [x / number_of_ablations for x in ablation_results_auroc_list] - ablation_results_auprc_list = [x / number_of_ablations for x in ablation_results_auprc_list] + for i in range(X_test_subset.shape[1]): + if fi_est.ascending: + ablation_X_test_subset = ablation_to_mean(X_train, X_test_subset, imp_vals, "max", i+1) + else: + ablation_X_test_subset = ablation_to_mean(X_train, X_test_subset, imp_vals, "min", i+1) + ablation_results_auroc_list[i] += roc_auc_score(y_test_subset, ablation_est.predict_proba(ablation_X_test_subset)[:, 1]) + ablation_results_auprc_list[i] += average_precision_score(y_test_subset, ablation_est.predict_proba(ablation_X_test_subset)[:, 1]) + ablation_results_f1_list[i] += f1_score(y_test_subset, ablation_est.predict_proba(ablation_X_test_subset)[:, 1] > 0.5) for i in range(X_test_subset.shape[1]): metric_results[f'{a_model}_test_subset_AUROC_after_ablation_{i+1}'] = ablation_results_auroc_list[i] metric_results[f'{a_model}_test_subset_AUPRC_after_ablation_{i+1}'] = ablation_results_auprc_list[i] metric_results[f'{a_model}_test_subset_F1_after_ablation_{i+1}'] = ablation_results_f1_list[i] end = time.time() - metric_results['test_subset_ablation_time'] = end - start + print(f"done with ablation 1 
test subset {end - start}") + metric_results['test_subset_ablation_1_time'] = end - start # Subset test data ablation for all FI methods - addition start = time.time() for a_model in ablation_models: ablation_est = ablation_models[a_model] - ablation_est.fit(X_train, y_train) + if a_model != "RF_Plus_Classifier": + ablation_est.fit(X_train, y_train) metric_results[a_model+'_test_subset_AUROC_before_ablation_blank'] = roc_auc_score(y_test_subset, ablation_est.predict(np.zeros(X_test_subset.shape))) metric_results[a_model+'_test_subset_AUPRC_before_ablation_blank'] = average_precision_score(y_test_subset, ablation_est.predict(np.zeros(X_test_subset.shape))) metric_results[a_model+'_test_subset_F1_before_ablation_blank'] = f1_score(y_test_subset, ablation_est.predict(np.zeros(X_test_subset.shape)) > 0.5) @@ -294,31 +310,29 @@ def compare_estimators(estimators: List[ModelConfig], ablation_results_auroc_list = [0] * X_test_subset.shape[1] ablation_results_auprc_list = [0] * X_test_subset.shape[1] ablation_results_f1_list = [0] * X_test_subset.shape[1] - for seed in seeds: - for i in range(X_test_subset.shape[1]): - if fi_est.ascending: - ablation_X_test_subset_blank = ablation_by_addition(X_test_subset, imp_vals, "max", i+1) - else: - ablation_X_test_subset_blank = ablation_by_addition(X_test_subset, imp_vals, "min", i+1) - ablation_results_auroc_list[i] += roc_auc_score(y_test_subset, ablation_est.predict_proba(ablation_X_test_subset_blank)[:, 1]) - ablation_results_auprc_list[i] += average_precision_score(y_test_subset, ablation_est.predict_proba(ablation_X_test_subset_blank)[:, 1]) - ablation_results_f1_list[i] += f1_score(y_test_subset, ablation_est.predict_proba(ablation_X_test_subset_blank)[:, 1] > 0.5) - ablation_results_f1_list = [x / number_of_ablations for x in ablation_results_f1_list] - ablation_results_auroc_list = [x / number_of_ablations for x in ablation_results_auroc_list] - ablation_results_auprc_list = [x / number_of_ablations for x in ablation_results_auprc_list] + for i in range(X_test_subset.shape[1]): + if fi_est.ascending: + ablation_X_test_subset_blank = ablation_by_addition(X_train, X_test_subset, imp_vals, "max", i+1) + else: + ablation_X_test_subset_blank = ablation_by_addition(X_train, X_test_subset, imp_vals, "min", i+1) + ablation_results_auroc_list[i] += roc_auc_score(y_test_subset, ablation_est.predict_proba(ablation_X_test_subset_blank)[:, 1]) + ablation_results_auprc_list[i] += average_precision_score(y_test_subset, ablation_est.predict_proba(ablation_X_test_subset_blank)[:, 1]) + ablation_results_f1_list[i] += f1_score(y_test_subset, ablation_est.predict_proba(ablation_X_test_subset_blank)[:, 1] > 0.5) for i in range(X_test_subset.shape[1]): metric_results[f'{a_model}_test_subset_AUROC_after_ablation_{i+1}_blank'] = ablation_results_auroc_list[i] metric_results[f'{a_model}_test_subset_AUPRC_after_ablation_{i+1}_blank'] = ablation_results_auprc_list[i] metric_results[f'{a_model}_test_subset_F1_after_ablation_{i+1}_blank'] = ablation_results_f1_list[i] end = time.time() - metric_results['test_subset_blank_ablation_time'] = end - start + print(f"done with ablation 2 test subset {end - start}") + metric_results['test_subset_ablation_2_time'] = end - start # Whole test data ablation for all FI methods except for KernelSHAP and LIME if fi_est.name not in ["LIME_RF_plus", "Kernel_SHAP_RF_plus"]: start = time.time() for a_model in ablation_models: ablation_est = ablation_models[a_model] - ablation_est.fit(X_train, y_train) + if a_model != "RF_Plus_Classifier": + 
ablation_est.fit(X_train, y_train) y_pred = ablation_est.predict_proba(X_test)[:, 1] metric_results[a_model+'_test_AUROC_before_ablation'] = roc_auc_score(y_test, y_pred) metric_results[a_model+'_test_AUPRC_before_ablation'] = average_precision_score(y_test, y_pred) @@ -329,24 +343,21 @@ def compare_estimators(estimators: List[ModelConfig], ablation_results_auroc_list = [0] * X_test.shape[1] ablation_results_auprc_list = [0] * X_test.shape[1] ablation_results_f1_list = [0] * X_test.shape[1] - for seed in seeds: - for i in range(X_test.shape[1]): - if fi_est.ascending: - ablation_X_test = ablation_to_mean(X_train, X_test, imp_vals, "max", i+1) - else: - ablation_X_test = ablation_to_mean(X_train, X_test, imp_vals, "min", i+1) - ablation_results_auroc_list[i] += roc_auc_score(y_test, ablation_est.predict_proba(ablation_X_test)[:, 1]) - ablation_results_auprc_list[i] += average_precision_score(y_test, ablation_est.predict_proba(ablation_X_test)[:, 1]) - ablation_results_f1_list[i] += f1_score(y_test, ablation_est.predict_proba(ablation_X_test)[:, 1] > 0.5) - ablation_results_f1_list = [x / number_of_ablations for x in ablation_results_f1_list] - ablation_results_auroc_list = [x / number_of_ablations for x in ablation_results_auroc_list] - ablation_results_auprc_list = [x / number_of_ablations for x in ablation_results_auprc_list] + for i in range(X_test.shape[1]): + if fi_est.ascending: + ablation_X_test = ablation_to_mean(X_train, X_test, imp_vals, "max", i+1) + else: + ablation_X_test = ablation_to_mean(X_train, X_test, imp_vals, "min", i+1) + ablation_results_auroc_list[i] += roc_auc_score(y_test, ablation_est.predict_proba(ablation_X_test)[:, 1]) + ablation_results_auprc_list[i] += average_precision_score(y_test, ablation_est.predict_proba(ablation_X_test)[:, 1]) + ablation_results_f1_list[i] += f1_score(y_test, ablation_est.predict_proba(ablation_X_test)[:, 1] > 0.5) for i in range(X_test.shape[1]): metric_results[f'{a_model}_test_AUROC_after_ablation_{i+1}'] = ablation_results_auroc_list[i] metric_results[f'{a_model}_test_AUPRC_after_ablation_{i+1}'] = ablation_results_auprc_list[i] metric_results[f'{a_model}_test_F1_after_ablation_{i+1}'] = ablation_results_f1_list[i] end = time.time() metric_results['test_data_ablation_time'] = end - start + print(f"done with ablation test {end - start}") else: for a_model in ablation_models: metric_results[a_model+'_test_AUROC_before_ablation'] = None @@ -357,7 +368,8 @@ def compare_estimators(estimators: List[ModelConfig], metric_results[f'{a_model}_test_AUPRC_after_ablation_{i+1}'] = None metric_results[f'{a_model}_test_F1_after_ablation_{i+1}'] = None metric_results["test_data_ablation_time"] = None - print(f"fi: {fi_est.name} ablation done with time: {end - start}") + + print(f"fi: {fi_est.name} all ablation done") # initialize results with metadata and metric results kwargs: dict = model.kwargs # dict diff --git a/feature_importance/01_run_ablation_regression.py b/feature_importance/01_run_ablation_regression.py index 3501404..207f7bd 100644 --- a/feature_importance/01_run_ablation_regression.py +++ b/feature_importance/01_run_ablation_regression.py @@ -21,12 +21,10 @@ from sklearn.ensemble import RandomForestRegressor from sklearn.linear_model import LinearRegression import xgboost as xgb +from imodels.tree.rf_plus.rf_plus.rf_plus_models import RandomForestPlusRegressor, RandomForestPlusClassifier sys.path.append(".") sys.path.append("..") sys.path.append("../..") -sys.path.append("/accounts/grad/zachrewolinski/research/imodels") 
-print("sys.path", sys.path) -from imodels.importance import RandomForestPlusRegressor, RandomForestPlusClassifier import fi_config from util import ModelConfig, FIModelConfig, tp, fp, neg, pos, specificity_score, auroc_score, auprc_score, compute_nsg_feat_corr_w_sig_subspace, apply_splitting_strategy @@ -63,11 +61,11 @@ def ablation_to_mean(train, data, feature_importance, mode, num_features): """ - Replace the top num_features max feature importance data with random shuffle for each sample + Replace the top num_features max feature importance data with mean value for each sample """ train_mean = np.mean(train, axis=0) assert mode in ["max", "min"] - fi = feature_importance.to_numpy() + fi = feature_importance if mode == "max": indices = np.argsort(-fi) else: @@ -78,17 +76,18 @@ def ablation_to_mean(train, data, feature_importance, mode, num_features): data_copy[i, indices[i,j]] = train_mean[indices[i,j]] return data_copy -def ablation_by_addition(data, feature_importance, mode, num_features): +def ablation_by_addition(train, data, feature_importance, mode, num_features): """ - Initialize the data with zeros and add the top num_features max feature importance data for each sample + Initialize the data with mean values and add the top num_features max feature importance data for each sample """ assert mode in ["max", "min"] - fi = feature_importance.to_numpy() + fi = feature_importance if mode == "max": indices = np.argsort(-fi) else: indices = np.argsort(fi) - data_copy = np.zeros(data.shape) + row_values = np.mean(train, axis=0).tolist() + data_copy = np.array([row_values] * data.shape[0]) for i in range(data.shape[0]): for j in range(num_features): data_copy[i, indices[i,j]] = data[i, indices[i,j]] @@ -137,19 +136,18 @@ def compare_estimators(estimators: List[ModelConfig], y_train = y y_test = y - normalizer = preprocessing.Normalizer() - if splitting_strategy == "train-test": - X_train = normalizer.fit_transform(X_train) - X_test = normalizer.transform(X_test) - else: - X = normalizer.fit_transform(X) - X_train = normalizer.transform(X_train) - X_test = normalizer.transform(X_test) - - # fit model + # fit RF model est.fit(X_train, y_train) - test_all_mse = mean_squared_error(y_test, est.predict(X_test)) - test_all_r2 = r2_score(y_test, est.predict(X_test)) + test_all_mse_rf = mean_squared_error(y_test, est.predict(X_test)) + test_all_r2_rf = r2_score(y_test, est.predict(X_test)) + + # fit RF_plus model + start = time.time() + rf_plus_base = RandomForestPlusRegressor(rf_model=est) + rf_plus_base.fit(X_train, y_train) + end = time.time() + test_all_mse_rf_plus = mean_squared_error(y_test, rf_plus_base.predict(X_test)) + test_all_r2_rf_plus = r2_score(y_test, rf_plus_base.predict(X_test)) np.random.seed(42) indices_train = np.random.choice(X_train.shape[0], 100, replace=False) @@ -160,57 +158,74 @@ def compare_estimators(estimators: List[ModelConfig], y_test_subset = y_test[indices_test] # loop over fi estimators - rng = np.random.RandomState() - number_of_ablations = 1 - seeds = rng.randint(0, 10000, number_of_ablations) for fi_est in tqdm(fi_ests): metric_results = { 'model': model.name, 'fi': fi_est.name, 'train_size': X_train.shape[0], + 'train_subset_size': X_train_subset.shape[0], 'test_size': X_test.shape[0], + 'test_subset_size': X_test_subset.shape[0], 'num_features': X_train.shape[1], 'data_split_seed': args.split_seed, - 'test_all_mse': test_all_mse, - 'test_all_r2': test_all_r2 + 'test_all_mse_rf': test_all_mse_rf, + 'test_all_r2_rf': test_all_r2_rf, + 'test_all_mse_rf_plus': 
test_all_mse_rf_plus, + 'test_all_r2_rf_plus': test_all_r2_rf_plus, + 'rf_plus_fit_time': end - start, } for i in range(100): metric_results[f'sample_train_{i}'] = indices_train[i] metric_results[f'sample_test_{i}'] = indices_test[i] - for i in range(len(seeds)): - metric_results[f'ablation_seed_{i}'] = seeds[i] + + print("Compute feature importance") start = time.time() - local_fi_score_train_subset = fi_est.cls(X_train=X_train, y_train=y_train, - X_train_subset = X_train_subset, y_train_subset=y_train_subset, - X_test=X_test, y_test=y_test, - fit=copy.deepcopy(est), data_fit_on="train_subset", **fi_est.kwargs) - if fi_est.name not in ["LIME_RF_plus", "Kernel_SHAP_RF_plus"]: - local_fi_score_test = fi_est.cls(X_train=X_train, y_train=y_train, - X_train_subset = X_train_subset, y_train_subset=y_train_subset, - X_test=X_test, y_test=y_test, - fit=copy.deepcopy(est), data_fit_on="test", **fi_est.kwargs) - else: - local_fi_score_test = None - local_fi_score_test_subset = fi_est.cls(X_train=X_train, y_train=y_train, - X_train_subset = X_train_subset, y_train_subset=y_train_subset, - X_test=X_test_subset, y_test=y_test_subset, - fit=copy.deepcopy(est), data_fit_on="test", **fi_est.kwargs) + if fi_est.name == "LFI_evaluate_on_all_RF_plus" or fi_est.name == "LFI_evaluate_on_oob_RF_plus": + local_fi_score_train, local_parital_pred_train, local_fi_score_test, local_partial_pred_test, local_fi_score_test_subset, local_partial_pred_test_subset = fi_est.cls(X_train=X_train, y_train=y_train, + X_train_subset = X_train_subset, y_train_subset=y_train_subset, + X_test_subset=X_test_subset, X_test=X_test, + fit=rf_plus_base, **fi_est.kwargs) + local_fi_score_train_subset = local_fi_score_train[indices_train] + local_partial_pred_train_subset = local_parital_pred_train[indices_train] + elif fi_est.name == "LFI_fit_on_inbag_RF" or fi_est.name == "LFI_fit_on_inbag_RF": + local_fi_score_train, local_parital_pred_train, local_fi_score_test, local_partial_pred_test, local_fi_score_test_subset, local_partial_pred_test_subset = fi_est.cls(X_train=X_train, y_train=y_train, + X_train_subset = X_train_subset, y_train_subset=y_train_subset, + X_test_subset=X_test_subset, X_test=X_test, + fit=copy.deepcopy(est), **fi_est.kwargs) + local_fi_score_train_subset = local_fi_score_train[indices_train] + local_partial_pred_train_subset = local_parital_pred_train[indices_train] + elif fi_est.name == "TreeSHAP_RF": + local_fi_score_train_subset, local_fi_score_test, local_fi_score_test_subset = fi_est.cls(X_train=X_train, y_train=y_train, + X_train_subset = X_train_subset, y_train_subset=y_train_subset, + X_test_subset=X_test_subset, X_test=X_test, + fit=copy.deepcopy(est), **fi_est.kwargs) + elif fi_est.name == "Kernel_SHAP_RF_plus" or fi_est.name == "LIME_RF_plus": + local_fi_score_train_subset, local_fi_score_test, local_fi_score_test_subset = fi_est.cls(X_train=X_train, y_train=y_train, + X_train_subset = X_train_subset, y_train_subset=y_train_subset, + X_test_subset=X_test_subset, X_test=X_test, + fit=rf_plus_base, **fi_est.kwargs) end = time.time() metric_results['fi_time'] = end - start - # feature_importance_list.append(local_fi_score_train_subset) + feature_importance_list.append(local_fi_score_train_subset) feature_importance_list.append(local_fi_score_test) feature_importance_list.append(local_fi_score_test_subset) ablation_models = {"RF_Regressor": RandomForestRegressor(n_estimators=100,min_samples_leaf=5,max_features=0.33,random_state=42), "Linear": LinearRegression(), "XGB_Regressor": 
xgb.XGBRegressor(random_state=42), - "RF_Plus_Regressor":RandomForestPlusRegressor(rf_model=RandomForestRegressor(n_estimators=100,min_samples_leaf=5,max_features=0.33,random_state=42))} + "RF_Plus_Regressor": rf_plus_base} + start = time.time() + for a_model in ablation_models: + if a_model != "RF_Plus_Regressor": + ablation_models[a_model].fit(X_train, y_train) + end = time.time() + metric_results['ablation_model_fit_time'] = end - start + print("start ablation") # Subset Train data ablation for all FI methods start = time.time() for a_model in ablation_models: ablation_est = ablation_models[a_model] - ablation_est.fit(X_train, y_train) y_pred_subset = ablation_est.predict(X_train_subset) metric_results[a_model + '_train_subset_MSE_before_ablation'] = mean_squared_error(y_train_subset, y_pred_subset) metric_results[a_model + '_train_subset_R_2_before_ablation'] = r2_score(y_train_subset, y_pred_subset) @@ -219,20 +234,18 @@ def compare_estimators(estimators: List[ModelConfig], imp_vals[imp_vals == float("inf")] = sys.maxsize - 1 ablation_results_list = [0] * X_train_subset.shape[1] ablation_results_list_r2 = [0] * X_train_subset.shape[1] - for seed in seeds: - for i in range(X_train_subset.shape[1]): - if fi_est.ascending: - ablation_X_train_subset = ablation_to_mean(X_train, X_train_subset, imp_vals, "max", i+1) - else: - ablation_X_train_subset = ablation_to_mean(X_train, X_train_subset, imp_vals, "min", i+1) - ablation_results_list[i] += mean_squared_error(y_train_subset, ablation_est.predict(ablation_X_train_subset)) - ablation_results_list_r2[i] += r2_score(y_train_subset, ablation_est.predict(ablation_X_train_subset)) - ablation_results_list = [x / len(seeds) for x in ablation_results_list] - ablation_results_list_r2 = [x / len(seeds) for x in ablation_results_list_r2] + for i in range(X_train_subset.shape[1]): + if fi_est.ascending: + ablation_X_train_subset = ablation_to_mean(X_train, X_train_subset, imp_vals, "max", i+1) + else: + ablation_X_train_subset = ablation_to_mean(X_train, X_train_subset, imp_vals, "min", i+1) + ablation_results_list[i] += mean_squared_error(y_train_subset, ablation_est.predict(ablation_X_train_subset)) + ablation_results_list_r2[i] += r2_score(y_train_subset, ablation_est.predict(ablation_X_train_subset)) for i in range(X_train.shape[1]): metric_results[f'{a_model}_train_subset_MSE_after_ablation_{i+1}'] = ablation_results_list[i] metric_results[f'{a_model}_train_subset_R_2_after_ablation_{i+1}'] = ablation_results_list_r2[i] end = time.time() + print(f"done with ablation train subset {end - start}") metric_results['train_subset_ablation_time'] = end - start # Test data ablation @@ -240,7 +253,6 @@ def compare_estimators(estimators: List[ModelConfig], start = time.time() for a_model in ablation_models: ablation_est = ablation_models[a_model] - ablation_est.fit(X_train, y_train) y_pred_subset = ablation_est.predict(X_test_subset) metric_results[a_model + '_test_subset_MSE_before_ablation'] = mean_squared_error(y_test_subset, y_pred_subset) metric_results[a_model + '_test_subset_R_2_before_ablation'] = r2_score(y_test_subset, y_pred_subset) @@ -249,28 +261,25 @@ def compare_estimators(estimators: List[ModelConfig], imp_vals[imp_vals == float("inf")] = sys.maxsize - 1 ablation_results_list = [0] * X_test_subset.shape[1] ablation_results_list_r2 = [0] * X_test_subset.shape[1] - for seed in seeds: - for i in range(X_test_subset.shape[1]): - if fi_est.ascending: - ablation_X_test_subset = ablation_to_mean(X_train, X_test_subset, imp_vals, "max", i+1) - else: 
- ablation_X_test_subset = ablation_to_mean(X_train, X_test_subset, imp_vals, "min", i+1) - ablation_results_list[i] += mean_squared_error(y_test_subset, ablation_est.predict(ablation_X_test_subset)) - ablation_results_list_r2[i] += r2_score(y_test_subset, ablation_est.predict(ablation_X_test_subset)) - ablation_results_list = [x / len(seeds) for x in ablation_results_list] - ablation_results_list_r2 = [x / len(seeds) for x in ablation_results_list_r2] + for i in range(X_test_subset.shape[1]): + if fi_est.ascending: + ablation_X_test_subset = ablation_to_mean(X_train, X_test_subset, imp_vals, "max", i+1) + else: + ablation_X_test_subset = ablation_to_mean(X_train, X_test_subset, imp_vals, "min", i+1) + ablation_results_list[i] += mean_squared_error(y_test_subset, ablation_est.predict(ablation_X_test_subset)) + ablation_results_list_r2[i] += r2_score(y_test_subset, ablation_est.predict(ablation_X_test_subset)) for i in range(X_test_subset.shape[1]): metric_results[f'{a_model}_test_subset_MSE_after_ablation_{i+1}'] = ablation_results_list[i] metric_results[f'{a_model}_test_subset_R_2_after_ablation_{i+1}'] = ablation_results_list_r2[i] end = time.time() - metric_results['test_subset_ablation_time'] = end - start + print(f"done with ablation 1 test subset {end - start}") + metric_results['test_subset_ablation_1_time'] = end - start # Subset test data ablation for all FI methods - addition start = time.time() for a_model in ablation_models: ablation_est = ablation_models[a_model] - ablation_est.fit(X_train, y_train) metric_results[a_model + '_test_subset_MSE_before_ablation_blank'] = mean_squared_error(y_test_subset, ablation_est.predict(np.zeros(X_test_subset.shape))) metric_results[a_model + '_test_subset_R_2_before_ablation_blank'] = r2_score(y_test_subset, ablation_est.predict(np.zeros(X_test_subset.shape))) imp_vals = copy.deepcopy(local_fi_score_test_subset) @@ -278,28 +287,25 @@ def compare_estimators(estimators: List[ModelConfig], imp_vals[imp_vals == float("inf")] = sys.maxsize - 1 ablation_results_list = [0] * X_test_subset.shape[1] ablation_results_list_r2 = [0] * X_test_subset.shape[1] - for seed in seeds: - for i in range(X_test_subset.shape[1]): - if fi_est.ascending: - ablation_X_test_subset_blank = ablation_by_addition(X_test_subset, imp_vals, "max", i+1) - else: - ablation_X_test_subset_blank = ablation_by_addition(X_test_subset, imp_vals, "min", i+1) - ablation_results_list[i] += mean_squared_error(y_test_subset, ablation_est.predict(ablation_X_test_subset_blank)) - ablation_results_list_r2[i] += r2_score(y_test_subset, ablation_est.predict(ablation_X_test_subset_blank)) - ablation_results_list = [x / len(seeds) for x in ablation_results_list] - ablation_results_list_r2 = [x / len(seeds) for x in ablation_results_list_r2] + for i in range(X_test_subset.shape[1]): + if fi_est.ascending: + ablation_X_test_subset_blank = ablation_by_addition(X_train, X_test_subset, imp_vals, "max", i+1) + else: + ablation_X_test_subset_blank = ablation_by_addition(X_train, X_test_subset, imp_vals, "min", i+1) + ablation_results_list[i] += mean_squared_error(y_test_subset, ablation_est.predict(ablation_X_test_subset_blank)) + ablation_results_list_r2[i] += r2_score(y_test_subset, ablation_est.predict(ablation_X_test_subset_blank)) for i in range(X_test_subset.shape[1]): metric_results[f'{a_model}_test_subset_MSE_after_ablation_{i+1}_blank'] = ablation_results_list[i] metric_results[f'{a_model}_test_subset_R_2_after_ablation_{i+1}_blank'] = ablation_results_list_r2[i] end = time.time() - 
metric_results['test_subset_blank_ablation_time'] = end - start + print(f"done with ablation 2 test subset {end - start}") + metric_results['test_subset_ablation_2_time'] = end - start # Whole test data ablation for all FI methods except for KernelSHAP and LIME if fi_est.name not in ["LIME_RF_plus", "Kernel_SHAP_RF_plus"]: start = time.time() for a_model in ablation_models: ablation_est = ablation_models[a_model] - ablation_est.fit(X_train, y_train) y_pred = ablation_est.predict(X_test) metric_results[a_model + '_test_MSE_before_ablation'] = mean_squared_error(y_test, y_pred) metric_results[a_model + '_test_R_2_before_ablation'] = r2_score(y_test, y_pred) @@ -308,21 +314,19 @@ def compare_estimators(estimators: List[ModelConfig], imp_vals[imp_vals == float("inf")] = sys.maxsize - 1 ablation_results_list = [0] * X_test.shape[1] ablation_results_list_r2 = [0] * X_test.shape[1] - for seed in seeds: - for i in range(X_test.shape[1]): - if fi_est.ascending: - ablation_X_test = ablation_to_mean(X_train, X_test, imp_vals, "max", i+1) - else: - ablation_X_test = ablation_to_mean(X_train, X_test, imp_vals, "min", i+1) - ablation_results_list[i] += mean_squared_error(y_test, ablation_est.predict(ablation_X_test)) - ablation_results_list_r2[i] += r2_score(y_test, ablation_est.predict(ablation_X_test)) - ablation_results_list = [x / len(seeds) for x in ablation_results_list] - ablation_results_list_r2 = [x / len(seeds) for x in ablation_results_list_r2] + for i in range(X_test.shape[1]): + if fi_est.ascending: + ablation_X_test = ablation_to_mean(X_train, X_test, imp_vals, "max", i+1) + else: + ablation_X_test = ablation_to_mean(X_train, X_test, imp_vals, "min", i+1) + ablation_results_list[i] += mean_squared_error(y_test, ablation_est.predict(ablation_X_test)) + ablation_results_list_r2[i] += r2_score(y_test, ablation_est.predict(ablation_X_test)) for i in range(X_test.shape[1]): metric_results[f'{a_model}_test_MSE_after_ablation_{i+1}'] = ablation_results_list[i] metric_results[f'{a_model}_test_R_2_after_ablation_{i+1}'] = ablation_results_list_r2[i] end = time.time() metric_results['test_data_ablation_time'] = end - start + print(f"done with ablation test {end - start}") else: for a_model in ablation_models: metric_results[a_model + '_test_MSE_before_ablation'] = None @@ -332,7 +336,7 @@ def compare_estimators(estimators: List[ModelConfig], metric_results[f'{a_model}_test_R_2_after_ablation_{i+1}'] = None metric_results["test_data_ablation_time"] = None - print(f"fi: {fi_est.name} ablation done with time: {end - start}") + print(f"fi: {fi_est.name} all ablation done") # initialize results with metadata and metric results kwargs: dict = model.kwargs # dict diff --git a/feature_importance/feature_ranking.sh b/feature_importance/feature_ranking.sh index 659942e..0a7ae14 100644 --- a/feature_importance/feature_ranking.sh +++ b/feature_importance/feature_ranking.sh @@ -3,7 +3,7 @@ #SBATCH --mail-type=ALL source activate mdi -command="ranking_importance_local_sims.py --nreps 1 --config mdi_local.real_x_sim_y.diabetes-classification.lss-model --split_seed ${1} --ignore_cache --create_rmd --result_name diabetes-class-lss" +command="ranking_importance_local_sims.py --nreps 1 --config mdi_local.real_x_sim_y.diabetes-regression.lss-model --split_seed 6 --ignore_cache --create_rmd --result_name diabetes-reg-lss" # Execute the command python $command \ No newline at end of file diff --git a/feature_importance/fi_config/mdi_local/real_x_sim_y/diabetes-regression/hierarchical-polynomial/dgp.py 
b/feature_importance/fi_config/mdi_local/real_x_sim_y/diabetes-regression/hierarchical-polynomial/dgp.py index 98cc33d..fd592dc 100644 --- a/feature_importance/fi_config/mdi_local/real_x_sim_y/diabetes-regression/hierarchical-polynomial/dgp.py +++ b/feature_importance/fi_config/mdi_local/real_x_sim_y/diabetes-regression/hierarchical-polynomial/dgp.py @@ -3,11 +3,13 @@ from feature_importance.scripts.simulations_util import * -X_DGP = sample_real_X +X_DGP = sample_real_data_X X_PARAMS_DICT = { - "fpath": "../data/regression_data/Diabetes_regression/X_diabetes_regression.csv", - "sample_row_n": 442 + "source": "imodels", + "data_name": "diabetes_regr", + "sample_row_n": None } + Y_DGP = hierarchical_poly Y_PARAMS_DICT = { "beta": 1, diff --git a/feature_importance/fi_config/mdi_local/real_x_sim_y/diabetes-regression/hierarchical-polynomial/models.py b/feature_importance/fi_config/mdi_local/real_x_sim_y/diabetes-regression/hierarchical-polynomial/models.py index 0d225d0..5d578cb 100644 --- a/feature_importance/fi_config/mdi_local/real_x_sim_y/diabetes-regression/hierarchical-polynomial/models.py +++ b/feature_importance/fi_config/mdi_local/real_x_sim_y/diabetes-regression/hierarchical-polynomial/models.py @@ -2,24 +2,21 @@ import numpy as np from feature_importance.util import ModelConfig, FIModelConfig from sklearn.ensemble import RandomForestRegressor -from imodels.importance.rf_plus import RandomForestPlusRegressor from feature_importance.scripts.competing_methods_local import * - +from sklearn.linear_model import Ridge ESTIMATORS = [ [ModelConfig('RF', RandomForestRegressor, model_type='tree', - other_params={'n_estimators': 100, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'random_state': 42})], - [ModelConfig('RF_plus', RandomForestPlusRegressor, model_type='t_plus', - other_params={'rf_model': RandomForestRegressor(n_estimators=100, min_samples_leaf=1, max_features='sqrt', random_state=42)})] + other_params={'n_estimators': 100, 'min_samples_leaf': 5, 'max_features': 0.33, 'random_state': 42})] ] FI_ESTIMATORS = [ - [FIModelConfig('LFI_with_raw_RF', LFI_evaluation_RF, model_type='tree', splitting_strategy = "train-test")], - [FIModelConfig('MDI_RF', LFI_evaluation_RF, model_type='tree', splitting_strategy = "train-test", other_params={"include_raw": False, "cv_ridge": 0, "calc_loo_coef":False, "sample_split":"inbag"})], - [FIModelConfig('LFI_with_raw_OOB_RF', LFI_evaluation_RF, model_type='tree', splitting_strategy = "train-test", other_params={"sample_split":"oob", "fit_on":"test", "calc_loo_coef":False})], [FIModelConfig('TreeSHAP_RF', tree_shap_evaluation_RF, model_type='tree', splitting_strategy = "train-test")], - [FIModelConfig('LFI_with_raw_RF_plus', LFI_evaluation_RF_plus, model_type='t_plus', splitting_strategy = "train-test")], - [FIModelConfig('Kernel_SHAP_RF_plus', kernel_shap_evaluation_RF_plus, model_type='t_plus', splitting_strategy = "train-test")], - [FIModelConfig('LIME_RF_plus', lime_evaluation_RF_plus, model_type='t_plus', splitting_strategy = "train-test")], + [FIModelConfig('LFI_fit_on_inbag_RF', LFI_evaluation_RF_MDI, model_type='tree', splitting_strategy = "train-test", ascending = False, other_params={"include_raw":False, "fit_on":"inbag", "prediction_model": Ridge(alpha=1e-6)})], + [FIModelConfig('LFI_fit_on_OOB_RF', LFI_evaluation_RF_OOB, model_type='tree', splitting_strategy = "train-test", ascending = False, other_params={"fit_on":"oob"})], + [FIModelConfig('LFI_evaluate_on_all_RF_plus', LFI_evaluation_RF_plus, model_type='tree', splitting_strategy = 
"train-test", ascending = False)], + [FIModelConfig('LFI_evaluate_on_oob_RF_plus', LFI_evaluation_RF_plus_OOB, model_type='tree', splitting_strategy = "train-test", ascending = False)], + [FIModelConfig('Kernel_SHAP_RF_plus', kernel_shap_evaluation_RF_plus, model_type='tree', splitting_strategy = "train-test")], + [FIModelConfig('LIME_RF_plus', lime_evaluation_RF_plus, model_type='tree', splitting_strategy = "train-test")], ] \ No newline at end of file diff --git a/feature_importance/fi_config/mdi_local/real_x_sim_y/diabetes-regression/linear-lss/dgp.py b/feature_importance/fi_config/mdi_local/real_x_sim_y/diabetes-regression/linear-lss/dgp.py index 98cc33d..fd592dc 100644 --- a/feature_importance/fi_config/mdi_local/real_x_sim_y/diabetes-regression/linear-lss/dgp.py +++ b/feature_importance/fi_config/mdi_local/real_x_sim_y/diabetes-regression/linear-lss/dgp.py @@ -3,11 +3,13 @@ from feature_importance.scripts.simulations_util import * -X_DGP = sample_real_X +X_DGP = sample_real_data_X X_PARAMS_DICT = { - "fpath": "../data/regression_data/Diabetes_regression/X_diabetes_regression.csv", - "sample_row_n": 442 + "source": "imodels", + "data_name": "diabetes_regr", + "sample_row_n": None } + Y_DGP = hierarchical_poly Y_PARAMS_DICT = { "beta": 1, diff --git a/feature_importance/fi_config/mdi_local/real_x_sim_y/diabetes-regression/linear-lss/models.py b/feature_importance/fi_config/mdi_local/real_x_sim_y/diabetes-regression/linear-lss/models.py index 0d225d0..5d578cb 100644 --- a/feature_importance/fi_config/mdi_local/real_x_sim_y/diabetes-regression/linear-lss/models.py +++ b/feature_importance/fi_config/mdi_local/real_x_sim_y/diabetes-regression/linear-lss/models.py @@ -2,24 +2,21 @@ import numpy as np from feature_importance.util import ModelConfig, FIModelConfig from sklearn.ensemble import RandomForestRegressor -from imodels.importance.rf_plus import RandomForestPlusRegressor from feature_importance.scripts.competing_methods_local import * - +from sklearn.linear_model import Ridge ESTIMATORS = [ [ModelConfig('RF', RandomForestRegressor, model_type='tree', - other_params={'n_estimators': 100, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'random_state': 42})], - [ModelConfig('RF_plus', RandomForestPlusRegressor, model_type='t_plus', - other_params={'rf_model': RandomForestRegressor(n_estimators=100, min_samples_leaf=1, max_features='sqrt', random_state=42)})] + other_params={'n_estimators': 100, 'min_samples_leaf': 5, 'max_features': 0.33, 'random_state': 42})] ] FI_ESTIMATORS = [ - [FIModelConfig('LFI_with_raw_RF', LFI_evaluation_RF, model_type='tree', splitting_strategy = "train-test")], - [FIModelConfig('MDI_RF', LFI_evaluation_RF, model_type='tree', splitting_strategy = "train-test", other_params={"include_raw": False, "cv_ridge": 0, "calc_loo_coef":False, "sample_split":"inbag"})], - [FIModelConfig('LFI_with_raw_OOB_RF', LFI_evaluation_RF, model_type='tree', splitting_strategy = "train-test", other_params={"sample_split":"oob", "fit_on":"test", "calc_loo_coef":False})], [FIModelConfig('TreeSHAP_RF', tree_shap_evaluation_RF, model_type='tree', splitting_strategy = "train-test")], - [FIModelConfig('LFI_with_raw_RF_plus', LFI_evaluation_RF_plus, model_type='t_plus', splitting_strategy = "train-test")], - [FIModelConfig('Kernel_SHAP_RF_plus', kernel_shap_evaluation_RF_plus, model_type='t_plus', splitting_strategy = "train-test")], - [FIModelConfig('LIME_RF_plus', lime_evaluation_RF_plus, model_type='t_plus', splitting_strategy = "train-test")], + [FIModelConfig('LFI_fit_on_inbag_RF', 
LFI_evaluation_RF_MDI, model_type='tree', splitting_strategy = "train-test", ascending = False, other_params={"include_raw":False, "fit_on":"inbag", "prediction_model": Ridge(alpha=1e-6)})], + [FIModelConfig('LFI_fit_on_OOB_RF', LFI_evaluation_RF_OOB, model_type='tree', splitting_strategy = "train-test", ascending = False, other_params={"fit_on":"oob"})], + [FIModelConfig('LFI_evaluate_on_all_RF_plus', LFI_evaluation_RF_plus, model_type='tree', splitting_strategy = "train-test", ascending = False)], + [FIModelConfig('LFI_evaluate_on_oob_RF_plus', LFI_evaluation_RF_plus_OOB, model_type='tree', splitting_strategy = "train-test", ascending = False)], + [FIModelConfig('Kernel_SHAP_RF_plus', kernel_shap_evaluation_RF_plus, model_type='tree', splitting_strategy = "train-test")], + [FIModelConfig('LIME_RF_plus', lime_evaluation_RF_plus, model_type='tree', splitting_strategy = "train-test")], ] \ No newline at end of file diff --git a/feature_importance/fi_config/mdi_local/real_x_sim_y/diabetes-regression/linear-model/dgp.py b/feature_importance/fi_config/mdi_local/real_x_sim_y/diabetes-regression/linear-model/dgp.py index 2b6256b..9210596 100644 --- a/feature_importance/fi_config/mdi_local/real_x_sim_y/diabetes-regression/linear-model/dgp.py +++ b/feature_importance/fi_config/mdi_local/real_x_sim_y/diabetes-regression/linear-model/dgp.py @@ -3,21 +3,13 @@ from feature_importance.scripts.simulations_util import * -X_DGP = sample_real_X +X_DGP = sample_real_data_X X_PARAMS_DICT = { - "fpath": "../data/regression_data/Diabetes_regression/X_diabetes_regression.csv", - "sample_row_n": 442 + "source": "imodels", + "data_name": "diabetes_regr", + "sample_row_n": None } -# X_PARAMS_DICT = { -# "X_fpath": "../data/classification_data/Fico/X_fico.csv", -# "sample_row_n": None, -# "return_data": "X" -# } -# X_PARAMS_DICT = { -# "X_fpath": "../data/classification_data/Juvenile/X_juvenile.csv", -# "sample_row_n": None, -# "return_data": "X" -# } + Y_DGP = linear_model Y_PARAMS_DICT = { "beta": 1, @@ -25,16 +17,7 @@ "heritability": 0.4, "s": 5 } -# Y_PARAMS_DICT = { -# "y_fpath": "../data/classification_data/Fico/y_fico.csv", -# "return_data": "y" -# } -# Y_PARAMS_DICT = { -# "y_fpath": "../data/classification_data/Juvenile/y_juvenile.csv", -# "return_data": "y" -# } -# vary one parameter VARY_PARAM_NAME = ["heritability", "sample_row_n"] VARY_PARAM_VALS = {"heritability": {"0.1": 0.1, "0.2": 0.2, "0.4": 0.4, "0.8": 0.8}, diff --git a/feature_importance/fi_config/mdi_local/real_x_sim_y/diabetes-regression/linear-model/models.py b/feature_importance/fi_config/mdi_local/real_x_sim_y/diabetes-regression/linear-model/models.py index 0d225d0..5d578cb 100644 --- a/feature_importance/fi_config/mdi_local/real_x_sim_y/diabetes-regression/linear-model/models.py +++ b/feature_importance/fi_config/mdi_local/real_x_sim_y/diabetes-regression/linear-model/models.py @@ -2,24 +2,21 @@ import numpy as np from feature_importance.util import ModelConfig, FIModelConfig from sklearn.ensemble import RandomForestRegressor -from imodels.importance.rf_plus import RandomForestPlusRegressor from feature_importance.scripts.competing_methods_local import * - +from sklearn.linear_model import Ridge ESTIMATORS = [ [ModelConfig('RF', RandomForestRegressor, model_type='tree', - other_params={'n_estimators': 100, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'random_state': 42})], - [ModelConfig('RF_plus', RandomForestPlusRegressor, model_type='t_plus', - other_params={'rf_model': RandomForestRegressor(n_estimators=100, min_samples_leaf=1, 
max_features='sqrt', random_state=42)})] + other_params={'n_estimators': 100, 'min_samples_leaf': 5, 'max_features': 0.33, 'random_state': 42})] ] FI_ESTIMATORS = [ - [FIModelConfig('LFI_with_raw_RF', LFI_evaluation_RF, model_type='tree', splitting_strategy = "train-test")], - [FIModelConfig('MDI_RF', LFI_evaluation_RF, model_type='tree', splitting_strategy = "train-test", other_params={"include_raw": False, "cv_ridge": 0, "calc_loo_coef":False, "sample_split":"inbag"})], - [FIModelConfig('LFI_with_raw_OOB_RF', LFI_evaluation_RF, model_type='tree', splitting_strategy = "train-test", other_params={"sample_split":"oob", "fit_on":"test", "calc_loo_coef":False})], [FIModelConfig('TreeSHAP_RF', tree_shap_evaluation_RF, model_type='tree', splitting_strategy = "train-test")], - [FIModelConfig('LFI_with_raw_RF_plus', LFI_evaluation_RF_plus, model_type='t_plus', splitting_strategy = "train-test")], - [FIModelConfig('Kernel_SHAP_RF_plus', kernel_shap_evaluation_RF_plus, model_type='t_plus', splitting_strategy = "train-test")], - [FIModelConfig('LIME_RF_plus', lime_evaluation_RF_plus, model_type='t_plus', splitting_strategy = "train-test")], + [FIModelConfig('LFI_fit_on_inbag_RF', LFI_evaluation_RF_MDI, model_type='tree', splitting_strategy = "train-test", ascending = False, other_params={"include_raw":False, "fit_on":"inbag", "prediction_model": Ridge(alpha=1e-6)})], + [FIModelConfig('LFI_fit_on_OOB_RF', LFI_evaluation_RF_OOB, model_type='tree', splitting_strategy = "train-test", ascending = False, other_params={"fit_on":"oob"})], + [FIModelConfig('LFI_evaluate_on_all_RF_plus', LFI_evaluation_RF_plus, model_type='tree', splitting_strategy = "train-test", ascending = False)], + [FIModelConfig('LFI_evaluate_on_oob_RF_plus', LFI_evaluation_RF_plus_OOB, model_type='tree', splitting_strategy = "train-test", ascending = False)], + [FIModelConfig('Kernel_SHAP_RF_plus', kernel_shap_evaluation_RF_plus, model_type='tree', splitting_strategy = "train-test")], + [FIModelConfig('LIME_RF_plus', lime_evaluation_RF_plus, model_type='tree', splitting_strategy = "train-test")], ] \ No newline at end of file diff --git a/feature_importance/fi_config/mdi_local/real_x_sim_y/diabetes-regression/lss-model/dgp.py b/feature_importance/fi_config/mdi_local/real_x_sim_y/diabetes-regression/lss-model/dgp.py index 78a86f7..243a098 100644 --- a/feature_importance/fi_config/mdi_local/real_x_sim_y/diabetes-regression/lss-model/dgp.py +++ b/feature_importance/fi_config/mdi_local/real_x_sim_y/diabetes-regression/lss-model/dgp.py @@ -2,11 +2,11 @@ sys.path.append("../..") from feature_importance.scripts.simulations_util import * -X_DGP = sample_real_X +X_DGP = sample_real_data_X X_PARAMS_DICT = { - "fpath": "../data/regression_data/Diabetes_regression/X_diabetes_regression.csv", - "sample_row_n": None, - "sample_col_n": None + "source": "imodels", + "data_name": "diabetes_regr", + "sample_row_n": None } Y_DGP = lss_model diff --git a/feature_importance/fi_config/mdi_local/real_x_sim_y/diabetes-regression/lss-model/models.py b/feature_importance/fi_config/mdi_local/real_x_sim_y/diabetes-regression/lss-model/models.py index 0d225d0..5d578cb 100644 --- a/feature_importance/fi_config/mdi_local/real_x_sim_y/diabetes-regression/lss-model/models.py +++ b/feature_importance/fi_config/mdi_local/real_x_sim_y/diabetes-regression/lss-model/models.py @@ -2,24 +2,21 @@ import numpy as np from feature_importance.util import ModelConfig, FIModelConfig from sklearn.ensemble import RandomForestRegressor -from imodels.importance.rf_plus import 
RandomForestPlusRegressor from feature_importance.scripts.competing_methods_local import * - +from sklearn.linear_model import Ridge ESTIMATORS = [ [ModelConfig('RF', RandomForestRegressor, model_type='tree', - other_params={'n_estimators': 100, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'random_state': 42})], - [ModelConfig('RF_plus', RandomForestPlusRegressor, model_type='t_plus', - other_params={'rf_model': RandomForestRegressor(n_estimators=100, min_samples_leaf=1, max_features='sqrt', random_state=42)})] + other_params={'n_estimators': 100, 'min_samples_leaf': 5, 'max_features': 0.33, 'random_state': 42})] ] FI_ESTIMATORS = [ - [FIModelConfig('LFI_with_raw_RF', LFI_evaluation_RF, model_type='tree', splitting_strategy = "train-test")], - [FIModelConfig('MDI_RF', LFI_evaluation_RF, model_type='tree', splitting_strategy = "train-test", other_params={"include_raw": False, "cv_ridge": 0, "calc_loo_coef":False, "sample_split":"inbag"})], - [FIModelConfig('LFI_with_raw_OOB_RF', LFI_evaluation_RF, model_type='tree', splitting_strategy = "train-test", other_params={"sample_split":"oob", "fit_on":"test", "calc_loo_coef":False})], [FIModelConfig('TreeSHAP_RF', tree_shap_evaluation_RF, model_type='tree', splitting_strategy = "train-test")], - [FIModelConfig('LFI_with_raw_RF_plus', LFI_evaluation_RF_plus, model_type='t_plus', splitting_strategy = "train-test")], - [FIModelConfig('Kernel_SHAP_RF_plus', kernel_shap_evaluation_RF_plus, model_type='t_plus', splitting_strategy = "train-test")], - [FIModelConfig('LIME_RF_plus', lime_evaluation_RF_plus, model_type='t_plus', splitting_strategy = "train-test")], + [FIModelConfig('LFI_fit_on_inbag_RF', LFI_evaluation_RF_MDI, model_type='tree', splitting_strategy = "train-test", ascending = False, other_params={"include_raw":False, "fit_on":"inbag", "prediction_model": Ridge(alpha=1e-6)})], + [FIModelConfig('LFI_fit_on_OOB_RF', LFI_evaluation_RF_OOB, model_type='tree', splitting_strategy = "train-test", ascending = False, other_params={"fit_on":"oob"})], + [FIModelConfig('LFI_evaluate_on_all_RF_plus', LFI_evaluation_RF_plus, model_type='tree', splitting_strategy = "train-test", ascending = False)], + [FIModelConfig('LFI_evaluate_on_oob_RF_plus', LFI_evaluation_RF_plus_OOB, model_type='tree', splitting_strategy = "train-test", ascending = False)], + [FIModelConfig('Kernel_SHAP_RF_plus', kernel_shap_evaluation_RF_plus, model_type='tree', splitting_strategy = "train-test")], + [FIModelConfig('LIME_RF_plus', lime_evaluation_RF_plus, model_type='tree', splitting_strategy = "train-test")], ] \ No newline at end of file diff --git a/feature_importance/ranking_importance_local_sims.ipynb b/feature_importance/ranking_importance_local_sims.ipynb index 0392647..2e8526b 100644 --- a/feature_importance/ranking_importance_local_sims.ipynb +++ b/feature_importance/ranking_importance_local_sims.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 115, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -15,11 +15,11 @@ }, { "cell_type": "code", - "execution_count": 116, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ - "directory = \"./results/mdi_local.real_x_sim_y.diabetes-regression.hierarchical-polynomial/diabetes-reg-hierpoly/varying_heritability_sample_row_n/\"\n", + "directory = \"./results/mdi_local.real_x_sim_y.diabetes-regression.linear-model/diabetes-reg-hierpoly/varying_heritability_sample_row_n/\"\n", "folder_names = [folder for folder in os.listdir(directory) if 
os.path.isdir(os.path.join(directory, folder))]\n", "experiments_seeds = []\n", "for folder_name in folder_names:\n", @@ -32,7 +32,2063 @@ }, { "cell_type": "code", - "execution_count": 117, + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
repsample_row_nsample_row_n_nameheritabilityheritability_namen_estimatorsmin_samples_leafmax_featuresrandom_stateinclude_rawcv_ridgecalc_loo_coefsample_splitfit_onmodelfitrain_sizetest_sizenum_featuresdata_split_seedsample_train_0sample_train_1sample_train_2sample_train_3sample_train_4sample_train_5sample_train_6sample_train_7sample_train_8sample_train_9sample_train_10sample_train_11sample_train_12sample_train_13sample_train_14sample_train_15sample_test_0sample_test_1sample_test_2sample_test_3sample_test_4sample_test_5sample_test_6sample_test_7ablation_seed_0fi_timetrain_AUROCtrain_AUPRCtrain_F1test_AUROCtest_AUPRCtest_F1split_seedrf_modelsample_train_16sample_train_17sample_train_18sample_train_19sample_train_20sample_train_21sample_train_22sample_train_23sample_train_24sample_train_25sample_train_26sample_train_27sample_train_28sample_train_29sample_train_30sample_train_31sample_train_32sample_test_8sample_test_9sample_test_10sample_test_11sample_test_12sample_test_13sample_test_14sample_test_15sample_train_33sample_train_34sample_train_35sample_train_36sample_train_37sample_train_38sample_train_39sample_train_40sample_train_41sample_train_42sample_train_43sample_train_44sample_train_45sample_train_46sample_train_47sample_train_48sample_train_49sample_test_16sample_test_17sample_test_18sample_test_19sample_test_20sample_test_21sample_test_22sample_test_23sample_train_50sample_train_51sample_train_52sample_train_53sample_train_54sample_train_55sample_train_56sample_train_57sample_train_58sample_train_59sample_train_60sample_train_61sample_train_62sample_train_63sample_train_64sample_train_65sample_train_66sample_test_24sample_test_25sample_test_26sample_test_27sample_test_28sample_test_29sample_test_30sample_test_31sample_test_32
001001000.10.1100.01.0sqrt42.0NaNNaNFalseoobtestRFLFI_with_raw_OOB_RF673310236164945406156412255928062342952192716226537116.3941670.7526040.8474910.5225920.7708330.8699070.5416672NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
101001000.10.1100.01.0sqrt42.0NaNNaNNaNNaNNaNRFLFI_with_raw_RF673310236164945406156412255928062342952192716226537117.5291870.6614580.7989830.6641650.6822920.8104250.7204842NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
201001000.10.1100.01.0sqrt42.0False0.0FalseinbagNaNRFMDI_RF67331023616494540615641225592806234295219271622653718.7251400.5677080.7136820.6817670.6093750.7587220.6631722NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
301001000.10.1100.01.0sqrt42.0NaNNaNNaNNaNNaNRFTreeSHAP_RF67331023616494540615641225592806234295219271622653710.2508360.5026040.7055100.6021020.5520830.7280010.6169872NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
401001000.10.1NaNNaNNaNNaNNaNNaNNaNNaNNaNRF_plusKernel_SHAP_RF_plus673310236164945406156412255928062342952192716226486144.5177480.6757810.8144390.6062550.6562500.8110530.6263892RandomForestRegressor(max_features='sqrt', ran...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
.........................................................................................................................................................................................................................................................................................................................................................................................................
111504004000.80.8100.01.0sqrt42.0False0.0FalseinbagNaNRFMDI_RF26813210711521322132117159234137306712682791842468784445571614291473126.8977120.7873130.8875800.6408070.7449490.8557400.5068397NaN179.0180.066.0112.090.09.093.0196.0108.042.0216.046.0201.0124.045.0144.06.010.0116.09.0117.062.069.035.038.0240.0155.086.0118.025.0208.0127.019.097.0164.0238.0183.092.0150.015.0255.060.0124.014.011.049.015.0106.0123.00.0241.0250.033.0244.0140.0165.0249.016.0266.075.010.0223.0227.0224.0119.084.0104.0128.0113.077.033.0101.076.0127.012.0130.0
111604004000.80.8100.01.0sqrt42.0NaNNaNNaNNaNNaNRFTreeSHAP_RF2681321071152132213211715923413730671268279184246878444557161429147313.5191230.7997510.9004370.6049070.8358590.9129630.5450817NaN179.0180.066.0112.090.09.093.0196.0108.042.0216.046.0201.0124.045.0144.06.010.0116.09.0117.062.069.035.038.0240.0155.086.0118.025.0208.0127.019.097.0164.0238.0183.092.0150.015.0255.060.0124.014.011.049.015.0106.0123.00.0241.0250.033.0244.0140.0165.0249.016.0266.075.010.0223.0227.0224.0119.084.0104.0128.0113.077.033.0101.076.0127.012.0130.0
111704004000.80.8NaNNaNNaNNaNNaNNaNNaNNaNNaNRF_plusKernel_SHAP_RF_plus268132107115213221321171592341373067126827918424687844455716142919672188.5675480.7991290.8901310.6156260.7664140.8788080.5905907RandomForestRegressor(max_features='sqrt', ran...179.0180.066.0112.090.09.093.0196.0108.042.0216.046.0201.0124.045.0144.06.010.0116.09.0117.062.069.035.038.0240.0155.086.0118.025.0208.0127.019.097.0164.0238.0183.092.0150.015.0255.060.0124.014.011.049.015.0106.0123.00.0241.0250.033.0244.0140.0165.0249.016.0266.075.010.0223.0227.0224.0119.084.0104.0128.0113.077.033.0101.076.0127.012.0130.0
111804004000.80.8NaNNaNNaNNaNNaNNaNNaNNaNNaNRF_plusLFI_with_raw_RF_plus2681321071152132213211715923413730671268279184246878444557161429196725.6247230.8016170.8955910.6887230.8219700.9119750.6852387RandomForestRegressor(max_features='sqrt', ran...179.0180.066.0112.090.09.093.0196.0108.042.0216.046.0201.0124.045.0144.06.010.0116.09.0117.062.069.035.038.0240.0155.086.0118.025.0208.0127.019.097.0164.0238.0183.092.0150.015.0255.060.0124.014.011.049.015.0106.0123.00.0241.0250.033.0244.0140.0165.0249.016.0266.075.010.0223.0227.0224.0119.084.0104.0128.0113.077.033.0101.076.0127.012.0130.0
111904004000.80.8NaNNaNNaNNaNNaNNaNNaNNaNNaNRF_plusLIME_RF_plus268132107115213221321171592341373067126827918424687844455716142919672364.4086270.8152990.9007710.7254840.8282830.9087820.7293547RandomForestRegressor(max_features='sqrt', ran...179.0180.066.0112.090.09.093.0196.0108.042.0216.046.0201.0124.045.0144.06.010.0116.09.0117.062.069.035.038.0240.0155.086.0118.025.0208.0127.019.097.0164.0238.0183.092.0150.015.0255.060.0124.014.011.049.015.0106.0123.00.0241.0250.033.0244.0140.0165.0249.016.0266.075.010.0223.0227.0224.0119.084.0104.0128.0113.077.033.0101.076.0127.012.0130.0
\n", + "

1120 rows × 130 columns

\n", + "
" + ], + "text/plain": [ + " rep sample_row_n sample_row_n_name heritability heritability_name \\\n", + "0 0 100 100 0.1 0.1 \n", + "1 0 100 100 0.1 0.1 \n", + "2 0 100 100 0.1 0.1 \n", + "3 0 100 100 0.1 0.1 \n", + "4 0 100 100 0.1 0.1 \n", + "... ... ... ... ... ... \n", + "1115 0 400 400 0.8 0.8 \n", + "1116 0 400 400 0.8 0.8 \n", + "1117 0 400 400 0.8 0.8 \n", + "1118 0 400 400 0.8 0.8 \n", + "1119 0 400 400 0.8 0.8 \n", + "\n", + " n_estimators min_samples_leaf max_features random_state include_raw \\\n", + "0 100.0 1.0 sqrt 42.0 NaN \n", + "1 100.0 1.0 sqrt 42.0 NaN \n", + "2 100.0 1.0 sqrt 42.0 False \n", + "3 100.0 1.0 sqrt 42.0 NaN \n", + "4 NaN NaN NaN NaN NaN \n", + "... ... ... ... ... ... \n", + "1115 100.0 1.0 sqrt 42.0 False \n", + "1116 100.0 1.0 sqrt 42.0 NaN \n", + "1117 NaN NaN NaN NaN NaN \n", + "1118 NaN NaN NaN NaN NaN \n", + "1119 NaN NaN NaN NaN NaN \n", + "\n", + " cv_ridge calc_loo_coef sample_split fit_on model \\\n", + "0 NaN False oob test RF \n", + "1 NaN NaN NaN NaN RF \n", + "2 0.0 False inbag NaN RF \n", + "3 NaN NaN NaN NaN RF \n", + "4 NaN NaN NaN NaN RF_plus \n", + "... ... ... ... ... ... \n", + "1115 0.0 False inbag NaN RF \n", + "1116 NaN NaN NaN NaN RF \n", + "1117 NaN NaN NaN NaN RF_plus \n", + "1118 NaN NaN NaN NaN RF_plus \n", + "1119 NaN NaN NaN NaN RF_plus \n", + "\n", + " fi train_size test_size num_features \\\n", + "0 LFI_with_raw_OOB_RF 67 33 10 \n", + "1 LFI_with_raw_RF 67 33 10 \n", + "2 MDI_RF 67 33 10 \n", + "3 TreeSHAP_RF 67 33 10 \n", + "4 Kernel_SHAP_RF_plus 67 33 10 \n", + "... ... ... ... ... \n", + "1115 MDI_RF 268 132 10 \n", + "1116 TreeSHAP_RF 268 132 10 \n", + "1117 Kernel_SHAP_RF_plus 268 132 10 \n", + "1118 LFI_with_raw_RF_plus 268 132 10 \n", + "1119 LIME_RF_plus 268 132 10 \n", + "\n", + " data_split_seed sample_train_0 sample_train_1 sample_train_2 \\\n", + "0 2 36 16 4 \n", + "1 2 36 16 4 \n", + "2 2 36 16 4 \n", + "3 2 36 16 4 \n", + "4 2 36 16 4 \n", + "... ... ... ... ... \n", + "1115 7 115 213 22 \n", + "1116 7 115 213 22 \n", + "1117 7 115 213 22 \n", + "1118 7 115 213 22 \n", + "1119 7 115 213 22 \n", + "\n", + " sample_train_3 sample_train_4 sample_train_5 sample_train_6 \\\n", + "0 9 45 40 61 \n", + "1 9 45 40 61 \n", + "2 9 45 40 61 \n", + "3 9 45 40 61 \n", + "4 9 45 40 61 \n", + "... ... ... ... ... \n", + "1115 132 117 159 234 \n", + "1116 132 117 159 234 \n", + "1117 132 117 159 234 \n", + "1118 132 117 159 234 \n", + "1119 132 117 159 234 \n", + "\n", + " sample_train_7 sample_train_8 sample_train_9 sample_train_10 \\\n", + "0 5 64 12 25 \n", + "1 5 64 12 25 \n", + "2 5 64 12 25 \n", + "3 5 64 12 25 \n", + "4 5 64 12 25 \n", + "... ... ... ... ... \n", + "1115 137 30 67 126 \n", + "1116 137 30 67 126 \n", + "1117 137 30 67 126 \n", + "1118 137 30 67 126 \n", + "1119 137 30 67 126 \n", + "\n", + " sample_train_11 sample_train_12 sample_train_13 sample_train_14 \\\n", + "0 59 28 0 62 \n", + "1 59 28 0 62 \n", + "2 59 28 0 62 \n", + "3 59 28 0 62 \n", + "4 59 28 0 62 \n", + "... ... ... ... ... \n", + "1115 82 79 184 24 \n", + "1116 82 79 184 24 \n", + "1117 82 79 184 24 \n", + "1118 82 79 184 24 \n", + "1119 82 79 184 24 \n", + "\n", + " sample_train_15 sample_test_0 sample_test_1 sample_test_2 \\\n", + "0 34 29 5 2 \n", + "1 34 29 5 2 \n", + "2 34 29 5 2 \n", + "3 34 29 5 2 \n", + "4 34 29 5 2 \n", + "... ... ... ... ... 
\n", + "1115 68 78 4 44 \n", + "1116 68 78 4 44 \n", + "1117 68 78 4 44 \n", + "1118 68 78 4 44 \n", + "1119 68 78 4 44 \n", + "\n", + " sample_test_3 sample_test_4 sample_test_5 sample_test_6 \\\n", + "0 19 27 16 22 \n", + "1 19 27 16 22 \n", + "2 19 27 16 22 \n", + "3 19 27 16 22 \n", + "4 19 27 16 22 \n", + "... ... ... ... ... \n", + "1115 55 71 61 42 \n", + "1116 55 71 61 42 \n", + "1117 55 71 61 42 \n", + "1118 55 71 61 42 \n", + "1119 55 71 61 42 \n", + "\n", + " sample_test_7 ablation_seed_0 fi_time train_AUROC train_AUPRC \\\n", + "0 6 5371 16.394167 0.752604 0.847491 \n", + "1 6 5371 17.529187 0.661458 0.798983 \n", + "2 6 5371 8.725140 0.567708 0.713682 \n", + "3 6 5371 0.250836 0.502604 0.705510 \n", + "4 6 4861 44.517748 0.675781 0.814439 \n", + "... ... ... ... ... ... \n", + "1115 91 4731 26.897712 0.787313 0.887580 \n", + "1116 91 4731 3.519123 0.799751 0.900437 \n", + "1117 91 9672 188.567548 0.799129 0.890131 \n", + "1118 91 9672 5.624723 0.801617 0.895591 \n", + "1119 91 9672 364.408627 0.815299 0.900771 \n", + "\n", + " train_F1 test_AUROC test_AUPRC test_F1 split_seed \\\n", + "0 0.522592 0.770833 0.869907 0.541667 2 \n", + "1 0.664165 0.682292 0.810425 0.720484 2 \n", + "2 0.681767 0.609375 0.758722 0.663172 2 \n", + "3 0.602102 0.552083 0.728001 0.616987 2 \n", + "4 0.606255 0.656250 0.811053 0.626389 2 \n", + "... ... ... ... ... ... \n", + "1115 0.640807 0.744949 0.855740 0.506839 7 \n", + "1116 0.604907 0.835859 0.912963 0.545081 7 \n", + "1117 0.615626 0.766414 0.878808 0.590590 7 \n", + "1118 0.688723 0.821970 0.911975 0.685238 7 \n", + "1119 0.725484 0.828283 0.908782 0.729354 7 \n", + "\n", + " rf_model sample_train_16 \\\n", + "0 NaN NaN \n", + "1 NaN NaN \n", + "2 NaN NaN \n", + "3 NaN NaN \n", + "4 RandomForestRegressor(max_features='sqrt', ran... NaN \n", + "... ... ... \n", + "1115 NaN 179.0 \n", + "1116 NaN 179.0 \n", + "1117 RandomForestRegressor(max_features='sqrt', ran... 179.0 \n", + "1118 RandomForestRegressor(max_features='sqrt', ran... 179.0 \n", + "1119 RandomForestRegressor(max_features='sqrt', ran... 179.0 \n", + "\n", + " sample_train_17 sample_train_18 sample_train_19 sample_train_20 \\\n", + "0 NaN NaN NaN NaN \n", + "1 NaN NaN NaN NaN \n", + "2 NaN NaN NaN NaN \n", + "3 NaN NaN NaN NaN \n", + "4 NaN NaN NaN NaN \n", + "... ... ... ... ... \n", + "1115 180.0 66.0 112.0 90.0 \n", + "1116 180.0 66.0 112.0 90.0 \n", + "1117 180.0 66.0 112.0 90.0 \n", + "1118 180.0 66.0 112.0 90.0 \n", + "1119 180.0 66.0 112.0 90.0 \n", + "\n", + " sample_train_21 sample_train_22 sample_train_23 sample_train_24 \\\n", + "0 NaN NaN NaN NaN \n", + "1 NaN NaN NaN NaN \n", + "2 NaN NaN NaN NaN \n", + "3 NaN NaN NaN NaN \n", + "4 NaN NaN NaN NaN \n", + "... ... ... ... ... \n", + "1115 9.0 93.0 196.0 108.0 \n", + "1116 9.0 93.0 196.0 108.0 \n", + "1117 9.0 93.0 196.0 108.0 \n", + "1118 9.0 93.0 196.0 108.0 \n", + "1119 9.0 93.0 196.0 108.0 \n", + "\n", + " sample_train_25 sample_train_26 sample_train_27 sample_train_28 \\\n", + "0 NaN NaN NaN NaN \n", + "1 NaN NaN NaN NaN \n", + "2 NaN NaN NaN NaN \n", + "3 NaN NaN NaN NaN \n", + "4 NaN NaN NaN NaN \n", + "... ... ... ... ... 
\n", + "1115 42.0 216.0 46.0 201.0 \n", + "1116 42.0 216.0 46.0 201.0 \n", + "1117 42.0 216.0 46.0 201.0 \n", + "1118 42.0 216.0 46.0 201.0 \n", + "1119 42.0 216.0 46.0 201.0 \n", + "\n", + " sample_train_29 sample_train_30 sample_train_31 sample_train_32 \\\n", + "0 NaN NaN NaN NaN \n", + "1 NaN NaN NaN NaN \n", + "2 NaN NaN NaN NaN \n", + "3 NaN NaN NaN NaN \n", + "4 NaN NaN NaN NaN \n", + "... ... ... ... ... \n", + "1115 124.0 45.0 144.0 6.0 \n", + "1116 124.0 45.0 144.0 6.0 \n", + "1117 124.0 45.0 144.0 6.0 \n", + "1118 124.0 45.0 144.0 6.0 \n", + "1119 124.0 45.0 144.0 6.0 \n", + "\n", + " sample_test_8 sample_test_9 sample_test_10 sample_test_11 \\\n", + "0 NaN NaN NaN NaN \n", + "1 NaN NaN NaN NaN \n", + "2 NaN NaN NaN NaN \n", + "3 NaN NaN NaN NaN \n", + "4 NaN NaN NaN NaN \n", + "... ... ... ... ... \n", + "1115 10.0 116.0 9.0 117.0 \n", + "1116 10.0 116.0 9.0 117.0 \n", + "1117 10.0 116.0 9.0 117.0 \n", + "1118 10.0 116.0 9.0 117.0 \n", + "1119 10.0 116.0 9.0 117.0 \n", + "\n", + " sample_test_12 sample_test_13 sample_test_14 sample_test_15 \\\n", + "0 NaN NaN NaN NaN \n", + "1 NaN NaN NaN NaN \n", + "2 NaN NaN NaN NaN \n", + "3 NaN NaN NaN NaN \n", + "4 NaN NaN NaN NaN \n", + "... ... ... ... ... \n", + "1115 62.0 69.0 35.0 38.0 \n", + "1116 62.0 69.0 35.0 38.0 \n", + "1117 62.0 69.0 35.0 38.0 \n", + "1118 62.0 69.0 35.0 38.0 \n", + "1119 62.0 69.0 35.0 38.0 \n", + "\n", + " sample_train_33 sample_train_34 sample_train_35 sample_train_36 \\\n", + "0 NaN NaN NaN NaN \n", + "1 NaN NaN NaN NaN \n", + "2 NaN NaN NaN NaN \n", + "3 NaN NaN NaN NaN \n", + "4 NaN NaN NaN NaN \n", + "... ... ... ... ... \n", + "1115 240.0 155.0 86.0 118.0 \n", + "1116 240.0 155.0 86.0 118.0 \n", + "1117 240.0 155.0 86.0 118.0 \n", + "1118 240.0 155.0 86.0 118.0 \n", + "1119 240.0 155.0 86.0 118.0 \n", + "\n", + " sample_train_37 sample_train_38 sample_train_39 sample_train_40 \\\n", + "0 NaN NaN NaN NaN \n", + "1 NaN NaN NaN NaN \n", + "2 NaN NaN NaN NaN \n", + "3 NaN NaN NaN NaN \n", + "4 NaN NaN NaN NaN \n", + "... ... ... ... ... \n", + "1115 25.0 208.0 127.0 19.0 \n", + "1116 25.0 208.0 127.0 19.0 \n", + "1117 25.0 208.0 127.0 19.0 \n", + "1118 25.0 208.0 127.0 19.0 \n", + "1119 25.0 208.0 127.0 19.0 \n", + "\n", + " sample_train_41 sample_train_42 sample_train_43 sample_train_44 \\\n", + "0 NaN NaN NaN NaN \n", + "1 NaN NaN NaN NaN \n", + "2 NaN NaN NaN NaN \n", + "3 NaN NaN NaN NaN \n", + "4 NaN NaN NaN NaN \n", + "... ... ... ... ... \n", + "1115 97.0 164.0 238.0 183.0 \n", + "1116 97.0 164.0 238.0 183.0 \n", + "1117 97.0 164.0 238.0 183.0 \n", + "1118 97.0 164.0 238.0 183.0 \n", + "1119 97.0 164.0 238.0 183.0 \n", + "\n", + " sample_train_45 sample_train_46 sample_train_47 sample_train_48 \\\n", + "0 NaN NaN NaN NaN \n", + "1 NaN NaN NaN NaN \n", + "2 NaN NaN NaN NaN \n", + "3 NaN NaN NaN NaN \n", + "4 NaN NaN NaN NaN \n", + "... ... ... ... ... \n", + "1115 92.0 150.0 15.0 255.0 \n", + "1116 92.0 150.0 15.0 255.0 \n", + "1117 92.0 150.0 15.0 255.0 \n", + "1118 92.0 150.0 15.0 255.0 \n", + "1119 92.0 150.0 15.0 255.0 \n", + "\n", + " sample_train_49 sample_test_16 sample_test_17 sample_test_18 \\\n", + "0 NaN NaN NaN NaN \n", + "1 NaN NaN NaN NaN \n", + "2 NaN NaN NaN NaN \n", + "3 NaN NaN NaN NaN \n", + "4 NaN NaN NaN NaN \n", + "... ... ... ... ... 
\n", + "1115 60.0 124.0 14.0 11.0 \n", + "1116 60.0 124.0 14.0 11.0 \n", + "1117 60.0 124.0 14.0 11.0 \n", + "1118 60.0 124.0 14.0 11.0 \n", + "1119 60.0 124.0 14.0 11.0 \n", + "\n", + " sample_test_19 sample_test_20 sample_test_21 sample_test_22 \\\n", + "0 NaN NaN NaN NaN \n", + "1 NaN NaN NaN NaN \n", + "2 NaN NaN NaN NaN \n", + "3 NaN NaN NaN NaN \n", + "4 NaN NaN NaN NaN \n", + "... ... ... ... ... \n", + "1115 49.0 15.0 106.0 123.0 \n", + "1116 49.0 15.0 106.0 123.0 \n", + "1117 49.0 15.0 106.0 123.0 \n", + "1118 49.0 15.0 106.0 123.0 \n", + "1119 49.0 15.0 106.0 123.0 \n", + "\n", + " sample_test_23 sample_train_50 sample_train_51 sample_train_52 \\\n", + "0 NaN NaN NaN NaN \n", + "1 NaN NaN NaN NaN \n", + "2 NaN NaN NaN NaN \n", + "3 NaN NaN NaN NaN \n", + "4 NaN NaN NaN NaN \n", + "... ... ... ... ... \n", + "1115 0.0 241.0 250.0 33.0 \n", + "1116 0.0 241.0 250.0 33.0 \n", + "1117 0.0 241.0 250.0 33.0 \n", + "1118 0.0 241.0 250.0 33.0 \n", + "1119 0.0 241.0 250.0 33.0 \n", + "\n", + " sample_train_53 sample_train_54 sample_train_55 sample_train_56 \\\n", + "0 NaN NaN NaN NaN \n", + "1 NaN NaN NaN NaN \n", + "2 NaN NaN NaN NaN \n", + "3 NaN NaN NaN NaN \n", + "4 NaN NaN NaN NaN \n", + "... ... ... ... ... \n", + "1115 244.0 140.0 165.0 249.0 \n", + "1116 244.0 140.0 165.0 249.0 \n", + "1117 244.0 140.0 165.0 249.0 \n", + "1118 244.0 140.0 165.0 249.0 \n", + "1119 244.0 140.0 165.0 249.0 \n", + "\n", + " sample_train_57 sample_train_58 sample_train_59 sample_train_60 \\\n", + "0 NaN NaN NaN NaN \n", + "1 NaN NaN NaN NaN \n", + "2 NaN NaN NaN NaN \n", + "3 NaN NaN NaN NaN \n", + "4 NaN NaN NaN NaN \n", + "... ... ... ... ... \n", + "1115 16.0 266.0 75.0 10.0 \n", + "1116 16.0 266.0 75.0 10.0 \n", + "1117 16.0 266.0 75.0 10.0 \n", + "1118 16.0 266.0 75.0 10.0 \n", + "1119 16.0 266.0 75.0 10.0 \n", + "\n", + " sample_train_61 sample_train_62 sample_train_63 sample_train_64 \\\n", + "0 NaN NaN NaN NaN \n", + "1 NaN NaN NaN NaN \n", + "2 NaN NaN NaN NaN \n", + "3 NaN NaN NaN NaN \n", + "4 NaN NaN NaN NaN \n", + "... ... ... ... ... \n", + "1115 223.0 227.0 224.0 119.0 \n", + "1116 223.0 227.0 224.0 119.0 \n", + "1117 223.0 227.0 224.0 119.0 \n", + "1118 223.0 227.0 224.0 119.0 \n", + "1119 223.0 227.0 224.0 119.0 \n", + "\n", + " sample_train_65 sample_train_66 sample_test_24 sample_test_25 \\\n", + "0 NaN NaN NaN NaN \n", + "1 NaN NaN NaN NaN \n", + "2 NaN NaN NaN NaN \n", + "3 NaN NaN NaN NaN \n", + "4 NaN NaN NaN NaN \n", + "... ... ... ... ... \n", + "1115 84.0 104.0 128.0 113.0 \n", + "1116 84.0 104.0 128.0 113.0 \n", + "1117 84.0 104.0 128.0 113.0 \n", + "1118 84.0 104.0 128.0 113.0 \n", + "1119 84.0 104.0 128.0 113.0 \n", + "\n", + " sample_test_26 sample_test_27 sample_test_28 sample_test_29 \\\n", + "0 NaN NaN NaN NaN \n", + "1 NaN NaN NaN NaN \n", + "2 NaN NaN NaN NaN \n", + "3 NaN NaN NaN NaN \n", + "4 NaN NaN NaN NaN \n", + "... ... ... ... ... \n", + "1115 77.0 33.0 101.0 76.0 \n", + "1116 77.0 33.0 101.0 76.0 \n", + "1117 77.0 33.0 101.0 76.0 \n", + "1118 77.0 33.0 101.0 76.0 \n", + "1119 77.0 33.0 101.0 76.0 \n", + "\n", + " sample_test_30 sample_test_31 sample_test_32 \n", + "0 NaN NaN NaN \n", + "1 NaN NaN NaN \n", + "2 NaN NaN NaN \n", + "3 NaN NaN NaN \n", + "4 NaN NaN NaN \n", + "... ... ... ... 
\n", + "1115 127.0 12.0 130.0 \n", + "1116 127.0 12.0 130.0 \n", + "1117 127.0 12.0 130.0 \n", + "1118 127.0 12.0 130.0 \n", + "1119 127.0 12.0 130.0 \n", + "\n", + "[1120 rows x 130 columns]" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "combined_df" + ] + }, + { + "cell_type": "code", + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -60,7 +2116,7 @@ }, { "cell_type": "code", - "execution_count": 118, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -85,7 +2141,7 @@ }, { "cell_type": "code", - "execution_count": 119, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -104,7 +2160,7 @@ }, { "cell_type": "code", - "execution_count": 120, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -147,7 +2203,7 @@ }, { "cell_type": "code", - "execution_count": 121, + "execution_count": 7, "metadata": {}, "outputs": [ { diff --git a/feature_importance/ranking_importance_local_sims.py b/feature_importance/ranking_importance_local_sims.py index 9ddf19e..9787b46 100644 --- a/feature_importance/ranking_importance_local_sims.py +++ b/feature_importance/ranking_importance_local_sims.py @@ -16,12 +16,16 @@ from collections import defaultdict from typing import Callable, List, Tuple import itertools +from sklearn.metrics import roc_auc_score, f1_score, recall_score, precision_score, mean_squared_error, r2_score, average_precision_score from sklearn import preprocessing -from sklearn.metrics import roc_auc_score, f1_score, recall_score, precision_score, average_precision_score +from sklearn.ensemble import RandomForestRegressor +from sklearn.linear_model import LinearRegression +import xgboost as xgb +from imodels.tree.rf_plus.rf_plus.rf_plus_models import RandomForestPlusRegressor, RandomForestPlusClassifier sys.path.append(".") sys.path.append("..") sys.path.append("../../") -sys.path.append("/accounts/grad/zachrewolinski/research/imodels") +sys.path.append("/accounts/grad/zachrewolinski/research/imodels/imodels") print(sys.path) import fi_config from util import ModelConfig, FIModelConfig, tp, fp, neg, pos, specificity_score, auroc_score, auprc_score, compute_nsg_feat_corr_w_sig_subspace, apply_splitting_strategy @@ -47,7 +51,7 @@ def compare_estimators(estimators: List[ModelConfig], # loop over model estimators for model in estimators: - print("Running model:", model) + print("running model:", model) est = model.cls(**model.kwargs) # get kwargs for all fi_ests @@ -62,7 +66,7 @@ def compare_estimators(estimators: List[ModelConfig], # loop over splitting strategies for splitting_strategy, fi_ests in fi_ests_dict.items(): - print("Using feature importance estimator:", fi_ests) + print("using feature importance estimator:", fi_ests) # implement provided splitting strategy if splitting_strategy is not None: X_train, X_tune, X_test, y_train, y_tune, y_test = apply_splitting_strategy(X, y, splitting_strategy, args.split_seed) @@ -71,15 +75,25 @@ def compare_estimators(estimators: List[ModelConfig], X_test = X y_train = y y_test = y - - normalizer = preprocessing.Normalizer() - if splitting_strategy == "train-test": - X_train = normalizer.fit_transform(X_train) - X_test = normalizer.transform(X_test) - else: - X = normalizer.fit_transform(X) - X_train = normalizer.transform(X_train) - X_test = normalizer.transform(X_test) + + # fit RF model + est.fit(X_train, y_train) + + # fit RF_plus model + start = time.time() + rf_plus_base = RandomForestPlusRegressor(rf_model=est) + 
rf_plus_base.fit(X_train, y_train) + end = time.time() + + # if logistic regression then do this + # normalizer = preprocessing.Normalizer() + # if splitting_strategy == "train-test": + # X_train = normalizer.fit_transform(X_train) + # X_test = normalizer.transform(X_test) + # else: + # X = normalizer.fit_transform(X) + # X_train = normalizer.transform(X_train) + # X_test = normalizer.transform(X_test) print("Line 85") @@ -99,9 +113,6 @@ def compare_estimators(estimators: List[ModelConfig], print("Line 100") # loop over fi estimators - rng = np.random.RandomState() - number_of_ablations = 1 - seeds = rng.randint(0, 10000, number_of_ablations) for fi_est in tqdm(fi_ests): print("line 107") print("Using fi est:", fi_est) @@ -111,47 +122,51 @@ def compare_estimators(estimators: List[ModelConfig], 'train_size': X_train.shape[0], 'test_size': X_test.shape[0], 'num_features': X_train.shape[1], - 'data_split_seed': args.split_seed + 'data_split_seed': args.split_seed, + 'rf_plus_fit_time': end - start } for i in range(int(X_train.shape[0]*.25)): metric_results[f'sample_train_{i}'] = indices_train[i] for i in range(int(X_test.shape[0]*.25)): metric_results[f'sample_test_{i}'] = indices_test[i] - for i in range(len(seeds)): - metric_results[f'ablation_seed_{i}'] = seeds[i] start = time.time() - local_fi_score_train_subset = fi_est.cls(X_train=X_train, y_train=y_train, - X_train_subset = X_train_subset, y_train_subset=y_train_subset, - X_test=X_test, y_test=y_test, - fit=copy.deepcopy(est), data_fit_on="train_subset", **fi_est.kwargs) - if fi_est.name not in ["LIME_RF_plus", "Kernel_SHAP_RF_plus"]: - local_fi_score_test = fi_est.cls(X_train=X_train, y_train=y_train, - X_train_subset = X_train_subset, y_train_subset=y_train_subset, - X_test=X_test, y_test=y_test, - fit=copy.deepcopy(est), data_fit_on="test", **fi_est.kwargs) - else: - local_fi_score_test = None - local_fi_score_test_subset = fi_est.cls(X_train=X_train, y_train=y_train, - X_train_subset = X_train_subset, y_train_subset=y_train_subset, - X_test=X_test_subset, y_test=y_test_subset, - fit=copy.deepcopy(est), data_fit_on="test", **fi_est.kwargs) + if fi_est.name == "LFI_evaluate_on_all_RF_plus" or fi_est.name == "LFI_evaluate_on_oob_RF_plus": + local_fi_score_train, local_parital_pred_train, local_fi_score_test, local_partial_pred_test, local_fi_score_test_subset, local_partial_pred_test_subset = fi_est.cls(X_train=X_train, y_train=y_train, + X_train_subset = X_train_subset, y_train_subset=y_train_subset, + X_test_subset=X_test_subset, X_test=X_test, + fit=rf_plus_base, **fi_est.kwargs) + local_fi_score_train_subset = local_fi_score_train[indices_train] + local_partial_pred_train_subset = local_parital_pred_train[indices_train] + elif fi_est.name == "LFI_fit_on_inbag_RF" or fi_est.name == "LFI_fit_on_inbag_RF": + local_fi_score_train, local_parital_pred_train, local_fi_score_test, local_partial_pred_test, local_fi_score_test_subset, local_partial_pred_test_subset = fi_est.cls(X_train=X_train, y_train=y_train, + X_train_subset = X_train_subset, y_train_subset=y_train_subset, + X_test_subset=X_test_subset, X_test=X_test, + fit=copy.deepcopy(est), **fi_est.kwargs) + local_fi_score_train_subset = local_fi_score_train[indices_train] + local_partial_pred_train_subset = local_parital_pred_train[indices_train] + elif fi_est.name == "TreeSHAP_RF": + local_fi_score_train_subset, local_fi_score_test, local_fi_score_test_subset = fi_est.cls(X_train=X_train, y_train=y_train, + X_train_subset = X_train_subset, y_train_subset=y_train_subset, + 
X_test_subset=X_test_subset, X_test=X_test, + fit=copy.deepcopy(est), **fi_est.kwargs) + elif fi_est.name == "Kernel_SHAP_RF_plus" or fi_est.name == "LIME_RF_plus": + local_fi_score_train_subset, local_fi_score_test, local_fi_score_test_subset = fi_est.cls(X_train=X_train, y_train=y_train, + X_train_subset = X_train_subset, y_train_subset=y_train_subset, + X_test_subset=X_test_subset, X_test=X_test, + fit=rf_plus_base, **fi_est.kwargs) + end = time.time() metric_results['fi_time'] = end - start # feature_importance_list.append(local_fi_score_train_subset) feature_importance_list.append(local_fi_score_test) feature_importance_list.append(local_fi_score_test_subset) - # print("support:") - # print(support) - # print("support shape:") - # print(support.shape) - # print("local_fi_score_train_subset") - # print(local_fi_score_train_subset) - # print(type(local_fi_score_train_subset)) - auroc = [] auprc = [] f1 = [] + print("Original Type:", type(local_fi_score_train_subset)) + local_fi_score_train_subset = pd.DataFrame(local_fi_score_train_subset) + print("Changed Type:", type(local_fi_score_train_subset)) for rownum in range(local_fi_score_train_subset.shape[0]): auroc.append(roc_auc_score(support, local_fi_score_train_subset.iloc[rownum,:])) auprc.append(average_precision_score(support, local_fi_score_train_subset.iloc[rownum,:])) @@ -164,6 +179,9 @@ def compare_estimators(estimators: List[ModelConfig], auroc = [] auprc = [] f1 = [] + print("Original Type:", type(local_fi_score_test_subset)) + local_fi_score_test_subset = pd.DataFrame(local_fi_score_test_subset) + print("Changed Type:", type(local_fi_score_train_subset)) for rownum in range(local_fi_score_test_subset.shape[0]): auroc.append(roc_auc_score(support, local_fi_score_test_subset.iloc[rownum,:])) auprc.append(average_precision_score(support, local_fi_score_test_subset.iloc[rownum,:])) @@ -172,7 +190,6 @@ def compare_estimators(estimators: List[ModelConfig], metric_results['test_AUROC'] = np.array(auroc).mean() metric_results['test_AUPRC'] = np.array(auprc).mean() metric_results['test_F1'] = np.array(f1).mean() - # print("done with metrics") # initialize results with metadata and metric results kwargs: dict = model.kwargs # dict @@ -185,7 +202,7 @@ def compare_estimators(estimators: List[ModelConfig], results[k].append(None) for met_name, met_val in metric_results.items(): results[met_name].append(met_val) - print("Done iterating over individual estimators") + print("done iterating over individual estimators") return results, feature_importance_list diff --git a/feature_importance/scripts/competing_methods_local.py b/feature_importance/scripts/competing_methods_local.py index c347d0d..999c823 100644 --- a/feature_importance/scripts/competing_methods_local.py +++ b/feature_importance/scripts/competing_methods_local.py @@ -10,11 +10,193 @@ import shap import lime import lime.lime_tabular -from imodels.importance.rf_plus import RandomForestPlusRegressor, RandomForestPlusClassifier -from imodels.importance.rf_plus import _fast_r2_score +from imodels.tree.rf_plus.rf_plus.rf_plus_models import RandomForestPlusRegressor, RandomForestPlusClassifier +from sklearn.ensemble import RandomForestRegressor +from imodels.tree.rf_plus.feature_importance.rfplus_explainer import * from sklearn.metrics import r2_score, mean_absolute_error, accuracy_score, roc_auc_score, mean_squared_error +# Feature Importance Methods for RF +def tree_shap_evaluation_RF(X_train, y_train, X_train_subset, y_train_subset, X_test, X_test_subset, fit): + """ + Compute 
average treeshap value across observations. + Larger absolute values indicate more important features. + :param X: design matrix + :param y: response + :param fit: fitted model of interest (tree-based) + :return: dataframe of shape: (n_samples, n_features) + """ + def add_abs(a, b): + return abs(a) + abs(b) + + subsets = [(X_train_subset, y_train_subset), (X_test, None), (X_test_subset, None)] + result_tables = [] + + explainer = shap.TreeExplainer(fit) + + for X_data, _ in subsets: + shap_values = explainer.shap_values(X_data, check_additivity=False) + if sklearn.base.is_classifier(fit): + # Shape values are returned as a list of arrays, one for each class + results = np.sum(np.abs(shap_values), axis=-1) + else: + results = np.abs(shap_values) + + result_tables.append(results) + + return tuple(result_tables) + +def LFI_evaluation_RF_MDI(X_train, y_train, X_train_subset, y_train_subset, X_test, X_test_subset, fit, **kwargs): + if isinstance(fit, RegressorMixin): + RFPlus = RandomForestPlusRegressor + elif isinstance(fit, ClassifierMixin): + RFPlus = RandomForestPlusClassifier + else: + raise ValueError("Unknown task.") + + rf_plus_model = RFPlus(rf_model=fit, **kwargs) + rf_plus_model.fit(X_train, y_train) + + subsets = [(X_train, y_train), (X_test, None), (X_test_subset, None)] + result_tables = [] + + for X_data, y_data in subsets: + if np.array_equal(X_data, X_train): + rf_plus_mdi = RFPlusMDI(rf_plus_model, evaluate_on="inbag") + else: + rf_plus_mdi = RFPlusMDI(rf_plus_model, evaluate_on="all") + num_samples, num_features = X_data.shape + local_feature_importances, partial_preds = rf_plus_mdi.explain(X=X_data, y=y_data) + abs_local_feature_importances = np.abs(local_feature_importances) + abs_partial_preds = np.abs(partial_preds) + result_tables.append(abs_local_feature_importances) + result_tables.append(abs_partial_preds) + return tuple(result_tables) + +def LFI_evaluation_RF_MDI_classification(X_train, y_train, X_train_subset, y_train_subset, X_test, X_test_subset, fit, **kwargs): + + rf = RandomForestRegressor(n_estimators=100, min_samples_leaf=3, max_features='sqrt', random_state=42) + rf.fit(X_train, y_train) + rf_plus_model = RandomForestPlusRegressor(rf_model=rf, **kwargs) + rf_plus_model.fit(X_train, y_train) + + subsets = [(X_train, y_train), (X_test, None), (X_test_subset, None)] + result_tables = [] + + for X_data, y_data in subsets: + if np.array_equal(X_data, X_train): + rf_plus_mdi = RFPlusMDI(rf_plus_model, evaluate_on="inbag") + else: + rf_plus_mdi = RFPlusMDI(rf_plus_model, evaluate_on="all") + num_samples, num_features = X_data.shape + local_feature_importances, partial_preds = rf_plus_mdi.explain(X=X_data, y=y_data) + abs_local_feature_importances = np.abs(local_feature_importances) + abs_partial_preds = np.abs(partial_preds) + result_tables.append(abs_local_feature_importances) + result_tables.append(abs_partial_preds) + return tuple(result_tables) + +def LFI_evaluation_RF_OOB(X_train, y_train, X_train_subset, y_train_subset, X_test, X_test_subset, fit, **kwargs): + if isinstance(fit, RegressorMixin): + RFPlus = RandomForestPlusRegressor + elif isinstance(fit, ClassifierMixin): + RFPlus = RandomForestPlusClassifier + else: + raise ValueError("Unknown task.") + + rf_plus_model = RFPlus(rf_model=fit, **kwargs) + rf_plus_model.fit(X_train, y_train) + + subsets = [(X_train, y_train), (X_test, None), (X_test_subset, None)] + result_tables = [] + + for X_data, y_data in subsets: + if np.array_equal(X_data, X_train): + rf_plus_mdi = AloRFPlusMDI(rf_plus_model, 
evaluate_on="oob") + else: + rf_plus_mdi = AloRFPlusMDI(rf_plus_model, evaluate_on="all") + num_samples, num_features = X_data.shape + local_feature_importances, partial_preds = rf_plus_mdi.explain(X=X_data, y=y_data) + abs_local_feature_importances = np.abs(local_feature_importances) + abs_partial_preds = np.abs(partial_preds) + result_tables.append(abs_local_feature_importances) + result_tables.append(abs_partial_preds) + return tuple(result_tables) + + +# Feature Importance Methods for RF+ +def LFI_evaluation_RF_plus(X_train, y_train, X_train_subset, y_train_subset, X_test, X_test_subset, fit): + assert isinstance(fit, RandomForestPlusRegressor) or isinstance(fit, RandomForestPlusClassifier) + subsets = [(X_train, y_train), (X_test, None), (X_test_subset, None)] + result_tables = [] + rf_plus_mdi = AloRFPlusMDI(fit, evaluate_on="all") + + for X_data, y_data in subsets: + num_samples, num_features = X_data.shape + local_feature_importances, partial_preds = rf_plus_mdi.explain(X=X_data, y=y_data) + abs_local_feature_importances = np.abs(local_feature_importances) + abs_partial_preds = np.abs(partial_preds) + result_tables.append(abs_local_feature_importances) + result_tables.append(abs_partial_preds) + + return tuple(result_tables) + +def LFI_evaluation_RF_plus_OOB(X_train, y_train, X_train_subset, y_train_subset, X_test, X_test_subset, fit): + assert isinstance(fit, RandomForestPlusRegressor) or isinstance(fit, RandomForestPlusClassifier) + subsets = [(X_train, y_train), (X_test, None), (X_test_subset, None)] + result_tables = [] + + for X_data, y_data in subsets: + num_samples, num_features = X_data.shape + if np.array_equal(X_data, X_train): + rf_plus_mdi = AloRFPlusMDI(fit, evaluate_on="oob") + else: + rf_plus_mdi = AloRFPlusMDI(fit, evaluate_on="all") + local_feature_importances, partial_preds = rf_plus_mdi.explain(X=X_data, y=y_data) + abs_local_feature_importances = np.abs(local_feature_importances) + abs_partial_preds = np.abs(partial_preds) + result_tables.append(abs_local_feature_importances) + result_tables.append(abs_partial_preds) + + return tuple(result_tables) + +def lime_evaluation_RF_plus(X_train, y_train, X_train_subset, y_train_subset, X_test, X_test_subset, fit): + assert isinstance(fit, RandomForestPlusRegressor) or isinstance(fit, RandomForestPlusClassifier) + subsets = [(X_train_subset, y_train_subset), (X_test_subset, None)] + result_tables = [] + + for X_data, _ in subsets: + num_samples, num_features = X_data.shape + rf_plus_lime = RFPlusLime(fit) + lime_values = rf_plus_lime.explain(X_train=X_train, X_test=X_data) + lime_scores = np.abs(lime_values) + result_tables.append(lime_scores) + + result_table_train_subset, result_table_test_subset = result_tables + + return result_table_train_subset, None, result_table_test_subset + + +def kernel_shap_evaluation_RF_plus(X_train, y_train, X_train_subset, y_train_subset, X_test, X_test_subset, fit): + assert isinstance(fit, RandomForestPlusRegressor) or isinstance(fit, RandomForestPlusClassifier) + subsets = [(X_train_subset, y_train_subset), (X_test_subset, None)] + result_tables = [] + + for X_data, _ in subsets: + num_samples, num_features = X_data.shape + rf_plus_kernel_shap = RFPlusKernelSHAP(fit) + kernel_shap_scores = rf_plus_kernel_shap.explain(X_train=X_train, X_test=X_data) + kernel_shap_scores = np.abs(kernel_shap_scores) + result_tables.append(kernel_shap_scores) + + result_table_train_subset, result_table_test_subset = result_tables + + return result_table_train_subset, None, result_table_test_subset + + +# 
result_table = pd.DataFrame(kernel_shap_scores, columns=[f'Feature_{i}' for i in range(num_features)]) +# result_tables.append(result_table) + # def MDI_local_sub_stumps(X, y, fit, scoring_fns="auto", return_stability_scores=False, **kwargs): # """ # Compute local MDI importance for each feature and sample. @@ -206,155 +388,6 @@ # return result_table - -def LFI_evaluation_RF(X_train, y_train, X_train_subset, y_train_subset, X_test, y_test, fit, data_fit_on, scoring_fns="auto", return_stability_scores=False, **kwargs): - assert data_fit_on in ["train_subset", "test"] - if data_fit_on == "train_subset": - X_data = X_train_subset - y_data = y_train_subset - else: - X_data = X_test - y_data = y_test - num_samples, num_features = X_data.shape - if isinstance(fit, RegressorMixin): - RFPlus = RandomForestPlusRegressor - elif isinstance(fit, ClassifierMixin): - RFPlus = RandomForestPlusClassifier - else: - raise ValueError("Unknown task.") - rf_plus_model = RFPlus(rf_model=fit, **kwargs) - rf_plus_model.fit(X_train, y_train) - - try: - mdi_plus_scores = rf_plus_model.get_mdi_plus_scores(X=X_data, y=y_data, lfi=True, lfi_abs="none", sample_split=None, train_or_test = "test")["lfi"].values - mdi_plus_scores = np.abs(mdi_plus_scores) - if return_stability_scores: - raise NotImplementedError - stability_scores = rf_plus_model.get_mdi_plus_stability_scores(B=25) - except ValueError as e: - if str(e) == 'Transformer representation was empty for all trees.': - mdi_plus_scores = np.zeros((num_samples, num_features)) - stability_scores = None - else: - raise - result_table = pd.DataFrame(mdi_plus_scores, columns=[f'Feature_{i}' for i in range(num_features)]) - - return result_table - - -def lime_evaluation_RF(X_train, y_train, X_train_subset, y_train_subset, X_test, y_test, fit, data_fit_on): - """ - Compute LIME local importance for each feature and sample. - Larger values indicate more important features. - :param X: design matrix - :param y: response - :param fit: fitted model of interest (tree-based) - :return: dataframe of shape: (n_samples, n_features) - - """ - assert data_fit_on in ["train_subset", "test"] - if data_fit_on == "train_subset": - X_data = X_train_subset - else: - X_data = X_test - if isinstance(fit, RegressorMixin): - mode='regression' - elif isinstance(fit, ClassifierMixin): - mode='classification' - np.random.seed(1) - num_samples, num_features = X_data.shape - result = np.zeros((num_samples, num_features)) - explainer = lime.lime_tabular.LimeTabularExplainer(X_train, verbose=False, mode=mode) - - if mode == 'classification': - if not hasattr(fit, 'predict_proba'): - raise ValueError("Classifier model must have predict_proba method") - - for i in range(num_samples): - if mode == 'classification': - predict_fn = fit.predict_proba - else: - predict_fn = fit.predict - exp = explainer.explain_instance(X_data[i], predict_fn, num_features=num_features) - original_feature_importance = exp.as_map()[1] - sorted_feature_importance = sorted(original_feature_importance, key=lambda x: x[0]) - for j in range(num_features): - result[i,j] = abs(sorted_feature_importance[j][1]) - # Convert the array to a DataFrame - result_table = pd.DataFrame(result, columns=[f'Feature_{i}' for i in range(num_features)]) - - return result_table - - -def tree_shap_evaluation_RF(X_train, y_train, X_train_subset, y_train_subset, X_test, y_test, fit, data_fit_on): - """ - Compute average treeshap value across observations. - Larger absolute values indicate more important features. 
- :param X: design matrix - :param y: response - :param fit: fitted model of interest (tree-based) - :return: dataframe of shape: (n_samples, n_features) - """ - assert data_fit_on in ["train_subset", "test"] - if data_fit_on == "train_subset": - X_data = X_train_subset - else: - X_data = X_test - explainer = shap.TreeExplainer(fit) - shap_values = explainer.shap_values(X_data, check_additivity=False) - if sklearn.base.is_classifier(fit): - # Shape values are returned as a list of arrays, one for each class - def add_abs(a, b): - return abs(a) + abs(b) - results = np.sum(np.abs(shap_values),axis=-1) - else: - results = abs(shap_values) - result_table = pd.DataFrame(results, columns=[f'Feature_{i}' for i in range(X_data.shape[1])]) - - return result_table - -def lime_evaluation_RF_plus(X_train, y_train, X_train_subset, y_train_subset, X_test, y_test, fit, data_fit_on): - assert data_fit_on in ["train_subset", "test"] - if data_fit_on == "train_subset": - X_data = X_train_subset - else: - X_data = X_test - num_samples, num_features = X_data.shape - lime_scores = fit.get_lime_scores(X_train, X_data).values - result_table = pd.DataFrame(lime_scores, columns=[f'Feature_{i}' for i in range(num_features)]) - - return result_table - - -def kernel_shap_evaluation_RF_plus(X_train, y_train, X_train_subset, y_train_subset, X_test, y_test, fit, data_fit_on): - assert data_fit_on in ["train_subset", "test"] - if data_fit_on == "train_subset": - X_data = X_train_subset - else: - X_data = X_test - num_samples, num_features = X_data.shape - kernel_shap_scores = fit.get_kernel_shap_scores(X_train, X_data) - result_table = pd.DataFrame(kernel_shap_scores, columns=[f'Feature_{i}' for i in range(num_features)]) - - return result_table - - -def LFI_evaluation_RF_plus(X_train, y_train, X_train_subset, y_train_subset, X_test, y_test, fit, data_fit_on): - assert data_fit_on in ["train_subset", "test"] - if data_fit_on == "train_subset": - X_data = X_train_subset - y_data = y_train_subset - else: - X_data = X_test - y_data = y_test - num_samples, num_features = X_data.shape - abs_lfi_scores = fit.get_mdi_plus_scores(X=X_data, y=y_data, lfi=True, lfi_abs="none", sample_split=None, train_or_test = "test")["lfi"].values - abs_lfi_scores = np.abs(abs_lfi_scores) - result_table = pd.DataFrame(abs_lfi_scores, columns=[f'Feature_{i}' for i in range(num_features)]) - - return result_table - - # def MDI_local_sub_stumps_evaluate(X_train, y_train, X_test, y_test, fit, scoring_fns="auto", return_stability_scores=False, **kwargs): # """ # Compute local MDI importance for each feature and sample. 
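Note on the rewritten explainers above: they all share one calling convention — each takes the full train/test splits plus the fixed row subsamples and returns per-sample importance arrays (and, for the MDI+ variants, partial-prediction arrays) for train, test, and test-subset data. The sketch below is not part of the diff; the synthetic data and subsample sizes are illustrative, and it assumes the updated imodels import path and the repo root both resolve on sys.path.

import numpy as np
from sklearn.ensemble import RandomForestRegressor
from imodels.tree.rf_plus.rf_plus.rf_plus_models import RandomForestPlusRegressor
from feature_importance.scripts.competing_methods_local import LFI_evaluation_RF_plus

rng = np.random.default_rng(0)
X_train = rng.normal(size=(200, 10))
X_test = rng.normal(size=(100, 10))
y_train = X_train[:, 0] + 0.1 * rng.normal(size=200)

# base RF with the hyperparameters used in fi_config, wrapped in RF+
rf = RandomForestRegressor(n_estimators=100, min_samples_leaf=5,
                           max_features=0.33, random_state=42)
rf.fit(X_train, y_train)
rf_plus = RandomForestPlusRegressor(rf_model=rf)
rf_plus.fit(X_train, y_train)

# fixed row subsamples, mirroring the runner scripts (sizes are arbitrary here)
idx_train = rng.choice(X_train.shape[0], 50, replace=False)
idx_test = rng.choice(X_test.shape[0], 25, replace=False)

(fi_train, pp_train,
 fi_test, pp_test,
 fi_test_sub, pp_test_sub) = LFI_evaluation_RF_plus(
    X_train=X_train, y_train=y_train,
    X_train_subset=X_train[idx_train], y_train_subset=y_train[idx_train],
    X_test=X_test, X_test_subset=X_test[idx_test],
    fit=rf_plus)

# each fi_* entry is an (n_samples, n_features) array of absolute local scores
print(fi_train.shape, fi_test.shape, fi_test_sub.shape)

The plain-RF variants (tree_shap_evaluation_RF, LFI_evaluation_RF_MDI, LFI_evaluation_RF_OOB) take the fitted forest itself as `fit` and build the RF+ wrapper or SHAP explainer internally, whereas the RF+ variants expect an already-fitted RandomForestPlus model, matching how the runner scripts pass `rf_plus_base`.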
diff --git a/feature_importance/scripts/simulations_util.py b/feature_importance/scripts/simulations_util.py
index d76a220..9a106ef 100644
--- a/feature_importance/scripts/simulations_util.py
+++ b/feature_importance/scripts/simulations_util.py
@@ -4,33 +4,42 @@
 from scipy.linalg import toeplitz
 import warnings
 import math
+import imodels
+import openml
-
-def sample_real_data(X_fpath=None, y_fpath=None, seed=4307, normalize=False,
-                     sample_row_n=None, sample_col_n=None, return_data=None,
-                     return_support=True):
-
-    assert return_data in ["X", "y"]
+def sample_real_data_X(source=None, data_name=None, task_id=None, seed=4307, normalize=False, sample_row_n=None):
     np.random.seed(seed)
-    if return_data == "X":
-        X = pd.read_csv(X_fpath)
-        if normalize:
-            X = (X - X.mean()) / X.std()
-        if sample_row_n is not None:
-            keep_idx = np.random.choice(X.shape[0], sample_row_n, replace=False)
-            X = X.iloc[keep_idx, :]
-        if sample_col_n is not None:
-            X = X.sample(n=sample_col_n, replace=False, axis=1)
-        return X.to_numpy()
-    else:
-        y = pd.read_csv(y_fpath)
-        if sample_row_n is not None:
-            keep_idx = np.random.choice(y.shape[0], sample_row_n, replace=False)
-            y = y.iloc[keep_idx, :]
-        if return_support:
-            return y.to_numpy().flatten(), np.ones(y.shape[1]), None
-        return y.to_numpy().flatten()
+    if source == "imodels":
+        X, _, _ = imodels.get_clean_dataset(data_name)
+    elif source == "openml":
+        task = openml.tasks.get_task(task_id)
+        dataset_id = task.dataset_id
+        dataset = openml.datasets.get_dataset(dataset_id)
+        X, _, _, _ = dataset.get_data(target=dataset.default_target_attribute, dataset_format="array")
+    if normalize:
+        X = (X - X.mean()) / X.std()
+    if sample_row_n is not None:
+        keep_idx = np.random.choice(X.shape[0], sample_row_n, replace=False)
+        X = X[keep_idx, :]
+    return X
+
+def sample_real_data_y(X=None, source=None, data_name=None, task_id=None,
+                       seed=4307, sample_row_n=None, return_support=True):
+    np.random.seed(seed)
+    if source == "imodels":
+        _, y, _ = imodels.get_clean_dataset(data_name)
+    elif source == "openml":
+        task = openml.tasks.get_task(task_id)
+        dataset_id = task.dataset_id
+        dataset = openml.datasets.get_dataset(dataset_id)
+        _, y, _, _ = dataset.get_data(target=dataset.default_target_attribute,dataset_format="array")
+    if sample_row_n is not None:
+        keep_idx = np.random.choice(y.shape[0], sample_row_n, replace=False)
+        y = y[keep_idx, :]
+    if return_support:
+        return y, np.ones(y.shape), None
+    return y
 
 def sample_real_X(fpath=None, X=None, seed=None, normalize=True,
                   sample_row_n=None, sample_col_n=None, permute_col=True,
@@ -218,6 +227,7 @@ def create_y(x, s, beta):
         for j in range(s):
             linear_term += x[j] * beta[j]
         return linear_term
+
     beta = generate_coef(beta, s)
     y_train = np.array([create_y(X[i, :], s, beta) for i in range(len(X))])
     if heritability is not None:
@@ -261,7 +271,7 @@ def create_y(x, s, beta):
         y_train = y_train + sigma * error_fun(n)
     corrupt_idx = np.random.choice(range(s, p), size=1)
     y_train = corrupt_leverage(X[:, corrupt_idx], y_train, mean_shift=corrupt_mean, corrupt_quantile=corrupt_quantile, mode="normal")
-    print("beta:", beta)
+
     if return_support:
         support = np.concatenate((np.ones(s), np.zeros(X.shape[1] - s)))
         return y_train, support, beta
@@ -1144,4 +1154,3 @@ def __array_finalize__(self, obj):
         return
     self.index = getattr(obj, 'index', None)
-#%%
diff --git a/feature_importance/test_error.py b/feature_importance/test_error.py
new file mode 100644
index 0000000..e69de29
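A hypothetical usage sketch of the sample_real_data_X / sample_real_data_y loaders added to simulations_util.py above. The dataset name, OpenML task id, and import path are placeholders and assumptions, not values taken from this repo's configs.

# Placeholder dataset name and task id; the import assumes the repo root is on sys.path.
from feature_importance.scripts.simulations_util import sample_real_data_X, sample_real_data_y

# imodels source: X and y are pulled from the same named dataset with the same seed.
X = sample_real_data_X(source="imodels", data_name="diabetes", seed=4307, normalize=False)
y, support, _ = sample_real_data_y(source="imodels", data_name="diabetes", seed=4307)

# openml source: the task id resolves to a dataset and its default target attribute.
X_oml = sample_real_data_X(source="openml", task_id=3, normalize=True)

Note that when sample_row_n is passed, sample_real_data_y subsamples with y[keep_idx, :], which assumes a 2-D target; for the 1-D arrays these sources typically return, y[keep_idx] would be needed, so the sketch above leaves sample_row_n unset.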
diff --git a/feature_importance/util.py b/feature_importance/util.py
index cb95e03..bfd9e85 100644
--- a/feature_importance/util.py
+++ b/feature_importance/util.py
@@ -13,7 +13,6 @@
 from sklearn.preprocessing import label_binarize
 from sklearn.utils._encode import _unique
 from sklearn import metrics
-from imodels.importance.ppms import huber_loss
 
 DATASET_PATH = oj(dirname(os.path.realpath(__file__)), 'data')
 
@@ -144,8 +143,8 @@ def neg_mean_absolute_error(y_true, y_pred, **kwargs):
     return -mean_absolute_error(y_true, y_pred, **kwargs)
 
 
-def neg_huber_loss(y_true, y_pred, **kwargs):
-    return -huber_loss(y_true, y_pred, **kwargs)
+# def neg_huber_loss(y_true, y_pred, **kwargs):
+#     return -huber_loss(y_true, y_pred, **kwargs)
 
 
 def restricted_roc_auc_score(y_true, y_score, ignored_indices=[]):
@@ -197,4 +196,4 @@ def apply_splitting_strategy(X: np.ndarray,
         X_train, X_tune, y_train, y_tune = model_selection.train_test_split(
             X_train, y_train, test_size=0.2, random_state=split_seed)
 
-    return X_train, X_tune, X_test, y_train, y_tune, y_test
+    return X_train, X_tune, X_test, y_train, y_tune, y_test
\ No newline at end of file
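Since the huber_loss import from imodels.importance.ppms is dropped and neg_huber_loss is only commented out, any scorer config that still references neg_huber_loss would need a substitute. One possible stand-in is sketched below; the assumption that the original computed a mean Huber loss over residuals, and the delta=1.0 default, are mine rather than the repo's.

import numpy as np
from scipy.special import huber


def neg_huber_loss(y_true, y_pred, delta=1.0, **kwargs):
    # scipy.special.huber(delta, r) is 0.5*r**2 for |r| <= delta, else delta*(|r| - 0.5*delta).
    residuals = np.asarray(y_true) - np.asarray(y_pred)
    return -np.mean(huber(delta, residuals))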