feature ranking stuff

Yu-Group · Apr 21, 2024 · 0a7508a · 0a7508a
1 parent 9ab24a6
commit 0a7508a
Show file tree

Hide file tree

Showing 8 changed files with 1,314 additions and 22 deletions.
diff --git a/feature_importance/feature_ranking.sh b/feature_importance/feature_ranking.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+#SBATCH [email protected]
+#SBATCH --mail-type=ALL
+#SBATCH --partition=yugroup
+
+source activate mdi
+command="run_importance_local_sims.py --nreps 1 --config mdi_local.real_x_sim_y --split_seed 1 --ignore_cache --create_rmd --result_name feature_ranking"
+
+# Execute the command
+python $command
diff --git a/feature_importance/fi_config/mdi_local/real_x_sim_y/dgp.py b/feature_importance/fi_config/mdi_local/real_x_sim_y/dgp.py
@@ -0,0 +1,43 @@
+import sys
+sys.path.append("../..")
+from feature_importance.scripts.simulations_util import *
+
+
+X_DGP = sample_real_data
+X_PARAMS_DICT = {  # active X source: the Diabetes regression CSV; the Fico/Juvenile variants below are commented out
+    "X_fpath": "../data/regression_data/Diabetes_regression/X_diabetes_regression.csv",
+    "sample_row_n": None,  # also listed in VARY_PARAM_NAME further down this file
+    "return_data": "X"
+}
+# X_PARAMS_DICT = {
+#     "X_fpath": "../data/classification_data/Fico/X_fico.csv",
+#     "sample_row_n": None,
+#     "return_data": "X"
+# }
+# X_PARAMS_DICT = {
+#     "X_fpath": "../data/classification_data/Juvenile/X_juvenile.csv",
+#     "sample_row_n": None,
+#     "return_data": "X"
+# }
+Y_DGP = linear_model
+Y_PARAMS_DICT = {  # settings for the simulated response; the y_fpath variants below pair with the commented-out Fico/Juvenile X files
+    "beta": 1,
+    "sigma": None,
+    "heritability": 0.4,  # also listed in VARY_PARAM_NAME further down this file
+    "s": 4
+}
+# Y_PARAMS_DICT = {
+#     "y_fpath": "../data/classification_data/Fico/y_fico.csv",
+#     "return_data": "y"
+# }
+# Y_PARAMS_DICT = {
+#     "y_fpath": "../data/classification_data/Juvenile/y_juvenile.csv",
+#     "return_data": "y"
+# }
+
+# parameters to vary: heritability and sample_row_n (display label -> value for each)
+VARY_PARAM_NAME = ["heritability", "sample_row_n"]
+VARY_PARAM_VALS = {"heritability": {"0.1": 0.1, "0.2": 0.2,
+                                    "0.4": 0.4, "0.8": 0.8},
+                   "sample_row_n": {"100": 100, "200": 200,
+                                    "300": 300, "442": 442}}  # 442 is presumably the full Diabetes row count; TODO confirm
diff --git a/feature_importance/fi_config/mdi_local/real_x_sim_y/models.py b/feature_importance/fi_config/mdi_local/real_x_sim_y/models.py
@@ -0,0 +1,25 @@
+import copy
+import numpy as np
+from feature_importance.util import ModelConfig, FIModelConfig
+from sklearn.ensemble import RandomForestRegressor
+from imodels.importance.rf_plus import RandomForestPlusRegressor
+from feature_importance.scripts.competing_methods_local import *
+
+
+
+ESTIMATORS = [
+    [ModelConfig('RF', RandomForestRegressor, model_type='tree',
+                other_params={'n_estimators': 100, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'random_state': 42})],
+    [ModelConfig('RF_plus', RandomForestPlusClassifier, model_type='t_plus',
+                other_params={'rf_model': RandomForestRegressor(n_estimators=100, min_samples_leaf=1, max_features='sqrt', random_state=42)})]
+]
+
+FI_ESTIMATORS = [
+    [FIModelConfig('LFI_with_raw_RF', LFI_evaluation_RF, model_type='tree', splitting_strategy = "train-test")],
+    [FIModelConfig('MDI_RF', LFI_evaluation_RF, model_type='tree', splitting_strategy = "train-test", other_params={"include_raw": False, "cv_ridge": 0, "calc_loo_coef":False, "sample_split":"inbag"})],
+    [FIModelConfig('LFI_with_raw_OOB_RF', LFI_evaluation_RF, model_type='tree', splitting_strategy = "train-test", other_params={"sample_split":"oob", "fit_on":"test", "calc_loo_coef":False})],
+    [FIModelConfig('TreeSHAP_RF', tree_shap_evaluation_RF, model_type='tree', splitting_strategy = "train-test")],
+    [FIModelConfig('LFI_with_raw_RF_plus', LFI_evaluation_RF_plus, model_type='t_plus', splitting_strategy = "train-test")],
+    [FIModelConfig('Kernel_SHAP_RF_plus', kernel_shap_evaluation_RF_plus, model_type='t_plus', splitting_strategy = "train-test")],
+    # [FIModelConfig('LIME_RF_plus', lime_evaluation_RF_plus, model_type='t_plus', splitting_strategy = "train-test")],
+]
diff --git a/feature_importance/fi_config/mdi_local/synthetic_data/models.py b/feature_importance/fi_config/mdi_local/synthetic_data/models.py
@@ -4,26 +4,17 @@
 # N_ESTIMATORS=[50, 100, 500, 1000]
 ESTIMATORS = [
     [ModelConfig('RF', RandomForestRegressor, model_type='tree',
-                 other_params={'n_estimators': 100, 'min_samples_leaf': 5, 'max_features': 0.33})],
-    # [ModelConfig('RF', RandomForestRegressor, model_type='tree', vary_param="n_estimators", vary_param_val=m,
-    #              other_params={'min_samples_leaf': 5, 'max_features': 0.33}) for m in N_ESTIMATORS]
+                other_params={'n_estimators': 100, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'random_state': 42})],
+    [ModelConfig('RF_plus', RandomForestPlusClassifier, model_type='t_plus',  # NOTE(review): classifier wrapper around a RandomForestRegressor; RandomForestPlusRegressor looks intended, confirm it is importable here
+                other_params={'rf_model': RandomForestRegressor(n_estimators=100, min_samples_leaf=1, max_features='sqrt', random_state=42)})]
 ]
 
 FI_ESTIMATORS = [
-    [FIModelConfig('MDI_local_all_stumps_evaluate', MDI_local_all_stumps_evaluate, ascending = False, splitting_strategy = "train-test", model_type='tree')],
-    # [FIModelConfig('MDI_sub_stumps', MDI_local_sub_stumps, ascending = False, model_type='tree')],
-    [FIModelConfig('MDI_local_all_stumps_evaluate_without_raw', MDI_local_all_stumps_evaluate, ascending = False, splitting_strategy = "train-test", model_type='tree', other_params={"include_raw": False})],
-    # [FIModelConfig('MDI_sub_stumps_without_raw', MDI_local_sub_stumps, ascending = False, model_type='tree', other_params={"include_raw": False})],
-    # [FIModelConfig('LFI_sum_absolute', LFI_sum_absolute, model_type='tree', splitting_strategy = "train-test")],
-    [FIModelConfig('LFI_absolute_sum_evaluate', LFI_absolute_sum_evaluate, model_type='tree', splitting_strategy = "train-test")],
-    # [FIModelConfig('LFI_sum_absolute_sub_stumps', LFI_sum_absolute_sub_stumps, model_type='tree')],
-    # [FIModelConfig('LFI_absolute_sum_sub_stumps', LFI_absolute_sum_sub_stumps, model_type='tree')],
-    # [FIModelConfig('LFI_sum_absolute_without_raw', LFI_sum_absolute, model_type='tree', splitting_strategy = "train-test", other_params={"include_raw": False})],
-    [FIModelConfig('LFI_absolute_sum_evaluate_without_raw', LFI_absolute_sum_evaluate, model_type='tree', splitting_strategy = "train-test", other_params={"include_raw": False})],
-    # [FIModelConfig('LFI_sum_absolute_sub_stumps_without_raw', LFI_sum_absolute_sub_stumps, model_type='tree', other_params={"include_raw": False})],
-    # [FIModelConfig('LFI_absolute_sum_sub_stumps_without_raw', LFI_absolute_sum_sub_stumps, model_type='tree', other_params={"include_raw": False})],
-    [FIModelConfig('TreeSHAP', tree_shap_local, model_type='tree', splitting_strategy = "train-test")],
-    [FIModelConfig('LIME', lime_local, model_type='tree', splitting_strategy = "train-test")],
-]
-
-# [FIModelConfig('Permutation', permutation_local, model_type='tree')],
+    [FIModelConfig('LFI_with_raw_RF', LFI_evaluation_RF, model_type='tree', splitting_strategy = "train-test")],  # evaluator called without other_params
+    [FIModelConfig('MDI_RF', LFI_evaluation_RF, model_type='tree', splitting_strategy = "train-test", other_params={"include_raw": False, "cv_ridge": 0, "calc_loo_coef":False, "sample_split":"inbag"})],  # same evaluator as LFI_with_raw_RF, reconfigured through other_params
+    [FIModelConfig('LFI_with_raw_OOB_RF', LFI_evaluation_RF, model_type='tree', splitting_strategy = "train-test", other_params={"sample_split":"oob", "fit_on":"test", "calc_loo_coef":False})],  # same evaluator again, with the "oob" sample split
+    [FIModelConfig('TreeSHAP_RF', tree_shap_evaluation_RF, model_type='tree', splitting_strategy = "train-test")],
+    [FIModelConfig('LFI_with_raw_RF_plus', LFI_evaluation_RF_plus, model_type='t_plus', splitting_strategy = "train-test")],  # 't_plus' entries share a model_type with the RF_plus estimator
+    [FIModelConfig('Kernel_SHAP_RF_plus', kernel_shap_evaluation_RF_plus, model_type='t_plus', splitting_strategy = "train-test")],
+    [FIModelConfig('LIME_RF_plus', lime_evaluation_RF_plus, model_type='t_plus', splitting_strategy = "train-test")],  # enabled here; the real_x_sim_y config keeps this entry commented out
+]
diff --git a/feature_importance/run_importance_local_sims.ipynb b/feature_importance/run_importance_local_sims.ipynb