Skip to content

Commit

Permalink
update
Browse files: browse the repository at this point in the history
  • Loading branch information
zyliang2001 committed Apr 15, 2024
1 parent c1b55f6 commit 9ab24a6
Show file tree
Hide file tree
Showing 14 changed files with 11,254 additions and 5,165 deletions.
2 changes: 1 addition & 1 deletion feature_importance/01_ablation_classification_script.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
#SBATCH --partition=yugroup

source activate mdi
command="01_run_ablation_classification.py --nreps 1 --config mdi_local.real_data_classification --split_seed ${1} --ignore_cache --create_rmd --result_name Diabetes_classification_parallel"
command="01_run_ablation_classification.py --nreps 1 --config mdi_local.real_data_classification --split_seed ${1} --ignore_cache --create_rmd --result_name fico"

# Execute the command
python $command
8 changes: 2 additions & 6 deletions feature_importance/01_ablation_regression_script.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,7 @@
#SBATCH --partition=yugroup

source activate mdi
command="01_run_ablation_regression.py --nreps 1 --config mdi_local.real_data_regression --split_seed ${1} --ignore_cache --create_rmd --result_name diabetes_regression"
command="01_run_ablation_regression.py --nreps 1 --config mdi_local.real_data_regression --split_seed ${1} --ignore_cache --create_rmd --result_name satellite_image"

# Execute the command
python $command



python OLD_XX.py --nreps 1 --config mdi_local.real_data_regression --split_seed ${1} --ignore_cache --create_rmd --result_name diabetes_regression"
python $command
66 changes: 43 additions & 23 deletions feature_importance/01_run_ablation_classification.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
from collections import defaultdict
from typing import Callable, List, Tuple
import itertools
from sklearn.metrics import roc_auc_score, f1_score, recall_score, precision_score, mean_squared_error
from sklearn.metrics import roc_auc_score, f1_score, recall_score, precision_score, mean_squared_error, average_precision_score
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegressionCV
Expand Down Expand Up @@ -150,9 +150,10 @@ def compare_estimators(estimators: List[ModelConfig],
# fit model
est.fit(X_train, y_train)
test_all_auc = roc_auc_score(y_test, est.predict_proba(X_test)[:, 1])
test_all_auprc = auprc_score(y_test, est.predict_proba(X_test)[:, 1])
test_all_auprc = average_precision_score(y_test, est.predict_proba(X_test)[:, 1])
test_all_f1 = f1_score(y_test, est.predict_proba(X_test)[:, 1] > 0.5)

np.random.seed(42)
indices_train = np.random.choice(X_train.shape[0], 100, replace=False)
indices_test = np.random.choice(X_test.shape[0], 100, replace=False)
X_train_subset = X_train[indices_train]
Expand Down Expand Up @@ -183,21 +184,29 @@ def compare_estimators(estimators: List[ModelConfig],
metric_results[f'ablation_seed_{i}'] = seeds[i]
start = time.time()
local_fi_score_train_subset = fi_est.cls(X_train=X_train, y_train=y_train,
X_test=X_test, y_test=y_test,
fit=copy.deepcopy(est), data_fit_on="train", **fi_est.kwargs)
local_fi_score_test = fi_est.cls(X_train=X_train, y_train=y_train,
X_test=X_test, y_test=y_test,
fit=copy.deepcopy(est), data_fit_on="test", **fi_est.kwargs)
local_fi_score_test_subset = None
X_train_subset = X_train_subset, y_train_subset=y_train_subset,
X_test=X_test, y_test=y_test,
fit=copy.deepcopy(est), data_fit_on="train_subset", **fi_est.kwargs)
if fi_est.name not in ["LIME_RF_plus", "Kernel_SHAP_RF_plus"]:
local_fi_score_test = fi_est.cls(X_train=X_train, y_train=y_train,
X_train_subset = X_train_subset, y_train_subset=y_train_subset,
X_test=X_test, y_test=y_test,
fit=copy.deepcopy(est), data_fit_on="test", **fi_est.kwargs)
else:
local_fi_score_test = None
local_fi_score_test_subset = fi_est.cls(X_train=X_train, y_train=y_train,
X_train_subset = X_train_subset, y_train_subset=y_train_subset,
X_test=X_test_subset, y_test=y_test_subset,
fit=copy.deepcopy(est), data_fit_on="test", **fi_est.kwargs)
end = time.time()
metric_results['fi_time'] = end - start
feature_importance_list.append(local_fi_score_train_subset)
# feature_importance_list.append(local_fi_score_train_subset)
feature_importance_list.append(local_fi_score_test)
feature_importance_list.append(local_fi_score_test_subset)

ablation_models = {"RF_Classifier": RandomForestClassifier(n_estimators=100, min_samples_leaf=1, max_features='sqrt', random_state=42),
"Logistic": LogisticRegressionCV(),
"SVM": SVC(probability=True),
"LogisticCV": LogisticRegressionCV(random_state=42),
"SVM": SVC(random_state=42, probability=True),
"XGBoost_Classifier": xgb.XGBClassifier(random_state=42),
"RF_Plus_Classifier": RandomForestPlusClassifier(rf_model=RandomForestClassifier(n_estimators=100, min_samples_leaf=1, max_features='sqrt', random_state=42))}

Expand All @@ -208,7 +217,7 @@ def compare_estimators(estimators: List[ModelConfig],
ablation_est.fit(X_train, y_train)
y_pred = ablation_est.predict_proba(X_train_subset)[:, 1]
metric_results[a_model+'_train_subset_AUROC_before_ablation'] = roc_auc_score(y_train_subset, y_pred)
metric_results[a_model+'_train_subset_AUPRC_before_ablation'] = auprc_score(y_train_subset, y_pred)
metric_results[a_model+'_train_subset_AUPRC_before_ablation'] = average_precision_score(y_train_subset, y_pred)
metric_results[a_model+'_train_subset_F1_before_ablation'] = f1_score(y_train_subset, y_pred > 0.5)
imp_vals = copy.deepcopy(local_fi_score_train_subset)
imp_vals[imp_vals == float("-inf")] = -sys.maxsize - 1
Expand All @@ -223,7 +232,7 @@ def compare_estimators(estimators: List[ModelConfig],
else:
ablation_X_train_subset = ablation_to_mean(X_train, X_train_subset, imp_vals, "min", i+1)
ablation_results_auroc_list[i] += roc_auc_score(y_train_subset, ablation_est.predict_proba(ablation_X_train_subset)[:, 1])
ablation_results_auprc_list[i] += auprc_score(y_train_subset, ablation_est.predict_proba(ablation_X_train_subset)[:, 1])
ablation_results_auprc_list[i] += average_precision_score(y_train_subset, ablation_est.predict_proba(ablation_X_train_subset)[:, 1])
ablation_results_f1_list[i] += f1_score(y_train_subset, ablation_est.predict_proba(ablation_X_train_subset)[:, 1] > 0.5)
ablation_results_f1_list = [x / number_of_ablations for x in ablation_results_f1_list]
ablation_results_auroc_list = [x / number_of_ablations for x in ablation_results_auroc_list]
Expand All @@ -241,9 +250,9 @@ def compare_estimators(estimators: List[ModelConfig],
for a_model in ablation_models:
ablation_est = ablation_models[a_model]
ablation_est.fit(X_train, y_train)
y_pred_subset = est.predict_proba(X_test_subset)[:, 1]
y_pred_subset = ablation_est.predict_proba(X_test_subset)[:, 1]
metric_results[a_model+'_test_subset_AUROC_before_ablation'] = roc_auc_score(y_test_subset, y_pred_subset)
metric_results[a_model+'_test_subset_AUPRC_before_ablation'] = auprc_score(y_test_subset, y_pred_subset)
metric_results[a_model+'_test_subset_AUPRC_before_ablation'] = average_precision_score(y_test_subset, y_pred_subset)
metric_results[a_model+'_test_subset_F1_before_ablation'] = f1_score(y_test_subset, y_pred_subset > 0.5)
imp_vals = copy.deepcopy(local_fi_score_test_subset)
imp_vals[imp_vals == float("-inf")] = -sys.maxsize - 1
Expand All @@ -258,7 +267,7 @@ def compare_estimators(estimators: List[ModelConfig],
else:
ablation_X_test_subset = ablation_to_mean(X_train, X_test_subset, imp_vals, "min", i+1)
ablation_results_auroc_list[i] += roc_auc_score(y_test_subset, ablation_est.predict_proba(ablation_X_test_subset)[:, 1])
ablation_results_auprc_list[i] += auprc_score(y_test_subset, ablation_est.predict_proba(ablation_X_test_subset)[:, 1])
ablation_results_auprc_list[i] += average_precision_score(y_test_subset, ablation_est.predict_proba(ablation_X_test_subset)[:, 1])
ablation_results_f1_list[i] += f1_score(y_test_subset, ablation_est.predict_proba(ablation_X_test_subset)[:, 1] > 0.5)
ablation_results_f1_list = [x / number_of_ablations for x in ablation_results_f1_list]
ablation_results_auroc_list = [x / number_of_ablations for x in ablation_results_auroc_list]
Expand All @@ -277,7 +286,7 @@ def compare_estimators(estimators: List[ModelConfig],
ablation_est = ablation_models[a_model]
ablation_est.fit(X_train, y_train)
metric_results[a_model+'_test_subset_AUROC_before_ablation_blank'] = roc_auc_score(y_test_subset, ablation_est.predict(np.zeros(X_test_subset.shape)))
metric_results[a_model+'_test_subset_AUPRC_before_ablation_blank'] = auprc_score(y_test_subset, ablation_est.predict(np.zeros(X_test_subset.shape)))
metric_results[a_model+'_test_subset_AUPRC_before_ablation_blank'] = average_precision_score(y_test_subset, ablation_est.predict(np.zeros(X_test_subset.shape)))
metric_results[a_model+'_test_subset_F1_before_ablation_blank'] = f1_score(y_test_subset, ablation_est.predict(np.zeros(X_test_subset.shape)) > 0.5)
imp_vals = copy.deepcopy(local_fi_score_test_subset)
imp_vals[imp_vals == float("-inf")] = -sys.maxsize - 1
Expand All @@ -292,10 +301,11 @@ def compare_estimators(estimators: List[ModelConfig],
else:
ablation_X_test_subset_blank = ablation_by_addition(X_test_subset, imp_vals, "min", i+1)
ablation_results_auroc_list[i] += roc_auc_score(y_test_subset, ablation_est.predict_proba(ablation_X_test_subset_blank)[:, 1])
ablation_results_auprc_list[i] += auprc_score(y_test_subset, ablation_est.predict_proba(ablation_X_test_subset_blank)[:, 1])
ablation_results_auprc_list[i] += average_precision_score(y_test_subset, ablation_est.predict_proba(ablation_X_test_subset_blank)[:, 1])
ablation_results_f1_list[i] += f1_score(y_test_subset, ablation_est.predict_proba(ablation_X_test_subset_blank)[:, 1] > 0.5)
ablation_results_list = [x / len(seeds) for x in ablation_results_list]
ablation_results_list_r2 = [x / len(seeds) for x in ablation_results_list_r2]
ablation_results_f1_list = [x / number_of_ablations for x in ablation_results_f1_list]
ablation_results_auroc_list = [x / number_of_ablations for x in ablation_results_auroc_list]
ablation_results_auprc_list = [x / number_of_ablations for x in ablation_results_auprc_list]
for i in range(X_test_subset.shape[1]):
metric_results[f'{a_model}_test_subset_AUROC_after_ablation_{i+1}_blank'] = ablation_results_auroc_list[i]
metric_results[f'{a_model}_test_subset_AUPRC_after_ablation_{i+1}_blank'] = ablation_results_auprc_list[i]
Expand All @@ -309,9 +319,9 @@ def compare_estimators(estimators: List[ModelConfig],
for a_model in ablation_models:
ablation_est = ablation_models[a_model]
ablation_est.fit(X_train, y_train)
y_pred = est.predict_proba(X_test)[:, 1]
y_pred = ablation_est.predict_proba(X_test)[:, 1]
metric_results[a_model+'_test_AUROC_before_ablation'] = roc_auc_score(y_test, y_pred)
metric_results[a_model+'_test_AUPRC_before_ablation'] = auprc_score(y_test, y_pred)
metric_results[a_model+'_test_AUPRC_before_ablation'] = average_precision_score(y_test, y_pred)
metric_results[a_model+'_test_F1_before_ablation'] = f1_score(y_test, y_pred > 0.5)
imp_vals = copy.deepcopy(local_fi_score_test)
imp_vals[imp_vals == float("-inf")] = -sys.maxsize - 1
Expand All @@ -326,7 +336,7 @@ def compare_estimators(estimators: List[ModelConfig],
else:
ablation_X_test = ablation_to_mean(X_train, X_test, imp_vals, "min", i+1)
ablation_results_auroc_list[i] += roc_auc_score(y_test, ablation_est.predict_proba(ablation_X_test)[:, 1])
ablation_results_auprc_list[i] += auprc_score(y_test, ablation_est.predict_proba(ablation_X_test)[:, 1])
ablation_results_auprc_list[i] += average_precision_score(y_test, ablation_est.predict_proba(ablation_X_test)[:, 1])
ablation_results_f1_list[i] += f1_score(y_test, ablation_est.predict_proba(ablation_X_test)[:, 1] > 0.5)
ablation_results_f1_list = [x / number_of_ablations for x in ablation_results_f1_list]
ablation_results_auroc_list = [x / number_of_ablations for x in ablation_results_auroc_list]
Expand All @@ -337,6 +347,16 @@ def compare_estimators(estimators: List[ModelConfig],
metric_results[f'{a_model}_test_F1_after_ablation_{i+1}'] = ablation_results_f1_list[i]
end = time.time()
metric_results['test_data_ablation_time'] = end - start
else:
for a_model in ablation_models:
metric_results[a_model+'_test_AUROC_before_ablation'] = None
metric_results[a_model+'_test_AUPRC_before_ablation'] = None
metric_results[a_model+'_test_F1_before_ablation'] = None
for i in range(X_test.shape[1]):
metric_results[f'{a_model}_test_AUROC_after_ablation_{i+1}'] = None
metric_results[f'{a_model}_test_AUPRC_after_ablation_{i+1}'] = None
metric_results[f'{a_model}_test_F1_after_ablation_{i+1}'] = None
metric_results["test_data_ablation_time"] = None
print(f"fi: {fi_est.name} ablation done with time: {end - start}")

# initialize results with metadata and metric results
Expand Down
33 changes: 25 additions & 8 deletions feature_importance/01_run_ablation_regression.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@
from sklearn.linear_model import LinearRegression
import xgboost as xgb
from imodels.importance import RandomForestPlusRegressor, RandomForestPlusClassifier

sys.path.append(".")
sys.path.append("..")
sys.path.append("../..")
Expand Down Expand Up @@ -150,6 +149,7 @@ def compare_estimators(estimators: List[ModelConfig],
test_all_mse = mean_squared_error(y_test, est.predict(X_test))
test_all_r2 = r2_score(y_test, est.predict(X_test))

np.random.seed(42)
indices_train = np.random.choice(X_train.shape[0], 100, replace=False)
indices_test = np.random.choice(X_test.shape[0], 100, replace=False)
X_train_subset = X_train[indices_train]
Expand Down Expand Up @@ -179,15 +179,23 @@ def compare_estimators(estimators: List[ModelConfig],
metric_results[f'ablation_seed_{i}'] = seeds[i]
start = time.time()
local_fi_score_train_subset = fi_est.cls(X_train=X_train, y_train=y_train,
X_test=X_test, y_test=y_test,
fit=copy.deepcopy(est), data_fit_on="train", **fi_est.kwargs)
local_fi_score_test = fi_est.cls(X_train=X_train, y_train=y_train,
X_test=X_test, y_test=y_test,
fit=copy.deepcopy(est), data_fit_on="test", **fi_est.kwargs)
local_fi_score_test_subset = None
X_train_subset = X_train_subset, y_train_subset=y_train_subset,
X_test=X_test, y_test=y_test,
fit=copy.deepcopy(est), data_fit_on="train_subset", **fi_est.kwargs)
if fi_est.name not in ["LIME_RF_plus", "Kernel_SHAP_RF_plus"]:
local_fi_score_test = fi_est.cls(X_train=X_train, y_train=y_train,
X_train_subset = X_train_subset, y_train_subset=y_train_subset,
X_test=X_test, y_test=y_test,
fit=copy.deepcopy(est), data_fit_on="test", **fi_est.kwargs)
else:
local_fi_score_test = None
local_fi_score_test_subset = fi_est.cls(X_train=X_train, y_train=y_train,
X_train_subset = X_train_subset, y_train_subset=y_train_subset,
X_test=X_test_subset, y_test=y_test_subset,
fit=copy.deepcopy(est), data_fit_on="test", **fi_est.kwargs)
end = time.time()
metric_results['fi_time'] = end - start
feature_importance_list.append(local_fi_score_train_subset)
# feature_importance_list.append(local_fi_score_train_subset)
feature_importance_list.append(local_fi_score_test)
feature_importance_list.append(local_fi_score_test_subset)

Expand Down Expand Up @@ -313,6 +321,15 @@ def compare_estimators(estimators: List[ModelConfig],
metric_results[f'{a_model}_test_R_2_after_ablation_{i+1}'] = ablation_results_list_r2[i]
end = time.time()
metric_results['test_data_ablation_time'] = end - start
else:
for a_model in ablation_models:
metric_results[a_model + '_test_MSE_before_ablation'] = None
metric_results[a_model + '_test_R_2_before_ablation'] = None
for i in range(X_test.shape[1]):
metric_results[f'{a_model}_test_MSE_after_ablation_{i+1}'] = None
metric_results[f'{a_model}_test_R_2_after_ablation_{i+1}'] = None
metric_results["test_data_ablation_time"] = None

print(f"fi: {fi_est.name} ablation done with time: {end - start}")

# initialize results with metadata and metric results
Expand Down
Binary file added feature_importance/diabetes_regression_test.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added feature_importance/diabetes_regression_train.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading

0 comments on commit 9ab24a6

Please sign in to comment.