Commit: add changes
zyliang2001 committed Jun 11, 2024
1 parent 3430b61 commit 029ead3
Showing 16 changed files with 871 additions and 42,420 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -47,3 +47,4 @@ _site
 temp.ipynb
 **.png
 data
+saved_models
4 changes: 2 additions & 2 deletions feature_importance/01_ablation_classification_script.sh
@@ -4,7 +4,7 @@
 #SBATCH --partition=yugroup
 
 source activate mdi
-command="01_run_ablation_classification.py --nreps 1 --config mdi_local.real_data_classification --split_seed ${1} --ignore_cache --create_rmd --result_name diabetes_simplify"
-# command="01_run_ablation_classification.py --nreps 1 --config mdi_local.real_data_classification --split_seed ${1} --ignore_cache --create_rmd --result_name Enhancer --ablate_features 20"
+# Need to specify --result_name, --ablate_features (default: all features), and --fitted (default: not fitted)
+command="01_run_ablation_classification.py --nreps 1 --config mdi_local.real_data_classification --split_seed ${1} --ignore_cache --create_rmd --ablate_features 20 --result_name Juvenile --fitted True"
 # Execute the command
 python $command
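
For reference, a minimal sketch of how the three flags named in the comment above could be declared in 01_run_ablation_classification.py. This is hypothetical: only the flag names appear in this diff, so the types and defaults below are assumptions.

import argparse

parser = argparse.ArgumentParser()
# --result_name names the output subdirectory under saved_models/ (assumed)
parser.add_argument("--result_name", type=str, default=None)
# --ablate_features: how many features to ablate; None means all features (assumed)
parser.add_argument("--ablate_features", type=int, default=None)
# --fitted: a non-empty value such as "True" reuses previously fitted models (assumed)
parser.add_argument("--fitted", type=str, default=None)
args = parser.parse_args()
print(args.result_name, args.ablate_features, args.fitted)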
4 changes: 2 additions & 2 deletions feature_importance/01_ablation_regression_script.sh
@@ -4,8 +4,8 @@
 #SBATCH --partition=yugroup
 
 source activate mdi
-command="01_run_ablation_regression.py --nreps 1 --config mdi_local.real_data_regression --split_seed ${1} --ignore_cache --create_rmd --result_name diabetes_test_new"
-#command="01_run_ablation_regression.py --nreps 1 --config mdi_local.real_data_regression --split_seed ${1} --ignore_cache --create_rmd --result_name CCLE_AZD0530_new --ablate_features 20"
+# Need to specify --result_name, --ablate_features (default: all features), and --fitted (default: not fitted)
+command="01_run_ablation_regression.py --nreps 1 --config mdi_local.real_data_regression --split_seed ${1} --ignore_cache --create_rmd --ablate_features 20 --result_name CCLE_2000"
 
 # Execute the command
 python $command
3 changes: 2 additions & 1 deletion feature_importance/01_ablation_script_class.sh
@@ -2,7 +2,8 @@
 
 slurm_script="01_ablation_classification_script.sh"
 
-for rep in {1..2}
+for rep in {1..10}
 do
     sbatch $slurm_script $rep # Submit SLURM job using the specified script
+    sleep 10
 done
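
For reference, a rough Python equivalent of this submission loop (a sketch only; the repo drives this from bash, and sbatch must be on PATH):

import subprocess
import time

# Submit one SLURM job per split seed, pausing between submissions
# so the scheduler is not flooded (mirrors the bash loop above).
for rep in range(1, 11):
    subprocess.run(["sbatch", "01_ablation_classification_script.sh", str(rep)], check=True)
    time.sleep(10)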
3 changes: 2 additions & 1 deletion feature_importance/01_ablation_script_regr.sh
@@ -2,7 +2,8 @@
 
 slurm_script="01_ablation_regression_script.sh"
 
-for rep in {1..2}
+for rep in {1..10}
 do
     sbatch $slurm_script $rep # Submit SLURM job using the specified script
+    sleep 5
 done
52 changes: 24 additions & 28 deletions feature_importance/01_run_ablation_classification.py
@@ -113,7 +113,7 @@ def compare_estimators(estimators: List[ModelConfig],
 
 # initialize results
 results = defaultdict(lambda: [])
-feature_importance_list = []
+feature_importance_list = {}
 
 # loop over model estimators
 for model in estimators:
@@ -177,32 +177,30 @@ def compare_estimators(estimators: List[ModelConfig],
 test_all_auc_rf_plus_oob = roc_auc_score(y_test, rf_plus_base_oob.predict_proba(X_test)[:, 1])
 test_all_auprc_rf_plus_oob = average_precision_score(y_test, rf_plus_base_oob.predict_proba(X_test)[:, 1])
 test_all_f1_rf_plus_oob = f1_score(y_test, rf_plus_base_oob.predict_proba(X_test)[:, 1] > 0.5)
-test_all_auc_rf_plus_inbag = roc_auc_score(y_test, rf_plus_base_inbag.predict_proba(X_test)[:, 1])
-test_all_auprc_rf_plus_inbag = average_precision_score(y_test, rf_plus_base_inbag.predict_proba(X_test)[:, 1])
-test_all_f1_rf_plus_inbag = f1_score(y_test, rf_plus_base_inbag.predict_proba(X_test)[:, 1] > 0.5)
 
 fitted_results = {
     "Model": ["RF", "RF_plus", "RF_plus_oob", "RF_plus_inbag"],
-    "AUC": [test_all_auc_rf, test_all_auc_rf_plus, test_all_auc_rf_plus_oob, test_all_auc_rf_plus_inbag],
-    "AUPRC": [test_all_auprc_rf, test_all_auprc_rf_plus, test_all_auprc_rf_plus_oob, test_all_auprc_rf_plus_inbag],
-    "F1": [test_all_f1_rf, test_all_f1_rf_plus, test_all_f1_rf_plus_oob, test_all_f1_rf_plus_inbag],
+    "AUC": [test_all_auc_rf, test_all_auc_rf_plus, test_all_auc_rf_plus_oob, None],
+    "AUPRC": [test_all_auprc_rf, test_all_auprc_rf_plus, test_all_auprc_rf_plus_oob, None],
+    "F1": [test_all_f1_rf, test_all_f1_rf_plus, test_all_f1_rf_plus_oob, None],
     "Time": [end_rf - start_rf, end_rf_plus - start_rf_plus, end_rf_plus_oob - start_rf_plus_oob, end_rf_plus_inbag - start_rf_plus_inbag]
 }
 
+os.makedirs(f"/scratch/users/zhongyuan_liang/saved_models/{args.result_name}", exist_ok=True)
 results_df = pd.DataFrame(fitted_results)
-results_df.to_csv(f"./saved_models/{args.result_name}/RFPlus_fitted_summary_{args.split_seed}.csv", index=False)
+results_df.to_csv(f"/scratch/users/zhongyuan_liang/saved_models/{args.result_name}/RFPlus_fitted_summary_{args.split_seed}.csv", index=False)
 
 
-pickle_file = f"./saved_models/{args.result_name}/RF_{args.split_seed}.dill"
+pickle_file = f"/scratch/users/zhongyuan_liang/saved_models/{args.result_name}/RF_{args.split_seed}.dill"
 with open(pickle_file, 'wb') as file:
     dill.dump(est, file)
-pickle_file = f"./saved_models/{args.result_name}/RFPlus_default_{args.split_seed}.dill"
+pickle_file = f"/scratch/users/zhongyuan_liang/saved_models/{args.result_name}/RFPlus_default_{args.split_seed}.dill"
 with open(pickle_file, 'wb') as file:
     dill.dump(rf_plus_base, file)
-pickle_file = f"./saved_models/{args.result_name}/RFPlus_oob_{args.split_seed}.dill"
+pickle_file = f"/scratch/users/zhongyuan_liang/saved_models/{args.result_name}/RFPlus_oob_{args.split_seed}.dill"
 with open(pickle_file, 'wb') as file:
     dill.dump(rf_plus_base_oob, file)
-pickle_file = f"./saved_models/{args.result_name}/RFPlus_inbag_{args.split_seed}.dill"
+pickle_file = f"/scratch/users/zhongyuan_liang/saved_models/{args.result_name}/RFPlus_inbag_{args.split_seed}.dill"
 with open(pickle_file, 'wb') as file:
     dill.dump(rf_plus_base_inbag, file)
@@ -232,18 +230,18 @@ def compare_estimators(estimators: List[ModelConfig],
 
 print("Load Models")
 start = time.time()
-with open(f"./saved_models/{args.result_name}/RFPlus_default_{args.split_seed}.dill", 'rb') as file:
+with open(f"/scratch/users/zhongyuan_liang/saved_models/{args.result_name}/RFPlus_default_{args.split_seed}.dill", 'rb') as file:
     rf_plus_base = dill.load(file)
 if fi_est.base_model == "None":
     pass
 elif fi_est.base_model == "RF":
-    with open(f"./saved_models/{args.result_name}/RF_{args.split_seed}.dill", 'rb') as file:
+    with open(f"/scratch/users/zhongyuan_liang/saved_models/{args.result_name}/RF_{args.split_seed}.dill", 'rb') as file:
         loaded_model = dill.load(file)
 elif fi_est.base_model == "RFPlus_oob":
-    with open(f"./saved_models/{args.result_name}/RFPlus_oob_{args.split_seed}.dill", 'rb') as file:
+    with open(f"/scratch/users/zhongyuan_liang/saved_models/{args.result_name}/RFPlus_oob_{args.split_seed}.dill", 'rb') as file:
         loaded_model = dill.load(file)
 elif fi_est.base_model == "RFPlus_inbag":
-    with open(f"./saved_models/{args.result_name}/RFPlus_inbag_{args.split_seed}.dill", 'rb') as file:
+    with open(f"/scratch/users/zhongyuan_liang/saved_models/{args.result_name}/RFPlus_inbag_{args.split_seed}.dill", 'rb') as file:
         loaded_model = dill.load(file)
 elif fi_est.base_model == "RFPlus_default":
     loaded_model = rf_plus_base
@@ -263,15 +261,13 @@ def compare_estimators(estimators: List[ModelConfig],
 local_fi_score_train, local_fi_score_train_subset, local_fi_score_test, local_fi_score_test_subset = fi_est.cls(X_train=X_train, y_train=y_train, X_train_subset=X_train_subset, y_train_subset=y_train_subset,
     X_test=X_test, y_test=y_test, X_test_subset=X_test_subset, y_test_subset=y_test_subset,
    fit=loaded_model)
-if fi_est.name.startswith("LFI"):
+if fi_est.name.startswith("Local_MDI+"):
     local_fi_score_train_subset = local_fi_score_train[indices_train]
 end = time.time()
 metric_results['fi_time'] = end - start
 print(f"done with feature importance: {end - start}")
 
-feature_importance_list.append(local_fi_score_train_subset)
-feature_importance_list.append(local_fi_score_test)
-feature_importance_list.append(local_fi_score_test_subset)
+feature_importance_list[fi_est.name] = [local_fi_score_train_subset, local_fi_score_test, local_fi_score_test_subset]
 
 # prepare ablations
 print("start ablation")
@@ -288,11 +284,8 @@ def compare_estimators(estimators: List[ModelConfig],
 metric_results['ablation_model_fit_time'] = end - start
 print(f"done with ablation model fit: {end - start}")
 
-local_fi_score_train_subset_rank = None
-local_fi_score_test_subset_rank = None
-local_fi_score_test_rank = None
 all_fi = [local_fi_score_train_subset, local_fi_score_test_subset, local_fi_score_test]
-all_fi_rank = [local_fi_score_train_subset_rank, local_fi_score_test_subset_rank, local_fi_score_test_rank]
+all_fi_rank = [None, None, None]
 for i in range(len(all_fi)):
     fi = all_fi[i]
     if isinstance(fi, np.ndarray):
@@ -303,9 +296,12 @@ def compare_estimators(estimators: List[ModelConfig],
     else:
         all_fi_rank[i] = np.argsort(fi)
 
-ablation_datas = {"train_subset": (X_train_subset, y_train_subset, local_fi_score_train_subset_rank),
-                  "test_subset": (X_test_subset, y_test_subset, local_fi_score_test_subset_rank),
-                  "test": (X_test, y_test, local_fi_score_test_rank)}
+feature_importance_list[fi_est.name].extend(all_fi_rank)
+
+ablation_datas = {"train_subset": (X_train_subset, y_train_subset, all_fi_rank[0]),
+                  "test_subset": (X_test_subset, y_test_subset, all_fi_rank[1]),
+                  "test": (X_test, y_test, all_fi_rank[2])}
+
 num_ablate_features = args.ablate_features
 if num_ablate_features is None:
     num_ablate_features = X_train.shape[1]
@@ -464,7 +460,7 @@ def run_comparison(path: str,
         df = df.drop(columns=[col])
 
 for i in range(len(feature_importance_all)):
-    pkl.dump(fi_lst[i], open(feature_importance_all[i], 'wb'))
+    pkl.dump(list(fi_lst.items())[i], open(feature_importance_all[i], 'wb'))
 
 for model_comparison_file, fi_estimator in zip(model_comparison_files, fi_estimators):
     output_dict = {
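
Net effect of the changes in this file: feature_importance_list is now a dict keyed by fi_est.name, holding the three raw local-importance arrays followed by their three argsort rankings, and each output pickle stores one (name, arrays) item. A self-contained sketch under those assumptions (dummy arrays stand in for fi_est.cls(...) output; the (n_samples, n_features) shape is an assumption):

import numpy as np

rng = np.random.default_rng(0)
# Stand-ins for local feature-importance scores on each evaluation set.
fi_train_subset = rng.normal(size=(5, 4))
fi_test = rng.normal(size=(8, 4))
fi_test_subset = rng.normal(size=(5, 4))

feature_importance_list = {}          # was a flat list before this commit
est_name = "Local_MDI+_example"       # hypothetical fi_est.name

# Raw scores first, in the order used above: train_subset, test, test_subset.
feature_importance_list[est_name] = [fi_train_subset, fi_test, fi_test_subset]

# Per-sample feature orderings via argsort, in the order train_subset, test_subset, test.
all_fi = [fi_train_subset, fi_test_subset, fi_test]
all_fi_rank = [np.argsort(fi) for fi in all_fi]
feature_importance_list[est_name].extend(all_fi_rank)

for name, arrays in feature_importance_list.items():
    print(name, [a.shape for a in arrays])

Note that the score triple and the rank triple are ordered differently (test and test_subset swap places), so consumers of the pickle need to index accordingly.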
34 changes: 18 additions & 16 deletions feature_importance/01_run_ablation_regression.py
@@ -114,7 +114,7 @@ def compare_estimators(estimators: List[ModelConfig],
 
 # initialize results
 results = defaultdict(lambda: [])
-feature_importance_list = []
+feature_importance_list = {}
 
 # loop over model estimators
 for model in estimators:
@@ -180,21 +180,23 @@ def compare_estimators(estimators: List[ModelConfig],
     "MSE": [test_all_mse_rf, test_all_mse_rf_plus, test_all_mse_rf_plus_oob, test_all_mse_rf_plus_inbag],
     "R2": [test_all_r2_rf, test_all_r2_rf_plus, test_all_r2_rf_plus_oob, test_all_r2_rf_plus_inbag],
     "Time": [end_rf - start_rf, end_rf_plus - start_rf_plus, end_rf_plus_oob - start_rf_plus_oob, end_rf_plus_inbag - start_rf_plus_inbag]
-}
+}
 
+os.makedirs(f"/scratch/users/zhongyuan_liang/saved_models/{args.result_name}", exist_ok=True)
 results_df = pd.DataFrame(fitted_results)
-results_df.to_csv(f"./saved_models/{args.result_name}/RFPlus_fitted_summary_{args.split_seed}.csv", index=False)
+results_df.to_csv(f"/scratch/users/zhongyuan_liang/saved_models/{args.result_name}/RFPlus_fitted_summary_{args.split_seed}.csv", index=False)
+
 
-pickle_file = f"./saved_models/{args.result_name}/RF_{args.split_seed}.dill"
+pickle_file = f"/scratch/users/zhongyuan_liang/saved_models/{args.result_name}/RF_{args.split_seed}.dill"
 with open(pickle_file, 'wb') as file:
     dill.dump(est, file)
-pickle_file = f"./saved_models/{args.result_name}/RFPlus_default_{args.split_seed}.dill"
+pickle_file = f"/scratch/users/zhongyuan_liang/saved_models/{args.result_name}/RFPlus_default_{args.split_seed}.dill"
 with open(pickle_file, 'wb') as file:
     dill.dump(rf_plus_base, file)
-pickle_file = f"./saved_models/{args.result_name}/RFPlus_oob_{args.split_seed}.dill"
+pickle_file = f"/scratch/users/zhongyuan_liang/saved_models/{args.result_name}/RFPlus_oob_{args.split_seed}.dill"
 with open(pickle_file, 'wb') as file:
     dill.dump(rf_plus_base_oob, file)
-pickle_file = f"./saved_models/{args.result_name}/RFPlus_inbag_{args.split_seed}.dill"
+pickle_file = f"/scratch/users/zhongyuan_liang/saved_models/{args.result_name}/RFPlus_inbag_{args.split_seed}.dill"
 with open(pickle_file, 'wb') as file:
     dill.dump(rf_plus_base_inbag, file)
@@ -225,18 +227,18 @@ def compare_estimators(estimators: List[ModelConfig],
 
 print("Load Models")
 start = time.time()
-with open(f"./saved_models/{args.result_name}/RFPlus_default_{args.split_seed}.dill", 'rb') as file:
+with open(f"/scratch/users/zhongyuan_liang/saved_models/{args.result_name}/RFPlus_default_{args.split_seed}.dill", 'rb') as file:
     rf_plus_base = dill.load(file)
 if fi_est.base_model == "None":
     pass
 elif fi_est.base_model == "RF":
-    with open(f"./saved_models/{args.result_name}/RF_{args.split_seed}.dill", 'rb') as file:
+    with open(f"/scratch/users/zhongyuan_liang/saved_models/{args.result_name}/RF_{args.split_seed}.dill", 'rb') as file:
         loaded_model = dill.load(file)
 elif fi_est.base_model == "RFPlus_oob":
-    with open(f"./saved_models/{args.result_name}/RFPlus_oob_{args.split_seed}.dill", 'rb') as file:
+    with open(f"/scratch/users/zhongyuan_liang/saved_models/{args.result_name}/RFPlus_oob_{args.split_seed}.dill", 'rb') as file:
         loaded_model = dill.load(file)
 elif fi_est.base_model == "RFPlus_inbag":
-    with open(f"./saved_models/{args.result_name}/RFPlus_inbag_{args.split_seed}.dill", 'rb') as file:
+    with open(f"/scratch/users/zhongyuan_liang/saved_models/{args.result_name}/RFPlus_inbag_{args.split_seed}.dill", 'rb') as file:
         loaded_model = dill.load(file)
 elif fi_est.base_model == "RFPlus_default":
     loaded_model = rf_plus_base
@@ -256,15 +258,13 @@ def compare_estimators(estimators: List[ModelConfig],
 local_fi_score_train, local_fi_score_train_subset, local_fi_score_test, local_fi_score_test_subset = fi_est.cls(X_train=X_train, y_train=y_train, X_train_subset=X_train_subset, y_train_subset=y_train_subset,
     X_test=X_test, y_test=y_test, X_test_subset=X_test_subset, y_test_subset=y_test_subset,
    fit=loaded_model)
-if fi_est.name.startswith("LFI"):
+if fi_est.name.startswith("Local_MDI+"):
     local_fi_score_train_subset = local_fi_score_train[indices_train]
 end = time.time()
 metric_results['fi_time'] = end - start
 print(f"done with feature importance: {end - start}")
 
-feature_importance_list.append(local_fi_score_train_subset)
-feature_importance_list.append(local_fi_score_test)
-feature_importance_list.append(local_fi_score_test_subset)
+feature_importance_list[fi_est.name] = [local_fi_score_train_subset, local_fi_score_test, local_fi_score_test_subset]
 
 # prepare ablations
 print("start ablation")
@@ -292,6 +292,8 @@ def compare_estimators(estimators: List[ModelConfig],
     else:
         all_fi_rank[i] = np.argsort(fi)
 
+feature_importance_list[fi_est.name].extend(all_fi_rank)
+
 ablation_datas = {"train_subset": (X_train_subset, y_train_subset, all_fi_rank[0]),
                   "test_subset": (X_test_subset, y_test_subset, all_fi_rank[1]),
                   "test": (X_test, y_test, all_fi_rank[2])}
@@ -442,7 +444,7 @@ def run_comparison(path: str,
         df = df.drop(columns=[col])
 
 for i in range(len(feature_importance_all)):
-    pkl.dump(fi_lst[i], open(feature_importance_all[i], 'wb'))
+    pkl.dump(list(fi_lst.items())[i], open(feature_importance_all[i], 'wb'))
 
 for model_comparison_file, fi_estimator in zip(model_comparison_files, fi_estimators):
    output_dict = {
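
Because run_comparison now dumps list(fi_lst.items())[i], each feature-importance file holds a (estimator name, arrays) tuple rather than a bare array. A round-trip sketch (file name and array contents are illustrative only):

import pickle as pkl
import numpy as np

fi_lst = {"Local_MDI+_example": [np.zeros((5, 4)), np.zeros((8, 4))]}
feature_importance_all = ["feature_importance_example.pkl"]  # hypothetical path

# Dump exactly as the updated loop does: one (name, arrays) item per file.
for i in range(len(feature_importance_all)):
    pkl.dump(list(fi_lst.items())[i], open(feature_importance_all[i], 'wb'))

# Loading recovers the (name, arrays) tuple.
with open(feature_importance_all[0], 'rb') as f:
    est_name, arrays = pkl.load(f)
print(est_name, [a.shape for a in arrays])

As a design note, rebuilding list(fi_lst.items()) on every iteration is quadratic in the number of estimators; zipping feature_importance_all with fi_lst.items() would pair files and items directly, assuming the two sequences are aligned.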