Commit

clean and add lasso and ridge
zyliang2001 committed Dec 2, 2024
1 parent 01b374b commit c314921
Showing 61 changed files with 3,370 additions and 6,563 deletions.
feature_importance/00_ablation_regression_ranking_script.sh
@@ -4,7 +4,7 @@
#SBATCH --partition=yugroup
source activate mdi
# Need to specify --result_name --ablate_features(default all features) --fitted(default not fitted)
command="00_run_ablation_regression_stability.py --nreps 1 --config mdi_local.real_data_regression_temperature_retrain --split_seed ${1} --ignore_cache --create_rmd --folder_name temperature_stability"
command="00_run_feature_ranking_simulation.py --nreps 1 --config mdi_local.real_data_regression_${1}_${2} --split_seed 1 --y_seed ${3} --ignore_cache --create_rmd --folder_name ${1}_${2}"

# Execute the command
python $command
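
When the driver loop added below in 00_ablation_script_regr_ranking.sh submits this script with, for example, temperature, linear, and 3 as its three positional arguments, the command expands to:

python 00_run_feature_ranking_simulation.py --nreps 1 --config mdi_local.real_data_regression_temperature_linear --split_seed 1 --y_seed 3 --ignore_cache --create_rmd --folder_name temperature_linear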
10 changes: 0 additions & 10 deletions feature_importance/00_ablation_regression_script.sh

This file was deleted.

10 changes: 0 additions & 10 deletions feature_importance/00_ablation_regression_script2.sh

This file was deleted.

10 changes: 0 additions & 10 deletions feature_importance/00_ablation_regression_script3.sh

This file was deleted.

10 changes: 0 additions & 10 deletions feature_importance/00_ablation_regression_script4.sh

This file was deleted.

feature_importance/00_ablation_regression_selection_script.sh
@@ -4,7 +4,7 @@
#SBATCH --partition=yugroup
source activate mdi
# Need to specify --result_name --ablate_features(default all features) --fitted(default not fitted)
command="00_run_ablation_regression_stability.py --nreps 1 --config mdi_local.real_data_regression_CCLE_PD_0325901_retrain --split_seed ${1} --ignore_cache --create_rmd --folder_name CCLE_PD_0325901_stability"
command="00_run_ablation_regression_selection.py --nreps 1 --config mdi_local.real_data_regression_${1} --split_seed ${2} --rf_seed ${3} --ignore_cache --create_rmd --folder_name ${1}_selection --fit_model True"

# Execute the command
python $command
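
Unlike the ranking and stability variants, this script passes --fit_model True and an explicit --rf_seed, so each (dataset, split_seed, rf_seed) job refits the RF and RF+ models and writes into a saved_models/<folder_name> directory on scratch (see the Python changes below) rather than relying on previously fitted models.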
feature_importance/00_ablation_regression_stability_script.sh
@@ -4,7 +4,7 @@
#SBATCH --partition=yugroup
source activate mdi
# Need to specify --result_name --ablate_features(default all features) --fitted(default not fitted)
command="00_run_ablation_regression_stability.py --nreps 1 --config mdi_local.real_data_regression_parkinsons_retrain --split_seed ${1} --ignore_cache --create_rmd --folder_name parkinsons_stability"
command="00_run_ablation_regression_stability.py --nreps 1 --config mdi_local.real_data_regression_${1} --split_seed ${2} --ignore_cache --create_rmd --folder_name ${1}_stability"

# Execute the command
python $command
10 changes: 0 additions & 10 deletions feature_importance/00_ablation_regression_stability_script2.sh

This file was deleted.

10 changes: 0 additions & 10 deletions feature_importance/00_ablation_script_regr.sh

This file was deleted.

12 changes: 12 additions & 0 deletions feature_importance/00_ablation_script_regr_ranking.sh
@@ -0,0 +1,12 @@
#!/bin/bash

slurm_script="00_ablation_regression_ranking_script.sh"

for data_name in "temperature" "performance" "parkinsons" "CCLE_PD_0325901"; do
for dgp in "linear" "lss" "poly"; do
for y_seed in {1..10}; do
sbatch $slurm_script $data_name $dgp $y_seed
sleep 2
done
done
done
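This loop submits 4 datasets × 3 DGPs × 10 response seeds = 120 ranking jobs, spaced two seconds apart.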
12 changes: 12 additions & 0 deletions feature_importance/00_ablation_script_regr_selection.sh
@@ -0,0 +1,12 @@
#!/bin/bash

slurm_script="00_ablation_regression_selection_script.sh"

for data_name in "temperature" "performance" "parkinsons" "CCLE_PD_0325901"; do
for split_seed in {1..3}; do
for rf_seed in {1..3}; do
sbatch $slurm_script $data_name $split_seed $rf_seed
sleep 2
done
done
done
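The selection grid is 4 datasets × 3 data-split seeds × 3 RF seeds = 36 jobs, matching the --split_seed and --rf_seed arguments consumed by the SLURM script above.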
10 changes: 6 additions & 4 deletions feature_importance/00_ablation_script_regr_stability.sh
@@ -1,8 +1,10 @@
#!/bin/bash

slurm_script="00_ablation_regression_stability_script4.sh"
slurm_script="00_ablation_regression_stability_script.sh"

for split_seed in {1..3}; do
sbatch $slurm_script $split_seed # Submit SLURM job with both split_seed and rf_seed as arguments
sleep 2
for data_name in "temperature" "performance" "parkinsons" "CCLE_PD_0325901"; do
for split_seed in {1..3}; do
sbatch $slurm_script $data_name $split_seed
sleep 2
done
done
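With the added dataset loop, the stability driver now submits 4 × 3 = 12 jobs from the single parameterized script, in place of hard-coding a dataset-specific script (previously 00_ablation_regression_stability_script4.sh).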
@@ -22,7 +22,7 @@
from sklearn.linear_model import LinearRegression
import xgboost as xgb
from imodels.tree.rf_plus.rf_plus.rf_plus_models import RandomForestPlusRegressor, RandomForestPlusClassifier
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import RidgeCV, LassoCV
sys.path.append(".")
sys.path.append("..")
sys.path.append("../..")
@@ -60,7 +60,6 @@ def compare_estimators(estimators: List[ModelConfig],

# initialize results
results = defaultdict(lambda: [])
feature_importance_list = {"positive": {}, "negative": {}, "absolute": {}}

# loop over model estimators
for model in estimators:
@@ -90,55 +89,31 @@ def compare_estimators(estimators: List[ModelConfig],
if args.fit_model:
print("Fitting Models")
# fit RF model
start_rf = time.time()
est = RandomForestRegressor(n_estimators=100, min_samples_leaf=5, max_features=0.33, random_state=args.rf_seed)
est.fit(X_train, y_train)
end_rf = time.time()

# fit default RF_plus model
start_rf_plus = time.time()
rf_plus_base = RandomForestPlusRegressor(rf_model=est)
rf_plus_base.fit(X_train, y_train)
end_rf_plus = time.time()

# fit oob RF_plus model
start_rf_plus_oob = time.time()
rf_plus_base_oob = RandomForestPlusRegressor(rf_model=est, fit_on="oob")
rf_plus_base_oob.fit(X_train, y_train)
end_rf_plus_oob = time.time()

#fit inbag RF_plus model
start_rf_plus_inbag = time.time()
rf_plus_base_inbag = RandomForestPlusRegressor(rf_model=est, include_raw=False, fit_on="inbag", prediction_model=LinearRegression())
rf_plus_base_inbag.fit(X_train, y_train)
end_rf_plus_inbag = time.time()


# fit default RF_plus model
rf_plus_base_ridge = RandomForestPlusRegressor(rf_model=est, prediction_model=RidgeCV(cv=5))
rf_plus_base_ridge.fit(X_train, y_train)
rf_plus_base_oob_ridge = RandomForestPlusRegressor(rf_model=est, fit_on="oob", prediction_model=RidgeCV(cv=5))
rf_plus_base_oob_ridge.fit(X_train, y_train)
rf_plus_base_inbag_ridge = RandomForestPlusRegressor(rf_model=est, include_raw=False, fit_on="inbag", prediction_model=RidgeCV(cv=5))
rf_plus_base_inbag_ridge.fit(X_train, y_train)


rf_plus_base_lasso = RandomForestPlusRegressor(rf_model=est, prediction_model=LassoCV(cv=5, max_iter=5000))
rf_plus_base_lasso.fit(X_train, y_train)

# get test results
test_all_mse_rf = mean_squared_error(y_test, est.predict(X_test))
test_all_r2_rf = r2_score(y_test, est.predict(X_test))
test_all_mse_rf_plus = mean_squared_error(y_test, rf_plus_base.predict(X_test))
test_all_r2_rf_plus = r2_score(y_test, rf_plus_base.predict(X_test))
test_all_mse_rf_plus_oob = mean_squared_error(y_test, rf_plus_base_oob.predict(X_test))
test_all_r2_rf_plus_oob = r2_score(y_test, rf_plus_base_oob.predict(X_test))
test_all_mse_rf_plus_inbag = mean_squared_error(y_test, rf_plus_base_inbag.predict(X_test))
test_all_r2_rf_plus_inbag = r2_score(y_test, rf_plus_base_inbag.predict(X_test))
test_all_mse_rf_plus_ridge = mean_squared_error(y_test, rf_plus_base_ridge.predict(X_test))
test_all_r2_rf_plus_ridge = r2_score(y_test, rf_plus_base_ridge.predict(X_test))
test_all_mse_rf_plus_lasso = mean_squared_error(y_test, rf_plus_base_lasso.predict(X_test))
test_all_r2_rf_plus_lasso = r2_score(y_test, rf_plus_base_lasso.predict(X_test))

fitted_results = {
"Model": ["RF", "RF_plus", "RF_plus_oob", "RF_plus_inbag"],
"MSE": [test_all_mse_rf, test_all_mse_rf_plus, test_all_mse_rf_plus_oob, test_all_mse_rf_plus_inbag],
"R2": [test_all_r2_rf, test_all_r2_rf_plus, test_all_r2_rf_plus_oob, test_all_r2_rf_plus_inbag],
"Time": [end_rf - start_rf, end_rf_plus - start_rf_plus, end_rf_plus_oob - start_rf_plus_oob, end_rf_plus_inbag - start_rf_plus_inbag]
"Model": ["RF", "RF_plus", "RF_plus_ridge", "RF_plus_lasso"],
"MSE": [test_all_mse_rf, test_all_mse_rf_plus, test_all_mse_rf_plus_ridge, test_all_mse_rf_plus_lasso],
"R2": [test_all_r2_rf, test_all_r2_rf_plus, test_all_r2_rf_plus_ridge, test_all_r2_rf_plus_lasso]
}

os.makedirs(f"/scratch/users/zhongyuan_liang/saved_models/{args.folder_name}", exist_ok=True)
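
For orientation, here is a condensed sketch of the model set this hunk keeps, i.e. the base RF plus the RF+ default, ridge, and lasso variants reported in fitted_results. It uses only constructors and calls that appear in the diff; the fit_and_score wrapper itself is illustrative and not part of the commit, and the saving of models and results to scratch is omitted.

import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import RidgeCV, LassoCV
from sklearn.metrics import mean_squared_error, r2_score
from imodels.tree.rf_plus.rf_plus.rf_plus_models import RandomForestPlusRegressor

def fit_and_score(X_train, y_train, X_test, y_test, rf_seed=0):
    # base RF with the same hyperparameters as in the diff
    est = RandomForestRegressor(n_estimators=100, min_samples_leaf=5,
                                max_features=0.33, random_state=rf_seed)
    est.fit(X_train, y_train)
    # the four models reported in fitted_results: RF, RF_plus, RF_plus_ridge, RF_plus_lasso
    models = {
        "RF": est,
        "RF_plus": RandomForestPlusRegressor(rf_model=est),
        "RF_plus_ridge": RandomForestPlusRegressor(rf_model=est, prediction_model=RidgeCV(cv=5)),
        "RF_plus_lasso": RandomForestPlusRegressor(rf_model=est, prediction_model=LassoCV(cv=5, max_iter=5000)),
    }
    scores = {}
    for name, model in models.items():
        if name != "RF":
            model.fit(X_train, y_train)  # RF+ variants are fit on the same training split
        preds = model.predict(X_test)
        scores[name] = {"MSE": mean_squared_error(y_test, preds),
                        "R2": r2_score(y_test, preds)}
    return models, scores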
@@ -166,34 +141,25 @@ def compare_estimators(estimators: List[ModelConfig],
loaded_model = None
elif fi_est.base_model == "RF":
loaded_model = est
elif fi_est.base_model == "RFPlus_oob":
loaded_model = rf_plus_base_oob
elif fi_est.base_model == "RFPlus_inbag":
loaded_model = rf_plus_base_inbag
elif fi_est.base_model == "RFPlus_default":
loaded_model = rf_plus_base
elif fi_est.base_model == "RFPlus_ridge":
loaded_model = rf_plus_base_ridge
elif fi_est.base_model == "RFPlus_oob_ridge":
loaded_model = rf_plus_base_oob_ridge
elif fi_est.base_model == "RFPlus_inbag_ridge":
loaded_model = rf_plus_base_inbag_ridge
elif fi_est.base_model == "RFPlus_lasso":
loaded_model = rf_plus_base_lasso

m= "absolute"
start = time.time()
print(f"Compute feature importance")
local_fi_score_train = fi_est.cls(X_train=X_train, y_train=y_train, fit=loaded_model, mode="absolute")
local_fi_score_train, _ = fi_est.cls(X_train=X_train, y_train=y_train, X_test=X_test, fit=loaded_model, mode="absolute")
train_fi_mean = np.mean(local_fi_score_train, axis=0)
print(f"Train FI Mean: {train_fi_mean}")
if fi_est.ascending:
sorted_feature = np.argsort(-train_fi_mean)
else:
sorted_feature = np.argsort(train_fi_mean)
print(f"Sorted Feature: {sorted_feature}")
end = time.time()
metric_results[f'fi_time_{m}'] = end - start
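
As a quick illustration of the ranking step above, with hypothetical importance values:

import numpy as np
train_fi_mean = np.array([0.10, 0.45, 0.02, 0.30])  # hypothetical per-feature mean importances
np.argsort(-train_fi_mean)  # -> array([1, 3, 0, 2]); most important feature first (fi_est.ascending True)
np.argsort(train_fi_mean)   # -> array([2, 0, 3, 1]); used when fi_est.ascending is False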

ablation_models = {"RF_Regressor": RandomForestRegressor(n_estimators=100,min_samples_leaf=5,max_features=0.33,random_state=args.rf_seed),
"xgboost_Regressor": xgb.XGBRegressor(random_state=args.rf_seed),
"Linear_Regressor": LinearRegression()}
if X_train.shape[1] > 20:
mask_ratio = [0.01, 0.05, 0.1, 0.15, 0.25, 0.4, 0.5, 0.7, 0.9]
@@ -214,9 +180,7 @@ def compare_estimators(estimators: List[ModelConfig],
metric_results[f'{a_model}_MSE_top_{mask}'] = mean_squared_error(y_test, y_pred)
metric_results[f'{a_model}_R2_top_{mask}'] = r2_score(y_test, y_pred)
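
The interior of this masking loop is folded out of the diff view; the sketch below is only a plausible reconstruction of the pattern implied by the surviving lines (sorted_feature, mask_ratio, ablation_models, and the metric keys). In particular, the assumption that "masking" means mean-imputing the top-ranked columns is mine and is not shown in the diff.

# illustrative reconstruction -- the real masking logic is collapsed in this view
n_features = X_train.shape[1]
for a_name, a_model in ablation_models.items():
    for ratio in mask_ratio:
        k = max(1, int(ratio * n_features))        # number of top-ranked features to mask
        top_cols = sorted_feature[:k]
        X_train_masked = np.array(X_train, copy=True)
        X_test_masked = np.array(X_test, copy=True)
        col_means = X_train_masked[:, top_cols].mean(axis=0)
        X_train_masked[:, top_cols] = col_means    # assumed: replace masked columns with training means
        X_test_masked[:, top_cols] = col_means
        a_model.fit(X_train_masked, y_train)
        y_pred = a_model.predict(X_test_masked)
        metric_results[f'{a_name}_MSE_top_{k}'] = mean_squared_error(y_test, y_pred)
        metric_results[f'{a_name}_R2_top_{k}'] = r2_score(y_test, y_pred)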


# initialize results with metadata and metric results
kwargs: dict = model.kwargs # dict
kwargs: dict = model.kwargs
for k in kwargs:
results[k].append(kwargs[k])
for k in fi_kwargs:
Expand All @@ -228,7 +192,7 @@ def compare_estimators(estimators: List[ModelConfig],
results[met_name].append(met_val)
# for key, value in results.items():
# print(f"{key}: {len(value)}")
return results, feature_importance_list
return results


def run_comparison(path: str,
@@ -263,7 +227,7 @@ def run_comparison(path: str,
if len(fi_estimators) == 0:
return

results, fi_lst = compare_estimators(estimators=estimators,
results = compare_estimators(estimators=estimators,
fi_estimators=fi_estimators,
X=X, y=y, support=support,
metrics=metrics,
@@ -282,8 +246,6 @@ def run_comparison(path: str,
if col in df.columns:
df = df.drop(columns=[col])

pkl.dump(fi_lst, open(feature_importance_all, 'wb'))

for model_comparison_file, fi_estimator in zip(model_comparison_files, fi_estimators):
output_dict = {
# metadata
@@ -360,9 +322,6 @@ def run_simulation(i, path, val_name, X_params_dict, X_dgp, y_params_dict, y_dgp
### Newly added arguments
parser.add_argument('--folder_name', type=str, default=None)
parser.add_argument('--fit_model', type=bool, default=False)
parser.add_argument('--absolute_masking', type=bool, default=False)
parser.add_argument('--positive_masking', type=bool, default=False)
parser.add_argument('--negative_masking', type=bool, default=False)
parser.add_argument('--num_features_masked', type=int, default=None)
parser.add_argument('--rf_seed', type=int, default=0)
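
A side note on the retained --fit_model flag: argparse's type=bool maps any non-empty string to True, so "--fit_model False" would still enable fitting. The selection script above only ever passes "--fit_model True", so this works as used, but a stricter converter would look roughly like this (a suggestion, not part of the commit):

import argparse

def str2bool(value):
    # explicit string-to-bool conversion; type=bool treats any non-empty string as True
    if isinstance(value, bool):
        return value
    if value.lower() in ("true", "t", "1", "yes"):
        return True
    if value.lower() in ("false", "f", "0", "no"):
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean, got {value!r}")

# parser.add_argument('--fit_model', type=str2bool, default=False)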
