new branch for experiment tracking

zachrewolinski committed Dec 19, 2024
1 parent 5a2b0f0 commit 58dbfc0
Showing 74 changed files with 47,380 additions and 1,619 deletions.
6 changes: 6 additions & 0 deletions .gitignore
@@ -20,6 +20,12 @@ results.csv
# Compiled python modules.
*.pyc

# results folders in feature_importance
feature_importance/**/results/*

# gene data files ending with .csv
feature_importance/gene-data/*.csv

# Setuptools distribution folder.
/dist/

12 changes: 9 additions & 3 deletions feature_importance/compile-jupyter/compile_pdf.sh
@@ -73,9 +73,15 @@ temp_path="${path%.ipynb}-temp.ipynb"
# add title and author metadata
python3 compile_helper.py add_metadata "$path" "$title" "${authors[@]}"

# convert to pdf
jupyter nbconvert --execute --no-input --to pdf "$temp_path" \
--output "$output_name"
# use the following to convert to pdf by re-running the file
# jupyter nbconvert --execute --no-input --to pdf "$temp_path" \
# --output "$output_name"

# use the following to convert to pdf without re-running the file
# jupyter nbconvert --no-input --to pdf "$temp_path" --output "$output_name"

# use the following to convert to pdf including code
jupyter nbconvert --execute --to pdf "$temp_path" --output "$output_name"

# delete temporary file
python3 compile_helper.py delete_temp "$path"
14 changes: 14 additions & 0 deletions feature_importance/correlation-bias/correlation-runner.sh
@@ -0,0 +1,14 @@
#!/bin/bash

slurm_script="correlation.sh"

for rep in {1..50}
do
    for pve in 0.1 0.4
    do
        for rho in 0.5 0.6 0.7 0.8 0.9 0.99
        do
            # submit one SLURM job per (rep, pve, rho) combination
            sbatch "$slurm_script" "$rep" "$pve" "$rho"
        done
    done
done
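
For scale, the triple loop above submits 50 × 2 × 6 = 600 jobs. A minimal Python sketch of the same grid (an editor's illustration, not a file in the repo):

# enumerate the (rep, pve, rho) grid that correlation-runner.sh sweeps
from itertools import product

reps = range(1, 51)
pves = [0.1, 0.4]
rhos = [0.5, 0.6, 0.7, 0.8, 0.9, 0.99]
print(len(list(product(reps, pves, rhos))))  # 600 sbatch submissions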
215 changes: 215 additions & 0 deletions feature_importance/correlation-bias/correlation.ipynb

Large diffs are not rendered by default.

10 changes: 10 additions & 0 deletions feature_importance/correlation-bias/correlation.sh
@@ -0,0 +1,10 @@
#!/bin/bash
#SBATCH --cpus-per-task=8

njobs=8

source activate mdi

# execute the pipeline, forwarding the positional arguments supplied by
# correlation-runner.sh (seed, pve, rho)
python correlation_pipeline.py --seed "${1}" --pve "${2}" --rho "${3}" --njobs "$njobs"
305 changes: 305 additions & 0 deletions feature_importance/correlation-bias/correlation_pipeline.py
@@ -0,0 +1,305 @@
# imports from imodels
from imodels import get_clean_dataset
from imodels.tree.rf_plus.rf_plus.rf_plus_models import \
    RandomForestPlusRegressor, RandomForestPlusClassifier
from imodels.tree.rf_plus.feature_importance.rfplus_explainer import \
    RFPlusMDI, AloRFPlusMDI
from simulations_util import partial_linear_lss_model

# imports from sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, average_precision_score, f1_score, \
    accuracy_score, r2_score, log_loss, root_mean_squared_error
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer

# parallelization imports
from joblib import Parallel, delayed

# timing imports
import time

# other data science imports
import numpy as np
import pandas as pd
import shap
from ucimlrepo import fetch_ucirepo

# i/o imports
import argparse
import os
from os.path import join as oj

# global variable for classification/regression status
TASK = None

def simulate_data(rho, pve, seed):

    np.random.seed(seed)

    n = 250   # number of samples
    p1 = 50   # number of correlated features
    p2 = 50   # number of uncorrelated features

    # create the covariance matrix for the first block (correlated features)
    Sigma_1 = np.full((p1, p1), rho)  # matrix filled with rho
    np.fill_diagonal(Sigma_1, 1)      # set diagonal elements to 1

    # create the covariance matrix for the second block (uncorrelated features)
    Sigma_2 = np.eye(p2)  # identity matrix for independent features

    # create the full covariance matrix by combining the two blocks,
    # using np.zeros for the off-diagonal blocks (zero cross-covariance)
    Sigma = np.block([
        [Sigma_1, np.zeros((p1, p2))],
        [np.zeros((p2, p1)), Sigma_2]
    ])

    # mean vector (zero mean)
    mu = np.zeros(p1 + p2)

    # draw X from the multivariate normal distribution
    X = np.random.multivariate_normal(mu, Sigma, size=n)

    y = partial_linear_lss_model(X=X, s=2, m=3, r=2, tau=0, beta=1,
                                 heritability=pve)

    return X, y
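
# a quick sanity check (editor's sketch, not part of the pipeline): within
# the first block of 50 features, the empirical pairwise correlations should
# sit near rho, while the second block should be roughly uncorrelated
def _check_correlation_structure(rho=0.8, pve=0.4, seed=0):
    X, _ = simulate_data(rho, pve, seed)
    corr = np.corrcoef(X, rowvar=False)
    mask = ~np.eye(50, dtype=bool)
    print(corr[:50, :50][mask].mean())  # close to rho
    print(corr[50:, 50:][mask].mean())  # close to 0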

def split_data(X, y, test_size, seed):
    # split data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=test_size,
                                                        random_state=seed)
    return X_train, X_test, y_train, y_test

def fit_models(X_train, y_train):
    # fit models
    if TASK == "classification":
        rf = RandomForestClassifier(n_estimators=100, min_samples_leaf=3,
                                    max_features="sqrt", random_state=42)
        rf.fit(X_train, y_train)
        rf_plus = RandomForestPlusClassifier(rf_model=rf)
        rf_plus.fit(X_train, y_train)
    elif TASK == "regression":
        rf = RandomForestRegressor(n_estimators=100, min_samples_leaf=5,
                                   max_features=0.33, random_state=42)
        rf.fit(X_train, y_train)
        rf_plus = RandomForestPlusRegressor(rf_model=rf)
        rf_plus.fit(X_train, y_train)
    else:
        raise ValueError("Task must be either 'classification' or 'regression'.")
    return rf, rf_plus
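
# example usage (editor's sketch): fit_models reads the module-level TASK
# flag, so set it before calling; the main block below does this for the
# regression setting:
# TASK = "regression"
# rf, rf_plus = fit_models(X_train, y_train)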

def get_shap(X, shap_explainer):
    if TASK == "classification":
        # the shap values are an array of shape
        # (# of samples, # of features, # of classes), and in this binary
        # classification case, we want the shap values for the positive class.
        # check_additivity=False is used to speed up computation.
        shap_values = \
            shap_explainer.shap_values(X, check_additivity=False)[:, :, 1]
    else:
        # check_additivity=False is used to speed up computation.
        shap_values = shap_explainer.shap_values(X, check_additivity=False)
    # get the rankings of the shap values. the negative absolute value is
    # taken because np.argsort sorts from smallest to largest.
    shap_rankings = np.argsort(-np.abs(shap_values), axis=1)
    return shap_values, shap_rankings
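
# example usage (editor's sketch, mirroring the commented-out call in the
# main block below; assumes a fitted forest rf and the shap package):
# shap_explainer = shap.TreeExplainer(rf)
# shap_values, shap_rankings = get_shap(X_train, shap_explainer)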

def get_lmdi(X, y, lmdi_explainer, l2norm, sign, normalize, leaf_average,
             ranking=False):
    # get feature importances
    lmdi = lmdi_explainer.explain_linear_partial(X, y, l2norm=l2norm,
                                                 sign=sign,
                                                 normalize=normalize,
                                                 leaf_average=leaf_average,
                                                 ranking=ranking)
    mdi_rankings = lmdi_explainer.get_rankings(np.abs(lmdi))
    return lmdi, mdi_rankings
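
# example usage (editor's sketch, mirroring one of the commented-out
# variants in the main block; AloRFPlusMDI comes from the imodels import
# at the top of this file):
# lmdi_explainer = AloRFPlusMDI(rf_plus, mode="only_k")
# lmdi_values, lmdi_rankings = get_lmdi(X_train, y_train, lmdi_explainer,
#                                       l2norm=True, sign=True,
#                                       normalize=True, leaf_average=True)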

if __name__ == '__main__':

    # start time
    start = time.time()

    TASK = "regression"

    # store command-line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('--seed', type=int, default=None)
    parser.add_argument('--rho', type=float, default=None)
    parser.add_argument('--pve', type=float, default=None)
    parser.add_argument('--njobs', type=int, default=1)
    args = parser.parse_args()

    # convert namespace to a dictionary
    args_dict = vars(args)

    # assign the arguments to variables
    seed = args_dict['seed']
    rho = args_dict['rho']
    pve = args_dict['pve']
    njobs = args_dict['njobs']

    X_train, y_train = simulate_data(rho, pve, seed)

    # end time
    end = time.time()

    # print progress message
    print(f"Progress Message 1/5: Obtained data with PVE = {pve}, "
          f"rho = {rho}, and seed = {seed}.")
    print(f"Step #1 took {end - start} seconds.")

    # start time
    start = time.time()

    # fit the prediction models
    rf, rf_plus = fit_models(X_train, y_train)

    # fit baseline model
    rf_plus_baseline = RandomForestPlusRegressor(rf_model=rf,
                                                 include_raw=False,
                                                 fit_on="inbag",
                                                 prediction_model=LinearRegression())
    rf_plus_baseline.fit(X_train, y_train)

    # end time
    end = time.time()

    print("Progress Message 2/5: RF and RF+ models fit.")
    print(f"Step #2 took {end - start} seconds.")

    # start time
    start = time.time()

    # obtain shap feature importances
    # shap_explainer = shap.TreeExplainer(rf)
    # shap_values, shap_rankings = get_shap(X_train, shap_explainer)

    # end time
    end = time.time()

    print("Progress Message 3/5: SHAP step skipped (currently commented out).")
    print(f"Step #3 took {end - start} seconds.")

    # start time
    start = time.time()

    # obtain lmdi feature importances
    # lmdi_explainer_signed_normalized_l2_avg = AloRFPlusMDI(rf_plus, mode="only_k")
    # lmdi_explainer_signed_normalized_l2_noavg = AloRFPlusMDI(rf_plus, mode="only_k")
    # lmdi_explainer_signed_nonnormalized_l2_avg = AloRFPlusMDI(rf_plus, mode="only_k")
    # lmdi_explainer_signed_nonnormalized_l2_noavg = AloRFPlusMDI(rf_plus, mode="only_k")
    # lmdi_explainer_nonl2_avg = AloRFPlusMDI(rf_plus, mode="only_k")
    # lmdi_explainer_nonl2_noavg = AloRFPlusMDI(rf_plus, mode="only_k")
    # lmdi_explainer_l2_ranking = AloRFPlusMDI(rf_plus, mode="only_k")
    # lmdi_explainer_nonl2_ranking = AloRFPlusMDI(rf_plus, mode="only_k")
    # lmdi_explainer_normalized_l2_ranking = AloRFPlusMDI(rf_plus, mode="only_k")
    # lmdi_explainer_nonnormalized_l2_ranking = AloRFPlusMDI(rf_plus, mode="only_k")
    # lmdi_baseline_explainer = RFPlusMDI(rf_plus_baseline, mode="only_k",
    #                                     evaluate_on="inbag")
    # lmdi_values_signed_normalized_l2_avg, \
    #     lmdi_rankings_signed_normalized_l2_avg = \
    #     get_lmdi(X_train, y_train, lmdi_explainer_signed_normalized_l2_avg,
    #              l2norm=True, sign=True, normalize=True, leaf_average=True)
    # lmdi_values_signed_normalized_l2_noavg, \
    #     lmdi_rankings_signed_normalized_l2_noavg = \
    #     get_lmdi(X_train, y_train, lmdi_explainer_signed_normalized_l2_noavg,
    #              l2norm=True, sign=True, normalize=True, leaf_average=False)
    # lmdi_values_signed_nonnormalized_l2_avg, \
    #     lmdi_rankings_signed_nonnormalized_l2_avg = \
    #     get_lmdi(X_train, y_train, lmdi_explainer_signed_nonnormalized_l2_avg,
    #              l2norm=True, sign=True, normalize=False, leaf_average=True)
    # lmdi_values_signed_nonnormalized_l2_noavg, \
    #     lmdi_rankings_signed_nonnormalized_l2_noavg = \
    #     get_lmdi(X_train, y_train, lmdi_explainer_signed_nonnormalized_l2_noavg,
    #              l2norm=True, sign=True, normalize=False, leaf_average=False)
    # lmdi_values_nonl2_avg, lmdi_rankings_nonl2_avg = \
    #     get_lmdi(X_train, y_train, lmdi_explainer_nonl2_avg, l2norm=False,
    #              sign=False, normalize=False, leaf_average=True)
    # lmdi_values_nonl2_noavg, lmdi_rankings_nonl2_noavg = \
    #     get_lmdi(X_train, y_train, lmdi_explainer_nonl2_noavg, l2norm=False,
    #              sign=False, normalize=False, leaf_average=False)
    # lmdi_values_l2_ranking, lmdi_rankings_l2_ranking = \
    #     get_lmdi(X_train, y_train, lmdi_explainer_l2_ranking, l2norm=True,
    #              sign=False, normalize=False, leaf_average=False, ranking=True)
    # lmdi_values_nonl2_ranking, lmdi_rankings_nonl2_ranking = \
    #     get_lmdi(X_train, y_train, lmdi_explainer_nonl2_ranking, l2norm=False,
    #              sign=False, normalize=False, leaf_average=False, ranking=True)
    # lmdi_values_normalized_l2_ranking, lmdi_rankings_normalized_l2_ranking = \
    #     get_lmdi(X_train, y_train, lmdi_explainer_normalized_l2_ranking,
    #              l2norm=True, sign=False, normalize=True, leaf_average=False,
    #              ranking=True)
    # lmdi_values_baseline, lmdi_rankings_baseline = \
    #     get_lmdi(X_train, y_train, lmdi_baseline_explainer, l2norm=False,
    #              sign=False, normalize=False, leaf_average=False)

    # # create storage for iteration purposes
    # lfi_values = \
    #     {'shap': shap_values,
    #      'signed_normalized_l2_avg': lmdi_values_signed_normalized_l2_avg,
    #      'signed_normalized_l2_noavg': lmdi_values_signed_normalized_l2_noavg,
    #      'signed_nonnormalized_l2_avg': lmdi_values_signed_nonnormalized_l2_avg,
    #      'signed_nonnormalized_l2_noavg':
    #          lmdi_values_signed_nonnormalized_l2_noavg,
    #      'nonl2_avg': lmdi_values_nonl2_avg,
    #      'nonl2_noavg': lmdi_values_nonl2_noavg,
    #      'l2_ranking': lmdi_values_l2_ranking,
    #      'nonl2_ranking': lmdi_values_nonl2_ranking,
    #      'normalized_l2_ranking': lmdi_values_normalized_l2_ranking,
    #      'baseline': lmdi_values_baseline}
    # lfi_rankings = \
    #     {'shap': shap_rankings,
    #      'signed_normalized_l2_avg': lmdi_rankings_signed_normalized_l2_avg,
    #      'signed_normalized_l2_noavg': lmdi_rankings_signed_normalized_l2_noavg,
    #      'signed_nonnormalized_l2_avg': lmdi_rankings_signed_nonnormalized_l2_avg,
    #      'signed_nonnormalized_l2_noavg':
    #          lmdi_rankings_signed_nonnormalized_l2_noavg,
    #      'nonl2_avg': lmdi_rankings_nonl2_avg,
    #      'nonl2_noavg': lmdi_rankings_nonl2_noavg,
    #      'l2_ranking': lmdi_rankings_l2_ranking,
    #      'nonl2_ranking': lmdi_rankings_nonl2_ranking,
    #      'normalized_l2_ranking': lmdi_rankings_normalized_l2_ranking,
    #      'baseline': lmdi_rankings_baseline}

    # get mdi importances from rf
    mdi_values = rf.feature_importances_
    mdi_rankings = np.argsort(-np.abs(mdi_values))
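    # worked example of the ranking convention (editor's note): argsort on
    # the negated absolute values lists feature indices from most to least
    # important, e.g. np.argsort(-np.abs([0.1, 0.7, 0.3])) gives [1, 2, 0]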

    # create storage for iteration purposes
    lfi_values = {'mdi': mdi_values}

    lfi_rankings = {'mdi': mdi_rankings}

    # end time
    end = time.time()

    print("Progress Message 4/5: MDI values/rankings obtained "
          "(the LMDI+ variants above are commented out).")
    print(f"Step #4 took {end - start} seconds.")

    result_dir = oj(os.path.dirname(os.path.realpath(__file__)),
                    f'results/pve{pve}/rho{rho}/seed{seed}')

    # write one CSV per method under values/ and rankings/
    for method, values in lfi_values.items():
        df = pd.DataFrame(values)
        directory = oj(result_dir, 'values')
        os.makedirs(directory, exist_ok=True)
        df.to_csv(oj(directory, f'{method}.csv'), index=False)
    for method, rankings in lfi_rankings.items():
        df = pd.DataFrame(rankings)
        directory = oj(result_dir, 'rankings')
        os.makedirs(directory, exist_ok=True)
        df.to_csv(oj(directory, f'{method}.csv'), index=False)

    # end time
    end = time.time()

    print(f"Progress Message 5/5: Results saved to {result_dir}.")
    print(f"Step #5 took {end - start} seconds.")