Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Prediction algorithms, hyperparameterization, evaluation #22

Merged
merged 19 commits into from
Sep 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
22835f5
feat: create script to generate algo prediction data for testing
glitt13 Sep 3, 2024
346f668
feat: generating predictions from trained algos under dev
glitt13 Sep 3, 2024
b6723fd
feat: add processing of xssa locations, randomly selecting a subset t…
glitt13 Sep 3, 2024
66f51dc
feat: develop algo prediction's config ingest, and determine paths to…
glitt13 Sep 4, 2024
10d2422
fix: resolve merge conflict
glitt13 Sep 4, 2024
8fe0a7c
feat: create metric prediction and write results to file
glitt13 Sep 4, 2024
0fffc45
feat: build unit test for build_cfig_path()
glitt13 Sep 5, 2024
62ff9aa
feat: build unit test for build_cfig_path()
glitt13 Sep 5, 2024
42edc50
feat: add unit testsfor std_pred_path and _read_pred_comid; test cove…
glitt13 Sep 5, 2024
f07b9c9
feat: add oob = True as default for RandomForestRegressor
glitt13 Sep 5, 2024
bae5175
feat: add hyperparameterization capability using grid search and asso…
glitt13 Sep 6, 2024
90c1443
feat: add unit testing for train_eval()
glitt13 Sep 6, 2024
5adb43b
chore: change algo config for testing out hyperparameterization
glitt13 Sep 6, 2024
b0c3ef2
chore: add UserWarning category specification to warnings.warn
glitt13 Sep 6, 2024
3abfb08
fix: algo config assignment accidentally only looked at first line of…
glitt13 Sep 6, 2024
c7de9ae
fix: make sure that hyperparameter key:value pairings contained insid…
glitt13 Sep 6, 2024
3e60519
fix: adjust unit test's algo_config formats to represent the issue of…
glitt13 Sep 6, 2024
b4034e8
fix: _check_attributes_exist now appropriately reports missing attrib…
glitt13 Sep 6, 2024
7e982d2
fix: ensure algo and pipeline keys contain algo and pipeline object t…
glitt13 Sep 6, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
225 changes: 195 additions & 30 deletions pkg/fs_algo/fs_algo/fs_algo_train_eval.py

Large diffs are not rendered by default.

90 changes: 90 additions & 0 deletions pkg/fs_algo/fs_algo/fs_pred_algo.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
import argparse
import yaml
import joblib
import fs_algo.fs_algo_train_eval as fsate
import pandas as pd
from pathlib import Path
import ast
import warnings
import os
import numpy as np

# TODO create a function that's flexible/converts user formatted checks (a la fsds_proc)


# Predict values and evaluate predictions
if __name__ == "__main__":
    # Command-line entry point: generate predictions from previously trained
    # algorithm pipelines.
    #
    # Workflow:
    #   1. Parse the prediction config (YAML) passed on the command line.
    #   2. Locate and read the attribute config named inside the prediction config.
    #   3. Read the comids to predict and their basin attributes.
    #   4. For every dataset / metric / algorithm combination, load the saved
    #      pipeline, predict, and write one parquet file of results.
    parser = argparse.ArgumentParser(description = 'process the prediction config file')
    parser.add_argument('path_pred_config', type=str, help='Path to the YAML configuration file specific for prediction.')
    # NOTE pred_config should contain the path for path_algo_config
    args = parser.parse_args()

    # e.g. ~/git/formulation-selector/scripts/eval_ingest/xssa/xssa_pred_config.yaml
    path_pred_config = Path(args.path_pred_config)
    with open(path_pred_config, 'r') as file:
        pred_cfg = yaml.safe_load(file)

    #%% READ CONTENTS FROM THE ATTRIBUTE CONFIG
    path_attr_config = fsate.build_cfig_path(path_pred_config, pred_cfg.get('name_attr_config', None))
    attr_cfig = fsate.AttrConfigAndVars(path_attr_config)
    attr_cfig._read_attr_config()

    dir_base = attr_cfig.attrs_cfg_dict.get('dir_base')
    dir_db_attrs = attr_cfig.attrs_cfg_dict.get('dir_db_attrs')
    datasets = attr_cfig.attrs_cfg_dict.get('datasets') # Identify datasets of interest
    attrs_sel = attr_cfig.attrs_cfg_dict.get('attrs_sel', None)

    #%% ESTABLISH ALGORITHM FILE I/O
    # Build the directory structure once and read both entries from the same result.
    dirs_algo = fsate.fs_save_algo_dir_struct(dir_base)
    dir_out = dirs_algo.get('dir_out')
    dir_out_alg_base = dirs_algo.get('dir_out_alg_base')

    #%% PREDICTION FILE'S COMIDS
    pred_file_in = pred_cfg.get('pred_file_in')
    if pred_file_in is None:
        # Fail with a clear message instead of an AttributeError on None.format
        raise ValueError(f"'pred_file_in' must be defined in the prediction config: \n{path_pred_config}")
    # The path template may reference keys from the attribute config (e.g. {dir_base})
    path_pred_locs = pred_file_in.format(**attr_cfig.attrs_cfg_dict)
    comid_pred_col = pred_cfg.get('pred_file_comid_colname')

    comids_pred = fsate._read_pred_comid(path_pred_locs, comid_pred_col)

    #%% prediction config
    resp_vars = pred_cfg.get('algo_response_vars')
    algos = pred_cfg.get('algo_type')

    #%% Read in predictor variable data (aka basin attributes)
    # Read the predictor variable data (basin attributes) generated by fsds.attr.hydfab
    df_attr = fsate.fs_read_attr_comid(dir_db_attrs, comids_pred, attrs_sel = attrs_sel,
                                       _s3 = None, storage_options=None)
    # Convert into wide format (one row per featureID, one column per attribute).
    # NOTE: pivot orders rows by featureID, not by the order of comids_pred.
    df_attr_wide = df_attr.pivot(index='featureID', columns = 'attribute', values = 'value')

    # Report requested comids that have no attribute rows, since no prediction
    # can be made for them. Compared as strings to sidestep int/str differences.
    comids_missing = sorted({str(c) for c in comids_pred} - {str(c) for c in df_attr_wide.index})
    if comids_missing:
        warnings.warn(f"No attribute data found for {len(comids_missing)} comid(s); "
                      f"they are excluded from prediction: {comids_missing}", UserWarning)

    #%% Run prediction
    for ds in datasets:
        dir_out_alg_ds = Path(dir_out_alg_base/Path(ds))
        print(f"PREDICTING algorithm for {ds}")
        for metric in resp_vars:
            for algo in algos:
                path_algo = fsate.std_algo_path(dir_out_alg_ds, algo=algo, metric=metric, dataset_id=ds)
                if not Path(path_algo).exists():
                    raise FileNotFoundError(f"The following algorithm path does not exist: \n{path_algo}")

                # Read in the algorithm's pipeline
                # NOTE(security): joblib.load unpickles the file; only load
                # pipelines from trusted locations.
                pipe = joblib.load(path_algo)
                # Keep exactly the features the pipeline was fitted on, in fit order
                feat_names = list(pipe.feature_names_in_)
                df_attr_sub = df_attr_wide[feat_names]

                # Perform prediction
                resp_pred = pipe.predict(df_attr_sub)

                # compile prediction results:
                # Label each prediction with the featureID of the row it was
                # computed from. Using comids_pred here would mislabel rows
                # because the pivoted frame is ordered by featureID.
                df_pred = pd.DataFrame({'comid': df_attr_sub.index.to_numpy(),
                                        'prediction': resp_pred,
                                        'metric': metric,
                                        'dataset': ds,
                                        'algo': algo,
                                        'name_algo': Path(path_algo).name})

                path_pred_out = fsate.std_pred_path(dir_out, algo=algo, metric=metric, dataset_id=ds)
                # Write prediction results
                df_pred.to_parquet(path_pred_out)
                print(f"    Completed {algo} prediction of {metric}")
10 changes: 5 additions & 5 deletions pkg/fs_algo/fs_algo/fs_proc_algo.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,16 +22,16 @@

with open(path_algo_config, 'r') as file:
algo_cfg = yaml.safe_load(file)

algo_config = {k: algo_cfg['algorithms'][k][0] for k in algo_cfg['algorithms']}
if algo_config['mlp']['hidden_layer_sizes']: # purpose: evaluate string literal to a tuple
algo_config['mlp']['hidden_layer_sizes'] = ast.literal_eval(algo_config['mlp']['hidden_layer_sizes'])

# Ensure the string literal is converted to a tuple for `hidden_layer_sizes`
algo_config = {k: algo_cfg['algorithms'][k] for k in algo_cfg['algorithms']}
if algo_config['mlp'][0].get('hidden_layer_sizes',None): # purpose: evaluate string literal to a tuple
algo_config['mlp'][0]['hidden_layer_sizes'] = ast.literal_eval(algo_config['mlp'][0]['hidden_layer_sizes'])

verbose = algo_cfg['verbose']
test_size = algo_cfg['test_size']
seed = algo_cfg['seed']


#%% Attribute configuration
name_attr_config = algo_cfg.get('name_attr_config', Path(path_algo_config).name.replace('algo','attr'))
path_attr_config = fsate.build_cfig_path(path_algo_config, name_attr_config)
Expand Down
Loading