Skip to content

Commit

Permalink
Prediction algorithms, hyperparameterization, evaluation (#22)
Browse files Browse the repository at this point in the history
* feat: create script to generate algo prediction data for testing

* feat: generating predictions from trained algos under dev

* feat: add processing of xssa locations, randomly selecting a subset to use for algo prediction

* feat: develop algo prediction's config ingest, and determine paths to prediction locations and trained algos

* feat: create metric prediction and write results to file

* feat: build unit test for build_cfig_path()

* feat: build unit test for build_cfig_path()

* feat: add unit tests for std_pred_path and _read_pred_comid; test coverage now at 92%

* feat: add oob = True as default for RandomForestRegressor

* feat: add hyperparameterization capability using grid search and associated unit tests

* feat: add unit testing for train_eval()

* chore: change algo config for testing out hyperparameterization

* chore: add UserWarning category specification to warnings.warn

* fix: algo config assignment accidentally only looked at first line of params

* fix: make sure that hyperparameter key:value pairings contained inside dict, not list

* fix: adjust unit test's algo_config formats to represent the issue of a dict of a list, which the list_to_dict() function then converts

* fix: _check_attributes_exist now appropriately reports missing attributes and comids

* fix: ensure algo and pipeline keys contain algo and pipeline object types in the grid search case
  • Loading branch information
glitt13 authored Sep 15, 2024
1 parent becfc45 commit 89a52ee
Show file tree
Hide file tree
Showing 10 changed files with 753 additions and 116 deletions.
225 changes: 195 additions & 30 deletions pkg/fs_algo/fs_algo/fs_algo_train_eval.py

Large diffs are not rendered by default.

90 changes: 90 additions & 0 deletions pkg/fs_algo/fs_algo/fs_pred_algo.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
import argparse
import yaml
import joblib
import fs_algo.fs_algo_train_eval as fsate
import pandas as pd
from pathlib import Path
import ast
import warnings
import os
import numpy as np

# TODO create a function that's flexible/converts user formatted checks (a la fsds_proc)


def compile_pred_df(comids, predictions, metric, dataset, algo, name_algo):
    """Assemble one algorithm's predictions into a long-format DataFrame.

    Args:
        comids: Location identifiers, one per prediction, in the same order
            as ``predictions``.
        predictions: Predicted values, one per entry of ``comids``.
        metric: Name of the predicted response variable; repeated on every row.
        dataset: Dataset identifier; repeated on every row.
        algo: Algorithm identifier; repeated on every row.
        name_algo: File name of the trained algorithm; repeated on every row.

    Returns:
        A DataFrame with the columns ``comid``, ``prediction``, ``metric``,
        ``dataset``, ``algo`` and ``name_algo``.

    Raises:
        ValueError: If ``comids`` and ``predictions`` differ in length, since
            a prediction could then not be attributed to a location.
    """
    comids = list(comids)
    n_comids, n_preds = len(comids), len(predictions)
    if n_comids != n_preds:
        raise ValueError(
            f"Cannot pair {n_comids} location identifiers with {n_preds} predictions.")
    return pd.DataFrame({'comid': comids,
                         'prediction': predictions,
                         'metric': metric,
                         'dataset': dataset,
                         'algo': algo,
                         'name_algo': name_algo})


# Predict values and evaluate predictions
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description = 'process the prediction config file')
    parser.add_argument('path_pred_config', type=str, help='Path to the YAML configuration file specific for prediction.')
    # NOTE pred_config should contain the path for path_algo_config
    args = parser.parse_args()

    # e.g. ~/git/formulation-selector/scripts/eval_ingest/xssa/xssa_pred_config.yaml
    path_pred_config = Path(args.path_pred_config)
    with open(path_pred_config, 'r') as file:
        # An empty YAML file loads as None; fall back to {} so the check below reports it
        pred_cfg = yaml.safe_load(file) or {}

    # Fail early with a readable message rather than an AttributeError/TypeError on None
    required_keys = ('pred_file_in', 'algo_response_vars', 'algo_type')
    missing_keys = [key for key in required_keys if pred_cfg.get(key) is None]
    if missing_keys:
        raise KeyError(f"Prediction config {path_pred_config} is missing required entries: {missing_keys}")

    #%% READ CONTENTS FROM THE ATTRIBUTE CONFIG
    path_attr_config = fsate.build_cfig_path(path_pred_config, pred_cfg.get('name_attr_config', None))
    attr_cfig = fsate.AttrConfigAndVars(path_attr_config)
    attr_cfig._read_attr_config()

    attrs_cfg_dict = attr_cfig.attrs_cfg_dict
    dir_base = attrs_cfg_dict.get('dir_base')
    dir_db_attrs = attrs_cfg_dict.get('dir_db_attrs')
    datasets = attrs_cfg_dict.get('datasets') # Identify datasets of interest
    attrs_sel = attrs_cfg_dict.get('attrs_sel', None)

    #%% ESTABLISH ALGORITHM FILE I/O
    # Query the directory structure once and reuse it for both output locations
    dir_struct = fsate.fs_save_algo_dir_struct(dir_base)
    dir_out = dir_struct.get('dir_out')
    dir_out_alg_base = dir_struct.get('dir_out_alg_base')

    #%% PREDICTION FILE'S COMIDS
    # The configured path may contain {placeholders} filled from the attribute config
    path_pred_locs = pred_cfg.get('pred_file_in').format(**attrs_cfg_dict)
    comid_pred_col = pred_cfg.get('pred_file_comid_colname')

    comids_pred = fsate._read_pred_comid(path_pred_locs, comid_pred_col)

    #%% prediction config
    # TODO create pred config
    resp_vars = pred_cfg.get('algo_response_vars')
    algos = pred_cfg.get('algo_type')

    #%% Read in predictor variable data (aka basin attributes)
    # Read the predictor variable data (basin attributes) generated by fsds.attr.hydfab
    df_attr = fsate.fs_read_attr_comid(dir_db_attrs, comids_pred, attrs_sel = attrs_sel,
                                       _s3 = None, storage_options=None)
    # Convert into wide format: one row per featureID, one column per attribute.
    # NOTE the pivoted index is sorted/unique, so row order need not match comids_pred.
    df_attr_wide = df_attr.pivot(index='featureID', columns = 'attribute', values = 'value')

    #%% Run prediction
    for ds in datasets:
        dir_out_alg_ds = Path(dir_out_alg_base) / ds
        print(f"PREDICTING algorithm for {ds}")
        for metric in resp_vars:
            for algo in algos:
                path_algo = fsate.std_algo_path(dir_out_alg_ds, algo=algo, metric=metric, dataset_id=ds)
                if not Path(path_algo).exists():
                    raise FileNotFoundError(f"The following algorithm path does not exist: \n{path_algo}")

                # Read in the algorithm's pipeline
                # NOTE(security): joblib.load unpickles the file; only load algorithms
                # from trusted locations.
                pipe = joblib.load(path_algo)
                # Present the predictors in the column order the pipeline was trained on
                feat_names = list(pipe.feature_names_in_)
                df_attr_sub = df_attr_wide[feat_names]

                # Perform prediction
                resp_pred = pipe.predict(df_attr_sub)

                # compile prediction results:
                # Label each prediction with the identifier of the row it was computed
                # from, not with comids_pred, whose order/length may differ from the
                # pivoted table. NOTE(review): presumes featureID holds the comid — confirm.
                df_pred = compile_pred_df(df_attr_sub.index.to_numpy(),
                                          resp_pred,
                                          metric=metric,
                                          dataset=ds,
                                          algo=algo,
                                          name_algo=Path(path_algo).name)

                path_pred_out = fsate.std_pred_path(dir_out, algo=algo, metric=metric, dataset_id=ds)
                # Write prediction results
                df_pred.to_parquet(path_pred_out)
                print(f"    Completed {algo} prediction of {metric}")
10 changes: 5 additions & 5 deletions pkg/fs_algo/fs_algo/fs_proc_algo.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,16 +22,16 @@

with open(path_algo_config, 'r') as file:
algo_cfg = yaml.safe_load(file)

algo_config = {k: algo_cfg['algorithms'][k][0] for k in algo_cfg['algorithms']}
if algo_config['mlp']['hidden_layer_sizes']: # purpose: evaluate string literal to a tuple
algo_config['mlp']['hidden_layer_sizes'] = ast.literal_eval(algo_config['mlp']['hidden_layer_sizes'])

# Ensure the string literal is converted to a tuple for `hidden_layer_sizes`
algo_config = {k: algo_cfg['algorithms'][k] for k in algo_cfg['algorithms']}
if algo_config['mlp'][0].get('hidden_layer_sizes',None): # purpose: evaluate string literal to a tuple
algo_config['mlp'][0]['hidden_layer_sizes'] = ast.literal_eval(algo_config['mlp'][0]['hidden_layer_sizes'])

verbose = algo_cfg['verbose']
test_size = algo_cfg['test_size']
seed = algo_cfg['seed']


#%% Attribute configuration
name_attr_config = algo_cfg.get('name_attr_config', Path(path_algo_config).name.replace('algo','attr'))
path_attr_config = fsate.build_cfig_path(path_algo_config, name_attr_config)
Expand Down
Loading

0 comments on commit 89a52ee

Please sign in to comment.