Attribute aggregation/transformation + plotting & evaluation analyses #34

Status: Open
wants to merge 96 commits into base: main
7c0a582
Add alternate comid retrieval via sf geometry in case nwissite return…
glitt13 Nov 1, 2024
44e1182
merge upstream/main
glitt13 Nov 1, 2024
8a937ce
fix: add gage_id inside each loc_attrs df; fix: set fill=TRUE for rbi…
glitt13 Nov 1, 2024
3951059
fix: add usgs_vars sublist to Retr_Params
glitt13 Nov 1, 2024
8c7ed2c
feat: add a format checker on Retr_Params
glitt13 Nov 1, 2024
339279c
feat: add attribute variable name checker, incorporate check_attr_sel…
glitt13 Nov 1, 2024
df57202
feat: developing approach to transform attributes
glitt13 Nov 5, 2024
e7d5e0f
feat: add cmd/config file capability to retrieving camels attributes …
glitt13 Nov 5, 2024
f5b01b7
fix: update script to work with return of a data.table rather than a …
glitt13 Nov 5, 2024
34efc2e
fix: address path/glue format issues
glitt13 Nov 5, 2024
96b01a9
refactor: negligible change
glitt13 Nov 7, 2024
3a7d4c1
feat: add parquet file read option based on check for comid in filename
glitt13 Nov 7, 2024
218eba0
doc: update fs_read_attr_comid documentation based on read_type
glitt13 Nov 7, 2024
08b867b
doc: update yaml config files to jive with latest developments in att…
glitt13 Nov 8, 2024
a9b215e
feat: core functionality that aggregates & transforms attributes
glitt13 Nov 12, 2024
d61c9c2
refactor: move config file read out of for-loop; fix: ensure str form…
glitt13 Nov 12, 2024
2ec53f1
fix: add error if Null vals returned following aggregation/transforma…
glitt13 Nov 12, 2024
e2d79c1
feat: create file listing needed comid-attributes pairings
glitt13 Nov 13, 2024
6cf7a7b
doc: describe steps in creating transformed attributes; feat: update …
glitt13 Nov 13, 2024
5d288c3
feat: add attribute generation script for camels catchments
glitt13 Nov 13, 2024
c9bdad6
fix: remove deprecated wrapper function from tfrm_attr
glitt13 Nov 13, 2024
35f0663
fix: resolve merge conflicts
glitt13 Nov 14, 2024
0ea2a92
fix: change dask dataframe to eager evaluation
glitt13 Nov 14, 2024
329a8e1
feat: partially-created unit tests corresponding to attribute transfo…
glitt13 Nov 14, 2024
3e0f378
feat: convert missing comid/attrs scripts into functions; doc: augmen…
glitt13 Nov 15, 2024
67398a3
fix: add in home_dir as optional part of attr config's directory form…
glitt13 Nov 15, 2024
b14c63a
fix: add logic on whether a warning prints after first checking if mi…
glitt13 Nov 15, 2024
60b776c
feat: add attribute config file parser function to R package proc.att…
glitt13 Nov 18, 2024
02de2b0
fix: address undefined objects in attr_cfig_parse
glitt13 Nov 18, 2024
33d11ec
fix: remove duplicated attr_cfig_parse from package file
glitt13 Nov 18, 2024
2c40272
feat: create missing attributes finder wrapper function
glitt13 Nov 19, 2024
5ee6d28
doc: update descriptive documentation for fs_attrs_miss_wrap()
glitt13 Nov 19, 2024
b8ff3cb
feat: add the missing attributes Rscript and wrapper documentation
glitt13 Nov 19, 2024
5b57a09
fix: remove items in transformation config file no longer used; doc: …
glitt13 Nov 19, 2024
d00fa8b
cherry-pick transform config file doc updates and remove deprecated i…
glitt13 Nov 19, 2024
21a41a7
fix: patch the attribute metadata comid column read by searching for …
glitt13 Nov 19, 2024
f2dfdd1
feat: add Rscript call that attempts to retrieve missing attributes i…
glitt13 Nov 19, 2024
7ada133
doc: add printout explaining Rscript called to retrieve missing attri…
glitt13 Nov 19, 2024
93d0655
merge missing comid-attribute grabber functionality
glitt13 Nov 19, 2024
1828f43
feat: create the kratzert et al 2019 preprocessing script to standard…
glitt13 Nov 21, 2024
233fea5
fix: allow multiple datasets to be parsed
glitt13 Nov 22, 2024
358def7
fix: allow multiple datasets to be parsed, assign datasets to Retr_Pa…
glitt13 Nov 22, 2024
fbab547
fix: streamline script now that Retr_Params created by attr_cfig_parse()
glitt13 Nov 22, 2024
3b42c55
fix: multidataset processing streamlining; fix: allow user to specify…
glitt13 Nov 22, 2024
131e5eb
doc: improve config file and processing script documentation
glitt13 Nov 22, 2024
c3ea15e
feat: add ealstm config files for processing
glitt13 Nov 22, 2024
7a819cf
fix: ensure algo config object doesn't become empty when looping over…
glitt13 Nov 22, 2024
874ae40
Add scripts and associated cfg file for model performance viz
bolotinl Nov 22, 2024
55a4147
Remove scratch code
bolotinl Nov 22, 2024
f3e4bad
fix: remove erroneous aggregation function for TOT_WB5100_yr_mean in …
glitt13 Nov 25, 2024
29b41c0
fix: path_meta.exists missing ()
glitt13 Nov 26, 2024
1ab3244
feat: add duplicate attribute checker/remover; feat: create attribute…
glitt13 Nov 26, 2024
d7650b4
fix: add duplicate handling to transformation processing
glitt13 Nov 26, 2024
8d098b4
rm: remove deprecated script
glitt13 Nov 26, 2024
2bf3d75
feat: implment parsing & function call for attribute csv file read op…
glitt13 Nov 26, 2024
370b73d
feat: update config files with new options to read attributes by csv,…
glitt13 Nov 26, 2024
59c3cca
feat: create demonstrative script on how to remove bad data generated…
glitt13 Nov 26, 2024
82566c4
feat: create ealstm attribute transformation config file, and an alte…
glitt13 Nov 26, 2024
2d885a7
Include download of US map
bolotinl Nov 26, 2024
aa1c3ba
Get ds_type, write_type from pred cfg; convert os to Pathlib
bolotinl Nov 26, 2024
fb5d15d
feat: work in progress on developing functions for analysis of rf mod…
glitt13 Nov 27, 2024
3c0cc87
Use existing functions for pulling info from attr config
bolotinl Nov 27, 2024
ae0a05d
feat: adding attributes of interest file for ealstm analysis
glitt13 Nov 27, 2024
d63d800
feat: adding PCA to agu script
glitt13 Nov 27, 2024
6ec6ba9
feat: add analysis dir to save directory structure
glitt13 Nov 28, 2024
170036a
feat: create correlation analyses
glitt13 Nov 28, 2024
91def5d
fix: simplify attribute filtering in dask dfs
glitt13 Nov 28, 2024
9b93ead
feat: add principal component analysis to dataset characterization
glitt13 Nov 28, 2024
f436080
feat: add figure importance plotting; feat: developing learning curve…
glitt13 Nov 29, 2024
e65d539
feat: add feature importance plot wrapper functional call
glitt13 Nov 29, 2024
b400d39
feat: create the learning curve plotting for each trained algorithm
glitt13 Nov 30, 2024
10a568e
merge Lauren's data viz for further usage
glitt13 Nov 30, 2024
bcbe77c
feat: integrate bolotinl's geospatial & regression plotting; refactor…
bolotinl Dec 2, 2024
f69fbba
fix: update dataset preprocessing
glitt13 Dec 2, 2024
56984fe
refactor: Adapt to updated comid/geometry retrieval
glitt13 Dec 3, 2024
b90bdcb
feat: Integrate visualization plotting for each dataset into the stan…
glitt13 Dec 3, 2024
8f3a738
feat: create a cross-comparison 'best' predictor analysis; refactor: …
glitt13 Dec 3, 2024
a37faec
fix: modify best map plotting for AGU 2024 poster
glitt13 Dec 3, 2024
de2bfe2
fix: non-multi param training should not access params from algo_conf…
glitt13 Dec 17, 2024
c60bc4e
fix: update function name change
glitt13 Dec 17, 2024
ddbc2e9
feat: all set for AGU24
glitt13 Dec 17, 2024
5ff2f04
fix: explicitly define arg names in AlgoTrainEval; fix: update new re…
glitt13 Dec 17, 2024
697eef5
feat: add a new 'metric' mapping for xSSA sobol' sensitivities
glitt13 Dec 18, 2024
56098da
fix: remove print message looking for objects that don't exist
glitt13 Dec 18, 2024
ac3067a
fix: rename accidental base path inside std_eval_metrs_path()
glitt13 Dec 18, 2024
c6cf6cb
Change viz scripts to call functions in fsate; add consistent plot st…
bolotinl Dec 19, 2024
374c8e5
doc: add documentation to fs_algo functions
glitt13 Dec 19, 2024
f269094
fix: remove scratch analysis
glitt13 Dec 19, 2024
59b25cf
fix: remove hydroatlas vars from config file
glitt13 Dec 19, 2024
b4c87e7
fix: move printout confirming write after write happens
glitt13 Dec 19, 2024
de8f937
refactor: hydroatlas accommodates local or s3 paths and nhdplus pulls…
glitt13 Dec 22, 2024
a653285
refactor: create a multi-attribute & multi-comid query approach for e…
glitt13 Dec 24, 2024
0cd57ef
refactor: remake attribute retrieval to pull multiple comids and attr…
glitt13 Dec 29, 2024
de5642b
fix: address issues exposed during unit testing
glitt13 Dec 30, 2024
e88f295
test: expand and revise unit tests for current functionality
glitt13 Dec 30, 2024
78e17c4
fix: minor spelling correction in fs_attr_menu.yaml; doc: convenience…
glitt13 Dec 30, 2024
19 changes: 19 additions & 0 deletions pkg/fs_algo/fs_algo/RaFTS_theme.mplstyle
@@ -0,0 +1,19 @@
# Style theme for RaFTS data visualizations

axes.labelsize : 12
lines.linewidth : 2
xtick.labelsize : 11
ytick.labelsize : 11
legend.fontsize : 11
font.family : Arial

# viridis color codes: https://waldyrious.net/viridis-palette-generator/
# viridis with a slightly lighter purple:
axes.prop_cycle: cycler('color', ['7e3b8a', '21918c', 'fde725', '3b528b', '5ec962'])

# Other odd options -------
# viridis:
# axes.prop_cycle: cycler('color', ['440154', '21918c', 'fde725', '3b528b', '5ec962'])

# viridis plasma:
# axes.prop_cycle: cycler('color', ['f89540', 'cc4778', '7e03a8', '0d0887', 'f0f921'])
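The .mplstyle sheet above uses matplotlib's plain `key : value` rcParams format, with `#` comments and blank lines ignored. As a minimal sketch of how those lines map to a parameter dict (the `parse_mplstyle` helper here is hypothetical, for illustration only — matplotlib does this internally via `plt.style.use`):

```python
def parse_mplstyle(text):
    """Parse simple 'key : value' lines from a .mplstyle-format string."""
    params = {}
    for line in text.splitlines():
        line = line.split('#', 1)[0].strip()  # drop comments
        if not line or ':' not in line:
            continue  # skip blank/comment-only lines
        key, value = line.split(':', 1)
        params[key.strip()] = value.strip()
    return params

style = """
axes.labelsize : 12
lines.linewidth : 2
font.family : Arial
"""
print(parse_mplstyle(style))
# {'axes.labelsize': '12', 'lines.linewidth': '2', 'font.family': 'Arial'}
```

In practice the sheet is applied exactly as fs_perf_viz.py does below, by passing its path to `plt.style.use()`.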
1,322 changes: 1,249 additions & 73 deletions pkg/fs_algo/fs_algo/fs_algo_train_eval.py

Large diffs are not rendered by default.

190 changes: 190 additions & 0 deletions pkg/fs_algo/fs_algo/fs_perf_viz.py
@@ -0,0 +1,190 @@
'''
@title: Produce data visualizations for RaFTS model performance outputs
@author: Lauren Bolotin <[email protected]>
@description: Reads in several config files,
visualizes results for the specified RaFTS algorithms and evaluation metrics,
and saves plots to .png files.
@usage: python fs_perf_viz.py "/full/path/to/viz_config.yaml"

Changelog/contributions
2024-11-22 Originally created, LB
'''
import geopandas as gpd
import os
import pandas as pd
from shapely.geometry import Point
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
from sklearn.metrics import r2_score
from sklearn.metrics import root_mean_squared_error
import yaml
from pathlib import Path
import argparse
import fs_algo.fs_algo_train_eval as fsate
import xarray as xr
import urllib.request
import zipfile
import pkg_resources


if __name__ == "__main__":
parser = argparse.ArgumentParser(description = 'process the data visualization config file')
parser.add_argument('path_viz_config', type=str, help='Path to the YAML configuration file specific for data visualization')
args = parser.parse_args()

home_dir = Path.home()
path_viz_config = Path(args.path_viz_config) #Path(f'{home_dir}/FSDS/formulation-selector/scripts/eval_ingest/xssa/xssa_viz_config.yaml')

with open(path_viz_config, 'r') as file:
viz_cfg = yaml.safe_load(file)

# Get features from the viz config file --------------------------
algos = viz_cfg.get('algos')
print('Visualizing data for the following RaFTS algorithms:')
print(algos)
print('')
metrics = viz_cfg.get('metrics')
print('And for the following evaluation metrics:')
print(metrics)
print('')

plot_types = viz_cfg.get('plot_types')
plot_types_dict = {k: v for d in plot_types for k, v in d.items()}
true_keys = [key for key, value in plot_types_dict.items() if value is True]
print('The following plots will be generated:')
print(true_keys)
print('')

# Get features from the pred config file --------------------------
path_pred_config = fsate.build_cfig_path(path_viz_config,viz_cfg.get('name_pred_config',None)) # currently, this gives the pred config path, not the attr config path
pred_cfg = yaml.safe_load(open(path_pred_config, 'r'))
path_attr_config = fsate.build_cfig_path(path_pred_config,pred_cfg.get('name_attr_config',None))
ds_type = pred_cfg.get('ds_type')
write_type = pred_cfg.get('write_type')

# Get features from the attr config file --------------------------
attr_cfg = fsate.AttrConfigAndVars(path_attr_config)
attr_cfg._read_attr_config()
datasets = attr_cfg.attrs_cfg_dict.get('datasets')
dir_base = attr_cfg.attrs_cfg_dict.get('dir_base')
dir_std_base = attr_cfg.attrs_cfg_dict.get('dir_std_base')

# Get features from the main config file --------------------------
# NOTE: This assumes that the main config file is just called [same prefix as all other config files]_config.yaml
# Build the path to the main config file by referencing the other config files we've already read in
prefix_viz = str(path_viz_config.name).split('_')[0]
prefix_attr = str(path_attr_config.name).split('_')[0]
if (prefix_viz != prefix_attr):
raise ValueError('All config files must be in the same directory and be\
identifiable using the same prefix as each other (e.g.\
[dataset]_config.yaml, [dataset]_pred_config.yaml, \
[dataset]_attr_config.yaml, etc.)')
else:
prefix = prefix_viz

path_main_config = fsate.build_cfig_path(path_viz_config,f'{prefix_viz}_config.yaml')
with open(path_main_config, 'r') as file:
main_cfg = yaml.safe_load(file)

# NOTE: This is something I'm not totally sure will function properly with multiple datasets
formulation_id = list([x for x in main_cfg['formulation_metadata'] if 'formulation_id' in x][0].values())[0]
save_type = list([x for x in main_cfg['file_io'] if 'save_type' in x][0].values())[0]
if save_type.lower() == 'netcdf':
save_type_obs = 'nc'
engine = 'netcdf4'
else:
save_type_obs = 'zarr'
engine = 'zarr'

# Access the location metadata for prediction sites
path_meta_pred = pred_cfg.get('path_meta')

# Location for accessing existing outputs and saving plots
dir_out = fsate.fs_save_algo_dir_struct(dir_base).get('dir_out')
dir_out_viz_base = Path(dir_out/Path("data_visualizations"))

# Enforce style
style_path = pkg_resources.resource_filename('fs_algo', 'RaFTS_theme.mplstyle')
plt.style.use(style_path)

# Loop through all datasets
for ds in datasets:
path_meta_pred = f'{path_meta_pred}'.format(ds = ds, dir_std_base = dir_std_base, ds_type = ds_type, write_type = write_type)
meta_pred = pd.read_parquet(path_meta_pred)

# Loop through all algorithms
for algo in algos:
# Loop through all metrics
for metric in metrics:
# Pull the predictions
path_pred = fsate.std_pred_path(dir_out,algo=algo,metric=metric,dataset_id=ds)
pred = pd.read_parquet(path_pred)
data = pd.merge(meta_pred, pred, how = 'inner', on = 'comid')
Path(f'{dir_out}/data_visualizations').mkdir(parents=True, exist_ok=True)
# If you want to export the merged data for any reason:
# data.to_csv(f'{dir_out}/data_visualizations/{ds}_{algo}_{metric}_data.csv')

# Does the user want a map of the module performance predicted by RaFTS?
if 'pred_map' in true_keys:
states = fsate.gen_conus_basemap(f'{dir_out}/data_visualizations/')

# Plot performance on map
lat = data['Y']
lon = data['X']
geometry = [Point(xy) for xy in zip(lon,lat)]
geo_df = gpd.GeoDataFrame(geometry = geometry)
geo_df['performance'] = data['prediction'].values
geo_df.crs = ("EPSG:4326")

fsate.plot_map_pred(geo_df=geo_df, states=states,
title=f'RaFTS Predicted Performance Map: {ds}',
metr=metric, colname_data='performance')

# Save the plot as a .png file
output_path = fsate.std_map_pred_path(dir_out_viz_base=dir_out_viz_base,
ds=ds, metr=metric, algo_str=algo,
split_type='prediction')
plt.savefig(output_path, dpi=300, bbox_inches='tight')
plt.clf()
plt.close()


if 'obs_vs_sim_scatter' in true_keys:
# Scatter plot of observed vs. predicted module performance
# Strip non-digit characters (e.g. the 'USGS-' prefix) so ids can be merged with the observed performance data
data['identifier'] = data['identifier'].str.replace(r'\D', '', regex=True)
data['identifier'] = data['identifier'].str.strip() # remove leading and trailing spaces

# Read in the observed performance data
path_obs_perf = f'{dir_std_base}/{ds}/{ds}_{formulation_id}.{save_type_obs}'
obs = xr.open_dataset(path_obs_perf, engine=engine)
# NOTE: Below is one option, but it assumes there is only one possible .nc or .zarr file to read in (it only reads the first one it finds with that file extension)
# obs = fsate._open_response_data_fs(dir_std_base=dir_std_base, ds=ds)
obs = obs.to_dataframe()

# Standardize column names
obs.reset_index(inplace=True)
obs = obs.rename(columns={"gage_id": "identifier"})

# Subset columns
data = data[['identifier', 'comid', 'X', 'Y', 'prediction', 'metric', 'dataset']]
data = data[data['metric'] == metric]
data.columns = data.columns.str.lower()
obs = obs[['identifier', metric]]

# Merge the observed and predicted data
data = pd.merge(data, obs, how = 'inner', on = 'identifier')

# Plot the observed vs. predicted module performance
fsate.plot_pred_vs_obs_regr(y_pred=data['prediction'], y_obs=data[metric],
ds = ds, metr=metric)

# Save the plot as a .png file
output_path = fsate.std_regr_pred_obs_path(dir_out_viz_base=dir_out_viz_base,
ds=ds, metr=metric, algo_str=algo,
split_type='prediction')
plt.savefig(output_path, dpi=300, bbox_inches='tight')
plt.clf()
plt.close()

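The `plot_types` handling near the top of fs_perf_viz.py flattens the YAML list of single-key dicts (e.g. `- pred_map: True`) into one dict and keeps only the plots enabled with a literal `True`. A self-contained sketch of that logic, with an example config value standing in for `viz_cfg.get('plot_types')`:

```python
# Example of what yaml.safe_load produces for a viz config like:
#   plot_types:
#     - pred_map: True
#     - obs_vs_sim_scatter: False
plot_types = [{'pred_map': True}, {'obs_vs_sim_scatter': False}]

# Flatten the list of single-key dicts into one dict...
plot_types_dict = {k: v for d in plot_types for k, v in d.items()}
# ...then keep only the plot types explicitly set to True
true_keys = [key for key, value in plot_types_dict.items() if value is True]
print(true_keys)  # ['pred_map']
```

Note the `is True` check means truthy-but-not-boolean values (e.g. `pred_map: 1`) would be silently skipped, so the config must use YAML booleans.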
37 changes: 30 additions & 7 deletions pkg/fs_algo/fs_algo/fs_proc_algo.py
@@ -4,6 +4,7 @@
from pathlib import Path
import fs_algo.fs_algo_train_eval as fsate
import ast
import numpy as np

"""Workflow script to train algorithms on catchment attribute data for predicting
formulation metrics and/or hydrologic signatures.
@@ -27,10 +28,12 @@
algo_config = {k: algo_cfg['algorithms'][k] for k in algo_cfg['algorithms']}
if algo_config['mlp'][0].get('hidden_layer_sizes',None): # purpose: evaluate string literal to a tuple
algo_config['mlp'][0]['hidden_layer_sizes'] = ast.literal_eval(algo_config['mlp'][0]['hidden_layer_sizes'])

algo_config_og = algo_config.copy()

verbose = algo_cfg['verbose']
test_size = algo_cfg['test_size']
seed = algo_cfg['seed']
read_type = algo_cfg.get('read_type','all') # Arg for how to read attribute data using comids in fs_read_attr_comid(). May be 'all' or 'filename'.

#%% Attribute configuration
name_attr_config = algo_cfg.get('name_attr_config', Path(path_algo_config).name.replace('algo','attr'))
@@ -45,7 +48,18 @@
attr_cfig = fsate.AttrConfigAndVars(path_attr_config)
attr_cfig._read_attr_config()

attrs_sel = attr_cfig.attrs_cfg_dict.get('attrs_sel', None)


# Grab the attributes of interest from the attribute config file,
# OR a .csv file if specified in the algo config file.
name_attr_csv = algo_cfg.get('name_attr_csv')
colname_attr_csv = algo_cfg.get('colname_attr_csv')
attrs_sel = fsate._id_attrs_sel_wrap(attr_cfig=attr_cfig,
path_cfig=path_attr_config,
name_attr_csv = name_attr_csv,
colname_attr_csv = colname_attr_csv)

# Define directories/datasets from the attribute config file
dir_db_attrs = attr_cfig.attrs_cfg_dict.get('dir_db_attrs')
dir_std_base = attr_cfig.attrs_cfg_dict.get('dir_std_base')
dir_base = attr_cfig.attrs_cfg_dict.get('dir_base')
@@ -71,22 +85,29 @@

# %% COMID retrieval and assignment to response variable's coordinate
[featureSource,featureID] = fsate._find_feat_srce_id(dat_resp,attr_cfig.attr_config) # e.g. ['nwissite','USGS-{gage_id}']
comids_resp = fsate.fs_retr_nhdp_comids(featureSource,featureID,gage_ids=dat_resp['gage_id'].values)
gdf_comid = fsate.fs_retr_nhdp_comids_geom(featureSource=featureSource,
featureID=featureID,
gage_ids=dat_resp['gage_id'].values)
comids_resp = gdf_comid['comid']
dat_resp = dat_resp.assign_coords(comid = comids_resp)

# Remove the unknown comids:
dat_resp = dat_resp.dropna(dim='comid',how='any')
comids_resp = [x for x in comids_resp if x is not np.nan]
# TODO allow secondary option where featureSource and featureIDs already provided, not COMID

#%% Read in predictor variable data (aka basin attributes)
# Read the predictor variable data (basin attributes) generated by proc.attr.hydfab
df_attr = fsate.fs_read_attr_comid(dir_db_attrs, comids_resp, attrs_sel = attrs_sel,
_s3 = None,storage_options=None)
_s3 = None,storage_options=None,read_type=read_type)
# Convert into wide format for model training
df_attr_wide = df_attr.pivot(index='featureID', columns = 'attribute', values = 'value')

# %% Train, test, and evaluate
rslt_eval = dict()
for metr in metrics:
print(f' - Processing {metr}')
if len(algo_config) == 0:
algo_config = algo_config_og.copy()
# Subset response data to metric of interest & the comid
df_metr_resp = pd.DataFrame({'comid': dat_resp['comid'],
metr : dat_resp[metr].data})
@@ -103,10 +124,12 @@
metr=metr,test_size=test_size, rs = seed,
verbose=verbose)
train_eval.train_eval() # Train, test, eval wrapper

# Retrieve evaluation metrics dataframe
rslt_eval[metr] = train_eval.eval_df

path_eval_metr = fsate.std_eval_metrs_path(dir_out_alg_ds, ds,metr)
train_eval.eval_df.to_csv(path_eval_metr)
del train_eval
# Compile results and write to file
rslt_eval_df = pd.concat(rslt_eval).reset_index(drop=True)
rslt_eval_df['dataset'] = ds
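Before training, fs_proc_algo.py reshapes the long attribute table into wide format with `df_attr.pivot(index='featureID', columns='attribute', values='value')`, so each row is a catchment and each column an attribute. A pure-Python sketch of that long-to-wide reshape (the feature ids and attribute names here are illustrative only, not from the PR):

```python
# Long-format rows as (featureID, attribute, value), mimicking df_attr
rows = [
    ('comid_1', 'TOT_ELEV_MEAN', 512.0),
    ('comid_1', 'TOT_PPT7100_ANN', 900.0),
    ('comid_2', 'TOT_ELEV_MEAN', 233.0),
    ('comid_2', 'TOT_PPT7100_ANN', 1210.0),
]

# Pivot to wide: one dict of attributes per featureID,
# equivalent in shape to df_attr.pivot(index='featureID',
# columns='attribute', values='value')
wide = {}
for feature_id, attribute, value in rows:
    wide.setdefault(feature_id, {})[attribute] = value

print(wide['comid_1']['TOT_ELEV_MEAN'])  # 512.0
```

With pandas, `pivot` would raise on duplicate (featureID, attribute) pairs — which is why the PR's commits add a duplicate attribute checker/remover upstream.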