patch: proc_attr_grabber.R (#29)
* Add alternate comid retrieval via sf geometry in case nwissite returns comid of NA

* fix: add gage_id inside each loc_attrs df; fix: set fill=TRUE for rbindlist

* fix: add usgs_vars sublist to Retr_Params

* feat: add a format checker on Retr_Params

* feat: add attribute variable name checker, incorporate check_attr_selection() into standard processing
glitt13 authored Nov 5, 2024
1 parent 929371b commit b24ab4d
Showing 4 changed files with 174 additions and 7 deletions.
2 changes: 1 addition & 1 deletion pkg/proc.attr.hydfab/DESCRIPTION
@@ -1,6 +1,6 @@
Package: proc.attr.hydfab
Title: Grab and process catchment attributes using the hydrofabric
-Version: 0.0.1.0013
+Version: 0.0.1.0014
Authors@R:
c(person("Guy", "Litt", , "[email protected]", role = c("aut", "cre"),
comment = c(ORCID = "https://orcid.org/0000-0003-1996-7468")),
64 changes: 58 additions & 6 deletions pkg/proc.attr.hydfab/R/proc_attr_grabber.R
@@ -375,6 +375,10 @@ proc_attr_wrap <- function(comid, Retr_Params, lyrs='network',overwrite=FALSE,hf
path_attrs <- base::file.path(Retr_Params$paths$dir_db_attrs,
base::paste0("comid_",comid,"_attrs.parquet"))
vars_ls <- Retr_Params$vars
+  # ------- Retr_Params$vars format checker --------- #
+  # Run check on requested variables for retrieval:
+  proc.attr.hydfab:::wrap_check_vars(vars_ls)
+
# ----------- existing dataset checker ----------- #
ls_chck <- proc.attr.hydfab::proc_attr_exst_wrap(comid,path_attrs,
vars_ls,bucket_conn=NA)
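As context for the new checker, a minimal sketch of a Retr_Params list that passes validation; the paths below are placeholders, and the structure mirrors the example script added in this commit (scripts/config/attr_gen_camels.R):

# Illustrative only: placeholder paths; structure mirrors attr_gen_camels.R
Retr_Params <- base::list(
  paths = base::list(
    dir_db_attrs = "~/data/attributes",        # comid_<comid>_attrs.parquet files live here
    dir_std_base = "~/data/user_data_std"),    # standardized datasets from fs_proc
  vars = base::list(
    usgs_vars = c("TOT_TWI","TOT_BASIN_AREA")), # sublist name must be a recognized category
  datasets = "camelsii_nhdp_grab_nov24",
  xtra_hfab = base::list(hfab_retr = FALSE))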
@@ -513,33 +517,39 @@ proc_attr_gageids <- function(gage_ids,featureSource,featureID,Retr_Params,
revisit the configuration yaml file that processes this dataset in
fs_proc: \n {featureSource}, and featureID={featureID}"))
} else if (!is.null(site_feature)){
-      comid <- site_feature['comid']$comid
-      ls_site_feat[[gage_id]] <- site_feature
+      if(!base::is.na(site_feature['comid']$comid)){
+        comid <- site_feature['comid']$comid
+      } else {
+        message(glue::glue("Could not retrieve comid for {nldi_feat$featureID}."))
+        comid <- nhdplusTools::discover_nhdplus_id(point=site_feature$geometry)
+        message(glue::glue("Geospatial search found a comid value of: {comid}"))
+      }
ls_comid[[gage_id]] <- comid

# Retrieve the variables corresponding to datasets of interest & update database
loc_attrs <- try(proc.attr.hydfab::proc_attr_wrap(comid=comid,
Retr_Params=Retr_Params,
lyrs=lyrs,overwrite=FALSE,
hfab_retr=hfab_retr))
+      loc_attrs$gage_id <- gage_id # Add the original identifier to dataset
+      ls_site_feat[[gage_id]] <- loc_attrs
if("try-error" %in% class(loc_attrs)){
message(glue::glue("Skipping gage_id {gage_id} corresponding to comid {comid}"))
}
} else {
message(glue::glue("Skipping {gage_id}"))
}
}
-  just_comids <- ls_comid %>% unname() %>% unlist()
+  just_comids <- ls_comid %>% base::unname() %>% base::unlist()

if(any(is.na(just_comids))){
-    idxs_na_comids <- which(is.na(just_comids))
+    idxs_na_comids <- base::which(base::is.na(just_comids))
gage_ids_missing <- paste0(names(ls_comid[idxs_na_comids]), collapse = ", ")
warning(glue::glue("The following gage_id values did not return a comid:\n
{gage_ids_missing}"))
}

-  dt_site_feat <- data.table::rbindlist(ls_site_feat)
-  dt_site_feat$gage_id <- gage_ids # Add the original identifier to dataset
+  dt_site_feat <- data.table::rbindlist(ls_site_feat,fill = TRUE)
return(dt_site_feat)
}
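The NA-comid fallback above can be exercised on its own; a short sketch, assuming an illustrative lon/lat point rather than a gage from this commit:

# Sketch of the geospatial fallback: when the NLDI lookup yields an NA comid,
# nhdplusTools::discover_nhdplus_id() recovers the comid from an sf point.
pt <- sf::st_sfc(sf::st_point(c(-89.4, 43.1)), crs = 4326)  # assumed WGS84 lon/lat
comid <- nhdplusTools::discover_nhdplus_id(point = pt)
message(glue::glue("Geospatial search found a comid value of: {comid}"))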

@@ -795,6 +805,48 @@ write_meta_nldi_feat <- function(dt_site_feat, path_meta){
base::message(glue::glue("Wrote nldi location metadata to {path_meta}"))
}

+wrap_check_vars <- function(vars_ls){
+  #' @title Internal wrapper to run checks on requested attribute variable names
+  #' @param vars_ls A named list from Retr_Params$vars in the standardized format
+  #' @description Given a list of variable categories, each containing vectors
+  #' of variable names, check the following:
+  #' 1) the variable category is a recognized category name (e.g. 'usgs_vars')
+  #' 2) the variable names inside the category are actual variable names
+  #' that can be used to retrieve attributes (e.g. 'TOT_TWI' as an nhdplus attribute)
+
+  # Get the accepted variable categories used in the proc.attr.hydfab R package
+  dir_pkg <- system.file("extdata",package="proc.attr.hydfab")
+  cfg_attr_src <- yaml::read_yaml(base::file.path(dir_pkg,"attr_source_types.yml"))
+  var_catgs <- base::lapply(cfg_attr_src,
+                            function(x) base::unlist(x)[['name']]) %>%
+    base::unlist() %>% base::unname()
+
+  # Check which variable categories the user provided in Retr_Params$vars
+  names_var_catg <- base::names(vars_ls)
+  if(base::any(base::is.null(names_var_catg))){
+    stop(glue::glue("Retr_Params$vars should be a sublist with sublist names ",
+                    "corresponding to\n standardized names in the proc.attr.hydfab package.",
+                    " These names include:\n{paste0(var_catgs,collapse='\n')}"))
+  }
+
+  # Test that each provided category name is among the accepted categories
+  test_bool_var_catg <- base::lapply(names_var_catg,
+                                     function(x) x %in% var_catgs) %>% unlist()
+  if(base::any(!test_bool_var_catg)){
+    stop(glue::glue("Retr_Params$vars contains the following unrecognized ",
+                    "variable category name(s): ",
+                    "{paste0(names_var_catg[!test_bool_var_catg],collapse='\n')}",
+                    "\nAcceptable names include:\n",
+                    "{paste0(var_catgs,collapse='\n')}"
+    ))
+  }
+
+  # ------------------ RUN CHECK ON INDIVIDUAL VARIABLE NAMES -------------- #
+  for(var_group_name in names(vars_ls)){
+    sub_vars <- vars_ls[[var_group_name]]
+    proc.attr.hydfab::check_attr_selection(vars=sub_vars)
+  }
+}
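A usage sketch for the checker (wrap_check_vars() is internal, hence the ::: access; 'my_vars' is a deliberately invalid category name for illustration):

vars_ok  <- base::list(usgs_vars = c("TOT_TWI","TOT_BASIN_AREA"))
vars_bad <- base::list(my_vars = c("TOT_TWI"))     # not a standardized category name
proc.attr.hydfab:::wrap_check_vars(vars_ok)        # passes; names vetted via check_attr_selection()
try(proc.attr.hydfab:::wrap_check_vars(vars_bad))  # stops, listing acceptable category names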

check_attr_selection <- function(attr_cfg_path = NULL, vars = NULL, verbose = TRUE){
#' @title Check that attributes selected by user are available
61 changes: 61 additions & 0 deletions scripts/config/attr_gen_camels.R
@@ -0,0 +1,61 @@
#' @title Generate attributes for CAMELS basins
#' @description This script uses the proc.attr.hydfab package to acquire attributes
#' of interest.
#'


library(dplyr)
library(glue)
library(tidyr)
library(yaml)
library(proc.attr.hydfab)

main <- function(){
# Define the base path from the user's home directory
home_dir <- Sys.getenv("HOME")

############################ BEGIN CUSTOM MUNGING ############################

# ------------------------- Read in CAMELS gage ids ------------------------ #
path_gages_ii <- glue::glue("{home_dir}/noaa/camels/gagesII_wood/gages_list.txt")
dat_gages_ii <- read.csv(path_gages_ii)
# Extract gage ids from the gpkg filenames listed in gages_list.txt
gage_ids <- base::sapply(1:base::nrow(dat_gages_ii), function(i)
  utils::tail(base::strsplit(dat_gages_ii[i,], split = ' ', fixed = TRUE)[[1]], n = 1))
gage_ids <- base::gsub(pattern = ".gpkg", replacement = "", x = gage_ids) # gsub is vectorized; no lapply needed
gage_ids <- base::gsub(pattern = "Gage_", replacement = "", x = gage_ids)

utils::write.table(gage_ids,glue::glue('{home_dir}/noaa/camels/gagesII_wood/camels_ii_gage_ids.txt'),row.names = FALSE,col.names = FALSE)

# --------------------- Read in usgs NHD attribute IDs --------------------- #
# Read desired usgs nhdplus attributes, stored in NOAA shared drive here:
# https://docs.google.com/spreadsheets/d/1h-630L2ChH5zlQIcWJHVaxY9YXtGowcCqakQEAXgRrY/edit?usp=sharing
attrs_nhd_df <- read.csv(glue::glue("{home_dir}/noaa/regionalization/processing/usgs_nhdplus_attrs.csv"))

attrs_nhd <- attrs_nhd_df$ID

Retr_Params <- list(paths = list(dir_db_attrs = glue::glue("{home_dir}/noaa/regionalization/data/input/attributes/"),
dir_std_base = glue::glue("{home_dir}/noaa/regionalization/data/input/user_data_std")),
vars = list(usgs_vars = attrs_nhd),
datasets = "camelsii_nhdp_grab_nov24",
xtra_hfab = list(hfab_retr=FALSE))


############################ END CUSTOM MUNGING ##############################

# ---------------------- Grab all needed attributes ---------------------- #
# Now acquire the attributes:
ls_comids <- proc.attr.hydfab::proc_attr_gageids(gage_ids=gage_ids,
featureSource='nwissite',
featureID='USGS-{gage_id}',
Retr_Params=Retr_Params,
overwrite=FALSE)

message(glue::glue("Completed attribute acquisition for {Retr_Params$paths$dir_db_attrs}"))
}


main()
54 changes: 54 additions & 0 deletions scripts/config/camels_attr_config.yaml
@@ -0,0 +1,54 @@
# Config for grabbing catchment attributes corresponding to standard-named locations
# Two options exist for defining locations that need attributes. At least one must be used. Both may be used.
# 1. Refer to a file/dataset {loc_id_filepath} with a column identifier {loc_id} representing a standardized location identifier.
# 2. Refer to a dataset processed by fs_proc python package and point to its location, {dir_std_base}/{datasets}, where {datasets} is a specific subdirectory name(s) or simply 'all'

col_schema: # required column mappings in the evaluation metrics dataset (if read in)
- 'featureID': 'USGS-{gage_id}' # python f-string / R glue() format; converting the 'gage_id' to the standardized featureID used by nhdplusTools/hydrofabric. Must use '{gage_id}' e.g. 'USGS-{gage_id}'
- 'featureSource': 'nwissite' # The standardized nhdplusTools featureSource. Possible featureSources might be 'nwissite', 'comid'.
loc_id_read: # This section is only required for locations NOT read in from a standardized dataset location (dir_std_base). May be used for additional prediction locations. MUST keep each item name inside the list, with empty assignments, if no such datasets are desired.
- 'gage_id': 'gage_id' # expects tabular dataset with this column name representing the location id.
- 'loc_id_filepath': '' # Required. Filepath. Allows reading a .csv or a dataset accessible using arrow::open_dataset() in lieu of reading a dataset generated by fs_proc.
- 'featureID_loc' : 'USGS-{gage_id}' # python f-string / R glue() format; converting the location id column to the standardized featureID used by nhdplusTools/hydrofabric. Must use '{gage_id}' e.g. 'USGS-{gage_id}'.
- 'featureSource_loc': 'nwissite' # The standardized nhdplusTools featureSource.
file_io: # May define {home_dir} for python's '{home_dir}/string_path'.format(home_dir =str(Path.home())) functionality
- 'save_loc': 'local' # TODO: implement once s3 becomes a capability. Use 'local' for saving to a local path via dir_save. Future work will create an approach for 'aws' or other cloud saving methods
- 'dir_base' : '{home_dir}/noaa/regionalization/data/input' # Required. The save location of standardized output
- 'dir_std_base' : '{dir_base}/user_data_std' # Required. The location of standardized data generated by fs_proc python package
- 'dir_db_hydfab' : '{dir_base}/hydrofabric' # Required. The local dir where hydrofabric data are stored (limits the total s3 connections)
- 'dir_db_attrs' : '{dir_base}/attributes' # Required. The parent dir where each comid's attribute parquet file is stored in the subdirectory 'comid/', and each dataset's aggregated parquet attributes are stored in the subdirectory '{dataset_name}/'
formulation_metadata:
- 'datasets': # Required. Must match directory name inside dir_std_base. May be a list of items, or simply sublist 'all' to select everything inside dir_std_base for attribute grabbing.
- 'juliemai-xSSA' # Required. In this example case, it's a sublist of just one thing.
- 'formulation_base': 'Raven_blended' # Informational. Unique name of formulation. Optional.
hydfab_config: # Required section describing hydrofabric connection details and objects of interest
- 's3_base' : "s3://lynker-spatial/tabular-resources" # Required. s3 path containing hydrofabric-formatted attribute datasets
- 's3_bucket' : 'lynker-spatial' # Required. s3 bucket containing hydrofabric data
- 'ext' : 'gpkg' # Required. File extension of the hydrofabric data. Default 'gpkg'.
- 'hf_cat_sel': "total" # Required. Options include 'total' or 'all'; total: interested in the single location's aggregated catchment data; all: all subcatchments of interest
attr_select: # Required. The names of variable sublistings are standardized, e.g. ha_vars, usgs_vars, sc_vars
- 's3_path_hydatl' : '{s3_base}/hydroATLAS/hydroatlas_vars.parquet' # path to hydroatlas data formatted for hydrofabric. Required only if hydroatlas variables desired.
- 'ha_vars': # hydroatlas variables. Must specify s3_path_hydatl if desired.
- 'pet_mm_s01'
- 'cly_pc_sav'
- 'cly_pc_uav'
- 'ari_ix_sav'
- 'usgs_vars': # list of variables retrievable using nhdplusTools::get_characteristics_metadata().
- 'TOT_TWI'
- 'TOT_PRSNOW'
- 'TOT_POPDENS90'
- 'TOT_EWT'
- 'TOT_RECHG'
- 'TOT_PPT7100_ANN'
- 'TOT_AET'
- 'TOT_PET'
- 'TOT_SILTAVE'
- 'TOT_BASIN_AREA'
- 'TOT_BASIN_SLOPE'
- 'TOT_ELEV_MEAN'
- 'TOT_ELEV_MAX'
- 'TOT_Intensity'
- 'TOT_Wet'
- 'TOT_Dry'
- 'sc_vars': # Streamcat variables of interest. #TODO add streamcat grabber capability to proc.attr.hydfab
- # In this example case, no streamcat variables selected
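A sketch (not part of this commit) of flattening this yaml's attr_select section into the vars sublist expected by Retr_Params; it assumes yaml::read_yaml() parses each '- key: value' item as a one-element named list:

cfg <- yaml::read_yaml("scripts/config/camels_attr_config.yaml")
attr_sel <- base::do.call(c, cfg$attr_select)  # merge one-element lists into one named list
vars_ls <- base::lapply(attr_sel[c("ha_vars","usgs_vars")], base::unlist)
Retr_Params <- base::list(vars = vars_ls)      # plus paths/datasets as in attr_gen_camels.R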
