patch: proc_attr_grabber.R #29

Merged (6 commits) on Nov 5, 2024

2 changes: 1 addition & 1 deletion pkg/proc.attr.hydfab/DESCRIPTION
@@ -1,6 +1,6 @@
Package: proc.attr.hydfab
Title: Grab and process catchment attributes using the hydrofabric
Version: 0.0.1.0013
Version: 0.0.1.0014
Authors@R:
c(person("Guy", "Litt", , "[email protected]", role = c("aut", "cre"),
comment = c(ORCID = "https://orcid.org/0000-0003-1996-7468")),
64 changes: 58 additions & 6 deletions pkg/proc.attr.hydfab/R/proc_attr_grabber.R
@@ -375,6 +375,10 @@ proc_attr_wrap <- function(comid, Retr_Params, lyrs='network',overwrite=FALSE,hf
path_attrs <- base::file.path(Retr_Params$paths$dir_db_attrs,
base::paste0("comid_",comid,"_attrs.parquet"))
vars_ls <- Retr_Params$vars
# ------- Retr_Params$vars format checker --------- #
# Run check on requested variables for retrieval:
proc.attr.hydfab:::wrap_check_vars(vars_ls)

# ----------- existing dataset checker ----------- #
ls_chck <- proc.attr.hydfab::proc_attr_exst_wrap(comid,path_attrs,
vars_ls,bucket_conn=NA)
@@ -513,33 +517,39 @@ proc_attr_gageids <- function(gage_ids,featureSource,featureID,Retr_Params,
revisit the configuration yaml file that processes this dataset in
fs_proc: \n {featureSource}, and featureID={featureID}"))
} else if (!is.null(site_feature)){
comid <- site_feature['comid']$comid
ls_site_feat[[gage_id]] <- site_feature
if(!base::is.na(site_feature['comid']$comid)){
comid <- site_feature['comid']$comid
} else {
message(glue::glue("Could not retrieve comid for {nldi_feat$featureID}."))
comid <- nhdplusTools::discover_nhdplus_id(point=site_feature$geometry)
message(glue::glue("Geospatial search found a comid value of: {comid}"))
}
ls_comid[[gage_id]] <- comid

# Retrieve the variables corresponding to datasets of interest & update database
loc_attrs <- try(proc.attr.hydfab::proc_attr_wrap(comid=comid,
Retr_Params=Retr_Params,
lyrs=lyrs,overwrite=FALSE,
hfab_retr=hfab_retr))
loc_attrs$gage_id <- gage_id # Add the original identifier to dataset
ls_site_feat[[gage_id]] <- loc_attrs
if("try-error" %in% class(loc_attrs)){
message(glue::glue("Skipping gage_id {gage_id} corresponding to comid {comid}"))
}
} else {
message(glue::glue("Skipping {gage_id}"))
}
}
just_comids <- ls_comid %>% unname() %>% unlist()
just_comids <- ls_comid %>% base::unname() %>% base::unlist()

if(any(is.na(just_comids))){
idxs_na_comids <- which(is.na(just_comids))
idxs_na_comids <- base::which(base::is.na(just_comids))
gage_ids_missing <- paste0(names(ls_comid[idxs_na_comids]), collapse = ", ")
warning(glue::glue("The following gage_id values did not return a comid:\n
{gage_ids_missing}"))
}

dt_site_feat <- data.table::rbindlist(ls_site_feat)
dt_site_feat$gage_id <- gage_ids # Add the original identifier to dataset
dt_site_feat <- data.table::rbindlist(ls_site_feat,fill = TRUE)
return(dt_site_feat)
}
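
The substantive change in this hunk: when NLDI returns an NA comid for a site, proc_attr_gageids now falls back to a geospatial lookup on the site's geometry rather than failing. A minimal standalone sketch of that fallback, using illustrative coordinates that are not from this PR:

library(sf)
library(glue)
library(nhdplusTools)

# Illustrative point in EPSG:4326; any sf POINT geometry works here
point <- sf::st_sfc(sf::st_point(c(-89.362239, 43.090266)), crs = 4326)

# The same lookup the PR adds when site_feature['comid']$comid is NA
comid <- nhdplusTools::discover_nhdplus_id(point = point)
message(glue::glue("Geospatial search found a comid value of: {comid}"))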

@@ -795,6 +805,48 @@ write_meta_nldi_feat <- function(dt_site_feat, path_meta){
base::message(glue::glue("Wrote nldi location metadata to {path_meta}"))
}

wrap_check_vars <- function(vars_ls){
#' @title Internal wrapper to run checks on requested attribute variable names
#' @param vars_ls A named list from Retr_Params$vars in the standardized format
#' @description Given a list of variable categories, each containing vectors
#' of variable names, check the following:
#' 1) the variable category is a recognized category name (e.g. 'usgs_vars')
#' 2) the variable names inside the category name are actual variable names
#' that can be used to retrieve attributes (e.g. 'TOT_TWI' as an nhdplus attribute)

# Get the accepted variable categories used in the proc.attr.hydfab R package
dir_pkg <- system.file("extdata",package="proc.attr.hydfab")
cfg_attr_src <- yaml::read_yaml(base::file.path(dir_pkg,"attr_source_types.yml"))
var_catgs <- base::lapply(cfg_attr_src,
function(x) base::unlist(x)[['name']]) %>%
base::unlist() %>% base::unname()

# Now check which variable categories the user provided in Retr_Params$vars
names_var_catg <- base::names(vars_ls)
if(base::any(base::is.null(names_var_catg))){
stop(glue::glue("Retr_Params$vars should be a sublist with sublist names ",
"corresponding to\n standardized names in the proc.attr.hydfab package.",
" These names include:\n{paste0(var_catgs,collapse='\n')}"))
}

# Test that each provided variable category name is a recognized category
test_bool_var_catg <- base::lapply(names_var_catg,
function(x) x %in% var_catgs) %>% unlist()
if(base::any(!test_bool_var_catg)){
stop(glue::glue("Retr_Params$vars contains the following unrecognized ",
"variable category name(s): ",
"{paste0(names_var_catg[!test_bool_var_catg],collapse='\n')}",
"\nAcceptable names include:\n",
"{paste0(var_catgs,collapse='\n')}"
))
}

# ------------------ RUN CHECK ON INDIVIDUAL VARIABLE NAMES -------------- #
for(var_group_name in names(vars_ls)){
sub_vars <- vars_ls[[var_group_name]]
proc.attr.hydfab::check_attr_selection(vars=sub_vars)
}
}
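
A hedged usage sketch of the new checker, assuming 'usgs_vars' and 'ha_vars' appear among the accepted category names in the package's extdata file attr_source_types.yml (the ::: accessor is needed because wrap_check_vars is internal):

# Well-formed Retr_Params$vars: a named list mapping category -> variable names
vars_ls <- list(usgs_vars = c("TOT_TWI", "TOT_BASIN_AREA"),
                ha_vars   = c("pet_mm_s01"))
proc.attr.hydfab:::wrap_check_vars(vars_ls) # no error expected when all names are valid

# An unrecognized category name should stop() and list the accepted names:
# proc.attr.hydfab:::wrap_check_vars(list(bad_category = c("TOT_TWI")))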

check_attr_selection <- function(attr_cfg_path = NULL, vars = NULL, verbose = TRUE){
#' @title Check that attributes selected by user are available
61 changes: 61 additions & 0 deletions scripts/config/attr_gen_camels.R
@@ -0,0 +1,61 @@
#' @title Generate attributes for CAMELS basins
#' @description This script uses the proc.attr.hydfab package to acquire attributes
#' of interest.
#'


library(dplyr)
library(glue)
library(tidyr)
library(yaml)
library(proc.attr.hydfab)

main <- function(){
# Define base paths from the user's home directory
home_dir <- Sys.getenv("HOME")

############################ BEGIN CUSTOM MUNGING ############################

# ------------------------- Read in CAMELS gage ids ------------------------ #
path_gages_ii <- glue::glue("{home_dir}/noaa/camels/gagesII_wood/gages_list.txt")
dat_gages_ii <- read.csv(path_gages_ii)
gage_ids <- base::lapply(1:nrow(dat_gages_ii), function(i)
tail(strsplit(dat_gages_ii[i,],split = ' ',fixed = TRUE)[[1]],n=1)) |>
unlist() |>
lapply(function(x)
gsub(pattern=".gpkg",replacement = "",x = x)) |>
unlist() |>
lapply( function(x) gsub(pattern = "Gage_", replacement = "",x=x)) |>
unlist()

utils::write.table(gage_ids,glue::glue('{home_dir}/noaa/camels/gagesII_wood/camels_ii_gage_ids.txt'),row.names = FALSE,col.names = FALSE)

# --------------------- Read in usgs NHD attribute IDs --------------------- #
# Read desired usgs nhdplus attributes, stored in NOAA shared drive here:
# https://docs.google.com/spreadsheets/d/1h-630L2ChH5zlQIcWJHVaxY9YXtGowcCqakQEAXgRrY/edit?usp=sharing
attrs_nhd_df <- read.csv(glue::glue("{home_dir}/noaa/regionalization/processing/usgs_nhdplus_attrs.csv"))

attrs_nhd <- attrs_nhd_df$ID

Retr_Params <- list(paths = list(dir_db_attrs = glue::glue("{home_dir}/noaa/regionalization/data/input/attributes/"),
dir_std_base = glue::glue("{home_dir}/noaa/regionalization/data/input/user_data_std")),
vars = list(usgs_vars = attrs_nhd),
datasets = "camelsii_nhdp_grab_nov24",
xtra_hfab = list(hfab_retr=FALSE))


############################ END CUSTOM MUNGING ##############################

# ---------------------- Grab all needed attributes ---------------------- #
# Now acquire the attributes:
ls_comids <- proc.attr.hydfab::proc_attr_gageids(gage_ids=gage_ids,
featureSource='nwissite',
featureID='USGS-{gage_id}',
Retr_Params=Retr_Params,
overwrite=FALSE)

message(glue::glue("Completed attribute acquisition for {Retr_Params$paths$dir_db_attrs}"))
}


main()
54 changes: 54 additions & 0 deletions scripts/config/camels_attr_config.yaml
@@ -0,0 +1,54 @@
# Config for grabbing catchment attributes corresponding to standard-named locations
# Two options exist for defining locations that need attributes. At least one must be used. Both may be used.
# 1. Refer to a file/dataset {loc_id_filepath} with a column identifier {loc_id} representing a standardized location identifier.
# 2. Refer to a dataset processed by fs_proc python package and point to its location, {dir_std_base}/{datasets}, where {datasets} is a specific subdirectory name(s) or simply 'all'

col_schema: # required column mappings in the evaluation metrics dataset (if read in)
  - 'featureID': 'USGS-{gage_id}' # python f-string / R glue() format; converting the 'gage_id' to the standardized featureID used by nhdplusTools/hydrofabric. Must use '{gage_id}' e.g. 'USGS-{gage_id}'
  - 'featureSource': 'nwissite' # The standardized nhdplusTools featureSource. Possible featureSources might be 'nwissite', 'comid'.
loc_id_read: # This section is only required for locations NOT read in under a standardized dataset location (dir_std_base). May be used for additional prediction locations. Each item name MUST remain in the list with an empty assignment if no datasets are desired.
  - 'gage_id': 'gage_id' # expects tabular dataset with this column name representing the location id.
  - 'loc_id_filepath': '' # Required. Filepath. Allows reading a .csv or a dataset accessible via arrow::open_dataset() in lieu of reading a dataset generated by fs_proc.
  - 'featureID_loc' : 'USGS-{gage_id}' # python f-string / R glue() format; converting the 'loc_id' to the standardized featureID used by nhdplusTools/hydrofabric. Must use '{loc_id}' e.g. 'USGS-{loc_id}'.
  - 'featureSource_loc': 'nwissite' # The standardized nhdplusTools featureSource.
file_io: # May define {home_dir} for python's '{home_dir}/string_path'.format(home_dir =str(Path.home())) functionality
  - 'save_loc': 'local' # TODO: implement once s3 becomes a capability. Use 'local' for saving to a local path via dir_save. Future work will create an approach for 'aws' or other cloud saving methods
  - 'dir_base' : '{home_dir}/noaa/regionalization/data/input' # Required. The save location of standardized output
  - 'dir_std_base' : '{dir_base}/user_data_std' # Required. The location of standardized data generated by fs_proc python package
  - 'dir_db_hydfab' : '{dir_base}/hydrofabric' # Required. The local dir where hydrofabric data are stored (limits the total s3 connections)
  - 'dir_db_attrs' : '{dir_base}/attributes' # Required. The parent dir where each comid's attribute parquet file is stored in the subdirectory 'comid/', and each dataset's aggregated parquet attributes are stored in the subdirectory '{dataset_name}/'
formulation_metadata:
  - 'datasets': # Required. Must match directory name inside dir_std_base. May be a list of items, or simply sublist 'all' to select everything inside dir_std_base for attribute grabbing.
      - 'juliemai-xSSA' # Required. In this example case, it's a sublist of just one thing.
  - 'formulation_base': 'Raven_blended' # Informational. Unique name of formulation. Optional.
hydfab_config: # Required section describing hydrofabric connection details and objects of interest
  - 's3_base' : "s3://lynker-spatial/tabular-resources" # Required. s3 path containing hydrofabric-formatted attribute datasets
  - 's3_bucket' : 'lynker-spatial' # Required. s3 bucket containing hydrofabric data
  - 'ext' : 'gpkg' # Required. file extension of the hydrofabric data. Default 'gpkg'.
  - 'hf_cat_sel': "total" # Required. Options include 'total' or 'all'; total: interested in the single location's aggregated catchment data; all: all subcatchments of interest
attr_select: # Required. The names of variable sublistings are standardized, e.g. ha_vars, usgs_vars, sc_vars
  - 's3_path_hydatl' : '{s3_base}/hydroATLAS/hydroatlas_vars.parquet' # path to hydroatlas data formatted for hydrofabric. Required only if hydroatlas variables desired.
  - 'ha_vars': # hydroatlas variables. Must specify s3_path_hydatl if desired.
      - 'pet_mm_s01'
      - 'cly_pc_sav'
      - 'cly_pc_uav'
      - 'ari_ix_sav'
  - 'usgs_vars': # list of variables retrievable using nhdplusTools::get_characteristics_metadata().
      - 'TOT_TWI'
      - 'TOT_PRSNOW'
      - 'TOT_POPDENS90'
      - 'TOT_EWT'
      - 'TOT_RECHG'
      - 'TOT_PPT7100_ANN'
      - 'TOT_AET'
      - 'TOT_PET'
      - 'TOT_SILTAVE'
      - 'TOT_BASIN_AREA'
      - 'TOT_BASIN_SLOPE'
      - 'TOT_ELEV_MEAN'
      - 'TOT_ELEV_MAX'
      - 'TOT_Intensity'
      - 'TOT_Wet'
      - 'TOT_Dry'
  - 'sc_vars': # Streamcat variables of interest. #TODO add streamcat grabber capability to proc.attr.hydfab
      - # In this example case, no streamcat variables selected
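
A minimal sketch of consuming this config from R, assuming the file sits at the path this PR adds (scripts/config/camels_attr_config.yaml). Each section parses as a sequence of single-entry maps, and the featureID template expands per gage via glue():

library(yaml)
library(glue)

cfg <- yaml::read_yaml("scripts/config/camels_attr_config.yaml")

# Pull the 'usgs_vars' entry out of the attr_select sequence of single-entry maps
attr_sel  <- cfg$attr_select
usgs_idx  <- which(vapply(attr_sel, function(x) names(x)[1] == "usgs_vars", logical(1)))
usgs_vars <- attr_sel[[usgs_idx]][["usgs_vars"]]

# The featureID template converts a gage id to the nhdplusTools identifier
gage_id   <- "01013500"                   # illustrative gage id, not from this config
featureID <- glue::glue("USGS-{gage_id}") # "USGS-01013500"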