* Add alternate comid retrieval via sf geometry in case nwissite returns comid of NA
* fix: add gage_id inside each loc_attrs df; fix: set fill=TRUE for rbindlist
* fix: add usgs_vars sublist to Retr_Params
* feat: add a format checker on Retr_Params
* feat: add attribute variable name checker, incorporate check_attr_selection() into standard processing
Showing 4 changed files with 174 additions and 7 deletions.
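The first bullet above describes a fallback for when the NLDI lookup of an `nwissite` feature yields an NA comid. Below is a minimal sketch of that idea (the helper name is hypothetical; the actual logic lives in proc.attr.hydfab), assuming `nhdplusTools::get_nldi_feature()` returns the site as an sf point with a `comid` column:

```r
# Hypothetical sketch of the sf-geometry comid fallback described in this
# commit; the packaged implementation may differ.
library(glue)
library(nhdplusTools)
library(sf)

retrieve_comid <- function(gage_id) {
  site <- nhdplusTools::get_nldi_feature(
    list(featureSource = "nwissite", featureID = glue::glue("USGS-{gage_id}")))
  comid <- site$comid
  if (is.null(comid) || is.na(comid)) {
    # Fall back to a spatial query: discover the NHDPlus catchment
    # intersecting the site's point geometry
    comid <- nhdplusTools::discover_nhdplus_id(point = sf::st_geometry(site))
  }
  comid
}
```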
The package `DESCRIPTION` gets a version bump:

```diff
@@ -1,6 +1,6 @@
 Package: proc.attr.hydfab
 Title: Grab and process catchment attributes using the hydrofabric
-Version: 0.0.1.0013
+Version: 0.0.1.0014
 Authors@R:
     c(person("Guy", "Litt", , "[email protected]", role = c("aut", "cre"),
       comment = c(ORCID = "https://orcid.org/0000-0003-1996-7468")),
```
A new R script (61 lines added) drives the attribute grab for CAMELS basins:

```r
#' @title Generate attributes for CAMELS basins
#' @description This script uses the proc.attr.hydfab package to acquire attributes
#' of interest.
#'

library(dplyr)
library(glue)
library(tidyr)
library(yaml)
library(proc.attr.hydfab)

main <- function(){
  # Define args supplied to command line
  home_dir <- Sys.getenv("HOME")

  ############################ BEGIN CUSTOM MUNGING ############################

  # ------------------------ Read in CAMELS gage ids ------------------------- #
  path_gages_ii <- glue::glue("{home_dir}/noaa/camels/gagesII_wood/gages_list.txt")
  dat_gages_ii <- read.csv(path_gages_ii)
  gage_ids <- base::lapply(1:nrow(dat_gages_ii), function(i)
      tail(strsplit(dat_gages_ii[i,], split = ' ', fixed = TRUE)[[1]], n = 1)) |>
    unlist() |>
    lapply(function(x) gsub(pattern = ".gpkg", replacement = "", x = x)) |>
    unlist() |>
    lapply(function(x) gsub(pattern = "Gage_", replacement = "", x = x)) |>
    unlist()

  utils::write.table(gage_ids,
                     glue::glue('{home_dir}/noaa/camels/gagesII_wood/camels_ii_gage_ids.txt'),
                     row.names = FALSE, col.names = FALSE)

  # --------------------- Read in usgs NHD attribute IDs --------------------- #
  # Read desired usgs nhdplus attributes, stored in NOAA shared drive here:
  # https://docs.google.com/spreadsheets/d/1h-630L2ChH5zlQIcWJHVaxY9YXtGowcCqakQEAXgRrY/edit?usp=sharing
  attrs_nhd_df <- read.csv(glue::glue("{home_dir}/noaa/regionalization/processing/usgs_nhdplus_attrs.csv"))

  attrs_nhd <- attrs_nhd_df$ID

  Retr_Params <- list(paths = list(dir_db_attrs = glue::glue("{home_dir}/noaa/regionalization/data/input/attributes/"),
                                   dir_std_base = glue::glue("{home_dir}/noaa/regionalization/data/input/user_data_std")),
                      vars = list(usgs_vars = attrs_nhd),
                      datasets = "camelsii_nhdp_grab_nov24",
                      xtra_hfab = list(hfab_retr = FALSE))

  ############################ END CUSTOM MUNGING ##############################

  # ----------------------- Grab all needed attributes ----------------------- #
  # Now acquire the attributes:
  ls_comids <- proc.attr.hydfab::proc_attr_gageids(gage_ids = gage_ids,
                                                   featureSource = 'nwissite',
                                                   featureID = 'USGS-{gage_id}',
                                                   Retr_Params = Retr_Params,
                                                   overwrite = FALSE)

  message(glue::glue("Completed attribute acquisition for {Retr_Params$paths$dir_db_attrs}"))
}

main()
```
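The commit message also notes a new format checker on `Retr_Params`. A minimal sketch of that kind of structural validation, assuming only the sublist layout used in the script above and the standardized sublisting names documented in the config below (the packaged checker's actual interface is not shown in this diff):

```r
# Hypothetical sketch of a Retr_Params structure check; the checker
# actually added in this commit may enforce more or differ in shape.
check_retr_params_format <- function(Retr_Params) {
  stopifnot(is.list(Retr_Params))
  required <- c("paths", "vars", "datasets")
  absent <- setdiff(required, names(Retr_Params))
  if (length(absent) > 0) {
    stop("Retr_Params is missing required element(s): ",
         paste(absent, collapse = ", "))
  }
  # Variable sublistings use standardized names, e.g. usgs_vars, ha_vars, sc_vars
  known_vars <- c("usgs_vars", "ha_vars", "sc_vars")
  unknown <- setdiff(names(Retr_Params$vars), known_vars)
  if (length(unknown) > 0) {
    warning("Unrecognized variable sublisting(s) in Retr_Params$vars: ",
            paste(unknown, collapse = ", "))
  }
  invisible(TRUE)
}
```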
A new YAML config template (54 lines added) describes the attribute-grabbing setup:

```yaml
# Config for grabbing catchment attributes corresponding to standard-named locations
# Two options exist for defining locations that need attributes. At least one must be used. Both may be used.
# 1. Refer to a file/dataset {loc_id_filepath} with a column identifier {loc_id} representing a standardized location identifier.
# 2. Refer to a dataset processed by the fs_proc python package and point to its location, {dir_std_base}/{datasets}, where {datasets} is a specific subdirectory name(s) or simply 'all'

col_schema: # required column mappings in the evaluation metrics dataset (if read in)
  - 'featureID': 'USGS-{gage_id}' # python f-string / R glue() format; converting the 'gage_id' to the standardized featureID used by nhdplusTools/hydrofabric. Must use '{gage_id}', e.g. 'USGS-{gage_id}'
  - 'featureSource': 'nwissite' # The standardized nhdplusTools featureSource. Possible featureSources might be 'nwissite', 'comid'.
loc_id_read: # This section only required for locations NOT read in under a standardized dataset location (dir_std_base). May be used for additional prediction locations. MUST leave each item name inside the list with empty assignments if no datasets desired.
  - 'gage_id': 'gage_id' # expects a tabular dataset with this column name representing the location id.
  - 'loc_id_filepath': '' # Required. filepath. Allows reading a .csv or a dataset accessible using arrow::open_dataset() in lieu of reading a dataset generated by fs_proc.
  - 'featureID_loc': 'USGS-{gage_id}' # python f-string / R glue() format; converting the 'loc_id' to the standardized featureID used by nhdplusTools/hydrofabric. Must use '{loc_id}', e.g. 'USGS-{loc_id}'.
  - 'featureSource_loc': 'nwissite' # The standardized nhdplusTools featureSource.
file_io: # May define {home_dir} for python's '{home_dir}/string_path'.format(home_dir=str(Path.home())) functionality
  - 'save_loc': 'local' # TODO implement once s3 becomes a capability. Use 'local' for saving to a local path via dir_save. Future work will create an approach for 'aws' or other cloud saving methods
  - 'dir_base': '{home_dir}/noaa/regionalization/data/input' # Required. The save location of standardized output
  - 'dir_std_base': '{dir_base}/user_data_std' # Required. The location of standardized data generated by the fs_proc python package
  - 'dir_db_hydfab': '{dir_base}/hydrofabric' # Required. The local dir where hydrofabric data are stored (limits the total s3 connections)
  - 'dir_db_attrs': '{dir_base}/attributes' # Required. The parent dir where each comid's attribute parquet file is stored in the subdirectory 'comid/', and each dataset's aggregated parquet attributes are stored in the subdirectory '/{dataset_name}'
formulation_metadata:
  - 'datasets': # Required. Must match a directory name inside dir_std_base. May be a list of items, or simply the sublist 'all' to select everything inside dir_std_base for attribute grabbing.
      - 'juliemai-xSSA' # Required. In this example case, it's a sublist of just one thing.
  - 'formulation_base': 'Raven_blended' # Informational. Unique name of formulation. Optional.
hydfab_config: # Required section describing hydrofabric connection details and objects of interest
  - 's3_base': "s3://lynker-spatial/tabular-resources" # Required. s3 path containing hydrofabric-formatted attribute datasets
  - 's3_bucket': 'lynker-spatial' # Required. s3 bucket containing hydrofabric data
  - 'ext': 'gpkg' # Required. File extension of the hydrofabric data. Default 'gpkg'.
  - 'hf_cat_sel': "total" # Required. Options include 'total' or 'all'; total: interested in the single location's aggregated catchment data; all: all subcatchments of interest
attr_select: # Required. The names of variable sublistings are standardized, e.g. ha_vars, usgs_vars, sc_vars
  - 's3_path_hydatl': '{s3_base}/hydroATLAS/hydroatlas_vars.parquet' # path to hydroatlas data formatted for hydrofabric. Required only if hydroatlas variables desired.
  - 'ha_vars': # hydroatlas variables. Must specify s3_path_hydatl if desired.
      - 'pet_mm_s01'
      - 'cly_pc_sav'
      - 'cly_pc_uav'
      - 'ari_ix_sav'
  - 'usgs_vars': # list of variables retrievable using nhdplusTools::get_characteristics_metadata()
      - 'TOT_TWI'
      - 'TOT_PRSNOW'
      - 'TOT_POPDENS90'
      - 'TOT_EWT'
      - 'TOT_RECHG'
      - 'TOT_PPT7100_ANN'
      - 'TOT_AET'
      - 'TOT_PET'
      - 'TOT_SILTAVE'
      - 'TOT_BASIN_AREA'
      - 'TOT_BASIN_SLOPE'
      - 'TOT_ELEV_MEAN'
      - 'TOT_ELEV_MAX'
      - 'TOT_Intensity'
      - 'TOT_Wet'
      - 'TOT_Dry'
  - 'sc_vars': # Streamcat variables of interest. TODO: add streamcat grabber capability to proc.attr.hydfab
      - # In this example case, no streamcat variables selected
```
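The `usgs_vars` entries above are the kind of names the new `check_attr_selection()` validates before retrieval. A minimal sketch of that kind of check, assuming only that `nhdplusTools::get_characteristics_metadata()` returns a table with an `ID` column (the packaged function's signature is not shown in this diff):

```r
# Hypothetical sketch of an attribute-name check in the spirit of
# check_attr_selection(); the packaged function may behave differently.
library(nhdplusTools)

check_usgs_vars <- function(vars) {
  meta <- nhdplusTools::get_characteristics_metadata()
  bad <- setdiff(vars, meta$ID)
  if (length(bad) > 0) {
    warning("Attribute name(s) not found in NHDPlus metadata: ",
            paste(bad, collapse = ", "))
  }
  invisible(setdiff(vars, bad))  # keep only recognized names
}

check_usgs_vars(c("TOT_TWI", "TOT_PRSNOW", "TOT_NOT_A_VAR"))
```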