diff --git a/.Rbuildignore b/.Rbuildignore index af6362a..6cc9a29 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -12,3 +12,5 @@ ChangeLog ^cran-comments\.md$ ^CRAN-SUBMISSION$ +^README\.Rmd$ +^data-raw$ diff --git a/DESCRIPTION b/DESCRIPTION index 08c4337..78eed02 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,24 +1,30 @@ Package: wru +Title: Who are You? Bayesian Prediction of Racial Category Using Surname, + First Name, Middle Name, and Geolocation Version: 2.0.0 Date: 2023-07-12 -Title: Who are You? Bayesian Prediction of Racial Category Using Surname, First Name, Middle Name, and - Geolocation Authors@R: c( - person("Kabir", "Khanna", email = "kabirkhanna@gmail.com", role = c("aut")), - person("Brandon", "Bertelsen", email = "brandon@bertelsen.ca", role = c("aut","cre")), - person("Santiago", "Olivella", email = "olivella@unc.edu", role = c("aut")), - person("Evan", "Rosenman", email = "etrrosenman@gmail.com", role = c("aut")), - person("Kosuke", "Imai", email = "imai@harvard.edu", role = c("aut")) + person("Kabir", "Khanna", , "kabirkhanna@gmail.com", role = "aut"), + person("Brandon", "Bertelsen", , "brandon@bertelsen.ca", role = c("aut", "cre")), + person("Santiago", "Olivella", , "olivella@unc.edu", role = "aut"), + person("Evan", "Rosenman", , "etrrosenman@gmail.com", role = "aut"), + person("Kosuke", "Imai", , "imai@harvard.edu", role = "aut") ) -Description: Predicts individual race/ethnicity using surname, first name, middle name, geolocation, - and other attributes, such as gender and age. The method utilizes Bayes' - Rule (with optional measurement error correction) to compute the posterior probability of each racial category for any given - individual. 
The package implements methods described in Imai and Khanna (2016) - "Improving Ecological Inference by Predicting Individual Ethnicity from Voter - Registration Records" Political Analysis and Imai, Olivella, and Rosenman (2022) - "Addressing census data problems in race imputation via fully Bayesian Improved Surname Geocoding and name supplements" - . The package also incorporates the data described in Rosenman, Olivella, and Imai (2023) - "Race and ethnicity data for first, middle, and surnames" . +Description: Predicts individual race/ethnicity using surname, first name, + middle name, geolocation, and other attributes, such as gender and + age. The method utilizes Bayes' Rule (with optional measurement error + correction) to compute the posterior probability of each racial + category for any given individual. The package implements methods + described in Imai and Khanna (2016) "Improving Ecological Inference by + Predicting Individual Ethnicity from Voter Registration Records" + Political Analysis and Imai, Olivella, and + Rosenman (2022) "Addressing census data problems in race imputation + via fully Bayesian Improved Surname Geocoding and name supplements" + . The package also incorporates the data + described in Rosenman, Olivella, and Imai (2023) "Race and ethnicity + data for first, middle, and surnames" + . 
+License: GPL (>= 3) URL: https://github.com/kosukeimai/wru BugReports: https://github.com/kosukeimai/wru/issues Depends: @@ -28,20 +34,20 @@ Imports: dplyr, furrr, future, + piggyback (>= 0.1.4), + PL94171, purrr, - Rcpp, - piggyback (>= 0.1.4), - PL94171 + Rcpp Suggests: - testthat (>= 3.0.0), - covr + covr, + testthat (>= 3.0.0) LinkingTo: Rcpp, RcppArmadillo -LazyLoad: yes +Config/testthat/edition: 3 +Encoding: UTF-8 LazyData: yes LazyDataCompression: xz -License: GPL (>= 3) +LazyLoad: yes +Roxygen: list(markdown = TRUE) RoxygenNote: 7.2.3 -Encoding: UTF-8 -Config/testthat/edition: 3 diff --git a/R/census_data_preflight.R b/R/census_data_preflight.R index 067117e..804c066 100644 --- a/R/census_data_preflight.R +++ b/R/census_data_preflight.R @@ -1,8 +1,6 @@ #' Preflight census data #' -#' @param census.data See documentation in \code{race_predict}. -#' @param census.geo See documentation in \code{race_predict}. -#' @param year See documentation in \code{race_predict}. +#' @inheritParams predict_race #' @keywords internal census_data_preflight <- function(census.data, census.geo, year) { diff --git a/R/get_census_data.R b/R/get_census_data.R index dd86516..1c1ebc2 100644 --- a/R/get_census_data.R +++ b/R/get_census_data.R @@ -4,8 +4,13 @@ #' for specified state(s). Using this function to download Census data in advance #' can save considerable time when running \code{predict_race} and \code{census_helper}. #' -#' @param key A required character object containing a valid Census API key, -#' which can be requested \href{https://api.census.gov/data/key_signup.html}{here}. +#' @param key A character string containing a valid U.S. Census API key, +#' which can be requested from the +#' [U.S. Census API key signup page](https://api.census.gov/data/key_signup.html). +#' +#' If [`NULL`], the default, attempts to find a census key stored in an +#' [environment variable][Sys.getenv] named `CENSUS_API_KEY`. 
+#' @param states which states to extract Census data for, e.g., \code{c("NJ", "NY")}. #' @param age A \code{TRUE}/\code{FALSE} object indicating whether to condition on #' age or not. If \code{FALSE} (default), function will return Pr(Geolocation | Race). diff --git a/R/predict_race.R b/R/predict_race.R index 28b035e..e777d50 100644 --- a/R/predict_race.R +++ b/R/predict_race.R @@ -42,9 +42,14 @@ #' must have column named \code{place}. #' Specifying \code{\var{census.geo}} will call \code{census_helper} function #' to merge Census geographic data at specified level of geography. -#' @param census.key A character object specifying user's Census API -#' key. Required if \code{\var{census.geo}} is specified, because -#' a valid Census API key is required to download Census geographic data. +#' +#' @param census.key A character object specifying the user's Census API key. +#' Required if `census.geo` is specified, because a valid Census API key is +#' required to download Census geographic data. +#' +#' If [`NULL`], the default, attempts to find a census key stored in an +#' [environment variable][Sys.getenv] named `CENSUS_API_KEY`. +#' #' @param census.data A list indexed by two-letter state abbreviations, #' which contains pre-saved Census geographic data. #' Can be generated using \code{get_census_data} function. diff --git a/R/race_prediction_funs.R b/R/race_prediction_funs.R index c8ea4d7..c2a3fcd 100644 --- a/R/race_prediction_funs.R +++ b/R/race_prediction_funs.R @@ -1,7 +1,7 @@ #' Internal model fitting functions #' #' These functions are intended for internal use only. Users should use the -#' \code{race_predict} interface rather any of these functions directly. +#' [predict_race()] interface rather than any of these functions directly. #' #' These functions fit different versions of WRU. \code{.predict_race_old} fits #' the original WRU model, also known as BISG with census-based surname dictionary. 
@@ -13,26 +13,11 @@ #' the augmented surname dictionary, and the first and middle name #' dictionaries when making predictions. #' -#' @param voter.file See documentation in \code{race_predict}. -#' @param census.surname See documentation in \code{race_predict}. -#' @param surname.only See documentation in \code{race_predict}. -#' @param surname.year See documentation in \code{race_predict}. -#' @param census.geo See documentation in \code{race_predict}. -#' @param census.key See documentation in \code{race_predict}. -#' @param census.data See documentation in \code{race_predict}. -#' @param age See documentation in \code{race_predict}. -#' @param sex See documentation in \code{race_predict}. -#' @param year See documentation in \code{race_predict}. -#' @param party See documentation in \code{race_predict}. -#' @param retry See documentation in \code{race_predict}. -#' @param impute.missing See documentation in \code{race_predict}. -#' @param names.to.use See documentation in \code{race_predict}. -#' @param race.init See documentation in \code{race_predict}. -#' @param name.dictionaries See documentation in \code{race_predict}. -#' @param ctrl See \code{control} in documentation for \code{race_predict}. +#' @inheritParams predict_race +#' @param ctrl See `control` in documentation for [predict_race()]. #' @param use.counties A logical, defaulting to FALSE. Should census data be filtered by counties available in \var{census.data}? #' -#' @return See documentation in \code{race_predict}. +#' @inherit predict_race return #' #' @name modfuns NULL diff --git a/R/wru-internal.R b/R/wru-internal.R index 209921b..f2eabc0 100644 --- a/R/wru-internal.R +++ b/R/wru-internal.R @@ -1,7 +1,10 @@ -.onAttach <- -function(libname, pkgname) { - packageStartupMessage("\nPlease cite as: \n") - packageStartupMessage("Khanna K, Bertelsen B, Olivella S, Rosenman E, Imai K (2022). 
wru: Who are You?") - packageStartupMessage("Bayesian Prediction of Racial Category Using Surname, First Name, Middle Name, and Geolocation.") - packageStartupMessage("URL: https://CRAN.R-project.org/package=wru \n") +.onAttach <- function(libname, pkgname) { + packageStartupMessage( + "\n", + "Please cite as:", "\n\n", + format(citation("wru"), style = "text"), "\n\n", + "Note that wru 2.0.0 uses 2020 census data by default.", "\n", + 'Use the argument `year = "2010"`, to replicate analyses produced with earlier package versions.', + "\n" + ) } diff --git a/README.Rmd b/README.Rmd new file mode 100644 index 0000000..b900a04 --- /dev/null +++ b/README.Rmd @@ -0,0 +1,221 @@ +--- +output: github_document +--- + + + +```{r, include = FALSE} +library(wru) +options(width = 10000) +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>", + fig.path = "man/figures/README-", + out.width = "100%", + message = FALSE +) + +voters <- dplyr::select(wru::voters, -precinct, -first, -last) +``` + +# wru: Who Are You? Bayesian Prediction of Racial Category Using Surname and Geolocation Package logo + +[![R-CMD-check](https://github.com/kosukeimai/wru/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/kosukeimai/wru/actions/workflows/R-CMD-check.yaml) [![CRAN_Status_Badge](https://www.r-pkg.org/badges/version-last-release/wru)](https://cran.r-project.org/package=wru) ![CRAN downloads](http://cranlogs.r-pkg.org/badges/grand-total/wru) + +This R package implements the methods proposed in Imai, K. and Khanna, K. (2016). "[Improving Ecological Inference by Predicting Individual Ethnicity from Voter Registration Record](http://imai.princeton.edu/research/race.html)." Political Analysis, Vol. 24, No. 2 (Spring), pp. 263-272. [doi: 10.1093/pan/mpw001](https://dx.doi.org/10.1093/pan/mpw001). 
+ +## Installation + +You can install the released version of **wru** from [CRAN](https://cran.r-project.org/package=wru) with: + +``` r +install.packages("wru") +``` + +Or you can install the development version of **wru** from [GitHub](https://github.com/kosukeimai/wru) with: + +``` r +# install.packages("pak") +pak::pkg_install("kosukeimai/wru") +``` + +## Using wru + +Here is a simple example that predicts the race/ethnicity of voters based only on their surnames. + +``` r +library(wru) +future::plan(future::multisession) +predict_race(voter.file = voters, surname.only = TRUE) +``` + +The above produces the following output, where the last five columns are probabilistic race/ethnicity predictions (e.g., `pred.his` is the probability of being Hispanic/Latino): + +``` + VoterID surname state CD county tract block age sex party PID place pred.whi pred.bla pred.his pred.asi pred.oth + 1 Khanna NJ 12 021 004000 3001 29 0 Ind 0 74000 0.045110474 0.003067623 0.0068522723 0.860411906 0.084557725 + 2 Imai NJ 12 021 004501 1025 40 0 Dem 1 60900 0.052645440 0.001334812 0.0558160072 0.719376581 0.170827160 + 3 Rivera NY 12 061 004800 6001 33 0 Rep 2 51000 0.043285692 0.008204605 0.9136195794 0.024316883 0.010573240 + 4 Fifield NJ 12 021 004501 1025 27 0 Dem 1 60900 0.895405704 0.001911388 0.0337464844 0.011079323 0.057857101 + 5 Zhou NJ 12 021 004501 1025 28 1 Dem 1 60900 0.006572555 0.001298962 0.0005388581 0.982365594 0.009224032 + 6 Ratkovic NJ 12 021 004000 1025 35 0 Ind 0 60900 0.861236727 0.008212824 0.0095395642 0.011334635 0.109676251 + 7 Johnson NY 9 061 014900 4000 25 0 Dem 1 51000 0.543815322 0.344128607 0.0272403940 0.007405765 0.077409913 + 8 Lopez NJ 12 021 004501 1025 33 0 Rep 2 60900 0.038939877 0.004920643 0.9318797791 0.012154125 0.012105576 + 9 Wantchekon NJ 12 021 004501 1025 50 0 Rep 2 60900 0.330697188 0.194700665 0.4042849478 0.021379541 0.048937658 + 10 Morse DC 0 001 001301 3005 29 1 Rep 2 50000 0.866360147 0.044429853 0.0246568086 0.010219712 
0.054333479 +``` + +### Using geolocation + +In order to predict race/ethnicity based on surnames *and* geolocation, a user needs to provide a valid U.S. Census API key to access the census statistics. +You can request a U.S. Census API key from [the U.S. Census API key signup page](http://api.census.gov/data/key_signup.html). +Once you have an API key, you can use the package to download relevant Census geographic data on demand and condition race/ethnicity predictions on geolocation (county, tract, block, or place). + +First, you should save your census key to your `.Rprofile` or `.Renviron`. Below is an example procedure: + +``` +usethis::edit_r_environ() +# Edit the file with the following: +CENSUS_API_KEY=YourKey +# Save and close the file +# Restart your R session +``` + +The following example predicts the race/ethnicity of voters based on their surnames, census tract of residence (`census.geo = "tract"`), and party registration (`party = "PID"`). +Note that a valid API key must be stored in a `CENSUS_API_KEY` environment variable or provided with the `census.key` argument in order for the function to download the relevant tract-level data. 
+ +``` r +library(wru) +predict_race(voter.file = voters, census.geo = "tract", party = "PID") +``` +``` + VoterID surname state CD county tract block age sex party PID place pred.whi pred.bla pred.his pred.asi pred.oth + 1 Khanna NJ 12 021 004000 3001 29 0 Ind 0 74000 0.021711601 0.0009552652 2.826779e-03 0.93364592 0.040860431 + 2 Imai NJ 12 021 004501 1025 40 0 Dem 1 60900 0.015364583 0.0002320815 9.020240e-03 0.90245186 0.072931231 + 3 Rivera NY 12 061 004800 6001 33 0 Rep 2 51000 0.092415538 0.0047099965 7.860806e-01 0.09924761 0.017546300 + 4 Fifield NJ 12 021 004501 1025 27 0 Dem 1 60900 0.854810748 0.0010870744 1.783931e-02 0.04546436 0.080798514 + 5 Zhou NJ 12 021 004501 1025 28 1 Dem 1 60900 0.001548762 0.0001823506 7.031116e-05 0.99501901 0.003179566 + 6 Ratkovic NJ 12 021 004000 1025 35 0 Ind 0 60900 0.852374629 0.0052590592 8.092435e-03 0.02529163 0.108982246 + 7 Johnson NY 9 061 014900 4000 25 0 Dem 1 51000 0.831282563 0.0613242553 1.059715e-02 0.01602557 0.080770461 + 8 Lopez NJ 12 021 004501 1025 33 0 Rep 2 60900 0.062022518 0.0046691402 8.218906e-01 0.08321206 0.028205698 + 9 Wantchekon NJ 12 021 004501 1025 50 0 Rep 2 60900 0.396500218 0.1390722877 2.684107e-01 0.11018413 0.085832686 + 10 Morse DC 0 001 001301 3005 29 1 Rep 2 50000 0.861168219 0.0498449102 1.131154e-02 0.01633532 0.061340015 +``` + +In `predict_race()`, the `census.geo` options are "county", "tract", "block" and "place". 
+Here is an example of prediction based on census statistics collected at the level of "place": + +``` r +predict_race(voter.file = voters, census.geo = "place", party = "PID") +``` +``` + VoterID surname state CD county tract block age sex party PID place pred.whi pred.bla pred.his pred.asi pred.oth + 1 Khanna NJ 12 021 004000 3001 29 0 Ind 0 74000 0.042146148 0.0620484276 9.502254e-02 0.55109761 0.249685278 + 2 Imai NJ 12 021 004501 1025 40 0 Dem 1 60900 0.018140322 0.0002204255 1.026018e-02 0.90710894 0.064270133 + 3 Rivera NY 12 061 004800 6001 33 0 Rep 2 51000 0.015528660 0.0092292671 9.266893e-01 0.04182290 0.006729825 + 4 Fifield NJ 12 021 004501 1025 27 0 Dem 1 60900 0.879537890 0.0008997896 1.768379e-02 0.03982601 0.062052518 + 5 Zhou NJ 12 021 004501 1025 28 1 Dem 1 60900 0.001819394 0.0001723242 7.957542e-05 0.99514078 0.002787926 + 6 Ratkovic NJ 12 021 004000 1025 35 0 Ind 0 60900 0.834942701 0.0038157857 4.933723e-03 0.04021245 0.116095337 + 7 Johnson NY 9 061 014900 4000 25 0 Dem 1 51000 0.290386744 0.5761904554 4.112613e-02 0.01895885 0.073337820 + 8 Lopez NJ 12 021 004501 1025 33 0 Rep 2 60900 0.065321588 0.0039558641 8.339387e-01 0.07461133 0.022172551 + 9 Wantchekon NJ 12 021 004501 1025 50 0 Rep 2 60900 0.428723819 0.1209683869 2.796062e-01 0.10142953 0.069272098 + 10 Morse DC 0 001 001301 3005 29 1 Rep 2 50000 0.716211008 0.1899554127 1.867133e-02 0.01025241 0.064909839 +``` + +### Downloading census data + +It is also possible to pre-download Census geographic data, which can save time when running `predict_race()`. +The example dataset `voters` includes people in DC, NJ, and NY. +The following example subsets voters in DC and NJ, and then uses `get_census_data()` to download census geographic data in these two states (a valid API key must be stored in a `CENSUS_API_KEY` environment variable or provided with the `key` argument). +Census data is assigned to an object named `census.dc.nj`. 
+The `predict_race()` statement predicts the race/ethnicity of voters in DC and NJ using the pre-downloaded census data (`census.data = census.dc.nj`). This example conditions race/ethnicity predictions on voters' surnames, block of residence (`census.geo = "block"`), age (`age = TRUE`), and party registration (`party = "PID"`). + +Please note that the input parameters `age` and `sex` must have the same values in `get_census_data()` and `predict_race()`, i.e., `TRUE` in both or `FALSE` in both. +In this case, predictions are conditioned on age but not sex, so `age = TRUE` and `sex = FALSE` in both the `get_census_data()` and `predict_race()` statements. + +``` r +library(wru) +voters.dc.nj <- voters[voters$state %in% c("DC", "NJ"), ] +census.dc.nj <- get_census_data(state = c("DC", "NJ"), age = TRUE, sex = FALSE) +predict_race(voter.file = voters.dc.nj, census.geo = "block", census.data = census.dc.nj, age = TRUE, sex = FALSE, party = "PID") +``` + +This produces the same result as the following statement, which downloads census data during evaluation rather than using pre-downloaded data: + +``` r +predict_race(voter.file = voters.dc.nj, census.geo = "block", age = TRUE, sex = FALSE, party = "PID") +``` + +Using pre-downloaded Census data may be useful for the following reasons: + +- You can save a lot of time in future runs of `predict_race()` if the relevant census data has already been saved; +- The machines used to run `predict_race()` may not have internet access; +- You can obtain timely snapshots of census geographic data that match your voter file. + +Downloading data using `get_census_data()` may take a long time, especially in large states or when using small geographic levels. +If block-level census data is not required, downloading census data at the tract level will save time. +Similarly, if tract-level data is not required, county-level data may be specified in order to save time. 
+ +``` r +library(wru) +voters.dc.nj <- voters[voters$state %in% c("DC", "NJ"), ] +census.dc.nj2 <- get_census_data(state = c("DC", "NJ"), age = TRUE, sex = FALSE, census.geo = "tract") +predict_race(voter.file = voters.dc.nj, census.geo = "tract", census.data = census.dc.nj2, party = "PID", age = TRUE, sex = FALSE) +predict_race(voter.file = voters.dc.nj, census.geo = "county", census.data = census.dc.nj2, age = TRUE, sex = FALSE) # Pr(Race | Surname, County) +predict_race(voter.file = voters.dc.nj, census.geo = "tract", census.data = census.dc.nj2, age = TRUE, sex = FALSE) # Pr(Race | Surname, Tract) +predict_race(voter.file = voters.dc.nj, census.geo = "county", census.data = census.dc.nj2, party = "PID", age = TRUE, sex = FALSE) # Pr(Race | Surname, County, Party) +predict_race(voter.file = voters.dc.nj, census.geo = "tract", census.data = census.dc.nj2, party = "PID", age = TRUE, sex = FALSE) # Pr(Race | Surname, Tract, Party) +``` + +#### Interact directly with the Census API + +You can use `census_geo_api()` to manually construct a census object. +The example below creates a census object with county-level and tract-level data in DC and NJ, while avoiding downloading block-level data. +Note that the `state` argument requires a vector of two-letter state abbreviations. 
+ +``` r +census.dc.nj3 = list() + +county.dc <- census_geo_api(state = "DC", geo = "county", age = TRUE, sex = FALSE) +tract.dc <- census_geo_api(state = "DC", geo = "tract", age = TRUE, sex = FALSE) +census.dc.nj3[["DC"]] <- list(state = "DC", county = county.dc, tract = tract.dc, age = TRUE, sex = FALSE) + +tract.nj <- census_geo_api(state = "NJ", geo = "tract", age = TRUE, sex = FALSE) +county.nj <- census_geo_api(state = "NJ", geo = "county", age = TRUE, sex = FALSE) +census.dc.nj3[["NJ"]] <- list(state = "NJ", county = county.nj, tract = tract.nj, age = TRUE, sex = FALSE) +``` + +Note: The `age` and `sex` parameters must be consistent when creating the Census object and using that Census object in `predict_race()`. If one of these parameters is `TRUE` in the Census object, it must also be `TRUE` in `predict_race()`. + +After saving the data in `census.dc.nj3` above, we can condition race/ethnicity predictions on different combinations of input variables, without having to re-download the relevant Census data. + +``` r +predict_race(voter.file = voters.dc.nj, census.geo = "county", census.data = census.dc.nj3, age = TRUE, sex = FALSE) # Pr(Race | Surname, County) +predict_race(voter.file = voters.dc.nj, census.geo = "tract", census.data = census.dc.nj3, age = TRUE, sex = FALSE) # Pr(Race | Surname, Tract) +predict_race(voter.file = voters.dc.nj, census.geo = "county", census.data = census.dc.nj3, party = "PID", age = TRUE, sex = FALSE) # Pr(Race | Surname, County, Party) +predict_race(voter.file = voters.dc.nj, census.geo = "tract", census.data = census.dc.nj3, party = "PID", age = TRUE, sex = FALSE) # Pr(Race | Surname, Tract, Party) +``` + +### Parallelization + +For larger scale imputations, garbage collection can become a problem and your machine(s) can quickly run out of memory (RAM). +We recommend using the `future.callr::callr` plan instead of `future::multisession`. 
+The `callr` plan instantiates a new session at every iteration of your parallel loop or map. +Although this has the negative effect of creating more overhead, it also clears sticky memory elements that can grow to eventual system failure when using `multisession`. +You end up with a process that is more stable, but slightly slower. + +``` r +library(wru) +future::plan(future.callr::callr) +# ... +``` + +## Census Data + +This package uses the Census Bureau Data API but is not endorsed or certified by the Census Bureau. + +U.S. Census Bureau (2021, October 8). Decennial Census API. Census.gov. Retrieved from + +## A related song + +[![Thumbnail of the music video for "Who Are You" by The Who](https://img.youtube.com/vi/PNbBDrceCy8/maxresdefault.jpg)](https://www.youtube.com/watch?v=PNbBDrceCy8) diff --git a/README.md b/README.md index bd076f2..45ddaa3 100644 --- a/README.md +++ b/README.md @@ -1,103 +1,183 @@ -# wru: Who Are You? Bayesian Prediction of Racial Category Using Surname and Geolocation [![R-CMD-check](https://github.com/kosukeimai/wru/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/kosukeimai/wru/actions/workflows/R-CMD-check.yaml) [![CRAN_Status_Badge](https://www.r-pkg.org/badges/version-last-release/wru)](https://cran.r-project.org/package=wru) ![CRAN downloads](http://cranlogs.r-pkg.org/badges/grand-total/wru) -This R package implements the methods proposed in Imai, K. and Khanna, K. (2016). "[Improving Ecological Inference by Predicting Individual Ethnicity from Voter Registration Record.](http://imai.princeton.edu/research/race.html)" Political Analysis, Vol. 24, No. 2 (Spring), pp. 263-272. doi: 10.1093/pan/mpw001. + -### Using wru +# wru: Who Are You? Bayesian Prediction of Racial Category Using Surname and Geolocation Package logo -First, you should save your census key to your `.Rprofile` or `.Renviron`. 
Below is an example procedure: +[![R-CMD-check](https://github.com/kosukeimai/wru/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/kosukeimai/wru/actions/workflows/R-CMD-check.yaml) +[![CRAN_Status_Badge](https://www.r-pkg.org/badges/version-last-release/wru)](https://cran.r-project.org/package=wru) +![CRAN downloads](http://cranlogs.r-pkg.org/badges/grand-total/wru) - > usethis::edit_r_profile() - # edit the file with the following - Sys.setenv("CENSUS_API_KEY" = "Your Key") - # save and close the file - # Restart your R session +This R package implements the methods proposed in Imai, K. and Khanna, +K. (2016). “[Improving Ecological Inference by Predicting Individual +Ethnicity from Voter Registration +Record](http://imai.princeton.edu/research/race.html).” Political +Analysis, Vol. 24, No. 2 (Spring), pp. 263-272. [doi: +10.1093/pan/mpw001](https://dx.doi.org/10.1093/pan/mpw001). + +## Installation -Now, here is a simple example that predicts the race/ethnicity of voters based only on their surnames. +You can install the released version of **wru** from +[CRAN](https://cran.r-project.org/package=wru) with: ``` r -library(wru) -future::plan(future::multisession) -data(voters) -predict_race(voter.file = voters, surname.only = T) +install.packages("wru") ``` -The above produces the following output, where the last five columns are probabilistic race/ethnicity predictions (e.g., 'pred.his' is the probability of being Hispanic/Latino): +Or you can install the development version of **wru** from +[GitHub](https://github.com/kosukeimai/wru) with: - "Proceeding with surname-only predictions ..." 
- VoterID surname state CD county tract block precinct age sex party PID place pred.whi pred.bla pred.his pred.asi pred.oth - 1 Khanna NJ 12 021 004000 3001 6 29 0 Ind 0 74000 0.0676000 0.00430000 0.00820000 0.86680000 0.05310000 - 2 Imai NJ 12 021 004501 1025 40 0 Dem 1 60900 0.0812000 0.00240000 0.06890000 0.73750000 0.11000000 - 3 Velasco NY 12 061 004800 6001 33 0 Rep 2 51000 0.0594000 0.00260000 0.82270000 0.10510000 0.01020000 - 4 Fifield NJ 12 021 004501 1025 27 0 Dem 1 60900 0.9355936 0.00220022 0.02850285 0.00780078 0.02590259 - 5 Zhou NJ 12 021 004501 1025 28 1 Dem 1 60900 0.0098000 0.00180000 0.00065000 0.98200000 0.00575000 - 6 Ratkovic NJ 12 021 004000 1025 35 0 Ind 0 60900 0.9187000 0.01083333 0.01083333 0.01083333 0.04880000 - 7 Johnson NY 9 061 015100 4000 25 0 Dem 1 51000 0.5897000 0.34630000 0.02360000 0.00540000 0.03500000 - 8 Lopez NJ 12 021 004501 1025 33 0 Rep 2 60900 0.0486000 0.00570000 0.92920000 0.01020000 0.00630000 - 9 Wantchekon NJ 12 021 004501 1025 50 0 Rep 2 60900 0.6665000 0.08530000 0.13670000 0.07970000 0.03180000 - 10 Morse DC 0 001 001301 3005 29 1 Rep 2 50000 0.9054000 0.04310000 0.02060000 0.00720000 0.02370000 +``` r +# install.packages("pak") +pak::pkg_install("kosukeimai/wru") +``` -In order to predict race/ethnicity based on surnames AND geolocation, a user needs to provide a valid U.S. Census API key to access the census statistics. You may request a U.S. Census API key [here](http://api.census.gov/data/key_signup.html). Once you have an API key, you can use the package to download relevant Census geographic data on demand and condition race/ethnicity predictions on geolocation (county, tract, block, or place). +## Using wru -The following example predicts the race/ethnicity of voters based on their surnames, Census tract of residence (census.geo = "tract"), and which party registration (party = "PID"). 
Note that a valid API key must be provided in the input parameter 'census.key' in order for the function to download the relevant tract-level data. +Here is a simple example that predicts the race/ethnicity of voters +based only on their surnames. ``` r library(wru) -data(voters) -predict_race(voter.file = voters, census.geo = "tract", census.key = "...", party = "PID") +future::plan(future::multisession) +predict_race(voter.file = voters, surname.only = TRUE) ``` -The above returns the following output. - - VoterID surname state CD county tract block precinct age sex party PID place pred.whi pred.bla pred.his pred.asi pred.oth - 1 Khanna NJ 12 021 004000 3001 6 29 0 Ind 0 74000 0.081856291 0.0021396565 0.0110451405 0.828313291 0.076645621 - 6 Ratkovic NJ 12 021 004000 1025 35 0 Ind 0 60900 0.916936771 0.0044432219 0.0120276229 0.008532929 0.058059455 - 4 Fifield NJ 12 021 004501 1025 27 0 Dem 1 60900 0.895620643 0.0022078678 0.0139457411 0.023345853 0.064879895 - 5 Zhou NJ 12 021 004501 1025 28 1 Dem 1 60900 0.003164229 0.0006092345 0.0001072684 0.991261466 0.004857802 - 2 Imai NJ 12 021 004501 1025 40 0 Dem 1 60900 0.029936354 0.0009275220 0.0129831039 0.850040743 0.106112277 - 8 Lopez NJ 12 021 004501 1025 33 0 Rep 2 60900 0.231046860 0.0016485574 0.6813780115 0.053180270 0.032746301 - 9 Wantchekon NJ 12 021 004501 1025 50 0 Rep 2 60900 0.817841573 0.0063677130 0.0258733496 0.107254103 0.042663261 - 3 Velasco NY 12 061 004800 6001 33 0 Rep 2 51000 0.223924118 0.0002913000 0.4451163607 0.313431417 0.017236805 - 7 Johnson NY 9 061 015100 4000 25 0 Dem 1 51000 0.241417483 0.6900686166 0.0293556870 0.011105140 0.028053073 - 10 Morse DC 0 001 001301 3005 29 1 Rep 2 50000 0.983300770 0.0006116706 0.0034070782 0.004823439 0.007857042 +The above produces the following output, where the last five columns are +probabilistic race/ethnicity predictions (e.g., `pred.his` is the +probability of being Hispanic/Latino): + + VoterID surname state CD county tract block age sex 
party PID place pred.whi pred.bla pred.his pred.asi pred.oth + 1 Khanna NJ 12 021 004000 3001 29 0 Ind 0 74000 0.045110474 0.003067623 0.0068522723 0.860411906 0.084557725 + 2 Imai NJ 12 021 004501 1025 40 0 Dem 1 60900 0.052645440 0.001334812 0.0558160072 0.719376581 0.170827160 + 3 Rivera NY 12 061 004800 6001 33 0 Rep 2 51000 0.043285692 0.008204605 0.9136195794 0.024316883 0.010573240 + 4 Fifield NJ 12 021 004501 1025 27 0 Dem 1 60900 0.895405704 0.001911388 0.0337464844 0.011079323 0.057857101 + 5 Zhou NJ 12 021 004501 1025 28 1 Dem 1 60900 0.006572555 0.001298962 0.0005388581 0.982365594 0.009224032 + 6 Ratkovic NJ 12 021 004000 1025 35 0 Ind 0 60900 0.861236727 0.008212824 0.0095395642 0.011334635 0.109676251 + 7 Johnson NY 9 061 014900 4000 25 0 Dem 1 51000 0.543815322 0.344128607 0.0272403940 0.007405765 0.077409913 + 8 Lopez NJ 12 021 004501 1025 33 0 Rep 2 60900 0.038939877 0.004920643 0.9318797791 0.012154125 0.012105576 + 9 Wantchekon NJ 12 021 004501 1025 50 0 Rep 2 60900 0.330697188 0.194700665 0.4042849478 0.021379541 0.048937658 + 10 Morse DC 0 001 001301 3005 29 1 Rep 2 50000 0.866360147 0.044429853 0.0246568086 0.010219712 0.054333479 + +### Using geolocation + +In order to predict race/ethnicity based on surnames *and* geolocation, +a user needs to provide a valid U.S. Census API key to access the census +statistics. You can request a U.S. Census API key from [the U.S. Census +API key signup page](http://api.census.gov/data/key_signup.html). Once +you have an API key, you can use the package to download relevant Census +geographic data on demand and condition race/ethnicity predictions on +geolocation (county, tract, block, or place). + +First, you should save your census key to your `.Rprofile` or +`.Renviron`. 
Below is an example procedure: + + usethis::edit_r_environ() + # Edit the file with the following: + CENSUS_API_KEY=YourKey + # Save and close the file + # Restart your R session -In predict_race, the census.geo options are "county", "tract", "block" and "place". Here is an example of prediction based on census statistics collected at the level of "place": +The following example predicts the race/ethnicity of voters based on +their surnames, census tract of residence (`census.geo = "tract"`), and +party registration (`party = "PID"`). Note that a valid API key must be +stored in a `CENSUS_API_KEY` environment variable or provided with the +`census.key` argument in order for the function to download the relevant +tract-level data. ``` r -data(voters) -predict_race(voter.file = voters, census.geo = "place", census.key = "...", party = "PID") +library(wru) +predict_race(voter.file = voters, census.geo = "tract", party = "PID") ``` -It is also possible to pre-download Census geographic data, which can save time when running predict_race(). The example dataset 'voters' includes people in DC, NJ, and NY. The following example subsets voters in DC and NJ, and then uses get_census_data() to download Census geographic data in these two states (input parameter 'key' requires valid API key). Census data is assigned to an object named census.dc.nj. The predict_race() statement predicts the race/ethnicity of voters in DC and NJ using the pre-saved Census data (census.data = census.dc.nj). This example conditions race/ethnicity predictions on voters' surnames, block of residence (census.geo = "block"), age (age = TRUE), and party registration (party = "PID"). 
+ VoterID surname state CD county tract block age sex party PID place pred.whi pred.bla pred.his pred.asi pred.oth + 1 Khanna NJ 12 021 004000 3001 29 0 Ind 0 74000 0.021711601 0.0009552652 2.826779e-03 0.93364592 0.040860431 + 2 Imai NJ 12 021 004501 1025 40 0 Dem 1 60900 0.015364583 0.0002320815 9.020240e-03 0.90245186 0.072931231 + 3 Rivera NY 12 061 004800 6001 33 0 Rep 2 51000 0.092415538 0.0047099965 7.860806e-01 0.09924761 0.017546300 + 4 Fifield NJ 12 021 004501 1025 27 0 Dem 1 60900 0.854810748 0.0010870744 1.783931e-02 0.04546436 0.080798514 + 5 Zhou NJ 12 021 004501 1025 28 1 Dem 1 60900 0.001548762 0.0001823506 7.031116e-05 0.99501901 0.003179566 + 6 Ratkovic NJ 12 021 004000 1025 35 0 Ind 0 60900 0.852374629 0.0052590592 8.092435e-03 0.02529163 0.108982246 + 7 Johnson NY 9 061 014900 4000 25 0 Dem 1 51000 0.831282563 0.0613242553 1.059715e-02 0.01602557 0.080770461 + 8 Lopez NJ 12 021 004501 1025 33 0 Rep 2 60900 0.062022518 0.0046691402 8.218906e-01 0.08321206 0.028205698 + 9 Wantchekon NJ 12 021 004501 1025 50 0 Rep 2 60900 0.396500218 0.1390722877 2.684107e-01 0.11018413 0.085832686 + 10 Morse DC 0 001 001301 3005 29 1 Rep 2 50000 0.861168219 0.0498449102 1.131154e-02 0.01633532 0.061340015 + +In `predict_race()`, the `census.geo` options are “county”, “tract”, +“block” and “place”. Here is an example of prediction based on census +statistics collected at the level of “place”: + +``` r +predict_race(voter.file = voters, census.geo = "place", party = "PID") +``` -Please note that the input parameters 'age' and 'sex' must have the same values in get_census_data() and predict_race(), i.e., TRUE in both or FALSE in both. In this case, predictions are conditioned on age but not sex, so age = TRUE and sex = FALSE in both the get_census_data() and predict_race() statements. 
+ VoterID surname state CD county tract block age sex party PID place pred.whi pred.bla pred.his pred.asi pred.oth + 1 Khanna NJ 12 021 004000 3001 29 0 Ind 0 74000 0.042146148 0.0620484276 9.502254e-02 0.55109761 0.249685278 + 2 Imai NJ 12 021 004501 1025 40 0 Dem 1 60900 0.018140322 0.0002204255 1.026018e-02 0.90710894 0.064270133 + 3 Rivera NY 12 061 004800 6001 33 0 Rep 2 51000 0.015528660 0.0092292671 9.266893e-01 0.04182290 0.006729825 + 4 Fifield NJ 12 021 004501 1025 27 0 Dem 1 60900 0.879537890 0.0008997896 1.768379e-02 0.03982601 0.062052518 + 5 Zhou NJ 12 021 004501 1025 28 1 Dem 1 60900 0.001819394 0.0001723242 7.957542e-05 0.99514078 0.002787926 + 6 Ratkovic NJ 12 021 004000 1025 35 0 Ind 0 60900 0.834942701 0.0038157857 4.933723e-03 0.04021245 0.116095337 + 7 Johnson NY 9 061 014900 4000 25 0 Dem 1 51000 0.290386744 0.5761904554 4.112613e-02 0.01895885 0.073337820 + 8 Lopez NJ 12 021 004501 1025 33 0 Rep 2 60900 0.065321588 0.0039558641 8.339387e-01 0.07461133 0.022172551 + 9 Wantchekon NJ 12 021 004501 1025 50 0 Rep 2 60900 0.428723819 0.1209683869 2.796062e-01 0.10142953 0.069272098 + 10 Morse DC 0 001 001301 3005 29 1 Rep 2 50000 0.716211008 0.1899554127 1.867133e-02 0.01025241 0.064909839 + +### Downloading census data + +It is also possible to pre-download Census geographic data, which can +save time when running `predict_race()`. The example dataset `voters` +includes people in DC, NJ, and NY. The following example subsets voters +in DC and NJ, and then uses `get_census_data()` to download census +geographic data in these two states (a valid API key must be stored in a +`CENSUS_API_KEY` environment variable or provided with the `key` +argument). Census data is assigned to an object named `census.dc.nj`. +The `predict_race()` statement predicts the race/ethnicity of voters in +DC and NJ using the pre-downloaded census data +(`census.data = census.dc.nj`). 
This example conditions race/ethnicity +predictions on voters’ surnames, block of residence +(`census.geo = "block"`), age (`age = TRUE`), and party registration +(`party = "PID"`). + +Please note that the input parameters `age` and `sex` must have the same +values in `get_census_data()` and `predict_race()`, i.e., `TRUE` in both +or `FALSE` in both. In this case, predictions are conditioned on age but +not sex, so `age = TRUE` and `sex = FALSE` in both the +`get_census_data()` and `predict_race()` statements. ``` r library(wru) -data(voters) -voters.dc.nj <- voters[c(-3, -7), ] # remove two NY cases from dataset -census.dc.nj <- get_census_data(key = "...", state = c("DC", "NJ"), age = TRUE, sex = FALSE) # create Census data object covering DC and NJ +voters.dc.nj <- voters[voters$state %in% c("DC", "NJ"), ] +census.dc.nj <- get_census_data(state = c("DC", "NJ"), age = TRUE, sex = FALSE) predict_race(voter.file = voters.dc.nj, census.geo = "block", census.data = census.dc.nj, age = TRUE, sex = FALSE, party = "PID") ``` -The last two lines above are equivalent to the following: +This produces the same result as the following statement, which +downloads census data during evaluation rather than using pre-downloaded +data: ``` r -predict_race(voter.file = voters.dc.nj, census.geo = "block", census.key = "...", age = TRUE, sex = FALSE, party = "PID") +predict_race(voter.file = voters.dc.nj, census.geo = "block", age = TRUE, sex = FALSE, party = "PID") ``` -Using pre-downloaded Census data may be useful for the following reasons: +Using pre-downloaded Census data may be useful for the following +reasons: -- You can save a lot of time in future runs of predict_race() if the relevant Census data has already been saved; -- The machines used to run predict_race() may not have internet access; -- You can obtain timely snapshots of Census geographic data that match your voter file. 
+- You can save a lot of time in future runs of `predict_race()` if the + relevant census data has already been saved; +- The machines used to run `predict_race()` may not have internet + access; +- You can obtain timely snapshots of census geographic data that match + your voter file. -Downloading data using get_census_data() may take a long time, especially at the block level or in large states. If block-level Census data is not required, downloading Census data at the tract level will save time. Similarly, if tract-level Census data is not required, county-level data may be specified in order to save time. +Downloading data using `get_census_data()` may take a long time, +especially in large states or when using small geographic levels. If +block-level census data is not required, downloading census data at the +tract level will save time. Similarly, if tract-level data is not +required, county-level data may be specified in order to save time. ``` r library(wru) -data(voters) -voters.dc.nj <- voters[c(-3, -7), ] # remove two NY cases from dataset -census.dc.nj2 <- get_census_data(key = "...", state = c("DC", "NJ"), age = TRUE, sex = FALSE, census.geo = "tract") +voters.dc.nj <- voters[voters$state %in% c("DC", "NJ"), ] +census.dc.nj2 <- get_census_data(state = c("DC", "NJ"), age = TRUE, sex = FALSE, census.geo = "tract") predict_race(voter.file = voters.dc.nj, census.geo = "tract", census.data = census.dc.nj2, party = "PID", age = TRUE, sex = FALSE) predict_race(voter.file = voters.dc.nj, census.geo = "county", census.data = census.dc.nj2, age = TRUE, sex = FALSE) # Pr(Race | Surname, County) predict_race(voter.file = voters.dc.nj, census.geo = "tract", census.data = census.dc.nj2, age = TRUE, sex = FALSE) # Pr(Race | Surname, Tract) @@ -105,45 +185,69 @@ predict_race(voter.file = voters.dc.nj, census.geo = "county", census.data = cen predict_race(voter.file = voters.dc.nj, census.geo = "tract", census.data = census.dc.nj2, party = "PID", age = TRUE, sex = 
FALSE) # Pr(Race | Surname, Tract, Party) ``` -Or you can also use the census_geo_api() to manually construct a census object. The example below creates a census object with county-level and tract-level data in DC and NJ, while avoiding downloading block-level data. Note that this function has the input parameter 'state' that requires a two-letter state abbreviation to proceed. +#### Interact directly with the Census API + +You can use `census_geo_api()` to manually construct a census object. +The example below creates a census object with county-level and +tract-level data in DC and NJ, while avoiding downloading block-level +data. Note that the `state` argument requires a vector of two-letter +state abbreviations. ``` r -censusObj2 = list() +census.dc.nj3 = list() -county.dc <- census_geo_api(key = "...", state = "DC", geo = "county", age = TRUE, sex = FALSE) -tract.dc <- census_geo_api(key = "...", state = "DC", geo = "tract", age = TRUE, sex = FALSE) -censusObj2[["DC"]] <- list(state = "DC", county = county.dc, tract = tract.dc, age = TRUE, sex = FALSE) +county.dc <- census_geo_api(state = "DC", geo = "county", age = TRUE, sex = FALSE) +tract.dc <- census_geo_api(state = "DC", geo = "tract", age = TRUE, sex = FALSE) +census.dc.nj3[["DC"]] <- list(state = "DC", county = county.dc, tract = tract.dc, age = TRUE, sex = FALSE) -tract.nj <- census_geo_api(key = "...", state = "NJ", geo = "tract", age = TRUE, sex = FALSE) -county.nj <- census_geo_api(key = "...", state = "NJ", geo = "county", age = TRUE, sex = FALSE) -censusObj2[["NJ"]] <- list(state = "NJ", county = county.nj, tract = tract.nj, age = TRUE, sex = FALSE) +tract.nj <- census_geo_api(state = "NJ", geo = "tract", age = TRUE, sex = FALSE) +county.nj <- census_geo_api(state = "NJ", geo = "county", age = TRUE, sex = FALSE) +census.dc.nj3[["NJ"]] <- list(state = "NJ", county = county.nj, tract = tract.nj, age = TRUE, sex = FALSE) ``` -Note: The age and sex parameters must be consistent when creating the 
Census object and using that Census object in the predict_race function. If one of these parameters is TRUE in the Census object, it must also be TRUE in the predict_race function. +Note: The age and sex parameters must be consistent when creating the +Census object and using that Census object in the predict_race function. +If one of these parameters is TRUE in the Census object, it must also be +TRUE in the predict_race function. -After saving the data in censusObj2 above, we can condition race/ethnicity predictions on different combinations of input variables, without having to re-download the relevant Census data. +After saving the data in `census.dc.nj3` above, we can condition +race/ethnicity predictions on different combinations of input variables, +without having to re-download the relevant Census data. ``` r -predict_race(voter.file = voters.dc.nj, census.geo = "county", census.data = censusObj2, age = TRUE, sex = FALSE) # Pr(Race | Surname, County) -predict_race(voter.file = voters.dc.nj, census.geo = "tract", census.data = censusObj2, age = TRUE, sex = FALSE) # Pr(Race | Surname, Tract) -predict_race(voter.file = voters.dc.nj, census.geo = "county", census.data = censusObj2, party = "PID", age = TRUE, sex = FALSE) # Pr(Race | Surname, County, Party) -predict_race(voter.file = voters.dc.nj, census.geo = "tract", census.data = censusObj2, party = "PID", age = TRUE, sex = FALSE) # Pr(Race | Surname, Tract, Party) +predict_race(voter.file = voters.dc.nj, census.geo = "county", census.data = census.dc.nj3, age = TRUE, sex = FALSE) # Pr(Race | Surname, County) +predict_race(voter.file = voters.dc.nj, census.geo = "tract", census.data = census.dc.nj3, age = TRUE, sex = FALSE) # Pr(Race | Surname, Tract) +predict_race(voter.file = voters.dc.nj, census.geo = "county", census.data = census.dc.nj3, party = "PID", age = TRUE, sex = FALSE) # Pr(Race | Surname, County, Party) +predict_race(voter.file = voters.dc.nj, census.geo = "tract", census.data = census.dc.nj3, party 
= "PID", age = TRUE, sex = FALSE) # Pr(Race | Surname, Tract, Party) ``` -## Notes about process design +### Parallelization -For larger scale imputations garbage-collection can become a problem and your machine(s) can quickly run out of memory (RAM). It is recommended to use the `future.callr::callr` plan instead of `future::multisession`. The `callr` plan instantiates a new session at every iteration of your parallel loop or map. This simultaneously has the effect of creating more overhead, but also clearing the often sticky memory elements that would be left over to grow to eventual system failure when using `multisession`. You end up with a process that is more stable, but slightly slower. +For larger scale imputations, garbage collection can become a problem +and your machine(s) can quickly run out of memory (RAM). We recommend +using the `future.callr::callr` plan instead of `future::multisession`. +The `callr` plan instantiates a new session at every iteration of your +parallel loop or map. Although this has the negative effect of creating +more overhead, it also clears sticky memory elements that can grow to +eventual system failure when using `multisession`. You end up with a +process that is more stable, but slightly slower. - library(wru) - future::plan(future.callr::callr) - # ... +``` r +library(wru) +future::plan(future.callr::callr) +# ... +``` ## Census Data -This package uses the Census Bureau Data API but is not endorsed or certified by the Census Bureau. +This package uses the Census Bureau Data API but is not endorsed or +certified by the Census Bureau. -U.S. Census Bureau (2021, October 8). Decennial Census API. Census.gov. Retrieved from +U.S. Census Bureau (2021, October 8). Decennial Census API. Census.gov. +Retrieved from + -### A related song +## A related song -Watch [this](https://www.youtube.com/watch?v=LYb_nqU_43w)! 
+[![Thumbnail of the music video for “Who Are You” by The +Who](https://img.youtube.com/vi/PNbBDrceCy8/maxresdefault.jpg)](https://www.youtube.com/watch?v=PNbBDrceCy8) diff --git a/data-raw/voters.R b/data-raw/voters.R new file mode 100644 index 0000000..a224447 --- /dev/null +++ b/data-raw/voters.R @@ -0,0 +1,19 @@ +voters <- data.frame( + VoterID = as.character(1:10), + surname = c("Khanna", "Imai", "Rivera", "Fifield", "Zhou", "Ratkovic", "Johnson", "Lopez", "Wantchekon", "Morse"), + state = c("NJ", "NJ", "NY", "NJ", "NJ", "NJ", "NY", "NJ", "NJ", "DC"), + CD = c("12", "12", "12", "12", "12", "12", "9", "12", "12", "0"), + county = c("021", "021", "061", "021", "021", "021", "061", "021", "021", "001"), + tract = c("004000", "004501", "004800", "004501", "004501", "004000", "014900", "004501", "004501", "001301"), + block = c("3001", "1025", "6001", "1025", "1025", "1025", "4000", "1025", "1025", "3005"), + precinct = c("6", "", "", "", "", "", "", "", "", ""), + age = c(29, 40, 33, 27, 28, 35, 25, 33, 50, 29), + sex = c(0, 0, 0, 0, 1, 0, 0, 0, 0, 1), + party = c("Ind", "Dem", "Rep", "Dem", "Dem", "Ind", "Dem", "Rep", "Rep", "Rep"), + PID = c("0", "1", "2", "1", "1", "0", "1", "2", "2", "2"), + place = c("74000", "60900", "51000", "60900", "60900", "60900", "51000", "60900", "60900", "50000"), + last = c("Khanna", "Imai", "Rivera", "Fifield", "Zhou", "Ratkovic", "Johnson", "Lopez", "Wantchekon", "Morse"), + first = c("Kabir", "Kosuke", "Carlos", "Ben", "Yang-Yang", "Marc", "Frank", "Gabriel", "Leonard", "Julia") +) + +usethis::use_data(voters, overwrite = TRUE) diff --git a/data/voters.RData b/data/voters.RData deleted file mode 100644 index ffe9973..0000000 Binary files a/data/voters.RData and /dev/null differ diff --git a/data/voters.rda b/data/voters.rda new file mode 100644 index 0000000..bfa686b Binary files /dev/null and b/data/voters.rda differ diff --git a/inst/CITATION b/inst/CITATION new file mode 100644 index 0000000..7967551 --- /dev/null +++ 
b/inst/CITATION @@ -0,0 +1,13 @@ +bibentry( + bibtype = "Manual", + title = "wru: Who are You? Bayesian Prediction of Racial Category Using Surname, First Name, Middle Name, and Geolocation", + author = c( + person("Kabir", "Khanna", , "kabirkhanna@gmail.com", role = "aut"), + person("Brandon", "Bertelsen", , "brandon@bertelsen.ca", role = c("aut", "cre")), + person("Santiago", "Olivella", , "olivella@unc.edu", role = "aut"), + person("Evan", "Rosenman", , "etrrosenman@gmail.com", role = "aut"), + person("Kosuke", "Imai", , "imai@harvard.edu", role = "aut") + ), + year = 2023, + url = "https://CRAN.R-project.org/package=wru" +) diff --git a/man/census_data_preflight.Rd b/man/census_data_preflight.Rd index f3512c8..9c4dda9 100644 --- a/man/census_data_preflight.Rd +++ b/man/census_data_preflight.Rd @@ -7,11 +7,28 @@ census_data_preflight(census.data, census.geo, year) } \arguments{ -\item{census.data}{See documentation in \code{race_predict}.} +\item{census.data}{A list indexed by two-letter state abbreviations, +which contains pre-saved Census geographic data. +Can be generated using \code{get_census_data} function.} -\item{census.geo}{See documentation in \code{race_predict}.} +\item{census.geo}{An optional character vector specifying what level of +geography to use to merge in U.S. Census geographic data. Currently +\code{"county"}, \code{"tract"}, \code{"block_group"}, \code{"block"}, and \code{"place"} +are supported. +Note: sufficient information must be in user-defined \code{\var{voter.file}} object. +If \code{\var{census.geo} = "county"}, then \code{\var{voter.file}} +must have column named \code{county}. +If \code{\var{census.geo} = "tract"}, then \code{\var{voter.file}} +must have columns named \code{county} and \code{tract}. +And if \code{\var{census.geo} = "block"}, then \code{\var{voter.file}} +must have columns named \code{county}, \code{tract}, and \code{block}. 
+If \code{\var{census.geo} = "place"}, then \code{\var{voter.file}} +must have column named \code{place}. +Specifying \code{\var{census.geo}} will call \code{census_helper} function +to merge Census geographic data at specified level of geography.} -\item{year}{See documentation in \code{race_predict}.} +\item{year}{An optional character vector specifying the year of U.S. Census geographic +data to be downloaded. Use \code{"2010"}, or \code{"2020"}. Default is \code{"2020"}.} } \description{ Preflight census data diff --git a/man/census_geo_api.Rd b/man/census_geo_api.Rd index 4c1f6ae..b59ba69 100644 --- a/man/census_geo_api.Rd +++ b/man/census_geo_api.Rd @@ -20,31 +20,31 @@ census_geo_api( \item{key}{A required character object. Must contain user's Census API key, which can be requested \href{https://api.census.gov/data/key_signup.html}{here}.} -\item{state}{A required character object specifying which state to extract Census data for, +\item{state}{A required character object specifying which state to extract Census data for, e.g., \code{"NJ"}.} -\item{geo}{A character object specifying what aggregation level to use. -Use \code{"county"}, \code{"tract"},\code{"block_group"}, \code{"block"}, or \code{"place"}. +\item{geo}{A character object specifying what aggregation level to use. +Use \code{"county"}, \code{"tract"},\code{"block_group"}, \code{"block"}, or \code{"place"}. Default is \code{"tract"}. Warning: extracting block-level data takes very long.} -\item{age}{A \code{TRUE}/\code{FALSE} object indicating whether to condition on +\item{age}{A \code{TRUE}/\code{FALSE} object indicating whether to condition on age or not. If \code{FALSE} (default), function will return Pr(Geolocation | Race). -If \code{TRUE}, function will return Pr(Geolocation, Age | Race). +If \code{TRUE}, function will return Pr(Geolocation, Age | Race). 
If \code{\var{sex}} is also \code{TRUE}, function will return Pr(Geolocation, Age, Sex | Race).} -\item{sex}{A \code{TRUE}/\code{FALSE} object indicating whether to condition on -sex or not. If \code{FALSE} (default), function will return Pr(Geolocation | Race). -If \code{TRUE}, function will return Pr(Geolocation, Sex | Race). +\item{sex}{A \code{TRUE}/\code{FALSE} object indicating whether to condition on +sex or not. If \code{FALSE} (default), function will return Pr(Geolocation | Race). +If \code{TRUE}, function will return Pr(Geolocation, Sex | Race). If \code{\var{age}} is also \code{TRUE}, function will return Pr(Geolocation, Age, Sex | Race).} \item{year}{A character object specifying the year of U.S. Census data to be downloaded. Use \code{"2010"}, or \code{"2020"}. Default is \code{"2020"}. -Warning: 2020 U.S. Census data is downloaded only when \code{\var{age}} and +Warning: 2020 U.S. Census data is downloaded only when \code{\var{age}} and \code{\var{sex}} are both \code{FALSE}.} \item{retry}{The number of retries at the census website if network interruption occurs.} -\item{save_temp}{File indicating where to save the temporary outputs. +\item{save_temp}{File indicating where to save the temporary outputs. Defaults to NULL. If specified, the function will look for an .RData file with the same format as the expected output.} @@ -52,14 +52,14 @@ with the same format as the expected output.} Useful for smaller predictions where only a few counties are considered. Must be zero padded.} } \value{ -Output will be an object of class \code{list}, indexed by state names. It will - consist of the original user-input data with additional columns of Census geographic data. +Output will be an object of class \code{list}, indexed by state names. It will +consist of the original user-input data with additional columns of Census geographic data. } \description{ \code{census_geo_api} retrieves U.S. Census geographic data for a given state. 
} \details{ -This function allows users to download U.S. Census geographic data (2010 or 2020), +This function allows users to download U.S. Census geographic data (2010 or 2020), at either the county, tract, block, or place level, for a particular state. } \examples{ @@ -71,7 +71,7 @@ at either the county, tract, block, or place level, for a particular state. } \references{ -Relies on get_census_api, get_census_api_2, and vec_to_chunk functions authored by Nicholas Nagle, +Relies on get_census_api, get_census_api_2, and vec_to_chunk functions authored by Nicholas Nagle, available \href{https://rstudio-pubs-static.s3.amazonaws.com/19337_2e7f827190514c569ea136db788ce850.html}{here}. } \keyword{internal} diff --git a/man/census_helper.Rd b/man/census_helper.Rd index ef3c5ff..56eddff 100644 --- a/man/census_helper.Rd +++ b/man/census_helper.Rd @@ -68,8 +68,8 @@ If \code{\var{census.data}} is missing, Census geographic data will be obtained } \value{ Output will be an object of class \code{data.frame}. It will - consist of the original user-input data with additional columns of - Census data. +consist of the original user-input data with additional columns of +Census data. } \description{ \code{census_helper} links user-input dataset with Census geographic data. diff --git a/man/census_helper_new.Rd b/man/census_helper_new.Rd index e410267..1473f29 100644 --- a/man/census_helper_new.Rd +++ b/man/census_helper_new.Rd @@ -21,19 +21,19 @@ census_helper_new( \item{key}{A required character object. Must contain user's Census API key, which can be requested \href{https://api.census.gov/data/key_signup.html}{here}.} -\item{voter.file}{An object of class \code{data.frame}. Must contain field(s) named +\item{voter.file}{An object of class \code{data.frame}. Must contain field(s) named \code{\var{county}}, \code{\var{tract}}, \code{\var{block}}, and/or \code{\var{place}} -specifying geolocation. These should be character variables that match up with -U.S. Census categories. 
County should be three characters (e.g., "031" not "31"), -tract should be six characters, and block should be four characters. +specifying geolocation. These should be character variables that match up with +U.S. Census categories. County should be three characters (e.g., "031" not "31"), +tract should be six characters, and block should be four characters. Place should be five characters if it is included.} -\item{states}{A character vector specifying which states to extract -Census data for, e.g. \code{c("NJ", "NY")}. Default is \code{"all"}, which extracts +\item{states}{A character vector specifying which states to extract +Census data for, e.g. \code{c("NJ", "NY")}. Default is \code{"all"}, which extracts Census data for all states contained in user-input data.} -\item{geo}{A character object specifying what aggregation level to use. -Use \code{"county"}, \code{"tract"}, \code{"block"}, or \code{"place"}. +\item{geo}{A character object specifying what aggregation level to use. +Use \code{"county"}, \code{"tract"}, \code{"block"}, or \code{"place"}. Default is \code{"tract"}. Warning: extracting block-level data takes very long.} \item{age}{A \code{TRUE}/\code{FALSE} object indicating whether to condition on @@ -49,32 +49,32 @@ If \code{\var{age}} is also \code{TRUE}, function will return Pr(Geolocation, Ag \item{year}{A character object specifying the year of U.S. Census data to be downloaded. Use \code{"2010"}, or \code{"2020"}. Default is \code{"2020"}.} -\item{census.data}{A optional census object of class \code{list} containing +\item{census.data}{A optional census object of class \code{list} containing pre-saved Census geographic data. Can be created using \code{get_census_data} function. -If \code{\var{census.data}} is provided, the \code{\var{year}} element must -have the same value as the \code{\var{year}} option specified in this function -(i.e., \code{"2010"} in both or \code{"2020"} in both). 
-If \code{\var{census.data}} is provided, the \code{\var{age}} and the \code{\var{sex}} +If \code{\var{census.data}} is provided, the \code{\var{year}} element must +have the same value as the \code{\var{year}} option specified in this function +(i.e., \code{"2010"} in both or \code{"2020"} in both). +If \code{\var{census.data}} is provided, the \code{\var{age}} and the \code{\var{sex}} elements must be \code{FALSE}. This corresponds to the defaults of \code{census_geo_api}. If \code{\var{census.data}} is missing, Census geographic data will be obtained via Census API.} \item{retry}{The number of retries at the census website if network interruption occurs.} -\item{use.counties}{A logical, defaulting to FALSE. Should census data be filtered by counties +\item{use.counties}{A logical, defaulting to FALSE. Should census data be filtered by counties available in \var{census.data}?} } \value{ -Output will be an object of class \code{data.frame}. It will - consist of the original user-input data with additional columns of - Census data. +Output will be an object of class \code{data.frame}. It will +consist of the original user-input data with additional columns of +Census data. } \description{ \code{census_helper_new} links user-input dataset with Census geographic data. } \details{ -This function allows users to link their geocoded dataset (e.g., voter file) -with U.S. Census data (2010 or 2020). The function extracts Census Summary File data -at the county, tract, block, or place level. Census data calculated are +This function allows users to link their geocoded dataset (e.g., voter file) +with U.S. Census data (2010 or 2020). The function extracts Census Summary File data +at the county, tract, block, or place level. Census data calculated are Pr(Geolocation | Race) where geolocation is county, tract, block, or place. 
} \examples{ diff --git a/man/figures/logo.png b/man/figures/logo.png new file mode 100644 index 0000000..b279633 Binary files /dev/null and b/man/figures/logo.png differ diff --git a/man/format_legacy_data.Rd b/man/format_legacy_data.Rd index eefc5f3..2f7cc50 100644 --- a/man/format_legacy_data.Rd +++ b/man/format_legacy_data.Rd @@ -8,7 +8,7 @@ format_legacy_data(legacyFilePath, state, outFile = NULL) } \arguments{ \item{legacyFilePath}{A character vector giving the location of a legacy census data folder, -sourced from https://www2.census.gov/programs-surveys/decennial/2020/data/01-Redistricting_File--PL_94-171/. +sourced from https://www2.census.gov/programs-surveys/decennial/2020/data/01-Redistricting_File--PL_94-171/. These file names should end in ".pl".} \item{state}{The two letter state postal code.} @@ -21,10 +21,10 @@ filepath should end in ".RData".} for Bayesian name geocoding. } \details{ -This function allows users to construct datasets for analysis using the census legacy data format. -These data are available for the 2020 census at -https://www2.census.gov/programs-surveys/decennial/2020/data/01-Redistricting_File--PL_94-171/. -This function returns data structured analogously to data from the Census API, which is not yet +This function allows users to construct datasets for analysis using the census legacy data format. +These data are available for the 2020 census at +https://www2.census.gov/programs-surveys/decennial/2020/data/01-Redistricting_File--PL_94-171/. +This function returns data structured analogously to data from the Census API, which is not yet available for the 2020 Census as of September 2021. } \examples{ diff --git a/man/get_census_api.Rd b/man/get_census_api.Rd index cf99156..202b031 100644 --- a/man/get_census_api.Rd +++ b/man/get_census_api.Rd @@ -26,7 +26,7 @@ e.g., \code{"for=block:1213&in=state:47+county:015+tract:*"}.} } \value{ If successful, output will be an object of class \code{data.frame}. 
- If unsuccessful, function prints the URL query that caused the error. +If unsuccessful, function prints the URL query that caused the error. } \description{ \code{get_census_api} obtains U.S. Census data via the public API. diff --git a/man/get_census_api_2.Rd b/man/get_census_api_2.Rd index 065fe9e..9ee0670 100644 --- a/man/get_census_api_2.Rd +++ b/man/get_census_api_2.Rd @@ -10,30 +10,30 @@ get_census_api_2(data_url, key, get, region, retry = 3) \item{data_url}{URL root of the API, e.g., \code{"https://api.census.gov/data/2020/dec/pl"}.} -\item{key}{A required character object containing user's Census API key, +\item{key}{A required character object containing user's Census API key, which can be requested \href{https://api.census.gov/data/key_signup.html}{here}.} -\item{get}{A character vector of variables to get, +\item{get}{A character vector of variables to get, e.g., \code{c("P2_005N", "P2_006N", "P2_007N", "P2_008N")}. -If there are more than 50 variables, then function will automatically +If there are more than 50 variables, then function will automatically split variables into separate queries.} \item{region}{Character object specifying which region to obtain data for. -Must contain "for" and possibly "in", +Must contain "for" and possibly "in", e.g., \code{"for=block:1213&in=state:47+county:015+tract:*"}.} \item{retry}{The number of retries at the census website if network interruption occurs.} } \value{ -If successful, output will be an object of class \code{data.frame}. - If unsuccessful, function prints the URL query that was constructed. +If successful, output will be an object of class \code{data.frame}. +If unsuccessful, function prints the URL query that was constructed. } \description{ \code{get_census_api_2} assembles URL components for \code{get_census_api}. } \details{ -This function assembles the URL components and sends the request to the Census server. -It is used by the \code{get_census_api} function. 
The user should not need to call this +This function assembles the URL components and sends the request to the Census server. +It is used by the \code{get_census_api} function. The user should not need to call this function directly. } \examples{ @@ -42,7 +42,7 @@ get = c("P2_005N", "P2_006N", "P2_007N", "P2_008N"), region = "for=county:*&in=s } \references{ -Based on code authored by Nicholas Nagle, which is available +Based on code authored by Nicholas Nagle, which is available \href{https://rstudio-pubs-static.s3.amazonaws.com/19337_2e7f827190514c569ea136db788ce850.html}{here}. } \keyword{internal} diff --git a/man/get_census_data.Rd b/man/get_census_data.Rd index dce4371..b2b7085 100644 --- a/man/get_census_data.Rd +++ b/man/get_census_data.Rd @@ -16,27 +16,31 @@ get_census_data( ) } \arguments{ -\item{key}{A required character object containing a valid Census API key, -which can be requested \href{https://api.census.gov/data/key_signup.html}{here}.} +\item{key}{A character string containing a valid U.S. Census API key, +which can be requested from the +\href{https://api.census.gov/data/key_signup.html}{U.S. Census API key signup page}. + +If \code{\link{NULL}}, the default, attempts to find a census key stored in an +\link[=Sys.getenv]{environment variable} named \code{CENSUS_API_KEY}.} \item{states}{which states to extract Census data for, e.g., \code{c("NJ", "NY")}.} -\item{age}{A \code{TRUE}/\code{FALSE} object indicating whether to condition on +\item{age}{A \code{TRUE}/\code{FALSE} object indicating whether to condition on age or not. If \code{FALSE} (default), function will return Pr(Geolocation | Race). -If \code{TRUE}, function will return Pr(Geolocation, Age | Race). +If \code{TRUE}, function will return Pr(Geolocation, Age | Race). If \code{\var{sex}} is also \code{TRUE}, function will return Pr(Geolocation, Age, Sex | Race).} -\item{sex}{A \code{TRUE}/\code{FALSE} object indicating whether to condition on -sex or not. 
If \code{FALSE} (default), function will return Pr(Geolocation | Race). -If \code{TRUE}, function will return Pr(Geolocation, Sex | Race). +\item{sex}{A \code{TRUE}/\code{FALSE} object indicating whether to condition on +sex or not. If \code{FALSE} (default), function will return Pr(Geolocation | Race). +If \code{TRUE}, function will return Pr(Geolocation, Sex | Race). If \code{\var{age}} is also \code{TRUE}, function will return Pr(Geolocation, Age, Sex | Race).} \item{year}{A character object specifying the year of U.S. Census data to be downloaded. Use \code{"2010"}, or \code{"2020"}. Default is \code{"2020"}. -Warning: 2020 U.S. Census data is downloaded only when \code{\var{age}} and +Warning: 2020 U.S. Census data is downloaded only when \code{\var{age}} and \code{\var{sex}} are both \code{FALSE}.} -\item{census.geo}{An optional character vector specifying what level of +\item{census.geo}{An optional character vector specifying what level of geography to use to merge in U.S. Census 2010 geographic data. Currently \code{"county"}, \code{"tract"}, \code{"block"}, and \code{"place"} are supported.} @@ -45,14 +49,14 @@ geography to use to merge in U.S. Census 2010 geographic data. Currently \item{county.list}{A named list of character vectors of counties present in your \var{voter.file}, per state.} } \value{ -Output will be an object of class \code{list} indexed by state. -Output will contain a subset of the following elements: -\code{state}, \code{age}, \code{sex}, +Output will be an object of class \code{list} indexed by state. +Output will contain a subset of the following elements: +\code{state}, \code{age}, \code{sex}, \code{county}, \code{tract}, \code{block_group}, \code{block}, and \code{place}. } \description{ -\code{get_census_data} returns county-, tract-, and block-level Census data -for specified state(s). 
Using this function to download Census data in advance +\code{get_census_data} returns county-, tract-, and block-level Census data +for specified state(s). Using this function to download Census data in advance can save considerable time when running \code{predict_race} and \code{census_helper}. } \examples{ diff --git a/man/merge_names.Rd b/man/merge_names.Rd index 114c6fe..ab98bee 100644 --- a/man/merge_names.Rd +++ b/man/merge_names.Rd @@ -58,38 +58,40 @@ in order to increase the chance of finding a match. Default is \code{TRUE}.} } \value{ Output will be an object of class \code{data.frame}. It will - consist of the original user-input data with additional columns that - specify the part of the name matched with Census data (\code{\var{surname.match}}), - and the probabilities Pr(Race | Surname) for each racial group - (\code{\var{p_whi}} for White, \code{\var{p_bla}} for Black, - \code{\var{p_his}} for Hispanic/Latino, - \code{\var{p_asi}} for Asian and Pacific Islander, and - \code{\var{p_oth}} for Other/Mixed). +consist of the original user-input data with additional columns that +specify the part of the name matched with Census data (\code{\var{surname.match}}), +and the probabilities Pr(Race | Surname) for each racial group +(\code{\var{p_whi}} for White, \code{\var{p_bla}} for Black, +\code{\var{p_his}} for Hispanic/Latino, +\code{\var{p_asi}} for Asian and Pacific Islander, and +\code{\var{p_oth}} for Other/Mixed). } \description{ \code{merge_names} merges names in a user-input dataset with corresponding - race/ethnicity probabilities derived from both the U.S. Census Surname List - and Spanish Surname List and voter files from states in the Southern U.S. +race/ethnicity probabilities derived from both the U.S. Census Surname List +and Spanish Surname List and voter files from states in the Southern U.S. 
} \details{ This function allows users to match names in their dataset with database entries - estimating P(name | ethnicity) for each of the five major racial groups for each - name. The database probabilities are derived from both the U.S. Census Surname List - and Spanish Surname List and voter files from states in the Southern U.S. +estimating P(name | ethnicity) for each of the five major racial groups for each +name. The database probabilities are derived from both the U.S. Census Surname List +and Spanish Surname List and voter files from states in the Southern U.S. - By default, the function matches names as follows: - 1) Search raw surnames in the database; - 2) Remove any punctuation and search again; - 3) Remove any spaces and search again; - 4) Remove suffixes (e.g., "Jr") and search again (last names only) - 5) Split double-barreled names into two parts and search first part of name; - 6) Split double-barreled names into two parts and search second part of name; +By default, the function matches names as follows: +\enumerate{ +\item Search raw surnames in the database; +\item Remove any punctuation and search again; +\item Remove any spaces and search again; +\item Remove suffixes (e.g., "Jr") and search again (last names only) +\item Split double-barreled names into two parts and search first part of name; +\item Split double-barreled names into two parts and search second part of name; +} - Each step only applies to names not matched in a previous step. - Steps 2 through 6 are not applied if \code{clean.surname} is FALSE. +Each step only applies to names not matched in a previous step. +Steps 2 through 6 are not applied if \code{clean.surname} is FALSE. - Note: Any name appearing only on the Spanish Surname List is assigned a - probability of 1 for Hispanics/Latinos and 0 for all other racial groups. +Note: Any name appearing only on the Spanish Surname List is assigned a +probability of 1 for Hispanics/Latinos and 0 for all other racial groups. 
} \examples{ data(voters) diff --git a/man/merge_surnames.Rd b/man/merge_surnames.Rd index a4ccd94..08c257d 100644 --- a/man/merge_surnames.Rd +++ b/man/merge_surnames.Rd @@ -13,62 +13,64 @@ merge_surnames( ) } \arguments{ -\item{voter.file}{An object of class \code{data.frame}. Must contain a field +\item{voter.file}{An object of class \code{data.frame}. Must contain a field named 'surname' containing list of surnames to be merged with Census lists.} -\item{surname.year}{An object of class \code{numeric} indicating which year -Census Surname List is from. Accepted values are \code{2010} and \code{2000}. +\item{surname.year}{An object of class \code{numeric} indicating which year +Census Surname List is from. Accepted values are \code{2010} and \code{2000}. Default is \code{2020}.} -\item{name.data}{An object of class \code{data.frame}. Must contain a leading -column of surnames, and 5 subsequent columns, with Pr(Race | Surname) for each +\item{name.data}{An object of class \code{data.frame}. Must contain a leading +column of surnames, and 5 subsequent columns, with Pr(Race | Surname) for each of the five major racial categories.} -\item{clean.surname}{A \code{TRUE}/\code{FALSE} object. If \code{TRUE}, -any surnames in \code{\var{voter.file}} that cannot initially be matched -to surname lists will be cleaned, according to U.S. Census specifications, +\item{clean.surname}{A \code{TRUE}/\code{FALSE} object. If \code{TRUE}, +any surnames in \code{\var{voter.file}} that cannot initially be matched +to surname lists will be cleaned, according to U.S. Census specifications, in order to increase the chance of finding a match. Default is \code{TRUE}.} -\item{impute.missing}{A \code{TRUE}/\code{FALSE} object. If \code{TRUE}, -race/ethnicity probabilities will be imputed for unmatched names using +\item{impute.missing}{A \code{TRUE}/\code{FALSE} object. 
If \code{TRUE}, +race/ethnicity probabilities will be imputed for unmatched names using race/ethnicity distribution for all other names (i.e., not on Census List). Default is \code{TRUE}.} } \value{ -Output will be an object of class \code{data.frame}. It will - consist of the original user-input data with additional columns that - specify the part of the name matched with Census data (\code{\var{surname.match}}), - and the probabilities Pr(Race | Surname) for each racial group - (\code{\var{p_whi}} for White, \code{\var{p_bla}} for Black, - \code{\var{p_his}} for Hispanic/Latino, - \code{\var{p_asi}} for Asian and Pacific Islander, and - \code{\var{p_oth}} for Other/Mixed). +Output will be an object of class \code{data.frame}. It will +consist of the original user-input data with additional columns that +specify the part of the name matched with Census data (\code{\var{surname.match}}), +and the probabilities Pr(Race | Surname) for each racial group +(\code{\var{p_whi}} for White, \code{\var{p_bla}} for Black, +\code{\var{p_his}} for Hispanic/Latino, +\code{\var{p_asi}} for Asian and Pacific Islander, and +\code{\var{p_oth}} for Other/Mixed). #' } \description{ -\code{merge_surnames} merges surnames in user-input dataset with corresponding - race/ethnicity probabilities from U.S. Census Surname List and Spanish Surname List. +\code{merge_surnames} merges surnames in user-input dataset with corresponding +race/ethnicity probabilities from U.S. Census Surname List and Spanish Surname List. } \details{ -This function allows users to match surnames in their dataset with the U.S. - Census Surname List (from 2000 or 2010) and Spanish Surname List to obtain - Pr(Race | Surname) for each of the five major racial groups. 
- - By default, the function matches surnames to the Census list as follows: - 1) Search raw surnames in Census surname list; - 2) Remove any punctuation and search again; - 3) Remove any spaces and search again; - 4) Remove suffixes (e.g., Jr) and search again; - 5) Split double-barreled surnames into two parts and search first part of name; - 6) Split double-barreled surnames into two parts and search second part of name; - 7) For any remaining names, impute probabilities using distribution - for all names not appearing on Census list. - - Each step only applies to surnames not matched in a previous ste. - Steps 2 through 7 are not applied if \code{clean.surname} is FALSE. - - Note: Any name appearing only on the Spanish Surname List is assigned a - probability of 1 for Hispanics/Latinos and 0 for all other racial groups. +This function allows users to match surnames in their dataset with the U.S. +Census Surname List (from 2000 or 2010) and Spanish Surname List to obtain +Pr(Race | Surname) for each of the five major racial groups. + +By default, the function matches surnames to the Census list as follows: +\enumerate{ +\item Search raw surnames in Census surname list; +\item Remove any punctuation and search again; +\item Remove any spaces and search again; +\item Remove suffixes (e.g., Jr) and search again; +\item Split double-barreled surnames into two parts and search first part of name; +\item Split double-barreled surnames into two parts and search second part of name; +\item For any remaining names, impute probabilities using distribution +for all names not appearing on Census list. +} + +Each step only applies to surnames not matched in a previous step. +Steps 2 through 7 are not applied if \code{clean.surname} is FALSE. + +Note: Any name appearing only on the Spanish Surname List is assigned a +probability of 1 for Hispanics/Latinos and 0 for all other racial groups.
} \examples{ data(voters) diff --git a/man/modfuns.Rd b/man/modfuns.Rd index 9702621..2d66275 100644 --- a/man/modfuns.Rd +++ b/man/modfuns.Rd @@ -62,48 +62,115 @@ predict_race_me( ) } \arguments{ -\item{voter.file}{See documentation in \code{race_predict}.} - -\item{census.surname}{See documentation in \code{race_predict}.} - -\item{surname.only}{See documentation in \code{race_predict}.} - -\item{surname.year}{See documentation in \code{race_predict}.} - -\item{name.dictionaries}{See documentation in \code{race_predict}.} - -\item{census.geo}{See documentation in \code{race_predict}.} - -\item{census.key}{See documentation in \code{race_predict}.} - -\item{census.data}{See documentation in \code{race_predict}.} - -\item{age}{See documentation in \code{race_predict}.} - -\item{sex}{See documentation in \code{race_predict}.} - -\item{year}{See documentation in \code{race_predict}.} - -\item{party}{See documentation in \code{race_predict}.} - -\item{retry}{See documentation in \code{race_predict}.} - -\item{impute.missing}{See documentation in \code{race_predict}.} +\item{voter.file}{An object of class \code{data.frame}. +Must contain a row for each individual being predicted, +as well as a field named \code{\var{surname}} containing each individual's surname. +If using geolocation in predictions, \code{\var{voter.file}} must contain a field named +\code{\var{state}}, which contains the two-character abbreviation for each individual's +state of residence (e.g., \code{"nj"} for New Jersey). +If using Census geographic data in race/ethnicity predictions, +\code{\var{voter.file}} must also contain at least one of the following fields: +\code{\var{county}}, \code{\var{tract}}, \code{\var{block_group}}, \code{\var{block}}, +and/or \code{\var{place}}. +These fields should contain character strings matching U.S. Census categories. 
+County is three characters (e.g., \code{"031"} not \code{"31"}), +tract is six characters, block group is usually a single character and block +is four characters. Place is five characters. +See below for other optional fields.} + +\item{census.surname}{A \code{TRUE}/\code{FALSE} object. If \code{TRUE}, +function will call \code{merge_surnames} to merge in Pr(Race | Surname) +from U.S. Census Surname List (2000, 2010, or 2020) and Spanish Surname List. +If \code{FALSE}, user must provide a \code{name.dictionary} (see below). +Default is \code{TRUE}.} + +\item{surname.only}{A \code{TRUE}/\code{FALSE} object. If \code{TRUE}, race predictions will +only use surname data and calculate Pr(Race | Surname). Default is \code{FALSE}.} + +\item{name.dictionaries}{Optional named list of \code{data.frame}'s +containing counts of names by race. Any of the following named elements +are allowed: "surname", "first", "middle". When present, the objects must +follow the same structure as \code{last_c}, \code{first_c}, +\code{mid_c}, respectively.} + +\item{census.geo}{An optional character vector specifying what level of +geography to use to merge in U.S. Census geographic data. Currently +\code{"county"}, \code{"tract"}, \code{"block_group"}, \code{"block"}, and \code{"place"} +are supported. +Note: sufficient information must be in user-defined \code{\var{voter.file}} object. +If \code{\var{census.geo} = "county"}, then \code{\var{voter.file}} +must have column named \code{county}. +If \code{\var{census.geo} = "tract"}, then \code{\var{voter.file}} +must have columns named \code{county} and \code{tract}. +And if \code{\var{census.geo} = "block"}, then \code{\var{voter.file}} +must have columns named \code{county}, \code{tract}, and \code{block}. +If \code{\var{census.geo} = "place"}, then \code{\var{voter.file}} +must have column named \code{place}. 
+Specifying \code{\var{census.geo}} will call \code{census_helper} function +to merge Census geographic data at specified level of geography.} + +\item{census.key}{A character object specifying user's Census API key. +Required if \code{census.geo} is specified, because a valid Census API key is +required to download Census geographic data. + +If \code{\link{NULL}}, the default, attempts to find a census key stored in an +\link[=Sys.getenv]{environment variable} named \code{CENSUS_API_KEY}.} + +\item{census.data}{A list indexed by two-letter state abbreviations, +which contains pre-saved Census geographic data. +Can be generated using \code{get_census_data} function.} + +\item{age}{An optional \code{TRUE}/\code{FALSE} object specifying whether to +condition race predictions on age (in addition to surname and geolocation). +Default is \code{FALSE}. Must be same as \code{\var{age}} in \code{\var{census.data}} object. +May only be set to \code{TRUE} if \code{census.geo} option is specified. +If \code{TRUE}, \code{\var{voter.file}} should include a numerical variable \code{\var{age}}.} + +\item{sex}{optional \code{TRUE}/\code{FALSE} object specifying whether to +condition race predictions on sex (in addition to surname and geolocation). +Default is \code{FALSE}. Must be same as \code{\var{sex}} in \code{\var{census.data}} object. +May only be set to \code{TRUE} if \code{census.geo} option is specified. +If \code{TRUE}, \code{\var{voter.file}} should include a numerical variable \code{\var{sex}}, +where \code{\var{sex}} is coded as 0 for males and 1 for females.} + +\item{year}{An optional character vector specifying the year of U.S. Census geographic +data to be downloaded. Use \code{"2010"}, or \code{"2020"}. Default is \code{"2020"}.} + +\item{party}{An optional character object specifying party registration field +in \code{\var{voter.file}}, e.g., \code{\var{party} = "PartyReg"}. 
+If specified, race/ethnicity predictions will be conditioned +on individual's party registration (in addition to geolocation). +Whatever the name of the party registration field in \code{\var{voter.file}}, +it should be coded as 1 for Democrat, 2 for Republican, and 0 for Other.} + +\item{retry}{The number of retries at the census website if network interruption occurs.} + +\item{impute.missing}{Logical, defaults to TRUE. Should missing be imputed?} \item{use.counties}{A logical, defaulting to FALSE. Should census data be filtered by counties available in \var{census.data}?} -\item{names.to.use}{See documentation in \code{race_predict}.} +\item{names.to.use}{One of 'surname', 'surname, first', or 'surname, first, +middle'. Defaults to 'surname'.} -\item{race.init}{See documentation in \code{race_predict}.} +\item{race.init}{Vector of initial race for each observation in voter.file. +Must be an integer vector, with 1=white, 2=black, 3=hispanic, 4=asian, and +5=other. Defaults to values obtained using \code{model="BISG_surname"}.} -\item{ctrl}{See \code{control} in documentation for \code{race_predict}.} +\item{ctrl}{See \code{control} in documentation for \code{\link[=predict_race]{predict_race()}}.} } \value{ -See documentation in \code{race_predict}. +Output will be an object of class \code{data.frame}. It will +consist of the original user-input \code{voter.file} with additional columns with +predicted probabilities for each of the five major racial categories: +\code{\var{pred.whi}} for White, +\code{\var{pred.bla}} for Black, +\code{\var{pred.his}} for Hispanic/Latino, +\code{\var{pred.asi}} for Asian/Pacific Islander, and +\code{\var{pred.oth}} for Other/Mixed. } \description{ These functions are intended for internal use only. Users should use the -\code{race_predict} interface rather any of these functions directly. +\code{\link[=predict_race]{predict_race()}} interface rather than any of these functions directly.
} \details{ These functions fit different versions of WRU. \code{.predict_race_old} fits diff --git a/man/predict_race.Rd b/man/predict_race.Rd index 5b80be6..3c73abd 100644 --- a/man/predict_race.Rd +++ b/man/predict_race.Rd @@ -34,12 +34,12 @@ If using geolocation in predictions, \code{\var{voter.file}} must contain a fiel state of residence (e.g., \code{"nj"} for New Jersey). If using Census geographic data in race/ethnicity predictions, \code{\var{voter.file}} must also contain at least one of the following fields: -\code{\var{county}}, \code{\var{tract}}, \code{\var{block_group}}, \code{\var{block}}, +\code{\var{county}}, \code{\var{tract}}, \code{\var{block_group}}, \code{\var{block}}, and/or \code{\var{place}}. These fields should contain character strings matching U.S. Census categories. County is three characters (e.g., \code{"031"} not \code{"31"}), tract is six characters, block group is usually a single character and block - is four characters. Place is five characters. +is four characters. Place is five characters. See below for other optional fields.} \item{census.surname}{A \code{TRUE}/\code{FALSE} object. If \code{TRUE}, @@ -53,7 +53,7 @@ only use surname data and calculate Pr(Race | Surname). Default is \code{FALSE}. \item{census.geo}{An optional character vector specifying what level of geography to use to merge in U.S. Census geographic data. Currently -\code{"county"}, \code{"tract"}, \code{"block_group"}, \code{"block"}, and \code{"place"} +\code{"county"}, \code{"tract"}, \code{"block_group"}, \code{"block"}, and \code{"place"} are supported. Note: sufficient information must be in user-defined \code{\var{voter.file}} object. If \code{\var{census.geo} = "county"}, then \code{\var{voter.file}} @@ -67,9 +67,12 @@ must have column named \code{place}. 
Specifying \code{\var{census.geo}} will call \code{census_helper} function to merge Census geographic data at specified level of geography.} -\item{census.key}{A character object specifying user's Census API -key. Required if \code{\var{census.geo}} is specified, because -a valid Census API key is required to download Census geographic data.} +\item{census.key}{A character object specifying user's Census API key. +Required if \code{census.geo} is specified, because a valid Census API key is +required to download Census geographic data. + +If \code{\link{NULL}}, the default, attempts to find a census key stored in an +\link[=Sys.getenv]{environment variable} named \code{CENSUS_API_KEY}.} \item{census.data}{A list indexed by two-letter state abbreviations, which contains pre-saved Census geographic data. @@ -102,19 +105,19 @@ it should be coded as 1 for Democrat, 2 for Republican, and 0 for Other.} \item{impute.missing}{Logical, defaults to TRUE. Should missing be imputed?} -\item{use.counties}{A logical, defaulting to FALSE. Should census data be filtered by counties +\item{use.counties}{A logical, defaulting to FALSE. Should census data be filtered by counties available in \var{census.data}?} -\item{model}{Character string, either "BISG" (default) or "fBISG" (for error-correction, +\item{model}{Character string, either "BISG" (default) or "fBISG" (for error-correction, fully-Bayesian model).} \item{race.init}{Vector of initial race for each observation in voter.file. -Must be an integer vector, with 1=white, 2=black, 3=hispanic, 4=asian, and +Must be an integer vector, with 1=white, 2=black, 3=hispanic, 4=asian, and 5=other. Defaults to values obtained using \code{model="BISG_surname"}.} -\item{name.dictionaries}{Optional named list of \code{data.frame}'s -containing counts of names by race. Any of the following named elements -are allowed: "surname", "first", "middle". 
When present, the objects must +\item{name.dictionaries}{Optional named list of \code{data.frame}'s +containing counts of names by race. Any of the following named elements +are allowed: "surname", "first", "middle". When present, the objects must follow the same structure as \code{last_c}, \code{first_c}, \code{mid_c}, respectively.} @@ -123,22 +126,22 @@ middle'. Defaults to 'surname'.} \item{control}{List of control arguments only used when \code{model="fBISG"}, including \itemize{ - \item{iter}{ Number of MCMC iterations. Defaults to 1000.} - \item{burnin}{ Number of iterations discarded as burnin. Defaults to half of \code{iter}.} - \item{verbose}{ Print progress information. Defaults to \code{TRUE}.} - \item{me.correct}{ Boolean. Should the model correcting measurement error for \code{races|geo}? Defaults to \code{TRUE}.} - \item{seed}{ RNG seed. If \code{NULL}, a seed is generated and returned as an attribute for reproducibility.} +\item{iter}{ Number of MCMC iterations. Defaults to 1000.} +\item{burnin}{ Number of iterations discarded as burnin. Defaults to half of \code{iter}.} +\item{verbose}{ Print progress information. Defaults to \code{TRUE}.} +\item{me.correct}{ Boolean. Should the model correct measurement error for \code{races|geo}? Defaults to \code{TRUE}.} +\item{seed}{ RNG seed. If \code{NULL}, a seed is generated and returned as an attribute for reproducibility.} }} } \value{ Output will be an object of class \code{data.frame}. It will - consist of the original user-input \code{voter.file} with additional columns with - predicted probabilities for each of the five major racial categories: - \code{\var{pred.whi}} for White, - \code{\var{pred.bla}} for Black, - \code{\var{pred.his}} for Hispanic/Latino, - \code{\var{pred.asi}} for Asian/Pacific Islander, and - \code{\var{pred.oth}} for Other/Mixed.
+consist of the original user-input \code{voter.file} with additional columns with +predicted probabilities for each of the five major racial categories: +\code{\var{pred.whi}} for White, +\code{\var{pred.bla}} for Black, +\code{\var{pred.his}} for Hispanic/Latino, +\code{\var{pred.asi}} for Asian/Pacific Islander, and +\code{\var{pred.oth}} for Other/Mixed. } \description{ \code{predict_race} makes probabilistic estimates of individual-level race/ethnicity. diff --git a/man/sample_me.Rd b/man/sample_me.Rd index 4e82121..157ae51 100644 --- a/man/sample_me.Rd +++ b/man/sample_me.Rd @@ -32,9 +32,9 @@ sample_me( \item{pi_s}{Numeric matrix of race | surname prior probabilities.} -\item{pi_f}{Same as `pi_s`, but for first names.} +\item{pi_f}{Same as \code{pi_s}, but for first names.} -\item{pi_m}{Same as `pi_s`, but for middle names.} +\item{pi_m}{Same as \code{pi_s}, but for middle names.} \item{pi_nr}{Matrix of marginal probability distribution over missing names; non-keyword names default to this distribution.} @@ -52,9 +52,9 @@ sample_me( \item{M_rs}{Integer matrix of race | surname counts in dictionary (surnames in columns).} -\item{M_rf}{Same as `M_rs`, but for first names (can be empty matrix for surname only models).} +\item{M_rf}{Same as \code{M_rs}, but for first names (can be empty matrix for surname only models).} -\item{M_rm}{Same as `M_rs`, but for middle names (can be empty matrix for surname, or surname and first name only models).} +\item{M_rm}{Same as \code{M_rs}, but for middle names (can be empty matrix for surname, or surname and first name only models).} \item{alpha}{Numeric matrix of race | geography prior probabilities.} diff --git a/man/surnames2000.Rd b/man/surnames2000.Rd index 00a2c30..6dc45f5 100644 --- a/man/surnames2000.Rd +++ b/man/surnames2000.Rd @@ -7,13 +7,13 @@ \format{ A data frame with 157,728 rows and 6 variables: \describe{ - \item{surname}{Surname} - \item{p_whi}{Pr(White | Surname)} - \item{p_bla}{Pr(Black | Surname)} - 
\item{p_his}{Pr(Hispanic/Latino | Surname)} - \item{p_asi}{Pr(Asian/Pacific Islander | Surname)} - \item{p_oth}{Pr(Other | Surname)} - #' } +\item{surname}{Surname} +\item{p_whi}{Pr(White | Surname)} +\item{p_bla}{Pr(Black | Surname)} +\item{p_his}{Pr(Hispanic/Latino | Surname)} +\item{p_asi}{Pr(Asian/Pacific Islander | Surname)} +\item{p_oth}{Pr(Other | Surname)} +#' } } \usage{ surnames2000 diff --git a/man/surnames2010.Rd b/man/surnames2010.Rd index 7341985..a09afdf 100644 --- a/man/surnames2010.Rd +++ b/man/surnames2010.Rd @@ -7,13 +7,13 @@ \format{ A data frame with 167,613 rows and 6 variables: \describe{ - \item{surname}{Surname} - \item{p_whi}{Pr(White | Surname)} - \item{p_bla}{Pr(Black | Surname)} - \item{p_his}{Pr(Hispanic/Latino | Surname)} - \item{p_asi}{Pr(Asian/Pacific Islander | Surname)} - \item{p_oth}{Pr(Other | Surname)} - #' } +\item{surname}{Surname} +\item{p_whi}{Pr(White | Surname)} +\item{p_bla}{Pr(Black | Surname)} +\item{p_his}{Pr(Hispanic/Latino | Surname)} +\item{p_asi}{Pr(Asian/Pacific Islander | Surname)} +\item{p_oth}{Pr(Other | Surname)} +#' } } \usage{ surnames2010 diff --git a/man/vec_to_chunk.Rd b/man/vec_to_chunk.Rd index eb77daa..fe5ae7f 100644 --- a/man/vec_to_chunk.Rd +++ b/man/vec_to_chunk.Rd @@ -16,9 +16,9 @@ Object of class \code{list}. \code{vec_to_chunk} takes a list of variables and collects them into 50-variable chunks. } \details{ -This function takes a list of variable names and collects them into chunks with no more than -50 variables each. This helps to get around requests with more than 50 variables,because the -API only allows queries of 50 variables at a time. +This function takes a list of variable names and collects them into chunks with no more than +50 variables each. This helps to get around requests with more than 50 variables, because the +API only allows queries of 50 variables at a time. The user should not need to call this function directly.
} \examples{ @@ -29,7 +29,7 @@ vec_to_chunk(x = c(paste("P012F0", seq(10:49), sep = ""), } \references{ -Based on code authored by Nicholas Nagle, which is available +Based on code authored by Nicholas Nagle, which is available \href{https://rstudio-pubs-static.s3.amazonaws.com/19337_2e7f827190514c569ea136db788ce850.html}{here}. } \keyword{internal} diff --git a/man/voters.Rd b/man/voters.Rd index 3637041..be5d7bc 100644 --- a/man/voters.Rd +++ b/man/voters.Rd @@ -7,22 +7,22 @@ \format{ A data frame with 10 rows and 12 variables: \describe{ - \item{VoterID}{Voter identifier (numeric)} - \item{surname}{Surname} - \item{state}{State of residence} - \item{CD}{Congressional district} - \item{county}{Census county (three-digit code)} - \item{first}{First name} - \item{last}{Last name or surname} - \item{tract}{Census tract (six-digit code)} - \item{block}{Census block (four-digit code)} - \item{precinct}{Voting precinct} - \item{place}{Voting place} - \item{age}{Age in years} - \item{sex}{0=male, 1=female} - \item{party}{Party registration (character)} - \item{PID}{Party registration (numeric)} - #' } +\item{VoterID}{Voter identifier (numeric)} +\item{surname}{Surname} +\item{state}{State of residence} +\item{CD}{Congressional district} +\item{county}{Census county (three-digit code)} +\item{first}{First name} +\item{last}{Last name or surname} +\item{tract}{Census tract (six-digit code)} +\item{block}{Census block (four-digit code)} +\item{precinct}{Voting precinct} +\item{place}{Voting place} +\item{age}{Age in years} +\item{sex}{0=male, 1=female} +\item{party}{Party registration (character)} +\item{PID}{Party registration (numeric)} +#' } } \usage{ voters diff --git a/man/wru_data_preflight.Rd b/man/wru_data_preflight.Rd index cc93d37..7da9e58 100644 --- a/man/wru_data_preflight.Rd +++ b/man/wru_data_preflight.Rd @@ -10,6 +10,6 @@ wru_data_preflight() Checks if namedata is available in the current working directory, if not downloads it from github using piggyback. 
By default, wru will download the data to a temporary directory that lasts as long as your session does. -However, you may wish to set the \code{wru_data_wd} option to save the +However, you may wish to set the \code{wru_data_wd} option to save the downloaded data to your current working directory for more permanence. }