Skip to content

Commit

Permalink
Merge pull request #125 from mdblocker/drop_bad_geos
Browse files Browse the repository at this point in the history
Update to census_helper_v2
  • Loading branch information
1beb authored Dec 5, 2023
2 parents 612ed1c + 4692683 commit 5aaea19
Show file tree
Hide file tree
Showing 9 changed files with 79 additions and 9 deletions.
1 change: 1 addition & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ Depends:
Imports:
cli,
dplyr,
tidyr,
furrr,
future,
piggyback (>= 0.1.4),
Expand Down
13 changes: 11 additions & 2 deletions R/census_helper_v2.R
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,9 @@
#' @param retry The number of retries at the census website if network interruption occurs.
#' @param use.counties A logical, defaulting to FALSE. Should census data be filtered by counties
#' available in \var{census.data}?
#' @param skip_bad_geos Logical. Option to have the function skip any geolocations that are not present
#' in the census data, returning a partial data set. Default is set to \code{FALSE}, which case it will
#' break and provide error message with a list of offending geolocations.
#' @return Output will be an object of class \code{data.frame}. It will
#' consist of the original user-input data with additional columns of
#' Census data.
Expand All @@ -65,8 +68,10 @@ census_helper_new <- function(
year = "2020",
census.data = NULL,
retry = 3,
use.counties = FALSE
use.counties = FALSE,
skip_bad_geos = FALSE
) {

if (geo == "precinct") {
stop("Error: census_helper_new function does not currently support precinct-level data.")
}
Expand Down Expand Up @@ -198,11 +203,15 @@ census_helper_new <- function(
#Check if geolocation missing from census object
if(any(is.na(voters.census$r_whi))){
miss_ind <- which(is.na(voters.census$r_whi))
stop("The following locations in the voter.file are not available in the census data ",
message("The following locations in the voter.file are not available in the census data.",
paste0("(listed as ", paste0(c("state",geo.merge), collapse="-"),"):\n"),
paste(do.call(paste, c(unique(voters.census[miss_ind, c("state",geo.merge)]),
sep="-")),
collapse = ", "))
if(skip_bad_geos == TRUE){
message("NOTE: Skipping unavailable geolocations. Returning partial data set.")
voters.census <- tidyr::drop_na(voters.census, r_whi)}
else(stop("Stopping predictions. Please revise census data and/or verify the correct year is being supplied. To skip these rows use 'skip_bad_geos = TRUE'"))
}

# }
Expand Down
8 changes: 8 additions & 0 deletions R/predict_race.R
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,9 @@
#' it should be coded as 1 for Democrat, 2 for Republican, and 0 for Other.
#' @param retry The number of retries at the census website if network interruption occurs.
#' @param impute.missing Logical, defaults to TRUE. Should missing be imputed?
#' @param skip_bad_geos Logical. Option to have the function skip any geolocations that are not present
#' in the census data, returning a partial data set. Default is set to \code{FALSE}, in which case it
#' will break and provide error message with a list of offending geolocations.
#' @param use.counties A logical, defaulting to FALSE. Should census data be filtered by counties
#' available in \var{census.data}?
#' @param model Character string, either "BISG" (default) or "fBISG" (for error-correction,
Expand Down Expand Up @@ -150,6 +153,7 @@ predict_race <- function(
party = NULL,
retry = 3,
impute.missing = TRUE,
skip_bad_geos = FALSE,
use.counties = FALSE,
model = "BISG",
race.init = NULL,
Expand Down Expand Up @@ -220,6 +224,7 @@ predict_race <- function(
census.data = census.data,
retry = retry,
impute.missing = impute.missing,
skip_bad_geos = skip_bad_geos,
census.surname = census.surname,
use.counties = use.counties)
} else {
Expand All @@ -237,6 +242,7 @@ predict_race <- function(
if(ctrl$verbose){
message("Using `predict_race` to obtain initial race prediction priors with BISG model")
}

race.init <- predict_race(voter.file = voter.file,
names.to.use = names.to.use,
year = year,
Expand All @@ -248,10 +254,12 @@ predict_race <- function(
census.data = census.data,
retry = retry,
impute.missing = TRUE,
skip_bad_geos = skip_bad_geos,
census.surname = census.surname,
use.counties = use.counties,
model = "BISG",
control = list(verbose=FALSE))

race.init <- max.col(
race.init[, paste0("pred.", c("whi", "bla", "his", "asi", "oth"))],
ties.method = "random"
Expand Down
7 changes: 6 additions & 1 deletion R/race_prediction_funs.R
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
#' @param party See documentation in \code{race_predict}.
#' @param retry See documentation in \code{race_predict}.
#' @param impute.missing See documentation in \code{race_predict}.
#' @param skip_bad_geos See documentation in \code{race_predict}.
#' @param names.to.use See documentation in \code{race_predict}.
#' @param race.init See documentation in \code{race_predict}.
#' @param name.dictionaries See documentation in \code{race_predict}.
Expand Down Expand Up @@ -268,6 +269,7 @@ NULL
#' New race prediction function, implementing classical BISG with augmented
#' surname dictionary, as well as first and middle name information.
#' @rdname modfuns

predict_race_new <- function(
voter.file,
names.to.use,
Expand All @@ -281,6 +283,7 @@ predict_race_new <- function(
census.data = NULL,
retry = 0,
impute.missing = TRUE,
skip_bad_geos = FALSE,
census.surname = FALSE,
use.counties = FALSE
) {
Expand Down Expand Up @@ -379,7 +382,8 @@ predict_race_new <- function(
year = year,
census.data = census.data,
retry = retry,
use.counties = use.counties
use.counties = use.counties,
skip_bad_geos = skip_bad_geos
)
}

Expand Down Expand Up @@ -433,6 +437,7 @@ predict_race_new <- function(
#' surname dictionary, as well as first and middle name information.
#' @importFrom dplyr pull
#' @rdname modfuns

predict_race_me <- function(
voter.file,
names.to.use,
Expand Down
7 changes: 6 additions & 1 deletion man/census_helper_new.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 3 additions & 0 deletions man/modfuns.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 7 additions & 1 deletion man/predict_race.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

21 changes: 19 additions & 2 deletions tests/testthat/test-predict_race_2010.R
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ test_that("Fails on territories", {
)
})

test_that("Fails on missing geolocation", {
test_that("Fails on missing geolocation if skip_bad_geos default is used", {
skip_on_cran()
set.seed(42)
data(voters)
Expand All @@ -138,10 +138,27 @@ test_that("Fails on missing geolocation", {
census.data = census,
use.counties = TRUE)
),
"The following locations in the voter\\.file are not available"
"Stopping predictions. Please revise"
)
})

test_that("Skip_bad_geos option successfully returns working geolocations", {
skip_on_cran()
set.seed(42)
data(voters)
census <- readRDS(test_path("data/census_test_nj_block_2010.rds"))
test_drop <- suppressMessages(predict_race(
voter.file = voters[voters$state == "NJ", ],
year = 2010,
census.geo = "block",
census.key = NULL,
census.data = census,
skip_bad_geos = TRUE,
use.counties = TRUE)
)
expect_equal(nrow(test_drop), 6)
})

test_that("Handles zero-pop. geolocations", {
skip_on_cran()
set.seed(42)
Expand Down
20 changes: 18 additions & 2 deletions tests/testthat/test-predict_race_2020.R
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ test_that("Fails on territories", {
)
})

test_that("Fails on missing geolocation", {
test_that("Fails on missing geolocation if skip_bad_geos default is used", {
skip_on_cran()
set.seed(42)
data(voters)
Expand All @@ -132,10 +132,26 @@ test_that("Fails on missing geolocation", {
census.data = census,
use.counties = TRUE)
),
"The following locations in the voter\\.file are not available"
"Stopping predictions. Please revise"
)
})

test_that("Skip_bad_geos option successfully returns working geolocations", {
skip_on_cran()
set.seed(42)
data(voters)
census <- readRDS(test_path("data/census_test_nj_block_2020.rds"))
test_drop <- suppressMessages(predict_race(
voter.file = voters[voters$state == "NJ", ],
census.geo = "block",
census.key = NULL,
census.data = census,
skip_bad_geos = TRUE,
use.counties = TRUE)
)
expect_equal(nrow(test_drop), 1)
})

test_that("Handles zero-pop. geolocations", {
skip_on_cran()
set.seed(42)
Expand Down

0 comments on commit 5aaea19

Please sign in to comment.