From e1cbd714a5655f3071792f306fb9fb0b4fa46ebf Mon Sep 17 00:00:00 2001 From: Alexander Rossell Hayes Date: Fri, 8 Dec 2023 11:11:11 -0800 Subject: [PATCH 1/5] fix(census_helper_new): fix bug when checking if `geo` is "precinct" --- R/census_helper_v2.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/census_helper_v2.R b/R/census_helper_v2.R index 8e4ddd5..8b310de 100644 --- a/R/census_helper_v2.R +++ b/R/census_helper_v2.R @@ -72,7 +72,7 @@ census_helper_new <- function( skip_bad_geos = FALSE ) { - if (geo == "precinct") { + if ("precinct" %in% geo) { stop("Error: census_helper_new function does not currently support precinct-level data.") } From 59a340dac1e233e98a1e96468f89725b0150a719 Mon Sep 17 00:00:00 2001 From: Alexander Rossell Hayes Date: Fri, 8 Dec 2023 11:12:23 -0800 Subject: [PATCH 2/5] fix(.predict_race_old): use `all(is.na())` in `if` statements --- R/race_prediction_funs.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/race_prediction_funs.R b/R/race_prediction_funs.R index ce96e82..23b4564 100644 --- a/R/race_prediction_funs.R +++ b/R/race_prediction_funs.R @@ -82,12 +82,12 @@ NULL stop("Voter data frame needs to have a column named surname") } } else { - if (missing(census.geo) || is.null(census.geo) || is.na(census.geo) || census.geo %in% c("county", "tract", "block", "place") == FALSE) { + if (missing(census.geo) || is.null(census.geo) || all(is.na(census.geo)) || census.geo %in% c("county", "tract", "block", "place") == FALSE) { stop("census.geo must be either 'county', 'tract', 'block', or 'place'") } else { message(paste("Proceeding with Census geographic data at", census.geo, "level...")) } - if (missing(census.data) || is.null(census.data) || is.na(census.data)) { + if (missing(census.data) || is.null(census.data) || all(is.na(census.data))) { census.key <- validate_key(census.key) message("Downloading Census geographic data using provided API key...") } else { From 4ff2f0f42e1748c52b74ac45d9407c997600ae0b Mon Sep 17 00:00:00 2001 From: Alexander Rossell Hayes Date: Fri, 8 Dec 2023 11:12:57 -0800 Subject: [PATCH 3/5] test(census_geo_api): add comment explaining snapshots --- tests/testthat/test-census_geo_api.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/testthat/test-census_geo_api.R b/tests/testthat/test-census_geo_api.R index 4fd02c8..9d18679 100644 --- a/tests/testthat/test-census_geo_api.R +++ b/tests/testthat/test-census_geo_api.R @@ -1,6 +1,8 @@ skip_if_not(nzchar(Sys.getenv("CENSUS_API_KEY"))) test_that("snapshot", { + # These snapshots were generated using the calculations in v2.0.0 + # and verified that the calculations resulted in the same numbers for PR #120. expect_snapshot_value( census_geo_api(state = "DE", geo = "county", year = "2020"), style = "deparse" From 7be05c092c2a309d2654e780b377581de12870cf Mon Sep 17 00:00:00 2001 From: Alexander Rossell Hayes Date: Fri, 8 Dec 2023 11:18:03 -0800 Subject: [PATCH 4/5] todo: add TODO comments --- R/census_helper_v2.R | 5 +++++ README.Rmd | 4 ++++ tests/testthat/test-census_geo_api.R | 3 +++ tests/testthat/test-census_geo_api_names.R | 2 ++ 4 files changed, 14 insertions(+) diff --git a/R/census_helper_v2.R b/R/census_helper_v2.R index 8b310de..8744389 100644 --- a/R/census_helper_v2.R +++ b/R/census_helper_v2.R @@ -160,6 +160,7 @@ census_helper_new <- function( state_must_be_downloaded <- toDownload || is.null(census.data[[state]]) || census.data[[state]]$year != year || + # TODO: Why do we always redownload if sex or age == TRUE? census.data[[state]]$age != FALSE || census.data[[state]]$sex != FALSE @@ -174,6 +175,10 @@ census_helper_new <- function( ## Calculate Pr(Geolocation | Race) if (any(c("P2_005N", "P005003") %in% names(census))) { + # TODO: Add message that they're using a legacy data source + # TODO: Add test that we get the same ratios with legacy and new tables for 2020 + # Old table: Redistricting (Pl-some numbers) (does not have age, sex, or ZCTAs) + # New table: DHC (does have age, sex, and ZCTA) vars_ <- census_geo_api_names_legacy(year = year) } else { vars_ <- census_geo_api_names(year) diff --git a/README.Rmd b/README.Rmd index b900a04..ccf678a 100644 --- a/README.Rmd +++ b/README.Rmd @@ -210,6 +210,10 @@ future::plan(future.callr::callr) # ... ``` +```{comment} +Add guidance for starting a new analysis vs. replicating a previous analysis +``` + ## Census Data This package uses the Census Bureau Data API but is not endorsed or certified by the Census Bureau. diff --git a/tests/testthat/test-census_geo_api.R b/tests/testthat/test-census_geo_api.R index 9d18679..d4d3534 100644 --- a/tests/testthat/test-census_geo_api.R +++ b/tests/testthat/test-census_geo_api.R @@ -1,6 +1,9 @@ skip_if_not(nzchar(Sys.getenv("CENSUS_API_KEY"))) test_that("snapshot", { + # TODO: Test that sub-geographies sum to match pooled geographies (e.g. blocks sum to block groups, sum to tracts, sum to counties) + # TODO: Test that age/sex subsets sum to race totals + # These snapshots were generated using the calculations in v2.0.0 # and verified that the calculations resulted in the same numbers for PR #120. expect_snapshot_value( diff --git a/tests/testthat/test-census_geo_api_names.R b/tests/testthat/test-census_geo_api_names.R index 2ca9d84..2d7bd79 100644 --- a/tests/testthat/test-census_geo_api_names.R +++ b/tests/testthat/test-census_geo_api_names.R @@ -1,4 +1,6 @@ test_that("census_geo_api_names() for 2020", { + # TODO: Verify against table names here: https://api.census.gov/data/2020/dec/dhc/variables.html + expect_equal( census_geo_api_names("2020"), list( From 4943cff1b069aad59a679651cb04a28bcd0f487e Mon Sep 17 00:00:00 2001 From: Alexander Rossell Hayes Date: Fri, 8 Dec 2023 11:32:51 -0800 Subject: [PATCH 5/5] refactor: replace loops in `census_helper_new()` and `predict_race_me()` with explicit calculations --- R/census_helper_v2.R | 11 ++++++----- R/race_prediction_funs.R | 23 +++++++++++------------ 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/R/census_helper_v2.R b/R/census_helper_v2.R index 8744389..358260b 100644 --- a/R/census_helper_v2.R +++ b/R/census_helper_v2.R @@ -186,11 +186,12 @@ census_helper_new <- function( drop <- match(c("state", unlist(vars_)), names(census)) geoPopulations <- rowSums(census[,names(census) %in% vars_]) - - for (i in seq_along(vars_)) { - census[[names(vars_)[[i]]]] <- - rowSums(census[, vars_[[i]], drop = FALSE]) / geoPopulations - } + + census$r_whi <- rowSums(census[, vars_[["r_whi"]], drop = FALSE]) / (geoPopulations) #Pr(White | Geo) + census$r_bla <- rowSums(census[, vars_[["r_bla"]], drop = FALSE]) / (geoPopulations) #Pr(Black | Geo) + census$r_his <- rowSums(census[, vars_[["r_his"]], drop = FALSE]) / (geoPopulations) #Pr(Latino | Geo) + census$r_asi <- rowSums(census[, vars_[["r_asi"]], drop = FALSE]) / (geoPopulations) #Pr(Asian or NH/PI | Geo) + census$r_oth <- rowSums(census[, vars_[["r_oth"]], drop = FALSE]) / (geoPopulations) #Pr(AI/AN, Other, or Mixed | Geo) # check locations with zero people # get average without places with zero people, and assign that to zero locs. diff --git a/R/race_prediction_funs.R b/R/race_prediction_funs.R index 23b4564..f883336 100644 --- a/R/race_prediction_funs.R +++ b/R/race_prediction_funs.R @@ -545,7 +545,7 @@ predict_race_me <- function( vars_ <- census_geo_api_names(year = year) - tmp_tabs <- lapply( + N_rg <- purrr::map( census.data, function(x) { all_names <- names(x[[census.geo]]) @@ -554,20 +554,19 @@ predict_race_me <- function( vars_ <- census_geo_api_names_legacy(year = year) } - tmp <- x[[census.geo]][, c(geo_id_names, grep("^P[0-2]", all_names, value = TRUE))] + totals <- x[[census.geo]][, match(c(geo_id_names, unlist(vars_)), all_names)] - for (i in seq_along(vars_)) { - tmp[[names(vars_)[[i]]]] <- - rowSums(tmp[, vars_[[i]], drop = FALSE]) - } - - all_names <- names(tmp) - ## Totals - tmp_la <- tmp[, c(geo_id_names, grep("^r_", all_names, value = TRUE))] - return(list(tots = tmp_la)) + totals$r_whi <- rowSums(totals[, vars_[["r_whi"]], drop = FALSE]) # White population + totals$r_bla <- rowSums(totals[, vars_[["r_bla"]], drop = FALSE]) # Black population + totals$r_his <- rowSums(totals[, vars_[["r_his"]], drop = FALSE]) # Latino population + totals$r_asi <- rowSums(totals[, vars_[["r_asi"]], drop = FALSE]) # Asian + NH/PI population + totals$r_oth <- rowSums(totals[, vars_[["r_oth"]], drop = FALSE]) # AI/AN + Other + Mixed population + + totals <- totals[, -match(unlist(vars_), names(totals))] + totals } ) - N_rg <- do.call(rbind, lapply(tmp_tabs, function(x) x$tots)) + N_rg <- dplyr::bind_rows(N_rg) N_rg_geo <- do.call(paste, N_rg[, geo_id_names]) ## Subset to geo's in vf N_rg <- N_rg[N_rg_geo %in% geo_id, ]