Skip to content

Instantly share code, notes, and snippets.

@chuckpurvis
Last active March 17, 2022 20:45
Show Gist options
  • Save chuckpurvis/35f2e4968a2a815ce38b1756515f8ccf to your computer and use it in GitHub Desktop.
Save chuckpurvis/35f2e4968a2a815ce38b1756515f8ccf to your computer and use it in GitHub Desktop.
R-script using tidycensus, purrr and dplyr to pull American Community Survey data for single years (2006-2019) and for 3 nonoverlapping 5-year periods: 2006-2010, 2011-2015, 2016-2020. The 2016-2020 5-year ACS data was released today, March 17, 2022. Note that the geographic boundaries from the 2016-2020 data are Census 2020 geographies. All oth…
########################################################################
# ACS_AllYears_Journey2Work_purrr_revd.r
# Use the purrr function map_dfr to iterate tidycensus call for 1-year, 5-year ACS data.
# Data on Means of Transportation to Work Variables, All ACS years
# Nation and US States and counties and places (added 1/28/22)
#
# American Community Survey, 1-year, 2005-2019 data
# ACS 5-year data: 2005-2009, 2010-2014, 2015-2019
# ACS 5-year data: 2006-2010, 2010-2014, 2015-2019 (this works for B08006....)
## ACS 5-year data: 2006-2010, 2011-2015, 2016-2020 @@ should work after 3/17/22.
#
# Data is meant to be "stacked" with one row/record for each geography/year combination
# -- January 18, 2022 --
## -- January 28, 2022 -- (Add County and Place!!!)
## -- February 7, 2022 -- "skip-a-year" is working. And new sets of
## nonoverlapping 5-yrs when 2016-2020 is released 3/17/22
## -- March 17, 2022 --
########################################################################
# These R packages were installed in previous sessions, so no need to re-install
# install.packages("tidyverse")
# install.packages("tidycensus")
# install.packages("janitor")
# install.packages("purrr")
# Load relevant libraries into R-session.
library(purrr)
library(tidyverse)
library(tidycensus)
library(janitor)
library(plyr)
library(dplyr)
library(xlsx)
# setwd("~/Desktop/tidycensus_work/output")
setwd("~/Desktop/tidycensus_work")
getwd()
# Census API Key was installed in previous sessions, so no need to re-install
# census_api_key("fortycharacterkeysentbyCensusBureau",install=TRUE)
Sys.getenv("CENSUS_API_KEY")
# Set a list of variables to extract in each iteration of get_acs
# Total Population is problematic for 2005-09 since GQ pop wasn't collected in 2005....
# selvars2 was a previous "best of" variables for introductory analyses....
selvars2 <- c(# TotalPop_ = "B06001_001", # Total Population
Med_HHInc_ = "B19013_001", # Median Household Income
Agg_HHInc_ = "B19025_001", # Aggregate Household Income
HHldPop_ = "B11002_001", # Population in Households
Househlds_ = "B25003_001", # Total Households
Owner_OccDU_= "B25003_002", # Owner-Occupied Dwelling Units
Rent_OccDU_ = "B25003_003", # Renter-Occupied Dwelling Units
Med_HHVal_ = "B25077_001")
selvars <- c(total_pop_ = "B06001_001", # Total Population
hh_pop_ = "B11002_001", # Population in Households
households_ = "B25003_001", # Total Households
agg_commtime_ = "B08013_001", # Aggregate Travel Time to Work
agg_vehicle_ = "B08015_001", # Aggregate Vehicles Used in Journey-to-Work
total_worker_ = "B08006_001",
drove_alone_ = "B08006_003",
carpool_2_ = "B08006_005",
carpool_3_ = "B08006_006",
carpool_4p_ = "B08006_007",
transit_ = "B08006_008",
bicycle_ = "B08006_014",
walked_ = "B08006_015",
other3_ = "B08006_016", # taxicab, motorcycle, other means
at_home_ = "B08006_017",
bus_ = "B08006_009",
subway_ = "B08006_010",
railroad_ = "B08006_011",
lightrail_ = "B08006_012",
ferry_ = "B08006_013")
###################################################################
# Iterate on ACS 1-year data, using the purrr function "map_dfr"
###################################################################
test1 <- get_acs(survey="acs1",geography="place",state="CA",variables=selvars,
year=2019,output='wide') %>%
dplyr::arrange(GEOID)
# This doesn't work for 2005 since the ACS 2005 B08006 table
# had extra records for Carpools: carpool 4, carpool 5-6, and carpool 7+
# years <- 2005:2020
# Testing 2020 to see what kind of error message is generated....
# Running 2020 single year ACS gives an "Error: Your API call has errors".
# Perhaps add a WARNING to users to NOT to ask for 2020 ACS1 data!
# years <- 2006:2020 # Asking for 2020 ACS1 will bomb.
years <- 2006:2019
names(years) <- years
# Testing skip-a-year to see what how we'll handle 2020-X data.....
# years <- c(2009:2017,2019)
# This does indeed work AOK.....
# United States, 2006-2019, single year ACS
zeta1 <- purrr::map_dfr(years, ~{ get_acs(survey = "acs1",
geography = "us", # state = "CA",
variables = selvars, output='wide',
year = .x)}, .id = "year")
# All US States, 2006-2019, single year ACS
zeta2 <- purrr::map_dfr(years, ~{ get_acs(survey = "acs1",
geography = "state", # state = "CA",
variables = selvars, output='wide',
year = .x)}, .id = "year") %>%
dplyr::arrange(GEOID,year)
# Large US Counties, 2006-2019, single year ACS (Population 65K+)
zeta3 <- purrr::map_dfr(years, ~{ get_acs(survey = "acs1",
geography = "county", # state = "CA",
variables = selvars, output='wide',
year = .x)}, .id = "year") %>%
dplyr::arrange(GEOID,year)
# Large US Places, 2006-2019, single year ACS (Population 65K+)
zeta4 <- purrr::map_dfr(years, ~{ get_acs(survey = "acs1",
geography = "place", # state = "CA",
variables = selvars, output='wide',
year = .x)}, .id = "year") %>%
dplyr::arrange(GEOID,year)
########################################################################
## Output the ACS single year databases....
########################################################################
# note that my working directory (wd) is for a Mac computer.
setwd("~/Desktop/tidycensus_work/output")
# Export the data frames to CSV files, for importing to Excel
write.csv(zeta1,"ACS_AllYears_US_meansoftransportation_2006_2019_revd.csv")
write.csv(zeta2,"ACS_AllYears_states_meansoftransportation_2006_2019_revd.csv")
write.csv(zeta3,"ACS_AllYears_counties_meansoftransportation_2006_2019_revd.csv")
write.csv(zeta4,"ACS_AllYears_places_meansoftransportation_2006_2019_revd.csv")
# just write them in a master XLSX workbook.....
# Grrrr... this doesn't work... java heap space error, whatever that is.....
write.xlsx2(zeta1, "ACS_AllYears_journey_to_work_master_2006_2019.xlsx",
sheetName="nation", append=FALSE)
write.xlsx2(zeta2, "ACS_AllYears_journey_to_work_master_2006_2019.xlsx",
sheetName="states", append=TRUE)
write.xlsx2(zeta3, "ACS_AllYears_journey_to_work_master_2006_2019.xlsx",
sheetName="counties", append=TRUE)
write.xlsx2(zeta4, "ACS_AllYears_journey_to_work_master_2006_2019.xlsx",
sheetName="places", append=TRUE)
###################################################################
# Iterate on ACS 5-year data, using the purrr function "map_dfr"
###################################################################
# This works ok:
test2 <- get_acs(survey="acs5",year=2020,
geography="us", variables="B08006_001", output="wide")
# with years 2010, 2014 and 2019, this is a one-year overlap between
# the first two periods.... 2009 --- bombed for some reason.....
# The new 2016-2020 ACS 5-year data is scheduled for release 3/17/22.....
# Data for 2006-2010, 2011-2015, 2016-2020:
years5 <- c(2010,2015,2020)
names(years5) <- years5
## years5 <- c(2010,2014,2019) ... this works, too, but overlapping periods.
# This code is choking on selvars (means of transportation to work)
# getting an "unknown variable 'B08006_001E'" error??? :(
# re-ran... it works on 2006-2010; 2010-2014; 2015-2019..... weird!!
# National, ACS 5-years (2006-2010, 2011-2015, 2016-2020)
zeta5 <- purrr::map_dfr(years5, ~{ get_acs(survey = "acs5",
geography = "us", # state = "CA",
variables = selvars, output='wide',
year = .x)}, .id = "year")
# State, ACS 5-years (2006-2010, 2011-2015, 2016-2020)
zeta6 <- purrr::map_dfr(years5, ~{ get_acs(survey = "acs5",
geography = "state", # state = "CA",
variables = selvars, output='wide',
year = .x)}, .id = "year") %>%
dplyr::arrange(GEOID,year)
# County, ACS 5-years (2006-2010, 2011-2015, 2016-2020)
zeta7 <- purrr::map_dfr(years5, ~{ get_acs(survey = "acs5",
geography = "county", # state = "CA",
variables = selvars, output='wide',
year = .x)}, .id = "year") %>%
dplyr::arrange(GEOID,year)
# Place, ACS 5-years (2006-2010, 2011-2015, 2016-2020)
zeta8 <- purrr::map_dfr(years5, ~{ get_acs(survey = "acs5",
geography = "place", # state = "CA",
variables = selvars, output='wide',
year = .x)}, .id = "year") %>%
dplyr::arrange(GEOID,year)
# This last dataframe, zeta8, has 90,996 records... pretty big!
########################################################################
## Output the ACS five year databases....
########################################################################
setwd("~/Desktop/tidycensus_work/output")
# Export the data frames to CSV files, for importing to Excel
write.csv(zeta5,"ACS_5Years_US_meansoftransportation_2010_2015_2020.csv")
write.csv(zeta6,"ACS_5Years_states_meansoftransportation_2010_2015_2020.csv")
write.csv(zeta7,"ACS_5Years_counties_meansoftransportation_2010_2015_2020.csv")
write.csv(zeta8,"ACS_5Years_places_meansoftransportation_2010_2015_2020.csv")
########################################################################
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment