Web-scraping all the Bureau of Labor Statistics tables and joining them into an R data frame
library(purrr)
library(rvest)
library(stringr)
library(readxl)
library(dplyr)
suppressPackageStartupMessages(library(lubridate))
setwd("~/Desktop")
# create the data directory (and its parent folder) if it doesn't exist yet
if(!dir.exists("./data/labor_data")) {dir.create("./data/labor_data", recursive = TRUE)}
# save the URL of the BLS Local Area Unemployment Statistics page
url <- "https://www.bls.gov/lau/#tables"
# download the page, then parse the saved HTML with read_html
download.file(url, destfile = "./data/uslabor.html")
us_county_labor_html <- read_html("./data/uslabor.html")
# extract the .xlsx links from the page's list items
us_county_labor_html %>%
  rvest::html_nodes("ul") %>%
  rvest::html_nodes("li") %>%
  rvest::html_nodes("a") %>%
  rvest::html_attr("href") %>%
  str_subset("\\.xlsx$") -> us_labor_urls
# the hrefs are relative, so paste the BLS domain onto each one
domain <- "https://www.bls.gov"
str_c(domain, us_labor_urls) -> us_labor_urls
# only need the years 2000 to 2016 (positions 3 through 19 in the link list at
# the time of writing; this indexing is brittle if BLS reorders the page)
us_labor_urls[3:19] -> us_labor_2000_2016
years <- 2000:2016
# a for loop that downloads each year's spreadsheet
for(i in seq_along(us_labor_2000_2016)){
  download.file(us_labor_2000_2016[i], destfile = paste("./data/labor_data/", years[i], ".xlsx", sep = ""), mode = "wb")
}
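# Optional check (an assumption on my part, not in the original script): make
# sure one workbook landed on disk for each of the seventeen years before
# reading them back in.
stopifnot(length(dir("./data/labor_data")) == length(years))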
# list the downloaded US labor files
labor_files <- dir("./data/labor_data")
# create a function that reads each downloaded file into a data frame
# (each sheet has 7 header rows to skip and one blank spacer column)
read_files <- function(x){
  read_excel(path = paste("./data/labor_data/", x, sep = ""), skip = 7,
             col_names = c("laus_code","state_fips_code","county_fips_code","county_name","year","blank","labor_force","employed","unemployed","unemployment_rate"),
             sheet = 1, na = "")
}
# map the function over each file name, yielding a list of data frames
map(labor_files, read_files) -> all_labor_data
# join all the US labor tables into one data frame
all_labor_data %>% reduce(full_join) -> all_labor_data
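# A minimal sanity check of the combined frame (my addition, not in the
# original): the year column should span 2000 through 2016 if every sheet was
# read and joined correctly.
sort(unique(all_labor_data$year))
dplyr::glimpse(all_labor_data)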