klahrich/explore_usa

## explore_usa
#+ chunk-0
library(acs)
library(magrittr)
library(dplyr)

#' # Step 1: get your free api key
#' Here's the [link](http://api.census.gov/data/key_signup.html)
#' Install your api key:

#+ chunk-0-b, eval=FALSE
# Not evaluated when the document is rendered: substitute your own key and
# run this call once by hand.
api.key.install("your-api-key-here")

#'
#' # Step 2: select a survey and a table to explore.
#' There is an excellent variety of data you can play with !
#' Here's the [link](http://factfinder.census.gov/faces/affhelp/jsf/pages/metadata.xhtml?lang=en&type=survey&id=survey.en.ACS_ACS)
#'
#' # Step 3: Fetch the data.
#' In the example below, I've chosen to explore the table number B15001.
#' First you need to call the geo.make function. The example below simply means "give me all states and counties"

#+ chunk-1
# "*" is the wildcard here, so the geography covers every county of every state.
county_geo <- geo.make(county = "*", state = "*")

#' Then we actually get the data. It's important to set the *col.names* parameter to "pretty" to get understandable variable names.

#+ chunk-2, cache=TRUE
# Fetch table B15001 for that geography: 5-year span ending in 2014.
education_acs <- acs.fetch(
  endyear = 2014,
  span = 5,
  geography = county_geo,
  table.number = "B15001",
  col.names = "pretty"
)

#' What you have at this point is an acs object.
#' Two slots are of particular interest to us: the @geography slot which contains the state/county name in the $NAME attribute.
#' Let's go ahead and extract those right away:

#+ chunk-3
# Split the geography labels on "," into a (county, state) pair per row.
# Base R is used on purpose: this script only attaches acs, magrittr and
# dplyr, so the stringr helpers str_split()/str_replace() are not available
# and would stop with "could not find function".
# as.character() guards against NAME being stored as a factor.
state_county <- strsplit(
  as.character(education_acs@geography$NAME),
  ",",
  fixed = TRUE
)

# County is the first piece; remove the first " County" occurrence from it.
# vapply() guarantees a character vector whatever the input length.
county <- sub(
  " County",
  "",
  vapply(state_county, `[[`, character(1), 1),
  fixed = TRUE
)

# State is the second piece; splitting on "," alone leaves the blank that
# followed the comma, so strip surrounding whitespace.
state <- trimws(vapply(state_county, `[[`, character(1), 2))

#' And the @estimate slot which contains the actual census values.
#' Now, the @estimate slot is actually a matrix:

#+ chunk-4
# Show the structure of the estimate matrix held in the acs object.
education_acs@estimate %>% str()

#' # Step 4: Tidy your data
#' As you can see, there is a separate column for every level of drilldown.
#' So we have a little bit of work to do in order to get a tidy dataset. Come on then, let's get to it !
#'
#' In the end, what we really want, is a dataframe in long format, with the state and county variables,
#' then one variable for the education level, one for the age group, one for the gender,
#' and finally the census value.
#'
#' Because there are so many columns, and also because **in R we trust**,
#' there is no way in hell we are going to do this manually.
#'
#' You can check the code below if you're curious, suffice it to say that the *expand.grid* base function was super useful !

#+ chunk-5
# Labels for the three drill-down dimensions. "Total" is kept in `education`
# so the label grid lines up with the matrix columns; it is filtered out later.
education <- c("Total", "< 9th grade", "< 12th grade", "High school", "College", "Associate", "Bachelor", "Graduate")
age <- c("18-24", "25-34", "35-44", "45-64", "65+")
sex <- c("Male", "Female")

# Positions of the estimate-matrix columns to keep. Columns 1, 2 and 43 are
# skipped (presumably the overall and per-sex totals; check colnames to confirm).
columns <- c(3:42, 44:83)

#+ cache=TRUE
# One label row per selected column: education varies fastest, then age, then sex.
df_exp <- expand.grid(education = education, age = age, sex = sex)

# Fail loudly if the label grid and the column selection ever drift apart;
# otherwise the labels would silently become NA.
stopifnot(nrow(df_exp) == length(columns))

# Loop over the selected columns, building one long-format piece per column.
# The pieces are collected in a list and bound once at the end, rather than
# growing the data frame with rbind() on every iteration (which recopies the
# accumulated rows each time). Row names are reset to plain integers.
pieces <- lapply(seq_along(columns), function(i) {
  data.frame(
    county = county,
    state = state,
    sex = df_exp$sex[i],
    age = df_exp$age[i],
    education = df_exp$education[i],
    value = education_acs@estimate[, columns[i]],
    row.names = NULL
  )
})
df_education <- do.call(rbind, pieces)

#' I had to include the 'Total' level of education in the loop because of the way the columns of the matrix are enumerated, but I don't actually want to keep it:

#+ cache=TRUE
# Discard the rows labelled with the "Total" education level.
df_education <- df_education %>%
  filter(education != "Total")

#' And guess what my friends...we now have a freakin cool dataset with 6 variables and 225400 observations:
# Preview the first rows of the tidy result.
head(df_education)

#' That's it for today ! We're gonna keep it short & sweet. Our data is ready, we can take a break and come back later to play with it.
#'
#' In the meantime, if you want to get a headstart, or have any suggestions, please feel free to comment.
#'
#' Next time, we'll be doing some visualizations using this dataset, and maybe we'll do some webscraping and merge some interesting information onto it.
#'
#' Come back soon !
	#+ chunk-0
	library(acs)
	library(magrittr)
	library(dplyr)

	#' #Step 1: get your free api key
	#' Here's the [link](http://api.census.gov/data/key_signup.html)
	#' Install your api key:

	#+ chunk-0-b, eval=FALSE
	# Not evaluated when the document is rendered: substitute your own key and
	# run this call once by hand.
	api.key.install("your-api-key-here")

	#'
	#' #Step 2: select a survey and a table to explore.
	#' There is an excellent variety of data you can play with !
	#' Here's the [link](http://factfinder.census.gov/faces/affhelp/jsf/pages/metadata.xhtml?lang=en&type=survey&id=survey.en.ACS_ACS)
	#'
	#' #Step 3: Fetch the data.
	#' In the example below, I've chosen to explore the table number B15001.
	#' First you need to call the geo.make function. The example below simply means "give me all states and counties"

	#+ chunk-1
	# "*" is the wildcard that selects every state and every county. The empty
	# strings previously used here do not express "all states and counties".
	county_geo <- geo.make(state = "*", county = "*")

	#' Then we actually get the data. It's important to set the col.names parameter to "pretty" to get understandable variable names.

	#+ chunk-2, cache=TRUE
	# Fetch table B15001 for that geography: 5-year span ending in 2014.
	education_acs <- acs.fetch(
	  geography = county_geo,
	  table.number = "B15001",
	  col.names = "pretty",
	  endyear = 2014,
	  span = 5
	)

	#' What you have at this point is an acs object.
	#' Two slots are of particular interest to us: the @geography slot which contains the state/county name in the $NAME attribute.
	#' Let's go ahead and extract those right away:

	#+ chunk-3
	# Split the geography labels on "," into a (county, state) pair per row.
	# Base R is used on purpose: this script only attaches acs, magrittr and
	# dplyr, so the stringr helpers str_split()/str_replace() are not available
	# and would stop with "could not find function".
	# as.character() guards against NAME being stored as a factor.
	state_county <- strsplit(
	  as.character(education_acs@geography$NAME),
	  ",",
	  fixed = TRUE
	)

	# County is the first piece; remove the first " County" occurrence from it.
	# vapply() guarantees a character vector whatever the input length.
	county <- sub(
	  " County",
	  "",
	  vapply(state_county, `[[`, character(1), 1),
	  fixed = TRUE
	)

	# State is the second piece; splitting on "," alone leaves the blank that
	# followed the comma, so strip surrounding whitespace.
	state <- trimws(vapply(state_county, `[[`, character(1), 2))

	#' And the @estimate slot which contains the actual census values.
	#' Now, the @estimate slot is actually a matrix:

	#+ chunk-4
	# Show the structure of the estimate matrix held in the acs object.
	education_acs@estimate %>% str()

	#' #Step 4: Tidy your data
	#' As you can see, there is a separate column for every level of drilldown.
	#' So we have a little bit of work to do in order to get a tidy dataset. Come on then, let's get to it !
	#'
	#' In the end, what we really want, is a dataframe in long format, with the state and county variables,
	#' then one variable for the education level, one for the age group, one for the gender,
	#' and finally the census value.
	#'
	#' Because there are so many columns, and also because in R we trust,
	#' there is no way in hell we are going to do this manually.
	#'
	#' You can check the code below if you're curious, suffice it to say that the expand.grid base function was super useful !

	#+ chunk-5
	# Labels for the three drill-down dimensions. "Total" is kept in `education`
	# so the label grid lines up with the matrix columns; it is filtered out later.
	education <- c("Total", "< 9th grade", "< 12th grade", "High school", "College", "Associate", "Bachelor", "Graduate")
	age <- c("18-24", "25-34", "35-44", "45-64", "65+")
	sex <- c("Male", "Female")

	# Positions of the estimate-matrix columns to keep. Columns 1, 2 and 43 are
	# skipped (presumably the overall and per-sex totals; check colnames to confirm).
	columns <- c(3:42, 44:83)

	#+ cache=TRUE
	# One label row per selected column: education varies fastest, then age, then sex.
	df_exp <- expand.grid(education = education, age = age, sex = sex)

	# Fail loudly if the label grid and the column selection ever drift apart;
	# otherwise the labels would silently become NA.
	stopifnot(nrow(df_exp) == length(columns))

	# Loop over the selected columns, building one long-format piece per column.
	# The pieces are collected in a list and bound once at the end, rather than
	# growing the data frame with rbind() on every iteration (which recopies the
	# accumulated rows each time). Row names are reset to plain integers.
	pieces <- lapply(seq_along(columns), function(i) {
	  data.frame(
	    county = county,
	    state = state,
	    sex = df_exp$sex[i],
	    age = df_exp$age[i],
	    education = df_exp$education[i],
	    value = education_acs@estimate[, columns[i]],
	    row.names = NULL
	  )
	})
	df_education <- do.call(rbind, pieces)

	#' I had to include the 'Total' level of education in the loop because of the way the columns of the matrix are enumerated, but I don't actually want to keep it:

	#+ cache=TRUE
	# Discard the rows labelled with the "Total" education level.
	df_education <- df_education %>%
	  filter(education != "Total")

	#' And guess what my friends...we now have a freakin cool dataset with 6 variables and 225400 observations:
	# Preview the first rows of the tidy result.
	head(df_education)

	#' That's it for today ! We're gonna keep it short & sweet. Our data is ready, we can take a break and come back later to play with it.
	#'
	#' In the meantime, if you want to get a headstart, or have any suggestions, please feel free to comment.
	#'
	#' Next time, we'll be doing some visualizations using this dataset, and maybe we'll do some webscraping and merge some interesting information onto it.
	#'
	#' Come back soon !