# mihirzaveri/chicago-gun-exercise

## chicago-gun-exercise
# Set the working directory to the folder that holds the downloaded data.
# The path is relative, so R must be started from the directory that
# contains `Documents/` for this call to succeed.
setwd("Documents/grad_school/dataviz-fall-2013/crime-exercise/")

# Keep strings as character vectors instead of converting them to factors.
options(stringsAsFactors = FALSE)

# Load the packages you will need. Each package is installed only when it is
# missing, so re-running the script does not download it again.
if (!requireNamespace("maptools", quietly = TRUE)) {
  install.packages("maptools")
}
library(maptools)

if (!requireNamespace("RColorBrewer", quietly = TRUE)) {
  install.packages("RColorBrewer")
}
library(RColorBrewer)

# Read the county file into a data frame called "data". header = FALSE tells
# R that the first row holds values, not column titles. The two columns are
# labelled county_original and guncount_original in the same step.
data <- setNames(
  read.delim("county-data.txt", header = FALSE),
  c("county_original", "guncount_original")
)

# Next, put the columns into a nicer format. Start by splitting each string in
# county_original at the "(" with strsplit() and store the result in an object
# called split. The result is a list (one character vector per row), not a
# plain vector.
# The column is referenced by its full name: `data$county_orig` only worked
# through `$` partial matching, which fails on tibbles and returns NULL as
# soon as a second column shares that prefix.
split <- strsplit(data$county_original, "\\(")

#now we need to write two functions that get the county and the state out of this list

# Return the first element of `item`, i.e. the text to the left of the "(".
get_first_item <- function(item) {
  item[1L]
}

# Return the second element of `item`, i.e. the text to the right of the "(".
get_second_item <- function(item) {
  item[2L]
}

# Apply get_first_item and get_second_item to every element of the list stored
# in split, and keep the results as new columns on the data set. vapply() is
# used instead of sapply() so each call must return exactly one string and the
# result is always a character vector, even when the list is empty.
data$county <- vapply(split, get_first_item, character(1))
data$state <- vapply(split, get_second_item, character(1))

# Remove the closing parenthesis left in the state column. fixed = TRUE makes
# gsub() treat ")" as a literal character rather than as a regular expression.
data$state <- gsub(")", "", data$state, fixed = TRUE)

# The initial data set is now mostly cleaned up. The goal is a choropleth map
# where darker areas sent more guns to Chicago. First strip the commas from
# the gun counts. The column is referenced by its full name,
# guncount_original, instead of relying on `$` partial matching of
# `guncount_orig`.
data$guncount_nocommas <- gsub(",", "", data$guncount_original, fixed = TRUE)

# Convert the cleaned counts from character to numeric.
data$guncount_clean <- as.numeric(data$guncount_nocommas)

# Sum the cleaned gun counts within each state and store the totals in a new
# data frame.
state_totals_data <- aggregate(
  x = data$guncount_clean,
  by = list(data$state),
  FUN = sum
)

# Give the two columns names that make sense.
colnames(state_totals_data) <- c("state", "guncount")

# The gun counts received from the police department are now totalled by
# state. Next, read the state outlines from the shapefile.
# NOTE(review): maptools has been retired from CRAN; confirm it is still
# available in your environment.
shapes <- maptools::readShapePoly("nytlayout_state.shp")

# Draw the bare outlines to check that the shapefile loaded.
plot(shapes)

# Build an ordinary data frame from the shapefile object; it will be joined
# with the gun counts later. head() shows the first rows.
map_data <- data.frame(shapes)
head(map_data)

# Per the original notes, STATE_ABBR arrives as a factor. Copy it into a
# character column with a friendlier name, then check the result.
map_data[["state"]] <- as.character(map_data$STATE_ABBR)
head(map_data)
class(map_data$state[1L])

# Check which state abbreviations in the gun totals also appear in the map
# data.
state_totals_data$state %in% map_data$state

# The original notes report two FALSE values here: Puerto Rico and Guam,
# which are not in map_data. They are simply ignored.

# To join, find the position in state_totals_data of each state in map_data.
# match() returns one position per map_data row. In the original notes the
# second number is 13, meaning HI (second in map_data) sits in the 13th
# position of the totals. Store the positions in a vector and print it.
data_order <- match(x = map_data$state, table = state_totals_data$state)
data_order

# Attach the gun counts in map_data order, e.g. the 13th total goes to the
# second map_data entry.
map_data$guncount <- state_totals_data$guncount[data_order]

# Choose the cut points that split the gun counts into nine colour classes.
breaks <- c(0, 50, 100, 200, 300, 500, 1000, 5000, 10000, 25000)

# Assign each state's count to a class, then convert the factor to its
# numeric codes so they can be used as positions in the palette.
buckets <- cut(x = map_data$guncount, breaks = breaks)
numeric_buckets <- as.numeric(buckets)

# Build a nine-colour "YlOrRd" palette, one colour per class.
colors <- brewer.pal(n = 9, name = "YlOrRd")

# Show the colour assigned to each value in the data.
colors[numeric_buckets]

# Finally, draw the map with each state filled by its class colour.
plot(shapes, col = colors[numeric_buckets])
title(main = "What state do guns come to Chicago from?")
	# Set the working directory to the folder that holds the downloaded data.
	# The path is relative, so it is resolved against the current directory.
	# NOTE(review): this repeats the setup at the top of the file; if both
	# copies run in one session this setwd() is resolved against the directory
	# already set above. Confirm whether the duplicate is intended.
	setwd("Documents/grad_school/dataviz-fall-2013/crime-exercise/")

	# Keep strings as character vectors instead of converting them to factors.
	options(stringsAsFactors = FALSE)

	# Load the packages you will need. Each package is installed only when it
	# is missing, so re-running the script does not download it again.
	if (!requireNamespace("maptools", quietly = TRUE)) {
		install.packages("maptools")
	}
	library(maptools)

	if (!requireNamespace("RColorBrewer", quietly = TRUE)) {
		install.packages("RColorBrewer")
	}
	library(RColorBrewer)

	# Read the county file into a data frame called "data". header = FALSE
	# tells R that the first row holds values, not column titles. The two
	# columns are labelled county_original and guncount_original in the same
	# step.
	data <- setNames(
		read.delim("county-data.txt", header = FALSE),
		c("county_original", "guncount_original")
	)

	# Next, put the columns into a nicer format. Start by splitting each string
	# in county_original at the "(" with strsplit() and store the result in an
	# object called split. The result is a list (one character vector per row),
	# not a plain vector.
	# The column is referenced by its full name: `data$county_orig` only worked
	# through `$` partial matching, which fails on tibbles and returns NULL as
	# soon as a second column shares that prefix.
	split <- strsplit(data$county_original, "\\(")

	#now we need to write two functions that get the county and the state out of this list

	# Return the first element of `item`, i.e. the text to the left of the "(".
	get_first_item <- function(item) {
		item[1L]
	}

	# Return the second element of `item`, i.e. the text to the right of the "(".
	get_second_item <- function(item) {
		item[2L]
	}

	# Apply get_first_item and get_second_item to every element of the list
	# stored in split, and keep the results as new columns on the data set.
	# vapply() is used instead of sapply() so each call must return exactly one
	# string and the result is always a character vector, even when the list is
	# empty.
	data$county <- vapply(split, get_first_item, character(1))
	data$state <- vapply(split, get_second_item, character(1))

	# Remove the closing parenthesis left in the state column. fixed = TRUE
	# makes gsub() treat ")" as a literal character rather than as a regular
	# expression.
	data$state <- gsub(")", "", data$state, fixed = TRUE)

	# The initial data set is now mostly cleaned up. The goal is a choropleth
	# map where darker areas sent more guns to Chicago. First strip the commas
	# from the gun counts. The column is referenced by its full name,
	# guncount_original, instead of relying on `$` partial matching of
	# `guncount_orig`.
	data$guncount_nocommas <- gsub(",", "", data$guncount_original, fixed = TRUE)

	# Convert the cleaned counts from character to numeric.
	data$guncount_clean <- as.numeric(data$guncount_nocommas)

	# Sum the cleaned gun counts within each state and store the totals in a
	# new data frame.
	state_totals_data <- aggregate(
		x = data$guncount_clean,
		by = list(data$state),
		FUN = sum
	)

	# Give the two columns names that make sense.
	colnames(state_totals_data) <- c("state", "guncount")

	# The gun counts received from the police department are now totalled by
	# state. Next, read the state outlines from the shapefile.
	# NOTE(review): maptools has been retired from CRAN; confirm it is still
	# available in your environment.
	shapes <- maptools::readShapePoly("nytlayout_state.shp")

	# Draw the bare outlines to check that the shapefile loaded.
	plot(shapes)

	# Build an ordinary data frame from the shapefile object; it will be joined
	# with the gun counts later. head() shows the first rows.
	map_data <- data.frame(shapes)
	head(map_data)

	# Per the original notes, STATE_ABBR arrives as a factor. Copy it into a
	# character column with a friendlier name, then check the result.
	map_data[["state"]] <- as.character(map_data$STATE_ABBR)
	head(map_data)
	class(map_data$state[1L])

	# Check which state abbreviations in the gun totals also appear in the map
	# data.
	state_totals_data$state %in% map_data$state

	# The original notes report two FALSE values here: Puerto Rico and Guam,
	# which are not in map_data. They are simply ignored.

	# To join, find the position in state_totals_data of each state in
	# map_data. match() returns one position per map_data row. In the original
	# notes the second number is 13, meaning HI (second in map_data) sits in
	# the 13th position of the totals. Store the positions in a vector and
	# print it.
	data_order <- match(x = map_data$state, table = state_totals_data$state)
	data_order

	# Attach the gun counts in map_data order, e.g. the 13th total goes to the
	# second map_data entry.
	map_data$guncount <- state_totals_data$guncount[data_order]

	# Choose the cut points that split the gun counts into nine colour classes.
	breaks <- c(0, 50, 100, 200, 300, 500, 1000, 5000, 10000, 25000)

	# Assign each state's count to a class, then convert the factor to its
	# numeric codes so they can be used as positions in the palette.
	buckets <- cut(x = map_data$guncount, breaks = breaks)
	numeric_buckets <- as.numeric(buckets)

	# Build a nine-colour "YlOrRd" palette, one colour per class.
	colors <- brewer.pal(n = 9, name = "YlOrRd")

	# Show the colour assigned to each value in the data.
	colors[numeric_buckets]

	# Finally, draw the map with each state filled by its class colour.
	plot(shapes, col = colors[numeric_buckets])
	title(main = "What state do guns come to Chicago from?")