# michelleboisson/data-without-borders-hw2-part1.R

## data-without-borders-hw2-part1.R
# Write code to return the percentage of people who were frisked for each
# race.  In other words, count up the number of people who were frisked for a given race
# divided by the number of people of that race stopped.  Which race leads to the highest
# percentage of frisks?  Which one the lowest?

# Read in the stop-and-frisk data; as.is = TRUE keeps text columns as
# character strings instead of converting them to factors.
snf <- read.csv(
  "http://www.jakeporway.com/teaching/data/snf_2.csv",
  as.is = TRUE
)

# Meaning of each race code in the data set.
race.labels <- c(
  "-1" = "undefined",
  "1" = "black",
  "2" = "black Hispanic",
  "3" = "white Hispanic",
  "4" = "white",
  "5" = "Asian/Pacific Islander",
  "6" = "American Indian/Native Alaskan"
)

# Percentage frisked for a race = stops of that race with frisked == 1,
# divided by all stops of that race. Splitting the frisked flag by race
# computes this for every race code in one pass.
# The earlier form, table(race == k & frisked == 1) / table(race == k),
# also printed a FALSE column (e.g. 144%) that is not a percentage of
# anything; only its TRUE column was the frisk rate.
frisk.pct.by.race <- vapply(
  split(snf$frisked == 1, snf$race),
  function(was.frisked) {
    sum(was.frisked, na.rm = TRUE) / length(was.frisked) * 100
  },
  numeric(1)
)

# Replace the race codes with readable labels; a code that is not listed in
# race.labels keeps its code as the label.
race.codes <- names(frisk.pct.by.race)
names(frisk.pct.by.race) <- ifelse(
  race.codes %in% names(race.labels),
  race.labels[race.codes],
  race.codes
)
frisk.pct.by.race

# Race with the highest and with the lowest frisk percentage.
frisk.pct.by.race[which.max(frisk.pct.by.race)]
frisk.pct.by.race[which.min(frisk.pct.by.race)]

# Frisk percentages recorded from the original run:
#   black                           59.30879
#   black Hispanic                  59.71156   (highest)
#   white Hispanic                  55.76884
#   white                           41.90888   (lowest)
#   Asian/Pacific Islander          48.51948
#   American Indian/Native Alaskan  45.93301
#   undefined                       48.63133


# Plot the number of times each crime occurs in descending order (we’ve learned a couple of
# ways to do this, though using sort(), table() and that new type= parameter to plot() is
# your best bet). What does this distribution of crimes look like? In other words, are there
# an equal number of every kind of crime or are there a few that dominate?

# Count the stops for each suspected crime, most frequent first. The counts
# stay a table: wrapping a table in data.frame() produces a label column next
# to the count column, and in current R sum() refuses to total a data frame
# that contains that label column.
all.crimes.sorted <- rev(sort(table(snf$crime.suspected)))
plot(all.crimes.sorted, xlab = "crime suspected", ylab = "number of stops")


#Well I’m kind of answering that question for you here – let’s take the top 30
#suspected crimes and look at those.  If we were to just look at stops where the
#crime.suspected was one of the top 30 crimes, what percentage of the stops would that
#cover?  Do you think that’s enough?

# head() instead of [1:30]: with fewer than 30 distinct crimes, indexing past
# the end would add NA entries and turn the sums below into NA.
top.30.crimes <- head(all.crimes.sorted, 30)

# Number of stops covered by the top 30 crimes.
sum(top.30.crimes)
#[1] 53048

# The same count as a percentage of all stops with a suspected crime.
sum(top.30.crimes) / sum(all.crimes.sorted) * 100
#[1] 91.32194
#the top 30 crimes cover 91.3% of all crimes.


#Write code to create a variable called “crime.abbv” that consists of just the
#first three letters of crime.suspected and show the code to add it to our main data frame.
#Now what percentage of the stops do the top 30 crime.abbvs account for?
# Abbreviate every suspected crime to its first three letters and store the
# abbreviation as a new column of the main data frame.
crime.abbr <- substr(snf$crime.suspected, 1, 3)
snf$crime.abbr <- crime.abbr

# Stops per abbreviation, most frequent first. Kept as a table so that sum()
# adds up the counts; going through data.frame() would add a label column
# that sum() cannot total in current R.
all.crimes.sorted.abbr <- rev(sort(table(snf$crime.abbr)))

# head() instead of [1:30] so that fewer than 30 abbreviations does not
# introduce NA entries.
new.top.30.crimes <- head(all.crimes.sorted.abbr, 30)
new.top.30.crimes

# Share of stops covered by the top 30 abbreviations, measured against the
# total of this same table rather than a table built elsewhere in the script.
sum(new.top.30.crimes) / sum(all.crimes.sorted.abbr) * 100
#[1] 98.43172
#Now, using the abbreviated names for the crimes suspected, the top 30 make up 98.4% of all crimes.


#Write code to show the top 3 crimes each race is suspected of (rev(),
#sort(), and table() are your friends here again, but you’ll have to subset the data by race
#first).  Huh.  If you do this right, almost all the top 3’s should be the same, but a few are
#different.  What are these differences?

# Return the `n` most frequent values of `crimes` (default: top 3) as a table
# of counts, highest count first.
top.crimes <- function(crimes, n = 3) {
  head(rev(sort(table(crimes))), n)
}

# Split the abbreviated crimes by race code and take the top 3 of each group.
# The result is a list with one table per race code, named by that code
# (1 = black, 2 = black Hispanic, 3 = white Hispanic, 4 = white,
# 5 = Asian/Pacific Islander, 6 = American Indian/Native Alaskan,
# -1 = undefined).
top.3.crimes.by.race <- lapply(split(snf$crime.abbr, snf$race), top.crimes)
top.3.crimes.by.race

# Output recorded from the original run:
#   race  1:  FEL 10744   MIS 4487   CPW 4009
#   race  2:  FEL  1405   MIS  747   CPW  636
#   race  3:  FEL  5072   MIS 2423   CPW 1573
#   race  4:  FEL  1647   MIS  769   BUR  485
#   race  5:  FEL   788   ROB  224   MIS  188
#   race  6:  FEL    91   ROB   28   GLA   21
#   race -1:  FEL   566   MIS  229   CPW  202
# Races 1, 2, 3 and -1 share the same top 3 (FEL, MIS, CPW). Race 4 has BUR
# in third place, race 5 has ROB second and MIS third, and race 6 has ROB
# and GLA in second and third place.

## data-without-borders-hw2-part2.R
#Let’s create an “hour” variable that tells us what hour of the day each stop
#happened during and add it to our dataset.  How do we do this?  Well we’ve got a
#great column of “time” variables that always has the hour in the same place.  Use
#the substr() function we learned about above to strip out the hour, then use
#as.numeric() from lecture 2 to convert it to a number.


# Take characters 12-13 of each time string as the hour and convert it to a
# number.
# NOTE(review): assumes every snf$time value has the hour at that position
# (e.g. "YYYY-MM-DD HH:MM:SS") -- confirm against the data.
hour <- as.numeric(substr(snf$time, 12, 13))

# Add the hour to the data set, as the exercise asks.
snf$hour <- hour


#Create a line plot (i.e. a plot with type=”l”) of the stops by hour.
stops.by.hour <- table(hour)
plot(stops.by.hour, type = "l")

# Which hour of the day has the most stops?
stops.by.hour[stops.by.hour == max(stops.by.hour)]
#20
#4607

# Which hour has the fewest?
stops.by.hour[stops.by.hour == min(stops.by.hour)]
#  6
#323

#Create the same plot but with points instead of lines.  Use a different plotting
#symbol than the default and color the max point and min points different colors.

# One colour per hour present in stops.by.hour, black by default. The length
# comes from the table rather than a fixed 24, so the colours still line up
# with the points if some hour has no stops at all.
day.colors <- rep("black", length(stops.by.hour))

# Highlight the busiest hour in magenta and the quietest hour in blue.
# Colour names are used instead of palette numbers: the earlier code used 5
# for the minimum, which is cyan in R's default palette, not blue.
day.colors[stops.by.hour == max(stops.by.hour)] <- "magenta"
day.colors[stops.by.hour == min(stops.by.hour)] <- "blue"

# Point plot; pch = 19 (filled circle) replaces the default open circle, as
# the exercise asks for a non-default plotting symbol.
plot(stops.by.hour, type = "p", pch = 19, col = day.colors)
	# Write code to return the percentage of people who were frisked for each
	# race. In other words, count up the number of people who were frisked for a given race
	# divided by the number of people of that race stopped. Which race leads to the highest
	# percentage of frisks? Which one the lowest?

	# Read in the stop-and-frisk data; as.is = TRUE keeps text columns as
	# character strings instead of converting them to factors.
	snf <- read.csv(
	  "http://www.jakeporway.com/teaching/data/snf_2.csv",
	  as.is = TRUE
	)

	# Meaning of each race code in the data set.
	race.labels <- c(
	  "-1" = "undefined",
	  "1" = "black",
	  "2" = "black Hispanic",
	  "3" = "white Hispanic",
	  "4" = "white",
	  "5" = "Asian/Pacific Islander",
	  "6" = "American Indian/Native Alaskan"
	)

	# Percentage frisked for a race = stops of that race with frisked == 1,
	# divided by all stops of that race. Splitting the frisked flag by race
	# computes this for every race code in one pass.
	# The earlier form, table(race == k & frisked == 1) / table(race == k),
	# also printed a FALSE column (e.g. 144%) that is not a percentage of
	# anything; only its TRUE column was the frisk rate.
	frisk.pct.by.race <- vapply(
	  split(snf$frisked == 1, snf$race),
	  function(was.frisked) {
	    sum(was.frisked, na.rm = TRUE) / length(was.frisked) * 100
	  },
	  numeric(1)
	)

	# Replace the race codes with readable labels; a code that is not listed in
	# race.labels keeps its code as the label.
	race.codes <- names(frisk.pct.by.race)
	names(frisk.pct.by.race) <- ifelse(
	  race.codes %in% names(race.labels),
	  race.labels[race.codes],
	  race.codes
	)
	frisk.pct.by.race

	# Race with the highest and with the lowest frisk percentage.
	frisk.pct.by.race[which.max(frisk.pct.by.race)]
	frisk.pct.by.race[which.min(frisk.pct.by.race)]

	# Frisk percentages recorded from the original run:
	#   black                           59.30879
	#   black Hispanic                  59.71156   (highest)
	#   white Hispanic                  55.76884
	#   white                           41.90888   (lowest)
	#   Asian/Pacific Islander          48.51948
	#   American Indian/Native Alaskan  45.93301
	#   undefined                       48.63133


	# Plot the number of times each crime occurs in descending order (we’ve learned a couple of
	# ways to do this, though using sort(), table() and that new type= parameter to plot() is
	# your best bet). What does this distribution of crimes look like? In other words, are there
	# an equal number of every kind of crime or are there a few that dominate?

	# Count the stops for each suspected crime, most frequent first. The counts
	# stay a table: wrapping a table in data.frame() produces a label column next
	# to the count column, and in current R sum() refuses to total a data frame
	# that contains that label column.
	all.crimes.sorted <- rev(sort(table(snf$crime.suspected)))
	plot(all.crimes.sorted, xlab = "crime suspected", ylab = "number of stops")


	#Well I’m kind of answering that question for you here – let’s take the top 30
	#suspected crimes and look at those. If we were to just look at stops where the
	#crime.suspected was one of the top 30 crimes, what percentage of the stops would that
	#cover? Do you think that’s enough?

	# head() instead of [1:30]: with fewer than 30 distinct crimes, indexing past
	# the end would add NA entries and turn the sums below into NA.
	top.30.crimes <- head(all.crimes.sorted, 30)

	# Number of stops covered by the top 30 crimes.
	sum(top.30.crimes)
	#[1] 53048

	# The same count as a percentage of all stops with a suspected crime.
	sum(top.30.crimes) / sum(all.crimes.sorted) * 100
	#[1] 91.32194
	#the top 30 crimes cover 91.3% of all crimes.


	#Write code to create a variable called “crime.abbv” that consists of just the
	#first three letters of crime.suspected and show the code to add it to our main data frame.
	#Now what percentage of the stops do the top 30 crime.abbvs account for?
	# Abbreviate every suspected crime to its first three letters and store the
	# abbreviation as a new column of the main data frame.
	crime.abbr <- substr(snf$crime.suspected, 1, 3)
	snf$crime.abbr <- crime.abbr

	# Stops per abbreviation, most frequent first. Kept as a table so that sum()
	# adds up the counts; going through data.frame() would add a label column
	# that sum() cannot total in current R.
	all.crimes.sorted.abbr <- rev(sort(table(snf$crime.abbr)))

	# head() instead of [1:30] so that fewer than 30 abbreviations does not
	# introduce NA entries.
	new.top.30.crimes <- head(all.crimes.sorted.abbr, 30)
	new.top.30.crimes

	# Share of stops covered by the top 30 abbreviations, measured against the
	# total of this same table rather than a table built elsewhere in the script.
	sum(new.top.30.crimes) / sum(all.crimes.sorted.abbr) * 100
	#[1] 98.43172
	#Now, using the abbreviated names for the crimes suspected, the top 30 make up 98.4% of all crimes.



	#Write code to show the top 3 crimes each race is suspected of (rev(),
	#sort(), and table() are your friends here again, but you’ll have to subset the data by race
	#first). Huh. If you do this right, almost all the top 3’s should be the same, but a few are
	#different. What are these differences?

	# Return the `n` most frequent values of `crimes` (default: top 3) as a table
	# of counts, highest count first.
	top.crimes <- function(crimes, n = 3) {
	  head(rev(sort(table(crimes))), n)
	}

	# Split the abbreviated crimes by race code and take the top 3 of each group.
	# The result is a list with one table per race code, named by that code
	# (1 = black, 2 = black Hispanic, 3 = white Hispanic, 4 = white,
	# 5 = Asian/Pacific Islander, 6 = American Indian/Native Alaskan,
	# -1 = undefined).
	top.3.crimes.by.race <- lapply(split(snf$crime.abbr, snf$race), top.crimes)
	top.3.crimes.by.race

	# Output recorded from the original run:
	#   race  1:  FEL 10744   MIS 4487   CPW 4009
	#   race  2:  FEL  1405   MIS  747   CPW  636
	#   race  3:  FEL  5072   MIS 2423   CPW 1573
	#   race  4:  FEL  1647   MIS  769   BUR  485
	#   race  5:  FEL   788   ROB  224   MIS  188
	#   race  6:  FEL    91   ROB   28   GLA   21
	#   race -1:  FEL   566   MIS  229   CPW  202
	# Races 1, 2, 3 and -1 share the same top 3 (FEL, MIS, CPW). Race 4 has BUR
	# in third place, race 5 has ROB second and MIS third, and race 6 has ROB
	# and GLA in second and third place.
	#Let’s create an “hour” variable that tells us what hour of the day each stop
	#happened during and add it to our dataset. How do we do this? Well we’ve got a
	#great column of “time” variables that always has the hour in the same place. Use
	#the substr() function we learned about above to strip out the hour, then use
	#as.numeric() from lecture 2 to convert it to a number.


	# Take characters 12-13 of each time string as the hour and convert it to a
	# number.
	# NOTE(review): assumes every snf$time value has the hour at that position
	# (e.g. "YYYY-MM-DD HH:MM:SS") -- confirm against the data.
	hour <- as.numeric(substr(snf$time, 12, 13))

	# Add the hour to the data set, as the exercise asks.
	snf$hour <- hour


	#Create a line plot (i.e. a plot with type=”l”) of the stops by hour.
	stops.by.hour <- table(hour)
	plot(stops.by.hour, type = "l")

	# Which hour of the day has the most stops?
	stops.by.hour[stops.by.hour == max(stops.by.hour)]
	#20
	#4607

	# Which hour has the fewest?
	stops.by.hour[stops.by.hour == min(stops.by.hour)]
	# 6
	#323

	#Create the same plot but with points instead of lines. Use a different plotting
	#symbol than the default and color the max point and min points different colors.

	# One colour per hour present in stops.by.hour, black by default. The length
	# comes from the table rather than a fixed 24, so the colours still line up
	# with the points if some hour has no stops at all.
	day.colors <- rep("black", length(stops.by.hour))

	# Highlight the busiest hour in magenta and the quietest hour in blue.
	# Colour names are used instead of palette numbers: the earlier code used 5
	# for the minimum, which is cyan in R's default palette, not blue.
	day.colors[stops.by.hour == max(stops.by.hour)] <- "magenta"
	day.colors[stops.by.hour == min(stops.by.hour)] <- "blue"

	# Point plot; pch = 19 (filled circle) replaces the default open circle, as
	# the exercise asks for a non-default plotting symbol.
	plot(stops.by.hour, type = "p", pch = 19, col = day.colors)