Create a gist now

Instantly share code, notes, and snippets.

An exercise in data manipulation from chapter 1 of Machine Learning for Hackers
## Cbare's code from Machine Learning for Hackers
## Chapter 1 - cleaning up data on UFO sightings
##
## for data files and lots more R code, see:
## https://github.com/johnmyleswhite/ML_for_Hackers
##
############################################################
library(ggplot2)
library(scales)
# Load the infochimps UFO-sightings dump: a headerless, tab-separated file.
# Empty fields become NA and text columns stay character (not factor).
ufo <- read.delim(
  file = "data/ufo/ufo_awesome.tsv",
  header = FALSE,
  sep = "\t",
  na.strings = "",
  stringsAsFactors = FALSE
)
# Label the columns, in file order, with readable names.
colnames(ufo) <- c(
  "DateOccurred", "DateReported", "Location",
  "ShortDescription", "Duration", "LongDescription"
)
# Peek at a few rows whose DateOccurred is missing or is not an 8-character
# YYYYMMDD string. NA is folded into the mask explicitly: nchar() gives NA for
# a missing string, and an NA in a logical row index produces an all-NA row.
bad.occurred <- is.na(ufo$DateOccurred) | nchar(ufo$DateOccurred) != 8
bad.reported <- is.na(ufo$DateReported) | nchar(ufo$DateReported) != 8
head(ufo[bad.occurred, ])
# Drop every row where either date field is missing or malformed.
# (TRUE | NA is TRUE, so the masks above never contain NA.)
bad.rows <- bad.occurred | bad.reported
ufo <- ufo[!bad.rows, ]
# Convert the remaining YYYYMMDD strings to Date objects.
ufo$DateOccurred <- as.Date(ufo$DateOccurred, format = "%Y%m%d")
ufo$DateReported <- as.Date(ufo$DateReported, format = "%Y%m%d")
# Derive city and state columns from Location where the text allows it.
# US entries look like "Seattle, WA", so split on a comma plus optional spaces.
loc.split <- strsplit(ufo$Location, ",\\s*")
# The final piece of each split is taken as the state (upper-cased).
ufo$state <- toupper(
  vapply(loc.split, function(pieces) pieces[length(pieces)], character(1))
)
# Whatever precedes the final piece is re-joined with commas as the city;
# a Location with no comma therefore yields an empty city string.
ufo$city <- vapply(
  loc.split,
  function(pieces) paste(pieces[-length(pieces)], collapse = ","),
  character(1)
)
# Keep only rows whose state matches an abbreviation in the built-in
# state.abb vector, i.e. rows that appear to be in a US state.
ufo.us <- ufo[is.element(ufo$state, state.abb), ]
# Read the census file (per the original note it holds 2011, 2012 and 2000
# state populations) and keep only the state name and the 2000 column.
state.pop <- read.csv("data/census.csv")[, c("State", "X2000")]
colnames(state.pop) <- c("name", "pop.2000")
# Tack on two-letter abbreviations so the table can be merged with the
# sightings counts later. match() returns NA for any name that is not in the
# built-in state.name vector, so a non-state row in the census file gets an NA
# abbreviation instead of breaking the lookup (the previous sapply()/which()
# form returned a list in that case, which cannot be used as an index).
state.pop$abbrev <- state.abb[match(state.pop$name, state.name)]
# Count sightings per US state: table() tallies each abbreviation and
# as.data.frame() turns the tally into a two-column data frame.
sightings.by.state <- as.data.frame(table(ufo.us$state))
colnames(sightings.by.state) <- c("state", "sightings")
# Attach the 2000 population to each state by joining on the abbreviation.
sightings.by.state <- merge(sightings.by.state, state.pop,
                            by.x = "state", by.y = "abbrev")
# Add a state.name copy of the name column and the per-capita sighting rate.
# (Columns are appended in this order, matching the earlier transform() call.)
sightings.by.state$state.name <- sightings.by.state$name
sightings.by.state$sightings.per.cap <-
  sightings.by.state$sightings / sightings.by.state$pop.2000
# Sort with the highest per-capita rate first; later plotting code relies on
# rows 1, 2 and 3:15 being the top-ranked states. Use TRUE rather than T,
# since T is an ordinary variable that can be reassigned.
sightings.by.state <- sightings.by.state[
  order(sightings.by.state$sightings.per.cap, decreasing = TRUE),
]
# Draw a base-graphics boxplot of per-capita sightings and keep the returned
# statistics so the outliers can be labelled.
boxdata <- boxplot(sightings.by.state$sightings.per.cap,
                   ylab = "sightings per capita",
                   col = "#66FF5580")
title("UFO sightings by state")
# Label each outlier with its state abbreviation. boxdata$out holds the
# outlying values taken from the plotted vector, so membership identifies the
# matching rows. The guard avoids calling text() with zero-length labels when
# the plot has no outliers (the former 1:length(...) loop ran with i = 1 and
# i = 0 in that case).
outlier.rows <- which(sightings.by.state$sightings.per.cap %in% boxdata$out)
if (length(outlier.rows) > 0) {
  text(x = 1,  # a single-vector boxplot draws its one box at x = 1
       y = sightings.by.state$sightings.per.cap[outlier.rows],
       labels = as.character(sightings.by.state$state[outlier.rows]),
       pos = 4, cex = 0.5, col = "#990000CC")
}
# Redraw the per-capita distribution with ggplot2: one box for all states,
# every state overplotted as a point, and the top 15 states labelled.
# Mapping x to the constant factor(0) places everything in a single column.
# (The scale_x_discrete(breaks=NA) previously passed as an argument to
# ggplot() had no effect there; the x axis text and ticks are blanked through
# theme() below instead.)
p <- ggplot(data = sightings.by.state,
            aes(x = factor(0), y = sightings.per.cap))
# The fill is a constant, so it is set outside aes(): no fill scale or legend
# is created. The alpha argument overrides the alpha channel of the fill.
# Outlier points are shrunk to size 0 because every state is plotted below.
p <- p + geom_boxplot(aes(group = factor(0)), fill = "#66FF5580",
                      alpha = 0.6, outlier.size = 0)
# ggtitle()/theme()/element_text() replace the defunct opts()/theme_text().
p <- p + ggtitle("UFO sightings per capita for US States")
p <- p + ylab("Sightings per capita")
p <- p + xlab(NULL)
p <- p + theme(axis.title.y = element_text(size = 14, colour = "red"),
               axis.ticks = element_blank(),
               axis.text.x = element_blank())
# One semi-transparent point per state.
p <- p + geom_point(aes(x = factor(0), y = sightings.per.cap),
                    data = sightings.by.state,
                    colour = alpha("blue", 0.33), size = 3)
# Label the top states, with larger text for ranks 1 and 2. This relies on
# sightings.by.state being sorted by descending sightings.per.cap and on it
# having at least 15 rows.
p <- p + geom_text(data = sightings.by.state[3:15, ],
                   aes(x = factor(0), y = sightings.per.cap, label = state),
                   hjust = 2, size = 3, colour = alpha("red", 0.6))
p <- p + geom_text(data = sightings.by.state[1, ],
                   aes(x = factor(0), y = sightings.per.cap, label = state),
                   hjust = 2, size = 5, colour = alpha("red", 0.9))
p <- p + geom_text(data = sightings.by.state[2, ],
                   aes(x = factor(0), y = sightings.per.cap, label = state),
                   hjust = 2.5, size = 4, colour = alpha("red", 0.9))
print(p)
# Output file name kept exactly as before (including the "captia" spelling)
# so anything that looks for the existing file still finds it.
ggsave(plot = p, filename = 'UFO sightings per captia.png', width = 5, height = 6)
@cbare
Owner
cbare commented Sep 25, 2012

Fixed a bunch of late-night coding errors. I think this works with a clean environment, if your working directory is set to chapter 1 of the code for Machine Learning for Hackers.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment