public
Created

An exercise in data manipulation from chapter 1 of Machine Learning for Hackers

  • Download Gist
munge-ufo-data.R
R
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99
## Cbare's code from Machine Learning for Hackers
## Chapter 1 - cleaning up data on UFO sightings
##
## for data files and lots more R code, see:
## https://github.com/johnmyleswhite/ML_for_Hackers
##
############################################################
library(ggplot2)
library(scales)
 
 
# load the UFO sightings dump from infochimps: tab-separated, no header row,
# empty fields become NA and text columns stay character; the columns are
# given readable names (in file order) as part of the same expression
ufo <- setNames(
  read.delim("data/ufo/ufo_awesome.tsv",
             header = FALSE,
             sep = "\t",
             na.strings = "",
             stringsAsFactors = FALSE),
  c("DateOccurred", "DateReported", "Location",
    "ShortDescription", "Duration", "LongDescription")
)
 
# peek at rows whose DateOccurred is not exactly 8 characters (YYYYMMDD)
# %in% never returns NA, so a missing date counts as malformed instead of
# producing all-NA rows when the result is used as a row index
head(ufo[!(nchar(ufo$DateOccurred) %in% 8), ])

# there are bad date fields, so drop those rows
# a row is bad when either date is missing or is not 8 characters long
bad.rows <- !(nchar(ufo$DateOccurred) %in% 8) |
  !(nchar(ufo$DateReported) %in% 8)
ufo <- ufo[!bad.rows, ]

# convert strings to dates
ufo$DateOccurred <- as.Date(ufo$DateOccurred, format = "%Y%m%d")
ufo$DateReported <- as.Date(ufo$DateReported, format = "%Y%m%d")
# parse out city and state from location, where possible
# for US cities and states, Location takes the form "Seattle, WA"
# surrounding whitespace is stripped first so that neither the city nor the
# state carries stray blanks (a trailing blank would defeat the state match)
loc.split <- strsplit(gsub("^\\s+|\\s+$", "", as.character(ufo$Location)),
                      ",\\s*")

# Look for state in the last position
# vapply guarantees exactly one value per row; an empty split becomes NA
# rather than being silently dropped and misaligning the column
ufo$state <- toupper(vapply(
  loc.split,
  function(x) if (length(x) > 0) x[length(x)] else NA_character_,
  character(1)
))

# shove everything else in the city column
# (rows without a comma end up with an empty city string)
ufo$city <- vapply(
  loc.split,
  function(x) paste(x[-length(x)], collapse = ","),
  character(1)
)

# keep just the rows that seem to be located in US states
# using the built-in "state" data sets.
ufo.us <- ufo[ufo$state %in% state.abb, ]
 
# read a file with state populations from 2011, 2012, and 2000
state.pop <- read.csv('data/census.csv')[, c('State', 'X2000')]
colnames(state.pop) <- c('name', 'pop.2000')
# Let's use 2000 populations, and also tack on abbreviations
# so we can merge, later
# match() always yields one index per row (NA when the name is not in the
# built-in state.name vector), so abbrev stays a plain vector aligned with
# the rows even if the census file lists entries that are not states
state.pop$abbrev <- state.abb[match(state.pop$name, state.name)]
 
# count sightings by US state
sightings.by.state <- as.data.frame(table(ufo.us$state))
colnames(sightings.by.state) <- c('state', 'sightings')

# combine UFO sightings data with state population
sightings.by.state <- merge(sightings.by.state, state.pop,
                            by.x = 'state', by.y = 'abbrev')

# compute per.capita UFO sightings
# only the two new columns are listed; state and sightings are left as is
sightings.by.state <- transform(sightings.by.state,
                                state.name = name,
                                sightings.per.cap = sightings / pop.2000)

# sort so the states with the most sightings per capita come first
sightings.by.state <- sightings.by.state[
  order(sightings.by.state$sightings.per.cap, decreasing = TRUE), ]
 
# make a nice boxplot
boxdata <- boxplot(sightings.by.state$sightings.per.cap,
                   ylab = "sightings per capita",
                   col = "#66FF5580")
title("UFO sightings by state")
# label every outlier with its state
# seq_along() makes the loop a no-op when the boxplot reports no outliers,
# whereas 1:length() would visit the indices 1 and 0 and make text() fail
for (i in seq_along(boxdata$group)) {
  # add text to the boxplot
  # boxdata$out holds values taken from the plotted vector, so == finds
  # the row(s) each outlier came from
  text(boxdata$group[i],
       boxdata$out[i],
       sightings.by.state$state[
         which(sightings.by.state$sightings.per.cap == boxdata$out[i])],
       pos = 4, cex = 0.5, col = "#990000CC")
}
 
# play with ggplot a bit to see if we can't make a nicer plot
# every layer shares a single dummy x position, factor(0); the x axis ticks
# and labels are hidden through theme() below, so no x scale is needed here
p <- ggplot(data = sightings.by.state,
            aes(x = factor(0), y = sightings.per.cap))
p <- p + geom_boxplot(aes(group = factor(0), fill = "#66FF5580"),
                      alpha = 0.6, outlier.size = 0)
# the fill is mapped to a constant, so suppress its legend
p <- p + scale_fill_manual(values = alpha(c("#66FF5580"), .3), guide = "none")
p <- p + ggtitle("UFO sightings per capita for US States")
p <- p + ylab("Sightings per capita")
p <- p + xlab(NULL)
# theme()/element_text() replace the defunct opts()/theme_text()
p <- p + theme(axis.title.y = element_text(size = 14, colour = 'red'))
p <- p + theme(axis.ticks = element_blank(), axis.text.x = element_blank())
# overlay one point per state on top of the box
p <- p + geom_point(aes(x = factor(0), y = sightings.per.cap),
                    data = sightings.by.state,
                    colour = alpha("blue", 0.33), size = 3)
# label the first 15 rows of sightings.by.state; rows 1 and 2 get larger,
# more opaque text than rows 3 to 15
p <- p + geom_text(data = sightings.by.state[3:15, ],
                   aes(x = factor(0), y = sightings.per.cap, label = state),
                   hjust = 2, size = 3, colour = alpha('red', 0.6))
p <- p + geom_text(data = sightings.by.state[1, ],
                   aes(x = factor(0), y = sightings.per.cap, label = state),
                   hjust = 2, size = 5, colour = alpha('red', 0.9))
p <- p + geom_text(data = sightings.by.state[2, ],
                   aes(x = factor(0), y = sightings.per.cap, label = state),
                   hjust = 2.5, size = 4, colour = alpha('red', 0.9))
print(p)

# NOTE(review): "captia" in the file name looks like a typo for "capita";
# left unchanged so the output path stays the same
ggsave(plot = p, filename = 'UFO sightings per captia.png',
       width = 5, height = 6)

Fixed a bunch of late-night coding errors. I think this works with a clean environment, if your working directory is set to chapter 1 of the code for Machine Learning for Hackers.

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.