Created

Embed URL

HTTPS clone URL

SSH clone URL

You can clone with HTTPS or SSH.

Download Gist

An exercise in data manipulation from chapter 1 of Machine Learning for Hackers

View munge-ufo-data.R
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99
## Cbare's code from Machine Learning for Hackers
## Chapter 1 - cleaning up data on UFO sightings
##
## for data files and lots more R code, see:
## https://github.com/johnmyleswhite/ML_for_Hackers
##
############################################################
library(ggplot2)
library(scales)
 
 
# Read the raw UFO sightings (infochimps dump): tab-separated, no header row,
# and empty fields are read as NA.
ufo <- read.delim("data/ufo/ufo_awesome.tsv",
                  sep = "\t", stringsAsFactors = FALSE,
                  header = FALSE, na.strings = "")

# give the columns nice names
names(ufo) <- c("DateOccurred",
                "DateReported",
                "Location",
                "ShortDescription",
                "Duration",
                "LongDescription")

# A date field is bad when it is missing or is not exactly 8 characters
# (YYYYMMDD). The explicit is.na() test matters: nchar() of a missing string
# is NA in current R, and an NA inside a logical row index produces an
# all-NA row instead of dropping the record.
bad.occurred <- is.na(ufo$DateOccurred) | nchar(ufo$DateOccurred) != 8
bad.reported <- is.na(ufo$DateReported) | nchar(ufo$DateReported) != 8

# peek at a few rows whose DateOccurred is malformed
head(ufo[bad.occurred, ])

# there are bad date fields, so drop those rows
bad.rows <- bad.occurred | bad.reported
ufo <- ufo[!bad.rows, ]

# convert strings to dates
ufo$DateOccurred <- as.Date(ufo$DateOccurred, format = "%Y%m%d")
ufo$DateReported <- as.Date(ufo$DateReported, format = "%Y%m%d")
# Derive city and state columns from Location where possible.
# US sightings look like "Seattle, WA", so break every location string on
# commas (swallowing any whitespace that follows the comma).
loc.split <- strsplit(ufo$Location, ",\\s*")

# The final piece of each split is the candidate state code; upper-case it
# so it can be compared against the built-in abbreviations.
ufo$state <- toupper(unlist(lapply(loc.split, tail, n = 1)))

# Whatever precedes the final piece is glued back together as the city.
ufo$city <- unlist(lapply(loc.split, function(pieces) {
  paste(head(pieces, -1), collapse = ",")
}))

# Keep only the rows whose state code appears in the built-in "state"
# data set (state.abb), i.e. rows that seem to be located in a US state.
ufo.us <- ufo[is.element(ufo$state, state.abb), ]
 
# Load state populations from the census file, keeping only the state name
# and the year-2000 population column.
state.pop <- read.csv("data/census.csv")[, c("State", "X2000")]
colnames(state.pop) <- c("name", "pop.2000")

# Tack on two-letter abbreviations so we can merge with the sightings later.
# match() yields NA for a name that is not in the built-in state.name vector,
# so such a row simply gets an NA abbreviation (and falls out of the merge
# below) instead of breaking the lookup.
state.pop$abbrev <- state.abb[match(state.pop$name, state.name)]

# count sightings by US state
sightings.by.state <- as.data.frame(table(ufo.us$state))
colnames(sightings.by.state) <- c("state", "sightings")

# combine UFO sightings data with state population
sightings.by.state <- merge(sightings.by.state, state.pop,
                            by.x = "state", by.y = "abbrev")

# compute per-capita UFO sightings; state.name is kept as a copy of name
sightings.by.state$state.name <- sightings.by.state$name
sightings.by.state$sightings.per.cap <-
  sightings.by.state$sightings / sightings.by.state$pop.2000

# sort so the states with the most sightings per capita come first
sightings.by.state <- sightings.by.state[
  order(sightings.by.state$sightings.per.cap, decreasing = TRUE), ]
 
# make a nice boxplot; the value returned by boxplot() is kept because it
# carries the outlier values ($out) and the group each one belongs to ($group)
boxdata <- boxplot(sightings.by.state$sightings.per.cap,
                   ylab = "sightings per capita",
                   col = "#66FF5580")
title("UFO sightings by state")

# Label every outlier with its state abbreviation. seq_along() makes the loop
# a no-op when there are no outliers; 1:length(...) would iterate over c(1, 0).
for (i in seq_along(boxdata$out)) {
  # exact == is fine here: the outlier values are data points from this column
  text(boxdata$group[i],
       boxdata$out[i],
       sightings.by.state$state[
         which(sightings.by.state$sightings.per.cap == boxdata$out[i])],
       pos = 4, cex = 0.5, col = "#990000CC")
}
 
# play with ggplot a bit to see if we can't make a nicer plot
# A single dummy x value (factor(0)) puts every state in one box.
p <- ggplot(data = sightings.by.state,
            aes(x = factor(0), y = sightings.per.cap))
p <- p + geom_boxplot(aes(group = factor(0), fill = "#66FF5580"),
                      alpha = 0.6, outlier.size = 0)
# fill is mapped to a constant, so hide the legend it would generate
p <- p + scale_fill_manual(values = alpha(c("#66FF5580"), .3), guide = "none")
p <- p + ggtitle("UFO sightings per capita for US States")
p <- p + ylab("Sightings per capita")
p <- p + xlab(NULL)
# theme()/element_text() replace the removed opts()/theme_text() API;
# the dummy x axis carries no information, so hide its ticks and labels
p <- p + theme(axis.title.y = element_text(size = 14, colour = "red"),
               axis.ticks = element_blank(),
               axis.text.x = element_blank())

# overlay one point per state; x and y are inherited from the plot
p <- p + geom_point(colour = alpha("blue", 0.33), size = 3)

# label the top 15 states (rows are sorted by sightings per capita),
# with the first and second rows drawn larger than rows 3 to 15
p <- p + geom_text(data = sightings.by.state[3:15, ], aes(label = state),
                   hjust = 2, size = 3, colour = alpha("red", 0.6))
p <- p + geom_text(data = sightings.by.state[1, ], aes(label = state),
                   hjust = 2, size = 5, colour = alpha("red", 0.9))
p <- p + geom_text(data = sightings.by.state[2, ], aes(label = state),
                   hjust = 2.5, size = 4, colour = alpha("red", 0.9))
print(p)

# NOTE(review): "captia" looks like a typo for "capita"; the filename is left
# unchanged so the output path does not move.
ggsave(plot = p, filename = "UFO sightings per captia.png",
       width = 5, height = 6)
Owner

Fixed a bunch of late-night coding errors. I think this works with a clean environment, if your working directory is set to chapter 1 of the code for Machine Learning for Hackers.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Something went wrong with that request. Please try again.