Skip to content

Instantly share code, notes, and snippets.

@awcull
Created February 22, 2014 19:09
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save awcull/9160251 to your computer and use it in GitHub Desktop.
Save awcull/9160251 to your computer and use it in GitHub Desktop.
Divvy Data
library(ggplot2)
library(scales)
library(reshape)
library(ggmap)
setwd("C:\\code\\Divvy_Stations_Trips_2013")
## Read data
data <- read.csv("Divvy_Trips_Clean_2013.csv", stringsAsFactors=F)
data.stations <- read.csv("Divvy_Stations_2013.csv")
## Easy stuff
ggplot(na.omit(data), aes(gender)) + geom_bar() + labs(title="Gender") # gender diff
ggsave(file="gender.png")
data$age <- as.numeric(format(Sys.Date(), "%Y")) - data$birthyear
ggplot(data, aes(age)) + geom_bar() + scale_x_continuous(breaks=seq(0,120,10)) + labs(title="Age Distribution") # age histogram
ggsave(file="age.png")
# Convert dates
data$starttime <- as.POSIXct(data$starttime)
data$stoptime <- as.POSIXct(data$stoptime)
## Take look at pickup and drop off times
# Need to times, due to how this works we just put all the times as one day
pickup.time <- as.POSIXct(format(data[,2], "%H:%M:%S"), "%H:%M:%S", tz="GMT")
dropoff.time <- as.POSIXct(format(data[,3], "%H:%M:%S"), "%H:%M:%S", tz="GMT")
pickup <- data.frame(time=pickup.time)
pickup$type <- "pickup"
dropoff <- data.frame(time=dropoff.time)
dropoff$type <- "dropoff"
time.data <- rbind(pickup,dropoff)
# pick up drop off times
ggplot(time.data, aes(time, fill=type)) + geom_density(alpha=0.4) +
scale_x_datetime(breaks=date_breaks("2 hour"), labels=date_format("%H:%M")) +
theme(axis.text.x=element_text(angle=90), axis.title.y=element_blank()) + labs(title="Pick Up and Drop Off")
ggsave(file="Pickup vs Dropoff.png")
## Look at days of the week
day.pickup <- weekdays(data$starttime)
day.dropoff <- weekdays(data$stoptime)
day.pickup.df <- data.frame(time=pickup.time, day=day.pickup)
day.dropoff.df <- data.frame(time=dropoff.time, day=day.dropoff)
# Daily pick up times
ggplot(day.pickup.df, aes(time, fill=day)) + geom_density(alpha=0.4) + scale_x_datetime(breaks=date_breaks("2 hour"),labels=date_format("%H:%M")) +
theme(axis.text.x=element_text(angle=90), axis.title.y=element_blank()) + scale_fill_brewer(palette="Set1") + labs(title="Daily Pick Up Times")
ggsave(file="daily pickup.png")
# Daily drop off times
ggplot(day.dropoff.df, aes(time, fill=day)) + geom_density(alpha=0.4) + scale_x_datetime(breaks=date_breaks("2 hour"),labels=date_format("%H:%M")) +
theme(axis.text.x=element_text(angle=90), axis.title.y=element_blank()) + scale_fill_brewer(palette="Set1") + labs(title="Daily Drop off Times")
ggsave(file="daily dropoff.png")
## Look at weekend/days
weekday <- c("Monday","Tuesday","Wednesday","Thursday","Friday")
weekend <- c("Saturday", "Sunday")
wk.day.end <- rep("", nrow(data))
wk.day.end[day.pickup %in% weekday] <- "Weekday"
wk.day.end[!(day.pickup %in% weekday)] <- "Weekend"
wk.day.end.df <- data.frame(time=pickup.time, type=wk.day.end)
# Weekend vs weekday pick up times
ggplot(wk.day.end.df, aes(time, fill=type)) + geom_density(alpha=0.4) + scale_x_datetime(breaks=date_breaks("2 hour"),labels=date_format("%H:%M")) +
theme(axis.text.x=element_text(angle=90), axis.title.y=element_blank()) + labs(title="Weekday vs Weekend Pickup Times")
ggsave(file="weekday vs weekend.png")
## Look at what times certain age ranges are using
age.brackets <- function(x) {
siz <- length(x)
tmp <- rep("", siz)
tmp[which(x >= 0 & x < 13)] <- "Under 13"
tmp[which(x >= 13 & x < 20)] <- "Teenager"
tmp[which(x >= 20 & x <30)] <- "Twenties"
tmp[which(x >= 30 & x < 40)] <- "Thirties"
tmp[which(x >= 40 & x < 50)] <- "Forties"
tmp[which(x >= 50 & x < 60)] <- "Fifties"
tmp[which(x >= 60 & x < 70)] <- "Sixties"
tmp[which(x >= 70 & x < 80)] <- "Seventies"
return(tmp)
}
time.age <- data.frame(time=pickup.time, age=age.brackets(data$age))
time.age <- time.age[time.age$age != "",]
# Pick up times by age
ggplot(time.age, aes(time, fill=age)) + geom_bar() + scale_fill_brewer(palette="Set1") + scale_x_datetime(breaks=date_breaks("2 hour"),labels=date_format("%H:%M")) +
theme(axis.text.x=element_text(angle=90)) + labs(title="Age Pickup Times")
ggsave(file="agePickup.png")
## Stations
stat.table <- table(data$from_station_name, data$to_station_name) # Create table
stat.table.df <- as.data.frame(stat.table) # Coerce to DF
stat.table.df <- stat.table.df[order(stat.table.df$Freq, decreasing=T),] # Order by most frequent from -> to station
# Pull out used and unused
stat.used <- stat.table.df[stat.table.df$Freq > 0,]
stat.used$scale <- stat.used$Freq / sum(stat.used$Freq)
stat.unused <- stat.table.df[stat.table.df$Freq == 0,]
# Plot up top 15 stations as a heatmap. Cant plot up full stations because of large number in a sparse matrix
ggplot(stat.used[1:15,], aes(Var1, Var2)) + geom_tile(aes(fill=Freq)) +
scale_fill_gradient(low="white", high="steelblue") + labs(x="",y="") +
theme(axis.text.x=element_text(angle=90), axis.ticks=element_blank(), legend.position="none") + labs(title="Usage of top 15 Stations")
ggsave(file="heatmap.png")
# Try and make map of routes
# merge locations with coordinates
data.tmp <- ddply(data.frame(stations=c(data$from_station_name, data$to_station_name)), .(stations), summarize, log.freq=log(length(stations)))
data.merged <- merge(data.tmp, data.stations, by.x="stations", by.y="name")
# Get map
map <- get_map(location=c(lon=mean(data.merged$longitude), lat=mean(data.merged$latitude)), zoom=12, color="bw")
# Quick look at where the uses are
ggmap(map) + geom_point(aes(x=longitude, y=latitude, colour=log.freq), data=data.merged) + scale_colour_gradient(low="white", high="black") +
theme(axis.title.x=element_blank(), axis.title.y=element_blank()) + labs(title="Station Use")
ggsave(file="mapColorFreq.png")
# Overlay a heatmap onto it, using different colours
ggmap(map) + stat_density2d(mapping=aes(x=longitude, y=latitude, fill=log.freq, alpha=..level..), geom="polygon",data=data.merged) +
geom_point(aes(x=longitude, y=latitude, colour=log.freq), data=data.merged) + scale_colour_gradient(low="black", high="steelblue") +
theme(legend.position="none", axis.title.x=element_blank(), axis.title.y=element_blank()) + labs(title="Station Use HeatMap")
ggsave(file="mapHeatFreq.png")
setwd("C:\\code\\Divvy_Stations_Trips_2013")
## Read data
data <- read.csv("Divvy_Trips_2013.csv", stringsAsFactors=F)
## Look at what kind of data we have first
summary(data)
## starttime and stoptime should be a POSIXct class, also want to change it to ISO style of %Y-%m-%d %H:%M:%S, makes stuff later easier
# Take a look at values to know format
head(data$starttime)
head(data$stoptime) # check to make sure data is same format between the two
# Format then is %m/%d/%Y %HH:%MM
time.format <- "%m/%d/%Y %H:%M"
data$starttime <- as.POSIXct(data$starttime, time.format, tz="America/Chicago")
data$stoptime <- as.POSIXct(data$stoptime, time.format, tz="America/Chicago")
## From this tripduration/from_station_id/to_station_id are reported as characters but really should be integers
# Find out what values we are dealing with
unique(data$tripduration)
sort(unique(data$to_station_id)))
sort(unique(data$from_station_id))
# From this the easiest fix is tripduration. Values have a , in them!
data$tripduration <- as.numeric(gsub(",", "", data$tripduration))
# Next up lets find out what is causing the #N/A in to_station_id and from_station_id
tmp_1 <- which(data$to_station_id == "#N/A")
head(data[tmp_1,])
# Looks like its Congress Pkwy & Ogden Ave that may be causing the issue, but lets double check this
unique(data[tmp_1,]$to_station_name)
# only get the one, now lets check again for from_station_name. Most likely will have the same issue
tmp_2 <- which(data$from_station_id == "#N/A")
head(data[tmp_2,])
unique(data[tmp_2,]$from_station_name)
# Again same issue, so lets assign our own station id
known_station_ids <- sort(unique(as.numeric(c(data$from_station_id, data$to_station_id))))
# Check to see if value is there, if not the station becomes that id
tmp_id <- 9999
is.done <- F
while (!(is.done)) {
if (!(tmp_id %in% known_station_ids)) {
data$to_station_id[tmp_1] <- tmp_id
data$from_station_id[tmp_2] <- tmp_id
is.done <- T
}
tmp_id <- tmp_id + 1
}
# Convert over to numeric
data$to_station_id <- as.numeric(data$to_station_id)
data$from_station_id <- as.numeric(data$from_station_id)
## Save data
write.csv(data, "Divvy_Trips_Clean_2013.csv", row.names=F)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment