Created
February 22, 2014 19:09
-
-
Save awcull/9160251 to your computer and use it in GitHub Desktop.
Divvy Data
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(ggplot2) | |
library(scales) | |
library(reshape) | |
library(ggmap) | |
setwd("C:\\code\\Divvy_Stations_Trips_2013") | |
## Read data | |
data <- read.csv("Divvy_Trips_Clean_2013.csv", stringsAsFactors=F) | |
data.stations <- read.csv("Divvy_Stations_2013.csv") | |
## Easy stuff | |
ggplot(na.omit(data), aes(gender)) + geom_bar() + labs(title="Gender") # gender diff | |
ggsave(file="gender.png") | |
data$age <- as.numeric(format(Sys.Date(), "%Y")) - data$birthyear | |
ggplot(data, aes(age)) + geom_bar() + scale_x_continuous(breaks=seq(0,120,10)) + labs(title="Age Distribution") # age histogram | |
ggsave(file="age.png") | |
# Convert dates | |
data$starttime <- as.POSIXct(data$starttime) | |
data$stoptime <- as.POSIXct(data$stoptime) | |
## Take look at pickup and drop off times | |
# Need to times, due to how this works we just put all the times as one day | |
pickup.time <- as.POSIXct(format(data[,2], "%H:%M:%S"), "%H:%M:%S", tz="GMT") | |
dropoff.time <- as.POSIXct(format(data[,3], "%H:%M:%S"), "%H:%M:%S", tz="GMT") | |
pickup <- data.frame(time=pickup.time) | |
pickup$type <- "pickup" | |
dropoff <- data.frame(time=dropoff.time) | |
dropoff$type <- "dropoff" | |
time.data <- rbind(pickup,dropoff) | |
# pick up drop off times | |
ggplot(time.data, aes(time, fill=type)) + geom_density(alpha=0.4) + | |
scale_x_datetime(breaks=date_breaks("2 hour"), labels=date_format("%H:%M")) + | |
theme(axis.text.x=element_text(angle=90), axis.title.y=element_blank()) + labs(title="Pick Up and Drop Off") | |
ggsave(file="Pickup vs Dropoff.png") | |
## Look at days of the week | |
day.pickup <- weekdays(data$starttime) | |
day.dropoff <- weekdays(data$stoptime) | |
day.pickup.df <- data.frame(time=pickup.time, day=day.pickup) | |
day.dropoff.df <- data.frame(time=dropoff.time, day=day.dropoff) | |
# Daily pick up times | |
ggplot(day.pickup.df, aes(time, fill=day)) + geom_density(alpha=0.4) + scale_x_datetime(breaks=date_breaks("2 hour"),labels=date_format("%H:%M")) + | |
theme(axis.text.x=element_text(angle=90), axis.title.y=element_blank()) + scale_fill_brewer(palette="Set1") + labs(title="Daily Pick Up Times") | |
ggsave(file="daily pickup.png") | |
# Daily drop off times | |
ggplot(day.dropoff.df, aes(time, fill=day)) + geom_density(alpha=0.4) + scale_x_datetime(breaks=date_breaks("2 hour"),labels=date_format("%H:%M")) + | |
theme(axis.text.x=element_text(angle=90), axis.title.y=element_blank()) + scale_fill_brewer(palette="Set1") + labs(title="Daily Drop off Times") | |
ggsave(file="daily dropoff.png") | |
## Look at weekend/days | |
weekday <- c("Monday","Tuesday","Wednesday","Thursday","Friday") | |
weekend <- c("Saturday", "Sunday") | |
wk.day.end <- rep("", nrow(data)) | |
wk.day.end[day.pickup %in% weekday] <- "Weekday" | |
wk.day.end[!(day.pickup %in% weekday)] <- "Weekend" | |
wk.day.end.df <- data.frame(time=pickup.time, type=wk.day.end) | |
# Weekend vs weekday pick up times | |
ggplot(wk.day.end.df, aes(time, fill=type)) + geom_density(alpha=0.4) + scale_x_datetime(breaks=date_breaks("2 hour"),labels=date_format("%H:%M")) + | |
theme(axis.text.x=element_text(angle=90), axis.title.y=element_blank()) + labs(title="Weekday vs Weekend Pickup Times") | |
ggsave(file="weekday vs weekend.png") | |
## Look at what times certain age ranges are using | |
age.brackets <- function(x) { | |
siz <- length(x) | |
tmp <- rep("", siz) | |
tmp[which(x >= 0 & x < 13)] <- "Under 13" | |
tmp[which(x >= 13 & x < 20)] <- "Teenager" | |
tmp[which(x >= 20 & x <30)] <- "Twenties" | |
tmp[which(x >= 30 & x < 40)] <- "Thirties" | |
tmp[which(x >= 40 & x < 50)] <- "Forties" | |
tmp[which(x >= 50 & x < 60)] <- "Fifties" | |
tmp[which(x >= 60 & x < 70)] <- "Sixties" | |
tmp[which(x >= 70 & x < 80)] <- "Seventies" | |
return(tmp) | |
} | |
time.age <- data.frame(time=pickup.time, age=age.brackets(data$age)) | |
time.age <- time.age[time.age$age != "",] | |
# Pick up times by age | |
ggplot(time.age, aes(time, fill=age)) + geom_bar() + scale_fill_brewer(palette="Set1") + scale_x_datetime(breaks=date_breaks("2 hour"),labels=date_format("%H:%M")) + | |
theme(axis.text.x=element_text(angle=90)) + labs(title="Age Pickup Times") | |
ggsave(file="agePickup.png") | |
## Stations | |
stat.table <- table(data$from_station_name, data$to_station_name) # Create table | |
stat.table.df <- as.data.frame(stat.table) # Coerce to DF | |
stat.table.df <- stat.table.df[order(stat.table.df$Freq, decreasing=T),] # Order by most frequent from -> to station | |
# Pull out used and unused | |
stat.used <- stat.table.df[stat.table.df$Freq > 0,] | |
stat.used$scale <- stat.used$Freq / sum(stat.used$Freq) | |
stat.unused <- stat.table.df[stat.table.df$Freq == 0,] | |
# Plot up top 15 stations as a heatmap. Cant plot up full stations because of large number in a sparse matrix | |
ggplot(stat.used[1:15,], aes(Var1, Var2)) + geom_tile(aes(fill=Freq)) + | |
scale_fill_gradient(low="white", high="steelblue") + labs(x="",y="") + | |
theme(axis.text.x=element_text(angle=90), axis.ticks=element_blank(), legend.position="none") + labs(title="Usage of top 15 Stations") | |
ggsave(file="heatmap.png") | |
# Try and make map of routes | |
# merge locations with coordinates | |
data.tmp <- ddply(data.frame(stations=c(data$from_station_name, data$to_station_name)), .(stations), summarize, log.freq=log(length(stations))) | |
data.merged <- merge(data.tmp, data.stations, by.x="stations", by.y="name") | |
# Get map | |
map <- get_map(location=c(lon=mean(data.merged$longitude), lat=mean(data.merged$latitude)), zoom=12, color="bw") | |
# Quick look at where the uses are | |
ggmap(map) + geom_point(aes(x=longitude, y=latitude, colour=log.freq), data=data.merged) + scale_colour_gradient(low="white", high="black") + | |
theme(axis.title.x=element_blank(), axis.title.y=element_blank()) + labs(title="Station Use") | |
ggsave(file="mapColorFreq.png") | |
# Overlay a heatmap onto it, using different colours | |
ggmap(map) + stat_density2d(mapping=aes(x=longitude, y=latitude, fill=log.freq, alpha=..level..), geom="polygon",data=data.merged) + | |
geom_point(aes(x=longitude, y=latitude, colour=log.freq), data=data.merged) + scale_colour_gradient(low="black", high="steelblue") + | |
theme(legend.position="none", axis.title.x=element_blank(), axis.title.y=element_blank()) + labs(title="Station Use HeatMap") | |
ggsave(file="mapHeatFreq.png") |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
setwd("C:\\code\\Divvy_Stations_Trips_2013") | |
## Read data | |
data <- read.csv("Divvy_Trips_2013.csv", stringsAsFactors=F) | |
## Look at what kind of data we have first | |
summary(data) | |
## starttime and stoptime should be a POSIXct class, also want to change it to ISO style of %Y-%m-%d %H:%M:%S, makes stuff later easier | |
# Take a look at values to know format | |
head(data$starttime) | |
head(data$stoptime) # check to make sure data is same format between the two | |
# Format then is %m/%d/%Y %HH:%MM | |
time.format <- "%m/%d/%Y %H:%M" | |
data$starttime <- as.POSIXct(data$starttime, time.format, tz="America/Chicago") | |
data$stoptime <- as.POSIXct(data$stoptime, time.format, tz="America/Chicago") | |
## From this tripduration/from_station_id/to_station_id are reported as characters but really should be integers | |
# Find out what values we are dealing with | |
unique(data$tripduration) | |
sort(unique(data$to_station_id))) | |
sort(unique(data$from_station_id)) | |
# From this the easiest fix is tripduration. Values have a , in them! | |
data$tripduration <- as.numeric(gsub(",", "", data$tripduration)) | |
# Next up lets find out what is causing the #N/A in to_station_id and from_station_id | |
tmp_1 <- which(data$to_station_id == "#N/A") | |
head(data[tmp_1,]) | |
# Looks like its Congress Pkwy & Ogden Ave that may be causing the issue, but lets double check this | |
unique(data[tmp_1,]$to_station_name) | |
# only get the one, now lets check again for from_station_name. Most likely will have the same issue | |
tmp_2 <- which(data$from_station_id == "#N/A") | |
head(data[tmp_2,]) | |
unique(data[tmp_2,]$from_station_name) | |
# Again same issue, so lets assign our own station id | |
known_station_ids <- sort(unique(as.numeric(c(data$from_station_id, data$to_station_id)))) | |
# Check to see if value is there, if not the station becomes that id | |
tmp_id <- 9999 | |
is.done <- F | |
while (!(is.done)) { | |
if (!(tmp_id %in% known_station_ids)) { | |
data$to_station_id[tmp_1] <- tmp_id | |
data$from_station_id[tmp_2] <- tmp_id | |
is.done <- T | |
} | |
tmp_id <- tmp_id + 1 | |
} | |
# Convert over to numeric | |
data$to_station_id <- as.numeric(data$to_station_id) | |
data$from_station_id <- as.numeric(data$from_station_id) | |
## Save data | |
write.csv(data, "Divvy_Trips_Clean_2013.csv", row.names=F) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment