Skip to content

Instantly share code, notes, and snippets.

@milesgrimshaw
Created August 6, 2014 11:49
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save milesgrimshaw/67d26ffa50a6f9c4f5ef to your computer and use it in GitHub Desktop.
Save milesgrimshaw/67d26ffa50a6f9c4f5ef to your computer and use it in GitHub Desktop.
R script to analyze personal Citibike data
library(ggplot2)
library(lubridate)
## Work from the project folder so the relative file names below resolve
setwd("~/Dropbox (Personal)/Personal/Github/Citibike")
## Load the trip log, leaving text columns as plain character vectors
r <- read.csv("trips.csv", header = TRUE, stringsAsFactors = FALSE)
## Preview the first few rows
head(r)
## Express both durations in minutes (the raw values are divided by 60,
## so they are presumably recorded in seconds)
r$actual_mins <- r$actual_duration / 60
r$estimated_mins <- r$estimated_duration / 60
## Keep only trips shorter than an hour (longer ones in this log are
## recording errors) and save that cleaned subset to disk
r <- subset(r, actual_mins < 60)
write.csv(r, file = "miles_trips_edits.csv", row.names = FALSE)
## Gap between estimated and actual duration, as a fraction of the estimate
## (positive when the ride was faster than estimated)
r$time_diff_perc <- (r$estimated_mins - r$actual_mins) / r$estimated_mins
## Convert the estimated distance from meters to miles.
## One international mile is exactly 1609.344 m; dividing by a rounded 1600
## overstated every distance (and therefore every speed) by about 0.6%.
r$miles <- r$estimated_distance / 1609.344
## Average speed in miles per hour (miles per minute times 60)
r$mph <- (r$miles / r$actual_mins) * 60
## Correcting for errors: keep only trips with a plausible duration and a
## positive speed below 25 mph. which() also drops rows where any of the
## tests is NA, matching what three separate filters would do.
r <- r[which(r$actual_mins < 100 & r$mph > 0 & r$mph < 25), ]
## Summary statistics
## Distance: total, overall spread, and spread for non-zero trips only
with(r, sum(miles))
with(r, summary(miles))
with(r, summary(miles[which(miles > 0)]))
## Time: total actual and estimated minutes, then the spread of actual minutes
with(r, sum(actual_mins))
with(r, sum(estimated_mins))
with(r, summary(actual_mins))
## Estimate-vs-actual gap as a percentage, ignoring Inf/NaN/NA entries
with(r, summary(time_diff_perc[is.finite(time_diff_perc)]) * 100)
## Speed: overall spread, and spread for trips under 20 mph
with(r, summary(mph))
with(r, summary(mph[which(mph < 20)]))
## Parse the start/end strings (month/day/year, 12-hour clock with AM/PM)
## into POSIXct date-times
r$start_date <- as.POSIXct(r$start_time, format = "%m/%d/%Y %I:%M:%S %p", origin = "1970-01-01")
r$end_date <- as.POSIXct(r$end_time, format = "%m/%d/%Y %I:%M:%S %p", origin = "1970-01-01")
## Clock time of each end and start, as minutes since midnight
r$end_date_mins <- 60 * hour(r$end_date) + minute(r$end_date)
r$start_date_mins <- 60 * hour(r$start_date) + minute(r$start_date)
## Build the weekday commute subsets.
## %in% never yields NA, so these logical masks pick exactly the rows whose
## value is known and inside the range.
## Weekdays: wday() values 2 through 6 (with lubridate's default numbering,
## where 1 is Sunday, that is Monday-Friday)
weekday <- r[wday(r$start_date) %in% 2:6, ]
## Morning commute: starts in hours 7-10 and ends at Lafayette St & Jersey St
morn_commute <- weekday[hour(weekday$start_date) %in% 7:10, ]
morn_commute <- morn_commute[morn_commute$end_station %in% "Lafayette St & Jersey St", ]
## Weekday evening: any trip starting in hours 18-23
weekday_evening <- weekday[hour(weekday$start_date) %in% 18:23, ]
## Evening trips leaving Lafayette St & Jersey St
eve_commute <- weekday_evening[weekday_evening$start_station %in% "Lafayette St & Jersey St", ]
## Evening trips arriving at Greenwich Ave & Charles St
eve_commute_home <- weekday_evening[weekday_evening$end_station %in% "Greenwich Ave & Charles St", ]
## Speed summaries: morning commute, all weekday evenings, evening commute
summary(morn_commute$mph)
summary(weekday_evening$mph)
summary(eve_commute$mph)
## Morning commute arrival time, in hours since midnight,
## then the share of arrivals after 9am and at or before 8am
summary(morn_commute$end_date_mins) / 60
sum(morn_commute$end_date_mins / 60 > 9, na.rm = TRUE) / nrow(morn_commute)
sum(morn_commute$end_date_mins / 60 <= 8, na.rm = TRUE) / nrow(morn_commute)
## Evening commute departure time, in hours since midnight,
## then the share of departures after 8pm
summary(eve_commute$start_date_mins) / 60
sum(eve_commute$start_date_mins / 60 > 20, na.rm = TRUE) / nrow(eve_commute)
## Evening arrival time at Greenwich Ave & Charles St, in hours since
## midnight, then the share of arrivals after 10pm
summary(eve_commute_home$end_date_mins) / 60
sum(eve_commute_home$end_date_mins / 60 > 22, na.rm = TRUE) / nrow(eve_commute_home)
## Label every trip "Workday" or "Weekend" for the graphic.
## Trips default to "Weekend"; those whose wday() is 2 through 6 are
## relabelled "Workday". Rows with an unparsed start date stay "Weekend".
r$weekday <- "Weekend"
r$weekday[which(wday(r$start_date) > 1 & wday(r$start_date) < 7)] <- "Workday"
## The column holds the text labels above, so compare against those labels.
## Comparing with 1 and 0 matched no rows and summarised empty vectors.
## Workday vs. weekend speed
summary(r$mph[which(r$weekday == "Workday")])
summary(r$mph[which(r$weekday == "Weekend")])
## Workday vs. weekend distance
summary(r$miles[which(r$weekday == "Workday")])
summary(r$miles[which(r$weekday == "Weekend")])
## Scatter plot of trip speed against start hour, colored by the
## workday/weekend label, written to a landscape letter-size PDF
r$weekday <- factor(r$weekday)
pdf(file = "Plot_Time_Speed_Weekday.pdf", width = 11, height = 8.5)
## Columns are mapped by bare name so ggplot looks them up in `r` itself
## rather than capturing external vectors. print() is explicit because a
## ggplot object is only drawn when printed; without it the PDF is blank
## whenever this script is run through source().
print(
  ggplot(r, aes(x = hour(start_date), y = mph)) +
    geom_point(aes(colour = weekday), position = "jitter") +
    xlab("Hour of the Day") +
    ylab("MPH") +
    ggtitle("Trip Speed by Time of Day")
)
dev.off()
## Histogram of trip distances in tenth-of-a-mile bins
ggplot(data = r, mapping = aes(x = miles)) +
  geom_histogram(binwidth = 0.1, fill = "#0000CC") +
  labs(x = "Miles", y = "Number of Trips", title = "Distribution of Trip Lengths")
## Histogram of trip durations in one-minute bins
ggplot(data = r, mapping = aes(x = actual_mins)) +
  geom_histogram(binwidth = 1)
## Plot distribution of trip speeds (1 mph bins) to a landscape
## letter-size PDF
pdf(file = "Distribution_Trip_Speeds.pdf", width = 11, height = 8.5)
## print() is explicit because a ggplot object is only drawn when printed;
## without it the PDF is blank whenever this script is run through source().
print(
  ggplot(r, aes(x = mph)) +
    geom_histogram(binwidth = 1, fill = "#0000CC") +
    xlab("MPH") +
    ylab("Number of Trips") +
    ggtitle("Distribution of Trip Speeds")
)
dev.off()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment