Created
February 21, 2016 23:23
-
-
Save chrswt/5764ae6088be078c99f9 to your computer and use it in GitHub Desktop.
Tartan Data Science Cup Ep. 1 - Citibike NYC
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
--- | |
title: "Tartan Data Science Cup Episode 1" | |
author: "Christopher Tham (ctham), Eddy Yeo (eyeo), Kevin Ku (kku)" | |
output: html_document | |
--- | |
```{r, include=FALSE} | |
library(animation) | |
library(ggmap) | |
library(ggplot2) | |
library(plyr) | |
library(np) | |
library(gridExtra) | |
library(lubridate) | |
# Adapted from 36-401 dap solutions. | |
# Set-up: Save results so that code blocks aren't re-run unless code | |
# changes (cache), _or_ a relevant earlier code block changed (autodep), | |
# don't clutter R output with messages or warnings, turn off meaningless | |
# significance stars | |
library(knitr) | |
opts_chunk$set(cache=TRUE, autodep=TRUE, message=TRUE, warning=TRUE) | |
options(show.signif.stars=FALSE, np.messages=FALSE) | |
``` | |
_Introduction_ | |
Bike stations have become popular in the recent years especially in large cities such as NYC. In this report, we examine the possible locations to build two additional bike stations to increase ridership. We analyze the different usage patterns between subscribing members, one-off riders, and female riders and take them into account in our recommedations. | |
_Exploratory Data Analysis_ | |
```{r echo = FALSE} | |
bike <- read.csv("20150708-citibike-tripdata.csv") | |
bike$tripduration_mins <- bike$tripduration / 60 | |
bike$gender <- with(bike, | |
ifelse(gender == 0, NA, | |
ifelse(gender == 1, "Male", "Female"))) | |
time_cols <- c("starttime", "stoptime") | |
bike[, time_cols] <- sapply(bike[, time_cols], function(t) { | |
as.POSIXlt(t, format = "%m / %d / %Y %H:%M:%S") | |
}) | |
``` | |
We see from our two histograms below that the distribution of riders is heavily skewed towards males, (we do not have gender data on one-off riders), and towards subscribers. The majority of riders are around 30 years of age, with the median age of subscribers being slightly lower than the median age of one-off riders. Most trips are less than 10 minutes long, although this distribution has a very heavy tail (some riders keep the bikes for two weeks). The histogram of trip durations below shows data only up to the 95% quantile to prevent graphic distortion. The distribution of the start time (hour of day) of bike rentals is clearly bimodal for subscribers. As our data spans a single Wednesday, it is clear that many of the subscribers use bike rentals as a means of commuting to work. | |
From our data, we clearly see a distinct mold of our customers. Majority of them are working age males in their mid-30s who use bikes to commute to work. The caveat is that we are only analyzing data from a single Wednesday, which might bias our analysis. | |
```{r echo = FALSE, fig.width = 10} | |
gender_bar <- ggplot(bike) + | |
aes(x = gender) + | |
geom_bar() + | |
ggtitle("Gender (Subscribers-only)") + | |
labs(x = "Gender", y = "Count") | |
usertype_bar <- ggplot(bike) + | |
aes(x = usertype) + | |
geom_bar() + | |
ggtitle("User Type") + | |
labs(x = "User Type", y = "Count") | |
age_gender_den <- ggplot(bike) + | |
aes(x = 2016 - as.numeric(birth.year), fill = gender) + | |
geom_density(alpha = 0.5, adjust = 1/2) + | |
ggtitle("Rider Age colored by Gender") + | |
labs(x = "Age (years)", y = "Density") | |
# Cut off heavy tail as it distorts the plot. | |
tripduration_95 <- quantile(bike$tripduration, 0.95) | |
duration_hist <- ggplot(bike[bike$tripduration <= tripduration_95, ]) + | |
aes(x = tripduration_mins) + | |
geom_histogram(binwidth = 1) + | |
facet_wrap(~ usertype + gender) + | |
ggtitle("Trip Durations sliced by sliced by demographic") + | |
labs(x = "Duration (mins)", y = "Count") | |
hours_hist <- ggplot(bike) + | |
aes(x = starttime$hour) + | |
geom_histogram(binwidth = 1) + | |
facet_wrap(~ usertype) + | |
ggtitle("Cycling Hours by User Type") + | |
labs(x = "Hour of day", y = "Count") | |
suppressWarnings(grid.arrange(gender_bar, usertype_bar, | |
age_gender_den, duration_hist, | |
hours_hist, | |
ncol = 2)) | |
citibike <- read.csv("20150708-citibike-tripdata.csv") | |
# counting the number of arrivals and departures from each station | |
departures <- ddply(citibike, .(start.station.name, start.station.latitude, start.station.longitude), "nrow") | |
arrivals <- ddply(citibike, .(end.station.name, end.station.latitude, end.station.longitude), "nrow") | |
names(departures)[4] <- "trips" | |
names(arrivals)[4] <- "trips" | |
# plotting the departures | |
dep_map <- get_map(location = "lower manhattan", zoom = 13) | |
dep_mpoints <- ggmap(dep_map) + geom_point(aes(x = start.station.longitude, y = start.station.latitude, size = trips), | |
data = departures, alpha = .5) | |
#plotting the arrivals | |
arr_map <- get_map(location = "lower manhattan", zoom = 13) | |
arr_mpoints <- ggmap(arr_map) + geom_point(aes(x = end.station.longitude, y = end.station.latitude, size = trips), | |
data = arrivals, alpha = .5) | |
# converting start and end times to POSIXt | |
citibike$starttime <- mdy_hms(citibike$starttime) | |
citibike$stoptime <- mdy_hms(citibike$stoptime) | |
# morning slice -> 6am to 8am | |
morning <- subset(citibike, starttime > "2015-07-08 02:00:00" & starttime < "2015-07-08 04:00:00") | |
# evening slice -> 5:30pm to 7:30pm | |
evening <- subset(citibike, starttime > "2015-07-08 13:30" & starttime < "2015-07-08 15:30") | |
# departures in the morning | |
morning_departures <- ddply(morning, .(start.station.name, start.station.latitude, start.station.longitude), "nrow") | |
names(morning_departures)[4] <- "trips" | |
dep_morning <- ggmap(dep_map) + geom_point(aes(x = start.station.longitude, y = start.station.latitude, size = trips), | |
data = morning_departures, alpha = .5) | |
##penn station and times square | |
# arrivals in the morning | |
morning_arrivals <- ddply(morning, .(end.station.name, end.station.latitude, end.station.longitude), "nrow") | |
names(morning_arrivals)[4] <- "trips" | |
arr_morning <- ggmap(arr_map) + geom_point(aes(x = end.station.longitude, y = end.station.latitude, size = trips), | |
data = morning_arrivals, alpha = .5) | |
# departures in the evening | |
evening_departures <- ddply(evening, .(start.station.name, start.station.latitude, start.station.longitude), "nrow") | |
names(evening_departures)[4] <- "trips" | |
dep_evening <- ggmap(dep_map) + geom_point(aes(x = start.station.longitude, y = start.station.latitude, size = trips), | |
data = evening_departures, alpha = .5) | |
# arrivals in the evening | |
evening_arrivals <- ddply(evening, .(end.station.name, end.station.latitude, end.station.longitude), "nrow") | |
names(evening_arrivals)[4] <- "trips" | |
arr_evening <- ggmap(arr_map) + geom_point(aes(x = end.station.longitude, y = end.station.latitude, size = trips), | |
data = evening_arrivals, alpha = .5) | |
suppressWarnings(grid.arrange(dep_morning, arr_morning, | |
dep_evening, arr_evening, | |
ncol = 2)) | |
``` | |
_Modelling and Conclusions_ | |
We used a kernel regression model on the number of landmarks around our bike stations, distances from subway stations and the latitude and longitude to predict the number of total rentals per station. <br> | |
(Y = expected number of users) <br> | |
(X = latitude, longtitude location data, number of nearby landmarks, distance to a subway stop) <br> | |
Using taxi data to identify transportation pick-up hotspots, we identified candidate latitudes and longitudes. From these candidates, we used our model to predict the expected number of rentals, Y, and looked to maximize this value. | |
<br><br> | |
Three locations stood out, but one was dismissed (Madison Square Garden) because of the seasonal traffic fluctuations, as well as the fact that it was a well-serviced area by existing bike rentals. The other 2 locations were: <br> | |
(40.756, -73.991) West 40th and 8th Street. The nearest landmark to this is Madame Tussand's Gallery, and the closest bike station today is 93.5m away. The expected number of daily rentals here would be 462.96. <br> | |
(40.743777, -73.983367) Park Ave. and East 29th Street (Kip's Bay). This is an area with an exceptionally high female demographic, and the expected number of daily rentals here would be 394.85, despite it being 79.9m away from the current existing bike station. | |
```{r include = FALSE} | |
# Code to generate time based animation on start and end points. | |
nyc <- get_map("new york", zoom = 14) | |
nyc_map <- ggmap(nyc, extent = "device", legend = "topleft") | |
plot_usage <- function() { | |
for (t in 1:24) { | |
for (type in c("start", "end")) { | |
if(type == "start") { | |
lat <- bike$start.station.latitude | |
lon <- bike$start.station.longitude | |
} else { | |
lat <- bike$end.station.latitude | |
lon <- bike$end.station.longitude | |
} | |
df <- bike | |
df$lon <- lon | |
df$lat <- lat | |
df = df[df$starttime$hour == t, ] | |
male_plot <- nyc_map + | |
stat_density2d( | |
aes(x = lon, y = lat, | |
fill = ..level.., alpha = ..level..), | |
size = 0.01, | |
data = df[df$gender == "Male", ], | |
geom = "polygon" | |
) + | |
ggtitle("Female") | |
female_plot <- nyc_map + | |
stat_density2d( | |
aes(x = lon, y = lat, | |
fill = ..level.., alpha = ..level..), | |
size = 0.01, | |
data = df[df$gender == "Female", ], | |
geom = "polygon" | |
) + | |
ggtitle("Female") | |
if (type == "end") { | |
male_plot <- male_plot + | |
scale_fill_gradient(low = "black", high= "red") | |
female_plot <- female_plot + | |
scale_fill_gradient(low = "black", high= "red") | |
} | |
print(male_plot) | |
print(female_plot) | |
} | |
} | |
} | |
saveHTML(plot_usage()) | |
``` | |
```{r include = FALSE} | |
# Kernel regression predictive model. | |
station_ids <- data.frame(station_id = unique(bike$start.station.id)) | |
join_data1 <- merge(station_ids, bike, | |
by.x = "station_id", | |
by.y = "start.station.id" ) | |
join_data1$neighbor.station.id <- join_data1$end.station.id | |
join_data1$station_lat <- join_data1$start.station.latitude | |
join_data1$station_lon <- join_data1$start.station.longitude | |
join_data1 <- join_data1[, -which(colnames(join_data1) == "end.station.id")] | |
join_data2 <- merge(station_ids, bike, | |
by.x = "station_id", | |
by.y = "end.station.id" ) | |
join_data2$neighbor.station.id <- join_data2$start.station.id | |
join_data2$station_lat <- join_data2$end.station.latitude | |
join_data2$station_lon <- join_data2$end.station.longitude | |
join_data2 <- join_data2[, -which(colnames(join_data2) == "start.station.id")] | |
join_data <- rbind(join_data1, join_data2) | |
# Remove columns before ddply() to avoid type errors. | |
join_data <- join_data[, -which(colnames(join_data) == time_cols[1])] | |
join_data <- join_data[, -which(colnames(join_data) == time_cols[2])] | |
stations <- ddply(join_data, .(station_id), function(data) { | |
stops <- data.frame(neighbors = length(unique(data$neighbor.station.id))) | |
stops$female <- sum(data$gender == "Female", na.rm = TRUE) | |
stops$male <- sum(data$gender == "Male", na.rm = TRUE) | |
stops$unknown <- sum(is.na(data$gender)) | |
stops$subscriber <- sum(data$usertype == "Subscriber") | |
stops$customer <- sum(data$usertype == "Customer") | |
stops$unique_bikes <- length(unique(data$bikeid)) | |
stops$station_lon <- data$station_lon[1] | |
stops$station_lat <- data$station_lat[1] | |
# Sanity checks. | |
stopifnot(stops$female + stops$male + stops$unknown == nrow(data)) | |
stopifnot(stops$subscriber + stops$customer == nrow(data)) | |
return(stops) | |
}) | |
hist(stations$neighbors) | |
hist(stations$female) | |
hist(stations$male) | |
hist(stations$subscriber) | |
hist(stations$customer) | |
hist(stations$unique_bikes) | |
landmark <- read.csv("bike_landmarks_500m.csv") | |
subway <- read.csv("bike_subway_distance.csv") | |
stations <- merge(stations, landmark, | |
by.x = "station_id", | |
by.y = "Bike.Station") | |
stations <- merge(stations, subway, | |
by.x = "station_id", | |
by.y = "Bike.Station") | |
stations$total <- with(stations, subscriber + customer) | |
npm <- npreg(total ~ Distance..m. + Num.Landmarks + station_lat + station_lon, | |
data = stations) | |
taxis <- read.csv("taxi_candidates.csv") | |
taxis$station_lat <- taxis$Lat | |
taxis$station_lon <- taxis$Lon | |
taxis$Num.Landmarks <- taxis$Num_Landmarks | |
taxis$Distance..m. <- taxis$Distance_Subway | |
predict(npm, newdata = data.frame(station_lat = 40.7528, station_lon = -73.9765, | |
Num.Landmarks = 37, Distance..m. = 117)) | |
predict(npm, newdata = taxis) | |
``` |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment