Skip to content

Instantly share code, notes, and snippets.

@ikovsky
Last active Feb 2, 2017
Embed
What would you like to do?
The codes for the NYC yellow taxi project
Assign.yellow.taxi<-function(strName,months=1:12) { # the end result is the sum of the tables from the various months
monthStr <- str_pad(months,2,"left",pad='0')
x <- get(paste0(strName,monthStr[1]))
colNames <- colnames(x)
numColNames <- grep(pattern='Sum$',colNames,value=T)
numColNames <- c(numColNames, 'count')
keyNames <- setdiff(colNames, numColNames)
for (monthToken in monthStr[2:length(months)]) {
y<-get(paste0(strName,monthToken))
# Do the sums for numerical columns but taking care of the missing rows by using outer_join
w<-full_join(x,y,keyNames)
w[is.na(w)]<-0
x<-w[,c(keyNames, paste0(numColNames,'.x'))]
y<-w[,paste0(numColNames,'.y')]
names(x) <- colNames
names(y) <- numColNames
x[,numColNames]<-x[,numColNames] + y
}
return(x)
}
Averaging.yellow.taxi<-function(df) {
colKeys<-names(df)
try(if (!('count' %in% colKeys)) stop("No count in the columns of df"))
newCols<-sub('Sum','',colKeys)
x<-grep(pattern='Sum$',colKeys,value=T) # grep the column names which end by 'Sum'
y<-as.data.frame(matrix(nrow=nrow(df),ncol=length(newCols)))
names(y)<-newCols
for (key in colKeys) {
if (key %in% x) {
headStr<-sub('Sum','',key) #collapsing the 'Sum' into an empty string
y[[headStr]]<-ifelse(df$count>0,df[[key]]/df$count,numeric(nrow(df)))
}
else {y[,key] <- df[,key]}
}
return(y)
}
connection<- green_clean %>% filter(speed>1,speed<80,percent<50,duration<60,Trip_distance<20) %>%
group_by(Pickup_Borough=boroughCode_p,Destination_Borough=boroughCode_d) %>%
summarise(count=n(),dist=mean(Trip_distance), speed=mean(speed),duration=mean(duration),percent=mean(percent))
connection <- filter(connection,Pickup_Borough!=5,Destination_Borough!=5)
connection$Pickup_Borough <- factor(connection$Pickup_Borough,labels=Boroughs[1:4])
connection$Destination_Borough <- factor(connection$Destination_Borough,labels=Boroughs[1:4])
# starting boroughs vs target borough in a bar graph
ggplot(data=connection,aes(x=Pickup_Borough,y=count)) + geom_bar(stat='identity',aes(fill=Destination_Borough),
position='dodge') + xlab('Starting Borough') + ylab('green taxi rides in 2015') + scale_y_continuous(breaks=seq(0,4e6,by=2e6),
labels=c('0M','2M','4M'))
# the speed of cross-borough traffic in a 2D heat map, using start, end borough as horizontal, vertical indexes
ggplot(connection, aes(x=as.factor(Pickup_Borough), y=as.factor(Destination_Borough),fill=speed)) + geom_tile() + scale_fill_gradientn(colors=c('black','dark red','red',
'orange','yellow','white')) + scale_x_discrete(name='Green Taxi Originated Borough') +ggtitle(
'speed-origin vs target borough heat map') + ylab('Target Borough') + labs(fill="speed")
# To analyze the borough's traffic across different hours in the day
by_origin_hour <- green_clean %>% filter(speed>1,speed<80,percent<50,duration<60,Trip_distance<20,boroughCode_p!=5) %>%
group_by(Pickup_Borough=factor(boroughCode_p,labels=Boroughs[1:4]),hour=factor(hour)) %>%
summarise(count=n(),dist=mean(Trip_distance), speed=mean(speed),duration=mean(duration),percent=mean(percent))
ggplot(data=by_origin_hour,aes(x=as.factor(Pickup_Borough),y=as.factor(hour))) + geom_tile(aes(fill=count)) + scale_fill_gradientn(colors=c('black','dark red','red',
'orange','yellow','white'), breaks=seq(1e5,4e5,by=1e5),labels=c('0.1M','0.2M','0.3M','0.4M')) + scale_x_discrete(name='Originated Borough') +ggtitle(
'The ride count in an origin vs hour heat map') + ylab('hour') + labs(fill="daily taxi rides")
# Step 1. Data Pre-processing, data cleaning/filtering, etc.
# We bundle the 12 monthly NYC green taxi files into two half-year gz files by using cat, >> and gzip utilities.
# Loading the NYC geojson polygon file
nyc<-geojson_read('nycboroughboundaries.geojson',what='sp')
Boroughs = c('Manhattan', 'Bronx', 'Brooklyn', 'Queens', 'Staten Island') #BoroughCode Mapping
nyc2<-nyc
nyc2@data[,2:3]<-NULL #only keep the borough code, throw away the others
green_taxi1<-read.csv('green_tripdata_2015_01-06.csv.gz',stringsAsFactors = F)
green_taxi2<-read.csv('green_tripdata_2015_07-12.csv.gz',stringsAsFactors = F)
green_taxi1$dummy1 <- NULL #handle the excess ',' in all the rows of the csv files
green_taxi1$dummy2 <- NULL
green_taxi2$dummy1 <- NULL
green_taxi2$dummy2 <- NULL
green_taxi<-rbind(green_taxi1,green_taxi2)
names(green_taxi) = c("VendorID" , "Pickup_datetime" ,"Dropoff_datetime" ,"Store_and_fwd_flag" ,
"RateCodeID" , "Pickup_longitude" , "Pickup_latitude" , "Dropoff_longitude" ,
"Dropoff_latitude" , "Passenger_count" , "Trip_distance" , "Fare_amount" ,
"Extra" , "MTA_tax" , "Tip_amount" , "Tolls_amount" ,
"Ehail_fee" , "improvement_surcharge" ,"Total_amount" , "Payment_type" ,
"Trip_type" )
rm(green_taxi1)
rm(green_taxi2)
PickupPts<-SpatialPoints(cbind(green_taxi$Pickup_longitude,green_taxi$Pickup_latitude))
PickupPts@proj4string <- nyc@proj4string
pickupBoroughCodes<-PickupPts %over% nyc2
rm(PickupPts)
green_taxi$boroughCode_p <- pickupBoroughCodes$boroughCode
rm(pickupBoroughCodes)
DropoffPts<-SpatialPoints(cbind(green_taxi$Dropoff_longitude,green_taxi$Dropoff_latitude))
DropoffPts@proj4string <- nyc@proj4string
dropoffBoroughCodes<-DropoffPts %over% nyc2
rm(DropoffPts)
green_taxi$boroughCode_d <- dropoffBoroughCodes$boroughCode
rm(dropoffBoroughCodes)
green_taxi$month <- month(green_taxi$Pickup_datetime)
green_taxi$wday <- wday(green_taxi$Pickup_datetime)
green_taxi$hour <- hour(green_taxi$Pickup_datetime)
# Change the format of datetime from string to POSIXct objects
green_taxi$Pickup_datetime <- as.POSIXct(green_taxi$Pickup_datetime,format='%Y-%m-%d %H:%M:%S')
green_taxi$Dropoff_datetime <- as.POSIXct(green_taxi$Dropoff_datetime,format='%Y-%m-%d %H:%M:%S')
green_taxi$duration <- floor(as.double(green_taxi$Dropoff_datetime-green_taxi$Pickup_datetime)/60.0)
green_taxi$percent <- green_taxi$Tip_amount/green_taxi$Fare_amount * 100.0
green_taxi$speed <- green_taxi$Trip_distance/green_taxi$duration *60
save(green_taxi,file='green_tripdata_2015.RData') # saving the raw data to R native binary format for
# future usage
# Even though the data has no missing values (it has some unused column whose data is missing), the
# data is very dirty. Source: untrained human input error, GPS instrument error...,etc. We have to apply
# filter to remove those outliers which can damage our analysis. Furthermore, we need to fiter away
#taxi rides which are actually FHV (for hire vehicle) rides which can last several hours and a few hundred
#dollors. We fiter these rare events away.
green_adm <- filter(green_taxi, RateCodeID %in% c(1,2,6), Payment_type %in% 1:2)
# filter away non-credit card/cash ride and rides to Westchester county,etc.
rm(green_taxi)
green_clean<-filter(green_adm,Fare_amount<200,Total_amount<1000,Tolls_amount<30,Fare_amount>0,Trip_distance<50,
Trip_distance>0.02,duration<120,duration>=1,speed>0,speed<100,percent<100,
!is.na(boroughCode_p),!is.na(boroughCode_d))
# green_clean is the data.frame for which the Time-domain analysis is performed
by_wdayNhour<-green_clean %>% group_by(.,wday,hour) %>% summarise(.,speed=mean(speed),duration=mean(duration),
Payment_type=mean(Payment_type), dist=mean(Trip_distance),percent=mean(percent),count=n()/365.0)
ggplot(by_wdayNhour, aes(x=as.factor(wday), y=as.factor(hour),fill=speed)) + geom_tile() + scale_fill_gradientn(colors=c('black','dark red','red',
'orange','yellow','white')) + scale_x_discrete(name='week day',
labels=c('1'='S','2'='M','3'='T','4'='W','5'='T','6'='F','7'='S'))+ggtitle(
'The green taxi speed in a weekday vs hour heat map') + ylab('hour') + labs(fill="Avg Speed (mph)")
by_speed<-green_clean %>% filter(Payment_type==1,speed<50,percent<50) %>% group_by(speed=floor(2*speed)/2) %>% summarise(
duration=mean(duration),percent=mean(percent),dist=mean(Trip_distance),count=n()/365.0)
# use ggplot to plot the tip percentage w.r.t. speeds
ggplot(by_speed, aes(x=(speed), y=percent)) + geom_point(aes(color
=count),size=2,shape=1) + scale_x_continuous(name='speed (mph)',
breaks=seq(0,50,by=5))+geom_smooth(se=F) + ylab('tip percentage')
# to plot the multi-intraday time-windows tip percent vs speed
# We refine this into different hours in the day
newLevel<-c(rep('0-6am',6),rep('6-9am',3),rep('9am-4pm',7),rep('4-7pm',3),rep('7-12pm',5))
by_speed_H <-green_clean %>% filter(Payment_type==1,speed<30,percent<50) %>% group_by(speed=floor(2*speed)/2,hour=factor(hour)) %>% summarise(
duration=mean(duration),percent=mean(percent),dist=mean(Trip_distance),count=n()/365.0)
levels(by_speed_H$hour)<-newLevel
g<-ggplot(by_speed_H, aes(x=(speed), y=percent)) + scale_x_continuous(
name='speed (mph)',breaks=seq(0,50,by=10))+ geom_smooth(se=F,aes(color=I('green'))) + ylab('tip percentages')
g+facet_wrap(~hour) + ggtitle('The Tip Percentages at Differerent Speeds')
g<-ggplot(by_speed_H, aes(x=(speed), y=duration)) + scale_x_continuous(
name='speed (mph)',breaks=seq(0,50,by=10))+geom_smooth(se=F,aes(color=I('green'))) + ylab('duration (minutes)')
g+facet_wrap(~hour) + ggtitle('The Ride Duration at Differerent Speeds')
# To analyze the percentages of credit card paying passengers who do NOT pay
noPay_byspeed<-green_clean %>% filter(Payment_type==1,speed<50,percent==0.0) %>% group_by(speed=floor(2*speed)/2) %>% summarise(
duration=mean(duration),percent=mean(percent),dist=mean(Trip_distance),count=n()/365.0)
noPay_byspeed$ratio = noPay_byspeed$count/by_speed$count * 100
ggplot(noPay_byspeed, aes(x=(speed), y=ratio)) + geom_point(size=2,shape=1) + scale_x_continuous(name='speed (mph)',
breaks=seq(0,50,by=5))+geom_smooth(se=F)+ylab('non-tippers\' population percentages')
Map.PartialReduce.yellow.taxi<-function(yellow_adm) {
yellow_clean<-yellow_adm %>% filter(Fare_amount<200,Total_amount<1000,Tolls_amount<30,Fare_amount>0,Trip_distance<50,
Trip_distance>0.02,duration<120,duration>=1,speed>0,speed<100,percent<100)
print(paste('month', month, 'yellow_clean number of rows:',nrow(yellow_clean)))
x<-select(yellow_clean,month,wday,hour,duration,speed, percent,Tip_amount,Fare_amount,Total_amount,Tolls_amount,
Trip_distance,Payment_type,RateCodeID,Passenger_count,boroughCode_p,boroughCode_d)
rm(yellow_clean)
y<-transmute(x,Passenger_count,month,wday,hour,duration,percent, Fare_amount, speed, Total_amount, Tolls_amount, Trip_distance,Payment_type,RateCodeID,boroughCode_p,boroughCode_d)
y<-filter(y,speed<80,percent<50,duration<60,Trip_distance<20)
y<-filter(y,Fare_amount<200,Total_amount<1000,Tolls_amount<30,Fare_amount>0,Trip_distance<20,
Trip_distance>0.02,duration<120,duration>=1,speed>0,
!is.na(boroughCode_p),!is.na(boroughCode_d))
by_passenger <- y %>% group_by(Passenger_count) %>% summarise(count=n())
by_wday_hour <- y %>% group_by(wday,hour) %>% summarise(percentSum=sum(percent), durationSum=sum(duration), speedSum=sum(speed), distSum=sum(Trip_distance), count=n())
by_month <- y %>% group_by(month) %>% summarise(percentSum=sum(percent), durationSum=sum(duration), speedSum=sum(speed), distSum=sum(Trip_distance), count=n())
by_wday <- y %>% group_by(wday) %>% summarise(percentSum=sum(percent), durationSum=sum(duration), speedSum=sum(speed), distSum=sum(Trip_distance), count=n())
by_hour <- y %>% group_by(hour) %>% summarise(percentSum=sum(percent), durationSum=sum(duration), speedSum=sum(speed), distSum=sum(Trip_distance), count=n())
by_origin_hour<- y %>% group_by(Pickup_Borough=factor(boroughCode_p,labels=Boroughs),hour=factor(hour)) %>%
summarise(percentSum=sum(percent), durationSum=sum(duration), speedSum=sum(speed), distSum=sum(Trip_distance), count=n())
by_speed <- y %>% filter(Payment_type==1,speed<50) %>% group_by(speed=floor(speed*2)*0.5) %>% summarise(percentSum=sum(percent), durationSum=sum(duration), distSum=sum(Trip_distance), count=n())
by_speed_hour <- y %>% filter(Payment_type==1,speed<50) %>% group_by(speed=floor(speed*2)*0.5,hour) %>% summarise(percentSum=sum(percent), durationSum=sum(duration), distSum=sum(Trip_distance), count=n())
by_duration <- y %>% filter(duration<60) %>% group_by(duration=floor(2*duration)*0.5) %>% summarise(percentSum=sum(percent),speedSum=sum(speed),distSum=sum(Trip_distance), count=n())
by_dist <- y %>% filter(Trip_distance<10) %>% group_by(dist=floor(2*Trip_distance)*0.5) %>% summarise(percentSum=sum(percent),speedSum=sum(speed),durationSum=sum(duration), count=n())
connection<- y %>% group_by(Pickup_Borough=factor(boroughCode_p,labels=Boroughs),
Destination_Borough=factor(boroughCode_d,labels=Boroughs)) %>%
summarise(percentSum=sum(percent), durationSum=sum(duration), speedSum=sum(speed), distSum=sum(Trip_distance), count=n())
pay_byspeed_H <- y %>% filter(Payment_type==1,speed<50,percent>0,percent<50) %>% group_by(speed=floor(2*speed)/2,hour=factor(hour)) %>% summarise(
durationSum=sum(duration),percentSum=sum(percent),distSum=sum(Trip_distance),count=n())
noPay_byspeed_H <- y %>% filter(Payment_type==1,speed<50,percent==0) %>% group_by(speed=floor(2*speed)/2,hour=factor(hour)) %>% summarise(
durationSum=sum(duration),distSum=sum(Trip_distance),count=n())
newLevel<-c(rep('0-6am',6),rep('6-9am',3),rep('9am-4pm',7),rep('4-7pm',3),rep('7-12pm',5))
myList <- c('noTipG_by_speed','tipG_by_speed', 'by_passenger', 'by_wday_hour','by_speed', 'pay_byspeed_H','noPay_byspeed_H',
'by_month','by_wday', 'by_hour', 'by_speed_hour','by_origin_hour', 'by_dist', 'by_duration','connection')
noTipG <-filter(y,percent==0,Payment_type==1)
noTipG_by_speed <- noTipG %>% group_by(speed=floor(speed*2)*0.5) %>% summarise(
percentSum=sum(percent),durationSum=sum(duration),distSum=sum(Trip_distance),count=n())
tipG <- filter(y,percent>0,Payment_type==1)
tipG_by_speed <- tipG %>% group_by(speed=floor(speed*2)*0.5) %>% summarise(
percentSum=sum(percent), durationSum=sum(duration), distSum=sum(Trip_distance), count=n())
for (vName in myList) {
assign(paste0(vName,months[month]), get(vName))
}
dump(paste0(myList,months[month]),file='yellow_taxi_dump.R',append=T)
rm(x,y)
return(myList)
}
Preprocessing.Yellow.Taxi<-function(fileName,nyc) {
library(saves)
library(stringr)
library(sp)
Boroughs = c('Manhattan', 'Bronx', 'Brooklyn', 'Queens', 'Staten Island') #BoroughCode Mapping
nyc2<-nyc
nyc2@data[,2:3]<-NULL
yellow_taxi<-read.csv(fileName,stringsAsFactors = F)
print(paste("Finish reading the yellow taxi file for month",month))
names(yellow_taxi)=c("VendorID","Pickup_datetime","Dropoff_datetime",
"Passenger_count","Trip_distance","Pickup_longitude","Pickup_latitude","RateCodeID","Store_and_fwd_flag",
"Dropoff_longitude","Dropoff_latitude","Payment_type","Fare_amount","Extra",
"Mta_tax","Tip_amount","Tolls_amount","Improvement_surcharge","Total_amount")
# Convert the long, lat info to the NYC borough categories
PickupPts<-SpatialPoints(cbind(yellow_taxi$Pickup_longitude,yellow_taxi$Pickup_latitude))
PickupPts@proj4string <- nyc@proj4string
pickupBoroughCodes<-PickupPts %over% nyc2
rm(PickupPts)
yellow_taxi$boroughCode_p <- pickupBoroughCodes$boroughCode
rm(pickupBoroughCodes)
DropoffPts<-SpatialPoints(cbind(yellow_taxi$Dropoff_longitude,yellow_taxi$Dropoff_latitude))
DropoffPts@proj4string <- nyc@proj4string
dropoffBoroughCodes<-DropoffPts %over% nyc2
rm(DropoffPts)
yellow_taxi$boroughCode_d <- dropoffBoroughCodes$boroughCode
rm(dropoffBoroughCodes)
# convert the datetime string into month, wday, hour, etc.
yellow_taxi$month = month(yellow_taxi$Pickup_datetime)
yellow_taxi$wday = wday(yellow_taxi$Pickup_datetime)
yellow_taxi$hour = hour(yellow_taxi$Pickup_datetime)
# add derived variables which we will analyze
yellow_taxi$duration = floor(as.double(as.POSIXct(yellow_taxi$Dropoff_datetime,format='%Y-%m-%d %H:%M:%S')
-as.POSIXct(yellow_taxi$Pickup_datetime,format='%Y-%m-%d %H:%M:%S'))/60.0)
yellow_taxi$speed = (yellow_taxi$Trip_distance/yellow_taxi$duration)*60
yellow_taxi$percent = (yellow_taxi$Tip_amount/yellow_taxi$Fare_amount) *100
yellow_taxi$Pickup_datetime<-NULL
yellow_taxi$Dropoff_datetime<-NULL
# Restrict to the credit card/cash payment, standard fare, JFK rides and group rides, etc.
# we do not intend to include negotiated fares or payment by non-standard method.
yellow_adm <- filter(yellow_taxi,RateCodeID %in% c(1,2,6),Payment_type %in% 1:2)
print(paste('month', month, 'raw number of rows:', nrow(yellow_taxi)))
rm(yellow_taxi)
return(yellow_adm)
}
# After restarting a new session (or rm(list=ls()) ), assuming that the relavent data has been saved into the files
library(dplyr)
library(lubridate)
library(reshape2)
library(stringr)
library(saves)
library(ggplot2)
library(ggmap)
source('yellow_taxi_dump.R') # source and dump to the file, dump allows writing in an appending mode
# varNames are from the previous session
by_wday <- Assign.yellow.taxi('by_wday') # aggregating the monthly tables into a single data frame by aggregating
by_wday <- Averaging.yellow.taxi(by_wday) #conver the sum statistics to mean statistics
# the graphing to ggplot is not display as they are almost identical to the green taxis'.
# in the following, demonstrate how to plot the origination locaitons of the yellow taxis
nyc_map_g_str <- get_map(location = "new york city", zoom = 11) # google map of the NYC
months = str_pad(1:12,2,"left",pad='0')
yellow_files = paste0('yellow_adm_2015_',months,'.RDatas')
colNames = c('Pickup_longitude','Pickup_latitude','percent','boroughCode_p','boroughCode_d')
colNames = c('Dropoff_longitude','Dropoff_latitude','boroughCode_p','boroughCode_d')
inNYC = list()
for (i in 1:12) {
yellow_partial<-loads(file=yellow_files[i],variables=colNames, to.data.frame=T)
# test the boroughCodes to determine if the origin and the destination are both in the city
inNYC[[i]] <- (!is.na(yellow_partial$boroughCode_p))&(!is.na(yellow_partial$boroughCode_d))
rm(yellow_partial)
if (i%%3==0) {gc()} # call the garbage collector
}
colNames <- c('Pickup_longitude','Pickup_latitude','duration','percent')
yellow_samples = as.data.frame(matrix(nrow=0,ncol=length(colNames)))
colnames(yellow_samples) <- colNames
for (i in 1:12) {
yellow_partial<-loads(file=yellow_files[i],variables=colNames, to.data.frame=T)
yellow_samples <- rbind(yellow_samples, sample_frac(filter(yellow_partial,inNYC[[i]]),size=0.015))
rm(yellow_partial) # tell the garbage collector that it can discard the variable
if (i%%3==0) { gc() } # calling garbage collector every 3 iteration steps
}
yellow_sample <- transmute(yellow_samples, lon=signif(Pickup_longitude,5), lat=signif(Pickup_latitude,5)) # truncate the long-lat coordinates
#or for drop off locations
#yellow_sample <- transmute(yellow_samples, lon=signif(Dropoff_longitude,5), lat=signif(Dropoff_latitude,5))
rm(yellow_samples)
yellow_sample<-filter(yellow_sample,lon > -80,lon< -65,lat > 39,lat< 46)
# rounding the long, lat to the desired accuracies
ggmap(nyc_map_g_str, extent = "device") + stat_density2d(data = yellow_sample, aes(x = lon, y = lat, fill = ..level.., alpha = ..level..), size = 0.01, bins = 80, geom = "polygon") + scale_fill_gradient(low = "blue",
high = "red") + scale_alpha(range = c(0, 0.3), guide = FALSE)+ theme(legend.position='none') +ggtitle('yellow taxi origination plot')
rm(yellow_sample)
library(dplyr)
library(lubridate)
library(reshape2)
library(stringr)
library(ggmap)
library(ggplot2)
library(ggrepel)
library(grid)
library(saves)
library(geojsonio)
library(sp)
library(rgdal)
nyc<-geojson_read('nycboroughboundaries.geojson',what='sp') # load the nyc spatial polygon data frame
Boroughs = c('Manhattan', 'Bronx', 'Brooklyn', 'Queens', 'Staten Island') #BoroughCode Mapping
months = str_pad(1:12,2,"left",pad='0')
yellow_taxi_files = paste0('yellow_tripdata_2015-',months,'.csv.gz')
yellow_taxi_output_Files = paste0('yellow_adm_2015_',months,'.RDatas')
varNames = c() # will store the header names of the variables in yellow_taxi_dump.R
# The bottleneck of our computation is the RAM ~ 16G. Otherwise, we could have used
#the multi-threading support of R (FOREACH) to parallelly compute the monthly map and reduce steps
#as they are totally un-related.
for (month in 1:12) {
yellow_adm <- Preprocessing.Yellow.Taxi(yellow_taxi_files[month],nyc)
# 'saves' stores the yellow_adm as many 1 column data.frames
saves(yellow_adm,file=yellow_taxi_output_Files[month],overwrite=T)
varNames <- Map.PartialReduce.yellow.taxi(yellow_adm)
rm(yellow_adm)
gc()
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment