-
-
Save ikovsky/c371b5822f5ecdb6add173ad1c328c97 to your computer and use it in GitHub Desktop.
The codes for the NYC yellow taxi project
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Assign.yellow.taxi<-function(strName,months=1:12) { # the end result is the sum of the tables from the various months | |
monthStr <- str_pad(months,2,"left",pad='0') | |
x <- get(paste0(strName,monthStr[1])) | |
colNames <- colnames(x) | |
numColNames <- grep(pattern='Sum$',colNames,value=T) | |
numColNames <- c(numColNames, 'count') | |
keyNames <- setdiff(colNames, numColNames) | |
for (monthToken in monthStr[2:length(months)]) { | |
y<-get(paste0(strName,monthToken)) | |
# Do the sums for numerical columns but taking care of the missing rows by using outer_join | |
w<-full_join(x,y,keyNames) | |
w[is.na(w)]<-0 | |
x<-w[,c(keyNames, paste0(numColNames,'.x'))] | |
y<-w[,paste0(numColNames,'.y')] | |
names(x) <- colNames | |
names(y) <- numColNames | |
x[,numColNames]<-x[,numColNames] + y | |
} | |
return(x) | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Averaging.yellow.taxi<-function(df) { | |
colKeys<-names(df) | |
try(if (!('count' %in% colKeys)) stop("No count in the columns of df")) | |
newCols<-sub('Sum','',colKeys) | |
x<-grep(pattern='Sum$',colKeys,value=T) # grep the column names which end by 'Sum' | |
y<-as.data.frame(matrix(nrow=nrow(df),ncol=length(newCols))) | |
names(y)<-newCols | |
for (key in colKeys) { | |
if (key %in% x) { | |
headStr<-sub('Sum','',key) #collapsing the 'Sum' into an empty string | |
y[[headStr]]<-ifelse(df$count>0,df[[key]]/df$count,numeric(nrow(df))) | |
} | |
else {y[,key] <- df[,key]} | |
} | |
return(y) | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
connection<- green_clean %>% filter(speed>1,speed<80,percent<50,duration<60,Trip_distance<20) %>% | |
group_by(Pickup_Borough=boroughCode_p,Destination_Borough=boroughCode_d) %>% | |
summarise(count=n(),dist=mean(Trip_distance), speed=mean(speed),duration=mean(duration),percent=mean(percent)) | |
connection <- filter(connection,Pickup_Borough!=5,Destination_Borough!=5) | |
connection$Pickup_Borough <- factor(connection$Pickup_Borough,labels=Boroughs[1:4]) | |
connection$Destination_Borough <- factor(connection$Destination_Borough,labels=Boroughs[1:4]) | |
# starting boroughs vs target borough in a bar graph | |
ggplot(data=connection,aes(x=Pickup_Borough,y=count)) + geom_bar(stat='identity',aes(fill=Destination_Borough), | |
position='dodge') + xlab('Starting Borough') + ylab('green taxi rides in 2015') + scale_y_continuous(breaks=seq(0,4e6,by=2e6), | |
labels=c('0M','2M','4M')) | |
# the speed of cross-borough traffic in a 2D heat map, using start, end borough as horizontal, vertical indexes | |
ggplot(connection, aes(x=as.factor(Pickup_Borough), y=as.factor(Destination_Borough),fill=speed)) + geom_tile() + scale_fill_gradientn(colors=c('black','dark red','red', | |
'orange','yellow','white')) + scale_x_discrete(name='Green Taxi Originated Borough') +ggtitle( | |
'speed-origin vs target borough heat map') + ylab('Target Borough') + labs(fill="speed") | |
# To analyze the borough's traffic across different hours in the day | |
by_origin_hour <- green_clean %>% filter(speed>1,speed<80,percent<50,duration<60,Trip_distance<20,boroughCode_p!=5) %>% | |
group_by(Pickup_Borough=factor(boroughCode_p,labels=Boroughs[1:4]),hour=factor(hour)) %>% | |
summarise(count=n(),dist=mean(Trip_distance), speed=mean(speed),duration=mean(duration),percent=mean(percent)) | |
ggplot(data=by_origin_hour,aes(x=as.factor(Pickup_Borough),y=as.factor(hour))) + geom_tile(aes(fill=count)) + scale_fill_gradientn(colors=c('black','dark red','red', | |
'orange','yellow','white'), breaks=seq(1e5,4e5,by=1e5),labels=c('0.1M','0.2M','0.3M','0.4M')) + scale_x_discrete(name='Originated Borough') +ggtitle( | |
'The ride count in an origin vs hour heat map') + ylab('hour') + labs(fill="daily taxi rides") | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Step 1. Data Pre-processing, data cleaning/filtering, etc. | |
# We bundle the 12 monthly NYC green taxi files into two half-year gz files by using cat, >> and gzip utilities. | |
# Loading the NYC geojson polygon file | |
nyc<-geojson_read('nycboroughboundaries.geojson',what='sp') | |
Boroughs = c('Manhattan', 'Bronx', 'Brooklyn', 'Queens', 'Staten Island') #BoroughCode Mapping | |
nyc2<-nyc | |
nyc2@data[,2:3]<-NULL #only keep the borough code, throw away the others | |
green_taxi1<-read.csv('green_tripdata_2015_01-06.csv.gz',stringsAsFactors = F) | |
green_taxi2<-read.csv('green_tripdata_2015_07-12.csv.gz',stringsAsFactors = F) | |
green_taxi1$dummy1 <- NULL #handle the excess ',' in all the rows of the csv files | |
green_taxi1$dummy2 <- NULL | |
green_taxi2$dummy1 <- NULL | |
green_taxi2$dummy2 <- NULL | |
green_taxi<-rbind(green_taxi1,green_taxi2) | |
names(green_taxi) = c("VendorID" , "Pickup_datetime" ,"Dropoff_datetime" ,"Store_and_fwd_flag" , | |
"RateCodeID" , "Pickup_longitude" , "Pickup_latitude" , "Dropoff_longitude" , | |
"Dropoff_latitude" , "Passenger_count" , "Trip_distance" , "Fare_amount" , | |
"Extra" , "MTA_tax" , "Tip_amount" , "Tolls_amount" , | |
"Ehail_fee" , "improvement_surcharge" ,"Total_amount" , "Payment_type" , | |
"Trip_type" ) | |
rm(green_taxi1) | |
rm(green_taxi2) | |
PickupPts<-SpatialPoints(cbind(green_taxi$Pickup_longitude,green_taxi$Pickup_latitude)) | |
PickupPts@proj4string <- nyc@proj4string | |
pickupBoroughCodes<-PickupPts %over% nyc2 | |
rm(PickupPts) | |
green_taxi$boroughCode_p <- pickupBoroughCodes$boroughCode | |
rm(pickupBoroughCodes) | |
DropoffPts<-SpatialPoints(cbind(green_taxi$Dropoff_longitude,green_taxi$Dropoff_latitude)) | |
DropoffPts@proj4string <- nyc@proj4string | |
dropoffBoroughCodes<-DropoffPts %over% nyc2 | |
rm(DropoffPts) | |
green_taxi$boroughCode_d <- dropoffBoroughCodes$boroughCode | |
rm(dropoffBoroughCodes) | |
green_taxi$month <- month(green_taxi$Pickup_datetime) | |
green_taxi$wday <- wday(green_taxi$Pickup_datetime) | |
green_taxi$hour <- hour(green_taxi$Pickup_datetime) | |
# Change the format of datetime from string to POSIXct objects | |
green_taxi$Pickup_datetime <- as.POSIXct(green_taxi$Pickup_datetime,format='%Y-%m-%d %H:%M:%S') | |
green_taxi$Dropoff_datetime <- as.POSIXct(green_taxi$Dropoff_datetime,format='%Y-%m-%d %H:%M:%S') | |
green_taxi$duration <- floor(as.double(green_taxi$Dropoff_datetime-green_taxi$Pickup_datetime)/60.0) | |
green_taxi$percent <- green_taxi$Tip_amount/green_taxi$Fare_amount * 100.0 | |
green_taxi$speed <- green_taxi$Trip_distance/green_taxi$duration *60 | |
save(green_taxi,file='green_tripdata_2015.RData') # saving the raw data to R native binary format for | |
# future usage | |
# Even though the data has no missing values (it has some unused column whose data is missing), the | |
# data is very dirty. Source: untrained human input error, GPS instrument error...,etc. We have to apply | |
# filter to remove those outliers which can damage our analysis. Furthermore, we need to fiter away | |
#taxi rides which are actually FHV (for hire vehicle) rides which can last several hours and a few hundred | |
#dollors. We fiter these rare events away. | |
green_adm <- filter(green_taxi, RateCodeID %in% c(1,2,6), Payment_type %in% 1:2) | |
# filter away non-credit card/cash ride and rides to Westchester county,etc. | |
rm(green_taxi) | |
green_clean<-filter(green_adm,Fare_amount<200,Total_amount<1000,Tolls_amount<30,Fare_amount>0,Trip_distance<50, | |
Trip_distance>0.02,duration<120,duration>=1,speed>0,speed<100,percent<100, | |
!is.na(boroughCode_p),!is.na(boroughCode_d)) | |
# green_clean is the data.frame for which the Time-domain analysis is performed | |
by_wdayNhour<-green_clean %>% group_by(.,wday,hour) %>% summarise(.,speed=mean(speed),duration=mean(duration), | |
Payment_type=mean(Payment_type), dist=mean(Trip_distance),percent=mean(percent),count=n()/365.0) | |
ggplot(by_wdayNhour, aes(x=as.factor(wday), y=as.factor(hour),fill=speed)) + geom_tile() + scale_fill_gradientn(colors=c('black','dark red','red', | |
'orange','yellow','white')) + scale_x_discrete(name='week day', | |
labels=c('1'='S','2'='M','3'='T','4'='W','5'='T','6'='F','7'='S'))+ggtitle( | |
'The green taxi speed in a weekday vs hour heat map') + ylab('hour') + labs(fill="Avg Speed (mph)") |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
by_speed<-green_clean %>% filter(Payment_type==1,speed<50,percent<50) %>% group_by(speed=floor(2*speed)/2) %>% summarise( | |
duration=mean(duration),percent=mean(percent),dist=mean(Trip_distance),count=n()/365.0) | |
# use ggplot to plot the tip percentage w.r.t. speeds | |
ggplot(by_speed, aes(x=(speed), y=percent)) + geom_point(aes(color | |
=count),size=2,shape=1) + scale_x_continuous(name='speed (mph)', | |
breaks=seq(0,50,by=5))+geom_smooth(se=F) + ylab('tip percentage') | |
# to plot the multi-intraday time-windows tip percent vs speed | |
# We refine this into different hours in the day | |
newLevel<-c(rep('0-6am',6),rep('6-9am',3),rep('9am-4pm',7),rep('4-7pm',3),rep('7-12pm',5)) | |
by_speed_H <-green_clean %>% filter(Payment_type==1,speed<30,percent<50) %>% group_by(speed=floor(2*speed)/2,hour=factor(hour)) %>% summarise( | |
duration=mean(duration),percent=mean(percent),dist=mean(Trip_distance),count=n()/365.0) | |
levels(by_speed_H$hour)<-newLevel | |
g<-ggplot(by_speed_H, aes(x=(speed), y=percent)) + scale_x_continuous( | |
name='speed (mph)',breaks=seq(0,50,by=10))+ geom_smooth(se=F,aes(color=I('green'))) + ylab('tip percentages') | |
g+facet_wrap(~hour) + ggtitle('The Tip Percentages at Differerent Speeds') | |
g<-ggplot(by_speed_H, aes(x=(speed), y=duration)) + scale_x_continuous( | |
name='speed (mph)',breaks=seq(0,50,by=10))+geom_smooth(se=F,aes(color=I('green'))) + ylab('duration (minutes)') | |
g+facet_wrap(~hour) + ggtitle('The Ride Duration at Differerent Speeds') | |
# To analyze the percentages of credit card paying passengers who do NOT pay | |
noPay_byspeed<-green_clean %>% filter(Payment_type==1,speed<50,percent==0.0) %>% group_by(speed=floor(2*speed)/2) %>% summarise( | |
duration=mean(duration),percent=mean(percent),dist=mean(Trip_distance),count=n()/365.0) | |
noPay_byspeed$ratio = noPay_byspeed$count/by_speed$count * 100 | |
ggplot(noPay_byspeed, aes(x=(speed), y=ratio)) + geom_point(size=2,shape=1) + scale_x_continuous(name='speed (mph)', | |
breaks=seq(0,50,by=5))+geom_smooth(se=F)+ylab('non-tippers\' population percentages') | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Map.PartialReduce.yellow.taxi<-function(yellow_adm) { | |
yellow_clean<-yellow_adm %>% filter(Fare_amount<200,Total_amount<1000,Tolls_amount<30,Fare_amount>0,Trip_distance<50, | |
Trip_distance>0.02,duration<120,duration>=1,speed>0,speed<100,percent<100) | |
print(paste('month', month, 'yellow_clean number of rows:',nrow(yellow_clean))) | |
x<-select(yellow_clean,month,wday,hour,duration,speed, percent,Tip_amount,Fare_amount,Total_amount,Tolls_amount, | |
Trip_distance,Payment_type,RateCodeID,Passenger_count,boroughCode_p,boroughCode_d) | |
rm(yellow_clean) | |
y<-transmute(x,Passenger_count,month,wday,hour,duration,percent, Fare_amount, speed, Total_amount, Tolls_amount, Trip_distance,Payment_type,RateCodeID,boroughCode_p,boroughCode_d) | |
y<-filter(y,speed<80,percent<50,duration<60,Trip_distance<20) | |
y<-filter(y,Fare_amount<200,Total_amount<1000,Tolls_amount<30,Fare_amount>0,Trip_distance<20, | |
Trip_distance>0.02,duration<120,duration>=1,speed>0, | |
!is.na(boroughCode_p),!is.na(boroughCode_d)) | |
by_passenger <- y %>% group_by(Passenger_count) %>% summarise(count=n()) | |
by_wday_hour <- y %>% group_by(wday,hour) %>% summarise(percentSum=sum(percent), durationSum=sum(duration), speedSum=sum(speed), distSum=sum(Trip_distance), count=n()) | |
by_month <- y %>% group_by(month) %>% summarise(percentSum=sum(percent), durationSum=sum(duration), speedSum=sum(speed), distSum=sum(Trip_distance), count=n()) | |
by_wday <- y %>% group_by(wday) %>% summarise(percentSum=sum(percent), durationSum=sum(duration), speedSum=sum(speed), distSum=sum(Trip_distance), count=n()) | |
by_hour <- y %>% group_by(hour) %>% summarise(percentSum=sum(percent), durationSum=sum(duration), speedSum=sum(speed), distSum=sum(Trip_distance), count=n()) | |
by_origin_hour<- y %>% group_by(Pickup_Borough=factor(boroughCode_p,labels=Boroughs),hour=factor(hour)) %>% | |
summarise(percentSum=sum(percent), durationSum=sum(duration), speedSum=sum(speed), distSum=sum(Trip_distance), count=n()) | |
by_speed <- y %>% filter(Payment_type==1,speed<50) %>% group_by(speed=floor(speed*2)*0.5) %>% summarise(percentSum=sum(percent), durationSum=sum(duration), distSum=sum(Trip_distance), count=n()) | |
by_speed_hour <- y %>% filter(Payment_type==1,speed<50) %>% group_by(speed=floor(speed*2)*0.5,hour) %>% summarise(percentSum=sum(percent), durationSum=sum(duration), distSum=sum(Trip_distance), count=n()) | |
by_duration <- y %>% filter(duration<60) %>% group_by(duration=floor(2*duration)*0.5) %>% summarise(percentSum=sum(percent),speedSum=sum(speed),distSum=sum(Trip_distance), count=n()) | |
by_dist <- y %>% filter(Trip_distance<10) %>% group_by(dist=floor(2*Trip_distance)*0.5) %>% summarise(percentSum=sum(percent),speedSum=sum(speed),durationSum=sum(duration), count=n()) | |
connection<- y %>% group_by(Pickup_Borough=factor(boroughCode_p,labels=Boroughs), | |
Destination_Borough=factor(boroughCode_d,labels=Boroughs)) %>% | |
summarise(percentSum=sum(percent), durationSum=sum(duration), speedSum=sum(speed), distSum=sum(Trip_distance), count=n()) | |
pay_byspeed_H <- y %>% filter(Payment_type==1,speed<50,percent>0,percent<50) %>% group_by(speed=floor(2*speed)/2,hour=factor(hour)) %>% summarise( | |
durationSum=sum(duration),percentSum=sum(percent),distSum=sum(Trip_distance),count=n()) | |
noPay_byspeed_H <- y %>% filter(Payment_type==1,speed<50,percent==0) %>% group_by(speed=floor(2*speed)/2,hour=factor(hour)) %>% summarise( | |
durationSum=sum(duration),distSum=sum(Trip_distance),count=n()) | |
newLevel<-c(rep('0-6am',6),rep('6-9am',3),rep('9am-4pm',7),rep('4-7pm',3),rep('7-12pm',5)) | |
myList <- c('noTipG_by_speed','tipG_by_speed', 'by_passenger', 'by_wday_hour','by_speed', 'pay_byspeed_H','noPay_byspeed_H', | |
'by_month','by_wday', 'by_hour', 'by_speed_hour','by_origin_hour', 'by_dist', 'by_duration','connection') | |
noTipG <-filter(y,percent==0,Payment_type==1) | |
noTipG_by_speed <- noTipG %>% group_by(speed=floor(speed*2)*0.5) %>% summarise( | |
percentSum=sum(percent),durationSum=sum(duration),distSum=sum(Trip_distance),count=n()) | |
tipG <- filter(y,percent>0,Payment_type==1) | |
tipG_by_speed <- tipG %>% group_by(speed=floor(speed*2)*0.5) %>% summarise( | |
percentSum=sum(percent), durationSum=sum(duration), distSum=sum(Trip_distance), count=n()) | |
for (vName in myList) { | |
assign(paste0(vName,months[month]), get(vName)) | |
} | |
dump(paste0(myList,months[month]),file='yellow_taxi_dump.R',append=T) | |
rm(x,y) | |
return(myList) | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Preprocessing.Yellow.Taxi<-function(fileName,nyc) { | |
library(saves) | |
library(stringr) | |
library(sp) | |
Boroughs = c('Manhattan', 'Bronx', 'Brooklyn', 'Queens', 'Staten Island') #BoroughCode Mapping | |
nyc2<-nyc | |
nyc2@data[,2:3]<-NULL | |
yellow_taxi<-read.csv(fileName,stringsAsFactors = F) | |
print(paste("Finish reading the yellow taxi file for month",month)) | |
names(yellow_taxi)=c("VendorID","Pickup_datetime","Dropoff_datetime", | |
"Passenger_count","Trip_distance","Pickup_longitude","Pickup_latitude","RateCodeID","Store_and_fwd_flag", | |
"Dropoff_longitude","Dropoff_latitude","Payment_type","Fare_amount","Extra", | |
"Mta_tax","Tip_amount","Tolls_amount","Improvement_surcharge","Total_amount") | |
# Convert the long, lat info to the NYC borough categories | |
PickupPts<-SpatialPoints(cbind(yellow_taxi$Pickup_longitude,yellow_taxi$Pickup_latitude)) | |
PickupPts@proj4string <- nyc@proj4string | |
pickupBoroughCodes<-PickupPts %over% nyc2 | |
rm(PickupPts) | |
yellow_taxi$boroughCode_p <- pickupBoroughCodes$boroughCode | |
rm(pickupBoroughCodes) | |
DropoffPts<-SpatialPoints(cbind(yellow_taxi$Dropoff_longitude,yellow_taxi$Dropoff_latitude)) | |
DropoffPts@proj4string <- nyc@proj4string | |
dropoffBoroughCodes<-DropoffPts %over% nyc2 | |
rm(DropoffPts) | |
yellow_taxi$boroughCode_d <- dropoffBoroughCodes$boroughCode | |
rm(dropoffBoroughCodes) | |
# convert the datetime string into month, wday, hour, etc. | |
yellow_taxi$month = month(yellow_taxi$Pickup_datetime) | |
yellow_taxi$wday = wday(yellow_taxi$Pickup_datetime) | |
yellow_taxi$hour = hour(yellow_taxi$Pickup_datetime) | |
# add derived variables which we will analyze | |
yellow_taxi$duration = floor(as.double(as.POSIXct(yellow_taxi$Dropoff_datetime,format='%Y-%m-%d %H:%M:%S') | |
-as.POSIXct(yellow_taxi$Pickup_datetime,format='%Y-%m-%d %H:%M:%S'))/60.0) | |
yellow_taxi$speed = (yellow_taxi$Trip_distance/yellow_taxi$duration)*60 | |
yellow_taxi$percent = (yellow_taxi$Tip_amount/yellow_taxi$Fare_amount) *100 | |
yellow_taxi$Pickup_datetime<-NULL | |
yellow_taxi$Dropoff_datetime<-NULL | |
# Restrict to the credit card/cash payment, standard fare, JFK rides and group rides, etc. | |
# we do not intend to include negotiated fares or payment by non-standard method. | |
yellow_adm <- filter(yellow_taxi,RateCodeID %in% c(1,2,6),Payment_type %in% 1:2) | |
print(paste('month', month, 'raw number of rows:', nrow(yellow_taxi))) | |
rm(yellow_taxi) | |
return(yellow_adm) | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# After restarting a new session (or rm(list=ls()) ), assuming that the relavent data has been saved into the files | |
library(dplyr) | |
library(lubridate) | |
library(reshape2) | |
library(stringr) | |
library(saves) | |
library(ggplot2) | |
library(ggmap) | |
source('yellow_taxi_dump.R') # source and dump to the file, dump allows writing in an appending mode | |
# varNames are from the previous session | |
by_wday <- Assign.yellow.taxi('by_wday') # aggregating the monthly tables into a single data frame by aggregating | |
by_wday <- Averaging.yellow.taxi(by_wday) #conver the sum statistics to mean statistics | |
# the graphing to ggplot is not display as they are almost identical to the green taxis'. | |
# in the following, demonstrate how to plot the origination locaitons of the yellow taxis | |
nyc_map_g_str <- get_map(location = "new york city", zoom = 11) # google map of the NYC | |
months = str_pad(1:12,2,"left",pad='0') | |
yellow_files = paste0('yellow_adm_2015_',months,'.RDatas') | |
colNames = c('Pickup_longitude','Pickup_latitude','percent','boroughCode_p','boroughCode_d') | |
colNames = c('Dropoff_longitude','Dropoff_latitude','boroughCode_p','boroughCode_d') | |
inNYC = list() | |
for (i in 1:12) { | |
yellow_partial<-loads(file=yellow_files[i],variables=colNames, to.data.frame=T) | |
# test the boroughCodes to determine if the origin and the destination are both in the city | |
inNYC[[i]] <- (!is.na(yellow_partial$boroughCode_p))&(!is.na(yellow_partial$boroughCode_d)) | |
rm(yellow_partial) | |
if (i%%3==0) {gc()} # call the garbage collector | |
} | |
colNames <- c('Pickup_longitude','Pickup_latitude','duration','percent') | |
yellow_samples = as.data.frame(matrix(nrow=0,ncol=length(colNames))) | |
colnames(yellow_samples) <- colNames | |
for (i in 1:12) { | |
yellow_partial<-loads(file=yellow_files[i],variables=colNames, to.data.frame=T) | |
yellow_samples <- rbind(yellow_samples, sample_frac(filter(yellow_partial,inNYC[[i]]),size=0.015)) | |
rm(yellow_partial) # tell the garbage collector that it can discard the variable | |
if (i%%3==0) { gc() } # calling garbage collector every 3 iteration steps | |
} | |
yellow_sample <- transmute(yellow_samples, lon=signif(Pickup_longitude,5), lat=signif(Pickup_latitude,5)) # truncate the long-lat coordinates | |
#or for drop off locations | |
#yellow_sample <- transmute(yellow_samples, lon=signif(Dropoff_longitude,5), lat=signif(Dropoff_latitude,5)) | |
rm(yellow_samples) | |
yellow_sample<-filter(yellow_sample,lon > -80,lon< -65,lat > 39,lat< 46) | |
# rounding the long, lat to the desired accuracies | |
ggmap(nyc_map_g_str, extent = "device") + stat_density2d(data = yellow_sample, aes(x = lon, y = lat, fill = ..level.., alpha = ..level..), size = 0.01, bins = 80, geom = "polygon") + scale_fill_gradient(low = "blue", | |
high = "red") + scale_alpha(range = c(0, 0.3), guide = FALSE)+ theme(legend.position='none') +ggtitle('yellow taxi origination plot') | |
rm(yellow_sample) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(dplyr) | |
library(lubridate) | |
library(reshape2) | |
library(stringr) | |
library(ggmap) | |
library(ggplot2) | |
library(ggrepel) | |
library(grid) | |
library(saves) | |
library(geojsonio) | |
library(sp) | |
library(rgdal) | |
nyc<-geojson_read('nycboroughboundaries.geojson',what='sp') # load the nyc spatial polygon data frame | |
Boroughs = c('Manhattan', 'Bronx', 'Brooklyn', 'Queens', 'Staten Island') #BoroughCode Mapping | |
months = str_pad(1:12,2,"left",pad='0') | |
yellow_taxi_files = paste0('yellow_tripdata_2015-',months,'.csv.gz') | |
yellow_taxi_output_Files = paste0('yellow_adm_2015_',months,'.RDatas') | |
varNames = c() # will store the header names of the variables in yellow_taxi_dump.R | |
# The bottleneck of our computation is the RAM ~ 16G. Otherwise, we could have used | |
#the multi-threading support of R (FOREACH) to parallelly compute the monthly map and reduce steps | |
#as they are totally un-related. | |
for (month in 1:12) { | |
yellow_adm <- Preprocessing.Yellow.Taxi(yellow_taxi_files[month],nyc) | |
# 'saves' stores the yellow_adm as many 1 column data.frames | |
saves(yellow_adm,file=yellow_taxi_output_Files[month],overwrite=T) | |
varNames <- Map.PartialReduce.yellow.taxi(yellow_adm) | |
rm(yellow_adm) | |
gc() | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment