ikovsky/Assign.yellow.taxi.R Secret

## Assign.yellow.taxi.R
Assign.yellow.taxi<-function(strName,months=1:12) {   # the end result is the sum of the tables from the various months

  monthStr <- str_pad(months,2,"left",pad='0')
  x        <- get(paste0(strName,monthStr[1]))
  colNames <- colnames(x)
  numColNames <- grep(pattern='Sum$',colNames,value=T)
  numColNames <- c(numColNames, 'count')
  keyNames    <- setdiff(colNames, numColNames)
  for (monthToken in monthStr[2:length(months)]) {

    y<-get(paste0(strName,monthToken))
    # Do the sums for numerical columns but taking care of the missing rows by using outer_join
    w<-full_join(x,y,keyNames)
    w[is.na(w)]<-0
    x<-w[,c(keyNames, paste0(numColNames,'.x'))]
    y<-w[,paste0(numColNames,'.y')]
    names(x) <- colNames
    names(y) <- numColNames
    x[,numColNames]<-x[,numColNames] + y
  }

  return(x)
}

## Averaging.yellow.taxi.R
Averaging.yellow.taxi<-function(df) {

  colKeys<-names(df)
  try(if (!('count' %in% colKeys)) stop("No count in the columns of df"))
  newCols<-sub('Sum','',colKeys)

  x<-grep(pattern='Sum$',colKeys,value=T)  # grep the column names which end by 'Sum'
  y<-as.data.frame(matrix(nrow=nrow(df),ncol=length(newCols)))
  names(y)<-newCols
  for (key in colKeys) {
    if (key %in% x) {
      headStr<-sub('Sum','',key)  #collapsing the 'Sum' into an empty string
      y[[headStr]]<-ifelse(df$count>0,df[[key]]/df$count,numeric(nrow(df)))
    }
    else {y[,key] <- df[,key]}
  }
  return(y)
}

## cross_borough_analysis.R
connection<- green_clean %>% filter(speed>1,speed<80,percent<50,duration<60,Trip_distance<20) %>%
  group_by(Pickup_Borough=boroughCode_p,Destination_Borough=boroughCode_d) %>%
  summarise(count=n(),dist=mean(Trip_distance), speed=mean(speed),duration=mean(duration),percent=mean(percent))

connection <- filter(connection,Pickup_Borough!=5,Destination_Borough!=5)

connection$Pickup_Borough <- factor(connection$Pickup_Borough,labels=Boroughs[1:4])
connection$Destination_Borough <- factor(connection$Destination_Borough,labels=Boroughs[1:4])


# starting boroughs vs target borough in a bar graph

ggplot(data=connection,aes(x=Pickup_Borough,y=count)) + geom_bar(stat='identity',aes(fill=Destination_Borough),
position='dodge') + xlab('Starting Borough') + ylab('green taxi rides in 2015') + scale_y_continuous(breaks=seq(0,4e6,by=2e6),
labels=c('0M','2M','4M'))

# the speed of cross-borough traffic in a 2D heat map, using start, end borough as horizontal, vertical indexes

ggplot(connection, aes(x=as.factor(Pickup_Borough), y=as.factor(Destination_Borough),fill=speed)) + geom_tile() + scale_fill_gradientn(colors=c('black','dark red','red',
    'orange','yellow','white')) + scale_x_discrete(name='Green Taxi Originated Borough') +ggtitle(
'speed-origin vs target borough heat map') + ylab('Target Borough') + labs(fill="speed")

# To analyze the borough's traffic across different hours in the day

by_origin_hour <- green_clean %>% filter(speed>1,speed<80,percent<50,duration<60,Trip_distance<20,boroughCode_p!=5) %>%
  group_by(Pickup_Borough=factor(boroughCode_p,labels=Boroughs[1:4]),hour=factor(hour)) %>%
  summarise(count=n(),dist=mean(Trip_distance), speed=mean(speed),duration=mean(duration),percent=mean(percent))


ggplot(data=by_origin_hour,aes(x=as.factor(Pickup_Borough),y=as.factor(hour))) + geom_tile(aes(fill=count)) + scale_fill_gradientn(colors=c('black','dark red','red',
'orange','yellow','white'), breaks=seq(1e5,4e5,by=1e5),labels=c('0.1M','0.2M','0.3M','0.4M')) + scale_x_discrete(name='Originated Borough') +ggtitle(
'The ride count in an origin vs hour heat map') + ylab('hour') + labs(fill="daily taxi rides")


## green_taxi_2D_heatMap.R
# Step 1. Data Pre-processing, data cleaning/filtering, etc.
# We bundle the 12 monthly NYC green taxi files into two half-year gz files by using cat, >> and gzip utilities.

# Loading the NYC geojson polygon file
nyc<-geojson_read('nycboroughboundaries.geojson',what='sp')
Boroughs = c('Manhattan', 'Bronx', 'Brooklyn', 'Queens', 'Staten Island') #BoroughCode Mapping

nyc2<-nyc
nyc2@data[,2:3]<-NULL #only keep the borough code, throw away the others

green_taxi1<-read.csv('green_tripdata_2015_01-06.csv.gz',stringsAsFactors = F)
green_taxi2<-read.csv('green_tripdata_2015_07-12.csv.gz',stringsAsFactors = F)

green_taxi1$dummy1 <- NULL  #handle the excess ',' in all the rows of the csv files
green_taxi1$dummy2 <- NULL

green_taxi2$dummy1 <- NULL
green_taxi2$dummy2 <- NULL

green_taxi<-rbind(green_taxi1,green_taxi2)

names(green_taxi) = c("VendorID"  ,            "Pickup_datetime"  ,"Dropoff_datetime" ,"Store_and_fwd_flag" ,
                      "RateCodeID"        ,    "Pickup_longitude"  ,    "Pickup_latitude"   ,    "Dropoff_longitude" ,
                      "Dropoff_latitude"   ,   "Passenger_count"    ,   "Trip_distance"     ,    "Fare_amount"  ,
                      "Extra"       ,          "MTA_tax"       ,        "Tip_amount"        ,    "Tolls_amount" ,
                      "Ehail_fee"    ,         "improvement_surcharge" ,"Total_amount"      ,    "Payment_type"  ,
                      "Trip_type" )

rm(green_taxi1)
rm(green_taxi2)

PickupPts<-SpatialPoints(cbind(green_taxi$Pickup_longitude,green_taxi$Pickup_latitude))

PickupPts@proj4string <- nyc@proj4string
pickupBoroughCodes<-PickupPts %over% nyc2
rm(PickupPts)
green_taxi$boroughCode_p <- pickupBoroughCodes$boroughCode
rm(pickupBoroughCodes)

DropoffPts<-SpatialPoints(cbind(green_taxi$Dropoff_longitude,green_taxi$Dropoff_latitude))
DropoffPts@proj4string <- nyc@proj4string
dropoffBoroughCodes<-DropoffPts %over% nyc2
rm(DropoffPts)
green_taxi$boroughCode_d <- dropoffBoroughCodes$boroughCode
rm(dropoffBoroughCodes)

green_taxi$month <- month(green_taxi$Pickup_datetime)
green_taxi$wday  <- wday(green_taxi$Pickup_datetime)
green_taxi$hour  <- hour(green_taxi$Pickup_datetime)

# Change the format of datetime from string to POSIXct objects

green_taxi$Pickup_datetime <- as.POSIXct(green_taxi$Pickup_datetime,format='%Y-%m-%d %H:%M:%S')
green_taxi$Dropoff_datetime <- as.POSIXct(green_taxi$Dropoff_datetime,format='%Y-%m-%d %H:%M:%S')

green_taxi$duration <- floor(as.double(green_taxi$Dropoff_datetime-green_taxi$Pickup_datetime)/60.0)
green_taxi$percent  <- green_taxi$Tip_amount/green_taxi$Fare_amount * 100.0
green_taxi$speed    <- green_taxi$Trip_distance/green_taxi$duration *60

save(green_taxi,file='green_tripdata_2015.RData')  # saving the raw data to R native binary format for
# future usage

# Even though the data has no missing values (it has some unused column whose data is missing), the
# data is very dirty. Source: untrained human input error, GPS instrument error...,etc. We have to apply
# filter to remove those outliers which can damage our analysis. Furthermore, we need to fiter away
#taxi rides which are actually FHV (for hire vehicle) rides which can last several hours and a few hundred
#dollors.  We fiter these rare events away.

green_adm <- filter(green_taxi, RateCodeID %in% c(1,2,6), Payment_type %in% 1:2)
# filter away non-credit card/cash ride and rides to Westchester county,etc.
rm(green_taxi)

green_clean<-filter(green_adm,Fare_amount<200,Total_amount<1000,Tolls_amount<30,Fare_amount>0,Trip_distance<50,
                    Trip_distance>0.02,duration<120,duration>=1,speed>0,speed<100,percent<100,
                    !is.na(boroughCode_p),!is.na(boroughCode_d))
# green_clean is the data.frame for which the Time-domain analysis is performed

by_wdayNhour<-green_clean %>% group_by(.,wday,hour) %>% summarise(.,speed=mean(speed),duration=mean(duration),
Payment_type=mean(Payment_type), dist=mean(Trip_distance),percent=mean(percent),count=n()/365.0)


ggplot(by_wdayNhour, aes(x=as.factor(wday), y=as.factor(hour),fill=speed)) + geom_tile() + scale_fill_gradientn(colors=c('black','dark red','red',
'orange','yellow','white')) + scale_x_discrete(name='week day',
labels=c('1'='S','2'='M','3'='T','4'='W','5'='T','6'='F','7'='S'))+ggtitle(
'The green taxi speed in a weekday vs hour heat map') + ylab('hour') + labs(fill="Avg Speed (mph)")

## green_taxi_tip_analysis.R
by_speed<-green_clean %>% filter(Payment_type==1,speed<50,percent<50) %>% group_by(speed=floor(2*speed)/2) %>% summarise(
duration=mean(duration),percent=mean(percent),dist=mean(Trip_distance),count=n()/365.0)

# use ggplot to plot the tip percentage w.r.t. speeds

ggplot(by_speed, aes(x=(speed), y=percent)) + geom_point(aes(color
=count),size=2,shape=1) + scale_x_continuous(name='speed (mph)',
breaks=seq(0,50,by=5))+geom_smooth(se=F) + ylab('tip percentage')

# to plot the multi-intraday time-windows tip percent vs speed

# We refine this into different hours in the day

newLevel<-c(rep('0-6am',6),rep('6-9am',3),rep('9am-4pm',7),rep('4-7pm',3),rep('7-12pm',5))

by_speed_H <-green_clean %>% filter(Payment_type==1,speed<30,percent<50) %>% group_by(speed=floor(2*speed)/2,hour=factor(hour)) %>% summarise(
  duration=mean(duration),percent=mean(percent),dist=mean(Trip_distance),count=n()/365.0)

levels(by_speed_H$hour)<-newLevel

g<-ggplot(by_speed_H, aes(x=(speed), y=percent))  + scale_x_continuous(
name='speed (mph)',breaks=seq(0,50,by=10))+ geom_smooth(se=F,aes(color=I('green'))) + ylab('tip percentages')
g+facet_wrap(~hour) + ggtitle('The Tip Percentages at Differerent Speeds')

g<-ggplot(by_speed_H, aes(x=(speed), y=duration)) + scale_x_continuous(
  name='speed (mph)',breaks=seq(0,50,by=10))+geom_smooth(se=F,aes(color=I('green'))) + ylab('duration (minutes)')
g+facet_wrap(~hour) + ggtitle('The Ride Duration at Differerent Speeds')

# To analyze the percentages of credit card paying passengers who do NOT pay

noPay_byspeed<-green_clean %>% filter(Payment_type==1,speed<50,percent==0.0) %>% group_by(speed=floor(2*speed)/2) %>% summarise(
  duration=mean(duration),percent=mean(percent),dist=mean(Trip_distance),count=n()/365.0)

noPay_byspeed$ratio = noPay_byspeed$count/by_speed$count * 100

ggplot(noPay_byspeed, aes(x=(speed), y=ratio)) + geom_point(size=2,shape=1) + scale_x_continuous(name='speed (mph)',
breaks=seq(0,50,by=5))+geom_smooth(se=F)+ylab('non-tippers\' population percentages')


## Map.PartialReduce.yellow.taxi.R
Map.PartialReduce.yellow.taxi<-function(yellow_adm) {

  yellow_clean<-yellow_adm %>% filter(Fare_amount<200,Total_amount<1000,Tolls_amount<30,Fare_amount>0,Trip_distance<50,
                       Trip_distance>0.02,duration<120,duration>=1,speed>0,speed<100,percent<100)
  print(paste('month', month, 'yellow_clean number of rows:',nrow(yellow_clean)))

  x<-select(yellow_clean,month,wday,hour,duration,speed, percent,Tip_amount,Fare_amount,Total_amount,Tolls_amount,
            Trip_distance,Payment_type,RateCodeID,Passenger_count,boroughCode_p,boroughCode_d)
  rm(yellow_clean)

  y<-transmute(x,Passenger_count,month,wday,hour,duration,percent, Fare_amount, speed, Total_amount, Tolls_amount, Trip_distance,Payment_type,RateCodeID,boroughCode_p,boroughCode_d)
  y<-filter(y,speed<80,percent<50,duration<60,Trip_distance<20)
  y<-filter(y,Fare_amount<200,Total_amount<1000,Tolls_amount<30,Fare_amount>0,Trip_distance<20,
            Trip_distance>0.02,duration<120,duration>=1,speed>0,
            !is.na(boroughCode_p),!is.na(boroughCode_d))

  by_passenger  <- y %>% group_by(Passenger_count) %>% summarise(count=n())
  by_wday_hour  <- y %>% group_by(wday,hour) %>% summarise(percentSum=sum(percent), durationSum=sum(duration), speedSum=sum(speed), distSum=sum(Trip_distance), count=n())
  by_month      <- y %>% group_by(month) %>% summarise(percentSum=sum(percent), durationSum=sum(duration), speedSum=sum(speed), distSum=sum(Trip_distance), count=n())
  by_wday       <- y %>% group_by(wday) %>% summarise(percentSum=sum(percent), durationSum=sum(duration), speedSum=sum(speed), distSum=sum(Trip_distance), count=n())
  by_hour       <- y %>% group_by(hour) %>% summarise(percentSum=sum(percent), durationSum=sum(duration), speedSum=sum(speed), distSum=sum(Trip_distance), count=n())
  by_origin_hour<- y %>% group_by(Pickup_Borough=factor(boroughCode_p,labels=Boroughs),hour=factor(hour)) %>%
    summarise(percentSum=sum(percent), durationSum=sum(duration), speedSum=sum(speed), distSum=sum(Trip_distance), count=n())
  by_speed      <- y %>% filter(Payment_type==1,speed<50) %>% group_by(speed=floor(speed*2)*0.5) %>% summarise(percentSum=sum(percent), durationSum=sum(duration), distSum=sum(Trip_distance), count=n())
  by_speed_hour <- y %>% filter(Payment_type==1,speed<50) %>% group_by(speed=floor(speed*2)*0.5,hour) %>% summarise(percentSum=sum(percent), durationSum=sum(duration), distSum=sum(Trip_distance), count=n())
  by_duration   <- y %>% filter(duration<60) %>% group_by(duration=floor(2*duration)*0.5) %>% summarise(percentSum=sum(percent),speedSum=sum(speed),distSum=sum(Trip_distance), count=n())
  by_dist       <- y %>% filter(Trip_distance<10) %>% group_by(dist=floor(2*Trip_distance)*0.5) %>% summarise(percentSum=sum(percent),speedSum=sum(speed),durationSum=sum(duration), count=n())
  connection<- y %>% group_by(Pickup_Borough=factor(boroughCode_p,labels=Boroughs),
                              Destination_Borough=factor(boroughCode_d,labels=Boroughs)) %>%
    summarise(percentSum=sum(percent), durationSum=sum(duration), speedSum=sum(speed), distSum=sum(Trip_distance), count=n())
  pay_byspeed_H <- y %>% filter(Payment_type==1,speed<50,percent>0,percent<50) %>% group_by(speed=floor(2*speed)/2,hour=factor(hour)) %>% summarise(
    durationSum=sum(duration),percentSum=sum(percent),distSum=sum(Trip_distance),count=n())
  noPay_byspeed_H <- y %>% filter(Payment_type==1,speed<50,percent==0) %>% group_by(speed=floor(2*speed)/2,hour=factor(hour)) %>% summarise(
    durationSum=sum(duration),distSum=sum(Trip_distance),count=n())

  newLevel<-c(rep('0-6am',6),rep('6-9am',3),rep('9am-4pm',7),rep('4-7pm',3),rep('7-12pm',5))

  myList <- c('noTipG_by_speed','tipG_by_speed', 'by_passenger', 'by_wday_hour','by_speed', 'pay_byspeed_H','noPay_byspeed_H',
              'by_month','by_wday', 'by_hour', 'by_speed_hour','by_origin_hour', 'by_dist', 'by_duration','connection')


  noTipG  <-filter(y,percent==0,Payment_type==1)
  noTipG_by_speed <- noTipG %>% group_by(speed=floor(speed*2)*0.5) %>% summarise(
  percentSum=sum(percent),durationSum=sum(duration),distSum=sum(Trip_distance),count=n())
  tipG    <- filter(y,percent>0,Payment_type==1)
  tipG_by_speed <- tipG %>% group_by(speed=floor(speed*2)*0.5) %>% summarise(
    percentSum=sum(percent), durationSum=sum(duration), distSum=sum(Trip_distance), count=n())

  for (vName in myList) {
    assign(paste0(vName,months[month]), get(vName))
  }
  dump(paste0(myList,months[month]),file='yellow_taxi_dump.R',append=T)
  rm(x,y)

  return(myList)
}

## Preprocessing.Yellow.Taxi.R
Preprocessing.Yellow.Taxi<-function(fileName,nyc) {

library(saves)
library(stringr)
library(sp)

Boroughs = c('Manhattan', 'Bronx', 'Brooklyn', 'Queens', 'Staten Island') #BoroughCode Mapping

nyc2<-nyc
nyc2@data[,2:3]<-NULL

yellow_taxi<-read.csv(fileName,stringsAsFactors = F)

print(paste("Finish reading the yellow taxi file for month",month))
names(yellow_taxi)=c("VendorID","Pickup_datetime","Dropoff_datetime",
                     "Passenger_count","Trip_distance","Pickup_longitude","Pickup_latitude","RateCodeID","Store_and_fwd_flag",
                     "Dropoff_longitude","Dropoff_latitude","Payment_type","Fare_amount","Extra",
                     "Mta_tax","Tip_amount","Tolls_amount","Improvement_surcharge","Total_amount")

# Convert the long, lat info to the NYC borough categories
PickupPts<-SpatialPoints(cbind(yellow_taxi$Pickup_longitude,yellow_taxi$Pickup_latitude))

PickupPts@proj4string <- nyc@proj4string
pickupBoroughCodes<-PickupPts %over% nyc2
rm(PickupPts)
yellow_taxi$boroughCode_p <- pickupBoroughCodes$boroughCode
rm(pickupBoroughCodes)

DropoffPts<-SpatialPoints(cbind(yellow_taxi$Dropoff_longitude,yellow_taxi$Dropoff_latitude))
DropoffPts@proj4string <- nyc@proj4string
dropoffBoroughCodes<-DropoffPts %over% nyc2
rm(DropoffPts)
yellow_taxi$boroughCode_d <- dropoffBoroughCodes$boroughCode
rm(dropoffBoroughCodes)

# convert the datetime string into month, wday, hour, etc.

yellow_taxi$month = month(yellow_taxi$Pickup_datetime)
yellow_taxi$wday  = wday(yellow_taxi$Pickup_datetime)
yellow_taxi$hour  = hour(yellow_taxi$Pickup_datetime)

# add derived variables which we will analyze

yellow_taxi$duration = floor(as.double(as.POSIXct(yellow_taxi$Dropoff_datetime,format='%Y-%m-%d %H:%M:%S')
                                       -as.POSIXct(yellow_taxi$Pickup_datetime,format='%Y-%m-%d %H:%M:%S'))/60.0)

yellow_taxi$speed   = (yellow_taxi$Trip_distance/yellow_taxi$duration)*60
yellow_taxi$percent = (yellow_taxi$Tip_amount/yellow_taxi$Fare_amount) *100
yellow_taxi$Pickup_datetime<-NULL
yellow_taxi$Dropoff_datetime<-NULL

# Restrict to the credit card/cash payment, standard fare, JFK rides and group rides, etc.
# we do not intend to include negotiated fares or payment by non-standard method.

yellow_adm <- filter(yellow_taxi,RateCodeID %in% c(1,2,6),Payment_type %in% 1:2)
print(paste('month', month, 'raw number of rows:', nrow(yellow_taxi)))
rm(yellow_taxi)
return(yellow_adm)
}

## yellow.taxi.analysis.R
# After restarting a new session (or rm(list=ls()) ), assuming that the relavent data has been saved into the files
library(dplyr)
library(lubridate)
library(reshape2)
library(stringr)
library(saves)
library(ggplot2)
library(ggmap)

source('yellow_taxi_dump.R')  # source and dump to the file, dump allows writing in an appending mode
# varNames are from the previous session
by_wday <- Assign.yellow.taxi('by_wday') # aggregating the monthly tables into a single data frame by aggregating
by_wday <- Averaging.yellow.taxi(by_wday) #conver the sum statistics to mean statistics

# the graphing to ggplot is not display as they are almost identical to the green taxis'.
# in the following, demonstrate how to plot the origination locaitons of the yellow taxis

nyc_map_g_str <- get_map(location = "new york city", zoom = 11)  # google map of the NYC

months = str_pad(1:12,2,"left",pad='0')


yellow_files = paste0('yellow_adm_2015_',months,'.RDatas')
colNames = c('Pickup_longitude','Pickup_latitude','percent','boroughCode_p','boroughCode_d')
colNames = c('Dropoff_longitude','Dropoff_latitude','boroughCode_p','boroughCode_d')

inNYC    = list()
for (i in 1:12) {
yellow_partial<-loads(file=yellow_files[i],variables=colNames, to.data.frame=T)
# test the boroughCodes to determine if the origin and the destination are both in the city
inNYC[[i]] <- (!is.na(yellow_partial$boroughCode_p))&(!is.na(yellow_partial$boroughCode_d))
rm(yellow_partial)
if (i%%3==0) {gc()}  # call the garbage collector
}

colNames <- c('Pickup_longitude','Pickup_latitude','duration','percent')
yellow_samples = as.data.frame(matrix(nrow=0,ncol=length(colNames)))
colnames(yellow_samples) <- colNames

for (i in 1:12) {

  yellow_partial<-loads(file=yellow_files[i],variables=colNames, to.data.frame=T)
  yellow_samples <- rbind(yellow_samples, sample_frac(filter(yellow_partial,inNYC[[i]]),size=0.015))
  rm(yellow_partial)  # tell the garbage collector that it can discard the variable
  if (i%%3==0) { gc() }  # calling garbage collector every 3 iteration steps
}


yellow_sample <- transmute(yellow_samples, lon=signif(Pickup_longitude,5), lat=signif(Pickup_latitude,5)) # truncate the long-lat coordinates

#or for drop off locations

#yellow_sample <- transmute(yellow_samples, lon=signif(Dropoff_longitude,5), lat=signif(Dropoff_latitude,5))

rm(yellow_samples)

yellow_sample<-filter(yellow_sample,lon > -80,lon< -65,lat > 39,lat< 46)
# rounding the long, lat to the desired accuracies

ggmap(nyc_map_g_str, extent = "device") + stat_density2d(data = yellow_sample, aes(x = lon, y = lat, fill = ..level.., alpha = ..level..), size = 0.01, bins = 80, geom = "polygon") + scale_fill_gradient(low = "blue",
high = "red") + scale_alpha(range = c(0, 0.3), guide = FALSE)+ theme(legend.position='none') +ggtitle('yellow taxi origination plot')

rm(yellow_sample)

## yellow_taxi_looping.R
library(dplyr)
library(lubridate)
library(reshape2)
library(stringr)
library(ggmap)
library(ggplot2)
library(ggrepel)
library(grid)
library(saves)
library(geojsonio)
library(sp)
library(rgdal)


nyc<-geojson_read('nycboroughboundaries.geojson',what='sp')  # load the nyc spatial polygon data frame
Boroughs = c('Manhattan', 'Bronx', 'Brooklyn', 'Queens', 'Staten Island') #BoroughCode Mapping

months = str_pad(1:12,2,"left",pad='0')
yellow_taxi_files         = paste0('yellow_tripdata_2015-',months,'.csv.gz')
yellow_taxi_output_Files  = paste0('yellow_adm_2015_',months,'.RDatas')

varNames = c()  # will store the header names of the variables in yellow_taxi_dump.R

# The bottleneck of our computation is the RAM ~ 16G. Otherwise, we could have used
#the multi-threading support of R (FOREACH) to parallelly compute the monthly map and reduce steps
#as they are totally un-related.

for (month in 1:12) {

  yellow_adm <- Preprocessing.Yellow.Taxi(yellow_taxi_files[month],nyc)
  # 'saves' stores the yellow_adm as many 1 column data.frames
  saves(yellow_adm,file=yellow_taxi_output_Files[month],overwrite=T)
  varNames <- Map.PartialReduce.yellow.taxi(yellow_adm)
  rm(yellow_adm)
  gc()
}
	Assign.yellow.taxi<-function(strName,months=1:12) { # the end result is the sum of the tables from the various months

	monthStr <- str_pad(months,2,"left",pad='0')
	x <- get(paste0(strName,monthStr[1]))
	colNames <- colnames(x)
	numColNames <- grep(pattern='Sum$',colNames,value=T)
	numColNames <- c(numColNames, 'count')
	keyNames <- setdiff(colNames, numColNames)
	for (monthToken in monthStr[2:length(months)]) {

	y<-get(paste0(strName,monthToken))
	# Do the sums for numerical columns but taking care of the missing rows by using outer_join
	w<-full_join(x,y,keyNames)
	w[is.na(w)]<-0
	x<-w[,c(keyNames, paste0(numColNames,'.x'))]
	y<-w[,paste0(numColNames,'.y')]
	names(x) <- colNames
	names(y) <- numColNames
	x[,numColNames]<-x[,numColNames] + y
	}

	return(x)
	}
	Averaging.yellow.taxi<-function(df) {

	colKeys<-names(df)
	try(if (!('count' %in% colKeys)) stop("No count in the columns of df"))
	newCols<-sub('Sum','',colKeys)

	x<-grep(pattern='Sum$',colKeys,value=T) # grep the column names which end by 'Sum'
	y<-as.data.frame(matrix(nrow=nrow(df),ncol=length(newCols)))
	names(y)<-newCols
	for (key in colKeys) {
	if (key %in% x) {
	headStr<-sub('Sum','',key) #collapsing the 'Sum' into an empty string
	y[[headStr]]<-ifelse(df$count>0,df[[key]]/df$count,numeric(nrow(df)))
	}
	else {y[,key] <- df[,key]}
	}
	return(y)
	}
	connection<- green_clean %>% filter(speed>1,speed<80,percent<50,duration<60,Trip_distance<20) %>%
	group_by(Pickup_Borough=boroughCode_p,Destination_Borough=boroughCode_d) %>%
	summarise(count=n(),dist=mean(Trip_distance), speed=mean(speed),duration=mean(duration),percent=mean(percent))

	connection <- filter(connection,Pickup_Borough!=5,Destination_Borough!=5)

	connection$Pickup_Borough <- factor(connection$Pickup_Borough,labels=Boroughs[1:4])
	connection$Destination_Borough <- factor(connection$Destination_Borough,labels=Boroughs[1:4])


	# starting boroughs vs target borough in a bar graph

	ggplot(data=connection,aes(x=Pickup_Borough,y=count)) + geom_bar(stat='identity',aes(fill=Destination_Borough),
	position='dodge') + xlab('Starting Borough') + ylab('green taxi rides in 2015') + scale_y_continuous(breaks=seq(0,4e6,by=2e6),
	labels=c('0M','2M','4M'))

	# the speed of cross-borough traffic in a 2D heat map, using start, end borough as horizontal, vertical indexes

	ggplot(connection, aes(x=as.factor(Pickup_Borough), y=as.factor(Destination_Borough),fill=speed)) + geom_tile() + scale_fill_gradientn(colors=c('black','dark red','red',
	'orange','yellow','white')) + scale_x_discrete(name='Green Taxi Originated Borough') +ggtitle(
	'speed-origin vs target borough heat map') + ylab('Target Borough') + labs(fill="speed")

	# To analyze the borough's traffic across different hours in the day

	by_origin_hour <- green_clean %>% filter(speed>1,speed<80,percent<50,duration<60,Trip_distance<20,boroughCode_p!=5) %>%
	group_by(Pickup_Borough=factor(boroughCode_p,labels=Boroughs[1:4]),hour=factor(hour)) %>%
	summarise(count=n(),dist=mean(Trip_distance), speed=mean(speed),duration=mean(duration),percent=mean(percent))


	ggplot(data=by_origin_hour,aes(x=as.factor(Pickup_Borough),y=as.factor(hour))) + geom_tile(aes(fill=count)) + scale_fill_gradientn(colors=c('black','dark red','red',
	'orange','yellow','white'), breaks=seq(1e5,4e5,by=1e5),labels=c('0.1M','0.2M','0.3M','0.4M')) + scale_x_discrete(name='Originated Borough') +ggtitle(
	'The ride count in an origin vs hour heat map') + ylab('hour') + labs(fill="daily taxi rides")
	# Step 1. Data Pre-processing, data cleaning/filtering, etc.
	# We bundle the 12 monthly NYC green taxi files into two half-year gz files by using cat, >> and gzip utilities.

	# Loading the NYC geojson polygon file
	nyc<-geojson_read('nycboroughboundaries.geojson',what='sp')
	Boroughs = c('Manhattan', 'Bronx', 'Brooklyn', 'Queens', 'Staten Island') #BoroughCode Mapping

	nyc2<-nyc
	nyc2@data[,2:3]<-NULL #only keep the borough code, throw away the others

	green_taxi1<-read.csv('green_tripdata_2015_01-06.csv.gz',stringsAsFactors = F)
	green_taxi2<-read.csv('green_tripdata_2015_07-12.csv.gz',stringsAsFactors = F)

	green_taxi1$dummy1 <- NULL #handle the excess ',' in all the rows of the csv files
	green_taxi1$dummy2 <- NULL

	green_taxi2$dummy1 <- NULL
	green_taxi2$dummy2 <- NULL

	green_taxi<-rbind(green_taxi1,green_taxi2)

	names(green_taxi) = c("VendorID" , "Pickup_datetime" ,"Dropoff_datetime" ,"Store_and_fwd_flag" ,
	"RateCodeID" , "Pickup_longitude" , "Pickup_latitude" , "Dropoff_longitude" ,
	"Dropoff_latitude" , "Passenger_count" , "Trip_distance" , "Fare_amount" ,
	"Extra" , "MTA_tax" , "Tip_amount" , "Tolls_amount" ,
	"Ehail_fee" , "improvement_surcharge" ,"Total_amount" , "Payment_type" ,
	"Trip_type" )

	rm(green_taxi1)
	rm(green_taxi2)

	PickupPts<-SpatialPoints(cbind(green_taxi$Pickup_longitude,green_taxi$Pickup_latitude))

	PickupPts@proj4string <- nyc@proj4string
	pickupBoroughCodes<-PickupPts %over% nyc2
	rm(PickupPts)
	green_taxi$boroughCode_p <- pickupBoroughCodes$boroughCode
	rm(pickupBoroughCodes)

	DropoffPts<-SpatialPoints(cbind(green_taxi$Dropoff_longitude,green_taxi$Dropoff_latitude))
	DropoffPts@proj4string <- nyc@proj4string
	dropoffBoroughCodes<-DropoffPts %over% nyc2
	rm(DropoffPts)
	green_taxi$boroughCode_d <- dropoffBoroughCodes$boroughCode
	rm(dropoffBoroughCodes)

	green_taxi$month <- month(green_taxi$Pickup_datetime)
	green_taxi$wday <- wday(green_taxi$Pickup_datetime)
	green_taxi$hour <- hour(green_taxi$Pickup_datetime)

	# Change the format of datetime from string to POSIXct objects

	green_taxi$Pickup_datetime <- as.POSIXct(green_taxi$Pickup_datetime,format='%Y-%m-%d %H:%M:%S')
	green_taxi$Dropoff_datetime <- as.POSIXct(green_taxi$Dropoff_datetime,format='%Y-%m-%d %H:%M:%S')

	green_taxi$duration <- floor(as.double(green_taxi$Dropoff_datetime-green_taxi$Pickup_datetime)/60.0)
	green_taxi$percent <- green_taxi$Tip_amount/green_taxi$Fare_amount * 100.0
	green_taxi$speed <- green_taxi$Trip_distance/green_taxi$duration *60

	save(green_taxi,file='green_tripdata_2015.RData') # saving the raw data to R native binary format for
	# future usage

	# Even though the data has no missing values (it has some unused column whose data is missing), the
	# data is very dirty. Source: untrained human input error, GPS instrument error...,etc. We have to apply
	# filter to remove those outliers which can damage our analysis. Furthermore, we need to fiter away
	#taxi rides which are actually FHV (for hire vehicle) rides which can last several hours and a few hundred
	#dollors. We fiter these rare events away.

	green_adm <- filter(green_taxi, RateCodeID %in% c(1,2,6), Payment_type %in% 1:2)
	# filter away non-credit card/cash ride and rides to Westchester county,etc.
	rm(green_taxi)

	green_clean<-filter(green_adm,Fare_amount<200,Total_amount<1000,Tolls_amount<30,Fare_amount>0,Trip_distance<50,
	Trip_distance>0.02,duration<120,duration>=1,speed>0,speed<100,percent<100,
	!is.na(boroughCode_p),!is.na(boroughCode_d))
	# green_clean is the data.frame for which the Time-domain analysis is performed

	by_wdayNhour<-green_clean %>% group_by(.,wday,hour) %>% summarise(.,speed=mean(speed),duration=mean(duration),
	Payment_type=mean(Payment_type), dist=mean(Trip_distance),percent=mean(percent),count=n()/365.0)


	ggplot(by_wdayNhour, aes(x=as.factor(wday), y=as.factor(hour),fill=speed)) + geom_tile() + scale_fill_gradientn(colors=c('black','dark red','red',
	'orange','yellow','white')) + scale_x_discrete(name='week day',
	labels=c('1'='S','2'='M','3'='T','4'='W','5'='T','6'='F','7'='S'))+ggtitle(
	'The green taxi speed in a weekday vs hour heat map') + ylab('hour') + labs(fill="Avg Speed (mph)")
	by_speed<-green_clean %>% filter(Payment_type==1,speed<50,percent<50) %>% group_by(speed=floor(2*speed)/2) %>% summarise(
	duration=mean(duration),percent=mean(percent),dist=mean(Trip_distance),count=n()/365.0)

	# use ggplot to plot the tip percentage w.r.t. speeds

	ggplot(by_speed, aes(x=(speed), y=percent)) + geom_point(aes(color
	=count),size=2,shape=1) + scale_x_continuous(name='speed (mph)',
	breaks=seq(0,50,by=5))+geom_smooth(se=F) + ylab('tip percentage')

	# to plot the multi-intraday time-windows tip percent vs speed

	# We refine this into different hours in the day

	newLevel<-c(rep('0-6am',6),rep('6-9am',3),rep('9am-4pm',7),rep('4-7pm',3),rep('7-12pm',5))

	by_speed_H <-green_clean %>% filter(Payment_type==1,speed<30,percent<50) %>% group_by(speed=floor(2*speed)/2,hour=factor(hour)) %>% summarise(
	duration=mean(duration),percent=mean(percent),dist=mean(Trip_distance),count=n()/365.0)

	levels(by_speed_H$hour)<-newLevel

	g<-ggplot(by_speed_H, aes(x=(speed), y=percent)) + scale_x_continuous(
	name='speed (mph)',breaks=seq(0,50,by=10))+ geom_smooth(se=F,aes(color=I('green'))) + ylab('tip percentages')
	g+facet_wrap(~hour) + ggtitle('The Tip Percentages at Differerent Speeds')

	g<-ggplot(by_speed_H, aes(x=(speed), y=duration)) + scale_x_continuous(
	name='speed (mph)',breaks=seq(0,50,by=10))+geom_smooth(se=F,aes(color=I('green'))) + ylab('duration (minutes)')
	g+facet_wrap(~hour) + ggtitle('The Ride Duration at Differerent Speeds')

	# To analyze the percentages of credit card paying passengers who do NOT pay

	noPay_byspeed<-green_clean %>% filter(Payment_type==1,speed<50,percent==0.0) %>% group_by(speed=floor(2*speed)/2) %>% summarise(
	duration=mean(duration),percent=mean(percent),dist=mean(Trip_distance),count=n()/365.0)

	noPay_byspeed$ratio = noPay_byspeed$count/by_speed$count * 100

	ggplot(noPay_byspeed, aes(x=(speed), y=ratio)) + geom_point(size=2,shape=1) + scale_x_continuous(name='speed (mph)',
	breaks=seq(0,50,by=5))+geom_smooth(se=F)+ylab('non-tippers\' population percentages')
	Map.PartialReduce.yellow.taxi<-function(yellow_adm) {

	yellow_clean<-yellow_adm %>% filter(Fare_amount<200,Total_amount<1000,Tolls_amount<30,Fare_amount>0,Trip_distance<50,
	Trip_distance>0.02,duration<120,duration>=1,speed>0,speed<100,percent<100)
	print(paste('month', month, 'yellow_clean number of rows:',nrow(yellow_clean)))

	x<-select(yellow_clean,month,wday,hour,duration,speed, percent,Tip_amount,Fare_amount,Total_amount,Tolls_amount,
	Trip_distance,Payment_type,RateCodeID,Passenger_count,boroughCode_p,boroughCode_d)
	rm(yellow_clean)

	y<-transmute(x,Passenger_count,month,wday,hour,duration,percent, Fare_amount, speed, Total_amount, Tolls_amount, Trip_distance,Payment_type,RateCodeID,boroughCode_p,boroughCode_d)
	y<-filter(y,speed<80,percent<50,duration<60,Trip_distance<20)
	y<-filter(y,Fare_amount<200,Total_amount<1000,Tolls_amount<30,Fare_amount>0,Trip_distance<20,
	Trip_distance>0.02,duration<120,duration>=1,speed>0,
	!is.na(boroughCode_p),!is.na(boroughCode_d))

	by_passenger <- y %>% group_by(Passenger_count) %>% summarise(count=n())
	by_wday_hour <- y %>% group_by(wday,hour) %>% summarise(percentSum=sum(percent), durationSum=sum(duration), speedSum=sum(speed), distSum=sum(Trip_distance), count=n())
	by_month <- y %>% group_by(month) %>% summarise(percentSum=sum(percent), durationSum=sum(duration), speedSum=sum(speed), distSum=sum(Trip_distance), count=n())
	by_wday <- y %>% group_by(wday) %>% summarise(percentSum=sum(percent), durationSum=sum(duration), speedSum=sum(speed), distSum=sum(Trip_distance), count=n())
	by_hour <- y %>% group_by(hour) %>% summarise(percentSum=sum(percent), durationSum=sum(duration), speedSum=sum(speed), distSum=sum(Trip_distance), count=n())
	by_origin_hour<- y %>% group_by(Pickup_Borough=factor(boroughCode_p,labels=Boroughs),hour=factor(hour)) %>%
	summarise(percentSum=sum(percent), durationSum=sum(duration), speedSum=sum(speed), distSum=sum(Trip_distance), count=n())
	by_speed <- y %>% filter(Payment_type==1,speed<50) %>% group_by(speed=floor(speed2)0.5) %>% summarise(percentSum=sum(percent), durationSum=sum(duration), distSum=sum(Trip_distance), count=n())
	by_speed_hour <- y %>% filter(Payment_type==1,speed<50) %>% group_by(speed=floor(speed2)0.5,hour) %>% summarise(percentSum=sum(percent), durationSum=sum(duration), distSum=sum(Trip_distance), count=n())
	by_duration <- y %>% filter(duration<60) %>% group_by(duration=floor(2duration)0.5) %>% summarise(percentSum=sum(percent),speedSum=sum(speed),distSum=sum(Trip_distance), count=n())
	by_dist <- y %>% filter(Trip_distance<10) %>% group_by(dist=floor(2Trip_distance)0.5) %>% summarise(percentSum=sum(percent),speedSum=sum(speed),durationSum=sum(duration), count=n())
	connection<- y %>% group_by(Pickup_Borough=factor(boroughCode_p,labels=Boroughs),
	Destination_Borough=factor(boroughCode_d,labels=Boroughs)) %>%
	summarise(percentSum=sum(percent), durationSum=sum(duration), speedSum=sum(speed), distSum=sum(Trip_distance), count=n())
	pay_byspeed_H <- y %>% filter(Payment_type==1,speed<50,percent>0,percent<50) %>% group_by(speed=floor(2*speed)/2,hour=factor(hour)) %>% summarise(
	durationSum=sum(duration),percentSum=sum(percent),distSum=sum(Trip_distance),count=n())
	noPay_byspeed_H <- y %>% filter(Payment_type==1,speed<50,percent==0) %>% group_by(speed=floor(2*speed)/2,hour=factor(hour)) %>% summarise(
	durationSum=sum(duration),distSum=sum(Trip_distance),count=n())

	newLevel<-c(rep('0-6am',6),rep('6-9am',3),rep('9am-4pm',7),rep('4-7pm',3),rep('7-12pm',5))

	myList <- c('noTipG_by_speed','tipG_by_speed', 'by_passenger', 'by_wday_hour','by_speed', 'pay_byspeed_H','noPay_byspeed_H',
	'by_month','by_wday', 'by_hour', 'by_speed_hour','by_origin_hour', 'by_dist', 'by_duration','connection')


	noTipG <-filter(y,percent==0,Payment_type==1)
	noTipG_by_speed <- noTipG %>% group_by(speed=floor(speed2)0.5) %>% summarise(
	percentSum=sum(percent),durationSum=sum(duration),distSum=sum(Trip_distance),count=n())
	tipG <- filter(y,percent>0,Payment_type==1)
	tipG_by_speed <- tipG %>% group_by(speed=floor(speed2)0.5) %>% summarise(
	percentSum=sum(percent), durationSum=sum(duration), distSum=sum(Trip_distance), count=n())

	for (vName in myList) {
	assign(paste0(vName,months[month]), get(vName))
	}
	dump(paste0(myList,months[month]),file='yellow_taxi_dump.R',append=T)
	rm(x,y)

	return(myList)
	}
	Preprocessing.Yellow.Taxi<-function(fileName,nyc) {

	library(saves)
	library(stringr)
	library(sp)

	Boroughs = c('Manhattan', 'Bronx', 'Brooklyn', 'Queens', 'Staten Island') #BoroughCode Mapping

	nyc2<-nyc
	nyc2@data[,2:3]<-NULL

	yellow_taxi<-read.csv(fileName,stringsAsFactors = F)

	print(paste("Finish reading the yellow taxi file for month",month))
	names(yellow_taxi)=c("VendorID","Pickup_datetime","Dropoff_datetime",
	"Passenger_count","Trip_distance","Pickup_longitude","Pickup_latitude","RateCodeID","Store_and_fwd_flag",
	"Dropoff_longitude","Dropoff_latitude","Payment_type","Fare_amount","Extra",
	"Mta_tax","Tip_amount","Tolls_amount","Improvement_surcharge","Total_amount")

	# Convert the long, lat info to the NYC borough categories
	PickupPts<-SpatialPoints(cbind(yellow_taxi$Pickup_longitude,yellow_taxi$Pickup_latitude))

	PickupPts@proj4string <- nyc@proj4string
	pickupBoroughCodes<-PickupPts %over% nyc2
	rm(PickupPts)
	yellow_taxi$boroughCode_p <- pickupBoroughCodes$boroughCode
	rm(pickupBoroughCodes)

	DropoffPts<-SpatialPoints(cbind(yellow_taxi$Dropoff_longitude,yellow_taxi$Dropoff_latitude))
	DropoffPts@proj4string <- nyc@proj4string
	dropoffBoroughCodes<-DropoffPts %over% nyc2
	rm(DropoffPts)
	yellow_taxi$boroughCode_d <- dropoffBoroughCodes$boroughCode
	rm(dropoffBoroughCodes)

	# convert the datetime string into month, wday, hour, etc.

	yellow_taxi$month = month(yellow_taxi$Pickup_datetime)
	yellow_taxi$wday = wday(yellow_taxi$Pickup_datetime)
	yellow_taxi$hour = hour(yellow_taxi$Pickup_datetime)

	# add derived variables which we will analyze

	yellow_taxi$duration = floor(as.double(as.POSIXct(yellow_taxi$Dropoff_datetime,format='%Y-%m-%d %H:%M:%S')
	-as.POSIXct(yellow_taxi$Pickup_datetime,format='%Y-%m-%d %H:%M:%S'))/60.0)

	yellow_taxi$speed = (yellow_taxi$Trip_distance/yellow_taxi$duration)*60
	yellow_taxi$percent = (yellow_taxi$Tip_amount/yellow_taxi$Fare_amount) *100
	yellow_taxi$Pickup_datetime<-NULL
	yellow_taxi$Dropoff_datetime<-NULL

	# Restrict to the credit card/cash payment, standard fare, JFK rides and group rides, etc.
	# we do not intend to include negotiated fares or payment by non-standard method.

	yellow_adm <- filter(yellow_taxi,RateCodeID %in% c(1,2,6),Payment_type %in% 1:2)
	print(paste('month', month, 'raw number of rows:', nrow(yellow_taxi)))
	rm(yellow_taxi)
	return(yellow_adm)
	}
	# After restarting a new session (or rm(list=ls()) ), assuming that the relavent data has been saved into the files
	library(dplyr)
	library(lubridate)
	library(reshape2)
	library(stringr)
	library(saves)
	library(ggplot2)
	library(ggmap)

	source('yellow_taxi_dump.R') # source and dump to the file, dump allows writing in an appending mode
	# varNames are from the previous session
	by_wday <- Assign.yellow.taxi('by_wday') # aggregating the monthly tables into a single data frame by aggregating
	by_wday <- Averaging.yellow.taxi(by_wday) #conver the sum statistics to mean statistics

	# the graphing to ggplot is not display as they are almost identical to the green taxis'.
	# in the following, demonstrate how to plot the origination locaitons of the yellow taxis

	nyc_map_g_str <- get_map(location = "new york city", zoom = 11) # google map of the NYC

	months = str_pad(1:12,2,"left",pad='0')


	yellow_files = paste0('yellow_adm_2015_',months,'.RDatas')
	colNames = c('Pickup_longitude','Pickup_latitude','percent','boroughCode_p','boroughCode_d')
	colNames = c('Dropoff_longitude','Dropoff_latitude','boroughCode_p','boroughCode_d')

	inNYC = list()
	for (i in 1:12) {
	yellow_partial<-loads(file=yellow_files[i],variables=colNames, to.data.frame=T)
	# test the boroughCodes to determine if the origin and the destination are both in the city
	inNYC[[i]] <- (!is.na(yellow_partial$boroughCode_p))&(!is.na(yellow_partial$boroughCode_d))
	rm(yellow_partial)
	if (i%%3==0) {gc()} # call the garbage collector
	}

	colNames <- c('Pickup_longitude','Pickup_latitude','duration','percent')
	yellow_samples = as.data.frame(matrix(nrow=0,ncol=length(colNames)))
	colnames(yellow_samples) <- colNames

	for (i in 1:12) {

	yellow_partial<-loads(file=yellow_files[i],variables=colNames, to.data.frame=T)
	yellow_samples <- rbind(yellow_samples, sample_frac(filter(yellow_partial,inNYC[[i]]),size=0.015))
	rm(yellow_partial) # tell the garbage collector that it can discard the variable
	if (i%%3==0) { gc() } # calling garbage collector every 3 iteration steps
	}


	yellow_sample <- transmute(yellow_samples, lon=signif(Pickup_longitude,5), lat=signif(Pickup_latitude,5)) # truncate the long-lat coordinates

	#or for drop off locations

	#yellow_sample <- transmute(yellow_samples, lon=signif(Dropoff_longitude,5), lat=signif(Dropoff_latitude,5))

	rm(yellow_samples)

	yellow_sample<-filter(yellow_sample,lon > -80,lon< -65,lat > 39,lat< 46)
	# rounding the long, lat to the desired accuracies

	ggmap(nyc_map_g_str, extent = "device") + stat_density2d(data = yellow_sample, aes(x = lon, y = lat, fill = ..level.., alpha = ..level..), size = 0.01, bins = 80, geom = "polygon") + scale_fill_gradient(low = "blue",
	high = "red") + scale_alpha(range = c(0, 0.3), guide = FALSE)+ theme(legend.position='none') +ggtitle('yellow taxi origination plot')

	rm(yellow_sample)