Skip to content

Instantly share code, notes, and snippets.

@viveknarang
Last active April 29, 2017 01:32
Show Gist options
  • Save viveknarang/4c4d4b43ec7de692ae7a7bb62159ad54 to your computer and use it in GitHub Desktop.
Save viveknarang/4c4d4b43ec7de692ae7a7bb62159ad54 to your computer and use it in GitHub Desktop.
####################################################################
##################### Assignment A8 - B ############################
######################################### Vivek Narang #############
# Start
# Loading required libraries
library(rmr2)
library(tidyr)
library(readr)
library(dplyr)
# Use rmr2's local backend for this run
rmr.options(backend = "local")
# Path of the comma-separated input file
# (called csv_path rather than url so base::url() is not masked)
csv_path <- "c:\\A8.csv"
# Read the comma-delimited file
# (called csv_data rather than t so base::t() is not masked)
csv_data <- read_delim(csv_path, delim = ",")
# Convert to a plain data frame and write it out with to.dfs(); the returned
# handle is the input of the map-reduce job below
hdfs.temp <- to.dfs(data.frame(csv_data))
# Mapper: emits the Origin column as the key and the TaxiOut column as the
# value, so the reduce step receives the TaxiOut values grouped by Origin.
#   k - input key (not used)
#   v - input records; only the Origin and TaxiOut columns are read
mapper <- function(k, v) {
  keyval(v$Origin, v$TaxiOut)
}
# Reducer: summarises the TaxiOut values collected for one Origin key.
#   k - the key of the group (the Origin value emitted by the mapper)
#   v - the values emitted by the mapper for that key; NAs are ignored
# Emits the key together with the vector c(min, mean, max).
# When the group holds no non-missing value, all three statistics are NA
# instead of the Inf / NaN / -Inf (plus warnings) that min(), mean() and
# max() give on empty input.
reducer <- function(k, v) {
  observed <- v[!is.na(v)]
  if (length(observed) == 0) {
    stats <- rep(NA_real_, 3)
  } else {
    stats <- c(min(observed), mean(observed), max(observed))
  }
  keyval(k, stats)
}
# Run the map-reduce job over the temporary dfs input.
out <- mapreduce(input = hdfs.temp, map = mapper, reduce = reducer)
# Fetch the job output as a data frame (columns key and val). Each key
# contributes three rows, in the order the reducer emits them: min, mean, max.
t3 <- as.data.frame(from.dfs(out))
measure_labels <- c("min", "mean", "max")
# Fail loudly if the output is not made of complete min/mean/max triples,
# rather than relying on implicit recycling of the labels.
stopifnot(nrow(t3) %% length(measure_labels) == 0)
t3$measure <- rep(measure_labels, times = nrow(t3) / length(measure_labels))
# Reshape to one row per key with one column per statistic. Columns are
# picked and renamed by name, so the result does not depend on the column
# order produced by the reshape.
stats2 <- t3 %>%
  pivot_wider(names_from = measure, values_from = val) %>%
  select(
    "Origin Airport" = "key",
    "Max Taxi Out time" = "max",
    "Mean Taxi Out time" = "mean",
    "Min Taxi Out time" = "min"
  )
# Order the rows by the Origin Airport column.
stats3 <- stats2 %>% arrange(`Origin Airport`)
# Print the first 6 rows on the console.
head(stats3)
# End
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment