# shivaram/sparkr-demo

## sparkr-demo
# Demo: explore the nycflights13 CSV with SparkR DataFrames.
#
# Reading CSV requires the spark-csv package. The `sparkPackages=` argument
# of sparkR.init() was only added in Spark 1.4.1, so if you are using
# Spark 1.4.0 launch SparkR with the package on the command line instead:
#
#   ./bin/sparkR --packages com.databricks:spark-csv_2.10:1.0.3

# Attach magrittr up front so a missing package is reported before any Spark
# work starts; `%>%` is used by the pipeline near the end of the script.
library(magrittr)

# This form works in Spark 1.4.1.
# NOTE(review): `spark_link` (passed as the Spark master) is not defined in
# this script; it must already exist in the session -- confirm where it is set.
sc <- sparkR.init(
  master = spark_link,
  sparkPackages = "com.databricks:spark-csv_2.10:1.0.3"
)
sqlContext <- sparkRSQL.init(sc)

# Load the flights data through the spark-csv data source. `header` is
# forwarded to the data source as an option, hence the string "true".
flights <- read.df(
  sqlContext,
  path = "s3n://sparkr-data/nycflights13.csv",
  source = "com.databricks.spark.csv",
  header = "true"
)

# Print the first few rows
head(flights)

# Run a query to print the most frequent destinations from JFK
jfk_flights <- filter(flights, flights$origin == "JFK")

# Group the flights by destination and aggregate by the number of flights
dest_flights <- agg(
  group_by(jfk_flights, jfk_flights$dest),
  count = n(jfk_flights$dest)
)

# Now sort by the `count` column (descending) and print the first few rows
head(arrange(dest_flights, desc(dest_flights$count)))

##  dest count
##1  LAX 11262
##2  SFO  8204
##3  BOS  5898

# Running SQL queries: register the DataFrame under a table name, then
# query it by that name.
registerTempTable(flights, "flightsTable")
delayDF <- sql(sqlContext, "SELECT dest, arr_delay FROM flightsTable")

# Creating new columns, deleting columns
flights$air_time_hr <- flights$air_time / 60
flights$air_time_hr <- NULL

# Combine the whole query into a single pipeline using magrittr
dest_flights <- filter(flights, flights$origin == "JFK") %>%
  group_by(flights$dest) %>%
  summarize(count = n(flights$dest))

# Take the first rows of the sorted result and draw them as a bar chart,
# one bar per destination.
top_dests <- head(arrange(dest_flights, desc(dest_flights$count)))
barplot(top_dests$count, names.arg = top_dests$dest)
	# NOTE(review): this tab-indented section repeats the demo steps that
	# already appear earlier in this file; running both re-reads the CSV and
	# re-registers `flightsTable`. Consider keeping only one copy.
	#
	# Reading CSV requires the spark-csv package. The `sparkPackages=`
	# argument of sparkR.init() was only added in Spark 1.4.1, so if you are
	# using Spark 1.4.0 launch SparkR with the package on the command line:
	#
	#   ./bin/sparkR --packages com.databricks:spark-csv_2.10:1.0.3

	# Attach magrittr up front so a missing package is reported before any
	# Spark work starts; `%>%` is used by the pipeline near the end.
	library(magrittr)

	# This form works in Spark 1.4.1.
	# NOTE(review): `spark_link` (passed as the Spark master) is not defined
	# in this script; it must already exist in the session -- confirm.
	sc <- sparkR.init(
	  master = spark_link,
	  sparkPackages = "com.databricks:spark-csv_2.10:1.0.3"
	)
	sqlContext <- sparkRSQL.init(sc)

	# Load the flights data through the spark-csv data source. `header` is
	# forwarded to the data source as an option, hence the string "true".
	flights <- read.df(
	  sqlContext,
	  path = "s3n://sparkr-data/nycflights13.csv",
	  source = "com.databricks.spark.csv",
	  header = "true"
	)

	# Print the first few rows
	head(flights)

	# Run a query to print the most frequent destinations from JFK
	jfk_flights <- filter(flights, flights$origin == "JFK")

	# Group the flights by destination and aggregate by the number of flights
	dest_flights <- agg(
	  group_by(jfk_flights, jfk_flights$dest),
	  count = n(jfk_flights$dest)
	)

	# Now sort by the `count` column (descending) and print the first few rows
	head(arrange(dest_flights, desc(dest_flights$count)))

	## dest count
	##1 LAX 11262
	##2 SFO 8204
	##3 BOS 5898

	# Running SQL queries: register the DataFrame under a table name, then
	# query it by that name.
	registerTempTable(flights, "flightsTable")
	delayDF <- sql(sqlContext, "SELECT dest, arr_delay FROM flightsTable")

	# Creating new columns, deleting columns
	flights$air_time_hr <- flights$air_time / 60
	flights$air_time_hr <- NULL

	# Combine the whole query into a single pipeline using magrittr
	dest_flights <- filter(flights, flights$origin == "JFK") %>%
	  group_by(flights$dest) %>%
	  summarize(count = n(flights$dest))

	# Take the first rows of the sorted result and draw them as a bar chart,
	# one bar per destination.
	top_dests <- head(arrange(dest_flights, desc(dest_flights$count)))
	barplot(top_dests$count, names.arg = top_dests$dest)