# Testing sparklyr for Coffee & Coding
# Matt Dray
# 29 Nov 2017
# Following https://spark.rstudio.com/
# R 3.4.2 & RStudio 1.1.383
# What? -------------------------------------------------------------------
# 'Apache Spark is a fast and general engine for large-scale data processing'
# (http://spark.apache.org/)
# Apache Spark is an open-source cluster-computing framework (Wikipedia)
# Connect to Spark from R. The sparklyr package provides a complete dplyr
# backend: filter and aggregate Spark datasets, then bring them into R for
# analysis and visualization. You can also use Spark's distributed machine
# learning library from R (https://spark.rstudio.com/); a short sketch of this
# appears later in this script.
# This script can be used to test out a local connection. Local mode is very
# convenient for testing, debugging or demonstration purposes, as it requires
# no prior setup to launch Spark applications.
# Code below is from https://spark.rstudio.com/ (further detail available on the
# site)
# Installation ------------------------------------------------------------
install.packages("sparklyr") # if not run before
library(sparklyr) # load package
spark_install(version = "2.1.0") # install a local copy of Spark 2.1.0 (version used in this demo)
# Connect to Spark --------------------------------------------------------
library(sparklyr)
# Connect to local Spark cluster
sc <- spark_connect(master = "local")
# Using dplyr -------------------------------------------------------------
# We'll start by copying some datasets from R into the Spark cluster (note that
# you may need to install the nycflights13 and Lahman packages in order to
# execute this code):
install.packages(c("nycflights13", "Lahman"))
library(dplyr)
# Copy the data to the cluster (added to the 'Connections' tab of RStudio)
iris_tbl <- copy_to(sc, iris)
flights_tbl <- copy_to(sc, nycflights13::flights, "flights")
batting_tbl <- copy_to(sc, Lahman::Batting, "batting")
# List the tables
src_tbls(sc)
# Filter by departure delay and print the first few records
# Note in output that source is 'lazy query' and database is 'spark_connection'
flights_tbl %>%
  filter(dep_delay == 2)
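# The query above is not executed until results are needed. To inspect the SQL
# that dplyr generates for Spark, a minimal sketch (assumes show_query(), from
# dplyr/dbplyr, works on Spark tables as it does for other database backends):
flights_tbl %>%
  filter(dep_delay == 2) %>%
  show_query()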
# Example of filtering and plotting
delay <- flights_tbl %>%
  group_by(tailnum) %>%
  summarise(count = n(), dist = mean(distance), delay = mean(arr_delay)) %>%
  filter(count > 20, dist < 2000, !is.na(delay)) %>%
  collect()  # collect() pulls the result out of Spark into a local data frame
library(ggplot2)
ggplot(delay, aes(dist, delay)) +
  geom_point(aes(size = count), alpha = 1/2) +
  geom_smooth() +
  scale_size_area(max_size = 2)
# Using SQL ---------------------------------------------------------------
# Package for interfacing with databases
library(DBI)
iris_preview <- dbGetQuery(sc, "SELECT * FROM iris LIMIT 10")
iris_preview
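# Machine learning (sketch) -------------------------------------------------
# The intro mentions Spark's distributed machine learning library (MLlib).
# A minimal sketch, assuming sparklyr's ml_linear_regression() formula
# interface and that copy_to() replaced the dots in the iris column names
# with underscores (e.g. Petal_Length):
fit <- iris_tbl %>%
  ml_linear_regression(Petal_Length ~ Petal_Width)
summary(fit)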
# Diagnostics -------------------------------------------------------------
# Click the 'SparkUI' button in the 'Connections' tab to launch the browser
# interface, which has tabs for jobs, stages, storage, environment, executors
# and SQL
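# Alternatively, launch the same interface from code; assumes spark_web() is
# available in your sparklyr version
spark_web(sc)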
# Check the log locally
spark_log(sc, n = 10)
# Finishing up ------------------------------------------------------------
# Disconnect
spark_disconnect(sc)
# Or click the 'Disconnect' button (a T-shaped pipe with a red cross) in the
# 'Connections' tab of RStudio
# Learn more at https://spark.rstudio.com/