Testing the sparklyr package for Coffee & Coding
# Testing sparklyr for Coffee & Coding
# Matt Dray
# 29 Nov 2017

# Following https://spark.rstudio.com/
# R 3.4.2 & RStudio 1.1.383

# What? -------------------------------------------------------------------

# 'Apache Spark is a fast and general engine for large-scale data processing'
# (http://spark.apache.org/)

# Apache Spark is an open-source cluster-computing framework (Wikipedia)

# Connect to Spark from R. The sparklyr package provides a complete dplyr
# backend. Filter and aggregate Spark datasets then bring them into R for
# analysis and visualization. Use Spark's distributed machine learning library
# from R. (https://spark.rstudio.com/)

# This script can be used to test out a local connection. Local mode is
# convenient for testing, debugging and demonstrations because it requires no
# prior setup to launch Spark applications.

# The code below is from https://spark.rstudio.com/ (further detail is
# available on that site)
# Installation ------------------------------------------------------------

install.packages("sparklyr")     # if not already installed
library(sparklyr)                # load the package
spark_install(version = "2.1.0") # install a local copy of Spark 2.1.0
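
# Aside (not in the original walkthrough): sparklyr can report which Spark
# versions are available to install and which are already installed locally
spark_available_versions() # versions that spark_install() can fetch
spark_installed_versions() # versions already on this machine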
# Connect to Spark --------------------------------------------------------

library(sparklyr)

# Connect to a local Spark cluster
sc <- spark_connect(master = "local")
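
# Optional aside (not from the tutorial): a spark_config() object can be passed
# to spark_connect() to tune the connection; the memory value below is an
# arbitrary example, not a recommendation
# config <- spark_config()
# config$spark.executor.memory <- "2G"
# sc <- spark_connect(master = "local", config = config)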
# Using dplyr -------------------------------------------------------------

# We'll start by copying some datasets from R into the Spark cluster (note that
# you may need to install the nycflights13 and Lahman packages in order to
# execute this code):

install.packages(c("nycflights13", "Lahman"))
library(dplyr)

# Copy the data to the cluster (it's added to the 'Connections' tab of RStudio)
iris_tbl <- copy_to(sc, iris)
flights_tbl <- copy_to(sc, nycflights13::flights, "flights")
batting_tbl <- copy_to(sc, Lahman::Batting, "batting")
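
# Aside (not in the original script): data can also be read directly into Spark
# without loading it into R first, e.g. from a CSV file; the path below is a
# placeholder
# example_tbl <- spark_read_csv(sc, name = "example", path = "path/to/file.csv")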
# List the tables
src_tbls(sc)

# Filter by departure delay and print the first few records. Note in the output
# that the source is a 'lazy query' and the database is a 'spark_connection'
flights_tbl %>%
  filter(dep_delay == 2)
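
# Aside (my addition): because the query is lazy, dplyr can print the SQL that
# Spark will run without actually executing it
flights_tbl %>%
  filter(dep_delay == 2) %>%
  show_query()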
# Example of filtering and plotting
delay <- flights_tbl %>%
  group_by(tailnum) %>%
  summarise(count = n(), dist = mean(distance), delay = mean(arr_delay)) %>%
  filter(count > 20, dist < 2000, !is.na(delay)) %>%
  collect()

library(ggplot2)

ggplot(delay, aes(dist, delay)) +
  geom_point(aes(size = count), alpha = 1/2) +
  geom_smooth() +
  scale_size_area(max_size = 2)
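
# Aside (not in the original script): sparklyr also exposes Spark's distributed
# machine learning library (MLlib), mentioned in the intro above. A minimal
# sketch of a linear regression on the flights data, assuming the formula
# interface from more recent sparklyr releases
fit <- flights_tbl %>%
  filter(!is.na(arr_delay)) %>%
  ml_linear_regression(arr_delay ~ dep_delay + distance)
summary(fit)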
# Using SQL ---------------------------------------------------------------

# Package for interfacing with databases
library(DBI)

iris_preview <- dbGetQuery(sc, "SELECT * FROM iris LIMIT 10")
iris_preview
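
# Aside (my addition): any Spark SQL should work through dbGetQuery(), e.g. a
# simple aggregation over the copied iris table
iris_counts <- dbGetQuery(sc, "SELECT Species, COUNT(*) AS n FROM iris GROUP BY Species")
iris_counts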
# Diagnostics -------------------------------------------------------------

# Click the 'SparkUI' button in the 'Connections' tab to launch the browser
# interface, which has tabs for jobs, stages, storage, environment, executors
# and SQL

# Check the log locally
spark_log(sc, n = 10)
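
# The same browser interface can also be launched from R with sparklyr's
# spark_web() function
spark_web(sc)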
# Finishing up ------------------------------------------------------------

# Disconnect from the cluster
spark_disconnect(sc)

# Or click the 'Disconnect from a connection' button (a T-shaped pipe with a
# red cross) in the 'Connections' tab of RStudio

# Learn more at https://spark.rstudio.com/