# vsimko/sparklyr-is-working.R

## sparklyr-is-working.R
# if (!require('devtools')) install.packages('devtools')
# devtools::install_github('apache/spark@v2.0.2', subdir='R/pkg')

library(SparkR)
library(sparklyr)
library(dplyr)

# Connect to a local Spark instance, pinning the Spark and Hadoop versions.
sc <- spark_connect("local", version = "2.0.2", hadoop_version = "2.7")

# Create the Spark table "bmw_tab1" representing the semicolon-delimited CSV
# file. Schema inference is switched off, so all column types are "chr".
tab1 <- spark_read_csv(
  sc, "bmw_tab1", "~/SeminarKIT_censored.csv",
  delimiter = ";", memory = FALSE, infer_schema = FALSE
)

# Define types for specific columns and register the result as "bmw_tab2".
# Note: some columns may be shuffled, so look at the end of the column list.
tab2 <- tab1 %>%
  mutate(
    lfdNr = as.integer(lfdNr),
    Fahrzeugalter = as.numeric(Fahrzeugalter)
  ) %>%
  sdf_register("bmw_tab2")

# Select specific columns and show that the new types are in use.
# The explicit print() keeps the preview visible when the script is run via
# source(), where top-level values are not auto-printed.
tab2 %>%
  select(VIN, lfdNr, Fahrzeugalter) %>%
  head(10) %>%
  collect() %>%
  print()

# Apply a simple filter and pull at most 200 matching rows into an in-memory
# R data frame.
df1 <- tab1 %>%
  filter(P_Histo_Laden_5 == "2") %>%
  head(200) %>%
  collect()
	# if (!require('devtools')) install.packages('devtools')
	# devtools::install_github('apache/spark@v2.0.2', subdir='R/pkg')

	library(SparkR)
	library(sparklyr)
	library(dplyr)

	# NOTE(review): from here on the script repeats the steps already performed
	# above (same connection settings, table names and filters), only
	# tab-indented. Consider removing one of the two copies.

	# Connect to a local Spark instance, pinning the Spark and Hadoop versions.
	sc <- spark_connect("local", version = "2.0.2", hadoop_version = "2.7")

	# Create the Spark table "bmw_tab1" representing the semicolon-delimited CSV
	# file. Schema inference is switched off, so all column types are "chr".
	tab1 <- spark_read_csv(
	  sc, "bmw_tab1", "~/SeminarKIT_censored.csv",
	  delimiter = ";", memory = FALSE, infer_schema = FALSE
	)

	# Define types for specific columns and register the result as "bmw_tab2".
	# Note: some columns may be shuffled, so look at the end of the column list.
	tab2 <- tab1 %>%
	  mutate(
	    lfdNr = as.integer(lfdNr),
	    Fahrzeugalter = as.numeric(Fahrzeugalter)
	  ) %>%
	  sdf_register("bmw_tab2")

	# Select specific columns and show that the new types are in use.
	# The explicit print() keeps the preview visible when the script is run via
	# source(), where top-level values are not auto-printed.
	tab2 %>%
	  select(VIN, lfdNr, Fahrzeugalter) %>%
	  head(10) %>%
	  collect() %>%
	  print()

	# Apply a simple filter and pull at most 200 matching rows into an in-memory
	# R data frame.
	df1 <- tab1 %>%
	  filter(P_Histo_Laden_5 == "2") %>%
	  head(200) %>%
	  collect()