# vsimko/sparklyr-is-working2.R

## sparklyr-is-working2.R
# Explore pairwise correlations between the columns of a large CSV file.
#
# The file is registered as a table in a local Spark instance via sparklyr,
# a 1 % sample of its rows is collected into R, and the correlation matrix of
# that sample is drawn with corrplot.

# NOTE(review): nothing below calls SparkR directly. It is attached before
# dplyr, so dplyr's verbs (e.g. collect) mask SparkR's functions of the same
# name. Confirm whether this dependency is still needed.
library(SparkR)
library(sparklyr)
library(dplyr)
library(corrplot)

# Use a specific version of Spark/Hadoop.
sc <- spark_connect("local", version = "2.0.2", hadoop_version = "2.7")

# Register the CSV as Spark table "bmw", assuming comma-separated input.
# The literal string "null" is read as a missing value, and column types are
# inferred from the data.
tab1 <- spark_read_csv(
  sc, "bmw", "~/SOC.csv",
  memory = FALSE, infer_schema = TRUE, null_value = "null"
)

# Copy a subsample (about 1 % of the rows) into R's memory as df1.
df1 <- tab1 %>%
  sdf_sample(fraction = 0.01) %>%
  collect()

# cor() accepts only numeric or logical data. VIN is a character column and is
# always removed; any other non-numeric column the schema inference produced
# is dropped as well so that cor() does not fail on it.
df1$VIN <- NULL
is_correlatable <- vapply(
  df1,
  function(column) is.numeric(column) || is.logical(column),
  logical(1)
)
df1 <- df1[is_correlatable]

# Use corrplot to investigate correlations amongst columns.
# method = "spearman" is also possible.
M <- cor(df1, use = "pairwise.complete.obs", method = "pearson")

corrplot(M, order = "FPC")
# take a look at https://cran.r-project.org/web/packages/corrplot/vignettes/corrplot-intro.html
# or this https://github.com/taiyun/corrplot

# The analysis runs entirely on the collected sample, so release the local
# Spark session instead of leaving it running for the rest of the R session.
spark_disconnect(sc)