Szilard Pafka (szilard)

szilard / mlbenchm-spark-gendata.txt
Last active August 29, 2015 14:20
Generate data for machine learning benchmark for Spark
## get the data
for yr in 2005 2006 2007; do
  wget http://stat-computing.org/dataexpo/2009/$yr.csv.bz2
  bunzip2 $yr.csv.bz2
done
## install R and data.table
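The preview cuts off at the install step. A sketch of what the continuation could look like - installing data.table and writing out a train/test split whose file names match the later gists; the apt command, sampling sizes and split are assumptions, not the gist's actual code:

sudo apt-get install -y r-base-dev
R -e 'install.packages("data.table", repos = "https://cran.r-project.org")'

## read the downloaded years and write a sampled train set plus a test set (sketch)
R --vanilla --quiet << EOF
library(data.table)
d <- rbindlist(lapply(c(2005, 2006, 2007), function(yr) fread(paste0(yr, ".csv"))))
set.seed(123)
write.csv(d[sample(.N, 1e6)], "train-1m.csv", row.names = FALSE)
write.csv(d[sample(.N, 1e5)], "test.csv", row.names = FALSE)
EOF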
szilard / mlbenchm-spark-RF.txt
Last active August 29, 2015 14:20
Training random forest in Spark / MLlib
spark-1.3.0-bin-hadoop2.4/bin/spark-shell --driver-memory 100G --executor-memory 100G
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.tree.RandomForest
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
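The preview shows only the shell invocation and the imports. A minimal sketch of how these classes fit together for training and AUC evaluation; the file name, column layout and hyperparameters are assumptions, not the gist's actual values:

// parse a numeric CSV with the label in the last column (sketch)
val train = sc.textFile("spark-train-1m.csv").map { line =>
  val x = line.split(",").map(_.toDouble)
  LabeledPoint(x.last, Vectors.dense(x.dropRight(1)))
}.cache()

// train the forest (hyperparameters are illustrative)
val model = RandomForest.trainClassifier(train, numClasses = 2,
  categoricalFeaturesInfo = Map[Int, Int](), numTrees = 500,
  featureSubsetStrategy = "sqrt", impurity = "gini", maxDepth = 20, maxBins = 50)

// hard 0/1 predictions give only a coarse AUC; the forked benchm-ml-spark gist
// further down computes soft predictions instead
// (scored on the training RDD here only to keep the sketch short)
val scoreAndLabel = train.map(p => (model.predict(p.features), p.label))
val auc = new BinaryClassificationMetrics(scoreAndLabel).areaUnderROC()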
szilard / mlbenchm-py-int-encoded.R
Last active August 29, 2015 14:20
Generate integer-encoded categoricals
## generate integer-encoded categoricals
for SIZE in 1; do
time R --vanilla --quiet << EOF
library(data.table)
d1 <- as.data.frame(fread("train-${SIZE}m.csv"))
d2 <- as.data.frame(fread("test.csv"))
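The preview ends right after the files are read. A sketch of the likely encoding step - mapping each character column to integer codes fitted on the combined train+test values; the loop and the output file names (chosen to match what the scikit-learn gist below reads) are assumptions, not the gist's exact code:

## replace each character column by integer codes shared across train and test (sketch)
for (col in names(d1)) {
  if (is.character(d1[[col]])) {
    levs <- unique(c(d1[[col]], d2[[col]]))
    d1[[col]] <- as.integer(factor(d1[[col]], levels = levs))
    d2[[col]] <- as.integer(factor(d2[[col]], levels = levs))
  }
}
write.table(d1, "train-intcateg-${SIZE}m.csv", sep = ",", row.names = FALSE, col.names = FALSE)
write.table(d2, "test-intcateg-${SIZE}m.csv", sep = ",", row.names = FALSE, col.names = FALSE)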
szilard / mlbenchm-py-RF-int-enc.py
Created May 6, 2015 21:01
Scikit-learn RF with integer-encoded categoricals
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
d_train = pd.read_csv("train-intcateg-1m.csv", header=None)
d_test = pd.read_csv("test-intcateg-1m.csv", header=None)
X_train = d_train.iloc[:, 0:8]   # columns 0-7: features
y_train = d_train.iloc[:, 8]     # column 8: label
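The preview stops before the model is fit. A sketch of the likely remainder; the test split, the tree count and the assumption that the label column is already 0/1-encoded are mine, not the gist's:

X_test = d_test.iloc[:, 0:8]
y_test = d_test.iloc[:, 8]

md = RandomForestClassifier(n_estimators=500, n_jobs=-1)   # tree count is an assumption
md.fit(X_train, y_train)

phat = md.predict_proba(X_test)[:, 1]   # P(y = 1), assuming a 0/1-encoded label
print(metrics.roc_auc_score(y_test, phat))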
szilard / SparkR-datatable-aggr100M.txt
Last active April 3, 2019 20:58
SparkR vs data.table - aggregate 100M records
data.table vs SparkR
group-by aggregate on 100M records (1M groups)
data.table: 6.5 sec (without key) / 1.3 sec (with key), both on 1 core
SparkR (cached): 200 sec on 8 cores
SparkR is ~30x / ~150x slower wall-clock (~240x / ~1200x slower per core)
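Only the timing summary survives in the preview. A minimal sketch of the data.table side of such a benchmark; the data shape and column names are illustrative assumptions:

library(data.table)
n <- 1e8; m <- 1e6
d <- data.table(x = sample(m, n, replace = TRUE), y = runif(n))
system.time(d[, sum(y), by = x])   # without key
setkey(d, x)
system.time(d[, sum(y), by = x])   # with key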
szilard / benchm-ml-spark
Last active September 9, 2015 16:29 — forked from jkbradley/benchm-ml-spark
Running benchm-ml benchmark for random forest on Spark, using soft predictions to get better AUC
Here are 2 code snippets:
(1) Compute one-hot encoded data for Spark, using the data generated by https://github.com/szilard/benchm-ml/blob/master/0-init/2-gendata.txt
(2) Run MLlib, computing soft predictions by hand.
I ran these with Spark 1.4, and they should work for 1.5 as well.
Note: There's no real need to switch to DataFrames yet for benchmarking. Both the RDD and DataFrame APIs use the same underlying implementation. (I hope to improve on that in Spark 1.6 if there is time.)
Ran on an EC2 cluster with 4 workers (9.6 GB memory each) and 8 partitions for the training RDD.
For the 1M dataset, training the forest took ~2081 sec and achieved AUC ~0.713 on the test set.
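The "soft predictions by hand" part works around RandomForestModel.predict returning only the hard majority vote, which makes for a coarse AUC: average the individual trees' 0/1 predictions to get a class-1 score and feed that into BinaryClassificationMetrics. A minimal sketch, assuming a trained model and a test RDD of LabeledPoint already exist (names are illustrative):

// soft RF score = fraction of trees voting for class 1 (sketch)
val scoreAndLabel = testData.map { p =>
  val votes1 = model.trees.map(t => t.predict(p.features)).sum
  (votes1 / model.numTrees, p.label)
}
val auc = new BinaryClassificationMetrics(scoreAndLabel).areaUnderROC()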
szilard / adding_numbers.R
Last active October 24, 2015 00:12
Timing sum of 1 billion numbers
x <- as.numeric(1:1e9)
system.time(sum(x))
szilard / psum.c
Last active October 29, 2015 02:56
Parallel sum of 1 bn numbers with pthreads
/*
Adapted from:
https://computing.llnl.gov/tutorials/pthreads/samples/arrayloops.c
http://stackoverflow.com/questions/2962785/c-using-clock-to-measure-time-in-multi-threaded-programs
Run as:
gcc -Ofast -pthread psum.c -lm && ./a.out
*/
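The preview stops at the header comment. A compact sketch of the kind of pthreads partial-sum loop such a program uses; the thread count, array size and variable names are illustrative, not the gist's:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

#define NTHREADS 8
#define N 1000000000UL

static double *x;
static double partial[NTHREADS];

/* each thread sums its contiguous slice of the array */
static void *sum_slice(void *arg) {
    long id = (long) arg;
    size_t lo = id * (N / NTHREADS), hi = (id + 1) * (N / NTHREADS);
    double s = 0.0;
    for (size_t i = lo; i < hi; i++) s += x[i];
    partial[id] = s;
    return NULL;
}

int main(void) {
    x = malloc(N * sizeof(double));
    for (size_t i = 0; i < N; i++) x[i] = (double)(i + 1);

    pthread_t th[NTHREADS];
    for (long id = 0; id < NTHREADS; id++)
        pthread_create(&th[id], NULL, sum_slice, (void *) id);

    double total = 0.0;
    for (long id = 0; id < NTHREADS; id++) {
        pthread_join(th[id], NULL);
        total += partial[id];
    }
    printf("sum = %f\n", total);
    free(x);
    return 0;
}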
szilard / overfitting.R
Created November 1, 2015 16:22
Illustration of overfitting
library(ggplot2)
n <- 30
d <- data.frame(x = 1:n, y = runif(n))
ggplot(d, aes(x = x, y = y)) + geom_point() +
geom_smooth(se = FALSE, span = 0.1)
szilard / h2o_sum_1bn.R
Created November 4, 2015 19:00
H2O sum 1 bn numbers
install.packages("h2o", type="source", repos=(c("http://h2o-release.s3.amazonaws.com/h2o/rel-slater/9/R")))
library(h2o)
h2oServer <- h2o.init(nthreads = -1)
system.time({
d <- h2o.createFrame(h2oServer, rows = 1e9, cols = 1, missing_fraction = 0,
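                     ## (the preview cuts off inside the createFrame call; the remaining
                     ##  arguments below and the timed sum are an assumed completion,
                     ##  not the gist's exact code)
                     categorical_fraction = 0, integer_fraction = 0, binary_fraction = 0)
})
## sum the single numeric column; assumes sum() dispatches on the H2OFrame
system.time(sum(d))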