Szilard Pafka (szilard)
szilard / data_table_materialized_join_vs_not.R
Created February 11, 2016 21:50
data.table materialized join vs not
## count rows of an inner join: materialize the joined table vs aggregate in place
library(data.table)
library(rbenchmark)
benchmark(
  nrow(d[dm, nomatch=0L, on="x"]),   # materializes the full join result, then counts
  d[dm, .N, nomatch=0L, on="x"],     # counts inside the join, no materialization
  replications = 5, columns = c("test", "replications", "elapsed", "relative"))
# test replications elapsed relative
#2 d[dm, .N, nomatch = 0, on = "x"] 5 28.535 1.000
#1 nrow(d[dm, nomatch = 0, on = "x"]) 5 38.562 1.351
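The preview does not show how d and dm were built; a minimal setup sketch that reproduces the shape of the benchmark (table sizes and key cardinality are assumptions):

## assumed setup: a large table d joined against a smaller lookup table dm on "x"
library(data.table)
set.seed(123)
d  <- data.table(x = sample(1e6, 1e8, replace = TRUE))
dm <- data.table(x = sample(1e6, 1e5))

The .N form never allocates the joined table, consistent with it coming out ~1.35x faster in the timings above.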
szilard / nnet_outliers.R
Created January 12, 2016 13:01
outliers impact on neural net classifier
library(nnet)
library(h2o)
h2o.init()
set.seed(123)
## simulate a small dataset to probe outlier sensitivity
n <- 1000
x1 <- runif(n)
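The preview truncates here; a hedged sketch of how the experiment might continue (the outcome model, the outlier injection, and the nnet settings are assumptions; the gist presumably compares against h2o.deeplearning as well):

## assumed continuation: fit nnet with and without a single extreme outlier
y <- as.factor(ifelse(x1 + rnorm(n, sd = 0.3) > 0.5, 1, 0))
d <- data.frame(x1, y)
md <- nnet(y ~ x1, data = d, size = 5, decay = 0.01, maxit = 200, trace = FALSE)

d_out <- d
d_out$x1[1] <- 100   # one wildly out-of-range feature value
md_out <- nnet(y ~ x1, data = d_out, size = 5, decay = 0.01, maxit = 200, trace = FALSE)

## compare the two decision curves on a grid
xg <- data.frame(x1 = seq(0, 1, by = 0.01))
plot(xg$x1, predict(md, xg), type = "l", ylim = c(0, 1))
lines(xg$x1, predict(md_out, xg), col = "red")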
szilard / sparse-linreg.R
Last active January 1, 2016 10:09
Sparse linear regression
library(Matrix)
rm(list=ls())
set.seed(123)
## parameters
n <- 1e6
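The preview stops at the parameters; a sketch of how the sparse regression might be set up with Matrix (feature count, density, and the solve step are assumptions):

## assumed continuation: sparse design matrix and a normal-equations fit
p <- 1000
nnz <- 5 * n   # ~5 nonzero entries per row
X <- sparseMatrix(i = sample(n, nnz, replace = TRUE),
                  j = sample(p, nnz, replace = TRUE),
                  x = rnorm(nnz), dims = c(n, p))
beta <- rnorm(p)
y <- as.numeric(X %*% beta + rnorm(n))
## sparse Cholesky via Matrix; a dense model matrix here would need ~8 GB before lm() even starts
fit <- solve(crossprod(X), crossprod(X, y))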
szilard / meetup_raffle.R
Last active December 25, 2015 20:39
LA R meetup raffle
library(yaml)
library(RJSONIO)
library(httr)
event_id <- 132296372
n_max <- 20
## get your api key from http://www.meetup.com/meetup_api/key/ while logged in
api_key <- yaml.load_file("meetup_api_key.yml")$api_key
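The preview ends after the setup; a sketch of the raffle itself (the /2/rsvps endpoint is the old Meetup REST API, and the response handling is an assumption, not the gist's code):

## assumed continuation: fetch RSVPs for the event and draw a winner
resp <- GET("https://api.meetup.com/2/rsvps",
            query = list(event_id = event_id, key = api_key))
rsvps <- fromJSON(content(resp, as = "text"))
attendees <- sapply(rsvps$results, function(r) r$member$name)
cat("Winner:", sample(attendees, 1), "\n")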
szilard / h2o-group_by.txt
Created November 8, 2015 04:57
h2o group_by simple speed test
########### R
library(h2o)
h2oServer <- h2o.init(max_mem_size = "50g", nthreads = -1)
d <- h2o.importFile(h2oServer, path = "d.csv")
system.time({
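The preview cuts off inside system.time, and the "########### R" header suggests the gist has matching sections for other tools that are not shown. A plausible completion, assuming a grouping column x, an aggregate over y, and the h2o R package's h2o.group_by:

## assumed continuation: mean of y within each group of x
system.time({
  dg <- h2o.group_by(d, by = "x", mean("y"))
})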
szilard / h2o_sum_1bn.R
Created November 4, 2015 19:00
H2O sum 1 bn numbers
install.packages("h2o", type = "source", repos = "http://h2o-release.s3.amazonaws.com/h2o/rel-slater/9/R")
library(h2o)
h2oServer <- h2o.init(nthreads = -1)
system.time({
d <- h2o.createFrame(h2oServer, rows = 1e9, cols = 1, missing_fraction = 0,
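The createFrame call is truncated mid-argument-list; a hedged completion (remaining arguments per the h2o R API, with the sum presumably being what gets timed next):

## assumed completion: generate 1 bn random reals, then sum them on the cluster
system.time({
  d <- h2o.createFrame(h2oServer, rows = 1e9, cols = 1, missing_fraction = 0,
                       categorical_fraction = 0, integer_fraction = 0,
                       binary_fraction = 0)
})
system.time(sum(d))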
szilard / overfitting.R
Created November 1, 2015 16:22
Illustration for overfitting
library(ggplot2)
## y is pure noise, so any structure a smoother finds here is overfitting
n <- 30
d <- data.frame(x = 1:n, y = runif(n))
ggplot(d, aes(x = x, y = y)) + geom_point() +
  geom_smooth(se = FALSE, span = 0.1)
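For contrast (not in the gist), overlaying a wider-span smoother on the same plot makes the point: the span = 0.1 loess chases every point, while the wide span stays near the flat truth:

ggplot(d, aes(x = x, y = y)) + geom_point() +
  geom_smooth(se = FALSE, span = 0.1, colour = "red") +   # overfit: follows the noise
  geom_smooth(se = FALSE, span = 1)                       # smoother: close to the flat signal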
szilard / psum.c
Last active October 29, 2015 02:56
Parallel sum 1 bn numbers pthreads
/*
Adapted from:
https://computing.llnl.gov/tutorials/pthreads/samples/arrayloops.c
http://stackoverflow.com/questions/2962785/c-using-clock-to-measure-time-in-multi-threaded-programs
Run as:
gcc -Ofast -pthread psum.c -lm && ./a.out
*/
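Only the header comment survives in this preview. For comparison with the other number-summing gists on this page, a rough R analogue of the same idea, not part of the gist (forked workers each sum a contiguous chunk; the worker count is an assumption, and forking makes this Unix-only):

## parallel sum of 1 bn numbers with forked workers
library(parallel)
x <- as.numeric(1:1e9)
system.time({
  chunks <- splitIndices(length(x), 8)                       # 8 contiguous index ranges
  parts  <- mclapply(chunks, function(idx) sum(x[idx]), mc.cores = 8)
  total  <- sum(unlist(parts))
})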
szilard / adding_numbers.R
Last active October 24, 2015 00:12
Timing sum of 1 billion numbers
x <- as.numeric(1:1e9)   # ~8 GB of doubles
system.time(sum(x))
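Why the as.numeric(): R integers are 32-bit, so summing the raw integer vector overflows. A quick demonstration:

sum(1:1e9)              # NA, with an integer-overflow warning
sum(as.numeric(1:1e9))  # 5.000000005e+17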
szilard / benchm-ml-spark
Last active September 9, 2015 16:29 — forked from jkbradley/benchm-ml-spark
Running benchm-ml benchmark for random forest on Spark, using soft predictions to get better AUC
Here are two code snippets:
(1) Compute one-hot encoded data for Spark, using the data generated by https://github.com/szilard/benchm-ml/blob/master/0-init/2-gendata.txt
(2) Run MLlib, computing soft predictions by hand.
I ran these with Spark 1.4, and they should work for 1.5 as well.
Note: There's no real need to switch to DataFrames yet for benchmarking. Both the RDD and DataFrame APIs use the same underlying implementation. (I hope to improve on that in Spark 1.6 if there is time.)
Ran on an EC2 cluster with 4 workers (9.6 GB memory each) and 8 partitions for the training RDD.
For the 1M dataset, training the forest took 2080.8 sec and achieved a test-set AUC of 0.7130.
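The Spark snippets themselves are not shown in this preview. To illustrate the soft-vs-hard prediction point in R (randomForest and ROCR standing in for MLlib; the data and code below are assumptions, not the gist's): averaging per-tree votes gives a continuous score and hence a real ROC curve, while majority-vote labels collapse everything to a single operating point.

library(randomForest)
library(ROCR)
set.seed(123)
n <- 2000
x1 <- rnorm(n); x2 <- rnorm(n)
y <- as.factor(ifelse(x1 + x2 + rnorm(n) > 0, 1, 0))
d_train <- data.frame(x1, x2, y)[1:1000, ]
d_test  <- data.frame(x1, x2, y)[1001:2000, ]
md <- randomForest(y ~ ., data = d_train, ntree = 100)

p_soft <- predict(md, d_test, type = "prob")[, "1"]   # fraction of trees voting "1"
p_hard <- predict(md, d_test, type = "response")      # majority-vote class labels

auc <- function(p) performance(prediction(p, d_test$y), "auc")@y.values[[1]]
auc(p_soft)                            # soft scores: proper AUC
auc(as.numeric(as.character(p_hard)))  # hard labels: noticeably lower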