@rxin
Last active Sep 10, 2015
Spark Parquet benchmark
// Launch spark-shell
MASTER=local[4] bin/spark-shell --driver-memory 4G --conf spark.shuffle.memoryFraction=0.5 --packages com.databricks:spark-csv_2.10:1.2.0
// Read the Parquet file into a DataFrame
val pdf = sqlContext.read.parquet("d_small_key.parquet")
// Use 8 shuffle partitions (default is 200) so the small aggregation isn't dominated by per-task overhead
sqlContext.setConf("spark.sql.shuffle.partitions", "8")
// Data reading only: materialize the scan without any aggregation
val start = System.currentTimeMillis
pdf.queryExecution.executedPlan.execute().count()
val end = System.currentTimeMillis
println("time taken: " + ((end - start) / 1e3) + " s")
// Aggregate without the shuffle: execute only the partial-aggregate (TungstenAggregate) operator.
// The index 6 into the physical plan is specific to this query; print
// queryExecution.executedPlan to confirm where the operator sits.
val start = System.currentTimeMillis
pdf.groupBy("x").agg(mean("y").as("ym")).orderBy("ym").queryExecution.executedPlan(6).execute().count()
val end = System.currentTimeMillis
println("time taken: " + ((end - start) / 1e3) + " s")
// End-to-end: scan, aggregate, shuffle, sort, and collect the top 5 rows
val start = System.currentTimeMillis
pdf.groupBy("x").agg(mean("y").as("ym")).orderBy("ym").head(5)
val end = System.currentTimeMillis
println("time taken: " + ((end - start) / 1e3) + " s")
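The three measurements above repeat the same start/end pattern; in the shell it can be factored into a small helper. This is a sketch — `time` here is a name introduced for illustration, not a Spark or Scala standard API:

```scala
// Minimal timing helper: run a block, print elapsed seconds, return the result.
def time[T](label: String)(body: => T): T = {
  val start = System.currentTimeMillis
  val result = body
  val end = System.currentTimeMillis
  println(s"$label took ${(end - start) / 1e3} s")
  result
}

// Usage mirroring the end-to-end measurement above:
//   time("end-to-end") { pdf.groupBy("x").agg(mean("y").as("ym")).orderBy("ym").head(5) }
// A self-contained example with a pure computation:
val n = time("sum") { (1 to 1000).sum }
println(n)
```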
# Data generation: 100 million rows with 1,000 distinct keys, written as CSV
time R --vanilla --quiet << EOF
set.seed(123)
n <- 100e6
m <- 1e3
d <- data.frame(x = sample(m, n, replace=TRUE), y = runif(n))
write.table(d, file = "d_small_key.csv", row.names = FALSE, sep = ",")
EOF
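The R script writes `d_small_key.csv`, while the benchmark reads `d_small_key.parquet`, so a one-time CSV-to-Parquet conversion is implied in between. A sketch of that step, run in the same spark-shell session (the `com.databricks.spark.csv` format and its `header`/`inferSchema` options come from the spark-csv package already loaded via `--packages`; the exact conversion the author used is not shown in the gist):

```scala
// One-time conversion of the generated CSV into the Parquet file read above.
val csv = sqlContext.read
  .format("com.databricks.spark.csv")
  .option("header", "true")        // write.table above emits a header row by default
  .option("inferSchema", "true")   // infer integer x and double y instead of strings
  .load("d_small_key.csv")
csv.write.parquet("d_small_key.parquet")
```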