Reynold Xin rxin

## NaNTesting.java
package com.databricks.unsafe.util.benchmark;

import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.runner.Runner;
import org.openjdk.jmh.runner.RunnerException;
import org.openjdk.jmh.runner.options.Options;
import org.openjdk.jmh.runner.options.OptionsBuilder;

## CodegenTest.scala
package org.apache.spark.sql

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.functions._

object CodegenTest {

  def main(args: Array[String]): Unit = {
    val sc = SparkContext.getOrCreate()
    val sqlContext = new SQLContext(sc)

## benchmark.scala

// Launch spark-shell
MASTER=local[4] bin/spark-shell --driver-memory 4G --conf spark.shuffle.memoryFraction=0.5 --packages com.databricks:spark-csv_2.10:1.2.0

// Read the DF in
val pdf = sqlContext.read.parquet("d_small_key.parquet")
sqlContext.setConf("spark.sql.shuffle.partitions", "8")

// Data reading
val start = System.currentTimeMillis

## ampcamp-ecnu-2013-data.sh
################################################################################
# Step 1. Download wiki traffic log.
# from
#  https://s3.amazonaws.com/ampcamp/ampcamp-ecnu-2013/wikistats/part-00095.gz
# to
#  https://s3.amazonaws.com/ampcamp/ampcamp-ecnu-2013/wikistats/part-00168.gz
# Note that 095 and 168 are both 0 bytes. The sole purpose of their existence is
# to verify the downloads.

# NOTE THAT THE FOLLOWING SCRIPT STARTS wget AS BACKGROUND PROCESSES.

## testwrite.scala
def testWrite(path: String): Long = {
  val startTime = System.currentTimeMillis()
  val out = new java.io.FileWriter(path)
  var i = 1
  val bytes = " " * (1024 * 1024)
  while (i < 1000) {
    out.write(bytes)
    i += 1
  }
  out.close

## BytecodeAnalyzer.scala
package spark.util

import java.io.{ByteArrayInputStream, ByteArrayOutputStream}

import scala.collection.mutable

import org.objectweb.asm.{ClassReader, MethodVisitor}
import org.objectweb.asm.commons.EmptyVisitor
import org.objectweb.asm.Opcodes._

## InsertPerf.scala


// 1001  381	384	384	383	384	407	404	409	407

object ArrayBufferBenchmark extends scala.testing.Benchmark {

  def run = {

    val len = 10 * 1000 * 1000
    val a = new scala.collection.mutable.ArrayBuffer[Int](len)

## update.sh
set -e
set -o pipefail

/root/spark/bin/stop-all.sh

rm -rf ~/.ivy2/local/org.spark*
rm -rf ~/.ivy2/cache/org.spark*

cd /root/spark
git checkout master

## gist:6896688

  def takeAsync(num: Int): FutureAction[Seq[T]] = {
    val promise = new CancellablePromise[Seq[T]]

    promise.run {
      val buf = new ArrayBuffer[T](num)
      val totalParts = self.partitions.length
      var partsScanned = 0
      while (buf.size < num && partsScanned < totalParts && !promise.cancelled) {
        // The number of partitions to try in this iteration. It is ok for this number to be

## df.py
data = sqlContext.load("/home/rxin/ints.parquet")
data.groupBy("a").agg(col("a"), avg("num")).collect()
	package com.databricks.unsafe.util.benchmark;

	import org.openjdk.jmh.annotations.Benchmark;
	import org.openjdk.jmh.annotations.Scope;
	import org.openjdk.jmh.annotations.State;
	import org.openjdk.jmh.runner.Runner;
	import org.openjdk.jmh.runner.RunnerException;
	import org.openjdk.jmh.runner.options.Options;
	import org.openjdk.jmh.runner.options.OptionsBuilder;
	package org.apache.spark.sql

	import org.apache.spark.{SparkConf, SparkContext}
	import org.apache.spark.sql.functions._

	object CodegenTest {

	def main(args: Array[String]): Unit = {
	val sc = SparkContext.getOrCreate()
	val sqlContext = new SQLContext(sc)

	// Launch spark-shell
	MASTER=local[4] bin/spark-shell --driver-memory 4G --conf spark.shuffle.memoryFraction=0.5 --packages com.databricks:spark-csv_2.10:1.2.0

	// Read the DF in
	val pdf = sqlContext.read.parquet("d_small_key.parquet")
	sqlContext.setConf("spark.sql.shuffle.partitions", "8")

	// Data reading
	val start = System.currentTimeMillis
	################################################################################
	# Step 1. Download wiki traffic log.
	# from
	# https://s3.amazonaws.com/ampcamp/ampcamp-ecnu-2013/wikistats/part-00095.gz
	# to
	# https://s3.amazonaws.com/ampcamp/ampcamp-ecnu-2013/wikistats/part-00168.gz
	# Note that 095 and 168 are both 0 bytes. The sole purpose of their existence is
	# to verify the downloads.

	# NOTE THAT THE FOLLOWING SCRIPT STARTS wget AS BACKGROUND PROCESSES.
	def testWrite(path: String): Long = {
	val startTime = System.currentTimeMillis()
	val out = new java.io.FileWriter(path)
	var i = 1
	val bytes = " " * (1024 * 1024)
	while (i < 1000) {
	out.write(bytes)
	i += 1
	}
	out.close
	package spark.util

	import java.io.{ByteArrayInputStream, ByteArrayOutputStream}

	import scala.collection.mutable

	import org.objectweb.asm.{ClassReader, MethodVisitor}
	import org.objectweb.asm.commons.EmptyVisitor
	import org.objectweb.asm.Opcodes._


	// 1001 381 384 384 383 384 407 404 409 407

	object ArrayBufferBenchmark extends scala.testing.Benchmark {

	def run = {

	val len = 10 * 1000 * 1000
	val a = new scala.collection.mutable.ArrayBuffer[Int](len)
	set -e
	set -o pipefail

	/root/spark/bin/stop-all.sh

	rm -rf ~/.ivy2/local/org.spark*
	rm -rf ~/.ivy2/cache/org.spark*

	cd /root/spark
	git checkout master

	def takeAsync(num: Int): FutureAction[Seq[T]] = {
	val promise = new CancellablePromise[Seq[T]]

	promise.run {
	val buf = new ArrayBuffer[T](num)
	val totalParts = self.partitions.length
	var partsScanned = 0
	while (buf.size < num && partsScanned < totalParts && !promise.cancelled) {
	// The number of partitions to try in this iteration. It is ok for this number to be
	data = sqlContext.load("/home/rxin/ints.parquet")
	data.groupBy("a").agg(col("a"), avg("num")).collect()