Matt Gilham mgilham

## SillyShuffle.scala
import scala.util.Random

object SillyShuffle {

  def randomJitterShuffle[T](sortedList: List[T],
                             ordFun: T => Int,
                             copyFun: (T, Int) => T,
                             jitterMin: Int,
                             jitterMax: Int): List[T] = {
    sortedList.reverse.foldLeft(List.empty[T]) {

## leftAttachOne.scala
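/** Utility method that implements a constrained, special-purpose join more efficiently than
  * Spark SQL's current left outer join.
  *
  * Assumes that the left DataFrame contains exactly one row per key, and that the right DataFrame
  * contains zero or one row per key. Keys are read from the column positions `leftKeyPos` and
  * `rightKeyPos`. If a key has more than one row on either side, only the first row encountered
  * for that key is used.
  */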
def leftAttachOne(left: DataFrame, right: DataFrame, leftKeyPos: Int = 0, rightKeyPos: Int = 0): DataFrame = {
  val nullableRightSchema = new StructType(right.schema.fields.map(_.copy(nullable=true)))
  val nullRightRow = Row.fromSeq(Array.fill(right.schema.fields.size)(null))
  val leftKeyTuples = left.rdd.map(row => (row(leftKeyPos), row))
  val rightKeyTuples = right.rdd.map(row => (row(rightKeyPos), row))
  val cogrouped = leftKeyTuples.cogroup(rightKeyTuples).flatMap { case (_, (leftRows, rightRows)) =>
    leftRows.headOption.map(leftRow => Row.merge(leftRow, rightRows.headOption.getOrElse(nullRightRow)))

## SumVector.scala
import org.apache.spark.sql.expressions.MutableAggregationBuffer
import org.apache.spark.sql.expressions.UserDefinedAggregateFunction
import org.apache.spark.sql.Row
import org.apache.spark.sql.types._
import org.apache.spark.mllib.linalg.SparseVector
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.VectorUDT

// MLG: this is highly unoptimized, but likely good enough for now

## gist:93967d36e978074b2235
#!/usr/bin/python

import sys
import json

def getAssignmentJson(line):
	splut = line.split()
	topic, partition, replicas = splut[1], splut[3], splut[7]
	replicas = replicas.split(',')
	if len(replicas) == 1:
	import scala.util.Random

	object SillyShuffle {

	def randomJitterShuffle[T](sortedList: List[T],
	ordFun: T => Int,
	copyFun: (T, Int) => T,
	jitterMin: Int,
	jitterMax: Int): List[T] = {
	sortedList.reverse.foldLeft(List.empty[T]) {
	/** Utility method that implements a constrained special-purpose join more efficiently than Spark SQL's current left outer join.
	Assumes that the left DataFrame contains exactly one row per key, and the right DataFrame contains zero or one row per key.
	*/
	def leftAttachOne(left: DataFrame, right: DataFrame, leftKeyPos: Int = 0, rightKeyPos: Int = 0): DataFrame = {
	val nullableRightSchema = new StructType(right.schema.fields.map(_.copy(nullable=true)))
	val nullRightRow = Row.fromSeq(Array.fill(right.schema.fields.size)(null))
	val leftKeyTuples = left.rdd.map(row => (row(leftKeyPos), row))
	val rightKeyTuples = right.rdd.map(row => (row(rightKeyPos), row))
	val cogrouped = leftKeyTuples.cogroup(rightKeyTuples).flatMap { case (_, (leftRows, rightRows)) =>
	leftRows.headOption.map(leftRow => Row.merge(leftRow, rightRows.headOption.getOrElse(nullRightRow)))
	import org.apache.spark.sql.expressions.MutableAggregationBuffer
	import org.apache.spark.sql.expressions.UserDefinedAggregateFunction
	import org.apache.spark.sql.Row
	import org.apache.spark.sql.types._
	import org.apache.spark.mllib.linalg.SparseVector
	import org.apache.spark.mllib.linalg.Vector
	import org.apache.spark.mllib.linalg.Vectors
	import org.apache.spark.mllib.linalg.VectorUDT

	// MLG: this is highly unoptimized, but likely good enough for now
	#!/usr/bin/python

	import sys
	import json

	def getAssignmentJson(line):
	splut = line.split()
	topic, partition, replicas = splut[1], splut[3], splut[7]
	replicas = replicas.split(',')
	if len(replicas) == 1: