Skip to content

Instantly share code, notes, and snippets.

@jdegoes
Created February 8, 2013 15:11
Show Gist options
  • Star 15 You must be signed in to star a gist
  • Fork 2 You must be signed in to fork a gist
  • Save jdegoes/4739573 to your computer and use it in GitHub Desktop.
Save jdegoes/4739573 to your computer and use it in GitHub Desktop.
Example code for the Creating a Data Science Platform in Scala talk.
object BenchmarkCommon {
import scala.util.Random
val DatasetSize = 10000
val Iterations = 10000
val ArrayPoolSize = 1000
val ArrayPool = {
def randomArray(): Array[Int] = {
val array = new Array[Int](DatasetSize)
(0 until array.length).foreach { index =>
array(index) = Random.nextInt(DatasetSize)
}
array
}
val pool = new Array[Array[Int]](ArrayPoolSize)
(0 until pool.length).foreach { index =>
pool(index) = randomArray()
}
pool
}
def benchmark(label: String)(f: => Unit) = {
val start = System.nanoTime()
var i = 0
while (i < Iterations) {
f
i += 1
}
val end = System.nanoTime()
val length = (end - start) / 1000000000.0
println(label + ": " + length + " seconds")
}
def randomArray() = ArrayPool(Random.nextInt(ArrayPoolSize))
}
object LazyNaiveAttempt {
import scala.annotation.tailrec
type Joined[A, B] = Dataset[(Option[A], Option[B])]
trait Dataset[A] extends (Int => A) { self =>
def join[B, C](that: Dataset[B])(f: A => C, g: B => C)
(implicit ord: Ordering[C], cmA: ClassManifest[A], cmB: ClassManifest[B]): Joined[A, B] = {
@tailrec def loop(acc: Joined[A, B], d1: Dataset[A], d2: Dataset[B]): Joined[A, B] = {
(d1.uncons, d2.uncons) match {
case (None, None) => acc
case (None, Some((_, _))) =>
acc.concat(d2.map(x => (None, Some(x))))
case (Some((_, _)), None) =>
acc.concat(d1.map(x => (Some(x), None)))
case (Some((x, xs)), Some((y, ys))) =>
val cmp = ord.compare(f(x), g(y))
if (cmp < 0) loop(acc.append((Some(x), None)), xs, d2)
else if (cmp > 0) loop(acc.append((None, Some(y))), d1, ys)
else loop(acc.append((Some(x), Some(y))), xs, ys)
}
}
loop(Dataset.empty[(Option[A], Option[B])], self.sortBy(f), that.sortBy(g))
}
def sortBy[B](f: A => B)(implicit cm: ClassManifest[A], ord: Ordering[B]) =
Dataset.fromArray(toArray.sortBy(f))
def append(x: A) = new Dataset[A] {
def apply(i: Int) = if (i < self.size) self.apply(i) else x
def size = self.size + 1
}
def concat(that: Dataset[A]) = new Dataset[A] {
def apply(i: Int) = if (i < self.size) self(i) else that(i - self.size)
def size = self.size + that.size
}
def uncons: Option[(A, Dataset[A])] = {
if (size == 0) None
else Some(self(0), new Dataset[A] {
def apply(i: Int): A = self.apply(i + 1)
def size = self.size - 1
})
}
def map[B](f: A => B) = new Dataset[B] {
def apply(i: Int): B = f(self(i))
def size = self.size
}
def fold[B](zero: B)(f: (B, A) => B): B = {
uncons match {
case None => zero
case Some((x, xs)) => xs.fold(f(zero, x))(f)
}
}
def size: Int
def toArray(implicit cm: ClassManifest[A]) = (0 until size).map(self).toArray
}
object Dataset {
def empty[A: ClassManifest] = fromArray(new Array[A](0))
def fromArray[A](value: Array[A]): Dataset[A] = new Dataset[A] {
def apply(i: Int): A = value(i)
def size = value.length
}
}
}
object SmarterStrictAttempt {
type Joined[A, B] = Dataset[(Option[A], Option[B])]
case class Dataset[A: ClassManifest](value: Array[A]) extends (Int => A) { self =>
def apply(i: Int): A = value(i)
def join[B, C, D](that: Dataset[B])(f: A => C, g: B => C)(left: A => D, right: B => D, both: (A, B) => D)
(implicit ord: Ordering[C], cmA: ClassManifest[A], cmB: ClassManifest[B], cmD: ClassManifest[D]): Dataset[D] = {
val builder = scala.collection.mutable.ArrayBuilder.make[D]
val sort1 = self.value.sortBy(f)
val sort2 = that.value.sortBy(g)
var i = 0
var j = 0
var more = true
while (more) {
if (i < sort1.length) {
if (j < sort2.length) {
val x = sort1(i)
val y = sort2(j)
val cmp = ord.compare(f(x), g(y))
if (cmp < 0) {
i += 1
builder += left(x)
} else if (cmp > 0) {
j += 1
builder += right(y)
} else {
i += 1
j += i
builder += both(x, y)
}
} else {
builder ++= sort1.drop(i).map(left)
more = false
}
} else {
builder ++= sort2.drop(j).map(right)
more = false
}
}
Dataset(builder.result())
}
def map[B: ClassManifest](f: A => B) = Dataset(value = value.map(f))
def fold[B](zero: B)(f: (B, A) => B): B = {
var i = 0
var acc = zero
while (i < value.length) {
acc = f(acc, value(i))
i += 1
}
acc
}
def size: Int = value.length
}
}
import BenchmarkCommon._
/* benchmark("lazy-naive") {
import LazyNaiveAttempt._
val arr1 = Dataset.fromArray(randomArray())
val arr2 = Dataset.fromArray(randomArray())
arr1.map(_ * 2).join(arr2.map(_ * 2))(identity[Int], identity[Int]).fold(0) {
case (acc, (None, None)) => acc
case (acc, (None, Some(x))) => acc + x
case (acc, (Some(x), None)) => acc + x
case (acc, (Some(x), Some(y))) => acc + x + y
}
} */
benchmark("smarter-strict") {
import SmarterStrictAttempt._
val arr1 = Dataset(randomArray())
val arr2 = Dataset(randomArray())
arr1.map(_ * 2).join(arr2.map(_ * 2))(identity[Int], identity[Int])(identity[Int], identity[Int], _ + _).fold(0)(_ + _)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment