@shivaram
Last active August 29, 2015 14:04
takeOrdered microbenchmark
package org.apache.spark.scheduler

import scala.util.Random

import org.apache.spark.storage.BlockManagerId
import org.apache.spark.util.collection.{Utils => CollectionUtils}

// Microbenchmark for CollectionUtils.takeOrdered, configured via system properties:
//   -DtoTake=N       how many top entries to select per reducer
//   -DnumReducers=N  how many (location, size) lists to process
//   -DnumEntries=N   how many entries (mappers) per list
object TopkBench extends testing.Benchmark {
  val toTake = sys.props("toTake").toInt
  val numHosts = 1000
  val numReducers = sys.props("numReducers").toInt
  val numEntries = sys.props("numEntries").toInt

  var allLocs: Seq[Seq[(BlockManagerId, Long)]] = null

  // Generate the input outside the timed section.
  override def setUp() = {
    allLocs = generateLocs(numEntries, numReducers)
  }

  // Timed body: for each reducer, take the `toTake` pairs with the largest sizes.
  def run = {
    var r = 0
    while (r < numReducers) {
      val topLocs = CollectionUtils.takeOrdered(allLocs(r).iterator,
        toTake)(Ordering.by[(BlockManagerId, Long), Long](_._2).reverse).toSeq
      r = r + 1
    }
  }

  // One list of (location, size) pairs per reducer, with random sizes.
  def generateLocs(numEntries: Int, numReducers: Int) = {
    (0 until numReducers).map { r =>
      (0 until numEntries).map { i =>
        (makeBlockManagerId(), Random.nextInt(numEntries).toLong)
      }
    }
  }

  def makeBlockManagerId() = {
    BlockManagerId("exec0", "host" + Random.nextInt(numHosts), 10, 10)
  }
}
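For context, CollectionUtils.takeOrdered performs a bounded top-k selection rather than a full sort (Spark's version delegates to Guava's Ordering.leastOf). The sketch below shows equivalent behavior in plain Scala; the name takeOrderedSketch and the heap-based approach are illustrative assumptions, not Spark's actual code.

import scala.collection.mutable

// Illustrative only: keep the `num` smallest elements under `ord` in a
// bounded max-heap; O(n log k) time, O(k) space.
def takeOrderedSketch[T](input: Iterator[T], num: Int)(implicit ord: Ordering[T]): Iterator[T] = {
  val heap = mutable.PriorityQueue.empty[T](ord) // dequeues the largest kept element first
  input.foreach { x =>
    if (heap.size < num) {
      heap.enqueue(x)
    } else if (ord.lt(x, heap.head)) {
      heap.dequeue() // evict the largest of the kept elements
      heap.enqueue(x)
    }
  }
  heap.dequeueAll.reverseIterator // ascending under `ord`
}

With the benchmark's reversed ordering (Ordering.by(_._2).reverse), the `num` "smallest" elements are the blocks with the largest sizes, so no full sort of the 100k-entry list is needed.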
shivaram commented Aug 1, 2014

java -cp assembly/target/scala-2.10/spark-assembly-1.1.0-SNAPSHOT-hadoop1.0.4.jar -DnumEntries=100000 org.apache.spark.scheduler.TopkBench 10
10 runs with 100k entries in the array
org.apache.spark.scheduler.TopkBench$ 103 17 15 6 5 5 4 4 4 4
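(The trailing 10 is the run count consumed by scala.testing.Benchmark; the numbers after the class name are per-run times in milliseconds, and the much slower first run presumably reflects JIT warm-up. The command above omits -DtoTake and -DnumReducers, which the benchmark also reads, so a complete invocation would presumably look like the following, with the property values assumed:)

java -cp assembly/target/scala-2.10/spark-assembly-1.1.0-SNAPSHOT-hadoop1.0.4.jar -DnumEntries=100000 -DnumReducers=100 -DtoTake=5 org.apache.spark.scheduler.TopkBench 10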

shivaram commented Aug 1, 2014

1000 mappers, top 5
numReducers=100
org.apache.spark.scheduler.TopkBench$ 85 9 9 10 8 9 9 8 8 9

numReducers=1000
org.apache.spark.scheduler.TopkBench$ 178 70 69 175 85 93 87 87 85 126

10000 mappers, top 5
numReducers=100
org.apache.spark.scheduler.TopkBench$ 156 50 51 119 66 68 73 64 69 93

numReducers=1000
org.apache.spark.scheduler.TopkBench$ 860 1099 777 848 2717 488 1258 1841 1123 1679

numReducers=10000
Crashes with out-of-memory exceptions even with a 5g heap, presumably because setUp materializes 10000 reducers × 10000 entries = 10^8 (BlockManagerId, Long) tuples before the timed runs start
