-
-
Save tbertelsen/efcdfbf82c200518274a to your computer and use it in GitHub Desktop.
Sample benchmark for http://stackoverflow.com/a/27984211/722929
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Sample executions (locally, 4 threads)
// n       time1     time2
// 1000    1.16 s    2.91 s
// 10000   4.88 s    980.45 s
// Benchmark script comparing two ways of attaching the values of `data` to both
// sides of a set of (key, key) pairs:
//   1. two successive key-based joins against `data`;
//   2. a single join against the full cartesian product `data x data`.
// Expects a SparkContext named `sc` to be in scope (as in spark-shell).
import scala.util.Random

// Number of records in `data`; keys run from 1 to n.
val n = 10000

// One (key, label) record per key.
val data = sc.parallelize(1L to n.toLong).map(x => (x, "Number " + x))

// For every record emit between 0 and 29 random (key, key) pairs, each key in 1..n.
val pairs = data.flatMap { _ =>
  Seq.fill(Random.nextInt(30)) {
    (Random.nextInt(n).toLong + 1, Random.nextInt(n).toLong + 1)
  }
}

// Mark both inputs as cached and run an action on each before the timed sections.
// NOTE(review): `pairs` is random, so both strategies only see identical pairs as
// long as the cached partitions are not evicted and recomputed — confirm for large n.
val dataCount = data.cache().count()
val pairCount = pairs.cache().count()

// --- Strategy 1: two joins, first on the left key, then on the right key ---
val startTime1 = System.currentTimeMillis()
// ((k1, k2), v1): the pair together with the value of its first key.
val firstjoin = pairs.map(p => (p._1, p)).join(data).map(_._2)
// Re-key by the second key so the second join can look up its value.
val keyedBySecond = firstjoin.map { case (pair, leftValue) => (pair._2, (pair, leftValue)) }
// ((k1, k2), (v1, v2))
val result = keyedBySecond.join(data).map { case (_, ((pair, leftValue), rightValue)) => (pair, (leftValue, rightValue)) }
val count1 = result.count()
val endTime1 = System.currentTimeMillis()

// --- Strategy 2: one join against the cartesian product of `data` with itself ---
val startTime2 = System.currentTimeMillis()
// ((k1, k2), (v1, v2)) for every combination of two records.
val cartesianData = data.cartesian(data).map { case ((k1, v1), (k2, v2)) => ((k1, k2), (v1, v2)) }
// The 0 is a placeholder value so `pairs` can act as the key side of the join.
val count2 = pairs.map(p => (p, 0)).join(cartesianData).mapValues(_._2).count()
val endTime2 = System.currentTimeMillis()

// Report sizes and wall-clock durations (in seconds) of both strategies.
printf("|data| = %d%n", dataCount)
printf("|pairs| = %d | %d | %d%n", pairCount, count1, count2)
printf("1 Done in %.2f s%n%n", (endTime1 - startTime1) / 1000.0)
printf("2 Done in %.2f s%n%n", (endTime2 - startTime2) / 1000.0)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment