Skip to content

Instantly share code, notes, and snippets.

@tbertelsen
Last active August 29, 2015 14:13
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save tbertelsen/efcdfbf82c200518274a to your computer and use it in GitHub Desktop.
Save tbertelsen/efcdfbf82c200518274a to your computer and use it in GitHub Desktop.
// Sample executions (run locally with 4 threads)
// n      time1 (two joins)   time2 (cartesian + join)
// 1000   1.16 s              2.91 s
// 10000  4.88 s              980.45 s
// Build the benchmark input: `n` labelled ids plus a random collection of id pairs.
// Local import: nothing else in this script brings `Random` into scope.
import scala.util.Random
val n = 10000
// (id, label) records for the ids 1..n.
val data = sc.parallelize(1L to n.toLong).map(x => (x, s"Number $x"))
// For every record emit between 0 and 29 random (id, id) pairs; each id is drawn from 1..n.
// The record itself is ignored; it only determines how many times pairs are generated.
val pairs = data.flatMap { _ =>
  Seq.fill(Random.nextInt(30)) {
    (Random.nextInt(n).toLong + 1, Random.nextInt(n).toLong + 1)
  }
}
// Mark both datasets as cached and count them before any timing starts, presumably so
// the timed sections below do not include the cost of generating the input.
// NOTE(review): `pairs` is random; if a cached partition were ever recomputed it would
// presumably contain different pairs -- confirm this is acceptable for the comparison.
val dataCount = data.cache().count()
val pairCount = pairs.cache().count()
// Approach 1: resolve both ids of every pair with two successive joins against `data`.
val startTime1 = System.currentTimeMillis()
// Key each pair by its first id and join to pick up that id's label,
// then drop the join key: elements are ((first, second), firstLabel).
val firstjoin = pairs
  .map { case pair @ (first, _) => (first, pair) }
  .join(data)
  .map { case (_, pairWithLabel) => pairWithLabel }
// Re-key by the second id, join again for the second label, and reshape to
// ((first, second), (firstLabel, secondLabel)).
val result = firstjoin
  .map { case entry @ ((_, second), _) => (second, entry) }
  .join(data)
  .map { case (_, ((idPair, firstLabel), secondLabel)) => (idPair, (firstLabel, secondLabel)) }
// Counting forces the joins to run, so the elapsed time covers the whole pipeline.
val count1 = result.count()
val endTime1 = System.currentTimeMillis()
// Approach 2: build every (id, id) combination up front, then join the pairs against it.
val startTime2 = System.currentTimeMillis()
// Cross `data` with itself and re-key by the id pair: ((id1, id2), (label1, label2)).
val cartesianData = data
  .cartesian(data)
  .map { case (left, right) => ((left._1, right._1), (left._2, right._2)) }
// Use each pair as a join key (the 0 is only a placeholder value), keep just the
// labels from the joined value, and count to force execution.
val count2 = pairs
  .map(pair => (pair, 0))
  .join(cartesianData)
  .mapValues { case (_, labels) => labels }
  .count()
val endTime2 = System.currentTimeMillis()
// Report the dataset sizes, the result count of each approach, and the elapsed
// wall-clock time of each approach in seconds (millisecond difference / 1000.0).
print("|data| = %d%n".format(dataCount))
print("|pairs| = %d | %d | %d%n".format(pairCount, count1, count2))
print("1 Done in %.2f s%n%n".format((endTime1 - startTime1) / 1000.0))
print("2 Done in %.2f s%n%n".format((endTime2 - startTime2) / 1000.0))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment