Skip to content

Instantly share code, notes, and snippets.

@zsennenga
Created January 6, 2020 22:17
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save zsennenga/a6cb2267beeab45d26371346c65c6663 to your computer and use it in GitHub Desktop.
Save zsennenga/a6cb2267beeab45d26371346c65c6663 to your computer and use it in GitHub Desktop.
import java.time.LocalDate
/**
 * Maps a `(String, Int)` pair onto a bucket index by hashing the pair as a tuple.
 *
 * `%` keeps the sign of the dividend and a tuple's `hashCode` may be negative,
 * so a negative remainder is shifted up by `mod`. For a positive `mod` the
 * result therefore lies in `[0, mod)`.
 *
 * @param one first component of the key being hashed
 * @param two second component of the key being hashed
 * @param mod number of buckets to spread the keys over
 * @return the bucket index of `(one, two)`
 * @throws java.lang.ArithmeticException if `mod` is 0 (integer remainder by zero)
 */
def hashCodeTuple(one: String, two: Int, mod: Int): Int = {
  val remainder = Tuple2(one, two).hashCode % mod
  if (remainder < 0) remainder + mod else remainder
}
/**
 * Simulates hashing `numberDS * filesPerPartition` keys into the same number of
 * buckets and reports how evenly they spread.
 *
 * A key is a pair of a date (consecutive days starting at 2019-01-01, rendered
 * with `LocalDate.toString`) and a file index in `[0, filesPerPartition)`.
 * Each key is assigned a bucket by `hashCodeTuple`.
 *
 * @param numberDS          number of consecutive dates to generate
 *                          (presumably "date stamps"/date partitions — confirm with the author)
 * @param filesPerPartition number of file indices generated per date
 * @return a triple of ratios:
 *         1. distinct buckets used / total keys generated,
 *         2. buckets holding more than one key / distinct buckets used,
 *         3. buckets holding more than two keys / distinct buckets used.
 *         When either argument is non-positive no keys are generated and every
 *         ratio is `NaN` (0.0 / 0.0).
 */
def iteration(numberDS: Int, filesPerPartition: Int): (Double, Double, Double) = {
  // As many buckets as keys, so a perfect hash would use every bucket exactly once.
  val bucketCount = filesPerPartition * numberDS
  val firstDay = LocalDate.of(2019, 1, 1)

  // One hashed key per (date, file index) pair.
  val hashedRandKeys = (0 until numberDS).flatMap { dayOffset =>
    val day = firstDay.plusDays(dayOffset).toString
    (0 until filesPerPartition).map(fileIndex => hashCodeTuple(day, fileIndex, bucketCount))
  }

  // bucket -> number of keys that landed in it
  val keysPerBucket = hashedRandKeys.groupMapReduce(identity)(_ => 1)(_ + _)

  val totalKeys = hashedRandKeys.size.toDouble
  val usedBuckets = keysPerBucket.size.toDouble
  // Every count is at least 1, so "more than one" is exactly "not equal to one".
  val collidedBuckets = keysPerBucket.values.count(_ > 1).toDouble
  val severelyCollidedBuckets = keysPerBucket.values.count(_ > 2).toDouble

  (
    usedBuckets / totalKeys,
    collidedBuckets / usedBuckets,
    severelyCollidedBuckets / usedBuckets
  )
}
// Scenarios to simulate, as (number of dates, files per date); each is run through `iteration`.
val results = Seq(
  (365, 1),
  (365, 5),
  (365, 10),
  (365, 100),
  (365 * 2, 100),
  (365 * 5, 100),
  (365 * 10, 100)
).map { case (days, filesPerDay) => iteration(days, filesPerDay) }

// Mean ratio of executors to output files (distinct hash buckets / generated keys).
val avgEfficiency = results.map { case (efficiency, _, _) => efficiency }.sum / results.size
// Mean share of used buckets that received more than one key.
val avgCollisionRate = results.map { case (_, collisionRate, _) => collisionRate }.sum / results.size
// Mean share of used buckets where three or more hashes collide.
val avgSevereCollisionRate = results.map { case (_, _, severeRate) => severeRate }.sum / results.size

// Author's recorded output: 63.2% efficiency, 42% collision rate, 12.6% severe collision rate.
(avgEfficiency, avgCollisionRate, avgSevereCollisionRate)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment