-
-
Save zsennenga/a6cb2267beeab45d26371346c65c6663 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.time.LocalDate | |
/**
 * Maps the key pair `(one, two)` to a bucket index derived from the tuple's hash code.
 *
 * `%` yields a remainder with the sign of the hash code, so a negative remainder is
 * shifted up by `mod`. For a positive `mod` the result therefore lies in `[0, mod)`.
 *
 * @param one first component of the key
 * @param two second component of the key
 * @param mod number of buckets to reduce the hash code into
 * @return the bucket index for the pair
 */
def hashCodeTuple(one: String, two: Int, mod: Int): Int =
  Tuple2(one, two).hashCode % mod match {
    case negative if negative < 0 => negative + mod
    case nonNegative              => nonNegative
  }
/**
 * Simulates hashing one key per (day, file index) pair into
 * `filesPerPartition * numberDS` buckets and reports how evenly the keys spread.
 *
 * Days are consecutive dates starting at 2019-01-01; each day contributes
 * `filesPerPartition` keys, hashed with `hashCodeTuple(day.toString, fileIndex, bucketCount)`.
 * (The original comments refer to the buckets as "sPartitions".)
 *
 * @param numberDS          number of consecutive days to generate keys for
 * @param filesPerPartition number of keys generated per day
 * @return a triple of ratios:
 *         1. distinct buckets used / total keys generated,
 *         2. buckets holding more than one key / distinct buckets used,
 *         3. buckets holding three or more keys / distinct buckets used.
 *         If no keys are generated (either argument is <= 0) the ratios are `NaN`,
 *         since they are computed as `0.0 / 0.0`.
 */
def iteration(numberDS: Int, filesPerPartition: Int): (Double, Double, Double) = {
  val bucketCount = filesPerPartition * numberDS
  // Hoisted: the start date is the same for every generated key.
  val firstDay = LocalDate.of(2019, 1, 1)

  // One hashed bucket index per (day, file index) pair.
  val hashedRandKeys = for {
    dayOffset <- 0 until numberDS
    day = firstDay.plusDays(dayOffset).toString
    fileIndex <- 0 until filesPerPartition
  } yield hashCodeTuple(day, fileIndex, bucketCount)

  // bucket index -> number of keys that landed in it
  val keysPerBucket = hashedRandKeys.groupMapReduce(identity)(_ => 1)(_ + _)

  val totalKeys = hashedRandKeys.size.toDouble
  val usedBuckets = keysPerBucket.size.toDouble
  // Only the counts are needed, so the buckets are counted rather than filtered and sorted.
  val collidingBuckets = keysPerBucket.count { case (_, keys) => keys != 1 }
  val severelyCollidingBuckets = keysPerBucket.count { case (_, keys) => keys > 2 }

  (
    usedBuckets / totalKeys,
    collidingBuckets / usedBuckets,
    severelyCollidingBuckets / usedBuckets
  )
}
// Scenarios to simulate, as (numberDS, filesPerPartition), evaluated in this order.
val results = Seq(
  (365, 1),
  (365, 5),
  (365, 10),
  (365, 100),
  (365 * 2, 100),
  (365 * 5, 100),
  (365 * 10, 100)
).map { case (numberDS, filesPerPartition) => iteration(numberDS, filesPerPartition) }

// Mean of the first ratio (original note: "ratio of executors / output files").
val avgEfficiency =
  results.map { case (efficiency, _, _) => efficiency }.sum / results.size

// Mean share of used buckets that received more than one key.
val avgCollisionRate =
  results.map { case (_, collisionRate, _) => collisionRate }.sum / results.size

// Mean share of used buckets where three or more hashes collided.
val avgSevereCollisionRate =
  results.map { case (_, _, severeRate) => severeRate }.sum / results.size

// Result recorded by the original author:
// 63.2% Efficiency, 42% collision rate, 12.6% severe collision rate
(avgEfficiency, avgCollisionRate, avgSevereCollisionRate)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment