Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
hash tests
%matplotlib inline
import hashlib
import md5
import struct

import matplotlib.pyplot as plt
import mmh3
import pandas as pd
import seaborn as sns
def md5hash(i):
    """Return bytes 12..15 (the last four) of the MD5 digest of ``str(i)``.

    Uses ``hashlib`` rather than the legacy ``md5`` module; the digest is the
    same, but ``hashlib`` is the supported API and requires bytes input on
    Python 3, hence the explicit encode.
    """
    return hashlib.md5(str(i).encode()).digest()[12:16]
def mm(i):
    """Return ``mmh3.hash`` of ``str(i)`` computed with the fixed seed 1234."""
    return mmh3.hash(str(i), seed=1234)
# Compare how evenly the two hash functions spread the integers 0..z-1 over
# n buckets: top axis shows the MD5-based hash, bottom axis shows mmh3.
f, axes = plt.subplots(2, 1, figsize=(7, 7), sharex=True)
n = 42       # number of buckets
z = 100000   # number of integers hashed
# One bin edge per bucket boundary (0, 1, ..., n) so every bucket gets its own
# bar; the default of 10 bins would lump 4 or 5 buckets together unevenly.
bucket_edges = list(range(n + 1))
# Interpret the 4 digest bytes as a little-endian signed int, then reduce mod n
# (Python's % yields a non-negative result for a positive modulus).
nums = [struct.unpack("<i", md5hash(i))[0] % n for i in range(z)]
#sns.distplot(nums, kde=False, ax=axes[0])
nums_m = [mm(i) % n for i in range(z)]
#sns.distplot(nums_m, kde=False, ax=axes[1])
pd.Series(nums).hist(ax=axes[0], bins=bucket_edges)
pd.Series(nums_m).hist(ax=axes[1], bins=bucket_edges)
#plt.setp(axes, yticks=[])
#plt.tight_layout()
@tovbinm

This comment has been minimized.

Copy link

@tovbinm tovbinm commented May 12, 2015

The same test in Scala:

package com.badgeville

import scala.util.hashing.MurmurHash3
import com.amazonaws.util.Md5Utils
import java.nio.ByteBuffer
import com.quantifind.charts.Highcharts._
import org.bson.types.ObjectId


/**
 * Hashes many ObjectId hex strings with two hash functions, counts how often
 * each of `bins` buckets is hit, and plots the per-bucket counts so the two
 * distributions can be compared visually.
 */
object HashMain extends App {
  /** Range whose size is the number of strings hashed per test. */
  val items = 0 until 1000000
  /** Number of buckets that hash values are reduced into. */
  val bins  = 42

  /**
   * Hex string of `ObjectId.get()`. This is a `def`, so it is re-evaluated on
   * every use; the two tests below therefore hash different input strings
   * (presumably a freshly generated id per call -- confirm against the driver docs).
   */
  def s: String = ObjectId.get().toHexString

  /**
   * Counts bucket hits for `hash` over `items.size` strings obtained from `s`.
   *
   * @param hash function mapping a string to a 32-bit hash value
   * @return array of length `bins`; element `b` is the number of strings whose hash fell in bucket `b`
   */
  def testHash(hash: String => Int): Array[Int] = {
    val counts = Array.fill(bins)(0)
    items.foreach { _ =>
      // Non-negative modulo. `math.abs(x) % bins` is negative for
      // x == Int.MinValue (abs overflows) and would index out of bounds.
      val bucket = ((hash(s) % bins) + bins) % bins
      counts(bucket) += 1
    }
    counts
  }

  val mur3 = testHash(x => MurmurHash3.stringHash(x))
  // Interpret the first four bytes of the MD5 digest as a big-endian Int
  // (ByteBuffer's default byte order). Encode explicitly as UTF-8 so the
  // result does not depend on the platform default charset.
  val md5  = testHash { x =>
    val digest = Md5Utils.computeMD5Hash(x.getBytes(java.nio.charset.StandardCharsets.UTF_8))
    ByteBuffer.wrap(digest.take(4)).getInt
  }

  // x axis is the bucket index; zipping against `items` only worked because
  // zip silently truncates to the shorter side.
  val buckets = 0 until bins

  areaspline(buckets zip mur3)
  title("murmur3 32bit")

  areaspline(buckets zip md5)
  title("4 bytes of md5")

}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment