Skip to content

Instantly share code, notes, and snippets.

@y-matsuwitter
Created December 16, 2013 16:43
Show Gist options
  • Save y-matsuwitter/7990110 to your computer and use it in GitHub Desktop.
Save y-matsuwitter/7990110 to your computer and use it in GitHub Desktop.
Goでb-bit Minwise Hashing実装した話 ref: http://qiita.com/y_matsuwitter/items/505565d1ccf403a48c77
sim(a, b) = \frac{|a \cap b|}{|a \cup b|}
あるハッシュ関数を2つのデータの全要素に対して適用した時、各データの最小のハッシュ値が一致する確率はJaccard係数に等しい
Sim(a,b) = P(min\{h(d_a)| d_a \in a\} = min\{h(d_b)| d_b \in b\})
Similarity = P(ハッシュ値の最小値が一致する確率) - P(ハッシュ値の衝突確率)
$ go run minhash.go
0.375
0.28125
package main
import (
"./mmh"
"fmt"
"math"
"math/big"
"math/rand"
)
var bitMask = uint32(0x1)
func minKey(l map[string]uint32) (string, uint32) {
var result string
m := uint32(math.MaxUint32)
for k := range l {
if m > l[k] {
m = l[k]
result = k
}
}
return result, m
}
func minHash(data []string, seed uint32) uint32 {
vector := make(map[string]uint32)
for k := range data {
vector[data[k]] = mmh.Murmurhash3_32(data[k], seed)
}
_, value := minKey(vector)
return value
}
func signature(data []string) uint32 {
rand.Seed(1)
sig := uint32(0)
for i := 0; i < 128; i++ {
sig += (minHash(data, rand.Uint32()) & bitMask) << uint32(i)
}
return sig
}
func signatureBig(data []string) *big.Int {
rand.Seed(1)
sigBig := big.NewInt(0)
for i := 0; i < 128; i++ {
sigBig.SetBit(sigBig, i, uint(minHash(data, rand.Uint32())&bitMask))
}
return sigBig
}
func popCount(bits uint32) uint32 {
bits = (bits & 0x55555555) + (bits >> 1 & 0x55555555)
bits = (bits & 0x33333333) + (bits >> 2 & 0x33333333)
bits = (bits & 0x0f0f0f0f) + (bits >> 4 & 0x0f0f0f0f)
bits = (bits & 0x00ff00ff) + (bits >> 8 & 0x00ff00ff)
return (bits & 0x0000ffff) + (bits >> 16 & 0x0000ffff)
}
func popCountBig(bits *big.Int) int {
result := 0
for _, v := range bits.Bytes() {
result += int(popCount(uint32(v)))
}
return result
}
func calcJaccard(v1, v2 []string) float32 {
commonBig := big.NewInt(0)
commonBig.Xor(signatureBig(v1), signatureBig(v2))
return 2.0 * (float32((128.0-popCountBig(commonBig)))/128.0 - 0.5)
}
func calc() {
fmt.Println(calcJaccard([]string{
"21", "歳", "ビール", "飲む", "アサヒ", "リクルート", "連携",
}, []string{
"21", "歳", "ビール", "無料", "アサヒ", "若者", "向け", "企画",
}))
fmt.Println(calcJaccard([]string{
"21", "歳", "ビール", "飲む", "アサヒ", "リクルート", "連携",
}, []string{
"サンクト", "ガーレン", "バレンタイン", "向け", "チョコレート", "風味", "ビール", "4", "種", "発売",
}))
}
func main() {
calc();
}
package mmh
import (
"bytes"
"encoding/binary"
)
var mask32 = uint32(0xffffffff)
func rotl(x, r uint32) uint32 {
return ((x << r) | (x >> (32 - r))) & mask32
}
func mmix(h uint32) uint32 {
h &= mask32
h ^= h >> 16
h = (h * 0x85ebca6b) & mask32
h ^= h >> 13
h = (h * 0xc2b2ae35) & mask32
return h ^ (h >> 16)
}
func Murmurhash3_32(key string, seed uint32) uint32 {
var h1 uint32 = seed
var k uint32
var c1, c2 uint32 = 0xcc9e2d51, 0x1b873593
buffer := bytes.NewBufferString(key)
keyBytes := []byte(key)
length := buffer.Len()
if length == 0 {
return 0
}
nblocks := length / 4
for i := 0; i < nblocks; i++ {
binary.Read(buffer, binary.LittleEndian, &k)
k *= c1
k = rotl(k, 15)
k *= c2
h1 ^= k
h1 = rotl(h1, 13)
h1 = h1*5 + 0xe6546b64
}
var k1 uint32 = 0
tail := nblocks * 4
switch length & 3 {
case 3:
k1 ^= uint32(keyBytes[tail+2]) << 16
fallthrough
case 2:
k1 ^= uint32(keyBytes[tail+1]) << 8
fallthrough
case 1:
k1 ^= uint32(keyBytes[tail])
k *= c1
k = rotl(k, 15)
k *= c2
h1 ^= k
}
// finalize
h1 ^= uint32(length)
return mmix(h1)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment