Created
December 16, 2013 16:43
-
-
Save y-matsuwitter/7990110 to your computer and use it in GitHub Desktop.
Goでb-bit Minwise Hashing実装した話 ref: http://qiita.com/y_matsuwitter/items/505565d1ccf403a48c77
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
sim(a, b) = \frac{|a \cap b|}{|a \cup b|} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
あるハッシュ関数を2つのデータの全要素に対して適用した時、各データの最小のハッシュ値が一致する確率はJaccard係数に等しい |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Sim(a,b) = P(min\{h(d_a)| d_a \in a\} = min\{h(d_b)| d_b \in b\}) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Similarity = P(ハッシュ値の最小値が一致する確率) - P(ハッシュ値の衝突確率) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
$ go run minhash.go | |
0.375 | |
0.28125 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"./mmh" | |
"fmt" | |
"math" | |
"math/big" | |
"math/rand" | |
) | |
var bitMask = uint32(0x1) | |
func minKey(l map[string]uint32) (string, uint32) { | |
var result string | |
m := uint32(math.MaxUint32) | |
for k := range l { | |
if m > l[k] { | |
m = l[k] | |
result = k | |
} | |
} | |
return result, m | |
} | |
func minHash(data []string, seed uint32) uint32 { | |
vector := make(map[string]uint32) | |
for k := range data { | |
vector[data[k]] = mmh.Murmurhash3_32(data[k], seed) | |
} | |
_, value := minKey(vector) | |
return value | |
} | |
func signature(data []string) uint32 { | |
rand.Seed(1) | |
sig := uint32(0) | |
for i := 0; i < 128; i++ { | |
sig += (minHash(data, rand.Uint32()) & bitMask) << uint32(i) | |
} | |
return sig | |
} | |
func signatureBig(data []string) *big.Int { | |
rand.Seed(1) | |
sigBig := big.NewInt(0) | |
for i := 0; i < 128; i++ { | |
sigBig.SetBit(sigBig, i, uint(minHash(data, rand.Uint32())&bitMask)) | |
} | |
return sigBig | |
} | |
func popCount(bits uint32) uint32 { | |
bits = (bits & 0x55555555) + (bits >> 1 & 0x55555555) | |
bits = (bits & 0x33333333) + (bits >> 2 & 0x33333333) | |
bits = (bits & 0x0f0f0f0f) + (bits >> 4 & 0x0f0f0f0f) | |
bits = (bits & 0x00ff00ff) + (bits >> 8 & 0x00ff00ff) | |
return (bits & 0x0000ffff) + (bits >> 16 & 0x0000ffff) | |
} | |
func popCountBig(bits *big.Int) int { | |
result := 0 | |
for _, v := range bits.Bytes() { | |
result += int(popCount(uint32(v))) | |
} | |
return result | |
} | |
func calcJaccard(v1, v2 []string) float32 { | |
commonBig := big.NewInt(0) | |
commonBig.Xor(signatureBig(v1), signatureBig(v2)) | |
return 2.0 * (float32((128.0-popCountBig(commonBig)))/128.0 - 0.5) | |
} | |
func calc() { | |
fmt.Println(calcJaccard([]string{ | |
"21", "歳", "ビール", "飲む", "アサヒ", "リクルート", "連携", | |
}, []string{ | |
"21", "歳", "ビール", "無料", "アサヒ", "若者", "向け", "企画", | |
})) | |
fmt.Println(calcJaccard([]string{ | |
"21", "歳", "ビール", "飲む", "アサヒ", "リクルート", "連携", | |
}, []string{ | |
"サンクト", "ガーレン", "バレンタイン", "向け", "チョコレート", "風味", "ビール", "4", "種", "発売", | |
})) | |
} | |
func main() { | |
calc(); | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package mmh | |
import ( | |
"bytes" | |
"encoding/binary" | |
) | |
var mask32 = uint32(0xffffffff) | |
func rotl(x, r uint32) uint32 { | |
return ((x << r) | (x >> (32 - r))) & mask32 | |
} | |
func mmix(h uint32) uint32 { | |
h &= mask32 | |
h ^= h >> 16 | |
h = (h * 0x85ebca6b) & mask32 | |
h ^= h >> 13 | |
h = (h * 0xc2b2ae35) & mask32 | |
return h ^ (h >> 16) | |
} | |
func Murmurhash3_32(key string, seed uint32) uint32 { | |
var h1 uint32 = seed | |
var k uint32 | |
var c1, c2 uint32 = 0xcc9e2d51, 0x1b873593 | |
buffer := bytes.NewBufferString(key) | |
keyBytes := []byte(key) | |
length := buffer.Len() | |
if length == 0 { | |
return 0 | |
} | |
nblocks := length / 4 | |
for i := 0; i < nblocks; i++ { | |
binary.Read(buffer, binary.LittleEndian, &k) | |
k *= c1 | |
k = rotl(k, 15) | |
k *= c2 | |
h1 ^= k | |
h1 = rotl(h1, 13) | |
h1 = h1*5 + 0xe6546b64 | |
} | |
var k1 uint32 = 0 | |
tail := nblocks * 4 | |
switch length & 3 { | |
case 3: | |
k1 ^= uint32(keyBytes[tail+2]) << 16 | |
fallthrough | |
case 2: | |
k1 ^= uint32(keyBytes[tail+1]) << 8 | |
fallthrough | |
case 1: | |
k1 ^= uint32(keyBytes[tail]) | |
k *= c1 | |
k = rotl(k, 15) | |
k *= c2 | |
h1 ^= k | |
} | |
// finalize | |
h1 ^= uint32(length) | |
return mmix(h1) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment