Skip to content

Instantly share code, notes, and snippets.

@sat0yu
Created September 15, 2019 01:26
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sat0yu/895e28cce3f42dd980eeb8ea4666319f to your computer and use it in GitHub Desktop.
Save sat0yu/895e28cce3f42dd980eeb8ea4666319f to your computer and use it in GitHub Desktop.
A MinHash implementation in go
package main
import (
"fmt"
"log"
"math/rand"
"time"
)
const DIM = 26 // |{'a', ..., 'z'}| = 26
const MaxInt = int(^uint(0) >> 1)
type vector [DIM]int
type hashFunc func(vector) int
func genRandomVector() vector {
v := vector{}
for i := 0; i < DIM; i++ {
v[i] = rand.Int()
}
return v
}
func genHash() hashFunc {
randVec := genRandomVector()
return func(vec vector) int {
var min = MaxInt
for i, v := range vec {
// update the min value if the vector contains at least one charactor at the i-th index
if v > 0 && randVec[i] < min {
min = randVec[i]
}
}
return min
}
}
func convTermToVec(t string) vector {
vec := vector{}
for _, c := range t {
if c < 97 || 122 < c {
log.Fatal("found invalid charactor: ", string(c))
}
// a has 97, ..., z does 122
vec[c-97] += 1
}
return vec
}
func main() {
rand.Seed(time.Now().UnixNano())
N := 100
terms := []string{
"apple",
"pineapple",
"orange",
"lemon",
"grape",
"cherry",
"strawberry",
"banana",
"peach",
}
fmt.Print("Enter query: ")
var q string
fmt.Scanln(&q)
qVec := convTermToVec(q)
fmt.Printf("query: %s, v: %v\n", q, qVec)
fmt.Println("")
vectors := make([]vector, len(terms))
for i, t := range terms {
vectors[i] = convTermToVec(t)
fmt.Printf("[%s] v: %v\n", t, vectors[i])
}
fmt.Println("")
scores := make([]int, len(terms))
for i := 0; i < N; i++ {
h := genHash()
qHash := h(qVec)
for i, v := range vectors {
if qHash == h(v) {
scores[i] += 1
}
}
}
for i, s := range scores {
fmt.Printf("[%s] score: %d\n", terms[i], s)
}
}
@sat0yu
Copy link
Author

sat0yu commented Sep 15, 2019

-[95690]% go run minhash.go
Enter query: raspberry
query: raspberry, v: [1 1 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 3 1 0 0 0 0 0 1 0]

[apple] v: [1 0 0 0 1 0 0 0 0 0 0 1 0 0 0 2 0 0 0 0 0 0 0 0 0 0]
[pineapple] v: [1 0 0 0 2 0 0 0 1 0 0 1 0 1 0 3 0 0 0 0 0 0 0 0 0 0]
[orange] v: [1 0 0 0 1 0 1 0 0 0 0 0 0 1 1 0 0 1 0 0 0 0 0 0 0 0]
[lemon] v: [0 0 0 0 1 0 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0]
[grape] v: [1 0 0 0 1 0 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0]
[cherry] v: [0 0 1 0 1 0 0 1 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 1 0]
[strawberry] v: [1 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 3 1 1 0 0 1 0 1 0]
[banana] v: [3 1 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0]
[peach] v: [1 0 1 0 1 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0]

[apple] score: 33
[pineapple] score: 24
[orange] score: 34
[lemon] score: 9
[grape] score: 51
[cherry] score: 39
[strawberry] score: 72
[banana] score: 23
[peach] score: 28

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment