Created
April 2, 2014 19:11
-
-
Save roman-kashitsyn/9941024 to your computer and use it in GitHub Desktop.
Simple command line utility for fuzzy search.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"bufio" | |
"container/heap" | |
"flag" | |
"fmt" | |
"io" | |
"os" | |
"sort" | |
"strings" | |
"unicode" | |
) | |
const ( | |
MinEntryLen = 1 | |
JaccardSimilarityCoeff = 0.5 | |
SubstringSimilarityCoeff = 0.5 | |
eof = gram(-1) | |
) | |
type gram int64 | |
type gramSet []gram | |
type entryId int32 | |
type EntrySet []entryId | |
type SearchEntry struct { | |
text string | |
textLower string | |
grams gramSet | |
} | |
type WeightedEntry struct { | |
SearchEntry | |
score float64 | |
} | |
type Result []*WeightedEntry | |
type GramIndex struct { | |
gramToEntries map[gram]EntrySet | |
idToEntry []*SearchEntry | |
} | |
func forEachGram(s string, f func(gram)) { | |
rs := []rune(s) | |
l := len(rs) | |
n := l - 1 | |
for i, c := range rs { | |
rs[i] = unicode.ToLower(c) | |
} | |
// Extra "starts with" gram | |
f(eof<<32 | gram(rs[0])) | |
for i := 0; i < n; i += 1 { | |
b := gram(rs[i])<<32 | gram(rs[i+1]) | |
f(b) | |
} | |
} | |
func newGramSet(pattern string) gramSet { | |
s := make(gramSet, 0, len(pattern)-1) | |
forEachGram(pattern, func(g gram) { | |
s = append(s, g) | |
}) | |
sort.Sort(s) | |
unique(&s) | |
return s | |
} | |
func unique(s *gramSet) { | |
n := len(*s) | |
gs := *s | |
if n <= 1 { | |
return | |
} | |
b, e := 0, 1 | |
for { | |
for e < n && gs[e] == gs[b] { | |
e++ | |
} | |
if e == n { | |
break | |
} | |
b++ | |
if e-b > 0 { | |
gs[b] = gs[e] | |
} | |
} | |
*s = gs[:b+1] | |
} | |
func (gs gramSet) Len() int { | |
return len(gs) | |
} | |
func (gs gramSet) Swap(i, j int) { | |
gs[i], gs[j] = gs[j], gs[i] | |
} | |
func (gs gramSet) Less(i, j int) bool { | |
return gs[i] < gs[j] | |
} | |
func newEntry(s string) *SearchEntry { | |
return &SearchEntry{s, strings.ToLower(s), newGramSet(s)} | |
} | |
// Gram Index | |
func NewGramIndex() *GramIndex { | |
index := &GramIndex{} | |
index.gramToEntries = make(map[gram]EntrySet) | |
index.idToEntry = make([]*SearchEntry, 0) | |
return index | |
} | |
func (i *GramIndex) Add(e *SearchEntry) { | |
if len(e.text) < MinEntryLen { | |
return | |
} | |
i.idToEntry = append(i.idToEntry, e) | |
eid := entryId(len(i.idToEntry) - 1) | |
for _, g := range e.grams { | |
s := i.gramToEntries[g] | |
if s == nil { | |
s = EntrySet{} | |
} | |
s = append(s, eid) | |
i.gramToEntries[g] = s | |
} | |
} | |
// Implementation of fast read-only search entry heap. It turns out | |
// that it's (3-5x times) faster to inline container/heap module | |
// functions here then to use the module directly. | |
type searchHeapView []EntrySet | |
func (hp *searchHeapView) Init() { | |
n := len(*hp) | |
for i := n/2 - 1; i >= 0; i-- { | |
hp.down(i, n) | |
} | |
} | |
func (hp *searchHeapView) popOnce() entryId { | |
top := (*hp)[0] | |
e := top[0] | |
(*hp)[0] = top[1:] | |
n := len(*hp) - 1 | |
hp.Swap(0, n) | |
if len(top) > 1 { | |
hp.down(0, n) | |
hp.up(n) | |
} else { | |
hp.down(0, n) | |
*hp = (*hp)[:n] | |
} | |
return e | |
} | |
func (hp *searchHeapView) PopItem() (e entryId, n int) { | |
e = hp.popOnce() | |
n = 1 | |
for !hp.IsEmpty() && (*hp)[0][0] == e { | |
hp.popOnce() | |
n++ | |
} | |
return e, n | |
} | |
func (hp *searchHeapView) IsEmpty() bool { | |
return len(*hp) == 0 | |
} | |
func (hp *searchHeapView) Push(x interface{}) { | |
*hp = append(*hp, x.(EntrySet)) | |
} | |
func (hp *searchHeapView) Pop() interface{} { | |
h := *hp | |
end := len(h) - 1 | |
x := h[end] | |
*hp = h[:end] | |
return x | |
} | |
func (h searchHeapView) Less(i, j int) bool { | |
return h[i][0] < h[j][0] | |
} | |
func (h searchHeapView) Swap(i, j int) { | |
h[i], h[j] = h[j], h[i] | |
} | |
func (h searchHeapView) Len() int { | |
return len(h) | |
} | |
func (h searchHeapView) up(j int) { | |
for { | |
i := (j - 1) / 2 // parent | |
if i == j || !(h[j][0] < h[i][0]) { | |
break | |
} | |
h.Swap(i, j) | |
j = i | |
} | |
} | |
func (h searchHeapView) down(i, n int) { | |
for { | |
j1 := 2*i + 1 | |
if j1 >= n || j1 < 0 { // j1 < 0 after int overflow | |
break | |
} | |
j := j1 // left child | |
if j2 := j1 + 1; j2 < n && !(h[j1][0] < h[j2][0]) { | |
j = j2 // = 2*i + 2 // right child | |
} | |
if !(h[j][0] < h[i][0]) { | |
break | |
} | |
h.Swap(i, j) | |
i = j | |
} | |
} | |
func (lhs *WeightedEntry) Less(rhs *WeightedEntry) bool { | |
return lhs.score < rhs.score | |
} | |
func (r Result) Len() int { | |
return len(r) | |
} | |
func (r Result) Swap(i, j int) { | |
r[i], r[j] = r[j], r[i] | |
} | |
func (r Result) Less(i, j int) bool { | |
return r[i].Less(r[j]) | |
} | |
func (r *Result) Push(x interface{}) { | |
*r = append(*r, x.(*WeightedEntry)) | |
} | |
func (r *Result) Pop() interface{} { | |
old := *r | |
end := len(old) - 1 | |
x := old[end] | |
*r = old[0:end] | |
return x | |
} | |
func (s EntrySet) Len() int { | |
return len(s) | |
} | |
func (s EntrySet) Less(i, j int) bool { | |
return s[i] < s[j] | |
} | |
func (s EntrySet) Swap(i, j int) { | |
s[i], s[j] = s[j], s[i] | |
} | |
func (s *EntrySet) Push(x interface{}) { | |
*s = append(*s, x.(entryId)) | |
} | |
func (s *EntrySet) Pop() interface{} { | |
old := *s | |
end := len(old) - 1 | |
e := old[end] | |
*s = old[0:end] | |
return e | |
} | |
// FuzzySearch executes search of a given pattern and returns | |
// the result. | |
func (i *GramIndex) FuzzySearch(pattern string, | |
limit int, | |
scoreThreshold float64) Result { | |
maxGrams := len(pattern) - 1 | |
gramSet := newGramSet(pattern) | |
gramItems := make([]EntrySet, 0, maxGrams) | |
for _, g := range gramSet { | |
entrySet, found := i.gramToEntries[g] | |
if !found { | |
continue | |
} | |
gramItems = append(gramItems, entrySet) | |
} | |
return singlePassSearch(i, pattern, gramSet, gramItems, limit, scoreThreshold) | |
} | |
// singlePassSearch function uses ideas from the SPS algorithm | |
// described in the "Efficient Top-k Algorithms for Fuzzy Search in | |
// String Collections" article by Rares Vernica and Chen Li. | |
func singlePassSearch(i *GramIndex, | |
pattern string, | |
patternGramSet gramSet, | |
items []EntrySet, | |
k int, | |
scoreThreshold float64) Result { | |
result := make(Result, 0, k) | |
n := patternGramSet.Len() | |
frequencyThreshold := 1 | |
searchHeap := searchHeapView(items) | |
searchHeap.Init() | |
for !searchHeap.IsEmpty() { | |
t, p := searchHeap.PopItem() | |
e := i.idToEntry[t] | |
score := getScore(patternGramSet, pattern, e, p) | |
if p >= frequencyThreshold && score >= scoreThreshold { | |
if result.Len() < k { | |
result = append(result, &WeightedEntry{*e, score}) | |
} else if result[0].score < score { | |
heap.Pop(&result) | |
heap.Push(&result, &WeightedEntry{*e, score}) | |
q := recomputeThreshold(result[0].score, n) | |
if q > frequencyThreshold { | |
frequencyThreshold = q | |
} | |
if frequencyThreshold > n { | |
break | |
} | |
} | |
} | |
} | |
sort.Sort(sort.Reverse(result)) | |
return result | |
} | |
func recomputeThreshold(kthScore float64, gramsInQuery int) int { | |
tMin := (kthScore - 1.0 * SubstringSimilarityCoeff) / float64(JaccardSimilarityCoeff) | |
leftPart := tMin * float64(gramsInQuery) | |
MinGramsInCollection := 2 | |
rightPart := float64(gramsInQuery+MinGramsInCollection) / | |
(1 + 1/float64(tMin)) | |
maxPart := leftPart | |
if maxPart < rightPart { | |
maxPart = rightPart | |
} | |
return int(maxPart) | |
} | |
func isSep(c uint8) bool { | |
return c == '.' || c == ':' || c == '/' | |
} | |
func getScore(p gramSet, pattern string, e *SearchEntry, weight int) float64 { | |
unionCnt := len(p) + len(e.grams) - weight | |
jaccardSim := float64(weight) / float64(unionCnt) | |
substrSim := 0.0 | |
if len(p)-weight <= 1 { | |
i := strings.Index(e.textLower, pattern) | |
if i == 0 || i > 0 && isSep(e.textLower[i-1]) { | |
substrSim = 1.0 | |
} else if i > 0 { | |
substrSim = 0.5 | |
} | |
} | |
return jaccardSim*JaccardSimilarityCoeff + substrSim*SubstringSimilarityCoeff | |
} | |
func main() { | |
limit := flag.Int("limit", 50, "max number of results") | |
scoreThreshold := flag.Float64("threshold", 0.05, "score threshold") | |
flag.Parse() | |
n := flag.NArg() | |
if n != 1 { | |
fmt.Printf("Bad number of arguments: %d\n\n", n) | |
flag.Usage() | |
fmt.Printf(" PATTERN\n") | |
return | |
} | |
pattern := flag.Arg(0) | |
in := bufio.NewReader(os.Stdin) | |
idx := NewGramIndex() | |
for { | |
line, err := in.ReadString('\n') | |
if err == io.EOF { | |
break | |
} | |
if err != nil { | |
fmt.Printf("ERROR: %v\n", err) | |
} | |
idx.Add(newEntry(strings.TrimRight(line, "\r\n"))) | |
} | |
result := idx.FuzzySearch(pattern, *limit, *scoreThreshold) | |
for _, e := range result { | |
fmt.Printf("%1.4f %s\n", e.score, e.text) | |
} | |
if len(result) == 0 { | |
os.Exit(1) | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment