Skip to content

Instantly share code, notes, and snippets.

@roman-kashitsyn
Created April 2, 2014 19:11
Show Gist options
  • Save roman-kashitsyn/9941024 to your computer and use it in GitHub Desktop.
Save roman-kashitsyn/9941024 to your computer and use it in GitHub Desktop.
Simple command line utility for fuzzy search.
package main
import (
"bufio"
"container/heap"
"flag"
"fmt"
"io"
"os"
"sort"
"strings"
"unicode"
)
const (
MinEntryLen = 1
JaccardSimilarityCoeff = 0.5
SubstringSimilarityCoeff = 0.5
eof = gram(-1)
)
type gram int64
type gramSet []gram
type entryId int32
type EntrySet []entryId
type SearchEntry struct {
text string
textLower string
grams gramSet
}
type WeightedEntry struct {
SearchEntry
score float64
}
type Result []*WeightedEntry
type GramIndex struct {
gramToEntries map[gram]EntrySet
idToEntry []*SearchEntry
}
func forEachGram(s string, f func(gram)) {
rs := []rune(s)
l := len(rs)
n := l - 1
for i, c := range rs {
rs[i] = unicode.ToLower(c)
}
// Extra "starts with" gram
f(eof<<32 | gram(rs[0]))
for i := 0; i < n; i += 1 {
b := gram(rs[i])<<32 | gram(rs[i+1])
f(b)
}
}
func newGramSet(pattern string) gramSet {
s := make(gramSet, 0, len(pattern)-1)
forEachGram(pattern, func(g gram) {
s = append(s, g)
})
sort.Sort(s)
unique(&s)
return s
}
func unique(s *gramSet) {
n := len(*s)
gs := *s
if n <= 1 {
return
}
b, e := 0, 1
for {
for e < n && gs[e] == gs[b] {
e++
}
if e == n {
break
}
b++
if e-b > 0 {
gs[b] = gs[e]
}
}
*s = gs[:b+1]
}
func (gs gramSet) Len() int {
return len(gs)
}
func (gs gramSet) Swap(i, j int) {
gs[i], gs[j] = gs[j], gs[i]
}
func (gs gramSet) Less(i, j int) bool {
return gs[i] < gs[j]
}
func newEntry(s string) *SearchEntry {
return &SearchEntry{s, strings.ToLower(s), newGramSet(s)}
}
// Gram Index
func NewGramIndex() *GramIndex {
index := &GramIndex{}
index.gramToEntries = make(map[gram]EntrySet)
index.idToEntry = make([]*SearchEntry, 0)
return index
}
func (i *GramIndex) Add(e *SearchEntry) {
if len(e.text) < MinEntryLen {
return
}
i.idToEntry = append(i.idToEntry, e)
eid := entryId(len(i.idToEntry) - 1)
for _, g := range e.grams {
s := i.gramToEntries[g]
if s == nil {
s = EntrySet{}
}
s = append(s, eid)
i.gramToEntries[g] = s
}
}
// Implementation of fast read-only search entry heap. It turns out
// that it's (3-5x times) faster to inline container/heap module
// functions here then to use the module directly.
type searchHeapView []EntrySet
func (hp *searchHeapView) Init() {
n := len(*hp)
for i := n/2 - 1; i >= 0; i-- {
hp.down(i, n)
}
}
func (hp *searchHeapView) popOnce() entryId {
top := (*hp)[0]
e := top[0]
(*hp)[0] = top[1:]
n := len(*hp) - 1
hp.Swap(0, n)
if len(top) > 1 {
hp.down(0, n)
hp.up(n)
} else {
hp.down(0, n)
*hp = (*hp)[:n]
}
return e
}
func (hp *searchHeapView) PopItem() (e entryId, n int) {
e = hp.popOnce()
n = 1
for !hp.IsEmpty() && (*hp)[0][0] == e {
hp.popOnce()
n++
}
return e, n
}
func (hp *searchHeapView) IsEmpty() bool {
return len(*hp) == 0
}
func (hp *searchHeapView) Push(x interface{}) {
*hp = append(*hp, x.(EntrySet))
}
func (hp *searchHeapView) Pop() interface{} {
h := *hp
end := len(h) - 1
x := h[end]
*hp = h[:end]
return x
}
func (h searchHeapView) Less(i, j int) bool {
return h[i][0] < h[j][0]
}
func (h searchHeapView) Swap(i, j int) {
h[i], h[j] = h[j], h[i]
}
func (h searchHeapView) Len() int {
return len(h)
}
func (h searchHeapView) up(j int) {
for {
i := (j - 1) / 2 // parent
if i == j || !(h[j][0] < h[i][0]) {
break
}
h.Swap(i, j)
j = i
}
}
func (h searchHeapView) down(i, n int) {
for {
j1 := 2*i + 1
if j1 >= n || j1 < 0 { // j1 < 0 after int overflow
break
}
j := j1 // left child
if j2 := j1 + 1; j2 < n && !(h[j1][0] < h[j2][0]) {
j = j2 // = 2*i + 2 // right child
}
if !(h[j][0] < h[i][0]) {
break
}
h.Swap(i, j)
i = j
}
}
func (lhs *WeightedEntry) Less(rhs *WeightedEntry) bool {
return lhs.score < rhs.score
}
func (r Result) Len() int {
return len(r)
}
func (r Result) Swap(i, j int) {
r[i], r[j] = r[j], r[i]
}
func (r Result) Less(i, j int) bool {
return r[i].Less(r[j])
}
func (r *Result) Push(x interface{}) {
*r = append(*r, x.(*WeightedEntry))
}
func (r *Result) Pop() interface{} {
old := *r
end := len(old) - 1
x := old[end]
*r = old[0:end]
return x
}
func (s EntrySet) Len() int {
return len(s)
}
func (s EntrySet) Less(i, j int) bool {
return s[i] < s[j]
}
func (s EntrySet) Swap(i, j int) {
s[i], s[j] = s[j], s[i]
}
func (s *EntrySet) Push(x interface{}) {
*s = append(*s, x.(entryId))
}
func (s *EntrySet) Pop() interface{} {
old := *s
end := len(old) - 1
e := old[end]
*s = old[0:end]
return e
}
// FuzzySearch executes search of a given pattern and returns
// the result.
func (i *GramIndex) FuzzySearch(pattern string,
limit int,
scoreThreshold float64) Result {
maxGrams := len(pattern) - 1
gramSet := newGramSet(pattern)
gramItems := make([]EntrySet, 0, maxGrams)
for _, g := range gramSet {
entrySet, found := i.gramToEntries[g]
if !found {
continue
}
gramItems = append(gramItems, entrySet)
}
return singlePassSearch(i, pattern, gramSet, gramItems, limit, scoreThreshold)
}
// singlePassSearch function uses ideas from the SPS algorithm
// described in the "Efficient Top-k Algorithms for Fuzzy Search in
// String Collections" article by Rares Vernica and Chen Li.
func singlePassSearch(i *GramIndex,
pattern string,
patternGramSet gramSet,
items []EntrySet,
k int,
scoreThreshold float64) Result {
result := make(Result, 0, k)
n := patternGramSet.Len()
frequencyThreshold := 1
searchHeap := searchHeapView(items)
searchHeap.Init()
for !searchHeap.IsEmpty() {
t, p := searchHeap.PopItem()
e := i.idToEntry[t]
score := getScore(patternGramSet, pattern, e, p)
if p >= frequencyThreshold && score >= scoreThreshold {
if result.Len() < k {
result = append(result, &WeightedEntry{*e, score})
} else if result[0].score < score {
heap.Pop(&result)
heap.Push(&result, &WeightedEntry{*e, score})
q := recomputeThreshold(result[0].score, n)
if q > frequencyThreshold {
frequencyThreshold = q
}
if frequencyThreshold > n {
break
}
}
}
}
sort.Sort(sort.Reverse(result))
return result
}
func recomputeThreshold(kthScore float64, gramsInQuery int) int {
tMin := (kthScore - 1.0 * SubstringSimilarityCoeff) / float64(JaccardSimilarityCoeff)
leftPart := tMin * float64(gramsInQuery)
MinGramsInCollection := 2
rightPart := float64(gramsInQuery+MinGramsInCollection) /
(1 + 1/float64(tMin))
maxPart := leftPart
if maxPart < rightPart {
maxPart = rightPart
}
return int(maxPart)
}
func isSep(c uint8) bool {
return c == '.' || c == ':' || c == '/'
}
func getScore(p gramSet, pattern string, e *SearchEntry, weight int) float64 {
unionCnt := len(p) + len(e.grams) - weight
jaccardSim := float64(weight) / float64(unionCnt)
substrSim := 0.0
if len(p)-weight <= 1 {
i := strings.Index(e.textLower, pattern)
if i == 0 || i > 0 && isSep(e.textLower[i-1]) {
substrSim = 1.0
} else if i > 0 {
substrSim = 0.5
}
}
return jaccardSim*JaccardSimilarityCoeff + substrSim*SubstringSimilarityCoeff
}
func main() {
limit := flag.Int("limit", 50, "max number of results")
scoreThreshold := flag.Float64("threshold", 0.05, "score threshold")
flag.Parse()
n := flag.NArg()
if n != 1 {
fmt.Printf("Bad number of arguments: %d\n\n", n)
flag.Usage()
fmt.Printf(" PATTERN\n")
return
}
pattern := flag.Arg(0)
in := bufio.NewReader(os.Stdin)
idx := NewGramIndex()
for {
line, err := in.ReadString('\n')
if err == io.EOF {
break
}
if err != nil {
fmt.Printf("ERROR: %v\n", err)
}
idx.Add(newEntry(strings.TrimRight(line, "\r\n")))
}
result := idx.FuzzySearch(pattern, *limit, *scoreThreshold)
for _, e := range result {
fmt.Printf("%1.4f %s\n", e.score, e.text)
}
if len(result) == 0 {
os.Exit(1)
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment