roman-kashitsyn/bigramsearch.go

## bigramsearch.go
package main

import (
	"bufio"
	"container/heap"
	"flag"
	"fmt"
	"io"
	"os"
	"sort"
	"strings"
	"unicode"
)

const (
	MinEntryLen              = 1
	JaccardSimilarityCoeff   = 0.5
	SubstringSimilarityCoeff = 0.5
	eof                      = gram(-1)
)

type gram int64
type gramSet []gram
type entryId int32
type EntrySet []entryId

type SearchEntry struct {
	text      string
	textLower string
	grams     gramSet
}

type WeightedEntry struct {
	SearchEntry
	score float64
}

type Result []*WeightedEntry

type GramIndex struct {
	gramToEntries map[gram]EntrySet
	idToEntry     []*SearchEntry
}

func forEachGram(s string, f func(gram)) {
	rs := []rune(s)
	l := len(rs)
	n := l - 1

	for i, c := range rs {
		rs[i] = unicode.ToLower(c)
	}

	// Extra "starts with" gram
	f(eof<<32 | gram(rs[0]))
	for i := 0; i < n; i += 1 {
		b := gram(rs[i])<<32 | gram(rs[i+1])
		f(b)
	}
}

func newGramSet(pattern string) gramSet {
	s := make(gramSet, 0, len(pattern)-1)
	forEachGram(pattern, func(g gram) {
		s = append(s, g)
	})
	sort.Sort(s)
	unique(&s)
	return s
}

func unique(s *gramSet) {
	n := len(*s)
	gs := *s
	if n <= 1 {
		return
	}
	b, e := 0, 1
	for {
		for e < n && gs[e] == gs[b] {
			e++
		}

		if e == n {
			break
		}

		b++

		if e-b > 0 {
			gs[b] = gs[e]
		}
	}
	*s = gs[:b+1]
}

func (gs gramSet) Len() int {
	return len(gs)
}

func (gs gramSet) Swap(i, j int) {
	gs[i], gs[j] = gs[j], gs[i]
}

func (gs gramSet) Less(i, j int) bool {
	return gs[i] < gs[j]
}

func newEntry(s string) *SearchEntry {
	return &SearchEntry{s, strings.ToLower(s), newGramSet(s)}
}

// Gram Index
func NewGramIndex() *GramIndex {
	index := &GramIndex{}
	index.gramToEntries = make(map[gram]EntrySet)
	index.idToEntry = make([]*SearchEntry, 0)
	return index
}

func (i *GramIndex) Add(e *SearchEntry) {
	if len(e.text) < MinEntryLen {
		return
	}

	i.idToEntry = append(i.idToEntry, e)
	eid := entryId(len(i.idToEntry) - 1)

	for _, g := range e.grams {
		s := i.gramToEntries[g]
		if s == nil {
			s = EntrySet{}
		}
		s = append(s, eid)
		i.gramToEntries[g] = s
	}
}

// Implementation of fast read-only search entry heap. It turns out
// that it's (3-5x times) faster to inline container/heap module
// functions here then to use the module directly.

type searchHeapView []EntrySet

func (hp *searchHeapView) Init() {
	n := len(*hp)
	for i := n/2 - 1; i >= 0; i-- {
		hp.down(i, n)
	}
}

func (hp *searchHeapView) popOnce() entryId {
	top := (*hp)[0]
	e := top[0]
	(*hp)[0] = top[1:]

	n := len(*hp) - 1
	hp.Swap(0, n)

	if len(top) > 1 {
		hp.down(0, n)
		hp.up(n)

	} else {
		hp.down(0, n)
		*hp = (*hp)[:n]
	}
	return e
}

func (hp *searchHeapView) PopItem() (e entryId, n int) {
	e = hp.popOnce()
	n = 1

	for !hp.IsEmpty() && (*hp)[0][0] == e {
		hp.popOnce()
		n++
	}

	return e, n
}

func (hp *searchHeapView) IsEmpty() bool {
	return len(*hp) == 0
}

func (hp *searchHeapView) Push(x interface{}) {
	*hp = append(*hp, x.(EntrySet))
}

func (hp *searchHeapView) Pop() interface{} {
	h := *hp
	end := len(h) - 1
	x := h[end]
	*hp = h[:end]
	return x
}

func (h searchHeapView) Less(i, j int) bool {
	return h[i][0] < h[j][0]
}

func (h searchHeapView) Swap(i, j int) {
	h[i], h[j] = h[j], h[i]
}

func (h searchHeapView) Len() int {
	return len(h)
}

func (h searchHeapView) up(j int) {
	for {
		i := (j - 1) / 2 // parent
		if i == j || !(h[j][0] < h[i][0]) {
			break
		}
		h.Swap(i, j)
		j = i
	}
}

func (h searchHeapView) down(i, n int) {
	for {
		j1 := 2*i + 1
		if j1 >= n || j1 < 0 { // j1 < 0 after int overflow
			break
		}
		j := j1 // left child
		if j2 := j1 + 1; j2 < n && !(h[j1][0] < h[j2][0]) {
			j = j2 // = 2*i + 2  // right child
		}
		if !(h[j][0] < h[i][0]) {
			break
		}
		h.Swap(i, j)
		i = j
	}
}

func (lhs *WeightedEntry) Less(rhs *WeightedEntry) bool {
	return lhs.score < rhs.score
}

func (r Result) Len() int {
	return len(r)
}

func (r Result) Swap(i, j int) {
	r[i], r[j] = r[j], r[i]
}

func (r Result) Less(i, j int) bool {
	return r[i].Less(r[j])
}

func (r *Result) Push(x interface{}) {
	*r = append(*r, x.(*WeightedEntry))
}

func (r *Result) Pop() interface{} {
	old := *r
	end := len(old) - 1
	x := old[end]
	*r = old[0:end]
	return x

}

func (s EntrySet) Len() int {
	return len(s)
}

func (s EntrySet) Less(i, j int) bool {
	return s[i] < s[j]
}

func (s EntrySet) Swap(i, j int) {
	s[i], s[j] = s[j], s[i]
}

func (s *EntrySet) Push(x interface{}) {
	*s = append(*s, x.(entryId))
}

func (s *EntrySet) Pop() interface{} {
	old := *s
	end := len(old) - 1
	e := old[end]
	*s = old[0:end]
	return e
}

// FuzzySearch executes search of a given pattern and returns
// the result.
func (i *GramIndex) FuzzySearch(pattern string,
	limit int,
	scoreThreshold float64) Result {

	maxGrams := len(pattern) - 1

	gramSet := newGramSet(pattern)
	gramItems := make([]EntrySet, 0, maxGrams)

	for _, g := range gramSet {
		entrySet, found := i.gramToEntries[g]
		if !found {
			continue
		}
		gramItems = append(gramItems, entrySet)
	}

	return singlePassSearch(i, pattern, gramSet, gramItems, limit, scoreThreshold)
}

// singlePassSearch function uses ideas from the SPS algorithm
// described in the "Efficient Top-k Algorithms for Fuzzy Search in
// String Collections" article by Rares Vernica and Chen Li.
func singlePassSearch(i *GramIndex,
	pattern string,
	patternGramSet gramSet,
	items []EntrySet,
	k int,
	scoreThreshold float64) Result {

	result := make(Result, 0, k)
	n := patternGramSet.Len()
	frequencyThreshold := 1

	searchHeap := searchHeapView(items)
	searchHeap.Init()

	for !searchHeap.IsEmpty() {
		t, p := searchHeap.PopItem()
		e := i.idToEntry[t]
		score := getScore(patternGramSet, pattern, e, p)

		if p >= frequencyThreshold && score >= scoreThreshold {
			if result.Len() < k {
				result = append(result, &WeightedEntry{*e, score})
			} else if result[0].score < score {
				heap.Pop(&result)
				heap.Push(&result, &WeightedEntry{*e, score})

				q := recomputeThreshold(result[0].score, n)

				if q > frequencyThreshold {
					frequencyThreshold = q
				}

				if frequencyThreshold > n {
					break
				}
			}
		}
	}
	sort.Sort(sort.Reverse(result))
	return result
}

func recomputeThreshold(kthScore float64, gramsInQuery int) int {
	tMin := (kthScore - 1.0 * SubstringSimilarityCoeff) / float64(JaccardSimilarityCoeff)
	leftPart := tMin * float64(gramsInQuery)
	MinGramsInCollection := 2
	rightPart := float64(gramsInQuery+MinGramsInCollection) /
		(1 + 1/float64(tMin))

	maxPart := leftPart
	if maxPart < rightPart {
		maxPart = rightPart
	}

	return int(maxPart)
}

func isSep(c uint8) bool {
	return c == '.' || c == ':' || c == '/'
}

func getScore(p gramSet, pattern string, e *SearchEntry, weight int) float64 {
	unionCnt := len(p) + len(e.grams) - weight
	jaccardSim := float64(weight) / float64(unionCnt)
	substrSim := 0.0

	if len(p)-weight <= 1 {
		i := strings.Index(e.textLower, pattern)
		if i == 0 || i > 0 && isSep(e.textLower[i-1]) {
			substrSim = 1.0
		} else if i > 0 {
			substrSim = 0.5
		}
	}
	return jaccardSim*JaccardSimilarityCoeff + substrSim*SubstringSimilarityCoeff
}

func main() {
	limit := flag.Int("limit", 50, "max number of results")
	scoreThreshold := flag.Float64("threshold", 0.05, "score threshold")
	flag.Parse()

	n := flag.NArg()

	if n != 1 {
		fmt.Printf("Bad number of arguments: %d\n\n", n)
		flag.Usage()
		fmt.Printf("  PATTERN\n")
		return
	}

	pattern := flag.Arg(0)

	in := bufio.NewReader(os.Stdin)
	idx := NewGramIndex()

	for {
		line, err := in.ReadString('\n')
		if err == io.EOF {
			break
		}
		if err != nil {
			fmt.Printf("ERROR: %v\n", err)
		}

		idx.Add(newEntry(strings.TrimRight(line, "\r\n")))
	}

	result := idx.FuzzySearch(pattern, *limit, *scoreThreshold)

	for _, e := range result {
		fmt.Printf("%1.4f %s\n", e.score, e.text)
	}

	if len(result) == 0 {
		os.Exit(1)
	}
}
	package main

	import (
	"bufio"
	"container/heap"
	"flag"
	"fmt"
	"io"
	"os"
	"sort"
	"strings"
	"unicode"
	)

	const (
	MinEntryLen = 1
	JaccardSimilarityCoeff = 0.5
	SubstringSimilarityCoeff = 0.5
	eof = gram(-1)
	)

	type gram int64
	type gramSet []gram
	type entryId int32
	type EntrySet []entryId

	type SearchEntry struct {
	text string
	textLower string
	grams gramSet
	}

	type WeightedEntry struct {
	SearchEntry
	score float64
	}

	type Result []*WeightedEntry

	type GramIndex struct {
	gramToEntries map[gram]EntrySet
	idToEntry []*SearchEntry
	}

	func forEachGram(s string, f func(gram)) {
	rs := []rune(s)
	l := len(rs)
	n := l - 1

	for i, c := range rs {
	rs[i] = unicode.ToLower(c)
	}

	// Extra "starts with" gram
	f(eof<<32 \| gram(rs[0]))
	for i := 0; i < n; i += 1 {
	b := gram(rs[i])<<32 \| gram(rs[i+1])
	f(b)
	}
	}

	func newGramSet(pattern string) gramSet {
	s := make(gramSet, 0, len(pattern)-1)
	forEachGram(pattern, func(g gram) {
	s = append(s, g)
	})
	sort.Sort(s)
	unique(&s)
	return s
	}

	func unique(s *gramSet) {
	n := len(*s)
	gs := *s
	if n <= 1 {
	return
	}
	b, e := 0, 1
	for {
	for e < n && gs[e] == gs[b] {
	e++
	}

	if e == n {
	break
	}

	b++

	if e-b > 0 {
	gs[b] = gs[e]
	}
	}
	*s = gs[:b+1]
	}

	func (gs gramSet) Len() int {
	return len(gs)
	}

	func (gs gramSet) Swap(i, j int) {
	gs[i], gs[j] = gs[j], gs[i]
	}

	func (gs gramSet) Less(i, j int) bool {
	return gs[i] < gs[j]
	}

	func newEntry(s string) *SearchEntry {
	return &SearchEntry{s, strings.ToLower(s), newGramSet(s)}
	}

	// Gram Index
	func NewGramIndex() *GramIndex {
	index := &GramIndex{}
	index.gramToEntries = make(map[gram]EntrySet)
	index.idToEntry = make([]*SearchEntry, 0)
	return index
	}

	func (i GramIndex) Add(e SearchEntry) {
	if len(e.text) < MinEntryLen {
	return
	}

	i.idToEntry = append(i.idToEntry, e)
	eid := entryId(len(i.idToEntry) - 1)

	for _, g := range e.grams {
	s := i.gramToEntries[g]
	if s == nil {
	s = EntrySet{}
	}
	s = append(s, eid)
	i.gramToEntries[g] = s
	}
	}

	// Implementation of fast read-only search entry heap. It turns out
	// that it's (3-5x times) faster to inline container/heap module
	// functions here then to use the module directly.

	type searchHeapView []EntrySet

	func (hp *searchHeapView) Init() {
	n := len(*hp)
	for i := n/2 - 1; i >= 0; i-- {
	hp.down(i, n)
	}
	}

	func (hp *searchHeapView) popOnce() entryId {
	top := (*hp)[0]
	e := top[0]
	(*hp)[0] = top[1:]

	n := len(*hp) - 1
	hp.Swap(0, n)

	if len(top) > 1 {
	hp.down(0, n)
	hp.up(n)

	} else {
	hp.down(0, n)
	hp = (hp)[:n]
	}
	return e
	}

	func (hp *searchHeapView) PopItem() (e entryId, n int) {
	e = hp.popOnce()
	n = 1

	for !hp.IsEmpty() && (*hp)[0][0] == e {
	hp.popOnce()
	n++
	}

	return e, n
	}

	func (hp *searchHeapView) IsEmpty() bool {
	return len(*hp) == 0
	}

	func (hp *searchHeapView) Push(x interface{}) {
	hp = append(hp, x.(EntrySet))
	}

	func (hp *searchHeapView) Pop() interface{} {
	h := *hp
	end := len(h) - 1
	x := h[end]
	*hp = h[:end]
	return x
	}

	func (h searchHeapView) Less(i, j int) bool {
	return h[i][0] < h[j][0]
	}

	func (h searchHeapView) Swap(i, j int) {
	h[i], h[j] = h[j], h[i]
	}

	func (h searchHeapView) Len() int {
	return len(h)
	}

	func (h searchHeapView) up(j int) {
	for {
	i := (j - 1) / 2 // parent
	if i == j \|\| !(h[j][0] < h[i][0]) {
	break
	}
	h.Swap(i, j)
	j = i
	}
	}

	func (h searchHeapView) down(i, n int) {
	for {
	j1 := 2*i + 1
	if j1 >= n \|\| j1 < 0 { // j1 < 0 after int overflow
	break
	}
	j := j1 // left child
	if j2 := j1 + 1; j2 < n && !(h[j1][0] < h[j2][0]) {
	j = j2 // = 2*i + 2 // right child
	}
	if !(h[j][0] < h[i][0]) {
	break
	}
	h.Swap(i, j)
	i = j
	}
	}

	func (lhs WeightedEntry) Less(rhs WeightedEntry) bool {
	return lhs.score < rhs.score
	}

	func (r Result) Len() int {
	return len(r)
	}

	func (r Result) Swap(i, j int) {
	r[i], r[j] = r[j], r[i]
	}

	func (r Result) Less(i, j int) bool {
	return r[i].Less(r[j])
	}

	func (r *Result) Push(x interface{}) {
	r = append(r, x.(*WeightedEntry))
	}

	func (r *Result) Pop() interface{} {
	old := *r
	end := len(old) - 1
	x := old[end]
	*r = old[0:end]
	return x

	}

	func (s EntrySet) Len() int {
	return len(s)
	}

	func (s EntrySet) Less(i, j int) bool {
	return s[i] < s[j]
	}

	func (s EntrySet) Swap(i, j int) {
	s[i], s[j] = s[j], s[i]
	}

	func (s *EntrySet) Push(x interface{}) {
	s = append(s, x.(entryId))
	}

	func (s *EntrySet) Pop() interface{} {
	old := *s
	end := len(old) - 1
	e := old[end]
	*s = old[0:end]
	return e
	}

	// FuzzySearch executes search of a given pattern and returns
	// the result.
	func (i *GramIndex) FuzzySearch(pattern string,
	limit int,
	scoreThreshold float64) Result {

	maxGrams := len(pattern) - 1

	gramSet := newGramSet(pattern)
	gramItems := make([]EntrySet, 0, maxGrams)

	for _, g := range gramSet {
	entrySet, found := i.gramToEntries[g]
	if !found {
	continue
	}
	gramItems = append(gramItems, entrySet)
	}

	return singlePassSearch(i, pattern, gramSet, gramItems, limit, scoreThreshold)
	}

	// singlePassSearch function uses ideas from the SPS algorithm
	// described in the "Efficient Top-k Algorithms for Fuzzy Search in
	// String Collections" article by Rares Vernica and Chen Li.
	func singlePassSearch(i *GramIndex,
	pattern string,
	patternGramSet gramSet,
	items []EntrySet,
	k int,
	scoreThreshold float64) Result {

	result := make(Result, 0, k)
	n := patternGramSet.Len()
	frequencyThreshold := 1

	searchHeap := searchHeapView(items)
	searchHeap.Init()

	for !searchHeap.IsEmpty() {
	t, p := searchHeap.PopItem()
	e := i.idToEntry[t]
	score := getScore(patternGramSet, pattern, e, p)

	if p >= frequencyThreshold && score >= scoreThreshold {
	if result.Len() < k {
	result = append(result, &WeightedEntry{*e, score})
	} else if result[0].score < score {
	heap.Pop(&result)
	heap.Push(&result, &WeightedEntry{*e, score})

	q := recomputeThreshold(result[0].score, n)

	if q > frequencyThreshold {
	frequencyThreshold = q
	}

	if frequencyThreshold > n {
	break
	}
	}
	}
	}
	sort.Sort(sort.Reverse(result))
	return result
	}

	func recomputeThreshold(kthScore float64, gramsInQuery int) int {
	tMin := (kthScore - 1.0 * SubstringSimilarityCoeff) / float64(JaccardSimilarityCoeff)
	leftPart := tMin * float64(gramsInQuery)
	MinGramsInCollection := 2
	rightPart := float64(gramsInQuery+MinGramsInCollection) /
	(1 + 1/float64(tMin))

	maxPart := leftPart
	if maxPart < rightPart {
	maxPart = rightPart
	}

	return int(maxPart)
	}

	func isSep(c uint8) bool {
	return c == '.' \|\| c == ':' \|\| c == '/'
	}

	func getScore(p gramSet, pattern string, e *SearchEntry, weight int) float64 {
	unionCnt := len(p) + len(e.grams) - weight
	jaccardSim := float64(weight) / float64(unionCnt)
	substrSim := 0.0

	if len(p)-weight <= 1 {
	i := strings.Index(e.textLower, pattern)
	if i == 0 \|\| i > 0 && isSep(e.textLower[i-1]) {
	substrSim = 1.0
	} else if i > 0 {
	substrSim = 0.5
	}
	}
	return jaccardSimJaccardSimilarityCoeff + substrSimSubstringSimilarityCoeff
	}

	func main() {
	limit := flag.Int("limit", 50, "max number of results")
	scoreThreshold := flag.Float64("threshold", 0.05, "score threshold")
	flag.Parse()

	n := flag.NArg()

	if n != 1 {
	fmt.Printf("Bad number of arguments: %d\n\n", n)
	flag.Usage()
	fmt.Printf(" PATTERN\n")
	return
	}

	pattern := flag.Arg(0)

	in := bufio.NewReader(os.Stdin)
	idx := NewGramIndex()

	for {
	line, err := in.ReadString('\n')
	if err == io.EOF {
	break
	}
	if err != nil {
	fmt.Printf("ERROR: %v\n", err)
	}

	idx.Add(newEntry(strings.TrimRight(line, "\r\n")))
	}

	result := idx.FuzzySearch(pattern, limit, scoreThreshold)

	for _, e := range result {
	fmt.Printf("%1.4f %s\n", e.score, e.text)
	}

	if len(result) == 0 {
	os.Exit(1)
	}
	}