Skip to content

Instantly share code, notes, and snippets.

@dgryski
Last active August 29, 2015 14:10
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dgryski/d8f1c8cb9c8df5a438a3 to your computer and use it in GitHub Desktop.
Save dgryski/d8f1c8cb9c8df5a438a3 to your computer and use it in GitHub Desktop.
package main
import (
"bufio"
"fmt"
"io"
"os"
"sort"
"strconv"
"strings"
"time"
"github.com/dgryski/go-trigram"
"github.com/peterh/liner"
)
// main runs an interactive REPL for experimenting with a trigram index.
//
// State kept across commands:
//   - docs: the lines of the most recently indexed file, one document per line
//   - idx:  the trigram index built from docs (nil until `index` has been run)
//   - ids:  the current result set produced by brute/search/filter
//
// Type `help` at the prompt for the list of commands.
func main() {
	var docs []string
	var ids []trigram.DocID
	var idx trigram.Index

	line := liner.NewLiner()
	defer line.Close()

REPL:
	for {
		command, err := line.Prompt("trigram> ")
		if err == io.EOF {
			break
		}
		if err != nil {
			fmt.Println("error reading line: ", err)
			continue
		}

		fields := strings.Fields(command)
		if len(fields) == 0 {
			continue
		}
		line.AppendHistory(command)

		// Nothing follows the switch inside the loop, so `continue` in a
		// case simply abandons the command and shows the next prompt.
		switch fields[0] {
		case "brute", "b":
			if idx == nil {
				fmt.Println("no index loaded")
				continue
			}
			if len(fields) == 1 {
				fmt.Println("missing argument")
				continue
			}
			patterns := fields[1:]

			// Reuse the backing array of the previous result set.
			ids = ids[:0]
			t0 := time.Now()
			for i, s := range docs {
				if containsAll(s, patterns) {
					ids = append(ids, trigram.DocID(i))
				}
			}
			fmt.Println("found", len(ids), "documents in", time.Since(t0))

		case "f", "filter":
			if idx == nil {
				fmt.Println("no index loaded")
				continue
			}
			ts := extractTrigrams(fields[1:])
			t0 := time.Now()
			ids = idx.Filter(ids, ts)
			fmt.Println("filtered", len(ids), "documents in", time.Since(t0))

		case "h", "help":
			fmt.Println("b[rute] pattern -- brute force search for `pattern`")
			fmt.Println("f[ilter] pat1 pat2... -- filter current matches with additional trigrams")
			fmt.Println("h[elp] -- this help")
			fmt.Println("index file.txt -- load a file into the index")
			fmt.Println("p[rint] -- print current matches")
			fmt.Println("prune pct -- prune the index using the threshold pct (a whole-number percentage)")
			fmt.Println("q[uit] -- quit")
			fmt.Println("s[earch] pat1 pat2... -- trigram search for docs containing the specified patterns")
			fmt.Println("t[rigram] pat1 pat2... -- show trigram frequencies for the given patterns")
			fmt.Println("top -- show the (up to) 100 highest trigram frequencies")

		case "index":
			if len(fields) != 2 {
				fmt.Println("missing argument")
				continue
			}
			fname := fields[1]
			f, err := os.Open(fname)
			if err != nil {
				fmt.Println(err)
				continue
			}

			// Start from a clean slate. The old result set refers to the
			// old documents, so it must be dropped along with them;
			// otherwise a later `print` could index past the end of docs.
			docs = docs[:0]
			ids = ids[:0]
			idx = trigram.NewIndex(nil)

			t0 := time.Now()
			scanner := bufio.NewScanner(f)
			for scanner.Scan() {
				d := scanner.Text()
				docs = append(docs, d)
				// add the trigrams
				idx.Add(d)
			}
			scanErr := scanner.Err()
			// Close explicitly rather than with defer: a defer here would
			// not run until main returns, leaking one descriptor per
			// `index` command. The file was opened read-only, so the Close
			// error carries no useful information and is ignored.
			f.Close()

			if scanErr != nil {
				// Whatever was read before the failure stays indexed.
				fmt.Println("error reading file:", scanErr)
			}
			fmt.Printf("indexed %d documents in %s\n", len(docs), time.Since(t0))

		case "p", "print":
			for _, id := range ids {
				fmt.Printf("%05d: %q\n", id, docs[id])
			}

		case "prune":
			if idx == nil {
				fmt.Println("no index loaded")
				continue
			}
			if len(fields) != 2 {
				fmt.Println("missing argument")
				continue
			}
			pct, err := strconv.Atoi(fields[1])
			if err != nil {
				fmt.Println(err)
				continue
			}
			pruned := idx.Prune(float64(pct) / 100)
			fmt.Println("pruned", pruned, "at", pct)

		case "q", "quit":
			break REPL

		case "s", "search":
			if idx == nil {
				fmt.Println("no index loaded")
				continue
			}
			ts := extractTrigrams(fields[1:])
			t0 := time.Now()
			ids = idx.QueryTrigrams(ts)
			fmt.Println("found", len(ids), "documents in", time.Since(t0))

		case "top":
			if idx == nil {
				fmt.Println("no index loaded")
				continue
			}
			freq := make([]int, 0, len(idx))
			for _, v := range idx {
				freq = append(freq, len(v))
			}
			sort.Sort(sort.Reverse(sort.IntSlice(freq)))

			// Show at most 100 entries; a small index may have fewer.
			n := len(freq)
			if n > 100 {
				n = 100
			}
			for _, c := range freq[:n] {
				fmt.Println(c)
			}

		case "t", "tri", "trigram", "trigrams":
			if idx == nil {
				fmt.Println("no index loaded")
				continue
			}
			for _, t := range extractTrigrams(fields[1:]) {
				fmt.Printf("%q: %d\n", t, len(idx[t]))
			}

		default:
			fmt.Println("unknown command, try `help`")
		}
	}

	fmt.Println("bye")
}

// containsAll reports whether s contains every pattern in patterns as a
// substring. It returns true when patterns is empty.
func containsAll(s string, patterns []string) bool {
	for _, pat := range patterns {
		if !strings.Contains(s, pat) {
			return false
		}
	}
	return true
}

// extractTrigrams accumulates the trigrams of every pattern into a single
// slice by calling trigram.Extract on each pattern in turn.
func extractTrigrams(patterns []string) []trigram.T {
	var ts []trigram.T
	for _, p := range patterns {
		ts = trigram.Extract(p, ts)
	}
	return ts
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment