Skip to content

Instantly share code, notes, and snippets.

@Dieterbe
Created October 19, 2018 16:22
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Dieterbe/d21dba1c25b4c60644ea7919d186a456 to your computer and use it in GitHub Desktop.
Save Dieterbe/d21dba1c25b4c60644ea7919d186a456 to your computer and use it in GitHub Desktop.
package main
import (
"bufio"
"errors"
"flag"
"fmt"
"log"
"os"
"runtime"
"runtime/pprof"
"time"
"github.com/dgryski/go-bloomindex"
"github.com/dgryski/trifles/repl"
"github.com/dustin/go-humanize"
)
func main() {
cpuprofile := flag.String("cpuprofile", "", "write cpu profile to file")
interactive := flag.Bool("interactive", true, "interactive")
benchmark := flag.Bool("benchmark", true, "benchmark")
ngram := flag.Int("ngram", 4, "n-gram size")
flag.Parse()
var docs []string
var idx *bloomindex.ShardedIndex
// var idx *bloomindex.Index
var ids []bloomindex.DocID
cIndex := func(args []string) error {
if len(args) < 1 {
return errors.New("missing argument")
}
fname := args[0]
f, err := os.Open(fname)
if err != nil {
return err
}
scanner := bufio.NewScanner(f)
if len(docs) != 0 {
docs = docs[:0]
}
var mem runtime.MemStats
runtime.ReadMemStats(&mem)
log.Println(humanize.Bytes(mem.Alloc))
idx = bloomindex.NewShardedIndex(0.01, 4)
// idx = bloomindex.NewIndex(512, 65536, 4)
t0 := time.Now()
var tokens []T
var skipped int
for scanner.Scan() {
d := scanner.Text()
var toks []uint32
for _, t := range Extract(d, *ngram, tokens[:0]) {
toks = append(toks, jenkinsShift(uint32(t)))
}
if len(toks) > 0 {
docs = append(docs, d)
idx.AddDocument(toks)
}
}
if err := scanner.Err(); err != nil {
fmt.Println("error during scan: ", err)
}
fmt.Printf("indexed %d documents in %s (skipped %d)\n", len(docs), time.Since(t0), skipped)
runtime.GC()
runtime.ReadMemStats(&mem)
log.Println(humanize.Bytes(mem.Alloc))
if *cpuprofile != "" {
f, err = os.Create(*cpuprofile)
if err != nil {
log.Fatal(err)
}
pprof.StartCPUProfile(f)
}
return nil
}
cPrint := func(args []string) error {
for _, id := range ids {
fmt.Printf("%05d: %q\n", id, docs[id])
}
return nil
}
cSearch := func(args []string) error {
if idx == nil {
return errors.New("no index loaded")
}
var tokens []T
var toks []uint32
for _, arg := range args {
for _, tok := range Extract(arg, *ngram, tokens[:0]) {
toks = append(toks, jenkinsShift(uint32(tok)))
}
}
if !*benchmark {
t0 := time.Now()
ids = idx.Query(toks)
fmt.Printf("found %d documents in %v\n", len(ids), time.Since(t0))
} else {
var total time.Duration
for i := 0; i < 30; i++ {
t0 := time.Now()
for j := 0; j < 100; j++ {
ids = idx.Query(toks)
}
total += time.Since(t0)
fmt.Println(time.Since(t0))
}
total /= 3000
fmt.Println("QPS: ", int(time.Second/total))
fmt.Printf("found %d documents\n", len(ids))
}
return nil
}
if !*interactive {
cIndex([]string{"/usr/share/dict/words"})
cSearch([]string{"cherry"})
// cIndex([]string{"/home/dgryski/work/data/emails-1m.txt"})
// cSearch([]string{"cherry", "stick"})
} else {
repl.Run("bindex> ",
map[string]repl.Cmd{
"index": cIndex,
"print": cPrint,
"search": cSearch,
},
)
}
if *cpuprofile != "" {
pprof.StopCPUProfile()
}
}
type T uint64
// Extract returns a list of all the unique trigrams in s
func Extract(s string, ngram int, trigrams []T) []T {
for i := 0; i <= len(s)-ngram; i++ {
var t T
for j := 0; j < ngram; j++ {
t = (t << 8) | T(s[i+j])
}
trigrams = appendIfUnique(trigrams, t)
}
return trigrams
}
func appendIfUnique(t []T, n T) []T {
for _, v := range t {
if v == n {
return t
}
}
return append(t, n)
}
func jenkinsShift(a uint32) uint32 {
a -= (a << 6)
a ^= (a >> 17)
a -= (a << 9)
a ^= (a << 4)
a -= (a << 3)
a ^= (a << 10)
a ^= (a >> 15)
return a
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment