Skip to content

Instantly share code, notes, and snippets.

@Dieterbe
Created October 19, 2018 16:24
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Dieterbe/301eb88113e7adcabcfe852961444fb2 to your computer and use it in GitHub Desktop.
Save Dieterbe/301eb88113e7adcabcfe852961444fb2 to your computer and use it in GitHub Desktop.
package main
import (
"bufio"
"errors"
"flag"
"fmt"
"log"
"os"
"runtime"
"runtime/pprof"
"time"
"github.com/dgryski/go-postings"
"github.com/dgryski/trifles/repl"
"github.com/dustin/go-humanize"
)
// main builds an n-gram postings index over the lines of a file and lets the
// user query it, either through an interactive REPL or a fixed scripted run.
//
// Flags:
//   - cpuprofile: file to write a CPU profile to; profiling starts after the
//     first successful "compress" command and stops when main returns.
//   - interactive: run the REPL (default) instead of the scripted sequence.
//   - benchmark: make "search" time repeated queries and report QPS.
//   - ngram: n-gram size in bytes, passed to Extract.
func main() {
	cpuprofile := flag.String("cpuprofile", "", "write cpu profile to file")
	interactive := flag.Bool("interactive", true, "interactive")
	benchmark := flag.Bool("benchmark", true, "benchmark")
	ngram := flag.Int("ngram", 3, "n-gram size")
	flag.Parse()

	var docs []string                // indexed lines, addressed by document ID
	var pidx *postings.Index         // uncompressed index; nil once compressed
	var idx postings.InvertedIndex   // index used by search
	var ids []postings.DocID         // result of the most recent search
	var profile *os.File             // non-nil while CPU profiling is running

	// logAlloc logs the currently allocated heap size in human-readable form.
	logAlloc := func() {
		var mem runtime.MemStats
		runtime.ReadMemStats(&mem)
		log.Println(humanize.Bytes(mem.Alloc))
	}

	// cIndex (re)builds the index from the file named by args[0], one
	// document per line. Lines that produce no terms are skipped.
	cIndex := func(args []string) error {
		if len(args) < 1 {
			return errors.New("missing argument")
		}
		f, err := os.Open(args[0])
		if err != nil {
			return err
		}
		defer f.Close()

		docs = docs[:0]
		// Results of an earlier search refer to the previous document set;
		// drop them so "print" cannot index past the new docs slice.
		ids = nil

		logAlloc()
		pidx = postings.NewIndex(nil)
		t0 := time.Now()

		var skipped int
		scanner := bufio.NewScanner(f)
		for scanner.Scan() {
			d := scanner.Text()
			toks := Extract(d, *ngram, nil)
			if len(toks) == 0 {
				skipped++
				continue
			}
			docs = append(docs, d)
			pidx.AddDocument(toks)
		}
		// A scan error is reported but whatever was read so far stays indexed.
		if err := scanner.Err(); err != nil {
			fmt.Println("error during scan: ", err)
		}
		fmt.Printf("indexed %d documents in %s (skipped %d)\n", len(docs), time.Since(t0), skipped)

		runtime.GC()
		logAlloc()
		idx = pidx
		return nil
	}

	// cPrint prints the documents matched by the most recent search.
	cPrint := func(args []string) error {
		for _, id := range ids {
			fmt.Printf("%05d: %q\n", id, docs[id])
		}
		return nil
	}

	// cCompress replaces the active index with a compressed copy, releases
	// the uncompressed one, and starts CPU profiling if it was requested.
	cCompress := func(args []string) error {
		if pidx == nil {
			// Nothing indexed yet, or the index has already been compressed.
			return errors.New("no uncompressed index loaded")
		}
		logAlloc()
		t0 := time.Now()
		idx = postings.NewCompressedIndex(pidx)
		pidx = nil
		fmt.Println("compression took", time.Since(t0))
		runtime.GC()
		logAlloc()

		// Start profiling only once; a second StartCPUProfile would fail.
		if *cpuprofile != "" && profile == nil {
			f, err := os.Create(*cpuprofile)
			if err != nil {
				log.Fatal(err)
			}
			if err := pprof.StartCPUProfile(f); err != nil {
				log.Fatal(err)
			}
			profile = f
		}
		return nil
	}

	// cMem forces a garbage collection and logs the allocated heap size.
	cMem := func(args []string) error {
		runtime.GC()
		logAlloc()
		return nil
	}

	// cSearch queries the active index for documents matching the terms of
	// all arguments. In benchmark mode the query is repeated and timed.
	cSearch := func(args []string) error {
		if idx == nil {
			return errors.New("no index loaded")
		}
		var toks []postings.TermID
		for _, arg := range args {
			toks = Extract(arg, *ngram, toks)
		}
		log.Printf("toks = %+q\n", toks)

		if !*benchmark {
			t0 := time.Now()
			ids = postings.Query(idx, toks)
			fmt.Printf("found %d documents in %v\n", len(ids), time.Since(t0))
			return nil
		}

		const iterations = 100
		const samples = 30
		var total time.Duration
		for i := 0; i < samples; i++ {
			t0 := time.Now()
			for j := 0; j < iterations; j++ {
				ids = postings.Query(idx, toks)
			}
			total += time.Since(t0)
			fmt.Println(time.Since(t0))
		}
		total /= (iterations * samples)
		if total > 0 {
			fmt.Println("QPS: ", int(time.Second/total))
		} else {
			// The average rounded down to zero; avoid dividing by it.
			fmt.Println("QPS: ", "too fast to measure")
		}
		fmt.Printf("found %d documents\n", len(ids))
		return nil
	}

	// scripted runs the fixed non-interactive sequence and stops at the
	// first command that fails.
	scripted := func() error {
		query := []string{"geoipsrv-01", "lhr4", "booking_com"}
		steps := []struct {
			cmd  func([]string) error
			args []string
		}{
			{cIndex, []string{"/home/dgryski/m.out"}},
			{cSearch, query},
			{cCompress, nil},
			{cSearch, query},
			{cPrint, nil},
		}
		for _, s := range steps {
			if err := s.cmd(s.args); err != nil {
				return err
			}
		}
		return nil
	}

	var runErr error
	if *interactive {
		repl.Run("pindex> ",
			map[string]repl.Cmd{
				"index":    cIndex,
				"print":    cPrint,
				"search":   cSearch,
				"compress": cCompress,
				"mem":      cMem,
			},
		)
	} else {
		runErr = scripted()
	}

	// Flush the profile before any fatal exit so the data is not lost.
	if profile != nil {
		pprof.StopCPUProfile()
		if err := profile.Close(); err != nil {
			log.Println(err)
		}
	}
	if runErr != nil {
		log.Fatal(runErr)
	}
}
// Extract appends one term for every window of ngram consecutive bytes of s
// to terms and returns the extended slice. Each term is built by shifting the
// accumulated value left by 8 bits and OR-ing in the next byte, so windows are
// byte-based, not rune-based.
//
// A string shorter than ngram yields no terms. A non-positive ngram also
// yields no terms; without that guard the loop would append a run of
// meaningless zero-valued terms.
//
// NOTE(review): bytes shifted past the width of postings.TermID are lost, so
// an ngram larger than the type's size in bytes presumably makes distinct
// windows collide — confirm against the TermID definition.
func Extract(s string, ngram int, terms []postings.TermID) []postings.TermID {
	if ngram <= 0 {
		return terms
	}
	for i := 0; i <= len(s)-ngram; i++ {
		var t postings.TermID
		for _, b := range []byte(s[i : i+ngram]) {
			t = (t << 8) | postings.TermID(b)
		}
		terms = append(terms, t)
	}
	return terms
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment