Created
October 19, 2018 16:24
-
-
Save Dieterbe/301eb88113e7adcabcfe852961444fb2 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"bufio" | |
"errors" | |
"flag" | |
"fmt" | |
"log" | |
"os" | |
"runtime" | |
"runtime/pprof" | |
"time" | |
"github.com/dgryski/go-postings" | |
"github.com/dgryski/trifles/repl" | |
"github.com/dustin/go-humanize" | |
) | |
func main() { | |
cpuprofile := flag.String("cpuprofile", "", "write cpu profile to file") | |
interactive := flag.Bool("interactive", true, "interactive") | |
benchmark := flag.Bool("benchmark", true, "benchmark") | |
ngram := flag.Int("ngram", 3, "n-gram size") | |
flag.Parse() | |
var docs []string | |
var pidx *postings.Index | |
var idx postings.InvertedIndex | |
var ids []postings.DocID | |
cIndex := func(args []string) error { | |
if len(args) < 1 { | |
return errors.New("missing argument") | |
} | |
fname := args[0] | |
f, err := os.Open(fname) | |
if err != nil { | |
return err | |
} | |
scanner := bufio.NewScanner(f) | |
if len(docs) != 0 { | |
docs = docs[:0] | |
} | |
var mem runtime.MemStats | |
runtime.ReadMemStats(&mem) | |
log.Println(humanize.Bytes(mem.Alloc)) | |
pidx = postings.NewIndex(nil) | |
t0 := time.Now() | |
var tokens []postings.TermID | |
var skipped int | |
for scanner.Scan() { | |
d := scanner.Text() | |
toks := Extract(d, *ngram, tokens[:0]) | |
if len(toks) > 0 { | |
docs = append(docs, d) | |
pidx.AddDocument(toks) | |
} | |
} | |
if err := scanner.Err(); err != nil { | |
fmt.Println("error during scan: ", err) | |
} | |
fmt.Printf("indexed %d documents in %s (skipped %d)\n", len(docs), time.Since(t0), skipped) | |
runtime.GC() | |
runtime.ReadMemStats(&mem) | |
log.Println(humanize.Bytes(mem.Alloc)) | |
idx = pidx | |
return nil | |
} | |
cPrint := func(args []string) error { | |
for _, id := range ids { | |
fmt.Printf("%05d: %q\n", id, docs[id]) | |
} | |
return nil | |
} | |
cCompress := func(args []string) error { | |
var mem runtime.MemStats | |
runtime.ReadMemStats(&mem) | |
log.Println(humanize.Bytes(mem.Alloc)) | |
t0 := time.Now() | |
idx = postings.NewCompressedIndex(pidx) | |
pidx = nil | |
fmt.Println("compression took", time.Since(t0)) | |
runtime.GC() | |
runtime.ReadMemStats(&mem) | |
log.Println(humanize.Bytes(mem.Alloc)) | |
if *cpuprofile != "" { | |
f, err := os.Create(*cpuprofile) | |
if err != nil { | |
log.Fatal(err) | |
} | |
pprof.StartCPUProfile(f) | |
} | |
return nil | |
} | |
cMem := func(args []string) error { | |
runtime.GC() | |
var mem runtime.MemStats | |
runtime.ReadMemStats(&mem) | |
log.Println(humanize.Bytes(mem.Alloc)) | |
return nil | |
} | |
cSearch := func(args []string) error { | |
if idx == nil { | |
return errors.New("no index loaded") | |
} | |
var tokens []postings.TermID | |
var toks []postings.TermID | |
for _, arg := range args { | |
for _, tok := range Extract(arg, *ngram, tokens[:0]) { | |
toks = append(toks, tok) | |
} | |
} | |
log.Printf("toks = %+q\n", toks) | |
if !*benchmark { | |
t0 := time.Now() | |
ids = postings.Query(idx, toks) | |
fmt.Printf("found %d documents in %v\n", len(ids), time.Since(t0)) | |
} else { | |
var total time.Duration | |
const iterations = 100 | |
const samples = 30 | |
for i := 0; i < samples; i++ { | |
t0 := time.Now() | |
for j := 0; j < iterations; j++ { | |
ids = postings.Query(idx, toks) | |
} | |
total += time.Since(t0) | |
fmt.Println(time.Since(t0)) | |
} | |
total /= (iterations * samples) | |
fmt.Println("QPS: ", int(time.Second/total)) | |
fmt.Printf("found %d documents\n", len(ids)) | |
} | |
return nil | |
} | |
if !*interactive { | |
cIndex([]string{"/home/dgryski/m.out"}) | |
cSearch([]string{"geoipsrv-01", "lhr4", "booking_com"}) | |
cCompress(nil) | |
cSearch([]string{"geoipsrv-01", "lhr4", "booking_com"}) | |
cPrint(nil) | |
} else { | |
repl.Run("pindex> ", | |
map[string]repl.Cmd{ | |
"index": cIndex, | |
"print": cPrint, | |
"search": cSearch, | |
"compress": cCompress, | |
"mem": cMem, | |
}, | |
) | |
} | |
if *cpuprofile != "" { | |
defer pprof.StopCPUProfile() | |
} | |
} | |
func Extract(s string, ngram int, terms []postings.TermID) []postings.TermID { | |
for i := 0; i <= len(s)-ngram; i++ { | |
var t postings.TermID | |
for j := 0; j < ngram; j++ { | |
t = (t << 8) | postings.TermID(s[i+j]) | |
} | |
terms = append(terms, t) | |
} | |
return terms | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment