Skip to content

Instantly share code, notes, and snippets.

@magiconair
Last active January 16, 2020 14:26
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save magiconair/5952535 to your computer and use it in GitHub Desktop.
Save magiconair/5952535 to your computer and use it in GitHub Desktop.
Tool for extracting bucket/key data from riak's bitcask hint files
// Riak key extractor
//
// extracts bucket names and keys from riak's bitcask hint files
//
// Authors: The CAS Team 2013
//
package main
import (
"bytes"
"flag"
"fmt"
"io"
"log"
"os"
"path/filepath"
"time"
)
// Command-line flags. Each variable is a pointer to the flag's value, which
// holds the default given here until flag.Parse is called.
var (
// dirname is the bitcask directory to scan; it defaults to the empty string.
dirname = flag.String("dirname", "", "bitcask directory")
// printKeys selects whether every extracted bucket/key pair is printed; it defaults to true.
printKeys = flag.Bool("printKeys", true, "print the keys")
// cache selects whether a hint file is read completely into memory before it is parsed; it defaults to true.
cache = flag.Bool("cache", true, "read hint files in RAM first")
)
// readInt reads a 4-byte big-endian unsigned integer from r.
//
// buf is caller-supplied scratch space: its first four bytes are overwritten,
// so it must have room for at least four bytes. End of input before any byte
// was read is returned as io.EOF; input that ends part-way through the integer
// is returned as a separate "EOF when reading an int" error.
func readInt(r io.Reader, buf []uint8) (n uint32, err error) {
	// A single Read call may return fewer bytes than requested without
	// reporting an error; io.ReadFull keeps reading until all four arrived.
	if _, err = io.ReadFull(r, buf[:4]); err != nil {
		if err == io.ErrUnexpectedEOF {
			return 0, fmt.Errorf("EOF when reading an int")
		}
		return 0, err
	}
	// Widen every byte before shifting. Shifting a uint8 left by 8 or more
	// bits always yields 0, which would silently drop the three high bytes.
	return uint32(buf[0])<<24 | uint32(buf[1])<<16 | uint32(buf[2])<<8 | uint32(buf[3]), nil
}
// readString reads one length-prefixed string from r.
//
// It skips a single leading byte, reads a 4-byte length with readInt and then
// reads that many bytes of payload. buf is caller-supplied scratch space used
// for both the length and the payload; a string longer than len(buf) is
// rejected with an error rather than truncated. End of input before the
// payload starts is returned as io.EOF; input that ends inside the payload is
// returned as an "EOF while reading string" error.
func readString(r io.ReadSeeker, buf []uint8) (s string, err error) {
	// skip 6D (field indicator?); whence 1 seeks relative to the current offset
	if _, err = r.Seek(1, 1); err != nil {
		return "", err
	}

	// read length
	sz, err := readInt(r, buf)
	if err != nil {
		return "", err
	}

	// check length
	if sz > uint32(len(buf)) {
		return "", fmt.Errorf("String too long: %d > %d", sz, len(buf))
	}

	// read string; io.ReadFull is used because a single Read call may return
	// fewer bytes than requested without reporting an error
	if _, err = io.ReadFull(r, buf[:sz]); err != nil {
		if err == io.ErrUnexpectedEOF {
			return "", fmt.Errorf("EOF while reading string")
		}
		return "", err
	}
	return string(buf[:sz]), nil
}
// extractKeys walks the hint records in r and returns how many bucket/key
// pairs it decoded.
//
// For every record it skips a fixed 21-byte header and then reads the bucket
// name and the key with readString. When the printKeys flag is set, each pair
// is written to standard output as "bucket/key". Decoding stops at the first
// error; anything other than a plain io.EOF is reported on standard error.
func extractKeys(r io.ReadSeeker) int {
	var (
		err    error
		bucket string
		key    string
	)
	count := 0
	buf := make([]uint8, 256)
	for {
		// skip timestamp (4) + key_sz (2) + total_sz (4) + offset (8) + start_of_key (3) = 21 bytes
		if _, err = r.Seek(21, 1); err != nil {
			break
		}
		// Assign with "=" rather than ":=" so the error stays visible after
		// the loop; ":=" here would declare a new err scoped to the loop body.
		if bucket, err = readString(r, buf); err != nil {
			break
		}
		if key, err = readString(r, buf); err != nil {
			break
		}
		if *printKeys {
			fmt.Printf("%s/%s\n", bucket, key)
		}
		count++
	}
	// io.EOF is how the loop ends once the input is exhausted, so it is not
	// worth reporting; every other error means decoding stopped early.
	if err != nil && err != io.EOF {
		fmt.Fprintf(os.Stderr, "%v\n", err)
	}
	return count
}
// extractKeysFromHintFile opens the hint file called filename, runs
// extractKeys over it and returns the number of keys found.
//
// When the cache flag is set, the whole file is first loaded into memory and
// parsed from there; otherwise the open file is parsed directly. Any failure
// to open, stat or read the file ends the program through log.Fatal.
func extractKeysFromHintFile(filename string) int {
	file, err := os.Open(filename)
	if err != nil {
		log.Fatal(err)
	}
	defer file.Close()

	info, err := file.Stat()
	if err != nil {
		log.Fatal(err)
	}

	// Uncached: parse straight from the file handle.
	if !*cache {
		return extractKeys(file)
	}

	// Cached: pull the complete file into RAM and parse the in-memory copy.
	data := make([]byte, info.Size())
	if _, err = io.ReadFull(file, data); err != nil {
		log.Fatal(err)
	}
	return extractKeys(bytes.NewReader(data))
}
// extractKeysFromBitcaskDir extracts the keys from every "*.hint" file found
// one directory level below dirname and prints a summary (total keys, elapsed
// seconds, keys per second) on standard error.
//
// The files are processed concurrently, one goroutine per file, but only a
// bounded number of them are worked on at any moment. A failure to expand the
// glob pattern ends the program through log.Fatal.
func extractKeysFromBitcaskDir(dirname string) {
	// maxOpenHintFiles bounds how many hint files are open (and, with the
	// cache flag, held in memory) at the same time, so that a directory with
	// very many hint files cannot exhaust file descriptors or RAM.
	const maxOpenHintFiles = 32

	start := time.Now()
	hintFiles, err := filepath.Glob(filepath.Join(dirname, "*", "*.hint"))
	if err != nil {
		log.Fatal(err)
	}

	counts := make(chan int)
	slots := make(chan struct{}, maxOpenHintFiles)
	for _, f := range hintFiles {
		go func(filename string) {
			slots <- struct{}{} // wait for a free slot
			n := extractKeysFromHintFile(filename)
			<-slots // free the slot before handing over the result
			counts <- n
		}(f)
	}

	// Exactly one result arrives per hint file.
	total := 0
	for i := 0; i < len(hintFiles); i++ {
		total += <-counts
	}

	duration := time.Since(start)
	seconds := duration.Seconds()
	// Avoid dividing by zero when the elapsed time rounds down to nothing.
	throughput := 0
	if seconds > 0 {
		throughput = int(float64(total) / seconds)
	}
	fmt.Fprintf(os.Stderr, "Extracted %d keys in %2.3f seconds (%d keys/sec)\n", total, seconds, throughput)
}
// main parses the command-line flags and extracts the keys from the bitcask
// directory named by -dirname. When -dirname is missing it prints the usage
// text and exits with status 2, so calling scripts can detect the mistake.
func main() {
	flag.Parse()
	if *dirname == "" {
		flag.Usage()
		// Same exit status the flag package itself uses for bad command lines.
		os.Exit(2)
	}
	extractKeysFromBitcaskDir(*dirname)
}
@magiconair
Copy link
Author

So Java, Scala and Go now read 338780 keys out of the data file.

  • The single-core Go version is slower than the PHP version (1.7 vs. 1.4 seconds):

[cas-go master] $ GOMAXPROCS=1 riak-key-extractor -dirname=/Users/frschroeder/Temp/bitcask -printKeys=true -cache=false > /dev/null
Extracted 338780 keys in 1.697 seconds (199576 keys/sec)

  • The eight-core Go version is about 2x faster (380K keys/sec):

[cas-go master] $ GOMAXPROCS=8 riak-key-extractor -dirname=/Users/frschroeder/Temp/bitcask -printKeys=true -cache=false > /dev/null
Extracted 338780 keys in 0.885 seconds (382933 keys/sec)

  • Enabling caching brings a 4x improvement (730K keys/sec):

[cas-go master] $ GOMAXPROCS=1 riak-key-extractor -dirname=/Users/frschroeder/Temp/bitcask -printKeys=true -cache=true > /dev/null
Extracted 338780 keys in 0.465 seconds (727859 keys/sec)

  • Enabling caching and concurrency brings an 8x improvement (1.6M keys/sec):

[cas-go master] $ GOMAXPROCS=8 riak-key-extractor -dirname=/Users/frschroeder/Temp/bitcask -printKeys=true -cache=true > /dev/null
Extracted 338780 keys in 0.212 seconds (1601749 keys/sec)

  • Disabling the output entirely (instead of redirecting it to /dev/null), combined with caching and concurrency, brings a 37x improvement (7.5M keys/sec):

[cas-go master] $ GOMAXPROCS=8 riak-key-extractor -dirname=/Users/frschroeder/Temp/bitcask -printKeys=false -cache=true
Extracted 338780 keys in 0.045 seconds (7498993 keys/sec)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment