Skip to content

Instantly share code, notes, and snippets.

@klauspost
Last active August 10, 2023 07:49
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save klauspost/e18f2db63db60b82c4f4d6c7d2d1d856 to your computer and use it in GitHub Desktop.
Save klauspost/e18f2db63db60b82c4f4d6c7d2d1d856 to your computer and use it in GitHub Desktop.
Simple dict builder
package main
import (
"encoding/binary"
"flag"
"fmt"
"io/ioutil"
"log"
"os"
"path/filepath"
"sort"
)
type match struct {
hash uint32
n uint32
offset int64
}
type matchValue struct {
value []byte
followBy map[uint32]uint32
preceededBy map[uint32]uint32
}
var wantLenFlag = flag.Int("len", 128<<10, "Specify custom output size")
var wantHashBytes = flag.Int("hash", 4, "Hash bytes match length")
var wantMaxBytes = flag.Int("max", 16<<10, "Max input length to index")
var wantOutput = flag.String("o", "dict-out.txt", "Output name")
func main() {
flag.Parse()
matches := make(map[uint32]uint32)
offsets := make(map[uint32]int64)
var total uint64
base := flag.Arg(0)
if base == "" {
log.Fatal("no path with files specified")
}
wantLen := *wantLenFlag
hashBytes := *wantHashBytes
if hashBytes < 4 || hashBytes > 8 {
log.Fatal("-bytes must be >= 4 and <= 8")
}
maxBytes := *wantMaxBytes + 8
found := make(map[uint32]struct{})
// Index ALL hashes in all files.
filepath.Walk(base, func(path string, info os.FileInfo, err error) error {
if info.IsDir() {
return nil
}
b, err := ioutil.ReadFile(filepath.Join(base, info.Name()))
if err != nil {
log.Print(err)
return nil
}
if len(b) < 8 {
return nil
}
if len(b) > maxBytes {
b = b[:maxBytes]
}
for k := range found {
delete(found, k)
}
for i := range b {
rem := b[i:]
if len(rem) < 8 {
break
}
h := hashLen(binary.LittleEndian.Uint64(rem), 32, uint8(hashBytes))
if _, ok := found[h]; ok {
// Only count first occurrence
continue
}
matches[h]++
offsets[h] += int64(i)
total++
found[h] = struct{}{}
}
fmt.Print("\r"+info.Name(), " Indexed...")
return nil
})
threshold := uint32(total / uint64(len(matches)))
fmt.Println("total", total, "match", len(matches), "avg", threshold)
sorted := make([]match, 0, len(matches)/2)
for k, v := range matches {
if v <= threshold {
continue
}
sorted = append(sorted, match{hash: k, n: v, offset: offsets[k]})
}
sort.Slice(sorted, func(i, j int) bool {
if sorted[i].n == sorted[j].n {
return sorted[i].offset < sorted[j].offset
}
return sorted[i].n > sorted[j].n
})
fmt.Println("Sorted len:", len(sorted))
if len(sorted) > wantLen {
sorted = sorted[:wantLen]
}
fmt.Println("Cropped len:", len(sorted), "Lowest occurrence:", sorted[len(sorted)-1].n)
wantMatches := make(map[uint32]uint32, len(sorted))
for _, v := range sorted {
wantMatches[v.hash] = v.n
}
output := make(map[uint32]matchValue, len(sorted))
filepath.Walk(base, func(path string, info os.FileInfo, err error) error {
if info.IsDir() {
return nil
}
b, err := ioutil.ReadFile(filepath.Join(base, info.Name()))
if err != nil {
log.Print(err)
return nil
}
if len(b) < 8 {
return nil
}
if len(b) > maxBytes {
b = b[:maxBytes]
}
for i := range b {
rem := b[i:]
if len(rem) < 8 {
break
}
var prev []byte
if i > hashBytes {
prev = b[i-hashBytes:]
}
h := hashLen(binary.LittleEndian.Uint64(rem), 32, uint8(hashBytes))
if _, ok := wantMatches[h]; !ok {
continue
}
mv := output[h]
if len(mv.value) == 0 {
var tmp = make([]byte, hashBytes)
copy(tmp[:], rem)
mv.value = tmp[:]
}
if mv.followBy == nil {
mv.followBy = make(map[uint32]uint32, 4)
mv.preceededBy = make(map[uint32]uint32, 4)
}
if len(rem) > hashBytes+8 {
// Check if we should add next as well.
hNext := hashLen(binary.LittleEndian.Uint64(rem[hashBytes:]), 32, uint8(hashBytes))
if _, ok := wantMatches[hNext]; ok {
mv.followBy[hNext]++
}
}
if len(prev) >= 8 {
// Check if we should prev next as well.
hPrev := hashLen(binary.LittleEndian.Uint64(prev), 32, uint8(hashBytes))
if _, ok := wantMatches[hPrev]; ok {
mv.preceededBy[hPrev]++
}
}
output[h] = mv
}
fmt.Print("\r"+info.Name(), " Re-read...")
return nil
})
dst := make([][]byte, 0, wantLen/hashBytes)
for i, e := range sorted {
m, ok := output[e.hash]
if !ok {
// Already added
continue
}
var tmp = make([]byte, 0, hashBytes*2)
{
sortedPrev := make([]match, 0, len(m.followBy))
for k, v := range m.preceededBy {
if _, ok := output[k]; !ok {
continue
}
sortedPrev = append(sortedPrev, match{
hash: k,
n: v,
})
}
if len(sortedPrev) > 0 {
sort.Slice(sortedPrev, func(i, j int) bool {
return sortedPrev[i].n > sortedPrev[j].n
})
bestPrev := output[sortedPrev[0].hash]
tmp = append(tmp, bestPrev.value...)
}
}
tmp = append(tmp, m.value...)
delete(output, e.hash)
wantLen := e.n / uint32(hashBytes) / 2
for {
sortedFollow := make([]match, 0, len(m.followBy))
for k, v := range m.followBy {
if _, ok := output[k]; !ok {
continue
}
if v < wantLen {
// Not significant enough.
continue
}
sortedFollow = append(sortedFollow, match{
hash: k,
n: v,
})
}
if len(sortedFollow) == 0 {
break
}
sort.Slice(sortedFollow, func(i, j int) bool {
return sortedFollow[i].n > sortedFollow[j].n
})
nh := sortedFollow[0].hash
m, ok = output[nh]
if !ok {
break
}
tmp = append(tmp, m.value...)
delete(output, nh)
}
if i < 100 {
fmt.Println("")
fmt.Printf("ENTRY %d: %q (%d occurrences)", i, string(tmp), e.n)
}
// Delete substrings already added.
if len(tmp) > 8 {
for j := range tmp[:len(tmp)-hashBytes] {
var t8 [8]byte
copy(t8[:], tmp[j:])
if i < 100 {
fmt.Println("")
fmt.Printf("DELETE %q", string(t8[:hashBytes]))
}
delete(output, hashLen(binary.LittleEndian.Uint64(t8[:]), 32, uint8(hashBytes)))
}
}
dst = append(dst, tmp)
}
o, err := os.Create(*wantOutput)
if err != nil {
panic(err)
}
defer o.Close()
written := 0
for i, toWrite := range dst {
if len(toWrite)+written > wantLen {
toWrite = toWrite[:wantLen-written]
}
dst[i] = toWrite
written += len(toWrite)
if written >= wantLen {
dst = dst[:i+1]
break
}
}
// Write in reverse order.
for i := range dst {
toWrite := dst[len(dst)-i-1]
o.Write(toWrite)
}
}
const (
prime3bytes = 506832829
prime4bytes = 2654435761
prime5bytes = 889523592379
prime6bytes = 227718039650203
prime7bytes = 58295818150454627
prime8bytes = 0xcf1bbcdcb7a56463
)
// hashLen returns a hash of the lowest l bytes of u for a size size of h bytes.
// l must be >=4 and <=8. Any other value will return hash for 4 bytes.
// h should always be <32.
// Preferably h and l should be a constant.
// LENGTH 4 is passed straight through
func hashLen(u uint64, hashLog, mls uint8) uint32 {
switch mls {
case 5:
return hash5(u, hashLog)
case 6:
return hash6(u, hashLog)
case 7:
return hash7(u, hashLog)
case 8:
return hash8(u, hashLog)
default:
return uint32(u)
}
}
// hash3 returns the hash of the lower 3 bytes of u to fit in a hash table with h bits.
// Preferably h should be a constant and should always be <32.
func hash3(u uint32, h uint8) uint32 {
return ((u << (32 - 24)) * prime3bytes) >> ((32 - h) & 31)
}
// hash4 returns the hash of u to fit in a hash table with h bits.
// Preferably h should be a constant and should always be <32.
func hash4(u uint32, h uint8) uint32 {
return (u * prime4bytes) >> ((32 - h) & 31)
}
// hash4x64 returns the hash of the lowest 4 bytes of u to fit in a hash table with h bits.
// Preferably h should be a constant and should always be <32.
func hash4x64(u uint64, h uint8) uint32 {
return (uint32(u) * prime4bytes) >> ((32 - h) & 31)
}
// hash5 returns the hash of the lowest 5 bytes of u to fit in a hash table with h bits.
// Preferably h should be a constant and should always be <64.
func hash5(u uint64, h uint8) uint32 {
return uint32(((u << (64 - 40)) * prime5bytes) >> ((64 - h) & 63))
}
// hash6 returns the hash of the lowest 6 bytes of u to fit in a hash table with h bits.
// Preferably h should be a constant and should always be <64.
func hash6(u uint64, h uint8) uint32 {
return uint32(((u << (64 - 48)) * prime6bytes) >> ((64 - h) & 63))
}
// hash7 returns the hash of the lowest 7 bytes of u to fit in a hash table with h bits.
// Preferably h should be a constant and should always be <64.
func hash7(u uint64, h uint8) uint32 {
return uint32(((u << (64 - 56)) * prime7bytes) >> ((64 - h) & 63))
}
// hash8 returns the hash of u to fit in a hash table with h bits.
// Preferably h should be a constant and should always be <64.
func hash8(u uint64, h uint8) uint32 {
return uint32((u * prime8bytes) >> ((64 - h) & 63))
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment