klauspost/main.go

## main.go
package main

import (
	"encoding/binary"
	"flag"
	"fmt"
	"io/ioutil"
	"log"
	"os"
	"path/filepath"
	"sort"
)

// match pairs a window hash with the statistics used to rank it.
type match struct {
	// hash identifies the byte window; n is how often it was counted.
	hash, n uint32
	// offset is the accumulated position of the counted occurrences. It
	// orders entries that have the same n.
	offset int64
}

// matchValue holds what is known about one selected hash.
type matchValue struct {
	// value is the byte sequence recorded for the hash.
	value []byte
	// followBy and preceededBy count, per neighbouring hash, how often that
	// hash was seen directly after / directly before this one.
	followBy, preceededBy map[uint32]uint32
}

// Command line flags.
var (
	// wantLenFlag is the -len flag, defaulting to 128 KiB.
	wantLenFlag = flag.Int("len", 128<<10, "Specify custom output size")
	// wantHashBytes is the -hash flag, defaulting to 4.
	wantHashBytes = flag.Int("hash", 4, "Hash bytes match length")
	// wantMaxBytes is the -max flag, defaulting to 16 KiB.
	wantMaxBytes = flag.Int("max", 16<<10, "Max input length to index")
	// wantOutput is the -o flag, defaulting to "dict-out.txt".
	wantOutput = flag.String("o", "dict-out.txt", "Output name")
)

func main() {

	flag.Parse()
	matches := make(map[uint32]uint32)
	offsets := make(map[uint32]int64)
	var total uint64
	base := flag.Arg(0)
	if base == "" {
		log.Fatal("no path with files specified")
	}

	wantLen := *wantLenFlag
	hashBytes := *wantHashBytes
	if hashBytes < 4 || hashBytes > 8 {
		log.Fatal("-bytes must be >= 4 and <= 8")
	}
	maxBytes := *wantMaxBytes + 8
	found := make(map[uint32]struct{})
	// Index ALL hashes in all files.
	filepath.Walk(base, func(path string, info os.FileInfo, err error) error {
		if info.IsDir() {
			return nil
		}

		b, err := ioutil.ReadFile(filepath.Join(base, info.Name()))
		if err != nil {
			log.Print(err)
			return nil
		}
		if len(b) < 8 {
			return nil
		}
		if len(b) > maxBytes {
			b = b[:maxBytes]
		}
		for k := range found {
			delete(found, k)
		}
		for i := range b {
			rem := b[i:]
			if len(rem) < 8 {
				break
			}
			h := hashLen(binary.LittleEndian.Uint64(rem), 32, uint8(hashBytes))
			if _, ok := found[h]; ok {
				// Only count first occurrence
				continue
			}
			matches[h]++
			offsets[h] += int64(i)
			total++
			found[h] = struct{}{}
		}
		fmt.Print("\r"+info.Name(), " Indexed...")
		return nil
	})
	threshold := uint32(total / uint64(len(matches)))
	fmt.Println("total", total, "match", len(matches), "avg", threshold)
	sorted := make([]match, 0, len(matches)/2)
	for k, v := range matches {
		if v <= threshold {
			continue
		}
		sorted = append(sorted, match{hash: k, n: v, offset: offsets[k]})
	}
	sort.Slice(sorted, func(i, j int) bool {
		if sorted[i].n == sorted[j].n {
			return sorted[i].offset < sorted[j].offset
		}
		return sorted[i].n > sorted[j].n
	})
	fmt.Println("Sorted len:", len(sorted))
	if len(sorted) > wantLen {
		sorted = sorted[:wantLen]
	}
	fmt.Println("Cropped len:", len(sorted), "Lowest occurrence:", sorted[len(sorted)-1].n)

	wantMatches := make(map[uint32]uint32, len(sorted))
	for _, v := range sorted {
		wantMatches[v.hash] = v.n
	}

	output := make(map[uint32]matchValue, len(sorted))
	filepath.Walk(base, func(path string, info os.FileInfo, err error) error {
		if info.IsDir() {
			return nil
		}
		b, err := ioutil.ReadFile(filepath.Join(base, info.Name()))
		if err != nil {
			log.Print(err)
			return nil
		}

		if len(b) < 8 {
			return nil
		}
		if len(b) > maxBytes {
			b = b[:maxBytes]
		}

		for i := range b {
			rem := b[i:]
			if len(rem) < 8 {
				break
			}
			var prev []byte
			if i > hashBytes {
				prev = b[i-hashBytes:]
			}

			h := hashLen(binary.LittleEndian.Uint64(rem), 32, uint8(hashBytes))
			if _, ok := wantMatches[h]; !ok {
				continue
			}
			mv := output[h]
			if len(mv.value) == 0 {
				var tmp = make([]byte, hashBytes)
				copy(tmp[:], rem)
				mv.value = tmp[:]
			}
			if mv.followBy == nil {
				mv.followBy = make(map[uint32]uint32, 4)
				mv.preceededBy = make(map[uint32]uint32, 4)
			}
			if len(rem) > hashBytes+8 {
				// Check if we should add next as well.
				hNext := hashLen(binary.LittleEndian.Uint64(rem[hashBytes:]), 32, uint8(hashBytes))
				if _, ok := wantMatches[hNext]; ok {
					mv.followBy[hNext]++
				}
			}
			if len(prev) >= 8 {
				// Check if we should prev next as well.
				hPrev := hashLen(binary.LittleEndian.Uint64(prev), 32, uint8(hashBytes))
				if _, ok := wantMatches[hPrev]; ok {
					mv.preceededBy[hPrev]++
				}
			}
			output[h] = mv
		}
		fmt.Print("\r"+info.Name(), " Re-read...")
		return nil
	})

	dst := make([][]byte, 0, wantLen/hashBytes)
	for i, e := range sorted {
		m, ok := output[e.hash]
		if !ok {
			// Already added
			continue
		}
		var tmp = make([]byte, 0, hashBytes*2)
		{
			sortedPrev := make([]match, 0, len(m.followBy))
			for k, v := range m.preceededBy {
				if _, ok := output[k]; !ok {
					continue
				}
				sortedPrev = append(sortedPrev, match{
					hash: k,
					n:    v,
				})
			}
			if len(sortedPrev) > 0 {
				sort.Slice(sortedPrev, func(i, j int) bool {
					return sortedPrev[i].n > sortedPrev[j].n
				})
				bestPrev := output[sortedPrev[0].hash]
				tmp = append(tmp, bestPrev.value...)
			}
		}
		tmp = append(tmp, m.value...)
		delete(output, e.hash)
		wantLen := e.n / uint32(hashBytes) / 2
		for {
			sortedFollow := make([]match, 0, len(m.followBy))
			for k, v := range m.followBy {
				if _, ok := output[k]; !ok {
					continue
				}
				if v < wantLen {
					// Not significant enough.
					continue
				}
				sortedFollow = append(sortedFollow, match{
					hash: k,
					n:    v,
				})
			}
			if len(sortedFollow) == 0 {
				break
			}
			sort.Slice(sortedFollow, func(i, j int) bool {
				return sortedFollow[i].n > sortedFollow[j].n
			})
			nh := sortedFollow[0].hash
			m, ok = output[nh]
			if !ok {
				break
			}
			tmp = append(tmp, m.value...)
			delete(output, nh)
		}
		if i < 100 {
			fmt.Println("")
			fmt.Printf("ENTRY %d: %q (%d occurrences)", i, string(tmp), e.n)
		}
		// Delete substrings already added.
		if len(tmp) > 8 {
			for j := range tmp[:len(tmp)-hashBytes] {
				var t8 [8]byte
				copy(t8[:], tmp[j:])
				if i < 100 {
					fmt.Println("")
					fmt.Printf("DELETE %q", string(t8[:hashBytes]))
				}
				delete(output, hashLen(binary.LittleEndian.Uint64(t8[:]), 32, uint8(hashBytes)))
			}
		}
		dst = append(dst, tmp)
	}
	o, err := os.Create(*wantOutput)
	if err != nil {
		panic(err)
	}
	defer o.Close()
	written := 0
	for i, toWrite := range dst {
		if len(toWrite)+written > wantLen {
			toWrite = toWrite[:wantLen-written]
		}
		dst[i] = toWrite
		written += len(toWrite)
		if written >= wantLen {
			dst = dst[:i+1]
			break
		}
	}
	// Write in reverse order.
	for i := range dst {
		toWrite := dst[len(dst)-i-1]
		o.Write(toWrite)
	}
}

// Multipliers used by the hashN functions below; primeNbytes is the
// multiplier for hashing the lowest N bytes of the input.
const (
	prime3bytes = 506832829
	prime4bytes = 2654435761
	prime5bytes = 889523592379
	prime6bytes = 227718039650203
	prime7bytes = 58295818150454627
	prime8bytes = 0xcf1bbcdcb7a56463
)

// hashLen hashes the lowest mls bytes of u.
// For mls 5, 6, 7 and 8 the matching hashN function is called with hashLog
// as the number of output bits.
// Any other mls, including 4, returns the lowest 4 bytes of u unchanged and
// ignores hashLog.
func hashLen(u uint64, hashLog, mls uint8) uint32 {
	if mls == 5 {
		return hash5(u, hashLog)
	}
	if mls == 6 {
		return hash6(u, hashLog)
	}
	if mls == 7 {
		return hash7(u, hashLog)
	}
	if mls == 8 {
		return hash8(u, hashLog)
	}
	// 4 bytes fit the result exactly, so they are passed straight through.
	return uint32(u)
}

// hash3 hashes the lowest 3 bytes of u down to h bits.
// The top byte of u is shifted out before multiplying with prime3bytes and
// the upper h bits of the 32 bit product are returned. With h == 32 the
// product is returned unshifted.
func hash3(u uint32, h uint8) uint32 {
	shift := (32 - h) & 31
	product := (u << 8) * prime3bytes
	return product >> shift
}

// hash4 hashes all 4 bytes of u down to h bits.
// u is multiplied with prime4bytes and the upper h bits of the 32 bit
// product are returned. With h == 32 the product is returned unshifted.
func hash4(u uint32, h uint8) uint32 {
	shift := (32 - h) & 31
	product := u * prime4bytes
	return product >> shift
}

// hash4x64 hashes the lowest 4 bytes of u down to h bits.
// u is truncated to 32 bits, multiplied with prime4bytes and the upper h
// bits of the 32 bit product are returned. With h == 32 the product is
// returned unshifted.
func hash4x64(u uint64, h uint8) uint32 {
	low := uint32(u)
	shift := (32 - h) & 31
	return (low * prime4bytes) >> shift
}

// hash5 hashes the lowest 5 bytes of u down to h bits.
// The upper 3 bytes of u are shifted out before multiplying with
// prime5bytes; the upper h bits of the 64 bit product are returned,
// truncated to 32 bits.
func hash5(u uint64, h uint8) uint32 {
	shift := (64 - h) & 63
	product := (u << 24) * prime5bytes
	return uint32(product >> shift)
}

// hash6 hashes the lowest 6 bytes of u down to h bits.
// The upper 2 bytes of u are shifted out before multiplying with
// prime6bytes; the upper h bits of the 64 bit product are returned,
// truncated to 32 bits.
func hash6(u uint64, h uint8) uint32 {
	shift := (64 - h) & 63
	product := (u << 16) * prime6bytes
	return uint32(product >> shift)
}

// hash7 hashes the lowest 7 bytes of u down to h bits.
// The top byte of u is shifted out before multiplying with prime7bytes; the
// upper h bits of the 64 bit product are returned, truncated to 32 bits.
func hash7(u uint64, h uint8) uint32 {
	shift := (64 - h) & 63
	product := (u << 8) * prime7bytes
	return uint32(product >> shift)
}

// hash8 hashes all 8 bytes of u down to h bits.
// u is multiplied with prime8bytes and the upper h bits of the 64 bit
// product are returned, truncated to 32 bits.
func hash8(u uint64, h uint8) uint32 {
	shift := (64 - h) & 63
	product := u * prime8bytes
	return uint32(product >> shift)
}
	package main

	import (
	"encoding/binary"
	"flag"
	"fmt"
	"io/ioutil"
	"log"
	"os"
	"path/filepath"
	"sort"
	)

	// match pairs a window hash with the statistics used to rank it.
	type match struct {
		// hash identifies the byte window; n is how often it was counted.
		hash, n uint32
		// offset is the accumulated position of the counted occurrences. It
		// orders entries that have the same n.
		offset int64
	}

	// matchValue holds what is known about one selected hash.
	type matchValue struct {
		// value is the byte sequence recorded for the hash.
		value []byte
		// followBy and preceededBy count, per neighbouring hash, how often
		// that hash was seen directly after / directly before this one.
		followBy, preceededBy map[uint32]uint32
	}

	// Command line flags.
	var (
		// wantLenFlag is the -len flag, defaulting to 128 KiB.
		wantLenFlag = flag.Int("len", 128<<10, "Specify custom output size")
		// wantHashBytes is the -hash flag, defaulting to 4.
		wantHashBytes = flag.Int("hash", 4, "Hash bytes match length")
		// wantMaxBytes is the -max flag, defaulting to 16 KiB.
		wantMaxBytes = flag.Int("max", 16<<10, "Max input length to index")
		// wantOutput is the -o flag, defaulting to "dict-out.txt".
		wantOutput = flag.String("o", "dict-out.txt", "Output name")
	)

	func main() {

	flag.Parse()
	matches := make(map[uint32]uint32)
	offsets := make(map[uint32]int64)
	var total uint64
	base := flag.Arg(0)
	if base == "" {
	log.Fatal("no path with files specified")
	}

	wantLen := *wantLenFlag
	hashBytes := *wantHashBytes
	if hashBytes < 4 \|\| hashBytes > 8 {
	log.Fatal("-bytes must be >= 4 and <= 8")
	}
	maxBytes := *wantMaxBytes + 8
	found := make(map[uint32]struct{})
	// Index ALL hashes in all files.
	filepath.Walk(base, func(path string, info os.FileInfo, err error) error {
	if info.IsDir() {
	return nil
	}

	b, err := ioutil.ReadFile(filepath.Join(base, info.Name()))
	if err != nil {
	log.Print(err)
	return nil
	}
	if len(b) < 8 {
	return nil
	}
	if len(b) > maxBytes {
	b = b[:maxBytes]
	}
	for k := range found {
	delete(found, k)
	}
	for i := range b {
	rem := b[i:]
	if len(rem) < 8 {
	break
	}
	h := hashLen(binary.LittleEndian.Uint64(rem), 32, uint8(hashBytes))
	if _, ok := found[h]; ok {
	// Only count first occurrence
	continue
	}
	matches[h]++
	offsets[h] += int64(i)
	total++
	found[h] = struct{}{}
	}
	fmt.Print("\r"+info.Name(), " Indexed...")
	return nil
	})
	threshold := uint32(total / uint64(len(matches)))
	fmt.Println("total", total, "match", len(matches), "avg", threshold)
	sorted := make([]match, 0, len(matches)/2)
	for k, v := range matches {
	if v <= threshold {
	continue
	}
	sorted = append(sorted, match{hash: k, n: v, offset: offsets[k]})
	}
	sort.Slice(sorted, func(i, j int) bool {
	if sorted[i].n == sorted[j].n {
	return sorted[i].offset < sorted[j].offset
	}
	return sorted[i].n > sorted[j].n
	})
	fmt.Println("Sorted len:", len(sorted))
	if len(sorted) > wantLen {
	sorted = sorted[:wantLen]
	}
	fmt.Println("Cropped len:", len(sorted), "Lowest occurrence:", sorted[len(sorted)-1].n)

	wantMatches := make(map[uint32]uint32, len(sorted))
	for _, v := range sorted {
	wantMatches[v.hash] = v.n
	}

	output := make(map[uint32]matchValue, len(sorted))
	filepath.Walk(base, func(path string, info os.FileInfo, err error) error {
	if info.IsDir() {
	return nil
	}
	b, err := ioutil.ReadFile(filepath.Join(base, info.Name()))
	if err != nil {
	log.Print(err)
	return nil
	}

	if len(b) < 8 {
	return nil
	}
	if len(b) > maxBytes {
	b = b[:maxBytes]
	}

	for i := range b {
	rem := b[i:]
	if len(rem) < 8 {
	break
	}
	var prev []byte
	if i > hashBytes {
	prev = b[i-hashBytes:]
	}

	h := hashLen(binary.LittleEndian.Uint64(rem), 32, uint8(hashBytes))
	if _, ok := wantMatches[h]; !ok {
	continue
	}
	mv := output[h]
	if len(mv.value) == 0 {
	var tmp = make([]byte, hashBytes)
	copy(tmp[:], rem)
	mv.value = tmp[:]
	}
	if mv.followBy == nil {
	mv.followBy = make(map[uint32]uint32, 4)
	mv.preceededBy = make(map[uint32]uint32, 4)
	}
	if len(rem) > hashBytes+8 {
	// Check if we should add next as well.
	hNext := hashLen(binary.LittleEndian.Uint64(rem[hashBytes:]), 32, uint8(hashBytes))
	if _, ok := wantMatches[hNext]; ok {
	mv.followBy[hNext]++
	}
	}
	if len(prev) >= 8 {
	// Check if we should prev next as well.
	hPrev := hashLen(binary.LittleEndian.Uint64(prev), 32, uint8(hashBytes))
	if _, ok := wantMatches[hPrev]; ok {
	mv.preceededBy[hPrev]++
	}
	}
	output[h] = mv
	}
	fmt.Print("\r"+info.Name(), " Re-read...")
	return nil
	})

	dst := make([][]byte, 0, wantLen/hashBytes)
	for i, e := range sorted {
	m, ok := output[e.hash]
	if !ok {
	// Already added
	continue
	}
	var tmp = make([]byte, 0, hashBytes*2)
	{
	sortedPrev := make([]match, 0, len(m.followBy))
	for k, v := range m.preceededBy {
	if _, ok := output[k]; !ok {
	continue
	}
	sortedPrev = append(sortedPrev, match{
	hash: k,
	n: v,
	})
	}
	if len(sortedPrev) > 0 {
	sort.Slice(sortedPrev, func(i, j int) bool {
	return sortedPrev[i].n > sortedPrev[j].n
	})
	bestPrev := output[sortedPrev[0].hash]
	tmp = append(tmp, bestPrev.value...)
	}
	}
	tmp = append(tmp, m.value...)
	delete(output, e.hash)
	wantLen := e.n / uint32(hashBytes) / 2
	for {
	sortedFollow := make([]match, 0, len(m.followBy))
	for k, v := range m.followBy {
	if _, ok := output[k]; !ok {
	continue
	}
	if v < wantLen {
	// Not significant enough.
	continue
	}
	sortedFollow = append(sortedFollow, match{
	hash: k,
	n: v,
	})
	}
	if len(sortedFollow) == 0 {
	break
	}
	sort.Slice(sortedFollow, func(i, j int) bool {
	return sortedFollow[i].n > sortedFollow[j].n
	})
	nh := sortedFollow[0].hash
	m, ok = output[nh]
	if !ok {
	break
	}
	tmp = append(tmp, m.value...)
	delete(output, nh)
	}
	if i < 100 {
	fmt.Println("")
	fmt.Printf("ENTRY %d: %q (%d occurrences)", i, string(tmp), e.n)
	}
	// Delete substrings already added.
	if len(tmp) > 8 {
	for j := range tmp[:len(tmp)-hashBytes] {
	var t8 [8]byte
	copy(t8[:], tmp[j:])
	if i < 100 {
	fmt.Println("")
	fmt.Printf("DELETE %q", string(t8[:hashBytes]))
	}
	delete(output, hashLen(binary.LittleEndian.Uint64(t8[:]), 32, uint8(hashBytes)))
	}
	}
	dst = append(dst, tmp)
	}
	o, err := os.Create(*wantOutput)
	if err != nil {
	panic(err)
	}
	defer o.Close()
	written := 0
	for i, toWrite := range dst {
	if len(toWrite)+written > wantLen {
	toWrite = toWrite[:wantLen-written]
	}
	dst[i] = toWrite
	written += len(toWrite)
	if written >= wantLen {
	dst = dst[:i+1]
	break
	}
	}
	// Write in reverse order.
	for i := range dst {
	toWrite := dst[len(dst)-i-1]
	o.Write(toWrite)
	}
	}

	// Multipliers used by the hashN functions below; primeNbytes is the
	// multiplier for hashing the lowest N bytes of the input.
	const (
	prime3bytes = 506832829
	prime4bytes = 2654435761
	prime5bytes = 889523592379
	prime6bytes = 227718039650203
	prime7bytes = 58295818150454627
	prime8bytes = 0xcf1bbcdcb7a56463
	)

	// hashLen hashes the lowest mls bytes of u.
	// For mls 5, 6, 7 and 8 the matching hashN function is called with hashLog
	// as the number of output bits.
	// Any other mls, including 4, returns the lowest 4 bytes of u unchanged and
	// ignores hashLog.
	func hashLen(u uint64, hashLog, mls uint8) uint32 {
		if mls == 5 {
			return hash5(u, hashLog)
		}
		if mls == 6 {
			return hash6(u, hashLog)
		}
		if mls == 7 {
			return hash7(u, hashLog)
		}
		if mls == 8 {
			return hash8(u, hashLog)
		}
		// 4 bytes fit the result exactly, so they are passed straight through.
		return uint32(u)
	}

	// hash3 hashes the lowest 3 bytes of u down to h bits.
	// The top byte of u is shifted out before multiplying with prime3bytes and
	// the upper h bits of the 32 bit product are returned. With h == 32 the
	// product is returned unshifted.
	func hash3(u uint32, h uint8) uint32 {
		shift := (32 - h) & 31
		product := (u << 8) * prime3bytes
		return product >> shift
	}

	// hash4 hashes all 4 bytes of u down to h bits.
	// u is multiplied with prime4bytes and the upper h bits of the 32 bit
	// product are returned. With h == 32 the product is returned unshifted.
	func hash4(u uint32, h uint8) uint32 {
		shift := (32 - h) & 31
		product := u * prime4bytes
		return product >> shift
	}

	// hash4x64 hashes the lowest 4 bytes of u down to h bits.
	// u is truncated to 32 bits, multiplied with prime4bytes and the upper h
	// bits of the 32 bit product are returned. With h == 32 the product is
	// returned unshifted.
	func hash4x64(u uint64, h uint8) uint32 {
		low := uint32(u)
		shift := (32 - h) & 31
		return (low * prime4bytes) >> shift
	}

	// hash5 hashes the lowest 5 bytes of u down to h bits.
	// The upper 3 bytes of u are shifted out before multiplying with
	// prime5bytes; the upper h bits of the 64 bit product are returned,
	// truncated to 32 bits.
	func hash5(u uint64, h uint8) uint32 {
		shift := (64 - h) & 63
		product := (u << 24) * prime5bytes
		return uint32(product >> shift)
	}

	// hash6 hashes the lowest 6 bytes of u down to h bits.
	// The upper 2 bytes of u are shifted out before multiplying with
	// prime6bytes; the upper h bits of the 64 bit product are returned,
	// truncated to 32 bits.
	func hash6(u uint64, h uint8) uint32 {
		shift := (64 - h) & 63
		product := (u << 16) * prime6bytes
		return uint32(product >> shift)
	}

	// hash7 hashes the lowest 7 bytes of u down to h bits.
	// The top byte of u is shifted out before multiplying with prime7bytes; the
	// upper h bits of the 64 bit product are returned, truncated to 32 bits.
	func hash7(u uint64, h uint8) uint32 {
		shift := (64 - h) & 63
		product := (u << 8) * prime7bytes
		return uint32(product >> shift)
	}

	// hash8 hashes all 8 bytes of u down to h bits.
	// u is multiplied with prime8bytes and the upper h bits of the 64 bit
	// product are returned, truncated to 32 bits.
	func hash8(u uint64, h uint8) uint32 {
		shift := (64 - h) & 63
		product := u * prime8bytes
		return uint32(product >> shift)
	}