Skip to content

Instantly share code, notes, and snippets.

@thesyncim
Last active August 31, 2018 11:06
Show Gist options
  • Save thesyncim/4b234f7816c642b7aeb748a956361425 to your computer and use it in GitHub Desktop.
Save thesyncim/4b234f7816c642b7aeb748a956361425 to your computer and use it in GitHub Desktop.
count the word frequency
//todo balanced tree (should make any difference because keys are kinda random)
//todo avoid allocation bytes.ToLower
//todo reduce allocations
package main
import (
"bufio"
"bytes"
"errors"
"fmt"
"hash/crc64"
"io"
"math"
"os"
"sort"
"log"
)
var crcTable = crc64.MakeTable(crc64.ISO)
type node struct {
hash uint64
counter uint64
Left *node
Right *node
}
func (n *node) Insert(hash, counter uint64) error {
if n == nil {
return errors.New("Cannot insert a hash into a nil tree")
}
switch {
// If the hash is already in the tree, update the counter.
case hash == n.hash:
n.counter = counter
return nil
case hash < n.hash:
if n.Left == nil {
n.Left = &node{hash: hash, counter: counter}
return nil
}
return n.Left.Insert(hash, counter)
case hash > n.hash:
if n.Right == nil {
n.Right = &node{hash: hash, counter: counter}
return nil
}
return n.Right.Insert(hash, counter)
}
return nil
}
func (n *node) Find(s uint64) (*uint64, bool) {
if n == nil {
return nil, false
}
switch {
case s == n.hash:
return &n.counter, true
case s < n.hash:
return n.Left.Find(s)
default:
return n.Right.Find(s)
}
}
type tree struct {
Root *node
}
func (t *tree) Insert(value, data uint64) error {
if t.Root == nil {
t.Root = &node{hash: value, counter: data}
return nil
}
return t.Root.Insert(value, data)
}
// `find` calls `node.find` unless the root node is `nil`
func (t *tree) find(s uint64) (*uint64, bool) {
if t.Root == nil {
return nil, false
}
return t.Root.Find(s)
}
type wordCounter struct {
src io.ReadSeeker
bufSrc *bufio.Reader
top *topN
stats tree
}
func newWordCounter(src io.ReadSeeker) *wordCounter {
return &wordCounter{
src: src,
bufSrc: bufio.NewReaderSize(src, 32*1024),
}
}
func (wc *wordCounter) readWord(offset int) []byte {
wc.src.Seek(int64(offset), 0)
var word []byte
for {
bbyte := make([]byte, 1)
_, err := wc.src.Read(bbyte)
if err != nil && err != io.EOF {
panic(err)
}
if !isValidChar(bbyte[0]) {
return word
}
word = append(word, bbyte[0])
}
}
func (wc *wordCounter) Top(n int) error {
wc.top = newTopN(n)
wc.src.Seek(0, 0)
var word= make([]byte,0,128)
var b byte
var err error
for offset:= 0; ; offset++ {
b, err = wc.bufSrc.ReadByte()
if err == io.EOF {
break
} else if err != nil {
return err
}
if !isValidChar(b) {
wlen := len(word)
if wlen > 0 {
//at this stage we can a complete word
m := &wordStat{
count: 1,
checksum: crc64.Checksum(bytes.ToLower(word), crcTable),
wordIndex: offset - wlen,
}
if wcounter, _ := wc.stats.find(m.checksum); wcounter != nil {
*wcounter++
m.count = *wcounter
} else {
//new entry
wc.stats.Insert(m.checksum, m.count)
}
wc.top.Account(m)
}
//start a new word
word = nil
} else {
word = append(word, b)
}
}
return nil
}
func (wc *wordCounter) PrintResults() {
sort.Stable(sort.Reverse(wc.top))
for i := 0; i < len(wc.top.elements); i++ {
fmt.Printf(" %d %s\n", wc.top.elements[i].count, bytes.ToLower(wc.readWord(wc.top.elements[i].wordIndex)))
}
}
type wordStat struct {
checksum uint64
count uint64
wordIndex int
}
type topN struct {
N int
elements []*wordStat
min uint64
max uint64
}
func (top topN) Len() int { return len(top.elements) }
func (top topN) Less(i, j int) bool { return top.elements[i].count < top.elements[j].count }
func (top topN) Swap(i, j int) { top.elements[i], top.elements[j] = top.elements[j], top.elements[i] }
func (top *topN) exists(h uint64) (bool, int) {
for i := range top.elements {
if top.elements[i].checksum == h {
return true, i
}
}
return false, -1
}
func (top *topN) setMinMax(value uint64) {
if value < top.min {
top.min = value
}
if value > top.max {
top.max = value
}
}
func (top *topN) Account(ws *wordStat) {
exists, index := top.exists(ws.checksum)
if len(top.elements) < top.N && !exists {
top.elements = append(top.elements, ws)
top.setMinMax(ws.count)
return
}
if ws.count >= top.min && exists {
top.elements[index] = ws
top.setMinMax(ws.count)
return
}
for i := range top.elements {
if top.elements[i].count < ws.count {
top.elements[i] = ws
top.setMinMax(ws.count)
return
}
}
}
func newTopN(n int) *topN {
return &topN{
max: 0,
min: math.MaxUint64,
N: n,
elements: make([]*wordStat, 0, n),
}
}
func main() {
/* p := profile.Start(profile.MemProfile, profile.ProfilePath("."), profile.NoShutdownHook)
defer p.Stop()
*/
if len(os.Args) < 2 {
log.Fatalln("invalid number of arguments")
}
file := os.Args[1]
f, err := os.Open(file)
if err != nil {
panic(err)
}
defer f.Close()
c := newWordCounter(f)
c.Top(20)
c.PrintResults()
}
func isValidChar(c byte) bool {
if c >= 'a' && c <= 'z' ||
c >= 'A' && c <= 'Z' {
return true
}
return false
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment