Last active
August 31, 2018 11:06
-
-
Save thesyncim/4b234f7816c642b7aeb748a956361425 to your computer and use it in GitHub Desktop.
count the word frequency
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
//todo balanced tree (should make any difference because keys are kinda random) | |
//todo avoid allocation bytes.ToLower | |
//todo reduce allocations | |
package main | |
import ( | |
"bufio" | |
"bytes" | |
"errors" | |
"fmt" | |
"hash/crc64" | |
"io" | |
"math" | |
"os" | |
"sort" | |
"log" | |
) | |
var crcTable = crc64.MakeTable(crc64.ISO) | |
type node struct { | |
hash uint64 | |
counter uint64 | |
Left *node | |
Right *node | |
} | |
func (n *node) Insert(hash, counter uint64) error { | |
if n == nil { | |
return errors.New("Cannot insert a hash into a nil tree") | |
} | |
switch { | |
// If the hash is already in the tree, update the counter. | |
case hash == n.hash: | |
n.counter = counter | |
return nil | |
case hash < n.hash: | |
if n.Left == nil { | |
n.Left = &node{hash: hash, counter: counter} | |
return nil | |
} | |
return n.Left.Insert(hash, counter) | |
case hash > n.hash: | |
if n.Right == nil { | |
n.Right = &node{hash: hash, counter: counter} | |
return nil | |
} | |
return n.Right.Insert(hash, counter) | |
} | |
return nil | |
} | |
func (n *node) Find(s uint64) (*uint64, bool) { | |
if n == nil { | |
return nil, false | |
} | |
switch { | |
case s == n.hash: | |
return &n.counter, true | |
case s < n.hash: | |
return n.Left.Find(s) | |
default: | |
return n.Right.Find(s) | |
} | |
} | |
type tree struct { | |
Root *node | |
} | |
func (t *tree) Insert(value, data uint64) error { | |
if t.Root == nil { | |
t.Root = &node{hash: value, counter: data} | |
return nil | |
} | |
return t.Root.Insert(value, data) | |
} | |
// `find` calls `node.find` unless the root node is `nil` | |
func (t *tree) find(s uint64) (*uint64, bool) { | |
if t.Root == nil { | |
return nil, false | |
} | |
return t.Root.Find(s) | |
} | |
type wordCounter struct { | |
src io.ReadSeeker | |
bufSrc *bufio.Reader | |
top *topN | |
stats tree | |
} | |
func newWordCounter(src io.ReadSeeker) *wordCounter { | |
return &wordCounter{ | |
src: src, | |
bufSrc: bufio.NewReaderSize(src, 32*1024), | |
} | |
} | |
func (wc *wordCounter) readWord(offset int) []byte { | |
wc.src.Seek(int64(offset), 0) | |
var word []byte | |
for { | |
bbyte := make([]byte, 1) | |
_, err := wc.src.Read(bbyte) | |
if err != nil && err != io.EOF { | |
panic(err) | |
} | |
if !isValidChar(bbyte[0]) { | |
return word | |
} | |
word = append(word, bbyte[0]) | |
} | |
} | |
func (wc *wordCounter) Top(n int) error { | |
wc.top = newTopN(n) | |
wc.src.Seek(0, 0) | |
var word= make([]byte,0,128) | |
var b byte | |
var err error | |
for offset:= 0; ; offset++ { | |
b, err = wc.bufSrc.ReadByte() | |
if err == io.EOF { | |
break | |
} else if err != nil { | |
return err | |
} | |
if !isValidChar(b) { | |
wlen := len(word) | |
if wlen > 0 { | |
//at this stage we can a complete word | |
m := &wordStat{ | |
count: 1, | |
checksum: crc64.Checksum(bytes.ToLower(word), crcTable), | |
wordIndex: offset - wlen, | |
} | |
if wcounter, _ := wc.stats.find(m.checksum); wcounter != nil { | |
*wcounter++ | |
m.count = *wcounter | |
} else { | |
//new entry | |
wc.stats.Insert(m.checksum, m.count) | |
} | |
wc.top.Account(m) | |
} | |
//start a new word | |
word = nil | |
} else { | |
word = append(word, b) | |
} | |
} | |
return nil | |
} | |
func (wc *wordCounter) PrintResults() { | |
sort.Stable(sort.Reverse(wc.top)) | |
for i := 0; i < len(wc.top.elements); i++ { | |
fmt.Printf(" %d %s\n", wc.top.elements[i].count, bytes.ToLower(wc.readWord(wc.top.elements[i].wordIndex))) | |
} | |
} | |
type wordStat struct { | |
checksum uint64 | |
count uint64 | |
wordIndex int | |
} | |
type topN struct { | |
N int | |
elements []*wordStat | |
min uint64 | |
max uint64 | |
} | |
func (top topN) Len() int { return len(top.elements) } | |
func (top topN) Less(i, j int) bool { return top.elements[i].count < top.elements[j].count } | |
func (top topN) Swap(i, j int) { top.elements[i], top.elements[j] = top.elements[j], top.elements[i] } | |
func (top *topN) exists(h uint64) (bool, int) { | |
for i := range top.elements { | |
if top.elements[i].checksum == h { | |
return true, i | |
} | |
} | |
return false, -1 | |
} | |
func (top *topN) setMinMax(value uint64) { | |
if value < top.min { | |
top.min = value | |
} | |
if value > top.max { | |
top.max = value | |
} | |
} | |
func (top *topN) Account(ws *wordStat) { | |
exists, index := top.exists(ws.checksum) | |
if len(top.elements) < top.N && !exists { | |
top.elements = append(top.elements, ws) | |
top.setMinMax(ws.count) | |
return | |
} | |
if ws.count >= top.min && exists { | |
top.elements[index] = ws | |
top.setMinMax(ws.count) | |
return | |
} | |
for i := range top.elements { | |
if top.elements[i].count < ws.count { | |
top.elements[i] = ws | |
top.setMinMax(ws.count) | |
return | |
} | |
} | |
} | |
func newTopN(n int) *topN { | |
return &topN{ | |
max: 0, | |
min: math.MaxUint64, | |
N: n, | |
elements: make([]*wordStat, 0, n), | |
} | |
} | |
func main() { | |
/* p := profile.Start(profile.MemProfile, profile.ProfilePath("."), profile.NoShutdownHook) | |
defer p.Stop() | |
*/ | |
if len(os.Args) < 2 { | |
log.Fatalln("invalid number of arguments") | |
} | |
file := os.Args[1] | |
f, err := os.Open(file) | |
if err != nil { | |
panic(err) | |
} | |
defer f.Close() | |
c := newWordCounter(f) | |
c.Top(20) | |
c.PrintResults() | |
} | |
func isValidChar(c byte) bool { | |
if c >= 'a' && c <= 'z' || | |
c >= 'A' && c <= 'Z' { | |
return true | |
} | |
return false | |
} | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment