Skip to content

Instantly share code, notes, and snippets.

@helinwang
Last active January 27, 2016 08:14
Show Gist options
  • Save helinwang/ac5cac246716aae44f62 to your computer and use it in GitHub Desktop.
Save helinwang/ac5cac246716aae44f62 to your computer and use it in GitHub Desktop.
Calculate the maximum TF-IDF score of each word in a text corpus (the files in a given directory), written in Go.
package main
import (
"flag"
"fmt"
"io/ioutil"
"math"
"os"
"sort"
"strings"
"sync"
)
// Corpus-wide statistics shared by all file-reading goroutines.
var (
	// mu guards wordToCount and docToWordCount.
	mu sync.Mutex

	// wordToCount maps a lower-cased word to its total number of
	// occurrences across every file processed so far.
	wordToCount = make(map[string]int)

	// docToWordCount maps a file path to that file's own word -> count table.
	docToWordCount = make(map[string]map[string]int)
)
// tfidf pairs a word with its TF-IDF score.
type tfidf struct {
	word  string
	score float64
}

// tfidfs is a list of scored words that implements sort.Interface,
// ordering by ascending score and then by word.
type tfidfs []tfidf

// Len returns the number of scored words.
func (t tfidfs) Len() int {
	return len(t)
}

// Less reports whether element i sorts before element j. Lower scores come
// first; equal scores are ordered alphabetically by word so that sorting the
// same set of entries always yields the same sequence, regardless of the
// order in which they were inserted.
func (t tfidfs) Less(i, j int) bool {
	if t[i].score != t[j].score {
		return t[i].score < t[j].score
	}
	return t[i].word < t[j].word
}

// Swap exchanges elements i and j.
func (t tfidfs) Swap(i, j int) {
	t[i], t[j] = t[j], t[i]
}
// getStat counts the words in the file at path and merges the result into
// the package-level wordToCount and docToWordCount maps.
//
// token is used as a counting semaphore: one value is received from it
// before the file is opened and sent back when the function returns.
// wg.Done is called on return. A failure to open or read the file panics.
func getStat(path string, token chan struct{}, wg *sync.WaitGroup) {
	defer wg.Done()

	// Take a read slot and hand it back on the way out.
	<-token
	defer func() { token <- struct{}{} }()

	file, err := os.Open(path)
	if err != nil {
		panic(err)
	}
	data, err := ioutil.ReadAll(file)
	if err != nil {
		panic(err)
	}
	file.Close()

	// Turn every byte that is not an ASCII letter into a space, so that
	// words end up separated by spaces only.
	for i, c := range data {
		isLetter := ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z')
		if !isLetter {
			data[i] = ' '
		}
	}

	// The buffer now holds only letters and spaces, so splitting on
	// whitespace yields exactly the non-empty words.
	counts := make(map[string]int)
	for _, word := range strings.Fields(string(data)) {
		counts[strings.ToLower(word)]++
	}

	// Publish this file's counts under the shared lock.
	mu.Lock()
	for word, n := range counts {
		wordToCount[word] += n
	}
	docToWordCount[path] = counts
	mu.Unlock()
}
// main computes, for every word found in the regular files directly inside
// -inDir, the highest TF-IDF score that word reaches in any single file, and
// prints one "score word" line per word in ascending sort order.
//
// Flags:
//
//	-c      maximum number of files read at the same time (must be >= 1)
//	-inDir  directory whose files make up the corpus (required)
//
// Sub-directories are skipped. A failure to list the directory panics.
func main() {
	concurrent := flag.Int("c", 10240, "concurrent read limit")
	inDir := flag.String("inDir", "", "input dir to calculate tfidf")
	flag.Parse()
	// A limit below 1 would leave the token channel without any tokens (or
	// make its allocation panic), so no worker could ever proceed.
	if *inDir == "" || *concurrent < 1 {
		flag.Usage()
		return
	}
	files, err := ioutil.ReadDir(*inDir)
	if err != nil {
		panic(err)
	}

	// token is a counting semaphore pre-filled with one value per allowed
	// concurrent read; each worker takes one and returns it when done.
	token := make(chan struct{}, *concurrent)
	for i := 0; i < *concurrent; i++ {
		token <- struct{}{}
	}

	var wg sync.WaitGroup
	for _, file := range files {
		if file.IsDir() {
			continue
		}
		// Add must run before the goroutine starts: otherwise the worker's
		// Done could execute first and drive the counter negative, or Wait
		// could return before the worker has been accounted for.
		wg.Add(1)
		go getStat(*inDir+"/"+file.Name(), token, &wg)
	}
	// Every worker has called Done once Wait returns, so the shared maps
	// are complete and can be read without further locking.
	wg.Wait()

	// Total number of words per document, computed once up front instead of
	// once for every (word, document) pair.
	docTotals := make(map[string]int, len(docToWordCount))
	for doc, counts := range docToWordCount {
		total := 0
		for _, c := range counts {
			total += c
		}
		docTotals[doc] = total
	}

	totalDoc := len(docToWordCount)
	scores := make(tfidfs, len(wordToCount))
	idx := 0
	for w := range wordToCount {
		docCount := 0 // number of documents containing w
		maxFreq := float64(0)
		for doc, counts := range docToWordCount {
			c, ok := counts[w]
			if !ok {
				continue
			}
			docCount++
			// Term frequency of w within this document.
			if freq := float64(c) / float64(docTotals[doc]); freq > maxFreq {
				maxFreq = freq
			}
		}
		// Inverse document frequency; docCount is at least 1 because every
		// key of wordToCount was contributed by some document.
		idf := math.Log2(float64(totalDoc) / float64(docCount))
		scores[idx].score = maxFreq * idf
		scores[idx].word = w
		idx++
	}

	sort.Sort(scores)
	for _, s := range scores {
		fmt.Println(s.score, s.word)
	}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment