Skip to content

Instantly share code, notes, and snippets.

@Stantheman
Created October 19, 2014 04:13
Show Gist options
  • Save Stantheman/609b36fd60e805adca28 to your computer and use it in GitHub Desktop.
Save Stantheman/609b36fd60e805adca28 to your computer and use it in GitHub Desktop.
package main
import (
"bufio"
"bytes"
"fmt"
IO "io/ioutil"
"os"
"sort"
"strings"
)
// wordsSortableByFrequency is a list of word counts that implements
// sort.Interface. Its natural order is ascending frequency; wrap it in
// sort.Reverse to get the most frequent words first.
type wordsSortableByFrequency []*wordWithFrequency

// wordWithFrequency pairs a word with the number of times it was counted.
type wordWithFrequency struct {
	word      string
	frequency int
}

// Len returns the number of entries in the list.
func (d wordsSortableByFrequency) Len() int {
	return len(d)
}

// Swap exchanges the entries at positions i and j.
func (d wordsSortableByFrequency) Swap(i, j int) {
	d[i], d[j] = d[j], d[i]
}

// Less reports whether entry i sorts before entry j: lower frequency first.
//
// Entries with equal frequency are ordered by descending word so that the
// ordering is total. Without the tie-break the result of sorting depends on
// the input order (sort.Sort is not stable), which makes the output vary
// from run to run when the list was built from a map. The tie-break is
// descending so that sort.Reverse lists tied words alphabetically.
func (d wordsSortableByFrequency) Less(i, j int) bool {
	if d[i].frequency != d[j].frequency {
		return d[i].frequency < d[j].frequency
	}
	return d[i].word > d[j].word
}
// main prints the most frequent words of pride-and-prejudice.txt, ignoring
// the words listed in stop_words.txt and all single letters. Both files are
// looked up in the current working directory. On failure the error is
// written to standard error and the process exits with status 1.
func main() {
	const (
		textPath      = "pride-and-prejudice.txt"
		stopWordsPath = "stop_words.txt"
		topWordCount  = 25
	)
	if err := printTopWords(textPath, stopWordsPath, topWordCount); err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
}

// printTopWords counts the words in the file at textPath, drops the stop
// words loaded from stopWordsPath, and prints at most limit lines of the
// form "word  -  count" to standard output, most frequent first. The order
// of words with equal counts is whatever wordsSortableByFrequency.Less
// defines.
func printTopWords(textPath, stopWordsPath string, limit int) error {
	stopWords, err := loadStopWords(stopWordsPath)
	if err != nil {
		return err
	}

	textFile, err := os.Open(textPath)
	if err != nil {
		return fmt.Errorf("opening text: %w", err)
	}
	defer textFile.Close()

	// The scanner buffers its own reads, so no extra bufio.Reader is needed.
	wordFrequency, err := countWords(bufio.NewScanner(textFile))
	if err != nil {
		return fmt.Errorf("reading %q: %w", textPath, err)
	}

	// Remove the words that should be ignored.
	for _, word := range stopWords {
		delete(wordFrequency, word)
	}

	// Maps cannot be sorted, so copy the entries into a sortable list.
	ranked := make(wordsSortableByFrequency, 0, len(wordFrequency))
	for word, frequency := range wordFrequency {
		ranked = append(ranked, &wordWithFrequency{word: word, frequency: frequency})
	}
	sort.Sort(sort.Reverse(ranked))

	if len(ranked) > limit {
		ranked = ranked[:limit]
	}
	for _, w := range ranked {
		fmt.Println(w.word, " - ", w.frequency)
	}
	return nil
}

// loadStopWords reads the comma-separated stop-word file at path and returns
// its entries in lowercase, followed by the single letters a through z.
// Whitespace around entries (including trailing line breaks at the end of
// the file) is ignored, and empty entries are skipped.
func loadStopWords(path string) ([]string, error) {
	// IO is this file's alias for io/ioutil.
	contents, err := IO.ReadFile(path)
	if err != nil {
		return nil, fmt.Errorf("loading stop words: %w", err)
	}

	var stopWords []string
	for _, entry := range strings.Split(strings.ToLower(string(contents)), ",") {
		if entry = strings.TrimSpace(entry); entry != "" {
			stopWords = append(stopWords, entry)
		}
	}
	// Single letters are never interesting words either.
	for letter := 'a'; letter <= 'z'; letter++ {
		stopWords = append(stopWords, string(letter))
	}
	return stopWords, nil
}

// countWords consumes scanner line by line and returns how often each word
// occurs. Lines are lowercased first; a word is a maximal run of the letters
// a through z, and every other character, as well as the end of a line,
// terminates the current word.
func countWords(scanner *bufio.Scanner) (map[string]int, error) {
	wordFrequency := make(map[string]int)
	var word bytes.Buffer

	// flush records the word collected so far, if any, and starts a new one.
	flush := func() {
		if word.Len() > 0 {
			wordFrequency[word.String()]++
			word.Reset()
		}
	}

	for scanner.Scan() {
		for _, c := range strings.ToLower(scanner.Text()) {
			if c >= 'a' && c <= 'z' {
				word.WriteRune(c)
			} else {
				flush()
			}
		}
		flush() // a line break also ends the current word
	}
	if err := scanner.Err(); err != nil {
		return nil, err
	}
	return wordFrequency, nil
}
@wallyqs
Copy link

wallyqs commented Oct 19, 2014

Thanks!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment