Created
October 19, 2014 04:13
-
-
Save Stantheman/609b36fd60e805adca28 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"bufio" | |
"bytes" | |
"fmt" | |
IO "io/ioutil" | |
"os" | |
"sort" | |
"strings" | |
) | |
// wordsSortableByFrequency is a list of word counts that implements
// sort.Interface, ordering entries by ascending frequency.
type wordsSortableByFrequency []*wordWithFrequency

// wordWithFrequency pairs a word with the number of times it was counted.
type wordWithFrequency struct {
	word      string
	frequency int
}

// Len returns the number of entries in the list.
func (d wordsSortableByFrequency) Len() int {
	return len(d)
}

// Swap exchanges the entries at positions i and j.
func (d wordsSortableByFrequency) Swap(i, j int) {
	d[i], d[j] = d[j], d[i]
}

// Less reports whether entry i sorts before entry j: lower frequency first.
//
// Entries with equal frequency are ordered by descending word. This makes the
// ordering total, so the result does not depend on the (random) order in
// which the entries were collected from a map, and it means that wrapping the
// list in sort.Reverse lists equal-frequency words alphabetically.
func (d wordsSortableByFrequency) Less(i, j int) bool {
	if d[i].frequency != d[j].frequency {
		return d[i].frequency < d[j].frequency
	}
	return d[i].word > d[j].word
}
func main() { | |
// Get the current path | |
// | |
// TODO: Check the error | |
// Join the path til the repo with the text | |
// | |
PRIDE_AND_PREJUDICE := "pride-and-prejudice.txt" | |
STOP_WORDS := "stop_words.txt" | |
// Generate the stop words and put them in an array | |
// in case of stopwords we can just read the file and put in memory | |
// | |
stopWordsContents, _ := IO.ReadFile(STOP_WORDS) | |
stopWordsContents = stopWordsContents[0:(len(stopWordsContents) - 3)] // remove three line breaks at the end | |
// TODO: check the error | |
// Split the contents of the file to generate the words to ignore | |
// | |
stopWords := strings.Split(strings.ToLower(string(stopWordsContents)), ",") | |
// Now merge the single letters too... | |
// Generate the alphabet in lowercase: a..z (97..123 in ascii) | |
// | |
for i := 97; i < 123; i++ { | |
stopWords = append(stopWords, string(i)) | |
} | |
// Leave this one open (defer closing) | |
prideAndPrejudiceTextFile, _ := os.Open(PRIDE_AND_PREJUDICE) | |
defer prideAndPrejudiceTextFile.Close() | |
// For reading the pride and prejudice text, we use a scanner instead | |
// | |
prideAndPrejudiceTextReader := bufio.NewReader(prideAndPrejudiceTextFile) | |
scanner := bufio.NewScanner(prideAndPrejudiceTextReader) | |
// Only capture lowercase alphanumeric characters | |
// | |
wordFrequency := make(map[string]int) | |
var wordBuffer bytes.Buffer | |
for scanner.Scan() { | |
line := strings.ToLower(scanner.Text()) | |
for _, c := range line { | |
if c >= 97 && c <= 123 { // Filter alphanumeric | |
wordBuffer.WriteRune(c) | |
} else if c == 32 { // Empty space, meaning that we have a word | |
if w := wordBuffer.String(); len(w) > 0 { | |
wordFrequency[w]++ | |
wordBuffer.Truncate(0) | |
} | |
} else { | |
if w := wordBuffer.String(); len(w) > 0 { | |
wordFrequency[w]++ | |
wordBuffer.Truncate(0) | |
} | |
} | |
} | |
if w := wordBuffer.String(); len(w) > 0 { | |
wordFrequency[w]++ | |
wordBuffer.Truncate(0) | |
} | |
} | |
// Remove the words that should be ignored | |
// | |
for _, word := range stopWords { | |
delete(wordFrequency, word) | |
} | |
// Turn the wordFrequency map into a list so that entries are comparable? | |
// | |
sortableWordsList := make(wordsSortableByFrequency, 0, len(wordFrequency)) | |
for word, frequency := range wordFrequency { | |
w := wordWithFrequency{word, frequency} | |
sortableWordsList = append(sortableWordsList, &w) // pass reference | |
} | |
// Sort! | |
// | |
sort.Sort(sort.Reverse(sortableWordsList)) | |
for i, w := range sortableWordsList { | |
fmt.Println(w.word, " - ", w.frequency) | |
if i > 25 { | |
break | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Thanks!