Skip to content

Instantly share code, notes, and snippets.

@krrishd
Last active February 7, 2017 05:53
Show Gist options
  • Save krrishd/f5379e757dc0f0f57d92c714689a41b6 to your computer and use it in GitHub Desktop.
Save krrishd/f5379e757dc0f0f57d92c714689a41b6 to your computer and use it in GitHub Desktop.
Word frequency counter for content in your write.surge.sh savefile.
'use strict';
// Stop words: common English words and contractions that countFromRawText
// leaves out of the frequency count. Every entry is lowercase, and tokens are
// matched against this list exactly (via indexOf), so the text being counted
// must already be lowercased for these entries to match.
const obviousWords = ["a", "able", "about", "across", "after", "all", "almost", "also", "am", "among", "an", "and", "any", "are", "as", "at", "be", "because", "been", "but", "by", "can", "cannot", "could", "dear", "did", "do", "does", "either", "else", "ever", "every", "for", "from", "get", "got", "had", "has", "have", "he", "her", "hers", "him", "his", "how", "however", "i", "if", "in", "into", "is", "it", "its", "just", "least", "let", "like", "likely", "may", "me", "might", "most", "must", "my", "neither", "no", "nor", "not", "of", "off", "often", "on", "only", "or", "other", "our", "own", "rather", "said", "say", "says", "she", "should", "since", "so", "some", "than", "that", "the", "their", "them", "then", "there", "these", "they", "this", "tis", "to", "too", "twas", "us", "wants", "was", "we", "were", "what", "when", "where", "which", "while", "who", "whom", "why", "will", "with", "would", "yet", "you", "your", "ain't", "aren't", "can't", "could've", "couldn't", "didn't", "doesn't", "don't", "hasn't", "he'd", "he'll", "he's", "how'd", "how'll", "how's", "i'd", "i'll", "i'm", "i've", "isn't", "it's", "might've", "mightn't", "must've", "mustn't", "shan't", "she'd", "she'll", "she's", "should've", "shouldn't", "that'll", "that's", "there's", "they'd", "they'll", "they're", "they've", "wasn't", "we'd", "we'll", "we're", "weren't", "what'd", "what's", "when'd", "when'll", "when's", "where'd", "where'll", "where's", "who'd", "who'll", "who's", "why'd", "why'll", "why's", "won't", "would've", "wouldn't", "you'd", "you'll", "you're", "you've"];
/**
 * Count how often each word occurs in a block of text, ignoring stop words.
 *
 * The text is split on runs of whitespace (spaces, tabs, newlines), so
 * consecutive separators never produce an empty "word". Tokens are compared
 * exactly as given: no case-folding or punctuation stripping happens here.
 * Any token found in the module-level `obviousWords` list is skipped.
 *
 * @param {string} text - The text to analyse.
 * @returns {{word: string, count: number}[]} One entry per distinct word, in
 *   the order each word was first seen.
 */
const countFromRawText = (text) => {
  // Built per call so the function only depends on `obviousWords` being an
  // iterable of strings; gives O(1) stop-word lookups.
  const stopWords = new Set(obviousWords);
  // Map preserves insertion order, which keeps the first-seen ordering of
  // the returned array.
  const counts = new Map();

  for (const token of text.split(/\s+/)) {
    // Leading/trailing whitespace still yields '' at the edges of the split.
    if (token === '' || stopWords.has(token)) {
      continue;
    }
    counts.set(token, (counts.get(token) || 0) + 1);
  }

  return Array.from(counts, ([word, count]) => ({ word, count }));
};
const produceRawTextFromSavefile = (filename) => {
let file = require(`./${filename}`);
let corpus = '';
file.forEach(item => {
let itemWithoutTags = item.content.replace(/<[^>]*>/g, ' ');
corpus += itemWithoutTags + ' ';
});
return corpus.toLowerCase();
}
/**
 * Print the most frequent words, one per line, as "<rank>. <word>".
 *
 * @param {number|string} topX - How many words to print. Numeric strings
 *   (e.g. straight from `process.argv`) are accepted. If there are fewer
 *   words than requested, all of them are printed; a non-numeric value
 *   prints nothing.
 * @param {{word: string, count: number}[]} wordArray - Word/count entries.
 *   The array is not modified.
 * @returns {void}
 */
const produceTopWords = (topX, wordArray) => {
  // Sort a copy so the caller's array keeps its order. Ascending sort followed
  // by reverse is kept deliberately: it yields descending counts with the same
  // tie ordering as before.
  const sortedWords = [...wordArray]
    .sort((a, b) => a.count - b.count)
    .reverse();

  // Clamp so asking for more words than exist no longer reads past the end.
  // Math.min yields NaN for a non-numeric topX, which makes the loop a no-op.
  const limit = Math.min(Number(topX), sortedWords.length);

  for (let i = 0; i < limit; i++) {
    console.log(`${i + 1}. ${sortedWords[i].word}`);
  }
};
/**
 * Command-line driver: reads the savefile name and the number of words to
 * show from the third and fourth entries of `process.argv`, then loads the
 * text, counts its words and prints the top entries.
 *
 * @returns {void}
 */
const compose = () => {
  // argv[0] and argv[1] are skipped; the two values after them are used.
  const [, , filename, topX] = process.argv;

  const wordCounts = countFromRawText(produceRawTextFromSavefile(filename));
  produceTopWords(topX, wordCounts);
};
// Run the word counter as soon as the script is loaded.
compose();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment