Last active
February 7, 2017 05:53
-
-
Save krrishd/f5379e757dc0f0f57d92c714689a41b6 to your computer and use it in GitHub Desktop.
Word frequency counter for content in your write.surge.sh savefile.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
'use strict';

/**
 * Stop words: common English words and contractions (all lowercase) that are
 * excluded from the frequency count because they carry little meaning.
 */
const obviousWords = [
  "a", "able", "about", "across", "after", "all", "almost", "also", "am", "among",
  "an", "and", "any", "are", "as", "at", "be", "because", "been", "but",
  "by", "can", "cannot", "could", "dear", "did", "do", "does", "either", "else",
  "ever", "every", "for", "from", "get", "got", "had", "has", "have", "he",
  "her", "hers", "him", "his", "how", "however", "i", "if", "in", "into",
  "is", "it", "its", "just", "least", "let", "like", "likely", "may", "me",
  "might", "most", "must", "my", "neither", "no", "nor", "not", "of", "off",
  "often", "on", "only", "or", "other", "our", "own", "rather", "said", "say",
  "says", "she", "should", "since", "so", "some", "than", "that", "the", "their",
  "them", "then", "there", "these", "they", "this", "tis", "to", "too", "twas",
  "us", "wants", "was", "we", "were", "what", "when", "where", "which", "while",
  "who", "whom", "why", "will", "with", "would", "yet", "you", "your",
  "ain't", "aren't", "can't", "could've", "couldn't", "didn't", "doesn't", "don't",
  "hasn't", "he'd", "he'll", "he's", "how'd", "how'll", "how's", "i'd", "i'll",
  "i'm", "i've", "isn't", "it's", "might've", "mightn't", "must've", "mustn't",
  "shan't", "she'd", "she'll", "she's", "should've", "shouldn't", "that'll",
  "that's", "there's", "they'd", "they'll", "they're", "they've", "wasn't",
  "we'd", "we'll", "we're", "weren't", "what'd", "what's", "when'd", "when'll",
  "when's", "where'd", "where'll", "where's", "who'd", "who'll", "who's",
  "why'd", "why'll", "why's", "won't", "would've", "wouldn't", "you'd", "you'll",
  "you're", "you've",
];
/**
 * Counts how often each non-trivial word occurs in a piece of text.
 *
 * The text is split on runs of whitespace (spaces, tabs, newlines). Empty
 * tokens and tokens listed in `obviousWords` are ignored. Matching is exact:
 * no case folding or punctuation stripping happens here, so callers should
 * normalize the text beforehand if they need that.
 *
 * @param {string} text - Raw text to analyze.
 * @returns {{word: string, count: number}[]} One entry per distinct counted
 *   word, in order of first appearance.
 */
const countFromRawText = (text) => {
  // A Set gives constant-time stop-word checks instead of scanning the array
  // once per token.
  const ignoredWords = new Set(obviousWords);

  // Map keeps insertion order, so the result stays in first-seen order while
  // avoiding a linear search through the results for every token.
  const counts = new Map();

  for (const token of text.split(/\s+/)) {
    // Leading/trailing whitespace (or an empty input) produces '' tokens,
    // which are not words and must not be counted.
    if (token === '' || ignoredWords.has(token)) {
      continue;
    }
    counts.set(token, (counts.get(token) || 0) + 1);
  }

  return Array.from(counts, ([word, count]) => ({ word, count }));
};
const produceRawTextFromSavefile = (filename) => { | |
let file = require(`./${filename}`); | |
let corpus = ''; | |
file.forEach(item => { | |
let itemWithoutTags = item.content.replace(/<[^>]*>/g, ' '); | |
corpus += itemWithoutTags + ' '; | |
}); | |
return corpus.toLowerCase(); | |
} | |
/**
 * Prints the most frequent words, one per line, as "<rank>. <word>".
 *
 * @param {number|string} topX - Maximum number of words to print. Numeric
 *   strings (as received from the command line) are accepted. If fewer words
 *   are available, all of them are printed.
 * @param {{word: string, count: number}[]} wordArray - Word counts to rank.
 *   The array is not modified.
 * @returns {void}
 */
const produceTopWords = (topX, wordArray) => {
  // Sort a copy so the caller's array keeps its order. Ascending sort followed
  // by reverse (rather than a descending comparator) is kept on purpose: it
  // determines the order in which words with equal counts are listed.
  const sortedWords = [...wordArray]
    .sort((a, b) => a.count - b.count)
    .reverse();

  // Never read past the end of the list when more words are requested than
  // exist. A non-numeric topX yields NaN here, so nothing is printed.
  const limit = Math.min(Number(topX), sortedWords.length);

  for (let i = 0; i < limit; i++) {
    console.log(`${i + 1}. ${sortedWords[i].word}`);
  }
};
/**
 * Command-line entry point: reads the save file named by the first argument,
 * counts its words and prints the most frequent ones.
 *
 * Usage: node <script> <savefile> [topX]
 *
 * `topX` defaults to 10 when omitted. On a missing file name or an invalid
 * `topX`, a usage message is written to stderr and the process exit code is
 * set to 1.
 *
 * @returns {void}
 */
const compose = () => {
  const defaultTopX = 10;
  const usage = 'Usage: node <script> <savefile> [topX]';

  // argv[0] is the node binary and argv[1] the script path.
  const [, , filename, requestedTopX] = process.argv;

  if (filename === undefined) {
    console.error(usage);
    process.exitCode = 1;
    return;
  }

  // Command-line arguments are strings; convert once here so downstream code
  // works with a real number.
  const topX = requestedTopX === undefined
    ? defaultTopX
    : Number.parseInt(requestedTopX, 10);

  if (Number.isNaN(topX) || topX < 0) {
    console.error(usage);
    process.exitCode = 1;
    return;
  }

  const rawText = produceRawTextFromSavefile(filename);
  const words = countFromRawText(rawText);
  produceTopWords(topX, words);
};
// Run the command-line entry point when the script is executed.
compose();
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment