Skip to content

Instantly share code, notes, and snippets.

@meekg33k
Created January 18, 2021 21:33
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save meekg33k/c67c3b6f0959592ba58659c633ec09f9 to your computer and use it in GitHub Desktop.
Save meekg33k/c67c3b6f0959592ba58659c633ec09f9 to your computer and use it in GitHub Desktop.
Algorithm to return most frequently used words
// Sample input: words that must not be counted when looking for the most frequent words.
const wordsToExclude = ['and', 'he', 'the', 'to', 'is', 'Jack', 'Jill']
// Sample input: the passage whose most frequently used words are reported by the demo call at the end of the script.
const literatureText = 'Jack and Jill went to the market to buy bread and cheese. Cheese is Jack\'s and Jill\'s favorite food'
/**
 * Splits a piece of text into word tokens.
 *
 * Any run of non-word characters (everything outside [A-Za-z0-9_]) acts as a
 * separator, so "Jack's" yields the two tokens "Jack" and "s".
 *
 * @param {string} word - The text to tokenize.
 * @returns {string[]} The tokens in their original order and casing; an empty
 *   array when the input is empty or is not a string.
 */
const getTokenizedWords = (word) => {
  if (typeof word !== 'string' || word === '') return [];
  // split() emits an empty string when the text starts or ends with a
  // separator (e.g. a trailing "."); those are not words, so drop them.
  return word.split(/\W+/).filter((token) => token !== '');
};
/**
 * Removes excluded words from a token list and normalises the survivors to
 * lower case.
 *
 * The exclusion check is case-insensitive: "The" is dropped when "the" is
 * excluded, and vice versa. The result is always lower-cased, including when
 * no exclusion list is supplied, so later counting treats "Cheese" and
 * "cheese" as the same word.
 *
 * @param {string[]} words - Tokens to filter.
 * @param {string[]} [excludedWords] - Words to leave out (any casing).
 * @returns {string[]} A new array of lower-cased tokens that are not excluded,
 *   in their original order; an empty array when `words` is empty or missing.
 */
const getValidTokenizedWords = (words, excludedWords) => {
  if (!words || words.length === 0) return [];

  // A Set gives constant-time membership checks instead of scanning the
  // exclusion list once per token.
  const exclusionList = Array.isArray(excludedWords) ? excludedWords : [];
  const excluded = new Set(exclusionList.map((entry) => String(entry).toLowerCase()));

  const validWords = [];
  for (const word of words) {
    const normalized = word.toLowerCase();
    if (!excluded.has(normalized)) {
      validWords.push(normalized);
    }
  }
  return validWords;
};
/**
 * Finds the word(s) that occur most often in a list of words.
 *
 * Words are compared exactly as given (no case folding is done here). When
 * several words share the highest count, all of them are returned, ordered by
 * the moment each one reached that count.
 *
 * @param {string[]} words - The words to count.
 * @returns {string[]} The most frequent word(s); an empty array when `words`
 *   is empty or missing.
 */
const getMostFrequentlyUsedWords = (words) => {
  if (!words || words.length === 0) return [];

  // A Map (rather than a plain object) keeps words such as "constructor" or
  // "__proto__" from colliding with properties inherited from Object.prototype.
  const wordCounts = new Map();
  let highestCount = 0;
  let mostFrequentWords = [];

  for (const word of words) {
    const count = (wordCounts.get(word) || 0) + 1;
    wordCounts.set(word, count);

    if (count > highestCount) {
      // This word just set a new record, so it is the only leader for now.
      highestCount = count;
      mostFrequentWords = [word];
    } else if (count === highestCount) {
      // This word has caught up with the current leaders.
      mostFrequentWords.push(word);
    }
  }

  return mostFrequentWords;
};
/**
 * Returns the most frequently used words of a text, ignoring excluded words.
 *
 * The text is tokenized with getTokenizedWords, filtered with
 * getValidTokenizedWords, and the remaining words are ranked with
 * getMostFrequentlyUsedWords.
 *
 * @param {string} literatureText - The text to analyse.
 * @param {string[]} wordsToExclude - Words that must not be counted.
 * @returns {string[]} The most frequent remaining word(s); an empty array when
 *   the text is empty or no countable word remains.
 */
const returnMostFrequentlyUsedWords = (literatureText, wordsToExclude) => {
  if (!literatureText) {
    return [];
  }

  const tokenizedWords = getTokenizedWords(literatureText);
  const validTokenizedWords = getValidTokenizedWords(tokenizedWords, wordsToExclude);

  // Fall back to an empty array so callers always receive an array, even if
  // every word of the text was excluded and nothing could be ranked.
  return getMostFrequentlyUsedWords(validTokenizedWords) || [];
};
// Demo: print the most frequently used non-excluded words of the sample passage.
console.log(returnMostFrequentlyUsedWords(literatureText, wordsToExclude))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment