Created
August 31, 2018 10:15
-
-
Save bee-san/da5efa92b21a73032c84916f236afc99 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Computes an inverse-document-frequency style score for every unique word
 * in `document`, treating each "."-delimited sentence as its own document.
 *
 * For each unique word the score is
 *   log10(occurrences of the word in the whole text
 *         / number of sentences that contain the word).
 * A word that is counted in the whole text but found in no individual
 * sentence has a sentence count of 0 and therefore scores Infinity.
 *
 * Relies on `prettify`, `uniqueWords` and `countWords`, which are defined
 * elsewhere in the file.
 *
 * @param {string} document - Full text to analyse.
 * @returns {Object<string, number>} Map from each unique word to its score.
 */
function inverseDocumentFrequency(document) {
  // Number of leading characters dropped from the first sentence.
  // NOTE(review): presumably skips a fixed-size header in the input text — confirm.
  const FIRST_SENTENCE_OFFSET = 146;

  const wordsWithoutStopwords = prettify(document);
  const uniqueWordList = uniqueWords(wordsWithoutStopwords);

  // Each sentence is treated as a separate document.
  const sentences = document.split(".").map((item) => item.trim());
  sentences[0] = sentences[0].substring(FIRST_SENTENCE_OFFSET);

  // Word counts over the whole text.
  const wordCountAll = countWords(wordsWithoutStopwords);

  // Word counts per sentence; kept local so no implicit globals are created.
  const wordCountSentences = sentences.map((sentence) => countWords(prettify(sentence)));

  // Own-property check so words such as "constructor" are not matched through
  // the prototype chain of the per-sentence count objects.
  const hasOwn = Object.prototype.hasOwnProperty;

  const IDFVals = {};
  for (const word of uniqueWordList) {
    // Count the sentences in which this word appears at least once.
    let sentencesWithWord = 0;
    for (const sentenceCounts of wordCountSentences) {
      if (hasOwn.call(sentenceCounts, word)) {
        sentencesWithWord += 1;
      }
    }
    // NOTE(review): textbook IDF uses the number of documents (sentences) as
    // the numerator; the original total-word-count numerator is kept here —
    // confirm which is intended.
    IDFVals[word] = Math.log10(wordCountAll[word] / sentencesWithWord);
  }
  return IDFVals;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment