Skip to content

Instantly share code, notes, and snippets.

@zapthedingbat
Created October 19, 2018 14:25
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save zapthedingbat/e537c64dc700c216efa5715b73a0d6f4 to your computer and use it in GitHub Desktop.
Save zapthedingbat/e537c64dc700c216efa5715b73a0d6f4 to your computer and use it in GitHub Desktop.
Simple TF-IDF
/**
 * Converts a raw term into the canonical key used to compare terms.
 *
 * The term is decomposed (NFD) so accents become separate combining marks,
 * those marks are removed, every remaining character outside `\w`
 * (ASCII letters, digits and underscore) is dropped, and the result is
 * lower-cased.
 *
 * @param {string} term - Raw term text.
 * @returns {string} Normalized key; may be the empty string when nothing
 *   in `term` survives the filtering.
 */
function getTermKey(term) {
  const decomposed = term.normalize('NFD');
  // U+0300–U+036F is the "Combining Diacritical Marks" block that NFD splits off.
  const withoutDiacritics = decomposed.replace(/[\u0300-\u036f]/g, "");
  const wordCharactersOnly = withoutDiacritics.replace(/[^\w]/g, "");
  return wordCharactersOnly.toLowerCase();
}
/**
 * Splits a document into normalized term keys.
 *
 * The text is cut on runs of whitespace, underscores and the punctuation
 * characters `( ) : . ! ? , ;`, and each piece is passed through
 * `getTermKey`. Pieces are kept in document order and duplicates are kept.
 * Note that a delimiter at the very start or end of the text (and an empty
 * document) produces an empty-string entry in the result.
 *
 * @param {string} document - Document text to tokenize.
 * @returns {string[]} One normalized key per piece of the split text.
 */
function getTermsIn(document) {
  const delimiterRun = /[\s_():.!?,;]+/;
  const termKeys = [];
  for (const rawTerm of document.split(delimiterRun)) {
    termKeys.push(getTermKey(rawTerm));
  }
  return termKeys;
}
/**
 * Creates an in-memory corpus for TF-IDF scoring.
 *
 * Documents are registered with `add`; only the set of distinct term keys of
 * each document is retained, not its text. Terms and documents are
 * normalized with `getTermKey` / `getTermsIn`, so lookups are insensitive to
 * case and diacritics.
 *
 * @returns {{
 *   add: (document: string) => void,
 *   tf: (term: string, document: string) => number,
 *   idf: (term: string) => number,
 *   tfIdf: (term: string, document: string) => number
 * }} The corpus API.
 */
function corpus() {
  // One Set of distinct term keys per added document.
  const documentTermSets = [];

  /**
   * Tokenizes a document and discards empty keys.
   *
   * The tokenizer can yield "" (e.g. for a trailing "." or a token made only
   * of stripped characters). Those are artefacts, not terms: counting them
   * would inflate the document length used by `tf` and register "" as a
   * term of the document.
   */
  function termKeysOf(document) {
    return getTermsIn(document).filter((termKey) => termKey !== "");
  }

  /**
   * Adds a document to the corpus.
   *
   * @param {string} document - Document text.
   */
  function add(document) {
    documentTermSets.push(new Set(termKeysOf(document)));
  }

  /**
   * Inverse document frequency: log10(N / (1 + n)), where N is the number of
   * documents added and n the number of them containing the term.
   *
   * Because of the "+ 1" in the denominator the result is negative for a
   * term that occurs in every document.
   *
   * @param {string} term - Term to look up.
   * @returns {number} The IDF, or 0 when the corpus is empty (the formula
   *   would otherwise evaluate log10(0) = -Infinity).
   */
  function idf(term) {
    const documentCount = documentTermSets.length;
    if (documentCount === 0) {
      return 0;
    }
    const termKey = getTermKey(term);
    const documentsContainingTerm = documentTermSets
      .filter((termSet) => termSet.has(termKey))
      .length;
    return Math.log10(documentCount / (1 + documentsContainingTerm));
  }

  /**
   * Term frequency: occurrences of the term divided by the number of terms
   * in the document.
   *
   * @param {string} term - Term to count.
   * @param {string} document - Document text (need not be in the corpus).
   * @returns {number} A value in [0, 1]; 0 when the document has no terms.
   */
  function tf(term, document) {
    const termKeys = termKeysOf(document);
    if (termKeys.length === 0) {
      // Avoid 0 / 0 = NaN for empty or punctuation-only documents.
      return 0;
    }
    const termKey = getTermKey(term);
    const occurrences = termKeys
      .filter((documentTermKey) => documentTermKey === termKey)
      .length;
    return occurrences / termKeys.length;
  }

  /**
   * TF-IDF score of a term for a document: `tf(term, document) * idf(term)`.
   *
   * @param {string} term - Term to score.
   * @param {string} document - Document text.
   * @returns {number} The product of the two measures.
   */
  function tfIdf(term, document) {
    return tf(term, document) * idf(term);
  }

  return {
    add,
    tf,
    idf,
    tfIdf,
  };
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment