Created
September 5, 2020 03:41
-
-
Save bernatfortet/ee9ade8260f3b45445878e5703dfe923 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// search returns an array of the docs ranked by tf-idf score
// aka term frequency * inverse document frequency
//
// tf = # of occurrences of term in document / # of words in document
// idf = log ( # of documents / # of documents with term )
// tf-idf = tf * idf
// multi-term tf-idf = sum of tf-idf scores (per document)

/** A document is plain text. */
type Doc = string

/**
 * Ranks `docs` against the query `terms` by summed tf-idf score.
 *
 * Every input document is returned (non-matching ones simply score 0),
 * ordered from highest to lowest score. The input array is not mutated.
 *
 * @param docs  Corpus to rank.
 * @param terms Query terms; each term's tf-idf is added to a document's score.
 * @returns The documents of `docs`, best match first.
 */
export function search(docs: Doc[], terms: string[]): Doc[] {
  // idf depends only on the term and the corpus, so compute it once per
  // distinct term instead of once per (document, term) pair.
  const idfByTerm = new Map<string, number>()
  for (const term of terms) {
    if (idfByTerm.has(term)) continue
    const docsWithTerm = countDocumentsWithTerm(docs, term)
    // A term found in no document contributes nothing (and avoids log(x / 0)).
    idfByTerm.set(term, docsWithTerm === 0 ? 0 : Math.log(docs.length / docsWithTerm))
  }

  const scored = docs.map(doc => {
    // Split on any whitespace run and drop empty tokens so repeated spaces,
    // tabs or newlines do not inflate the word count.
    const wordCount = doc.split(/\s+/).filter(word => word.length > 0).length

    // A document without words cannot contain a term; scoring it 0 directly
    // avoids a 0 / 0 = NaN score that would break the sort below.
    const score = wordCount === 0
      ? 0
      : terms.reduce((sum, term) => {
          const tf = countOcurrences(doc, term) / wordCount
          return sum + tf * (idfByTerm.get(term) ?? 0)
        }, 0)

    return { doc, score }
  })

  // `scored` is a fresh array, so sorting it in place leaves `docs` untouched.
  scored.sort((a, b) => b.score - a.score)
  return scored.map(entry => entry.doc)
}
/**
 * Counts the case-insensitive, non-overlapping occurrences of `term` in `doc`.
 *
 * `term` is matched as literal text (substring match, so "dog" also matches
 * inside "dogs"). An empty term yields 0.
 *
 * @param doc  Text to search in.
 * @param term Literal text to look for.
 * @returns Number of matches found.
 */
function countOcurrences(doc: Doc, term: string): number {
  // An empty pattern would match at every position in the document.
  if (term.length === 0) return 0

  // Escape regex metacharacters so terms such as "c++" or "." are matched
  // literally instead of throwing or matching arbitrary characters.
  const escapedTerm = term.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')
  const pattern = new RegExp(escapedTerm, 'gi')
  return (doc.match(pattern) ?? []).length
}
/**
 * Counts how many documents contain `term` at least once.
 *
 * Matching is a case-insensitive substring test. An empty term is treated as
 * contained in no document.
 *
 * @param docs Corpus to scan.
 * @param term Literal text to look for.
 * @returns Number of documents containing `term`.
 */
function countDocumentsWithTerm(docs: Doc[], term: string): number {
  if (term.length === 0) return 0

  const needle = term.toLowerCase()
  // `includes` is true for a match at ANY position, including the very start
  // of the document (index 0).
  return docs.filter(doc => doc.toLowerCase().includes(needle)).length
}
// Sample query, used when no terms are passed on the command line.
const myTerms = ['rabbit', 'dog']

// Small demo corpus to rank.
const docs: Doc[] = [
  "the quick brown fox jumps over the lazy dog",
  "i like to eat beans for dinner",
  "dogs are a man's best friend. cat we like dogs",
  "cats are the biggest trolls",
  "the dog and the cat don't get along",
  "do cats like hamburgers? let's test and find out",
  "the rabbit likes a carrot in her stew"
]

// Query terms are the arguments after the first two argv entries.
const cliTerms = process.argv.slice(2)
// Fall back to the sample query so a bare invocation still shows a ranking.
const processTerms = cliTerms.length > 0 ? cliTerms : myTerms

console.log("Query:", ...processTerms)
console.log("\n")

// Print the corpus, best match first.
const result = search(docs, processTerms)
for (const doc of result) {
  console.log(doc)
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment