Skip to content

Instantly share code, notes, and snippets.

@bernatfortet
Created September 5, 2020 03:41
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bernatfortet/ee9ade8260f3b45445878e5703dfe923 to your computer and use it in GitHub Desktop.
Save bernatfortet/ee9ade8260f3b45445878e5703dfe923 to your computer and use it in GitHub Desktop.
// search returns an array of all docs, ranked by tf-idf score (best match first)
// tf-idf = term frequency * inverse document frequency
//
// tf = # of occurrences of term in document / # of words in document
// idf = log ( # of documents / # of documents with term )
// tf-idf = tf * idf
// multi-term tf-idf = sum of tf-idf scores (per document)
type Doc = string

/**
 * Ranks documents against a multi-term query using tf-idf.
 *
 * For each term:
 *   tf  = occurrences of the term in the document / number of space-separated tokens
 *   idf = log(number of documents / number of documents containing the term),
 *         or 0 when no document contains the term
 * A document's score is the sum of tf * idf over all query terms.
 *
 * @param docs  Documents to rank. The input array is not mutated.
 * @param terms Query terms.
 * @returns Every document from `docs` (not only the matching ones), ordered by
 *          descending score. Documents with equal scores keep their input order,
 *          since `Array.prototype.sort` is stable.
 */
export function search(docs: Doc[], terms: string[]): Doc[] {
  // idf depends only on the term, never on the document being scored, so it is
  // computed once per term here instead of once per (document, term) pair.
  const weightedTerms = terms.map(term => {
    const docsWithTerm = countDocumentsWithTerm(docs, term)
    const idf = docsWithTerm === 0 ? 0 : Math.log(docs.length / docsWithTerm)
    return { term, idf }
  })

  const scored = docs.map(doc => {
    // Token count is a plain split on single spaces; an empty doc still yields 1,
    // so the division below never divides by zero.
    const words = doc.split(' ').length
    const score = weightedTerms.reduce((sum, { term, idf }) => {
      const tf = countOcurrences(doc, term) / words
      return sum + (tf * idf)
    }, 0)
    return { doc, score }
  })

  // `scored` is a fresh array, so sorting it in place leaves `docs` untouched.
  scored.sort((a, b) => b.score - a.score)
  return scored.map(entry => entry.doc)
}
/**
 * Counts the case-insensitive, non-overlapping occurrences of `term` in `doc`.
 *
 * The term is matched as literal text anywhere in the document (so "dog" is
 * also found inside "dogs"). Regular-expression metacharacters in the term are
 * escaped, so terms such as "c++" or "?" are searched for verbatim instead of
 * throwing or being interpreted as a pattern.
 *
 * @param doc  Document text to scan.
 * @param term Literal text to look for.
 * @returns Number of matches; 0 when `term` is empty or does not occur.
 */
function countOcurrences(doc: Doc, term: string): number {
  // An empty pattern would match at every position of the document.
  if (term === '') {
    return 0
  }
  // Escape every character that has a special meaning inside a RegExp.
  const escaped = term.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')
  const matches = doc.match(new RegExp(escaped, 'gi'))
  return matches === null ? 0 : matches.length
}
/**
 * Counts how many documents contain `term` at least once.
 *
 * The comparison is a case-insensitive substring test, and a match at any
 * position counts — including the very start of the document.
 *
 * @param docs Documents to inspect.
 * @param term Text to look for.
 * @returns Number of documents in which `term` occurs.
 */
function countDocumentsWithTerm(docs: Doc[], term: string): number {
  // Lower-case the needle once instead of once per document.
  const needle = term.toLowerCase()
  return docs.filter(doc => doc.toLowerCase().includes(needle)).length
}
// Sample query; only referenced by the commented-out call further down.
const myTerms = ['rabbit', 'dog']

// Small in-memory corpus that the command-line demo searches.
const docs: Doc[] = [
  "the quick brown fox jumps over the lazy dog",
  "i like to eat beans for dinner",
  "dogs are a man's best friend. cat we like dogs",
  "cats are the biggest trolls",
  "the dog and the cat don't get along",
  "do cats like hamburgers? let's test and find out",
  "the rabbit likes a carrot in her stew"
]

// const results = search(docs, myTerms)

// Everything after the first two argv entries is treated as a query term.
const [, , ...processTerms] = process.argv
console.log("Query:", ...processTerms)
console.log("\n")

// Print the corpus in ranked order, one document per line.
const result = search(docs, processTerms)
result.forEach(doc => {
  console.log(doc)
})
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment