Created
March 24, 2019 01:10
-
-
Save SergProduction/3c91bef18b51309472d17c63317805bc to your computer and use it in GitHub Desktop.
TF-IDF weighting and document comparison by cosine similarity
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// ----- lib -----
/**
 * Binary `+`, usable as a reducer or zip callback.
 * @param {number} a - Left operand.
 * @param {number} b - Right operand.
 * @returns {number} The result of `a + b`.
 */
const plus = (a, b) => a + b
/**
 * Binary `*`, usable as a reducer or zip callback.
 * @param {number} a - Left operand.
 * @param {number} b - Right operand.
 * @returns {number} The result of `a * b`.
 */
const mul = (a, b) => a * b
/**
 * Adds up all elements of an array using `plus`.
 *
 * The reduction is seeded with 0 so that an empty array yields 0 instead of
 * throwing "Reduce of empty array with no initial value".
 * @param {number[]} arr - Numbers to add.
 * @returns {number} The total, or 0 for an empty array.
 */
const sum = arr => arr.reduce(plus, 0)
/**
 * Combines two arrays element-wise with `f`.
 *
 * The result has as many elements as the shorter input; surplus elements of
 * the longer array are ignored.
 * @param {Function} f - Called as `f(arr1[i], arr2[i])` for each index.
 * @param {Array} arr1 - First input array.
 * @param {Array} arr2 - Second input array.
 * @returns {Array} The values returned by `f`, in index order.
 */
const zip = (f, arr1, arr2) => {
  const length = Math.min(arr1.length, arr2.length)
  return Array.from({ length }, (_, i) => f(arr1[i], arr2[i]))
}
/**
 * Right-to-left function composition: `compose(f, g)(x)` is `f(g(x))`.
 *
 * Declared with `const`; the bare assignment used before created an implicit
 * global and throws a ReferenceError in strict mode / ES modules.
 *
 * The reduction is seeded with an identity function, so `compose()` returns
 * identity. Because that identity takes a single parameter, only the first
 * argument of the composed call reaches the rightmost function.
 * @param {...Function} fns - Functions to compose, outermost first.
 * @returns {Function} The composed function.
 */
const compose = (...fns) =>
  fns.reduceRight(
    (inner, outer) => (...args) => outer(inner(...args)),
    value => value
  )
// ----- program -----
/**
 * Tokenizes a document into lower-cased words.
 *
 * Periods and commas are removed, then the text is split on any run of
 * whitespace (spaces, tabs, newlines) and empty tokens are dropped.
 * @param {string} doc - Raw document text.
 * @returns {string[]} The words of the document, in order of appearance.
 */
const splitDocToWords = (doc) =>
  doc
    .toLocaleLowerCase()
    .replace(/[.,]/g, '')
    .split(/\s+/)
    .filter(Boolean)
/**
 * Counts how many times each word occurs in a word list.
 *
 * Declared with `const` (it used to be an implicit global). The misspelled
 * name is kept because other code calls it by this name.
 * @param {string[]} words - Words of one document.
 * @returns {Map<string, number>} Map from word to its occurrence count, in
 *   first-seen order.
 */
const conuntWords = (words) => {
  const counts = new Map()
  for (const word of words) {
    counts.set(word, (counts.get(word) || 0) + 1)
  }
  return counts
}
// TF of a term = (number of times the term occurs in the text) / (total number of words in the text)
// Result per document: { [term]: occurrences of the term / total number of words in the text }
/**
 * Computes the term-frequency map of every document.
 *
 * The denominator is the total number of words in the document. The previous
 * code divided by the number of distinct words, which contradicts the formula
 * above and inflates TF for any document containing repeated words.
 * @param {string[]} docs - Raw document texts.
 * @returns {Map<string, number>[]} One map per document, term -> TF.
 *   A document without words yields an empty map.
 */
const calculateTF = docs => docs.map(doc => {
  const listWord = splitDocToWords(doc)
  const totalWords = listWord.length
  const docTF = new Map()
  for (const [word, count] of conuntWords(listWord)) {
    docTF.set(word, count / totalWords)
  }
  return docTF
})
// { [term]: number of documents in which the term occurs }
/**
 * Computes the document frequency of every term.
 *
 * Each per-document map lists a term at most once (Map keys are unique), so
 * no extra "already seen in this document" bookkeeping is needed.
 * @param {Map<string, *>[]} docsTF - One map per document, keyed by term.
 * @returns {Map<string, number>} Map from term to the number of documents
 *   whose map contains it.
 */
const calculateDF = docsTF => {
  const DF = new Map()
  for (const docTF of docsTF) {
    for (const word of docTF.keys()) {
      DF.set(word, (DF.get(word) || 0) + 1)
    }
  }
  return DF
}
// IDF of a term = log(total number of documents / number of documents in which the term occurs)
// Returns a dictionary of all words whose values are their IDF
/**
 * Converts document frequencies into inverse document frequencies.
 * @param {Map<string, number>} rowIDF - Map from term to the number of
 *   documents containing it (document frequency).
 * @param {number} [totalDocs=rowIDF.size] - Total number of documents in the
 *   corpus. Callers should pass the real document count. The default is the
 *   number of distinct terms, kept only so that existing one-argument calls
 *   behave as before; it does not match the formula above.
 * @returns {Map<string, number>} Map from term to
 *   `Math.log(totalDocs / documentFrequency)`.
 */
const calculateIDF = (rowIDF, totalDocs = rowIDF.size) => {
  const mapIDF = new Map()
  for (const [word, countInDocs] of rowIDF) {
    mapIDF.set(word, Math.log(totalDocs / countInDocs))
  }
  return mapIDF
}
/**
 * Attaches a TF-IDF weight to every term of every document.
 *
 * A term that is absent from `termsIDF` (for example a word of a query
 * document that never occurs in the indexed corpus) gets weight 0. Without
 * this guard the product `undefined * tf` would store NaN.
 * @param {Map<string, number>[]} docsTF - One map per document, term -> TF.
 * @param {Map<string, number>} termsIDF - Map from term to IDF.
 * @returns {Map<string, {tf: number, tf_idf: number}>[]} One map per
 *   document, term -> its TF and its TF * IDF.
 */
const calculateTFIDF = (docsTF, termsIDF) => docsTF.map(docTF => {
  const docTFIDF = new Map()
  for (const [word, tf] of docTF) {
    const idf = termsIDF.has(word) ? termsIDF.get(word) : 0
    docTFIDF.set(word, { tf, tf_idf: idf * tf })
  }
  return docTFIDF
})
/**
 * Projects one document onto the full vocabulary as a dense vector.
 *
 * The vector has one component per key of `terms`, in the iteration order of
 * `terms`. A component is the document's `tf_idf` for that term, or 0 when
 * the document does not contain it. Document terms that are not keys of
 * `terms` do not appear in the vector.
 * @param {Map<string, *>} terms - Vocabulary; only its keys are used.
 * @param {Map<string, {tf_idf: number}>} docTFIDF - Weights of one document.
 * @returns {number[]} Vector with `terms.size` components.
 */
const toVectorMeasureAllTerms = (terms, docTFIDF) =>
  Array.from(terms.keys(), word => (
    docTFIDF.has(word) ? docTFIDF.get(word).tf_idf : 0
  ))
/**
 * Cosine of the angle between two numeric vectors.
 *
 * Returns 0 when either vector is empty or has zero magnitude. Without these
 * guards the result would be NaN (0 / 0), or an exception from reducing an
 * empty array.
 * @param {number[]} v1 - First vector.
 * @param {number[]} v2 - Second vector; components beyond the shorter length
 *   are ignored by the dot product because `zip` truncates.
 * @returns {number} dot(v1, v2) / (|v1| * |v2|), or 0 for degenerate input.
 */
const cosine_similarity = (v1, v2) => {
  if (v1.length === 0 || v2.length === 0) {
    return 0
  }
  const dot_product2 = (a, b) => sum(zip(mul, a, b))
  const len1 = Math.sqrt(dot_product2(v1, v1))
  const len2 = Math.sqrt(dot_product2(v2, v2))
  if (len1 === 0 || len2 === 0) {
    return 0
  }
  return dot_product2(v1, v2) / (len1 * len2)
}
// Source documents: term statistics are computed from these texts, and each
// check document is compared against every one of them.
const docs = [
  'Я люблю тортики больше, чем яблоки',
  'Я уважаю апельсины больше, чем торты',
  'Яблочные сады раскинулись над дорогой',
  'Ехал Грека через реку',
]
// Check documents: each one is weighted with the IDF of the source documents
// and compared to every source document by cosine similarity.
const checkDocs = [
  'Тортики делают из муки, апельсины и воды',
  'Торты исчезли там, где появился я',
  'Ехал тортик через реку',
]
// ----- index the source documents -----
const docsTF = calculateTF(docs)
// IDF is log(total documents / documents containing the term), so the number
// of source documents is passed along with the document frequencies.
const termsIDF = calculateIDF(calculateDF(docsTF), docsTF.length)
const docsTFIDF = calculateTFIDF(docsTF, termsIDF)
// The source vectors do not depend on the check document, so build them once.
const sourceVecs = docsTFIDF.map(sourceDoc => toVectorMeasureAllTerms(termsIDF, sourceDoc))

// ----- compare -----
const checkDocsTF = calculateTF(checkDocs)
const checkDocsTFIDF = calculateTFIDF(checkDocsTF, termsIDF)

// For every check document print its text, then each source text followed by
// the cosine similarity between the two TF-IDF vectors.
checkDocsTFIDF.forEach((checkDoc, checkIdx) => {
  console.log('check', checkDocs[checkIdx])
  const checkVec = toVectorMeasureAllTerms(termsIDF, checkDoc)
  sourceVecs.forEach((sourceVec, srcIdx) => {
    console.log('source', docs[srcIdx])
    console.log(cosine_similarity(checkVec, sourceVec))
  })
})
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment