Skip to content

Instantly share code, notes, and snippets.

@pataiadam
Created November 8, 2019 11:45
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save pataiadam/7a4217e82ecc943f68acebe2c4a5bbaf to your computer and use it in GitHub Desktop.
Save pataiadam/7a4217e82ecc943f68acebe2c4a5bbaf to your computer and use it in GitHub Desktop.
K-means clustering of TF-IDF vectors
const kmeans = require('node-kmeans');
/**
 * Sample corpus: nine short sentences, each tagged with a category label.
 * The label is never read by the TF-IDF computation below; only `content` is.
 * Each entry is augmented in place with `wordCount`, `tf` and `tfIdf` maps.
 */
const data = [
  { category: 'szoke', content: 'ket szoke no beszelget egy masik szoke novel' },
  { category: 'szoke', content: 'Egy szoke no bemegy az egyetemre' },
  { category: 'szoke', content: 'Egy szoke no bemegy az boltba' },
  { category: 'allat', content: 'A kutya elmegy setalni' },
  { category: 'allat', content: 'A medve meg a roka setal az erdoben' },
  { category: 'allat', content: 'Ket hernyo hernyoskodik' },
  { category: 'zsido', content: 'Ket zsido vesz egy hazat es egy autot' },
  { category: 'zsido', content: 'Egy zsido bemegy a bankba' },
  { category: 'zsido', content: 'Egy szoke zsido elmegy nyaralni' },
];

/**
 * Splits a text into lowercase, whitespace-separated tokens.
 * Runs of whitespace and leading/trailing whitespace never produce
 * empty-string tokens.
 *
 * @param {string} text - Raw document text.
 * @returns {string[]} Tokens in order of appearance.
 */
function tokenize(text) {
  return text
    .toLowerCase()
    .split(/\s+/)
    .filter((word) => word !== '');
}

// Pass 1: per-document raw counts and term frequencies (count / document length),
// while collecting the corpus vocabulary in first-seen order.
// Dictionaries are created with a null prototype so that tokens such as
// "constructor" or "toString" cannot collide with inherited Object members.
const vocabulary = new Set();
for (const doc of data) {
  const words = tokenize(doc.content);
  const wordCount = Object.create(null);
  for (const word of words) {
    wordCount[word] = (wordCount[word] || 0) + 1;
    vocabulary.add(word);
  }

  const tf = Object.create(null);
  for (const word of Object.keys(wordCount)) {
    tf[word] = wordCount[word] / words.length;
  }

  doc.wordCount = wordCount;
  doc.tf = tf;
}

/** Every distinct token of the corpus; fixes the dimension order of `vectors`. */
const allWords = [...vocabulary];

// Pass 2: document frequency (number of documents containing the word),
// then converted in place to idf = ln(N / df).
// A word that occurs in every document therefore gets an idf of 0.
const idf = Object.create(null);
for (const doc of data) {
  for (const word of Object.keys(doc.wordCount)) {
    idf[word] = (idf[word] || 0) + 1;
  }
}
for (const word of Object.keys(idf)) {
  idf[word] = Math.log(data.length / idf[word]);
}

// Pass 3: tf-idf weight for every word that occurs in each document.
for (const doc of data) {
  doc.tfIdf = Object.create(null);
  for (const word of Object.keys(doc.tf)) {
    doc.tfIdf[word] = doc.tf[word] * idf[word];
  }
}

/**
 * One dense vector per document, aligned with `allWords`;
 * words absent from a document contribute 0.
 */
const vectors = data.map((doc) => allWords.map((word) => doc.tfIdf[word] || 0));
// Partition the document vectors into k = 3 clusters. On failure the error is
// reported; on success the `clusterInd` field of every returned cluster is
// printed (presumably the indices of the member documents — confirm against
// the node-kmeans documentation).
kmeans.clusterize(vectors, { k: 3 }, (err, res) => {
  if (err) {
    console.error(err);
    return;
  }

  const clusterIndices = res.map((cluster) => cluster.clusterInd);
  console.log('%o', clusterIndices);
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment