Skip to content

Instantly share code, notes, and snippets.

@johndigital
Last active November 1, 2017 23:04
Show Gist options
  • Save johndigital/5fb36e2c2164317235174d05aad31a98 to your computer and use it in GitHub Desktop.
Save johndigital/5fb36e2c2164317235174d05aad31a98 to your computer and use it in GitHub Desktop.
K-Means strategy for finding groups of related posts in a blog
const distance = require( 'compute-cosine-distance' )
const _ = require('lodash')
// helper to average an array of vectors
// down to a single vector
const average = points => {
return _.range(points[0].length).map(dim => {
return _.mean(points.map(p => p[dim]))
})
}
// helper to compare two vectors and
// determine if they are different
const pointsChanged = (oldV, newV) => {
return !!oldV.find((p, i) => {
return !_.isEqual(p, newV[i])
})
}
// helper to split title or content
// and normalize all words
const tokenize = string => {
return string.split(' ').map(w => {
return w.trim().toLowerCase().replace(/[^a-zA-Z]/g, '')
}).filter(w => w)
}
// main function to run K-Means and determine
// centers of post clusters given an array of
// posts and a K value
const getClusterCenters = (posts, k) => {
// gather all unique words from all
// posts into array
const allWords = _.uniq(
_.flatten(posts.map(p => {
return tokenize(`${ p.title } ${ p.content }`)
}))
)
// convert all posts into point vectors
// using allWords as reference
const points = posts.map(p => {
const pWords = tokenize(`${ p.title } ${ p.content }`)
return allWords.map(w => {
return pWords.includes(w) ? 1 : 0
})
})
// initialize K random points
let randPoints = _.range(k).map(i => {
// create random vector of 1s and 0s
return allWords.map(w => _.random(1))
})
// initialize array to hold cluster
let clusters = []
// keep track if points are moving
let moving = true
while (moving) {
// map over random points and create a
// post cluster for each
clusters = randPoints.map(randPoint => {
// create an array of all posts this
// random point is closest to (cluster)
return points.reduce((acc, point, i) => {
// measure distance from current point to all random points
const distances = randPoints.map(rp => distance(rp, point))
// if this is the closest, push into cluster
if ( _.min(distances) == distance(randPoint, point) ){
acc.push({ idx: i, point })
}
// return accumulating cluster
return acc
}, [])
})
// move random points by averaging all vectors
// within their cluster. These are "new points"
const newPoints = clusters.map((clust, i) => {
if ( !clust.length ) return randPoints[i]
return average(clust.map(c => c.point))
})
// check if the new points moved at all, if not break
moving = pointsChanged(newPoints, randPoints)
// reset random points to new points
randPoints = newPoints
}
// centers have stopped moving, randPoints
// are now at the cluster centers
const clusterCenters = randPoints
return clusterCenters
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment