Last active
November 1, 2017 23:04
-
-
Save johndigital/5fb36e2c2164317235174d05aad31a98 to your computer and use it in GitHub Desktop.
K-Means strategy for finding groups of related posts in a blog
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const distance = require( 'compute-cosine-distance' ) | |
const _ = require('lodash') | |
// helper to average an array of vectors | |
// down to a single vector | |
const average = points => { | |
return _.range(points[0].length).map(dim => { | |
return _.mean(points.map(p => p[dim])) | |
}) | |
} | |
// helper to compare two vectors and | |
// determine if they are different | |
const pointsChanged = (oldV, newV) => { | |
return !!oldV.find((p, i) => { | |
return !_.isEqual(p, newV[i]) | |
}) | |
} | |
// helper to split title or content | |
// and normalize all words | |
const tokenize = string => { | |
return string.split(' ').map(w => { | |
return w.trim().toLowerCase().replace(/[^a-zA-Z]/g, '') | |
}).filter(w => w) | |
} | |
// main function to run K-Means and determine | |
// centers of post clusters given an array of | |
// posts and a K value | |
const getClusterCenters = (posts, k) => { | |
// gather all unique words from all | |
// posts into array | |
const allWords = _.uniq( | |
_.flatten(posts.map(p => { | |
return tokenize(`${ p.title } ${ p.content }`) | |
})) | |
) | |
// convert all posts into point vectors | |
// using allWords as reference | |
const points = posts.map(p => { | |
const pWords = tokenize(`${ p.title } ${ p.content }`) | |
return allWords.map(w => { | |
return pWords.includes(w) ? 1 : 0 | |
}) | |
}) | |
// initialize K random points | |
let randPoints = _.range(k).map(i => { | |
// create random vector of 1s and 0s | |
return allWords.map(w => _.random(1)) | |
}) | |
// initialize array to hold cluster | |
let clusters = [] | |
// keep track if points are moving | |
let moving = true | |
while (moving) { | |
// map over random points and create a | |
// post cluster for each | |
clusters = randPoints.map(randPoint => { | |
// create an array of all posts this | |
// random point is closest to (cluster) | |
return points.reduce((acc, point, i) => { | |
// measure distance from current point to all random points | |
const distances = randPoints.map(rp => distance(rp, point)) | |
// if this is the closest, push into cluster | |
if ( _.min(distances) == distance(randPoint, point) ){ | |
acc.push({ idx: i, point }) | |
} | |
// return accumulating cluster | |
return acc | |
}, []) | |
}) | |
// move random points by averaging all vectors | |
// within their cluster. These are "new points" | |
const newPoints = clusters.map((clust, i) => { | |
if ( !clust.length ) return randPoints[i] | |
return average(clust.map(c => c.point)) | |
}) | |
// check if the new points moved at all, if not break | |
moving = pointsChanged(newPoints, randPoints) | |
// reset random points to new points | |
randPoints = newPoints | |
} | |
// centers have stopped moving, randPoints | |
// are now at the cluster centers | |
const clusterCenters = randPoints | |
return clusterCenters | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment