Skip to content

Instantly share code, notes, and snippets.

@loretoparisi
Created March 15, 2018 13:37
Show Gist options
  • Save loretoparisi/07f3af8114a7a70169511351326064ae to your computer and use it in GitHub Desktop.
Save loretoparisi/07f3af8114a7a70169511351326064ae to your computer and use it in GitHub Desktop.
LDA in JavaScript
/**
* Latent Dirichlet Allocation
* Based on javascript implementation https://github.com/awaisathar/lda.js
* @author Loreto Parisi (loretoparisi at gmail dot com)
*/
"use strict";
/**
* Based on javascript https://github.com/primaryobjects/lda/
* Based on javascript implementation https://github.com/awaisathar/lda.js
* Original code based on http://www.arbylon.net/projects/LdaGibbsSampler.java
**/
var process = function(stem, stopwords, sentences, numberOfTopics, numberOfTermsPerTopic, minWordSize,
alphaValue, betaValue, iterations, burnIn, thinInterval, sampleLag, randomSeed) {
// The result will consist of topics and their included terms [[{"term":"word1", "probability":0.065}, {"term":"word2", "probability":0.047}, ... ], [{"term":"word1", "probability":0.085}, {"term":"word2", "probability":0.024}, ... ]].
var result = [];
// Index-encoded array of sentences, with each row containing the indices of the words in the vocabulary.
var documents = new Array();
// Hash of vocabulary words and the count of how many times each word has been seen.
var f = {};
// Vocabulary of unique words (porter stemmed).
var vocab=new Array();
// Vocabulary of unique words in their original form.
var vocabOrig = {};
if (sentences && sentences.length > 0) {
for(var i=0;i<sentences.length;i++) {
if (sentences[i]=="") continue;
documents[i] = new Array();
var words = sentences[i] ? sentences[i].split(/[\s,\"]+/) : null;
if(!words) continue;
for(var wc=0;wc<words.length;wc++) {
var w=words[wc].toLowerCase().replace(/[^a-z\'A-Z0-9\u00C0-\u00ff ]+/g, '');
var wStemmed = stem(w);
if (w=="" || !wStemmed || w.length<=minWordSize || stopwords.indexOf(w.replace("'", "")) > -1 || stopwords.indexOf(wStemmed) > -1) continue;
if (f[wStemmed]) {
f[wStemmed]=f[wStemmed]+1;
}
else if(wStemmed) {
f[wStemmed]=1;
vocab.push(wStemmed);
vocabOrig[wStemmed] = w;
};
documents[i].push(vocab.indexOf(wStemmed));
}
}
documents = documents.filter((doc) => { return doc.length }); // filter empty documents
var V = vocab.length;
var M = documents.length;
var K = parseInt(numberOfTopics);
var alpha = alphaValue || 0.1; // per-document distributions over topics
var beta = betaValue || .01; // per-topic distributions over words
console.log("lda vocabulary:%d documents:%d", V, M);
lda.configure(documents, V, iterations, burnIn, thinInterval, sampleLag, randomSeed);
lda.gibbs(K, alpha, beta);
var theta = lda.getTheta();
var phi = lda.getPhi();
var text = '';
//topics
var topTerms=numberOfTermsPerTopic;
for (var k = 0; k < phi.length; k++) {
var things = new Array();
for (var w = 0; w < phi[k].length; w++) {
things.push(""+phi[k][w].toPrecision(2)+"_"+vocab[w] + "_" + vocabOrig[vocab[w]]);
}
things.sort().reverse();
if(topTerms>vocab.length) topTerms=vocab.length;
//console.log('Topic ' + (k + 1));
var row = [];
for (var t = 0; t < topTerms; t++) {
var topicTerm=things[t].split("_")[2];
var prob=parseInt(things[t].split("_")[0]*100);
if (prob<2) continue;
//console.log('Top Term: ' + topicTerm + ' (' + prob + '%)');
var term = {};
term.term = topicTerm;
term.probability = parseFloat(things[t].split("_")[0]);
row.push(term);
}
result.push(row);
}
}
return result;
}
function makeArray(x) {
var a = new Array();
for (var i=0;i<x;i++) {
a[i]=0;
}
return a;
}
function make2DArray(x,y) {
var a = new Array();
for (var i=0;i<x;i++) {
a[i]=new Array();
for (var j=0;j<y;j++)
a[i][j]=0;
}
return a;
}
var lda = new function() {
var documents,z,nw,nd,nwsum,ndsum,thetasum,phisum,V,K,alpha,beta;
var THIN_INTERVAL = 20;
var BURN_IN = 100;
var ITERATIONS = 1000;
var SAMPLE_LAG;
var RANDOM_SEED;
var dispcol = 0;
var numstats=0;
this.configure = function (docs,v,iterations,burnIn,thinInterval,sampleLag,randomSeed) {
this.ITERATIONS = iterations;
this.BURN_IN = burnIn;
this.THIN_INTERVAL = thinInterval;
this.SAMPLE_LAG = sampleLag;
this.RANDOM_SEED = randomSeed;
this.documents = docs;
this.V = v;
this.dispcol=0;
this.numstats=0;
}
this.initialState = function (K) {
var i;
var M = this.documents.length;
this.nw = make2DArray(this.V,K);
this.nd = make2DArray(M,K);
this.nwsum = makeArray(K);
this.ndsum = makeArray(M);
this.z = new Array(); for (i=0;i<M;i++) this.z[i]=new Array();
for (var m = 0; m < M; m++) {
var N = this.documents[m].length;
this.z[m] = new Array();
for (var n = 0; n < N; n++) {
var topic = parseInt(""+(this.getRandom() * K));
this.z[m][n] = topic;
this.nw[this.documents[m][n]][topic]++;
this.nd[m][topic]++;
this.nwsum[topic]++;
}
this.ndsum[m] = N;
}
}
this.gibbs = function (K,alpha,beta) {
var i;
this.K = K;
this.alpha = alpha;
this.beta = beta;
if (this.SAMPLE_LAG > 0) {
this.thetasum = make2DArray(this.documents.length,this.K);
this.phisum = make2DArray(this.K,this.V);
this.numstats = 0;
}
this.initialState(K);
//document.write("Sampling " + this.ITERATIONS
// + " iterations with burn-in of " + this.BURN_IN + " (B/S="
// + this.THIN_INTERVAL + ").<br/>");
for (i = 0; i < this.ITERATIONS; i++) {
for (var m = 0; m < this.z.length; m++) {
for (var n = 0; n < this.z[m].length; n++) {
var topic = this.sampleFullConditional(m, n);
this.z[m][n] = topic;
}
}
if ((i < this.BURN_IN) && (i % this.THIN_INTERVAL == 0)) {
this.dispcol++;
}
if ((i > this.BURN_IN) && (i % this.THIN_INTERVAL == 0)) {
this.dispcol++;
}
if ((i > this.BURN_IN) && (this.SAMPLE_LAG > 0) && (i % this.SAMPLE_LAG == 0)) {
this.updateParams();
if (i % this.THIN_INTERVAL != 0)
this.dispcol++;
}
if (this.dispcol >= 100) {
this.dispcol = 0;
}
}
}
this.sampleFullConditional = function(m,n) {
var topic = this.z[m][n];
this.nw[this.documents[m][n]][topic]--;
this.nd[m][topic]--;
this.nwsum[topic]--;
this.ndsum[m]--;
var p = makeArray(this.K);
for (var k = 0; k < this.K; k++) {
p[k] = (this.nw[this.documents[m][n]][k] + this.beta) / (this.nwsum[k] + this.V * this.beta)
* (this.nd[m][k] + this.alpha) / (this.ndsum[m] + this.K * this.alpha);
}
for (var k = 1; k < p.length; k++) {
p[k] += p[k - 1];
}
var u = this.getRandom() * p[this.K - 1];
for (topic = 0; topic < p.length; topic++) {
if (u < p[topic])
break;
}
this.nw[this.documents[m][n]][topic]++;
this.nd[m][topic]++;
this.nwsum[topic]++;
this.ndsum[m]++;
return topic;
}
this.updateParams =function () {
for (var m = 0; m < this.documents.length; m++) {
for (var k = 0; k < this.K; k++) {
this.thetasum[m][k] += (this.nd[m][k] + this.alpha) / (this.ndsum[m] + this.K * this.alpha);
}
}
for (var k = 0; k < this.K; k++) {
for (var w = 0; w < this.V; w++) {
this.phisum[k][w] += (this.nw[w][k] + this.beta) / (this.nwsum[k] + this.V * this.beta);
}
}
this.numstats++;
}
this.getTheta = function() {
var theta = new Array(); for(var i=0;i<this.documents.length;i++) theta[i] = new Array();
if (this.SAMPLE_LAG > 0) {
for (var m = 0; m < this.documents.length; m++) {
for (var k = 0; k < this.K; k++) {
theta[m][k] = this.thetasum[m][k] / this.numstats;
}
}
} else {
for (var m = 0; m < this.documents.length; m++) {
for (var k = 0; k < this.K; k++) {
theta[m][k] = (this.nd[m][k] + this.alpha) / (this.ndsum[m] + this.K * this.alpha);
}
}
}
return theta;
}
this.getPhi = function () {
var phi = new Array(); for(var i=0;i<this.K;i++) phi[i] = new Array();
if (this.SAMPLE_LAG > 0) {
for (var k = 0; k < this.K; k++) {
for (var w = 0; w < this.V; w++) {
phi[k][w] = this.phisum[k][w] / this.numstats;
}
}
} else {
for (var k = 0; k < this.K; k++) {
for (var w = 0; w < this.V; w++) {
phi[k][w] = (this.nw[w][k] + this.beta) / (this.nwsum[k] + this.V * this.beta);
}
}
}
return phi;
}
this.getRandom = function() {
if (this.RANDOM_SEED) {
// generate a pseudo-random number using a seed to ensure reproducable results.
var x = Math.sin(this.RANDOM_SEED++) * 1000000;
return x - Math.floor(x);
} else {
// use standard random algorithm.
return Math.random();
}
}
}
var processP = function() {
return new Promise((resolve, reject) => {
try {
var res = process.apply(this,arguments);
return resolve(res);
} catch(error) {
return reject(error);
}
});
}
module.exports = processP
@loretoparisi
Copy link
Author

Your question it is more related to the performances in JavaScript rather than the implementation itself, that it matches the LDA implementation in Java. See here for more info about JavaScript and CPU intensive tasks - https://stackoverflow.com/questions/49300814/how-to-avoid-javascript-vm-to-be-blocked-by-cpu-bound-task

@Berkmann18
Copy link

No, my question was about your implementation compared to https://github.com/primaryobjects/lda and co.
Nothing to do with Java/Python/C++/R/... implementations, only (Node)JS ones.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment