@john-guerra
Created August 22, 2018 20:38
netClustering.js
/*
Copyright 2010 by Robin W. Spencer
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You can find a copy of the GNU General Public License
at http://www.gnu.org/licenses/.
*/
/*jslint browser: true, devel: true, regexp: true, white: true */
"use strict";
var netClustering = {
version: "0.1"
};
(function() {
// "use strict";
// Specific utilities: data import & recursive traverses
function binaryTreeWalk(currentNode,tree,depth,doLeafAction,doDownAction,doUpAction){
// General purpose recursive binary depth-first tree walk, with three possible action functions:
// at each leaf node, on the way down a branch, and on the way back up a branch.
if(tree[currentNode].leftChild>-1){
depth+=1;
if(doDownAction) { doDownAction(currentNode,tree,depth); }
binaryTreeWalk(tree[currentNode].leftChild,tree,depth,doLeafAction,doDownAction,doUpAction);
}
if(tree[currentNode].rightChild==-1){ // It's a leaf node
if(doLeafAction){ doLeafAction(currentNode,tree,depth); }
}
if(tree[currentNode].rightChild>-1){
binaryTreeWalk(tree[currentNode].rightChild,tree,depth,doLeafAction,doDownAction,doUpAction);
if(doUpAction){ doUpAction(currentNode,tree,depth); }
depth-=1;
}
}
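// Illustrative sketch (exampleTreeWalk is not part of the library): walking a
// three-node tree rooted at index 2 visits both leaves with their depth.
function exampleTreeWalk(){
var demoTree=[
{"parent": 2, "leftChild": -1, "rightChild": -1, "name": "a"},
{"parent": 2, "leftChild": -1, "rightChild": -1, "name": "b"},
{"parent": -1, "leftChild": 0, "rightChild": 1, "name": "root"}
];
binaryTreeWalk(2, demoTree, 0,
function (node,tree,depth) { console.log(tree[node].name+" at depth "+depth); }, // leaf action
null, null); // no down/up actions
}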
// Community detection algorithms and recursion
function addClosestPairByCommunity(tree,deltaQ,a){
// Newman"s communities algorithm, http://arxiv.org/abs/cond-mat/0309508v1
// Find the largest deltaQ for each row, and overall
// Where Newman et al keep the H as max-heaps, we rebuild H from scratch for code clarity.
// Semi-sparse: We still make H arrays for sorting but do it in one pass from the sparse deltaQ.
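// Returns {"value": dQ of the join just made, "array": pruned deltaQ map},
// or null when no further join is possible.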
// logTime("start step");
var n=tree.length;
var H={};
for(var hash in deltaQ){
var dQ=deltaQ[hash];
var keys=hash.split("~");
var i=Number(keys[0]);
var j=Number(keys[1]);
if (!H[i]) { H[i]=[]; }
if (!H[j]) { H[j]=[]; }
H[i].push({"dQ": dQ, "i": i, "j": j});
H[j].push({"dQ": dQ, "i": i, "j": j});
}
// logTime("assign H");
// Find nodes to join
var Hmax={"dQ":-Infinity, "i": -1, "j": -1};
for(var i=0;i<n;i++){
if (H[i]){
H[i].sort(function (p,q) { // (p,q) rather than (a,b), to avoid shadowing the a[] array
return q.dQ-p.dQ;
}); // Full sort, overkill but native & clean
// The [0] element in each H array now has the largest deltaQ for that row
if(H[i][0].dQ>Hmax.dQ){
Hmax.dQ=H[i][0].dQ;
Hmax.i =H[i][0].i;
Hmax.j =H[i][0].j;
}
}
}
// logTime("find Hmax");
// Diagnostic info
// console("&nbsp;("+Hmax.i+","+Hmax.j+") -> "+n+"&nbsp; &nbsp; &Delta;Q = "+Hmax.dQ.toFixed(3));
// On full recursion, unweighted datasets can end up with degenerate small subsets, trapped here.
if (Hmax.i==-1) {
return null;
}
// Create a combined node. The tree[] is needed only for later drawing;
// all the work here is done with the deltaQ[].
var wt=tree[Hmax.i].weight+tree[Hmax.j].weight;
tree.push({"parent": -1, "leftChild": Hmax.i, "rightChild": Hmax.j, "weight": wt, "dQ": Hmax.dQ});
tree[Hmax.i].parent=n; // n = the new one we just created
tree[Hmax.j].parent=n;
// Update all deltaQ, Clauset eq 10a-c
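// dQ(ij,k) = dQ(i,k) + dQ(j,k) if k connects to both i and j (10a)
// dQ(ij,k) = dQ(i,k) - 2*a_j*a_k if k connects to i only (10b)
// dQ(ij,k) = dQ(j,k) - 2*a_i*a_k if k connects to j only (10c)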
var hashToZap=[]; // Remember the deltaQ for the nodes we're joining, to null out later.
for(var k=0;k<n;k++){
if(k!=Hmax.i && k!=Hmax.j && H[k]){ // H[k]!=null => node still in play
var hashik=Math.min(Hmax.i,k)+"~"+Math.max(Hmax.i,k);
var hashjk=Math.min(Hmax.j,k)+"~"+Math.max(Hmax.j,k);
var hashNew=k+"~"+n;
var t1=deltaQ[hashik];
var t2=deltaQ[hashjk];
// JavaScript treats zero and null/undefined alike as falsy, so some type tricks are needed;
// zero is a valid entry for deltaQ and common for small graphs.
if(!isNaN(t1)){
hashToZap.push(hashik);
if(!isNaN(t2)){
hashToZap.push(hashjk);
deltaQ[hashNew]=t1+t2;
}else{
deltaQ[hashNew]=t1-2.0*a[Hmax.j]*a[k];
}
}else{
if(!isNaN(t2)){
hashToZap.push(hashjk);
deltaQ[hashNew]=t2-2.0*a[Hmax.i]*a[k];
}else{
deltaQ[hashNew]=null; // Important to zap dQ when t1 & t2 undefined
}
}
}
}
// logTime("update dQ");
// Update a[]
a[n]=a[Hmax.i]+a[Hmax.j];
// a[Hmax.i]=0;a[Hmax.j]=0; // No need to zero-out; these a[] are not used again
// Experiments verify that sum a[i] = 1.00 at all stages of agglomeration.
// Remove any deltaQ for nodes now absorbed in this join.
deltaQ[Hmax.i+"~"+Hmax.j]=null;
for(var i=0;i<hashToZap.length;i++){
deltaQ[hashToZap[i]]=null;
}
// Make dQ array smaller by not copying over any dQ set to null above.
var dQcopy={};
var ndq=0;
for(var hash in deltaQ){
if (deltaQ[hash]) {
dQcopy[hash]=deltaQ[hash];
ndq++;
}
}
// logTime("add a[], prune dQ");
return {"value": Hmax.dQ, "array": dQcopy};
}
function buildTreeByCommunities(dataObj,showNotes){
// Implement the fast Newman method, as in Clauset, Newman, Moore, arXiv:cond-mat/0408187v2
var n=dataObj.names.length;
var k=[];
for(var i=0;i<n;i++){
k[i]=0;
}
var m=0;
var W=0;
if(dataObj.useWeights){
for(var hash in dataObj.distances){
var keys=hash.split("~");
var i=Number(keys[0]);
var j=Number(keys[1]);
var d=dataObj.distances[hash];
k[i]+=d;
k[j]+=d;
W+=d;
m+=1;
}
if(!W){ W=1e-7; } // Avoids a divide-by-zero with disconnected components
var inv2m=1/(2*W);
}else{
for(var hash in dataObj.distances){
var keys=hash.split("~");
var i=Number(keys[0]);
var j=Number(keys[1]);
k[i]+=1;
k[j]+=1;
m+=1;
}
if(!m){ m=1e-7; } // Avoids a divide-by-zero with disconnected components
var inv2m=1/(2*m);
}
// See Berry et al arXiv:0903.1072v2 eq 16; note the 2x difference between Clauset and Berry.
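// The code below stores twice the Clauset-style initial value for each edge (i,j):
// unweighted: dQ(i,j) = 2*( 1/(2m) - k_i*k_j/(2m)^2 )
// weighted: dQ(i,j) = 2*( w_ij/(2W) - k_i*k_j/(2W)^2 ), with W = total edge weight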
var deltaQ={};
for(var hash in dataObj.distances){
var keys=hash.split("~");
var i=Number(keys[0]);
var j=Number(keys[1]);
if(dataObj.useWeights){
deltaQ[hash]=2.0*inv2m*dataObj.distances[hash] - 2.0*inv2m*inv2m*k[i]*k[j];
}else{
deltaQ[hash]=2.0*(inv2m-k[i]*k[j]*inv2m*inv2m); // 2x assures identical Q for unweighted datasets
}
}
var a=[];
for(var i=0;i<n;i++){
a[i]=inv2m*k[i];
}
// var s=0;for(var i=0;i<a.length;i++){s+=a[i]};alert(s); // always 1.00 but good to check
// Initialize the binary tree, used only for later display.
var tree=[];
for(var i=0;i<n;i++){
tree.push({"parent": -1, "leftChild": -1 ,"rightChild": -1, "weight": 1, "linkage": a[i], "name": dataObj.names[i], "primaryKey": i});
}
// logTime("initialize k,a,dQ");
// Do the actual agglomerative joining
var Q=0.0;
var maxQ=-Infinity;
var dQobj={"value": 0, "array": deltaQ};
var numCommunities=1;
while(dQobj && tree.length<(2*n-1)) {
dQobj=addClosestPairByCommunity(tree,dQobj.array,a);
if(dQobj) {
Q+=dQobj.value;
if(dQobj.value<0){ numCommunities+=1; }
maxQ=Math.max(maxQ,Q);
}else{
// We hit a small degenerate subset -- another stop to recursion
return null;
}
}
// showTimes(); // diagnostic for speed optimization
// Assign index = sequence of traverse for coloring
var index = 0;
var root=tree.length - 1;
binaryTreeWalk(root, tree, 0, function (currentNode,tree,depth){
tree[currentNode].index=index;
index+=1;
},null,null);
if(showNotes){
var notes=n+" nodes, "+m+" of "+(n*(n-1)/2)+" possible edges ("+Math.round(200*m/(n*(n-1)))+"%) ";
notes+="with data, and "+numCommunities+" primary communities identified.";
notes+="&nbsp; &nbsp; Q="+maxQ.toFixed(3);
document.getElementById("notes").innerHTML=notes;
}
return {"tree": tree, "distances": dataObj.distances, "root": root, "names": dataObj.names, "useWeights": dataObj.useWeights};
}
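// Illustrative sketch (exampleBuildTree is not part of the library): a 4-node
// graph with two heavy pairs. Keys of distances are "i~j" with i < j.
function exampleBuildTree(){
var dataObj={
"names": ["0","1","2","3"],
"method": "newman",
"useWeights": true,
"distances": {"0~1": 5, "2~3": 5, "1~2": 1}
};
return buildTreeByCommunities(dataObj,false); // {tree, root, names, distances, useWeights}
}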
function findSplits(treeObj){
// The treeObj has dQ info from the Newman joining, so we can
// identify communities based on when dQ went negative.
var breakNext = true;
var breakNodes = [];
var g = -1;
var group = [];
var members = "";
var tracker = 0;
binaryTreeWalk(treeObj.root, treeObj.tree, 0, function (node,tree,depth) {
if(breakNext) {
g += 1;
breakNodes.push(node);
breakNext = false;
}
group[node] = g;
members += treeObj.tree[node].name + ",";
},
function(node, tree, depth) {
var thisNode = tree[node];
if(thisNode.dQ < 0){
breakNext = true;
tracker = 0;
}
tracker += 1;
},
function(node, tree, depth) {
var thisNode = tree[node];
tracker -= 1;
if (tracker == 1){
breakNext = true;
members += "~";
}
});
// //Search for any nodes that aren't in any group and put them in one
// var createdGroupForLoners = false, lonersCount = 0;
// treeObj.names.forEach(function (d, i) {
// if (!group.hasOwnProperty(i)) {
// if (members[members.length-1]!="~" && !createdGroupForLoners) {
// members+="~"
// }
// group[i] = g+1;
// createdGroupForLoners = true;
// members+=d + ",";
// lonersCount++;
// }
// })
// //If we created a new group for loners increase the group size
// if (createdGroupForLoners) {
// g+=1;
// members+="~";
// console.log("netClustering lonersCount=" + lonersCount);
// }
var numGroups = g + 1;
members = members.slice(0, -2);
members = members.replace(/,~/g, "~");
// members has the names of nodes, comma separated within groups and "~" between groups
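// Example (illustrative): if the joins merged {a,b} and {c,d} first and only
// their final join had dQ<0, findSplits yields numGroups=2, group mapping both
// pairs, and members="a,b~c,d".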
return {"numGroups": numGroups, "group": group, "members": members, "breakNodes": breakNodes};
}
function findSubCommunities(treeObj,depth,prevGroup){
if (!treeObj) { return; }
var tree = treeObj.tree;
var root = treeObj.root;
var names = treeObj.names;
// Identify the communities in the data
var splitInfo = findSplits(treeObj);
var numGroups = splitInfo.numGroups;
var group = splitInfo.group;
var t = splitInfo.members.split("~"); // string to array
var groups = [];
for(var g = 0; g < t.length; g++) {
groups.push((t[g]).split(","));
}
// groups is now a set of nested arrays of names, what we return
// Split the original distance & name tables into separate dataObj for each group
// so we can repeat the analysis recursively.
var dataObjList = [];
for (var g = 0; g < numGroups; g++){
dataObjList.push({"names":[],"distances":[],"useWeights":treeObj.useWeights});
}
var nameKeys=[]; // New groups must have new hash codes, thus new index keys
for(var i=0;i<names.length;i++){
var name=names[i];
dataObjList[group[i]].names.push(name); // Assign names to groups in one pass
nameKeys[i]=dataObjList[group[i]].names.length-1;
}
for(var hash in treeObj.distances){
var keys=hash.split("~");
var i=Number(keys[0]);
var j=Number(keys[1]);
if(group[i]==group[j]){
// These two are in the same group so keep this distance value
var newHash=nameKeys[i]+"~"+nameKeys[j];
dataObjList[group[i]].distances[newHash]=treeObj.distances[hash];
}
}
// Now we have separate dataObj's for each community. Do the analysis on each.
if(numGroups>1){
for(var g=0;g<numGroups;g++){
// This is where CNM is called for the sub-community
var innerTreeObj=buildTreeByCommunities(dataObjList[g]);
if(innerTreeObj && innerTreeObj.tree){
var innerGroups=findSplits(innerTreeObj).numGroups;
// Simplest stopping rule -- communities of one
if(innerGroups>1){
// Recursion: find any sub-groups
var subgroups=findSubCommunities(innerTreeObj,depth+1,g);
// Replace the current group with its set of subgroups, which builds a tree.
groups[g]=subgroups;
}
}
}
}
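// Example return (illustrative): [["0","1"],[["2","3"],["4","5"]]], i.e. one
// flat community plus one that recursively split into two sub-communities.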
return groups;
}
//Exports
netClustering.buildTreeByCommunities = buildTreeByCommunities;
netClustering.findSubCommunities = findSubCommunities;
// buildTreeByCommunities expects a dataObj of the form
// {names: Array of strings, distances: Object mapping "i~j" to weight, method: "newman", useWeights: true}
// cluster receives nodes and edges in the d3 format, clusters them, and returns the clusters.
// The nodes should be a list of objects that each contain at least an id attribute,
// and the edges should be a list of objects {source: index, target: index, count}
netClustering.cluster = function (nodes, edges, clusterAttr, edgesCountAttr) {
var dataObj = {},
treeObj, groups = [], i;
if (clusterAttr === undefined) {
clusterAttr="cluster";
}
if (edgesCountAttr === undefined) {
edgesCountAttr="value";
}
var linksForClustering = [];
//Do edges come with pointers to the nodes or just indexes?
if (edges.length>0 && typeof(edges[0].source)==="object") {
linksForClustering = edges.map(function (d) {
var sourceId=nodes.indexOf(d.source),
targetId=nodes.indexOf(d.target);
if (sourceId===-1 || targetId===-1) {
return null;
} else {
return {
source: sourceId,
target: targetId,
count : d[edgesCountAttr]!==undefined ? d[edgesCountAttr] : 1
};
}
});
} else {
//copy the links to ensure they have counts
edges.forEach(function (d) {
linksForClustering.push({
source: d.source,
target: d.target,
count: d[edgesCountAttr]!==undefined ? d[edgesCountAttr] : 1
});
});
}
dataObj.method = "newman";
dataObj.useWeights = true;
dataObj.names = nodes.map(function (d, i) { return ""+i; });
// dataObj.names = nodes;
dataObj.distances = {};
linksForClustering = linksForClustering.filter(function (d) { return d.source !== d.target; }); //avoid loops
linksForClustering.forEach(function (d) {
var hash = Math.min(d.source, d.target) + "~" + Math.max(d.source, d.target); // hash always has left key < right key
dataObj.distances[hash] = +d.count;
});
dataObj = addDummyMetaNode(dataObj);
treeObj=buildTreeByCommunities(dataObj, false);
groups=findSubCommunities(treeObj,0,0);
groups=removeDummyMetaNode(groups);
groups.forEach(function (d, i) {
function assignToCluster(ele, i) {
if (ele instanceof Array) {
ele.forEach(function (e) {
assignToCluster(e, i);
});
} else {
nodes[+ele][clusterAttr] = i.toString();
}
}
assignToCluster(d, i);
});
return groups;
};
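// Usage sketch (hypothetical data, not part of the library): after the call
// each node carries a string group id in the attribute named by clusterAttr.
function exampleCluster(){
var nodes=[{"id": "a"},{"id": "b"},{"id": "c"},{"id": "d"}];
var edges=[
{"source": 0, "target": 1, "value": 3},
{"source": 2, "target": 3, "value": 3},
{"source": 1, "target": 2, "value": 1}
];
netClustering.cluster(nodes, edges); // defaults: clusterAttr="cluster", edgesCountAttr="value"
return nodes.map(function (d) { return d.cluster; }); // e.g. ["0","0","1","1"]
}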
function addDummyMetaNode(dataObj) {
//Add a dummy node linked to all the nodes to guarantee that the graph is connected
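// E.g. with 3 original nodes the dummy takes index 3 and adds hashes
// "0~3", "1~3" and "2~3", each with weight 0.1.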
dataObj.names.push("DUMMY");
dataObj.names.forEach(function (d, i) {
if (i===dataObj.names.length-1) {
return;
}
var hash = i+"~"+ (dataObj.names.length-1);
dataObj.distances[hash] = 0.1;
});
return dataObj;
}
function removeDummyMetaNode(groups) {
var i, ele;
for (i = 0; i < groups.length; i++) {
ele = groups[i];
if (ele instanceof Array) {
ele = removeDummyMetaNode(ele);
if (ele.length===0) {
//DUMMY was on an empty cluster, delete it
groups.splice(i, 1);
}
} else {
if (ele === "DUMMY" || ele === "DUMM") {
groups.splice(i, 1);//Remove
break;
}
}
}
return groups;
}
})();