Skip to content

Instantly share code, notes, and snippets.

@MNoichl
Last active May 4, 2018 17:47
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save MNoichl/5e78bfcc9c57329362bd0794e90504fc to your computer and use it in GitHub Desktop.
Save MNoichl/5e78bfcc9c57329362bd0794e90504fc to your computer and use it in GitHub Desktop.
test-tsne
license: mit
2.0 6.094098448068141 15.60462332764012
3.0 5.025815402832417 15.23494344194712
4.0 5.263199487005986 15.1056213251487
5.0 4.891657492989145 14.982829414717433
6.0 4.709492354136956 14.905587842458099
7.0 5.143533078325832 14.693552112932617
8.0 4.8764989589400045 14.66691663114093
9.0 4.8333751618645096 14.516567432196636
10.0 4.603408631388721 14.440741525661632
11.0 4.462243132206957 14.337444987117259
12.0 4.436149903184536 14.361224075743785
13.0 4.626023700329075 14.163712514620748
'use strict';
// MODULES //
var isObject = require( 'validate.io-object' ),
isFunction = require( 'validate.io-function' ),
isNumber = require( 'validate.io-number-primitive' ),
isArrayArray = require( 'validate.io-array-array' ),
isString = require( 'validate.io-string-primitive' );
var manhattan = require('compute-manhattan-distance' ),
euclidean = require( 'compute-euclidean-distance' ),
canberra = require( 'compute-canberra-distance' ),
chebyshev = require( 'compute-chebyshev-distance' ),
minkowski = require( 'compute-minkowski-distance' ),
cosine = require( 'compute-cosine-distance');
// PDIST //
/**
* FUNCTION: pdist( X[, options] )
* Computes the pairwise distances of the elements in X
*
* @param {Array} x - input array of arrays
* @param {Object} [options] - function options
* @param {String} [options.distance='euclidean'] - specifies which ditance function to use
* @param {Function} [options.accessor] - accessor function for accessing object array values
* @param {Number} [options.p=2] - norm order for Minkowski distance
* @returns {Object} an array holding the pairwise distances with helper methods "get" and "toMatrix"
*/
function pdist( X, opts ) {
var distance = euclidean, D, i, j, n, p = 2, val, clbk;
if ( !isArrayArray( X ) ) {
throw new TypeError( 'pdist()::invalid input argument. First argument must be an array of arrays. Value: `' + X + '`.' );
}
n = X.length;
D = [];
if ( arguments.length > 1 ) {
if ( !isObject( opts ) ) {
throw new TypeError( 'pdist()::invalid input argument. Options argument must be an object. Value: `' + opts + '`.' );
}
if ( opts.hasOwnProperty( 'p' ) ) {
p = opts.p;
if ( !isNumber( p ) || p <= 0 ) {
throw new TypeError( 'pdist()::invalid option. `p` option must be a positive number primitive. Option: `' + p + '`.' );
}
}
if ( opts.hasOwnProperty( 'accessor' ) ) {
clbk = opts.accessor;
if ( !isFunction( clbk ) ) {
throw new TypeError( 'pdist()::invalid option. Accessor must be a function. Option: `' + clbk + '`.' );
}
}
if ( opts.hasOwnProperty( 'distance' ) ) {
if ( !isString( opts.distance ) ) {
throw new TypeError( 'pdist()::invalid option. Distance must be a string. Option: `' + opts.distance + '`.' );
}
switch (opts.distance) {
case 'canberra':
distance = canberra;
break;
case 'chebyshev':
distance = chebyshev;
break;
case 'cosine':
distance = cosine;
break;
case 'euclidean':
distance = euclidean;
break;
case 'manhattan':
distance = manhattan;
break;
case 'minkowski':
distance = minkowski;
break;
}
}
}
for ( i = 0; i < n; i++ ) {
for ( j = i + 1; j < n; j++ ) {
if ( distance === minkowski ) {
if ( clbk ) {
val = distance( X[i], X[j], {'p': p, 'accessor': clbk } );
} else {
val = distance( X[i], X[j], {'p': p} );
}
} else {
if ( clbk ) {
val = distance( X[i], X[j], clbk );
} else {
val = distance( X[i], X[j] );
}
}
D.push( val );
}
}
/**
* METHOD: get( i, j )
* retrieves the pairwise distance of element i and j, where i < j
* @param {Number} i - index of first element
* @param {Number} j - index of second element
* @returns {Number} distance between i and j
*/
Object.defineProperty( D, 'get', {
enumerable: false,
configurable: true,
value: function( i, j ) {
var index = (i === 0) ? j - 1 : i * (n - 1) - (i * ( i - 1 ) / 2) + j - i - 1;
return this[ index ];
}
});
return D;
} // end FUNCTION pdist()
// EXPORTS //
module.exports = pdist;
<!DOCTYPE html>
<head>
<meta charset="utf-8">
<script src="https://d3js.org/d3.v4.min.js"></script>
<script src="tsne.js"></script>
<style>
body { margin:0;position:fixed;top:0;right:0;bottom:0;left:0; }
</style>
</head>
<body>
<script>
// Feel free to change or delete any of the code you see in this editor!
var svg = d3.select("body").append("svg")
.attr("width", 960)
.attr("height", 500)
var opt = {}
opt.epsilon = 10; // epsilon is learning rate (10 = default)
opt.perplexity = 30; // roughly how many neighbors each point influences (30 = default)
opt.dim = 2; // dimensionality of the embedding (2 = default)
var tsne = new tsnejs.tSNE(opt); // create a tSNE instance
// initialize data. Here we have 3 points and some example pairwise dissimilarities
d3.csv("data.csv", function(data) {
dists = [[0,2,3],[0,4,6]]
tsne.initDataRaw(dists);
for(var k = 0; k < 500; k++) {
tsne.step(); // every time you call this, solution gets better
}
var Y = tsne.getSolution(); // Y is an array of 2-D points that you can plot
console.log("Y[0]")
console.log("test: "+typeof(Y))
console.log(Y[0][0])
svg.selectAll("dot")
.data(Y)
.enter().append("circle")
.attr("r", 3.5)
.attr("cx", function(d,i) { return Y[i][0]+100; })
.attr("cy", function(d,i) { return Y[i][1]+100; });
//function(d,i) { return y(d.close); }
}
)
</script>
</body>
// create main global object
var tsnejs = tsnejs || { REVISION: 'ALPHA' };
(function(global) {
"use strict";
// utility function
var assert = function(condition, message) {
if (!condition) { throw message || "Assertion failed"; }
}
// syntax sugar
var getopt = function(opt, field, defaultval) {
if(opt.hasOwnProperty(field)) {
return opt[field];
} else {
return defaultval;
}
}
// return 0 mean unit standard deviation random number
var return_v = false;
var v_val = 0.0;
var gaussRandom = function() {
if(return_v) {
return_v = false;
return v_val;
}
var u = 2*Math.random()-1;
var v = 2*Math.random()-1;
var r = u*u + v*v;
if(r == 0 || r > 1) return gaussRandom();
var c = Math.sqrt(-2*Math.log(r)/r);
v_val = v*c; // cache this for next function call for efficiency
return_v = true;
return u*c;
}
// return random normal number
var randn = function(mu, std){ return mu+gaussRandom()*std; }
// utilitity that creates contiguous vector of zeros of size n
var zeros = function(n) {
if(typeof(n)==='undefined' || isNaN(n)) { return []; }
if(typeof ArrayBuffer === 'undefined') {
// lacking browser support
var arr = new Array(n);
for(var i=0;i<n;i++) { arr[i]= 0; }
return arr;
} else {
return new Float64Array(n); // typed arrays are faster
}
}
// utility that returns 2d array filled with random numbers
// or with value s, if provided
var randn2d = function(n,d,s) {
var uses = typeof s !== 'undefined';
var x = [];
for(var i=0;i<n;i++) {
var xhere = [];
for(var j=0;j<d;j++) {
if(uses) {
xhere.push(s);
} else {
xhere.push(randn(0.0, 1e-4));
}
}
x.push(xhere);
}
return x;
}
// compute L2 distance between two vectors
var L2 = function(x1, x2) {
var D = x1.length;
var d = 0;
for(var i=0;i<D;i++) {
var x1i = x1[i];
var x2i = x2[i];
d += (x1i-x2i)*(x1i-x2i);
}
return d;
}
// compute pairwise distance in all vectors in X
var xtod = function(X) {
var N = X.length;
var dist = zeros(N * N); // allocate contiguous array
for(var i=0;i<N;i++) {
for(var j=i+1;j<N;j++) {
var d = L2(X[i], X[j]);
dist[i*N+j] = d;
dist[j*N+i] = d;
}
}
return dist;
}
// compute (p_{i|j} + p_{j|i})/(2n)
var d2p = function(D, perplexity, tol) {
var Nf = Math.sqrt(D.length); // this better be an integer
var N = Math.floor(Nf);
assert(N === Nf, "D should have square number of elements.");
var Htarget = Math.log(perplexity); // target entropy of distribution
var P = zeros(N * N); // temporary probability matrix
var prow = zeros(N); // a temporary storage compartment
for(var i=0;i<N;i++) {
var betamin = -Infinity;
var betamax = Infinity;
var beta = 1; // initial value of precision
var done = false;
var maxtries = 50;
// perform binary search to find a suitable precision beta
// so that the entropy of the distribution is appropriate
var num = 0;
while(!done) {
//debugger;
// compute entropy and kernel row with beta precision
var psum = 0.0;
for(var j=0;j<N;j++) {
var pj = Math.exp(- D[i*N+j] * beta);
if(i===j) { pj = 0; } // we dont care about diagonals
prow[j] = pj;
psum += pj;
}
// normalize p and compute entropy
var Hhere = 0.0;
for(var j=0;j<N;j++) {
if(psum == 0) {
var pj = 0;
} else {
var pj = prow[j] / psum;
}
prow[j] = pj;
if(pj > 1e-7) Hhere -= pj * Math.log(pj);
}
// adjust beta based on result
if(Hhere > Htarget) {
// entropy was too high (distribution too diffuse)
// so we need to increase the precision for more peaky distribution
betamin = beta; // move up the bounds
if(betamax === Infinity) { beta = beta * 2; }
else { beta = (beta + betamax) / 2; }
} else {
// converse case. make distrubtion less peaky
betamax = beta;
if(betamin === -Infinity) { beta = beta / 2; }
else { beta = (beta + betamin) / 2; }
}
// stopping conditions: too many tries or got a good precision
num++;
if(Math.abs(Hhere - Htarget) < tol) { done = true; }
if(num >= maxtries) { done = true; }
}
// console.log('data point ' + i + ' gets precision ' + beta + ' after ' + num + ' binary search steps.');
// copy over the final prow to P at row i
for(var j=0;j<N;j++) { P[i*N+j] = prow[j]; }
} // end loop over examples i
// symmetrize P and normalize it to sum to 1 over all ij
var Pout = zeros(N * N);
var N2 = N*2;
for(var i=0;i<N;i++) {
for(var j=0;j<N;j++) {
Pout[i*N+j] = Math.max((P[i*N+j] + P[j*N+i])/N2, 1e-100);
}
}
return Pout;
}
// helper function
function sign(x) { return x > 0 ? 1 : x < 0 ? -1 : 0; }
var tSNE = function(opt) {
var opt = opt || {};
this.perplexity = getopt(opt, "perplexity", 30); // effective number of nearest neighbors
this.dim = getopt(opt, "dim", 2); // by default 2-D tSNE
this.epsilon = getopt(opt, "epsilon", 10); // learning rate
this.iter = 0;
}
tSNE.prototype = {
// this function takes a set of high-dimensional points
// and creates matrix P from them using gaussian kernel
initDataRaw: function(X) {
var N = X.length;
var D = X[0].length;
assert(N > 0, " X is empty? You must have some data!");
assert(D > 0, " X[0] is empty? Where is the data?");
var dists = xtod(X); // convert X to distances using gaussian kernel
this.P = d2p(dists, this.perplexity, 1e-4); // attach to object
this.N = N; // back up the size of the dataset
this.initSolution(); // refresh this
},
// this function takes a given distance matrix and creates
// matrix P from them.
// D is assumed to be provided as a list of lists, and should be symmetric
initDataDist: function(D) {
var N = D.length;
assert(N > 0, " X is empty? You must have some data!");
// convert D to a (fast) typed array version
var dists = zeros(N * N); // allocate contiguous array
for(var i=0;i<N;i++) {
for(var j=i+1;j<N;j++) {
var d = D[i][j];
dists[i*N+j] = d;
dists[j*N+i] = d;
}
}
this.P = d2p(dists, this.perplexity, 1e-4);
this.N = N;
this.initSolution(); // refresh this
},
// (re)initializes the solution to random
initSolution: function() {
// generate random solution to t-SNE
this.Y = randn2d(this.N, this.dim); // the solution
this.gains = randn2d(this.N, this.dim, 1.0); // step gains to accelerate progress in unchanging directions
this.ystep = randn2d(this.N, this.dim, 0.0); // momentum accumulator
this.iter = 0;
},
// return pointer to current solution
getSolution: function() {
return this.Y;
},
// perform a single step of optimization to improve the embedding
step: function() {
this.iter += 1;
var N = this.N;
var cg = this.costGrad(this.Y); // evaluate gradient
var cost = cg.cost;
var grad = cg.grad;
// perform gradient step
var ymean = zeros(this.dim);
for(var i=0;i<N;i++) {
for(var d=0;d<this.dim;d++) {
var gid = grad[i][d];
var sid = this.ystep[i][d];
var gainid = this.gains[i][d];
// compute gain update
var newgain = sign(gid) === sign(sid) ? gainid * 0.8 : gainid + 0.2;
if(newgain < 0.01) newgain = 0.01; // clamp
this.gains[i][d] = newgain; // store for next turn
// compute momentum step direction
var momval = this.iter < 250 ? 0.5 : 0.8;
var newsid = momval * sid - this.epsilon * newgain * grad[i][d];
this.ystep[i][d] = newsid; // remember the step we took
// step!
this.Y[i][d] += newsid;
ymean[d] += this.Y[i][d]; // accumulate mean so that we can center later
}
}
// reproject Y to be zero mean
for(var i=0;i<N;i++) {
for(var d=0;d<this.dim;d++) {
this.Y[i][d] -= ymean[d]/N;
}
}
//if(this.iter%100===0) console.log('iter ' + this.iter + ', cost: ' + cost);
return cost; // return current cost
},
// for debugging: gradient check
debugGrad: function() {
var N = this.N;
var cg = this.costGrad(this.Y); // evaluate gradient
var cost = cg.cost;
var grad = cg.grad;
var e = 1e-5;
for(var i=0;i<N;i++) {
for(var d=0;d<this.dim;d++) {
var yold = this.Y[i][d];
this.Y[i][d] = yold + e;
var cg0 = this.costGrad(this.Y);
this.Y[i][d] = yold - e;
var cg1 = this.costGrad(this.Y);
var analytic = grad[i][d];
var numerical = (cg0.cost - cg1.cost) / ( 2 * e );
console.log(i + ',' + d + ': gradcheck analytic: ' + analytic + ' vs. numerical: ' + numerical);
this.Y[i][d] = yold;
}
}
},
// return cost and gradient, given an arrangement
costGrad: function(Y) {
var N = this.N;
var dim = this.dim; // dim of output space
var P = this.P;
var pmul = this.iter < 100 ? 4 : 1; // trick that helps with local optima
// compute current Q distribution, unnormalized first
var Qu = zeros(N * N);
var qsum = 0.0;
for(var i=0;i<N;i++) {
for(var j=i+1;j<N;j++) {
var dsum = 0.0;
for(var d=0;d<dim;d++) {
var dhere = Y[i][d] - Y[j][d];
dsum += dhere * dhere;
}
var qu = 1.0 / (1.0 + dsum); // Student t-distribution
Qu[i*N+j] = qu;
Qu[j*N+i] = qu;
qsum += 2 * qu;
}
}
// normalize Q distribution to sum to 1
var NN = N*N;
var Q = zeros(NN);
for(var q=0;q<NN;q++) { Q[q] = Math.max(Qu[q] / qsum, 1e-100); }
var cost = 0.0;
var grad = [];
for(var i=0;i<N;i++) {
var gsum = new Array(dim); // init grad for point i
for(var d=0;d<dim;d++) { gsum[d] = 0.0; }
for(var j=0;j<N;j++) {
cost += - P[i*N+j] * Math.log(Q[i*N+j]); // accumulate cost (the non-constant portion at least...)
var premult = 4 * (pmul * P[i*N+j] - Q[i*N+j]) * Qu[i*N+j];
for(var d=0;d<dim;d++) {
gsum[d] += premult * (Y[i][d] - Y[j][d]);
}
}
grad.push(gsum);
}
return {cost: cost, grad: grad};
}
}
global.tSNE = tSNE; // export tSNE class
})(tsnejs);
// export the library to window, or to module in nodejs
(function(lib) {
"use strict";
if (typeof module === "undefined" || typeof module.exports === "undefined") {
window.tsnejs = lib; // in ordinary browser attach library to window
} else {
module.exports = lib; // in nodejs
}
})(tsnejs);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment