t-Distributed Stochastic Neighbor Embedding (t-SNE) is a (prize-winning) technique for dimensionality reduction that is particularly well suited for the visualization of high-dimensional datasets. The technique can be implemented via Barnes-Hut approximations, allowing it to be applied on large real-world datasets.
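As a quick illustration of the technique described above, here is a minimal sketch (not part of the gist below, and assuming scikit-learn and matplotlib are available) that reduces scikit-learn's digits dataset to two t-SNE components and plots them:

```python
# Minimal t-SNE sketch; the digits dataset stands in for any high-dimensional data.
from sklearn.datasets import load_digits
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

digits = load_digits()  # 1797 samples, 64 features
embedding = TSNE(n_components=2).fit_transform(digits.data)  # Barnes-Hut is scikit-learn's default method

plt.scatter(embedding[:, 0], embedding[:, 1], c=digits.target, s=5, cmap="tab10")
plt.title("t-SNE embedding of the digits dataset")
plt.show()
```

The gist below uses this kind of t-SNE lens inside a Mapper implementation (KeplerMapper) to build and visualize a simplicial complex of the data.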
import numpy as np
from collections import defaultdict
import json
import itertools
from sklearn import cluster, preprocessing, manifold
from datetime import datetime


class KeplerMapper(object):
    def __init__(self, cluster_algorithm=cluster.DBSCAN(eps=0.5, min_samples=3), nr_cubes=10,
                 overlap_perc=0.1, scaler=preprocessing.MinMaxScaler(), reducer=None,
                 color_function="distance_origin", link_local=False, verbose=1):
        self.clf = cluster_algorithm
        self.nr_cubes = nr_cubes
        self.overlap_perc = overlap_perc
        self.scaler = scaler
        self.color_function = color_function
        self.verbose = verbose
        self.link_local = link_local
        self.reducer = reducer
        self.chunk_dist = []
        self.overlap_dist = []
        self.d = []
        if self.verbose > 0:
            print("\nnr_cubes = %s \n\noverlap_perc = %s\n\nlink_local = %s\n\nClusterer = %s\n\nScaler = %s\n\n" % (
                self.nr_cubes, overlap_perc, self.link_local, str(self.clf), str(self.scaler)))

    def fit_transform(self, X):
        # Dimensionality reduction
        if self.reducer is not None:
            if self.verbose > 0:
                self.reducer.set_params(**{"verbose": self.verbose})
                print("\n..Reducing Dimensionality using: \n\t%s\n" % str(self.reducer))
            reducer = self.reducer
            X = reducer.fit_transform(X)
        # Scaling
        if self.scaler is not None:
            if self.verbose > 0:
                print("\n..Scaling\n")
            scaler = self.scaler
            X = scaler.fit_transform(X)
        # We chop up the min-max column ranges into 'nr_cubes' parts
        self.chunk_dist = (np.max(X, axis=0) - np.min(X, axis=0)) / self.nr_cubes
        # We calculate the overlapping windows distance
        self.overlap_dist = self.overlap_perc * self.chunk_dist
        # We find our starting point
        self.d = np.min(X, axis=0)
        return X
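
    # A worked example of the cover computed in fit_transform (hypothetical
    # numbers, not from this script): with nr_cubes=10 and overlap_perc=0.1,
    # a column scaled to [0, 1] gives chunk_dist = (1 - 0)/10 = 0.1,
    # overlap_dist = 0.1 * 0.1 = 0.01 and d = 0, so cube j along that column
    # covers the half-open interval [0.1*j, 0.1*j + 0.11).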

    def map(self, X, dimension_index=[0], dimension_name=""):
        # Maps the data to a simplicial complex. Returns a dictionary with nodes and links.
        start = datetime.now()

        def cube_coordinates_all(nr_cubes, nr_dimensions):
            # If there are 4 cubes per dimension and 3 dimensions,
            # return the bottom-left (origin) coordinates of the 64 hypercubes,
            # as a sorted list of NumPy arrays.
            l = []
            for x in range(nr_cubes):
                l += [x] * nr_dimensions
            return [np.array(list(f)) for f in sorted(set(itertools.permutations(l, nr_dimensions)))]
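
        # For example, cube_coordinates_all(2, 2) yields the four origins
        # array([0, 0]), array([0, 1]), array([1, 0]) and array([1, 1]).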

        nodes = defaultdict(list)
        links = defaultdict(list)
        complex = {}
        if self.verbose > 0:
            print("Mapping on data shaped %s using dimensions %s\n" % (str(X.shape), str(dimension_index)))
        # Scaling
        if self.scaler is not None:
            scaler = self.scaler
            X = scaler.fit_transform(X)
        # Initialize the cluster algorithm
        clf = self.clf
        # Prefix the data with ID's
        ids = np.array([x for x in range(X.shape[0])])
        X = np.c_[ids, X]
        # Subdivide the data X in intervals/hypercubes with overlap
        if self.verbose > 0:
            total_cubes = len(cube_coordinates_all(self.nr_cubes, len(dimension_index)))
            print("Creating %s hypercubes." % total_cubes)
        di = np.array(dimension_index)
        for i, coor in enumerate(cube_coordinates_all(self.nr_cubes, di.shape[0])):
            # Slice the hypercube
            hypercube = X[np.invert(np.any((X[:, di + 1] >= self.d[di] + (coor * self.chunk_dist[di])) &
                                           (X[:, di + 1] < self.d[di] + (coor * self.chunk_dist[di]) + self.chunk_dist[di] + self.overlap_dist[di]) == False, axis=1))]
            if self.verbose > 1:
                print("There are %s points in cube_%s / %s with starting range %s" %
                      (hypercube.shape[0], i, total_cubes, self.d[di] + (coor * self.chunk_dist[di])))
            # If there is at least one sample inside the hypercube
            if hypercube.shape[0] > 0:
                # Cluster the data points inside the cube, skipping the id-column
                clf.fit(hypercube[:, 1:])
                if self.verbose > 1:
                    print("Found %s clusters in cube_%s\n" % (np.unique(clf.labels_[clf.labels_ > -1]).shape[0], i))
                # Now for every (sample id in cube, predicted cluster label)
                for a in np.c_[hypercube[:, 0], clf.labels_]:
                    if a[1] != -1:  # if not predicted as noise
                        # Rudimentary cluster id
                        cluster_id = str(coor[0]) + "_" + str(i) + "_" + str(a[1]) + "_" + str(coor) + "_" + str(self.d[di] + (coor * self.chunk_dist[di]))
                        nodes[cluster_id].append(int(a[0]))  # Append the member id's as integers
            else:
                if self.verbose > 1:
                    print("Cube_%s is empty.\n" % (i))
        # Create links when clusters from different hypercubes have members with the same sample id.
        for k in nodes:
            for kn in nodes:
                if k != kn:
                    if len(nodes[k] + nodes[kn]) != len(set(nodes[kn] + nodes[k])):  # there are non-unique id's in the union
                        links[k].append(kn)
                    # Create links between local hypercube clusters if link_local = True
                    if self.link_local:
                        if k.split("_")[0] == kn.split("_")[0]:
                            links[k].append(kn)
        # Reporting
        if self.verbose > 0:
            nr_links = 0
            for k in links:
                nr_links += len(links[k])
            print("\ncreated %s edges and %s nodes in %s." % (nr_links, len(nodes), str(datetime.now() - start)))
        complex["nodes"] = nodes
        complex["links"] = links
        complex["meta"] = dimension_name
        return complex
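
    # The returned 'complex' dictionary has the shape
    #   {"nodes": {cluster_id: [row indices of member samples, ...], ...},
    #    "links": {cluster_id: [ids of overlapping clusters, ...], ...},
    #    "meta": dimension_name}
    # where each cluster_id encodes the hypercube index, the cluster label and
    # the cube's starting range (e.g. "0_3_0.0_[0 3]_[0.  0.3]", illustrative values only).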

    def visualize(self, complex, path_html="mapper_visualization_output.html", title="My Data",
                  graph_link_distance=30, graph_gravity=0.1, graph_charge=-120, custom_tooltips=None):
        # Turns the dictionary 'complex' into an html file with d3.js
        # Format JSON
        json_s = {}
        json_s["nodes"] = []
        json_s["links"] = []
        k2e = {}  # a key-to-incremental-int dict, used for id's when linking
        for e, k in enumerate(complex["nodes"]):
            # Tooltip formatting
            if custom_tooltips is not None:
                tooltip_s = "<h2>Cluster %s</h2>" % k + " ".join([str(f) for f in custom_tooltips[complex["nodes"][k]]])
                if self.color_function == "average_signal_cluster":
                    tooltip_i = int(((sum([f for f in custom_tooltips[complex["nodes"][k]]]) / len(custom_tooltips[complex["nodes"][k]])) * 30))
                    json_s["nodes"].append({"name": str(k), "tooltip": tooltip_s, "group": 2 * int(np.log(len(complex["nodes"][k]))), "color": str(tooltip_i)})
                else:
                    json_s["nodes"].append({"name": str(k), "tooltip": tooltip_s, "group": 2 * int(np.log(len(complex["nodes"][k]))), "color": str(k.split("_")[0])})
            else:
                tooltip_s = "<h2>Cluster %s</h2>Contains %s members." % (k, len(complex["nodes"][k]))
                json_s["nodes"].append({"name": str(k), "tooltip": tooltip_s, "group": 2 * int(np.log(len(complex["nodes"][k]))), "color": str(k.split("_")[0])})
            k2e[k] = e
        for k in complex["links"]:
            for link in complex["links"][k]:
                json_s["links"].append({"source": k2e[k], "target": k2e[link], "value": 1})
        with open(path_html, "wb") as outfile:
html = """<!DOCTYPE html> | |
<meta charset="utf-8"> | |
<meta name="generator" content="KeplerMapper"> | |
<title>%s | KeplerMapper</title> | |
<link href='https://fonts.googleapis.com/css?family=Roboto:700,300' rel='stylesheet' type='text/css'> | |
<style> | |
* {margin: 0; padding: 0;} | |
html { height: 100%%;} | |
body {background: #111; height: 100%%; font: 100 16px Roboto, Sans-serif;} | |
.link { stroke: #999; stroke-opacity: .333; } | |
.divs div { border-radius: 50%%; background: red; position: absolute; } | |
.divs { position: absolute; top: 0; left: 0; } | |
#holder { position: relative; width: 1300px; height: 900px; background: #111; display: block;} | |
h1 { padding: 20px; color: #fafafa; text-shadow: 0px 1px #000,0px -1px #000; position: absolute; font: 300 30px Roboto, Sans-serif;} | |
h2 { text-shadow: 0px 1px #000,0px -1px #000; font: 700 16px Roboto, Sans-serif;} | |
p { position: absolute; opacity: 0.9; width: 220px; top: 80px; left: 20px; display: block; background: #000; line-height: 25px; color: #fafafa; border: 20px solid #000; font: 100 16px Roboto, Sans-serif;} | |
div.tooltip { position: absolute; width: 380px; display: block; padding: 20px; background: #000; border: 0px; border-radius: 3px; pointer-events: none; z-index: 999; color: #FAFAFA;} | |
} | |
</style> | |
<body> | |
<div id="holder"> | |
<h1>%s</h1> | |
<p> | |
<b>Lens</b><br>%s<br><br> | |
<b>Cubes per dimension</b><br>%s<br><br> | |
<b>Overlap percentage</b><br>%s%%<br><br> | |
<b>Linking locally</b><br>%s<br><br> | |
<b>Color Function</b><br>%s( %s )<br><br> | |
<b>Clusterer</b><br>%s<br><br> | |
<b>Scaler</b><br>%s | |
</p> | |
</div> | |
<script src="https://cdnjs.cloudflare.com/ajax/libs/d3/3.5.5/d3.min.js"></script> | |
<script> | |
var width = 1300, | |
height = 900; | |
var color = d3.scale.ordinal() | |
.domain(["0","1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13","14","15","16","17","18","19","20","21","22","23","24","25","26","27","28","29","30"]) | |
.range(["#FF0000","#FF1400","#FF2800","#FF3c00","#FF5000","#FF6400","#FF7800","#FF8c00","#FFa000","#FFb400","#FFc800","#FFdc00","#FFf000","#fdff00","#b0ff00","#65ff00","#17ff00","#00ff36","#00ff83","#00ffd0","#00e4ff","#00c4ff","#00a4ff","#00a4ff","#0084ff","#0064ff","#0044ff","#0022ff","#0002ff","#0100ff","#0300ff","#0500ff"]); | |
var force = d3.layout.force() | |
.charge(%s) | |
.linkDistance(%s) | |
.gravity(%s) | |
.size([width, height]); | |
var svg = d3.select("#holder").append("svg") | |
.attr("width", width) | |
.attr("height", height); | |
var div = d3.select("#holder").append("div") | |
.attr("class", "tooltip") | |
.style("opacity", 0.0); | |
var divs = d3.select('#holder').append('div') | |
.attr('class', 'divs') | |
.attr('style', function(d) { return 'overflow: hidden; width: ' + width + 'px; height: ' + height + 'px;'; }); | |
graph = %s; | |
force | |
.nodes(graph.nodes) | |
.links(graph.links) | |
.start(); | |
var link = svg.selectAll(".link") | |
.data(graph.links) | |
.enter().append("line") | |
.attr("class", "link") | |
.style("stroke-width", function(d) { return Math.sqrt(d.value); }); | |
var node = divs.selectAll('div') | |
.data(graph.nodes) | |
.enter().append('div') | |
.on("mouseover", function(d) { | |
div.transition() | |
.duration(200) | |
.style("opacity", .9); | |
div .html(d.tooltip + "<br/>") | |
.style("left", (d3.event.pageX + 100) + "px") | |
.style("top", (d3.event.pageY - 28) + "px"); | |
}) | |
.on("mouseout", function(d) { | |
div.transition() | |
.duration(500) | |
.style("opacity", 0); | |
}) | |
.call(force.drag); | |
node.append("title") | |
.text(function(d) { return d.name; }); | |
force.on("tick", function() { | |
link.attr("x1", function(d) { return d.source.x; }) | |
.attr("y1", function(d) { return d.source.y; }) | |
.attr("x2", function(d) { return d.target.x; }) | |
.attr("y2", function(d) { return d.target.y; }); | |
node.attr("cx", function(d) { return d.x; }) | |
.attr("cy", function(d) { return d.y; }) | |
.attr('style', function(d) { return 'width: ' + (d.group * 2) + 'px; height: ' + (d.group * 2) + 'px; ' + 'left: '+(d.x-(d.group))+'px; ' + 'top: '+(d.y-(d.group))+'px; background: '+color(d.color)+'; box-shadow: 0px 0px 3px #111; box-shadow: 0px 0px 33px '+color(d.color)+', inset 0px 0px 5px rgba(0, 0, 0, 0.2);'}) | |
; | |
}); | |
</script>"""%(title,title,complex["meta"],self.nr_cubes,self.overlap_perc*100,self.link_local,self.color_function,complex["meta"],str(self.clf),str(self.scaler),graph_charge,graph_link_distance,graph_gravity,json.dumps(json_s)) | |
outfile.write(html.encode("utf-8")) | |
if self.verbose > 0: | |
print("\nWrote d3.js graph to '%s'"%path_html) | |
if __name__ == "__main__": | |
# Get data | |
import pandas as pd | |
train = pd.read_csv("./train.csv") | |
features = [c for c in train.columns if c not in "label"] | |
data = np.array(train[features])[:4000] | |
y = np.array(train["label"])[:4000] | |
# Create images for a custom tooltip array | |
from io import BytesIO | |
from scipy.misc import imsave, toimage | |
import base64 | |
tooltip_s = [] | |
for image_data in data: | |
output = BytesIO() | |
img = toimage(image_data.reshape((28,28))) # Data was a flat row of 64 "pixels". | |
img.save(output, format="PNG") | |
contents = "<img src='data:image/png;base64,"+base64.encodebytes(output.getvalue()).decode()+"'>" | |
tooltip_s.append( contents ) | |
output.close() | |
tooltip_s = np.array(tooltip_s) # need to make sure to feed it as a NumPy array, not a list | |
# Initialize to use t-SNE with 2 components (reduces data to 2 dimensions). Also note high overlap_percentage. | |
mapper = KeplerMapper(cluster_algorithm=cluster.DBSCAN(eps=0.3, min_samples=15), | |
reducer = manifold.TSNE(), nr_cubes=35, overlap_perc=0.5, | |
link_local=False, verbose=1) | |
# Fit and transform data | |
data = mapper.fit_transform(data) | |
# Create the graph | |
complex = mapper.map(data, dimension_index=[0,1], dimension_name="t-SNE(2) 2D") | |
# Create the visualizations (increased the graph_gravity for a tighter graph-look.) | |
# Tooltips with image data for every cluster member | |
mapper.visualize(complex, "results.html", "t-SNE on first 4k images", graph_charge=-50, graph_gravity=0.15, custom_tooltips=tooltip_s) |
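
The `__main__` block above expects a Kaggle-style `train.csv` of MNIST digits. For a self-contained smoke test without that file, the same pipeline can run on synthetic data; the following is a sketch with illustrative parameter values (`make_circles`, `eps=0.1`, `nr_cubes=10`, etc. are assumptions, not part of the gist):

```python
# Hypothetical smoke test of KeplerMapper on synthetic 2-D data.
from sklearn import cluster, datasets

data, _ = datasets.make_circles(n_samples=2000, noise=0.05, factor=0.3)

mapper = KeplerMapper(cluster_algorithm=cluster.DBSCAN(eps=0.1, min_samples=5),
                      reducer=None,  # data is already 2-D, so no t-SNE lens is needed
                      nr_cubes=10, overlap_perc=0.2, verbose=1)

data = mapper.fit_transform(data)  # scales the data with the default MinMaxScaler
graph = mapper.map(data, dimension_index=[0, 1], dimension_name="make_circles 2D")
mapper.visualize(graph, "circles.html", "Synthetic circles")
```

Opening the resulting HTML file in a browser shows the force-directed d3.js graph of the complex.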