Skip to content

Instantly share code, notes, and snippets.

@foota
Created May 14, 2012 18:32
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save foota/2695542 to your computer and use it in GitHub Desktop.
Save foota/2695542 to your computer and use it in GitHub Desktop.
Hierarchical clustering
clusters_tbb.cpp
// hierarchical clustering using the TBB library
// by nox, 2009.07.02
//
// Linux - Intel C++
// icpc clusters_tbb.cpp -O3 -ltbb -o clusters_tbb
//
// Linux - GCC
// g++ clusters_tbb.cpp -O3 -ltbb -o clusters_tbb
//
// Windows - Microsoft Visual Studio 2008
// cl clusters_tbb.cpp /EHsc /Ox /MD
#include <iostream>
#include <fstream>
#include <sstream>
#include <vector>
#include <algorithm>
#include <utility>
#include <cmath>
#include "tbb/task_scheduler_init.h"
#include "tbb/blocked_range.h"
#include "tbb/parallel_for.h"
#include "tbb/parallel_reduce.h"
#include "tbb/parallel_sort.h"
using namespace std;
using namespace tbb;
double calc_distance(const vector<double>& vec1, const vector<double>& vec2)
{
vector<double>::const_iterator p, q;
double s = 0.0;
for (p = vec1.begin(), q = vec2.begin(); p != vec1.end(); ++p, ++q)
s += (*p - *q) * (*p - *q);
return sqrt(s);
}
static vector<int> table_i;
struct bicluster {
bicluster* left;
bicluster* right;
int id;
vector<int> vec_ids;
double distance;
bicluster(bicluster* left_, bicluster* right_, int id_, vector<int> vec_ids_, double distance_) :
left(left_), right(right_), id(id_), vec_ids(vec_ids_), distance(distance_) { }
};
typedef vector<bicluster*>::iterator vec_t;
typedef pair<vec_t, vec_t> pair_t;
typedef pair<int, int> distance_pair_t;
typedef vector<double> distances_t;
class CompleteLink {
public:
double dist;
const vector<int>& clust;
const distances_t& distances;
CompleteLink(double d_, const vector<int>& clust_, const distances_t& distances_)
: dist(d_), clust(clust_), distances(distances_) { }
CompleteLink(const CompleteLink& cl, split) : dist(cl.dist), clust(cl.clust), distances(cl.distances) { }
void operator()(const blocked_range<vector<int>::const_iterator>& r) {
for (vector<int>::const_iterator p = r.begin(); p != r.end(); ++p) {
for (vector<int>::const_iterator q = clust.begin(); q != clust.end(); ++q) {
int i = min(*p, *q);
int j = max(*p, *q);
dist = max(distances[table_i[i] + j], dist);
}
}
}
void join(const CompleteLink& cl) {
dist = max(dist, cl.dist);
}
};
double calc_cluster_distance(const vector<int>& clust_i, const vector<int>& clust_j, const distances_t& distances)
{
CompleteLink cl(0.0, clust_j, distances);
parallel_reduce(blocked_range<vector<int>::const_iterator>(clust_i.begin(), clust_i.end(), 100), cl);
return cl.dist;
}
class ReduceHCluster {
public:
const vector<vector<double> >& data;
const vector<bicluster*>& clusters;
distances_t& distances;
pair_t lowest_pair;
double closest;
ReduceHCluster(const vector<vector<double> >& data_, const vector<bicluster*>& clusters_, distances_t& distances_, pair_t lowest_pair_, double closest_)
: data(data_), clusters(clusters_), distances(distances_), lowest_pair(lowest_pair_), closest(closest_) { }
ReduceHCluster(const ReduceHCluster& hc, split)
: data(hc.data), clusters(hc.clusters), distances(hc.distances), lowest_pair(hc.lowest_pair), closest(hc.closest) { }
void operator()(const blocked_range<vec_t>& r) {
for (vec_t p = r.begin(); p != r.end(); ++p) {
for (vec_t q = p + 1; q != clusters.end(); ++q) {
int i = min((*p)->id, (*q)->id);
int j = max((*p)->id, (*q)->id);
if (distances[table_i[i] + j] < 0)
distances[table_i[i] + j] = calc_cluster_distance((*p)->vec_ids, (*q)->vec_ids, distances);
double d = distances[table_i[i] + j];
if (d < closest) {
closest = d;
lowest_pair = make_pair(p, q);
}
}
}
}
void join(const ReduceHCluster& hc) {
if (closest > hc.closest) {
closest = hc.closest;
lowest_pair = hc.lowest_pair;
}
}
};
class HClusterDistances {
private:
const vector<vector<double> >& data;
int n;
distances_t& distances;
public:
HClusterDistances(const vector<vector<double> >& data_, int n_, distances_t& distances_)
: data(data_), n(n_), distances(distances_) { }
void operator()(const blocked_range<int>& r) const {
for (int i = r.begin(); i != r.end(); i++)
for (int j = i + 1; j < n; j++)
distances[table_i[i] + j] = calc_distance(data[i], data[j]);
}
};
inline int index_i(int n, int max_n)
{
int s = 0;
for (int i = 0; i < n; i++) s += max_n - i;
return s - n - 1;
}
void hcluster(const vector<vector<double> >& data, vector<bicluster*>& clusters)
{
int n = clusters.size() * 2;
for (int i = 0; i < n; i++) table_i.push_back(index_i(i, n - 1));
// distance between i and j: distances[table_i(i) + j]
distances_t distances(n * (n - 1) / 2, -1.0);
int current_clust_id = clusters.size();
parallel_for(blocked_range<int>(0, clusters.size() - 1, 100), HClusterDistances(data, clusters.size(), distances));
while (clusters.size() > 1) {
cerr << clusters.size() << endl;
pair_t lowest_pair = make_pair(clusters.begin(), clusters.begin() + 1);
double closest = calc_cluster_distance((*lowest_pair.first)->vec_ids, (*lowest_pair.second)->vec_ids, distances);
ReduceHCluster hc(data, clusters, distances, lowest_pair, closest);
parallel_reduce(blocked_range<vec_t>(clusters.begin(), clusters.end(), 50), hc);
closest = hc.closest;
lowest_pair = hc.lowest_pair;
vector<int> vec_ids((*lowest_pair.first)->vec_ids);
vec_ids.insert(vec_ids.end(), (*lowest_pair.second)->vec_ids.begin(), (*lowest_pair.second)->vec_ids.end());
vector<int>().swap((*lowest_pair.second)->vec_ids);
vector<int>().swap((*lowest_pair.first)->vec_ids);
bicluster* new_cluster = new bicluster(*lowest_pair.first, *lowest_pair.second, current_clust_id, vec_ids, closest);
current_clust_id++;
clusters.erase(lowest_pair.second);
clusters.erase(lowest_pair.first);
clusters.push_back(new_cluster);
}
}
void read_data(const string fname, vector<vector<double> >& data)
{
fstream fs(fname.c_str(), ios_base::in);
string line;
stringstream ss;
vector<double> v;
double value;
while (getline(fs, line)) {
for (ss.str(line), v.clear(); ss >> value; v.push_back(value));
ss.clear();
data.push_back(v);
}
}
void store_clusters(const bicluster* cluster, vector<pair<double, distance_pair_t> >& cluster_data, int max_n)
{
static int n = 0;
if (cluster->left && cluster->right)
cluster_data.push_back(make_pair(cluster->distance, make_pair(
cluster->left->id < max_n ? cluster->left->id : -(cluster->left->id - max_n + 1),
cluster->right->id < max_n ? cluster->right->id : -(cluster->right->id - max_n + 1))));
if (cluster->left) store_clusters(cluster->left, cluster_data, max_n);
if (cluster->right) store_clusters(cluster->right, cluster_data, max_n);
}
void write_clusters(const string fname, const vector<pair<double, distance_pair_t> >& cluster_data)
{
fstream fs(fname.c_str(), ios_base::out | ios_base::trunc);
for (vector<pair<double, distance_pair_t> >::const_iterator p = cluster_data.begin(); p != cluster_data.end(); ++p) {
fs << p->first << " " << p->second.first << " " << p->second.second << endl;
}
}
int main(int argc, char** argv)
{
if (argc < 3) {
cerr << "Hierarchical clustering using the TBB library." << endl;
cerr << endl;
cerr << " Usage: " << argv[0] << " input_data_file output_cluster_file" << endl;
exit(1);
}
task_scheduler_init init;
vector<vector<double> > data;
vector<bicluster*> clusters;
read_data(argv[1], data);
for (int i = 0; i < data.size(); i++)
clusters.push_back(new bicluster(NULL, NULL, i, vector<int>(1, i), 0.0));
hcluster(data, clusters);
vector<pair<double, distance_pair_t> > cluster_data;
store_clusters(clusters[0], cluster_data, data.size());
parallel_sort(cluster_data.begin(), cluster_data.end());
write_clusters(argv[2], cluster_data);
return 0;
}
#!/usr/bin/env python
"""
Draw dendrogram for hierarchical clustering
by nox, 2009.07.02
"""
import sys, os
from PIL import Image, ImageDraw
HEAD_LENGTH = 50
TAIL_LENGTH = 150
WIDTH = 1200
HEIGHT = 20
class bicluster:
def __init__(self, left=None, right=None, distance=0.0, id=None):
self.left = left
self.right = right
self.id = id
self.distance = distance
def find_cluster(clust, id):
for i, c in enumerate(clust):
if c.id == id: return i
return None
def read_clusters(fname, clust, labels):
has_labels = False
if labels: has_labels = True
lines = file(fname).readlines()
num = len(lines) + 1
for i in range(num):
clust.append(bicluster(id=i))
if not has_labels: labels.append(i)
for i, l in enumerate(lines):
i = -(i + 1)
data = l.strip().split()
left_id = find_cluster(clust, int(data[1]))
right_id = find_cluster(clust, int(data[2]))
clust.append(bicluster(id=i,
left=clust[left_id],
right=clust[right_id],
distance=float(data[0])))
del clust[right_id]
del clust[left_id]
def get_height(clust):
if clust.left == None and clust.right == None: return 1
return get_height(clust.left) + get_height(clust.right)
def get_depth(clust):
if clust.left == None and clust.right == None: return 0
return max(get_depth(clust.left), get_depth(clust.right)) + clust.distance
def draw_dendrogram(clust, labels, image_file):
h = get_height(clust) * HEIGHT
depth = clust.distance
scaling = float(WIDTH - (HEAD_LENGTH + TAIL_LENGTH)) / depth
img = Image.new("RGB", (WIDTH, h), (255, 255, 255))
draw = ImageDraw.Draw(img)
draw.line((0, h / 2, HEAD_LENGTH, h / 2), fill=(255, 0, 0))
draw_node(draw, clust, HEAD_LENGTH, (h / 2), scaling, labels)
img.save(image_file)
def draw_node(draw, clust, x, y, scaling, labels):
if clust.id < 0:
h1 = get_height(clust.left) * HEIGHT
h2 = get_height(clust.right) * HEIGHT
top = y - (h1 + h2) / 2
bottom = y + (h1 + h2) / 2
ll = clust.distance * scaling
lll = ll - clust.left.distance * scaling
llr = ll - clust.right.distance * scaling
draw.line((x, top + h1 / 2, x, bottom - h2 / 2), fill=(255, 0, 0))
draw.line((x, top + h1 / 2, x + lll, top + h1 / 2), fill=(255, 0, 0))
draw.line((x, bottom - h2 / 2, x + llr, bottom - h2 / 2), fill=(255, 0, 0))
draw_node(draw, clust.left, x + lll, top + h1 / 2, scaling, labels)
draw_node(draw, clust.right, x + llr, bottom - h2 / 2, scaling, labels)
else:
draw.line((x, y, WIDTH - TAIL_LENGTH, y), fill=(255, 0, 0))
draw.text((WIDTH - TAIL_LENGTH + 5, y - 7), str(labels[clust.id]), (0, 0, 0))
def main(args):
if len(args) < 2:
print >>sys.stderr, "Draw dendrogram for hierarchical clustering."
print >>sys.stderr
print >>sys.stderr, " Usage: %s cluster_data_file [image_file=clusters.jpg label_data_file=[]]" % os.path.basename(args[0])
sys.exit(1)
labels = []
if len(args) > 2: image_file = args[2]
else: image_file = "clusters.jpg"
if len(args) > 3:
for l in file(args[3]):
labels.append(l.strip())
clust = []
read_clusters(args[1], clust, labels)
draw_dendrogram(clust[0], labels, image_file)
if __name__ == "__main__": main(sys.argv)
#!/usr/bin/env python
"""
Grouping clustering data
by nox, 2009.07.02
"""
import sys, os, math, re
def expand_group(clust, n):
group = []
for x in clust[n][1:]:
if x >= 0: group.append(x)
else: group.extend(expand_group(clust, -(x + 1)))
return group
def arrange_cluster(clust, threshold):
tt = [[i, data[1:]] for i, data in enumerate(clust) if data[0] > threshold]
t = []
for d in tt:
for x in d[1]:
if -x < tt[0][0] + 1: t.append(x)
groups = []
for x in t:
g = []
if x >= 0: g = [x]
else: g = expand_group(clust, -(x + 1))
groups.append(g)
if not groups: groups = [range(1, len(clust) + 2)]
return groups
def read_clust(fname, clust_data):
for l in file(fname):
data = l.strip().split()
clust_data.append((float(data[0]), int(data[1]), int(data[2])))
def output_group(fname, groups):
f = file(fname, "w")
count = 1
for c in groups:
f.write("%03d:" % count)
for d in c: f.write(" %d" % d)
f.write("\n")
count += 1
f.close()
def main(args):
if len(args) < 4:
print >>sys.stderr, "Grouping clustering data."
print >>sys.stderr
print >>sys.stderr, " Usage: %s cluster_data_file output_group_file threshold" % os.path.basename(args[0])
sys.exit(1)
clust_file = args[1]
group_file = args[2]
threshold = float(args[3])
print "Input cluster file: %s" % clust_file
print "Output group file : %s" % group_file
print "Threshold : %f" % threshold
clust_data = []
read_clust(clust_file, clust_data)
groups = arrange_cluster(clust_data, threshold)
print "Number of elements: %d" % sum([len(x) for x in groups])
print "Number of clusters: %d" % len(groups)
output_group(group_file, groups)
if __name__ == "__main__": main(sys.argv)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment