Skip to content

Instantly share code, notes, and snippets.

@LovelyBuggies
Last active November 17, 2020 14:08
Show Gist options
  • Save LovelyBuggies/0c01d0e43c19cb7c3d1f06a1249754f7 to your computer and use it in GitHub Desktop.
Save LovelyBuggies/0c01d0e43c19cb7c3d1f06a1249754f7 to your computer and use it in GitHub Desktop.
Using Minimum Spanning Tree and K-Means to cluster 2-d points.

In this note, I'll show how to cluster the points into K clusters via MST and K-means.

Note that these two algorithms will generate different results.

MST

#include <iostream>
#include <vector>
#include <string>
#include <cmath>
#include <fstream>
#include <iostream>
#include <sstream>
using namespace std;


struct Point {
	string name;
    double x, y;
    Point(string name, double x, double y) {
    	this->name = name;
    	this->x = x;
    	this->y = y;
    }
};


vector<Point> readtxt() {
    vector<Point> points;
    string line;
    ifstream file("hw6_sample_input.txt");

    while (getline(file, line)) {
        stringstream lineStream(line);
        string bit;
        string name;
        double x, y;
        getline(lineStream, bit, ' ');
        name = bit;
        getline(lineStream, bit, ' ');
        x = stof(bit);
        getline(lineStream, bit, '\n');
        y = stof(bit);

        points.push_back(Point(name, x, y));
    }
    return points;
}


vector< vector<Point> > initializeCluster(vector<Point> points) {
	vector< vector<Point> > clusters;
	for (int i = 0; i < points.size(); i++) {
		clusters.push_back(vector<Point> (1, points[i]));
	}
	return clusters;
}


vector< vector<float> > generateDistanceMap(vector<Point> points) {
	int n = points.size();
	vector< vector<float> > dist(n, vector<float> (n, 100));
	for (int i = 0; i < n; i++) {
		for (int j = i + 1; j < n; j++) {
			dist[i][j] = pow((points[i].x - points[j].x), 2) + pow((points[i].y - points[j].y), 2);
		}
	}
	return dist;
}


void selectNearestNeighbors(vector< vector<float> >& dist, int& x, int& y) {
	int minDist = 100;
	for (int i = 0; i < dist.size(); i++)
		for (int j = i + 1; j < dist[i].size(); j++) 
			if (dist[i][j] < minDist) {
				x = i;
				y = j;
				minDist = dist[i][j];
			}
	dist[x][y] = 100;
	return ;
}


int mergerTwoClusters(vector< vector<Point> >& clusters, Point& pointX, Point& pointY) {
	int a = -1, b = -1;
	for (int i = 0; i < clusters.size(); i++) {
		for (int j = 0; j < clusters[i].size(); j++) {
			if (clusters[i][j].name == pointX.name) a = i;
			if (clusters[i][j].name == pointY.name) b = i;
		}
	}
	if (a == b) return clusters.size();
	for (int i = 0; i < clusters[b].size(); i++)
		clusters[a].push_back(clusters[b][i]);

	clusters.erase(clusters.begin() + b);
	return clusters.size();
}


void printCluster(vector< vector<Point> > clusters) {
	for(int i  = 0; i < clusters.size(); i++) {
		cout << "Cluster " << i << ": ";
		for (int j = 0; j < clusters[i].size(); j++) {
			cout << clusters[i][j].name << ", ";
		}
		cout << endl;
	} 
}


void printDistanceMap(vector< vector<float> > dist) {
	for(int i  = 0; i < dist.size(); i++) {
		for (int j = 0; j < dist[i].size(); j++) {
			cout << dist[i][j] << ", ";
		}
		cout << endl;
	}
}


int main(int argc, char const *argv[])
{	
	int K = 2;
	vector<Point> points = readtxt();
	vector< vector<Point> > clusters = initializeCluster(points);
	vector< vector<float> > dist = generateDistanceMap(points);

	int clusterNum = 100;
	while (clusterNum != K) {
		int x = 0, y = 0;
		selectNearestNeighbors(dist, x, y);

		Point pointX = points[x], pointY = points[y];
		clusterNum = mergerTwoClusters(clusters, pointX, pointY);
	}

	printCluster(clusters);

	return 0;
}

K-Means

#include <ctime>
#include <fstream>
#include <iostream>
#include <sstream>
#include <vector>
#include <string>

using namespace std;


struct Point {
	string name;
    double x, y;
    int cluster;
    double minDist;

    Point() : 
    	name(""),
        x(0.0), 
        y(0.0),
        cluster(-1),
        minDist(__DBL_MAX__) {}
        
    Point(string name, double x, double y) : 
    	name(name),
        x(x), 
        y(y),
        cluster(-1),
        minDist(__DBL_MAX__) {}

    double distance(Point p) {
        return (p.x - x) * (p.x - x) + (p.y - y) * (p.y - y);
    }
};

vector<Point> readtxt() {
    vector<Point> points;
    string line;
    ifstream file("hw6_sample_input.txt");

    while (getline(file, line)) {
        stringstream lineStream(line);
        string bit;
        string name;
        double x, y;
        getline(lineStream, bit, ' ');
        name = bit;
        getline(lineStream, bit, ' ');
        x = stof(bit);
        getline(lineStream, bit, '\n');
        y = stof(bit);

        points.push_back(Point(name, x, y));
    }
    return points;
}

void kMeansClustering(vector<Point>* points, int n, int iterations, int k) {
	vector<Point> centroids;
	srand(time(0));
	// for (int i = 0; i < k; ++i) {
 //    	centroids.push_back(points->at(rand() % n));
	// }
	centroids.push_back(points->at(0));
	centroids.push_back(points->at(14));
	for(int a = 0; a < iterations; a++) {
		for (vector<Point>::iterator c = begin(centroids); c != end(centroids); ++c) {
		    int clusterId = c - begin(centroids);

		    for (vector<Point>::iterator it = points->begin();
		         it != points->end(); ++it) {
		             
		        Point p = *it;
		        double dist = c->distance(p);
		        if (dist < p.minDist) {
		            p.minDist = dist;
		            p.cluster = clusterId;
		        }
		        *it = p;
		    }
		}

		vector<int> nPoints;
		vector<double> sumX, sumY;

		for (int j = 0; j < k; ++j) {
		    nPoints.push_back(0);
		    sumX.push_back(0.0);
		    sumY.push_back(0.0);
		}

		for (vector<Point>::iterator it = points->begin(); 
		    it != points->end(); ++it) {
		    int clusterId = it->cluster;
		    nPoints[clusterId] += 1;
		    sumX[clusterId] += it->x;
		    sumY[clusterId] += it->y;

		    it->minDist = __DBL_MAX__;
		}

		for (vector<Point>::iterator c = begin(centroids); 
		     c != end(centroids); ++c) {
		    int clusterId = c - begin(centroids);
		    c->x = sumX[clusterId] / nPoints[clusterId];
		    c->y = sumY[clusterId] / nPoints[clusterId];
		}
	}
	
	ofstream myfile;
	myfile.open("output.txt");
	myfile << "x,y,c" << endl;
	for (vector<Point>::iterator it = points->begin(); it != points->end(); ++it) {
	    myfile << it->name << it->x << "," << it->y << "," << it->cluster << endl;
	}
	myfile.close();
}


int main(int argc, char const *argv[])
{
	vector<Point> points = readtxt();
	for(int i = 0; i < 100; i ++) {kMeansClustering(&points, 15, 30, 2);}
	return 0;
}

References

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment