Skip to content

Instantly share code, notes, and snippets.

@dno89
Created March 29, 2022 00:20
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dno89/4a0efa22469738daa748dd9f56774ff6 to your computer and use it in GitHub Desktop.
Save dno89/4a0efa22469738daa748dd9f56774ff6 to your computer and use it in GitHub Desktop.
Naive implementation of K-means for 2D points
#include <iostream>
#include <vector>
#include <random>
#include <algorithm>
#include <numeric>
#include <cstdlib>
#include <Eigen/Dense>
#define T(X) cerr << #X": " << X << endl;
template<typename T>
std::ostream& operator<<(std::ostream& out, const std::vector<T>& v) {
for(const auto& x : v) {
out << x << ", ";
}
return out;
}
using namespace std;
// Typedefs
using Point2 = Eigen::Vector2d;
vector<Point2> GeneratePoints(const vector<Point2>& means, size_t points_per_mean) {
default_random_engine dre;
vector<Point2> result;
result.reserve(means.size()*points_per_mean);
for(const auto& mean : means) {
normal_distribution<> gaussian_noise(0.0, 1.0);
for(size_t ii = 0; ii < points_per_mean; ++ii) {
result.push_back(
mean + Point2{gaussian_noise(dre), gaussian_noise(dre)}
);
}
}
random_shuffle(result.begin(), result.end());
return result;
}
bool CheckConvergence(const vector<Point2>& old_means, const vector<Point2>& new_means, double distance_threshold) {
double residual = 0.0;
for(size_t ii = 0; ii < old_means.size(); ++ii) {
residual += (old_means[ii]-new_means[ii]).squaredNorm();
}
return residual < distance_threshold;
}
std::pair<vector<size_t>, vector<Point2>> KMeansClusters(const vector<Point2>& points, size_t k) {
vector<size_t> assigned_clusters(points.size()); // size N
generate(assigned_clusters.begin(), assigned_clusters.end(), [k](){ return rand()%k; });
// T(assigned_clusters)
vector<Point2> clusters_mean; // size k
auto assign = [&](){
for(size_t ii = 0; ii < points.size(); ++ii) {
const auto& p = points[ii];
auto closest_cluster = min_element(clusters_mean.begin(), clusters_mean.end(),
[&p](const Point2& m1, const Point2& m2) { return (m1-p).squaredNorm() < (m2-p).squaredNorm();} );
assigned_clusters[ii] = std::distance(clusters_mean.begin(), closest_cluster);
}
};
auto update = [&](){
vector<Point2> result(k, Point2{0.0, 0.0});
// T(result)
vector<size_t> clusters_counter(k, 0);
for(size_t ii = 0; ii < points.size(); ++ii) {
result[assigned_clusters[ii]] += points[ii];
++clusters_counter[assigned_clusters[ii]];
}
// T(clusters_counter)
for(size_t ii = 0; ii < result.size(); ++ii) {
if(clusters_counter[ii] > 0)
result[ii] /= clusters_counter[ii];
}
return result;
};
clusters_mean = update();
// T(clusters_mean)
while(true) {
assign();
auto new_means = update();
// T(new_means)
if(CheckConvergence(new_means, clusters_mean, 1e-3)) {
break;
}
clusters_mean = std::move(new_means);
};
return {std::move(assigned_clusters), std::move(clusters_mean)};
}
int main(int, char**) {
const auto points = GeneratePoints({{0.0, 0.0}, {10.0, 10.0}, {5.0, 5.0}}, 10);
const auto& [clusters, means] = KMeansClusters(points, 3);
// T(points)
T(clusters)
T(means)
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment