Skip to content

Instantly share code, notes, and snippets.

@dongguosheng
Last active April 14, 2016 11:46
Show Gist options
  • Save dongguosheng/6aa308c6f5787325fe8d to your computer and use it in GitHub Desktop.
Save dongguosheng/6aa308c6f5787325fe8d to your computer and use it in GitHub Desktop.
a simple k-means implementation, use c++11 and openmp
#ifndef KMEANS_H
#define KMEANS_H
#include <vector>
#include <set>
#include <random>
#include <ctime>
#include <cfloat>
#include <cassert>
#include <cmath>
#include <iostream>
#include <omp.h>
template<typename T>
struct FeatDense {
std::vector<T> data;
size_t len;
FeatDense(size_t len) : len(len) {
data = std::vector<T>(len, static_cast<T>(0));
}
FeatDense(const std::vector<T> &data) : data(data) {
len = data.size();
}
T operator[](size_t i) const { return data[i]; }
size_t size(void) const { return data.size(); }
FeatDense& operator+=(const FeatDense<T> &right) {
assert(data.size() == right.size());
for(size_t i = 0; i < data.size(); ++ i) {
data[i] += right[i];
}
return *this;
}
FeatDense& operator/(size_t n) {
float num = static_cast<float>(n);
for(size_t i = 0; i < data.size(); ++ i) {
data[i] /= num;
}
return *this;
}
};
template<typename T>
struct Distance {
virtual float eval(const FeatDense<T> &left, const FeatDense<T> &right) = 0;
};
template<typename T>
struct DotProduct : Distance<T> {
float eval(const FeatDense<T> &left, const FeatDense<T> &right) {
assert(left.size() == right.size());
float dist = 0.0f;
for(size_t i = 0; i < left.size(); ++ i) {
dist += left[i] * right[i];
}
return -dist;
}
};
template<typename T>
struct Euclidean : Distance<T> {
float eval(const FeatDense<T> &left, const FeatDense<T> &right) {
assert(left.size() == right.size());
float dist = 0.0f;
for(size_t i = 0; i < left.size(); ++ i) {
float tmp = static_cast<float>(std::abs(left[i] - right[i]));
dist += tmp * tmp;
}
return dist;
}
};
const static int THREAD_NUM = omp_get_max_threads();
template<typename T>
class Kmeans {
public:
typedef FeatDense<T> FeatDenseT;
typedef Distance<T> DistanceT;
Kmeans(size_t k) : k(k) {
centroids.reserve(k);
cluster_rs.reserve(k);
}
Kmeans(size_t k, std::vector<FeatDenseT> &data, DistanceT *dist) : k(k), data(data), distance(dist) {
centroids.reserve(k);
cluster_rs = std::vector<int>(data.size(), -1);
cluster_sizes = std::vector<int>(k, 0);
}
virtual ~Kmeans() {}
void SetData(const std::vector<FeatDenseT> &data) {
data = data;
}
void SetDistance(const DistanceT *dist) {
dist = dist;
}
void InitCentroids(void) {
std::set<size_t> centroid_idx_set;
std::mt19937 rng(static_cast<unsigned int>(time(NULL)));
std::uniform_int_distribution<size_t> ud(0, data.size() - 1);
size_t idx = 0;
while(centroid_idx_set.size() < k) {
idx = ud(rng);
if(centroid_idx_set.find(idx) == centroid_idx_set.end()) {
centroids.push_back(data[idx]);
centroid_idx_set.insert(idx);
}
}
}
void ClusterOnce(void) {
// need reset all cluster result ?
for(size_t i = 0; i < cluster_sizes.size(); ++ i) {
cluster_sizes[i] = 0;
}
# pragma omp parallel for num_threads(THREAD_NUM)
for(int i = 0; i < static_cast<int>(data.size()); ++ i) {
float dist, dist_min;
int idx_min;
dist_min = FLT_MAX;
idx_min = 0;
for(int j = 0; j < static_cast<int>(centroids.size()); ++ j) {
dist = distance->eval(data[i], centroids[j]);
if(dist < dist_min) {
idx_min = j;
dist_min = dist;
}
}
cluster_rs[i] = idx_min;
cluster_sizes[idx_min] += 1;
}
}
void UpdateCentroids(void) {
for(size_t i = 0; i < centroids.size(); ++ i) {
centroids[i] = FeatDenseT(data[0].size());
}
for(int i = 0; i < cluster_rs.size(); ++ i) {
centroids[cluster_rs[i]] += data[i];
}
for(size_t i = 0; i < centroids.size(); ++ i) {
centroids[i] = centroids[i] / cluster_sizes[i];
}
}
void Cluster(size_t n_iter) {
std::cout << "THREAD_NUM: " << THREAD_NUM << std::endl;
if(k < 1 || k > data.size()) return;
InitCentroids();
for(size_t i = 0; i < n_iter; ++ i) {
std::cout << "iter: " << i << std::endl;
ClusterOnce();
UpdateCentroids();
}
}
std::vector<FeatDenseT> GetCentroid() const {
return centroids;
}
std::vector<int> GetClusters() const {
return cluster_rs;
}
private:
size_t k;
std::vector<FeatDenseT> data;
std::vector<FeatDenseT> centroids;
std::vector<int> cluster_rs;
std::vector<int> cluster_sizes;
DistanceT *distance;
Kmeans(const Kmeans &other) {}
Kmeans& operator=(const Kmeans &other) {}
};
#endif /*KMEANS_H*/
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment