Last active
April 14, 2016 11:46
-
-
Save dongguosheng/6aa308c6f5787325fe8d to your computer and use it in GitHub Desktop.
a simple k-means implementation, use c++11 and openmp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#ifndef KMEANS_H | |
#define KMEANS_H | |
#include <vector> | |
#include <set> | |
#include <random> | |
#include <ctime> | |
#include <cfloat> | |
#include <cassert> | |
#include <cmath> | |
#include <iostream> | |
#include <omp.h> | |
template<typename T> | |
struct FeatDense { | |
std::vector<T> data; | |
size_t len; | |
FeatDense(size_t len) : len(len) { | |
data = std::vector<T>(len, static_cast<T>(0)); | |
} | |
FeatDense(const std::vector<T> &data) : data(data) { | |
len = data.size(); | |
} | |
T operator[](size_t i) const { return data[i]; } | |
size_t size(void) const { return data.size(); } | |
FeatDense& operator+=(const FeatDense<T> &right) { | |
assert(data.size() == right.size()); | |
for(size_t i = 0; i < data.size(); ++ i) { | |
data[i] += right[i]; | |
} | |
return *this; | |
} | |
FeatDense& operator/(size_t n) { | |
float num = static_cast<float>(n); | |
for(size_t i = 0; i < data.size(); ++ i) { | |
data[i] /= num; | |
} | |
return *this; | |
} | |
}; | |
template<typename T> | |
struct Distance { | |
virtual float eval(const FeatDense<T> &left, const FeatDense<T> &right) = 0; | |
}; | |
template<typename T> | |
struct DotProduct : Distance<T> { | |
float eval(const FeatDense<T> &left, const FeatDense<T> &right) { | |
assert(left.size() == right.size()); | |
float dist = 0.0f; | |
for(size_t i = 0; i < left.size(); ++ i) { | |
dist += left[i] * right[i]; | |
} | |
return -dist; | |
} | |
}; | |
template<typename T> | |
struct Euclidean : Distance<T> { | |
float eval(const FeatDense<T> &left, const FeatDense<T> &right) { | |
assert(left.size() == right.size()); | |
float dist = 0.0f; | |
for(size_t i = 0; i < left.size(); ++ i) { | |
float tmp = static_cast<float>(std::abs(left[i] - right[i])); | |
dist += tmp * tmp; | |
} | |
return dist; | |
} | |
}; | |
const static int THREAD_NUM = omp_get_max_threads(); | |
template<typename T> | |
class Kmeans { | |
public: | |
typedef FeatDense<T> FeatDenseT; | |
typedef Distance<T> DistanceT; | |
Kmeans(size_t k) : k(k) { | |
centroids.reserve(k); | |
cluster_rs.reserve(k); | |
} | |
Kmeans(size_t k, std::vector<FeatDenseT> &data, DistanceT *dist) : k(k), data(data), distance(dist) { | |
centroids.reserve(k); | |
cluster_rs = std::vector<int>(data.size(), -1); | |
cluster_sizes = std::vector<int>(k, 0); | |
} | |
virtual ~Kmeans() {} | |
void SetData(const std::vector<FeatDenseT> &data) { | |
data = data; | |
} | |
void SetDistance(const DistanceT *dist) { | |
dist = dist; | |
} | |
void InitCentroids(void) { | |
std::set<size_t> centroid_idx_set; | |
std::mt19937 rng(static_cast<unsigned int>(time(NULL))); | |
std::uniform_int_distribution<size_t> ud(0, data.size() - 1); | |
size_t idx = 0; | |
while(centroid_idx_set.size() < k) { | |
idx = ud(rng); | |
if(centroid_idx_set.find(idx) == centroid_idx_set.end()) { | |
centroids.push_back(data[idx]); | |
centroid_idx_set.insert(idx); | |
} | |
} | |
} | |
void ClusterOnce(void) { | |
// need reset all cluster result ? | |
for(size_t i = 0; i < cluster_sizes.size(); ++ i) { | |
cluster_sizes[i] = 0; | |
} | |
# pragma omp parallel for num_threads(THREAD_NUM) | |
for(int i = 0; i < static_cast<int>(data.size()); ++ i) { | |
float dist, dist_min; | |
int idx_min; | |
dist_min = FLT_MAX; | |
idx_min = 0; | |
for(int j = 0; j < static_cast<int>(centroids.size()); ++ j) { | |
dist = distance->eval(data[i], centroids[j]); | |
if(dist < dist_min) { | |
idx_min = j; | |
dist_min = dist; | |
} | |
} | |
cluster_rs[i] = idx_min; | |
cluster_sizes[idx_min] += 1; | |
} | |
} | |
void UpdateCentroids(void) { | |
for(size_t i = 0; i < centroids.size(); ++ i) { | |
centroids[i] = FeatDenseT(data[0].size()); | |
} | |
for(int i = 0; i < cluster_rs.size(); ++ i) { | |
centroids[cluster_rs[i]] += data[i]; | |
} | |
for(size_t i = 0; i < centroids.size(); ++ i) { | |
centroids[i] = centroids[i] / cluster_sizes[i]; | |
} | |
} | |
void Cluster(size_t n_iter) { | |
std::cout << "THREAD_NUM: " << THREAD_NUM << std::endl; | |
if(k < 1 || k > data.size()) return; | |
InitCentroids(); | |
for(size_t i = 0; i < n_iter; ++ i) { | |
std::cout << "iter: " << i << std::endl; | |
ClusterOnce(); | |
UpdateCentroids(); | |
} | |
} | |
std::vector<FeatDenseT> GetCentroid() const { | |
return centroids; | |
} | |
std::vector<int> GetClusters() const { | |
return cluster_rs; | |
} | |
private: | |
size_t k; | |
std::vector<FeatDenseT> data; | |
std::vector<FeatDenseT> centroids; | |
std::vector<int> cluster_rs; | |
std::vector<int> cluster_sizes; | |
DistanceT *distance; | |
Kmeans(const Kmeans &other) {} | |
Kmeans& operator=(const Kmeans &other) {} | |
}; | |
#endif /*KMEANS_H*/ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment