Skip to content

Instantly share code, notes, and snippets.

@ialhashim
Last active March 24, 2024 22:04
Show Gist options
  • Star 20 You must be signed in to star a gist
  • Fork 10 You must be signed in to fork a gist
  • Save ialhashim/b29e5455333aa6ae0071 to your computer and use it in GitHub Desktop.
Save ialhashim/b29e5455333aa6ae0071 to your computer and use it in GitHub Desktop.
Portable Clustering Algorithms in C++ (DBSCAN) and (Mean-Shift) and (k-medoids)
// Interface for the The C clustering library
void clusterlibrary::cluster(std::vector< std::vector<float> > & data, int k, int iterations, std::vector<int> & clusterid)
{
int nrows = data.size();
int ncolumns = data.front().size();
float** data_ptr = new float*[nrows];
for(size_t i = 0; i < data.size(); i++) data_ptr[i] = &data[i][0];
std::vector< std::vector<int> > mask(nrows, std::vector<int>(ncolumns, 1));
int ** mask_ptr = new int*[nrows];
for(size_t i = 0; i < mask.size(); i++) mask_ptr[i] = &mask[i][0];
std::vector<float> data_weights(data.size(),1.0);
float * weights = &data_weights[0];
char distType = 'e';
std::vector< std::vector<float> > cdata(k, std::vector<float>(ncolumns, 0));
float ** cdata_ptr = new float*[ncolumns];
for(size_t i = 0; i < cdata.size(); i++) cdata_ptr[i] = &cdata[i][0];
std::vector< std::vector<int> > cmask(k, std::vector<int>(ncolumns, 1));
int ** cmask_ptr = new int*[ncolumns];
for(size_t i = 0; i < cmask.size(); i++) cmask_ptr[i] = &cmask[i][0];
float err = 0;
std::vector< int > tclusterid(nrows, 0);
std::vector< int > counts(k, 0);
std::vector< int > mapping(k, 0);
clusterid.clear();
clusterid.resize(nrows, 0);
clusterlib::kmeans(k, nrows, ncolumns, data_ptr, mask_ptr, weights, 0, iterations,
distType, cdata_ptr, cmask_ptr, &clusterid[0], &err, &tclusterid[0], &counts[0], &mapping[0] );
}
#pragma once
// Code adapted from https://github.com/propanoid/DBSCAN
#include <vector>
#include <algorithm>
#include <omp.h>
// Any basic vector/matrix library should also work
#include <Eigen/Core>
namespace clustering
{
template<typename Vector, typename Matrix>
class DBSCAN
{
public:
typedef Vector FeaturesWeights;
typedef Matrix ClusterData;
typedef Matrix DistanceMatrix;
typedef std::vector<unsigned int> Neighbors;
typedef std::vector<int> Labels;
private:
double m_eps;
size_t m_min_elems;
double m_dmin;
double m_dmax;
Labels m_labels;
public:
// 'eps' is the search space for neighbors in the range [0,1], where 0.0 is exactly self and 1.0 is entire dataset
DBSCAN(double eps, size_t min_elems)
: m_eps( eps )
, m_min_elems( min_elems )
, m_dmin(0.0)
, m_dmax(0.0)
{
reset();
}
// Call this to perform clustering, get results by calling 'get_labels()'
void fit( const ClusterData & C )
{
const FeaturesWeights W = std_weights( C.cols() );
wfit( C, W );
}
const Labels & get_labels() const
{
return m_labels;
}
void reset()
{
m_labels.clear();
}
void init(double eps, size_t min_elems)
{
m_eps = eps;
m_min_elems = min_elems;
}
// Useful for testing
static ClusterData gen_cluster_data( size_t features_num, size_t elements_num )
{
ClusterData cl_d( elements_num, features_num );
for (size_t i = 0; i < elements_num; ++i)
for (size_t j = 0; j < features_num; ++j)
cl_d(i, j) = (-1.0 + rand() * (2.0) / RAND_MAX);
return cl_d;
}
FeaturesWeights std_weights( size_t s )
{
// num cols
FeaturesWeights ws( s );
for (size_t i = 0; i < s; ++i)
ws(i) = 1.0;
return ws;
}
void fit_precomputed( const DistanceMatrix & D )
{
prepare_labels( D.rows() );
dbscan( D );
}
void wfit( const ClusterData & C, const FeaturesWeights & W )
{
prepare_labels( C.rows() );
const DistanceMatrix D = calc_dist_matrix( C, W );
dbscan( D );
}
private:
void prepare_labels( size_t s )
{
m_labels.resize(s, -1);
}
Neighbors find_neighbors(const DistanceMatrix & D, unsigned int pid)
{
Neighbors ne;
for (unsigned int j = 0; j < D.rows(); ++j)
{
if ( D(pid, j) <= m_eps )
{
ne.push_back(j);
}
}
return ne;
}
const DistanceMatrix calc_dist_matrix( const ClusterData & C, const FeaturesWeights & W )
{
ClusterData cl_d = C;
#pragma omp parallel for
for (int i = 0; i < (int)cl_d.cols(); ++i)
{
auto col = cl_d.col(i);
const auto r = std::minmax_element( col.data(), col.data() + col.size() );
double data_min = *r.first;
double data_range = *r.second - *r.first;
if (data_range == 0.0) { data_range = 1.0; }
const double scale = 1/data_range;
const double min = -1.0*data_min*scale;
col *= scale;
col += Vector::Constant(col.size(), min);
cl_d.col(i) = col;
}
// rows x rows
DistanceMatrix d_m( cl_d.rows(), cl_d.rows() );
Vector d_max( cl_d.rows() );
Vector d_min( cl_d.rows() );
for (int i = 0; i < (int)cl_d.rows(); ++i)
{
#pragma omp parallel for
for (int j = i; j < (int)cl_d.rows(); ++j)
{
d_m(i, j) = 0.0;
if (i != j)
{
Vector U = cl_d.row(i);
Vector V = cl_d.row(j);
Vector diff = ( U-V );
for(int k = 0; k < (int)diff.size(); k++)
{
auto e = diff[k];
d_m(i, j) += fabs(e)*W[k];
}
d_m(j, i) = d_m(i, j);
}
}
const auto cur_row = d_m.row(i);
const auto mm = std::minmax_element( cur_row.data(), cur_row.data() + cur_row.size() );
d_max(i) = *mm.second;
d_min(i) = *mm.first;
}
m_dmin = *(std::min_element( d_min.data(), d_min.data() + d_min.size() ));
m_dmax = *(std::max_element( d_max.data(), d_max.data() + d_max.size() ));
m_eps = (m_dmax - m_dmin) * m_eps + m_dmin;
return d_m;
}
void dbscan( const DistanceMatrix & dm )
{
std::vector<unsigned int> visited( dm.rows() );
unsigned int cluster_id = 0;
for (unsigned int pid = 0; pid < dm.rows(); ++pid)
{
if ( !visited[pid] )
{
visited[pid] = 1;
Neighbors ne = find_neighbors(dm, pid );
if (ne.size() >= m_min_elems)
{
m_labels[pid] = cluster_id;
for (unsigned int i = 0; i < ne.size(); ++i)
{
unsigned int nPid = ne[i];
if ( !visited[nPid] )
{
visited[nPid] = 1;
Neighbors ne1 = find_neighbors(dm, nPid);
if ( ne1.size() >= m_min_elems )
{
for (const auto & n1 : ne1)
{
ne.push_back(n1);
}
}
}
if ( m_labels[nPid] == -1 )
{
m_labels[nPid] = cluster_id;
}
}
++cluster_id;
}
}
}
}
};
}
// Incomplete..
//Generalized k-means
//Copyright (C) 2013 Balazs Szalkai
//If you use this program in your research, please cite the following article:
//B. Szalkai: An implementation of the relational k-means algorithm. ArXiv e-prints, 2013.
namespace GeneralizedKMeans
{
typedef Eigen::MatrixXf DistanceMatrix;
struct Clusters
{
DistanceMatrix Matrix;
int Count;
int[] ObjectToCluster;
List<int>[] ClusterToObjects;
CentroidDistance cd;
Clusters(DistanceMatrix & matrix, int count)
{
Matrix = matrix;
Count = count;
ObjectToCluster = new int[Matrix.Count];
ClusterToObjects = new List<int>[Count];
for (int i = 0; i < Count; i++) ClusterToObjects[i] = new List<int>();
Randomize();
}
static Random Rand = new Random();
void Randomize()
{
for (int i = 0; i < Count; i++) ClusterToObjects[i].Clear();
for (int i = 0; i < Matrix.Count; i++)
{
int cluster = Rand.Next(Count);
ObjectToCluster[i] = cluster;
ClusterToObjects[cluster].Add(i);
}
}
void Make_ClusterToObjects()
{
for (int cluster = 0; cluster < Count; cluster++)
ClusterToObjects[cluster].Clear();
for (int i = 0; i < Matrix.Count; i++)
ClusterToObjects[ObjectToCluster[i]].Add(i);
}
double[] AdditiveConstant;
// Call this before using the other members functions
void Initialize()
{
// calculate additive constants for clusters
AdditiveConstant = new double[clusters.Count];
for (int cluster = 0; cluster < clusters.Count; cluster++)
{
var objects = clusters.ClusterToObjects[cluster];
double x = 0;
foreach (var i in objects)
foreach (var j in objects)
x += clusters.Matrix.SqrDistance[i, j];
AdditiveConstant[cluster] = -x / (2 * Utils.Sqr((double)objects.Count));
}
}
// Calculates a squared centroid distance
double CalculateSqrDist(int obj, int cluster)
{
var objects = clusters.ClusterToObjects[cluster];
double x = 0;
foreach (int j in objects) x += clusters.Matrix.SqrDistance[obj, j];
double dist = AdditiveConstant[cluster] + x / objects.Count;
return dist;
}
int GetNearestCluster(int obj)
{
int nearestCluster = -1;
double minSqrDist = 0;
for (int cluster = 0; cluster < clusters.Count; cluster++)
{
double sqrDist = CalculateSqrDist(obj, cluster);
if (nearestCluster < 0 || sqrDist < minSqrDist)
{
minSqrDist = sqrDist;
nearestCluster = cluster;
}
}
return nearestCluster;
}
double GetClusteringValue()
{
double result = 0;
for (int i = 0; i < clusters.Matrix.Count; i++)
result += CalculateSqrDist(i, clusters.ObjectToCluster[i]);
return result;
}
// Returns how many objects have changed cluster
int Iterate()
{
int changed = 0;
cd.Initialize();
// find the nearest centroid for the objects
int[] New_ObjectToCluster = new int[Matrix.Count];
for (int i = 0; i < Matrix.Count; i++)
{
New_ObjectToCluster[i] = cd.GetNearestCluster(i);
if (New_ObjectToCluster[i] != ObjectToCluster[i]) changed++;
}
// update the configuration
double oldValue = GetValue();
int[] Old_ObjectToCluster = ObjectToCluster;
ObjectToCluster = New_ObjectToCluster;
Make_ClusterToObjects();
double newValue = GetValue();
// clustering got worse (this is possible with some distance matrices) or stayed the same?
if (oldValue <= newValue)
{
// undo iteration
ObjectToCluster = Old_ObjectToCluster;
Make_ClusterToObjects();
return 0;
}
return changed;
}
double GetValue()
{
cd.Initialize();
return cd.GetClusteringValue();
}
}
void Run( DistanceMatrix & m, int nClusters )
{
Clusters bestClusters (m, nClusters);
int badLuckStreak = 0;
int blockId = 0;
int maxBadLuckStreak = 100;
while (true)
{
for (int i = 0; i < nThreads; i++)
{
Clusters c = new Clusters(m, nClusters);
while (true)
{
if (c.Iterate() == 0) break;
}
Clusters c = clustersArray[i];
if (c.GetValue() < bestClusters.GetValue())
{
bestClusters = c;
badLuckStreak = 0;
}
else
badLuckStreak++;
}
if (badLuckStreak >= maxBadLuckStreak) break;
}
}
}
// Works well
//////////////////////////////////////////////////////////////////////////////////////////////////
// Copyright (c) 2010, Lawrence Livermore National Security, LLC.
// Produced at the Lawrence Livermore National Laboratory
//////////////////////////////////////////////////////////////////////////////////////////////////
/// @author Todd Gamblin tgamblin@llnl.gov
#include <vector>
#include <sstream>
#include <algorithm>
#include <numeric>
#include <cassert>
#include <cstdlib>
using namespace std;
//#include "random.h"
//#include "bic.h"
//#include "matrix_utils.h"
namespace clustering {
typedef float Scalar;
typedef Eigen::Matrix<Scalar,-1,-1> dissimilarity_matrix;
typedef size_t medoid_id; ///< More descriptive type for medoid index
typedef size_t object_id; ///< More descriptive type for object index
///
/// Explicit representation of a clustering. Instead of a vecto of representative
/// ids, this has <i>k</i> sets of object_ids indicating which objects are in a
/// particular cluster. You can convert a partition to a cluster_list with
/// to_cluster_list().
///
typedef std::vector< std::set<object_id> > cluster_list;
struct kmedoids{
/// Gives the index of the object that is the ith medoid.
/// medoids[i] == index in object array for last call to findClusters()
std::vector<object_id> medoid_ids;
/// Gives cluster id (index in medoids) for the ith object.
/// clusterid[i] == id of cluster of which object i is a member.
/// medoids[clusterid[i]] == representative of that cluster.
std::vector<medoid_id> cluster_ids;
std::vector<medoid_id> sec_nearest; /// Index of second closest medoids. Used by PAM.
double total_dissimilarity; /// Total dissimilarity bt/w objects and their medoid
bool sort_medoids; /// Whether medoids should be canonically sorted by object id.
double epsilon; /// Normalized sensitivity for convergence
size_t init_size; /// initial sample size (before 2*k)
size_t max_reps; /// initial sample size (before 2*k)
typedef std::mt19937 random_type; /// Type for RNG used in this algorithm
kmedoids(size_t num_objects)
: cluster_ids(num_objects, 0),
total_dissimilarity(std::numeric_limits<double>::infinity()),
sort_medoids(false),
epsilon(1e-15),
init_size(40),
max_reps(5)
{ if (num_objects) medoid_ids.resize(1); }
double average_dissimilarity() const {
return total_dissimilarity / cluster_ids.size();
}
void set_sort_medoids(bool sort) {
sort_medoids = sort;
}
void set_epsilon(double e) {
epsilon = e;
}
void init_medoids(size_t k, const dissimilarity_matrix& distance) {
medoid_ids.clear();
// find first object: object minimum dissimilarity to others
object_id first_medoid = 0;
double min_dissim = std::numeric_limits<Scalar>::max();
for (size_t i=0; i < distance.rows(); i++) {
double total = 0.0;
for (size_t j=0; j < distance.cols(); j++) {
total += distance(i,j);
}
if (total < min_dissim) {
min_dissim = total;
first_medoid = i;
}
}
// add first object to medoids and compute medoid ids.
medoid_ids.push_back(first_medoid);
assign_objects_to_clusters(distance);
// now select next k-1 objects according to KR's BUILD algorithm
for (size_t cur_k = 1; cur_k < k; cur_k++) {
object_id best_obj = 0;
double max_gain = 0.0;
for (size_t i=0; i < distance.rows(); i++) {
if (is_medoid(i)) continue;
double gain = 0.0;
#pragma omp parallel for reduction(+:gain)
for (int j=0; j < (int)distance.rows(); j++) {
double Dj = distance(j, medoid_ids[cluster_ids[j]]); // distance from j to its medoid
gain += max(Dj - distance(i,j), 0.0); // gain from selecting i
}
if (gain >= max_gain) { // set the next medoid to the object that
max_gain = gain; // maximizes the gain function.
best_obj = i;
}
}
medoid_ids.push_back(best_obj);
assign_objects_to_clusters(distance);
}
}
double cost(medoid_id i, object_id h, const dissimilarity_matrix& distance) const {
double total = 0;
for (object_id j = 0; j < cluster_ids.size(); j++) {
object_id mi = medoid_ids[i]; // object id of medoid i
double dhj = distance(h, j); // distance between object h and object j
object_id mj1 = medoid_ids[cluster_ids[j]]; // object id of j's nearest medoid
double dj1 = distance(mj1, j); // distance to j's nearest medoid
// check if distance bt/w medoid i and j is same as j's current nearest medoid.
if (distance(mi, j) == dj1) {
double dj2 = std::numeric_limits<Scalar>::max();
if (medoid_ids.size() > 1) { // look at 2nd nearest if there's more than one medoid.
object_id mj2 = medoid_ids[sec_nearest[j]]; // object id of j's 2nd-nearest medoid
dj2 = distance(mj2, j); // distance to j's 2nd-nearest medoid
}
total += min(dj2, dhj) - dj1;
} else if (dhj < dj1) {
total += dhj - dj1;
}
}
return total;
}
/// True if and only if object i is a medoid.
bool is_medoid(object_id oi) const {
return medoid_ids[cluster_ids[oi]] == oi;
}
void pam(const dissimilarity_matrix& distance, size_t k, const object_id *initial_medoids) {
if (k > distance.rows()) {
throw std::logic_error("Attempt to run PAM with more clusters than data.");
}
if (distance.rows() != distance.cols()) {
throw std::logic_error("Error: distance matrix is not square!");
}
// first get this the right size.
cluster_ids.resize(distance.rows());
// size cluster_ids appropriately and randomly pick initial medoids
if (initial_medoids) {
medoid_ids.clear();
copy(initial_medoids, initial_medoids + k, back_inserter(medoid_ids));
} else {
init_medoids(k, distance);
}
// set tolerance equal to epsilon times mean magnitude of distances.
// Note that distances *should* all be non-negative.
double tolerance = epsilon * distance.sum() / (distance.rows() * distance.cols());
while (true) {
// initial cluster setup
total_dissimilarity = assign_objects_to_clusters(distance);
//vars to keep track of minimum
double minTotalCost = std::numeric_limits<Scalar>::max();
medoid_id minMedoid = 0;
object_id minObject = 0;
//iterate over each medoid
for (medoid_id i=0; i < k; i++) {
//iterate over all non-medoid objects
for (object_id h = 0; h < cluster_ids.size(); h++) {
if (is_medoid(h)) continue;
//see if the total cost of swapping i & h was less than min
double curCost = cost(i, h, distance);
if (curCost < minTotalCost) {
minTotalCost = curCost;
minMedoid = i;
minObject = h;
}
}
}
// bail if we can't gain anything more (we've converged)
if (minTotalCost >= -tolerance) break;
// install the new medoid if we found a beneficial swap
medoid_ids[minMedoid] = minObject;
cluster_ids[minObject] = minMedoid;
}
//if (sort_medoids) sort();
}
/// Assign each object to the cluster with the closest medoid.
///
/// @return Total dissimilarity of objects w/their medoids.
///
/// @param distance a callable object that computes distances between indices, as a distance
/// matrix would. Algorithms are free to use real distance matrices (as in PAM)
/// or to compute lazily (as in CLARA medoid assignment).
double assign_objects_to_clusters(const dissimilarity_matrix& distance) {
if (sec_nearest.size() != cluster_ids.size()) {
sec_nearest.resize(cluster_ids.size());
}
// go through and assign each object to nearest medoid, keeping track of total dissimilarity.
double total_dissimilarity = 0;
#pragma omp parallel for reduction(+:total_dissimilarity)
for (int i=0; i < (int)cluster_ids.size(); i++) {
double d1, d2; // smallest, second smallest distance to medoid, respectively
medoid_id m1, m2; // index of medoids with distances d1, d2 from object i, respectively
d1 = d2 = std::numeric_limits<Scalar>::max();
m1 = m2 = medoid_ids.size();
for (medoid_id m=0; m < medoid_ids.size(); m++) {
double d = distance(i, medoid_ids[m]);
if (d < d1 || medoid_ids[m] == i) { // prefer the medoid in case of ties.
d2 = d1; m2 = m1;
d1 = d; m1 = m;
} else if (d < d2) {
d2 = d; m2 = m;
}
}
cluster_ids[i] = m1;
sec_nearest[i] = m2;
total_dissimilarity += d1;
}
return total_dissimilarity;
}
};
} // namespace cluster
#pragma once
/* Meanshift non-parametric mode estimator.
*
* Code adapted from : Sebastian Nowozin <nowozin@gmail.com>
*/
#define _USE_MATH_DEFINES
#include <math.h>
#include <vector>
#include <map>
#include <iostream>
//#include <gmm/gmm.h>
#include <Eigen/Core>
namespace gmm{
template<typename V>
void clear(V & v){
for(size_t i = 0; i < v.size(); i++)
v[i] = 0;
}
template<typename V, typename W>
void copy( const V & v, W & w ){
for(size_t i = 0; i < v.size(); i++)
w[i] = v[i];
}
template<typename V, typename W>
void add( const V & v, W & w ){
for(size_t i = 0; i < w.size(); i++)
w[i] += v[i];
}
template<typename V>
V scaled(const V & v, double s){
V a = v;
for(size_t i = 0; i < a.size(); i++)
a[i] *= s;
return a;
}
template<typename V>
void scale( V & v, double s ){
for(size_t i = 0; i < v.size(); i++)
v[i] *= s;
}
template<typename V, typename W>
double vect_dist2(V & v, W & w){
Eigen::VectorXd a = Eigen::Map<Eigen::VectorXd>(&v[0], v.size());
Eigen::VectorXd b = Eigen::Map<Eigen::VectorXd>(&w[0], w.size());
return (a-b).lpNorm<2>();
}
template<typename V>
double vect_norm2(V & v){
return Eigen::Map<Eigen::VectorXd>(&v[0], v.size()).norm();
}
}
namespace clustering {
/* We use the notation of the description of Mean Shift in
* [Comaniciu2000], Dorin Comaniciu, Peter Meer,
* "Mean Shift: A Robust Approach toward Feature Space Analysis"
*
* The implementation is naive and does not exploit fast nearest neighbor
* lookups or triangle inequalities to speed up the mean shift procedure.
* Therefore, it is only suitable for low-dimensional input spaces (say, <=
* 10) with relatively few samples (say, < 1e6).
*/
class Meanshift {
public:
/* kernel_type selects the kernel profile:
* 0 for the Epanechnikov kernel profile
* k_E(x) = (1 - x) if (0 <= x <= 1), 0 otherwise.
* 1 for the truncated multivariate normal kernel profile
* k_N(x) = exp(-0.5 * x)
* kernel_bandwidth: The positive bandwidth parameter.
* mode_tolerance: mode matching tolerance. Modes which have a L2
* distance closer than this value will be treated as being the same.
*/
Meanshift(int kernel_type, double kernel_bandwidth,
int dim, double mode_tolerance);
/* Find modes of an empirical distribution X.
*
* X: N vectors of size M, representing N samples from the distribution.
* modes: Output, will be allocated properly. Return all modes found.
* indexmap: N-vector of absolute mode indices. Each sample point is
* assigned to one mode. The vector must already be properly sized.
* procedurecount: The mean shift procedure will be run at least this many
* times, sampling randomly from X as initialization. If zero, all
* samples from X are used as initializations.
*/
void FindModes(const std::vector<std::vector<double> >& X,
std::vector<std::vector<double> >& modes,
std::vector<int>& indexmap,
unsigned int procedure_count = 0) const;
/* Mean Shift Procedure, starting from mode, perform mean shift on the
* distribution empirically sampled in X.
*
* X: N vectors of size M, representing N samples from the distribution.
* mode: Starting point (for example a point from X). The result will be
* given in mode.
* visited: N-vector of indicator variables. If the mean shift procedure
* passes through a point in X, the corresponding index in visited will
* be set to one.
*
* Return the number of iterations used.
*/
int MeanshiftProcedure(const std::vector<std::vector<double> >& X,
std::vector<double>& mode, std::vector<unsigned int>& visited) const;
private:
int dim; // input space dimension
int kernel_type; // 0: Epanechnikov, 1: truncated multivariate normal
double kernel_c; // constant normalization factor
double kernel_bandwidth;
double mode_tolerance;
};
Meanshift::Meanshift(int kernel_type, double kernel_bandwidth,
int dim, double mode_tolerance)
: dim(dim), kernel_type(kernel_type),
kernel_bandwidth(kernel_bandwidth),
mode_tolerance(mode_tolerance) {
assert(kernel_type >= 0 && kernel_type <= 1);
assert(kernel_bandwidth > 0.0);
assert(dim > 0);
// Compute normalization constant
if (kernel_type == 0) {
kernel_c = 0.5 * (static_cast<double>(dim) + 2.0) / 1.234;
// TODO: replace 1.234 with the volume of the d-dimensional unit
// sphere FIXME
} else if (kernel_type == 1) {
kernel_c = pow(2.0 * M_PI, -static_cast<double>(dim)/2.0);
}
}
void Meanshift::FindModes(const std::vector<std::vector<double> >& X,
std::vector<std::vector<double> >& modes,
std::vector<int>& indexmap,
unsigned int procedure_count) const
{
//assert(indexmap.size() == X.size());
indexmap.clear();
indexmap.resize(X.size(), 0);
modes.clear();
if (procedure_count != 0 && procedure_count >= X.size())
procedure_count = 0;
// Perform dense mode finding: for each sample in X, perform the mean
// shift procedure
std::vector<double> mode(X[0].size());
std::vector<unsigned int> visited(X.size());
// [mode_index][sample_index] = number of times sample_index was visited
// when arriving at the mode with the mode_index.
std::vector<std::map<unsigned int, unsigned int> > visit_counts;
for (unsigned int si = 0; (procedure_count == 0 && si < X.size())
|| (procedure_count != 0 && si < procedure_count); ++si) {
if (procedure_count == 0) {
gmm::copy(X[si], mode);
} else {
// Pick a random one from X
unsigned sample_index = rand() % X.size();
gmm::copy(X[sample_index], mode);
}
std::cout << "Sample " << si << " of "
<< (procedure_count == 0 ? X.size() : procedure_count)
<< std::endl;
gmm::clear(visited);
MeanshiftProcedure(X, mode, visited);
// Identify whether this is a novel mode or a known one
bool found = false;
unsigned int M_cur = 0;
for (std::vector<std::vector<double> >::iterator Mi =
modes.begin(); found == false && Mi != modes.end(); ++Mi) {
if (gmm::vect_dist2(*Mi, mode) < mode_tolerance) {
M_cur = Mi - modes.begin();
found = true;
}
}
if (found == false) {
// Add novel mode
modes.push_back(mode);
// TODO: remove this debug output
std::cout << modes.size() << " mode"
<< (modes.size() >= 2 ? "s" : "") << std::endl;
// Add a new mapping of which samples have been visited while
// approaching the novel mode.
std::map<unsigned int, unsigned int> mode_visits;
for (std::vector<unsigned int>::const_iterator vi =
visited.begin(); vi != visited.end(); ++vi) {
if (*vi != 0)
mode_visits[vi - visited.begin()] = 1;
}
visit_counts.push_back(mode_visits);
} else {
// The mode has been known, but we maybe crossed old and new
// samples. Update the counts.
for (std::vector<unsigned int>::const_iterator vi =
visited.begin(); vi != visited.end(); ++vi) {
if (*vi != 0)
visit_counts[M_cur][vi - visited.begin()] += 1;
}
}
}
#ifdef DEBUG
std::cout << "Found " << modes.size() << " modes." << std::endl;
#endif
// Perform index mapping: each sample gets assigned to one mode index.
unsigned int unmapped_count = 0;
for (unsigned int sample_index = 0;
sample_index < X.size(); ++sample_index) {
// Find mode index with highest count
unsigned int maximum_count = 0;
int maximum_mode_index = -1;
for (unsigned int mode_index = 0;
mode_index < modes.size(); ++mode_index) {
if (visit_counts[mode_index].count(sample_index) == 0)
continue;
unsigned int count_cur = visit_counts[mode_index][sample_index];
std::cout << " mode " << mode_index << " visited "
<< "sample " << sample_index << " for "
<< count_cur << " times." << std::endl;
if (count_cur > maximum_count) {
maximum_mode_index = mode_index;
maximum_count = count_cur;
}
}
if (maximum_mode_index == -1)
unmapped_count += 1;
indexmap[sample_index] = maximum_mode_index;
}
std::cout << unmapped_count << " unmapped samples." << std::endl;
}
int Meanshift::MeanshiftProcedure(const std::vector<std::vector<double> >& X,
std::vector<double>& mode, std::vector<unsigned int>& visited) const {
assert(X.size() > 0);
assert(visited.size() == X.size());
assert(X[0].size() == mode.size());
std::vector<double> meanshift(mode.size(),0);
std::vector<double> meanshift_cur(mode.size(),0);
int iter = 0;
bool converged = false;
do {
// Compute the mean shift vector
gmm::clear(meanshift);
double denominator = 0.0;
for (std::vector<std::vector<double> >::const_iterator Xi = X.begin();
Xi != X.end(); ++Xi) {
gmm::copy(mode, meanshift_cur);
gmm::add(gmm::scaled(*Xi, -1.0), meanshift_cur);
gmm::scale(meanshift_cur, 1.0 / kernel_bandwidth);
double weight_cur = gmm::vect_norm2(meanshift_cur);
if (kernel_type == 0) {
// Epanechnikov kernel is the shadow of the uniform kernel
if (weight_cur <= 1.0)
weight_cur = 1.0;
else
weight_cur = 0.0;
} else if (kernel_type == 1) {
// Multivariate normal kernel is the shadow of itself
weight_cur = kernel_c * exp(-0.5 * weight_cur);
// Truncate
if (weight_cur < 1e-2)
weight_cur = 0.0;
}
if (weight_cur >= 1e-6)
visited[Xi - X.begin()] = 1;
gmm::add(gmm::scaled(*Xi, weight_cur), meanshift);
denominator += weight_cur;
}
gmm::scale(meanshift, 1.0 / denominator);
double distance_moved = gmm::vect_dist2(meanshift, mode);
gmm::copy(meanshift, mode);
#ifdef DEBUG
std::cout << "iter " << iter << ", moved "
<< distance_moved << std::endl;
#endif
iter += 1;
if (distance_moved <= 1e-8)
converged = true;
} while (converged == false);
return (iter);
}
}
/*=========================================================================
*
* Copyright David Doria 2012 daviddoria@gmail.com
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0.txt
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*=========================================================================*/
/** XMeans clustering is a method that creates K clusters of points from
* an unorganized set of input points. It is an extension of KMeans clustering
* that attempts to determine K during the algorithm.
*/
#ifndef XMeansClustering_h
#define XMeansClustering_h
// STL
#include <vector>
#include <iostream>
#include <limits>
#include <numeric>
#include <set>
#include <stdexcept>
// Eigen
#include <Eigen/Dense>
namespace EigenHelpers{
template <typename TVector>
TVector RandomUnitVector(const unsigned int dim)
{
TVector randomUnitVector(dim);
for(int i = 0; i < randomUnitVector.size(); ++i)
{
randomUnitVector[i] = (double(rand()) / RAND_MAX);
}
randomUnitVector.normalize();
return randomUnitVector;
}
template <typename TPoint>
void GetBoundingBox(const std::vector<TPoint, Eigen::aligned_allocator<TPoint> >& data, TPoint& minCorner, TPoint& maxCorner)
{
assert(data.size() > 0);
minCorner.resize(data[0].size());
maxCorner.resize(data[0].size());
for(int coordinate = 0; coordinate < data[0].size(); ++coordinate)
{
minCorner[coordinate] = std::numeric_limits<typename TPoint::Scalar>::max();
maxCorner[coordinate] = std::numeric_limits<typename TPoint::Scalar>::min();
for(unsigned int pointId = 0; pointId < data.size(); ++pointId)
{
if(data[pointId][coordinate] > maxCorner[coordinate])
{
maxCorner[coordinate] = data[pointId][coordinate];
}
if(data[pointId][coordinate] < minCorner[coordinate])
{
minCorner[coordinate] = data[pointId][coordinate];
}
}
}
}
}
class XMeansClustering
{
public:
typedef Eigen::VectorXf PointType;
typedef std::vector<PointType, Eigen::aligned_allocator<PointType> > VectorOfPoints;
/** Constructor. */
XMeansClustering();
/** Set the maximum number of clusters to find. */
void SetMaxK(const unsigned int maxk);
/** Get the maximum number of clusters to find. */
unsigned int GetMaxK();
/** Get the ids of the points that belong to class 'label'. */
std::vector<unsigned int> GetIndicesWithLabel(const unsigned int label);
/** Get the coordinates of the points that belong to class 'label'. */
VectorOfPoints GetPointsWithLabel(const unsigned int label);
/** Set the points to cluster. */
void SetPoints(const VectorOfPoints& points);
/** Get the resulting cluster id for each point. */
std::vector<unsigned int> GetLabels();
/** Actually perform the clustering. */
void Cluster();
/** Write the cluster centers to the standard output. */
void OutputClusterCenters();
private:
/** Split every cluster into two clusters if that helps the description of the data. */
void SplitClusters();
/** The label (cluster ID) of each point. */
std::vector<unsigned int> Labels;
/** The maximum number of clusters to find */
unsigned int MaxK;
/** The points to cluster. */
VectorOfPoints Points;
/** The current cluster centers. */
VectorOfPoints ClusterCenters;
};
XMeansClustering::XMeansClustering() : MaxK(3)
{
}
void XMeansClustering::Cluster()
{
if(this->Points.size() < this->MaxK)
{
std::stringstream ss;
ss << "The number of points (" << this->Points.size()
<< " must be larger than the maximum number of clusters (" << this->MaxK << ")";
throw std::runtime_error(ss.str());
}
// We must store the labels at the previous iteration to determine whether any labels changed at each iteration.
std::vector<unsigned int> oldLabels(this->Points.size(), 0); // initialize to all zeros
// Initialize the labels array
this->Labels.resize(this->Points.size());
do
{
// Save the old labels
oldLabels = this->Labels;
}while(this->ClusterCenters.size() < this->MaxK);
}
void XMeansClustering::SplitClusters()
{
assert(this->Points.size() > 0);
VectorOfPoints newClusterCenters;
for(unsigned int clusterId = 0; clusterId < this->ClusterCenters.size(); ++clusterId)
{
// Generate a random direction
PointType randomUnitVector = EigenHelpers::RandomUnitVector<PointType>(this->Points[0].size());
// Get the bounding box of the points that belong to this cluster
PointType minCorner;
PointType maxCorner;
EigenHelpers::GetBoundingBox(this->Points, minCorner, maxCorner);
// Scale the unit vector by the size of the region
PointType splitVector = randomUnitVector * (maxCorner - minCorner) / 2.0f;
PointType childCenter1 = this->ClusterCenters[clusterId] + splitVector;
PointType childCenter2 = this->ClusterCenters[clusterId] + splitVector;
// Compute the BIC of the original model
float BIC_parent;
// Compute the BIC of the new (split) model
float BIC_children;
// If the split was useful, keep it
if(BIC_children < BIC_parent)
{
newClusterCenters.push_back(childCenter1);
newClusterCenters.push_back(childCenter2);
}
else
{
newClusterCenters.push_back(this->ClusterCenters[clusterId]);
}
}
this->ClusterCenters = newClusterCenters;
}
std::vector<unsigned int> XMeansClustering::GetIndicesWithLabel(unsigned int label)
{
std::vector<unsigned int> pointsWithLabel;
for(unsigned int i = 0; i < this->Labels.size(); i++)
{
if(this->Labels[i] == label)
{
pointsWithLabel.push_back(i);
}
}
return pointsWithLabel;
}
XMeansClustering::VectorOfPoints XMeansClustering::GetPointsWithLabel(const unsigned int label)
{
VectorOfPoints points;
std::vector<unsigned int> indicesWithLabel = GetIndicesWithLabel(label);
for(unsigned int i = 0; i < indicesWithLabel.size(); i++)
{
points.push_back(this->Points[indicesWithLabel[i]]);
}
return points;
}
void XMeansClustering::SetMaxK(const unsigned int maxk)
{
this->MaxK = maxk;
}
unsigned int XMeansClustering::GetMaxK()
{
return this->MaxK;
}
void XMeansClustering::SetPoints(const VectorOfPoints& points)
{
this->Points = points;
}
std::vector<unsigned int> XMeansClustering::GetLabels()
{
return this->Labels;
}
void XMeansClustering::OutputClusterCenters()
{
std::cout << std::endl << "Cluster centers: " << std::endl;
for(unsigned int i = 0; i < ClusterCenters.size(); ++i)
{
std::cout << ClusterCenters[i] << " ";
}
std::cout << std::endl;
}
#endif
@forestsen
Copy link

ヽ(✿゚▽゚)ノ

@raymond30031
Copy link

raymond30031 commented Sep 30, 2019

Hello,
For k-medoids,
How do you construct the distance matrix given a distance function?
Do you fill the entire nxn matrix or only upper or lower triangle? n is the number of points

Thanks,
JC

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment