Skip to content

Instantly share code, notes, and snippets.

@matovitch
Last active August 29, 2015 14:03
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save matovitch/178fc3d59be942274d0e to your computer and use it in GitHub Desktop.
Save matovitch/178fc3d59be942274d0e to your computer and use it in GitHub Desktop.
#ifndef __KMEANS_H__
#define __KMEANS_H__
#include <ctime>
#include <vector>
#include <limits>
#include <cstddef>
namespace kmeans
{
template <typename Distance, typename Hasher>
struct Algorithm
{
typedef typename std::vector<typename Distance::argument_type>::iterator clusters_iterator;
std::vector<typename Distance::argument_type> m_clusters;
struct History
{
typename Distance::argument_type* m_oldCluster;
typename Distance::argument_type m_newCluster;
std::size_t m_nPoints;
std::size_t m_hash;
History() : m_hash(0), m_nPoints(1) {}
};
template <typename PointSetIterator>
void operator()(const PointSetIterator& pointsBegin,
const PointSetIterator& pointsEnd, std::size_t nClusters)
{
HistorySet histories(nClusters);
std::size_t oldClustersHash = 0;
std::size_t newClustersHash = 1;
initClusters(pointsBegin, pointsEnd, nClusters, histories);
while (newClustersHash != oldClustersHash)
{
oldClustersHash = newClustersHash;
for(PointSetIterator pointSetIterator = pointsBegin;
pointSetIterator != pointsEnd;
pointSetIterator++)
{
DistanceType minClusterDistance = std::numeric_limits<DistanceType>::max();
HistorySetIterator nearestCluster;
for (HistorySetIterator it = histories.begin(); it != histories.end(); it++)
{
const DistanceType clusterDistance =
m_distance(*pointSetIterator, *(it->m_oldCluster));
if (clusterDistance < minClusterDistance)
{
minClusterDistance = clusterDistance;
nearestCluster = it;
}
}
nearestCluster->m_hash ^= m_hasher(*pointSetIterator);
nearestCluster->m_newCluster += *pointSetIterator;
nearestCluster->m_nPoints++;
}
newClustersHash = 0;
for (HistorySetIterator it = histories.begin(); it != histories.end(); it++)
{
if (it->m_nPoints == 0)
{
*(it->m_oldCluster) = *pointsBegin;
}
else
{
it->m_newCluster /= static_cast<FieldType>(it->m_nPoints);
*(it->m_oldCluster) = it->m_newCluster;
}
newClustersHash += it->m_hash;
it->m_nPoints = 1;
it->m_hash = 0;
}
}
}
private:
typedef std::vector<History> HistorySet;
typedef typename Distance::return_type DistanceType;
typedef typename HistorySet::iterator HistorySetIterator;
typedef typename Distance::argument_type::field_type FieldType;
Distance m_distance;
Hasher m_hasher;
template <typename PointSetIterator>
void initClusters(const PointSetIterator& pointsBegin,
const PointSetIterator& pointsEnd,
std::size_t nClusters,
HistorySet& histories)
{
m_clusters.resize(nClusters);
m_clusters[0] = *pointsBegin;
for (std::size_t i = 1; i < nClusters; i++)
{
const DistanceType cost = computeCost(pointsBegin, pointsEnd, i);
PointSetIterator it = pointsBegin;
PointSetIterator newCluster = pointsEnd;
while (newCluster == pointsEnd)
{
DistanceType minClusterDistance = std::numeric_limits<DistanceType>::max();
for (std::size_t j = 0; j < i; j++)
{
const DistanceType clusterDistance =
m_distance(*it, m_clusters[j]);
if (clusterDistance < minClusterDistance)
{
minClusterDistance = clusterDistance;
}
}
if (minClusterDistance * static_cast<FieldType>(RAND_MAX) >
static_cast<FieldType>(rand()) * cost)
{
newCluster = it;
}
it = (it + 1 != pointsEnd) ? it + 1 : pointsBegin;
}
m_clusters[i] = *newCluster;
}
clusters_iterator clusterSetIterator = m_clusters.begin();
std::vector<History>::iterator historySetIterator = histories.begin();
while (clusterSetIterator != m_clusters.end())
{
historySetIterator->m_oldCluster = &(*clusterSetIterator);
historySetIterator->m_newCluster = *clusterSetIterator;
historySetIterator++;
clusterSetIterator++;
}
}
template <typename PointSetIterator>
DistanceType computeCost(const PointSetIterator& pointsBegin,
const PointSetIterator& pointsEnd, std::size_t nClusters)
{
DistanceType cost = static_cast<DistanceType>(0);
for(PointSetIterator pointSetIterator = pointsBegin;
pointSetIterator != pointsEnd;
pointSetIterator++)
{
DistanceType minClusterDistance = std::numeric_limits<DistanceType>::max();
for (std::size_t j = 0; j < nClusters; j++)
{
const DistanceType clusterDistance =
m_distance(*pointSetIterator, m_clusters[j]);
if (clusterDistance < minClusterDistance)
{
minClusterDistance = clusterDistance;
}
}
cost += minClusterDistance;
}
return cost;
}
};
} // end kmeans namespace
#endif
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment