Skip to content

Instantly share code, notes, and snippets.

@disa-mhembere
Last active April 17, 2017 05:46
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save disa-mhembere/7d1b5b82487c0169569c4d42d1dbf81e to your computer and use it in GitHub Desktop.
Save disa-mhembere/7d1b5b82487c0169569c4d42d1dbf81e to your computer and use it in GitHub Desktop.
Another version of Intel DAAL's K-Means example
""" A class for K-Means clustering """
"""
Adapted from Zhang Zhang zhang.zhang@intel.com
https://github.com/daaltces/pydaal-tutorials/blob/master/kmeans.py
https://github.com/daaltces/pydaal-tutorials/blob/master/kmeans_example.ipynb
"""
import daal.algorithms.kmeans as kmeans
from daal.algorithms.kmeans import init
from daal.data_management import HomogenNumericTable
from time import time
from sys import argv
import numpy as np
from daal.data_management import HomogenNumericTable, BlockDescriptor_Float64, readOnly
def getArrayFromNT(table, nrows=0):
    """Read the leading rows of a numeric table through a Float64 block.

    Args:
        table: Object exposing ``getNumberOfRows``, ``getBlockOfRows`` and
            ``releaseBlockOfRows`` (presumably a pyDAAL NumericTable --
            confirm against callers).
        nrows: Number of rows to read, starting at row 0. ``0`` (the
            default), a negative value, or a value larger than the table
            selects every row.

    Returns:
        Whatever ``BlockDescriptor_Float64.getArray()`` yields for the
        requested rows (presumably a NumPy array -- confirm).
    """
    total_rows = table.getNumberOfRows()
    # Clamp explicitly so a caller asking for "the first 10 rows" of a shorter
    # table does not depend on how the backend treats an oversized request.
    if nrows <= 0 or nrows > total_rows:
        nrows = total_rows

    bd = BlockDescriptor_Float64()
    table.getBlockOfRows(0, nrows, readOnly, bd)
    try:
        # NOTE(review): the array is fetched before the block is released, as
        # in the original code; whether it stays valid afterwards depends on
        # pyDAAL -- confirm.
        return bd.getArray()
    finally:
        # Always hand the block back, even if getArray() raises.
        table.releaseBlockOfRows(bd)
def printNT(table, nrows = 0, message=''):
    """Print ``message``, a newline, then the leading rows of ``table``.

    Args:
        table: Table handed unchanged to ``getArrayFromNT``.
        nrows: Row count handed unchanged to ``getArrayFromNT``
            (``0`` is its "all rows" default).
        message: Caption printed ahead of the values.
    """
    print(message, '\n', getArrayFromNT(table, nrows))
class KMeans:
    """K-Means clustering built on the DAAL batch K-Means algorithms.

    The constructor stores the configuration (``nclusters_``, ``seed_``).
    The result attributes ``centroids_``, ``assignments_``,
    ``goalfunction_`` and ``niterations_`` are ``None`` until
    ``compute`` has been run.
    """

    def __init__(self, nclusters, randomseed = None):
        """Store the clustering configuration and reset the results.

        Args:
            nclusters: Number of clusters.
            randomseed: Integer seed for the random centroid
                initialization; 1234 is used when it is None.
        """
        self.nclusters_ = nclusters
        if randomseed is None:
            randomseed = 1234
        self.seed_ = randomseed

        # Results, filled in by compute().
        self.centroids_ = None
        self.assignments_ = None
        self.goalfunction_ = None
        self.niterations_ = None

    def _random_centroids(self, data):
        """Return starting centroids drawn from ``data`` by the seeded random initializer."""
        seeder = init.Batch_Float64RandomDense(self.nclusters_)
        seeder.input.set(init.data, data)
        seeder.parameter.seed = self.seed_
        return seeder.compute().get(init.centroids)

    def compute(self, data, centroids = None, maxiters = 100):
        """Cluster ``data`` and store the outcome on the instance.

        Args:
            data: Input data to be clustered.
            centroids: Starting centroids supplied by the caller. When
                None, starting centroids are chosen randomly using the
                stored seed.
            maxiters: Maximum number of iterations.
        """
        if centroids is None:
            centroids = self._random_centroids(data)
        # The starting centroids are stored first; the final ones overwrite
        # them once the algorithm has run.
        self.centroids_ = centroids

        lloyd = kmeans.Batch_Float64LloydDense(self.nclusters_, maxiters)
        lloyd.input.set(kmeans.data, data)
        lloyd.input.set(kmeans.inputCentroids, self.centroids_)

        outcome = lloyd.compute()
        self.centroids_ = outcome.get(kmeans.centroids)
        self.assignments_ = outcome.get(kmeans.assignments)
        self.goalfunction_ = outcome.get(kmeans.goalFunction)
        self.niterations_ = outcome.get(kmeans.nIterations)
if __name__ == "__main__":
    # Command line: <dataset> <k> <niters>. All three arguments are read
    # below, so all three are required. An explicit check is used instead of
    # an assert because asserts are stripped when Python runs with -O.
    if len(argv) < 4:
        raise SystemExit("usage: {} <dataset> <k> <niters>".format(argv[0]))
    datasetFileName = argv[1]
    nclusters = int(argv[2])
    niter = int(argv[3])

    # Load the space-delimited dataset and wrap it in a DAAL numeric table.
    data = np.genfromtxt(datasetFileName, dtype = np.double, delimiter=' ')
    data_ = HomogenNumericTable(data)

    # Time construction plus clustering.
    start = time()
    clustering = KMeans(nclusters)
    clustering.compute(data_, maxiters=niter)
    print("\nAlg computation time: {} sec\n".format(time()-start))

    # np.int was only an alias of the builtin int and has been removed from
    # NumPy, so the builtin is used directly.
    assignments = getArrayFromNT(clustering.assignments_).flatten().astype(int)
    print("Iterations: ", getArrayFromNT(clustering.niterations_))
    printNT(clustering.assignments_, 10, "First 10 cluster assignments:")
    printNT(clustering.centroids_, 10, "First 10 dimensions of centroids:")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment