Last active
April 17, 2017 05:46
-
-
Save disa-mhembere/7d1b5b82487c0169569c4d42d1dbf81e to your computer and use it in GitHub Desktop.
Another version of Intel DAAL's kmeans
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" A class for K-Means clustering """ | |
""" | |
Adapted from Zhang Zhang zhang.zhang@intel.com | |
https://github.com/daaltces/pydaal-tutorials/blob/master/kmeans.py | |
https://github.com/daaltces/pydaal-tutorials/blob/master/kmeans_example.ipynb | |
""" | |
import daal.algorithms.kmeans as kmeans | |
from daal.algorithms.kmeans import init | |
from daal.data_management import HomogenNumericTable | |
from time import time | |
from sys import argv | |
import numpy as np | |
from daal.data_management import HomogenNumericTable, BlockDescriptor_Float64, readOnly | |
def getArrayFromNT(table, nrows=0):
    """Read the leading rows of a numeric table through a Float64 block.

    Args:
        table: Table object exposing ``getNumberOfRows``, ``getBlockOfRows``
            and ``releaseBlockOfRows``.
        nrows: Number of rows to fetch, starting from row 0. The default
            of 0 means "every row the table reports".

    Returns:
        The value returned by the block descriptor's ``getArray()``.
    """
    block = BlockDescriptor_Float64()
    # 0 is the "not specified" sentinel: fall back to the full row count.
    row_count = nrows if nrows != 0 else table.getNumberOfRows()
    table.getBlockOfRows(0, row_count, readOnly, block)
    values = block.getArray()
    # The array is fetched before the block is handed back to the table.
    table.releaseBlockOfRows(block)
    return values
def printNT(table, nrows = 0, message=''):
    """Print ``message`` followed by the leading rows of ``table``.

    Args:
        table: Numeric table handed on to ``getArrayFromNT``.
        nrows: Row limit forwarded to ``getArrayFromNT`` (0 means all rows).
        message: Caption printed ahead of the array.
    """
    rows = getArrayFromNT(table, nrows=nrows)
    # Caption, newline and array go out as three space-separated arguments.
    print(message, '\n', rows)
class KMeans:
    """K-Means clustering built on the DAAL batch K-Means algorithm objects.

    The outcome of the most recent ``compute`` call is stored on the
    instance as ``centroids_``, ``assignments_``, ``goalfunction_`` and
    ``niterations_``; all four are ``None`` until ``compute`` has run.
    """

    def __init__(self, nclusters, randomseed = None):
        """Store the clustering configuration and clear the result slots.

        Args:
            nclusters: Number of clusters.
            randomseed: Integer seed for the random centroid initialization;
                1234 is used when this is None.
        """
        self.nclusters_ = nclusters
        if randomseed is None:
            randomseed = 1234
        self.seed_ = randomseed
        # Result slots, filled in by compute().
        self.centroids_ = self.assignments_ = None
        self.goalfunction_ = self.niterations_ = None

    def compute(self, data, centroids = None, maxiters = 100):
        """Cluster ``data`` and record the results on this instance.

        Args:
            data: Input data to be clustered.
            centroids: Starting centroids supplied by the caller. When None,
                they are produced by ``init.Batch_Float64RandomDense``
                seeded with ``self.seed_``.
            maxiters: Maximum number of iterations passed to the algorithm.
        """
        if centroids is not None:
            self.centroids_ = centroids
        else:
            # No starting point given: run the random initialization first.
            seeder = init.Batch_Float64RandomDense(self.nclusters_)
            seeder.input.set(init.data, data)
            seeder.parameter.seed = self.seed_
            seeded = seeder.compute()
            self.centroids_ = seeded.get(init.centroids)

        # Clustering proper, started from the centroids chosen above.
        lloyd = kmeans.Batch_Float64LloydDense(self.nclusters_, maxiters)
        lloyd.input.set(kmeans.data, data)
        lloyd.input.set(kmeans.inputCentroids, self.centroids_)
        outcome = lloyd.compute()

        # The starting centroids are overwritten by the computed ones.
        self.centroids_ = outcome.get(kmeans.centroids)
        self.assignments_ = outcome.get(kmeans.assignments)
        self.goalfunction_ = outcome.get(kmeans.goalFunction)
        self.niterations_ = outcome.get(kmeans.nIterations)
if __name__ == "__main__":
    # Command-line driver: cluster a space-delimited numeric text file and
    # print the timing, the iteration count and a preview of the results.
    #
    # All three positional arguments are read below, so require all of them
    # up front and exit with a usage line instead of a bare IndexError.
    # An explicit check is used because `assert` is stripped under -O.
    if len(argv) < 4:
        raise SystemExit(
            "usage: {} <dataset> <k> <niters>".format(argv[0]))

    datasetFileName = argv[1]
    nclusters = int(argv[2])
    niter = int(argv[3])

    data = np.genfromtxt(datasetFileName, dtype=np.double, delimiter=' ')
    data_ = HomogenNumericTable(data)

    # The timed region covers object construction and the clustering run.
    start = time()
    clustering = KMeans(nclusters)
    clustering.compute(data_, maxiters=niter)
    print("\nAlg computation time: {} sec\n".format(time()-start))

    # Flat integer vector of cluster labels. `np.int` was only an alias for
    # the builtin `int` and has been removed from NumPy, so use `int`.
    assignments = getArrayFromNT(clustering.assignments_).flatten().astype(int)

    print("Iterations: ", getArrayFromNT(clustering.niterations_))
    printNT(clustering.assignments_, 10, "First 10 cluster assignments:")
    printNT(clustering.centroids_, 10, "First 10 dimensions of centroids:")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment