# disa-mhembere/kmeans_intel_daal2.py

## kmeans_intel_daal2.py
""" A class for K-Means clustering """

"""
Adapted from Zhang Zhang zhang.zhang@intel.com

https://github.com/daaltces/pydaal-tutorials/blob/master/kmeans.py
https://github.com/daaltces/pydaal-tutorials/blob/master/kmeans_example.ipynb
"""

import daal.algorithms.kmeans as kmeans
from daal.algorithms.kmeans import init
from daal.data_management import HomogenNumericTable
from time import time
from sys import argv

import numpy as np
from daal.data_management import HomogenNumericTable, BlockDescriptor_Float64, readOnly

def getArrayFromNT(table, nrows=0):
    """Return the leading rows of a numeric table as an array

    Args:
       table: Table object providing getNumberOfRows, getBlockOfRows and
           releaseBlockOfRows
       nrows: Number of rows to read, starting at row 0. The default of 0
           means all rows, i.e. table.getNumberOfRows()

    Returns:
       The value of getArray() on the Float64 block descriptor filled
       from the requested rows
    """
    if nrows == 0:
        nrows = table.getNumberOfRows()

    bd = BlockDescriptor_Float64()
    table.getBlockOfRows(0, nrows, readOnly, bd)
    try:
        return bd.getArray()
    finally:
        # Hand the block back to the table even when getArray() raises, so
        # a failed read does not leave the block acquired.
        table.releaseBlockOfRows(bd)

def printNT(table, nrows=0, message=''):
    """Print a message followed by the leading rows of a numeric table

    Args:
       table: Table handed to getArrayFromNT
       nrows: Row count forwarded unchanged to getArrayFromNT
       message: Text printed ahead of the rows
    """
    rows = getArrayFromNT(table, nrows)
    print(message, '\n', rows)

class KMeans:
    """K-Means clustering driver built on the DAAL batch algorithms

    The outcome of the latest compute() call is kept on the instance in
    centroids_, assignments_, goalfunction_ and niterations_.
    """

    def __init__(self, nclusters, randomseed=None):
        """Store the clustering configuration and reset the result slots

        Args:
           nclusters: Number of clusters
           randomseed: Integer seed for the random centroid initialization;
               1234 is used when None is given
        """
        self.nclusters_ = nclusters

        if randomseed is None:
            randomseed = 1234
        self.seed_ = randomseed

        # Nothing has been computed yet
        self.centroids_ = self.assignments_ = None
        self.goalfunction_ = self.niterations_ = None

    def compute(self, data, centroids=None, maxiters=100):
        """Cluster the input data and keep the results on the instance

        Args:
           data: Input data to be clustered
           centroids: Starting centroids supplied by the caller. When None,
               they are produced by the random initialization algorithm
               seeded with the stored seed
           maxiters: The maximum number of iterations
        """
        if centroids is not None:
            self.centroids_ = centroids
        else:
            # No starting point given: run the random dense initializer
            seeder = init.Batch_Float64RandomDense(self.nclusters_)
            seeder.input.set(init.data, data)
            seeder.parameter.seed = self.seed_
            seeding = seeder.compute()
            self.centroids_ = seeding.get(init.centroids)

        # Lloyd clustering, started from the centroids selected above
        lloyd = kmeans.Batch_Float64LloydDense(self.nclusters_, maxiters)
        lloyd.input.set(kmeans.data, data)
        lloyd.input.set(kmeans.inputCentroids, self.centroids_)

        outcome = lloyd.compute()
        self.centroids_ = outcome.get(kmeans.centroids)
        self.assignments_ = outcome.get(kmeans.assignments)
        self.goalfunction_ = outcome.get(kmeans.goalFunction)
        self.niterations_ = outcome.get(kmeans.nIterations)

if __name__ == "__main__":
    # Usage: <script> <dataset> <k> <niters>
    # All three arguments are read below, so require all of them up front
    # rather than failing later with an IndexError. An explicit check is
    # used because assert statements are skipped under `python -O`.
    if len(argv) < 4:
        raise SystemExit("<dataset> <k> <niters>")
    datasetFileName = argv[1]
    nclusters = int(argv[2])
    niter = int(argv[3])

    # Load the space-delimited dataset and wrap it in a numeric table
    data = np.genfromtxt(datasetFileName, dtype=np.double, delimiter=' ')
    data_ = HomogenNumericTable(data)
    start = time()

    clustering = KMeans(nclusters)
    clustering.compute(data_, maxiters=niter)
    print("\nAlg computation time: {} sec\n".format(time()-start))

    # np.int was removed in NumPy 1.24; the builtin int it aliased is
    # used instead.
    assignments = getArrayFromNT(clustering.assignments_).flatten().astype(int)
    print("Iterations: ", getArrayFromNT(clustering.niterations_))
    # The second argument limits how many rows of each table are printed
    printNT(clustering.assignments_, 10, "First 10 cluster assignments:")
    printNT(clustering.centroids_, 10, "First 10 dimensions of centroids:")
	""" A class for K-Means clustering """

	"""
	Adapted from Zhang Zhang zhang.zhang@intel.com

	https://github.com/daaltces/pydaal-tutorials/blob/master/kmeans.py
	https://github.com/daaltces/pydaal-tutorials/blob/master/kmeans_example.ipynb
	"""

	import daal.algorithms.kmeans as kmeans
	from daal.algorithms.kmeans import init
	from daal.data_management import HomogenNumericTable
	from time import time
	from sys import argv

	import numpy as np
	from daal.data_management import HomogenNumericTable, BlockDescriptor_Float64, readOnly

	def getArrayFromNT(table, nrows=0):
		"""Return the leading rows of a numeric table as an array

		Args:
		   table: Table object providing getNumberOfRows, getBlockOfRows and
		       releaseBlockOfRows
		   nrows: Number of rows to read, starting at row 0. The default of 0
		       means all rows, i.e. table.getNumberOfRows()

		Returns:
		   The value of getArray() on the Float64 block descriptor filled
		   from the requested rows
		"""
		if nrows == 0:
			nrows = table.getNumberOfRows()

		bd = BlockDescriptor_Float64()
		table.getBlockOfRows(0, nrows, readOnly, bd)
		try:
			return bd.getArray()
		finally:
			# Hand the block back to the table even when getArray() raises,
			# so a failed read does not leave the block acquired.
			table.releaseBlockOfRows(bd)

	def printNT(table, nrows = 0, message=''):
		"""Print a message followed by the leading rows of a numeric table

		Args:
		   table: Table handed to getArrayFromNT
		   nrows: Row count forwarded unchanged to getArrayFromNT
		   message: Text printed ahead of the rows
		"""
		npa = getArrayFromNT(table, nrows)
		print(message, '\n', npa)

	class KMeans:
		"""K-Means clustering driver built on the DAAL batch algorithms

		The outcome of the latest compute() call is kept on the instance in
		centroids_, assignments_, goalfunction_ and niterations_.
		"""

		def __init__(self, nclusters, randomseed = None):
			"""Initialize class parameters

			Args:
			   nclusters: Number of clusters
			   randomseed: An integer used to seed the random number
			       generator; 1234 is used when None is given
			"""
			self.nclusters_ = nclusters
			self.seed_ = 1234 if randomseed is None else randomseed
			self.centroids_ = None
			self.assignments_ = None
			self.goalfunction_ = None
			self.niterations_ = None

		def compute(self, data, centroids = None, maxiters = 100):
			"""Compute K-Means clustering for the input data

			Args:
			   data: Input data to be clustered
			   centroids: User defined input centroids. If None then initial
			       centroids will be randomly chosen
			   maxiters: The maximum number of iterations
			"""
			if centroids is None:
				# Create an algorithm object for centroids initialization
				init_alg = init.Batch_Float64RandomDense(self.nclusters_)
				# Set input
				init_alg.input.set(init.data, data)
				# Set parameters
				init_alg.parameter.seed = self.seed_
				# Compute initial centroids
				self.centroids_ = init_alg.compute().get(init.centroids)
			else:
				self.centroids_ = centroids

			# Create an algorithm object for clustering
			clustering_alg = kmeans.Batch_Float64LloydDense(
				self.nclusters_,
				maxiters)
			# Set input
			clustering_alg.input.set(kmeans.data, data)
			clustering_alg.input.set(kmeans.inputCentroids, self.centroids_)
			# Run the clustering and keep every result table on the instance
			result = clustering_alg.compute()
			self.centroids_ = result.get(kmeans.centroids)
			self.assignments_ = result.get(kmeans.assignments)
			self.goalfunction_ = result.get(kmeans.goalFunction)
			self.niterations_ = result.get(kmeans.nIterations)

	if __name__ == "__main__":
		# Usage: <script> <dataset> <k> <niters>
		# All three arguments are read below, so require all of them up front
		# rather than failing later with an IndexError. An explicit check is
		# used because assert statements are skipped under `python -O`.
		if len(argv) < 4:
			raise SystemExit("<dataset> <k> <niters>")
		datasetFileName = argv[1]
		nclusters = int(argv[2])
		niter = int(argv[3])

		# Load the space-delimited dataset and wrap it in a numeric table
		data = np.genfromtxt(datasetFileName, dtype = np.double, delimiter=' ')
		data_ = HomogenNumericTable(data)
		start = time()

		clustering = KMeans(nclusters)
		clustering.compute(data_, maxiters=niter)
		print("\nAlg computation time: {} sec\n".format(time()-start))

		# np.int was removed in NumPy 1.24; the builtin int it aliased is
		# used instead.
		assignments = getArrayFromNT(clustering.assignments_).flatten().astype(int)
		print("Iterations: ", getArrayFromNT(clustering.niterations_))
		# The second argument limits how many rows of each table are printed
		printNT(clustering.assignments_, 10, "First 10 cluster assignments:")
		printNT(clustering.centroids_, 10, "First 10 dimensions of centroids:")