K-means Python Implementation from scratch
import random

import numpy as np
from sklearn import datasets


def Kmeans(X, K):
    m = len(X)
    X_centroid = dict.fromkeys(range(m))  # Maps each sample index to its cluster index.
    C = dict.fromkeys(range(K))           # Maps each cluster index to [centroid coordinates, count].

    # 1. Randomly initialize the K centroids by picking K distinct samples.
    rand = random.sample(range(m), K)
    for i in range(K):
        C[i] = [X[rand[i]], 0]

    old_centroids = None  # Cache of the previous centroids, used for early termination.

    # 2. Iterate at most 1000 times.
    for iteration in range(1000):
        # 3. Assign each sample to its nearest centroid. This loop determines the
        #    computational complexity: m * K * d, where d = len(X[0]).
        for i in range(m):
            dist = [np.linalg.norm(X[i] - C[j][0], 2) for j in range(K)]
            X_centroid[i] = np.argmin(dist)

        # 4. Recompute the centroid coordinates. (Notice the data is read only
        #    once per iteration for efficiency.)
        temp_coordinates_counts = [[np.zeros(len(X[0])), 0] for _ in range(K)]
        for i in range(m):
            temp_coordinates_counts[X_centroid[i]][0] += X[i]
            temp_coordinates_counts[X_centroid[i]][1] += 1
        for i in range(K):
            coordinates_sum, count = temp_coordinates_counts[i]
            if count > 0:  # Guard against an empty cluster.
                C[i] = [coordinates_sum / count, count]

        # Early termination: stop once the centroids no longer move.
        new_centroids = np.array([C[i][0] for i in range(K)])
        if old_centroids is not None and np.allclose(old_centroids, new_centroids):
            break
        old_centroids = new_centroids

    return X_centroid, C
if __name__ == '__main__':
    np.random.seed(5)
    random.seed(5)  # Also seed the stdlib random used by Kmeans() for reproducibility.
    iris = datasets.load_iris()
    X = iris.data
    K = 3

    # From scratch
    X_centroid, C = Kmeans(X, K)
    print("From Scratch: ", C)

    # From sklearn
    from sklearn.cluster import KMeans
    kmeans = KMeans(n_clusters=K)
    kmeans = kmeans.fit(X)
    print("From SKlearn: ", kmeans.cluster_centers_)
    # They match (up to cluster ordering); see the check below.
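    # The two sets of centroids agree only up to cluster ordering, so an
    # automated comparison needs to align them first. A minimal sketch of such
    # a check (illustrative, not part of the original gist): sort both centroid
    # sets lexicographically and compare within a tolerance, since each run only
    # reaches a local optimum. The atol value here is an assumption.
    ours = np.array(sorted(C[i][0].tolist() for i in range(K)))
    theirs = np.array(sorted(kmeans.cluster_centers_.tolist()))
    print("Centroids match:", np.allclose(ours, theirs, atol=1e-2))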
"""
<Note : Algorithm Properties>
1. K-means results are sensitive to initialization.
2. Complexity is O( m * K * I * d )
– m = number of points, K = number of clusters, I = number of iterations, d = number of features.
3. Guarantee: Monotonically decreases the objective function ("dist" variable in the code) until convergence.
4. Scalability : Every iteration is linear in the size of its input.
5. K-means is a special case of EM clustering. Think about it!
6. There is no accepted method to discover k!!!
"""