Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
@germanramos

This comment has been minimized.

Copy link

germanramos commented Jan 19, 2016

Excellent work!!!

I have put your code in a reusable function and added a stop parameter that is used once valid centroid values are reached:

import tensorflow as tf

def kMeansCluster(vector_values, num_clusters, max_num_steps, stop_coeficient = 0.0):
  """Run Lloyd's k-means on a list of 2-D points.

  Args:
    vector_values: list of [x, y] points to cluster (converted to a constant tensor).
    num_clusters: number of centroids k.
    max_num_steps: upper bound on the number of update iterations.
    stop_coeficient: stop early once the total absolute centroid movement
      between two consecutive steps is <= this value. Default 0.0 means
      "stop only when centroids no longer move at all".
      (Name kept as-is, misspelling included, for backward compatibility.)

  Returns:
    (centroid_values, assignment_values): final centroids as a (k, 2) array
    and the per-point cluster index array.
  """
  vectors = tf.constant(vector_values)
  # Initial centroids: k distinct input points, picked via a random shuffle.
  centroids = tf.Variable(tf.slice(tf.random_shuffle(vectors),
                                   [0, 0], [num_clusters, -1]))
  # State used only for the convergence check below.
  old_centroids = tf.Variable(tf.zeros([num_clusters, 2]))
  centroid_distance = tf.Variable(tf.zeros([num_clusters, 2]))

  # Broadcast (1, N, 2) against (k, 1, 2) so distances has shape (k, N).
  expanded_vectors = tf.expand_dims(vectors, 0)
  expanded_centroids = tf.expand_dims(centroids, 1)

  print(expanded_vectors.get_shape())
  print(expanded_centroids.get_shape())

  # tf.sub was renamed tf.subtract in TF 1.0.
  distances = tf.reduce_sum(
    tf.square(tf.subtract(expanded_vectors, expanded_centroids)), 2)
  # For each point, the index of the nearest centroid.
  assignments = tf.argmin(distances, 0)

  # Mean of the points assigned to each cluster.
  # NOTE(review): if a cluster ends up empty, reduce_mean over zero rows
  # yields NaN (see the (1000, 1) report further down this thread).
  # tf.concat takes values first, axis second since TF 1.0.
  means = tf.concat([
    tf.reduce_mean(
        tf.gather(vectors,
                  tf.reshape(
                    tf.where(
                      tf.equal(assignments, c)
                    ), [1, -1])
                 ), axis=[1])
    for c in range(num_clusters)], 0)

  save_old_centroids = tf.assign(old_centroids, centroids)

  update_centroids = tf.assign(centroids, means)
  # tf.initialize_all_variables is the deprecated alias.
  init_op = tf.global_variables_initializer()

  # Total absolute per-coordinate centroid movement since the previous step.
  performance = tf.assign(centroid_distance, tf.subtract(centroids, old_centroids))
  check_stop = tf.reduce_sum(tf.abs(performance))

  with tf.Session() as sess:
    sess.run(init_op)
    for step in range(max_num_steps):
      print("Running step " + str(step))
      sess.run(save_old_centroids)
      _, centroid_values, assignment_values = sess.run([update_centroids,
                                                        centroids,
                                                        assignments])
      sess.run(check_stop)
      current_stop_coeficient = check_stop.eval()
      print("coeficient:", current_stop_coeficient)
      if current_stop_coeficient <= stop_coeficient:
        break

    return centroid_values, assignment_values
@narphorium

This comment has been minimized.

Copy link
Owner Author

narphorium commented Jan 29, 2016

Thanks @germanramos! That looks great.

@vlad17

This comment has been minimized.

Copy link

vlad17 commented Apr 27, 2016

This looks like it serializes the centroids and assignments, copies them from the backend to the python process, and then sends them back to the engine in the next step. Is there any way to avoid this copying without making max_num_steps ops?

@narphorium

This comment has been minimized.

Copy link
Owner Author

narphorium commented Jun 6, 2016

That's a good point @vlad17. You can do iteration in TF with tf.while_loop but it is a bit more advanced.

@amineHorseman

This comment has been minimized.

Copy link

amineHorseman commented Jun 15, 2016

Good tutorial,

We can simplify the code of calculating the means by using tf.boolean_mask instead of tf.reshape(tf.where(..)):

# Per-cluster means: boolean_mask selects the rows of `vectors` assigned to
# cluster c, reduce_mean averages them along axis 0, and the k results are
# packed into a (k, d) tensor. Avoids the tf.reshape(tf.where(...)) indirection.
# NOTE(review): tf.pack/xrange are early-TF / Python-2 spellings — confirm
# against the TF version in use (tf.stack / range on modern stacks).
means = tf.pack([
    tf.reduce_mean(
        tf.boolean_mask(
            vectors, tf.equal(assignments, c)
        ), 0) 
    for c in xrange(num_clusters)])

I think it's more intuitive

@h4p

This comment has been minimized.

Copy link

h4p commented Sep 3, 2016

Hello,

when I input values of shape (1000,1), I'm getting a lot of NaNs in the centroid list.

array([[-0.0615779 ],
       [ 0.        ],
       [-0.01855482],
       [        nan],
       [        nan],
       [        nan],
       [        nan],
       [-0.03768255],
       [ 0.01288017],
       [ 0.01535422],
       [ 0.04958867],
       [        nan],
       [-0.01960552],
       [ 0.09472825],
       [-0.09461572],
       [        nan]]

Basically I want to do the same as this MATLAB code does:

  >> load fisheriris
  >> X = meas(:,3); 
  >> [idx,C] = kmeans(X,3);
  >> size(X) => [150,1]
  >> size(idx) => [150,1]
  >> size(C) => [3,1]

I think there's a problem with the calculation of the means, because this is where the assignment for the centroids comes from, but I'm not sure where the NaNs are coming from. Can somebody please give me a hint on how to fix it? :)

@nickleefly

This comment has been minimized.

Copy link

nickleefly commented Jun 13, 2017

tf.sub needs to change to tf.subtract
and

means = tf.concat(0, [
    tf.reduce_mean(
        tf.gather(vectors,
                  tf.reshape(
                    tf.where(
                      tf.equal(assignments, c)
                    ),[1,-1])
                 ),reduction_indices=[1])
    for c in xrange(num_clusters)])

to

means = tf.concat([
    tf.reduce_mean(
        tf.gather(vectors,
                  tf.reshape(
                    tf.where(
                      tf.equal(assignments, c)
                    ),[1,-1])
                 ),reduction_indices=[1])
    for c in xrange(num_clusters)], 0)
@ghdcjs14

This comment has been minimized.

Copy link

ghdcjs14 commented Nov 12, 2018

Thank you!!
In Python 3, I think it works!

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf

# Generate 2000 synthetic 2-D points drawn from two Gaussian blobs,
# one centred near (0, 0) and one near (3, 1).
num_points = 2000
vectors_set = []

for i in range(num_points):
  if np.random.random() > 0.5:
    vectors_set.append([np.random.normal(0.0, 0.9), np.random.normal(0.0, 0.9)])
  else :
    vectors_set.append([np.random.normal(3.0, 0.5), np.random.normal(1.0, 0.5)])
    
# Scatter-plot the raw data before clustering.
df = pd.DataFrame({"x": [v[0] for v in vectors_set], "y": [v[1] for v in vectors_set]})
sns.lmplot("x","y", data=df, fit_reg=False, size=6)
plt.show()

# k-means algorithm
vectors = tf.constant(vectors_set)
num_clusters = 4
# BUG FIX: the original sliced with an undefined name `k` (NameError);
# the slice size must be num_clusters. Initial centroids are num_clusters
# randomly chosen input points.
centroides = tf.Variable(tf.slice(tf.random_shuffle(vectors),[0,0],[num_clusters,-1]))

# Broadcast (1, N, 2) against (k, 1, 2) so the distances below are (k, N).
expanded_vectors = tf.expand_dims(vectors, 0)
expanded_centroides = tf.expand_dims(centroides, 1)

# For every point, the index of the nearest centroid.
assignments = tf.argmin(tf.reduce_sum(tf.square(tf.subtract(expanded_vectors,expanded_centroides)), 2), 0)

# New centroid positions: mean of the points assigned to each cluster.
means = tf.concat(axis=0, values=[
    tf.reduce_mean(
        tf.gather(vectors, 
                  tf.reshape(
                      tf.where(
                          tf.equal(assignments, c)
                      ), [1,-1])
                 ), axis=[1]) 
    for c in range(num_clusters)])

update_centroides = tf.assign(centroides, means)

# FIX: tf.initialize_all_variables() is the deprecated alias of
# tf.global_variables_initializer() (same behavior in TF 1.x).
init_op = tf.global_variables_initializer()

sess = tf.Session()
sess.run(init_op)

# Fixed number of Lloyd iterations; no convergence test.
for step in range(100):
  _, centroid_values, assignment_values = sess.run([update_centroides, centroides, assignments])
  
# Collect each point's final cluster index for the coloured plot.
data = {"x": [], "y": [], "cluster": []}

for i in range(len(assignment_values)):
  data["x"].append(vectors_set[i][0])
  data["y"].append(vectors_set[i][1])
  data["cluster"].append(assignment_values[i])
  
df = pd.DataFrame(data)
sns.lmplot("x","y",data=df,fit_reg=False, size=6, hue="cluster", legend=False)
plt.show()
@yusinshin

This comment has been minimized.

Copy link

yusinshin commented Mar 20, 2019

In Python 3.6, it still works well. Thank you :D

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf

# Number of synthetic 2-D points to generate.
num_points = 2000
vectors_set = []

# Each point comes from one of two Gaussian blobs with equal probability:
# one centred near (0, 0), the other near (3, 1).
for i in range(num_points):
    if np.random.random() > 0.5:
        vectors_set.append([np.random.normal(0.0, 0.9), np.random.normal(0.0, 0.9)])
    else:
        vectors_set.append([np.random.normal(3.0, 0.5), np.random.normal(1.0, 0.5)])

# Scatter-plot the raw data before clustering.
df = pd.DataFrame({"x": [v[0] for v in vectors_set], "y": [v[1] for v in vectors_set]})
sns.lmplot("x", "y", data=df, fit_reg=False, height=6)
plt.show()

# k-means algorithm
vectors = tf.constant(vectors_set)
num_clusters = 4
# Initial centroids: num_clusters rows sampled from the shuffled data.
centroides = tf.Variable(tf.slice(tf.random_shuffle(vectors), [0, 0], [num_clusters, -1]))

# Broadcast vectors (1, N, 2) against centroids (k, 1, 2) so the squared
# distances below come out with shape (k, N).
expanded_vectors = tf.expand_dims(vectors, 0)
expanded_centroides = tf.expand_dims(centroides, 1)

# For every point, the index of the nearest centroid (argmin over the k axis).
assignments = tf.argmin(tf.reduce_sum(tf.square(tf.subtract(expanded_vectors, expanded_centroides)), 2), 0)

# New centroid positions: mean of the points assigned to each cluster.
# NOTE(review): if a cluster ends up with no points, reduce_mean over zero
# rows yields NaN (reported earlier in this thread for (1000, 1) input).
means = tf.concat(axis=0, values=[
    tf.reduce_mean(
        tf.gather(vectors,
                  tf.reshape(
                      tf.where(
                          tf.equal(assignments, c)
                      ), [1, -1])
                  ), axis=[1])
    for c in range(num_clusters)])

# One Lloyd step: overwrite the centroid variable with the new means.
update_centroides = tf.assign(centroides, means)

init_op = tf.global_variables_initializer()

sess = tf.Session()
sess.run(init_op)

# Fixed number of iterations; no convergence test.
for step in range(100):
    _, centroid_values, assignment_values = sess.run([update_centroides, centroides, assignments])

# Collect each point's final cluster index for the coloured plot.
data = {"x": [], "y": [], "cluster": []}

for i in range(len(assignment_values)):
    data["x"].append(vectors_set[i][0])
    data["y"].append(vectors_set[i][1])
    data["cluster"].append(assignment_values[i])

df = pd.DataFrame(data)
sns.lmplot("x", "y", data=df, fit_reg=False, height=6, hue="cluster", legend=False)
plt.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.