# kylecampbell/k_means_clustering.py

## k_means_clustering.py
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
#uncomment below if using Jupyter
#%config InlineBackend.figure_format = 'retina'

# Load the input table into a module-level DataFrame.
# NOTE(review): '../some/data/path' looks like a placeholder; point it at a
# real CSV file before running.
df = pd.read_csv('../some/data/path')

def k_means_clustering(df_col_1, df_col_2, clusters, steps=100):
    '''
    Run k-means on two dataframe columns and plot the resulting clusters.

    The two columns are paired row by row (by position) into 2-D points,
    shuffled in place, and clustered on a TensorFlow 1.x graph by repeatedly
    assigning every point to its nearest centroid and moving each centroid
    to the mean of its points. The first ``clusters`` shuffled points seed
    the centroids. The result is shown as a seaborn scatter plot coloured
    by cluster id; nothing is returned.

    Parameters
    ----------
    df_col_1, df_col_2 : pandas.Series
        Numeric columns of equal length. Their ``name`` attributes are used
        as the axis labels / column names of the plotted frame.
    clusters : int
        Number of clusters; must be between 1 and the number of rows.
    steps : int, optional
        Number of centroid-update iterations (default 100).

    Raises
    ------
    ValueError
        If the columns are not 1-D, differ in length, or ``clusters`` is
        outside ``1..len(column)``.
    '''
    # Pair the columns by position. Label-based ``col[i]`` lookups fail on
    # any Series whose index is not exactly 0..n-1 (e.g. after filtering).
    # Casting to float keeps centroid means from being integer-truncated.
    xs = np.asarray(df_col_1, dtype=np.float64)
    ys = np.asarray(df_col_2, dtype=np.float64)
    if xs.ndim != 1 or xs.shape != ys.shape:
        raise ValueError('df_col_1 and df_col_2 must be 1-D and of equal length')

    k = int(clusters)
    if not 1 <= k <= xs.shape[0]:
        raise ValueError('clusters must be between 1 and the number of rows')

    points = np.column_stack((xs, ys))  # shape (n, 2)
    np.random.shuffle(points)           # shuffles rows in place

    # A private graph keeps repeated calls from piling ops onto the default
    # graph; the session context manager releases its resources on exit.
    with tf.Graph().as_default(), tf.Session() as sess:
        data = tf.constant(points)

        # random initial centroids (points shuffled above)
        centroids = tf.Variable(data[:k, :])

        # add k dim to data and n dim to centroids to make matrices compatible
        # for array operations instead of loops
        data_expanded = tf.expand_dims(data, 0)            # (1, n, 2)
        centroids_expanded = tf.expand_dims(centroids, 1)  # (k, 1, 2)

        # squared Euclidean distance between every point and every centroid,
        # then the index of the closest centroid for each point
        sq_distances = tf.reduce_sum(
            tf.square(data_expanded - centroids_expanded), 2)  # (k, n)
        allocations = tf.argmin(sq_distances, 0)               # (n,)

        # mean of the points allocated to each centroid, stacked to (k, 2)
        means = tf.concat(
            [tf.reduce_mean(
                tf.gather(data, tf.where(tf.equal(allocations, c))), 0)
             for c in range(k)], 0)

        # The mean over an empty cluster is NaN; keep the previous centroid
        # in that case instead of poisoning every later iteration.
        means = tf.where(tf.is_nan(means), centroids, means)

        update_centroids = tf.assign(centroids, means)

        sess.run(tf.global_variables_initializer())
        for _ in range(steps):
            sess.run(update_centroids)

        # Fetch allocations in their own run so they are computed against
        # the final centroids (and are defined even when steps == 0).
        allocation_values = sess.run(allocations)

    clusters_df = pd.DataFrame({df_col_1.name: points[:, 0],
                                df_col_2.name: points[:, 1],
                                "cluster": allocation_values})
    # x/y are passed by keyword because newer seaborn no longer accepts them
    # positionally.
    # NOTE(review): seaborn >= 0.9 renamed `size` to `height`; `size` is kept
    # here for the seaborn version this script was written against - confirm.
    sns.lmplot(x=df_col_1.name, y=df_col_2.name, data=clusters_df,
               fit_reg=False, size=6, hue="cluster")
    plt.show()


# Example run: cluster columns 'col_name_a' and 'col_name_b' of `df` into 3
# groups and show the plot.
# NOTE(review): the column names look like placeholders; replace with real ones.
k_means_clustering(df['col_name_a'], df['col_name_b'], 3)
	import numpy as np
	import pandas as pd
	import tensorflow as tf
	import matplotlib.pyplot as plt
	import seaborn as sns
	#uncomment below if using Jupyter
	#%config InlineBackend.figure_format = 'retina'

	# Load the input table into a DataFrame named `df`.
	# NOTE(review): '../some/data/path' looks like a placeholder; point it at a
	# real CSV file before running.
	df = pd.read_csv('../some/data/path')

	def k_means_clustering(df_col_1, df_col_2, clusters, steps=100):
		'''
		Run k-means on two dataframe columns and plot the resulting clusters.

		The two columns are paired row by row (by position) into 2-D points,
		shuffled in place, and clustered on a TensorFlow 1.x graph by
		repeatedly assigning every point to its nearest centroid and moving
		each centroid to the mean of its points. The first ``clusters``
		shuffled points seed the centroids. The result is shown as a seaborn
		scatter plot coloured by cluster id; nothing is returned.

		Parameters
		----------
		df_col_1, df_col_2 : pandas.Series
			Numeric columns of equal length. Their ``name`` attributes are
			used as the axis labels / column names of the plotted frame.
		clusters : int
			Number of clusters; must be between 1 and the number of rows.
		steps : int, optional
			Number of centroid-update iterations (default 100).

		Raises
		------
		ValueError
			If the columns are not 1-D, differ in length, or ``clusters`` is
			outside ``1..len(column)``.
		'''
		# Pair the columns by position. Label-based ``col[i]`` lookups fail on
		# any Series whose index is not exactly 0..n-1 (e.g. after filtering).
		# Casting to float keeps centroid means from being integer-truncated.
		xs = np.asarray(df_col_1, dtype=np.float64)
		ys = np.asarray(df_col_2, dtype=np.float64)
		if xs.ndim != 1 or xs.shape != ys.shape:
			raise ValueError('df_col_1 and df_col_2 must be 1-D and of equal length')

		k = int(clusters)
		if not 1 <= k <= xs.shape[0]:
			raise ValueError('clusters must be between 1 and the number of rows')

		points = np.column_stack((xs, ys))  # shape (n, 2)
		np.random.shuffle(points)           # shuffles rows in place

		# A private graph keeps repeated calls from piling ops onto the default
		# graph; the session context manager releases its resources on exit.
		with tf.Graph().as_default(), tf.Session() as sess:
			data = tf.constant(points)

			# random initial centroids (points shuffled above)
			centroids = tf.Variable(data[:k, :])

			# add k dim to data and n dim to centroids to make matrices compatible
			# for array operations instead of loops
			data_expanded = tf.expand_dims(data, 0)            # (1, n, 2)
			centroids_expanded = tf.expand_dims(centroids, 1)  # (k, 1, 2)

			# squared Euclidean distance between every point and every centroid,
			# then the index of the closest centroid for each point
			sq_distances = tf.reduce_sum(
				tf.square(data_expanded - centroids_expanded), 2)  # (k, n)
			allocations = tf.argmin(sq_distances, 0)               # (n,)

			# mean of the points allocated to each centroid, stacked to (k, 2)
			means = tf.concat(
				[tf.reduce_mean(
					tf.gather(data, tf.where(tf.equal(allocations, c))), 0)
				 for c in range(k)], 0)

			# The mean over an empty cluster is NaN; keep the previous centroid
			# in that case instead of poisoning every later iteration.
			means = tf.where(tf.is_nan(means), centroids, means)

			update_centroids = tf.assign(centroids, means)

			sess.run(tf.global_variables_initializer())
			for _ in range(steps):
				sess.run(update_centroids)

			# Fetch allocations in their own run so they are computed against
			# the final centroids (and are defined even when steps == 0).
			allocation_values = sess.run(allocations)

		clusters_df = pd.DataFrame({df_col_1.name: points[:, 0],
									df_col_2.name: points[:, 1],
									"cluster": allocation_values})
		# x/y are passed by keyword because newer seaborn no longer accepts
		# them positionally.
		# NOTE(review): seaborn >= 0.9 renamed `size` to `height`; `size` is kept
		# here for the seaborn version this script was written against - confirm.
		sns.lmplot(x=df_col_1.name, y=df_col_2.name, data=clusters_df,
				   fit_reg=False, size=6, hue="cluster")
		plt.show()


	# Example run: cluster columns 'col_name_a' and 'col_name_b' of `df` into 3
	# groups and show the plot.
	# NOTE(review): the column names look like placeholders; replace with real ones.
	k_means_clustering(df['col_name_a'], df['col_name_b'], 3)