Skip to content

Instantly share code, notes, and snippets.

@VaughnGH
Last active April 26, 2016 18:05
Show Gist options
  • Save VaughnGH/b9bdeee8699111c56af86badc1bdf854 to your computer and use it in GitHub Desktop.
Save VaughnGH/b9bdeee8699111c56af86badc1bdf854 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [
{
"ename": "IndentationError",
"evalue": "expected an indented block (<ipython-input-2-bf5824d58837>, line 44)",
"output_type": "error",
"traceback": [
"\u001b[0;36m File \u001b[0;32m\"<ipython-input-2-bf5824d58837>\"\u001b[0;36m, line \u001b[0;32m44\u001b[0m\n\u001b[0;31m def getCentroids(dataSet, labels, k):\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mIndentationError\u001b[0m\u001b[0;31m:\u001b[0m expected an indented block\n"
]
}
],
"source": [
"##FROM:https://stanford.edu/~cpiech/cs221/handouts/kmeans.html\n",
"# Function: K Means\n",
"\n",
"def kmeans(dataSet, k):\n",
" # Initialize centroids randomly\n",
" numFeatures = dataSet.getNumFeatures()\n",
" centroids = getRandomCentroids(numFeatures, k)\n",
" \n",
" # Initialize bookkeeping variables.\n",
" iterations = 0\n",
" oldCentroids = None\n",
" \n",
" # Run the main k-means algorithm\n",
" while not shouldStop(oldCentroids, centroids, iterations):\n",
" # Save the old centroids for the convergence test (bookkeeping).\n",
" oldCentroids = centroids\n",
" iterations += 1\n",
" \n",
" # Assign labels to each datapoint based on centroids\n",
" labels = getLabels(dataSet, centroids)\n",
" \n",
" # Assign centroids based on datapoint labels\n",
" centroids = getCentroids(dataSet, labels, k)\n",
" \n",
" # We can get the labels too by calling getLabels(dataSet, centroids)\n",
" return centroids\n",
"# Function: Should Stop\n",
"# -------------\n",
"# Returns True if k-means is done and False otherwise. K-means terminates\n",
"# either because it has run a maximum number of iterations OR because the\n",
"# centroids have stopped changing.\n",
"def shouldStop(oldCentroids, centroids, iterations):\n",
" if iterations > MAX_ITERATIONS: return True\n",
" return oldCentroids == centroids\n",
"# Function: Get Labels\n",
"# -------------\n",
"# Returns a label for each piece of data in the dataset. \n",
"def getLabels(dataSet, centroids):\n",
" # For each element in the dataset, choose the closest centroid.\n",
" # Make that centroid the element's label.\n",
"# Function: Get Centroids\n",
"# -------------\n",
"# Returns k centroids, each of dimension n, recomputed from the labeled datapoints.\n",
"def getCentroids(dataSet, labels, k):\n",
" # Each centroid is the arithmetic mean (coordinate-wise average) of the points that\n",
" # have that centroid's label. Important: If a centroid is empty (no points have\n",
" # that centroid's label) you should randomly re-initialize it."
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Initialization complete\n",
"Iteration 0, inertia 15642.000\n",
"Iteration 1, inertia 13282.267\n",
"Converged at iteration 1\n",
"Initialization complete\n",
"Iteration 0, inertia 20262.000\n",
"Iteration 1, inertia 12128.933\n",
"Converged at iteration 1\n",
"Initialization complete\n",
"Iteration 0, inertia 17342.000\n",
"Iteration 1, inertia 14088.333\n",
"Converged at iteration 1\n",
"Initialization complete\n",
"Iteration 0, inertia 13062.000\n",
"Iteration 1, inertia 11285.000\n",
"Converged at iteration 1\n",
"Initialization complete\n",
"Iteration 0, inertia 13062.000\n",
"Iteration 1, inertia 12151.667\n",
"Converged at iteration 1\n",
"Initialization complete\n",
"Iteration 0, inertia 16842.000\n",
"Iteration 1, inertia 13617.333\n",
"Iteration 2, inertia 11285.000\n",
"Converged at iteration 2\n",
"Initialization complete\n",
"Iteration 0, inertia 15562.000\n",
"Iteration 1, inertia 11456.000\n",
"Converged at iteration 1\n",
"Initialization complete\n",
"Iteration 0, inertia 18662.000\n",
"Iteration 1, inertia 12128.933\n",
"Converged at iteration 1\n",
"Initialization complete\n",
"Iteration 0, inertia 13942.000\n",
"Iteration 1, inertia 11345.600\n",
"Converged at iteration 1\n",
"Initialization complete\n",
"Iteration 0, inertia 19162.000\n",
"Iteration 1, inertia 15126.000\n",
"Converged at iteration 1\n"
]
},
{
"data": {
"text/plain": [
"KMeans(copy_x=True, init='k-means++', max_iter=100, n_clusters=8, n_init=10,\n",
" n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,\n",
" verbose=1)"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.cluster import KMeans\n",
"from numpy import array\n",
"X = array([[50, -100], [0, -170], [60, -260], [20, -180], [150, -270], [-50, -220], [100, -270], [40, -100], [70, -180], [50, -80], [-30, 0], [-20, 0], [-20, -70], [40, -120], [-50, 30], [10, -10], [1, -1], [0, -90], [-70, -30], [120, -70]])\n",
"# output dataset (ground-truth labels; KMeans.fit ignores y because k-means is unsupervised)\n",
"y = array( [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]] ) #.T\n",
"cluster = KMeans(max_iter=100, verbose=1)\n",
"cluster.fit(X, y)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Automatically created module for IPython interactive environment\n"
]
}
],
"source": [
"print(__doc__)\n",
"\n",
"\n",
"# Code source: Gaël Varoquaux\n",
"# Modified for documentation by Jaques Grobler\n",
"# License: BSD 3 clause\n",
"\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"from mpl_toolkits.mplot3d import Axes3D\n",
"\n",
"\n",
"from sklearn.cluster import KMeans\n",
"from sklearn import datasets\n",
"\n",
"np.random.seed(5)\n",
"\n",
"centers = [[1, 1], [-1, -1], [1, -1]]\n",
"iris = datasets.load_iris()\n",
"X = iris.data\n",
"y = iris.target\n",
"\n",
"estimators = {'k_means_iris_3': KMeans(n_clusters=3),\n",
" 'k_means_iris_8': KMeans(n_clusters=8),\n",
" 'k_means_iris_bad_init': KMeans(n_clusters=3, n_init=1,\n",
" init='random')}\n",
"\n",
"\n",
"fignum = 1\n",
"for name, est in estimators.items():\n",
" fig = plt.figure(fignum, figsize=(4, 3))\n",
" plt.clf()\n",
" ax = Axes3D(fig, rect=[0, 0, .95, 1], elev=48, azim=134)\n",
"\n",
" plt.cla()\n",
" est.fit(X)\n",
" labels = est.labels_\n",
"\n",
" ax.scatter(X[:, 3], X[:, 0], X[:, 2], c=labels.astype(np.float))\n",
"\n",
" ax.w_xaxis.set_ticklabels([])\n",
" ax.w_yaxis.set_ticklabels([])\n",
" ax.w_zaxis.set_ticklabels([])\n",
" ax.set_xlabel('Petal width')\n",
" ax.set_ylabel('Sepal length')\n",
" ax.set_zlabel('Petal length')\n",
" fignum = fignum + 1\n",
"\n",
"# Plot the ground truth\n",
"fig = plt.figure(fignum, figsize=(4, 3))\n",
"plt.clf()\n",
"ax = Axes3D(fig, rect=[0, 0, .95, 1], elev=48, azim=134)\n",
"\n",
"plt.cla()\n",
"\n",
"for name, label in [('Setosa', 0),\n",
" ('Versicolour', 1),\n",
" ('Virginica', 2)]:\n",
" ax.text3D(X[y == label, 3].mean(),\n",
" X[y == label, 0].mean() + 1.5,\n",
" X[y == label, 2].mean(), name,\n",
" horizontalalignment='center',\n",
" bbox=dict(alpha=.5, edgecolor='w', facecolor='w'))\n",
"# Reorder the labels to have colors matching the cluster results\n",
"y = np.choose(y, [1, 2, 0]).astype(np.float)\n",
"ax.scatter(X[:, 3], X[:, 0], X[:, 2], c=y)\n",
"\n",
"ax.w_xaxis.set_ticklabels([])\n",
"ax.w_yaxis.set_ticklabels([])\n",
"ax.w_zaxis.set_ticklabels([])\n",
"ax.set_xlabel('Petal width')\n",
"ax.set_ylabel('Sepal length')\n",
"ax.set_zlabel('Petal length')\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 80,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"estimators = {'k_means_iris_3': KMeans(n_clusters=3),\n",
" 'k_means_iris_8': KMeans(n_clusters=8),\n",
" 'k_means_iris_bad_init': KMeans(n_clusters=3, n_init=1,\n",
" init='random')}\n",
"[[190 240 140]\n",
" [310 310 140]\n",
" [340 400 140]\n",
" [310 330 150]\n",
" [280 430 160]\n",
" [430 380 160]\n",
" [400 500 230]\n",
" [260 300 200]\n",
" [290 360 180]\n",
" [180 230 150]\n",
" [410 380 380]\n",
" [200 180 180]\n",
" [300 280 210]\n",
" [340 380 260]\n",
" [230 180 210]\n",
" [150 160 150]\n",
" [ 50 51 50]\n",
" [250 250 160]\n",
" [280 210 180]\n",
" [310 430 360]]\n",
"[1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0]\n"
]
}
],
"source": [
"print(__doc__)\n",
"\n",
"\n",
"# Code source: Gaël Varoquaux\n",
"# Modified for documentation by Jaques Grobler\n",
"# License: BSD 3 clause\n",
"\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"from mpl_toolkits.mplot3d import Axes3D\n",
"\n",
"\n",
"from sklearn.cluster import KMeans\n",
"from sklearn import datasets\n",
"\n",
"np.random.seed(5)\n",
"\n",
"centers = [[1, 1], [-1, -1], [1, -1]]\n",
"iris = datasets.load_iris()\n",
"ALT = 1\n",
"DEBUG = 1\n",
"if not ALT:\n",
" X = iris.data\n",
" y = iris.target\n",
"else:\n",
" X = np.array([[190, 240, 140], [310, 310, 140], [340, 400, 140], [310, 330, 150], [280, 430, 160], [430, 380, 160], [400, 500, 230], [260, 300, 200], [290, 360, 180], [180, 230, 150], [410, 380, 380], [200, 180, 180], [300, 280, 210], [340, 380, 260], [230, 180, 210], [150, 160, 150], [50, 51, 50], [250, 250, 160], [280, 210, 180], [310, 430, 360]])\n",
" y = np.array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])\n",
"if DEBUG:\n",
" print X\n",
" print y\n",
"fignum = 1\n",
"'''estimators = {'k_means_iris_3': KMeans(n_clusters=3),\n",
" 'k_means_iris_8': KMeans(n_clusters=8),\n",
" 'k_means_iris_bad_init': KMeans(n_clusters=3, n_init=1,\n",
" init='random')}'''\n",
"estimators = {'k_means_iris_3': KMeans(n_clusters=3)}\n",
"\n",
"for name, est in estimators.items():\n",
" fig = plt.figure(fignum, figsize=(3, 2))\n",
" plt.clf()\n",
" ax = Axes3D(fig)#, rect=[0, 0, .95, 1])#, elev=48, azim=134)\n",
"\n",
" plt.cla()\n",
" est.fit(X)\n",
" labels = est.labels_\n",
"\n",
" ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=labels.astype(np.float))\n",
"\n",
" ax.w_xaxis.set_ticklabels([])\n",
" ax.w_yaxis.set_ticklabels([])\n",
" ax.w_zaxis.set_ticklabels([])\n",
" ax.set_xlabel('First Time')\n",
" ax.set_ylabel('Second Time')\n",
" ax.set_zlabel('Third Time')\n",
" fignum = fignum + 1\n",
" #plt.show()\n",
"\n",
"# Plot the ground truth\n",
"fig = plt.figure(fignum, figsize=(3, 2))\n",
"plt.clf()\n",
"ax = Axes3D(fig)\n",
"\n",
"plt.cla()\n",
"\n",
"for name, label in [('True', 0),\n",
" ('False', 1)]:\n",
" ax.text3D(X[y == label, 1].mean(),\n",
" X[y == label, 0].mean() + 1.5,\n",
" X[y == label, 2].mean(), name,\n",
" horizontalalignment='center',\n",
" bbox=dict(alpha=.5, edgecolor='w', facecolor='w'))\n",
"# Map the labels through np.choose; with the identity mapping [0, 1, 2] they are unchanged and only cast to float\n",
"y = np.choose(y, [0, 1, 2]).astype(np.float)\n",
"ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=y)\n",
"\n",
"ax.w_xaxis.set_ticklabels([])\n",
"ax.w_yaxis.set_ticklabels([])\n",
"ax.w_zaxis.set_ticklabels([])\n",
"ax.set_xlabel('First Time')\n",
"ax.set_ylabel('Second Time')\n",
"ax.set_zlabel('Third Time')\n",
"plt.show()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.11"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment