Skip to content

Instantly share code, notes, and snippets.

@digitalWestie
Last active October 30, 2019 12:00
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save digitalWestie/ea468aa340722c5b985eba17d21e27a3 to your computer and use it in GitHub Desktop.
Save digitalWestie/ea468aa340722c5b985eba17d21e27a3 to your computer and use it in GitHub Desktop.
Basic clustering for UrbanTide analytics
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
import pandas as pd
import io
import requests
from sklearn.cluster import KMeans
# Download the dataset.
# A timeout keeps a stalled connection from hanging the run, and
# raise_for_status() stops an HTTP error page from being parsed as if it
# were the CSV payload.
url = "https://gist.githubusercontent.com/digitalWestie/b68b86cae1d893d4d3d3b01aca59be8d/raw/28908e0d394802181762dc7429f67c0f79fb9fad/Make%2520Model%2520Data%25202016-edited.csv"
response = requests.get(url, timeout=30)
response.raise_for_status()
s = response.content
# The raw bytes are decoded as windows-1252 before being handed to pandas.
dataset = pd.read_csv(io.StringIO(s.decode('windows-1252')))
dataset.iloc[:3, :]  # preview of the first three rows (displayed in a notebook)

# Only include specified columns:
subset = dataset.loc[:, ['Label', 'Engine, Noise and Exhaust %', 'Chassis and Body %']]
subset

# Discard the label column (we only want to feed numeric values to the algorithm).
subset_data = subset.iloc[:, 1:]

# Run clustering (4 clusters).
# random_state pins the centroid initialisation so cluster ids, plot colours
# and the crosstab columns are the same on every run; n_init is spelled out
# because its default differs between scikit-learn releases.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=0)
# fit_predict fits the estimator in place and returns each row's cluster index.
y_kmeans = kmeans.fit_predict(subset_data)
y_kmeans

# Draw graph of clusters: one point per row, coloured by assigned cluster,
# with the cluster centres overlaid as larger translucent black markers.
from matplotlib import pyplot as plt
plt.scatter(subset_data.iloc[:, 0], subset_data.iloc[:, 1], c=y_kmeans, s=50, cmap='viridis')
centers = kmeans.cluster_centers_
plt.scatter(centers[:, 0], centers[:, 1], c='black', s=200, alpha=0.5);

# Combine labels with groups: cross-tabulate the 'Label' column against the
# cluster index assigned to each row.
result = pd.crosstab(subset.iloc[:, 0], y_kmeans)
result
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>X</th>\n",
" <th>y</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>5.910131</td>\n",
" <td>4.714615</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2.500393</td>\n",
" <td>2.076238</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3.946845</td>\n",
" <td>2.548811</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>7.102233</td>\n",
" <td>4.615368</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>6.168895</td>\n",
" <td>3.264107</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" X y\n",
"0 5.910131 4.714615\n",
"1 2.500393 2.076238\n",
"2 3.946845 2.548811\n",
"3 7.102233 4.615368\n",
"4 6.168895 3.264107"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"from matplotlib import pyplot as plt\n",
"\n",
"# Generate 'random' data\n",
"np.random.seed(0)\n",
"X = 2.5 * np.random.randn(100) + 1.5 # Array of 100 values with mean = 1.5, stddev = 2.5\n",
"res = 0.5 * np.random.randn(100) # Generate 100 residual terms\n",
"y = 2 + 0.3 * X + res # Actual values of Y\n",
"\n",
"# Create pandas dataframe to store our X and y values\n",
"df = pd.DataFrame(\n",
" {'X': X,\n",
" 'y': y}\n",
")\n",
"\n",
"# Show the first five rows of our dataframe\n",
"df.head()\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"alpha = 2.0031670124623426\n",
"beta = 0.3229396867092763\n"
]
}
],
"source": [
"# Calculate the mean of X and y\n",
"xmean = np.mean(X)\n",
"ymean = np.mean(y)\n",
"\n",
"# Calculate the terms needed for the numerator and denominator of beta\n",
"df['xycov'] = (df['X'] - xmean) * (df['y'] - ymean)\n",
"df['xvar'] = (df['X'] - xmean)**2\n",
"\n",
"# Calculate beta and alpha\n",
"beta = df['xycov'].sum() / df['xvar'].sum()\n",
"alpha = ymean - (beta * xmean)\n",
"print(f'alpha = {alpha}')\n",
"print(f'beta = {beta}')\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"#Great, we now have an estimate for alpha and beta! \n",
"#Our model can be written as Yₑ = 2.003 + 0.323 X, and we can make predictions:\n",
"ypred = alpha + beta * X\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 864x432 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"# Plot regression against actual data\n",
"plt.figure(figsize=(12, 6))\n",
"plt.plot(X, ypred) # regression line\n",
"plt.plot(X, y, 'ro') # scatter plot showing actual data\n",
"plt.title('Actual vs Predicted')\n",
"plt.xlabel('X')\n",
"plt.ylabel('y')\n",
"\n",
"plt.show()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.7"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment