# alitrack/customer-segmentation.py

## customer-segmentation.py
# Customer segmentation of the WineKMC workbook: cluster customers with
# k-means on the offers they responded to, then project the same features
# onto two principal components for plotting.
import pandas as pd
# Wildcard import kept because the plotting code below uses ggplot's bare
# names (ggplot, aes, geom_bar, geom_point, xlab, ylab, ggtitle).
from ggplot import *
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# Source workbook: http://blog.yhathq.com/static/misc/data/WineKMC.xlsx
DATA_PATH = "./WineKMC.xlsx"

# First sheet: the offers. `sheet_name` replaces the `sheetname` keyword,
# which pandas deprecated in 0.21 and removed in 1.0.
df_offers = pd.read_excel(DATA_PATH, sheet_name=0)
df_offers.columns = [
    "offer_id",
    "campaign",
    "varietal",
    "min_qty",
    "discount",
    "origin",
    "past_peak",
]
df_offers.head()

# Second sheet: the transactions, renamed to (customer_name, offer_id).
df_transactions = pd.read_excel(DATA_PATH, sheet_name=1)
df_transactions.columns = ["customer_name", "offer_id"]
# Constant marker column so the pivot below has a value to aggregate.
df_transactions["n"] = 1
df_transactions.head()

# Join the offers and transactions tables on their only shared column.
df = pd.merge(df_offers, df_transactions, on="offer_id")
# Customer x offer table. `n` is always 1 and pivot_table aggregates with the
# mean by default, so a cell is 1 where the customer responded to the offer
# and NaN otherwise.
matrix = df.pivot_table(index=["customer_name"], columns=["offer_id"], values="n")
# A little tidying up: fill NA values with 0 and make the index into a column.
matrix = matrix.fillna(0).reset_index()
# Everything after the leading customer_name column is an offer indicator.
x_cols = matrix.columns[1:]

# The seed is fixed so cluster numbering is repeatable between runs; the
# `cluster == 4` comparison at the end depends on that numbering.
cluster = KMeans(n_clusters=5, random_state=0)
# Slice matrix so only the 0/1 indicator columns are used in the clustering.
matrix["cluster"] = cluster.fit_predict(matrix[x_cols])
matrix["cluster"].value_counts()

(
    ggplot(matrix, aes(x='factor(cluster)'))
    + geom_bar()
    + xlab("Cluster")
    + ylab("Customers\n(# in cluster)")
)

# Fit the projection once and take both components from the same result,
# rather than refitting for each coordinate.
pca = PCA(n_components=2)
components = pca.fit_transform(matrix[x_cols])
matrix["x"] = components[:, 0]
matrix["y"] = components[:, 1]
# Kept from the original script; this adds the old integer index as a column.
matrix = matrix.reset_index()

customer_clusters = matrix[["customer_name", "cluster", "x", "y"]]
customer_clusters.head()

# Attach each customer's cluster and coordinates to their transactions, then
# bring the offer attributes back in.
df = pd.merge(df_transactions, customer_clusters, on="customer_name")
df = pd.merge(df_offers, df, on="offer_id")

(
    ggplot(df, aes(x='x', y='y', color='cluster'))
    + geom_point(size=75)
    + ggtitle("Customers Grouped by Cluster")
)

# Project the k-means centroids with the already-fitted PCA so they share
# the coordinates of the customer points.
cluster_centers = pca.transform(cluster.cluster_centers_)
cluster_centers = pd.DataFrame(cluster_centers, columns=["x", "y"])
cluster_centers["cluster"] = range(len(cluster_centers))

(
    ggplot(df, aes(x='x', y='y', color='cluster'))
    + geom_point(size=75)
    + geom_point(cluster_centers, size=500)
    + ggtitle("Customers Grouped by Cluster")
)

# Compare cluster 4 against everyone else.
df["is_4"] = df["cluster"] == 4
df.groupby("is_4").varietal.value_counts()
df.groupby("is_4")[["min_qty", "discount"]].mean()
	# Customer segmentation of the WineKMC workbook: cluster customers with
	# k-means on the offers they responded to, then project the same features
	# onto two principal components for plotting.
	import pandas as pd
	# Wildcard import kept because the plotting code below uses ggplot's bare
	# names (ggplot, aes, geom_bar, geom_point, xlab, ylab, ggtitle).
	from ggplot import *
	from sklearn.cluster import KMeans
	from sklearn.decomposition import PCA

	# Source workbook: http://blog.yhathq.com/static/misc/data/WineKMC.xlsx
	DATA_PATH = "./WineKMC.xlsx"

	# First sheet: the offers. `sheet_name` replaces the `sheetname` keyword,
	# which pandas deprecated in 0.21 and removed in 1.0.
	df_offers = pd.read_excel(DATA_PATH, sheet_name=0)
	df_offers.columns = [
		"offer_id",
		"campaign",
		"varietal",
		"min_qty",
		"discount",
		"origin",
		"past_peak",
	]
	df_offers.head()

	# Second sheet: the transactions, renamed to (customer_name, offer_id).
	df_transactions = pd.read_excel(DATA_PATH, sheet_name=1)
	df_transactions.columns = ["customer_name", "offer_id"]
	# Constant marker column so the pivot below has a value to aggregate.
	df_transactions["n"] = 1
	df_transactions.head()

	# Join the offers and transactions tables on their only shared column.
	df = pd.merge(df_offers, df_transactions, on="offer_id")
	# Customer x offer table. `n` is always 1 and pivot_table aggregates with the
	# mean by default, so a cell is 1 where the customer responded to the offer
	# and NaN otherwise.
	matrix = df.pivot_table(index=["customer_name"], columns=["offer_id"], values="n")
	# A little tidying up: fill NA values with 0 and make the index into a column.
	matrix = matrix.fillna(0).reset_index()
	# Everything after the leading customer_name column is an offer indicator.
	x_cols = matrix.columns[1:]

	# The seed is fixed so cluster numbering is repeatable between runs; the
	# `cluster == 4` comparison at the end depends on that numbering.
	cluster = KMeans(n_clusters=5, random_state=0)
	# Slice matrix so only the 0/1 indicator columns are used in the clustering.
	matrix["cluster"] = cluster.fit_predict(matrix[x_cols])
	matrix["cluster"].value_counts()

	(
		ggplot(matrix, aes(x='factor(cluster)'))
		+ geom_bar()
		+ xlab("Cluster")
		+ ylab("Customers\n(# in cluster)")
	)

	# Fit the projection once and take both components from the same result,
	# rather than refitting for each coordinate.
	pca = PCA(n_components=2)
	components = pca.fit_transform(matrix[x_cols])
	matrix["x"] = components[:, 0]
	matrix["y"] = components[:, 1]
	# Kept from the original script; this adds the old integer index as a column.
	matrix = matrix.reset_index()

	customer_clusters = matrix[["customer_name", "cluster", "x", "y"]]
	customer_clusters.head()

	# Attach each customer's cluster and coordinates to their transactions, then
	# bring the offer attributes back in.
	df = pd.merge(df_transactions, customer_clusters, on="customer_name")
	df = pd.merge(df_offers, df, on="offer_id")

	(
		ggplot(df, aes(x='x', y='y', color='cluster'))
		+ geom_point(size=75)
		+ ggtitle("Customers Grouped by Cluster")
	)

	# Project the k-means centroids with the already-fitted PCA so they share
	# the coordinates of the customer points.
	cluster_centers = pca.transform(cluster.cluster_centers_)
	cluster_centers = pd.DataFrame(cluster_centers, columns=["x", "y"])
	cluster_centers["cluster"] = range(len(cluster_centers))

	(
		ggplot(df, aes(x='x', y='y', color='cluster'))
		+ geom_point(size=75)
		+ geom_point(cluster_centers, size=500)
		+ ggtitle("Customers Grouped by Cluster")
	)

	# Compare cluster 4 against everyone else.
	df["is_4"] = df["cluster"] == 4
	df.groupby("is_4").varietal.value_counts()
	df.groupby("is_4")[["min_qty", "discount"]].mean()