Skip to content

Instantly share code, notes, and snippets.

@karamanbk
Created June 2, 2019 08:31
Show Gist options
  • Save karamanbk/524e438c97f8df1fe46aa32c79611ab4 to your computer and use it in GitHub Desktop.
Save karamanbk/524e438c97f8df1fe46aa32c79611ab4 to your computer and use it in GitHub Desktop.
# Recency: find each customer's most recent InvoiceDate in tx_6m
tx_max_purchase = tx_6m.groupby('CustomerID')['InvoiceDate'].max().reset_index()
tx_max_purchase.columns = ['CustomerID', 'MaxPurchaseDate']
# Recency = whole days between a customer's last purchase and the latest
# last-purchase date across all customers (so the most recent buyer gets 0)
tx_max_purchase['Recency'] = (
    tx_max_purchase['MaxPurchaseDate'].max() - tx_max_purchase['MaxPurchaseDate']
).dt.days
# attach Recency to tx_user; the default inner join keeps only customers present in both frames
tx_user = tx_user.merge(tx_max_purchase[['CustomerID', 'Recency']], on='CustomerID')
# histogram of the Recency values, rendered inline through plotly offline
plot_data = [go.Histogram(x=tx_user['Recency'])]
plot_layout = go.Layout(title='Recency')
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
# split customers into 4 clusters using the single Recency feature;
# fit() returns the fitted estimator, so it can be chained onto the constructor
kmeans = KMeans(n_clusters=4).fit(tx_user[['Recency']])
# label every customer with the cluster its Recency value falls into
tx_user['RecencyCluster'] = kmeans.predict(tx_user[['Recency']])
#order cluster method
def order_cluster(cluster_field_name, target_field_name, df, ascending):
    """Relabel clusters so the labels follow the order of a target column's mean.

    The mean of ``target_field_name`` is computed for every value of
    ``cluster_field_name``. Clusters are sorted by that mean and relabelled
    0, 1, 2, ... in sorted order.

    Args:
        cluster_field_name: Column of ``df`` holding the current cluster labels.
        target_field_name: Column of ``df`` whose per-cluster mean drives the
            ordering.
        df: DataFrame containing both columns. It is not modified.
        ascending: If true, label 0 goes to the cluster with the smallest
            mean; otherwise label 0 goes to the cluster with the largest mean.

    Returns:
        A new DataFrame built by merging ``df`` with the relabelling table.
        The original cluster column is dropped and the relabelled column,
        carrying the same name, becomes the last column.
    """
    # one row per cluster with the mean of the target, sorted in the requested direction
    cluster_means = df.groupby(cluster_field_name)[target_field_name].mean().reset_index()
    cluster_means = cluster_means.sort_values(
        by=target_field_name, ascending=ascending
    ).reset_index(drop=True)

    # The new labels travel through the merge in a temporary column. Its name
    # must not clash with an existing column of df (e.g. one already called
    # 'index'), otherwise the merge would suffix both columns and the rename
    # below would silently match nothing.
    rank_field_name = 'index'
    while rank_field_name in df.columns:
        rank_field_name = '_' + rank_field_name
    # position in the sorted table is the new cluster label
    cluster_means[rank_field_name] = cluster_means.index

    df_final = pd.merge(
        df, cluster_means[[cluster_field_name, rank_field_name]], on=cluster_field_name
    )
    # swap the old labels for the ordered ones under the original column name
    df_final = df_final.drop([cluster_field_name], axis=1)
    df_final = df_final.rename(columns={rank_field_name: cluster_field_name})
    return df_final
# relabel recency clusters by mean Recency, descending:
# label 0 is the cluster with the highest mean Recency
tx_user = order_cluster(
    cluster_field_name='RecencyCluster',
    target_field_name='Recency',
    df=tx_user,
    ascending=False,
)
# summary statistics of Recency within each ordered cluster
tx_user.groupby('RecencyCluster')['Recency'].describe()
# Frequency: number of InvoiceDate entries per customer in tx_6m
tx_frequency = tx_6m.groupby('CustomerID')['InvoiceDate'].count().reset_index()
tx_frequency.columns = ['CustomerID', 'Frequency']
# attach Frequency to tx_user, joining on CustomerID
tx_user = tx_user.merge(tx_frequency, on='CustomerID')
# histogram of Frequency; only rows with Frequency < 1000 are plotted
plot_data = [go.Histogram(x=tx_user.query('Frequency < 1000')['Frequency'])]
plot_layout = go.Layout(title='Frequency')
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
# split customers into 4 clusters using the single Frequency feature
kmeans = KMeans(n_clusters=4).fit(tx_user[['Frequency']])
tx_user['FrequencyCluster'] = kmeans.predict(tx_user[['Frequency']])
# relabel by mean Frequency, ascending: label 0 is the cluster with the lowest mean
tx_user = order_cluster(
    cluster_field_name='FrequencyCluster',
    target_field_name='Frequency',
    df=tx_user,
    ascending=True,
)
# summary statistics of Frequency within each ordered cluster
tx_user.groupby('FrequencyCluster')['Frequency'].describe()
# Monetary value: per-row revenue is unit price times quantity
tx_6m['Revenue'] = tx_6m['UnitPrice'].mul(tx_6m['Quantity'])
# total Revenue per customer
tx_revenue = tx_6m.groupby('CustomerID')['Revenue'].sum().reset_index()
# attach Revenue to tx_user, joining on CustomerID
tx_user = tx_user.merge(tx_revenue, on='CustomerID')
# histogram of Revenue; only rows with Revenue < 10000 are plotted
plot_data = [go.Histogram(x=tx_user.query('Revenue < 10000')['Revenue'])]
plot_layout = go.Layout(title='Monetary Value')
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
# split customers into 4 clusters using the single Revenue feature
kmeans = KMeans(n_clusters=4).fit(tx_user[['Revenue']])
tx_user['RevenueCluster'] = kmeans.predict(tx_user[['Revenue']])
# relabel by mean Revenue, ascending: label 0 is the cluster with the lowest mean
tx_user = order_cluster(
    cluster_field_name='RevenueCluster',
    target_field_name='Revenue',
    df=tx_user,
    ascending=True,
)
# summary statistics of Revenue within each ordered cluster
tx_user.groupby('RevenueCluster')['Revenue'].describe()
# overall score = sum of the three ordered cluster labels
tx_user['OverallScore'] = (
    tx_user['RecencyCluster']
    + tx_user['FrequencyCluster']
    + tx_user['RevenueCluster']
)
# segment names: default is Low-Value, then promote by score.
# Order matters: the > 4 assignment overrides the > 2 one.
tx_user['Segment'] = 'Low-Value'
tx_user.loc[tx_user['OverallScore'].gt(2), 'Segment'] = 'Mid-Value'
tx_user.loc[tx_user['OverallScore'].gt(4), 'Segment'] = 'High-Value'
# scatter of Revenue against Frequency, one trace per segment;
# rows with very large Revenue or Frequency are left out of the plot
tx_graph = tx_user.query("Revenue < 50000 and Frequency < 2000")
# each tuple: (rows of one segment, legend name, marker size, marker color, marker opacity)
plot_data = [
    go.Scatter(
        x=segment_rows['Frequency'],
        y=segment_rows['Revenue'],
        mode='markers',
        name=trace_name,
        marker=dict(
            size=marker_size,
            line=dict(width=1),
            color=marker_color,
            opacity=marker_opacity,
        ),
    )
    for segment_rows, trace_name, marker_size, marker_color, marker_opacity in (
        (tx_graph.query("Segment == 'Low-Value'"), 'Low', 7, 'blue', 0.8),
        (tx_graph.query("Segment == 'Mid-Value'"), 'Mid', 9, 'green', 0.5),
        (tx_graph.query("Segment == 'High-Value'"), 'High', 11, 'red', 0.9),
    )
]
plot_layout = go.Layout(
    yaxis={'title': "Revenue"},
    xaxis={'title': "Frequency"},
    title='Segments',
)
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
# scatter of Revenue against Recency, one trace per segment;
# rows with very large Revenue or Frequency are left out of the plot
tx_graph = tx_user.query("Revenue < 50000 and Frequency < 2000")
# each tuple: (rows of one segment, legend name, marker size, marker color, marker opacity)
plot_data = [
    go.Scatter(
        x=segment_rows['Recency'],
        y=segment_rows['Revenue'],
        mode='markers',
        name=trace_name,
        marker=dict(
            size=marker_size,
            line=dict(width=1),
            color=marker_color,
            opacity=marker_opacity,
        ),
    )
    for segment_rows, trace_name, marker_size, marker_color, marker_opacity in (
        (tx_graph.query("Segment == 'Low-Value'"), 'Low', 7, 'blue', 0.8),
        (tx_graph.query("Segment == 'Mid-Value'"), 'Mid', 9, 'green', 0.5),
        (tx_graph.query("Segment == 'High-Value'"), 'High', 11, 'red', 0.9),
    )
]
plot_layout = go.Layout(
    yaxis={'title': "Revenue"},
    xaxis={'title': "Recency"},
    title='Segments',
)
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
# scatter of Frequency against Recency, one trace per segment;
# rows with very large Revenue or Frequency are left out of the plot
tx_graph = tx_user.query("Revenue < 50000 and Frequency < 2000")
# each tuple: (rows of one segment, legend name, marker size, marker color, marker opacity)
plot_data = [
    go.Scatter(
        x=segment_rows['Recency'],
        y=segment_rows['Frequency'],
        mode='markers',
        name=trace_name,
        marker=dict(
            size=marker_size,
            line=dict(width=1),
            color=marker_color,
            opacity=marker_opacity,
        ),
    )
    for segment_rows, trace_name, marker_size, marker_color, marker_opacity in (
        (tx_graph.query("Segment == 'Low-Value'"), 'Low', 7, 'blue', 0.8),
        (tx_graph.query("Segment == 'Mid-Value'"), 'Mid', 9, 'green', 0.5),
        (tx_graph.query("Segment == 'High-Value'"), 'High', 11, 'red', 0.9),
    )
]
plot_layout = go.Layout(
    yaxis={'title': "Frequency"},
    xaxis={'title': "Recency"},
    title='Segments',
)
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment