#create our retention table again with crosstab() and add first purchase year month view
tx_retention = pd.crosstab(tx_user_purchase['CustomerID'], tx_user_purchase['InvoiceYearMonth']).reset_index()
tx_retention = pd.merge(tx_retention,tx_min_purchase[['CustomerID','MinPurchaseYearMonth']],on='CustomerID')
new_column_names = [ 'm_' + str(column) for column in tx_retention.columns[:-1]]
new_column_names.append('MinPurchaseYearMonth')
tx_retention.columns = new_column_names
#create the array of Retained users for each cohort monthly
retention_array = []
for i in range(len(months)):
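    #(the loop body is missing from this gist; the sketch below is a reconstruction:
    # it assumes `months` is the sorted list of InvoiceYearMonth values and counts,
    # for each cohort (first-purchase month), the users still active in later months)
    cohort_month = months[i]
    cohort = tx_retention[tx_retention['MinPurchaseYearMonth'] == cohort_month]
    retention_data = {'CohortMonth': cohort_month, 'TotalUserCount': len(cohort)}
    for next_month in months[i:]:
        retention_data[next_month] = int((cohort['m_' + str(next_month)] > 0).sum())
    retention_array.append(retention_data)
tx_retention_table = pd.DataFrame(retention_array)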
# import libraries
from __future__ import division
from datetime import datetime, timedelta
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.plotly as py
import plotly.offline as pyoff
import plotly.graph_objs as go
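#NOTE: the snippets below assume the transaction data is already loaded; a minimal
#sketch follows (the file name and the UK filter are assumptions, not part of this gist)
tx_data = pd.read_csv('OnlineRetail.csv', encoding='ISO-8859-1')
tx_data['InvoiceDate'] = pd.to_datetime(tx_data['InvoiceDate'])
tx_uk = tx_data.query("Country == 'United Kingdom'").reset_index(drop=True)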
#create a generic user dataframe to keep CustomerID and new segmentation scores
tx_user = pd.DataFrame(tx_data['CustomerID'].unique())
tx_user.columns = ['CustomerID']
#get the max purchase date for each customer and create a dataframe with it
tx_max_purchase = tx_uk.groupby('CustomerID').InvoiceDate.max().reset_index()
tx_max_purchase.columns = ['CustomerID','MaxPurchaseDate']
#we take our observation point as the max invoice date in our dataset
tx_max_purchase['Recency'] = (tx_max_purchase['MaxPurchaseDate'].max() - tx_max_purchase['MaxPurchaseDate']).dt.days
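#the clustering below reads Recency from tx_user, so it has to be merged in first
#(this merge step is not shown in the gist; sketch below)
tx_user = pd.merge(tx_user, tx_max_purchase[['CustomerID','Recency']], on='CustomerID')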
from sklearn.cluster import KMeans
#elbow method: fit k-means for k = 1..9 and record the sum of squared errors (inertia)
sse = {}
tx_recency = tx_user[['Recency']].copy()
for k in range(1, 10):
    kmeans = KMeans(n_clusters=k, max_iter=1000).fit(tx_recency[['Recency']])
    tx_recency['clusters'] = kmeans.labels_
    sse[k] = kmeans.inertia_
plt.figure()
plt.plot(list(sse.keys()), list(sse.values()))
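#label the elbow plot (small addition for readability, not in the original snippet)
plt.xlabel('Number of clusters (k)')
plt.ylabel('SSE (inertia)')
plt.show()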
#build 4 clusters for recency and add it to dataframe
kmeans = KMeans(n_clusters=4)
kmeans.fit(tx_user[['Recency']])
tx_user['RecencyCluster'] = kmeans.predict(tx_user[['Recency']])
#function for ordering cluster numbers (the tail of the function is reconstructed here:
#relabel clusters by the mean of the target field so cluster numbers become ordinal)
def order_cluster(cluster_field_name, target_field_name, df, ascending):
    new_cluster_field_name = 'new_' + cluster_field_name
    df_new = df.groupby(cluster_field_name)[target_field_name].mean().reset_index()
    df_new = df_new.sort_values(by=target_field_name, ascending=ascending).reset_index(drop=True)
    df_new['index'] = df_new.index
    df_final = pd.merge(df, df_new[[cluster_field_name, 'index']], on=cluster_field_name)
    df_final = df_final.drop([cluster_field_name], axis=1)
    return df_final.rename(columns={'index': new_cluster_field_name})
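#usage sketch: relabel the recency clusters so a higher cluster number means a more
#recent (lower) Recency value (the descending order here is an assumption)
tx_user = order_cluster('RecencyCluster', 'Recency', tx_user, False)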
#get order counts for each user and create a dataframe with it
tx_frequency = tx_uk.groupby('CustomerID').InvoiceDate.count().reset_index()
tx_frequency.columns = ['CustomerID','Frequency']
#add this data to our main dataframe
tx_user = pd.merge(tx_user, tx_frequency, on='CustomerID')
#plot the histogram of Frequency (the original call was truncated; completed as a
#plain offline plotly histogram)
plot_data = [go.Histogram(x=tx_user['Frequency'])]
fig = go.Figure(data=plot_data, layout=go.Layout(title='Frequency'))
pyoff.iplot(fig)
#build 4 clusters for frequency and add it to dataframe
kmeans = KMeans(n_clusters=4)
kmeans.fit(tx_user[['Frequency']])
tx_user['FrequencyCluster'] = kmeans.predict(tx_user[['Frequency']])
#order the frequency cluster
tx_user = order_cluster('FrequencyCluster', 'Frequency',tx_user,True)
#see details of each cluster
tx_user.groupby('FrequencyCluster')['Frequency'].describe()
#calculate revenue for each customer
tx_uk['Revenue'] = tx_uk['UnitPrice'] * tx_uk['Quantity']
tx_revenue = tx_uk.groupby('CustomerID').Revenue.sum().reset_index()
#merge it with our main dataframe
tx_user = pd.merge(tx_user, tx_revenue, on='CustomerID')
#plot the histogram of Revenue (the original call was truncated; completed as a
#plain offline plotly histogram)
plot_data = [go.Histogram(x=tx_user['Revenue'])]
fig = go.Figure(data=plot_data, layout=go.Layout(title='Revenue'))
pyoff.iplot(fig)
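#by analogy with the Frequency step above, revenue clusters can be built the same way
#(sketch; this step is not shown in the original snippet)
kmeans = KMeans(n_clusters=4)
kmeans.fit(tx_user[['Revenue']])
tx_user['RevenueCluster'] = kmeans.predict(tx_user[['Revenue']])
tx_user = order_cluster('RevenueCluster', 'Revenue', tx_user, True)
tx_user.groupby('RevenueCluster')['Revenue'].describe()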