#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# In[1]:
get_ipython().magic('matplotlib inline')
import pystan # ver. 2.9.0.0
import numpy as np # ver.1.11.0
import pandas as pd # ver. 0.18.0 # xlrd and numexpr modules needed
import matplotlib.pyplot as plt # libpng*-dev, libjpeg*-dev, freetype6-dev must be installed beforehand
import matplotlib.dates as mdates # date histogram
from datetime import datetime
# http://stackoverflow.com/questions/9829175/pip-install-matplotlib-error-with-virtualenv
directory='/FOO/20160406_RFM/' # working directory
# In[2]:
# read the xlsx file
data = pd.read_excel(io=directory + 'Online Retail.xlsx',
                     converters={'InvoiceNo': str, 'StockCode': str, 'Description': str,
                                 'CustomerID': str, 'Country': str})
# In[3]:
data.info()
# In[4]:
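# drop rows where CustomerID, InvoiceNo, or StockCode is missing
# (x == x evaluates to False for NaN, so the comparison works as a not-null filter)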
data = data.loc[(data.CustomerID == data.CustomerID) & (data.InvoiceNo == data.InvoiceNo) & (data.StockCode == data.StockCode) ]
data.shape
# In[5]:
data['Sale'] = data.Quantity * data.UnitPrice
data['Date'] = data.InvoiceDate.map(pd.Timestamp.date)
data['Time'] = data.InvoiceDate.map(pd.Timestamp.time)
# Status == 0: Ordered, 1: Canceled
data['Status'] = data.InvoiceNo.str.contains('^C') * 1
# In[6]:
data_adj = data
data_adj = data_adj.sort_values(['CustomerID', 'StockCode', 'Date', 'Status'])
data_adj = data_adj.reset_index()
data_adj['GrpKey'] = data_adj.CustomerID + '_' + data_adj.StockCode
temp_dt = float('nan')
data_adj['Lag_Date'] = float('nan')
for i, row in data_adj.iterrows():
    # reset the carried cancellation date when moving to a new customer-item group
    if i != 0 and row.GrpKey != l_row.GrpKey:
        temp_dt = float('nan')
    if row.Status == 1:
        # cancellation: remember its date for the following order rows
        temp_dt = row.Date
    else:
        # order: record the date of the preceding cancellation (if any)
        data_adj.loc[i, 'Lag_Date'] = temp_dt
    l_row = row
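# collapse to net daily sales per customer and item, keeping only days with positive net spend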
data_adj = (data_adj.groupby(['CustomerID', 'StockCode', 'Date'])
            .agg({'Sale': np.sum})
            .reset_index()
            .query('Sale > 0'))
# In[7]:
data_adj = data_adj.query('Sale > 1 ')
data_adj.describe(include='all')
# In[12]:
# survival span
span = data_adj.groupby('CustomerID').agg({'Date': [np.max, np.min]})
span.columns = ['Latest','Oldest']
span['End'] = pd.to_datetime('2011-12-09')
span['Oldest'] = pd.to_datetime(span.Oldest)
span['Latest'] = pd.to_datetime(span.Latest)
span['Time'] = (span.End - span.Oldest).astype('timedelta64[D]') / 365
span['time'] = (span.Latest - span.Oldest).astype('timedelta64[D]') / 365
span['Recency'] = (span.End - span.Latest).astype('timedelta64[D]') / 365
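# Time   : years from first purchase to the end of the observation window (customer age)
# time   : years from first purchase to the most recent purchase
# Recency: years from the most recent purchase to the end of the observation window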
# In[13]:
# frequency
freq = data_adj.groupby('CustomerID').agg({'Date': pd.Series.nunique})
freq.columns = ['Count']
freq = freq.query('Count >= 1 & Count == Count')
freq['Freq'] = freq.Count - 1
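# Count = number of distinct purchase dates; Freq = number of repeat purchases (first purchase excluded)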
# In[14]:
# monetary
# for description
money_disc = data_adj.groupby('CustomerID').agg({'Sale': np.sum}).query('Sale != 0')
money_disc.columns = ['Monetary']
# exclude refund
money_disc2 = data.query('Status == 0').groupby('CustomerID').agg({'Sale': np.sum})
money_disc2.columns = ['Monetary_no_refund']
money_disc = pd.concat([money_disc, money_disc2], axis=1)
# In[15]:
# merge 3 measures
data2 = pd.concat([span, freq, money_disc], axis=1)
data2['Monetary'] = data2.Monetary/data2.Count
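# Monetary becomes the average spend per purchase date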
data2 = data2.query('Monetary == Monetary')
data2 = data2.sort_index()
data2.query('Recency != Recency | Freq != Freq | Monetary != Monetary')
# In[16]:
# keep customers whose first purchase falls between 2010-12-01 and 2010-12-31
data2 = data2[ (pd.to_datetime("2010-12-01") <= data2.Oldest) & (data2.Oldest <= pd.to_datetime("2010-12-31") )]
data2.describe(include='all')
# In[17]:
# histogram
plt.figure(num=None, figsize=(16, 12), dpi=80, facecolor='w', edgecolor='k')
fig = plt.figure(1)
fig1 = fig.add_subplot(4,2,1)
fig1.set_title("Recency")
fig1 = plt.hist(x = data2.Recency, bins=50)
fig2 = fig.add_subplot(4,2,2)
fig2.set_title("T")
fig2 = plt.hist(x = data2.Time, bins=50)
fig6 = fig.add_subplot(4,2,3)
fig6.set_title("First purchasing")
histdata = mdates.date2num(data2.Oldest.astype(datetime))
fig6.xaxis.set_major_locator(mdates.YearLocator())
fig6.xaxis.set_major_formatter(mdates.DateFormatter('%d/%m/%y'))
fig6.hist(x = histdata, bins=50)
fig3 = fig.add_subplot(4,2,5)
fig3.set_title("Frequency")
fig3 = plt.hist(x = data2.Count, bins=50)
fig4 = fig.add_subplot(4,2,7)
fig4.set_title("log (Monetary)")
fig4 = plt.hist(x = np.log(data2.Monetary), bins=50)
fig5 = fig.add_subplot(4,2,8)
fig5.set_title("log (Monetary), no refund")
fig5 = plt.hist(x = np.log(data2.Monetary_no_refund), bins=50)
# In[42]:
# create a spending history matrix
spending = data_adj.groupby(['CustomerID', 'Date'], sort=True).agg({'Sale': np.sum}).reset_index()
spending = pd.merge(pd.DataFrame(data2.index, columns=['CustomerID']), spending, on='CustomerID', how='left')
spending["order"] = spending.groupby("CustomerID", sort=True).cumcount()+1
spending_mat = spending.pivot("CustomerID", "order", "Sale").fillna(0)
spending_freq = spending.groupby('CustomerID', sort=True).size()
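# spending_mat : one row per customer, column k = spend on the customer's k-th purchase date (0 if fewer purchases)
# spending_freq: number of purchase-date rows per customer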
# In[19]:
plt_pooled_noRefund = plt.hist(np.log(spending.Sale),bins=50)
# In[20]:
data2.to_csv(directory + 'rfm.csv')
spending_freq.to_csv(directory+'spending_freq.csv')
spending_mat.to_csv(directory + 'spending_mat.csv')
# In[21]:
model = pystan.StanModel(file=directory + 'rfm_hierarchical.stan')
# In[ ]:
#fit = model.sampling(data={'N': data2.shape[0], 'Time': data2.Time, 'time': data2.time, 'x': data2.Freq.astype('int'), 'Spend': spending_mat,
# 'K': spending_freq.values.max(), 'NSpend': spending_freq.values, 'delta': .0027 }, chains=4, warmup=4000, iter=10000)
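# Hedged sketch (not part of the original gist): once the sampling call above is
# uncommented and has finished, the PyStan 2.x fit object could be inspected with, e.g.:
#   print(fit)               # posterior summary for every parameter
#   samples = fit.extract()  # dict mapping parameter names to arrays of draws
# The parameter names depend on rfm_hierarchical.stan, which is not shown here.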