Last active
May 1, 2016 06:09
-
-
Save Gedevan-Aleksizde/48183856a4f702495d09627e80762f76 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# !/usr/bin/env python3 | |
# -*- coding: utf-8 -*- | |
# coding: utf-8 | |
# In[1]: | |
get_ipython().magic('matplotlib inline' | |
import pystan # ver. 2.9.0.0 | |
import numpy as np # ver.1.11.0 | |
import pandas as pd # ver. 0.18.0 # xlrd, mumxpr module needed | |
import matplotlib.pyplot as plt # libpng*-dev, libjpeg*-dev, freetype6-dev must be installed before | |
import matplotlib.dates as mdates # date histogram | |
from datetime import datetime | |
# http://stackoverflow.com/questions/9829175/pip-install-matplotlib-error-with-virtualenv | |
directory='/FOO/20160406_RFM/' # working directory | |
# In[2]: | |
# read xlxs file | |
data = pd.read_excel(io=directory + 'Online Retail.xlsx', | |
converters={'InvoiceNo': str, 'StockCode': str, 'Description': str, | |
'CustomerID': str, 'Country': str | |
} | |
) | |
# In[3]: | |
data.info() | |
# In[4]: | |
data = data.loc[(data.CustomerID == data.CustomerID) & (data.InvoiceNo == data.InvoiceNo) & (data.StockCode == data.StockCode) ] | |
data.shape | |
# In[5]: | |
data['Sale'] = data.Quantity * data.UnitPrice | |
data['Date'] = data.InvoiceDate.map(pd.Timestamp.date) | |
data['Time'] = data.InvoiceDate.map(pd.Timestamp.time) | |
# Status == 0: Ordered, 1: Canceled | |
data['Status'] = data.InvoiceNo.str.contains('^C') *1 | |
# In[6]: | |
data_adj = data | |
data_adj = data_adj.sort_values(['CustomerID', 'StockCode', 'Date', 'Status']) | |
data_adj = data_adj.reset_index() | |
data_adj['GrpKey'] = data_adj.CustomerID + '_' + data_adj.StockCode | |
temp_dt = float('nan') | |
for i, row in data_adj.iterrows() : | |
if ( i != 0 and (row.GrpKey != l_row.GrpKey) ) : | |
temp_dt = float('nan') | |
if (row.Status == 1): | |
temp_dt = row.Date | |
else: | |
row.Lag_Date = temp_dt | |
l_row = row | |
data_adj = data_adj.groupby(['CustomerID', 'StockCode', 'Date']).agg({'Sale': np.sum}).reset_index().query('Sale > 0') | |
# In[7]: | |
data_adj = data_adj.query('Sale > 1 ') | |
data_adj.describe(include='all') | |
# In[12]: | |
# survival span | |
span = data_adj.groupby('CustomerID').agg({'Date': [np.max, np.min]}) | |
span.columns = ['Latest','Oldest'] | |
span['End'] = pd.to_datetime('2011-12-09') | |
span['Oldest'] = pd.to_datetime(span.Oldest) | |
span['Latest'] = pd.to_datetime(span.Latest) | |
span['Time'] = (span.End - span.Oldest).astype('timedelta64[D]') / 365 | |
span['time'] = (span.Latest - span.Oldest).astype('timedelta64[D]') / 365 | |
span['Recency'] = (span.End - span.Latest).astype('timedelta64[D]') / 365 | |
# In[13]: | |
# frequency | |
freq = data_adj.groupby('CustomerID').agg({'Date': pd.Series.nunique}) | |
freq.columns = ['Count'] | |
freq = freq.query('Count >= 1 & Count == Count') | |
freq['Freq'] = freq.Count - 1 | |
# In[14]: | |
# monetary | |
# for discription | |
money_disc = data_adj.groupby('CustomerID').agg({'Sale': np.sum}).query('Sale != 0') | |
money_disc.columns = ['Monetary'] | |
# exclude refund | |
money_disc2 = data.query('Status == 0').groupby('CustomerID').agg({'Sale': np.sum}) | |
money_disc2.columns = ['Monetary_no_refund'] | |
money_disc = pd.concat([money_disc, money_disc2], axis=1) | |
# In[15]: | |
# merge 3 measures | |
data2 = pd.concat([span, freq, money_disc], axis=1) | |
data2['Monetary'] = data2.Monetary/data2.Count | |
data2 = data2.query('Monetary == Monetary') | |
data2 = data2.sort_index() | |
data2.query('Recency != Recency | Freq != Freq | Monetary != Monetary') | |
# In[16]: | |
# between 01/DEC/2010~31/DEC/2010 | |
data2 = data2[ (pd.to_datetime("2010-12-01") <= data2.Oldest) & (data2.Oldest <= pd.to_datetime("2010-12-31") )] | |
data2.describe(include='all') | |
# In[17]: | |
# histogram | |
plt.figure(num=None, figsize=(16, 12), dpi=80, facecolor='w', edgecolor='k') | |
fig = plt.figure(1) | |
fig1 = fig.add_subplot(4,2,1) | |
fig1.set_title("Recency") | |
fig1 = plt.hist(x = data2.Recency, bins=50) | |
fig2 = fig.add_subplot(4,2,2) | |
fig2.set_title("T") | |
fig2 = plt.hist(x = data2.Time, bins=50) | |
fig6 = fig.add_subplot(4,2,3) | |
fig6.set_title("First purchasing") | |
histdata = mdates.date2num(data2.Oldest.astype(datetime)) | |
fig6.xaxis.set_major_locator(mdates.YearLocator()) | |
fig6.xaxis.set_major_formatter(mdates.DateFormatter('%d/%m/%y')) | |
fig6.hist(x = histdata, bins=50) | |
fig3 = fig.add_subplot(4,2,5) | |
fig3.set_title("Freqnency") | |
fig3 = plt.hist(x = data2.Count, bins=50) | |
fig4 = fig.add_subplot(4,2,7) | |
fig4.set_title("log (Monetary)") | |
fig4 = plt.hist(x = np.log(data2.Monetary), bins=50) | |
fig5 = fig.add_subplot(4,2,8) | |
fig5.set_title("log (Monetary), no refund") | |
fig5 = plt.hist(x = np.log(data2.Monetary_no_refund), bins=50) | |
# In[42]: | |
# create a spending history matrix | |
spending = data_adj.groupby(['CustomerID', 'Date'], sort=True).agg({'Sale': np.sum}).reset_index() | |
spending = pd.merge(pd.DataFrame(data2.index, columns=['CustomerID']), spending, on='CustomerID', how='left') | |
spending["order"] = spending.groupby("CustomerID", sort=True).cumcount()+1 | |
spending_mat = spending.pivot("CustomerID", "order", "Sale").fillna(0) | |
spending_freq = spending.groupby('CustomerID', sort=True).size() | |
# In[19]: | |
plt_pooled_noRefund = plt.hist(np.log(spending.Sale),bins=50) | |
# In[20]: | |
data2.to_csv(directory + 'rfm.csv') | |
spending_freq.to_csv(directory+'spending_freq.csv') | |
spending_mat.to_csv(directory + 'spending_mat.csv') | |
# In[21]: | |
model = pystan.StanModel(file=directory + 'rfm_hierarchical.stan') | |
# In[ ]: | |
#fit = model.sampling(data={'N': data2.shape[0], 'Time': data2.Time, 'time': data2.time, 'x': data2.Freq.astype('int'), 'Spend': spending_mat, | |
# 'K': spending_freq.values.max(), 'NSpend': spending_freq.values, 'delta': .0027 }, chains=4, warmup=4000, iter=10000) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment