Skip to content

Instantly share code, notes, and snippets.

@Evryjazz
Created July 18, 2018 07:40
Show Gist options
  • Save Evryjazz/c426f9c2ecf5be5708aa5186f9a671c9 to your computer and use it in GitHub Desktop.
Save Evryjazz/c426f9c2ecf5be5708aa5186f9a671c9 to your computer and use it in GitHub Desktop.
#libraries
import sys
sys.path.append('/opt/insight-repositories/')
from pyLBC import *
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
# request set up
path_to_write = "/data/insight/jupyter/jupyter_notebooks/julienz/retention/extracts/"
ref_path = "/data/insight/jupyter/jupyter_notebooks/julienz/retention/"
os.chdir(ref_path)
# date
if len(sys.argv) > 1:
ref_date = toStrDateIso(sys.argv[1])
else:
ref_date = date.today()
current_date = toStrDateIso(ref_date + timedelta(days = -2))
previous_date = toStrDateIso(ref_date + timedelta(days = -3))
# environment
responsive = "&space={s:562498}"
ios = "&space={s:562494}"
android = "&space={s:562490}"
env_list = [responsive, ios, android]
env_name = ["responsive", "ios", "android"]
env_custom_var = ["cl_364618", "cl_362815", "cl_362806"]
def uv_cat(df_row):
# user categorization as new column
if df_row.new_visitor_previous_date == "-" and pd.isnull(df_row.new_visitor_current_date):
return "lost"
if df_row.new_visitor_previous_date == "New Visitors" and pd.isnull(df_row.new_visitor_current_date):
return "lost"
if df_row.new_visitor_previous_date == "New Visitors" and df_row.new_visitor_current_date == "-":
return "loyal"
if df_row.new_visitor_previous_date == "-" and df_row.new_visitor_current_date == "-":
return "loyal"
if pd.isnull(df_row.new_visitor_previous_date) and df_row.new_visitor_current_date == "New Visitors":
return "new"
# xiti request for all platform
for idx, env in enumerate(env_list):
current_date_name = 'df_{}_{}'.format(env_name[idx], current_date.replace('-', '_'))
previous_date_name = 'df_{}_{}'.format(env_name[idx], previous_date.replace('-', '_'))
print ('getting {} information for {}'.format(env_name[idx], current_date))
# xiti request execution
sub_request = "columns={d_uv_id," + env_custom_var[idx] + ",m_vu}"\
"&sort={-m_vu}"\
+ env\
+ "&period={D:" + current_date + "}"
xiti_get_results_parallel_page(sub_request, path_to_write, current_date_name, "julien.zanni+databot@schibsted.com", "Mpd32chiloelbc$")
# file concatenation and folder cleaning
os.system("sh /opt/insight-repositories/pyLBC/reporting/concat_and_clean.sh " + current_date_name)
for idx, env in enumerate(env_list):
current_date_name = 'df_{}_{}'.format(env_name[idx], current_date.replace('-', '_'))
previous_date_name = 'df_{}_{}'.format(env_name[idx], previous_date.replace('-', '_'))
print ('analyzing {} information for {}'.format(env_name[idx], current_date))
# make previous_date and current_date global csv as df
previous_date_name = 'global_' + previous_date_name + '.csv'
df_previous_date = pd.read_csv(path_to_write + previous_date_name)
current_date_name = 'global_' + current_date_name + '.csv'
df_current_date = pd.read_csv(path_to_write + current_date_name)
# outer join and rename columns
print ('merging {} and {}'.format(previous_date, current_date))
df_merge = pd.merge(df_previous_date, df_current_date, how="outer", on="d_uv_id", suffixes=["_previous_date", "_current_date"])
# rewriting columns
df_merge = df_merge.rename(columns={env_custom_var[idx] + '_previous_date': 'new_visitor_previous_date',
env_custom_var[idx] + '_current_date': 'new_visitor_current_date'})
print ('renaming {} columns for {}'.format(env_name[idx], current_date))
# apply function ()
print ('applying uv_cat column to df_merge for {}'.format(current_date_name))
df_merge['uv_cat'] = df_merge.apply(uv_cat, axis=1)
# retention calculation
print ('retention calculation')
active_user_previous_date = df_previous_date.d_uv_id.nunique()
active_user_current_date = df_current_date.d_uv_id.nunique()
lost = len(df_merge[df_merge.uv_cat == 'lost'])
loyal = len(df_merge[df_merge.uv_cat == 'loyal'])
new = len(df_merge[df_merge.uv_cat == 'new'])
retention = active_user_current_date - new / active_user_previous_date * 100
# create tmp dataframe
data_tmp = {'retention': retention, 'platform' : env_name[idx]}
df_tmp = pd.DataFrame(index=[current_date], data=data_tmp)
# append date and retention value to the csv
df_tmp.to_csv('daily_retention.csv', mode='a', header=False)
print ('daily_retention.csv successfully updated with {} data'.format(current_date))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment