Skip to content

Instantly share code, notes, and snippets.

@FBosler
FBosler / cohorts_imports.py
Created August 17, 2019 09:12
imports for cohorts
import pandas as pd
import numpy as np
import datetime
import string
import names
import matplotlib.pyplot as plt
import seaborn as sns
@FBosler
FBosler / seed_data.py
Last active August 19, 2019 06:18
seed data for cohorts
# Adjectives used as the first half of generated dummy names.
# Every entry is listed exactly once, so len(ADJECTIVES) is the number of
# distinct adjectives available when combinations are counted.
ADJECTIVES = [
    'cool', 'smart', 'beautiful', 'funky', 'dorky', 'babyish', 'back', 'bad',
    'baggy', 'bare', 'barren', 'calculating', 'calm', 'candid', 'canine',
    'capital', 'carefree', 'hairy', 'half', 'handmade', 'handsome', 'handy',
    'crazy', 'deliberate',
]
# Pool of distinct, lower-cased first names: 10000 draws from the `names`
# package, collapsed through a set so that repeated draws are dropped.
PEOPLE = list({names.get_first_name().lower() for _ in range(10000)})
PRODUCTS = [
'airplane','banana','train','bow','map','skull'
@FBosler
FBosler / helper_functions.py
Last active August 19, 2019 05:11
helper functions for the cohorts analysis
def generate_dummy_names(adj, sub, number_names=10):
    """
    Generate random name combinations of the provided adjectives and subjects.

    Args:
        adj: sequence of adjectives.
        sub: sequence of subjects.
        number_names: how many names are requested.

    Raises:
        ValueError: if number_names is larger than len(adj) * len(sub), the
            number of adjective/subject pairings.

    Example (the result is random, so the output is illustrative only):
    >>> generate_dummy_names(adj=['cool','strong'],sub=['harry','kate'],number_names=3)  # doctest: +SKIP
    ['cool_harry', 'strong_kate', 'strong_harry']
    """
    # The check below accepts requests up to and including this bound, so the
    # error message has to report the same number.
    max_names = len(adj) * len(sub)
    if number_names > max_names:
        raise ValueError(
            f"\nCan at most generate {max_names} names, "
            "increase adj or sub to allow for more names\n"
        )
    # NOTE(review): the statements that build and return the names are not
    # part of this excerpt.
@FBosler
FBosler / generate_dummy_data.py
Last active August 19, 2019 05:11
generates dummy data for cohorts project
def generate_dummy_dataframe(
    dummy_products,
    dummy_customers,
    dummy_customer_types=None,
    first_date=datetime.datetime(2014, 1, 1),
    last_date=datetime.datetime(2018, 12, 31),
    data_points=1000,
):
    """
    Set up the random lookups used to generate dummy data.

    Args:
        dummy_products: iterable of product names; each gets a random integer
            price drawn from [100, 10000).
        dummy_customers: iterable of customer names; each is assigned one
            randomly chosen customer type.
        dummy_customer_types: customer types to choose from. Defaults to
            ['company', 'private', 'government'] when None.
        first_date, last_date, data_points: not used in the visible part of
            the function; presumably the date range and number of generated
            rows -- TODO confirm against the full body.
    """
    # None stands in for the default list so that no mutable object is shared
    # between calls.
    if dummy_customer_types is None:
        dummy_customer_types = ['company', 'private', 'government']
    # Draw from the caller-supplied types instead of a hard-coded list.
    customer_type = {
        customer: np.random.choice(dummy_customer_types)
        for customer in dummy_customers
    }
    product_prices = {
        product: np.random.randint(100, 10000)
        for product in dummy_products
    }
    # NOTE(review): the rest of the function (how these lookups are used and
    # what is returned) is not part of this excerpt.
@FBosler
FBosler / dummy_data.py
Last active August 19, 2019 05:16
shows how to call generate_dummy_names
# 15000 fake customer names, e.g. ['smart_randy', 'canine_carol', 'carefree_cheryl', ...]
customers = generate_dummy_names(adj=ADJECTIVES, sub=PEOPLE, number_names=15000)
# 10 fake product names, e.g. ['carefree_skull', 'bare_map', 'calculating_banana', 'funky_train', 'cool_train', ...]
products = generate_dummy_names(adj=ADJECTIVES, sub=PRODUCTS, number_names=10)
@FBosler
FBosler / cohorts_enrichment.py
Last active August 18, 2019 08:22
enriches dummy data
# Earliest order date per customer, as a one-column frame indexed by customer.
customer_data = df.groupby('customer')['order_date'].min().to_frame('customer_first_order')
customer_data.head(2)
# Attach every customer's first order date to each of their order rows.
df = df.merge(customer_data.reset_index(), on='customer')
df.head(2)
# An order dated on the customer's first order date is a 'first' order;
# any other order is a 'repeat'.
df['type_of_order'] = np.where(df['order_date'] == df['customer_first_order'], 'first', 'repeat')
@FBosler
FBosler / generate_cohorts.py
Last active August 18, 2019 10:48
function generates cohorts, excel or figure
def generate_cohort_analysis(df, metric, record_type='all', period_agg='quarterly', fig=True, size=10, save_fig=True):
    """
    Select the records a cohort analysis should run on.

    Args:
        df: order data. It is never modified; the function works on a copy.
            Needs a ``customer_type`` column whenever record_type is not 'all'.
        metric: one of 'number_of_orders', 'number_of_items_bought' or
            'total_order_value'.
        record_type: 'all', or a single customer type ('private', 'company',
            'government') to restrict the records to.
        period_agg: defaults to 'quarterly'; not used in the visible part.
        fig: presumably toggles the figure output (the original note called
            it ``no_fig``) -- TODO confirm against the full body.
        size, save_fig: not used in the visible part of the function.
    """
    if record_type == 'all':
        dataset = df.copy()
    else:
        # Keep only the rows of the requested customer type.
        dataset = df[df.customer_type == record_type].copy()
    # NOTE(review): the remainder of the function is not part of this excerpt.
@FBosler
FBosler / execute_create_dummy_data.py
Last active July 24, 2020 14:24
creates dummy data frame
# Create the dummy data frame from the generated product and customer names.
df = generate_dummy_dataframe(products, customers)
@FBosler
FBosler / cohort_analysis_helper.py
Created August 18, 2019 10:00
functions extracted from main function for simplicity
def _generate_cohorts(dataset, metric):
    """
    Aggregate orders per (cohort, order_period) and pivot one metric.

    Args:
        dataset: frame with the columns 'cohort', 'order_period', 'order_id',
            'order_size' and 'basket_size'.
        metric: which aggregated column to keep: 'number_of_orders',
            'number_of_items_bought' or 'total_order_value'.
    """
    # Aggregations are given by name: passing the builtin ``sum`` to
    # ``agg`` is deprecated in current pandas, and the string form yields
    # the same result.
    cohorts = dataset.groupby(['cohort', 'order_period']).agg({
        'order_id': 'nunique',
        'order_size': 'sum',
        'basket_size': 'sum',
    })
    # Positional rename; relies on the column order of the dict above.
    cohorts.columns = ['number_of_orders', 'number_of_items_bought', 'total_order_value']
    # Move the first index level ('cohort') into the columns.
    cohorts = cohorts[metric].unstack(0)
    # NOTE(review): the remainder of the function (including what it
    # returns) is not part of this excerpt.
@FBosler
FBosler / samples.py
Created August 18, 2019 10:06
generates different cohort analyses
# Order counts with the default settings, then with monthly aggregation.
generate_cohort_analysis(df, 'number_of_orders')
generate_cohort_analysis(df, 'number_of_orders', period_agg='monthly')
# Items bought, first for all records and then for 'private' customers only.
generate_cohort_analysis(df, 'number_of_items_bought')
generate_cohort_analysis(df, 'number_of_items_bought', record_type='private')
...