Skip to content

Instantly share code, notes, and snippets.

Last active September 2, 2022 12:32
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
Star You must be signed in to star a gist
What would you like to do?
### Cohort Analysis
import matplotlib.pyplot as plt
# Connect to the BigQuery API
from googleapiclient.discovery import build
from oauth2client import client
credentials = client._get_application_default_credential_from_file('client_secrets.json')
credentials = credentials.create_scoped('')
bigquery_service = build('bigquery', 'v2', credentials=credentials)
# Run a SQL query against the HITgroup data
# to find out the dates of first and last activity
# for each requester. Then, aggregate on top,
# and report the number of requesters that were firstSeen
# on a month X, and were lastSeen on a month Y
# The string manipulation is just to get the dates
# represented as YYYY-MM
query_request =
query_data = {
'query': (
COUNT(*) AS cnt
STRING(YEAR(MIN(firstSeen))) + '-' + RIGHT('0' + STRING(MONTH(MIN(firstSeen))), 2) AS firstSeen,
STRING(YEAR(MAX(lastSeen))) + '-' + RIGHT('0' + STRING(MONTH(MAX(lastSeen))), 2) AS lastSeen,
COUNT(groupId) AS HITgroupsPosted
query_response = query_request.query(
# Put the SQL results in a Pandas Dataframe
import pandas as pd
columns = [f.get('name') for f in query_response['schema']['fields']]
rows = [tuple([row['f'][i]['v'] for i in range(len(row['f']))]) for row in query_response['rows']]
df = pd.DataFrame(data=rows, columns=columns, dtype=int)
# The code above is for connecting to the BigQuery API and running the SQL query
# Alternatively, you can just download the CSV file from
# and load it in a dataframe
# df = pd.read_csv("cohort_analysis.csv", sep="\t")
# Transform the dataframe into a table with lastSeen month as rows and firstSeen month as columns
# This table contains how many requesters have left on any given month (shown in the rows),
# from each cohort (shown as columns)
pivot = pd.pivot_table(df, values='cnt', index=['lastSeen'], columns=['firstSeen'])
# We will do a set of transformations here, to create a pivotTable
# with each column containing a cohort of users, and each row
# showing how many requesters from that cohort are still active
# (active means that they have posted a HIT on that month or later)
import numpy as np
# The cumulative sum creates columns that have the "opposite" order
# than what we want. Each column indicates how the total number
# of requesters for that cohort that have left until that month
pc = pivot.cumsum()
# Ideally, I would like to "reverse" the columns ignoring NaNs
# but I cannot figure out an easy way to do so. So, I revert
# to a set of for loops.
# The np.amax(pc,axis=0)[c] gets the maximum value for the column c
# which is the total number of requesters that were firstSeen
# on month c. Then, for each row at month Y, we subtract
# the number of requesters that have left until that month
# We use the .shift() operator because we do not want to subtract
# from month Y the requesters that were still active on month Y
pc2 = pd.DataFrame(pivot)
for c in pc.columns.values:
pc2[c] = np.amax(pc,axis=0)[c] - pc[c].shift()
# After the operation above, the diagonal is NaN. We need to fill
# it in with the total number of requesters that appeared in that
# month.
for c in pc.columns.values:[c,c] = np.amax(pc,axis=0)[c]
# Create the plot showing the cohorts
f = plt.figure(edgecolor='k')
pc2.plot(kind='area', stacked=True, legend=True, figsize=(16,8), cmap='Paired', grid=True, ax=ax);
plt.title('Amazon Mechanical Turk Cohort Analysis', color='black')
plt.legend(loc='lower center', ncol=8, bbox_to_anchor=[0.5, -0.25])
ax.set_ylabel("Active Requesters")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment