# coding: utf-8
# ### SHIELD Variations, Data Pull and Analysis
# In[2]:
import ujson as json
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import plotly.plotly as py
from moztelemetry import get_pings, get_pings_properties, get_one_ping_per_client, get_clients_history
from pprint import pprint as pp
from operator import itemgetter
get_ipython().magic(u'pylab inline')
sc.defaultParallelism
# In[3]:
# helpers and utils
PINGNAME = 'x-shield-studies'
HBNAME = 'x-shield-study-performance-1'
STUDYNAME = 'screen Performance X1'
study_start = "20160325" ## start of study
import datetime as DT
today_fmt = DT.date.today().strftime("%Y%m%d")
#week_ago = today - DT.timedelta(days=DAYS)
#week_ago_fmt = week_ago.strftime("%Y%m%d")
def updateWith(fn):
    def newFn(ping):
        out = fn(ping)
        ping.update(out)
        return ping
    return newFn
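# Illustrative usage of updateWith (not exercised in this notebook): decorate a
# function that returns derived fields, and the ping comes back with them merged in.
#
#   @updateWith
#   def withDay(ping):                      # hypothetical helper
#       return {'day': int(ping['creationDate'] / (86400 * 1000))}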
from IPython.core.display import display, HTML
def format_pivot(table, formats):
    # e.g. formats = {'f1': '{:,.1%}'.format}  -- render column f1 as a % with 1 decimal place
    return table.to_html(formatters=formats)
# In[4]:
# http://stackoverflow.com/questions/13059011/is-there-any-python-function-library-for-calculate-binomial-confidence-intervals
def exact_CI(x, N, alpha=0.95):
    """
    Calculate the exact confidence interval of a proportion
    where there is a wide range in the sample size or the proportion.

    This method avoids the assumption that data are normally distributed. The sample size
    and proportion are described by a beta distribution.

    Parameters
    ----------
    x: the number of cases from which the proportion is calculated, as a positive integer.
    N: the sample size, as a positive integer.
    alpha: set at 0.95 for 95% confidence intervals.

    Returns
    -------
    The proportion with the lower and upper confidence intervals, as a dict.
    """
    from scipy.stats import beta
    x = float(x)
    N = float(N)
    p = round((x / N) * 100, 2)
    intervals = [round(i, 4) * 100 for i in beta.interval(alpha, x, N - x + 1)]
    intervals.insert(0, p)
    result = {'Proportion': intervals[0], 'Lower CI': intervals[1], 'Upper CI': intervals[2]}
    return result
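# Quick sanity check of exact_CI (illustrative), using the release counts for the
# nowant-shapefuture arm from the enrollment table below: 1,613 AMO clicks out of
# 28,131 offers, i.e. a point estimate of about 5.7%.
# In[ ]:
exact_CI(1613, 28131)  # -> dict with 'Proportion' (in %), 'Lower CI', 'Upper CI'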
# ## Input Enrollment data for release
#
# ```
# mysql> select count(id) as n, sum(extra like "%button%") as amo, question_id, variation_id from heartbeat_answer where survey_id="messaging" and question_id="x-shield-study-performance-1" and is_test=0 group by variation_id having n > 5;
# +-------+------+------------------------------+--------------------+
# | n | amo | question_id | variation_id |
# +-------+------+------------------------------+--------------------+
# | 28131 | 1613 | x-shield-study-performance-1 | nowant-shapefuture |
# | 27845 | 2263 | x-shield-study-performance-1 | nowant-trynew |
# | 27924 | 1322 | x-shield-study-performance-1 | want-shapefuture |
# | 27467 | 2124 | x-shield-study-performance-1 | want-trynew |
# +-------+------+------------------------------+--------------------+
# ```
# In[5]:
import pandas as pd
offer_release = pd.DataFrame(
    [[28131, 1613, "nowant-shapefuture"],
     [27845, 2263, "nowant-trynew"],
     [27924, 1322, "want-shapefuture"],
     [27467, 2124, "want-trynew"]],
    columns=["clientId", "wentToAmo", "surveyVersion"])
# ### Extract, Transform, Analyze
#
# 0. [docs](https://github.com/mozilla/python_moztelemetry/blob/master/moztelemetry/spark.py)
# 1. Get all shield-studies and heartbeat pings
#
# In[6]:
## shield pings, hb OFFER pings
## shield.
kwargs = dict(
    doc_type="OTHER",
    submission_date=(study_start, today_fmt),
    app="Firefox"
)
def pingVersion(ping):
    # addon version string like "1.0.2" -> (1, 0, 2), handy for comparisons (unused below)
    v = ping['payload'].get('version', '0.0.0').split(".")
    v = tuple(map(int, v))
    return v
def getShieldProps(p):
    # flatten the fields we need from a shield-study ping
    d = p['payload']
    out = dict(
        clientId=p.get('clientId', None),
        channel=p["application"]["channel"],
        creationDate=int(long(p['meta']['creationTimestamp']) / 1e6),  # ns -> ms
        submissionDate=p['meta']['submissionDate'],
        os=p['environment'].get('os', {}).get('name', 'unknown'),
        dtype=p["meta"]["docType"]
    )
    for k in ['firstrun', 'msg', 'name', 'variation']:
        out[k] = d[k]
    out['firstrun'] = int(out['firstrun'])
    return out
pings = get_pings(sc, channel="release", **kwargs).union(
    get_pings(sc, channel="aurora", **kwargs)).union(
    get_pings(sc, channel="beta", **kwargs)).union(
    get_pings(sc, channel="nightly", **kwargs))
pings = pings.filter(lambda p: p["meta"]["docType"] == PINGNAME)
pings = pings.filter(lambda p: p['payload']['name'] == STUDYNAME)
pings = pings.map(getShieldProps).filter(itemgetter('clientId'))
## heartbeat
kwargs = dict(
    doc_type="heartbeat",
    submission_date=(study_start, today_fmt),
    app="Firefox"
)
hbpings = get_pings(sc, channel="release", **kwargs).union(
    get_pings(sc, channel="aurora", **kwargs)).union(
    get_pings(sc, channel="beta", **kwargs)).union(
    get_pings(sc, channel="nightly", **kwargs))
_hbProps = dict(
    clientId="clientId",
    channel="application/channel",
    surveyId="payload/surveyId",
    surveyVersion="payload/surveyVersion",
    engagedTS="payload/engagedTS",
    dtype="meta/docType"
)
hbpings = get_pings_properties(hbpings, _hbProps)
hbpings = hbpings.filter(lambda x: x['surveyId'] == HBNAME)
# In[14]:
pp(pings.first())
pp(hbpings.first())
# In[15]:
df = pd.DataFrame(hbpings.collect())
df['wentToAmo'] = df['engagedTS'].apply(lambda x: x > 0)
## df.groupby(['channel','surveyId','surveyVersion']).agg({"clientId": lambda x: len(set(x)), "wentToAmo": lambda x: len(filter(None,x))})
t1 = pd.pivot_table(df, values=['clientId', 'wentToAmo'],
                    index=['channel', 'surveyId', 'surveyVersion'],
                    aggfunc={"clientId": lambda x: len(set(x)),             # distinct clients offered
                             "wentToAmo": lambda x: len(filter(None, x))},  # pings with engagedTS > 0
                    fill_value=0, margins=True)
t1['%'] = t1['wentToAmo'] / t1['clientId']
# get some CIs and append.
ci = [exact_CI(x[0], x[1]) for x in zip(t1['wentToAmo'], t1['clientId'])]
t1['lower'] = map(itemgetter('Lower CI'), ci)
t1['upper'] = map(itemgetter('Upper CI'), ci)
## format it nicer.
display(HTML(format_pivot(t1,{'%': '{:,.1%}'.format})))
# In[16]:
offer_release['channel'] = "release"
offer_release['surveyId'] = "x-shield-study-performance-1"
offer_release = offer_release.set_index(['channel', 'surveyId', 'surveyVersion'])
ci = [exact_CI(x[0], x[1]) for x in zip(offer_release['wentToAmo'], offer_release['clientId'])]
offer_release['%'] = offer_release['wentToAmo'] / offer_release['clientId']
offer_release['lower'] = map(itemgetter('Lower CI'), ci)
offer_release['upper'] = map(itemgetter('Upper CI'), ci)
## format it nicer.
display(HTML(format_pivot(offer_release,{'%': '{:,.1%}'.format})))
# In[17]:
df2 = pd.DataFrame(pings.collect())
df2.groupby(['channel','name','variation','msg']).agg({"clientId": lambda x: len(set(x))})
pd.pivot_table(df2, values='clientId', index=['channel', 'name', 'variation'],
               columns='msg', aggfunc=lambda x: len(set(x)), fill_value=0, margins=True)
# In[18]:
# not yet sorted by ts, etc. but this is the approach!
from operator import itemgetter
'''
def aggUV(agg, item):
#return agg
#print agg, other
#ans = other['msg']
agg.append((item['msg'],item['creationDate'], item['creationDate']-item['firstrun'] ))
return agg
def aggUU(agg1, agg2):
return agg1 + agg2
def sortByTime(blarg):
return (blarg[0], sorted(blarg[1], key=lambda x: x[1]))
'''
summaryProto = dict(
    clientId=None,
    channel=None,
    os=None,
    firstrun=None,
    variation=None,
    installed=False,
    ineligible=False,
    seen1=False,
    seen2=False,
    seen3=False,
    seen7=False,
    leftStudy=False,
    completed=False
)
def daysRunning(item):
    # whole days between the addon's firstrun and this ping (both are ms timestamps)
    return int((item['creationDate'] - item['firstrun']) / (86400 * 1000))
def aggUV(agg, item):
    # seqOp for aggregateByKey: fold one ping into a per-client lifecycle summary
    dtype = item['dtype']
    agg['clientId'] = item['clientId']
    agg['channel'] = item['channel']
    agg['os'] = item['os']
    if dtype == "heartbeat":
        agg['prompt'] = item['surveyVersion']
    elif dtype == PINGNAME:
        msg = item['msg']
        agg['name'] = item['name']
        agg['firstrun'] = item['firstrun']
        if not agg['ineligible']:
            agg['started'] = True
            agg['variation'] = item['variation']
        # days seen
        d = daysRunning(item)
        if d == 1: agg['seen1'] = True
        if d == 2: agg['seen2'] = True
        if d == 3: agg['seen3'] = True
        if d == 7: agg['seen7'] = True
        if msg == "user-ended-study":
            agg['leftStudy'] = True
        elif msg == 'install':
            agg['installed'] = True
        elif msg == 'end-of-study':
            agg['completed'] = True
        elif msg == "ineligible":
            agg['ineligible'] = True
    return agg
def aggUU(agg1, agg2):
    # combOp for aggregateByKey: merge two partial summaries, any truthy value wins
    for (k, v) in agg2.iteritems():
        if v: agg1[k] = v
    return agg1
states = pings.keyBy(itemgetter('clientId')).aggregateByKey(summaryProto, aggUV, aggUU).values()
statesDf = pd.DataFrame(states.collect())
# In[19]:
pd.pivot_table(statesDf,
               values=['installed', 'seen1', 'seen2', 'seen3', 'seen7', 'ineligible', 'leftStudy', 'completed'],
               index=['channel', 'name', 'variation'], columns=[],
               aggfunc=lambda x: sum(map(int, x)), fill_value=0, margins=True)
# In[20]:
t = pd.pivot_table(statesDf,
                   values=['installed', 'seen1', 'seen2', 'seen3', 'seen7', 'ineligible', 'leftStudy', 'completed'],
                   index=['channel', 'name', 'variation'], columns=[],
                   aggfunc=lambda x: sum(map(int, x)), fill_value=0, margins=True)
ci = [exact_CI(x[0], x[1]) for x in zip(t['seen7'], t['installed'])]
t['lower7'] = map(itemgetter('Lower CI'), ci)
t['upper7'] = map(itemgetter('Upper CI'), ci)
for k in ('seen1', 'seen2', 'seen3', 'seen7'):
    t[k] = t[k] / t['installed']
fmt = '{:,.1%}'.format
display(HTML(format_pivot(t,{'seen1': fmt, 'seen2':fmt, 'seen3':fmt, 'seen7':fmt})))
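# The write-up below reports no detectable difference in 7-day retention between
# variations ("powered to detect 10% differences"). A minimal sketch of how that could
# be checked, assuming `statesDf` from above; this is illustrative only and is not the
# power analysis actually used for the study.
# In[ ]:
import itertools
from scipy.stats import norm

def two_prop_ztest(x1, n1, x2, n2):
    """Two-sided z-test for a difference between two proportions (pooled variance)."""
    p1, p2 = x1 / float(n1), x2 / float(n2)
    pool = (x1 + x2) / float(n1 + n2)
    se = (pool * (1 - pool) * (1.0 / n1 + 1.0 / n2)) ** 0.5
    z = (p1 - p2) / se
    return z, 2 * norm.sf(abs(z))

def min_detectable_diff(p, n, alpha=0.05, power=0.8):
    """Approximate minimum detectable absolute difference in proportions,
    for two equal arms of size n (normal approximation)."""
    z = norm.ppf(1 - alpha / 2.0) + norm.ppf(power)
    return z * (2.0 * p * (1 - p) / n) ** 0.5

# pairwise 7-day-retention comparisons between variations (installed = denominator)
counts = statesDf.groupby('variation')[['seen7', 'installed']].sum()
for a, b in itertools.combinations(counts.index, 2):
    z, pval = two_prop_ztest(counts.loc[a, 'seen7'], counts.loc[a, 'installed'],
                             counts.loc[b, 'seen7'], counts.loc[b, 'installed'])
    print a, 'vs', b, 'z=%.2f p=%.4f' % (z, pval)
# e.g. the smallest detectable absolute change at ~80% power for arms of this size:
# min_detectable_diff(counts['seen7'].sum() / float(counts['installed'].sum()), counts['installed'].min())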
# ## Data Quality Issues
# In[21]:
def depOn(thing, aname, depList):
    # if flag `aname` is set, every flag in depList should also be set
    if thing[aname]:
        for d in depList:
            if not thing[d]:
                return "{0} needs {1}".format(aname, d)
def excludes(thing, aname, depList):
    # if flag `aname` is set, none of the flags in depList should be set
    if thing[aname]:
        for d in depList:
            if thing[d]:
                return "{0} and {1}".format(aname, d)
# ['started','seen1','seen2','seen3','seen7','ineligible','leftStudy']
def checkLifeCycle(lc):
    return (depOn(lc, 'seen7', ['started']) or
            depOn(lc, 'seen3', ['started']) or
            depOn(lc, 'seen2', ['started']) or
            depOn(lc, 'seen1', ['started']) or
            excludes(lc, 'ineligible', ['leftStudy', 'installed', 'seen1', 'seen2', 'seen3', 'seen7']))
states.map(checkLifeCycle).countByValue()
# In[22]:
states.filter(lambda x: x['ineligible'] and x['leftStudy']).map(itemgetter('os','channel','variation')).countByValue()
# In[23]:
def aggList(u, v):
    if isinstance(v, list):
        return u + v
    else:
        u.append(v)
        return u
# sc.parallelize([{'a':1},{'b':2},{'c':3}]).aggregate([],aggList,aggList)
#aggHB = pings.keyBy(itemgetter('clientId')).aggregateByKey([], aggList, aggList)
#aggHB.first()
# ### Heartbeat Offers During the Same Period
#
# Is the slow enrollment 'real' or an artefact?
#
# Compare to US-Release HB offers in the same period
# In[24]:
## hb by day by study.
kwargs = dict(
    doc_type="heartbeat",
    submission_date=(study_start, today_fmt),
    app="Firefox"
)
allhb = get_pings(sc, channel="release", **kwargs).union(
    get_pings(sc, channel="aurora", **kwargs)).union(
    get_pings(sc, channel="beta", **kwargs)).union(
    get_pings(sc, channel="nightly", **kwargs))
_hbProps = dict(
    clientId="clientId",
    #creation = int(long(p['meta']['creationTimestamp'])/1e6),
    creation='meta/creationTimestamp',
    channel="application/channel",
    surveyId="payload/surveyId",
    #surveyVersion = "payload/surveyVersion",
    offeredTS="payload/offeredTS",
    locale='environment/settings/locale'
    #dtype = "meta/docType"
)
allhb = get_pings_properties(allhb, _hbProps).filter(lambda x: (x['locale'] or '').lower() =="en-us")
#hbpings = hbpings.filter(lambda x: x['surveyId']==HBNAME)
allhb.first()
allhbDf = pd.DataFrame(allhb.collect())
# In[25]:
## Offers for both studies (which had the same sample)
allhbDf["day"] = allhbDf['creation'].map(lambda x: int(long(x) / 1e6) / (86400 * 1000))  # ns -> whole days since epoch
allhbDf.drop_duplicates(subset=['clientId', 'surveyId'], inplace=True)  # first offer per client per survey
pd.pivot_table(allhbDf,
               #values=['installed','seen1','seen2','seen3','seen7','ineligible','leftStudy','completed'],
               index=['channel', 'surveyId'],
               columns=['day'],
               aggfunc=lambda x: len(x),
               #aggfunc=lambda x: sum(map(int,x)),
               fill_value=0, margins=True)
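# A quick visual check of the enrollment-rate question (illustrative; assumes `allhbDf`
# from the cell above, after the per-client de-duplication, and the `%pylab inline` setup).
# In[ ]:
first_offers = allhbDf.groupby(['surveyId', 'day']).size().unstack('surveyId').fillna(0)
first_offers.plot(figsize=(10, 4), title='en-US Heartbeat first offers per day, by survey')
plt.xlabel('day (days since epoch)')
plt.ylabel('clients offered')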
# ## Shield Study 1: `nglayout.initialpaint.delay` Variations
#
#
# - The test deployed and ran correctly.
# - Some questions (effectiveness of the actual setting) were underpowered.
#
# ### Variations: Is there a right setting for `nglayout.initialpaint.delay`?
#
# We observed **no difference in 7-day retention** for any of the settings (the study was powered to detect 10% differences).
#
# This does not mean there is no difference, only that any difference likely has less than a 10% impact on that retention measure.
#
# ### UX
#
# 1. (prompt) Use the "try something new" prompts.
#
# - BETTER: Want to try something new in Firefox?
# - WORSE: Want to shape the future of Firefox?
#
# Including 'Want to' (versus leaving it off) showed no effect (we were underpowered to detect one).
#
# This held in Aurora (34.7%), Beta (9.7%), Release (8.1%).
#
# Prompt uptake was similar to other HB prompts.
#
# 2. Conversion to installing the experiment was weak.
#
# Users might be scared off by the word 'install' or by the prompts during installation.
#
#
# ### Systems Aspects
#
#
# 1. Unified Telemetry / Data Pipeline worked correctly.
#
# - No issues with data loss.
# - Analysis was straightforward.
# - All stats stayed "near real time".
# - Analysis code is in good shape.
#
# 1. AMO (Addons.m.o)
#
# - Hosting was robust.
# - AMO stats are wildly inconsistent with install stats from UT
#   (https://addons.mozilla.org/en-US/firefox/addon/x-screen-performance-study-1/statistics/?last=30).
#
#
# ### Next time:
#
# 1. Better UX for enrollment. Avoid the 'install' prompt.
#
# - Consider an 'opt-out' style.
# - Consider other ways of explaining how to participate in the study.
#
#
# 1. AMO issues
#
# - Instrument the AMO page with GA to learn whether the install button is the issue.
#
# 1. Better probes in the addon that are 'fuse' based, rather than firing only on startup/shutdown; the addon should phone home every day.
#
# 1. Easier visualization / publishing of results (in particular, p-values, etc.).
# 1. Better / earlier code review.
# 1. Address legal concerns about explicitly explaining exactly what will change. Reconcile this desire with
#
#
# ### Technical timeline:
#
# - Addon Approval: 2016/03/20
# - AMO page live: 2016/03/24
# - aurora launch: 2016/03/25
# - beta launch: 2016/03/26
# - release launch: 2016/03/28
# - enrollment end: 2016/04/04
#
#
# ### Heroes (incomplete):
#
# - Mark Reid: `#datapipeline` and lots of help with packets
# - John Gruen: Designed / implemented the AMO page, based on Test Pilot https://github.com/mozilla/addons-server/issues/1880
# - Matt Grimes: Setting up ALL THE MEETINGS. Survey Work
# - Christopher Grebs + Andy McKay, AMO page. https://github.com/mozilla/addons-server/issues/1880
# - Rebecca Weiss: Study design, consent issues, language of consent page.
# - Ilana Segall: language and prompts
# - Chris Hartjes: Addon QA
# - Jorge Villalobos: Addon Review
# - Winston Bowden: Branding Review, "Firefox-ness"
# - UX: Philip, Madhava, Stephen (final UX tentative approval)
#
# In[5]:
bucket = "telemetry-parquet"
prefix = "main_summary/v1"
get_ipython().magic(u'time d1 = sqlContext.read.load("s3://{}/{}".format(bucket, prefix), "parquet")')
# In[6]:
about = """
root
|-- document_id: string (nullable = false)
|-- client_id: string (nullable = true)
|-- sample_id: integer (nullable = true)
|-- channel: string (nullable = true)
|-- normalized_channel: string (nullable = true)
|-- country: string (nullable = true)
|-- city: string (nullable = true)
|-- os: string (nullable = true)
|-- os_version: string (nullable = true)
|-- os_service_pack_major: string (nullable = true)
|-- os_service_pack_minor: string (nullable = true)
|-- profile_creation_date: integer (nullable = true)
|-- subsession_start_date: string (nullable = true)
|-- subsession_length: integer (nullable = true)
|-- distribution_id: string (nullable = true)
|-- submission_date: string (nullable = false)
|-- sync_configured: boolean (nullable = true)
|-- sync_count_desktop: integer (nullable = true)
|-- sync_count_mobile: integer (nullable = true)
|-- app_build_id: string (nullable = true)
|-- app_display_version: string (nullable = true)
|-- app_name: string (nullable = true)
|-- app_version: string (nullable = true)
|-- timestamp: long (nullable = false)
|-- env_build_id: string (nullable = true)
|-- env_build_version: string (nullable = true)
|-- env_build_arch: string (nullable = true)
|-- e10s_enabled: boolean (nullable = true)
|-- e10s_cohort: string (nullable = true)
|-- locale: string (nullable = true)
|-- active_experiment_id: string (nullable = true)
|-- active_experiment_branch: string (nullable = true)
|-- reason: string (nullable = true)
|-- timezone_offset: integer (nullable = true)
|-- plugin_hangs: integer (nullable = true)
|-- aborts_plugin: integer (nullable = true)
|-- aborts_content: integer (nullable = true)
|-- aborts_gmplugin: integer (nullable = true)
|-- crashes_detected_plugin: integer (nullable = true)
|-- crashes_detected_content: integer (nullable = true)
|-- crashes_detected_gmplugin: integer (nullable = true)
|-- crash_submit_attempt_main: integer (nullable = true)
|-- crash_submit_attempt_content: integer (nullable = true)
|-- crash_submit_attempt_plugin: integer (nullable = true)
|-- crash_submit_success_main: integer (nullable = true)
|-- crash_submit_success_content: integer (nullable = true)
|-- crash_submit_success_plugin: integer (nullable = true)
|-- active_addons_count: integer (nullable = true)
|-- flash_version: string (nullable = true)
|-- vendor: string (nullable = true)
|-- is_default_browser: boolean (nullable = true)
|-- default_search_engine_data_name: string (nullable = true)
|-- loop_activity_open_panel: integer (nullable = true)
|-- loop_activity_open_conversation: integer (nullable = true)
|-- loop_activity_room_open: integer (nullable = true)
|-- loop_activity_room_share: integer (nullable = true)
|-- loop_activity_room_delete: integer (nullable = true)
|-- devtools_toolbox_opened_count: integer (nullable = true)
|-- search_counts: array (nullable = true)
| |-- element: struct (containsNull = false)
| | |-- engine: string (nullable = false)
| | |-- source: string (nullable = false)
| | |-- count: long (nullable = false)
|-- submission_date_s3: string (nullable = true)
"""
# In[7]:
#dir(d1)
"""
Row(client_id=u'0a9d9dcd-1f8e-4188-97d7-233f46d99d48', subsession_start_date=u'2016-03-25T00:00:00.0-07:00', subsession_length=16383, submission_date_s3=u'20160325')
Row(document_id=u'7c9835b8-93ca-4a00-81d1-fc66076a9ebd', client_id=u'0a9d9dcd-1f8e-4188-97d7-233f46d99d48', sample_id=1, channel=u'release', normalized_channel=u'release', country=u'US', city=u'Fremont', os=u'Windows_NT', os_version=u'6.1', os_service_pack_major=None, os_service_pack_minor=None, profile_creation_date=16871, subsession_start_date=u'2016-03-25T00:00:00.0-07:00', subsession_length=16383, distribution_id=None, submission_date=u'20160325', sync_configured=False, sync_count_desktop=None, sync_count_mobile=None, app_build_id=u'20160315153207', app_display_version=None, app_name=u'Firefox', app_version=u'45.0.1', timestamp=1458947604814447616, env_build_id=u'20160315153207', env_build_version=u'45.0.1', env_build_arch=u'x86', e10s_enabled=False, e10s_cohort=None, locale=u'en-US', active_experiment_id=None, active_experiment_branch=None, reason=u'shutdown', timezone_offset=-420, plugin_hangs=0, aborts_plugin=0, aborts_content=0, aborts_gmplugin=0, crashes_detected_plugin=0, crashes_detected_content=0, crashes_detected_gmplugin=0, crash_submit_attempt_main=0, crash_submit_attempt_content=0, crash_submit_attempt_plugin=0, crash_submit_success_main=0, crash_submit_success_content=0, crash_submit_success_plugin=0, active_addons_count=1, flash_version=u'21.0.0.197', vendor=u'Mozilla', is_default_browser=True, default_search_engine_data_name=u'Yahoo', search_counts=[Row(engine=u'yahoo', source=u'searchbar', count=1)], submission_date_s3=u'20160325')
"""
# In[10]:
pp(study_start)
from pyspark.sql.functions import col
_fields = [
    'client_id',
    'subsession_start_date',
    'subsession_length',
    'submission_date_s3'
]
ids = set(['0a9d9dcd-1f8e-4188-97d7-233f46d99d48', 'c'])
ids2 = ['0a9d9dcd-1f8e-4188-97d7-233f46d99d48', 'a', 'b']
idFrame = sqlContext.createDataFrame([(x,) for x in ids], ['client_id'])
idFrame2 = sqlContext.createDataFrame([(x,) for x in ids2], ['client_id'])
idFrame.printSchema()
# df.join(otherDf).drop(otherDf.col("id"))
print idFrame.join(idFrame2, "client_id", "left").collect()
#d1.filter(d1.submission_date >= study_start).select(*_fields).head()
get_ipython().magic(u'time d1.filter(d1.submission_date_s3 == study_start).select(*_fields).printSchema()')
#%time d1.filter(d1.submission_date_s3 == study_start).filter(d1.client_id in ids).count()
get_ipython().magic(u'time d1_sub = d1.filter(d1.submission_date_s3 == study_start).select(*_fields)')
get_ipython().magic(u"time print idFrame.join(d1_sub,'client_id','left').head()")
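# Where this exploration is heading (illustrative sketch, not the final analysis):
# join the study's own clientIds against main_summary to pull per-client activity.
# Note the column-name mismatch ('clientId' in the study pings vs. 'client_id' in
# main_summary), handled by naming the id frame's column 'client_id'.
# In[ ]:
study_ids = statesDf['clientId'].dropna().unique().tolist()
studyIdFrame = sqlContext.createDataFrame([(x,) for x in study_ids], ['client_id'])
study_activity = studyIdFrame.join(d1_sub, 'client_id', 'left')
study_activity.head(5)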
# In[ ]: