# coding: utf-8
# ### SHIELD Variations, Data Pull and Analysis
# In[2]:
import ujson as json
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import plotly.plotly as py
from moztelemetry import get_pings, get_pings_properties, get_one_ping_per_client, get_clients_history
from pprint import pprint as pp
from operator import itemgetter
get_ipython().magic(u'pylab inline')
sc.defaultParallelism
# In[3]:
# helpers and utils
PINGNAME = 'x-shield-studies'
HBNAME = 'x-shield-study-performance-1'
STUDYNAME = 'screen Performance X1'
study_start = "20160325" ## start of study
import datetime as DT
today_fmt = DT.date.today().strftime("%Y%m%d")
#week_ago = today - DT.timedelta(days=DAYS)
#week_ago_fmt = week_ago.strftime("%Y%m%d")
def updateWith(fn):
    def newFn(ping):
        out = fn(ping)
        ping.update(out)
        return ping
    return newFn
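# Illustrative usage of updateWith (not exercised in this notebook): decorate a
# function that returns derived fields, and the ping comes back with them merged in.
#
#   @updateWith
#   def withDay(ping):                      # hypothetical helper
#       return {'day': int(ping['creationDate'] / (86400 * 1000))}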
from IPython.core.display import display, HTML
def format_pivot(table, formats):
    # e.g. formats = {'f1': '{:,.1%}'.format}  -- render column f1 as a % with 1 decimal place
    return table.to_html(formatters=formats)
# In[4]:
# http://stackoverflow.com/questions/13059011/is-there-any-python-function-library-for-calculate-binomial-confidence-intervals
def exact_CI(x, N, alpha=0.95):
    """
    Calculate the exact confidence interval of a proportion
    where there is a wide range in the sample size or the proportion.

    This method avoids the assumption that data are normally distributed. The sample size
    and proportion are described by a beta distribution.

    Parameters
    ----------
    x: the number of cases from which the proportion is calculated, as a positive integer.
    N: the sample size, as a positive integer.
    alpha: set at 0.95 for 95% confidence intervals.

    Returns
    -------
    The proportion with the lower and upper confidence intervals, as a dict.
    """
    from scipy.stats import beta
    x = float(x)
    N = float(N)
    p = round((x / N) * 100, 2)
    intervals = [round(i, 4) * 100 for i in beta.interval(alpha, x, N - x + 1)]
    intervals.insert(0, p)
    result = {'Proportion': intervals[0], 'Lower CI': intervals[1], 'Upper CI': intervals[2]}
    return result
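# Quick sanity check of exact_CI (illustrative), using the release counts for the
# nowant-shapefuture arm from the enrollment table below: 1,613 AMO clicks out of
# 28,131 offers, i.e. a point estimate of about 5.7%.
# In[ ]:
exact_CI(1613, 28131)  # -> dict with 'Proportion' (in %), 'Lower CI', 'Upper CI'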
# ## Input Enrollment data for release
#
# ```
# mysql> select count(id) as n, sum(extra like "%button%") as amo, question_id, variation_id from heartbeat_answer where survey_id="messaging" and question_id="x-shield-study-performance-1" and is_test=0 group by variation_id having n > 5;
# +-------+------+------------------------------+--------------------+
# | n | amo | question_id | variation_id |
# +-------+------+------------------------------+--------------------+
# | 28131 | 1613 | x-shield-study-performance-1 | nowant-shapefuture |
# | 27845 | 2263 | x-shield-study-performance-1 | nowant-trynew |
# | 27924 | 1322 | x-shield-study-performance-1 | want-shapefuture |
# | 27467 | 2124 | x-shield-study-performance-1 | want-trynew |
# +-------+------+------------------------------+--------------------+
# ```
# In[5]:
import pandas as pd
offer_release = pd.DataFrame(
    [[28131, 1613, "nowant-shapefuture"],
     [27845, 2263, "nowant-trynew"],
     [27924, 1322, "want-shapefuture"],
     [27467, 2124, "want-trynew"]],
    columns=["clientId", "wentToAmo", "surveyVersion"])
# ### Extract, Transform, Analyze
#
# 0. [docs](https://github.com/mozilla/python_moztelemetry/blob/master/moztelemetry/spark.py)
# 1. Get all shield-studies and heartbeat pings
#
# In[6]:
## shield pings, hb OFFER pings
## shield.
kwargs = dict(
    doc_type="OTHER",
    submission_date=(study_start, today_fmt),
    app="Firefox"
)
def pingVersion(ping):
    # addon version string like "1.0.2" -> (1, 0, 2), handy for comparisons (unused below)
    v = ping['payload'].get('version', '0.0.0').split(".")
    v = tuple(map(int, v))
    return v
def getShieldProps(p):
    # flatten the fields we need from a shield-study ping
    d = p['payload']
    out = dict(
        clientId=p.get('clientId', None),
        channel=p["application"]["channel"],
        creationDate=int(long(p['meta']['creationTimestamp']) / 1e6),  # ns -> ms
        submissionDate=p['meta']['submissionDate'],
        os=p['environment'].get('os', {}).get('name', 'unknown'),
        dtype=p["meta"]["docType"]
    )
    for k in ['firstrun', 'msg', 'name', 'variation']:
        out[k] = d[k]
    out['firstrun'] = int(out['firstrun'])
    return out
pings = get_pings(sc, channel="release", **kwargs).union(
    get_pings(sc, channel="aurora", **kwargs)).union(
    get_pings(sc, channel="beta", **kwargs)).union(
    get_pings(sc, channel="nightly", **kwargs))
pings = pings.filter(lambda p: p["meta"]["docType"] == PINGNAME)
pings = pings.filter(lambda p: p['payload']['name'] == STUDYNAME)
pings = pings.map(getShieldProps).filter(itemgetter('clientId'))
## heartbeat
kwargs = dict(
    doc_type="heartbeat",
    submission_date=(study_start, today_fmt),
    app="Firefox"
)
hbpings = get_pings(sc, channel="release", **kwargs).union(
    get_pings(sc, channel="aurora", **kwargs)).union(
    get_pings(sc, channel="beta", **kwargs)).union(
    get_pings(sc, channel="nightly", **kwargs))
_hbProps = dict(
    clientId="clientId",
    channel="application/channel",
    surveyId="payload/surveyId",
    surveyVersion="payload/surveyVersion",
    engagedTS="payload/engagedTS",
    dtype="meta/docType"
)
hbpings = get_pings_properties(hbpings, _hbProps)
hbpings = hbpings.filter(lambda x: x['surveyId'] == HBNAME)
# In[14]:
pp(pings.first())
pp(hbpings.first())
# In[15]:
df = pd.DataFrame(hbpings.collect())
df['wentToAmo'] = df['engagedTS'].apply(lambda x: x > 0)
## df.groupby(['channel','surveyId','surveyVersion']).agg({"clientId": lambda x: len(set(x)), "wentToAmo": lambda x: len(filter(None,x))})
t1 = pd.pivot_table(df, values=['clientId', 'wentToAmo'],
                    index=['channel', 'surveyId', 'surveyVersion'],
                    aggfunc={"clientId": lambda x: len(set(x)),             # distinct clients offered
                             "wentToAmo": lambda x: len(filter(None, x))},  # pings with engagedTS > 0
                    fill_value=0, margins=True)
t1['%'] = t1['wentToAmo'] / t1['clientId']
# get some CIs and append.
ci = [exact_CI(x[0], x[1]) for x in zip(t1['wentToAmo'], t1['clientId'])]
t1['lower'] = map(itemgetter('Lower CI'), ci)
t1['upper'] = map(itemgetter('Upper CI'), ci)
## format it nicer.
display(HTML(format_pivot(t1,{'%': '{:,.1%}'.format})))
# In[16]:
offer_release['channel'] = "release"
offer_release['surveyId'] = "x-shield-study-performance-1"
offer_release = offer_release.set_index(['channel', 'surveyId', 'surveyVersion'])
ci = [exact_CI(x[0], x[1]) for x in zip(offer_release['wentToAmo'], offer_release['clientId'])]
offer_release['%'] = offer_release['wentToAmo'] / offer_release['clientId']
offer_release['lower'] = map(itemgetter('Lower CI'), ci)
offer_release['upper'] = map(itemgetter('Upper CI'), ci)
## format it nicer.
display(HTML(format_pivot(offer_release,{'%': '{:,.1%}'.format})))
# In[17]:
df2 = pd.DataFrame(pings.collect())
df2.groupby(['channel','name','variation','msg']).agg({"clientId": lambda x: len(set(x))})
pd.pivot_table(df2, values='clientId', index=['channel', 'name', 'variation'],
               columns='msg', aggfunc=lambda x: len(set(x)), fill_value=0, margins=True)
# In[18]:
# not yet sorted by ts, etc. but this is the approach!
from operator import itemgetter
'''
def aggUV(agg, item):
#return agg
#print agg, other
#ans = other['msg']
agg.append((item['msg'],item['creationDate'], item['creationDate']-item['firstrun'] ))
return agg
def aggUU(agg1, agg2):
return agg1 + agg2
def sortByTime(blarg):
return (blarg[0], sorted(blarg[1], key=lambda x: x[1]))
'''
summaryProto = dict(
    clientId=None,
    channel=None,
    os=None,
    firstrun=None,
    variation=None,
    installed=False,
    ineligible=False,
    seen1=False,
    seen2=False,
    seen3=False,
    seen7=False,
    leftStudy=False,
    completed=False
)
def daysRunning(item):
    # whole days between the addon's firstrun and this ping (both are ms timestamps)
    return int((item['creationDate'] - item['firstrun']) / (86400 * 1000))
def aggUV(agg, item):
    # seqOp for aggregateByKey: fold one ping into a per-client lifecycle summary
    dtype = item['dtype']
    agg['clientId'] = item['clientId']
    agg['channel'] = item['channel']
    agg['os'] = item['os']
    if dtype == "heartbeat":
        agg['prompt'] = item['surveyVersion']
    elif dtype == PINGNAME:
        msg = item['msg']
        agg['name'] = item['name']
        agg['firstrun'] = item['firstrun']
        if not agg['ineligible']:
            agg['started'] = True
            agg['variation'] = item['variation']
        # days seen
        d = daysRunning(item)
        if d == 1: agg['seen1'] = True
        if d == 2: agg['seen2'] = True
        if d == 3: agg['seen3'] = True
        if d == 7: agg['seen7'] = True
        if msg == "user-ended-study":
            agg['leftStudy'] = True
        elif msg == 'install':
            agg['installed'] = True
        elif msg == 'end-of-study':
            agg['completed'] = True
        elif msg == "ineligible":
            agg['ineligible'] = True
    return agg
def aggUU(agg1, agg2):
    # combOp for aggregateByKey: merge two partial summaries, any truthy value wins
    for (k, v) in agg2.iteritems():
        if v: agg1[k] = v
    return agg1
states = pings.keyBy(itemgetter('clientId')).aggregateByKey(summaryProto, aggUV, aggUU).values()
statesDf = pd.DataFrame(states.collect())
# In[19]:
pd.pivot_table(statesDf,
               values=['installed', 'seen1', 'seen2', 'seen3', 'seen7', 'ineligible', 'leftStudy', 'completed'],
               index=['channel', 'name', 'variation'], columns=[],
               aggfunc=lambda x: sum(map(int, x)), fill_value=0, margins=True)
# In[20]:
t = pd.pivot_table(statesDf,
                   values=['installed', 'seen1', 'seen2', 'seen3', 'seen7', 'ineligible', 'leftStudy', 'completed'],
                   index=['channel', 'name', 'variation'], columns=[],
                   aggfunc=lambda x: sum(map(int, x)), fill_value=0, margins=True)
ci = [exact_CI(x[0], x[1]) for x in zip(t['seen7'], t['installed'])]
t['lower7'] = map(itemgetter('Lower CI'), ci)
t['upper7'] = map(itemgetter('Upper CI'), ci)
for k in ('seen1', 'seen2', 'seen3', 'seen7'):
    t[k] = t[k] / t['installed']
fmt = '{:,.1%}'.format
display(HTML(format_pivot(t,{'seen1': fmt, 'seen2':fmt, 'seen3':fmt, 'seen7':fmt})))
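# The write-up below reports no detectable difference in 7-day retention between
# variations ("powered to detect 10% differences"). A minimal sketch of how that could
# be checked, assuming `statesDf` from above; this is illustrative only and is not the
# power analysis actually used for the study.
# In[ ]:
import itertools
from scipy.stats import norm

def two_prop_ztest(x1, n1, x2, n2):
    """Two-sided z-test for a difference between two proportions (pooled variance)."""
    p1, p2 = x1 / float(n1), x2 / float(n2)
    pool = (x1 + x2) / float(n1 + n2)
    se = (pool * (1 - pool) * (1.0 / n1 + 1.0 / n2)) ** 0.5
    z = (p1 - p2) / se
    return z, 2 * norm.sf(abs(z))

def min_detectable_diff(p, n, alpha=0.05, power=0.8):
    """Approximate minimum detectable absolute difference in proportions,
    for two equal arms of size n (normal approximation)."""
    z = norm.ppf(1 - alpha / 2.0) + norm.ppf(power)
    return z * (2.0 * p * (1 - p) / n) ** 0.5

# pairwise 7-day-retention comparisons between variations (installed = denominator)
counts = statesDf.groupby('variation')[['seen7', 'installed']].sum()
for a, b in itertools.combinations(counts.index, 2):
    z, pval = two_prop_ztest(counts.loc[a, 'seen7'], counts.loc[a, 'installed'],
                             counts.loc[b, 'seen7'], counts.loc[b, 'installed'])
    print a, 'vs', b, 'z=%.2f p=%.4f' % (z, pval)
# e.g. the smallest detectable absolute change at ~80% power for arms of this size:
# min_detectable_diff(counts['seen7'].sum() / float(counts['installed'].sum()), counts['installed'].min())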
# ## Data Quality Issues
# In[21]:
def depOn(thing, aname, depList):
    # if flag `aname` is set, every flag in depList should also be set
    if thing[aname]:
        for d in depList:
            if not thing[d]:
                return "{0} needs {1}".format(aname, d)
def excludes(thing, aname, depList):
    # if flag `aname` is set, none of the flags in depList should be set
    if thing[aname]:
        for d in depList:
            if thing[d]:
                return "{0} and {1}".format(aname, d)
# ['started','seen1','seen2','seen3','seen7','ineligible','leftStudy']
def checkLifeCycle(lc):
    return (depOn(lc, 'seen7', ['started']) or
            depOn(lc, 'seen3', ['started']) or
            depOn(lc, 'seen2', ['started']) or
            depOn(lc, 'seen1', ['started']) or
            excludes(lc, 'ineligible', ['leftStudy', 'installed', 'seen1', 'seen2', 'seen3', 'seen7']))
states.map(checkLifeCycle).countByValue()
# In[22]:
states.filter(lambda x: x['ineligible'] and x['leftStudy']).map(itemgetter('os','channel','variation')).countByValue()
# In[23]:
def aggList(u, v):
    if isinstance(v, list):
        return u + v
    else:
        u.append(v)
        return u
# sc.parallelize([{'a':1},{'b':2},{'c':3}]).aggregate([],aggList,aggList)
#aggHB = pings.keyBy(itemgetter('clientId')).aggregateByKey([], aggList, aggList)
#aggHB.first()
# ### Heartbeat Offers During the Same Period
#
# Is the slow enrollment 'real' or an artefact?
#
# Compare to US-Release HB offers in the same period
# In[24]:
## hb by day by study.
kwargs = dict(
    doc_type="heartbeat",
    submission_date=(study_start, today_fmt),
    app="Firefox"
)
allhb = get_pings(sc, channel="release", **kwargs).union(
    get_pings(sc, channel="aurora", **kwargs)).union(
    get_pings(sc, channel="beta", **kwargs)).union(
    get_pings(sc, channel="nightly", **kwargs))
_hbProps = dict(
    clientId="clientId",
    #creation = int(long(p['meta']['creationTimestamp'])/1e6),
    creation='meta/creationTimestamp',
    channel="application/channel",
    surveyId="payload/surveyId",
    #surveyVersion = "payload/surveyVersion",
    offeredTS="payload/offeredTS",
    locale='environment/settings/locale'
    #dtype = "meta/docType"
)
allhb = get_pings_properties(allhb, _hbProps).filter(lambda x: (x['locale'] or '').lower() =="en-us")
#hbpings = hbpings.filter(lambda x: x['surveyId']==HBNAME)
allhb.first()
allhbDf = pd.DataFrame(allhb.collect())
# In[25]:
## Offers for both studies (which had the same sample)
allhbDf["day"] = allhbDf['creation'].map(lambda x: int(long(x) / 1e6) / (86400 * 1000))  # ns -> whole days since epoch
allhbDf.drop_duplicates(subset=['clientId', 'surveyId'], inplace=True)  # first offer per client per survey
pd.pivot_table(allhbDf,
               #values=['installed','seen1','seen2','seen3','seen7','ineligible','leftStudy','completed'],
               index=['channel', 'surveyId'],
               columns=['day'],
               aggfunc=lambda x: len(x),
               #aggfunc=lambda x: sum(map(int,x)),
               fill_value=0, margins=True)
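# A quick visual check of the enrollment-rate question (illustrative; assumes `allhbDf`
# from the cell above, after the per-client de-duplication, and the `%pylab inline` setup).
# In[ ]:
first_offers = allhbDf.groupby(['surveyId', 'day']).size().unstack('surveyId').fillna(0)
first_offers.plot(figsize=(10, 4), title='en-US Heartbeat first offers per day, by survey')
plt.xlabel('day (days since epoch)')
plt.ylabel('clients offered')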
# ## Shield Study 1: `nglayout.initialpaint.delay` Variations
#
#
# - The test deployed and ran correctly.
# - Some questions (effectiveness of the actual setting) were underpowered.
#
# ### Variations: Is there a right setting for `nglayout.initialpaint.delay`?
#
# We observed **no difference in 7-day retention** for any of the settings (the study was powered to detect 10% differences).
#
# This does not mean there is no difference, only that any difference likely has less than a 10% impact on that retention measure.
#
# ### UX
#
# 1. (prompt) Use the "try something new" prompts.
#
# - BETTER: Want to try something new in Firefox?
# - WORSE: Want to shape the future of Firefox?
#
# Including 'Want to' (versus leaving it off) showed no effect (we were underpowered to detect one).
#
# This held in Aurora (34.7%), Beta (9.7%), Release (8.1%).
#
# Prompt uptake was similar to other HB prompts.
#
# 2. Conversion to installing the experiment was weak.
#
# Users might be scared off by the word 'install' or by the prompts during installation.
#
#
# ### Systems Aspects
#
#
# 1. Unified Telemetry / Data Pipeline worked correctly.
#
# - No issues with data loss.
# - Analysis was straightforward.
# - All stats stayed "near real time".
# - Analysis code is in good shape.
#
# 1. AMO (Addons.m.o)
#
# - Hosting was robust.
# - AMO stats are wildly inconsistent with install stats from UT
#   (https://addons.mozilla.org/en-US/firefox/addon/x-screen-performance-study-1/statistics/?last=30).
#
#
# ### Next time:
#
# 1. Better UX for enrollment. Avoid the 'install' prompt.
#
# - Consider an 'opt-out' style.
# - Consider other ways of explaining how to participate in the study.
#
#
# 1. AMO issues
#
# - Instrument the AMO page with GA to learn whether the install button is the issue.
#
# 1. Better probes in the addon that are 'fuse' based, rather than firing only on startup/shutdown; the addon should phone home every day.
#
# 1. Easier visualization / publishing of results (in particular, p-values, etc.).
# 1. Better / earlier code review.
# 1. Address legal concerns about explicitly explaining exactly what will change. Reconcile this desire with
#
#
# ### Technical timeline:
#
# - Addon Approval: 2016/03/20
# - AMO page live: 2016/03/24
# - aurora launch: 2016/03/25
# - beta launch: 2016/03/26
# - release launch: 2016/03/28
# - enrollment end: 2016/04/04
#
#
# ### Heroes (incomplete):
#
# - Mark Reid: `#datapipeline` and lots of help with packets
# - John Gruen: Designed / implemented the AMO page, based on Test Pilot https://github.com/mozilla/addons-server/issues/1880
# - Matt Grimes: Setting up ALL THE MEETINGS. Survey Work
# - Christopher Grebs + Andy McKay, AMO page. https://github.com/mozilla/addons-server/issues/1880
# - Rebecca Weiss: Study design, consent issues, language of consent page.
# - Ilana Segall: language and prompts
# - Chris Hartjes: Addon QA
# - Jorge Villalobos: Addon Review
# - Winston Bowden: Branding Review, "Firefox-ness"
# - UX: Philip, Madhava, Stephen (final UX tentative approval)
#
# In[5]:
bucket = "telemetry-parquet"
prefix = "main_summary/v1"
get_ipython().magic(u'time d1 = sqlContext.read.load("s3://{}/{}".format(bucket, prefix), "parquet")')
# In[6]:
about = """
root
|-- document_id: string (nullable = false)
|-- client_id: string (nullable = true)
|-- sample_id: integer (nullable = true)
|-- channel: string (nullable = true)
|-- normalized_channel: string (nullable = true)
|-- country: string (nullable = true)
|-- city: string (nullable = true)
|-- os: string (nullable = true)
|-- os_version: string (nullable = true)
|-- os_service_pack_major: string (nullable = true)
|-- os_service_pack_minor: string (nullable = true)
|-- profile_creation_date: integer (nullable = true)
|-- subsession_start_date: string (nullable = true)
|-- subsession_length: integer (nullable = true)
|-- distribution_id: string (nullable = true)
|-- submission_date: string (nullable = false)
|-- sync_configured: boolean (nullable = true)
|-- sync_count_desktop: integer (nullable = true)
|-- sync_count_mobile: integer (nullable = true)
|-- app_build_id: string (nullable = true)
|-- app_display_version: string (nullable = true)
|-- app_name: string (nullable = true)
|-- app_version: string (nullable = true)
|-- timestamp: long (nullable = false)
|-- env_build_id: string (nullable = true)
|-- env_build_version: string (nullable = true)
|-- env_build_arch: string (nullable = true)
|-- e10s_enabled: boolean (nullable = true)
|-- e10s_cohort: string (nullable = true)
|-- locale: string (nullable = true)
|-- active_experiment_id: string (nullable = true)
|-- active_experiment_branch: string (nullable = true)
|-- reason: string (nullable = true)
|-- timezone_offset: integer (nullable = true)
|-- plugin_hangs: integer (nullable = true)
|-- aborts_plugin: integer (nullable = true)
|-- aborts_content: integer (nullable = true)
|-- aborts_gmplugin: integer (nullable = true)
|-- crashes_detected_plugin: integer (nullable = true)
|-- crashes_detected_content: integer (nullable = true)
|-- crashes_detected_gmplugin: integer (nullable = true)
|-- crash_submit_attempt_main: integer (nullable = true)
|-- crash_submit_attempt_content: integer (nullable = true)
|-- crash_submit_attempt_plugin: integer (nullable = true)
|-- crash_submit_success_main: integer (nullable = true)
|-- crash_submit_success_content: integer (nullable = true)
|-- crash_submit_success_plugin: integer (nullable = true)
|-- active_addons_count: integer (nullable = true)
|-- flash_version: string (nullable = true)
|-- vendor: string (nullable = true)
|-- is_default_browser: boolean (nullable = true)
|-- default_search_engine_data_name: string (nullable = true)
|-- loop_activity_open_panel: integer (nullable = true)
|-- loop_activity_open_conversation: integer (nullable = true)
|-- loop_activity_room_open: integer (nullable = true)
|-- loop_activity_room_share: integer (nullable = true)
|-- loop_activity_room_delete: integer (nullable = true)
|-- devtools_toolbox_opened_count: integer (nullable = true)
|-- search_counts: array (nullable = true)
| |-- element: struct (containsNull = false)
| | |-- engine: string (nullable = false)
| | |-- source: string (nullable = false)
| | |-- count: long (nullable = false)
|-- submission_date_s3: string (nullable = true)
"""
# In[7]:
#dir(d1)
"""
Row(client_id=u'0a9d9dcd-1f8e-4188-97d7-233f46d99d48', subsession_start_date=u'2016-03-25T00:00:00.0-07:00', subsession_length=16383, submission_date_s3=u'20160325')
Row(document_id=u'7c9835b8-93ca-4a00-81d1-fc66076a9ebd', client_id=u'0a9d9dcd-1f8e-4188-97d7-233f46d99d48', sample_id=1, channel=u'release', normalized_channel=u'release', country=u'US', city=u'Fremont', os=u'Windows_NT', os_version=u'6.1', os_service_pack_major=None, os_service_pack_minor=None, profile_creation_date=16871, subsession_start_date=u'2016-03-25T00:00:00.0-07:00', subsession_length=16383, distribution_id=None, submission_date=u'20160325', sync_configured=False, sync_count_desktop=None, sync_count_mobile=None, app_build_id=u'20160315153207', app_display_version=None, app_name=u'Firefox', app_version=u'45.0.1', timestamp=1458947604814447616, env_build_id=u'20160315153207', env_build_version=u'45.0.1', env_build_arch=u'x86', e10s_enabled=False, e10s_cohort=None, locale=u'en-US', active_experiment_id=None, active_experiment_branch=None, reason=u'shutdown', timezone_offset=-420, plugin_hangs=0, aborts_plugin=0, aborts_content=0, aborts_gmplugin=0, crashes_detected_plugin=0, crashes_detected_content=0, crashes_detected_gmplugin=0, crash_submit_attempt_main=0, crash_submit_attempt_content=0, crash_submit_attempt_plugin=0, crash_submit_success_main=0, crash_submit_success_content=0, crash_submit_success_plugin=0, active_addons_count=1, flash_version=u'21.0.0.197', vendor=u'Mozilla', is_default_browser=True, default_search_engine_data_name=u'Yahoo', search_counts=[Row(engine=u'yahoo', source=u'searchbar', count=1)], submission_date_s3=u'20160325')
"""
# In[10]:
pp(study_start)
from pyspark.sql.functions import col
_fields = [
    'client_id',
    'subsession_start_date',
    'subsession_length',
    'submission_date_s3'
]
ids = set(['0a9d9dcd-1f8e-4188-97d7-233f46d99d48', 'c'])
ids2 = ['0a9d9dcd-1f8e-4188-97d7-233f46d99d48', 'a', 'b']
idFrame = sqlContext.createDataFrame([(x,) for x in ids], ['client_id'])
idFrame2 = sqlContext.createDataFrame([(x,) for x in ids2], ['client_id'])
idFrame.printSchema()
# df.join(otherDf).drop(otherDf.col("id"))
print idFrame.join(idFrame2, "client_id", "left").collect()
#d1.filter(d1.submission_date >= study_start).select(*_fields).head()
get_ipython().magic(u'time d1.filter(d1.submission_date_s3 == study_start).select(*_fields).printSchema()')
#%time d1.filter(d1.submission_date_s3 == study_start).filter(d1.client_id in ids).count()
get_ipython().magic(u'time d1_sub = d1.filter(d1.submission_date_s3 == study_start).select(*_fields)')
get_ipython().magic(u"time print idFrame.join(d1_sub,'client_id','left').head()")
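# Where this exploration is heading (illustrative sketch, not the final analysis):
# join the study's own clientIds against main_summary to pull per-client activity.
# Note the column-name mismatch ('clientId' in the study pings vs. 'client_id' in
# main_summary), handled by naming the id frame's column 'client_id'.
# In[ ]:
study_ids = statesDf['clientId'].dropna().unique().tolist()
studyIdFrame = sqlContext.createDataFrame([(x,) for x in study_ids], ['client_id'])
study_activity = studyIdFrame.join(d1_sub, 'client_id', 'left')
study_activity.head(5)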
# In[ ]: