mreid-moz/Telemetry - Test Pilot KPI Validity Check.ipynb Secret

## Telemetry - Test Pilot KPI Validity Check.ipynb

      
Display the source blob

    
Display the rendered blob

    
    Raw
  

              Telemetry - Test Pilot KPI Validity Check.ipynb
            
          
      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## Telemetry - Test Pilot KPI Validity Check.py

# coding: utf-8

# # Telemetry - Test Pilot KPI Validity Check
#
# In this notebook we follow the standard for computing MAU and DAU as captured in [this notebook](https://gist.github.com/mreid-moz/8b2c2b1c6594d658ca5e).  This is to address [bug #1264049](https://bugzilla.mozilla.org/show_bug.cgi?id=1264049).  This notebook will create a csv that will be made available on [sql.telemetry.mozilla.org](https://sql.telemetry.mozilla.org).

# In[2]:

import boto3, botocore
import csv, os.path
from datetime import datetime as dt, timedelta, date
from pyspark.sql import SQLContext
from pyspark.sql.types import *
from boto3.s3.transfer import S3Transfer
from moztelemetry import get_pings, get_pings_properties, standards

get_ipython().magic(u'pylab inline')


# ## Define functions for the task

# In[3]:

def fmt(the_date, date_format="%Y%m%d"):
    """Render a ``datetime`` as text.

    :param the_date: the ``datetime`` instance to render.
    :param date_format: ``strftime`` pattern; the default yields ``YYYYMMDD``.
    :returns: ``the_date`` formatted with ``date_format``.
    """
    formatted = dt.strftime(the_date, date_format)
    return formatted

def convert_date(p):
    """Replace a ping's ``creationDate`` string with a numeric timestamp.

    The ping is modified in place and also returned, so the function can be
    used directly in an RDD ``map``.

    :param p: mapping with a ``creationDate`` entry expected to look like
        ``2016-04-05T13:07:00.123Z``.
    :returns: ``p``.  Its ``creationDate`` is the result of
        ``standards.unix_time_nanos`` applied to the parsed date, or ``None``
        when the entry is missing or cannot be parsed.
    """
    try:
        parsed = dt.strptime(p.get('creationDate'), "%Y-%m-%dT%H:%M:%S.%fZ")
    except (TypeError, ValueError):
        # TypeError: the value is missing (None) or not a string.
        # ValueError: the string does not match the expected layout.
        # Always hand back the ping itself (never an exception object) so
        # later map stages that index into it keep working.
        p['creationDate'] = None
        return p

    p['creationDate'] = standards.unix_time_nanos(parsed)
    return p

def get_num_tests(p):
    """Record how many entries the ping's ``payload/tests`` value holds.

    The ping is modified in place and also returned.

    :param p: mapping that may carry a ``payload/tests`` entry (any sized
        container).
    :returns: ``p`` with ``num_tests`` set to ``len(p['payload/tests'])``, or
        to ``0`` when that entry is absent or ``None``.
    """
    tests = p.get('payload/tests')
    # len(None) raises TypeError, so treat a missing value as "no tests";
    # num_tests is then always present for code that reads it afterwards.
    p['num_tests'] = len(tests) if tests is not None else 0
    return p

def should_be_updated(record,
        target_col="day",
        generated_col="generated_on",
        date_format="%Y%m%d"):
    """Decide whether a previously generated row needs to be recomputed.

    :param record: mapping holding the two date strings described below.
    :param target_col: key of the day the row describes.
    :param generated_col: key of the day the row was generated.
    :param date_format: ``strptime``/``strftime`` pattern of both strings.
    :returns: ``False`` if the row was generated today (UTC) or later;
        otherwise ``True`` exactly when the row was generated at most 10
        days after the day it describes.
    :raises KeyError: if either column is missing from ``record``.
    :raises ValueError: if either value does not match ``date_format``.
    """
    target = dt.strptime(record[target_col], date_format)
    generated = dt.strptime(record[generated_col], date_format)

    # Don't regenerate data that was already updated today.
    # Compare parsed dates rather than formatted strings: string ordering is
    # only chronological for formats such as %Y%m%d.  Round-tripping "now"
    # through date_format truncates it to the same resolution as `generated`.
    today = dt.strptime(dt.utcnow().strftime(date_format), date_format)
    if generated >= today:
        return False

    return (generated - target).days <= 10

def update_engagement_csv(dataset, old_filename, new_filename,
                          cutoff_days=30, date_format="%Y%m%d"):
    """Write ``new_filename`` with the last ``cutoff_days`` days refreshed.

    Rows of ``old_filename`` (when that file exists) whose ``day`` sorts
    before the cutoff are copied unchanged.  For each of the ``cutoff_days``
    days ending yesterday (UTC), oldest first, the existing row is kept when
    ``should_be_updated`` reports it does not need a refresh; otherwise
    ``standards.dau`` and ``standards.mau`` are called for that day and a
    fresh row is written.

    :param dataset: passed through to ``standards.dau`` / ``standards.mau``.
    :param old_filename: path of the previous csv; may not exist.
    :param new_filename: path of the csv to (re)create.
    :param cutoff_days: number of most recent days eligible for a refresh.
    :param date_format: ``strftime`` pattern of the ``day`` and
        ``generated_on`` columns.
    :returns: ``None``; the result is the file written to ``new_filename``.
    """
    # Take a single snapshot of "now".  The dau/mau calls run between loop
    # iterations, and re-reading the clock each time would shift every
    # remaining target day by one if the run crossed UTC midnight.
    now = dt.utcnow()
    generated_on = fmt(now, date_format)
    cutoff = fmt(now - timedelta(cutoff_days), date_format)
    # Single-argument print(...) gives the same output under Python 2's
    # print statement and Python 3's print function.
    print("Cutoff date: {}".format(cutoff))

    fields = ["day", "dau", "mau", "generated_on"]

    # Split the old rows into those we carry over untouched and those that
    # fall inside the refresh window.
    carried_over = []
    potential_updates = {}
    if os.path.exists(old_filename):
        with open(old_filename) as csv_old:
            for row in csv.DictReader(csv_old):
                if row['day'] < cutoff:
                    carried_over.append(row)
                else:
                    potential_updates[row['day']] = row

    # Always start from an empty output file so a leftover new_filename from
    # an earlier run is never appended to.
    with open(new_filename, "w") as csv_new:
        writer = csv.DictWriter(csv_new, fields)
        writer.writeheader()
        for row in carried_over:
            writer.writerow(row)

        for i in range(cutoff_days, 0, -1):
            target_day = fmt(now - timedelta(i), date_format)
            existing = potential_updates.get(target_day)
            # Forward date_format so the row is parsed with the same pattern
            # it was written with.
            if existing is not None and not should_be_updated(
                    existing, date_format=date_format):
                # It's fine as-is.
                writer.writerow(existing)
                continue

            # Update it.
            print("We should update data for {}".format(target_day))
            record = {"day": target_day, "generated_on": generated_on}
            print("Starting dau {} at {}".format(target_day, dt.utcnow()))
            record["dau"] = standards.dau(dataset, target_day)
            print("Finished dau {} at {}".format(target_day, dt.utcnow()))
            print("Starting mau {} at {}".format(target_day, dt.utcnow()))
            record["mau"] = standards.mau(dataset, target_day)
            print("Finished mau {} at {}".format(target_day, dt.utcnow()))
            writer.writerow(record)


# # Configure task variables

# In[4]:

# S3 location of the engagement data.
data_bucket = "net-mozaws-prod-us-west-2-pipeline-analysis"
s3path = "mreid/TxP_maudau"
data_version = "v1"

# Local file names: the running "main" csv and today's dated (UTC) copy.
engagement_basename = "testpilot_engagement_ratio.csv"
new_engagement_basename = "testpilot_engagement_ratio.{}.csv".format(
    dt.utcnow().strftime("%Y%m%d"))

# Store the latest complete file in a subdir for hive / presto compatibility
engagement_key = "{}/{}/{}".format(s3path, data_version, engagement_basename)

# The dated copy sits directly under s3path.
new_engagement_key = "{}/{}".format(s3path, new_engagement_basename)

# S3 client plus the transfer helper used for the download and uploads.
client = boto3.client('s3', region_name='us-west-2')
transfer = S3Transfer(client)


# # Retrieve TxP pings and create DataFrame

# In[5]:

# Fetch the "testpilot" pings sent by Firefox.
pings = get_pings(sc, app="Firefox", doc_type="testpilot")

# Keep only the properties the rest of the notebook reads.
subset = get_pings_properties(
    pings,
    ["clientId", "creationDate", "meta/submissionDate", "payload/tests"])

# Convert date string to appropriate format and count number of current tests
subset = subset.map(convert_date).map(get_num_tests)


# In[6]:

from pyspark.sql import Row

# One Row per ping, renaming the ping properties to the column names used
# by the DataFrame built below.
rows = subset.map(
    lambda ping: Row(
        clientId=ping['clientId'],
        activityTimestamp=ping['creationDate'],
        submission_date_s3=ping['meta/submissionDate'],
        num_tests=ping['num_tests']))

dataset = sqlContext.createDataFrame(rows)


# # Filter out those clients that do *not* have a test installed

# In[7]:

# Keep only the rows that report at least one installed test.
dataset_w_tests = dataset[dataset.num_tests > 0]
# The first figure is computed over the unfiltered dataset, so label it as
# the overall client count rather than "clients without tests".
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print("All clients (with or without tests) = {}".format(standards.count_distinct_clientids(dataset)))
print("Clients with at least 1 test = {}".format(standards.count_distinct_clientids(dataset_w_tests)))


# # Fetch existing data

# In[10]:

# Download the current "main" csv so that its rows can be carried over.
try:
    transfer.download_file(data_bucket, engagement_key, engagement_basename)
except botocore.exceptions.ClientError as e:
    # If the file wasn't there, that's ok. Otherwise, abort!
    if e.response['Error']['Code'] != "404":
        # Bare raise keeps the original traceback ("raise e" resets it on
        # Python 2).
        raise
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("Did not find an existing file at '{}'".format(engagement_key))

# Rebuild the csv for the last 35 days, timing the call with IPython's %time.
get_ipython().magic(u'time update_engagement_csv(dataset_w_tests, engagement_basename, new_engagement_basename, cutoff_days=35)')


# In[12]:

## Upload the updated csv file to S3

# The same local file is stored twice: first under the day-specific key,
# then over the "main" key that holds the latest complete file.
for _destination_key in (new_engagement_key, engagement_key):
    transfer.upload_file(new_engagement_basename, data_bucket, _destination_key)


# In[ ]:

	# coding: utf-8

	# # Telemetry - Test Pilot KPI Validity Check
	#
	# In this notebook we follow the standard for computing MAU and DAU as captured in [this notebook](https://gist.github.com/mreid-moz/8b2c2b1c6594d658ca5e). This is to address [bug #1264049](https://bugzilla.mozilla.org/show_bug.cgi?id=1264049). This notebook will create a csv that will be made available on [sql.telemetry.mozilla.org](https://sql.telemetry.mozilla.org).

	# In[2]:

	import boto3, botocore
	import csv, os.path
	from datetime import datetime as dt, timedelta, date
	from pyspark.sql import SQLContext
	from pyspark.sql.types import *
	from boto3.s3.transfer import S3Transfer
	from moztelemetry import get_pings, get_pings_properties, standards

	get_ipython().magic(u'pylab inline')


	# ## Define functions for the task

	# In[3]:

	def fmt(the_date, date_format="%Y%m%d"):
	    """Render a ``datetime`` as text.

	    :param the_date: the ``datetime`` instance to render.
	    :param date_format: ``strftime`` pattern; the default yields ``YYYYMMDD``.
	    :returns: ``the_date`` formatted with ``date_format``.
	    """
	    # Body indentation restored; this copy of the script had lost it.
	    return dt.strftime(the_date, date_format)

	def convert_date(p):
	    """Replace a ping's ``creationDate`` string with a numeric timestamp.

	    The ping is modified in place and also returned, so the function can be
	    used directly in an RDD ``map``.

	    :param p: mapping with a ``creationDate`` entry expected to look like
	        ``2016-04-05T13:07:00.123Z``.
	    :returns: ``p``.  Its ``creationDate`` is the result of
	        ``standards.unix_time_nanos`` applied to the parsed date, or ``None``
	        when the entry is missing or cannot be parsed.
	    """
	    try:
	        parsed = dt.strptime(p.get('creationDate'), "%Y-%m-%dT%H:%M:%S.%fZ")
	    except (TypeError, ValueError):
	        # TypeError: the value is missing (None) or not a string.
	        # ValueError: the string does not match the expected layout.
	        # Always hand back the ping itself (never an exception object) so
	        # later map stages that index into it keep working.
	        p['creationDate'] = None
	        return p

	    p['creationDate'] = standards.unix_time_nanos(parsed)
	    return p

	def get_num_tests(p):
	    """Record how many entries the ping's ``payload/tests`` value holds.

	    The ping is modified in place and also returned.

	    :param p: mapping that may carry a ``payload/tests`` entry (any sized
	        container).
	    :returns: ``p`` with ``num_tests`` set to ``len(p['payload/tests'])``, or
	        to ``0`` when that entry is absent or ``None``.
	    """
	    tests = p.get('payload/tests')
	    # len(None) raises TypeError, so treat a missing value as "no tests";
	    # num_tests is then always present for code that reads it afterwards.
	    p['num_tests'] = len(tests) if tests is not None else 0
	    return p

	def should_be_updated(record,
	        target_col="day",
	        generated_col="generated_on",
	        date_format="%Y%m%d"):
	    """Decide whether a previously generated row needs to be recomputed.

	    :param record: mapping holding the two date strings described below.
	    :param target_col: key of the day the row describes.
	    :param generated_col: key of the day the row was generated.
	    :param date_format: ``strptime``/``strftime`` pattern of both strings.
	    :returns: ``False`` if the row was generated today (UTC) or later;
	        otherwise ``True`` exactly when the row was generated at most 10
	        days after the day it describes.
	    :raises KeyError: if either column is missing from ``record``.
	    :raises ValueError: if either value does not match ``date_format``.
	    """
	    target = dt.strptime(record[target_col], date_format)
	    generated = dt.strptime(record[generated_col], date_format)

	    # Don't regenerate data that was already updated today.
	    # Compare parsed dates rather than formatted strings: string ordering is
	    # only chronological for formats such as %Y%m%d.  Round-tripping "now"
	    # through date_format truncates it to the same resolution as `generated`.
	    today = dt.strptime(dt.utcnow().strftime(date_format), date_format)
	    if generated >= today:
	        return False

	    return (generated - target).days <= 10

	def update_engagement_csv(dataset, old_filename, new_filename,
	                          cutoff_days=30, date_format="%Y%m%d"):
	    """Write ``new_filename`` with the last ``cutoff_days`` days refreshed.

	    Rows of ``old_filename`` (when that file exists) whose ``day`` sorts
	    before the cutoff are copied unchanged.  For each of the ``cutoff_days``
	    days ending yesterday (UTC), oldest first, the existing row is kept when
	    ``should_be_updated`` reports it does not need a refresh; otherwise
	    ``standards.dau`` and ``standards.mau`` are called for that day and a
	    fresh row is written.

	    :param dataset: passed through to ``standards.dau`` / ``standards.mau``.
	    :param old_filename: path of the previous csv; may not exist.
	    :param new_filename: path of the csv to (re)create.
	    :param cutoff_days: number of most recent days eligible for a refresh.
	    :param date_format: ``strftime`` pattern of the ``day`` and
	        ``generated_on`` columns.
	    :returns: ``None``; the result is the file written to ``new_filename``.
	    """
	    # Take a single snapshot of "now".  The dau/mau calls run between loop
	    # iterations, and re-reading the clock each time would shift every
	    # remaining target day by one if the run crossed UTC midnight.
	    now = dt.utcnow()
	    generated_on = fmt(now, date_format)
	    cutoff = fmt(now - timedelta(cutoff_days), date_format)
	    # Single-argument print(...) gives the same output under Python 2's
	    # print statement and Python 3's print function.
	    print("Cutoff date: {}".format(cutoff))

	    fields = ["day", "dau", "mau", "generated_on"]

	    # Split the old rows into those we carry over untouched and those that
	    # fall inside the refresh window.
	    carried_over = []
	    potential_updates = {}
	    if os.path.exists(old_filename):
	        with open(old_filename) as csv_old:
	            for row in csv.DictReader(csv_old):
	                if row['day'] < cutoff:
	                    carried_over.append(row)
	                else:
	                    potential_updates[row['day']] = row

	    # Always start from an empty output file so a leftover new_filename from
	    # an earlier run is never appended to.
	    with open(new_filename, "w") as csv_new:
	        writer = csv.DictWriter(csv_new, fields)
	        writer.writeheader()
	        for row in carried_over:
	            writer.writerow(row)

	        for i in range(cutoff_days, 0, -1):
	            target_day = fmt(now - timedelta(i), date_format)
	            existing = potential_updates.get(target_day)
	            # Forward date_format so the row is parsed with the same pattern
	            # it was written with.
	            if existing is not None and not should_be_updated(
	                    existing, date_format=date_format):
	                # It's fine as-is.
	                writer.writerow(existing)
	                continue

	            # Update it.
	            print("We should update data for {}".format(target_day))
	            record = {"day": target_day, "generated_on": generated_on}
	            print("Starting dau {} at {}".format(target_day, dt.utcnow()))
	            record["dau"] = standards.dau(dataset, target_day)
	            print("Finished dau {} at {}".format(target_day, dt.utcnow()))
	            print("Starting mau {} at {}".format(target_day, dt.utcnow()))
	            record["mau"] = standards.mau(dataset, target_day)
	            print("Finished mau {} at {}".format(target_day, dt.utcnow()))
	            writer.writerow(record)


	# # Configure task variables

	# In[4]:

	# S3 location of the engagement data.
	data_bucket = "net-mozaws-prod-us-west-2-pipeline-analysis"
	s3path = "mreid/TxP_maudau"
	data_version = "v1"

	# Local file names: the running "main" csv and today's dated (UTC) copy.
	engagement_basename = "testpilot_engagement_ratio.csv"
	new_engagement_basename = "testpilot_engagement_ratio.{}.csv".format(
	    dt.utcnow().strftime("%Y%m%d"))

	# Store the latest complete file in a subdir for hive / presto compatibility
	engagement_key = "{}/{}/{}".format(s3path, data_version, engagement_basename)

	# The dated copy sits directly under s3path.
	new_engagement_key = "{}/{}".format(s3path, new_engagement_basename)

	# S3 client plus the transfer helper used for the download and uploads.
	client = boto3.client('s3', region_name='us-west-2')
	transfer = S3Transfer(client)


	# # Retrieve TxP pings and create DataFrame

	# In[5]:

	# Fetch the "testpilot" pings sent by Firefox.
	pings = get_pings(sc, app="Firefox", doc_type="testpilot")

	# Keep only the properties the rest of the notebook reads.
	subset = get_pings_properties(
	    pings,
	    ["clientId", "creationDate", "meta/submissionDate", "payload/tests"])

	# Convert date string to appropriate format and count number of current tests
	subset = subset.map(convert_date).map(get_num_tests)


	# In[6]:

	from pyspark.sql import Row

	# One Row per ping, renaming the ping properties to the column names used
	# by the DataFrame built below.
	rows = subset.map(
	    lambda ping: Row(
	        clientId=ping['clientId'],
	        activityTimestamp=ping['creationDate'],
	        submission_date_s3=ping['meta/submissionDate'],
	        num_tests=ping['num_tests']))

	dataset = sqlContext.createDataFrame(rows)


	# # Filter out those clients that do not have a test installed

	# In[7]:

	# Keep only the rows that report at least one installed test.
	dataset_w_tests = dataset[dataset.num_tests > 0]
	# The first figure is computed over the unfiltered dataset, so label it as
	# the overall client count rather than "clients without tests".
	# Single-argument print(...) behaves the same on Python 2 and Python 3.
	print("All clients (with or without tests) = {}".format(standards.count_distinct_clientids(dataset)))
	print("Clients with at least 1 test = {}".format(standards.count_distinct_clientids(dataset_w_tests)))


	# # Fetch existing data

	# In[10]:

	# Download the current "main" csv so that its rows can be carried over.
	try:
	    transfer.download_file(data_bucket, engagement_key, engagement_basename)
	except botocore.exceptions.ClientError as e:
	    # If the file wasn't there, that's ok. Otherwise, abort!
	    if e.response['Error']['Code'] != "404":
	        # Bare raise keeps the original traceback ("raise e" resets it on
	        # Python 2).
	        raise
	    # Single-argument print(...) behaves the same on Python 2 and Python 3.
	    print("Did not find an existing file at '{}'".format(engagement_key))

	# Rebuild the csv for the last 35 days, timing the call with IPython's %time.
	get_ipython().magic(u'time update_engagement_csv(dataset_w_tests, engagement_basename, new_engagement_basename, cutoff_days=35)')


	# In[12]:

	## Upload the updated csv file to S3

	# The same local file is stored twice: first under the day-specific key,
	# then over the "main" key that holds the latest complete file.
	for _destination_key in (new_engagement_key, engagement_key):
	    transfer.upload_file(new_engagement_basename, data_bucket, _destination_key)


	# In[ ]: