daily-latency-metrics
@bsmedberg, created June 9, 2017

# coding: utf-8
# In[1]:
import ujson as json
import pandas as pd
import numpy as np
from moztelemetry import get_pings_properties
from moztelemetry.dataset import Dataset
from moztelemetry.histogram import Histogram
from operator import add
from datetime import date, timedelta
get_ipython().magic(u'matplotlib inline')
# In[2]:
from collections import namedtuple
# In[3]:
import itertools
# In[4]:
import pyspark
import pyspark.sql.types as st
import pyspark.sql.functions as sf
# In[12]:
import boto3
# In[5]:
def ping_filter(p):
    # Keep only Windows e10s sessions with a positive subsession length.
    if p.get("environment/system/os/name", None) != "Windows_NT":
        return False
    if p.get("payload/info/subsessionLength", 0) <= 0:
        return False
    if not p.get("environment/settings/e10sEnabled", False):
        return False
    return True
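# In[ ]:
# A minimal sketch (not from the original notebook) of what ping_filter
# accepts; the dicts below are hypothetical pings keyed the same way
# get_pings_properties keys its output.
assert ping_filter({
    "environment/system/os/name": "Windows_NT",
    "payload/info/subsessionLength": 3600,
    "environment/settings/e10sEnabled": True,
})
assert not ping_filter({
    "environment/system/os/name": "Darwin",
    "payload/info/subsessionLength": 3600,
    "environment/settings/e10sEnabled": True,
})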
# In[6]:
Ping = namedtuple(
    "Ping",
    (
        "client_id",
        "build_id",
        "quantum_ready",
        "chrome_input_latency_gt_250",
        "chrome_input_latency_gt_2500",
        "content_input_latency_gt_250",
        "content_input_latency_gt_2500",
        "chrome_gc_gt_150",
        "chrome_cc_gt_150",
        "content_gc_gt_2500",
        "content_cc_gt_2500",
        "ghost_windows",
        "subsession_length",
    )
)

ping_properties = [
    "clientId",
    "environment/build/buildId",
    "environment/system/os/name",
    "environment/settings/e10sEnabled",
    "environment/addons/theme/id",
    "environment/addons/activeAddons",
    "payload/histograms/INPUT_EVENT_RESPONSE_COALESCED_MS",
    "payload/histograms/CYCLE_COLLECTOR_MAX_PAUSE",
    "payload/histograms/GC_MAX_PAUSE_MS_2",
    "payload/histograms/GHOST_WINDOWS",
    "payload/info/subsessionLength",
]

default_themes = (
    '{972ce4c6-7e08-4474-a285-3208198ce6fd}',
    'firefox-compact-light@mozilla.org',
    'firefox-compact-dark@mozilla.org',
)
# In[7]:
def th(d, name, cutoff):
    """Count histogram events in buckets at or above cutoff."""
    h = d[name]
    if h is None:
        return 0
    return int(h.truncate(before=cutoff).sum())

def ping_mapper(d):
    # Classify how "Quantum-ready" this session is: e10s enabled, no add-ons
    # other than system add-ons and WebExtensions, and a default theme.
    if not d["environment/settings/e10sEnabled"]:
        quantum_ready = "no_e10s"
    elif any(not (e.get("isSystem", False) or e.get("isWebExtension", False))
             for e in (d["environment/addons/activeAddons"] or {}).itervalues()):
        quantum_ready = "no_addons"
    elif d["environment/addons/theme/id"] not in default_themes:
        quantum_ready = "no_other_theme"
    else:
        quantum_ready = "yes"
    return Ping(
        d["clientId"],
        d["environment/build/buildId"],
        quantum_ready,
        th(d, "payload/histograms/INPUT_EVENT_RESPONSE_COALESCED_MS_parent", 250),
        th(d, "payload/histograms/INPUT_EVENT_RESPONSE_COALESCED_MS_parent", 2500),
        th(d, "payload/histograms/INPUT_EVENT_RESPONSE_COALESCED_MS_children", 250),
        th(d, "payload/histograms/INPUT_EVENT_RESPONSE_COALESCED_MS_children", 2500),
        th(d, "payload/histograms/GC_MAX_PAUSE_MS_2_parent", 150),
        th(d, "payload/histograms/CYCLE_COLLECTOR_MAX_PAUSE_parent", 150),
        th(d, "payload/histograms/GC_MAX_PAUSE_MS_2_children", 2500),
        th(d, "payload/histograms/CYCLE_COLLECTOR_MAX_PAUSE_children", 2500),
        th(d, "payload/histograms/GHOST_WINDOWS", 1),
        d["payload/info/subsessionLength"],
    )
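# In[ ]:
# Hypothetical worked example of th(): get_pings_properties delivers
# histograms as pandas Series indexed by bucket start, so
# truncate(before=cutoff) keeps the buckets at or above the cutoff and sum()
# counts the events in them. The bucket values here are made up.
example = {"h": pd.Series({0: 5, 100: 3, 300: 2, 3000: 1})}
assert th(example, "h", 250) == 3   # the 300 and 3000 buckets
assert th(example, "h", 2500) == 1  # the 3000 bucket only
assert th({"h": None}, "h", 250) == 0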
# In[8]:
ping_schema = st.StructType([
    st.StructField("client_id", st.StringType()),
    st.StructField("build_id", st.StringType()),
    st.StructField("quantum_ready", st.StringType()),
    st.StructField("chrome_input_latency_gt_250", st.IntegerType()),
    st.StructField("chrome_input_latency_gt_2500", st.IntegerType()),
    st.StructField("content_input_latency_gt_250", st.IntegerType()),
    st.StructField("content_input_latency_gt_2500", st.IntegerType()),
    st.StructField("chrome_gc_gt_150", st.IntegerType()),
    st.StructField("chrome_cc_gt_150", st.IntegerType()),
    st.StructField("content_gc_gt_2500", st.IntegerType()),
    st.StructField("content_cc_gt_2500", st.IntegerType()),
    st.StructField("ghost_windows", st.IntegerType()),
    st.StructField("subsession_length", st.LongType()),
])
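# In[ ]:
# Hypothetical sanity check that a Ping row round-trips through ping_schema;
# all values below are made up.
row = Ping("fake-client-id", "20170601000000", "yes",
           0, 0, 1, 0, 0, 0, 0, 0, 0, 3600)
spark.createDataFrame([row], ping_schema).show()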
# In[34]:
def save_submission_date(day):
    # Pull nightly main pings submitted on or after `day`, reduce each to a
    # Ping row, and save them as a day-partitioned Parquet dataset.
    ds = (Dataset.from_source("telemetry")
          .where(docType='main')
          .where(submissionDate=lambda d: d >= day.strftime('%Y%m%d'))
          .where(appUpdateChannel="nightly"))
    pings = ds.records(sc)
    data = (get_pings_properties(pings, ping_properties, with_processes=True)
            .filter(ping_filter)
            .map(ping_mapper))
    ds = spark.createDataFrame(data, ping_schema)
    ds.write.mode("overwrite").parquet(
        "s3a://net-mozaws-prod-us-west-2-pipeline-analysis/bsmedberg/"
        "quantum-dataset/submission_date={}".format(day.strftime("%Y%m%d")))
# In[52]:
def daily_stats(dataset):
    summations = (
        "subsession_length",
        "chrome_input_latency_gt_250",
        "chrome_input_latency_gt_2500",
        "content_input_latency_gt_250",
        "content_input_latency_gt_2500",
    )
    props = [sf.sum(dataset[n]).alias(n) for n in summations]
    props.append(sf.count("*").alias("total_subsessions"))
    props.append(sf.sum(sf.when(dataset.ghost_windows > 0, 1).otherwise(0))
                 .alias("subsessions_with_ghost_windows"))
    data = dataset.agg(*props).first()
    # MTBF here is hours of total subsession time per event.
    hours = data.subsession_length / 60.0 / 60.0
    return {
        "total_subsessions": data.total_subsessions,
        "ghost_windows_rate": 1.0 * data.subsessions_with_ghost_windows / data.total_subsessions,
        "mtbf_chrome_input_latency_gt_250": hours / data.chrome_input_latency_gt_250,
        "mtbf_chrome_input_latency_gt_2500": hours / data.chrome_input_latency_gt_2500,
        "mtbf_content_input_latency_gt_250": hours / data.content_input_latency_gt_250,
        "mtbf_content_input_latency_gt_2500": hours / data.content_input_latency_gt_2500,
    }
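# In[ ]:
# Hypothetical check of the MTBF arithmetic above: 7200 seconds of
# subsession time with 4 slow input events is an MTBF of 0.5 hours. Note
# that daily_stats raises ZeroDivisionError on a day with zero events.
hours = 7200 / 60.0 / 60.0
assert hours / 4 == 0.5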
def weekly_stats(dataset):
    # Expects one row per client with boolean event flags (grouped_by_client
    # below); reports the fraction of clients hitting each event at least once.
    data = dataset.agg(
        sf.count("*").alias("client_count"),
        sf.sum(sf.when(dataset.chrome_gc_gt_150, 1).otherwise(0)).alias("chrome_gc_gt_150"),
        sf.sum(sf.when(dataset.chrome_cc_gt_150, 1).otherwise(0)).alias("chrome_cc_gt_150"),
        sf.sum(sf.when(dataset.chrome_gc_gt_150 | dataset.chrome_cc_gt_150, 1).otherwise(0)).alias("chrome_gccc_gt_150"),
        sf.sum(sf.when(dataset.content_gc_gt_2500, 1).otherwise(0)).alias("content_gc_gt_2500"),
        sf.sum(sf.when(dataset.content_cc_gt_2500, 1).otherwise(0)).alias("content_cc_gt_2500"),
        sf.sum(sf.when(dataset.content_gc_gt_2500 | dataset.content_cc_gt_2500, 1).otherwise(0)).alias("content_gccc_gt_2500")
    ).first()
    total = float(data.client_count)
    return {
        "total_users": data.client_count,
        "chrome_gc_gt_150": data.chrome_gc_gt_150 / total,
        "chrome_cc_gt_150": data.chrome_cc_gt_150 / total,
        "chrome_gccc_gt_150": data.chrome_gccc_gt_150 / total,
        "content_gc_gt_2500": data.content_gc_gt_2500 / total,
        "content_cc_gt_2500": data.content_cc_gt_2500 / total,
        "content_gccc_gt_2500": data.content_gccc_gt_2500 / total,
    }
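# In[ ]:
# Toy sanity check (hypothetical data) for the when/otherwise counting in
# weekly_stats: each client with a True flag counts exactly once.
toy = spark.createDataFrame(
    [("a", True), ("b", False), ("c", True)],
    ["client_id", "chrome_gc_gt_150"])
toy.agg(sf.sum(sf.when(toy.chrome_gc_gt_150, 1).otherwise(0))
        .alias("clients_with_slow_gc")).show()
# expect clients_with_slow_gc == 2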
def analyze_for_date(day):
    weekago = day - timedelta(days=7)
    dataset = spark.read.parquet(
        "s3a://net-mozaws-prod-us-west-2-pipeline-analysis/bsmedberg/quantum-dataset")
    daily_data = dataset.where(dataset.submission_date == day.strftime("%Y%m%d")).cache()
    weekly_data = dataset.where(
        (dataset.submission_date > weekago.strftime("%Y%m%d")) &
        (dataset.submission_date <= day.strftime("%Y%m%d")))
    # One row per client over the trailing week: quantum_ready only if every
    # subsession that week was quantum-ready, plus any-event flags per client.
    grouped_by_client = weekly_data.groupBy('client_id').agg(
        (sf.sum(sf.when(dataset.quantum_ready == "yes", 0).otherwise(1)) == 0).alias("quantum_ready"),
        (sf.sum(dataset.chrome_gc_gt_150) > 0).alias("chrome_gc_gt_150"),
        (sf.sum(dataset.chrome_cc_gt_150) > 0).alias("chrome_cc_gt_150"),
        (sf.sum(dataset.content_gc_gt_2500) > 0).alias("content_gc_gt_2500"),
        (sf.sum(dataset.content_cc_gt_2500) > 0).alias("content_cc_gt_2500")
    ).cache()
    output_data = {
        'nightly_all': daily_stats(daily_data),
        'nightly_quantumready': daily_stats(daily_data.where(daily_data.quantum_ready == "yes")),
        'quantum_readiness': dict(dataset.groupBy("quantum_ready").count().collect()),
        'weekly_all': weekly_stats(grouped_by_client),
        'weekly_quantumready': weekly_stats(grouped_by_client.where(grouped_by_client.quantum_ready)),
    }
    bucket = boto3.resource('s3').Bucket('telemetry-public-analysis-2')
    bucket.put_object(Body=json.dumps(output_data),
                      Key='bsmedberg/daily-latency-metrics/{}.json'.format(day.strftime("%Y%m%d")))
    return output_data
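# In[ ]:
# A sketch of how the two entry points might be driven together for a
# backfill; the date range here is hypothetical.
for offset in range(3):
    d = date(2017, 6, 5) + timedelta(days=offset)
    save_submission_date(d)
    analyze_for_date(d)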
# In[44]:
save_submission_date(date(2017, 5, 28))
# In[45]:
ds = spark.read.parquet("s3a://net-mozaws-prod-us-west-2-pipeline-analysis/bsmedberg/quantum-dataset")
ds.select("submission_date").distinct().sort('submission_date').collect()
# In[55]:
q = analyze_for_date(date(2017, 6, 5))
q
# In[51]:
day = date(2017, 6, 8)
bucket = boto3.resource('s3').Bucket('telemetry-public-analysis-2')
bucket.put_object(Body=json.dumps(q),
                  Key='bsmedberg/daily-latency-metrics/{}.json'.format(day.strftime("%Y%m%d")))
# In[15]:
s3 = boto3.resource('s3')
my_bucket = s3.Bucket('net-mozaws-prod-us-west-2-pipeline-analysis')
# In[30]:
files = list(my_bucket.objects.filter(Prefix="bsmedberg/quantum-dataset/submission_date="))
# In[29]:
PREFIX = "bsmedberg/quantum-dataset/submission_date="

def too_old(files):
    # Yield keys for partitions dated before the retention cutoff.
    for f in files:
        if not f.key.startswith(PREFIX):
            continue
        d = f.key[len(PREFIX):]
        if d < "20170605":
            yield f.key

def delete_list(l):
    my_bucket.delete_objects(Delete={'Objects': [{'Key': key} for key in l]})

# delete_objects accepts at most 1000 keys per request, so flush in batches.
dlist = []
for key in too_old(files):
    dlist.append(key)
    if len(dlist) == 1000:
        delete_list(dlist)
        dlist = []
if dlist:
    delete_list(dlist)
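# In[ ]:
# An equivalent batching sketch using the itertools import from the top of
# the notebook (untested against this bucket): islice pulls up to 1000 keys
# at a time from the generator until it is exhausted.
def chunks(iterable, n):
    it = iter(iterable)
    while True:
        batch = list(itertools.islice(it, n))
        if not batch:
            return
        yield batch

for batch in chunks(too_old(files), 1000):
    delete_list(batch)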