{
"cells": [
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [],
"source": [
"import ujson as json\n",
"import matplotlib.pyplot as plt\n",
"import pandas as pd\n",
"import numpy as np\n",
"import plotly.plotly as py\n",
"\n",
"from plotly.graph_objs import *\n",
"from moztelemetry import get_pings_properties\n",
"from moztelemetry.dataset import Dataset\n",
"from moztelemetry.histogram import Histogram\n",
"from operator import add\n",
"\n",
"from datetime import date, timedelta\n",
"\n",
"%matplotlib inline"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from collections import namedtuple"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def ping_filter(p):\n",
" if p.get(\"environment/system/os/name\", None) != \"Windows_NT\":\n",
" return False\n",
" if p.get(\"payload/info/subsessionLength\", 0) <= 0:\n",
" return False\n",
" if p.get(\"environment/settings/e10sEnabled\", False) != True:\n",
" return False\n",
" addons = p.get(\"environment/addons/activeAddons\", {}) or {}\n",
" for a in addons.itervalues():\n",
" if a.get(\"isSystem\", False) != True:\n",
" return False\n",
" return True"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"pings = Dataset.from_source(\"telemetry\") \\\n",
" .where(docType=\"main\") \\\n",
" .where(submissionDate=lambda d: \"20170428\" <= d <= \"20170504\") \\\n",
" .where(appUpdateChannel=\"nightly\") \\\n",
" .records(sc)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"data = get_pings_properties(pings, [\n",
" \"clientId\",\n",
" \"environment/system/os/name\",\n",
" \"environment/settings/e10sEnabled\",\n",
" \"environment/addons/activeAddons\",\n",
" \"payload/info/subsessionLength\",\n",
" \"payload/histograms/INPUT_EVENT_RESPONSE_MS\",\n",
" \"payload/histograms/GC_MAX_PAUSE_MS\",\n",
" \"payload/histograms/CYCLE_COLLECTOR_MAX_PAUSE\",\n",
" \"payload/histograms/GHOST_WINDOWS\",\n",
" \"payload/histograms/CHECKERBOARD_DURATION\",\n",
" \"payload/processes/parent/scalars/browser.engagement.max_concurrent_tab_count\",\n",
" ], with_processes=True)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"data2 = data.filter(ping_filter)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def add_nullq(a, b):\n",
" if a is None:\n",
" return b\n",
" if b is None:\n",
" return a\n",
" return a + b\n",
"\n",
"def max_nullq(a, b):\n",
" if a is None:\n",
" return b\n",
" if b is None:\n",
" return a\n",
" return max(a, b)\n",
"\n",
"props = (\n",
" ('session_length', 'payload/info/subsessionLength', add_nullq),\n",
" ('input_event_response_chrome', 'payload/histograms/INPUT_EVENT_RESPONSE_MS_parent', add_nullq),\n",
" ('input_event_response_content', 'payload/histograms/INPUT_EVENT_RESPONSE_MS_children', add_nullq),\n",
" ('gc_max_pause_chrome', 'payload/histograms/GC_MAX_PAUSE_MS_parent', add_nullq),\n",
" ('gc_max_pause_content', 'payload/histograms/GC_MAX_PAUSE_MS_children', add_nullq),\n",
" ('cc_max_pause_chrome', 'payload/histograms/CYCLE_COLLECTOR_MAX_PAUSE_parent', add_nullq),\n",
" ('cc_max_pause_content', 'payload/histograms/CYCLE_COLLECTOR_MAX_PAUSE_children', add_nullq),\n",
" ('ghost_windows', 'payload/histograms/GHOST_WINDOWS', add_nullq),\n",
" ('checkerboard_duration', 'payload/histograms/CHECKERBOARD_DURATION', add_nullq),\n",
" ('max_tab_count', 'payload/processes/parent/scalars/browser.engagement.max_concurrent_tab_count', max_nullq),\n",
")\n",
"\n",
"PingData = namedtuple(\"PingData\", (p for p, h, o in props))\n",
"\n",
"def json_to_pingdata(d):\n",
" return PingData._make(d[h] for p, h, o in props)\n",
"\n",
"def add_pingdata(a, b):\n",
" return PingData._make(o(va, vb) for (p, h, o), va, vb in zip(props, a, b))\n",
"\n",
"data3 = data2.map(lambda p: (p['clientId'], json_to_pingdata(p)))"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"data3 = data3.cache()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"data_by_client = data3.reduceByKey(add_pingdata).cache()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"23534"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data_by_client.count()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"two_hours = 60 * 60 * 2\n",
"real_users = data_by_client.filter(lambda (id, d): d.session_length > two_hours)\n",
"total_users = real_users.count()"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"13444"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"total_users"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Ghost window and GC/CC pause times"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Fraction of subsessions that see ghost windows: 9.9%\n"
]
}
],
"source": [
"def ghostfinder((id, d)):\n",
" return (d.ghost_windows is not None) and ((d.ghost_windows.iloc[1::] > 0).any())\n",
"\n",
"ghost_subsessions = data3.filter(ghostfinder).count()\n",
"all_subsessions = data3.count()\n",
"\n",
"print \"Fraction of subsessions that see ghost windows: {:.1f}%\".format(float(ghost_subsessions) / all_subsessions * 100)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"users who experienced ghost windows: 6330 (47.1%)\n",
"users who experienced chrome GC/CC max pause time >150ms: 7691 (57.2%)\n"
]
}
],
"source": [
"ghost_users = real_users.filter(ghostfinder).count()\n",
"\n",
"print \"users who experienced ghost windows: {:d} ({:.1f}%)\".format(ghost_users, ghost_users / float(total_users) * 100)\n",
"\n",
"def pausefinder((id, d)):\n",
" if (d.gc_max_pause_chrome is not None) and (d.gc_max_pause_chrome.truncate(before=150) > 0).any():\n",
" return True\n",
" if (d.cc_max_pause_chrome is not None) and (d.cc_max_pause_chrome.truncate(before=150) > 0).any():\n",
" return True\n",
" return False\n",
"\n",
"chrome_gcpause_users = real_users.filter(pausefinder).count()\n",
"print \"users who experienced chrome GC/CC max pause time >150ms: {:d} ({:.1f}%)\".format(chrome_gcpause_users, chrome_gcpause_users / float(total_users) * 100)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": false,
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Distribution of max chrome cc/gc pause time by user (over one week):\n",
" 9501 - 10000: 290 ( 2.2%) cumulative ( 2.2%)\n",
" 9001 - 9500: 0 ( 0.0%) cumulative ( 2.2%)\n",
" 8501 - 9000: 0 ( 0.0%) cumulative ( 2.2%)\n",
" 8001 - 8500: 43 ( 0.3%) cumulative ( 2.5%)\n",
" 7501 - 8000: 0 ( 0.0%) cumulative ( 2.5%)\n",
" 7001 - 7500: 47 ( 0.3%) cumulative ( 2.8%)\n",
" 6501 - 7000: 0 ( 0.0%) cumulative ( 2.8%)\n",
" 6001 - 6500: 0 ( 0.0%) cumulative ( 2.8%)\n",
" 5501 - 6000: 45 ( 0.3%) cumulative ( 3.2%)\n",
" 5001 - 5500: 48 ( 0.4%) cumulative ( 3.5%)\n",
" 4501 - 5000: 0 ( 0.0%) cumulative ( 3.5%)\n",
" 4001 - 4500: 55 ( 0.4%) cumulative ( 3.9%)\n",
" 3501 - 4000: 62 ( 0.5%) cumulative ( 4.4%)\n",
" 3001 - 3500: 0 ( 0.0%) cumulative ( 4.4%)\n",
" 2501 - 3000: 134 ( 1.0%) cumulative ( 5.4%)\n",
" 2001 - 2500: 62 ( 0.5%) cumulative ( 5.8%)\n",
" 1501 - 2000: 82 ( 0.6%) cumulative ( 6.5%)\n",
" 1001 - 1500: 276 ( 2.1%) cumulative ( 8.5%)\n",
" 901 - 1000: 1486 (11.1%) cumulative (19.6%)\n",
" 801 - 900: 159 ( 1.2%) cumulative (20.7%)\n",
" 701 - 800: 216 ( 1.6%) cumulative (22.4%)\n",
" 601 - 700: 241 ( 1.8%) cumulative (24.1%)\n",
" 501 - 600: 317 ( 2.4%) cumulative (26.5%)\n",
" 401 - 500: 427 ( 3.2%) cumulative (29.7%)\n",
" 301 - 400: 896 ( 6.7%) cumulative (36.3%)\n",
" 201 - 300: 1573 (11.7%) cumulative (48.0%)\n",
" 176 - 200: 585 ( 4.4%) cumulative (52.4%)\n",
" 151 - 175: 659 ( 4.9%) cumulative (57.3%)\n",
" 126 - 150: 1739 (12.9%) cumulative (70.2%)\n",
" 101 - 125: 1142 ( 8.5%) cumulative (78.7%)\n",
" 76 - 100: 1273 ( 9.5%) cumulative (88.2%)\n",
" 51 - 75: 993 ( 7.4%) cumulative (95.6%)\n",
" 26 - 50: 401 ( 3.0%) cumulative (98.6%)\n",
" 0 - 25: 40 ( 0.3%) cumulative (98.9%)\n"
]
}
],
"source": [
"def max_cc_pause((id, d)):\n",
" max_cc_pause = None\n",
" if d.cc_max_pause_chrome is not None:\n",
" max_cc_pause = (d.cc_max_pause_chrome > 0).sort_index(ascending=False).idxmax()\n",
" max_gc_pause = None\n",
" if d.gc_max_pause_chrome is not None:\n",
" max_gc_pause = (d.gc_max_pause_chrome > 0).sort_index(ascending=False).idxmax()\n",
" return max(max_cc_pause, max_gc_pause)\n",
"\n",
"\n",
"buckets = [0] + range(26, 200, 25) + range(201, 1001, 100) + range(1001, 10001, 500) + [10001]\n",
"max_chrome_gccc_pause_by_client = real_users.map(max_cc_pause).histogram(buckets)\n",
"\n",
"print \"Distribution of max chrome cc/gc pause time by user (over one week):\"\n",
"cum = 0\n",
"for i in range(len(buckets) - 2, -1, -1):\n",
" start = buckets[i]\n",
" end = buckets[i+1] - 1\n",
" user_count = max_chrome_gccc_pause_by_client[1][i]\n",
" cum += user_count\n",
" print \"{:5d} - {:5d}: {:4d} ({:4.1f}%) cumulative ({:4.1f}%)\".format(start, end, user_count, user_count / float(total_users) * 100, cum / float(total_users) * 100)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"*Note: the above distribution is based on the histogram buckets. The GC histogram max is 1000 (one second). The CC histogram max is 10000 (ten seconds). This partly explains the uneven bump of the 1-second bucket for long GCs but short CCs.*"
]
},
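{
"cell_type": "markdown",
"metadata": {},
"source": [
"To make the clamping effect concrete, here is a minimal sketch; the `pauses` values and helper names below are made up for illustration, not telemetry data:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Hypothetical pause durations in ms, chosen only to illustrate clamping.\n",
"pauses = [80, 400, 950, 1200, 3000, 9000]\n",
"\n",
"GC_TOP_BUCKET = 1000   # GC_MAX_PAUSE_MS tops out at one second\n",
"CC_TOP_BUCKET = 10000  # CYCLE_COLLECTOR_MAX_PAUSE tops out at ten seconds\n",
"\n",
"def clamp_to_top(value, top):\n",
"    # A pause longer than a histogram's range is recorded in its top bucket.\n",
"    return min(value, top)\n",
"\n",
"print [clamp_to_top(p, GC_TOP_BUCKET) for p in pauses]  # 1200, 3000, 9000 all become 1000\n",
"print [clamp_to_top(p, CC_TOP_BUCKET) for p in pauses]  # nothing clamps below ten seconds"
]
},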
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"users who experienced content CC max pause time >2500ms: 3110 (23.1%)\n"
]
}
],
"source": [
"def pausefinder_content((id, d)):\n",
" # don't bother measuring GC pause times here, because the max recorded GC pause time is\n",
" # one second. TODO?\n",
" if (d.cc_max_pause_content is not None) and (d.cc_max_pause_content.truncate(before=2500) > 0).any():\n",
" return True\n",
" return False\n",
"\n",
"content_gcpause_users = real_users.filter(pausefinder_content).count()\n",
"print \"users who experienced content CC max pause time >2500ms: {:d} ({:.1f}%)\".format(content_gcpause_users, content_gcpause_users / float(total_users) * 100)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Distribution of max content cc/gc pause time by user (over one week):\n",
" 9501 - 10000: 1114 ( 8.3%) cumulative ( 8.3%)\n",
" 9001 - 9500: 0 ( 0.0%) cumulative ( 8.3%)\n",
" 8501 - 9000: 0 ( 0.0%) cumulative ( 8.3%)\n",
" 8001 - 8500: 185 ( 1.4%) cumulative ( 9.7%)\n",
" 7501 - 8000: 0 ( 0.0%) cumulative ( 9.7%)\n",
" 7001 - 7500: 185 ( 1.4%) cumulative (11.0%)\n",
" 6501 - 7000: 0 ( 0.0%) cumulative (11.0%)\n",
" 6001 - 6500: 0 ( 0.0%) cumulative (11.0%)\n",
" 5501 - 6000: 198 ( 1.5%) cumulative (12.5%)\n",
" 5001 - 5500: 258 ( 1.9%) cumulative (14.4%)\n",
" 4501 - 5000: 0 ( 0.0%) cumulative (14.4%)\n",
" 4001 - 4500: 285 ( 2.1%) cumulative (16.6%)\n",
" 3501 - 4000: 302 ( 2.2%) cumulative (18.8%)\n",
" 3001 - 3500: 0 ( 0.0%) cumulative (18.8%)\n",
" 2501 - 3000: 647 ( 4.8%) cumulative (23.6%)\n",
" 2001 - 2500: 376 ( 2.8%) cumulative (26.4%)\n",
" 1501 - 2000: 411 ( 3.1%) cumulative (29.5%)\n",
" 1001 - 1500: 1270 ( 9.4%) cumulative (38.9%)\n",
" 901 - 1000: 741 ( 5.5%) cumulative (44.4%)\n",
" 801 - 900: 439 ( 3.3%) cumulative (47.7%)\n",
" 701 - 800: 460 ( 3.4%) cumulative (51.1%)\n",
" 601 - 700: 426 ( 3.2%) cumulative (54.3%)\n",
" 501 - 600: 455 ( 3.4%) cumulative (57.7%)\n",
" 401 - 500: 474 ( 3.5%) cumulative (61.2%)\n",
" 301 - 400: 814 ( 6.1%) cumulative (67.2%)\n",
" 201 - 300: 843 ( 6.3%) cumulative (73.5%)\n",
" 176 - 200: 310 ( 2.3%) cumulative (75.8%)\n",
" 151 - 175: 303 ( 2.3%) cumulative (78.1%)\n",
" 126 - 150: 508 ( 3.8%) cumulative (81.9%)\n",
" 101 - 125: 330 ( 2.5%) cumulative (84.3%)\n",
" 76 - 100: 437 ( 3.3%) cumulative (87.6%)\n",
" 51 - 75: 439 ( 3.3%) cumulative (90.8%)\n",
" 26 - 50: 557 ( 4.1%) cumulative (95.0%)\n",
" 0 - 25: 427 ( 3.2%) cumulative (98.1%)\n"
]
}
],
"source": [
"def max_cc_pause_content((id, d)):\n",
" max_cc_pause = None\n",
" if d.cc_max_pause_content is not None:\n",
" max_cc_pause = (d.cc_max_pause_content > 0).sort_index(ascending=False).idxmax()\n",
" max_gc_pause = None\n",
" if d.gc_max_pause_content is not None:\n",
" max_gc_pause = (d.gc_max_pause_content > 0).sort_index(ascending=False).idxmax()\n",
" return max(max_cc_pause, max_gc_pause)\n",
"\n",
"max_content_gccc_pause_by_client = real_users.map(max_cc_pause_content).histogram(buckets)\n",
"\n",
"print \"Distribution of max content cc/gc pause time by user (over one week):\"\n",
"cum = 0\n",
"for i in range(len(buckets) - 2, -1, -1):\n",
" start = buckets[i]\n",
" end = buckets[i+1] - 1\n",
" user_count = max_content_gccc_pause_by_client[1][i]\n",
" cum += user_count\n",
" print \"{:5d} - {:5d}: {:4d} ({:4.1f}%) cumulative ({:4.1f}%)\".format(start, end, user_count, user_count / float(total_users) * 100, cum / float(total_users) * 100)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Finally let's look at these GC pauses as MTBF. This isn't quite fair, because the measurements in question only record the max pause, not any pause. But it's likely that there's one dominant pause in a GC/CC cycle in general, so it's still interesting."
]
},
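{
"cell_type": "markdown",
"metadata": {},
"source": [
"For reference, the pseudo-MTBF computed below is simply\n",
"\n",
"$$\\mathrm{MTBF} \\approx \\frac{\\text{total session hours across real users}}{\\text{number of GC/CC cycles with max pause} > 150\\ \\mathrm{ms}}$$\n",
"\n",
"treating each qualifying pause as one \"failure\"."
]
},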
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def add_users(a, b):\n",
"    a_client, a_data = a\n",
"    b_client, b_data = b\n",
"    return ('*', add_pingdata(a_data, b_data))\n",
"\n",
"# Aggregate every real user's data into a single PingData record.\n",
"aggregate_all = real_users.reduce(add_users)[1]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# session_length is in seconds; convert to float before dividing to avoid\n",
"# Python 2 integer division.\n",
"total_hours = float(aggregate_all.session_length) / 60 / 60"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Chrome process pseudo-MTBF, chrome GC/CC pauses >150ms: 5.8 hours\n"
]
}
],
"source": [
"def count_chrome_pauses((id, d)):\n",
" gc_count = 0 if d.gc_max_pause_chrome is None else d.gc_max_pause_chrome.truncate(before=150).sum()\n",
" cc_count = 0 if d.cc_max_pause_chrome is None else d.cc_max_pause_chrome.truncate(before=150).sum()\n",
" return gc_count + cc_count\n",
"\n",
"total_chrome_pauses = real_users.map(count_chrome_pauses).reduce(add)\n",
"print \"Chrome process pseudo-MTBF, chrome GC/CC pauses >150ms: {:.1f} hours\".format(total_hours / total_chrome_pauses)"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": true
},
"source": [
"# Checkerboarding MTBF"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"I (bsmedberg) have asserted that checkerboarding happens infrequently-enough that it should not be a main focus of quantum flow efforts. This is based on an analysis of the duration of checkerboarding events from telemetry."
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"collapsed": false,
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 100000ms: 1081.293 hours\n",
" 79889ms: 1007.872 hours\n",
" 63822ms: 956.691 hours\n",
" 50987ms: 897.117 hours\n",
" 40733ms: 842.107 hours\n",
" 32541ms: 788.346 hours\n",
" 25997ms: 731.811 hours\n",
" 20769ms: 676.243 hours\n",
" 16592ms: 623.452 hours\n",
" 13255ms: 551.192 hours\n",
" 10589ms: 489.011 hours\n",
" 8459ms: 429.421 hours\n",
" 6758ms: 371.643 hours\n",
" 5399ms: 317.450 hours\n",
" 4313ms: 261.752 hours\n",
" 3446ms: 215.656 hours\n",
" 2753ms: 174.150 hours\n",
" 2199ms: 138.944 hours\n",
" 1757ms: 110.164 hours\n",
" 1404ms: 86.374 hours\n",
" 1122ms: 67.121 hours\n",
" 896ms: 52.098 hours\n",
" 716ms: 40.628 hours\n",
" 572ms: 31.529 hours\n",
" 457ms: 24.438 hours\n",
" 365ms: 18.682 hours\n",
" 292ms: 13.999 hours\n",
" 233ms: 10.224 hours\n",
" 186ms: 8.362 hours\n"
]
}
],
"source": [
"stotal = 0\n",
"for cutoff, count in aggregate_all.checkerboard_duration.sort_index(ascending=False).iteritems():\n",
" if cutoff < 150:\n",
" break\n",
" stotal += count\n",
" print \" {:5}ms: {:0.3f} hours\".format(cutoff, total_hours / stotal)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Tab Usage\n",
"\n",
"Many Firefox developers assume that most users browse with multiple tabs open on a regular basis. The data shows otherwise!"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Distribution of the maximum number of tabs users had open over one week:\n",
" 1 - 1: 646 (4.8%)\n",
" 2 - 2: 1284 (9.6%)\n",
" 3 - 3: 1482 (11.0%)\n",
" 4 - 4: 1431 (10.6%)\n",
" 5 - 5: 1216 (9.0%)\n",
" 6 - 9: 3326 (24.7%)\n",
" 10 - 14: 1786 (13.3%)\n",
" 15 - 19: 750 (5.6%)\n",
" 20 - 24: 341 (2.5%)\n",
" 25 - 49: 513 (3.8%)\n",
" 50 - 99: 158 (1.2%)\n",
" 100 - 199: 67 (0.5%)\n",
" 200 - 299: 14 (0.1%)\n",
" 300 - 999: 28 (0.2%)\n"
]
}
],
"source": [
"tab_buckets = [1, 2, 3, 4, 5, 6, 10, 15, 20, 25, 50, 100, 200, 300, 1000]\n",
"tab_histogram = real_users.map(lambda (id, d): d.max_tab_count).histogram(tab_buckets)\n",
"\n",
"print \"Distribution of the maximum number of tabs users had open over one week:\"\n",
"for i in range(0, len(tab_buckets) - 1):\n",
" start = tab_buckets[i]\n",
" end = tab_buckets[i+1] - 1\n",
" user_count = tab_histogram[1][i]\n",
" print \"{:4d} - {:4d}: {:d} ({:.1f}%)\".format(start, end, user_count, user_count / float(total_users) * 100)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"What about *heavy* users? If we define a heavy users as anyone who used Firefox more than 40 hours this week, how does that change the distribution?"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Distribution of the maximum number of tabs users had open over one week:\n",
" 1 - 1: 162 (3.9%)\n",
" 2 - 2: 210 (5.1%)\n",
" 3 - 3: 258 (6.3%)\n",
" 4 - 4: 287 (7.0%)\n",
" 5 - 5: 255 (6.2%)\n",
" 6 - 9: 998 (24.2%)\n",
" 10 - 14: 764 (18.5%)\n",
" 15 - 19: 392 (9.5%)\n",
" 20 - 24: 184 (4.5%)\n",
" 25 - 49: 302 (7.3%)\n",
" 50 - 99: 112 (2.7%)\n",
" 100 - 199: 46 (1.1%)\n",
" 200 - 299: 8 (0.2%)\n",
" 300 - 999: 17 (0.4%)\n"
]
}
],
"source": [
"heavy_users = real_users.filter(lambda (id, d): d.session_length > 40 * 60 * 60)\n",
"tab_histogram = heavy_users.map(lambda (id, d): d.max_tab_count).histogram(tab_buckets)\n",
"\n",
"total_heavy_users = heavy_users.count()\n",
"\n",
"print \"Distribution of the maximum number of tabs users had open over one week:\"\n",
"for i in range(0, len(tab_buckets) - 1):\n",
" start = tab_buckets[i]\n",
" end = tab_buckets[i+1] - 1\n",
" user_count = tab_histogram[1][i]\n",
" print \"{:4d} - {:4d}: {:d} ({:.1f}%)\".format(start, end, user_count, user_count / float(total_heavy_users) * 100)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Conclusion: Heavy user tab usage isn't that much different from the average user.\n",
"*Caveat: this is nightly, which users are completely atypical.*"
]
}
],
"metadata": {
"anaconda-cloud": {},
"kernelspec": {
"display_name": "Python [default]",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.12"
}
},
"nbformat": 4,
"nbformat_minor": 0
}