Skip to content

Instantly share code, notes, and snippets.

@bsmedberg
Created February 19, 2016 03:49
Show Gist options
  • Save bsmedberg/5410d8daa5f0283d86b0 to your computer and use it in GitHub Desktop.
Save bsmedberg/5410d8daa5f0283d86b0 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Populating the interactive namespace from numpy and matplotlib\n"
]
}
],
"source": [
"import matplotlib.pyplot as plt\n",
"import pandas as pd\n",
"import numpy as np\n",
"import plotly.plotly as py\n",
"\n",
"%pylab inline"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from pyspark.sql import *\n",
"from pyspark.sql.functions import *\n",
"import itertools\n",
"import datetime"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"frame = sqlContext.read.load(\"s3://telemetry-parquet/longitudinal/v20160212\")"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"subselect = frame.selectExpr(\"info.subsessionStartDate\", \"activeAddons\")"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"StructField(activeAddons,ArrayType(MapType(StringType,StructType(List(StructField(blocklisted,BooleanType,true),StructField(description,StringType,true),StructField(name,StringType,true),StructField(userDisabled,BooleanType,true),StructField(appDisabled,BooleanType,true),StructField(version,StringType,true),StructField(scope,IntegerType,true),StructField(type,StringType,true),StructField(foreignInstall,BooleanType,true),StructField(hasBinaryComponents,BooleanType,true),StructField(installDay,LongType,true),StructField(updateDay,LongType,true),StructField(signedState,IntegerType,true))),false),false),true)"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": []
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"MapType(StringType,StructType(List(StructField(blocklisted,BooleanType,true),StructField(description,StringType,true),StructField(name,StringType,true),StructField(userDisabled,BooleanType,true),StructField(appDisabled,BooleanType,true),StructField(version,StringType,true),StructField(scope,IntegerType,true),StructField(type,StringType,true),StructField(foreignInstall,BooleanType,true),StructField(hasBinaryComponents,BooleanType,true),StructField(installDay,LongType,true),StructField(updateDay,LongType,true),StructField(signedState,IntegerType,true))),false)"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"activeAddonsType = [f.dataType for f in subselect.schema.fields if f.name == 'activeAddons'][0].elementType\n",
"activeAddonsType"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def find_first_recent_activeAddons(subsessionStartDate, activeAddons):\n",
" if activeAddons is None:\n",
" return None\n",
" for subsessionStartDate, activeAddons in itertools.izip(subsessionStartDate, activeAddons):\n",
" try:\n",
" if datetime.datetime.strptime(subsessionStartDate[:10], \"%Y-%m-%d\").date() >= datetime.date(2016, 1, 18) and activeAddons is not None:\n",
" return activeAddons\n",
" except ValueError:\n",
" continue\n",
" return None\n",
"find_first_recent_activeAddons_udf = udf(find_first_recent_activeAddons, activeAddonsType)"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"lastActiveAddons = subselect.select(find_first_recent_activeAddons_udf(subselect.subsessionStartDate, subselect.activeAddons).alias(\"lastActiveAddons\")).where(\"lastActiveAddons IS NOT NULL\")"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {
"collapsed": false
},
"outputs": [
{
"ename": "KeyboardInterrupt",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m<ipython-input-50-13a4f1f6f863>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mlastActiveAddons\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mselectExpr\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"lastActiveAddons IS NULL AS isNull\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mgroupBy\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"isNull\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcount\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcollect\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[1;32m/usr/lib/spark/python/pyspark/sql/dataframe.py\u001b[0m in \u001b[0;36mcollect\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 277\u001b[0m \"\"\"\n\u001b[0;32m 278\u001b[0m \u001b[1;32mwith\u001b[0m \u001b[0mSCCallSiteSync\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_sc\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0mcss\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 279\u001b[1;33m \u001b[0mport\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_sc\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_jvm\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mPythonRDD\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcollectAndServe\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_jdf\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mjavaToPython\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mrdd\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 280\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mlist\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0m_load_from_socket\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mport\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mBatchedSerializer\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mPickleSerializer\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 281\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m/home/hadoop/anaconda2/lib/python2.7/site-packages/py4j/java_gateway.pyc\u001b[0m in \u001b[0;36m__call__\u001b[1;34m(self, *args)\u001b[0m\n\u001b[0;32m 534\u001b[0m \u001b[0mEND_COMMAND_PART\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 535\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 536\u001b[1;33m \u001b[0manswer\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mgateway_client\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msend_command\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcommand\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 537\u001b[0m return_value = get_return_value(answer, self.gateway_client,\n\u001b[0;32m 538\u001b[0m self.target_id, self.name)\n",
"\u001b[1;32m/home/hadoop/anaconda2/lib/python2.7/site-packages/py4j/java_gateway.pyc\u001b[0m in \u001b[0;36msend_command\u001b[1;34m(self, command, retry)\u001b[0m\n\u001b[0;32m 362\u001b[0m \u001b[0mconnection\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_get_connection\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 363\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 364\u001b[1;33m \u001b[0mresponse\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mconnection\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msend_command\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcommand\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 365\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_give_back_connection\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mconnection\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 366\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0mPy4JNetworkError\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m/home/hadoop/anaconda2/lib/python2.7/site-packages/py4j/java_gateway.pyc\u001b[0m in \u001b[0;36msend_command\u001b[1;34m(self, command)\u001b[0m\n\u001b[0;32m 471\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 472\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msocket\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msendall\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcommand\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mencode\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'utf-8'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 473\u001b[1;33m \u001b[0manswer\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0msmart_decode\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mstream\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mreadline\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m-\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 474\u001b[0m \u001b[0mlogger\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdebug\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"Answer received: {0}\"\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0manswer\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 475\u001b[0m \u001b[1;31m# Happens when a the other end is dead. There might be an empty\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m/home/hadoop/anaconda2/lib/python2.7/socket.pyc\u001b[0m in \u001b[0;36mreadline\u001b[1;34m(self, size)\u001b[0m\n\u001b[0;32m 432\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 433\u001b[0m \u001b[1;32mwhile\u001b[0m \u001b[0mdata\u001b[0m \u001b[1;33m!=\u001b[0m \u001b[1;34m\"\\n\"\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 434\u001b[1;33m \u001b[0mdata\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mrecv\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 435\u001b[0m \u001b[1;32mif\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0mdata\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 436\u001b[0m \u001b[1;32mbreak\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;31mKeyboardInterrupt\u001b[0m: "
]
}
],
"source": [
"lastActiveAddons.selectExpr(\"lastActiveAddons IS NULL AS isNull\").groupBy(\"isNull\").count().collect()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The previous line takes approximately 35 minutes on a single node. 25-40% of that time is running Java GC."
]
},
{
"cell_type": "code",
"execution_count": 65,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"DataFrame[addonId: string, name: string, version: string, signedState: int, count: bigint]"
]
},
"execution_count": 65,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"groupedAddons = lastActiveAddons.select(explode(lastActiveAddons.lastActiveAddons).alias(\"addonId\", \"addon\")).groupBy(\"addonId\", \"addon.name\", \"addon.version\", \"addon.signedState\").count().cache()\n",
"groupedAddons"
]
},
{
"cell_type": "code",
"execution_count": 66,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 356 ms, sys: 184 ms, total: 540 ms\n",
"Wall time: 47min 10s\n"
]
}
],
"source": [
"%time top_addons_by_id = groupedAddons.groupBy(\"addonId\").sum(\"count\").orderBy(\"sum(count)\", ascending=False).take(200)"
]
},
{
"cell_type": "code",
"execution_count": 67,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[Row(addonId=u'{d10d0bf8-f5b5-c8b4-a8b2-2b9879e08c5d}', sum(count)=322865),\n",
" Row(addonId=u'{82AF8DCA-6DE9-405D-BD5E-43525BDAD38A}', sum(count)=185393),\n",
" Row(addonId=u'firefox-hotfix@mozilla.org', sum(count)=164857),\n",
" Row(addonId=u'wrc@avast.com', sum(count)=73195),\n",
" Row(addonId=u'{b9db16a4-6edc-47ec-a1f4-b86292ed211d}', sum(count)=69010),\n",
" Row(addonId=u'{4ED1F68A-5463-4931-9384-8FFF5ED91D92}', sum(count)=57977),\n",
" Row(addonId=u'cpmanager@mozillaonline.com', sum(count)=52816),\n",
" Row(addonId=u'cehomepage@mozillaonline.com', sum(count)=51901),\n",
" Row(addonId=u'tabtweak@mozillaonline.com', sum(count)=51463),\n",
" Row(addonId=u'easyscreenshot@mozillaonline.com', sum(count)=51071),\n",
" Row(addonId=u'commonfix@mozillaonline.com', sum(count)=49290),\n",
" Row(addonId=u'coba@mozilla.com.cn', sum(count)=47864),\n",
" Row(addonId=u'wx-assistant@mozillaonline.com', sum(count)=46713),\n",
" Row(addonId=u'vb@yandex.ru', sum(count)=46194),\n",
" Row(addonId=u'yasearch@yandex.ru', sum(count)=45145),\n",
" Row(addonId=u'abs@avira.com', sum(count)=32225),\n",
" Row(addonId=u'firebug@software.joehewitt.com', sum(count)=31513),\n",
" Row(addonId=u'mozilla_cc2@internetdownloadmanager.com', sum(count)=29924),\n",
" Row(addonId=u'sovetnik@metabar.ru', sum(count)=27538),\n",
" Row(addonId=u'light_plugin_D772DC8D6FAF43A29B25C4EBAA5AD1DE@kaspersky.com', sum(count)=23662),\n",
" Row(addonId=u'sp@avast.com', sum(count)=22689),\n",
" Row(addonId=u'firefox@mega.co.nz', sum(count)=19712),\n",
" Row(addonId=u'{a38384b3-2d1d-4f36-bc22-0f7ae402bcd7}', sum(count)=19419),\n",
" Row(addonId=u'adblockpopups@jessehakanen.net', sum(count)=17086),\n",
" Row(addonId=u'{DDC359D1-844A-42a7-9AA1-88A850A938A8}', sum(count)=17049),\n",
" Row(addonId=u'virtual_keyboard_07402848C2F6470194F131B0F3DE025E@kaspersky.com', sum(count)=17031),\n",
" Row(addonId=u'content_blocker_663BE84DBCC949E88C7600F63CA7F098@kaspersky.com', sum(count)=17031),\n",
" Row(addonId=u'homepage@mail.ru', sum(count)=16348),\n",
" Row(addonId=u'search@mail.ru', sum(count)=16234),\n",
" Row(addonId=u'firefox@ghostery.com', sum(count)=16069),\n",
" Row(addonId=u'{635abd67-4fe9-1b23-4f01-e679fa7484c1}', sum(count)=16062),\n",
" Row(addonId=u'{b9bfaf1c-a63f-47cd-8b9a-29526ced9060}', sum(count)=15799),\n",
" Row(addonId=u'avg@toolbar', sum(count)=15772),\n",
" Row(addonId=u'helper-sig@savefrom.net', sum(count)=15236),\n",
" Row(addonId=u'{73a6fe31-595d-460b-a920-fcc0f8843232}', sum(count)=15010),\n",
" Row(addonId=u'artur.dubovoy@gmail.com', sum(count)=14592),\n",
" Row(addonId=u'adbhelper@mozilla.org', sum(count)=14551),\n",
" Row(addonId=u'{e4a8a97b-f2ed-450b-b12d-ee082ba24781}', sum(count)=14384),\n",
" Row(addonId=u'{a0d7ccb3-214d-498b-b4aa-0e8fda9a7bf7}', sum(count)=14367),\n",
" Row(addonId=u'{C1A2A613-35F1-4FCF-B27F-2840527B6556}', sum(count)=14229),\n",
" Row(addonId=u'online_banking_08806E753BE44495B44E90AA2513BDC5@kaspersky.com', sum(count)=13861),\n",
" Row(addonId=u'fxdevtools-adapters@mozilla.org', sum(count)=12435),\n",
" Row(addonId=u'content_blocker@kaspersky.com', sum(count)=12178),\n",
" Row(addonId=u'feca4b87-3be4-43da-a1b1-137c24220968@jetpack', sum(count)=11238),\n",
" Row(addonId=u'support@lastpass.com', sum(count)=10968),\n",
" Row(addonId=u'translator@zoli.bod', sum(count)=10798),\n",
" Row(addonId=u'{19503e42-ca3c-4c27-b1e2-9cdb2170ee34}', sum(count)=10744),\n",
" Row(addonId=u'YoutubeDownloader@PeterOlayev.com', sum(count)=10697),\n",
" Row(addonId=u'_65Members_@download.fromdoctopdf.com', sum(count)=10374),\n",
" Row(addonId=u'{dc572301-7619-498c-a57d-39143191b318}', sum(count)=10307),\n",
" Row(addonId=u'elemhidehelper@adblockplus.org', sum(count)=9806),\n",
" Row(addonId=u'LVD-SAE@iacsearchandmedia.com', sum(count)=9500),\n",
" Row(addonId=u'{bee6eb20-01e0-ebd1-da83-080329fb9a3a}', sum(count)=9308),\n",
" Row(addonId=u'testpilot@labs.mozilla.com', sum(count)=9104),\n",
" Row(addonId=u'anti_banner@kaspersky.com', sum(count)=9048),\n",
" Row(addonId=u'jid1-YcMV6ngYmQRA2w@jetpack', sum(count)=8531),\n",
" Row(addonId=u'virtual_keyboard@kaspersky.com', sum(count)=8468),\n",
" Row(addonId=u'info@youtube-mp3.org', sum(count)=8316),\n",
" Row(addonId=u'online_banking@kaspersky.com', sum(count)=8304),\n",
" Row(addonId=u'url_advisor@kaspersky.com', sum(count)=8062),\n",
" Row(addonId=u'jid1-F9UJ2thwoAm5gQ@jetpack', sum(count)=7749),\n",
" Row(addonId=u'{a7c6cf7f-112c-4500-a7ea-39801a327e5f}', sum(count)=7551),\n",
" Row(addonId=u'client@anonymox.net', sum(count)=7534),\n",
" Row(addonId=u'{195A3098-0BD5-4e90-AE22-BA1C540AFD1E}', sum(count)=7513),\n",
" Row(addonId=u'firefoxdav@icloud.com', sum(count)=7452),\n",
" Row(addonId=u'{1018e4d6-728f-4b20-ad56-37578a4de76b}', sum(count)=7366),\n",
" Row(addonId=u'abb@amazon.com', sum(count)=7199),\n",
" Row(addonId=u'uBlock0@raymondhill.net', sum(count)=7165),\n",
" Row(addonId=u'deskCutv2@gmail.com', sum(count)=7134),\n",
" Row(addonId=u'bingsearch.full@microsoft.com', sum(count)=7129),\n",
" Row(addonId=u'web2pdfextension@web2pdf.adobedotcom', sum(count)=7089),\n",
" Row(addonId=u'mailcheck@web.de', sum(count)=6914),\n",
" Row(addonId=u'{fe272bd1-5f76-4ea4-8501-a05d35d823fc}', sum(count)=6755),\n",
" Row(addonId=u'ols@f-secure.com', sum(count)=6667),\n",
" Row(addonId=u'cliqz@cliqz.com', sum(count)=6550),\n",
" Row(addonId=u'{c45c406e-ab73-11d8-be73-000a95be3b12}', sum(count)=6459),\n",
" Row(addonId=u'yahooprotected@gmail.com', sum(count)=6428),\n",
" Row(addonId=u'quicklaunch@mozillaonline.com', sum(count)=6426),\n",
" Row(addonId=u'_dzMembers_@www.pconverter.com', sum(count)=6385),\n",
" Row(addonId=u'anttoolbar@ant.com', sum(count)=6318),\n",
" Row(addonId=u'WebProtection@360safe.com', sum(count)=5915),\n",
" Row(addonId=u'_4zMembers_@www.videodownloadconverter.com', sum(count)=5889),\n",
" Row(addonId=u'{E0B8C461-F8FB-49b4-8373-FE32E9252800}', sum(count)=5784),\n",
" Row(addonId=u'_39Members_@www.mapsgalaxy.com', sum(count)=5772),\n",
" Row(addonId=u'jid1-HAV2inXAnQPIeA@jetpack', sum(count)=5692),\n",
" Row(addonId=u'{d40f5e7b-d2cf-4856-b441-cc613eeffbe3}', sum(count)=5532),\n",
" Row(addonId=u'safesearch@avira.com', sum(count)=5504),\n",
" Row(addonId=u'virtual_keyboard_294FF26A1D5B455495946778FDE7CEDB@kaspersky.com', sum(count)=5404),\n",
" Row(addonId=u'content_blocker_6418E0D362104DADA084DC312DFA8ABC@kaspersky.com', sum(count)=5403),\n",
" Row(addonId=u'{37964A3C-4EE8-47b1-8321-34DE2C39BA4D}', sum(count)=5357),\n",
" Row(addonId=u'{5384767E-00D9-40E9-B72F-9CC39D655D6F}', sum(count)=5221),\n",
" Row(addonId=u'{1BC9BA34-1EED-42ca-A505-6D2F1A935BBB}', sum(count)=5194),\n",
" Row(addonId=u'{b9acf540-acba-11e1-8ccb-001fd0e08bd4}', sum(count)=5010),\n",
" Row(addonId=u'personas@christopher.beard', sum(count)=4989),\n",
" Row(addonId=u'firefox@zenmate.com', sum(count)=4736),\n",
" Row(addonId=u'default_newtabff@gmail.com', sum(count)=4712),\n",
" Row(addonId=u's3google@translator', sum(count)=4704),\n",
" Row(addonId=u'{3d7eb24f-2740-49df-8937-200b1cc08f8a}', sum(count)=4675),\n",
" Row(addonId=u'{64161300-e22b-11db-8314-0800200c9a66}', sum(count)=4670),\n",
" Row(addonId=u'jid1-4P0kohSJxU1qGg@jetpack', sum(count)=4628),\n",
" Row(addonId=u'{9AA46F4F-4DC7-4c06-97AF-5035170634FE}', sum(count)=4476),\n",
" Row(addonId=u'{20a82645-c095-46ed-80e3-08825760534b}', sum(count)=4448),\n",
" Row(addonId=u'zotero@chnm.gmu.edu', sum(count)=4385),\n",
" Row(addonId=u'2020Player_IKEA@2020Technologies.com', sum(count)=4374),\n",
" Row(addonId=u'firegestures@xuldev.org', sum(count)=4337),\n",
" Row(addonId=u'searchme@mybrowserbar.com', sum(count)=4322),\n",
" Row(addonId=u'foxmarks@kei.com', sum(count)=4305),\n",
" Row(addonId=u'_gcMembers_@www.weatherblink.com', sum(count)=4220),\n",
" Row(addonId=u'netvideohunter@netvideohunter.com', sum(count)=4213),\n",
" Row(addonId=u'MUB-SAE@iacsearchandmedia.com', sum(count)=4175),\n",
" Row(addonId=u'{f3bd3dd2-2888-44c5-91a2-2caeb33fb898}', sum(count)=4163),\n",
" Row(addonId=u'browser-mailcheck@web.de', sum(count)=4100),\n",
" Row(addonId=u'share_all_cn@mozillaonline.com', sum(count)=4082),\n",
" Row(addonId=u'online_banking_69A4E213815F42BD863D889007201D82@kaspersky.com', sum(count)=4067),\n",
" Row(addonId=u'vk@sergeykolosov.mp', sum(count)=3992),\n",
" Row(addonId=u'{E6C1199F-E687-42da-8C24-E7770CC3AE66}', sum(count)=3899),\n",
" Row(addonId=u'{81BF1D23-5F17-408D-AC6B-BD6DF7CAF670}', sum(count)=3831),\n",
" Row(addonId=u'jid1-q4sG8pYhq8KGHs@jetpack', sum(count)=3778),\n",
" Row(addonId=u'mg.mail.yahoo.com@services.mozilla.org', sum(count)=3732),\n",
" Row(addonId=u'homeutil@yandex.ru', sum(count)=3731),\n",
" Row(addonId=u'{6AC85730-7D0F-4de0-B3FA-21142DD85326}', sum(count)=3716),\n",
" Row(addonId=u'{46551EC9-40F0-4e47-8E18-8E5CF550CFB8}', sum(count)=3650),\n",
" Row(addonId=u'{e968fc70-8f95-4ab9-9e79-304de2a71ee1}', sum(count)=3637),\n",
" Row(addonId=u'ich@maltegoetz.de', sum(count)=3632),\n",
" Row(addonId=u'ClassicThemeRestorer@ArisT2Noia4dev', sum(count)=3631),\n",
" Row(addonId=u'_e5Members_@www.productivityboss.com', sum(count)=3609),\n",
" Row(addonId=u'{0545b830-f0aa-4d7e-8820-50a4629a56fe}', sum(count)=3572),\n",
" Row(addonId=u'caa1-aDOiCAxFFMOVIX@jetpack', sum(count)=3561),\n",
" Row(addonId=u'_gtMembers_@free.gamingwonderland.com', sum(count)=3513),\n",
" Row(addonId=u'ffext_basicvideoext@startpage24', sum(count)=3485),\n",
" Row(addonId=u'{0b457cAA-602d-484a-8fe7-c1d894a011ba}', sum(count)=3473),\n",
" Row(addonId=u'_64Members_@download.televisionfanatic.com', sum(count)=3461),\n",
" Row(addonId=u'xthunder@lshai.com', sum(count)=3385),\n",
" Row(addonId=u'{1B33E42F-EF14-4cd3-B6DC-174571C4349C}', sum(count)=3344),\n",
" Row(addonId=u'leethax@leethax.net', sum(count)=3333),\n",
" Row(addonId=u'{7b1bf0b6-a1b9-42b0-b75d-252036438bdc}', sum(count)=3107),\n",
" Row(addonId=u'jid1-xUfzOsOFlzSOXg@jetpack', sum(count)=3084),\n",
" Row(addonId=u'youtubeunblocker@unblocker.yt', sum(count)=3075),\n",
" Row(addonId=u'_8hMembers_@download.allin1convert.com', sum(count)=3071),\n",
" Row(addonId=u'jetpack-extension@dashlane.com', sum(count)=3001),\n",
" Row(addonId=u'mailcheck@gmx.net', sum(count)=2929),\n",
" Row(addonId=u'_9tMembers_@download.internetspeedtracker.com', sum(count)=2918),\n",
" Row(addonId=u'_dqMembers_@www.downspeedtest.com', sum(count)=2821),\n",
" Row(addonId=u'youdao-translate@mozillaonline.com', sum(count)=2806),\n",
" Row(addonId=u'foxyproxy@eric.h.jung', sum(count)=2805),\n",
" Row(addonId=u'{77b819fa-95ad-4f2c-ac7c-486b356188a9}', sum(count)=2791),\n",
" Row(addonId=u'{6c28e999-e900-4635-a39d-b1ec90ba0c0f}', sum(count)=2772),\n",
" Row(addonId=u'_9pMembers_@free.onlinemapfinder.com', sum(count)=2735),\n",
" Row(addonId=u'FGZ-SAE@iacsearchandmedia.com', sum(count)=2712),\n",
" Row(addonId=u'jid0-GXjLLfbCoAx0LcltEdFrEkQdQPI@jetpack', sum(count)=2683),\n",
" Row(addonId=u'browser-mailcheck@gmx.net', sum(count)=2642),\n",
" Row(addonId=u'{BBB77B49-9FF4-4d5c-8FE2-92B1D6CD696C}', sum(count)=2603),\n",
" Row(addonId=u'donottrackplus@abine.com', sum(count)=2587),\n",
" Row(addonId=u'e67f8350-7edf-11e3-baa7-0800200c9a66@fri-gate.org', sum(count)=2580),\n",
" Row(addonId=u'{1280606b-2510-4fe0-97ef-9b5a22eafe30}', sum(count)=2535),\n",
" Row(addonId=u'SQLiteManager@mrinalkant.blogspot.com', sum(count)=2522),\n",
" Row(addonId=u'onepassword4@agilebits.com', sum(count)=2430),\n",
" Row(addonId=u'2.0@disconnect.me', sum(count)=2401),\n",
" Row(addonId=u'{097d3191-e6fa-4728-9826-b533d755359d}', sum(count)=2392),\n",
" Row(addonId=u'{7f57cf46-4467-4c2d-adfa-0cba7c507e54}', sum(count)=2354),\n",
" Row(addonId=u'{6d96bb5e-1175-4ebf-8ab5-5f56f1c79f65}', sum(count)=2347),\n",
" Row(addonId=u'helper@savefrom.net', sum(count)=2328),\n",
" Row(addonId=u'zoteroWinWordIntegration@zotero.org', sum(count)=2322),\n",
" Row(addonId=u'{22119944-ED35-4ab1-910B-E619EA06A115}', sum(count)=2317),\n",
" Row(addonId=u'{cd617375-6743-4ee8-bac4-fbf10f35729e}', sum(count)=2252),\n",
" Row(addonId=u'pavel.sherbakov@gmail.com', sum(count)=2244),\n",
" Row(addonId=u'{87F8774F-B485-47E2-A755-A40A8A5E886D}', sum(count)=2238),\n",
" Row(addonId=u'{2b10c1c8-a11f-4bad-fe9c-1c11e82cac42}', sum(count)=2232),\n",
" Row(addonId=u'tabscope@xuldev.org', sum(count)=2198),\n",
" Row(addonId=u'paulsaintuzb@gmail.com', sum(count)=2178),\n",
" Row(addonId=u'https-everywhere-eff@eff.org', sum(count)=2118),\n",
" Row(addonId=u'tmbepff@trendmicro.com', sum(count)=2091),\n",
" Row(addonId=u'youtubemp3podcaster@jeremy.d.gregorio.com', sum(count)=2070),\n",
" Row(addonId=u'plus.google.com@services.mozilla.org', sum(count)=2034),\n",
" Row(addonId=u'adguardadblocker@adguard.com', sum(count)=2028),\n",
" Row(addonId=u'amznUWL2@amazon.com', sum(count)=2003),\n",
" Row(addonId=u'defsearchp@gmail.com', sum(count)=2000),\n",
" Row(addonId=u'{75CEEE46-9B64-46f8-94BF-54012DE155F0}', sum(count)=1950),\n",
" Row(addonId=u'belgiumeid@eid.belgium.be', sum(count)=1944),\n",
" Row(addonId=u'{B17C1C5A-04B1-11DB-9804-B622A1EF5492}', sum(count)=1937),\n",
" Row(addonId=u'{C7AE725D-FA5C-4027-BB4C-787EF9F8248A}', sum(count)=1927),\n",
" Row(addonId=u'_paMembers_@www.filmfanatic.com', sum(count)=1926),\n",
" Row(addonId=u'gmailnoads@mywebber.com', sum(count)=1887),\n",
" Row(addonId=u'vdpure@link64', sum(count)=1869),\n",
" Row(addonId=u'{66E978CD-981F-47DF-AC42-E3CF417C1467}', sum(count)=1839),\n",
" Row(addonId=u'webrootsecure@webroot.com', sum(count)=1833),\n",
" Row(addonId=u'{068e178c-61a9-4a63-b74f-87404a6f5ea1}', sum(count)=1830),\n",
" Row(addonId=u'{dd3d7613-0246-469d-bc65-2a3cc1668adc}', sum(count)=1810),\n",
" Row(addonId=u'nosquint@urandom.ca', sum(count)=1801),\n",
" Row(addonId=u'{ea614400-e918-4741-9a97-7a972ff7c30b}', sum(count)=1753),\n",
" Row(addonId=u'{22181a4d-af90-4ca3-a569-faed9118d6bc}', sum(count)=1738),\n",
" Row(addonId=u'jqs@sun.com', sum(count)=1736),\n",
" Row(addonId=u'fastdial@telega.phpnet.us', sum(count)=1714),\n",
" Row(addonId=u'{02450914-cdd9-410f-b1da-db004e18c671}', sum(count)=1704),\n",
" Row(addonId=u'personas@mozillaonline.com', sum(count)=1695),\n",
" Row(addonId=u'{170503FA-3349-4F17-BC86-001888A5C8E2}', sum(count)=1689),\n",
" Row(addonId=u'jid1-P34HaABBBpOerQ@jetpack', sum(count)=1686),\n",
" Row(addonId=u'{8f8fe09b-0bd3-4470-bc1b-8cad42b8203a}', sum(count)=1679),\n",
" Row(addonId=u'{ad0d925d-88f8-47f1-85ea-8463569e756e}', sum(count)=1678),\n",
" Row(addonId=u'toolbar@shopathome.com', sum(count)=1674)]"
]
},
"execution_count": 67,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"top_addons_by_id"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import boto3\n",
"from gzip import GzipFile\n",
"from cStringIO import StringIO\n",
"import csv\n",
"\n",
"class S3CompressedWriter(object):\n",
" def __init__(self, bucket, path, mimetype='text/plain'):\n",
" self.bucket = bucket\n",
" self.path = path\n",
" self.mimetype = mimetype\n",
" self._buffer = None\n",
"\n",
" def __enter__(self):\n",
" self._buffer = StringIO();\n",
" self._writer = GzipFile(mode=\"wb\", fileobj=self._buffer)\n",
" return self._writer\n",
"\n",
" def __exit__(self, exc_type, exc_value, traceback):\n",
" if exc_value is None:\n",
" self._writer.close()\n",
" self._buffer.seek(0)\n",
" s3 = boto3.resource('s3')\n",
" s3.Object(self.bucket, self.path).put(Body=self._buffer, ContentEncoding='gzip', ContentType=self.mimetype)\n",
" self._buffer = None\n",
"\n",
" def __del__(self):\n",
" assert self._buffer is None\n"
]
},
{
"cell_type": "code",
"execution_count": 75,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"with S3CompressedWriter('telemetry-public-analysis-2', 'bsmedberg/20160212-top-addons.csv') as fd:\n",
" csvw = csv.writer(fd)\n",
" for row in top_addons_by_id:\n",
" csvw.writerow(row)\n"
]
},
{
"cell_type": "code",
"execution_count": 74,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"322865"
]
},
"execution_count": 74,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"top_addons_by_id[0][1]"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.10"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment