Skip to content

Instantly share code, notes, and snippets.

@georgf
Last active August 12, 2016 13:00
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save georgf/0ffe4f915861be180909037a7204d7b9 to your computer and use it in GitHub Desktop.
Save georgf/0ffe4f915861be180909037a7204d7b9 to your computer and use it in GitHub Desktop.
mobile repeated profile date
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Bug 1291265 - Check for repeated client counts in new_records in Fennec dashboard data"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/hadoop/anaconda2/lib/python2.7/site-packages/matplotlib/font_manager.py:273: UserWarning: Matplotlib is building the font cache using fc-list. This may take a moment.\n",
" warnings.warn('Matplotlib is building the font cache using fc-list. This may take a moment.')\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Populating the interactive namespace from numpy and matplotlib\n"
]
}
],
"source": [
"import matplotlib.pyplot as plt\n",
"import pandas as pd\n",
"import numpy as np\n",
"import plotly.plotly as py\n",
"\n",
"%pylab inline"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"160"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sc.defaultParallelism"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Load the mobile clients parquet file for performant analysis."
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"4119561698"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dataset = sqlContext.read.load(\"s3n://net-mozaws-prod-us-west-2-pipeline-analysis/mobile/mobile_clients\", \"parquet\")\n",
"dataset.count()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"Row(clientid=u'e8c84fb4-f260-47a9-823e-d06e3cdd1764', submissiondate=datetime.datetime(2016, 5, 16, 0, 0), creationdate=None, profiledate=datetime.datetime(2016, 3, 22, 0, 0), geocountry=u'JP', locale=u'ja-JA', os=u'Android', osversion=u'19', buildid=u'20160515030241', appversion=u'49.0a1', device=u'iNet-ADP-921', arch=u'armeabi-v7a', defaultsearch=u'google', distributionid=None, experiments=u'[\"offline-cache\",\"urlbar-show-origin-only\",\"bookmark-history-menu\",\"content-notifications-5pm\",\"urlbar-show-ev-cert-owner\",\"promote-add-to-homescreen\",\"search-term\",\"onboarding2-c\"]', channel=u'nightly', submission=u'20160516')"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dataset.rdd.first()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Filter out pings sent on d0"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"d0 = dataset.filter(\"channel = 'release'\")\\\n",
" .filter(\"os = 'Android'\")\\\n",
" .filter(\"submissiondate = profiledate\")"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"139685854"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"d0.count()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0.034"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"round(float(d0.count()) / dataset.count(), 3)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"Row(clientid=u'ea677620-b51b-40e7-86dc-f91881d6426b', submissiondate=datetime.datetime(2016, 6, 5, 0, 0), creationdate=None, profiledate=datetime.datetime(2016, 6, 5, 0, 0), geocountry=u'GR', locale=u'el-GR', os=u'Android', osversion=u'22', buildid=u'20160502161457', appversion=u'46.0.1', device=u'Sony-E5303', arch=u'armeabi-v7a', defaultsearch=None, distributionid=None, experiments=u'[\"bookmark-history-menu\"]', channel=u'release', submission=u'20160605')"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"d0.rdd.first()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Check for repeated d0 per client"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"First count on how many different days we saw clients submitting d0 pings."
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"d0counts = d0.groupBy(['clientid', 'submissiondate'])\\\n",
" .count()\\\n",
" .groupBy('clientid')\\\n",
" .count()"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[Row(clientid=u'27aa46a1-f1a8-4200-b08f-0b8a0b4f5c3e', count=1),\n",
" Row(clientid=u'28d21e91-8bea-4725-be72-3f599a2bcfbd', count=1),\n",
" Row(clientid=u'f53e3c2a-521e-46ab-b235-4b32f2c90238', count=1)]"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"d0counts.rdd.take(3)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now, how many of these submitted d0 pings on more than one day?"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"4"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"d0counts.filter(\"count > 1\")\\\n",
" .count()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Check for repeated profile dates"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Ok, that is really low, that does not seem to be a problem.\n",
"Following up from here, how many clients do actually submit more than one profiledate?"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Group clients profiledate submissions together."
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"repeatCounts = dataset.filter(\"channel = 'release'\")\\\n",
" .filter(\"os = 'Android'\")\\\n",
" .groupBy(['clientid', 'profiledate'])\\\n",
" .count()"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[Row(clientid=u'c5e9483e-83d5-4812-aaee-24cfcd29293f', profiledate=datetime.datetime(2016, 5, 18, 0, 0), count=808),\n",
" Row(clientid=u'934ba8d0-a183-44f8-acfc-701794e33a26', profiledate=datetime.datetime(2016, 4, 2, 0, 0), count=481),\n",
" Row(clientid=u'5ce8ab95-e9dc-40dc-a0b2-7927d15920a5', profiledate=datetime.datetime(2016, 6, 16, 0, 0), count=586)]"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"repeatCounts.rdd.take(3)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now check how many of them submitted more than one profiledate value."
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"6509"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"repeatCounts.groupBy('clientid')\\\n",
" .count()\\\n",
" .filter('count > 1')\\\n",
" .count()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.11"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
# coding: utf-8
# ### Bug 1291265 - Check for repeated client counts in new_records in Fennec dashboard data
# In[1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import plotly.plotly as py
get_ipython().magic(u'pylab inline')
# In[2]:
# NOTE: `sc` and `sqlContext` are not defined in this file -- presumably they
# are injected by the Spark notebook kernel; confirm before running elsewhere.
sc.defaultParallelism
# Load the mobile clients parquet file for performant analysis.
# In[3]:
dataset = sqlContext.read.load("s3n://net-mozaws-prod-us-west-2-pipeline-analysis/mobile/mobile_clients", "parquet")
# count() is a Spark action, so every call runs a new job over the data.
# Keep the total so the d0 ratio below can reuse it instead of counting again.
dataset_total = dataset.count()
dataset_total
# In[4]:
dataset.rdd.first()
# ### Keep only the pings sent on d0
# A ping counts as "d0" here when its submissiondate equals its profiledate.
# The release/Android subset is named once because the profile date check
# further down starts from exactly the same subset.
# In[6]:
release_android = (
    dataset
    .filter("channel = 'release'")
    .filter("os = 'Android'")
)
d0 = release_android.filter("submissiondate = profiledate")
# In[10]:
# Same reasoning as for dataset_total: count once, reuse for the ratio.
d0_total = d0.count()
d0_total
# Share of all pings in the dataset that are release/Android d0 pings.
# float() keeps this a true division under the Python 2 kernel.
# In[11]:
round(float(d0_total) / dataset_total, 3)
# In[25]:
d0.rdd.first()
# ### Check for repeated d0 per client
# First, count on how many different days each client submitted d0 pings.
# In[20]:
# The inner groupBy/count collapses the pings to one row per
# (clientid, submissiondate); the outer groupBy/count then counts those rows,
# i.e. the number of distinct d0 days, per client.
d0counts = (
    d0
    .groupBy(['clientid', 'submissiondate'])
    .count()
    .groupBy('clientid')
    .count()
)
# In[21]:
d0counts.rdd.take(3)
# Now, how many of these submitted d0 pings on more than one day?
# In[23]:
d0counts.filter("count > 1").count()
# ### Check for repeated profile dates
# Ok, the number of clients with d0 pings on more than one day is really low,
# so that does not seem to be a problem.
# Following up from here, how many clients actually submit more than one profiledate?
# Group each client's submissions by profiledate.
# In[30]:
# One row per (clientid, profiledate) pair, with the number of pings in it.
repeatCounts = (
    release_android
    .groupBy(['clientid', 'profiledate'])
    .count()
)
# In[31]:
repeatCounts.rdd.take(3)
# Now check how many clients submitted more than one profiledate value.
# In[32]:
# Rows per client in repeatCounts == distinct profiledate values for that client.
(
    repeatCounts
    .groupBy('clientid')
    .count()
    .filter('count > 1')
    .count()
)
# In[ ]:
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment