Last active
May 12, 2016 02:00
-
-
Save bsmedberg/e2f728893ee78e509630c1b4b71b1bb4 to your computer and use it in GitHub Desktop.
Upgrade Progression
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: utf-8

# ### Longitudinal Dataset Tutorial

# In[1]:
import pandas as pd
import numpy as np
import plotly.plotly as py

# Exported form of the `%pylab inline` notebook magic.  run_line_magic is the
# supported entry point (the older `magic("name args")` wrapper is deprecated)
# and matches the run_cell_magic call used further down in this file.
get_ipython().run_line_magic(u'pylab', u'inline')
# In[2]:
# Show the default parallelism level of the Spark context.
sc.defaultParallelism

# In[4]:
# Select only the three columns of the `longitudinal` table that the upgrade
# analysis below reads.
frame = sqlContext.sql(
    "SELECT subsession_start_date, build, settings FROM longitudinal"
)

# Number of profiles:
# In[5]:
frame.count()

# In[8]:
# Keep a small sample of rows around for interactive inspection.
few = frame.rdd.take(10)
# In[29]:
# Exported form of a `%%time` notebook cell: the cell source is passed to
# IPython as a string so that the whole run is timed.  Everything inside the
# string is ordinary Python executed in the notebook namespace.
get_ipython().run_cell_magic(u'time', u'', u"""
from datetime import datetime, date

# Spark accumulators used as diagnostic counters by map_upgrades().
total_records = sc.accumulator(0)   # every row handed to map_upgrades
none_records = sc.accumulator(0)    # rows with a missing column
non_release = sc.accumulator(0)     # rows with a non-'release' channel entry
value_error = sc.accumulator(0)     # rows whose version/date failed to parse

def map_upgrades(row):
    '''Yield a (date, version) pair for each upgrade seen in one row.

    `row` must expose `settings`, `subsession_start_date` and `build`.
    Each build's dotted `version` string is parsed into a tuple of ints and
    each start date is parsed from its first ten characters (YYYY-MM-DD).
    A pair is yielded every time a session runs a version higher than any
    version seen earlier in the (reversed) history.

    Rows are dropped, and the matching accumulator is bumped, when a column
    is None, when any settings entry is not on the 'release' channel, or
    when a version or date cannot be parsed.
    '''
    total_records.add(1)

    if row.settings is None or row.subsession_start_date is None or row.build is None:
        none_records.add(1)
        return

    # ignore people who aren't release-only or have really weird data
    for s in row.settings:
        if s.update.channel != 'release':
            non_release.add(1)
            return

    try:
        versions = [tuple(map(int, b.version.split('.'))) for b in row.build]
        dates = [datetime.strptime(ssd[:10], '%Y-%m-%d').date()
                 for ssd in row.subsession_start_date]
    except (ValueError, AttributeError):
        value_error.add(1)
        return

    # zip() is a one-shot iterator on Python 3, so build a list before
    # reversing.  Presumably the arrays are stored newest-first and the
    # reverse puts them in chronological order -- TODO confirm.
    version_dates = list(zip(versions, dates))
    version_dates.reverse()

    # An empty history has nothing to report; without this guard the
    # version_dates[0] lookup below raises IndexError and fails the job.
    if not version_dates:
        return

    # Original note: if the first session we see isn't running up against
    # the 6-month window, don't record the first session as an upgrade.
    # NOTE(review): as coded, the first session is used as a silent baseline
    # only when it is dated before 2016-01-01; otherwise it is itself
    # yielded.  Confirm that this matches the intent of the note above.
    last_version = None
    version, ssd = version_dates[0]
    if ssd < date(2016, 1, 1):
        last_version = version

    for version, ssd in version_dates:
        # Explicit None test: ordering a tuple against None only works on
        # Python 2, where it is always true.
        if last_version is None or version > last_version:
            yield ssd, version
            last_version = version

r = frame.flatMap(map_upgrades).countByValue()""")
# In[33]:
# Print the diagnostic counters filled in by map_upgrades, one per line and in
# this order: rows seen, rows with a missing column, rows with a non-release
# channel, rows whose version/date failed to parse.
# The single-argument call form prints the same on Python 2 and Python 3.
for counter in (total_records, none_records, non_release, value_error):
    print(counter.value)
# In[44]:
from datetime import date, datetime

# Per-day upgrade counts as (date, count) pairs, sorted by date, restricted to
# days strictly between 2016-03-01 and 2016-05-11.
# NOTE: despite the "46_0_1" in the name, the filter below selects version
# (45, 0, 1).  The name is kept because the plotting cells refer to it.
updates_46_0_1 = sorted(
    ((day, count)
     for (day, version), count in r.items()
     if version == (45, 0, 1)
     and date(2016, 3, 1) < day < date(2016, 5, 11)),
    # Index instead of a tuple-parameter lambda, which Python 3 removed.
    key=lambda pair: pair[0]
)
# In[45]:
import plotly.plotly as py
import plotly.graph_objs as go

# Bar chart of upgrades per day: dates on the x axis, counts on the y axis.
# (updates_46_0_1 holds the counts for version 45.0.1, despite its name.)
upgrade_days = [day for day, _ in updates_46_0_1]
upgrade_counts = [count for _, count in updates_46_0_1]
bar_traces = [go.Bar(x=upgrade_days, y=upgrade_counts)]
py.iplot(bar_traces)
# In[71]:
# Line plot of the same per-day counts with matplotlib.
# `plt` is not imported in this file; presumably it comes from the
# `%pylab inline` magic run in the first cell.

# Build the index and the column from concrete lists rather than generators,
# so the DataFrame constructor always receives sized sequences.
upgrade_index = pd.to_datetime([day for day, _ in updates_46_0_1])
f = pd.DataFrame(
    [count for _, count in updates_46_0_1],
    columns=["c"],
    index=upgrade_index,
)

fig = plt.figure(figsize=(11, 4))
plt.title("Upgrades to 45.0.1 over time")
plt.ylabel("Upgrades per day")
plt.xlabel("Date of First 45.0.1 run")
plt.plot(f.index, f)
plt.show()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment