Skip to content

Instantly share code, notes, and snippets.

@bsmedberg
Last active May 12, 2016 02:00
Show Gist options
  • Save bsmedberg/e2f728893ee78e509630c1b4b71b1bb4 to your computer and use it in GitHub Desktop.
Save bsmedberg/e2f728893ee78e509630c1b4b71b1bb4 to your computer and use it in GitHub Desktop.
Upgrade Progression
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
# coding: utf-8
# ### Longitudinal Dataset Tutorial
# In[1]:
import pandas as pd
import numpy as np
import plotly.plotly as py
# Run the IPython `pylab inline` magic.
# NOTE(review): `plt` is used by the plotting cell at the end of this file
# without any explicit import; presumably this magic is what provides it --
# confirm before removing this line.
get_ipython().magic(u'pylab inline')
# In[2]:
# `sc` is not defined anywhere in this file; presumably it is the Spark
# context injected by the notebook kernel -- confirm.  This is a bare
# expression, so its value is only visible as notebook cell output.
sc.defaultParallelism
# In[4]:
# Select the three columns that map_upgrades() reads from each row
# (subsession_start_date, build, settings).  `sqlContext`, like `sc`, is not
# defined in this file.
frame = sqlContext.sql("SELECT subsession_start_date, build, settings FROM longitudinal")
# Number of profiles:
# In[5]:
# Bare expression: the count is not stored, so it is only visible as notebook
# cell output.
frame.count()
# In[8]:
# Pull the first ten rows to the driver.  `few` is not referenced again in
# this file; presumably it was only used for interactive inspection.
few = frame.rdd.take(10)
# In[29]:
# This cell used to be one escaped string handed to the `%%time` cell magic.
# It is written out as ordinary code so it can be read, linted and indented
# correctly; the timing report is kept by running the Spark action (the only
# expensive statement) through the `%time` line magic at the bottom.
from datetime import datetime, date
from itertools import imap

# Spark accumulators used as diagnostic tallies.  They are only populated once
# the action at the bottom of this cell has run.
# NOTE(review): per the Spark programming guide, accumulator updates made
# inside a transformation can be applied more than once if a task is
# re-executed, so treat these numbers as approximate.
total_records = sc.accumulator(0)   # every row handed to map_upgrades
none_records = sc.accumulator(0)    # rows skipped for missing/empty data
non_release = sc.accumulator(0)     # rows with a non-'release' update channel
value_error = sc.accumulator(0)     # rows whose versions/dates did not parse

# Histories whose earliest session predates this date have that first session
# treated as the baseline rather than as an upgrade (see map_upgrades).
HISTORY_CUTOFF = date(2016, 1, 1)


def map_upgrades(row):
    """Yield one ``(date, version)`` pair per version increase in a profile.

    Args:
        row: a row of ``frame`` exposing ``settings``,
            ``subsession_start_date`` and ``build``.  Each is iterated as a
            sequence; ``build`` entries need a dotted-integer ``version``
            string and ``settings`` entries need ``update.channel``.

    Yields:
        ``(session_date, version_tuple)`` for every session whose version is
        strictly greater than the highest version seen earlier in the
        (reversed) history.  ``version_tuple`` is a tuple of ints.

    Side effects:
        Increments ``total_records`` for every row and exactly one of
        ``none_records`` / ``non_release`` / ``value_error`` when the row is
        skipped.
    """
    total_records.add(1)

    if row.settings is None or row.subsession_start_date is None or row.build is None:
        none_records.add(1)
        return

    # ignore people who aren't release-only or have really weird data
    for s in row.settings:
        if s.update.channel != 'release':
            non_release.add(1)
            return

    try:
        versions = [tuple(imap(int, b.version.split('.'))) for b in row.build]
        # Only the leading YYYY-MM-DD part of the timestamp string is parsed.
        dates = [datetime.strptime(ssd[:10], '%Y-%m-%d').date()
                 for ssd in row.subsession_start_date]
    except (ValueError, AttributeError, TypeError):
        # TypeError covers a None entry in subsession_start_date (slicing
        # None); previously that escaped and failed the whole Spark task.
        value_error.add(1)
        return

    # zip() stops at the shorter list, so a length mismatch between the two
    # columns is silently truncated.
    version_dates = list(zip(versions, dates))
    if not version_dates:
        # Empty history: there is nothing to compare, and indexing [0] below
        # would raise IndexError.  Counted with the missing-data bucket.
        none_records.add(1)
        return

    # Presumably the source arrays are newest-first, so reversing makes the
    # walk below chronological -- confirm against the dataset documentation.
    version_dates.reverse()

    # If the first session we see isn't running up against the 6-month window
    # don't record the first session as an upgrade
    #
    # NOTE(review): as written, the first session is suppressed only when it
    # predates HISTORY_CUTOFF; for histories that start later the first
    # session IS yielded.  That reads as the opposite of the sentence above --
    # confirm which direction is intended.  Behaviour is left unchanged here.
    last_version = None
    version, ssd = version_dates[0]
    if ssd < HISTORY_CUTOFF:
        last_version = version

    for version, ssd in version_dates:
        # Explicit None check instead of relying on Python 2 ordering None
        # below every tuple.
        if last_version is None or version > last_version:
            yield ssd, version
            last_version = version


# Count identical (date, version) pairs across all profiles.  `r` maps each
# pair to the number of profiles that first ran that version on that date.
get_ipython().run_line_magic(u'time', u'r = frame.flatMap(map_upgrades).countByValue()')
# In[33]:
# Dump the diagnostic tallies, one value per line, in this order: rows seen,
# rows skipped for missing data, rows on a non-release channel, rows whose
# versions/dates failed to parse.
for tally in (total_records, none_records, non_release, value_error):
    print(tally.value)
# In[44]:
from datetime import date, datetime
from operator import itemgetter

# Daily number of upgrades to Firefox 45.0.1, as (date, count) pairs sorted by
# date.  `r` maps (date, version) -> count; only dates strictly between
# 2016-03-01 and 2016-05-11 are kept.
updates_45_0_1 = sorted(
    ((d, c) for ((d, v), c) in r.items()
     if v == (45, 0, 1) and date(2016, 3, 1) < d < date(2016, 5, 11)),
    key=itemgetter(0))
# Backward-compatible alias: this list was originally (mis)named after 46.0.1
# although the filter, plot title and axis label are all for 45.0.1.
updates_46_0_1 = updates_45_0_1
# In[45]:
import plotly.plotly as py
import plotly.graph_objs as go
# Split the pairs once so both charts below share the same x/y data.
days = [day for day, _ in updates_45_0_1]
counts = [count for _, count in updates_45_0_1]
traces = [
    go.Bar(
        x=days,
        y=counts
    )
]
py.iplot(traces)
# In[71]:
# Same data as a date-indexed frame with a single column "c".  Lists are
# passed rather than one-shot generators so the inputs can be re-read.
f = pd.DataFrame({"c": counts}, index=pd.to_datetime(days))
# NOTE(review): `plt` is not imported in this file; presumably it is provided
# by the `pylab inline` magic run at the top -- confirm.
fig = plt.figure(figsize=(11, 4))
plt.title("Upgrades to 45.0.1 over time")
plt.ylabel("Upgrades per day")
plt.xlabel("Date of First 45.0.1 run")
plt.plot(f.index, f["c"])
plt.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment