Skip to content

Instantly share code, notes, and snippets.

@bsmedberg
Last active August 18, 2017 18:14
Show Gist options
  • Save bsmedberg/8404d60cf6bd906693235d62f34e9b0c to your computer and use it in GitHub Desktop.
Save bsmedberg/8404d60cf6bd906693235d62f34e9b0c to your computer and use it in GitHub Desktop.
Build Reversion
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Downgrade and Channel Switching Analysis\n",
"========================================\n",
"\n",
"authors:\n",
"- bsmedberg\n",
"\n",
"last run: 2017-03-08\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Longitudinal Dataset Tutorial"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Populating the interactive namespace from numpy and matplotlib\n"
]
}
],
"source": [
"import matplotlib.pyplot as plt\n",
"import pandas as pd\n",
"import numpy as np\n",
"import plotly.plotly as py\n",
"\n",
"%pylab inline"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"192"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sc.defaultParallelism"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The longitudinal dataset can be accessed as a Spark [DataFrame](https://spark.apache.org/docs/1.5.2/api/python/pyspark.sql.html#pyspark.sql.DataFrame), which is a distributed collection of data organized into named columns. It is conceptually equivalent to a table in a relational database or a data frame in R/Python."
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"frame = sqlContext.sql(\"SELECT client_id, profile_subsession_counter, build, settings FROM longitudinal\")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"<pyspark.context.SparkContext at 0x7fcf2bfce890>"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sc"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def get_version(v):\n",
" if v is None:\n",
" return None\n",
" try:\n",
" return int(v.split(\".\")[0])\n",
" except ValueError:\n",
" return None"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"None\n",
"12\n",
"12\n",
"None\n"
]
}
],
"source": [
"print get_version(\"\")\n",
"print get_version(\"12.3\")\n",
"print get_version(\"12\")\n",
"print get_version(\"abcd\")"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"users that switched channels at all: 0.48%\n",
"users that reverted to an older version: 2.55%\n",
"users that reverted to an older version, staying on the release channel: 2.19\n"
]
}
],
"source": [
"count_total = sc.accumulator(0)\n",
"count_backwardsversion = sc.accumulator(0)\n",
"count_channelswitch = sc.accumulator(0)\n",
"count_backwardsversion_releaseonly = sc.accumulator(0)\n",
"\n",
"def mapper(row):\n",
" channel_switch = False\n",
" release_only = True\n",
" backwardsversion = False\n",
"\n",
" # sessions are sorted by subsessionStartDate and then profileSubsessionCounter, newest-first.\n",
" last_version = 99\n",
" last_channel = None\n",
"\n",
" if row.settings is None:\n",
" return\n",
" \n",
" for settings in row.settings:\n",
" channel = settings.update.channel\n",
" if channel != \"release\":\n",
" release_only = False\n",
" if last_channel is None:\n",
" last_channel = channel\n",
" elif last_channel != channel:\n",
" channel_switch = True\n",
"\n",
" for build in row.build:\n",
" version = get_version(build.version)\n",
" if version is not None:\n",
" if version > last_version:\n",
" backwardsversion = True\n",
" last_version = version\n",
"\n",
" count_total.add(1)\n",
" if backwardsversion:\n",
" count_backwardsversion.add(1)\n",
" if release_only:\n",
" count_backwardsversion_releaseonly.add(1)\n",
" if channel_switch:\n",
" count_channelswitch.add(1)\n",
"frame.rdd.foreach(mapper)\n",
"\n",
"total = float(count_total.value)\n",
"print \"users that switched channels at all: {:.2f}%\".format(count_channelswitch.value / total * 100)\n",
"print \"users that reverted to an older version: {:.2f}%\".format(count_backwardsversion.value / total * 100)\n",
"print \"users that reverted to an older version, staying on the release channel: {:.2f}%\".format(count_backwardsversion_releaseonly.value / total * 100)\n"
]
}
],
"metadata": {
"anaconda-cloud": {},
"kernelspec": {
"display_name": "Python [default]",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.12"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
# coding: utf-8
# Downgrade and Channel Switching Analysis
# ========================================
#
# authors:
# - bsmedberg
#
# last run: 2017-03-08
#
# ### Longitudinal Dataset Tutorial
# In[1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import plotly.plotly as py
# Notebook-export artifact of "%pylab inline": pulls numpy/matplotlib names
# into the interactive namespace (shadows the explicit imports above).
get_ipython().magic(u'pylab inline')
# In[2]:
# Number of cores/partitions Spark uses by default on this cluster.
sc.defaultParallelism
# The longitudinal dataset can be accessed as a Spark [DataFrame](https://spark.apache.org/docs/1.5.2/api/python/pyspark.sql.html#pyspark.sql.DataFrame), which is a distributed collection of data organized into named columns. It is conceptually equivalent to a table in a relational database or a data frame in R/Python.
# In[3]:
# Select only the columns the analysis below reads (client id, session
# ordering counter, per-session build info and settings).
frame = sqlContext.sql("SELECT client_id, profile_subsession_counter, build, settings FROM longitudinal")
# In[4]:
# Bare expression: in the notebook this displayed the SparkContext repr
# as a sanity check that the kernel is attached to a cluster.
sc
# In[10]:
def get_version(v):
    """Extract the major version number from a version string.

    Takes the text before the first ".", and returns it as an int.
    Returns None when ``v`` is None or when that leading component is
    not a valid integer (e.g. "" or "abcd").
    """
    if v is None:
        return None
    major = v.split(".", 1)[0]
    try:
        return int(major)
    except ValueError:
        return None
# In[6]:
# Smoke-test get_version against the edge cases: empty string, dotted
# version, bare major version, and non-numeric input.
# Expected: None, 12, 12, None (matches the recorded notebook output).
print get_version("")
print get_version("12.3")
print get_version("12")
print get_version("abcd")
# In[11]:
count_total = sc.accumulator(0)
count_backwardsversion = sc.accumulator(0)
count_channelswitch = sc.accumulator(0)
count_backwardsversion_releaseonly = sc.accumulator(0)
def mapper(row):
channel_switch = False
release_only = True
backwardsversion = False
# sessions are sorted by subsessionStartDate and then profileSubsessionCounter, newest-first.
last_version = 99
last_channel = None
if row.settings is None:
return
for settings in row.settings:
channel = settings.update.channel
if channel != "release":
release_only = False
if last_channel is None:
last_channel = channel
elif last_channel != channel:
channel_switch = True
for build in row.build:
version = get_version(build.version)
if version is not None:
if version > last_version:
backwardsversion = True
last_version = version
count_total.add(1)
if backwardsversion:
count_backwardsversion.add(1)
if release_only:
count_backwardsversion_releaseonly.add(1)
if channel_switch:
count_channelswitch.add(1)
frame.rdd.foreach(mapper)
total = float(count_total.value)
print "users that switched channels at all: {:.2f}%".format(count_channelswitch.value / total * 100)
print "users that reverted to an older version: {:.2f}%".format(count_backwardsversion.value / total * 100)
print "users that reverted to an older version, staying on the release channel: {:.2f}".format(count_backwardsversion_releaseonly.value / total * 100)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment