Skip to content

Instantly share code, notes, and snippets.

@bsmedberg
Last active August 18, 2017 18:14
Show Gist options
  • Save bsmedberg/8404d60cf6bd906693235d62f34e9b0c to your computer and use it in GitHub Desktop.
Save bsmedberg/8404d60cf6bd906693235d62f34e9b0c to your computer and use it in GitHub Desktop.
Build Reversion
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Downgrade and Channel Switching Analysis\n",
"========================================\n",
"\n",
"authors:\n",
"- bsmedberg\n",
"\n",
"last run: 2017-03-08\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Longitudinal Dataset Tutorial"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Populating the interactive namespace from numpy and matplotlib\n"
]
}
],
"source": [
"import matplotlib.pyplot as plt\n",
"import pandas as pd\n",
"import numpy as np\n",
"import plotly.plotly as py\n",
"\n",
"%pylab inline"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"192"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sc.defaultParallelism"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The longitudinal dataset can be accessed as a Spark [DataFrame](https://spark.apache.org/docs/1.5.2/api/python/pyspark.sql.html#pyspark.sql.DataFrame), which is a distributed collection of data organized into named columns. It is conceptually equivalent to a table in a relational database or a data frame in R/Python."
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"frame = sqlContext.sql(\"SELECT client_id, profile_subsession_counter, build, settings FROM longitudinal\")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"<pyspark.context.SparkContext at 0x7fcf2bfce890>"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sc"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def get_version(v):\n",
" if v is None:\n",
" return None\n",
" try:\n",
" return int(v.split(\".\")[0])\n",
" except ValueError:\n",
" return None"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"None\n",
"12\n",
"12\n",
"None\n"
]
}
],
"source": [
"print get_version(\"\")\n",
"print get_version(\"12.3\")\n",
"print get_version(\"12\")\n",
"print get_version(\"abcd\")"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"users that switched channels at all: 0.48%\n",
"users that reverted to an older version: 2.55%\n",
"users that reverted to an older version, staying on the release channel: 2.19\n"
]
}
],
"source": [
"count_total = sc.accumulator(0)\n",
"count_backwardsversion = sc.accumulator(0)\n",
"count_channelswitch = sc.accumulator(0)\n",
"count_backwardsversion_releaseonly = sc.accumulator(0)\n",
"\n",
"def mapper(row):\n",
" channel_switch = False\n",
" release_only = True\n",
" backwardsversion = False\n",
"\n",
" # sessions are sorted by subsessionStartDate and then profileSubsessionCounter, newest-first.\n",
" last_version = 99\n",
" last_channel = None\n",
"\n",
" if row.settings is None:\n",
" return\n",
" \n",
" for settings in row.settings:\n",
" channel = settings.update.channel\n",
" if channel != \"release\":\n",
" release_only = False\n",
" if last_channel is None:\n",
" last_channel = channel\n",
" elif last_channel != channel:\n",
" channel_switch = True\n",
"\n",
" for build in row.build:\n",
" version = get_version(build.version)\n",
" if version is not None:\n",
" if version > last_version:\n",
" backwardsversion = True\n",
" last_version = version\n",
"\n",
" count_total.add(1)\n",
" if backwardsversion:\n",
" count_backwardsversion.add(1)\n",
" if release_only:\n",
" count_backwardsversion_releaseonly.add(1)\n",
" if channel_switch:\n",
" count_channelswitch.add(1)\n",
"frame.rdd.foreach(mapper)\n",
"\n",
"total = float(count_total.value)\n",
"print \"users that switched channels at all: {:.2f}%\".format(count_channelswitch.value / total * 100)\n",
"print \"users that reverted to an older version: {:.2f}%\".format(count_backwardsversion.value / total * 100)\n",
"print \"users that reverted to an older version, staying on the release channel: {:.2f}%\".format(count_backwardsversion_releaseonly.value / total * 100)\n"
]
}
],
"metadata": {
"anaconda-cloud": {},
"kernelspec": {
"display_name": "Python [default]",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.12"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
# coding: utf-8
# Downgrade and Channel Switching Analysis
# ========================================
#
# authors:
# - bsmedberg
#
# last run: 2017-03-08
#
# ### Longitudinal Dataset Tutorial
# In[1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import plotly.plotly as py
# Notebook-export artifact of "%pylab inline": pulls numpy/matplotlib names
# into the interactive namespace (shadows the explicit imports above).
get_ipython().magic(u'pylab inline')
# In[2]:
# Number of cores/partitions Spark uses by default on this cluster.
sc.defaultParallelism
# The longitudinal dataset can be accessed as a Spark [DataFrame](https://spark.apache.org/docs/1.5.2/api/python/pyspark.sql.html#pyspark.sql.DataFrame), which is a distributed collection of data organized into named columns. It is conceptually equivalent to a table in a relational database or a data frame in R/Python.
# In[3]:
# Select only the columns the analysis below reads (client id, session
# ordering counter, per-session build info and settings).
frame = sqlContext.sql("SELECT client_id, profile_subsession_counter, build, settings FROM longitudinal")
# In[4]:
# Bare expression: in the notebook this displayed the SparkContext repr
# as a sanity check that the kernel is attached to a cluster.
sc
# In[10]:
def get_version(v):
    """Extract the major version number from a version string.

    Takes the text before the first ".", and returns it as an int.
    Returns None when ``v`` is None or when that leading component is
    not a valid integer (e.g. "" or "abcd").
    """
    if v is None:
        return None
    major = v.split(".", 1)[0]
    try:
        return int(major)
    except ValueError:
        return None
# In[6]:
# Smoke-test get_version against the edge cases: empty string, dotted
# version, bare major version, and non-numeric input.
# Expected: None, 12, 12, None (matches the recorded notebook output).
print get_version("")
print get_version("12.3")
print get_version("12")
print get_version("abcd")
# In[11]:
count_total = sc.accumulator(0)
count_backwardsversion = sc.accumulator(0)
count_channelswitch = sc.accumulator(0)
count_backwardsversion_releaseonly = sc.accumulator(0)
def mapper(row):
channel_switch = False
release_only = True
backwardsversion = False
# sessions are sorted by subsessionStartDate and then profileSubsessionCounter, newest-first.
last_version = 99
last_channel = None
if row.settings is None:
return
for settings in row.settings:
channel = settings.update.channel
if channel != "release":
release_only = False
if last_channel is None:
last_channel = channel
elif last_channel != channel:
channel_switch = True
for build in row.build:
version = get_version(build.version)
if version is not None:
if version > last_version:
backwardsversion = True
last_version = version
count_total.add(1)
if backwardsversion:
count_backwardsversion.add(1)
if release_only:
count_backwardsversion_releaseonly.add(1)
if channel_switch:
count_channelswitch.add(1)
frame.rdd.foreach(mapper)
total = float(count_total.value)
print "users that switched channels at all: {:.2f}%".format(count_channelswitch.value / total * 100)
print "users that reverted to an older version: {:.2f}%".format(count_backwardsversion.value / total * 100)
print "users that reverted to an older version, staying on the release channel: {:.2f}".format(count_backwardsversion_releaseonly.value / total * 100)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment