Created
March 8, 2017 23:06
-
-
Save bsmedberg/1af70857106bfe29128a0e8d0b656804 to your computer and use it in GitHub Desktop.
downgrade-analysis
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: utf-8
# Downgrade and Channel Switching Analysis
# ========================================
#
# authors:
# - bsmedberg
#
# last run: 2017-03-08
#
# ### Longitudinal Dataset Tutorial
# In[1]:
import matplotlib.pyplot as plt | |
import pandas as pd | |
import numpy as np | |
import plotly.plotly as py | |
get_ipython().magic(u'pylab inline') | |
# In[2]: | |
# Bare expression: a notebook cell echoes its value; run as a script it has
# no visible effect.
sc.defaultParallelism

# The longitudinal dataset can be accessed as a Spark [DataFrame](https://spark.apache.org/docs/1.5.2/api/python/pyspark.sql.html#pyspark.sql.DataFrame), which is a distributed collection of data organized into named columns. It is conceptually equivalent to a table in a relational database or a data frame in R/Python.
# In[3]:
# Only the columns the downgrade/channel analysis reads are selected.
frame = sqlContext.sql(
    "SELECT client_id, profile_subsession_counter, build, settings FROM longitudinal"
)

# In[4]:
# Bare expression again: echoes the SparkContext when run in a notebook.
sc
# In[10]: | |
def get_version(v):
    """Return the major version number parsed from a dotted version string.

    Only the text before the first "." is looked at.  ``None`` is returned
    when ``v`` is ``None`` or when that leading component cannot be
    converted by ``int()`` (for example ``""`` or ``"abcd"``).
    """
    if v is not None:
        major = v.partition(".")[0]
        try:
            return int(major)
        except ValueError:
            # Not a number: fall through to the shared "unknown" result.
            pass
    return None
# In[6]: | |
# Spot-check the parser on an empty string, a dotted version, a bare integer
# and non-numeric text.  A single parenthesised argument prints the same
# under the Python 2 print statement and the Python 3 print function.
for sample in ("", "12.3", "12", "abcd"):
    print(get_version(sample))
# In[11]: | |
# Counters incremented by mapper() and read back after the foreach pass.
# All four start at zero and are created in the order listed.
(
    count_total,
    count_backwardsversion,
    count_channelswitch,
    count_backwardsversion_releaseonly,
) = (sc.accumulator(0) for _ in range(4))
def mapper(row):
    """Classify one client's history and bump the matching accumulators.

    ``row`` must expose ``settings`` and ``build`` sequences.  Each settings
    entry is read through ``.update.channel`` and each build entry through
    ``.version``.

    Rows whose ``settings`` is ``None`` are skipped entirely and do not
    count towards ``count_total``.  For every other row:

    * ``count_total`` is incremented once;
    * ``count_channelswitch`` is incremented if any entry's update channel
      differs from the first one seen;
    * ``count_backwardsversion`` is incremented if the major version ever
      rises while walking ``row.build`` in order;
    * ``count_backwardsversion_releaseonly`` is additionally incremented
      when such a row only ever reported the "release" channel.

    Returns nothing; the result is carried by the accumulators.
    """
    if row.settings is None:
        return

    release_only = True
    channel_switch = False
    first_channel = None
    for settings in row.settings:
        channel = settings.update.channel
        if channel != "release":
            release_only = False
        if first_channel is None:
            first_channel = channel
        elif first_channel != channel:
            # Any value different from the first channel means that the
            # client changed channel at some point.
            channel_switch = True

    # Sessions are sorted by subsessionStartDate and then
    # profileSubsessionCounter, newest-first, so a version that is *higher*
    # than the one seen just before it belongs to an older session: the
    # client later moved to a lower version.
    backwardsversion = False
    newer_version = None  # no baseline yet; avoids a magic upper bound
    for build in row.build or ():  # a missing build history is treated as empty
        version = get_version(build.version)
        if version is None:
            # Unparseable versions neither trigger nor reset the comparison.
            continue
        if newer_version is not None and version > newer_version:
            backwardsversion = True
        newer_version = version

    count_total.add(1)
    if backwardsversion:
        count_backwardsversion.add(1)
        if release_only:
            count_backwardsversion_releaseonly.add(1)
    if channel_switch:
        count_channelswitch.add(1)
# The accumulators are only filled in once an action has run, so push every
# row through mapper() before reading the counters on the driver.
frame.rdd.foreach(mapper)

# float() keeps the percentage arithmetic out of Python 2 integer division.
total = float(count_total.value)
if total:
    print(
        "users that switched channels at all: {:.2f}%".format(
            count_channelswitch.value / total * 100
        )
    )
    print(
        "users that reverted to an older version: {:.2f}%".format(
            count_backwardsversion.value / total * 100
        )
    )
    # This figure is a percentage like the two above, so it carries the same
    # "%" suffix.
    print(
        "users that reverted to an older version, staying on the release channel: {:.2f}%".format(
            count_backwardsversion_releaseonly.value / total * 100
        )
    )
else:
    # No row was counted, so the ratios are undefined; report that instead
    # of failing with ZeroDivisionError.
    print("no clients were counted; nothing to report")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment