Skip to content

Instantly share code, notes, and snippets.

@vitillo
Last active August 29, 2015 14:01
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save vitillo/dcf2b9ee53c5f5c6c5e3 to your computer and use it in GitHub Desktop.
Save vitillo/dcf2b9ee53c5f5c6c5e3 to your computer and use it in GitHub Desktop.
Quick and dirty Telemetry analysis of slow-shutdown.
import json
def map(k, d, v, cx):
    """Map phase: emit (addon_name, 1) for every add-on listed in a ping.

    k, d -- record key and dimensions supplied by the framework (unused here)
    v    -- raw JSON payload of one telemetry submission
    cx   -- mapreduce context; cx.write(key, value) emits a key/value pair
    """
    j = json.loads(v)
    try:
        addons = j['info']['addons']
    except (KeyError, TypeError):
        # Fixed: was a bare `except:` which also swallowed SystemExit /
        # KeyboardInterrupt. We only want to skip pings whose payload lacks
        # an 'info' dict or an 'addons' entry.
        return
    if addons is None:
        return
    for addon in addons.split(','):
        # Entries look like "name:version"; strip the version number.
        addon = addon.split(':')[0]
        cx.write(addon, 1)
def reduce(k, v, cx):
    # Reduce phase: total the per-ping counts for add-on `k` and write one
    # CSV row of the form [count, addon-name].
    total = sum(v)
    cx.writecsv([total, k])
#!/bin/bash
# Driver for the whole analysis:
# 1. run the "addons" job against S3 data (see run.sh),
# 2. sort its output by count, descending, into addons.csv
#    (-V version-sort keeps numeric ordering on the count column),
# 3. run the "generic" job locally against cached data (see run_local.sh),
# 4. prepend the CSV header produced as a side effect of generic.py.
# NOTE(review): generic.py writes ../header.txt when it is loaded by the
# job runner — confirm header.txt exists here before step 4.
./run.sh addons
sort -r -V -k 1,1 /mnt/telemetry/addons.out > addons.csv
./run_local.sh generic
cat header.txt /mnt/telemetry/generic.out > ./generic.csv
{
"version": 1,
"dimensions": [
{
"field_name": "reason",
"allowed_values": ["saved-session"]
},
{
"field_name": "appName",
"allowed_values": "Firefox"
},
{
"field_name": "appUpdateChannel",
"allowed_values": ["release"]
},
{
"field_name": "appVersion",
"allowed_values": "28.0"
},
{
"field_name": "appBuildID",
"allowed_values": "*"
},
{
"field_name": "submission_date",
"allowed_values": ["20140420"]
}
]
}
import json
simple_keys = ["shutdownDuration" , "failedProfileLockCount" ,"maximalNumberOfConcurrentThreads", "uptime", "pingsOverdue", "savedPings", "sessionRestored"]
system_keys = ["OS", "cpucount", "memsize", "arch", "version", "adapterVendorID", "flashVersion"]
addon_keys = []
# Get top addons
with open('../addons.csv') as f:
addons = []
subs = -1
lines = f.readlines()
total = float(lines[0].split(',')[0])
threshold = 100
for idx, line in enumerate(lines[1:]):
if idx == threshold:
break
nr, name = line.split(',')
nr = float(nr)
if nr >= 0.01*total:
addon_keys.append(name.strip())
else:
break
assert(len(addon_keys) > 0)
with open('../header.txt', 'w') as f:
def ph(keys):
for key in keys:
f.write(key +",")
ph(simple_keys)
ph(system_keys)
ph(addon_keys)
f.write('multiplicity\n')
def add(keys, dic, add_bool = False):
    """Return one value per key, preserving the order of `keys`.

    For a key present in `dic`: dic[key], or the constant 1 when `add_bool`
    is set (used for membership flags, where `dic` may be a set).
    For an absent key: None.
    """
    row = []
    for key in keys:
        if key not in dic:
            row.append(None)
        elif add_bool:
            row.append(1)
        else:
            row.append(dic[key])
    return row
def parse_addons(raw):
    """Turn a raw "name:version,name:version,..." string into the set of
    add-on names, dropping version numbers and surrounding whitespace."""
    names = {entry.split(':')[0].strip() for entry in raw.split(',')}
    assert(len(names) > 0)
    return names
def map(k, d, v, cx):
    """Map phase: emit one row per ping — the simple/system measurements
    followed by a 0/1-style flag column per top add-on — with value 1 so
    reduce can count identical rows (the 'multiplicity' column).

    Fixes over the original:
    - pings without an 'addons' field used to emit a SHORT row, producing
      ragged CSV lines misaligned with header.txt; we now pad the add-on
      columns with None instead;
    - a null 'addons' value used to reach parse_addons() and crash; it is
      now treated the same as a missing field (matching the addons job).
    """
    j = json.loads(v)
    out = []
    out += add(simple_keys, j['simpleMeasurements'])
    out += add(system_keys, j['info'])
    raw_addons = j['info'].get('addons')
    if raw_addons:
        out += add(addon_keys, parse_addons(raw_addons), True)
    else:
        # Keep the column count consistent with the header.
        out += add(addon_keys, {}, True)
    cx.write(tuple(out), 1)
def reduce(k, v, cx):
    # k is one distinct measurement tuple, v the list of 1s emitted for it.
    # Output: the tuple's fields followed by its multiplicity.
    multiplicity = sum(v)
    row = list(k)
    row.append(multiplicity)
    cx.writecsv(row)
#!/bin/bash
# run.sh — launch a telemetry-server map-reduce job against the published
# S3 bucket. $1 names both the analysis script (../$1.py) and the output
# file (/mnt/telemetry/$1.out); records are pre-filtered by ../filter.json.
# NOTE(review): --data-dir and --work-dir point at the same directory here,
# unlike run_local.sh which uses work/cache for data — confirm intentional.
cd telemetry-server
python -m mapreduce.job ../$1.py \
--input-filter ../filter.json \
--num-mappers 16 \
--num-reducers 4 \
--data-dir /mnt/telemetry/work \
--work-dir /mnt/telemetry/work \
--output /mnt/telemetry/$1.out \
--bucket "telemetry-published-v1"
#!/bin/bash
# run_local.sh — same job as run.sh, but with --local-only: processes the
# already-downloaded data in the local cache directory instead of fetching
# from the S3 bucket. $1 names the analysis script and the output file.
cd telemetry-server
python -m mapreduce.job ../$1.py \
--input-filter ../filter.json \
--num-mappers 16 \
--num-reducers 4 \
--data-dir /mnt/telemetry/work/cache \
--work-dir /mnt/telemetry/work \
--output /mnt/telemetry/$1.out \
--bucket "telemetry-published-v1" \
--local-only
library(ggplot2)
library(dplyr)
library(GGally)
library(vcd)
library(pwr)
effect_table <- function(table) {
  # Effect size (Cramer's V) of the association in a contingency table,
  # computed via vcd::assocstats. Fixed: the original indexed the result
  # list positionally with [5], which silently returns the wrong statistic
  # if assocstats ever reorders its components; the named `cramer` element
  # is the documented, stable accessor.
  as.numeric(assocstats(table)$cramer)
}
power_chisq <- function(table, test) {
  # Post-hoc power of a chi-squared test: given the table's total N and the
  # degrees of freedom taken from the fitted test object, how much power is
  # there to detect a small effect (w = 0.1) at alpha = 0.05?
  n_total <- sum(table)
  degrees <- as.numeric(test[2])
  pwr.chisq.test(w=0.1, N=n_total, df=degrees, sig.level=0.05)
}
remove_outliers <- function(x, na.rm = TRUE, ...) {
  # Censor the extreme tails of x: anything below the 0.5th or above the
  # 99.5th percentile becomes NA; every other value is returned unchanged.
  cutoffs <- quantile(x, probs=c(.005, .995), na.rm = na.rm, ...)
  out <- x
  out[x < cutoffs[1] | x > cutoffs[2]] <- NA
  out
}
# Strip leading and trailing whitespace from every element of x.
trim <- function (x) {
  gsub("^\\s+|\\s+$", "", x)
}
get_flash_version <- function(str) {
  # Major Flash version: everything before the first dot,
  # e.g. "11.2.202.235" -> "11". Accepts factors via as.character().
  version_string <- as.character(str)
  sub("\\..*", "", version_string)
}
clean <- function(df) {
  # Tidy a raw telemetry data frame: drop bookkeeping columns, NA out the
  # extreme tails of the duration measures, keep only common core counts,
  # and recode the remaining covariates as factors / memory bins suitable
  # for the contingency-table tests below.
  # Fixed: dplyr's old chain operator %.% is deprecated (removed in modern
  # dplyr); %>% is its drop-in replacement with identical semantics here.
  df %>%
    select(-multiplicity, -pingsOverdue, -savedPings) %>%
    mutate(shutdownDuration = remove_outliers(shutdownDuration),
           uptime = remove_outliers(uptime),
           flashMainVersion = as.factor(get_flash_version(flashVersion))) %>%
    filter(!is.na(uptime),
           !is.na(shutdownDuration),
           cpucount %in% c(1, 2, 4, 8, 12)) %>%
    mutate(cpucount = as.factor(cpucount),
           # TRUE iff the profile lock ever failed (binary factor).
           failedProfileLockCount = as.factor(failedProfileLockCount >= 1),
           version = as.factor(version),
           adapterVendorID = as.factor(adapterVendorID),
           # Upper-bound-labelled RAM bins in MB.
           memsize = cut(memsize,
                         breaks=c(0, 1024, 2048, 4096, 8192, 16384, Inf),
                         labels=c("1024", "2048", "4096", "8192", "16384", "Inf")))
}
plot_shutdown <- function(df) {
  # Histogram of shutdownDuration on a log10 x-axis, normalised so the bar
  # heights sum to 1 (a discrete density).
  ggplot(data=df, mapping=aes(x=shutdownDuration)) +
    geom_histogram(aes(y=..count../sum(..count..)),
                   fill="blue", alpha=0.7, binwidth=0.01) +
    scale_x_log10(name="Shutdown Duration") +
    scale_y_continuous(name="Density") +
    theme_bw()
}
# Drop the first two characters of each element — presumably the "1:" level
# prefix that summary() puts on factor counts (verify against adistrib()).
rem <- function (param) {
  substring(param, 3)
}
adistrib <- function(df) {
  # For each add-on column (columns 11 onward — assumes the 10 measurement
  # columns come first; confirm against header.txt), count how many rows
  # have the add-on present. summary() of each factor column yields strings
  # like "1: 42", from which rem()/trim()/strtoi() recover the integer.
  # NOTE(review): taking row [2,] of the summary assumes every column has
  # at least two factor levels — verify on sparse samples.
  addon_cols <- df[, c(11:length(df))]
  counts <- summary(sapply(addon_cols, as.factor))[2,]
  counts <- sapply(counts, rem)
  counts <- sapply(counts, trim)
  # Fixed: the original ended with `addons <- sapply(...)`, an assignment,
  # which returns the value INVISIBLY — so the bare `adistrib(df_lock)` call
  # at the end of the analysis printed nothing. Return the counts visibly.
  sapply(counts, strtoi)
}
#######################################################################################
#######################################################################################
# Part 1: a representative sample of the release 28 population.
df <- read.csv("generic_big.csv", row.names=NULL)
df[is.na(df)] <- 0 # we can safely impute to 0 (assumption: NA means the field/add-on was absent)
# Let's clean up the data (outlier censoring + factor recoding; see clean())
df <- clean(df)
# Let's see if we can spot some pattern (pairwise plots; slow, left disabled)
#ggpairs(df[, c(1:9)])
# Can't see anything clearly correlated to shutdownDuration, besides a small correlation with
# cpucount and memsize
# Plot shutdownDuration distribution
plot_shutdown(df)
# We got a nice spike from 5000 to 8000
# Plot shutdownDuration vs cpucount (boxplots per core count)
ggplot(aes(y=shutdownDuration, x=cpucount), data=df) +
geom_boxplot(fill="blue", alpha=0.7) +
scale_x_discrete(name="Cores") +
scale_y_continuous(name="Shutdown Duration") +
coord_cartesian(ylim = c(-500, 10000)) +
theme_bw()
# The number of cores influences negatively shutdownDuration;
# since memsize is correlated to the number of cores there is no need for another plot.
# Investigate submissions with a shutdown duration of 5000-8000:
# df_com = the common (fast) group, df_slow = the 5000-8000 spike.
df_com <- df %.% filter(shutdownDuration <= 5000)
df_slow <- df %.% filter(shutdownDuration > 5000 & shutdownDuration < 8000)
# Is the distribution of threads different between the groups?
wilcox.test(df_com$maximalNumberOfConcurrentThreads, df_slow$maximalNumberOfConcurrentThreads)
summary(df_com$maximalNumberOfConcurrentThreads)
summary(df_slow$maximalNumberOfConcurrentThreads) # Looks like the distribution is shifted to the right by 1
# Let's have a look at the cpu count.
# NOTE(review): `table` shadows base::table() from here on — fine for this
# script, but don't call table() as a function below this point.
table <- rbind(summary(df_com$cpucount), summary(df_slow$cpucount))
chisq.test(table, simulate.p.value = TRUE)
# df_slow contains machines with fewer cores, news at 11
# Let's have a look at the memory
summary(df_com$memsize)
summary(df_slow$memsize)
table <- rbind(summary(df_com$memsize), summary(df_slow$memsize))
effect_table(table) # small effect size, but still interesting
chisq.test(table, simulate.p.value=TRUE)
# df_slow contains machines with less memory, news at 11
# Let's have a look at the flash version
summary(df_com$flashMainVersion)
summary(df_slow$flashMainVersion)
table <- rbind(summary(df_com$flashMainVersion), summary(df_slow$flashMainVersion))
effect_table(table)
chisq.test(table, simulate.p.value=TRUE) # Nothing really interesting here
# Let's compare the failed profile lock count (binary factor: FALSE/TRUE)
table <- rbind(summary(df_com$failedProfileLockCount), summary(df_slow$failedProfileLockCount))
effect_table(table) # small, still worth investigating though
chisq.test(table, simulate.p.value=TRUE)
# Ratio of locked to unlocked profiles within each group
com <- summary(df_com$failedProfileLockCount)[2] / summary(df_com$failedProfileLockCount)[1]
slow <- summary(df_slow$failedProfileLockCount)[2] / summary(df_slow$failedProfileLockCount)[1]
cat("slow has", slow/com, "times more locks than com")
cat("% of locked profiles for slow:", slow*100)
# Note that even though slow has significantly more locks than com (6.5 times), the chance of having a lock
# in the first place is quite low (1-2%)
# Is the distribution of sessionRestored different?
wilcox.test(df_com$sessionRestored, df_slow$sessionRestored)
summary(df_com$sessionRestored)
summary(df_slow$sessionRestored)
# Let's compare the distribution of add-ons between the common and the slow
# group; an NA count (add-on never seen in one group) is imputed to 0.
addons <- rbind(adistrib(df_com), adistrib(df_slow))
addons[is.na(addons)] <- 0
# Let's just use the most frequent ones; if we can't spot a pattern there
# then all hopes are lost.
# Fixed: was `top_addons = ...` — the only `=` top-level assignment in the
# file; use `<-` for consistency with the rest of the script.
top_addons <- addons[, 1:20]
effect_table(addons) # very small
test <- chisq.test(top_addons)
# Couldn't find any difference; do we have enough power (0.8) for our smallest
# significant effect size (0.1)?
power_chisq(top_addons, test) # We do
#######################################################################################
#######################################################################################
# Part 2: a "big" sample restricted to submissions with failedProfileLockCount >= 1
df_lock <- read.csv("lockcount.csv", row.names=NULL)
df_lock[is.na(df_lock)] <- 0 # we can safely impute to 0 (same assumption as part 1)
df_lock <- clean(df_lock)
# Let's look at the OS
summary(df_lock$OS) # There is only WINNT
# Restrict both samples to the common Windows versions so the contingency
# tables below have matching columns.
# NOTE(review): this re-filters the part-1 `df` in place; the part-1 numbers
# above are no longer reproducible from this df.
df <- df %.%
filter(version %in% c(5.1, 5.2, 6.1, 6.2, 6.3)) %.%
mutate(version = factor(version))
df_lock <- df_lock %.%
filter(version %in% c(5.1, 5.2, 6.1, 6.2, 6.3)) %.%
mutate(version = factor(version))
# Let's see if we can spot some pattern (slow; left disabled)
#ggpairs(df_lock[, c(1, 3, 4, 6, 7, 8, 9)])
# Plot shutdownDuration distribution
plot_shutdown(df_lock)
# shutdownDuration in df_lock is significantly higher than in df;
# it seems to be 10 times higher when comparing medians
# Is the distribution of threads different?
wilcox.test(df_lock$maximalNumberOfConcurrentThreads, df$maximalNumberOfConcurrentThreads)
summary(df$maximalNumberOfConcurrentThreads)
summary(df_lock$maximalNumberOfConcurrentThreads) # Looks like the distribution is shifted to the right by 3
# Is the uptime distribution different?
wilcox.test(df_lock$uptime, df$uptime)
summary(df$uptime)
summary(df_lock$uptime) # whoops, uptime seems to be 5 times higher when comparing medians
# Is the OS version different? (only WINNT data entries show up in df_lock)
table <- rbind(summary(df$version), summary(df_lock$version))
chisq.test(table) # Yep, let's see what our effect size is
effect_table(table) # Medium (0.35), so it's significant
# Seems like fewer users here are on 6.3, while the distribution is the same for the other versions
# Is the CPU count distribution different?
table <- rbind(summary(df$cpucount), summary(df_lock$cpucount))
chisq.test(table) # Yep, let's see what our effect size is
effect_table(table) # Medium (0.35), so it's significant
# CPU distribution is shifted slightly to the left, i.e. lower-end machines
# Let's have a look at the flash version
summary(df$flashMainVersion)
summary(df_lock$flashMainVersion)
table <- rbind(summary(df$flashMainVersion), summary(df_lock$flashMainVersion))
effect_table(table)
chisq.test(table, simulate.p.value=TRUE) # There is a clear difference here
# Is the distribution of sessionRestored different?
wilcox.test(df_lock$sessionRestored, df$sessionRestored)
summary(df$sessionRestored)
summary(df_lock$sessionRestored)
# Is the distribution for the top add-ons different?
adistrib(df_lock) # Looks like there isn't an add-on that correlates with all those submissions so don't bother
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment