@vitillo
Last active August 29, 2015 13:56
Exploratory analysis hackjob for Mainthread-IO data
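The gist has three parts: a Telemetry map-reduce job (Python) that extracts per-file main-thread I/O records from the fileIOReports section of each ping, an R script that explores the resulting high-frequency data (mainthreadio.csv) and the per-file outliers (mainthreadio_outliers.csv), and a second map-reduce job that keeps only the single most expensive entry per file for the outliers analysis.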
# Map-reduce job for the high-frequency files analysis; by the look of the R
# script below it produces mainthreadio.csv.
import simplejson as json
import numpy
import io
import csv
import scipy.stats
from string import maketrans


def clean(s):
    # Strip commas from the filename so it can serve as a CSV-friendly key.
    return normalize(s).translate(None, ",")


def normalize(s):
    if type(s) == unicode:
        return s.encode('utf8', 'ignore')
    else:
        return str(s)


def safe_key(pieces):
    # CSV-quote the key pieces so embedded separators can't break the output line.
    output = io.BytesIO()
    writer = csv.writer(output, quoting=csv.QUOTE_MINIMAL)
    writer.writerow(pieces)
    return output.getvalue().strip()


def map(k, d, v, cx):
    global n_pings
    parsed = json.loads(v)
    reason, appName, appUpdateChannel, appVersion, appBuildID, submission_date = d

    if "fileIOReports" not in parsed:
        return
    if not parsed["fileIOReports"]:
        return

    disk = parsed['info'].get('profileHDDModel', "NA")
    arch = parsed['info']['arch']
    OS_version = parsed['info']['version']
    addons = parsed['info'].get('addons', "")
    addons = addons.replace(',', ';')  # keep the addon list from clashing with the field separator

    # Each fileIOReports entry is [total, n_open, n_read, n_write, n_fsync, n_stat];
    # extend it with the machine/profile dimensions of the ping and emit it per file.
    for f, arr in parsed["fileIOReports"].iteritems():
        arr.append(arch)
        arr.append(OS_version)
        arr.append(disk)
        arr.append(addons)
        arr.append(addons.count(';'))
        cx.write(safe_key([clean(f)]), arr)


def setup_reduce(cx):
    cx.field_separator = ","


def reduce(k, v, cx):
    # Keep only files reported in more than 10000 pings and emit half of their entries.
    n_pings = len(v)
    if n_pings > 10000:
        for total, n_open, n_read, n_write, n_fsync, n_stat, arch, OS_version, disk, addons, addons_count in v[:n_pings / 2]:
            cx.write(k, ",".join([str(total), str(n_open), str(n_read), str(n_write), str(n_fsync), str(n_stat), arch, OS_version, disk, addons, str(addons_count)]))
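The job above is written against a Telemetry map-reduce framework that hands map() the ping dimensions d and the raw JSON payload v, and collects output through the context object cx; reduce() then runs over the values grouped per key. The sketch below is a purely hypothetical local driver, not the framework's API: StubContext, the fabricated ping, and the dimension values are all made up, and it only illustrates the call shapes the two functions expect.

# --- Hypothetical local driver, illustrative only (not part of the gist) ---
class StubContext(object):
    def __init__(self):
        self.field_separator = "\t"
        self.results = []

    def write(self, key, value):
        self.results.append((key, value))


if __name__ == "__main__":
    # Fabricated ping with just the fields map() reads; real payloads carry far more.
    fake_ping = json.dumps({
        "info": {"arch": "x86-64", "version": "6.1", "addons": "addon1,addon2"},
        "fileIOReports": {"places.sqlite": [1200, 3, 10, 5, 1, 2]},
    })
    fake_dims = ["saved-session", "Firefox", "nightly", "30.0a1",
                 "20140220030203", "20140220"]

    map_cx = StubContext()
    map(None, fake_dims, fake_ping, map_cx)  # emits one (key, record) pair per file

    reduce_cx = StubContext()
    setup_reduce(reduce_cx)
    for key in set(k for k, _ in map_cx.results):
        # With a single fabricated ping nothing clears the 10000-ping cutoff,
        # so reduce() writes nothing here; the point is only the call shape.
        reduce(key, [val for k, val in map_cx.results if k == key], reduce_cx)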
# Exploratory analysis hackjob for Mainthread-IO data
library(data.table)
library(ggplot2)
library(doMC)
registerDoMC(8)
############################################################
# High frequency files analysis
############################################################
# Map-reduce job preselected only the files with a frequency > 10%
frame <- fread("mainthreadio.csv")
# Clean dataset
frame <- frame[frame$filename != "TOTAL",]
frame <- frame[frame$time > 0]
frame$isSQL <- as.factor(grepl("sqlite", frame$filename))
frame$hasSSD <- as.factor(grepl("SSD", frame$disk))
frame$arch <- as.factor(frame$arch)
frame$version <- as.factor(frame$OS_version)
frame$filename <- as.factor(frame$filename)
frame$addons_count <- as.numeric(frame$addons_count)
# How common are the various operations?
summary(frame$n_opens)
summary(frame$n_read)
summary(frame$n_stat)
summary(frame$n_write)
summary(frame$n_fsync)
# Is there a correlation between the number of operations and time?
cor(frame$n_write+frame$n_read+frame$n_stat+frame$n_fsync, frame$time)
# Does having an SSD disk lower the total acc. time?
summary(frame[frame$hasSSD == TRUE]$time)
summary(frame[frame$hasSSD == FALSE]$time)
wilcox.test(frame[frame$hasSSD == FALSE]$time, frame[frame$hasSSD == TRUE]$time)
# Are IO operations on sqlite databases more expensive than on normal files?
summary(frame[frame$isSQL == TRUE]$time)
summary(frame[frame$isSQL == FALSE]$time)
wilcox.test(frame[frame$isSQL == FALSE]$time, frame[frame$isSQL == TRUE]$time)
# Investigate differences between top100 offenders and genpop
frame <- frame[order(-time)]
top100 <- head(frame, n=100)
# Does the distribution of Windows versions differ between the population and the top 100?
fisher.test(cbind(summary(top100$version), summary(frame$version)), workspace=1e9)
# Does the distribution of architectures differ between the population and the top 100?
fisher.test(cbind(summary(top100$arch), summary(frame$arch)), workspace=1e9)
# Does the distribution of addons differ between the population and the top 100?
cor(frame$addons_count, frame$time) # very low
ks.test(frame$addons_count, top100$addons_count)
# Does the distribution of the disk type differ between the population and the top 100?
fisher.test(cbind(summary(top100$hasSSD), summary(frame$hasSSD)), workspace=1e9)
# Plot the top 10 outliers
top <- head(summary(frame[1:100,]$filename), n=10)
p <- qplot(x=reorder(names(top), top, function(y){-1*y}),y=top, geom="histogram", stat="identity", fill=reorder(names(top), top, function(y){-1*y}))
p <- p + theme(legend.title = element_blank(), axis.text.x = element_blank(), axis.ticks = element_blank()) + xlab("") + ylab("freq")
# Now let's look at the aggregates
third_quartile <- function(input){
  if (is.numeric(input))
    return(quantile(input, names=T)[4])  # 4th element of quantile() output is the 75th percentile
  else
    return(NA)
}
frame$count <- rep(1, nrow(frame))
agg <- frame[, list(sum(count), third_quartile(time)), by=filename]
names(agg) <- c("filename", "freq", "time")
# Plot the top 20 filenames in the highest frequency bin
agg$submission_bins <- as.numeric(cut(agg$freq, 10))
agg <- agg[order(-submission_bins, -time)]
top20 <- head(agg[submission_bins==10], n=20)
p <- qplot(x=reorder(filename, time, function(y){-1*y}), y=time, data=top20, geom="histogram", fill=reorder(filename, time, function(y){-1*y}), stat="identity")
p <- p + theme(legend.title = element_blank(), axis.text.x = element_blank(), axis.ticks = element_blank()) + xlab("") + ylab("ms")
############################################################
# Outliers analysis
############################################################
# Map-reduce job preselected only the outliers for every file
outliers <- fread("mainthreadio_outliers.csv")
outliers <- outliers[outliers$filename != "TOTAL",]
outliers <- outliers[outliers$time > 0]
outliers <- outliers[order(-time)]
# Map a path to its likely culprit: for files under an extensions directory
# return the extension's folder name, otherwise the second path component
# (or the whole name for bare filenames).
get_culprit <- function(filename){
  path <- unlist(strsplit(filename, '/'))
  if(length(path) < 2)
    return(path[1])
  else if(path[2] == "extensions")
    return(path[3])
  else
    return(path[2])
}
# Aggregate files by extensions
outliers$filename <- gsub('\\\\', '/', outliers$filename)
outliers$extension <- sapply(outliers$filename, get_culprit)
outliers$filename <- as.factor(outliers$filename)
outliers$extension <- as.factor(outliers$extension)
outliers$count <- rep(1, nrow(outliers))
agg <- outliers[, list(sum(count), max(time)), by=extension]
top <- head(agg[order(-V2)], n=25)
# Map-reduce job for the outliers analysis; by the look of the R script above it
# produces mainthreadio_outliers.csv. The imports, helpers and map stage are the
# same as in the job above; only reduce() differs.
import simplejson as json
import numpy
import io
import csv
import scipy.stats
from string import maketrans


def clean(s):
    return normalize(s).translate(None, ",")


def normalize(s):
    if type(s) == unicode:
        return s.encode('utf8', 'ignore')
    else:
        return str(s)


def safe_key(pieces):
    output = io.BytesIO()
    writer = csv.writer(output, quoting=csv.QUOTE_MINIMAL)
    writer.writerow(pieces)
    return output.getvalue().strip()


def map(k, d, v, cx):
    global n_pings
    parsed = json.loads(v)
    reason, appName, appUpdateChannel, appVersion, appBuildID, submission_date = d

    if "fileIOReports" not in parsed:
        return
    if not parsed["fileIOReports"]:
        return

    disk = parsed['info'].get('profileHDDModel', "NA")
    arch = parsed['info']['arch']
    OS_version = parsed['info']['version']
    addons = parsed['info'].get('addons', "")
    addons = addons.replace(',', ';')

    for f, arr in parsed["fileIOReports"].iteritems():
        arr.append(arch)
        arr.append(OS_version)
        arr.append(disk)
        arr.append(addons)
        arr.append(addons.count(';'))
        cx.write(safe_key([clean(f)]), arr)


def setup_reduce(cx):
    cx.field_separator = ","
def reduce(k, v, cx):
    # Emit only the single most expensive entry (largest total time) seen for this file.
    n_pings = len(v)
    top = 0
    for idx, entry in enumerate(v[:n_pings / 2]):
        if entry[0] > v[top][0]:  # compare total times
            top = idx
    total, n_open, n_read, n_write, n_fsync, n_stat, arch, OS_version, disk, addons, addons_count = v[top]
    cx.write(k, ",".join([str(total), str(n_open), str(n_read), str(n_write), str(n_fsync), str(n_stat), arch, OS_version, disk, addons, str(addons_count)]))
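The reduce output carries no header row, while the R script above addresses columns by name (filename, time, n_opens, and so on). How those names were attached isn't shown in the gist; the snippet below sketches one way it could be done, with the file name and column names inferred from the R code rather than taken from the gist.

# Hypothetical post-processing step (not shown in the gist): prepend the column
# names the R script addresses, in the order reduce() writes them: the key
# (filename) first, then the joined value fields.
header = ("filename,time,n_opens,n_read,n_write,n_fsync,n_stat,"
          "arch,OS_version,disk,addons,addons_count\n")
with open("mainthreadio_outliers.csv") as raw:
    body = raw.read()
with open("mainthreadio_outliers.csv", "w") as out:
    out.write(header + body)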