@vitillo
Last active August 29, 2015 13:56
Exploratory analysis hackjob for Mainthread-IO data
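The gist has three parts: a Telemetry map-reduce job (Python) that extracts per-file main-thread I/O records from the fileIOReports section of each ping, an R script that explores the resulting high-frequency data (mainthreadio.csv) and the per-file outliers (mainthreadio_outliers.csv), and a second map-reduce job that keeps only the single most expensive entry per file for the outliers analysis.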
# Map-reduce job for the high-frequency files analysis; by the look of the R
# script below it produces mainthreadio.csv.
import simplejson as json
import numpy
import io
import csv
import scipy.stats
from string import maketrans


def clean(s):
    # Strip commas from the filename so it can serve as a CSV-friendly key.
    return normalize(s).translate(None, ",")


def normalize(s):
    if type(s) == unicode:
        return s.encode('utf8', 'ignore')
    else:
        return str(s)


def safe_key(pieces):
    # CSV-quote the key pieces so embedded separators can't break the output line.
    output = io.BytesIO()
    writer = csv.writer(output, quoting=csv.QUOTE_MINIMAL)
    writer.writerow(pieces)
    return output.getvalue().strip()


def map(k, d, v, cx):
    global n_pings
    parsed = json.loads(v)
    reason, appName, appUpdateChannel, appVersion, appBuildID, submission_date = d

    if "fileIOReports" not in parsed:
        return
    if not parsed["fileIOReports"]:
        return

    disk = parsed['info'].get('profileHDDModel', "NA")
    arch = parsed['info']['arch']
    OS_version = parsed['info']['version']
    addons = parsed['info'].get('addons', "")
    addons = addons.replace(',', ';')  # keep the addon list from clashing with the field separator

    # Each fileIOReports entry is [total, n_open, n_read, n_write, n_fsync, n_stat];
    # extend it with the machine/profile dimensions of the ping and emit it per file.
    for f, arr in parsed["fileIOReports"].iteritems():
        arr.append(arch)
        arr.append(OS_version)
        arr.append(disk)
        arr.append(addons)
        arr.append(addons.count(';'))
        cx.write(safe_key([clean(f)]), arr)


def setup_reduce(cx):
    cx.field_separator = ","


def reduce(k, v, cx):
    # Keep only files reported in more than 10000 pings and emit half of their entries.
    n_pings = len(v)
    if n_pings > 10000:
        for total, n_open, n_read, n_write, n_fsync, n_stat, arch, OS_version, disk, addons, addons_count in v[:n_pings / 2]:
            cx.write(k, ",".join([str(total), str(n_open), str(n_read), str(n_write), str(n_fsync), str(n_stat), arch, OS_version, disk, addons, str(addons_count)]))
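The job above is written against a Telemetry map-reduce framework that hands map() the ping dimensions d and the raw JSON payload v, and collects output through the context object cx; reduce() then runs over the values grouped per key. The sketch below is a purely hypothetical local driver, not the framework's API: StubContext, the fabricated ping, and the dimension values are all made up, and it only illustrates the call shapes the two functions expect.

# --- Hypothetical local driver, illustrative only (not part of the gist) ---
class StubContext(object):
    def __init__(self):
        self.field_separator = "\t"
        self.results = []

    def write(self, key, value):
        self.results.append((key, value))


if __name__ == "__main__":
    # Fabricated ping with just the fields map() reads; real payloads carry far more.
    fake_ping = json.dumps({
        "info": {"arch": "x86-64", "version": "6.1", "addons": "addon1,addon2"},
        "fileIOReports": {"places.sqlite": [1200, 3, 10, 5, 1, 2]},
    })
    fake_dims = ["saved-session", "Firefox", "nightly", "30.0a1",
                 "20140220030203", "20140220"]

    map_cx = StubContext()
    map(None, fake_dims, fake_ping, map_cx)  # emits one (key, record) pair per file

    reduce_cx = StubContext()
    setup_reduce(reduce_cx)
    for key in set(k for k, _ in map_cx.results):
        # With a single fabricated ping nothing clears the 10000-ping cutoff,
        # so reduce() writes nothing here; the point is only the call shape.
        reduce(key, [val for k, val in map_cx.results if k == key], reduce_cx)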
# Exploratory analysis hackjob for Mainthread-IO data
library(data.table)
library(ggplot2)
library(doMC)
registerDoMC(8)
############################################################
# High frequency files analysis
############################################################
# Map-reduce job preselected only the files with a frequency > 10%
frame <- fread("mainthreadio.csv")
# Clean dataset
frame <- frame[frame$filename != "TOTAL",]
frame <- frame[frame$time > 0]
frame$isSQL <- as.factor(grepl("sqlite", frame$filename))
frame$hasSSD <- as.factor(grepl("SSD", frame$disk))
frame$arch <- as.factor(frame$arch)
frame$version <- as.factor(frame$OS_version)
frame$filename <- as.factor(frame$filename)
frame$addons_count <- as.numeric(frame$addons_count)
# How common are the various operations?
summary(frame$n_opens)
summary(frame$n_read)
summary(frame$n_stat)
summary(frame$n_write)
summary(frame$n_fsync)
# Is there a correlation between the number of operations and time?
cor(frame$n_write+frame$n_read+frame$n_stat+frame$n_fsync, frame$time)
# Does having an SSD disk lower the total acc. time?
summary(frame[frame$hasSSD == TRUE]$time)
summary(frame[frame$hasSSD == FALSE]$time)
wilcox.test(frame[frame$hasSSD == FALSE]$time, frame[frame$hasSSD == TRUE]$time)
# Are IO operations on sqlite databases more expensive than on normal files?
summary(frame[frame$isSQL == TRUE]$time)
summary(frame[frame$isSQL == FALSE]$time)
wilcox.test(frame[frame$isSQL == FALSE]$time, frame[frame$isSQL == TRUE]$time)
# Investigate differences between top100 offenders and genpop
frame <- frame[order(-time)]
top100 <- head(frame, n=100)
# Does the distribution of Windows versions differ between the population and the top 100?
fisher.test(cbind(summary(top100$version), summary(frame$version)), workspace=1e9)
# Does the distribution of architectures differ between the population and the top 100?
fisher.test(cbind(summary(top100$arch), summary(frame$arch)), workspace=1e9)
# Does the distribution of addons differ between the population and the top 100?
cor(frame$addons_count, frame$time) # very low
ks.test(frame$addons_count, top100$addons_count)
# Does the distribution of the disk type differ between the population and the top 100?
fisher.test(cbind(summary(top100$hasSSD), summary(frame$hasSSD)), workspace=1e9)
# Plot the top 10 outliers
top <- head(summary(frame[1:100,]$filename), n=10)
p <- qplot(x=reorder(names(top), top, function(y){-1*y}),y=top, geom="histogram", stat="identity", fill=reorder(names(top), top, function(y){-1*y}))
p <- p + theme(legend.title = element_blank(), axis.text.x = element_blank(), axis.ticks = element_blank()) + xlab("") + ylab("freq")
# Now let's look at the aggregates
third_quartile <- function(input){
  if (is.numeric(input))
    return(quantile(input, names=T)[4])  # 4th element of quantile() output is the 75th percentile
  else
    return(NA)
}
frame$count <- rep(1, nrow(frame))
agg <- frame[, list(sum(count), third_quartile(time)), by=filename]
names(agg) <- c("filename", "freq", "time")
# Plot the top 20 filenames in the highest frequency bin
agg$submission_bins <- as.numeric(cut(agg$freq, 10))
agg <- agg[order(-submission_bins, -time)]
top20 <- head(agg[submission_bins==10], n=20)
p <- qplot(x=reorder(filename, time, function(y){-1*y}), y=time, data=top20, geom="histogram", fill=reorder(filename, time, function(y){-1*y}), stat="identity")
p <- p + theme(legend.title = element_blank(), axis.text.x = element_blank(), axis.ticks = element_blank()) + xlab("") + ylab("ms")
############################################################
# Outliers analysis
############################################################
# Map-reduce job preselected only the outliers for every file
outliers <- fread("mainthreadio_outliers.csv")
outliers <- outliers[outliers$filename != "TOTAL",]
outliers <- outliers[outliers$time > 0]
outliers <- outliers[order(-time)]
# Map a path to its likely culprit: for files under an extensions directory
# return the extension's folder name, otherwise the second path component
# (or the whole name for bare filenames).
get_culprit <- function(filename){
  path <- unlist(strsplit(filename, '/'))
  if(length(path) < 2)
    return(path[1])
  else if(path[2] == "extensions")
    return(path[3])
  else
    return(path[2])
}
# Aggregate files by extensions
outliers$filename <- gsub('\\\\', '/', outliers$filename)
outliers$extension <- sapply(outliers$filename, get_culprit)
outliers$filename <- as.factor(outliers$filename)
outliers$extension <- as.factor(outliers$extension)
outliers$count <- rep(1, nrow(outliers))
agg <- outliers[, list(sum(count), max(time)), by=extension]
top <- head(agg[order(-V2)], n=25)
# Map-reduce job for the outliers analysis; by the look of the R script above it
# produces mainthreadio_outliers.csv. The imports, helpers and map stage are the
# same as in the job above; only reduce() differs.
import simplejson as json
import numpy
import io
import csv
import scipy.stats
from string import maketrans


def clean(s):
    return normalize(s).translate(None, ",")


def normalize(s):
    if type(s) == unicode:
        return s.encode('utf8', 'ignore')
    else:
        return str(s)


def safe_key(pieces):
    output = io.BytesIO()
    writer = csv.writer(output, quoting=csv.QUOTE_MINIMAL)
    writer.writerow(pieces)
    return output.getvalue().strip()


def map(k, d, v, cx):
    global n_pings
    parsed = json.loads(v)
    reason, appName, appUpdateChannel, appVersion, appBuildID, submission_date = d

    if "fileIOReports" not in parsed:
        return
    if not parsed["fileIOReports"]:
        return

    disk = parsed['info'].get('profileHDDModel', "NA")
    arch = parsed['info']['arch']
    OS_version = parsed['info']['version']
    addons = parsed['info'].get('addons', "")
    addons = addons.replace(',', ';')

    for f, arr in parsed["fileIOReports"].iteritems():
        arr.append(arch)
        arr.append(OS_version)
        arr.append(disk)
        arr.append(addons)
        arr.append(addons.count(';'))
        cx.write(safe_key([clean(f)]), arr)


def setup_reduce(cx):
    cx.field_separator = ","
def reduce(k, v, cx):
    # Emit only the single most expensive entry (largest total time) seen for this file.
    n_pings = len(v)
    top = 0
    for idx, entry in enumerate(v[:n_pings / 2]):
        if entry[0] > v[top][0]:  # compare total times
            top = idx
    total, n_open, n_read, n_write, n_fsync, n_stat, arch, OS_version, disk, addons, addons_count = v[top]
    cx.write(k, ",".join([str(total), str(n_open), str(n_read), str(n_write), str(n_fsync), str(n_stat), arch, OS_version, disk, addons, str(addons_count)]))
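The reduce output carries no header row, while the R script above addresses columns by name (filename, time, n_opens, and so on). How those names were attached isn't shown in the gist; the snippet below sketches one way it could be done, with the file name and column names inferred from the R code rather than taken from the gist.

# Hypothetical post-processing step (not shown in the gist): prepend the column
# names the R script addresses, in the order reduce() writes them: the key
# (filename) first, then the joined value fields.
header = ("filename,time,n_opens,n_read,n_write,n_fsync,n_stat,"
          "arch,OS_version,disk,addons,addons_count\n")
with open("mainthreadio_outliers.csv") as raw:
    body = raw.read()
with open("mainthreadio_outliers.csv", "w") as out:
    out.write(header + body)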