Last active
August 29, 2015 13:56
-
-
Save vitillo/9175242 to your computer and use it in GitHub Desktop.
Exploratory analysis hackjob for Mainthread-IO data
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import simplejson as json | |
import numpy | |
import io | |
import csv | |
import scipy.stats | |
from string import maketrans | |
def clean(s):
    """Return `s` as a byte string with every comma removed.

    Commas are stripped so the value cannot break the comma-separated
    reduce output (see setup_reduce).
    """
    normalized = normalize(s)
    return normalized.translate(None, ",")
def normalize(s):
    """Coerce `s` to a UTF-8 byte string (Python 2 semantics).

    Unicode strings are encoded to UTF-8, silently dropping characters
    that cannot be encoded; anything else goes through str().
    """
    if type(s) == unicode:
        return s.encode('utf8', 'ignore')
    return str(s)
def safe_key(pieces):
    """Serialize `pieces` as one CSV-escaped row with the trailing newline stripped.

    Using the csv module guarantees quoting of any piece that contains
    separators, so the result is safe to use as a map-reduce key.
    """
    buf = io.BytesIO()
    csv.writer(buf, quoting=csv.QUOTE_MINIMAL).writerow(pieces)
    return buf.getvalue().strip()
def map(k, d, v, cx):
    """Map phase: emit (cleaned filename -> per-file IO stats) pairs.

    k  -- record key (unused)
    d  -- dimension tuple: (reason, appName, appUpdateChannel, appVersion,
          appBuildID, submission_date)
    v  -- raw telemetry ping as a JSON string
    cx -- map-reduce context providing write(key, value)

    Pings without a non-empty "fileIOReports" section are skipped.
    """
    # BUGFIX: removed dead `global n_pings` — n_pings is never assigned in
    # this function (it is a local in reduce()), so the declaration was a
    # misleading no-op.
    parsed = json.loads(v)
    reason, appName, appUpdateChannel, appVersion, appBuildID, submission_date = d
    if not "fileIOReports" in parsed:
        return
    if not parsed["fileIOReports"]:
        return
    disk = parsed['info'].get('profileHDDModel', "NA")
    arch = parsed['info']['arch']
    OS_version = parsed['info']['version']
    # Addon list is comma-separated; switch to ';' so it survives the
    # comma-separated reduce output.
    addons = parsed['info'].get('addons', "")
    addons = addons.replace(',', ';')
    for f, arr in parsed["fileIOReports"].iteritems():
        # Augment the per-file stat row with environment info.
        arr.append(arch)
        arr.append(OS_version)
        arr.append(disk)
        arr.append(addons)
        # NOTE(review): this counts separators, i.e. (number of addons - 1)
        # for a non-empty list and 0 for both zero and one addon — confirm
        # that is the intended "addons_count" semantics.
        arr.append(addons.count(';'))
        cx.write(safe_key([clean(f)]), arr)
def setup_reduce(cx):
    """Configure the reduce context to separate output fields with commas."""
    cx.field_separator = ","
def reduce(k, v, cx):
    """Reduce phase: for high-frequency files (> 10000 pings), emit the
    first half of the collected stat rows as CSV lines.

    k  -- filename key produced by map()
    v  -- list of stat rows: [total, n_open, n_read, n_write, n_fsync,
          n_stat, arch, OS_version, disk, addons, addons_count]
    cx -- context providing write(key, value)
    """
    n_pings = len(v)
    if n_pings > 10000:
        # Emit only half the samples to bound output size.
        # BUGFIX: use floor division so the slice bound stays an int under
        # Python 3 as well (identical result under Python 2).
        for total, n_open, n_read, n_write, n_fsync, n_stat, arch, OS_version, disk, addons, addons_count in v[:n_pings // 2]:
            cx.write(k, ",".join([str(total), str(n_open), str(n_read), str(n_write), str(n_fsync), str(n_stat), arch, OS_version, disk, addons, str(addons_count)]))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Exploratory analysis hackjob for Mainthread-IO data
library(data.table)
library(ggplot2)
library(doMC)
# Parallel backend with 8 workers for foreach-based operations.
registerDoMC(8)

############################################################
# High frequency files analysis
############################################################

# Map-reduce job preselected only the files with a frequency > 10%
frame <- fread("mainthreadio.csv")

# Clean dataset
# Drop the synthetic "TOTAL" aggregate row and rows with non-positive time.
frame <- frame[frame$filename != "TOTAL",]
frame <- frame[frame$time > 0]
# Derived columns: sqlite files, SSD disks (detected by substring match on
# the disk model string — NOTE(review): model strings without "SSD" in the
# name will be misclassified as non-SSD).
frame$isSQL <- as.factor(grepl("sqlite", frame$filename))
frame$hasSSD <- as.factor(grepl("SSD", frame$disk))
frame$arch <- as.factor(frame$arch)
frame$version <- as.factor(frame$OS_version)
frame$filename <- as.factor(frame$filename)
frame$addons_count <- as.numeric(frame$addons_count)
# How common are the various operations?
# NOTE(review): the map-reduce emitter writes a field named "n_open";
# "n_opens" here may be a typo — verify against the CSV header.
summary(frame$n_opens)
summary(frame$n_read)
summary(frame$n_stat)
summary(frame$n_write)
summary(frame$n_fsync)

# Is there a correlation between the number of operations and time?
cor(frame$n_write+frame$n_read+frame$n_stat+frame$n_fsync, frame$time)

# Does having an SSD disk lower the total acc. time?
summary(frame[frame$hasSSD == TRUE]$time)
summary(frame[frame$hasSSD == FALSE]$time)
# Nonparametric test since IO times are heavily skewed.
wilcox.test(frame[frame$hasSSD == FALSE]$time, frame[frame$hasSSD == TRUE]$time)

# Are IO operations on sqlite databases more expensive than on normal files?
summary(frame[frame$isSQL == TRUE]$time)
summary(frame[frame$isSQL == FALSE]$time)
wilcox.test(frame[frame$isSQL == FALSE]$time, frame[frame$isSQL == TRUE]$time)

# Investigate differences between top100 offenders and genpop
frame <- frame[order(-time)]
top100 <- head(frame, n=100)

# Does the distribution of windows versions differ between the population and the top 100
fisher.test(cbind(summary(top100$version), summary(frame$version)), workspace=1e9)

# Does the distribution of architectures differ between the population and the top 100?
fisher.test(cbind(summary(top100$arch), summary(frame$arch)), workspace=1e9)

# Does the distribution of addons differ between the population and the top 100?
cor(frame$addons_count, frame$time) # very low
ks.test(frame$addons_count, head(frame, n=100)$addons_count)

# Does the distribution of the disk type differ between the population and the top 100?
fisher.test(cbind(summary(top100$hasSSD), summary(frame$hasSSD)), workspace=1e9)

# Plot the top 10 outliers
# summary() on a factor gives per-level counts; frame is already sorted by
# descending time, so rows 1:100 are the worst offenders.
top <- head(summary(frame[1:100,]$filename), n=10)
p <- qplot(x=reorder(names(top), top, function(y){-1*y}),y=top, geom="histogram", stat="identity", fill=reorder(names(top), top, function(y){-1*y}))
p <- p + theme(legend.title = element_blank(), axis.text.x = element_blank(), axis.ticks = element_blank()) + xlab("") + ylab("freq")
third_quartile <- function(input) {
  # Third quartile (75th percentile) of a numeric vector; NA for
  # non-numeric input so aggregation over mixed columns stays safe.
  # BUGFIX/idiom: use TRUE instead of the reassignable shorthand T,
  # and brace both branches.
  if (is.numeric(input)) {
    # quantile() returns c(0%, 25%, 50%, 75%, 100%); element 4 is Q3.
    return(quantile(input, names = TRUE)[4])
  }
  NA
}
# Helper column so sum(count) gives per-group row counts in the
# data.table aggregation below.
frame$count <- rep(1, nrow(frame))
# Per filename: number of submissions and the Q3 of accumulated time.
agg <- frame[, list(sum(count), third_quartile(time)), by=filename]
names(agg) <- c("filename", "freq", "time")

# Plot the top 20 filenames in the highest frequency bin
# Bin frequencies into 10 equal-width intervals; bin 10 = most frequent.
agg$submission_bins <- as.numeric(cut(agg$freq, 10))
agg <- agg[order(-submission_bins, -time)]
top20 <- head(agg[submission_bins==10], n=20)
p <- qplot(x=reorder(filename, time, function(y){-1*y}), y=time, data=top20, geom="histogram", fill=reorder(filename, time, function(y){-1*y}), stat="identity")
p <- p + theme(legend.title = element_blank(), axis.text.x = element_blank(), axis.ticks = element_blank()) + xlab("") + ylab("ms")

############################################################
# Outliers analysis
############################################################

# Map-reduce job preselected only the outliers for every file
outliers <- fread("mainthreadio_outliers.csv")
# Same cleaning as the main frame: drop "TOTAL" rows and non-positive times,
# then sort by descending time.
outliers <- outliers[outliers$filename != "TOTAL",]
outliers <- outliers[outliers$time > 0]
outliers <- outliers[order(-time)]
get_culprit <- function(filename) {
  # Identify the component responsible for a file path:
  #  - a single-segment path names itself,
  #  - a path under ".../extensions/<id>/..." names the extension id,
  #  - otherwise the second path segment is blamed.
  # Paths are expected to use '/' separators (normalized by the caller).
  segments <- strsplit(filename, '/')[[1]]
  if (length(segments) < 2) {
    return(segments[1])
  }
  if (segments[2] == "extensions") {
    return(segments[3])
  }
  segments[2]
}
# Aggregate files by extensions
# Normalize Windows backslash separators to '/' so get_culprit can split.
outliers$filename <- gsub('\\\\', '/', outliers$filename)
outliers$extension <- sapply(outliers$filename, get_culprit)
outliers$filename <- as.factor(outliers$filename)
outliers$extension <- as.factor(outliers$extension)
# Helper column so sum(count) yields per-extension row counts.
outliers$count <- rep(1, nrow(outliers))
# V1 = number of outlier rows per extension, V2 = worst-case time.
agg <- outliers[, list(sum(count), max(time)), by=extension]
top <- head(agg[order(-V2)], n=25)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import simplejson as json | |
import numpy | |
import io | |
import csv | |
import scipy.stats | |
from string import maketrans | |
def clean(s):
    """Byte-string form of `s` with all commas stripped.

    Stripping commas keeps the value safe inside the comma-separated
    reduce output.
    """
    s_bytes = normalize(s)
    return s_bytes.translate(None, ",")
def normalize(s):
    """Return `s` as a UTF-8 encoded byte string (Python 2 semantics).

    Non-unicode values are stringified; unicode values are encoded,
    ignoring unencodable characters.
    """
    if type(s) != unicode:
        return str(s)
    return s.encode('utf8', 'ignore')
def safe_key(pieces):
    """Join `pieces` into a single CSV row string (minimal quoting).

    The csv module escapes any embedded separators so the result is a
    safe map-reduce key; the trailing newline is stripped.
    """
    output = io.BytesIO()
    writer = csv.writer(output, quoting=csv.QUOTE_MINIMAL)
    writer.writerow(pieces)
    row = output.getvalue()
    return row.strip()
def map(k, d, v, cx):
    """Map phase: emit (cleaned filename -> per-file IO stats) pairs.

    k  -- record key (unused)
    d  -- dimension tuple: (reason, appName, appUpdateChannel, appVersion,
          appBuildID, submission_date)
    v  -- raw telemetry ping as a JSON string
    cx -- map-reduce context providing write(key, value)

    Pings without a non-empty "fileIOReports" section are skipped.
    """
    # BUGFIX: removed dead `global n_pings` — n_pings is never assigned in
    # this function (it is a local in reduce()), so the declaration was a
    # misleading no-op.
    parsed = json.loads(v)
    reason, appName, appUpdateChannel, appVersion, appBuildID, submission_date = d
    if not "fileIOReports" in parsed:
        return
    if not parsed["fileIOReports"]:
        return
    disk = parsed['info'].get('profileHDDModel', "NA")
    arch = parsed['info']['arch']
    OS_version = parsed['info']['version']
    # Addon list is comma-separated; switch to ';' so it survives the
    # comma-separated reduce output.
    addons = parsed['info'].get('addons', "")
    addons = addons.replace(',', ';')
    for f, arr in parsed["fileIOReports"].iteritems():
        # Augment the per-file stat row with environment info.
        arr.append(arch)
        arr.append(OS_version)
        arr.append(disk)
        arr.append(addons)
        # NOTE(review): counts separators, i.e. (number of addons - 1) for a
        # non-empty list — confirm intended "addons_count" semantics.
        arr.append(addons.count(';'))
        cx.write(safe_key([clean(f)]), arr)
def setup_reduce(cx):
    """Tell the framework to emit reduce output fields comma-separated."""
    cx.field_separator = ","
def reduce(k, v, cx):
    """Reduce phase: emit the single worst (highest total time) sample
    found in the first half of the collected stat rows for this file.

    k  -- filename key produced by map()
    v  -- list of stat rows: [total, n_open, n_read, n_write, n_fsync,
          n_stat, arch, OS_version, disk, addons, addons_count]
    cx -- context providing write(key, value)
    """
    n_pings = len(v)
    top = 0
    # Scan the first half of the samples for the largest total time.
    # Floor division keeps the slice bound an int on Python 3 too.
    for idx, entry in enumerate(v[:n_pings // 2]):
        # BUGFIX: compare against the current best entry's total time
        # (v[top][0]); the original compared an int to the whole list
        # v[top], which under Python 2 is always False, so `top` never
        # advanced past 0.
        if entry[0] > v[top][0]:
            top = idx
    total, n_open, n_read, n_write, n_fsync, n_stat, arch, OS_version, disk, addons, addons_count = v[top]
    # BUGFIX: the original line was missing the closing parenthesis on
    # cx.write(...), which is a syntax error.
    cx.write(k, ",".join([str(total), str(n_open), str(n_read), str(n_write), str(n_fsync), str(n_stat), arch, OS_version, disk, addons, str(addons_count)]))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment