public
Last active

Visualizing Categorizations blog post code

  • Download Gist
visualizing-categorizations.R
R
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85
# Demo of techniques to visualize the predictions made by a categorization model.
 
library(ROCR)
library(ggplot2)
 
load(url('http://dl.dropbox.com/u/7644953/classifier-visualization.Rdata'))
 
pred.df$actual.bin <- ifelse(pred.df$actual == 'yes', 1, 0)
pred.df <- pred.df[order(pred.df$predicted, decreasing=TRUE), ]
 
pred.df$cumsum <- cumsum(pred.df$actual.bin)/sum(pred.df$actual.bin)
pred.df$cumsum.yes <- with(pred.df, cumsum * actual.bin)
 
head(pred.df)
summary(pred.df)
 
baserate <- mean(pred.df$actual.bin)
 
# visualizations using ROCR
 
# convert to their object type (labels should be some sort of ordered type)
pred.rocr <- prediction(pred.df$predicted, pred.df$actual)
 
# some scalar summary stats
# Area Under the ROC Curve
performance(pred.rocr, 'auc')@y.values[[1]]
# Precision/Recall breakeven
performance(pred.rocr, 'prbe')@x.values[[1]]
 
# some ROCR graphs
# ROC Curve
plot(performance(pred.rocr, 'tpr', 'fpr'))
# Sensitivity/Specificity
plot(performance(pred.rocr, 'sens', 'spec'))
# Lift Curve
plot(performance(pred.rocr, 'lift', 'rpp'))
 
lift.perf <- performance(pred.rocr, 'lift', 'rpp')
lift.df <- data.frame(predicted=lift.perf@alpha.values[[1]], # alpha is cutoff
lift=lift.perf@y.values[[1]])
lift.df <- lift.df[2:nrow(lift.df), ]
 
# decile table
dec.table <- ldply((1:10)/10, function(x) data.frame(
decile=x,
prop.yes=sum(pred.df$actual.bin[1:ceiling(nrow(pred.df)*x)])/sum(pred.df$actual.bin),
lift=mean(pred.df$actual.bin[1:ceiling(nrow(pred.df)*x)])/mean(pred.df$actual.bin)))
print(dec.table, digits=2)
 
# dual smear graph
plot.dual.smear <- ggplot(pred.df, aes(predicted, actual.bin)) +
geom_jitter(alpha=.3, position=position_jitter(height=.04,width=0)) +
stat_smooth() +
geom_abline(slope=1, intercept=0, color='grey', size=.1) +
geom_vline(xintercept=baserate, color='orange', size=.5) +
scale_x_continuous('Predicted', breaks=(0:4)/4, limits=c(0,.75), labels=sprintf('%d%%', (0:4)*25)) +
scale_y_continuous('Actual', breaks=(0:4)/4, limits=c(-.05,1.05), labels=c('No', '', '', '', 'Yes')) +
coord_equal()
 
# single smear graph
plot.single.smear <- ggplot(pred.df, aes(predicted, actual.bin, colour=actual)) +
geom_jitter(alpha=.3, position=position_jitter(height=.5,width=0)) +
geom_line(data=lift.df, aes(y=lift, color=NULL)) +
scale_color_manual(values=c('darkblue', 'darkred')) +
scale_x_continuous('Predicted', breaks=(0:4)/4, limits=c(0,.75), labels=sprintf('%d%%', (0:4)*25)) +
scale_y_continuous('Lift') +
geom_vline(xintercept=baserate, color='orange', size=.5) +
geom_hline(yintercept=1, color='darkgrey', size=.5)
# dual smear with cumulative reach of yesses
plot.dual.cumulative <- ggplot(pred.df, aes(predicted, cumsum.yes, colour=actual)) +
geom_jitter(alpha=.3, position=position_jitter(height=.04,width=0)) +
geom_vline(xintercept=baserate, color='orange', size=.5) +
scale_color_manual(values=c('darkblue', 'darkred')) +
scale_x_continuous('Predicted/Threshold', breaks=(0:4)/4, limits=c(0,.75), labels=sprintf('%d%%', (0:4)*25)) +
scale_y_continuous('Cumulative Proportion of Yesses Seen')
 
# pos/neg densities
plot.densities <- ggplot(pred.df, aes(predicted, ..count.., fill=actual)) +
stat_density(alpha=.5, position='identity') +
scale_x_continuous('Predicted', breaks=(0:4)/4, limits=c(0,1), labels=sprintf('%d%%', (0:4)*25)) +
scale_y_sqrt('Density')

It it OK to use 4 lines of this (78 to 82) for my research (Master's Thesis)?

Aleksandr, sure! It's in the public domain. If you feel like citing me, a footnote (not a formal citation) with my name and a link to the gist would be appreciated. What's your topic? (Email me directly -- my email address is easy to google.)

Harlan, Thanks! Couldn't find your email but added you to my Google+. I'm doing Bioinformatics research. The topic is: Removing Badly Aligned or Unrelated Sequences from Multiple Sequence Alignments. So, here is one of the places where you're code ends up: https://github.com/alevchuk/ms/commit/f348713a7414de40963c9c7b0ee8b4152bd82102#L0R37

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.