Create a gist now

Instantly share code, notes, and snippets.

Visualizing Categorizations blog post code
# Demo of techniques to visualize the predictions made by a categorization model.
pred.df$actual.bin <- ifelse(pred.df$actual == 'yes', 1, 0)
pred.df <- pred.df[order(pred.df$predicted, decreasing=TRUE), ]
pred.df$cumsum <- cumsum(pred.df$actual.bin)/sum(pred.df$actual.bin)
pred.df$cumsum.yes <- with(pred.df, cumsum * actual.bin)
baserate <- mean(pred.df$actual.bin)
# visualizations using ROCR
# convert to their object type (labels should be some sort of ordered type)
pred.rocr <- prediction(pred.df$predicted, pred.df$actual)
# some scalar summary stats
# Area Under the ROC Curve
performance(pred.rocr, 'auc')@y.values[[1]]
# Precision/Recall breakeven
performance(pred.rocr, 'prbe')@x.values[[1]]
# some ROCR graphs
# ROC Curve
plot(performance(pred.rocr, 'tpr', 'fpr'))
# Sensitivity/Specificity
plot(performance(pred.rocr, 'sens', 'spec'))
# Lift Curve
plot(performance(pred.rocr, 'lift', 'rpp'))
lift.perf <- performance(pred.rocr, 'lift', 'rpp')
lift.df <- data.frame(predicted=lift.perf@alpha.values[[1]], # alpha is cutoff
lift.df <- lift.df[2:nrow(lift.df), ]
# decile table
dec.table <- ldply((1:10)/10, function(x) data.frame(
print(dec.table, digits=2)
# dual smear graph
plot.dual.smear <- ggplot(pred.df, aes(predicted, actual.bin)) +
geom_jitter(alpha=.3, position=position_jitter(height=.04,width=0)) +
stat_smooth() +
geom_abline(slope=1, intercept=0, color='grey', size=.1) +
geom_vline(xintercept=baserate, color='orange', size=.5) +
scale_x_continuous('Predicted', breaks=(0:4)/4, limits=c(0,.75), labels=sprintf('%d%%', (0:4)*25)) +
scale_y_continuous('Actual', breaks=(0:4)/4, limits=c(-.05,1.05), labels=c('No', '', '', '', 'Yes')) +
# single smear graph
plot.single.smear <- ggplot(pred.df, aes(predicted, actual.bin, colour=actual)) +
geom_jitter(alpha=.3, position=position_jitter(height=.5,width=0)) +
geom_line(data=lift.df, aes(y=lift, color=NULL)) +
scale_color_manual(values=c('darkblue', 'darkred')) +
scale_x_continuous('Predicted', breaks=(0:4)/4, limits=c(0,.75), labels=sprintf('%d%%', (0:4)*25)) +
scale_y_continuous('Lift') +
geom_vline(xintercept=baserate, color='orange', size=.5) +
geom_hline(yintercept=1, color='darkgrey', size=.5)
# dual smear with cumulative reach of yesses
plot.dual.cumulative <- ggplot(pred.df, aes(predicted, cumsum.yes, colour=actual)) +
geom_jitter(alpha=.3, position=position_jitter(height=.04,width=0)) +
geom_vline(xintercept=baserate, color='orange', size=.5) +
scale_color_manual(values=c('darkblue', 'darkred')) +
scale_x_continuous('Predicted/Threshold', breaks=(0:4)/4, limits=c(0,.75), labels=sprintf('%d%%', (0:4)*25)) +
scale_y_continuous('Cumulative Proportion of Yesses Seen')
# pos/neg densities
plot.densities <- ggplot(pred.df, aes(predicted, ..count.., fill=actual)) +
stat_density(alpha=.5, position='identity') +
scale_x_continuous('Predicted', breaks=(0:4)/4, limits=c(0,1), labels=sprintf('%d%%', (0:4)*25)) +

It it OK to use 4 lines of this (78 to 82) for my research (Master's Thesis)?


Aleksandr, sure! It's in the public domain. If you feel like citing me, a footnote (not a formal citation) with my name and a link to the gist would be appreciated. What's your topic? (Email me directly -- my email address is easy to google.)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment