Skip to content

Embed URL

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Visualizing Categorizations blog post code
# Demo of techniques to visualize the predictions made by a categorization model.
library(ROCR)
library(ggplot2)
load(url('http://dl.dropbox.com/u/7644953/classifier-visualization.Rdata'))
pred.df$actual.bin <- ifelse(pred.df$actual == 'yes', 1, 0)
pred.df <- pred.df[order(pred.df$predicted, decreasing=TRUE), ]
pred.df$cumsum <- cumsum(pred.df$actual.bin)/sum(pred.df$actual.bin)
pred.df$cumsum.yes <- with(pred.df, cumsum * actual.bin)
head(pred.df)
summary(pred.df)
baserate <- mean(pred.df$actual.bin)
# visualizations using ROCR
# convert to their object type (labels should be some sort of ordered type)
pred.rocr <- prediction(pred.df$predicted, pred.df$actual)
# some scalar summary stats
# Area Under the ROC Curve
performance(pred.rocr, 'auc')@y.values[[1]]
# Precision/Recall breakeven
performance(pred.rocr, 'prbe')@x.values[[1]]
# some ROCR graphs
# ROC Curve
plot(performance(pred.rocr, 'tpr', 'fpr'))
# Sensitivity/Specificity
plot(performance(pred.rocr, 'sens', 'spec'))
# Lift Curve
plot(performance(pred.rocr, 'lift', 'rpp'))
lift.perf <- performance(pred.rocr, 'lift', 'rpp')
lift.df <- data.frame(predicted=lift.perf@alpha.values[[1]], # alpha is cutoff
lift=lift.perf@y.values[[1]])
lift.df <- lift.df[2:nrow(lift.df), ]
# decile table
dec.table <- ldply((1:10)/10, function(x) data.frame(
decile=x,
prop.yes=sum(pred.df$actual.bin[1:ceiling(nrow(pred.df)*x)])/sum(pred.df$actual.bin),
lift=mean(pred.df$actual.bin[1:ceiling(nrow(pred.df)*x)])/mean(pred.df$actual.bin)))
print(dec.table, digits=2)
# dual smear graph
plot.dual.smear <- ggplot(pred.df, aes(predicted, actual.bin)) +
geom_jitter(alpha=.3, position=position_jitter(height=.04,width=0)) +
stat_smooth() +
geom_abline(slope=1, intercept=0, color='grey', size=.1) +
geom_vline(xintercept=baserate, color='orange', size=.5) +
scale_x_continuous('Predicted', breaks=(0:4)/4, limits=c(0,.75), labels=sprintf('%d%%', (0:4)*25)) +
scale_y_continuous('Actual', breaks=(0:4)/4, limits=c(-.05,1.05), labels=c('No', '', '', '', 'Yes')) +
coord_equal()
# single smear graph
plot.single.smear <- ggplot(pred.df, aes(predicted, actual.bin, colour=actual)) +
geom_jitter(alpha=.3, position=position_jitter(height=.5,width=0)) +
geom_line(data=lift.df, aes(y=lift, color=NULL)) +
scale_color_manual(values=c('darkblue', 'darkred')) +
scale_x_continuous('Predicted', breaks=(0:4)/4, limits=c(0,.75), labels=sprintf('%d%%', (0:4)*25)) +
scale_y_continuous('Lift') +
geom_vline(xintercept=baserate, color='orange', size=.5) +
geom_hline(yintercept=1, color='darkgrey', size=.5)
# dual smear with cumulative reach of yesses
plot.dual.cumulative <- ggplot(pred.df, aes(predicted, cumsum.yes, colour=actual)) +
geom_jitter(alpha=.3, position=position_jitter(height=.04,width=0)) +
geom_vline(xintercept=baserate, color='orange', size=.5) +
scale_color_manual(values=c('darkblue', 'darkred')) +
scale_x_continuous('Predicted/Threshold', breaks=(0:4)/4, limits=c(0,.75), labels=sprintf('%d%%', (0:4)*25)) +
scale_y_continuous('Cumulative Proportion of Yesses Seen')
# pos/neg densities
plot.densities <- ggplot(pred.df, aes(predicted, ..count.., fill=actual)) +
stat_density(alpha=.5, position='identity') +
scale_x_continuous('Predicted', breaks=(0:4)/4, limits=c(0,1), labels=sprintf('%d%%', (0:4)*25)) +
scale_y_sqrt('Density')
@alevchuk

It it OK to use 4 lines of this (78 to 82) for my research (Master's Thesis)?

@HarlanH
Owner

Aleksandr, sure! It's in the public domain. If you feel like citing me, a footnote (not a formal citation) with my name and a link to the gist would be appreciated. What's your topic? (Email me directly -- my email address is easy to google.)

@alevchuk
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Something went wrong with that request. Please try again.