HarlanH/visualizing-categorizations.R

## visualizing-categorizations.R
# Demo of techniques to visualize the predictions made by a categorization model.

library(ROCR)
library(ggplot2)

load(url('http://dl.dropbox.com/u/7644953/classifier-visualization.Rdata'))

pred.df$actual.bin <- ifelse(pred.df$actual == 'yes', 1, 0)
pred.df <- pred.df[order(pred.df$predicted, decreasing=TRUE), ]

pred.df$cumsum <- cumsum(pred.df$actual.bin)/sum(pred.df$actual.bin)
pred.df$cumsum.yes <- with(pred.df, cumsum * actual.bin)

head(pred.df)
summary(pred.df)

baserate <- mean(pred.df$actual.bin)

# visualizations using ROCR

# convert to their object type (labels should be some sort of ordered type)
pred.rocr <- prediction(pred.df$predicted, pred.df$actual)

# some scalar summary stats
# Area Under the ROC Curve
performance(pred.rocr, 'auc')@y.values[[1]]
# Precision/Recall breakeven
performance(pred.rocr, 'prbe')@x.values[[1]]

# some ROCR graphs
# ROC Curve
plot(performance(pred.rocr, 'tpr', 'fpr'))
# Sensitivity/Specificity
plot(performance(pred.rocr, 'sens', 'spec'))
# Lift Curve
plot(performance(pred.rocr, 'lift', 'rpp'))

lift.perf <- performance(pred.rocr, 'lift', 'rpp')
lift.df <- data.frame(predicted=lift.perf@alpha.values[[1]], # alpha is cutoff
    lift=lift.perf@y.values[[1]])
lift.df <- lift.df[2:nrow(lift.df), ]

# decile table
dec.table <- ldply((1:10)/10, function(x) data.frame(
    decile=x,
    prop.yes=sum(pred.df$actual.bin[1:ceiling(nrow(pred.df)*x)])/sum(pred.df$actual.bin),
    lift=mean(pred.df$actual.bin[1:ceiling(nrow(pred.df)*x)])/mean(pred.df$actual.bin)))
print(dec.table, digits=2)

# dual smear graph
plot.dual.smear <- ggplot(pred.df, aes(predicted, actual.bin)) +
  geom_jitter(alpha=.3, position=position_jitter(height=.04,width=0)) +
  stat_smooth() +
  geom_abline(slope=1, intercept=0, color='grey', size=.1) +
  geom_vline(xintercept=baserate, color='orange', size=.5) +
  scale_x_continuous('Predicted', breaks=(0:4)/4, limits=c(0,.75), labels=sprintf('%d%%', (0:4)*25)) +
  scale_y_continuous('Actual', breaks=(0:4)/4, limits=c(-.05,1.05), labels=c('No', '', '', '', 'Yes')) +
  coord_equal()

# single smear graph
plot.single.smear <- ggplot(pred.df, aes(predicted, actual.bin, colour=actual)) +
  geom_jitter(alpha=.3, position=position_jitter(height=.5,width=0)) +
  geom_line(data=lift.df, aes(y=lift, color=NULL)) +
  scale_color_manual(values=c('darkblue', 'darkred')) +
  scale_x_continuous('Predicted', breaks=(0:4)/4, limits=c(0,.75), labels=sprintf('%d%%', (0:4)*25)) +
  scale_y_continuous('Lift') +
  geom_vline(xintercept=baserate, color='orange', size=.5) +
  geom_hline(yintercept=1, color='darkgrey', size=.5)

# dual smear with cumulative reach of yesses
plot.dual.cumulative <- ggplot(pred.df, aes(predicted, cumsum.yes, colour=actual)) +
  geom_jitter(alpha=.3, position=position_jitter(height=.04,width=0)) +
  geom_vline(xintercept=baserate, color='orange', size=.5) +
  scale_color_manual(values=c('darkblue', 'darkred')) +
  scale_x_continuous('Predicted/Threshold', breaks=(0:4)/4, limits=c(0,.75), labels=sprintf('%d%%', (0:4)*25)) +
  scale_y_continuous('Cumulative Proportion of Yesses Seen')

# pos/neg densities
plot.densities <- ggplot(pred.df, aes(predicted, ..count.., fill=actual)) +
  stat_density(alpha=.5, position='identity') +
  scale_x_continuous('Predicted', breaks=(0:4)/4, limits=c(0,1), labels=sprintf('%d%%', (0:4)*25)) +
  scale_y_sqrt('Density')
	# Demo of techniques to visualize the predictions made by a categorization model.

	library(ROCR)
	library(ggplot2)

	load(url('http://dl.dropbox.com/u/7644953/classifier-visualization.Rdata'))

	pred.df$actual.bin <- ifelse(pred.df$actual == 'yes', 1, 0)
	pred.df <- pred.df[order(pred.df$predicted, decreasing=TRUE), ]

	pred.df$cumsum <- cumsum(pred.df$actual.bin)/sum(pred.df$actual.bin)
	pred.df$cumsum.yes <- with(pred.df, cumsum * actual.bin)

	head(pred.df)
	summary(pred.df)

	baserate <- mean(pred.df$actual.bin)

	# visualizations using ROCR

	# convert to their object type (labels should be some sort of ordered type)
	pred.rocr <- prediction(pred.df$predicted, pred.df$actual)

	# some scalar summary stats
	# Area Under the ROC Curve
	performance(pred.rocr, 'auc')@y.values[[1]]
	# Precision/Recall breakeven
	performance(pred.rocr, 'prbe')@x.values[[1]]

	# some ROCR graphs
	# ROC Curve
	plot(performance(pred.rocr, 'tpr', 'fpr'))
	# Sensitivity/Specificity
	plot(performance(pred.rocr, 'sens', 'spec'))
	# Lift Curve
	plot(performance(pred.rocr, 'lift', 'rpp'))

	lift.perf <- performance(pred.rocr, 'lift', 'rpp')
	lift.df <- data.frame(predicted=lift.perf@alpha.values[[1]], # alpha is cutoff
	lift=lift.perf@y.values[[1]])
	lift.df <- lift.df[2:nrow(lift.df), ]

	# decile table
	dec.table <- ldply((1:10)/10, function(x) data.frame(
	decile=x,
	prop.yes=sum(pred.df$actual.bin[1:ceiling(nrow(pred.df)*x)])/sum(pred.df$actual.bin),
	lift=mean(pred.df$actual.bin[1:ceiling(nrow(pred.df)*x)])/mean(pred.df$actual.bin)))
	print(dec.table, digits=2)

	# dual smear graph
	plot.dual.smear <- ggplot(pred.df, aes(predicted, actual.bin)) +
	geom_jitter(alpha=.3, position=position_jitter(height=.04,width=0)) +
	stat_smooth() +
	geom_abline(slope=1, intercept=0, color='grey', size=.1) +
	geom_vline(xintercept=baserate, color='orange', size=.5) +
	scale_x_continuous('Predicted', breaks=(0:4)/4, limits=c(0,.75), labels=sprintf('%d%%', (0:4)*25)) +
	scale_y_continuous('Actual', breaks=(0:4)/4, limits=c(-.05,1.05), labels=c('No', '', '', '', 'Yes')) +
	coord_equal()

	# single smear graph
	plot.single.smear <- ggplot(pred.df, aes(predicted, actual.bin, colour=actual)) +
	geom_jitter(alpha=.3, position=position_jitter(height=.5,width=0)) +
	geom_line(data=lift.df, aes(y=lift, color=NULL)) +
	scale_color_manual(values=c('darkblue', 'darkred')) +
	scale_x_continuous('Predicted', breaks=(0:4)/4, limits=c(0,.75), labels=sprintf('%d%%', (0:4)*25)) +
	scale_y_continuous('Lift') +
	geom_vline(xintercept=baserate, color='orange', size=.5) +
	geom_hline(yintercept=1, color='darkgrey', size=.5)

	# dual smear with cumulative reach of yesses
	plot.dual.cumulative <- ggplot(pred.df, aes(predicted, cumsum.yes, colour=actual)) +
	geom_jitter(alpha=.3, position=position_jitter(height=.04,width=0)) +
	geom_vline(xintercept=baserate, color='orange', size=.5) +
	scale_color_manual(values=c('darkblue', 'darkred')) +
	scale_x_continuous('Predicted/Threshold', breaks=(0:4)/4, limits=c(0,.75), labels=sprintf('%d%%', (0:4)*25)) +
	scale_y_continuous('Cumulative Proportion of Yesses Seen')

	# pos/neg densities
	plot.densities <- ggplot(pred.df, aes(predicted, ..count.., fill=actual)) +
	stat_density(alpha=.5, position='identity') +
	scale_x_continuous('Predicted', breaks=(0:4)/4, limits=c(0,1), labels=sprintf('%d%%', (0:4)*25)) +
	scale_y_sqrt('Density')