leeper/slopegraph.r

## slopegraph.r
# R Function to Draw Edward Tufte-style Slopeplots

slopegraph <-
function(
df,
xlim = c(.5,ncol(df)+.5),
ylim = c(min(df)-diff(range(df))/100,max(df)+diff(range(df))/100),
main = NULL,
bty = 'n',
yaxt = 'n',
xaxt = 'n',
xlab = '',
ylab = '',
add.before = NULL, # an expression to add something between drawing the blank canvas and adding the plot content (i.e., behind the slopegraph)
add.after = NULL, # an expression to add something after adding the plot content
labels = names(df),
labpos.left = 2,
labpos.right = 4,
col.lines = par('fg'),
col.lab = par('fg'),
col.num = par('fg'),
col.xaxt = par('fg'),
offset.x = .1, # THIS DOESN'T SEEM TO WORK???
offset.lab = .1,
binval = 1.5, # threshold at which to force binning of labels and values (multiplier of the height of an "m")
cex.lab = 1,
cex.num = 1,
font.lab = 1,
font.num = 1,
lty = par("lty"),
lwd = par("lwd"),
mai = NULL,
...)
{
    if(ncol(df) < 2)
        stop('`df` must have at least two columns')
    # draw margins
    if(is.null(mai))
        par(mai=c(1, 0, if(is.null(main)) 0 else 1, 0))
    else
        par(mai=mai)

    plot(NA, y=NULL, xlim=xlim, ylim=ylim, main=main,
         bty=bty, yaxt=yaxt, xaxt=xaxt, xlab=xlab, ylab=ylab, ...)
    # optional expression
    if(!is.null(add.before))
        eval(add.before)

    # x-axis
    axis(1, 1:ncol(df), labels=labels, col=col.xaxt, col.ticks=col.xaxt)

    # height and width of 'm' on plotting device
    h <- strheight('m')
    w <- strwidth('m')

    # function for finding consecutive indices
    # from: http://stackoverflow.com/a/16118320/2338862
    seqle <- function(x,incr=1) {
        if(!is.numeric(x)) x <- as.numeric(x)
        n <- length(x)
        y <- x[-1L] != x[-n] + incr
        i <- c(which(y|is.na(y)),n)
        list(lengths = diff(c(0L,i)),
             values = x[head(c(0L,i)+1L,-1L)])
    }

    overlaps <- function(coldf, cat='rownames'){
        # conditionally remove exactly duplicated values
        if(any(duplicated(coldf[,1]))){
            u <- unique(coldf[,1])
            out <- cbind.data.frame(t(sapply(u, function(i)
                c(paste(rownames(coldf)[coldf[,1]==i],collapse='\n'),i))))
            rownames(out) <- out[,1]
            out[,1] <- NULL
            names(out) <- names(coldf)
            out[,1] <- as.numeric(as.character(out[,1]))
            coldf <- out[order(out[,1]),,drop=FALSE]
        }
        # function to fix overlaps
        overlaps <- which(abs(diff(coldf[,1]))<(binval*h))
        if(length(overlaps)){
            runs <- seqle(overlaps) # use seqle function
            overlaps2 <- mapply(function(i,j) seq(i,length.out=j+1), runs$values, runs$lengths)
            oldlabs <- coldf[-unique(c(overlaps,overlaps+1)),,drop=FALSE]
            newlabs <- data.frame(sapply(overlaps2, function(i) mean(coldf[i,1])))
            names(newlabs) <- names(coldf)
            if(cat=='rownames'){
                rownames(newlabs) <-
                    sapply(overlaps2, function(i) paste(rownames(coldf)[rev(i)],collapse='\n'))
            } else if(cat=='values'){
                rownames(oldlabs) <- oldlabs[,1]
                rownames(newlabs) <-
                    sapply(overlaps2, function(i) paste(as.character(coldf[rev(i),1]),collapse='\n'))
            }
            return(rbind(oldlabs,newlabs))
        } else
            return(coldf)
    }

    # left-side labels
    l <- overlaps(df[order(df[,1]),1,drop=FALSE])
    text(1-offset.lab, l[,1],
         col=col.lab, rownames(l), pos=labpos.left, cex=cex.lab, font=font.lab)

    # right-side labels
    r <- overlaps(df[order(df[,ncol(df)]),ncol(df),drop=FALSE])
    text(ncol(df)+offset.lab, r[,1],
         col=col.lab, rownames(r), pos=labpos.right, cex=cex.lab, font=font.lab)

    # numeric value labels
    valslist <- lapply(seq_along(df), function(i) overlaps(df[order(df[,i]),i,drop=FALSE], cat='values'))
    for(i in 1:length(valslist)){
        text(rep(i,nrow(valslist[[i]])), valslist[[i]][,1], rownames(valslist[[i]]),
            col=col.num, cex=cex.num, font=font.num)
    }

    # draw lines
    offset.x <- .1 # small offset for `segments`
    col.lines <- rep(col.lines, length.out=nrow(df))
    lty <- rep(lty, length.out=nrow(df))
    lwd <- rep(lwd, length.out=nrow(df))
    for(i in 1:nrow(df)){
        mapply(function(x1,y1,x2,y2,...){
            ysloped <- (y2-y1)*offset.x
            segments(x1+offset.x, if(y1==y2) y1 else (y1+ysloped),
                     x2-offset.x, if(y1==y2) y2 else (y2-ysloped),
                     col=col.lines[i],
                     lty=lty[i],
                     lwd=lwd[i]
                    )},
               1:(length(df[i,])-1), # x1-positions
               df[i,][-length(df[i,])], # y1-positions
               2:(length(df[i,])), # x2-positions
               df[i,][-1] # y2-positions
               )
    }

    # optional expression
    if(!is.null(add.after))
        eval(add.after)

    # return invisibly
    invisible(NULL)
}


# EXAMPLES

## Tufte's Cancer Graph (to the correct scale)
cancer <- read.csv('tufte-cancer-survival-data.csv')
rownames(cancer) <- cancer[,1]
cancer <- cancer [,-1]
pdf('tufte-cancer-survival-plot.pdf',height=16, width=12, family='Palatino')
slopegraph(cancer, col.line='gray', xlim=c(-.5,5.5), labels=c('5 Year','10 Year','15 Year','20 Year'), binval=2.5)
dev.off()


## Tufte's GNP Graph
gnp <- read.csv('tufte-gnp-data.csv')
gnp[,3] <- NULL
pdf('tufte-gnp-plot.pdf',height=12, width=8, family='Palatino')
slopegraph(gnp, col.line='gray', labels=c('1970','1979'),
    main='Current Receipts of Goverment as a Percentage of Gross Domestic Product', binval=3.75)
dev.off()

## tufte-cancer-survival-data.csv

          
            Year 5
            Year 10
            Year 15
            Year 20

            
              Prostate
              99.0
              95.0
              87.0
              81.0

            
              Thyroid
              96.0
              96.0
              94.0
              95.0

            
              Testis
              95.0
              94.0
              91.0
              88.0

            
              Melanomas
              89.0
              87.0
              84.0
              83.0

            
              Breast
              86.0
              78.0
              71.0
              75.0

            
              Hodgkin's
              85.0
              80.0
              74.0
              67.0

            
              Uterus
              84.0
              83.0
              81.0
              79.0

            
              Urinary
              82.0
              76.0
              70.0
              68.0

            
              Cervix
              71.0
              64.0
              63.0
              60.0

            
              Larynx
              69.0
              57.0
              46.0
              38.0

            
              Rectum
              63.0
              55.0
              52.0
              49.0

            
              Kidney
              62.0
              54.0
              50.0
              47.0

            
              Colon
              62.0
              55.0
              54.0
              52.0

            
              Non-Hodgkin's
              58.0
              46.0
              38.0
              34.0

            
              Oral
              57.0
              46.0
              38.0
              33.0

            
              Ovary
              55.0
              49.0
              50.0
              50.0

            
              Leukemia
              43.0
              32.0
              30.0
              26.0

            
              Brain
              32.0
              29.0
              28.0
              26.0

            
              Multiple myeloma
              30.0
              13.0
              7.0
              5.0

            
              Stomach
              24.0
              19.0
              19.0
              15.0

            
              Lung
              15.0
              11.0
              7.0
              6.0

            
              Esophagus
              14.0
              8.0
              8.0
              5.0

            
              Liver
              8.0
              6.0
              6.0
              8.0

            
              Pancreas
              4.0
              3.0
              3.0
              3.0

## tufte-gnp-data.csv
Year1970,Year1979,
Sweden,46.9,57.4
Netherlands,44.0,55.8
Norway,43.5,52.2,
Britain,40.7,39.0
France,39.0,43.4
Germany,37.5,42.9
Belgium,35.2,43.2
Canada,35.2,35.8
Finland,34.9,38.2
Italy,30.4,35.7
US,30.3,32.5
Greece,26.8,30.6
Switzerland,26.5,33.2
Spain,22.5,27.1
Japan,20.7,26.6
	# R Function to Draw Edward Tufte-style Slopeplots

	slopegraph <-
	function(
	df,
	xlim = c(.5,ncol(df)+.5),
	ylim = c(min(df)-diff(range(df))/100,max(df)+diff(range(df))/100),
	main = NULL,
	bty = 'n',
	yaxt = 'n',
	xaxt = 'n',
	xlab = '',
	ylab = '',
	add.before = NULL, # an expression to add something between drawing the blank canvas and adding the plot content (i.e., behind the slopegraph)
	add.after = NULL, # an expression to add something after adding the plot content
	labels = names(df),
	labpos.left = 2,
	labpos.right = 4,
	col.lines = par('fg'),
	col.lab = par('fg'),
	col.num = par('fg'),
	col.xaxt = par('fg'),
	offset.x = .1, # THIS DOESN'T SEEM TO WORK???
	offset.lab = .1,
	binval = 1.5, # threshold at which to force binning of labels and values (multiplier of the height of an "m")
	cex.lab = 1,
	cex.num = 1,
	font.lab = 1,
	font.num = 1,
	lty = par("lty"),
	lwd = par("lwd"),
	mai = NULL,
	...)
	{
	if(ncol(df) < 2)
	stop('`df` must have at least two columns')
	# draw margins
	if(is.null(mai))
	par(mai=c(1, 0, if(is.null(main)) 0 else 1, 0))
	else
	par(mai=mai)

	plot(NA, y=NULL, xlim=xlim, ylim=ylim, main=main,
	bty=bty, yaxt=yaxt, xaxt=xaxt, xlab=xlab, ylab=ylab, ...)
	# optional expression
	if(!is.null(add.before))
	eval(add.before)

	# x-axis
	axis(1, 1:ncol(df), labels=labels, col=col.xaxt, col.ticks=col.xaxt)

	# height and width of 'm' on plotting device
	h <- strheight('m')
	w <- strwidth('m')

	# function for finding consecutive indices
	# from: http://stackoverflow.com/a/16118320/2338862
	seqle <- function(x,incr=1) {
	if(!is.numeric(x)) x <- as.numeric(x)
	n <- length(x)
	y <- x[-1L] != x[-n] + incr
	i <- c(which(y\|is.na(y)),n)
	list(lengths = diff(c(0L,i)),
	values = x[head(c(0L,i)+1L,-1L)])
	}

	overlaps <- function(coldf, cat='rownames'){
	# conditionally remove exactly duplicated values
	if(any(duplicated(coldf[,1]))){
	u <- unique(coldf[,1])
	out <- cbind.data.frame(t(sapply(u, function(i)
	c(paste(rownames(coldf)[coldf[,1]==i],collapse='\n'),i))))
	rownames(out) <- out[,1]
	out[,1] <- NULL
	names(out) <- names(coldf)
	out[,1] <- as.numeric(as.character(out[,1]))
	coldf <- out[order(out[,1]),,drop=FALSE]
	}
	# function to fix overlaps
	overlaps <- which(abs(diff(coldf[,1]))<(binval*h))
	if(length(overlaps)){
	runs <- seqle(overlaps) # use seqle function
	overlaps2 <- mapply(function(i,j) seq(i,length.out=j+1), runs$values, runs$lengths)
	oldlabs <- coldf[-unique(c(overlaps,overlaps+1)),,drop=FALSE]
	newlabs <- data.frame(sapply(overlaps2, function(i) mean(coldf[i,1])))
	names(newlabs) <- names(coldf)
	if(cat=='rownames'){
	rownames(newlabs) <-
	sapply(overlaps2, function(i) paste(rownames(coldf)[rev(i)],collapse='\n'))
	} else if(cat=='values'){
	rownames(oldlabs) <- oldlabs[,1]
	rownames(newlabs) <-
	sapply(overlaps2, function(i) paste(as.character(coldf[rev(i),1]),collapse='\n'))
	}
	return(rbind(oldlabs,newlabs))
	} else
	return(coldf)
	}

	# left-side labels
	l <- overlaps(df[order(df[,1]),1,drop=FALSE])
	text(1-offset.lab, l[,1],
	col=col.lab, rownames(l), pos=labpos.left, cex=cex.lab, font=font.lab)

	# right-side labels
	r <- overlaps(df[order(df[,ncol(df)]),ncol(df),drop=FALSE])
	text(ncol(df)+offset.lab, r[,1],
	col=col.lab, rownames(r), pos=labpos.right, cex=cex.lab, font=font.lab)

	# numeric value labels
	valslist <- lapply(seq_along(df), function(i) overlaps(df[order(df[,i]),i,drop=FALSE], cat='values'))
	for(i in 1:length(valslist)){
	text(rep(i,nrow(valslist[[i]])), valslist[[i]][,1], rownames(valslist[[i]]),
	col=col.num, cex=cex.num, font=font.num)
	}

	# draw lines
	offset.x <- .1 # small offset for `segments`
	col.lines <- rep(col.lines, length.out=nrow(df))
	lty <- rep(lty, length.out=nrow(df))
	lwd <- rep(lwd, length.out=nrow(df))
	for(i in 1:nrow(df)){
	mapply(function(x1,y1,x2,y2,...){
	ysloped <- (y2-y1)*offset.x
	segments(x1+offset.x, if(y1==y2) y1 else (y1+ysloped),
	x2-offset.x, if(y1==y2) y2 else (y2-ysloped),
	col=col.lines[i],
	lty=lty[i],
	lwd=lwd[i]
	)},
	1:(length(df[i,])-1), # x1-positions
	df[i,][-length(df[i,])], # y1-positions
	2:(length(df[i,])), # x2-positions
	df[i,][-1] # y2-positions
	)
	}

	# optional expression
	if(!is.null(add.after))
	eval(add.after)

	# return invisibly
	invisible(NULL)
	}


	# EXAMPLES

	## Tufte's Cancer Graph (to the correct scale)
	cancer <- read.csv('tufte-cancer-survival-data.csv')
	rownames(cancer) <- cancer[,1]
	cancer <- cancer [,-1]
	pdf('tufte-cancer-survival-plot.pdf',height=16, width=12, family='Palatino')
	slopegraph(cancer, col.line='gray', xlim=c(-.5,5.5), labels=c('5 Year','10 Year','15 Year','20 Year'), binval=2.5)
	dev.off()


	## Tufte's GNP Graph
	gnp <- read.csv('tufte-gnp-data.csv')
	gnp[,3] <- NULL
	pdf('tufte-gnp-plot.pdf',height=12, width=8, family='Palatino')
	slopegraph(gnp, col.line='gray', labels=c('1970','1979'),
	main='Current Receipts of Goverment as a Percentage of Gross Domestic Product', binval=3.75)
	dev.off()
	Year 5	Year 10	Year 15	Year 20
Prostate	99.0	95.0	87.0	81.0
Thyroid	96.0	96.0	94.0	95.0
Testis	95.0	94.0	91.0	88.0
Melanomas	89.0	87.0	84.0	83.0
Breast	86.0	78.0	71.0	75.0
Hodgkin's	85.0	80.0	74.0	67.0
Uterus	84.0	83.0	81.0	79.0
Urinary	82.0	76.0	70.0	68.0
Cervix	71.0	64.0	63.0	60.0
Larynx	69.0	57.0	46.0	38.0
Rectum	63.0	55.0	52.0	49.0
Kidney	62.0	54.0	50.0	47.0
Colon	62.0	55.0	54.0	52.0
Non-Hodgkin's	58.0	46.0	38.0	34.0
Oral	57.0	46.0	38.0	33.0
Ovary	55.0	49.0	50.0	50.0
Leukemia	43.0	32.0	30.0	26.0
Brain	32.0	29.0	28.0	26.0
Multiple myeloma	30.0	13.0	7.0	5.0
Stomach	24.0	19.0	19.0	15.0
Lung	15.0	11.0	7.0	6.0
Esophagus	14.0	8.0	8.0	5.0
Liver	8.0	6.0	6.0	8.0
Pancreas	4.0	3.0	3.0	3.0
	Year1970,Year1979,
	Sweden,46.9,57.4
	Netherlands,44.0,55.8
	Norway,43.5,52.2,
	Britain,40.7,39.0
	France,39.0,43.4
	Germany,37.5,42.9
	Belgium,35.2,43.2
	Canada,35.2,35.8
	Finland,34.9,38.2
	Italy,30.4,35.7
	US,30.3,32.5
	Greece,26.8,30.6
	Switzerland,26.5,33.2
	Spain,22.5,27.1
	Japan,20.7,26.6