Created
June 27, 2017 14:15
-
-
Save aaiezza/1df71151875536de051a1b9628c5cb1c to your computer and use it in GitHub Desktop.
ggplot2 plotting function for a Normal Distribution with lightly shaded and labeled standard deviations from the mean. User may also give lower and upper bounds, which adds shading under the curve between those points and a label with the area under the curve. (Made to accommodate for Mathematical Statistics with Applications 7th Ed. - WMS 2008)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## | |
# Standard function for printing Normal Distribution in ggplot2 | |
# # # # | |
plotNormCurve <- function( mean = 0, sd = 1, variance = sd ^ 2, | |
lowerBound = NULL, upperBound = NULL, | |
sdRange = 3.5, | |
from = mean - (sdRange * sd), | |
to = mean + (sdRange * sd), | |
plotTitle = 'Normal Curve', xlab = 'x', ylab = 'density' | |
) | |
{ | |
sd <- sqrt( variance ) | |
# This calculation protects us from an altered x-range | |
lowerSD <- (mean - from) / sd | |
upperSD <- ( to - mean) / sd | |
normPlot <- ggplot() + aes( x = c( from, to ) ) + | |
#### | |
# Figuring out what the x scale should be | |
scale_x_continuous( breaks = function(li) | |
{ | |
br <- seq( ceiling(li[1]), li[2], ifelse( log(sd) >= 0, 1, abs(log(-log(sd))) ) ) | |
if ( length( br ) > 30 || log(sd) < 0 ) | |
{ | |
numOfBreaks <- ceiling( min( sdRange + 6, 10 ) ) | |
# cat( sprintf( 'numOfBreaks = %d\n', numOfBreaks ) ) | |
ltmean <- seq( li[1], mean, length.out = numOfBreaks%/%2 ) | |
# cat( sprintf( 'lb = %f, ltmean = %s\n', li[1], paste( ltmean, collapse = ', ' ) ) ) | |
gtmean <- seq( mean, li[2], length.out = numOfBreaks%/%2 ) | |
# cat( sprintf( 'ub = %f, gtmean = %s\n', li[2], paste( gtmean, collapse = ', ' ) ) ) | |
br <- c( ltmean[-length(ltmean)], | |
mean, | |
gtmean[-1] ) | |
} | |
roundDigits <- ceiling(max(2,ifelse( sd == 1 || log(sd) >= 0, 0, abs(log(-log(sd)))*2 ) ) ) | |
# cat( sprintf( 'roundDigits = %d\n', roundDigits ) ) | |
return( round( br, roundDigits ) ) | |
} ) + | |
#### | |
# Title and axis titles | |
labs( title = plotTitle, x = xlab, y = ylab ) + | |
theme_bw() + | |
theme( plot.title = element_text( hjust = 0.5 ), | |
panel.grid.major = element_line( size = 1.0 ), | |
axis.ticks.length = unit( 5, 'pt' ), | |
plot.margin = unit( c( 2, 2, 2, 2 ), 'lines'))#, | |
#aspect.ratio = 1/φ ) | |
#### | |
# Shade every standard deviations from the mean | |
lowerSDOut <- -floor( lowerSD ) | |
upperSDOut <- floor( upperSD ) | |
sdLi <- mean + sd * lowerSDOut:upperSDOut | |
sdL <- data.frame( | |
x = sdLi, | |
xend = sdLi, | |
y = 0, | |
yend = dnorm(sdLi,mean,sd) | |
) | |
shadeBounds <- rbind( c( from, sdL$x ), c( sdL$x, to ) ) | |
## TODO don't assume symmetry! Numbers should be based on area's distance from the mean! | |
sdLi <- c( 1:ceiling(ncol(shadeBounds)/2), | |
(ceiling(ncol(shadeBounds)/2) - | |
ifelse( ncol(shadeBounds)%%2 == 1, 1, 0 ) ):1 ) | |
shadeBounds <- rbind( shadeBounds, sdLi ) | |
rownames( shadeBounds ) <- NULL | |
shades <- apply( shadeBounds, 2, function( x, i ) | |
{ | |
areax <- seq( x[1], x[2], length.out = 100 ) | |
data.frame( x = areax, ymin = 0, ymax = dnorm( areax, mean, sd ), id = x[3] ) | |
} ) | |
for ( shade in shades ) | |
{ | |
normPlot <- normPlot + geom_ribbon( data = shade, aes( | |
x = x, ymin = ymin, ymax = ymax, alpha = factor(id) ), | |
fill = 'lightgrey', color = NA, | |
show.legend = FALSE ) | |
} | |
##### | |
normPlot <- normPlot + scale_alpha_manual( name = NULL, | |
breaks = levels( factor( sdLi ) ), | |
values = ( as.double( levels( factor( sdLi ) ) ) * 0.185 )^2 ) | |
##### | |
normPlot <- normPlot + geom_segment( data = sdL, aes( | |
x = x, y = y, xend = xend, yend = yend ), | |
linetype = 'dotted', size = 0.3 ) | |
area <- list() | |
#### | |
# User-defined shading under the curve | |
if ( !missing( lowerBound ) && !missing( upperBound ) ) | |
{ | |
for ( i in 1:length(lowerBound) ) | |
{ | |
xmin <- max( lowerBound[i], from ) | |
xmax <- min( upperBound[i], to ) | |
areax <- seq( xmin, xmax, length.out = 100 ) | |
area[[i]] <- data.frame( | |
x = areax, | |
ymin = 0, | |
ymax = dnorm( areax, mean = mean, sd = sd ) ) | |
getLabel <- function( x ) | |
{ | |
sapply( x, function( j ) | |
{ | |
if ( lowerBound[i] < xmin && j == xmin ) | |
return( lowerBound[i] ) | |
else if ( upperBound[i] > xmax && j == xmax) | |
return( upperBound[i] ) | |
else | |
return( j ) | |
} ) | |
} | |
# User-defined shading and dashed lines at cutoffs | |
normPlot <- normPlot + geom_ribbon( data = area[[i]], aes( | |
x = x, ymin = ymin, ymax = ymax ), | |
fill = 'lightblue', alpha = 0.7 ) + | |
geom_segment( data = area[[i]][c(1,nrow(area[[i]])),], | |
aes( x = x, y = ymin, xend = x, yend = ymax ), | |
linetype = '42', size = 0.4 ) | |
} | |
} | |
#### | |
# The actual normal Curve | |
normPlot <- normPlot + stat_function( fun = dnorm, | |
args = list( mean = mean, sd = sd ) ) + | |
#### | |
# Dashed line for the average | |
geom_segment( aes( | |
x = mean, y = 0, xend = mean, yend = dnorm(mean,mean,sd) ), | |
linetype = '82', size = 0.55 ) | |
#### | |
# Label every standard deviations from the mean | |
normPlot <- normPlot + geom_label( data = data.frame( | |
x = sdL$x, y = sdL$yend ), | |
aes( x = x, y = y, label = x ), label.padding = unit(0.15, "lines"), | |
vjust = 0.0, size = 2.6, alpha = 0.8, hjust = 'outward', | |
label.size = 0.25 ) | |
if ( !missing( lowerBound ) && !missing( upperBound ) ) | |
{ | |
for ( i in 1:length(lowerBound) ) | |
{ | |
# The labels on the User-defined bounds for shaded region | |
normPlot <- normPlot + geom_label_repel( | |
data = area[[i]][c(1,nrow(area[[i]])),], | |
aes( x = x, y = ymax*7/8, label = getLabel( x ) ), | |
fontface = 'italic', color = 'black', fill = 'lightgrey', | |
show.legend = FALSE, | |
size = 2.6, force = 0, | |
label.size = 0.25, | |
segment.size = 0.3, | |
arrow = arrow(25, unit(0.01,'npc')), | |
box.padding = unit( 0.02, 'npc' ), | |
point.padding = unit( 0.6, 'lines' ), | |
nudge_x = 0.1 | |
) | |
# The label of probability under the curve | |
pLabel.x <- meanAB( mean, sd, lowerBound[i], upperBound[i] ) | |
pLabel.y <- dnorm( pLabel.x, mean, sd ) | |
normPlot <- normPlot + geom_label_repel( | |
data = data.frame( | |
x = pLabel.x, | |
y = pLabel.y / 2, | |
label = sprintf( "P = %.4f\n( %.3f )", | |
pnormAB( mean, sd, lowerBound[i], upperBound[i] ), pLabel.x ) ), | |
aes( x = x, y = y, label = label ), | |
fontface = 'italic', color = 'black', fill = 'lightgrey', | |
show.legend = FALSE, | |
size = 2.5, force = 2, | |
label.size = 0.25, | |
segment.size = 0.3, | |
arrow = arrow(25, unit(0.01,'npc')), | |
box.padding = unit( 0.02, 'npc' ), | |
point.padding = unit( 0.6, 'lines' ) | |
) | |
} | |
} | |
return( normPlot ) | |
} | |
## | |
# Calculates the Expected value of a Truncated Normal Distribution | |
# # # # | |
expectedValueOfNormalXbetweenAandB <- meanAB <- function( mean, sd, a, b ) | |
{ | |
µ <- mean | |
σ <- sd | |
µ + ( σ * | |
( ( exp( -(((a-µ)^2)/(2*(σ^2))) ) - exp( -(((b-µ)^2)/(2*(σ^2))) ) ) / | |
( sqrt(pi/2) * | |
( erf( (b-µ) / (sqrt(2)*σ) ) | |
- erf( (a-µ) / (sqrt(2)*σ) ) | |
) | |
) | |
) | |
) | |
} | |
## | |
# Calculates area under a normal curve from lowerBound to upperBound | |
# # # # | |
pnormAB <- function( mean, sd, lowerBound, upperBound ) | |
{ | |
return( | |
pnorm( upperBound, mean, sd ) - | |
pnorm( lowerBound, mean, sd ) ) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment