ShixiangWang/summary_fun.R

## summary_fun.R
#' @function Summary functions for error bar plot.
#' @references http://www.jianshu.com/p/003138ac593b

## Gives count, mean, standard deviation, standard error of the mean, and confidence interval (default 95%).
##   data: a data frame.
##   measurevar: the name of a column that contains the variable to be summariezed
##   groupvars: a vector containing names of columns that contain grouping variables
##   na.rm: a boolean that indicates whether to ignore NA's
##   conf.interval: the percent range of the confidence interval (default is 95%)
summarySE <- function(data=NULL, measurevar, groupvars=NULL, na.rm=FALSE,
                      conf.interval=.95, .drop=TRUE) {
    library(plyr)

    # New version of length which can handle NA's: if na.rm==T, don't count them
    length2 <- function (x, na.rm=FALSE) {
        if (na.rm) sum(!is.na(x))
        else       length(x)
    }

    # This does the summary. For each group's data frame, return a vector with
    # N, mean, and sd
    datac <- ddply(data, groupvars, .drop=.drop,
                   .fun = function(xx, col) {
                       c(N    = length2(xx[[col]], na.rm=na.rm),
                         mean = mean   (xx[[col]], na.rm=na.rm),
                         sd   = sd     (xx[[col]], na.rm=na.rm)
                       )
                   },
                   measurevar
    )

    # Rename the "mean" column
    datac <- rename(datac, c("mean" = measurevar))

    datac$se <- datac$sd / sqrt(datac$N)  # Calculate standard error of the mean

    # Confidence interval multiplier for standard error
    # Calculate t-statistic for confidence interval:
    # e.g., if conf.interval is .95, use .975 (above/below), and use df=N-1
    ciMult <- qt(conf.interval/2 + .5, datac$N-1)
    datac$ci <- datac$se * ciMult

    return(datac)
}

## Norms the data within specified groups in a data frame; it normalizes each
## subject (identified by idvar) so that they have the same mean, within each group
## specified by betweenvars.
##   data: a data frame.
##   idvar: the name of a column that identifies each subject (or matched subjects)
##   measurevar: the name of a column that contains the variable to be summariezed
##   betweenvars: a vector containing names of columns that are between-subjects variables
##   na.rm: a boolean that indicates whether to ignore NA's
normDataWithin <- function(data=NULL, idvar, measurevar, betweenvars=NULL,
                           na.rm=FALSE, .drop=TRUE) {
    library(plyr)

    # Measure var on left, idvar + between vars on right of formula.
    data.subjMean <- ddply(data, c(idvar, betweenvars), .drop=.drop,
                           .fun = function(xx, col, na.rm) {
                               c(subjMean = mean(xx[,col], na.rm=na.rm))
                           },
                           measurevar,
                           na.rm
    )

    # Put the subject means with original data
    data <- merge(data, data.subjMean)

    # Get the normalized data in a new column
    measureNormedVar <- paste(measurevar, "_norm", sep="")
    data[,measureNormedVar] <- data[,measurevar] - data[,"subjMean"] +
        mean(data[,measurevar], na.rm=na.rm)

    # Remove this subject mean column
    data$subjMean <- NULL

    return(data)
}

## Summarizes data, handling within-subjects variables by removing inter-subject variability.
## It will still work if there are no within-S variables.
## Gives count, un-normed mean, normed mean (with same between-group mean),
##   standard deviation, standard error of the mean, and confidence interval.
## If there are within-subject variables, calculate adjusted values using method from Morey (2008).
##   data: a data frame.
##   measurevar: the name of a column that contains the variable to be summariezed
##   betweenvars: a vector containing names of columns that are between-subjects variables
##   withinvars: a vector containing names of columns that are within-subjects variables
##   idvar: the name of a column that identifies each subject (or matched subjects)
##   na.rm: a boolean that indicates whether to ignore NA's
##   conf.interval: the percent range of the confidence interval (default is 95%)
summarySEwithin <- function(data=NULL, measurevar, betweenvars=NULL, withinvars=NULL,
                            idvar=NULL, na.rm=FALSE, conf.interval=.95, .drop=TRUE) {

    # Ensure that the betweenvars and withinvars are factors
    factorvars <- vapply(data[, c(betweenvars, withinvars), drop=FALSE],
                         FUN=is.factor, FUN.VALUE=logical(1))

    if (!all(factorvars)) {
        nonfactorvars <- names(factorvars)[!factorvars]
        message("Automatically converting the following non-factors to factors: ",
                paste(nonfactorvars, collapse = ", "))
        data[nonfactorvars] <- lapply(data[nonfactorvars], factor)
    }

    # Get the means from the un-normed data
    datac <- summarySE(data, measurevar, groupvars=c(betweenvars, withinvars),
                       na.rm=na.rm, conf.interval=conf.interval, .drop=.drop)

    # Drop all the unused columns (these will be calculated with normed data)
    datac$sd <- NULL
    datac$se <- NULL
    datac$ci <- NULL

    # Norm each subject's data
    ndata <- normDataWithin(data, idvar, measurevar, betweenvars, na.rm, .drop=.drop)

    # This is the name of the new column
    measurevar_n <- paste(measurevar, "_norm", sep="")

    # Collapse the normed data - now we can treat between and within vars the same
    ndatac <- summarySE(ndata, measurevar_n, groupvars=c(betweenvars, withinvars),
                        na.rm=na.rm, conf.interval=conf.interval, .drop=.drop)

    # Apply correction from Morey (2008) to the standard error and confidence interval
    #  Get the product of the number of conditions of within-S variables
    nWithinGroups    <- prod(vapply(ndatac[,withinvars, drop=FALSE], FUN=nlevels,
                                    FUN.VALUE=numeric(1)))
    correctionFactor <- sqrt( nWithinGroups / (nWithinGroups-1) )

    # Apply the correction factor
    ndatac$sd <- ndatac$sd * correctionFactor
    ndatac$se <- ndatac$se * correctionFactor
    ndatac$ci <- ndatac$ci * correctionFactor

    # Combine the un-normed means with the normed results
    merge(datac, ndatac)
}
	#' @function Summary functions for error bar plot.
	#' @references http://www.jianshu.com/p/003138ac593b

	## Gives count, mean, standard deviation, standard error of the mean, and confidence interval (default 95%).
	## data: a data frame.
	## measurevar: the name of a column that contains the variable to be summariezed
	## groupvars: a vector containing names of columns that contain grouping variables
	## na.rm: a boolean that indicates whether to ignore NA's
	## conf.interval: the percent range of the confidence interval (default is 95%)
	summarySE <- function(data=NULL, measurevar, groupvars=NULL, na.rm=FALSE,
	conf.interval=.95, .drop=TRUE) {
	library(plyr)

	# New version of length which can handle NA's: if na.rm==T, don't count them
	length2 <- function (x, na.rm=FALSE) {
	if (na.rm) sum(!is.na(x))
	else length(x)
	}

	# This does the summary. For each group's data frame, return a vector with
	# N, mean, and sd
	datac <- ddply(data, groupvars, .drop=.drop,
	.fun = function(xx, col) {
	c(N = length2(xx[[col]], na.rm=na.rm),
	mean = mean (xx[[col]], na.rm=na.rm),
	sd = sd (xx[[col]], na.rm=na.rm)
	)
	},
	measurevar
	)

	# Rename the "mean" column
	datac <- rename(datac, c("mean" = measurevar))

	datac$se <- datac$sd / sqrt(datac$N) # Calculate standard error of the mean

	# Confidence interval multiplier for standard error
	# Calculate t-statistic for confidence interval:
	# e.g., if conf.interval is .95, use .975 (above/below), and use df=N-1
	ciMult <- qt(conf.interval/2 + .5, datac$N-1)
	datac$ci <- datac$se * ciMult

	return(datac)
	}

	## Norms the data within specified groups in a data frame; it normalizes each
	## subject (identified by idvar) so that they have the same mean, within each group
	## specified by betweenvars.
	## data: a data frame.
	## idvar: the name of a column that identifies each subject (or matched subjects)
	## measurevar: the name of a column that contains the variable to be summariezed
	## betweenvars: a vector containing names of columns that are between-subjects variables
	## na.rm: a boolean that indicates whether to ignore NA's
	normDataWithin <- function(data=NULL, idvar, measurevar, betweenvars=NULL,
	na.rm=FALSE, .drop=TRUE) {
	library(plyr)

	# Measure var on left, idvar + between vars on right of formula.
	data.subjMean <- ddply(data, c(idvar, betweenvars), .drop=.drop,
	.fun = function(xx, col, na.rm) {
	c(subjMean = mean(xx[,col], na.rm=na.rm))
	},
	measurevar,
	na.rm
	)

	# Put the subject means with original data
	data <- merge(data, data.subjMean)

	# Get the normalized data in a new column
	measureNormedVar <- paste(measurevar, "_norm", sep="")
	data[,measureNormedVar] <- data[,measurevar] - data[,"subjMean"] +
	mean(data[,measurevar], na.rm=na.rm)

	# Remove this subject mean column
	data$subjMean <- NULL

	return(data)
	}

	## Summarizes data, handling within-subjects variables by removing inter-subject variability.
	## It will still work if there are no within-S variables.
	## Gives count, un-normed mean, normed mean (with same between-group mean),
	## standard deviation, standard error of the mean, and confidence interval.
	## If there are within-subject variables, calculate adjusted values using method from Morey (2008).
	## data: a data frame.
	## measurevar: the name of a column that contains the variable to be summariezed
	## betweenvars: a vector containing names of columns that are between-subjects variables
	## withinvars: a vector containing names of columns that are within-subjects variables
	## idvar: the name of a column that identifies each subject (or matched subjects)
	## na.rm: a boolean that indicates whether to ignore NA's
	## conf.interval: the percent range of the confidence interval (default is 95%)
	summarySEwithin <- function(data=NULL, measurevar, betweenvars=NULL, withinvars=NULL,
	idvar=NULL, na.rm=FALSE, conf.interval=.95, .drop=TRUE) {

	# Ensure that the betweenvars and withinvars are factors
	factorvars <- vapply(data[, c(betweenvars, withinvars), drop=FALSE],
	FUN=is.factor, FUN.VALUE=logical(1))

	if (!all(factorvars)) {
	nonfactorvars <- names(factorvars)[!factorvars]
	message("Automatically converting the following non-factors to factors: ",
	paste(nonfactorvars, collapse = ", "))
	data[nonfactorvars] <- lapply(data[nonfactorvars], factor)
	}

	# Get the means from the un-normed data
	datac <- summarySE(data, measurevar, groupvars=c(betweenvars, withinvars),
	na.rm=na.rm, conf.interval=conf.interval, .drop=.drop)

	# Drop all the unused columns (these will be calculated with normed data)
	datac$sd <- NULL
	datac$se <- NULL
	datac$ci <- NULL

	# Norm each subject's data
	ndata <- normDataWithin(data, idvar, measurevar, betweenvars, na.rm, .drop=.drop)

	# This is the name of the new column
	measurevar_n <- paste(measurevar, "_norm", sep="")

	# Collapse the normed data - now we can treat between and within vars the same
	ndatac <- summarySE(ndata, measurevar_n, groupvars=c(betweenvars, withinvars),
	na.rm=na.rm, conf.interval=conf.interval, .drop=.drop)

	# Apply correction from Morey (2008) to the standard error and confidence interval
	# Get the product of the number of conditions of within-S variables
	nWithinGroups <- prod(vapply(ndatac[,withinvars, drop=FALSE], FUN=nlevels,
	FUN.VALUE=numeric(1)))
	correctionFactor <- sqrt( nWithinGroups / (nWithinGroups-1) )

	# Apply the correction factor
	ndatac$sd <- ndatac$sd * correctionFactor
	ndatac$se <- ndatac$se * correctionFactor
	ndatac$ci <- ndatac$ci * correctionFactor

	# Combine the un-normed means with the normed results
	merge(datac, ndatac)
	}