Created
December 10, 2017 14:59
-
-
Save ShixiangWang/1a3ba38fc8a769d83aed27bad2200911 to your computer and use it in GitHub Desktop.
数据总结函数
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#' @function Summary functions for error bar plot. | |
#' @references http://www.jianshu.com/p/003138ac593b | |
## Gives count, mean, standard deviation, standard error of the mean, and confidence interval (default 95%). | |
## data: a data frame. | |
## measurevar: the name of a column that contains the variable to be summariezed | |
## groupvars: a vector containing names of columns that contain grouping variables | |
## na.rm: a boolean that indicates whether to ignore NA's | |
## conf.interval: the percent range of the confidence interval (default is 95%) | |
summarySE <- function(data=NULL, measurevar, groupvars=NULL, na.rm=FALSE, | |
conf.interval=.95, .drop=TRUE) { | |
library(plyr) | |
# New version of length which can handle NA's: if na.rm==T, don't count them | |
length2 <- function (x, na.rm=FALSE) { | |
if (na.rm) sum(!is.na(x)) | |
else length(x) | |
} | |
# This does the summary. For each group's data frame, return a vector with | |
# N, mean, and sd | |
datac <- ddply(data, groupvars, .drop=.drop, | |
.fun = function(xx, col) { | |
c(N = length2(xx[[col]], na.rm=na.rm), | |
mean = mean (xx[[col]], na.rm=na.rm), | |
sd = sd (xx[[col]], na.rm=na.rm) | |
) | |
}, | |
measurevar | |
) | |
# Rename the "mean" column | |
datac <- rename(datac, c("mean" = measurevar)) | |
datac$se <- datac$sd / sqrt(datac$N) # Calculate standard error of the mean | |
# Confidence interval multiplier for standard error | |
# Calculate t-statistic for confidence interval: | |
# e.g., if conf.interval is .95, use .975 (above/below), and use df=N-1 | |
ciMult <- qt(conf.interval/2 + .5, datac$N-1) | |
datac$ci <- datac$se * ciMult | |
return(datac) | |
} | |
## Norms the data within specified groups in a data frame; it normalizes each | |
## subject (identified by idvar) so that they have the same mean, within each group | |
## specified by betweenvars. | |
## data: a data frame. | |
## idvar: the name of a column that identifies each subject (or matched subjects) | |
## measurevar: the name of a column that contains the variable to be summariezed | |
## betweenvars: a vector containing names of columns that are between-subjects variables | |
## na.rm: a boolean that indicates whether to ignore NA's | |
normDataWithin <- function(data=NULL, idvar, measurevar, betweenvars=NULL, | |
na.rm=FALSE, .drop=TRUE) { | |
library(plyr) | |
# Measure var on left, idvar + between vars on right of formula. | |
data.subjMean <- ddply(data, c(idvar, betweenvars), .drop=.drop, | |
.fun = function(xx, col, na.rm) { | |
c(subjMean = mean(xx[,col], na.rm=na.rm)) | |
}, | |
measurevar, | |
na.rm | |
) | |
# Put the subject means with original data | |
data <- merge(data, data.subjMean) | |
# Get the normalized data in a new column | |
measureNormedVar <- paste(measurevar, "_norm", sep="") | |
data[,measureNormedVar] <- data[,measurevar] - data[,"subjMean"] + | |
mean(data[,measurevar], na.rm=na.rm) | |
# Remove this subject mean column | |
data$subjMean <- NULL | |
return(data) | |
} | |
## Summarizes data, handling within-subjects variables by removing inter-subject variability. | |
## It will still work if there are no within-S variables. | |
## Gives count, un-normed mean, normed mean (with same between-group mean), | |
## standard deviation, standard error of the mean, and confidence interval. | |
## If there are within-subject variables, calculate adjusted values using method from Morey (2008). | |
## data: a data frame. | |
## measurevar: the name of a column that contains the variable to be summariezed | |
## betweenvars: a vector containing names of columns that are between-subjects variables | |
## withinvars: a vector containing names of columns that are within-subjects variables | |
## idvar: the name of a column that identifies each subject (or matched subjects) | |
## na.rm: a boolean that indicates whether to ignore NA's | |
## conf.interval: the percent range of the confidence interval (default is 95%) | |
summarySEwithin <- function(data=NULL, measurevar, betweenvars=NULL, withinvars=NULL, | |
idvar=NULL, na.rm=FALSE, conf.interval=.95, .drop=TRUE) { | |
# Ensure that the betweenvars and withinvars are factors | |
factorvars <- vapply(data[, c(betweenvars, withinvars), drop=FALSE], | |
FUN=is.factor, FUN.VALUE=logical(1)) | |
if (!all(factorvars)) { | |
nonfactorvars <- names(factorvars)[!factorvars] | |
message("Automatically converting the following non-factors to factors: ", | |
paste(nonfactorvars, collapse = ", ")) | |
data[nonfactorvars] <- lapply(data[nonfactorvars], factor) | |
} | |
# Get the means from the un-normed data | |
datac <- summarySE(data, measurevar, groupvars=c(betweenvars, withinvars), | |
na.rm=na.rm, conf.interval=conf.interval, .drop=.drop) | |
# Drop all the unused columns (these will be calculated with normed data) | |
datac$sd <- NULL | |
datac$se <- NULL | |
datac$ci <- NULL | |
# Norm each subject's data | |
ndata <- normDataWithin(data, idvar, measurevar, betweenvars, na.rm, .drop=.drop) | |
# This is the name of the new column | |
measurevar_n <- paste(measurevar, "_norm", sep="") | |
# Collapse the normed data - now we can treat between and within vars the same | |
ndatac <- summarySE(ndata, measurevar_n, groupvars=c(betweenvars, withinvars), | |
na.rm=na.rm, conf.interval=conf.interval, .drop=.drop) | |
# Apply correction from Morey (2008) to the standard error and confidence interval | |
# Get the product of the number of conditions of within-S variables | |
nWithinGroups <- prod(vapply(ndatac[,withinvars, drop=FALSE], FUN=nlevels, | |
FUN.VALUE=numeric(1))) | |
correctionFactor <- sqrt( nWithinGroups / (nWithinGroups-1) ) | |
# Apply the correction factor | |
ndatac$sd <- ndatac$sd * correctionFactor | |
ndatac$se <- ndatac$se * correctionFactor | |
ndatac$ci <- ndatac$ci * correctionFactor | |
# Combine the un-normed means with the normed results | |
merge(datac, ndatac) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment