Skip to content

Instantly share code, notes, and snippets.

@flovv
Created April 11, 2017 06:58
Show Gist options
  • Save flovv/e5ffa5fb22f84435a92fb859ddfea945 to your computer and use it in GitHub Desktop.
Save flovv/e5ffa5fb22f84435a92fb859ddfea945 to your computer and use it in GitHub Desktop.
Showing weighted.summarySE function to aggregate weighted means with standard errors
require(ggplot2)
require(ggthemes)
require(plyr)
require(stringr)
require(Rmisc)
require(lubridate)
require(popbio) #betaval - to create random beta variables
#### Experiment setting
rnorm2 <- function(n,mean,sd) { mean+sd*scale(rnorm(n)) }
getDates <- function(d) {as.Date(Sys.Date() -d)}
bouncerate <- replicate(100, betaval(.5, sd=.3))
user <- rnorm2(100, 30, 10)
Date <- unlist(llply(1:100, function(x){getDates(x)}))
df <- data.frame(Date=as.Date(Date, origin = as.Date("1970-01-01")), BounceRate = bouncerate, Sessions=user)
df$wday <- wday(df$Date, abbr = T , label = T)
## create the well formated data frame to use in ggplot
dfc1 <- summarySE(df, measurevar = "BounceRate", groupvars = "wday")
p1<-ggplot(dfc1, aes(wday, BounceRate)) + geom_point() + geom_errorbar(aes(ymin=BounceRate-se, ymax=BounceRate+se), width=.1)
p1 + theme_economist(base_size = 16) + ylab("Unweighted Bounce-rate") + xlab("")
############ adapt function
weighted.summarySE <- function(data=NULL, measurevar, groupvars=NULL, weights, na.rm=FALSE,
conf.interval=.95, .drop=TRUE) {
# New version of length which can handle NA's: if na.rm==T, don't count them
length2 <- function (x, na.rm=FALSE) {
if (na.rm) sum(!is.na(x))
else length(x)
}
#weighted - SD function!
w.sd <- function(x, w,na.rm=TRUE ) ( (sum(w*x*x, na.rm=na.rm)/sum(w, na.rm=na.rm)) - weighted.mean(x,w, na.rm=na.rm)^2 )^.5
# This does the summary. For each group's data frame, return a vector with
datac <- ddply(data, groupvars,
.fun = function(xx, col, weights) {
c(N = length2(xx[[col]], na.rm=na.rm),
mean = weighted.mean(xx[[col]], xx[[weights]], na.rm=na.rm),
sd = w.sd(xx[[col]], xx[[weights]], na.rm=na.rm)
)
},
measurevar, weights
)
# Rename the "mean" column
datac <- rename(datac, c("mean" = measurevar))
datac$se <- datac$sd / sqrt(datac$N) # Calculate standard error of the mean
# Confidence interval multiplier for standard error
# Calculate t-statistic for confidence interval:
# e.g., if conf.interval is .95, use .975 (above/below), and use df=N-1
ciMult <- qt(conf.interval/2 + .5, datac$N-1)
datac$ci <- datac$se * ciMult
return(datac)
}
### the correct way to plot the means!
dfc2 <- weighted.summarySE(df, measurevar = "BounceRate", groupvars = "wday", weights = "Sessions")
p1<-ggplot(dfc2, aes(wday, BounceRate)) + geom_point() + geom_errorbar(aes(ymin=BounceRate-se, ymax=BounceRate+se), width=.1)
p1 + theme_economist(base_size = 16) + ylab("Weighted Bounce-rate") + xlab("")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment