phillip-burger-sculpturearts/interviewer-effect-analysis.R

## interviewer-effect-analysis.R
# Name: interviewer-effect-analysis.R
# Author: Phillip Burger
# Date: December 11, 2013
# Purpose: Takes an already munged and cleaned data set. Does some exploration
# of the population and the groups. The does a series of tests. This is a good
# skeleton set of code and test for testing for interviewer effect in survey
# results. A .Rmd containing analysis and code chunks is also available.
# Posted: http://www.phillipburger.net/wordpress/2014/02/10/interviewer-effect-in-survey-results/

# initializespace
suppressPackageStartupMessages(require(data.table))
suppressPackageStartupMessages(require(dplyr))
suppressPackageStartupMessages(require(ggplot2))
suppressPackageStartupMessages(require(gridExtra))
suppressPackageStartupMessages(require(lawstat))
set.seed(38249842)

# privatehist
# Builds a histogram. It is ok to extend this function to make it more flexible.
# Parameters: vector of data to explore, the number of breaks as either
# 	a string or a numeric scaler, and the title to display.
# Preconditions: Valid input from the calling program. Caller must want the
# 	color values used in this function.
# Post conditions: None
# Returns: None
privateHist <- function(x = x, n = 20, mainTitle = mainTitle) {
	hist(x,
		prob = FALSE,
		breaks = 20,
		col = rgb(0.2, 0.1, 0.4, 0.15),
		xlim = c(min(x) - 0.01, 1.0),
		xlab = "Score",
		ylab = "Count",
		main = mainTitle)
	abline(v = mean(x), col = "red", lwd = 3)
	abline(v = median(x), col = "blue", lwd = 3)
}

# loadandsanitize
load("dfResponse.Rdata")
dt <- data.table(unique(dfResponse[,c("surveyId", "surveyorLastName", "surveyMean")]))
setnames(dt, c(1, 2, 3), c("surveyId", "interviewer", "score"))

# sanitize the names (names already came in in dfResponse.RData)
dt[interviewer == "A_interviewer", interviewer := "A"]
dt[interviewer == "B_interviewer", interviewer := "B"]
dt[interviewer == "C_interviewer", interviewer := "C"]
dt[interviewer == "D_interviewer", interviewer := "D"]
dt$interviewer <- droplevels(dt)$interviewer

# summarystats
(means <- tapply(dt$score, dt$interviewer, mean))
(var <- tapply(dt$score, dt$interviewer, var))

# popplots
plot1 <- ggplot(dt, aes(x = factor(interviewer), y = score))
plot1 <- plot1 + geom_boxplot(fill = rgb(0.2, 0.1, 0.4),
	colour = "black",
	alpha = 0.15,
	notch = TRUE,
	width = 0.75,
	outlier.size = 3.0,
	outlier.shape = 21,
	outlier.colour = "black") +
	stat_summary(fun.y = "mean",
		geom = "point",
		shape = 23,
		size = 3,
		fill = "white") +
	labs(x = "Interviewer",
		y = "Score",
		title = "Boxplot of Scores by Interviewer") +
	theme(plot.title = element_text(size = rel(1.3),
			face = 'bold'),
		axis.title.y = element_text(face = 'bold',
			size = 12),
		axis.title.x = element_text(face = 'bold',
			size = 12),
		legend.position = "none")
# histogram
# values of x in the annotations will vary depending on aspect ratio
plot2 <- ggplot(dt, aes(x = score))
plot2 <- plot2 + geom_histogram(
	colour = "black",
	alpha = 0.15) +
	labs(x = "Score",
		y = "Count",
		title = "Histogram of Scores (N=375)") +
	theme(plot.title = element_text(size = rel(1.3),
			face = 'bold'),
		axis.title.y = element_text(face = 'bold',
			size = 12),
		axis.title.x = element_text(face = 'bold',
			size = 12),
		legend.position = "none") +
	scale_y_continuous(limits = c(0, 39)) +
	annotate("text",
		x = 0.84,
		y = 39,
		label = "mean",
		size = 4.5,
		color = "#2a2a2a",
		fontface = 3) +
	annotate("text",
		x = 0.955,
		y = 39,
		label = "median",
		size = 4.5,
		color = "#2a2a2a",
		fontface = 3) +
	geom_vline(xintercept = mean(dt$score),
		linetype = "dashed",
		color = "red",
		size = 1) +
	geom_vline(xintercept = median(dt$score),
		linetype = "dashed",
		color = "blue",
		size = 1)
# output to dev or file
grid.arrange(plot1, plot2, ncol = 2)
# png("interviewer-population-charts.png",
#  	width = 650,
#  	units = "px",
#  	bg = "white")

# grouphists
classNames <- c("A", "B", "C", "D")
par(mfrow=c(2,2), oma=c(0,0,2,0))
for(i in 1:4) {
	x = filter(dt, interviewer == classNames[i])$score
	privateHist(x, 20, paste0("Histogram for Group ", classNames[i]))
}
mtext("Count By Interviewer/Group Histograms",
	side = 3,
	line = 0,
	cex = 1.2,
	col = "black",
	outer = TRUE)

# getsamples
# obtain n for smallest group
interviewerGroup1 <- group_by(dt, interviewer)
interviewerGroup2 <- group_size(interviewerGroup1)
cat("size of smallest group is: ", (lowestN <- min(interviewerGroup2)))
# sample the groups with no replacement
# A
A <- as.data.frame(sample(dt[interviewer == "A", ]$score,
	lowestN, replace = FALSE))
setnames(A, c(1), c("score"))
A$interviewer <- "A"
# B
B <- as.data.frame(sample(dt[interviewer == "B", ]$score,
	lowestN, replace = FALSE))
setnames(B, c(1), c("score"))
B$interviewer <- "B"
# C
C <- as.data.frame(sample(dt[interviewer == "C", ]$score,
	lowestN, replace = FALSE))
setnames(C, c(1), c("score"))
C$interviewer <- "C"
# D
D <- as.data.frame(sample(dt[interviewer == "D", ]$score,
	lowestN, replace = FALSE))
setnames(D, c(1), c("score"))
D$interviewer <- "D"
# housekeping
rm(interviewerGroup1, interviewerGroup2)

# testonewayanova
oneway.test(rbind(A, B, C, D)$score ~ rbind(A, B, C, D)$interviewer)

# testbartlett
lawstat::levene.test(dt$score, group = dt$interviewer, location = c("mean"))

# testkruskal
kruskal.test(dt$score ~ dt$interviewer)

# testaov
aovOut <- aov(rbind(A, B, C, D)$score ~ rbind(A, B, C, D)$interviewer)
summary(aovOut)

# testtukeyhsd
TukeyHSD(aovOut)

# pairwiseunadj
pairwise.t.test(dt$score, dt$interviewer, p.adj = "none")

# pairwiseadjbonf
pairwise.t.test(dt$score, dt$interviewer, p.adj = "bonf")

# pairwiseadjholm, cache=TRUE, eval=TRUE, warning=FALSE, echo=FALSE, message=FALSE}
pairwise.t.test(dt$score, dt$interviewer, p.adj = "holm")
	# Name: interviewer-effect-analysis.R
	# Author: Phillip Burger
	# Date: December 11, 2013
	# Purpose: Takes an already munged and cleaned data set. Does some exploration
	# of the population and the groups. The does a series of tests. This is a good
	# skeleton set of code and test for testing for interviewer effect in survey
	# results. A .Rmd containing analysis and code chunks is also available.
	# Posted: http://www.phillipburger.net/wordpress/2014/02/10/interviewer-effect-in-survey-results/

	# initializespace
	suppressPackageStartupMessages(require(data.table))
	suppressPackageStartupMessages(require(dplyr))
	suppressPackageStartupMessages(require(ggplot2))
	suppressPackageStartupMessages(require(gridExtra))
	suppressPackageStartupMessages(require(lawstat))
	set.seed(38249842)

	# privatehist
	# Builds a histogram. It is ok to extend this function to make it more flexible.
	# Parameters: vector of data to explore, the number of breaks as either
	# a string or a numeric scaler, and the title to display.
	# Preconditions: Valid input from the calling program. Caller must want the
	# color values used in this function.
	# Post conditions: None
	# Returns: None
	privateHist <- function(x = x, n = 20, mainTitle = mainTitle) {
	hist(x,
	prob = FALSE,
	breaks = 20,
	col = rgb(0.2, 0.1, 0.4, 0.15),
	xlim = c(min(x) - 0.01, 1.0),
	xlab = "Score",
	ylab = "Count",
	main = mainTitle)
	abline(v = mean(x), col = "red", lwd = 3)
	abline(v = median(x), col = "blue", lwd = 3)
	}

	# loadandsanitize
	load("dfResponse.Rdata")
	dt <- data.table(unique(dfResponse[,c("surveyId", "surveyorLastName", "surveyMean")]))
	setnames(dt, c(1, 2, 3), c("surveyId", "interviewer", "score"))

	# sanitize the names (names already came in in dfResponse.RData)
	dt[interviewer == "A_interviewer", interviewer := "A"]
	dt[interviewer == "B_interviewer", interviewer := "B"]
	dt[interviewer == "C_interviewer", interviewer := "C"]
	dt[interviewer == "D_interviewer", interviewer := "D"]
	dt$interviewer <- droplevels(dt)$interviewer

	# summarystats
	(means <- tapply(dt$score, dt$interviewer, mean))
	(var <- tapply(dt$score, dt$interviewer, var))

	# popplots
	plot1 <- ggplot(dt, aes(x = factor(interviewer), y = score))
	plot1 <- plot1 + geom_boxplot(fill = rgb(0.2, 0.1, 0.4),
	colour = "black",
	alpha = 0.15,
	notch = TRUE,
	width = 0.75,
	outlier.size = 3.0,
	outlier.shape = 21,
	outlier.colour = "black") +
	stat_summary(fun.y = "mean",
	geom = "point",
	shape = 23,
	size = 3,
	fill = "white") +
	labs(x = "Interviewer",
	y = "Score",
	title = "Boxplot of Scores by Interviewer") +
	theme(plot.title = element_text(size = rel(1.3),
	face = 'bold'),
	axis.title.y = element_text(face = 'bold',
	size = 12),
	axis.title.x = element_text(face = 'bold',
	size = 12),
	legend.position = "none")
	# histogram
	# values of x in the annotations will vary depending on aspect ratio
	plot2 <- ggplot(dt, aes(x = score))
	plot2 <- plot2 + geom_histogram(
	colour = "black",
	alpha = 0.15) +
	labs(x = "Score",
	y = "Count",
	title = "Histogram of Scores (N=375)") +
	theme(plot.title = element_text(size = rel(1.3),
	face = 'bold'),
	axis.title.y = element_text(face = 'bold',
	size = 12),
	axis.title.x = element_text(face = 'bold',
	size = 12),
	legend.position = "none") +
	scale_y_continuous(limits = c(0, 39)) +
	annotate("text",
	x = 0.84,
	y = 39,
	label = "mean",
	size = 4.5,
	color = "#2a2a2a",
	fontface = 3) +
	annotate("text",
	x = 0.955,
	y = 39,
	label = "median",
	size = 4.5,
	color = "#2a2a2a",
	fontface = 3) +
	geom_vline(xintercept = mean(dt$score),
	linetype = "dashed",
	color = "red",
	size = 1) +
	geom_vline(xintercept = median(dt$score),
	linetype = "dashed",
	color = "blue",
	size = 1)
	# output to dev or file
	grid.arrange(plot1, plot2, ncol = 2)
	# png("interviewer-population-charts.png",
	# width = 650,
	# units = "px",
	# bg = "white")

	# grouphists
	classNames <- c("A", "B", "C", "D")
	par(mfrow=c(2,2), oma=c(0,0,2,0))
	for(i in 1:4) {
	x = filter(dt, interviewer == classNames[i])$score
	privateHist(x, 20, paste0("Histogram for Group ", classNames[i]))
	}
	mtext("Count By Interviewer/Group Histograms",
	side = 3,
	line = 0,
	cex = 1.2,
	col = "black",
	outer = TRUE)

	# getsamples
	# obtain n for smallest group
	interviewerGroup1 <- group_by(dt, interviewer)
	interviewerGroup2 <- group_size(interviewerGroup1)
	cat("size of smallest group is: ", (lowestN <- min(interviewerGroup2)))
	# sample the groups with no replacement
	# A
	A <- as.data.frame(sample(dt[interviewer == "A", ]$score,
	lowestN, replace = FALSE))
	setnames(A, c(1), c("score"))
	A$interviewer <- "A"
	# B
	B <- as.data.frame(sample(dt[interviewer == "B", ]$score,
	lowestN, replace = FALSE))
	setnames(B, c(1), c("score"))
	B$interviewer <- "B"
	# C
	C <- as.data.frame(sample(dt[interviewer == "C", ]$score,
	lowestN, replace = FALSE))
	setnames(C, c(1), c("score"))
	C$interviewer <- "C"
	# D
	D <- as.data.frame(sample(dt[interviewer == "D", ]$score,
	lowestN, replace = FALSE))
	setnames(D, c(1), c("score"))
	D$interviewer <- "D"
	# housekeping
	rm(interviewerGroup1, interviewerGroup2)

	# testonewayanova
	oneway.test(rbind(A, B, C, D)$score ~ rbind(A, B, C, D)$interviewer)

	# testbartlett
	lawstat::levene.test(dt$score, group = dt$interviewer, location = c("mean"))

	# testkruskal
	kruskal.test(dt$score ~ dt$interviewer)

	# testaov
	aovOut <- aov(rbind(A, B, C, D)$score ~ rbind(A, B, C, D)$interviewer)
	summary(aovOut)

	# testtukeyhsd
	TukeyHSD(aovOut)

	# pairwiseunadj
	pairwise.t.test(dt$score, dt$interviewer, p.adj = "none")

	# pairwiseadjbonf
	pairwise.t.test(dt$score, dt$interviewer, p.adj = "bonf")

	# pairwiseadjholm, cache=TRUE, eval=TRUE, warning=FALSE, echo=FALSE, message=FALSE}
	pairwise.t.test(dt$score, dt$interviewer, p.adj = "holm")