LittleOrangeC/lme4_arm_example.R

## lme4_arm_example.R
#
## 12/20/12 - Ths is my fork of DSparks Denver debate analysis script
#
###########
# Requirement - run this command to create the "denver.txt" file
# curl https://raw.github.com/dsparks/Test_image/master/Denver_Debate_Transcript.txt > denver.txt
# From: http://www.cnn.com/2012/10/03/politics/debate-transcript/index.html

# Note - none of these efforts work on Linux or OSX due to SSL / HTTPS limitations
###############################
#Transcript <- readLines("https://raw.github.com/dsparks/Test_image/master/Denver_Debate_Transcript.txt")
#library(RCurl)
#    eval( expr =
#        parse( Transcript = getURL("https://raw.github.com/dsparks/Test_image/master/Denver_Debate_Transcript.txt", ssl.verifypeer=FALSE) ))
###############################

## Prep work
rm(list = ls())
doInstall <- TRUE  # Change to FALSE if you don't want packages installed.
toInstall <- c("zoo", "tm", "ggplot2", "lme4", "arm", "Snowball", "gridExtra")
if(doInstall){install.packages(toInstall, repos = "http://cran.r-project.org")}
lapply(toInstall, library, character.only = TRUE)

con <- file("denver.txt", "r", blocking = FALSE)
Transcript <- readLines(con)
head(Transcript, 20)

Transcript <- data.frame(Words = Transcript, Speaker = NA, stringsAsFactors = FALSE)
Transcript$Speaker[regexpr("LEHRER: ", Transcript$Words) != -1] <- 1
Transcript$Speaker[regexpr("OBAMA: ", Transcript$Words) != -1] <- 2
Transcript$Speaker[regexpr("ROMNEY: ", Transcript$Words) != -1] <- 3
table(Transcript$Speaker)
Transcript$Speaker <- na.locf(Transcript$Speaker)

# Remove moderator:
Transcript <- Transcript[Transcript$Speaker != 1, ]

myCorpus <- Corpus(DataframeSource(Transcript))
##inspect(myCorpus)
head(myCorpus)

myCorpus <- tm_map(myCorpus, tolower)  # Make lowercase
myCorpus <- tm_map(myCorpus, removePunctuation, preserve_intra_word_dashes = FALSE)
myCorpus <- tm_map(myCorpus, removeWords, stopwords("english"))  # Remove stopwords
myCorpus <- tm_map(myCorpus, removeWords, c("lehrer", "obama", "romney"))
myCorpus <- tm_map(myCorpus, stemDocument, language = "english")  # Stem words

##inspect(myCorpus)
head(myCorpus)
docTermMatrix <- DocumentTermMatrix(myCorpus)

docTermMatrix <- inspect(docTermMatrix)
#head(docTermMatrix) # my idea, not necesssary
sort(colSums(docTermMatrix))
table(colSums(docTermMatrix))

termCountFrame <- data.frame(Term = colnames(docTermMatrix))
termCountFrame$Obama <- colSums(docTermMatrix[Transcript$Speaker == 2, ])
termCountFrame$Romney <- colSums(docTermMatrix[Transcript$Speaker == 3, ])

head(termCountFrame)

### New ###

tallCountFrame <- with(termCountFrame, data.frame(Term = c(rep(Term, Obama),
                                      rep(Term, Romney))))
###### I'e adjusted this line from isRomney with c(0,1)
tallCountFrame$isObama  <- rep(c(1, 0), colSums(termCountFrame[, -1]))
tallCountFrame$isRomney <- rep(c(0, 1), colSums(termCountFrame[, -1]))
tallCountFrame$Term <- colnames(docTermMatrix)[tallCountFrame$Term]

randomInterceptModelO <- lmer(isObama ~ (1 | Term) - 1, family = "binomial", data = tallCountFrame)
randomInterceptModelR <- lmer(isRomney ~ (1 | Term) - 1, family = "binomial", data = tallCountFrame)

### so 150 here = items said 7 times
### so 100 here = items said 10 times
### so  50 here = items said 17 times
### so  25 here = items said 24 times
cutoffCount <- tail(sort(colSums(docTermMatrix)), 25)[1]

#####################
# Convert lmer model to plot-able data.
coefficientFrame1 <- data.frame(Term = rownames(coef(randomInterceptModelR)$Term))
coefficientFrame1$Estimate <- coef(randomInterceptModelR)$Term[, 1]
coefficientFrame1$SE <- se.coef(randomInterceptModelR)$Term[, 1]

coefficientFrame1$Count <- colSums(docTermMatrix)[coefficientFrame1$Term]
coefficientFrame1$z0 <- with(coefficientFrame1, plogis(Estimate))
coefficientFrame1$z_1 <- with(coefficientFrame1, plogis(Estimate-SE))
coefficientFrame1$z1 <- with(coefficientFrame1, plogis(Estimate+SE))
coefficientFrame1$z_2 <- with(coefficientFrame1, plogis(Estimate-2*SE))
coefficientFrame1$z2 <- with(coefficientFrame1, plogis(Estimate+2*SE))

coefficientFrame1$Term <- factor(coefficientFrame1$Term,
  levels = coefficientFrame1$Term[order(coefficientFrame1$Estimate)])

zp1 <- ggplot(coefficientFrame1[coefficientFrame1$Count >= cutoffCount, ],
#                                      		aes(x = Term, ymin = z_2, ymax = z2))
              aes(x = Term, y = z0, ymin = z_2, ymax = z2))
zp1 <- zp1 + geom_linerange(size = 1/2, colour = 'red')
zp1 <- zp1 + geom_linerange(aes(ymin = z_1, ymax = z1),size = 1, colour = 'red')
zp1 <- zp1 + geom_point(colour = "WHITE", shape = 15, alpha = 1, size = 10/9)
zp1 <- zp1 + scale_y_continuous("Romney use probability", expand = c(0, 0))
zp1 <- zp1 + coord_flip()
zp1 <- zp1 + ggtitle(paste("p(Romney Said It | Term) terms that occur at least ", cutoffCount, " times", sep = ""))
##zp1

#####################
# Convert lmer model to plot-able data.
coefficientFrame2 <- data.frame(Term = rownames(coef(randomInterceptModelO)$Term))
coefficientFrame2$Estimate <- coef(randomInterceptModelO)$Term[, 1]
coefficientFrame2$SE <- se.coef(randomInterceptModelO)$Term[, 1]

coefficientFrame2$Count <- colSums(docTermMatrix)[coefficientFrame2$Term]
coefficientFrame2$z0 <- with(coefficientFrame2, plogis(Estimate))
coefficientFrame2$z_1 <- with(coefficientFrame2, plogis(Estimate-SE))
coefficientFrame2$z1 <- with(coefficientFrame2, plogis(Estimate+SE))
coefficientFrame2$z_2 <- with(coefficientFrame2, plogis(Estimate-2*SE))
coefficientFrame2$z2 <- with(coefficientFrame2, plogis(Estimate+2*SE))

coefficientFrame2$Term <- factor(coefficientFrame2$Term,
  levels = coefficientFrame2$Term[order(coefficientFrame2$Estimate)])

zp2 <- ggplot(coefficientFrame2[coefficientFrame2$Count >= cutoffCount, ],
              aes(x = Term, y = z0,
                  ymin = z_2, ymax = z2))
zp2 <- zp2 + geom_linerange(size = 1/2, colour = 'blue')
zp2 <- zp2 + geom_linerange(aes(ymin = z_1, ymax = z1),size = 1, colour = 'blue')
zp2 <- zp2 + geom_point(colour = "WHITE", shape = 15, alpha = 1, size = 10/9)
zp2 <- zp2 + scale_y_continuous("Obama use probability", expand = c(0, 0))
#zp2 <- zp2 + scale_y_continuous("Use probability", expand = c(0, 0))
zp2 <- zp2 + coord_flip()
zp2 <- zp2 + ggtitle(paste("p(Obama Said It | Term) terms that occur at least ", cutoffCount, " times", sep = ""))
##zp2

grid.arrange(zp1, zp2)
	#
	## 12/20/12 - Ths is my fork of DSparks Denver debate analysis script
	#
	###########
	# Requirement - run this command to create the "denver.txt" file
	# curl https://raw.github.com/dsparks/Test_image/master/Denver_Debate_Transcript.txt > denver.txt
	# From: http://www.cnn.com/2012/10/03/politics/debate-transcript/index.html

	# Note - none of these efforts work on Linux or OSX due to SSL / HTTPS limitations
	###############################
	#Transcript <- readLines("https://raw.github.com/dsparks/Test_image/master/Denver_Debate_Transcript.txt")
	#library(RCurl)
	# eval( expr =
	# parse( Transcript = getURL("https://raw.github.com/dsparks/Test_image/master/Denver_Debate_Transcript.txt", ssl.verifypeer=FALSE) ))
	###############################

	## Prep work
	rm(list = ls())
	doInstall <- TRUE # Change to FALSE if you don't want packages installed.
	toInstall <- c("zoo", "tm", "ggplot2", "lme4", "arm", "Snowball", "gridExtra")
	if(doInstall){install.packages(toInstall, repos = "http://cran.r-project.org")}
	lapply(toInstall, library, character.only = TRUE)

	con <- file("denver.txt", "r", blocking = FALSE)
	Transcript <- readLines(con)
	head(Transcript, 20)

	Transcript <- data.frame(Words = Transcript, Speaker = NA, stringsAsFactors = FALSE)
	Transcript$Speaker[regexpr("LEHRER: ", Transcript$Words) != -1] <- 1
	Transcript$Speaker[regexpr("OBAMA: ", Transcript$Words) != -1] <- 2
	Transcript$Speaker[regexpr("ROMNEY: ", Transcript$Words) != -1] <- 3
	table(Transcript$Speaker)
	Transcript$Speaker <- na.locf(Transcript$Speaker)

	# Remove moderator:
	Transcript <- Transcript[Transcript$Speaker != 1, ]

	myCorpus <- Corpus(DataframeSource(Transcript))
	##inspect(myCorpus)
	head(myCorpus)

	myCorpus <- tm_map(myCorpus, tolower) # Make lowercase
	myCorpus <- tm_map(myCorpus, removePunctuation, preserve_intra_word_dashes = FALSE)
	myCorpus <- tm_map(myCorpus, removeWords, stopwords("english")) # Remove stopwords
	myCorpus <- tm_map(myCorpus, removeWords, c("lehrer", "obama", "romney"))
	myCorpus <- tm_map(myCorpus, stemDocument, language = "english") # Stem words

	##inspect(myCorpus)
	head(myCorpus)
	docTermMatrix <- DocumentTermMatrix(myCorpus)

	docTermMatrix <- inspect(docTermMatrix)
	#head(docTermMatrix) # my idea, not necesssary
	sort(colSums(docTermMatrix))
	table(colSums(docTermMatrix))

	termCountFrame <- data.frame(Term = colnames(docTermMatrix))
	termCountFrame$Obama <- colSums(docTermMatrix[Transcript$Speaker == 2, ])
	termCountFrame$Romney <- colSums(docTermMatrix[Transcript$Speaker == 3, ])

	head(termCountFrame)

	### New ###

	tallCountFrame <- with(termCountFrame, data.frame(Term = c(rep(Term, Obama),
	rep(Term, Romney))))
	###### I'e adjusted this line from isRomney with c(0,1)
	tallCountFrame$isObama <- rep(c(1, 0), colSums(termCountFrame[, -1]))
	tallCountFrame$isRomney <- rep(c(0, 1), colSums(termCountFrame[, -1]))
	tallCountFrame$Term <- colnames(docTermMatrix)[tallCountFrame$Term]

	randomInterceptModelO <- lmer(isObama ~ (1 \| Term) - 1, family = "binomial", data = tallCountFrame)
	randomInterceptModelR <- lmer(isRomney ~ (1 \| Term) - 1, family = "binomial", data = tallCountFrame)

	### so 150 here = items said 7 times
	### so 100 here = items said 10 times
	### so 50 here = items said 17 times
	### so 25 here = items said 24 times
	cutoffCount <- tail(sort(colSums(docTermMatrix)), 25)[1]

	#####################
	# Convert lmer model to plot-able data.
	coefficientFrame1 <- data.frame(Term = rownames(coef(randomInterceptModelR)$Term))
	coefficientFrame1$Estimate <- coef(randomInterceptModelR)$Term[, 1]
	coefficientFrame1$SE <- se.coef(randomInterceptModelR)$Term[, 1]

	coefficientFrame1$Count <- colSums(docTermMatrix)[coefficientFrame1$Term]
	coefficientFrame1$z0 <- with(coefficientFrame1, plogis(Estimate))
	coefficientFrame1$z_1 <- with(coefficientFrame1, plogis(Estimate-SE))
	coefficientFrame1$z1 <- with(coefficientFrame1, plogis(Estimate+SE))
	coefficientFrame1$z_2 <- with(coefficientFrame1, plogis(Estimate-2*SE))
	coefficientFrame1$z2 <- with(coefficientFrame1, plogis(Estimate+2*SE))

	coefficientFrame1$Term <- factor(coefficientFrame1$Term,
	levels = coefficientFrame1$Term[order(coefficientFrame1$Estimate)])

	zp1 <- ggplot(coefficientFrame1[coefficientFrame1$Count >= cutoffCount, ],
	# aes(x = Term, ymin = z_2, ymax = z2))
	aes(x = Term, y = z0, ymin = z_2, ymax = z2))
	zp1 <- zp1 + geom_linerange(size = 1/2, colour = 'red')
	zp1 <- zp1 + geom_linerange(aes(ymin = z_1, ymax = z1),size = 1, colour = 'red')
	zp1 <- zp1 + geom_point(colour = "WHITE", shape = 15, alpha = 1, size = 10/9)
	zp1 <- zp1 + scale_y_continuous("Romney use probability", expand = c(0, 0))
	zp1 <- zp1 + coord_flip()
	zp1 <- zp1 + ggtitle(paste("p(Romney Said It \| Term) terms that occur at least ", cutoffCount, " times", sep = ""))
	##zp1

	#####################
	# Convert lmer model to plot-able data.
	coefficientFrame2 <- data.frame(Term = rownames(coef(randomInterceptModelO)$Term))
	coefficientFrame2$Estimate <- coef(randomInterceptModelO)$Term[, 1]
	coefficientFrame2$SE <- se.coef(randomInterceptModelO)$Term[, 1]

	coefficientFrame2$Count <- colSums(docTermMatrix)[coefficientFrame2$Term]
	coefficientFrame2$z0 <- with(coefficientFrame2, plogis(Estimate))
	coefficientFrame2$z_1 <- with(coefficientFrame2, plogis(Estimate-SE))
	coefficientFrame2$z1 <- with(coefficientFrame2, plogis(Estimate+SE))
	coefficientFrame2$z_2 <- with(coefficientFrame2, plogis(Estimate-2*SE))
	coefficientFrame2$z2 <- with(coefficientFrame2, plogis(Estimate+2*SE))

	coefficientFrame2$Term <- factor(coefficientFrame2$Term,
	levels = coefficientFrame2$Term[order(coefficientFrame2$Estimate)])

	zp2 <- ggplot(coefficientFrame2[coefficientFrame2$Count >= cutoffCount, ],
	aes(x = Term, y = z0,
	ymin = z_2, ymax = z2))
	zp2 <- zp2 + geom_linerange(size = 1/2, colour = 'blue')
	zp2 <- zp2 + geom_linerange(aes(ymin = z_1, ymax = z1),size = 1, colour = 'blue')
	zp2 <- zp2 + geom_point(colour = "WHITE", shape = 15, alpha = 1, size = 10/9)
	zp2 <- zp2 + scale_y_continuous("Obama use probability", expand = c(0, 0))
	#zp2 <- zp2 + scale_y_continuous("Use probability", expand = c(0, 0))
	zp2 <- zp2 + coord_flip()
	zp2 <- zp2 + ggtitle(paste("p(Obama Said It \| Term) terms that occur at least ", cutoffCount, " times", sep = ""))
	##zp2

	grid.arrange(zp1, zp2)