Skip to content

Instantly share code, notes, and snippets.

@LittleOrangeC
Forked from dsparks/lme4_arm_example.R
Last active December 10, 2015 01:34
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save LittleOrangeC/4359324 to your computer and use it in GitHub Desktop.
Save LittleOrangeC/4359324 to your computer and use it in GitHub Desktop.
#
## 12/20/12 - Ths is my fork of DSparks Denver debate analysis script
#
###########
# Requirement - run this command to create the "denver.txt" file
# curl https://raw.github.com/dsparks/Test_image/master/Denver_Debate_Transcript.txt > denver.txt
# From: http://www.cnn.com/2012/10/03/politics/debate-transcript/index.html
# Note - none of these efforts work on Linux or OSX due to SSL / HTTPS limitations
###############################
#Transcript <- readLines("https://raw.github.com/dsparks/Test_image/master/Denver_Debate_Transcript.txt")
#library(RCurl)
# eval( expr =
# parse( Transcript = getURL("https://raw.github.com/dsparks/Test_image/master/Denver_Debate_Transcript.txt", ssl.verifypeer=FALSE) ))
###############################
## Prep work
rm(list = ls())
doInstall <- TRUE # Change to FALSE if you don't want packages installed.
toInstall <- c("zoo", "tm", "ggplot2", "lme4", "arm", "Snowball", "gridExtra")
if(doInstall){install.packages(toInstall, repos = "http://cran.r-project.org")}
lapply(toInstall, library, character.only = TRUE)
con <- file("denver.txt", "r", blocking = FALSE)
Transcript <- readLines(con)
head(Transcript, 20)
Transcript <- data.frame(Words = Transcript, Speaker = NA, stringsAsFactors = FALSE)
Transcript$Speaker[regexpr("LEHRER: ", Transcript$Words) != -1] <- 1
Transcript$Speaker[regexpr("OBAMA: ", Transcript$Words) != -1] <- 2
Transcript$Speaker[regexpr("ROMNEY: ", Transcript$Words) != -1] <- 3
table(Transcript$Speaker)
Transcript$Speaker <- na.locf(Transcript$Speaker)
# Remove moderator:
Transcript <- Transcript[Transcript$Speaker != 1, ]
myCorpus <- Corpus(DataframeSource(Transcript))
##inspect(myCorpus)
head(myCorpus)
myCorpus <- tm_map(myCorpus, tolower) # Make lowercase
myCorpus <- tm_map(myCorpus, removePunctuation, preserve_intra_word_dashes = FALSE)
myCorpus <- tm_map(myCorpus, removeWords, stopwords("english")) # Remove stopwords
myCorpus <- tm_map(myCorpus, removeWords, c("lehrer", "obama", "romney"))
myCorpus <- tm_map(myCorpus, stemDocument, language = "english") # Stem words
##inspect(myCorpus)
head(myCorpus)
docTermMatrix <- DocumentTermMatrix(myCorpus)
docTermMatrix <- inspect(docTermMatrix)
#head(docTermMatrix) # my idea, not necesssary
sort(colSums(docTermMatrix))
table(colSums(docTermMatrix))
termCountFrame <- data.frame(Term = colnames(docTermMatrix))
termCountFrame$Obama <- colSums(docTermMatrix[Transcript$Speaker == 2, ])
termCountFrame$Romney <- colSums(docTermMatrix[Transcript$Speaker == 3, ])
head(termCountFrame)
### New ###
tallCountFrame <- with(termCountFrame, data.frame(Term = c(rep(Term, Obama),
rep(Term, Romney))))
###### I'e adjusted this line from isRomney with c(0,1)
tallCountFrame$isObama <- rep(c(1, 0), colSums(termCountFrame[, -1]))
tallCountFrame$isRomney <- rep(c(0, 1), colSums(termCountFrame[, -1]))
tallCountFrame$Term <- colnames(docTermMatrix)[tallCountFrame$Term]
randomInterceptModelO <- lmer(isObama ~ (1 | Term) - 1, family = "binomial", data = tallCountFrame)
randomInterceptModelR <- lmer(isRomney ~ (1 | Term) - 1, family = "binomial", data = tallCountFrame)
### so 150 here = items said 7 times
### so 100 here = items said 10 times
### so 50 here = items said 17 times
### so 25 here = items said 24 times
cutoffCount <- tail(sort(colSums(docTermMatrix)), 25)[1]
#####################
# Convert lmer model to plot-able data.
coefficientFrame1 <- data.frame(Term = rownames(coef(randomInterceptModelR)$Term))
coefficientFrame1$Estimate <- coef(randomInterceptModelR)$Term[, 1]
coefficientFrame1$SE <- se.coef(randomInterceptModelR)$Term[, 1]
coefficientFrame1$Count <- colSums(docTermMatrix)[coefficientFrame1$Term]
coefficientFrame1$z0 <- with(coefficientFrame1, plogis(Estimate))
coefficientFrame1$z_1 <- with(coefficientFrame1, plogis(Estimate-SE))
coefficientFrame1$z1 <- with(coefficientFrame1, plogis(Estimate+SE))
coefficientFrame1$z_2 <- with(coefficientFrame1, plogis(Estimate-2*SE))
coefficientFrame1$z2 <- with(coefficientFrame1, plogis(Estimate+2*SE))
coefficientFrame1$Term <- factor(coefficientFrame1$Term,
levels = coefficientFrame1$Term[order(coefficientFrame1$Estimate)])
zp1 <- ggplot(coefficientFrame1[coefficientFrame1$Count >= cutoffCount, ],
# aes(x = Term, ymin = z_2, ymax = z2))
aes(x = Term, y = z0, ymin = z_2, ymax = z2))
zp1 <- zp1 + geom_linerange(size = 1/2, colour = 'red')
zp1 <- zp1 + geom_linerange(aes(ymin = z_1, ymax = z1),size = 1, colour = 'red')
zp1 <- zp1 + geom_point(colour = "WHITE", shape = 15, alpha = 1, size = 10/9)
zp1 <- zp1 + scale_y_continuous("Romney use probability", expand = c(0, 0))
zp1 <- zp1 + coord_flip()
zp1 <- zp1 + ggtitle(paste("p(Romney Said It | Term) terms that occur at least ", cutoffCount, " times", sep = ""))
##zp1
#####################
# Convert lmer model to plot-able data.
coefficientFrame2 <- data.frame(Term = rownames(coef(randomInterceptModelO)$Term))
coefficientFrame2$Estimate <- coef(randomInterceptModelO)$Term[, 1]
coefficientFrame2$SE <- se.coef(randomInterceptModelO)$Term[, 1]
coefficientFrame2$Count <- colSums(docTermMatrix)[coefficientFrame2$Term]
coefficientFrame2$z0 <- with(coefficientFrame2, plogis(Estimate))
coefficientFrame2$z_1 <- with(coefficientFrame2, plogis(Estimate-SE))
coefficientFrame2$z1 <- with(coefficientFrame2, plogis(Estimate+SE))
coefficientFrame2$z_2 <- with(coefficientFrame2, plogis(Estimate-2*SE))
coefficientFrame2$z2 <- with(coefficientFrame2, plogis(Estimate+2*SE))
coefficientFrame2$Term <- factor(coefficientFrame2$Term,
levels = coefficientFrame2$Term[order(coefficientFrame2$Estimate)])
zp2 <- ggplot(coefficientFrame2[coefficientFrame2$Count >= cutoffCount, ],
aes(x = Term, y = z0,
ymin = z_2, ymax = z2))
zp2 <- zp2 + geom_linerange(size = 1/2, colour = 'blue')
zp2 <- zp2 + geom_linerange(aes(ymin = z_1, ymax = z1),size = 1, colour = 'blue')
zp2 <- zp2 + geom_point(colour = "WHITE", shape = 15, alpha = 1, size = 10/9)
zp2 <- zp2 + scale_y_continuous("Obama use probability", expand = c(0, 0))
#zp2 <- zp2 + scale_y_continuous("Use probability", expand = c(0, 0))
zp2 <- zp2 + coord_flip()
zp2 <- zp2 + ggtitle(paste("p(Obama Said It | Term) terms that occur at least ", cutoffCount, " times", sep = ""))
##zp2
grid.arrange(zp1, zp2)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment