public
Last active

Denver debate analysis II

  • Download Gist
lme4_arm_example.R
R
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
# Package setup: optionally install, then attach, every package the script uses.
# The workspace is deliberately not cleared here (no rm(list = ls())): wiping
# the caller's global environment is a surprising side effect for a script.
doInstall <- TRUE # Change to FALSE if you don't want packages installed.
# NOTE(review): "Snowball" may no longer be installable from CRAN, and newer tm
# releases may stem through a different package -- confirm for your setup.
toInstall <- c("zoo", "tm", "ggplot2", "lme4", "arm", "Snowball")
if (doInstall) {
  # Only fetch what is missing, so re-running the script does not reinstall
  # (and re-download) every package each time.
  notYetInstalled <- setdiff(toInstall, rownames(installed.packages()))
  if (length(notYetInstalled) > 0) {
    install.packages(notYetInstalled, repos = "http://cran.r-project.org")
  }
}
# library() errors if a package is unavailable; invisible() hides the list of
# search paths that lapply() would otherwise print.
invisible(lapply(toInstall, library, character.only = TRUE))
 
# Load the debate transcript (one element per line of text) and tag every line
# with the speaker who is talking.
# From: http://www.cnn.com/2012/10/03/politics/debate-transcript/index.html
# Note: reading an https URL requires an R build with https support; if
# readLines() reports "unsupported URL scheme", download the file manually
# and point readLines() at the local copy instead.
Transcript <- readLines("https://raw.github.com/dsparks/Test_image/master/Denver_Debate_Transcript.txt")
head(Transcript, 20)

# Speaker codes: 1 = Lehrer (moderator), 2 = Obama, 3 = Romney.
# Only lines that contain a "NAME: " tag are marked here; the tags are literal
# strings, so fixed = TRUE avoids any regex interpretation.
Transcript <- data.frame(Words = Transcript, Speaker = NA, stringsAsFactors = FALSE)
Transcript$Speaker[grepl("LEHRER: ", Transcript$Words, fixed = TRUE)] <- 1
Transcript$Speaker[grepl("OBAMA: ", Transcript$Words, fixed = TRUE)] <- 2
Transcript$Speaker[grepl("ROMNEY: ", Transcript$Words, fixed = TRUE)] <- 3
table(Transcript$Speaker)

# Carry the last seen speaker forward onto untagged continuation lines.
# na.rm = FALSE keeps the vector the same length as the data frame even when
# the transcript starts with lines that precede the first speaker tag (the
# default would drop those leading NAs and make the assignment fail).
Transcript$Speaker <- na.locf(Transcript$Speaker, na.rm = FALSE)

# Remove moderator (and any leading lines that have no speaker at all):
Transcript <- Transcript[!is.na(Transcript$Speaker) & Transcript$Speaker != 1, ]
 
# Build a tm corpus with one document per remaining transcript line, then
# normalise the text before counting terms.
# VectorSource on the text column is used instead of DataframeSource on the
# whole data frame: it yields one document per row without depending on the
# data frame's column layout.
myCorpus <- Corpus(VectorSource(Transcript$Words))
inspect(myCorpus)

# tolower is a plain character function, so it is wrapped in
# content_transformer() to keep the documents valid corpus elements.
myCorpus <- tm_map(myCorpus, content_transformer(tolower)) # Make lowercase
myCorpus <- tm_map(myCorpus, removePunctuation, preserve_intra_word_dashes = FALSE)
myCorpus <- tm_map(myCorpus, removeWords, stopwords("english")) # Remove stopwords
# Drop the speaker tags themselves so they are not counted as spoken terms.
myCorpus <- tm_map(myCorpus, removeWords, c("lehrer", "obama", "romney"))
myCorpus <- tm_map(myCorpus, stemDocument) # Stem words

inspect(myCorpus)
# Count terms per document, then total them per candidate.
docTermMatrix <- DocumentTermMatrix(myCorpus)

# Convert to an ordinary dense matrix (documents in rows, terms in columns).
# as.matrix() is the conversion function; inspect() is a display helper whose
# return value should not be relied on, and unlike inspect() this prints nothing.
docTermMatrix <- as.matrix(docTermMatrix)
sort(colSums(docTermMatrix))
table(colSums(docTermMatrix))

# One row per term, with how often each candidate used it.
# Rows of docTermMatrix line up with rows of Transcript, so the speaker codes
# (2 = Obama, 3 = Romney) select each candidate's documents. drop = FALSE keeps
# a matrix even if a candidate has a single line, which colSums() requires.
termCountFrame <- data.frame(Term = colnames(docTermMatrix))
termCountFrame$Obama <- colSums(docTermMatrix[Transcript$Speaker == 2, , drop = FALSE])
termCountFrame$Romney <- colSums(docTermMatrix[Transcript$Speaker == 3, , drop = FALSE])

head(termCountFrame)
 
### New ###
 
# Expand the per-term counts into "tall" form: one row per individual use of a
# term, with isRomney = 0 for Obama's uses and 1 for Romney's.
# Terms are handled as character labels throughout. The previous version
# round-tripped through factor integer codes and looked the labels up again by
# position, which produces NA labels whenever Term is already a character
# vector and depends on factor level order matching the matrix column order.
tallCountFrame <- with(
  termCountFrame,
  data.frame(
    Term = c(rep(as.character(Term), Obama), rep(as.character(Term), Romney)),
    stringsAsFactors = FALSE
  )
)
# All Obama rows come first, then all Romney rows, matching the order above.
tallCountFrame$isRomney <- rep(c(0, 1), colSums(termCountFrame[, c("Obama", "Romney")]))
 
# Logistic mixed model with a random intercept per term and no fixed intercept:
# each term's intercept is the log-odds that a use of that term was Romney's.
# glmer() is lme4's entry point for generalized (here binomial) mixed models;
# passing `family` to lmer() is deprecated.
randomInterceptModel <- glmer(isRomney ~ (1 | Term) - 1,
                              family = binomial, data = tallCountFrame)
 
# Convert the fitted model to plot-able data: one row per term with its
# estimated intercept (log-odds scale), standard error, and total count.
termCoefficients <- coef(randomInterceptModel)$Term
# Keep Term as character for now: it is used below to look up counts by NAME.
# Indexing a named vector with a factor would use the factor's integer codes
# instead of its labels and could silently pick the wrong terms.
coefficientFrame <- data.frame(Term = rownames(termCoefficients),
                               stringsAsFactors = FALSE)
coefficientFrame$Estimate <- termCoefficients[, 1]
coefficientFrame$SE <- se.coef(randomInterceptModel)$Term[, 1]

coefficientFrame$Count <- colSums(docTermMatrix)[coefficientFrame$Term]

# plogis() maps log-odds to probabilities. z0 is the point estimate; z_1/z1 and
# z_2/z2 are the estimate minus/plus one and two standard errors respectively.
coefficientFrame$z0 <- plogis(coefficientFrame$Estimate)
coefficientFrame$z_1 <- plogis(coefficientFrame$Estimate - coefficientFrame$SE)
coefficientFrame$z1 <- plogis(coefficientFrame$Estimate + coefficientFrame$SE)
coefficientFrame$z_2 <- plogis(coefficientFrame$Estimate - 2 * coefficientFrame$SE)
coefficientFrame$z2 <- plogis(coefficientFrame$Estimate + 2 * coefficientFrame$SE)

# Order the factor levels by estimate so the plot axis is sorted by estimate
# rather than alphabetically.
coefficientFrame$Term <- factor(
  coefficientFrame$Term,
  levels = coefficientFrame$Term[order(coefficientFrame$Estimate)]
)
 
# Plot the estimated probability that Romney (rather than Obama) used each
# term, restricted to the most frequent terms.
# cutoffCount is the first of the 100 largest total term counts, i.e. the
# smallest count among (up to) the top 100 terms.
cutoffCount <- tail(sort(colSums(docTermMatrix)), 100)[1]

# Thin ranges span z_2..z2 (two standard errors), thick ranges span z_1..z1
# (one standard error), and a small white square marks the point estimate z0.
# coord_flip() puts the terms on the vertical axis.
zp1 <- ggplot(
  coefficientFrame[coefficientFrame$Count >= cutoffCount, ],
  aes(x = Term, y = z0, ymin = z_2, ymax = z2)
) +
  geom_linerange(size = 1/2) +
  geom_linerange(aes(ymin = z_1, ymax = z1), size = 1) +
  geom_point(colour = "WHITE", shape = 15, alpha = 1, size = 10/9) +
  scale_y_continuous("Romney use probability", expand = c(0, 0)) +
  coord_flip() +
  ggtitle(paste0("p(Romney Said It | Term)\nterms that occur at least ",
                 cutoffCount, " times"))
zp1

So for this line:
Transcript <- readLines("https://raw.github.com/dsparks/Test_image/master/Denver_Debate_Transcript.txt")

I'm getting this error:
Error in file(con, "r") : cannot open the connection
In addition: Warning message:
In file(con, "r") : unsupported URL scheme

I've checked that the packages are installed and loaded. I've also searched online and tried adjusting the URL, and I've fetched the URL directly with curl from the command line to confirm that it works.

Do you have any other recommendations?

Thanks!

Environment:
R version 2.15.2 (2012-10-26) -- "Trick or Treat"
Platform: x86_64-apple-darwin9.8.0/x86_64 (64-bit)

Ok - it looks like this is the issue:
HTTPS is only supported on Windows, and only when R is started with the --internet2 command-line option:
http://stackoverflow.com/questions/7715723/sourcing-r-script-over-https/7715768#7715768

So I solved this problem by downloading the file manually with curl; I'll annotate the change in my fork: https://gist.github.com/4359324

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.