Denver debate analysis I
# Denver debate term-frequency analysis.
#
# Downloads the transcript of the 3 October 2012 presidential debate, labels
# every line with its speaker, builds a stemmed document-term matrix with one
# document per transcript line, and plots each term at the coordinates
# (count in Obama's lines, count in Romney's lines).
#
# NOTE: the original script began with `rm(list = ls())`. It was removed
# because it silently wipes the workspace of whoever sources this file.

doInstall <- TRUE # Change to FALSE if you don't want packages installed.
# "SnowballC" replaces the archived "Snowball" package; tm::stemDocument()
# relies on it for stemming.
toInstall <- c("zoo", "tm", "ggplot2", "SnowballC")
if (doInstall) {
  install.packages(toInstall, repos = "https://cran.r-project.org")
}
invisible(lapply(toInstall, library, character.only = TRUE))

# From: http://www.cnn.com/2012/10/03/politics/debate-transcript/index.html
transcriptUrl <-
  "https://raw.github.com/dsparks/Test_image/master/Denver_Debate_Transcript.txt"
Transcript <- readLines(transcriptUrl, warn = FALSE)
head(Transcript, 20)

# One row per transcript line; Speaker codes: 1 = Lehrer, 2 = Obama,
# 3 = Romney. Only lines that contain a speaker tag get a code here.
Transcript <- data.frame(Words = Transcript,
                         Speaker = NA_integer_,
                         stringsAsFactors = FALSE)
Transcript$Speaker[grepl("LEHRER: ", Transcript$Words, fixed = TRUE)] <- 1L
Transcript$Speaker[grepl("OBAMA: ", Transcript$Words, fixed = TRUE)] <- 2L
Transcript$Speaker[grepl("ROMNEY: ", Transcript$Words, fixed = TRUE)] <- 3L
table(Transcript$Speaker)

# Fail early with a clear message instead of producing an empty or
# one-sided term table further down.
if (!all(c(2L, 3L) %in% Transcript$Speaker)) {
  stop("Could not find both 'OBAMA: ' and 'ROMNEY: ' speaker tags in the ",
       "transcript; check that it was downloaded as one element per line.",
       call. = FALSE)
}

# Carry each speaker code forward onto the untagged lines that follow it.
# na.rm = FALSE keeps any leading untagged lines as NA, so the result has the
# same length as the column it is assigned back into.
Transcript$Speaker <- na.locf(Transcript$Speaker, na.rm = FALSE)

# Remove the moderator, plus any lines that precede the first speaker tag.
keepRow <- !is.na(Transcript$Speaker) & Transcript$Speaker != 1L
Transcript <- Transcript[keepRow, ]

# One document per remaining transcript line. VectorSource() on the text
# column keeps the Speaker codes out of the document text.
myCorpus <- Corpus(VectorSource(Transcript$Words))
inspect(myCorpus)

# content_transformer() wraps base tolower() so tm_map() returns documents
# rather than bare character vectors.
myCorpus <- tm_map(myCorpus, content_transformer(tolower)) # Make lowercase
myCorpus <- tm_map(myCorpus, removePunctuation,
                   preserve_intra_word_dashes = FALSE)
myCorpus <- tm_map(myCorpus, removeWords,
                   stopwords("english")) # Remove stopwords
# Drop the speaker tags themselves so they are not counted as terms.
myCorpus <- tm_map(myCorpus, removeWords, c("lehrer", "obama", "romney"))
myCorpus <- tm_map(myCorpus, stemDocument) # Stem words
inspect(myCorpus)

# Dense documents-by-terms count matrix. as.matrix() returns every row and
# column; inspect() is a display helper and is not used for conversion.
docTermMatrix <- as.matrix(DocumentTermMatrix(myCorpus))
sort(colSums(docTermMatrix))
table(colSums(docTermMatrix))

# Per-speaker term totals. drop = FALSE keeps the subset two-dimensional even
# if only one row matches, which colSums() requires.
isObama <- Transcript$Speaker == 2L
isRomney <- Transcript$Speaker == 3L
termCountFrame <- data.frame(Term = colnames(docTermMatrix),
                             stringsAsFactors = FALSE)
termCountFrame$Obama <- colSums(docTermMatrix[isObama, , drop = FALSE])
termCountFrame$Romney <- colSums(docTermMatrix[isRomney, , drop = FALSE])
head(termCountFrame)

# Plot
zp1 <- ggplot(termCountFrame)
zp1 <- zp1 + geom_text(aes(x = Obama, y = Romney, label = Term))
print(zp1)
This comment has been minimized.
This comment has been minimized.
I'm having trouble with this, and any help I can get is much appreciated. After line 45, I get "ERROR: 'x' must be an array of at least two dimensions. ERROR: object 'Romney' not found". Going back to >head(termCountFrame), the result is a table with 2 columns, "Term" and "Obama". There is no Romney. Going back to >Transcript$Speaker[regexpr("ROMNEY: ", Transcript$Words) != -1] <- 3, there was no error message. But the result from >table(Transcript$Speaker) is "3" and "1". At the end of the script, >Transcript$Speaker results in "3". This is odd because it's Romney which is missing from termCountFrame, rather than "2" for Obama, which is not missing. Thanks in advance.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This comment has been minimized.
Replace line 8 with:
library(RCurl)
Transcript <- getURL("https://raw.github.com/dsparks/Test_image/master/Denver_Debate_Transcript.txt")
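and it will work. Note that getURL() returns the whole file as a single string, so split it into lines first, e.g. Transcript <- strsplit(Transcript, "\r?\n")[[1]], before building the data frame.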