Created
December 11, 2012 16:43
-
-
Save dsparks/4260167 to your computer and use it in GitHub Desktop.
Denver debate analysis I
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Denver debate analysis I
#
# Reads the transcript of the 3 October 2012 Denver presidential debate,
# labels every line with its speaker, builds a document-term matrix of the
# candidates' (lower-cased, stop-word-free, stemmed) words, and plots each
# term at (times used by Obama, times used by Romney).

doInstall <- TRUE # Change to FALSE if you don't want packages installed.
# "SnowballC" supersedes the archived "Snowball" package; tm's stemDocument()
# relies on it for stemming.
toInstall <- c("zoo", "tm", "ggplot2", "SnowballC")
if (doInstall) {
  install.packages(toInstall, repos = "https://cran.r-project.org")
}
invisible(lapply(toInstall, library, character.only = TRUE))

# From: http://www.cnn.com/2012/10/03/politics/debate-transcript/index.html
Transcript <- readLines("https://raw.github.com/dsparks/Test_image/master/Denver_Debate_Transcript.txt")
head(Transcript, 20)

# One row per transcript line. A line that opens a speaker's turn carries the
# "NAME: " prefix; continuation lines start out as NA.
# Speaker codes: 1 = Lehrer (moderator), 2 = Obama, 3 = Romney.
Transcript <- data.frame(Words = Transcript, Speaker = NA,
                         stringsAsFactors = FALSE)
Transcript$Speaker[grepl("LEHRER: ", Transcript$Words, fixed = TRUE)] <- 1
Transcript$Speaker[grepl("OBAMA: ", Transcript$Words, fixed = TRUE)] <- 2
Transcript$Speaker[grepl("ROMNEY: ", Transcript$Words, fixed = TRUE)] <- 3
table(Transcript$Speaker)

# Carry the most recent speaker forward onto continuation lines.
# na.rm = FALSE keeps the result the same length as the data frame even when
# the transcript starts with unlabelled lines.
Transcript$Speaker <- na.locf(Transcript$Speaker, na.rm = FALSE)

# Remove moderator (and any lines that precede the first speaker label):
Transcript <- Transcript[!is.na(Transcript$Speaker) & Transcript$Speaker != 1, ]

# Fail early with a clear message if the download did not yield separate
# lines for both candidates (e.g. the whole file arrived as a single string).
if (!all(c(2, 3) %in% Transcript$Speaker)) {
  stop("Expected transcript lines for both OBAMA and ROMNEY; ",
       "check that the transcript was read one line per element.",
       call. = FALSE)
}

# One document per remaining transcript line, in row order, so row i of the
# document-term matrix corresponds to row i of Transcript.
myCorpus <- Corpus(VectorSource(Transcript$Words))
inspect(myCorpus)
# content_transformer() lets a plain character function be used by tm_map().
myCorpus <- tm_map(myCorpus, content_transformer(tolower)) # Make lowercase
myCorpus <- tm_map(myCorpus, removePunctuation,
                   preserve_intra_word_dashes = FALSE)
myCorpus <- tm_map(myCorpus, removeWords, stopwords("english")) # Remove stopwords
myCorpus <- tm_map(myCorpus, removeWords, c("lehrer", "obama", "romney"))
myCorpus <- tm_map(myCorpus, stemDocument) # Stem words
inspect(myCorpus)

docTermMatrix <- DocumentTermMatrix(myCorpus)
inspect(docTermMatrix)
# inspect() is for display only; as.matrix() gives the dense
# documents-by-terms count matrix.
docTermMatrix <- as.matrix(docTermMatrix)
sort(colSums(docTermMatrix))
table(colSums(docTermMatrix))

# Per-candidate term counts. drop = FALSE keeps the row subset a matrix even
# when a speaker has exactly one line; otherwise colSums() fails with
# "'x' must be an array of at least two dimensions".
termCountFrame <- data.frame(Term = colnames(docTermMatrix),
                             stringsAsFactors = FALSE)
termCountFrame$Obama <- colSums(
  docTermMatrix[Transcript$Speaker == 2, , drop = FALSE]
)
termCountFrame$Romney <- colSums(
  docTermMatrix[Transcript$Speaker == 3, , drop = FALSE]
)
head(termCountFrame)

# Plot
zp1 <- ggplot(termCountFrame)
zp1 <- zp1 + geom_text(aes(x = Obama, y = Romney, label = Term))
print(zp1)
I'm having trouble with this, and any help I can get is much appreciated. After line 45, I get "ERROR: 'x' must be an array of at least two dimensions. ERROR: object 'Romney' not found". Going back to >head(termCountFrame), the result is a table with 2 columns, "Term" and "Obama". There is no Romney. Going back to >Transcript$Speaker[regexpr("ROMNEY: ", Transcript$Words) != -1] <- 3, there was no error message. But the result from >table(Transcript$Speaker) is "3" and "1". At the end of the script, >Transcript$Speaker results in "3". This is odd because it's Romney that is missing from termCountFrame, rather than "2" for Obama, which is not missing. Thanks in advance.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Replace line 8 with:
library(RCurl)
Transcript <- getURL("https://raw.github.com/dsparks/Test_image/master/Denver_Debate_Transcript.txt")
and it will work