Skip to content

Instantly share code, notes, and snippets.

@simkimsia
Created November 23, 2013 11:17
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save simkimsia/7613396 to your computer and use it in GitHub Desktop.
code to scrape and count keywords in articles
# Example console output (from a previous run of the script below):
# > class(my_df[1,1])
# [1] "numeric"
# > as.character(my_df[1,1:2])
# [1] "0" "0"
# Dependencies: XML for HTML/XML parsing, RCurl for HTTP fetching.
library(XML)
library(RCurl)

# Fetch the blog's sitemap and extract every <loc> entry (the article URLs).
sitemap_url <- "http://www.jamesaltucher.com/sitemap.xml"
sitemap_raw <- getURL(sitemap_url)
sitemap_doc <- htmlParse(sitemap_raw, encoding = "UTF-8")
titles <- xpathSApply(sitemap_doc, "//loc", xmlValue)  ## article URLs
# Download a single article page and return the text content of its
# main-content div.
# x: a page URL (character scalar).
traverse_each_page <- function(x) {
  page_doc <- htmlParse(getURI(x))
  xpathSApply(page_doc, '//div[@id="mainContent"]', xmlValue)
}
# Scrape the body text of sitemap entries 2..10.
# NOTE(review): entry 1 is presumably skipped because it is the sitemap
# index itself rather than an article — confirm against the sitemap.
pages <- sapply(titles[2:10], traverse_each_page)

# Remove newline, tab and carriage-return control characters.
# A single character class replaces the hand-pasted alternation, and the
# "+" quantifier collapses runs of control characters into one space.
pages <- gsub("[\n\t\r]+", " ", pages)
pages
# library() errors immediately if tm is missing; require() only warns
# and returns FALSE, which would let the script fail later and further away
# from the real problem.
library(tm)

# Convert the scraped page texts into a corpus (one document per page).
mycorpus <- Corpus(VectorSource(pages))

# Prepare to remove stopwords, i.e. common words like "the".
skipWords <- function(x) removeWords(x, stopwords("english"))

# Cleanup pipeline applied to every document: lowercase, strip punctuation,
# strip numbers, collapse whitespace, drop stopwords.
funcs <- list(tolower, removePunctuation, removeNumbers, stripWhitespace, skipWords)

# Apply all cleanup functions in one pass via tm_reduce.
a <- tm_map(mycorpus, FUN = tm_reduce, tmFuns = funcs)

# Document-term matrix, keeping only terms between 3 and 10 characters long.
mydtm <- DocumentTermMatrix(a, control = list(wordLengths = c(3, 10)))

# inspect() both prints the matrix and returns it, so a single call
# replaces the original's duplicated inspect()/assignment pair; keep the
# returned copy for more convenient viewing.
my_df <- inspect(mydtm)

# Per-term totals across all documents.
apply(mydtm, 2, sum)
# Export the term matrix to CSV with the first column coerced to character.
# Dimensions are taken from my_df instead of the original hard-coded 9x1715,
# so this works for any number of documents/terms.
n_docs <- nrow(my_df)
n_terms <- ncol(my_df)
my_df_new <- matrix(0, n_docs, n_terms)
my_df_new[, 2:n_terms] <- my_df[, 2:n_terms]
# Assigning character values into a numeric matrix coerces the whole
# matrix to character — presumably intended here, since everything is
# written out as text anyway.
my_df_new[, 1] <- as.character(my_df[, 1])
write.csv(my_df_new, "your_file_name.csv", row.names=FALSE)
# Sanity checks on the term matrix: counts are stored as numbers and can be
# coerced to character (as done for the CSV export above).
class(my_df[1,1])
as.character(my_df[1,1:2])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment