Created
November 23, 2013 11:17
-
-
Save simkimsia/7613396 to your computer and use it in GitHub Desktop.
code to scrape and count keywords in articles
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
> class(my_df[1,1])
[1] "numeric"
> as.character(my_df[1,1:2])
[1] "0" "0"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(XML)
library(RCurl)

# Download the site's sitemap and collect the value of every <loc> element.
url.link <- "http://www.jamesaltucher.com/sitemap.xml"
blog <- getURL(url.link)

# asText = TRUE states explicitly that `blog` is the downloaded document text,
# not a file name or URL, instead of relying on the parser to guess.
# NOTE(review): the HTML parser is presumably used so that the un-prefixed
# "//loc" XPath matches despite the sitemap's default XML namespace -- confirm
# before switching to xmlParse().
blog <- htmlParse(blog, asText = TRUE, encoding = "UTF-8")

# Despite its name, `titles` holds the text of the <loc> elements (these are
# passed on as page addresses further down), not article titles.
titles <- xpathSApply(blog, "//loc", xmlValue)
# Download one page and return the text of its main content area.
#
# x: address of the page to fetch (a single character string).
#
# Returns a length-one character vector: the text of every
# <div id="mainContent"> on the page joined by single spaces, or "" when the
# page has no such element.
traverse_each_page <- function(x) {
  # asText = TRUE: getURI() returns the page source, so tell the parser it is
  # document text rather than a file name.
  tmp <- htmlParse(getURI(x), asText = TRUE)
  content <- xpathSApply(tmp, '//div[@id="mainContent"]', xmlValue)
  # The XPath query yields a zero-length result when nothing matches and
  # several values when more than one <div> matches. Collapsing to a single
  # string keeps the return shape the same for every page, so the caller gets
  # a plain character vector (one element per page) instead of a ragged list.
  paste(unlist(content), collapse = " ")
}
# Scrape sitemap entries 2 through 10 (at most nine pages). head() keeps the
# same selection as titles[2:10] but does not produce NA addresses when the
# sitemap has fewer than ten entries.
page_urls <- head(titles[-1], 9)
pages <- sapply(page_urls, traverse_each_page)

# Replace newlines, tabs and carriage returns with spaces.
pages <- gsub("[\n\t\r]", " ", pages)
pages

library(tm)

# Convert the page texts into a corpus.
mycorpus <- Corpus(VectorSource(pages))

# Remove stopwords, i.e. common words like 'the'.
skipWords <- function(x) removeWords(x, stopwords("english"))

# Cleaning steps, listed in the order they are meant to run.
funcs <- list(
  tolower,
  removePunctuation,
  removeNumbers,
  stripWhitespace,
  skipWords
)

# tm_reduce() folds its function list from right to left, so the list is
# reversed here to make the steps run in the order written above (lower-case
# first, stopword removal last). Without rev() the stopwords would be removed
# before lower-casing and capitalised stopwords such as "The" would survive.
a <- tm_map(mycorpus, FUN = tm_reduce, tmFuns = rev(funcs))

# Document-term matrix restricted to words of 3 to 10 characters.
mydtm <- DocumentTermMatrix(a, control = list(wordLengths = c(3, 10)))
inspect(mydtm)

# Dense documents-by-terms count matrix for more convenient viewing.
# as.matrix() is used rather than capturing the return value of inspect(),
# which is a display function.
my_df <- as.matrix(mydtm)

# Total count of each term across all documents.
colSums(my_df)

# Build the matrix that is exported, sized from the data rather than from a
# fixed 9 x 1715 shape that only fits one particular scrape.
my_df_new <- matrix(0, nrow(my_df), ncol(my_df))
my_df_new[, -1] <- my_df[, -1]
# Assigning character values into the matrix coerces the whole matrix to
# character, so every count is written to the CSV as a quoted string. The
# matrix has no dimnames, so the CSV gets default column headers, not the terms.
my_df_new[, 1] <- as.character(my_df[, 1])
write.csv(my_df_new, "your_file_name.csv", row.names = FALSE)

# Quick checks on the type of the counts and their character form.
class(my_df[1, 1])
as.character(my_df[1, 1:2])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment