Skip to content

Instantly share code, notes, and snippets.

@simkimsia
Created November 23, 2013 11:17
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save simkimsia/7613396 to your computer and use it in GitHub Desktop.
code to scrape and count keywords in articles
# Example console output (from a previous run of the script below):
# > class(my_df[1,1])
# [1] "numeric"
# > as.character(my_df[1,1:2])
# [1] "0" "0"
# Dependencies: XML for HTML/XML parsing, RCurl for HTTP fetching.
library(XML)
library(RCurl)

# Fetch the blog's sitemap and extract every <loc> entry (the article URLs).
sitemap_url <- "http://www.jamesaltucher.com/sitemap.xml"
sitemap_raw <- getURL(sitemap_url)
sitemap_doc <- htmlParse(sitemap_raw, encoding = "UTF-8")
titles <- xpathSApply(sitemap_doc, "//loc", xmlValue)  ## article URLs
# Download a single article page and return the text content of its
# main-content div.
# x: a page URL (character scalar).
traverse_each_page <- function(x) {
  page_doc <- htmlParse(getURI(x))
  xpathSApply(page_doc, '//div[@id="mainContent"]', xmlValue)
}
# Scrape the body text of sitemap entries 2..10.
# NOTE(review): entry 1 is presumably skipped because it is the sitemap
# index itself rather than an article — confirm against the sitemap.
pages <- sapply(titles[2:10], traverse_each_page)

# Remove newline, tab and carriage-return control characters.
# A single character class replaces the hand-pasted alternation, and the
# "+" quantifier collapses runs of control characters into one space.
pages <- gsub("[\n\t\r]+", " ", pages)
pages
# library() errors immediately if tm is missing; require() only warns
# and returns FALSE, which would let the script fail later and further away
# from the real problem.
library(tm)

# Convert the scraped page texts into a corpus (one document per page).
mycorpus <- Corpus(VectorSource(pages))

# Prepare to remove stopwords, i.e. common words like "the".
skipWords <- function(x) removeWords(x, stopwords("english"))

# Cleanup pipeline applied to every document: lowercase, strip punctuation,
# strip numbers, collapse whitespace, drop stopwords.
funcs <- list(tolower, removePunctuation, removeNumbers, stripWhitespace, skipWords)

# Apply all cleanup functions in one pass via tm_reduce.
a <- tm_map(mycorpus, FUN = tm_reduce, tmFuns = funcs)

# Document-term matrix, keeping only terms between 3 and 10 characters long.
mydtm <- DocumentTermMatrix(a, control = list(wordLengths = c(3, 10)))

# inspect() both prints the matrix and returns it, so a single call
# replaces the original's duplicated inspect()/assignment pair; keep the
# returned copy for more convenient viewing.
my_df <- inspect(mydtm)

# Per-term totals across all documents.
apply(mydtm, 2, sum)
# Export the term matrix to CSV with the first column coerced to character.
# Dimensions are taken from my_df instead of the original hard-coded 9x1715,
# so this works for any number of documents/terms.
n_docs <- nrow(my_df)
n_terms <- ncol(my_df)
my_df_new <- matrix(0, n_docs, n_terms)
my_df_new[, 2:n_terms] <- my_df[, 2:n_terms]
# Assigning character values into a numeric matrix coerces the whole
# matrix to character — presumably intended here, since everything is
# written out as text anyway.
my_df_new[, 1] <- as.character(my_df[, 1])
write.csv(my_df_new, "your_file_name.csv", row.names=FALSE)
# Sanity checks on the term matrix: counts are stored as numbers and can be
# coerced to character (as done for the CSV export above).
class(my_df[1,1])
as.character(my_df[1,1:2])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment