@Ray901
Last active August 29, 2015
Mining PTT with R
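# Scrape posts from a PTT board (here: StupidClown), segment the Chinese text
# with Rwordseg, and plot the most frequent terms as a word cloud.
# Custom insert/remove word lists are pulled from a shared Google Sheet.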
rm(list = ls(all = TRUE))
library(rJava)         # Java bridge required by Rwordseg
library(Rwordseg)      # Chinese word segmentation
library(tm)            # text-mining framework (corpus, document-term matrix)
library(tmcn)          # Chinese helpers for tm (stopwordsCN, toUTF8)
library(wordcloud)
library(XML)           # htmlParse / xpathSApply
library(RCurl)         # getURL
library(googlesheets)  # read word lists from Google Sheets
library(dplyr)         # pipe operator
setbbsName <- "StupidClown"  # PTT board to crawl
indexStart <- 2000           # first index page
indexEnd   <- 2451           # last index page
#################################################################
setGoogleUrl <- "https://docs.google.com/spreadsheets/d/1Fyt697dbjv6ZloW4fFD8CVTX3rp2DNip8wAxqYs91UE/edit?usp=sharing"
segmentCNWords <- setGoogleUrl %>% register_ss()
setInsertWords <- segmentCNWords %>% get_via_csv(ws = "newInsert", encoding = "UTF-8")
setRemoveWords <- segmentCNWords %>% get_via_csv(ws = "newRemove", encoding = "UTF-8")
#################################################################
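# Step 1: collect article URLs from every index page of the board.
# Each index page links its posts from <div class="title"> anchors.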
allUrl <- NULL
for (i in indexStart:indexEnd) {
  url <- paste0('https://www.ptt.cc/bbs/', setbbsName, '/index', i, '.html')
  html <- htmlParse(getURL(url))
  url.list <- xpathSApply(html, "//div[@class='title']/a[@href]", xmlAttrs)
  allUrl <- c(allUrl, paste0('https://www.ptt.cc', url.list))
}
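# Step 2: download each article and extract the main text.
# (Note, not part of the original gist: 18+ boards such as Gossiping sit behind
# an age check; StupidClown does not, but there you would send the cookie, e.g.
#   html <- htmlParse(getURL(url, cookie = "over18=1"), encoding = 'UTF-8') )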
data <- vector(mode = "list", length(allUrl))
for (i in seq_along(data)) {
  html <- htmlParse(getURL(allUrl[i]), encoding = 'UTF-8')
  data[[i]] <- xpathSApply(html, "//div[@id='main-content']", xmlValue)
  if (length(data[[i]]) > 0) {
    # drop parentheses, colons, CJK corner brackets, line breaks and digits,
    # then segment the remaining text into words
    data[[i]] <- segmentCN(
      removePunctuation(
        gsub("[(:)「」\r\n0-9]", "", data[[i]])
      ), nature = TRUE)
  }
  rm(html)
  Sys.sleep(0.1)  # throttle requests
}
data <- data[unlist(lapply(data, length)) > 1]  # keep only articles that yielded more than one token
# Fetch common PTT slang terms from the web to extend the segmentation dictionary
url <- "http://blog.cnyes.com/My/cozywu015/article1225389"
html <- htmlParse(getURL(url), encoding = 'utf-8')
pttword <- xpathSApply(html, "//span[@style='color: rgb(0, 0, 255);']", xmlValue)
pttword <- gsub("[(:)「」\r\n]", "", pttword)
pttword <- toUTF8(pttword)
# register the scraped slang plus the Google-Sheet list with Rwordseg
insertWords(c(pttword, setInsertWords$word))
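# Step 3: build a tm corpus and strip stopwords before counting terms.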
d.corpus <- Corpus(VectorSource(data), list(language = NA))
# extra stopwords: PTT header fragments (作者 author, 標題 title, 時間 time,
# 編輯 edit, 發信/實業 from the "發信站: 批踢踢實業坊" signature line) and 推 (upvote)
myStopWords <- c(stopwordsCN(), setRemoveWords$word, "編輯", "時間", "標題", "發信", "實業", "作者", "推")
d.corpus <- tm_map(d.corpus, removeWords, myStopWords)
dtm1 <- DocumentTermMatrix(d.corpus,
  control = list(
    wordLengths = c(1, Inf),            # allow single-character terms
    bounds = list(global = c(5, Inf)),  # keep terms appearing in at least 5 docs
    removeNumbers = TRUE,
    # removePunctuation = list(preserve_intra_word_dashes = FALSE),
    weighting = weightTf,               # raw term frequency
    encoding = "UTF-8"
  )
)
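# Step 4: rank terms by total frequency across all documents and plot.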
m <- as.matrix(dtm1)
v <- sort(colSums(m), decreasing = TRUE)  # total frequency per term
wordDat <- data.frame(word = names(v), freq = v)
wordcloud(wordDat$word, wordDat$freq, min.freq = 10, max.words = 100,
          random.order = FALSE, ordered.colors = FALSE,
          colors = rainbow(length(rownames(m))))
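# To save the cloud to a file instead of the screen, wrap the call in a
# graphics device (standard base-R pattern, not part of the original gist):
#   png("wordcloud.png", width = 800, height = 800)
#   wordcloud(wordDat$word, wordDat$freq, min.freq = 10, max.words = 100,
#             random.order = FALSE, colors = rainbow(8))
#   dev.off()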