@Ray901
Last active August 29, 2015
Mining PTT with R
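# Scrape posts from a PTT board (here: StupidClown), segment the Chinese text
# with Rwordseg, and plot the most frequent terms as a word cloud.
# Custom insert/remove word lists are pulled from a shared Google Sheet.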
rm(list = ls(all = TRUE))
library(rJava)         # Java bridge required by Rwordseg
library(Rwordseg)      # Chinese word segmentation
library(tm)            # text-mining framework (corpus, document-term matrix)
library(tmcn)          # Chinese helpers for tm (stopwordsCN, toUTF8)
library(wordcloud)
library(XML)           # htmlParse / xpathSApply
library(RCurl)         # getURL
library(googlesheets)  # read word lists from Google Sheets
library(dplyr)         # pipe operator
setbbsName <- "StupidClown"  # PTT board to crawl
indexStart <- 2000           # first index page
indexEnd   <- 2451           # last index page
#################################################################
setGoogleUrl <- "https://docs.google.com/spreadsheets/d/1Fyt697dbjv6ZloW4fFD8CVTX3rp2DNip8wAxqYs91UE/edit?usp=sharing"
segmentCNWords <- setGoogleUrl %>% register_ss()
setInsertWords <- segmentCNWords %>% get_via_csv(ws = "newInsert", encoding = "UTF-8")
setRemoveWords <- segmentCNWords %>% get_via_csv(ws = "newRemove", encoding = "UTF-8")
#################################################################
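# Step 1: collect article URLs from every index page of the board.
# Each index page links its posts from <div class="title"> anchors.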
allUrl <- NULL
for (i in indexStart:indexEnd) {
  url <- paste0('https://www.ptt.cc/bbs/', setbbsName, '/index', i, '.html')
  html <- htmlParse(getURL(url))
  url.list <- xpathSApply(html, "//div[@class='title']/a[@href]", xmlAttrs)
  allUrl <- c(allUrl, paste0('https://www.ptt.cc', url.list))
}
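# Step 2: download each article and extract the main text.
# (Note, not part of the original gist: 18+ boards such as Gossiping sit behind
# an age check; StupidClown does not, but there you would send the cookie, e.g.
#   html <- htmlParse(getURL(url, cookie = "over18=1"), encoding = 'UTF-8') )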
data <- vector(mode = "list", length(allUrl))
for (i in seq_along(data)) {
  html <- htmlParse(getURL(allUrl[i]), encoding = 'UTF-8')
  data[[i]] <- xpathSApply(html, "//div[@id='main-content']", xmlValue)
  if (length(data[[i]]) > 0) {
    # drop parentheses, colons, CJK corner brackets, line breaks and digits,
    # then segment the remaining text into words
    data[[i]] <- segmentCN(
      removePunctuation(
        gsub("[(:)「」\r\n0-9]", "", data[[i]])
      ), nature = TRUE)
  }
  rm(html)
  Sys.sleep(0.1)  # throttle requests
}
data <- data[unlist(lapply(data, length)) > 1]  # keep only articles that yielded more than one token
# Fetch common PTT slang terms from the web to extend the segmentation dictionary
url <- "http://blog.cnyes.com/My/cozywu015/article1225389"
html <- htmlParse(getURL(url), encoding = 'utf-8')
pttword <- xpathSApply(html, "//span[@style='color: rgb(0, 0, 255);']", xmlValue)
pttword <- gsub("[(:)「」\r\n]", "", pttword)
pttword <- toUTF8(pttword)
# register the scraped slang plus the Google-Sheet list with Rwordseg
insertWords(c(pttword, setInsertWords$word))
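# Step 3: build a tm corpus and strip stopwords before counting terms.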
d.corpus <- Corpus(VectorSource(data), list(language = NA))
# extra stopwords: PTT header fragments (作者 author, 標題 title, 時間 time,
# 編輯 edit, 發信/實業 from the "發信站: 批踢踢實業坊" signature line) and 推 (upvote)
myStopWords <- c(stopwordsCN(), setRemoveWords$word, "編輯", "時間", "標題", "發信", "實業", "作者", "推")
d.corpus <- tm_map(d.corpus, removeWords, myStopWords)
dtm1 <- DocumentTermMatrix(d.corpus,
  control = list(
    wordLengths = c(1, Inf),            # allow single-character terms
    bounds = list(global = c(5, Inf)),  # keep terms appearing in at least 5 docs
    removeNumbers = TRUE,
    # removePunctuation = list(preserve_intra_word_dashes = FALSE),
    weighting = weightTf,               # raw term frequency
    encoding = "UTF-8"
  )
)
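# Step 4: rank terms by total frequency across all documents and plot.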
m <- as.matrix(dtm1)
v <- sort(colSums(m), decreasing = TRUE)  # total frequency per term
wordDat <- data.frame(word = names(v), freq = v)
wordcloud(wordDat$word, wordDat$freq, min.freq = 10, max.words = 100,
          random.order = FALSE, ordered.colors = FALSE,
          colors = rainbow(length(rownames(m))))
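# To save the cloud to a file instead of the screen, wrap the call in a
# graphics device (standard base-R pattern, not part of the original gist):
#   png("wordcloud.png", width = 800, height = 800)
#   wordcloud(wordDat$word, wordDat$freq, min.freq = 10, max.words = 100,
#             random.order = FALSE, colors = rainbow(8))
#   dev.off()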