Skip to content

Instantly share code, notes, and snippets.

@s13731105
Last active June 2, 2016 22:12
Show Gist options
  • Save s13731105/819f6387220892917e20 to your computer and use it in GitHub Desktop.
Save s13731105/819f6387220892917e20 to your computer and use it in GitHub Desktop.
用 R 進行中文 text Mining
library(XML)
library(RCurl)
library(tm)
library(tmcn)
library(Rwordseg)
# Load every text file under <base_dir>/doc into a tm corpus.
# Only this loading step reads local text files; the rest of the pipeline is
# the same as the online tutorial this script was adapted from.
base_dir <- "C:/test"
# The working directory is still switched, as before, for anything that relies
# on it; the corpus path below is absolute and does not depend on it.
setwd(base_dir)
# "UTF-8" is a file encoding, not a language: it belongs in
# DirSource(encoding = ), while readerControl$language takes a language tag.
d.corpus <- Corpus(
  DirSource(file.path(base_dir, "doc"), encoding = "UTF-8"),
  readerControl = list(language = "zh")
)
# Helper for tm_map(): delete every ASCII letter and digit from the text and
# leave all other characters untouched.
strip_ascii_alnum <- function(text) {
  gsub("[A-Za-z0-9]", "", text)
}

# Clean-up passes, applied in this order: punctuation, numbers, then any
# remaining ASCII letters/digits.
d.corpus <- tm_map(d.corpus, removePunctuation)
d.corpus <- tm_map(d.corpus, removeNumbers)
d.corpus <- tm_map(d.corpus, strip_ascii_alnum)
# Fetch an extra word list over HTTP (one entry per line), convert it with
# tmcn::toTrad() (presumably Simplified -> Traditional Chinese), and register
# the entries with Rwordseg via insertWords() so the segmenter knows them.
dict_url <- "http://wubi.sogou.com/dict/download_txt.php?id=9182"
words <- toTrad(readLines(dict_url))
insertWords(words)
# Segment at most the first 100 documents into words. The code below relies on
# segmentCN(nature = TRUE) naming each word with its tag.
# seq_len(min(...)) keeps the subset inside the corpus when it holds fewer
# than 100 documents (a bare 1:100 would index past the end).
max_docs <- 100L
doc_idx <- seq_len(min(max_docs, length(d.corpus)))
d.corpus <- tm_map(d.corpus[doc_idx], segmentCN, nature = TRUE)

# Keep only the words whose tag is "n" (nouns, going by the variable name) and
# flatten the per-sentence results into a single vector per document.
d.corpus <- tm_map(d.corpus, function(sentence) {
  noun <- lapply(sentence, function(w) {
    w[names(w) == "n"]
  })
  unlist(noun)
})

# Rebuild a corpus from the filtered word vectors for the steps that follow.
d.corpus <- Corpus(VectorSource(d.corpus))
# Stop-word list: the tmcn Chinese defaults plus a few corpus-specific words.
myStopWords <- c(stopwordsCN(), "編輯", "時間", "標題", "發信", "實業", "作者")
d.corpus <- tm_map(d.corpus, removeWords, myStopWords)
# Print the first 20 stop words as a quick sanity check.
head(myStopWords, 20)

# Term-document matrix (terms in rows), keeping terms of length >= 2.
tdm <- TermDocumentMatrix(d.corpus, control = list(wordLengths = c(2, Inf)))
# Preview up to 10 terms x 2 documents; the ranges are clamped so a small
# matrix does not raise "subscript out of bounds".
tdm_rows <- seq_len(min(10L, nrow(tdm)))
tdm_cols <- seq_len(min(2L, ncol(tdm)))
inspect(tdm[tdm_rows, tdm_cols])
library(wordcloud)
# Total frequency of every term across all documents, most frequent first.
m1 <- as.matrix(tdm)
v <- sort(rowSums(m1), decreasing = TRUE)
d <- data.frame(word = names(v), freq = v)

# Draw the word cloud for terms occurring at least 10 times, most frequent
# words placed first. TRUE/FALSE are spelled out because T/F are ordinary
# variables that can be reassigned.
wordcloud(d$word, d$freq, min.freq = 10, random.order = FALSE,
          ordered.colors = FALSE, colors = rainbow(nrow(m1)))
# Document-term matrix (documents in rows), keeping terms of length >= 2.
d.dtm <- DocumentTermMatrix(d.corpus, control = list(wordLengths = c(2, Inf)))
# Preview up to 10 documents x 2 terms; the ranges are clamped so a small
# matrix does not raise "subscript out of bounds".
dtm_rows <- seq_len(min(10L, nrow(d.dtm)))
dtm_cols <- seq_len(min(2L, ncol(d.dtm)))
inspect(d.dtm[dtm_rows, dtm_cols])

# Terms that occur at least 30 times in the corpus.
findFreqTerms(d.dtm, 30)
# Terms whose correlation with "同學" is at least 0.5.
findAssocs(d.dtm, "同學", 0.5)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment