Last active
August 29, 2015 14:05
-
-
Save lsfalimis/4312832039a2cd81c262 to your computer and use it in GitHub Desktop.
怎么对微博做聚类?求教...
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Preprocessing for clustering Weibo posts:
# fetch post text from MySQL, clean it, segment the Chinese words,
# and build a term-document matrix with the tm package.

# MySQL ----
# Install RMySQL first (non-trivial on Mavericks):
# http://lsfalimis.github.io/link--install-rmysql-on-mavericks/
library(RMySQL)
con <- dbConnect(MySQL(), user = "USERNAME", password = "PASSWORD",
                 dbname = "DATABASENAME", host = "HOST")
# Ensure Chinese text is not returned as "?" placeholders
dbSendQuery(con, 'set names utf8')
# COLUMN holds the Weibo post text
rs <- dbSendQuery(con, "select COLUMN from TABLE limit 100")
data <- fetch(rs, n = -1)
# Release the result set and close the connection once the data is local
dbClearResult(rs)
dbDisconnect(con)

# fetch() returns a data frame; extract the text column as a character
# vector -- running gsub() on the whole data frame would coerce it to a
# single "c(...)" string and destroy the data.
texts <- data[[1]]

# Clean ----
# Strip URLs
texts <- gsub(pattern = "http:[a-zA-Z\\/\\.0-9]+", replacement = "", x = texts)

# Segment ----
library(Rwordseg)
# Segment each post into Chinese words
corpus <- lapply(X = texts, FUN = segmentCN)

# Convert into corpus ----
library(tm)
# Turn the list of segmented posts into a tm corpus
doc.cor <- Corpus(VectorSource(corpus))
# Posts contain many English words, so handle English too: lower-case
# everything before English stop words are removed.  content_transformer()
# keeps each document a PlainTextDocument (required by tm >= 0.6; a bare
# tolower would degrade documents to character vectors).
doc.cor <- tm_map(doc.cor, content_transformer(tolower))
# Drop digits
doc.cor <- tm_map(doc.cor, removeNumbers)
# Earlier cleaning steps leave extra whitespace; collapse it
doc.cor <- tm_map(doc.cor, stripWhitespace)

# Chinese stop words, one per line; read as UTF-8 so they match the text
stopwordsCN <- readLines("stopwordsCN.txt", encoding = "UTF-8")
# tm's default minimum term length is 3; allow single-character terms,
# which are common in Chinese
control <- list(stopwords = stopwordsCN, wordLengths = c(1, Inf))

# Terms as rows
doc.dtm <- TermDocumentMatrix(doc.cor, control = control)
# Documents as rows (alternative)
# doc.dtm <- DocumentTermMatrix(doc.cor, control = control)
inspect(doc.dtm)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment