Last active
August 29, 2015 14:05
-
-
Save lsfalimis/4312832039a2cd81c262 to your computer and use it in GitHub Desktop.
怎么对微博做聚类?求教...
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Preprocessing for clustering Weibo posts:
# fetch post text from MySQL, clean it, segment the Chinese words,
# and build a term-document matrix with the tm package.

# MySQL ----
# Install RMySQL first (non-trivial on Mavericks):
# http://lsfalimis.github.io/link--install-rmysql-on-mavericks/
library(RMySQL)
con <- dbConnect(MySQL(), user = "USERNAME", password = "PASSWORD",
                 dbname = "DATABASENAME", host = "HOST")
# Ensure Chinese text is not returned as "?" placeholders
dbSendQuery(con, 'set names utf8')
# COLUMN holds the Weibo post text
rs <- dbSendQuery(con, "select COLUMN from TABLE limit 100")
data <- fetch(rs, n = -1)
# Release the result set and close the connection once the data is local
dbClearResult(rs)
dbDisconnect(con)

# fetch() returns a data frame; extract the text column as a character
# vector -- running gsub() on the whole data frame would coerce it to a
# single "c(...)" string and destroy the data.
texts <- data[[1]]

# Clean ----
# Strip URLs
texts <- gsub(pattern = "http:[a-zA-Z\\/\\.0-9]+", replacement = "", x = texts)

# Segment ----
library(Rwordseg)
# Segment each post into Chinese words
corpus <- lapply(X = texts, FUN = segmentCN)

# Convert into corpus ----
library(tm)
# Turn the list of segmented posts into a tm corpus
doc.cor <- Corpus(VectorSource(corpus))
# Posts contain many English words, so handle English too: lower-case
# everything before English stop words are removed.  content_transformer()
# keeps each document a PlainTextDocument (required by tm >= 0.6; a bare
# tolower would degrade documents to character vectors).
doc.cor <- tm_map(doc.cor, content_transformer(tolower))
# Drop digits
doc.cor <- tm_map(doc.cor, removeNumbers)
# Earlier cleaning steps leave extra whitespace; collapse it
doc.cor <- tm_map(doc.cor, stripWhitespace)

# Chinese stop words, one per line; read as UTF-8 so they match the text
stopwordsCN <- readLines("stopwordsCN.txt", encoding = "UTF-8")
# tm's default minimum term length is 3; allow single-character terms,
# which are common in Chinese
control <- list(stopwords = stopwordsCN, wordLengths = c(1, Inf))

# Terms as rows
doc.dtm <- TermDocumentMatrix(doc.cor, control = control)
# Documents as rows (alternative)
# doc.dtm <- DocumentTermMatrix(doc.cor, control = control)
inspect(doc.dtm)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment