# 2016-07-26
install.packages("tm")
library(tm)
# mobile news data
news <- read.csv("mobile2014.csv", stringsAsFactors = F)
news.corpus <- Corpus(VectorSource(news$x))
news.corpus <- tm_map(news.corpus, stemDocument, language = "english")
tdm <- TermDocumentMatrix(news.corpus,
                          control = list(removeNumbers = T,
                                         removePunctuation = T,
                                         stopwords = stopwords("SMART"),
                                         weighting = weightTfIdf))
dim(tdm)
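# A minimal sketch (not in the original): inspect a small corner of the
# weighted matrix to confirm the tf-idf values look sensible.
inspect(tdm[1:5, 1:3])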
# install.packages("slam")
library(slam)  # rollup() for sparse matrix sums
word.count <- as.array(rollup(tdm, 2))  # summed tf-idf weight per term
word.order <- order(word.count, decreasing = T)
freq.word <- word.order[1:30]
row.names(tdm[freq.word, ])  # the 30 highest-weighted terms
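# A minimal sketch (not in the original): pair the top-30 terms with their
# summed tf-idf weights to sanity-check the ranking above.
top30 <- word.order[1:30]
data.frame(term = row.names(tdm)[top30], weight = word.count[top30])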
freq.word <- word.order[1:1000]  # widen to the top 1,000 terms for LSA
# latent semantic analysis
install.packages("lsa")
library(lsa)
news.lsa <- lsa(tdm, 30)  # may run out of memory on the full matrix; if so, reduce the dimensions
gc()  # run the garbage collector to free memory
news.lsa <- lsa(tdm[freq.word, ], 30)  # 30 dimensions on the top-1,000 terms
news.lsa$tk[, 1]  # term loadings on the first dimension
for (i in 1:30) {  # show the most relevant words of each dimension
  print(i)
  importance <- order(abs(news.lsa$tk[, i]), decreasing = T)
  print(news.lsa$tk[importance[1:10], i])
}
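# A minimal sketch (not in the original): compare two documents in the reduced
# LSA space. dk holds document coordinates, sk the singular values; scaling by
# sk weights each dimension by its importance.
doc.space <- news.lsa$dk %*% diag(news.lsa$sk)
cosine(doc.space[1, ], doc.space[2, ])  # cosine similarity of documents 1 and 2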
install.packages("GPArotation")
library(GPArotation) # varimax rotation
tk <- Varimax(news.lsa$tk)$loadings
for(i in 1:30){ # show relevant words of each dimension
print(i)
importance <- order(abs(tk[,i]), decreasing = T)
print(tk[importance[1:10], i])
}
# mobile news data: rebuild the term-document matrix for LDA
news <- read.csv("mobile2014.csv", stringsAsFactors = F)
news.corpus <- Corpus(VectorSource(news$x))
news.corpus <- tm_map(news.corpus, stemDocument, language = "english")
tdm <- TermDocumentMatrix(news.corpus,
                          control = list(removeNumbers = T,
                                         removePunctuation = T,
                                         wordLengths = c(3, Inf),
                                         stopwords = stopwords("SMART"),
                                         weighting = weightTf))  # LDA needs raw integer counts, so weightTf rather than tf-idf
dim(tdm)
# LDA
# install.packages("topicmodels")
library(topicmodels)  # dtm2ldaformat()
# install.packages("lda")
library(lda)  # lda.collapsed.gibbs.sampler(), top.topic.words()
# install.packages("SnowballC")
library(SnowballC)  # stemming backend used by stemDocument()
# dtm2ldaformat() expects documents in rows, so transpose the term-document matrix
ldaform <- dtm2ldaformat(t(tdm), omit_empty = T)
result.lda <- lda.collapsed.gibbs.sampler(documents = ldaform$documents,
                                          K = 15,                 # number of topics
                                          vocab = ldaform$vocab,  # vocabulary
                                          num.iterations = 5000,  # Gibbs sampling iterations
                                          burnin = 1000,          # initial iterations to discard
                                          alpha = 0.01,  # document-topic prior: above 1 spreads a document over many topics, below 1 concentrates it on a few
                                          eta = 0.01)    # topic-word prior: smaller values give sparser, more focused topics
result.lda$topics         # topic-by-word assignment counts
lw <- as.matrix(top.topic.words(result.lda$topics))  # top 20 words per topic
result.lda$topic_sums     # total word assignments per topic
result.lda$document_sums  # topic assignment counts per document
for (j in 1:15) {  # show the top 5 words of each topic
  cat(j, ": ", paste(lw[1:5, j], collapse = ", "), "\n", sep = "")
}
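# A minimal sketch (not in the original): turn document_sums (topics x
# documents, raw assignment counts) into per-document topic proportions.
topic.props <- t(result.lda$document_sums) / colSums(result.lda$document_sums)
round(topic.props[1:5, ], 2)  # topic mix of the first five documents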
# Daum review data (Korean)
new.reviews <- read.csv("reviews.csv", stringsAsFactors = F)
review.corpus <- Corpus(VectorSource(new.reviews$x))
library(KoNLP)
ko.words.noun <- function(doc) {  # tokenizer: extract Korean nouns
  extractNoun(as.character(doc))
}
options(mc.cores = 1)  # single core: the Java-based KoNLP tokenizer is not fork-safe
tdm2 <- TermDocumentMatrix(review.corpus,
                           control = list(tokenize = ko.words.noun,
                                          wordLengths = c(1, Inf),
                                          removePunctuation = T,
                                          removeNumbers = T))
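# A minimal sketch (not in the original): spot-check the noun tokenizer on the
# first review before building topics on top of it.
ko.words.noun(new.reviews$x[1])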
ldaform2 <- dtm2ldaformat(t(tdm2), omit_empty = T)  # transpose: documents in rows
result.lda2 <- lda.collapsed.gibbs.sampler(documents = ldaform2$documents,
                                           K = 3,                   # number of topics
                                           vocab = ldaform2$vocab,  # vocabulary
                                           num.iterations = 5000,   # Gibbs sampling iterations
                                           burnin = 1000,           # initial iterations to discard
                                           alpha = 0.01,  # document-topic prior (below 1: few topics per review)
                                           eta = 0.01)    # topic-word prior
result.lda2$topics
lw2 <- as.matrix(top.topic.words(result.lda2$topics))  # top 20 words per topic
result.lda2$topic_sums
result.lda2$document_sums
print(lw2[1:20, 3])  # show the top 20 words in Topic 3
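# A minimal sketch (not in the original): label each review with its dominant
# topic and count how many reviews fall under each of the 3 topics.
dominant <- apply(result.lda2$document_sums, 2, which.max)
table(dominant)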