# The data can be found at https://github.com/tdhopper/topic-modeling-datasets/blob/master/README.md
# A stop word list can be found at https://github.com/stanfordnlp/CoreNLP/blob/master/data/edu/stanford/nlp/patterns/surface/stopwords.txt
# The data-cleaning steps follow http://www.mjdenny.com/Text_Processing_In_R.html
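# Overall pipeline, as implemented below:
#   1. createDataFile() reads the raw bibliography file 'cgcbib.txt', extracts each
#      abstract, cleans it, and prints one abstract per line.
#   2. The cleaned abstracts (read back from 'data.txt') are turned into a
#      document-term matrix with the tm package, dropping stop words and rare terms.
#   3. HDP() runs a direct-assignment Gibbs sampler for a hierarchical Dirichlet
#      process topic model on that matrix.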
library(stringr)  # string cleaning helpers (str_replace_all, str_trim)
library(gtools)   # rdirichlet(), used when resampling the global topic weights
createDataFile = function(){
  # Stream the raw bibliography file and print one cleaned abstract per line.
  processFile = function(filepath) {
    con = file(filepath, "r")
    inAbstract = FALSE
    abs = ""
    while (TRUE) {
      line = readLines(con, n = 1)
      if (length(line) == 0) {
        break
      }
      if (startsWith(line, " Abstract:")){
        # An abstract starts here; drop the "Abstract:" label and any non-ASCII bytes.
        inAbstract = TRUE
        line = substring(iconv(line, "latin1", "ASCII", sub = ""), 13)
      }
      if (startsWith(line, " -------------------")){
        # A record separator ends the current abstract: clean it and print it.
        inAbstract = FALSE
        # Remove everything that is not a letter (you may want to keep more
        # characters, e.g. digits, in your actual analyses).
        temp <- stringr::str_replace_all(str_trim(abs), "[^a-zA-Z\\s]", " ")
        temp <- tolower(str_trim(temp))
        # Collapse runs of whitespace down to a single space.
        temp <- stringr::str_replace_all(temp, "[\\s]+", " ")
        if (temp != ""){
          cat(temp)
          cat('\n')
        }
        abs = ""
      }
      if (inAbstract){
        abs = paste(abs, line, sep = " ")
      }
    }
    close(con)
  }
  processFile('cgcbib.txt')
}
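# processFile() writes the cleaned abstracts to stdout with cat(). To produce the
# 'data.txt' file read further below, one option (an assumption, not shown in the
# original gist) is to capture that output with sink():
#   sink('data.txt')
#   createDataFile()
#   sink()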
HDP = function(obs, test_data){
  # Direct-assignment Gibbs sampler for a hierarchical Dirichlet process topic model.
  # Note: test_data is accepted but not used below.
  init_K <- 62      # number of topics used for the random initialization
  gamma_a <- .1     # gamma is fixed at gamma_a / gamma_b (see init.gamma)
  gamma_b <- 1
  h_sym <- 0.5      # symmetric Dirichlet smoothing of the topic-word distributions
  alpha0_a <- .1    # alpha is fixed at alpha0_a / alpha0_b (see init.alpha)
  alpha0_b <- 1
  beta_u <- 1       # stick mass not yet assigned to any instantiated topic
  maxIter <- 1000   # defined but not used; the main loop below runs 50 sweeps
  cntm <- as.matrix(obs)              # document-term count matrix
  numw_doc <- rowSums(cntm)           # number of words in each document
  docs_sel <- as.vector(numw_doc > 1)
  cntm <- cntm[docs_sel, ]            # drop documents with at most one word
  num_doc <- dim(cntm)[1]
  num_word <- dim(cntm)[2]            # vocabulary size
  numw_doc <- rowSums(cntm)
  word_occ <- colSums(cntm)
  K <- init_K                         # current number of instantiated topics
  topics <- vector()                  # labels of the instantiated topics
  init.Z = function(){
    # Give every word token a topic drawn uniformly from the init_K initial topics.
    Z <- list()
    freq <- rep(1. / init_K, init_K)
    for(i in 1:num_doc){
      # rmultinom() returns an init_K x n one-hot matrix; max.col() converts it to indices.
      word_t <- rmultinom(numw_doc[i], 1, freq)
      Z[[i]] <- max.col(t(word_t))
    }
    return(Z)
  }
  init.njk = function(Z){
    # njk[[j]][[k]]: number of words in document j currently assigned to topic k.
    njk <- list()
    for(i in 1:num_doc){
      tmp <- list()
      for(j in 1:K)
        tmp[[as.character(j)]] <- 0
      for(j in seq_along(Z[[i]])){
        zij <- as.character(Z[[i]][[j]])
        tmp[[zij]] <- tmp[[zij]] + 1
      }
      njk[[i]] <- tmp
    }
    return(njk)
  }
  init.nkw = function(Z){
    # nkw[[k]][w]: number of occurrences of vocabulary word w assigned to topic k.
    # Word tokens of a document are enumerated in column order of cntm; sample.Z()
    # below walks the tokens in the same order, so the Z indices stay consistent.
    nkw <- list()
    for(i in 1:K){
      nkw[[as.character(i)]] <- array(0, num_word)
    }
    for(i in 1:num_doc){
      idx <- 0
      for(j in 1:num_word){
        if (cntm[[i, j]] != 0)
          for(k in 1:cntm[[i, j]]){
            idx <- idx + 1
            zij <- as.character(Z[[i]][[idx]])
            nkw[[zij]][[j]] <- nkw[[zij]][[j]] + 1
          }
      }
    }
    return(nkw)
  }
  init.nk = function(Z){
    # nk[[k]]: total number of words assigned to topic k over all documents.
    nk <- list()
    for(i in 1:K){
      nk[[as.character(i)]] <- 0
    }
    for(i in 1:num_doc){
      idx <- 0
      for(j in 1:num_word)
        if (cntm[[i, j]] > 0)
          for(k in 1:cntm[[i, j]]){
            idx <- idx + 1
            zij <- as.character(Z[[i]][idx])
            nk[[zij]] <- nk[[zij]] + 1
          }
    }
    return(nk)
  }
  init.alpha = function(alpha0_a, alpha0_b){
    # alpha is fixed at its prior mean; resampling it is left as a commented-out stub below.
    return(alpha0_a / alpha0_b)
  }
  init.gamma = function(gamma_a, gamma_b){
    return(gamma_a / gamma_b)
  }
  init.beta = function(){
    # Stick-breaking initialization of the global topic weights: each non-empty
    # topic gets a piece of the remaining stick beta_u, and empty initial topics
    # are discarded (K is shrunk accordingly).
    beta <- list()
    new_K <- 0
    for(k in 1:K){
      if(nk[[as.character(k)]] > 0){
        new_K <- new_K + 1
        beta_l <- rbeta(1, 1, gamma)
        beta[[as.character(k)]] <- beta_l * beta_u
        beta_u <<- beta_u * (1 - beta_l)
        topics <<- c(topics, k)
      }
    }
    K <<- new_K
    return(beta)
  }
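  # The stick-breaking (GEM) construction used in init.beta() and again in
  # sample.Z(): a new topic takes a Beta(1, gamma) fraction of the remaining
  # mass beta_u, and beta_u keeps the rest, so the active beta_k plus beta_u
  # always sum to 1. A tiny standalone illustration (not part of the sampler):
  #   u <- 1; b <- numeric(5)
  #   for (k in 1:5) { s <- rbeta(1, 1, 0.1); b[k] <- s * u; u <- u * (1 - s) }
  #   sum(b) + u   # equals 1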
  sample.Z = function(Z){
    # One Gibbs sweep over every word token: resample its topic from the
    # collapsed conditional, creating new topics and deleting emptied ones.
    for(j in 1:num_doc){
      w <- 0
      for(w1 in 1:num_word)
        if (cntm[[j, w1]] > 0)
          for(w2 in 1:cntm[[j, w1]]){
            w <- w + 1
            zij <- as.character(Z[[j]][w])
            if(njk[[j]][[zij]] == 0)
              stop("before error")
            # Unnormalized posterior over the K active topics plus one new topic.
            post <- array(0, K + 1)
            for(i in 1:K){
              k <- as.character(topics[[i]])
              if(nk[[zij]] - (k == zij) == 0)
                # The word's current topic would become empty without it: give it no mass.
                post[[i]] <- 0
              else{
                if (!is.null(njk[[j]][[k]]) && !is.na(njk[[j]][[k]]))
                  njki <- njk[[j]][[k]]
                else
                  njki <- 0
                prior <- njki - (k == zij) + alpha * beta[[k]]
                likelihood <- (max(0, nkw[[k]][w1]) - (k == zij) + h_sym) /
                              (max(0, nk[[k]]) - (k == zij) + h_sym * num_word)
                if(is.na(prior) || is.na(likelihood) || likelihood < 0 || prior < 0){
                  #print(njk[[j]])
                  #print(numw_doc[j])
                  print(k)
                  print(j)
                  stop("error in likelihood or prior")
                }
                post[i] <- prior * likelihood
              }
            }
            # Mass reserved for instantiating a brand-new topic.
            post[[K + 1]] <- alpha * beta_u * (1. / num_word)
            if(is.na(post[[K + 1]]))
              stop(c(alpha, "error", beta_u))
            if(sum(post) == 0){
              print(njk[[j]])
              print(numw_doc[[j]])
              stop("post sum to 0")
            }
            if(anyNA(post)){
              print(post)
              stop("post contains NA")
            }
            post <- post / sum(post)
            if(anyNA(post)){
              print(post)
              stop("post contains NA II")
            }
            topic <- which.max(rmultinom(1, 1, post))
            if (topic > K){
              # A new topic was chosen: break a piece off the remaining stick for it.
              beta_l <- rbeta(1, 1, gamma)
              beta[[as.character(topic)]] <<- beta_l * beta_u
              beta_u <<- beta_u * (1 - beta_l)
              K <<- K + 1
              topics[K] <<- topic
              nkw[[as.character(K)]] <<- array(0, num_word)
              nk[[as.character(K)]] <<- 0
              for(jj in 1:num_doc)
                njk[[jj]][[as.character(K)]] <<- 0
              print(c("add topic", K))
            } else
              topic <- topics[[topic]]
            Z[[j]][[w]] <- topic
            topic <- as.character(topic)
            if (topic != zij){
              # Move the word's counts from its old topic to its new one.
              njk[[j]][[zij]] <<- njk[[j]][[zij]] - 1
              nkw[[zij]][[w1]] <<- nkw[[zij]][[w1]] - 1
              nk[[zij]] <<- nk[[zij]] - 1
              #print(c(zij,"down to",nk[[zij]]))
              if(nk[[zij]] == 0){
                # The old topic is now empty: drop it from the active set.
                for(tmp in 1:K)
                  if(topics[tmp] == zij){
                    topics[[tmp]] <<- topics[K]
                    topics <<- head(topics, -1)
                    break
                  }
                K <<- K - 1
                nk[[zij]] <<- NULL
                nkw[[zij]] <<- NULL
                for(jj in 1:num_doc)
                  njk[[jj]][[zij]] <<- NULL
                print(c("delete topic", K))
              }
              njk[[j]][[topic]] <<- njk[[j]][[topic]] + 1
              nkw[[topic]][[w1]] <<- nkw[[topic]][[w1]] + 1
              nk[[topic]] <<- nk[[topic]] + 1
            }
            if(njk[[j]][[topic]] == 0)
              stop("after error")
          }
    }
    print("sample z")
    return(Z)
  }
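  # The conditional implemented in sample.Z() is the HDP direct-assignment
  # posterior: for an existing topic k,
  #   p(z_jw = k | rest)  is proportional to  (n_jk + alpha * beta_k) * (n_kw + h) / (n_k + h * W),
  # with the word's own current assignment discounted through the (k == zij) terms,
  # and for a brand-new topic
  #   p(z_jw = new | rest)  is proportional to  alpha * beta_u / W,
  # where W = num_word is the vocabulary size and h = h_sym is the symmetric
  # topic-word smoothing parameter.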
  sample.m = function(){
    # Auxiliary-variable step: for every active topic, approximate the number of
    # "tables" serving it across all documents; the last slot carries gamma.
    m <- array(0, K + 1)
    m[K + 1] <- gamma
    for(i in 1:K){
      k <- topics[[i]]
      concen <- alpha * beta[[as.character(k)]]
      tmp <- 0
      for(j in 1:num_doc){
        if(k <= length(njk[[j]]) && !is.null(njk[[j]][[as.character(k)]]) && !is.na(njk[[j]][[as.character(k)]])){
          now <- njk[[j]][[as.character(k)]]
          tmp <- tmp + now
          if(now > 0){
            p <- concen / (concen + now)
            m[i] <- m[i] + rbinom(1, now, p)
          }
        }
      }
      # Sanity check: the per-document counts must add up to the global topic count.
      if (tmp != nk[[as.character(k)]])
        stop(c(tmp, "nk error", nk[[as.character(k)]]))
    }
    return(m)
  }
  sample.beta = function(m){
    # Resample the global topic weights (and the leftover mass beta_u) from the
    # Dirichlet defined by the table counts m.
    new_beta <- rdirichlet(1, m)
    for(i in 1:K){
      k <- as.character(topics[[i]])
      beta[[k]] <<- new_beta[[i]]
    }
    beta_u <<- new_beta[[K + 1]]
  }
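  # Global-weight update behind sample.m() and sample.beta(): the weights are
  # resampled as (beta_1, ..., beta_K, beta_u) ~ Dirichlet(m_1, ..., m_K, gamma),
  # where m_k is the number of "tables" serving topic k across all documents.
  # The exact conditional for each per-document table count is the Antoniak
  # distribution; the Binomial(n_jk, alpha*beta_k / (alpha*beta_k + n_jk)) draw
  # in sample.m() is a simpler stand-in for it (an approximation, not something
  # spelled out in the original gist).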
  #sample.alpha=function(){
  #}
  #sample.beta=function(){
  #}
  # Initialize
  Z <- init.Z()        # topic assignment of each word in each document
  njk <- init.njk(Z)   # number of words in each topic k in each document j
  nkw <- init.nkw(Z)   # number of occurrences of word w in topic k
  nk <- init.nk(Z)     # number of words in topic k
  alpha <- init.alpha(alpha0_a, alpha0_b)
  gamma <- init.gamma(gamma_a, gamma_b)
  beta <- init.beta()
  # Gibbs sweeps: resample the topic assignments, then the global topic weights.
  for(cas in 1:50){
    Z <- sample.Z(Z)
    sample.beta(sample.m())
    #print(beta)
    #print(beta_u)
  }
}
#createDataFile()
require(tm)
x <- read.csv('data.txt', header = FALSE)   # one cleaned abstract per row
# Note: recent versions of tm expect DataframeSource() input to have 'doc_id' and
# 'text' columns, so this line may need adjusting for your tm version.
corp <- Corpus(DataframeSource(x))
corp <- tm_map(corp, removeWords, stopwords("english"))   # drop common English stop words
# Keep only terms that appear in at least 10 documents.
dtm <- DocumentTermMatrix(corp, control = list(bounds = list(global = c(10, Inf))))
HDP(dtm, dtm)   # the second argument (test_data) is currently unused
#dim(dtm)