Blocked Gibbs sampling for the Dirichlet process.
n <- 50
w <- 1
set.seed(161017)
lam <- sample(c(5, 25, 50), n, replace = TRUE, prob = c(0.35, 0.3, 0.35)) # 3 clusters!
x <- rpois(n, w*lam)
hist(x, prob = TRUE, breaks = 15)
source("https://gist.githubusercontent.com/eliardocosta/7c350eece9f09a7caa0f48852621063c/raw/dd6eae6d9bb3101bdeeff0d9d087db91f9d7e74d/ppostPoiDP.R")
foo <- ppostPoiDP(x = x, w = w, phi = sqrt(mean(x)/var(x)), lam0 = mean(x), alpha = 1, nburn = 500, nsam = 1E3)
hist(foo$nclu) # histogram of the number of clusters per iteration
plot(foo$lam, foo$fdist) # posterior cumulative distribution
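# Optional sanity check (not part of the original gist): overlay the true
# mixing CDF of the simulated lambda's (atoms 5, 25, 50 with weights
# 0.35, 0.30, 0.35) on the estimated posterior CDF; 'F.true' is an
# illustrative name introduced here.
F.true <- stepfun(c(5, 25, 50), c(0, cumsum(c(0.35, 0.30, 0.35))))
plot(foo$lam, foo$fdist, type = "s", xlab = expression(lambda),
     ylab = "cumulative probability")
lines(F.true, col = "red", do.points = FALSE)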
# ppostPoiDP.R
#
# INFERENCE USING THE BLOCKED GIBBS SAMPLING ALGORITHM FOR THE DIRICHLET
# PROCESS (Ishwaran & Zarepour, 2000).
#
# Output - a list with 3 elements: 'lam', 'fdist' and 'nclu'
# 1) lam: vector with the grid values over the parameter space; the grid
#    spacing is set by the argument 'cgrid'
# 2) fdist: posterior cumulative probability at each grid value in 'lam'
# 3) nclu: vector with the number of clusters at each iteration.
ppostPoiDP <- function(x, w, phi, lam0, alpha, nburn, nsam, cgrid = 0.1, eps = 1E-1) {
  nx <- length(x) # sample size
  N <- min(nx, ceiling(1 - alpha*log(eps/(4*nx)))) # number of terms in the truncated (stick-breaking) representation of the DP
  lam <- rgamma(N, shape = phi, rate = phi/lam0) # initial values for lambda
  p <- rep(1/N, N) # initial probabilities for each term of the truncated sum
  K <- numeric() # cluster indicator vector
  pstar <- matrix(NA, nx, N) # unnormalized allocation probabilities for each observation
  nclu <- numeric() # vector to store the number of clusters at each iteration
  for (r in 1:nburn) { # start of the burn-in loop
    for (i in 1:nx) {
      pstar[i, ] <- p*dpois(x[i], w*lam) # unnormalized allocation probabilities
    }
    for (i in 1:nx) {
      K[i] <- sample.int(N, size = 1, prob = pstar[i, ]) # sample cluster labels
    }
    Ktable <- as.data.frame(table(K), stringsAsFactors = FALSE) # frequency table of the K's
    Kstar <- as.vector(Ktable[, 1], mode = "numeric") # vector of unique K's
    nclu <- append(nclu, length(Kstar))
    m <- rep(0, N) # number of observations allocated to each of the N components
    m[Kstar] <- Ktable[, 2]
    D <- rgamma(length(m), shape = alpha/N + m, rate = 1) # gammas used to generate the Dirichlet
    p <- D/sum(D) # updating vector 'p' with a Dirichlet draw
    for (j in 1:length(Kstar)) {
      set <- which(K == Kstar[j]) # observations allocated to cluster Kstar[j]
      lam[Kstar[j]] <- rgamma(1, shape = phi + sum(x[set]),
                              rate = length(set)*w + phi/lam0)
    }
  } # end of the burn-in loop
  # start of the posterior sampling
  pesos <- matrix(NA, nsam, N) # posterior weights of the truncated sum, one row per iteration
  lam.post <- matrix(NA, nsam, N) # posterior lambda's, one row per iteration
  for (r in 1:nsam) { # start of the posterior sampling loop
    for (i in 1:nx) {
      pstar[i, ] <- p*dpois(x[i], w*lam) # unnormalized allocation probabilities
    }
    for (i in 1:nx) {
      K[i] <- sample.int(N, size = 1, prob = pstar[i, ]) # sample cluster labels
    }
    Ktable <- as.data.frame(table(K), stringsAsFactors = FALSE) # frequency table of the K's
    Kstar <- as.vector(Ktable[, 1], mode = "numeric") # vector of unique K's
    nclu <- append(nclu, length(Kstar))
    m <- rep(0, N) # number of observations allocated to each of the N components
    m[Kstar] <- Ktable[, 2]
    D <- rgamma(length(m), shape = alpha/N + m, rate = 1) # gammas used to generate the Dirichlet
    p <- D/sum(D) # updating vector 'p' with a Dirichlet draw
    for (j in 1:length(Kstar)) {
      set <- which(K == Kstar[j]) # observations allocated to cluster Kstar[j]
      lam[Kstar[j]] <- rgamma(1, shape = phi + sum(x[set]),
                              rate = length(set)*w + phi/lam0)
    }
    lam.post[r, ] <- lam
    pesos[r, ] <- p
  } # end of the posterior sampling loop
  grid.val <- seq(0, ceiling(max(lam.post)), cgrid) # grid defining the partition over possible lambda values
  grid.val <- tail(grid.val, -1)
  ppost.it <- matrix(NA, nsam, length(grid.val)) # posterior cumulative probability at the grid values, one row per iteration
  for (i in 1:nsam) {
    for (j in 1:length(grid.val)) {
      ppost.it[i, j] <- sum(pesos[i, which(lam.post[i, ] <= grid.val[j])])
    }
  }
  ppost <- apply(ppost.it, 2, mean) # estimate of the posterior cumulative probability
  return(list(lam = grid.val, fdist = ppost, nclu = nclu))
} # END
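# Rough illustration (not part of the original gist): recover an approximate
# mean of the estimated mixing distribution by differencing the gridded CDF.
# Assumes 'foo' is the list returned by ppostPoiDP in the example above.
p.grid <- diff(c(0, foo$fdist))   # probability mass attributed to each grid cell
sum(foo$lam * p.grid)             # approximate mean of the mixing distribution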