Skip to content

Instantly share code, notes, and snippets.

@joelkr
Created September 5, 2014 22:36
Show Gist options
  • Save joelkr/327c7483b996f74bb301 to your computer and use it in GitHub Desktop.
Save joelkr/327c7483b996f74bb301 to your computer and use it in GitHub Desktop.
# BayesProbability.R
#set.seed(49)
# Build a dummy dictionary
a <- letters
b <- letters
tokens <- apply(expand.grid(a, b), 1, function(x) paste(x, collapse=""))
# Number of tokens to use 1-676
tl <- 100
p_spam_tokens <- rep(c(0.19, 0.01), (tl/2))
#p_ham_tokens <- rep(c(0.01, 0.19), (tl/2))
p_ham_tokens <- 1 - p_spam_tokens
hamcount <- rep(0, tl)
names(hamcount) <- tokens[1:tl]
spamcount <- rep(0, tl)
names(spamcount) <- tokens[1:tl]
totalcount <- rep(0, tl)
names(totalcount) <- tokens[1:tl]
spam_messages <- 100
ham_messages <- 100
Pspam <- spam_messages / (spam_messages + ham_messages)
Pham <- 1 - Pspam
for(i in 1:spam_messages) {
s1 <- sample(tokens[1:tl], sample(10:20, 1), replace=T, prob=p_spam_tokens)
s1 <- unique(s1)
spamcount[s1] <- spamcount[s1] + 1
totalcount[s1] <- totalcount[s1] + 1
}
for(i in 1:ham_messages) {
h1 <- sample(tokens[1:tl], sample(10:20, 1), replace=T, prob=p_ham_tokens)
h1 <- unique(h1)
hamcount[h1] <- hamcount[h1] + 1
totalcount[h1] <- totalcount[h1] + 1
}
# Calculate theta
thetaS <- (1 + spamcount) / (2 + spam_messages)
# from book
#alpha <- 0.2
#beta <- 0.201
#thetaS <- (alpha + spamcount) / (beta + totalcount)
thetaH <- (1 + hamcount) / (2 + ham_messages)
# from book
#thetaH <- (alpha + hamcount) / (beta + totalcount)
# Calculate logXGivenSpam
wS <- log(thetaS/(1 - thetaS))
w0S <- sum(log(1-thetaS))
# logXGivenHam
wH <- log(thetaH/(1 - thetaH))
w0H <- sum(log(1-thetaH))
x <- list()
# All spam words
x[[1]] <- rep(c(1,0), (tl/2))
# All ham words
x[[2]] <- rep(c(0,1), (tl/2))
# Every word
x[[3]] <- rep(1, tl)
# Random favoring 0
x[[4]] <- sample(c(1,0), tl, replace=TRUE, prob=c(0.29,0.71))
# Random favoring 1
x[[5]] <- sample(c(1,0), tl, replace=TRUE, prob=c(0.71,0.29))
logXGivenSpam <- rep(0, 5)
logXGivenHam <- rep(0,5)
logSpamGivenX <- rep(0,5)
logPspam <- log(Pspam)
logPham <- log(Pham)
for(i in 1:5) {
print(x[[i]])
logXGivenSpam[i] <- sum(x[[i]] * wS) + w0S
logXGivenHam[i] <- sum(x[[i]] * wH) + w0H
cat("logXGivenSpam: ", logXGivenSpam[i], "\n")
cat("logXGivenHam: ", logXGivenHam[i], "\n")
# log identity: log(a + b) = log(a) + log(1 + exp(log(b) - log(a)))
# formula being solved:
# log(p(spam|word)) = log(p(word|spam)p(spam)/
# (p(word|spam)p(spam) + p(word|ham)p(ham)))
logSpamGivenX[i] <- -( log(1 + exp(logXGivenHam[i] + logPham - logXGivenSpam[i] - logPspam)) )
cat("**logSpamGivenX: ", logSpamGivenX[i], "\n")
cat("** p(S|x): ", exp(logSpamGivenX[i]), "\n")
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment