joelkr/BayesProbability.R

## BayesProbability.R
# BayesProbability.R

#set.seed(49)

# Build a dummy dictionary
a <- letters
b <- letters
tokens <- apply(expand.grid(a, b), 1, function(x) paste(x, collapse=""))
# Number of tokens to use 1-676
tl <- 100

p_spam_tokens <- rep(c(0.19, 0.01), (tl/2))
#p_ham_tokens <- rep(c(0.01, 0.19), (tl/2))
p_ham_tokens <- 1 - p_spam_tokens

hamcount <- rep(0, tl)
names(hamcount) <- tokens[1:tl]
spamcount <- rep(0, tl)
names(spamcount) <- tokens[1:tl]
totalcount <- rep(0, tl)
names(totalcount) <- tokens[1:tl]
spam_messages <- 100
ham_messages <- 100
Pspam <- spam_messages / (spam_messages + ham_messages)
Pham <- 1 - Pspam

for(i in 1:spam_messages) {
  s1 <- sample(tokens[1:tl], sample(10:20, 1), replace=T, prob=p_spam_tokens)
  s1 <- unique(s1)
  spamcount[s1] <- spamcount[s1] + 1
  totalcount[s1] <- totalcount[s1] + 1
}

for(i in 1:ham_messages) {
  h1 <- sample(tokens[1:tl], sample(10:20, 1), replace=T, prob=p_ham_tokens)
  h1 <- unique(h1)
  hamcount[h1] <- hamcount[h1] + 1
  totalcount[h1] <- totalcount[h1] + 1
}

# Calculate theta
thetaS <- (1 + spamcount) / (2 + spam_messages)
# from book
#alpha <- 0.2
#beta <- 0.201
#thetaS <- (alpha + spamcount) / (beta + totalcount)

thetaH <- (1 + hamcount) / (2 + ham_messages)
# from book
#thetaH <- (alpha + hamcount) / (beta + totalcount)

# Calculate logXGivenSpam
wS <- log(thetaS/(1 - thetaS))
w0S <- sum(log(1-thetaS))
# logXGivenHam
wH <- log(thetaH/(1 - thetaH))
w0H <- sum(log(1-thetaH))

x <- list()
# All spam words
x[[1]] <- rep(c(1,0), (tl/2))
# All ham words
x[[2]] <- rep(c(0,1), (tl/2))
# Every word
x[[3]] <- rep(1, tl)
# Random favoring 0
x[[4]] <- sample(c(1,0), tl, replace=TRUE, prob=c(0.29,0.71))
# Random favoring 1
x[[5]] <- sample(c(1,0), tl, replace=TRUE, prob=c(0.71,0.29))
logXGivenSpam <- rep(0, 5)
logXGivenHam <- rep(0,5)
logSpamGivenX <- rep(0,5)

logPspam <- log(Pspam)
logPham <- log(Pham)

for(i in 1:5) {
  print(x[[i]])
  logXGivenSpam[i] <- sum(x[[i]] * wS) + w0S
  logXGivenHam[i] <- sum(x[[i]] * wH) + w0H
  cat("logXGivenSpam: ", logXGivenSpam[i], "\n")
  cat("logXGivenHam: ", logXGivenHam[i], "\n")
 # log identity: log(a + b) = log(a) + log(1 + exp(log(b) - log(a)))
  # formula being solved:
  # log(p(spam|word)) = log(p(word|spam)p(spam)/
  #        (p(word|spam)p(spam) + p(word|ham)p(ham)))
  logSpamGivenX[i] <- -( log(1 + exp(logXGivenHam[i] + logPham - logXGivenSpam[i] - logPspam)) )
  cat("**logSpamGivenX: ", logSpamGivenX[i], "\n")
  cat("** p(S|x): ", exp(logSpamGivenX[i]), "\n")
}
	# BayesProbability.R

	#set.seed(49)

	# Build a dummy dictionary
	a <- letters
	b <- letters
	tokens <- apply(expand.grid(a, b), 1, function(x) paste(x, collapse=""))
	# Number of tokens to use 1-676
	tl <- 100

	p_spam_tokens <- rep(c(0.19, 0.01), (tl/2))
	#p_ham_tokens <- rep(c(0.01, 0.19), (tl/2))
	p_ham_tokens <- 1 - p_spam_tokens

	hamcount <- rep(0, tl)
	names(hamcount) <- tokens[1:tl]
	spamcount <- rep(0, tl)
	names(spamcount) <- tokens[1:tl]
	totalcount <- rep(0, tl)
	names(totalcount) <- tokens[1:tl]
	spam_messages <- 100
	ham_messages <- 100
	Pspam <- spam_messages / (spam_messages + ham_messages)
	Pham <- 1 - Pspam

	for(i in 1:spam_messages) {
	s1 <- sample(tokens[1:tl], sample(10:20, 1), replace=T, prob=p_spam_tokens)
	s1 <- unique(s1)
	spamcount[s1] <- spamcount[s1] + 1
	totalcount[s1] <- totalcount[s1] + 1
	}

	for(i in 1:ham_messages) {
	h1 <- sample(tokens[1:tl], sample(10:20, 1), replace=T, prob=p_ham_tokens)
	h1 <- unique(h1)
	hamcount[h1] <- hamcount[h1] + 1
	totalcount[h1] <- totalcount[h1] + 1
	}

	# Calculate theta
	thetaS <- (1 + spamcount) / (2 + spam_messages)
	# from book
	#alpha <- 0.2
	#beta <- 0.201
	#thetaS <- (alpha + spamcount) / (beta + totalcount)

	thetaH <- (1 + hamcount) / (2 + ham_messages)
	# from book
	#thetaH <- (alpha + hamcount) / (beta + totalcount)

	# Calculate logXGivenSpam
	wS <- log(thetaS/(1 - thetaS))
	w0S <- sum(log(1-thetaS))
	# logXGivenHam
	wH <- log(thetaH/(1 - thetaH))
	w0H <- sum(log(1-thetaH))

	x <- list()
	# All spam words
	x[[1]] <- rep(c(1,0), (tl/2))
	# All ham words
	x[[2]] <- rep(c(0,1), (tl/2))
	# Every word
	x[[3]] <- rep(1, tl)
	# Random favoring 0
	x[[4]] <- sample(c(1,0), tl, replace=TRUE, prob=c(0.29,0.71))
	# Random favoring 1
	x[[5]] <- sample(c(1,0), tl, replace=TRUE, prob=c(0.71,0.29))
	logXGivenSpam <- rep(0, 5)
	logXGivenHam <- rep(0,5)
	logSpamGivenX <- rep(0,5)

	logPspam <- log(Pspam)
	logPham <- log(Pham)

	for(i in 1:5) {
	print(x[[i]])
	logXGivenSpam[i] <- sum(x[[i]] * wS) + w0S
	logXGivenHam[i] <- sum(x[[i]] * wH) + w0H
	cat("logXGivenSpam: ", logXGivenSpam[i], "\n")
	cat("logXGivenHam: ", logXGivenHam[i], "\n")
	# log identity: log(a + b) = log(a) + log(1 + exp(log(b) - log(a)))
	# formula being solved:
	# log(p(spam\|word)) = log(p(word\|spam)p(spam)/
	# (p(word\|spam)p(spam) + p(word\|ham)p(ham)))
	logSpamGivenX[i] <- -( log(1 + exp(logXGivenHam[i] + logPham - logXGivenSpam[i] - logPspam)) )
	cat("**logSpamGivenX: ", logSpamGivenX[i], "\n")
	cat("** p(S\|x): ", exp(logSpamGivenX[i]), "\n")
	}