yashhere/NaiveBayes.md

## NaiveBayes.md
#!/usr/bin/python2.7

# Toy training corpus: each entry is one already-labelled message.
spam = [
    'free movie tickets',
    'free watch offer',
    'rolex watch discount',
]
ham = [
    'I watch movie',
    'I am free',
]


## function to calculate prior probability for SPAM ##
## how many SPAM seen out of all messages? ##
def get_prior_spam_probability(spam,ham):
    """Return the fraction of all labelled messages that are spam.

    spam -- collection of spam messages
    ham  -- collection of ham messages

    Only the lengths of the two collections are used.
    """
    spam_count = len(spam)
    message_count = spam_count + len(ham)
    return float(spam_count) / message_count

# P(SPAM) estimated from the toy corpus defined above.
spam_prior = get_prior_spam_probability(spam=spam, ham=ham)

### define the function for calculating word frequencies ###
### input is a list of strings ###
def build_word_frequency(messages):
    """Count how often each word occurs across a list of messages.

    Dots are removed, the text is lower-cased and then split on whitespace.
    Lower-casing keeps the counts aligned with classify_message(), which
    lower-cases the text it scores; without it a training token such as 'I'
    could never be matched.

    messages -- list of strings
    Returns a dict mapping each lower-cased token to its occurrence count.
    """
    word_frequency = {}
    for message in messages:
        # strip dots and normalise case before tokenising
        tokens = message.replace('.', '').lower().split()
        for token in tokens:
            word_frequency[token] = word_frequency.get(token, 0) + 1
    return word_frequency


### build word frequencies ###
# Per-class token counts learned from the toy corpus.
spam_map, ham_map = build_word_frequency(spam), build_word_frequency(ham)


### function to calculate P(WORD|SPAM) and P(WORD|HAM) ###
def calculate_likelihoods(word,spam_frequency_map,ham_frequency_map,debug=True):
    """Return (P(word|SPAM), P(word|HAM)) using Laplace (add-one) smoothing.

    word               -- token to score
    spam_frequency_map -- dict of token -> count over spam messages
    ham_frequency_map  -- dict of token -> count over ham messages
    debug              -- unused; kept so existing callers keep working

    Each likelihood is (count + 1) / (total tokens in class + vocabulary size),
    so a word never seen in a class still gets a non-zero probability.
    """
    # total number of tokens seen in spam messages so far
    total_spam_word_count = sum(spam_frequency_map.values())
    # total number of tokens seen in ham messages so far
    total_ham_word_count = sum(ham_frequency_map.values())

    # Vocabulary = unique tokens across both classes. A set union is used
    # rather than keys() + keys() because dict views cannot be concatenated
    # on Python 3.
    vocabulary_size = len(set(spam_frequency_map) | set(ham_frequency_map))

    # a word absent from a class counts as zero occurrences
    word_spam_frequency = spam_frequency_map.get(word, 0)
    word_ham_frequency = ham_frequency_map.get(word, 0)

    p_word_spam = (word_spam_frequency + 1.0) / (total_spam_word_count + vocabulary_size)
    p_word_ham = (word_ham_frequency + 1.0) / (total_ham_word_count + vocabulary_size)
    return p_word_spam, p_word_ham


## classify a text message using Naive Bayes formula
def classify_message(message,spam_prior=0.5,threshold=0.5):
    """Classify a text message as 'SPAM' or 'HAM' with Naive Bayes.

    message    -- text to classify; it is lower-cased and split on whitespace
    spam_prior -- prior probability P(SPAM); P(HAM) is taken as 1 - spam_prior
    threshold  -- the message is 'SPAM' when the posterior exceeds this value

    Returns a (label, posterior) tuple where posterior is P(SPAM | message).

    Scores are accumulated as sums of logs rather than products of
    probabilities: multiplying many small likelihoods underflows to 0.0 for
    long messages, which would make the normalisation divide by zero.
    """
    import math  # local import: the script has no import section

    def _log(p):
        # log(0) is undefined; treat a non-positive probability as impossible
        return math.log(p) if p > 0 else float('-inf')

    log_spam = _log(spam_prior)
    log_ham = _log(1.0 - spam_prior)
    # convert to lower case and accumulate the per-word log-likelihoods
    for word in message.lower().split():
        sl, hl = calculate_likelihoods(word, spam_map, ham_map)
        log_spam += _log(sl)
        log_ham += _log(hl)

    # Normalise relative to the larger score so exp() never overflows and the
    # denominator is always at least 1.
    top = max(log_spam, log_ham)
    spam_weight = math.exp(log_spam - top)
    ham_weight = math.exp(log_ham - top)
    posterior = spam_weight / (spam_weight + ham_weight)

    if posterior > threshold:
        return 'SPAM', posterior
    return 'HAM', posterior


### test for a toy example
# Parenthesised so the line is valid on both Python 2 and Python 3;
# the printed (label, posterior) tuple is the same on either.
print(classify_message('watch free movie', spam_prior))
	#!/usr/bin/python2.7

	# Toy training corpus: each entry is one already-labelled message.
	spam = [
	    'free movie tickets',
	    'free watch offer',
	    'rolex watch discount',
	]
	ham = [
	    'I watch movie',
	    'I am free',
	]


	## function to calculate prior probability for SPAM ##
	## how many SPAM seen out of all messages? ##
	def get_prior_spam_probability(spam,ham):
	    """Return the fraction of all labelled messages that are spam.

	    spam -- collection of spam messages
	    ham  -- collection of ham messages

	    Only the lengths of the two collections are used.
	    """
	    return float(len(spam)) / (len(spam) + len(ham))

	# P(SPAM) estimated from the toy corpus defined above.
	spam_prior = get_prior_spam_probability(spam=spam, ham=ham)

	### define the function for calculating word frequencies ###
	### input is a list of strings ###
	def build_word_frequency(messages):
	    """Count how often each word occurs across a list of messages.

	    Dots are removed, the text is lower-cased and then split on whitespace.
	    Lower-casing keeps the counts aligned with classify_message(), which
	    lower-cases the text it scores; without it a training token such as 'I'
	    could never be matched.

	    messages -- list of strings
	    Returns a dict mapping each lower-cased token to its occurrence count.
	    """
	    word_frequency = {}
	    for message in messages:
	        # strip dots and normalise case before tokenising
	        tokens = message.replace('.', '').lower().split()
	        for token in tokens:
	            word_frequency[token] = word_frequency.get(token, 0) + 1
	    return word_frequency


	### build word frequencies ###
	# Per-class token counts learned from the toy corpus.
	spam_map, ham_map = build_word_frequency(spam), build_word_frequency(ham)


	### function to calculate P(WORD|SPAM) and P(WORD|HAM) ###
	def calculate_likelihoods(word,spam_frequency_map,ham_frequency_map,debug=True):
	    """Return (P(word|SPAM), P(word|HAM)) using Laplace (add-one) smoothing.

	    word               -- token to score
	    spam_frequency_map -- dict of token -> count over spam messages
	    ham_frequency_map  -- dict of token -> count over ham messages
	    debug              -- unused; kept so existing callers keep working

	    Each likelihood is (count + 1) / (total tokens in class + vocabulary size),
	    so a word never seen in a class still gets a non-zero probability.
	    """
	    # total number of tokens seen in spam messages so far
	    total_spam_word_count = sum(spam_frequency_map.values())
	    # total number of tokens seen in ham messages so far
	    total_ham_word_count = sum(ham_frequency_map.values())

	    # Vocabulary = unique tokens across both classes. A set union is used
	    # rather than keys() + keys() because dict views cannot be concatenated
	    # on Python 3.
	    vocabulary_size = len(set(spam_frequency_map) | set(ham_frequency_map))

	    # a word absent from a class counts as zero occurrences
	    word_spam_frequency = spam_frequency_map.get(word, 0)
	    word_ham_frequency = ham_frequency_map.get(word, 0)

	    p_word_spam = (word_spam_frequency + 1.0) / (total_spam_word_count + vocabulary_size)
	    p_word_ham = (word_ham_frequency + 1.0) / (total_ham_word_count + vocabulary_size)
	    return p_word_spam, p_word_ham


	## classify a text message using Naive Bayes formula
	def classify_message(message,spam_prior=0.5,threshold=0.5):
	    """Classify a text message as 'SPAM' or 'HAM' with Naive Bayes.

	    message    -- text to classify; it is lower-cased and split on whitespace
	    spam_prior -- prior probability P(SPAM); P(HAM) is taken as 1 - spam_prior
	    threshold  -- the message is 'SPAM' when the posterior exceeds this value

	    Returns a (label, posterior) tuple where posterior is P(SPAM | message).

	    Scores are accumulated as sums of logs rather than products of
	    probabilities: multiplying many small likelihoods underflows to 0.0 for
	    long messages, which would make the normalisation divide by zero.
	    """
	    import math  # local import: the script has no import section

	    def _log(p):
	        # log(0) is undefined; treat a non-positive probability as impossible
	        return math.log(p) if p > 0 else float('-inf')

	    log_spam = _log(spam_prior)
	    log_ham = _log(1.0 - spam_prior)
	    # convert to lower case and accumulate the per-word log-likelihoods
	    for word in message.lower().split():
	        sl, hl = calculate_likelihoods(word, spam_map, ham_map)
	        log_spam += _log(sl)
	        log_ham += _log(hl)

	    # Normalise relative to the larger score so exp() never overflows and the
	    # denominator is always at least 1.
	    top = max(log_spam, log_ham)
	    spam_weight = math.exp(log_spam - top)
	    ham_weight = math.exp(log_ham - top)
	    posterior = spam_weight / (spam_weight + ham_weight)

	    if posterior > threshold:
	        return 'SPAM', posterior
	    return 'HAM', posterior


	### test for a toy example
	# Parenthesised so the line is valid on both Python 2 and Python 3;
	# the printed (label, posterior) tuple is the same on either.
	print(classify_message('watch free movie', spam_prior))