# Source: GitHub Gist yashhere/4f5b14192b2c968ea102c79116a14054 by @yashhere
# (last active May 22, 2018 08:09).
#!/usr/bin/python2.7
# Toy training corpus: messages labelled as spam ...
spam = [
    'free movie tickets',
    'free watch offer',
    'rolex watch discount',
]
# ... and messages labelled as ham (not spam).
ham = [
    'I watch movie',
    'I am free',
]
def get_prior_spam_probability(spam, ham):
    """Return the prior probability P(SPAM) estimated from labelled messages.

    The prior is the fraction of all seen messages that are spam.

    Args:
        spam: sized collection of spam messages.
        ham: sized collection of ham (non-spam) messages.

    Returns:
        float: len(spam) / (len(spam) + len(ham)).

    Raises:
        ZeroDivisionError: if both collections are empty.
    """
    spam_count = len(spam)
    # float() forces true division on Python 2 as well as on Python 3.
    return float(spam_count) / (spam_count + len(ham))
# Prior P(SPAM) estimated from the two training lists.
spam_prior = get_prior_spam_probability(spam=spam, ham=ham)
def build_word_frequency(messages):
    """Count how often each word occurs across a list of messages.

    Each message is lower-cased, has its dots removed and is then split on
    whitespace. Lower-casing keeps the keys comparable with the lower-cased
    words that classify_message looks up.

    Args:
        messages: iterable of message strings.

    Returns:
        dict mapping each token to its total number of occurrences
        (empty when no messages are given).
    """
    word_frequency = {}
    for message in messages:
        # Just strip off dots; any other punctuation stays attached to its word.
        tokens = message.lower().replace('.', '').split()
        for token in tokens:
            word_frequency[token] = word_frequency.get(token, 0) + 1
    return word_frequency
### build the word frequency maps for the spam and ham training messages ###
spam_map, ham_map = build_word_frequency(spam), build_word_frequency(ham)
### function to calculate P(WORD|SPAM) and P(WORD|HAM) ###
def calculate_likelihoods(word,spam_frequency_map,ham_frequency_map,debug=True):
word_spam_frequency=0
word_ham_frequency=0
#count of total words in spam messages seen so far
total_spam_word_count = sum(spam_frequency_map.values())
#count of total words in spam messages seen sofar
total_ham_word_count = sum(ham_frequency_map.values())
## calculate vocabulary size for Laplace Smoothing; unique keys from both maps ##
vocabulary_size=len(set(spam_frequency_map.keys()+ham_frequency_map.keys()))
if word in spam_frequency_map: #if at all the word was used atleast once in a spam
word_spam_frequency=spam_frequency_map[word]
if word in ham_frequency_map: #if at all the word was used atleast once in a ham
word_ham_frequency=ham_frequency_map[word]
#calculate the probability of a word being spam using lapace smoothing so that #
# even if it is a new word, still some probability is there for it to be spam #
p_word_spam= (word_spam_frequency + 1.0)/ (total_spam_word_count + vocabulary_size)
p_word_ham= (word_ham_frequency + 1.0)/ (total_ham_word_count + vocabulary_size)
return p_word_spam,p_word_ham
def classify_message(message, spam_prior=0.5, threshold=0.5):
    """Classify a text message as 'SPAM' or 'HAM' using the Naive Bayes formula.

    The message is lower-cased, stripped of dots (the same clean-up that
    build_word_frequency applies to the training messages) and split on
    whitespace. Word likelihoods come from calculate_likelihoods, evaluated
    against the module-level spam_map and ham_map.

    The posterior P(SPAM|message) is computed in log space: multiplying many
    small likelihoods directly underflows to 0.0 for long messages, which
    would turn the final normalisation into 0/0.

    Args:
        message: text to classify.
        spam_prior: prior probability P(SPAM); values at or beyond 0 and 1
            are treated as certain HAM and certain SPAM respectively.
        threshold: the message is SPAM when the posterior is strictly
            greater than this value.

    Returns:
        tuple (label, posterior) where label is 'SPAM' or 'HAM' and
        posterior is P(SPAM|message) as a float.
    """
    import math  # local import: the file has no module-level import block

    if spam_prior <= 0.0:
        # log(0) is undefined; a zero prior forces a zero posterior.
        posterior = 0.0
    elif spam_prior >= 1.0:
        posterior = 1.0
    else:
        # Log of the posterior odds P(SPAM|message) / P(HAM|message).
        log_odds = math.log(spam_prior) - math.log(1.0 - spam_prior)
        for word in message.lower().replace('.', '').split():
            sl, hl = calculate_likelihoods(word, spam_map, ham_map)
            log_odds += math.log(sl) - math.log(hl)
        # Convert the odds to a probability; only exp() of a non-positive
        # number is taken, so it cannot overflow.
        if log_odds >= 0.0:
            posterior = 1.0 / (1.0 + math.exp(-log_odds))
        else:
            odds = math.exp(log_odds)
            posterior = odds / (1.0 + odds)

    if posterior > threshold:
        return 'SPAM', posterior
    return 'HAM', posterior
### test for a toy example
# print(...) with a single argument gives the same output on Python 2.7 and
# also runs on Python 3, unlike the print statement.
print(classify_message('watch free movie', spam_prior))
# (GitHub page footer: "Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment")