Skip to content

Instantly share code, notes, and snippets.

@makispl
Last active April 12, 2020 19:02
Show Gist options
  • Save makispl/dc38c7f796b9421cb91c08e64efaddfc to your computer and use it in GitHub Desktop.
Save makispl/dc38c7f796b9421cb91c08e64efaddfc to your computer and use it in GitHub Desktop.
def sms_classify(message):
    '''
    Classify a new sms (w1, w2, ..., wn) as spam or ham.

    The message is cleaned (addresses, numbers and money symbols replaced by
    spaces, punctuation removed, whitespace collapsed), lowercased, stripped
    of stop words, lemmatized, stemmed and tokenized. The scores
    P(Spam|w1, ..., wn) and P(Ham|w1, ..., wn) start from the priors
    `p_spam` / `p_ham` and are multiplied by the per-word entries found in
    `parameters_spam` / `parameters_ham`; words missing from a table leave
    that score unchanged.

    Relies on names defined outside this function: `stop_words`,
    `lemmatizer`, `porter`, `p_spam`, `p_ham`, `parameters_spam` and
    `parameters_ham`.

    Prints both scores and the resulting label; returns None.
    '''
    # Function-scope import: the patterns below are regular expressions, and
    # str.replace would treat them as literal substrings (i.e. do nothing).
    import re

    # Replace addresses (http, email), numbers (plain, phone), money symbols
    message = re.sub(r'\b[\w\-.]+?@\w+?\.\w{2,4}\b', ' ', message)
    message = re.sub(r'(http[s]?\S+)|(\w+\.[A-Za-z]{2,4}\S*)', ' ', message)
    message = re.sub(r'£|\$', ' ', message)
    message = re.sub(
        r'\b(\+\d{1,2}\s)?\d?[\-(.]?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}\b',
        ' ',
        message,
    )
    message = re.sub(r'\d+(\.\d+)?', ' ', message)

    # Remove punctuation, collapse all whitespace (spaces, line breaks, tabs)
    # into a single space & eliminate any leading/trailing whitespace.
    message = re.sub(r'[^\w\d\s]', ' ', message)
    message = re.sub(r'\s+', ' ', message)
    message = re.sub(r'^\s+|\s+?$', '', message)

    # Lowercase the entire message
    message = message.lower()

    # Remove stop words (build the lookup set once, not once per term)
    stop_set = set(stop_words)
    terms = [term for term in message.split() if term not in stop_set]

    # Lemmatization followed by stemming; the resulting list is the token list
    tokens = [porter.stem(lemmatizer.lemmatize(term, pos='v')) for term in terms]

    # Start from the class priors and fold in each known word's parameter.
    # NOTE(review): a plain product of many small probabilities can underflow
    # to 0.0 on long messages; kept as-is so the printed values stay
    # probabilities rather than log-scores.
    p_spam_given_message = p_spam
    p_ham_given_message = p_ham
    for word in tokens:
        if word in parameters_spam:
            p_spam_given_message *= parameters_spam[word]
        if word in parameters_ham:
            p_ham_given_message *= parameters_ham[word]

    print('P(Spam|message):', p_spam_given_message)
    print('P(Ham|message):', p_ham_given_message)

    if p_ham_given_message > p_spam_given_message:
        print('Label: Ham')
    elif p_ham_given_message < p_spam_given_message:
        print('Label: Spam')
    else:
        print('Equal probabilities ~ Human action needed!')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment