# makispl/classify_message.py

## classify_message.py
def sms_classify(message):
    """Print whether a single SMS looks like spam or ham.

    The raw text is normalised first: e-mail/web addresses, money symbols,
    phone numbers and plain numbers are blanked out, punctuation is removed,
    whitespace is collapsed, the text is lowercased, stop words are dropped
    and every remaining term is lemmatized (as a verb) and stemmed.

    Two scores, labelled P(Spam|message) and P(Ham|message) in the output,
    are then built by starting from ``p_spam`` / ``p_ham`` and multiplying in
    the per-word parameter of every token found in ``parameters_spam`` /
    ``parameters_ham``. Tokens unknown to a class leave that class's score
    unchanged. The larger score decides the printed label.

    Relies on names that are not defined inside this function and are
    expected to exist at module level: ``stop_words``, ``lemmatizer``,
    ``porter``, ``p_spam``, ``p_ham``, ``parameters_spam`` and
    ``parameters_ham``.

    Args:
        message: Raw text of the SMS to classify.

    Returns:
        None. Both scores and the resulting label are printed.
    """
    # Local import: no import block is visible in this file to extend.
    import re

    # str.replace() treats its first argument as literal text, so these
    # patterns only take effect when applied through re.sub().
    substitutions = (
        # E-mail addresses
        (r'\b[\w\-.]+?@\w+?\.\w{2,4}\b', ' '),
        # Web addresses (http/https links and bare domain-like tokens)
        (r'(http[s]?\S+)|(\w+\.[A-Za-z]{2,4}\S*)', ' '),
        # Money symbols
        (r'£|\$', ' '),
        # Phone numbers
        (r'\b(\+\d{1,2}\s)?\d?[\-(.]?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}\b', ' '),
        # Plain numbers (integers and decimals)
        (r'\d+(\.\d+)?', ' '),
        # Punctuation
        (r'[^\w\d\s]', ' '),
        # Collapse runs of whitespace (spaces, line breaks, tabs)
        (r'\s+', ' '),
        # Leading/trailing whitespace
        (r'^\s+|\s+?$', ''),
    )
    for pattern, replacement in substitutions:
        message = re.sub(pattern, replacement, message)

    # Lowercase the text
    message = message.lower()

    # Remove stop words. The lookup set is built once, and the token list is
    # rebuilt unconditionally so a message made only of stop words ends up
    # empty instead of keeping them.
    excluded = set(stop_words)
    tokens = [term for term in message.split() if term not in excluded]

    # Lemmatization, then stemming
    tokens = [lemmatizer.lemmatize(term, pos='v') for term in tokens]
    tokens = [porter.stem(term) for term in tokens]

    p_spam_given_message = p_spam
    p_ham_given_message = p_ham

    for word in tokens:
        if word in parameters_spam:
            p_spam_given_message *= parameters_spam[word]

        if word in parameters_ham:
            p_ham_given_message *= parameters_ham[word]

    print('P(Spam|message):', p_spam_given_message)
    print('P(Ham|message):', p_ham_given_message)

    if p_ham_given_message > p_spam_given_message:
        print('Label: Ham')
    elif p_ham_given_message < p_spam_given_message:
        print('Label: Spam')
    else:
        print('Equal probabilities ~ Human action needed!')