Skip to content

Instantly share code, notes, and snippets.

View makispl's full-sized avatar

Plegas Gerasimos makispl

View GitHub Profile
# Split the training data into spam-only and ham-only frames
spam_df = training_set_final.loc[training_set_final['Label'] == 'spam'].copy()
ham_df = training_set_final.loc[training_set_final['Label'] == 'ham'].copy()

# Prior probabilities P(Spam) and P(Ham): each class's share of the training set
n_messages = training_set_final.shape[0]
p_spam = len(spam_df) / n_messages
p_ham = len(ham_df) / n_messages

# Per-message length of the 'SMS' column for the spam subset
# (used downstream to compute Nspam)
spam_words_per_message = spam_df['SMS'].apply(len)
# Create two dictionaries that match each unique word with the respective
# probability value, initialised to 0.
parameters_spam = {unique_word: 0 for unique_word in vocabulary}
parameters_ham = {unique_word: 0 for unique_word in vocabulary}

# Iterate over the vocabulary and for each word wi, calculate the
# Laplace-smoothed P(wi|Spam) and P(wi|Ham)
for unique_word in vocabulary:
    p_unique_word_spam = (spam_df[unique_word].sum() + alpha) / (n_spam + alpha * n_vocabulary)
    p_unique_word_ham = (ham_df[unique_word].sum() + alpha) / (n_ham + alpha * n_vocabulary)
    # Update the calculated probabilities in the dictionaries — the original
    # loop computed them but never stored them (bug fix)
    parameters_spam[unique_word] = p_unique_word_spam
    parameters_ham[unique_word] = p_unique_word_ham
def sms_classify(message):
    '''
    Takes in as input a new sms (w1, w2, ..., wn),
    calculates P(Spam|w1, w2, ..., wn) and P(Ham|w1, w2, ..., wn),
    compares them and decides whether the message is spam or not.
    '''
    # Replace addresses (http, email), numbers (plain, phone), money symbols
    # NOTE(review): str.replace performs LITERAL substring replacement, so
    # these regex patterns will never match anything. re.sub(pattern, ' ',
    # message) is almost certainly what was intended — confirm against the
    # full source.
    message = message.replace(r'\b[\w\-.]+?@\w+?\.\w{2,4}\b', ' ')
    message = message.replace(r'(http[s]?\S+)|(\w+\.[A-Za-z]{2,4}\S*)', ' ')
    # NOTE(review): the remainder of the function (probability scoring and
    # the spam/ham decision) is missing from this excerpt.
# Classify a new message, coming from advertising content
promo_message = '''Hey, Sign up with this promo code and get your card for amazing
exchange fees abroad and £5 to spend anywhere! Promocode: D48KV7BN'''
sms_classify(promo_message)

# Classify a new message, coming from a private thread
private_message = '''Okey Stan! Seems to be a reasonable amount of money. I'll think
of it and let you know ASAP.'''
sms_classify(private_message)

# Classify every message of the test_set and store the predictions
test_set['sms_predicted'] = test_set['SMS'].apply(sms_classify_test_set)
# Calculate the accuracy of the algorithm: the fraction of test messages
# whose predicted label matches the true label.
correct = 0
total = test_set.shape[0]
# iterrows() yields (index, row) pairs — unpack directly instead of
# indexing the tuple with row[1]
for _, row in test_set.iterrows():
    if row['Label'] == row['sms_predicted']:
        # Count the match — the increment was missing from the original excerpt
        correct += 1
# Replace addresses (http, email), numbers (plain, phone), money symbols.
# regex=True is required here: since pandas 2.0, Series.str.replace treats
# the pattern as a literal string by default, so without it none of these
# patterns would match (and pandas 1.x emits a FutureWarning).
training_set['SMS'] = training_set['SMS'].str.replace(r'\b[\w\-.]+?@\w+?\.\w{2,4}\b', ' ', regex=True)
training_set['SMS'] = training_set['SMS'].str.replace(r'(http[s]?\S+)|(\w+\.[A-Za-z]{2,4}\S*)', ' ', regex=True)
training_set['SMS'] = training_set['SMS'].str.replace(r'£|\$', ' ', regex=True)
training_set['SMS'] = training_set['SMS'].str.replace(r'\b(\+\d{1,2}\s)?\d?[\-(.]?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}\b', ' ', regex=True)
training_set['SMS'] = training_set['SMS'].str.replace(r'\d+(\.\d+)?', ' ', regex=True)
# Remove punctuation, collapse all whitespace (spaces, line breaks, tabs)
# into a single space & eliminate any leading/trailing whitespace.
training_set['SMS'] = training_set['SMS'].str.replace(r'[^\w\d\s]', ' ', regex=True)
training_set['SMS'] = training_set['SMS'].str.replace(r'\s+', ' ', regex=True)
# The comment above promises trimming, but no strip was present — added
training_set['SMS'] = training_set['SMS'].str.strip()
# Normalise word forms: verb-lemmatise each token, then Porter-stem it,
# and finally tokenize every message into a list of words.
lemmatizer = nltk.stem.WordNetLemmatizer()

def _lemmatize_verbs(text):
    # Lemmatise each whitespace-separated token as a verb (pos='v')
    return ' '.join(lemmatizer.lemmatize(token, pos='v') for token in text.split())

training_set['SMS'] = training_set['SMS'].apply(_lemmatize_verbs)

porter = nltk.PorterStemmer()

def _stem_tokens(text):
    # Apply Porter stemming to each whitespace-separated token
    return ' '.join(porter.stem(token) for token in text.split())

training_set['SMS'] = training_set['SMS'].apply(_stem_tokens)

# Split each cleaned message into its list of word tokens
training_set['SMS'] = training_set['SMS'].apply(nltk.word_tokenize)