Skip to content

Instantly share code, notes, and snippets.

@ja-mf
Created October 23, 2012 16:10
Show Gist options
  • Save ja-mf/3939736 to your computer and use it in GitHub Desktop.
Save ja-mf/3939736 to your computer and use it in GitHub Desktop.
naive-bayes spam filter
from __future__ import division
import string
class BayesianSpam(object):
    """Toy naive-Bayes spam filter trained on a small built-in corpus.

    The two categories are ``'spam'`` and ``'no-spam'``.

    Instance attributes, all populated by ``__init__``:

    training[cat]
        List of training mails for category ``cat``; each mail is a list of
        lower-cased, punctuation-free words.
    priors[cat]
        Fraction of all training mails that belong to ``cat``.
    features[cat][word]
        Weight of ``word`` in ``cat``.  Every occurrence of the word in the
        category's mails adds ``1 / len(training[cat])`` on top of a base of
        the same size, so a word repeated often can weigh more than 1.
    zeroOccurrences[cat]
        Weight used for a word never seen in ``cat``
        (``1 / len(training[cat])``).
    """

    # Characters removed from a message before it is split into words.
    _PUNCTUATION = frozenset(string.punctuation)

    def __init__(self):
        """Parse the built-in training set and derive priors and weights."""
        # State is per instance.  As class-level dicts these were shared, so
        # every additional instance appended the corpus again to the same
        # lists and overwrote the other instances' tables.
        self.training = {'spam': [], 'no-spam': []}
        self.zeroOccurrences = {}
        self.priors = {}
        self.features = {}

        self.parse_training()

        # A-priori probability that a mail is (or is not) spam, given the
        # training set.  float() keeps this a true division even without
        # ``from __future__ import division``.
        total = float(sum(len(mails) for mails in self.training.values()))
        for cat in self.training:
            self.priors[cat] = len(self.training[cat]) / total

        for cat in self.training:
            mails = self.training[cat]
            # Contribution of one occurrence; also used as the weight of a
            # word that never occurs in this category.
            single_occurrence = 1.0 / len(mails)
            self.zeroOccurrences[cat] = single_occurrence

            weights = {}
            for mail in mails:
                for word in mail:
                    # First sighting starts from the zero-occurrence weight.
                    weights[word] = (
                        weights.get(word, single_occurrence)
                        + single_occurrence
                    )
            self.features[cat] = weights

    def parse_training(self):
        """(Re)build ``self.training`` from the hard-coded example mails.

        The training lists are replaced rather than appended to, so calling
        this more than once does not duplicate the corpus.
        """
        spam_mail = [
            "The low prices and highest quality pills.Fast Worldwide Delivery. We accept Visa, AmEx, ACH & MasterCard",
            "No Matter what you are selling - Hit-Booster will send targeted visitors to your website! Within 15 minutes you will have your own website traffic generator that will bring in an ever increasing amount of hits to your websites! Automatically This software is perfect for bringing real traffic to your site... even if... it's an affiliate link where you have no control over the website content!",
            "Numerous companies are looking for workers to submit information into online forms and they will pay you nicely in return. You can get paid up to $25 per transaction. This is not a get-rich-quick scheme but a legitimate way to earn money from home. We have already helped thousands of people worldwide achieve financial freedom and enjoy a better life.",
            "buy drugs online from our pharma",
            "buy insurance at low prices",
        ]
        no_spam_mail = [
            "Birthdays are like boogers, the more you have the harder it is to breathe.",
            "We know we're getting old when the only thing we want for our birthday is not to be reminded of it.",
            "I ve attached my resume. I would be grateful if you could read it and get back to me at your earliest convenience. I have all the experience you are looking for:",
            "I completely disagree with Sheryl and Peter about top-posting, as they reference it. I am an executive and people who truncate their message irritate me! In this day & age, people want to find all of the information in one place. I receive more than 500 e-mails per day.",
            "newsletter from your favorite website",
        ]
        # Strip punctuation and normalise case before storing each mail.
        self.training = {
            'spam': [self.parse_mail(mail) for mail in spam_mail],
            'no-spam': [self.parse_mail(mail) for mail in no_spam_mail],
        }

    def parse_mail(self, message):
        """Return ``message`` as a list of lower-cased words.

        Every character in ``string.punctuation`` is deleted (not replaced by
        a space), then the text is lower-cased and split on whitespace.
        """
        # A character filter instead of ``string.maketrans`` plus the
        # two-argument ``str.translate``, which exist only on Python 2.
        punctuation = self._PUNCTUATION
        cleaned = "".join(ch for ch in message if ch not in punctuation)
        return cleaned.lower().split()

    def _tokens(self, mail):
        """Return ``mail`` as a list of words, parsing it if it is a string."""
        if isinstance(mail, str):
            # Iterating a raw string would score single characters.
            return self.parse_mail(mail)
        return list(mail)

    def score(self, mail, cat):
        """Return the score of ``mail`` for category ``cat``.

        ``mail`` is a list of words as produced by ``parse_mail``; a raw
        string is parsed first.  The score is the category prior multiplied
        by the weight of every word, using the zero-occurrence weight for
        words not seen in that category.  Raises ``KeyError`` if ``cat`` is
        not a trained category.
        """
        weights = self.features[cat]
        unseen = self.zeroOccurrences[cat]
        result = self.priors[cat]
        for word in self._tokens(mail):
            result = result * weights.get(word, unseen)
        return result

    def classify(self, mail):
        """Score ``mail`` for every category, print the scores, return a label.

        ``mail`` is a list of words or a raw string.  Returns ``'spam'`` when
        the spam score is strictly greater than the no-spam score, otherwise
        ``'no spam!'``.
        """
        words = self._tokens(mail)
        scores = {}
        for cat in self.training:
            scores[cat] = self.score(words, cat)
        # Single-argument call form prints the same on Python 2 and 3.
        print(scores)
        return 'spam' if scores['spam'] > scores['no-spam'] else 'no spam!'
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment