Created
October 23, 2012 16:10
-
-
Save ja-mf/3939736 to your computer and use it in GitHub Desktop.
naive-bayes spam filter
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from __future__ import division | |
import string | |
class BayesianSpam:
    """Tiny naive-Bayes spam filter trained on a built-in corpus.

    Instance attributes (all built in ``__init__``):

    training[cat]        -- list of training mails for the category; each
                            mail is a list of lower-cased words.
    priors[cat]          -- prior probability of the category: priors['spam']
                            is P(S=1), priors['no-spam'] is P(S=0).
    features[cat][word]  -- smoothed estimate of P(word | cat).
    zeroOccurrences[cat] -- probability used for a word that never occurs
                            in the category's training mails.

    The categories are the keys 'spam' and 'no-spam'.
    """

    # Characters removed from a message before it is split into words.
    _PUNCTUATION = frozenset(string.punctuation)

    def __init__(self):
        """Parse the built-in training set and estimate all probabilities."""
        # These containers are created per instance.  As class-level dicts
        # they were shared by every instance, so each construction appended
        # the corpus to the same lists again and changed the statistics.
        self.training = {'spam': [], 'no-spam': []}
        self.zeroOccurrences = {}
        self.priors = {}
        self.features = {}

        # Parse the training set.
        self.parse_training()
        total = len(self.training['spam']) + len(self.training['no-spam'])

        # Prior probability that a mail is / is not spam, given the
        # training set.  (True division: the file enables it on Python 2
        # through `from __future__ import division`.)
        self.priors['spam'] = len(self.training['spam']) / total
        self.priors['no-spam'] = len(self.training['no-spam']) / total

        # Walk the training categories.
        for cat in self.training:
            mails = self.training[cat]
            word_probs = {}

            # Weight of a single occurrence; the same value is used as the
            # probability of a word that was never seen in the category.
            single_occurrence = 1 / len(mails)
            self.zeroOccurrences[cat] = single_occurrence

            # Conditional probabilities: every occurrence of a word adds
            # one `single_occurrence` on top of the zero-occurrence base.
            for mail in mails:
                for word in mail:
                    word_probs[word] = (
                        word_probs.get(word, single_occurrence)
                        + single_occurrence
                    )
            self.features[cat] = word_probs

    def parse_training(self):
        """Append the tokenised built-in example mails to ``self.training``."""
        spam_mail = [
            "The low prices and highest quality pills.Fast Worldwide Delivery. We accept Visa, AmEx, ACH & MasterCard",
            "No Matter what you are selling - Hit-Booster will send targeted visitors to your website! Within 15 minutes you will have your own website traffic generator that will bring in an ever increasing amount of hits to your websites! Automatically This software is perfect for bringing real traffic to your site... even if... it's an affiliate link where you have no control over the website content!",
            "Numerous companies are looking for workers to submit information into online forms and they will pay you nicely in return. You can get paid up to $25 per transaction. This is not a get-rich-quick scheme but a legitimate way to earn money from home. We have already helped thousands of people worldwide achieve financial freedom and enjoy a better life.",
            "buy drugs online from our pharma",
            "buy insurance at low prices",
        ]
        no_spam_mail = [
            "Birthdays are like boogers, the more you have the harder it is to breathe.",
            "We know we're getting old when the only thing we want for our birthday is not to be reminded of it.",
            "I ve attached my resume. I would be grateful if you could read it and get back to me at your earliest convenience. I have all the experience you are looking for:",
            "I completely disagree with Sheryl and Peter about top-posting, as they reference it. I am an executive and people who truncate their message irritate me! In this day & age, people want to find all of the information in one place. I receive more than 500 e-mails per day.",
            "newsletter from your favorite website",
        ]

        # Strip punctuation and the like, then store each mail as words.
        for spam in spam_mail:
            self.training['spam'].append(self.parse_mail(spam))
        for mail in no_spam_mail:
            self.training['no-spam'].append(self.parse_mail(mail))

    def parse_mail(self, message):
        """Return ``message`` as a list of lower-cased words.

        Every character in ``string.punctuation`` is deleted (not replaced
        by a space), then the text is lower-cased and split on whitespace.
        """
        # Filtering characters by hand behaves the same on Python 2 and 3;
        # the two-argument str.translate(table, deletechars) form exists
        # only on Python 2.
        stripped = "".join(
            ch for ch in message if ch not in self._PUNCTUATION
        )
        return stripped.lower().split()

    def score(self, mail, cat):
        """Return the unnormalised probability of ``mail`` for category ``cat``.

        ``mail`` is a list of words as produced by ``parse_mail``.  The
        result is the category prior multiplied by the probability of each
        word; unseen words contribute ``zeroOccurrences[cat]``.
        """
        # NOTE: this is a plain product of floats, so a very long mail can
        # underflow to 0.0 for both categories.
        word_probs = self.features[cat]
        unseen = self.zeroOccurrences[cat]
        result = self.priors[cat]
        for word in mail:
            result = result * word_probs.get(word, unseen)
        return result

    def classify(self, mail):
        """Score ``mail`` for every category and return the verdict.

        The per-category scores are printed.  Returns 'spam' when the spam
        score is strictly greater than the no-spam score, otherwise
        'no spam!'.
        """
        scores = {}
        for cat in self.training:
            scores[cat] = self.score(mail, cat)
        # Single-argument call form prints the same on Python 2 and 3.
        print(scores)
        return 'spam' if scores['spam'] > scores['no-spam'] else 'no spam!'
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment