ja-mf/nbspam.py

## nbspam.py
from __future__ import division
import string

class BayesianSpam:
	"""Naive Bayes spam classifier trained on a small built-in corpus.

	Instance attributes, all keyed by category ('spam' or 'no-spam'):

	training[cat]        -- list of mails; a mail is a list of words
	priors[cat]          -- fraction of the training mails in the category
	features[cat][word]  -- smoothed weight of the word within the category
	zeroOccurrences[cat] -- weight applied to words unseen in the category
	"""

	def __init__(self):
		"""Load the built-in corpus and derive priors and word weights."""
		# These containers are created per instance. As class attributes they
		# were shared, so each new classifier appended the corpus to the same
		# lists again and skewed every weight computed afterwards.
		self.training = {'spam': [], 'no-spam': []}
		self.zeroOccurrences = {}
		self.priors = {}
		self.features = {}

		self.parse_training()

		# float() guarantees true division even where the file-level
		# "from __future__ import division" is not in effect.
		total = float(len(self.training['spam']) + len(self.training['no-spam']))

		# prior probability of each category, given the training set
		self.priors['spam'] = len(self.training['spam']) / total
		self.priors['no-spam'] = len(self.training['no-spam']) / total

		for cat, mails in self.training.items():
			# weight contributed by one occurrence; it doubles as the
			# smoothing weight for words never seen in this category
			single_occurrence = 1.0 / len(mails)
			self.zeroOccurrences[cat] = single_occurrence

			# Every occurrence of a word adds one unit on top of the
			# smoothing weight, so a frequent word can weigh more than 1.
			weights = {}
			for mail in mails:
				for word in mail:
					weights[word] = weights.get(word, single_occurrence) + single_occurrence
			self.features[cat] = weights

	def parse_training(self):
		"""Tokenise the hard-coded sample mails into self.training."""
		spam_mail = [
			"The low prices and highest quality pills.Fast Worldwide Delivery. We accept Visa, AmEx, ACH & MasterCard",
			"No Matter what you are selling - Hit-Booster will send targeted visitors to your website! Within 15 minutes you will have your own website traffic generator that will bring in an ever increasing amount of hits to your websites! Automatically This software is perfect for bringing real traffic to your site... even if... it's an affiliate link where you  have no control over the website content!",
			"Numerous companies are looking for workers  to submit information into online forms and they will pay you nicely in return. You can get paid up to $25 per transaction. This is not a get-rich-quick scheme but a legitimate way to earn money from home. We have already helped thousands of people worldwide achieve financial freedom and enjoy a better life.",
			"buy drugs online from our pharma",
			"buy insurance at low prices",
		]

		no_spam_mail = [
			"Birthdays are like boogers, the more you have the harder it is to breathe.",
			"We know we're getting old when the only thing we want for our birthday is not to be reminded of it.",
			"I ve attached my resume. I would be grateful if you could read it and get back to me at your earliest convenience. I have all the experience you are looking for:",
			"I completely disagree with Sheryl and Peter about top-posting, as they reference it. I am an executive and people who truncate their message irritate me! In this day & age, people want to find all of the information in one place. I receive more than 500 e-mails per day.",
			"newsletter from your favorite website",
		]

		# store each mail as a list of normalised words
		for spam in spam_mail:
			self.training['spam'].append(self.parse_mail(spam))

		for mail in no_spam_mail:
			self.training['no-spam'].append(self.parse_mail(mail))

	def parse_mail(self, message):
		"""Return the lowercase words of message without ASCII punctuation.

		Punctuation characters are deleted, not replaced by spaces, so
		"it's" becomes "its" and "pills.Fast" becomes "pillsfast".
		"""
		# Filtering character by character works on Python 2 and Python 3;
		# string.maketrans and two-argument translate() exist only on
		# Python 2 byte strings.
		punctuation = set(string.punctuation)
		stripped = "".join(ch for ch in message if ch not in punctuation)
		return stripped.lower().split()

	def score(self, mail, cat):
		"""Return the prior of cat multiplied by the weight of each word.

		mail is an iterable of words, as produced by parse_mail(); cat is
		'spam' or 'no-spam'. Words unseen in the category contribute
		zeroOccurrences[cat].
		"""
		weights = self.features[cat]
		unseen = self.zeroOccurrences[cat]

		result = self.priors[cat]
		for word in mail:
			result = result * weights.get(word, unseen)

		return result

	def classify(self, mail):
		"""Score mail for every category and return the winning label.

		mail is an iterable of words, as produced by parse_mail(). The
		per-category scores are printed as a side effect. Returns 'spam'
		when the spam score is strictly greater, otherwise 'no spam!'
		(ties included; this label differs from the 'no-spam' key).
		"""
		scores = {}

		for cat in self.training:
			scores[cat] = self.score(mail, cat)

		# single-argument call form prints the same on Python 2 and 3
		print(scores)

		return 'spam' if scores['spam'] > scores['no-spam'] else 'no spam!'
	from __future__ import division
	import string

	class BayesianSpam:
		"""Naive Bayes spam classifier trained on a small built-in corpus.

		Instance attributes, all keyed by category ('spam' or 'no-spam'):

		training[cat]        -- list of mails; a mail is a list of words
		priors[cat]          -- fraction of the training mails in the category
		features[cat][word]  -- smoothed weight of the word within the category
		zeroOccurrences[cat] -- weight applied to words unseen in the category
		"""

		def __init__(self):
			"""Load the built-in corpus and derive priors and word weights."""
			# These containers are created per instance. As class attributes
			# they were shared, so each new classifier appended the corpus to
			# the same lists again and skewed every weight computed afterwards.
			self.training = {'spam': [], 'no-spam': []}
			self.zeroOccurrences = {}
			self.priors = {}
			self.features = {}

			self.parse_training()

			# float() guarantees true division even where the file-level
			# "from __future__ import division" is not in effect.
			total = float(len(self.training['spam']) + len(self.training['no-spam']))

			# prior probability of each category, given the training set
			self.priors['spam'] = len(self.training['spam']) / total
			self.priors['no-spam'] = len(self.training['no-spam']) / total

			for cat, mails in self.training.items():
				# weight contributed by one occurrence; it doubles as the
				# smoothing weight for words never seen in this category
				single_occurrence = 1.0 / len(mails)
				self.zeroOccurrences[cat] = single_occurrence

				# Every occurrence of a word adds one unit on top of the
				# smoothing weight, so a frequent word can weigh more than 1.
				weights = {}
				for mail in mails:
					for word in mail:
						weights[word] = weights.get(word, single_occurrence) + single_occurrence
				self.features[cat] = weights

		def parse_training(self):
			"""Tokenise the hard-coded sample mails into self.training."""
			spam_mail = [
				"The low prices and highest quality pills.Fast Worldwide Delivery. We accept Visa, AmEx, ACH & MasterCard",
				"No Matter what you are selling - Hit-Booster will send targeted visitors to your website! Within 15 minutes you will have your own website traffic generator that will bring in an ever increasing amount of hits to your websites! Automatically This software is perfect for bringing real traffic to your site... even if... it's an affiliate link where you have no control over the website content!",
				"Numerous companies are looking for workers to submit information into online forms and they will pay you nicely in return. You can get paid up to $25 per transaction. This is not a get-rich-quick scheme but a legitimate way to earn money from home. We have already helped thousands of people worldwide achieve financial freedom and enjoy a better life.",
				"buy drugs online from our pharma",
				"buy insurance at low prices",
			]

			no_spam_mail = [
				"Birthdays are like boogers, the more you have the harder it is to breathe.",
				"We know we're getting old when the only thing we want for our birthday is not to be reminded of it.",
				"I ve attached my resume. I would be grateful if you could read it and get back to me at your earliest convenience. I have all the experience you are looking for:",
				"I completely disagree with Sheryl and Peter about top-posting, as they reference it. I am an executive and people who truncate their message irritate me! In this day & age, people want to find all of the information in one place. I receive more than 500 e-mails per day.",
				"newsletter from your favorite website",
			]

			# store each mail as a list of normalised words
			for spam in spam_mail:
				self.training['spam'].append(self.parse_mail(spam))

			for mail in no_spam_mail:
				self.training['no-spam'].append(self.parse_mail(mail))

		def parse_mail(self, message):
			"""Return the lowercase words of message without ASCII punctuation.

			Punctuation characters are deleted, not replaced by spaces, so
			"it's" becomes "its" and "pills.Fast" becomes "pillsfast".
			"""
			# Filtering character by character works on Python 2 and Python 3;
			# string.maketrans and two-argument translate() exist only on
			# Python 2 byte strings.
			punctuation = set(string.punctuation)
			stripped = "".join(ch for ch in message if ch not in punctuation)
			return stripped.lower().split()

		def score(self, mail, cat):
			"""Return the prior of cat multiplied by the weight of each word.

			mail is an iterable of words, as produced by parse_mail(); cat is
			'spam' or 'no-spam'. Words unseen in the category contribute
			zeroOccurrences[cat].
			"""
			weights = self.features[cat]
			unseen = self.zeroOccurrences[cat]

			result = self.priors[cat]
			for word in mail:
				result = result * weights.get(word, unseen)

			return result

		def classify(self, mail):
			"""Score mail for every category and return the winning label.

			mail is an iterable of words, as produced by parse_mail(). The
			per-category scores are printed as a side effect. Returns 'spam'
			when the spam score is strictly greater, otherwise 'no spam!'
			(ties included; this label differs from the 'no-spam' key).
			"""
			scores = {}

			for cat in self.training:
				scores[cat] = self.score(mail, cat)

			# single-argument call form prints the same on Python 2 and 3
			print(scores)

			return 'spam' if scores['spam'] > scores['no-spam'] else 'no spam!'