Skip to content

Instantly share code, notes, and snippets.

@tuner
Created September 27, 2017 08:44
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save tuner/761562318f66ccc5cdbf0daf5277112f to your computer and use it in GitHub Desktop.
Save tuner/761562318f66ccc5cdbf0daf5277112f to your computer and use it in GitHub Desktop.
Bayes spam "filter"
import re
import pathlib
from collections import defaultdict
from math import log, exp
def read_table(filename):
    """Parse a word-count table file into a ``{word: count}`` dictionary.

    A line is accepted only if it has the shape
    ``<whitespace><digits><whitespace><word>``: leading whitespace is
    required, and the word is a single run of non-whitespace characters.
    Every other line is skipped silently.
    (This presumably matches ``uniq -c`` style output -- not confirmed here.)

    Args:
        filename: Path of the text file to read.

    Returns:
        A dict mapping each word to its count converted to ``float``.
        When a word appears on several lines, the last line wins.
    """
    # Raw string literal: in a plain literal "\s" and "\d" are invalid
    # escape sequences, which newer Python versions warn about.
    pattern = re.compile(r"^\s+(\d+)\s+([^\s]+)$")
    table = {}
    with open(filename) as table_file:
        for row in table_file:
            match = pattern.search(row)
            if match:
                count, word = match.groups()
                table[word] = float(count)
    return table
def to_probability(table):
    """Scale the values of a count table so that they sum to one.

    Args:
        table: Mapping from key to a numeric count.

    Returns:
        A new dict with the same keys, each value divided by the sum of
        all values. An empty table yields an empty dict; a non-empty table
        whose values sum to zero raises ``ZeroDivisionError``.
    """
    denominator = sum(table.values())
    probabilities = {}
    for key, count in table.items():
        probabilities[key] = count / denominator
    return probabilities
def read_mail(filename):
    """Return every whitespace-separated token of a text file, in order.

    Args:
        filename: Path of the file to read.

    Returns:
        A list of the tokens as strings; empty if the file has no
        non-whitespace content.
    """
    with open(filename) as mail_file:
        # str.split() without a separator already discards all surrounding
        # whitespace, so the tokens need no further stripping.
        return [word for line in mail_file for word in line.split()]
def spamicity(words, ham_prob, spam_prob):
    """Return the spam-to-ham likelihood ratio R for a sequence of words.

    The ratio is the product, over every word occurrence, of
    ``spam_prob[word] / ham_prob[word]``, accumulated in log space. A word
    missing from a table is given a small baseline probability instead.

    Args:
        words: Iterable of word tokens (repeated words count repeatedly).
        ham_prob: Mapping from word to its probability in ham.
        spam_prob: Mapping from word to its probability in spam.

    Returns:
        The ratio R as a float; ``R / (1 + R)`` is the spam probability.
        R is 1.0 for an empty ``words``. Very large ratios are capped at a
        finite value instead of raising ``OverflowError``.

    Raises:
        ValueError: If a looked-up probability is zero or negative
            (``log`` is undefined there).
    """
    # Probability assumed for a word that is absent from a table.
    baseline = 0.000001
    # math.exp() raises OverflowError above roughly 709.78 (the log of the
    # largest double). Capping the exponent keeps R finite, and R / (1 + R)
    # still evaluates to 1.0 for such a value.
    max_exponent = 709.0

    # Starting from 0.0 means the prior ratio P(spam) / P(ham) is taken as 1.
    log_ratio = 0.0
    for word in words:
        # .get() with a default avoids copying both tables on every call.
        spam_p = spam_prob.get(word, baseline)
        ham_p = ham_prob.get(word, baseline)
        log_ratio += log(spam_p) - log(ham_p)
    return exp(min(log_ratio, max_exponent))
def get_file_list(directory='./mails'):
    """List the regular files directly inside a directory.

    Args:
        directory: Directory to scan. Defaults to ``./mails``. Sub-directories
            are neither returned nor descended into.

    Returns:
        A sorted list of file paths as strings. Sorting makes the result
        reproducible, because ``Path.iterdir()`` yields entries in
        arbitrary order.

    Raises:
        OSError: Propagated from ``Path.iterdir()`` when the directory
            cannot be listed (for example, it does not exist).
    """
    return sorted(
        str(entry)
        for entry in pathlib.Path(directory).iterdir()
        if entry.is_file()
    )
def main():
    """Score every mail in the default mail directory and print the result.

    Word probabilities are built from ``hamcount.txt`` and ``spamcount.txt``
    in the current working directory. For each file returned by
    ``get_file_list()`` one line is printed with the file name and its spam
    probability ``R / (1 + R)``, where R comes from ``spamicity``.
    """
    ham_counts = read_table("hamcount.txt")
    ham_prob = to_probability(ham_counts)
    spam_counts = read_table("spamcount.txt")
    spam_prob = to_probability(spam_counts)

    for mail_path in get_file_list():
        mail_words = read_mail(mail_path)
        ratio = spamicity(mail_words, ham_prob, spam_prob)
        # Convert the likelihood ratio into a probability in [0, 1].
        spam_chance = ratio / (1 + ratio)
        print("{}, spam probability: {}".format(mail_path, spam_chance))


# Run the report only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment