Skip to content

Instantly share code, notes, and snippets.

@willemneal
Created March 23, 2015 15:36
Show Gist options
  • Save willemneal/561bca8a3e212fb421c1 to your computer and use it in GitHub Desktop.
Save willemneal/561bca8a3e212fb421c1 to your computer and use it in GitHub Desktop.
import glob ## needed for listing files in folder
import math
import string ## needed to remove punctuation
# Line-ending markers.
# NOTE(review): "\r\n" is the CRLF sequence usually associated with Windows and
# "\n" the LF sequence used on Unix/macOS, so these names look swapped — confirm
# before relying on them. MAC is not referenced anywhere in the visible code.
MAC = "\r\n"
PC = "\n"
def makeDictionary(folderName):
    """Build a word-frequency table from the ``.txt`` files of a training folder.

    Every file matching ``./emails/<folderName>/*.txt`` is read line by line.
    Each whitespace-separated token has its ASCII punctuation removed; tokens
    that end up empty or consist only of digits are ignored.

    Args:
        folderName: Name of the sub-folder of ``./emails/`` to read.

    Returns:
        A ``(wordDictionary, _sum, fileCount)`` tuple where

        * ``wordDictionary`` maps each word seen more than twice to its count,
        * ``_sum`` is the number of accepted tokens counted *before* rare
          words are pruned, and
        * ``fileCount`` is the number of files that matched the pattern.
    """
    trainingFiles = glob.glob("./emails/" + folderName + "/*.txt")
    # Built once instead of once per word; set membership is O(1).
    punctuation = frozenset(string.punctuation)

    wordDictionary = {}
    _sum = 0
    for filename in trainingFiles:
        # ``with open`` closes every handle promptly; the ``file`` builtin used
        # previously exists only on Python 2 and left the handles open.
        with open(filename) as handle:
            for line in handle:
                # split() already discards the trailing newline.
                for word in line.split():
                    word = "".join(ch for ch in word if ch not in punctuation)
                    if word and not word.isdigit():
                        _sum += 1
                        wordDictionary[word] = wordDictionary.get(word, 0) + 1

    # Keep only words seen more than twice. A new dict is built rather than
    # deleting entries while iterating, which raises RuntimeError on Python 3.
    frequentWords = {word: count
                     for word, count in wordDictionary.items()
                     if count > 2}
    return frequentWords, _sum, len(trainingFiles)
# Train one word-frequency model per class from the labelled training folders.
hamDictionary, hamSum, hamFiles = makeDictionary("hamtraining")
spamDictionary, spamSum, spamFiles = makeDictionary("spamtraining")

# Class priors: the share of training files that belongs to each class.
_totalFiles = float(hamFiles + spamFiles)
hamProb = hamFiles / _totalFiles
spamProb = spamFiles / _totalFiles

# Per-class lookup read by getProb: label -> (word counts, total token count).
trainingDictionary = {
    "spam": (spamDictionary, spamSum),
    "ham": (hamDictionary, hamSum),
}

# Smoothing weight added to every word count when estimating probabilities.
m = 25
def getProb(word, _class, m):
classDict, classSum = trainingDictionary[_class]
if word not in classDict:
return 0
prob = (float(classDict[word]) + m)/(classSum + len(classDict)*m)
return math.log(prob)
def testFile(fileName):
ham_Prob = math.log(hamProb)
spam_Prob = math.log(spamProb)
for line in file(fileName):
line = line.rstrip(PC)
words = line.split()
for word in words:
word = word.translate(string.maketrans("",""),string.punctuation)
#remove punctuation
ham_Prob = ham_Prob + getProb(word,"ham",m)
spam_Prob = spam_Prob + getProb(word,"spam",m)
if ham_Prob < spam_Prob:
return "ham"
elif spam_Prob < ham_Prob:
return "spam"
return "spam"
def testFiles(folderName,_class):
testingFiles = glob.glob("./emails/" + folderName + "/*.txt")
numCorrect = 0.0
for _file in testingFiles:
result = testFile(_file)
if result == _class:
numCorrect +=1
print "we got {result} percent correct".format(result=numCorrect/len(testingFiles)*100)
# Evaluate the classifier on each labelled test folder and print its accuracy.
for _folder, _label in (("hamtesting", "ham"), ("spamtesting", "spam")):
    testFiles(_folder, _label)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment