Created
March 23, 2015 15:36
-
-
Save willemneal/561bca8a3e212fb421c1 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import collections  ## needed for counting words
import glob  ## needed for listing files in folder
import math
import string  ## needed to remove punctuation
# Line-ending markers. The names are historical and kept for compatibility:
# "\r\n" (carriage return + line feed) is the ending written by Windows tools,
# while "\n" (line feed only) is what Python yields for lines read in text mode.
MAC = "\r\n"
PC = "\n"
def makeDictionary(folderName):
    """Count the words in every ``.txt`` file under ``./emails/<folderName>/``.

    Each line is split on whitespace and ASCII punctuation is removed from
    every token. Tokens that end up empty or consist only of digits are
    ignored.

    Args:
        folderName: Name of the sub-folder of ``./emails`` to read.

    Returns:
        A ``(wordDictionary, wordTotal, fileCount)`` tuple:

        * ``wordDictionary`` maps each word that occurs at least three times
          to its number of occurrences,
        * ``wordTotal`` is the number of tokens counted, including the
          occurrences of words that were too rare to keep,
        * ``fileCount`` is the number of files that matched.
    """
    trainingFiles = glob.glob("./emails/" + folderName + "/*.txt")
    punctuation = frozenset(string.punctuation)
    counts = collections.Counter()
    for filename in trainingFiles:
        # Close each file as soon as it has been read.
        with open(filename) as handle:
            for line in handle:
                # split() discards all surrounding whitespace, including the
                # trailing newline, so no explicit rstrip is required.
                for rawWord in line.split():
                    word = "".join(ch for ch in rawWord if ch not in punctuation)
                    if word and not word.isdigit():
                        counts[word] += 1
    # The total is taken before pruning so rare words still count towards it.
    wordTotal = sum(counts.values())
    # Build a fresh dict instead of deleting entries while iterating.
    wordDictionary = {word: count for word, count in counts.items() if count > 2}
    return wordDictionary, wordTotal, len(trainingFiles)
# Build one word-count model per class from the labelled training folders.
hamDictionary, hamSum, hamFiles = makeDictionary("hamtraining")
spamDictionary, spamSum, spamFiles = makeDictionary("spamtraining")

# Debug aid: print len(hamDictionary.values()) / len(spamDictionary.values())
# here to inspect how many distinct words each class kept.

# Class priors: the share of training files that belongs to each class.
hamProb = float(hamFiles) / (hamFiles + spamFiles)
spamProb = float(spamFiles) / (hamFiles + spamFiles)

# Per-class lookup of (word counts, total counted tokens), keyed by label.
trainingDictionary = {
    "spam": (spamDictionary, spamSum),
    "ham": (hamDictionary, hamSum),
}

# Smoothing constant handed to getProb for every word lookup.
m = 25
def getProb(word, _class, m): | |
classDict, classSum = trainingDictionary[_class] | |
if word not in classDict: | |
return 0 | |
prob = (float(classDict[word]) + m)/(classSum + len(classDict)*m) | |
return math.log(prob) | |
def testFile(fileName): | |
ham_Prob = math.log(hamProb) | |
spam_Prob = math.log(spamProb) | |
for line in file(fileName): | |
line = line.rstrip(PC) | |
words = line.split() | |
for word in words: | |
word = word.translate(string.maketrans("",""),string.punctuation) | |
#remove punctuation | |
ham_Prob = ham_Prob + getProb(word,"ham",m) | |
spam_Prob = spam_Prob + getProb(word,"spam",m) | |
if ham_Prob < spam_Prob: | |
return "ham" | |
elif spam_Prob < ham_Prob: | |
return "spam" | |
return "spam" | |
def testFiles(folderName, _class):
    """Classify every ``.txt`` file in a folder and report the accuracy.

    Each file under ``./emails/<folderName>/`` is passed to ``testFile`` and
    the result is compared with the expected label.

    Args:
        folderName: Name of the sub-folder of ``./emails`` to evaluate.
        _class: The label ("ham" or "spam") every file in the folder should get.

    Returns:
        The percentage of files classified as ``_class`` (a float between 0
        and 100), or ``None`` when the folder contains no ``.txt`` files.
    """
    testingFiles = glob.glob("./emails/" + folderName + "/*.txt")
    if not testingFiles:
        # Nothing to evaluate; avoid dividing by zero below.
        print("no test files found in ./emails/{folder}".format(folder=folderName))
        return None
    numCorrect = sum(1 for path in testingFiles if testFile(path) == _class)
    percentCorrect = float(numCorrect) / len(testingFiles) * 100
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("we got {result} percent correct".format(result=percentCorrect))
    return percentCorrect
# Evaluate the classifier on the held-out folders: ham first, then spam.
for folder, label in (("hamtesting", "ham"), ("spamtesting", "spam")):
    testFiles(folder, label)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment