Created
March 23, 2017 00:46
-
-
Save ngopal/b15e4cf09e6bb0d11d7ced6b03bddc00 to your computer and use it in GitHub Desktop.
A first attempt at creating a Bayesian spam filter from scratch; trying to do this in Lisp as a learning exercise; definitely still a work in progress...
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
;; A first attempt at creating a Bayesian spam filter from scratch
;; Trying to do this in lisp as a learning exercise | |
;; Definitely still a work in progress... | |
(defparameter *spam* '(this is a spam document that is fake news)
  "Training corpus for the spam class, as a list of word symbols.")
(defparameter *ham* '(this is nikhils document)
  "Training corpus for the ham class, as a list of word symbols.")
(defparameter *testdoc* '(this is a test document that is meant to be fake)
  "Document to classify, as a list of word symbols.")
;; The tables are defined with DEFPARAMETER rather than assigned with SETQ:
;; assigning a variable that was never defined is not portable and makes
;; implementations warn about an undefined variable.  The names carry no
;; earmuffs because the rest of the file refers to them as written here.
(defparameter spam-table (make-hash-table)
  "Maps each word of the spam corpus to its occurrence count.")
(defparameter ham-table (make-hash-table)
  "Maps each word of the ham corpus to its occurrence count.")
;; Count the words.
(defun count-words (hashds corpus)
  "Tally every word of the list CORPUS into the hash table HASHDS.

HASHDS maps a word to the number of times it has been seen so far; each
element of CORPUS adds one to its entry, and a missing entry starts at 0.
Returns a list holding, for each word of CORPUS in order, that word's
count right after the occurrence was recorded."
  (mapcar (lambda (word)
            ;; The default of 0 lets INCF create an entry that is not there yet.
            (incf (gethash word hashds 0)))
          corpus))
;; Fill the per-class word-count tables from the two training corpora.
(count-words spam-table *spam*)
(count-words ham-table *ham*)
(defun nbayes (ws s wh h)
  "Return P(S|W) by Bayes' rule, or NIL when the denominator is zero.

  P(S|W) = P(W|S)*P(S) / ( P(W|S)*P(S) + P(W|H)*P(H) )

WS is P(W|S), S is P(S), WH is P(W|H) and H is P(H)."
  (let* ((spam-term (* ws s))
         (evidence (+ spam-term (* wh h))))
    ;; ZEROP compares numerically, so a float 0.0 is caught as well.  EQ is
    ;; not a reliable test on numbers and never matches 0 against 0.0, which
    ;; would let the division below run with a zero divisor.
    (unless (zerop evidence)
      (/ spam-term evidence))))
(defun getfreq (word table)
  "Return the relative frequency of WORD among the words counted in TABLE.

TABLE maps words to occurrence counts.  The result is WORD's count divided
by the sum of all counts in TABLE.  Returns 0 when WORD is absent or when
TABLE holds no counts at all."
  ;; The divisor is the total number of occurrences, not HASH-TABLE-COUNT:
  ;; the latter is the number of distinct words, which overstates every
  ;; frequency as soon as any word repeats.
  (let ((total (loop for n being the hash-values of table sum n)))
    (if (zerop total)
        0                               ; empty table: avoid dividing by zero
        (/ (gethash word table 0) total))))
;; Spot checks: frequency of THIS in each class, then the spam probability
;; of THIS with equal priors.
(getfreq 'THIS spam-table)
(getfreq 'THIS ham-table)
(nbayes (getfreq 'THIS spam-table) 0.5 (getfreq 'THIS ham-table) 0.5)
(defun posterior (word prob-spam prob-ham)
  "Return the spam probability of WORD given priors PROB-SPAM and PROB-HAM.

The word frequencies are read from the global tables SPAM-TABLE and
HAM-TABLE.  Returns NIL when NBAYES cannot compute a value, so callers can
filter such words out."
  ;; NBAYES already prevents division by zero by answering NIL, so one call
  ;; is enough; there is no need to probe it first and then call it again.
  (nbayes (getfreq word spam-table) prob-spam
          (getfreq word ham-table) prob-ham))
;; Spot checks of POSTERIOR for a few words under different priors.
(posterior 'FAKE 0.5 0.5)
(posterior 'DOCUMENT 0.2 0.8)
(posterior 'NIKHILS 0.8 0.2)
(posterior 'THIS 0.2 0.8)
;; What I need is another hash that has all words used over both documents as keys, and frequencies as values | |
(defun print-hash-entry (key value)
  "Print one line naming KEY and VALUE to standard output.

Takes the two arguments MAPHASH passes for each entry, so it can be used
directly as a MAPHASH function.  Both are printed with ~S."
  (format t "The value associated with the key ~S is ~S~%" key value))
;; Dump every word/count pair of the spam table.
(maphash #'print-hash-entry spam-table)
(defun getkeys (key value)
  "Return KEY.  VALUE is accepted so the function fits MAPHASH's
two-argument calling convention, and is ignored."
  (declare (ignore value))
  ;; KEY is returned as a value; writing (key) would try to call a function
  ;; named KEY instead.
  key)
;; MAPHASH always returns NIL, so mapping a function over a table cannot
;; hand the keys back; they are gathered with LOOP instead.
(loop for word being the hash-keys of spam-table collect word)
;; Build a table keyed by every word seen in either corpus, whose value is
;; the pair (spam-count . ham-count); a word missing from one corpus counts
;; as 0 there.
(let ((combined (make-hash-table)))
  (dolist (table (list spam-table ham-table) combined)
    (loop for x being the hash-keys of table
          do (setf (gethash x combined)
                   (cons (gethash x spam-table 0) (gethash x ham-table 0))))))
;; Combined probability: (/ (* p1 p2 ... pn) (+ (* p1 p2 ... pn) (* (- 1 p1) (- 1 p2) ... (- 1 pn))))
;; Multiply together the per-word posteriors of the test document, leaving
;; out the words for which POSTERIOR answers NIL.  The function * takes
;; numbers as separate arguments, not a list, so the product is folded with
;; REDUCE.
(reduce #'*
        (remove nil
                (mapcar (lambda (x) (posterior x 0.5 0.5))
                        *testdoc*)))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment