Skip to content

Instantly share code, notes, and snippets.

@ngopal
Created March 23, 2017 00:46
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ngopal/b15e4cf09e6bb0d11d7ced6b03bddc00 to your computer and use it in GitHub Desktop.
Save ngopal/b15e4cf09e6bb0d11d7ced6b03bddc00 to your computer and use it in GitHub Desktop.
A first attempt at creating a Bayesian spam filter from scratch; trying to do this in Lisp as a learning exercise; definitely still a work in progress...
;; A first attempt at creating a Bayesian spam filter from scratch
;; Trying to do this in lisp as a learning exercise
;; Definitely still a work in progress...
;; Toy training corpora (one document per class) and the document to classify.
(defparameter *spam* '(this is a spam document that is fake news))
(defparameter *ham* '(this is nikhils document))
(defparameter *testdoc* '(this is a test document that is meant to be fake))
;; Word -> occurrence-count tables, one per class.  They are defined with
;; DEFPARAMETER because SETQ on a variable that was never defined has
;; undefined consequences; the names are unchanged so existing references
;; to SPAM-TABLE and HAM-TABLE keep working.
(defparameter spam-table (make-hash-table))
(defparameter ham-table (make-hash-table))
;; count the words
(defun count-words (hashds corpus)
  "Tally every word of CORPUS (a list) into the hash table HASHDS.

Each word is echoed to standard output with PRINC as it is processed.
HASHDS is modified in place: a word's entry starts at 1 and is incremented
on every further occurrence.  Returns a list with, for each word of CORPUS
in order, the count that word had reached at that point."
  (loop for word in corpus
        do (princ word)
        ;; The default of 0 lets INCF create missing entries, so a single
        ;; lookup form covers both the first and the repeated occurrence.
        collect (incf (gethash word hashds 0))))
;; Fill each class's frequency table from its training corpus.
(count-words spam-table *spam*)
(count-words ham-table *ham*)
(defun nbayes (ws s wh h)
  "Return P(S|W), the posterior probability of spam given a word.

WS is P(W|S), S is the prior P(S), WH is P(W|H) and H is the prior P(H):

  P(S|W) = P(W|S)*P(S) / ( P(W|S)*P(S) + P(W|H)*P(H) )

Returns NIL when the denominator is zero, so callers can detect and skip
that case instead of hitting a division by zero."
  (let* ((spam-evidence (* ws s))
         (total-evidence (+ spam-evidence (* wh h))))
    ;; ZEROP instead of (EQ 0 ...): EQ on numbers is implementation-dependent
    ;; and never matches a floating-point 0.0, so a float denominator of zero
    ;; would slip past the guard and reach the division.
    (unless (zerop total-evidence)
      (/ spam-evidence total-evidence))))
(defun getfreq (word table)
  "Return the relative frequency of WORD in TABLE.

TABLE is a hash table mapping words to occurrence counts.  The result is
WORD's count divided by the sum of all counts in TABLE, so the frequencies
of all words in a table add up to 1.  Returns 0 when WORD is absent or when
TABLE holds no counts at all."
  (let ((total (loop for n being the hash-values of table sum n)))
    ;; Normalise by the total number of occurrences, not by the number of
    ;; distinct keys; guard the empty table to avoid dividing by zero.
    (if (zerop total)
        0
        (/ (gethash word table 0) total))))
;; Smoke checks: frequency of THIS in each table, then its posterior spam
;; probability under equal priors.
(getfreq 'this spam-table)
(getfreq 'this ham-table)
(nbayes (getfreq 'this spam-table) 0.5
        (getfreq 'this ham-table) 0.5)
(defun posterior (word prob-spam prob-ham)
  "Return the posterior spam probability of WORD, or NIL if it is undefined.

PROB-SPAM and PROB-HAM are the class priors.  The word likelihoods are
looked up with GETFREQ in the global SPAM-TABLE and HAM-TABLE, and the
result is whatever NBAYES returns for them."
  ;; NBAYES itself guards against division by zero and reports that case as
  ;; NIL, so one call is enough; no second evaluation or placeholder string
  ;; is needed to get the same return value.
  (nbayes (getfreq word spam-table) prob-spam
          (getfreq word ham-table) prob-ham))
;; Sample posteriors for a few words under different spam/ham priors.
(posterior 'fake 0.5 0.5)
(posterior 'document 0.2 0.8)
(posterior 'nikhils 0.8 0.2)
(posterior 'this 0.2 0.8)
;; What I need is another hash that has all words used over both documents as keys, and frequencies as values
(defun print-hash-entry (key value)
  "Write one line describing the KEY/VALUE pair to standard output.

Takes two arguments so it can be passed directly to MAPHASH.  Returns NIL."
  (format *standard-output*
          "The value associated with the key ~S is ~S~%"
          key
          value))
;; Print every word/count pair currently stored in the spam table.
(loop for word being the hash-keys of spam-table
        using (hash-value tally)
      do (print-hash-entry word tally))
(defun getkeys (key value)
  "Return KEY and ignore VALUE; takes two arguments so MAPHASH can call it."
  (declare (ignore value))
  ;; KEY is a variable: writing it in parentheses would try to call a
  ;; function named KEY, which does not exist.
  key)
(maphash #'getkeys spam-table)
;; For every word in the spam table, pair its spam count with its ham count
;; (NIL when the word does not occur in the ham table).  The word variable
;; has to be bound by an iteration; a bare X at top level is unbound.
(loop for x being the hash-keys of spam-table
      collect (cons (gethash x spam-table) (gethash x ham-table)))
;; Combined spam probability over the per-word posteriors p1..pn:
;;   (/ (* p1 p2 ... pn)
;;      (+ (* p1 p2 ... pn) (* (- 1 p1) (- 1 p2) ... (- 1 pn))))
;; The form below computes only the numerator: the product of the posteriors
;; of the words in *TESTDOC*, skipping words for which POSTERIOR returns NIL.
;; REDUCE is required because * takes numbers as arguments, not a list.
(reduce #'*
        (remove nil
                (mapcar (lambda (x) (posterior x 0.5 0.5))
                        *testdoc*)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment