Skip to content

Instantly share code, notes, and snippets.

@abienert
Created May 24, 2011 01:23
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save abienert/987997 to your computer and use it in GitHub Desktop.
Save abienert/987997 to your computer and use it in GitHub Desktop.
Bayesian Text Classifier
open System
open System.Collections.Generic
open System.Text.RegularExpressions
let rec addWordTo newCorpus corpus word =
match corpus with
| [] ->
match word with
| None -> newCorpus
| Some(value) -> newCorpus @ [value]
| h::t ->
match word with
| None -> addWordTo (newCorpus @ [h]) t None
| Some(value) ->
if (fst h = fst value) then
addWordTo (newCorpus @ [(fst h, snd h + snd value)]) t None
else
addWordTo (newCorpus @ [h]) t word
let rec trainCorpus corpus text =
match text with
| [] -> corpus
| h::t ->
trainCorpus (addWordTo [] corpus (Some h)) t
let countFrom corpus word =
let result = corpus |> List.tryFind (fun (w, c) -> w = word)
match result with
| None -> 0
| Some(v) -> snd v
let prCategory (count0, total0) (count1, total1) =
let pword = (count0 / total0) / ((count0 / total0) + (count1 / total1))
let ctotal = count0 + count1
((0.5 + (ctotal * pword) ) / (1.0 + ctotal))
let prWord fstCorpus sndCorpus word =
let fstCount = double (word |> countFrom fstCorpus)
let sndCount = double (word |> countFrom sndCorpus)
let sum corpus = double (corpus |> List.map (fun (_, c:int) -> c) |> List.sum)
let pr = prCategory (fstCount, sum fstCorpus) (sndCount, sum sndCorpus)
if Double.IsNaN(pr) then None else Some(pr)
let prBayesian prOptionList =
let pList = prOptionList |> List.choose (fun v -> v) // removes all None's from the list
let prod = pList |> List.reduce ( * )
let inv = pList |> List.map (fun v -> 1.0 - v) |> List.reduce ( * )
prod / (prod + inv)
let classifyText fstCorpus sndCorpus text =
prBayesian (text |> List.map (fun (w, _) -> prWord fstCorpus sndCorpus w))
let parse src =
// could add additional data cleansing functionality here
Regex.Replace(src, @"[^\w\'@-]", " ").Split(' ')
|> Array.map (fun w -> w.ToUpper())
|> Array.filter (fun w -> w <> System.String.Empty)
|> Set.ofArray |> Set.map (fun w -> (w, 1)) |> Set.toList
//
// Tests
//
// Setup the spam and ham classifications
let spamCorpusSample = [("FREE", 10); ("SELL", 5); ("OFFER", 24); ("NAME", 4)]
let hamCorpusSample = [("NAME", 16); ("DATE", 8); ("REGARDS", 27)]
// Classify as Spam
let unknownText = parse "Enter the prize draw for free. A limited offer. Enter your name and address."
classifyText spamCorpusSample hamCorpusSample unknownText
// Classify as Ham
let unknownText = parse "Thanks for agreeing to the proposed date. Kind regards Bob."
classifyText spamCorpusSample hamCorpusSample unknownText
// Append/train the spam classification
let knownSpamText = parse "Enter the prize draw for free. A limited offer. Enter your name and address."
let newSpam = trainCorpus spamCorpusSample knownSpamText
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment