Created
May 24, 2011 01:23
-
-
Save abienert/987997 to your computer and use it in GitHub Desktop.
Bayesian Text Classifier
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
open System | |
open System.Collections.Generic | |
open System.Text.RegularExpressions | |
let rec addWordTo newCorpus corpus word = | |
match corpus with | |
| [] -> | |
match word with | |
| None -> newCorpus | |
| Some(value) -> newCorpus @ [value] | |
| h::t -> | |
match word with | |
| None -> addWordTo (newCorpus @ [h]) t None | |
| Some(value) -> | |
if (fst h = fst value) then | |
addWordTo (newCorpus @ [(fst h, snd h + snd value)]) t None | |
else | |
addWordTo (newCorpus @ [h]) t word | |
let rec trainCorpus corpus text = | |
match text with | |
| [] -> corpus | |
| h::t -> | |
trainCorpus (addWordTo [] corpus (Some h)) t | |
let countFrom corpus word = | |
let result = corpus |> List.tryFind (fun (w, c) -> w = word) | |
match result with | |
| None -> 0 | |
| Some(v) -> snd v | |
let prCategory (count0, total0) (count1, total1) = | |
let pword = (count0 / total0) / ((count0 / total0) + (count1 / total1)) | |
let ctotal = count0 + count1 | |
((0.5 + (ctotal * pword) ) / (1.0 + ctotal)) | |
let prWord fstCorpus sndCorpus word = | |
let fstCount = double (word |> countFrom fstCorpus) | |
let sndCount = double (word |> countFrom sndCorpus) | |
let sum corpus = double (corpus |> List.map (fun (_, c:int) -> c) |> List.sum) | |
let pr = prCategory (fstCount, sum fstCorpus) (sndCount, sum sndCorpus) | |
if Double.IsNaN(pr) then None else Some(pr) | |
let prBayesian prOptionList = | |
let pList = prOptionList |> List.choose (fun v -> v) // removes all None's from the list | |
let prod = pList |> List.reduce ( * ) | |
let inv = pList |> List.map (fun v -> 1.0 - v) |> List.reduce ( * ) | |
prod / (prod + inv) | |
let classifyText fstCorpus sndCorpus text = | |
prBayesian (text |> List.map (fun (w, _) -> prWord fstCorpus sndCorpus w)) | |
let parse src = | |
// could add additional data cleansing functionality here | |
Regex.Replace(src, @"[^\w\'@-]", " ").Split(' ') | |
|> Array.map (fun w -> w.ToUpper()) | |
|> Array.filter (fun w -> w <> System.String.Empty) | |
|> Set.ofArray |> Set.map (fun w -> (w, 1)) |> Set.toList | |
// | |
// Tests | |
// | |
// Setup the spam and ham classifications | |
let spamCorpusSample = [("FREE", 10); ("SELL", 5); ("OFFER", 24); ("NAME", 4)] | |
let hamCorpusSample = [("NAME", 16); ("DATE", 8); ("REGARDS", 27)] | |
// Classify as Spam | |
let unknownText = parse "Enter the prize draw for free. A limited offer. Enter your name and address." | |
classifyText spamCorpusSample hamCorpusSample unknownText | |
// Classify as Ham | |
let unknownText = parse "Thanks for agreeing to the proposed date. Kind regards Bob." | |
classifyText spamCorpusSample hamCorpusSample unknownText | |
// Append/train the spam classification | |
let knownSpamText = parse "Enter the prize draw for free. A limited offer. Enter your name and address." | |
let newSpam = trainCorpus spamCorpusSample knownSpamText |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment