abienert/BayesianTextClassifier.fsx

## BayesianTextClassifier.fsx
open System
open System.Collections.Generic
open System.Text.RegularExpressions

let rec addWordTo newCorpus corpus word =
    match corpus with
        | [] ->
            match word with
                | None -> newCorpus
                | Some(value) -> newCorpus @ [value]
        | h::t ->
            match word with
                | None -> addWordTo (newCorpus @ [h]) t None
                | Some(value) ->
                    if (fst h = fst value) then
                        addWordTo (newCorpus @ [(fst h, snd h + snd value)]) t None
                    else
                        addWordTo (newCorpus @ [h]) t word

let rec trainCorpus corpus text =
    match text with
        | [] -> corpus
        | h::t ->
            trainCorpus (addWordTo [] corpus (Some h)) t

let countFrom corpus word =
    let result = corpus |> List.tryFind (fun (w, c) -> w = word)
    match result with
        | None -> 0
        | Some(v) -> snd v

let prCategory (count0, total0) (count1, total1) =
    let pword =  (count0 / total0) / ((count0 / total0) + (count1 / total1))
    let ctotal = count0 + count1
    ((0.5 + (ctotal * pword) ) / (1.0 + ctotal))

let prWord fstCorpus sndCorpus word =
    let fstCount = double (word |> countFrom fstCorpus)
    let sndCount = double (word |> countFrom sndCorpus)
    let sum corpus = double (corpus |> List.map (fun (_, c:int) -> c) |> List.sum)
    let pr = prCategory (fstCount, sum fstCorpus) (sndCount, sum sndCorpus)
    if Double.IsNaN(pr) then None else Some(pr)

let prBayesian prOptionList =
    let pList = prOptionList |> List.choose (fun v -> v) // removes all None's from the list
    let prod = pList |> List.reduce ( * )
    let inv = pList |> List.map (fun v -> 1.0 - v) |> List.reduce ( * )
    prod / (prod + inv)

let classifyText fstCorpus sndCorpus text =
     prBayesian (text |> List.map (fun (w, _) -> prWord fstCorpus sndCorpus w))

let parse src =
    // could add additional data cleansing functionality here
    Regex.Replace(src, @"[^\w\'@-]", " ").Split(' ')
    |> Array.map (fun w -> w.ToUpper())
    |> Array.filter (fun w -> w <> System.String.Empty)
    |> Set.ofArray  |> Set.map (fun w -> (w, 1)) |> Set.toList


//
// Tests
//

// Setup the spam and ham classifications
let spamCorpusSample = [("FREE", 10); ("SELL", 5); ("OFFER", 24); ("NAME", 4)]
let hamCorpusSample = [("NAME", 16); ("DATE", 8); ("REGARDS", 27)]

// Classify as Spam
let unknownText = parse "Enter the prize draw for free. A limited offer. Enter your name and address."
classifyText spamCorpusSample hamCorpusSample unknownText

// Classify as Ham
let unknownText = parse "Thanks for agreeing to the proposed date. Kind regards Bob."
classifyText spamCorpusSample hamCorpusSample unknownText

// Append/train the spam classification
let knownSpamText = parse "Enter the prize draw for free. A limited offer. Enter your name and address."
let newSpam = trainCorpus spamCorpusSample knownSpamText
	open System
	open System.Collections.Generic
	open System.Text.RegularExpressions

	let rec addWordTo newCorpus corpus word =
	match corpus with
	\| [] ->
	match word with
	\| None -> newCorpus
	\| Some(value) -> newCorpus @ [value]
	\| h::t ->
	match word with
	\| None -> addWordTo (newCorpus @ [h]) t None
	\| Some(value) ->
	if (fst h = fst value) then
	addWordTo (newCorpus @ [(fst h, snd h + snd value)]) t None
	else
	addWordTo (newCorpus @ [h]) t word

	let rec trainCorpus corpus text =
	match text with
	\| [] -> corpus
	\| h::t ->
	trainCorpus (addWordTo [] corpus (Some h)) t

	let countFrom corpus word =
	let result = corpus \|> List.tryFind (fun (w, c) -> w = word)
	match result with
	\| None -> 0
	\| Some(v) -> snd v

	let prCategory (count0, total0) (count1, total1) =
	let pword = (count0 / total0) / ((count0 / total0) + (count1 / total1))
	let ctotal = count0 + count1
	((0.5 + (ctotal * pword) ) / (1.0 + ctotal))

	let prWord fstCorpus sndCorpus word =
	let fstCount = double (word \|> countFrom fstCorpus)
	let sndCount = double (word \|> countFrom sndCorpus)
	let sum corpus = double (corpus \|> List.map (fun (_, c:int) -> c) \|> List.sum)
	let pr = prCategory (fstCount, sum fstCorpus) (sndCount, sum sndCorpus)
	if Double.IsNaN(pr) then None else Some(pr)

	let prBayesian prOptionList =
	let pList = prOptionList \|> List.choose (fun v -> v) // removes all None's from the list
	let prod = pList \|> List.reduce ( * )
	let inv = pList \|> List.map (fun v -> 1.0 - v) \|> List.reduce ( * )
	prod / (prod + inv)

	let classifyText fstCorpus sndCorpus text =
	prBayesian (text \|> List.map (fun (w, _) -> prWord fstCorpus sndCorpus w))

	let parse src =
	// could add additional data cleansing functionality here
	Regex.Replace(src, @"[^\w\'@-]", " ").Split(' ')
	\|> Array.map (fun w -> w.ToUpper())
	\|> Array.filter (fun w -> w <> System.String.Empty)
	\|> Set.ofArray \|> Set.map (fun w -> (w, 1)) \|> Set.toList



	//
	// Tests
	//

	// Setup the spam and ham classifications
	let spamCorpusSample = [("FREE", 10); ("SELL", 5); ("OFFER", 24); ("NAME", 4)]
	let hamCorpusSample = [("NAME", 16); ("DATE", 8); ("REGARDS", 27)]

	// Classify as Spam
	let unknownText = parse "Enter the prize draw for free. A limited offer. Enter your name and address."
	classifyText spamCorpusSample hamCorpusSample unknownText

	// Classify as Ham
	let unknownText = parse "Thanks for agreeing to the proposed date. Kind regards Bob."
	classifyText spamCorpusSample hamCorpusSample unknownText

	// Append/train the spam classification
	let knownSpamText = parse "Enter the prize draw for free. A limited offer. Enter your name and address."
	let newSpam = trainCorpus spamCorpusSample knownSpamText