Created
December 12, 2013 16:11
-
-
Save mathias-brandewinder/7930546 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
let source = __SOURCE_DIRECTORY__ | |
#load "NaiveBayes.fs" | |
open MachineLearning.NaiveBayes | |
open System | |
open System.IO | |
open System.Text | |
open System.Text.RegularExpressions | |
(* ********************************************** | |
0. PREFACE: GETTING DATA | |
First let's grab some data! | |
The following code is boring, but will spare you some time | |
loading up a training and validation set. | |
The dataset is a collection of SMS messages, | |
marked as "Spam" or "Ham". | |
The original dataset has been taken from | |
the UC Irvine Machine Learning Repository: | |
http://archive.ics.uci.edu/ml/datasets/SMS+Spam+Collection | |
*) | |
// TODO: SIT BACK, RELAX, RUN THE CODE BELOW :) | |
// Locations of the training and validation files, which sit next to this script.
// Path.Combine inserts the platform's own directory separator, so the script
// is not tied to the Windows-only "\" that was previously hard-coded.
let trainingPath = Path.Combine(source, "SpamTraining")
let validationPath = Path.Combine(source, "SpamValidation")
// we define 2 classes, Ham or Spam
type Class = Spam | Ham

// Splits one raw line into (label, message text).
// A line must start with "ham" or "spam", followed by exactly one separator
// character and then the message; the label and that separator are removed.
// Fails with "What is this?" when the line starts with neither label.
let spamOrHam (line: string) =
    // Remove the label plus the single separator character right after it.
    let dropLabel (label: string) = line.Remove(0, label.Length + 1)
    // Ordinal comparison: labels are plain ASCII markers, so matching must not
    // depend on the current culture's collation rules.
    if line.StartsWith("ham", StringComparison.Ordinal) then (Ham, dropLabel "ham")
    elif line.StartsWith("spam", StringComparison.Ordinal) then (Spam, dropLabel "spam")
    else failwith "What is this?"
// Reads all lines of the file at `path` and converts each one into a
// (label, text) pair with spamOrHam.
let read path =
    let lines = File.ReadAllLines(path)
    Array.map spamOrHam lines
// Labeled messages loaded from the training and the validation files.
let trainingSample = trainingPath |> read
let validationSample = validationPath |> read
(* ********************************************** | |
CHAPTER 1: GET TO KNOW YOUR DATA | |
It's always a good idea to spend some time | |
to know your data, and become "intimate" with it. | |
The more you understand it, the better you can | |
help your machine get smart! | |
*) | |
// let's look at the 20 first "ham" items | |
// Prints up to 20 "ham" messages from the training sample.
// Seq.filter stops scanning once enough messages were found, and
// Seq.truncate (unlike Seq.take) does not throw when fewer than 20 exist.
let ham_20 =
    trainingSample
    |> Seq.filter (fun (cl, _) -> cl = Ham)
    |> Seq.truncate 20
    |> Seq.iter (fun (_, txt) -> printfn "%s" txt)
// TODO: DISPLAY 20 FIRST SPAM SMS | |
// Prints up to 20 "spam" messages from the training sample.
// Seq.filter stops scanning once enough messages were found, and
// Seq.truncate (unlike Seq.take) does not throw when fewer than 20 exist.
let spam_20 =
    trainingSample
    |> Seq.filter (fun (cl, _) -> cl = Spam)
    |> Seq.truncate 20
    |> Seq.iter (fun (_, txt) -> printfn "%s" txt)
(* ********************************************** | |
CHAPTER 2: ESTABLISH A BASELINE | |
It is crucial to establish a baseline for what | |
"a good/bad prediction" is. What we have to beat | |
here is a "naive" prediction, the most likely class. | |
What is the probability that a SMS message | |
from the training set is spam or ham? | |
How about the validation sample? | |
Proba(SMS is Spam) = count(Spam SMS) / count(SMS) | |
*) | |
// TODO: COMPUTE PROBABILITY OF HAM, SPAM | |
// Fraction of training messages labeled Ham and fraction labeled Spam.
let pHam, pSpam =
    let total = float trainingSample.Length
    let hamMessages, spamMessages =
        Array.partition (fun (label, _) -> label = Ham) trainingSample
    let share (messages: _ []) = float messages.Length / total
    share hamMessages, share spamMessages
// Accuracy obtained by always predicting the most frequent class.
let baseline = max pHam pSpam
(* ********************************************** | |
CHAPTER 3: MEET REVEREND BAYES AND HIS THEOREM | |
Bayes Theorem enables incorporating additional data, | |
to refine a prediction, using the formula: | |
P(A|B) = P(B|A) x P(A) / P(B) | |
In this case: *) | |
// Proba (SMS is Spam, if SMS contains "chat") =
// Proba (SMS contains "chat", if SMS is Spam) x | |
// Proba (SMS is Spam) / Proba (SMS contains "chat") | |
(* | |
Our sample contains 4000 SMS: | |
Spam SMS 13.350% (534) | |
Spam SMS with "chat" 03.558% (19) | |
Ham SMS 86.650% (3466) | |
Ham SMS with "chat" 00.288% (10) | |
SMS with "chat" 00.725% (29) | |
*) | |
// Frequencies observed in the 4000-message sample listed above.
let p_spam = 0.13350
let p_ham = 1.0 - p_spam
let p_chat = 0.00725
let p_chat_if_spam = 0.03558
let p_chat_if_ham = 0.00288
// TODO: IF A SMS MESSAGE CONTAINS "chat",
// WHAT IS THE PROBABILITY IT IS SPAM? HAM?
let p_spam_if_chat, p_ham_if_chat =
    // Bayes' theorem: P(class | chat) = P(chat | class) * P(class) / P(chat)
    let posterior likelihood prior = likelihood * prior / p_chat
    posterior p_chat_if_spam p_spam, posterior p_chat_if_ham p_ham
// Sanity check: the two posteriors should add up to (roughly) 1.
abs (p_spam_if_chat + p_ham_if_chat - 1.0) < 0.001
(* ********************************************** | |
CHAPTER 4: CLASSIFY A MESSAGE BASED ON CONTENT | |
What is the probability that a spam SMS message | |
contains the word "ringtone"? "mom"? "800"? | |
Quick math recap, just in case: | |
*) | |
// Proba(Spam SMS contains "ringtone") = | |
// count(Spam SMS containing "ringtone") / count(Spam SMS) | |
// Proba(SMS is Spam if contains "ringtone") = | |
// Proba(SMS contains "ringtone" if it is Spam) * | |
// Proba(SMS is Spam) / Proba(SMS contains "ringtone") | |
(* | |
This is a direct application of Bayes' Theorem: | |
(See Chapter 3) | |
Note that if we just want to decide whether | |
a message is ham or spam, we can ignore the | |
Proba(SMS contains "chat") part. | |
*) | |
// TODO: COMPUTE PROBABILITY THAT | |
// HAM, SPAM MESSAGE CONTAINS "ringtone" | |
// True when `txt` contains `token` as a substring.
// `token` is annotated as string because String.Contains has both string and
// char overloads on current .NET, which leaves an un-annotated argument ambiguous.
let contains (txt: string) (token: string) = txt.Contains(token)
// For the given token, returns the pair
//   (share of ham messages containing it, share of spam messages containing it)
// measured on trainingSample.
let probaHamSpamIfContains token =
    // Fraction of the given messages whose text contains the token.
    let shareWithToken messages =
        let hits = messages |> Array.filter (fun (_, txt) -> contains txt token)
        float hits.Length / float (Array.length messages)
    let hamMessages, spamMessages =
        Array.partition (fun (label, _) -> label = Ham) trainingSample
    shareWithToken hamMessages, shareWithToken spamMessages
// TODO: PROBA THAT MESSAGE IS HAM OR SPAM | |
// IF CONTAINS "ringtone", "800", ... | |
// Share of ham / spam training messages that contain each of these tokens.
"ringtone" |> probaHamSpamIfContains
"800" |> probaHamSpamIfContains
(* ********************************************** | |
CHAPTER 5: NAIVE BAYES CLASSIFIER DEMO | |
The Naive Bayes classifier uses the same idea, | |
but instead of using one token, it will combine | |
the probabilities of each token into one aggregate | |
probability. | |
Instead of coding it from scratch, we'll use the
basic implementation from NaiveBayes.fs
Below is an illustration on how to train a classifier, | |
and use some of the built-in functions. | |
*) | |
// TODO: SIT BACK, RELAX, RUN THE CODE BELOW :) | |
// select what tokens to use: | |
// a large part of how good the classifier is, | |
// depends on choosing good tokens.
// Hand-picked tokens used for the demo classifier.
let demoTokens = set [ "chat"; "800"; "mom"; "ringtone"; "prize"; "you" ]
// train a classifier using a sample and tokens
let demoClassifier = demoTokens |> classifier bagOfWords trainingSample
// look at what the classifier is doing :) | |
// Prints actual label, predicted label and text for up to 20 validation messages.
// Seq.truncate is used instead of the fixed slice .[0..19] so a validation
// sample with fewer than 20 messages is handled gracefully.
validationSample
|> Seq.truncate 20
|> Seq.iter (fun (cl, text) ->
    printfn "%A -> %A / %s" cl (demoClassifier text) text)
// Let's compute the % correctly classified | |
// Scores each validation message 1.0 when the demo classifier predicts its
// label and 0.0 otherwise, then prints the average score.
validationSample
|> Seq.map (fun (label, text) -> if label = demoClassifier text then 1. else 0.)
|> Seq.average
|> printfn "Correct: %f"
(* ********************************************** | |
CHAPTER 6: SENTIMENT ANALYSIS | |
Looking at what words are frequently used | |
in different groups can give insight into | |
what "defines" these groups. This is often | |
referred to as "sentiment" analysis.
*) | |
// Extract tokens from training sample | |
// All tokens extracted from the training sample.
let tokens = trainingSample |> extractWords
// Compute count of token in sample
let frequency = tokens |> bagOfWords (prepare trainingSample)
// TODO: MOST FREQUENT TOKENS IN HAM, SPAM? | |
// Hint: Map.toSeq will convert a Map<a,b> | |
// into a sequence of tuples (a,b) | |
// Split the training sample into its ham messages and its spam messages.
let ham, spam = Array.partition (fun (label, _) -> label = Ham) trainingSample
// Up to 10 tokens with the highest count among ham messages.
// Seq.truncate (unlike Seq.take) does not throw when fewer than 10 tokens exist.
let topHam =
    bagOfWords (prepare ham) tokens
    |> Map.toSeq
    |> Seq.sortBy (fun (_, freq) -> - freq) // negated count: highest first
    |> Seq.map fst
    |> Seq.truncate 10
    |> Set.ofSeq
// Up to 10 tokens with the highest count among spam messages.
// Seq.truncate (unlike Seq.take) does not throw when fewer than 10 tokens exist.
let topSpam =
    bagOfWords (prepare spam) tokens
    |> Map.toSeq
    |> Seq.sortBy (fun (_, freq) -> - freq) // negated count: highest first
    |> Seq.map fst
    |> Seq.truncate 10
    |> Set.ofSeq
(* ********************************************** | |
CHAPTER 7: STOP WORDS! | |
Did you note that some of the top words in | |
both Ham and Spam are just very common English | |
words, like "i", "you", "to"... ? | |
These are probably not very informative, and | |
often called "stop words". | |
Let's create a "clean" list of tokens by removing | |
the stop words, and check our top tokens again. | |
*) | |
// http://www.textfixer.com/resources/common-english-words.txt | |
// Set of common English words, built by splitting a comma-separated list.
let stopWords =
    let commaSeparated = "a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your"
    let words = commaSeparated.Split([| ',' |])
    set words
// TODO: CLEAN TOKENS = TOKENS - STOP WORDS | |
// Every extracted token that is not a stop word.
let cleanTokens = Set.difference tokens stopWords
// TODO, AGAIN: MOST FREQUENT TOKENS IN HAM, SPAM? | |
// Up to 10 non-stop-word tokens with the highest count among ham messages.
// Seq.truncate (unlike Seq.take) does not throw when fewer than 10 tokens exist.
let topCleanHam =
    bagOfWords (prepare ham) cleanTokens
    |> Map.toSeq
    |> Seq.sortBy (fun (_, freq) -> - freq) // negated count: highest first
    |> Seq.map fst
    |> Seq.truncate 10
    |> Set.ofSeq
// Up to 10 non-stop-word tokens with the highest count among spam messages.
// Seq.truncate (unlike Seq.take) does not throw when fewer than 10 tokens exist.
let topCleanSpam =
    bagOfWords (prepare spam) cleanTokens
    |> Map.toSeq
    |> Seq.sortBy (fun (_, freq) -> - freq) // negated count: highest first
    |> Seq.map fst
    |> Seq.truncate 10
    |> Set.ofSeq
(* ********************************************** | |
CHAPTER 8: OUR FIRST CLASSIFIER | |
Now that we have a decent list of tokens | |
to start with, let's train a classifier. | |
*) | |
// TODO: PICK TOP 10 SPAM + TOP 10 HAM CLEAN TOKENS | |
// Tokens that are in the ham top list, the spam top list, or both.
let betterTokens = topCleanHam + topCleanSpam
// TODO: TRAIN CLASSIFIER WITH THESE TOKENS
// train a classifier using a sample and tokens
let betterClassifier = betterTokens |> classifier bagOfWords trainingSample
// TODO: TAKE 200 FIRST SMS FROM VALIDATION, | |
// AND COMPUTE % CORRECTLY CLASSIFIED | |
// Fraction of the validation sample that betterClassifier labels correctly
// (computed over the whole validation sample).
validationSample
|> Seq.map (fun (label, text) -> if label = betterClassifier text then 1. else 0.)
|> Seq.average
(* ********************************************** | |
CHAPTER 9: BUT HERE'S MY NUMBER, SO CALL ME MAYBE | |
Remember in Chapter 4, when we checked for messages | |
containing "800"? Did you notice how many Spam SMSs | |
contain numbers (phone or text)? | |
Can we make them into a feature / token? | |
*) | |
// Matches any run of three or more consecutive digits.
let numbersRegex = new Regex(@"\d{3,}")

// Replaces every run of three or more digits in `text` with the
// placeholder token __number__.
let replaceNumbers (text: string) =
    let placeholder = "__number__"
    numbersRegex.Replace(text, placeholder)

// Example of the replacement applied to a sample message.
let exampleReplacement = replaceNumbers "Call 1800123456 for your free spam"
// TODO: PRE-PROCESS TEXT TO DEAL WITH "NUMBERS": | |
// REPLACE NUMBERS WITH __number__ | |
// Copies of both samples in which long digit runs are replaced by __number__;
// labels are kept unchanged.
let training = [| for (label, text) in trainingSample -> label, replaceNumbers text |]
let validation = [| for (label, text) in validationSample -> label, replaceNumbers text |]
// TODO: TRAIN A CLASSIFIER ON PRE-PROCESSED | |
// TRAINING SET, AND EVALUATE QUALITY | |
// USING THAT NEW FEATURE | |
// ON 200 FIRST SMS FROM VALIDATION, | |
// AND COMPUTE % CORRECTLY CLASSIFIED | |
// Split the pre-processed training set by label.
let finalHam, finalSpam = Array.partition (fun (label, _) -> label = Ham) training
// Tokens of the pre-processed training set, minus the stop words.
let finalTokens = Set.difference (extractWords training) stopWords
// Up to 10 tokens with the highest count among pre-processed ham messages.
// Seq.truncate (unlike Seq.take) does not throw when fewer than 10 tokens exist.
let finalHamTokens =
    bagOfWords (prepare finalHam) finalTokens
    |> Map.toSeq
    |> Seq.sortBy (fun (_, freq) -> - freq) // negated count: highest first
    |> Seq.map fst
    |> Seq.truncate 10
    |> Set.ofSeq
// Up to 10 tokens with the highest count among pre-processed spam messages.
// Seq.truncate (unlike Seq.take) does not throw when fewer than 10 tokens exist.
let finalSpamTokens =
    bagOfWords (prepare finalSpam) finalTokens
    |> Map.toSeq
    |> Seq.sortBy (fun (_, freq) -> - freq) // negated count: highest first
    |> Seq.map fst
    |> Seq.truncate 10
    |> Set.ofSeq
// TODO: TRAIN CLASSIFIER WITH THESE TOKENS | |
// Tokens that are in the final ham list, the final spam list, or both.
let bestTokens = finalHamTokens + finalSpamTokens
// train a classifier using a sample and tokens
let finalClassifier = bestTokens |> classifier bagOfWords training
// Fraction of the pre-processed validation set that finalClassifier labels correctly.
validation
|> Seq.map (fun (label, text) -> if label = finalClassifier text then 1. else 0.)
|> Seq.average
(* ********************************************** | |
EPILOGUE... | |
*) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment