Skip to content

Instantly share code, notes, and snippets.

@bohdanszymanik
Last active December 28, 2015 04:29
Show Gist options
  • Save bohdanszymanik/7442499 to your computer and use it in GitHub Desktop.
Save bohdanszymanik/7442499 to your computer and use it in GitHub Desktop.
Kaggle numeric character recognition in F# using: 1. CsvFile typeprovider to get the data 2. vectors with cosine similarity to determine k nearest neighbours and 3. the ability to display characters with wpf using wpf Rectangles laid out onto a canvas. Based on the machine learning example from Mathias Brandewinder and the coding dojo here: http…
open System
#r @"../packages/FSharp.Data.1.1.10/lib/net40/FSharp.Data.dll"
#r @"c:\wd\MathDemo\packages\MathNet.Numerics.2.6.2\lib\net40\MathNet.Numerics.dll"
#r "../packages/MathNet.Numerics.FSharp.2.6.0/lib/net40/MathNet.Numerics.FSharp.dll"
open MathNet.Numerics.LinearAlgebra
open MathNet.Numerics.LinearAlgebra.Double
(* CsvProvider nice but you can't enumerate the row columns
//type trainingChars = FSharp.Data.CsvProvider<"http://brandewinder.blob.core.windows.net/public/trainingsample.csv">
type trainingChars = FSharp.Data.CsvProvider<"c://temp//trainingsample.csv">
//let tChars = trainingChars.Load("http://brandewinder.blob.core.windows.net/public/trainingsample.csv")
let tChars = trainingChars.Load(@"c:/temp/trainingsample.csv")
tChars.Data |> Seq.head
(tChars.Data |> Seq.head).label
(tChars.Data |> Seq.head).pixel0
*)
// For large files, such as these character files could be, the CsvFile reader seems more sensible than
// the CsvProvider - CsvFile represents each row as a string array and doesn't cache by default.
open FSharp.Data.Csv
open FSharp.Data.Csv.Extensions
//let tChars = CsvFile.Load("http://brandewinder.blob.core.windows.net/public/trainingsample.csv")
// Load the training sample from a local CSV file; its rows are read through rawTrainCases.Data.
let rawTrainCases = CsvFile.Load(@"c:/temp/trainingsample.csv")
(* some sample usage
(rawTrainCases.Data |> Seq.head).Columns
(rawTrainCases.Data |> Seq.head).GetColumn("pixel1")
(rawTrainCases.Data |> Seq.head).Columns.[0]
(rawTrainCases.Data |> Seq.head).["pixel1"].AsInteger()
*)
// define a type that represents each data point
/// A labelled data point whose pixel values are kept as a plain integer array.
type Number =
    { Label: int
      Pixels: int[] }
/// A labelled data point whose pixel values are held in a MathNet DenseVector.
type Number1 =
    { Label: int
      Pixels1: DenseVector }
/// Converts one raw CSV row into a Number1.
/// raw.[0] is parsed as the integer label; every remaining column is parsed as a
/// floating-point pixel value and packed into a DenseVector.
/// Throws if raw is empty or if a column cannot be parsed as a number.
let rawCaseToNumber1 (raw:string[]) =
    // Parse once, and with the invariant culture so the result does not depend on
    // the regional settings of the machine running the script.
    let pixels =
        raw.[1..]
        |> Array.map (fun (s:string) ->
            Convert.ToDouble(s, System.Globalization.CultureInfo.InvariantCulture))
    { Label = int raw.[0]; Pixels1 = DenseVector pixels }
/// Training rows converted to Number1 values. The sequence is cached so the
/// conversion is not repeated each time it is enumerated.
let trainCases1 =
    rawTrainCases.Data
    |> Seq.map (fun row -> rawCaseToNumber1 row.Columns)
    |> Seq.cache
// distance measures
// euclidean approach - skip the square root to avoid the extra computation
/// Squared difference of two integer values (no square root is taken).
let euclideanDistance (a:int) (b:int) =
    let delta = a - b
    delta * delta
// another approach - cosine similarity that nicely ranges from (-1 to) 0 to 1
/// Cosine similarity of two vectors: dot(x, y) / (|x| * |y|).
/// Returns 0.0 when either vector has zero magnitude; the ratio is undefined there
/// and the unguarded formula would evaluate to NaN (infinity * 0).
let cosineSimilarity (x:DenseVector) (y:DenseVector) =
    let xx = x.DotProduct(x)
    let yy = y.DotProduct(y)
    if xx = 0.0 || yy = 0.0 then 0.0
    else sqrt(1.0 / xx / yy) * x.DotProduct(y)

// sanity check: parallel vectors should give a similarity of (approximately) 1.0
cosineSimilarity (DenseVector [|1.0; 2.; 3.|]) (DenseVector [|2.0; 4.; 6.|])
// for any unknown record in test set, find similarity to all records in training set
// return n nearest neighbours
// Load the validation sample from a local CSV file; its rows are read through rawTestCases.Data.
let rawTestCases = CsvFile.Load(@"c:/temp/validationsample.csv")
/// Validation rows converted to Number1 values. The sequence is cached so the
/// conversion is not repeated each time it is enumerated.
let testCases1 =
    rawTestCases.Data
    |> Seq.map (fun row -> rawCaseToNumber1 row.Columns)
    |> Seq.cache
/// Finds the k known cases most similar to unknownCase.
/// Returns (label, cosine similarity) pairs ordered from most to least similar.
/// If fewer than k known cases are available, all of them are returned.
let findkNNCases k (knownCases:seq<Number1>) (unknownCase:Number1) =
    knownCases
    |> Seq.map (fun known -> (known.Label, cosineSimilarity known.Pixels1 unknownCase.Pixels1))
    // sort ascending by similarity, then reverse so the most similar come first
    |> Seq.sortBy snd
    |> List.ofSeq
    |> List.rev
    // Seq.truncate (unlike Seq.take) does not throw when fewer than k items exist
    |> Seq.truncate k
/// For every test case: the case itself, its 10 nearest training neighbours, and the
/// majority vote among them as (predicted label, vote count).
/// The result is cached because it is enumerated more than once in this script
/// (accuracy tally and failed-case display); uncached, each enumeration would
/// repeat the whole nearest-neighbour search.
let testOurTestCases =
    testCases1
    //|> Seq.take 10
    |> Seq.map (fun n ->
        let closestCases = findkNNCases 10 trainCases1 n
        // count neighbours per label and keep the label with the most votes
        let majorityVote =
            closestCases
            |> Seq.countBy fst
            |> Seq.maxBy snd
        (n, closestCases, majorityVote))
    |> Seq.cache
// how accurate were we?
// tally the predictions: key true = label matched, key false = label did not match
testOurTestCases
|> Seq.countBy (fun (actual, _, (predictedLabel, _)) -> actual.Label = predictedLabel)
// turns out to be 94% accurate: 471/500 predicted correctly, 29 incorrectly
// what do the failed test cases look like?
#r "WindowsBase"
#r "PresentationCore"
#r "PresentationFramework"
#r "System.Xaml"
open System
open System.Windows
open System.Windows.Controls
open System.Windows.Shapes
open System.Windows.Media
open System.Xaml
/// Opens a topmost 280 x 308 WPF window with the given title and paints every value
/// of someCase.Pixels1 as a 10 x 10 rectangle on a canvas, 28 rectangles per row,
/// using the value as the blue channel of the fill colour.
let drawPixels (someCase:Number1) title =
    let cellSize = 10
    let cellsPerRow = 28
    let window = new Window(Topmost=true, Width=280., Height=308.)
    window.Title <- title
    window.Show()
    let canvas = new Canvas()
    window.Content <- canvas
    // place one rectangle per pixel; index determines its column and row
    let addCell index (value:float) =
        let blue = new SolidColorBrush(Color.FromRgb(0uy, 0uy, byte value))
        let cell = new Rectangle(Width=float cellSize, Height=float cellSize, Fill=blue)
        canvas.Children.Add(cell) |> ignore
        Canvas.SetLeft(cell, float ((index % cellsPerRow) * cellSize))
        Canvas.SetTop(cell, float ((index / cellsPerRow) * cellSize))
    someCase.Pixels1 |> Seq.iteri addCell
// draw up the first of our cases
testCases1
|> Seq.head
|> fun firstCase -> drawPixels firstCase "first case"
// draw up our failed cases
// Show up to five misclassified cases, titling each window with the true label.
// Seq.truncate is used instead of Seq.take so that fewer than five failures
// does not raise an exception.
testOurTestCases
|> Seq.filter (fun (n, _, (predicted, _)) -> n.Label <> predicted)
|> Seq.truncate 5
|> Seq.iter (fun (n, _, _) -> drawPixels n (Convert.ToString(n.Label)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment