Gradient Boosting exploration
@mathias-brandewinder (last active December 7, 2016)
// blog post: brandewinder.com/2016/08/06/gradient-boosting-part-1
// https://en.wikipedia.org/wiki/Gradient_boosting#Algorithm
(*
Exploring the dataset
*)
#I "./packages/"
#r "fsharp.data/lib/net40/fsharp.data.dll"
open FSharp.Data
#r "xplot.googlecharts/lib/net45/xplot.googlecharts.dll"
#r "google.datatable.net.wrapper/lib/google.datatable.net.wrapper.dll"
open XPlot.GoogleCharts
type Wine = CsvProvider<"data/winequality-red.csv",";",InferRows=1500>
let reds = Wine.Load("data/winequality-red.csv")
type Observation = Wine.Row
type Feature = Observation -> float
let ``Alcohol Level`` : Feature =
    fun obs -> obs.Alcohol |> float
let ``Volatile Acidity`` : Feature =
    fun obs -> obs.``Volatile acidity`` |> float
let ``Fixed Acidity`` : Feature =
    fun obs -> obs.``Fixed acidity`` |> float
let options = Configuration.Options()
options.dataOpacity <- 0.25
options.pointSize <- 10
reds.Rows
|> Seq.map (fun obs -> ``Alcohol Level`` obs, obs.Quality)
|> Chart.Scatter
|> Chart.WithOptions options
|> Chart.WithTitle "Alcohol Level vs. Quality"
|> Chart.WithXTitle "Alcohol Level"
|> Chart.WithYTitle "Quality"
|> Chart.Show
reds.Rows
|> Seq.map (fun obs -> ``Volatile Acidity`` obs, obs.Quality)
|> Chart.Scatter
|> Chart.WithOptions options
|> Chart.WithTitle "Volatile Acidity vs. Quality"
|> Chart.WithXTitle "Volatile Acidity"
|> Chart.WithYTitle "Quality"
|> Chart.Show
reds.Rows
|> Seq.map (fun obs -> ``Fixed Acidity`` obs, obs.Quality)
|> Chart.Scatter
|> Chart.WithOptions options
|> Chart.WithTitle "Fixed Acidity vs. Quality"
|> Chart.WithXTitle "Fixed Acidity"
|> Chart.WithYTitle "Quality"
|> Chart.Show
(*
Stumps
*)
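// a stump is the simplest predictor beyond a constant: it splits the
// sample on a single feature threshold, and predicts the average label
// on each side of the split.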
type Example = Observation * float
type Predictor = Observation -> float
let learnStump (sample:Example seq) (feature:Feature) threshold =
    let under =
        sample
        |> Seq.filter (fun (obs,lbl) -> feature obs <= threshold)
        |> Seq.averageBy (fun (obs,lbl) -> lbl)
    let over =
        sample
        |> Seq.filter (fun (obs,lbl) -> feature obs > threshold)
        |> Seq.averageBy (fun (obs,lbl) -> lbl)
    fun obs ->
        if (feature obs <= threshold)
        then under
        else over
let redSample =
    reds.Rows
    |> Seq.map (fun row -> row, row.Quality |> float)
let testStump = learnStump redSample ``Alcohol Level`` 11.0
let predicted =
    redSample
    |> Seq.map (fun (obs,value) -> (``Alcohol Level`` obs, obs |> testStump))
predicted
|> Seq.sortBy fst
|> Chart.Line
|> Chart.WithTitle "Alcohol Level vs. Quality"
|> Chart.WithXTitle "Alcohol Level"
|> Chart.WithYTitle "Quality"
|> Chart.Show
(*
Picking the best stump
Two issues to solve: how to measure the quality of a stump,
and which candidate splits to consider
*)
let sumOfSquares (sample:Example seq) predictor =
    sample
    |> Seq.sumBy (fun (obs,lbl) ->
        pown (lbl - predictor obs) 2)
sumOfSquares redSample testStump
let evenSplits (sample:Example seq) (feature:Feature) (n:int) =
    let values = sample |> Seq.map (fst >> feature)
    let min = values |> Seq.min
    let max = values |> Seq.max
    let width = (max - min) / (float (n + 1))
    [ min + width .. width .. max - width ]
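// e.g. evenSplits redSample ``Alcohol Level`` 3 yields 3 candidate
// thresholds, evenly spaced strictly between the min and max values
// (the endpoints are excluded, since they would split off nothing).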
let alcoholSplits = evenSplits redSample ``Alcohol Level`` 10
let bestStump =
    alcoholSplits
    |> List.map (learnStump redSample ``Alcohol Level``)
    |> List.minBy (sumOfSquares redSample)
sumOfSquares redSample bestStump
redSample
|> Seq.map (fun (obs,value) -> (``Alcohol Level`` obs, obs |> bestStump))
|> Seq.sortBy fst
|> Chart.Line
|> Chart.WithTitle "Alcohol Level vs. Quality"
|> Chart.WithXTitle "Alcohol Level"
|> Chart.WithYTitle "Quality"
|> Chart.Show
(*
Analyzing the residuals
*)
redSample
|> Seq.map (fun (obs,lbl) -> ``Alcohol Level`` obs, lbl - (obs |> bestStump))
|> Chart.Scatter
|> Chart.WithOptions options
|> Chart.WithTitle "Alcohol Level vs. Residuals"
|> Chart.WithXTitle "Alcohol Level"
|> Chart.WithYTitle "Residuals"
|> Chart.Show
// alternate chart, aggregating observations
// with the same alcohol level
redSample
|> Seq.map (fun (obs,lbl) -> ``Alcohol Level`` obs, lbl - (obs |> bestStump))
|> Seq.groupBy fst
|> Seq.map (fun (x,group) ->
    x,
    group
    |> Seq.map snd
    |> Seq.average)
|> Chart.Scatter
|> Chart.WithOptions options
|> Chart.WithTitle "Alcohol Level vs. Residuals"
|> Chart.WithXTitle "Alcohol Level"
|> Chart.WithYTitle "Residuals"
|> Chart.Show
(*
Fitting another stump on the residuals
*)
let residualsSample =
    redSample
    |> Seq.map (fun (obs,lbl) -> obs, lbl - (obs |> bestStump))
let residualsStump =
    alcoholSplits
    |> List.map (learnStump residualsSample ``Alcohol Level``)
    // pick the stump that best fits the residuals
    |> List.minBy (sumOfSquares residualsSample)
let combined = fun obs -> bestStump obs + residualsStump obs
sumOfSquares redSample combined
redSample
|> Seq.map (fun (obs,value) -> (``Alcohol Level`` obs, obs |> combined))
|> Seq.sortBy fst
|> Chart.Line
|> Chart.WithTitle "Alcohol Level vs. Quality"
|> Chart.WithXTitle "Alcohol Level"
|> Chart.WithYTitle "Quality"
|> Chart.Show
// residuals
redSample
|> Seq.map (fun (obs,lbl) -> ``Alcohol Level`` obs, lbl - (obs |> combined))
|> Chart.Scatter
|> Chart.WithOptions options
|> Chart.WithTitle "Alcohol Level vs. Residuals"
|> Chart.WithXTitle "Alcohol Level"
|> Chart.WithYTitle "Residuals"
|> Chart.Show
(*
Iteratively adding stumps
*)
let learn (sample:Example seq) (feature:Feature) (depth:int) =
    let splits = evenSplits sample feature 10
    let rec next iterationsLeft predictor =
        // we have reached depth 0: we are done
        if iterationsLeft = 0
        then predictor
        else
            // compute new residuals
            let newSample =
                sample
                |> Seq.map (fun (obs,y) -> obs, y - predictor obs)
            // learn possible stumps against residuals,
            // and pick the one with smallest error
            let newStump =
                splits
                |> Seq.map (learnStump newSample feature)
                |> Seq.minBy (sumOfSquares newSample)
            // create new predictor
            let newPredictor = fun obs -> predictor obs + newStump obs
            // ... and keep going
            next (iterationsLeft - 1) newPredictor
    // initialize with a predictor that
    // predicts the average sample value
    let baseValue = sample |> Seq.map snd |> Seq.average
    let basePredictor = fun (obs:Observation) -> baseValue
    next depth basePredictor
let model = learn redSample ``Alcohol Level`` 10
sumOfSquares redSample model
redSample
|> Seq.map (fun (obs,value) -> (``Alcohol Level`` obs, obs |> model))
|> Seq.sortBy fst
|> Chart.Line
|> Chart.WithTitle "Alcohol Level vs. Quality"
|> Chart.WithXTitle "Alcohol Level"
|> Chart.WithYTitle "Quality"
|> Chart.Show
// increasing depth
[ 1 .. 15 ]
|> Seq.map (fun depth -> depth, learn redSample ``Alcohol Level`` depth)
|> Seq.map (fun (depth,model) -> depth, sumOfSquares redSample model)
|> Chart.Column
|> Chart.Show
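// note: the error above is measured on the training sample itself, so
// adding more stumps can only drive it down; the over-fitting analysis
// in the next script uses a train/test split to get an honest picture.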
// blog post: brandewinder.com/2016/08/14/gradient-boosting-part-2
// https://en.wikipedia.org/wiki/Gradient_boosting#Algorithm
(*
Exploring the dataset
*)
#I "./packages/"
#r "fsharp.data/lib/net40/fsharp.data.dll"
open FSharp.Data
#r "xplot.googlecharts/lib/net45/xplot.googlecharts.dll"
#r "google.datatable.net.wrapper/lib/google.datatable.net.wrapper.dll"
open XPlot.GoogleCharts
type Wine = CsvProvider<"data/winequality-red.csv",";",InferRows=1500>
let reds = Wine.Load("data/winequality-red.csv")
type Observation = Wine.Row
type Feature = Observation -> float
let ``Alcohol Level`` : Feature =
    fun obs -> obs.Alcohol |> float
let ``Chlorides`` : Feature =
    fun obs -> obs.Chlorides |> float
let ``Citric Acid`` : Feature =
    fun obs -> obs.``Citric acid`` |> float
let ``Density`` : Feature =
    fun obs -> obs.Density |> float
let ``Fixed Acidity`` : Feature =
    fun obs -> obs.``Fixed acidity`` |> float
let ``Free Sulfur Dioxide`` : Feature =
    fun obs -> obs.``Free sulfur dioxide`` |> float
let ``PH`` : Feature =
    fun obs -> obs.PH |> float
let ``Residual Sugar`` : Feature =
    fun obs -> obs.``Residual sugar`` |> float
let ``Total Sulfur Dioxide`` : Feature =
    fun obs -> obs.``Total sulfur dioxide`` |> float
let ``Volatile Acidity`` : Feature =
    fun obs -> obs.``Volatile acidity`` |> float
(*
Trees
*)
type Example = Observation * float
type Predictor = Observation -> float
type Tree =
    | Leaf of float
    | Branch of (Feature * float) * Tree * Tree
let exampleTree =
    Branch(
        (``Alcohol Level``, 10.5),
        Branch(
            (``Volatile Acidity``, 0.8),
            Leaf(6.0),
            Leaf(3.0)
        ),
        Leaf(5.5)
    )
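// reads as: if alcohol <= 10.5, split again on volatile acidity
// (<= 0.8 predicts a quality of 6.0, otherwise 3.0);
// if alcohol > 10.5, predict 5.5.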
let rec predict (tree:Tree) (obs:Observation) =
    match tree with
    | Leaf(prediction) -> prediction
    | Branch((feature,split),under,over) ->
        let featureValue = feature obs
        if featureValue <= split
        then predict under obs
        else predict over obs
predict exampleTree (reds.Rows |> Seq.head)
let examplePredictor = predict exampleTree
let sumOfSquares (sample:Example seq) predictor =
    sample
    |> Seq.sumBy (fun (obs,lbl) ->
        pown (lbl - predictor obs) 2)
let redSample =
    reds.Rows
    |> Seq.map (fun row -> row, row.Quality |> float)
sumOfSquares redSample examplePredictor
(*
Learning a Tree
*)
let learnStump (sample:Example seq) (feature:Feature) threshold =
    let under =
        sample
        |> Seq.filter (fun (obs,lbl) -> feature obs <= threshold)
        |> Seq.averageBy (fun (obs,lbl) -> lbl)
    let over =
        sample
        |> Seq.filter (fun (obs,lbl) -> feature obs > threshold)
        |> Seq.averageBy (fun (obs,lbl) -> lbl)
    fun obs ->
        if (feature obs <= threshold)
        then under
        else over
let evenSplits (sample:Example seq) (feature:Feature) (n:int) =
    let values = sample |> Seq.map (fst >> feature)
    let min = values |> Seq.min
    let max = values |> Seq.max
    let width = (max - min) / (float (n + 1))
    [ min + width .. width .. max - width ]
let rec draftLearnTree (sample:Example seq) (features:Feature list) (depth:int) =
    if depth = 0
    then
        let avg = sample |> Seq.averageBy snd
        Leaf(avg)
    else
        let (bestFeature,bestSplit) =
            // create all feature * split combinations
            seq {
                for feature in features do
                    let splits = evenSplits sample feature 10
                    for split in splits -> feature,split
            }
            // find the split with the smallest error
            |> Seq.minBy (fun (feature,split) ->
                let predictor = learnStump sample feature split
                sumOfSquares sample predictor)
        let under =
            sample
            |> Seq.filter (fun (obs,_) ->
                bestFeature obs <= bestSplit)
        let over =
            sample
            |> Seq.filter (fun (obs,_) ->
                bestFeature obs > bestSplit)
        let underTree = draftLearnTree under features (depth - 1)
        let overTree = draftLearnTree over features (depth - 1)
        Branch((bestFeature,bestSplit),underTree,overTree)
// replicate the original stump
let originalStump = draftLearnTree redSample [ ``Alcohol Level`` ] 1
sumOfSquares redSample (predict originalStump)
let deeperTree = draftLearnTree redSample [``Alcohol Level``;``Volatile Acidity``] 4
sumOfSquares redSample (predict deeperTree)
// problem!
let explodingTree = draftLearnTree redSample [``Alcohol Level``] 5
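// with a single feature and enough depth, a branch eventually holds
// observations that all share the same feature value: min = max, the
// split width collapses to zero, and the range expression in evenSplits
// throws. The rewrite below guards against this (evenSplitter returns []
// when min = max) and only keeps splits that strictly improve the cost.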
(*
Cleaning things up
*)
let underOver (sample:Example seq) (feat:Feature,split:float) =
    let under = sample |> Seq.filter (fun (obs,_) -> feat obs <= split)
    let over = sample |> Seq.filter (fun (obs,_) -> feat obs > split)
    under,over
type Splitter = Example seq -> Feature -> float list
type Cost = Example seq -> float
let rec learnTree (splitter:Splitter,cost:Cost) (sample:Example seq) (features:Feature list) (depth:int) =
    if depth = 0
    then
        let avg = sample |> Seq.averageBy snd
        Leaf(avg)
    else
        let initialCost = cost sample
        let candidates =
            // build up all the feature/split candidates,
            // and their associated sample splits
            seq {
                for feature in features do
                    let splits = splitter sample feature
                    for split in splits ->
                        let under,over = underOver sample (feature,split)
                        (feature,split),(under,over)
            }
            // compute and append cost of split
            |> Seq.map (fun (candidate,(under,over)) ->
                candidate,(under,over), cost under + cost over)
            // retain only candidates with strict cost improvement
            |> Seq.filter (fun (candidate,(under,over),splitCost) ->
                splitCost < initialCost)
        if (Seq.isEmpty candidates)
        then
            let avg = sample |> Seq.averageBy snd
            Leaf(avg)
        else
            let ((bestFeature,bestSplit),(under,over),splitCost) =
                candidates
                |> Seq.minBy (fun (_,_,splitCost) -> splitCost)
            let underTree = learnTree (splitter,cost) under features (depth - 1)
            let overTree = learnTree (splitter,cost) over features (depth - 1)
            Branch((bestFeature,bestSplit),underTree,overTree)
let evenSplitter n (sample:Example seq) (feature:Feature) =
    let values = sample |> Seq.map (fst >> feature)
    let min = values |> Seq.min
    let max = values |> Seq.max
    if min = max
    then []
    else
        let width = (max - min) / (float (n + 1))
        [ min + width .. width .. max - width ]
let sumOfSquaresCost (sample:Example seq) =
    let avg = sample |> Seq.averageBy snd
    sample |> Seq.sumBy (fun (_,lbl) -> pown (lbl - avg) 2)
// alternate cost specification
let manhattanCost (sample:Example seq) =
    let avg = sample |> Seq.averageBy snd
    sample |> Seq.sumBy (fun (_,lbl) -> abs (lbl - avg))
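// sketch (not in the original script): manhattanCost plugs into learnTree
// exactly like sumOfSquaresCost, which is the point of taking the cost
// as a parameter; absolute deviations penalize outliers less heavily.
let manhattanTree = learnTree (evenSplitter 10,manhattanCost) redSample [``Alcohol Level``;``Volatile Acidity``] 5
sumOfSquares redSample (predict manhattanTree)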
let stableTree = learnTree (evenSplitter 10,sumOfSquaresCost) redSample [``Alcohol Level``;``Volatile Acidity``] 10
sumOfSquares redSample (predict stableTree)
// we include every feature available
let features = [
    ``Alcohol Level``
    ``Chlorides``
    ``Citric Acid``
    ``Density``
    ``Fixed Acidity``
    ``Free Sulfur Dioxide``
    ``PH``
    ``Residual Sugar``
    ``Total Sulfur Dioxide``
    ``Volatile Acidity``
]
let fullTree = learnTree (evenSplitter 5,sumOfSquaresCost) redSample features 10
sumOfSquares redSample (predict fullTree)
// plotting actual vs. predicted values
let options = Configuration.Options()
options.dataOpacity <- 0.25
options.pointSize <- 10
redSample
|> Seq.map (fun (obs,lbl) -> lbl, predict fullTree obs)
|> Chart.Scatter
|> Chart.WithOptions options
|> Chart.WithTitle "Wine Quality: Actual vs. Predicted"
|> Chart.WithXTitle "Actual"
|> Chart.WithYTitle "Predicted"
|> Chart.Show
(*
Over-fitting?
*)
// we split the sample in halves
let sampleSize = redSample |> Seq.length
let training = redSample |> Seq.take (sampleSize/2)
let testing = redSample |> Seq.skip (sampleSize/2)
// careful - this takes a bit of time :)
let trees =
    [ for depth in 1 .. 10 ->
        // for increasing depth
        depth,
        // we train a tree on the training sample
        learnTree (evenSplitter 10,sumOfSquaresCost) training features depth ]
// we evaluate errors, on the training and the testing samples
let trainingError = trees |> List.map (fun (d,tree) -> d, sumOfSquares training (predict tree))
let testingError = trees |> List.map (fun (d,tree) -> d, sumOfSquares testing (predict tree))
[ trainingError; testingError ]
|> Chart.Line
|> Chart.WithLabels ["Train"; "Test"]
|> Chart.WithTitle "Over-Fitting Analysis"
|> Chart.WithXTitle "Depth"
|> Chart.WithYTitle "Error"
|> Chart.Show
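// expected pattern: training error keeps falling as depth grows, while
// testing error flattens or rises once the trees start fitting noise:
// the signature of over-fitting.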
// blog post: brandewinder.com/2016/09/03/gradient-boosting-part-3
(*
Dependencies
*)
#I "./packages/"
#r "fsharp.data/lib/net40/fsharp.data.dll"
open FSharp.Data
#r "xplot.googlecharts/lib/net45/xplot.googlecharts.dll"
#r "google.datatable.net.wrapper/lib/google.datatable.net.wrapper.dll"
open XPlot.GoogleCharts
#r "fsalg/lib/fsalg.dll"
#r "diffsharp/lib/diffsharp.dll"
open DiffSharp.Numerical
let scatterOptions = Configuration.Options()
scatterOptions.dataOpacity <- 0.25
scatterOptions.pointSize <- 10
scatterOptions.hAxis <- Axis(minValue = 0, maxValue = 10)
scatterOptions.vAxis <- Axis(minValue = 0, maxValue = 10)
(*
Declaring our core types and importing the data.
*)
type Wine = CsvProvider<"data/winequality-red.csv",";",InferRows=1500>
type Observation = Wine.Row
type Feature = Observation -> float
type Example = Observation * float
type Predictor = Observation -> float
let redSample =
    Wine.Load("data/winequality-red.csv").Rows
    |> Seq.map (fun row -> row, row.Quality |> float)
(*
Creating features for that dataset
*)
let ``Alcohol Level`` : Feature =
    fun obs -> obs.Alcohol |> float
let ``Chlorides`` : Feature =
    fun obs -> obs.Chlorides |> float
let ``Citric Acid`` : Feature =
    fun obs -> obs.``Citric acid`` |> float
let ``Density`` : Feature =
    fun obs -> obs.Density |> float
let ``Fixed Acidity`` : Feature =
    fun obs -> obs.``Fixed acidity`` |> float
let ``Free Sulfur Dioxide`` : Feature =
    fun obs -> obs.``Free sulfur dioxide`` |> float
let ``PH`` : Feature =
    fun obs -> obs.PH |> float
let ``Residual Sugar`` : Feature =
    fun obs -> obs.``Residual sugar`` |> float
let ``Total Sulfur Dioxide`` : Feature =
    fun obs -> obs.``Total sulfur dioxide`` |> float
let ``Volatile Acidity`` : Feature =
    fun obs -> obs.``Volatile acidity`` |> float
let features = [
    ``Alcohol Level``
    ``Chlorides``
    ``Citric Acid``
    ``Density``
    ``Fixed Acidity``
    ``Free Sulfur Dioxide``
    ``PH``
    ``Residual Sugar``
    ``Total Sulfur Dioxide``
    ``Volatile Acidity``
]
(*
Basic regression tree implementation
*)
type Tree =
    | Leaf of float
    | Branch of (Feature * float) * Tree * Tree
let rec predict (tree:Tree) (obs:Observation) =
    match tree with
    | Leaf(prediction) -> prediction
    | Branch((feature,split),under,over) ->
        let featureValue = feature obs
        if featureValue <= split
        then predict under obs
        else predict over obs
let underOver (sample:Example seq) (feat:Feature,split:float) =
    let under = sample |> Seq.filter (fun (obs,_) -> feat obs <= split)
    let over = sample |> Seq.filter (fun (obs,_) -> feat obs > split)
    under,over
type Splitter = Example seq -> Feature -> float list
type Cost = Example seq -> float
let rec learnTree (splitter:Splitter,cost:Cost) (sample:Example seq) (features:Feature list) (depth:int) =
    if depth = 0
    then
        let avg = sample |> Seq.averageBy snd
        Leaf(avg)
    else
        let initialCost = cost sample
        let candidates =
            // build up all the feature/split candidates,
            // and their associated sample splits
            seq {
                for feature in features do
                    let splits = splitter sample feature
                    for split in splits ->
                        let under,over = underOver sample (feature,split)
                        (feature,split),(under,over)
            }
            // compute and append the size-weighted cost of the split
            |> Seq.map (fun (candidate,(under,over)) ->
                let underSize = under |> Seq.length |> float
                let overSize = over |> Seq.length |> float
                let size = underSize + overSize
                let weightedCost = (underSize / size) * (cost under) + (overSize / size) * (cost over)
                candidate,(under,over), weightedCost)
            // retain only candidates with strict cost improvement
            |> Seq.filter (fun (candidate,(under,over),splitCost) ->
                splitCost < initialCost)
        if (Seq.isEmpty candidates)
        then
            let avg = sample |> Seq.averageBy snd
            Leaf(avg)
        else
            let ((bestFeature,bestSplit),(under,over),splitCost) =
                candidates
                |> Seq.minBy (fun (_,_,splitCost) -> splitCost)
            let underTree = learnTree (splitter,cost) under features (depth - 1)
            let overTree = learnTree (splitter,cost) over features (depth - 1)
            Branch((bestFeature,bestSplit),underTree,overTree)
let evenSplitter n (sample:Example seq) (feature:Feature) =
    let values = sample |> Seq.map (fst >> feature)
    let min = values |> Seq.min
    let max = values |> Seq.max
    if min = max
    then []
    else
        let width = (max - min) / (float (n + 1))
        [ min + width .. width .. max - width ]
let sumOfSquaresCost (sample:Example seq) =
    let avg = sample |> Seq.averageBy snd
    sample |> Seq.sumBy (fun (_,lbl) -> pown (lbl - avg) 2)
let fullTree = learnTree (evenSplitter 5,sumOfSquaresCost) redSample features 3
let averageSquareError (sample:Example seq) predictor =
    sample
    |> Seq.averageBy (fun (obs,lbl) ->
        pown (lbl - predictor obs) 2)
averageSquareError redSample (predict fullTree)
redSample
|> Seq.map (fun (obs,lbl) -> lbl, (predict fullTree) obs)
|> Chart.Scatter
|> Chart.WithOptions scatterOptions
|> Chart.WithTitle "Wine Quality: Actual vs. Predicted (Tree)"
|> Chart.WithXTitle "Actual"
|> Chart.WithYTitle "Predicted"
|> Chart.Show
(*
Gradient Boosting
*)
type Learner = Example seq -> Predictor
let learn (sample:Example seq) (learner:Learner) (depth:int) =
    let rec next iterationsLeft predictor =
        // we have reached depth 0: we are done
        if iterationsLeft = 0
        then predictor
        else
            // compute new residuals,
            let newSample =
                sample
                |> Seq.map (fun (obs,y) -> obs, y - predictor obs)
            // learn a predictor against residuals,
            let residualsPredictor = learner newSample
            // create new predictor
            let newPredictor =
                fun obs -> predictor obs + residualsPredictor obs
            // ... and keep going
            next (iterationsLeft - 1) newPredictor
    // initialize with a predictor that
    // predicts the average sample value
    let baseValue = sample |> Seq.map snd |> Seq.average
    let basePredictor = fun (obs:Observation) -> baseValue
    next depth basePredictor
let treeLearner (sample:Example seq) =
    learnTree (evenSplitter 5,sumOfSquaresCost) sample features 3
    |> predict
// evaluate boosting at different depth
[ 1 .. 5 ]
|> List.map (fun depth ->
    let model = learn redSample treeLearner depth
    depth, averageSquareError redSample model)
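// sketch (not in the original script): any function of type
// Example seq -> Predictor can serve as a Learner. For instance,
// boosting depth-1 trees (stumps) instead of depth-3 trees:
let stumpLearner : Learner =
    fun sample ->
        learnTree (evenSplitter 10,sumOfSquaresCost) sample [ ``Alcohol Level`` ] 1
        |> predict
let stumpModel = learn redSample stumpLearner 5
averageSquareError redSample stumpModel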
(*
True Gradient Boosting, using pseudo-residuals
*)
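// instead of fitting each new predictor to the raw residuals y - F(x),
// gradient boosting fits it to the pseudo-residuals: the derivative of
// the loss, evaluated at the current residuals. For the square loss the
// two coincide, but the same machinery then works for any smooth loss.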
type Loss = float -> float
let draftBoostedLearn (sample:Example seq) (learner:Learner) (loss:Loss) (depth:int) =
    let pseudoResiduals = diff loss
    let rec next iterationsLeft predictor =
        // we have reached depth 0: we are done
        if iterationsLeft = 0
        then predictor
        else
            // compute new residuals,
            let newSample =
                sample
                |> Seq.map (fun (obs,y) ->
                    obs,
                    pseudoResiduals (y - predictor obs))
            // learn a tree against residuals,
            let residualsPredictor = learner newSample
            // create new predictor
            let newPredictor =
                fun obs ->
                    predictor obs + residualsPredictor obs
            // ... and keep going
            next (iterationsLeft - 1) newPredictor
    // initialize with a predictor that
    // predicts the average sample value
    let baseValue = sample |> Seq.map snd |> Seq.average
    let basePredictor = fun (obs:Observation) -> baseValue
    next depth basePredictor
// we should have the same results as before
let squareLoss : Loss = fun x -> 0.5 * pown x 2
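// the derivative of 0.5 * x^2 is x, so for the square loss the
// pseudo-residuals are exactly the plain residuals, and draftBoostedLearn
// should reproduce the previous results.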
[ 1 .. 5 ]
|> List.map (fun depth ->
    let model = draftBoostedLearn redSample treeLearner squareLoss depth
    depth, averageSquareError redSample model)
// illustration: differentiating the square loss function
// does produce the residuals.
let diffSquareLoss = diff squareLoss
[ -5.0 .. 0.1 .. 5.0 ]
|> List.map (fun x -> x, diffSquareLoss x)
|> Chart.Line
|> Chart.Show
(*
Optimal combination of predictors
*)
let combination f1 f2 gamma : Predictor =
    fun obs -> f1 obs + gamma * f2 obs
let gradientDescent f x0 eta epsilon =
    let rec desc x =
        let g = diff f x
        if abs g < epsilon
        then x
        else
            printfn "%.3f" x
            desc (x - eta * g)
    desc x0
// illustration: minimize x^2, starting from 10.0;
// the descent should converge close to 0.0
let foo x = pown x 2
let min_foo = gradientDescent foo 10. 0.1 0.0001
let optimalGamma (sample:Example seq) f1 f2 (loss:Loss) =
    let combine gamma = combination f1 f2 gamma
    let costOf gamma =
        sample
        |> Seq.sumBy (fun (obs,y) ->
            combine gamma obs - y |> loss)
    gradientDescent costOf 1.0 0.001 0.01
let boostedLearn (sample:Example seq) (learner:Learner) (loss:Loss) (depth:int) =
    let pseudoResiduals = diff loss
    let rec next iterationsLeft predictor =
        // we have reached depth 0: we are done
        if iterationsLeft = 0
        then predictor
        else
            // compute new residuals,
            let newSample =
                sample
                |> Seq.map (fun (obs,y) ->
                    obs,
                    pseudoResiduals (y - predictor obs))
            // learn a tree against residuals,
            let residualsPredictor = learner newSample
            // find optimal gamma
            let gamma = optimalGamma sample predictor residualsPredictor loss
            // create new predictor
            let newPredictor =
                fun obs ->
                    predictor obs + gamma * residualsPredictor obs
            // ... and keep going
            next (iterationsLeft - 1) newPredictor
    // initialize with a predictor that
    // predicts the average sample value
    let baseValue = sample |> Seq.map snd |> Seq.average
    let basePredictor = fun (obs:Observation) -> baseValue
    next depth basePredictor
[ 1 .. 5 ]
|> List.map (fun depth ->
    let model = boostedLearn redSample treeLearner squareLoss depth
    depth, averageSquareError redSample model)
let ssrPredictor = boostedLearn redSample treeLearner squareLoss 5
redSample
|> Seq.map (fun (obs,lbl) -> lbl, ssrPredictor obs)
|> Chart.Scatter
|> Chart.WithOptions scatterOptions
|> Chart.WithTitle "Wine Quality: Actual vs. Predicted (SSR)"
|> Chart.WithXTitle "Actual"
|> Chart.WithYTitle "Predicted"
|> Chart.Show
(*
Using a more complex Loss function, the Huber Loss
*)
// https://en.wikipedia.org/wiki/Huber_loss#Definition
let huber delta x =
    if abs x <= delta
    then 0.5 * pown x 2
    else delta * (abs x - 0.5 * delta)
[ -5.0 .. 0.1 .. 5.0 ]
|> List.map (fun x -> x, huber 1.0 x)
|> Chart.Line
|> Chart.Show
// illustration: differentiating the Huber loss produces
// clipped pseudo-residuals, flat beyond +/- delta.
let diffHuber = diff (huber 1.0)
[ -5.0 .. 0.1 .. 5.0 ]
|> List.map (fun x -> x, diffHuber x)
|> Chart.Line
|> Chart.Show
[ 1 .. 5 ]
|> List.map (fun depth ->
    let model = boostedLearn redSample treeLearner (huber 1.0) depth
    depth, averageSquareError redSample model)
let huberPredictor = boostedLearn redSample treeLearner (huber 1.0) 5
redSample
|> Seq.map (fun (obs,lbl) -> lbl, huberPredictor obs)
|> Chart.Scatter
|> Chart.WithOptions scatterOptions
|> Chart.WithTitle "Wine Quality: Actual vs. Predicted (Huber 1.0)"
|> Chart.WithXTitle "Actual"
|> Chart.WithYTitle "Predicted"
|> Chart.Show
@tianke0711
How to download the file winequality-red.csv?
