Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Recursive minimal entropy partitioning, based on Fayyad & Irani: break a continuous variable into discrete intervals top-down, maximizing the entropy gained at each step, with a stopping rule using the Minimum Description Length principle.
namespace Discretization
// Recursive minimal entropy partitioning,
// based on Fayyad & Irani 1993.
// See the following article, section 3.3,
// for a description of the algorithm:
// http://www.math.unipd.it/~dulli/corso04/disc.pdf
// Note: this can certainly be optimized.
module MDL =
open System
// Logarithm of n in base b
let logb n b = log n / log b
let entropy (data: (_ * _) seq) =
let N = data |> Seq.length |> (float)
data
|> Seq.countBy snd
|> Seq.sumBy (fun (_,count) ->
let p = (float)count/N
- p * log p)
// A Block of data to be split, with its
// relevant characteristics (size, number
// of classes, entropy)
type Block (data: (float * int) []) =
let s = data |> Array.length |> (float)
let classes = data |> Array.map snd |> Set.ofArray |> Set.count
let k = classes |> (float)
let h = entropy (data)
member this.Data = data
member this.Classes = classes
member this.S = s
member this.K = k
member this.H = h
// Entropy gained by splitting "original" block
// into 2 blocks "left" and "right"
let private entropyGain (original:Block) (left:Block) (right:Block) =
original.H -
((left.S / original.S) * left.H + (right.S / original.S) * right.H)
// Minimum entropy gain required
// for a split of the "original" block
// into 2 blocks "left" and "right"
let private minGain (original:Block) (left:Block) (right:Block) =
let delta =
logb (pown 3. original.Classes - 2.) 2. -
(original.K * original.H - left.K * left.H - right.K * right.H)
((logb (original.S - 1.) 2.) / original.S) + (delta / original.S)
// Identify the best acceptable value
// to split a block of data
let split (data:Block) =
// Candidate values to use as split
// We remove the smallest, because
// by definition no value is smaller
let candidates =
data.Data
|> Array.map fst
|> Seq.distinct
|> Seq.sort
|> Seq.toList
|> List.tail
let walls = seq {
for value in candidates do
// Split the data into 2 groups,
// below/above the value
let g1, g2 =
data.Data
|> Array.partition (fun (v,c) -> v < value)
let block1 = Block(g1)
let block2 = Block(g2)
let gain = entropyGain data block1 block2
let threshold = minGain data block1 block2
// if minimum threshold is met,
// the value is an acceptable candidate
if gain >= threshold
then yield (value, gain, block1, block2) }
if (Seq.isEmpty walls) then None
else
// Return the split value that
// yields the best entropy gain
walls
|> Seq.maxBy (fun (value, gain, b1, b2) -> gain)
|> Some
// Top-down recursive partition of a data block,
// accumulating the partitioning values into
// a list of "walls" (splitting values)
let partition (data:Block) =
let rec recursiveSplit (walls:float list) (data:Block) =
match (split data) with
| None -> walls // no split found
| Some(value, gain, b1, b2) ->
// append new split value
let walls = value::walls
// Search for new splits in first group
let walls = recursiveSplit walls b1
// Search for new splits in second group
recursiveSplit walls b2
// and go search!
recursiveSplit [] data |> List.sort
#load "MDL.fs"
open System
open Discretization.MDL
let rng = System.Random()
let tests = [
// one single class
"Single",
[| for i in 1..100 -> (rng.NextDouble(), 0) |];
// class 0 from 0 to 1, class 1 from 1 to 2
"Separate",
[| for i in 1..100 -> (rng.NextDouble(), 0)
for i in 1..100 -> (rng.NextDouble() + 1.0, 1) |];
// overlapping classes
"Mixture",
[| for i in 1..100 -> (rng.NextDouble(), rng.Next(0,2))
for i in 1..100 -> (rng.NextDouble() + 0.5, rng.Next(1,3))
for i in 1..100 -> (rng.NextDouble() + 1.0, rng.Next(2,4))
for i in 1..100 -> (rng.NextDouble() + 1.5, rng.Next(3,5)) |];
"Alternating",
[| for i in 0 .. 100 -> ((float)i, if i % 2 = 0 then 0 else 1) |]; ]
tests |> List.iter (fun (title, testData) ->
printfn "Sample: %s" title
let data = Block(testData)
let result = partition data
printfn "[ %s ]" (String.Join(", ", result)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment