Created
January 30, 2014 22:31
-
-
Save kralo/8721440 to your computer and use it in GitHub Desktop.
Example Implementation in Scala of Lloyd and Fuzzy-C-Means Clustering
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/** | |
* Preface: | |
* | |
* this is an educational implementation of two algorithms for unsupervised learning. | |
* They may not follow every standard notation but I hope they are illustrative. | |
* I am neither the most versatile scala programmer, so there may always be a way to express | |
* things in a more idiomatic way. | |
* | |
* This example is not packaged so that you can run it easily via the scala REPL. | |
* try with scala <this_filename>.scala | |
* | |
* Fuzzy-C-Means Clustering: Abortion criteria is the maximum in the difference of | |
* the partition matrix which holds probabilities between 0-1 | |
* | |
* K-Menas Clustering with iterative Centroid refinement, abortion criteria is the maximum | |
* in the difference of the centroids between run. | |
* | |
* Nota on the example set: this is not ideal for unsupervised learning, that I took a pre-labeled dataset, | |
* but for the assignment, it did the job | |
* | |
* public domain.Provided without any warranty always happy if you link, forge, improve... | |
*/ | |
import scala.math._ | |
import types._ | |
import vecHelper._ | |
import vecHelper._ | |
import algorithms._ | |
object types { | |
type centroidHolder = Vector[Double] // your turn to chose what happens when your centroids are non-numeric... | |
type exampleHolder = Vector[Double] // and examples as well | |
type centroidsHolder = Vector[centroidHolder] | |
type examplesHolder = Vector[exampleHolder] | |
} | |
object vecHelper { | |
/** | |
* Mini-Footprint Lib to handle Vector operations | |
*/ | |
def vecAdd(a: Vector[Double], b: Vector[Double]): Vector[Double] = { | |
if (a.length != b.length) throw new IllegalArgumentException("vecAdd: input Vectors must have the same length!") | |
(for (i <- a.indices) yield (a(i) + b(i))).toVector | |
} | |
def vecSubstr(a: Vector[Double], b: Vector[Double]): Vector[Double] = { | |
if (a.length != b.length) throw new IllegalArgumentException("vecSubstr: input Vectors must have the same length!") | |
(for (i <- a.indices) yield (a(i) - b(i))).toVector | |
} | |
def vecDist(a: Vector[Double], b: Vector[Double]): Double = { | |
if (a.length != b.length) throw new IllegalArgumentException("vecDist: input Vectors must have the same length!") | |
//computes euclidean distance between n-dim vectors | |
vecSubstr(a, b).map(i => pow(i, 2)).sum | |
} | |
def multScalVect(scalar: Double, b: Vector[Double]): Vector[Double] = b.map(i => scalar * i) | |
//def divVectScal(b: Vector[Double], scalar: Double): Vector[Double] = multScalVect(1 / scalar, b) | |
def divVectScal(scalar: Double, b: Vector[Double]): Vector[Double] = multScalVect(1 / scalar, b) | |
} | |
object algorithms { | |
/** | |
* Bundle all functions related to c-means-fuzzy-clustering | |
* following formulas in http://de.wikipedia.org/wiki/Fuzzy_C-Means | |
* and "Ejercicios resueltos de Visión por Computador, Pajares Martinsanz" | |
*/ | |
//returns the distance and ids of the vectors which are the nearest | |
def findClosestCentroid(examples: examplesHolder, centroids: centroidsHolder): Vector[(Vector[Double], Int)] = { | |
for ( | |
example <- examples; | |
//compute Distance to each Centroid and output "best" id | |
centrDists = (for (curCentrId <- centroids.indices) yield (vecDist(example, centroids(curCentrId)), curCentrId)) | |
) yield (example, centrDists.reduceLeft((x, y) => if (x._1 < y._1) x else y)._2) | |
} | |
/** | |
* Calculate Sum (like a cost-function) of all examples to their closest centroid | |
*/ | |
def sumClosestCentroidDists(examples: examplesHolder, centroids: centroidsHolder): Double = { | |
(for ( | |
example <- examples; | |
//compute Distance to each Centroid and output "best" id | |
centrDists = (for (curCentrId <- centroids.indices) yield vecDist(example, centroids(curCentrId))) | |
) yield (centrDists.reduceLeft((x, y) => if (x < y) x else y))).sum | |
} | |
/** | |
* Membership-Function | |
* u_{ik}=\frac{1}{\sum_{j=1}^C \left(\frac{d_{ik}}{d_{jk}}\right)^\frac{2}{m-1}} | |
* whereas C is #centroids | |
*/ | |
def calcMembershipFunc(example: exampleHolder, centroids: centroidsHolder, curCentr: Int, m: Double): Double = { | |
val d_ik = vecDist(centroids(curCentr), example) | |
return 1 / (for (j <- centroids) yield (pow(d_ik / vecDist(j, example), 2 / (m - 1)))).sum | |
} | |
/** | |
* returns a #centr x #examples Matrix with values 0-1 (prob.) for example to belong in that cluster | |
*/ | |
def compPartitionMatrix(centr: centroidsHolder, examples: examplesHolder, m: Double): Vector[Vector[Double]] = { | |
def partCompleteExampleSetToOneCentr(centr: centroidsHolder, centrId: Int, examples: examplesHolder, m: Double): Vector[Double] = { | |
for (example <- examples) yield (calcMembershipFunc(example, centr, centrId, m)) | |
} | |
(for (i <- centr.indices) yield (partCompleteExampleSetToOneCentr(centr, i, examples, m))).toVector | |
} | |
/** | |
* Computes the maximum Value in the Difference of the Partition Matrizes U_(t) and U_(t-1) | |
*/ | |
def compPartitionMatDelta(oldCentr: centroidsHolder, newCentr: centroidsHolder, examples: examplesHolder, m: Double): Double = { | |
val oldU = compPartitionMatrix(oldCentr, examples, m) | |
val newU = compPartitionMatrix(newCentr, examples, m) | |
(for (row <- oldU.indices) yield ( | |
(for (column <- oldU(row).indices) yield ( | |
oldU(row)(column) - newU(row)(column))) | |
.foldLeft(0.0)((x, y) => max(x, y)))) //find max in row | |
.foldLeft(0.0)((x, y) => max(x, y)) //in columns | |
} | |
/** | |
* Computes the maximum Value in the Difference of centroids | |
*/ | |
def compCentroidsDelta(oldCentr: centroidsHolder, newCentr: centroidsHolder): Double = { | |
(for (row <- oldCentr.indices) yield ( | |
(for (column <- oldCentr(row).indices) yield ( | |
oldCentr(row)(column) - newCentr(row)(column))) | |
.foldLeft(0.0)((x, y) => max(x, y)))) //find max in row | |
.foldLeft(0.0)((x, y) => max(x, y)) //in columns | |
} | |
/** | |
* Updating the Centre of the selected Centroid | |
* | |
* v_i=\frac{\sum_{k=1}^N u_{ik}^m x_k}{\sum_{k=1}^N u_{ik}^m} | |
* whereas N is the #examples | |
*/ | |
def v_i(examples: examplesHolder, centroids: centroidsHolder, curCentr: Int, m: Double): Vector[Double] = { | |
//upper means upper part of the fraction | |
val upper = examples | |
.map(j => (j, pow(calcMembershipFunc(j, centroids, curCentr, m), m))) //generate Tuple from x_j and u_ik ^2 | |
.map(j => multScalVect(j._2, j._1)) // multipy x_j and u_ik^2 from above | |
.reduceLeft((a, b) => vecAdd(a, b)) //sum to make final vector | |
val lower = examples.map(j => pow(calcMembershipFunc(j, centroids, curCentr, m), m)).sum //compute u_ik for each x_j | |
return divVectScal(lower, upper) | |
} | |
def execute_fuzzyCMeans(examples: examplesHolder, initial_Centroids: centroidsHolder): centroidsHolder = { | |
//Der "fuzzifier" m(>1) bestimmt, wie scharf die Objekte den Clustern zugeordnet werden. | |
//Lässt man m gegen unendlich laufen, so nähern sich die u_{ik} dem Wert \tfrac{1}{C} an, d.h. die | |
//Zugehörigkeit der Punkte ist zu allen Clustern gleich groß. | |
val fuzzifier = 2 //paso exp. b | |
val finCritEpsilon = 0.01 //tolerance epsilon | |
val maxIter = 10; | |
var lastCentr = initial_Centroids | |
var newCentr = initial_Centroids | |
var lastcompFin = finCritEpsilon * 10 | |
var iter = 1 | |
println("This is the fuzzy-C-Means speaking.") | |
println("You set the fuzzifier to " + fuzzifier) | |
println("and finCritEpsilon to " + finCritEpsilon) | |
//println("U_init is" + compPartitionMatrix(newCentr, examples, fuzzifier)) | |
while (!(iter > maxIter || lastcompFin < finCritEpsilon)) { //if inner bracket holds true, then abort | |
println("--This is iter " + iter + " --") | |
newCentr = (for (i <- lastCentr.indices) yield (v_i(examples, lastCentr, i, fuzzifier))).toVector | |
println("Centroids are: " + newCentr) | |
//println("U_act is" + compPartitionMatrix(newCentr, examples, fuzzifier)) | |
lastcompFin = compPartitionMatDelta(lastCentr, newCentr, examples, fuzzifier) | |
println("delta is " + lastcompFin) | |
lastCentr = newCentr | |
iter += 1 | |
} | |
return lastCentr | |
} | |
/**Lloyd K-Means specific**/ | |
/** | |
* Iterative LLoyds approach. There may also be a version, where you update the Centroids in batches | |
*/ | |
def execute_lloydAlgPajares(examples: examplesHolder, initial_Centroids: centroidsHolder): centroidsHolder = { | |
val razon_aprendiz = 0.1 | |
val finCritEpsilon = 10e-10 //tolerance epsilon | |
val maxIter = 10; | |
var lastCentr = initial_Centroids | |
var newCentr = initial_Centroids | |
var curFinCrit = finCritEpsilon * 10 //last computed finalization criterion | |
var iter = 1 | |
println("This is the LLoyds by Pajares") | |
println("You set learning rate to " + razon_aprendiz) | |
println("and finCritEps to " + finCritEpsilon) | |
println("and maxiter to " + maxIter) | |
while (!(iter > maxIter || curFinCrit < finCritEpsilon)) { //if inner bracket holds true, then abort | |
println("--This is iter " + iter + " --") | |
//update centroid with id f._2 | |
//in f._1 will be the example vector iterando por cada centroide | |
for (exampleCloseCentroid <- findClosestCentroid(examples, newCentr)) { | |
newCentr = newCentr.updated(exampleCloseCentroid._2, vecAdd(newCentr(exampleCloseCentroid._2), multScalVect(razon_aprendiz, vecSubstr(exampleCloseCentroid._1, newCentr(exampleCloseCentroid._2))))) | |
} | |
curFinCrit = compCentroidsDelta(lastCentr, newCentr) | |
lastCentr = newCentr | |
println("Centroids are: " + lastCentr) | |
println("FinCrit is : " + curFinCrit) | |
iter += 1 | |
} | |
return lastCentr | |
} | |
} | |
object haupt { | |
def main(args: Array[String]): Unit = { | |
val input = "" | |
//if you can, set absolute path here | |
//val input = System.getProperty("user.dir") + "/Iris2Clases.txt" | |
/*val source = scala.io.Source.fromFile(input) //build dataset, strip last String as it's the text where it belongs | |
val dataset = (for ( | |
line <- source.getLines; | |
val rawdata = line.toString().split(",") | |
) yield (rawdata.dropRight(1).toVector.map(i => i.toDouble), rawdata(rawdata.length - 1))).toList.toIndexedSeq | |
source.close()*/ | |
//workaround so no file is read | |
val source = Vector(Vector("5.1,3.5,1.4,0.2,Iris-setosa"), Vector("4.9,3.0,1.4,0.2,Iris-setosa"), Vector("4.7,3.2,1.3,0.2,Iris-setosa"), Vector("4.6,3.1,1.5,0.2,Iris-setosa"), Vector("5.0,3.6,1.4,0.2,Iris-setosa"), Vector("5.4,3.9,1.7,0.4,Iris-setosa"), Vector("4.6,3.4,1.4,0.3,Iris-setosa"), Vector("5.0,3.4,1.5,0.2,Iris-setosa"), Vector("4.4,2.9,1.4,0.2,Iris-setosa"), Vector("4.9,3.1,1.5,0.1,Iris-setosa"), Vector("5.4,3.7,1.5,0.2,Iris-setosa"), Vector("4.8,3.4,1.6,0.2,Iris-setosa"), Vector("4.8,3.0,1.4,0.1,Iris-setosa"), Vector("4.3,3.0,1.1,0.1,Iris-setosa"), Vector("5.8,4.0,1.2,0.2,Iris-setosa"), Vector("5.7,4.4,1.5,0.4,Iris-setosa"), Vector("5.4,3.9,1.3,0.4,Iris-setosa"), Vector("5.1,3.5,1.4,0.3,Iris-setosa"), Vector("5.7,3.8,1.7,0.3,Iris-setosa"), Vector("5.1,3.8,1.5,0.3,Iris-setosa"), Vector("5.4,3.4,1.7,0.2,Iris-setosa"), Vector("5.1,3.7,1.5,0.4,Iris-setosa"), Vector("4.6,3.6,1.0,0.2,Iris-setosa"), Vector("5.1,3.3,1.7,0.5,Iris-setosa"), Vector("4.8,3.4,1.9,0.2,Iris-setosa"), Vector("5.0,3.0,1.6,0.2,Iris-setosa"), Vector("5.0,3.4,1.6,0.4,Iris-setosa"), Vector("5.2,3.5,1.5,0.2,Iris-setosa"), Vector("5.2,3.4,1.4,0.2,Iris-setosa"), Vector("4.7,3.2,1.6,0.2,Iris-setosa"), Vector("4.8,3.1,1.6,0.2,Iris-setosa"), Vector("5.4,3.4,1.5,0.4,Iris-setosa"), Vector("5.2,4.1,1.5,0.1,Iris-setosa"), Vector("5.5,4.2,1.4,0.2,Iris-setosa"), Vector("4.9,3.1,1.5,0.1,Iris-setosa"), Vector("5.0,3.2,1.2,0.2,Iris-setosa"), Vector("5.5,3.5,1.3,0.2,Iris-setosa"), Vector("4.9,3.1,1.5,0.1,Iris-setosa"), Vector("4.4,3.0,1.3,0.2,Iris-setosa"), Vector("5.1,3.4,1.5,0.2,Iris-setosa"), Vector("5.0,3.5,1.3,0.3,Iris-setosa"), Vector("4.5,2.3,1.3,0.3,Iris-setosa"), Vector("4.4,3.2,1.3,0.2,Iris-setosa"), Vector("5.0,3.5,1.6,0.6,Iris-setosa"), Vector("5.1,3.8,1.9,0.4,Iris-setosa"), Vector("4.8,3.0,1.4,0.3,Iris-setosa"), Vector("5.1,3.8,1.6,0.2,Iris-setosa"), Vector("4.6,3.2,1.4,0.2,Iris-setosa"), Vector("5.3,3.7,1.5,0.2,Iris-setosa"), Vector("5.0,3.3,1.4,0.2,Iris-setosa"), Vector("7.0,3.2,4.7,1.4,Iris-versicolor"), Vector("6.4,3.2,4.5,1.5,Iris-versicolor"), Vector("6.9,3.1,4.9,1.5,Iris-versicolor"), Vector("5.5,2.3,4.0,1.3,Iris-versicolor"), Vector("6.5,2.8,4.6,1.5,Iris-versicolor"), Vector("5.7,2.8,4.5,1.3,Iris-versicolor"), Vector("6.3,3.3,4.7,1.6,Iris-versicolor"), Vector("4.9,2.4,3.3,1.0,Iris-versicolor"), Vector("6.6,2.9,4.6,1.3,Iris-versicolor"), Vector("5.2,2.7,3.9,1.4,Iris-versicolor"), Vector("5.0,2.0,3.5,1.0,Iris-versicolor"), Vector("5.9,3.0,4.2,1.5,Iris-versicolor"), Vector("6.0,2.2,4.0,1.0,Iris-versicolor"), Vector("6.1,2.9,4.7,1.4,Iris-versicolor"), Vector("5.6,2.9,3.6,1.3,Iris-versicolor"), Vector("6.7,3.1,4.4,1.4,Iris-versicolor"), Vector("5.6,3.0,4.5,1.5,Iris-versicolor"), Vector("5.8,2.7,4.1,1.0,Iris-versicolor"), Vector("6.2,2.2,4.5,1.5,Iris-versicolor"), Vector("5.6,2.5,3.9,1.1,Iris-versicolor"), Vector("5.9,3.2,4.8,1.8,Iris-versicolor"), Vector("6.1,2.8,4.0,1.3,Iris-versicolor"), Vector("6.3,2.5,4.9,1.5,Iris-versicolor"), Vector("6.1,2.8,4.7,1.2,Iris-versicolor"), Vector("6.4,2.9,4.3,1.3,Iris-versicolor"), Vector("6.6,3.0,4.4,1.4,Iris-versicolor"), Vector("6.8,2.8,4.8,1.4,Iris-versicolor"), Vector("6.7,3.0,5.0,1.7,Iris-versicolor"), Vector("6.0,2.9,4.5,1.5,Iris-versicolor"), Vector("5.7,2.6,3.5,1.0,Iris-versicolor"), Vector("5.5,2.4,3.8,1.1,Iris-versicolor"), Vector("5.5,2.4,3.7,1.0,Iris-versicolor"), Vector("5.8,2.7,3.9,1.2,Iris-versicolor"), Vector("6.0,2.7,5.1,1.6,Iris-versicolor"), Vector("5.4,3.0,4.5,1.5,Iris-versicolor"), Vector("6.0,3.4,4.5,1.6,Iris-versicolor"), Vector("6.7,3.1,4.7,1.5,Iris-versicolor"), Vector("6.3,2.3,4.4,1.3,Iris-versicolor"), Vector("5.6,3.0,4.1,1.3,Iris-versicolor"), Vector("5.5,2.5,4.0,1.3,Iris-versicolor"), Vector("5.5,2.6,4.4,1.2,Iris-versicolor"), Vector("6.1,3.0,4.6,1.4,Iris-versicolor"), Vector("5.8,2.6,4.0,1.2,Iris-versicolor"), Vector("5.0,2.3,3.3,1.0,Iris-versicolor"), Vector("5.6,2.7,4.2,1.3,Iris-versicolor"), Vector("5.7,3.0,4.2,1.2,Iris-versicolor"), Vector("5.7,2.9,4.2,1.3,Iris-versicolor"), Vector("6.2,2.9,4.3,1.3,Iris-versicolor"), Vector("5.1,2.5,3.0,1.1,Iris-versicolor"), Vector("5.7,2.8,4.1,1.3,Iris-versicolor")) | |
val dataset = (for ( | |
i <- source.iterator; | |
val rawdata = i.head.toString().split(",") | |
) yield (rawdata.dropRight(1).toVector.map(i => i.toDouble), rawdata(rawdata.length - 1))).toList.toIndexedSeq | |
val examples: examplesHolder = (for (i <- dataset) yield (i._1)).toVector | |
val num_classes = 2 | |
//centroids were provided. You may wish to initialize them randomly. | |
val initial_centroid: centroidsHolder = Vector( | |
Vector(4.6, 3.0, 4.0, 0.0), | |
Vector(6.8, 3.4, 4.6, 0.7)) | |
var lastCentr: centroidsHolder = initial_centroid; | |
for (i <- Range(0, 2)) { | |
if (i == 0) //in each round execute one of them | |
lastCentr = execute_fuzzyCMeans(examples, initial_centroid) | |
else | |
lastCentr = execute_lloydAlgPajares(examples, initial_centroid) | |
//say, class 0 is Iris-setosa and class1 is iris-versicolor | |
//Are the examples well classified? | |
println("") | |
val final_classific = findClosestCentroid(examples, lastCentr).toIndexedSeq | |
//builds a list of tuples (#correct, #incorrect) -ly classified | |
val compareRes = (for (i <- final_classific.indices) yield (if (final_classific(i)._2.formatted("%d") | |
.replaceFirst("0", "Iris-setosa") | |
.replaceFirst("1", "Iris-versicolor") | |
.compareTo(dataset(i)._2) == 0) (1, 0) else (0, 1))).reduceLeft((ex1, ex2) => (ex1._1 + ex2._1, ex1._2 + ex2._2)) | |
println("correctly classified: " + compareRes._1 + ", wrong: " + compareRes._2) | |
println("total 'cost': " + sumClosestCentroidDists(examples, lastCentr)) | |
println("Clasificar los 3 ejemplos proporcionados:") | |
val testIris01 = Vector(5.1, 3.5, 1.4, 0.2) //"Iris-Setosa" | |
println("Should yield to : 0") | |
println(findClosestCentroid(Vector(testIris01), lastCentr)) | |
val testIris02 = Vector(6.9, 3.1, 4.9, 1.5) //"Iris-versicolor" | |
println("Should yield to : 1") | |
println(findClosestCentroid(Vector(testIris02), lastCentr)) | |
val testIris03 = Vector(5.0, 3.4, 1.5, 0.2) //"Iris-setosa" | |
println("Should yield to : 0") | |
println(findClosestCentroid(Vector(testIris03), lastCentr)) | |
println("---") | |
} | |
} | |
} | |
//as it will be executed as script, we call the main here | |
//remove if you want to compile it | |
haupt.main(null) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment