@kralo, created January 30, 2014
Example Implementation in Scala of Lloyd and Fuzzy-C-Means Clustering
/**
* Preface:
*
* This is an educational implementation of two algorithms for unsupervised learning.
* They may not follow every standard notation, but I hope they are illustrative.
* I am also not the most versatile Scala programmer, so there may well be a more
* idiomatic way to express some things.
*
* This example is not packaged, so you can run it easily via the Scala REPL:
* try it with  scala <this_filename>.scala
*
* Fuzzy-C-Means clustering: the termination criterion is the maximum entry in the
* difference of the partition matrices, which hold membership values between 0 and 1.
*
* K-Means clustering with iterative centroid refinement: the termination criterion is
* the maximum entry in the difference of the centroids between runs.
*
* Note on the example set: taking a pre-labeled dataset is not ideal for unsupervised
* learning, but for the assignment it did the job.
*
* Public domain. Provided without any warranty. Always happy if you link, fork, improve...
*/
import scala.math._
import types._
import vecHelper._
import algorithms._
object types {
type centroidHolder = Vector[Double] // your turn to choose what happens when your centroids are non-numeric...
type exampleHolder = Vector[Double] // and examples as well
type centroidsHolder = Vector[centroidHolder]
type examplesHolder = Vector[exampleHolder]
}
object vecHelper {
/**
* Mini-Footprint Lib to handle Vector operations
*/
def vecAdd(a: Vector[Double], b: Vector[Double]): Vector[Double] = {
if (a.length != b.length) throw new IllegalArgumentException("vecAdd: input Vectors must have the same length!")
(for (i <- a.indices) yield (a(i) + b(i))).toVector
}
def vecSubstr(a: Vector[Double], b: Vector[Double]): Vector[Double] = {
if (a.length != b.length) throw new IllegalArgumentException("vecSubstr: input Vectors must have the same length!")
(for (i <- a.indices) yield (a(i) - b(i))).toVector
}
def vecDist(a: Vector[Double], b: Vector[Double]): Double = {
if (a.length != b.length) throw new IllegalArgumentException("vecDist: input Vectors must have the same length!")
//computes the squared euclidean distance between n-dim vectors (no sqrt, which is fine for comparing distances)
vecSubstr(a, b).map(i => pow(i, 2)).sum
}
def multScalVect(scalar: Double, b: Vector[Double]): Vector[Double] = b.map(i => scalar * i)
//def divVectScal(b: Vector[Double], scalar: Double): Vector[Double] = multScalVect(1 / scalar, b)
def divVectScal(scalar: Double, b: Vector[Double]): Vector[Double] = multScalVect(1 / scalar, b)
}
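/*
 * A quick sanity check of the vector helpers (a sketch you can paste into the
 * Scala REPL; the expected results are noted next to each call):
 *   vecAdd(Vector(1.0, 2.0), Vector(3.0, 4.0))    // Vector(4.0, 6.0)
 *   vecSubstr(Vector(3.0, 4.0), Vector(1.0, 1.0)) // Vector(2.0, 3.0)
 *   vecDist(Vector(0.0, 0.0), Vector(3.0, 4.0))   // 25.0 (squared distance, no sqrt)
 *   divVectScal(2.0, Vector(2.0, 4.0))            // Vector(1.0, 2.0)
 */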
object algorithms {
/**
* Bundle all functions related to c-means-fuzzy-clustering
* following formulas in http://de.wikipedia.org/wiki/Fuzzy_C-Means
* and "Ejercicios resueltos de Visión por Computador, Pajares Martinsanz"
*/
//for each example, returns a tuple of the example vector and the id of its closest centroid
def findClosestCentroid(examples: examplesHolder, centroids: centroidsHolder): Vector[(Vector[Double], Int)] = {
for (
example <- examples;
//compute Distance to each Centroid and output "best" id
centrDists = (for (curCentrId <- centroids.indices) yield (vecDist(example, centroids(curCentrId)), curCentrId))
) yield (example, centrDists.reduceLeft((x, y) => if (x._1 < y._1) x else y)._2)
}
/**
* Calculate the sum (a simple cost function) of the distances of all examples to their closest centroid
*/
def sumClosestCentroidDists(examples: examplesHolder, centroids: centroidsHolder): Double = {
(for (
example <- examples;
//compute Distance to each Centroid and output "best" id
centrDists = (for (curCentrId <- centroids.indices) yield vecDist(example, centroids(curCentrId)))
) yield (centrDists.reduceLeft((x, y) => if (x < y) x else y))).sum
}
/**
* Membership-Function
* u_{ik}=\frac{1}{\sum_{j=1}^C \left(\frac{d_{ik}}{d_{jk}}\right)^\frac{2}{m-1}}
* where C is the number of centroids and m > 1 is the fuzzifier
*/
def calcMembershipFunc(example: exampleHolder, centroids: centroidsHolder, curCentr: Int, m: Double): Double = {
val d_ik = vecDist(centroids(curCentr), example)
return 1 / (for (j <- centroids) yield (pow(d_ik / vecDist(j, example), 2 / (m - 1)))).sum
}
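// Worked example (m = 2, so the exponent 2/(m-1) is 2): with two centroids and a
// point equidistant from both, every ratio d_ik/d_jk is 1, so
// u_ik = 1 / (1 + 1) = 0.5, i.e. equal membership in both clusters.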
/**
* returns a #centroids x #examples matrix with membership values between 0 and 1 for each example belonging to that cluster
*/
def compPartitionMatrix(centr: centroidsHolder, examples: examplesHolder, m: Double): Vector[Vector[Double]] = {
def partCompleteExampleSetToOneCentr(centr: centroidsHolder, centrId: Int, examples: examplesHolder, m: Double): Vector[Double] = {
for (example <- examples) yield (calcMembershipFunc(example, centr, centrId, m))
}
(for (i <- centr.indices) yield (partCompleteExampleSetToOneCentr(centr, i, examples, m))).toVector
}
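// Note (follows from the membership formula above): for every example the
// memberships over all centroids sum to 1, so each column of this matrix is a
// proper probability distribution over the clusters.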
/**
* Computes the maximum absolute difference between the partition matrices U_(t) and U_(t-1)
*/
def compPartitionMatDelta(oldCentr: centroidsHolder, newCentr: centroidsHolder, examples: examplesHolder, m: Double): Double = {
val oldU = compPartitionMatrix(oldCentr, examples, m)
val newU = compPartitionMatrix(newCentr, examples, m)
(for (row <- oldU.indices; column <- oldU(row).indices)
yield abs(oldU(row)(column) - newU(row)(column))) //absolute entry-wise difference, so decreases count as much as increases
.foldLeft(0.0)((x, y) => max(x, y)) //maximum over all entries
}
/**
* Computes the maximum absolute difference between the old and new centroids
*/
def compCentroidsDelta(oldCentr: centroidsHolder, newCentr: centroidsHolder): Double = {
(for (row <- oldCentr.indices; column <- oldCentr(row).indices)
yield abs(oldCentr(row)(column) - newCentr(row)(column))) //absolute entry-wise difference
.foldLeft(0.0)((x, y) => max(x, y)) //maximum over all entries
}
/**
* Updating the Centre of the selected Centroid
*
* v_i=\frac{\sum_{k=1}^N u_{ik}^m x_k}{\sum_{k=1}^N u_{ik}^m}
* where N is the number of examples, i.e. the new centroid is the mean of all examples weighted by u_ik^m
*/
def v_i(examples: examplesHolder, centroids: centroidsHolder, curCentr: Int, m: Double): Vector[Double] = {
//upper means upper part of the fraction
val upper = examples
.map(j => (j, pow(calcMembershipFunc(j, centroids, curCentr, m), m))) //generate tuple of x_k and u_ik^m
.map(j => multScalVect(j._2, j._1)) // multiply x_k by u_ik^m from above
.reduceLeft((a, b) => vecAdd(a, b)) //sum to make final vector
val lower = examples.map(j => pow(calcMembershipFunc(j, centroids, curCentr, m), m)).sum //sum of u_ik^m over all examples
return divVectScal(lower, upper)
}
def execute_fuzzyCMeans(examples: examplesHolder, initial_Centroids: centroidsHolder): centroidsHolder = {
//Der "fuzzifier" m(>1) bestimmt, wie scharf die Objekte den Clustern zugeordnet werden.
//Lässt man m gegen unendlich laufen, so nähern sich die u_{ik} dem Wert \tfrac{1}{C} an, d.h. die
//Zugehörigkeit der Punkte ist zu allen Clustern gleich groß.
val fuzzifier = 2 //paso exp. b
val finCritEpsilon = 0.01 //tolerance epsilon
val maxIter = 10;
var lastCentr = initial_Centroids
var newCentr = initial_Centroids
var lastcompFin = finCritEpsilon * 10
var iter = 1
println("This is the fuzzy-C-Means speaking.")
println("You set the fuzzifier to " + fuzzifier)
println("and finCritEpsilon to " + finCritEpsilon)
//println("U_init is" + compPartitionMatrix(newCentr, examples, fuzzifier))
while (!(iter > maxIter || lastcompFin < finCritEpsilon)) { //stop once either inner condition holds
println("--This is iter " + iter + " --")
newCentr = (for (i <- lastCentr.indices) yield (v_i(examples, lastCentr, i, fuzzifier))).toVector
println("Centroids are: " + newCentr)
//println("U_act is" + compPartitionMatrix(newCentr, examples, fuzzifier))
lastcompFin = compPartitionMatDelta(lastCentr, newCentr, examples, fuzzifier)
println("delta is " + lastcompFin)
lastCentr = newCentr
iter += 1
}
return lastCentr
}
/**Lloyd K-Means specific**/
/**
* Iterative Lloyd's approach. There may also be a version where you update the centroids in batches
*/
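// Sequential (online) update rule, as implemented in the loop below: whenever an
// example x is assigned to centroid c, move c a fraction of the way towards it:
//   c_new = c_old + razon_aprendiz * (x - c_old)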
def execute_lloydAlgPajares(examples: examplesHolder, initial_Centroids: centroidsHolder): centroidsHolder = {
val razon_aprendiz = 0.1 //learning rate
val finCritEpsilon = 10e-10 //tolerance epsilon
val maxIter = 10;
var lastCentr = initial_Centroids
var newCentr = initial_Centroids
var curFinCrit = finCritEpsilon * 10 //last computed finalization criterion
var iter = 1
println("This is the LLoyds by Pajares")
println("You set learning rate to " + razon_aprendiz)
println("and finCritEps to " + finCritEpsilon)
println("and maxiter to " + maxIter)
while (!(iter > maxIter || curFinCrit < finCritEpsilon)) { //stop once either inner condition holds
println("--This is iter " + iter + " --")
//for each example, move its closest centroid towards it:
//exampleCloseCentroid._1 is the example vector, ._2 the id of its closest centroid
for (exampleCloseCentroid <- findClosestCentroid(examples, newCentr)) {
newCentr = newCentr.updated(exampleCloseCentroid._2, vecAdd(newCentr(exampleCloseCentroid._2), multScalVect(razon_aprendiz, vecSubstr(exampleCloseCentroid._1, newCentr(exampleCloseCentroid._2)))))
}
curFinCrit = compCentroidsDelta(lastCentr, newCentr)
lastCentr = newCentr
println("Centroids are: " + lastCentr)
println("FinCrit is : " + curFinCrit)
iter += 1
}
return lastCentr
}
}
object haupt {
def main(args: Array[String]): Unit = {
val input = ""
//if you can, set absolute path here
//val input = System.getProperty("user.dir") + "/Iris2Clases.txt"
/*val source = scala.io.Source.fromFile(input) //build dataset, strip last String as it's the text where it belongs
val dataset = (for (
line <- source.getLines;
val rawdata = line.toString().split(",")
) yield (rawdata.dropRight(1).toVector.map(i => i.toDouble), rawdata(rawdata.length - 1))).toList.toIndexedSeq
source.close()*/
//workaround so no file is read
val source = Vector(Vector("5.1,3.5,1.4,0.2,Iris-setosa"), Vector("4.9,3.0,1.4,0.2,Iris-setosa"), Vector("4.7,3.2,1.3,0.2,Iris-setosa"), Vector("4.6,3.1,1.5,0.2,Iris-setosa"), Vector("5.0,3.6,1.4,0.2,Iris-setosa"), Vector("5.4,3.9,1.7,0.4,Iris-setosa"), Vector("4.6,3.4,1.4,0.3,Iris-setosa"), Vector("5.0,3.4,1.5,0.2,Iris-setosa"), Vector("4.4,2.9,1.4,0.2,Iris-setosa"), Vector("4.9,3.1,1.5,0.1,Iris-setosa"), Vector("5.4,3.7,1.5,0.2,Iris-setosa"), Vector("4.8,3.4,1.6,0.2,Iris-setosa"), Vector("4.8,3.0,1.4,0.1,Iris-setosa"), Vector("4.3,3.0,1.1,0.1,Iris-setosa"), Vector("5.8,4.0,1.2,0.2,Iris-setosa"), Vector("5.7,4.4,1.5,0.4,Iris-setosa"), Vector("5.4,3.9,1.3,0.4,Iris-setosa"), Vector("5.1,3.5,1.4,0.3,Iris-setosa"), Vector("5.7,3.8,1.7,0.3,Iris-setosa"), Vector("5.1,3.8,1.5,0.3,Iris-setosa"), Vector("5.4,3.4,1.7,0.2,Iris-setosa"), Vector("5.1,3.7,1.5,0.4,Iris-setosa"), Vector("4.6,3.6,1.0,0.2,Iris-setosa"), Vector("5.1,3.3,1.7,0.5,Iris-setosa"), Vector("4.8,3.4,1.9,0.2,Iris-setosa"), Vector("5.0,3.0,1.6,0.2,Iris-setosa"), Vector("5.0,3.4,1.6,0.4,Iris-setosa"), Vector("5.2,3.5,1.5,0.2,Iris-setosa"), Vector("5.2,3.4,1.4,0.2,Iris-setosa"), Vector("4.7,3.2,1.6,0.2,Iris-setosa"), Vector("4.8,3.1,1.6,0.2,Iris-setosa"), Vector("5.4,3.4,1.5,0.4,Iris-setosa"), Vector("5.2,4.1,1.5,0.1,Iris-setosa"), Vector("5.5,4.2,1.4,0.2,Iris-setosa"), Vector("4.9,3.1,1.5,0.1,Iris-setosa"), Vector("5.0,3.2,1.2,0.2,Iris-setosa"), Vector("5.5,3.5,1.3,0.2,Iris-setosa"), Vector("4.9,3.1,1.5,0.1,Iris-setosa"), Vector("4.4,3.0,1.3,0.2,Iris-setosa"), Vector("5.1,3.4,1.5,0.2,Iris-setosa"), Vector("5.0,3.5,1.3,0.3,Iris-setosa"), Vector("4.5,2.3,1.3,0.3,Iris-setosa"), Vector("4.4,3.2,1.3,0.2,Iris-setosa"), Vector("5.0,3.5,1.6,0.6,Iris-setosa"), Vector("5.1,3.8,1.9,0.4,Iris-setosa"), Vector("4.8,3.0,1.4,0.3,Iris-setosa"), Vector("5.1,3.8,1.6,0.2,Iris-setosa"), Vector("4.6,3.2,1.4,0.2,Iris-setosa"), Vector("5.3,3.7,1.5,0.2,Iris-setosa"), Vector("5.0,3.3,1.4,0.2,Iris-setosa"), Vector("7.0,3.2,4.7,1.4,Iris-versicolor"), Vector("6.4,3.2,4.5,1.5,Iris-versicolor"), Vector("6.9,3.1,4.9,1.5,Iris-versicolor"), Vector("5.5,2.3,4.0,1.3,Iris-versicolor"), Vector("6.5,2.8,4.6,1.5,Iris-versicolor"), Vector("5.7,2.8,4.5,1.3,Iris-versicolor"), Vector("6.3,3.3,4.7,1.6,Iris-versicolor"), Vector("4.9,2.4,3.3,1.0,Iris-versicolor"), Vector("6.6,2.9,4.6,1.3,Iris-versicolor"), Vector("5.2,2.7,3.9,1.4,Iris-versicolor"), Vector("5.0,2.0,3.5,1.0,Iris-versicolor"), Vector("5.9,3.0,4.2,1.5,Iris-versicolor"), Vector("6.0,2.2,4.0,1.0,Iris-versicolor"), Vector("6.1,2.9,4.7,1.4,Iris-versicolor"), Vector("5.6,2.9,3.6,1.3,Iris-versicolor"), Vector("6.7,3.1,4.4,1.4,Iris-versicolor"), Vector("5.6,3.0,4.5,1.5,Iris-versicolor"), Vector("5.8,2.7,4.1,1.0,Iris-versicolor"), Vector("6.2,2.2,4.5,1.5,Iris-versicolor"), Vector("5.6,2.5,3.9,1.1,Iris-versicolor"), Vector("5.9,3.2,4.8,1.8,Iris-versicolor"), Vector("6.1,2.8,4.0,1.3,Iris-versicolor"), Vector("6.3,2.5,4.9,1.5,Iris-versicolor"), Vector("6.1,2.8,4.7,1.2,Iris-versicolor"), Vector("6.4,2.9,4.3,1.3,Iris-versicolor"), Vector("6.6,3.0,4.4,1.4,Iris-versicolor"), Vector("6.8,2.8,4.8,1.4,Iris-versicolor"), Vector("6.7,3.0,5.0,1.7,Iris-versicolor"), Vector("6.0,2.9,4.5,1.5,Iris-versicolor"), Vector("5.7,2.6,3.5,1.0,Iris-versicolor"), Vector("5.5,2.4,3.8,1.1,Iris-versicolor"), Vector("5.5,2.4,3.7,1.0,Iris-versicolor"), Vector("5.8,2.7,3.9,1.2,Iris-versicolor"), Vector("6.0,2.7,5.1,1.6,Iris-versicolor"), Vector("5.4,3.0,4.5,1.5,Iris-versicolor"), Vector("6.0,3.4,4.5,1.6,Iris-versicolor"), 
Vector("6.7,3.1,4.7,1.5,Iris-versicolor"), Vector("6.3,2.3,4.4,1.3,Iris-versicolor"), Vector("5.6,3.0,4.1,1.3,Iris-versicolor"), Vector("5.5,2.5,4.0,1.3,Iris-versicolor"), Vector("5.5,2.6,4.4,1.2,Iris-versicolor"), Vector("6.1,3.0,4.6,1.4,Iris-versicolor"), Vector("5.8,2.6,4.0,1.2,Iris-versicolor"), Vector("5.0,2.3,3.3,1.0,Iris-versicolor"), Vector("5.6,2.7,4.2,1.3,Iris-versicolor"), Vector("5.7,3.0,4.2,1.2,Iris-versicolor"), Vector("5.7,2.9,4.2,1.3,Iris-versicolor"), Vector("6.2,2.9,4.3,1.3,Iris-versicolor"), Vector("5.1,2.5,3.0,1.1,Iris-versicolor"), Vector("5.7,2.8,4.1,1.3,Iris-versicolor"))
val dataset = (for (
i <- source.iterator;
rawdata = i.head.split(",") //split each line; the last field is the class label
) yield (rawdata.dropRight(1).toVector.map(i => i.toDouble), rawdata(rawdata.length - 1))).toList.toIndexedSeq
val examples: examplesHolder = (for (i <- dataset) yield (i._1)).toVector
val num_classes = 2
//centroids were provided. You may wish to initialize them randomly.
val initial_centroid: centroidsHolder = Vector(
Vector(4.6, 3.0, 4.0, 0.0),
Vector(6.8, 3.4, 4.6, 0.7))
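//A minimal sketch of random initialization, as suggested above (kept commented
//out so the fixed centroids keep the run reproducible): pick num_classes
//examples at random and use them as the starting centroids.
//val initial_centroid: centroidsHolder =
//  scala.util.Random.shuffle(examples).take(num_classes)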
var lastCentr: centroidsHolder = initial_centroid;
for (i <- Range(0, 2)) {
if (i == 0) //in each round execute one of them
lastCentr = execute_fuzzyCMeans(examples, initial_centroid)
else
lastCentr = execute_lloydAlgPajares(examples, initial_centroid)
//say class 0 is Iris-setosa and class 1 is Iris-versicolor
//Are the examples well classified?
println("")
val final_classific = findClosestCentroid(examples, lastCentr).toIndexedSeq
//builds a (1, 0) / (0, 1) tuple per example and sums them up to (#correct, #incorrect)
val compareRes = (for (i <- final_classific.indices) yield (if (final_classific(i)._2.formatted("%d")
.replaceFirst("0", "Iris-setosa")
.replaceFirst("1", "Iris-versicolor")
.compareTo(dataset(i)._2) == 0) (1, 0) else (0, 1))).reduceLeft((ex1, ex2) => (ex1._1 + ex2._1, ex1._2 + ex2._2))
println("correctly classified: " + compareRes._1 + ", wrong: " + compareRes._2)
println("total 'cost': " + sumClosestCentroidDists(examples, lastCentr))
println("Clasificar los 3 ejemplos proporcionados:")
val testIris01 = Vector(5.1, 3.5, 1.4, 0.2) //"Iris-Setosa"
println("Should yield to : 0")
println(findClosestCentroid(Vector(testIris01), lastCentr))
val testIris02 = Vector(6.9, 3.1, 4.9, 1.5) //"Iris-versicolor"
println("Should yield to : 1")
println(findClosestCentroid(Vector(testIris02), lastCentr))
val testIris03 = Vector(5.0, 3.4, 1.5, 0.2) //"Iris-setosa"
println("Should yield to : 0")
println(findClosestCentroid(Vector(testIris03), lastCentr))
println("---")
}
}
}
//as this will be executed as a script, we call main here
//remove if you want to compile it
haupt.main(Array.empty)