@kralo, created January 30, 2014
Example Implementation in Scala of Lloyd and Fuzzy-C-Means Clustering
/**
* Preface:
*
* This is an educational implementation of two algorithms for unsupervised learning.
* They may not follow every standard notation, but I hope they are illustrative.
* I am also not the most versatile Scala programmer, so there may well be a more
* idiomatic way to express some things.
*
* This example is not packaged, so you can run it easily via the Scala REPL:
* try it with  scala <this_filename>.scala
*
* Fuzzy-C-Means clustering: the termination criterion is the maximum entry in the
* difference of the partition matrices, which hold membership values between 0 and 1.
*
* K-Means clustering with iterative centroid refinement: the termination criterion is
* the maximum entry in the difference of the centroids between runs.
*
* Note on the example set: taking a pre-labeled dataset is not ideal for unsupervised
* learning, but for the assignment it did the job.
*
* Public domain. Provided without any warranty. Always happy if you link, fork, improve...
*/
import scala.math._
import types._
import vecHelper._
import algorithms._
object types {
type centroidHolder = Vector[Double] // your turn to choose what happens when your centroids are non-numeric...
type exampleHolder = Vector[Double] // and examples as well
type centroidsHolder = Vector[centroidHolder]
type examplesHolder = Vector[exampleHolder]
}
object vecHelper {
/**
* Mini-Footprint Lib to handle Vector operations
*/
def vecAdd(a: Vector[Double], b: Vector[Double]): Vector[Double] = {
if (a.length != b.length) throw new IllegalArgumentException("vecAdd: input Vectors must have the same length!")
(for (i <- a.indices) yield (a(i) + b(i))).toVector
}
def vecSubstr(a: Vector[Double], b: Vector[Double]): Vector[Double] = {
if (a.length != b.length) throw new IllegalArgumentException("vecSubstr: input Vectors must have the same length!")
(for (i <- a.indices) yield (a(i) - b(i))).toVector
}
def vecDist(a: Vector[Double], b: Vector[Double]): Double = {
if (a.length != b.length) throw new IllegalArgumentException("vecDist: input Vectors must have the same length!")
//computes the squared euclidean distance between n-dim vectors (no sqrt, which is fine for comparing distances)
vecSubstr(a, b).map(i => pow(i, 2)).sum
}
def multScalVect(scalar: Double, b: Vector[Double]): Vector[Double] = b.map(i => scalar * i)
//def divVectScal(b: Vector[Double], scalar: Double): Vector[Double] = multScalVect(1 / scalar, b)
def divVectScal(scalar: Double, b: Vector[Double]): Vector[Double] = multScalVect(1 / scalar, b)
}
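/*
 * A quick sanity check of the vector helpers (a sketch you can paste into the
 * Scala REPL; the expected results are noted next to each call):
 *   vecAdd(Vector(1.0, 2.0), Vector(3.0, 4.0))    // Vector(4.0, 6.0)
 *   vecSubstr(Vector(3.0, 4.0), Vector(1.0, 1.0)) // Vector(2.0, 3.0)
 *   vecDist(Vector(0.0, 0.0), Vector(3.0, 4.0))   // 25.0 (squared distance, no sqrt)
 *   divVectScal(2.0, Vector(2.0, 4.0))            // Vector(1.0, 2.0)
 */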
object algorithms {
/**
* Bundle all functions related to c-means-fuzzy-clustering
* following formulas in http://de.wikipedia.org/wiki/Fuzzy_C-Means
* and "Ejercicios resueltos de Visión por Computador, Pajares Martinsanz"
*/
//for each example, returns a tuple of the example vector and the id of its closest centroid
def findClosestCentroid(examples: examplesHolder, centroids: centroidsHolder): Vector[(Vector[Double], Int)] = {
for (
example <- examples;
//compute Distance to each Centroid and output "best" id
centrDists = (for (curCentrId <- centroids.indices) yield (vecDist(example, centroids(curCentrId)), curCentrId))
) yield (example, centrDists.reduceLeft((x, y) => if (x._1 < y._1) x else y)._2)
}
/**
* Calculate the sum (a simple cost function) of the distances of all examples to their closest centroid
*/
def sumClosestCentroidDists(examples: examplesHolder, centroids: centroidsHolder): Double = {
(for (
example <- examples;
//compute Distance to each Centroid and output "best" id
centrDists = (for (curCentrId <- centroids.indices) yield vecDist(example, centroids(curCentrId)))
) yield (centrDists.reduceLeft((x, y) => if (x < y) x else y))).sum
}
/**
* Membership-Function
* u_{ik}=\frac{1}{\sum_{j=1}^C \left(\frac{d_{ik}}{d_{jk}}\right)^\frac{2}{m-1}}
* where C is the number of centroids and m > 1 is the fuzzifier
*/
def calcMembershipFunc(example: exampleHolder, centroids: centroidsHolder, curCentr: Int, m: Double): Double = {
val d_ik = vecDist(centroids(curCentr), example)
return 1 / (for (j <- centroids) yield (pow(d_ik / vecDist(j, example), 2 / (m - 1)))).sum
}
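// Worked example (m = 2, so the exponent 2/(m-1) is 2): with two centroids and a
// point equidistant from both, every ratio d_ik/d_jk is 1, so
// u_ik = 1 / (1 + 1) = 0.5, i.e. equal membership in both clusters.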
/**
* returns a #centroids x #examples matrix with membership values between 0 and 1 for each example belonging to that cluster
*/
def compPartitionMatrix(centr: centroidsHolder, examples: examplesHolder, m: Double): Vector[Vector[Double]] = {
def partCompleteExampleSetToOneCentr(centr: centroidsHolder, centrId: Int, examples: examplesHolder, m: Double): Vector[Double] = {
for (example <- examples) yield (calcMembershipFunc(example, centr, centrId, m))
}
(for (i <- centr.indices) yield (partCompleteExampleSetToOneCentr(centr, i, examples, m))).toVector
}
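// Note (follows from the membership formula above): for every example the
// memberships over all centroids sum to 1, so each column of this matrix is a
// proper probability distribution over the clusters.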
/**
* Computes the maximum absolute difference between the partition matrices U_(t) and U_(t-1)
*/
def compPartitionMatDelta(oldCentr: centroidsHolder, newCentr: centroidsHolder, examples: examplesHolder, m: Double): Double = {
val oldU = compPartitionMatrix(oldCentr, examples, m)
val newU = compPartitionMatrix(newCentr, examples, m)
(for (row <- oldU.indices; column <- oldU(row).indices)
yield abs(oldU(row)(column) - newU(row)(column))) //absolute entry-wise difference, so decreases count as much as increases
.foldLeft(0.0)((x, y) => max(x, y)) //maximum over all entries
}
/**
* Computes the maximum absolute difference between the old and new centroids
*/
def compCentroidsDelta(oldCentr: centroidsHolder, newCentr: centroidsHolder): Double = {
(for (row <- oldCentr.indices; column <- oldCentr(row).indices)
yield abs(oldCentr(row)(column) - newCentr(row)(column))) //absolute entry-wise difference
.foldLeft(0.0)((x, y) => max(x, y)) //maximum over all entries
}
/**
* Updating the Centre of the selected Centroid
*
* v_i=\frac{\sum_{k=1}^N u_{ik}^m x_k}{\sum_{k=1}^N u_{ik}^m}
* where N is the number of examples, i.e. the new centroid is the mean of all examples weighted by u_ik^m
*/
def v_i(examples: examplesHolder, centroids: centroidsHolder, curCentr: Int, m: Double): Vector[Double] = {
//upper means upper part of the fraction
val upper = examples
.map(j => (j, pow(calcMembershipFunc(j, centroids, curCentr, m), m))) //generate tuple of x_k and u_ik^m
.map(j => multScalVect(j._2, j._1)) // multiply x_k by u_ik^m from above
.reduceLeft((a, b) => vecAdd(a, b)) //sum to make final vector
val lower = examples.map(j => pow(calcMembershipFunc(j, centroids, curCentr, m), m)).sum //sum of u_ik^m over all examples
return divVectScal(lower, upper)
}
def execute_fuzzyCMeans(examples: examplesHolder, initial_Centroids: centroidsHolder): centroidsHolder = {
//Der "fuzzifier" m(>1) bestimmt, wie scharf die Objekte den Clustern zugeordnet werden.
//Lässt man m gegen unendlich laufen, so nähern sich die u_{ik} dem Wert \tfrac{1}{C} an, d.h. die
//Zugehörigkeit der Punkte ist zu allen Clustern gleich groß.
val fuzzifier = 2 //paso exp. b
val finCritEpsilon = 0.01 //tolerance epsilon
val maxIter = 10;
var lastCentr = initial_Centroids
var newCentr = initial_Centroids
var lastcompFin = finCritEpsilon * 10
var iter = 1
println("This is the fuzzy-C-Means speaking.")
println("You set the fuzzifier to " + fuzzifier)
println("and finCritEpsilon to " + finCritEpsilon)
//println("U_init is" + compPartitionMatrix(newCentr, examples, fuzzifier))
while (!(iter > maxIter || lastcompFin < finCritEpsilon)) { //stop once either inner condition holds
println("--This is iter " + iter + " --")
newCentr = (for (i <- lastCentr.indices) yield (v_i(examples, lastCentr, i, fuzzifier))).toVector
println("Centroids are: " + newCentr)
//println("U_act is" + compPartitionMatrix(newCentr, examples, fuzzifier))
lastcompFin = compPartitionMatDelta(lastCentr, newCentr, examples, fuzzifier)
println("delta is " + lastcompFin)
lastCentr = newCentr
iter += 1
}
return lastCentr
}
/**Lloyd K-Means specific**/
/**
* Iterative Lloyd's approach. There may also be a version where you update the centroids in batches
*/
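// Sequential (online) update rule, as implemented in the loop below: whenever an
// example x is assigned to centroid c, move c a fraction of the way towards it:
//   c_new = c_old + razon_aprendiz * (x - c_old)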
def execute_lloydAlgPajares(examples: examplesHolder, initial_Centroids: centroidsHolder): centroidsHolder = {
val razon_aprendiz = 0.1 //learning rate
val finCritEpsilon = 10e-10 //tolerance epsilon
val maxIter = 10;
var lastCentr = initial_Centroids
var newCentr = initial_Centroids
var curFinCrit = finCritEpsilon * 10 //last computed finalization criterion
var iter = 1
println("This is the LLoyds by Pajares")
println("You set learning rate to " + razon_aprendiz)
println("and finCritEps to " + finCritEpsilon)
println("and maxiter to " + maxIter)
while (!(iter > maxIter || curFinCrit < finCritEpsilon)) { //stop once either inner condition holds
println("--This is iter " + iter + " --")
//for each example, move its closest centroid towards it:
//exampleCloseCentroid._1 is the example vector, ._2 the id of its closest centroid
for (exampleCloseCentroid <- findClosestCentroid(examples, newCentr)) {
newCentr = newCentr.updated(exampleCloseCentroid._2, vecAdd(newCentr(exampleCloseCentroid._2), multScalVect(razon_aprendiz, vecSubstr(exampleCloseCentroid._1, newCentr(exampleCloseCentroid._2)))))
}
curFinCrit = compCentroidsDelta(lastCentr, newCentr)
lastCentr = newCentr
println("Centroids are: " + lastCentr)
println("FinCrit is : " + curFinCrit)
iter += 1
}
return lastCentr
}
}
object haupt {
def main(args: Array[String]): Unit = {
val input = ""
//if you can, set absolute path here
//val input = System.getProperty("user.dir") + "/Iris2Clases.txt"
/*val source = scala.io.Source.fromFile(input) //build dataset, strip last String as it's the text where it belongs
val dataset = (for (
line <- source.getLines;
val rawdata = line.toString().split(",")
) yield (rawdata.dropRight(1).toVector.map(i => i.toDouble), rawdata(rawdata.length - 1))).toList.toIndexedSeq
source.close()*/
//workaround so no file is read
val source = Vector(Vector("5.1,3.5,1.4,0.2,Iris-setosa"), Vector("4.9,3.0,1.4,0.2,Iris-setosa"), Vector("4.7,3.2,1.3,0.2,Iris-setosa"), Vector("4.6,3.1,1.5,0.2,Iris-setosa"), Vector("5.0,3.6,1.4,0.2,Iris-setosa"), Vector("5.4,3.9,1.7,0.4,Iris-setosa"), Vector("4.6,3.4,1.4,0.3,Iris-setosa"), Vector("5.0,3.4,1.5,0.2,Iris-setosa"), Vector("4.4,2.9,1.4,0.2,Iris-setosa"), Vector("4.9,3.1,1.5,0.1,Iris-setosa"), Vector("5.4,3.7,1.5,0.2,Iris-setosa"), Vector("4.8,3.4,1.6,0.2,Iris-setosa"), Vector("4.8,3.0,1.4,0.1,Iris-setosa"), Vector("4.3,3.0,1.1,0.1,Iris-setosa"), Vector("5.8,4.0,1.2,0.2,Iris-setosa"), Vector("5.7,4.4,1.5,0.4,Iris-setosa"), Vector("5.4,3.9,1.3,0.4,Iris-setosa"), Vector("5.1,3.5,1.4,0.3,Iris-setosa"), Vector("5.7,3.8,1.7,0.3,Iris-setosa"), Vector("5.1,3.8,1.5,0.3,Iris-setosa"), Vector("5.4,3.4,1.7,0.2,Iris-setosa"), Vector("5.1,3.7,1.5,0.4,Iris-setosa"), Vector("4.6,3.6,1.0,0.2,Iris-setosa"), Vector("5.1,3.3,1.7,0.5,Iris-setosa"), Vector("4.8,3.4,1.9,0.2,Iris-setosa"), Vector("5.0,3.0,1.6,0.2,Iris-setosa"), Vector("5.0,3.4,1.6,0.4,Iris-setosa"), Vector("5.2,3.5,1.5,0.2,Iris-setosa"), Vector("5.2,3.4,1.4,0.2,Iris-setosa"), Vector("4.7,3.2,1.6,0.2,Iris-setosa"), Vector("4.8,3.1,1.6,0.2,Iris-setosa"), Vector("5.4,3.4,1.5,0.4,Iris-setosa"), Vector("5.2,4.1,1.5,0.1,Iris-setosa"), Vector("5.5,4.2,1.4,0.2,Iris-setosa"), Vector("4.9,3.1,1.5,0.1,Iris-setosa"), Vector("5.0,3.2,1.2,0.2,Iris-setosa"), Vector("5.5,3.5,1.3,0.2,Iris-setosa"), Vector("4.9,3.1,1.5,0.1,Iris-setosa"), Vector("4.4,3.0,1.3,0.2,Iris-setosa"), Vector("5.1,3.4,1.5,0.2,Iris-setosa"), Vector("5.0,3.5,1.3,0.3,Iris-setosa"), Vector("4.5,2.3,1.3,0.3,Iris-setosa"), Vector("4.4,3.2,1.3,0.2,Iris-setosa"), Vector("5.0,3.5,1.6,0.6,Iris-setosa"), Vector("5.1,3.8,1.9,0.4,Iris-setosa"), Vector("4.8,3.0,1.4,0.3,Iris-setosa"), Vector("5.1,3.8,1.6,0.2,Iris-setosa"), Vector("4.6,3.2,1.4,0.2,Iris-setosa"), Vector("5.3,3.7,1.5,0.2,Iris-setosa"), Vector("5.0,3.3,1.4,0.2,Iris-setosa"), Vector("7.0,3.2,4.7,1.4,Iris-versicolor"), Vector("6.4,3.2,4.5,1.5,Iris-versicolor"), Vector("6.9,3.1,4.9,1.5,Iris-versicolor"), Vector("5.5,2.3,4.0,1.3,Iris-versicolor"), Vector("6.5,2.8,4.6,1.5,Iris-versicolor"), Vector("5.7,2.8,4.5,1.3,Iris-versicolor"), Vector("6.3,3.3,4.7,1.6,Iris-versicolor"), Vector("4.9,2.4,3.3,1.0,Iris-versicolor"), Vector("6.6,2.9,4.6,1.3,Iris-versicolor"), Vector("5.2,2.7,3.9,1.4,Iris-versicolor"), Vector("5.0,2.0,3.5,1.0,Iris-versicolor"), Vector("5.9,3.0,4.2,1.5,Iris-versicolor"), Vector("6.0,2.2,4.0,1.0,Iris-versicolor"), Vector("6.1,2.9,4.7,1.4,Iris-versicolor"), Vector("5.6,2.9,3.6,1.3,Iris-versicolor"), Vector("6.7,3.1,4.4,1.4,Iris-versicolor"), Vector("5.6,3.0,4.5,1.5,Iris-versicolor"), Vector("5.8,2.7,4.1,1.0,Iris-versicolor"), Vector("6.2,2.2,4.5,1.5,Iris-versicolor"), Vector("5.6,2.5,3.9,1.1,Iris-versicolor"), Vector("5.9,3.2,4.8,1.8,Iris-versicolor"), Vector("6.1,2.8,4.0,1.3,Iris-versicolor"), Vector("6.3,2.5,4.9,1.5,Iris-versicolor"), Vector("6.1,2.8,4.7,1.2,Iris-versicolor"), Vector("6.4,2.9,4.3,1.3,Iris-versicolor"), Vector("6.6,3.0,4.4,1.4,Iris-versicolor"), Vector("6.8,2.8,4.8,1.4,Iris-versicolor"), Vector("6.7,3.0,5.0,1.7,Iris-versicolor"), Vector("6.0,2.9,4.5,1.5,Iris-versicolor"), Vector("5.7,2.6,3.5,1.0,Iris-versicolor"), Vector("5.5,2.4,3.8,1.1,Iris-versicolor"), Vector("5.5,2.4,3.7,1.0,Iris-versicolor"), Vector("5.8,2.7,3.9,1.2,Iris-versicolor"), Vector("6.0,2.7,5.1,1.6,Iris-versicolor"), Vector("5.4,3.0,4.5,1.5,Iris-versicolor"), Vector("6.0,3.4,4.5,1.6,Iris-versicolor"), 
Vector("6.7,3.1,4.7,1.5,Iris-versicolor"), Vector("6.3,2.3,4.4,1.3,Iris-versicolor"), Vector("5.6,3.0,4.1,1.3,Iris-versicolor"), Vector("5.5,2.5,4.0,1.3,Iris-versicolor"), Vector("5.5,2.6,4.4,1.2,Iris-versicolor"), Vector("6.1,3.0,4.6,1.4,Iris-versicolor"), Vector("5.8,2.6,4.0,1.2,Iris-versicolor"), Vector("5.0,2.3,3.3,1.0,Iris-versicolor"), Vector("5.6,2.7,4.2,1.3,Iris-versicolor"), Vector("5.7,3.0,4.2,1.2,Iris-versicolor"), Vector("5.7,2.9,4.2,1.3,Iris-versicolor"), Vector("6.2,2.9,4.3,1.3,Iris-versicolor"), Vector("5.1,2.5,3.0,1.1,Iris-versicolor"), Vector("5.7,2.8,4.1,1.3,Iris-versicolor"))
val dataset = (for (
i <- source.iterator;
rawdata = i.head.split(",") //split each line; the last field is the class label
) yield (rawdata.dropRight(1).toVector.map(i => i.toDouble), rawdata(rawdata.length - 1))).toList.toIndexedSeq
val examples: examplesHolder = (for (i <- dataset) yield (i._1)).toVector
val num_classes = 2
//centroids were provided. You may wish to initialize them randomly.
val initial_centroid: centroidsHolder = Vector(
Vector(4.6, 3.0, 4.0, 0.0),
Vector(6.8, 3.4, 4.6, 0.7))
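//A minimal sketch of random initialization, as suggested above (kept commented
//out so the fixed centroids keep the run reproducible): pick num_classes
//examples at random and use them as the starting centroids.
//val initial_centroid: centroidsHolder =
//  scala.util.Random.shuffle(examples).take(num_classes)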
var lastCentr: centroidsHolder = initial_centroid;
for (i <- Range(0, 2)) {
if (i == 0) //in each round execute one of them
lastCentr = execute_fuzzyCMeans(examples, initial_centroid)
else
lastCentr = execute_lloydAlgPajares(examples, initial_centroid)
//say class 0 is Iris-setosa and class 1 is Iris-versicolor
//Are the examples well classified?
println("")
val final_classific = findClosestCentroid(examples, lastCentr).toIndexedSeq
//builds a (1, 0) / (0, 1) tuple per example and sums them up to (#correct, #incorrect)
val compareRes = (for (i <- final_classific.indices) yield (if (final_classific(i)._2.formatted("%d")
.replaceFirst("0", "Iris-setosa")
.replaceFirst("1", "Iris-versicolor")
.compareTo(dataset(i)._2) == 0) (1, 0) else (0, 1))).reduceLeft((ex1, ex2) => (ex1._1 + ex2._1, ex1._2 + ex2._2))
println("correctly classified: " + compareRes._1 + ", wrong: " + compareRes._2)
println("total 'cost': " + sumClosestCentroidDists(examples, lastCentr))
println("Clasificar los 3 ejemplos proporcionados:")
val testIris01 = Vector(5.1, 3.5, 1.4, 0.2) //"Iris-Setosa"
println("Should yield to : 0")
println(findClosestCentroid(Vector(testIris01), lastCentr))
val testIris02 = Vector(6.9, 3.1, 4.9, 1.5) //"Iris-versicolor"
println("Should yield to : 1")
println(findClosestCentroid(Vector(testIris02), lastCentr))
val testIris03 = Vector(5.0, 3.4, 1.5, 0.2) //"Iris-setosa"
println("Should yield to : 0")
println(findClosestCentroid(Vector(testIris03), lastCentr))
println("---")
}
}
}
//as this will be executed as a script, we call main here
//remove if you want to compile it
haupt.main(Array.empty)