krishnanraman/RESULTS.txt

## build.sbt
// Build definition for the Spark CBIR classification job; the fat jar is built with sbt-assembly.
name := "cbir Job"

version := "1.0"

scalaVersion := "2.10.4"
// Fixed file name of the jar produced by `sbt assembly`.
assemblyJarName := "cbir.jar"

// %% appends the Scala binary version (_2.10), so both Spark artifacts follow scalaVersion.
libraryDependencies += "org.apache.spark" %% "spark-core" % "1.4.1"
libraryDependencies += "org.apache.spark" %% "spark-mllib" % "1.4.1"

// Drop META-INF entries and keep the first copy of every other duplicated file.
// assemblyMergeStrategy is the key from the same sbt-assembly generation as assemblyJarName.
assemblyMergeStrategy in assembly := {
  case PathList("META-INF", xs @ _*) => MergeStrategy.discard
  case _ => MergeStrategy.first
}

## cbir.scala
import java.awt.image.{BufferedImage, WritableRaster}
import javax.imageio.ImageIO
import java.io.File
import scala.collection.JavaConversions._
import org.apache.commons.math3.ml.clustering.{KMeansPlusPlusClusterer, DoublePoint}
import org.apache.commons.math3.ml.distance.EuclideanDistance
import org.apache.commons.math3.stat.descriptive.moment.{Mean, Variance, Skewness}

/*
Given a picture,
run kmeans to compute 8 dominant colors.
replace picture with simpler picture containing only 8 dominant colors
Compute a cbir feature ( Array[Double] of length 45 ) based primarily on statistical moments (mean,variance,skew)
of the simpler picture.
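To run: scala -cp commonsmath.jar:. cbir directory/ spatial   (directory must end with a slash; spatial = number of local groups, 4 gives the length-45 feature)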
*/
/**
 * Command-line tool that writes one CBIR signature file ("<image>_sig.txt") next to
 * every ".jpg" found in a directory.
 *
 * Arguments: args(0) = image directory, args(1) = optional number of spatial groups.
 */
object cbir extends App {

	// Number of dominant colors the image is reduced to by k-means.
	private final val PaletteSize = 8
	// Samples read per pixel; the signature is built from bands 0, 1 and 2.
	private final val Bands = 3
	// Used when args(1) is absent: 4 groups * 9 + 9 global = 45 values.
	private final val DefaultSpatial = 4

	/**
	 * First three statistical moments of a sample.
	 *
	 * @param x the sample values
	 * @return List(mean, variance, skewness) as computed by commons-math
	 */
	def stats(x:Array[Double]):List[Double] =
		List(
			new Mean().evaluate(x, 0, x.length),
			new Variance().evaluate(x, 0, x.length),
			new Skewness().evaluate(x, 0, x.length))

	/**
	 * Computes the signature of one image: the image is quantized to its
	 * dominant colors, then per-band moments are taken over `spatial` consecutive
	 * pixel groups (local part) and over the whole image (global part).
	 *
	 * @param imagefile path of the image to read
	 * @return local signature (spatial * 9 values) followed by the global signature (9 values)
	 * @throws IllegalArgumentException if the file cannot be decoded as an image or
	 *         the number of spatial groups is not between 1 and the pixel count
	 */
	def compute(imagefile:String):List[Double] = {

		// ImageIO.read returns null when no registered reader understands the file.
		val img = ImageIO.read(new File(imagefile))
		require(img != null, "Cannot decode image: " + imagefile)
		val raster:WritableRaster = img.getRaster
		val (w,h) = (img.getWidth, img.getHeight)

		// Reads the samples of one pixel into a fresh array.
		def pixelAt(x:Int, y:Int):Array[Double] =
			raster.getPixel(x, y, new Array[Double](Bands))

		// Per-band moments (band 0, then 1, then 2) of a run of pixels.
		def moments(pixels:Seq[Array[Double]]):List[Double] =
			(0 until Bands).toList.flatMap { band => stats(pixels.map{ p => p(band) }.toArray) }

		// extract all colors from raster, column by column (x outer, y inner)
		val allColors = for (x <- 0 until w; y <- 0 until h) yield new DoublePoint(pixelAt(x, y))

		// find the dominant colors of the image via kmeans (fixed seed => repeatable palette)
		val kmeans = new KMeansPlusPlusClusterer[DoublePoint](PaletteSize, 1000)
		kmeans.getRandomGenerator().setSeed(1234567L)
		val centroids = kmeans.cluster(allColors.toIterable)
		val colors = centroids.map{ c => c.getCenter.getPoint }
		val euclidean = new EuclideanDistance()

		// replace each pixel with the closest dominant color (first one wins on ties)
		for (x <- 0 until w; y <- 0 until h) {
			val pixel = pixelAt(x, y)
			val closestColor = colors.minBy{ c => euclidean.compute(c, pixel) }
			raster.setPixel(x, y, closestColor)
		}

		// extract all the colors again from the updated raster, same scan order as above
		val data:IndexedSeq[Array[Double]] =
			for (x <- 0 until w; y <- 0 until h) yield pixelAt(x, y)

		// Each band contributes mean, variance & skew => 3 bands * 3 = 9 values per region.
		val global = moments(data)
		assert(global.size == 9)

		// args is null when compute is called before the App body has started.
		val spatial = Option(args).filter(_.length > 1).map(_(1).toInt).getOrElse(DefaultSpatial)
		val n = data.size
		require(spatial > 0 && spatial <= n,
			"spatial must be between 1 and the pixel count (" + n + "), got " + spatial)

		// Split into exactly `spatial` contiguous groups. Boundaries are computed
		// proportionally so a pixel count that is not a multiple of `spatial` does
		// not produce an extra trailing group; for exact multiples the groups are
		// the same as fixed-size chunks of n/spatial.
		val local = (0 until spatial).toList.flatMap { i =>
			val start = (i.toLong * n / spatial).toInt
			val end = ((i + 1).toLong * n / spatial).toInt
			moments(data.slice(start, end))
		}
		assert(local.size == spatial * 9)

		local ++ global
	}

	// Placeholder with no behavior; kept so the object's members stay unchanged.
	def routine = {

	}

	// Appends `name` to `dir`, inserting a separator only when `dir` lacks one.
	private def inDir(dir:String, name:String):String =
		if (dir.isEmpty || dir.endsWith("/") || dir.endsWith(File.separator)) dir + name
		else dir + File.separator + name

	/**
	 * Writes a signature, one value per line, to "<dir>/<file>_sig.txt".
	 *
	 * @param x    signature values
	 * @param dir  directory to write into (trailing separator optional)
	 * @param file name of the image the signature belongs to
	 */
	def save(x:List[Double], dir:String, file:String):Unit = {
		val path = inDir(dir, file + "_sig.txt")
		val pw = new java.io.PrintWriter(path)
		try {
			println("Saving " + path)
			x.foreach{ v => pw.println(v) }
			pw.flush()
		} finally {
			pw.close() // release the file handle even if writing fails
		}
	}

	require(args.nonEmpty, "usage: cbir <image-directory> [spatial-groups]")

	// find all image files in dir; File.list returns null for a missing/unreadable directory
	val dir = args(0)
	val files = Option(new File(dir).list())
		.getOrElse(throw new IllegalArgumentException("Not a readable directory: " + dir))
		.filter{ name => name.endsWith(".jpg") }
	files.foreach{ file =>
		val path = inDir(dir, file)
		println("Processing " + path)
		save(cbir.compute(path), dir, file)
	}
}

## cbirJob.scala
package com.marin.cv

// To build: sbt assembly
// To run: spark-submit --driver-memory 120g --executor-memory 120g --class com.marin.cv.cbirJob --master local[*] target/scala-2.10/cbir.jar "--dir /media/kraman/disk1/comp_vision/sig"
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.SparkConf
import org.apache.spark.rdd._
import org.apache.spark.rdd.PairRDDFunctions
import com.marin.util.Args
import java.io.PrintWriter
import org.apache.spark.mllib.classification._
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.linalg._
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import collection.mutable.ListBuffer

/**
 * Spark job that trains one SVM per image class (one-vs-rest) on the CBIR
 * signature files found under `--dir` and reports area under the PR and ROC
 * curves for each class.
 *
 * Output: per-class metrics in the local file "log" and in a text directory
 * named "svm classifier results<timestamp>".
 */
object cbirJob extends App {
    // NOTE(review): settings placed on SparkConf take precedence over spark-submit
    // flags, so the --master given on the command line is presumably ignored - confirm.
    val conf = new SparkConf()
        .setMaster("local[28]")
        .set("spark.driver.maxResultSize", "100g")
        .set("spark.local.dir", "/media/kraman/disk2/tmp")
        .set("spark.akka.threads", "256")

    val sc = new SparkContext(conf)
    // Join with a space so options passed as separate argv entries do not run together.
    val myargs = Args(args.mkString(" "))
    val dir = myargs("dir")
    val mylog = new PrintWriter("log")

    // Hundreds bucket of the numeric image file name -> class name.
    val classNames:Map[Int, String] = Map(
        300->"bus",
        400->"dino",
        500->"elephant",
        600->"flower",
        700->"horse",
        800->"nature",
        900->"food")

    // read all the CBIR image signatures & convert to features (every point labelled 1.0)
    val classFeatures:Map[String, ListBuffer[LabeledPoint]] = sc
    .wholeTextFiles(dir) // RDD[(String, String)]
    .map{ case (filename, contents) =>
        val values:Array[Double] = contents.split("\n").map{_.trim}.filter{_.nonEmpty}.map{_.toDouble}
        val lp:LabeledPoint = new LabeledPoint(1.0d, new DenseVector(values))
        val idx = filename.indexOf(".jpg_sig.txt")
        require(idx >= 3, "Unexpected signature file name: " + filename)
        val key = (filename.slice(idx-3,idx).toInt/100)*100 // eg. turn 368 into 300
        (classNames(key), lp)
    } // RDD[String, LabeledPoint]
    .aggregateByKey(ListBuffer[LabeledPoint]())(
        (buf, lp) => buf += lp,   // append in place instead of copying on every prepend
        (left, right) => left ++= right
    ) // RDD[String, ListBuffer[LabeledPoint]]
    .collectAsMap.toMap

    // for each class, train a binary classifier against an equally sized
    // uniform sample of the other classes
    val results:List[(String, Double, Double)] = try {
        val perClass = classFeatures.map{ case (className, features) =>
            val Array(posTrain, posTest) = sc.makeRDD(features).randomSplit(Array(0.8,0.2))

            // every point of every other class, relabelled as a negative;
            // flatten also copes with there being no other class
            val negatives = classFeatures
                .filterKeys(name => name != className)
                .values
                .flatten
                .map{ x:LabeledPoint => new LabeledPoint(0, x.features)} // flip labels
                .toSeq
            val negFeatures = sc.makeRDD(negatives).cache()

            // want |negatives| == |positives|, but |negatives| > |positives| in dataset
            val posTrainCount = posTrain.count
            val fraction = math.min(1.0, posTrainCount.toDouble/negFeatures.count)
            // cached so the sample and its complement are materialized once and reused
            val negTrain = negFeatures.sample(withReplacement = false, fraction).cache()
            val negTest = negFeatures.subtract(negTrain).cache()

            mylog.println("Counts(NegTrain, NegTest, PosTrain, PosTest):" +
                negTrain.count + "," + negTest.count + "," + posTrainCount + "," + posTest.count)

            val model:SVMModel = SVMWithSGD.train(posTrain ++ negTrain, 1000)
            // Emit raw margins instead of 0/1 predictions so the PR and ROC curves
            // have more than a single operating point.
            model.clearThreshold()

            // Score and label are produced together from the same record, so they
            // cannot become misaligned the way two separately zipped RDDs can.
            val scoresAndLabels = (posTest ++ negTest).map{ lp => (model.predict(lp.features), lp.label) }
            val metrics = new BinaryClassificationMetrics(scoresAndLabels)
            val (pr, roc) = (metrics.areaUnderPR, metrics.areaUnderROC)

            mylog.println( className + "," + pr + "," + roc)

            (className, pr, roc)
        }.toList

        sc.makeRDD(perClass,1).saveAsTextFile("svm classifier results"+System.currentTimeMillis)
        perClass
    } finally {
        // flush the log and release Spark even when training fails part-way
        mylog.close()
        sc.stop()
    }
}

## cbirJob2.scala
// THIS VERSION TRAINS MULTIPLE CLASSIFIERS PER CLASS
// Corel has 7 classes, each class has 100 images
// WANT 80 +ve, 80-ve PER CLASSIFIER PER CLASS.
// So 600 -ve per class = 80*7 + 40. So 7 classifiers PER CLASS
// TOTAL 7 classifiers per class * 7 classes = 49 classifiers

package com.marin.cv
// To build: sbt assembly
// To run: spark-submit --driver-memory 120g --executor-memory 120g --class com.marin.cv.cbirJob --master local[*] target/scala-2.10/cbir.jar "--dir /media/kraman/disk1/comp_vision/sig"

import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.SparkConf
import org.apache.spark.rdd._
import org.apache.spark.rdd.PairRDDFunctions
import com.marin.util.Args
import java.io.PrintWriter
import org.apache.spark.mllib.classification._
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.linalg._
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import collection.mutable.ListBuffer

/**
 * Spark job that trains several SVMs per image class. The negatives are split
 * into 7 training buckets plus 1 test bucket; one classifier is trained per
 * training bucket against the same positives, and a test point is predicted
 * positive when at least 4 of the 7 classifiers say so.
 *
 * Output: per-class metrics in the local file "log" and in a text directory
 * named "svm classifier results<timestamp>".
 */
object cbirJob extends App {
    // NOTE(review): settings placed on SparkConf take precedence over spark-submit
    // flags, so the --master given on the command line is presumably ignored - confirm.
    val conf = new SparkConf()
        .setMaster("local[28]")
        .set("spark.driver.maxResultSize", "100g")
        .set("spark.local.dir", "/media/kraman/disk2/tmp")
        .set("spark.akka.threads", "256")

    val sc = new SparkContext(conf)
    // Join with a space so options passed as separate argv entries do not run together.
    val myargs = Args(args.mkString(" "))
    val dir = myargs("dir")
    val mylog = new PrintWriter("log")

    // Hundreds bucket of the numeric image file name -> class name.
    val classNames:Map[Int, String] = Map(
        300->"bus",
        400->"dino",
        500->"elephant",
        600->"flower",
        700->"horse",
        800->"nature",
        900->"food")

    // read all the CBIR image signatures & convert to features (every point labelled 1.0)
    val classFeatures:Map[String, ListBuffer[LabeledPoint]] = sc
    .wholeTextFiles(dir) // RDD[(String, String)]
    .map{ case (filename, contents) =>
        val values:Array[Double] = contents.split("\n").map{_.trim}.filter{_.nonEmpty}.map{_.toDouble}
        val lp:LabeledPoint = new LabeledPoint(1.0d, new DenseVector(values))
        val idx = filename.indexOf(".jpg_sig.txt")
        require(idx >= 3, "Unexpected signature file name: " + filename)
        val key = (filename.slice(idx-3,idx).toInt/100)*100 // eg. turn 368 into 300
        (classNames(key), lp)
    } // RDD[String, LabeledPoint]
    .aggregateByKey(ListBuffer[LabeledPoint]())(
        (buf, lp) => buf += lp,   // append in place instead of copying on every prepend
        (left, right) => left ++= right
    ) // RDD[String, ListBuffer[LabeledPoint]]
    .collectAsMap.toMap

    // for each class, train an ensemble of binary classifiers
    val results:List[(String, Double, Double)] = try {
        val perClass = classFeatures.map{ case (className, features) =>
            val Array(posTrainRaw, posTest) = sc.makeRDD(features).randomSplit(Array(0.8,0.2))
            val posTrain = posTrainRaw.cache() // reused by every classifier below

            // every point of every other class, relabelled as a negative;
            // flatten also copes with there being no other class
            val negatives = classFeatures
                .filterKeys(name => name != className)
                .values
                .flatten
                .map{ x:LabeledPoint => new LabeledPoint(0, x.features)} // flip labels
                .toSeq
            val negFeatures = sc.makeRDD(negatives)

            // want |negatives| == |positives|, but |negatives| > |positives| in dataset
            // so make as many sample buckets as you need to
            // sample bucket size must be equal to pos sample size = 80
            // 80 * 7 + 40 = 600
            // so need 7 buckets, plus 1 test set bucket
            val negRatio = Array(80,80,80,80,80,80,80,40).map{ _/600.0 }
            val negArray = negFeatures.randomSplit(negRatio)
            val negTest = negArray(7) // SET TEST SET ASIDE
            val testSet = (posTest ++ negTest).cache()

            // these counts do not depend on the bucket, so compute them once
            val (negTestCount, posTrainCount, posTestCount) = (negTest.count, posTrain.count, posTest.count)

            // NOW BUILD 1 classifier PER NEG SAMPLE BUCKET
            val models:IndexedSeq[SVMModel] = (0 to 6).map{ idx =>
                val negTrain = negArray(idx)
                val model:SVMModel = SVMWithSGD.train(posTrain ++ negTrain, 1000)

                mylog.println("SVM: " + idx + " Counts(NegTrain, NegTest, PosTrain, PosTest):" +
                negTrain.count + "," + negTestCount + "," + posTrainCount + "," + posTestCount)
                model
            }

            // Each classifier votes 0.0 or 1.0 for a test point. The consensus is 1.0
            // when the majority agrees: ceil(7/2) = 4 or more votes. All votes for a
            // point are taken in one pass over the test set, so every prediction
            // stays attached to its own label.
            val scoresAndLabels = testSet.map{ lp =>
                val votes = models.map{ m => m.predict(lp.features) }.sum
                (if (votes >= 4.0) 1.0 else 0.0, lp.label)
            }
            val metrics = new BinaryClassificationMetrics(scoresAndLabels)
            val (pr, roc) = (metrics.areaUnderPR, metrics.areaUnderROC)

            mylog.println( className + "," + pr + "," + roc)

            (className, pr, roc)
        }.toList

        sc.makeRDD(perClass,1).saveAsTextFile("svm classifier results"+System.currentTimeMillis)
        perClass
    } finally {
        // flush the log and release Spark even when training fails part-way
        mylog.close()
        sc.stop()
    }
}

## RESULTS.txt
Dataset: COREL subset ( 7 classes, 100 images per class  => 7*100 = 700 jpgs )
COREL: https://sites.google.com/site/dctresearch/Home/content-based-image-retrieval

Training Test Ratio: 80-20

Equal number of true & false samples ie. train on 80 dinos & 80 random non-dinos out of 700-100 = 600 non-dinos.
So training sample size = 80 + 80 = 160
Test sample = 20 dinos + 20 non-dinos

Train 1 SVM classifier per class => 7 SVM classifiers
Hyperparam: 1000 iterations on the SGD with training rate 1.0

(Name of Classifier, AUC = Area under Precision Recall Curve, Area under ROC Curve)
(bus,0.7266666666666667,0.6215384615384616)
(dino,0.9289215686274509,0.9179566563467493)
(flower,0.9762845849802372,0.9545454545454546)
(horse,0.9277836134453781,0.8854166666666667)
(food,0.7559523809523809,0.6142857142857143)
(nature,0.843956043956044,0.7318181818181818)
(elephant,0.5694235588972432,0.5354691075514875)

Conclusions:
Easiest to classify = Flower, Dino
Hardest to classify = Elephant

wiki for AUC: http://fastml.com/what-you-wanted-to-know-about-auc/  ( we want 1.0, we get 0.56 to 0.97 )
wiki for ROC:http://gim.unmc.edu/dxtests/roc3.htm ( we want 1.0, we get 0.53 to 0.95)

Run2: Test Set: 20 positives + 520 negatives ( negative test samples far overwhelm positives )

(bus,0.3872202166064982,0.7410112359550562)
(dino,0.6296296296296297,0.9430740037950663)
(flower,0.5792057698992006,0.8983012559862654)
(horse,0.4008389605074688,0.6706503014642549)
(food,0.24998883499915142,0.6119496855345912)
(nature,0.11913195351266649,0.5286529060293319)
(elephant,0.5030071466512145,0.650146771037182)

Run3: Use a 1:9 ratio for training. 1 positive sample for 9 negative samples!
Justification: http://www.ncbi.nlm.nih.gov/pmc/articles/PMC4061540/
Essentially, opposite of Run2.

Summary: (CLASSNAME, AUC, ROC)
(bus,0.6923076923076923,0.5)
(dino,0.9463950683462878,0.9024390243902439)
(flower,0.9299660441426146,0.881578947368421)
(horse,0.6826025459688826,0.5142857142857142)
(food,0.7222222222222222,0.5)
(nature,0.7056831267357584,0.7325162220620043)
(elephant,0.6941747572815534,0.5)
---------
Detailed Counts
CLASSNAME, AUC, ROC
---------
Counts(NegTrain, NegTest, PosTrain, PosTest):536,64,60,40
bus,0.6923076923076923,0.5
Counts(NegTrain, NegTest, PosTrain, PosTest):550,50,59,41
dino,0.9463950683462878,0.9024390243902439
Counts(NegTrain, NegTest, PosTrain, PosTest):545,55,62,38
flower,0.9299660441426146,0.881578947368421
Counts(NegTrain, NegTest, PosTrain, PosTest):534,66,65,35
horse,0.6826025459688826,0.5142857142857142
Counts(NegTrain, NegTest, PosTrain, PosTest):550,50,60,40
food,0.7222222222222222,0.5
Counts(NegTrain, NegTest, PosTrain, PosTest):527,73,62,38
nature,0.7056831267357584,0.7325162220620043
Counts(NegTrain, NegTest, PosTrain, PosTest):537,63,60,40
elephant,0.6941747572815534,0.5

Run 4. Use equal number of positive & negative training set.
But since number of negatives > number of positives,
use as many classifiers PER class as required to cover the entire negative sample training set.
// THIS VERSION TRAINS MULTIPLE CLASSIFIERS PER CLASS
// Corel has 7 classes, each class has 100 images
// WANT 80 +ve, 80-ve PER CLASSIFIER PER CLASS.
// So 600 -ve per class = 80*7 + 40. So 7 classifiers PER CLASS
// TOTAL 7 classifiers per class * 7 classes = 49 classifiers

Justification:
http://sci2s.ugr.es/keel/pdf/specific/congreso/akbani_svm_2004.pdf
http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.309.904&rep=rep1&type=pdf
Summary: (CLASSNAME, AUC, ROC)
(bus,0.7278735632183908,0.7034313725490197)
(dino,0.9265931372549019,0.9295485636114911)
(flower,0.9380952380952381,0.9047619047619048)
(horse,0.7213669950738917,0.7648809523809524)
(food,0.7255244755244755,0.7071428571428572)
(nature,0.6842105263157895,0.6556390977443609)
(elephant,0.7199602780536245,0.6795665634674922)
	// Build definition for the Spark CBIR classification job; the fat jar is built with sbt-assembly.
	name := "cbir Job"

	version := "1.0"

	scalaVersion := "2.10.4"
	// Fixed file name of the jar produced by `sbt assembly`.
	assemblyJarName := "cbir.jar"

	// %% appends the Scala binary version (_2.10), so both Spark artifacts follow scalaVersion.
	libraryDependencies += "org.apache.spark" %% "spark-core" % "1.4.1"
	libraryDependencies += "org.apache.spark" %% "spark-mllib" % "1.4.1"

	// Drop META-INF entries and keep the first copy of every other duplicated file.
	// assemblyMergeStrategy is the key from the same sbt-assembly generation as assemblyJarName.
	assemblyMergeStrategy in assembly := {
	  case PathList("META-INF", xs @ _*) => MergeStrategy.discard
	  case _ => MergeStrategy.first
	}
	import java.awt.image.{BufferedImage, WritableRaster}
	import javax.imageio.ImageIO
	import java.io.File
	import scala.collection.JavaConversions._
	import org.apache.commons.math3.ml.clustering.{KMeansPlusPlusClusterer, DoublePoint}
	import org.apache.commons.math3.ml.distance.EuclideanDistance
	import org.apache.commons.math3.stat.descriptive.moment.{Mean, Variance, Skewness}

	/*
	Given a picture,
	run kmeans to compute 8 dominant colors.
	replace picture with simpler picture containing only 8 dominant colors
	Compute a cbir feature ( Array[Double] of length 45 ) based primarily on statistical moments (mean,variance,skew)
	of the simpler picture.
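	To run: scala -cp commonsmath.jar:. cbir directory/ spatial   (directory must end with a slash; spatial = number of local groups, 4 gives the length-45 feature)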
	*/
	/**
	 * Command-line tool that writes one CBIR signature file ("<image>_sig.txt") next to
	 * every ".jpg" found in a directory.
	 *
	 * Arguments: args(0) = image directory, args(1) = optional number of spatial groups.
	 */
	object cbir extends App {

		// Number of dominant colors the image is reduced to by k-means.
		private final val PaletteSize = 8
		// Samples read per pixel; the signature is built from bands 0, 1 and 2.
		private final val Bands = 3
		// Used when args(1) is absent: 4 groups * 9 + 9 global = 45 values.
		private final val DefaultSpatial = 4

		/**
		 * First three statistical moments of a sample.
		 *
		 * @param x the sample values
		 * @return List(mean, variance, skewness) as computed by commons-math
		 */
		def stats(x:Array[Double]):List[Double] =
			List(
				new Mean().evaluate(x, 0, x.length),
				new Variance().evaluate(x, 0, x.length),
				new Skewness().evaluate(x, 0, x.length))

		/**
		 * Computes the signature of one image: the image is quantized to its
		 * dominant colors, then per-band moments are taken over `spatial` consecutive
		 * pixel groups (local part) and over the whole image (global part).
		 *
		 * @param imagefile path of the image to read
		 * @return local signature (spatial * 9 values) followed by the global signature (9 values)
		 * @throws IllegalArgumentException if the file cannot be decoded as an image or
		 *         the number of spatial groups is not between 1 and the pixel count
		 */
		def compute(imagefile:String):List[Double] = {

			// ImageIO.read returns null when no registered reader understands the file.
			val img = ImageIO.read(new File(imagefile))
			require(img != null, "Cannot decode image: " + imagefile)
			val raster:WritableRaster = img.getRaster
			val (w,h) = (img.getWidth, img.getHeight)

			// Reads the samples of one pixel into a fresh array.
			def pixelAt(x:Int, y:Int):Array[Double] =
				raster.getPixel(x, y, new Array[Double](Bands))

			// Per-band moments (band 0, then 1, then 2) of a run of pixels.
			def moments(pixels:Seq[Array[Double]]):List[Double] =
				(0 until Bands).toList.flatMap { band => stats(pixels.map{ p => p(band) }.toArray) }

			// extract all colors from raster, column by column (x outer, y inner)
			val allColors = for (x <- 0 until w; y <- 0 until h) yield new DoublePoint(pixelAt(x, y))

			// find the dominant colors of the image via kmeans (fixed seed => repeatable palette)
			val kmeans = new KMeansPlusPlusClusterer[DoublePoint](PaletteSize, 1000)
			kmeans.getRandomGenerator().setSeed(1234567L)
			val centroids = kmeans.cluster(allColors.toIterable)
			val colors = centroids.map{ c => c.getCenter.getPoint }
			val euclidean = new EuclideanDistance()

			// replace each pixel with the closest dominant color (first one wins on ties)
			for (x <- 0 until w; y <- 0 until h) {
				val pixel = pixelAt(x, y)
				val closestColor = colors.minBy{ c => euclidean.compute(c, pixel) }
				raster.setPixel(x, y, closestColor)
			}

			// extract all the colors again from the updated raster, same scan order as above
			val data:IndexedSeq[Array[Double]] =
				for (x <- 0 until w; y <- 0 until h) yield pixelAt(x, y)

			// Each band contributes mean, variance & skew => 3 bands * 3 = 9 values per region.
			val global = moments(data)
			assert(global.size == 9)

			// args is null when compute is called before the App body has started.
			val spatial = Option(args).filter(_.length > 1).map(_(1).toInt).getOrElse(DefaultSpatial)
			val n = data.size
			require(spatial > 0 && spatial <= n,
				"spatial must be between 1 and the pixel count (" + n + "), got " + spatial)

			// Split into exactly `spatial` contiguous groups. Boundaries are computed
			// proportionally so a pixel count that is not a multiple of `spatial` does
			// not produce an extra trailing group; for exact multiples the groups are
			// the same as fixed-size chunks of n/spatial.
			val local = (0 until spatial).toList.flatMap { i =>
				val start = (i.toLong * n / spatial).toInt
				val end = ((i + 1).toLong * n / spatial).toInt
				moments(data.slice(start, end))
			}
			assert(local.size == spatial * 9)

			local ++ global
		}

		// Placeholder with no behavior; kept so the object's members stay unchanged.
		def routine = {

		}

		// Appends `name` to `dir`, inserting a separator only when `dir` lacks one.
		private def inDir(dir:String, name:String):String =
			if (dir.isEmpty || dir.endsWith("/") || dir.endsWith(File.separator)) dir + name
			else dir + File.separator + name

		/**
		 * Writes a signature, one value per line, to "<dir>/<file>_sig.txt".
		 *
		 * @param x    signature values
		 * @param dir  directory to write into (trailing separator optional)
		 * @param file name of the image the signature belongs to
		 */
		def save(x:List[Double], dir:String, file:String):Unit = {
			val path = inDir(dir, file + "_sig.txt")
			val pw = new java.io.PrintWriter(path)
			try {
				println("Saving " + path)
				x.foreach{ v => pw.println(v) }
				pw.flush()
			} finally {
				pw.close() // release the file handle even if writing fails
			}
		}

		require(args.nonEmpty, "usage: cbir <image-directory> [spatial-groups]")

		// find all image files in dir; File.list returns null for a missing/unreadable directory
		val dir = args(0)
		val files = Option(new File(dir).list())
			.getOrElse(throw new IllegalArgumentException("Not a readable directory: " + dir))
			.filter{ name => name.endsWith(".jpg") }
		files.foreach{ file =>
			val path = inDir(dir, file)
			println("Processing " + path)
			save(cbir.compute(path), dir, file)
		}
	}
	package com.marin.cv

	// To build: sbt assembly
	// To run: spark-submit --driver-memory 120g --executor-memory 120g --class com.marin.cv.cbirJob --master local[*] target/scala-2.10/cbir.jar "--dir /media/kraman/disk1/comp_vision/sig"
	import org.apache.spark.SparkContext
	import org.apache.spark.SparkContext._
	import org.apache.spark.SparkConf
	import org.apache.spark.rdd._
	import org.apache.spark.rdd.PairRDDFunctions
	import com.marin.util.Args
	import java.io.PrintWriter
	import org.apache.spark.mllib.classification._
	import org.apache.spark.mllib.regression.LabeledPoint
	import org.apache.spark.mllib.linalg._
	import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
	import collection.mutable.ListBuffer

	/**
	 * Spark job that trains one SVM per image class (one-vs-rest) on the CBIR
	 * signature files found under `--dir` and reports area under the PR and ROC
	 * curves for each class.
	 *
	 * Output: per-class metrics in the local file "log" and in a text directory
	 * named "svm classifier results<timestamp>".
	 */
	object cbirJob extends App {
	    // NOTE(review): settings placed on SparkConf take precedence over spark-submit
	    // flags, so the --master given on the command line is presumably ignored - confirm.
	    val conf = new SparkConf()
	        .setMaster("local[28]")
	        .set("spark.driver.maxResultSize", "100g")
	        .set("spark.local.dir", "/media/kraman/disk2/tmp")
	        .set("spark.akka.threads", "256")

	    val sc = new SparkContext(conf)
	    // Join with a space so options passed as separate argv entries do not run together.
	    val myargs = Args(args.mkString(" "))
	    val dir = myargs("dir")
	    val mylog = new PrintWriter("log")

	    // Hundreds bucket of the numeric image file name -> class name.
	    val classNames:Map[Int, String] = Map(
	        300->"bus",
	        400->"dino",
	        500->"elephant",
	        600->"flower",
	        700->"horse",
	        800->"nature",
	        900->"food")

	    // read all the CBIR image signatures & convert to features (every point labelled 1.0)
	    val classFeatures:Map[String, ListBuffer[LabeledPoint]] = sc
	    .wholeTextFiles(dir) // RDD[(String, String)]
	    .map{ case (filename, contents) =>
	        val values:Array[Double] = contents.split("\n").map{_.trim}.filter{_.nonEmpty}.map{_.toDouble}
	        val lp:LabeledPoint = new LabeledPoint(1.0d, new DenseVector(values))
	        val idx = filename.indexOf(".jpg_sig.txt")
	        require(idx >= 3, "Unexpected signature file name: " + filename)
	        val key = (filename.slice(idx-3,idx).toInt/100)*100 // eg. turn 368 into 300
	        (classNames(key), lp)
	    } // RDD[String, LabeledPoint]
	    .aggregateByKey(ListBuffer[LabeledPoint]())(
	        (buf, lp) => buf += lp,   // append in place instead of copying on every prepend
	        (left, right) => left ++= right
	    ) // RDD[String, ListBuffer[LabeledPoint]]
	    .collectAsMap.toMap

	    // for each class, train a binary classifier against an equally sized
	    // uniform sample of the other classes
	    val results:List[(String, Double, Double)] = try {
	        val perClass = classFeatures.map{ case (className, features) =>
	            val Array(posTrain, posTest) = sc.makeRDD(features).randomSplit(Array(0.8,0.2))

	            // every point of every other class, relabelled as a negative;
	            // flatten also copes with there being no other class
	            val negatives = classFeatures
	                .filterKeys(name => name != className)
	                .values
	                .flatten
	                .map{ x:LabeledPoint => new LabeledPoint(0, x.features)} // flip labels
	                .toSeq
	            val negFeatures = sc.makeRDD(negatives).cache()

	            // want |negatives| == |positives|, but |negatives| > |positives| in dataset
	            val posTrainCount = posTrain.count
	            val fraction = math.min(1.0, posTrainCount.toDouble/negFeatures.count)
	            // cached so the sample and its complement are materialized once and reused
	            val negTrain = negFeatures.sample(withReplacement = false, fraction).cache()
	            val negTest = negFeatures.subtract(negTrain).cache()

	            mylog.println("Counts(NegTrain, NegTest, PosTrain, PosTest):" +
	                negTrain.count + "," + negTest.count + "," + posTrainCount + "," + posTest.count)

	            val model:SVMModel = SVMWithSGD.train(posTrain ++ negTrain, 1000)
	            // Emit raw margins instead of 0/1 predictions so the PR and ROC curves
	            // have more than a single operating point.
	            model.clearThreshold()

	            // Score and label are produced together from the same record, so they
	            // cannot become misaligned the way two separately zipped RDDs can.
	            val scoresAndLabels = (posTest ++ negTest).map{ lp => (model.predict(lp.features), lp.label) }
	            val metrics = new BinaryClassificationMetrics(scoresAndLabels)
	            val (pr, roc) = (metrics.areaUnderPR, metrics.areaUnderROC)

	            mylog.println( className + "," + pr + "," + roc)

	            (className, pr, roc)
	        }.toList

	        sc.makeRDD(perClass,1).saveAsTextFile("svm classifier results"+System.currentTimeMillis)
	        perClass
	    } finally {
	        // flush the log and release Spark even when training fails part-way
	        mylog.close()
	        sc.stop()
	    }
	}
	// THIS VERSION TRAINS MULTIPLE CLASSIFIERS PER CLASS
	// Corel has 7 classes, each class has 100 images
	// WANT 80 +ve, 80-ve PER CLASSIFIER PER CLASS.
	// So 600 -ve per class = 80*7 + 40. So 7 classifiers PER CLASS
	// TOTAL 7 classifiers per class * 7 classes = 49 classifiers

	package com.marin.cv
	// To build: sbt assembly
	// To run: spark-submit --driver-memory 120g --executor-memory 120g --class com.marin.cv.cbirJob --master local[*] target/scala-2.10/cbir.jar "--dir /media/kraman/disk1/comp_vision/sig"

	import org.apache.spark.SparkContext
	import org.apache.spark.SparkContext._
	import org.apache.spark.SparkConf
	import org.apache.spark.rdd._
	import org.apache.spark.rdd.PairRDDFunctions
	import com.marin.util.Args
	import java.io.PrintWriter
	import org.apache.spark.mllib.classification._
	import org.apache.spark.mllib.regression.LabeledPoint
	import org.apache.spark.mllib.linalg._
	import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
	import collection.mutable.ListBuffer

	/**
	 * Spark job that trains several SVMs per image class. The negatives are split
	 * into 7 training buckets plus 1 test bucket; one classifier is trained per
	 * training bucket against the same positives, and a test point is predicted
	 * positive when at least 4 of the 7 classifiers say so.
	 *
	 * Output: per-class metrics in the local file "log" and in a text directory
	 * named "svm classifier results<timestamp>".
	 */
	object cbirJob extends App {
	    // NOTE(review): settings placed on SparkConf take precedence over spark-submit
	    // flags, so the --master given on the command line is presumably ignored - confirm.
	    val conf = new SparkConf()
	        .setMaster("local[28]")
	        .set("spark.driver.maxResultSize", "100g")
	        .set("spark.local.dir", "/media/kraman/disk2/tmp")
	        .set("spark.akka.threads", "256")

	    val sc = new SparkContext(conf)
	    // Join with a space so options passed as separate argv entries do not run together.
	    val myargs = Args(args.mkString(" "))
	    val dir = myargs("dir")
	    val mylog = new PrintWriter("log")

	    // Hundreds bucket of the numeric image file name -> class name.
	    val classNames:Map[Int, String] = Map(
	        300->"bus",
	        400->"dino",
	        500->"elephant",
	        600->"flower",
	        700->"horse",
	        800->"nature",
	        900->"food")

	    // read all the CBIR image signatures & convert to features (every point labelled 1.0)
	    val classFeatures:Map[String, ListBuffer[LabeledPoint]] = sc
	    .wholeTextFiles(dir) // RDD[(String, String)]
	    .map{ case (filename, contents) =>
	        val values:Array[Double] = contents.split("\n").map{_.trim}.filter{_.nonEmpty}.map{_.toDouble}
	        val lp:LabeledPoint = new LabeledPoint(1.0d, new DenseVector(values))
	        val idx = filename.indexOf(".jpg_sig.txt")
	        require(idx >= 3, "Unexpected signature file name: " + filename)
	        val key = (filename.slice(idx-3,idx).toInt/100)*100 // eg. turn 368 into 300
	        (classNames(key), lp)
	    } // RDD[String, LabeledPoint]
	    .aggregateByKey(ListBuffer[LabeledPoint]())(
	        (buf, lp) => buf += lp,   // append in place instead of copying on every prepend
	        (left, right) => left ++= right
	    ) // RDD[String, ListBuffer[LabeledPoint]]
	    .collectAsMap.toMap

	    // for each class, train an ensemble of binary classifiers
	    val results:List[(String, Double, Double)] = try {
	        val perClass = classFeatures.map{ case (className, features) =>
	            val Array(posTrainRaw, posTest) = sc.makeRDD(features).randomSplit(Array(0.8,0.2))
	            val posTrain = posTrainRaw.cache() // reused by every classifier below

	            // every point of every other class, relabelled as a negative;
	            // flatten also copes with there being no other class
	            val negatives = classFeatures
	                .filterKeys(name => name != className)
	                .values
	                .flatten
	                .map{ x:LabeledPoint => new LabeledPoint(0, x.features)} // flip labels
	                .toSeq
	            val negFeatures = sc.makeRDD(negatives)

	            // want |negatives| == |positives|, but |negatives| > |positives| in dataset
	            // so make as many sample buckets as you need to
	            // sample bucket size must be equal to pos sample size = 80
	            // 80 * 7 + 40 = 600
	            // so need 7 buckets, plus 1 test set bucket
	            val negRatio = Array(80,80,80,80,80,80,80,40).map{ _/600.0 }
	            val negArray = negFeatures.randomSplit(negRatio)
	            val negTest = negArray(7) // SET TEST SET ASIDE
	            val testSet = (posTest ++ negTest).cache()

	            // these counts do not depend on the bucket, so compute them once
	            val (negTestCount, posTrainCount, posTestCount) = (negTest.count, posTrain.count, posTest.count)

	            // NOW BUILD 1 classifier PER NEG SAMPLE BUCKET
	            val models:IndexedSeq[SVMModel] = (0 to 6).map{ idx =>
	                val negTrain = negArray(idx)
	                val model:SVMModel = SVMWithSGD.train(posTrain ++ negTrain, 1000)

	                mylog.println("SVM: " + idx + " Counts(NegTrain, NegTest, PosTrain, PosTest):" +
	                negTrain.count + "," + negTestCount + "," + posTrainCount + "," + posTestCount)
	                model
	            }

	            // Each classifier votes 0.0 or 1.0 for a test point. The consensus is 1.0
	            // when the majority agrees: ceil(7/2) = 4 or more votes. All votes for a
	            // point are taken in one pass over the test set, so every prediction
	            // stays attached to its own label.
	            val scoresAndLabels = testSet.map{ lp =>
	                val votes = models.map{ m => m.predict(lp.features) }.sum
	                (if (votes >= 4.0) 1.0 else 0.0, lp.label)
	            }
	            val metrics = new BinaryClassificationMetrics(scoresAndLabels)
	            val (pr, roc) = (metrics.areaUnderPR, metrics.areaUnderROC)

	            mylog.println( className + "," + pr + "," + roc)

	            (className, pr, roc)
	        }.toList

	        sc.makeRDD(perClass,1).saveAsTextFile("svm classifier results"+System.currentTimeMillis)
	        perClass
	    } finally {
	        // flush the log and release Spark even when training fails part-way
	        mylog.close()
	        sc.stop()
	    }
	}
	Dataset: COREL subset ( 7 classes, 100 images per class => 7*100 = 700 jpgs )
	COREL: https://sites.google.com/site/dctresearch/Home/content-based-image-retrieval

	Training Test Ratio: 80-20

	Equal number of true & false samples ie. train on 80 dinos & 80 random non-dinos out of 700-100 = 600 non-dinos.
	So training sample size = 80 + 80 = 160
	Test sample = 20 dinos + 20 non-dinos

	Train 1 SVM classifier per class => 7 SVM classifiers
	Hyperparam: 1000 iterations on the SGD with training rate 1.0

	(Name of Classifier, AUC = Area under Precision Recall Curve, Area under ROC Curve)
	(bus,0.7266666666666667,0.6215384615384616)
	(dino,0.9289215686274509,0.9179566563467493)
	(flower,0.9762845849802372,0.9545454545454546)
	(horse,0.9277836134453781,0.8854166666666667)
	(food,0.7559523809523809,0.6142857142857143)
	(nature,0.843956043956044,0.7318181818181818)
	(elephant,0.5694235588972432,0.5354691075514875)

	Conclusions:
	Easiest to classify = Flower, Dino
	Hardest to classify = Elephant

	wiki for AUC: http://fastml.com/what-you-wanted-to-know-about-auc/ ( we want 1.0, we get 0.56 to 0.97 )
	wiki for ROC:http://gim.unmc.edu/dxtests/roc3.htm ( we want 1.0, we get 0.53 to 0.95)

	Run2: Test Set: 20 positives + 520 negatives ( negative test samples far overwhelm positives )

	(bus,0.3872202166064982,0.7410112359550562)
	(dino,0.6296296296296297,0.9430740037950663)
	(flower,0.5792057698992006,0.8983012559862654)
	(horse,0.4008389605074688,0.6706503014642549)
	(food,0.24998883499915142,0.6119496855345912)
	(nature,0.11913195351266649,0.5286529060293319)
	(elephant,0.5030071466512145,0.650146771037182)

	Run3: Use a 1:9 ratio for training. 1 positive sample for 9 negative samples!
	Justification: http://www.ncbi.nlm.nih.gov/pmc/articles/PMC4061540/
	Essentially, opposite of Run2.

	Summary: (CLASSNAME, AUC, ROC)
	(bus,0.6923076923076923,0.5)
	(dino,0.9463950683462878,0.9024390243902439)
	(flower,0.9299660441426146,0.881578947368421)
	(horse,0.6826025459688826,0.5142857142857142)
	(food,0.7222222222222222,0.5)
	(nature,0.7056831267357584,0.7325162220620043)
	(elephant,0.6941747572815534,0.5)
	---------
	Detailed Counts
	CLASSNAME, AUC, ROC
	---------
	Counts(NegTrain, NegTest, PosTrain, PosTest):536,64,60,40
	bus,0.6923076923076923,0.5
	Counts(NegTrain, NegTest, PosTrain, PosTest):550,50,59,41
	dino,0.9463950683462878,0.9024390243902439
	Counts(NegTrain, NegTest, PosTrain, PosTest):545,55,62,38
	flower,0.9299660441426146,0.881578947368421
	Counts(NegTrain, NegTest, PosTrain, PosTest):534,66,65,35
	horse,0.6826025459688826,0.5142857142857142
	Counts(NegTrain, NegTest, PosTrain, PosTest):550,50,60,40
	food,0.7222222222222222,0.5
	Counts(NegTrain, NegTest, PosTrain, PosTest):527,73,62,38
	nature,0.7056831267357584,0.7325162220620043
	Counts(NegTrain, NegTest, PosTrain, PosTest):537,63,60,40
	elephant,0.6941747572815534,0.5

	Run 4. Use equal number of positive & negative training set.
	But since number of negatives > number of positives,
	use as many classifiers PER class as required to cover the entire negative sample training set.
	// THIS VERSION TRAINS MULTIPLE CLASSIFIERS PER CLASS
	// Corel has 7 classes, each class has 100 images
	// WANT 80 +ve, 80-ve PER CLASSIFIER PER CLASS.
	// So 600 -ve per class = 80*7 + 40. So 7 classifiers PER CLASS
	// TOTAL 7 classifiers per class * 7 classes = 49 classifiers

	Justification:
	http://sci2s.ugr.es/keel/pdf/specific/congreso/akbani_svm_2004.pdf
	http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.309.904&rep=rep1&type=pdf
	Summary: (CLASSNAME, AUC, ROC)
	(bus,0.7278735632183908,0.7034313725490197)
	(dino,0.9265931372549019,0.9295485636114911)
	(flower,0.9380952380952381,0.9047619047619048)
	(horse,0.7213669950738917,0.7648809523809524)
	(food,0.7255244755244755,0.7071428571428572)
	(nature,0.6842105263157895,0.6556390977443609)
	(elephant,0.7199602780536245,0.6795665634674922)