mpizosdim/TDigest.scala

## TDigest.scala
import java.nio.ByteBuffer

import org.apache.spark.{SparkContext, SparkConf}
import org.apache.commons.math3.distribution.ExponentialDistribution
import scala.collection._
import scala.collection.generic.CanBuildFrom
import scala.util.Random

/**
 * A weighted point: a running `mean` together with the `count` of observations
 * folded into it. Ordering, equality and hashing all look at `mean` only; `count`
 * is ignored by them.
 *
 * @param mean  current mean of the absorbed observations
 * @param count total weight absorbed so far
 */
case class Centroid(var mean: Double, var count: Long) extends Ordered[Centroid] with Serializable {

  /**
   * Absorbs `weight` observations of value `x`: the count grows by `weight` and the
   * mean moves toward `x` in proportion to `weight` over the new count.
   */
  def update(x: Double, weight: Long): Unit = {
    val grownCount: Long = count + weight
    // The shift is computed from the mean as it was before this update.
    val shift: Double = weight * (x - mean) / grownCount
    count = grownCount
    mean = mean + shift
  }

  /** Orders centroids by their mean. */
  def compare(that: Centroid): Int = java.lang.Double.compare(mean, that.mean)

  /** Two centroids are equal when their means are equal, whatever their counts. */
  override def equals(o: Any): Boolean = o match {
    case Centroid(otherMean, _) => otherMean == mean
    case _ => false
  }

  /** Hash derived from the mean only, in line with `equals`. */
  override def hashCode: Int = mean.hashCode()
}


/**
 * Mutable quantile sketch: a running total weight `n` plus a sorted set of weighted
 * [[Centroid]]s, queried through `cdf`, `invCDF` and `trimmedMean`.
 *
 * @param delta     scale factor used by `threshold` to bound how much weight a centroid may hold
 * @param k         only referenced by the disabled compression trigger in `update`; `++` forwards it
 * @param n         total weight recorded so far
 * @param centroids centroids kept sorted by mean
 */
case class TDigest(delta: Double = 0.01,
                   k: Int = 25,
                   var n: Long = 0,
                  // speedups with better DS
                   val centroids: mutable.TreeSet[Centroid] = mutable.TreeSet[Centroid]()) extends Serializable {

  /** Number of centroids currently held. */
  def size: Int = centroids.size

  /**
   * Merges two digests into a new one that keeps this digest's `delta` and `k`.
   *
   * Centroids are inserted as copies, so later updates to the result cannot mutate the
   * inputs, and their weights are added to the result's `n` so that `cdf`/`invCDF`
   * on the merged digest do not divide by zero.
   */
  def ++(other: TDigest): TDigest = {
    val bothCentroids: Seq[Centroid] = Random.shuffle(other.centroids.toSeq ++ this.centroids.toSeq)
    val newDigest = TDigest(this.delta, this.k)
    bothCentroids.foreach { c => newDigest.addCentroid(c.copy(), increment = true) }
    //newDigest.compress
    newDigest
  }

  /**
   * Inserts `c`, or folds it into the centroid already stored with the same mean.
   *
   * @param c         centroid to insert
   * @param increment when true, `c.count` is also added to the total weight `n`
   */
  def addCentroid(c: Centroid, increment: Boolean = false): Unit = {
    if (increment) { this.n += c.count }
    if (this.centroids.contains(c)) {
      this.updateCentroid(c, c.mean, c.count)
    } else {
      this.centroids.add(c)
    }
  }

  /**
   * Fraction of the total weight held by centroids whose mean is at most `centroid.mean`.
   */
  def computeCentroidQuantile(centroid: Centroid): Double = {
    // Sum over an iterator: mapping the set itself would yield a set of counts and
    // silently drop every repeated count value.
    val weightAtOrBelow: Long =
      this.centroids.iterator.filter { c => c.mean <= centroid.mean }.map { _.count }.sum
    weightAtOrBelow / this.n.toDouble
  }

  /**
   * Folds `weight` observations of `x` into the stored centroid equal to `c`;
   * does nothing when no such centroid is stored.
   */
  def updateCentroid(c: Centroid, x: Double, weight: Long): Unit = {
    centroids.find { _ == c }.foreach { stored => stored.update(x, weight) }
  }


  // maybe protected
  /**
   * Returns the neighbours of `x`: the last centroid with mean below `x` and the first
   * centroid with mean at or above `x`, whichever of the two exist.
   */
  def findClosestCentroids(x: Double): mutable.TreeSet[Centroid] = {
    val (below, above) = this.centroids.partition { c => c.mean < x }
    val out: mutable.TreeSet[Centroid] = mutable.TreeSet[Centroid]()
    (below.lastOption ++ above.headOption).foreach { c => out.add(c) }
    out
  }

  /** Largest weight a centroid sitting at quantile `q` may hold: round(4 * n * delta * q * (1 - q)). */
  def threshold(q: Double): Long = Math.round(4 * this.n * this.delta * q * (1 - q))

  // check this
  /**
   * Rebuilds the digest by re-inserting its centroids in random order.
   *
   * The total weight is reset first because `update` adds each re-inserted weight
   * back to `n`; without the reset every call would double `n`.
   */
  def compress: Unit = {
    val oldCentroids: Seq[Centroid] = Random.shuffle(this.centroids.toSeq)
    this.centroids.clear()
    this.n = 0
    oldCentroids.foreach { c => this.update(c.mean, c.count) }
  }

  // insert a new element
  /**
   * Records `weight` observations of value `x`.
   *
   * The neighbouring centroids are tried in random order; one absorbs the whole weight
   * if that keeps it within `threshold` at its quantile. Weight that no neighbour
   * accepts becomes a new centroid at `x`.
   */
  def update(x: Double, weight: Long = 1): Unit = {
    this.n += weight
    if (this.size == 0) {
      this.addCentroid(Centroid(x, weight))
    } else {
      var w: Long = weight
      Random.shuffle(this.findClosestCentroids(x).toSeq).foreach { c =>
        val q: Double = this.computeCentroidQuantile(c)
        val limit: Long = this.threshold(q)
        if ((w > 0) && ((c.count + w) <= limit)) {
          val delta_w: Long = math.min(limit - c.count, w)
          this.updateCentroid(c, x, delta_w)
          w -= delta_w
        }
      }
      if (w > 0) {
        // w is the weight still unassigned after trying the neighbours
        this.addCentroid(Centroid(x, w))
      }
      /*
      if (this.size > (this.k / this.delta)) {
        this.compress
      }
      */
    }
  }

  /** Records every value of `X`, each with the same `weight`. */
  def batchUpdate(X: Seq[Double], weight: Long = 1): Unit = {
    X.foreach( x => this.update( x, weight ))
    //this.compress
  }

  /**
   * Approximate value at cumulative probability `p`.
   *
   * Each centroid is paired with the fraction of weight lying strictly before it, and
   * the result is linearly interpolated between the means of the two centroids whose
   * fractions bracket `p`.
   *
   * @return the interpolated value, the mean of the single bracketing centroid when only
   *         one side exists, or -1.0 when no centroid brackets `p` (e.g. an empty digest)
   */
  def invCDF(p: Double): Double = {

    // Work on a sequence: mapping/scanning the set itself would deduplicate and
    // re-sort the fractions, misaligning them with the centroids.
    val ordered: Seq[Centroid] = this.centroids.toSeq
    val cumProb: Seq[(Double, Centroid)] =
      ordered.map { _.count / this.n.toDouble }.scanLeft(0.0)( _ + _ ).zip(ordered)
    val above: Option[(Double, Centroid)] = cumProb.find { _._1 > p }
    val below: Option[(Double, Centroid)] = cumProb.reverse.find(_._1 < p)

    (below, above) match {
      case (None, None) => -1.0 // raise error here, we don't have any centroids
      case (None, Some(aC)) => aC._2.mean
      case (Some(bC), None) => bC._2.mean
      case (Some(bC), Some(aC)) =>
        // linear interpolation between means
        val deltaX: Double = aC._2.mean - bC._2.mean
        val deltaP =  (p - bC._1) / (aC._1 - bC._1)
        bC._2.mean + ( deltaP * deltaX )
    }
  }

  /**
   * Approximate cumulative probability at `x`.
   *
   * Each centroid is paired with the fraction of weight lying strictly before it, and
   * the result is interpolated between the nearest centroids with mean below and above `x`.
   *
   * @return the interpolated fraction, the fraction attached to the single neighbouring
   *         centroid when only one side exists, or -1.0 when there are no neighbours
   */
  def cdf(x: Double): Double = {

    val ordered: Seq[Centroid] = this.centroids.toSeq
    val cumCount: Seq[(Double, Centroid)] =
      ordered.map { _.count / this.n.toDouble }.scanLeft(0.0)( _ + _ ).zip(ordered)
    val above: Option[(Double, Centroid)] = cumCount.find { _._2.mean > x }
    val below: Option[(Double, Centroid)] = cumCount.reverse.find { _._2.mean < x }

    (below, above) match {
      case (None, None) => -1.0 // raise error here, we don't have any centroids
      case (None, Some(aC)) => aC._1
      case (Some(bC), None) => bC._1
      case (Some(bC), Some(aC)) =>
        // piece-wise uniform, distribution
        val deltaX: Double = (x - bC._2.mean) / (aC._2.mean - bC._2.mean)
        val deltaP =  aC._1 - bC._1
        bC._1 + ( deltaP * deltaX )
    }
  }

  /**
   * Count-weighted mean of the centroids whose mean lies strictly between `x0` and `x1`;
   * 0.0 when no centroid falls in that window.
   */
  def trimmedMean(x0: Double, x1: Double): Double = {
    // Sequence, not set: equal counts or equal products must each contribute to the sums.
    val within: Seq[Centroid] = this.centroids.toSeq.filter { c => c.mean > x0 && c.mean < x1 }
    val s = within.map { _.count.toDouble }.sum
    within.map { c => c.mean * (c.count / s) }.sum
  }

}


/**
 * Demo entry point: fills two digests with samples from exponential distributions,
 * prints a few cdf / inverse-cdf queries, then does the same for the merged digest.
 */
object TDigestAppCustom {
  def main(arg: Array[String]): Unit = {
    val appName: String = "TDigest-Test"
    val conf: SparkConf = new SparkConf().setAppName(appName).setMaster("local[16]")
    val sc: SparkContext = new SparkContext(conf)
    // The context is not used by the computation below, but it is started above,
    // so make sure it is stopped even if something in between throws.
    try {
      val trueDist0: ExponentialDistribution = new ExponentialDistribution(15)
      val trueDist1: ExponentialDistribution = new ExponentialDistribution(30)
      val data0: immutable.Seq[Double] = (0 until 10000).map { _ => trueDist0.sample() }.toSeq
      val data1: immutable.Seq[Double] = (0 until 10000).map { _ => trueDist1.sample() }.toSeq
      val TD0: TDigest = new TDigest()
      val TD1: TDigest = new TDigest()
      TD0.batchUpdate(data0, 1)
      TD1.batchUpdate(data1, 1)
      println(TD0.cdf(15.0))
      println(TD1.cdf(30.0))
      println(TD0.invCDF(0.50))
      println(TD1.invCDF(0.50))
      val bothTD: TDigest = TD0 ++ TD1
      println(bothTD.cdf(15.0))
      println(bothTD.cdf(30.0))
      println(bothTD.invCDF(0.50))
    } finally {
      sc.stop()
    }
  }
}
	import java.nio.ByteBuffer

	import org.apache.spark.{SparkContext, SparkConf}
	import org.apache.commons.math3.distribution.ExponentialDistribution
	import scala.collection._
	import scala.collection.generic.CanBuildFrom
	import scala.util.Random

	/**
	 * A weighted point: a running `mean` together with the `count` of observations
	 * folded into it. Ordering, equality and hashing all look at `mean` only; `count`
	 * is ignored by them.
	 *
	 * @param mean  current mean of the absorbed observations
	 * @param count total weight absorbed so far
	 */
	case class Centroid(var mean: Double, var count: Long) extends Ordered[Centroid] with Serializable {

	  /**
	   * Absorbs `weight` observations of value `x`: the count grows by `weight` and the
	   * mean moves toward `x` in proportion to `weight` over the new count.
	   */
	  def update(x: Double, weight: Long): Unit = {
	    val grownCount: Long = count + weight
	    // The shift is computed from the mean as it was before this update.
	    val shift: Double = weight * (x - mean) / grownCount
	    count = grownCount
	    mean = mean + shift
	  }

	  /** Orders centroids by their mean. */
	  def compare(that: Centroid): Int = java.lang.Double.compare(mean, that.mean)

	  /** Two centroids are equal when their means are equal, whatever their counts. */
	  override def equals(o: Any): Boolean = o match {
	    case Centroid(otherMean, _) => otherMean == mean
	    case _ => false
	  }

	  /** Hash derived from the mean only, in line with `equals`. */
	  override def hashCode: Int = mean.hashCode()
	}




	/**
	 * Mutable quantile sketch: a running total weight `n` plus a sorted set of weighted
	 * [[Centroid]]s, queried through `cdf`, `invCDF` and `trimmedMean`.
	 *
	 * @param delta     scale factor used by `threshold` to bound how much weight a centroid may hold
	 * @param k         only referenced by the disabled compression trigger in `update`; `++` forwards it
	 * @param n         total weight recorded so far
	 * @param centroids centroids kept sorted by mean
	 */
	case class TDigest(delta: Double = 0.01,
	                   k: Int = 25,
	                   var n: Long = 0,
	                   // speedups with better DS
	                   val centroids: mutable.TreeSet[Centroid] = mutable.TreeSet[Centroid]()) extends Serializable {

	  /** Number of centroids currently held. */
	  def size: Int = centroids.size

	  /**
	   * Merges two digests into a new one that keeps this digest's `delta` and `k`.
	   *
	   * Centroids are inserted as copies, so later updates to the result cannot mutate the
	   * inputs, and their weights are added to the result's `n` so that `cdf`/`invCDF`
	   * on the merged digest do not divide by zero.
	   */
	  def ++(other: TDigest): TDigest = {
	    val bothCentroids: Seq[Centroid] = Random.shuffle(other.centroids.toSeq ++ this.centroids.toSeq)
	    val newDigest = TDigest(this.delta, this.k)
	    bothCentroids.foreach { c => newDigest.addCentroid(c.copy(), increment = true) }
	    //newDigest.compress
	    newDigest
	  }

	  /**
	   * Inserts `c`, or folds it into the centroid already stored with the same mean.
	   *
	   * @param c         centroid to insert
	   * @param increment when true, `c.count` is also added to the total weight `n`
	   */
	  def addCentroid(c: Centroid, increment: Boolean = false): Unit = {
	    if (increment) { this.n += c.count }
	    if (this.centroids.contains(c)) {
	      this.updateCentroid(c, c.mean, c.count)
	    } else {
	      this.centroids.add(c)
	    }
	  }

	  /**
	   * Fraction of the total weight held by centroids whose mean is at most `centroid.mean`.
	   */
	  def computeCentroidQuantile(centroid: Centroid): Double = {
	    // Sum over an iterator: mapping the set itself would yield a set of counts and
	    // silently drop every repeated count value.
	    val weightAtOrBelow: Long =
	      this.centroids.iterator.filter { c => c.mean <= centroid.mean }.map { _.count }.sum
	    weightAtOrBelow / this.n.toDouble
	  }

	  /**
	   * Folds `weight` observations of `x` into the stored centroid equal to `c`;
	   * does nothing when no such centroid is stored.
	   */
	  def updateCentroid(c: Centroid, x: Double, weight: Long): Unit = {
	    centroids.find { _ == c }.foreach { stored => stored.update(x, weight) }
	  }


	  // maybe protected
	  /**
	   * Returns the neighbours of `x`: the last centroid with mean below `x` and the first
	   * centroid with mean at or above `x`, whichever of the two exist.
	   */
	  def findClosestCentroids(x: Double): mutable.TreeSet[Centroid] = {
	    val (below, above) = this.centroids.partition { c => c.mean < x }
	    val out: mutable.TreeSet[Centroid] = mutable.TreeSet[Centroid]()
	    (below.lastOption ++ above.headOption).foreach { c => out.add(c) }
	    out
	  }

	  /** Largest weight a centroid sitting at quantile `q` may hold: round(4 * n * delta * q * (1 - q)). */
	  def threshold(q: Double): Long = Math.round(4 * this.n * this.delta * q * (1 - q))

	  // check this
	  /**
	   * Rebuilds the digest by re-inserting its centroids in random order.
	   *
	   * The total weight is reset first because `update` adds each re-inserted weight
	   * back to `n`; without the reset every call would double `n`.
	   */
	  def compress: Unit = {
	    val oldCentroids: Seq[Centroid] = Random.shuffle(this.centroids.toSeq)
	    this.centroids.clear()
	    this.n = 0
	    oldCentroids.foreach { c => this.update(c.mean, c.count) }
	  }

	  // insert a new element
	  /**
	   * Records `weight` observations of value `x`.
	   *
	   * The neighbouring centroids are tried in random order; one absorbs the whole weight
	   * if that keeps it within `threshold` at its quantile. Weight that no neighbour
	   * accepts becomes a new centroid at `x`.
	   */
	  def update(x: Double, weight: Long = 1): Unit = {
	    this.n += weight
	    if (this.size == 0) {
	      this.addCentroid(Centroid(x, weight))
	    } else {
	      var w: Long = weight
	      Random.shuffle(this.findClosestCentroids(x).toSeq).foreach { c =>
	        val q: Double = this.computeCentroidQuantile(c)
	        val limit: Long = this.threshold(q)
	        if ((w > 0) && ((c.count + w) <= limit)) {
	          val delta_w: Long = math.min(limit - c.count, w)
	          this.updateCentroid(c, x, delta_w)
	          w -= delta_w
	        }
	      }
	      if (w > 0) {
	        // w is the weight still unassigned after trying the neighbours
	        this.addCentroid(Centroid(x, w))
	      }
	      /*
	      if (this.size > (this.k / this.delta)) {
	        this.compress
	      }
	      */
	    }
	  }

	  /** Records every value of `X`, each with the same `weight`. */
	  def batchUpdate(X: Seq[Double], weight: Long = 1): Unit = {
	    X.foreach( x => this.update( x, weight ))
	    //this.compress
	  }

	  /**
	   * Approximate value at cumulative probability `p`.
	   *
	   * Each centroid is paired with the fraction of weight lying strictly before it, and
	   * the result is linearly interpolated between the means of the two centroids whose
	   * fractions bracket `p`.
	   *
	   * @return the interpolated value, the mean of the single bracketing centroid when only
	   *         one side exists, or -1.0 when no centroid brackets `p` (e.g. an empty digest)
	   */
	  def invCDF(p: Double): Double = {

	    // Work on a sequence: mapping/scanning the set itself would deduplicate and
	    // re-sort the fractions, misaligning them with the centroids.
	    val ordered: Seq[Centroid] = this.centroids.toSeq
	    val cumProb: Seq[(Double, Centroid)] =
	      ordered.map { _.count / this.n.toDouble }.scanLeft(0.0)( _ + _ ).zip(ordered)
	    val above: Option[(Double, Centroid)] = cumProb.find { _._1 > p }
	    val below: Option[(Double, Centroid)] = cumProb.reverse.find(_._1 < p)

	    (below, above) match {
	      case (None, None) => -1.0 // raise error here, we don't have any centroids
	      case (None, Some(aC)) => aC._2.mean
	      case (Some(bC), None) => bC._2.mean
	      case (Some(bC), Some(aC)) =>
	        // linear interpolation between means
	        val deltaX: Double = aC._2.mean - bC._2.mean
	        val deltaP = (p - bC._1) / (aC._1 - bC._1)
	        bC._2.mean + ( deltaP * deltaX )
	    }
	  }

	  /**
	   * Approximate cumulative probability at `x`.
	   *
	   * Each centroid is paired with the fraction of weight lying strictly before it, and
	   * the result is interpolated between the nearest centroids with mean below and above `x`.
	   *
	   * @return the interpolated fraction, the fraction attached to the single neighbouring
	   *         centroid when only one side exists, or -1.0 when there are no neighbours
	   */
	  def cdf(x: Double): Double = {

	    val ordered: Seq[Centroid] = this.centroids.toSeq
	    val cumCount: Seq[(Double, Centroid)] =
	      ordered.map { _.count / this.n.toDouble }.scanLeft(0.0)( _ + _ ).zip(ordered)
	    val above: Option[(Double, Centroid)] = cumCount.find { _._2.mean > x }
	    val below: Option[(Double, Centroid)] = cumCount.reverse.find { _._2.mean < x }

	    (below, above) match {
	      case (None, None) => -1.0 // raise error here, we don't have any centroids
	      case (None, Some(aC)) => aC._1
	      case (Some(bC), None) => bC._1
	      case (Some(bC), Some(aC)) =>
	        // piece-wise uniform, distribution
	        val deltaX: Double = (x - bC._2.mean) / (aC._2.mean - bC._2.mean)
	        val deltaP = aC._1 - bC._1
	        bC._1 + ( deltaP * deltaX )
	    }
	  }

	  /**
	   * Count-weighted mean of the centroids whose mean lies strictly between `x0` and `x1`;
	   * 0.0 when no centroid falls in that window.
	   */
	  def trimmedMean(x0: Double, x1: Double): Double = {
	    // Sequence, not set: equal counts or equal products must each contribute to the sums.
	    val within: Seq[Centroid] = this.centroids.toSeq.filter { c => c.mean > x0 && c.mean < x1 }
	    val s = within.map { _.count.toDouble }.sum
	    within.map { c => c.mean * (c.count / s) }.sum
	  }

	}


	/**
	 * Demo entry point: fills two digests with samples from exponential distributions,
	 * prints a few cdf / inverse-cdf queries, then does the same for the merged digest.
	 */
	object TDigestAppCustom {
	  def main(arg: Array[String]): Unit = {
	    val appName: String = "TDigest-Test"
	    val conf: SparkConf = new SparkConf().setAppName(appName).setMaster("local[16]")
	    val sc: SparkContext = new SparkContext(conf)
	    // The context is not used by the computation below, but it is started above,
	    // so make sure it is stopped even if something in between throws.
	    try {
	      val trueDist0: ExponentialDistribution = new ExponentialDistribution(15)
	      val trueDist1: ExponentialDistribution = new ExponentialDistribution(30)
	      val data0: immutable.Seq[Double] = (0 until 10000).map { _ => trueDist0.sample() }.toSeq
	      val data1: immutable.Seq[Double] = (0 until 10000).map { _ => trueDist1.sample() }.toSeq
	      val TD0: TDigest = new TDigest()
	      val TD1: TDigest = new TDigest()
	      TD0.batchUpdate(data0, 1)
	      TD1.batchUpdate(data1, 1)
	      println(TD0.cdf(15.0))
	      println(TD1.cdf(30.0))
	      println(TD0.invCDF(0.50))
	      println(TD1.invCDF(0.50))
	      val bothTD: TDigest = TD0 ++ TD1
	      println(bothTD.cdf(15.0))
	      println(bothTD.cdf(30.0))
	      println(bothTD.invCDF(0.50))
	    } finally {
	      sc.stop()
	    }
	  }
	}