felixcheung/sparkPercentile.scala

## sparkPercentile.scala
  /**
   * Computes a percentile of an unsorted Spark RDD using the NIST method:
   * the 1-based rank is `tile / 100 * (N + 1)`, and the result is linearly
   * interpolated between the two neighbouring order statistics.
   *
   * A rank that falls before the first element yields the smallest value; a
   * rank that falls on or after the last element yields the largest value.
   * Interpolation is carried out in `Double`, so the difference between two
   * neighbouring values cannot overflow `Long`.
   *
   * @param data input data set of Long integers; it does not need to be sorted
   * @param tile percentile to compute (e.g. 85 for the 85th percentile)
   * @return value of the input data at the specified percentile
   * @throws UnsupportedOperationException if `data` is empty
   */
  def computePercentile(data: RDD[Long], tile: Double): Double = {
    // Several actions run against the sorted data; cache it so it is not rebuilt for each one.
    val sorted = data.sortBy(v => v).cache()
    try {
      val count = sorted.count()
      if (count == 0L) {
        throw new UnsupportedOperationException("cannot compute a percentile of an empty RDD")
      }
      // NIST rank (1-based), split into its integer part k and fractional part d.
      val rank = (tile / 100d) * (count + 1d)
      val k = math.floor(rank).toLong
      val d = rank - k
      if (count == 1L || k <= 0L) sorted.first().toDouble
      else {
        val atOrPastEnd = k >= count
        // 0-based positions required: only the last one, or the pair (k - 1, k) to interpolate.
        val wanted: Set[Long] = if (atOrPastEnd) Set(count - 1L) else Set(k - 1L, k)
        // A single pass fetches every required element instead of one full scan per lookup.
        val picked: Map[Long, Long] = sorted
          .zipWithIndex()
          .filter { case (_, position) => wanted.contains(position) }
          .map(_.swap)
          .collect()
          .toMap
        // Every wanted position lies in [0, count), so the map lookups below cannot miss.
        if (atOrPastEnd) picked(count - 1L).toDouble
        else {
          val lower = picked(k - 1L).toDouble
          val upper = picked(k).toDouble
          lower + d * (upper - lower)
        }
      }
    } finally {
      sorted.unpersist()
    }
  }
	/**
	 * Computes a percentile of an unsorted Spark RDD using the NIST method:
	 * the 1-based rank is `tile / 100 * (N + 1)`, and the result is linearly
	 * interpolated between the two neighbouring order statistics.
	 *
	 * A rank that falls before the first element yields the smallest value; a
	 * rank that falls on or after the last element yields the largest value.
	 * Interpolation is carried out in `Double`, so the difference between two
	 * neighbouring values cannot overflow `Long`.
	 *
	 * @param data input data set of Long integers; it does not need to be sorted
	 * @param tile percentile to compute (e.g. 85 for the 85th percentile)
	 * @return value of the input data at the specified percentile
	 * @throws UnsupportedOperationException if `data` is empty
	 */
	def computePercentile(data: RDD[Long], tile: Double): Double = {
		// Several actions run against the sorted data; cache it so it is not rebuilt for each one.
		val sorted = data.sortBy(v => v).cache()
		try {
			val count = sorted.count()
			if (count == 0L) {
				throw new UnsupportedOperationException("cannot compute a percentile of an empty RDD")
			}
			// NIST rank (1-based), split into its integer part k and fractional part d.
			val rank = (tile / 100d) * (count + 1d)
			val k = math.floor(rank).toLong
			val d = rank - k
			if (count == 1L || k <= 0L) sorted.first().toDouble
			else {
				val atOrPastEnd = k >= count
				// 0-based positions required: only the last one, or the pair (k - 1, k) to interpolate.
				val wanted: Set[Long] = if (atOrPastEnd) Set(count - 1L) else Set(k - 1L, k)
				// A single pass fetches every required element instead of one full scan per lookup.
				val picked: Map[Long, Long] = sorted
					.zipWithIndex()
					.filter { case (_, position) => wanted.contains(position) }
					.map(_.swap)
					.collect()
					.toMap
				// Every wanted position lies in [0, count), so the map lookups below cannot miss.
				if (atOrPastEnd) picked(count - 1L).toDouble
				else {
					val lower = picked(k - 1L).toDouble
					val upper = picked(k).toDouble
					lower + d * (upper - lower)
				}
			}
		} finally {
			sorted.unpersist()
		}
	}