samklr/QTreeAlgebirdTaxiData.scala

## QTreeAlgebirdTaxiData.scala
// Transcribed from an Apache Spark 1.0 spark-shell session,
// using data from http://chriswhong.com/open-data/foil_nyc_taxi/
// and the QTree algorithm for approximate quantiles over large datasets.
// Each of the distanceRange and minutesRange calculations below takes about 15 minutes on my four-core, SSD-based MacBook Pro.

import com.twitter.algebird._
import com.twitter.algebird.Operators._
// Semigroup that `+` (from Operators) uses to merge the per-trip QTrees inside `reduce`.
// NOTE(review): 6 is presumably Algebird's size/accuracy parameter k -- confirm against the QTree docs.
implicit val qtSemigroupD = new QTreeSemigroup[Double](6)

val in = sc.textFile("trip_data") // a directory containing all the trip_data*.csv files downloaded from the above link

// Zero-based CSV column positions. Per the dataset's published header
// (..., passenger_count, trip_time_in_secs, trip_distance, ...) column 8 is the trip
// duration in SECONDS and column 9 is the trip distance.
val tripTimeSecsColumn = 8
val tripDistanceColumn = 9

// Returns the QTree (lower, upper) bounds of the 50% quantile of one numeric CSV column.
//   column: zero-based index of the comma-separated field to read
// The header row (the line starting with "medallion") and non-positive values are skipped.
// Like the original one-liners, this throws if a field is not numeric or no row survives the filters.
def medianBounds(column: Int): (Double, Double) = {
  in.filter { line => !line.startsWith("medallion") }
    .map { line => line.split(",")(column).toDouble }
    .filter(_ > 0)
    .map { value => QTree(value) }
    .reduce(_ + _)
    .quantileBounds(0.5)
}

// According to https://github.com/avibryant/qtree the median of a QTree of doubles is the midpoint of the 50% quantile bounds
val distanceRange = medianBounds(tripDistanceColumn)
// The raw column is in seconds; rescale the bounds so that minutesRange/minutes really are minutes.
val tripSecondsRange = medianBounds(tripTimeSecsColumn)
val minutesRange = (tripSecondsRange._1 / 60.0, tripSecondsRange._2 / 60.0)

val distance = (distanceRange._1 + distanceRange._2) / 2
val minutes = (minutesRange._1 + minutesRange._2) / 2
	// Transcribed from an Apache Spark 1.0 spark-shell session,
	// using data from http://chriswhong.com/open-data/foil_nyc_taxi/
	// and the QTree algorithm for approximate quantiles over large datasets.
	// Each of the distanceRange and minutesRange calculations below takes about 15 minutes on my four-core, SSD-based MacBook Pro.

	import com.twitter.algebird._
	import com.twitter.algebird.Operators._
	// Semigroup that `+` (from Operators) uses to merge the per-trip QTrees inside `reduce`.
	// NOTE(review): 6 is presumably Algebird's size/accuracy parameter k -- confirm against the QTree docs.
	implicit val qtSemigroupD = new QTreeSemigroup[Double](6)

	val in = sc.textFile("trip_data") // a directory containing all the trip_data*.csv files downloaded from the above link

	// Zero-based CSV column positions. Per the dataset's published header
	// (..., passenger_count, trip_time_in_secs, trip_distance, ...) column 8 is the trip
	// duration in SECONDS and column 9 is the trip distance.
	val tripTimeSecsColumn = 8
	val tripDistanceColumn = 9

	// Returns the QTree (lower, upper) bounds of the 50% quantile of one numeric CSV column.
	//   column: zero-based index of the comma-separated field to read
	// The header row (the line starting with "medallion") and non-positive values are skipped.
	// Like the original one-liners, this throws if a field is not numeric or no row survives the filters.
	def medianBounds(column: Int): (Double, Double) = {
	  in.filter { line => !line.startsWith("medallion") }
	    .map { line => line.split(",")(column).toDouble }
	    .filter(_ > 0)
	    .map { value => QTree(value) }
	    .reduce(_ + _)
	    .quantileBounds(0.5)
	}

	// According to https://github.com/avibryant/qtree the median of a QTree of doubles is the midpoint of the 50% quantile bounds
	val distanceRange = medianBounds(tripDistanceColumn)
	// The raw column is in seconds; rescale the bounds so that minutesRange/minutes really are minutes.
	val tripSecondsRange = medianBounds(tripTimeSecsColumn)
	val minutesRange = (tripSecondsRange._1 / 60.0, tripSecondsRange._2 / 60.0)

	val distance = (distanceRange._1 + distanceRange._2) / 2
	val minutes = (minutesRange._1 + minutesRange._2) / 2