Skip to content

Instantly share code, notes, and snippets.

@krishnanraman
Last active December 15, 2015 12:19
Show Gist options
  • Save krishnanraman/5259351 to your computer and use it in GitHub Desktop.
Save krishnanraman/5259351 to your computer and use it in GitHub Desktop.
Given some data, dump its PDF into a MySQL database for the purpose of plotting a histogram (the database is accessed via Ruby).
import com.twitter.scalding._
import com.twitter.scalding.mathematics.Histogram
import util.Random
/**
 * Scalding job that draws Gaussian samples (mean 5, sigma 1), keeps those
 * strictly between 2 and 8, builds a histogram over them and writes the
 * per-bin values as a single 11-column TSV row to `data/histo_test`.
 *
 * NOTE(review): the output row always has exactly 11 columns (see `l2t11`).
 * The number of keys the histogram reports presumably depends on which bins
 * actually received samples -- confirm against the Histogram implementation.
 */
class HistogramTest(args : Args) extends Job(args) {

  // Range of samples kept; also used to derive the bin width so the two
  // cannot drift apart.
  private val lowerBound = 2.0d
  private val upperBound = 8.0d

  /**
   * Converts cumulative values into per-key increments.
   *
   * For every key, in the order given by `keys`, the result holds
   * `cdf(key) * size` minus the same quantity for the previous key
   * (0.0 for the first key).
   *
   * @param cdf  cumulative value for each key; must contain every entry of `keys`
   * @param keys keys to visit, in the order the differences should be taken
   * @param size factor each cumulative value is multiplied by
   * @return map from key to its increment over the preceding key
   */
  def cdf2pdf(cdf:Map[Double,Double], keys:List[Double], size:Int):Map[Double,Double] = {
    // Thread both the growing map and the previous scaled cumulative value
    // through the fold instead of mutating a captured variable.
    val (increments, _) = keys.foldLeft((Map.empty[Double,Double], 0.0d)) {
      case ((acc, previous), key) =>
        val cumulative = cdf(key) * size
        (acc + (key -> (cumulative - previous)), cumulative)
    }
    increments
  }

  /**
   * Packs the first 11 elements of `x` into a `Tuple11`.
   *
   * Elements beyond the 11th are ignored.
   *
   * @throws IndexOutOfBoundsException if `x` has fewer than 11 elements
   */
  def l2t11(x:List[Double]) = x match {
    case a :: b :: c :: d :: e :: f :: g :: h :: i :: j :: k :: _ =>
      Tuple11(a, b, c, d, e, f, g, h, i, j, k)
    case _ =>
      throw new IndexOutOfBoundsException(
        "l2t11 needs at least 11 values but got " + x.length)
  }

  val tuples = (1 to 1000)
    .map( x => Random.nextGaussian + 5 ) // mean = 5, sigma = 1
    .filter( x => x > lowerBound && x < upperBound )

  val bins = 11

  // Output field names 'b1 .. 'b11, one per tuple slot produced by l2t11.
  val bin_names = (1 to bins).map( i => Symbol("b" + i) )

  IterableSource(tuples, 'n)
    .read
    .groupAll {
      _.histogram('n -> 'h, (upperBound - lowerBound) / bins)
    }
    .mapTo('h -> bin_names) {
      h:Histogram =>
        val size = h.size
        val keys = h.keys
        val pdf = cdf2pdf( h.cdf, keys, size )
        // Echo every (key, value) pair to stdout for inspection.
        keys.foreach( p => printf("%f, %f\n", p, pdf(p)) )
        l2t11( keys.map( k => pdf(k) ) )
    }
    .write(Tsv("data/histo_test"))
}
Results:
cat data/histo_test
3.0 27.0 70.0 143.0 196.0 216.0 175.0 101.0 48.0 16.0 3.0
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment