I hereby claim:
- I am avibryant on github.
- I am avi (https://keybase.io/avi) on keybase.
- I have a public key whose fingerprint is 9DFC 19D0 8558 8DE8 6BF3 5C19 3879 4907 3690 98AF
To claim this, I am signing this object:
I hereby claim:
To claim this, I am signing this object:
import com.twitter.algebird._ | |
case class Preparer[A, T](prepareFn: A => T) { | |
/**
 * Returns a new Preparer that first applies this preparer's function and
 * then `fn` to the result.
 */
def map[U](fn: T => U): Preparer[A, U] = {
  val chained: A => U = prepareFn.andThen(fn)
  Preparer[A, U](chained)
}
/**
 * Builds a FlatPreparer whose function first applies this preparer's
 * function and then expands the result with `fn`.
 */
def flatMap[U](fn: T => TraversableOnce[U]) = {
  val expanded: A => TraversableOnce[U] = prepareFn.andThen(fn)
  FlatPreparer[A, U](expanded)
}
def aggregate[B, C](aggregator: Aggregator[T, B, C]): Aggregator[A, B, C] = |
case class SetSizeAggregator(hllBits: Int, maxSetSize: Int = 10) | |
extends MonoidAggregator[Array[Byte], Either[HLL, Set[Array[Byte]]], Long] { | |
/**
 * Lifts a single input into the exact-set side (Right) of the intermediate
 * representation, as a one-element set.
 *
 * NOTE(review): JVM arrays use reference equality, so a Set[Array[Byte]]
 * treats two arrays with equal contents as distinct elements -- confirm this
 * is intended.
 */
def prepare(in: Array[Byte]) = {
  val singleton = Set(in)
  Right(singleton)
}
/**
 * Turns the aggregated value into a size: the HLL's approximate-size
 * estimate when the sum is a Left, or the set's element count when it is
 * a Right.
 */
def present(sum: Either[HLL, Set[Array[Byte]]]): Long =
  sum match {
    case Left(sketch) => sketch.approximateSize.estimate
    case Right(exact) => exact.size.toLong
  }
def takeBy(pipe: TypedPipe[(K,V)], max: Int)(fn: V => Double): TypedPipe[(K,V)] = { | |
implicit val qtreeSemi = QTreeSemigroup(4) //magic number, determines how much RAM the trees take | |
val qtrees = pipe.map{case (k,v) => k -> QTree(fn(v))}.sumByKey | |
val maxV = qtrees.flatMap{case (k,q) => | |
if(q.size > max) { | |
val targetQuantile = max.toDouble / q.size | |
val (lower, upper) = q.quantileBounds(targetQuantile) | |
Some(k -> upper) //this will give us at least max values; use lower to get at most max values | |
} | |
else |
class File | |
def seek_to(str) | |
until eof? | |
start = pos | |
buf = read(10000) | |
if(offset = buf.index(str)) | |
seek(start + offset + str.size) | |
return true | |
else | |
seek(start + 5000) |
require 'stringio' | |
require 'base64' | |
def read_varint(io) | |
value = index = 0 | |
begin | |
byte = io.readchar | |
value |= (byte & 0x7f) << (7 * index) | |
index += 1 | |
end while (byte & 0x80).nonzero? |
require 'date' | |
DUE_DATE = "2013-05-19" | |
#data taken from http://spacefem.com/pregnant/charts/duedate2.php | |
#starts at day 222 | |
DATA = [ | |
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |
1, 2, 1, 4, 2, 2, 1, 4, 6, 7, 5, 1, 5, 8, 7, 9, 10, 11, 13, 13, 18, | |
14, 13, 9, 27, 29, 27, 31, 27, 26, 36, 43, 43, 51, 67, 74, 60, 47, |
/**
 * Scalding job that counts token occurrences.
 *
 * Reads a TSV with fields (doc_id, text) from the path given by the "input"
 * argument, splits each text on spaces, square brackets, parentheses, commas
 * and periods, trims and lower-cases every piece, drops empty pieces, and
 * writes one (token, count) row per distinct token as TSV to the path given
 * by the "output" argument.
 */
class WordCount(args: Args) extends Job(args) {
  Tsv(args("input"), ('doc_id, 'text))
    .flatMapTo('text -> 'token) { text: String =>
      // Normalising and discarding empties here yields the same tokens as
      // separate map and filter steps on the 'token field would.
      text
        .split("[ \\[\\]\\(\\),.]")
        .map(piece => piece.trim.toLowerCase)
        .filter(piece => piece.length > 0)
    }
    .groupBy('token) { group => group.size }
    .write(Tsv(args("output")))
}
# visualize the output with gg.js | |
# gg({layers: [{ geometry: 'line', mapping: { x: 'minutes', y: 'task', group: 'stage', color: 'type'}}]}); | |
def parse(line) | |
output = {} | |
parts = line.split(/[ "]/) | |
output["TYPE"] = parts.shift | |
while(parts.size > 0) | |
next_part = parts.shift | |
if next_part =~ /^(\w+)=$/ |
/**
 * Log-likelihood ratio between two binomial samples.
 *
 * Each sample is scored with the binomial log-likelihood
 * `k * log(p) + (n - k) * log(1 - p)`. The result is the score of both
 * samples under their own observed proportions (`k1 / n1`, `k2 / n2`) minus
 * their score under the single pooled proportion `(k1 + k2) / (n1 + n2)`.
 * The value is returned as is (it is not multiplied by 2).
 *
 * @param k1 count of occurrences in the first sample
 * @param n1 size of the first sample
 * @param k2 count of occurrences in the second sample
 * @param n2 size of the second sample
 * @return the log-likelihood ratio; 0 when both samples have the same proportion
 */
def likelihoodRatio(k1 : Int, n1 : Int, k2 : Int, n2 : Int): Double = {
  // A zero count contributes nothing; short-circuiting avoids 0 * log(0) = NaN.
  def kLogP(k : Int, p : Double): Double = if (k == 0) 0.0 else k * math.log(p)
  def logL(p : Double, k : Int, n : Int): Double = kLogP(k, p) + kLogP(n - k, 1 - p)

  val p1 = k1.toDouble / n1.toDouble
  val p2 = k2.toDouble / n2.toDouble
  // Add as Double: Int addition wraps once a sum exceeds Int.MaxValue, which
  // would produce a pooled proportion outside [0, 1] and a NaN result.
  val p = (k1.toDouble + k2.toDouble) / (n1.toDouble + n2.toDouble)

  logL(p1, k1, n1) + logL(p2, k2, n2) - logL(p, k1, n1) - logL(p, k2, n2)
}