Skip to content

Instantly share code, notes, and snippets.

View avibryant's full-sized avatar

Avi Bryant avibryant

  • Galiano Island, BC
View GitHub Profile
@avibryant
avibryant / keybase.md
Created March 17, 2014 20:55
keybase.md

Keybase proof

I hereby claim:

  • I am avibryant on github.
  • I am avi (https://keybase.io/avi) on keybase.
  • I have a public key whose fingerprint is 9DFC 19D0 8558 8DE8 6BF3 5C19 3879 4907 3690 98AF

To claim this, I am signing this object:

import com.twitter.algebird._
case class Preparer[A, T](prepareFn: A => T) {
def map[U](fn: T => U) =
Preparer[A, U](fn.compose(prepareFn))
def flatMap[U](fn: T => TraversableOnce[U]) =
FlatPreparer[A, U](fn.compose(prepareFn))
def aggregate[B, C](aggregator: Aggregator[T, B, C]): Aggregator[A, B, C] =
case class SetSizeAggregator(hllBits: Int, maxSetSize: Int = 10)
extends MonoidAggregator[Array[Byte], Either[HLL, Set[Array[Byte]]], Long] {
def prepare(in: Array[Byte]) = Right(Set(in))
def present(sum: Either[HLL, Set[Array[Byte]]]) = {
sum match {
case Right(set) => set.size
case Left(hll) => hll.approximateSize.estimate
}
}
def takeBy(pipe: TypedPipe[(K,V)], max: Int)(fn: V => Double): TypedPipe[(K,V)] = {
implicit val qtreeSemi = QTreeSemigroup(4) //magic number, determines how much RAM the trees take
val qtrees = pipe.map{case (k,v) => k -> QTree(fn(v))}.sumByKey
val maxV = qtrees.flatMap{case (k,q) =>
if(q.size > max) {
val targetQuantile = max.toDouble / q.size
val (lower, upper) = q.quantileBounds(targetQuantile)
Some(k -> upper) //this will give us at least max values; use lower to get at most max values
}
else
class File
def seek_to(str)
until eof?
start = pos
buf = read(10000)
if(offset = buf.index(str))
seek(start + offset + str.size)
return true
else
seek(start + 5000)
require 'stringio'
require 'base64'
def read_varint(io)
value = index = 0
begin
byte = io.readchar
value |= (byte & 0x7f) << (7 * index)
index += 1
end while (byte & 0x80).nonzero?
@avibryant
avibryant / preg.rb
Last active September 29, 2015 15:47
require 'date'
DUE_DATE = "2013-05-19"
#data taken from http://spacefem.com/pregnant/charts/duedate2.php
#starts at day 222
DATA = [
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1, 2, 1, 4, 2, 2, 1, 4, 6, 7, 5, 1, 5, 8, 7, 9, 10, 11, 13, 13, 18,
14, 13, 9, 27, 29, 27, 31, 27, 26, 36, 43, 43, 51, 67, 74, 60, 47,
@avibryant
avibryant / Main.java
Created July 17, 2012 21:50 — forked from ceteri/Main.java
Cascading for the Impatient, part 3
/**
 * Word-count job over a TSV input.
 *
 * Reads rows with the fields `doc_id` and `text` from the path given by the
 * `input` argument, splits each `text` value into tokens on spaces, square
 * brackets, parentheses, commas and periods, normalizes every token by
 * trimming and lower-casing it, drops empty tokens, groups by token and
 * applies the group builder's `size` operation, then writes the result as
 * TSV to the path given by the `output` argument.
 */
class WordCount(args : Args) extends Job(args) {
  Tsv(args("input"), ('doc_id, 'text))
    // one output row per piece produced by the delimiter split
    .flatMapTo('text -> 'token) { text : String =>
      text.split("[ \\[\\]\\(\\),.]")
    }
    // normalize in place: surrounding whitespace removed, lower-cased
    .map('token -> 'token) { raw : String =>
      raw.trim.toLowerCase
    }
    // the split can yield empty strings (e.g. adjacent delimiters); discard them
    .filter('token) { word : String => word.nonEmpty }
    .groupBy('token) { _.size }
    .write(Tsv(args("output")))
}
@avibryant
avibryant / swimlines.rb
Created September 5, 2012 21:25
Simple visualization of hadoop job history files
# visualize the output with gg.js
# gg({layers: [{ geometry: 'line', mapping: { x: 'minutes', y: 'task', group: 'stage', color: 'type'}}]});
def parse(line)
output = {}
parts = line.split(/[ "]/)
output["TYPE"] = parts.shift
while(parts.size > 0)
next_part = parts.shift
if next_part =~ /^(\w+)=$/
@avibryant
avibryant / gist:3802616
Created September 28, 2012 23:36
Likelihood ratio test for binomials
/**
 * Log-likelihood ratio for two binomial samples.
 *
 * Compares the log-likelihood of each sample under its own observed rate
 * (`k1 / n1` and `k2 / n2`) with the log-likelihood of both samples under the
 * pooled rate `(k1 + k2) / (n1 + n2)`:
 *
 * `logL(p1, k1, n1) + logL(p2, k2, n2) - logL(p, k1, n1) - logL(p, k2, n2)`
 *
 * The raw difference is returned; it is not multiplied by 2.
 *
 * @param k1 count of positive outcomes in the first sample
 * @param n1 size of the first sample
 * @param k2 count of positive outcomes in the second sample
 * @param n2 size of the second sample
 * @return the log-likelihood ratio; 0.0 when both samples have the same rate
 */
def likelihoodRatio(k1 : Int, n1 : Int, k2 : Int, n2 : Int): Double = {
  // k * log(p), short-circuited to 0 when k == 0 so that a rate of 0 (log = -Infinity)
  // or an undefined rate (NaN) does not poison the sum.
  def kLogP(k : Int, p : Double): Double = if (k == 0) 0.0 else k * math.log(p)

  // Binomial log-likelihood of k positives out of n at rate p (without the constant
  // binomial coefficient, which cancels in the ratio).
  def logL(p : Double, k : Int, n : Int): Double = kLogP(k, p) + kLogP(n - k, 1 - p)

  val p1 = k1.toDouble / n1.toDouble
  val p2 = k2.toDouble / n2.toDouble
  // Add as Double: summing the Ints first wraps around once a total exceeds
  // Int.MaxValue, which produced a bogus pooled rate (and a NaN result).
  val p = (k1.toDouble + k2.toDouble) / (n1.toDouble + n2.toDouble)

  logL(p1, k1, n1) + logL(p2, k2, n2) - logL(p, k1, n1) - logL(p, k2, n2)
}