Avi Bryant avibryant

## keybase.md

      
              1 file
            
          
              0 forks
            
          
              0 comments
            
          
              0 stars
            
          
                avibryant
                / keybase.md
            
            
              Created
              March 17, 2014 20:55
            
              
                keybase.md
              
          
### Keybase proof

I hereby claim:

  * I am avibryant on github.
  * I am avi (https://keybase.io/avi) on keybase.
  * I have a public key whose fingerprint is 9DFC 19D0 8558 8DE8 6BF3  5C19 3879 4907 3690 98AF

To claim this, I am signing this object:

  
## gist:b43d3db8933556001285
import com.twitter.algebird._

case class Preparer[A, T](prepareFn: A => T) {
  def map[U](fn: T => U) =
    Preparer[A, U](fn.compose(prepareFn))

  def flatMap[U](fn: T => TraversableOnce[U]) =
    FlatPreparer[A, U](fn.compose(prepareFn))

  def aggregate[B, C](aggregator: Aggregator[T, B, C]): Aggregator[A, B, C] =

## gist:f158a2e4abe977dc15f3
case class SetSizeAggregator(hllBits: Int, maxSetSize: Int = 10)
    extends MonoidAggregator[Array[Byte], Either[HLL, Set[Array[Byte]]], Long] {

  def prepare(in: Array[Byte]) = Right(Set(in))
  def present(sum: Either[HLL, Set[Array[Byte]]]) = {
    sum match {
      case Right(set) => set.size
      case Left(hll) => hll.approximateSize.estimate
    }
  }

## gist:b2df11671f1a8e5099d7
def takeBy(pipe: TypedPipe[(K,V)], max: Int)(fn: V => Double): TypedPipe[(K,V)] = {
  implicit val qtreeSemi = QTreeSemigroup(4) //magic number, determines how much RAM the trees take
  val qtrees = pipe.map{case (k,v) => k -> QTree(fn(v))}.sumByKey
  val maxV = qtrees.flatMap{case (k,q) =>
    if(q.size > max) {
      val targetQuantile = max.toDouble / q.size
      val (lower, upper) = q.quantileBounds(targetQuantile)
      Some(k -> upper) //this will give us at least max values; use lower to get at most max values
    }
    else

## safari-bing-patch.rb
class File
	def seek_to(str)
		until eof?
			start = pos
			buf = read(10000)
			if(offset = buf.index(str))
				seek(start + offset + str.size)
				return true
			else
				seek(start + 5000)

## gist:882550
require 'stringio'
require 'base64'

def read_varint(io)
  value = index = 0
  begin
    byte = io.readchar
    value |= (byte & 0x7f) << (7 * index)
    index += 1
  end while (byte & 0x80).nonzero?

## preg.rb
require 'date'

DUE_DATE = "2013-05-19"

#data taken from http://spacefem.com/pregnant/charts/duedate2.php
#starts at day 222
DATA = [
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1, 2, 1, 4, 2, 2, 1, 4, 6, 7, 5, 1, 5, 8, 7, 9, 10, 11, 13, 13, 18,
  14, 13, 9, 27, 29, 27, 31, 27, 26, 36, 43, 43, 51, 67, 74, 60, 47,

## Main.java
/**
 * Scalding job that tokenizes the 'text column of a TSV source and groups
 * the resulting tokens.
 *
 * Reads ('doc_id, 'text) rows from the path given by the "input" argument and
 * writes the grouped result as TSV to the path given by the "output" argument.
 */
class WordCount(args: Args) extends Job(args) {
  Tsv(args("input"), ('doc_id, 'text))
    // Split on spaces, square brackets, parentheses, commas and periods.
    .flatMapTo('text -> 'token) { text: String => text.split("[ \\[\\]\\(\\),.]") }
    // Normalize each fragment in place: trim it, then lower-case it.
    .map('token -> 'token) { raw: String => raw.trim.toLowerCase }
    // Discard the empty fragments produced by adjacent separators.
    .filter('token) { word: String => !word.isEmpty }
    // One group per distinct token, reduced with the group builder's `size`.
    .groupBy('token) { _.size }
    .write(Tsv(args("output")))
}

## swimlines.rb
# visualize the output with gg.js
# gg({layers: [{ geometry: 'line', mapping: { x: 'minutes', y: 'task', group: 'stage', color: 'type'}}]});

def parse(line)
  output = {}
  parts = line.split(/[ "]/)
  output["TYPE"] = parts.shift
  while(parts.size > 0)
    next_part = parts.shift
    if next_part =~ /^(\w+)=$/

## gist:3802616
    /**
     * Log-likelihood ratio for two binomial samples.
     *
     * With p1 = k1/n1, p2 = k2/n2 and the pooled p = (k1 + k2)/(n1 + n2), returns
     * logL(p1, k1, n1) + logL(p2, k2, n2) - logL(p, k1, n1) - logL(p, k2, n2),
     * where logL(p, k, n) = k*log(p) + (n - k)*log(1 - p).
     *
     * @param k1 number of successes in the first sample
     * @param n1 number of trials in the first sample
     * @param k2 number of successes in the second sample
     * @param n2 number of trials in the second sample
     * @return the log-likelihood ratio (not multiplied by 2)
     */
    def likelihoodRatio(k1 : Int, n1 : Int, k2 : Int, n2 : Int) : Double = {
        // k * log(p), defining the k == 0 case as 0 so that 0 * log(0) does not yield NaN.
        def kLogP(k : Int, p : Double) : Double = if(k == 0) 0.0 else k * math.log(p)
        // Binomial log-likelihood of k successes in n trials, omitting the binomial coefficient.
        def logL(p : Double, k : Int, n : Int) : Double = kLogP(k, p) + kLogP(n - k, 1 - p)

        val p1 = k1.toDouble / n1.toDouble
        val p2 = k2.toDouble / n2.toDouble
        // Convert to Double before adding: k1 + k2 or n1 + n2 computed as Int can
        // exceed Int.MaxValue and silently wrap, giving a pooled p outside [0, 1].
        val p = (k1.toDouble + k2.toDouble) / (n1.toDouble + n2.toDouble)
        logL(p1, k1, n1) + logL(p2, k2, n2) - logL(p, k1, n1) - logL(p, k2, n2)
    }
	import com.twitter.algebird._

	case class Preparer[A, T](prepareFn: A => T) {
	def map[U](fn: T => U) =
	Preparer[A, U](fn.compose(prepareFn))

	def flatMap[U](fn: T => TraversableOnce[U]) =
	FlatPreparer[A, U](fn.compose(prepareFn))

	def aggregate[B, C](aggregator: Aggregator[T, B, C]): Aggregator[A, B, C] =
	case class SetSizeAggregator(hllBits: Int, maxSetSize: Int = 10)
	extends MonoidAggregator[Array[Byte], Either[HLL, Set[Array[Byte]]], Long] {

	def prepare(in: Array[Byte]) = Right(Set(in))
	def present(sum: Either[HLL, Set[Array[Byte]]]) = {
	sum match {
	case Right(set) => set.size
	case Left(hll) => hll.approximateSize.estimate
	}
	}
	def takeBy(pipe: TypedPipe[(K,V)], max: Int)(fn: V => Double): TypedPipe[(K,V)] = {
	implicit val qtreeSemi = QTreeSemigroup(4) //magic number, determines how much RAM the trees take
	val qtrees = pipe.map{case (k,v) => k -> QTree(fn(v))}.sumByKey
	val maxV = qtrees.flatMap{case (k,q) =>
	if(q.size > max) {
	val targetQuantile = max.toDouble / q.size
	val (lower, upper) = q.quantileBounds(targetQuantile)
	Some(k -> upper) //this will give us at least max values; use lower to get at most max values
	}
	else
	class File
	def seek_to(str)
	until eof?
	start = pos
	buf = read(10000)
	if(offset = buf.index(str))
	seek(start + offset + str.size)
	return true
	else
	seek(start + 5000)
	require 'stringio'
	require 'base64'

	def read_varint(io)
	value = index = 0
	begin
	byte = io.readchar
	value \|= (byte & 0x7f) << (7 * index)
	index += 1
	end while (byte & 0x80).nonzero?
	require 'date'

	DUE_DATE = "2013-05-19"

	#data taken from http://spacefem.com/pregnant/charts/duedate2.php
	#starts at day 222
	DATA = [
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	1, 2, 1, 4, 2, 2, 1, 4, 6, 7, 5, 1, 5, 8, 7, 9, 10, 11, 13, 13, 18,
	14, 13, 9, 27, 29, 27, 31, 27, 26, 36, 43, 43, 51, 67, 74, 60, 47,
	/**
	 * Scalding job that tokenizes the 'text column of a TSV source and groups
	 * the resulting tokens.
	 *
	 * Reads ('doc_id, 'text) rows from the path given by the "input" argument and
	 * writes the grouped result as TSV to the path given by the "output" argument.
	 */
	class WordCount(args: Args) extends Job(args) {
	  Tsv(args("input"), ('doc_id, 'text))
	    // Split on spaces, square brackets, parentheses, commas and periods.
	    .flatMapTo('text -> 'token) { text: String => text.split("[ \\[\\]\\(\\),.]") }
	    // Normalize each fragment in place: trim it, then lower-case it.
	    .map('token -> 'token) { raw: String => raw.trim.toLowerCase }
	    // Discard the empty fragments produced by adjacent separators.
	    .filter('token) { word: String => !word.isEmpty }
	    // One group per distinct token, reduced with the group builder's `size`.
	    .groupBy('token) { _.size }
	    .write(Tsv(args("output")))
	}
	# visualize the output with gg.js
	# gg({layers: [{ geometry: 'line', mapping: { x: 'minutes', y: 'task', group: 'stage', color: 'type'}}]});

	def parse(line)
	output = {}
	parts = line.split(/[ "]/)
	output["TYPE"] = parts.shift
	while(parts.size > 0)
	next_part = parts.shift
	if next_part =~ /^(\w+)=$/
	/**
	 * Log-likelihood ratio for two binomial samples.
	 *
	 * With p1 = k1/n1, p2 = k2/n2 and the pooled p = (k1 + k2)/(n1 + n2), returns
	 * logL(p1, k1, n1) + logL(p2, k2, n2) - logL(p, k1, n1) - logL(p, k2, n2),
	 * where logL(p, k, n) = k*log(p) + (n - k)*log(1 - p).
	 *
	 * @param k1 number of successes in the first sample
	 * @param n1 number of trials in the first sample
	 * @param k2 number of successes in the second sample
	 * @param n2 number of trials in the second sample
	 * @return the log-likelihood ratio (not multiplied by 2)
	 */
	def likelihoodRatio(k1 : Int, n1 : Int, k2 : Int, n2 : Int) : Double = {
	    // k * log(p), defining the k == 0 case as 0 so that 0 * log(0) does not yield NaN.
	    def kLogP(k : Int, p : Double) : Double = if(k == 0) 0.0 else k * math.log(p)
	    // Binomial log-likelihood of k successes in n trials, omitting the binomial coefficient.
	    def logL(p : Double, k : Int, n : Int) : Double = kLogP(k, p) + kLogP(n - k, 1 - p)

	    val p1 = k1.toDouble / n1.toDouble
	    val p2 = k2.toDouble / n2.toDouble
	    // Convert to Double before adding: k1 + k2 or n1 + n2 computed as Int can
	    // exceed Int.MaxValue and silently wrap, giving a pooled p outside [0, 1].
	    val p = (k1.toDouble + k2.toDouble) / (n1.toDouble + n2.toDouble)
	    logL(p1, k1, n1) + logL(p2, k2, n2) - logL(p, k1, n1) - logL(p, k2, n2)
	}