#!/usr/bin/env python
import imaplib
import os
import logging
import optparse
# LABEL = 'support'
# QUERY = 'before:2015-06-01 -label:to-delete'
# DEST = 'to-delete'
LABEL = 'to-delete'
import tornado.httpclient
import logging
import json
import urllib
import functools
_http_client = None
def http_client():
    global _http_client
    # assumed completion of the truncated body: lazily construct one shared
    # AsyncHTTPClient and reuse it on later calls
    if _http_client is None:
        _http_client = tornado.httpclient.AsyncHTTPClient()
    return _http_client
@mreiferson
mreiferson / check_aggregate_nsqd_depth.py
Created September 12, 2014 17:59
nagios aggregate depth check
#!/usr/bin/env python2.7
import sys
import logging
import functools
import time
import tornado.options
import tornado.ioloop
import nsq_data
function git_likely_authority() {
  # Outputs a ranked list of likely author(itie)s regarding tracked files matching *${1}*
  # (Makes it easier to discover who likely knows most about the matched files)
  # (Note: Not always indicative if large refactors/renamings have occurred)
  # E.g. `git_likely_authority some_pattern_that_matches_a_path`
  git ls-files "*${1}*" |
    grep -v -E 'node_modules|vendor' | # edit to avoid certain files/dirs
    xargs -n1 git blame --line-porcelain |
    sed -n 's/^author //p' |
    sort -f |
    uniq -ic | # assumed tail of the truncated pipeline: count lines per author (case-insensitive)...
    sort -rn   # ...and rank authors by line count, highest first
}

Recent versions of Cloudera's Impala added NDV, a "number of distinct values" aggregate function that uses the HyperLogLog algorithm to estimate this number, in parallel, in a fixed amount of space.

This can make a really, really big difference: in a large table I tested this on, which had roughly 100M unique values of mycolumn, using NDV(mycolumn) got me an approximate answer in 27 seconds, whereas the exact answer using count(distinct mycolumn) took ... well, I don't know how long, because I got tired of waiting for it after 45 minutes.

It's fun to note, though, that because of another recent addition to Impala's dialect of SQL, the fnv_hash function, you don't actually need to use NDV; instead, you can build HyperLogLog yourself from mathematical primitives.

HyperLogLog hashes each value it sees, and then assigns it to a bucket based on the low-order bits of the hash. It's common to use 1024 buckets, so we can get the bucket by taking a bitwise & with 1023:

select
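To make the idea concrete, here's a minimal Python sketch of the same bucketing-and-counting scheme, assuming a generic 64-bit hash in place of fnv_hash and the 1024 buckets described above (an illustration, not the Impala SQL itself):

import hashlib

NUM_BUCKETS = 1024  # 2**10 buckets; the low 10 bits of the hash pick the bucket

def hash64(value):
    # stand-in for fnv_hash: any well-mixed 64-bit integer hash works for the sketch
    return int.from_bytes(hashlib.sha1(str(value).encode()).digest()[:8], 'big')

def hll_estimate(values):
    # per bucket, remember the largest "rank" seen: the number of leading zeros
    # (plus one) in the hash bits left over after choosing the bucket
    max_rank = [0] * NUM_BUCKETS
    for v in values:
        h = hash64(v)
        bucket = h & (NUM_BUCKETS - 1)     # same trick as the bitwise & with 1023 above
        rest = h >> 10                     # the remaining 54 bits
        rank = 54 - rest.bit_length() + 1  # leading zeros in those 54 bits, plus one
        if rank > max_rank[bucket]:
            max_rank[bucket] = rank
    # combine the buckets with a harmonic mean and the standard bias-correction constant
    alpha = 0.7213 / (1 + 1.079 / NUM_BUCKETS)
    harmonic = sum(2.0 ** -r for r in max_rank)
    return alpha * NUM_BUCKETS * NUM_BUCKETS / harmonic

Impala's NDV (and the full HyperLogLog algorithm) layer small- and large-range corrections on top of this, but bucketing by the low-order bits and taking a harmonic mean of the per-bucket maximum ranks is the core of the estimate.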
@mynameisfiber
mynameisfiber / gist:7451236
Created November 13, 2013 15:46
simple unnormalized kde evaluation
import numpy as np

def gaussian(x0, sigma):
    # normalized Gaussian kernel centered at x0
    return lambda x: np.exp(-0.5 * ((x - x0) / sigma) ** 2) / (sigma * np.sqrt(2 * np.pi))

def kde(points, sigma=.5):
    # one kernel per data point; the (unnormalized) KDE is their sum
    functions = [gaussian(x0, sigma) for x0 in points]
    def sampler(x):
        return sum(f(x) for f in functions)
    return sampler
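A quick usage sketch (the sample points and evaluation grid below are made up for illustration):

points = [0.0, 0.5, 2.0, 2.1]
density = kde(points, sigma=0.5)
xs = np.linspace(-1, 3, 9)
print([round(float(density(x)), 3) for x in xs])  # unnormalized density at each grid point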
@mriddle
mriddle / sublime_config.pp
Last active August 21, 2018 22:06
My Boxen puppet-sublime config
class config::sublime {
  define addpkg {
    $packagedir = "/Library/Application Support/Sublime Text 2/Packages/"
    $pkgarray = split($name, '[/]')
    $pkgname = $pkgarray[1]
    exec { "git clone https://github.com/${name}.git":
      cwd      => "/Users/${::luser}${packagedir}",
      provider => 'shell',
@SlexAxton
SlexAxton / .zshrc
Last active April 25, 2023 03:57
My gif workflow
gifify() {
  if [[ -n "$1" ]]; then
    if [[ $2 == '--good' ]]; then
      ffmpeg -i "$1" -r 10 -vcodec png out-static-%05d.png
      time convert -verbose +dither -layers Optimize -resize 600x600\> out-static*.png GIF:- | gifsicle --colors 128 --delay=5 --loop --optimize=3 --multifile - > "$1.gif"
      rm out-static*.png
    else
      ffmpeg -i "$1" -s 600x400 -pix_fmt rgb24 -r 10 -f gif - | gifsicle --optimize=3 --delay=3 > "$1.gif"
    fi
  else
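    # hypothetical usage, assuming the function above is loaded from .zshrc:
    #   gifify screencap.mov          # quick, lower-quality conversion
    #   gifify screencap.mov --good   # slower, higher-quality path via ImageMagick + gifsicle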
# you can make a text file of request times (in ms, one number per line) and import it here,
# or you can use a probability distribution to simulate request times (see below where req_durations_in_ms is set)
# rq = read.table("~/Downloads/request_times.txt", header=FALSE)$V1
# argument notes:
# parallel_router_count is only relevant if router_mode is set to "intelligent"
# choice_of_two, power_of_two, and unicorn_workers_per_dyno are only relevant if router_mode is set to "naive"
# you can only select one of choice_of_two, power_of_two, and unicorn_workers_per_dyno
run_simulation = function(router_mode = "naive",
reqs_per_minute = 9000,