Skip to content

Instantly share code, notes, and snippets.

View erikbern's full-sized avatar

Erik Bernhardsson erikbern

View GitHub Profile
# Luigi LocalTarget subclass whose open() hands back a bsddb hash database
# rather than going through the base class.
class BsddbTarget(luigi.LocalTarget):
# Open the database stored at self.path with bsddb.hashopen, forwarding
# `mode` unchanged, and return whatever hashopen returns.
# NOTE(review): indentation was lost in this listing; this def is presumably
# a method of BsddbTarget — confirm against the original gist.
def open(self, mode):
return bsddb.hashopen(self.path, mode) # TODO: make this atomic!!
class TCTarget(luigi.LocalTarget):
open_tcs = {}
def open(self, mode):
if mode == 'r':
if self.path not in TCTarget.open_tcs:
import luigi
# Here we import our own tasks, assuming they are arranged
# in a Python package (folder) named "components"
from components.SomeTaskA import SomeTaskA
from components.SomeTaskB import SomeTaskB
from components.SomeTaskC import SomeTaskC
# ------------------------------------------
# DEFINE THE MAIN WORKFLOW DEPENDENCY GRAPH
import os, shutil
import luigi
import sparkey
import random
class SparkeyTarget(luigi.Target):
def __init__(self, path=None, spi='data.spi', spl='data.spl', writer_cls=sparkey.HashWriter, reader_cls=sparkey.HashReader):
self.path = path
self.spi_path = spi
self.spl_path = spl
@erikbern
erikbern / gist:fc05e8cccd64dccde630
Last active August 29, 2015 14:03
Generate Dirichlet distribution
import random, time
import pylab, numpy
def method1(n):
s = 1.0
r = []
for i in xrange(n):
t = s * (1 - random.random() ** (1.0 / (n - i)))
s -= t
r.append(t)
// Count tab-separated fields over the whole input: each element of `input`
// is split on '\t', and the per-element field counts are summed.
def tabCounter() = {
// `input` is whatever getInput() supplies; presumably a collection of
// text lines — TODO confirm against the surrounding framework.
implicit def input = getInput()
input.map(_.split('\t').size).reduce(_ + _)
}
val task = LuigiTask().requires(MyTsvJob(buildId)).output(HdfsTarget("output")).do(tabCounter)
val otherTask = LuigiTask().requires(task).output(HdfsTarget("output-2")).do(somethingElse)
otherTask.run() // schedule task and otherTask
if (indices.size() <= (size_t)_K) {
for (size_t i = 0; i < indices.size(); i++)
m->children[i] = indices[i];
}
import subprocess, itertools, numpy
import matplotlib.pyplot as plt
command = 'git log --shortstat --log-size --format=oneline --no-merges'.split()
data = subprocess.check_output(command).split('\n')
def read_groups():
buf = []
for line in data:
buf.append(line)
@erikbern
erikbern / gist:ba3456f836ccc9c044e8
Last active August 29, 2015 14:18
simple javascript task framework to flip recursion inside out
// Turn a set of call arguments into a string by JSON-encoding it.
// unroll() passes its wrapper's `arguments` object here to obtain `key`.
function serializeArgs(args) {
return JSON.stringify(args);
}
function unroll(f) {
if (f._cache == undefined)
f._cache = {};
var f_new = function() {
var key = serializeArgs(arguments);
import numpy as np
import matplotlib.pyplot as plt
# Mean vector of a 3-component multivariate normal distribution.
mean = np.array([1, 1, 1])
# Covariance matrix; every off-diagonal entry is 0.5.
# NOTE(review): the last diagonal entry is 0.5 while the first two are 1 —
# confirm this asymmetry is intentional.
cov = np.array([[1, 0.5, 0.5], [0.5, 1, 0.5], [0.5, 0.5, 0.5]])
# Draw 100000 samples; each row of `people` is one sample with 3 components.
people = np.random.multivariate_normal(mean, cov, 100000)
# Per-component weights; the first component has weight 0, so it does not
# contribute to the score.
criterion = np.array([0, 0.2, 1.0])
# One scalar per sample: the weighted sum of that sample's components.
scores = np.dot(people, criterion)
@erikbern
erikbern / gist:1f32359164e994ef0dd2
Created May 26, 2015 02:23
LSHF, Annoy, Flann, Panns
----------------------------------------------------------------------------------------
n_samples: 1000 n_features: 100
LSHF index build time: 0.0258071422577
ANNOY index build time: 0.012866973877
FLANN index build time: 0.00251913070679
Panns index build time: 3.09596705437
LSHF average query time: 0.00537742614746 , Average accuracy: 0.536
ANNOY average query time: 0.000244197845459 , Average accuracy: 0.9954
FLANN average query time: 0.000175647735596 , Average accuracy: 0.5824
Panns average query time: 0.0499544477463 , Average accuracy: 0.8956