Ted Dunning (tdunning)
tdunning / multi-schema.json
Created May 29, 2015 15:57
Schema for generating data for multi-flatten example
[
  {"name": "t",  "class": "sequence", "lengthDistribution": 3,
   "base": {"class": "random-walk", "sd": 1, "mean": 100}},
  {"name": "v1", "class": "sequence", "lengthDistribution": 3,
   "base": {"class": "random-walk", "sd": 10, "mean": 0}},
  {"name": "v2", "class": "sequence", "lengthDistribution": 3,
   "base": {"class": "random-walk", "sd": 10, "mean": 0}}
]
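Each field here is a sequence of random-walk samples. As a rough sketch of what such a generator produces (assuming, based on the sample data, that each sample adds a Normal(mean, sd) step to a running sum and that `lengthDistribution: 3` yields length-3 sequences; the function name is hypothetical, not the real generator's API):

```python
import random

def random_walk_sequence(length, mean, sd, state):
    """Generate `length` samples, each adding a Normal(mean, sd) step
    to a running sum carried in `state` (a one-element list)."""
    out = []
    for _ in range(length):
        state[0] += random.gauss(mean, sd)
        out.append(state[0])
    return out

# One record in the style of the sample data: three parallel sequences,
# each walk continuing from its own running state across records.
t_state, v1_state, v2_state = [0.0], [0.0], [0.0]
record = {
    "t": random_walk_sequence(3, 100, 1, t_state),
    "v1": random_walk_sequence(3, 0, 10, v1_state),
    "v2": random_walk_sequence(3, 0, 10, v2_state),
}
```

With a step mean of 100 and sd of 1, the `t` sequence climbs by roughly 100 per sample, matching the sample data below.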
tdunning / desired-output.json
Created May 29, 2015 15:54
Desired output of multi-flatten operation
{"t":100.44380099679421,"v1":-3.2124876152877886,"v2":20.668968387311498}
{"t":200.7658959087325,"v1":-17.9729521628487,"v2":39.70574384652023}
{"t":301.6982003576183,"v1":-11.10212822944568,"v2":33.97732641377096}
{"t":402.02369599592936,"v1":-10.357254868666516,"v2":28.14680001787952}
{"t":500.361291107067,"v1":-15.358599092346992,"v2":24.449514473332897}
{"t":601.0570362962695,"v1":-18.17981637433697,"v2":36.90232317832954}
{"t":700.3591871348365,"v1":-4.184888093902327,"v2":43.63398291814092}
{"t":801.7683561318721,"v1":2.266724195547855,"v2":37.96260382309654}
{"t":901.3906331330202,"v1":1.8027188779133356,"v2":34.558440299721525}
{"t":998.8508331065062,"v1":-1.3711844607134631,"v2":34.518652181145356}
tdunning / multi-flatten.json
Created May 29, 2015 15:48
Sample data for multi-list flattening
{"t":[100.44380099679421,200.7658959087325,301.6982003576183],"v1":[-3.2124876152877886,-17.9729521628487,-11.10212822944568],"v2":[20.668968387311498,39.70574384652023,33.97732641377096]}
{"t":[402.02369599592936,500.361291107067,601.0570362962695],"v1":[-10.357254868666516,-15.358599092346992,-18.17981637433697],"v2":[28.14680001787952,24.449514473332897,36.90232317832954]}
{"t":[700.3591871348365,801.7683561318721,901.3906331330202],"v1":[-4.184888093902327,2.266724195547855,1.8027188779133356],"v2":[43.63398291814092,37.96260382309654,34.558440299721525]}
{"t":[998.8508331065062,1097.1401685144158,1198.1819481032155],"v1":[-1.3711844607134631,5.5027134050661735,5.111544086242255],"v2":[34.518652181145356,39.89433181166691,44.621036340105604]}
{"t":[1298.1931737425077,1398.9874465151283,1498.3317303744573],"v1":[5.597833929076392,23.21742527898042,20.160346681365283],"v2":[46.6957213571881,36.773578638699526,26.80096644321689]}
{"t":[1598.4807447946152,1698.0002145693118,1797.9744831102964],"v1":[30.550780
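The step from these list-valued records to the flat records in desired-output.json can be sketched in Python (a minimal stand-in for the multi-flatten operation, assuming all lists in a record have the same length; in practice this would be done by the query engine):

```python
import json

def multi_flatten(record):
    """Turn one record of parallel lists into one flat record per index."""
    keys = list(record)
    length = len(record[keys[0]])  # assumes every list has the same length
    return [{k: record[k][i] for k in keys} for i in range(length)]

line = '{"t":[100.4,200.7,301.6],"v1":[-3.2,-17.9,-11.1],"v2":[20.6,39.7,33.9]}'
for flat in multi_flatten(json.loads(line)):
    print(json.dumps(flat))
```

Each input record with length-3 lists becomes three flat JSON objects, which is exactly the shape of the desired output above.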
tdunning / read.svmlight
Last active September 2, 2015 23:40
Some code that reads data in SVM Light format and returns a list containing the targets and the data. The data is stored in a sparse matrix to make this a bit more memory-efficient.
require(Matrix)
read.svmlight2 <- function(filename) {
  f <- file(filename, "r")
  lines = readLines(f)
  close(f)
  # split each line on spaces and colons: target first, then index/value pairs
  temp = strsplit(lines, '[: ]')
  target = sapply(temp, function(row) { as.numeric(row[1]) })
  raw = lapply(temp, function(row) {
    n = length(row);
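The same parsing idea can be sketched in Python for comparison (a minimal version that splits on whitespace and colons, keeping each row as a sparse index-to-value dict rather than a Matrix-package sparse matrix):

```python
def read_svmlight(lines):
    """Parse SVM Light lines like '1 3:0.5 7:1.2' into (targets, rows),
    where each row is a sparse dict mapping feature index to value."""
    targets, rows = [], []
    for line in lines:
        tokens = line.split()
        targets.append(float(tokens[0]))  # first token is the target
        row = {}
        for tok in tokens[1:]:            # remaining tokens are index:value
            idx, val = tok.split(":")
            row[int(idx)] = float(val)
        rows.append(row)
    return targets, rows
```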
import fileinput
from string import join
import json
import csv

### read the output from MAHOUT and collect into hash ###
with open('x', 'rb') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter='\t')
    old_id = ""
    indicators = []
tdunning / on-board
Created March 23, 2015 19:54
File manipulation
Log in to the cluster:
ted:downloads$ ssh se-node10.se.lab
Last login: Mon Mar 23 17:35:37 2015 from 10.250.0.220
Please check the cluster reservation calendar:
https://www.google.com/calendar/embed?src=maprtech.com_2d38343133383836382d313737%40resource.calendar.google.com
Poke around looking for my volume and such:
[tdunning@se-node10 ~]$ ls /mapr/se1/user/t
tdunning / t-digest-size-test.r
Last active August 29, 2015 14:14
A quick bit of code to test the size growth for different size limits in t-digest
# Experiments with t-digest in R
standard.size.bound = function(n, q) {
  4 * n * q * (1 - q)
}
constant.size.bound = function(n, q) {
  n
}
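The two bounds translate directly into Python. The standard bound is largest at the median (q = 0.5), where 4nq(1 - q) = n, and shrinks toward the tails, which is what lets t-digest keep extreme quantiles accurate:

```python
def standard_size_bound(n, q):
    # scaled bound: clusters near the median may hold more samples,
    # clusters near q = 0 or q = 1 must stay small
    return 4 * n * q * (1 - q)

def constant_size_bound(n, q):
    # flat bound: every cluster may hold up to n samples regardless of q
    return n
```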
tdunning / Clustering is hard
Last active August 29, 2015 14:12
This R script shows just how hard even trivial k-means clustering can be (with Lloyd's algorithm) by generating trivially clusterable data and then failing to recover the clusters.
# picking the corners of the hypercube at random usually gives us a good selection
d = 0
while (d == 0) {
  centers = matrix(runif(10 * 10) > 0.5, ncol = 10) + 0
  # but occasionally we get a duplicate row that is easily detected
  d = det(centers)
}
# start x out by selecting clusters
x = data.frame(n = ceiling(runif(10000, 1e-10, 10)))
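The center-picking trick above (random corners of the 10-dimensional hypercube, rejecting any draw whose rows are linearly dependent, which a zero determinant detects) can be sketched in plain Python. The exact-integer determinant avoids assuming numpy is available:

```python
import random

def det(m):
    """Exact integer determinant via fraction-free (Bareiss) elimination
    with row pivoting; returns 0 for singular matrices."""
    m = [row[:] for row in m]
    n = len(m)
    sign, prev = 1, 1
    for k in range(n - 1):
        if m[k][k] == 0:  # find a nonzero pivot below, or bail out
            for i in range(k + 1, n):
                if m[i][k] != 0:
                    m[k], m[i] = m[i], m[k]
                    sign = -sign
                    break
            else:
                return 0
        for i in range(k + 1, n):
            for j in range(k + 1, n):
                m[i][j] = (m[i][j] * m[k][k] - m[i][k] * m[k][j]) // prev
        prev = m[k][k]
    return sign * m[-1][-1]

def pick_centers(dim=10):
    """Pick dim random corners of the dim-hypercube; retry whenever the
    rows are dependent (duplicate rows make the determinant zero)."""
    while True:
        centers = [[random.randint(0, 1) for _ in range(dim)]
                   for _ in range(dim)]
        if det(centers) != 0:
            return centers
```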
public class HbaseLookup {
  // logger should name this class, not the TrigoMathFunctions it was copied from
  static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(HbaseLookup.class);

  private HbaseLookup() {}

  @FunctionTemplate(name = "hLookup", scope = FunctionScope.SIMPLE, nulls = NullHandling.NULL_IF_NULL)
  public static class Lookup implements DrillSimpleFunc {
    @Param VarCharHolder table;  // the table to read from
// keep only the email addresses that appear in both lists
Set<String> common = Sets.newHashSet(firstListOfEmails);
common.retainAll(secondListOfEmails);
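The Guava `retainAll` idiom above computes the intersection of two collections of email addresses; the equivalent in Python is a set intersection (the example lists here are made up for illustration):

```python
first_list_of_emails = ["a@example.com", "b@example.com", "c@example.com"]
second_list_of_emails = ["b@example.com", "c@example.com", "d@example.com"]

# keep only the addresses that appear in both lists
common = set(first_list_of_emails) & set(second_list_of_emails)
```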