vitillo/filter.json

## filter.json
{
  "version": 1,
  "dimensions": [
    {
      "field_name": "reason",
      "allowed_values": ["saved-session"]
    },
    {
      "field_name": "appName",
      "allowed_values": "Firefox"
    },
    {
      "field_name": "appUpdateChannel",
      "allowed_values": ["nightly"]
    },
    {
      "field_name": "appVersion",
      "allowed_values": "33.0a1"
    },
    {
      "field_name": "appBuildID",
      "allowed_values": "*"
    },
    {
      "field_name": "submission_date",
      "allowed_values": ["20140626"]
    }
  ]
}

## gpu.py
import json
import numpy
import math
import scikits.bootstrap as sb

def cmf(hist):
    """Return the running (cumulative) totals of ``hist`` as a list.

    Element ``i`` of the result is the sum of ``hist[0]`` through
    ``hist[i]``.  An empty input yields an empty list.
    """
    cumulative = []
    running = 0
    for count in hist:
        running += count
        cumulative.append(running)
    return cumulative

def lower_bound(labels, bin):
    """Return ``labels[bin]`` for a positive index, otherwise 0.

    Index 0 and negative indices are never looked up in ``labels``.
    """
    if bin > 0:
        return labels[bin]
    return 0

def med_bin(labels, freq):
    """Return the index of the bin that contains the median.

    This is the first bin whose cumulative frequency reaches half of the
    total frequency.  ``None`` is returned when no bin does (for example
    when ``freq`` is empty).  ``labels`` is accepted but not used.
    """
    threshold = float(sum(freq)) / 2
    reached = (idx for idx, running in enumerate(cmf(freq))
               if running >= threshold)
    return next(reached, None)

def width(labels, bin):
    """Return the distance from ``labels[bin]`` to the next label.

    The bin of the last label has no upper bound, so its width is
    reported as infinity.
    """
    upper = bin + 1
    if upper < len(labels):
        return labels[upper] - labels[bin]
    return float('inf')

# http://www.vitutor.com/statistics/descriptive/median.html
def median(labels, values):
    """Estimate the median of a binned histogram by linear interpolation.

    labels -- lower bounds of the bins, in increasing order.
    values -- per-bin frequencies; ``values[i]`` belongs to ``labels[i]``.

    Returns the lower bound of the median bin plus the share of the bin
    width needed to reach half of the total frequency.  When the median
    falls in the bin of the last label (which has no upper bound), that
    bin's lower bound is returned as is.

    Raises ValueError when no median bin can be found, e.g. for an empty
    ``values``.
    """
    half_total_freq = float(sum(values)) / 2

    # Walk the bins while tracking the cumulative frequency strictly below
    # the current one; stop at the first bin whose running total reaches
    # half of all samples.
    freq_below = 0
    median_bin = None
    for idx, freq in enumerate(values):
        if freq_below + freq >= half_total_freq:
            median_bin = idx
            break
        freq_below += freq

    if median_bin is None:
        raise ValueError("cannot compute the median of an empty histogram")

    # The first bin is treated as starting at 0 regardless of its label.
    lower_limit = labels[median_bin] if median_bin > 0 else 0

    # The bin of the last label is open-ended: nothing to interpolate.
    if median_bin + 1 >= len(labels):
        return lower_limit

    bin_width = labels[median_bin + 1] - labels[median_bin]
    # Avoid dividing by zero when the median bin is empty (all-zero input).
    median_bin_freq = values[median_bin] if values[median_bin] > 0 else 1

    return lower_limit + bin_width * (half_total_freq - freq_below) / median_bin_freq

class DefinitionException(Exception):
    """Raised when histogram bucket parameters are invalid."""


def check_numeric_limits(dmin, dmax, n_buckets):
    """Validate that the bucket parameters are plain integers.

    dmin      -- minimum bucket value.
    dmax      -- maximum bucket value.
    n_buckets -- number of buckets.

    Returns None when all three are of type ``int``; raises
    DefinitionException naming the first offending parameter otherwise.
    """
    checks = (
        (dmin, "minimum is not a number"),
        (dmax, "maximum is not a number"),
        (n_buckets, "number of buckets is not a number"),
    )
    for value, message in checks:
        # Exact type comparison on purpose: subclasses of int such as bool
        # are rejected too.
        if type(value) is not int:
            raise DefinitionException(message)

def exponential_buckets(dmin, dmax, n_buckets):
    """Return ``n_buckets`` exponentially spaced bucket lower bounds.

    Index 0 of the result is always 0 and index 1 is ``dmin``.  Every
    later bound is placed so that the remaining buckets would reach
    ``dmax`` in equal steps on a logarithmic scale, rounded to the
    nearest integer.

    The arguments are validated by check_numeric_limits() first.
    """
    check_numeric_limits(dmin, dmax, n_buckets)
    log_max = math.log(dmax)
    buckets = [0] * n_buckets
    lower = dmin
    buckets[1] = lower
    for index in range(2, n_buckets):
        log_lower = math.log(lower)
        log_step = (log_max - log_lower) / (n_buckets - index)
        candidate = int(math.floor(math.exp(log_lower + log_step) + 0.5))
        # Bounds must be strictly increasing even when rounding would
        # repeat the previous one.
        lower = candidate if candidate > lower else lower + 1
        buckets[index] = lower
    return buckets

def clean(s):
    """Return ``normalize(s)`` with every comma removed."""
    text = normalize(s)
    return text.replace(",", "")

def normalize(s):
    """Return ``s`` as a byte string.

    ``unicode`` objects are encoded as UTF-8, ignoring characters that
    cannot be encoded; anything else is converted with ``str()``.
    """
    if type(s) != unicode:
        return str(s)
    return s.encode('utf8', 'ignore')

def bootstrap_resample(X, n=None):
    """Draw ``n`` elements of ``X`` uniformly at random, with replacement.

    X -- array that supports indexing with an integer array.
    n -- number of elements to draw; defaults to ``len(X)``.
    """
    if n is None:
        n = len(X)

    uniform = numpy.random.rand(n)
    picks = numpy.floor(uniform * len(X)).astype(int)
    return X[picks]

def percentile_method(X, reps=1000, method=numpy.average, alpha=0.1):
    """Return a bootstrap percentile interval for a statistic of ``X``.

    X      -- sample data, converted to a numpy array.
    reps   -- number of bootstrap resamples to draw.
    method -- statistic evaluated on every resample.
    alpha  -- passed to numpy.percentile as a percentile rank, so the
              result holds the ``alpha``-th and ``(100 - alpha)``-th
              percentiles of the resampled statistics.
    """
    data = numpy.array(X)
    statistics = [method(bootstrap_resample(data)) for _ in range(reps)]
    return numpy.percentile(statistics, [alpha, 100 - alpha])

# Specific for FX_TAB_ANIM_ANY_FRAME_INTERVAL_MS: 50 bucket labels built by
# exponential_buckets() with a minimum of 7 and a maximum of 500.  Computed
# once when the module is loaded and used as the bin labels in reduce().
ta_buckets = exponential_buckets(7, 500, 50)

def map(k, d, v, cx):
    """Emit one submission's tab-animation histogram under GPU keys.

    k, d -- not used here.
    v    -- JSON text of the submission; it must contain ``info`` and
            ``histograms`` objects.
    cx   -- output context; ``cx.write(key, value)`` is called per record.

    Submissions without a FX_TAB_ANIM_ANY_FRAME_INTERVAL_MS histogram, or
    whose first 50 histogram entries sum to zero, produce no output.
    Otherwise the histogram is written five times under the key
    (vendor, device, driver, gpu2active), with progressively more
    trailing fields replaced by "ALL".
    """
    payload = json.loads(v)
    info = payload['info']

    # Raw strings keep the literal backslash without depending on "\A"
    # being an unrecognised escape sequence.
    # NOTE(review): the placeholder was possibly meant to be "N/A" - confirm
    # before changing, since it is part of the emitted keys.
    vendor_id = info.get('adapterVendorID', r"N\A")
    device_id = info.get('adapterDeviceID', r"N\A")
    gpu2active = info.get('isGPU2Active', False)
    driver = info.get('adapterDriverVersion', r'N\A')

    ta_hist = payload['histograms'].get('FX_TAB_ANIM_ANY_FRAME_INTERVAL_MS')
    if ta_hist is None:
        return

    if sum(ta_hist[:50]) == 0:
        return

    # Clean every key field once instead of once per emitted record.
    vendor = clean(vendor_id)
    device = clean(device_id)
    driver_version = clean(driver)

    cx.write((vendor, device, driver_version, clean(gpu2active)), ta_hist)
    cx.write((vendor, device, driver_version, "ALL"), ta_hist)
    cx.write((vendor, device, "ALL", "ALL"), ta_hist)
    cx.write((vendor, "ALL", "ALL", "ALL"), ta_hist)
    cx.write(("ALL", "ALL", "ALL", "ALL"), ta_hist)


def setup_reduce(cx):
    """Set the field separator of the reduce output context ``cx`` to ","."""
    cx.field_separator = ","

def reduce(k, v, cx):
    """Aggregate the histograms of one key and write their median.

    k  -- key tuple of strings, as emitted by map().
    v  -- non-empty sequence of histograms (numeric lists of equal length).
    cx -- output context; ``cx.write(key, value)`` is called once.

    The record is keyed by ``k[0]``; its value is the remaining key
    fields, the interpolated median and the number of aggregated
    histograms, joined by ",".
    """
    # Element-wise sum of every histogram collected for this key.
    hist = numpy.array(v[0])
    for partial_hist in v[1:]:
        hist += numpy.array(partial_hist)

    # The last 5 entries are left out of the median.
    # NOTE(review): presumably they are trailing summary fields rather than
    # bucket counts - confirm against the histogram format.
    # CI calculated with bootstrap isn't really useful, so don't compute it
    estimate = median(ta_buckets, hist[:-5])
    p = k + (str(estimate), str(len(v)))

    cx.write(p[0], ",".join(p[1:]))
	{
	"version": 1,
	"dimensions": [
	{
	"field_name": "reason",
	"allowed_values": ["saved-session"]
	},
	{
	"field_name": "appName",
	"allowed_values": "Firefox"
	},
	{
	"field_name": "appUpdateChannel",
	"allowed_values": ["nightly"]
	},
	{
	"field_name": "appVersion",
	"allowed_values": "33.0a1"
	},
	{
	"field_name": "appBuildID",
	"allowed_values": "*"
	},
	{
	"field_name": "submission_date",
	"allowed_values": ["20140626"]
	}
	]
	}
	import json
	import numpy
	import math
	import scikits.bootstrap as sb

	def cmf(hist):
		"""Return the running (cumulative) totals of ``hist`` as a list.

		Element ``i`` of the result is the sum of ``hist[0]`` through
		``hist[i]``.  An empty input yields an empty list.
		"""
		cumulative = []
		running = 0
		for count in hist:
			running += count
			cumulative.append(running)
		return cumulative

	def lower_bound(labels, bin):
		"""Return ``labels[bin]`` for a positive index, otherwise 0.

		Index 0 and negative indices are never looked up in ``labels``.
		"""
		if bin > 0:
			return labels[bin]
		return 0

	def med_bin(labels, freq):
		"""Return the index of the bin that contains the median.

		This is the first bin whose cumulative frequency reaches half of
		the total frequency.  ``None`` is returned when no bin does (for
		example when ``freq`` is empty).  ``labels`` is accepted but not
		used.
		"""
		half_total_freq = float(sum(freq)) / 2

		for idx, cm in enumerate(cmf(freq)):
			if cm >= half_total_freq:
				return idx
		return None

	def width(labels, bin):
		"""Return the distance from ``labels[bin]`` to the next label.

		The bin of the last label has no upper bound, so its width is
		reported as infinity.
		"""
		upper = bin + 1
		if upper < len(labels):
			return labels[upper] - labels[bin]
		return float('inf')

	# http://www.vitutor.com/statistics/descriptive/median.html
	def median(labels, values):
		"""Estimate the median of a binned histogram by linear interpolation.

		labels -- lower bounds of the bins, in increasing order.
		values -- per-bin frequencies; ``values[i]`` belongs to ``labels[i]``.

		Returns the lower bound of the median bin plus the share of the
		bin width needed to reach half of the total frequency.  When the
		median falls in the bin of the last label (which has no upper
		bound), that bin's lower bound is returned as is.

		Raises ValueError when no median bin can be found, e.g. for an
		empty ``values``.
		"""
		half_total_freq = float(sum(values)) / 2

		# Walk the bins while tracking the cumulative frequency strictly
		# below the current one; stop at the first bin whose running total
		# reaches half of all samples.
		freq_below = 0
		median_bin = None
		for idx, freq in enumerate(values):
			if freq_below + freq >= half_total_freq:
				median_bin = idx
				break
			freq_below += freq

		if median_bin is None:
			raise ValueError("cannot compute the median of an empty histogram")

		# The first bin is treated as starting at 0 regardless of its label.
		lower_limit = labels[median_bin] if median_bin > 0 else 0

		# The bin of the last label is open-ended: nothing to interpolate.
		if median_bin + 1 >= len(labels):
			return lower_limit

		bin_width = labels[median_bin + 1] - labels[median_bin]
		# Avoid dividing by zero when the median bin is empty (all-zero input).
		median_bin_freq = values[median_bin] if values[median_bin] > 0 else 1

		return lower_limit + bin_width * (half_total_freq - freq_below) / median_bin_freq

	class DefinitionException(Exception):
		"""Raised when histogram bucket parameters are invalid."""


	def check_numeric_limits(dmin, dmax, n_buckets):
		"""Validate that the bucket parameters are plain integers.

		Returns None when ``dmin``, ``dmax`` and ``n_buckets`` are all of
		type ``int``; raises DefinitionException naming the first
		offending parameter otherwise.
		"""
		checks = (
			(dmin, "minimum is not a number"),
			(dmax, "maximum is not a number"),
			(n_buckets, "number of buckets is not a number"),
		)
		for value, message in checks:
			# Exact type comparison on purpose: subclasses of int such as
			# bool are rejected too.
			if type(value) is not int:
				raise DefinitionException(message)

	def exponential_buckets(dmin, dmax, n_buckets):
		"""Return ``n_buckets`` exponentially spaced bucket lower bounds.

		Index 0 of the result is always 0 and index 1 is ``dmin``.  Every
		later bound is placed so that the remaining buckets would reach
		``dmax`` in equal steps on a logarithmic scale, rounded to the
		nearest integer.

		The arguments are validated by check_numeric_limits() first.
		"""
		check_numeric_limits(dmin, dmax, n_buckets)
		log_max = math.log(dmax)
		ret_array = [0] * n_buckets
		current = dmin
		ret_array[1] = current
		for bucket_index in range(2, n_buckets):
			log_current = math.log(current)
			log_ratio = (log_max - log_current) / (n_buckets - bucket_index)
			log_next = log_current + log_ratio
			next_value = int(math.floor(math.exp(log_next) + 0.5))
			# Bounds must be strictly increasing even when rounding would
			# repeat the previous one.
			if next_value > current:
				current = next_value
			else:
				current = current + 1
			ret_array[bucket_index] = current
		return ret_array

	def clean(s):
		"""Return ``normalize(s)`` with every comma removed."""
		return normalize(s).replace(",", "")

	def normalize(s):
		"""Return ``s`` as a byte string.

		``unicode`` objects are encoded as UTF-8, ignoring characters that
		cannot be encoded; anything else is converted with ``str()``.
		"""
		if type(s) == unicode:
			return s.encode('utf8', 'ignore')
		return str(s)

	def bootstrap_resample(X, n=None):
		"""Draw ``n`` elements of ``X`` uniformly at random, with replacement.

		X -- array that supports indexing with an integer array.
		n -- number of elements to draw; defaults to ``len(X)``.
		"""
		if n is None:
			n = len(X)

		resample_i = numpy.floor(numpy.random.rand(n) * len(X)).astype(int)
		return X[resample_i]

	def percentile_method(X, reps=1000, method=numpy.average, alpha=0.1):
		"""Return a bootstrap percentile interval for a statistic of ``X``.

		X      -- sample data, converted to a numpy array.
		reps   -- number of bootstrap resamples to draw.
		method -- statistic evaluated on every resample.
		alpha  -- passed to numpy.percentile as a percentile rank, so the
		          result holds the ``alpha``-th and ``(100 - alpha)``-th
		          percentiles of the resampled statistics.
		"""
		X = numpy.array(X)
		samples_statistic = []

		for rep in range(reps):
			sample = bootstrap_resample(X)
			samples_statistic.append(method(sample))

		return numpy.percentile(samples_statistic, [alpha, 100 - alpha])

	# Specific for FX_TAB_ANIM_ANY_FRAME_INTERVAL_MS: 50 bucket labels built by
	# exponential_buckets() with a minimum of 7 and a maximum of 500; used as
	# the bin labels in reduce().
	ta_buckets = exponential_buckets(7, 500, 50)

	def map(k, d, v, cx):
		"""Emit one submission's tab-animation histogram under GPU keys.

		k, d -- not used here.
		v    -- JSON text of the submission; it must contain ``info`` and
		        ``histograms`` objects.
		cx   -- output context; ``cx.write(key, value)`` is called per record.

		Submissions without a FX_TAB_ANIM_ANY_FRAME_INTERVAL_MS histogram,
		or whose first 50 histogram entries sum to zero, produce no output.
		Otherwise the histogram is written five times under the key
		(vendor, device, driver, gpu2active), with progressively more
		trailing fields replaced by "ALL".
		"""
		j = json.loads(v)
		info = j['info']

		# Raw strings keep the literal backslash without depending on "\A"
		# being an unrecognised escape sequence.
		vendor_id = info.get('adapterVendorID', r"N\A")
		device_id = info.get('adapterDeviceID', r"N\A")
		gpu2active = info.get('isGPU2Active', False)
		driver = info.get('adapterDriverVersion', r'N\A')

		histograms = j['histograms']
		ta_hist = histograms.get('FX_TAB_ANIM_ANY_FRAME_INTERVAL_MS', None)

		if ta_hist is None:
			return

		if sum(ta_hist[:50]) == 0:
			return

		# Clean every key field once instead of once per emitted record.
		vendor = clean(vendor_id)
		device = clean(device_id)
		driver_version = clean(driver)

		cx.write((vendor, device, driver_version, clean(gpu2active)), ta_hist)
		cx.write((vendor, device, driver_version, "ALL"), ta_hist)
		cx.write((vendor, device, "ALL", "ALL"), ta_hist)
		cx.write((vendor, "ALL", "ALL", "ALL"), ta_hist)
		cx.write(("ALL", "ALL", "ALL", "ALL"), ta_hist)


	def setup_reduce(cx):
		"""Set the field separator of the reduce output context ``cx`` to ","."""
		cx.field_separator = ","

	def reduce(k, v, cx):
		"""Aggregate the histograms of one key and write their median.

		k  -- key tuple of strings, as emitted by map().
		v  -- non-empty sequence of histograms (numeric lists of equal length).
		cx -- output context; ``cx.write(key, value)`` is called once.

		The record is keyed by ``k[0]``; its value is the remaining key
		fields, the interpolated median and the number of aggregated
		histograms, joined by ",".
		"""
		# Element-wise sum of every histogram collected for this key.
		hist = numpy.array(v[0])
		for partial_hist in v[1:]:
			hist += numpy.array(partial_hist)

		# The last 5 entries are left out of the median.
		# NOTE(review): presumably they are trailing summary fields rather
		# than bucket counts - confirm against the histogram format.
		# CI calculated with bootstrap isn't really useful, so don't compute it
		p = k + tuple([str(median(ta_buckets, hist[:-5])), str(len(v))])

		cx.write(p[0], ",".join(p[1:]))