Skip to content

Instantly share code, notes, and snippets.

@vitillo vitillo/
Last active Aug 29, 2015

What would you like to do?
Ensemble predictor based on Mann-Whitney's U test of the last N daily distributions
import matplotlib as mp
import matplotlib.pylab
import numpy
import sys
import simplejson as json
from scipy import stats
from urllib.request import urlopen
# Seconds covered by one data row; parse() fills this in from the header's
# 'seconds_per_row' field.
sample_unit = None
# Number of data rows per 24 hours (24*60*60 // sample_unit); parse() fills
# this in. get_nth_day_series() uses it as the size of one day window.
day_units = None
def get_nth_day_series(series, day, units_per_day=None):
    """Return one day-sized window of ``series``, counted back from its end.

    ``day`` 0 is the final ``units_per_day`` samples, ``day`` 1 is the window
    immediately before that, and so on.

    Args:
        series: Sliceable sequence of samples.
        day: Zero-based window offset, counted back from the end of
            ``series``.
        units_per_day: Number of samples in one window. Defaults to the
            module-level ``day_units``.

    Returns:
        The requested slice of ``series``, or an empty slice when ``series``
        does not reach back far enough to hold the whole window.
    """
    if units_per_day is None:
        units_per_day = day_units

    start = len(series) - (day + 1) * units_per_day
    if start < 0:
        # Not enough history. Slicing with a negative start would silently
        # pick up unrelated samples from the tail, so return an empty slice
        # of the same type instead.
        return series[:0]
    return series[start:start + units_per_day]
def predict(series):
    """Vote on whether the final day window of ``series`` is unusually low.

    The series is cut into 10 day-sized windows counted back from its end
    (see ``get_nth_day_series``). Every earlier window whose mean is greater
    than the final window's mean is compared with it using a Mann-Whitney U
    test, and casts one vote when the p-value is below 0.0001.

    Args:
        series: Sliceable sequence of numeric samples.

    Returns:
        A truthy value when more than ``len(subsets) >> 2`` windows (2 of the
        10) voted, a falsy value otherwise. Returns ``0`` when ``series`` is
        too short to contain even the final window.
    """
    subsets = [get_nth_day_series(series, i) for i in range(10)]
    latest = subsets[0]
    if len(latest) == 0:
        # Nothing to compare against; taking the mean of an empty window
        # would only yield NaN and a runtime warning.
        return 0

    latest_mean = numpy.mean(latest)
    votes = 0
    # The final window compared with itself has a mean difference of 0 and
    # can never vote, so only the earlier windows are visited.
    for window in subsets[1:]:
        if len(window) == 0:
            # Series does not reach back this far.
            continue
        if numpy.mean(window) - latest_mean > 0:
            # NOTE(review): the sidedness of this p-value follows SciPy's
            # default `alternative`, which has differed between releases;
            # confirm against the installed version.
            p_value = stats.mannwhitneyu(latest, window)[1]
            votes += p_value < 0.0001
    return votes > (len(subsets) >> 2)
# Fetch the "<channel>.cbuf" resource and collect the values of one column.
#
# The first line of the response is decoded and parsed as a JSON header.
# From it this function sets the module globals `sample_unit` (the header's
# 'seconds_per_row') and `day_units` (rows per 24 hours), and looks up
# `column_name` in the header's 'column_info' list. The column must exist
# (enforced with an assert). Returns the `data` list.
def parse(channel, column_name):
global sample_unit
global day_units
# NOTE(review): the format string has no scheme or host, so urlopen() cannot
# resolve it as written; presumably a base URL was lost. Confirm.
url = "{}.cbuf".format(channel)
# NOTE(review): the response object is never closed explicitly.
lines = urlopen(url).readlines()
meta = json.loads(lines[0].decode())
col_index = -1
data = []
sample_unit = meta['seconds_per_row']
day_units = 24*60*60//sample_unit
# No break below: if several columns share the name, the last one wins.
for idx, col_info in enumerate(meta['column_info']):
if col_info['name'] == column_name:
col_index = idx
# NOTE(review): assert is stripped under `python -O`; a missing column would
# then leave col_index at -1.
assert(col_index >= 0)
# NOTE(review): the body of this loop is missing from the visible source, so
# nothing visible ever appends to `data`. Presumably each remaining line is a
# data row from which column `col_index` is extracted. Confirm against the
# original file before relying on this function.
for line in lines[1:]:
return data
if __name__ == "__main__":
    if len(sys.argv) == 1:
        # No channel argument: print one prediction per known channel.
        channels = ("nightly", "aurora", "beta", "release", "other", "ALL")
        predictions = [predict(parse(channel, "Requests")) for channel in channels]
        for channel, prediction in zip(channels, predictions):
            print("{}:{}".format(channel, prediction))
    else:
        # A channel argument was given: run the predictor on every prefix of
        # that channel's series and plot the samples, coloured by prediction.
        # Without this `else`, the no-argument path would fall through and
        # fail on sys.argv[1].
        series = parse(sys.argv[1], "Requests")
        results = [predict(series[:i]) for i in range(len(series))]
        cmap = mp.colors.ListedColormap(["white", "red"], name='from_list', N=None)
        mp.pyplot.scatter(range(len(series)), series, c=results, cmap=cmap)
        # Display the figure; when run as a script the scatter plot would
        # otherwise be built and discarded.
        mp.pyplot.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.