# Ensemble predictor based on the Mann-Whitney U test, applied to the last N daily distributions.
 # Dependencies: numpy and scipy are required; simplejson and matplotlib are optional.
import sys
from urllib.request import urlopen

import numpy
from scipy import stats

try:
    import simplejson as json
except ImportError:
    # The standard-library module offers the same ``loads`` entry point.
    import json

try:
    import matplotlib as mp
    import matplotlib.pylab
except ImportError:
    # Only the plotting mode needs matplotlib; text predictions work without.
    mp = None

# Sampling geometry of the most recently parsed channel; both set by parse().
sample_unit = None  # value of the header's 'seconds_per_row' field
day_units = None  # number of rows that make up 24 hours

# Location of the per-channel data; ``{}`` is replaced by the channel name.
DATA_URL_TEMPLATE = (
    "http://ec2-50-112-66-71.us-west-2.compute.amazonaws.com:4352/data/"
    "TelemetryChannelMetrics60DaysAggregator.{}.cbuf"
)


def get_nth_day_series(series, day):
    """Return the samples belonging to the ``day``-th most recent day.

    Days are counted backwards from the end of ``series`` in windows of
    ``day_units`` samples, so ``day=0`` is the latest window.

    Args:
        series: Sequence of samples, oldest first.
        day: Zero-based index of the day, counted from the end.

    Returns:
        The slice of ``series`` covering the requested day.

    Raises:
        ValueError: If ``series`` does not reach back ``day + 1`` days.
        TypeError: If ``day_units`` has not been set yet (it is still None).
    """
    start = len(series) - (day + 1) * day_units
    if start < 0:
        # The original used a bare ``raise`` here, which only worked by
        # accident (RuntimeError: no active exception to re-raise).
        raise ValueError(
            "series holds {} samples, not enough for day {}".format(
                len(series), day
            )
        )
    return series[start:start + day_units]


def predict(series, days=10, alpha=0.0001):
    """Tell whether the latest day looks significantly lower than past days.

    The latest day is compared with each of the preceding ``days - 1`` days.
    A past day counts as a vote when its mean is higher than the latest
    day's mean and the Mann-Whitney U test p-value between the two days is
    below ``alpha``.

    Args:
        series: Sequence of samples, oldest first.
        days: Number of daily windows to consider, including the latest one.
        alpha: p-value threshold below which two days are deemed different.

    Returns:
        A truthy value when the number of votes exceeds ``days >> 2`` (a
        quarter of the windows, rounded down), a falsy value otherwise.
        ``0`` is returned when the prediction cannot be computed, e.g.
        because ``series`` is shorter than ``days`` days.
    """
    try:
        subsets = [get_nth_day_series(series, i) for i in range(days)]
        latest = subsets[0]
        latest_mean = numpy.mean(latest)

        votes = 0
        # The latest day can never out-mean itself, so start from subsets[1].
        for past in subsets[1:]:
            if numpy.mean(past) > latest_mean:
                p_value = stats.mannwhitneyu(latest, past)[1]
                votes += p_value < alpha
        return votes > (len(subsets) >> 2)
    except Exception:
        # Deliberate best effort: too little data or a failing test means
        # "no prediction". KeyboardInterrupt/SystemExit still propagate.
        return 0


def _parse_rows(lines, column_name):
    """Extract one integer column from the raw lines of a data file.

    The first line is a JSON header; the remaining lines are
    whitespace-separated rows. Updates ``sample_unit`` and ``day_units``.
    """
    global sample_unit
    global day_units

    if not lines:
        raise ValueError("empty response: missing JSON header line")

    meta = json.loads(lines[0].decode())
    sample_unit = meta['seconds_per_row']
    day_units = 24 * 60 * 60 // sample_unit

    # With duplicate names the last column wins, as in the original loop.
    column_indexes = {
        info['name']: idx for idx, info in enumerate(meta['column_info'])
    }
    if column_name not in column_indexes:
        # Explicit raise instead of ``assert``, which vanishes under -O.
        raise ValueError("unknown column: {!r}".format(column_name))
    col_index = column_indexes[column_name]

    data = []
    for line in lines[1:]:
        fields = line.decode().split()
        if fields:  # ignore blank lines
            data.append(int(fields[col_index]))
    return data


def parse(channel, column_name):
    """Download the data of ``channel`` and return one column as integers.

    As a side effect the module-level ``sample_unit`` and ``day_units`` are
    set from the header of the downloaded file.

    Args:
        channel: Channel name inserted into the data URL.
        column_name: Name of the column to extract, as listed in the header.

    Returns:
        List of ints, one per data row, in file order.

    Raises:
        ValueError: If the response is empty or the column is not present.
    """
    url = DATA_URL_TEMPLATE.format(channel)
    # The context manager closes the connection even if reading fails.
    with urlopen(url) as response:
        lines = response.readlines()
    return _parse_rows(lines, column_name)


def main(argv=None):
    """Print predictions for all channels, or plot one channel's history.

    Without arguments a ``channel:prediction`` line is printed for each known
    channel. With a channel name as first argument, the "Requests" series of
    that channel is plotted with the samples coloured by the prediction made
    from the data preceding them.
    """
    argv = sys.argv if argv is None else argv

    if len(argv) == 1:
        channels = ("nightly", "aurora", "beta", "release", "other", "ALL")
        predictions = [
            predict(parse(channel, "Requests")) for channel in channels
        ]
        for channel, prediction in zip(channels, predictions):
            print("{}:{}".format(channel, prediction))
        return

    if mp is None:
        sys.exit("matplotlib is required to plot a channel")

    series = parse(argv[1], "Requests")
    # results[i] is the prediction made from the samples before index i.
    results = [predict(series[:i]) for i in range(len(series))]
    cmap = mp.colors.ListedColormap(["white", "red"], name='from_list', N=None)
    mp.pyplot.scatter(range(len(series)), series, c=results, cmap=cmap)
    mp.pylab.show()


if __name__ == "__main__":
    main()
