Skip to content

Instantly share code, notes, and snippets.

# vitillo/predictor.py

Last active August 29, 2015 13:56
Show Gist options
• Save vitillo/9023560 to your computer and use it in GitHub Desktop.
Ensemble predictor based on Mann-Whitney's U test of the last N daily distributions
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode characters
 import matplotlib as mp import matplotlib.pylab import numpy import sys import simplejson as json from scipy import stats from urllib.request import urlopen sample_unit = None day_units = None def get_nth_day_series(series, day): start = len(series) - (day+1)*day_units if start < 0: raise end = start + day_units return series[start:end] def predict(series): try: subsets = [get_nth_day_series(series, i) for i in range(0, 10)] mean = numpy.mean(subsets[0]) result = 0 for ss in subsets: if numpy.mean(ss) - mean > 0: tstat = stats.mannwhitneyu(subsets[0], ss) result += tstat[1] < 0.0001 return result > (len(subsets) >> 2) except: return 0 def parse(channel, column_name): global sample_unit global day_units url = "http://ec2-50-112-66-71.us-west-2.compute.amazonaws.com:4352/data/TelemetryChannelMetrics60DaysAggregator.{}.cbuf".format(channel) lines = urlopen(url).readlines() meta = json.loads(lines[0].decode()) col_index = -1 data = [] sample_unit = meta['seconds_per_row'] day_units = 24*60*60//sample_unit for idx, col_info in enumerate(meta['column_info']): if col_info['name'] == column_name: col_index = idx assert(col_index >= 0) for line in lines[1:]: data.append(int(line.decode().split()[col_index])) return data if __name__ == "__main__": if len(sys.argv) == 1: channels = ("nightly", "aurora", "beta", "release", "other", "ALL") predictions = [predict(parse(channel, "Requests")) for channel in channels] for channel, prediction in zip(channels, predictions): print("{}:{}".format(channel, prediction)) else: series = parse(sys.argv[1], "Requests") results = [predict(series[:i]) for i in range(0, len(series))] cmap = mp.colors.ListedColormap(["white","red"], name='from_list', N=None) mp.pyplot.scatter(range(0, len(series)), series, c=results, cmap=cmap) mp.pylab.show()
to join this conversation on GitHub. Already have an account? Sign in to comment