Ensemble predictor based on the Mann-Whitney U test applied to the last N daily distributions
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import matplotlib as mp | |
import matplotlib.pylab | |
import numpy | |
import sys | |
import simplejson as json | |
from scipy import stats | |
from urllib.request import urlopen | |
# Value of the feed's 'seconds_per_row' metadata field; assigned by parse().
# Stays None until parse() has been called.
sample_unit = None
# Number of data rows per 24-hour day, computed by parse() as
# 24*60*60 // sample_unit.  Stays None until parse() has been called.
day_units = None
def get_nth_day_series(series, day, samples_per_day=None):
    """Return the slice of ``series`` for the ``day``-th day from the end.

    Day 0 is the final ``samples_per_day`` elements of ``series``, day 1 the
    ``samples_per_day`` elements immediately before those, and so on.

    Args:
        series: Sliceable sequence of samples.
        day: Zero-based day index, counted backwards from the end of
            ``series``.
        samples_per_day: Number of samples that make up one day.  When
            omitted, the module-level ``day_units`` is used (which is only
            set once ``parse()`` has run).

    Returns:
        ``series[start:start + samples_per_day]`` for the requested day.

    Raises:
        ValueError: If ``series`` does not hold enough samples to contain
            the requested day.
    """
    if samples_per_day is None:
        samples_per_day = day_units

    start = len(series) - (day + 1) * samples_per_day
    if start < 0:
        # A bare ``raise`` outside an ``except`` block only produces an
        # unrelated "No active exception to re-raise" RuntimeError, so
        # report the actual problem explicitly.
        raise ValueError(
            "series has {} samples; day {} needs at least {}".format(
                len(series), day, (day + 1) * samples_per_day
            )
        )
    return series[start:start + samples_per_day]
def predict(series, days=10, alpha=0.0001):
    """Vote on whether the latest day differs from the preceding days.

    The series is cut into ``days`` consecutive one-day windows counted back
    from its end (via ``get_nth_day_series``).  Every earlier day whose mean
    is strictly greater than the latest day's mean is compared against the
    latest day with a Mann-Whitney U test; each comparison whose p-value is
    below ``alpha`` counts as one vote.

    Args:
        series: Sequence of samples; must contain at least ``days`` full
            days for a prediction to be made.
        days: Number of daily windows to extract, the latest day included.
        alpha: p-value threshold below which a comparison counts as a vote.

    Returns:
        ``True`` if the number of votes exceeds ``days // 4``, ``False``
        otherwise.  ``0`` is returned when no prediction could be computed
        (for example when the series is too short), matching the historical
        best-effort behaviour of this function.
    """
    try:
        subsets = [get_nth_day_series(series, i) for i in range(days)]
        latest = subsets[0]
        latest_mean = numpy.mean(latest)

        votes = 0
        # The latest day is never compared with itself: its mean can not be
        # strictly greater than its own mean.
        for previous in subsets[1:]:
            if numpy.mean(previous) > latest_mean:
                p_value = stats.mannwhitneyu(latest, previous)[1]
                votes += bool(p_value < alpha)

        return votes > len(subsets) // 4
    except Exception:
        # Best effort: too little data or a failing test means "no signal".
        # ``Exception`` (rather than a bare ``except``) lets KeyboardInterrupt
        # and SystemExit propagate so long runs can still be interrupted.
        return 0
def parse(channel, column_name):
    """Download one channel's aggregated metrics and extract one column.

    The first line of the response is decoded as JSON metadata; its
    ``seconds_per_row`` and ``column_info`` entries are used.  Every
    following line is split on whitespace and the field at the position of
    ``column_name`` is converted to ``int``.

    Side effects:
        Sets the module-level ``sample_unit`` (``seconds_per_row``) and
        ``day_units`` (rows per 24 hours) used by the prediction code.

    Args:
        channel: Channel name substituted into the data URL.
        column_name: Value of the ``name`` key of the wanted column in the
            metadata's ``column_info`` list.

    Returns:
        List of integer values of the requested column, in file order.

    Raises:
        ValueError: If the response is empty or ``column_name`` is not
            listed in the metadata.
    """
    global sample_unit
    global day_units

    url = "http://ec2-50-112-66-71.us-west-2.compute.amazonaws.com:4352/data/TelemetryChannelMetrics60DaysAggregator.{}.cbuf".format(channel)
    # Use a context manager so the HTTP connection is always released.
    with urlopen(url) as response:
        lines = response.readlines()
    if not lines:
        raise ValueError("empty response for channel {!r}".format(channel))

    meta = json.loads(lines[0].decode())
    sample_unit = meta['seconds_per_row']
    day_units = 24 * 60 * 60 // sample_unit

    # Keep the last matching column, as the original linear scan did.
    col_index = -1
    for idx, col_info in enumerate(meta['column_info']):
        if col_info['name'] == column_name:
            col_index = idx
    if col_index < 0:
        # An ``assert`` would be stripped under ``python -O`` and index -1
        # would then silently read the last column.
        raise ValueError("unknown column {!r}".format(column_name))

    data = []
    for line in lines[1:]:
        fields = line.decode().split()
        if not fields:
            # Ignore blank lines instead of failing on a missing field.
            continue
        data.append(int(fields[col_index]))
    return data
if __name__ == "__main__":
    if len(sys.argv) != 1:
        # A channel was given on the command line: compute a prediction for
        # every prefix of its "Requests" series and plot the samples,
        # coloured by the prediction obtained at that position.
        requests = parse(sys.argv[1], "Requests")
        flags = [predict(requests[:end]) for end in range(len(requests))]
        palette = mp.colors.ListedColormap(
            ["white", "red"], name='from_list', N=None
        )
        positions = range(len(requests))
        mp.pyplot.scatter(positions, requests, c=flags, cmap=palette)
        mp.pylab.show()
    else:
        # No argument: print one "<channel>:<prediction>" line per known
        # channel, after all channels have been fetched and evaluated.
        channels = ("nightly", "aurora", "beta", "release", "other", "ALL")
        predictions = [predict(parse(name, "Requests")) for name in channels]
        for name, flag in zip(channels, predictions):
            print("{}:{}".format(name, flag))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment