Skip to content

Instantly share code, notes, and snippets.

@vitillo
Last active August 29, 2015 13:56
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save vitillo/9023560 to your computer and use it in GitHub Desktop.
Save vitillo/9023560 to your computer and use it in GitHub Desktop.
Ensemble predictor based on the Mann-Whitney U test, applied to the distributions of the last N days
import matplotlib as mp
import matplotlib.pylab
import numpy
import sys
import simplejson as json
from scipy import stats
from urllib.request import urlopen
# Seconds covered by one data row; filled in by parse() from the feed's
# metadata header ('seconds_per_row').
sample_unit = None
# Number of data rows that make up one day (24h // sample_unit); filled in by
# parse() and read by get_nth_day_series().
day_units = None
def get_nth_day_series(series, day):
    """Return the one-day slice of *series* that lies *day* days back.

    Days are counted backwards from the end of ``series``: ``day=0`` is the
    last ``day_units`` samples, ``day=1`` the ``day_units`` samples before
    those, and so on.  ``day_units`` is the module-level samples-per-day
    value that ``parse()`` sets.

    Args:
        series: Sliceable sequence of samples.
        day: Zero-based day offset, counted back from the end of ``series``.

    Returns:
        The ``day_units``-long slice of ``series`` for the requested day.

    Raises:
        ValueError: If ``series`` is too short to contain the requested day.
    """
    start = len(series) - (day + 1) * day_units
    if start < 0:
        # A bare ``raise`` with no active exception only fails by accident
        # (RuntimeError: "No active exception to reraise"); say what is wrong.
        raise ValueError(
            "series of length {} does not reach back {} day(s)".format(
                len(series), day + 1
            )
        )
    return series[start:start + day_units]
def predict(series, days=10, alpha=0.0001):
    """Vote on whether the most recent day is significantly lower than earlier days.

    The last ``days`` one-day slices of ``series`` are extracted with
    ``get_nth_day_series``.  Every earlier day whose mean is above the most
    recent day's mean is compared against the most recent day with a
    Mann-Whitney U test; a p-value below ``alpha`` counts as one vote.  The
    prediction is positive when the votes exceed a quarter of ``days``
    (integer division).

    Args:
        series: Sequence of samples; the end of the sequence is "today".
        days: Number of trailing days to consider, including the most recent
            one.  Defaults to 10, the value that used to be hard-coded.
        alpha: p-value threshold for a vote.  Defaults to 0.0001.

    Returns:
        ``True`` if enough earlier days differ significantly, else ``False``.
        ``False`` is also returned when ``series`` does not hold ``days``
        full days of history.

    Raises:
        ValueError: If ``days`` is smaller than 1.
    """
    if days < 1:
        raise ValueError("days must be at least 1")

    try:
        subsets = [get_nth_day_series(series, i) for i in range(days)]
    except (RuntimeError, ValueError):
        # Not enough history yet.  Only this expected condition is treated as
        # "no prediction"; anything else (KeyboardInterrupt, genuine bugs) now
        # propagates instead of being swallowed by a bare ``except``.
        return False

    latest = subsets[0]
    latest_mean = numpy.mean(latest)

    votes = 0
    # The most recent day is skipped: compared with itself it can never have
    # a higher mean, so it never contributed a vote.
    for older in subsets[1:]:
        if numpy.mean(older) > latest_mean:
            # NOTE(review): relies on mannwhitneyu's default ``alternative``,
            # which has differed between SciPy releases; confirm the
            # sidedness that is intended here.
            p_value = stats.mannwhitneyu(latest, older)[1]
            votes += p_value < alpha

    # Always hand back a plain bool (the fallback used to be the int 0).
    return bool(votes > (len(subsets) >> 2))
def parse(channel, column_name, url_template=None):
    """Download a channel's metrics feed and return one column as integers.

    The first line of the response is a JSON header providing
    ``seconds_per_row`` and ``column_info`` (a list of objects with a
    ``name``); every following line is a whitespace-separated data row.

    Side effect: sets the module globals ``sample_unit`` (seconds per row)
    and ``day_units`` (rows per day) from the header.

    Args:
        channel: Channel name substituted into the feed URL.
        column_name: Name of the column to extract, as listed in the header.
        url_template: Optional ``str.format`` template taking the channel as
            its only positional argument.  Defaults to the original
            aggregator URL.

    Returns:
        List of ints, one per non-blank data row, in feed order.

    Raises:
        ValueError: If the response is empty, ``column_name`` is not listed
            in the header, or a cell is not an integer.
    """
    global sample_unit
    global day_units

    if url_template is None:
        url_template = "http://ec2-50-112-66-71.us-west-2.compute.amazonaws.com:4352/data/TelemetryChannelMetrics60DaysAggregator.{}.cbuf"

    # Close the response deterministically instead of leaking the connection.
    with urlopen(url_template.format(channel)) as response:
        lines = response.readlines()
    if not lines:
        raise ValueError("empty response for channel {!r}".format(channel))

    meta = json.loads(lines[0].decode())
    sample_unit = meta['seconds_per_row']
    day_units = 24*60*60//sample_unit

    col_index = -1
    for idx, col_info in enumerate(meta['column_info']):
        if col_info['name'] == column_name:
            col_index = idx  # the last matching column wins, as before
    if col_index < 0:
        # An ``assert`` is stripped under ``python -O``; index -1 would then
        # silently read the last column instead of failing.
        raise ValueError("column {!r} not found in feed header".format(column_name))

    data = []
    for line in lines[1:]:
        fields = line.decode().split()
        if not fields:
            continue  # tolerate blank lines such as a trailing newline
        data.append(int(fields[col_index]))
    return data
if __name__ == "__main__":
    if len(sys.argv) != 1:
        # Plot mode: the first argument names the channel.  Each point is
        # coloured by the prediction computed from the samples before it.
        history = parse(sys.argv[1], "Requests")
        total = len(history)
        flags = []
        for cutoff in range(total):
            flags.append(predict(history[:cutoff]))
        palette = mp.colors.ListedColormap(["white", "red"], name='from_list', N=None)
        mp.pyplot.scatter(range(total), history, c=flags, cmap=palette)
        mp.pylab.show()
    else:
        # Summary mode: fetch and evaluate every channel first, then print
        # one "channel:prediction" line per channel.
        channels = ("nightly", "aurora", "beta", "release", "other", "ALL")
        outcomes = []
        for name in channels:
            outcomes.append(predict(parse(name, "Requests")))
        for position, name in enumerate(channels):
            print("{}:{}".format(name, outcomes[position]))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment