# Created July 21, 2010 00:07
# This file is from the google-chartwrapper project.
# Symbol alphabets for the chart data encodings: 'simple' uses the 62
# alphanumerics; 'extended' adds '-' and '.' for 64 symbols per digit.
coding = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'
ecoding = coding + '-.'

# Per-encoding parameters, keyed by encoding name:
#   coding    - symbol alphabet (empty for 'text')
#   max_value - nominal largest value of the encoding
#   char      - separator placed between data sets
#   dchar     - separator placed between the values of one data set
#   none      - marker emitted for a missing value
#   value     - callable turning one number into its encoded form
codeset = {
    'simple': {
        'coding': coding,
        'max_value': 61,
        'char': ',',
        'dchar': '',
        'none': '_',
        'value': lambda x: coding[x],
    },
    'text': {
        'coding': '',
        'max_value': 100,
        'none': '-1',
        'char': '|',
        'dchar': ',',
        'value': lambda x: '%.1f' % float(x),
    },
    'extended': {
        'coding': ecoding,
        'max_value': 4095,
        'none': '__',
        'dchar': '',
        'char': ',',
        # Two base-64 digits: high digit first, then the remainder.
        'value': lambda x: '%s%s' %
            (ecoding[int(float(x) / 64)], ecoding[int(x % 64)]),
    },
}


class Encoder:
    """Data encoder that handles simple, text, and extended encodings.

    Based on javascript encoding algorithm and pygooglecharts.
    """

    def __init__(self, encoding=None, scale=None, series=''):
        """Select the encoding used by encode() and decode().

        encoding -- 'simple', 'text' or 'extended'; None selects 'text'.
        scale    -- stored on the instance; scalevalue() does not apply it.
        series   -- text placed between the encoding letter and the ':' in
                    the output of encode().

        Raises AssertionError for an unknown encoding name.
        """
        self.series = series or ''
        if encoding is None:
            encoding = 'text'
        assert encoding in ('simple', 'text', 'extended'), \
            'Unknown encoding: %s' % encoding
        self.encoding = encoding
        self.scale = scale
        self.codeset = codeset[encoding]

    def scalevalue(self, value):
        """Return `value` unchanged.

        Scaling is not implemented yet ("one day..."): the original body
        returned before an unreachable sketch that would have mapped `value`
        from `self.scale` onto 0..max_value and clamped it to max_value.
        """
        return value

    def encode(self, *args, **kwargs):
        """Encode wrapper for a dataset with maximum value.

        Datasets can be one or two dimensional; they may be passed as a
        single sequence or as several positional arguments.
        Strings are ignored as ordinal encoding, i.e. emitted as given.

        Returns '<letter><series>:<data>', where <letter> is the first
        letter of the encoding name.  A 'text' result that contains no '.'
        is labelled 'e' instead of 't'.
        """
        first = args[0]
        if isinstance(first, str):
            return self.encode([first], **kwargs)
        if isinstance(first, (int, float)):
            return self.encode([[first]], **kwargs)

        dataset = args if len(args) > 1 else first
        # Exact types on purpose: subclasses are not treated as str/list/tuple.
        typemap = list(map(type, dataset))
        code = self.encoding[0]

        if str in typemap:
            data = ','.join(map(str, dataset))
        elif list in typemap or tuple in typemap:
            data = self.codeset['char'].join(map(self.encodedata, dataset))
        elif len(dataset) == 1 and hasattr(dataset[0], '__iter__'):
            data = self.encodedata(dataset[0])
        else:
            try:
                data = self.encodedata(dataset)
            except ValueError:
                # Fall back to the comma-joined text form of the values.
                try:
                    text_type = unicode  # Python 2
                except NameError:
                    text_type = str      # Python 3
                data = self.encodedata(','.join(map(text_type, dataset)))

        if '.' not in data and code == 't':
            code = 'e'
        return '%s%s:%s' % (code, self.series, data)

    def encodedata(self, data):
        """Encode one data set and join its values with the 'dchar' separator.

        None / 'None' become the encoding's missing-value marker, strings are
        passed through unchanged, and numbers >= -1 go through the encoding's
        'value' function.  Numbers below -1 are silently skipped.

        Raises ValueError when a number cannot be encoded.
        """
        sub_data = []
        for value in data:
            if value in (None, 'None'):
                sub_data.append(self.codeset['none'])
            elif isinstance(value, str):
                sub_data.append(value)
            elif value >= -1:
                try:
                    encoded = self.codeset['value'](self.scalevalue(value))
                except Exception:
                    raise ValueError('cannot encode value: %s' % value)
                sub_data.append(encoded)
        return self.codeset['dchar'].join(sub_data)

    def decode(self, astr):
        """Decode an encoded string into a list of data sets (lists of numbers).

        astr[0] is the encoding letter ('t', 'e' or 's') and the data starts
        at astr[2:], so a non-empty series label is not supported.  Data sets
        are split on this encoder's 'char' separator.
        """
        e = astr[0]
        index = self.codeset['coding'].index
        dec_data = []
        for data in astr[2:].split(self.codeset['char']):
            sub_data = []
            if e == 't':
                sub_data.extend(map(float, data.split(',')))
            elif e == 'e':
                # Each value is a pair of base-64 digits, high digit first.
                # An odd-length set raises IndexError on its last digit.
                for i in range(0, len(data), 2):
                    high, low = index(data[i]), index(data[i + 1])
                    sub_data.append((64 * high) + low)
            elif e == 's':
                sub_data.extend(map(index, data))
            dec_data.append(sub_data)
        return dec_data
import datetime
import gzip
import re
import StringIO
import urllib2
from encoding import Encoder # from google-chartwrapper
from numpy import numarray
import pylab
# --- Fetching ---------------------------------------------------------------
# strftime() pattern main() uses to render each day before substituting it
# into DATASTORE_STATUS_URL.
# NOTE(review): DATASTORE_STATUS_URL is referenced by main() but is not
# defined in the visible source -- confirm where it is supposed to come from.
DATE_FORMAT = '%Y/%m/%d'

# Error-rate charts that are scanned for downtime.  Alternative kept from the
# original: ('get', 'put', 'update', 'delete', 'query')
ERROR_RATES = ('put', 'update', 'delete')

# --- Decoding and scaling ---------------------------------------------------
# Chart data is decoded with the 'extended' encoding, whose raw values top
# out at RAW_MAX; raw x values are rescaled to 0..X_MAX and raw y values to
# 0..Y_MAX.
ENC = Encoder('extended')
RAW_MAX = 4095
X_MAX = 24.0
Y_MAX = 100.0

# --- Binning ----------------------------------------------------------------
NUM_BINS = 24 * 4  # bins are 15 minutes wide
# A scaled y reading strictly above this value counts as downtime.
Y_THRESH = 0.0
# Captures the value of the 'chd' query parameter (everything up to the next
# '&').  The parameter must be preceded by '&', i.e. not be the first one.
RE_CHD = re.compile(r'&chd=([^&]*)')


def extract_data_string_from_url(url):
    """Return the value of the 'chd' query parameter of `url`.

    Raises Exception when `url` has no '&chd=' parameter.
    """
    m = RE_CHD.search(url)
    if m:
        return m.group(1)
    raise Exception("url does not contain any data - missing query parameter 'chd'")
# One comma-free data set, and an 'e:'-prefixed pair of such sets.
RESTR_SET = r'[^,]*'
RE_SECOND_SET = re.compile(r'e:%s,%s' % (RESTR_SET, RESTR_SET))


def data_string_to_scaled_data(data_str):
    """Decode a chart data string into a list of scaled (x, y) pairs.

    x values are rescaled from 0..RAW_MAX to 0..X_MAX and y values from
    0..RAW_MAX to 0..Y_MAX.

    Raises Exception when `data_str` does not match RE_SECOND_SET.

    NOTE(review): the right-hand sides of the `m = ...` and `edata = ...`
    assignments were missing from the source.  They are reconstructed here as
    the minimal completion under which the surviving lines work as written
    (ENC.decode() needs the 'e:' prefix and must yield exactly two data
    sets).  Confirm against the original script, in particular which part of
    the data string is meant to be decoded.
    """
    m = RE_SECOND_SET.match(data_str)
    if not m:
        raise Exception("data string does not contain data in the expected format")
    edata = m.group(0)
    if not edata:
        return []  # no downtime if there is only one data set
    xraw, yraw = ENC.decode(edata)
    xscl = [x * X_MAX / RAW_MAX for x in xraw]
    yscl = [y * Y_MAX / RAW_MAX for y in yraw]
    # list() so the result is a list on Python 3 as well, like the [] above.
    return list(zip(xscl, yscl))
def bin_data(data_scaled, y_thresh=Y_THRESH):
    """Return a list of NUM_BINS booleans marking bins that saw downtime.

    Each element represents a period of time throughout the day (length of
    time determined by NUM_BINS - e.g., 12 bins => each element covers
    24/12=2hours).  The element is True iff there was downtime greater than
    y_thresh.

    data_scaled -- iterable of (x, y) pairs with x in 0..X_MAX.
    y_thresh    -- a reading counts as downtime when y is strictly greater.
    """
    prev_down_i = None
    ret = [False] * NUM_BINS
    for x, y in data_scaled:
        if y > y_thresh:
            # Clamp so that x == X_MAX lands in the last bin.
            i = min(int(x / X_MAX * NUM_BINS), NUM_BINS - 1)
            ret[i] = True

            # if previous reading was downtime too, then mark all bins between
            # the previous reading and this reading as downtime too
            if prev_down_i is not None:
                for j in range(prev_down_i, i):
                    ret[j] = True
            prev_down_i = i
        else:
            # A healthy reading ends the current run of downtime.
            prev_down_i = None
    return ret
def extract_data_for_specific_error_rate(html, name):
    """Return the binned downtime data for the error rate `name` found in `html`.

    Looks for the element id "ae-trust-detail-datastore-<name>-error_rate-link"
    and takes the text inside the first pair of parentheses after it as the
    chart URL; that URL's data is then decoded, scaled and binned.

    Raises Exception when no such element / parenthesised URL is found.
    """
    pattern = '"ae-trust-detail-datastore-%s-error_rate-link"[^(]*[(]([^)]*)[)]' % name
    m = re.search(pattern, html, re.DOTALL)
    if not m:
        raise Exception("Missing data for %s" % name)
    chart_url = m.group(1)
    data_str = extract_data_string_from_url(chart_url)
    data_scaled = data_string_to_scaled_data(data_str)
    return bin_data(data_scaled)
def extract_error_rate_data(url, error_rates=ERROR_RATES):
    """Fetch `url` and return the combined downtime bins of `error_rates`.

    Extracts error rate data from `url` for each error rate specified in
    error_rates.  The output is like that of bin_data(), except the results
    of each error_rate is OR'd together.  Thus each element/bin in the
    returned list indicates whether ANY error rate was greater than Y_THRESH
    for the chunk of time it represents.

    Raises Exception when the page cannot be fetched or a chart is missing.
    """
    try:
        resp = urllib2.urlopen(url)
    except urllib2.HTTPError as e:
        raise Exception('HTTP failure: %s (%s)' % (e, url))
    except urllib2.URLError as e:
        raise Exception('Fetch failure: %s (%s)' % (e, url))

    try:
        html = resp.read()
        # NOTE(review): the header lookup was truncated in the source;
        # `resp.headers.get(...)` is the reconstructed form.
        if resp.headers.get('content-encoding') == 'gzip':
            html = gzip.GzipFile(fileobj=StringIO.StringIO(html)).read()
    finally:
        resp.close()

    datas = [extract_data_for_specific_error_rate(html, name)
             for name in error_rates]

    # combine the channels - if any is True, mark the bin as true
    return [any(t) for t in zip(*datas)]
def plot_bins(data, num_days, num_bins, yticksz=5, width=0.5):
    """Draw a bar chart of per-bin downtime counts with pylab.

    data     -- number of days with downtime for each time-of-day bin.
    num_days -- number of days covered; only used in the title.
    num_bins -- number of bins the day is divided into.
    yticksz  -- spacing between y-axis ticks.
    width    -- width of each bar.
    """
    labels = []
    if num_bins >= 24:
        # One labelled tick per hour, followed by blank labels for the
        # remaining bins of that hour.
        blanks = [''] * (num_bins // 24 - 1)
        for hour in range(24):
            labels.append('%02d:00' % hour)
            labels.extend(blanks)
    else:
        # Fewer than one bin per hour: label each bin with its hour range.
        hours_per_bin = 24.0 / num_bins
        for i in range(num_bins):
            labels.append('%d:00-%d:00' % (i * hours_per_bin,
                                           (i + 1) * hours_per_bin))

    # NOTE(review): the pylab.bar(...) call was fused into the xlocs
    # assignment in the source; the split below is the reconstructed form.
    xlocs = numarray.array(range(len(data))) + width
    pylab.bar(xlocs, data, width=width)
    pylab.xticks(xlocs + width / 2.0, labels)
    pylab.xlabel("Time of Day")
    pylab.yticks([y for y in range(0, max(data) + yticksz)
                  if y % yticksz == 0])
    pylab.ylabel("# of Days with Downtime")
    pylab.xlim(0, xlocs[-1] + width * 2)
    pylab.ylim(0, int((max(data) + yticksz) / yticksz) * yticksz + 1)
    pylab.title("# Days with Downtime in the Past %d Days" % num_days)
def main(start=datetime.datetime(2009, 7, 20), end=None):
    """Count, per time-of-day bin, the days that experienced downtime.

    Time of day is divided up into NUM_BINS equi-sized chunks.  Then downtime
    from each day from `start` to `end` (exclusive) is retrieved.  Each chunk
    counts the number of days which experienced downtime greater than
    Y_THRESH during the time block allocated to the chunk.

    Data is printed to stdout and shown in a plot.

    NOTE(review): the signature was truncated after `start` in the source, so
    the original default of `end` is unknown; None is used here and means
    "now".  Confirm against the original script.
    """
    if end is None:
        end = datetime.datetime.now()

    dt = start
    one_day = datetime.timedelta(days=1)
    counts = [0] * NUM_BINS
    num_days = 0
    while dt < end:
        dt_str = dt.strftime(DATE_FORMAT)
        print('working on %s' % dt_str)
        datas_for_day = extract_error_rate_data(DATASTORE_STATUS_URL % dt_str)
        print('%s => %s' % (dt_str, datas_for_day))
        for i, b in enumerate(datas_for_day):
            if b:
                counts[i] += 1
        dt = dt + one_day
        num_days += 1

    print('num days = %d' % num_days)
    print('chunk downtime counts = %s' % (counts,))
    plot_bins(counts, num_days, NUM_BINS)
    # NOTE(review): no show() call survives in the source, but the docstring
    # says the data is shown in a plot -- confirm this is where it belongs.
    pylab.show()
# Run the download-and-plot job only when executed as a script.
if __name__ == '__main__':
    main()
# dound commented Jul 21, 2010
#
# A quick script to download data on Google App Engine datastore downtime.

