Skip to content

Instantly share code, notes, and snippets.

@cgoldberg
Created April 14, 2015 15:01
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save cgoldberg/2d23dafecfb491a6abc8 to your computer and use it in GitHub Desktop.
Save cgoldberg/2d23dafecfb491a6abc8 to your computer and use it in GitHub Desktop.
Parse onload timings and weight them using Cochran–Mantel–Haenszel
#!/usr/bin/env python
import bisect
import collections
import re
import sys
import urlparse
from operator import itemgetter

import numpy
# Log file read by the script; the relative path is resolved against the
# current working directory when it is opened.
DATA_FILE = 'perflog-everything-onload.csv'
# NOTE(review): not referenced anywhere in the visible code -- presumably a
# leftover cap on the number of printed rows; confirm before removing.
NUM_RESULTS = 50
# The three field patterns below match *doubled* double-quotes literally,
# i.e. text shaped like ""key"": ""value"". Presumably each row carries JSON
# inside a CSV field whose quotes are escaped by doubling -- confirm against
# the data file.
# Captures the value of the "ip" field (non-greedy, up to the next "").
IP_REGEX = re.compile(r'""ip"": ""(.+?)""')
# Captures the value of the "page" field (everything up to the next quote).
PAGE_REGEX = re.compile(r'""page"": ""([^"]+)""')
# Captures the value of the "value" field; the script parses it as a float
# and divides it by 1000.
ONLOAD_REGEX = re.compile(r'""value"": ""(.+?)""')
# Captures "/courses/<segment>/<segment>" when it is followed by a slash.
COURSE_REGEX = re.compile(r'(/courses/[^/]+/[^/]+)/')
# Captures "/courses/course-v1:<segment>" when it is followed by a slash.
COURSEV1_REGEX = re.compile(r'(/courses/course-v1:[^/]+)/')
def aggregateURL(page_url):
    """Collapse a page URL into a coarse path used to group timings.

    Only the path component of ``page_url`` is considered; scheme, host,
    query string and fragment are discarded.  The path is then reduced in
    three steps:

    1. ``COURSE_REGEX`` and ``COURSEV1_REGEX`` are applied in that order,
       each one searching the path left by the previous step and, on a
       match, replacing the path with its captured group.
    2. If the path starts with one of a fixed set of prefixes, the path is
       replaced by that prefix.
    3. A trailing ``/courseware`` is stripped.

    Args:
        page_url: The full page URL as a string.

    Returns:
        The aggregated path as a string.
    """
    path = urlparse.urlsplit(page_url).path

    for regex in (COURSE_REGEX, COURSEV1_REGEX):
        match = regex.search(path)
        if match:
            path = match.group(1)

    # Every prefix is tested in order and later matches overwrite earlier
    # ones, so "/verify_student/upgrade/" always ends up collapsed into
    # "/verify_student/".
    # NOTE(review): if the upgrade pages were meant to be reported
    # separately, this loop needs a ``break`` after the first match.
    prefixes = (
        "/activate/",
        "/email_confirm/",
        "/shoppingcart/receipt/",
        "/notification_prefs/unsubscribe/",
        "/verify_student/upgrade/",
        "/jump_to_id/",
        "/course_modes/",
        "/verify_student/",
    )
    for prefix in prefixes:
        if path.startswith(prefix):
            path = prefix

    suffix = '/courseware'
    if path.endswith(suffix):
        path = path[:-len(suffix)]
    return path
def weighted_quantile(quantile, population, weights):
    """Return a weighted quantile of an already-sorted population.

    The element returned is the last one whose *preceding* cumulative
    weight does not exceed ``quantile * sum(weights)``.  Elements with zero
    weight therefore never "own" any part of the distribution and are
    skipped over in favour of a later element.

    Args:
        quantile: Fraction of the total weight to locate, e.g. ``0.95``.
        population: Indexable sequence of values, sorted ascending.
        weights: Non-negative weights, one per element of ``population``
            and in the same order.

    Returns:
        One element of ``population``.

    Raises:
        IndexError: If ``population`` is empty.
    """
    if len(population) == 0:
        raise IndexError('weighted_quantile() needs a non-empty population')

    target = quantile * sum(weights)

    # cumulative[i] is the total weight of all elements before index i, so
    # the list has one more entry than the population and starts at 0.
    cumulative = [0]
    running = 0.
    for weight in weights:
        running += weight
        cumulative.append(running)

    # Restrict the search to valid population indices.  bisect_right gives
    # the first position whose preceding weight exceeds the target; the
    # element just before it is the answer.  The clamp keeps the result at
    # the first element when the target is below zero.
    position = bisect.bisect_right(cumulative, target, 0, len(population))
    return population[max(position - 1, 0)]
def _parse_row(row):
    """Extract ``(aggregated_url, ipa, onload_timer)`` from one log row.

    ``ipa`` is the first dotted component of the row's IP address as an
    int; ``onload_timer`` is the row's "value" field divided by 1000
    (presumably milliseconds converted to seconds -- confirm).

    Returns None when any of the three fields is missing or cannot be
    converted, so that a header line or a damaged row does not abort the
    whole run.
    """
    page_match = PAGE_REGEX.search(row)
    onload_match = ONLOAD_REGEX.search(row)
    ip_match = IP_REGEX.search(row)
    if not (page_match and onload_match and ip_match):
        return None
    try:
        onload_timer = float(onload_match.group(1)) / 1000.0
        # Split on the first dot instead of slicing with find(), which
        # silently drops the last character when there is no dot at all.
        ipa = int(ip_match.group(1).split('.', 1)[0])
    except ValueError:
        return None
    return aggregateURL(page_match.group(1)), ipa, onload_timer


def main(data_file=DATA_FILE, min_stratum_requests=50, min_url_requests=100):
    """Print weighted and plain 95th-percentile onload times per URL.

    Rows are grouped by aggregated URL and stratified by the first
    component of the client IP ("ipa").  Within a URL, each sample is
    weighted by (requests seen from its stratum overall) / (requests seen
    from its stratum for this URL); the gist description refers to this as
    Cochran-Mantel-Haenszel weighting.

    Args:
        data_file: Path of the log file to read.
        min_stratum_requests: Strata with fewer total requests than this
            get weight 0.
        min_url_requests: Only URLs with more requests than this are
            printed.
    """
    page_times = collections.defaultdict(list)
    ipatotals = collections.defaultdict(float)
    ipas_present = collections.defaultdict(
        lambda: collections.defaultdict(float))

    malformed_rows = 0
    with open(data_file) as f:
        # Iterate the file lazily instead of loading every line up front.
        for row in f:
            if not row.strip():
                continue
            parsed = _parse_row(row)
            if parsed is None:
                malformed_rows += 1
                continue
            aggregated_url, ipa, onload_timer = parsed
            if onload_timer < 0 or onload_timer > 1e5:
                continue
            page_times[aggregated_url].append((ipa, onload_timer))
            ipas_present[aggregated_url][ipa] += 1
            ipatotals[ipa] += 1
    if malformed_rows:
        sys.stderr.write(
            'skipped {0} unparseable row(s)\n'.format(malformed_rows))

    results = []
    total_requests = 0
    for url, ipa_onload_times in page_times.items():
        num_requests = len(ipa_onload_times)
        total_requests += num_requests

        samples = []
        for ipa, onload in ipa_onload_times:
            if ipatotals[ipa] < min_stratum_requests:
                # Too few requests in this stratum to weight it reliably.
                this_weight = 0
            else:
                this_weight = ipatotals[ipa] / ipas_present[url][ipa]
            samples.append((onload, this_weight))

        # weighted_quantile() expects the population in ascending order.
        samples.sort()
        onload_times, weights = zip(*samples)
        percentile_95_time = numpy.percentile(onload_times, 95)
        weighted_percentile_95_time = weighted_quantile(
            0.95, onload_times, weights)
        results.append((num_requests, weighted_percentile_95_time,
                        percentile_95_time, url))

    # Single-argument print(...) calls behave the same on Python 2 and 3.
    # The header now names both timing columns that every row prints.
    print('# views\t\t% of total\tweighted (95th)\ttime (95th)'
          '\turl (aggregated)')
    print('-' * 80)
    # Slowest weighted 95th percentile first.
    results_by_weighted_time = sorted(
        [x for x in results if x[0] > min_url_requests],
        key=itemgetter(1), reverse=True)
    for result in results_by_weighted_time:
        num_requests, weighted_percentile_95_time, percentile_95_time, url = result
        percent_requests = float(num_requests) / float(total_requests) * 100.0
        trimmed_percent = '{0:.2f} %'.format(percent_requests)
        w_trimmed_timer = '{0:.2f}'.format(weighted_percentile_95_time)
        trimmed_timer = '{0:.2f}'.format(percentile_95_time)
        print('{}\t\t{}\t\t{}\t\t{}\t\t{}'.format(
            num_requests, trimmed_percent, w_trimmed_timer, trimmed_timer,
            url))
    print('-' * 80)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment