cgoldberg/mhtest.py

## mhtest.py
#!/usr/bin/env python

import collections
import re
from operator import itemgetter
import urlparse
import numpy

import sys

# Log file analysed by the script; the main block opens it by this name.
DATA_FILE = 'perflog-everything-onload.csv'
# NOTE(review): not referenced anywhere in the visible code -- confirm it is
# still needed before relying on it.
NUM_RESULTS = 50

# Field extractors applied to each raw log line.  The doubled quotes match
# quotes escaped CSV-style; presumably each row carries a JSON payload inside
# a quoted CSV field -- TODO confirm against the log format.
# Captures the value of the "ip" key (non-greedy, up to the next doubled quote).
IP_REGEX = re.compile(r'""ip"": ""(.+?)""')
# Captures the value of the "page" key (everything up to the next quote).
PAGE_REGEX = re.compile(r'""page"": ""([^"]+)""')
# Captures the value of the "value" key; the main block converts it with
# float() and divides by 1000 (presumably milliseconds -> seconds).
ONLOAD_REGEX = re.compile(r'""value"": ""(.+?)""')
# Path reducers used by aggregateURL.  Each captures a course root that is
# followed by a slash: "/courses/<segment>/<segment>" ...
COURSE_REGEX = re.compile(r'(/courses/[^/]+/[^/]+)/')
# ... and "/courses/course-v1:<id>".
COURSEV1_REGEX = re.compile(r'(/courses/course-v1:[^/]+)/')

def aggregateURL(page_url):
    """Collapse a page URL into a coarse path used as an aggregation key.

    Only the path component of ``page_url`` is used; scheme, host, query
    string and fragment are ignored.  The path is reduced in three steps:

    1. If ``COURSE_REGEX`` and/or ``COURSEV1_REGEX`` match, the path is cut
       down to the course root they capture (applied in that order, so a
       ``course-v1:`` URL ends up as ``/courses/course-v1:<id>``).
    2. If the path starts with one of the known per-object prefixes (for
       example ``/activate/<token>``), it is replaced by the prefix alone.
       The first matching prefix wins, which keeps
       ``/verify_student/upgrade/`` distinct from ``/verify_student/``.
    3. A trailing ``/courseware`` is stripped.

    Args:
        page_url: URL string as found in the log.

    Returns:
        The aggregated path as a string.
    """
    path = urlparse.urlsplit(page_url)[2]

    for regex in (COURSE_REGEX, COURSEV1_REGEX):
        match = regex.search(path)
        if match:
            path = match.group(1)

    # More specific prefixes must be listed before the prefixes they extend.
    prefixes = (
        "/activate/",
        "/email_confirm/",
        "/shoppingcart/receipt/",
        "/notification_prefs/unsubscribe/",
        "/verify_student/upgrade/",
        "/jump_to_id/",
        "/course_modes/",
        "/verify_student/",
    )
    for prefix in prefixes:
        if path.startswith(prefix):
            path = prefix
            # Stop at the first hit; without this a later, shorter prefix
            # ("/verify_student/") would overwrite the specific one.
            break

    if path.endswith('/courseware'):
        path = path[:-len('/courseware')]
    return path

def weighted_quantile(quantile, population, weights):
    """Return the element of ``population`` at the given weighted quantile.

    ``weights[i]`` is the weight of ``population[i]``.  The function picks the
    last index whose preceding cumulative weight does not exceed
    ``quantile * sum(weights)`` and returns the element at that index; no
    interpolation is performed.  For the result to be a quantile, callers are
    expected to pass ``population`` sorted ascending with non-negative
    weights in the matching order.

    Args:
        quantile: Fraction of the total weight to locate (e.g. ``0.95``).
        population: Indexable sequence of values.
        weights: Sequence of weights, parallel to ``population``.

    Returns:
        One element of ``population``.  If every weight is zero the last
        element is returned.

    Raises:
        IndexError: If ``population`` is empty.
    """
    target = quantile * sum(weights)

    # cumulative[i] is the total weight of population[:i].
    cumulative = [0]
    running = 0.
    for weight in weights:
        running += weight
        cumulative.append(running)

    # Binary search for the largest index ``lo`` with cumulative[lo] <= target.
    lo, hi = 0, len(population)
    while hi - lo > 1:
        # Floor division keeps the midpoint an int on Python 2 and Python 3.
        mid = (lo + hi) // 2
        if cumulative[mid] > target:
            hi = mid
        else:
            lo = mid
    return population[lo]

# Script entry point: read the onload log named by DATA_FILE, group onload
# times by aggregated URL and print, for every URL with more than
# URL_REQUEST_CUTOFF samples, the weighted and the plain 95th percentile
# onload time, ordered by the weighted value (slowest first).
if __name__ == '__main__':
    # An "ipa" is the integer before the first dot of the client IP.  Samples
    # from an ipa with fewer requests overall than this get weight 0.
    MIN_IPA_REQUESTS = 50
    # Only URLs with strictly more samples than this are reported.
    URL_REQUEST_CUTOFF = 100

    page_times = collections.defaultdict(list)   # url -> [(ipa, onload time)]
    ipatotals = collections.defaultdict(float)   # ipa -> requests overall
    ipas_present = collections.defaultdict(      # url -> ipa -> requests
        lambda: collections.defaultdict(float))
    skipped_rows = 0

    with open(DATA_FILE) as f:
        # Iterate the file lazily instead of loading every line up front.
        for row in f:
            page_match = PAGE_REGEX.search(row)
            onload_match = ONLOAD_REGEX.search(row)
            ip_match = IP_REGEX.search(row)
            if not (page_match and onload_match and ip_match):
                # Header or malformed line: count it instead of crashing.
                skipped_rows += 1
                continue
            try:
                # Raw value scaled by 1/1000 (presumably ms -> s).
                onload_timer = float(onload_match.group(1)) / 1000.0
                ipa = int(ip_match.group(1).split('.', 1)[0])
            except ValueError:
                skipped_rows += 1
                continue
            # Discard negative timers and ones above 1e5 after scaling.
            if onload_timer < 0 or onload_timer > 1e5:
                continue
            aggregated_url = aggregateURL(page_match.group(1))
            page_times[aggregated_url].append((ipa, onload_timer))
            ipas_present[aggregated_url][ipa] += 1
            ipatotals[ipa] += 1

    if skipped_rows:
        sys.stderr.write(
            'skipped {0} unparseable rows\n'.format(skipped_rows))

    results = []
    total_requests = 0
    for url, ipa_onload_times in page_times.items():
        num_requests = len(ipa_onload_times)
        total_requests += num_requests

        samples = []
        for ipa, onload in ipa_onload_times:
            if ipatotals[ipa] < MIN_IPA_REQUESTS:
                weight = 0
            else:
                # Overall requests of this ipa divided by its requests on
                # this URL.  Use a constant weight here to get the plain
                # (unweighted) quantile instead.
                weight = ipatotals[ipa] / ipas_present[url][ipa]
            samples.append((onload, weight))

        # weighted_quantile walks the values in ascending order.
        samples.sort()
        onload_times, weights = zip(*samples)
        percentile_95_time = numpy.percentile(onload_times, 95)
        weighted_percentile_95_time = weighted_quantile(
            0.95, onload_times, weights)
        results.append((num_requests, weighted_percentile_95_time,
                        percentile_95_time, url))

    # print(...) with a single argument behaves like the print statement on
    # Python 2 and is also valid Python 3.
    print('# views\t\t% of total\tweighted 95th\ttime (95th)\turl (aggregated)')
    print('-' * 80)
    results_by_weighted_time = sorted(
        (r for r in results if r[0] > URL_REQUEST_CUTOFF),
        key=itemgetter(1), reverse=True)
    for result in results_by_weighted_time:
        num_requests, weighted_percentile_95_time, percentile_95_time, url = result
        percent_requests = float(num_requests) / float(total_requests) * 100.0
        trimmed_percent = '{0:.2f} %'.format(percent_requests)
        w_trimmed_timer = '{0:.2f}'.format(weighted_percentile_95_time)
        trimmed_timer = '{0:.2f}'.format(percentile_95_time)
        print('{}\t\t{}\t\t{}\t\t{}\t\t{}'.format(
            num_requests, trimmed_percent, w_trimmed_timer, trimmed_timer, url))
    print('-' * 80)
	#!/usr/bin/env python

	import collections
	import re
	from operator import itemgetter
	import urlparse
	import numpy

	import sys

	# Log file analysed by the script; the main block opens it by this name.
	DATA_FILE = 'perflog-everything-onload.csv'
	# NOTE(review): not referenced anywhere in the visible code -- confirm it is
	# still needed before relying on it.
	NUM_RESULTS = 50

	# Field extractors applied to each raw log line.  The doubled quotes match
	# quotes escaped CSV-style; presumably each row carries a JSON payload
	# inside a quoted CSV field -- TODO confirm against the log format.
	IP_REGEX = re.compile(r'""ip"": ""(.+?)""')
	PAGE_REGEX = re.compile(r'""page"": ""([^"]+)""')
	# The main block converts this capture with float() and divides by 1000
	# (presumably milliseconds -> seconds).
	ONLOAD_REGEX = re.compile(r'""value"": ""(.+?)""')
	# Path reducers used by aggregateURL: each captures a course root that is
	# followed by a slash.
	COURSE_REGEX = re.compile(r'(/courses/[^/]+/[^/]+)/')
	COURSEV1_REGEX = re.compile(r'(/courses/course-v1:[^/]+)/')

	def aggregateURL(page_url):
	    """Collapse a page URL into a coarse path used as an aggregation key.

	    Only the path component of ``page_url`` is used; scheme, host, query
	    string and fragment are ignored.  The path is reduced in three steps:

	    1. If ``COURSE_REGEX`` and/or ``COURSEV1_REGEX`` match, the path is
	       cut down to the course root they capture (applied in that order).
	    2. If the path starts with one of the known per-object prefixes, it
	       is replaced by the prefix alone.  The first matching prefix wins,
	       which keeps ``/verify_student/upgrade/`` distinct from
	       ``/verify_student/``.
	    3. A trailing ``/courseware`` is stripped.

	    Args:
	        page_url: URL string as found in the log.

	    Returns:
	        The aggregated path as a string.
	    """
	    path = urlparse.urlsplit(page_url)[2]

	    for regex in (COURSE_REGEX, COURSEV1_REGEX):
	        match = regex.search(path)
	        if match:
	            path = match.group(1)

	    # More specific prefixes must be listed before the prefixes they extend.
	    prefixes = (
	        "/activate/",
	        "/email_confirm/",
	        "/shoppingcart/receipt/",
	        "/notification_prefs/unsubscribe/",
	        "/verify_student/upgrade/",
	        "/jump_to_id/",
	        "/course_modes/",
	        "/verify_student/",
	    )
	    for prefix in prefixes:
	        if path.startswith(prefix):
	            path = prefix
	            # Stop at the first hit; without this a later, shorter prefix
	            # ("/verify_student/") would overwrite the specific one.
	            break

	    if path.endswith('/courseware'):
	        path = path[:-len('/courseware')]
	    return path

	def weighted_quantile(quantile, population, weights):
	    """Return the element of ``population`` at the given weighted quantile.

	    ``weights[i]`` is the weight of ``population[i]``.  The function picks
	    the last index whose preceding cumulative weight does not exceed
	    ``quantile * sum(weights)`` and returns the element at that index; no
	    interpolation is performed.  For the result to be a quantile, callers
	    are expected to pass ``population`` sorted ascending with non-negative
	    weights in the matching order.

	    Args:
	        quantile: Fraction of the total weight to locate (e.g. ``0.95``).
	        population: Indexable sequence of values.
	        weights: Sequence of weights, parallel to ``population``.

	    Returns:
	        One element of ``population``.  If every weight is zero the last
	        element is returned.

	    Raises:
	        IndexError: If ``population`` is empty.
	    """
	    target = quantile * sum(weights)

	    # cumulative[i] is the total weight of population[:i].
	    cumulative = [0]
	    running = 0.
	    for weight in weights:
	        running += weight
	        cumulative.append(running)

	    # Binary search for the largest index ``lo`` with cumulative[lo] <= target.
	    lo, hi = 0, len(population)
	    while hi - lo > 1:
	        # Floor division keeps the midpoint an int on Python 2 and Python 3.
	        mid = (lo + hi) // 2
	        if cumulative[mid] > target:
	            hi = mid
	        else:
	            lo = mid
	    return population[lo]

	# Script entry point: read the onload log named by DATA_FILE, group onload
	# times by aggregated URL and print, for every URL with more than
	# URL_REQUEST_CUTOFF samples, the weighted and the plain 95th percentile
	# onload time, ordered by the weighted value (slowest first).
	if __name__ == '__main__':
	    # An "ipa" is the integer before the first dot of the client IP.
	    # Samples from an ipa with fewer requests overall than this get weight 0.
	    MIN_IPA_REQUESTS = 50
	    # Only URLs with strictly more samples than this are reported.
	    URL_REQUEST_CUTOFF = 100

	    page_times = collections.defaultdict(list)   # url -> [(ipa, onload time)]
	    ipatotals = collections.defaultdict(float)   # ipa -> requests overall
	    ipas_present = collections.defaultdict(      # url -> ipa -> requests
	        lambda: collections.defaultdict(float))
	    skipped_rows = 0

	    with open(DATA_FILE) as f:
	        # Iterate the file lazily instead of loading every line up front.
	        for row in f:
	            page_match = PAGE_REGEX.search(row)
	            onload_match = ONLOAD_REGEX.search(row)
	            ip_match = IP_REGEX.search(row)
	            if not (page_match and onload_match and ip_match):
	                # Header or malformed line: count it instead of crashing.
	                skipped_rows += 1
	                continue
	            try:
	                # Raw value scaled by 1/1000 (presumably ms -> s).
	                onload_timer = float(onload_match.group(1)) / 1000.0
	                ipa = int(ip_match.group(1).split('.', 1)[0])
	            except ValueError:
	                skipped_rows += 1
	                continue
	            # Discard negative timers and ones above 1e5 after scaling.
	            if onload_timer < 0 or onload_timer > 1e5:
	                continue
	            aggregated_url = aggregateURL(page_match.group(1))
	            page_times[aggregated_url].append((ipa, onload_timer))
	            ipas_present[aggregated_url][ipa] += 1
	            ipatotals[ipa] += 1

	    if skipped_rows:
	        sys.stderr.write(
	            'skipped {0} unparseable rows\n'.format(skipped_rows))

	    results = []
	    total_requests = 0
	    for url, ipa_onload_times in page_times.items():
	        num_requests = len(ipa_onload_times)
	        total_requests += num_requests

	        samples = []
	        for ipa, onload in ipa_onload_times:
	            if ipatotals[ipa] < MIN_IPA_REQUESTS:
	                weight = 0
	            else:
	                # Overall requests of this ipa divided by its requests on
	                # this URL.  Use a constant weight here to get the plain
	                # (unweighted) quantile instead.
	                weight = ipatotals[ipa] / ipas_present[url][ipa]
	            samples.append((onload, weight))

	        # weighted_quantile walks the values in ascending order.
	        samples.sort()
	        onload_times, weights = zip(*samples)
	        percentile_95_time = numpy.percentile(onload_times, 95)
	        weighted_percentile_95_time = weighted_quantile(
	            0.95, onload_times, weights)
	        results.append((num_requests, weighted_percentile_95_time,
	                        percentile_95_time, url))

	    # print(...) with a single argument behaves like the print statement on
	    # Python 2 and is also valid Python 3.
	    print('# views\t\t% of total\tweighted 95th\ttime (95th)\turl (aggregated)')
	    print('-' * 80)
	    results_by_weighted_time = sorted(
	        (r for r in results if r[0] > URL_REQUEST_CUTOFF),
	        key=itemgetter(1), reverse=True)
	    for result in results_by_weighted_time:
	        num_requests, weighted_percentile_95_time, percentile_95_time, url = result
	        percent_requests = float(num_requests) / float(total_requests) * 100.0
	        trimmed_percent = '{0:.2f} %'.format(percent_requests)
	        w_trimmed_timer = '{0:.2f}'.format(weighted_percentile_95_time)
	        trimmed_timer = '{0:.2f}'.format(percentile_95_time)
	        print('{}\t\t{}\t\t{}\t\t{}\t\t{}'.format(
	            num_requests, trimmed_percent, w_trimmed_timer, trimmed_timer, url))
	    print('-' * 80)