Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Data processing script to calculate the relationship between request time and DOM load time.
#!/usr/bin/python
"""
Data processing script to calculate the relationship between request time and
DOM load time.
See: http://blog.decadecity.net/2012/09/15/how-long-does-an-http-request-take/
"""
import apachelog # https://code.google.com/p/apachelog/
import re
import urlparse
def get_stats(log_data, format=None, max_time=6000):
    """
    Parses an apache log file for two parameters ("dom" and "request").

    Each line is parsed with apachelog, the URL is taken from the request
    line and its query string is decoded. When both parameters are present
    the first value of each is converted to an integer.

    Arguments:
    log_data -- iterable of raw access log lines.
    format   -- apachelog format string describing the lines. When None the
                format defined below is used.
    max_time -- exclusive upper bound for both timers. A pair is only kept
                when both values are strictly between 0 and max_time.

    Returns a list of (request, dom) tuples of related request and dom times.

    Lines whose request field is not of the form "METHOD URL PROTOCOL" and
    lines with non-integer timer values are skipped. Errors raised by
    apachelog while parsing a line are not caught here.
    """
    if format is None:
        format = r'%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"'
    p = apachelog.parser(format)
    # Break out the request line components: method, URL and protocol.
    request_line = re.compile(r'(.*) (.*) (.*)')
    result = []  # Output data.
    for line in log_data:
        parsed = p.parse(line)
        url = request_line.match(parsed['%r'])
        if url is None:
            # Request field without three parts (e.g. "-"): there is no URL
            # to read parameters from, so skip the line instead of crashing.
            continue
        params = urlparse.parse_qs(urlparse.urlparse(url.group(2)).query)
        if 'dom' not in params or 'request' not in params:
            # We need both params to form a pair.
            continue
        try:
            dom = int(params['dom'][0])
            request = int(params['request'][0])
        except ValueError:
            # A timer value that is not an integer; ignore this line.
            continue
        if max_time > request > 0 and max_time > dom > 0:
            # We have a pair of valid values so add to result.
            result.append((request, dom))
    return result
if __name__ == '__main__':
    # Reads an apache log from stdin, fits a straight line through the
    # (DOM time, request time) pairs and shows the result as a graph.
    import sys
    import numpy
    import matplotlib.pyplot as plt

    data = sys.stdin.readlines()  # Take log data from stdin - quick and dirty.
    stats = get_stats(data)
    if not stats:
        # Without any pairs the average would divide by zero and there is
        # nothing to fit, so stop with a clear message instead.
        sys.exit('No valid request/dom timing pairs found in the log data.')

    # Working out y = nx + c where y is the Request time, x is the DOM time
    # and n is the fudge factor.
    # Each point from get_stats() is a (request, dom) tuple.
    y_list, x_list = zip(*stats)
    # Floor division keeps the average an integer on Python 2 and Python 3.
    average_request = sum(y_list) // len(y_list)

    # This is the mechanics of solving y = nx + c using numpy.
    x = numpy.array(x_list)
    y = numpy.array(y_list)
    # Two columns, x and a constant 1, so the solution holds n and c.
    A = numpy.vstack([x, numpy.ones(len(x))]).T
    # Linear regression using least squares: https://en.wikipedia.org/wiki/Ordinary_least_squares
    n, c = numpy.linalg.lstsq(A, y)[0]

    # Now draw the result as a graph.
    plt.axhline(y=average_request, label='Avg request (%d)' % (average_request), color='g')
    plt.plot(x, y, 'o', label='Original data points (%d)' % (len(x)), markersize=2.5)
    plt.plot(x, n*x + c, 'r', label='Fitted line: y = %sx + %s' % (round(n,3), int(c)))
    plt.legend(loc='upper left')
    plt.xlabel('DOM timer (ms)')
    plt.ylabel('Request timer (ms)')
    plt.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.