Skip to content

Instantly share code, notes, and snippets.

@dskarataev
Last active August 29, 2015 14:12
Show Gist options
  • Save dskarataev/7ef994119876ac8e7463 to your computer and use it in GitHub Desktop.
Save dskarataev/7ef994119876ac8e7463 to your computer and use it in GitHub Desktop.
logstat.py
#!/usr/bin/env python
# coding: utf-8
# Implementation of the screening test for the Backend Developer position
# 2014, Denis Karataev
import re
import numpy
# Log file that print_statistics() opens and scans.
LOG_PATH = '/home/dsk/sample.log'

# Endpoints to report on, written as "<METHOD> <path>" with the concrete
# user id replaced by the "{user_id}" placeholder. The report is printed in
# this order, and requests that match none of these entries are ignored.
TEMPLATES = [
    'GET /api/users/{user_id}/count_pending_messages',
    'GET /api/users/{user_id}/get_messages',
    'GET /api/users/{user_id}/get_friends_progress',
    'GET /api/users/{user_id}/get_friends_score',
    'POST /api/users/{user_id}',
    'GET /api/users/{user_id}',
]
# Matches one router log line for a "/api/users/<id>..." request and captures
# method, path, dyno, connect time and service time (the two times in ms).
_LOG_LINE_RE = re.compile(r'.*method=(GET|POST) path=(/api/users/\d+?.*?) host=.*? dyno=(.*?)'
                          r' connect=(\d+?)ms service=(\d+?)ms.*')
# Matches the leading "/api/users/<digits>" segment so that the concrete
# user id can be swapped for the "{user_id}" placeholder.
_USER_ID_RE = re.compile(r'^/api/users/\d+?(/|$)')


def _collect_statistics(lines, templates):
    """Aggregate raw timings and dyno hit counts per templated endpoint.

    Args:
        lines: iterable of log lines (for example an open file object).
        templates: collection of "<METHOD> <path>" strings to keep.

    Returns:
        A dict mapping each endpoint seen at least once to a dict with the
        lists 'connect_time', 'service_time', 'total_time' (one entry per
        request) and 'dynos', a dict of dyno name -> number of requests.
    """
    wanted = frozenset(templates)
    result_data = {}
    for line in lines:
        m = _LOG_LINE_RE.match(line)
        if not m:
            continue
        method, url, dyno = m.group(1, 2, 3)
        connect_time, service_time = int(m.group(4)), int(m.group(5))
        # Replace the concrete user id with the {user_id} placeholder. The
        # substitution always leaves a slash after the placeholder, so drop
        # a trailing one to match the template spelling.
        url = _USER_ID_RE.sub('/api/users/{user_id}/', url)
        if url.endswith('/'):
            url = url[:-1]
        url = ' '.join((method, url))
        if url not in wanted:
            continue
        stats = result_data.get(url)
        if stats is None:
            stats = result_data[url] = {
                'connect_time': [],
                'service_time': [],
                'total_time': [],
                'dynos': {},
            }
        stats['connect_time'].append(connect_time)
        stats['service_time'].append(service_time)
        stats['total_time'].append(connect_time + service_time)
        # Count dyno hits as we go instead of rescanning a list afterwards.
        stats['dynos'][dyno] = stats['dynos'].get(dyno, 0) + 1
    return result_data


def _print_endpoint_report(url, stats):
    """Print call count, mean/median timings and the busiest dyno for *url*."""
    connect_time = stats['connect_time']
    service_time = stats['service_time']
    total_time = stats['total_time']
    dyno_counts = stats['dynos']
    # max() returns the first maximal item, so iterating the names in sorted
    # order makes ties resolve to the alphabetically first dyno on every run.
    most_dyno = max(sorted(dyno_counts), key=dyno_counts.get)

    print(url)
    # One entry is appended per request, so the list length is the call count.
    print('Was called %d times' % len(connect_time))
    # Three decimals, like the ping command.
    print('Mean time (connect/service/total) = %.3f/%.3f/%.3f ms' % (
        numpy.mean(connect_time),
        numpy.mean(service_time),
        numpy.mean(total_time)))
    # One decimal is enough: the median of integers is either an integer or
    # an integer + 0.5.
    print('Median time (connect/service/total) = %.1f/%.1f/%.1f ms' % (
        numpy.median(connect_time),
        numpy.median(service_time),
        numpy.median(total_time)))
    print('The most responded dyno: %s' % most_dyno)
    print('')


def print_statistics(log_path=None, templates=None):
    """Print per-endpoint request statistics gathered from a router log.

    For every endpoint in *templates* that occurs in the log, prints the
    number of calls, the mean and median connect/service/total times and the
    dyno that served the most requests. Endpoints are reported in the order
    they appear in *templates*.

    Args:
        log_path: path of the log file; defaults to the module-level LOG_PATH.
        templates: iterable of "<METHOD> <path>" endpoint strings with the
            user id written as "{user_id}"; defaults to the module-level
            TEMPLATES.

    Returns:
        None. All results and error messages are written to standard output;
        a log file that cannot be opened or read is reported, not raised.
    """
    if log_path is None:
        log_path = LOG_PATH
    templates = list(TEMPLATES if templates is None else templates)

    # Only opening/reading the log is guarded, so that an I/O error raised
    # while printing is not misreported as a problem with the log file.
    try:
        with open(log_path) as f:
            result_data = _collect_statistics(f, templates)
    except IOError as e:
        print('Problem with logfile: %s' % e)
        return

    if not result_data:
        print('There is no data that could be interested.')
        return

    for url in templates:
        if url in result_data:
            _print_endpoint_report(url, result_data[url])
# Script entry point: produce the report only when this file is executed
# directly, not when it is imported as a module.
if __name__ == '__main__':
    print_statistics()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment