# dskarataev/logstat.py

## logstat.py
#!/usr/bin/env python
# coding: utf-8
# Implementation of screening test for position Backend Developer
# 2014, Denis Karataev
import re
from collections import Counter

import numpy

# Path of the log file read by print_statistics().
LOG_PATH = '/home/dsk/sample.log'
# Endpoints to aggregate, written as "<METHOD> <path>" with the numeric user id
# replaced by the literal "{user_id}" placeholder. print_statistics() reports
# them in the order they are listed here.
TEMPLATES = [
    'GET /api/users/{user_id}/count_pending_messages',
    'GET /api/users/{user_id}/get_messages',
    'GET /api/users/{user_id}/get_friends_progress',
    'GET /api/users/{user_id}/get_friends_score',
    'POST /api/users/{user_id}',
    'GET /api/users/{user_id}',
]


# Captures method, path, dyno, connect time and service time from one log line.
_REQUEST_RE = re.compile(r'.*method=(GET|POST) path=(/api/users/\d+?.*?) host=.*? dyno=(.*?)'
                         r' connect=(\d+?)ms service=(\d+?)ms.*')
# Matches the leading "/api/users/<digits>" segment so the concrete user id can
# be swapped for the "{user_id}" placeholder.
_USER_ID_RE = re.compile(r'^/api/users/\d+?(/|$)')


def _collect_samples(lines, templates):
    """Group connect/service times and dyno hits by endpoint template.

    Returns a dict mapping each endpoint found in ``templates`` to a dict with
    the keys 'connect' and 'service' (lists of ints, in ms) and 'dynos'
    (a Counter of dyno names). Lines that do not match the request pattern,
    or whose endpoint is not listed in ``templates``, are ignored.
    """
    wanted = set(templates)
    samples = {}
    for line in lines:
        match = _REQUEST_RE.match(line)
        if not match:
            continue
        method, path, dyno = match.group(1, 2, 3)

        # Replace the exact user id with the placeholder; the substitution
        # always leaves a trailing slash, which the templates do not have.
        path = _USER_ID_RE.sub('/api/users/{user_id}/', path)
        if path.endswith('/'):
            path = path[:-1]
        endpoint = ' '.join((method, path))
        if endpoint not in wanted:
            continue

        if endpoint not in samples:
            samples[endpoint] = {'connect': [], 'service': [], 'dynos': Counter()}
        entry = samples[endpoint]
        entry['connect'].append(int(match.group(4)))
        entry['service'].append(int(match.group(5)))
        entry['dynos'][dyno] += 1
    return samples


def _print_endpoint_report(endpoint, entry):
    """Print call count, mean/median timings and the busiest dyno of one endpoint."""
    connect = numpy.array(entry['connect'])
    service = numpy.array(entry['service'])
    total = connect + service

    # Highest hit count wins; ties are resolved by dyno name so that the
    # report is the same on every run.
    dynos = entry['dynos']
    busiest_dyno = min(dynos, key=lambda name: (-dynos[name], name))

    print(endpoint)
    print('Was called %d times' % len(connect))
    # Three decimals, like the ping command.
    print('Mean time (connect/service/total) = %.3f/%.3f/%.3f ms' % (numpy.mean(connect),
                                                                     numpy.mean(service),
                                                                     numpy.mean(total)))
    # One decimal is enough: a median of integers is an integer or integer + 0.5.
    print('Median time (connect/service/total) = %.1f/%.1f/%.1f ms' % (numpy.median(connect),
                                                                       numpy.median(service),
                                                                       numpy.median(total)))
    print('The most responded dyno: %s' % busiest_dyno)
    print('')


def print_statistics(log_path=None, templates=None):
    """Print per-endpoint request statistics gathered from a router log.

    For every endpoint template that occurs in the log, prints how often it
    was called, the mean and median connect/service/total times in ms, and
    the dyno that answered most often. Endpoints are reported in the order
    given by ``templates``.

    The code is valid on both Python 2.7 and Python 3.

    Args:
        log_path: path of the log file; defaults to the module-level LOG_PATH.
        templates: iterable of "<METHOD> <path>" endpoint templates to report;
            defaults to the module-level TEMPLATES.

    Returns:
        None. All results, and any problem opening or reading the log file,
        are written to standard output.
    """
    if log_path is None:
        log_path = LOG_PATH
    if templates is None:
        templates = TEMPLATES

    # Only file access is guarded, so that an unrelated I/O error while
    # printing is not reported as a logfile problem.
    try:
        with open(log_path) as log_file:
            samples = _collect_samples(log_file, templates)
    except IOError as error:
        print('Problem with logfile: %s' % error)
        return

    if not samples:
        print('There is no data that could be interested.')
        return

    for endpoint in templates:
        if endpoint in samples:
            _print_endpoint_report(endpoint, samples[endpoint])

# Produce the report only when this file is run as a script, not when imported.
if __name__ == '__main__':
    print_statistics()
	#!/usr/bin/env python
# coding: utf-8
# Implementation of screening test for position Backend Developer
# 2014, Denis Karataev
import re
from collections import Counter

import numpy

# Path of the log file read by print_statistics() when none is given.
LOG_PATH = '/home/dsk/sample.log'
# Endpoints to aggregate, written as "<METHOD> <path>" with the numeric user id
# replaced by the literal "{user_id}" placeholder; reported in this order.
TEMPLATES = [
    'GET /api/users/{user_id}/count_pending_messages',
    'GET /api/users/{user_id}/get_messages',
    'GET /api/users/{user_id}/get_friends_progress',
    'GET /api/users/{user_id}/get_friends_score',
    'POST /api/users/{user_id}',
    'GET /api/users/{user_id}',
]

# Captures method, path, dyno, connect time and service time from one log line.
_REQUEST_RE = re.compile(r'.*method=(GET|POST) path=(/api/users/\d+?.*?) host=.*? dyno=(.*?)'
                         r' connect=(\d+?)ms service=(\d+?)ms.*')
# Matches the leading "/api/users/<digits>" segment so the concrete user id can
# be swapped for the "{user_id}" placeholder.
_USER_ID_RE = re.compile(r'^/api/users/\d+?(/|$)')


def _collect_samples(lines, templates):
    """Group connect/service times and dyno hits by endpoint template.

    Returns a dict mapping each endpoint found in ``templates`` to a dict with
    the keys 'connect' and 'service' (lists of ints, in ms) and 'dynos'
    (a Counter of dyno names). Lines that do not match the request pattern,
    or whose endpoint is not listed in ``templates``, are ignored.
    """
    wanted = set(templates)
    samples = {}
    for line in lines:
        match = _REQUEST_RE.match(line)
        if not match:
            continue
        method, path, dyno = match.group(1, 2, 3)

        # Replace the exact user id with the placeholder; the substitution
        # always leaves a trailing slash, which the templates do not have.
        path = _USER_ID_RE.sub('/api/users/{user_id}/', path)
        if path.endswith('/'):
            path = path[:-1]
        endpoint = ' '.join((method, path))
        if endpoint not in wanted:
            continue

        if endpoint not in samples:
            samples[endpoint] = {'connect': [], 'service': [], 'dynos': Counter()}
        entry = samples[endpoint]
        entry['connect'].append(int(match.group(4)))
        entry['service'].append(int(match.group(5)))
        entry['dynos'][dyno] += 1
    return samples


def _print_endpoint_report(endpoint, entry):
    """Print call count, mean/median timings and the busiest dyno of one endpoint."""
    connect = numpy.array(entry['connect'])
    service = numpy.array(entry['service'])
    total = connect + service

    # Highest hit count wins; ties are resolved by dyno name so that the
    # report is the same on every run.
    dynos = entry['dynos']
    busiest_dyno = min(dynos, key=lambda name: (-dynos[name], name))

    print(endpoint)
    print('Was called %d times' % len(connect))
    # Three decimals, like the ping command.
    print('Mean time (connect/service/total) = %.3f/%.3f/%.3f ms' % (numpy.mean(connect),
                                                                     numpy.mean(service),
                                                                     numpy.mean(total)))
    # One decimal is enough: a median of integers is an integer or integer + 0.5.
    print('Median time (connect/service/total) = %.1f/%.1f/%.1f ms' % (numpy.median(connect),
                                                                       numpy.median(service),
                                                                       numpy.median(total)))
    print('The most responded dyno: %s' % busiest_dyno)
    print('')


def print_statistics(log_path=None, templates=None):
    """Print per-endpoint request statistics gathered from a router log.

    For every endpoint template that occurs in the log, prints how often it
    was called, the mean and median connect/service/total times in ms, and
    the dyno that answered most often. Endpoints are reported in the order
    given by ``templates``.

    The code is valid on both Python 2.7 and Python 3.

    Args:
        log_path: path of the log file; defaults to LOG_PATH.
        templates: iterable of "<METHOD> <path>" endpoint templates to report;
            defaults to TEMPLATES.

    Returns:
        None. All results, and any problem opening or reading the log file,
        are written to standard output.
    """
    if log_path is None:
        log_path = LOG_PATH
    if templates is None:
        templates = TEMPLATES

    # Only file access is guarded, so that an unrelated I/O error while
    # printing is not reported as a logfile problem.
    try:
        with open(log_path) as log_file:
            samples = _collect_samples(log_file, templates)
    except IOError as error:
        print('Problem with logfile: %s' % error)
        return

    if not samples:
        print('There is no data that could be interested.')
        return

    for endpoint in templates:
        if endpoint in samples:
            _print_endpoint_report(endpoint, samples[endpoint])


# Produce the report only when this file is run as a script, not when imported.
if __name__ == '__main__':
    print_statistics()