Skip to content

Instantly share code, notes, and snippets.

@adimania
Last active November 18, 2016 12:25
Show Gist options
  • Save adimania/0454b0d820e5f37178d040db312c6e16 to your computer and use it in GitHub Desktop.
Save adimania/0454b0d820e5f37178d040db312c6e16 to your computer and use it in GitHub Desktop.
Get the median response size, per HTTP status code, from an Apache httpd access log file
from flask import Flask
import re
import math
import redis
import zlib
import ast
# Flask application object; the HTTP route below is registered on it.
app = Flask(__name__)
# Path of the access log whose response sizes are analysed.
LOG_FILE = '/var/log/access.log'
# Redis client built with default connection settings; used to cache medians.
red = redis.Redis()
def get_sizes(log_file=None):
    '''
    Collect response sizes from an access log, grouped by HTTP status code.

    Each line is matched against the leading fields of the Apache log format:
    host, "- -", [timestamp], "request", status, size. A size logged as "-"
    is counted as 0. Lines that do not match (blank or malformed) are skipped
    instead of aborting the whole scan.

    log_file: path of the log to read; defaults to the module-level LOG_FILE.

    Returns a dict mapping each status code (a string such as '200') to the
    list of response sizes (ints) in the order they appear in the file.
    '''
    path = LOG_FILE if log_file is None else log_file
    # The size group also accepts '-', so the placeholder is recognised
    # wherever it sits in the line (extra fields may follow the size) and
    # regardless of whether the line ends with a newline.
    pattern = re.compile(r'([(\S\.)]+) - - \[(.*?)\] "(.*?)" (\d+) (\d+|-)')
    sizes = {}
    with open(path) as f:
        for line in f:
            match = pattern.match(line)
            if match is None:
                # Nothing usable on this line; ignore it.
                continue
            status, size = match.group(4), match.group(5)
            sizes.setdefault(status, []).append(0 if size == '-' else int(size))
    return sizes
def get_median(resp_code):
    '''
    Return the median response size for one HTTP status code.

    https://en.wikipedia.org/wiki/Median
    The median is the value separating the higher half of a data sample from
    the lower half: the middle value of the sorted sample, or the mean of the
    two middle values when the sample has an even number of items.

    Medians for every status code found in LOG_FILE are computed together and
    cached in Redis as the string form of a dict, keyed by the Adler-32
    checksum of the log contents, so a modified log maps to a new cache entry.

    resp_code: status code as a string (e.g. '200'), as keyed by get_sizes().

    Returns the median as a float.
    Raises KeyError if resp_code has no entries in the log.
    '''
    # Read as bytes: zlib.adler32 requires a bytes-like object, and the
    # context manager closes the handle instead of leaking it.
    with open(LOG_FILE, 'rb') as f:
        adler = zlib.adler32(f.read())
    median_str = red.get(adler)
    if median_str:
        # The client may hand back bytes; literal_eval needs text.
        if isinstance(median_str, bytes):
            median_str = median_str.decode('utf-8')
        median_dict = ast.literal_eval(median_str)
    else:
        median_dict = {}
        for key, values in get_sizes().items():
            values.sort()
            # Integer division keeps the index an int on Python 2 and 3.
            mid = len(values) // 2
            # Remember that index of a list starts at 0.
            if len(values) % 2 == 0:
                median_dict[key] = (values[mid - 1] + values[mid]) / 2.0
            else:
                median_dict[key] = float(values[mid])
        red.set(adler, str(median_dict))
    return median_dict[resp_code]
@app.route('/<resp_code>', methods=["GET"])
def get_response(resp_code):
    '''
    HTTP endpoint: GET /<resp_code> returns the median response size for that
    status code, rendered as text.

    A status code with no entries in the log yields a 404 response instead of
    an unhandled KeyError (which would surface as HTTP 500).
    '''
    try:
        median = get_median(resp_code)
    except KeyError:
        # Fixed message on purpose: the path segment is not echoed back.
        return 'response code not found in log', 404
    return str(median)
if __name__ == '__main__':
    # Start the Flask server only when this file is executed as a script;
    # it binds to all network interfaces on port 8888.
    app.run(port=8888, host="0.0.0.0")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment