Skip to content

Instantly share code, notes, and snippets.

@mjamesruggiero
Last active December 15, 2015 01:59
Show Gist options
  • Save mjamesruggiero/5184068 to your computer and use it in GitHub Desktop.
Save mjamesruggiero/5184068 to your computer and use it in GitHub Desktop.
# Fri Mar 15 21:31:59 PDT 2013
# want a histogram of queries
import re
import logging
import csv
def clean(line):
return remove_newlines(remove_times(sub_digits(line)))
def sub_digits(old_string, subst="x"):
"""substitute all digits with something else"""
return re.sub("\d+", subst, old_string)
def only_selects(lines):
"""filter out lines that are not SELECTs"""
for line in lines:
if "SELECT" in line:
yield unicode(line, "UTF-8")
def remove_times(log_line):
"""this could be better for the control chars"""
start_loc = log_line.find('SELECT')
end_loc = log_line.find(';')
return log_line[start_loc:end_loc]
def remove_newlines(old_string, subst=''):
return re.sub("\n", subst, old_string )
def make_histogram(lines):
"""make an unsorted query count"""
d = {}
for line in lines:
d[line] = d.get(line, 0) + 1
return d
def save_to_file(query_counts, threshold=0):
with open('repetitious_queries.csv', 'wa') as csvfile:
writer = csv.writer(csvfile)
for count, query in query_counts:
if count > threshold:
writer.writerow([query, count])
if __name__ == '__main__':
FORMAT = '%(asctime)-15s %(message)s'
logging.basicConfig(format=FORMAT)
logger = logging.getLogger('repetitious_queries')
logfile = '/Users/engineer/workspace/Creatorious/log/development.log'
histogram = {}
with open(logfile) as f:
selects = only_selects(f)
cleaned = [clean(line) for line in selects]
histogram = make_histogram(cleaned)
# and sort it
keys = histogram.keys()
sorted_count = [ (histogram[k], k) for k in keys ]
sorted_count.sort()
sorted_count.reverse()
WRITE_THAT_FILE = False
SUBSTRING_SIZE = 75
THRESHOLD = 10
for count, query in sorted_count:
if count > THRESHOLD:
print("{0}\t{1}".format(count, query[:SUBSTRING_SIZE]))
if WRITE_THAT_FILE:
save_to_file(sorted_count, threshold=THRESHOLD)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment