Skip to content

Instantly share code, notes, and snippets.

@GedowFather
Last active Aug 8, 2021
Embed
What would you like to do?
Python scirpt for counting slow query log and sort it.
#
# Count slow query log and sort it.
# Result is output to stdout.
#
# Log example:
"""
/rdsdbbin/oscar/bin/mysqld, Version: 5.6.10-log (MySQL Community Server (GPL)). started with:
Tcp port: 3306 Unix socket: /tmp/mysql.sock
Time Id Command Argument
# Time: 201006 6:05:01
# User@Host: test[test] @ [10.0.1.2] Id: 11198216
# Query_time: 1.638045 Lock_time: 0.000057 Rows_sent: 1 Rows_examined: 837061
use test_database;
SET timestamp=1601964301;
SELECT ...;
# Time: 201006 6:05:02
# User@Host: test[test] @ [10.0.1.2] Id: 11198216
# Query_time: 0.676726 Lock_time: 0.000215 Rows_sent: 0 Rows_examined: 325854
SET timestamp=1601964302;
SELECT ...;
"""
import os
import textwrap
from datetime import datetime
from SqlLogger import counterSlowQuery
#
# Config
#
ranking = 100
hostname = 'example-db-01-01'
filename = 'example-mysql-slowquery_20210701-2100.log'
#
# Read log
#
counters = {}
counters[hostname] = counterSlowQuery(hostname)
f = open(filename, "r")
log = f.read()
f.close()
counters[hostname].putLog(log)
#
# Sort and cut ranking
#
all_queries = []
for hostname,counter in list(counters.items()):
all_queries.extend(counter.getQueries())
if not all_queries:
print("Not found slow query.")
quit()
rank_queries = sorted(all_queries, key=lambda x:x['total_query_time'], reverse=True)
rank_count = len(rank_queries)
#
# Message
#
message = f"""
Reporting slow query log.
Found {rank_count} slow rank queries.
Sorted rank query by total time and picked up top parts.
"""
number = 0
datetime_format = "%Y-%m-%dT%H:%M:%SZ"
for q in rank_queries[:ranking]:
number += 1
oldest_datetime = datetime.fromtimestamp(q['oldest_unix_time']).strftime(datetime_format)
latest_datetime = datetime.fromtimestamp(q['latest_unix_time']).strftime(datetime_format)
message += textwrap.dedent(f"""
#
# {number}
#
Host : {q['host']}
Database : {q['database']}
Found Time : {oldest_datetime} - {latest_datetime}
Query Count : {q['count']}
Total Time : {round(q['total_query_time'], 1)}
Query Time : {round(q['min_query_time'], 2)} - {round(q['max_query_time'], 2)}
Lock Time : {round(q['min_lock_time'], 2)} - { round(q['max_lock_time'], 2)}
Target Rows : {q['min_rows']} - {q['max_rows']}
Query :
{q['unique_query']}
""")
print(message)
#
# スロークエリログをユニーク化整理します
#
import re
import hashlib
class counterSlowQuery:
init_keep_info = {
'unix_time' : None,
'src_ip' : None,
'query_time': None,
'lock_time' : None,
'rows' : None,
'query' : "",
}
def __init__(self, host):
self.host = host
self.queries = {}
self.keep_info = {}
self.keep_info.update(self.init_keep_info)
self.next_is_query = False
self.selected_database = None
return
def getQueries(self):
return list(self.queries.values())
def putLog(self, log):
for line in log.splitlines():
self.sortLog(line)
return
def sortLog(self, line):
if re.match(r"[/T]", line):
return
elif re.match("# Th", line):
return
elif re.match("# F", line):
return
elif re.match("# Time", line):
self.next_is_query = False
self.keep_info.update(self.init_keep_info)
elif re.match("# User", line):
match = re.match(r".*@.*\[([0-9.]+)\].*", line)
if not match: return
src_ip = match.group(1)
self.keep_info['src_ip'] = src_ip
elif re.match("# Query", line):
regex = r"^.*Query_time: *([0-9.]+) *Lock_time: *([0-9.]+) *.*Rows_examined: *([0-9]+).*$"
match = re.match(regex, line, flags=re.IGNORECASE)
if not match: return
query_time = match.group(1)
lock_time = match.group(2)
rows = match.group(3)
self.keep_info['query_time'] = float(query_time)
self.keep_info['lock_time'] = float(lock_time)
self.keep_info['rows'] = int(rows)
elif re.match(r"^use ", line, flags=re.IGNORECASE):
match = re.match(r"^use +([^;]+);?$", line, flags=re.IGNORECASE)
if not match: return
self.selected_database = match.group(1)
elif re.match("SET time", line, flags=re.IGNORECASE):
if self.keep_info['query_time'] is None:
self.next_is_query = False
self.keep_info.update(self.init_keep_info)
print("Not found comment out information")
else:
match = re.match(r"^SET timestamp=([0-9]+);?$", line, flags=re.IGNORECASE)
if not match: return
self.next_is_query = True
self.keep_info['unix_time'] = int(match.group(1))
elif self.next_is_query is True:
self.keep_info['query'] += line
if not line: return
if line[-1] == ";" or line[-2] == ";":
self.saveQueryInfo()
self.keep_info.update(self.init_keep_info)
else:
self.keep_info['query'] += " "
else:
print("Found invalid line (%s)" % line)
print("Keep info is")
print(self.keep_info)
return
def saveQueryInfo(self):
if not (self.keep_info['unix_time'] and self.keep_info['query_time']
and self.keep_info['lock_time'] and self.keep_info['rows']):
return
unique_query = self.uniqueQuery(self.keep_info['query'])
query_hash = hashlib.md5(unique_query.encode('utf-8')).hexdigest()[0:8]
if query_hash not in self.queries:
self.queries[query_hash] = {
'hash' : query_hash,
'count' : 0,
'host' : self.host,
'src_ip' : self.keep_info['src_ip'],
'database' : self.selected_database,
'unique_query' : unique_query,
'oldest_unix_time': self.keep_info['unix_time'],
'latest_unix_time': self.keep_info['unix_time'],
'min_query_time' : self.keep_info['query_time'],
'max_query_time' : self.keep_info['query_time'],
'min_lock_time' : self.keep_info['lock_time'],
'max_lock_time' : self.keep_info['lock_time'],
'min_rows' : self.keep_info['rows'],
'max_rows' : self.keep_info['rows'],
'total_query_time': 0.0,
}
self.queries[query_hash]['count'] += 1
if self.queries[query_hash]['src_ip'] is None and self.keep_info['src_ip'] is not None:
self.queries[query_hash]['src_ip'] = self.keep_info['src_ip']
if self.queries[query_hash]['database'] is None and self.selected_database is not None:
self.queries[query_hash]['database'] = self.selected_database
if self.keep_info['query_time']:
self.queries[query_hash]['total_query_time'] += self.keep_info['query_time']
if self.keep_info['unix_time'] < self.queries[query_hash]['oldest_unix_time']:
self.queries[query_hash]['oldest_unix_time'] = self.keep_info['unix_time']
elif self.keep_info['unix_time'] > self.queries[query_hash]['oldest_unix_time']:
self.queries[query_hash]['latest_unix_time'] = self.keep_info['unix_time']
if self.keep_info['query_time'] < self.queries[query_hash]['min_query_time']:
self.queries[query_hash]['min_query_time'] = self.keep_info['query_time']
elif self.keep_info['query_time'] > self.queries[query_hash]['max_query_time']:
self.queries[query_hash]['max_query_time'] = self.keep_info['query_time']
if self.keep_info['lock_time'] < self.queries[query_hash]['min_lock_time']:
self.queries[query_hash]['min_lock_time'] = self.keep_info['lock_time']
elif self.keep_info['lock_time'] > self.queries[query_hash]['max_lock_time']:
self.queries[query_hash]['max_lock_time'] = self.keep_info['lock_time']
if self.keep_info['rows'] < self.queries[query_hash]['min_rows']:
self.queries[query_hash]['min_rows'] = self.keep_info['rows']
elif self.keep_info['rows'] > self.queries[query_hash]['max_rows']:
self.queries[query_hash]['max_rows'] = self.keep_info['rows']
return
def uniqueQuery(self, query):
query = query.rstrip(';')
# 日本語は削除して代替文字列に置換(カット前にしないと中途半端になる)
ja_regex = r'[0-9ぁ-んァ-ン一-龥,.、。ー-・]+'
query = re.sub(ja_regex, '[ja]', query, flags=re.IGNORECASE)
# クエリのNormalize
# - 複数スペースを単スペースに置換
# - 変数値を ? に置換
# > 日付と文字列を分けるために一度 !TIMESTAMP! に変換して戻す
# > IN , VALUES などの複数形は ,... に変換
replacements = [
[r" {2,}", " "],
[r"'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}'", "!TIMESTAMP!"],
[r"'\d{4}-\d{2}-\d{2}'", "!DATE!"],
[r"'[^']*'", "'?'"],
[r"!TIMESTAMP!", "'????-??-?? ??:??:??'"],
[r"!DATE!", "'????-??-??'"],
[r"([ (,=])-*[\d.]+", r'\1?'],
[r" (IN) ([^\)]+)", r' \1 (?,...)'],
[r" (VALUES) (\([^\)]+\),).*", r' \1 \2...'],
]
for r in replacements:
query = re.sub(r[0], r[1], query, flags=re.IGNORECASE)
return query
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment