
@correl
Created March 28, 2014 14:58
Aggregate similar log file entries matching a search pattern
import sys
import argparse
from itertools import ifilter
from fuzzywuzzy import fuzz

MATCH_THRESHOLD = 90


def parse_file(filename, predicate=None):
    """Open a log file and aggregate its (optionally filtered) lines."""
    with open(filename) as f:
        return aggregate_errors(f, predicate)


def aggregate_errors(lines, predicate=None):
    """Fold lines into a dict keyed by the first line seen for each group."""
    return reduce(store_similar,
                  ifilter(predicate, lines) if predicate else lines,
                  {})


def store_similar(acc, string):
    """Add the line to its closest existing group, or start a new group."""
    match = find_similar(acc, string)
    if match:
        acc[match["key"]].append((match["ratio"], string))
    else:
        acc[string] = [(100, string)]
    return acc


def find_similar(acc, string):
    """Return the best-scoring existing key at or above MATCH_THRESHOLD."""
    results = map(lambda s: (fuzz.token_set_ratio(s, string), s),
                  acc.iterkeys())
    filtered = filter(lambda r: r[0] >= MATCH_THRESHOLD,
                      results)
    ranked = sorted(filtered)
    return dict(zip(["ratio", "key"], ranked[-1])) if ranked else None


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("searchstring")
    parser.add_argument("filename")
    parser.add_argument("-t", "--threshold",
                        type=int,
                        help="Fuzzy match percentage threshold")
    args = parser.parse_args()
    if args.threshold:
        MATCH_THRESHOLD = args.threshold
    aggregated = parse_file(args.filename,
                            lambda line: args.searchstring in line)
    for entry, matches in aggregated.iteritems():
        print("{:<10} {}".format(len(matches), entry))