Skip to content

Instantly share code, notes, and snippets.

@benbramley
Created June 29, 2015 12:00
Show Gist options
  • Save benbramley/18f941592e25e0816aad to your computer and use it in GitHub Desktop.
Save benbramley/18f941592e25e0816aad to your computer and use it in GitHub Desktop.
Fuzzy lookup in Splunk
import csv,sys
from fuzzywuzzy import fuzz
# File name of the lookup CSV; main() opens it under '../lookups/'.
LOOKUPCSV = 'somelookup.csv'
# Score that fuzz.ratio() must exceed for a lookup row to count as a match.
RATIOTHRESHOLD = 80
def output_results(results, mvdelim = '\n', output = sys.stdout):
    """Write a list of result dictionaries as CSV for the Splunk pipeline.

    Each dictionary in ``results`` represents a single result row. Any
    value that is a list is treated as a multivalue field: its encoded
    form (see ``encode_mv``) is stored under ``'__mv_' + key`` and the
    original key is replaced by the values joined with ``mvdelim``.
    The dictionaries are modified in place.

    The header row is the sorted union of the keys of all rows; a row
    that lacks one of those columns is written with an empty cell.

    Args:
        results: list of dicts, one per result row.
        mvdelim: separator used to flatten a multivalue list into one cell.
        output: writable text stream that receives the CSV. Defaults to
            the ``sys.stdout`` object bound when the function was defined.
    """
    # Collect all the unique field names, and convert every multivalue
    # key to the right form along the way.
    fields = set()
    for result in results:
        # Iterate over a snapshot of the keys: adding the '__mv_*' entries
        # while walking the live key view raises RuntimeError on Python 3.
        for key in list(result):
            value = result[key]
            if isinstance(value, list):
                result['__mv_' + key] = encode_mv(value)
                result[key] = mvdelim.join(value)
        fields.update(result)
    # Fixed, sorted column order so the output is deterministic.
    fields = sorted(fields)
    writer = csv.DictWriter(output, fields)
    # Write out the header, and then the actual results.
    writer.writeheader()
    writer.writerows(results)
def encode_mv(vals):
    """Encode a sequence of strings as a single multivalue string.

    Every value is wrapped in '$' and the wrapped values are separated
    using ';'. A literal '$' inside a value is represented with '$$'.
    An empty sequence encodes to the empty string.
    """
    wrapped = ['$' + item.replace('$', '$$') + '$' for item in vals]
    return ';'.join(wrapped)
def fuzzylookup(key,value,csvfile,threshold):
    """Return the first lookup row whose ``key`` column fuzzily matches ``value``.

    The lookup file is rewound and re-parsed as CSV on every call. Rows
    are scanned in file order; the first row whose ``fuzz.ratio`` score
    against ``value`` is strictly greater than ``threshold`` is returned
    as a dict. ``None`` is returned when no row qualifies.
    """
    # Rewind so the same open file can serve repeated lookups.
    csvfile.seek(0)
    candidates = csv.DictReader(csvfile)
    matches = (
        row for row in candidates
        if fuzz.ratio(value, row[key]) > threshold
    )
    return next(matches, None)
def main(input, output, argv):
    """Fuzzy-match each incoming CSV row against the lookup file.

    Rows are read from ``input`` as CSV. For every row, the field that
    carries a value is used both as the lookup column name and as the
    text to match. When ``fuzzylookup`` finds a row in the lookup file,
    that row (with the lookup column overwritten by the incoming value)
    is added to the results, which are finally written as CSV.

    Args:
        input: readable text stream with the CSV rows to look up.
        output: writable text stream that receives the result CSV.
        argv: command-line arguments; currently unused.
    """
    csv_in = csv.DictReader(input)
    threshold = RATIOTHRESHOLD
    result = []
    # Context manager so the lookup file is closed even if a row fails.
    with open('../lookups/' + LOOKUPCSV) as csvfile:
        for row in csv_in:
            # Automatically get the input field we are looking up - Splunk
            # sends all fields but we just want the one with a value set.
            # If several are populated, the last one wins.
            populated = [k for k, v in row.items() if v]
            if not populated:
                # Nothing to look up on this row; skipping it avoids a
                # NameError or reusing the previous row's field.
                continue
            key = populated[-1]
            value = row[key]
            # Fuzzy lookup
            match = fuzzylookup(key, value, csvfile, threshold)
            if match:
                # Report the incoming value rather than the lookup's
                # spelling in the matched column.
                match[key] = value
                result.append(match)
    # Honour the caller-supplied stream instead of always using stdout.
    output_results(result, output=output)
# Script entry point: hand stdin, stdout and the argument vector to main().
if __name__ == '__main__':
    main(sys.stdin, sys.stdout, sys.argv)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment