Skip to content

Instantly share code, notes, and snippets.

@rvprasad

rvprasad/recoverSchema.py

Last active Dec 20, 2015
Embed
What would you like to do?
While structured logs are easy to analyze, logs are most often unstructured (e.g. crond, SQL server). This code snippet demonstrates a simple language-based approach to recover the schema/structure of unstructured logs. The approach analyzes a set of logs (lines) and constructs schemas (regular expressions) that cover every log in the set. The a…
import re
def getVocabulary(wordFileName):
    """Load the vocabulary (one word per line) from *wordFileName*.

    Each line is stripped of surrounding whitespace. Blank and
    whitespace-only lines are skipped so that the empty string never ends
    up in the vocabulary; otherwise any token that reduces to '' after
    punctuation stripping would be treated as a known word.

    Returns a set of the distinct words found in the file.
    """
    with open(wordFileName) as wordFile:
        stripped = (line.strip() for line in wordFile)
        return {w for w in stripped if w}
import string
def isWordInVocabulary(word, vocabulary):
    """Return True if *word*, or a normalised form of it, is in *vocabulary*.

    Three case variants are tried: the word as given, lower-cased, and
    capitalised. For each variant the following forms are looked up:

    * the variant itself;
    * the variant with leading '(' and trailing ')', '.', ',', ';', ':'
      removed;
    * the variant with every "(s)" plural marker removed;
    * the "(s)"-free form with the same edge punctuation removed.

    Punctuation is trimmed only at the edges of the token. Characters inside
    the token are left alone, so "in.to" is NOT treated as the word "into".
    """
    def trimEdges(form):
        # Opening parentheses only on the left, closing punctuation only on
        # the right; interior characters are never touched.
        return form.lstrip('(').rstrip(').,;:')

    lword = word.lower()
    cword = string.capwords(lword)
    for form in (word, lword, cword):
        singular = form.replace('(s)', '')
        if form in vocabulary or \
           trimEdges(form) in vocabulary or \
           singular in vocabulary or \
           trimEdges(singular) in vocabulary:
            return True
    return False
def getSchemaRegex(schema, dynamicContent):
    """Replace each ``_<j>_`` placeholder in *schema* with a character-class regex.

    *dynamicContent* is a sequence whose j-th item is the collection of
    values observed for placeholder ``_<j>_``. For each placeholder a class
    ``[...]+`` is built that covers every character seen in those values:

    * letters/underscore only      -> ``a-zA-Z_``
    * digits only                  -> ``\\d``
    * both letters and digits      -> ``\\w``
    * every other character        -> itself, regex-escaped

    The other characters are emitted in sorted order so the output is the
    same from run to run. Static text in *schema* is returned unchanged.

    Returns the schema string with all placeholders substituted.
    """
    alphaSubregex = r'a-zA-Z_'
    alphaRegex = "[%s]" % alphaSubregex
    nonAlnumStripper = r'[%s\d]' % alphaSubregex
    for j, values in enumerate(dynamicContent):
        hasLetter = False
        hasDigit = False
        others = set()
        for v in values:
            hasLetter = hasLetter or re.search(alphaRegex, v) is not None
            hasDigit = hasDigit or re.search(r'\d', v) is not None
            # Whatever is neither letter/underscore nor digit must be listed
            # explicitly in the class.
            others |= set(re.sub(nonAlnumStripper, '', v))
        if hasLetter and hasDigit:
            # Mixed content needs a class that matches digits as well.
            classParts = [r'\w']
        elif hasLetter:
            classParts = [alphaSubregex]
        elif hasDigit:
            classParts = [r'\d']
        else:
            classParts = []
        classParts.extend(re.escape(c) for c in sorted(others))
        regex = "[%s]+" % ''.join(classParts)
        # Plain string replacement: the generated regex contains backslashes,
        # which re.sub would try to interpret as replacement-template escapes
        # (and reject as "bad escape" on Python 3.7+).
        schema = schema.replace("_%d_" % j, regex)
    return schema
def extractSchemaFreqAndDynContentFromFile(schemas2freqAndDynContent, logFile):
    """Fold the lines of *logFile* into *schemas2freqAndDynContent*.

    Every line whose stripped text consists solely of characters from
    ``string.printable`` is split on whitespace. Words accepted by
    ``isWordInVocabulary`` (checked against the module-level ``vocabulary``)
    are static; consecutive static words are joined with a space. Every other
    word is dynamic and is represented in the schema by ``_<k>_``, where k is
    its position among the dynamic words of that line.

    *schemas2freqAndDynContent* maps a schema string to a tuple
    ``(count, slots)``: the number of lines that produced the schema and a
    list holding, per placeholder, the set of words seen in that position.
    The mapping is updated in place; nothing is returned.
    """
    printable = string.printable

    for line in logFile:
        # Lines containing any non-printable character are ignored.
        if not all(ch in printable for ch in line.strip()):
            continue

        pieces = (piece.strip() for piece in re.split(r'\s', line))
        tokens = [token for token in pieces if token]

        parts = []
        staticRun = []
        dynamicWords = []
        for token in tokens:
            if isWordInVocabulary(token, vocabulary):
                staticRun.append(token)
                continue
            # A dynamic word closes the run of static words before it.
            if staticRun:
                parts.append(" ".join(staticRun))
                staticRun = []
            parts.append("_%d_" % len(dynamicWords))
            dynamicWords.append(token)
        if staticRun:
            parts.append(" ".join(staticRun))

        key = " ".join(parts)
        if not key:
            continue

        if key in schemas2freqAndDynContent:
            seen, slots = schemas2freqAndDynContent[key]
        else:
            seen, slots = 0, [set() for _ in dynamicWords]
        for position, token in enumerate(dynamicWords):
            slots[position].add(token)
        schemas2freqAndDynContent[key] = (seen + 1, slots)
import sys
# Usage: recoverSchema.py <vocabulary-file> <file-listing-log-file-names>
#   sys.argv[1] - file with one vocabulary word per line
#   sys.argv[2] - file with one log file name per line
# `vocabulary` must stay a module-level name: it is read by
# extractSchemaFreqAndDynContentFromFile.
vocabulary = getVocabulary(sys.argv[1])
schemas2freqAndDynContent = {}
with open(sys.argv[2]) as logFileNamesFile:
    for logfileName in (l.strip() for l in logFileNamesFile):
        # A blank line is not a file name; open('') would raise.
        if not logfileName:
            continue
        with open(logfileName) as logFile:
            extractSchemaFreqAndDynContentFromFile(schemas2freqAndDynContent, logFile)
# Report one line per schema, in sorted schema order: index:frequency:regex
for i, schema in enumerate(sorted(schemas2freqAndDynContent)):
    frequency, dynamicContent = schemas2freqAndDynContent[schema]
    schemaRegex = getSchemaRegex(schema, dynamicContent)
    print("%d:%d:%s" % (i, frequency, schemaRegex))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment