Skip to content

Instantly share code, notes, and snippets.

@rvprasad
Last active December 20, 2015 04:58
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rvprasad/6074308 to your computer and use it in GitHub Desktop.
Save rvprasad/6074308 to your computer and use it in GitHub Desktop.
While structured logs are easy to analyze, logs are most often unstructured (e.g., crond, SQL Server). This code snippet demonstrates a simple language-based approach to recover the schema/structure of unstructured logs. The approach analyzes a set of logs (lines) and constructs schemas (regular expressions) that cover every log in the set. …
import re
def getVocabulary(wordFileName):
    """Load the set of known words from a word-list file.

    Every line of the file contributes one entry after surrounding
    whitespace is stripped.

    Args:
        wordFileName: Path of a text file holding one word per line.

    Returns:
        A set of the distinct, stripped, non-empty lines of the file.
    """
    with open(wordFileName) as wordFile:
        strippedLines = (line.strip() for line in wordFile)
        # Blank lines are skipped: an empty-string entry would make any token
        # that reduces to '' after punctuation stripping (e.g. ":") look like
        # a known word.
        return {word for word in strippedLines if word}
import string
def isWordInVocabulary(word, vocabulary):
    """Report whether a token should be treated as a known word.

    The token is looked up as written, lower-cased, and capitalised. If none
    of those match, the lookup is repeated after dropping leading "("
    characters and trailing ")", ".", ",", ";" or ":" characters, and finally
    after removing any "(s)" plural marker.

    Args:
        word: The token to classify.
        vocabulary: Container of known words (anything supporting ``in``).

    Returns:
        True if any variant of the token is in ``vocabulary``, else False.
    """
    lword = word.lower()
    variants = (word, lword, string.capwords(lword))

    if any(v in vocabulary for v in variants):
        return True

    # Only the edges of the token are trimmed. Punctuation inside the token
    # is kept, so "he.llo" is not silently collapsed into "hello".
    if any(v.lstrip('(').rstrip(').,;:') in vocabulary for v in variants):
        return True

    # "file(s)" should match the vocabulary entry "file".
    return any(v.replace('(s)', '') in vocabulary for v in variants)
def getSchemaRegex(schema, dynamicContent):
    """Turn a schema with ``_N_`` placeholders into a regular expression.

    For placeholder N, the characters of every value in ``dynamicContent[N]``
    are summarised as a character class: ``a-zA-Z_`` when only letters or
    underscores were seen, ``\\d`` when only digits were seen, ``\\w`` when
    both were seen, plus each remaining character escaped literally. The
    placeholder is replaced by that class followed by ``+``. Text outside
    the placeholders is copied unchanged.

    Args:
        schema: Schema string in which ``_N_`` marks the N-th dynamic slot.
        dynamicContent: Sequence whose N-th item is an iterable of the
            strings observed in slot N.

    Returns:
        The schema with every placeholder replaced by a ``[...]+`` class.
    """
    alphaSubregex = 'a-zA-Z_'
    letterRegex = re.compile('[%s]' % alphaSubregex)
    digitRegex = re.compile(r'\d')

    for j, values in enumerate(dynamicContent):
        hasLetter = False
        hasDigit = False
        otherChars = set()
        for v in values:
            for ch in v:
                if letterRegex.match(ch):
                    hasLetter = True
                elif digitRegex.match(ch):
                    hasDigit = True
                else:
                    otherChars.add(ch)

        if hasLetter and hasDigit:
            # Both kinds were observed, so the class must accept both.
            classParts = [r'\w']
        elif hasLetter:
            classParts = [alphaSubregex]
        elif hasDigit:
            classParts = [r'\d']
        else:
            classParts = []
        # Sorted so the generated expression is the same on every run.
        classParts.extend(re.escape(ch) for ch in sorted(otherChars))

        if classParts:
            regex = '[%s]+' % ''.join(classParts)
        else:
            # No characters were observed; "[]+" would not be a valid
            # pattern, so fall back to "any run of non-whitespace".
            regex = r'\S+'

        # Plain string replacement: re.sub would parse the backslashes in
        # ``regex`` as a replacement template and reject escapes like "\d".
        schema = schema.replace('_%d_' % j, regex)
    return schema
def extractSchemaFreqAndDynContentFromFile(schemas2freqAndDynContent, logFile):
    """Fold the lines of one log into the schema table.

    Each line made up solely of printable characters is split on whitespace.
    Runs of vocabulary words stay in the schema as text, while every other
    token becomes a numbered ``_N_`` placeholder. For the resulting schema
    the table keeps how many lines produced it and, per placeholder, the set
    of tokens seen in that position.

    Args:
        schemas2freqAndDynContent: Dict mapping a schema string to a
            ``(count, [set, ...])`` tuple; it is updated in place.
        logFile: Iterable of log lines.
    """
    printableChars = string.printable

    def isPrintable(l):
        for ch in l.strip():
            if ch not in printableChars:
                return False
        return True

    for line in logFile:
        if not isPrintable(line):
            continue

        parts = []
        staticRun = []
        dynamicWords = []
        for piece in re.split(r'\s', line):
            token = piece.strip()
            if not token:
                continue
            if isWordInVocabulary(token, vocabulary):
                staticRun.append(token)
                continue
            # A dynamic token closes the current run of static words.
            if staticRun:
                parts.append(" ".join(staticRun))
                staticRun = []
            parts.append("_%d_" % len(dynamicWords))
            dynamicWords.append(token)
        if staticRun:
            parts.append(" ".join(staticRun))

        schema = " ".join(parts)
        if not schema:
            continue

        if schema in schemas2freqAndDynContent:
            count, dyn = schemas2freqAndDynContent[schema]
        else:
            # First sighting: one empty value set per placeholder.
            count, dyn = 0, [set() for _ in dynamicWords]
        for position, token in enumerate(dynamicWords):
            dyn[position].add(token)
        schemas2freqAndDynContent[schema] = (count + 1, dyn)
import sys
# Usage: <script> <word-list file> <file listing one log path per line>
vocabulary = getVocabulary(sys.argv[1])
schemas2freqAndDynContent = {}

with open(sys.argv[2]) as logFileNamesFile:
    logfileNames = [rawName.strip() for rawName in logFileNamesFile]
    for logfileName in logfileNames:
        with open(logfileName) as logFile:
            extractSchemaFreqAndDynContentFromFile(
                schemas2freqAndDynContent, logFile)

# Print one "index:frequency:regex" line per schema, in sorted schema order.
orderedSchemas = sorted(schemas2freqAndDynContent)
for i, schema in enumerate(orderedSchemas):
    frequency, dynamicContent = schemas2freqAndDynContent[schema]
    schemaRegex = getSchemaRegex(schema, dynamicContent)
    print("%d:%d:%s" % (i, frequency, schemaRegex))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment