Last active
December 20, 2015 04:58
-
-
Save rvprasad/6074308 to your computer and use it in GitHub Desktop.
While structured logs are easy to analyze, logs are most often unstructured (e.g., crond, SQL Server). This code snippet demonstrates a simple language-based approach to recovering the schema/structure of unstructured logs. The approach analyzes a set of logs (lines) and constructs schemas (regular expressions) that cover every log in the set.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
def getVocabulary(wordFileName):
    """Load the vocabulary from a word-list file.

    Each line of the file contributes one entry, with surrounding whitespace
    stripped.  Blank (or whitespace-only) lines are skipped: they would
    otherwise put the empty string into the vocabulary, and
    isWordInVocabulary would then accept punctuation-only tokens such as
    "..." because they reduce to "" once their punctuation is removed.

    Args:
        wordFileName: path of a text file holding one word per line.

    Returns:
        A set of the non-empty, stripped lines of the file.
    """
    with open(wordFileName) as wordFile:
        stripped = (line.strip() for line in wordFile)
        return {word for word in stripped if word}
import string | |
def isWordInVocabulary(word, vocabulary):
    """Report whether *word*, or a normalized form of it, is in *vocabulary*.

    Three spellings are tried: the word as given, lower-cased, and
    capitalized via string.capwords.  Each spelling is looked up verbatim
    first; then with '(' , ')', '.', ',', ';' and ':' characters removed;
    and finally with any "(s)" suffix marker removed.

    Args:
        word: the token to test.
        vocabulary: container supporting the ``in`` operator.

    Returns:
        True as soon as any variant is found, otherwise False.
    """
    lowered = word.lower()
    spellings = (word, lowered, string.capwords(lowered))

    # Pass 1: the spellings exactly as they are.
    if any(candidate in vocabulary for candidate in spellings):
        return True

    # Pass 2 and 3: the spellings after each clean-up pattern is applied.
    cleanupPatterns = (
        r'[\(]*(.*?)[\)\.,;:]*',  # drop parentheses and sentence punctuation
        r'(.*?)\(s\)',            # drop an optional-plural "(s)"
    )
    for pattern in cleanupPatterns:
        for candidate in spellings:
            if re.sub(pattern, r'\1', candidate) in vocabulary:
                return True
    return False
def getSchemaRegex(schema, dynamicContent):
    """Turn a schema template into a regular expression.

    Every placeholder ``_j_`` in *schema* is replaced by a character-class
    pattern ``[...]+`` built from the characters seen in
    ``dynamicContent[j]``.  Static text in *schema* is inserted as-is (it is
    not regex-escaped).

    The class is assembled as follows:
      * ``\\w`` if any value contains an ASCII letter or underscore (this
        also covers digits, so mixed letter/digit values still match);
      * otherwise ``\\d`` if any value contains a digit;
      * followed by every other character seen, escaped and sorted so the
        output is deterministic.

    Args:
        schema: template string containing ``_0_``, ``_1_``, ... markers.
        dynamicContent: sequence whose j-th item is an iterable of the
            strings observed for placeholder ``_j_``.

    Returns:
        The schema with each placeholder substituted.  A placeholder whose
        value collection yields no characters is left untouched, because an
        empty character class (``[]+``) is not a valid regex.
    """
    alphaRegex = re.compile(r'[a-zA-Z_]')
    digitRegex = re.compile(r'\d')
    # Single-character stand-ins; after the two substitutions below no real
    # letter or digit remains in a value, so they cannot collide with data.
    letter = 'l'
    digit = 'd'

    for j, values in enumerate(dynamicContent):
        chars = set()
        for v in values:
            chars.update(digitRegex.sub(digit, alphaRegex.sub(letter, v)))

        hasLetter = letter in chars
        hasDigit = digit in chars
        chars.discard(letter)
        chars.discard(digit)

        if hasLetter:
            classes = [r'\w']
        elif hasDigit:
            classes = [r'\d']
        else:
            classes = []
        classes.extend(re.escape(c) for c in sorted(chars))
        if not classes:
            continue

        regex = "[%s]+" % ''.join(classes)
        # Plain string replacement: using re.sub here would interpret the
        # backslashes in `regex` as replacement-template escapes, which
        # raises on Python 3.7+ and un-escapes "\\\\" on older versions.
        schema = schema.replace("_%d_" % j, regex)
    return schema
def extractSchemaFreqAndDynContentFromFile(schemas2freqAndDynContent, logFile):
    """Fold the lines of *logFile* into the schema table.

    Each line made up solely of ``string.printable`` characters is split on
    whitespace.  Consecutive vocabulary words are kept as static text, while
    every other token becomes a numbered placeholder ``_k_``.  The resulting
    schema string is the key into *schemas2freqAndDynContent*, whose value is
    a ``(count, slots)`` tuple: ``count`` is how many lines produced that
    schema and ``slots[k]`` is the set of tokens seen for placeholder ``_k_``.

    Uses the module-level ``vocabulary`` for the word lookups.

    Args:
        schemas2freqAndDynContent: dict updated in place.
        logFile: iterable of text lines.
    """
    printable = string.printable

    for line in logFile:
        # Ignore lines that contain any non-printable character.
        if any(ch not in printable for ch in line.strip()):
            continue

        pieces = []         # schema fragments, in order
        staticRun = []      # vocabulary words not yet flushed into pieces
        dynamicWords = []   # tokens replaced by placeholders, in order

        for token in re.split(r'\s', line):
            token = token.strip()
            if not token:
                continue
            if isWordInVocabulary(token, vocabulary):
                staticRun.append(token)
                continue
            # A dynamic token ends the current run of static words.
            if staticRun:
                pieces.append(" ".join(staticRun))
                staticRun = []
            pieces.append("_%d_" % len(dynamicWords))
            dynamicWords.append(token)

        if staticRun:
            pieces.append(" ".join(staticRun))

        key = " ".join(pieces)
        if not key:
            continue

        # First sighting registers a zero count and one empty set per slot.
        seen, slots = schemas2freqAndDynContent.setdefault(
            key, (0, [set() for _ in dynamicWords]))
        for position, token in enumerate(dynamicWords):
            slots[position].add(token)
        schemas2freqAndDynContent[key] = (seen + 1, slots)
import sys | |
# Usage: <script> VOCABULARY_FILE LOG_LIST_FILE
#   VOCABULARY_FILE  one known word per line
#   LOG_LIST_FILE    one log-file path per line
vocabulary = getVocabulary(sys.argv[1])
schemas2freqAndDynContent = {}

# Accumulate schemas over every log file named in the list file.
with open(sys.argv[2]) as logFileNamesFile:
    for rawName in logFileNamesFile:
        with open(rawName.strip()) as logFile:
            extractSchemaFreqAndDynContentFromFile(schemas2freqAndDynContent, logFile)

# Report "<index>:<frequency>:<regex>" for each schema, in sorted schema order.
for index, schemaKey in enumerate(sorted(schemas2freqAndDynContent)):
    lineCount, slotValues = schemas2freqAndDynContent[schemaKey]
    print("%d:%d:%s" % (index, lineCount, getSchemaRegex(schemaKey, slotValues)))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment