# rvprasad/recoverSchema.py

## recoverSchema.py
import re

def getVocabulary(wordFileName):
    """Load the vocabulary stored in a text file.

    Every line of the file contributes one entry: the line with its
    leading and trailing whitespace removed.  A blank line therefore
    contributes the empty string.

    :param wordFileName: path of the file to read.
    :return: a ``set`` holding the distinct stripped lines.
    """
    with open(wordFileName) as wordFile:
        return {entry.strip() for entry in wordFile}

import string

def isWordInVocabulary(word, vocabulary):
    """Tell whether a token is a known vocabulary word.

    The token is tried as given, lower-cased, and capitalised with
    ``string.capwords``.  Each of those variants is looked up in these
    forms:

    1. unchanged;
    2. with leading ``(`` and trailing ``)``, ``.``, ``,``, ``;``, ``:``
       removed;
    3. with every ``(s)`` optional-plural marker removed, both as is and
       with the wrapping punctuation of form 2 removed as well.

    Punctuation is trimmed only at the two ends of the token, so interior
    characters are preserved (``in.to`` is not treated as ``into``).

    :param word: token to classify.
    :param vocabulary: container of known words supporting ``in``.
    :return: ``True`` if any form of the token is in *vocabulary*,
        ``False`` otherwise.
    """
    def trim(text):
        # Drop opening parentheses on the left and closing punctuation on
        # the right; nothing inside the token is touched.
        return text.lstrip('(').rstrip(').,;:')

    lword = word.lower()
    cword = string.capwords(lword)
    for variant in (word, lword, cword):
        if variant in vocabulary or trim(variant) in vocabulary:
            return True
        # "file(s)" -> "file"; trimming afterwards also covers "file(s),".
        singular = variant.replace('(s)', '')
        if singular in vocabulary or trim(singular) in vocabulary:
            return True
    return False

def getSchemaRegex(schema, dynamicContent):
    r"""Turn a schema with ``_N_`` placeholders into a regular expression.

    For each position ``j`` of *dynamicContent*, the characters of the
    values observed at that position are summarised as a character class
    and every ``_j_`` in *schema* is replaced by ``[<class>]+``:

    * letters (``a-zA-Z_``) and digits both seen -> ``\w``
    * only letters seen                          -> ``a-zA-Z_``
    * only digits seen                           -> ``\d``
    * every other character is added literally, escaped with
      ``re.escape`` and in sorted order so the result is deterministic.

    A position for which no character was observed keeps its placeholder,
    because ``[]+`` is not a valid regular expression.  The text of
    *schema* outside the placeholders is copied through unchanged.

    :param schema: schema string containing ``_0_``, ``_1_``, ...
        placeholders.
    :param dynamicContent: sequence whose ``j``-th item is an iterable of
        the strings observed for placeholder ``_j_``.
    :return: *schema* with the placeholders replaced by character classes.
    """
    alphaSubregex = r'a-zA-Z_'
    alphaRegex = "[%s]" % alphaSubregex
    letter = 'l'
    digit = 'd'
    for j, values in enumerate(dynamicContent):
        # Reduce every value to its character kinds: all letters become the
        # marker `letter`, all digits the marker `digit`; anything else is
        # kept as a literal.  Literal 'l'/'d' cannot survive this step, so
        # the markers are unambiguous.
        chars = set()
        for v in values:
            chars.update(re.sub(r'\d', digit, re.sub(alphaRegex, letter, v)))
        if not chars:
            continue
        hasLetter = letter in chars
        hasDigit = digit in chars
        chars -= {letter, digit}
        if hasLetter and hasDigit:
            classBody = r'\w'
        elif hasLetter:
            classBody = alphaSubregex
        elif hasDigit:
            classBody = r'\d'
        else:
            classBody = ''
        classBody += ''.join(sorted(re.escape(c) for c in chars))
        regex = "[%s]+" % classBody
        # Plain string replacement: passing `regex` to re.sub as the
        # replacement would reinterpret its backslashes as template
        # escapes (and `\d` / `\w` are rejected there on Python 3.7+).
        schema = schema.replace("_%d_" % j, regex)
    return schema

def extractSchemaFreqAndDynContentFromFile(schemas2freqAndDynContent, logFile):
    """Fold the lines of one log into the schema table.

    Lines whose stripped text contains a character outside
    ``string.printable`` are skipped.  Every other line is split on
    whitespace; words accepted by ``isWordInVocabulary`` (checked against
    the module-level ``vocabulary``) are static text, and every other word
    is replaced by a placeholder ``_k_`` where ``k`` is the number of
    dynamic words met earlier on the same line.  Runs of static words and
    placeholders are joined with single spaces to form the schema.

    :param schemas2freqAndDynContent: dict updated in place.  It maps a
        schema string to a tuple ``(count, slots)`` where ``count`` is the
        number of lines with that schema and ``slots[k]`` is the set of
        words seen for placeholder ``_k_``.
    :param logFile: iterable of text lines.
    :return: ``None``.
    """
    for rawLine in logFile:
        if any(ch not in string.printable for ch in rawLine.strip()):
            continue

        schemaParts = []
        staticRun = []
        dynamicWords = []
        for piece in re.split(r'\s', rawLine):
            token = piece.strip()
            if not token:
                continue
            if isWordInVocabulary(token, vocabulary):
                staticRun.append(token)
                continue
            # A dynamic word ends the current run of static words.
            if staticRun:
                schemaParts.append(" ".join(staticRun))
                staticRun = []
            schemaParts.append("_%d_" % len(dynamicWords))
            dynamicWords.append(token)
        if staticRun:
            schemaParts.append(" ".join(staticRun))

        schemaKey = " ".join(schemaParts)
        if not schemaKey:
            continue

        if schemaKey in schemas2freqAndDynContent:
            seen, slots = schemas2freqAndDynContent[schemaKey]
        else:
            seen, slots = 0, [set() for _ in dynamicWords]
        for position, token in enumerate(dynamicWords):
            slots[position].add(token)
        schemas2freqAndDynContent[schemaKey] = (seen + 1, slots)

import sys

# Known words come from the file named by the first command-line argument.
vocabulary = getVocabulary(sys.argv[1])

# The second argument names a file listing one log file path per line;
# every listed log is folded into the same schema table.
schemas2freqAndDynContent = {}
with open(sys.argv[2]) as logFileNamesFile:
    for logfileName in map(str.strip, logFileNamesFile.readlines()):
        with open(logfileName) as logFile:
            extractSchemaFreqAndDynContentFromFile(schemas2freqAndDynContent, logFile)

# Report the schemas in sorted order as "<index>:<frequency>:<regex>".
for i, schema in enumerate(sorted(schemas2freqAndDynContent)):
    frequency, dynamicContent = schemas2freqAndDynContent[schema]
    schemaRegex = getSchemaRegex(schema, dynamicContent)
    print("%d:%d:%s" % (i, frequency, schemaRegex))
	import re

	def getVocabulary(wordFileName):
	    """Load the vocabulary stored in a text file.

	    Every line of the file contributes one entry: the line with its
	    leading and trailing whitespace removed.  A blank line therefore
	    contributes the empty string.

	    :param wordFileName: path of the file to read.
	    :return: a ``set`` holding the distinct stripped lines.
	    """
	    with open(wordFileName) as wordFile:
	        return {entry.strip() for entry in wordFile}

	import string

	def isWordInVocabulary(word, vocabulary):
	    """Tell whether a token is a known vocabulary word.

	    The token is tried as given, lower-cased, and capitalised with
	    ``string.capwords``.  Each of those variants is looked up in these
	    forms:

	    1. unchanged;
	    2. with leading ``(`` and trailing ``)``, ``.``, ``,``, ``;``, ``:``
	       removed;
	    3. with every ``(s)`` optional-plural marker removed, both as is and
	       with the wrapping punctuation of form 2 removed as well.

	    Punctuation is trimmed only at the two ends of the token, so interior
	    characters are preserved (``in.to`` is not treated as ``into``).

	    :param word: token to classify.
	    :param vocabulary: container of known words supporting ``in``.
	    :return: ``True`` if any form of the token is in *vocabulary*,
	        ``False`` otherwise.
	    """
	    def trim(text):
	        # Drop opening parentheses on the left and closing punctuation on
	        # the right; nothing inside the token is touched.
	        return text.lstrip('(').rstrip(').,;:')

	    lword = word.lower()
	    cword = string.capwords(lword)
	    for variant in (word, lword, cword):
	        if variant in vocabulary or trim(variant) in vocabulary:
	            return True
	        # "file(s)" -> "file"; trimming afterwards also covers "file(s),".
	        singular = variant.replace('(s)', '')
	        if singular in vocabulary or trim(singular) in vocabulary:
	            return True
	    return False

	def getSchemaRegex(schema, dynamicContent):
	    r"""Turn a schema with ``_N_`` placeholders into a regular expression.

	    For each position ``j`` of *dynamicContent*, the characters of the
	    values observed at that position are summarised as a character class
	    and every ``_j_`` in *schema* is replaced by ``[<class>]+``:

	    * letters (``a-zA-Z_``) and digits both seen -> ``\w``
	    * only letters seen                          -> ``a-zA-Z_``
	    * only digits seen                           -> ``\d``
	    * every other character is added literally, escaped with
	      ``re.escape`` and in sorted order so the result is deterministic.

	    A position for which no character was observed keeps its placeholder,
	    because ``[]+`` is not a valid regular expression.  The text of
	    *schema* outside the placeholders is copied through unchanged.

	    :param schema: schema string containing ``_0_``, ``_1_``, ...
	        placeholders.
	    :param dynamicContent: sequence whose ``j``-th item is an iterable of
	        the strings observed for placeholder ``_j_``.
	    :return: *schema* with the placeholders replaced by character classes.
	    """
	    alphaSubregex = r'a-zA-Z_'
	    alphaRegex = "[%s]" % alphaSubregex
	    letter = 'l'
	    digit = 'd'
	    for j, values in enumerate(dynamicContent):
	        # Reduce every value to its character kinds: all letters become
	        # the marker `letter`, all digits the marker `digit`; anything
	        # else is kept as a literal.
	        chars = set()
	        for v in values:
	            chars.update(re.sub(r'\d', digit, re.sub(alphaRegex, letter, v)))
	        if not chars:
	            continue
	        hasLetter = letter in chars
	        hasDigit = digit in chars
	        chars -= {letter, digit}
	        if hasLetter and hasDigit:
	            classBody = r'\w'
	        elif hasLetter:
	            classBody = alphaSubregex
	        elif hasDigit:
	            classBody = r'\d'
	        else:
	            classBody = ''
	        classBody += ''.join(sorted(re.escape(c) for c in chars))
	        regex = "[%s]+" % classBody
	        # Plain string replacement: passing `regex` to re.sub as the
	        # replacement would reinterpret its backslashes as template
	        # escapes (and `\d` / `\w` are rejected there on Python 3.7+).
	        schema = schema.replace("_%d_" % j, regex)
	    return schema

	def extractSchemaFreqAndDynContentFromFile(schemas2freqAndDynContent, logFile):
	    """Fold the lines of one log into the schema table.

	    Lines whose stripped text contains a character outside
	    ``string.printable`` are skipped.  Every other line is split on
	    whitespace; words accepted by ``isWordInVocabulary`` (checked against
	    the module-level ``vocabulary``) are static text, and every other
	    word is replaced by a placeholder ``_k_`` where ``k`` is the number
	    of dynamic words met earlier on the same line.  Runs of static words
	    and placeholders are joined with single spaces to form the schema.

	    :param schemas2freqAndDynContent: dict updated in place.  It maps a
	        schema string to a tuple ``(count, slots)`` where ``count`` is
	        the number of lines with that schema and ``slots[k]`` is the set
	        of words seen for placeholder ``_k_``.
	    :param logFile: iterable of text lines.
	    :return: ``None``.
	    """
	    def isPrintable(l):
	        return all(x in string.printable for x in l.strip())

	    for line in filter(isPrintable, logFile):
	        schema = []
	        currStatic = []
	        currDynamic = []
	        for word in (w.strip() for w in re.split(r'\s', line)):
	            if not word:
	                continue
	            if isWordInVocabulary(word, vocabulary):
	                currStatic.append(word)
	            else:
	                # A dynamic word ends the current run of static words.
	                if currStatic:
	                    schema.append(" ".join(currStatic))
	                    currStatic = []
	                schema.append("_%d_" % len(currDynamic))
	                currDynamic.append(word)
	        if currStatic:
	            schema.append(" ".join(currStatic))
	        schema = " ".join(schema)
	        if schema:
	            if schema not in schemas2freqAndDynContent:
	                schemas2freqAndDynContent[schema] = (0, [set() for _ in currDynamic])
	            count, dyn = schemas2freqAndDynContent[schema]
	            for i, w in enumerate(currDynamic):
	                dyn[i].add(w)
	            schemas2freqAndDynContent[schema] = (count + 1, dyn)

	import sys

	# Known words come from the file named by the first command-line argument.
	vocabulary = getVocabulary(sys.argv[1])

	# The second argument names a file listing one log file path per line;
	# every listed log is folded into the same schema table.
	schemas2freqAndDynContent = {}
	with open(sys.argv[2]) as logFileNamesFile:
	    for logfileName in [l.strip() for l in logFileNamesFile]:
	        with open(logfileName) as logFile:
	            extractSchemaFreqAndDynContentFromFile(schemas2freqAndDynContent, logFile)

	# Report the schemas in sorted order as "<index>:<frequency>:<regex>".
	for i, schema in enumerate(sorted(schemas2freqAndDynContent.keys())):
	    frequency, dynamicContent = schemas2freqAndDynContent[schema]
	    schemaRegex = getSchemaRegex(schema, dynamicContent)
	    print("%d:%d:%s" % (i, frequency, schemaRegex))