Teemperor/remove2guards.py

## remove2guards.py
#!/usr/bin/env python

from __future__ import print_function
import re, os
import ntpath

# print to stderr...
import sys

def eprint(*args, **kwargs):
    print(*args, file=sys.stderr, **kwargs)

currentFile = ""
processedIncludes = 0

def edit_distance(s1, s2):
    m=len(s1)+1
    n=len(s2)+1

    tbl = {}
    for i in range(m): tbl[i,0]=i
    for j in range(n): tbl[0,j]=j
    for i in range(1, m):
        for j in range(1, n):
            cost = 0 if s1[i-1] == s2[j-1] else 1
            tbl[i,j] = min(tbl[i, j-1]+1, tbl[i-1, j]+1, tbl[i-1, j-1]+cost)

    return tbl[i,j]

def readlines(fname):
    with open(fname) as f:
        content = f.readlines()
    return content

def write(fname, content):
    with open(fname, "w") as text_file:
        text_file.write(content)

def read(fname):
    with open(fname) as f:
        content = f.read()
    return content

header_path_cache = {}

# Searches the current directoy and ../build/ for a header by it's name
def find_header(header_name):
    global header_path_cache
    if header_name in header_path_cache:
        return header_path_cache[header_name]

    for root, dirs, files in os.walk("."):
        for f in files:
            if f == header_name:
                result = os.path.join(root, f)
                header_path_cache[header_name] = result
                return result
    #Fall back to the build folder
    for root, dirs, files in os.walk("../build/"):
        for f in files:
            if f == header_name:
                result = os.path.join(root, f)
                header_path_cache[header_name] = result
                return result

# removes "" and <> around paths
def clean_include_path(include):
    if include.startswith('"'):
        include = include[1:]
    if include.startswith('<'):
        include = include[1:]
    if include.endswith('"'):
        include = include[:-1]
    if include.endswith('>'):
        include = include[:-1]
    return include

def is_no_op_line(line):
    line = line.strip()
    if line == "":
        return True
    if line.startswith("*"):
        return True
    if line.startswith("//"):
        return True
    if line.startswith("/*"):
        return True
    if line.endswith("*/"):
        return True

    return False
# Parses the header and finds the symbol of the header guard
# e.g. BLA_H for
# #ifndef BLA_H
# #define BLA_H
# #endif
def get_guard_symbol(path):
    ifndef_re = re.compile(r"#ifndef[ ]+([\S]+) ?")
    define_re = re.compile(r"#define[ ]+([\S]+) ?")
    lines = readlines(path)
    symbol = None
    # 0 = looking for ifndef
    # 1 = looking for define
    # 2 = looking for endif
    state = 0
    in_block_comment = False
    for line in lines:
        if state == 0:
            if "/*" in line:
                in_block_comment = True
            if "*/" in line:
                in_block_comment = False
            if not in_block_comment:
                if line.strip().startswith("#ifndef"):
                    ifndef_match = re.match(ifndef_re, line)
                    if ifndef_match:
                        symbol = ifndef_match.group(1)
                    else:
                        eprint("IFNDEF REGEX DID NOT WORK: " + line)
                    state += 1
                elif not is_no_op_line(line):
                    eprint("NO NOOP: " + path + ":" + line)
                    return
                    #assert False
                pass
    return symbol

assumptions = []
unsure_assumptions = []
failed_assumptions = []

# Utility functions for removing prefix/suffix from a string
def remove_prefix(text, prefix):
    if text.startswith(prefix):
        return text[len(prefix):]
    return text

def remove_suffix(text, suffix):
    if text.endswith(suffix):
        return text[:-len(suffix)]
    return text

# Checks if two header guard symbols are similar.
# Examples for similar symbols are:
# ROOT_TTREE and ROOT_TTREE_H
# ROOT_TMVA_XXX and ROOT_XXX
# ROOT_TFILE and ROOT_TFILE
# Depending on the classification of the similarity,
# the comparision is written as a single string to
# either assumptions (if we're sure they are intended
# to be the same symbol) or in unsure_assumptions if
# we are not 100% sure if they are supposed to be
# the same symbol.
# Otherwise returns False and adds no messsage to any
# list.
def similar_guards(a, b):
    global assumptions
    global unsure_assumptions
    global failed_assumptions

    aOrig = a[:]
    bOrig = b[:]

    # this happens so often, we make a special case
    if a in ("ROOT_TGButton", "ROOT_TGWidget") and b in ("ROOT_TGButton", "ROOT_TGWidget"):
        return True

    a = a.lower()
    b = b.lower()

    # it's pretty obvious that we have double header guards if the ifndef has a _h prefix
    if a.endswith("_h") or a.endswith("_hh"):
        return True
    # Keep removing all the prefixes that people really like to add or remove...
    while True:
        abak = a[:]
        bbak = b[:]
        a = remove_prefix(a, "roott")
        b = remove_prefix(b, "roott")
        a = remove_prefix(a, "root")
        b = remove_prefix(b, "root")
        a = remove_prefix(a, "roo")
        b = remove_prefix(b, "roo")
        a = remove_prefix(a, "_")
        b = remove_prefix(b, "_")
        a = remove_prefix(a, "tmva_")
        b = remove_prefix(b, "tmva_")
        a = remove_prefix(a, "fit_")
        b = remove_prefix(b, "fit_")
        a = remove_prefix(a, "math_")
        b = remove_prefix(b, "math_")
        a = remove_prefix(a, "genvector_")
        b = remove_prefix(b, "genvector_")
        a = remove_suffix(a, "_hh")
        b = remove_suffix(b, "_hh")
        a = remove_suffix(a, "_h")
        b = remove_suffix(b, "_h")
        if a == abak and b == bbak:
            break
    if a == b:
        assumptions.append("\n" + currentFile + ":\n" + aOrig + "\n" + bOrig)
        return True
    if edit_distance(a, b) <= 5:
        unsure_assumptions.append("\n" + currentFile + "\n" + aOrig + "\n" + bOrig)
        return True
    return False


def fix_guard(guard):
    global processedIncludes
    global assumptions
    global unsure_assumptions
    global failed_assumptions

    # Check for some false-positives
    if guard.group(1) == "__CINT__":
        return guard.group(0)
    if guard.group(1) == "NDEBUG":
        return guard.group(0)
    if "WIN32" in guard.group(1):
        return guard.group(0)
    # This is probably not for header guard checking... e.g. NO_MATHCORE and so on
    if guard.group(1).startswith("NO_"):
        return guard.group(0)

    included_header = ntpath.basename(clean_include_path(guard.group(2)))
    #print(included_header)
    #print(find_header(included_header))
    header_path = find_header(included_header)

    if header_path == None:
        eprint(currentFile + ": Coulnd't find file for header: " + included_header)
        return guard.group(0)

    guard_symbol = get_guard_symbol(header_path)

    if guard_symbol == None:
        eprint(currentFile + ": Couldn't find a header guard for " + header_path)
        return guard.group(0)

    if guard_symbol == guard.group(1):
        processedIncludes += 1
        return "#include " + guard.group(2)

    if similar_guards(guard_symbol, guard.group(1)):
        processedIncludes += 1
        return "#include " + guard.group(2)

    failed_assumptions.append("\n" + currentFile + ":\n" + guard_symbol + "\n" + guard.group(1))
    return guard.group(0)


def fix_double_guards(path):
    double_guard_regex = re.compile(r"#ifndef[ ]+([\S]+)[^\n]*\n[\n ]*#include[ ]*([\S]+)\n[\n ]*#endif")
    content = read(path)
    new_content = re.sub(double_guard_regex, fix_guard, content)
    if new_content != content:
        write(path, new_content)


def run_tests():
    assert is_no_op_line("\n")
    assert is_no_op_line("   /*    \n")
    assert is_no_op_line("   //    \n")
    assert is_no_op_line("   *    \n")

run_tests()

for root, subdirs, files in os.walk("."):
    # Skip roottest for now. We should remove this when we do the same for rottest :)
    if "/roottest/" in root:
        continue
    for file in files:
        path = os.path.join(root, file)
        currentFile = path
        try:
            fix_double_guards(path)
        except KeyboardInterrupt:
            exit(1)
        except AssertionError:
            exit(1)
        except UnicodeDecodeError:
            pass #print("Error in file: " + path)

#print(processedIncludes)

print("\n\n\n\n\nProbably correct typo corrections:" + str(len(assumptions)))
for a in assumptions:
    print(a)

print("\n\n\n\n\nDubious typo corrections:" + str(len(unsure_assumptions)))
for a in unsure_assumptions:
    print(a)

print("\n\n\n\n\nUnuccessful typo corrections:" + str(len(failed_assumptions)))
for a in failed_assumptions:
    print(a)
	#!/usr/bin/env python

	from __future__ import print_function
	import re, os
	import ntpath

	# print to stderr...
	import sys

	def eprint(args, *kwargs):
	print(args, file=sys.stderr, *kwargs)

	currentFile = ""
	processedIncludes = 0

	def edit_distance(s1, s2):
	m=len(s1)+1
	n=len(s2)+1

	tbl = {}
	for i in range(m): tbl[i,0]=i
	for j in range(n): tbl[0,j]=j
	for i in range(1, m):
	for j in range(1, n):
	cost = 0 if s1[i-1] == s2[j-1] else 1
	tbl[i,j] = min(tbl[i, j-1]+1, tbl[i-1, j]+1, tbl[i-1, j-1]+cost)

	return tbl[i,j]

	def readlines(fname):
	with open(fname) as f:
	content = f.readlines()
	return content

	def write(fname, content):
	with open(fname, "w") as text_file:
	text_file.write(content)

	def read(fname):
	with open(fname) as f:
	content = f.read()
	return content

	header_path_cache = {}

	# Searches the current directoy and ../build/ for a header by it's name
	def find_header(header_name):
	global header_path_cache
	if header_name in header_path_cache:
	return header_path_cache[header_name]

	for root, dirs, files in os.walk("."):
	for f in files:
	if f == header_name:
	result = os.path.join(root, f)
	header_path_cache[header_name] = result
	return result
	#Fall back to the build folder
	for root, dirs, files in os.walk("../build/"):
	for f in files:
	if f == header_name:
	result = os.path.join(root, f)
	header_path_cache[header_name] = result
	return result

	# removes "" and <> around paths
	def clean_include_path(include):
	if include.startswith('"'):
	include = include[1:]
	if include.startswith('<'):
	include = include[1:]
	if include.endswith('"'):
	include = include[:-1]
	if include.endswith('>'):
	include = include[:-1]
	return include

	def is_no_op_line(line):
	line = line.strip()
	if line == "":
	return True
	if line.startswith("*"):
	return True
	if line.startswith("//"):
	return True
	if line.startswith("/*"):
	return True
	if line.endswith("*/"):
	return True

	return False
	# Parses the header and finds the symbol of the header guard
	# e.g. BLA_H for
	# #ifndef BLA_H
	# #define BLA_H
	# #endif
	def get_guard_symbol(path):
	ifndef_re = re.compile(r"#ifndef[ ]+([\S]+) ?")
	define_re = re.compile(r"#define[ ]+([\S]+) ?")
	lines = readlines(path)
	symbol = None
	# 0 = looking for ifndef
	# 1 = looking for define
	# 2 = looking for endif
	state = 0
	in_block_comment = False
	for line in lines:
	if state == 0:
	if "/*" in line:
	in_block_comment = True
	if "*/" in line:
	in_block_comment = False
	if not in_block_comment:
	if line.strip().startswith("#ifndef"):
	ifndef_match = re.match(ifndef_re, line)
	if ifndef_match:
	symbol = ifndef_match.group(1)
	else:
	eprint("IFNDEF REGEX DID NOT WORK: " + line)
	state += 1
	elif not is_no_op_line(line):
	eprint("NO NOOP: " + path + ":" + line)
	return
	#assert False
	pass
	return symbol

	assumptions = []
	unsure_assumptions = []
	failed_assumptions = []

	# Utility functions for removing prefix/suffix from a string
	def remove_prefix(text, prefix):
	if text.startswith(prefix):
	return text[len(prefix):]
	return text

	def remove_suffix(text, suffix):
	if text.endswith(suffix):
	return text[:-len(suffix)]
	return text

	# Checks if two header guard symbols are similar.
	# Examples for similar symbols are:
	# ROOT_TTREE and ROOT_TTREE_H
	# ROOT_TMVA_XXX and ROOT_XXX
	# ROOT_TFILE and ROOT_TFILE
	# Depending on the classification of the similarity,
	# the comparision is written as a single string to
	# either assumptions (if we're sure they are intended
	# to be the same symbol) or in unsure_assumptions if
	# we are not 100% sure if they are supposed to be
	# the same symbol.
	# Otherwise returns False and adds no messsage to any
	# list.
	def similar_guards(a, b):
	global assumptions
	global unsure_assumptions
	global failed_assumptions

	aOrig = a[:]
	bOrig = b[:]

	# this happens so often, we make a special case
	if a in ("ROOT_TGButton", "ROOT_TGWidget") and b in ("ROOT_TGButton", "ROOT_TGWidget"):
	return True

	a = a.lower()
	b = b.lower()

	# it's pretty obvious that we have double header guards if the ifndef has a _h prefix
	if a.endswith("_h") or a.endswith("_hh"):
	return True
	# Keep removing all the prefixes that people really like to add or remove...
	while True:
	abak = a[:]
	bbak = b[:]
	a = remove_prefix(a, "roott")
	b = remove_prefix(b, "roott")
	a = remove_prefix(a, "root")
	b = remove_prefix(b, "root")
	a = remove_prefix(a, "roo")
	b = remove_prefix(b, "roo")
	a = remove_prefix(a, "_")
	b = remove_prefix(b, "_")
	a = remove_prefix(a, "tmva_")
	b = remove_prefix(b, "tmva_")
	a = remove_prefix(a, "fit_")
	b = remove_prefix(b, "fit_")
	a = remove_prefix(a, "math_")
	b = remove_prefix(b, "math_")
	a = remove_prefix(a, "genvector_")
	b = remove_prefix(b, "genvector_")
	a = remove_suffix(a, "_hh")
	b = remove_suffix(b, "_hh")
	a = remove_suffix(a, "_h")
	b = remove_suffix(b, "_h")
	if a == abak and b == bbak:
	break
	if a == b:
	assumptions.append("\n" + currentFile + ":\n" + aOrig + "\n" + bOrig)
	return True
	if edit_distance(a, b) <= 5:
	unsure_assumptions.append("\n" + currentFile + "\n" + aOrig + "\n" + bOrig)
	return True
	return False


	def fix_guard(guard):
	global processedIncludes
	global assumptions
	global unsure_assumptions
	global failed_assumptions

	# Check for some false-positives
	if guard.group(1) == "__CINT__":
	return guard.group(0)
	if guard.group(1) == "NDEBUG":
	return guard.group(0)
	if "WIN32" in guard.group(1):
	return guard.group(0)
	# This is probably not for header guard checking... e.g. NO_MATHCORE and so on
	if guard.group(1).startswith("NO_"):
	return guard.group(0)

	included_header = ntpath.basename(clean_include_path(guard.group(2)))
	#print(included_header)
	#print(find_header(included_header))
	header_path = find_header(included_header)

	if header_path == None:
	eprint(currentFile + ": Coulnd't find file for header: " + included_header)
	return guard.group(0)

	guard_symbol = get_guard_symbol(header_path)

	if guard_symbol == None:
	eprint(currentFile + ": Couldn't find a header guard for " + header_path)
	return guard.group(0)

	if guard_symbol == guard.group(1):
	processedIncludes += 1
	return "#include " + guard.group(2)

	if similar_guards(guard_symbol, guard.group(1)):
	processedIncludes += 1
	return "#include " + guard.group(2)

	failed_assumptions.append("\n" + currentFile + ":\n" + guard_symbol + "\n" + guard.group(1))
	return guard.group(0)


	def fix_double_guards(path):
	double_guard_regex = re.compile(r"#ifndef[ ]+([\S]+)[^\n]\n[\n ]#include[ ]([\S]+)\n[\n ]#endif")
	content = read(path)
	new_content = re.sub(double_guard_regex, fix_guard, content)
	if new_content != content:
	write(path, new_content)


	def run_tests():
	assert is_no_op_line("\n")
	assert is_no_op_line(" /* \n")
	assert is_no_op_line(" // \n")
	assert is_no_op_line(" * \n")

	run_tests()

	for root, subdirs, files in os.walk("."):
	# Skip roottest for now. We should remove this when we do the same for rottest :)
	if "/roottest/" in root:
	continue
	for file in files:
	path = os.path.join(root, file)
	currentFile = path
	try:
	fix_double_guards(path)
	except KeyboardInterrupt:
	exit(1)
	except AssertionError:
	exit(1)
	except UnicodeDecodeError:
	pass #print("Error in file: " + path)

	#print(processedIncludes)

	print("\n\n\n\n\nProbably correct typo corrections:" + str(len(assumptions)))
	for a in assumptions:
	print(a)

	print("\n\n\n\n\nDubious typo corrections:" + str(len(unsure_assumptions)))
	for a in unsure_assumptions:
	print(a)

	print("\n\n\n\n\nUnuccessful typo corrections:" + str(len(failed_assumptions)))
	for a in failed_assumptions:
	print(a)