# sultanqasim/recommenter.py

## recommenter.py
# Script to add back comments removed from source code
# Example use case: comments stripped out of HTC kernel releases
#
# Copyright (C) 2016 Sultan Qasim Khan
#
# Usage:
# 1. Find the closest upstream source release with comments (eg. CAF)
# 2. After checking out the closest upstream release, create a git
#    commit overlaying the OEM modifications. Do this by replacing
#    all files with the OEM versions and committing the change.
# 3. Run "git diff HEAD~1" and save output to a file. This script will
#    parse the diff, and create a patch file reverting the removal of
#    comments from upstream code, while not reverting any other changes.
# 4. Use "git apply" to apply the generated patch file to restore comments.
#
# Usage Example:
# Assume current commit is a stripped HTC release over a commented CAF base
# git diff HEAD~1 >vendor_changes
# python3.5 recommenter.py vendor_changes comment_restoration
# git apply comment_restoration
# git commit -a -m "restore stripped comments"

import sys

def extract_changes(lines):
    """
    Split a raw git diff into per-file pieces.

    lines is the diff as a list of bytes lines. The result is a list of
    (filename, file_lines) tuples: filename is whatever follows " b/" on
    the "diff --git" line, and file_lines holds the hunk headers and hunk
    bodies that belong to that file. Files the diff marks with a
    "deleted file" line are left out of the result.
    """
    file_header = b'diff --git a/'
    collected = []
    name = None
    body = []
    in_preamble = False
    is_deleted = False

    for raw in lines:
        starts_file = raw.startswith(file_header)

        if in_preamble and not starts_file:
            # between the "diff --git" line and the first hunk only the
            # deletion marker and the first hunk header matter
            if raw.startswith(b'deleted file'):
                is_deleted = True
            elif raw.startswith(b'@@'):
                in_preamble = False
            else:
                continue

        if starts_file:
            # close off the previous file before starting the next one
            if name is not None and not is_deleted:
                collected.append((name, body))
            name = raw.split(b' b/')[1].rstrip()
            body = []
            is_deleted = False
            in_preamble = True
            continue

        body.append(raw)

    # the last file has no following header to flush it
    if name is not None and not is_deleted:
        collected.append((name, body))

    return collected

def _extract_diff_line_params(line):
    """
    Parse a unified-diff hunk header.

    Takes in a line like:
    b'@@ -176,9 +179,6 @@ static const inline bool is_cpu_secure(void)\n'

    Outputs a tuple like:
    (176, 9, 179, 6, b'static const inline bool is_cpu_secure(void)')

    Either range may omit its count (b'-5' or b'+7'); the unified format
    uses that form for a one-line range, so the count is then 1. The last
    element is b'' when nothing follows the closing "@@".

    Raises ValueError if a range does not consist of integers.
    """
    def parse_range(field):
        # field looks like b'-176,9' or b'+179'; drop the sign first
        start, comma, count = field[1:].partition(b',')
        return int(start), (int(count) if comma else 1)

    chunks = line.split(b' ', 4)
    oldline, oldcnt = parse_range(chunks[1])
    newline, newcnt = parse_range(chunks[2])
    funcname = chunks[4].rstrip() if len(chunks) > 4 else b''
    return (oldline, oldcnt, newline, newcnt, funcname)

def cluster_diffs(diff):
    """
    Group the lines of one file's diff into hunks.

    diff is the line list of a single file as produced by extract_changes.
    Every line starting with "@@ -" opens a new hunk. Each hunk comes back
    as a tuple of the form:
    (old_line_num, old_line_count, new_line_num, new_line_count, funcname, lines)
    where lines is the hunk body without its header. Lines that appear
    before the first hunk header are dropped.
    """
    hunks = []
    header = None
    body = []

    for entry in diff:
        if not entry.startswith(b'@@ -'):
            body.append(entry)
            continue
        # a new header closes the hunk collected so far
        if header is not None:
            hunks.append(header + (body,))
        header = _extract_diff_line_params(entry)
        body = []

    if header is not None:
        hunks.append(header + (body,))

    return hunks

def strip_comments(lines):
    """
    Remove C-style comments from a list of bytes lines.

    Block comments (/* ... */) are removed first, including several on one
    line and comments that span lines; afterwards everything from a double
    slash (//) to the end of a line is cut. A line lying entirely inside a
    block comment is dropped, and a line whose tail is swallowed by a
    comment gets a newline appended so it stays terminated.

    If the input ends while a block comment is still open, the block
    comment pass is abandoned and only // comments are stripped from the
    original lines.

    The scan is purely textual: comment markers inside string literals are
    treated as comments too.

    Returns a new list; the input list is not modified.
    """
    in_comment = False
    without_blocks = []
    for line in lines:
        started_in_comment = in_comment
        kept = []
        pos = 0
        while True:
            if in_comment:
                end = line.find(b'*/', pos)
                if end < 0:
                    break
                pos = end + 2
                in_comment = False
            else:
                start = line.find(b'/*', pos)
                if start < 0:
                    kept.append(line[pos:])
                    break
                kept.append(line[pos:start])
                # resume after the opener so b'/*/' is not read as an
                # opener immediately followed by a closer
                pos = start + 2
                in_comment = True

        if not in_comment:
            without_blocks.append(b''.join(kept))
        elif started_in_comment and not kept:
            # the whole line is comment interior
            continue
        else:
            # the line's newline was swallowed by the open comment
            without_blocks.append(b''.join(kept) + b'\n')

    # chunk ended inside a comment: fall back to the untouched lines
    if in_comment:
        without_blocks = lines

    result = []
    for line in without_blocks:
        cut = line.find(b'//')
        result.append(line if cut < 0 else line[:cut] + b'\n')
    return result

def strip_trailing_whitespace(lines):
    """
    Return the lines with trailing whitespace removed and each one ending
    in a single newline. Lines that are empty after stripping are omitted.
    """
    trimmed = (line.rstrip() for line in lines)
    return [text + b'\n' for text in trimmed if text]

def strip_leading_indicators(lines):
    """
    Drop the first byte of every line, i.e. the diff marker column
    (+, - or space). Returns a new list.
    """
    bare = []
    for entry in lines:
        bare.append(entry[1:])
    return bare

def check_diff_for_comment_removal(old, new):
    """
    Decide whether a change does nothing except delete comments.

    old is the list of diff lines removed by the change
    new is the list of diff lines added by the change
    (both still carry their leading marker byte)

    Returns True only when the added lines equal the removed lines once
    comments are stripped from the removed ones. Trailing whitespace and
    blank lines are ignored in the comparison, and a change that differs
    only in those is reported as False.
    """
    removed = strip_leading_indicators(old)
    added = strip_leading_indicators(new)

    added_norm = strip_trailing_whitespace(added)

    # a pure whitespace change is not a comment removal
    if strip_trailing_whitespace(removed) == added_norm:
        return False

    # comment removal means: old minus its comments is exactly new
    removed_without_comments = strip_trailing_whitespace(strip_comments(removed))
    return removed_without_comments == added_norm

def invert_change(old, new):
    """
    Build the reverse of a change: the added lines become removals and the
    removed lines become additions, with the removals listed first.
    """
    reverted = []
    for added in new:
        reverted.append(b'-' + added[1:])
    for removed in old:
        reverted.append(b'+' + removed[1:])
    return reverted

def neutralize_change(new):
    """
    Turn added lines into context lines by replacing their first byte
    with a space.
    """
    context = []
    for added in new:
        context.append(b' ' + added[1:])
    return context

def invert_comment_removals(cluster_diff):
    """
    Rewrite the body of one hunk (as produced by cluster_diffs).

    Consecutive '-' and '+' lines are collected into one change. A change
    that check_diff_for_comment_removal accepts is reversed, so the
    comments come back. Any other change is turned into a no-op: its added
    lines are kept as context and its removed lines are dropped. All other
    lines pass through unchanged.
    """
    def resolve(removed_lines, added_lines):
        # lines that stand in for one collected change
        if check_diff_for_comment_removal(removed_lines, added_lines):
            return invert_change(removed_lines, added_lines)
        return neutralize_change(added_lines)

    result = []
    removed = []
    added = []
    pending = False

    for entry in cluster_diff:
        marker = entry[:1]
        if marker == b'-':
            removed.append(entry)
            pending = True
        elif marker == b'+':
            added.append(entry)
            pending = True
        else:
            # any other line ends the change collected so far
            if pending:
                result.extend(resolve(removed, added))
                removed = []
                added = []
                pending = False
            result.append(entry)

    # the hunk may end in the middle of a change
    if pending:
        result.extend(resolve(removed, added))

    return result

def count_additions_and_removals(cluster_diff):
    """
    Count the lines of a hunk body that start with '+' and with '-'.
    Returns the pair (additions, removals).
    """
    markers = [entry[:1] for entry in cluster_diff]
    return markers.count(b'+'), markers.count(b'-')

def recompute_diff_lines(file_clusters):
    """
    Renumber the hunks of one file after comment removal inversion.

    file_clusters is the list of hunk tuples of a single file whose bodies
    have already been rewritten. In each returned hunk the old start and
    count are the input hunk's new start and count. The new start is that
    start shifted by the net lines added by earlier hunks, and the new
    count is grown by this hunk's own net additions.

    Returns a new list; hunks with no '+' or '-' lines are left out.
    """
    shift = 0
    renumbered = []

    for cluster in file_clusters:
        start, count, funcname, body = cluster[2:6]
        added, dropped = count_additions_and_removals(body)
        if not (added or dropped):
            # nothing left to apply in this hunk
            continue
        growth = added - dropped
        renumbered.append(
            (start, count, start + shift, count + growth, funcname, body))
        shift += growth

    return renumbered

def convert_cluster_diff_to_lines(cluster):
    """
    Turn a hunk tuple (old_line, old_count, new_line, new_count, funcname,
    lines) back into the lines seen in git diff output: a header of the
    form "@@ -123,4 +567,8 @@ ..." followed by the hunk body.
    """
    header = b'@@ -%i,%i +%i,%i @@ %s\n' % cluster[:5]
    return [header, *cluster[5]]

def main(argv):
    """
    Command-line entry point.

    argv[1] is the path of the input diff and argv[2] is the path the
    comment restoration patch is written to. The diff is split per file
    and per hunk, comment removals are reversed, hunk line numbers are
    recomputed, and the result is written in git diff format. Files with
    no remaining hunks are not written.

    Raises SystemExit with a usage message when fewer than two paths
    are given.
    """
    if len(argv) < 3:
        prog = argv[0] if argv else 'recommenter.py'
        raise SystemExit('usage: %s <input diff> <output patch>' % prog)

    # "with" closes the file even if reading fails part way
    with open(argv[1], 'rb') as infile:
        lines = infile.readlines()

    # generate the diffs to add back comments
    readd_comment_diff_clusters = []
    for fname, diff in extract_changes(lines):
        new_clusters = [
            cluster[:5] + (invert_comment_removals(cluster[5]),)
            for cluster in cluster_diffs(diff)
        ]
        readd_comment_diff_clusters.append(
            (fname, recompute_diff_lines(new_clusters)))

    # write out the comment readding diffs to a git-readable file format
    with open(argv[2], 'wb') as outfile:
        for fname, clusters in readd_comment_diff_clusters:
            if not clusters:
                continue
            outfile.write(b'diff --git a/%s b/%s\n' % (fname, fname))
            outfile.write(b'--- a/%s\n' % fname)
            outfile.write(b'+++ b/%s\n' % fname)
            for cluster in clusters:
                outfile.writelines(convert_cluster_diff_to_lines(cluster))
            outfile.write(b'\n')


# The patch is assembled with %-formatting on bytes, which needs Python 3.5.
if sys.version_info[:2] < (3, 5):
    raise Exception("Script requires Python 3.5 or newer")

# Only run the tool when executed as a script, not when imported.
if __name__ == "__main__":
    main(sys.argv)
	# Script to add back comments removed from source code
	# Example use case: comments stripped out of HTC kernel releases
	#
	# Copyright (C) 2016 Sultan Qasim Khan
	#
	# Usage:
	# 1. Find the closest upstream source release with comments (eg. CAF)
	# 2. After checking out the closest upstream release, create a git
	# commit overlaying the OEM modifications. Do this by replacing
	# all files with the OEM versions and committing the change.
	# 3. Run "git diff HEAD~1" and save output to a file. This script will
	# parse the diff, and create a patch file reverting the removal of
	# comments from upstream code, while not reverting any other changes.
	# 4. Use "git apply" to apply the generated patch file to restore comments.
	#
	# Usage Example:
	# Assume current commit is a stripped HTC release over a commented CAF base
	# git diff HEAD~1 >vendor_changes
	# python3.5 recommenter.py vendor_changes comment_restoration
	# git apply comment_restoration
	# git commit -a -m "restore stripped comments"

	import sys

	def extract_changes(lines):
		"""
		Split a raw git diff into per-file pieces.

		lines is the diff as a list of bytes lines. The result is a list of
		(filename, file_lines) tuples: filename is whatever follows " b/" on
		the "diff --git" line, and file_lines holds the hunk headers and hunk
		bodies that belong to that file. Files the diff marks with a
		"deleted file" line are left out of the result.
		"""
		file_header = b'diff --git a/'
		collected = []
		name = None
		body = []
		in_preamble = False
		is_deleted = False

		for raw in lines:
			starts_file = raw.startswith(file_header)

			if in_preamble and not starts_file:
				# between the "diff --git" line and the first hunk only the
				# deletion marker and the first hunk header matter
				if raw.startswith(b'deleted file'):
					is_deleted = True
				elif raw.startswith(b'@@'):
					in_preamble = False
				else:
					continue

			if starts_file:
				# close off the previous file before starting the next one
				if name is not None and not is_deleted:
					collected.append((name, body))
				name = raw.split(b' b/')[1].rstrip()
				body = []
				is_deleted = False
				in_preamble = True
				continue

			body.append(raw)

		# the last file has no following header to flush it
		if name is not None and not is_deleted:
			collected.append((name, body))

		return collected

	def _extract_diff_line_params(line):
		"""
		Parse a unified-diff hunk header.

		Takes in a line like:
		b'@@ -176,9 +179,6 @@ static const inline bool is_cpu_secure(void)\n'

		Outputs a tuple like:
		(176, 9, 179, 6, b'static const inline bool is_cpu_secure(void)')

		Either range may omit its count (b'-5' or b'+7'); the unified format
		uses that form for a one-line range, so the count is then 1. The last
		element is b'' when nothing follows the closing "@@".

		Raises ValueError if a range does not consist of integers.
		"""
		def parse_range(field):
			# field looks like b'-176,9' or b'+179'; drop the sign first
			start, comma, count = field[1:].partition(b',')
			return int(start), (int(count) if comma else 1)

		chunks = line.split(b' ', 4)
		oldline, oldcnt = parse_range(chunks[1])
		newline, newcnt = parse_range(chunks[2])
		funcname = chunks[4].rstrip() if len(chunks) > 4 else b''
		return (oldline, oldcnt, newline, newcnt, funcname)

	def cluster_diffs(diff):
		"""
		Group the lines of one file's diff into hunks.

		diff is the line list of a single file as produced by extract_changes.
		Every line starting with "@@ -" opens a new hunk. Each hunk comes back
		as a tuple of the form:
		(old_line_num, old_line_count, new_line_num, new_line_count, funcname, lines)
		where lines is the hunk body without its header. Lines that appear
		before the first hunk header are dropped.
		"""
		hunks = []
		header = None
		body = []

		for entry in diff:
			if not entry.startswith(b'@@ -'):
				body.append(entry)
				continue
			# a new header closes the hunk collected so far
			if header is not None:
				hunks.append(header + (body,))
			header = _extract_diff_line_params(entry)
			body = []

		if header is not None:
			hunks.append(header + (body,))

		return hunks

	def strip_comments(lines):
		"""
		Remove C-style comments from a list of bytes lines.

		Block comments (/* ... */) are removed first, including several on one
		line and comments that span lines; afterwards everything from a double
		slash (//) to the end of a line is cut. A line lying entirely inside a
		block comment is dropped, and a line whose tail is swallowed by a
		comment gets a newline appended so it stays terminated.

		If the input ends while a block comment is still open, the block
		comment pass is abandoned and only // comments are stripped from the
		original lines.

		The scan is purely textual: comment markers inside string literals are
		treated as comments too.

		Returns a new list; the input list is not modified.
		"""
		in_comment = False
		without_blocks = []
		for line in lines:
			started_in_comment = in_comment
			kept = []
			pos = 0
			while True:
				if in_comment:
					end = line.find(b'*/', pos)
					if end < 0:
						break
					pos = end + 2
					in_comment = False
				else:
					start = line.find(b'/*', pos)
					if start < 0:
						kept.append(line[pos:])
						break
					kept.append(line[pos:start])
					# resume after the opener so b'/*/' is not read as an
					# opener immediately followed by a closer
					pos = start + 2
					in_comment = True

			if not in_comment:
				without_blocks.append(b''.join(kept))
			elif started_in_comment and not kept:
				# the whole line is comment interior
				continue
			else:
				# the line's newline was swallowed by the open comment
				without_blocks.append(b''.join(kept) + b'\n')

		# chunk ended inside a comment: fall back to the untouched lines
		if in_comment:
			without_blocks = lines

		result = []
		for line in without_blocks:
			cut = line.find(b'//')
			result.append(line if cut < 0 else line[:cut] + b'\n')
		return result

	def strip_trailing_whitespace(lines):
		"""
		Return the lines with trailing whitespace removed and each one ending
		in a single newline. Lines that are empty after stripping are omitted.
		"""
		trimmed = (line.rstrip() for line in lines)
		return [text + b'\n' for text in trimmed if text]

	def strip_leading_indicators(lines):
		"""
		Drop the first byte of every line, i.e. the diff marker column
		(+, - or space). Returns a new list.
		"""
		return [line[1:] for line in lines]

	def check_diff_for_comment_removal(old, new):
		"""
		Decide whether a change does nothing except delete comments.

		old is the list of diff lines removed by the change
		new is the list of diff lines added by the change
		(both still carry their leading marker byte)

		Returns True only when the added lines equal the removed lines once
		comments are stripped from the removed ones. Trailing whitespace and
		blank lines are ignored in the comparison, and a change that differs
		only in those is reported as False.
		"""
		old = strip_leading_indicators(old)
		new = strip_leading_indicators(new)

		stripped_old = strip_trailing_whitespace(old)
		stripped_new = strip_trailing_whitespace(new)

		# a pure whitespace change is not a comment removal
		if stripped_old == stripped_new:
			return False

		cstripped_old = strip_trailing_whitespace(strip_comments(old))

		# comment removal means: old minus its comments is exactly new
		return cstripped_old == stripped_new

	def invert_change(old, new):
		"""
		Build the reverse of a change: the added lines become removals and the
		removed lines become additions, with the removals listed first.
		"""
		new_inv = [b'-' + line[1:] for line in new]
		old_inv = [b'+' + line[1:] for line in old]
		return new_inv + old_inv

	def neutralize_change(new):
		"""
		Turn added lines into context lines by replacing their first byte
		with a space.
		"""
		return [b' ' + line[1:] for line in new]

	def invert_comment_removals(cluster_diff):
		"""
		Rewrite the body of one hunk (as produced by cluster_diffs).

		Consecutive '-' and '+' lines are collected into one change. A change
		that check_diff_for_comment_removal accepts is reversed, so the
		comments come back. Any other change is turned into a no-op: its added
		lines are kept as context and its removed lines are dropped. All other
		lines pass through unchanged.
		"""
		def resolve(removed_lines, added_lines):
			# lines that stand in for one collected change
			if check_diff_for_comment_removal(removed_lines, added_lines):
				return invert_change(removed_lines, added_lines)
			return neutralize_change(added_lines)

		result = []
		removed = []
		added = []
		pending = False

		for entry in cluster_diff:
			marker = entry[:1]
			if marker == b'-':
				removed.append(entry)
				pending = True
			elif marker == b'+':
				added.append(entry)
				pending = True
			else:
				# any other line ends the change collected so far
				if pending:
					result.extend(resolve(removed, added))
					removed = []
					added = []
					pending = False
				result.append(entry)

		# the hunk may end in the middle of a change
		if pending:
			result.extend(resolve(removed, added))

		return result

	def count_additions_and_removals(cluster_diff):
		"""
		Count the lines of a hunk body that start with '+' and with '-'.
		Returns the pair (additions, removals).
		"""
		additions = 0
		removals = 0
		for line in cluster_diff:
			if line.startswith(b'-'):
				removals += 1
			elif line.startswith(b'+'):
				additions += 1
		return additions, removals

	def recompute_diff_lines(file_clusters):
		"""
		Renumber the hunks of one file after comment removal inversion.

		file_clusters is the list of hunk tuples of a single file whose bodies
		have already been rewritten. In each returned hunk the old start and
		count are the input hunk's new start and count. The new start is that
		start shifted by the net lines added by earlier hunks, and the new
		count is grown by this hunk's own net additions.

		Returns a new list; hunks with no '+' or '-' lines are left out.
		"""
		lines_added = 0
		new_clusters = []

		for cluster in file_clusters:
			additions, removals = count_additions_and_removals(cluster[5])
			if additions == 0 and removals == 0:
				# get rid of empty diffs
				continue
			new_clusters.append((
				cluster[2],
				cluster[3],
				cluster[2] + lines_added,
				cluster[3] + additions - removals,
				cluster[4],
				cluster[5]))
			lines_added += additions - removals

		return new_clusters

	def convert_cluster_diff_to_lines(cluster):
		"""
		Turn a hunk tuple (old_line, old_count, new_line, new_count, funcname,
		lines) back into the lines seen in git diff output: a header of the
		form "@@ -123,4 +567,8 @@ ..." followed by the hunk body.
		"""
		lines = []
		lines.append(b'@@ -%i,%i +%i,%i @@ %s\n' % cluster[:5])
		lines.extend(cluster[5])
		return lines

	def main(argv):
		"""
		Command-line entry point.

		argv[1] is the path of the input diff and argv[2] is the path the
		comment restoration patch is written to. The diff is split per file
		and per hunk, comment removals are reversed, hunk line numbers are
		recomputed, and the result is written in git diff format. Files with
		no remaining hunks are not written.

		Raises SystemExit with a usage message when fewer than two paths
		are given.
		"""
		if len(argv) < 3:
			prog = argv[0] if argv else 'recommenter.py'
			raise SystemExit('usage: %s <input diff> <output patch>' % prog)

		# "with" closes the file even if reading fails part way
		with open(argv[1], 'rb') as infile:
			lines = infile.readlines()

		# generate the diffs to add back comments
		readd_comment_diff_clusters = []
		for fname, diff in extract_changes(lines):
			new_clusters = [
				cluster[:5] + (invert_comment_removals(cluster[5]),)
				for cluster in cluster_diffs(diff)
			]
			readd_comment_diff_clusters.append(
				(fname, recompute_diff_lines(new_clusters)))

		# write out the comment readding diffs to a git-readable file format
		with open(argv[2], 'wb') as outfile:
			for fname, clusters in readd_comment_diff_clusters:
				if not clusters:
					continue
				outfile.write(b'diff --git a/%s b/%s\n' % (fname, fname))
				outfile.write(b'--- a/%s\n' % fname)
				outfile.write(b'+++ b/%s\n' % fname)
				for cluster in clusters:
					outfile.writelines(convert_cluster_diff_to_lines(cluster))
				outfile.write(b'\n')


	# The patch is assembled with %-formatting on bytes, which needs Python 3.5.
	if sys.version_info < (3, 5):
		raise Exception("Script requires Python 3.5 or newer")

	# Only run the tool when executed as a script, not when imported.
	if __name__ == "__main__":
		main(sys.argv)