Skip to content

Instantly share code, notes, and snippets.

@sultanqasim
Last active September 17, 2019 18:21
Show Gist options
  • Save sultanqasim/ea78aedd9672f7a739a365b704dc3c50 to your computer and use it in GitHub Desktop.
Save sultanqasim/ea78aedd9672f7a739a365b704dc3c50 to your computer and use it in GitHub Desktop.
# Script to add back comments removed from source code
# Example use case: comments stripped out of HTC kernel releases
#
# Copyright (C) 2016 Sultan Qasim Khan
#
# Usage:
# 1. Find the closest upstream source release with comments (eg. CAF)
# 2. After checking out the closest upstream release, create a git
# commit overlaying the OEM modifications. Do this by replacing
# all files with the OEM versions and committing the change.
# 3. Run "git diff HEAD~1" and save output to a file. This script will
# parse the diff, and create a patch file reverting the removal of
# comments from upstream code, while not reverting any other changes.
# 4. Use "git apply" to apply the generated patch file to restore comments.
#
# Usage Example:
# Assume current commit is a stripped HTC release over a commented CAF base
# git diff HEAD~1 >vendor_changes
# python3.5 recommenter.py vendor_changes comment_restoration
# git apply comment_restoration
# git commit -a -m "restore stripped comments"
import sys
def extract_changes(lines):
    """
    Split the lines of a git diff into per-file groups.

    Returns a list of (filename, lines) tuples, one per file in the diff.
    The per-file header lines between "diff --git" and the first "@@"
    hunk header are dropped, and files marked "deleted file" are left
    out of the result entirely.
    """
    file_marker = b'diff --git a/'
    results = []
    path = None
    body = []
    in_header = False
    is_deleted = False
    for raw in lines:
        starts_file = raw.startswith(file_marker)
        if in_header and not starts_file:
            if raw.startswith(b'deleted file'):
                # remember the deletion; we are still inside the header
                is_deleted = True
            elif raw.startswith(b'@@'):
                # first hunk header ends the per-file header
                in_header = False
            else:
                continue
        if not starts_file:
            body.append(raw)
            continue
        # a new file section begins: flush the previous one if we keep it
        if path is not None and not is_deleted:
            results.append((path, body))
        path = raw.split(b' b/')[1].rstrip()
        body = []
        is_deleted = False
        in_header = True
    # flush the final file section
    if path is not None and not is_deleted:
        results.append((path, body))
    return results
def _extract_diff_line_params(line):
    """
    Parse a unified-diff hunk header.

    Takes in a line like:
    b'@@ -176,9 +179,6 @@ static const inline bool is_cpu_secure(void)\n'
    Outputs a tuple like:
    (176, 9, 179, 6, b'static const inline bool is_cpu_secure(void)')

    In the unified format a range may omit its count (e.g. b'@@ -5 +7 @@'),
    which means a count of exactly one line. Both the old and the new range
    are parsed with that rule. Raises ValueError if a range is not numeric.
    """
    def parse_range(field):
        # field looks like b'-176,9' or b'+7'; drop the leading sign
        start, comma, count = field[1:].partition(b',')
        # a missing count means the hunk side covers a single line
        return int(start), (int(count) if comma else 1)

    chunks = line.split(b' ', 4)
    oldline, oldcnt = parse_range(chunks[1])
    newline, newcnt = parse_range(chunks[2])
    # anything after the closing "@@" is the enclosing function context
    if len(chunks) > 4:
        funcname = chunks[4].rstrip()
    else:
        funcname = b''
    return (oldline, oldcnt, newline, newcnt, funcname)
def cluster_diffs(diff):
    """
    Group the diff lines of a single file (as produced by extract_changes)
    into hunks.

    Each hunk becomes a tuple of the form:
    (old_line_num, old_line_count, new_line_num, new_line_count, funcname, diff)
    where the last element is the list of lines following the "@@" header.
    Lines that appear before the first hunk header are discarded.
    """
    hunks = []
    header = None
    body = []
    for entry in diff:
        if not entry.startswith(b'@@ -'):
            body.append(entry)
            continue
        # a new hunk header: close out the hunk collected so far, if any
        if header is not None:
            hunks.append(header + (body,))
        header = _extract_diff_line_params(entry)
        body = []
    if header is not None:
        hunks.append(header + (body,))
    return hunks
def strip_comments(lines):
    """
    Takes a list of lines, and strips out all multiline (/* */) and
    single line (//) comments.

    Block comments are removed first, any number per line, and may span
    several lines. A line lying entirely inside a block comment is dropped;
    a line on which a block comment opens without closing keeps its leading
    text followed by a newline. If the input ends while still inside a
    block comment, block comments are left untouched and only // comments
    are stripped. This is a textual scan: comment markers inside string
    literals are not recognised as such.
    """
    in_comment = False
    new_lines = []
    for line in lines:
        if in_comment and b'*/' not in line:
            # whole line is inside a block comment
            continue
        kept = []
        pos = 0
        while True:
            if in_comment:
                # only look for the terminator after the current position so
                # that an opener like "/*/" is not mistaken for open + close
                end = line.find(b'*/', pos)
                if end == -1:
                    # comment runs past the end of this line
                    kept.append(b'\n')
                    break
                in_comment = False
                pos = end + 2
            else:
                start = line.find(b'/*', pos)
                slashes = line.find(b'//', pos)
                if start == -1 or 0 <= slashes < start:
                    # no block comment ahead, or it sits inside a // comment
                    # which the second pass below removes
                    kept.append(line[pos:])
                    break
                kept.append(line[pos:start])
                in_comment = True
                pos = start + 2
        new_lines.append(b''.join(kept))
    # annoying edge case: diff chunk ends while in comment
    # there is no way to know where the comment ends, so leave the
    # block comments in for this case
    if in_comment:
        new_lines = lines
    # now take out single line double slash comments
    result = []
    for line in new_lines:
        cut = line.find(b'//')
        if cut == -1:
            result.append(line)
        else:
            result.append(line[:cut] + b'\n')
    return result
def strip_trailing_whitespace(lines):
    """
    Remove trailing whitespace from every line and drop lines that end up
    empty. Each surviving line is terminated with a single newline.
    """
    trimmed = (entry.rstrip() for entry in lines)
    return [text + b'\n' for text in trimmed if text]
def strip_leading_indicators(lines):
    """
    Drop the first character of every line. For diff lines that character
    is the leading indicator: +, -, or space.
    """
    bare = []
    for entry in lines:
        bare.append(entry[1:])
    return bare
def check_diff_for_comment_removal(old, new):
    """
    Decide whether a change does nothing except remove comments.

    old is the diff listing of lines removed in a change
    new is the diff listing of lines added in a change

    Returns True when the added lines equal the removed lines with their
    comments stripped (ignoring trailing whitespace and blank lines).
    Returns False otherwise, including when the two sides differ only in
    trailing whitespace or blank lines.
    """
    removed_text = strip_leading_indicators(old)
    added_text = strip_leading_indicators(new)
    added_norm = strip_trailing_whitespace(added_text)
    # a pure whitespace change is not a comment removal
    if strip_trailing_whitespace(removed_text) == added_norm:
        return False
    # comment removal: old minus its comments is exactly what was added
    removed_sans_comments = strip_trailing_whitespace(
        strip_comments(removed_text))
    return removed_sans_comments == added_norm
def invert_change(old, new):
    """
    Build the reverse of a change: the lines that were added (new) come
    first, re-marked as removals, followed by the lines that were removed
    (old), re-marked as additions.
    """
    reverted = [b'-' + added[1:] for added in new]
    reverted.extend(b'+' + removed[1:] for removed in old)
    return reverted
def neutralize_change(new):
    """
    Turn added lines into context lines by replacing each line's leading
    indicator with a space.
    """
    as_context = []
    for added in new:
        as_context.append(b' ' + added[1:])
    return as_context
def invert_comment_removals(cluster_diff):
    """
    Rewrite the body of one hunk (as clustered by cluster_diffs).

    Consecutive runs of '-' and '+' lines are treated as one change.
    A change that only removes comments is reverted; any other change is
    turned into a no-op by keeping its added lines as context and dropping
    its removed lines. All other lines are passed through unchanged.
    """
    def rewrite(removed, added):
        # revert pure comment removals, neutralize everything else
        if check_diff_for_comment_removal(removed, added):
            return invert_change(removed, added)
        return neutralize_change(added)

    result = []
    removed = added = None  # None while not inside a change run
    for entry in cluster_diff:
        is_removal = entry.startswith(b'-')
        is_addition = not is_removal and entry.startswith(b'+')
        if is_removal or is_addition:
            if removed is None:
                removed, added = [], []
            (removed if is_removal else added).append(entry)
            continue
        # a non-change line ends any change run in progress
        if removed is not None:
            result.extend(rewrite(removed, added))
            removed = added = None
        result.append(entry)
    # the hunk may end in the middle of a change run
    if removed is not None:
        result.extend(rewrite(removed, added))
    return result
def count_additions_and_removals(cluster_diff):
    """
    Count the '+' and '-' lines in a hunk body.

    Returns a tuple (additions, removals).
    """
    tally = {b'+': 0, b'-': 0}
    for entry in cluster_diff:
        marker = entry[:1]
        if marker in tally:
            tally[marker] += 1
    return tally[b'+'], tally[b'-']
def recompute_diff_lines(file_clusters):
    """
    Renumber the hunks of one file after comment removal inversion.

    The new-side start and count of each incoming hunk become the old side
    of the outgoing hunk. The outgoing new side is shifted by the net
    number of lines added by the preceding hunks and sized by this hunk's
    own additions and removals. Hunks without any '+' or '-' line are
    dropped. Returns the list of renumbered hunk tuples.
    """
    drift = 0  # net lines added by the hunks emitted so far
    renumbered = []
    for cluster in file_clusters:
        body = cluster[5]
        additions, removals = count_additions_and_removals(body)
        if not (additions or removals):
            # get rid of empty diffs
            continue
        start, count = cluster[2], cluster[3]
        delta = additions - removals
        renumbered.append(
            (start, count, start + drift, count + delta, cluster[4], body))
        drift += delta
    return renumbered
def convert_cluster_diff_to_lines(cluster):
    """
    Takes in a hunk tuple (old_line, old_count, new_line, new_count,
    funcname, diff) and converts it back into the form seen in git diff,
    i.e. it re-adds the "@@ -123,4 +567,8 @@ ..." header in front of the
    hunk's lines.
    """
    header = b'@@ -%i,%i +%i,%i @@ %s\n' % cluster[:5]
    return [header, *cluster[5]]
def main(argv):
    """
    Read a git diff from the file argv[1] and write a patch restoring
    stripped comments to the file argv[2].

    Both files are handled in binary mode and are closed even if parsing
    or writing fails part-way through.
    """
    with open(argv[1], 'rb') as infile:
        lines = infile.readlines()

    # generate the diffs to add back comments, one entry per changed file
    readd_comment_diff_clusters = []
    for fname, diff in extract_changes(lines):
        new_clusters = []
        for cluster in cluster_diffs(diff):
            new_diff = invert_comment_removals(cluster[5])
            new_clusters.append(cluster[:5] + (new_diff,))
        new_clusters = recompute_diff_lines(new_clusters)
        readd_comment_diff_clusters.append((fname, new_clusters))

    # write out the comment readding diffs to a git-readable file format
    with open(argv[2], 'wb') as outfile:
        for fname, clusters in readd_comment_diff_clusters:
            if not clusters:
                # nothing to restore in this file
                continue
            outfile.write(b'diff --git a/%s b/%s\n' % (fname, fname))
            outfile.write(b'--- a/%s\n' % fname)
            outfile.write(b'+++ b/%s\n' % fname)
            for cluster in clusters:
                outfile.writelines(convert_cluster_diff_to_lines(cluster))
            outfile.write(b'\n')
# Refuse to run on interpreters older than Python 3.5.
if not sys.version_info >= (3, 5):
    raise Exception("Script requires Python 3.5 or newer")

# Script entry point: the full argument vector is handed to main().
if __name__ == "__main__":
    main(sys.argv)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment