Last active
September 17, 2019 18:21
-
-
Save sultanqasim/ea78aedd9672f7a739a365b704dc3c50 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Script to add back comments removed from source code
# Example use case: comments stripped out of HTC kernel releases
#
# Copyright (C) 2016 Sultan Qasim Khan
#
# Usage:
# 1. Find the closest upstream source release with comments (e.g. CAF)
# 2. After checking out the closest upstream release, create a git
#    commit overlaying the OEM modifications. Do this by replacing
#    all files with the OEM versions and committing the change.
# 3. Run "git diff HEAD~1" and save the output to a file. This script will
#    parse the diff and create a patch file reverting the removal of
#    comments from upstream code, while not reverting any other changes.
# 4. Use "git apply" to apply the generated patch file to restore comments.
#
# Usage Example:
# Assume the current commit is a stripped HTC release over a commented CAF base
#   git diff HEAD~1 >vendor_changes
#   python3.5 recommenter.py vendor_changes comment_restoration
#   git apply comment_restoration
#   git commit -a -m "restore stripped comments"
import sys | |
def _parse_diff_git_filename(line):
    """
    Return the b/ side path from a b'diff --git a/X b/Y' header line.

    When no rename is involved both sides carry the same path, so the
    header body has the shape b'a/P b/P' and can be split exactly in the
    middle. That stays correct even if P itself contains b' b/'.
    Otherwise fall back to splitting on the first b' b/'.

    Raises IndexError if the header contains no b' b/' separator at all.
    """
    rest = line[len(b'diff --git '):].rstrip()
    mid = len(rest) // 2
    if (len(rest) % 2 == 1
            and rest[mid:mid + 3] == b' b/'
            and rest[2:mid] == rest[mid + 3:]):
        return rest[mid + 3:]
    return line.split(b' b/')[1].rstrip()


def extract_changes(lines):
    """
    Take in the lines of a git diff (as bytes), output a list of tuples of
    (filename, hunk_lines) for each file in the diff.

    The per-file header lines (index, ---, +++, mode lines, ...) between the
    'diff --git' line and the first '@@' hunk header are dropped, so
    hunk_lines starts at the first hunk header. Files marked with a
    'deleted file' header are omitted from the result entirely.
    """
    changes = []
    cur_filename = None
    cur_lines = []
    in_header = False   # True between 'diff --git' and the first '@@'
    deleted = False     # current file carries a 'deleted file' header

    for line in lines:
        is_file_header = line.startswith(b'diff --git a/')

        if in_header:
            if line.startswith(b'deleted file'):
                # the whole entry gets discarded once the file ends
                deleted = True
                continue
            if not (is_file_header or line.startswith(b'@@')):
                continue
            in_header = False

        if is_file_header:
            # flush the previous file before starting a new one
            if not deleted and cur_filename is not None:
                changes.append((cur_filename, cur_lines))
            deleted = False
            cur_filename = _parse_diff_git_filename(line)
            cur_lines = []
            in_header = True
        else:
            cur_lines.append(line)

    # flush the last file in the diff
    if not deleted and cur_filename is not None:
        changes.append((cur_filename, cur_lines))
    return changes
def _extract_diff_line_params(line): | |
""" | |
Takes in a line like: | |
b'@@ -176,9 +179,6 @@ static const inline bool is_cpu_secure(void)\n' | |
Outputs a tuple like: | |
(176, 9, 179, 6, b'static const inline bool is_cpu_secure(void)') | |
""" | |
chunks = line.split(b' ', 4) | |
oldline_str, oldcnt_str = chunks[1][1:].split(b',') | |
try: | |
newline_str, newcnt_str = chunks[2][1:].split(b',') | |
except ValueError: | |
newline_str = b'0' | |
newcnt_str = chunks[2][1:] | |
oldline, oldcnt, newline, newcnt = [int(x) for x in [ | |
oldline_str, oldcnt_str, newline_str, newcnt_str]] | |
if len(chunks) > 4: | |
funcname = chunks[4].rstrip() | |
else: | |
funcname = b'' | |
return (oldline, oldcnt, newline, newcnt, funcname) | |
def cluster_diffs(diff):
    """
    Split the hunk lines of a single file (as produced by extract_changes)
    into one entry per '@@ -' hunk header.

    Each entry is a tuple of the form:
    (old_line_num, old_line_count, new_line_num, new_line_count, funcname, diff)
    where the last element is the list of lines following that header.
    Any lines appearing before the first hunk header are discarded.
    """
    hunks = []
    header = None
    body = []

    for entry in diff:
        if not entry.startswith(b'@@ -'):
            body.append(entry)
            continue
        # a new header closes the hunk collected so far
        if header is not None:
            hunks.append(header + (body,))
        header = _extract_diff_line_params(entry)
        body = []

    # close the final hunk
    if header is not None:
        hunks.append(header + (body,))
    return hunks
def strip_comments(lines):
    """
    Takes a list of lines (bytes), and strips out all multiline (/* */)
    and single line (//) comments. Returns a new list of lines.

    Any number of block comments on one line is handled, and a '/*' that
    appears after a '//' on the same line is treated as part of the line
    comment rather than as a block comment opener. A line lying entirely
    inside a block comment is dropped. A line whose end falls inside a
    block comment is emitted as its remaining text plus a newline.

    No attempt is made to recognise string literals, so comment markers
    inside strings are stripped as well.

    If the input ends while still inside a block comment, block comments
    are left untouched and only '//' comments are removed.
    """
    in_comment = False
    without_blocks = []
    for line in lines:
        kept = []   # pieces of this line lying outside block comments
        pos = 0
        while True:
            if in_comment:
                # look for the closer only after the opener / line start
                close = line.find(b'*/', pos)
                if close == -1:
                    break
                in_comment = False
                pos = close + 2
                continue
            opener = line.find(b'/*', pos)
            slashes = line.find(b'//', pos)
            if opener == -1 or (slashes != -1 and slashes < opener):
                # no block comment ahead, or it sits inside a // comment
                kept.append(line[pos:])
                break
            kept.append(line[pos:opener])
            in_comment = True
            pos = opener + 2

        if not kept:
            # the whole line was inside a block comment
            continue
        text = b''.join(kept)
        if in_comment:
            # the line's own newline was swallowed by the open comment
            text += b'\n'
        without_blocks.append(text)

    # annoying edge case: diff chunk ends while in comment
    # leave the block comments in for this bad case
    if in_comment:
        without_blocks = lines

    # now take out single line double slash comments
    result = []
    for line in without_blocks:
        cut = line.find(b'//')
        if cut == -1:
            result.append(line)
        else:
            result.append(line[:cut] + b'\n')
    return result
def strip_trailing_whitespace(lines):
    """
    Return the lines with trailing whitespace removed and a single newline
    re-appended. Lines that are empty after stripping are dropped.
    """
    trimmed = (entry.rstrip() for entry in lines)
    return [text + b'\n' for text in trimmed if text]
def strip_leading_indicators(lines):
    """
    Drop the first byte of every line; in a diff body that byte is the
    leading indicator (+, -, or space).
    """
    bare = []
    for entry in lines:
        bare.append(entry[1:])
    return bare
def check_diff_for_comment_removal(old, new):
    """
    Decide whether a change does nothing except remove comments.

    old is the diff listing of lines removed in a change ('-' lines)
    new is the diff listing of lines added in a change ('+' lines)

    Returns True when the added lines equal the removed lines with their
    comments stripped (ignoring trailing whitespace and blank lines).
    Returns False otherwise, including when the two sides differ only in
    trailing whitespace / blank lines.
    """
    removed = strip_leading_indicators(old)
    added = strip_leading_indicators(new)
    added_trimmed = strip_trailing_whitespace(added)

    # a pure whitespace change is not a comment removal
    if strip_trailing_whitespace(removed) == added_trimmed:
        return False

    # comment removal: old minus its comments is exactly what was added
    removed_without_comments = strip_trailing_whitespace(strip_comments(removed))
    return removed_without_comments == added_trimmed
def invert_change(old, new):
    """
    Build the reverse of a change: every added line becomes a removal and
    every removed line becomes an addition. The removals come first.
    """
    flipped = []
    for marker, group in ((b'-', new), (b'+', old)):
        flipped.extend(marker + entry[1:] for entry in group)
    return flipped
def neutralize_change(new):
    """
    Turn added lines into context lines by replacing their leading
    indicator with a space.
    """
    return list(map(lambda entry: b' ' + entry[1:], new))
def invert_comment_removals(cluster_diff):
    """
    Rewrite the body of one hunk (as clustered by cluster_diffs).

    Consecutive '-'/'+' lines form one change. A change that only removes
    comments is reverted (its lines are inverted). Any other change is
    turned into a no-op: its added lines become context and its removed
    lines are dropped. Context lines pass through unchanged.
    """
    def resolve(removed, added):
        # revert pure comment removals, neutralize everything else
        if check_diff_for_comment_removal(removed, added):
            return invert_change(removed, added)
        return neutralize_change(added)

    rewritten = []
    removed, added = [], []
    pending = False     # True while collecting a run of -/+ lines

    for entry in cluster_diff:
        is_removal = entry.startswith(b'-')
        is_addition = entry.startswith(b'+')

        if is_removal or is_addition:
            if not pending:
                pending = True
                removed, added = [], []
            (removed if is_removal else added).append(entry)
            continue

        # a context line ends the current change, if any
        if pending:
            rewritten.extend(resolve(removed, added))
            pending = False
        rewritten.append(entry)

    # the hunk may end in the middle of a change
    if pending:
        rewritten.extend(resolve(removed, added))
    return rewritten
def count_additions_and_removals(cluster_diff):
    """
    Count the '+' and '-' lines in a hunk body.

    Returns a tuple (additions, removals).
    """
    tally = {b'+': 0, b'-': 0}
    for entry in cluster_diff:
        marker = entry[:1]
        if marker in tally:
            tally[marker] += 1
    return tally[b'+'], tally[b'-']
def recompute_diff_lines(file_clusters):
    """
    Renumber the hunks of one file after comment removal inversion.

    The new side of each incoming hunk (start, count) becomes the old side
    of the outgoing hunk. The outgoing new side is shifted by the net
    number of lines added by the preceding hunks and resized by this
    hunk's own additions minus removals. Hunks that contain no additions
    or removals are dropped.

    Returns a new list of hunk tuples in the same 6-element layout.
    """
    renumbered = []
    offset = 0  # net lines added by the hunks emitted so far

    for cluster in file_clusters:
        start, count, funcname, body = cluster[2:6]
        added, removed = count_additions_and_removals(body)
        if not (added or removed):
            # get rid of empty diffs
            continue
        delta = added - removed
        renumbered.append(
            (start, count, start + offset, count + delta, funcname, body))
        offset += delta

    return renumbered
def convert_cluster_diff_to_lines(cluster):
    """
    Takes in a cluster diff tuple
    (old_line, old_count, new_line, new_count, funcname, diff)
    and converts it back into the form you see in git diff, i.e. the
    '@@ -123,4 +567,8 @@ ...' header followed by the hunk's lines.
    """
    header = b'@@ -%i,%i +%i,%i @@ %s\n' % cluster[:5]
    return [header] + list(cluster[5])
def main(argv):
    """
    Read a git diff from argv[1] and write a comment-restoring patch to
    argv[2].

    argv is expected in sys.argv layout: program name, input diff path,
    output patch path. Exits with a usage message if either path is
    missing. Files for which no comment removal was found are left out of
    the output patch.
    """
    if len(argv) < 3:
        prog = argv[0] if argv else 'recommenter.py'
        sys.exit('usage: %s <git-diff-file> <output-patch-file>' % prog)

    # the diff is handled as bytes throughout, so read it in binary mode
    with open(argv[1], 'rb') as infile:
        lines = infile.readlines()

    # generate the diffs to add back comments
    readd_comment_diff_clusters = []
    for fname, diff in extract_changes(lines):
        new_clusters = [
            cluster[:5] + (invert_comment_removals(cluster[5]),)
            for cluster in cluster_diffs(diff)
        ]
        readd_comment_diff_clusters.append(
            (fname, recompute_diff_lines(new_clusters)))

    # write out the comment readding diffs to a git-readable file format
    with open(argv[2], 'wb') as outfile:
        for fname, clusters in readd_comment_diff_clusters:
            if not clusters:
                continue
            outfile.write(b'diff --git a/%s b/%s\n' % (fname, fname))
            outfile.write(b'--- a/%s\n' % fname)
            outfile.write(b'+++ b/%s\n' % fname)
            for cluster in clusters:
                outfile.writelines(convert_cluster_diff_to_lines(cluster))
            # blank line separating this file's hunks from the next file
            outfile.write(b'\n')
# Refuse to run (or be imported) on interpreters older than 3.5.
# NOTE(review): presumably because the patch writer relies on bytes
# %-formatting (b'...' % ...), which older Python 3 releases lack.
if sys.version_info < (3, 5):
    raise Exception("Script requires Python 3.5 or newer")

# Script entry point: main() reads argv[1] (input diff) and writes
# argv[2] (output patch).
if __name__ == "__main__":
    main(sys.argv)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment