-
-
Save joshuabambrick/a850d0e0050129b9252c748fa06c48b2 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# disclaimer: this is perhaps the worst code I've ever written as it was written as fast as possible | |
from collections import Counter | |
import dataclasses | |
import sys | |
from pathlib import Path | |
import sys, tokenize | |
@dataclasses.dataclass
class Token:
    """A lexical token with its source position and a mutable `visited` flag.

    Mirrors the fields of a `tokenize.TokenInfo` tuple, flattened so the
    start/end coordinates are plain attributes.
    """
    text: str
    exact_type: int
    start_line_num: int
    start_col: int
    end_line_num: int
    end_col: int
    visited: bool

    @classmethod
    def from_tup(cls, tup) -> 'Token':
        """Build a Token from a `tokenize.TokenInfo`-style 5-tuple."""
        _type, token_text, start, end, _line = tup
        start_line, start_column = start
        end_line, end_column = end
        # Use the named `exact_type` attribute: for operators it is more
        # specific than the generic OP type in the tuple's first slot.
        return cls(
            text=token_text,
            exact_type=tup.exact_type,
            start_line_num=start_line,
            start_col=start_column,
            end_line_num=end_line,
            end_col=end_column,
            visited=False,
        )
# CLI arguments, read at import time:
#   argv[1] — file or directory to scan (a .py file is processed directly,
#             anything else is treated as a directory to rglob).
#   argv[2] — optional; any non-empty value enables writing rewritten files
#             back to disk.
root = Path(sys.argv[1])
output = bool(len(sys.argv) > 2 and sys.argv[2])
def main():
    """Scan the target .py file(s) for call sites passing `name=name` keyword
    arguments, rewrite each such argument with the proposed `=name` shorthand,
    and report call sites whose rewritten form fits on one <=80-char line.

    Reads module globals: `root` (file/directory to scan), `output` (when
    truthy, rewritten files are written back to disk), and the `Token` class.
    """
    matches = 0                  # total `name=name` keyword args collapsed
    total_lines_saved = 0        # physical lines removed across all files
    reductions = 0               # >=80-char call spans that now fit in 80 chars
    syntax_errors = []           # paths whose encoding detection failed
    biggest_change = 0           # most lines removed by a single call rewrite
    biggest_change_file = None
    if root.suffix == ".py":
        paths = [root]
    else:
        paths = root.rglob("**/*.py")
    for path in paths:
        print(' =====', path, '=====')
        try:
            with tokenize.open(path) as source:
                # need empty string to represent 'encoding'
                file = [''] + source.readlines()
            flat_file = ''.join(file)
        except SyntaxError:
            # tokenize.open raises SyntaxError when the encoding declaration
            # cannot be parsed
            syntax_errors.append(path)
            continue
        new_file = file.copy()   # edited copy; `file` keeps original offsets
        file_changed = False
        lines_removed = 0        # lines already deleted from new_file
        # per-original-line count of characters removed so far, to shift
        # token columns when a line is edited more than once
        chars_removed_from_line = Counter()
        with tokenize.open(path) as source:
            tokgen = tokenize.generate_tokens(source.readline)
            open_brackets = []       # stack of currently-open LPAR tokens
            bracket_contents = []    # stack of token lists, one per open LPAR
            bracket_last_three = []  # token window captured at each LPAR
            last_three = []          # sliding window of the latest 3 tokens
            for tup in map(Token.from_tup, tokgen):
                if len (last_three) == 3:
                    last_three.pop(0)
                last_three.append(tup)
                assert len(last_three) <= 3
                # every open bracket's contents also include nested tokens
                for contents in bracket_contents:
                    contents.append(tup)
                if tup.exact_type == tokenize.LPAR:
                    open_brackets.append(tup)
                    bracket_contents.append([tup])
                    bracket_last_three.append(last_three.copy())
                elif tup.exact_type == tokenize.RPAR:
                    open_bracket = open_brackets.pop()
                    close_bracket = tup
                    current_bracket_tokens = bracket_contents.pop()
                    current_bracket_last_three = bracket_last_three.pop()
                    # window at LPAR looked like [`def`, name, `(`] for a def
                    if current_bracket_last_three[0].exact_type == tokenize.NAME and current_bracket_last_three[0].text == 'def':
                        # this is a function definition, not a call
                        continue
                    assert current_bracket_tokens[0].exact_type == tokenize.LPAR
                    assert current_bracket_tokens[-1].exact_type == tokenize.RPAR
                    current_bracket_tokens = current_bracket_tokens[1:-1]
                    current_bracket_tokens = [t for t in current_bracket_tokens if t.exact_type not in (tokenize.NL, tokenize.NEWLINE)]
                    current_term_tokens = []        # tokens of the argument being accumulated
                    simplified_bracket_tokens = []  # textual form of each argument
                    original_matches = matches      # to detect a match in this call
                    for i, term in enumerate(current_bracket_tokens):
                        if term.visited:
                            # already consumed while processing a nested call
                            continue
                        term.visited = True
                        if term.exact_type != tokenize.COMMA:
                            current_term_tokens.append(term)
                            if i != len(current_bracket_tokens) - 1:
                                continue
                        # end of an argument (comma, or last token): try to
                        # collapse the exact 3-token pattern `name = name`
                        new_form = None
                        try:
                            lhs, mid, rhs = current_term_tokens
                            if lhs.text and lhs.text == rhs.text and mid.exact_type == tokenize.EQUAL:
                                new_form = f'={lhs.text}'
                                assert lhs.start_line_num == rhs.start_line_num
                                new_file_line_num = lhs.start_line_num - lines_removed
                                old_line = new_file[new_file_line_num]
                                # splice `=name` over the original `name=name`
                                # span, shifting columns by earlier edits
                                new_file[new_file_line_num] = old_line[:lhs.start_col - chars_removed_from_line[lhs.start_line_num]] + new_form + old_line[rhs.end_col - chars_removed_from_line[lhs.start_line_num]:]
                                chars_removed_from_line[lhs.start_line_num] += (rhs.end_col - lhs.start_col) - len(new_form)
                                matches += 1
                                file_changed = True
                        except:
                            # not a clean 3-token `name=name` term (unpacking
                            # above raises ValueError); leave argument as-is
                            pass
                        if new_form:
                            simplified_bracket_tokens.append(new_form)
                        elif current_term_tokens:
                            # keep the argument's original source text verbatim
                            start_idx = sum(map(len, file[:current_term_tokens[0].start_line_num])) + current_term_tokens[0].start_col
                            end_idx = sum(map(len, file[:current_term_tokens[-1].end_line_num])) + current_term_tokens[-1].end_col
                            simplified_bracket_tokens.append(flat_file[start_idx:end_idx])
                        current_term_tokens = []
                    if original_matches != matches:
                        # rebuild the entire call as a single line: prefix up to
                        # and including the open paren, rewritten args, close
                        start_idx = sum(map(len, file[:open_bracket.start_line_num]))
                        end_idx = sum(map(len, file[:close_bracket.end_line_num + 1])) - 1 # remove trailing newline
                        new_line = f'{flat_file[start_idx:start_idx + open_bracket.end_col]}{", ".join(simplified_bracket_tokens)}){file[close_bracket.end_line_num][close_bracket.end_col:]}'
                        if len(new_line) > 80:
                            # collapsed form still too long; leave untouched
                            continue
                        new_file_line_num = open_bracket.start_line_num - lines_removed
                        new_file[new_file_line_num] = new_line
                        new_file = new_file[:new_file_line_num + 1] + new_file[close_bracket.end_line_num - lines_removed + 1:]
                        change = close_bracket.end_line_num - open_bracket.start_line_num
                        lines_removed += change
                        if (end_idx - start_idx) < 80:
                            # only report spans that were originally long
                            continue
                        if change > biggest_change:
                            biggest_change = change
                            biggest_change_file = path
                        reductions += 1
                        total_lines_saved += change
                        print(' ', flat_file[start_idx:end_idx])
                        lines = sorted({open_bracket.start_line_num, close_bracket.end_line_num})
                        print(' ', f'on L{"-".join(map(str, lines))} can become')
                        print(' ', new_line)
                        print()
        if output and file_changed:
            with open(path, 'w') as fout:
                # drop the '' encoding placeholder at index 0
                fout.write(''.join(new_file[1:]))
    print(f'syntax errors ({len(syntax_errors)}):', syntax_errors)
    print('matches:', matches)
    print('lines_saved:', total_lines_saved)
    print('reductions (lines of length >80 become < 80):', reductions)
    print('biggest change:', biggest_change, f'({biggest_change_file})')
    # NOTE(review): nothing visible in this file ever calls main() — presumably
    # it is invoked manually or an `if __name__ == '__main__': main()` guard
    # was lost in the paste; confirm before relying on this script as-is.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I hacked this together very quickly and in no way support all the design decisions made in the process! That said, if I recall correctly, the output of
ast
ignores the positions of characters (e.g. it removes comments and whitespace) and this made it impossible to experiment with introducing the new syntax in place. This may have been a circumventable issue, but I didn't spot a solution at the time.