Last active: September 10, 2018 18:52
-
-
Save mbarkhau/7ad941b982b96ae62dd66bd6d6924a28 to your computer and use it in GitHub Desktop.
black_alignment_postproc.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re
from enum import Enum

# NOTE(review): replaced `from typing import *` with the explicit names this
# module actually uses (star imports hide provenance and pollute the
# namespace; verified uses: Dict, Iterator, List, NamedTuple, Optional, Tuple).
from typing import Dict, Iterator, List, NamedTuple, Optional, Tuple

# Alias for the full text contents of a source file.
FileContent = str
# Inline sample of black-formatted python source, used as input for the
# alignment post-processor defined below.
# NOTE(review): this literal is immediately overwritten further down by
# `str_contents = open("test.txt").read()`, so it only serves as a
# reference fixture. Indentation inside the embedded sample appears to
# have been lost in this copy — verify against the original fixture.
str_contents = """
#!/usr/bin/env python3
# fmt: on
# Some license here.
#
# Has many lines. Many, many lines.
# Many, many, many lines.
\"\"\"Module docstring.
Possibly also many, many lines.
\"\"\"
import os.path
import sys
import a
from b.c import X # some noqa comment
def test_parse():
\"\"\"Docstring comes first.
Possibly many lines.
\"\"\"
# FIXME: Some comment about why this function is crap but still in production.
environ = {
"MYAPP_DB_HOST": "1.2.3.4",
"MYAPP_DB_PORT": "1234",
'MYAPP_DB_PASSWORD': "secret",
'MYAPP_DB_READ_ONLY': "0",
'MYAPP_DB_DDL': "~/mkdb.sql",
'MYAPP_DL': 123_123,
'MYAPP_DL': 123_123_929,
'MYAPP_DBDL': 12,
}
barf = {
22: 23_222,
2234: 231_231_231_232,
1234: 231_231_232,
}
dbenv = myenv.parse(DBEnv, environ, prefix="MYAPP_DB_")
assert dbenv.host == "1.2.3.4"
assert dbenv.user == "new_user"
assert dbenv.password == "secret"
assert dbenv.read_only is False
assert isinstance(dbenv.ddl, pl.Path)
assert str(dbenv.ddl).endswith("mkdb.sql")
assert len(attrnames) == 7
GLOBAL_STATE = {"a": a(1), "b": a(2), "c": a(3)}
'What\\'s the deal "here"?'
'And "here"?'
StrList = List[str]
PathList = List[Path]
class TestEnv(myenv.BaseEnv):
'''
notakey: notaval
notakeyeither: notaval
'''
str_val_wo_default: str
float_val_wo_default: float
path_val_wo_default: pl.Path
paths_val_wo_default: List[pl.Path]
str_val: str = "foo"
strs_val: StrList = ["foo = bar", "barfoo = baz"]
float_val: float = 12.34
path_val: pl.Path = pl.Path("file.txt")
paths_val: PathList = [
[1, 2, 3, 4],
[5, 6, 7, 8],
[1, 22, 2243, 4],
{23: "asdf", 22: "aaaa", 443: "bbbb", 439: "cccc"},
{1123: "asdf", 22: "k3k3", 443: "jfjf", 439: "k2k2"},
{1: "asdf", 2332: "asdfk3k3", 42243: "jssdfjf", 4: "k2k2eie"},
]
\"\"\"Docstring for instance attribute spam.\"\"\"
"""
# Alternative input kept for reference:
# str_contents = open("blacker.py").read()
# Read the real input, replacing the inline sample above. Use a context
# manager so the file handle is closed deterministically instead of leaked.
with open("test.txt") as _in_file:
    str_contents = _in_file.read()
# Separator tokens that act as alignment anchors: the token immediately
# *before* one of these may be padded so the separator lines up vertically
# across adjacent rows. The manual layout below groups multi-character
# operators before their single-character prefixes.
# fmt: off
ALIGN_BEFORE_TOKENS = {
    "<<=", ">>=", "**=", "//=",
    "+=", "-=", "*=", "/=", "%=", "|=", "&=", "@=",
    "==", "!=", "<=", ">=",
    "//", "<<", ">>", "^=", "~=",
    "in", "is", "},", "],", "),",
    "->",
    ",", ":", "=",
    "+", "-", "*", "/",
    "%", "|", "&", "^", "~",
    "!", "<", ">",
    "}", "]", ")",
}
# fmt: on
# For each opening delimiter of a "no alignment" block (strings, docstrings,
# comments), a pattern that finds its closing delimiter. The lookbehind
# (?<![^\\]\\) rejects delimiters that are backslash-escaped; a comment
# simply ends at end of line ($ with MULTILINE).
NO_ALIGN_BLOCK_END_MATCHERS = {
    "'''": re.compile(r"(?<![^\\]\\)'''"),
    '"""': re.compile(r"(?<![^\\]\\)\"\"\""),
    '"' : re.compile(r"(?<![^\\]\\)\""),
    "'" : re.compile(r"(?<![^\\]\\)'"),
    "#": re.compile(r"$", flags=re.MULTILINE),
}
# Master tokenizer pattern. Alternatives are ordered so string/comment
# openers match before operators, multi-character operators before their
# single-character prefixes, and the zero-width $ (MULTILINE) matches end
# of line as the fallback. Whitespace in the pattern is ignored (VERBOSE).
ALIGN_TOKEN_SEP_RE = re.compile(
    r"""
(
'''
|\"\"\"
|\"
|'
|\#
|<<= |>>= |\*\*= |//=
|\+= |\-= |\*= |/= |%= |\|= |&= |@=
|== |!= |<= |>=
|// |<< |>> |\^= |~=
|(?<!\w)in(?!\w) |(?<!\w)is(?!\w) |\}, |\], |\),
|\->
|, |: |=
|\+ |\- |\* |/
|% |\| |& |\^ |~
|! |< |>
|\{ |\[ |\(
|\} |\] |\)
|$
)
""",
    flags=re.MULTILINE | re.VERBOSE,
)
# Candidate symbol/atom string: a double-quoted run of word characters.
# Used with .match(), i.e. anchored at the start of the token value.
SYMBOL_STRING_RE = re.compile(r"\"\w+(\w\d)?\"")
class TokenType(Enum):
    """Lexical categories produced by tokenize_for_alignment."""

    INDENT = 0      # leading whitespace of a line
    SEPARATOR = 1   # operator/punctuation token (potential alignment anchor)
    CODE = 2        # any other run of source text
    NEWLINE = 3     # a single "\n"
    BLOCK = 4       # a complete string/docstring, consumed as one token
    COMMENT = 5     # a complete "# ..." comment
    WHITESPACE = 6  # a non-leading whitespace run


# The raw source text of a token.
TokenVal = str
class Token(NamedTuple):
    """A lexical token: its category and its raw source text."""

    typ: TokenType
    val: TokenVal
def tokenize_for_alignment(src_contents: str) -> Iterator[Token]:
    """Split black-formatted source text into alignment tokens.

    Repeatedly scans for the next separator with ALIGN_TOKEN_SEP_RE and
    yields INDENT/SEPARATOR/CODE/NEWLINE/BLOCK/COMMENT/WHITESPACE tokens
    such that concatenating all yielded token values reproduces
    ``src_contents`` exactly. Strings and comments are consumed whole
    (as BLOCK/COMMENT tokens) so separator characters inside them are
    never treated as alignment anchors.
    """
    rest = src_contents
    prev_rest = None
    while rest:
        # Guard against an infinite loop: every iteration must consume input.
        assert rest != prev_rest, "No progress at: " + repr(rest[:40])
        prev_rest = rest
        curr_token_sep = ALIGN_TOKEN_SEP_RE.search(rest)
        # The pattern includes zero-width `$` (MULTILINE), so it always matches.
        assert curr_token_sep is not None
        curr_token_start, curr_token_end = curr_token_sep.span()
        # newline match has zero width
        is_newline = curr_token_start == curr_token_end
        if is_newline:
            # adjust for zero width match
            curr_token_end = curr_token_start + 1
            # Get everything (if anything) up to (and excluding) the newline
            token_val = rest[:curr_token_start]
            if token_val:
                assert token_val != "\n"
                yield Token(TokenType.CODE, token_val)
            # The newline itself (note that black promises to
            # have normalized CRLF etc. to plain LF)
            token_val = rest[curr_token_start:curr_token_end]
            assert token_val == "\n"
            yield Token(TokenType.NEWLINE, token_val)
            rest = rest[curr_token_end:]
            # parse any indent
            new_rest = rest.lstrip(" \t")
            indent_len = len(rest) - len(new_rest)
            if indent_len > 0:
                indent_token_val = rest[:indent_len]
                yield Token(TokenType.INDENT, indent_token_val)
                rest = new_rest
        elif curr_token_start > 0:
            # Text before the separator: emit it as WHITESPACE or CODE and
            # leave the separator itself for the next iteration.
            prev_token_val = rest[:curr_token_start]
            rest = rest[curr_token_start:]
            assert prev_token_val != "\n"
            assert prev_token_val not in ALIGN_BEFORE_TOKENS, repr(prev_token_val)
            if len(prev_token_val.strip()) == 0:
                yield Token(TokenType.WHITESPACE, prev_token_val)
            else:
                yield Token(TokenType.CODE, prev_token_val)
        else:
            # The separator sits at the very start of `rest`.
            token_val = curr_token_sep.group(0)
            if token_val in NO_ALIGN_BLOCK_END_MATCHERS:
                # comment, string or docstring
                block_begin_val = token_val
                assert curr_token_end > 0
                rest = rest[len(block_begin_val) :]
                # Consume everything up to and including the closing delimiter.
                end_matcher = NO_ALIGN_BLOCK_END_MATCHERS[token_val]
                block_end_match = end_matcher.search(rest)
                assert block_end_match, rest[:40]
                block_end_token = block_end_match.group(0)
                block_end_index = block_end_match.span()[-1]
                assert block_end_index <= len(rest), f"{len(rest)} < {block_end_index}"
                block_rest = rest[:block_end_index]
                block_token_val = block_begin_val + block_rest
                assert block_token_val.endswith(block_end_token)
                if block_token_val.strip().startswith("#"):
                    yield Token(TokenType.COMMENT, block_token_val)
                else:
                    yield Token(TokenType.BLOCK, block_token_val)
                rest = rest[block_end_index:]
            else:
                sep_token_val = token_val
                yield Token(TokenType.SEPARATOR, sep_token_val)
                rest = rest[curr_token_end:]
        # NOTE (mb 2018-09-09): The way we tokenize, we always consume
        # all content belonging to strings and comments. This means that
        # the rest (after consuming all content of a string or comment),
        # should continue to be valid python. This means we can do some
        # basic sanity checks. For example, no valid python token begins
        # with a questionmark (though this is actually introduced because
        # one of the test cases conveniently has a questionmark as the
        # first character after an edge case of string parsing).
        assert not rest.startswith("?"), repr(rest)
# Type aliases used throughout the alignment passes.
Indent = str       # leading whitespace of a line
RowIndex = int     # index of a row (source line) in the token table
ColIndex = int     # index of a token within a row
OffsetWidth = int  # width of the token preceding an alignment anchor
TokenTable = List[List[Token]]  # one inner list of tokens per source line
class RowLayoutToken(NamedTuple):
    """Disambiguate between lines with different layout/structure

    We only want to align lines which have the same structure of
    indent and separators. Any difference in the number of elements
    or type of separators causes alignment to be disabled.
    """

    typ: TokenType
    # val is only set if it should cause a different prefix
    # eg. if a separator is a comma vs a period.
    val: Optional[TokenVal]
# Tokens whose values are relevant to the layout of a cell group.
# (Idiom fix: set literal instead of set([...]) around a list.)
LAYOUT_VAL_TOKENS = {TokenType.SEPARATOR, TokenType.INDENT}

# The layout signature of a row: the sequence of its layout-relevant tokens.
RowLayoutTokens = Tuple[RowLayoutToken, ...]
class AlignmentContextKey(NamedTuple):
    """Does not change between multiple lines that can be aligned."""

    col_idx: ColIndex
    tok_typ: TokenType
    tok_val: TokenVal
    layout : RowLayoutTokens


# Maps each alignment anchor found in a row to the width of the token
# immediately before it.
AlignmentContext = Dict[AlignmentContextKey, OffsetWidth]
class AlignmentCellKey(NamedTuple):
    """Identifies a cell group; keyed by the group's most recent row index."""

    last_row_index: RowIndex
    col_index : ColIndex
    token_val : TokenVal
    layout : RowLayoutTokens


class AlignmentCell(NamedTuple):
    """One alignable position: its row and the preceding token's width."""

    row_idx: RowIndex
    offset_width: OffsetWidth


# Cells that can be vertically aligned with each other, grouped by key.
CellGroups = Dict[AlignmentCellKey, List[AlignmentCell]]
def normalize_symbol_strings(row: List[Token]) -> None:
    """Rewrite double quotes as single quotes on symbol/atom strings.

    Symbol/atom strings are strings which are code rather than data: they
    have no meaning outside the program and must be valid python
    identifiers (dictionary keys, attribute names, implicit enums). Data
    strings (urls, user readable text, translation/format strings) are
    left untouched. Only the detectable subset is handled: strings used
    as dictionary keys, and attribute names passed to getattr, setattr
    and delattr. Mutates ``row`` in place.
    """
    for idx, tok in enumerate(row):
        # Case 1: a BLOCK string directly before a ':' or ']' separator is
        # a dict key (or subscript) symbol string.
        if tok.typ == TokenType.SEPARATOR and tok.val in (":", "]") and idx > 0:
            prev_tok = row[idx - 1]
            if prev_tok.typ == TokenType.BLOCK and SYMBOL_STRING_RE.match(prev_tok.val):
                row[idx - 1] = Token(TokenType.BLOCK, prev_tok.val.replace('"', "'"))

        # Case 2: getattr(obj, "name") and friends — token shape is
        # CODE '(' CODE ',' ' ' BLOCK, with the BLOCK as the symbol string.
        is_attr_call = (
            tok.typ == TokenType.CODE
            and tok.val in ("getattr", "setattr", "delattr")
            and idx + 5 < len(row)
            and row[idx + 1] == Token(TokenType.SEPARATOR, "(")
            and row[idx + 2].typ == TokenType.CODE
            and row[idx + 3] == Token(TokenType.SEPARATOR, ",")
            and row[idx + 4] == Token(TokenType.WHITESPACE, " ")
            and row[idx + 5].typ == TokenType.BLOCK
            and SYMBOL_STRING_RE.match(row[idx + 5].val)
        )
        if is_attr_call:
            row[idx + 5] = Token(TokenType.BLOCK, row[idx + 5].val.replace('"', "'"))
def find_alignment_contexts(table: TokenTable) -> Iterator[AlignmentContext]:
    """Yield one AlignmentContext per row of the token table.

    For every separator in ALIGN_BEFORE_TOKENS the context maps an
    AlignmentContextKey (column, token, row layout so far) to the width
    of the token immediately preceding the separator. Alignment is
    toggled by "fmt: off"/"fmt: on" comments, mirroring black's pragmas;
    while disabled, rows produce empty contexts.

    NOTE(review): removed a dead computation of the row's indent string
    (`maybe_indent_token`/`indent`) that was assigned but never read.
    """
    is_alignment_enabled = True
    for row in table:
        ctx: AlignmentContext = {}
        layout: RowLayoutTokens = tuple()
        if is_alignment_enabled:
            normalize_symbol_strings(row)
        for col_index, token in enumerate(row):
            if token.typ == TokenType.COMMENT and "fmt: off" in token.val:
                is_alignment_enabled = False
            if token.typ == TokenType.COMMENT and "fmt: on" in token.val:
                is_alignment_enabled = True
            if not is_alignment_enabled:
                continue
            layout_token_val: Optional[TokenVal]
            if token.typ in LAYOUT_VAL_TOKENS:
                if token.val in ALIGN_BEFORE_TOKENS:
                    layout_token_val = token.val
                elif col_index > 0:
                    # Layout tokens such as ([{ don't cause alignment to
                    # their preceding token, so the line offset up to the
                    # column of those tokens can be different. We only
                    # want to continue with alignment if the tokens are
                    # all at the same line offset.
                    layout_token_val = token.val + f"::{len(row[col_index - 1].val)}"
                else:
                    layout_token_val = None
            else:
                layout_token_val = None
            layout += (RowLayoutToken(token.typ, layout_token_val),)
            if token.val in ALIGN_BEFORE_TOKENS:
                assert token.typ == TokenType.SEPARATOR
                prev_token = row[col_index - 1]
                if prev_token.typ == TokenType.SEPARATOR:
                    # Never pad a separator to align another separator.
                    continue
                offset_width = len(prev_token.val)
                ctx_key = AlignmentContextKey(col_index, token.typ, token.val, layout)
                ctx[ctx_key] = offset_width
        yield ctx
def find_cell_groups(alignment_contexts: List[AlignmentContext]) -> CellGroups:
    """Group alignable cells across consecutive rows.

    A cell extends an existing group when the previous row contributed a
    cell with the same column, token value and row layout; otherwise it
    starts a fresh group. Group keys always carry the most recent row
    index, so a gap of even one row closes the group.
    """
    cell_groups: Dict[AlignmentCellKey, List[AlignmentCell]] = {}
    for row_index, ctx in enumerate(alignment_contexts):
        for ctx_key, offset_width in sorted(ctx.items()):
            col_index, _token_typ, token_val, layout = ctx_key
            prev_key = AlignmentCellKey(row_index - 1, col_index, token_val, layout)
            curr_key = AlignmentCellKey(row_index, col_index, token_val, layout)
            # Re-key the group under the current row, extending it if the
            # previous row had a matching cell (pop removes the stale key).
            group = cell_groups.pop(prev_key, [])
            group.append(AlignmentCell(row_index, offset_width))
            cell_groups[curr_key] = group
    return cell_groups
def realigned_contents(table: TokenTable, cell_groups: CellGroups) -> str:
    """Apply padding for each cell group and re-join the table into text.

    Only groups spanning at least 3 rows are aligned. Within a group the
    token before each anchor is padded to the group's maximum width:
    numbers are right-aligned (padding on the left), everything else is
    left-aligned. Anchors that are the last visible token of their row
    are skipped. Mutates ``table`` rows in place, then concatenates all
    token values back into the file contents.

    NOTE(review): removed an unused `prev_col_index` accumulator that was
    written on every iteration but never read.
    """
    for ctx_key, cells in sorted(cell_groups.items()):
        if len(cells) < 3:
            # Not enough rows to make alignment worthwhile.
            continue
        max_offset_width = max(ow for _, ow in cells)
        for row_index, offset_width in cells:
            extra_offset = max_offset_width - offset_width
            if extra_offset == 0:
                continue
            row = table[row_index]
            left_token = row[ctx_key.col_index - 1]
            is_last_sep_token = all(
                token.typ in (TokenType.NEWLINE, TokenType.COMMENT, TokenType.WHITESPACE)
                for token in row[ctx_key.col_index + 1:]
            )
            maybe_number = left_token.val.strip().replace("_", "")
            if maybe_number.isdigit():
                # Numbers are right-aligned: pad on the left.
                padded_left_token_val = " " * extra_offset + left_token.val
            elif is_last_sep_token:
                # don't align if this is the last token of the row
                continue
            else:
                padded_left_token_val = left_token.val + " " * extra_offset
            padded_token = Token(TokenType.CODE, padded_left_token_val)
            row[ctx_key.col_index - 1] = padded_token
    return "".join("".join(token.val for token in row) for row in table)
def align_formatted_str(src_contents: str) -> FileContent:
    """Tokenize black-formatted source, group alignable cells and return
    the contents with vertical alignment padding applied.

    Set ``debug`` to a truthy value to dump tokens, rows and cell groups.
    """
    debug = 0

    def _dump_rows() -> None:
        # Debug helper: print each row's tokens, one per line.
        for row in table:
            print("ROW: ", end="")
            for cell_token in row:
                print(cell_token, end="\n ")
            print()

    def _dump_groups() -> None:
        # Debug helper: print every multi-cell group and its members.
        for group_key, group_cells in cell_groups.items():
            if len(group_cells) > 1:
                print("CELL", len(group_cells), group_key)
                for cell in group_cells:
                    print("\t\t", cell)

    # Build a table with one row of tokens per source line.
    table: TokenTable = [[]]
    for token in tokenize_for_alignment(src_contents):
        if debug:
            print("TOKEN: ", repr(token.val).ljust(50), token)
        table[-1].append(token)
        if token.typ == TokenType.NEWLINE:
            table.append([])
        else:
            # Only block-like tokens may legitimately span multiple lines.
            multiline_ok = token.typ in (TokenType.BLOCK, TokenType.COMMENT, TokenType.WHITESPACE)
            assert multiline_ok or "\n" not in token.val
    if debug:
        _dump_rows()
    alignment_contexts = list(find_alignment_contexts(table))
    cell_groups = find_cell_groups(alignment_contexts)
    if debug:
        _dump_groups()
    return realigned_contents(table, cell_groups)
print(align_formatted_str(str_contents)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment