black_alignment_postproc.py
import re
from enum import Enum
from typing import Dict, Iterator, List, NamedTuple, Optional, Tuple

FileContent = str
str_contents = """
#!/usr/bin/env python3
# fmt: on
# Some license here.
#
# Has many lines. Many, many lines.
# Many, many, many lines.
\"\"\"Module docstring.
Possibly also many, many lines.
\"\"\"
import os.path
import sys
import a
from b.c import X # some noqa comment
def test_parse():
    \"\"\"Docstring comes first.
    Possibly many lines.
    \"\"\"
    # FIXME: Some comment about why this function is crap but still in production.
    environ = {
        "MYAPP_DB_HOST": "1.2.3.4",
        "MYAPP_DB_PORT": "1234",
        'MYAPP_DB_PASSWORD': "secret",
        'MYAPP_DB_READ_ONLY': "0",
        'MYAPP_DB_DDL': "~/mkdb.sql",
        'MYAPP_DL': 123_123,
        'MYAPP_DL': 123_123_929,
        'MYAPP_DBDL': 12,
    }
    barf = {
        22: 23_222,
        2234: 231_231_231_232,
        1234: 231_231_232,
    }
    dbenv = myenv.parse(DBEnv, environ, prefix="MYAPP_DB_")
    assert dbenv.host == "1.2.3.4"
    assert dbenv.user == "new_user"
    assert dbenv.password == "secret"
    assert dbenv.read_only is False
    assert isinstance(dbenv.ddl, pl.Path)
    assert str(dbenv.ddl).endswith("mkdb.sql")
    assert len(attrnames) == 7
GLOBAL_STATE = {"a": a(1), "b": a(2), "c": a(3)}
'What\\'s the deal "here"?'
'And "here"?'
StrList = List[str]
PathList = List[Path]
class TestEnv(myenv.BaseEnv):
    '''
    notakey: notaval
    notakeyeither: notaval
    '''
    str_val_wo_default: str
    float_val_wo_default: float
    path_val_wo_default: pl.Path
    paths_val_wo_default: List[pl.Path]
    str_val: str = "foo"
    strs_val: StrList = ["foo = bar", "barfoo = baz"]
    float_val: float = 12.34
    path_val: pl.Path = pl.Path("file.txt")
    paths_val: PathList = [
        [1, 2, 3, 4],
        [5, 6, 7, 8],
        [1, 22, 2243, 4],
        {23: "asdf", 22: "aaaa", 443: "bbbb", 439: "cccc"},
        {1123: "asdf", 22: "k3k3", 443: "jfjf", 439: "k2k2"},
        {1: "asdf", 2332: "asdfk3k3", 42243: "jssdfjf", 4: "k2k2eie"},
    ]
    \"\"\"Docstring for instance attribute spam.\"\"\"
"""
# To run against a file instead of the embedded sample, uncomment one of these:
# str_contents = open("blacker.py").read()
# str_contents = open("test.txt").read()
# fmt: off
ALIGN_BEFORE_TOKENS = {
    "<<=", ">>=", "**=", "//=",
    "+=", "-=", "*=", "/=", "%=", "|=", "&=", "@=",
    "==", "!=", "<=", ">=",
    "//", "<<", ">>", "^=", "~=",
    "in", "is", "},", "],", "),",
    "->",
    ",", ":", "=",
    "+", "-", "*", "/",
    "%", "|", "&", "^", "~",
    "!", "<", ">",
    "}", "]", ")",
}
# fmt: on
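# Alignment is applied to the token *preceding* one of the separators in
# ALIGN_BEFORE_TOKENS, so that matching separators on consecutive lines
# end up in the same column.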
NO_ALIGN_BLOCK_END_MATCHERS = {
    "'''": re.compile(r"(?<![^\\]\\)'''"),
    '"""': re.compile(r"(?<![^\\]\\)\"\"\""),
    '"' : re.compile(r"(?<![^\\]\\)\""),
    "'" : re.compile(r"(?<![^\\]\\)'"),
    "#": re.compile(r"$", flags=re.MULTILINE),
}
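# In NO_ALIGN_BLOCK_END_MATCHERS, the (?<![^\\]\\) lookbehind skips closing
# quotes that are escaped by a single backslash, so e.g. a \" inside a string
# does not end the block early.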
ALIGN_TOKEN_SEP_RE = re.compile(
    r"""
    (
        '''
        |\"\"\"
        |\"
        |'
        |\#
        |<<= |>>= |\*\*= |//=
        |\+= |\-= |\*= |/= |%= |\|= |&= |@=
        |== |!= |<= |>=
        |// |<< |>> |\^= |~=
        |(?<!\w)in(?!\w) |(?<!\w)is(?!\w) |\}, |\], |\),
        |\->
        |, |: |=
        |\+ |\- |\* |/
        |% |\| |& |\^ |~
        |! |< |>
        |\{ |\[ |\(
        |\} |\] |\)
        |$
    )
    """,
    flags=re.MULTILINE | re.VERBOSE,
)
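# In ALIGN_TOKEN_SEP_RE the multi-character operators come first in the
# alternation, so e.g. "<<=" is matched as a single separator rather than
# as "<" followed by "<=".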
SYMBOL_STRING_RE = re.compile(r"\"\w+(\w\d)?\"")
class TokenType(Enum):
    INDENT = 0
    SEPARATOR = 1
    CODE = 2
    NEWLINE = 3
    BLOCK = 4
    COMMENT = 5
    WHITESPACE = 6
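# Rough meaning of the token kinds (as used below):
#   INDENT     - leading whitespace of a line
#   SEPARATOR  - an operator/punctuation token matched by ALIGN_TOKEN_SEP_RE
#   CODE       - any other source text between separators
#   NEWLINE    - a single "\n"
#   BLOCK      - a complete string/docstring literal
#   COMMENT    - a "#" comment up to the end of the line
#   WHITESPACE - a whitespace-only run between separators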
TokenVal = str
class Token(NamedTuple):
    typ: TokenType
    val: TokenVal
def tokenize_for_alignment(src_contents: str) -> Iterator[Token]:
    rest = src_contents
    prev_rest = None
    while rest:
        assert rest != prev_rest, "No progress at: " + repr(rest[:40])
        prev_rest = rest

        curr_token_sep = ALIGN_TOKEN_SEP_RE.search(rest)
        assert curr_token_sep is not None
        curr_token_start, curr_token_end = curr_token_sep.span()

        # newline match has zero width
        is_newline = curr_token_start == curr_token_end
        if is_newline:
            # adjust for zero width match
            curr_token_end = curr_token_start + 1

            # Get everything (if anything) up to (and excluding) the newline
            token_val = rest[:curr_token_start]
            if token_val:
                assert token_val != "\n"
                yield Token(TokenType.CODE, token_val)

            # The newline itself (note that black promises to
            # have normalized CRLF etc. to plain LF)
            token_val = rest[curr_token_start:curr_token_end]
            assert token_val == "\n"
            yield Token(TokenType.NEWLINE, token_val)
            rest = rest[curr_token_end:]

            # parse any indent
            new_rest = rest.lstrip(" \t")
            indent_len = len(rest) - len(new_rest)
            if indent_len > 0:
                indent_token_val = rest[:indent_len]
                yield Token(TokenType.INDENT, indent_token_val)
                rest = new_rest
        elif curr_token_start > 0:
            prev_token_val = rest[:curr_token_start]
            rest = rest[curr_token_start:]
            assert prev_token_val != "\n"
            assert prev_token_val not in ALIGN_BEFORE_TOKENS, repr(prev_token_val)
            if len(prev_token_val.strip()) == 0:
                yield Token(TokenType.WHITESPACE, prev_token_val)
            else:
                yield Token(TokenType.CODE, prev_token_val)
        else:
            token_val = curr_token_sep.group(0)
            if token_val in NO_ALIGN_BLOCK_END_MATCHERS:
                # comment, string or docstring
                block_begin_val = token_val
                assert curr_token_end > 0
                rest = rest[len(block_begin_val) :]

                end_matcher = NO_ALIGN_BLOCK_END_MATCHERS[token_val]
                block_end_match = end_matcher.search(rest)
                assert block_end_match, rest[:40]
                block_end_token = block_end_match.group(0)
                block_end_index = block_end_match.span()[-1]
                assert block_end_index <= len(rest), f"{len(rest)} < {block_end_index}"

                block_rest = rest[:block_end_index]
                block_token_val = block_begin_val + block_rest
                assert block_token_val.endswith(block_end_token)
                if block_token_val.strip().startswith("#"):
                    yield Token(TokenType.COMMENT, block_token_val)
                else:
                    yield Token(TokenType.BLOCK, block_token_val)
                rest = rest[block_end_index:]
            else:
                sep_token_val = token_val
                yield Token(TokenType.SEPARATOR, sep_token_val)
                rest = rest[curr_token_end:]
        # NOTE (mb 2018-09-09): The way we tokenize, we always consume
        # all content belonging to strings and comments. This means that
        # the rest (after consuming the content of a string or comment)
        # should still be valid Python, so we can do some basic sanity
        # checks. For example, no valid Python token begins with a
        # question mark (this check exists because one of the test cases
        # conveniently has a question mark as the first character after
        # an edge case of string parsing).
        assert not rest.startswith("?"), repr(rest)
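# A rough sketch of the token stream produced above (illustrative): for the
# input "x = 1  # note\n" the generator yields
#     Token(CODE, "x "), Token(SEPARATOR, "="), Token(CODE, " 1  "),
#     Token(COMMENT, "# note"), Token(NEWLINE, "\n")
# i.e. surrounding whitespace is folded into CODE tokens, while comments and
# string literals are consumed whole as COMMENT/BLOCK tokens.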
Indent = str
RowIndex = int
ColIndex = int
OffsetWidth = int
TokenTable = List[List[Token]]
class RowLayoutToken(NamedTuple):
    """Disambiguate between lines with different layout/structure.

    We only want to align lines which have the same structure of
    indent and separators. Any difference in the number of elements
    or type of separators causes alignment to be disabled.
    """

    typ: TokenType
    # val is only set if it should cause a different prefix,
    # e.g. if a separator is a comma vs a period.
    val: Optional[TokenVal]
# Tokens whose values are relevant to the layout of a cell group.
LAYOUT_VAL_TOKENS = set([TokenType.SEPARATOR, TokenType.INDENT])
RowLayoutTokens = Tuple[RowLayoutToken, ...]
class AlignmentContextKey(NamedTuple):
    """Does not change between multiple lines that can be aligned."""

    col_idx: ColIndex
    tok_typ: TokenType
    tok_val: TokenVal
    layout : RowLayoutTokens


AlignmentContext = Dict[AlignmentContextKey, OffsetWidth]


class AlignmentCellKey(NamedTuple):
    last_row_index: RowIndex
    col_index     : ColIndex
    token_val     : TokenVal
    layout        : RowLayoutTokens


class AlignmentCell(NamedTuple):
    row_idx: RowIndex
    offset_width: OffsetWidth
CellGroups = Dict[AlignmentCellKey, List[AlignmentCell]]
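# Overall data flow: the token table (one row per source line) is turned into
# one AlignmentContext per row (find_alignment_contexts), matching contexts on
# consecutive rows are merged into cell groups (find_cell_groups), and
# realigned_contents then pads each group to the width of its widest member.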
def normalize_symbol_strings(row: List[Token]) -> None:
    """Convert double quotes to single quotes for internal/symbol/atom strings.

    Internal/symbol/atom strings are strings which are code as opposed
    to data. They have no meaning outside of the context of the
    program. Symbol strings must be valid Python identifiers.

    They are:
    - dictionary keys
    - attribute names
    - implicit enums

    They are not:
    - urls
    - user readable text
    - translation strings
    - format strings

    This function performs the conversion only for a subset of cases,
    since it cannot detect all of them. The covered cases are strings
    used as dictionary keys and strings used for attribute access via
    getattr, setattr and delattr.
    """
    for col_index, tok_cell in enumerate(row):
        is_dict_key_symbol = (
            tok_cell.typ == TokenType.SEPARATOR
            and tok_cell.val in (":", "]")
            and col_index > 0
            and row[col_index - 1].typ == TokenType.BLOCK
            and SYMBOL_STRING_RE.match(row[col_index - 1].val)
        )
        if is_dict_key_symbol:
            normalized_token_val = row[col_index - 1].val.replace('"', "'")
            row[col_index - 1] = Token(TokenType.BLOCK, normalized_token_val)

        is_attrname_symbol = (
            tok_cell.typ == TokenType.CODE
            and tok_cell.val in ('getattr', 'setattr', 'delattr')
            and col_index + 5 < len(row)
            and row[col_index + 1].typ == TokenType.SEPARATOR
            and row[col_index + 1].val == "("
            and row[col_index + 2].typ == TokenType.CODE
            and row[col_index + 3].typ == TokenType.SEPARATOR
            and row[col_index + 3].val == ","
            and row[col_index + 4].typ == TokenType.WHITESPACE
            and row[col_index + 4].val == " "
            and row[col_index + 5].typ == TokenType.BLOCK
            and SYMBOL_STRING_RE.match(row[col_index + 5].val)
        )
        if is_attrname_symbol:
            normalized_token_val = row[col_index + 5].val.replace('"', "'")
            row[col_index + 5] = Token(TokenType.BLOCK, normalized_token_val)
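# Illustrative effect (assuming the row was produced by tokenize_for_alignment):
#     {"host": "1.2.3.4"}           ->  {'host': "1.2.3.4"}
#     getattr(obj, "attr", None)    ->  getattr(obj, 'attr', None)
# Only the key/attribute-name string is rewritten; value strings are left alone.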
def find_alignment_contexts(table: TokenTable) -> Iterator[AlignmentContext]:
    is_alignment_enabled = True
    for row in table:
        ctx: AlignmentContext = {}
        layout: RowLayoutTokens = tuple()
        if is_alignment_enabled:
            normalize_symbol_strings(row)
        for col_index, token in enumerate(row):
            if token.typ == TokenType.COMMENT and "fmt: off" in token.val:
                is_alignment_enabled = False
            if token.typ == TokenType.COMMENT and "fmt: on" in token.val:
                is_alignment_enabled = True
            if not is_alignment_enabled:
                continue

            layout_token_val: Optional[TokenVal]
            if token.typ in LAYOUT_VAL_TOKENS:
                if token.val in ALIGN_BEFORE_TOKENS:
                    layout_token_val = token.val
                elif col_index > 0:
                    # Layout tokens such as ([{ don't cause alignment to
                    # their preceding token, so the line offset up to the
                    # column of those tokens can be different. We only
                    # want to continue with alignment if the tokens are
                    # all at the same line offset.
                    layout_token_val = token.val + f"::{len(row[col_index - 1].val)}"
                else:
                    layout_token_val = None
            else:
                layout_token_val = None

            layout += (RowLayoutToken(token.typ, layout_token_val),)

            if token.val in ALIGN_BEFORE_TOKENS:
                assert token.typ == TokenType.SEPARATOR
                maybe_indent_token = row[0]
                if maybe_indent_token.typ == TokenType.INDENT:
                    indent = maybe_indent_token.val
                else:
                    indent = ""

                prev_token = row[col_index - 1]
                if prev_token.typ == TokenType.SEPARATOR:
                    continue

                offset_width = len(prev_token.val)
                ctx_key = AlignmentContextKey(col_index, token.typ, token.val, layout)
                ctx[ctx_key] = offset_width
        yield ctx
def find_cell_groups(alignment_contexts: List[AlignmentContext]) -> CellGroups:
    cell_groups: Dict[AlignmentCellKey, List[AlignmentCell]] = {}
    for row_index, ctx in enumerate(alignment_contexts):
        ctx_items = sorted(ctx.items())
        for ctx_key, offset_width in ctx_items:
            col_index, token_typ, token_val, layout = ctx_key
            prev_cell_key = AlignmentCellKey(row_index - 1, col_index, token_val, layout)
            curr_cell_key = AlignmentCellKey(row_index, col_index, token_val, layout)
            curr_cell = AlignmentCell(row_index, offset_width)
            if prev_cell_key in cell_groups:
                prev_cells = cell_groups[prev_cell_key]
                del cell_groups[prev_cell_key]
                cell_groups[curr_cell_key] = prev_cells + [curr_cell]
            else:
                cell_groups[curr_cell_key] = [curr_cell]
    return cell_groups
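# Because the lookup key always uses row_index - 1, a group only keeps growing
# while matching contexts appear on directly consecutive rows; any gap starts
# a fresh group.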
def realigned_contents(table: TokenTable, cell_groups: CellGroups) -> str:
    prev_col_index = -1
    for ctx_key, cells in sorted(cell_groups.items()):
        prev_col_index = ctx_key.col_index
        if len(cells) < 3:
            continue

        max_offset_width = max(ow for _, ow in cells)
        for row_index, offset_width in cells:
            extra_offset = max_offset_width - offset_width
            if extra_offset == 0:
                continue

            row = table[row_index]
            left_token = row[ctx_key.col_index - 1]
            is_last_sep_token = all(
                token.typ in (TokenType.NEWLINE, TokenType.COMMENT, TokenType.WHITESPACE)
                for token in row[ctx_key.col_index + 1:]
            )
            maybe_number = left_token.val.strip().replace("_", "")
            if maybe_number.isdigit():
                padded_left_token_val = " " * extra_offset + left_token.val
            elif is_last_sep_token:
                # don't align if this is the last token of the row
                continue
            else:
                padded_left_token_val = left_token.val + " " * extra_offset
            padded_token = Token(TokenType.CODE, padded_left_token_val)
            row[ctx_key.col_index - 1] = padded_token
    return "".join("".join(token.val for token in row) for row in table)
def align_formatted_str(src_contents: str) -> FileContent:
    debug = 0
    table: TokenTable = [[]]
    for token in tokenize_for_alignment(src_contents):
        if debug:
            print("TOKEN: ", repr(token.val).ljust(50), token)
        table[-1].append(token)
        if token.typ == TokenType.NEWLINE:
            table.append([])
        else:
            is_block_token = token.typ in (TokenType.BLOCK, TokenType.COMMENT, TokenType.WHITESPACE)
            assert is_block_token or "\n" not in token.val

    if debug:
        for row in table:
            print("ROW: ", end="")
            for tok_cell in row:
                print(tok_cell, end="\n ")
            print()

    alignment_contexts = list(find_alignment_contexts(table))
    cell_groups = find_cell_groups(alignment_contexts)
    if debug:
        for cell_key, cells in cell_groups.items():
            if len(cells) > 1:
                print("CELL", len(cells), cell_key)
                for all_cell in cells:
                    print("\t\t", all_cell)
    return realigned_contents(table, cell_groups)
print(align_formatted_str(str_contents))
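# Intended workflow (a sketch; the black call below is an assumption about
# black's API, not part of this gist): format the source with black first,
# then run the result through align_formatted_str() as a post-processing step.
#
#     import black  # hypothetical usage; check the API of your black version
#     formatted = black.format_str(source, mode=black.FileMode())
#     print(align_formatted_str(formatted))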