black_alignment_postproc.py
import re
from enum import Enum
from typing import Dict, Iterator, List, NamedTuple, Optional, Tuple

FileContent = str
str_contents = """
#!/usr/bin/env python3
# fmt: on
# Some license here.
#
# Has many lines. Many, many lines.
# Many, many, many lines.
\"\"\"Module docstring.
Possibly also many, many lines.
\"\"\"
import os.path
import sys
import a
from b.c import X # some noqa comment
def test_parse():
    \"\"\"Docstring comes first.
    Possibly many lines.
    \"\"\"
    # FIXME: Some comment about why this function is crap but still in production.
    environ = {
        "MYAPP_DB_HOST": "1.2.3.4",
        "MYAPP_DB_PORT": "1234",
        'MYAPP_DB_PASSWORD': "secret",
        'MYAPP_DB_READ_ONLY': "0",
        'MYAPP_DB_DDL': "~/mkdb.sql",
        'MYAPP_DL': 123_123,
        'MYAPP_DL': 123_123_929,
        'MYAPP_DBDL': 12,
    }
    barf = {
        22: 23_222,
        2234: 231_231_231_232,
        1234: 231_231_232,
    }
    dbenv = myenv.parse(DBEnv, environ, prefix="MYAPP_DB_")
    assert dbenv.host == "1.2.3.4"
    assert dbenv.user == "new_user"
    assert dbenv.password == "secret"
    assert dbenv.read_only is False
    assert isinstance(dbenv.ddl, pl.Path)
    assert str(dbenv.ddl).endswith("mkdb.sql")
    assert len(attrnames) == 7
GLOBAL_STATE = {"a": a(1), "b": a(2), "c": a(3)}
'What\\'s the deal "here"?'
'And "here"?'
StrList = List[str]
PathList = List[Path]
class TestEnv(myenv.BaseEnv):
    '''
    notakey: notaval
    notakeyeither: notaval
    '''
    str_val_wo_default: str
    float_val_wo_default: float
    path_val_wo_default: pl.Path
    paths_val_wo_default: List[pl.Path]
    str_val: str = "foo"
    strs_val: StrList = ["foo = bar", "barfoo = baz"]
    float_val: float = 12.34
    path_val: pl.Path = pl.Path("file.txt")
    paths_val: PathList = [
        [1, 2, 3, 4],
        [5, 6, 7, 8],
        [1, 22, 2243, 4],
        {23: "asdf", 22: "aaaa", 443: "bbbb", 439: "cccc"},
        {1123: "asdf", 22: "k3k3", 443: "jfjf", 439: "k2k2"},
        {1: "asdf", 2332: "asdfk3k3", 42243: "jssdfjf", 4: "k2k2eie"},
    ]
    \"\"\"Docstring for instance attribute spam.\"\"\"
"""
# To run against a file instead of the embedded sample, uncomment one of these:
# str_contents = open("blacker.py").read()
# str_contents = open("test.txt").read()
# fmt: off
ALIGN_BEFORE_TOKENS = {
    "<<=", ">>=", "**=", "//=",
    "+=", "-=", "*=", "/=", "%=", "|=", "&=", "@=",
    "==", "!=", "<=", ">=",
    "//", "<<", ">>", "^=", "~=",
    "in", "is", "},", "],", "),",
    "->",
    ",", ":", "=",
    "+", "-", "*", "/",
    "%", "|", "&", "^", "~",
    "!", "<", ">",
    "}", "]", ")",
}
# fmt: on
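# Alignment is applied to the token *preceding* one of the separators in
# ALIGN_BEFORE_TOKENS, so that matching separators on consecutive lines
# end up in the same column.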
NO_ALIGN_BLOCK_END_MATCHERS = {
    "'''": re.compile(r"(?<![^\\]\\)'''"),
    '"""': re.compile(r"(?<![^\\]\\)\"\"\""),
    '"' : re.compile(r"(?<![^\\]\\)\""),
    "'" : re.compile(r"(?<![^\\]\\)'"),
    "#": re.compile(r"$", flags=re.MULTILINE),
}
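# In NO_ALIGN_BLOCK_END_MATCHERS, the (?<![^\\]\\) lookbehind skips closing
# quotes that are escaped by a single backslash, so e.g. a \" inside a string
# does not end the block early.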
ALIGN_TOKEN_SEP_RE = re.compile(
    r"""
    (
        '''
        |\"\"\"
        |\"
        |'
        |\#
        |<<= |>>= |\*\*= |//=
        |\+= |\-= |\*= |/= |%= |\|= |&= |@=
        |== |!= |<= |>=
        |// |<< |>> |\^= |~=
        |(?<!\w)in(?!\w) |(?<!\w)is(?!\w) |\}, |\], |\),
        |\->
        |, |: |=
        |\+ |\- |\* |/
        |% |\| |& |\^ |~
        |! |< |>
        |\{ |\[ |\(
        |\} |\] |\)
        |$
    )
    """,
    flags=re.MULTILINE | re.VERBOSE,
)
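# In ALIGN_TOKEN_SEP_RE the multi-character operators come first in the
# alternation, so e.g. "<<=" is matched as a single separator rather than
# as "<" followed by "<=".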
SYMBOL_STRING_RE = re.compile(r"\"\w+(\w\d)?\"")
class TokenType(Enum):
    INDENT = 0
    SEPARATOR = 1
    CODE = 2
    NEWLINE = 3
    BLOCK = 4
    COMMENT = 5
    WHITESPACE = 6
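# Rough meaning of the token kinds (as used below):
#   INDENT     - leading whitespace of a line
#   SEPARATOR  - an operator/punctuation token matched by ALIGN_TOKEN_SEP_RE
#   CODE       - any other source text between separators
#   NEWLINE    - a single "\n"
#   BLOCK      - a complete string/docstring literal
#   COMMENT    - a "#" comment up to the end of the line
#   WHITESPACE - a whitespace-only run between separators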
TokenVal = str
class Token(NamedTuple):
    typ: TokenType
    val: TokenVal
def tokenize_for_alignment(src_contents: str) -> Iterator[Token]:
    rest = src_contents
    prev_rest = None
    while rest:
        assert rest != prev_rest, "No progress at: " + repr(rest[:40])
        prev_rest = rest

        curr_token_sep = ALIGN_TOKEN_SEP_RE.search(rest)
        assert curr_token_sep is not None
        curr_token_start, curr_token_end = curr_token_sep.span()

        # newline match has zero width
        is_newline = curr_token_start == curr_token_end
        if is_newline:
            # adjust for zero width match
            curr_token_end = curr_token_start + 1

            # Get everything (if anything) up to (and excluding) the newline
            token_val = rest[:curr_token_start]
            if token_val:
                assert token_val != "\n"
                yield Token(TokenType.CODE, token_val)

            # The newline itself (note that black promises to
            # have normalized CRLF etc. to plain LF)
            token_val = rest[curr_token_start:curr_token_end]
            assert token_val == "\n"
            yield Token(TokenType.NEWLINE, token_val)
            rest = rest[curr_token_end:]

            # parse any indent
            new_rest = rest.lstrip(" \t")
            indent_len = len(rest) - len(new_rest)
            if indent_len > 0:
                indent_token_val = rest[:indent_len]
                yield Token(TokenType.INDENT, indent_token_val)
                rest = new_rest
        elif curr_token_start > 0:
            prev_token_val = rest[:curr_token_start]
            rest = rest[curr_token_start:]
            assert prev_token_val != "\n"
            assert prev_token_val not in ALIGN_BEFORE_TOKENS, repr(prev_token_val)
            if len(prev_token_val.strip()) == 0:
                yield Token(TokenType.WHITESPACE, prev_token_val)
            else:
                yield Token(TokenType.CODE, prev_token_val)
        else:
            token_val = curr_token_sep.group(0)
            if token_val in NO_ALIGN_BLOCK_END_MATCHERS:
                # comment, string or docstring
                block_begin_val = token_val
                assert curr_token_end > 0
                rest = rest[len(block_begin_val) :]

                end_matcher = NO_ALIGN_BLOCK_END_MATCHERS[token_val]
                block_end_match = end_matcher.search(rest)
                assert block_end_match, rest[:40]
                block_end_token = block_end_match.group(0)
                block_end_index = block_end_match.span()[-1]
                assert block_end_index <= len(rest), f"{len(rest)} < {block_end_index}"

                block_rest = rest[:block_end_index]
                block_token_val = block_begin_val + block_rest
                assert block_token_val.endswith(block_end_token)
                if block_token_val.strip().startswith("#"):
                    yield Token(TokenType.COMMENT, block_token_val)
                else:
                    yield Token(TokenType.BLOCK, block_token_val)
                rest = rest[block_end_index:]
            else:
                sep_token_val = token_val
                yield Token(TokenType.SEPARATOR, sep_token_val)
                rest = rest[curr_token_end:]
        # NOTE (mb 2018-09-09): The way we tokenize, we always consume
        # all content belonging to strings and comments. This means that
        # the rest (after consuming the content of a string or comment)
        # should still be valid Python, so we can do some basic sanity
        # checks. For example, no valid Python token begins with a
        # question mark (this check exists because one of the test cases
        # conveniently has a question mark as the first character after
        # an edge case of string parsing).
        assert not rest.startswith("?"), repr(rest)
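# A rough sketch of the token stream produced above (illustrative): for the
# input "x = 1  # note\n" the generator yields
#     Token(CODE, "x "), Token(SEPARATOR, "="), Token(CODE, " 1  "),
#     Token(COMMENT, "# note"), Token(NEWLINE, "\n")
# i.e. surrounding whitespace is folded into CODE tokens, while comments and
# string literals are consumed whole as COMMENT/BLOCK tokens.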
Indent = str
RowIndex = int
ColIndex = int
OffsetWidth = int
TokenTable = List[List[Token]]
class RowLayoutToken(NamedTuple):
    """Disambiguate between lines with different layout/structure.

    We only want to align lines which have the same structure of
    indent and separators. Any difference in the number of elements
    or type of separators causes alignment to be disabled.
    """

    typ: TokenType
    # val is only set if it should cause a different prefix,
    # e.g. if a separator is a comma vs a period.
    val: Optional[TokenVal]
# Tokens whose values are relevant to the layout of a cell group.
LAYOUT_VAL_TOKENS = set([TokenType.SEPARATOR, TokenType.INDENT])
RowLayoutTokens = Tuple[RowLayoutToken, ...]
class AlignmentContextKey(NamedTuple):
    """Does not change between multiple lines that can be aligned."""

    col_idx: ColIndex
    tok_typ: TokenType
    tok_val: TokenVal
    layout : RowLayoutTokens


AlignmentContext = Dict[AlignmentContextKey, OffsetWidth]


class AlignmentCellKey(NamedTuple):
    last_row_index: RowIndex
    col_index     : ColIndex
    token_val     : TokenVal
    layout        : RowLayoutTokens


class AlignmentCell(NamedTuple):
    row_idx: RowIndex
    offset_width: OffsetWidth
CellGroups = Dict[AlignmentCellKey, List[AlignmentCell]]
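# Overall data flow: the token table (one row per source line) is turned into
# one AlignmentContext per row (find_alignment_contexts), matching contexts on
# consecutive rows are merged into cell groups (find_cell_groups), and
# realigned_contents then pads each group to the width of its widest member.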
def normalize_symbol_strings(row: List[Token]) -> None:
    """Convert double quotes to single quotes for internal/symbol/atom strings.

    Internal/symbol/atom strings are strings which are code as opposed
    to data. They have no meaning outside of the context of the
    program. Symbol strings must be valid Python identifiers.

    They are:
    - dictionary keys
    - attribute names
    - implicit enums

    They are not:
    - urls
    - user readable text
    - translation strings
    - format strings

    This function performs the conversion only for a subset of cases,
    since it cannot detect all of them. The covered cases are strings
    used as dictionary keys and strings used for attribute access via
    getattr, setattr and delattr.
    """
    for col_index, tok_cell in enumerate(row):
        is_dict_key_symbol = (
            tok_cell.typ == TokenType.SEPARATOR
            and tok_cell.val in (":", "]")
            and col_index > 0
            and row[col_index - 1].typ == TokenType.BLOCK
            and SYMBOL_STRING_RE.match(row[col_index - 1].val)
        )
        if is_dict_key_symbol:
            normalized_token_val = row[col_index - 1].val.replace('"', "'")
            row[col_index - 1] = Token(TokenType.BLOCK, normalized_token_val)

        is_attrname_symbol = (
            tok_cell.typ == TokenType.CODE
            and tok_cell.val in ('getattr', 'setattr', 'delattr')
            and col_index + 5 < len(row)
            and row[col_index + 1].typ == TokenType.SEPARATOR
            and row[col_index + 1].val == "("
            and row[col_index + 2].typ == TokenType.CODE
            and row[col_index + 3].typ == TokenType.SEPARATOR
            and row[col_index + 3].val == ","
            and row[col_index + 4].typ == TokenType.WHITESPACE
            and row[col_index + 4].val == " "
            and row[col_index + 5].typ == TokenType.BLOCK
            and SYMBOL_STRING_RE.match(row[col_index + 5].val)
        )
        if is_attrname_symbol:
            normalized_token_val = row[col_index + 5].val.replace('"', "'")
            row[col_index + 5] = Token(TokenType.BLOCK, normalized_token_val)
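# Illustrative effect (assuming the row was produced by tokenize_for_alignment):
#     {"host": "1.2.3.4"}           ->  {'host': "1.2.3.4"}
#     getattr(obj, "attr", None)    ->  getattr(obj, 'attr', None)
# Only the key/attribute-name string is rewritten; value strings are left alone.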
def find_alignment_contexts(table: TokenTable) -> Iterator[AlignmentContext]:
    is_alignment_enabled = True
    for row in table:
        ctx: AlignmentContext = {}
        layout: RowLayoutTokens = tuple()
        if is_alignment_enabled:
            normalize_symbol_strings(row)
        for col_index, token in enumerate(row):
            if token.typ == TokenType.COMMENT and "fmt: off" in token.val:
                is_alignment_enabled = False
            if token.typ == TokenType.COMMENT and "fmt: on" in token.val:
                is_alignment_enabled = True
            if not is_alignment_enabled:
                continue

            layout_token_val: Optional[TokenVal]
            if token.typ in LAYOUT_VAL_TOKENS:
                if token.val in ALIGN_BEFORE_TOKENS:
                    layout_token_val = token.val
                elif col_index > 0:
                    # Layout tokens such as ([{ don't cause alignment to
                    # their preceding token, so the line offset up to the
                    # column of those tokens can be different. We only
                    # want to continue with alignment if the tokens are
                    # all at the same line offset.
                    layout_token_val = token.val + f"::{len(row[col_index - 1].val)}"
                else:
                    layout_token_val = None
            else:
                layout_token_val = None

            layout += (RowLayoutToken(token.typ, layout_token_val),)

            if token.val in ALIGN_BEFORE_TOKENS:
                assert token.typ == TokenType.SEPARATOR
                maybe_indent_token = row[0]
                if maybe_indent_token.typ == TokenType.INDENT:
                    indent = maybe_indent_token.val
                else:
                    indent = ""

                prev_token = row[col_index - 1]
                if prev_token.typ == TokenType.SEPARATOR:
                    continue

                offset_width = len(prev_token.val)
                ctx_key = AlignmentContextKey(col_index, token.typ, token.val, layout)
                ctx[ctx_key] = offset_width
        yield ctx
def find_cell_groups(alignment_contexts: List[AlignmentContext]) -> CellGroups:
    cell_groups: Dict[AlignmentCellKey, List[AlignmentCell]] = {}
    for row_index, ctx in enumerate(alignment_contexts):
        ctx_items = sorted(ctx.items())
        for ctx_key, offset_width in ctx_items:
            col_index, token_typ, token_val, layout = ctx_key
            prev_cell_key = AlignmentCellKey(row_index - 1, col_index, token_val, layout)
            curr_cell_key = AlignmentCellKey(row_index, col_index, token_val, layout)
            curr_cell = AlignmentCell(row_index, offset_width)
            if prev_cell_key in cell_groups:
                prev_cells = cell_groups[prev_cell_key]
                del cell_groups[prev_cell_key]
                cell_groups[curr_cell_key] = prev_cells + [curr_cell]
            else:
                cell_groups[curr_cell_key] = [curr_cell]
    return cell_groups
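# Because the lookup key always uses row_index - 1, a group only keeps growing
# while matching contexts appear on directly consecutive rows; any gap starts
# a fresh group.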
def realigned_contents(table: TokenTable, cell_groups: CellGroups) -> str:
    prev_col_index = -1
    for ctx_key, cells in sorted(cell_groups.items()):
        prev_col_index = ctx_key.col_index
        if len(cells) < 3:
            continue

        max_offset_width = max(ow for _, ow in cells)
        for row_index, offset_width in cells:
            extra_offset = max_offset_width - offset_width
            if extra_offset == 0:
                continue

            row = table[row_index]
            left_token = row[ctx_key.col_index - 1]
            is_last_sep_token = all(
                token.typ in (TokenType.NEWLINE, TokenType.COMMENT, TokenType.WHITESPACE)
                for token in row[ctx_key.col_index + 1:]
            )
            maybe_number = left_token.val.strip().replace("_", "")
            if maybe_number.isdigit():
                padded_left_token_val = " " * extra_offset + left_token.val
            elif is_last_sep_token:
                # don't align if this is the last token of the row
                continue
            else:
                padded_left_token_val = left_token.val + " " * extra_offset
            padded_token = Token(TokenType.CODE, padded_left_token_val)
            row[ctx_key.col_index - 1] = padded_token
    return "".join("".join(token.val for token in row) for row in table)
def align_formatted_str(src_contents: str) -> FileContent:
    debug = 0
    table: TokenTable = [[]]
    for token in tokenize_for_alignment(src_contents):
        if debug:
            print("TOKEN: ", repr(token.val).ljust(50), token)
        table[-1].append(token)
        if token.typ == TokenType.NEWLINE:
            table.append([])
        else:
            is_block_token = token.typ in (TokenType.BLOCK, TokenType.COMMENT, TokenType.WHITESPACE)
            assert is_block_token or "\n" not in token.val

    if debug:
        for row in table:
            print("ROW: ", end="")
            for tok_cell in row:
                print(tok_cell, end="\n ")
            print()

    alignment_contexts = list(find_alignment_contexts(table))
    cell_groups = find_cell_groups(alignment_contexts)
    if debug:
        for cell_key, cells in cell_groups.items():
            if len(cells) > 1:
                print("CELL", len(cells), cell_key)
                for all_cell in cells:
                    print("\t\t", all_cell)
    return realigned_contents(table, cell_groups)
print(align_formatted_str(str_contents))
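# Intended workflow (a sketch; the black call below is an assumption about
# black's API, not part of this gist): format the source with black first,
# then run the result through align_formatted_str() as a post-processing step.
#
#     import black  # hypothetical usage; check the API of your black version
#     formatted = black.format_str(source, mode=black.FileMode())
#     print(align_formatted_str(formatted))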