An experimental parser for bash syntax in Python
"""
An experimental tokenizer for bash syntax.
This is an alternative to shlex from the standard library for tokenizing
POSIX-style command line statements.
On top of the word-splitting and quote-removal functionality of shlex, this
library also makes it possible to tell which glob characters are escaped, and
preserves the semantics of variable & arithmetic expansions.
In scope:
- quoting & escaping
- parameter, arithmetic, command expansions
- glob characters
- redirects
- control operations
- comments
- tilde expansions
Out of scope:
- history expansion
- shell keywords (case, if, export, etc)
- multiline constructs/multi-token grammar: control flow, functions, multiline groups
- recursively tokenizing the content of command or arithmetic expansions
- advanced parameter expansion, like ${param:-}
- arrays
- groups
- advanced glob patterns (classes and pattern lists)
"""
from collections import namedtuple, deque
from enum import Enum
from io import StringIO
from typing import IO, List, Optional, Tuple
import re
# characters that can open a glob pattern (tracked when escaped or quoted)
GLOB_INIT_CHARS = "*?["
PREFIX_CHARS = "\\$"
METACHARACTERS = "|;&<>(){}"
WHITESPACE = " \t\r\n"
NON_RANGE_CHARS = WHITESPACE + "!`'\";&|()<>"
VAR_INIT_CHARS = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz_"
DIGITS = "0123456789"
VAR_CHARS = VAR_INIT_CHARS + DIGITS
SPECIAL_VARS = DIGITS + "!@#$*-?"
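# e.g. "$?" (last exit status) and "$#" (argument count) are single-character
# special parameters, so "$?x" tokenizes as the expansion "$?" followed by a
# literal "x".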
def tokenize(cmd_line: str) -> "List[Token]":
return list(CommandLineTokenizer(StringIO(cmd_line)))
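# A minimal usage sketch (illustrative; see the tests below for exact shapes):
#
#   >>> [t.content for t in tokenize("ls *.py && echo ~/done")]
#   ['ls', '*.py', '&&', 'echo', '~/done']
#
# Here the "*.py" token has includes_glob=True, "&&" is a CONTROL_OPERATOR,
# and "~/done" carries a TILDE TokenFeature in its expansions.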
class TokenType(Enum):
WORD = 0
CONTROL_OPERATOR = 1
REDIRECT_OPERATOR = 2
COMMENT = 3
class TokenFeatureType(str, Enum):
VARIABLE = "var" # TODO: rename PARAMETER
COMMAND = "cmd"
ARITHMETIC = "arth"
TILDE = "tld"
GROUP = "grp"
class TokenFeature(
namedtuple(
"TokenFeature",
("type", "content", "start", "end", "quoted"),
)
):
type: TokenFeatureType
content: str
start: int
end: int
quoted: bool
class Token(
namedtuple(
"Token",
(
"content",
"includes_glob",
"escaped_glob_indices",
"expansions",
"type",
),
)
):
content: str
includes_glob: bool
escaped_glob_indices: Tuple[int, ...]
expansions: Tuple[TokenFeature, ...]
type: TokenType
@staticmethod
def operator(content: str):
return Token(content, False, tuple(), tuple(), TokenType.CONTROL_OPERATOR)
_expansion_types = {
"(": (")", TokenFeatureType.GROUP),
"{": ("}", TokenFeatureType.GROUP),
"${": ("}", TokenFeatureType.VARIABLE),
"$((": ("))", TokenFeatureType.ARITHMETIC),
"$(": (")", TokenFeatureType.COMMAND),
"`": ("`", TokenFeatureType.COMMAND),
"$": ("", TokenFeatureType.VARIABLE),
"~": ("", TokenFeatureType.TILDE),
}
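# For example, "$((1+1))" becomes a single WORD token whose expansions tuple
# holds TokenFeature(type=ARITHMETIC, content="1+1", start=0, end=8, quoted=False);
# see the tests below for more worked examples.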
def includes_glob(input_stream: IO[str]) -> bool: # TODO: unit test me
"""
Check whether the input stream includes a valid glob sequence, such as:
- *
- ?
- a valid simple range expression
Escaping of glob characters is not considered.
"""
range_sequence_len = 0
escape_active = False
while next_char := input_stream.read(1):
if next_char in "*?":
return True
if range_sequence_len:
if not escape_active:
if next_char == "]":
if range_sequence_len == 1:
range_sequence_len += 1
else:
return True
if next_char == '"':
escape_active = True
if next_char in NON_RANGE_CHARS:
range_sequence_len = 0
continue
range_sequence_len += 1
if next_char == "[":
range_sequence_len = 1
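# A hedged mini-check of the above (this function is still flagged with a
# "unit test me" TODO):
#
#   >>> includes_glob(StringIO("foo*"))    # bare "*" always counts
#   True
#   >>> includes_glob(StringIO("a[bc]d"))  # simple range expression
#   True
#   >>> includes_glob(StringIO("plain"))   # no glob characters at all
#   False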
class RedirectParseState:
"""
Here-document and here-string redirects are not supported.
"""
left_fd: str = ""
operator: str = ""
right_fd: str = ""
def __init__(self, left_fd: str = "", operator: str = ""):
self.left_fd = left_fd
self.operator = operator
self.right_fd = ""
@property
def valid(self):
if self.left_fd and not self.left_fd.isdigit():
return False
if not re.match(r"^\d*\-?$", self.right_fd):
return False
if self.left_fd and not (self.operator or self.right_fd):
return True
if self.operator in ("&>", "&>>") and not (self.left_fd or self.right_fd):
return True
if self.operator in (">", ">>", ">|", "<", "<>") and not self.right_fd:
return True
if self.operator in (">&", "<&"):
return True
return False
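# e.g. "2", "2>", ">>", "1>&2", and ">&-" all count as valid states, while a
# non-digit left fd such as "2x" does not.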
def read(self) -> str:
return self.left_fd + self.operator + self.right_fd
def append(self, next_char: str) -> str:
"""
This method assumes self will always be initialized with left_fd OR with
operator as one of: "<", ">", or "&>".
If next_char is successfully appended to this token then return an empty
string. Otherwise return the most recent chars that cannot be appended to
this token; usually this is just next_char, but it may also include some
preceding digits.
"""
rejected = ""
if self.operator in ("<&", ">&"):
if not self.right_fd.endswith("-") and (
next_char.isdigit() or next_char == "-"
):
self.right_fd += next_char
elif next_char in " \t\n|;&<>":
rejected = next_char
else:
# what we thought was closing digits is actually the next word
rejected = self.right_fd + next_char
self.right_fd = ""
elif self.operator == ">":
if next_char in ">&|":
self.operator += next_char
else:
rejected = next_char
elif self.operator == "<":
if next_char in ">&":
self.operator += next_char
else:
rejected = next_char
elif self.operator == "&>":
if next_char == ">":
self.operator += next_char
elif next_char in "|;&<>()":
raise CommandLineTokenizeError(
f"Syntax error near redirect operator {self.read()!r}"
)
else:
rejected = next_char
elif not self.operator:
if next_char.isdigit():
self.left_fd += next_char
elif next_char in "<>":
self.operator += next_char
else:
rejected = next_char
else:
rejected = next_char
return rejected
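# A small illustrative check of the state machine above; _demo_redirect_parse_state
# is a hypothetical helper, not part of the original gist.
def _demo_redirect_parse_state() -> None:
    # feed the redirect "2>&1" one character at a time
    state = RedirectParseState(left_fd="2")
    for ch in ">&1":
        assert state.append(ch) == ""  # append() returns "" while chars are accepted
    assert state.read() == "2>&1"
    assert state.valid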
class TokenParseState:
quote_active: str
range_active: bool
active_prefix: str
scope_terminator: str
redirect: Optional[RedirectParseState]
paren_depth: int
includes_glob: bool
_expansion_scope_content: List[str]
_parts: List[str]
_escaped_glob_indices: List[int]
_expansions: List[TokenFeature]
_expansion_scope_start_index: Optional[int] = None
expansion_type: Optional[str] = None
def __init__(self):
self.quote_active = ""
self.range_active = False
self.active_prefix = ""
self.scope_terminator = ""
self.redirect = None
self.paren_depth = 0
self.includes_glob = False
self._expansion_scope_content = []
self._parts = []
self._escaped_glob_indices = []
self._expansions = []
def read(self):
if self.redirect:
return self.redirect.read()
# TODO: keep count of length in append instead of calling read just to measure?
if self.is_empty:
return ""
if len(self._parts) > 1:
self._consolidate()
return self._parts[0]
def get_token(self) -> Optional[Token]:
if self.is_empty:
return None
print("T:", self.read())
return Token(
content=self.read(),
includes_glob=self.includes_glob,
escaped_glob_indices=tuple(self._escaped_glob_indices),
expansions=tuple(self._expansions),
type=self._get_token_type(),
)
def _get_token_type(self) -> TokenType:
if self.redirect:
return TokenType.REDIRECT_OPERATOR
if self.is_comment:
return TokenType.COMMENT
return TokenType.WORD
def _consolidate(self):
self._parts = ["".join(self._parts)]
@property
def is_empty(self) -> bool:
return (
not self._parts
and not self.redirect
and not self.active_prefix
and not self.quote_active
and not self.scope_terminator
)
@property
def escape_active(self) -> bool:
return self.active_prefix == "\\"
@property
def is_comment(self) -> bool:
return self.active_prefix == "#"
@property
def in_single_quotes(self) -> bool:
return self.quote_active == "'"
@property
def in_double_quotes(self) -> bool:
return self.quote_active == '"'
@property
def in_expansion(self) -> bool:
return self.expansion_type is not None
def init_redirect(self, digits: str = "", operator: str = ""):
self.redirect = RedirectParseState(digits, operator)
def cancel_redirect(self):
"""
Called when we parsed some digits and then realized it's not a redirect.
"""
self.append(self.redirect.read())
self.redirect = None
def start_expansion(self, opener: str, no_container: bool = False):
assert not self.in_expansion
self.append(opener, force_in_expansion=no_container)
expansion_type = _expansion_types.get(opener, None)
if expansion_type is None:
raise TokenizerInternalError(f"Unknown scope opener {opener!r}")
self.scope_terminator, self.expansion_type = expansion_type
self._expansion_scope_start_index = len(self.read()) - len(opener)
def end_expansion(self):
assert self.in_expansion
expansion_content = "".join(self._expansion_scope_content)
self.append(self.scope_terminator)
self._expansions.append(
TokenFeature(
type=self.expansion_type,
content=expansion_content,
start=self._expansion_scope_start_index,
end=len(self.read()),
quoted=self.in_double_quotes,
)
)
self._expansion_scope_start_index = None
self._expansion_scope_content = []
self.expansion_type = None
self.scope_terminator = ""
def cancel_expansion(self):
assert self.in_expansion
self._expansion_scope_start_index = None
self._expansion_scope_content = []
self.expansion_type = None
self.scope_terminator = ""
def close_quotes(self):
self.quote_active = ""
# This just ensures we don't consider this token to be empty anymore
self.append("")
def append(
self, char: str, escaped_glob: bool = False, force_in_expansion: bool = False
):
if self.in_expansion or force_in_expansion:
self._expansion_scope_content.append(char)
self._parts.append(char)
if escaped_glob:
self._escaped_glob_indices.append(len(self.read()) - 1)
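# A hedged sketch of how TokenParseState tracks an expansion; _demo_token_parse_state
# is a hypothetical helper (it peeks at the private _expansions list), not part
# of the original gist.
def _demo_token_parse_state() -> None:
    state = TokenParseState()
    state.append("A")
    state.start_expansion("${")
    for ch in "VAR":
        state.append(ch)
    state.end_expansion()
    assert state.read() == "A${VAR}"
    feature = state._expansions[0]
    # start points at the "$", end is the index just past the closing "}"
    assert (feature.type, feature.content, feature.start, feature.end) == (
        TokenFeatureType.VARIABLE,
        "VAR",
        1,
        7,
    )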
class CommandLineTokenizer:
pushback: List[str]
lookback: List[str]
def __init__(self, input_stream: IO[str]):
self.input_stream = input_stream
self.pushback = []
self.lookback = []
def parse_token(self):
state = TokenParseState()
next_char = None
while True:
if self.pushback:
next_char = self.pushback.pop()
else:
next_char = self.input_stream.read(1)
if not next_char:
# end of input
if state.escape_active:
raise CommandLineTokenizeError("No escaped character")
if state.in_single_quotes or state.in_double_quotes:
raise CommandLineTokenizeError("No closing quotation")
if state.scope_terminator:
raise CommandLineTokenizeError(
f"No closing bracket in token {state.read()!r}"
)
if state.in_expansion:
for char in self.lookback:
state.append(char)
self.lookback.clear()
state.end_expansion()
if state.redirect:
if state.redirect.valid:
if not state.redirect.operator:
state.cancel_redirect()
else:
raise CommandLineTokenizeError(
f"Unexpected end of input in redirect: {state.read()!r}"
)
if state.range_active:
state.append("[")
if self.lookback:
self._cancel_lookback()
if state.active_prefix:
# TODO: this should vary depending on the prefix
if not state.is_comment:
# (for comments the "#" was already appended when the comment started)
state.append(state.active_prefix)
state.active_prefix = ""
break
if state.is_comment:
if next_char == "\n":
break
state.append(next_char)
elif state.range_active:
if state.escape_active:
self.lookback.append(next_char)
state.escape_active = False
elif next_char == "]" and self.lookback:
# range expression complete
state.includes_glob = True
state.append("[")
for char in self.lookback:
state.append(char)
self.lookback.clear()
state.append(next_char)
state.range_active = False
elif next_char in NON_RANGE_CHARS:
# range expression invalidated, pretend it never happened
state.append("[", escaped_glob=True)
self._cancel_lookback(next_char)
state.range_active = False
else:
self.lookback.append(next_char)
elif state.escape_active:
if state.in_double_quotes and next_char not in '"$':
state.append("\\")
elif next_char in GLOB_INIT_CHARS:
state.append(next_char, escaped_glob=True)
else:
state.append(next_char)
state.active_prefix = ""
elif state.in_single_quotes:
if next_char == "'":
state.close_quotes()
elif next_char in GLOB_INIT_CHARS:
state.append(next_char, escaped_glob=True)
else:
state.append(next_char)
elif state.in_expansion:
if state.scope_terminator == "}":
if next_char == "}":
state.end_expansion()
else:
state.append(next_char)
elif state.scope_terminator == ")":
# TODO: to do this properly we need to parse the sub command to tell
# whether the closing bracket is quoted or escaped
if next_char == ")":
state.end_expansion()
else:
state.append(next_char)
elif state.scope_terminator == "`":
# N.B. we don't respect escaping of backticks inside backticks
if next_char == "`":
state.end_expansion()
else:
state.append(next_char)
elif state.scope_terminator == "))":
if next_char == ")":
if state.paren_depth:
state.paren_depth -= 1
state.append(next_char)
elif self.lookback:
if self.lookback[0] == ")":
assert len(self.lookback) == 1
state.end_expansion()
self.lookback.clear()
else:
self._cancel_lookback(next_char)
else:
self.lookback.append(next_char)
continue
elif next_char == "(":
state.paren_depth += 1
state.append(next_char)
elif next_char == "\\":
state.active_prefix = next_char
else:
# TODO: verify if next_char is valid here?
state.append(next_char)
elif state.expansion_type == TokenFeatureType.VARIABLE:
# This must be a variable name
if next_char in VAR_CHARS:
state.append(next_char)
else:
# We've gone off the end of the variable name, so we need to
# restart the loop with the same char.
state.end_expansion()
self.pushback.append(next_char)
continue
elif state.expansion_type == TokenFeatureType.TILDE:
if next_char in "/ \n\t\n|;&<>(){}":
# Reached the end of valid looking tilde expansion
for char in self.lookback:
state.append(char)
self.lookback.clear()
state.end_expansion()
self.pushback.append(next_char)
elif self.lookback:
if self.lookback[0] in "+-":
if next_char.isdigit():
self.lookback.append(next_char)
else:
self._cancel_lookback(next_char)
state.cancel_expansion()
elif next_char in VAR_CHARS or next_char in "-.":
self.lookback.append(next_char)
elif next_char in "+-" or next_char in VAR_CHARS:
self.lookback.append(next_char)
else:
self._cancel_lookback(next_char)
state.cancel_expansion()
else:
raise TokenizerInternalError(
f"Unexpected expansion_type {state.expansion_type!r}"
)
elif state.active_prefix == "$":
if next_char == "(":
state.active_prefix = "$("
else:
state.active_prefix = ""
if next_char == "{":
state.start_expansion("${")
elif next_char in SPECIAL_VARS:
state.start_expansion("$")
state.append(next_char)
state.end_expansion()
elif next_char in VAR_INIT_CHARS:
state.start_expansion("$")
state.append(next_char)
else:
# this $ has no meaning
state.append("$")
self.pushback.append(next_char)
continue
elif state.active_prefix == "$(":
state.active_prefix = ""
if next_char == "(":
state.start_expansion("$((")
else:
state.start_expansion("$(")
self.pushback.append(next_char)
continue
elif state.redirect:
if rejected := state.redirect.append(next_char):
if not state.redirect.operator:
state.cancel_redirect()
self.pushback.append(next_char)
continue
elif state.redirect.valid:
for char in reversed(rejected):
self.pushback.append(char)
return state.get_token()
else:
raise CommandLineTokenizeError(
f"Syntax error near redirect operator {state.read()!r}"
)
elif state.active_prefix == "|":
state.active_prefix = ""
if next_char == "|":
return Token.operator("||")
elif next_char == "&":
return Token.operator("|&")
else:
self.pushback.append(next_char)
return Token.operator("|")
elif state.active_prefix == "&":
state.active_prefix = ""
if next_char == "&":
return Token.operator("&&")
elif next_char == ">":
state.init_redirect(operator="&>")
else:
self.pushback.append(next_char)
return Token.operator("&")
elif state.active_prefix == ";":
if next_char == ";":
state.active_prefix = ";;"
elif next_char == "&":
return Token.operator(";&")
else:
self.pushback.append(next_char)
return Token.operator(";")
elif state.active_prefix == ";;":
if next_char == "&":
return Token.operator(";;&")
else:
self.pushback.append(next_char)
return Token.operator(";;")
elif next_char == "`":
state.start_expansion(next_char)
elif state.in_double_quotes:
if next_char == '"':
state.close_quotes()
elif next_char in PREFIX_CHARS:
state.active_prefix = next_char
elif next_char in GLOB_INIT_CHARS:
state.append(next_char, escaped_glob=True)
else:
state.append(next_char)
elif next_char in "><":
if state.is_empty:
state.init_redirect(operator=next_char)
else:
self.pushback.append(next_char)
break
elif next_char in "&|":
if state.is_empty:
# need to check the next char to know what this means
state.active_prefix = next_char
else:
self.pushback.append(next_char)
break
elif next_char in WHITESPACE:
if state.is_empty:
# ignore leading whitespace
continue
else:
# token complete
break
elif next_char in "'\"":
# Open quotes and drop quote char
state.quote_active = next_char
elif next_char == "[":
# This could be a range expression, but we need to look ahead
# and parse the whole expression to be sure. This means validating
# that there is a closing square bracket with valid contents.
state.range_active = True
elif next_char in "*?":
state.includes_glob = True
state.append(next_char)
elif next_char in PREFIX_CHARS:
state.active_prefix = next_char
elif next_char == ";":
if not state.is_empty:
self.pushback.append(next_char)
break
state.active_prefix = ";"
elif next_char == "~":
if state.is_empty:
# a leading ~ may start a tilde expansion, depending on what follows
state.start_expansion("~", no_container=True)
else:
state.append(next_char)
elif next_char == "#":
if not state.is_empty:
self.pushback.append(next_char)
break
state.active_prefix = next_char
state.append(next_char)
elif next_char.isdigit() and state.is_empty:
state.init_redirect(digits=next_char)
else:
# TODO: validate that next_char is allowed unquoted like this.
# e.g. an unquoted "(" or ")" is usually a syntax error unless it opens
# a token at the start of a statement, which we could only detect by
# counting tokens.
state.append(next_char)
return state.get_token()
def _cancel_lookback(self, next_char: str = ""):
if next_char:
self.pushback.append(next_char)
for char in reversed(self.lookback):
self.pushback.append(char)
self.lookback.clear()
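# e.g. while scanning "~+x" the "+" is buffered in lookback as a possible "~+N"
# tilde expansion; when "x" invalidates it, both characters are replayed via
# pushback so the main loop re-reads them as plain word characters.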
def __iter__(self):
return self
def __next__(self):
if token := self.parse_token():
return token
raise StopIteration
class CommandLineTokenizeError(RuntimeError):
# TODO: keep track of where we are through the whole source for errors!!
@property
def message(self) -> str:
return self.args[0] if self.args else "Failed to tokenize command line"
class TokenizerInternalError(Exception):
pass
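# A hedged end-to-end demo (not part of the original gist):
if __name__ == "__main__":
    for tok in tokenize('cp "$SRC"/*.txt ~/backup && echo done'):
        print(tok.type.name, repr(tok.content), tok.expansions)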
# ---- tests: a separate file in the gist, importing the module above ----
import pytest
from shell_tokenizer import (
tokenize,
TokenType,
TokenFeature,
TokenFeatureType,
CommandLineTokenizeError,
)
VARIABLE = TokenFeatureType.VARIABLE
COMMAND = TokenFeatureType.COMMAND
ARITHMETIC = TokenFeatureType.ARITHMETIC
TILDE = TokenFeatureType.TILDE
def test_parse_empty_token():
tokens = tokenize("echo ''1 '' 2")
assert_tokens(
tokens,
TokenDesc("echo"),
TokenDesc("1"),
TokenDesc(""),
TokenDesc("2"),
)
def test_parse_comments():
tokens = tokenize(
"""
echo hi! # la de da
# |;&<>(){} # @!2
#
##
# OK ;) \t
"""
)
assert_tokens(
tokens,
TokenDesc("echo"),
TokenDesc("hi!"),
TokenDesc("# la de da", type=TokenType.COMMENT),
TokenDesc("# |;&<>(){} # @!2", type=TokenType.COMMENT),
TokenDesc("#", type=TokenType.COMMENT),
TokenDesc("##", type=TokenType.COMMENT),
TokenDesc("# OK ;) \t", type=TokenType.COMMENT),
)
def test_parse_glob_tokens():
# label unescaped ? *
# label unescaped [ with valid content before ]
tokens = tokenize(
"echo h?llo/**/*.py * thing[123].txt "
""" "h?llo/**/*.py" "*" "thing[123].txt" """
"'h?llo/**/*.py' '*' 'thing[123].txt' "
"h\\?llo/\\*\\*/\\*.py \\* thing\\[123].txt "
" h?ll'*' "
" foo[ld] o[]r lo][l 2 "
"[[] [[yep] [nope!] [still!*] "
"[ha&]"
)
assert_tokens(
tokens,
TokenDesc("echo"),
TokenDesc("h?llo/**/*.py", includes_glob=True),
TokenDesc("*", includes_glob=True),
TokenDesc("thing[123].txt", includes_glob=True),
TokenDesc(
"h?llo/**/*.py",
includes_glob=False,
escaped_glob_indices=(
1,
6,
7,
9,
),
),
TokenDesc("*", includes_glob=False, escaped_glob_indices=(0,)),
TokenDesc("thing[123].txt", includes_glob=False, escaped_glob_indices=(5,)),
TokenDesc(
"h?llo/**/*.py",
includes_glob=False,
escaped_glob_indices=(
1,
6,
7,
9,
),
),
TokenDesc("*", includes_glob=False, escaped_glob_indices=(0,)),
TokenDesc("thing[123].txt", includes_glob=False, escaped_glob_indices=(5,)),
TokenDesc(
"h?llo/**/*.py",
includes_glob=False,
escaped_glob_indices=(
1,
6,
7,
9,
),
),
TokenDesc("*", includes_glob=False, escaped_glob_indices=(0,)),
TokenDesc("thing[123].txt", includes_glob=False, escaped_glob_indices=(5,)),
TokenDesc("h?ll*", includes_glob=True, escaped_glob_indices=(4,)),
TokenDesc("foo[ld]", includes_glob=True),
TokenDesc("o[]r", includes_glob=False, escaped_glob_indices=(1,)),
TokenDesc("lo][l", includes_glob=False, escaped_glob_indices=(3,)),
TokenDesc("2", includes_glob=False),
TokenDesc("[[]", includes_glob=True),
TokenDesc("[[yep]", includes_glob=True),
TokenDesc("[nope!]", includes_glob=False, escaped_glob_indices=(0,)),
TokenDesc("[still!*]", includes_glob=True, escaped_glob_indices=(0,)),
TokenDesc("[ha", includes_glob=False, escaped_glob_indices=(0,)),
TokenDesc("&", type=TokenType.CONTROL_OPERATOR, includes_glob=False),
TokenDesc("]", includes_glob=False),
)
def test_parse_arithmetic_expansions():
tokens = tokenize(
"""
echo "A$(( 1 + 1 ))B" '$(( 2**3 ))' A$((1&&1==( 1 || 1 )))B
"""
)
assert len(tokens) == 4
assert_tokens(
tokens,
TokenDesc("echo"),
TokenDesc(
"A$(( 1 + 1 ))B",
expansions=(
TokenFeature(ARITHMETIC, " 1 + 1 ", start=1, end=13, quoted=True),
),
),
TokenDesc("$(( 2**3 ))", escaped_glob_indices=(5, 6)),
TokenDesc(
"A$((1&&1==( 1 || 1 )))B",
expansions=(
TokenFeature(
ARITHMETIC,
"1&&1==( 1 || 1 )",
start=1,
end=22,
quoted=False,
),
),
),
)
def test_parse_command_expansions():
tokens = tokenize(
r"""
echo "A$(ls "foo")B" '$(ls foo)' A$(ls foo)B
"""
)
assert len(tokens) == 4
assert_tokens(
tokens,
TokenDesc("echo"),
TokenDesc(
'A$(ls "foo")B',
expansions=(
TokenFeature(COMMAND, 'ls "foo"', start=1, end=12, quoted=True),
),
),
TokenDesc("$(ls foo)"),
TokenDesc(
"A$(ls foo)B",
expansions=(
TokenFeature(COMMAND, "ls foo", start=1, end=10, quoted=False),
),
),
)
def test_parse_operator_tokens():
# NB we also parse `;;`, `;&`, `;;&`, but we don't parse case statements, so
# they are not exercised here
for op in ("&", "&&", "|", "||", "|&", ";"):
tokens = tokenize(f"echo 1{op}echo 2 {op} echo 3")
assert len(tokens) == 8
assert_tokens(
tokens,
TokenDesc("echo"),
TokenDesc("1"),
TokenDesc(op, type=TokenType.CONTROL_OPERATOR),
TokenDesc("echo"),
TokenDesc("2"),
TokenDesc(op, type=TokenType.CONTROL_OPERATOR),
TokenDesc("echo"),
TokenDesc("3"),
)
def test_parse_tilde_tokens():
tokens = tokenize(
r"""
echo ~ ~/ ~/foo ~user.name ~root/Library ~+ ~- ~12 ~+42 ~-11 ~-/foo ~+/foo x~ ~@ "~"
"""
)
assert len(tokens) == 16
assert_tokens(
tokens,
TokenDesc("echo"),
TokenDesc(
"~",
expansions=(TokenFeature(TILDE, "~", start=0, end=1, quoted=False),),
),
TokenDesc(
"~/",
expansions=(TokenFeature(TILDE, "~", start=0, end=1, quoted=False),),
),
TokenDesc(
"~/foo",
expansions=(TokenFeature(TILDE, "~", start=0, end=1, quoted=False),),
),
TokenDesc(
"~user.name",
expansions=(
TokenFeature(TILDE, "~user.name", start=0, end=10, quoted=False),
),
),
TokenDesc(
"~root/Library",
expansions=(TokenFeature(TILDE, "~root", start=0, end=5, quoted=False),),
),
TokenDesc(
"~+",
expansions=(TokenFeature(TILDE, "~+", start=0, end=2, quoted=False),),
),
TokenDesc(
"~-",
expansions=(TokenFeature(TILDE, "~-", start=0, end=2, quoted=False),),
),
TokenDesc(
"~12",
expansions=(TokenFeature(TILDE, "~12", start=0, end=3, quoted=False),),
),
TokenDesc(
"~+42",
expansions=(TokenFeature(TILDE, "~+42", start=0, end=4, quoted=False),),
),
TokenDesc(
"~-11",
expansions=(TokenFeature(TILDE, "~-11", start=0, end=4, quoted=False),),
),
TokenDesc(
"~-/foo",
expansions=(TokenFeature(TILDE, "~-", start=0, end=2, quoted=False),),
),
TokenDesc(
"~+/foo",
expansions=(TokenFeature(TILDE, "~+", start=0, end=2, quoted=False),),
),
TokenDesc("x~"),
TokenDesc("~@"),
TokenDesc("~"),
)
def test_parse_backticks():
tokens = tokenize(
r"""
echo $WOO`goal or "${wut}" span`" and `more stuff` !" `yes`
"""
)
assert len(tokens) == 3
assert_token(tokens[0], "echo")
assert_token(
tokens[1],
'$WOO`goal or "${wut}" span` and `more stuff` !',
expansions=(
TokenFeature(
type=VARIABLE,
content="WOO",
start=0,
end=4,
quoted=False,
),
TokenFeature(
type=COMMAND,
content='goal or "${wut}" span',
start=4,
end=27,
quoted=False,
),
TokenFeature(
type=COMMAND,
content="more stuff",
start=32,
end=44,
quoted=True,
),
),
)
assert_token(
tokens[2],
"`yes`",
expansions=(
TokenFeature(
type=COMMAND,
content="yes",
start=0,
end=5,
quoted=False,
),
),
)
def test_parse_isolated_variables():
tokens = tokenize(
"""
echo $NAKED_UPPER $naked_lower_1 ${CURLIES1} ${curlies2} "$Quoted" "${QuotedBrackets}" '$SQuoted' '${SQuotedBrackets}'
"""
)
assert len(tokens) == 9
assert_token(tokens[0], "echo")
assert_token(
tokens[1],
"$NAKED_UPPER",
expansions=(TokenFeature(VARIABLE, "NAKED_UPPER", 0, 12, False),),
)
assert_token(
tokens[2],
"$naked_lower_1",
expansions=(TokenFeature(VARIABLE, "naked_lower_1", 0, 14, False),),
)
assert_token(
tokens[3],
"${CURLIES1}",
expansions=(TokenFeature(VARIABLE, "CURLIES1", 0, 11, False),),
)
assert_token(
tokens[4],
"${curlies2}",
expansions=(TokenFeature(VARIABLE, "curlies2", 0, 11, False),),
)
assert_token(
tokens[5],
"$Quoted",
expansions=(TokenFeature(VARIABLE, "Quoted", 0, 7, True),),
)
assert_token(
tokens[6],
"${QuotedBrackets}",
expansions=(TokenFeature(VARIABLE, "QuotedBrackets", 0, 17, True),),
)
assert_token(tokens[7], "$SQuoted")
assert_token(tokens[8], "${SQuotedBrackets}")
tokens = tokenize("echo $")
assert len(tokens) == 2
assert_token(tokens[0], "echo")
assert_token(tokens[1], "$")
def test_parse_escaped_variables():
tokens = tokenize(
r"""
echo \$NAKED_UPPER \\\$naked_lower_1 \\\\\${CURLIES1} \${curlies2} "\$Quoted" "\${QuotedBrackets}" '\$SQuoted' '\${SQuotedBrackets}'
"""
)
assert len(tokens) == 9
assert_token(tokens[0], "echo")
assert_token(tokens[1], "$NAKED_UPPER")
assert_token(tokens[2], "\\$naked_lower_1")
assert_token(tokens[3], "\\\\${CURLIES1}")
assert_token(tokens[4], "${curlies2}")
assert_token(tokens[5], "$Quoted")
assert_token(tokens[6], "${QuotedBrackets}")
assert_token(tokens[7], "\\$SQuoted")
assert_token(tokens[8], "\\${SQuotedBrackets}")
def test_parse_special_variables():
tokens = tokenize(
"""
echo $FOO$0A$1A$2A$3A$4A$5A$6A$7A$8A$9A$$A$!A$*A$?A$-A$#A$${LOL}
"""
)
assert len(tokens) == 2
assert_token(tokens[0], "echo")
assert_token(
tokens[1],
"$FOO$0A$1A$2A$3A$4A$5A$6A$7A$8A$9A$$A$!A$*A$?A$-A$#A$${LOL}",
expansions=(
TokenFeature(
type=VARIABLE,
content="FOO",
start=0,
end=4,
quoted=False,
),
TokenFeature(
type=VARIABLE,
content="0",
start=4,
end=6,
quoted=False,
),
TokenFeature(
type=VARIABLE,
content="1",
start=7,
end=9,
quoted=False,
),
TokenFeature(
type=VARIABLE,
content="2",
start=10,
end=12,
quoted=False,
),
TokenFeature(
type=VARIABLE,
content="3",
start=13,
end=15,
quoted=False,
),
TokenFeature(
type=VARIABLE,
content="4",
start=16,
end=18,
quoted=False,
),
TokenFeature(
type=VARIABLE,
content="5",
start=19,
end=21,
quoted=False,
),
TokenFeature(
type=VARIABLE,
content="6",
start=22,
end=24,
quoted=False,
),
TokenFeature(
type=VARIABLE,
content="7",
start=25,
end=27,
quoted=False,
),
TokenFeature(
type=VARIABLE,
content="8",
start=28,
end=30,
quoted=False,
),
TokenFeature(
type=VARIABLE,
content="9",
start=31,
end=33,
quoted=False,
),
TokenFeature(
type=VARIABLE,
content="$",
start=34,
end=36,
quoted=False,
),
TokenFeature(
type=VARIABLE,
content="!",
start=37,
end=39,
quoted=False,
),
TokenFeature(
type=VARIABLE,
content="*",
start=40,
end=42,
quoted=False,
),
TokenFeature(
type=VARIABLE,
content="?",
start=43,
end=45,
quoted=False,
),
TokenFeature(
type=VARIABLE,
content="-",
start=46,
end=48,
quoted=False,
),
TokenFeature(
type=VARIABLE,
content="#",
start=49,
end=51,
quoted=False,
),
TokenFeature(
type=VARIABLE,
content="$",
start=52,
end=54,
quoted=False,
),
),
)
def test_parse_jumbled_variables():
tokens = tokenize(
"""
echo go$NAKED_UPPER$naked_lower_1>${CURLIES1}"${curlies2} $Quoted"ok''"${QuotedBrackets}"'$SQuoted'99'${SQuotedBrackets}'
"""
)
assert_token(tokens[0], "echo")
assert_token(
tokens[1],
"go$NAKED_UPPER$naked_lower_1",
expansions=(
TokenFeature(
type=VARIABLE,
content="NAKED_UPPER",
start=2,
end=14,
quoted=False,
),
TokenFeature(
type=VARIABLE,
content="naked_lower_1",
start=14,
end=28,
quoted=False,
),
),
)
assert_token(tokens[2], ">", type=2)
assert_token(
tokens[3],
"${CURLIES1}${curlies2} $Quotedok${QuotedBrackets}$SQuoted99${SQuotedBrackets}",
expansions=(
TokenFeature(
type=VARIABLE,
content="CURLIES1",
start=0,
end=11,
quoted=False,
),
TokenFeature(
type=VARIABLE,
content="curlies2",
start=11,
end=22,
quoted=True,
),
TokenFeature(
type=VARIABLE,
content="Quoted",
start=23,
end=30,
quoted=True,
),
TokenFeature(
type=VARIABLE,
content="QuotedBrackets",
start=32,
end=49,
quoted=True,
),
),
)
@pytest.mark.parametrize(
"example",
[
">",
"<",
">>",
">&",
"<&", # semantically invalid without following digit
"&>",
"&>>",
">|",
"<>",
"1>",
"1>>",
"1>&",
"1>|",
"1<>",
"1>&2",
"1<&2",
">&2",
"<&2",
"1>&-",
"1<&-",
"1>&2-",
"1<&2-",
">&2-",
"<&2-",
"11>",
"11>>",
"11>&",
"11>|",
"11<>",
"11>&12",
"11<&12",
">&12",
"<&12",
"11>&-",
"11<&-",
"11>&12-",
"11<&12-",
">&12-",
"<&12-",
],
)
def test_tokenize_redirects(example):
tokens = tokenize(f"echo foo {example} bar")
assert len(tokens) == 4
assert_token(tokens[0], "echo")
assert_token(tokens[1], "foo")
assert_token(tokens[2], example, type=2)
assert_token(tokens[3], "bar")
@pytest.mark.parametrize(
"example, equivalent",
[
("echo foo 1&> bar", "echo foo 1 &> bar"),
("echo foo 1&>bar", "echo foo 1 &> bar"),
("echo foo 1&>> bar", "echo foo 1 &>> bar"),
("echo foo 1&>>bar", "echo foo 1 &>> bar"),
("echo foo 1&>2 bar", "echo foo 1 &> 2 bar"),
("echo foo 1&>>2bar", "echo foo 1 &>> 2bar"),
("echo foo 1>2 bar", "echo foo 1> 2 bar"),
("echo foo 1>2bar", "echo foo 1> 2bar"),
("echo foo 1>>2 bar", "echo foo 1>> 2 bar"),
("echo foo 1>>2bar", "echo foo 1>> 2bar"),
("echo foo 1&>2 bar", "echo foo 1&> 2 bar"),
("echo foo 1&>2bar", "echo foo 1&> 2bar"),
("echo foo 1&>>2 bar", "echo foo 1&>> 2 bar"),
("echo foo 1&>>2bar", "echo foo 1&>> 2bar"),
("echo foo 1>|2 bar", "echo foo 1>| 2 bar"),
("echo foo 1>|2bar", "echo foo 1>| 2bar"),
("echo foo 1<>2 bar", "echo foo 1<> 2 bar"),
("echo foo 1<>2bar", "echo foo 1<> 2bar"),
("echo foo >2 bar", "echo foo > 2 bar"),
("echo foo >2bar", "echo foo > 2bar"),
("echo foo >>2 bar", "echo foo >> 2 bar"),
("echo foo >>2bar", "echo foo >> 2bar"),
("echo foo &>2 bar", "echo foo &> 2 bar"),
("echo foo &>2bar", "echo foo &> 2bar"),
("echo foo &>>2 bar", "echo foo &>> 2 bar"),
("echo foo &>>2bar", "echo foo &>> 2bar"),
("echo foo >|2 bar", "echo foo >| 2 bar"),
("echo foo >|2bar", "echo foo >| 2bar"),
("echo foo <>2 bar", "echo foo <> 2 bar"),
("echo foo <>2bar", "echo foo <> 2bar"),
("echo foo 2x> bar", "echo foo 2x > bar"),
("echo foo 2x>bar", "echo foo 2x > bar"),
("echo foo &< bar", "echo foo & < bar"),
("echo foo &<bar", "echo foo & < bar"),
],
)
def test_tokenize_redirect_boundaries(example, equivalent):
assert tokenize(example) == tokenize(equivalent)
@pytest.mark.parametrize(
"example, error_msg",
[
("echo foo &>| bar", "Syntax error near redirect operator '&>'"),
("echo foo &>< bar", "Syntax error near redirect operator '&>'"),
("echo foo &>& bar", "Syntax error near redirect operator '&>'"),
],
)
def test_tokenize_redirect_errors(example, error_msg):
with pytest.raises(CommandLineTokenizeError) as e_info:
tokenize(example)
assert e_info.value.args[0] == error_msg
def test_tokens_with_digits():
tokens = tokenize("echo foo 888bar")
assert_token(tokens[0], "echo")
assert_token(tokens[1], "foo")
assert_token(tokens[2], "888bar")
tokens = tokenize("echo foo888 bar")
assert_token(tokens[0], "echo")
assert_token(tokens[1], "foo888")
assert_token(tokens[2], "bar")
tokens = tokenize("echo foo888bar")
assert_token(tokens[0], "echo")
assert_token(tokens[1], "foo888bar")
tokens = tokenize("echo foo 888-bar")
assert_token(tokens[0], "echo")
assert_token(tokens[1], "foo")
assert_token(tokens[2], "888-bar")
tokens = tokenize("echo foo 888 bar")
assert_token(tokens[0], "echo")
assert_token(tokens[1], "foo")
assert_token(tokens[2], "888")
assert_token(tokens[3], "bar")
def assert_token(
token,
content: str,
includes_glob=False,
escaped_glob_indices=tuple(),
expansions=tuple(),
type=0,
):
assert token.content == content
assert token.includes_glob == includes_glob
assert token.escaped_glob_indices == escaped_glob_indices
assert token.expansions == expansions
assert token.type == TokenType(type)
class TokenDesc:
def __init__(
self,
content: str,
includes_glob=False,
escaped_glob_indices=tuple(),
expansions=tuple(),
type=0,
):
self.content = content
self.includes_glob = includes_glob
self.escaped_glob_indices = escaped_glob_indices
self.expansions = expansions
self.type = type
def assert_tokens(tokens, *token_descriptions):
for index, desc in enumerate(token_descriptions):
assert_token(
tokens[index],
content=desc.content,
includes_glob=desc.includes_glob,
escaped_glob_indices=desc.escaped_glob_indices,
expansions=desc.expansions,
type=desc.type,
)