Last active
January 13, 2020 02:40
-
-
Save Meorawr/f039a051afe5744c3843323a6c1cc699 to your computer and use it in GitHub Desktop.
TRP3 Saved Variables Parser
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python3 | |
''' | |
Limited reimplementation of the Lua 5.1 lexer and parser providing utilities | |
for serialization and deserialization of World of Warcraft saved variables | |
files. | |
The grammar supported by the parser is a subset of the Lua 5.1 grammar, | |
any unimplemented features will raise errors if encountered in a source file. | |
''' | |
from dataclasses import dataclass | |
from enum import Enum | |
from sys import maxunicode | |
from typing import Any, Callable, Iterable, Iterator, Mapping, Optional, \ | |
NewType, TextIO, Union | |
# Type aliases shared by the lexer, parser, and (de)serialization helpers.

Lexeme = Union[None, str, float]
'''
Lexeme represents a parsed source code unit. This may optionally be decoded
into a specific type, for example numeric tokens would have a floating point
lexeme representing the value.
'''

LexemeDecoder = Callable[[str], Lexeme]
'''
LexemeDecoder represents a function type that translates a lexeme from an
input source code string to any valid lexeme value type. This can be used
to decode numbers to actual floating point values, for example.
'''

SymbolValue = Union[None, bool, float, int, str]
'''
SymbolValue represents a union of types that are equivalent to immediate Lua
values, as emitted with VALUE kind symbols.

Nil values are translated to None, booleans are booleans, numbers are floats,
and strings are strings.
'''

SymbolName = NewType('SymbolName', str)
'''
SymbolName represents the string name of a variable emitted as part of
START_VARIABLE kind symbols.
'''

LuaTable = Mapping[Union[bool, float, int, str], 'LuaValue']
'''
LuaTable represents a Python-compatible Lua table representation as a mapping
of simple hashable values to the generic LuaValue type.

Of note, this means you can't use lists/dicts as keys in tables. This should
be fine for our uses.
'''

LuaValue = Union[bool, float, int, str, None, LuaTable]
'''
LuaValue represents a union of all value Lua types, including tables.
'''
def isspace(s: str):
    '''Checks whether the given string is ASCII whitespace.'''
    return s.isspace() and s.isascii()
def isalpha(s: str):
    '''Checks whether the given string is ASCII alphabetical.'''
    return s.isalpha() and s.isascii()
def isdigit(s: str):
    '''Checks whether the given string is an ASCII numeric digit.'''
    return s.isdigit() and s.isascii()
def isalnum(s: str):
    '''Checks whether the given string is ASCII alphanumeric.'''
    return s.isalnum() and s.isascii()
def isquote(s: str):
    '''Checks whether the given string is a single or double quote.'''
    return s in ('"', "'")
def iseol(s: str):
    '''Checks whether the given string is a single EOL marker character.'''
    return s in ('\r', '\n')
def iseof(s: str):
    '''Checks whether the given string is empty, signalling an EOF state.'''
    return s == ''
def iskeyword(s: str):
    '''Checks whether the given string is a Lua keyword.'''
    # Saved variables files should only ever contain these three keywords.
    return s in ('false', 'nil', 'true')
def issymbol(s: str):
    '''Checks whether the given string is a Lua symbol.'''
    # Saved variables files should only ever contain these seven symbols.
    return s in {',', '=', '{', '[', '-', '}', ']'}
@dataclass
class Location:
    '''
    Tracks a position within a source file by name, line, and column.
    All indices are zero-based.
    '''

    name: str
    '''Name of the source file.'''

    line: int = 0
    '''File line number.'''

    column: int = 0
    '''File column number.'''

    def __str__(self):
        '''Formats this location as `name:line:column`.'''
        return ':'.join((self.name, str(self.line), str(self.column)))

    def advance_line(self, lines=1):
        '''Moves the line index forward, resetting the column to 0.'''
        self.line = self.line + lines
        self.column = 0

    def advance_column(self, columns=1):
        '''Moves the column index forward.'''
        self.column = self.column + columns

    def copy(self):
        '''Creates and returns a shallow copy of this location.'''
        return Location(name=self.name, line=self.line, column=self.column)
class TokenKind(Enum):
    '''
    Enumerates the categories of tokens produced by the Lexer.
    '''

    EOF = 'EOF'
    KEYWORD = 'KEYWORD'
    NAME = 'NAME'
    NUMBER = 'NUMBER'
    STRING = 'STRING'
    SYMBOL = 'SYMBOL'

    def __repr__(self) -> str:
        '''Returns the internal string representation of a token kind.'''
        return '<{}.{}>'.format(type(self).__name__, self.name)

    def __str__(self) -> str:
        '''Returns the printable string representation of a token kind.'''
        return '<' + str(self.value).lower() + '>'
@dataclass
class Token:
    '''
    A single token produced by the lexer, carrying its source location
    range and the lexeme data associated with it.
    '''

    kind: TokenKind
    '''Kind of token represented.'''

    lexeme: Lexeme
    '''Lexeme data attached to this token.'''

    start: Location
    '''Start location of the token in the input.'''

    end: Location
    '''End location of the token in the input.'''

    def __str__(self) -> str:
        '''Returns the printable string representation of this token.'''
        if self.lexeme is None:
            return str(self.kind)
        # repr() gives quoted output for string lexemes, which is the
        # desired form when echoing tokens back in diagnostics.
        return repr(self.lexeme)
class TokenBuilder:
    '''
    State machine for assembling Token instances. Tracks the start and end
    locations of the token in progress along with a buffer for its lexeme.
    '''

    def __init__(self, location: Location):
        '''
        Initializes the builder against the given source location reference.
        The location may be mutated externally and is only copied when
        `start()` or `end()` is called.
        '''
        self._location = location
        self._buffer = []
        self._start = None  # Start location, captured by start().

    def lexeme(self, decoder: Optional[LexemeDecoder] = None) -> Lexeme:
        '''
        Returns the lexeme accumulated in the builder's buffer, optionally
        passed through the given decoder.

        With no decoder and an empty buffer, None is returned.
        '''
        data = ''.join(self._buffer)
        if decoder is not None:
            return decoder(data)
        if not data:
            return None
        return data

    def reset(self):
        '''
        Clears all stored state for the current token, permitting fresh
        `start()` and `end()` calls.
        '''
        self._buffer.clear()
        self._start = None

    def start(self):
        '''
        Marks the current source location as the start of a new token.

        Raises an AssertionError if a token is already in progress.
        '''
        assert self._start is None
        self._start = self._location.copy()

    def append(self, data: str):
        '''
        Adds the given string data to the lexeme buffer.

        Raises an AssertionError if `start()` has not been called.
        '''
        assert self._start is not None
        self._buffer.append(data)

    def end(self,
            kind: TokenKind,
            decoder: Optional[LexemeDecoder] = None) -> Token:
        '''
        Completes the token in progress, returning it with the given kind
        and the buffered lexeme (decoded via the optional decoder).

        Raises an AssertionError if `start()` has not been called. The
        builder is reset afterwards and may be reused for further tokens.
        '''
        assert self._start is not None
        result = Token(kind=kind,
                       lexeme=self.lexeme(decoder=decoder),
                       start=self._start,
                       end=self._location.copy())
        self.reset()
        return result

    def build(self, kind: TokenKind, data: Optional[str] = None):
        '''
        Convenience helper that emits a complete token with optional lexeme
        data. Mostly useful for EOF markers and other fixed-size tokens.
        '''
        self.start()
        # When lexeme data is supplied, the location we just captured is
        # assumed to sit *after* the data, so rewind the start column by
        # the data's length.
        if data is not None:
            self._start.advance_column(-len(data))
            self.append(data)
        return self.end(kind)
class LexerError(Exception):
    '''Error type raised by the lexer upon invalid or malformed input.'''

    def __init__(self, message: str, scope: Union[None, Location, Token]):
        '''
        Initializes the error with the given message string, optionally
        including scope information from the given location or token.
        '''
        if isinstance(scope, Token):
            what = '{}: {} near {}'.format(scope.end, message, scope)
        elif isinstance(scope, Location):
            what = '{}: {}'.format(scope, message)
        else:
            what = message
        super().__init__(what)
class Lexer:
    '''
    Lexer implements a Lua source file lexer that yields tokens upon
    successive next calls. Instances of this type may be used as iterators
    with the `next()` builtin or `for token in lexer` style loops.

    This implements a subset of the language and will raise errors upon
    malformed or unsupported input.
    '''

    def __init__(self, input: TextIO):
        '''
        Initializes the lexer to read from the given TextIO stream, such as a
        file.
        '''
        self.token = None  # Most recently read token.
        self._input = input
        # Use the stream's `name` attribute for diagnostics when available
        # (file objects have one); fall back to str(input) otherwise.
        self._location = Location(getattr(input, 'name', str(input)))
        self._builder = TokenBuilder(self._location)
        self._char = None  # Initialized on first advancement.

    def __iter__(self) -> Iterator[Token]:
        '''Returns an iterator that yields tokens.'''
        return self

    def __next__(self) -> Token:
        '''
        Returns the next token from the stream. Raises StopIteration when
        an EOF token is obtained.
        '''
        self.token = self._next_token()
        if self.token.kind == TokenKind.EOF:
            raise StopIteration
        else:
            return self.token

    def _next_token(self) -> Token:
        '''
        Returns the next token from the stream. This will cause the IO
        stream to be read and internal state to be advanced. Upon EOF,
        an EOF kind token is yielded; further calls beyond an EOF token will
        result in a LexerError.
        '''
        # If this is our first call, we need to advance to the first byte.
        if self._char is None:
            self._read_char()
        self._builder.reset()
        return self._read_token()

    def _read_token(self) -> Token:
        '''Reads and returns a token from the underlying stream.'''
        assert self._char is not None  # Must have read a character.
        if self._char == '-':
            # A `-` could indicate the start of a comment, or just a symbol.
            self._read_char()
            if self._char == '-':
                self._skip_comment()
                return self._read_token()
            else:
                return self._builder.build(TokenKind.SYMBOL, '-')
        elif self._char == '[':
            # Lua supports long strings with `[[content]]` syntax, but the
            # saved variable format encodes all strings with quotes only.
            self._read_char()
            if self._char == '[':
                # FIXME: Not bulletproof. Doesn't detect `[=[content]=]`.
                raise LexerError('unsupported long string', self._location)
            else:
                return self._builder.build(TokenKind.SYMBOL, '[')
        elif self._char == '=':
            # An `=` could be an assignment operator or the start of a
            # comparison operator. Saved variables should only contain
            # assignments, and thus are single-character symbols.
            self._read_char()
            if self._char == '=':
                raise LexerError('unsupported `==` operator', self._location)
            else:
                return self._builder.build(TokenKind.SYMBOL, '=')
        elif self._char == '.':
            # Periods could indicate a few operators, but saved variables
            # should only ever emit them as fractional numeric literals.
            return self._read_numeral_token()
        elif isquote(self._char):
            # Quoted string literal token.
            return self._read_string_token()
        elif iseol(self._char):
            # Newline character. This *must* be handled before whitespace
            # since newlines are whitespace. Skip newlines and recurse.
            self._skip_newline()
            return self._read_token()
        elif isspace(self._char):
            # Whitespace character. Must not be a newline. Whitespace is
            # skipped and we just recurse otherwise.
            assert not iseol(self._char)
            self._read_char()
            return self._read_token()
        elif isalpha(self._char) or self._char == '_':
            # Keyword or identifier token.
            return self._read_name_token()
        elif isdigit(self._char):
            # Numeric literal token.
            return self._read_numeral_token()
        elif iseof(self._char):
            # EOF token.
            return self._builder.build(TokenKind.EOF)
        else:
            # Symbol token.
            return self._read_symbol_token()

    def _read_symbol_token(self) -> Token:
        '''Reads a single-character symbol token from the input stream.'''
        assert issymbol(self._char)  # Must be positioned on a symbol.
        symbol = self._char
        self._read_char()
        return self._builder.build(TokenKind.SYMBOL, symbol)

    def _read_name_token(self) -> Token:
        '''Reads and returns an identifier or keyword from the stream.'''
        assert isalpha(self._char) or self._char == '_'
        # Read all name-valid characters into the token builder.
        self._builder.start()
        while isalnum(self._char) or self._char == '_':
            self._builder.append(self._char)
            self._read_char()
        # The lexeme contents are either a keyword or identifier. Figure it
        # out and yield the right token kind.
        if iskeyword(self._builder.lexeme()):
            return self._builder.end(TokenKind.KEYWORD)
        else:
            return self._builder.end(TokenKind.NAME)

    def _read_numeral_token(self) -> Token:
        '''Reads a numeric literal token from the input stream.'''
        # Must be positioned on either a digit or a period.
        assert isdigit(self._char) or self._char == '.'
        # Consume all digits/periods into the buffer. We omit support for
        # exponents and hexadecimal numbers.
        self._builder.start()
        while isdigit(self._char) or self._char == '.':
            self._builder.append(self._char)
            self._read_char()
        try:
            # Decode as an integer if the lexeme contains no periods.
            decoder = float if '.' in self._builder.lexeme() else int
            return self._builder.end(TokenKind.NUMBER, decoder=decoder)
        except ValueError:
            # int()/float() rejected the buffered text (e.g. `1.2.3`).
            raise LexerError('malformed number', self._location)

    def _read_string_token(self) -> Token:
        '''Reads and returns a quoted string literal token from the stream.'''
        assert isquote(self._char)  # Must be positioned on a quote.
        quote = self._char
        # Consume characters until we reach the same terminating quote.
        self._builder.start()
        self._read_char()
        while self._char != quote:
            if iseof(self._char):
                # End of stream reached.
                raise LexerError('unfinished string',
                                 self._builder.end(TokenKind.EOF))
            elif iseol(self._char):
                # End of line character without a preceding backslash.
                raise LexerError('unfinished string',
                                 self._builder.end(TokenKind.STRING))
            elif self._char == '\\':
                # Escape sequence. Advances the stream to after the escape,
                # so no follow-up read is required.
                self._builder.append(self._read_string_escape())
            else:
                # Standard character/byte, no processing needed.
                self._builder.append(self._char)
                self._read_char()
        # Discard the trailing quote.
        self._read_char()
        return self._builder.end(TokenKind.STRING)

    def _read_string_escape(self) -> str:
        '''
        Reads an escape sequence from within a string literal. Assumes that
        we're starting on the backslash, and returns the transformed literal
        as a string. If EOF is reached, returns an empty string. The stream
        will be advanced beyond the end of the escape sequence.
        '''
        assert self._char == '\\'  # Assume we're starting on the backslash.
        self._read_char()
        if self._char == 'a':  # Bell.
            self._read_char()
            return '\a'
        elif self._char == 'b':  # Backspace.
            self._read_char()
            return '\b'
        elif self._char == 'f':  # Form feed.
            self._read_char()
            return '\f'
        elif self._char == 'n':  # Line feed.
            self._read_char()
            return '\n'
        elif self._char == 'r':  # Carriage return.
            self._read_char()
            return '\r'
        elif self._char == 't':  # Horizontal tab.
            self._read_char()
            return '\t'
        elif self._char == 'v':  # Vertical tab.
            self._read_char()
            return '\v'
        elif iseol(self._char):
            # A \ followed by a newline is converted to an \n character and
            # needs to be processed specially for location tracking.
            self._skip_newline()
            return '\n'
        elif iseof(self._char):
            # Return the EOF unmodified.
            return self._char
        elif not isdigit(self._char):
            # Non-digit characters at this point imply that we're dealing
            # with an escaped special character, like a quote or backslash.
            # These can be passed straight through.
            char = self._char
            self._read_char()
            return char
        else:
            # The set can only be a digit at this point, which means this
            # is a codepoint in the form `\123`. These may come in a series,
            # which we need to collect all of and decode to form a string.
            # NOTE(review): assumes the input stream exposes an `encoding`
            # attribute (true for files, not e.g. io.StringIO) — confirm.
            encoding = self._input.encoding
            codepoint = 0x00000000
            bytecount = 0
            byte = 0
            while True:
                # Convert the digit character to part of this ordinal byte.
                # (ord(c) - 48 maps '0'..'9' to 0..9.)
                byte = (byte * 10) + (ord(self._char[0]) - 48)
                self._read_char()
                if isdigit(self._char) and byte <= 255:
                    # Next character is a digit and we've still got an ordinal
                    # below the maximum representable range.
                    continue
                elif byte > 255:
                    # Ordinal is outside the valid escape range.
                    raise LexerError('escape sequence too large',
                                     self._builder.end(TokenKind.STRING))
                # Attempt to decode the codepoint we've amassed thus far
                # using the same encoding as our input stream.
                try:
                    codepoint |= byte
                    bytecount += 1
                    return codepoint.to_bytes(bytecount, 'big') \
                        .decode(encoding)
                except UnicodeDecodeError as ex:
                    # Is there another escape sequence we can try?
                    if self._char != '\\' or bytecount == 4:
                        raise LexerError(ex.reason,
                                         self._builder.end(TokenKind.STRING))
                    # Is it actually an ordinal too?
                    self._read_char()
                    if not isdigit(self._char):
                        raise LexerError(ex.reason,
                                         self._builder.end(TokenKind.STRING))
                    # Shift the codepoint up 8 bits and try reading the next
                    # ordinal escape sequence for decoding.
                    codepoint <<= 8
                    byte = 0
                    continue

    def _skip_comment(self):
        '''
        Reads and discards a comment sequence from the stream. This yields
        no token as a result, but updates our location information.
        '''
        assert self._char == '-'  # Must be positioned on a minus.
        # We assume saved variables never contain block comments.
        self._read_char()
        if self._char == '[':
            # FIXME: Not bulletproof. Doesn't detect `--[=[comment]=]`, and
            # falsely triggers on `--[ comment`.
            raise LexerError('unsupported block comment', self._location)
        # Consume all characters until an EOL or EOF marker.
        while not iseol(self._char) and not iseof(self._char):
            self._read_char()

    def _skip_newline(self):
        '''
        Reads and discards a newline sequence from the stream. This yields
        no token as a result, but updates our location information.
        '''
        assert iseol(self._char)  # Must be positioned on an EOL character.
        # Record the current EOL character and read the next one. EOL markers
        # can come in non-matching pairs (\r\n or \n\r), which we want to
        # consume and skip as one unit.
        previous_eol = self._char
        self._read_char()
        if iseol(self._char) and self._char != previous_eol:
            self._read_char()
        # Advance our line and column numbering.
        self._location.advance_line()
        self._location.advance_column()

    def _read_char(self):
        '''Advances the source stream by a single character.'''
        # If we're at EOF already, force a LexerError to bubble up.
        if self._char is not None and iseof(self._char):
            raise LexerError('unexpected EOF', self._location)
        self._char = self._input.read(1)
        # If this was our first read our line number needs incrementing.
        # Do this before incrementing the column as this resets it.
        if self._location.line == 0:
            self._location.advance_line()
        self._location.advance_column()
class SymbolKind(Enum):
    '''
    Enumerates the non-terminal and terminal symbols that the parser can
    emit. This permits construction of a limited AST in a streaming
    fashion rather than materializing a full tree on each parse.
    '''

    END_TABLE = 'END_TABLE'
    END_TABLE_KEY = 'END_TABLE_KEY'
    END_TABLE_VALUE = 'END_TABLE_VALUE'
    END_VARIABLE = 'END_VARIABLE'
    START_TABLE = 'START_TABLE'
    START_TABLE_KEY = 'START_TABLE_KEY'
    START_TABLE_VALUE = 'START_TABLE_VALUE'
    START_VARIABLE = 'START_VARIABLE'
    VALUE = 'VALUE'

    def __repr__(self):
        '''Returns the internal string representation of a symbol kind.'''
        return '<{}.{}>'.format(type(self).__name__, self.name)

    def __str__(self):
        '''Returns the printable string representation of a symbol kind.'''
        return str(self.value)
@dataclass
class Symbol:
    '''
    A single symbol emitted by the parser: a SymbolKind combined with
    kind-specific data.

    For most symbols the data field will be None. For START_VARIABLE and
    END_VARIABLE it holds a SymbolName, and for VALUE a SymbolValue.
    '''

    kind: SymbolKind
    '''Kind of symbol.'''

    data: Union[None, SymbolName, SymbolValue] = None
    '''Optional data associated with this symbol. Kind-specific.'''

    def __str__(self):
        '''Returns the printable string representation of a symbol kind.'''
        # VALUE symbols always show their data, even when it is None (nil).
        if self.data is None and self.kind != SymbolKind.VALUE:
            return f'Symbol({self.kind})'
        return f'Symbol({self.kind}, {self.data!r})'
class ParseError(Exception):
    '''Error type raised by the parser upon invalid or malformed input.'''

    def __init__(self, message: str, scope: Union[None, Location, Token]):
        '''
        Initializes the error with the given message string including
        location information from the given scope.
        '''
        prefix = ''
        suffix = ''
        if isinstance(scope, Token):
            prefix = '{}: '.format(scope.start)
            suffix = ', got {}'.format(scope)
        elif isinstance(scope, Location):
            prefix = '{}: '.format(scope)
        super().__init__(prefix + message + suffix)
class Parser:
    '''
    Parser implements a Lua source file parser that yields symbols upon
    advancement via the `next()` builtin or `for symbol in parser` style
    loops.

    This implements a subset of the language and will raise errors upon
    malformed or unsupported input.
    '''

    @staticmethod
    def read_from(stream: TextIO):
        '''Initializes a parser to read from the given TextIO stream.'''
        return Parser(Lexer(stream))

    def __init__(self, lexer: Lexer):
        '''Initializes a parser to read tokens from the given Lexer.'''
        self.symbol = None  # Most recently read symbol. None on EOF.
        self._lexer = lexer
        self._token = None  # Most recently parsed token.
        self._state = self._parse_chunk()  # Iterable (lazy generator).

    def __iter__(self) -> Iterator[Symbol]:
        '''Returns an iterator for accessing symbols from the parser.'''
        return self

    def __next__(self) -> Symbol:
        '''
        Returns the next symbol from the parser. Raises StopIteration upon
        reaching EOF.
        '''
        if self._token is None:  # No tokens yet read, start.
            self._read_token()
        if self._token.kind == TokenKind.EOF:  # EOF token reached, stop.
            self.symbol = None
            raise StopIteration
        else:
            self.symbol = next(self._state)
            return self.symbol

    def _parse_chunk(self) -> Iterable[Symbol]:
        '''
        Yields all symbols present within a Lua chunk. Continues until the
        underlying token stream reaches EOF.
        '''
        # Chunks are an optionally-semicolon-delimited sequence of statements.
        while not self._accept_token(TokenKind.EOF):
            yield from self._parse_statement()
            self._accept_and_read_token(TokenKind.SYMBOL, ';')

    def _parse_statement(self) -> Iterable[Symbol]:
        '''Yields symbols from a Lua statement.'''
        # Lua supports a varied range of statements, but the only ones we
        # will find in saved variable files will be assignment statements.
        # Expect a variable name identifier.
        self._expect_token(TokenKind.NAME)
        variable_name = SymbolName(self._token.lexeme)
        yield Symbol(SymbolKind.START_VARIABLE, variable_name)
        # Expect an `=` symbol.
        self._read_token()
        self._expect_and_read_token(TokenKind.SYMBOL, '=')
        # The value being assigned is the result of an expression, so recurse.
        yield from self._parse_expression()
        yield Symbol(SymbolKind.END_VARIABLE, variable_name)

    def _parse_expression(self) -> Iterable[Symbol]:
        '''Yields symbols from a Lua expression.'''
        # Lua expressions are complicated and recursive, complete with unary
        # and binary operators, subexpressions, etc. Thankfully saved
        # variables files can only emit the unary minus operator and
        # simple expressions (values).
        # Test for unary minus.
        if self._accept_and_read_token(TokenKind.SYMBOL, '-'):
            # Recursively parse as a subexpression and obtain the first
            # symbol from it.
            expression = self._parse_expression()
            symbol = next(expression)
            # Saved variables should only ever generate code such that the
            # unary minus operation is applied to immediate numeric values,
            # so if this isn't the case then complain.
            if symbol.kind != SymbolKind.VALUE \
                    or not isinstance(symbol.data, (float, int)):
                raise ParseError('unsupported unary minus operation', None)
            # Apply the negation to the value and re-yield it to the caller,
            # then continue with the expression so that we properly advance
            # our token stream.
            yield Symbol(symbol.kind, -symbol.data)
            yield from expression
        else:
            # No minus, so it can only be a value expression.
            yield from self._parse_value()

    def _parse_value(self) -> Iterable[Symbol]:
        '''Yields symbols from a Lua value expression.'''
        # Value expressions (simpleexp) can only be immediate values or
        # table constructors. Any other variants are unimplemented.
        if self._accept_token(TokenKind.NUMBER):
            yield Symbol(SymbolKind.VALUE, self._token.lexeme)
        elif self._accept_token(TokenKind.STRING):
            yield Symbol(SymbolKind.VALUE, self._token.lexeme)
        elif self._accept_token(TokenKind.KEYWORD, 'nil'):
            yield Symbol(SymbolKind.VALUE, None)
        elif self._accept_token(TokenKind.KEYWORD, 'true'):
            yield Symbol(SymbolKind.VALUE, True)
        elif self._accept_token(TokenKind.KEYWORD, 'false'):
            yield Symbol(SymbolKind.VALUE, False)
        elif self._accept_token(TokenKind.SYMBOL, '{'):
            yield from self._parse_table()
        else:
            raise ParseError('unsupported expression variant', self._token)
        # Advance to the next token to close the expression.
        self._read_token()

    def _parse_table(self) -> Iterable[Symbol]:
        '''Yields symbols from a Lua table constructor expression.'''
        # Assume we're starting on the opening curly bracket.
        self._expect_and_read_token(TokenKind.SYMBOL, '{')
        yield Symbol(SymbolKind.START_TABLE)
        # Read the tokens forming the records of this table.
        while True:
            if self._accept_token(TokenKind.SYMBOL, '}'):
                # Found closing brace, break out the loop.
                break
            elif self._accept_token(TokenKind.NAME):
                # Normally a NAME token could be ambiguous between either
                # a hash record assignment (`{a = 1}`) or an list record with
                # a variable lookup (`{a}`), but saved variables should never
                # emit variable lookups inside tables.
                yield from self._parse_table_hash_record()
            elif self._accept_token(TokenKind.SYMBOL, '['):
                # An explicit `[` is always a hash record.
                yield from self._parse_table_hash_record()
            else:
                # Anything else is considered an list record.
                yield from self._parse_table_list_record()
            # Records should be separated by commas; saved variables won't
            # use semicolons as delimiters.
            if not self._accept_and_read_token(TokenKind.SYMBOL, ','):
                break
        # Our caller will advance the token beyond the closing brace, so we
        # just need to expect it rather than expect and read.
        self._expect_token(TokenKind.SYMBOL, '}')
        yield Symbol(SymbolKind.END_TABLE)

    def _parse_table_hash_record(self) -> Iterable[Symbol]:
        '''Yields symbols from a Lua table constructor hash record.'''
        # A hash record is either a NAME token or a bracketed key expression
        # followed by a value assignment.
        yield Symbol(SymbolKind.START_TABLE_KEY)
        if self._accept_token(TokenKind.NAME):
            # Literal key.
            yield Symbol(SymbolKind.VALUE, self._token.lexeme)
            self._read_token()
        else:
            # Bracketed key.
            self._expect_and_read_token(TokenKind.SYMBOL, '[')
            yield from self._parse_expression()
            self._expect_and_read_token(TokenKind.SYMBOL, ']')
        yield Symbol(SymbolKind.END_TABLE_KEY)
        self._expect_and_read_token(TokenKind.SYMBOL, '=')
        # The value is always an expression.
        yield Symbol(SymbolKind.START_TABLE_VALUE)
        yield from self._parse_expression()
        yield Symbol(SymbolKind.END_TABLE_VALUE)

    def _parse_table_list_record(self) -> Iterable[Symbol]:
        '''Yields symbols from a Lua table constructor list record.'''
        # List records are just expressions.
        yield Symbol(SymbolKind.START_TABLE_VALUE)
        yield from self._parse_expression()
        yield Symbol(SymbolKind.END_TABLE_VALUE)

    def _parse_singlevar(self) -> Iterable[Symbol]:
        '''Yields symbols from a Lua variable assignment statement.'''
        # NOTE(review): empty stub — has no body beyond its docstring, is
        # never called within this file, and returns None rather than an
        # iterator. Presumably left over from an earlier design; confirm
        # before removing.

    def _accept_token(self, kind: TokenKind, lexeme: Lexeme = None):
        '''
        Returns true if the current token matches the kind and (optional)
        lexeme data.
        '''
        assert self._token is not None  # Must have a token to test.
        return self._token.kind == kind \
            and (lexeme is None or self._token.lexeme == lexeme)

    def _expect_token(self, kind: TokenKind, lexeme: Lexeme = None):
        '''
        Raises a ParseError if the current token doesn't match the requested
        kind and (optional) lexeme data.
        '''
        if not self._accept_token(kind, lexeme):
            if lexeme is not None:
                raise ParseError(f'expected {repr(lexeme)}', self._token)
            else:
                raise ParseError(f'expected {str(kind)}', self._token)

    def _accept_and_read_token(self, kind: TokenKind, lexeme: Lexeme = None):
        '''
        Returns true and reads the next token if the current token matches
        the given kind and (optional) lexeme data.
        '''
        if self._accept_token(kind, lexeme):
            self._read_token()
            return True
        else:
            return False

    def _expect_and_read_token(self, kind: TokenKind, lexeme: Lexeme = None):
        '''
        Reads the next token if the current token matches the given kind
        and (optional) lexeme data, otherwise raises a ParseError.
        '''
        self._expect_token(kind, lexeme)
        self._read_token()

    def _read_token(self):
        '''Fetches the next token from the lexer.'''
        try:
            self._token = next(self._lexer)
        except StopIteration:
            # EOF reached, copy the token so we can detect it internally.
            self._token = self._lexer.token
class DeserializeError(Exception):
    '''Error type raised by the deserialize functions on invalid symbols.'''
    # The docstring alone is a sufficient class body; no state is carried
    # beyond the standard Exception arguments.
@dataclass
class SerializeOptions:
    '''Structure representing options used for serializing Lua values.'''
    line_prefix: str = ''
    '''
    Prefix to prepend to the start of each line. This is present for
    each key/value pair within a table, as well as the table end (}).
    '''
    line_indent: str = ''
    '''
    Indentation string, repeated for each level of indentation depth. This
    is placed after line_prefix on each suitable line.
    '''
    line_suffix: str = ''
    '''
    Suffix to append to the end of each line. This is applied to the end of
    each table start (`{`), and any key/value pairs within tables.
    '''
    trailing_comma: bool = False
    '''If true, include a trailing comma on the last entry in a table.'''
    key_value_space: bool = False
    '''If true, include spaces around `=` in table assignments.'''
    indent_depth: int = 0
    '''Indentation depth, to be incremented each time a table starts.'''
    raw: bool = False
    '''
    If true, disables `__lua_serialize__` implementations on nested
    objects when serializing.
    '''

    def copy(self) -> 'SerializeOptions':
        '''Returns a shallow copy of these options.'''
        # dataclasses.replace rebuilds the instance field-by-field, so any
        # field added to this dataclass later is copied automatically --
        # unlike the previous hand-written keyword list, which silently
        # dropped new fields.
        from dataclasses import replace
        return replace(self)
# Preset option bundles covering the three common output styles. These are
# shared module-level instances: mutate a .copy(), never the preset itself.
SERIALIZE_COMPACT = SerializeOptions()
'''Serializer options that will produce compact output.'''
SERIALIZE_SPACED = SerializeOptions(line_suffix=' ', key_value_space=True)
'''Serializer options that will produce spaced output.'''
SERIALIZE_PRETTY = SerializeOptions(line_indent='\t',
                                    line_suffix='\n',
                                    trailing_comma=True,
                                    key_value_space=True)
'''Serializer options that will produce pretty output.'''
def deserialize_table(parser: Parser) -> LuaTable:
    '''
    Deserializes a table from the parser's current state. The parser must
    be positioned on a START_TABLE symbol, and will be advanced to the next
    matching END_TABLE symbol.

    Array-style entries (those without an explicit key) are stored under
    consecutive integer keys starting at 1, mirroring Lua semantics.

    Raises DeserializeError if the parser isn't positioned on a table.
    '''
    if parser.symbol.kind != SymbolKind.START_TABLE:
        raise DeserializeError(f'expected table, got {str(parser.symbol)}')
    table = dict()      # Table, as a dictionary. Truly a visionary.
    array_size = 0      # Size of the array segment.
    current_key = None  # Last value collected for the key.
    current_val = None  # Last value collected for, well, a value.
    # Track "a key was seen" separately from current_key so that falsy
    # keys (e.g. [false] or [0]) aren't misclassified as array entries.
    has_key = False
    for symbol in parser:
        if symbol.kind == SymbolKind.END_TABLE:  # End of this table.
            break
        elif symbol.kind == SymbolKind.START_TABLE:  # Nested table.
            current_val = deserialize_table(parser)
        elif symbol.kind == SymbolKind.VALUE:  # Immediate value.
            current_val = symbol.data
        elif symbol.kind == SymbolKind.END_TABLE_KEY:
            # We just read a key; we always write values and nested tables
            # to current_val so we need to move it to current_key.
            current_key, current_val = current_val, None
            has_key = True
        elif symbol.kind == SymbolKind.END_TABLE_VALUE:
            # Push the entry into the table.
            if has_key:
                # Hash entry.
                table[current_key] = current_val
            else:
                # Array entry.
                array_size += 1
                table[array_size] = current_val
            # Reset all per-entry state; resetting current_val here fixes
            # the previous typo that reset current_key twice and left the
            # value stale between entries.
            current_key, current_val, has_key = None, None, False
        else:
            # Other symbols are ignored. We should get nothing invalid if
            # the parser is correct.
            continue
    return table
def deserialize(parser: Parser) -> LuaValue:
    '''
    Deserializes a Lua value from the given parser. The parser will be
    advanced to the next symbol, after which it expects to find either a
    VALUE or START_TABLE symbol.

    Returns None if the parser reaches EOF. Other symbols will raise a
    DeserializeError.
    '''
    # Only next() can raise StopIteration here, so the guard is narrow.
    try:
        symbol = next(parser)
    except StopIteration:
        return None
    if symbol.kind == SymbolKind.VALUE:
        return symbol.data
    if symbol.kind == SymbolKind.START_TABLE:
        return deserialize_table(parser)
    raise DeserializeError(f'expected value or table, got {str(symbol)}')
def serialize(o: Any,
              output: TextIO,
              options: SerializeOptions = SERIALIZE_COMPACT):
    '''
    Serializes a given value as its Lua equivalent to the given output
    stream. If the given value implements a `__lua_serialize__()` method,
    it will be called, otherwise this behaves the same as `serialize_raw()`.
    '''
    # Honour the custom serialization hook unless raw output was requested.
    hook = None if options.raw else getattr(o, '__lua_serialize__', None)
    if callable(hook):
        hook(output=output, options=options)
    else:
        serialize_raw(o, output=output, options=options)
def serialize_raw(o: Any,
                  output: TextIO,
                  options: SerializeOptions = SERIALIZE_COMPACT):
    '''
    Serializes a given value as its Lua equivalent to the given output
    stream, without invoking any custom serializer on the top-level value.

    Complex values (dicts and lists) will be formatted with the given options.

    Raises ValueError if the input value cannot be converted to a Lua
    equivalent.
    '''
    # bool is tested before the numeric types because bool subclasses int;
    # the remaining checks are mutually exclusive so their order is free.
    if o is None:
        output.write('nil')
    elif isinstance(o, bool):
        output.write('true' if o else 'false')
    elif isinstance(o, str):
        serialize_str(o, output=output)
    elif isinstance(o, (float, int)):
        output.write(str(o))
    elif isinstance(o, dict):
        serialize_dict(o, output=output, options=options)
    elif isinstance(o, list):
        serialize_list(o, output=output, options=options)
    else:
        raise ValueError(f'Cannot map value to Lua: {repr(o)}')
def serialize_str(s: str, output: TextIO):
    '''
    Serializes a string as a double-quoted Lua string to the given output
    stream.

    Printable ASCII characters pass through unchanged apart from quotes,
    backslashes and CR/LF, which use their dedicated Lua escapes. All other
    characters are written as decimal byte escapes (``\\NNN``) of their
    UTF-8 encoding.
    '''
    # Characters with dedicated escape sequences. A dict lookup replaces
    # the previous chain of comparisons (which also iterated a needless
    # copy of the string via s[:] and used placeholder-free f-strings).
    escapes = {'"': '\\"', '\\': '\\\\', '\r': '\\r', '\n': '\\n'}
    output.write('"')
    for c in s:
        if c in escapes:
            output.write(escapes[c])
        elif c.isascii() and c.isprintable():
            # Printable ASCII can go through as-is.
            output.write(c)
        else:
            # Non-printable or non-ascii characters are output as ordinal
            # escape sequences on a byte-by-byte basis.
            for b in c.encode('utf-8'):
                output.write(f'\\{b}')
    output.write('"')
def serialize_dict(d: dict,
                   output: TextIO,
                   options: SerializeOptions = SERIALIZE_COMPACT):
    '''
    Serializes a dictionary as a Lua table to the given output stream,
    formatted according to the given options.
    '''
    # Empty tables collapse to a bare pair of braces.
    if not d:
        output.write('{}')
        return
    # Opening brace followed by the configured end-of-line text.
    output.write('{')
    output.write(options.line_suffix)
    # Entries are rendered one indentation level deeper than the braces.
    inner = options.copy()
    inner.indent_depth += 1
    serialize_dict_entries(d, output=output, options=inner)
    # The closing brace sits back at the outer indentation level.
    output.write(options.line_prefix)
    output.write(options.line_indent * options.indent_depth)
    output.write('}')
def serialize_list(l: list,
                   output: TextIO,
                   options: SerializeOptions = SERIALIZE_COMPACT):
    '''
    Serializes a list as a Lua table to the given output stream, formatted
    according to the given options.
    '''
    # Empty tables collapse to a bare pair of braces.
    if not l:
        output.write('{}')
        return
    # Opening brace followed by the configured end-of-line text.
    output.write('{')
    output.write(options.line_suffix)
    # Entries are rendered one indentation level deeper than the braces.
    inner = options.copy()
    inner.indent_depth += 1
    serialize_list_entries(l, output=output, options=inner)
    # The closing brace sits back at the outer indentation level.
    output.write(options.line_prefix)
    output.write(options.line_indent * options.indent_depth)
    output.write('}')
def serialize_dict_entries(d: dict,
                           output: TextIO,
                           options: SerializeOptions = SERIALIZE_COMPACT):
    '''
    Serializes the entries of a dict as Lua table contents to the given
    output stream, formatted according to the given options.

    Every key is written in explicit `[key] = value` form; commas follow
    every entry except (unless trailing_comma is set) the last.
    '''
    indent_string = options.line_indent * options.indent_depth
    # enumerate() replaces the hand-maintained counter, and the last index
    # is hoisted so len(d) isn't recomputed on every iteration.
    last = len(d) - 1
    for i, (k, v) in enumerate(d.items()):
        output.write(options.line_prefix)
        output.write(indent_string)
        # Entry key.
        output.write('[')
        serialize(k, output=output, options=options)
        output.write(']')
        # Assignment.
        output.write(' = ' if options.key_value_space else '=')
        # Entry value.
        serialize(v, output=output, options=options)
        if i < last or options.trailing_comma:
            output.write(',')
        output.write(options.line_suffix)
def serialize_list_entries(l: list,
                           output: TextIO,
                           options: SerializeOptions = SERIALIZE_COMPACT):
    '''
    Serializes the entries of a list as Lua table contents to the given
    output stream, formatted according to the given options.
    '''
    indent_string = options.line_indent * options.indent_depth
    last_index = len(l) - 1
    for index, value in enumerate(l):
        output.write(options.line_prefix)
        output.write(indent_string)
        serialize(value, output=output, options=options)
        # Comma after every entry except (optionally) the final one.
        if index < last_index or options.trailing_comma:
            output.write(',')
        output.write(options.line_suffix)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python3 | |
from savedvars import LuaValue, Parser, SerializeOptions, SymbolKind, \ | |
SERIALIZE_PRETTY, deserialize, serialize | |
from dataclasses import dataclass | |
from enum import Enum | |
from io import open | |
from sys import stdout | |
from typing import TextIO | |
class VariableKind(Enum):
    '''
    VariableKind enumerates every saved variable found within the
    `totalRP3.lua` file; each member's value is the Lua global name.
    '''
    PROFILES = 'TRP3_Profiles'
    CHARACTERS = 'TRP3_Characters'
    CONFIGURATION = 'TRP3_Configuration'
    TARGET_FRAME = 'TRP3_TargetFrame'
    FLYWAY = 'TRP3_Flyway'
    PRESETS = 'TRP3_Presets'
    COMPANIONS = 'TRP3_Companions'
    MATURE_FILTER = 'TRP3_MatureFilter'
    STASHED_DATA = 'TRP3_StashedData'
    COLORS = 'TRP3_Colors'
    NOTES = 'TRP3_Notes'

    def field_name(self) -> str:
        '''Returns the Variables instance field name for this kind.'''
        # Enum member names are already strings; lowercasing maps
        # e.g. MATURE_FILTER -> mature_filter.
        return self.name.lower()

    def variable_name(self) -> str:
        '''Returns the Lua variable name for this kind.'''
        return self.value
@dataclass
class Variables:
    '''
    Variables represents the structure of the TRP3 saved variables. Each
    field present maps to a variable in the `totalRP3.lua` file and may be
    accessed as a Python-equivalent Lua data structure, eg. dicts.
    '''
    profiles: LuaValue = None
    characters: LuaValue = None
    configuration: LuaValue = None
    target_frame: LuaValue = None
    flyway: LuaValue = None
    presets: LuaValue = None
    companions: LuaValue = None
    mature_filter: LuaValue = None
    stashed_data: LuaValue = None
    colors: LuaValue = None
    notes: LuaValue = None

    @staticmethod
    def read_from(input: TextIO):
        '''
        Constructs a Variables instance from the contents of the given
        TextIO (file) stream.

        Raises an error if there's any issue parsing the contents, hopefully
        complete with source location information.
        '''
        parser = Parser.read_from(input)
        result = Variables()
        # Only top-level variable assignments matter at the top of a saved
        # variables file; every other symbol the parser emits is skipped.
        for symbol in parser:
            if symbol.kind == SymbolKind.START_VARIABLE:
                try:
                    kind = VariableKind(symbol.data)
                    setattr(result, kind.field_name(), deserialize(parser))
                except ValueError:
                    # Not one of the known TRP3 variables; skip it.
                    pass
        return result

    def __lua_serialize__(self, output: TextIO, options: SerializeOptions):
        '''Serializes the variables to the given output stream.'''
        for kind in VariableKind:
            value = getattr(self, kind.field_name(), None)
            output.write(kind.variable_name())
            output.write(' = ')
            serialize(value, output, options)
            output.write('\n\n')
if __name__ == '__main__':
    # Read in a saved variables file. `variables` will be set to a Variables
    # object instance, of which each field corresponds to a saved variable.
    variables: Variables
    # newline='' preserves the file's original line endings on round-trip.
    with open('totalRP3.lua', mode='r', newline='') as source_file:
        variables = Variables.read_from(source_file)
    # Dump some information about each profile, and modify it in-memory.
    # NOTE(review): assumes TRP3_Profiles was present in the source file;
    # if it was missing, `variables.profiles` is None and this raises --
    # confirm that's acceptable for this example script.
    for profile_id, profile in variables.profiles.items():
        profile_name = profile['profileName']
        fn = profile['player']['characteristics'].get('FN', '')
        ln = profile['player']['characteristics'].get('LN', '')
        ra = profile['player']['characteristics'].get('RA', '')
        cl = profile['player']['characteristics'].get('CL', '')
        print(f'{profile_name}: {fn} {ln} - {ra} {cl}')
        # Overwrite the first/last name fields in-memory before re-saving.
        profile['player']['characteristics']['FN'] = 'Different'
        profile['player']['characteristics']['LN'] = 'Name'
    # Write out the newly modified profiles.
    with open('totalRP3_Modified.lua', mode='w') as output_file:
        serialize(variables, output_file, SERIALIZE_PRETTY)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.