TRP3 Saved Variables Parser
#!/usr/bin/python3
'''
Limited reimplementation of the Lua 5.1 lexer and parser providing utilities
for serialization and deserialization of World of Warcraft saved variables
files.
The grammar supported by the parser is a subset of the Lua 5.1 grammar;
any unimplemented features will raise errors if encountered in a source file.
'''
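# In broad strokes, the pipeline is: Lexer turns characters into Tokens,
# Parser turns Tokens into a flat stream of Symbols, and the deserialize
# and serialize helpers at the bottom convert between that symbol stream
# and plain Python values.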
from dataclasses import dataclass
from enum import Enum
from sys import maxunicode
from typing import Any, Callable, Iterable, Iterator, Mapping, Optional, \
NewType, TextIO, Union
Lexeme = Union[None, str, float, int]
'''
Lexeme represents a parsed source code unit. It may optionally be decoded
into a specific type; for example, numeric tokens carry a float or int
lexeme representing their value.
'''
LexemeDecoder = Callable[[str], Lexeme]
'''
LexemeDecoder represents a function type that translates a lexeme from an
input source code string to any valid lexeme value type. This can be used
to decode numbers to actual numeric values, for example.
'''
SymbolValue = Union[None, bool, float, int, str]
'''
SymbolValue represents a union of types that are equivalent to immediate Lua
values, as emitted with VALUE kind symbols.
Nil values are translated to None, booleans are booleans, numbers are
floats or ints, and strings are strings.
'''
SymbolName = NewType('SymbolName', str)
'''
SymbolName represents the string name of a variable emitted as part of
START_VARIABLE kind symbols.
'''
LuaTable = Mapping[Union[bool, float, int, str], 'LuaValue']
'''
LuaTable represents a Python-compatible Lua table representation as a mapping
of simple hashable values to the generic LuaValue type.
Of note, this means you can't use lists/dicts as keys in tables. This should
be fine for our uses.
'''
LuaValue = Union[bool, float, int, str, None, LuaTable]
'''
LuaValue represents a union of all value Lua types, including tables.
'''
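# As an illustrative correspondence (hypothetical data), the Lua table
# constructor:
#
#   { ["name"] = "example", 42, true }
#
# maps to the Python dict below; list-style entries receive integer keys
# starting at 1, mirroring Lua's array part:
#
#   {'name': 'example', 1: 42, 2: True}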
def isspace(s: str):
'''Returns true if the given string is whitespace.'''
return s.isascii() and s.isspace()
def isalpha(s: str):
'''Returns true if the given string is alphabetical.'''
return s.isascii() and s.isalpha()
def isdigit(s: str):
'''Returns true if the given string is a numeric digit.'''
return s.isascii() and s.isdigit()
def isalnum(s: str):
'''Returns true if the given string is alphanumeric.'''
return s.isascii() and s.isalnum()
def isquote(s: str):
'''Returns true if the given string represents a quote.'''
return s == '"' or s == '\''
def iseol(s: str):
'''Returns true if the given string represents an EOL marker.'''
return s == '\r' or s == '\n'
def iseof(s: str):
'''Returns true if the given string represents an EOF state.'''
return len(s) == 0
def iskeyword(s: str):
'''Returns true if the given string represents a Lua keyword.'''
# These keywords should be all that we find inside saved variables.
return s == 'false' or s == 'nil' or s == 'true'
def issymbol(s: str):
'''Returns true if the given string represents a Lua symbol.'''
# These symbols should be all that we find inside saved variables.
return s == ',' or s == '=' or s == '{' or s == '[' or s == '-' \
or s == '}' or s == ']' or s == ';'
@dataclass
class Location:
'''
Represents a location in a source file, tracking its name, line, and
column indices. All indices are zero-based.
'''
name: str
'''Name of the source file.'''
line: int = 0
'''File line number.'''
column: int = 0
'''File column number.'''
def __str__(self):
'''Returns the printable string representation of this location.'''
return f'{self.name}:{self.line}:{self.column}'
def advance_line(self, lines=1):
'''Advances the line index, and resets the column to 0.'''
self.line += lines
self.column = 0
def advance_column(self, columns=1):
'''Advances the column index.'''
self.column += columns
def copy(self):
'''Returns a shallow copy of the location object.'''
return Location(self.name, self.line, self.column)
class TokenKind(Enum):
'''
TokenKind is an enumeration of types of tokens yielded by the Lexer.
'''
EOF = 'EOF'
KEYWORD = 'KEYWORD'
NAME = 'NAME'
NUMBER = 'NUMBER'
STRING = 'STRING'
SYMBOL = 'SYMBOL'
def __repr__(self) -> str:
'''Returns the internal string representation of a token kind.'''
return f'<{self.__class__.__name__}.{self.name}>'
def __str__(self) -> str:
'''Returns the printable string representation of a token kind.'''
return f'<{str(self.value).lower()}>'
@dataclass
class Token:
'''
Token represents a single token yielded by the lexer, complete with
source location information and its associated lexeme.
'''
kind: TokenKind
'''Kind of token represented.'''
lexeme: Lexeme
'''Lexeme data attached to this token.'''
start: Location
'''Start location of the token in the input.'''
end: Location
'''End location of the token in the input.'''
def __str__(self) -> str:
'''Returns the printable string representation of this token.'''
if self.lexeme is not None:
# Use repr here since lexemes will be string or numeric data,
# and we want a quoted string representation for output.
return repr(self.lexeme)
else:
return str(self.kind)
class TokenBuilder:
'''
TokenBuilder implements a state machine for building Tokens.
This implements tracking of the start and end locations of a token, as
well as a data buffer for its lexeme.
'''
def __init__(self, location: Location):
'''
Initializes the token builder, tracking the given source location
reference. The location can be mutated externally and will only be
copied upon a call to `start()` or `end()`.
'''
self._location = location
self._buffer = []
self._start = None # Start location, set upon start() call.
def lexeme(self, decoder: Optional[LexemeDecoder] = None) -> Lexeme:
'''
Returns the lexeme from the data stored within the builder's buffer,
optionally pushing it through the given decoder.
If the lexeme buffer is empty and no decoder given, None is returned.
'''
lexeme = ''.join(self._buffer)
if decoder is not None:
lexeme = decoder(lexeme)
elif len(lexeme) == 0:
lexeme = None
return lexeme
def reset(self):
'''
Resets the state of the token builder, clearing all stored data
for the current token and allowing `start()` and `end()` calls.
'''
self._buffer.clear()
self._start = None
def start(self):
'''
Starts a token from a character range marked as starting at the
current source location.
Raises an AssertionError if `start()` has already been called.
'''
assert self._start is None
self._start = self._location.copy()
def append(self, data: str):
'''
Appends the given string data to the lexeme buffer.
Raises an AssertionError if `start()` has not been called.
'''
assert self._start is not None
self._buffer.append(data)
def end(self,
kind: TokenKind,
decoder: Optional[LexemeDecoder] = None) -> Token:
'''
Finishes a token, returning an instance of it with the given token
kind and lexeme stored by the builder (as decoded with the optional
decoder).
Raises an AssertionError if `start()` has not been called. Resets the
state upon returning the token, allowing the builder to be reused
for building new tokens.
'''
assert self._start is not None
token = Token(kind=kind,
lexeme=self.lexeme(decoder=decoder),
start=self._start,
end=self._location.copy())
self.reset()
return token
def build(self, kind: TokenKind, data: Optional[str] = None):
'''
Helper function that builds a token with optional lexeme data.
Mostly used for EOF indicators or other fixed-size tokens.
'''
self.start()
# If a lexeme is given we assume the location reference that we
# recorded as our start position is actually *after* the data, so we
# should rewind it by the length of the lexeme.
if data is not None:
self._start.advance_column(-len(data))
self.append(data)
return self.end(kind)
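# A minimal sketch of the builder protocol (standalone, outside the lexer
# that normally drives it):
#
#   location = Location('example.lua')
#   builder = TokenBuilder(location)
#   builder.start()           # Snapshot the current location.
#   builder.append('nil')     # Accumulate lexeme characters.
#   token = builder.end(TokenKind.KEYWORD)
#   assert token.lexeme == 'nil'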
class LexerError(Exception):
'''Error type raised by the lexer upon invalid or malformed input.'''
def __init__(self, message: str, scope: Union[None, Location, Token]):
'''
Initializes the error with the given message string, optionally
including scope information from the given location or token.
'''
if isinstance(scope, Location):
what = f'{str(scope)}: {message}'
elif isinstance(scope, Token):
what = f'{str(scope.end)}: {message} near {str(scope)}'
else:
what = message
super().__init__(what)
class Lexer:
'''
Lexer implements a Lua source file lexer that yields tokens upon
successive next calls. Instances of this type may be used as iterators
with the `next()` builtin or `for token in lexer` style loops.
This implements a subset of the language and will raise errors upon
malformed or unsupported input.
'''
def __init__(self, input: TextIO):
'''
Initializes the lexer to read from the given TextIO stream, such as a
file.
'''
self.token = None # Most recently read token.
self._input = input
self._location = Location(getattr(input, 'name', str(input)))
self._builder = TokenBuilder(self._location)
self._char = None # Initialized on first advancement.
def __iter__(self) -> Iterator[Token]:
'''Returns an iterator that yields tokens.'''
return self
def __next__(self) -> Token:
'''
Returns the next token from the stream. Raises StopIteration when
an EOF token is obtained.
'''
self.token = self._next_token()
if self.token.kind == TokenKind.EOF:
raise StopIteration
else:
return self.token
def _next_token(self) -> Token:
'''
Returns the next token from the stream. This will cause the IO
stream to be read and internal state to be advanced. Upon EOF,
an EOF kind token is yielded; further calls beyond an EOF token will
result in a LexerError.
'''
# If this is our first call, we need to advance to the first character.
if self._char is None:
self._read_char()
self._builder.reset()
return self._read_token()
def _read_token(self) -> Token:
'''Reads and returns a token from the underlying stream.'''
assert self._char is not None # Must have read a character.
if self._char == '-':
# A `-` could indicate the start of a comment, or just a symbol.
self._read_char()
if self._char == '-':
self._skip_comment()
return self._read_token()
else:
return self._builder.build(TokenKind.SYMBOL, '-')
elif self._char == '[':
# Lua supports long strings with `[[content]]` syntax, but the
# saved variable format encodes all strings with quotes only.
self._read_char()
if self._char == '[':
# FIXME: Not bulletproof. Doesn't detect `[=[content]=]`.
raise LexerError('unsupported long string', self._location)
else:
return self._builder.build(TokenKind.SYMBOL, '[')
elif self._char == '=':
# An `=` could be an assignment operator or the start of a
# comparison operator. Saved variables should only contain
# assignments, and thus are single-character symbols.
self._read_char()
if self._char == '=':
raise LexerError('unsupported `==` operator', self._location)
else:
return self._builder.build(TokenKind.SYMBOL, '=')
elif self._char == '.':
# Periods could indicate a few operators, but saved variables
# should only ever emit them as fractional numeric literals.
return self._read_numeral_token()
elif isquote(self._char):
# Quoted string literal token.
return self._read_string_token()
elif iseol(self._char):
# Newline character. This *must* be handled before whitespace
# since newlines are whitespace. Skip newlines and recurse.
self._skip_newline()
return self._read_token()
elif isspace(self._char):
# Whitespace character. Must not be a newline. Whitespace is
# skipped and we just recurse otherwise.
assert not iseol(self._char)
self._read_char()
return self._read_token()
elif isalpha(self._char) or self._char == '_':
# Keyword or identifier token.
return self._read_name_token()
elif isdigit(self._char):
# Numeric literal token.
return self._read_numeral_token()
elif iseof(self._char):
# EOF token.
return self._builder.build(TokenKind.EOF)
else:
# Symbol token.
return self._read_symbol_token()
def _read_symbol_token(self) -> Token:
'''Reads a single-character symbol token from the input stream.'''
assert issymbol(self._char) # Must be positioned on a symbol.
symbol = self._char
self._read_char()
return self._builder.build(TokenKind.SYMBOL, symbol)
def _read_name_token(self) -> Token:
'''Reads and returns an identifier or keyword from the stream.'''
assert isalpha(self._char) or self._char == '_'
# Read all name-valid characters into the token builder.
self._builder.start()
while isalnum(self._char) or self._char == '_':
self._builder.append(self._char)
self._read_char()
# The lexeme contents are either a keyword or identifier. Figure it
# out and yield the right token kind.
if iskeyword(self._builder.lexeme()):
return self._builder.end(TokenKind.KEYWORD)
else:
return self._builder.end(TokenKind.NAME)
def _read_numeral_token(self) -> Token:
'''Reads a numeric literal token from the input stream.'''
# Must be positioned on either a digit or a period.
assert isdigit(self._char) or self._char == '.'
# Consume all digits/periods into the buffer. We omit support for
# exponents and hexadecimal numbers.
self._builder.start()
while isdigit(self._char) or self._char == '.':
self._builder.append(self._char)
self._read_char()
try:
# Decode as an integer if the lexeme contains no periods.
decoder = float if '.' in self._builder.lexeme() else int
return self._builder.end(TokenKind.NUMBER, decoder=decoder)
except ValueError:
raise LexerError('malformed number', self._location)
def _read_string_token(self) -> Token:
'''Reads and returns a quoted string literal token from the stream.'''
assert isquote(self._char) # Must be positioned on a quote.
quote = self._char
# Consume characters until we reach the same terminating quote.
self._builder.start()
self._read_char()
while self._char != quote:
if iseof(self._char):
# End of stream reached.
raise LexerError('unfinished string',
self._builder.end(TokenKind.EOF))
elif iseol(self._char):
# End of line character without a preceding backslash.
raise LexerError('unfinished string',
self._builder.end(TokenKind.STRING))
elif self._char == '\\':
# Escape sequence. Advances the stream to after the escape,
# so no follow-up read is required.
self._builder.append(self._read_string_escape())
else:
# Standard character/byte, no processing needed.
self._builder.append(self._char)
self._read_char()
# Discard the trailing quote.
self._read_char()
return self._builder.end(TokenKind.STRING)
def _read_string_escape(self) -> str:
'''
Reads an escape sequence from within a string literal. Assumes that
we're starting on the backslash, and returns the transformed literal
as a string. If EOF is reached, returns an empty string. The stream
will be advanced beyond the end of the escape sequence.
'''
assert self._char == '\\' # Assume we're starting on the backslash.
self._read_char()
if self._char == 'a': # Bell.
self._read_char()
return '\a'
elif self._char == 'b': # Backspace.
self._read_char()
return '\b'
elif self._char == 'f': # Form feed.
self._read_char()
return '\f'
elif self._char == 'n': # Line feed.
self._read_char()
return '\n'
elif self._char == 'r': # Carriage return.
self._read_char()
return '\r'
elif self._char == 't': # Horizontal tab.
self._read_char()
return '\t'
elif self._char == 'v': # Vertical tab.
self._read_char()
return '\v'
elif iseol(self._char):
# A \ followed by a newline is converted to a \n character and
# needs to be processed specially for location tracking.
self._skip_newline()
return '\n'
elif iseof(self._char):
# Return the EOF unmodified.
return self._char
elif not isdigit(self._char):
# Non-digit characters at this point imply that we're dealing
# with an escaped special character, like a quote or backslash.
# These can be passed straight through.
char = self._char
self._read_char()
return char
else:
# The character can only be a digit at this point, which means this
# is a codepoint in the form `\123`. These may come in a series, all
# of which we need to collect and decode to form a string.
encoding = self._input.encoding
codepoint = 0x00000000
bytecount = 0
byte = 0
while True:
# Convert the digit character to part of this ordinal byte.
byte = (byte * 10) + (ord(self._char[0]) - 48)
self._read_char()
if isdigit(self._char) and byte <= 255:
# Next character is a digit and we've still got an ordinal
# below the maximum representable range.
continue
elif byte > 255:
# Ordinal is outside the valid escape range.
raise LexerError('escape sequence too large',
self._builder.end(TokenKind.STRING))
# Attempt to decode the codepoint we've amassed thus far
# using the same encoding as our input stream.
try:
codepoint |= byte
bytecount += 1
return codepoint.to_bytes(bytecount, 'big') \
.decode(encoding)
except UnicodeDecodeError as ex:
# Is there another escape sequence we can try?
if self._char != '\\' or bytecount == 4:
raise LexerError(ex.reason,
self._builder.end(TokenKind.STRING))
# Is it actually an ordinal too?
self._read_char()
if not isdigit(self._char):
raise LexerError(ex.reason,
self._builder.end(TokenKind.STRING))
# Shift the codepoint up 8 bits and try reading the next
# ordinal escape sequence for decoding.
codepoint <<= 8
byte = 0
continue
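# As a worked example of the loop above: with a UTF-8 input stream, the
# escape sequence `\195\169` is read as two ordinal bytes. Decoding the
# first byte alone (b'\xc3') fails, so the code peeks at the following
# `\169` escape, shifts the codepoint up 8 bits, and retries with
# b'\xc3\xa9', which decodes to 'é'.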
def _skip_comment(self):
'''
Reads and discards a comment sequence from the stream. This yields
no token as a result, but updates our location information.
'''
assert self._char == '-' # Must be positioned on a minus.
# We assume saved variables never contain block comments.
self._read_char()
if self._char == '[':
# FIXME: Not bulletproof. Doesn't detect `--[=[comment]=]`, and
# falsely triggers on `--[ comment`.
raise LexerError('unsupported block comment', self._location)
# Consume all characters until an EOL or EOF marker.
while not iseol(self._char) and not iseof(self._char):
self._read_char()
def _skip_newline(self):
'''
Reads and discards a newline sequence from the stream. This yields
no token as a result, but updates our location information.
'''
assert iseol(self._char) # Must be positioned on an EOL character.
# Record the current EOL character and read the next one. EOL markers
# can come in non-matching pairs (\r\n or \n\r), which we want to
# consume and skip as one unit.
previous_eol = self._char
self._read_char()
if iseol(self._char) and self._char != previous_eol:
self._read_char()
# Advance our line and column numbering.
self._location.advance_line()
self._location.advance_column()
def _read_char(self):
'''Advances the source stream by a single character.'''
# If we're at EOF already, force a LexerError to bubble up.
if self._char is not None and iseof(self._char):
raise LexerError('unexpected EOF', self._location)
self._char = self._input.read(1)
# If this was our first read our line number needs incrementing.
# Do this before incrementing the column as this resets it.
if self._location.line == 0:
self._location.advance_line()
self._location.advance_column()
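# A minimal usage sketch for the lexer (hypothetical input; io.StringIO
# stands in for a saved variables file):
#
#   import io
#   lexer = Lexer(io.StringIO('TRP3_Notes = { 1, 2 }'))
#   print([str(token) for token in lexer])
#   # -> ["'TRP3_Notes'", "'='", "'{'", '1', "','", '2', "'}'"]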
class SymbolKind(Enum):
'''
SymbolKind is an enumeration of non-terminal and terminal symbols
emittable by the parser. This allows the creation of a limited AST
in a stream-based manner, rather than forcing construction of a full
tree on each parse.
'''
END_TABLE = 'END_TABLE'
END_TABLE_KEY = 'END_TABLE_KEY'
END_TABLE_VALUE = 'END_TABLE_VALUE'
END_VARIABLE = 'END_VARIABLE'
START_TABLE = 'START_TABLE'
START_TABLE_KEY = 'START_TABLE_KEY'
START_TABLE_VALUE = 'START_TABLE_VALUE'
START_VARIABLE = 'START_VARIABLE'
VALUE = 'VALUE'
def __repr__(self):
'''Returns the internal string representation of a symbol kind.'''
return f'<{self.__class__.__name__}.{self.name}>'
def __str__(self):
'''Returns the printable string representation of a symbol kind.'''
return str(self.value)
@dataclass
class Symbol:
'''
Symbol represents the structure of a single symbol emitted by the parser,
combining a SymbolKind with optional kind-specific data.
For most symbols the data field will be None. For START_VARIABLE and
END_VARIABLE it will be a SymbolName, and for VALUE a SymbolValue.
'''
kind: SymbolKind
'''Kind of symbol.'''
data: Union[None, SymbolName, SymbolValue] = None
'''Optional data associated with this symbol. Kind-specific.'''
def __str__(self):
'''Returns the printable string representation of a symbol kind.'''
if self.kind == SymbolKind.VALUE or self.data is not None:
return f'Symbol({str(self.kind)}, {repr(self.data)})'
else:
return f'Symbol({str(self.kind)})'
class ParseError(Exception):
'''Error type raised by the parser upon invalid or malformed input.'''
def __init__(self, message: str, scope: Union[None, Location, Token]):
'''
Initializes the error with the given message string including
location information from the given scope.
'''
if isinstance(scope, Token):
what = f'{str(scope.start)}: {message}, got {str(scope)}'
elif isinstance(scope, Location):
what = f'{str(scope)}: {message}'
else:
what = message
super().__init__(what)
class Parser:
'''
Parser implements a Lua source file parser that yields symbols upon
advancement via the `next()` builtin or `for symbol in parser` style
loops.
This implements a subset of the language and will raise errors upon
malformed or unsupported input.
'''
@staticmethod
def read_from(stream: TextIO):
'''Initializes a parser to read from the given TextIO stream.'''
return Parser(Lexer(stream))
def __init__(self, lexer: Lexer):
'''Initializes a parser to read tokens from the given Lexer.'''
self.symbol = None # Most recently read symbol. None on EOF.
self._lexer = lexer
self._token = None # Most recently parsed token.
self._state = self._parse_chunk() # Iterable.
def __iter__(self) -> Iterator[Symbol]:
'''Returns an iterator for accessing symbols from the parser.'''
return self
def __next__(self) -> Symbol:
'''
Returns the next symbol from the parser. Raises StopIteration upon
reaching EOF.
'''
if self._token is None: # No tokens yet read, start.
self._read_token()
if self._token.kind == TokenKind.EOF: # EOF token reached, stop.
self.symbol = None
raise StopIteration
else:
self.symbol = next(self._state)
return self.symbol
def _parse_chunk(self) -> Iterable[Symbol]:
'''
Yields all symbols present within a Lua chunk. Continues until the
underlying token stream reaches EOF.
'''
# Chunks are an optionally-semicolon-delimited sequence of statements.
while not self._accept_token(TokenKind.EOF):
yield from self._parse_statement()
self._accept_and_read_token(TokenKind.SYMBOL, ';')
def _parse_statement(self) -> Iterable[Symbol]:
'''Yields symbols from a Lua statement.'''
# Lua supports a varied range of statements, but the only ones we
# will find in saved variable files will be assignment statements.
# Expect a variable name identifier.
self._expect_token(TokenKind.NAME)
variable_name = SymbolName(self._token.lexeme)
yield Symbol(SymbolKind.START_VARIABLE, variable_name)
# Expect an `=` symbol.
self._read_token()
self._expect_and_read_token(TokenKind.SYMBOL, '=')
# The value being assigned is the result of an expression, so recurse.
yield from self._parse_expression()
yield Symbol(SymbolKind.END_VARIABLE, variable_name)
def _parse_expression(self) -> Iterable[Symbol]:
'''Yields symbols from a Lua expression.'''
# Lua expressions are complicated and recursive, complete with unary
# and binary operators, subexpressions, etc. Thankfully saved
# variables files can only emit the unary minus operator and
# simple expressions (values).
# Test for unary minus.
if self._accept_and_read_token(TokenKind.SYMBOL, '-'):
# Recursively parse as a subexpression and obtain the first
# symbol from it.
expression = self._parse_expression()
symbol = next(expression)
# Saved variables should only ever generate code such that the
# unary minus operation is applied to immediate numeric values,
# so if this isn't the case then complain.
if symbol.kind != SymbolKind.VALUE \
or not isinstance(symbol.data, (float, int)):
raise ParseError('unsupported unary minus operation', None)
# Apply the negation to the value and re-yield it to the caller,
# then continue with the expression so that we properly advance
# our token stream.
yield Symbol(symbol.kind, -symbol.data)
yield from expression
else:
# No minus, so it can only be a value expression.
yield from self._parse_value()
def _parse_value(self) -> Iterable[Symbol]:
'''Yields symbols from a Lua value expression.'''
# Value expressions (simpleexp) can only be immediate values or
# table constructors. Any other variants are unimplemented.
if self._accept_token(TokenKind.NUMBER):
yield Symbol(SymbolKind.VALUE, self._token.lexeme)
elif self._accept_token(TokenKind.STRING):
yield Symbol(SymbolKind.VALUE, self._token.lexeme)
elif self._accept_token(TokenKind.KEYWORD, 'nil'):
yield Symbol(SymbolKind.VALUE, None)
elif self._accept_token(TokenKind.KEYWORD, 'true'):
yield Symbol(SymbolKind.VALUE, True)
elif self._accept_token(TokenKind.KEYWORD, 'false'):
yield Symbol(SymbolKind.VALUE, False)
elif self._accept_token(TokenKind.SYMBOL, '{'):
yield from self._parse_table()
else:
raise ParseError('unsupported expression variant', self._token)
# Advance to the next token to close the expression.
self._read_token()
def _parse_table(self) -> Iterable[Symbol]:
'''Yields symbols from a Lua table constructor expression.'''
# Assume we're starting on the opening curly bracket.
self._expect_and_read_token(TokenKind.SYMBOL, '{')
yield Symbol(SymbolKind.START_TABLE)
# Read the tokens forming the records of this table.
while True:
if self._accept_token(TokenKind.SYMBOL, '}'):
# Found closing brace, break out the loop.
break
elif self._accept_token(TokenKind.NAME):
# Normally a NAME token could be ambiguous between either
# a hash record assignment (`{a = 1}`) or a list record with
# a variable lookup (`{a}`), but saved variables should never
# emit variable lookups inside tables.
yield from self._parse_table_hash_record()
elif self._accept_token(TokenKind.SYMBOL, '['):
# An explicit `[` is always a hash record.
yield from self._parse_table_hash_record()
else:
# Anything else is considered a list record.
yield from self._parse_table_list_record()
# Records should be separated by commas; saved variables won't
# use semicolons as delimiters.
if not self._accept_and_read_token(TokenKind.SYMBOL, ','):
break
# Our caller will advance the token beyond the closing brace, so we
# just need to expect it rather than expect and read.
self._expect_token(TokenKind.SYMBOL, '}')
yield Symbol(SymbolKind.END_TABLE)
def _parse_table_hash_record(self) -> Iterable[Symbol]:
'''Yields symbols from a Lua table constructor hash record.'''
# A hash record is either a NAME token or a bracketed key expression
# followed by a value assignment.
yield Symbol(SymbolKind.START_TABLE_KEY)
if self._accept_token(TokenKind.NAME):
# Literal key.
yield Symbol(SymbolKind.VALUE, self._token.lexeme)
self._read_token()
else:
# Bracketed key.
self._expect_and_read_token(TokenKind.SYMBOL, '[')
yield from self._parse_expression()
self._expect_and_read_token(TokenKind.SYMBOL, ']')
yield Symbol(SymbolKind.END_TABLE_KEY)
self._expect_and_read_token(TokenKind.SYMBOL, '=')
# The value is always an expression.
yield Symbol(SymbolKind.START_TABLE_VALUE)
yield from self._parse_expression()
yield Symbol(SymbolKind.END_TABLE_VALUE)
def _parse_table_list_record(self) -> Iterable[Symbol]:
'''Yields symbols from a Lua table constructor list record.'''
# List records are just expressions.
yield Symbol(SymbolKind.START_TABLE_VALUE)
yield from self._parse_expression()
yield Symbol(SymbolKind.END_TABLE_VALUE)
def _accept_token(self, kind: TokenKind, lexeme: Lexeme = None):
'''
Returns true if the current token matches the kind and (optional)
lexeme data.
'''
assert self._token is not None # Must have a token to test.
return self._token.kind == kind \
and (lexeme is None or self._token.lexeme == lexeme)
def _expect_token(self, kind: TokenKind, lexeme: Lexeme = None):
'''
Raises a ParseError if the current token doesn't match the requested
kind and (optional) lexeme data.
'''
if not self._accept_token(kind, lexeme):
if lexeme is not None:
raise ParseError(f'expected {repr(lexeme)}', self._token)
else:
raise ParseError(f'expected {str(kind)}', self._token)
def _accept_and_read_token(self, kind: TokenKind, lexeme: Lexeme = None):
'''
Returns true and reads the next token if the current token matches
the given kind and (optional) lexeme data.
'''
if self._accept_token(kind, lexeme):
self._read_token()
return True
else:
return False
def _expect_and_read_token(self, kind: TokenKind, lexeme: Lexeme = None):
'''
Reads the next token if the current token matches the given kind
and (optional) lexeme data, otherwise raises a ParseError.
'''
self._expect_token(kind, lexeme)
self._read_token()
def _read_token(self):
'''Fetches the next token from the lexer.'''
try:
self._token = next(self._lexer)
except StopIteration:
# EOF reached, copy the token so we can detect it internally.
self._token = self._lexer.token
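# A usage sketch showing the symbol stream for one assignment
# (hypothetical input):
#
#   import io
#   source = io.StringIO('TRP3_Flyway = { patch = 1 }')
#   for symbol in Parser.read_from(source):
#       print(symbol)
#
# This prints, in nesting order: START_VARIABLE, START_TABLE,
# START_TABLE_KEY, VALUE ('patch'), END_TABLE_KEY, START_TABLE_VALUE,
# VALUE (1), END_TABLE_VALUE, END_TABLE, END_VARIABLE.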
class DeserializeError(Exception):
'''Error type raised by the deserialize functions on invalid symbols.'''
pass
@dataclass
class SerializeOptions:
'''Structure representing options used for serializing Lua values.'''
line_prefix: str = ''
'''
Prefix to prepend to the start of each line. This is present for
each key/value pair within a table, as well as the table end (}).
'''
line_indent: str = ''
'''
Indentation string, repeated for each level of indentation depth. This
is placed after line_prefix on each suitable line.
'''
line_suffix: str = ''
'''
Suffix to append to the end of each line. This is applied to the end of
each table start (`{`), and any key/value pairs within tables.
'''
trailing_comma: bool = False
'''If true, include a trailing comma on the last entry in a table.'''
key_value_space: bool = False
'''If true, include spaces around `=` in table assignments.'''
indent_depth: int = 0
'''Indentation depth, to be incremented each time a table starts.'''
raw: bool = False
'''
If true, disables `__lua_serialize__` implementations on nested
objects when serializing.
'''
def copy(self):
'''Returns a shallow copy of the options object.'''
return SerializeOptions(line_prefix=self.line_prefix,
line_indent=self.line_indent,
line_suffix=self.line_suffix,
trailing_comma=self.trailing_comma,
key_value_space=self.key_value_space,
indent_depth=self.indent_depth,
raw=self.raw)
SERIALIZE_COMPACT = SerializeOptions()
'''Serializer options that will produce compact output.'''
SERIALIZE_SPACED = SerializeOptions(line_suffix=' ', key_value_space=True)
'''Serializer options that will produce spaced output.'''
SERIALIZE_PRETTY = SerializeOptions(line_indent='\t',
line_suffix='\n',
trailing_comma=True,
key_value_space=True)
'''Serializer options that will produce pretty output.'''
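# For a rough feel of the presets, serializing the dict {'a': 1} yields:
#
#   SERIALIZE_COMPACT -> {["a"]=1}
#   SERIALIZE_SPACED  -> { ["a"] = 1 }
#   SERIALIZE_PRETTY  -> {
#                            ["a"] = 1,
#                        }
#
# (SERIALIZE_PRETTY output is tab-indented with one entry per line and a
# trailing comma.)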
def deserialize_table(parser: Parser) -> LuaTable:
'''
Deserializes a table from the parser's current state. The parser must
be positioned on a START_TABLE symbol, and will be advanced to the next
matching END_TABLE symbol.
'''
if parser.symbol.kind != SymbolKind.START_TABLE:
raise DeserializeError(f'expected table, got {str(parser.symbol)}')
table = dict() # Table, as a dictionary. Truly a visionary.
array_size = 0 # Size of the array segment.
current_key = None # Last value collected for the key.
current_val = None # Last value collected for, well, a value.
for symbol in parser:
if symbol.kind == SymbolKind.END_TABLE: # End of this table.
break
elif symbol.kind == SymbolKind.START_TABLE: # Nested table.
current_val = deserialize_table(parser)
elif symbol.kind == SymbolKind.VALUE: # Immediate value.
current_val = symbol.data
elif symbol.kind == SymbolKind.END_TABLE_KEY:
# We just read a key; we always write values and nested tables
# to current_val so we need to move it to current_key.
current_key, current_val = current_val, None
elif symbol.kind == SymbolKind.END_TABLE_VALUE:
# Push the entry into the table.
if current_key is not None:
# Hash entry.
table[current_key] = current_val
else:
# Array entry.
array_size += 1
table[array_size] = current_val
# Reset the stored values.
current_key, current_val = None, None
else:
# Other symbols are ignored. We should get nothing invalid if
# the parser is correct.
continue
return table
def deserialize(parser: Parser) -> LuaValue:
'''
Deserializes a Lua value from the given parser. The parser will be
advanced to the next symbol, after which it expects to find either a
VALUE or START_TABLE symbol.
Returns None if the parser reaches EOF. Other symbols will raise a
DeserializeError.
'''
try:
sym = next(parser)
if sym.kind == SymbolKind.VALUE:
return sym.data
elif sym.kind == SymbolKind.START_TABLE:
return deserialize_table(parser)
else:
raise DeserializeError(f'expected value or table, got {str(sym)}')
except StopIteration:
return None
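# A sketch of driving deserialize() by hand (hypothetical input); the
# leading next() consumes the START_VARIABLE symbol so the parser is
# positioned on the value expression:
#
#   import io
#   parser = Parser.read_from(io.StringIO('TRP3_Colors = { "red", "blue" }'))
#   next(parser)
#   print(deserialize(parser))   # -> {1: 'red', 2: 'blue'}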
def serialize(o: Any,
output: TextIO,
options: SerializeOptions = SERIALIZE_COMPACT):
'''
Serializes a given value as its Lua equivalent to the given output
stream. If the given value implements a `__lua_serialize__()` method,
it will be called, otherwise this behaves the same as `serialize_raw()`.
'''
if not options.raw and callable(getattr(o, '__lua_serialize__', None)):
o.__lua_serialize__(output=output, options=options)
else:
serialize_raw(o, output=output, options=options)
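# A hedged sketch of the __lua_serialize__ hook (the Variables dataclass
# in the companion script below relies on it); a hypothetical wrapper can
# take over its own formatting:
#
#   class Wrapped:
#       def __init__(self, value):
#           self.value = value
#       def __lua_serialize__(self, output, options):
#           serialize_raw(self.value, output=output, options=options)
#
#   serialize(Wrapped({'a': 1}), output=stream)  # Delegates to the hook.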
def serialize_raw(o: Any,
output: TextIO,
options: SerializeOptions = SERIALIZE_COMPACT):
'''
Serializes a given value as its Lua equivalent to the given output
stream. The given value will not have any custom serializers executed
on its top level.
Complex values (dicts and lists) will be formatted with the given options.
Raises ValueError if the input value cannot be converted to a Lua
equivalent.
'''
if isinstance(o, str):
serialize_str(o, output=output)
elif isinstance(o, bool):
output.write('true' if o else 'false')
elif isinstance(o, (float, int)):
output.write(str(o))
elif isinstance(o, dict):
serialize_dict(o, output=output, options=options)
elif isinstance(o, list):
serialize_list(o, output=output, options=options)
elif o is None:
output.write('nil')
else:
raise ValueError(f'Cannot map value to Lua: {repr(o)}')
def serialize_str(s: str, output: TextIO):
'''
Serializes a string as a double-quoted Lua string to the given output
stream.
'''
output.write('"')
for c in s:
if c == '"':
output.write('\\"')
elif c == '\\':
output.write('\\\\')
elif c == '\r':
output.write('\\r')
elif c == '\n':
output.write('\\n')
elif not c.isascii() or not c.isprintable():
# Non-printable or non-ascii characters will be output as
# ordinal escape sequences on a byte-by-byte basis.
for b in bytes(c, 'utf-8'):
output.write(f'\\{b}')
else:
# All other characters are printable and can go through as-is.
output.write(c)
output.write('"')
def serialize_dict(d: dict,
output: TextIO,
options: SerializeOptions = SERIALIZE_COMPACT):
'''
Serializes a dictionary as a Lua table to the given output stream,
formatted according to the given options.
'''
# Shortcut for zero-entry tables.
if len(d) == 0:
output.write('{}')
return
indent_string = options.line_indent * options.indent_depth
# Open the table.
output.write('{')
output.write(options.line_suffix)
# Serialize the contents.
suboptions = options.copy()
suboptions.indent_depth += 1
serialize_dict_entries(d, output=output, options=suboptions)
# Terminate the table.
output.write(options.line_prefix)
output.write(indent_string)
output.write('}')
def serialize_list(l: list,
output: TextIO,
options: SerializeOptions = SERIALIZE_COMPACT):
'''
Serializes a list as a Lua table to the given output stream, formatted
according to the given options.
'''
# Shortcut for zero-entry tables.
if len(l) == 0:
output.write('{}')
return
indent_string = options.line_indent * options.indent_depth
# Open the table.
output.write('{')
output.write(options.line_suffix)
# Serialize the contents.
suboptions = options.copy()
suboptions.indent_depth += 1
serialize_list_entries(l, output=output, options=suboptions)
# Terminate the table.
output.write(options.line_prefix)
output.write(indent_string)
output.write('}')
def serialize_dict_entries(d: dict,
output: TextIO,
options: SerializeOptions = SERIALIZE_COMPACT):
'''
Serializes the entries of a dict as Lua table contents to the given
output stream, formatted according to the given options.
'''
indent_string = options.line_indent * options.indent_depth
i = 0
for k, v in d.items():
output.write(options.line_prefix)
output.write(indent_string)
# Entry key.
output.write('[')
serialize(k, output=output, options=options)
output.write(']')
# Assignment.
output.write(' = ' if options.key_value_space else '=')
# Entry value.
serialize(v, output=output, options=options)
if i < (len(d) - 1) or options.trailing_comma:
output.write(',')
output.write(options.line_suffix)
i += 1
def serialize_list_entries(l: list,
output: TextIO,
options: SerializeOptions = SERIALIZE_COMPACT):
'''
Serializes the entries of a list as Lua table contents to the given
output stream, formatted according to the given options.
'''
indent_string = options.line_indent * options.indent_depth
for i, v in enumerate(l):
output.write(options.line_prefix)
output.write(indent_string)
serialize(v, output=output, options=options)
if i < (len(l) - 1) or options.trailing_comma:
output.write(',')
output.write(options.line_suffix)
#!/usr/bin/python3
from savedvars import LuaValue, Parser, SerializeOptions, SymbolKind, \
SERIALIZE_PRETTY, deserialize, serialize
from dataclasses import dataclass
from enum import Enum
from io import open
from sys import stdout
from typing import TextIO
class VariableKind(Enum):
'''
VariableKind is an enumeration of all saved variables found within the
`totalRP3.lua` file.
'''
PROFILES = 'TRP3_Profiles'
CHARACTERS = 'TRP3_Characters'
CONFIGURATION = 'TRP3_Configuration'
TARGET_FRAME = 'TRP3_TargetFrame'
FLYWAY = 'TRP3_Flyway'
PRESETS = 'TRP3_Presets'
COMPANIONS = 'TRP3_Companions'
MATURE_FILTER = 'TRP3_MatureFilter'
STASHED_DATA = 'TRP3_StashedData'
COLORS = 'TRP3_Colors'
NOTES = 'TRP3_Notes'
def field_name(self) -> str:
'''Returns a Variable instance field name for a variable kind.'''
return str(self.name).lower()
def variable_name(self) -> str:
'''Returns a Lua variable name for a variable kind.'''
return str(self.value)
@dataclass
class Variables:
'''
Variables represents the structure of the TRP3 saved variables. Each
field present maps to a variable in the `totalRP3.lua` file and may be
accessed as a Python-equivalent Lua data structure, e.g. dicts.
'''
profiles: LuaValue = None
characters: LuaValue = None
configuration: LuaValue = None
target_frame: LuaValue = None
flyway: LuaValue = None
presets: LuaValue = None
companions: LuaValue = None
mature_filter: LuaValue = None
stashed_data: LuaValue = None
colors: LuaValue = None
notes: LuaValue = None
@staticmethod
def read_from(input: TextIO):
'''
Constructs a Variables instance from the contents of the given
TextIO (file) stream.
Raises an error if there's any issue parsing the contents, hopefully
complete with source location information.
'''
# As we're parsing from the top of a file we should only get variable
# assignments. Anything else is ignored.
parser = Parser.read_from(input)
variables = Variables()
for symbol in parser:
if symbol.kind != SymbolKind.START_VARIABLE:
continue
try:
kind = VariableKind(symbol.data)
setattr(variables, kind.field_name(), deserialize(parser))
except ValueError:
# Unknown variable.
continue
return variables
def __lua_serialize__(self, output: TextIO, options: SerializeOptions):
'''Serializes the variables to the given output stream.'''
for kind in VariableKind:
output.write(kind.variable_name())
output.write(' = ')
serialize(getattr(self, kind.field_name(), None), output, options)
output.write('\n\n')
if __name__ == '__main__':
# Read in a saved variables file. `variables` will be set to a Variables
# object instance, of which each field corresponds to a saved variable.
variables: Variables
with open('totalRP3.lua', mode='r', newline='') as source_file:
variables = Variables.read_from(source_file)
# Dump some information about each profile, and modify it in-memory.
for profile_id, profile in variables.profiles.items():
profile_name = profile['profileName']
fn = profile['player']['characteristics'].get('FN', '')
ln = profile['player']['characteristics'].get('LN', '')
ra = profile['player']['characteristics'].get('RA', '')
cl = profile['player']['characteristics'].get('CL', '')
print(f'{profile_name}: {fn} {ln} - {ra} {cl}')
profile['player']['characteristics']['FN'] = 'Different'
profile['player']['characteristics']['LN'] = 'Name'
# Write out the newly modified profiles.
with open('totalRP3_Modified.lua', mode='w') as output_file:
serialize(variables, output_file, SERIALIZE_PRETTY)