Last active
November 10, 2022 23:46
-
-
Save xixasdev/f0632aab83972985adcc7d2e11bdd6fe to your computer and use it in GitHub Desktop.
X4 Foundations Savegame: XML Analyzer to determine deserialization criteria
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# x4_savegame_xmlanalyzer_v1.py | |
# AUTHOR: xixas | DATE: 2022.01.09 | LICENSE: WTFPL/PDM/CC0... your choice | |
# DESCRIPTION: Parse an X4 Foundations savegame XML file and report tag statistics | |
import argparse | |
import re | |
import sys | |
import xml.sax | |
# Minimum number of occurrences a string value needs before it counts
# toward the enum tally in consolidate_tag_types().
ENUM_THRESHOLD = 2

# A string value is only an enum candidate if it looks like an identifier.
RE_ENUM = re.compile(r'[A-Za-z_][A-Za-z0-9_]*')

# Attribute value patterns. They are tried in insertion order with a full
# match; the first hit names the type, and a value matching none of them is
# recorded as 'string'. Raw strings keep the regex escapes (\{ \[ \.) from
# being read as (invalid) Python string escapes.
RE_TYPES = {
    'int': re.compile(r'-?[0-9]+'),
    # At most one decimal point and at least one digit, so values such as
    # "." or "1.2.3" are not reported as floats.
    'float': re.compile(r'-?(?:[0-9]+\.?[0-9]*|\.[0-9]+)'),
    'int_pair': re.compile(r'\{-?[0-9]+, ?-?[0-9]+\}'),
    'id': re.compile(r'\[0x[0-9a-zA-Z]+\]'),
    # Scientific notation; the mantissa may carry a fractional part and the
    # exponent an explicit sign (e.g. "3e5", "-1.74846e-07", "1E+10").
    'exponent': re.compile(r'-?[0-9]+(?:\.[0-9]+)?[eE][-+]?[0-9]+'),
}
def parse_args(args):
    """Parse the command-line arguments.

    args: list of argument strings (without the program name).

    Returns the argparse namespace; its ``filepath`` attribute holds the
    single positional argument.
    """
    cli = argparse.ArgumentParser(
        description="Parse an XML file and report tag statistics",
    )
    cli.add_argument('filepath', help='path to XML file to be parsed')
    return cli.parse_args(args)
def run(args):
    """Command-line driver: parse, consolidate and print the tag statistics.

    args: list of argument strings (without the program name), handed to
    parse_args().
    """
    xml_path = parse_args(args).filepath
    tag_stats = parse_xml(xml_path)
    # Consolidation mutates the statistics in place before they are printed.
    consolidate_tag_types(tag_stats)
    print_tags(tag_stats)
def parse_xml(filepath):
    """Stream the XML file at *filepath* through a SAX parser.

    filepath: path of the XML file to read.

    Returns the ``tags`` dictionary accumulated by an XmlHandler instance
    while the document was parsed.
    """
    handler = XmlHandler()
    parser = xml.sax.make_parser()
    parser.setContentHandler(handler)
    # Binary mode: the parser receives raw bytes and decodes them according
    # to the document itself, rather than Python pre-decoding the file with
    # the platform's locale encoding (which may not match the file).
    with open(filepath, 'rb') as fd:
        parser.parse(fd)
    return handler.tags
def consolidate_tag_types(tags, _tag_stack=None):
    """Collapse each attribute's detected types in place and flag enums.

    For every attribute of every tag (recursing into ``children``):

    * if several types were seen and one of them is 'string', all other
      types are folded into 'string';
    * otherwise, if both 'float' and 'int' were seen, 'int' is folded into
      'float';
    * if the attribute ends up with 'string' as its only type, every value
      fully matches RE_ENUM, and at least half of the distinct values occur
      ENUM_THRESHOLD times or more, the 'string' entry is replaced by an
      'enum' entry that also lists the distinct values.

    tags: mapping of tag name to a dict with 'attrs' and 'children' keys;
        modified in place.
    _tag_stack: names of the enclosing tags; used only by the recursion.

    Returns None.
    """
    # None sentinel instead of a mutable list as the default argument.
    parent_path = [] if _tag_stack is None else _tag_stack

    for tag_name, tag_info in tags.items():
        local_tag_stack = parent_path + [tag_name]

        for attr_info in tag_info['attrs'].values():
            types = attr_info['types']

            if len(types) > 1:
                if 'string' in types:
                    # Snapshot the keys: entries are removed while looping.
                    for type_name in list(types):
                        if type_name != 'string':
                            types['string']['count'] += types.pop(type_name)['count']
                elif 'float' in types and 'int' in types:
                    types['float']['count'] += types.pop('int')['count']

            # Determine enums
            if len(types) == 1 and 'string' in types:
                values = attr_info['values']
                string_count_at_enum_threshold = 0
                can_enum = True
                for string_value, string_info in values.items():
                    if not re.fullmatch(RE_ENUM, string_value):
                        # One non-identifier value rules the attribute out.
                        can_enum = False
                        break
                    if string_info['count'] >= ENUM_THRESHOLD:
                        string_count_at_enum_threshold += 1
                if can_enum and string_count_at_enum_threshold >= len(values) / 2:
                    types['enum'] = {
                        'count': types['string']['count'],
                        'values': list(values),
                    }
                    types.pop('string')

        children = tag_info['children']
        if children:
            consolidate_tag_types(children, local_tag_stack)
def print_tags(tags_dict, indent=''):
    """Print the tag tree to stdout, one line per tag and per attribute.

    Tags and attributes are listed in sorted name order. Each attribute line
    shows its types ordered by descending count and, when one of the types
    is 'enum', the attribute's distinct values in braces. Child tags are
    printed recursively with a longer indent.

    tags_dict: mapping of tag name to a dict with 'count', 'attrs' and
        'children' keys.
    indent: prefix prepended to every line printed at this level.
    """
    for tag_name in sorted(tags_dict):
        tag_info = tags_dict[tag_name]
        print(f"{indent}{tag_name} ({tag_info['count']})")

        attrs = tag_info['attrs']
        for attr_name in sorted(attrs):
            attr_info = attrs[attr_name]
            by_count = sorted(
                attr_info['types'].items(),
                key=lambda item: item[1]['count'],
                reverse=True,
            )
            type_summary = ', '.join(
                f"{type_name} ({type_info['count']})"
                for (type_name, type_info) in by_count
            )
            if 'enum' in attr_info['types']:
                enum_values = ' {' + ', '.join(attr_info['values']) + '}'
            else:
                enum_values = ''
            print(f"{indent} : {attr_name} ({attr_info['count']}): {type_summary}{enum_values}")

        children = tag_info['children']
        if children:
            print_tags(children, f"{indent} ")
class XmlHandler(xml.sax.ContentHandler):
    """SAX content handler that tallies tags, attributes and attribute values.

    ``tags`` maps a tag name to ``{'count', 'attrs', 'children'}``, where
    'children' has the same shape, so a tag is tracked separately for each
    distinct chain of parent tags. ``tag_stack`` holds the names of the
    elements that are currently open.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.tags = {}
        self.tag_stack = []

    def startElement(self, name, attrs):
        # Descend from the root to the children dict of the innermost open
        # element; that is where this tag's statistics live.
        siblings = self.tags
        for ancestor in self.tag_stack:
            siblings = siblings[ancestor]['children']
        self.tag_stack.append(name)

        tag = siblings.setdefault(name, {'count': 0, 'attrs': {}, 'children': {}})
        tag['count'] += 1

        for (attr_name, attr_value) in attrs.items():
            attr = tag['attrs'].setdefault(attr_name, {'count': 0, 'types': {}, 'values': {}})
            attr['count'] += 1

            # Once this attribute has been seen as a string, the patterns are
            # no longer tried: every later value is counted as 'string' too.
            seen_types = attr['types']
            type_name = 'string'
            if 'string' not in seen_types:
                type_name = next(
                    (
                        candidate
                        for (candidate, pattern) in RE_TYPES.items()
                        if re.fullmatch(pattern, attr_value)
                    ),
                    'string',
                )
            seen_types.setdefault(type_name, {'count': 0})['count'] += 1

            # Store value count
            value_entry = attr['values'].setdefault(attr_value, {'count': 0})
            value_entry['count'] += 1

    def endElement(self, name):
        self.tag_stack.pop()
# Script entry point: runs only when the file is executed directly.
if __name__ == '__main__':
    # Drop the program name; run() expects just the arguments.
    run(sys.argv[1:])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Update: See version 2 here with a number of improved features.
X4: Foundations Savegame XML Analyzer
This is an initial attempt at breaking down X4's savegame files to help determine potential deserialization strategies for the purpose of discussing ways to improve save/load performance.
It lists all encountered tags and attributes and attempts a best-guess at deserialization type. If it believes a value can be stored to an enumerated type it also lists the enum names.
This first version is a little too aggressive with nesting relationships: it treats nested tags with the same name as different tags whenever their parent hierarchies differ from one another. This is particularly evident with `cue` tags. I'll loosen this up for v2.

Use:
Extract an X4 savegame to its base xml file.
Save this script to the same directory.
Run the script with the save file as an argument -- e.g.: `python3 x4_savegame_xmlanalyzer_v1.py <savegame>.xml`
Notations:
Runtime:
Expect 2 to 4 minutes, depending on hardware and savegame size.
This is Python -- quick to write, slow to run.
Sample Output
I'm cutting this short (note the `...` in the middle).
My test savegame for this run was 714 MB (uncompressed), ~14.7 million lines.
Sample output was 4150 lines... and I'm not pasting all that here :)