Last active
January 13, 2022 21:55
-
-
Save xixasdev/e38bfe50965ddfed24d1e9d0bccdd6c8 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# x4_savegame_xmlanalyzer_v2.py | |
# AUTHOR: xixas | DATE: 2022.01.12 | LICENSE: WTFPL/PDM/CC0... your choice | |
# DESCRIPTION: Parse an X4 Foundations savegame XML file and report tag statistics | |
import argparse | |
import re | |
import sys | |
import xml.sax | |
# A distinct string value counts as "repeated" once it has been seen at least
# this many times; find_enums() requires at least half of an attribute's
# distinct values to be repeated before treating the attribute as an enum.
ENUM_THRESHOLD = 2

# Identifier-like strings are the only values eligible to become enum names.
RE_ENUM = re.compile(r'[A-Za-z_][A-Za-z0-9_]*')

# Attribute value classifiers, tried in insertion order with a full match.
# Raw strings are used so that backslash escapes reach the regex engine
# unchanged instead of being treated as (invalid) string escapes.
RE_TYPES = {
    'int': re.compile(r'-?[0-9]+'),
    'float': re.compile(r'-?[0-9]+\.[0-9]+'),
    'int_pair': re.compile(r'\{-?[0-9]+, ?-?[0-9]+\}'),
    'id': re.compile(r'\[0x[0-9a-zA-Z]+\]'),
    'exponent': re.compile(r'-?[0-9]+e-?[0-9]+'),
}
def parse_args(args):
    """Parse the command-line argument list and return the options namespace.

    ``args`` is the list of arguments without the program name. The returned
    namespace exposes ``sort_tags``, ``tree_mode``, ``consolidate_types``,
    ``find_enums`` and ``filepath``.
    """
    # (flag, destination attribute, argparse action, help text)
    switches = (
        ('--sort', 'sort_tags', 'store_true',
         "sort tags alphabetically (default display is order encountered)"),
        ('--tree', 'tree_mode', 'store_true',
         "tree mode (nodes analyzed separately by branch degree)"),
        ('--no-consolidate', 'consolidate_types', 'store_false',
         "do not consolidate overlapping attribute types"),
        ('--no-enums', 'find_enums', 'store_false',
         "do not convert likely string types into enum types"),
    )
    parser = argparse.ArgumentParser(description="Parse an X4 Foundations XML file and report tag statistics")
    for flag, dest, action, help_text in switches:
        parser.add_argument(flag, dest=dest, action=action, help=help_text)
    parser.add_argument('filepath', help='path to XML file to be parsed')
    return parser.parse_args(args)
def run(args):
    """Parse ``args``, analyze the XML file and print the tag report.

    The type-consolidation and enum-detection passes run, in that order,
    unless they were disabled on the command line.
    """
    opts = parse_args(args)
    tree_mode = opts.tree_mode
    tags = parse_xml(opts.filepath, tree_mode)
    # Optional post-processing passes, each mutating ``tags`` in place.
    passes = (
        (opts.consolidate_types, consolidate_types),
        (opts.find_enums, find_enums),
    )
    for enabled, apply_pass in passes:
        if enabled:
            apply_pass(tags, tree_mode)
    print_tags(tags, tree_mode, opts.sort_tags)
def parse_xml(filepath, tree_mode=False):
    """Stream-parse the XML file at ``filepath`` and return the collected tags.

    Args:
        filepath: Path to the XML file to analyze.
        tree_mode: Passed to ``XmlHandler``; selects per-branch (nested)
            statistics instead of flat per-tag statistics.

    Returns:
        The ``tags`` dictionary built by the handler during parsing.
    """
    handler = XmlHandler(tree_mode)
    parser = xml.sax.make_parser()
    parser.setContentHandler(handler)
    # Open in binary mode so the XML parser determines the encoding from the
    # document itself rather than relying on the platform's default text
    # encoding, which can mis-decode non-ASCII content.
    with open(filepath, 'rb') as fd:
        parser.parse(fd)
    return handler.tags
def consolidate_types(tags, tree_mode=False, _tag_stack=None):
    """Merge overlapping attribute types in place.

    For every attribute that was seen with more than one type:
      * if any value was a string, all other types are folded into 'string';
      * otherwise, if both 'float' and 'int' occur, 'int' is folded into
        'float'.

    Args:
        tags: Tag statistics dictionary (mutated in place).
        tree_mode: When true, recurse into each tag's 'children'.
        _tag_stack: Internal; names of the ancestor tags of ``tags``. It is
            only threaded through the recursion and not otherwise read.
    """
    # Avoid a mutable default argument; None means "at the root".
    tag_path = [] if _tag_stack is None else _tag_stack
    for tag_name, tag_info in tags.items():
        for attr_info in tag_info['attrs'].values():
            types = attr_info['types']
            if len(types) <= 1:
                continue
            if 'string' in types:
                # Snapshot the names first: the dict is modified while merging.
                for other in [name for name in types if name != 'string']:
                    types['string']['count'] += types.pop(other)['count']
            elif 'float' in types and 'int' in types:
                types['float']['count'] += types.pop('int')['count']
        if tree_mode:
            children = tag_info['children']
            if children:
                consolidate_types(children, tree_mode, tag_path + [tag_name])
def find_enums(tags, tree_mode=False, _tag_stack=None):
    """Reclassify string-only attributes that look like enums, in place.

    An attribute whose only type is 'string' becomes 'enum' when every
    distinct value fully matches ``RE_ENUM`` and at least half of the distinct
    values were seen ``ENUM_THRESHOLD`` or more times. The 'enum' entry keeps
    the original count and records the list of distinct values.

    Args:
        tags: Tag statistics dictionary (mutated in place).
        tree_mode: When true, recurse into each tag's 'children'.
        _tag_stack: Internal; names of the ancestor tags of ``tags``. It is
            only threaded through the recursion and not otherwise read.
    """
    # Avoid a mutable default argument; None means "at the root".
    tag_path = [] if _tag_stack is None else _tag_stack
    for tag_name, tag_info in tags.items():
        for attr_info in tag_info['attrs'].values():
            types = attr_info['types']
            if len(types) != 1 or 'string' not in types:
                continue
            values = attr_info['values']
            # Every distinct value must be a valid identifier-like name.
            if not all(RE_ENUM.fullmatch(value) for value in values):
                continue
            repeated = sum(1 for info in values.values() if info['count'] >= ENUM_THRESHOLD)
            if repeated >= len(values) / 2:
                types['enum'] = {'count': types.pop('string')['count'], 'values': list(values)}
        if tree_mode:
            children = tag_info['children']
            if children:
                find_enums(children, tree_mode, tag_path + [tag_name])
def print_tags(tags_dict, tree_mode=False, sort_tags=False, indent=''):
    """Print the tag report for ``tags_dict`` to standard output.

    Each tag is printed with its count, followed (in flat mode) by its parent
    and child tags ordered by ascending count, then by its attributes in
    alphabetical order with their types ordered by descending count. Enum
    attributes also list their values. In tree mode the children are printed
    recursively with a deeper indent.
    """
    def by_count(entry):
        return entry[1]['count']

    def labelled(counted, **sort_options):
        # "name (count)" labels for a {name: {'count': n}} mapping.
        ranked = sorted(counted.items(), key=by_count, **sort_options)
        return [f"{label} ({info['count']})" for (label, info) in ranked]

    entries = tags_dict.items()
    if sort_tags:
        entries = sorted(entries)

    for (tag_name, tag_info) in entries:
        print(f"{indent}{tag_name} ({tag_info['count']})")

        if not tree_mode:
            parents = ', '.join(labelled(tag_info['parents']))
            children = ', '.join(labelled(tag_info['children']))
            if parents:
                print(f"{indent} @ parents: {parents}")
            if children:
                print(f"{indent} @ children: {children}")

        for (attr_name, attr_info) in sorted(tag_info['attrs'].items()):
            type_summary = ', '.join(labelled(attr_info['types'], reverse=True))
            if 'enum' in attr_info['types']:
                enum_values = ' {' + ', '.join(attr_info['values']) + '}'
            else:
                enum_values = ''
            print(f"{indent} : {attr_name} ({attr_info['count']}): {type_summary}{enum_values}")

        if tree_mode and tag_info['children']:
            print_tags(tag_info['children'], tree_mode, sort_tags, f"{indent} ")
class XmlHandler(xml.sax.ContentHandler):
    """SAX content handler that accumulates per-tag and per-attribute counts.

    Results are collected in ``self.tags``. In flat mode every tag name has a
    single top-level entry that also records counts of its parent and child
    tag names. In tree mode entries are nested under their ancestors via the
    'children' dictionaries, mirroring the element hierarchy.
    """

    def __init__(self, tree_mode, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.tree_mode = tree_mode
        self.tags = {}
        # Names of the currently open elements, outermost first.
        self.tag_stack = []

    def startElement(self, name, attrs):
        """Record one occurrence of element ``name`` and of its attributes."""
        # Locate the dictionary this element is registered in.
        registry = self.tags
        if self.tree_mode:
            for ancestor in self.tag_stack:
                registry = registry[ancestor]['children']

        tag = registry.setdefault(name, {'count': 0, 'attrs': {}, 'parents': {}, 'children': {}})
        tag['count'] += 1

        # Flat mode: count the parent/child relationship on both sides.
        if self.tag_stack and not self.tree_mode:
            parent_name = self.tag_stack[-1]
            parent_children = self.tags[parent_name]['children']
            parent_children.setdefault(name, {'count': 0})['count'] += 1
            tag['parents'].setdefault(parent_name, {'count': 0})['count'] += 1

        for (attr_name, attr_value) in attrs.items():
            attr = tag['attrs'].setdefault(attr_name, {'count': 0, 'types': {}, 'values': {}})
            attr['count'] += 1

            # Once an attribute has held a string, later values are counted
            # as strings without being classified.
            seen_types = attr['types']
            type_name = 'string'
            if 'string' not in seen_types:
                type_name = next(
                    (label for (label, pattern) in RE_TYPES.items() if re.fullmatch(pattern, attr_value)),
                    'string',
                )
            seen_types.setdefault(type_name, {'count': 0})['count'] += 1

            # Store value count
            attr['values'].setdefault(attr_value, {'count': 0})['count'] += 1

        self.tag_stack.append(name)

    def endElement(self, name):
        """Close the innermost open element."""
        self.tag_stack.pop()
# Script entry point: hand the command-line arguments (without the program
# name) to run() when executed directly rather than imported.
if __name__ == "__main__":
    run(sys.argv[1:])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
X4: Foundations Savegame XML Analyzer
This is a second attempt at breaking down X4's savegame files to help determine potential deserialization strategies for the purpose of discussing ways to improve save/load performance.
It lists all encountered tags and attributes and attempts a best-guess at deserialization type.
If it believes a value can be stored to an enumerated type it also lists the enum names.
This second version addresses the over-aggressive relationship nesting of the first version, which considered all nested tags to be different tags if their parent hierarchies were different from one another. The original behavior can still be useful in later analysis, though, and can still be accessed via the
--tree
option. See information about other new options in the
Usage
section below.
Use:
Extract an X4 savegame to its base xml file.
Save this script to the same directory.
Run the script with the save file as an argument -- e.g.:
Usage:
Notations:
Runtime:
Expect 2 to 4 minutes, depending on hardware and savegame size.
This is Python -- quick to write, slow to run.
Sample Output
I'm cutting this short (note the
...
in the middle). My test savegame for this run was 714 MB (uncompressed), ~14.7 million lines.
Sample output was 1746 lines... and I'm not pasting all that here :)