Skip to content

Instantly share code, notes, and snippets.

@xixasdev
Last active January 13, 2022 21:55
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save xixasdev/e38bfe50965ddfed24d1e9d0bccdd6c8 to your computer and use it in GitHub Desktop.
Save xixasdev/e38bfe50965ddfed24d1e9d0bccdd6c8 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
# x4_savegame_xmlanalyzer_v2.py
# AUTHOR: xixas | DATE: 2022.01.12 | LICENSE: WTFPL/PDM/CC0... your choice
# DESCRIPTION: Parse an X4 Foundations savegame XML file and report tag statistics
import argparse
import re
import sys
import xml.sax
# Occurrence count a string value must reach to count toward enum detection
# (compared against each value's count in find_enums).
ENUM_THRESHOLD = 2

# Shape a string value must fully match to be eligible as an enum member name.
RE_ENUM = re.compile(r'[A-Za-z_][A-Za-z0-9_]*')

# Attribute value classifiers, tried in insertion order with a full match;
# the first pattern that matches names the value's type.
# Raw strings keep the regex backslashes from being read as (invalid) string
# escape sequences, which newer Python versions warn about.
RE_TYPES = {
    'int': re.compile(r'-?[0-9]+'),
    'float': re.compile(r'-?[0-9]+\.[0-9]+'),
    'int_pair': re.compile(r'\{-?[0-9]+, ?-?[0-9]+\}'),
    'id': re.compile(r'\[0x[0-9a-zA-Z]+\]'),
    'exponent': re.compile(r'-?[0-9]+e-?[0-9]+'),
}
def parse_args(args):
    """Parse the command-line arguments in *args* and return the options namespace.

    The namespace carries ``filepath`` plus four booleans: ``sort_tags`` and
    ``tree_mode`` (off unless their flag is given) and ``consolidate_types``
    and ``find_enums`` (on unless their ``--no-...`` flag is given).
    """
    cli = argparse.ArgumentParser(description="Parse an X4 Foundations XML file and report tag statistics")
    # (flag, destination attribute, argparse action, help text)
    switches = (
        ('--sort', 'sort_tags', 'store_true', "sort tags alphabetically (default display is order encountered)"),
        ('--tree', 'tree_mode', 'store_true', "tree mode (nodes analyzed separately by branch degree)"),
        ('--no-consolidate', 'consolidate_types', 'store_false', "do not consolidate overlapping attribute types"),
        ('--no-enums', 'find_enums', 'store_false', "do not convert likely string types into enum types"),
    )
    for flag, dest, action, help_text in switches:
        cli.add_argument(flag, dest=dest, action=action, help=help_text)
    cli.add_argument('filepath', help='path to XML file to be parsed')
    return cli.parse_args(args)
def run(args):
    """Run the analyzer: parse options, gather statistics, post-process, print.

    *args* is the argument list without the program name. The type
    consolidation and enum detection passes each run only when their option
    is enabled, in that order, before the report is printed.
    """
    options = parse_args(args)
    tree_mode = options.tree_mode
    stats = parse_xml(options.filepath, tree_mode)

    if options.consolidate_types:
        consolidate_types(stats, tree_mode)

    if options.find_enums:
        find_enums(stats, tree_mode)

    print_tags(stats, tree_mode, options.sort_tags)
def parse_xml(filepath, tree_mode=False):
    """Stream the XML file at *filepath* through an XmlHandler.

    Args:
        filepath: path of the XML file to read.
        tree_mode: passed through to XmlHandler.

    Returns:
        The handler's ``tags`` mapping once parsing has finished.
    """
    handler = XmlHandler(tree_mode)
    parser = xml.sax.make_parser()
    parser.setContentHandler(handler)
    # Binary mode: give the parser the raw bytes so the document's own
    # encoding is applied instead of the platform's default text encoding.
    with open(filepath, 'rb') as fd:
        parser.parse(fd)
    return handler.tags
def consolidate_types(tags, tree_mode=False, _tag_stack=()):
    """Merge overlapping attribute types, modifying *tags* in place.

    For each attribute that was recorded with more than one type:

    * if 'string' is one of them, every other type's count is added to
      'string' and the other type entries are removed;
    * otherwise, if both 'float' and 'int' are present, the 'int' count is
      added to 'float' and the 'int' entry is removed.

    Any other combination of types is left as it is.

    Args:
        tags: mapping of tag name to tag info; each info dict provides
            'attrs' (and 'children' when *tree_mode* is true).
        tree_mode: when true, also process each tag's 'children' mapping
            recursively.
        _tag_stack: internal; names of the enclosing tags during recursion.
            Any sequence is accepted.
    """
    for tag_name, tag_info in tags.items():
        tag_path = (*_tag_stack, tag_name)
        for attr_info in tag_info['attrs'].values():
            types = attr_info['types']
            if len(types) < 2:
                continue
            if 'string' in types:
                string_info = types['string']
                # Snapshot the names first: entries are removed while merging.
                for other in [t for t in types if t != 'string']:
                    string_info['count'] += types.pop(other)['count']
            elif 'float' in types and 'int' in types:
                types['float']['count'] += types.pop('int')['count']
        if tree_mode and tag_info['children']:
            consolidate_types(tag_info['children'], tree_mode, tag_path)
def find_enums(tags, tree_mode=False, _tag_stack=()):
    """Re-label string attributes that look like enums, modifying *tags* in place.

    An attribute is converted when 'string' is its only recorded type, every
    distinct value fully matches RE_ENUM, and at least half of the distinct
    values occurred ENUM_THRESHOLD or more times. Its 'string' type entry is
    then replaced by an 'enum' entry holding the same count and the list of
    distinct values.

    Args:
        tags: mapping of tag name to tag info; each info dict provides
            'attrs' (and 'children' when *tree_mode* is true).
        tree_mode: when true, also process each tag's 'children' mapping
            recursively.
        _tag_stack: internal; names of the enclosing tags during recursion.
            Any sequence is accepted.
    """
    for tag_name, tag_info in tags.items():
        tag_path = (*_tag_stack, tag_name)
        for attr_info in tag_info['attrs'].values():
            types = attr_info['types']
            if len(types) != 1 or 'string' not in types:
                continue
            values = attr_info['values']
            # A single value that is not identifier-like rules the attribute out.
            if not all(RE_ENUM.fullmatch(value) for value in values):
                continue
            frequent = sum(1 for info in values.values() if info['count'] >= ENUM_THRESHOLD)
            if frequent >= len(values) / 2:
                types['enum'] = {'count': types.pop('string')['count'], 'values': list(values)}
        if tree_mode and tag_info['children']:
            find_enums(tag_info['children'], tree_mode, tag_path)
def print_tags(tags_dict, tree_mode=False, sort_tags=False, indent=''):
    """Print the statistics report for *tags_dict* to standard output.

    Each tag gets a line with its name and count, followed by one line per
    attribute (sorted by attribute name) listing its types by descending
    count; attributes with an 'enum' type also list their values in braces.
    Outside tree mode, non-empty parent and child relations are printed
    ordered by ascending count. In tree mode, child tags are printed
    recursively with *indent* extended.

    Args:
        tags_dict: mapping of tag name to tag info.
        tree_mode: whether *tags_dict* is nested through 'children'.
        sort_tags: print tags in name order instead of mapping order.
        indent: prefix placed in front of every printed line.
    """
    def count_of(item):
        return item[1]['count']

    def with_counts(items):
        return ', '.join(f"{label} ({info['count']})" for (label, info) in items)

    tag_names = sorted(tags_dict) if sort_tags else list(tags_dict)
    for tag_name in tag_names:
        tag_info = tags_dict[tag_name]
        print(f"{indent}{tag_name} ({tag_info['count']})")

        if not tree_mode:
            parent_text = with_counts(sorted(tag_info['parents'].items(), key=count_of))
            child_text = with_counts(sorted(tag_info['children'].items(), key=count_of))
            if parent_text:
                print(f"{indent} @ parents: {parent_text}")
            if child_text:
                print(f"{indent} @ children: {child_text}")

        attrs = tag_info['attrs']
        for attr_name in sorted(attrs):
            attr_info = attrs[attr_name]
            ranked_types = sorted(attr_info['types'].items(), key=count_of, reverse=True)
            type_text = with_counts(ranked_types)
            if 'enum' in attr_info['types']:
                enum_text = ' {' + ', '.join(attr_info['values']) + '}'
            else:
                enum_text = ''
            print(f"{indent} : {attr_name} ({attr_info['count']}): {type_text}{enum_text}")

        if tree_mode and tag_info['children']:
            print_tags(tag_info['children'], tree_mode, sort_tags, f"{indent} ")
class XmlHandler(xml.sax.ContentHandler):
    """SAX content handler that accumulates per-tag statistics in ``self.tags``.

    Every tag entry is a dict with 'count', 'attrs', 'parents' and 'children'.

    * Outside tree mode, ``self.tags`` is flat (one entry per tag name) and
      'parents' / 'children' map related tag names to ``{'count': n}``.
    * In tree mode, a tag's entry is stored in its parent's 'children'
      mapping, so the same tag name under different ancestors is tracked
      separately; 'parents' is left empty.

    Every attribute entry holds 'count', 'types' (type name -> count) and
    'values' (distinct value -> count).
    """

    def __init__(self, tree_mode, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.tree_mode = tree_mode
        self.tags = {}
        # Names of the currently open elements, outermost first.
        self.tag_stack = []
        # 'children' mapping of each open element, parallel to tag_stack.
        # Lets tree mode reach the current scope directly instead of walking
        # down from the root for every element.
        self._children_stack = []

    def startElement(self, name, attrs):
        """Record one occurrence of element *name* and of each of its attributes."""
        if self.tree_mode and self._children_stack:
            scope = self._children_stack[-1]
        else:
            scope = self.tags
        tag = scope.setdefault(name, {'count': 0, 'attrs': {}, 'parents': {}, 'children': {}})
        tag['count'] += 1

        if not self.tree_mode and self.tag_stack:
            parent_name = self.tag_stack[-1]
            child = self.tags[parent_name]['children'].setdefault(name, {'count': 0})
            child['count'] += 1
            parent = tag['parents'].setdefault(parent_name, {'count': 0})
            parent['count'] += 1

        for (attr_name, attr_value) in attrs.items():
            attr = tag['attrs'].setdefault(attr_name, {'count': 0, 'types': {}, 'values': {}})
            attr['count'] += 1

            types = attr['types']
            type_name = 'string'
            # Once this attribute has had a value classified as 'string', later
            # values are counted as 'string' without testing the patterns.
            if 'string' not in types:
                for (candidate, pattern) in RE_TYPES.items():
                    if pattern.fullmatch(attr_value):
                        type_name = candidate
                        break
            type_item = types.setdefault(type_name, {'count': 0})
            type_item['count'] += 1

            # Store value count
            value_item = attr['values'].setdefault(attr_value, {'count': 0})
            value_item['count'] += 1

        self.tag_stack.append(name)
        self._children_stack.append(tag['children'])

    def endElement(self, name):
        """Close the innermost open element."""
        self.tag_stack.pop()
        self._children_stack.pop()
# Script entry point: hand the command-line arguments (without the program
# name) to run().
if __name__ == '__main__':
    cli_args = sys.argv[1:]
    run(cli_args)
@xixasdev
Copy link
Author

xixasdev commented Jan 13, 2022

X4: Foundations Savegame XML Analyzer

This is a second attempt at breaking down X4's savegame files to help determine potential deserialization strategies for the purpose of discussing ways to improve save/load performance.

It lists all encountered tags and attributes and makes a best guess at each attribute's deserialization type.
If it believes a value can be stored to an enumerated type it also lists the enum names.

This second version addresses the over-aggressive relationship nesting of the first version, which considered all nested tags to be different tags if their parent hierarchies were different from one another. The original behavior can still be useful in later analysis, though, and can still be accessed via the --tree option.

See information about other new options in the Usage section below.

Use:

Extract an X4 savegame to its base xml file.
Save this script to the same directory.
Run the script with the save file as an argument -- e.g.:

python3 x4_savegame_xmlanalyzer_v2.py save_009.xml

Usage:

usage: x4_savegame_xmlanalyzer_v2.py [-h] [--sort] [--tree] [--no-consolidate] [--no-enums] filepath

Parse an X4 Foundations XML file and report tag statistics

positional arguments:
  filepath          path to XML file to be parsed

optional arguments:
  -h, --help        show this help message and exit
  --sort            sort tags alphabetically (default display is order encountered)
  --tree            tree mode (nodes analyzed separately by branch degree)
  --no-consolidate  do not consolidate overlapping attribute types
  --no-enums        do not convert likely string types into enum types

Notations:

  • () Numbers in parentheses are counts for encountered tags/attributes/types
  • @ The "at" symbol is used to designate parent and child relationships (when they exist)
  • : Attributes are prefixed with a colon below their parent tag - they are followed by encountered type(s)
  • "id" typed attributes are attributes that reference other objects
  • {} Possible enum values are wrapped in curly braces

Runtime:

Expect 2 to 4 minutes, depending on hardware and savegame size.
This is Python -- quick to write, slow to run.

Sample Output

I'm cutting this short (note the ... in the middle).
My test savegame for this run was 714 MB (uncompressed), ~14.7 million lines.
Sample output was 1746 lines... and I'm not pasting all that here :)

savegame (1)
  @ children: info (1), universe (1), economylog (1), modified (1), stats (1), log (1), messages (1), script (1), md (1), missions (1), aidirector (1), operations (1), notifications (1), ui (1), signature (1)
info (1)
  @ parents: savegame (1)
  @ children: save (1), game (1), player (1), patches (1)
save (1)
  @ parents: info (1)
  : date (1): int (1)
  : name (1): string (1)
game (1)
  @ parents: info (1)
  : build (1): int (1)
  : code (1): int (1)
  : guid (1): string (1)
  : id (1): string (1)
  : original (1): int (1)
  : seed (1): int (1)
  : start (1): string (1)
  : time (1): float (1)
  : version (1): int (1)
player (1)
  @ parents: info (1)
  : location (1): int_pair (1)
  : money (1): int (1)
  : name (1): string (1)
patches (1)
  @ parents: info (1)
  @ children: history (1), patch (2)
patch (5)
  @ parents: aidirector (1), patches (2), history (2)
  @ children: script (82)
  : extension (4): enum (4) {ego_dlc_split, ego_dlc_terran}
  : name (4): string (4)
  : version (4): int (4)
history (1)
  @ parents: patches (1)
  @ children: patch (2)
...
...
...
command (15288)
  @ parents: script (15288)
  @ children: param (5288)
  : type (15288): enum (15288) {attackenemies, repair, freetrade, trade, dockat, explore, resupply, wait, patrol, escort, freemining, searchresources, buildstation, mining, support, investigate, attackobject, withdrawbattle, movetozone, protect, recon, follow, recycle, collect}
commandaction (21091)
  @ parents: script (21091)
  @ children: param (190)
  : type (21091): enum (21091) {standingby, repairchecking, searchingtrades, repairingto, attacking, flying, calculating, undocking, docking, searchingresources, buildingto, executingtrade, waitingdrones, waitingtodock, investigating, attackingto}
counters (308)
  @ parents: script (308)
  @ children: counter (389)
counter (389)
  @ parents: counters (389)
  : current (264): int (264)
  : max (384): int (384)
  : type (5): enum (5) {list}
  : value (5): int (5)
operations (1)
  @ parents: savegame (1)
notifications (1)
  @ parents: savegame (1)
  @ children: type (1)
type (1)
  @ parents: notifications (1)
  : enabled (1): int (1)
  : id (1): string (1)
ui (1)
  @ parents: savegame (1)
  @ children: addons (1)
  : version (1): int (1)
addons (1)
  @ parents: ui (1)
  @ children: data (1)
signature (1)
  @ parents: savegame (1)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment