# xixasdev/x4_savegame_xmlanalyzer_v2.py

## x4_savegame_xmlanalyzer_v2.py
#!/usr/bin/env python3
# x4_savegame_xmlanalyzer_v2.py
# AUTHOR: xixas | DATE: 2022.01.12 | LICENSE: WTFPL/PDM/CC0... your choice
# DESCRIPTION: Parse an X4 Foundations savegame XML file and report tag statistics


import argparse
import re
import sys
import xml.sax


# Minimum number of occurrences for a string value to count as "repeated"
# when find_enums() decides whether an attribute looks like an enum.
ENUM_THRESHOLD = 2
# Only identifier-like strings (letters, digits, underscores, no leading
# digit) are eligible to be reported as enum members.
RE_ENUM = re.compile(r'[A-Za-z_][A-Za-z0-9_]*')
# Attribute value classifiers, tried in insertion order with re.fullmatch();
# a value that matches none of them is classified as 'string'.
# Raw string literals keep the regex escapes (\. \{ \[) from being read as
# (invalid) Python string escapes; the compiled patterns are unchanged.
RE_TYPES = {
    'int':      re.compile(r'-?[0-9]+'),
    'float':    re.compile(r'-?[0-9]+\.[0-9]+'),
    'int_pair': re.compile(r'\{-?[0-9]+, ?-?[0-9]+\}'),
    'id':       re.compile(r'\[0x[0-9a-zA-Z]+\]'),
    'exponent': re.compile(r'-?[0-9]+e-?[0-9]+'),
}


def parse_args(args):
    """Parse the command-line arguments in *args* and return the options.

    The returned namespace carries ``sort_tags``, ``tree_mode``,
    ``consolidate_types``, ``find_enums`` and the positional ``filepath``.
    The two ``--no-*`` switches use ``store_false``, so their options default
    to true.
    """
    parser = argparse.ArgumentParser(description="Parse an X4 Foundations XML file and report tag statistics")
    # (flag, destination, action, help text) for each optional switch.
    switches = (
        ('--sort', 'sort_tags', 'store_true', "sort tags alphabetically (default display is order encountered)"),
        ('--tree', 'tree_mode', 'store_true', "tree mode (nodes analyzed separately by branch degree)"),
        ('--no-consolidate', 'consolidate_types', 'store_false', "do not consolidate overlapping attribute types"),
        ('--no-enums', 'find_enums', 'store_false', "do not convert likely string types into enum types"),
    )
    for (flag, dest, action, help_text) in switches:
        parser.add_argument(flag, dest=dest, action=action, help=help_text)
    parser.add_argument('filepath', help='path to XML file to be parsed')
    return parser.parse_args(args)


def run(args):
    """Run the analyzer: parse options, collect statistics, print the report.

    *args* is the list of command-line arguments (without the program name).
    The type-consolidation and enum-detection passes each run only when their
    option is enabled.
    """
    options = parse_args(args)
    in_tree_mode = options.tree_mode
    tag_stats = parse_xml(options.filepath, in_tree_mode)
    # Optional post-processing passes, applied in this order.
    passes = (
        (options.consolidate_types, consolidate_types),
        (options.find_enums, find_enums),
    )
    for (enabled, apply_pass) in passes:
        if enabled:
            apply_pass(tag_stats, in_tree_mode)
    print_tags(tag_stats, in_tree_mode, options.sort_tags)


def parse_xml(filepath, tree_mode=False):
    """Parse the XML file at *filepath* and return the collected tag statistics.

    Args:
        filepath: path of the XML file to read.
        tree_mode: forwarded to XmlHandler; selects per-branch (tree) or
            flat (per tag name) statistics.

    Returns:
        The ``tags`` dictionary filled in by the XmlHandler.
    """
    handler = XmlHandler(tree_mode)
    parser = xml.sax.make_parser()
    parser.setContentHandler(handler)
    # Open in binary mode so the XML parser decodes the bytes according to the
    # document's own encoding instead of the platform's default text encoding.
    with open(filepath, 'rb') as fd:
        parser.parse(fd)
    return handler.tags


def consolidate_types(tags, tree_mode=False, _tag_stack=None):
    """Merge overlapping attribute types in place.

    For every attribute that was observed with more than one type:

    * if 'string' is among them, the count of every other type is added to
      'string' and the other types are removed;
    * otherwise, if both 'float' and 'int' are present, the 'int' count is
      added to 'float' and 'int' is removed.

    Args:
        tags: mapping of tag name to tag info (each with an 'attrs' mapping
            and, in tree mode, a 'children' mapping of the same shape).
        tree_mode: when true, recurse into each tag's 'children'.
        _tag_stack: internal; names of the ancestor tags of ``tags``.

    Returns:
        None. ``tags`` is modified in place.
    """
    # A None default avoids sharing a single list object between calls.
    ancestors = [] if _tag_stack is None else _tag_stack
    for (tag_name, tag_info) in tags.items():
        for attr_info in tag_info['attrs'].values():
            types = attr_info['types']
            if len(types) <= 1:
                continue
            if 'string' in types:
                # Snapshot the names first: the dict is mutated while merging.
                for type_name in [name for name in types if name != 'string']:
                    types['string']['count'] += types.pop(type_name)['count']
            elif 'float' in types and 'int' in types:
                types['float']['count'] += types.pop('int')['count']
        if tree_mode and tag_info['children']:
            consolidate_types(tag_info['children'], tree_mode, ancestors + [tag_name])


def find_enums(tags, tree_mode=False, _tag_stack=None):
    """Reclassify string attributes that look like enums, in place.

    An attribute whose only type is 'string' becomes an 'enum' when every
    distinct value fully matches RE_ENUM and at least half of the distinct
    values occur ENUM_THRESHOLD times or more. The 'string' entry is then
    replaced by an 'enum' entry carrying the same count plus the list of
    distinct values.

    Args:
        tags: mapping of tag name to tag info (each with an 'attrs' mapping
            and, in tree mode, a 'children' mapping of the same shape).
        tree_mode: when true, recurse into each tag's 'children'.
        _tag_stack: internal; names of the ancestor tags of ``tags``.

    Returns:
        None. ``tags`` is modified in place.
    """
    # A None default avoids sharing a single list object between calls.
    ancestors = [] if _tag_stack is None else _tag_stack
    for (tag_name, tag_info) in tags.items():
        for attr_info in tag_info['attrs'].values():
            types = attr_info['types']
            if len(types) != 1 or 'string' not in types:
                continue
            values = attr_info['values']
            if not all(RE_ENUM.fullmatch(value) for value in values):
                continue
            repeated = sum(1 for info in values.values() if info['count'] >= ENUM_THRESHOLD)
            if repeated >= len(values) / 2:
                types['enum'] = {'count': types.pop('string')['count'], 'values': list(values)}
        if tree_mode and tag_info['children']:
            find_enums(tag_info['children'], tree_mode, ancestors + [tag_name])


def print_tags(tags_dict, tree_mode=False, sort_tags=False, indent=''):
    """Print the tag statistics report to standard output.

    Each tag is printed as ``name (count)``. Outside tree mode its parents
    and children follow, ordered by ascending count. Then come its attributes
    in name order, each with its types ordered by descending count and, for
    enum attributes, the distinct values. In tree mode the children are
    printed recursively with two more spaces of indentation.

    Args:
        tags_dict: mapping of tag name to tag info.
        tree_mode: whether ``tags_dict`` is nested (tree) or flat.
        sort_tags: print tags alphabetically instead of in stored order.
        indent: prefix prepended to every printed line.
    """
    def by_count(entry):
        return entry[1]['count']

    def summarize(counted):
        ranked = sorted(counted.items(), key=by_count)
        return ', '.join(f"{label} ({info['count']})" for (label, info) in ranked)

    entries = tags_dict.items()
    if sort_tags:
        entries = sorted(entries)
    for (tag_name, tag_info) in entries:
        print(f"{indent}{tag_name} ({tag_info['count']})")
        if not tree_mode:
            parents = summarize(tag_info['parents'])
            children = summarize(tag_info['children'])
            if parents:
                print(f"{indent}  @ parents: {parents}")
            if children:
                print(f"{indent}  @ children: {children}")
        attrs = tag_info['attrs']
        for attr_name in sorted(attrs):
            attr_info = attrs[attr_name]
            ranked_types = sorted(attr_info['types'].items(), key=by_count, reverse=True)
            type_summary = ', '.join(f"{type_name} ({type_info['count']})" for (type_name, type_info) in ranked_types)
            if 'enum' in attr_info['types']:
                enum_values = ' {' + ', '.join(attr_info['values']) + '}'
            else:
                enum_values = ''
            print(f"{indent}  : {attr_name} ({attr_info['count']}): {type_summary}{enum_values}")
        if tree_mode and tag_info['children']:
            print_tags(tag_info['children'], tree_mode, sort_tags, f"{indent}  ")


class XmlHandler(xml.sax.ContentHandler):
    """SAX content handler that accumulates per-tag statistics in ``self.tags``.

    Each tag entry holds its occurrence ``count``, its ``attrs`` (per
    attribute: count, observed types and observed values) and its ``parents``
    and ``children``. In tree mode entries are nested under their ancestors'
    ``children``; otherwise all tags live at the top level and parent/child
    relations are recorded as counters.
    """

    def __init__(self, tree_mode, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.tree_mode = tree_mode
        self.tags = {}
        # Names of the currently open elements, outermost first.
        self.tag_stack = []

    @staticmethod
    def _bump(counters, key):
        """Increment ``counters[key]['count']``, creating the entry if needed."""
        entry = counters.setdefault(key, {'count': 0})
        entry['count'] += 1

    @staticmethod
    def _classify(seen_types, value):
        """Return the type name to record for *value*.

        Once an attribute has been seen as 'string', every later value is
        recorded as 'string' without consulting the patterns.
        """
        if 'string' in seen_types:
            return 'string'
        for (type_name, pattern) in RE_TYPES.items():
            if re.fullmatch(pattern, value):
                return type_name
        return 'string'

    def startElement(self, name, attrs):
        # Locate the mapping this element is recorded in.
        scope = self.tags
        if self.tree_mode:
            for ancestor in self.tag_stack:
                scope = scope[ancestor]['children']
        if name not in scope:
            scope[name] = {'count': 0, 'attrs': {}, 'parents': {}, 'children': {}}
        tag = scope[name]
        tag['count'] += 1

        # Flat mode: record the parent/child relation as counters on both ends.
        if self.tag_stack and not self.tree_mode:
            parent_name = self.tag_stack[-1]
            self._bump(self.tags[parent_name]['children'], name)
            self._bump(tag['parents'], parent_name)

        for (attr_name, attr_value) in attrs.items():
            attr = tag['attrs'].setdefault(attr_name, {'count': 0, 'types': {}, 'values': {}})
            attr['count'] += 1
            self._bump(attr['types'], self._classify(attr['types'], attr_value))
            # Store value count
            self._bump(attr['values'], attr_value)
        self.tag_stack.append(name)

    def endElement(self, name):
        self.tag_stack.pop()


if __name__ == '__main__':
    # Forward everything after sys.argv[0] to the analyzer.
    cli_args = sys.argv[1:]
    run(cli_args)
	#!/usr/bin/env python3
	# x4_savegame_xmlanalyzer_v2.py
	# AUTHOR: xixas | DATE: 2022.01.12 | LICENSE: WTFPL/PDM/CC0... your choice
	# DESCRIPTION: Parse an X4 Foundations savegame XML file and report tag statistics


	import argparse
	import re
	import sys
	import xml.sax


	# Minimum number of occurrences for a string value to count as "repeated"
	# when find_enums() decides whether an attribute looks like an enum.
	ENUM_THRESHOLD = 2
	# Only identifier-like strings (letters, digits, underscores, no leading
	# digit) are eligible to be reported as enum members.
	RE_ENUM = re.compile(r'[A-Za-z_][A-Za-z0-9_]*')
	# Attribute value classifiers, tried in insertion order with re.fullmatch();
	# a value that matches none of them is classified as 'string'.
	# Raw string literals keep the regex escapes (\. \{ \[) from being read as
	# (invalid) Python string escapes; the compiled patterns are unchanged.
	RE_TYPES = {
	    'int':      re.compile(r'-?[0-9]+'),
	    'float':    re.compile(r'-?[0-9]+\.[0-9]+'),
	    'int_pair': re.compile(r'\{-?[0-9]+, ?-?[0-9]+\}'),
	    'id':       re.compile(r'\[0x[0-9a-zA-Z]+\]'),
	    'exponent': re.compile(r'-?[0-9]+e-?[0-9]+'),
	}


	def parse_args(args):
	    """Parse the command-line arguments in *args* and return the options.

	    The returned namespace carries ``sort_tags``, ``tree_mode``,
	    ``consolidate_types``, ``find_enums`` and the positional ``filepath``.
	    The two ``--no-*`` switches use ``store_false``, so their options
	    default to true.
	    """
	    parser = argparse.ArgumentParser(description="Parse an X4 Foundations XML file and report tag statistics")
	    parser.add_argument('--sort', dest='sort_tags', action='store_true', help="sort tags alphabetically (default display is order encountered)")
	    parser.add_argument('--tree', dest='tree_mode', action='store_true', help="tree mode (nodes analyzed separately by branch degree)")
	    parser.add_argument('--no-consolidate', dest='consolidate_types', action='store_false', help="do not consolidate overlapping attribute types")
	    parser.add_argument('--no-enums', dest='find_enums', action='store_false', help="do not convert likely string types into enum types")
	    parser.add_argument('filepath', help='path to XML file to be parsed')
	    return parser.parse_args(args)


	def run(args):
	    """Run the analyzer: parse options, collect statistics, print the report.

	    *args* is the list of command-line arguments (without the program
	    name). The type-consolidation and enum-detection passes each run only
	    when their option is enabled.
	    """
	    opts = parse_args(args)
	    tags = parse_xml(opts.filepath, opts.tree_mode)
	    if opts.consolidate_types:
	        consolidate_types(tags, opts.tree_mode)
	    if opts.find_enums:
	        find_enums(tags, opts.tree_mode)
	    print_tags(tags, opts.tree_mode, opts.sort_tags)


	def parse_xml(filepath, tree_mode=False):
	    """Parse the XML file at *filepath* and return the collected tag statistics.

	    Args:
	        filepath: path of the XML file to read.
	        tree_mode: forwarded to XmlHandler; selects per-branch (tree) or
	            flat (per tag name) statistics.

	    Returns:
	        The ``tags`` dictionary filled in by the XmlHandler.
	    """
	    handler = XmlHandler(tree_mode)
	    parser = xml.sax.make_parser()
	    parser.setContentHandler(handler)
	    # Open in binary mode so the XML parser decodes the bytes according to
	    # the document's own encoding instead of the platform's default.
	    with open(filepath, 'rb') as fd:
	        parser.parse(fd)
	    return handler.tags


	def consolidate_types(tags, tree_mode=False, _tag_stack=None):
	    """Merge overlapping attribute types in place.

	    For every attribute that was observed with more than one type:

	    * if 'string' is among them, the count of every other type is added
	      to 'string' and the other types are removed;
	    * otherwise, if both 'float' and 'int' are present, the 'int' count
	      is added to 'float' and 'int' is removed.

	    Args:
	        tags: mapping of tag name to tag info (each with an 'attrs'
	            mapping and, in tree mode, a 'children' mapping of the same
	            shape).
	        tree_mode: when true, recurse into each tag's 'children'.
	        _tag_stack: internal; names of the ancestor tags of ``tags``.

	    Returns:
	        None. ``tags`` is modified in place.
	    """
	    # A None default avoids sharing a single list object between calls.
	    ancestors = [] if _tag_stack is None else _tag_stack
	    for (tag_name, tag_info) in tags.items():
	        for attr_info in tag_info['attrs'].values():
	            types = attr_info['types']
	            if len(types) <= 1:
	                continue
	            if 'string' in types:
	                # Snapshot the names first: the dict is mutated while merging.
	                for type_name in [name for name in types if name != 'string']:
	                    types['string']['count'] += types.pop(type_name)['count']
	            elif 'float' in types and 'int' in types:
	                types['float']['count'] += types.pop('int')['count']
	        if tree_mode and tag_info['children']:
	            consolidate_types(tag_info['children'], tree_mode, ancestors + [tag_name])


	def find_enums(tags, tree_mode=False, _tag_stack=None):
	    """Reclassify string attributes that look like enums, in place.

	    An attribute whose only type is 'string' becomes an 'enum' when every
	    distinct value fully matches RE_ENUM and at least half of the distinct
	    values occur ENUM_THRESHOLD times or more. The 'string' entry is then
	    replaced by an 'enum' entry carrying the same count plus the list of
	    distinct values.

	    Args:
	        tags: mapping of tag name to tag info (each with an 'attrs'
	            mapping and, in tree mode, a 'children' mapping of the same
	            shape).
	        tree_mode: when true, recurse into each tag's 'children'.
	        _tag_stack: internal; names of the ancestor tags of ``tags``.

	    Returns:
	        None. ``tags`` is modified in place.
	    """
	    # A None default avoids sharing a single list object between calls.
	    ancestors = [] if _tag_stack is None else _tag_stack
	    for (tag_name, tag_info) in tags.items():
	        for attr_info in tag_info['attrs'].values():
	            types = attr_info['types']
	            if len(types) != 1 or 'string' not in types:
	                continue
	            values = attr_info['values']
	            if not all(RE_ENUM.fullmatch(value) for value in values):
	                continue
	            repeated = sum(1 for info in values.values() if info['count'] >= ENUM_THRESHOLD)
	            if repeated >= len(values) / 2:
	                types['enum'] = {'count': types.pop('string')['count'], 'values': list(values)}
	        if tree_mode and tag_info['children']:
	            find_enums(tag_info['children'], tree_mode, ancestors + [tag_name])


	def print_tags(tags_dict, tree_mode=False, sort_tags=False, indent=''):
	    """Print the tag statistics report to standard output.

	    Each tag is printed as ``name (count)``. Outside tree mode its parents
	    and children follow, ordered by ascending count. Then come its
	    attributes in name order, each with its types ordered by descending
	    count and, for enum attributes, the distinct values. In tree mode the
	    children are printed recursively with two more spaces of indentation.

	    Args:
	        tags_dict: mapping of tag name to tag info.
	        tree_mode: whether ``tags_dict`` is nested (tree) or flat.
	        sort_tags: print tags alphabetically instead of in stored order.
	        indent: prefix prepended to every printed line.
	    """
	    def by_count(entry):
	        return entry[1]['count']

	    def summarize(counted):
	        ranked = sorted(counted.items(), key=by_count)
	        return ', '.join(f"{label} ({info['count']})" for (label, info) in ranked)

	    entries = sorted(tags_dict.items()) if sort_tags else tags_dict.items()
	    for (tag_name, tag_info) in entries:
	        print(f"{indent}{tag_name} ({tag_info['count']})")
	        if not tree_mode:
	            parents = summarize(tag_info['parents'])
	            children = summarize(tag_info['children'])
	            # Detail lines sit two spaces deeper than the tag line.
	            if parents:
	                print(f"{indent}  @ parents: {parents}")
	            if children:
	                print(f"{indent}  @ children: {children}")
	        for (attr_name, attr_info) in sorted(tag_info['attrs'].items()):
	            ranked_types = sorted(attr_info['types'].items(), key=by_count, reverse=True)
	            type_summary = ', '.join(f"{type_name} ({type_info['count']})" for (type_name, type_info) in ranked_types)
	            enum_values = (' {' + ', '.join(attr_info['values']) + '}') if 'enum' in attr_info['types'] else ''
	            print(f"{indent}  : {attr_name} ({attr_info['count']}): {type_summary}{enum_values}")
	        if tree_mode and tag_info['children']:
	            print_tags(tag_info['children'], tree_mode, sort_tags, f"{indent}  ")


	class XmlHandler(xml.sax.ContentHandler):
	    """SAX content handler that accumulates per-tag statistics in ``self.tags``.

	    Each tag entry holds its occurrence ``count``, its ``attrs`` (per
	    attribute: count, observed types and observed values) and its
	    ``parents`` and ``children``. In tree mode entries are nested under
	    their ancestors' ``children``; otherwise all tags live at the top
	    level and parent/child relations are recorded as counters.
	    """

	    def __init__(self, tree_mode, *args, **kwargs):
	        super().__init__(*args, **kwargs)
	        self.tree_mode = tree_mode
	        self.tags = {}
	        # Names of the currently open elements, outermost first.
	        self.tag_stack = []

	    def startElement(self, name, attrs):
	        # Locate the mapping this element is recorded in.
	        tags = self.tags
	        if self.tree_mode:
	            for t in self.tag_stack:
	                tags = tags[t]['children']
	        tag = tags.setdefault(name, {'count': 0, 'attrs': {}, 'parents': {}, 'children': {}})
	        tag['count'] += 1
	        # Flat mode: record the parent/child relation as counters on both ends.
	        if not self.tree_mode and self.tag_stack:
	            parent_name = self.tag_stack[-1]
	            child = self.tags[parent_name]['children'].setdefault(name, {'count': 0})
	            child['count'] += 1
	            parent = tag['parents'].setdefault(parent_name, {'count': 0})
	            parent['count'] += 1
	        for (attr_name, attr_value) in attrs.items():
	            attr = tag['attrs'].setdefault(attr_name, {'count': 0, 'types': {}, 'values': {}})
	            attr['count'] += 1
	            types = attr['types']
	            type_name = 'string'
	            # Once an attribute has been seen as 'string', later values are
	            # recorded as 'string' without consulting the patterns.
	            if 'string' not in types:
	                for (re_type_name, re_type) in RE_TYPES.items():
	                    if re.fullmatch(re_type, attr_value):
	                        type_name = re_type_name
	                        break
	            type_item = types.setdefault(type_name, {'count': 0})
	            type_item['count'] += 1

	            # Store value count
	            value_item = attr['values'].setdefault(attr_value, {'count': 0})
	            value_item['count'] += 1
	        self.tag_stack.append(name)

	    def endElement(self, name):
	        self.tag_stack.pop()


	if __name__ == '__main__':
	    # Forward everything after sys.argv[0] to the analyzer.
	    run(sys.argv[1:])