# nosoop/smc_parser.py

## smc_parser.py
#!/usr/bin/python3

# BSD Zero Clause License
#
# Copyright (C) 2023 by nosoop
#
# Permission to use, copy, modify, and/or distribute this software for any purpose with or
# without fee is hereby granted.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS
# SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL
# THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY
# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF
# CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
# OR PERFORMANCE OF THIS SOFTWARE.

import collections
import enum
import itertools

class SMCOperation(enum.Enum):
	"""Kinds of events that can be reported while walking an SMC document."""

	# a standalone quoted string (the parser in this file never emits this one)
	STRING = 1

	# a quoted section name followed by '{'
	SUBSECTION_START = 2

	# a closing '}'
	SUBSECTION_END = 3

	# a '//' comment running to the end of the line
	COMMENT = 4

	# a '/* ... */' comment
	COMMENT_MULTILINE = 5

	# a quoted key followed by a quoted value
	KEYVALUE = 6

# https://stackoverflow.com/a/70762559
def takewhile_inclusive(predicate, it):
	"""
	Yield items from `it` up to and including the first one that fails `predicate`.

	Unlike itertools.takewhile, the failing item is handed to the caller instead of being
	dropped.  Nothing past that item is pulled from `it`.
	"""
	for item in it:
		# test before yielding so the predicate runs exactly once per item, prior to hand-off
		keep_going = predicate(item)
		yield item
		if not keep_going:
			return

def _is_whitespace(ch):
	return ch in (' ', '\t', '\n', '\r')

def _smc_stream_skip_whitespace(stream):
	"""
	Consume leading whitespace from `stream` and return the first other character.

	The returned character has been consumed from the stream as well.  Returns None if the
	stream runs out before a non-whitespace character is found.
	"""
	for ch in stream:
		if not _is_whitespace(ch):
			return ch
	# end of stream reached while skipping
	return None

def _smc_stream_extract_multiline_comment(stream):
	while True:
		yield from itertools.takewhile(lambda ch: ch != '*', stream)
		ch = next(stream, None)
		if ch == '/':
			return
		yield '*'
		yield ch

_escape_mapping = str.maketrans({
	'"': '"',
	'n': '\n',
	'r': '\r',
	't': '\t',
	'\\': '\\',
})

def _smc_stream_extract_string(stream):
	last = None
	for ch in stream:
		if ch == "\\":
			ch = next(stream).translate(_escape_mapping)
		elif ch == '"':
			return
		yield ch

def parse_smc_string(data):
	"""
	Tokenize SMC text, yielding one event tuple per construct.

	`data` is iterated one character at a time (typically a str).  Events are:
	- (SMCOperation.SUBSECTION_START, name) for a quoted string followed by '{'
	- (SMCOperation.KEYVALUE, key, value) for two consecutive quoted strings
	- (SMCOperation.SUBSECTION_END, None) for '}'
	- (SMCOperation.COMMENT, text) for a '//' comment, without the '//' or the newline
	- (SMCOperation.COMMENT_MULTILINE, text) for a '/* ... */' comment, without delimiters

	Raises ValueError on an unexpected character, or when the input ends right after a
	lone quoted string or a single '/'.
	"""
	stream = iter(data)
	while True:
		ch = _smc_stream_skip_whitespace(stream)
		if ch is None:
			return
		elif ch == '"':
			# consume until the next quote, then determine if:
			# - the string marks the subsection name '{'
			# - we have another string to consume, making this a key / value pair
			key = ''.join(_smc_stream_extract_string(stream))

			ch = _smc_stream_skip_whitespace(stream)
			if ch == '{':
				yield SMCOperation.SUBSECTION_START, key
			elif ch == '"':
				value = ''.join(_smc_stream_extract_string(stream))
				yield SMCOperation.KEYVALUE, key, value
			elif ch is None:
				# there is no character to report, so don't fall into the branch below
				raise ValueError("Unexpected end of input after end of string")
			else:
				raise ValueError(f"Unexpected character {ch.encode('ascii', 'backslashreplace')} after end of string")
		elif ch == '}':
			yield SMCOperation.SUBSECTION_END, None
		elif ch == '/':
			# a bare next() would surface as RuntimeError inside this generator at end of input
			ch = next(stream, None)
			if ch == '/':
				# single line comment: consume until the end of the line
				value = ''.join(itertools.takewhile(lambda ch: ch != '\n', stream))
				yield SMCOperation.COMMENT, value
			elif ch == '*':
				# multi line comment: consume until the sequence '*/' is reached
				value = ''.join(_smc_stream_extract_multiline_comment(stream))
				yield SMCOperation.COMMENT_MULTILINE, value
			elif ch is None:
				raise ValueError("Unexpected end of input at start of comment")
			else:
				raise ValueError(f"Unexpected character {ch.encode('ascii', 'backslashreplace')} at start of comment")
		else:
			raise ValueError(f"Unexpected character {ch.encode('ascii', 'backslashreplace')}")

import collections
import json

class MultiKeyDict(collections.defaultdict):
	"""
	A dict that supports one-to-many mappings.

	Every key maps to a list of values, and looking up a missing key creates an empty list
	for it.  Initialize it like a dict whose values are lists.
	"""
	def __init__(self, *args, **kwargs):
		super().__init__(list, *args, **kwargs)

	def items(self):
		"""Yield a (key, value) pair for every list item associated with a key."""
		for key, values in super().items():
			for value in values:
				yield key, value

	def copy(self):
		"""
		Return a shallow copy; the value lists are shared with the original.

		defaultdict.copy() passes default_factory as the first constructor argument, which
		this class's __init__ does not accept, so the copy is built from the raw items.
		"""
		return type(self)(super().items())

	__copy__ = copy

	def __reduce__(self):
		# defaultdict.__reduce__ also passes default_factory to the constructor and reads
		# the flattened items(); supply the raw key -> list pairs so pickle / deepcopy work
		return type(self), (), None, None, iter(super().items())

def smc_string_to_dict(data):
	"""
	Parse SMC text into nested MultiKeyDict instances and return the root one.

	Each key / value pair appends the value to the list stored under its key in the
	enclosing section.  Each subsection appends a new MultiKeyDict under the section name.
	Comment events are ignored.  Sections still open at the end of input are accepted.

	Raises ValueError if a '}' appears with no open subsection; errors raised by
	parse_smc_string propagate unchanged.
	"""
	root_node = MultiKeyDict()
	# stack of open sections, innermost last
	contexts = [ root_node ]
	for event, *info in parse_smc_string(data):
		if event == SMCOperation.SUBSECTION_START:
			key, *_ = info
			subkey = MultiKeyDict()
			contexts[-1][key].append(subkey)
			contexts.append(subkey)
		elif event == SMCOperation.SUBSECTION_END:
			if len(contexts) == 1:
				# popping the root would make any later event fail with IndexError
				raise ValueError("Unexpected '}' with no open subsection")
			contexts.pop()
		elif event == SMCOperation.KEYVALUE:
			key, value = info
			contexts[-1][key].append(value)
	return root_node

import pathlib

def main():
	"""
	Print every '*.txt' file found recursively under ./translations as indented JSON.

	For each file the path is printed first, followed by the parsed document.
	"""
	# sample document illustrating the accepted syntax; only the commented-out walkthrough
	# below refers to it
	SMC_STRING = """
	"thing"
	{
		// this is a comment node
		"key"	"value"

		"subthing"
		{
			// and another
			"subthing key"		"subthing value"
			"subthing key"		"duplicate key value"
		}
		"subthing"
		{
			"duplicate subthing" "yes"
		}

		/**
		 * this is a multiline comment node
		 */
		"another key"	"another value"
	}
	"""

	# event-level walkthrough of the sample, kept for debugging:
	# sections = []
	# for event, *data in parse_smc_string(SMC_STRING):
		# print(event, data, tuple(sections))
		# if event == SMCOperation.SUBSECTION_START:
			# section, *_ = data
			# sections.append(section)
		# elif event == SMCOperation.SUBSECTION_END:
			# sections.pop()
	# assert(not sections)

	translation_root = pathlib.Path('translations')
	for path in translation_root.rglob('*.txt'):
		print(path)
		contents = path.read_text('utf8')
		parsed = smc_string_to_dict(contents)
		print(json.dumps(parsed, indent = 4))

# run the demo only when this file is executed as a script, not when it is imported
if __name__ == "__main__":
	main()
	#!/usr/bin/python3

	# BSD Zero Clause License
	#
	# Copyright (C) 2023 by nosoop
	#
	# Permission to use, copy, modify, and/or distribute this software for any purpose with or
	# without fee is hereby granted.
	#
	# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS
	# SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL
	# THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY
	# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF
	# CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
	# OR PERFORMANCE OF THIS SOFTWARE.

	import collections
	import enum
	import itertools

	class SMCOperation(enum.Enum):
		"""Kinds of events that can be reported while walking an SMC document."""

		# a standalone quoted string (the parser in this file never emits this one)
		STRING = 1

		# a quoted section name followed by '{'
		SUBSECTION_START = 2

		# a closing '}'
		SUBSECTION_END = 3

		# a '//' comment running to the end of the line
		COMMENT = 4

		# a '/* ... */' comment
		COMMENT_MULTILINE = 5

		# a quoted key followed by a quoted value
		KEYVALUE = 6

	# https://stackoverflow.com/a/70762559
	def takewhile_inclusive(predicate, it):
		"""
		Yield items from `it` up to and including the first one that fails `predicate`.

		Unlike itertools.takewhile, the failing item is handed to the caller instead of
		being dropped.  Nothing past that item is pulled from `it`.
		"""
		for item in it:
			# test before yielding so the predicate runs exactly once per item
			keep_going = predicate(item)
			yield item
			if not keep_going:
				return

	def _is_whitespace(ch):
	return ch in (' ', '\t', '\n', '\r')

	def _smc_stream_skip_whitespace(stream):
		"""
		Consume leading whitespace from `stream` and return the first other character.

		The returned character has been consumed from the stream as well.  Returns None if
		the stream runs out before a non-whitespace character is found.
		"""
		for ch in stream:
			if not _is_whitespace(ch):
				return ch
		# end of stream reached while skipping
		return None

	def _smc_stream_extract_multiline_comment(stream):
	while True:
	yield from itertools.takewhile(lambda ch: ch != '*', stream)
	ch = next(stream, None)
	if ch == '/':
	return
	yield '*'
	yield ch

	_escape_mapping = str.maketrans({
	'"': '"',
	'n': '\n',
	'r': '\r',
	't': '\t',
	'\\': '\\',
	})

	def _smc_stream_extract_string(stream):
	last = None
	for ch in stream:
	if ch == "\\":
	ch = next(stream).translate(_escape_mapping)
	elif ch == '"':
	return
	yield ch

	def parse_smc_string(data):
		"""
		Tokenize SMC text, yielding one event tuple per construct.

		`data` is iterated one character at a time (typically a str).  Events are:
		- (SMCOperation.SUBSECTION_START, name) for a quoted string followed by '{'
		- (SMCOperation.KEYVALUE, key, value) for two consecutive quoted strings
		- (SMCOperation.SUBSECTION_END, None) for '}'
		- (SMCOperation.COMMENT, text) for a '//' comment, without the '//' or the newline
		- (SMCOperation.COMMENT_MULTILINE, text) for a '/* ... */' comment, without delimiters

		Raises ValueError on an unexpected character, or when the input ends right after a
		lone quoted string or a single '/'.
		"""
		stream = iter(data)
		while True:
			ch = _smc_stream_skip_whitespace(stream)
			if ch is None:
				return
			elif ch == '"':
				# consume until the next quote, then determine if:
				# - the string marks the subsection name '{'
				# - we have another string to consume, making this a key / value pair
				key = ''.join(_smc_stream_extract_string(stream))

				ch = _smc_stream_skip_whitespace(stream)
				if ch == '{':
					yield SMCOperation.SUBSECTION_START, key
				elif ch == '"':
					value = ''.join(_smc_stream_extract_string(stream))
					yield SMCOperation.KEYVALUE, key, value
				elif ch is None:
					# there is no character to report, so don't fall into the branch below
					raise ValueError("Unexpected end of input after end of string")
				else:
					raise ValueError(f"Unexpected character {ch.encode('ascii', 'backslashreplace')} after end of string")
			elif ch == '}':
				yield SMCOperation.SUBSECTION_END, None
			elif ch == '/':
				# a bare next() would surface as RuntimeError inside this generator at end of input
				ch = next(stream, None)
				if ch == '/':
					# single line comment: consume until the end of the line
					value = ''.join(itertools.takewhile(lambda ch: ch != '\n', stream))
					yield SMCOperation.COMMENT, value
				elif ch == '*':
					# multi line comment: consume until the sequence '*/' is reached
					value = ''.join(_smc_stream_extract_multiline_comment(stream))
					yield SMCOperation.COMMENT_MULTILINE, value
				elif ch is None:
					raise ValueError("Unexpected end of input at start of comment")
				else:
					raise ValueError(f"Unexpected character {ch.encode('ascii', 'backslashreplace')} at start of comment")
			else:
				raise ValueError(f"Unexpected character {ch.encode('ascii', 'backslashreplace')}")

	import collections
	import json

	class MultiKeyDict(collections.defaultdict):
		"""
		A dict that supports one-to-many mappings.

		Every key maps to a list of values, and looking up a missing key creates an empty
		list for it.  Initialize it like a dict whose values are lists.
		"""
		def __init__(self, *args, **kwargs):
			super().__init__(list, *args, **kwargs)

		def items(self):
			"""Yield a (key, value) pair for every list item associated with a key."""
			for key, values in super().items():
				for value in values:
					yield key, value

		def copy(self):
			"""
			Return a shallow copy; the value lists are shared with the original.

			defaultdict.copy() passes default_factory as the first constructor argument,
			which this class's __init__ does not accept, so the copy is built from the raw
			items.
			"""
			return type(self)(super().items())

		__copy__ = copy

		def __reduce__(self):
			# defaultdict.__reduce__ also passes default_factory to the constructor and reads
			# the flattened items(); supply the raw key -> list pairs so pickle / deepcopy work
			return type(self), (), None, None, iter(super().items())

	def smc_string_to_dict(data):
		"""
		Parse SMC text into nested MultiKeyDict instances and return the root one.

		Each key / value pair appends the value to the list stored under its key in the
		enclosing section.  Each subsection appends a new MultiKeyDict under the section
		name.  Comment events are ignored.  Sections still open at the end of input are
		accepted.

		Raises ValueError if a '}' appears with no open subsection; errors raised by
		parse_smc_string propagate unchanged.
		"""
		root_node = MultiKeyDict()
		# stack of open sections, innermost last
		contexts = [ root_node ]
		for event, *info in parse_smc_string(data):
			if event == SMCOperation.SUBSECTION_START:
				key, *_ = info
				subkey = MultiKeyDict()
				contexts[-1][key].append(subkey)
				contexts.append(subkey)
			elif event == SMCOperation.SUBSECTION_END:
				if len(contexts) == 1:
					# popping the root would make any later event fail with IndexError
					raise ValueError("Unexpected '}' with no open subsection")
				contexts.pop()
			elif event == SMCOperation.KEYVALUE:
				key, value = info
				contexts[-1][key].append(value)
		return root_node

	import pathlib

	def main():
		"""
		Print every '*.txt' file found recursively under ./translations as indented JSON.

		For each file the path is printed first, followed by the parsed document.
		"""
		# sample document illustrating the accepted syntax; only the commented-out
		# walkthrough below refers to it
		SMC_STRING = """
	"thing"
	{
	// this is a comment node
	"key" "value"

	"subthing"
	{
	// and another
	"subthing key" "subthing value"
	"subthing key" "duplicate key value"
	}
	"subthing"
	{
	"duplicate subthing" "yes"
	}

	/**
	* this is a multiline comment node
	*/
	"another key" "another value"
	}
	"""

		# event-level walkthrough of the sample, kept for debugging:
		# sections = []
		# for event, *data in parse_smc_string(SMC_STRING):
			# print(event, data, tuple(sections))
			# if event == SMCOperation.SUBSECTION_START:
				# section, *_ = data
				# sections.append(section)
			# elif event == SMCOperation.SUBSECTION_END:
				# sections.pop()
		# assert(not sections)

		for f in pathlib.Path('translations').rglob('*.txt'):
			print(f)
			print(json.dumps(smc_string_to_dict(f.read_text('utf8')), indent = 4))

	# run the demo only when this file is executed as a script, not when it is imported
	if __name__ == "__main__":
		main()