prcutler/ElementTree.py

## code.py
# SPDX-FileCopyrightText: Copyright (c) 2022 Neradoc
# SPDX-License-Identifier: Unlicense

import sys
from ElementTree import parse

# Parse the demo document; the file is closed as soon as the tree is built.
with open("some-demo.xml", "r") as fp:
    tree = parse(fp)

# Print the ElementTree wrapper object itself (whatever its repr() yields).
print(tree)

def print_sub_tree(node, depth=0):
    """Print *node* and all of its descendants as an indented outline.

    Each element produces one "-" line with its tag and (quoted) text,
    followed by one "|" line per attribute; children are printed the same
    way, two columns deeper.

    Args:
        node: element exposing ``tag``, ``text``, ``attrib`` and iteration
            over its children.
        depth: number of spaces used as the indentation prefix.
    """
    indent = " " * depth
    # Elements without text print an empty placeholder rather than 'None'.
    shown_text = "" if node.text is None else '"' + node.text + '"'
    print(indent, "-", node.tag, shown_text)
    for attr_name, attr_value in node.attrib.items():
        print(indent, "|", attr_name, ":", attr_value)
    for child in node:
        print_sub_tree(child, depth + 2)

# Dump the whole parsed document, starting at its root element.
print_sub_tree(tree.getroot())

## ElementTree.py
# This file is part of the standard library of Pycopy project, minimalist
# and lightweight Python implementation.
#
# https://github.com/pfalcon/pycopy
# https://github.com/pfalcon/pycopy-lib
#
# The MIT License (MIT)
#
# Copyright (c) 2018-2020 Paul Sokolovsky
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.

import io
import xmltok2


class ParseError(Exception):
    """Raised when the XML being parsed is structurally invalid."""


class Element:
    """One XML element: a tag, its attributes, text, tail and children.

    Children are reachable by index and ``len()``; iterating an element
    walks its children through the sequence protocol (``__getitem__``).
    """

    def __init__(self):
        self.tag = None       # element name; must be set before write()
        self.attrib = {}      # attribute name -> value
        self.text = None      # text between the start tag and first child
        self.tail = None      # text between the end tag and the next sibling
        self._children = []   # child elements in document order

    def __len__(self):
        """Return the number of direct children."""
        return len(self._children)

    def __getitem__(self, i):
        """Return the child (or slice of children) at position *i*."""
        return self._children[i]

    def append(self, el):
        """Add *el* as the last child."""
        self._children.append(el)

    def set(self, key, value):
        """Set attribute *key* to *value*."""
        self.attrib[key] = value

    def get(self, key, default=None):
        """Return attribute *key*, or *default* if it is not present."""
        return self.attrib.get(key, default)

    def write(self, file):
        """Serialize this element, its subtree and its tail to *file*.

        Text and attribute values are written verbatim (no escaping), and
        an explicit closing tag is always emitted.
        """
        assert self.tag is not None
        # Assemble the whole start tag first, then emit it in one go.
        start_tag = ["<%s" % self.tag]
        start_tag.extend(
            ' {}="{}"'.format(name, val) for name, val in self.attrib.items()
        )
        start_tag.append(">")
        file.write("".join(start_tag))

        if self.text is not None:
            file.write(self.text)
        for child in self._children:
            child.write(file)
        file.write("</%s>" % self.tag)

        if self.tail is not None:
            file.write(self.tail)


class ElementTree:
    """Thin document wrapper around a root element."""

    def __init__(self, root):
        # The root is stored as-is; no copy or validation is performed.
        self.root = root

    def getroot(self):
        """Return the root element given at construction time."""
        return self.root

    def write(self, file):
        """Serialize the root element to *file*, followed by a newline."""
        root = self.root
        root.write(file)
        file.write("\n")


def parse_el(stream):
    """Build an element tree from an XML text stream and return its root.

    Args:
        stream: passed unchanged to ``xmltok2.tokenize()``.

    Returns:
        The most recently opened top-level Element, or None when the input
        contains no start tag at all.

    Raises:
        ParseError: if a closing tag does not match the innermost open
            element, if a closing tag appears while nothing is open, or if
            text appears before any element has been opened.
    """
    stack = []     # currently open elements, innermost last
    root = None
    last = None    # most recently closed element; following text is its tail
    in_pi = False  # True while reading attributes of a processing instruction

    for ev in xmltok2.tokenize(stream):
        typ = ev[0]

        if typ == xmltok2.ATTR:
            # Ignore attrs of processing instructions, including ones that
            # are nested inside an open element.
            if stack and not in_pi:
                stack[-1].attrib[ev[2]] = ev[3]
            continue

        # Any non-attribute token ends the attribute run of a PI.
        in_pi = typ == xmltok2.PI

        if typ == xmltok2.START_TAG:
            el = Element()
            el.tag = ev[2]
            if not stack:
                root = el
            else:
                stack[-1]._children.append(el)
            stack.append(el)
            last = None

        elif typ == xmltok2.TEXT:
            if last is not None:
                # Text after a closed element belongs to that element's tail.
                last.tail = ev[1]
            elif stack:
                stack[-1].text = ev[1]
            else:
                raise ParseError("text outside of root element")

        elif typ == xmltok2.END_TAG:
            if not stack:
                raise ParseError("unexpected closing tag: /%s" % ev[2])
            if stack[-1].tag != ev[2]:
                # ev[2] is the tag name; ev[1] is the (often empty) namespace.
                raise ParseError(
                    "mismatched tag: /%s (expected: /%s)" % (ev[2], stack[-1].tag)
                )
            last = stack.pop()

    return root


def parse(source):
    """Parse the XML stream *source* and return it wrapped in an ElementTree."""
    root = parse_el(source)
    return ElementTree(root)


def fromstring(data):
    """Parse the XML text *data* and return the root element (no wrapper)."""
    # parse_el() consumes a stream, so expose the string through StringIO.
    return parse_el(io.StringIO(data))

## some-demo.xml
<?xml version="1.0" encoding="UTF-8"?>
<note>
  <to>Tove</to>
  <from>Jani</from>
  <heading color="red">Reminder</heading>
  <body class="important">Don't forget me this weekend!</body>
</note>

## xmltok2.py
# This file is part of the standard library of Pycopy project, minimalist
# and lightweight Python implementation.
#
# https://github.com/pfalcon/pycopy
# https://github.com/pfalcon/pycopy-lib
#
# The MIT License (MIT)
#
# Copyright (c) 2018-2019 Paul Sokolovsky
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.

# Token type tags. Every token is a 4-slot list whose first slot is one of
# these values.
TEXT = "TEXT"
START_TAG = "START_TAG"
#START_TAG_DONE = "START_TAG_DONE"
END_TAG = "END_TAG"
PI = "PI"
#PI_DONE = "PI_DONE"
ATTR = "ATTR"
#ATTR_VAL = "ATTR_VAL"

class XMLSyntaxError(Exception):
    """Raised by XMLTokenizer when the input is not well-formed."""

class XMLTokenizer:
    """Pull tokenizer that reads XML one character at a time from a stream.

    Token layouts produced by tokenize():
        [START_TAG, namespace, name, None]
        [END_TAG,   namespace, name, None]
        [ATTR,      namespace, name, value]
        [PI,        name, <leftover>, None]
        [TEXT,      text, None, None]

    The SAME list object is yielded for every token and mutated in place,
    so consumers must copy whatever they want to keep before advancing.
    """

    def __init__(self, f):
        """Wrap *f*, an object whose ``read(1)`` returns one character."""
        self.f = f
        self.c = ""      # one-character lookahead; "" means end of input
        self.nextch()

    def getch(self):
        """Return the lookahead character and advance to the next one."""
        c = self.c
        self.nextch()
        return c

    def eof(self):
        """Return True once the input is exhausted."""
        return self.c == ""

    def nextch(self):
        """Load the next character of the stream into the lookahead."""
        self.c = self.f.read(1)

    def skip_ws(self):
        """Advance past any whitespace characters."""
        while self.c.isspace():
            self.nextch()

    def isident(self):
        """Skip whitespace and report whether a letter comes next."""
        self.skip_ws()
        return self.c.isalpha()

    def getident(self):
        """Skip whitespace and read a run of letters, digits, '_', '-', '.'."""
        self.skip_ws()
        ident = ""
        while self.c:
            c = self.c
            if not(c.isalpha() or c.isdigit() or c in "_-."):
                break
            ident += self.getch()
        return ident

    def putnsident(self, res):
        """Read ``name`` or ``ns:name`` and store it in res[1] / res[2]."""
        ns = ""
        ident = self.getident()
        if self.c == ":":
            self.nextch()
            ns = ident
            ident = self.getident()
        res[1] = ns
        res[2] = ident

    def match(self, c):
        """Skip whitespace, then consume *c* if it is next; return success."""
        self.skip_ws()
        if self.c == c:
            self.nextch()
            return True
        return False

    def expect(self, c):
        """Like match(), but raise XMLSyntaxError when *c* is not next."""
        if not self.match(c):
            raise XMLSyntaxError

    def lex_attrs_till(self, res):
        """Yield one ATTR token per ``name="value"`` pair that follows.

        Stops (without consuming) at the first character that cannot start
        an attribute name.

        Raises:
            XMLSyntaxError: on a missing '=', a missing quote, or an
                attribute value that is still open at end of input.
        """
        while self.isident():
            res[0] = ATTR
            self.putnsident(res)
            self.expect("=")
            # XML permits whitespace between "=" and the opening quote.
            self.skip_ws()
            quote = self.getch()
            if quote != '"' and quote != "'":
                raise XMLSyntaxError
            val = ""
            while self.c != quote:
                if self.eof():
                    # Unterminated value: getch() would return "" forever
                    # and this loop would never end.
                    raise XMLSyntaxError
                val += self.getch()
            self.expect(quote)
            res[3] = val
            yield res
            res[3] = None

    def tokenize(self):
        """Generate tokens for the whole stream (see class docstring).

        Whitespace directly before a '<' is skipped, so whitespace-only
        text and leading whitespace of text runs are never reported.
        Comments are consumed silently. A self-closing tag yields a
        START_TAG followed by a matching END_TAG.

        Raises:
            XMLSyntaxError: on malformed markup, including a comment that
                is still open at end of input.
        """
        res = [None, None, None, None]
        while not self.eof():
            if self.match("<"):
                if self.match("/"):
                    res[0] = END_TAG
                    self.putnsident(res)
                    yield res
                    self.expect(">")
                elif self.match("?"):
                    res[0] = PI
                    res[1] = self.getident()
                    yield res
                    yield from self.lex_attrs_till(res)
                    self.expect("?")
                    self.expect(">")
                elif self.match("!"):
                    self.expect("-")
                    self.expect("-")
                    # Slide a 3-character window until the terminator shows up.
                    last3 = ''
                    while True:
                        if self.eof():
                            # Unterminated comment: avoid spinning forever.
                            raise XMLSyntaxError
                        last3 = last3[-2:] + self.getch()
                        if last3 == "-->":
                            break
                else:
                    res[0] = START_TAG
                    self.putnsident(res)
                    # Remember the name: attribute tokens overwrite res[1:3].
                    ns = res[1]
                    tag = res[2]
                    yield res
                    yield from self.lex_attrs_till(res)
                    if self.match("/"):
                        res[0] = END_TAG
                        res[1] = ns
                        res[2] = tag
                        yield res
                    self.expect(">")
            else:
                text = ""
                while self.c and self.c != "<":
                    text += self.getch()
                if text:
                    res[0] = TEXT
                    res[1] = text
                    res[2] = None
                    yield res


def gfind(gen, pred):
    """Return the first item of *gen* for which *pred* is true.

    Iteration stops at the match, so an iterator passed in is left
    positioned right after it. Returns None when nothing matches.
    """
    for item in gen:
        if not pred(item):
            continue
        return item
    return None

def text_of(gen, tag):
    """Return the text content of a leaf tag from a tokenizer stream.

    *gen* is advanced to the first START_TAG matching *tag*, then the very
    next token is taken as the element's text.

    Args:
        gen: token iterator.
        tag: tag name, or a ``(namespace, name)`` tuple.
    """
    def is_wanted_start(tok):
        if tok[0] == START_TAG:
            if isinstance(tag, tuple):
                return tok[1] == tag[0] and tok[2] == tag[1]
            return tok[2] == tag
        return False

    gfind(gen, is_wanted_start)
    # Assumes no attributes: the token right after the start tag is the text.
    following = next(gen)
    assert following[0] == TEXT
    return following[1]

def tokenize(file):
    """Return a token generator over *file* (convenience for XMLTokenizer)."""
    tokenizer = XMLTokenizer(file)
    return tokenizer.tokenize()
	# SPDX-FileCopyrightText: Copyright (c) 2022 Neradoc
	# SPDX-License-Identifier: Unlicense

	import sys
	from ElementTree import parse

	# Parse the demo document; the file is closed once the `with` block exits.
	with open("some-demo.xml", "r") as fp:
	    tree = parse(fp)

	# Print the tree wrapper object itself.
	print(tree)

	def print_sub_tree(node, depth=0):
	    """Print *node* and all of its descendants as an indented outline.

	    Each element produces one "-" line with its tag and (quoted) text,
	    followed by one "|" line per attribute; children are printed two
	    columns deeper.

	    Args:
	        node: element exposing ``tag``, ``text``, ``attrib`` and
	            iteration over its children.
	        depth: number of spaces used as the indentation prefix.
	    """
	    if node.text is not None:
	        text = '"' + node.text + '"'
	    else:
	        text = ""
	    print(" "*depth, "-", node.tag, text)
	    for key, value in node.attrib.items():
	        # Plain pipe: the backslash in the flattened text was an escaping
	        # artifact and would have been printed literally.
	        print(" "*depth, "|", key, ":", value)
	    for subnode in node:
	        print_sub_tree(subnode, depth+2)

	# Dump the whole parsed document, starting at its root element.
	print_sub_tree(tree.getroot())
	# This file is part of the standard library of Pycopy project, minimalist
	# and lightweight Python implementation.
	#
	# https://github.com/pfalcon/pycopy
	# https://github.com/pfalcon/pycopy-lib
	#
	# The MIT License (MIT)
	#
	# Copyright (c) 2018-2020 Paul Sokolovsky
	#
	# Permission is hereby granted, free of charge, to any person obtaining a copy
	# of this software and associated documentation files (the "Software"), to deal
	# in the Software without restriction, including without limitation the rights
	# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
	# copies of the Software, and to permit persons to whom the Software is
	# furnished to do so, subject to the following conditions:
	#
	# The above copyright notice and this permission notice shall be included in
	# all copies or substantial portions of the Software.
	#
	# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
	# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
	# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
	# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
	# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
	# THE SOFTWARE.

	import io
	import xmltok2


	class ParseError(Exception):
	    """Raised when the XML being parsed is structurally invalid."""


	class Element:
	    """One XML element: a tag, its attributes, text, tail and children.

	    Children are reachable by index and ``len()``; iterating an element
	    walks its children through the sequence protocol (``__getitem__``).
	    """

	    def __init__(self):
	        self.tag = None       # element name; must be set before write()
	        self.attrib = {}      # attribute name -> value
	        self.text = None      # text between the start tag and first child
	        self.tail = None      # text between the end tag and next sibling
	        self._children = []   # child elements in document order

	    def __getitem__(self, i):
	        """Return the child (or slice of children) at position *i*."""
	        return self._children[i]

	    def __len__(self):
	        """Return the number of direct children."""
	        return len(self._children)

	    def append(self, el):
	        """Add *el* as the last child."""
	        self._children.append(el)

	    def get(self, key, default=None):
	        """Return attribute *key*, or *default* if it is not present."""
	        return self.attrib.get(key, default)

	    def set(self, key, value):
	        """Set attribute *key* to *value*."""
	        self.attrib[key] = value

	    def write(self, file):
	        """Serialize this element, its subtree and its tail to *file*.

	        Text and attribute values are written verbatim (no escaping),
	        and an explicit closing tag is always emitted.
	        """
	        assert self.tag is not None
	        file.write("<%s" % self.tag)
	        for k, v in self.attrib.items():
	            file.write(' {}="{}"'.format(k, v))
	        file.write(">")
	        if self.text is not None:
	            file.write(self.text)
	        for t in self._children:
	            t.write(file)
	        file.write("</%s>" % self.tag)
	        if self.tail is not None:
	            file.write(self.tail)


	class ElementTree:
	    """Thin document wrapper around a root element."""

	    def __init__(self, root):
	        # The root is stored as-is; no copy or validation is performed.
	        self.root = root

	    def getroot(self):
	        """Return the root element given at construction time."""
	        return self.root

	    def write(self, file):
	        """Serialize the root element to *file*, followed by a newline."""
	        self.root.write(file)
	        file.write("\n")


	def parse_el(stream):
	    """Build an element tree from an XML text stream and return its root.

	    Args:
	        stream: passed unchanged to ``xmltok2.tokenize()``.

	    Returns:
	        The most recently opened top-level Element, or None when the
	        input contains no start tag at all.

	    Raises:
	        ParseError: if a closing tag does not match the innermost open
	            element, if a closing tag appears while nothing is open, or
	            if text appears before any element has been opened.
	    """
	    stack = []     # currently open elements, innermost last
	    root = None
	    last = None    # most recently closed element; next text is its tail
	    in_pi = False  # True while reading attributes of a processing instruction

	    for ev in xmltok2.tokenize(stream):
	        typ = ev[0]

	        if typ == xmltok2.ATTR:
	            # Ignore attrs of processing instructions, including ones
	            # that are nested inside an open element.
	            if stack and not in_pi:
	                stack[-1].attrib[ev[2]] = ev[3]
	            continue

	        # Any non-attribute token ends the attribute run of a PI.
	        in_pi = typ == xmltok2.PI

	        if typ == xmltok2.START_TAG:
	            el = Element()
	            el.tag = ev[2]
	            if not stack:
	                root = el
	            else:
	                stack[-1]._children.append(el)
	            stack.append(el)
	            last = None

	        elif typ == xmltok2.TEXT:
	            if last is not None:
	                # Text after a closed element is that element's tail.
	                last.tail = ev[1]
	            elif stack:
	                stack[-1].text = ev[1]
	            else:
	                raise ParseError("text outside of root element")

	        elif typ == xmltok2.END_TAG:
	            if not stack:
	                raise ParseError("unexpected closing tag: /%s" % ev[2])
	            if stack[-1].tag != ev[2]:
	                # ev[2] is the tag name; ev[1] is the namespace string.
	                raise ParseError(
	                    "mismatched tag: /%s (expected: /%s)" % (ev[2], stack[-1].tag)
	                )
	            last = stack.pop()

	    return root


	def parse(source):
	    """Parse the XML stream *source* and return it wrapped in an ElementTree."""
	    return ElementTree(parse_el(source))


	def fromstring(data):
	    """Parse the XML text *data* and return the root element (no wrapper)."""
	    # parse_el() consumes a stream, so expose the string through StringIO.
	    buf = io.StringIO(data)
	    return parse_el(buf)
	<?xml version="1.0" encoding="UTF-8"?>
	<note>
	<to>Tove</to>
	<from>Jani</from>
	<heading color="red">Reminder</heading>
	<body class="important">Don't forget me this weekend!</body>
	</note>