Skip to content

Instantly share code, notes, and snippets.

@CallMarl
Last active August 26, 2022 10:30
Show Gist options
  • Save CallMarl/5ebebf03a5ae3b2c06f34aec567f6f6a to your computer and use it in GitHub Desktop.
Save CallMarl/5ebebf03a5ae3b2c06f34aec567f6f6a to your computer and use it in GitHub Desktop.
Simple Python HTML parser implementation (example)
from html.parser import HTMLParser
class Node:
    """One opening tag met while parsing.

    Attributes:
        initial: the bare tag name, as passed in (e.g. ``a``).
        tag: the tag name wrapped in angle brackets, without attributes
            (e.g. ``<a>``).
        tag_text: the text supplied for the opening tag (e.g.
            ``<a href=''>``).
    """

    def __init__(self, tag, tag_text):
        self.tag_text = tag_text
        self.initial = tag
        self.tag = "<{}>".format(tag)

    def __str__(self):
        """Return the stored opening-tag text as a string."""
        return str(self.tag_text)
class HTMLNodeTree:
    """A tree of parsed tags and text that can be serialised back to a string.

    Each tree holds an optional ``node`` (an object exposing ``tag`` and
    ``tag_text`` attributes) and an ordered ``data`` list whose items are
    either plain strings or child ``HTMLNodeTree`` instances.
    """

    # Class-level defaults kept for backward compatibility.  ``data``,
    # ``is_parent``, ``level``, ``node`` and ``parent`` are all rebound per
    # instance in ``__init__``, so the mutable defaults below are never shared
    # between instances created through the constructor.  ``child`` and
    # ``is_child`` are not used by any method of this class.
    child = -1
    data = []
    is_child = False
    is_parent = False
    level = -1
    node = {}
    parent = {}

    def __init__(self, *args, parent=None, level=-1, node=None, **kwargs):
        """Create a tree.

        Args:
            *args: accepted for compatibility and ignored.
            parent: the enclosing tree, or ``None`` for a root.
            level: depth of the parent; this tree is stored one level deeper.
            node: the tag object this tree represents, or ``None``.
            **kwargs: ``beautify`` (default ``0``) selects the output form
                used by ``to_string``; other keys are ignored.
        """
        # Despite its name, ``is_parent`` records whether this tree HAS a
        # parent (i.e. is not a root); see ``get_parent``.
        self.is_parent = parent is not None
        self.data = []
        self.level = level + 1
        self.parent = parent
        self.node = node
        self.beautify = kwargs.get('beautify', 0)

    def get_node(self):
        """Return the tag object attached to this tree (``None`` if absent)."""
        return self.node

    def add_node(self, node):
        """Append a child tree wrapping ``node`` and return that child.

        The child inherits this tree's ``beautify`` setting.
        """
        child = HTMLNodeTree(
            parent=self,
            level=self.level,
            node=node,
            beautify=self.beautify,
        )
        self.data.append(child)
        return child

    def add_data(self, data):
        """Append ``data`` (normally a string) to this tree's content."""
        self.data.append(data)

    def get_parent(self):
        """Return the parent tree, or this tree itself when it has none."""
        return self.parent if self.is_parent else self

    def to_string(self):
        """Serialise this tree and all of its descendants.

        With ``beautify == 0`` the node's ``tag_text`` is emitted; otherwise
        its ``tag``.  Items of ``data`` that are neither strings nor trees
        are skipped.
        """
        parts = []
        if self.node is not None:
            parts.append(
                self.node.tag_text if self.beautify == 0 else self.node.tag
            )
        for item in self.data:
            # isinstance (rather than an exact type comparison) keeps str
            # subclasses and lets a subclassed tree still see the plain
            # HTMLNodeTree children that add_node creates.
            if isinstance(item, str):
                parts.append(item)
            elif isinstance(item, HTMLNodeTree):
                parts.append(item.to_string())
        return "".join(parts)

    def length(self):
        """Return the length of the serialised tree."""
        return len(self.to_string())

    def __str__(self):
        return self.to_string()
class MyHTMLParser(HTMLParser):
    """HTMLParser subclass that records tags and text in an HTMLNodeTree.

    ``self.tree`` is a cursor: it points at the tree of the element that is
    currently open and moves back towards the root as elements are closed.

    Keyword Args:
        beautify: forwarded to the root ``HTMLNodeTree`` (default ``0``).
            All other arguments are passed on to ``HTMLParser``.
    """

    # Elements that never have content or a closing tag.  ``param`` is a
    # legacy void element kept here for older documents.
    VOID_ELEMENTS = frozenset((
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
        'link', 'meta', 'param', 'source', 'track', 'wbr',
    ))

    def __init__(self, *args, **kwargs):
        # ``beautify`` must be popped before delegating: HTMLParser does not
        # accept it.
        self.tree = HTMLNodeTree(beautify=kwargs.pop('beautify', 0))
        super().__init__(*args, **kwargs)

    def handle_starttag(self, tag, attrs):
        """Record an opening tag and descend into it unless it is void."""
        node = Node(tag=tag, tag_text=self.get_starttag_text())
        child = self.tree.add_node(node)
        # A void element cannot contain anything, so the cursor stays on the
        # current element; otherwise ``<br>`` (written without a slash)
        # would swallow the siblings that follow it.
        if tag not in self.VOID_ELEMENTS:
            self.tree = child

    def handle_endtag(self, tag):
        """Close the nearest open element named ``tag``.

        Elements left open between the cursor and the match are closed
        implicitly (no closing text is emitted for them).  End tags for void
        elements, and end tags with no matching open element, are ignored so
        that a stray ``</x>`` cannot unwind the whole tree.
        """
        if tag in self.VOID_ELEMENTS:
            # Void elements are never descended into, so there is nothing to
            # close.  This also covers the end-tag call HTMLParser makes for
            # self-closing syntax such as ``<br />``.
            return

        cursor = self.tree
        while True:
            node = cursor.get_node()
            if node is None:
                # Reached a tree without a tag (the root): no open element
                # matches, so leave the cursor where it was.
                return
            if node.initial == tag:
                break
            parent = cursor.get_parent()
            if parent is cursor:
                # Defensive: a tree that is its own parent cannot be climbed.
                return
            cursor = parent

        cursor.add_data("</%s>" % tag)
        self.tree = cursor.get_parent()

    def handle_data(self, data):
        """Append text content to the element currently open."""
        self.tree.add_data(data)

    def __str__(self):
        # NOTE(review): printing from __str__ is a side effect; it is kept
        # because the script in this file relies on this banner appearing
        # when the parser is printed.
        print("Display tree :\n")
        return "\n" + str(self.tree)
### Script entry point: parse a sample document and display it twice.
#
source = (
    "<html><head><title>Test</title></head>"
    "<body><br /><h1>\nParse me!"
    "<a href=''>Hell\noo</a> \nHum...</h1>"
    "</body></html>"
)
print("Initial length : %d" % len(source))

### The beautify level changes how the tree is displayed: the first run uses
### the default level, the second run passes beautify = 1.
#
separator = "---------------------------------------------"
runs = (
    ({}, "Beautify 0 length : %d"),
    ({"beautify": 1}, "Beautify 1 length : %d"),
)
for run_index, (parser_options, length_label) in enumerate(runs):
    if run_index:
        # Blank line between consecutive runs.
        print("")
    print(separator)
    parser = MyHTMLParser(**parser_options)
    parser.feed(source)
    print(length_label % parser.tree.length())
    print(parser)
    print(separator)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment