Last active
August 26, 2022 10:30
-
-
Save CallMarl/5ebebf03a5ae3b2c06f34aec567f6f6a to your computer and use it in GitHub Desktop.
Python simple HTML Parser implementation (example)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from html.parser import HTMLParser | |
class Node:
    """A start tag recorded while parsing.

    Attributes:
        initial: the bare tag name exactly as passed in (e.g. ``"a"``).
        tag: the tag name wrapped in angle brackets, without attributes.
        tag_text: the text passed in as the tag's source form.
    """

    def __init__(self, tag, tag_text):
        self.tag_text = tag_text
        self.initial = tag
        self.tag = "<%s>" % tag

    def __str__(self):
        # Rendering a node yields only the text it was created with.
        return "%s" % self.tag_text
class HTMLNodeTree:
    """One level of a document tree: an optional tag plus its ordered content.

    ``data`` holds, in insertion order, either plain strings or nested
    ``HTMLNodeTree`` instances. ``level`` is the depth of this tree (a tree
    built with the default arguments has level 0; each ``add_node`` child is
    one deeper). ``beautify`` selects how the tag is rendered by
    ``to_string``: ``0`` uses ``node.tag_text``, any other value uses
    ``node.tag``.
    """

    # Class-level fallbacks. ``__init__`` overrides all of these per instance
    # (``child`` and ``is_child`` are never written by this class). The
    # fallbacks are immutable so that nothing can be shared between instances
    # by accident.
    child = -1
    data = ()
    is_child = False
    is_parent = False
    level = -1
    node = None
    parent = None
    beautify = 0

    def __init__(self, *args, parent=None, level=-1, node=None, **kwargs):
        """Create a tree level.

        Args:
            *args: accepted and ignored.
            parent: the enclosing tree, or ``None`` for a root.
            level: depth of the enclosing tree; this tree is one deeper.
            node: object exposing ``tag`` and ``tag_text``, or ``None``.
            **kwargs: only ``beautify`` is read (default ``0``).
        """
        # NOTE: despite its name, ``is_parent`` records "this tree HAS a parent".
        self.is_parent = parent is not None
        self.data = []
        self.level = level + 1
        self.parent = parent
        self.node = node
        self.beautify = kwargs.get('beautify', 0)

    def get_node(self):
        """Return the node attached to this tree (``None`` if there is none)."""
        return self.node

    def add_node(self, node):
        """Append a child tree wrapping *node* and return that child."""
        child = HTMLNodeTree(
            parent=self,
            level=self.level,
            node=node,
            beautify=self.beautify,
        )
        self.data.append(child)
        return child

    def add_data(self, data):
        """Append a content item to this tree."""
        self.data.append(data)

    def get_parent(self):
        """Return the enclosing tree, or this tree itself when it has none."""
        return self.parent if self.is_parent else self

    def to_string(self):
        """Render this tree and everything below it as one string.

        Items in ``data`` that are neither strings nor trees are skipped.
        """
        parts = []
        if self.node is not None:
            parts.append(
                self.node.tag_text if self.beautify == 0 else self.node.tag
            )
        for item in self.data:
            # isinstance (rather than an exact type comparison) keeps
            # rendering correct when this class or str is subclassed.
            if isinstance(item, str):
                parts.append(item)
            elif isinstance(item, HTMLNodeTree):
                parts.append(item.to_string())
        return "".join(parts)

    def length(self):
        """Return the number of characters in the rendered tree."""
        return len(self.to_string())

    def __str__(self):
        return self.to_string()
class MyHTMLParser(HTMLParser):
    """HTMLParser that records start tags, end tags and text in a node tree.

    ``root`` is the top of the tree; ``tree`` is the cursor, i.e. the element
    currently being filled. Pass ``beautify=<int>`` to choose how start tags
    are rendered by the tree (``0``, the default, keeps the tag as written).
    All other arguments are forwarded to ``HTMLParser``.

    Only start tags, end tags and text are recorded; other constructs are
    handled by the ``HTMLParser`` defaults.
    """

    # Elements that never have content or an end tag. They are stored as
    # leaves so that following siblings are not nested inside them.
    VOID_ELEMENTS = frozenset((
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img',
        'input', 'link', 'meta', 'source', 'track', 'wbr',
    ))

    def __init__(self, *args, **kwargs):
        # 'beautify' must be removed before HTMLParser sees the kwargs.
        self.root = HTMLNodeTree(beautify=kwargs.pop('beautify', 0))
        self.tree = self.root
        super().__init__(*args, **kwargs)

    def handle_starttag(self, tag, attrs):
        """Record a start tag; descend into it unless it is a void element."""
        node = Node(tag=tag, tag_text=self.get_starttag_text())
        child = self.tree.add_node(node)
        if tag not in self.VOID_ELEMENTS:
            self.tree = child

    def handle_endtag(self, tag):
        """Close the nearest open element named *tag*.

        Elements left open between the cursor and the match are closed
        implicitly (no end tag is emitted for them). An end tag for a void
        element, or one that matches no open element, is ignored and the
        cursor is left where it was.
        """
        if tag in self.VOID_ELEMENTS:
            return
        cursor = self.tree
        while True:
            node = cursor.get_node()
            if not node:
                # Reached a tree without a tag (the root): nothing to close.
                return
            if node.initial == tag:
                break
            cursor = cursor.get_parent()
        cursor.add_data("</%s>" % tag)
        self.tree = cursor.get_parent()

    def handle_data(self, data):
        """Record text under the element currently being filled."""
        self.tree.add_data(data)

    def __str__(self):
        # NOTE: printing a header from __str__ is a side effect; it is kept
        # because existing output depends on it.
        print("Display tree :\n")
        # Render from the root so leading content is not lost when some
        # elements were never closed.
        return "\n" + str(self.root)
### Demo entry point
#
# Parse one sample document and print it back once per beautify setting.
source = "<html><head><title>Test</title></head><body><br /><h1>\nParse me!<a href=''>Hell\noo</a> \nHum...</h1></body></html>"
separator = "---------------------------------------------"

print("Initial length : %d" % len(source))

### The beautify setting only changes how the parsed tree is displayed.
#
runs = (
    ({}, "Beautify 0 length : %d"),
    ({"beautify": 1}, "Beautify 1 length : %d"),
)
for index, (options, report) in enumerate(runs):
    if index:
        # Blank line between consecutive reports.
        print("")
    print(separator)
    parser = MyHTMLParser(**options)
    parser.feed(source)
    print(report % parser.tree.length())
    print(parser)
    print(separator)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment