Skip to content

Instantly share code, notes, and snippets.

@benoit74
Created May 16, 2024 13:23
Show Gist options
  • Save benoit74/353a203270003acc23471b3d8fb1a92c to your computer and use it in GitHub Desktop.
Save benoit74/353a203270003acc23471b3d8fb1a92c to your computer and use it in GitHub Desktop.
Test Python and lxml HTML parsers
# Sample inputs handed to each parser, as (label, markup) pairs.
CONTENTS = [
    # A small but complete document.
    (
        "content1",
        "<html><head><title>Test</title></head>"
        "<body><h1>Parse me!</h1></body></html>",
    ),
    # Attribute value holding a raw "&" followed by "aparam2".
    (
        "content2",
        '<img width="200" src="image.png?param1=value1&aparam2=value2" />',
    ),
    # Attribute value holding a raw "&" followed by "param2"; the text right
    # after the ampersand starts like the named character reference "para".
    (
        "content3",
        '<img width="200" src="image.png?param1=value1&param2=value2" />',
    ),
]
from html.parser import HTMLParser
import io
from lxml import etree
import difflib
import sys
AttrsList = list[tuple[str, str | None]]
class MyHTMLParser(HTMLParser):
def __init__(self):
super().__init__()
self.output = io.StringIO()
def send(self, value: str):
self.output.write(value)
def format_attr(self, name: str, value: str | None) -> str:
if value is None:
return name
return f'{name}="{value}"'
def handle_starttag(self, tag: str, attrs: AttrsList, *, auto_close: bool = False):
self.send(f"<{tag}")
if attrs:
self.send(" ")
self.send(" ".join(self.format_attr(*attr) for attr in attrs))
if auto_close:
self.send(" />")
else:
self.send(">")
def handle_endtag(self, tag: str):
self.send(f"</{tag}>")
def handle_startendtag(self, tag: str, attrs: AttrsList):
self.handle_starttag(tag, attrs, auto_close=True)
def handle_data(self, data):
self.send(data)
def handle_entityref(self, name: str):
self.send(f"&{name};")
def handle_charref(self, name: str):
self.send(f"&#{name};")
def handle_comment(self, data: str):
self.send(f"<!--{data}-->")
def handle_decl(self, decl: str):
self.send(f"<!{decl}>")
def handle_pi(self, data: str):
self.send(f"<?{data}>")
class MyLxmlParser(object):
def __init__(self):
super().__init__()
self.output = io.StringIO()
def send(self, value: str):
self.output.write(value)
def format_attr(self, name: str, value: str | None) -> str:
if value is None:
return name
return f'{name}="{value}"'
def start(self, tag: str, attrs: AttrsList, *, auto_close: bool = False):
self.send(f"<{tag}")
if attrs:
self.send(" ")
self.send(
" ".join(
self.format_attr(attr_name, attr_value)
for attr_name, attr_value in dict(attrs).items()
)
)
if auto_close:
self.send(" />")
else:
self.send(">")
def end(self, tag: str):
self.send(f"</{tag}>")
def data(self, data):
self.send(data)
def comment(self, data: str):
self.send(f"<!--{data}-->")
def close(self):
pass
# Round-trip every sample through the standard-library HTML parser and report
# whether the re-serialized markup equals the input.
print(
    "###################################\n"
    "# Parsing with Python HTML parser #\n"
    "###################################"
)
for content_name, content_value in CONTENTS:
    print(f"\033[93mProcessing {content_name} \033[0m")
    parser = MyHTMLParser()
    parser.feed(content_value)
    # close() forces processing of anything the parser is still buffering;
    # without it the tail of some inputs could be missing from the output.
    parser.close()
    content_parsed = parser.output.getvalue()
    if content_value == content_parsed:
        print(f" {content_value}")
        print(" \033[92mParsing is transparent \033[0m")
    else:
        # Show a line-by-line diff between the input and what came back.
        print("- expected")
        print("+ actual")
        differ = difflib.Differ()
        diffs = list(
            differ.compare(content_value.splitlines(), content_parsed.splitlines())
        )
        print("\n".join(f" {diff}" for diff in diffs))
        print(" \033[91mParsing is not transparent \033[0m")
    print("")
# Round-trip every sample through lxml's HTML parser (driving a MyLxmlParser
# target) and report whether the re-serialized markup equals the input.
print(
    "#################################\n"
    "# Parsing with LXML HTML parser #\n"
    "#################################"
)
for content_name, content_value in CONTENTS:
    print(f"\033[93mProcessing {content_name} \033[0m")
    target = MyLxmlParser()
    parser = etree.HTMLParser(target=target)
    parser.feed(content_value)
    # The feed interface is only finished once close() is called; until then
    # some events (for instance trailing end tags) may not have reached the
    # target yet.
    parser.close()
    content_parsed = target.output.getvalue()
    if content_value == content_parsed:
        print(f" {content_value}")
        print(" \033[92mParsing is transparent \033[0m")
    else:
        # Show a line-by-line diff between the input and what came back.
        print("- expected")
        print("+ actual")
        differ = difflib.Differ()
        diffs = list(
            differ.compare(content_value.splitlines(), content_parsed.splitlines())
        )
        print("\n".join(f" {diff}" for diff in diffs))
        print(" \033[91mParsing is not transparent \033[0m")
    print("")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment