Created
April 24, 2021 14:40
-
-
Save gruentee/203e4ba3791581070df9a4b1e6c55549 to your computer and use it in GitHub Desktop.
Create XPath expressions from BeautifulSoup4 node objects, inspired by https://stackoverflow.com/a/32263260/1604622
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from typing import Union

from bs4 import BeautifulSoup, Tag, NavigableString, Comment
from lxml import etree
import pytest
def node_to_xpath(node):
    """Return the XPath location step that selects *node* below its parent.

    Tags map to their tag name, comments to ``comment()`` and plain text
    nodes to ``text()``.  A 1-based positional predicate (``li[2]``,
    ``comment()[1]``) is appended only when the parent holds more than one
    child of the same kind.

    Raises:
        KeyError: if ``type(node)`` is not exactly ``Tag``, ``Comment`` or
            ``NavigableString``.
    """
    step_by_type = {
        Tag: node.name,
        Comment: "comment()",
        NavigableString: "text()",
    }
    step = step_by_type[type(node)]

    parent = node.parent
    if parent is None:
        # A detached node has no siblings to disambiguate against.
        return step

    if isinstance(node, Tag):
        same_kind = [
            child for child in parent.children
            if isinstance(child, Tag) and child.name == node.name
        ]
    else:
        # Exact type match on purpose: Comment subclasses NavigableString,
        # and comments must not be counted as text siblings.
        same_kind = [
            child for child in parent.children if type(child) is type(node)
        ]

    if len(same_kind) <= 1:
        return step

    # Locate the node by identity.  list.index() compares with ==, and bs4
    # considers nodes with equal name/attributes/contents to be equal, which
    # would report the first look-alike sibling's position instead.
    pos = next(
        idx for idx, child in enumerate(same_kind, start=1) if child is node
    )
    return f"{step}[{pos}]"
def get_node_xpath(node: Union[Tag, Comment]) -> str:
    """Build an absolute XPath expression for *node*.

    The location step of the node itself and of each of its ancestors is
    obtained from ``node_to_xpath``; the steps are joined with ``/`` and
    prefixed with a leading ``/``.  Walking up the ancestor chain stops at
    the ancestor whose name is ``"[document]"``, which is not included in
    the path.
    """
    steps = [node_to_xpath(node)]
    for ancestor in node.parents:
        if ancestor.name == "[document]":
            break
        steps.append(node_to_xpath(ancestor))
    # Steps were collected leaf-to-root; XPath reads root-to-leaf.
    steps.reverse()
    return "/" + "/".join(steps)
def test_get_node_xpath():
    """Check generated paths against lxml for a small HTML document.

    Every Tag and Comment found by BeautifulSoup must yield an expression
    that selects exactly one node in the lxml tree, and the full list of
    expressions must equal what lxml's ``getpath`` reports for the nodes
    returned by ``iter()``.
    """
    html = """<!DOCTYPE html><html lang="en">
<head>
<meta charset="UTF-8">
<title>Test</title>
</head>
<body>
test
<!-- This is a comment -->
<div>
<ul>
<li><a href="#">1</li>
<li><a href="#">2</li>
<li><a href="#">3</li>
<li><a href="#">4</li>
</ul>
</div>
</body>
</html>"""
    soup = BeautifulSoup(html, "lxml")
    dom_lxml = etree.HTML(html)
    tree = dom_lxml.getroottree()

    nodes = [n for n in soup.descendants if isinstance(n, (Tag, Comment))]
    paths = [get_node_xpath(n) for n in nodes]
    xpaths = [tree.getpath(element) for element in dom_lxml.iter()]

    # Each expression has to resolve to one node; an empty result means the
    # expression did not match, more than one means it is ambiguous.
    for expression in paths:
        matches = tree.xpath(expression)
        assert len(matches) == 1, (
            f"expected exactly one match for {expression!r}, "
            f"got {len(matches)}"
        )

    assert paths == xpaths
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment