Skip to content

Instantly share code, notes, and snippets.

@gruentee
Created April 24, 2021 14:40
Show Gist options
  • Save gruentee/203e4ba3791581070df9a4b1e6c55549 to your computer and use it in GitHub Desktop.
Save gruentee/203e4ba3791581070df9a4b1e6c55549 to your computer and use it in GitHub Desktop.
Create XPath expressions from BeautifulSoup4 node objects, inspired by https://stackoverflow.com/a/32263260/1604622
from typing import Union

import pytest
from bs4 import BeautifulSoup, Tag, NavigableString, Comment
from lxml import etree
def node_to_xpath(node):
    """Return the XPath location step for a single BeautifulSoup node.

    The step is the tag name for a ``Tag``, ``comment()`` for a ``Comment``
    and ``text()`` for a plain ``NavigableString``.  When the parent holds
    more than one child of the same kind, a 1-based positional predicate
    (e.g. ``li[3]``) is appended so the step selects only this node.

    Args:
        node: A ``Tag``, ``Comment`` or ``NavigableString``.

    Returns:
        The location step as a string, without a leading slash.  A node
        without a parent yields the bare step.

    Raises:
        KeyError: If ``node`` is of any other exact type.
        ValueError: If ``node`` is not among the children of its parent.
    """
    # Exact-type lookup on purpose: Comment itself derives from
    # NavigableString, so an isinstance() chain could mislabel special
    # string nodes as text().
    step_by_type = {
        Tag: getattr(node, "name", None),
        Comment: "comment()",
        NavigableString: "text()",
    }
    step = step_by_type[type(node)]

    parent = node.parent
    if parent is None:
        return step

    if isinstance(node, Tag):
        peers = [
            child for child in parent.contents
            if isinstance(child, Tag) and child.name == node.name
        ]
    else:
        # find_all() with a callable only visits tags, so string-like nodes
        # have to be collected from the parent's children directly.
        peers = [child for child in parent.contents if type(child) is type(node)]

    if len(peers) <= 1:
        return step

    # Locate the node by identity: bs4 compares tags structurally with ==,
    # so list.index() would return the first look-alike sibling instead.
    for position, peer in enumerate(peers, start=1):
        if peer is node:
            return f"{step}[{position}]"
    raise ValueError("node is not a child of its own parent")
def get_node_xpath(node: Union[Tag, Comment]) -> str:
    """Return the absolute XPath expression that leads to ``node``.

    The path is assembled from the location step of ``node`` and of each of
    its ancestors, stopping at the ``[document]`` pseudo-element that
    represents the BeautifulSoup object itself.

    Args:
        node: The tag or comment to build the expression for.

    Returns:
        A slash-separated absolute path such as ``/html/body/div/ul/li[2]``.
    """
    # Collected innermost-first, then reversed once, instead of inserting
    # every ancestor at the front of the list.
    steps = [f"{node_to_xpath(node)}"]
    for ancestor in node.parents:
        if ancestor.name == "[document]":
            break
        steps.append(node_to_xpath(ancestor))
    steps.reverse()
    return "/" + "/".join(steps)
def test_get_node_xpath():
    """Compare generated XPaths with the paths lxml reports for the same markup."""
    html = """<!DOCTYPE html><html lang="en">
<head>
<meta charset="UTF-8">
<title>Test</title>
</head>
<body>
test
<!-- This is a comment -->
<div>
<ul>
<li><a href="#">1</li>
<li><a href="#">2</li>
<li><a href="#">3</li>
<li><a href="#">4</li>
</ul>
</div>
</body>
</html>"""
    soup = BeautifulSoup(html, "lxml")
    dom_lxml = etree.HTML(html)
    tree = dom_lxml.getroottree()

    nodes = [n for n in soup.descendants if isinstance(n, (Tag, Comment))]
    paths = [get_node_xpath(node) for node in nodes]

    # Every expression must select exactly one node; comparing the number of
    # expressions with the number of result lists would always succeed.
    for expression in paths:
        matches = tree.xpath(expression)
        assert len(matches) == 1, (
            f"Expression {expression!r} matched {len(matches)} nodes, expected exactly 1."
        )

    # The generated expressions must equal lxml's own paths, in document order.
    xpaths = [tree.getpath(element) for element in dom_lxml.iter()]
    assert paths == xpaths
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment