Skip to content

Instantly share code, notes, and snippets.

Created April 24, 2021 14:40
Show Gist options
  • Save gruentee/203e4ba3791581070df9a4b1e6c55549 to your computer and use it in GitHub Desktop.
Save gruentee/203e4ba3791581070df9a4b1e6c55549 to your computer and use it in GitHub Desktop.
# Create XPath expressions from BeautifulSoup4 node objects (inspired by an
# earlier gist; the reference link is not present in this copy).
from typing import Union

from bs4 import BeautifulSoup, Tag, NavigableString, Comment
from lxml import etree
import pytest
def node_to_xpath(node):
    """Return the XPath location step for a single BeautifulSoup node.

    Tags map to their tag name, comments to ``comment()`` and plain text
    nodes to ``text()``. A 1-based positional predicate (``[n]``) is appended
    only when the parent holds more than one direct child of the same kind,
    which is the form the accompanying test compares against lxml's
    ``getpath`` output.

    Args:
        node: A bs4 ``Tag``, ``Comment`` or ``NavigableString``.

    Returns:
        The location step as a string, e.g. ``"li[2]"`` or ``"comment()"``.

    Raises:
        KeyError: If ``type(node)`` is not exactly one of the three
            supported classes.
    """
    node_tests = {
        Tag: getattr(node, "name", None),
        Comment: "comment()",
        NavigableString: "text()",
    }
    step = node_tests[type(node)]

    parent = node.parent
    if parent is None:
        # A detached node has no siblings to disambiguate against.
        return step

    # Only direct children of the parent count as siblings.
    if isinstance(node, Tag):
        siblings = [
            child for child in parent.children
            if isinstance(child, Tag) and child.name == node.name
        ]
    else:
        siblings = [
            child for child in parent.children if type(child) is type(node)
        ]

    if len(siblings) <= 1:
        return step

    # Match by identity, not equality: bs4 tags compare equal when their
    # name, attributes and contents match, so ``list.index`` would report
    # the position of the first structurally identical sibling.
    position = next(
        index for index, child in enumerate(siblings, start=1)
        if child is node
    )
    return f"{step}[{position}]"
def get_node_xpath(node: Union[Tag, Comment]):
    """Build an absolute XPath expression for a BeautifulSoup node.

    Walks from ``node`` up through its ancestors, collecting one location
    step per level via ``node_to_xpath``, and stops at the BeautifulSoup
    document object (whose name is ``"[document]"``).

    Args:
        node: The tag or comment to locate.

    Returns:
        A path such as ``"/html/body/ul/li[2]/a"``.
    """
    steps = [node_to_xpath(node)]
    for ancestor in node.parents:
        if ancestor.name == "[document]":
            # The document object is not an element; it is the "/" root.
            break
        steps.append(node_to_xpath(ancestor))

    # Steps were gathered leaf-to-root; XPath reads root-to-leaf.
    return "/" + "/".join(reversed(steps))
def test_get_node_xpath():
    """Check generated XPaths against lxml for a small HTML document.

    Every tag and comment in the BeautifulSoup tree must yield an expression
    that (a) matches at least one node when evaluated by lxml and (b) equals
    the path lxml itself reports for the node at the same document position.
    """
    html = """<!DOCTYPE html><html lang="en">
<head>
<meta charset="UTF-8">
</head>
<body>
<!-- This is a comment -->
<ul>
<li><a href="#">1</li>
<li><a href="#">2</li>
<li><a href="#">3</li>
<li><a href="#">4</li>
</ul>
</body>
</html>"""
    soup = BeautifulSoup(html, "lxml")
    dom_lxml = etree.HTML(html)
    tree = dom_lxml.getroottree()

    # Text nodes and the doctype are skipped: lxml's iter() yields only
    # elements and comments here, so the two sequences line up.
    nodes = [n for n in soup.descendants if isinstance(n, (Tag, Comment))]
    paths = [get_node_xpath(n) for n in nodes]
    xpaths = [tree.getpath(element) for element in dom_lxml.iter()]
    results = [tree.xpath(expression) for expression in paths]

    assert len(paths) == len(xpaths), \
        "Number of generated path expressions differs from the number of lxml nodes."
    assert all(results), \
        "At least one generated path expression did not match any node."
    assert paths == xpaths
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment