Skip to content

Instantly share code, notes, and snippets.

@xflr6
Last active June 5, 2022 10:11
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save xflr6/eafc69268a60d6fda8ff9d8d3e12cacc to your computer and use it in GitHub Desktop.
Save xflr6/eafc69268a60d6fda8ff9d8d3e12cacc to your computer and use it in GitHub Desktop.
Use some advanced XPath features of lxml for scraping html/xml
"""Use advanced XPath features of lxml (see also scrapy parsel)."""
from __future__ import annotations
from typing import Optional
import urllib.request
import lxml.etree
import lxml.html
def register_xpath(func=None, *, ns_uri: Optional[str] = None):
    """Register ``func`` as an lxml XPath extension function and return it.

    The callable is stored in ``lxml.etree.FunctionNamespace(ns_uri)`` under
    its Python name with underscores replaced by hyphens, so ``has_class``
    becomes callable from XPath as ``has-class``.

    Supported call styles::

        @register_xpath                     # bare decorator
        @register_xpath(ns_uri='urn:x')     # decorator factory
        register_xpath(func, ns_uri=...)    # direct call

    Args:
        func: Callable to register. When omitted, a decorator bound to
            ``ns_uri`` is returned instead of registering anything yet.
        ns_uri: Namespace URI handed to ``lxml.etree.FunctionNamespace``
            unchanged (``None`` by default).

    Returns:
        ``func`` itself (so the decorated name stays usable from Python),
        or a one-argument decorator when ``func`` is omitted.
    """
    if func is None:
        # Called as ``@register_xpath(ns_uri=...)``: defer the registration
        # until the decorated function is supplied.
        def decorator(target):
            return register_xpath(target, ns_uri=ns_uri)
        return decorator

    ns = lxml.etree.FunctionNamespace(ns_uri)
    name = func.__name__.replace('_', '-')
    ns[name] = func
    return func
@register_xpath
def has_class(context, *args) -> bool:
    """XPath extension ``has-class(name, ...)`` for the current context node.

    Returns true when the context node has a ``class`` attribute whose
    whitespace-normalised value contains every given name as a
    space-delimited token. This is the same test as the XPath idiom
    ``contains(concat(" ", normalize-space(@class), " "), " name ")``,
    but it is evaluated in Python, so class names containing a double
    quote no longer produce an invalid XPath expression and no expression
    has to be built and parsed for every candidate node.

    Args:
        context: lxml XPath evaluation context; only ``context_node`` is read.
        *args: Class names that must all be present.

    Returns:
        ``True`` if the node carries all requested classes, else ``False``
        (including when it has no ``class`` attribute at all).

    Raises:
        lxml.etree.XPathEvalError: If called without any class name.
    """
    if not args:
        # The XPath text built for zero names ("self::*[@class and ]") was
        # not a valid expression; keep failing, but say why.
        raise lxml.etree.XPathEvalError(
            'has-class() requires at least one class name')

    value = context.context_node.get('class')
    if value is None:
        return False

    # Mirror XPath normalize-space(): only space, tab, CR and LF count as
    # whitespace (str.split() without arguments would also split on others).
    for ws in '\t\r\n':
        value = value.replace(ws, ' ')
    normalized = ' '.join(token for token in value.split(' ') if token)
    padded = f' {normalized} '
    return all(f' {cls} ' in padded for cls in args)
class XpathEval(lxml.etree.XPathDocumentEvaluator):
    """Document XPath evaluator with EXSLT prefixes preconfigured.

    Instances are callable with an XPath expression (behaviour inherited
    from ``lxml.etree.XPathDocumentEvaluator``). Unless overridden by the
    caller, the ``set`` and ``re`` prefixes are bound to the EXSLT sets and
    regular-expressions namespaces and ``smart_strings`` is turned off.

    Attributes:
        tree: The element tree the evaluator was created for.
    """

    # Keyword defaults applied in __init__ via setdefault(), so explicit
    # caller-supplied values always win.
    _defaults = {'namespaces': {'set': 'http://exslt.org/sets',
                                're': 'http://exslt.org/regular-expressions'},
                 'smart_strings': False}

    @classmethod
    def from_url(cls, url: str, *, xml: bool = False, **kwargs) -> XpathEval:
        """Fetch ``url``, parse the response and return an evaluator for it.

        Args:
            url: Address opened with ``urllib.request.urlopen``.
            xml: Parse with ``lxml.etree.parse`` when true, otherwise with
                ``lxml.html.parse``.
            **kwargs: Forwarded to the constructor.

        Returns:
            A new instance of ``cls`` wrapping the parsed tree.
        """
        # There is no ``lxml.xml`` module; the XML parser entry point lives
        # in ``lxml.etree`` (previously xml=True raised AttributeError).
        parse_func = lxml.etree.parse if xml else lxml.html.parse
        with urllib.request.urlopen(url) as f:
            tree = parse_func(f)
        return cls(tree, **kwargs)

    def __init__(self, tree: lxml.etree._ElementTree,
                 *args, **kwargs) -> None:
        """Create an evaluator for ``tree``, filling in the class defaults.

        Args:
            tree: Parsed document to evaluate expressions against.
            *args: Forwarded to ``XPathDocumentEvaluator``.
            **kwargs: Forwarded to ``XPathDocumentEvaluator`` after missing
                keys are filled in from ``_defaults``.
        """
        for kw, value in self._defaults.items():
            kwargs.setdefault(kw, value)
        super().__init__(tree, *args, **kwargs)
        self.tree = tree
if __name__ == '__main__':
    # Demo: list the link texts filed under the "E" and "I" headings of a
    # Wikipedia category page, using has-class() and EXSLT re:test().
    page_url = 'https://en.wikipedia.org/wiki/Category:Monty_Python_songs'

    query_steps = (
        '//div[has-class("mw-category")]',
        '//h3[re:test(., "^[EI]$")]',
        '/following-sibling::ul/li/a/text()',
    )
    query = ''.join(query_steps)

    evaluator = XpathEval.from_url(page_url)
    print(evaluator(query))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment