Skip to content

Instantly share code, notes, and snippets.

@xflr6
Last active June 5, 2022 16:41
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save xflr6/effaed95e5415c5dd24ec17578ddaf1e to your computer and use it in GitHub Desktop.
Save xflr6/effaed95e5415c5dd24ec17578ddaf1e to your computer and use it in GitHub Desktop.
Compare different ways to get an HTML tree from a URL with streaming
"""Compare ways to return HTML tree streamed and parsed from a given URL."""
import contextlib
from typing import Literal, overload
import urllib.request
import xml.etree.ElementTree as etree
import certifi
import html5lib
import lxml.html
import requests
import urllib3
def parse(url: str) -> etree.ElementTree:
    """Return the document at ``url`` parsed with the stdlib XML parser.

    Uses only the standard library, but the parser accepts well-formed
    XML/XHTML only; ordinary "tag soup" HTML makes ``etree.parse`` fail.

    Args:
        url: Location opened with ``urllib.request.urlopen``.

    Returns:
        The tree built by ``etree.parse`` from the open response.
    """
    # pure stdlib, but well-formed XHTML only
    with urllib.request.urlopen(url) as f:
        # The parser reads straight from the response object, so the
        # body is consumed while the connection is still open.
        tree = etree.parse(f)
    return tree
def parse(url: str) -> lxml.etree._ElementTree:
    """Return the HTML tree for ``url``, letting lxml open the URL itself.

    Args:
        url: Location passed unchanged to ``lxml.html.parse``.

    Returns:
        The tree produced by ``lxml.html.parse``.

    Raises:
        NotImplementedError: If ``url`` starts with ``https:`` (compared
            case-insensitively); this variant refuses such URLs up front.
    """
    if url.lower().startswith('https:'):
        raise NotImplementedError('no HTTPS support')
    return lxml.html.parse(url)
def parse(url: str) -> lxml.etree._ElementTree:
    """Return the HTML tree for ``url``, fetched by urllib and parsed by lxml.

    Args:
        url: Location opened with ``urllib.request.urlopen``.

    Returns:
        The tree built by ``lxml.html.parse`` from the open response.
    """
    with urllib.request.urlopen(url) as f:
        # lxml reads from the file-like response while it is still open.
        tree = lxml.html.parse(f)
    return tree
@overload
def parse(url: str, *,
          treebuilder: Literal['etree'] = ...) -> etree.Element: ...


@overload
def parse(url: str, *,
          treebuilder: Literal['lxml']) -> lxml.etree._ElementTree: ...


def parse(url: str, *,
          treebuilder: Literal['etree', 'lxml'] = 'etree'):
    """Return the document at ``url`` parsed with html5lib.

    Args:
        url: Location opened with ``urllib.request.urlopen``.
        treebuilder: Name of the html5lib tree builder to use; per the
            overloads, ``'etree'`` yields an ``etree.Element`` and
            ``'lxml'`` an ``lxml.etree._ElementTree``.

    Returns:
        Whatever ``html5lib.parse`` returns for the chosen tree builder.
    """
    with urllib.request.urlopen(url) as f:
        # Charset parameter of the Content-Type header (None if absent),
        # forwarded to html5lib as the transport-level encoding.
        encoding = f.headers.get_param('charset')
        tree = html5lib.parse(f,
                              treebuilder=treebuilder,
                              transport_encoding=encoding,
                              namespaceHTMLElements=False)
    return tree
def parse(url: str) -> lxml.etree._ElementTree:
    """Return the HTML tree for ``url``, streamed through urllib3.

    Certificates are verified against the CA bundle reported by
    ``certifi.where()``.

    Args:
        url: Location requested with ``GET`` through a ``PoolManager``.

    Returns:
        The tree built by ``lxml.html.parse`` from the streamed response.
    """
    with urllib3.PoolManager(cert_reqs='CERT_REQUIRED',
                             ca_certs=certifi.where()) as http:
        # preload_content=False leaves the body unread so that lxml can
        # pull it from the response object itself.
        r = http.request('GET', url, preload_content=False)
        try:
            tree = lxml.html.parse(r)
        finally:
            # Always hand the connection back, even if parsing fails.
            r.release_conn()
    return tree
def parse(url: str) -> lxml.etree._ElementTree:
    """Return the HTML tree for ``url``, streamed through requests.

    No status check is made here, so whatever body the server sends is
    handed to the parser.

    Args:
        url: Location fetched with ``requests.get(..., stream=True)``.

    Returns:
        The tree built by ``lxml.html.parse`` from the raw response stream.
    """
    with contextlib.closing(requests.get(url, stream=True)) as r:
        # Ask the raw stream to undo any Content-Encoding while reading,
        # so the parser is fed decoded bytes.
        r.raw.decode_content = True
        tree = lxml.html.parse(r.raw)
    return tree
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment