Skip to content

Instantly share code, notes, and snippets.

@juancarlospaco
Last active January 10, 2022 01:07
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save juancarlospaco/79f9c3a9b989b5d6ad13d64f5dda651d to your computer and use it in GitHub Desktop.
Save juancarlospaco/79f9c3a9b989b5d6ad13d64f5dda651d to your computer and use it in GitHub Desktop.
HTML parsers for web scraping. HTML2AnchorList extracts all links (anchors) and returns one list of namedtuples, one per <a> element, carrying its attributes (class, ID, aria, title, alt, etc.). It returns only links, it is iterable, and it is meant for a 2-step web-scraping spider. HTML2dict returns a dict that maps each element name to a list of namedtuples.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import re
from html.parser import HTMLParser
from collections import namedtuple
def clean_html(html: str) -> str:
    """Remove HTML markup from the given string and return the plain text.

    ``<script>``/``<style>`` elements (with their contents) and HTML comments
    are dropped, every remaining tag is replaced by one space, ``&nbsp;``
    entities become spaces, and runs of spaces are collapsed to one space.
    Newlines inside the text are left untouched.

    Args:
        html: HTML source text.

    Returns:
        The text without markup, stripped of leading/trailing whitespace.
    """
    # Drop script/style blocks together with their bodies; the backreference
    # makes the closing tag match whichever element was opened.
    cleaned = re.sub(r"(?is)<(script|style).*?>.*?(</\1>)", "", html.strip())
    # Drop comments, plus one newline directly after them if present.
    cleaned = re.sub(r"(?s)<!--(.*?)-->[\n]?", "", cleaned)
    # Replace every remaining tag with a space so adjacent words stay apart.
    cleaned = re.sub(r"(?s)<.*?>", " ", cleaned)
    cleaned = cleaned.replace("&nbsp;", " ")
    # Tag removal leaves runs of spaces behind; squeeze each run to one space.
    cleaned = re.sub(r" {2,}", " ", cleaned)
    return cleaned.strip()
class HTML2AnchorList(HTMLParser):
    """Parse only anchor links, ignore the rest, return a list of namedtuples.

    Each ``<a>...</a>`` element becomes one namedtuple of type ``A`` whose
    fields are, in order: the element's non-empty attributes (in source
    order), ``line_number``, ``column`` and, when the anchor contains text,
    ``text``.

    Attribute names listed in ``reserved`` are upper-cased (``class`` becomes
    ``CLASS``); in all other names ``-`` is replaced by ``_``.  Names that are
    still not valid namedtuple field names are renamed positionally
    (``_0``, ``_1``, ...) by ``namedtuple(..., rename=True)``.

    When ``url_domain`` is set, it is prepended to every ``href`` value that
    does not start with ``http``.
    """

    links, current, url_domain, __slots__ = [], None, "", ()
    reserved = ("class", "id", "for", "type", "min", "max", "dir")

    def handle_starttag(self, tag, attrs):
        """Start collecting a new anchor record when an ``<a>`` tag opens."""
        if tag.lower() != "a":
            # Tags nested inside an anchor (<span>, <img>, ...) must not
            # discard the anchor that is currently being collected.
            return
        record = {}
        for name, value in attrs:
            # Valueless attributes (e.g. ``<a download>``) arrive as None.
            if not name or not value:
                continue
            value = value.strip()
            if not value:
                continue
            if (name == "href" and self.url_domain
                    and not value.startswith("http")):
                value = self.url_domain + value
            if name in self.reserved:
                key = name.upper()
            else:
                key = name.replace("-", "_")
            record[key] = value
        line_number, column = self.getpos()
        record.update({"line_number": line_number, "column": column})
        self.current = record

    def handle_data(self, data):
        """Accumulate the text found inside the anchor being collected."""
        if self.current is None or not data:
            return
        piece = data.strip()
        previous = self.current.get("text", "")
        # Several chunks arrive when the anchor contains nested tags; join
        # them with single spaces instead of keeping only the last chunk.
        self.current["text"] = (
            f"{previous} {piece}".strip() if previous else piece)

    def handle_endtag(self, tag):
        """Emit the collected record as a namedtuple when ``</a>`` closes."""
        if tag.lower() == "a" and self.current:
            # rename=True keeps attribute names that are not valid
            # identifiers (``xml:lang``, ``@click``, ``is``) from raising.
            record_type = namedtuple("A", list(self.current), rename=True)
            self.links.append(record_type(*self.current.values()))
            self.current = None

    def feed(self, *args, **kwargs):
        """Parse the given HTML and return the list of links found in it.

        The result list is reset on every call, so each call returns only
        the links of the data passed to that call.
        """
        self.links = []
        super().feed(*args, **kwargs)
        return self.links

    def __iter__(self):
        """Iterate over the links collected by the last ``feed`` call."""
        return iter(self.links)
class HTML2dict(HTMLParser):
    """Parse all elements except blacklisted ones into a dict of lists.

    ``feed`` returns a dict that maps each lower-cased tag name to a list of
    namedtuples, one per recorded element.  The fields of a record are, in
    order: ``lineno``, ``column``, the element's attributes and, when the
    element contains text, ``text`` (the last text chunk seen, stripped).
    Elements that have neither attributes nor text are not recorded.

    Only one element is tracked at a time: a start tag with a different name
    replaces the element being collected, so an element is recorded only if
    its end tag arrives while it is still the tracked one.

    Attribute names listed in ``reserved`` are upper-cased and ``-`` is
    replaced by ``_`` in all names.  Names that are still not valid
    namedtuple field names are renamed positionally by
    ``namedtuple(..., rename=True)``.  For ``<a>`` elements, ``url_domain``
    (when set) is prepended to ``href`` values not starting with ``http``.
    """

    elements, current, url_domain, __slots__ = {}, None, "", ()
    reserved = ("class", "id", "for", "type", "min", "max", "dir")
    backlist = (  # All HTML elements to ignore, to focus on data in the rest.
        "meta", "link", "script", "style", "title", "noscript", "footer",
        "header", "iframe", "object", "embed", "canvas", "svg", "template",
        "dialog", "marquee", "blink", "br", "option", "datalist", "keygen",
        "optgroup", "acronym", "applet", "basefont", "dir", "figure", "kbd",
        "noframes", "video", "audio", "wbr", "xmp", "nobr", "figcaption")

    def handle_starttag(self, tag, attrs):
        """Start (or continue) the record for a non-blacklisted element."""
        tag = tag.lower().strip()
        if tag in self.backlist:
            print(f"{self.__class__.__name__} Blacklisted item: {tag} {attrs}")
            self.current = None  # Not an element we are interested in.
            return
        if not self.current or not self.current.get(tag):
            self.current = {tag: {}}
        record = self.current[tag]
        lineno, column = self.getpos()
        record.update({"lineno": lineno, "column": column})
        is_anchor = tag == "a"
        for name, value in attrs:
            # Valueless attributes (e.g. ``<a download>``) arrive as None.
            if not name or not value:
                continue
            value = value.strip()
            if is_anchor:
                # Anchors additionally skip whitespace-only values.
                if not value:
                    continue
                if (name == "href" and self.url_domain
                        and not value.startswith("http")):
                    value = self.url_domain + value
            key = name.replace("-", "_")
            if name in self.reserved:
                key = key.upper()
            record[key] = value

    def handle_data(self, data):
        """Store the stripped text of the element being collected."""
        if self.current and data:
            tag = next(iter(self.current))  # The single tracked tag name.
            self.current[tag]["text"] = data.strip()

    def handle_endtag(self, tag):
        """Record the tracked element when its own end tag is reached."""
        tag = tag.lower().strip()
        if tag in self.backlist or self.current is None:
            return
        record = self.current.get(tag)
        if not record:
            return
        # Skip elements that carry nothing but their position; the keys
        # compared here must be the ones handle_starttag stores.
        if list(record) != ["lineno", "column"]:
            # Tag names such as ``my-element`` or ``o:p`` are not valid
            # type names, so non-word characters are replaced by ``_``.
            type_name = re.sub(r"\W", "_", tag.upper())
            record_type = namedtuple(type_name, list(record), rename=True)
            self.elements.setdefault(tag, []).append(
                record_type(*record.values()))
        self.current = None

    def feed(self, *args, **kwargs):
        """Parse the given HTML and return the dict of recorded elements.

        The result dict is reset on every call, so each call returns only
        the elements of the data passed to that call.
        """
        self.elements = {}
        super().feed(*args, **kwargs)
        return self.elements

    def __iter__(self):
        """Iterate over ``(tag, records)`` pairs of the last ``feed`` call."""
        return iter(self.elements.items())
@juancarlospaco
Copy link
Author

>>> HTML2AnchorList().feed( pathlib.Path("example.html").read_text() )
[A(href='/some/url', line_number=274, column=78, text='example link'), A(href='#', CLASS='DRY', ID='Simple', line_number=276, column=15, text='KISS')]

>>> HTML2dict().feed( pathlib.Path("example.html").read_text() )
{
    'a': [A(lineno=274, column=78, href='/some/url', text='example link'), A(lineno=276, column=15, href='#', CLASS='DRY', ID='Simple', text='KISS')],
    'p': [P(lineno=9, column=5, text='some text'), P(lineno=12, column=9, text='other text')],
}

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment