Last active
January 10, 2022 01:07
-
-
Save juancarlospaco/79f9c3a9b989b5d6ad13d64f5dda651d to your computer and use it in GitHub Desktop.
HTML parser that extracts all links (anchors) and returns one list of namedtuples containing every <a> element with its attributes (classes, IDs, aria, title, alt, etc.). It returns only links, is iterable, and is meant for a two-step web-scraping spider. A second parser returns a dict with all the elements grouped in lists.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# -*- coding: utf-8 -*- | |
import re | |
from html.parser import HTMLParser | |
from collections import namedtuple | |
def clean_html(html: str) -> str:
    """Remove HTML markup from the given string.

    ``<script>`` and ``<style>`` elements are dropped together with their
    contents, comments are removed, and every remaining tag is replaced by a
    single space.  ``&nbsp;`` entities are turned into plain spaces and runs
    of spaces are collapsed to one; newlines are left untouched.

    Args:
        html: The markup to clean.

    Returns:
        The text content with leading and trailing whitespace stripped.
    """
    # Script/style bodies are code, not text, so remove the whole element.
    cleaned = re.sub(r"(?is)<(script|style).*?>.*?(</\1>)", "", html.strip())
    cleaned = re.sub(r"(?s)<!--(.*?)-->[\n]?", "", cleaned)
    cleaned = re.sub(r"(?s)<.*?>", " ", cleaned)  # Remove the remaining tags.
    cleaned = cleaned.replace("&nbsp;", " ")
    # Tag removal leaves one space per tag; squeeze every run down to one.
    cleaned = re.sub(r" {2,}", " ", cleaned)
    return cleaned.strip()
class HTML2AnchorList(HTMLParser):
    """Parse only anchor links, ignore the rest, return a list of namedtuples.

    Each ``<a>`` element becomes one namedtuple ``A`` whose fields are the
    element's non-empty attributes, followed by ``line_number`` and ``column``
    (the position of the opening tag) and, when the anchor contains any
    character data, ``text``.

    Attribute names listed in ``reserved`` are upper-cased (``class`` ->
    ``CLASS``); in all other names ``-`` is replaced by ``_``.  Names that are
    still not valid namedtuple fields (Python keywords such as ``is``, names
    containing ``:``) are replaced by positional names (``_0``, ``_1``, ...).

    When ``url_domain`` is set it is prepended to every ``href`` that does not
    start with ``http``.
    """

    links = []  # Rebound to a fresh list by every feed() call.
    current = None  # Attribute dict of the anchor being parsed, if any.
    url_domain = ""
    __slots__ = ()
    reserved = ("class", "id", "for", "type", "min", "max", "dir")

    def handle_starttag(self, tag, attrs):
        """Start a new anchor record when an ``<a>`` tag opens.

        Tags nested inside an anchor (``<b>``, ``<img>``, ...) are ignored so
        that they do not discard the anchor being collected.
        """
        if tag.lower() != 'a':
            return
        record = {}
        for name, value in attrs:
            # HTMLParser reports valueless attributes (<a download>) as None.
            if not name or not value:
                continue
            value = value.strip()
            if not value:
                continue
            if (name == "href" and self.url_domain
                    and not value.startswith("http")):
                value = self.url_domain + value
            if name in self.reserved:
                key = name.upper()
            else:
                key = name.replace("-", "_")
            record[key] = value
        record["line_number"], record["column"] = self.getpos()
        self.current = record

    def handle_data(self, data):
        """Add character data found inside the current anchor to ``text``.

        Chunks separated by nested tags are stripped and joined by one space.
        """
        if not self.current or not data:
            return
        chunk = data.strip()
        previous = self.current.get("text")
        if previous:
            chunk = f"{previous} {chunk}".strip()
        self.current["text"] = chunk

    def handle_endtag(self, tag):
        """Store the finished anchor when its ``</a>`` tag is reached."""
        if tag.lower() == 'a' and self.current:
            # rename=True keeps odd attribute names from raising ValueError.
            anchor = namedtuple("A", self.current.keys(), rename=True)
            self.links.append(anchor(*self.current.values()))
            self.current = None

    def feed(self, *args, **kwargs):
        """Parse the given markup and return the anchors found in it.

        The result list is reset on every call, so anchors from a previous
        call are not carried over.
        """
        self.links = []
        super().feed(*args, **kwargs)
        return self.links

    def __iter__(self):
        """Iterate over the anchors collected by the last ``feed`` call."""
        return iter(self.links)
class HTML2dict(HTMLParser):
    """Parse all elements, use one blacklist, return a dict of namedtuples.

    ``feed`` returns a dict that maps each tag name to a list of namedtuples,
    one per collected element.  The fields are ``lineno`` and ``column`` (the
    position of the opening tag), the element's attributes and, when present,
    ``text``.  Elements that carry nothing but their position are skipped.

    Attribute names listed in ``reserved`` are upper-cased (``class`` ->
    ``CLASS``); ``-`` is replaced by ``_`` in all names.  Names that are still
    not valid namedtuple fields are replaced by positional names (``_2``...).

    Only one element is tracked at a time: opening a different tag replaces
    the element being collected, so a parent that contains child elements is
    not recorded itself.
    """

    elements = {}  # Rebound to a fresh dict by every feed() call.
    current = None  # {tag: attribute dict} of the element being parsed.
    url_domain = ""
    __slots__ = ()
    reserved = ("class", "id", "for", "type", "min", "max", "dir")
    backlist = (  # All HTML elements to Ignore to focus on data on the rest.
        "meta", "link", "script", "style", "title", "noscript", "footer",
        "header", "iframe", "object", "embed", "canvas", "svg", "template",
        "dialog", "marquee", "blink", "br", "option", "datalist", "keygen",
        "optgroup", "acronym", "applet", "basefont", "dir", "figure", "kbd",
        "noframes", "video", "audio", "wbr", "xmp", "nobr", "figcaption")

    def handle_starttag(self, tag, attrs):
        """Start collecting an element unless its tag is blacklisted."""
        tag = tag.lower().strip()
        if tag in self.backlist:
            print(f"{self.__class__.__name__} Blacklisted item: {tag} {attrs}")
            self.current = None  # Not an Element we are interested in.
            return
        if not self.current or not self.current.get(tag):
            self.current = {tag: {}}
        record = self.current[tag]
        record["lineno"], record["column"] = self.getpos()
        is_anchor = tag == 'a'
        for name, value in attrs:
            # HTMLParser reports valueless attributes (<a download>) as None.
            if not name or not value:
                continue
            value = value.strip()
            if is_anchor:
                if not value:
                    continue  # Anchors drop whitespace-only attributes.
                if (name == "href" and self.url_domain
                        and not value.startswith("http")):
                    value = self.url_domain + value  # Appends url_domain.
            key = name.replace("-", "_")
            if name in self.reserved:
                key = key.upper()
            record[key] = value

    def handle_data(self, data):
        """Store character data as the ``text`` of the current element."""
        if self.current and data:
            tag = next(iter(self.current))  # Adds text contents.
            self.current[tag]["text"] = data.strip()

    def handle_endtag(self, tag):
        """Record the current element if this end tag closes it.

        Any non-blacklisted end tag seen while an element is open ends the
        collection of that element, whether or not it was recorded.
        """
        tag = tag.lower().strip()
        if tag in self.backlist or self.current is None:
            return
        record = self.current.get(tag)
        # Position-only records carry no data worth keeping.
        if record and list(record) != ["lineno", "column"]:
            # Custom elements such as <my-tag> are not valid class names.
            type_name = re.sub(r"\W", "_", tag.upper())
            element = namedtuple(type_name, record.keys(), rename=True)
            self.elements.setdefault(tag, []).append(
                element(*record.values()))
        self.current = None

    def feed(self, *args, **kwargs):
        """Parse the given markup and return the elements found in it.

        The result dict is reset on every call, so elements from a previous
        call are not carried over.
        """
        self.elements = {}
        super().feed(*args, **kwargs)
        return self.elements

    def __iter__(self):
        """Iterate over ``(tag, elements)`` pairs of the last ``feed`` call."""
        return iter(self.elements.items())
Author
juancarlospaco
commented
Jan 4, 2018
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment