Skip to content

Instantly share code, notes, and snippets.

@clemlesne
Created September 18, 2023 17:05
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save clemlesne/0fdc12fcd3f3649b04908cc32f3173bb to your computer and use it in GitHub Desktop.
Save clemlesne/0fdc12fcd3f3649b04908cc32f3173bb to your computer and use it in GitHub Desktop.
Takes a raw string of HTML and removes all HTML tags, Markdown tables, and line returns.
from typing import Optional
import re
import html
def sanitize(raw: Optional[str]) -> Optional[str]:
    """
    Reduce a raw HTML/Markdown string to a single line of plain text.

    The doctype, the ``<head>``, ``<script>`` and ``<style>`` elements (with
    their content), every remaining HTML tag, fenced Markdown code blocks,
    Markdown table rules and Markdown markup characters are removed. Links
    written as ``<a href="url">text</a>`` are rewritten to ``(url) text``.
    HTML entities are decoded and every run of whitespace is collapsed into
    one space.

    Returns ``None`` when ``raw`` is ``None`` or empty. A non-empty input
    that contains nothing but markup yields an empty string.
    """
    if not raw:
        return None

    # HTML tag names are case-insensitive, so the element-level patterns must
    # match "<SCRIPT>" as well as "<script>"; otherwise only the tags are
    # dropped by the generic tag pattern and the element content leaks out.
    ignore_case = re.IGNORECASE

    # Remove HTML doctype
    raw = re.sub(r"<!DOCTYPE[^>]*>", " ", raw, flags=ignore_case)
    # Remove HTML head (non-greedy: stop at the first closing tag)
    raw = re.sub(r"<head\b[^>]*>[\s\S]*?<\/head>", " ", raw, flags=ignore_case)
    # Remove HTML scripts
    raw = re.sub(r"<script\b[^>]*>[\s\S]*?<\/script>", " ", raw, flags=ignore_case)
    # Remove HTML styles
    raw = re.sub(r"<style\b[^>]*>[\s\S]*?<\/style>", " ", raw, flags=ignore_case)
    # Extract href from HTML links, in the form of "(href) text"
    raw = re.sub(
        r"<a\b[^>]*href=\"([^\"]*)\"[^>]*>([^<]*)<\/a>",
        r"(\1) \2",
        raw,
        flags=ignore_case,
    )
    # Remove HTML tags
    raw = re.sub(r"<[^>]*>", " ", raw)
    # Remove Markdown tables
    raw = re.sub(r"[-|]{2,}", " ", raw)
    # Remove Markdown code blocks. Non-greedy, so that the text located
    # between two separate fenced blocks is kept.
    raw = re.sub(r"```[\s\S]*?```", " ", raw)
    # Remove Markdown bold, italic, strikethrough, code, heading, table
    # delimiters, links, images, comments, and horizontal rules. Numeric
    # character references ("&#39;", "&#x27;") are matched first and passed
    # through untouched: stripping their "#" would prevent decoding below.
    raw = re.sub(
        r"(&#(?:[0-9]+|[xX][0-9a-fA-F]+);?)|[*_`~#|!\[\]<>-]+",
        lambda match: match.group(1) or " ",
        raw,
    )
    # Decode HTML entities before normalising whitespace, so that whitespace
    # produced by decoding ("&nbsp;", "&#10;") is collapsed as well.
    raw = html.unescape(raw)
    # Collapse every whitespace run (including "\r" and non-breaking spaces)
    # into a single space and drop leading and trailing whitespace.
    return " ".join(raw.split())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment