Skip to content

Instantly share code, notes, and snippets.

@wincentbalin
Created June 12, 2018 17:01
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save wincentbalin/9aba0a27dc8876c4e8007e0bbcb1bd5b to your computer and use it in GitHub Desktop.
Save wincentbalin/9aba0a27dc8876c4e8007e0bbcb1bd5b to your computer and use it in GitHub Desktop.
import re
def clean_html(html):
    """
    Remove HTML markup from the given string.

    Inline ``<script>``/``<style>`` elements and HTML comments are dropped
    entirely, every remaining tag is replaced by a single space, ``&nbsp;``
    entities become spaces, and runs of consecutive spaces are collapsed
    into one. Other character entities (``&amp;`` etc.) and newlines are
    left untouched.

    :param html: the HTML string to be cleaned
    :type html: str
    :return: the text content with leading/trailing whitespace stripped
    :rtype: str
    """
    # First we remove inline JavaScript/CSS, including the enclosed content.
    # The backreference makes sure the closing tag matches the opening one.
    cleaned = re.sub(r"(?is)<(script|style).*?>.*?(</\1>)", "", html.strip())
    # Then we remove html comments. This has to be done before removing regular
    # tags since comments can contain '>' characters.
    cleaned = re.sub(r"(?s)<!--(.*?)-->[\n]?", "", cleaned)
    # Next we can remove the remaining tags. Each tag becomes a space so that
    # text from adjacent elements does not get glued together.
    cleaned = re.sub(r"(?s)<.*?>", " ", cleaned)
    # Finally, we deal with whitespace: non-breaking-space entities become
    # plain spaces, and any run of spaces (tag removal leaves many behind)
    # is collapsed to a single space.
    cleaned = cleaned.replace("&nbsp;", " ")
    cleaned = re.sub(r" {2,}", " ", cleaned)
    return cleaned.strip()
def clean_url(url):
    """
    Download the page at the given URL and remove its HTML markup.

    :param url: the URL of the HTML page to fetch and clean
    :type url: str
    :return: the result of passing the decoded page to ``clean_html``
    :rtype: str
    """
    # Imported locally because the module itself only imports ``re``.
    from urllib.request import urlopen

    # The context manager closes the connection even if reading fails.
    with urlopen(url) as response:
        raw = response.read()
        # Prefer the charset declared in the response headers, if any.
        charset = response.headers.get_content_charset() or "utf-8"

    # clean_html applies str patterns, so the raw bytes must be decoded first.
    try:
        html = raw.decode(charset, errors="replace")
    except LookupError:
        # The server declared an encoding Python does not know about.
        html = raw.decode("utf-8", errors="replace")
    return clean_html(html)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment