Created
June 12, 2018 17:01
-
-
Save wincentbalin/9aba0a27dc8876c4e8007e0bbcb1bd5b to your computer and use it in GitHub Desktop.
NLTK's clean_html(), as removed in https://github.com/nltk/nltk/commit/39a303e5ddc4cdb1a0b00a3be426239b1c24c8bb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re
from urllib.request import urlopen
def clean_html(html):
    """
    Remove HTML markup from the given string.

    The passes run in a fixed order: ``<script>``/``<style>`` elements are
    dropped together with their contents, then comments are dropped, then
    every remaining tag is replaced by a single space.  Finally ``&nbsp;``
    entities (and literal no-break spaces) become plain spaces and runs of
    spaces are collapsed to one.  Other character entities such as
    ``&amp;`` are left untouched.

    :param html: the HTML string to be cleaned
    :type html: str
    :rtype: str
    """
    # First we remove inline JavaScript/CSS:
    cleaned = re.sub(r"(?is)<(script|style).*?>.*?(</\1>)", "", html.strip())
    # Then we remove html comments. This has to be done before removing regular
    # tags since comments can contain '>' characters.
    cleaned = re.sub(r"(?s)<!--(.*?)-->[\n]?", "", cleaned)
    # Next we can remove the remaining tags:
    cleaned = re.sub(r"(?s)<.*?>", " ", cleaned)
    # Finally, we deal with whitespace.  The entity must be spelled out here:
    # a pattern consisting of a single plain space would be a no-op.
    cleaned = re.sub(r"&nbsp;|\u00a0", " ", cleaned)
    # Collapse any run of spaces (tag removal above leaves many) to one.
    cleaned = re.sub(r" {2,}", " ", cleaned)
    return cleaned.strip()
def clean_url(url):
    """
    Fetch the document at ``url`` and return it with HTML markup removed.

    The response body is decoded with the charset declared in the response
    headers, falling back to UTF-8; undecodable bytes are replaced rather
    than raising, so the result is always a ``str`` suitable for
    ``clean_html``.

    :param url: the URL of the HTML document to fetch
    :type url: str
    :rtype: str
    """
    # The context manager closes the connection even if reading fails.
    with urlopen(url) as response:
        raw = response.read()
        charset = response.headers.get_content_charset()
    # read() yields bytes, but clean_html applies str patterns: decode first.
    html = raw.decode(charset or "utf-8", errors="replace")
    return clean_html(html)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment