Skip to content

Instantly share code, notes, and snippets.

@mrichardson23
Last active December 15, 2015 16:19
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mrichardson23/5288314 to your computer and use it in GitHub Desktop.
Save mrichardson23/5288314 to your computer and use it in GitHub Desktop.
from bs4 import BeautifulSoup
import re
html_doc = """
<html>
<head>
<title>A Page About Cats</title>
</head>
<body>
<h1>This is a page about cats!</h1>
<p>Thanks for visiting my page about <a href="cats.html">Cats!</a></p>
</body>
</html>
"""
def replaceHTML(html, outStr, inStr):
    """Return *html* with every occurrence of *outStr* replaced by *inStr*.

    Matching is case-insensitive and literal (``outStr`` is regex-escaped).
    Only text is touched: the contents of ``<title>`` and every text node
    inside ``<body>``. Tag names and attribute values (for example an
    ``href``) are left alone.

    Args:
        html: HTML markup to process.
        outStr: Text to search for. If empty, nothing is replaced.
        inStr: Text to insert in place of each match. It is inserted
            verbatim; backslashes are not treated as regex escapes.

    Returns:
        The re-serialised document as produced by ``str(soup)``.
    """
    # Name the parser explicitly: without it Beautiful Soup picks whichever
    # parser happens to be installed, so output can differ between machines.
    soup = BeautifulSoup(html, "html.parser")

    # An empty pattern matches at every position and would splice inStr
    # between all characters, so treat it as "nothing to replace".
    if not outStr:
        return str(soup)

    # Compile regex pattern: match the string outStr, ignoring case
    censor = re.compile(re.escape(outStr), re.IGNORECASE)

    def censored(text):
        # A callable replacement keeps inStr literal; a plain string would be
        # parsed as a template, so "\\1" or "\\g<0>" in inStr would misfire.
        return censor.sub(lambda _match: inStr, text)

    # Censor the title, don't use formatting.
    # .string is None when <title> is empty or holds nested tags.
    title = soup.title
    if title is not None and title.string is not None:
        title.string = censored(title.string)

    # Censor the text in the body.
    # TODO: add span tag with style
    body = soup.body  # None for fragments that have no <body>
    if body is not None:
        for node in body.find_all(text=True):
            replacement = censored(node)
            if replacement != node:
                # Rebuild with the node's own class so Comment/CData nodes
                # stay comments instead of turning into visible page text.
                node.replace_with(type(node)(replacement))
    return str(soup)
# Demo run: censor every "cat" (any letter case) in the sample page.
html_doc = replaceHTML(html_doc, 'cat', 'CENSORED')
# Parenthesised single-argument print works on both Python 2 and Python 3;
# the bare `print html_doc` statement is a SyntaxError on Python 3.
print(html_doc)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment