# mrichardson23/censorHtml.py

## censorHtml.py
from bs4 import BeautifulSoup
import re

# Sample page passed to replaceHTML below. The word "cat" appears in the
# <title>, in the heading and paragraph text, in the link text, and inside
# the link's href attribute value.
html_doc = """
<html>
<head>
<title>A Page About Cats</title>
</head>

<body>
<h1>This is a page about cats!</h1>
<p>Thanks for visiting my page about <a href="cats.html">Cats!</a></p>
</body>
</html>
"""
def replaceHTML(html, outStr, inStr):
    """Return `html` with each occurrence of `outStr` replaced by `inStr`.

    Matching is case-insensitive and literal: `outStr` is escaped, so regex
    metacharacters in it carry no special meaning.  Only the <title> text and
    the text nodes found under <body> are rewritten; tag names and attribute
    values (such as an href) are not touched.

    Args:
        html: HTML markup to parse.
        outStr: The text to censor.
        inStr: The text inserted in place of every match, used verbatim.

    Returns:
        The modified document, serialized with str().
    """
    # NOTE(review): no parser is named, so BeautifulSoup chooses one from
    # whatever is installed; the serialized output may differ between
    # environments. Left as-is to keep existing behaviour.
    soup = BeautifulSoup(html)

    # Compile regex pattern: match the string outStr, ignoring case
    censor = re.compile(re.escape(outStr), re.IGNORECASE)

    def censored(text):
        # A callable replacement keeps inStr literal. Passed as a plain
        # string it would be treated as a regex template, so backslashes and
        # group references inside it would be expanded or rejected.
        return censor.sub(lambda match: inStr, text)

    # Censor the title, don't use formatting.  Skip it when the document has
    # no <title>, or when the title has no single string child to rewrite.
    title = soup.title
    if title is not None and title.string is not None:
        title.string = censored(title.string)

    # Censor the text in the body.
    # TODO: add span tag with style
    root = soup.html
    body = root.body if root is not None else None
    if body is not None:
        # find_all returns a list, so replacing nodes while looping is safe.
        for node in body.find_all(text=True):
            replacement = censored(node)
            if replacement != node:
                # Rebuild with the node's own class so special string nodes
                # (e.g. comments) are not re-inserted as ordinary text.
                node.replace_with(type(node)(replacement))
    return str(soup)

# Demo run: censor every "cat" in the sample page with the word CENSORED.
html_doc = replaceHTML(html_doc, 'cat', 'CENSORED')

# Parenthesized form prints the same text on Python 2 and is also valid
# syntax on Python 3, where the bare print statement is a SyntaxError.
print(html_doc)
	from bs4 import BeautifulSoup
	import re

	# Sample page passed to replaceHTML below. The word "cat" appears in the
	# <title>, in the heading and paragraph text, in the link text, and inside
	# the link's href attribute value.
	html_doc = """
	<html>
	<head>
	<title>A Page About Cats</title>
	</head>

	<body>
	<h1>This is a page about cats!</h1>
	<p>Thanks for visiting my page about <a href="cats.html">Cats!</a></p>
	</body>
	</html>
	"""
	def replaceHTML(html, outStr, inStr):
	    """Return `html` with each occurrence of `outStr` replaced by `inStr`.

	    Matching is case-insensitive and literal: `outStr` is escaped, so regex
	    metacharacters in it carry no special meaning.  Only the <title> text
	    and the text nodes found under <body> are rewritten; tag names and
	    attribute values (such as an href) are not touched.

	    Args:
	        html: HTML markup to parse.
	        outStr: The text to censor.
	        inStr: The text inserted in place of every match, used verbatim.

	    Returns:
	        The modified document, serialized with str().
	    """
	    # NOTE(review): no parser is named, so BeautifulSoup chooses one from
	    # whatever is installed; the serialized output may differ between
	    # environments. Left as-is to keep existing behaviour.
	    soup = BeautifulSoup(html)

	    # Compile regex pattern: match the string outStr, ignoring case
	    censor = re.compile(re.escape(outStr), re.IGNORECASE)

	    def censored(text):
	        # A callable replacement keeps inStr literal. Passed as a plain
	        # string it would be treated as a regex template, so backslashes
	        # and group references inside it would be expanded or rejected.
	        return censor.sub(lambda match: inStr, text)

	    # Censor the title, don't use formatting.  Skip it when the document
	    # has no <title>, or when the title has no single string child.
	    title = soup.title
	    if title is not None and title.string is not None:
	        title.string = censored(title.string)

	    # Censor the text in the body.
	    # TODO: add span tag with style
	    root = soup.html
	    body = root.body if root is not None else None
	    if body is not None:
	        # find_all returns a list, so replacing nodes while looping is safe.
	        for node in body.find_all(text=True):
	            replacement = censored(node)
	            if replacement != node:
	                # Rebuild with the node's own class so special string nodes
	                # (e.g. comments) are not re-inserted as ordinary text.
	                node.replace_with(type(node)(replacement))
	    return str(soup)

	# Demo run: censor every "cat" in the sample page with the word CENSORED.
	html_doc = replaceHTML(html_doc, 'cat', 'CENSORED')

	# Parenthesized form prints the same text on Python 2 and is also valid
	# syntax on Python 3, where the bare print statement is a SyntaxError.
	print(html_doc)