bradyjiang/cleaner.py

## cleaner.py
from lxml.html.clean import Cleaner

# Configure the sanitizer in a single constructor call.
# - page_structure=False keeps Cleaner from replacing the <html> tag with a
#   <div>: http://stackoverflow.com/questions/15556391/lxml-clean-html-replaces-html-tag-with-div
# - Cleaner is used instead of strip_elements because it is the more general
#   solution: besides the <script> tag it also gets rid of things like
#   onclick=function() attributes on other tags:
#   http://stackoverflow.com/questions/8554035/remove-all-javascript-tags-and-style-tags-from-html-with-python-and-the-lxml-mod
# - style=True is intentionally not passed; turn it on in the future if necessary.
cleaner = Cleaner(
    page_structure=False,
    javascript=True,
    scripts=True,
    kill_tags=["base"],  # tags that Cleaner should kill outright
)

# NOTE: str_html is not defined in this snippet; it is expected to already
# hold the HTML to sanitize when this line runs.
cleaned_html = cleaner.clean_html(str_html)
	from lxml.html.clean import Cleaner

	# Configure the sanitizer in a single constructor call.
	# - page_structure=False keeps Cleaner from replacing the <html> tag with a
	#   <div>: http://stackoverflow.com/questions/15556391/lxml-clean-html-replaces-html-tag-with-div
	# - Cleaner is used instead of strip_elements because it is the more general
	#   solution: besides the <script> tag it also gets rid of things like
	#   onclick=function() attributes on other tags:
	#   http://stackoverflow.com/questions/8554035/remove-all-javascript-tags-and-style-tags-from-html-with-python-and-the-lxml-mod
	# - style=True is intentionally not passed; turn it on in the future if necessary.
	cleaner = Cleaner(
		page_structure=False,
		javascript=True,
		scripts=True,
		kill_tags=["base"],  # tags that Cleaner should kill outright
	)

	# NOTE: str_html is not defined in this snippet; it is expected to already
	# hold the HTML to sanitize when this line runs.
	cleaned_html = cleaner.clean_html(str_html)