Skip to content

Instantly share code, notes, and snippets.

@Bilio
Created July 17, 2018 01:02
Show Gist options
  • Save Bilio/81074a856f7d771b4d0846a87021fea5 to your computer and use it in GitHub Desktop.
Save Bilio/81074a856f7d771b4d0846a87021fea5 to your computer and use it in GitHub Desktop.
Clean HTML with Python's lxml.html Cleaner
import codecs
import sys
from lxml import etree
from lxml.html.clean import Cleaner
def sanitize(dirty_html):
    """Clean ``dirty_html`` with an ``lxml.html.clean.Cleaner``.

    A Cleaner is built on every call with all of the boolean switches
    listed below set to ``True``, an explicit attribute whitelist and a
    tuple of tag names passed as ``remove_tags``.

    Args:
        dirty_html: The markup to clean; handed unchanged to
            ``Cleaner.clean_html``.

    Returns:
        Whatever ``Cleaner.clean_html`` returns for ``dirty_html``.
    """
    # Every one of these Cleaner options is switched on.
    enabled_switches = (
        'page_structure',
        'meta',
        'embedded',
        'links',
        'style',
        'processing_instructions',
        'inline_style',
        'scripts',
        'javascript',
        'comments',
        'frames',
        'forms',
        'annoying_tags',
        'remove_unknown_tags',
        'safe_attrs_only',
    )
    options = dict.fromkeys(enabled_switches, True)

    # Attribute whitelist used together with safe_attrs_only=True.
    options['safe_attrs'] = frozenset(
        ['src', 'color', 'href', 'title', 'class', 'name', 'id']
    )
    # Tag names handed to the Cleaner's remove_tags option.
    options['remove_tags'] = ('span', 'font', 'div')

    return Cleaner(**options).clean_html(dirty_html)
def to_html(element):
    """Serialize ``element`` with ``etree.tostring``.

    Args:
        element: The object to serialize; passed straight to
            ``etree.tostring``.

    Returns:
        The result of ``etree.tostring`` called with ``pretty_print=True``
        and ``encoding='utf-8'``.
    """
    serialize_options = {'pretty_print': True, 'encoding': 'utf-8'}
    return etree.tostring(element, **serialize_options)
if __name__ == '__main__':
    # Usage: python <script> INPUT_HTML
    # Reads INPUT_HTML, sanitizes it and writes the result to
    # "INPUT_HTML.new.html" encoded as UTF-8.

    # Exit with a usage hint instead of an IndexError traceback when the
    # input path is missing.
    if len(sys.argv) < 2:
        sys.stderr.write('usage: %s INPUT_HTML\n' % sys.argv[0])
        sys.exit(2)

    source_path = sys.argv[1]

    # >>>>>>>>>>>>>>>>>>> SET INPUT FILE ENCODING HERE <<<<<<<<<<<<<<<<<<<<<
    input_encoding = 'cp1252'

    with codecs.open(source_path, 'rb', input_encoding) as fin:
        sys.stderr.write('sanitizing input...')
        cln_html = sanitize(fin.read())
        sys.stderr.write('Done\n')

    # The output path is the input path with '.new.html' appended.
    with codecs.open(source_path + '.new.html', 'wb', 'utf-8') as fout:
        sys.stderr.write('writing file...')
        fout.write(cln_html)
        sys.stderr.write('Done\n')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment