Skip to content

Instantly share code, notes, and snippets.

@genothomas
Forked from glowinthedark/html2text.py
Created March 24, 2022 16:30
Show Gist options
  • Save genothomas/0c0e8e6c113caa9b0e0f3e9c2fa4adf4 to your computer and use it in GitHub Desktop.
Save genothomas/0c0e8e6c113caa9b0e0f3e9c2fa4adf4 to your computer and use it in GitHub Desktop.
HTML to plain text converter using python and lxml
#!/usr/bin/env python3
# Convert HTML markup from a file or stdin to plain text.
#
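# Usage:
#   html2text.py [FILE]
#
# Reads HTML from FILE (decoded as UTF-8) when given, otherwise from stdin,
# and prints the plain-text result to stdout.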
#!/usr/bin/env python3
import sys
from lxml import html
from lxml.html import tostring
from lxml.html.clean import Cleaner
def sanitize(dirty_html):
    """Run *dirty_html* through an lxml ``Cleaner`` and return the result.

    The cleaner is configured with all of its removal switches turned on
    (scripts, styles, comments, forms, frames, embedded content, ...),
    restricts attributes to a small allow-list, and is asked to remove
    ``span``, ``font`` and ``div`` tags.

    Returns whatever ``Cleaner.clean_html`` produces for the given input.
    """
    # Attributes permitted to remain when ``safe_attrs_only`` is enabled.
    allowed_attrs = frozenset(['src', 'color', 'href', 'title', 'class', 'name', 'id'])

    options = {
        # Structural and metadata elements.
        'page_structure': True,
        'meta': True,
        'processing_instructions': True,
        'comments': True,
        # Active or external content.
        'scripts': True,
        'javascript': True,
        'embedded': True,
        'frames': True,
        'forms': True,
        'links': True,
        # Presentation.
        'style': True,
        'inline_style': True,
        'annoying_tags': True,
        # Tag and attribute filtering.
        'remove_unknown_tags': True,
        'safe_attrs_only': True,
        'safe_attrs': allowed_attrs,
        'remove_tags': ('span', 'font', 'div'),
    }

    html_cleaner = Cleaner(**options)
    return html_cleaner.clean_html(dirty_html)
# Load the HTML to convert: from the file named by the first command-line
# argument when one is given, otherwise from standard input.
if len(sys.argv) > 1:
    # The context manager closes the file as soon as it has been read;
    # previously the handle was opened and never closed.
    with open(sys.argv[1], encoding='utf-8') as fin:
        source = fin.read()
else:
    source = sys.stdin.read()

# Strip scripts, styles and other unwanted markup first.
source = sanitize(source)

# Turn literal <br> tags into newlines before the markup is discarded, so
# explicit line breaks are still visible in the text output.
source = source.replace('<br>', '\n')

# Re-parse the cleaned markup and serialise it with method='text'; the
# result is UTF-8 encoded bytes, decoded again for printing.
tree = html.fromstring(source)
plain = tostring(tree, method='text', encoding='utf-8')
print(plain.decode('utf-8'))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment