genothomas/html2text.py

## html2text.py
#!/usr/bin/env python3

# Convert HTML markup from a file or stdin to plain text.
#
# Usage:
#   html2text.py [FILE]
#
# Reads FILE (decoded as UTF-8) when given, otherwise reads standard input.

#!/usr/bin/env python3

import sys

from lxml import html
from lxml.html import tostring
from lxml.html.clean import Cleaner


def sanitize(dirty_html):
    """Strip scripts, styling, structure and other noise from HTML markup.

    An ``lxml.html.clean.Cleaner`` is configured with every removal option
    switched on, attributes are restricted to a short whitelist, and
    ``span``, ``font`` and ``div`` are listed in ``remove_tags`` (lxml
    documents this as dropping the tag while keeping its content).

    Args:
        dirty_html: The markup to clean; handed to ``Cleaner.clean_html``.

    Returns:
        Whatever ``Cleaner.clean_html`` returns for the input.  Blank
        ``str``/``bytes`` input is returned unchanged.
    """
    # lxml refuses to parse an empty document (ParserError), so blank input
    # is handed back untouched instead of crashing the caller.
    if isinstance(dirty_html, (str, bytes)) and not dirty_html.strip():
        return dirty_html

    cleaner = Cleaner(
        page_structure=True,
        meta=True,
        embedded=True,
        links=True,
        style=True,
        processing_instructions=True,
        inline_style=True,
        scripts=True,
        javascript=True,
        comments=True,
        frames=True,
        forms=True,
        annoying_tags=True,
        remove_unknown_tags=True,
        safe_attrs_only=True,
        # Only these attributes survive cleaning; everything else is dropped.
        safe_attrs=frozenset(['src', 'color', 'href', 'title', 'class', 'name', 'id']),
        remove_tags=('span', 'font', 'div'),
    )

    return cleaner.clean_html(dirty_html)

# Read the markup from the file named on the command line, or from stdin
# when no argument is given.
if len(sys.argv) > 1:
    # The context manager closes the file as soon as it has been read;
    # the original left the handle open until interpreter exit.
    with open(sys.argv[1], encoding='utf-8') as fin:
        source = fin.read()
else:
    fin = sys.stdin
    source = fin.read()

source = sanitize(source)
# Turn explicit line breaks into newlines before the markup is flattened.
source = source.replace('<br>', '\n')

tree = html.fromstring(source)
# method='text' serializes only the text content; with encoding='utf-8'
# the result is a bytes object, hence the decode below.
plain = tostring(tree, method='text', encoding='utf-8')

print(plain.decode('utf-8'))
	#!/usr/bin/env python3

	# Convert HTML markup from a file or stdin to plain text.
	#
	# Usage:
	#   html2text.py [FILE]
	#
	# Reads FILE (decoded as UTF-8) when given, otherwise reads standard input.

	#!/usr/bin/env python3

	import sys

	from lxml import html
	from lxml.html import tostring
	from lxml.html.clean import Cleaner


	def sanitize(dirty_html):
	    """Strip scripts, styling, structure and other noise from HTML markup.

	    An ``lxml.html.clean.Cleaner`` is configured with every removal option
	    switched on, attributes are restricted to a short whitelist, and
	    ``span``, ``font`` and ``div`` are listed in ``remove_tags`` (lxml
	    documents this as dropping the tag while keeping its content).

	    Args:
	        dirty_html: The markup to clean; handed to ``Cleaner.clean_html``.

	    Returns:
	        Whatever ``Cleaner.clean_html`` returns for the input.  Blank
	        ``str``/``bytes`` input is returned unchanged.
	    """
	    # lxml refuses to parse an empty document (ParserError), so blank input
	    # is handed back untouched instead of crashing the caller.
	    if isinstance(dirty_html, (str, bytes)) and not dirty_html.strip():
	        return dirty_html

	    cleaner = Cleaner(
	        page_structure=True,
	        meta=True,
	        embedded=True,
	        links=True,
	        style=True,
	        processing_instructions=True,
	        inline_style=True,
	        scripts=True,
	        javascript=True,
	        comments=True,
	        frames=True,
	        forms=True,
	        annoying_tags=True,
	        remove_unknown_tags=True,
	        safe_attrs_only=True,
	        # Only these attributes survive cleaning; everything else is dropped.
	        safe_attrs=frozenset(['src', 'color', 'href', 'title', 'class', 'name', 'id']),
	        remove_tags=('span', 'font', 'div'),
	    )

	    return cleaner.clean_html(dirty_html)

	# Read the markup from the file named on the command line, or from stdin
	# when no argument is given.
	if len(sys.argv) > 1:
	    # The context manager closes the file as soon as it has been read.
	    with open(sys.argv[1], encoding='utf-8') as fin:
	        source = fin.read()
	else:
	    fin = sys.stdin
	    source = fin.read()

	source = sanitize(source)
	# Turn explicit line breaks into newlines before the markup is flattened.
	source = source.replace('<br>', '\n')

	tree = html.fromstring(source)
	# method='text' serializes only the text content; with encoding='utf-8'
	# the result is a bytes object, hence the decode below.
	plain = tostring(tree, method='text', encoding='utf-8')

	print(plain.decode('utf-8'))