StevenMaude/extract_text_and_remove_line_breaks_from_html.py

## extract_text_and_remove_line_breaks_from_html.py
#!/usr/bin/env python
# encoding: utf-8

from __future__ import (unicode_literals, print_function,
                        absolute_import, division)
import codecs
import re
import sys

import lxml.html
import lxml.html.clean


def read_file(input_filename):
    """
    Return the full text of a UTF-8 encoded file.

    input_filename is the path of the file to read. The file is decoded
    as UTF-8 and returned as a single string.
    """
    with codecs.open(input_filename, 'r', encoding='utf-8') as handle:
        contents = handle.read()
    return contents


def clean_html(root):
    """
    Take root element and return a cleaned root element.

    Removes styles, scripts, comments, links etc. from element
    and its child elements (lxml's Cleaner with style removal switched
    on in addition to its default options), then puts a newline at the
    start of the tail of every <p> and <br> element so that paragraph
    and line breaks are still visible once only the text is extracted.

    Returns the element produced by the Cleaner.

    See http://lxml.de/3.4/api/lxml.html.clean.Cleaner-class.html
    """
    cleaner = lxml.html.clean.Cleaner(style=True)
    cleaned_html = cleaner.clean_html(root)
    # Both paths are relative to the cleaned element. The previous
    # "*//p" only matched <p> elements nested at least two levels down,
    # so paragraphs that were direct children of the root (as happens
    # when a fragment gets wrapped in a <div>) never got their newline.
    for el in cleaned_html.xpath(".//p|.//br"):
        el.tail = "\n" + el.tail if el.tail else "\n"
    return cleaned_html


def write_output(output_filename, element):
    """
    Save the text of an HTML element, including its children, to a file.

    output_filename is the path to write (created or overwritten, encoded
    as UTF-8); element is any object exposing text_content().
    """
    with codecs.open(output_filename, 'w', encoding='utf-8') as sink:
        extracted_text = element.text_content()
        sink.write(extracted_text)


def main():
    """
    Read HTML file and output cleaned text from it.

    Expects two command line arguments: the path of the input HTML file
    and the path of the text file to write. Runs of line breaks in the
    source markup are collapsed to a single space before parsing so that
    the formatting of the HTML source does not leak into the output.

    Exits with status 2 and a usage message on stderr when either
    argument is missing. Any additional arguments are ignored.
    """
    if len(sys.argv) < 3:
        # Give a readable hint instead of failing with a bare IndexError.
        prog = sys.argv[0] if sys.argv else 'script'
        sys.stderr.write(
            'Usage: {0} INPUT_HTML OUTPUT_TEXT\n'.format(prog))
        sys.exit(2)
    input_filename, output_filename = sys.argv[1:3]

    content = read_file(input_filename)
    new_content = re.sub(r'(\r\n|\n|\r)+', ' ', content)
    root = lxml.html.fromstring(new_content)
    cleaned_html = clean_html(root)
    write_output(output_filename, cleaned_html)


if __name__ == '__main__':
    # Make stdout emit UTF-8. The codecs writer produces bytes, so on
    # Python 3 it has to wrap the binary buffer underneath the text-mode
    # sys.stdout (wrapping sys.stdout itself makes every later write fail
    # with TypeError). Python 2's sys.stdout has no .buffer attribute and
    # is wrapped directly, as before.
    sys.stdout = codecs.getwriter('utf-8')(
        getattr(sys.stdout, 'buffer', sys.stdout))
    main()
	#!/usr/bin/env python
	# encoding: utf-8

	from __future__ import (unicode_literals, print_function,
	absolute_import, division)
	import codecs
	import re
	import sys

	import lxml.html
	import lxml.html.clean


	def read_file(input_filename):
		"""
		Return content of file.

		The file at input_filename is decoded as UTF-8 and returned as a
		single string.
		"""
		with codecs.open(input_filename, 'r', encoding='utf-8') as f:
			return f.read()


	def clean_html(root):
		"""
		Take root element and return a cleaned root element.

		Removes styles, scripts, comments, links etc. from element
		and its child elements (lxml's Cleaner with style removal switched
		on in addition to its default options), then puts a newline at the
		start of the tail of every <p> and <br> element so that paragraph
		and line breaks are still visible once only the text is extracted.

		Returns the element produced by the Cleaner.

		See http://lxml.de/3.4/api/lxml.html.clean.Cleaner-class.html
		"""
		cleaner = lxml.html.clean.Cleaner(style=True)
		cleaned_html = cleaner.clean_html(root)
		# The union operator must be a bare "|" (a backslash before it is
		# not valid XPath), and both paths are relative to the cleaned
		# element so that <p> elements directly under the root are
		# matched too.
		for el in cleaned_html.xpath(".//p|.//br"):
			el.tail = "\n" + el.tail if el.tail else "\n"
		return cleaned_html


	def write_output(output_filename, element):
		"""
		Write text from HTML element and all child elements to output file.

		output_filename is created or overwritten and encoded as UTF-8;
		the text comes from element.text_content().
		"""
		with codecs.open(output_filename, 'w', encoding='utf-8') as f:
			f.write(element.text_content())


	def main():
		"""
		Read HTML file and output cleaned text from it.

		Expects two command line arguments: the path of the input HTML file
		and the path of the text file to write. Runs of line breaks in the
		source markup are collapsed to a single space before parsing so that
		the formatting of the HTML source does not leak into the output.

		Exits with status 2 and a usage message on stderr when either
		argument is missing. Any additional arguments are ignored.
		"""
		if len(sys.argv) < 3:
			# Give a readable hint instead of failing with a bare IndexError.
			prog = sys.argv[0] if sys.argv else 'script'
			sys.stderr.write(
				'Usage: {0} INPUT_HTML OUTPUT_TEXT\n'.format(prog))
			sys.exit(2)
		input_filename, output_filename = sys.argv[1:3]

		content = read_file(input_filename)
		# Alternation needs a bare "|"; an escaped "\|" would only match a
		# literal pipe character and leave real line breaks untouched.
		new_content = re.sub(r'(\r\n|\n|\r)+', ' ', content)
		root = lxml.html.fromstring(new_content)
		cleaned_html = clean_html(root)
		write_output(output_filename, cleaned_html)


	if __name__ == '__main__':
		# Make stdout emit UTF-8. The codecs writer produces bytes, so on
		# Python 3 it has to wrap the binary buffer underneath the text-mode
		# sys.stdout (wrapping sys.stdout itself makes every later write
		# fail with TypeError). Python 2's sys.stdout has no .buffer
		# attribute and is wrapped directly.
		sys.stdout = codecs.getwriter('utf-8')(
			getattr(sys.stdout, 'buffer', sys.stdout))
		main()