# thiagodiniz/html_parser.py

## html_parser.py
# -*- coding: utf-8 -*-
import io

import requests
from bs4 import NavigableString, BeautifulSoup
from docx import Document as DocxDocument
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.shared import Cm, Pt, Mm
from docx.image.exceptions import UnrecognizedImageError
from lxml import etree

# A4 sheet dimensions in millimetres. "WIDHT" is misspelled, but the name is
# referenced by convert_image_size, so it is kept as is.
A4_PAGE_HEIGHT = 297
A4_PAGE_WIDHT = 210
# Margin, in millimetres, that convert_image_size subtracts twice from the
# page width to obtain the printable width (25.4 mm is one inch).
DEFAULT_MARGINS = 25.4
# Largest accepted image width/height attribute value; convert_image_size
# clamps to it and maps it onto the full printable width.
MAX_PICTURE_SIZE = 602

def parser(html, output_path="output/teste.docx"):
    """Convert an HTML string into a .docx file.

    Creates an empty python-docx document, fills it through ``parsing`` and
    saves it to disk.

    Args:
        html: HTML markup to convert.
        output_path: Destination of the generated document. Defaults to
            ``"output/teste.docx"``.
    """
    # parsing() builds its own soup, so the markup is parsed exactly once.
    document = DocxDocument()
    parsing(html, document)
    document.save(output_path)

def parsing(html, document):
    """Parse ``html`` and render the content of its body into ``document``.

    Args:
        html: HTML markup, parsed with BeautifulSoup's ``lxml`` backend.
        document: python-docx document that receives the content.

    Returns:
        The parsed ``<body>`` tag, or ``None`` when the markup yields no
        body (for instance an empty string); nothing is written then.
    """
    soup = BeautifulSoup(html, features="lxml")
    body = soup.body
    if body is None:
        # Nothing to render; avoids an AttributeError in remove_newlines.
        return None

    remove_newlines(body)
    process_node(body, document)

    return body


def process_node(node, document, parent=None, paragraph=None, style=None):
    """Recursively render one HTML node into ``document``.

    Args:
        node: A BeautifulSoup ``Tag`` or ``NavigableString``.
        document: python-docx document being built.
        parent: Tag containing ``node`` (``None`` for the root call).
        paragraph: Paragraph currently receiving inline content, if any.
        style: Style keywords (see ``process_style``) inherited from the
            enclosing tags.

    Returns:
        The paragraph that the next sibling should keep writing to. ``None``
        is returned after a list, math, table, image or ``<br>`` node, which
        makes the next text node start a new paragraph.
    """
    style = [] if style is None else style

    # Text nodes are recognised by type rather than by catching the
    # AttributeError raised on ``node.contents``: a try block around the
    # whole body would also hide genuine errors from the handlers below.
    if isinstance(node, NavigableString):
        process_string(node, document, parent, paragraph, style)
        return paragraph

    tag_name = node.name

    if tag_name in ('ul', 'ol'):
        process_list(node, document)
        return None

    if tag_name == 'math':
        process_math(node, document, paragraph)
        return None

    if tag_name == 'table':
        process_table(document, node)
        return None

    if tag_name in ('p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
        # Block-level text starts a fresh paragraph and drops inherited styles.
        paragraph = document.add_paragraph('')
        style = []

    inline_style = node.get('style')
    if inline_style is not None:
        style = style + process_style(inline_style)

    for child in node.children:
        paragraph = process_node(child, document, node, paragraph, style)

    if tag_name == 'img':
        process_image(node, document, paragraph, style)
        return None

    if tag_name == 'br':
        document.add_paragraph('')
        return None

    return paragraph


def process_string(str_node, document, parent, paragraph, style=None):
    """Append a text node to ``paragraph`` as a formatted run.

    Args:
        str_node: Text to write; leading/trailing whitespace is stripped.
            A lone newline or non-breaking space writes nothing.
        document: Used only to create a paragraph when ``paragraph`` is None.
        parent: Tag containing the text; ``strong``/``em`` parents make the
            run bold/italic. May be ``None``.
        paragraph: Paragraph receiving the run, or ``None`` to create one.
        style: Style keywords as produced by ``process_style``.
    """
    style = [] if style is None else style

    if paragraph is None:
        paragraph = document.add_paragraph('')

    # Explicit guards replace the former blanket ``except Exception: pass``:
    # only text can be rendered, and a missing parent must not abort the
    # formatting that follows.
    if not isinstance(str_node, str):
        return
    if str_node == '\n' or str_node == u'\xa0':
        return

    # NOTE(review): stripping also removes the space that separates this
    # text from a neighbouring inline element -- confirm this is intended.
    text = str_node.strip()
    run = paragraph.add_run(text)

    parent_name = getattr(parent, 'name', None)
    if parent_name == 'strong' or 'bold' in style:
        run.font.bold = True
    if parent_name == 'em' or 'italic' in style:
        run.font.italic = True
    if 'small' in style:
        run.font.size = Pt(8)
    if 'underline' in style:
        run.font.underline = True

    # Same precedence as a chain of ifs: the last keyword present wins.
    for keyword in ('left', 'right', 'center', 'justify'):
        if keyword in style:
            paragraph.alignment = getattr(WD_ALIGN_PARAGRAPH, keyword.upper())


def process_list(node, document):
    """Render a ``<ul>``/``<ol>`` tag as one list paragraph per ``<li>``.

    Args:
        node: The list tag; ``ul`` selects the 'List Bullet' paragraph
            style, anything else 'List Number'.
        document: python-docx document receiving the paragraphs.
    """
    if node.name == 'ul':
        list_style = 'List Bullet'
    else:
        list_style = 'List Number'

    # Children other than <li> (text, stray tags) are ignored.
    list_items = (entry for entry in node.children if entry.name == 'li')
    for item in list_items:
        item_paragraph = document.add_paragraph('', style=list_style)
        process_list_item(item, document, node, item_paragraph)


def process_list_item(node, document, parent, paragraph):
    """Write the content of a list item into its list paragraph.

    Walks ``node`` recursively: text goes to ``paragraph`` through
    ``process_string``, ``<math>`` tags through ``process_math``, and any
    other tag is descended into.

    Args:
        node: The ``<li>`` tag, or one of its descendants during recursion.
        document: python-docx document being built.
        parent: Tag containing ``node``.
        paragraph: List paragraph receiving the content.
    """
    # Text nodes are recognised by type instead of by catching the
    # AttributeError they raise, so failures in the handlers are not hidden.
    if isinstance(node, NavigableString):
        process_string(node, document, parent, paragraph)
        return

    remove_newlines(node)

    for child in node.children:
        if child.name == 'math':
            process_math(child, document, paragraph)
        else:
            process_list_item(child, document, node, paragraph)


def process_image(img_node, document, paragraph, styles=None):
    """Download the picture referenced by an ``<img>`` tag and insert it.

    Args:
        img_node: The ``<img>`` tag. ``src`` is required; ``width``,
            ``height`` and ``style`` are optional.
        document: Object exposing ``add_picture``: the python-docx document,
            or a run when the image sits inside a table cell.
        paragraph: Current paragraph; it is deleted first when empty.
        styles: Inherited style keywords; ``'center'`` centres the picture.

    An unrecognised image is reported through the ``logging`` module (the
    ``rollbar`` name used before is never imported in this file) and
    replaced by a placeholder text.
    """
    import logging

    styles = [] if styles is None else styles

    if paragraph and paragraph.text == '':
        delete_paragraph(paragraph)

    url = img_node['src']
    image = download_image(url)

    # Scale along the dominant dimension. A missing or non-numeric attribute
    # (e.g. "50%") leaves the picture at its native size instead of raising.
    dimension = 'width' if should_use_width(img_node) else 'height'
    size_kwargs = {}
    raw_size = img_node.get(dimension)
    if raw_size is not None:
        try:
            size_kwargs[dimension] = Mm(convert_image_size(raw_size))
        except (TypeError, ValueError):
            size_kwargs = {}

    try:
        document.add_picture(image, **size_kwargs)

        if should_be_centered(img_node, styles):
            try:
                picture_paragraph = document.paragraphs[-1]
            except AttributeError:
                # ``document`` is a run: centre the paragraph that owns it.
                picture_paragraph = document._parent
            picture_paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
    except UnrecognizedImageError:
        logging.getLogger(__name__).warning(
            'UnrecognizedImageError at html_parser.py: %s', url)
        message = '### Imagem não pode ser incluida, entre em contato com o Lecionas e informe o problema'
        if hasattr(document, 'add_paragraph'):
            document.add_paragraph(message)
        else:
            # Runs cannot create paragraphs; write the notice inline.
            document.add_text(message)

def process_math(node, document, paragraph=None):
    """Transform a ``<math>`` tag with the MML2OMML stylesheet and append it.

    The tag's markup is run through the XSLT stylesheet found at
    ``/export/MML2OMML.XSL`` next to the project's ``main`` directory, and
    the root of the result is appended to the paragraph's XML element.

    Args:
        node: The ``<math>`` tag.
        document: Used to create a paragraph when ``paragraph`` is None.
        paragraph: Paragraph receiving the converted element.
    """
    import re
    from django.conf import settings

    # A trailing "/main" in BASE_DIR becomes the placeholder that is then
    # filled with the stylesheet location.
    # NOTE(review): when BASE_DIR does not end in "/main" there is no
    # placeholder and the % formatting raises TypeError.
    path_template = re.sub(r'/main$', '%s', settings.BASE_DIR)
    stylesheet_path = path_template % "/export/MML2OMML.XSL"

    mathml_root = etree.fromstring(str(node))
    to_omml = etree.XSLT(etree.parse(stylesheet_path))
    converted = to_omml(mathml_root)

    target = document.add_paragraph() if paragraph is None else paragraph
    target._element.append(converted.getroot())

def should_be_centered(img_node, styles=None):
    """Tell whether an image must be horizontally centred.

    Args:
        img_node: The ``<img>`` tag (any mapping offering ``get``).
        styles: Inherited style keywords, or ``None``.

    Returns:
        True when the tag's inline style sets both ``margin-left: auto`` and
        ``margin-right: auto``, or when ``'center'`` is among ``styles``.
    """
    # A plain lookup replaces the bare ``except:`` that guarded a missing
    # ``style`` attribute.
    style_str = img_node.get('style') or ''
    auto_margins = ("margin-left: auto" in style_str) and ("margin-right: auto" in style_str)

    return auto_margins or (styles is not None and 'center' in styles)

def convert_image_size(orginal_size):
    """Convert an image size attribute into whole millimetres.

    The value is capped at MAX_PICTURE_SIZE and scaled so that
    MAX_PICTURE_SIZE corresponds to the A4 width minus both margins.

    Args:
        orginal_size: Attribute value; anything ``float()`` accepts.

    Returns:
        The rounded size in millimetres.
    """
    pixels = min(float(orginal_size), MAX_PICTURE_SIZE)
    printable_width = round(A4_PAGE_WIDHT - (DEFAULT_MARGINS * 2), 1)
    return round((pixels * printable_width) / MAX_PICTURE_SIZE)

def should_use_width(img_node):
    """Choose whether an image is sized by its width or by its height.

    Each attribute is parsed on its own, so a usable ``height`` is still
    honoured when ``width`` is missing or not an integer; an unusable
    attribute counts as 0.

    Args:
        img_node: The ``<img>`` tag (any mapping of attribute values).

    Returns:
        False when the height is greater than the width, True otherwise.
    """
    def _dimension(attribute):
        try:
            return int(img_node[attribute])
        except (KeyError, TypeError, ValueError):
            return 0

    return _dimension('height') <= _dimension('width')


def process_table(document, table_node):
    """Render an HTML ``<table>`` as a python-docx table.

    The column count is the widest row measured in grid columns (``colspan``
    included). A cell with ``colspan`` is merged with the cells it covers
    and the next HTML cell is written after the merged region. ``rowspan``
    is not handled. Rows and cells of nested tables are not treated as rows
    of this table.

    Args:
        document: python-docx document receiving the table.
        table_node: The ``<table>`` tag.
    """
    def _colspan(html_cell):
        # A missing or malformed colspan counts as a single column.
        try:
            return max(1, int(html_cell.get('colspan', 1)))
        except (TypeError, ValueError):
            return 1

    all_rows = [row for row in table_node.find_all("tr")
                if row.find_parent("table") is table_node]
    cells_by_row = [row.find_all(["th", "td"], recursive=False) for row in all_rows]
    column_counts = [sum(_colspan(cell) for cell in cells) for cells in cells_by_row]

    # An empty table has nothing to render (and max() would fail on []).
    if not column_counts or max(column_counts) == 0:
        return

    table = document.add_table(rows=len(all_rows), cols=max(column_counts))

    for docx_row, html_cells in zip(table.rows, cells_by_row):
        # Captured before any merge in this row, so indexes stay grid-based.
        row_cells = docx_row.cells

        column = 0
        for html_cell in html_cells:
            span = _colspan(html_cell)
            table_cell = row_cells[column]
            process_table_cell(document, html_cell, table_cell, [], True)
            if span > 1:
                table_cell.merge(row_cells[column + span - 1])
            # Skip the columns swallowed by the merge.
            column += span

def process_table_cell(document, node, cell, styles=None, skip_newlines=False):
    """Write the content of an HTML cell (or a tag inside it) into ``cell``.

    Text and math always go to the last paragraph of the cell. An image gets
    a paragraph of its own followed by a new empty paragraph, which then
    receives whatever comes after the image.

    Args:
        document: python-docx document being built.
        node: The ``<td>``/``<th>`` tag, or a descendant during recursion.
        cell: python-docx cell receiving the content.
        styles: Inherited style keywords.
        skip_newlines: When True, whitespace-only text directly under
            ``node`` is skipped.
    """
    # still not considering new paragraphs inside table_cell
    # https://github.com/python-openxml/python-docx/issues/216
    styles = [] if styles is None else styles
    if node.get('style') is not None:
        styles = styles + process_style(node['style'])

    if not cell.paragraphs:
        cell.add_paragraph()

    for child in node.children:
        if isinstance(child, NavigableString):
            if skip_newlines and (not child.strip()):
                continue

            # Looked up on every use: an image handled earlier (here or in
            # a nested tag) may have appended paragraphs to the cell.
            process_string(child, document, node, cell.paragraphs[-1], styles)
        elif child.name == 'math':
            process_math(child, document, cell.paragraphs[-1])
        elif child.name == 'img':
            run = cell.add_paragraph().add_run()
            process_image(child, run, None, styles)
            cell.add_paragraph()
        else:
            process_table_cell(document, child, cell, styles)

def process_style(style_str=''):
    """Extract the supported style keywords from an inline CSS string.

    Matching is case-insensitive and tolerates any amount of whitespace
    (including none) after the colon.

    Args:
        style_str: Value of a ``style`` attribute, or ``None``.

    Returns:
        A list containing, in this order and only when present: ``'small'``
        (font-size in pt below 11), ``'bold'``, ``'italic'``,
        ``'underline'`` and one of ``'left'``/``'right'``/``'center'``/
        ``'justify'``.
    """
    import re
    default_size = 11
    styles = []

    if not style_str:
        return styles

    def _search(pattern):
        return re.search(pattern, style_str, re.IGNORECASE)

    size_match = _search(r'font-size:\s*(\d+)pt')
    if size_match and int(size_match.group(1)) < default_size:
        styles.append('small')

    if _search(r'font-weight:\s*(700|800|900|bold(er)?)'):
        styles.append('bold')

    if _search(r'font-style:\s*(oblique|italic)'):
        styles.append('italic')

    if _search(r'text-decoration:\s*underline'):
        styles.append('underline')

    align_match = _search(r'text-align:\s*(left|right|center|justify)')
    if align_match:
        styles.append(align_match.group(1).lower())

    return styles


def delete_paragraph(paragraph):
    """Remove ``paragraph`` from its document and invalidate the proxy.

    Args:
        paragraph: python-docx paragraph to delete. A paragraph whose XML
            element is already detached is only invalidated.
    """
    element = paragraph._element
    container = element.getparent()
    if container is not None:
        container.remove(element)
    # Clear the references on the paragraph object itself (not on the XML
    # element) so later use of the stale proxy fails instead of editing a
    # detached element.
    paragraph._p = paragraph._element = None

def remove_newlines(node):
    """Drop the whitespace-only text nodes found directly under ``node``.

    Args:
        node: BeautifulSoup tag whose direct children are cleaned in place.
    """
    # Iterate over a snapshot: extract() removes the element from
    # ``node.contents``, and mutating that list mid-iteration would skip
    # the entry following every removed one.
    for element in list(node.contents):
        if isinstance(element, NavigableString) and element.strip() == '':
            element.extract()

def download_image(url, timeout=30):
    """Fetch an image and return its bytes as an in-memory stream.

    Args:
        url: HTTP(S) address of the image, or a ``data:`` URI with the image
            embedded (base64 or percent-encoded).
        timeout: Seconds ``requests`` waits for the server before giving up,
            so an unresponsive host cannot block forever.

    Returns:
        An ``io.BytesIO`` positioned at the start of the image data. The
        HTTP status is not checked; callers see whatever body was returned.
    """
    import base64
    from urllib.parse import unquote_to_bytes

    if url.startswith('data:'):
        # Embedded image: decode it locally, there is nothing to download.
        header, _, payload = url.partition(',')
        if header.endswith(';base64'):
            return io.BytesIO(base64.b64decode(payload))
        return io.BytesIO(unquote_to_bytes(payload))

    response = requests.get(url, stream=True, timeout=timeout)
    return io.BytesIO(response.content)

def remove_table_border(table):
    """Set the four borders of every cell of ``table`` to ``nil``.

    Args:
        table: python-docx table, modified in place.
    """
    # ``qn`` builds the namespace-qualified attribute name; it is imported
    # here because the module does not import it at the top.
    from docx.oxml.ns import qn
    from docx.oxml.shared import OxmlElement  # Necessary Import

    # https://github.com/python-openxml/python-docx/issues/433
    tbl = table._tbl  # get xml element in table
    for cell in tbl.iter_tcs():
        # tcPr holds the cell properties, borders included; create it when
        # the cell has none yet.
        tcPr = cell.get_or_add_tcPr()
        tcBorders = OxmlElement('w:tcBorders')

        for edge in ('w:top', 'w:left', 'w:bottom', 'w:right'):
            border = OxmlElement(edge)
            border.set(qn('w:val'), 'nil')
            tcBorders.append(border)

        tcPr.append(tcBorders)
	# -*- coding: utf-8 -*-
	import io

	import requests
	from bs4 import NavigableString, BeautifulSoup
	from docx import Document as DocxDocument
	from docx.enum.text import WD_ALIGN_PARAGRAPH
	from docx.shared import Cm, Pt, Mm
	from docx.image.exceptions import UnrecognizedImageError
	from lxml import etree

	# A4 sheet dimensions in millimetres. "WIDHT" is misspelled, but the name is
	# referenced by convert_image_size, so it is kept as is.
	A4_PAGE_HEIGHT = 297
	A4_PAGE_WIDHT = 210
	# Margin, in millimetres, that convert_image_size subtracts twice from the
	# page width to obtain the printable width (25.4 mm is one inch).
	DEFAULT_MARGINS = 25.4
	# Largest accepted image width/height attribute value; convert_image_size
	# clamps to it and maps it onto the full printable width.
	MAX_PICTURE_SIZE = 602

	def parser(html, output_path="output/teste.docx"):
	    """Convert an HTML string into a .docx file.

	    Creates an empty python-docx document, fills it through ``parsing`` and
	    saves it to disk.

	    Args:
	        html: HTML markup to convert.
	        output_path: Destination of the generated document. Defaults to
	            ``"output/teste.docx"``.
	    """
	    # parsing() builds its own soup, so the markup is parsed exactly once.
	    document = DocxDocument()
	    parsing(html, document)
	    document.save(output_path)

	def parsing(html, document):
	    """Parse ``html`` and render the content of its body into ``document``.

	    Args:
	        html: HTML markup, parsed with BeautifulSoup's ``lxml`` backend.
	        document: python-docx document that receives the content.

	    Returns:
	        The parsed ``<body>`` tag, or ``None`` when the markup yields no
	        body (for instance an empty string); nothing is written then.
	    """
	    soup = BeautifulSoup(html, features="lxml")
	    body = soup.body
	    if body is None:
	        # Nothing to render; avoids an AttributeError in remove_newlines.
	        return None

	    remove_newlines(body)
	    process_node(body, document)

	    return body


	def process_node(node, document, parent=None, paragraph=None, style=None):
	    """Recursively render one HTML node into ``document``.

	    Args:
	        node: A BeautifulSoup ``Tag`` or ``NavigableString``.
	        document: python-docx document being built.
	        parent: Tag containing ``node`` (``None`` for the root call).
	        paragraph: Paragraph currently receiving inline content, if any.
	        style: Style keywords (see ``process_style``) inherited from the
	            enclosing tags.

	    Returns:
	        The paragraph that the next sibling should keep writing to. ``None``
	        is returned after a list, math, table, image or ``<br>`` node, which
	        makes the next text node start a new paragraph.
	    """
	    style = [] if style is None else style

	    # Text nodes are recognised by type rather than by catching the
	    # AttributeError raised on ``node.contents``: a try block around the
	    # whole body would also hide genuine errors from the handlers below.
	    if isinstance(node, NavigableString):
	        process_string(node, document, parent, paragraph, style)
	        return paragraph

	    tag_name = node.name

	    if tag_name in ('ul', 'ol'):
	        process_list(node, document)
	        return None

	    if tag_name == 'math':
	        process_math(node, document, paragraph)
	        return None

	    if tag_name == 'table':
	        process_table(document, node)
	        return None

	    if tag_name in ('p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
	        # Block-level text starts a fresh paragraph and drops inherited styles.
	        paragraph = document.add_paragraph('')
	        style = []

	    inline_style = node.get('style')
	    if inline_style is not None:
	        style = style + process_style(inline_style)

	    for child in node.children:
	        paragraph = process_node(child, document, node, paragraph, style)

	    if tag_name == 'img':
	        process_image(node, document, paragraph, style)
	        return None

	    if tag_name == 'br':
	        document.add_paragraph('')
	        return None

	    return paragraph


	def process_string(str_node, document, parent, paragraph, style=None):
	    """Append a text node to ``paragraph`` as a formatted run.

	    Args:
	        str_node: Text to write; leading/trailing whitespace is stripped.
	            A lone newline or non-breaking space writes nothing.
	        document: Used only to create a paragraph when ``paragraph`` is None.
	        parent: Tag containing the text; ``strong``/``em`` parents make the
	            run bold/italic. May be ``None``.
	        paragraph: Paragraph receiving the run, or ``None`` to create one.
	        style: Style keywords as produced by ``process_style``.
	    """
	    style = [] if style is None else style

	    if paragraph is None:
	        paragraph = document.add_paragraph('')

	    # Explicit guards replace the former blanket ``except Exception: pass``:
	    # only text can be rendered, and a missing parent must not abort the
	    # formatting that follows.
	    if not isinstance(str_node, str):
	        return
	    if str_node == '\n' or str_node == u'\xa0':
	        return

	    # NOTE(review): stripping also removes the space that separates this
	    # text from a neighbouring inline element -- confirm this is intended.
	    text = str_node.strip()
	    run = paragraph.add_run(text)

	    parent_name = getattr(parent, 'name', None)
	    if parent_name == 'strong' or 'bold' in style:
	        run.font.bold = True
	    if parent_name == 'em' or 'italic' in style:
	        run.font.italic = True
	    if 'small' in style:
	        run.font.size = Pt(8)
	    if 'underline' in style:
	        run.font.underline = True

	    # Same precedence as a chain of ifs: the last keyword present wins.
	    for keyword in ('left', 'right', 'center', 'justify'):
	        if keyword in style:
	            paragraph.alignment = getattr(WD_ALIGN_PARAGRAPH, keyword.upper())


	def process_list(node, document):
	    """Render a ``<ul>``/``<ol>`` tag as one list paragraph per ``<li>``.

	    Args:
	        node: The list tag; ``ul`` selects the 'List Bullet' paragraph
	            style, anything else 'List Number'.
	        document: python-docx document receiving the paragraphs.
	    """
	    list_style = 'List Bullet' if node.name == 'ul' else 'List Number'

	    # Children other than <li> (text, stray tags) are ignored.
	    for child in node.children:
	        if child.name == 'li':
	            paragraph = document.add_paragraph('', style=list_style)
	            process_list_item(child, document, node, paragraph)


	def process_list_item(node, document, parent, paragraph):
	    """Write the content of a list item into its list paragraph.

	    Walks ``node`` recursively: text goes to ``paragraph`` through
	    ``process_string``, ``<math>`` tags through ``process_math``, and any
	    other tag is descended into.

	    Args:
	        node: The ``<li>`` tag, or one of its descendants during recursion.
	        document: python-docx document being built.
	        parent: Tag containing ``node``.
	        paragraph: List paragraph receiving the content.
	    """
	    # Text nodes are recognised by type instead of by catching the
	    # AttributeError they raise, so failures in the handlers are not hidden.
	    if isinstance(node, NavigableString):
	        process_string(node, document, parent, paragraph)
	        return

	    remove_newlines(node)

	    for child in node.children:
	        if child.name == 'math':
	            process_math(child, document, paragraph)
	        else:
	            process_list_item(child, document, node, paragraph)


	def process_image(img_node, document, paragraph, styles=None):
	    """Download the picture referenced by an ``<img>`` tag and insert it.

	    Args:
	        img_node: The ``<img>`` tag. ``src`` is required; ``width``,
	            ``height`` and ``style`` are optional.
	        document: Object exposing ``add_picture``: the python-docx document,
	            or a run when the image sits inside a table cell.
	        paragraph: Current paragraph; it is deleted first when empty.
	        styles: Inherited style keywords; ``'center'`` centres the picture.

	    An unrecognised image is reported through the ``logging`` module (the
	    ``rollbar`` name used before is never imported in this file) and
	    replaced by a placeholder text.
	    """
	    import logging

	    styles = [] if styles is None else styles

	    if paragraph and paragraph.text == '':
	        delete_paragraph(paragraph)

	    url = img_node['src']
	    image = download_image(url)

	    # Scale along the dominant dimension. A missing or non-numeric attribute
	    # (e.g. "50%") leaves the picture at its native size instead of raising.
	    dimension = 'width' if should_use_width(img_node) else 'height'
	    size_kwargs = {}
	    raw_size = img_node.get(dimension)
	    if raw_size is not None:
	        try:
	            size_kwargs[dimension] = Mm(convert_image_size(raw_size))
	        except (TypeError, ValueError):
	            size_kwargs = {}

	    try:
	        document.add_picture(image, **size_kwargs)

	        if should_be_centered(img_node, styles):
	            try:
	                picture_paragraph = document.paragraphs[-1]
	            except AttributeError:
	                # ``document`` is a run: centre the paragraph that owns it.
	                picture_paragraph = document._parent
	            picture_paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
	    except UnrecognizedImageError:
	        logging.getLogger(__name__).warning(
	            'UnrecognizedImageError at html_parser.py: %s', url)
	        message = '### Imagem não pode ser incluida, entre em contato com o Lecionas e informe o problema'
	        if hasattr(document, 'add_paragraph'):
	            document.add_paragraph(message)
	        else:
	            # Runs cannot create paragraphs; write the notice inline.
	            document.add_text(message)

	def process_math(node, document, paragraph=None):
	    """Transform a ``<math>`` tag with the MML2OMML stylesheet and append it.

	    The tag's markup is run through the XSLT stylesheet found at
	    ``/export/MML2OMML.XSL`` next to the project's ``main`` directory, and
	    the root of the result is appended to the paragraph's XML element.

	    Args:
	        node: The ``<math>`` tag.
	        document: Used to create a paragraph when ``paragraph`` is None.
	        paragraph: Paragraph receiving the converted element.
	    """
	    import re
	    from django.conf import settings

	    # A trailing "/main" in BASE_DIR becomes the placeholder that is then
	    # filled with the stylesheet location.
	    # NOTE(review): when BASE_DIR does not end in "/main" there is no
	    # placeholder and the % formatting raises TypeError.
	    project_path = re.sub(r'/main$', '%s', settings.BASE_DIR)
	    mml2omml_stylesheet_path = project_path % "/export/MML2OMML.XSL"
	    element_tree = etree.fromstring(str(node))
	    xslt = etree.parse(mml2omml_stylesheet_path)
	    transform = etree.XSLT(xslt)
	    new_dom = transform(element_tree)

	    if paragraph is None:
	        paragraph = document.add_paragraph()

	    paragraph._element.append(new_dom.getroot())

	def should_be_centered(img_node, styles=None):
	    """Tell whether an image must be horizontally centred.

	    Args:
	        img_node: The ``<img>`` tag (any mapping offering ``get``).
	        styles: Inherited style keywords, or ``None``.

	    Returns:
	        True when the tag's inline style sets both ``margin-left: auto`` and
	        ``margin-right: auto``, or when ``'center'`` is among ``styles``.
	    """
	    # A plain lookup replaces the bare ``except:`` that guarded a missing
	    # ``style`` attribute.
	    style_str = img_node.get('style') or ''
	    auto_margins = ("margin-left: auto" in style_str) and ("margin-right: auto" in style_str)

	    return auto_margins or (styles is not None and 'center' in styles)

	def convert_image_size(orginal_size):
	    """Convert an image size attribute into whole millimetres.

	    The value is capped at MAX_PICTURE_SIZE and scaled so that
	    MAX_PICTURE_SIZE corresponds to the A4 width minus both margins.

	    Args:
	        orginal_size: Attribute value; anything ``float()`` accepts.

	    Returns:
	        The rounded size in millimetres.
	    """
	    size_in_px = float(orginal_size)
	    if size_in_px > MAX_PICTURE_SIZE:
	        size_in_px = MAX_PICTURE_SIZE
	    document_size = round(A4_PAGE_WIDHT - (DEFAULT_MARGINS * 2), 1)
	    return round((size_in_px * document_size) / MAX_PICTURE_SIZE)

	def should_use_width(img_node):
	    """Choose whether an image is sized by its width or by its height.

	    Each attribute is parsed on its own, so a usable ``height`` is still
	    honoured when ``width`` is missing or not an integer; an unusable
	    attribute counts as 0.

	    Args:
	        img_node: The ``<img>`` tag (any mapping of attribute values).

	    Returns:
	        False when the height is greater than the width, True otherwise.
	    """
	    def _dimension(attribute):
	        try:
	            return int(img_node[attribute])
	        except (KeyError, TypeError, ValueError):
	            return 0

	    return _dimension('height') <= _dimension('width')


	def process_table(document, table_node):
	    """Render an HTML ``<table>`` as a python-docx table.

	    The column count is the widest row measured in grid columns (``colspan``
	    included). A cell with ``colspan`` is merged with the cells it covers
	    and the next HTML cell is written after the merged region. ``rowspan``
	    is not handled. Rows and cells of nested tables are not treated as rows
	    of this table.

	    Args:
	        document: python-docx document receiving the table.
	        table_node: The ``<table>`` tag.
	    """
	    def _colspan(html_cell):
	        # A missing or malformed colspan counts as a single column.
	        try:
	            return max(1, int(html_cell.get('colspan', 1)))
	        except (TypeError, ValueError):
	            return 1

	    all_rows = [row for row in table_node.find_all("tr")
	                if row.find_parent("table") is table_node]
	    cells_by_row = [row.find_all(["th", "td"], recursive=False) for row in all_rows]
	    column_counts = [sum(_colspan(cell) for cell in cells) for cells in cells_by_row]

	    # An empty table has nothing to render (and max() would fail on []).
	    if not column_counts or max(column_counts) == 0:
	        return

	    table = document.add_table(rows=len(all_rows), cols=max(column_counts))

	    for docx_row, html_cells in zip(table.rows, cells_by_row):
	        # Captured before any merge in this row, so indexes stay grid-based.
	        row_cells = docx_row.cells

	        column = 0
	        for html_cell in html_cells:
	            span = _colspan(html_cell)
	            table_cell = row_cells[column]
	            process_table_cell(document, html_cell, table_cell, [], True)
	            if span > 1:
	                table_cell.merge(row_cells[column + span - 1])
	            # Skip the columns swallowed by the merge.
	            column += span

	def process_table_cell(document, node, cell, styles=None, skip_newlines=False):
	    """Write the content of an HTML cell (or a tag inside it) into ``cell``.

	    Text and math always go to the last paragraph of the cell. An image gets
	    a paragraph of its own followed by a new empty paragraph, which then
	    receives whatever comes after the image.

	    Args:
	        document: python-docx document being built.
	        node: The ``<td>``/``<th>`` tag, or a descendant during recursion.
	        cell: python-docx cell receiving the content.
	        styles: Inherited style keywords.
	        skip_newlines: When True, whitespace-only text directly under
	            ``node`` is skipped.
	    """
	    # still not considering new paragraphs inside table_cell
	    # https://github.com/python-openxml/python-docx/issues/216
	    styles = [] if styles is None else styles
	    if node.get('style') is not None:
	        styles = styles + process_style(node['style'])

	    if not cell.paragraphs:
	        cell.add_paragraph()

	    for child in node.children:
	        if isinstance(child, NavigableString):
	            if skip_newlines and (not child.strip()):
	                continue

	            # Looked up on every use: an image handled earlier (here or in
	            # a nested tag) may have appended paragraphs to the cell.
	            process_string(child, document, node, cell.paragraphs[-1], styles)
	        elif child.name == 'math':
	            process_math(child, document, cell.paragraphs[-1])
	        elif child.name == 'img':
	            run = cell.add_paragraph().add_run()
	            process_image(child, run, None, styles)
	            cell.add_paragraph()
	        else:
	            process_table_cell(document, child, cell, styles)

	def process_style(style_str=''):
	    """Extract the supported style keywords from an inline CSS string.

	    Matching is case-insensitive and tolerates any amount of whitespace
	    (including none) after the colon.

	    Args:
	        style_str: Value of a ``style`` attribute, or ``None``.

	    Returns:
	        A list containing, in this order and only when present: ``'small'``
	        (font-size in pt below 11), ``'bold'``, ``'italic'``,
	        ``'underline'`` and one of ``'left'``/``'right'``/``'center'``/
	        ``'justify'``.
	    """
	    import re
	    default_size = 11
	    styles = []

	    if not style_str:
	        return styles

	    def _search(pattern):
	        return re.search(pattern, style_str, re.IGNORECASE)

	    size_match = _search(r'font-size:\s*(\d+)pt')
	    if size_match and int(size_match.group(1)) < default_size:
	        styles.append('small')

	    # Alternations use a plain "|": an escaped "\|" would only match a
	    # literal pipe character and never a real CSS value.
	    if _search(r'font-weight:\s*(700|800|900|bold(er)?)'):
	        styles.append('bold')

	    if _search(r'font-style:\s*(oblique|italic)'):
	        styles.append('italic')

	    if _search(r'text-decoration:\s*underline'):
	        styles.append('underline')

	    align_match = _search(r'text-align:\s*(left|right|center|justify)')
	    if align_match:
	        styles.append(align_match.group(1).lower())

	    return styles


	def delete_paragraph(paragraph):
	    """Remove ``paragraph`` from its document and invalidate the proxy.

	    Args:
	        paragraph: python-docx paragraph to delete. A paragraph whose XML
	            element is already detached is only invalidated.
	    """
	    element = paragraph._element
	    container = element.getparent()
	    if container is not None:
	        container.remove(element)
	    # Clear the references on the paragraph object itself (not on the XML
	    # element) so later use of the stale proxy fails instead of editing a
	    # detached element.
	    paragraph._p = paragraph._element = None

	def remove_newlines(node):
	    """Drop the whitespace-only text nodes found directly under ``node``.

	    Args:
	        node: BeautifulSoup tag whose direct children are cleaned in place.
	    """
	    # Iterate over a snapshot: extract() removes the element from
	    # ``node.contents``, and mutating that list mid-iteration would skip
	    # the entry following every removed one.
	    for element in list(node.contents):
	        if isinstance(element, NavigableString) and element.strip() == '':
	            element.extract()

	def download_image(url, timeout=30):
	    """Fetch an image and return its bytes as an in-memory stream.

	    Args:
	        url: HTTP(S) address of the image, or a ``data:`` URI with the image
	            embedded (base64 or percent-encoded).
	        timeout: Seconds ``requests`` waits for the server before giving up,
	            so an unresponsive host cannot block forever.

	    Returns:
	        An ``io.BytesIO`` positioned at the start of the image data. The
	        HTTP status is not checked; callers see whatever body was returned.
	    """
	    import base64
	    from urllib.parse import unquote_to_bytes

	    if url.startswith('data:'):
	        # Embedded image: decode it locally, there is nothing to download.
	        header, _, payload = url.partition(',')
	        if header.endswith(';base64'):
	            return io.BytesIO(base64.b64decode(payload))
	        return io.BytesIO(unquote_to_bytes(payload))

	    response = requests.get(url, stream=True, timeout=timeout)
	    return io.BytesIO(response.content)

	def remove_table_border(table):
	    """Set the four borders of every cell of ``table`` to ``nil``.

	    Args:
	        table: python-docx table, modified in place.
	    """
	    # ``qn`` builds the namespace-qualified attribute name; it is imported
	    # here because the module does not import it at the top.
	    from docx.oxml.ns import qn
	    from docx.oxml.shared import OxmlElement  # Necessary Import

	    # https://github.com/python-openxml/python-docx/issues/433
	    tbl = table._tbl  # get xml element in table
	    for cell in tbl.iter_tcs():
	        # tcPr holds the cell properties, borders included; create it when
	        # the cell has none yet.
	        tcPr = cell.get_or_add_tcPr()
	        tcBorders = OxmlElement('w:tcBorders')

	        for edge in ('w:top', 'w:left', 'w:bottom', 'w:right'):
	            border = OxmlElement(edge)
	            border.set(qn('w:val'), 'nil')
	            tcBorders.append(border)

	        tcPr.append(tcBorders)