phillipkent/convert-docx-to-html.py

## convert-docx-to-html.py
# Converts a docx file with tables and images to (simple) HTML
# ORIGINAL CODE BY DAVID SSALI AT SOURCE: https://medium.com/@dvdssali/docx-to-html-1374eb6491a1
#
# Requires the Python module 'python-docx' <https://python-docx.readthedocs.io>
#
# Example use:
# >>> s = convert('./SOURCEDIR', 'SAMPLE.docx')
# >>> print(s)
# or
# $ python .\convert-docx-to-html.py > temp.html

from docx import Document
from docx.document import Document as _Document
from docx.oxml.text.paragraph import CT_P
from docx.oxml.table import CT_Tbl
from docx.table import _Cell, Table
from docx.text.paragraph import Paragraph
import xml.etree.ElementTree as ET
import time
import re
import os

def convert(dir_path,file):
    ##dir_path = "./SOURCEDIR"
    ##file = "SAMPLE.docx"
	# image_path will be a directory to store image files; if the directory does not exist it will be created
	splitname = re.split('[. ]', file)
	if len(splitname) > 2:
		image_path=  ''.join(splitname[:2]) + '-images'
	else:
		image_path= splitname[0] + '-images'
	if not os.path.isdir(image_path):
		os.mkdir(image_path)
	document = Document(dir_path + '/' + file)
	body = ""
	list_items = []

	for block in iter_block_items(document):
		if isinstance(block, Paragraph):
			tmp_heading_type = get_heading_type(block)
			if re.match("List\sParagraph",tmp_heading_type):
				list_items.append("<li>" + block.text + "</li>")
			else:
				images = render_image(document,block,image_path,image_path)
				if len(list_items) > 0:
					body += render_list_items(list_items)
					list_items = []
				if len(images) > 0:
					body = body + images
				else:
					# modified to use a different outer_tag if a 'Heading' style is found in the original paragraph
					if 'Heading' in tmp_heading_type:
					    outer_tag = 'h'+tmp_heading_type.split(' ')[-1]
					else:
						outer_tag = 'p'
					body = body + render_runs(block.runs, outer_tag)
		elif isinstance(block, Table):
			body += render_table(block,document,image_path)

	return body

def iter_block_items(parent):
    """
    Generate a reference to each paragraph and table child within *parent*,
    in document order. Each returned value is an instance of either Table or
    Paragraph. *parent* would most commonly be a reference to a main
    Document object, but also works for a _Cell object, which itself can
    contain paragraphs and tables.
    """
    if isinstance(parent, _Document):
        parent_elm = parent.element.body
    elif isinstance(parent, _Cell):
        parent_elm = parent._tc
    else:
        raise ValueError("something's not right")
    for child in parent_elm.iterchildren():
        if isinstance(child, CT_P):
            yield Paragraph(child, parent)
        elif isinstance(child, CT_Tbl):
            yield Table(child, parent)

def table_print(block):
    table = block
    for row in table.rows:
        for cell in row.cells:
            for paragraph in cell.paragraphs:
                print(paragraph.text,'  ',end='')
        print("\n")

# Modified to treat cell content as a set of blocks to process
def render_table(block,document,image_path):
	table = block
	html = "<table class='table table-bordered' border='1'>"
	for row in table.rows:
		html += "<tr>"
		for cell in row.cells:
			html += "<td>"
			# ---
			cbody = ""
			clist_items = []
			for cblock in iter_block_items(cell):
				if isinstance(cblock, Paragraph):
					tmp_heading_type = get_heading_type(cblock)
					if re.match("List\sParagraph",tmp_heading_type):
						clist_items.append("<li>" + block.text + "</li>")
					else:
						images = render_image(document,cblock,image_path,image_path)
						if len(clist_items) > 0:
							cbody += render_list_items(clist_items)
							clist_items = []
						if len(images) > 0:
							cbody = cbody + images
						else:
							cbody = cbody + render_runs(cblock.runs)
				elif isinstance(cblock, Table):
					cbody += render_table(cblock)
			html += cbody + " "
			html += "</td>"
		html += "</tr>"
	html += "</table>"
	return html

# Modified to use a different outer_tag if a 'Heading' style is found in the original paragraph
def render_runs(runs, outer_tag='p'):
    html = "<" + outer_tag + ">"
    for run in runs:
        html = html + run.text
    html += "</" + outer_tag + ">"
    return html

def render_list_items(items):
    html = "<ul>"
    for item in items:
        html += item
    html += "</ul>"
    return html

def get_heading_type(block):
    return block.style.name

def render_image(document,par,dir_path,book_id):
    """get all of the images in a paragraph
    :param par: a paragraph object from docx
    :return: a list of r:embed
    """
    ids = []
    root = ET.fromstring(par._p.xml)
    namespace = {
             'a':"http://schemas.openxmlformats.org/drawingml/2006/main", \
             'r':"http://schemas.openxmlformats.org/officeDocument/2006/relationships", \
             'wp':"http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing"}
    inlines = root.findall('.//wp:inline',namespace)
    for inline in inlines:
        imgs = inline.findall('.//a:blip', namespace)
        for img in imgs:
            id = img.attrib['{{{0}}}embed'.format(namespace['r'])]
            ids.append(id)
    inlines = root.findall('.//wp:anchor',namespace)
    for inline in inlines:
        imgs = inline.findall('.//a:blip', namespace)
        for img in imgs:
            id = img.attrib['{{{0}}}embed'.format(namespace['r'])]
            ids.append(id)
    response = ""
    if len(ids) > 0:
        for id in ids:
            image_part = document.part.related_parts[id]
            millis = int(round(time.time() * 1000))
            file_name = str(id) + "-" + str(book_id) + "-" + str(millis) + ".png"
            fr = open(dir_path + "/" + file_name, "wb")
            fr.write(image_part._blob)
            fr.close()
            response += "<img border='1' src='" + dir_path + "/" + file_name + "' class='img-responsive'/>"
    return response

if __name__ == '__main__':
	s = convert('./SOURCEDIR', 'SAMPLE.docx')
	print(s)
	# Converts a docx file with tables and images to (simple) HTML
	# ORIGINAL CODE BY DAVID SSALI AT SOURCE: https://medium.com/@dvdssali/docx-to-html-1374eb6491a1
	#
	# Requires the Python module 'python-docx' <https://python-docx.readthedocs.io>
	#
	# Example use:
	# >>> s = convert('./SOURCEDIR', 'SAMPLE.docx')
	# >>> print(s)
	# or
	# $ python .\convert-docx-to-html.py > temp.html

	from docx import Document
	from docx.document import Document as _Document
	from docx.oxml.text.paragraph import CT_P
	from docx.oxml.table import CT_Tbl
	from docx.table import _Cell, Table
	from docx.text.paragraph import Paragraph
	import xml.etree.ElementTree as ET
	import time
	import re
	import os

	def convert(dir_path,file):
	##dir_path = "./SOURCEDIR"
	##file = "SAMPLE.docx"
	# image_path will be a directory to store image files; if the directory does not exist it will be created
	splitname = re.split('[. ]', file)
	if len(splitname) > 2:
	image_path= ''.join(splitname[:2]) + '-images'
	else:
	image_path= splitname[0] + '-images'
	if not os.path.isdir(image_path):
	os.mkdir(image_path)
	document = Document(dir_path + '/' + file)
	body = ""
	list_items = []

	for block in iter_block_items(document):
	if isinstance(block, Paragraph):
	tmp_heading_type = get_heading_type(block)
	if re.match("List\sParagraph",tmp_heading_type):
	list_items.append("<li>" + block.text + "</li>")
	else:
	images = render_image(document,block,image_path,image_path)
	if len(list_items) > 0:
	body += render_list_items(list_items)
	list_items = []
	if len(images) > 0:
	body = body + images
	else:
	# modified to use a different outer_tag if a 'Heading' style is found in the original paragraph
	if 'Heading' in tmp_heading_type:
	outer_tag = 'h'+tmp_heading_type.split(' ')[-1]
	else:
	outer_tag = 'p'
	body = body + render_runs(block.runs, outer_tag)
	elif isinstance(block, Table):
	body += render_table(block,document,image_path)

	return body

	def iter_block_items(parent):
	"""
	Generate a reference to each paragraph and table child within parent,
	in document order. Each returned value is an instance of either Table or
	Paragraph. parent would most commonly be a reference to a main
	Document object, but also works for a _Cell object, which itself can
	contain paragraphs and tables.
	"""
	if isinstance(parent, _Document):
	parent_elm = parent.element.body
	elif isinstance(parent, _Cell):
	parent_elm = parent._tc
	else:
	raise ValueError("something's not right")
	for child in parent_elm.iterchildren():
	if isinstance(child, CT_P):
	yield Paragraph(child, parent)
	elif isinstance(child, CT_Tbl):
	yield Table(child, parent)

	def table_print(block):
	table = block
	for row in table.rows:
	for cell in row.cells:
	for paragraph in cell.paragraphs:
	print(paragraph.text,' ',end='')
	print("\n")

	# Modified to treat cell content as a set of blocks to process
	def render_table(block,document,image_path):
	table = block
	html = "<table class='table table-bordered' border='1'>"
	for row in table.rows:
	html += "<tr>"
	for cell in row.cells:
	html += "<td>"
	# ---
	cbody = ""
	clist_items = []
	for cblock in iter_block_items(cell):
	if isinstance(cblock, Paragraph):
	tmp_heading_type = get_heading_type(cblock)
	if re.match("List\sParagraph",tmp_heading_type):
	clist_items.append("<li>" + block.text + "</li>")
	else:
	images = render_image(document,cblock,image_path,image_path)
	if len(clist_items) > 0:
	cbody += render_list_items(clist_items)
	clist_items = []
	if len(images) > 0:
	cbody = cbody + images
	else:
	cbody = cbody + render_runs(cblock.runs)
	elif isinstance(cblock, Table):
	cbody += render_table(cblock)
	html += cbody + " "
	html += "</td>"
	html += "</tr>"
	html += "</table>"
	return html

	# Modified to use a different outer_tag if a 'Heading' style is found in the original paragraph
	def render_runs(runs, outer_tag='p'):
	html = "<" + outer_tag + ">"
	for run in runs:
	html = html + run.text
	html += "</" + outer_tag + ">"
	return html

	def render_list_items(items):
	html = "<ul>"
	for item in items:
	html += item
	html += "</ul>"
	return html

	def get_heading_type(block):
	return block.style.name

	def render_image(document,par,dir_path,book_id):
	"""get all of the images in a paragraph
	:param par: a paragraph object from docx
	:return: a list of r:embed
	"""
	ids = []
	root = ET.fromstring(par._p.xml)
	namespace = {
	'a':"http://schemas.openxmlformats.org/drawingml/2006/main", \
	'r':"http://schemas.openxmlformats.org/officeDocument/2006/relationships", \
	'wp':"http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing"}
	inlines = root.findall('.//wp:inline',namespace)
	for inline in inlines:
	imgs = inline.findall('.//a:blip', namespace)
	for img in imgs:
	id = img.attrib['{{{0}}}embed'.format(namespace['r'])]
	ids.append(id)
	inlines = root.findall('.//wp:anchor',namespace)
	for inline in inlines:
	imgs = inline.findall('.//a:blip', namespace)
	for img in imgs:
	id = img.attrib['{{{0}}}embed'.format(namespace['r'])]
	ids.append(id)
	response = ""
	if len(ids) > 0:
	for id in ids:
	image_part = document.part.related_parts[id]
	millis = int(round(time.time() * 1000))
	file_name = str(id) + "-" + str(book_id) + "-" + str(millis) + ".png"
	fr = open(dir_path + "/" + file_name, "wb")
	fr.write(image_part._blob)
	fr.close()
	response += "<img border='1' src='" + dir_path + "/" + file_name + "' class='img-responsive'/>"
	return response

	if __name__ == '__main__':
	s = convert('./SOURCEDIR', 'SAMPLE.docx')
	print(s)