# phillipkent/convert-docx.py

## convert-docx.py
# Converts a docx file with tables and images to a new docx file
# The new file is based on a 'stub' document which contains preamble text and styles
#
# Requires the Python module 'python-docx' <https://python-docx.readthedocs.io>
# Written for Python 3
#
# Source documents are taken from the directory 'source' and converted documents are saved
# to the directory 'converted'
#
# Two types of source documents are handled: 'Fiscal Guide' or 'Economics Regime'. Each one
# has its own stub document and different conversion options.
# ** The stub documents are not included here! The code is offered as an example for adaptation
# ** to your own uses.
#
# Thanks to David Ssali for his code posted at https://medium.com/@dvdssali/docx-to-html-1374eb6491a1
#

from docx import Document
from docx.document import Document as _Document
from docx.oxml.text.paragraph import CT_P
from docx.oxml.table import CT_Tbl
from docx.table import _Cell, Table
from docx.text.paragraph import Paragraph
from docx.shared import RGBColor
from docx.enum.text import WD_ALIGN_PARAGRAPH
import xml.etree.ElementTree as ET
from io import BytesIO
from copy import copy
from copy import deepcopy
import datetime
from os import listdir
from os.path import isfile, join
import re

def get_docx_text(filepath):
    """
    Take the path of a docx file as argument, return the text in array of strings

    The file is opened as a zip archive and 'word/document.xml' is parsed
    directly. One string is returned per <w:p> element that contains any
    text, formed by joining the text of all its <w:t> descendants; paragraphs
    without text are omitted.

    :param filepath: path of the .docx file to read
    :return: list of paragraph strings, in document order
    """
    # Imported here because zipfile is not among the module-level imports.
    import zipfile

    WORD_NAMESPACE = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
    PARA = WORD_NAMESPACE + 'p'
    TEXT = WORD_NAMESPACE + 't'

    # The context manager closes the archive even when reading fails.
    with zipfile.ZipFile(filepath) as archive:
        xml_content = archive.read('word/document.xml')
    tree = ET.fromstring(xml_content)

    paragraphs = []
    # Element.iter() replaces getiterator(), which was removed in Python 3.9.
    for paragraph in tree.iter(PARA):
        texts = [node.text for node in paragraph.iter(TEXT) if node.text]
        if texts:
            paragraphs.append(''.join(texts))

    return paragraphs

def iter_block_items(parent):
    """
    Yield each paragraph and table that is a direct child of *parent*, in
    document order, wrapped as a Paragraph or Table object respectively.

    *parent* must be either a main Document object or a _Cell object (a cell
    can itself contain paragraphs and tables). Any other type causes a
    ValueError once the generator is first advanced. Child elements that are
    neither paragraphs nor tables are passed over.
    """
    if isinstance(parent, _Document):
        container = parent.element.body
    elif isinstance(parent, _Cell):
        container = parent._tc
    else:
        raise ValueError("ERROR: something not right")

    # XML element class -> proxy class used to wrap it
    proxy_for = ((CT_P, Paragraph), (CT_Tbl, Table))
    for element in container.iterchildren():
        for element_type, proxy_class in proxy_for:
            if isinstance(element, element_type):
                yield proxy_class(element, parent)
                break

def get_heading_type(block):
    """Return the name of the style applied to *block* (e.g. a paragraph)."""
    style = block.style
    return style.name

def get_image_Ids(paragraph):
    """
    Return the relationship ids of the images embedded in *paragraph*.

    The paragraph's XML (``paragraph._p.xml``) is searched for <a:blip>
    elements, first inside every inline drawing (<wp:inline>) and then inside
    every anchored drawing (<wp:anchor>); the ids are returned in that order.

    A blip without an ``r:embed`` attribute (for example a linked picture,
    which carries ``r:link`` instead) is skipped rather than raising
    KeyError, since it has no embedded image to refer to.

    :param paragraph: object exposing the paragraph XML as ``_p.xml``
    :return: list of relationship id strings (empty if there are no images)
    """
    namespace = {
        'a': "http://schemas.openxmlformats.org/drawingml/2006/main",
        'r': "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
        'wp': "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing",
    }
    embed_attr = '{{{0}}}embed'.format(namespace['r'])
    root = ET.fromstring(paragraph._p.xml)

    ids = []
    for drawing_tag in ('wp:inline', 'wp:anchor'):
        for drawing in root.findall('.//' + drawing_tag, namespace):
            for blip in drawing.findall('.//a:blip', namespace):
                rel_id = blip.get(embed_attr)
                if rel_id is not None:
                    ids.append(rel_id)
    return ids

def _set_placeholder_text(doc, placeholder, text):
    """Set *text* on the first paragraph of *doc* whose text equals *placeholder*.

    Raises IndexError if no paragraph holds exactly the placeholder text.
    """
    for paragraph in doc.paragraphs:
        if paragraph.text == placeholder:
            paragraph.text = text
            return
    raise IndexError(
        "placeholder {0!r} not found in stub document".format(placeholder))


def _copy_runs(source_paragraph, target_paragraph, copy_style=False):
    """Append a copy of every run of *source_paragraph* to *target_paragraph*.

    Bold, italic, underline and font colour are carried over; the run style
    is carried over only when *copy_style* is true.
    """
    for run in source_paragraph.runs:
        if copy_style:
            new_run = target_paragraph.add_run(run.text, style=run.style)
        else:
            new_run = target_paragraph.add_run(run.text)
        new_run.bold = run.bold
        new_run.italic = run.italic
        new_run.underline = run.underline
        new_run.font.color.rgb = run.font.color.rgb


def _inline_picture_size(source_doc, rel_id):
    """Return (width, height) of the inline picture embedding *rel_id*, or None.

    None is returned when no inline shape of *source_doc* refers to *rel_id*,
    which is the case for anchored (floating) images.
    """
    for shape in source_doc.inline_shapes:
        pic = shape._inline.graphic.graphicData.pic
        if pic is None:
            # NOTE(review): presumably an inline shape that is not a picture
            # (e.g. a chart); it cannot match an image id.
            continue
        blip = pic.blipFill.blip
        if blip is not None and blip.embed == rel_id:
            return shape.width, shape.height
    return None


def _copy_picture(source_doc, rel_id, add_picture):
    """Re-create the source image *rel_id* by calling *add_picture*.

    *add_picture* is a bound ``add_picture`` method (of the target document
    or of a run) accepting an image stream. The new picture is given the
    size of the matching inline shape in *source_doc*; when there is none it
    keeps the size python-docx assigns by default.
    """
    image_part = source_doc.part.related_parts[rel_id]
    new_shape = add_picture(BytesIO(image_part.blob))
    size = _inline_picture_size(source_doc, rel_id)
    if size is not None:
        new_shape.width, new_shape.height = size
    return new_shape


def _append_table_element(doc, tbl):
    """Move the table element *tbl* to the end of the body content of *doc*.

    The element is placed after the last paragraph or table in the body, so
    it stays in document order even when the previous block was a table, and
    any trailing non-block element (such as section properties) stays last.
    """
    body = doc.element.body
    blocks = [child for child in body.iterchildren()
              if isinstance(child, (CT_P, CT_Tbl))]
    if blocks:
        blocks[-1].addnext(tbl)
    else:
        body.insert(0, tbl)


def _convert_paragraph(doc, source_doc, block):
    """Append the source paragraph *block* to *doc* as pictures or as text.

    A paragraph containing images is converted to one centred picture per
    image (its text, if any, is not copied); any other paragraph is copied
    run by run.
    """
    image_ids = get_image_Ids(block)
    if image_ids:
        for rel_id in image_ids:
            _copy_picture(source_doc, rel_id, doc.add_picture)
            # the image will be centered
            doc.paragraphs[-1].alignment = WD_ALIGN_PARAGRAPH.CENTER
        return

    # check if this is a bulleted paragraph
    if re.match(r"List\sParagraph", get_heading_type(block)):
        p = doc.add_paragraph('', style='List Bullet')
    else:
        p = doc.add_paragraph('', style=block.style)
        p.alignment = block.alignment
    _copy_runs(block, p, copy_style=True)


def _rebuild_three_column_table(doc, source_doc, block):
    """Append to *doc* a new 'Table Grid' table rebuilt from the table *block*.

    Columns 1 and 2 receive the runs of the first paragraph of each source
    cell. Column 3 is copied block by block: paragraphs with images become
    centred pictures, other paragraphs are copied as text.
    (Note: funny behavior happens when trying to modify or add images within
    a copied table, so the approach here is to build a new table.)
    """
    new_table = doc.add_table(rows=len(block.rows), cols=len(block.columns))
    new_table.style = doc.styles['Table Grid']

    # Columns 1 and 2: text only
    for col_index in (0, 1):
        target_cells = new_table.columns[col_index].cells
        for cell_index, cell in enumerate(block.columns[col_index].cells):
            _copy_runs(cell.paragraphs[0],
                       target_cells[cell_index].paragraphs[0])

    # Column 3: text and images
    target_cells = new_table.columns[2].cells
    for cell_index, cell in enumerate(block.columns[2].cells):
        target_cell = target_cells[cell_index]
        first_para = True
        for cblock in iter_block_items(cell):
            cblock_image_ids = get_image_Ids(cblock)
            if cblock_image_ids:
                for c_id in cblock_image_ids:
                    c_para = target_cell.add_paragraph()
                    c_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
                    _copy_picture(source_doc, c_id,
                                  c_para.add_run().add_picture)
            else:
                # The first text paragraph reuses the paragraph every new
                # cell already contains; later ones are appended.
                if first_para:
                    p = target_cell.paragraphs[0]
                    first_para = False
                else:
                    p = target_cell.add_paragraph()
                p.alignment = cblock.alignment
                _copy_runs(cblock, p)


def convertPRL(sourceDocx, stubDocx):
    """Convert './source/<sourceDocx>' into './converted/<sourceDocx>'.

    The converted document starts from the stub document *stubDocx* (preamble
    content and styles), has its title-page placeholders filled from the
    source text, and then receives every paragraph, image and table of the
    source that follows the start heading ('Economics Regime' or
    'Fiscal Terms', depending on the stub).

    :param sourceDocx: file name of the source document inside './source/'
    :param stubDocx: path of the stub document; its name must contain
        'EconomicsRegime' or 'FiscalGuide'
    :raises ValueError: if *stubDocx* names neither kind of stub
    :raises IndexError: if a title placeholder is missing from the stub or
        the source has too few text paragraphs for the title page
    """
    is_economics = 'EconomicsRegime' in stubDocx
    if is_economics:
        startHeading = 'Economics Regime'
    elif 'FiscalGuide' in stubDocx:
        startHeading = 'Fiscal Terms'
    else:
        # Fail before any work is done instead of hitting an unbound
        # startHeading in the middle of the conversion.
        raise ValueError(
            "unrecognised stub document {0!r}: expected a name containing "
            "'EconomicsRegime' or 'FiscalGuide'".format(stubDocx))

    # The converted document begins using preamble content and styles from the 'stub' document
    doc = Document(stubDocx)
    source_doc = Document('./source/' + sourceDocx)

    # Set the new Core Properties
    props = doc.core_properties
    source_props = source_doc.core_properties
    props.author = 'Documentation  team'
    props.last_modified_by = 'Automatic conversion script'
    props.created = source_props.created
    props.revision = 1 + source_props.revision
    # Naive UTC timestamp, same value utcnow() gave, without the deprecated call.
    props.modified = datetime.datetime.now(
        datetime.timezone.utc).replace(tzinfo=None)

    # Title page
    #  Replace 'TITLE_REGIME' from source_doc
    #  Replace 'SUB_REGIME_TYPE' from source_doc
    #  Replace 'SUB_VERSION' from source_doc (not for FiscalGuide stub)
    sourceDocText = get_docx_text('./source/' + sourceDocx)
    _set_placeholder_text(doc, 'TITLE_REGIME', sourceDocText[1])
    _set_placeholder_text(doc, 'SUB_REGIME_TYPE', sourceDocText[2])
    if is_economics:
        _set_placeholder_text(doc, 'SUB_VERSION', sourceDocText[5])

    # Add the contents of source doc to the new doc
    started = False
    # started is False until after startHeading is found in source_doc
    for block in iter_block_items(source_doc):
        if not started:
            if isinstance(block, Paragraph) and block.text == startHeading:
                started = True
            continue

        if isinstance(block, Paragraph):
            _convert_paragraph(doc, source_doc, block)
        elif isinstance(block, Table):
            # For Economics documents: the 3-column tables with images need
            # special handling (images are only found in column 3)
            if is_economics and len(block.columns) == 3:
                _rebuild_three_column_table(doc, source_doc, block)
            else:
                # FOR ALL OTHER TABLES: move the source table itself to the
                # end of the new document
                _append_table_element(doc, block._tbl)
                block.style = doc.styles['Table Grid']

    doc.save('./converted/' + sourceDocx)

if __name__ == '__main__':
    sourceDir = './source/'

    # Batch convert all the Word documents in directory 'source'.
    # Entries that are not .docx files, and Word's '~$' lock files, are
    # skipped: they cannot be opened as documents and a single one would
    # otherwise abort the whole batch.
    sourceFilelist = [
        f for f in listdir(sourceDir)
        if isfile(join(sourceDir, f))
        and f.lower().endswith('.docx')
        and not f.startswith('~$')
    ]
    for file in sourceFilelist:
        # The stub (and with it the conversion options) is chosen from the
        # source file name.
        if 'Economics' in file:
            convertPRL(file, 'STUB-EconomicsRegime.docx')
        else:
            convertPRL(file, 'STUB-FiscalGuide.docx')
        print('Converted: {0}'.format(file))