Skip to content

Instantly share code, notes, and snippets.

@phillipkent
Last active November 7, 2023 09:19
Show Gist options
  • Star 6 You must be signed in to star a gist
  • Fork 4 You must be signed in to fork a gist
  • Save phillipkent/fcd8276d3984089cddd2f72a52fd00eb to your computer and use it in GitHub Desktop.
Save phillipkent/fcd8276d3984089cddd2f72a52fd00eb to your computer and use it in GitHub Desktop.
Python code using the python-docx module to convert a DOCX file to another DOCX file
# Converts a docx file with tables and images to a new docx file
# The new file is based on a 'stub' document which contains preamble text and styles
#
# Requires the Python module 'python-docx' <https://python-docx.readthedocs.io>
# Written for Python 3
#
# Source documents are taken from the directory 'source' and converted documents are saved
# to the directory 'converted'
#
# Two types of source documents are handled: 'Fiscal Guide' or 'Economics Regime'. Each one
# has its own stub document and different conversion options.
# ** The stub documents are not included here! The code is offered as an example for adaptation
# ** to your own uses.
#
# Thanks to David Ssali for his code posted at https://medium.com/@dvdssali/docx-to-html-1374eb6491a1
#
import datetime
import os
import re
import xml.etree.ElementTree as ET
import zipfile
from copy import copy
from copy import deepcopy
from io import BytesIO
from os import listdir
from os.path import isfile, join

from docx import Document
from docx.document import Document as _Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.oxml.table import CT_Tbl
from docx.oxml.text.paragraph import CT_P
from docx.shared import RGBColor
from docx.table import _Cell, Table
from docx.text.paragraph import Paragraph


def get_docx_text(filepath):
    """
    Take the path of a docx file as argument, return the text in array of strings

    The file is opened as a zip archive and 'word/document.xml' is parsed
    directly. One string is returned per WordprocessingML paragraph
    element that contains any text; the text nodes of a paragraph are
    concatenated without a separator. Paragraphs without text are skipped.

    *filepath* may be anything ``zipfile.ZipFile`` accepts (a path or a
    binary file-like object).
    """
    WORD_NAMESPACE = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
    PARA = WORD_NAMESPACE + 'p'
    TEXT = WORD_NAMESPACE + 't'
    # The context manager closes the archive even if the read fails.
    with zipfile.ZipFile(filepath) as archive:
        xml_content = archive.read('word/document.xml')
    tree = ET.fromstring(xml_content)
    paragraphs = []
    # Element.iter() replaces getiterator(), which was removed in Python 3.9.
    for paragraph in tree.iter(PARA):
        texts = [node.text for node in paragraph.iter(TEXT) if node.text]
        if texts:
            paragraphs.append(''.join(texts))
    return paragraphs
def iter_block_items(parent):
    """
    Yield every paragraph and table that is a direct child of *parent*,
    in document order, wrapped as a Paragraph or Table object.

    *parent* is either a main Document object (its body is walked) or a
    _Cell object (the cell's own content is walked). Any other type
    raises ValueError. Child elements that are neither paragraphs nor
    tables are skipped.
    """
    if isinstance(parent, _Document):
        container = parent.element.body
    elif isinstance(parent, _Cell):
        container = parent._tc
    else:
        raise ValueError("ERROR: something not right")

    # (XML element class, proxy class) pairs, checked in this order
    wrappers = ((CT_P, Paragraph), (CT_Tbl, Table))
    for element in container.iterchildren():
        for element_type, wrapper in wrappers:
            if isinstance(element, element_type):
                yield wrapper(element, parent)
                break
def get_heading_type(block):
    """Return the name of the style applied to *block* (``block.style.name``)."""
    style = block.style
    return style.name
def get_image_Ids(paragraph):
    """
    Return the relationship ids of the images embedded in *paragraph*.

    The paragraph's XML (``paragraph._p.xml``) is parsed and every
    ``a:blip`` element found below a ``wp:inline`` or ``wp:anchor``
    drawing contributes the value of its ``r:embed`` attribute. Ids of
    inline drawings come first, followed by ids of anchored drawings.
    A blip without an ``r:embed`` attribute (e.g. one that only carries
    ``r:link``) is skipped instead of raising KeyError.
    """
    namespace = {
        'a': "http://schemas.openxmlformats.org/drawingml/2006/main",
        'r': "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
        'wp': "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing",
    }
    # Fully qualified name of the r:embed attribute in ElementTree notation
    embed_attr = '{{{0}}}embed'.format(namespace['r'])
    root = ET.fromstring(paragraph._p.xml)
    ids = []
    for container_tag in ('wp:inline', 'wp:anchor'):
        for drawing in root.findall('.//' + container_tag, namespace):
            for blip in drawing.findall('.//a:blip', namespace):
                embed_id = blip.get(embed_attr)
                if embed_id is not None:
                    ids.append(embed_id)
    return ids
def _replace_placeholder(doc, placeholder, new_text):
    """Set the text of the first paragraph of *doc* whose text equals *placeholder*.

    Raises IndexError (the exception the original list-indexing lookup
    produced) when no such paragraph exists.
    """
    for paragraph in doc.paragraphs:
        if paragraph.text == placeholder:
            paragraph.text = new_text
            return
    raise IndexError("placeholder paragraph {0!r} not found in stub document".format(placeholder))


def _copy_runs(source_runs, target_paragraph, copy_style=False):
    """Append a copy of each run in *source_runs* to *target_paragraph*.

    Text, bold, italic, underline and the RGB font color are copied; the
    run's character style is copied as well when *copy_style* is true.
    """
    for run in source_runs:
        if copy_style:
            new_run = target_paragraph.add_run(run.text, style=run.style)
        else:
            new_run = target_paragraph.add_run(run.text)
        new_run.bold = run.bold
        new_run.italic = run.italic
        new_run.underline = run.underline
        new_run.font.color.rgb = run.font.color.rgb


def _inline_shape_sizes(source_doc):
    """Map each embedded-image relationship id to the (width, height) of its inline shape.

    When several inline shapes share an id, the first one wins.
    """
    sizes = {}
    for shape in source_doc.inline_shapes:
        try:
            embed_id = shape._inline.graphic.graphicData.pic.blipFill.blip.embed
        except AttributeError:
            # Presumably an inline shape that is not a picture (no pic/blip
            # element); it has no image id to record.
            continue
        sizes.setdefault(embed_id, (shape.width, shape.height))
    return sizes


def _apply_source_size(picture, image_sizes, image_id):
    """Give *picture* the size recorded for *image_id*, if one is known.

    Anchored (floating) images have no entry in *image_sizes*; they keep
    the size python-docx assigned when the picture was added.
    """
    size = image_sizes.get(image_id)
    if size is not None:
        picture.width, picture.height = size


def _append_table_to_body(doc, table):
    """Move *table* to the end of the body of *doc*.

    The table element is placed before a trailing ``w:sectPr`` element
    when the body has one, otherwise it becomes the last body child.
    """
    body = doc.element.body
    last_child = body[-1] if len(body) else None
    if last_child is not None and last_child.tag.endswith('}sectPr'):
        last_child.addprevious(table._tbl)
    else:
        body.append(table._tbl)


def _copy_body_paragraph(doc, source_doc, block, image_sizes):
    """Append a body paragraph of the source document to *doc*.

    A paragraph that contains images is converted to one centered picture
    paragraph per image (its text is not copied); any other paragraph is
    copied run by run.
    """
    image_ids = get_image_Ids(block)
    if image_ids:
        for image_id in image_ids:
            image_part = source_doc.part.related_parts[image_id]
            picture = doc.add_picture(BytesIO(image_part.blob))
            _apply_source_size(picture, image_sizes, image_id)
            # add_picture() put the image in a new last paragraph: center it
            doc.paragraphs[-1].alignment = WD_ALIGN_PARAGRAPH.CENTER
        return

    # 'List Paragraph' styled paragraphs become bulleted paragraphs
    if re.match(r"List\sParagraph", get_heading_type(block)):
        new_paragraph = doc.add_paragraph('', style='List Bullet')
    else:
        new_paragraph = doc.add_paragraph('', style=block.style)
    new_paragraph.alignment = block.alignment
    _copy_runs(block.runs, new_paragraph, copy_style=True)


def _rebuild_three_column_table(doc, source_doc, block, image_sizes):
    """Append a newly built copy of the 3-column table *block* to *doc*.

    Columns 1 and 2 receive the runs of the first paragraph of each source
    cell. Column 3 is copied block by block, so that images in it are
    re-inserted as pictures.
    (Note: funny behavior happens when trying to modify or add images
    within a copied table, so the approach here is to build a new table.)
    """
    new_table = doc.add_table(rows=len(block.rows), cols=len(block.columns))
    new_table.style = doc.styles['Table Grid']

    # Columns 1 and 2: text only
    for column_index in (0, 1):
        target_cells = new_table.columns[column_index].cells
        for cell_index, cell in enumerate(block.columns[column_index].cells):
            _copy_runs(cell.paragraphs[0].runs, target_cells[cell_index].paragraphs[0])

    # Column 3: text and images
    target_cells = new_table.columns[2].cells
    for cell_index, cell in enumerate(block.columns[2].cells):
        target_cell = target_cells[cell_index]
        first_para = True
        # NOTE(review): a table nested inside the cell is yielded here as a
        # Table, which get_image_Ids() does not handle -- confirm that the
        # source documents never contain nested tables.
        for cblock in iter_block_items(cell):
            image_ids = get_image_Ids(cblock)
            if image_ids:
                for image_id in image_ids:
                    image_part = source_doc.part.related_parts[image_id]
                    picture_paragraph = target_cell.add_paragraph()
                    picture_paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
                    picture = picture_paragraph.add_run().add_picture(BytesIO(image_part.blob))
                    _apply_source_size(picture, image_sizes, image_id)
            else:
                # The first text paragraph reuses the paragraph a new cell
                # already contains; later ones are appended.
                if first_para:
                    new_paragraph = target_cell.paragraphs[0]
                    first_para = False
                else:
                    new_paragraph = target_cell.add_paragraph()
                new_paragraph.alignment = cblock.alignment
                _copy_runs(cblock.runs, new_paragraph)


def convertPRL(sourceDocx, stubDocx):
    """
    Convert './source/<sourceDocx>' and save the result as './converted/<sourceDocx>'.

    The converted document starts from the stub document *stubDocx*
    (preamble content and styles). Its core properties are reset, the
    title-page placeholders 'TITLE_REGIME' and 'SUB_REGIME_TYPE' (plus
    'SUB_VERSION' for an 'EconomicsRegime' stub) are filled from the text
    of the source document, and every paragraph and table that follows the
    start heading in the source is appended.

    The start heading is 'Economics Regime' when *stubDocx* contains
    'EconomicsRegime' and 'Fiscal Terms' when it contains 'FiscalGuide'.

    Raises ValueError if *stubDocx* is neither kind of stub, and IndexError
    if a placeholder paragraph is missing from the stub or the source has
    too few text paragraphs.
    """
    is_economics = 'EconomicsRegime' in stubDocx
    if is_economics:
        startHeading = 'Economics Regime'
    elif 'FiscalGuide' in stubDocx:
        startHeading = 'Fiscal Terms'
    else:
        raise ValueError(
            "stub document name must contain 'EconomicsRegime' or 'FiscalGuide': {0!r}".format(stubDocx))

    # The converted document begins using preamble content and styles from the 'stub' document
    doc = Document(stubDocx)
    source_doc = Document('./source/' + sourceDocx)

    # Set the new Core Properties
    properties = doc.core_properties
    properties.author = 'Documentation team'
    properties.last_modified_by = 'Automatic conversion script'
    properties.created = source_doc.core_properties.created
    properties.revision = 1 + source_doc.core_properties.revision
    # Naive UTC timestamp, same value utcnow() gave (utcnow is deprecated)
    properties.modified = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)

    # Title page: replace the placeholders with text taken from source_doc
    # (SUB_VERSION does not exist in the FiscalGuide stub)
    sourceDocText = get_docx_text('./source/' + sourceDocx)
    _replace_placeholder(doc, 'TITLE_REGIME', sourceDocText[1])
    _replace_placeholder(doc, 'SUB_REGIME_TYPE', sourceDocText[2])
    if is_economics:
        _replace_placeholder(doc, 'SUB_VERSION', sourceDocText[5])

    # Sizes of the source pictures, looked up once instead of per image
    image_sizes = _inline_shape_sizes(source_doc)

    # Add the contents of source doc to the new doc.
    # started is False until after startHeading is found in source_doc
    started = False
    for block in iter_block_items(source_doc):
        if not started:
            if isinstance(block, Paragraph) and block.text == startHeading:
                started = True
            continue

        if isinstance(block, Paragraph):
            _copy_body_paragraph(doc, source_doc, block, image_sizes)
        elif isinstance(block, Table):
            # For Economics documents: the 3-column tables with images need special handling
            if is_economics and len(block.columns) == 3:
                _rebuild_three_column_table(doc, source_doc, block, image_sizes)
            else:
                # FOR ALL OTHER TABLES: move the table itself to the end of
                # doc, so blocks keep their source order.
                _append_table_to_body(doc, block)
                block.style = doc.styles['Table Grid']

    doc.save('./converted/' + sourceDocx)
if __name__ == '__main__':
    sourceDir = './source/'
    # convertPRL() saves into './converted/'; create it if it does not exist yet
    os.makedirs('./converted/', exist_ok=True)
    # Batch convert all the docx files in directory 'source'. Other files
    # and Word's '~$' lock files are skipped so they do not abort the batch.
    sourceFilelist = [
        f for f in listdir(sourceDir)
        if isfile(join(sourceDir, f))
        and f.lower().endswith('.docx')
        and not f.startswith('~$')
    ]
    for source_name in sourceFilelist:
        # The file name decides which stub (and conversion options) to use
        if 'Economics' in source_name:
            convertPRL(source_name, 'STUB-EconomicsRegime.docx')
        else:
            convertPRL(source_name, 'STUB-FiscalGuide.docx')
        print('Converted: {0}'.format(source_name))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment