Skip to content

Instantly share code, notes, and snippets.

@ishahid
Last active August 29, 2015 13:56
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ishahid/9245232 to your computer and use it in GitHub Desktop.
Save ishahid/9245232 to your computer and use it in GitHub Desktop.
Utility to replace variables enclosed in square brackets (e.g. [variable]) with a given value in Microsoft Word .docx files. Based on the following blog post: http://virantha.com/2013/08/16/reading-and-writing-microsoft-word-docx-files-with-python/
import os, re, zipfile, shutil, tempfile
from lxml import etree
class docx():
    """Fill ``[variable]`` placeholders in a Microsoft Word ``.docx`` file.

    The constructor reads ``word/document.xml`` from the archive, parses it
    with ``lxml.etree`` and merges placeholders that are split across several
    ``w:t`` text elements back into a single element, so that ``replace`` can
    match them with a plain text search.

    Attributes set by the constructor:
        filename: path of the source ``.docx`` file.
        xml_content: raw bytes of ``word/document.xml``.
        xml_tree: parsed (and placeholder-merged) document element.
        zipfile: the ``ZipFile`` used to read the source; it is already
            closed, only its directory listing is still usable.
    """

    # Namespace URI used to build the qualified names of ``w:`` elements.
    WORD_SCHEMA = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
    # Archive member that holds the document body edited by this class.
    DOCUMENT_PART = 'word/document.xml'

    def __init__(self, docx_filename):
        """Load *docx_filename* and prepare its placeholders for replacement.

        Raises whatever ``zipfile`` raises for an unreadable archive, and
        ``KeyError`` when the archive has no ``word/document.xml`` member.
        """
        self.filename = docx_filename
        # Let ZipFile open the path itself: it uses binary mode and the
        # context manager closes the handle even if reading fails.
        with zipfile.ZipFile(self.filename) as archive:
            self.xml_content = archive.read(self.DOCUMENT_PART)
        self.zipfile = archive
        self.xml_tree = self._get_xml_tree(self.xml_content)
        self._join_tags(self.xml_tree)

    def _get_xml_tree(self, xml_string):
        """Parse *xml_string* and return its root element."""
        return etree.fromstring(xml_string)

    def _check_element_is(self, element, type_char):
        """Return True if *element* is the ``w:<type_char>`` element."""
        return element.tag == '{%s}%s' % (self.WORD_SCHEMA, type_char)

    def _itertext(self, tree):
        """Yield ``(node, node.text)`` for every ``w:t`` element under *tree*.

        ``node.text`` is ``None`` for an empty element; callers must cope.
        """
        # Iterating by qualified tag visits exactly the elements for which
        # _check_element_is(node, 't') is true.
        for node in tree.iter('{%s}t' % self.WORD_SCHEMA):
            yield (node, node.text)

    def _join_tags(self, tree):
        """Merge each ``[...]`` placeholder split over several text nodes.

        When a ``[`` and its matching ``]`` live in different ``w:t``
        elements, the text between them is appended to the element holding
        the ``[``, the elements wholly inside the placeholder are emptied and
        the closing element keeps only what follows the ``]``.

        A ``]`` without a preceding ``[`` is treated as ordinary text, and a
        ``[`` that is never closed leaves the document untouched.
        """
        open_node = None   # element holding the currently unmatched '['
        swallowed = []     # elements lying entirely inside the open placeholder
        pending = []       # placeholder characters seen after open_node

        for node, text in self._itertext(tree):
            if not text:
                # Empty elements (text is None or '') carry nothing to merge.
                continue
            opened_here = False  # True while the open '[' belongs to this node
            for index, char in enumerate(text):
                if char == '[':
                    open_node = node
                    opened_here = True
                    swallowed = []
                    pending = []
                elif char == ']':
                    if open_node is None:
                        # Stray closing bracket: plain text, nothing to join.
                        continue
                    if not opened_here:
                        # The placeholder started in an earlier node: move
                        # its remaining text there and drop it from here.
                        open_node.text += ''.join(pending) + ']'
                        for inner in swallowed:
                            inner.text = ''
                        node.text = text[index + 1:]
                    open_node = None
                    opened_here = False
                elif open_node is not None and not opened_here:
                    pending.append(char)
            if open_node is not None and not opened_here:
                # Whole node is inside a placeholder opened earlier. It is
                # only emptied once the closing bracket is actually found,
                # so an unbalanced '[' cannot erase document text.
                swallowed.append(node)

    def replace(self, variable, value):
        """Replace every ``[variable]`` (case-insensitive) with *value*.

        *value* is inserted literally: backslashes and group references in
        it are not interpreted as regular-expression escapes.
        """
        pattern = re.compile(re.escape('[%s]' % variable), re.IGNORECASE)
        for node, text in self._itertext(self.xml_tree):
            if not text:
                continue
            # A callable replacement keeps re.sub from processing escapes.
            replaced = pattern.sub(lambda _match: value, text)
            if replaced != text:
                node.text = replaced

    def save_as(self, output_filename):
        """Write a copy of the source archive with the edited document body.

        Every member of the source ``.docx`` is copied unchanged except
        ``word/document.xml``, which is replaced by the serialized
        ``xml_tree``. All members are read before the output is opened, so
        *output_filename* may be the same path as the source file.
        """
        document_bytes = etree.tostring(
            self.xml_tree,
            pretty_print=True,
            xml_declaration=True,
            encoding='UTF-8',
            standalone=True,
        )
        with zipfile.ZipFile(self.filename) as source:
            members = [
                (
                    info,
                    document_bytes
                    if info.filename == self.DOCUMENT_PART
                    else source.read(info.filename),
                )
                for info in source.infolist()
            ]
        self.zipfile = source
        with zipfile.ZipFile(output_filename, 'w') as target:
            for info, data in members:
                # Passing the ZipInfo keeps each member's name, timestamp
                # and compression method from the source archive.
                target.writestr(info, data)
if __name__ == "__main__":
    # Example usage: fill two placeholders in test.docx and write the
    # result to a new file, leaving the original untouched.
    doc = docx('test.docx')
    doc.replace('variable', 'value')
    doc.replace('another_variable', 'another value')
    doc.save_as('test_result.docx')
@GurjotSinghMahi
Copy link

What if we want to replace ordinary paragraph text that is not enclosed in brackets, character by character, using a dictionary of replacements? How would we do that? Could you please explain?

@ishahid
Copy link
Author

ishahid commented Apr 15, 2015

Search for the required text the same way I search for variables and replace them with values in the `replace` function.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment