Skip to content

Instantly share code, notes, and snippets.

@jojonas
Last active March 6, 2018 03:20
Show Gist options
  • Save jojonas/094a8d426c24aa35a0944b6de13bbe29 to your computer and use it in GitHub Desktop.
Save jojonas/094a8d426c24aa35a0944b6de13bbe29 to your computer and use it in GitHub Desktop.
Change the language of Microsoft Office documents
"""
Changing the spelling language for Microsoft Office documents is a pain.
You can set the default spell-checker language as you desire, but this will
not modify existing templates. Templates are copied once, during installation,
to the path '%appdata%\\Microsoft\\Templates' and are not touched again. Text elements within
these templates will retain their default spelling language.
This leads to a quite painful user experience when working with templates
and multiple languages.
Luckily, Office documents are nowadays (since Office 2007) stored in the
"Office Open XML" format, which is a ZIP file containing a bunch of XML
files. The spelling language is an attribute on some of these XML elements.
This script changes the spelling language of all text boxes, etc., by iterating
through the XML files and changing the content of the "lang" attribute.
It will create a new Office Open XML document with the modified XML files
(and all other non-XML files), suffixed by the desired language.
Example: Translating "my_slides.pptx" to American English will result in a
file called "my_slides_en-US.pptx".
The target language can be given with the "--lang" argument. See "--help"
for details.
(C) 2016 Jonas Lieb
"""
import os, os.path
import zipfile
import tempfile
import re
def translate_xml(filename, lang="en-US", only=None):
    """Replace the value of ``lang="..."`` attributes in an XML file, in place.

    The whole file is read into memory as UTF-8, matching attribute values
    are substituted, and the file is written back only if something changed.

    Args:
        filename: Path of the XML file to modify in place.
        lang: Language tag to write into the matched attributes.
        only: Optional collection of language tags. When given, only
            attributes whose current value is in this collection are
            changed; ``None`` means every value is eligible.

    Returns:
        None. A summary line is printed when at least one value was replaced.
    """
    # We are using a "simple" regular expression here instead of a real XML
    # parser because the Python Standard Library parser (ElementTree) changes
    # the namespace prefixes within the XML files. That is equivalent in XML
    # terms, but causes Microsoft Office to report an error and offer to
    # repair the file on opening it.
    # NOTE: the leading "[^>]*" is greedy, so when a single tag carries
    # several attributes ending in 'lang="', only the last one is matched.
    pattern = r'(<[^>]*lang=")([^\"]*)("[^>]*>)'

    # newline="" switches off newline translation, so the line endings of the
    # XML part are written back exactly as they were read.
    with open(filename, 'r', encoding="utf-8", newline="") as file:
        text = file.read()

    # Languages that were actually replaced (neat for debugging).
    seen = set()

    def replace(match):
        """Return the matched tag with its language value swapped if eligible."""
        old_language = match.group(2)
        if only is not None and old_language not in only:
            return match.group(0)
        if old_language == lang:
            return match.group(0)
        seen.add(old_language)
        # Join the groups directly rather than building a template for
        # match.expand(): a template would reinterpret digits or backslashes
        # in `lang` (e.g. "\1" + "1033" is parsed as an octal escape).
        return match.group(1) + lang + match.group(3)

    translated = re.sub(pattern, replace, text)

    # `seen` is non-empty exactly when a replacement happened; untouched
    # files are left alone.
    if not seen:
        return

    with open(filename, 'w', encoding="utf-8", newline="") as file:
        file.write(translated)

    # Sorted so the debugging output is deterministic.
    print("Modified file '%s'. Observed languages (before): %s"
          % (os.path.basename(filename), ",".join(sorted(seen))))
def translate_archive(filename, lang="en-US", only=None):
    """Write a copy of an Office Open XML document with changed languages.

    The copy is created next to the source file, with "_<lang>" inserted
    before the file extension (e.g. "my_slides.pptx" becomes
    "my_slides_en-US.pptx"). Every archive member is copied; members whose
    name ends in ".xml" are passed through translate_xml() first.

    Args:
        filename: Path of the source document (a ZIP archive).
        lang: Language tag to write; also used as the file name suffix.
            The default uses a hyphen ("en-US") to agree with
            translate_xml() and the command-line default.
        only: Optional collection of language tags to replace, forwarded to
            translate_xml(); ``None`` replaces all languages.

    Raises:
        FileExistsError: If the destination file already exists (the
            archive is opened in exclusive-creation mode 'x').
    """
    # Append language suffix
    old_name, old_extension = os.path.splitext(filename)
    new_filename = old_name + "_" + lang + old_extension
    print("Translating document '%s' to '%s'." % (filename, new_filename))

    # Members are unpacked into a temporary directory, which is removed
    # again when the context manager is left.
    with tempfile.TemporaryDirectory() as tmpdir:
        print("Using temporary directory '%s'." % tmpdir)

        # Office Open XML files are plain .zip files with a fancy extension,
        # so the zipfile module can open them.
        with zipfile.ZipFile(filename, 'r') as source_zip, \
                zipfile.ZipFile(new_filename, 'x') as destination_zip:
            # Extract the members one by one so that each ZipInfo object
            # (archive name, compression type) stays at hand for writing.
            for fileinfo in source_zip.infolist():
                extracted_filename = source_zip.extract(fileinfo, path=tmpdir)

                # Only touch .xml files
                extension = os.path.splitext(fileinfo.filename)[1].lower()
                if extension == ".xml":
                    translate_xml(extracted_filename, lang=lang, only=only)

                # Write to the destination archive under the original member
                # name and with the original compression type.
                destination_zip.write(extracted_filename,
                                      arcname=fileinfo.filename,
                                      compress_type=fileinfo.compress_type)
if __name__ == "__main__":
    import argparse

    # File extensions accepted on the command line: the Office Open XML
    # document types plus the template variants (.dotx, .potx, .xltx) that
    # Microsoft uses for its templates.
    supported = (".docx", ".dotx", ".xlsx", ".xltx", ".pptx", ".potx", ".ppsx")
    supported_list = ", ".join(supported)

    parser = argparse.ArgumentParser(
        description="Change spelling language of all elements in an Office document. "
                    "Supported file types: %s" % supported_list,
    )

    def file_type(value):
        """Argparse type callback: accept only paths with a supported extension."""
        _, suffix = os.path.splitext(value)
        if suffix.lower() not in supported:
            # parser.error() prints the message and exits the program.
            parser.error("File type must be one of: %s." % supported_list)
        return value

    parser.add_argument(
        "filename",
        type=file_type,
        nargs="+",
        help="Files to process",
    )
    parser.add_argument(
        "--lang",
        type=str,
        default="en-US",
        help="Destination language (e.g. en-US or de-DE)",
    )
    parser.add_argument(
        "--only",
        type=str,
        nargs="*",
        default=None,
        help="Only substitute these languages, default: all.",
    )

    args = parser.parse_args()

    # Each input document gets its own translated copy.
    for document in args.filename:
        translate_archive(document, lang=args.lang, only=args.only)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment