# jojonas/changeofficelang.py

## changeofficelang.py
"""
	Changing the spelling language for Microsoft Office documents is a pain.
	You can set the default spell-checker language as you desire, but this will
	not modify existing templates. Templates are copied during installation
	to the path '%appdata%\\Microsoft\\Templates' and done. Text elements within
	these templates will retain their default spelling language.

	This leads to a quite painful user experience when working with templates
	and multiple languages.

	Luckily, Office documents are nowadays (since Office 2007) stored in the
	"Office Open XML" format, which is a ZIP file containing a bunch of XML
	files. The spelling language is an attribute on some of these XML elements.

	This script changes the spelling of all text-boxes, etc, by iterating
	through the XML files and changing the content of the "lang" attribute.

	It will create a new Office Open XML document with the modified XML files
	(and all other non-XML files), suffixed by the desired language.

	Example: Translating "my_slides.pptx" to American English will result in a
	file called "my_slides_en-US.pptx".

	The target language can be given with the "--lang" argument. See "--help"
	for details.

	(C) 2016 Jonas Lieb

"""

import os, os.path
import zipfile
import tempfile
import re

def translate_xml(filename, lang="en-US", only=None):
	"""Rewrite the value of ``lang="..."`` attributes of one XML file in place.

	Within each tag, the last attribute written as ``...lang="value"`` gets its
	value replaced by *lang*. The file is read and written as UTF-8 with
	newline translation disabled, so everything apart from the replaced
	values is preserved character by character.

	Args:
		filename: Path of the XML file to modify in place.
		lang: Language tag to write, e.g. ``"en-US"``. It is inserted
			literally.
		only: Optional collection of language tags. When given, only
			attributes currently holding one of these values are changed;
			``None`` changes every language.

	Returns:
		None. A summary line is printed when at least one value was replaced.
	"""
	# A "simple" regular expression is used instead of a real XML parser
	# because the standard library parser (ElementTree) changes the namespace
	# prefixes within the XML files. That is equivalent as far as XML is
	# concerned, but causes Microsoft Office to throw an error and offer to
	# repair the file on opening it.
	pattern = r'(<[^>]*lang=")([^\"]*)("[^>]*>)'

	# The entire file content is read into memory.
	# newline="" switches off newline translation: without it "\r\n" would be
	# read as "\n" and written back in the platform's convention, silently
	# modifying parts of the file this function has no business touching.
	with open(filename, 'r', encoding="utf-8", newline="") as file:
		text = file.read()

	# Languages that were actually replaced (neat for debugging)
	seen = set()

	# Callback for regular-expression matching
	def replace(match):
		old_language = match.group(2)
		if only is not None and old_language not in only:
			return match.group(0)
		if old_language == lang:
			return match.group(0)

		seen.add(old_language)
		# Plain concatenation rather than match.expand(): "lang" must not be
		# interpreted as a replacement template (a value starting with a
		# digit would otherwise be merged into the "\1" group reference).
		return match.group(1) + lang + match.group(3)

	# Perform the RE-substitution
	translated = re.sub(pattern, replace, text)

	# "seen" is empty exactly when no attribute was changed; the file is then
	# left untouched instead of being rewritten with identical content.
	if not seen:
		return

	# Dump everything back to the same (temporary!) XML file
	with open(filename, 'w', encoding="utf-8", newline="") as file:
		file.write(translated)

	# Sorted so that the debugging output is reproducible between runs.
	print("Modified file '%s'. Observed languages (before): %s"
			% (os.path.basename(filename), ",".join(sorted(seen))))


def _copy_members(source_zip, destination_zip, tmpdir, lang, only):
	"""Copy all members of *source_zip* to *destination_zip*, translating ``.xml`` members."""
	# The members are extracted one by one so that each ZipInfo object
	# (archive name, compression type) is at hand when writing it back.
	for fileinfo in source_zip.infolist():
		extracted_filename = source_zip.extract(fileinfo, path=tmpdir)

		# Only touch .xml files
		extension = os.path.splitext(fileinfo.filename)[1].lower()
		if extension == ".xml":
			translate_xml(extracted_filename, lang=lang, only=only)

		# Write to destination file under the original archive name
		destination_zip.write(extracted_filename,
			arcname=fileinfo.filename,
			compress_type=fileinfo.compress_type)


def translate_archive(filename, lang="en-US", only=None):
	"""Create a copy of an Office Open XML document with a new spelling language.

	The copy is written next to the source as ``<name>_<lang><extension>``.
	Members whose name ends in ``.xml`` are passed through ``translate_xml``;
	all other members are copied unmodified.

	Args:
		filename: Path of the source document (a ZIP archive).
		lang: Language tag to apply, e.g. ``"en-US"``; also used as the
			suffix of the new file name.
		only: Optional collection of language tags to replace, forwarded to
			``translate_xml``. ``None`` replaces every language.

	Raises:
		FileExistsError: If the destination file already exists; it is
			opened in exclusive-creation mode and never overwritten.
	"""
	# Append language suffix
	old_name, old_extension = os.path.splitext(filename)
	new_filename = old_name + "_" + lang + old_extension

	print("Translating document '%s' to '%s'." % (filename, new_filename))

	# Obtain a temporary directory for the extracted members. Python deletes
	# it when the context manager is left.
	with tempfile.TemporaryDirectory() as tmpdir:
		print("Using temporary directory '%s'." % tmpdir)

		# Use the zipfile module for opening Office Open XML files (yes,
		# they're just plain old .zip files with a fancy extension).
		with zipfile.ZipFile(filename, 'r') as source_zip:
			# Opened outside the try block: if this raises FileExistsError
			# the file belongs to somebody else and must not be removed.
			destination_zip = zipfile.ZipFile(new_filename, 'x')
			try:
				with destination_zip:
					_copy_members(source_zip, destination_zip, tmpdir, lang, only)
			except BaseException:
				# Do not leave a truncated document behind; because of the
				# exclusive-creation mode it would block every later run.
				if os.path.exists(new_filename):
					os.remove(new_filename)
				raise


if __name__ == "__main__":
	import argparse

	# Accepted file extensions. Wikipedia lists only .docx, .pptx and .xlsx as
	# valid extensions for Office Open XML, but Microsoft uses the .dotx,
	# .potx and .xltx files for its templates.
	supported = (".docx", ".dotx", ".xlsx", ".xltx", ".pptx", ".potx", ".ppsx")
	supported_text = ", ".join(supported)

	parser = argparse.ArgumentParser(
		description="Change spelling language of all elements in an Office document. "
			"Supported file types: %s" % supported_text)

	def file_type(filename):
		"""Argparse "type" callback: pass *filename* through if its extension is supported.

		Any other extension is reported through ``parser.error``.
		"""
		_, extension = os.path.splitext(filename)
		if extension.lower() not in supported:
			parser.error("File type must be one of: %s." % supported_text)
		return filename

	parser.add_argument(
		"filename",
		type=file_type,
		nargs="+",
		help="Files to process",
	)
	parser.add_argument(
		"--lang",
		type=str,
		default="en-US",
		help="Destination language (e.g. en-US or de-DE)",
	)
	parser.add_argument(
		"--only",
		type=str,
		nargs="*",
		default=None,
		help="Only substitute these languages, default: all.",
	)

	options = parser.parse_args()

	# Each document given on the command line is converted independently.
	for document in options.filename:
		translate_archive(document, lang=options.lang, only=options.only)
	"""
	Changing the spelling language for Microsoft Office documents is a pain.
	You can set the default spell-checker language as you desire, but this will
	not modify existing templates. Templates are copied during installation
	to the path '%appdata%\\Microsoft\\Templates' and done. Text elements within
	these templates will retain their default spelling language.

	This leads to a quite painful user experience when working with templates
	and multiple languages.

	Luckily, Office documents are nowadays (since Office 2007) stored in the
	"Office Open XML" format, which is a ZIP file containing a bunch of XML
	files. The spelling language is an attribute on some of these XML elements.

	This script changes the spelling of all text-boxes, etc, by iterating
	through the XML files and changing the content of the "lang" attribute.

	It will create a new Office Open XML document with the modified XML files
	(and all other non-XML files), suffixed by the desired language.

	Example: Translating "my_slides.pptx" to American English will result in a
	file called "my_slides_en-US.pptx".

	The target language can be given with the "--lang" argument. See "--help"
	for details.

	(C) 2016 Jonas Lieb

	"""

	import os, os.path
	import zipfile
	import tempfile
	import re

	# NOTE(review): this definition repeats translate_xml from earlier in the
	# file; its indentation had been flattened and the regular expression had
	# lost its "*" quantifiers. Both are restored here - confirm whether the
	# repeated copy is needed at all.
	def translate_xml(filename, lang="en-US", only=None):
		"""Rewrite the value of ``lang="..."`` attributes of one XML file in place.

		Within each tag, the last attribute written as ``...lang="value"`` gets
		its value replaced by *lang*. The file is read and written as UTF-8
		with newline translation disabled, so everything apart from the
		replaced values is preserved character by character.

		Args:
			filename: Path of the XML file to modify in place.
			lang: Language tag to write, e.g. ``"en-US"``. It is inserted
				literally.
			only: Optional collection of language tags. When given, only
				attributes currently holding one of these values are
				changed; ``None`` changes every language.

		Returns:
			None. A summary line is printed when at least one value was
			replaced.
		"""
		# A "simple" regular expression is used instead of a real XML parser
		# because the standard library parser (ElementTree) changes the
		# namespace prefixes within the XML files. That is equivalent as far
		# as XML is concerned, but causes Microsoft Office to throw an error
		# and offer to repair the file on opening it.
		pattern = r'(<[^>]*lang=")([^\"]*)("[^>]*>)'

		# The entire file content is read into memory.
		# newline="" switches off newline translation so that "\r\n" line
		# endings are written back exactly as they were read.
		with open(filename, 'r', encoding="utf-8", newline="") as file:
			text = file.read()

		# Languages that were actually replaced (neat for debugging)
		seen = set()

		# Callback for regular-expression matching
		def replace(match):
			old_language = match.group(2)
			if only is not None and old_language not in only:
				return match.group(0)
			if old_language == lang:
				return match.group(0)

			seen.add(old_language)
			# Plain concatenation rather than match.expand(): "lang" must
			# not be interpreted as a replacement template.
			return match.group(1) + lang + match.group(3)

		# Perform the RE-substitution
		translated = re.sub(pattern, replace, text)

		# "seen" is empty exactly when no attribute was changed; the file is
		# then left untouched.
		if not seen:
			return

		# Dump everything back to the same (temporary!) XML file
		with open(filename, 'w', encoding="utf-8", newline="") as file:
			file.write(translated)

		# Sorted so that the debugging output is reproducible between runs.
		print("Modified file '%s'. Observed languages (before): %s"
				% (os.path.basename(filename), ",".join(sorted(seen))))


	# NOTE(review): this definition repeats translate_archive from earlier in
	# the file; its indentation had been flattened and is restored here -
	# confirm whether the repeated copy is needed at all.
	def translate_archive(filename, lang="en-US", only=None):
		"""Create a copy of an Office Open XML document with a new spelling language.

		The copy is written next to the source as ``<name>_<lang><extension>``.
		Members whose name ends in ``.xml`` are passed through
		``translate_xml``; all other members are copied unmodified.

		Args:
			filename: Path of the source document (a ZIP archive).
			lang: Language tag to apply, e.g. ``"en-US"``; also used as the
				suffix of the new file name.
			only: Optional collection of language tags to replace, forwarded
				to ``translate_xml``. ``None`` replaces every language.

		Raises:
			FileExistsError: If the destination file already exists; it is
				opened in exclusive-creation mode and never overwritten.
		"""
		# Append language suffix
		old_name, old_extension = os.path.splitext(filename)
		new_filename = old_name + "_" + lang + old_extension

		print("Translating document '%s' to '%s'." % (filename, new_filename))

		# Obtain a temporary directory for the extracted members. Python
		# deletes it when the context manager is left.
		with tempfile.TemporaryDirectory() as tmpdir:
			print("Using temporary directory '%s'." % tmpdir)

			# Use the zipfile module for opening Office Open XML files (yes,
			# they're just plain old .zip files with a fancy extension).
			with zipfile.ZipFile(filename, 'r') as source_zip:
				# Opened outside the try block: if this raises
				# FileExistsError the file must not be removed.
				destination_zip = zipfile.ZipFile(new_filename, 'x')
				try:
					with destination_zip:
						# The members are extracted one by one to keep track
						# of the ZipInfo objects during translation.
						for fileinfo in source_zip.infolist():
							extracted_filename = source_zip.extract(fileinfo, path=tmpdir)

							# Only touch .xml files
							extension = os.path.splitext(fileinfo.filename)[1].lower()
							if extension == ".xml":
								translate_xml(extracted_filename, lang=lang, only=only)

							# Write to destination file
							destination_zip.write(extracted_filename,
								arcname=fileinfo.filename,
								compress_type=fileinfo.compress_type)
				except BaseException:
					# Do not leave a truncated document behind; because of
					# the exclusive-creation mode it would block later runs.
					if os.path.exists(new_filename):
						os.remove(new_filename)
					raise


	# NOTE(review): this entry point repeats the one earlier in the file; its
	# indentation had been flattened and is restored here. Running both would
	# process every document twice and the second pass would fail because the
	# output already exists - confirm whether this copy should be removed.
	if __name__ == "__main__":
		import argparse

		# Supported files. Wikipedia lists only .docx, .pptx and .xlsx as
		# valid extensions for Office Open XML, but Microsoft uses the .dotx,
		# .potx and .xltx files for its templates.
		supported = (".docx", ".dotx", ".xlsx", ".xltx", ".pptx", ".potx", ".ppsx")

		parser = argparse.ArgumentParser(
			description="Change spelling language of all elements in an Office document. "
				"Supported file types: %s" % (", ".join(supported)))

		# Custom "type-conversion" function which checks the file extension.
		# It is called automatically by the argument parser.
		def file_type(filename):
			"""Return *filename* if its extension is supported, else report a usage error."""
			ext = os.path.splitext(filename)[1].lower()
			if ext not in supported:
				parser.error("File type must be one of: %s." % (", ".join(supported)))
			return filename

		parser.add_argument("filename", type=file_type, nargs="+", help="Files to process")
		parser.add_argument("--lang", type=str, help="Destination language (e.g. en-US or de-DE)", default="en-US")
		parser.add_argument("--only", type=str, help="Only substitute these languages, default: all.", nargs="*", default=None)

		args = parser.parse_args()

		for filename in args.filename:
			translate_archive(filename, lang=args.lang, only=args.only)