Skip to content

Instantly share code, notes, and snippets.

@davecoutts
Created May 3, 2020 15:45
Show Gist options
  • Star 4 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save davecoutts/0e981c3b5f765320561aa6ca78ddebd2 to your computer and use it in GitHub Desktop.
Save davecoutts/0e981c3b5f765320561aa6ca78ddebd2 to your computer and use it in GitHub Desktop.
Python script to convert Microsoft Word and Excel files from one file format to another
# Text shown at the top of the argparse '--help' output.
DESCRIPTION = \
'''
#--------------------------------------------------------------------------------------------------
Convert Microsoft Word and Excel files from one file format to another.
#--------------------------------------------------------------------------------------------------
'''
# Text shown at the bottom of the argparse '--help' output.
# Every backslash in the Windows paths below is written as '\\' so that no
# sequence is read as an (invalid) string escape.
EPILOG = \
'''
#--------------------------------------------------------------------------------------------------
This script automates the Microsoft Word and Excel applications to open a Word or Excel file in one
format, and save it in another.
A common use case might be to convert Word documents in the older 97-2003 binary '.doc' format
to the newer Open XML '.docx' file format.
The script requires >= python 3.6
The script recursively scans from a base directory and writes out each file in the new format,
to the same directory as the original source file.
e.g. C:\\foo\\a\\word_document_1.doc <-- Original
C:\\foo\\a\\word_document_1.docx <-- New
C:\\foo\\a\\b\\word_document_2.doc <-- Original
C:\\foo\\a\\b\\word_document_2.docx <-- New
## Study the formats in the links below to understand which output formats can be selected.
https://docs.microsoft.com/en-us/office/vba/api/word.wdsaveformat
https://docs.microsoft.com/en-us/office/vba/api/excel.xlfileformat
## Example script use
# Convert from Word '.doc' to '.docx' format, scan starting from the user HOME directory.
# Converting from '.doc' to '.docx', starting from the HOME directory is the script default setting.
python.exe microsoft_doc_converter.py
# Same as above, except as a Dryrun. In Dryrun mode the files to be converted are just listed,
# not converted.
python.exe microsoft_doc_converter.py --dryrun
# Convert from Word '.doc' to '.docx' format, scan starting from the C:\\foo directory.
python.exe microsoft_doc_converter.py --basedir "C:\\foo"
# Convert from Word '.docx' to '.pdf' format, scan starting from the C:\\foo directory.
python.exe microsoft_doc_converter.py -b "C:\\foo" -s "docx" -d "pdf" -f 17
# Convert from Excel '.xls' to '.xlsx' format, scan starting from the C:\\bar directory.
python.exe microsoft_doc_converter.py -c "Excel.application" -b "C:\\bar" -s "xls" -d "xlsx" -f 51
#--------------------------------------------------------------------------------------------------
## !! BIG FAT WARNING !!
File conversions can result in changed/lost content and formatting.
As always, backups and lots of testing is advised.
#--------------------------------------------------------------------------------------------------
# Briefly tested, May 3rd 2020, using,
- Anaconda3-2020.02-Windows-x86_64.exe (Python 3.7.6)
- Windows 10 Enterprise 1909
- Microsoft Office 365 ProPlus, Version 1908
#--------------------------------------------------------------------------------------------------
'''
#--------------------------------------------------------------------------------------------------
# Module metadata: authorship, licensing and release information.
__author__ = "Dave Coutts"
__license__ = "Apache"
__version__ = "1.0.0"
__maintainer__ = "https://github.com/davecoutts"
__status__ = "Production"
#--------------------------------------------------------------------------------------------------
import win32com.client
from pathlib import Path
#--------------------------------------------------------------------------------------------------
def converter(comObject, dirPath, sourceExtension, destinationExtension, fileFormat, dryRun=False):
    """Convert every matching Office file below ``dirPath`` to another format.

    The directory tree is scanned recursively for ``*.<sourceExtension>``
    files. Each one that has no sibling file with ``destinationExtension`` yet
    is opened in the Office application and saved next to the original in the
    requested format. A failure on one file is reported and the remaining
    files are still processed.

    Args:
        comObject: COM ProgID of the application to automate, either
            'Word.application' or 'Excel.application' (case-insensitive).
        dirPath: ``pathlib.Path`` of the directory to scan recursively.
        sourceExtension: Extension of the files to convert, without the dot.
        destinationExtension: Extension of the files to write, without the dot.
        fileFormat: Numeric Office file format passed to ``SaveAs``.
        dryRun: When true, only list the files that would be converted; the
            Office application is not started at all.

    Raises:
        ValueError: If ``comObject`` is not one of the supported applications.
    """
    # COM ProgIDs are case-insensitive, so match them that way and map each
    # application to the collection that opens its files.
    collectionNames = {
        'word.application': 'Documents',
        'excel.application': 'Workbooks',
    }
    collectionName = collectionNames.get(comObject.lower())
    if collectionName is None:
        raise ValueError(
            f"Unsupported COM object {comObject!r}: "
            "expected 'Word.application' or 'Excel.application'."
        )

    # Office resolves relative paths against its own working directory, not
    # this script's, so only ever hand it absolute paths.
    baseDir = dirPath.resolve()

    pending = []
    for sourceFile in sorted(baseDir.rglob(f'*.{sourceExtension}')):
        destinationFile = sourceFile.with_suffix(f'.{destinationExtension}')
        # Never overwrite an existing conversion.
        if not destinationFile.is_file():
            pending.append((sourceFile, destinationFile))

    if dryRun:
        for sourceFile, _ in pending:
            print(f'Converting: {sourceFile}')
        return

    # Do not launch Office when there is nothing to convert.
    if not pending:
        return

    msApp = win32com.client.Dispatch(comObject)
    try:
        for sourceFile, destinationFile in pending:
            print(f'Converting: {sourceFile}')
            # Best effort per file: report the failure and carry on.
            try:
                doc = getattr(msApp, collectionName).Open(str(sourceFile))
                try:
                    doc.SaveAs(str(destinationFile), FileFormat=fileFormat)
                finally:
                    # Always release the document, even when SaveAs failed.
                    # The converted copy has already been written by SaveAs,
                    # so close without saving (and without a save prompt).
                    doc.Close(False)
            except Exception as e:
                print(f'Failed to Convert: {sourceFile} : {e}')
    finally:
        # Always shut the application down so no hidden Office process is
        # left running if the loop is interrupted.
        msApp.Quit()

    return
#--------------------------------------------------------------------------------------------------
def main():
    """Parse the command-line options and run the conversion.

    Builds the argument parser (using the module-level DESCRIPTION and EPILOG
    as help text), checks that the base directory exists, and passes the
    parsed options to ``converter``. Exits through ``parser.error`` when the
    base directory is not an existing directory.
    """
    import argparse

    parser = argparse.ArgumentParser(
        epilog=EPILOG,
        description=DESCRIPTION,
        formatter_class=argparse.RawTextHelpFormatter
    )
    parser.add_argument('-c', '--comobject',
        dest='comobject',
        type=str,
        default='Word.application',
        help="COM Object name of the application to be called. 'Word.application' or 'Excel.application'. Default, 'Word.application'."
    )
    parser.add_argument('-b', '--basedir',
        dest='basedir',
        type=Path,
        default=Path.home(),
        help='Directory to start the recursive scan from. Default, users HOME directory'
    )
    parser.add_argument('-s', '--srcext',
        dest='sourceextension',
        type=str,
        default='doc',
        help="File extension of the source files to be converted. Default, 'doc'."
    )
    parser.add_argument('-d', '--destext',
        dest='destinationextension',
        type=str,
        default='docx',
        help="File extension of the resulting converted file. Default, 'docx'."
    )
    parser.add_argument('-f', '--filefmt',
        dest='fileformat',
        type=int,
        default=16,
        help="Microsoft file format number of the output format. Default, 16."
    )
    parser.add_argument('--dryrun',
        dest='dryrun',
        action="store_true",
        default=False,
        help='Print out all files to be converted but do not carry out the actual conversion.'
    )
    args = parser.parse_args()

    # A missing or mistyped base directory would make the recursive scan come
    # back empty without any message, so reject it explicitly.
    if not args.basedir.is_dir():
        parser.error(f"--basedir '{args.basedir}' is not an existing directory.")

    converter(
        comObject=args.comobject,
        dirPath=args.basedir,
        sourceExtension=args.sourceextension,
        destinationExtension=args.destinationextension,
        fileFormat=args.fileformat,
        dryRun=args.dryrun
    )
#--------------------------------------------------------------------------------------------------
# Run the command-line interface only when executed as a script, not on import.
if __name__ == '__main__':
    main()
#--------------------------------------------------------------------------------------------------
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment