Skip to content

Instantly share code, notes, and snippets.

@JGVerdugo
Last active August 29, 2015 14:17
Show Gist options
  • Save JGVerdugo/37bfc77286992cfadfa9 to your computer and use it in GitHub Desktop.
Save JGVerdugo/37bfc77286992cfadfa9 to your computer and use it in GitHub Desktop.
#!/usr/bin/python
# Runs in Python 3
# Processes MSWord files with Tika. Zero exception or error control, use at own risk.
# Assumes lowercase .doc and .docx extensions.
# Output is written to new files with corresponding file exts in the same dir.
import os
import sys
import subprocess
import glob
# Usage: python convertdoc.py [dir] [meta/txt/html/xml]
# Default directory; replaced by the first command-line argument when given.
curDir = "."
# Path to the Tika application jar. This is a hard-coded absolute path, so
# it has to be adjusted for the machine the script runs on.
tika = "/home/pepe/code/tika-1.7/tika-app-1.7.jar"
mode = "--metadata" # Default conversion mode. Pass "txt", "html"
# or "xml" on the command line to select one of the others.
def newName(name, mode):
    """Return the output file name for converting `name` in the given mode.

    The extension of `name` is replaced by one that reflects the Tika
    mode flag:

        "--metadata" -> ".meta"
        "--text"     -> ".txt"
        "--html"     -> ".html"
        "--xml"      -> ".xml"

    Any other mode falls back to ".meta".

    The extension is split off with os.path.splitext instead of dropping
    a fixed three characters, so "report.docx" maps to "report.txt"
    (a fixed slice would produce "report.dtxt").

    Note that "report.doc" and "report.docx" in the same directory map to
    the same output name.
    """
    suffixes = {
        "--metadata": "meta",
        "--text": "txt",
        "--html": "html",
        "--xml": "xml",
    }
    root, _ext = os.path.splitext(name)
    return root + "." + suffixes.get(mode, "meta")
# Command-line overrides: argv[1] is the directory to scan, argv[2] is the
# conversion mode keyword (meta, txt, html or xml).
if len(sys.argv) >= 2:
    curDir = sys.argv[1]
if len(sys.argv) >= 3:
    # Translate the keyword into the matching Tika flag. Any keyword that
    # is not listed here (including "meta") selects metadata extraction.
    mode = {
        "txt": "--text",
        "html": "--html",
        "xml": "--xml",
    }.get(sys.argv[2], "--metadata")
# Convert every file in curDir whose name matches "*.do*" (.doc, .docx, ...).
# An uppercase pattern such as "*.DOC" is not included; whether uppercase
# extensions are matched depends on the platform's glob behaviour.
filelist = glob.glob(os.path.join(curDir, "*.do*"))
for filename in filelist:
    print("Processing " + filename + "...")
    newFile = newName(filename, mode)
    # The context manager closes the output file even if launching java
    # raises (e.g. the executable is not on PATH).
    with open(newFile, "w") as f:
        status = subprocess.call(["java", "-jar", tika, mode, filename], stdout=f)
    if status != 0:
        # Report the failure but keep going with the remaining files.
        print(
            "Tika exited with status " + str(status) + " for " + filename,
            file=sys.stderr,
        )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment