Skip to content

Instantly share code, notes, and snippets.

@markrwilliams
Created June 7, 2013 03:59
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save markrwilliams/5726983 to your computer and use it in GitHub Desktop.
Save markrwilliams/5726983 to your computer and use it in GitHub Desktop.
import lxml.etree
from zipfile import ZipFile
class DocX(object):
    """Plain-text access to the main document part of a .docx archive.

    The constructor opens the given file as a zip archive, parses the
    ``word/document.xml`` member with lxml and keeps the parsed tree in
    ``self.doc`` and the root element's namespace map in ``self.nsmap``.
    """

    # Archive member that the constructor parses.
    DOCPATH = 'word/document.xml'

    def __init__(self, fn):
        """Parse the document part out of the archive ``fn``.

        ``fn`` is handed straight to ``zipfile.ZipFile``.
        """
        # Context-manage the archive as well as the member stream so the
        # underlying file handle is released as soon as parsing is done.
        with ZipFile(fn) as archive:
            with archive.open(self.DOCPATH) as f:
                self.doc = lxml.etree.parse(f)
        self.nsmap = self.doc.getroot().nsmap

    @property
    def paragraphs(self):
        """Return a list holding the text of each ``w:p`` element.

        For every paragraph, the text of all ``w:t`` descendants is
        concatenated in document order.  A paragraph without any text
        contributes an empty string.
        """
        # lxml's XPath refuses a None (default namespace) prefix, so pass
        # only the prefixed entries; self.nsmap itself is left untouched.
        namespaces = dict(
            (prefix, uri) for prefix, uri in self.nsmap.items() if prefix
        )
        paragraphs = []
        for p in self.doc.xpath('.//w:p', namespaces=namespaces):
            runs = p.xpath('.//w:t', namespaces=namespaces)
            # An empty <w:t/> element has ``text`` set to None; treat it as
            # an empty string instead of failing on the concatenation.
            paragraphs.append(''.join(run.text or '' for run in runs))
        return paragraphs
if __name__ == '__main__':
    # Command-line entry point: extract the paragraphs of the given .docx
    # and write them, newline-separated, to --output or to standard output.
    import argparse
    import codecs

    parser = argparse.ArgumentParser(description='.docx plain text extractor')
    parser.add_argument('docx',
                        help='the source docx')
    parser.add_argument('--output',
                        '-o',
                        default=None,
                        help='output file')
    args = parser.parse_args()

    # Read the document named on the command line (the positional ``docx``
    # argument) rather than a fixed path.
    plaintext = '\n'.join(DocX(args.docx).paragraphs)

    if args.output:
        # Output files are always written as UTF-8.
        with codecs.open(args.output, 'w', encoding='utf-8') as f:
            f.write(plaintext)
    else:
        # Single-argument call form is valid on both Python 2 and Python 3.
        print(plaintext)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment