Skip to content

Instantly share code, notes, and snippets.

@jeffxf
Forked from etienned/extractdocx.py
Created October 17, 2017 02:56
Show Gist options
  • Save jeffxf/ce89ff671c335a5c7e87a612290df332 to your computer and use it in GitHub Desktop.
Save jeffxf/ce89ff671c335a5c7e87a612290df332 to your computer and use it in GitHub Desktop.
Simple function to extract text from MS XML Word document (.docx) without any dependencies.
#!/usr/bin/python
# Prefer the C-accelerated parser where it still exists (Python 2 and early
# Python 3); cElementTree was removed in Python 3.9, where the plain
# ElementTree module is the only implementation.
try:
    from xml.etree.cElementTree import XML
except ImportError:
    from xml.etree.ElementTree import XML
import zipfile
import sys
"""
Extracts formulas (field instructions) from a .docx document.
Created to detect malicious .docx documents containing DDE formulas.
Modified from @etienned's docx text-extraction Python script:
https://gist.github.com/etienned/7539105
"""
# Element names in ElementTree's "{namespace-uri}localname" form.
WORD_NAMESPACE = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
PARA = WORD_NAMESPACE + 'p'          # <w:p> paragraph element
TEXT = WORD_NAMESPACE + 'instrText'  # <w:instrText> element whose text is collected


def get_docx_formulas(path):
    """Return the instrText content of a .docx file as a single string.

    Reads ``word/document.xml`` from the archive at *path*, and for every
    ``<w:p>`` element concatenates the text of all ``<w:instrText>``
    descendants. Paragraphs that contain no such text are skipped.

    :param path: file name (or file-like object) of the .docx archive.
    :returns: the per-paragraph strings joined by a blank line (``'\\n\\n'``);
        an empty string when no instrText is present.
    :raises zipfile.BadZipfile: if *path* is not a valid zip archive.
    :raises KeyError: if the archive has no ``word/document.xml`` member.
    :raises IOError: if *path* cannot be opened.
    """
    # The context manager closes the archive even when read() raises.
    with zipfile.ZipFile(path) as document:
        xml_content = document.read('word/document.xml')
    tree = XML(xml_content)

    paragraphs = []
    # Element.iter() replaces getiterator(), which was removed in Python 3.9.
    for paragraph in tree.iter(PARA):
        texts = [node.text for node in paragraph.iter(TEXT) if node.text]
        if texts:
            paragraphs.append(''.join(texts))
    return '\n\n'.join(paragraphs)
if __name__ == "__main__":
    # Command-line entry point: print the formulas found in the given file.
    # A missing argument is the only case that warrants the usage message;
    # extra arguments are ignored.
    if len(sys.argv) < 2:
        print('usage: ' + sys.argv[0] + ' [docx file]')
        sys.exit(1)
    try:
        print(get_docx_formulas(sys.argv[1]))
    except (IOError, OSError, KeyError, zipfile.BadZipfile) as err:
        # Unreadable path, missing word/document.xml, or not a zip archive:
        # report the real cause instead of a misleading usage line.
        sys.stderr.write('error: could not read %s: %s\n' % (sys.argv[1], err))
        sys.exit(1)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment