Skip to content

Instantly share code, notes, and snippets.

@emlyn
Created June 25, 2024 22:21
Show Gist options
  • Save emlyn/7ed6ec91734302fa57aa959cd2e5f969 to your computer and use it in GitHub Desktop.
Save emlyn/7ed6ec91734302fa57aa959cd2e5f969 to your computer and use it in GitHub Desktop.
Extract all embedded images from an (XML-format) Word document
#!/usr/bin/env python3
# Extract all embedded images from a Word document at their full resolution.
# In Word: File, Save As, choose XML format
# Then run the resulting Word XML file through this script.
import sys
import base64
from xml.dom.minidom import parse
def main(fname: str) -> None:
    """Extract every uncompressed embedded image from a Word XML package file.

    The file is parsed with minidom and each ``pkg:part`` element is visited.
    A part is written out only when its ``pkg:contentType`` starts with
    ``image/`` and its ``pkg:compression`` attribute equals ``store``; every
    other part is reported and skipped. The image payload is the base64 text
    inside the part's first ``pkg:binaryData`` element.

    Output files are created in the current working directory. The file name
    is the part name with leading slashes removed and remaining slashes
    replaced by underscores. Progress is printed to stdout.

    Args:
        fname: Path of the Word document saved in XML format.
    """
    dom = parse(fname)
    for i, part in enumerate(dom.getElementsByTagName('pkg:part')):
        part_name = part.getAttribute('pkg:name')
        ctype = part.getAttribute('pkg:contentType')
        print(f"\nPart {i+1}: {part_name} ({ctype})")

        if not ctype.startswith('image/'):
            print('Skipping (not an image)')
            continue

        # The parentheses matter: without them ``:=`` binds the result of the
        # ``!=`` comparison, and the message would show True instead of the
        # actual compression value.
        if (comp := part.getAttribute('pkg:compression')) != 'store':
            print(f'Skipping (compressed: {comp})')
            continue

        # An image part without a payload is skipped instead of crashing.
        payload = part.getElementsByTagName('pkg:binaryData')
        if not payload or payload[0].firstChild is None:
            print('Skipping (no binary data)')
            continue

        # Flatten the package path into a single file name.
        out_name = part_name.lstrip('/').replace('/', '_')
        print(f'Extracting to {out_name}')
        with open(out_name, 'wb') as f:
            f.write(base64.b64decode(payload[0].firstChild.data))
# Script entry point: expects exactly one argument, the path of the Word XML file.
if __name__ == '__main__':
    # Check the argument count up front so a wrong invocation gets a usage
    # message rather than a TypeError traceback from main().
    if len(sys.argv) != 2:
        sys.exit(f'Usage: {sys.argv[0]} WORD_XML_FILE')
    main(sys.argv[1])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment