Created

Embed URL

HTTPS clone URL

SSH clone URL

You can clone with HTTPS or SSH.

Download Gist

Python Code to Extract Highlighted Text from DOCX (Word 2007 and Up format)

View example-highlight-extract.py
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
#!usr/bin/python
# -*- coding: utf-8 -*-
from docx import *
document = opendocx(r'test.docx')
words = document.xpath('//w:r', namespaces=document.nsmap)
WPML_URI = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"
tag_rPr = WPML_URI + 'rPr'
tag_highlight = WPML_URI + 'highlight'
tag_val = WPML_URI + 'val'
tag_t = WPML_URI + 't'
for word in words:
for rPr in word.findall(tag_rPr):
high=rPr.findall(tag_highlight)
for hi in high:
if hi.attrib[tag_val] == 'yellow':
print word.find(tag_t).text.encode('utf-8').lower()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Something went wrong with that request. Please try again.