Skip to content

Instantly share code, notes, and snippets.

@KakoozaJerry
Created July 2, 2021 16:47
Show Gist options
  • Save KakoozaJerry/d41178af0b8d35398d4397d73fbb47b3 to your computer and use it in GitHub Desktop.
Save KakoozaJerry/d41178af0b8d35398d4397d73fbb47b3 to your computer and use it in GitHub Desktop.
try:
from xml.etree.cElementTree import XML
except ImportError:
from xml.etree.ElementTree import XML
import os
import re
import pathlib
import zipfile
# ElementTree addresses namespaced elements as "{namespace-uri}tag"; these are
# the WordprocessingML paragraph (<w:p>) and text (<w:t>) element names.
WORD_NAMESPACE = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
PARA = WORD_NAMESPACE + 'p'
TEXT = WORD_NAMESPACE + 't'


def get_docx_text(path):
    """Return the non-empty paragraphs of a .docx file as a list of strings.

    Args:
        path: Location of the .docx archive; passed unchanged to
            ``zipfile.ZipFile``.

    Returns:
        A list with one string per paragraph, in document order. Each string
        is the concatenation of the paragraph's text nodes. Paragraphs that
        contain no text are omitted.

    Raises:
        zipfile.BadZipFile: If ``path`` is not a valid zip archive.
        KeyError: If the archive has no ``word/document.xml`` member.
    """
    # The context manager closes the archive even when reading the member fails.
    with zipfile.ZipFile(path) as document:
        xml_content = document.read('word/document.xml')
    tree = XML(xml_content)
    paragraphs = []
    # Element.getiterator() was removed in Python 3.9; Element.iter() is the
    # supported equivalent and walks the subtree in document order.
    for paragraph in tree.iter(PARA):
        texts = [node.text for node in paragraph.iter(TEXT) if node.text]
        if texts:
            paragraphs.append(''.join(texts))
    return paragraphs
def sentences(par):
    """Split a paragraph into a list of sentences.

    A sentence boundary is recognised after any run of ``?`` / ``!``
    characters, and at a full stop that follows a lowercase letter and is
    followed by a single space and an uppercase letter. Existing newlines in
    ``par`` also act as boundaries.

    Args:
        par: The paragraph text.

    Returns:
        The sentences in order, each stripped of surrounding whitespace.
        Empty pieces are dropped, so an empty or blank paragraph yields ``[]``.
    """
    # Break after a whole run of '?'/'!' so that "?!" stays with its sentence.
    pars = re.sub(r'([?!]+)', r'\g<1>\n', par)
    # The dot is escaped: unescaped it matched any character, which split the
    # text before every capitalised word (e.g. "John Smith").
    pars = re.sub(r'([a-z]\.)( )([A-Z])', r'\g<1>\n\g<3>', pars)
    stripped = (piece.strip() for piece in pars.split('\n'))
    return [piece for piece in stripped if piece]
# Find every .docx file below the current directory and print one
# tab-separated row per sentence: file path, "<paragraph>.<sentence>"
# (both numbered from 1), and the sentence text.
files = list(pathlib.Path("./").rglob("*.docx"))
for fn in files:
    for par_no, par in enumerate(get_docx_text(fn), start=1):
        for sent_no, sent in enumerate(sentences(par), start=1):
            print('%s\t%d.%d\t%s' % (fn, par_no, sent_no, sent))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment