Last active
August 29, 2015 14:22
-
-
Save 4e1e0603/95761342159c9df46d0a to your computer and use it in GitHub Desktop.
Získání obsahu Word dokumentu
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def get_docx_paragraphs_content(path, delimiter="\n"):
    """
    Return the text content of the paragraphs of a Word (.docx) document.

    The document is opened as a ZIP archive, ``word/document.xml`` is parsed
    and every ``w:p`` (paragraph) element is scanned for ``w:t`` (text)
    elements.  Paragraphs that contain no non-empty text are skipped.

    :param path: Path to the Word document.
    :param delimiter: String inserted between the individual text runs
        (``w:t`` elements) found inside one paragraph.
    :returns: A list of strings, one per non-empty paragraph, in document
        order.
    :raises FileNotFoundError: If ``path`` is not an existing regular file.
    :raises PermissionError: If the file is not readable.
    :raises zipfile.BadZipFile: If the file is not a valid ZIP archive.
    :raises KeyError: If the archive has no ``word/document.xml`` member.

    NOTE(review): the original docstring described the result as a single
    string with ``delimiter`` separating paragraphs; the code has always
    returned a list and applied ``delimiter`` between text runs.  The
    existing behaviour is kept so callers are not broken -- confirm which
    contract is intended.
    """
    import os
    import zipfile
    from xml.etree.ElementTree import XML

    WORD_NAMESPACE = \
        "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"
    PARA = WORD_NAMESPACE + 'p'
    TEXT = WORD_NAMESPACE + 't'

    if not os.path.isfile(path):
        raise FileNotFoundError("No such file: {!r}".format(path))

    if not os.access(path, os.R_OK):
        raise PermissionError("File is not readable: {!r}".format(path))

    # zipfile.BadZipFile (invalid archive) and KeyError (missing member)
    # propagate to the caller unchanged.
    with zipfile.ZipFile(path) as document:
        tree = XML(document.read('word/document.xml'))

    paragraphs = []
    # Element.iter() replaces Element.getiterator(), which was removed in
    # Python 3.9.
    for paragraph in tree.iter(PARA):
        texts = [node.text
                 for node in paragraph.iter(TEXT)
                 if node.text]
        if texts:
            paragraphs.append(delimiter.join(texts))

    return paragraphs
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment