Skip to content

Instantly share code, notes, and snippets.

@ChronoMonochrome
Last active May 23, 2018 09:40
Show Gist options
  • Save ChronoMonochrome/1f5a9bf09e97cbca5a0df677e9ee6865 to your computer and use it in GitHub Desktop.
Save ChronoMonochrome/1f5a9bf09e97cbca5a0df677e9ee6865 to your computer and use it in GitHub Desktop.
Convert DOCX tables to CSV format
import os
from docx import Document
def print_tables(path):
    """Yield the tables of a DOCX file as semicolon-separated text.

    One unicode string is yielded per table in the document, in document
    order. Inside a string every emitted cell is followed by ``;`` and
    every processed row is terminated by a newline.

    Rules applied while walking a table:

    * For every table except the first one, the first two rows are skipped.
    * Empty cells, and cells whose underlying ``_tc`` element equals the
      previous cell's (merged cells), are not emitted.
    * A cell whose text starts with a digit and contains exactly one ``.``
      is treated as a float; its ``.`` is replaced with ``,`` so that Excel
      reads it as a number.
    * Once at least one float-looking cell has been seen, the first
      completely empty row ends the table.

    NOTE(review): the nesting below was reconstructed from a copy of this
    function whose indentation had been lost. In particular it assumes the
    ``;`` separator is appended only for emitted cells -- confirm.

    :param path: path of the ``.docx`` file, passed to ``docx.Document``.
    :return: generator of unicode strings, one per table.
    """
    doc = Document(path)
    for ntable, table in enumerate(doc.tables):
        # Only the first table keeps its two leading rows.
        # NOTE(review): presumably these are repeated header rows -- confirm.
        start_row = 2 if ntable > 0 else 0

        parts = []
        data_part = False  # becomes True once a float-looking cell was seen
        for row in table.rows[start_row:]:
            last_tc = None
            row_empty = True
            for cell in row.cells:
                # Read the text once; the property is recomputed per access.
                text = cell.text
                if text:
                    row_empty = False
                    # A merged cell is reported once per grid position with
                    # the same _tc element; emit it only the first time.
                    if cell._tc != last_tc:
                        # Heuristic float detection: leading digit and
                        # exactly one dot.
                        if text[0].isdigit() and text.count(".") == 1:
                            data_part = True
                            # Decimal comma to make Excel happy.
                            text = text.replace(".", ",")
                        parts.append(text)
                        parts.append(u";")
                last_tc = cell._tc

            # An empty row after the numeric data marks the end of the table;
            # stop before emitting a line for it.
            if data_part and row_empty:
                break
            parts.append(u"\n")

        yield u"".join(parts)
# Convert every .docx file in the current directory to "<name>.docx.csv".
for docx_name in os.listdir("."):
    if not docx_name.endswith(".docx"):
        continue
    print(docx_name)
    # Encode before opening the output so that a failed cp1251 conversion
    # does not leave an empty, truncated CSV file behind.
    csv_bytes = u"".join(print_tables(docx_name)).encode("cp1251")
    # The context manager guarantees the file is flushed and closed.
    with open("%s.csv" % docx_name, "wb") as csv_file:
        csv_file.write(csv_bytes)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment