# ChronoMonochrome/docx2csv.py

## docx2csv.py
import os
from docx import Document

def print_tables(path):
	"""Yield one semicolon-separated text chunk per table of a .docx file.

	Each yielded string holds the table's rows, one per line. Every kept
	cell is written as its text followed by ``;``.

	Rules applied while walking a table:

	* every table after the first skips its first two rows
	  (presumably repeated header rows -- confirm against the input files);
	* empty cells and cells sharing the previous cell's ``_tc`` element
	  (merged cells) are not written;
	* a cell whose text starts with a digit and contains exactly one ``.``
	  is treated as a float and its ``.`` is replaced by ``,``;
	* once such a float cell has been seen, the first completely empty row
	  ends the table; earlier empty rows are emitted as blank lines.

	:param path: path of the .docx document, passed to ``Document``.
	:return: generator of unicode strings, one per table.
	"""
	doc = Document(path)
	for ntable, table in enumerate(doc.tables):
		start_row = 2 if ntable > 0 else 0
		lines = []
		data_part = False
		for row in table.rows[start_row:]:
			fields = []
			last_tc = None
			row_empty = True
			for cell in row.cells:
				# read the text once per cell instead of on every use
				text = cell.text
				if text:
					row_empty = False
					# ignore merged cells (same underlying element as the previous one)
					if cell._tc != last_tc:
						# heuristic: leading digit and a single dot means a float value
						is_float = text[0].isdigit() and text.count(".") == 1
						if is_float:
							data_part = True
							# to make Excel happy
							text = text.replace(".", ",")
						fields.append(text + u";")
				last_tc = cell._tc
			# an empty row after numeric data marks the end of the table
			if data_part and row_empty:
				break
			lines.append(u"".join(fields) + u"\n")
		yield u"".join(lines)

# Convert every .docx file in the current directory to "<name>.docx.csv".
for i in os.listdir("."):
	if i.endswith(".docx"):
		print(i)
		tbl = list(print_tables(i))
		# Encode before opening the output so a failed encode leaves no empty file.
		data = u"".join(tbl).encode("cp1251")
		# "with" guarantees the file is flushed and closed.
		with open("%s.csv" % i, "wb") as out:
			out.write(data)
	import os
	from docx import Document

	def print_tables(path):
		"""Yield one semicolon-separated text chunk per table of a .docx file.

		Each yielded string holds the table's rows, one per line. Every kept
		cell is written as its text followed by ``;``.

		Rules applied while walking a table:

		* every table after the first skips its first two rows
		  (presumably repeated header rows -- confirm against the input files);
		* empty cells and cells sharing the previous cell's ``_tc`` element
		  (merged cells) are not written;
		* a cell whose text starts with a digit and contains exactly one ``.``
		  is treated as a float and its ``.`` is replaced by ``,``;
		* once such a float cell has been seen, the first completely empty row
		  ends the table; earlier empty rows are emitted as blank lines.

		:param path: path of the .docx document, passed to ``Document``.
		:return: generator of unicode strings, one per table.
		"""
		doc = Document(path)
		for ntable, table in enumerate(doc.tables):
			start_row = 2 if ntable > 0 else 0
			lines = []
			data_part = False
			for row in table.rows[start_row:]:
				fields = []
				last_tc = None
				row_empty = True
				for cell in row.cells:
					# read the text once per cell instead of on every use
					text = cell.text
					if text:
						row_empty = False
						# ignore merged cells (same underlying element as the previous one)
						if cell._tc != last_tc:
							# heuristic: leading digit and a single dot means a float value
							is_float = text[0].isdigit() and text.count(".") == 1
							if is_float:
								data_part = True
								# to make Excel happy
								text = text.replace(".", ",")
							fields.append(text + u";")
					last_tc = cell._tc
				# an empty row after numeric data marks the end of the table
				if data_part and row_empty:
					break
				lines.append(u"".join(fields) + u"\n")
			yield u"".join(lines)

	# Convert every .docx file in the current directory to "<name>.docx.csv".
	for i in os.listdir("."):
		if i.endswith(".docx"):
			print(i)
			tbl = list(print_tables(i))
			# Encode before opening the output so a failed encode leaves no empty file.
			data = u"".join(tbl).encode("cp1251")
			# "with" guarantees the file is flushed and closed.
			with open("%s.csv" % i, "wb") as out:
				out.write(data)