linzeqipku/docx_to_html.py

## docx_to_html.py
import mammoth
from zipfile import BadZipFile
import os

# Root directory that gci() scans recursively for input files.
path = 'E:/dc/data/docx'
# Root directory under which parse() writes the generated .html files.
html_path = 'E:/dc/data/html'

def gci(filepath):
	"""Walk ``filepath`` recursively and pass every non-directory entry to ``parse``.

	Args:
		filepath: Directory to scan. Sub-directories are descended into as
			soon as they are encountered, in ``os.listdir`` order.
	"""
	for name in os.listdir(filepath):
		full_path = os.path.join(filepath, name)
		if os.path.isdir(full_path):
			gci(full_path)
		else:
			# full_path already contains filepath. Joining it with filepath a
			# second time duplicated the prefix whenever the root was relative
			# (e.g. "root/root/a.docx"), so it is passed through unchanged.
			parse(full_path)

def parse(filepath):
	"""Convert a single ``.docx`` file to HTML below ``html_path``.

	The output file keeps the input's location relative to ``path`` and swaps
	the extension for ``.html``. Paths that do not end in ``.docx`` or that
	contain ``~$`` (presumably Word's temporary lock files -- confirm) are
	ignored. A file that raises ``BadZipFile`` during conversion is reported
	on stdout and skipped.

	Args:
		filepath: Path of the candidate file, located under ``path``.
	"""
	if not filepath.endswith('.docx') or '~$' in filepath:
		return

	try:
		# The context manager closes the input even when the conversion
		# raises, which the previous explicit close() did not.
		with open(filepath, "rb") as docx_file:
			print(filepath)
			result = mammoth.convert_to_html(docx_file)
	except BadZipFile:
		print('    BadZipFile...')
		return

	# Derive the location relative to the source root instead of slicing by
	# the length of the absolute root, which only worked when filepath
	# happened to share that exact prefix length.
	relative_stem = os.path.splitext(os.path.relpath(filepath, path))[0]
	html_file_name = os.path.join(os.path.abspath(html_path), relative_stem + '.html')
	# exist_ok avoids the check-then-create race of exists() + makedirs().
	os.makedirs(os.path.dirname(html_file_name), exist_ok=True)
	with open(html_file_name, "w", encoding="utf8") as html_file:
		html_file.write(result.value)

# Script entry point: walk the configured source root and convert every file
# that gci() hands to parse().
gci(path)
	import mammoth
	from zipfile import BadZipFile
	import os

	# Root directory that gci() scans recursively for input files.
	path = 'E:/dc/data/docx'
	# Root directory under which parse() writes the generated .html files.
	html_path = 'E:/dc/data/html'

	def gci(filepath):
		"""Walk ``filepath`` recursively and pass every non-directory entry to ``parse``.

		Args:
			filepath: Directory to scan. Sub-directories are descended into as
				soon as they are encountered, in ``os.listdir`` order.
		"""
		for name in os.listdir(filepath):
			full_path = os.path.join(filepath, name)
			if os.path.isdir(full_path):
				gci(full_path)
			else:
				# full_path already contains filepath. Joining it with filepath
				# a second time duplicated the prefix whenever the root was
				# relative, so it is passed through unchanged.
				parse(full_path)

	def parse(filepath):
		"""Convert a single ``.docx`` file to HTML below ``html_path``.

		The output file keeps the input's location relative to ``path`` and
		swaps the extension for ``.html``. Paths that do not end in ``.docx``
		or that contain ``~$`` (presumably Word's temporary lock files --
		confirm) are ignored. A file that raises ``BadZipFile`` during
		conversion is reported on stdout and skipped.

		Args:
			filepath: Path of the candidate file, located under ``path``.
		"""
		if not filepath.endswith('.docx') or '~$' in filepath:
			return

		try:
			# The context manager closes the input even when the conversion
			# raises, which an explicit close() on the success path does not.
			with open(filepath, "rb") as docx_file:
				print(filepath)
				result = mammoth.convert_to_html(docx_file)
		except BadZipFile:
			print(' BadZipFile...')
			return

		# Derive the location relative to the source root instead of slicing
		# by the length of the absolute root, which only works when filepath
		# happens to share that exact prefix length.
		relative_stem = os.path.splitext(os.path.relpath(filepath, path))[0]
		html_file_name = os.path.join(os.path.abspath(html_path), relative_stem + '.html')
		# exist_ok avoids the check-then-create race of exists() + makedirs().
		os.makedirs(os.path.dirname(html_file_name), exist_ok=True)
		with open(html_file_name, "w", encoding="utf8") as html_file:
			html_file.write(result.value)

	# Script entry point: walk the configured source root and convert every
	# file that gci() hands to parse().
	gci(path)