Skip to content

Instantly share code, notes, and snippets.

@linzeqipku
Last active June 9, 2021 02:08
Show Gist options
  • Save linzeqipku/3cec0b90e9e51445a2ffc5e15cdf4ae0 to your computer and use it in GitHub Desktop.
Save linzeqipku/3cec0b90e9e51445a2ffc5e15cdf4ae0 to your computer and use it in GitHub Desktop.
Convert .docx files to .html files.
import mammoth
from zipfile import BadZipFile
import os
# Root directory that is scanned recursively for .docx input files.
path = 'E:/dc/data/docx'
# Root directory under which the generated .html files are written.
html_path = 'E:/dc/data/html'
def gci(filepath):
    """Recursively walk *filepath* and pass every non-directory entry to ``parse``.

    Args:
        filepath: Directory to scan. May be absolute or relative.

    Returns:
        None. All work happens through the calls to ``parse``.
    """
    for fi in os.listdir(filepath):
        fi_d = os.path.join(filepath, fi)
        if os.path.isdir(fi_d):
            gci(fi_d)
        else:
            # fi_d is already the full path of the entry. Joining it onto
            # filepath a second time duplicates the directory prefix whenever
            # filepath is relative (e.g. "data/data/x.docx").
            parse(fi_d)
def parse(filepath):
    """Convert one .docx file to HTML, mirroring its location under ``html_path``.

    The output file keeps the input's position relative to the module-level
    ``path`` root, with the extension replaced by ``.html``. Files that do not
    end in ``.docx`` or whose path contains ``~$`` are ignored.

    Args:
        filepath: Path of the candidate file, expected to live under ``path``.

    Returns:
        None. On success an HTML file is written; if the input is not a valid
        zip archive a short notice is printed and nothing is written.
    """
    # Skip anything that is not a .docx, and paths containing "~$"
    # (presumably Word's temporary owner/lock files -- confirm if relevant).
    if not filepath.endswith('.docx') or '~$' in filepath:
        return

    # The context manager closes the input even when conversion fails.
    with open(filepath, "rb") as docx_file:
        print(filepath)
        try:
            result = mammoth.convert_to_html(docx_file)
        except BadZipFile:
            print(' BadZipFile...')
            return

    # Derive the location relative to the source root instead of slicing by
    # prefix length, so relative and absolute inputs map to the same output.
    relative = os.path.relpath(os.path.abspath(filepath), os.path.abspath(path))
    html_file_name = os.path.join(
        os.path.abspath(html_path),
        os.path.splitext(relative)[0] + '.html',
    )

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(os.path.dirname(html_file_name), exist_ok=True)
    with open(html_file_name, "w", encoding="utf8") as html_file:
        html_file.write(result.value)
# Script entry point: walk the configured source root and convert what it finds.
gci(path)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment