Last active
June 9, 2021 02:08
-
-
Save linzeqipku/3cec0b90e9e51445a2ffc5e15cdf4ae0 to your computer and use it in GitHub Desktop.
convert .docx files to .html files
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import mammoth | |
from zipfile import BadZipFile | |
import os | |
# Root directory that is scanned recursively for .docx input files.
path = 'E:/dc/data/docx'
# Root directory under which the generated .html files are written.
html_path = 'E:/dc/data/html'
def gci(filepath):
    """Recursively walk *filepath* and pass every non-directory entry to ``parse``.

    Args:
        filepath: Directory to scan. Sub-directories are descended into
            depth-first, in the order returned by ``os.listdir``.
    """
    for name in os.listdir(filepath):
        entry = os.path.join(filepath, name)
        if os.path.isdir(entry):
            gci(entry)
        else:
            # ``entry`` already contains ``filepath``; joining it with
            # ``filepath`` a second time duplicates the prefix whenever the
            # root is a relative path (e.g. 'a' + 'a/b.docx' -> 'a/a/b.docx').
            parse(entry)
def parse(filepath):
    """Convert one .docx file to HTML, mirroring its location under ``html_path``.

    Files whose name does not end in ``.docx``, or whose path contains
    ``~$`` (presumably Office lock/temp files -- confirm), are ignored.
    The output file is placed at the same position relative to
    ``html_path`` as the input has relative to ``path``, with the
    ``.docx`` suffix replaced by ``.html``. Missing output directories
    are created.

    Args:
        filepath: Path of the candidate file; expected to live below the
            module-level ``path`` root.

    A ``BadZipFile`` raised during conversion is reported on stdout and
    the file is skipped; other errors propagate to the caller.
    """
    if not filepath.endswith('.docx') or '~$' in filepath:
        return

    # The context manager closes the input even when conversion fails;
    # previously the handle leaked on BadZipFile.
    with open(filepath, "rb") as docx_file:
        print(filepath)
        try:
            result = mammoth.convert_to_html(docx_file)
        except BadZipFile:
            print(' BadZipFile...')
            return

    # Derive the output location from the path relative to the input root
    # instead of slicing by the length of abspath(path), which is only
    # correct when ``filepath`` happens to start with a prefix of exactly
    # that length (it breaks for a relative ``path``).
    relative = os.path.relpath(os.path.abspath(filepath), os.path.abspath(path))
    html_file_name = os.path.join(
        os.path.abspath(html_path),
        relative[:-len('.docx')] + '.html',
    )

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(os.path.dirname(html_file_name), exist_ok=True)
    with open(html_file_name, "w", encoding="utf8") as html_file:
        html_file.write(result.value)
if __name__ == "__main__":
    # Start the conversion only when executed as a script, so that importing
    # this module does not immediately begin walking the input directory.
    gci(path)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment