-
-
Save Wzixiao/53f6b3948dba4fcf657008058041cf97 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from glob import glob | |
import re | |
import os | |
import argparse | |
import win32com.client as win32 | |
from win32com.client import constants | |
from pathlib import Path | |
from tqdm import tqdm | |
import time | |
import shutil | |
def save_as_docx(file_location, output_path):
    """Place a ``.docx`` version of one Word document in *output_path*.

    The target is ``<output_path>/<source stem>.docx``.  Three cases:

    * the target already exists -> nothing is done (never overwritten);
    * the source is already a ``.docx`` file -> it is moved to the target;
    * otherwise the source is opened through the Word COM automation
      interface and saved in the XML document (``.docx``) format.

    Args:
        file_location: Path of the source document (relative or absolute).
        output_path: Directory that receives the ``.docx`` file.  It must
            already exist.

    Returns:
        None.
    """
    # The Word COM client can only open/save documents by absolute path.
    file_abs = os.path.abspath(file_location)
    output_abs = os.path.abspath(output_path)

    # Use os.path helpers instead of splitting on '\\' so the file name is
    # extracted correctly whatever separator the path uses.
    stem, extension = os.path.splitext(os.path.basename(file_abs))
    new_file_abs = os.path.join(output_abs, stem + '.docx')

    if os.path.exists(new_file_abs):
        return

    # Compare the real extension, not a substring of the whole path, so a
    # directory such as "reports.docx_old" cannot cause a false positive.
    if extension.lower() == '.docx':
        # Already in the target format; shutil.copy would also work here if
        # the source file should be kept in place.
        shutil.move(file_abs, new_file_abs)
        return

    # Client object used to drive the word processor; requires WPS or
    # Microsoft Office (Word) to be installed.
    word = win32.gencache.EnsureDispatch('Word.Application')
    doc = word.Documents.Open(file_abs)
    try:
        # Save through the document we opened rather than ActiveDocument,
        # which could be a different document if focus changed meanwhile.
        doc.SaveAs(new_file_abs, FileFormat=constants.wdFormatXMLDocument)
    finally:
        # Always release the document, even when saving fails; False means
        # "do not save changes" to the original file.
        doc.Close(False)
if __name__ == '__main__':
    # Command-line entry point: convert every Word document found directly
    # inside --doc_file_dir and store the results in --docx_file_save_dir.
    parser = argparse.ArgumentParser()
    parser.add_argument('--doc_file_dir', type=str, help='doc文件目录')
    parser.add_argument('--docx_file_save_dir', type=str, help='docx文件保存目录')
    args = parser.parse_args()

    doc_file_dir = args.doc_file_dir
    if not doc_file_dir:
        # Report through argparse so the user gets the usage text and a
        # non-zero exit code instead of a traceback.
        parser.error("doc_file_dir is need input")

    # The output directory defaults to the input directory.  abspath leaves
    # an absolute path unchanged, so no "is it absolute?" heuristic is needed.
    docx_file_save_dir = os.path.abspath(args.docx_file_save_dir or doc_file_dir)

    # Create the output directory, including any missing parent directories.
    Path(docx_file_save_dir).mkdir(parents=True, exist_ok=True)

    # Collect the .doc/.docx files directly inside the input directory
    # (the pattern has no "**", so the search is not recursive).  Names
    # starting with "~$" are skipped: Word uses that prefix for the
    # owner/lock files it creates next to documents that are open.
    doc_file_locations = [
        candidate
        for candidate in glob(os.path.join(doc_file_dir, "*.doc*"))
        if not os.path.basename(candidate).startswith("~$")
    ]

    for path in tqdm(doc_file_locations):
        save_as_docx(path, docx_file_save_dir)
        # Short pause between documents, kept from the original script;
        # presumably it gives the Word client time to settle - TODO confirm.
        time.sleep(0.1)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
以下是一个例子:
python .\convert_doc_into_docx.py --doc_file_dir='./doc' --docx_file_save_dir="./docx"
以上两个参数都可以使用绝对路径
docx_file_save_dir 在未输入的情况下等同于 doc_file_dir
电脑必须存在可以打开Word文档的软件,比如WPS或者Office(Microsoft Word)