Created
April 27, 2019 08:16
-
-
Save felixmon/50bd92d5c5118074cfbd3ac13546c267 to your computer and use it in GitHub Desktop.
Read and count words and Chinese words in Docx
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Read an existing Word (.docx) document and count its characters.
# Reference 1: https://www.jianshu.com/p/94ac13f6633e
# Reference 2: https://python-docx.readthedocs.io/en/latest/api/text.html#paragraph-objects
# Reference 3: https://www.jianshu.com/p/867fe84d8440
import sys | |
import re | |
from docx import Document | |
def main(path='/Users/username/desktop/demo1.docx'):
    """Print the character count and Chinese-character count of a .docx file.

    The text of every paragraph exposed by ``document.paragraphs`` is
    measured. The total is the sum of ``len()`` over each paragraph's text,
    so it counts every character (letters, digits, punctuation, spaces),
    not whitespace-separated words.

    Args:
        path: Location of the .docx file to read. Defaults to the demo
            document path the script was originally written against.

    Returns:
        None. The result is written to standard output.
    """
    # Open the document and collect the text of each paragraph.
    document = Document(path)
    paragraph_texts = [paragraph.text for paragraph in document.paragraphs]

    # Matches one Chinese character in the range U+4E00-U+9FA5.
    # Characters outside this range are not counted as Chinese.
    hanzi_regex = re.compile(r'[\u4E00-\u9FA5]')

    word_count = sum(len(text) for text in paragraph_texts)
    hanzi_count = sum(len(hanzi_regex.findall(text)) for text in paragraph_texts)

    print("总共有" + str(word_count) + "字。其中汉字" + str(hanzi_count) + "个。")
# Run the counter only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment