# felixmon/read-docx.py

## read-docx.py

# 读取已有的word文档
# 参考1：https://www.jianshu.com/p/94ac13f6633e
# 参考2: https://python-docx.readthedocs.io/en/latest/api/text.html#paragraph-objects
# 参考3: https://www.jianshu.com/p/867fe84d8440
import sys
import re

from docx import Document

def main(path='/Users/username/desktop/demo1.docx'):
    """Print the character count and Chinese-character count of a Word document.

    The text is taken from ``document.paragraphs``; the length of every
    paragraph's text is summed to get the total, and characters matching
    the range U+4E00-U+9FA5 are summed to get the Chinese-character count.

    Args:
        path: Location of the .docx file to open. Defaults to the demo
            document this script was originally written against.
    """
    # Open the document and collect the text of each paragraph.
    document = Document(path)
    texts = [paragraph.text for paragraph in document.paragraphs]

    # Matches one character in the range U+4E00-U+9FA5.
    hanzi_regex = re.compile(r'[\u4E00-\u9FA5]')

    # The total counts every character of every paragraph (len of the text),
    # not whitespace-separated words.
    word_count = sum(len(text) for text in texts)
    hanzi_count = sum(len(hanzi_regex.findall(text)) for text in texts)

    print("总共有{}字。其中汉字{}个。".format(word_count, hanzi_count))

# Only run the count when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

	# 读取已有的word文档
	# 参考1：https://www.jianshu.com/p/94ac13f6633e
	# 参考2: https://python-docx.readthedocs.io/en/latest/api/text.html#paragraph-objects
	# 参考3: https://www.jianshu.com/p/867fe84d8440
	import sys
	import re

	from docx import Document

	def main(path='/Users/username/desktop/demo1.docx'):
		"""Print the character count and Chinese-character count of a Word document.

		The text is taken from ``document.paragraphs``; the length of every
		paragraph's text is summed to get the total, and characters matching
		the range U+4E00-U+9FA5 are summed to get the Chinese-character count.

		Args:
			path: Location of the .docx file to open. Defaults to the demo
				document this script was originally written against.
		"""
		# Open the document and collect the text of each paragraph.
		document = Document(path)
		texts = [paragraph.text for paragraph in document.paragraphs]

		# Matches one character in the range U+4E00-U+9FA5.
		hanzi_regex = re.compile(r'[\u4E00-\u9FA5]')

		# The total counts every character of every paragraph (len of the text),
		# not whitespace-separated words.
		word_count = sum(len(text) for text in texts)
		hanzi_count = sum(len(hanzi_regex.findall(text)) for text in texts)

		print("总共有{}字。其中汉字{}个。".format(word_count, hanzi_count))

	# Only run the count when this file is executed as a script, not on import.
	if __name__ == '__main__':
		main()