Skip to content

Instantly share code, notes, and snippets.

@akkuman
Last active December 23, 2022 09:58
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save akkuman/1954eca68b923921a70e170286b52a13 to your computer and use it in GitHub Desktop.
Save akkuman/1954eca68b923921a70e170286b52a13 to your computer and use it in GitHub Desktop.
[python docxtpl html转richtext] #python #docxtpl

你可能遇到过这种情况:docxtpl 模板的占位符中填入了带 HTML 标签的内容,生成的文档用 WPS 能打开,但用 Word 打开时会报错。

这种情况下需要先把 html 转为 RichText 元素,然后再插入

但是现在按照这个思路去找,可能完全找不到相关的解决方案

转变了一下思路,用关键词 markdown to richtext 搜索了一下,找到了一个相关的 issue jhpyle/docassemble#72

虽说里面没有提及解决方案,但是他们提到是通过 markdown -> html -> richtext 来解决的,并且该项目中已经实现了相关功能

找了下相关文档和资料,发现了 https://github.com/jhpyle/docassemble/blob/c16c786c65186399584c9a30d0e3d7dcb3acb056/docassemble_base/docassemble/base/file_docx.py#LL645C21-L645C21

我把相关代码抠了出来并做了一个独立实现,见下面的 html_richtext.py

调用示例见 main.py

import re
import string
from jinja2.runtime import Undefined
import docx.opc.constants
from bs4 import NavigableString, Tag
from docxtpl import RichText
# ref: https://github.com/jhpyle/docassemble/issues/72
# ref: https://github.com/jhpyle/docassemble/blob/c16c786c65186399584c9a30d0e3d7dcb3acb056/docassemble_base/docassemble/base/file_docx.py#LL645C21-L645C21
# Recognised values of the HTML <ol type="..."> attribute. An <ol> without a
# recognised type takes the entry after the enclosing list's type, wrapping
# around at the end of this list.
list_types = ['1', 'A', 'a', 'I', 'i']
def ensure_definition(*pargs, **kwargs):
    """Call ``str()`` on every argument that is a Jinja2 ``Undefined``.

    Positional arguments are checked first, then keyword argument values.
    The string result is discarded; presumably the call exists so that
    strict ``Undefined`` subclasses raise their "undefined variable" error
    here -- confirm against the Jinja2 configuration in use.
    """
    for candidate in (*pargs, *kwargs.values()):
        if isinstance(candidate, Undefined):
            str(candidate)
def roman(num, case=None):
    """Convert a zero-based index into a Roman numeral string.

    The index is shifted by one before conversion, so ``roman(0)`` yields
    ``'I'`` and the largest accepted index, 3998, yields ``'MMMCMXCIX'``.

    Args:
        num: Zero-based integer index in the range 0..3998.
        case: ``'lower'`` for a lowercase numeral; ``None`` or any other
            value produces uppercase.

    Returns:
        The Roman numeral for ``num + 1``.

    Raises:
        TypeError: If ``num + 1`` is not an ``int``.
        ValueError: If ``num + 1`` is outside 1..3999.
    """
    ensure_definition(num, case)
    want_lower = case == 'lower'
    value = num + 1
    if not isinstance(value, int):
        raise TypeError("expected integer, got %s" % type(value))
    if value <= 0 or value >= 4000:
        raise ValueError("Argument must be between 1 and 3999")
    # Magnitudes in descending order, including the subtractive pairs.
    table = (
        (1000, 'M'), (900, 'CM'), (500, 'D'), (400, 'CD'),
        (100, 'C'), (90, 'XC'), (50, 'L'), (40, 'XL'),
        (10, 'X'), (9, 'IX'), (5, 'V'), (4, 'IV'), (1, 'I'),
    )
    pieces = []
    for magnitude, symbol in table:
        repeat, value = divmod(value, magnitude)
        pieces.append(symbol * repeat)
    numeral = "".join(pieces)
    if want_lower:
        return numeral.lower()
    return numeral
def Alpha(number):
    """Return the uppercase letter label for a list position.

    Position 1 is ``'A'`` and 26 is ``'Z'``; later positions wrap around
    and repeat the letter once more per full pass (27 -> ``'AA'``,
    53 -> ``'AAA'``).
    """
    offset = number - 1
    letter = string.ascii_uppercase[offset % 26]
    repeats = int(offset / 26) + 1
    return letter * repeats
def alpha(number):
    """Return the lowercase letter label for a list position.

    Position 1 is ``'a'`` and 26 is ``'z'``; later positions wrap around
    and repeat the letter once more per full pass (27 -> ``'aa'``,
    53 -> ``'aaa'``).
    """
    offset = number - 1
    letter = string.ascii_lowercase[offset % 26]
    repeats = int(offset / 26) + 1
    return letter * repeats
def Roman_Numeral(number):
    """Return the uppercase Roman numeral label for a 1-based list position.

    Positions beyond 3999 wrap around to ``'I'`` again.
    """
    # roman() accepts zero-based indices 0..3998 only (it converts index + 1
    # and rejects 4000), so the wrap-around period must be 3999; with 4000 a
    # position such as 4000 produced index 3999 and raised ValueError.
    return roman((number - 1) % 3999, case='upper')
def roman_numeral(number):
    """Return the lowercase Roman numeral label for a 1-based list position.

    Positions beyond 3999 wrap around to ``'i'`` again.
    """
    # roman() accepts zero-based indices 0..3998 only (it converts index + 1
    # and rejects 4000), so the wrap-around period must be 3999; with 4000 a
    # position such as 4000 produced index 3999 and raised ValueError.
    return roman((number - 1) % 3999, case='lower')
class InlineSoupParser:
    """Walk a BeautifulSoup tree and build docxtpl ``RichText`` runs from it.

    The parser keeps the current character formatting (bold, italic, ...)
    and list state as instance attributes while ``traverse`` recurses
    through the tree. ``str(parser)`` concatenates everything collected.

    Args:
        tpl: The template object. Hyperlinks need
            ``tpl.docx._part.relate_to``; an optional
            ``tpl.da_hyperlink_style`` attribute names the character style
            applied to link text.
    """

    # HTML inline tag -> name of the boolean formatting attribute it enables.
    _INLINE_FLAGS = {
        'strong': 'bold',
        'em': 'italic',
        'strike': 'strike',
        'u': 'underline',
    }

    def __init__(self, tpl):
        # Collected output: RichText objects interleaved with raw
        # <w:hyperlink> open/close strings.
        self.runs = [RichText('')]
        # The RichText currently being appended to.
        self.run = self.runs[-1]
        self.bold = False
        self.italic = False
        self.underline = False
        # Current list nesting depth; one tab is emitted per level.
        self.indentation = 0
        # 'p', 'ul', or 'ol' + a list type ('ol1', 'olA', ...).
        self.style = 'p'
        self.strike = False
        self.size = None
        self.charstyle = None
        self.color = None
        self.tpl = tpl
        # True until the first paragraph, which gets no leading newline.
        self.at_start = True
        self.list_number = 1
        # Start on the last type so the first untyped <ol> cycles to '1'.
        self.list_type = list_types[-1]

    def _add_text(self, text):
        """Append ``text`` to the current run with the active formatting."""
        self.run.add(
            text,
            italic=self.italic,
            bold=self.bold,
            underline=self.underline,
            strike=self.strike,
            size=self.size,
            style=self.charstyle,
            color=self.color,
        )

    def new_paragraph(self):
        """Start a new line and emit indentation plus any list marker."""
        if self.at_start:
            self.at_start = False
        else:
            self._add_text("\n")
        if self.indentation:
            self.run.add("\t" * self.indentation)
        if self.style == 'ul':
            self.run.add("•\t")
            return
        numbering = {
            'ol1': str,
            'olA': Alpha,
            'ola': alpha,
            'olI': Roman_Numeral,
            'oli': roman_numeral,
        }
        make_label = numbering.get(self.style)
        if make_label is not None:
            self.run.add(make_label(self.list_number) + ".\t")
            self.list_number += 1

    def __str__(self):
        """Return the concatenated string form of every collected run."""
        return ''.join(str(run) for run in self.runs)

    def start_link(self, url):
        """Register ``url`` as an external relationship and open a hyperlink."""
        ref = self.tpl.docx._part.relate_to(url, docx.opc.constants.RELATIONSHIP_TYPE.HYPERLINK, is_external=True)
        self.runs.append('<w:hyperlink r:id="%s">' % (ref, ))
        self.new_run()

    def end_link(self):
        """Close the hyperlink element and continue in a fresh run."""
        self.runs.append('</w:hyperlink>')
        self.new_run()

    def new_run(self):
        """Append an empty ``RichText`` and make it the current run."""
        self.runs.append(RichText(''))
        self.run = self.runs[-1]

    def traverse(self, elem):
        """Recursively convert the children of ``elem`` into runs.

        Text nodes are added with the active formatting; tags adjust the
        formatting or list state for their subtree, and that state is
        restored afterwards.
        """
        for part in elem.contents:
            if isinstance(part, NavigableString):
                self._add_text(str(part))
            elif isinstance(part, Tag):
                self._handle_tag(part)

    def _handle_tag(self, part):
        """Apply the effect of a single tag and traverse its children."""
        name = part.name
        if name in ('p', 'blockquote', 'li'):
            self.new_paragraph()
            self.traverse(part)
        elif name == 'ul':
            oldstyle = self.style
            self.style = 'ul'
            self.indentation += 1
            self.traverse(part)
            self.indentation -= 1
            self.style = oldstyle
        elif name == 'ol':
            saved = (self.style, self.list_number, self.list_type)
            requested = part.get('type', None)
            if requested in list_types:
                self.list_type = requested
            else:
                # No usable type given: take the type after the enclosing list's.
                next_index = (list_types.index(self.list_type) + 1) % len(list_types)
                self.list_type = list_types[next_index]
            try:
                self.list_number = int(part.get('start', 1))
            except (TypeError, ValueError):
                # Non-numeric "start" attribute: fall back to 1.
                self.list_number = 1
            self.style = 'ol' + self.list_type
            self.indentation += 1
            self.traverse(part)
            self.indentation -= 1
            self.style, self.list_number, self.list_type = saved
        elif name in self._INLINE_FLAGS:
            flag = self._INLINE_FLAGS[name]
            # Restore the previous value rather than forcing False, so an
            # enclosing element with the same formatting stays in effect.
            previous = getattr(self, flag)
            setattr(self, flag, True)
            self.traverse(part)
            setattr(self, flag, previous)
        elif re.match(r'h[1-6]', name):
            saved = (self.size, self.bold)
            # h1 -> 60, h2 -> 50, ... h6 -> 10.
            self.size = 60 - ((int(name[1]) - 1) * 10)
            self.bold = True
            self.traverse(part)
            self.size, self.bold = saved
        elif name == 'a':
            href = part.get('href')
            if href is None:
                # Anchor without a target: keep its text, skip the hyperlink.
                self.traverse(part)
                return
            # Plain docxtpl templates do not define da_hyperlink_style.
            link_style = getattr(self.tpl, 'da_hyperlink_style', None)
            saved = (self.charstyle, self.underline, self.color)
            self.start_link(href)
            if link_style:
                self.charstyle = link_style
            else:
                self.underline = True
                self.color = '#0000ff'
            self.traverse(part)
            self.charstyle, self.underline, self.color = saved
            self.end_link()
        elif name == 'br':
            self._add_text("\n")
        else:
            # Unsupported tag: its contents are not traversed.
            print("Encountered a " + part.__class__.__name__)
def html_to_richtext(tpl, source_code):
    """Convert an HTML fragment into a string for a docxtpl ``{{r ...}}`` tag.

    Args:
        tpl: The template object handed to ``InlineSoupParser``.
        source_code: HTML markup as a string.

    Returns:
        ``str()`` of the parser after it has traversed the whole fragment.
    """
    # ref: https://github.com/jhpyle/docassemble/blob/c16c786c65186399584c9a30d0e3d7dcb3acb056/docassemble_base/docassemble/base/file_docx.py#LL645C21-L645C21
    # Imported locally: the module-level bs4 import only brings in
    # NavigableString and Tag, so BeautifulSoup was otherwise undefined.
    from bs4 import BeautifulSoup

    # Newlines become spaces and whitespace between adjacent tags is removed
    # before parsing.
    source_code = re.sub(r"\n", ' ', source_code)
    source_code = re.sub(r">\s+<", '><', source_code)
    soup = BeautifulSoup('<html>' + source_code + '</html>', 'html.parser')
    parser = InlineSoupParser(tpl)
    for elem in soup.find_all(recursive=False):
        parser.traverse(elem)
    return str(parser)
from docxtpl import DocxTemplate

# NOTE: when this example is saved as a separate main.py, html_to_richtext
# must first be imported from html_richtext.py.

if __name__ == '__main__':
    tpl = DocxTemplate('tpl.docx')
    # Convert the HTML into the string that the template placeholder expects.
    content = html_to_richtext(tpl, '在 HTTP 事务的上下文中,基本访问身份验证是 HTTP 用户代理在发出请求时提供用户名和密码的方法。<br/><br/>通过 HTTP 连接使用基本身份验证保护此目录。使用基本身份验证时,用户凭据将以纯文本形式发送,并且由于未使用 HTTPS,它们很容易受到数据包嗅探的影响。')
    context = {
        # Pass the converted value, not the literal string 'content'.
        'content': content,
    }
    tpl.render(context)
    tpl.save('output.docx')
# Note: the placeholder in tpl.docx looks like `{{r content }}`
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment