akkuman/README.md

## README.md

      
    Raw
  

              README.md
            
          
    你可能遇到过这种情况：docxtpl 模板占位符填入的内容带有 HTML 元素，生成的文档 WPS 能打开，但 Word 打开时报错。
这种情况下需要先把 html 转为 RichText 元素，然后再插入
但是现在按照这个思路去找，可能完全找不到相关的解决方案
转变了一下思路，用关键词 markdown to richtext 搜索了一下，找到了一个相关的 issue jhpyle/docassemble#72
虽说里面没有提及解决方案，但是他们提到是通过 markdown -> html -> richtext 来解决的，并且该项目中已经实现了相关功能
找了下相关文档和资料，发现了 https://github.com/jhpyle/docassemble/blob/c16c786c65186399584c9a30d0e3d7dcb3acb056/docassemble_base/docassemble/base/file_docx.py#LL645C21-L645C21
把相关代码抠出来重新实现了一下，见下面的 html_richtext.py
调用示例见 main.py

  
## html_richtext.py
import re
import string
from jinja2.runtime import Undefined
import docx.opc.constants
from bs4 import NavigableString, Tag
from docxtpl import RichText

# ref: https://github.com/jhpyle/docassemble/issues/72
# ref: https://github.com/jhpyle/docassemble/blob/c16c786c65186399584c9a30d0e3d7dcb3acb056/docassemble_base/docassemble/base/file_docx.py#LL645C21-L645C21
# Marker styles accepted for <ol type="...">. An <ol> without a recognised
# type takes the entry that follows the enclosing list's style, wrapping
# around at the end of this list.
list_types = ['1', 'A', 'a', 'I', 'i']

def ensure_definition(*pargs, **kwargs):
    """Stringify every Jinja2 ``Undefined`` found among the arguments.

    Positional values are visited first, then keyword values. Each value that
    is an ``Undefined`` instance is passed to ``str()`` and the result is
    discarded. Presumably this exists so that strict undefined types raise
    here rather than later -- TODO confirm against the Jinja2 environment used.
    """
    for candidate in (*pargs, *kwargs.values()):
        if isinstance(candidate, Undefined):
            str(candidate)

def roman(num, case=None):
    """Return the roman numeral for the zero-based index ``num``.

    The index is shifted by one before conversion, so ``0`` gives ``I`` and
    ``3998`` gives ``MMMCMXCIX``.

    Args:
        num: zero-based integer index; ``num + 1`` must lie in 1..3999.
        case: ``'lower'`` for lowercase output; any other value, including
            the default ``None``, gives uppercase.

    Returns:
        The numeral as a string.

    Raises:
        TypeError: if ``num + 1`` is not an ``int``.
        ValueError: if ``num + 1`` is outside 1..3999.
    """
    ensure_definition(num, case)
    if case is None:
        case = 'upper'
    num = num + 1
    if not isinstance(num, int):
        raise TypeError("expected integer, got %s" % type(num))
    if not 0 < num < 4000:
        raise ValueError("Argument must be between 1 and 3999")
    table = (
        (1000, 'M'), (900, 'CM'), (500, 'D'), (400, 'CD'),
        (100, 'C'), (90, 'XC'), (50, 'L'), (40, 'XL'),
        (10, 'X'), (9, 'IX'), (5, 'V'), (4, 'IV'), (1, 'I'),
    )
    pieces = []
    remaining = num
    # Greedy conversion: take as many of each value as fit, largest first.
    for value, symbol in table:
        count, remaining = divmod(remaining, value)
        pieces.append(symbol * count)
    numeral = "".join(pieces)
    return numeral.lower() if case == 'lower' else numeral

def Alpha(number):
    """Return the uppercase letter marker for the one-based ``number``.

    1..26 map to ``A``..``Z``; after that the letter is repeated, so 27 is
    ``AA`` and 53 is ``AAA``.
    """
    offset = number - 1
    # int() truncates toward zero, matching the historical behaviour for
    # offsets below zero as well.
    repeats = int(offset / 26) + 1
    letter = string.ascii_uppercase[offset % 26]
    return letter * repeats


def alpha(number):
    """Return the lowercase letter marker for the one-based ``number``.

    1..26 map to ``a``..``z``; after that the letter is repeated, so 27 is
    ``aa`` and 53 is ``aaa``.
    """
    offset = number - 1
    # int() truncates toward zero, matching the historical behaviour for
    # offsets below zero as well.
    repeats = int(offset / 26) + 1
    letter = string.ascii_lowercase[offset % 26]
    return letter * repeats


def Roman_Numeral(number):
    """Return the uppercase roman numeral marker for the one-based ``number``.

    The value is reduced modulo 4000 before being handed to ``roman``. A
    ``number`` that is a multiple of 4000 reduces to index 3999, which
    ``roman`` rejects with ``ValueError``.
    """
    zero_based = (number - 1) % 4000
    return roman(zero_based, case='upper')


def roman_numeral(number):
    """Return the lowercase roman numeral marker for the one-based ``number``.

    The value is reduced modulo 4000 before being handed to ``roman``. A
    ``number`` that is a multiple of 4000 reduces to index 3999, which
    ``roman`` rejects with ``ValueError``.
    """
    zero_based = (number - 1) % 4000
    return roman(zero_based, case='lower')

class InlineSoupParser:
    """Walk a BeautifulSoup tree and build docxtpl ``RichText`` output.

    The parser keeps the current formatting state (bold, italic, size, ...)
    as attributes and appends text to the current ``RichText`` run with that
    state. ``runs`` holds ``RichText`` objects interleaved with raw
    ``<w:hyperlink>`` XML strings; ``str(parser)`` concatenates them.

    Handled tags: p, blockquote, li, ul, ol, strong, em, strike, u, h1-h6,
    a and br. Any other tag is skipped together with its contents.
    """

    def __init__(self, tpl):
        """Start with a single empty run and unstyled formatting state.

        Args:
            tpl: the template object; used by ``start_link`` to register
                external hyperlink relationships.
        """
        self.runs = [RichText('')]
        self.run = self.runs[-1]
        self.bold = False
        self.italic = False
        self.underline = False
        self.indentation = 0
        self.style = 'p'
        self.strike = False
        self.size = None
        self.charstyle = None
        self.color = None
        self.tpl = tpl
        self.at_start = True
        self.list_number = 1
        # Start on the last style so the first untyped <ol> wraps to the first.
        self.list_type = list_types[-1]

    def _add_styled(self, text):
        """Append ``text`` to the current run using the current formatting."""
        self.run.add(
            text,
            italic=self.italic,
            bold=self.bold,
            underline=self.underline,
            strike=self.strike,
            size=self.size,
            style=self.charstyle,
            color=self.color,
        )

    def new_paragraph(self):
        """Begin a paragraph or list item in the current run.

        Emits a line break (except for the very first paragraph), then one
        tab per indentation level, then the bullet or ordered-list marker
        that matches ``self.style``. Ordered markers advance
        ``self.list_number``.
        """
        if self.at_start:
            self.at_start = False
        else:
            self._add_styled("\n")
        if self.indentation:
            self.run.add("\t" * self.indentation)
        if self.style == 'ul':
            self.run.add("•\t")
            return
        if self.style == 'ol1':
            marker = str(self.list_number)
        elif self.style == 'olA':
            marker = Alpha(self.list_number)
        elif self.style == 'ola':
            marker = alpha(self.list_number)
        elif self.style == 'olI':
            marker = Roman_Numeral(self.list_number)
        elif self.style == 'oli':
            marker = roman_numeral(self.list_number)
        else:
            # Plain paragraph: no marker and the counter is left untouched.
            return
        self.run.add(marker + ".\t")
        self.list_number += 1

    def __str__(self):
        """Return the concatenated string form of every collected run."""
        return ''.join(str(run) for run in self.runs)

    def start_link(self, url):
        """Open a ``<w:hyperlink>`` wrapper that points at ``url``.

        The relationship id comes from the template's ``build_url_id`` when
        that method exists; otherwise the relationship is registered directly
        on the template's underlying document part.
        """
        build_url_id = getattr(self.tpl, 'build_url_id', None)
        if callable(build_url_id):
            ref = build_url_id(url)
        else:
            ref = self.tpl.docx._part.relate_to(url, docx.opc.constants.RELATIONSHIP_TYPE.HYPERLINK, is_external=True)
        self.runs.append('<w:hyperlink r:id="%s">' % (ref, ))
        self.new_run()

    def end_link(self):
        """Close the hyperlink wrapper and continue in a fresh run."""
        self.runs.append('</w:hyperlink>')
        self.new_run()

    def new_run(self):
        """Append an empty ``RichText`` and make it the current run."""
        self.runs.append(RichText(''))
        self.run = self.runs[-1]

    def _traverse_with(self, part, **overrides):
        """Traverse ``part`` with formatting attributes temporarily overridden.

        The previous values are restored afterwards, so nested formatting
        (for example ``<strong>`` inside a heading) does not switch off the
        formatting of the enclosing element.
        """
        saved = {name: getattr(self, name) for name in overrides}
        for name, value in overrides.items():
            setattr(self, name, value)
        try:
            self.traverse(part)
        finally:
            for name, value in saved.items():
                setattr(self, name, value)

    def _traverse_ordered_list(self, part):
        """Traverse an ``<ol>``, honouring its ``type`` and ``start`` attributes."""
        saved = (self.style, self.list_number, self.list_type)
        requested = part.get('type', None)
        if requested in list_types:
            self.list_type = requested
        else:
            # No usable type: move on to the style after the enclosing list's.
            following = (list_types.index(self.list_type) + 1) % len(list_types)
            self.list_type = list_types[following]
        try:
            self.list_number = int(part.get('start', 1))
        except (TypeError, ValueError):
            self.list_number = 1
        self.style = 'ol' + self.list_type
        self.indentation += 1
        self.traverse(part)
        self.indentation -= 1
        self.style, self.list_number, self.list_type = saved

    def _traverse_link(self, part):
        """Traverse an ``<a>``, wrapping its content in a hyperlink."""
        href = part.get('href')
        if not href:
            # An anchor without a target: keep its text, emit no hyperlink.
            self.traverse(part)
            return
        self.start_link(href)
        # Only some template classes define da_hyperlink_style; treat a
        # missing attribute like an unset one.
        link_style = getattr(self.tpl, 'da_hyperlink_style', None)
        if link_style:
            self._traverse_with(part, charstyle=link_style)
        else:
            self._traverse_with(part, underline=True, color='#0000ff')
        self.end_link()

    def _handle_tag(self, part):
        """Dispatch a single tag to the matching handler."""
        name = part.name
        if name in ('p', 'blockquote', 'li'):
            self.new_paragraph()
            self.traverse(part)
        elif name == 'ul':
            saved_style = self.style
            self.style = 'ul'
            self.indentation += 1
            self.traverse(part)
            self.indentation -= 1
            self.style = saved_style
        elif name == 'ol':
            self._traverse_ordered_list(part)
        elif name == 'strong':
            self._traverse_with(part, bold=True)
        elif name == 'em':
            self._traverse_with(part, italic=True)
        elif name == 'strike':
            self._traverse_with(part, strike=True)
        elif name == 'u':
            self._traverse_with(part, underline=True)
        elif re.fullmatch(r'h[1-6]', name):
            # h1 -> 60, h2 -> 50, ... h6 -> 10
            heading_size = 60 - ((int(name[1]) - 1) * 10)
            self._traverse_with(part, bold=True, size=heading_size)
        elif name == 'a':
            self._traverse_link(part)
        elif name == 'br':
            self._add_styled("\n")
        # Any other tag is skipped, contents included.

    def traverse(self, elem):
        """Process every direct child of ``elem``, recursing into handled tags.

        Strings are appended with the current formatting; tags are dispatched
        by name; anything else is reported on stdout.
        """
        for part in elem.contents:
            if isinstance(part, NavigableString):
                self._add_styled(str(part))
            elif isinstance(part, Tag):
                self._handle_tag(part)
            else:
                print("Encountered a " + part.__class__.__name__)


def html_to_richtext(tpl, source_code):
    """Convert an HTML fragment into RichText XML for a ``{{r ...}}`` placeholder.

    Newlines are replaced by spaces and whitespace between adjacent tags is
    removed before parsing, so source formatting does not leak into the
    document.

    Args:
        tpl: the template object handed to ``InlineSoupParser`` (used when
            links are registered).
        source_code: the HTML fragment as a string.

    Returns:
        The string form of all runs collected by the parser.
    """
    # ref: https://github.com/jhpyle/docassemble/blob/c16c786c65186399584c9a30d0e3d7dcb3acb056/docassemble_base/docassemble/base/file_docx.py#LL645C21-L645C21
    # Imported here because the module-level bs4 import only brings in
    # NavigableString and Tag.
    from bs4 import BeautifulSoup

    flattened = re.sub(r"\n", ' ', source_code)
    flattened = re.sub(r">\s+<", '><', flattened)
    # Wrap in <html> so the fragment has a single root element to traverse.
    soup = BeautifulSoup('<html>' + flattened + '</html>', 'html.parser')
    parser = InlineSoupParser(tpl)
    for elem in soup.find_all(recursive=False):
        parser.traverse(elem)
    return str(parser)

## main.py
from docxtpl import DocxTemplate

if __name__ == '__main__':
    # html_to_richtext is defined in html_richtext.py, next to this script.
    from html_richtext import html_to_richtext

    tpl = DocxTemplate('tpl.docx')
    content = html_to_richtext(tpl, '在 HTTP 事务的上下文中，基本访问身份验证是 HTTP 用户代理在发出请求时提供用户名和密码的方法。<br/><br/>通过 HTTP 连接使用基本身份验证保护此目录。使用基本身份验证时，用户凭据将以纯文本形式发送，并且由于未使用 HTTPS，它们很容易受到数据包嗅探的影响。')
    # Pass the converted value itself, not the literal string 'content'.
    context = {
        'content': content,
    }
    tpl.render(context)
    tpl.save('output.docx')

# Note: the placeholder in tpl.docx should look like `{{r content }}`
	import re
	import string
	from jinja2.runtime import Undefined
	import docx.opc.constants
	from bs4 import NavigableString, Tag
	from docxtpl import RichText

	# ref: https://github.com/jhpyle/docassemble/issues/72
	# ref: https://github.com/jhpyle/docassemble/blob/c16c786c65186399584c9a30d0e3d7dcb3acb056/docassemble_base/docassemble/base/file_docx.py#LL645C21-L645C21
	# Marker styles accepted for <ol type="...">. An <ol> without a recognised
	# type takes the entry that follows the enclosing list's style, wrapping
	# around at the end of this list.
	list_types = ['1', 'A', 'a', 'I', 'i']

	def ensure_definition(*pargs, **kwargs):
	    """Stringify every Jinja2 ``Undefined`` found among the arguments.

	    Positional values are visited first, then keyword values. Each value
	    that is an ``Undefined`` instance is passed to ``str()`` and the result
	    is discarded. Presumably this exists so that strict undefined types
	    raise here rather than later -- TODO confirm.
	    """
	    for val in pargs:
	        if isinstance(val, Undefined):
	            str(val)
	    for val in kwargs.values():
	        if isinstance(val, Undefined):
	            str(val)

	def roman(num, case=None):
	    """Return the roman numeral for the zero-based index ``num``.

	    The index is shifted by one before conversion, so ``0`` gives ``I``
	    and ``3998`` gives ``MMMCMXCIX``.

	    Args:
	        num: zero-based integer index; ``num + 1`` must lie in 1..3999.
	        case: ``'lower'`` for lowercase output; any other value, including
	            the default ``None``, gives uppercase.

	    Returns:
	        The numeral as a string.

	    Raises:
	        TypeError: if ``num + 1`` is not an ``int``.
	        ValueError: if ``num + 1`` is outside 1..3999.
	    """
	    ensure_definition(num, case)
	    if case is None:
	        case = 'upper'
	    num = num + 1
	    if not isinstance(num, int):
	        raise TypeError("expected integer, got %s" % type(num))
	    if not 0 < num < 4000:
	        raise ValueError("Argument must be between 1 and 3999")
	    ints = (1000, 900, 500, 400, 100, 90, 50, 40, 10, 9, 5, 4, 1)
	    nums = ('M', 'CM', 'D', 'CD', 'C', 'XC', 'L', 'XL', 'X', 'IX', 'V', 'IV', 'I')
	    result = ""
	    # Greedy conversion: take as many of each value as fit, largest first.
	    for indexno, the_int in enumerate(ints):
	        count = num // the_int
	        result += nums[indexno] * count
	        num -= the_int * count
	    if case == 'lower':
	        return result.lower()
	    return result

	def Alpha(number):
	    """Return the uppercase letter marker for the one-based ``number``.

	    1..26 map to ``A``..``Z``; after that the letter is repeated, so 27 is
	    ``AA`` and 53 is ``AAA``.
	    """
	    # int() truncates toward zero, which also fixes the result for
	    # offsets below zero.
	    multiplier = int((number - 1) / 26)
	    indexno = (number - 1) % 26
	    return string.ascii_uppercase[indexno] * (multiplier + 1)


	def alpha(number):
	    """Return the lowercase letter marker for the one-based ``number``.

	    1..26 map to ``a``..``z``; after that the letter is repeated, so 27 is
	    ``aa`` and 53 is ``aaa``.
	    """
	    # int() truncates toward zero, which also fixes the result for
	    # offsets below zero.
	    multiplier = int((number - 1) / 26)
	    indexno = (number - 1) % 26
	    return string.ascii_lowercase[indexno] * (multiplier + 1)


	def Roman_Numeral(number):
	    """Return the uppercase roman numeral marker for the one-based ``number``.

	    The value is reduced modulo 4000 before being handed to ``roman``.
	    """
	    return roman((number - 1) % 4000, case='upper')


	def roman_numeral(number):
	    """Return the lowercase roman numeral marker for the one-based ``number``.

	    The value is reduced modulo 4000 before being handed to ``roman``.
	    """
	    return roman((number - 1) % 4000, case='lower')

	class InlineSoupParser:
	    """Walk a BeautifulSoup tree and build docxtpl ``RichText`` output.

	    The parser keeps the current formatting state (bold, italic, size, ...)
	    as attributes and appends text to the current ``RichText`` run with
	    that state. ``runs`` holds ``RichText`` objects interleaved with raw
	    ``<w:hyperlink>`` XML strings; ``str(parser)`` concatenates them.

	    Handled tags: p, blockquote, li, ul, ol, strong, em, strike, u, h1-h6,
	    a and br. Any other tag is skipped together with its contents.
	    """

	    def __init__(self, tpl):
	        """Start with a single empty run and unstyled formatting state.

	        Args:
	            tpl: the template object; used by ``start_link`` to register
	                external hyperlink relationships.
	        """
	        self.runs = [RichText('')]
	        self.run = self.runs[-1]
	        self.bold = False
	        self.italic = False
	        self.underline = False
	        self.indentation = 0
	        self.style = 'p'
	        self.strike = False
	        self.size = None
	        self.charstyle = None
	        self.color = None
	        self.tpl = tpl
	        self.at_start = True
	        self.list_number = 1
	        # Start on the last style so the first untyped <ol> wraps to the first.
	        self.list_type = list_types[-1]

	    def _add_styled(self, text):
	        """Append ``text`` to the current run using the current formatting."""
	        self.run.add(
	            text,
	            italic=self.italic,
	            bold=self.bold,
	            underline=self.underline,
	            strike=self.strike,
	            size=self.size,
	            style=self.charstyle,
	            color=self.color,
	        )

	    def new_paragraph(self):
	        """Begin a paragraph or list item in the current run.

	        Emits a line break (except for the very first paragraph), then one
	        tab per indentation level, then the bullet or ordered-list marker
	        that matches ``self.style``. Ordered markers advance
	        ``self.list_number``.
	        """
	        if self.at_start:
	            self.at_start = False
	        else:
	            self._add_styled("\n")
	        if self.indentation:
	            self.run.add("\t" * self.indentation)
	        if self.style == 'ul':
	            self.run.add("•\t")
	            return
	        if self.style == 'ol1':
	            marker = str(self.list_number)
	        elif self.style == 'olA':
	            marker = Alpha(self.list_number)
	        elif self.style == 'ola':
	            marker = alpha(self.list_number)
	        elif self.style == 'olI':
	            marker = Roman_Numeral(self.list_number)
	        elif self.style == 'oli':
	            marker = roman_numeral(self.list_number)
	        else:
	            # Plain paragraph: no marker and the counter is left untouched.
	            return
	        self.run.add(marker + ".\t")
	        self.list_number += 1

	    def __str__(self):
	        """Return the concatenated string form of every collected run."""
	        return ''.join(str(run) for run in self.runs)

	    def start_link(self, url):
	        """Open a ``<w:hyperlink>`` wrapper that points at ``url``.

	        The relationship id comes from the template's ``build_url_id`` when
	        that method exists; otherwise the relationship is registered
	        directly on the template's underlying document part.
	        """
	        build_url_id = getattr(self.tpl, 'build_url_id', None)
	        if callable(build_url_id):
	            ref = build_url_id(url)
	        else:
	            ref = self.tpl.docx._part.relate_to(url, docx.opc.constants.RELATIONSHIP_TYPE.HYPERLINK, is_external=True)
	        self.runs.append('<w:hyperlink r:id="%s">' % (ref, ))
	        self.new_run()

	    def end_link(self):
	        """Close the hyperlink wrapper and continue in a fresh run."""
	        self.runs.append('</w:hyperlink>')
	        self.new_run()

	    def new_run(self):
	        """Append an empty ``RichText`` and make it the current run."""
	        self.runs.append(RichText(''))
	        self.run = self.runs[-1]

	    def _traverse_with(self, part, **overrides):
	        """Traverse ``part`` with formatting attributes temporarily overridden.

	        The previous values are restored afterwards, so nested formatting
	        (for example ``<strong>`` inside a heading) does not switch off the
	        formatting of the enclosing element.
	        """
	        saved = {name: getattr(self, name) for name in overrides}
	        for name, value in overrides.items():
	            setattr(self, name, value)
	        try:
	            self.traverse(part)
	        finally:
	            for name, value in saved.items():
	                setattr(self, name, value)

	    def _traverse_ordered_list(self, part):
	        """Traverse an ``<ol>``, honouring its ``type`` and ``start`` attributes."""
	        saved = (self.style, self.list_number, self.list_type)
	        requested = part.get('type', None)
	        if requested in list_types:
	            self.list_type = requested
	        else:
	            # No usable type: move on to the style after the enclosing list's.
	            following = (list_types.index(self.list_type) + 1) % len(list_types)
	            self.list_type = list_types[following]
	        try:
	            self.list_number = int(part.get('start', 1))
	        except (TypeError, ValueError):
	            self.list_number = 1
	        self.style = 'ol' + self.list_type
	        self.indentation += 1
	        self.traverse(part)
	        self.indentation -= 1
	        self.style, self.list_number, self.list_type = saved

	    def _traverse_link(self, part):
	        """Traverse an ``<a>``, wrapping its content in a hyperlink."""
	        href = part.get('href')
	        if not href:
	            # An anchor without a target: keep its text, emit no hyperlink.
	            self.traverse(part)
	            return
	        self.start_link(href)
	        # Only some template classes define da_hyperlink_style; treat a
	        # missing attribute like an unset one.
	        link_style = getattr(self.tpl, 'da_hyperlink_style', None)
	        if link_style:
	            self._traverse_with(part, charstyle=link_style)
	        else:
	            self._traverse_with(part, underline=True, color='#0000ff')
	        self.end_link()

	    def _handle_tag(self, part):
	        """Dispatch a single tag to the matching handler."""
	        name = part.name
	        if name in ('p', 'blockquote', 'li'):
	            self.new_paragraph()
	            self.traverse(part)
	        elif name == 'ul':
	            saved_style = self.style
	            self.style = 'ul'
	            self.indentation += 1
	            self.traverse(part)
	            self.indentation -= 1
	            self.style = saved_style
	        elif name == 'ol':
	            self._traverse_ordered_list(part)
	        elif name == 'strong':
	            self._traverse_with(part, bold=True)
	        elif name == 'em':
	            self._traverse_with(part, italic=True)
	        elif name == 'strike':
	            self._traverse_with(part, strike=True)
	        elif name == 'u':
	            self._traverse_with(part, underline=True)
	        elif re.fullmatch(r'h[1-6]', name):
	            # h1 -> 60, h2 -> 50, ... h6 -> 10
	            heading_size = 60 - ((int(name[1]) - 1) * 10)
	            self._traverse_with(part, bold=True, size=heading_size)
	        elif name == 'a':
	            self._traverse_link(part)
	        elif name == 'br':
	            self._add_styled("\n")
	        # Any other tag is skipped, contents included.

	    def traverse(self, elem):
	        """Process every direct child of ``elem``, recursing into handled tags.

	        Strings are appended with the current formatting; tags are
	        dispatched by name; anything else is reported on stdout.
	        """
	        for part in elem.contents:
	            if isinstance(part, NavigableString):
	                self._add_styled(str(part))
	            elif isinstance(part, Tag):
	                self._handle_tag(part)
	            else:
	                print("Encountered a " + part.__class__.__name__)


	def html_to_richtext(tpl, source_code):
	    """Convert an HTML fragment into RichText XML for a ``{{r ...}}`` placeholder.

	    Newlines are replaced by spaces and whitespace between adjacent tags
	    is removed before parsing, so source formatting does not leak into the
	    document.

	    Args:
	        tpl: the template object handed to ``InlineSoupParser`` (used when
	            links are registered).
	        source_code: the HTML fragment as a string.

	    Returns:
	        The string form of all runs collected by the parser.
	    """
	    # ref: https://github.com/jhpyle/docassemble/blob/c16c786c65186399584c9a30d0e3d7dcb3acb056/docassemble_base/docassemble/base/file_docx.py#LL645C21-L645C21
	    # Imported here because the module-level bs4 import only brings in
	    # NavigableString and Tag.
	    from bs4 import BeautifulSoup

	    source_code = re.sub(r"\n", ' ', source_code)
	    source_code = re.sub(r">\s+<", '><', source_code)
	    # Wrap in <html> so the fragment has a single root element to traverse.
	    soup = BeautifulSoup('<html>' + source_code + '</html>', 'html.parser')
	    parser = InlineSoupParser(tpl)
	    for elem in soup.find_all(recursive=False):
	        parser.traverse(elem)
	    output = str(parser)
	    return output
	from docxtpl import DocxTemplate

	if __name__ == '__main__':
	    tpl = DocxTemplate('tpl.docx')
	    content = html_to_richtext(tpl, '在 HTTP 事务的上下文中，基本访问身份验证是 HTTP 用户代理在发出请求时提供用户名和密码的方法。<br/><br/>通过 HTTP 连接使用基本身份验证保护此目录。使用基本身份验证时，用户凭据将以纯文本形式发送，并且由于未使用 HTTPS，它们很容易受到数据包嗅探的影响。')
	    # Pass the converted value itself, not the literal string 'content'.
	    context = {
	        'content': content,
	    }
	    tpl.render(context)
	    tpl.save('output.docx')

	# Note: the placeholder in tpl.docx should look like `{{r content }}`