|
import re
import string

import docx.opc.constants
from bs4 import BeautifulSoup, NavigableString, Tag
from docxtpl import RichText
from jinja2.runtime import Undefined
|
|
|
# ref: https://github.com/jhpyle/docassemble/issues/72 |
|
# ref: https://github.com/jhpyle/docassemble/blob/c16c786c65186399584c9a30d0e3d7dcb3acb056/docassemble_base/docassemble/base/file_docx.py#LL645C21-L645C21 |
|
# Marker styles for ordered lists, using the same one-character codes as the
# HTML ``<ol type="...">`` attribute. Nested lists without an explicit type
# advance to the next entry, wrapping around at the end.
list_types = [
    '1',  # decimal numbers
    'A',  # uppercase letters
    'a',  # lowercase letters
    'I',  # uppercase Roman numerals
    'i',  # lowercase Roman numerals
]
|
|
|
def ensure_definition(*pargs, **kwargs):
    """Touch every Jinja2 ``Undefined`` found among the arguments.

    Positional values are examined first, then keyword values. Each value
    that is an ``Undefined`` instance is passed to ``str()`` and the result
    is discarded; other values are ignored.

    NOTE(review): presumably the point is to make strict ``Undefined``
    variants raise at this spot instead of later -- confirm against the
    Jinja2 environment in use.
    """
    for candidate in (*pargs, *kwargs.values()):
        if isinstance(candidate, Undefined):
            str(candidate)
|
|
|
def roman(num, case=None):
    """Convert a zero-based index to a Roman numeral.

    The index is incremented by one before conversion, so ``0`` gives ``I``.
    Valid indexes run from 0 to 3998, producing the numerals for 1 to 3999.

    Args:
        num: zero-based integer index.
        case: ``'lower'`` for a lowercase numeral; any other value,
            including the default ``None``, gives uppercase.

    Returns:
        The numeral as a string.

    Raises:
        TypeError: if ``num + 1`` is not an ``int``.
        ValueError: if ``num + 1`` is outside 1..3999.
    """
    ensure_definition(num, case)
    num = num + 1
    if not isinstance(num, int):
        raise TypeError("expected integer, got %s" % type(num))
    if not 0 < num < 4000:
        raise ValueError("Argument must be between 1 and 3999")
    # Values in descending order, including the subtractive pairs (CM, XC, ...).
    table = (
        (1000, 'M'), (900, 'CM'), (500, 'D'), (400, 'CD'),
        (100, 'C'), (90, 'XC'), (50, 'L'), (40, 'XL'),
        (10, 'X'), (9, 'IX'), (5, 'V'), (4, 'IV'), (1, 'I'),
    )
    pieces = []
    remaining = num
    for value, symbol in table:
        repeat, remaining = divmod(remaining, value)
        pieces.append(symbol * repeat)
    numeral = "".join(pieces)
    # Only 'lower' changes the output; None and 'upper' both mean uppercase.
    return numeral.lower() if case == 'lower' else numeral
|
|
|
def Alpha(number):
    """Return the uppercase letter label for a one-based list position.

    Positions 1..26 map to ``A``..``Z``; after that the letter is repeated
    (27 -> ``AA``, 28 -> ``BB``, 53 -> ``AAA``).
    """
    offset = number - 1
    # int() truncates toward zero, which differs from // when offset is negative.
    repeats = int(offset / 26) + 1
    letter = string.ascii_uppercase[offset % 26]
    return letter * repeats
|
|
|
|
|
def alpha(number):
    """Return the lowercase letter label for a one-based list position.

    Positions 1..26 map to ``a``..``z``; after that the letter is repeated
    (27 -> ``aa``, 28 -> ``bb``, 53 -> ``aaa``).
    """
    offset = number - 1
    # int() truncates toward zero, which differs from // when offset is negative.
    repeats = int(offset / 26) + 1
    letter = string.ascii_lowercase[offset % 26]
    return letter * repeats
|
|
|
|
|
def Roman_Numeral(number):
    """Return the uppercase Roman numeral label for a one-based list position.

    ``roman`` accepts zero-based indexes 0..3998 only, so the position is
    wrapped with a modulus of 3999. Positions 1..3999 map to I..MMMCMXCIX
    and position 4000 starts again at I. (A modulus of 4000 would produce
    index 3999 for every multiple of 4000, which ``roman`` rejects with
    ValueError.)
    """
    return roman((number - 1) % 3999, case='upper')
|
|
|
|
|
def roman_numeral(number):
    """Return the lowercase Roman numeral label for a one-based list position.

    ``roman`` accepts zero-based indexes 0..3998 only, so the position is
    wrapped with a modulus of 3999. Positions 1..3999 map to i..mmmcmxcix
    and position 4000 starts again at i. (A modulus of 4000 would produce
    index 3999 for every multiple of 4000, which ``roman`` rejects with
    ValueError.)
    """
    return roman((number - 1) % 3999, case='lower')
|
|
|
class InlineSoupParser:
    """Flatten a parsed HTML fragment into a sequence of docx runs.

    The parser walks a BeautifulSoup tree and appends text to ``RichText``
    objects, keeping the active character formatting (bold, italic, ...) as
    instance state. Paragraphs and list items are emulated with newlines,
    tab indentation and textual markers. Hyperlinks are emitted as raw
    ``<w:hyperlink>`` strings placed between RichText runs.

    ``str(parser)`` concatenates the string form of every run.
    """

    # Tags that simply switch one boolean formatting attribute on.
    _TOGGLE_TAGS = {
        'strong': 'bold',
        'em': 'italic',
        'strike': 'strike',
        'u': 'underline',
    }

    def __init__(self, tpl):
        """Create a parser bound to the template *tpl*.

        *tpl* must provide ``docx._part.relate_to`` (used to register
        hyperlink targets). If it has a truthy ``da_hyperlink_style``
        attribute, that value is used as the character style of link text.
        """
        self.runs = [RichText('')]
        self.run = self.runs[-1]
        self.bold = False
        self.italic = False
        self.underline = False
        self.indentation = 0
        self.style = 'p'
        self.strike = False
        self.size = None
        self.charstyle = None
        self.color = None
        self.tpl = tpl
        # True until the first paragraph is opened; suppresses a leading newline.
        self.at_start = True
        self.list_number = 1
        # Start on the last type so the first untyped <ol> wraps around to '1'.
        self.list_type = list_types[-1]

    def _add_text(self, text):
        """Append *text* to the current run using the active formatting."""
        self.run.add(
            text,
            italic=self.italic,
            bold=self.bold,
            underline=self.underline,
            strike=self.strike,
            size=self.size,
            style=self.charstyle,
            color=self.color,
        )

    def new_paragraph(self):
        """Begin a new paragraph or list item in the current run.

        Emits a newline (except for the very first paragraph), one tab per
        indentation level, and the bullet or number marker for the current
        list style. Ordered-list styles also advance ``list_number``.
        """
        if self.at_start:
            self.at_start = False
        else:
            self._add_text("\n")
        if self.indentation:
            self.run.add("\t" * self.indentation)
        style = self.style
        if style == 'ul':
            self.run.add("•\t")
            return
        if style == 'ol1':
            label = str(self.list_number)
        elif style == 'olA':
            label = Alpha(self.list_number)
        elif style == 'ola':
            label = alpha(self.list_number)
        elif style == 'olI':
            label = Roman_Numeral(self.list_number)
        elif style == 'oli':
            label = roman_numeral(self.list_number)
        else:
            # Plain paragraph: no marker and no counter change.
            return
        self.run.add(label + ".\t")
        self.list_number += 1

    def __str__(self):
        """Return the concatenated string form of every run."""
        return ''.join(str(run) for run in self.runs)

    def start_link(self, url):
        """Open a hyperlink to *url* and start a fresh run for its text."""
        ref = self.tpl.docx._part.relate_to(url, docx.opc.constants.RELATIONSHIP_TYPE.HYPERLINK, is_external=True)
        self.runs.append('<w:hyperlink r:id="%s">' % (ref, ))
        self.new_run()

    def end_link(self):
        """Close the open hyperlink and start a fresh run after it."""
        self.runs.append('</w:hyperlink>')
        self.new_run()

    def new_run(self):
        """Append an empty RichText and make it the current run."""
        self.runs.append(RichText(''))
        self.run = self.runs[-1]

    def traverse(self, elem):
        """Process every child of *elem* in document order.

        Text nodes are added with the active formatting and tags are handled
        according to their name. Tags with no handler are skipped together
        with their content.
        """
        for part in elem.contents:
            if isinstance(part, NavigableString):
                self._add_text(str(part))
            elif isinstance(part, Tag):
                self._handle_tag(part)
            else:
                print("Encountered a " + part.__class__.__name__)

    def _handle_tag(self, part):
        """Apply the effect of one tag and recurse into its children."""
        name = part.name
        if name in ('p', 'blockquote', 'li'):
            self.new_paragraph()
            self.traverse(part)
        elif name == 'ul':
            self._traverse_unordered_list(part)
        elif name == 'ol':
            self._traverse_ordered_list(part)
        elif name in self._TOGGLE_TAGS:
            self._traverse_with_flag(part, self._TOGGLE_TAGS[name])
        elif re.match(r'h[1-6]', name):
            self._traverse_heading(part)
        elif name == 'a':
            self._traverse_link(part)
        elif name == 'br':
            self._add_text("\n")

    def _traverse_with_flag(self, part, attr):
        """Traverse *part* with the boolean attribute *attr* switched on."""
        # Restore the previous value, not False, so nesting such as
        # <h1>..<strong>..</strong>..</h1> keeps the outer formatting.
        previous = getattr(self, attr)
        setattr(self, attr, True)
        self.traverse(part)
        setattr(self, attr, previous)

    def _traverse_unordered_list(self, part):
        """Traverse a <ul>, indenting its items one level deeper."""
        previous_style = self.style
        self.style = 'ul'
        self.indentation += 1
        self.traverse(part)
        self.indentation -= 1
        self.style = previous_style

    def _traverse_ordered_list(self, part):
        """Traverse an <ol>, honouring its ``type`` and ``start`` attributes."""
        previous = (self.style, self.list_number, self.list_type)
        declared = part.get('type', None)
        if declared in list_types:
            self.list_type = declared
        else:
            # No usable type: move on to the marker style after the current one.
            position = list_types.index(self.list_type)
            self.list_type = list_types[(position + 1) % len(list_types)]
        try:
            self.list_number = int(part.get('start', 1))
        except (TypeError, ValueError):
            # Non-numeric start attribute: fall back to numbering from 1.
            self.list_number = 1
        self.style = 'ol' + self.list_type
        self.indentation += 1
        self.traverse(part)
        self.indentation -= 1
        self.style, self.list_number, self.list_type = previous

    def _traverse_heading(self, part):
        """Traverse an <h1>..<h6> as bold text sized by heading level."""
        previous = (self.size, self.bold)
        # h1 -> 60, h2 -> 50, ... h6 -> 10.
        self.size = 60 - ((int(part.name[1]) - 1) * 10)
        self.bold = True
        self.traverse(part)
        self.size, self.bold = previous

    def _traverse_link(self, part):
        """Traverse an <a>, wrapping its content in a hyperlink."""
        href = part.get('href')
        if href is None:
            # An anchor without a target cannot become a hyperlink; keep its text.
            self.traverse(part)
            return
        self.start_link(href)
        previous = (self.charstyle, self.underline, self.color)
        # The attribute is optional, so a template without it gets the default look.
        link_style = getattr(self.tpl, 'da_hyperlink_style', None)
        if link_style:
            self.charstyle = link_style
        else:
            self.underline = True
            self.color = '#0000ff'
        self.traverse(part)
        self.charstyle, self.underline, self.color = previous
        self.end_link()
|
|
|
|
|
def html_to_richtext(tpl, source_code):
    """Convert an HTML fragment to the string form of docx rich-text runs.

    Args:
        tpl: template object handed to ``InlineSoupParser``, which uses it
            to register hyperlink targets.
        source_code: the HTML fragment, as a string.

    Returns:
        The concatenated string form of the runs the parser produced.
    """
    # ref: https://github.com/jhpyle/docassemble/blob/c16c786c65186399584c9a30d0e3d7dcb3acb056/docassemble_base/docassemble/base/file_docx.py#LL645C21-L645C21
    # Newlines become spaces and whitespace between adjacent tags is dropped,
    # so source formatting does not end up as text in the document.
    source_code = re.sub(r"\n", ' ', source_code)
    source_code = re.sub(r">\s+<", '><', source_code)
    # Wrap the fragment in one root element so it has a single top-level tag.
    soup = BeautifulSoup('<html>' + source_code + '</html>', 'html.parser')
    parser = InlineSoupParser(tpl)
    for elem in soup.find_all(recursive=False):
        parser.traverse(elem)
    return str(parser)