# gasman/html_rewriter.py

## html_rewriter.py
from collections import defaultdict
from collections.abc import Mapping
import html
import re
import warnings

from django.utils.html import conditional_escape, format_html
from django.utils.safestring import mark_safe


# Selector grammars accepted by HTMLRewriter.add_rule. Every pattern is anchored at both
# ends, so a selector has to match one of them in its entirety.
ELEMENT_SELECTOR = re.compile(r'^([\w-]+)$')  # name
ELEMENT_WITH_ATTR_SELECTOR = re.compile(r'^([\w-]+)\[([\w-]+)\]$')  # name[attr]
# name[attr='value']
ELEMENT_WITH_ATTR_EXACT_SINGLE_QUOTE_SELECTOR = re.compile(r"^([\w-]+)\[([\w-]+)='(.*)'\]$")
# name[attr="value"]
ELEMENT_WITH_ATTR_EXACT_DOUBLE_QUOTE_SELECTOR = re.compile(r'^([\w-]+)\[([\w-]+)="(.*)"\]$')
# name[attr=value]
ELEMENT_WITH_ATTR_EXACT_UNQUOTED_SELECTOR = re.compile(r"^([\w-]+)\[([\w-]+)=([\w-]+)\]$")

# Patterns for one name=value attribute inside an opening tag. Each contributes two capture
# groups (name, value), so the combined ATTRIBUTE pattern has exactly six groups:
# (1, 2) double-quoted, (3, 4) single-quoted, (5, 6) unquoted.
DOUBLE_QUOTED_ATTRIBUTE = r'([\w-]+)\="([^"]*)"'
# The value class must exclude the closing single quote. Written as `[^]*` the first `]` is
# taken as a literal member of the class, which then runs on into the unquoted pattern and
# leaves ATTRIBUTE with only five groups.
SINGLE_QUOTED_ATTRIBUTE = r"([\w-]+)\='([^']*)'"
UNQUOTED_ATTRIBUTE = r'([\w-]+)\=([\w-]+)'
ATTRIBUTE = re.compile(
    r'(?:%s|%s|%s)' % (DOUBLE_QUOTED_ATTRIBUTE, SINGLE_QUOTED_ATTRIBUTE, UNQUOTED_ATTRIBUTE)
)


class Rule:
    """
    A CSS-like rule that an HTML element can match or not.

    Subclasses set ``priority`` to a number. Rules compare by that number, so sorting a
    list of rules puts the numerically lowest priority first.
    """
    priority = None

    def __lt__(self, other):
        # Define an ordering on Rule objects so that running sort() on a list of them
        # orders by priority. Anything without a `priority` attribute is not comparable;
        # returning NotImplemented lets Python raise its usual TypeError for that case
        # instead of leaking an AttributeError from here.
        try:
            other_priority = other.priority
        except AttributeError:
            return NotImplemented
        return self.priority < other_priority


class ElementRule(Rule):
    """Rule that matches every element with a given tag name, whatever its attributes."""
    priority = 2  # sorts after the attribute-based rules, which use priority 1

    def __init__(self, name, rewriter):
        self.rewriter = rewriter
        self.name = name

    def attributes_match(self, attrs):
        """Return True unconditionally: a bare element selector puts no constraint on attrs."""
        return True


class ElementWithAttributeRule(Rule):
    """Rule that matches elements carrying a given attribute, regardless of its value."""
    priority = 1  # sorts ahead of a plain element rule (priority 2)

    def __init__(self, name, attr, rewriter):
        self.rewriter = rewriter
        self.attr = attr
        self.name = name

    def attributes_match(self, attrs):
        """Return True if the required attribute name is a key of `attrs`."""
        attribute_present = self.attr in attrs
        return attribute_present


class ElementWithAttributeExactRule(Rule):
    """Rule that matches elements whose given attribute equals a specific value."""
    priority = 1  # sorts ahead of a plain element rule (priority 2)

    def __init__(self, name, attr, value, rewriter):
        self.rewriter = rewriter
        self.value = value
        self.attr = attr
        self.name = name

    def attributes_match(self, attrs):
        """Return True if `attrs` contains the attribute and its value equals `self.value`."""
        if self.attr not in attrs:
            return False
        return attrs[self.attr] == self.value


class HTMLRewriter:
    """
    Rewrites selected elements in a string of HTML according to CSS-like rules.

    Each rule pairs a selector with a rewriter object. The supported selector forms are
    ``name``, ``name[attr]``, ``name[attr=value]``, ``name[attr='value']`` and
    ``name[attr="value"]``. A rewriter must implement one of:

    * ``rewrite_attributes(name, attrs)`` - returns a dict of attributes that replaces the
      attributes on the opening tag; the element's content and closing tag are left alone.
    * ``rewrite_element(name, attrs, content)`` - returns markup that replaces the whole
      element. The return value is HTML-escaped unless it is already a safe string.

    If a rewriter implements both, ``rewrite_attributes`` is used.

    Tags are located with regular expressions rather than a full HTML parser, and only
    attributes written as ``name=value`` (quoted or unquoted) are recognised.
    """

    def __init__(self, rules):
        # maps element name -> list of Rule objects, kept sorted by priority
        self.rules_by_element = defaultdict(list)
        self.add_rules(rules)

    def add_rules(self, rules):
        """
        Register several rules at once.

        Accepts either a dict of {selector: rewriter}, or a list of (selector, rewriter)
        tuples.
        """
        if isinstance(rules, Mapping):
            rules = rules.items()

        for selector, rewriter in rules:
            self.add_rule(selector, rewriter)

    def add_rule(self, selector, rewriter):
        """
        Register `rewriter` for elements matching `selector`.

        A selector in an unsupported format is not registered; a warning is issued instead.
        """
        rule = self._build_rule(selector, rewriter)
        if rule is None:
            warnings.warn("Unsupported selector format: %r" % selector)
            return

        rules = self.rules_by_element[rule.name]
        rules.append(rule)
        # list.sort() is stable, so rules of equal priority stay in registration order
        rules.sort()

    @staticmethod
    def _build_rule(selector, rewriter):
        """Parse `selector` into the corresponding Rule object, or None if unsupported."""
        match = ELEMENT_SELECTOR.match(selector)
        if match:
            return ElementRule(match.group(1), rewriter)

        match = ELEMENT_WITH_ATTR_SELECTOR.match(selector)
        if match:
            name, attr = match.groups()
            return ElementWithAttributeRule(name, attr, rewriter)

        for regex in (
            ELEMENT_WITH_ATTR_EXACT_SINGLE_QUOTE_SELECTOR,
            ELEMENT_WITH_ATTR_EXACT_DOUBLE_QUOTE_SELECTOR,
            ELEMENT_WITH_ATTR_EXACT_UNQUOTED_SELECTOR
        ):
            match = regex.match(selector)
            if match:
                name, attr, value = match.groups()
                return ElementWithAttributeExactRule(name, attr, value, rewriter)

        return None

    def rewrite(self, html):
        """
        Rewrite a string of HTML according to the configured rules
        """
        result, _ = self._rewrite(html)
        return result

    def _build_tag_regex(self, until_tag):
        """
        Compile the regexp used to scan for tags of interest.

        It always matches an opening tag for any element name in the rule list; when
        `until_tag` is given it additionally matches that closing tag. The groups are
        (1) opening tag name, (2) raw attribute string, (3) closing tag name.
        """
        # matches any element name in our rule list, e.g. "(h1|a|embed)"
        element_name_re = "(%s)" % (
            '|'.join(re.escape(name) for name in self.rules_by_element.keys())
        )

        # matches any opening tag with one of these names and any set of attributes
        # - e.g. "<(h1|a|embed)(\b[^>]*)>"
        opening_tag_re = r'<%s(\b[^>]*)>' % element_name_re

        if not until_tag:
            # no until_tag specified, so we're just looking for opening tags
            return re.compile(opening_tag_re)

        # our final regexp also needs to match the specified closing tag
        closing_tag_re = r'</(%s)>' % re.escape(until_tag)
        return re.compile(r'(?:%s|%s)' % (opening_tag_re, closing_tag_re))

    def _find_matching_rule(self, tag_name, attrs):
        """Return the first rule (in priority order) for `tag_name` that accepts `attrs`."""
        for rule in self.rules_by_element[tag_name]:
            if rule.attributes_match(attrs):
                return rule
        return None

    @staticmethod
    def _build_opening_tag(tag_name, attrs, is_self_closing):
        """Serialise an opening tag with `attrs`, escaping names and values as needed."""
        attr_string = ' '.join(
            '%s="%s"' % (conditional_escape(key), conditional_escape(val))
            for key, val in attrs.items()
        )
        if is_self_closing:
            return '<%s %s/>' % (tag_name, attr_string)
        return '<%s %s>' % (tag_name, attr_string)

    def _rewrite(self, html, *, start=0, until_tag=None):
        """
        Internal method used recursively by `rewrite`.
        Rewrites a string of HTML, starting from offset `start`, until it encounters either the end
        of the string, or a closing tag as specified in `until_tag` that is NOT balanced by an
        opening tag seen in the current invocation of `_rewrite`.

        In other words,

            some text with <span>an extra span</span> in it</span>
                                                           ^^^^^^^ this one.

        Upon reaching either end condition (closing tag or end of string), it will return a tuple
        of the rewritten HTML string and the offset of any subsequent not-yet-processed HTML.
        """

        if not self.rules_by_element:
            # no rewrite rules, so nothing to do. Yay!
            return mark_safe(html[start:]), len(html)

        position = start  # current position within the html string
        pieces = []  # fragments of rewritten HTML, joined once when we return

        # the number of pending occurrences of until_tag that we will skip past before treating it
        # as the ACTUAL closing tag
        ignored_closing_tag_count = 0

        final_re = self._build_tag_regex(until_tag)

        while True:
            match = final_re.search(html, position)
            if not match:
                # we have reached the end of the HTML string with no more elements to rewrite.
                # This shouldn't happen if we were expecting a closing tag, so warn in that case
                if until_tag:
                    warnings.warn(
                        "Reached end of string without encountering closing %r tag" % until_tag
                    )

                # either way, we can tack the remaining non-rewritten HTML on to our result,
                # and we're done
                pieces.append(html[position:])
                return mark_safe(''.join(pieces)), len(html)

            # we've found a tag that we're interested in, but first we should add all of the
            # non-rewritten HTML up to that point on to our result
            pieces.append(html[position:match.start()])

            # once we've finished handling this tag, we'll continue from the offset after it
            position = match.end()

            tag = match.group(0)

            if tag.startswith('</'):
                # the only closing tag the regexp can match is until_tag
                if ignored_closing_tag_count == 0:
                    # this is the real closing tag, so we're done with this invocation of
                    # _rewrite. The tag itself is consumed but not output - the caller
                    # replaces the whole element
                    return mark_safe(''.join(pieces)), position

                # this tag is closing a tag that was previously opened (and output) in this
                # invocation of _rewrite - it isn't the *real* closing tag. It must be
                # copied to the output, otherwise the nested element is left unclosed
                ignored_closing_tag_count -= 1
                pieces.append(tag)
                continue

            # an opening tag: parse it into its components
            tag_name = match.group(1)
            attr_string = match.group(2)
            is_self_closing = attr_string.endswith('/')
            if is_self_closing:
                attr_string = attr_string[:-1]

            attrs = self.unpack_attr_string(attr_string)
            matching_rule = self._find_matching_rule(tag_name, attrs)

            if matching_rule is None:
                # no matching rewrite rule, so output this tag unchanged
                pieces.append(tag)

                if tag_name == until_tag and not is_self_closing:
                    # this tag has the same name as the closing tag we're waiting for,
                    # so the next occurrence of the closing tag will be closing this one,
                    # rather than the *real* closing tag that signals the end of this
                    # invocation of _rewrite
                    ignored_closing_tag_count += 1

            elif hasattr(matching_rule.rewriter, 'rewrite_attributes'):
                # If the rewrite rule has a rewrite_attributes method, then we output the
                # original tag with the updated attributes. The closing tag passes through
                # unchanged; however, if it happens to be the same as until_tag, we need to
                # bump up ignored_closing_tag_count so that we'll skip over it rather than
                # treating it as the *real* closing tag
                new_attrs = matching_rule.rewriter.rewrite_attributes(tag_name, attrs)
                pieces.append(self._build_opening_tag(tag_name, new_attrs, is_self_closing))

                if tag_name == until_tag and not is_self_closing:
                    ignored_closing_tag_count += 1

            elif hasattr(matching_rule.rewriter, 'rewrite_element'):
                if is_self_closing:
                    # this element has no content, so just call rewrite_element with an
                    # empty string as content
                    content = ''
                else:
                    # we need to consume the element content by spinning up a recursive
                    # call to _rewrite, which also tells us where to resume scanning
                    content, position = self._rewrite(html, start=position, until_tag=tag_name)

                rewritten_element = matching_rule.rewriter.rewrite_element(
                    tag_name, attrs, content
                )

                # escape the result of rewrite_element, unless it's a safe string
                # (e.g. the output of format_html)
                pieces.append(conditional_escape(rewritten_element))

            else:
                raise Exception(
                    "Invalid ElementRewriter: %r. An ElementRewriter must implement "
                    "either rewrite_element or rewrite_attributes"
                    % matching_rule.rewriter
                )

    @staticmethod
    def unpack_attr_string(attr_string):
        """
        Unpack a string of HTML attributes into a dict of unescaped strings

        Only ``name=value`` attributes matched by ATTRIBUTE are picked up; if a name
        occurs more than once, the last occurrence wins.
        """
        attributes = {}
        for match in ATTRIBUTE.finditer(attr_string):
            # ATTRIBUTE's groups come in (name, value) pairs, one pair per quoting style;
            # exactly one pair is populated for any given match
            groups = match.groups()
            for name, val in zip(groups[0::2], groups[1::2]):
                if name:
                    attributes[name] = html.unescape(val)
                    break

        return attributes


class ElementRewriter:
    """
    Base class for rewriter objects registered with HTMLRewriter.

    It deliberately defines neither `rewrite_element` nor `rewrite_attributes`:
    HTMLRewriter uses hasattr() to find out which of the two a rewriter implements.
    """


class ImageRewriter(ElementRewriter):
    """Example rewriter that replaces an element with an <img> tag built from its attributes."""

    def rewrite_element(self, name, attributes, content):
        # both attributes are required; a missing one raises KeyError
        image_id = attributes['id']
        alt_text = attributes['alt']
        # format_html escapes the interpolated values and returns a safe string
        return format_html('<img src="/images/image-{}.jpg" alt="{}" />', image_id, alt_text)


class PageLinkRewriter(ElementRewriter):
    """Example rewriter that swaps a tag's attributes for an href/class pair built from `id`."""

    def rewrite_attributes(self, name, attributes):
        # `id` is required; a missing one raises KeyError
        page_url = '/pages/page-%s' % attributes['id']
        return {'href': page_url, 'class': 'page'}


class IntroParagraphRewriter(ElementRewriter):
    """Example rewriter that wraps the element's content in <p class="intro">."""

    def rewrite_element(self, name, attributes, content):
        template = '<p class="intro">{}</p>'
        return format_html(template, content)


class ROT13Rewriter(ElementRewriter):
    """Example rewriter that replaces an element with the ROT13 encoding of its content."""

    def rewrite_element(self, name, attributes, content):
        from codecs import encode
        return encode(content, 'rot_13')


# Example rule set, given as (selector, rewriter) pairs
rewriter = HTMLRewriter([
    ("embed[embedtype='image']", ImageRewriter()),
    ("a[linktype='page']", PageLinkRewriter()),
    ("intro", IntroParagraphRewriter()),
    ("rot13", ROT13Rewriter()),
])


# Sample input, using the custom elements and attributes the rules above look for
db_html = """
<h1>HTML Rewriter example</h1>
<intro><strong>First</strong>, we have a paragraph with <a linktype="page" id="123">a <em>wonderful</em> link</a> in it.</intro>
<p>This <a href="https://torchbox.com/">external link</a> doesn't get rewritten though. Now for an image:</p>
<embed embedtype="image" id="456" alt="I &lt;3 kittens" />
<p>But all we really care about is that <rot13>Darth Vader is Luke's father</rot13>.</p>
"""

print(rewriter.rewrite(db_html))


"""
Output:

<h1>HTML Rewriter example</h1>
<p class="intro"><strong>First</strong>, we have a paragraph with <a href="/pages/page-123" class="page">a <em>wonderful</em> link</a> in it.</p>
<p>This <a href="https://torchbox.com/">external link</a> doesn't get rewritten though. Now for an image:</p>
<img src="/images/image-456.jpg" alt="I &lt;3 kittens" />
<p>But all we really care about is that Qnegu Inqre vf Yhxr&#x27;f sngure.</p>
"""
	from collections import defaultdict
	from collections.abc import Mapping
	import html
	import re
	import warnings

	from django.utils.html import conditional_escape, format_html
	from django.utils.safestring import mark_safe


	# Selector grammars accepted by HTMLRewriter.add_rule. Every pattern is anchored at both
	# ends, so a selector has to match one of them in its entirety.
	ELEMENT_SELECTOR = re.compile(r'^([\w-]+)$')  # name
	ELEMENT_WITH_ATTR_SELECTOR = re.compile(r'^([\w-]+)\[([\w-]+)\]$')  # name[attr]
	# name[attr='value']
	ELEMENT_WITH_ATTR_EXACT_SINGLE_QUOTE_SELECTOR = re.compile(r"^([\w-]+)\[([\w-]+)='(.*)'\]$")
	# name[attr="value"]
	ELEMENT_WITH_ATTR_EXACT_DOUBLE_QUOTE_SELECTOR = re.compile(r'^([\w-]+)\[([\w-]+)="(.*)"\]$')
	# name[attr=value]
	ELEMENT_WITH_ATTR_EXACT_UNQUOTED_SELECTOR = re.compile(r"^([\w-]+)\[([\w-]+)=([\w-]+)\]$")

	# Patterns for one name=value attribute inside an opening tag. Each contributes two capture
	# groups (name, value), so the combined ATTRIBUTE pattern has exactly six groups:
	# (1, 2) double-quoted, (3, 4) single-quoted, (5, 6) unquoted.
	DOUBLE_QUOTED_ATTRIBUTE = r'([\w-]+)\="([^"]*)"'
	# The value class must exclude the closing single quote. Written as `[^]*` the first `]` is
	# taken as a literal member of the class, which then runs on into the unquoted pattern.
	SINGLE_QUOTED_ATTRIBUTE = r"([\w-]+)\='([^']*)'"
	UNQUOTED_ATTRIBUTE = r'([\w-]+)\=([\w-]+)'
	# The three alternatives are separated by a plain `|`; an escaped `\|` would match a
	# literal pipe character and turn the alternation into one long sequence.
	ATTRIBUTE = re.compile(
	    r'(?:%s|%s|%s)' % (DOUBLE_QUOTED_ATTRIBUTE, SINGLE_QUOTED_ATTRIBUTE, UNQUOTED_ATTRIBUTE)
	)


	class Rule:
	    """
	    A CSS-like rule that an HTML element can match or not.

	    Subclasses set ``priority`` to a number. Rules compare by that number, so sorting a
	    list of rules puts the numerically lowest priority first.
	    """
	    priority = None

	    def __lt__(self, other):
	        # Define an ordering on Rule objects so that running sort() on a list of them
	        # orders by priority. Anything without a `priority` attribute is not comparable;
	        # returning NotImplemented lets Python raise its usual TypeError for that case
	        # instead of leaking an AttributeError from here.
	        try:
	            other_priority = other.priority
	        except AttributeError:
	            return NotImplemented
	        return self.priority < other_priority


	class ElementRule(Rule):
	    """Rule that matches every element with a given tag name, whatever its attributes."""
	    priority = 2  # lower than an element-with-attribute rule

	    def __init__(self, name, rewriter):
	        self.name = name
	        self.rewriter = rewriter

	    def attributes_match(self, attrs):
	        """Return True unconditionally: this rule does not care about attributes."""
	        return True


	class ElementWithAttributeRule(Rule):
	    """Rule that matches elements carrying a given attribute, regardless of its value."""
	    priority = 1  # higher than element rule

	    def __init__(self, name, attr, rewriter):
	        self.name = name
	        self.attr = attr
	        self.rewriter = rewriter

	    def attributes_match(self, attrs):
	        """Return True if the required attribute name is a key of `attrs`."""
	        return self.attr in attrs


	class ElementWithAttributeExactRule(Rule):
	    """Rule that matches elements whose given attribute equals a specific value."""
	    priority = 1  # higher than element rule

	    def __init__(self, name, attr, value, rewriter):
	        self.name = name
	        self.attr = attr
	        self.value = value
	        self.rewriter = rewriter

	    def attributes_match(self, attrs):
	        """Return True if `attrs` contains the attribute and its value equals `self.value`."""
	        return (self.attr in attrs) and (attrs[self.attr] == self.value)


	class HTMLRewriter:
	    """
	    Rewrites selected elements in a string of HTML according to CSS-like rules.

	    Each rule pairs a selector with a rewriter object. The supported selector forms are
	    ``name``, ``name[attr]``, ``name[attr=value]``, ``name[attr='value']`` and
	    ``name[attr="value"]``. A rewriter must implement one of:

	    * ``rewrite_attributes(name, attrs)`` - returns a dict of attributes that replaces the
	      attributes on the opening tag; the element's content and closing tag are left alone.
	    * ``rewrite_element(name, attrs, content)`` - returns markup that replaces the whole
	      element. The return value is HTML-escaped unless it is already a safe string.

	    If a rewriter implements both, ``rewrite_attributes`` is used.

	    Tags are located with regular expressions rather than a full HTML parser, and only
	    attributes written as ``name=value`` (quoted or unquoted) are recognised.
	    """

	    def __init__(self, rules):
	        # maps element name -> list of Rule objects, kept sorted by priority
	        self.rules_by_element = defaultdict(list)
	        self.add_rules(rules)

	    def add_rules(self, rules):
	        """
	        Register several rules at once.

	        Accepts either a dict of {selector: rewriter}, or a list of (selector, rewriter)
	        tuples.
	        """
	        if isinstance(rules, Mapping):
	            rules = rules.items()

	        for selector, rewriter in rules:
	            self.add_rule(selector, rewriter)

	    def add_rule(self, selector, rewriter):
	        """
	        Register `rewriter` for elements matching `selector`.

	        A selector in an unsupported format is not registered; a warning is issued instead.
	        """
	        rule = self._build_rule(selector, rewriter)
	        if rule is None:
	            warnings.warn("Unsupported selector format: %r" % selector)
	            return

	        rules = self.rules_by_element[rule.name]
	        rules.append(rule)
	        # list.sort() is stable, so rules of equal priority stay in registration order
	        rules.sort()

	    @staticmethod
	    def _build_rule(selector, rewriter):
	        """Parse `selector` into the corresponding Rule object, or None if unsupported."""
	        match = ELEMENT_SELECTOR.match(selector)
	        if match:
	            return ElementRule(match.group(1), rewriter)

	        match = ELEMENT_WITH_ATTR_SELECTOR.match(selector)
	        if match:
	            name, attr = match.groups()
	            return ElementWithAttributeRule(name, attr, rewriter)

	        for regex in (
	            ELEMENT_WITH_ATTR_EXACT_SINGLE_QUOTE_SELECTOR,
	            ELEMENT_WITH_ATTR_EXACT_DOUBLE_QUOTE_SELECTOR,
	            ELEMENT_WITH_ATTR_EXACT_UNQUOTED_SELECTOR
	        ):
	            match = regex.match(selector)
	            if match:
	                name, attr, value = match.groups()
	                return ElementWithAttributeExactRule(name, attr, value, rewriter)

	        return None

	    def rewrite(self, html):
	        """
	        Rewrite a string of HTML according to the configured rules
	        """
	        result, _ = self._rewrite(html)
	        return result

	    def _build_tag_regex(self, until_tag):
	        """
	        Compile the regexp used to scan for tags of interest.

	        It always matches an opening tag for any element name in the rule list; when
	        `until_tag` is given it additionally matches that closing tag. The groups are
	        (1) opening tag name, (2) raw attribute string, (3) closing tag name.
	        """
	        # matches any element name in our rule list, e.g. "(h1|a|embed)". The names are
	        # joined with a plain `|` so that they form a regexp alternation
	        element_name_re = "(%s)" % (
	            '|'.join(re.escape(name) for name in self.rules_by_element.keys())
	        )

	        # matches any opening tag with one of these names and any set of attributes
	        # - e.g. "<(h1|a|embed)(\b[^>]*)>"
	        opening_tag_re = r'<%s(\b[^>]*)>' % element_name_re

	        if not until_tag:
	            # no until_tag specified, so we're just looking for opening tags
	            return re.compile(opening_tag_re)

	        # our final regexp also needs to match the specified closing tag
	        closing_tag_re = r'</(%s)>' % re.escape(until_tag)
	        return re.compile(r'(?:%s|%s)' % (opening_tag_re, closing_tag_re))

	    def _find_matching_rule(self, tag_name, attrs):
	        """Return the first rule (in priority order) for `tag_name` that accepts `attrs`."""
	        for rule in self.rules_by_element[tag_name]:
	            if rule.attributes_match(attrs):
	                return rule
	        return None

	    @staticmethod
	    def _build_opening_tag(tag_name, attrs, is_self_closing):
	        """Serialise an opening tag with `attrs`, escaping names and values as needed."""
	        attr_string = ' '.join(
	            '%s="%s"' % (conditional_escape(key), conditional_escape(val))
	            for key, val in attrs.items()
	        )
	        if is_self_closing:
	            return '<%s %s/>' % (tag_name, attr_string)
	        return '<%s %s>' % (tag_name, attr_string)

	    def _rewrite(self, html, *, start=0, until_tag=None):
	        """
	        Internal method used recursively by `rewrite`.
	        Rewrites a string of HTML, starting from offset `start`, until it encounters either the end
	        of the string, or a closing tag as specified in `until_tag` that is NOT balanced by an
	        opening tag seen in the current invocation of `_rewrite`.

	        In other words,

	            some text with <span>an extra span</span> in it</span>
	                                                           ^^^^^^^ this one.

	        Upon reaching either end condition (closing tag or end of string), it will return a tuple
	        of the rewritten HTML string and the offset of any subsequent not-yet-processed HTML.
	        """

	        if not self.rules_by_element:
	            # no rewrite rules, so nothing to do. Yay!
	            return mark_safe(html[start:]), len(html)

	        position = start  # current position within the html string
	        pieces = []  # fragments of rewritten HTML, joined once when we return

	        # the number of pending occurrences of until_tag that we will skip past before treating it
	        # as the ACTUAL closing tag
	        ignored_closing_tag_count = 0

	        final_re = self._build_tag_regex(until_tag)

	        while True:
	            match = final_re.search(html, position)
	            if not match:
	                # we have reached the end of the HTML string with no more elements to rewrite.
	                # This shouldn't happen if we were expecting a closing tag, so warn in that case
	                if until_tag:
	                    warnings.warn(
	                        "Reached end of string without encountering closing %r tag" % until_tag
	                    )

	                # either way, we can tack the remaining non-rewritten HTML on to our result,
	                # and we're done
	                pieces.append(html[position:])
	                return mark_safe(''.join(pieces)), len(html)

	            # we've found a tag that we're interested in, but first we should add all of the
	            # non-rewritten HTML up to that point on to our result
	            pieces.append(html[position:match.start()])

	            # once we've finished handling this tag, we'll continue from the offset after it
	            position = match.end()

	            tag = match.group(0)

	            if tag.startswith('</'):
	                # the only closing tag the regexp can match is until_tag
	                if ignored_closing_tag_count == 0:
	                    # this is the real closing tag, so we're done with this invocation of
	                    # _rewrite. The tag itself is consumed but not output - the caller
	                    # replaces the whole element
	                    return mark_safe(''.join(pieces)), position

	                # this tag is closing a tag that was previously opened (and output) in this
	                # invocation of _rewrite - it isn't the *real* closing tag. It must be
	                # copied to the output, otherwise the nested element is left unclosed
	                ignored_closing_tag_count -= 1
	                pieces.append(tag)
	                continue

	            # an opening tag: parse it into its components
	            tag_name = match.group(1)
	            attr_string = match.group(2)
	            is_self_closing = attr_string.endswith('/')
	            if is_self_closing:
	                attr_string = attr_string[:-1]

	            attrs = self.unpack_attr_string(attr_string)
	            matching_rule = self._find_matching_rule(tag_name, attrs)

	            if matching_rule is None:
	                # no matching rewrite rule, so output this tag unchanged
	                pieces.append(tag)

	                if tag_name == until_tag and not is_self_closing:
	                    # this tag has the same name as the closing tag we're waiting for,
	                    # so the next occurrence of the closing tag will be closing this one,
	                    # rather than the *real* closing tag that signals the end of this
	                    # invocation of _rewrite
	                    ignored_closing_tag_count += 1

	            elif hasattr(matching_rule.rewriter, 'rewrite_attributes'):
	                # If the rewrite rule has a rewrite_attributes method, then we output the
	                # original tag with the updated attributes. The closing tag passes through
	                # unchanged; however, if it happens to be the same as until_tag, we need to
	                # bump up ignored_closing_tag_count so that we'll skip over it rather than
	                # treating it as the *real* closing tag
	                new_attrs = matching_rule.rewriter.rewrite_attributes(tag_name, attrs)
	                pieces.append(self._build_opening_tag(tag_name, new_attrs, is_self_closing))

	                if tag_name == until_tag and not is_self_closing:
	                    ignored_closing_tag_count += 1

	            elif hasattr(matching_rule.rewriter, 'rewrite_element'):
	                if is_self_closing:
	                    # this element has no content, so just call rewrite_element with an
	                    # empty string as content
	                    content = ''
	                else:
	                    # we need to consume the element content by spinning up a recursive
	                    # call to _rewrite, which also tells us where to resume scanning
	                    content, position = self._rewrite(html, start=position, until_tag=tag_name)

	                rewritten_element = matching_rule.rewriter.rewrite_element(
	                    tag_name, attrs, content
	                )

	                # escape the result of rewrite_element, unless it's a safe string
	                # (e.g. the output of format_html)
	                pieces.append(conditional_escape(rewritten_element))

	            else:
	                raise Exception(
	                    "Invalid ElementRewriter: %r. An ElementRewriter must implement "
	                    "either rewrite_element or rewrite_attributes"
	                    % matching_rule.rewriter
	                )

	    @staticmethod
	    def unpack_attr_string(attr_string):
	        """
	        Unpack a string of HTML attributes into a dict of unescaped strings

	        Only ``name=value`` attributes matched by ATTRIBUTE are picked up; if a name
	        occurs more than once, the last occurrence wins.
	        """
	        attributes = {}
	        for match in ATTRIBUTE.finditer(attr_string):
	            # ATTRIBUTE's groups come in (name, value) pairs, one pair per quoting style;
	            # exactly one pair is populated for any given match
	            groups = match.groups()
	            for name, val in zip(groups[0::2], groups[1::2]):
	                if name:
	                    attributes[name] = html.unescape(val)
	                    break

	        return attributes


	class ElementRewriter:
	    """
	    Base class for rewriter objects registered with HTMLRewriter.

	    It deliberately defines neither `rewrite_element` nor `rewrite_attributes`:
	    HTMLRewriter uses hasattr() to find out which of the two a rewriter implements.
	    """


	class ImageRewriter(ElementRewriter):
	    """Example rewriter that replaces an element with an <img> tag built from its attributes."""

	    def rewrite_element(self, name, attributes, content):
	        # format_html escapes the interpolated values and returns a safe string;
	        # a missing `id` or `alt` attribute raises KeyError
	        return format_html(
	            '<img src="/images/image-{}.jpg" alt="{}" />',
	            attributes['id'],
	            attributes['alt']
	        )


	class PageLinkRewriter(ElementRewriter):
	    """Example rewriter that swaps a tag's attributes for an href/class pair built from `id`."""

	    def rewrite_attributes(self, name, attributes):
	        # `id` is required; a missing one raises KeyError
	        return {
	            'href': '/pages/page-%s' % attributes['id'],
	            'class': 'page',
	        }


	class IntroParagraphRewriter(ElementRewriter):
	    """Example rewriter that wraps the element's content in <p class="intro">."""

	    def rewrite_element(self, name, attributes, content):
	        return format_html('<p class="intro">{}</p>', content)


	class ROT13Rewriter(ElementRewriter):
	    """Example rewriter that replaces an element with the ROT13 encoding of its content."""

	    def rewrite_element(self, name, attributes, content):
	        import codecs
	        return codecs.encode(content, 'rot_13')


	rewriter = HTMLRewriter({
	"embed[embedtype='image']": ImageRewriter(),
	"a[linktype='page']": PageLinkRewriter(),
	"intro": IntroParagraphRewriter(),
	"rot13": ROT13Rewriter(),
	})


	db_html = """
	<h1>HTML Rewriter example</h1>
	<intro><strong>First</strong>, we have a paragraph with <a linktype="page" id="123">a <em>wonderful</em> link</a> in it.</intro>
	<p>This <a href="https://torchbox.com/">external link</a> doesn't get rewritten though. Now for an image:</p>
	<embed embedtype="image" id="456" alt="I <3 kittens" />
	<p>But all we really care about is that <rot13>Darth Vader is Luke's father</rot13>.</p>
	"""

	print(rewriter.rewrite(db_html))


	"""
	Output:

	<h1>HTML Rewriter example</h1>
	<p class="intro"><strong>First</strong>, we have a paragraph with <a href="/pages/page-123" class="page">a <em>wonderful</em> link</a> in it.</p>
	<p>This <a href="https://torchbox.com/">external link</a> doesn't get rewritten though. Now for an image:</p>
	<img src="/images/image-456.jpg" alt="I <3 kittens" />
	<p>But all we really care about is that Qnegu Inqre vf Yhxr'f sngure.</p>
	"""