Skip to content

Instantly share code, notes, and snippets.

@gasman
Created May 13, 2020 01:24
Show Gist options
  • Save gasman/e640f6e12609ffe43718403203fd8d65 to your computer and use it in GitHub Desktop.
Save gasman/e640f6e12609ffe43718403203fd8d65 to your computer and use it in GitHub Desktop.
from collections import defaultdict
from collections.abc import Mapping
import html
import re
import warnings
from django.utils.html import conditional_escape, format_html
from django.utils.safestring import mark_safe
# Selector grammars understood by HTMLRewriter.add_rule.  Each pattern is anchored, so a
# selector must match one of these forms in its entirety:
#   name                 -> group 1 = element name
#   name[attr]           -> groups = (element name, attribute name)
#   name[attr='value']   -> groups = (element name, attribute name, value)
#   name[attr="value"]
#   name[attr=value]
ELEMENT_SELECTOR = re.compile(r'^([\w-]+)$')
ELEMENT_WITH_ATTR_SELECTOR = re.compile(r'^([\w-]+)\[([\w-]+)\]$')
ELEMENT_WITH_ATTR_EXACT_SINGLE_QUOTE_SELECTOR = re.compile(r"^([\w-]+)\[([\w-]+)='(.*)'\]$")
ELEMENT_WITH_ATTR_EXACT_DOUBLE_QUOTE_SELECTOR = re.compile(r'^([\w-]+)\[([\w-]+)="(.*)"\]$')
ELEMENT_WITH_ATTR_EXACT_UNQUOTED_SELECTOR = re.compile(r"^([\w-]+)\[([\w-]+)=([\w-]+)\]$")

# Patterns for a single name=value attribute inside a tag.  Each alternative contributes
# exactly two groups (name, value), so in the combined ATTRIBUTE pattern the pairs are
# (1, 2) double-quoted, (3, 4) single-quoted and (5, 6) unquoted.
DOUBLE_QUOTED_ATTRIBUTE = r'([\w-]+)\="([^"]*)"'
# The value class must be [^']* ("anything but a single quote").  Written as [^]* the
# leading "]" is taken as a literal member of the class, which then runs on into the next
# alternative and silently merges the single-quoted and unquoted patterns into one.
SINGLE_QUOTED_ATTRIBUTE = r"([\w-]+)\='([^']*)'"
UNQUOTED_ATTRIBUTE = r'([\w-]+)\=([\w-]+)'
ATTRIBUTE = re.compile(
    r'(?:%s|%s|%s)' % (DOUBLE_QUOTED_ATTRIBUTE, SINGLE_QUOTED_ATTRIBUTE, UNQUOTED_ATTRIBUTE)
)
class Rule:
    """Base class for a CSS-like rule that an HTML element either matches or does not.

    Subclasses set ``priority`` to a number; a smaller number sorts earlier.
    """

    # Placeholder only: concrete subclasses override this with a comparable value.
    priority = None

    def __lt__(self, other):
        """Order rules by ``priority`` so that ``sort()`` on a list of rules ranks them."""
        mine, theirs = self.priority, other.priority
        return mine < theirs
class ElementRule(Rule):
    """Rule that matches every element with a given tag name, whatever its attributes."""

    # Larger number than the attribute-based rules, so it sorts after them.
    priority = 2

    def __init__(self, name, rewriter):
        self.name, self.rewriter = name, rewriter

    def attributes_match(self, attrs):
        """Return True unconditionally: attributes play no part in this rule."""
        return True
class ElementWithAttributeRule(Rule):
    """Rule that matches elements of a given name carrying a given attribute (any value)."""

    # Smaller number than ElementRule, so it sorts ahead of a plain element rule.
    priority = 1

    def __init__(self, name, attr, rewriter):
        self.name, self.attr, self.rewriter = name, attr, rewriter

    def attributes_match(self, attrs):
        """Return True when the required attribute name is present in ``attrs``."""
        required = self.attr
        return required in attrs
class ElementWithAttributeExactRule(Rule):
    """Rule that matches elements of a given name whose attribute equals an exact value."""

    # Smaller number than ElementRule, so it sorts ahead of a plain element rule.
    priority = 1

    def __init__(self, name, attr, value, rewriter):
        self.name, self.attr = name, attr
        self.value, self.rewriter = value, rewriter

    def attributes_match(self, attrs):
        """Return True when ``attrs`` has the attribute and its value equals ``self.value``."""
        if self.attr not in attrs:
            return False
        return attrs[self.attr] == self.value
class HTMLRewriter:
    """Rewrite elements of an HTML string according to CSS-like selector rules.

    Each rule pairs a selector with a rewriter object. Supported selector forms are
    ``name``, ``name[attr]`` and ``name[attr=value]`` (value bare, single-quoted or
    double-quoted). A rewriter must provide one of:

    * ``rewrite_attributes(name, attrs)`` - returns a dict of replacement attributes. The
      tag keeps its name; the element's content carries on being scanned as usual.
    * ``rewrite_element(name, attrs, content)`` - returns replacement markup for the whole
      element, given its (already rewritten) content.

    If a rewriter has both methods, ``rewrite_attributes`` is used.

    Scanning is regex-based: only tags whose names appear in the rules are inspected, and
    all other text is copied to the output untouched.
    """

    def __init__(self, rules):
        """Create a rewriter from ``rules`` (see ``add_rules`` for the accepted shapes)."""
        self.rules_by_element = defaultdict(list)
        self.add_rules(rules)

    def add_rules(self, rules):
        """Register several rules at once.

        ``rules`` is either a mapping of ``{selector: rewriter}`` or an iterable of
        ``(selector, rewriter)`` pairs.
        """
        if isinstance(rules, Mapping):
            rules = rules.items()
        for selector, rewriter in rules:
            self.add_rule(selector, rewriter)

    def add_rule(self, selector, rewriter):
        """Register one rule; emits a warning and does nothing if the selector is unsupported."""
        rule = self._build_rule(selector, rewriter)
        if rule is None:
            warnings.warn("Unsupported selector format: %r" % selector)
            return
        bucket = self.rules_by_element[rule.name]
        bucket.append(rule)
        # keep each element's rules ordered by priority; sort() is stable, so rules of
        # equal priority stay in the order they were added
        bucket.sort()

    @staticmethod
    def _build_rule(selector, rewriter):
        """Parse ``selector`` into a Rule object, or return None if no grammar matches."""
        match = ELEMENT_SELECTOR.match(selector)
        if match:
            return ElementRule(match.group(1), rewriter)

        match = ELEMENT_WITH_ATTR_SELECTOR.match(selector)
        if match:
            name, attr = match.groups()
            return ElementWithAttributeRule(name, attr, rewriter)

        for regex in (
            ELEMENT_WITH_ATTR_EXACT_SINGLE_QUOTE_SELECTOR,
            ELEMENT_WITH_ATTR_EXACT_DOUBLE_QUOTE_SELECTOR,
            ELEMENT_WITH_ATTR_EXACT_UNQUOTED_SELECTOR,
        ):
            match = regex.match(selector)
            if match:
                name, attr, value = match.groups()
                return ElementWithAttributeExactRule(name, attr, value, rewriter)

        return None

    def rewrite(self, html):
        """
        Rewrite a string of HTML according to the configured rules.

        Returns the rewritten HTML, marked as a safe string.
        """
        result, _ = self._rewrite(html)
        return result

    def _compile_tag_regex(self, until_tag):
        """Build the regex that finds the next tag of interest.

        The pattern has the shape ``<(name)(attrs)>`` and, when ``until_tag`` is given,
        ``<(name)(attrs)>|</(until_tag)>``. Group 1 is the opening tag name, group 2 its
        raw attribute string and group 3 the closing tag name.
        """
        # any element name in our rule list, e.g. "h1|a|embed"
        names = '|'.join(re.escape(name) for name in self.rules_by_element)
        # The name must not be followed by another name character. A plain \b is not
        # enough here: selectors allow hyphens, and \b would let a rule for "my" claim
        # the tag <my-element>.
        pattern = r'<(%s)(?![\w-])([^>]*)>' % names
        if until_tag:
            pattern = r'(?:%s|</(%s)>)' % (pattern, re.escape(until_tag))
        # compiled so that the search can start from an arbitrary offset
        return re.compile(pattern)

    def _find_rule(self, tag_name, attrs):
        """Return the first (highest-priority) rule for ``tag_name`` matching ``attrs``, or None."""
        # .get() rather than indexing, so that a lookup never adds keys to the defaultdict
        for rule in self.rules_by_element.get(tag_name, ()):
            if rule.attributes_match(attrs):
                return rule
        return None

    @staticmethod
    def _render_tag(tag_name, attrs, is_self_closing):
        """Serialise an opening tag, escaping attribute names and values unless marked safe."""
        attr_string = ' '.join(
            '%s="%s"' % (conditional_escape(key), conditional_escape(val))
            for key, val in attrs.items()
        )
        closer = '/>' if is_self_closing else '>'
        if not attr_string:
            # avoid a dangling space ("<a >") when the rewriter returned no attributes
            return '<%s%s' % (tag_name, closer)
        return '<%s %s%s' % (tag_name, attr_string, closer)

    def _rewrite(self, html, *, start=0, until_tag=None):
        """
        Internal method used recursively by `rewrite`.

        Rewrites a string of HTML, starting from offset `start`, until it encounters either
        the end of the string, or a closing tag as specified in `until_tag` that is NOT
        balanced by an opening tag seen in the current invocation of `_rewrite`.
        In other words,

            some text with <span>an extra span</span> in it</span>
                                                           ^^^^^^^ this one.

        Upon reaching either end condition (closing tag or end of string), it returns a
        tuple of the rewritten HTML (marked safe) and the offset of any subsequent
        not-yet-processed HTML. The terminating closing tag itself is consumed but not
        included in the returned HTML.

        Raises TypeError if a matching rule's rewriter implements neither
        `rewrite_element` nor `rewrite_attributes`.
        """
        if not self.rules_by_element:
            # no rewrite rules, so nothing to do
            return mark_safe(html[start:]), len(html)

        tag_re = self._compile_tag_regex(until_tag)
        position = start  # current position within the html string
        chunks = []  # pieces of rewritten HTML, joined on return

        # the number of pending occurrences of until_tag that we will skip past before
        # treating one as the ACTUAL closing tag
        ignored_closing_tag_count = 0

        while True:
            match = tag_re.search(html, position)
            if not match:
                # End of string with no more elements to rewrite. This shouldn't happen
                # if we were expecting a closing tag, so warn in that case.
                if until_tag:
                    warnings.warn(
                        "Reached end of string without encountering closing %r tag" % until_tag
                    )
                chunks.append(html[position:])
                return mark_safe(''.join(chunks)), len(html)

            # copy through everything before the tag, then continue from just after it
            chunks.append(html[position:match.start()])
            position = match.end()
            tag = match.group(0)

            if tag.startswith('</'):
                if ignored_closing_tag_count == 0:
                    # the real closing tag: this invocation is finished
                    return mark_safe(''.join(chunks)), position
                # This closes a same-named tag that was opened (and emitted) earlier in
                # this invocation, so it belongs in the output alongside that opening tag.
                ignored_closing_tag_count -= 1
                chunks.append(tag)
                continue

            tag_name = match.group(1)
            attr_string = match.group(2)
            is_self_closing = attr_string.endswith('/')
            if is_self_closing:
                attr_string = attr_string[:-1]
            attrs = self.unpack_attr_string(attr_string)

            rule = self._find_rule(tag_name, attrs)
            if rule is None:
                # no matching rewrite rule, so output this tag unchanged
                chunks.append(tag)
            elif hasattr(rule.rewriter, 'rewrite_attributes'):
                # output the original tag name with the updated attributes; the closing
                # tag is left alone and flows through as ordinary text
                new_attrs = rule.rewriter.rewrite_attributes(tag_name, attrs)
                chunks.append(self._render_tag(tag_name, new_attrs, is_self_closing))
            elif hasattr(rule.rewriter, 'rewrite_element'):
                if is_self_closing:
                    # this element has no content
                    content = ''
                else:
                    # consume the element content (up to its own closing tag) with a
                    # recursive call, which also advances our position past it
                    content, position = self._rewrite(
                        html, start=position, until_tag=tag_name
                    )
                rewritten_element = rule.rewriter.rewrite_element(tag_name, attrs, content)
                # escape the result of rewrite_element, unless it's a safe string
                # (e.g. the output of format_html)
                chunks.append(conditional_escape(rewritten_element))
                continue
            else:
                raise TypeError(
                    "Invalid ElementRewriter: %r. An ElementRewriter must implement "
                    "either rewrite_element or rewrite_attributes"
                    % rule.rewriter
                )

            if tag_name == until_tag and not is_self_closing:
                # We emitted an opening tag with the same name as the closing tag we are
                # waiting for, so the next such closing tag pairs with this one rather
                # than ending this invocation.
                ignored_closing_tag_count += 1

    @staticmethod
    def unpack_attr_string(attr_string):
        """Unpack a string of HTML attributes into a dict of unescaped strings"""
        attributes = {}
        for match in ATTRIBUTE.finditer(attr_string):
            groups = match.groups()
            # ATTRIBUTE lays its groups out as consecutive (name, value) pairs, one pair
            # per quoting style; only the pair of the alternative that matched is filled
            for name, val in zip(groups[0::2], groups[1::2]):
                if name:
                    attributes[name] = html.unescape(val)
                    break
        return attributes
class ElementRewriter:
    """Empty base class for rewriter objects; it defines no methods of its own."""
class ImageRewriter(ElementRewriter):
    """Example rewriter that replaces a whole element with an ``<img>`` tag."""

    def rewrite_element(self, name, attributes, content):
        """Build the ``<img>`` from the ``id`` and ``alt`` attributes.

        ``name`` and ``content`` are ignored. Raises KeyError if either attribute is absent.
        """
        image_id = attributes['id']
        alt_text = attributes['alt']
        template = '<img src="/images/image-{}.jpg" alt="{}" />'
        return format_html(template, image_id, alt_text)
class PageLinkRewriter(ElementRewriter):
    """Example rewriter that swaps a tag's attributes for a page ``href`` and a CSS class."""

    def rewrite_attributes(self, name, attributes):
        """Return the replacement attribute dict, derived from the ``id`` attribute.

        Only ``href`` and ``class`` are returned; the incoming attributes are not carried
        over. Raises KeyError if ``id`` is absent.
        """
        page_url = '/pages/page-%s' % attributes['id']
        replacement = {}
        replacement['href'] = page_url
        replacement['class'] = 'page'
        return replacement
class IntroParagraphRewriter(ElementRewriter):
    """Example rewriter that wraps an element's content in ``<p class="intro">``."""

    def rewrite_element(self, name, attributes, content):
        """Return the paragraph markup; ``name`` and ``attributes`` are ignored."""
        template = '<p class="intro">{}</p>'
        return format_html(template, content)
class ROT13Rewriter(ElementRewriter):
    """Example rewriter that replaces an element with the ROT13 encoding of its content."""

    def rewrite_element(self, name, attributes, content):
        """Return ``content`` passed through the ``rot_13`` codec.

        ``name`` and ``attributes`` are ignored. The result is not marked safe here.
        """
        from codecs import encode
        return encode(content, 'rot_13')
# Demonstration: configure a rewriter from (selector, rewriter) pairs, run it over a
# sample document and print the result.
rewriter = HTMLRewriter([
    ("embed[embedtype='image']", ImageRewriter()),
    ("a[linktype='page']", PageLinkRewriter()),
    ("intro", IntroParagraphRewriter()),
    ("rot13", ROT13Rewriter()),
])

# Sample input using the custom <intro>, <embed>, <rot13> and linktype="page" markup.
db_html = """
<h1>HTML Rewriter example</h1>
<intro><strong>First</strong>, we have a paragraph with <a linktype="page" id="123">a <em>wonderful</em> link</a> in it.</intro>
<p>This <a href="https://torchbox.com/">external link</a> doesn't get rewritten though. Now for an image:</p>
<embed embedtype="image" id="456" alt="I &lt;3 kittens" />
<p>But all we really care about is that <rot13>Darth Vader is Luke's father</rot13>.</p>
"""

print(rewriter.rewrite(db_html))

# The string below records the expected output of the print call above.
"""
Output:
<h1>HTML Rewriter example</h1>
<p class="intro"><strong>First</strong>, we have a paragraph with <a href="/pages/page-123" class="page">a <em>wonderful</em> link</a> in it.</p>
<p>This <a href="https://torchbox.com/">external link</a> doesn't get rewritten though. Now for an image:</p>
<img src="/images/image-456.jpg" alt="I &lt;3 kittens" />
<p>But all we really care about is that Qnegu Inqre vf Yhxr&#x27;f sngure.</p>
"""
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment