Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
this standardizes unicode codepoints to html entities when possible.
# -*- coding: utf-8 -*-
"""
Translate unicode codepoints in the input into *NAMED* html entities.
A future version may also handle the other entities supported by the html spec.
This does not escape unsafe html into entities: lots of libraries already do
that, and this function is likely to be used in a pipeline that does that too.
It simply standardizes unicode codepoints into html entities.
"""
from six.moves.html_entities import codepoint2name
# Code points that are deliberately kept out of the translation table:
#   38  &  (&amp;)   translating it would break everything, as & is the
#                    control character of every entity
#   34  "  (&quot;)  unsafe in html, but this module is not sanitizing
#   60  <  (&lt;)    unsafe in html, but this module is not sanitizing
#   62  >  (&gt;)    unsafe in html, but this module is not sanitizing
_untranslated_codepoints = frozenset((38, 34, 60, 62))

# Custom version of codepoint2name, minus the code points listed above.
codepoint2name_custom = {
    codepoint: name
    for codepoint, name in codepoint2name.items()
    if codepoint not in _untranslated_codepoints
}

# Translation table: code point -> u'&name;' replacement text.
unicode_to_entity = {
    codepoint: u'&%s;' % name
    for codepoint, name in codepoint2name_custom.items()
}
def unicode_to_entity_transation(input):
    """Replace characters in *input* with their named html entities.

    Every character whose code point is a key of the module-level
    ``unicode_to_entity`` table is replaced by that table's ``&name;``
    text; all other characters are passed through unchanged.

    The misspelled function name ("transation") and the ``input``
    parameter name are kept as they are so existing callers keep working.

    :param input: the unicode text to translate
    :return: a new string with the mapped characters replaced
    """
    return input.translate(unicode_to_entity)
if __name__ == '__main__':
    # Self-check, executed only when the module is run as a script.
    # The sample mixes entities that are already written out, raw
    # characters that have a named entity (the bullet and the club suit)
    # and characters that must pass through untouched (&, ", <, >, ',
    # emoji).
    sample_text = u"""&amp; & &bull; • ♣ # ' " &quot; &apos; &lt; &gt; <> 🙃🙃&"""
    # Only the raw bullet and club suit are expected to change.
    expectedout = u"""&amp; & &bull; &bull; &clubs; # ' " &quot; &apos; &lt; &gt; <> 🙃🙃&"""
    generated = unicode_to_entity_transation(sample_text)
    assert expectedout == generated
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.