jvanasco/standardize_unicode.py

## standardize_unicode.py
# -*- coding: utf-8 -*-
"""
Translates unicode codepoints in the input into *NAMED* html entities.
A future version may also handle the other entities supported by the html spec.

This does not escape unsafe html into entities, as lots of libraries do that,
and this function is likely to be used in a pipeline that does that too.

This simply standardizes unicode codepoints into html entities.
"""

from six.moves.html_entities import codepoint2name

# codepoints that must pass through untranslated:
#   38 (&) is the control character of every entity, so translating it would
#          break everything
#   34 ("), 60 (<), 62 (>) are indeed unsafe html, but sanitizing is not done
#          here
_untranslated_codepoints = (38, 34, 60, 62)

# custom version of codepoint2name, without the codepoints listed above
codepoint2name_custom = {
    codepoint: name
    for codepoint, name in codepoint2name.items()
    if codepoint not in _untranslated_codepoints
}

# translation table: codepoint -> u'&name;'
unicode_to_entity = dict(
    (codepoint, u'&%s;' % name)
    for codepoint, name in codepoint2name_custom.items()
)

def unicode_to_entity_transation(input):
    """
    runs `input.translate` with the module-level `unicode_to_entity` table and
    returns the result: every character whose codepoint is a key of that table
    is replaced by the mapped `&name;` string, all other characters are kept.

    `input` is presumably a unicode string -- the table is keyed by integer
    codepoints; confirm against callers.
    """
    translated = input.translate(unicode_to_entity)
    return translated

if __name__ == '__main__':
    # self-check, only executed when the file is run as a script.
    # per the expected string: entities already present, & " < > ' # and the
    # emoji come back unchanged; the raw bullet and club suit come back as
    # named entities.
    expectedout = u"""&amp; & &bull; &bull; &clubs; # ' " &quot; &apos; &lt; &gt; <> 🙃🙃&"""
    sample_text = u"""&amp; & &bull; • ♣ # ' " &quot; &apos; &lt; &gt; <> 🙃🙃&"""
    assert expectedout == unicode_to_entity_transation(sample_text)
	# -*- coding: utf-8 -*-
# NOTE(review): everything from here down repeats the definitions that appear
# earlier in this file. This copy had lost its indentation (the bodies of the
# `def` and the `if` were not indented) and had its html entities decoded, so
# it could neither be parsed nor pass its own self-check. It is restored to
# valid code here -- consider deleting one of the two copies.
#
# this translates unicode codepoints in the input into *NAMED* html entities
# a future version may do the html spec supported entities as well
#
# this does not escape unsafe html into entities, as lots of libraries do that and
# this function is likely to be used in a pipeline that does that too.
#
# this simply standardizes unicode points into html entities.

# `codepoint2name` is already imported at the top of this file
# (from six.moves.html_entities), so it is not imported a second time here.

# we shall start with a custom version of codepoint2name...
codepoint2name_custom = dict(codepoint2name.items())
# however it should not translate the following
# this will break everything, as & is the control character
del codepoint2name_custom[38]  # & &amp;
# the following are indeed unsafe, but we're not sanitizing
del codepoint2name_custom[34]  # " &quot;
del codepoint2name_custom[60]  # < &lt;
del codepoint2name_custom[62]  # > &gt;

# translation table: codepoint -> u'&name;'
unicode_to_entity = {k: (u'&%s;' % v)
                     for k, v in codepoint2name_custom.items()
                     }


def unicode_to_entity_transation(input):
    """
    returns `input.translate(unicode_to_entity)`: every character whose
    codepoint is a key of `unicode_to_entity` is replaced by the mapped
    `&name;` string, all other characters are kept.

    `input` is presumably a unicode string -- the table is keyed by integer
    codepoints; confirm against callers.
    """
    return input.translate(unicode_to_entity)


if __name__ == '__main__':
    # the entity-decoded strings of the mangled copy expected the raw bullet
    # to stay untranslated, although codepoint 8226 (bull) is in the table;
    # both strings are restored to their entity-preserving form.
    sample_text = u"""&amp; & &bull; • ♣ # ' " &quot; &apos; &lt; &gt; <> 🙃🙃&"""
    expectedout = u"""&amp; & &bull; &bull; &clubs; # ' " &quot; &apos; &lt; &gt; <> 🙃🙃&"""
    generated = unicode_to_entity_transation(sample_text)
    assert expectedout == generated