riccardomurri/asciify.py

## asciify.py
#! /usr/bin/env python
# -*- encoding: utf-8 -*-
#
"""
Two simple functions for rendering a Unicode string using ASCII
characters only.

The only sensible applications are on words in a Latin-derived
alphabet (i.e., anything that could be rendered in an ISO-8859-*
character set); anything else will just be replaced by a string of
"unprintable" marks.
"""


def asciify_crudely(unistr, default='?'):
    # NOTE: the docstring has to be a *unicode* literal, otherwise the
    # doctests (which contain non-ASCII text) fail under Python 2.
    u"""
    Render unicode string `unistr` using ASCII characters only.

    The rendering is quite crude in that any non-ASCII character is
    just replaced with the glyph `?`::

      >>> asciify_crudely(u'pâté')
      'p?t?'
      >>> asciify_crudely(u'PÂTÉ')
      'P?T?'

    The character to be substituted for untranslatable characters can
    be passed as second argument::

      >>> asciify_crudely(u'pâté', '*')
      'p*t*'

    :param unistr: text to convert; it is scanned one character at a
      time.
    :param default: string emitted in place of each non-ASCII
      character.
    :return: the ASCII characters of `unistr`, in order, with one copy
      of `default` for every non-ASCII character.
    """
    # Compare code points instead of calling `.decode('ascii')`: text
    # strings have no `.decode` method on Python 3, and the old bare
    # `except:` turned the resulting AttributeError into "replace
    # every character".  `str()` keeps the native string type on both
    # Python 2 and Python 3.
    return ''.join(
        str(char) if ord(char) < 128 else default
        for char in unistr)


def latinify(unistr, default='?'):
    # NOTE: the docstring has to be a *unicode* literal, otherwise the
    # doctests (which contain non-ASCII text) fail under Python 2.
    u"""
    Render unicode string `unistr` using ASCII characters only.

    ASCII characters are copied unchanged.  Latin letters with
    diacritical marks are substituted with their "bare" equivalent::

      >>> latinify(u'pâté')
      'pate'
      >>> latinify(u'PÂTÉ')
      'PATE'

    Letters which have no direct equivalent in the latin alphabet are
    replaced with the glyph `?`::

      >>> latinify(u'Sigurður Þórarinsson')
      'Sigur?ur ?orarinsson'

    The character to be substituted for untranslatable characters can
    be passed as second argument::

      >>> latinify(u'Sigurður Þórarinsson', '*')
      'Sigur*ur *orarinsson'

    :param unistr: text to convert; it is scanned one character at a
      time.
    :param default: string emitted in place of each character that
      cannot be mapped to an ASCII letter.
    :return: the converted string.
    """
    from unicodedata import name

    converted = []
    for unich in unistr:
        if ord(unich) < 128:
            # Already ASCII: copy it through.  Comparing the code point
            # (rather than calling `.decode('ascii')`, which text
            # strings lack on Python 3) keeps spaces, digits and
            # punctuation intact on every Python version.
            converted.append(str(unich))
            continue
        # Deduce a latin letter equivalent from the Unicode data
        # point name; e.g., since `name(u'á') == 'LATIN SMALL
        # LETTER A WITH ACUTE'` translate `á` to `a`.  However, in
        # some cases the unicode name is still "LATIN ... LETTER"
        # although no direct equivalent in the Latin alphabet
        # exists (e.g., Þ, "LATIN CAPITAL LETTER THORN") -- we can
        # avoid these cases by checking that the letter name is
        # composed of one letter only.
        #
        # Passing '' as fallback makes unnamed code points yield an
        # empty word list instead of raising ValueError; the length
        # check guards against short names such as "LATIN CROSS".
        what = name(unich, '').split()
        if (len(what) >= 4 and what[0] == 'LATIN'
                and what[2] == 'LETTER' and len(what[3]) == 1):
            if what[1] == 'SMALL':
                converted.append(what[3].lower())
            else:  # what[1] == 'CAPITAL'
                converted.append(what[3].upper())
        else:
            converted.append(default)
    return ''.join(converted)


if __name__ == '__main__':
    # Script entry point: run the doctests embedded in this module's
    # docstrings, reporting them under the module name 'asciify' and
    # treating runs of whitespace as equivalent when comparing output.
    import doctest

    doctest.testmod(
        optionflags=doctest.NORMALIZE_WHITESPACE,
        name='asciify',
    )
	#! /usr/bin/env python
	# -*- encoding: utf-8 -*-
	#
	"""
	Two simple functions for rendering a Unicode string using ASCII
	characters only.

	The only sensible applications are on words in a Latin-derived
	alphabet (i.e., anything that could be rendered in an ISO-8859-*
	character set); anything else will just be replaced by a string of
	"unprintable" marks.
	"""


	def asciify_crudely(unistr, default='?'):
		# NOTE: the docstring has to be a *unicode* literal, otherwise the
		# doctests (which contain non-ASCII text) fail under Python 2.
		u"""
		Render unicode string `unistr` using ASCII characters only.

		The rendering is quite crude in that any non-ASCII character is
		just replaced with the glyph `?`::

		  >>> asciify_crudely(u'pâté')
		  'p?t?'
		  >>> asciify_crudely(u'PÂTÉ')
		  'P?T?'

		The character to be substituted for untranslatable characters can
		be passed as second argument::

		  >>> asciify_crudely(u'pâté', '*')
		  'p*t*'

		:param unistr: text to convert; it is scanned one character at a
		  time.
		:param default: string emitted in place of each non-ASCII
		  character.
		:return: the ASCII characters of `unistr`, in order, with one copy
		  of `default` for every non-ASCII character.
		"""
		# Compare code points instead of calling `.decode('ascii')`: text
		# strings have no `.decode` method on Python 3, and a bare
		# `except:` would turn the resulting AttributeError into "replace
		# every character".  `str()` keeps the native string type on both
		# Python 2 and Python 3.
		return ''.join(
			str(char) if ord(char) < 128 else default
			for char in unistr)


	def latinify(unistr, default='?'):
		# NOTE: the docstring has to be a *unicode* literal, otherwise the
		# doctests (which contain non-ASCII text) fail under Python 2.
		u"""
		Render unicode string `unistr` using ASCII characters only.

		ASCII characters are copied unchanged.  Latin letters with
		diacritical marks are substituted with their "bare" equivalent::

		  >>> latinify(u'pâté')
		  'pate'
		  >>> latinify(u'PÂTÉ')
		  'PATE'

		Letters which have no direct equivalent in the latin alphabet are
		replaced with the glyph `?`::

		  >>> latinify(u'Sigurður Þórarinsson')
		  'Sigur?ur ?orarinsson'

		The character to be substituted for untranslatable characters can
		be passed as second argument::

		  >>> latinify(u'Sigurður Þórarinsson', '*')
		  'Sigur*ur *orarinsson'

		:param unistr: text to convert; it is scanned one character at a
		  time.
		:param default: string emitted in place of each character that
		  cannot be mapped to an ASCII letter.
		:return: the converted string.
		"""
		from unicodedata import name

		converted = []
		for unich in unistr:
			if ord(unich) < 128:
				# Already ASCII: copy it through.  Comparing the code
				# point (rather than calling `.decode('ascii')`, which
				# text strings lack on Python 3) keeps spaces, digits
				# and punctuation intact on every Python version.
				converted.append(str(unich))
				continue
			# Deduce a latin letter equivalent from the Unicode data
			# point name; e.g., since `name(u'á') == 'LATIN SMALL
			# LETTER A WITH ACUTE'` translate `á` to `a`.  However, in
			# some cases the unicode name is still "LATIN ... LETTER"
			# although no direct equivalent in the Latin alphabet
			# exists (e.g., Þ, "LATIN CAPITAL LETTER THORN") -- we can
			# avoid these cases by checking that the letter name is
			# composed of one letter only.
			#
			# Passing '' as fallback makes unnamed code points yield an
			# empty word list instead of raising ValueError; the length
			# check guards against short names such as "LATIN CROSS".
			what = name(unich, '').split()
			if (len(what) >= 4 and what[0] == 'LATIN'
					and what[2] == 'LETTER' and len(what[3]) == 1):
				if what[1] == 'SMALL':
					converted.append(what[3].lower())
				else:  # what[1] == 'CAPITAL'
					converted.append(what[3].upper())
			else:
				converted.append(default)
		return ''.join(converted)


	if __name__ == '__main__':
		# Script entry point: run the doctests embedded in this module's
		# docstrings, reporting them under the module name 'asciify' and
		# treating runs of whitespace as equivalent when comparing output.
		import doctest
		doctest.testmod(
			name='asciify',
			optionflags=doctest.NORMALIZE_WHITESPACE)