Skip to content

Instantly share code, notes, and snippets.

@riccardomurri
Last active August 29, 2015 14:27
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save riccardomurri/3c3ccec30f037be174d3 to your computer and use it in GitHub Desktop.
Save riccardomurri/3c3ccec30f037be174d3 to your computer and use it in GitHub Desktop.
Two simple functions for rendering a Unicode string using ASCII characters only.
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
#
"""
Two simple functions for rendering a Unicode string using ASCII
characters only.
The only sensible applications are on words in a Latin-derived
alphabet (i.e., anything that could be rendered in an ISO-8859-*
character set); anything else will just be replaced by a string of
"unprintable" marks.
"""
def asciify_crudely(unistr, default='?'):
    # note that the entire docstring has to be *unicode*, otherwise
    # doctests fail inexplicably!
    u"""
    Render unicode string `unistr` using ASCII characters only.

    The rendering is quite crude in that any non-ASCII character is
    just replaced with the glyph `?`::

      >>> asciify_crudely(u'pâté')
      'p?t?'
      >>> asciify_crudely(u'PÂTÉ')
      'P?T?'

    The character to be substituted for untranslatable characters can
    be passed as second argument::

      >>> asciify_crudely(u'pâté', '*')
      'p*t*'

    The replacement may be any string, including the empty string (in
    which case non-ASCII characters are simply dropped)::

      >>> asciify_crudely(u'pâté', '')
      'pt'

    :param unistr: the text to convert.
    :param default: string substituted for each non-ASCII character.
    :return: a string made only of ASCII characters and copies of
      `default`.
    """
    converted = []
    for char in unistr:
        # Test the code point directly rather than calling
        # `.decode('ascii')`: that method does not exist on Python 3
        # strings, which made *every* character look untranslatable.
        if ord(char) < 128:
            # `str()` yields a native string on both Python 2 and 3.
            converted.append(str(char))
        else:
            # Append `default` as-is so that replacements of any
            # length (not just a single character) are accepted.
            converted.append(default)
    return ''.join(converted)
def latinify(unistr, default='?'):
    # note that the entire docstring has to be *unicode*, otherwise
    # doctests fail inexplicably!
    u"""
    Render unicode string `unistr` using ASCII characters only.

    Latin letters with diacritical marks are substituted with their
    "bare" equivalent::

      >>> latinify(u'pâté')
      'pate'
      >>> latinify(u'PÂTÉ')
      'PATE'

    Letters which have no direct equivalent in the latin alphabet are
    replaced with the glyph `?`::

      >>> latinify(u'Sigurður Þórarinsson')
      'Sigur?ur ?orarinsson'

    The character to be substituted for untranslatable characters can
    be passed as second argument::

      >>> latinify(u'Sigurður Þórarinsson', '*')
      'Sigur*ur *orarinsson'

    :param unistr: the text to convert.
    :param default: string substituted for each character that has no
      ASCII equivalent; it may have any length, including zero.
    :return: a string made only of ASCII characters and copies of
      `default`.
    """
    from unicodedata import name

    converted = []
    for unich in unistr:
        # ASCII characters (letters, digits, spaces, punctuation) pass
        # through unchanged.  Test the code point directly rather than
        # calling `.decode('ascii')`, which does not exist on Python 3
        # strings.
        if ord(unich) < 128:
            converted.append(str(unich))
            continue
        # deduce a latin letter equivalent from the Unicode data
        # point name; e.g., since `name(u'á') == 'LATIN SMALL
        # LETTER A WITH ACUTE'` translate `á` to `a`.  However, in
        # some cases the unicode name is still "LATIN LETTER"
        # although no direct equivalent in the Latin alphabet
        # exists (e.g., Þ, "LATIN CAPITAL LETTER THORN") -- we can
        # avoid these cases by checking that the letter name is
        # composed of one letter only.
        #
        # Passing a fallback to `name()` avoids a `ValueError` on
        # characters that have no Unicode name at all.
        what = name(unich, '').split()
        # The length check guards against short names that start with
        # "LATIN" (e.g., "LATIN CROSS"), which would otherwise raise
        # `IndexError` below.
        is_plain_latin_letter = (
            len(what) >= 4
            and what[0] == 'LATIN'
            and what[2] == 'LETTER'
            and len(what[3]) == 1
        )
        if not is_plain_latin_letter:
            # Append `default` as-is so replacements of any length work.
            converted.append(default)
        elif what[1] == 'SMALL':
            converted.append(what[3].lower())
        else:  # what[1] == 'CAPITAL'
            converted.append(what[3].upper())
    return ''.join(converted)
if __name__ == '__main__':
    # When executed as a script, run the doctests embedded in this
    # module's docstrings; whitespace differences between expected and
    # actual output are ignored.
    import doctest

    doctest_flags = doctest.NORMALIZE_WHITESPACE
    doctest.testmod(name='asciify', optionflags=doctest_flags)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment