Skip to content

Instantly share code, notes, and snippets.

@lukasz-madon
Created September 13, 2015 13:43
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save lukasz-madon/03463e58748b40338546 to your computer and use it in GitHub Desktop.
Save lukasz-madon/03463e58748b40338546 to your computer and use it in GitHub Desktop.
# -*- coding: utf-8 -*-
"""Convert Unicode text into plain ASCII string."""
# for brevity the table is kept simple
# Maps a single non-ASCII character to the ASCII text that replaces it.
# Only the five Greek letters used by the demo below are covered.
CONVERSION_TABLE = {
    u"\u0394": "D",  # U+0394 GREEK CAPITAL LETTER DELTA
    u"\u03b1": "a",  # U+03B1 GREEK SMALL LETTER ALPHA
    u"\u03bd": "n",  # U+03BD GREEK SMALL LETTER NU
    u"\u03ac": "a",  # U+03AC GREEK SMALL LETTER ALPHA WITH TONOS
    u"\u03b7": "e",  # U+03B7 GREEK SMALL LETTER ETA
}
def decode(string):
    """Transliterate a Unicode string character by character.

    Each character is handled by the first rule that applies:

    1. ASCII characters (code point < 128) are kept unchanged.
    2. Characters above U+EFFFF (code point > 983039) are kept unchanged.
    3. A non-``None`` result of ``_get_heuristic(code_point)`` is used.
    4. Otherwise the character is looked up in ``CONVERSION_TABLE``.

    A character matched by none of these rules is silently dropped.

    Args:
        string: Unicode text (``unicode`` on Python 2, ``str`` on Python 3).

    Returns:
        The converted pieces joined into a single string.

    Raises:
        TypeError: If ``string`` is not a Unicode string (e.g. bytes).
    """
    # ``unicode`` only exists on Python 2; on Python 3 every ``str`` is
    # already Unicode text, so fall back to it instead of raising NameError.
    try:
        text_type = unicode  # noqa: F821
    except NameError:
        text_type = str
    if not isinstance(string, text_type):
        raise TypeError("unicode string is required")

    pieces = []
    for char in string:
        code_point = ord(char)
        # Pass-through characters are final: skip the heuristic and table
        # lookup so they can never be emitted a second time.
        if code_point < 128 or code_point > 0xEFFFF:
            pieces.append(char)
            continue
        replacement = _get_heuristic(code_point)
        if replacement is None:
            # The heuristic takes precedence; the table is the fallback.
            replacement = CONVERSION_TABLE.get(char)
        if replacement is not None:
            pieces.append(replacement)
    return "".join(pieces)
def _get_heuristic(code_point):
"""TODO(lukaszma) implement heuristic algorithms that improves conversion"""
return None
# Demo: the first call spells the input with escapes and, given the table
# above, is expected to print "Danae". The second uses literal Greek
# characters and relies on the file's UTF-8 coding declaration.
# A single parenthesised argument prints identically on Python 2 and 3.
print(decode(u"\u0394\u03b1\u03bd\u03ac\u03b7"))
print(decode(u"Δανάη"))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment