Skip to content

Instantly share code, notes, and snippets.

@wafflespeanut
Created December 20, 2015 09:38
Show Gist options
  • Save wafflespeanut/015bdd11adcabfd95679 to your computer and use it in GitHub Desktop.
Save wafflespeanut/015bdd11adcabfd95679 to your computer and use it in GitHub Desktop.
# The confusable characters were taken from http://www.unicode.org/Public/security/revision-06/confusables.txt
# and filtered using Daniel's awesome suggestion for 'printable' ASCII replacements (https://github.com/rust-lang/rust/issues/25957#issuecomment-107812945).
# The input file CONFUSABLES.txt read by this script was produced with:
# egrep -v '^[ ]*#' confusables.txt | egrep -v '^[ ]+$|^$' | egrep '00(2[1-9A-F]|3[A-F]|40|5[BCDF]|7[BCD])' | egrep ';[^0-9A-F]00.. ;' | egrep -v '^00' > CONFUSABLES.txt
def const_array_of_strings(name, type_spec = "&'static str"):
    """Return a one-element list holding the opening line of a Rust const slice.

    The single element is the text
    ``const <name>: &'static [<type_spec>] = &[``; the rest of the script
    appends the array entries to this list.

    name      -- identifier of the Rust constant.
    type_spec -- Rust element type of the slice (defaults to ``&'static str``).
    """
    # A unicode literal yields the same text as the former ``unicode(...)``
    # call on Python 2 and also works on Python 3, where the ``unicode``
    # builtin does not exist.
    return [u"const %s: &'static [%s] = &[" % (name, type_spec)]
# Opening lines of the two Rust arrays that the script fills in further down.
arrays = [
    const_array_of_strings('UNICODE_ARRAY', "(char, &'static str, char)"),
    const_array_of_strings('ASCII_ARRAY', "(char, &'static str)"),
]
# Length of each opening line; passed to put_into_list() as ``initial_len``.
# Built as a real list because the values are looked up by index later and
# ``map`` returns a non-indexable iterator on Python 3.
lengths = [len(arr[0]) for arr in arrays]
def format_for_type(thing):
    """Render ``thing`` as the text of a Rust literal.

    A string of length one, or a two-character string that starts with a
    backslash, is wrapped in single quotes; any other string is wrapped in
    double quotes; non-string values are formatted with ``%s``.

    No escaping is performed here, so quotes and backslashes must already be
    escaped by the caller.
    """
    # ``type(u'')`` is ``unicode`` on Python 2 and ``str`` on Python 3, so the
    # check covers both text types without naming the Python-2-only builtin.
    # ``isinstance`` also accepts subclasses of the string types.
    if isinstance(thing, (str, type(u''))):
        is_char = len(thing) == 1 or (len(thing) == 2 and thing.startswith('\\'))
        if is_char:
            return u"'%s'" % thing
        return u'"%s"' % thing
    return u'%s' % thing
def put_into_list(list_at_work, initial_len, stuff, limit = 100, newline = False):
    """Format ``stuff`` as one array entry and add it to ``list_at_work`` in place.

    Several values are rendered as a parenthesised tuple, a single value is
    rendered bare; either way the entry ends with ``', '``.

    list_at_work -- list of text chunks being built; mutated, nothing is returned.
    initial_len  -- indent width used when an entry is wrapped onto a new line.
    stuff        -- sequence of values, each rendered with format_for_type().
    limit        -- length at which the last chunk is no longer extended.
    newline      -- when true, always start a new chunk indented by four spaces.
    """
    if len(stuff) > 1:
        rendered = u', '.join(format_for_type(item) for item in stuff)
        entry = u'(' + rendered + '), '
    else:
        entry = format_for_type(stuff[0]) + u', '

    # Extend the current chunk only when no line break was requested and the
    # combined text stays below the limit.
    if not newline and len(list_at_work[-1] + entry) < limit:
        list_at_work[-1] += entry
        return

    # Otherwise start a new chunk on its own line with the appropriate indent.
    indent_width = 4 if newline else initial_len
    list_at_work.append(u'\n' + u' ' * indent_width + entry)
# Python 2 names the code-point-to-text builtin ``unichr``; Python 3 only has ``chr``.
try:
    char_from_code_point = unichr
except NameError:
    char_from_code_point = chr

# NOTE(review): the nesting of this loop was reconstructed from a copy of the
# script whose indentation had been lost; everything after the duplicate check
# is assumed to belong inside it -- confirm against the original.
seen_unicode_chars = set()   # confusable characters already emitted
seen_ascii_entries = set()   # (ascii char, name) pairs already emitted
with open('CONFUSABLES.txt', 'rb') as file_data:
    # Iterate the file lazily instead of loading every line with readlines().
    for raw_line in file_data:
        line = raw_line.decode('utf-8')
        if not line.strip() or line.lstrip().startswith(u'#'):
            # Blank and comment lines carry no mapping; skip them rather than
            # failing with an IndexError while parsing.
            continue
        # The line is cut at each U+2192 arrow: the first piece carries the two
        # ';'-separated hex code points, the second the confusable's name after
        # the closing parenthesis, the third the replacement's name up to '#'.
        parts = line.split(u'\u2192')
        code_points = parts[0].split(';')
        u_char = code_points[0].rstrip()
        sub_char = code_points[1].strip()
        u_name = parts[1][parts[1].find(')'):].strip(') ').title()
        sub_name = parts[2][:parts[2].find('#')].strip().title()
        uni_char = char_from_code_point(int(u_char, 16))
        if uni_char in seen_unicode_chars:  # too many duplicates really
            continue
        seen_unicode_chars.add(uni_char)
        ascii_char = char_from_code_point(int(sub_char, 16))
        # Escape the characters that would otherwise break the Rust literal.
        if ascii_char in u'\'"\\':
            ascii_char = u'\\' + ascii_char
        put_into_list(arrays[0], lengths[0], [uni_char, u_name, ascii_char], newline = True)
        ascii_entry = (ascii_char, sub_name)
        if ascii_entry not in seen_ascii_entries:
            seen_ascii_entries.add(ascii_entry)
            put_into_list(arrays[1], lengths[1], [ascii_char, sub_name], newline = True)

# Close each Rust array and write both out, separated by a blank line.
arrays = [u''.join(arr) + u'];' for arr in arrays]
with open('unicode_chars.rs', 'wb') as file_data:
    file_data.write(u'\n\n'.join(arrays).encode('utf-8'))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment