Created
September 25, 2012 19:12
-
-
Save jseabold/3783839 to your computer and use it in GitHub Desktop.
octal to unicode and beyond the infinite
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""
I received a file in which non-ASCII Latin-1 characters were written out as
ASCII octal escapes (a backslash followed by the octal character code). I
needed to map those escapes to unicode and to be able to convert back to the
octal-escaped form again.

Might not be completely general, but it works for me.

E.g.,

name = 'Duchy of Zweibr\\374cken'
octalchar_to_unicode(name)
unicode_to_octalchar(octalchar_to_unicode(name))

http://www.utoronto.ca/web/HTMLdocs/NewHTML/iso_table.html
http://www.utf8-chartable.de/unicode-utf8-table.pl?number=1024&utf8=dec
"""
import re | |
def unicode_to_octalchar(name):
    r"""Replace every non-ASCII Latin-1 character with an octal escape.

    Each character whose code point lies in the range 127-255 (DEL is
    included) is rewritten as a single backslash followed by the octal
    value of its code point, e.g. u-umlaut (0xFC) becomes ``\374``. All
    other characters are copied through unchanged.

    Parameters
    ----------
    name : text string
        The text to escape. Every character must be representable in
        Latin-1.

    Returns
    -------
    The escaped text, containing only characters below code point 127.

    Raises
    ------
    UnicodeEncodeError
        If ``name`` contains a character outside Latin-1.

    Notes
    -----
    No Unicode normalization is performed, so a decomposed character
    (base letter + combining mark) is not recognised as a single Latin-1
    character; normalize to NFC first if that matters.
    """
    # Encoding is done purely as validation: it rejects anything that has
    # no Latin-1 (and therefore no single-byte octal) representation.
    name.encode('latin-1')

    def _escape(match):
        # One character per match, so runs of adjacent non-ASCII
        # characters are each escaped on their own.
        return "\\%o" % ord(match.group())

    # A callable replacement is inserted verbatim, so the backslash needs
    # no additional regex-template escaping.
    return re.sub(u'[\x7f-\xff]', _escape, name)
def octalchar_to_unicode(name):
    r"""Convert backslash-octal escapes in ``name`` to Latin-1 characters.

    An escape is a backslash followed by octal digits, e.g. ``\374`` for
    u-umlaut. At most three significant octal digits are consumed (any
    number of leading zeros is tolerated), so an ordinary digit that
    directly follows an escape is left in place: ``\3741`` decodes to
    u-umlaut followed by ``1``. A backslash that is not followed by an
    octal digit is left untouched.

    Parameters
    ----------
    name : text or byte string
        The escaped input. Byte strings are decoded as Latin-1 first.

    Returns
    -------
    The decoded text string.

    Raises
    ------
    ValueError
        If an escape denotes a value above 255 (octal 377), which has no
        Latin-1 character.
    """
    import re

    def _decode(match):
        code = int(match.group(1), 8)
        if code > 0xFF:
            raise ValueError(
                "octal escape %r is outside the Latin-1 range"
                % match.group()
            )
        # Going through a one-byte buffer yields a text character on both
        # Python 2 and Python 3.
        return bytearray([code]).decode('latin-1')

    if isinstance(name, bytes):
        name = name.decode('latin-1')
    # A callable replacement is inserted verbatim, so a decoded backslash
    # (octal 134) is not misread as a regex template escape.
    return re.sub(u'\\\\(0*[0-7]{1,3})', _decode, name)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment