# jseabold/octal_to_unicode.py

## octal_to_unicode.py
"""
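I received a file in which non-ASCII Latin-1 characters had been written out as ASCII octal escapes (a backslash followed by the octal character code). I needed to map those escapes to Latin-1 characters as unicode text, and to be able to convert that text back to the same octal-escaped ASCII form.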
Might not be completely general, but it works for me.

E.g.,

name = 'Duchy of Zweibr\\374cken'
octalchar_to_unicode(name)
unicode_to_octalchar(octalchar_to_unicode(name))
http://www.utoronto.ca/web/HTMLdocs/NewHTML/iso_table.html
http://www.utf8-chartable.de/unicode-utf8-table.pl?number=1024&utf8=dec
"""
import re

def unicode_to_octalchar(name):
    """Replace non-ASCII Latin-1 characters in *name* with octal escapes.

    Every character in the range U+007F-U+00FF is rewritten as a backslash
    followed by its three-digit octal code, e.g. u-umlaut (U+00FC) becomes
    the four characters ``\\374``.  All other characters are copied through
    unchanged.

    No Unicode normalization is performed: a decomposed sequence such as
    ``u`` followed by U+0308 (combining diaeresis) lies outside Latin-1 and
    is rejected rather than composed.

    Parameters
    ----------
    name : str
        Text whose characters all fit in Latin-1 (code points 0-255).

    Returns
    -------
    str
        *name* with each character at or above U+007F replaced by its octal
        escape.

    Raises
    ------
    UnicodeEncodeError
        If *name* contains a character outside Latin-1.
    """
    # Encode purely as a range check: it raises for code points above 255,
    # which a single-byte octal escape cannot represent.
    name.encode('latin-1')

    def _escape(match):
        # 0x7f-0xff is octal 177-377, so '%o' always yields three digits.
        return '\\%o' % ord(match.group())

    # Match one character at a time (no '+') because every character needs
    # its own escape.  A callable replacement also avoids re.sub's template
    # processing of backslashes in the replacement text.
    return re.sub('[\x7f-\xff]', _escape, name)

def octalchar_to_unicode(name):
    """Replace octal escapes in *name* with the Latin-1 characters they encode.

    An escape is a backslash followed by one to three octal digits (0-7),
    e.g. backslash-374 becomes u-umlaut (U+00FC).  At most three digits are
    consumed, so a digit that merely follows an escape is kept as text.

    Text that is not a valid single-byte escape is left untouched: a
    backslash not followed by an octal digit, and escapes whose value
    exceeds octal 377.

    Parameters
    ----------
    name : str or bytes
        Octal-escaped text.  ``bytes`` input is decoded as Latin-1 first.

    Returns
    -------
    str
        *name* with every valid octal escape replaced by its character.
    """
    if isinstance(name, bytes):
        # Latin-1 maps each byte to the code point of the same value.
        name = name.decode('latin-1')

    def _unescape(match):
        code = int(match.group(1), 8)
        # Three octal digits can reach 0o777; only 0-0o377 fits in Latin-1.
        return chr(code) if code <= 0xFF else match.group()

    # A callable replacement is inserted literally, so a decoded backslash
    # (octal 134) is not re-interpreted as a re.sub template escape.
    return re.sub(r'\\([0-7]{1,3})', _unescape, name)
	"""
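	I received a file in which non-ASCII Latin-1 characters had been written out as ASCII octal escapes (a backslash followed by the octal character code). I needed to map those escapes to Latin-1 characters as unicode text, and to be able to convert that text back to the same octal-escaped ASCII form.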
	Might not be completely general, but it works for me.

	E.g.,

	name = 'Duchy of Zweibr\\374cken'
	octalchar_to_unicode(name)
	unicode_to_octalchar(octalchar_to_unicode(name))
	http://www.utoronto.ca/web/HTMLdocs/NewHTML/iso_table.html
	http://www.utf8-chartable.de/unicode-utf8-table.pl?number=1024&utf8=dec
	"""
	import re

	def unicode_to_octalchar(name):
		"""Replace non-ASCII Latin-1 characters in *name* with octal escapes.

		Every character in the range U+007F-U+00FF is rewritten as a backslash
		followed by its three-digit octal code, e.g. u-umlaut (U+00FC) becomes
		the four characters ``\\374``.  All other characters are copied through
		unchanged.

		No Unicode normalization is performed: a decomposed sequence such as
		``u`` followed by U+0308 (combining diaeresis) lies outside Latin-1 and
		is rejected rather than composed.

		Parameters
		----------
		name : str
			Text whose characters all fit in Latin-1 (code points 0-255).

		Returns
		-------
		str
			*name* with each character at or above U+007F replaced by its
			octal escape.

		Raises
		------
		UnicodeEncodeError
			If *name* contains a character outside Latin-1.
		"""
		# NOTE(review): a function with this same name is also defined earlier
		# in this file; consider keeping only one copy.

		# Encode purely as a range check: it raises for code points above 255,
		# which a single-byte octal escape cannot represent.
		name.encode('latin-1')

		def _escape(match):
			# 0x7f-0xff is octal 177-377, so '%o' always yields three digits.
			return '\\%o' % ord(match.group())

		# Match one character at a time (no '+') because every character needs
		# its own escape.  A callable replacement also avoids re.sub's template
		# processing of backslashes in the replacement text.
		return re.sub('[\x7f-\xff]', _escape, name)

	def octalchar_to_unicode(name):
		"""Replace octal escapes in *name* with the Latin-1 characters they encode.

		An escape is a backslash followed by one to three octal digits (0-7),
		e.g. backslash-374 becomes u-umlaut (U+00FC).  At most three digits are
		consumed, so a digit that merely follows an escape is kept as text.

		Text that is not a valid single-byte escape is left untouched: a
		backslash not followed by an octal digit, and escapes whose value
		exceeds octal 377.

		Parameters
		----------
		name : str or bytes
			Octal-escaped text.  ``bytes`` input is decoded as Latin-1 first.

		Returns
		-------
		str
			*name* with every valid octal escape replaced by its character.
		"""
		# NOTE(review): a function with this same name is also defined earlier
		# in this file; consider keeping only one copy.
		if isinstance(name, bytes):
			# Latin-1 maps each byte to the code point of the same value.
			name = name.decode('latin-1')

		def _unescape(match):
			code = int(match.group(1), 8)
			# Three octal digits can reach 0o777; only 0-0o377 fits in Latin-1.
			return chr(code) if code <= 0xFF else match.group()

		# A callable replacement is inserted literally, so a decoded backslash
		# (octal 134) is not re-interpreted as a re.sub template escape.
		return re.sub(r'\\([0-7]{1,3})', _unescape, name)