Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Python's version of Java's modified UTF-8
# translated from: http://hg.openjdk.java.net/jdk8/jdk8/jdk/file/94cc251d0c45/src/share/npt/utf.c
def utf8s_to_utf8m(string):
    """
    Re-encode standard UTF-8 data as Java-style "modified UTF-8".

    A NUL byte is written as the two-byte sequence ``C0 80`` and every
    four-byte sequence is rewritten as two three-byte sequences.  One-, two-
    and three-byte sequences are copied through untouched.  A byte that
    matches none of these lead-byte patterns is skipped.

    :param string: utf8 encoded data whose items are integers (e.g. ``bytes``)
    :return: modified utf8 encoded ``bytes``
    """
    encoded = []
    total = len(string)
    pos = 0
    while pos < total:
        lead = string[pos]
        if (lead & 0x80) == 0:
            # 0xxxxxxx: plain single byte, except NUL which gets the
            # two-byte form.
            encoded.extend((0xC0, 0x80) if lead == 0 else (lead,))
        elif (lead & 0xE0) == 0xC0:
            # 110xxxxx 10xxxxxx: copied verbatim.
            encoded.extend((lead, string[pos + 1]))
            pos += 1
        elif (lead & 0xF0) == 0xE0:
            # 1110xxxx 10xxxxxx 10xxxxxx: copied verbatim.
            encoded.extend((lead, string[pos + 1], string[pos + 2]))
            pos += 2
        elif (lead & 0xF8) == 0xF0:
            # 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx: split into two 3-byte
            # sequences.
            second = string[pos + 1]
            third = string[pos + 2]
            fourth = string[pos + 3]
            pos += 3
            # Gather the 21 payload bits of the four-byte sequence.
            code_point = (
                ((lead & 0x07) << 18)
                | ((second & 0x3F) << 12)
                | ((third & 0x3F) << 6)
                | (fourth & 0x3F)
            )
            encoded.extend((
                # First half:  11101101 1010xxxx 10xxxxxx
                0xED,
                0xA0 | (((code_point >> 16) - 1) & 0x0F),
                0x80 | ((code_point >> 10) & 0x3F),
                # Second half: 11101101 1011xxxx 10xxxxxx
                0xED,
                0xB0 | ((code_point >> 6) & 0x0F),
                # The trailing byte of the input is reused as-is.
                fourth,
            ))
        # Step past the last byte consumed above (or past a skipped byte).
        pos += 1
    return bytes(encoded)
def utf8m_to_utf8s(string):
    """
    Convert Java-style "modified UTF-8" data back to standard UTF-8.

    The two-byte sequence ``C0 80`` becomes a single NUL byte, and a pair of
    three-byte sequences of the form ``ED Ax xx`` + ``ED Bx xx`` (an encoded
    surrogate pair) is merged into one four-byte sequence.  All other one-,
    two- and three-byte sequences are copied through unchanged.  A byte that
    is not a recognised lead byte is skipped.

    Every input byte is consumed exactly once; in particular the final byte
    of a merged pair is never re-examined as the start of a new sequence.

    :param string: modified utf8 encoded data whose items are integers
        (e.g. ``bytes``)
    :return: utf8 encoded ``bytes``
    :raises IndexError: if a two- or three-byte sequence is cut off at the
        end of the input
    """
    new_string = []
    length = len(string)
    i = 0
    while i < length:
        byte1 = string[i]
        if (byte1 & 0x80) == 0:  # 1byte encoding
            new_string.append(byte1)
        elif (byte1 & 0xE0) == 0xC0:  # 2byte encoding
            i += 1
            byte2 = string[i]
            if byte1 == 0xC0 and byte2 == 0x80:
                # Modified UTF-8 spelling of NUL.
                new_string.append(0)
            else:
                new_string.extend((byte1, byte2))
        elif (byte1 & 0xF0) == 0xE0:  # 3byte encoding
            i += 1
            byte2 = string[i]
            i += 1
            byte3 = string[i]
            # i now sits on byte3; a pair needs three more bytes after it.
            # Bits in: 11101101 1010xxxx 10xxxxxx
            # Bits in: 11101101 1011xxxx 10xxxxxx
            is_surrogate_pair = (
                i + 3 < length
                and byte1 == 0xED
                and (byte2 & 0xF0) == 0xA0
                and string[i + 1] == 0xED
                and (string[i + 2] & 0xF0) == 0xB0
            )
            if is_surrogate_pair:
                byte5 = string[i + 2]
                byte6 = string[i + 3]
                # Leave i on byte6 so the shared increment below steps past
                # the whole pair.
                i += 3
                # Reconstruct 21 bit code
                u21 = ((byte2 & 0x0F) + 1) << 16
                u21 += (byte3 & 0x3F) << 10
                u21 += (byte5 & 0x0F) << 6
                u21 += byte6 & 0x3F
                # Bits out: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                new_string.extend((
                    0xF0 + ((u21 >> 18) & 0x07),
                    0x80 + ((u21 >> 12) & 0x3F),
                    0x80 + ((u21 >> 6) & 0x3F),
                    0x80 + (u21 & 0x3F),
                ))
            else:
                # Ordinary 3byte encoding (including an unpaired surrogate).
                new_string.extend((byte1, byte2, byte3))
        # Step past the last byte consumed above (or past a skipped byte).
        i += 1
    return bytes(new_string)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment