Skip to content

Instantly share code, notes, and snippets.

@RickyCook
Last active May 16, 2019 01:53
Show Gist options
  • Save RickyCook/b569e62f0827b2ccbab448669f00e689 to your computer and use it in GitHub Desktop.
Save RickyCook/b569e62f0827b2ccbab448669f00e689 to your computer and use it in GitHub Desktop.
Trim valid UTF-8 bytes to a given maximum length, ensuring the result is still valid UTF-8
#!/usr/bin/env python3
import unittest
# A UTF8 continuation byte looks like 10xxxxxx: its top two bits are ``10``
CONTINUATION_HIGH_BITS = 0b10
# Right shift that leaves only those top bits of an 8-bit value
CONTINUATION_SHIFT_RIGHT = 8 - CONTINUATION_HIGH_BITS.bit_length()


def is_continuation(byte):
    """ Tells whether a byte value carries the continuation-byte marker

    :param byte: byte value as an ``int`` (e.g. from indexing ``bytes``)
    """
    top_bits = byte >> CONTINUATION_SHIFT_RIGHT
    return top_bits == CONTINUATION_HIGH_BITS
def trim_utf8(raw, maxlen):
    """ Trims a UTF8 string to a maximum length, presenting a valid UTF8 output
    back (dropping off an incomplete trailing multi-byte sequence)

    Only the final character of the trimmed bytes is inspected, so ``raw`` is
    expected to be valid UTF8 to begin with. A trailing character that still
    fits completely is kept; empty input and ``maxlen`` of 0 give empty output.

    :param raw: initial ``bytes`` that you want to trim
    :param maxlen: maximum length of the output (applied as ``raw[:maxlen]``)
    :return: the trimmed bytes, ending on a character boundary
    """
    trimmed = raw[:maxlen]

    # Walk back over trailing continuation bytes (10xxxxxx) to the lead byte
    lead_idx = len(trimmed) - 1
    while lead_idx >= 0 and (trimmed[lead_idx] & 0b11000000) == 0b10000000:
        lead_idx -= 1

    if lead_idx < 0:
        # Empty, or nothing but continuation bytes: no complete character left
        return trimmed[:0]

    # The lead byte's high bits say how long the whole sequence should be
    lead = trimmed[lead_idx]
    if lead < 0b10000000:
        expected_len = 1
    elif lead >> 5 == 0b110:
        expected_len = 2
    elif lead >> 4 == 0b1110:
        expected_len = 3
    elif lead >> 3 == 0b11110:
        expected_len = 4
    else:
        # 0xF8-0xFF never starts a UTF8 sequence; always drop it
        expected_len = None

    present_len = len(trimmed) - lead_idx
    if expected_len is not None and present_len >= expected_len:
        # Final character survived the cut intact
        return trimmed

    # Final character was cut short: drop its lead byte and what follows
    return trimmed[:lead_idx]
class TestIt(unittest.TestCase):
    """ Checks that cutting one byte off an encoded string makes ``trim_utf8``
    drop the whole final character
    """

    def run_test(self, rawstr):
        """ Encodes ``rawstr``, trims it by one byte and expects the decoded
        result to be ``rawstr`` without its last character

        :param rawstr: ``str`` whose final character is multi-byte in UTF8
        """
        rawbytes = rawstr.encode()
        trimmed = trim_utf8(rawbytes, len(rawbytes) - 1)
        # assertEqual (not bare assert) so the check survives ``python -O``
        self.assertEqual(trimmed.decode(), rawstr[:-1])

    def test_emojis(self):
        self.run_test('😇🤭🤬')

    @unittest.expectedFailure
    def test_multi_codepoint_emoji(self):
        # Rainbow flag is a white flag combined with extra code points, so
        # it is several characters rather than one
        # We don't handle this case; it decodes without exception
        rawstr = '🏳️‍🌈🏳️‍🌈'
        rawbytes = rawstr.encode()
        expstr = '🏳️‍🌈'
        trimmed = trim_utf8(rawbytes, len(rawbytes) - 1)
        self.assertEqual(trimmed.decode(), expstr)

    # Following byte sequences are from
    # https://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt

    def test_first_possible_4_byte(self):
        # Note this is different to the 6-bytes; it's 4x 1 char
        self.run_test('𐀀𐀀𐀀𐀀')

    # NOTE(review): the two literals below appear to be U+FFFD replacement
    # characters rather than real 6-byte sequences - confirm against the
    # referenced test file

    def test_first_possible_6_byte(self):
        self.run_test('������')

    def test_last_possible_6_byte(self):
        self.run_test('������')
if __name__ == "__main__":
    # Run the test suite when this file is executed as a script
    unittest.main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment