Skip to content

Instantly share code, notes, and snippets.

@ergoithz
Last active January 2, 2016 19:09
Show Gist options
  • Save ergoithz/8348342 to your computer and use it in GitHub Desktop.
Save ergoithz/8348342 to your computer and use it in GitHub Desktop.
How to read a single UTF-8 character from a file descriptor, including doctests and the algorithm used for range detection (kept as a string in the main block).
#!/usr/bin/env python
# -*- coding: utf-8 -*-
def write_utf8_char(fd, char):
    r'''
    Write a single Unicode code point to a binary stream as UTF-8.

    :param fd: object with a ``write`` method that accepts encoded bytes
    :param char: integer code point to encode
    :raises ValueError: if ``char`` is outside the range accepted by the
        interpreter's ``unichr``/``chr``

    >>> import io
    >>> buf = io.BytesIO()
    >>> write_utf8_char(buf, 0x20AC)
    >>> buf.getvalue() == b'\xe2\x82\xac'
    True
    '''
    try:
        # Python 2: chr() builds a byte string there, so unichr is required.
        to_text = unichr
    except NameError:
        # Python 3: unichr no longer exists, chr() returns text.
        to_text = chr
    fd.write(to_text(char).encode("utf-8"))
def read_utf8_char(fd):
    r'''
    Read one UTF-8 encoded character from a binary stream and return its
    code point.

    The first byte determines how many continuation bytes are read; the
    whole sequence is then handed to the utf-8 codec, so exactly
    ``1 + continuation count`` bytes are consumed from the stream.

    :param fd: object whose ``read(n)`` returns byte strings (``str`` on
        Python 2, ``bytes`` on Python 3)
    :returns: the decoded code point as an integer
    :raises ValueError: if the first byte cannot start a UTF-8 sequence, or
        (as ``UnicodeDecodeError``) if the codec rejects the sequence, e.g.
        because it is truncated
    :raises TypeError: if the stream is already exhausted, because ``ord``
        is applied to an empty read

    >>> import io
    >>> read_utf8_char(io.BytesIO(b'A')) == 0x41
    True
    >>> read_utf8_char(io.BytesIO(b'\xf4\x81\x84\x91')) == 0x101111
    True
    >>> read_utf8_char(io.BytesIO(b'\xf1\x91\x84\x91')) == 0x51111
    True
    >>> read_utf8_char(io.BytesIO(b'\x80'))
    Traceback (most recent call last):
        ...
    ValueError: 0x80 not a valid utf-8 start
    '''
    # Surrogate code points U+D800..U+DFFF (bytes ED A0 80 .. ED BF BF) are
    # not valid UTF-8; rejecting them is left to the codec used below.
    lead = fd.read(1)
    fco = ord(lead)
    if fco < 0x80:
        # ASCII range: the byte is the code point.
        return fco
    elif fco < 0xC2:
        # 0x80-0xBF are continuation bytes; 0xC0 and 0xC1 could only start
        # overlong encodings of ASCII.
        raise ValueError("%s not a valid utf-8 start" % hex(fco))
    elif fco < 0xE0:
        more = 1
    elif fco < 0xF0:
        more = 2
    elif fco < 0xF5:
        more = 3
    else:
        # Lead bytes 0xF5 and above would encode values past U+10FFFF.
        raise ValueError("%s not in unicode range" % hex(fco))
    # Reuse the byte exactly as it was read instead of rebuilding it with
    # chr(): chr() yields text on Python 3 and cannot be joined with bytes.
    # NOTE: on narrow Python 2 builds a code point above U+FFFF decodes to a
    # surrogate pair, which ord() rejects with TypeError.
    return ord((lead + fd.read(more)).decode("utf-8"))
# Script entry point: when the file is executed directly, run the doctests
# found in this module.
if __name__ == "__main__":
    # The bare string below is never executed. It preserves a helper script
    # that walks the code points from 0x10FFFF downward (skipping
    # 0xD800-0xDFFF) and, each time the UTF-8 encoded length shrinks, records
    # the first-byte values at that boundary, then prints them as
    # "x < 0x.." bounds. This is apparently how the first-byte thresholds in
    # read_utf8_char were obtained.
    # NOTE(review): the snippet's own indentation is missing inside the
    # string, so it must be re-indented before it can be run.
    '''
# Range test
import sys
if sys.version_info[0] == 2:
utf8chr = lambda x: ord(unichr(x).encode("utf-8")[0])
else:
xrange = range
unichr = chr
utf8chr = lambda x: chr(x).encode("utf-8")[0]
curlen = 5
margins = []
ranges = (
xrange(0x10FFFF, 0xDFFF, -1),
xrange(0xD800-1, -1, -1)
)
for r in ranges:
for i in r:
nlen = len(unichr(i).encode("utf-8"))
if nlen < curlen:
curlen = nlen
if margins:
margins[-1][0] = utf8chr(i+1)-1
margins.append([None, utf8chr(i)+1])
margins.reverse()
for nmin, nmax in margins:
if nmin:
print(" 0x%X < x < 0x%X" % (nmin, nmax))
else:
print(" x < 0x%X" % (nmax))
'''
    # Local import: doctest is only needed when the file is run as a script.
    import doctest
    doctest.testmod()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment