Last active
January 2, 2016 19:09
-
-
Save ergoithz/8348342 to your computer and use it in GitHub Desktop.
How to read a single UTF-8 character from a binary file object. Includes a doctest, and the algorithm used to detect the lead-byte ranges is kept in the string block under `__main__`.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
def write_utf8_char(fd, char):
    '''
    Write one code point to a binary stream as UTF-8.

    Works on both Python 2 (where the text constructor is ``unichr``) and
    Python 3 (where it is ``chr``).

    :param fd: writable binary file-like object; its ``write`` method is
               called once with the encoded bytes.
    :param char: integer code point to encode.
    :raises ValueError: if ``char`` is outside the range accepted by the
                        interpreter's ``unichr``/``chr``.
    '''
    try:
        to_text = unichr  # noqa: F821 -- only defined on Python 2
    except NameError:
        # Python 3: chr() already returns a unicode string.
        to_text = chr
    fd.write(to_text(char).encode("utf-8"))
def read_utf8_char(fd):
    r'''
    Read one UTF-8 encoded character from a binary stream.

    The first byte is inspected to decide how many continuation bytes
    follow; those bytes are then read and the whole sequence is handed to
    the UTF-8 codec. Works on both Python 2 and Python 3 as long as
    ``fd.read`` returns byte strings.

    :param fd: readable binary file-like object.
    :returns: the decoded code point as an integer.
    :raises ValueError: if the first byte cannot start a UTF-8 sequence
                        (continuation byte, overlong lead byte, or a lead
                        byte beyond the Unicode range). The codec's
                        ``UnicodeDecodeError`` (a ``ValueError`` subclass)
                        is raised for malformed or truncated sequences.
    :raises TypeError: if the stream is exhausted, because the empty read
                       result cannot be passed to ``ord``.

    >>> import os
    >>> from io import BytesIO
    >>> sio = BytesIO()
    >>> _ = sio.seek(0, os.SEEK_SET)
    >>> write_utf8_char(sio, 0x101111)
    >>> _ = sio.seek(0, os.SEEK_SET)
    >>> read_utf8_char(sio) == 0x101111
    True
    >>> _ = sio.seek(0, os.SEEK_SET)
    >>> write_utf8_char(sio, 0x51111)
    >>> _ = sio.seek(0, os.SEEK_SET)
    >>> read_utf8_char(sio) == 0x51111
    True
    >>> read_utf8_char(BytesIO(b"\xe2\x82\xac")) == 0x20AC
    True
    '''
    # NOTE: the surrogate code points U+D800..U+DFFF (which would encode as
    # b'\xed\xa0\x80'..b'\xed\xbf\xbf') are not valid UTF-8; rejecting them
    # is left to the interpreter's UTF-8 codec.
    first = fd.read(1)
    fco = ord(first)
    if fco < 0x80:
        # ASCII range: the byte is the code point.
        return fco
    elif fco < 0xC2:
        # 0x80-0xBF are continuation bytes, 0xC0-0xC1 would only start
        # overlong encodings; none of them may begin a sequence.
        raise ValueError("%s not a valid utf-8 start" % hex(fco))
    elif fco < 0xE0:
        more = 1
    elif fco < 0xF0:
        more = 2
    elif fco < 0xF5:
        more = 3
    else:
        # Lead bytes from 0xF5 up would encode values above U+10FFFF.
        raise ValueError("%s not in unicode range" % hex(fco))
    # Reuse the byte that was read instead of rebuilding it with chr():
    # on Python 3 chr() yields text, which cannot be joined with bytes.
    return ord((first + fd.read(more)).decode("utf-8"))
if __name__ == "__main__":
    # The string literal below is never executed. It preserves the script
    # used to work out the lead-byte thresholds checked in read_utf8_char.
    '''
    # Range test
    import sys
    if sys.version_info[0] == 2:
        utf8chr = lambda x: ord(unichr(x).encode("utf-8")[0])
    else:
        xrange = range
        unichr = chr
        utf8chr = lambda x: chr(x).encode("utf-8")[0]
    curlen = 5
    margins = []
    ranges = (
        xrange(0x10FFFF, 0xDFFF, -1),
        xrange(0xD800-1, -1, -1)
        )
    for r in ranges:
        for i in r:
            nlen = len(unichr(i).encode("utf-8"))
            if nlen < curlen:
                curlen = nlen
                if margins:
                    margins[-1][0] = utf8chr(i+1)-1
                margins.append([None, utf8chr(i)+1])
    margins.reverse()
    for nmin, nmax in margins:
        if nmin:
            print(" 0x%X < x < 0x%X" % (nmin, nmax))
        else:
            print(" x < 0x%X" % (nmax))
    '''
    # Run the doctests embedded in this module's docstrings.
    from doctest import testmod
    testmod()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment