ergoithz/utf8char.py

## utf8char.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-

def write_utf8_char(fd, char):
    '''
    Write the UTF-8 encoding of the code point *char* to *fd*.

    Works on both Python 2 and Python 3: the integer is first turned into a
    one-character text string and then encoded.

    :param fd: file-like object whose ``write`` accepts a byte string
    :param char: Unicode code point, as an integer
    '''
    try:
        to_text = unichr  # Python 2: chr() only covers 0-255 there
    except NameError:
        to_text = chr  # Python 3: unichr no longer exists, chr covers it
    fd.write(to_text(char).encode("utf-8"))

def read_utf8_char(fd):
    '''
    Read a single UTF-8 encoded character from *fd* and return its code point.

    The lead byte decides how many continuation bytes are read afterwards,
    so exactly one encoded character is consumed from the stream.

    :param fd: binary file-like object whose ``read(n)`` returns a byte
               string (``str`` on Python 2, ``bytes`` on Python 3)
    :returns: the decoded code point, as an integer
    :raises ValueError: if the lead byte cannot start a UTF-8 sequence or
                        lies beyond the Unicode range; the decoder raises
                        ``UnicodeDecodeError`` (a ``ValueError`` subclass)
                        for truncated or malformed sequences
    :raises TypeError: if the stream is exhausted, because ``ord`` is then
                       handed an empty string

    >>> import os
    >>> from io import BytesIO
    >>> sio = BytesIO()
    >>> _ = sio.seek(0, os.SEEK_SET)
    >>> write_utf8_char(sio, 0x101111)
    >>> _ = sio.seek(0, os.SEEK_SET)
    >>> read_utf8_char(sio) == 0x101111
    True
    >>> _ = sio.seek(0, os.SEEK_SET)
    >>> write_utf8_char(sio, 0x51111)
    >>> _ = sio.seek(0, os.SEEK_SET)
    >>> read_utf8_char(sio) == 0x51111
    True

    '''
    # Only the lead byte is inspected here; whether the continuation bytes
    # form a valid sequence is left entirely to the utf-8 codec below.
    lead = fd.read(1)
    fco = ord(lead)
    if fco < 0x80:
        # ASCII range: the byte is the code point
        return fco
    elif fco < 0xC2:
        # 0x80-0xBF are continuation bytes, and 0xC0/0xC1 could only begin
        # overlong encodings, so none of them may start a character
        raise ValueError("%s not a valid utf-8 start" % hex(fco))
    elif fco < 0xE0:
        more = 1
    elif fco < 0xF0:
        more = 2
    elif fco < 0xF5:
        more = 3
    else:
        raise ValueError("%s not in unicode range" % hex(fco))
    # Reuse the lead byte exactly as it was read: rebuilding it with chr()
    # yields text instead of bytes on Python 3 and cannot be concatenated.
    return ord((lead + fd.read(more)).decode("utf-8"))

if __name__ == "__main__":
    # The string literal below is not a docstring: it is evaluated and
    # discarded, which keeps the enclosed range test disabled. That script
    # walks every non-surrogate code point from high to low and prints the
    # first-byte bounds at which the UTF-8 encoded length changes
    # (presumably how the thresholds in read_utf8_char were derived).
    '''
    # Range test

    import sys

    if sys.version_info[0] == 2:
        utf8chr = lambda x: ord(unichr(x).encode("utf-8")[0])
    else:
        xrange = range
        unichr = chr
        utf8chr = lambda x: chr(x).encode("utf-8")[0]

    curlen = 5
    margins = []
    ranges = (
        xrange(0x10FFFF, 0xDFFF, -1),
        xrange(0xD800-1, -1, -1)
    )
    for r in ranges:
        for i in r:
            nlen = len(unichr(i).encode("utf-8"))
            if nlen < curlen:
                curlen = nlen
                if margins:
                    margins[-1][0] = utf8chr(i+1)-1
                margins.append([None, utf8chr(i)+1])

    margins.reverse()
    for nmin, nmax in margins:
        if nmin:
            print(" 0x%X < x < 0x%X" % (nmin, nmax))
        else:
            print("        x < 0x%X" % (nmax))
    '''

    # Executing the file as a script runs the doctests embedded in it.
    from doctest import testmod
    testmod()
	#!/usr/bin/env python
	# -*- coding: utf-8 -*-

	def write_utf8_char(fd, char):
	    '''
	    Write the UTF-8 encoding of the code point *char* to *fd*.

	    Works on both Python 2 and Python 3: the integer is first turned into
	    a one-character text string and then encoded.

	    :param fd: file-like object whose ``write`` accepts a byte string
	    :param char: Unicode code point, as an integer
	    '''
	    try:
	        to_text = unichr  # Python 2: chr() only covers 0-255 there
	    except NameError:
	        to_text = chr  # Python 3: unichr no longer exists, chr covers it
	    fd.write(to_text(char).encode("utf-8"))

	def read_utf8_char(fd):
	    '''
	    Read a single UTF-8 encoded character from *fd* and return its code
	    point.

	    The lead byte decides how many continuation bytes are read afterwards,
	    so exactly one encoded character is consumed from the stream.

	    :param fd: binary file-like object whose ``read(n)`` returns a byte
	               string (``str`` on Python 2, ``bytes`` on Python 3)
	    :returns: the decoded code point, as an integer
	    :raises ValueError: if the lead byte cannot start a UTF-8 sequence or
	                        lies beyond the Unicode range; the decoder raises
	                        ``UnicodeDecodeError`` (a ``ValueError`` subclass)
	                        for truncated or malformed sequences
	    :raises TypeError: if the stream is exhausted, because ``ord`` is then
	                       handed an empty string

	    >>> import os
	    >>> from io import BytesIO
	    >>> sio = BytesIO()
	    >>> _ = sio.seek(0, os.SEEK_SET)
	    >>> write_utf8_char(sio, 0x101111)
	    >>> _ = sio.seek(0, os.SEEK_SET)
	    >>> read_utf8_char(sio) == 0x101111
	    True
	    >>> _ = sio.seek(0, os.SEEK_SET)
	    >>> write_utf8_char(sio, 0x51111)
	    >>> _ = sio.seek(0, os.SEEK_SET)
	    >>> read_utf8_char(sio) == 0x51111
	    True

	    '''
	    # Only the lead byte is inspected here; whether the continuation bytes
	    # form a valid sequence is left entirely to the utf-8 codec below.
	    lead = fd.read(1)
	    fco = ord(lead)
	    if fco < 0x80:
	        # ASCII range: the byte is the code point
	        return fco
	    elif fco < 0xC2:
	        # 0x80-0xBF are continuation bytes, and 0xC0/0xC1 could only begin
	        # overlong encodings, so none of them may start a character
	        raise ValueError("%s not a valid utf-8 start" % hex(fco))
	    elif fco < 0xE0:
	        more = 1
	    elif fco < 0xF0:
	        more = 2
	    elif fco < 0xF5:
	        more = 3
	    else:
	        raise ValueError("%s not in unicode range" % hex(fco))
	    # Reuse the lead byte exactly as it was read: rebuilding it with chr()
	    # yields text instead of bytes on Python 3 and cannot be concatenated.
	    return ord((lead + fd.read(more)).decode("utf-8"))

	if __name__ == "__main__":
	    # The string literal below is not a docstring: it is evaluated and
	    # discarded, which keeps the enclosed range test disabled. That script
	    # walks every non-surrogate code point from high to low and prints the
	    # first-byte bounds at which the UTF-8 encoded length changes.
	    '''
	    # Range test

	    import sys

	    if sys.version_info[0] == 2:
	        utf8chr = lambda x: ord(unichr(x).encode("utf-8")[0])
	    else:
	        xrange = range
	        unichr = chr
	        utf8chr = lambda x: chr(x).encode("utf-8")[0]

	    curlen = 5
	    margins = []
	    ranges = (
	        xrange(0x10FFFF, 0xDFFF, -1),
	        xrange(0xD800-1, -1, -1)
	    )
	    for r in ranges:
	        for i in r:
	            nlen = len(unichr(i).encode("utf-8"))
	            if nlen < curlen:
	                curlen = nlen
	                if margins:
	                    margins[-1][0] = utf8chr(i+1)-1
	                margins.append([None, utf8chr(i)+1])

	    margins.reverse()
	    for nmin, nmax in margins:
	        if nmin:
	            print(" 0x%X < x < 0x%X" % (nmin, nmax))
	        else:
	            print("        x < 0x%X" % (nmax))
	    '''

	    # Executing the file as a script runs the doctests embedded in it.
	    import doctest
	    doctest.testmod()