mmalone/readutf.java

## readutf.java
/**
 * Reads from the stream <code>in</code> a Unicode character string encoded
 * in <a href="DataInput.html#modified-utf-8">modified UTF-8</a> format and
 * returns it as a <code>String</code>.
 * <p>
 * The encoded form is an unsigned 16-bit byte count, read with
 * <code>readUnsignedShort</code>, followed by that many bytes. Each one-,
 * two- or three-byte sequence in those bytes produces exactly one
 * <code>char</code> of the result. The details of the modified UTF-8
 * representation are exactly the same as for the <code>readUTF</code>
 * method of <code>DataInput</code>.
 *
 * @param      in   a data input stream.
 * @return     a Unicode string.
 * @throws     EOFException            if the input stream reaches the end
 *               before all the bytes have been read.
 * @throws     IOException             if the stream has been closed and the
 *               contained input stream does not support reading after
 *               close, or another I/O error occurs.
 * @throws     UTFDataFormatException  if the bytes do not represent a
 *               valid modified UTF-8 encoding of a Unicode string.
 * @see        java.io.DataInputStream#readUnsignedShort()
 */
public static final String readUTF(DataInput in) throws IOException {
    final int utflen = in.readUnsignedShort();

    // Scratch buffers are always local. The previous version borrowed the
    // bytearr/chararr fields of DataInputStream, which are not part of its
    // public API and therefore cannot be referenced from outside that class.
    // Every decoded char consumes at least one byte, so utflen chars is
    // always enough room.
    final byte[] bytearr = new byte[utflen];
    final char[] chararr = new char[utflen];
    in.readFully(bytearr, 0, utflen);

    int c, char2, char3;
    int count = 0;          // index of the next undecoded byte
    int chararrCount = 0;   // number of chars produced so far

    // Fast path: copy the leading run of single-byte (0xxxxxxx) characters.
    while (count < utflen) {
        c = bytearr[count] & 0xff;
        if (c > 127) {
            break;
        }
        count++;
        chararr[chararrCount++] = (char) c;
    }

    // General path: dispatch on the high nibble of the lead byte.
    while (count < utflen) {
        c = bytearr[count] & 0xff;

        switch (c >> 4) {
            case 0: case 1: case 2: case 3: case 4: case 5: case 6: case 7:
                /* 0xxxxxxx */
                count++;
                chararr[chararrCount++] = (char) c;
                break;
            case 12: case 13:
                /* 110x xxxx   10xx xxxx */
                count += 2;
                if (count > utflen) {
                    throw new UTFDataFormatException(
                        "malformed input: partial character at end");
                }
                // Sign extension is harmless: only the low 8 bits are inspected.
                char2 = bytearr[count - 1];
                if ((char2 & 0xC0) != 0x80) {
                    throw new UTFDataFormatException(
                        "malformed input around byte " + count);
                }
                chararr[chararrCount++] = (char) (((c & 0x1F) << 6)
                                                  | (char2 & 0x3F));
                break;
            case 14:
                /* 1110 xxxx  10xx xxxx  10xx xxxx */
                count += 3;
                if (count > utflen) {
                    throw new UTFDataFormatException(
                        "malformed input: partial character at end");
                }
                char2 = bytearr[count - 2];
                char3 = bytearr[count - 1];
                if (((char2 & 0xC0) != 0x80) || ((char3 & 0xC0) != 0x80)) {
                    throw new UTFDataFormatException(
                        "malformed input around byte " + (count - 1));
                }
                chararr[chararrCount++] = (char) (((c     & 0x0F) << 12)
                                                  | ((char2 & 0x3F) << 6)
                                                  | (char3 & 0x3F));
                break;
            default:
                /* 10xx xxxx,  1111 xxxx */
                throw new UTFDataFormatException(
                    "malformed input around byte " + count);
        }
    }
    // The number of chars produced may be less than utflen
    return new String(chararr, 0, chararrCount);
}

## readutf.py
import struct

def read_utf(fd):
    """Python implementation of Java's DataInputStream.readUTF method.

    Reads a big-endian unsigned 16-bit byte count from ``fd``, then that many
    bytes, and decodes them as Java "modified UTF-8". That format differs
    from standard UTF-8 in two ways that a plain ``decode('utf8')`` rejects:

    * U+0000 is written as the two-byte sequence ``C0 80``.
    * Characters above U+FFFF are written as a UTF-16 surrogate pair, each
      half encoded as its own three-byte sequence.

    :param fd: binary file-like object whose ``read(n)`` returns ``bytes``.
    :returns: the decoded ``str``.
    :raises struct.error: if fewer than two bytes are available for the
        length prefix.
    :raises EOFError: if the stream ends before the announced number of
        payload bytes has been read.
    :raises UnicodeDecodeError: if the payload is not decodable.
    """
    length = struct.unpack('>H', fd.read(2))[0]
    payload = fd.read(length)
    if len(payload) < length:
        # Previously a short read was decoded as if it were the whole string.
        raise EOFError('expected %d bytes of string data, got %d'
                       % (length, len(payload)))

    # 0xC0 is never a continuation byte, so every C0 80 found here is a
    # complete encoded NUL and can be rewritten in place.
    payload = payload.replace(b'\xc0\x80', b'\x00')

    # 'surrogatepass' lets the individually encoded surrogate halves through;
    # the UTF-16 round trip then fuses each high/low pair into one code point.
    halves = payload.decode('utf-8', 'surrogatepass')
    return halves.encode('utf-16-le', 'surrogatepass').decode('utf-16-le', 'surrogatepass')
	/**
	* Reads from the
	* stream <code>in</code> a representation
	* of a Unicode character string encoded in
	* <a href="DataInput.html#modified-utf-8">modified UTF-8</a> format;
	* this string of characters is then returned as a <code>String</code>.
	* The details of the modified UTF-8 representation
	* are exactly the same as for the <code>readUTF</code>
	* method of <code>DataInput</code>.
	*
	* @param in a data input stream.
	* @return a Unicode string.
	* @exception EOFException if the input stream reaches the end
	* before all the bytes.
	* @exception IOException the stream has been closed and the contained
	* input stream does not support reading after close, or
	* another I/O error occurs.
	* @exception UTFDataFormatException if the bytes do not represent a
	* valid modified UTF-8 encoding of a Unicode string.
	* @see java.io.DataInputStream#readUnsignedShort()
	*/
	public final static String readUTF(DataInput in) throws IOException {
	int utflen = in.readUnsignedShort();
	byte[] bytearr = null;
	char[] chararr = null;
	if (in instanceof DataInputStream) {
	DataInputStream dis = (DataInputStream)in;
	if (dis.bytearr.length < utflen){
	dis.bytearr = new byte[utflen*2];
	dis.chararr = new char[utflen*2];
	}
	chararr = dis.chararr;
	bytearr = dis.bytearr;
	} else {
	bytearr = new byte[utflen];
	chararr = new char[utflen];
	}
	int c, char2, char3;
	int count = 0;
	int chararr_count=0;
	in.readFully(bytearr, 0, utflen);
	while (count < utflen) {
	c = (int) bytearr[count] & 0xff;
	if (c > 127) break;
	count++;
	chararr[chararr_count++]=(char)c;
	}
	while (count < utflen) {
	c = (int) bytearr[count] & 0xff;

	switch (c >> 4) {
	case 0: case 1: case 2: case 3: case 4: case 5: case 6: case 7:
	/* 0xxxxxxx*/
	count++;
	chararr[chararr_count++]=(char)c;
	break;
	case 12: case 13:
	/* 110x xxxx 10xx xxxx*/
	count += 2;
	if (count > utflen)
	throw new UTFDataFormatException(
	"malformed input: partial character at end");
	char2 = (int) bytearr[count-1];
	if ((char2 & 0xC0) != 0x80)
	throw new UTFDataFormatException(
	"malformed input around byte " + count);
	chararr[chararr_count++]=(char)(((c & 0x1F) << 6) \|
	(char2 & 0x3F));
	break;
	case 14:
	/* 1110 xxxx 10xx xxxx 10xx xxxx */
	count += 3;
	if (count > utflen)
	throw new UTFDataFormatException(
	"malformed input: partial character at end");
	char2 = (int) bytearr[count-2];
	char3 = (int) bytearr[count-1];
	if (((char2 & 0xC0) != 0x80) \|\| ((char3 & 0xC0) != 0x80))
	throw new UTFDataFormatException(
	"malformed input around byte " + (count-1));
	chararr[chararr_count++]=(char)(((c & 0x0F) << 12) \|
	((char2 & 0x3F) << 6) \|
	((char3 & 0x3F) << 0));
	break;
	default:
	/* 10xx xxxx, 1111 xxxx */
	throw new UTFDataFormatException(
	"malformed input around byte " + count);
	}
	}
	// The number of chars produced may be less than utflen
	return new String(chararr, 0, chararr_count);
	}
	import struct

	def read_utf(fd):
		"""Python implementation of Java's DataInputStream.readUTF method.

		Reads a big-endian unsigned 16-bit byte count from ``fd``, then that
		many bytes, and decodes them as Java "modified UTF-8". That format
		differs from standard UTF-8 in two ways that a plain
		``decode('utf8')`` rejects: U+0000 is written as ``C0 80``, and
		characters above U+FFFF are written as a UTF-16 surrogate pair with
		each half encoded as its own three-byte sequence.

		:param fd: binary file-like object whose ``read(n)`` returns ``bytes``.
		:returns: the decoded ``str``.
		:raises struct.error: if fewer than two bytes are available for the
			length prefix.
		:raises EOFError: if the stream ends before the announced number of
			payload bytes has been read.
		:raises UnicodeDecodeError: if the payload is not decodable.
		"""
		length = struct.unpack('>H', fd.read(2))[0]
		payload = fd.read(length)
		if len(payload) < length:
			# Previously a short read was decoded as if it were the whole string.
			raise EOFError('expected %d bytes of string data, got %d'
				% (length, len(payload)))

		# 0xC0 is never a continuation byte, so every C0 80 found here is a
		# complete encoded NUL and can be rewritten in place.
		payload = payload.replace(b'\xc0\x80', b'\x00')

		# 'surrogatepass' lets the individually encoded surrogate halves
		# through; the UTF-16 round trip then fuses each high/low pair.
		halves = payload.decode('utf-8', 'surrogatepass')
		return halves.encode('utf-16-le', 'surrogatepass').decode('utf-16-le', 'surrogatepass')