Skip to content

Instantly share code, notes, and snippets.

@lxneng
Forked from mmalone/readutf.java
Created February 25, 2010 01:51
Show Gist options
  • Save lxneng/314123 to your computer and use it in GitHub Desktop.
Save lxneng/314123 to your computer and use it in GitHub Desktop.
Java vs. Python
/**
* Reads from the
* stream <code>in</code> a representation
* of a Unicode character string encoded in
* <a href="DataInput.html#modified-utf-8">modified UTF-8</a> format;
* this string of characters is then returned as a <code>String</code>.
* The details of the modified UTF-8 representation
* are exactly the same as for the <code>readUTF</code>
* method of <code>DataInput</code>.
*
* @param in a data input stream.
* @return a Unicode string.
* @exception EOFException if the input stream reaches the end
* before all the bytes.
* @exception IOException the stream has been closed and the contained
* input stream does not support reading after close, or
* another I/O error occurs.
* @exception UTFDataFormatException if the bytes do not represent a
* valid modified UTF-8 encoding of a Unicode string.
* @see java.io.DataInputStream#readUnsignedShort()
*/
public final static String readUTF(DataInput in) throws IOException {
int utflen = in.readUnsignedShort();
byte[] bytearr = null;
char[] chararr = null;
if (in instanceof DataInputStream) {
DataInputStream dis = (DataInputStream)in;
if (dis.bytearr.length < utflen){
dis.bytearr = new byte[utflen*2];
dis.chararr = new char[utflen*2];
}
chararr = dis.chararr;
bytearr = dis.bytearr;
} else {
bytearr = new byte[utflen];
chararr = new char[utflen];
}
int c, char2, char3;
int count = 0;
int chararr_count=0;
in.readFully(bytearr, 0, utflen);
while (count < utflen) {
c = (int) bytearr[count] & 0xff;
if (c > 127) break;
count++;
chararr[chararr_count++]=(char)c;
}
while (count < utflen) {
c = (int) bytearr[count] & 0xff;
switch (c >> 4) {
case 0: case 1: case 2: case 3: case 4: case 5: case 6: case 7:
/* 0xxxxxxx*/
count++;
chararr[chararr_count++]=(char)c;
break;
case 12: case 13:
/* 110x xxxx 10xx xxxx*/
count += 2;
if (count > utflen)
throw new UTFDataFormatException(
"malformed input: partial character at end");
char2 = (int) bytearr[count-1];
if ((char2 & 0xC0) != 0x80)
throw new UTFDataFormatException(
"malformed input around byte " + count);
chararr[chararr_count++]=(char)(((c & 0x1F) << 6) |
(char2 & 0x3F));
break;
case 14:
/* 1110 xxxx 10xx xxxx 10xx xxxx */
count += 3;
if (count > utflen)
throw new UTFDataFormatException(
"malformed input: partial character at end");
char2 = (int) bytearr[count-2];
char3 = (int) bytearr[count-1];
if (((char2 & 0xC0) != 0x80) || ((char3 & 0xC0) != 0x80))
throw new UTFDataFormatException(
"malformed input around byte " + (count-1));
chararr[chararr_count++]=(char)(((c & 0x0F) << 12) |
((char2 & 0x3F) << 6) |
((char3 & 0x3F) << 0));
break;
default:
/* 10xx xxxx, 1111 xxxx */
throw new UTFDataFormatException(
"malformed input around byte " + count);
}
}
// The number of chars produced may be less than utflen
return new String(chararr, 0, chararr_count);
}
import struct
def read_utf(fd):
"Python implementation of Java's DataInputStream.readUTF method."
length = struct.unpack('>H', fd.read(2))[0]
return fd.read(length).decode('utf8')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment