public
Last active

Java vs. Python

  • Download Gist
readutf.java
Java
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93
/**
* Reads from the
* stream <code>in</code> a representation
* of a Unicode character string encoded in
* <a href="DataInput.html#modified-utf-8">modified UTF-8</a> format;
* this string of characters is then returned as a <code>String</code>.
* The details of the modified UTF-8 representation
* are exactly the same as for the <code>readUTF</code>
* method of <code>DataInput</code>.
*
* @param in a data input stream.
* @return a Unicode string.
* @exception EOFException if the input stream reaches the end
* before all the bytes.
* @exception IOException the stream has been closed and the contained
* input stream does not support reading after close, or
* another I/O error occurs.
* @exception UTFDataFormatException if the bytes do not represent a
* valid modified UTF-8 encoding of a Unicode string.
* @see java.io.DataInputStream#readUnsignedShort()
*/
public final static String readUTF(DataInput in) throws IOException {
int utflen = in.readUnsignedShort();
byte[] bytearr = null;
char[] chararr = null;
if (in instanceof DataInputStream) {
DataInputStream dis = (DataInputStream)in;
if (dis.bytearr.length < utflen){
dis.bytearr = new byte[utflen*2];
dis.chararr = new char[utflen*2];
}
chararr = dis.chararr;
bytearr = dis.bytearr;
} else {
bytearr = new byte[utflen];
chararr = new char[utflen];
}
int c, char2, char3;
int count = 0;
int chararr_count=0;
in.readFully(bytearr, 0, utflen);
while (count < utflen) {
c = (int) bytearr[count] & 0xff;
if (c > 127) break;
count++;
chararr[chararr_count++]=(char)c;
}
while (count < utflen) {
c = (int) bytearr[count] & 0xff;
 
switch (c >> 4) {
case 0: case 1: case 2: case 3: case 4: case 5: case 6: case 7:
/* 0xxxxxxx*/
count++;
chararr[chararr_count++]=(char)c;
break;
case 12: case 13:
/* 110x xxxx 10xx xxxx*/
count += 2;
if (count > utflen)
throw new UTFDataFormatException(
"malformed input: partial character at end");
char2 = (int) bytearr[count-1];
if ((char2 & 0xC0) != 0x80)
throw new UTFDataFormatException(
"malformed input around byte " + count);
chararr[chararr_count++]=(char)(((c & 0x1F) << 6) |
(char2 & 0x3F));
break;
case 14:
/* 1110 xxxx 10xx xxxx 10xx xxxx */
count += 3;
if (count > utflen)
throw new UTFDataFormatException(
"malformed input: partial character at end");
char2 = (int) bytearr[count-2];
char3 = (int) bytearr[count-1];
if (((char2 & 0xC0) != 0x80) || ((char3 & 0xC0) != 0x80))
throw new UTFDataFormatException(
"malformed input around byte " + (count-1));
chararr[chararr_count++]=(char)(((c & 0x0F) << 12) |
((char2 & 0x3F) << 6) |
((char3 & 0x3F) << 0));
break;
default:
/* 10xx xxxx, 1111 xxxx */
throw new UTFDataFormatException(
"malformed input around byte " + count);
}
}
// The number of chars produced may be less than utflen
return new String(chararr, 0, chararr_count);
}
readutf.py
Python
1 2 3 4 5 6
import struct
 
def read_utf(fd):
"Python implementation of Java's DataInputStream.readUTF method."
length = struct.unpack('>H', fd.read(2))[0]
return fd.read(length).decode('utf8')

I realize that this is very old, but there are some points I strongly feel I must harp on.
First of all, this is an unfair comparison; in reality, a proper comparison would be either:

import java.io.*;

public final static String readUTF(DataInputStream in) throws IOException {
    return in.readUTF();
}
import struct

def read_utf(fd):
    "Python implementation of Java's DataInputStream.readUTF method."
    length = struct.unpack('>H', fd.read(2))[0]
    return fd.read(length).decode('utf8')

Or the code written in Java above with the Python code below including all of the code in struct.unpack, fd.read, and string.decode included in the def itself. The code written in Java is shorter in the former, and I have the feeling that it would also be shorter in the latter.

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.