Skip to content

Instantly share code, notes, and snippets.

@lzxz1234
Created October 10, 2012 07:35
Show Gist options
  • Save lzxz1234/3863752 to your computer and use it in GitHub Desktop.
Save lzxz1234/3863752 to your computer and use it in GitHub Desktop.
即时判断字节编码类型
/**
 * Guesses the character encoding of a byte sequence using a quick heuristic.
 *
 * <p>A byte-order mark is checked first (UTF-16LE, UTF-16BE, UTF-8). Otherwise
 * the bytes are scanned from the start: the first three-byte sequence shaped
 * like UTF-8 (lead byte 0xE0-0xEF followed by two bytes in 0x80-0xBF) yields
 * {@code "UTF-8"}; any byte that does not fit the UTF-8 shapes handled here
 * yields {@code "GBK"}. {@code "GBK"} is also the default for {@code null},
 * inputs shorter than two bytes, pure ASCII input, and sequences truncated at
 * the end of the array.
 *
 * <p>This is a heuristic, not a validator: the result can be wrong for some
 * inputs.
 *
 * @param bytes the raw bytes to inspect; may be {@code null}
 * @return one of {@code "UTF-16LE"}, {@code "UTF-16BE"}, {@code "UTF-8"} or
 *         {@code "GBK"}
 */
public static String getCharset(byte[] bytes) {
    if (bytes == null || bytes.length < 2) {
        return "GBK";
    }

    // Byte-order marks.
    if (bytes[0] == (byte) 0xFF && bytes[1] == (byte) 0xFE) {
        return "UTF-16LE";
    }
    if (bytes[0] == (byte) 0xFE && bytes[1] == (byte) 0xFF) {
        return "UTF-16BE";
    }
    // The UTF-8 BOM is three bytes long; check the length before reading bytes[2].
    if (bytes.length >= 3
            && bytes[0] == (byte) 0xEF
            && bytes[1] == (byte) 0xBB
            && bytes[2] == (byte) 0xBF) {
        return "UTF-8";
    }

    int i = 0;
    while (i < bytes.length) {
        int b = bytes[i] & 0xFF;

        if (b < 0x80) {
            // ASCII is valid in both encodings and decides nothing.
            i++;
            continue;
        }
        if (b >= 0xF0) {
            return "GBK";
        }
        if (b <= 0xBF) {
            // A stray byte in 0x80-0xBF (no lead byte before it) is treated as GBK.
            return "GBK";
        }
        if (b <= 0xDF) {
            // Two-byte shape (0xC0-0xDF)(0x80-0xBF): could also be GBK, so keep scanning.
            if (!isContinuationByte(bytes, i + 1)) {
                return "GBK";
            }
            i += 2;
            continue;
        }

        // Lead byte 0xE0-0xEF: three-byte shape. May still be wrong, but the odds are low.
        if (isContinuationByte(bytes, i + 1) && isContinuationByte(bytes, i + 2)) {
            return "UTF-8";
        }
        return "GBK";
    }
    return "GBK";
}

/**
 * Returns whether {@code index} is inside {@code bytes} and the byte there is
 * in the range 0x80-0xBF. An out-of-range index yields {@code false}.
 */
private static boolean isContinuationByte(byte[] bytes, int index) {
    if (index >= bytes.length) {
        return false;
    }
    int b = bytes[index] & 0xFF;
    return 0x80 <= b && b <= 0xBF;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment