Created
October 10, 2012 07:35
-
-
Save lzxz1234/3863752 to your computer and use it in GitHub Desktop.
即时判断字节编码类型
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
public static String getCharset(byte[] bytes) { | |
if(bytes == null || bytes.length < 2) return "GBK"; | |
if(bytes[0] == (byte)0xFF && bytes[1] == (byte)0xFE) { | |
return "UTF-16LE"; | |
} else if (bytes[0] == (byte)0xFE && bytes[1] == (byte)0xFF) { | |
return "UTF-16BE"; | |
} else if (bytes[0] == (byte)0xEF && bytes[1] == (byte)0xBB && bytes[2] == (byte)0xBF) { | |
return "UTF-8"; | |
} | |
for(int i = 0, b = bytes[i] & 0xff; i < bytes.length; i ++) { | |
b = bytes[i] & 0xff; | |
if(b >= 0xF0) return "GBK"; | |
if(0x80 <= b && b <=0xBF) return "GBK";//单独出现BF以下的,也算是GBK | |
if(0xC0 <= b && b <= 0xDF) { | |
i ++; b = bytes[i] & 0xff; | |
if(0x80 <= b && b <= 0xBF) { | |
continue;//双字节 (0xC0 - 0xDF) (0x80- 0xBF),也可能在GBK编码内 | |
} else { | |
return "GBK"; | |
} | |
} else if (0xE0 <= b && b <= 0xEF) {// 也有可能出错,但是几率较小 | |
i ++; b = bytes[i] & 0xff; | |
if(0x80 <= b && b <= 0xBF) { | |
i ++; b = bytes[i] & 0xff; | |
if(0x80 <= b && b <= 0xBF) { | |
return "UTF-8"; | |
} | |
} | |
return "GBK"; | |
} | |
} | |
return "GBK"; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment