Created
February 11, 2014 14:07
-
-
Save rayyee/8935434 to your computer and use it in GitHub Desktop.
Read utf string from bytes file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Reads text from a binary stream whose encoding is not known in advance.
 *
 * The encoding is chosen as follows:
 *   1. If the data starts with the UTF-8 byte order mark (EF BB BF), the
 *      mark is skipped and the remainder is decoded as 'utf-8'.
 *   2. Otherwise isUTF8() is consulted; if it accepts the data it is decoded
 *      as 'utf-8', else as 'gb2312'.
 *
 * Reading starts at ba.position and advances it past the bytes consumed.
 *
 * @param ba  Source byte array.
 * @param len Number of bytes to read. A negative value (default -1) or a
 *            value larger than ba.bytesAvailable means "read to the end".
 * @return The decoded string (without the byte order mark, if one was present).
 */
public static function readString(ba:ByteArray,len:int = -1):String
{
    // Normalise the requested length to what is actually available.
    if (len < 0 || len > ba.bytesAvailable) len = ba.bytesAvailable;

    // Probe for the UTF-8 BOM only inside the requested window, so that
    // "len - 3" below can never become negative. Indexed access returns
    // unsigned bytes and leaves ba.position untouched when there is no BOM.
    if (len >= 3)
    {
        var start:int = ba.position;
        if (ba[start] == 0xEF && ba[start + 1] == 0xBB && ba[start + 2] == 0xBF)
        {
            // Skip the three BOM bytes; they are part of len but not of the text.
            ba.position = start + 3;
            return ba.readMultiByte(len - 3, 'utf-8');
        }
    }

    // No BOM: inspect the bytes themselves to decide between UTF-8 and GB2312.
    var encode:String = isUTF8(ba, len) ? 'utf-8' : 'gb2312';

    // Nothing was consumed on this path, so the full len bytes must be read
    // (the original subtracted 3 here and lost the tail of the text).
    return ba.readMultiByte(len, encode);
}
/**
 * Heuristically decides whether upcoming bytes of a stream are UTF-8 text.
 *
 * Every byte in the window is classified as ASCII (high bit clear), part of
 * a well-formed multi-byte UTF-8 sequence (2, 3 or 4 bytes), or neither.
 * ASCII bytes are ignored because they are valid in both UTF-8 and GB2312.
 * The score is the percentage of non-ASCII bytes that belong to well-formed
 * sequences.
 *
 * The stream position is not modified.
 *
 * @param ba  Source byte array; inspection starts at ba.position.
 * @param len Number of bytes to inspect. A negative value (default -1) or a
 *            value larger than ba.bytesAvailable means "inspect to the end".
 * @return true when the score is above 98, or above 95 with more than 30
 *         well-formed multi-byte bytes; false otherwise, including when the
 *         window is empty or pure ASCII.
 */
public static function isUTF8(ba:ByteArray,len:int = -1):Boolean
{
    if (len < 0 || len > ba.bytesAvailable) len = ba.bytesAvailable;

    var start:int = ba.position;
    var goodbytes:int = 0;
    var asciibytes:int = 0;
    var i:int = 0;
    var k:int;
    var lead:int;
    var trail:int;
    var need:int;
    var ok:Boolean;

    while (i < len)
    {
        // Indexed access yields an unsigned byte (0..255) and does not move
        // ba.position, so no save/restore of the position is needed.
        lead = ba[start + i];

        if (lead < 0x80)
        {
            // Plain ASCII: counted separately so it cannot skew the score.
            asciibytes++;
            i++;
            continue;
        }

        // Number of continuation bytes implied by the lead byte.
        need = 0;
        if (lead >= 0xC0 && lead <= 0xDF) need = 1;      // two-byte sequence
        else if (lead >= 0xE0 && lead <= 0xEF) need = 2; // three-byte sequence
        else if (lead >= 0xF0 && lead <= 0xF4) need = 3; // four-byte sequence

        // The sequence must fit in the window and every trailing byte must
        // be a continuation byte (10xxxxxx, i.e. 0x80..0xBF).
        ok = need > 0 && (len - i - 1) >= need;
        for (k = 1; ok && k <= need; k++)
        {
            trail = ba[start + i + k];
            ok = trail >= 0x80 && trail <= 0xBF;
        }

        if (ok)
        {
            goodbytes += need + 1;
            i += need + 1;
        }
        else
        {
            // Malformed or unexpected byte: skip just this byte and resync.
            i++;
        }
    }

    // Empty or pure-ASCII input gives no evidence for UTF-8 (and would make
    // the divisor below zero).
    if (asciibytes == len)
    {
        return false;
    }

    // Integer percentage of non-ASCII bytes that were well-formed UTF-8.
    var score:int = 100 * goodbytes / (len - asciibytes);

    // Require a very high score to avoid coincidental matches, while
    // tolerating a few malformed sequences in longer texts.
    if (score > 98) return true;
    if (score > 95 && goodbytes > 30) return true;
    return false;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment