Skip to content

Instantly share code, notes, and snippets.

@rayyee
Created February 11, 2014 14:07
Show Gist options
  • Save rayyee/8935434 to your computer and use it in GitHub Desktop.
Save rayyee/8935434 to your computer and use it in GitHub Desktop.
Read a text string from a binary file, auto-detecting UTF-8 (with GB2312 as the fallback encoding).
/**
 * Reads text from a binary stream whose encoding is not known in advance.
 *
 * Detection order:
 *   1. A UTF-8 byte order mark (EF BB BF) at the current position. The mark is
 *      consumed and the remaining bytes are decoded as 'utf-8'.
 *   2. The isUTF8() heuristic over the requested range.
 *   3. Otherwise the bytes are decoded as 'gb2312'.
 *
 * @param ba  Source bytes; reading starts at ba.position.
 * @param len Number of bytes to read. A negative value (default -1), or a value
 *            larger than ba.bytesAvailable, reads to the end of the stream.
 * @return The decoded text, without a leading BOM.
 */
public static function readString(ba:ByteArray,len:int = -1):String
{
    // Clamp the requested length to what is actually available.
    if (len < 0 || len > ba.bytesAvailable) len = ba.bytesAvailable;

    // Only look for a BOM when the requested range is long enough to hold one;
    // otherwise "len - 3" below would go negative.
    if (len >= 3)
    {
        var start:uint = ba.position;
        // && short-circuits, so the position after this expression may be
        // start+1, start+2 or start+3; it is start+3 only when all three match.
        var hasBom:Boolean = ba.readUnsignedByte() == 0xEF
            && ba.readUnsignedByte() == 0xBB
            && ba.readUnsignedByte() == 0xBF;
        if (hasBom)
        {
            // The three BOM bytes are already consumed; decode the rest.
            return ba.readMultiByte(len - 3, 'utf-8');
        }
        // No BOM: rewind so detection and decoding see the whole range.
        ba.position = start;
    }

    // No BOM: inspect the bytes themselves to decide between UTF-8 and GB2312.
    var encode:String = isUTF8(ba, len) ? 'utf-8' : 'gb2312';
    // The position was rewound above, so the full length must be read here.
    return ba.readMultiByte(len, encode);
}
/**
 * Heuristically decides whether a byte range is UTF-8 encoded text.
 *
 * The range is scanned once. Bytes below 0x80 are counted as ASCII; bytes that
 * belong to a structurally valid 2-, 3- or 4-byte UTF-8 sequence are counted as
 * "good". The result is based on the percentage of non-ASCII bytes that are
 * good. A BOM is not given any special treatment here.
 *
 * ba.position is restored to its original value before returning.
 *
 * @param ba  Source bytes; scanning starts at ba.position.
 * @param len Number of bytes to inspect. A negative value (default -1), or a
 *            value larger than ba.bytesAvailable, inspects to the end of the stream.
 * @return true if the range looks like UTF-8; false otherwise, including when
 *         the range is empty or pure ASCII (nothing to base a decision on).
 */
public static function isUTF8(ba:ByteArray,len:int = -1):Boolean
{
    if (len < 0 || len > ba.bytesAvailable) len = ba.bytesAvailable;

    var start:uint = ba.position;
    var asciiBytes:int = 0;
    var goodBytes:int = 0;
    var i:int = 0;

    while (i < len)
    {
        ba.position = start + i;
        var lead:int = ba.readUnsignedByte();

        if (lead < 0x80)
        {
            // Plain ASCII: tracked separately so it does not skew the ratio.
            asciiBytes++;
            i++;
            continue;
        }

        // Number of continuation bytes (10xxxxxx) the lead byte announces.
        var need:int = 0;
        if (lead >= 0xC0 && lead <= 0xDF) need = 1;       // two-byte sequence
        else if (lead >= 0xE0 && lead <= 0xEF) need = 2;  // three-byte sequence
        else if (lead >= 0xF0 && lead <= 0xF4) need = 3;  // four-byte sequence

        // The sequence must fit inside the inspected range.
        var ok:Boolean = need > 0 && (len - i - 1) >= need;
        for (var k:int = 0; ok && k < need; k++)
        {
            ok = (ba.readUnsignedByte() & 0xC0) == 0x80;
        }

        if (ok)
        {
            goodBytes += need + 1;
            i += need + 1;
        }
        else
        {
            // Stray or malformed byte: count nothing and resume at the next byte.
            i++;
        }
    }

    ba.position = start;

    // Empty or ASCII-only input carries no evidence of UTF-8 (and would divide by zero).
    if (asciiBytes == len)
    {
        return false;
    }

    // Percentage of non-ASCII bytes that are part of well-formed sequences,
    // truncated to an integer.
    var score:int = 100 * goodBytes / (len - asciiBytes);

    // Require a near-perfect score to rule out coincidental matches; accept a
    // slightly lower score only when there is enough evidence.
    return score > 98 || (score > 95 && goodBytes > 30);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment