rayyee/ReadUtfBytes.as

## ReadUtfBytes.as
/**
 * Reads text from a binary stream whose encoding is not known in advance.
 *
 * Detection order:
 *   1. If the data starts with the UTF-8 byte order mark (EF BB BF), the
 *      mark is skipped and the rest is decoded as 'utf-8'.
 *   2. Otherwise isUTF8() is asked to judge the bytes; if it reports true
 *      the data is decoded as 'utf-8'.
 *   3. Otherwise the data is decoded as 'gb2312'.
 *
 * @param   ba  Source bytes; reading starts at ba.position and advances it
 *              past the bytes that were consumed.
 * @param   len Number of bytes to read. Any negative value (default -1) or
 *              a value larger than ba.bytesAvailable means "read to the end".
 * @return  The decoded string (the BOM, when present, is not part of it).
 */
public static function readString(ba:ByteArray,len:int = -1):String
{
    // Normalise the window: negative or oversized means "everything left".
    if (len < 0 || len > ba.bytesAvailable) len = ba.bytesAvailable;

    // A BOM can only be present when the requested window holds 3 bytes;
    // gating on len (not bytesAvailable) keeps len - 3 from going negative.
    if (len >= 3)
    {
        var start:uint = ba.position;
        var hasBom:Boolean = ba.readUnsignedByte() == 0xEF
            && ba.readUnsignedByte() == 0xBB
            && ba.readUnsignedByte() == 0xBF;
        if (hasBom)
        {
            // The three BOM bytes are already consumed.
            return ba.readMultiByte(len - 3, 'utf-8');
        }
        // Not a BOM: rewind so the probed bytes are decoded as content.
        ba.position = start;
    }

    // No BOM: fall back to the statistical check, defaulting to gb2312.
    var encode:String = 'gb2312';
    if (isUTF8(ba, len))
    {
        encode = 'utf-8';
    }

    // Nothing was skipped on this path, so the full window must be read.
    return ba.readMultiByte(len, encode);
}

/**
 * Heuristically decides whether a run of bytes is UTF-8 encoded text.
 *
 * Bytes below 0x80 are counted as ASCII and ignored for scoring. Every
 * other byte is either part of a well-formed 2-, 3- or 4-byte UTF-8
 * sequence (a "good" byte) or not. The score is the integer percentage of
 * good bytes among all non-ASCII bytes; the data is reported as UTF-8 when
 * the score is above 98, or above 95 with more than 30 good bytes.
 * Pure-ASCII (or empty) input returns false.
 *
 * ba.position is left unchanged.
 *
 * @param   ba  Bytes to inspect, starting at ba.position.
 * @param   len Number of bytes to inspect. Any negative value (default -1)
 *              or a value larger than ba.bytesAvailable means "to the end".
 * @return  true if the bytes look like UTF-8, false otherwise.
 */
public static function isUTF8(ba:ByteArray,len:int = -1):Boolean
{
    if (len < 0 || len > ba.bytesAvailable) len = ba.bytesAvailable;

    var base:uint = ba.position;
    var goodbytes:int = 0;
    var asciibytes:int = 0;
    var offset:int = 0;
    var lead:int;
    var expected:int;
    var k:int;
    var wellFormed:Boolean;

    // Indexed access yields unsigned bytes and never moves ba.position.
    while (offset < len)
    {
        lead = ba[base + offset];

        if (lead < 0x80)
        {
            // High bit clear: plain ASCII, excluded from the score.
            asciibytes++;
            offset++;
            continue;
        }

        // Sequence length announced by the lead byte (0 = not a lead byte).
        if (lead >= 0xC0 && lead <= 0xDF) expected = 2;
        else if (lead >= 0xE0 && lead <= 0xEF) expected = 3;
        else if (lead >= 0xF0 && lead <= 0xF4) expected = 4;
        else expected = 0;

        wellFormed = expected > 0 && expected <= len - offset;
        // Every trailing byte must look like 10xxxxxx.
        for (k = 1; wellFormed && k < expected; k++)
        {
            if ((ba[base + offset + k] & 0xC0) != 0x80) wellFormed = false;
        }

        if (wellFormed)
        {
            goodbytes += expected;
            offset += expected;
        }
        else
        {
            // Malformed or truncated: skip one byte and resynchronise.
            offset++;
        }
    }

    var nonAscii:int = len - asciibytes;
    if (nonAscii == 0)
    {
        // Nothing but ASCII (or no data): no evidence for UTF-8.
        return false;
    }

    // Integer percentage (fraction truncated) of well-formed non-ASCII bytes.
    var score:int = 100 * goodbytes / nonAscii;
    // Tolerate a few bad sequences only when there is plenty of evidence.
    return score > 98 || (score > 95 && goodbytes > 30);
}
	/**
	 * Reads text from a binary stream whose encoding is not known in advance.
	 *
	 * Detection order:
	 *   1. If the data starts with the UTF-8 byte order mark (EF BB BF), the
	 *      mark is skipped and the rest is decoded as 'utf-8'.
	 *   2. Otherwise isUTF8() is asked to judge the bytes; if it reports true
	 *      the data is decoded as 'utf-8'.
	 *   3. Otherwise the data is decoded as 'gb2312'.
	 *
	 * NOTE(review): a readString with the same signature also appears earlier
	 * in this file; confirm which copy is meant to be kept.
	 *
	 * @param   ba  Source bytes; reading starts at ba.position and advances it
	 *              past the bytes that were consumed.
	 * @param   len Number of bytes to read. Any negative value (default -1) or
	 *              a value larger than ba.bytesAvailable means "read to the end".
	 * @return  The decoded string (the BOM, when present, is not part of it).
	 */
	public static function readString(ba:ByteArray,len:int = -1):String
	{
		// Normalise the window: negative or oversized means "everything left".
		if (len < 0 || len > ba.bytesAvailable) len = ba.bytesAvailable;

		// A BOM can only be present when the requested window holds 3 bytes;
		// gating on len (not bytesAvailable) keeps len - 3 from going negative.
		if (len >= 3)
		{
			var start:uint = ba.position;
			var hasBom:Boolean = ba.readUnsignedByte() == 0xEF
				&& ba.readUnsignedByte() == 0xBB
				&& ba.readUnsignedByte() == 0xBF;
			if (hasBom)
			{
				// The three BOM bytes are already consumed.
				return ba.readMultiByte(len - 3, 'utf-8');
			}
			// Not a BOM: rewind so the probed bytes are decoded as content.
			ba.position = start;
		}

		// No BOM: fall back to the statistical check, defaulting to gb2312.
		var encode:String = 'gb2312';
		if (isUTF8(ba, len))
		{
			encode = 'utf-8';
		}

		// Nothing was skipped on this path, so the full window must be read.
		return ba.readMultiByte(len, encode);
	}

	/**
	 * Heuristically decides whether a run of bytes is UTF-8 encoded text.
	 *
	 * Bytes below 0x80 are counted as ASCII and ignored for scoring. Every
	 * other byte is either part of a well-formed 2-, 3- or 4-byte UTF-8
	 * sequence (a "good" byte) or not. The score is the integer percentage of
	 * good bytes among all non-ASCII bytes; the data is reported as UTF-8 when
	 * the score is above 98, or above 95 with more than 30 good bytes.
	 * Pure-ASCII (or empty) input returns false.
	 *
	 * ba.position is left unchanged.
	 *
	 * NOTE(review): an isUTF8 with the same signature also appears earlier in
	 * this file; confirm which copy is meant to be kept.
	 *
	 * @param   ba  Bytes to inspect, starting at ba.position.
	 * @param   len Number of bytes to inspect. Any negative value (default -1)
	 *              or a value larger than ba.bytesAvailable means "to the end".
	 * @return  true if the bytes look like UTF-8, false otherwise.
	 */
	public static function isUTF8(ba:ByteArray,len:int = -1):Boolean
	{
		if (len < 0 || len > ba.bytesAvailable) len = ba.bytesAvailable;

		var base:uint = ba.position;
		var goodbytes:int = 0;
		var asciibytes:int = 0;
		var offset:int = 0;
		var lead:int;
		var expected:int;
		var k:int;
		var wellFormed:Boolean;

		// Indexed access yields unsigned bytes and never moves ba.position.
		while (offset < len)
		{
			lead = ba[base + offset];

			if (lead < 0x80)
			{
				// High bit clear: plain ASCII, excluded from the score.
				asciibytes++;
				offset++;
				continue;
			}

			// Sequence length announced by the lead byte (0 = not a lead byte).
			if (lead >= 0xC0 && lead <= 0xDF) expected = 2;
			else if (lead >= 0xE0 && lead <= 0xEF) expected = 3;
			else if (lead >= 0xF0 && lead <= 0xF4) expected = 4;
			else expected = 0;

			wellFormed = expected > 0 && expected <= len - offset;
			// Every trailing byte must look like 10xxxxxx.
			for (k = 1; wellFormed && k < expected; k++)
			{
				if ((ba[base + offset + k] & 0xC0) != 0x80) wellFormed = false;
			}

			if (wellFormed)
			{
				goodbytes += expected;
				offset += expected;
			}
			else
			{
				// Malformed or truncated: skip one byte and resynchronise.
				offset++;
			}
		}

		var nonAscii:int = len - asciibytes;
		if (nonAscii == 0)
		{
			// Nothing but ASCII (or no data): no evidence for UTF-8.
			return false;
		}

		// Integer percentage (fraction truncated) of well-formed non-ASCII bytes.
		var score:int = 100 * goodbytes / nonAscii;
		// Tolerate a few bad sequences only when there is plenty of evidence.
		return score > 98 || (score > 95 && goodbytes > 30);
	}