Created
June 7, 2013 00:57
-
-
Save MorbZ/5726339 to your computer and use it in GitHub Desktop.
This AS3 class handles strings with UTF-8 multibyte characters (up to 4 bytes/character). Currently it is able to split a String into its characters, concatenate chars to a string, give the binary and the unicode representation of a character and remove symbols/emojis from a string.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package utils
{
    import flash.errors.EOFError;
    import flash.utils.ByteArray;

    /**
     * This AS3 class handles strings with UTF-8 multibyte characters (up to 4 bytes/character).
     * Currently it is able to split a String into its characters, concatenate chars to a string,
     * give the binary and the unicode representation of a character and remove symbols/emojis from
     * a string.
     *
     * All methods share one static scratch ByteArray; each method clears it before use, so no
     * state is carried from one call to the next.
     *
     * @author Merten Peetz
     *
     * @license CC-BY
     */
    public class MultiByteStrUtil
    {
        // Vars

        /** Scratch buffer used to view strings as their UTF-8 byte sequence. */
        private static var bytes:ByteArray = new ByteArray;

        /** Public Methods **/

        /**
         * Splits a string into its single characters and returns them as a vector of strings.
         *
         * The string is written to the scratch buffer as UTF-8 and walked lead byte by lead byte.
         * Stray continuation bytes (10xxxxxx) are skipped; a sequence that is cut off by the end
         * of the buffer ends the scan.
         *
         * @param str The string which will be split. null or "" yields an empty vector.
         *
         * @return A vector of strings where each element is a single (multibyte) character.
         */
        public static function strToCharArray(str:String):Vector.<String>
        {
            var chars:Vector.<String> = new Vector.<String>();
            if(str == null || str.length == 0) {
                return chars;
            }

            //init byte-array
            bytes.clear();
            bytes.writeUTFBytes(str);
            bytes.position = 0;

            //iterate bytes
            while(bytes.position < bytes.length) {
                var lead:uint = bytes.readUnsignedByte();

                //how many bytes is this character using? (decided by the leading "1" bits)
                var n_bytes:int;
                if((lead & 0x80) == 0) {
                    n_bytes = 1; //0xxxxxxx
                } else if((lead & 0xC0) == 0x80) {
                    continue; //10xxxxxx: continuation byte without a lead byte, skip it
                } else if((lead & 0xE0) == 0xC0) {
                    n_bytes = 2; //110xxxxx
                } else if((lead & 0xF0) == 0xE0) {
                    n_bytes = 3; //1110xxxx
                } else {
                    n_bytes = 4; //1111xxxx (at most 4 bytes are ever consumed)
                }

                //step back onto the lead byte and read the whole character
                bytes.position--;
                try {
                    chars.push(bytes.readUTFBytes(n_bytes));
                } catch(ex:EOFError) {
                    //truncated sequence at the end of the buffer
                    break;
                }
            }
            return chars;
        }

        /**
         * Concatenates an array of single (multibyte) characters to a string.
         *
         * @param chars The vector of strings to be concatenated. Each element must be a single
         * character. A null vector yields ""; null elements are skipped.
         *
         * @return The concatenated string.
         */
        public static function charArrayToStr(chars:Vector.<String>):String
        {
            if(chars == null) {
                return "";
            }

            //fill byte array
            bytes.clear();
            for(var i:int = 0; i < chars.length; i++) {
                if(chars[i] != null) {
                    bytes.writeUTFBytes(chars[i]);
                }
            }

            //get string
            bytes.position = 0;
            return bytes.readUTFBytes(bytes.length);
        }

        /**
         * Returns the unicode code point representation of a single (multibyte) character.
         *
         * @param char The character which will be represented in unicode.
         *
         * @return A 32-bit unsigned integer where 1 - 3 bytes will be filled, depending on the
         * number of bytes used by the unicode code point. Or 0 when the argument is null, empty,
         * or its UTF-8 encoding is not exactly one character of 1 - 4 bytes. (U+0000 itself also
         * maps to 0, so 0 is ambiguous for that one input.)
         */
        public static function charToUnicode(char:String):uint
        {
            if(char == null) {
                return 0;
            }

            //init byte array
            bytes.clear();
            bytes.writeUTFBytes(char);
            var len:uint = bytes.length;

            //nothing to read (empty string) or too long to be one character
            if(len == 0 || len > 4) {
                return 0;
            }
            bytes.position = 0;

            //the lead byte must announce exactly as many bytes as were encoded
            var b:uint = bytes.readUnsignedByte();
            var code:uint;
            if(len == 1 && (b & 0x80) == 0) {
                //1 byte (0xxxxxxx)
                //return without changes
                return b;
            } else if(len == 2 && (b & 0xE0) == 0xC0) {
                //2 bytes (110xxxxx 10xxxxxx)
                code = b & 0x1F;
            } else if(len == 3 && (b & 0xF0) == 0xE0) {
                //3 bytes (1110xxxx 10xxxxxx 10xxxxxx)
                code = b & 0xF;
            } else if(len == 4 && (b & 0xF8) == 0xF0) {
                //4 bytes (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
                code = b & 0x7;
            } else {
                //invalid
                return 0;
            }

            //add following bytes, 6 payload bits each
            while(bytes.position < len) {
                b = bytes.readUnsignedByte();
                if((b & 0xC0) != 0x80) {
                    //not a continuation byte
                    return 0;
                }
                code = (code << 6) | (b & 0x3F);
            }
            return code;
        }

        /**
         * Returns the binary UTF-8 representation of a single (multibyte) character.
         *
         * The limit is checked on the encoded UTF-8 byte count, not on String.length (which
         * counts UTF-16 code units), so the bytes can never overflow the 32-bit result.
         *
         * @param char The character which will be represented in binary.
         *
         * @return A 32-bit unsigned integer where 1 - 4 bytes will be filled, depending on the
         * number of bytes used by the character. Or 0 when the argument is null, empty, or its
         * UTF-8 encoding has more than 4 bytes.
         */
        public static function charToUtf8(char:String):uint
        {
            if(char == null) {
                return 0;
            }

            //init byte array
            bytes.clear();
            bytes.writeUTFBytes(char);

            //length check (in bytes)
            if(bytes.length > 4) {
                return 0;
            }
            bytes.position = 0;

            //read bytes, first byte ends up in the most significant position
            var num:uint = 0;
            while(bytes.position < bytes.length) {
                num <<= 8;
                num |= bytes.readUnsignedByte();
            }
            return num;
        }

        /**
         * Removes Emojis, Dingbats and Miscellaneous Symbols from a string
         *
         * @param str The string from which the Symbols will be removed. null and "" are
         * returned unchanged.
         *
         * @return The cleaned string. When nothing was removed the original string is returned
         * as-is.
         */
        public static function removeSymbolsFromString(str:String):String
        {
            if(str == null || str.length == 0) {
                return str;
            }

            var chars:Vector.<String> = strToCharArray(str);

            //collect the characters to keep in a single pass (no splice inside the loop)
            var kept:Vector.<String> = new Vector.<String>();
            for(var i:int = 0; i < chars.length; i++) {
                if(!isSymbolCodePoint(charToUnicode(chars[i]))) {
                    kept.push(chars[i]);
                }
            }

            //return string
            if(kept.length == chars.length) {
                return str;
            }
            return charArrayToStr(kept);
        }

        /** Private Methods **/

        /**
         * Tells whether a code point falls into one of the ranges that
         * removeSymbolsFromString() strips.
         *
         * NOTE(review): several upper bounds equal "start + 0xFF" (e.g. 0x2190 - 0x228F), so
         * those ranges overlap each other and reach past the Unicode block named in their
         * comment. The values are kept unchanged for backward compatibility - confirm whether
         * the wider coverage is intended.
         *
         * @param code The unicode code point to test.
         *
         * @return true when the code point lies inside one of the listed ranges.
         */
        private static function isSymbolCodePoint(code:uint):Boolean
        {
            return (code >= 0x20D0 && code <= 0x21CF) || //Combining Diacritical Marks for Symbols
                (code >= 0x2190 && code <= 0x228F) || //Arrows
                (code >= 0x2300 && code <= 0x23FF) || //Miscellaneous Technical
                (code >= 0x2460 && code <= 0x255F) || //Enclosed Alphanumerics
                (code >= 0x25A0 && code <= 0x269F) || //Geometric Shapes
                (code >= 0x2600 && code <= 0x26FF) || //Miscellaneous Symbols
                (code >= 0x2700 && code <= 0x27FF) || //Dingbats
                (code >= 0x2900 && code <= 0x29FF) || //Supplemental Arrows-B
                (code >= 0x2B00 && code <= 0x2BFF) || //Miscellaneous Symbols And Arrows
                (code >= 0x3200 && code <= 0x32FF) || //Enclosed CJK Letters and Months
                (code >= 0x1F000 && code <= 0x1F0FF) || //Mahjong Tiles
                (code >= 0x1F0A0 && code <= 0x1F19F) || //Playing Cards
                (code >= 0x1F100 && code <= 0x1F1FF) || //Enclosed Alphanumeric Supplement
                (code >= 0x1F200 && code <= 0x1F2FF) || //Enclosed Ideographic Supplement
                (code >= 0x1F300 && code <= 0x1F5FF) || //Miscellaneous Symbols And Pictographs
                (code >= 0x1F600 && code <= 0x1F64F) || //Emoticons
                (code >= 0x1F680 && code <= 0x1F6FF); //Transport and Map Symbols
        }
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment