Skip to content

Instantly share code, notes, and snippets.

@MorbZ
Created June 7, 2013 00:57
Show Gist options
  • Save MorbZ/5726339 to your computer and use it in GitHub Desktop.
Save MorbZ/5726339 to your computer and use it in GitHub Desktop.
This AS3 class handles strings with UTF-8 multibyte characters (up to 4 bytes/character). Currently it is able to split a String into its characters, concatenate chars to a string, give the binary and the unicode representation of a character and remove symbols/emojis from a string.
package utils
{
import flash.errors.EOFError;
import flash.utils.ByteArray;
/**
* This AS3 class handles strings with UTF-8 multibyte characters (up to 4 bytes/character).
* Currently it is able to split a String into its characters, concatenate chars to a string,
* give the binary and the unicode representation of a character and remove symbols/emojis from
* a string.
*
* @author Merten Peetz
*
* @license CC-BY
*/
public class MultiByteStrUtil
{
	/**
	 * Scratch buffer shared by all static methods of this class. Every public method that
	 * uses it clears it first, so its contents are overwritten by each call.
	 */
	private static var bytes:ByteArray = new ByteArray;

	/** Public Methods **/

	/**
	 * Splits a string into its single characters and returns them as a vector of strings.
	 * The string is encoded as UTF-8 and cut at every lead byte, so a character that needs
	 * up to 4 bytes ends up in exactly one element.
	 *
	 * @param str The string which will be split. A null or empty string yields an empty
	 * vector.
	 *
	 * @return A vector of strings where each element is a single (multibyte) character.
	 */
	public static function strToCharArray(str:String):Vector.<String>
	{
		var chars:Vector.<String> = new Vector.<String>;
		if(str == null || str.length == 0) {
			return chars;
		}

		//init byte-array
		bytes.clear();
		bytes.writeUTFBytes(str);
		bytes.position = 0;

		//iterate bytes
		while(bytes.position < bytes.length) {
			var n_ones:int = countLeadingOnes(bytes.readUnsignedByte());

			//10xxxxxx is a continuation byte without a lead byte: skip it
			if(n_ones == 1) {
				continue;
			}

			//0xxxxxxx uses 1 byte, otherwise the number of leading "1"s is the byte count
			var n_bytes:int = (n_ones == 0) ? 1 : n_ones;

			//step back onto the lead byte and read the whole character
			bytes.position--;
			try {
				chars.push(bytes.readUTFBytes(n_bytes));
			} catch(ex:EOFError) {
				//the last sequence is shorter than its lead byte announces
				break;
			}
		}
		return chars;
	}

	/**
	 * Concatenates an array of single (multibyte) characters to a string.
	 *
	 * @param chars The vector of strings to be concatenated. Each element must be a single
	 * character.
	 *
	 * @return The concatenated string.
	 */
	public static function charArrayToStr(chars:Vector.<String>):String {
		//fill byte array
		bytes.clear();
		for(var i:int = 0; i < chars.length; i++) {
			bytes.writeUTFBytes(chars[i]);
		}

		//get string
		bytes.position = 0;
		return bytes.readUTFBytes(bytes.length);
	}

	/**
	 * Returns the unicode code point representation of a single (multibyte) character.
	 *
	 * @param char The character which will be represented in unicode.
	 *
	 * @return A 32-bit unsigned integer where 1 - 3 bytes will be filled, depending on the
	 * number of bytes used by the unicode code point. Or 0 when the argument is null, empty,
	 * or its UTF-8 length does not match the length announced by its first byte (for example
	 * because it holds more than one character).
	 */
	public static function charToUnicode(char:String):uint {
		//nothing to decode; reading a byte from an empty buffer would throw an EOFError
		if(char == null || char.length == 0) {
			return 0;
		}

		//init byte array
		bytes.clear();
		bytes.writeUTFBytes(char);
		bytes.position = 0;
		if(bytes.length == 0) {
			return 0;
		}

		//the lead byte tells how many bytes there are and carries the highest payload bits
		var b:uint = bytes.readUnsignedByte();
		var code:uint = 0;
		if(bytes.length == 1 && (b & 0x80) == 0) {
			//1 byte (0xxxxxxx)
			//return without changes
			return b;
		} else if(bytes.length == 2 && (b & 0xE0) == 0xC0) {
			//2 bytes (110xxxxx 10xxxxxx)
			code = b & 0x1F;
		} else if(bytes.length == 3 && (b & 0xF0) == 0xE0) {
			//3 bytes (1110xxxx 10xxxxxx 10xxxxxx)
			code = b & 0xF;
		} else if(bytes.length == 4 && (b & 0xF8) == 0xF0) {
			//4 bytes (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
			code = b & 0x7;
		} else {
			//invalid
			return 0;
		}

		//add following bytes, each contributes its lower 6 bits
		while(bytes.position < bytes.length) {
			b = bytes.readUnsignedByte();
			code <<= 6;
			code |= b & 0x3F;
		}
		return code;
	}

	/**
	 * Returns the binary UTF-8 representation of a single (multibyte) character.
	 *
	 * @param char The character which will be represented in binary.
	 *
	 * @return A 32-bit unsigned integer where 1 - 4 bytes will be filled, depending on the
	 * number of bytes used by the character. Or 0 when the argument is null or its UTF-8
	 * encoding has more than 4 bytes.
	 */
	public static function charToUtf8(char:String):uint {
		if(char == null) {
			return 0;
		}

		//init byte array
		bytes.clear();
		bytes.writeUTFBytes(char);
		bytes.position = 0;

		//length check on the encoded bytes: String.length counts UTF-16 code units, so a
		//short string can still need more than the 4 bytes that fit into the result
		if(bytes.length > 4) {
			return 0;
		}

		//read bytes, first byte ends up in the highest used position
		var num:uint = 0;
		while(bytes.position < bytes.length) {
			num <<= 8;
			num |= bytes.readUnsignedByte();
		}
		return num;
	}

	/**
	 * Removes Emojis, Dingbats and Miscellaneous Symbols from a string
	 *
	 * @param str The string from which the Symbols will be removed
	 *
	 * @return The cleaned string. When nothing had to be removed the passed string itself is
	 * returned.
	 */
	public static function removeSymbolsFromString(str:String):String {
		var chars:Vector.<String> = strToCharArray(str);

		//collect the characters to keep instead of splicing the source vector in place,
		//which would shift all following elements on every removal
		var kept:Vector.<String> = new Vector.<String>;
		for(var i:int = 0; i < chars.length; i++) {
			if(!isSymbolCodePoint(charToUnicode(chars[i]))) {
				kept.push(chars[i]);
			}
		}

		//return string
		if(kept.length == chars.length) {
			return str;
		}
		return charArrayToStr(kept);
	}

	/** Private Methods **/

	/**
	 * Counts the "1" bits at the top of a byte until the first "0", stopping at 4.
	 *
	 * @param byte The byte to inspect.
	 *
	 * @return A value from 0 to 4.
	 */
	private static function countLeadingOnes(byte:uint):int {
		var count:int = 0;
		var mask:uint = 0x80;
		while(count < 4 && (byte & mask) != 0) {
			count++;
			mask >>= 1;
		}
		return count;
	}

	/**
	 * Tells whether a code point lies in one of the ranges removed by
	 * removeSymbolsFromString().
	 *
	 * NOTE: several ranges are wider than the Unicode block named next to them (for example
	 * 0x2190 - 0x228F also covers part of Mathematical Operators, 0x2200 - 0x22FF). They are
	 * kept exactly as they were so existing callers see the same result.
	 *
	 * @param code The unicode code point to test.
	 *
	 * @return true when the code point is inside one of the ranges.
	 */
	private static function isSymbolCodePoint(code:uint):Boolean {
		return (
			(code >= 0x20D0 && code <= 0x21CF) || //Combining Diacritical Marks for Symbols
			(code >= 0x2190 && code <= 0x228F) || //Arrows
			(code >= 0x2300 && code <= 0x23FF) || //Miscellaneous Technical
			(code >= 0x2460 && code <= 0x255F) || //Enclosed Alphanumerics
			(code >= 0x25A0 && code <= 0x269F) || //Geometric Shapes
			(code >= 0x2600 && code <= 0x26FF) || //Miscellaneous Symbols
			(code >= 0x2700 && code <= 0x27FF) || //Dingbats
			(code >= 0x2900 && code <= 0x29FF) || //Supplemental Arrows-B
			(code >= 0x2B00 && code <= 0x2BFF) || //Miscellaneous Symbols And Arrows
			(code >= 0x3200 && code <= 0x32FF) || //Enclosed CJK Letters and Months
			(code >= 0x1F000 && code <= 0x1F0FF) || //Mahjong Tiles
			(code >= 0x1F0A0 && code <= 0x1F19F) || //Playing Cards
			(code >= 0x1F100 && code <= 0x1F1FF) || //Enclosed Alphanumeric Supplement
			(code >= 0x1F200 && code <= 0x1F2FF) || //Enclosed Ideographic Supplement
			(code >= 0x1F300 && code <= 0x1F5FF) || //Miscellaneous Symbols And Pictographs
			(code >= 0x1F600 && code <= 0x1F64F) || //Emoticons
			(code >= 0x1F680 && code <= 0x1F6FF) //Transport and Map Symbols
		);
	}
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment