Last active
December 31, 2015 20:29
-
-
Save aseques/8040175 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/**
 * Encode a UTF-8 string into the GSM 03.38 default alphabet.
 *
 * UTF-8 is ASCII compatible and GSM 03.38 shares most printable ASCII positions, so only
 * characters whose GSM code differs from their UTF-8 byte sequence are listed in the table.
 * Characters from the extension table (such as €) are emitted as the escape byte \x1B
 * followed by their extension code.
 *
 * Characters that have no entry in the table are left untouched (they are NOT replaced by a
 * question mark), so leftover UTF-8 lead bytes (\xC0-\xFF) and continuation bytes (\x80-\xBF)
 * remain in the result and can be detected afterwards.
 *
 * @param string $string UTF-8 encoded text
 * @return string the input with every mapped character replaced by its GSM 03.38 byte(s)
 */
function utf8_to_gsm0338($string)
{
    $dict = array(
        // \x0? row (\x0A LF and \x0D CR are identical to ASCII and therefore omitted)
        '@' => "\x00", '£' => "\x01", '$' => "\x02", '¥' => "\x03",
        'è' => "\x04", 'é' => "\x05", 'ù' => "\x06", 'ì' => "\x07",
        'ò' => "\x08", 'Ç' => "\x09", 'Ø' => "\x0B", 'ø' => "\x0C",
        'Å' => "\x0E", 'å' => "\x0F",
        // \x1? row (\x1B is the escape to the extension table)
        'Δ' => "\x10", '_' => "\x11", 'Φ' => "\x12", 'Γ' => "\x13",
        'Λ' => "\x14", 'Ω' => "\x15", 'Π' => "\x16", 'Ψ' => "\x17",
        'Σ' => "\x18", 'Θ' => "\x19", 'Ξ' => "\x1A", 'Æ' => "\x1C",
        'æ' => "\x1D", 'ß' => "\x1E", 'É' => "\x1F",
        // \x2? row matches ASCII except \x24, which is the currency sign
        '¤' => "\x24",
        // \x3? row matches ASCII entirely
        // \x4? row matches ASCII except \x40, which is the inverted exclamation mark
        '¡' => "\x40",
        'Ä' => "\x5B", 'Ö' => "\x5C", 'Ñ' => "\x5D", 'Ü' => "\x5E", '§' => "\x5F",
        '¿' => "\x60",
        'ä' => "\x7B", 'ö' => "\x7C", 'ñ' => "\x7D", 'ü' => "\x7E", 'à' => "\x7F",
        // Extension table: escape byte \x1B followed by the extension code
        '^' => "\x1B\x14", '{' => "\x1B\x28", '}' => "\x1B\x29", '\\' => "\x1B\x2F",
        '[' => "\x1B\x3C", '~' => "\x1B\x3D", ']' => "\x1B\x3E", '|' => "\x1B\x40",
        '€' => "\x1B\x65"
    );

    // strtr() with an array never rescans its own output, so the bytes written for one
    // character cannot be re-translated by another entry of the table.
    return strtr($string, $dict);
}
// Prepare the sample message. Swap in 'Hèy語' to exercise the branch where a
// character has no GSM 03.38 mapping.
$message = 'Hèy';
echo "$message\n";
echo "HEX: ".bin2hex($message)."\n";

// Step 1: translate every character that has a GSM 03.38 mapping.
$encodedMessage = utf8_to_gsm0338($message);
echo "$encodedMessage\n";
echo "HEX: ".bin2hex($encodedMessage)."\n";

// Step 2: look for UTF-8 sequences that survived the translation. A surviving lead byte
// of a 2-byte (U+0080-U+07FF), 3-byte (U+0800-U+FFFF) or 4-byte (U+010000-U+10FFFF)
// sequence means the message cannot be sent as plain GSM 03.38.
$unconvertedUtf8 = '/([\\xC0-\\xDF].)|([\\xE0-\\xEF]..)|([\\xF0-\\xFF]...)/m';

// Only a yes/no answer is needed, so preg_match() is enough; comparing against 1 keeps a
// regex failure (false) from being mistaken for a match.
if (preg_match($unconvertedUtf8, $encodedMessage) === 1) {
    echo "Message needs encoding to utf8\n";
} else {
    echo "No need to encode into utf8\n";
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment