aseques/gist:8040175

## gistfile1.txt
<?php

/**
 * Encode a UTF-8 string into the GSM 03.38 default alphabet.
 *
 * Only characters whose GSM 03.38 code differs from their UTF-8/ASCII byte
 * value are listed in the table; every other byte is copied through untouched.
 * Characters from the GSM extension table (such as € or [) are emitted as the
 * escape byte \x1B followed by their extension code.
 *
 * UTF-8 characters that have no entry in the table are NOT replaced: their
 * multi-byte sequences (lead byte \xC0-\xFF plus continuation bytes \x80-\xBF)
 * stay in the output unchanged, so a caller can scan the result for them to
 * detect text that cannot be represented in GSM 03.38.
 *
 * NOTE(review): ASCII characters that are absent from GSM 03.38 (e.g. the
 * backtick, \x60) are also passed through unchanged - confirm whether callers
 * need them filtered.
 *
 * @param string $string UTF-8 encoded input
 * @return string GSM 03.38 bytes, with any untranslatable UTF-8 sequences left in place
 */
function utf8_to_gsm0338($string)
{
        $dict = array(
                // \x0? and \x1? rows of the default alphabet (\x0A, \x0D and \x1B are not remapped)
                '@' => "\x00", '£' => "\x01", '$' => "\x02", '¥' => "\x03", 'è' => "\x04", 'é' => "\x05", 'ù' => "\x06", 'ì' => "\x07", 'ò' => "\x08", 'Ç' => "\x09", 'Ø' => "\x0B", 'ø' => "\x0C", 'Å' => "\x0E", 'å' => "\x0F",
                'Δ' => "\x10", '_' => "\x11", 'Φ' => "\x12", 'Γ' => "\x13", 'Λ' => "\x14", 'Ω' => "\x15", 'Π' => "\x16", 'Ψ' => "\x17", 'Σ' => "\x18", 'Θ' => "\x19", 'Ξ' => "\x1A", 'Æ' => "\x1C", 'æ' => "\x1D", 'ß' => "\x1E", 'É' => "\x1F",
                // \x2? row: identical to ASCII except \x24, which is the currency sign
                '¤' => "\x24",
                // all \x3? removed (identical to ASCII)
                // \x4? row: identical to ASCII except \x40, which is the inverted exclamation mark
                '¡' => "\x40",
                'Ä' => "\x5B", 'Ö' => "\x5C", 'Ñ' => "\x5D", 'Ü' => "\x5E", '§' => "\x5F",
                '¿' => "\x60",
                'ä' => "\x7B", 'ö' => "\x7C", 'ñ' => "\x7D", 'ü' => "\x7E", 'à' => "\x7F",
                // extension table: escape byte \x1B followed by the extension code
                '^' => "\x1B\x14", '{' => "\x1B\x28", '}' => "\x1B\x29", '\\' => "\x1B\x2F", '[' => "\x1B\x3C", '~' => "\x1B\x3D", ']' => "\x1B\x3E", '|' => "\x1B\x40", '€' => "\x1B\x65"
        );

        // strtr() with an array translates in a single pass and never re-scans
        // its own output, so e.g. the \x40 produced for '|' is not re-mapped by the '@' entry.
        return strtr($string, $dict);
}


// Demo: push a sample message through the converter and show the raw bytes
//$message = 'Hèy語';
$message = 'Hèy';
echo $message, "\n";
echo 'HEX: ', bin2hex($message), "\n";

// Step 1: translate the message into the GSM 03.38 alphabet
$encodedMessage = utf8_to_gsm0338($message);
echo $encodedMessage, "\n";
echo 'HEX: ', bin2hex($encodedMessage), "\n";

// Step 2: look for UTF-8 sequences that survived the translation; if any are
// left, the text does not fit GSM 03.38 and would have to go out as a Unicode SMS.
// The pattern matches a lead byte followed by the number of bytes it announces:
// 2-byte (U+0080-U+07FF), 3-byte (U+0800-U+FFFF) and 4-byte (U+010000-U+10FFFF) sequences.
if (preg_match_all('/([\\xC0-\\xDF].)|([\\xE0-\\xEF]..)|([\\xF0-\\xFF]...)/m', $encodedMessage, $matches)) {
	echo "Message needs enconding to utf8";
} else {
	echo "No need to encode into utf8\n";
}
	<?php

	/**
	 * Encode a UTF-8 string into the GSM 03.38 default alphabet.
	 *
	 * Only characters whose GSM 03.38 code differs from their UTF-8/ASCII byte
	 * value are listed in the table; every other byte is copied through untouched.
	 * Characters from the GSM extension table (such as € or [) are emitted as the
	 * escape byte \x1B followed by their extension code.
	 *
	 * UTF-8 characters that have no entry in the table are NOT replaced: their
	 * multi-byte sequences (lead byte \xC0-\xFF plus continuation bytes \x80-\xBF)
	 * stay in the output unchanged, so a caller can scan the result for them to
	 * detect text that cannot be represented in GSM 03.38.
	 *
	 * NOTE(review): ASCII characters that are absent from GSM 03.38 (e.g. the
	 * backtick, \x60) are also passed through unchanged - confirm whether callers
	 * need them filtered.
	 *
	 * @param string $string UTF-8 encoded input
	 * @return string GSM 03.38 bytes, with any untranslatable UTF-8 sequences left in place
	 */
	function utf8_to_gsm0338($string)
	{
		$dict = array(
			// \x0? and \x1? rows of the default alphabet (\x0A, \x0D and \x1B are not remapped)
			'@' => "\x00", '£' => "\x01", '$' => "\x02", '¥' => "\x03", 'è' => "\x04", 'é' => "\x05", 'ù' => "\x06", 'ì' => "\x07", 'ò' => "\x08", 'Ç' => "\x09", 'Ø' => "\x0B", 'ø' => "\x0C", 'Å' => "\x0E", 'å' => "\x0F",
			'Δ' => "\x10", '_' => "\x11", 'Φ' => "\x12", 'Γ' => "\x13", 'Λ' => "\x14", 'Ω' => "\x15", 'Π' => "\x16", 'Ψ' => "\x17", 'Σ' => "\x18", 'Θ' => "\x19", 'Ξ' => "\x1A", 'Æ' => "\x1C", 'æ' => "\x1D", 'ß' => "\x1E", 'É' => "\x1F",
			// \x2? row: identical to ASCII except \x24, which is the currency sign
			'¤' => "\x24",
			// all \x3? removed (identical to ASCII)
			// \x4? row: identical to ASCII except \x40, which is the inverted exclamation mark
			'¡' => "\x40",
			'Ä' => "\x5B", 'Ö' => "\x5C", 'Ñ' => "\x5D", 'Ü' => "\x5E", '§' => "\x5F",
			'¿' => "\x60",
			'ä' => "\x7B", 'ö' => "\x7C", 'ñ' => "\x7D", 'ü' => "\x7E", 'à' => "\x7F",
			// extension table: escape byte \x1B followed by the extension code.
			// The pipe key must be a bare '|': in a single-quoted string '\|' would be
			// the two-character sequence backslash + pipe and never match a lone pipe.
			'^' => "\x1B\x14", '{' => "\x1B\x28", '}' => "\x1B\x29", '\\' => "\x1B\x2F", '[' => "\x1B\x3C", '~' => "\x1B\x3D", ']' => "\x1B\x3E", '|' => "\x1B\x40", '€' => "\x1B\x65"
		);

		// strtr() with an array translates in a single pass and never re-scans
		// its own output, so e.g. the \x40 produced for '|' is not re-mapped by the '@' entry.
		return strtr($string, $dict);
	}


	// Demo: push a sample message through the converter and show the raw bytes
	//$message = 'Hèy語';
	$message = 'Hèy';
	echo "$message\n";
	echo "HEX: ".bin2hex($message)."\n";

	// Step 1: translate the message into the GSM 03.38 alphabet
	$encodedMessage = utf8_to_gsm0338($message);
	echo "$encodedMessage\n";
	echo "HEX: ".bin2hex($encodedMessage)."\n";

	// Step 2: look for UTF-8 sequences that survived the translation; if any are
	// left, the text does not fit GSM 03.38 and would have to go out as a Unicode SMS.
	// The three alternatives match a lead byte followed by the number of bytes it
	// announces: 2-byte (U+0080-U+07FF), 3-byte (U+0800-U+FFFF) and 4-byte
	// (U+010000-U+10FFFF) sequences. They must be separated by a bare '|'
	// (alternation); an escaped '\|' would only match a literal pipe character.
	if (!preg_match_all('/([\\xC0-\\xDF].)|([\\xE0-\\xEF]..)|([\\xF0-\\xFF]...)/m',$encodedMessage,$matches)) {
		echo "No need to encode into utf8\n";
	} else {
		echo "Message needs enconding to utf8";
	}