Skip to content

Instantly share code, notes, and snippets.

@theraot
Created November 28, 2018 23:16
Show Gist options
  • Save theraot/0d92d4f6c6e29e5cfe5572dbb5cbe9f2 to your computer and use it in GitHub Desktop.
Save theraot/0d92d4f6c6e29e5cfe5572dbb5cbe9f2 to your computer and use it in GitHub Desktop.
<?php
/**
* CC-BY 3.0 Alfonso J. Ramos (theraot)
* UTF8
*/
final class UTF8
{
    //------------------------------------------------------------
    // Private (Class)
    //------------------------------------------------------------

    /**
     * Returns the byte length of a UTF-8 sequence given its first byte.
     *
     * Only the bit pattern of the lead byte is inspected; overlong lead
     * bytes (0xC0, 0xC1) and 0xF5-0xF7 are not rejected here.
     *
     * @param int $ord Value (0-255) of the first byte of the sequence.
     * @return int|false 1 to 4, or false when the byte cannot start a sequence.
     */
    private static function CodePointLength($ord)
    {
        if (($ord >> 7) === 0)
        {
            // 0xxxxxxx
            return 1;
        }
        if (($ord >> 5) === 6)
        {
            // 110xxxxx
            return 2;
        }
        if (($ord >> 4) === 14)
        {
            // 1110xxxx
            return 3;
        }
        if (($ord >> 3) === 30)
        {
            // 11110xxx
            return 4;
        }
        return false;
    }

    /**
     * Translates a character position into a byte index.
     *
     * For a negative $position the string is scanned backwards from its end
     * ($after is ignored) and the byte index of the lead byte of the
     * |$position|-th character from the end is returned.
     *
     * For a non-negative $position the string is scanned forwards starting at
     * byte index $after, and the byte index reached after skipping $position
     * characters is returned. Continuation bytes are not validated here; only
     * lead bytes are checked.
     *
     * @param string $string   The UTF-8 encoded string.
     * @param int    $position Number of characters to skip (negative: from the end).
     * @param int    $after    Byte index where the forward scan starts.
     * @return int|false|null Byte index; null when a negative position reaches
     *                        before the start of the string; false when a forward
     *                        position lies beyond the end of the string or an
     *                        invalid lead byte is found.
     */
    private static function CharacterIndex($string, $position, $after)
    {
        $strlen = strlen($string);
        if ($position < 0)
        {
            for ($index = $strlen - 1; $index >= 0; $index--)
            {
                $ord = ord($string[$index]);
                // Anything that is not a continuation byte (10xxxxxx) starts a character.
                if (($ord >> 6) !== 2)
                {
                    $position++;
                }
                if ($position === 0)
                {
                    return $index;
                }
            }
            return null;
        }
        $count = 0;
        for ($index = $after; $index < $strlen; $count++)
        {
            if ($count === $position)
            {
                return $index;
            }
            $add = self::CodePointLength(ord($string[$index]));
            if ($add === false)
            {
                return false;
            }
            $index += $add;
        }
        if ($count < $position)
        {
            return false;
        }
        // Exactly $position characters were available: the position is the end of the string.
        return $strlen;
    }

    //------------------------------------------------------------
    // Public (Class)
    //------------------------------------------------------------

    /**
     * UTF-8 aware replacement of chr.
     *
     * @param int $codepoint Unicode code point (0 to 0x10FFFF).
     * @return string|null The UTF-8 encoding of the code point, or null when
     *                     the code point is outside the 0..0x10FFFF range.
     */
    public static function Character(/*int*/ $codepoint)
    {
        $codepoint = intval($codepoint);
        if ($codepoint < 0 || $codepoint > 0x10FFFF)
        {
            return null;
        }
        // The upper bounds are inclusive: U+007F, U+07FF and U+FFFF are the
        // last code points that fit in 1, 2 and 3 bytes respectively.
        if ($codepoint <= 0x7F)
        {
            return chr($codepoint);
        }
        if ($codepoint <= 0x7FF)
        {
            return chr(192 | (($codepoint >> 6) & 31))
                .chr(128 | ($codepoint & 63));
        }
        if ($codepoint <= 0xFFFF)
        {
            return chr(224 | (($codepoint >> 12) & 15))
                .chr(128 | (($codepoint >> 6) & 63))
                .chr(128 | ($codepoint & 63));
        }
        return chr(240 | (($codepoint >> 18) & 7))
            .chr(128 | (($codepoint >> 12) & 63))
            .chr(128 | (($codepoint >> 6) & 63))
            .chr(128 | ($codepoint & 63));
    }

    /**
     * Returns the single character that starts at the given byte index.
     *
     * @param string $string The UTF-8 encoded string.
     * @param int    $index  Byte index of the first byte of the character.
     * @return string The character, or '' when the index is at/after the end
     *                of the string or does not point to a valid lead byte.
     */
    public static function CharacterAt($string, $index)
    {
        $nextIndex = self::CharacterIndex($string, 1, $index);
        if ($nextIndex === false)
        {
            return '';
        }
        return substr($string, $index, $nextIndex - $index);
    }

    /**
     * Returns up to $length characters starting at the given byte index.
     *
     * @param string $string The UTF-8 encoded string.
     * @param int    $index  Byte index of the first byte to return.
     * @param int    $length Number of characters wanted.
     * @return string The requested characters; when fewer than $length
     *                characters remain (or an invalid lead byte is found)
     *                everything from $index to the end is returned.
     */
    public static function CharactersAt($string, $index, $length)
    {
        $nextIndex = self::CharacterIndex($string, $length, $index);
        if ($nextIndex === false)
        {
            return substr($string, $index);
        }
        return substr($string, $index, $nextIndex - $index);
    }

    /**
     * UTF-8 aware replacement of ord.
     *
     * Decodes the first character of $character. Continuation bytes are not
     * validated, only their presence is checked.
     *
     * @param string $character String whose first character is decoded.
     * @return int|false The code point, or false when the string is empty,
     *                   starts with an invalid lead byte, or is shorter than
     *                   the sequence announced by its lead byte.
     */
    public static function CodePoint(/*string*/ $character)
    {
        $available = strlen($character);
        if ($available === 0)
        {
            return false;
        }
        $ord0 = ord($character[0]);
        $length = self::CodePointLength($ord0);
        // Refuse truncated sequences instead of reading past the end of the string.
        if ($length === false || $available < $length)
        {
            return false;
        }
        switch ($length)
        {
            case 1:
                return $ord0;
            case 2:
                return ($ord0 - 192) * 64
                    + (ord($character[1]) - 128);
            case 3:
                return ($ord0 - 224) * 4096
                    + (ord($character[1]) - 128) * 64
                    + (ord($character[2]) - 128);
            default:
                return ($ord0 - 240) * 262144
                    + (ord($character[1]) - 128) * 4096
                    + (ord($character[2]) - 128) * 64
                    + (ord($character[3]) - 128);
        }
    }

    /**
     * Lazily yields the characters of a UTF-8 string, one per iteration.
     *
     * Iteration stops silently at the first byte that is not a valid lead byte.
     *
     * @param string $string The UTF-8 encoded string.
     * @return Generator Yields each character as a string.
     */
    public static function Enumerate($string)
    {
        $strlen = strlen($string);
        for ($index = 0; $index < $strlen; )
        {
            $chr = self::CharacterAt($string, $index);
            if ($chr === '')
            {
                return;
            }
            $index += strlen($chr);
            yield $chr;
        }
    }

    /**
     * Checks that every byte of the string is a 7-bit ASCII character.
     *
     * NOTE(review): despite the name, success is reported as the number of
     * characters, not as true. An empty string therefore yields 0, which is
     * falsy; compare the result against false with !== to be safe. The return
     * contract is kept as-is so existing callers are not affected.
     *
     * @param string $string The string to check.
     * @return int|false Number of characters when all are ASCII, false otherwise.
     */
    public static function IsASCII($string)
    {
        $strlen = strlen($string);
        $count = 0;
        for ($index = 0; $index < $strlen; $count++)
        {
            // Any lead byte announcing more than one byte (or an invalid one) is not ASCII.
            if (self::CodePointLength(ord($string[$index])) !== 1)
            {
                return false;
            }
            $index++;
        }
        return $count;
    }

    /**
     * Tells whether the string is structurally valid UTF-8, as judged by Length().
     *
     * @param string $string The string to check.
     * @return bool
     */
    public static function IsUTF8($string)
    {
        return self::Length($string) !== false;
    }

    /**
     * UTF-8 aware replacement of strlen.
     *
     * Validates lead bytes and checks that each one is followed by the right
     * number of continuation bytes (0x80-0xBF).
     *
     * @param string $string The UTF-8 encoded string.
     * @return int|false Number of characters, or false when an invalid lead
     *                   byte, an invalid continuation byte or a truncated
     *                   sequence is found.
     */
    public static function Length($string)
    {
        $strlen = strlen($string);
        $count = 0;
        for ($index = 0; $index < $strlen; $count++)
        {
            $add = self::CodePointLength(ord($string[$index]));
            if ($add === false)
            {
                return false;
            }
            // A sequence that runs past the end of the string is truncated.
            if ($index + $add > $strlen)
            {
                return false;
            }
            for ($check = $index + 1; $check < $index + $add; $check++)
            {
                $ord = ord($string[$check]);
                if ($ord < 0x80 || $ord > 0xbf)
                {
                    return false;
                }
            }
            $index += $add;
        }
        return $count;
    }

    /**
     * UTF-8 aware replacement of str_split.
     *
     * @param string $string The UTF-8 encoded string.
     * @param int    $length Number of characters per segment; must be an
     *                       integer greater than zero.
     * @return array|false List of segments (the last one may be shorter), or
     *                     false after raising an E_USER_WARNING when $length
     *                     is not a positive integer.
     */
    public static function Split(/*string*/ $string, /*int*/ $length = 1)
    {
        if (intval($length) !== $length || $length < 1)
        {
            trigger_error('The length of each segment must be greater than zero', E_USER_WARNING);
            return false;
        }
        $strlen = strlen($string);
        $result = [];
        $index = 0;
        while ($index < $strlen)
        {
            // CharactersAt never returns an empty block while $index < $strlen,
            // so the loop always advances.
            $block = self::CharactersAt($string, $index, $length);
            $result[] = $block;
            $index += strlen($block);
        }
        return $result;
    }

    /**
     * UTF-8 aware replacement of substr.
     *
     * @param string   $string The UTF-8 encoded string.
     * @param int      $start  Character offset; negative values count from the
     *                         end, and a negative offset before the start of
     *                         the string is clamped to the start.
     * @param int|null $length Number of characters; negative values stop that
     *                         many characters before the end; null means "to
     *                         the end of the string".
     * @return string|false The selected characters, or false when $start is
     *                      beyond the end of the string, an invalid lead byte
     *                      is found before $start, or a negative $length
     *                      cannot be satisfied.
     */
    public static function Substr($string, $start, $length = null)
    {
        $startIndex = self::CharacterIndex($string, $start, 0);
        if ($startIndex === null)
        {
            // Negative start reaching before the beginning: begin at the first byte.
            $startIndex = 0;
        }
        if ($startIndex === false)
        {
            return false;
        }
        if ($length === null)
        {
            return substr($string, $startIndex);
        }
        $endIndex = self::CharacterIndex($string, $length, $startIndex);
        if ($endIndex === null)
        {
            return false;
        }
        if ($endIndex === false)
        {
            // Fewer than $length characters remain: take everything that is left.
            return substr($string, $startIndex);
        }
        if ($endIndex < $startIndex)
        {
            return $start < 0 ? '' : false;
        }
        return substr($string, $startIndex, $endIndex - $startIndex);
    }

    //------------------------------------------------------------
    // Public (Constructor)
    //------------------------------------------------------------

    /**
     * Creating instances of this class is not allowed.
     *
     * Instantiation is not prevented; it only raises an E_USER_NOTICE.
     */
    public function __construct()
    {
        trigger_error('Creating instances of '.__CLASS__.' is forbidden');
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment