Skip to content

Instantly share code, notes, and snippets.

@zanbaldwin
Last active February 16, 2022 13:03
Show Gist options
  • Save zanbaldwin/283156910871bd829f031ac787335ac2 to your computer and use it in GitHub Desktop.
Save zanbaldwin/283156910871bd829f031ac787335ac2 to your computer and use it in GitHub Desktop.
I got bored and played with the UTF-8 standard.
<?php declare(strict_types=1);
/**
 * A single Unicode code point, stored as its UTF-8 encoded byte sequence.
 *
 * Instances are created through the named factories {@see fromString()} and
 * {@see fromInt()}, which accept code points in the range U+0000..U+10FFFF.
 *
 * NOTE(review): surrogate code points (U+D800..U+DFFF) pass the range checks
 * and are encoded as three bytes even though strict UTF-8 forbids them;
 * confirm whether that leniency is intended before relying on the output.
 */
class UnicodeCharacter
{
    /** Raw UTF-8 byte sequence that encodes the code point. */
    private string $binary;

    /**********************************\
    | CONSTRUCTORS AND FACTORY METHODS |
    \**********************************/

    /**
     * @param string $codepoint Code point as hexadecimal digits (no "U+" prefix).
     *
     * @throws \InvalidArgumentException When $codepoint is not a hexadecimal string.
     * @throws \RuntimeException         When $codepoint needs more than 21 bits.
     */
    protected function __construct(string $codepoint)
    {
        $this->binary = $this->convert($codepoint);
    }

    /**
     * Creates a character from a hexadecimal code point such as "20AC" or "1f600".
     *
     * @throws \OutOfRangeException When $hex is not 1-6 hex digits within 0..10FFFF.
     */
    public static function fromString(string $hex): static
    {
        // "i": hexadecimal is case-insensitive (dechex() itself emits lowercase).
        // "D": make "$" match only at the very end, so a trailing newline is rejected
        //      here instead of surfacing later as a different exception type.
        return preg_match('/^(?:[A-F\d]{1,5}|10[A-F\d]{4})$/iD', $hex) === 1
            ? new static($hex)
            : throw new \OutOfRangeException('hex string out of range');
    }

    /**
     * Creates a character from a numeric code point.
     *
     * @throws \OutOfRangeException When $num lies outside 0..0x10FFFF.
     */
    public static function fromInt(int $num): static
    {
        return $num >= 0 && $num <= 0x10FFFF
            ? new static(dechex($num))
            : throw new \OutOfRangeException('number out of range');
    }

    /*************************\
    | WHERE THE MAGIC HAPPENS |
    \*************************/

    /**
     * Encodes a hexadecimal code point as a UTF-8 byte sequence.
     *
     * @throws \InvalidArgumentException When $codepointHex is not hexadecimal.
     * @throws \RuntimeException         When the code point needs more than 21 bits.
     */
    private function convert(string $codepointHex): string
    {
        // Left-pad to an even number of hex digits so the value maps onto whole bytes.
        $evenLength = strlen($codepointHex) + (strlen($codepointHex) % 2);
        $codepointBits = ltrim(self::binaryToHumanReadable(
            self::binaryFromHex(str_pad($codepointHex, $evenLength, '0', STR_PAD_LEFT))
        ), '0');
        $bitCount = strlen($codepointBits);

        if ($bitCount <= 7) {
            // US-ASCII: a single byte whose high bit is 0.
            return self::binaryFromHumanReadable(str_pad($codepointBits, 8, '0', STR_PAD_LEFT));
        }

        // Multibyte UTF-8: [payload bits in total, payload bits carried by the lead byte].
        [$payloadBits, $leadBits] = match (true) {
            $bitCount <= 11 => [11, 5],
            $bitCount <= 16 => [16, 4],
            $bitCount <= 21 => [21, 3],
            default => throw new \RuntimeException('too many bits to encode'),
        };
        $payload = str_pad($codepointBits, $payloadBits, '0', STR_PAD_LEFT);

        // Everything after the lead byte's share travels in 6-bit continuation chunks.
        $continuations = str_split(substr($payload, $leadBits), 6);

        // Lead byte: one "1" per byte in the sequence, a "0" separator, then its payload share.
        $unicodeBits = str_repeat('1', count($continuations) + 1) . '0' . substr($payload, 0, $leadBits)
            . implode('', array_map(static fn (string $chunk): string => '10' . $chunk, $continuations));

        return self::binaryFromHumanReadable($unicodeBits);
    }

    /****************\
    | OUTPUT METHODS |
    \****************/

    /** Returns the UTF-8 bytes as lowercase hexadecimal, e.g. "e282ac". */
    public function toHex(): string
    {
        return self::binaryToHex($this->binary);
    }

    /** Returns the UTF-8 bytes as a string of "0"/"1" characters, 8 per byte. */
    public function toHumanReadable(): string
    {
        return self::binaryToHumanReadable($this->binary);
    }

    /** Returns the raw UTF-8 byte sequence. */
    public function __toString(): string
    {
        return $this->binary;
    }

    /*********************************************\
    | HELPER METHODS FOR DEALING WITH BINARY DATA |
    \*********************************************/

    // The byte-wise str* functions are used directly: every string handled here
    // consists of ASCII digits or raw bytes, and mbstring function overloading
    // no longer exists on the PHP versions this syntax requires.

    /**
     * Converts an even-length hexadecimal string into raw bytes.
     *
     * @throws \InvalidArgumentException
     */
    private static function binaryFromHex(string $hex): string
    {
        if (!ctype_xdigit($hex) || strlen($hex) % 2 !== 0) {
            throw new \InvalidArgumentException('Valid hexadecimal string not provided.');
        }

        return pack('H*', strtolower($hex));
    }

    /** Converts raw bytes into lowercase hexadecimal. */
    private static function binaryToHex(string $binary): string
    {
        return bin2hex($binary);
    }

    /** Converts raw bytes into "0"/"1" text, 8 characters per byte. */
    private static function binaryToHumanReadable(string $binary): string
    {
        return implode('', array_map(
            static fn (string $byteHex): string => str_pad(decbin((int) hexdec($byteHex)), 8, '0', STR_PAD_LEFT),
            str_split(self::binaryToHex($binary), 2)
        ));
    }

    /** Converts "0"/"1" text (length a multiple of 8) into raw bytes. */
    private static function binaryFromHumanReadable(string $bits): string
    {
        return implode('', array_map(
            static fn (string $byteBits): string => chr((int) bindec($byteBits)),
            str_split($bits, 8)
        ));
    }
}
<?php declare(strict_types=1);
/**
 * An ordered sequence of UnicodeCharacter instances whose output methods
 * concatenate the per-character results in sequence order.
 */
class UnicodeGraphemeCluster
{
    /** Characters making up the cluster, in the order they were supplied. */
    private array $characters;

    protected function __construct(array $characters)
    {
        $this->characters = $characters;
    }

    /**
     * Builds a cluster from numeric code points.
     *
     * Each element is handed to UnicodeCharacter::fromInt().
     */
    public static function fromInts(array $nums): static
    {
        $toCharacter = static function (int $num): UnicodeCharacter {
            return UnicodeCharacter::fromInt($num);
        };

        return new static(array_map($toCharacter, $nums));
    }

    /**
     * Builds a cluster from one string of hexadecimal code points separated
     * by whitespace and/or "+" characters; empty pieces are discarded.
     *
     * Each piece is handed to UnicodeCharacter::fromString().
     */
    public static function fromString(string $codepoints): static
    {
        $pieces = preg_split('/[\s\+]+/', $codepoints, -1, \PREG_SPLIT_NO_EMPTY);
        $toCharacter = static function (string $codepoint): UnicodeCharacter {
            return UnicodeCharacter::fromString($codepoint);
        };

        return new static(array_map($toCharacter, $pieces));
    }

    /**
     * Builds a cluster from a list of hexadecimal code points.
     *
     * @param string[] $codepoints
     */
    public static function fromStrings(array $codepoints): static
    {
        $toCharacter = static function (string $codepoint): UnicodeCharacter {
            return UnicodeCharacter::fromString($codepoint);
        };

        return new static(array_map($toCharacter, $codepoints));
    }

    /** Concatenates toHex() of every character. */
    public function toHex(): string
    {
        $parts = array_map(
            static function (UnicodeCharacter $char): string {
                return $char->toHex();
            },
            $this->characters
        );

        return implode('', $parts);
    }

    /** Concatenates toHumanReadable() of every character. */
    public function toHumanReadable(): string
    {
        $parts = array_map(
            static function (UnicodeCharacter $char): string {
                return $char->toHumanReadable();
            },
            $this->characters
        );

        return implode('', $parts);
    }

    /** Concatenates the string form of every character. */
    public function __toString(): string
    {
        $parts = array_map(
            static function (UnicodeCharacter $char) {
                return (string) $char;
            },
            $this->characters
        );

        return implode('', $parts);
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment