Last active
February 16, 2022 13:03
-
-
Save zanbaldwin/283156910871bd829f031ac787335ac2 to your computer and use it in GitHub Desktop.
I got bored and played with the UTF-8 standard.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php declare(strict_types=1); | |
/**
 * A single Unicode code point, stored as its raw UTF-8 byte sequence.
 *
 * Instances are created through the named factories {@see fromString()} and
 * {@see fromInt()}, which validate that the code point lies in U+0000..U+10FFFF.
 * The encoding is performed by hand (bit manipulation on "0"/"1" strings) rather
 * than via mb_chr()/IntlChar.
 *
 * NOTE(review): surrogate code points (U+D800..U+DFFF) are accepted and encoded
 * as three bytes even though they are not valid Unicode scalar values.
 */
class UnicodeCharacter
{
    /** Raw UTF-8 bytes (1 to 4) that encode the code point. */
    private string $binary;

    /**********************************\
    | CONSTRUCTORS AND FACTORY METHODS |
    \**********************************/

    /**
     * @param string $codepoint Hexadecimal code point (either case, any length); callers are
     *                          expected to have range-checked it already.
     *
     * @throws \InvalidArgumentException If $codepoint is not a hexadecimal string.
     * @throws \RuntimeException         If the code point needs more than 21 bits.
     */
    protected function __construct(string $codepoint)
    {
        $this->binary = $this->convert($codepoint);
    }

    /**
     * Builds a character from a hexadecimal code point such as "41", "20AC" or "1f600".
     *
     * @param string $hex One to six hex digits (case-insensitive) with a value of at most 10FFFF.
     *
     * @throws \OutOfRangeException If $hex is not hexadecimal or exceeds 10FFFF.
     */
    public static function fromString(string $hex): static
    {
        // "D" makes "$" match only at the very end, so a trailing newline is rejected here
        // instead of surfacing later as an InvalidArgumentException from the converter.
        return preg_match('/^(?:[A-F\d]{1,5}|10[A-F\d]{4})$/iD', $hex) === 1
            ? new static($hex)
            : throw new \OutOfRangeException('hex string out of range');
    }

    /**
     * Builds a character from an integer code point.
     *
     * @param int $num Code point in the inclusive range 0..0x10FFFF.
     *
     * @throws \OutOfRangeException If $num lies outside that range.
     */
    public static function fromInt(int $num): static
    {
        return $num >= 0 && $num <= 0x10FFFF
            ? new static(dechex($num))
            : throw new \OutOfRangeException('number out of range');
    }

    /*************************\
    | WHERE THE MAGIC HAPPENS |
    \*************************/

    /**
     * Encodes a hexadecimal code point as raw UTF-8 bytes.
     *
     * @param string $codepointHex Hex digits of the code point (odd lengths are allowed).
     *
     * @return string The 1 to 4 byte UTF-8 sequence.
     *
     * @throws \InvalidArgumentException If $codepointHex contains non-hex characters.
     * @throws \RuntimeException         If the value needs more than 21 bits.
     */
    private function convert(string $codepointHex): string
    {
        // pack('H*') needs whole bytes, so left-pad the hex string to an even number of digits.
        $hexLength = self::getLength($codepointHex);
        $evenHex = self::padString($codepointHex, $hexLength + ($hexLength % 2), '0', STR_PAD_LEFT);

        // Significant bits of the code point as a "0"/"1" string (empty for U+0000).
        $codepointBinary = ltrim(self::binaryToHumanReadable(self::binaryFromHex($evenHex)), '0');
        $codepointBitCount = self::getLength($codepointBinary);

        if ($codepointBitCount <= 7) {
            // US-ASCII: a single byte of the form 0xxxxxxx.
            return self::binaryFromHumanReadable(
                '0' . self::padString($codepointBinary, 7, '0', STR_PAD_LEFT)
            );
        }

        // Else multibyte UTF-8: payload capacity of a 2, 3 or 4 byte sequence.
        $paddingLength = match (true) {
            $codepointBitCount <= 11 => 11,
            $codepointBitCount <= 16 => 16,
            $codepointBitCount <= 21 => 21,
            default => throw new \RuntimeException('too many bits to encode'),
        };
        $codepointBinary = self::padString($codepointBinary, $paddingLength, '0', STR_PAD_LEFT);

        // Cut the payload into 6-bit groups counted from the right; whatever is left over
        // (5, 4 or 3 bits) ends up first and belongs to the leading byte.
        $segments = array_reverse(array_map(
            static fn (string $part): string => strrev($part),
            str_split(strrev($codepointBinary), 6)
        ));

        // Leading byte: one "1" per byte in the sequence, a "0", then the leftover bits.
        // Each continuation byte: "10" followed by six payload bits.
        $unicodeBinary = str_repeat('1', count($segments)) . '0' . array_shift($segments)
            . implode('', array_map(static fn (string $segment): string => '10' . $segment, $segments));

        return self::binaryFromHumanReadable($unicodeBinary);
    }

    /****************\
    | OUTPUT METHODS |
    \****************/

    /** Returns the UTF-8 bytes as lowercase hexadecimal, two digits per byte. */
    public function toHex(): string
    {
        return self::binaryToHex($this->binary);
    }

    /** Returns the UTF-8 bytes as a string of "0"/"1" characters, eight per byte. */
    public function toHumanReadable(): string
    {
        return self::binaryToHumanReadable($this->binary);
    }

    /** Returns the raw UTF-8 byte sequence. */
    public function __toString(): string
    {
        return $this->binary;
    }

    /*********************************************\
    | HELPER METHODS FOR DEALING WITH BINARY DATA |
    \*********************************************/

    /** Byte length of a string (strlen() is always byte-wise on PHP 8, which this file requires). */
    private static function getLength(string $str): int
    {
        return strlen($str);
    }

    /**
     * PHP doesn't have a function for multibyte string padding. str_pad() counts bytes, so
     * the target length is widened by the difference between byte count and character count.
     *
     * @param string $input         String to pad.
     * @param int    $paddingLength Desired length, measured in characters of $encoding.
     * @param string $padding       Padding string.
     * @param int    $type          One of the STR_PAD_* constants.
     * @param string $encoding      Encoding used to count characters when mbstring is available.
     */
    private static function padString(
        string $input,
        int $paddingLength,
        string $padding = ' ',
        int $type = STR_PAD_RIGHT,
        string $encoding = 'UTF-8'
    ): string {
        $diff = strlen($input) - (function_exists('mb_strlen') ? mb_strlen($input, $encoding) : strlen($input));

        return str_pad($input, $paddingLength + $diff, $padding, $type);
    }

    /**
     * Converts an even-length hexadecimal string into raw bytes.
     *
     * @throws \InvalidArgumentException If $hex is empty, non-hexadecimal or of odd length.
     */
    private static function binaryFromHex(string $hex): string
    {
        if (!ctype_xdigit($hex) || self::getLength($hex) % 2 !== 0) {
            throw new \InvalidArgumentException('Valid hexadecimal string not provided.');
        }

        return pack('H*', strtolower($hex));
    }

    /**
     * Converts a string of "0"/"1" characters into raw bytes, eight bits per byte.
     *
     * @param string $bits Bit string; callers pass a length that is a multiple of eight.
     */
    private static function binaryFromHumanReadable(string $bits): string
    {
        return self::binaryFromHex(implode('', array_map(
            static fn (string $byte): string => self::padString(dechex(bindec($byte)), 2, '0', STR_PAD_LEFT, '8bit'),
            str_split($bits, 8)
        )));
    }

    /** Converts raw bytes into lowercase hexadecimal, two digits per byte. */
    private static function binaryToHex(string $binary): string
    {
        return bin2hex($binary);
    }

    /** Converts raw bytes into a string of "0"/"1" characters, eight per byte. */
    private static function binaryToHumanReadable(string $binary): string
    {
        return implode('', array_map(
            static fn (string $byteHex): string => self::padString(decbin(hexdec($byteHex)), 8, '0', STR_PAD_LEFT, '8bit'),
            str_split(self::binaryToHex($binary), 2)
        ));
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php declare(strict_types=1); | |
/**
 * An ordered sequence of UnicodeCharacter objects that is rendered as one
 * concatenated string (raw bytes, hexadecimal, or "0"/"1" text).
 */
class UnicodeGraphemeCluster
{
    /**
     * @param UnicodeCharacter[] $characters Characters in output order.
     */
    protected function __construct(
        private array $characters,
    ) {}

    /**
     * Builds a cluster from integer code points.
     *
     * @param int[] $nums Code points, each handed to UnicodeCharacter::fromInt().
     */
    public static function fromInts(array $nums): static
    {
        $toCharacter = static fn (int $num): UnicodeCharacter => UnicodeCharacter::fromInt($num);

        return new static(array_map($toCharacter, $nums));
    }

    /**
     * Builds a cluster from one string of hexadecimal code points separated by
     * any run of whitespace and/or "+" characters; empty tokens are ignored.
     */
    public static function fromString(string $codepoints): static
    {
        $tokens = preg_split('/[\s\+]+/', $codepoints, -1, \PREG_SPLIT_NO_EMPTY);
        $toCharacter = static fn (string $codepoint): UnicodeCharacter => UnicodeCharacter::fromString($codepoint);

        return new static(array_map($toCharacter, $tokens));
    }

    /**
     * Builds a cluster from a list of hexadecimal code point strings.
     *
     * @param string[] $codepoints
     */
    public static function fromStrings(array $codepoints): static
    {
        $toCharacter = static fn (string $codepoint): UnicodeCharacter => UnicodeCharacter::fromString($codepoint);

        return new static(array_map($toCharacter, $codepoints));
    }

    /** Hexadecimal form of every character, concatenated in order. */
    public function toHex(): string
    {
        return $this->concatenate(static fn (UnicodeCharacter $char): string => $char->toHex());
    }

    /** "0"/"1" form of every character, concatenated in order. */
    public function toHumanReadable(): string
    {
        return $this->concatenate(static fn (UnicodeCharacter $char): string => $char->toHumanReadable());
    }

    /** String cast of every character, concatenated in order. */
    public function __toString(): string
    {
        return $this->concatenate(static fn (UnicodeCharacter $char) => (string) $char);
    }

    /** Applies $render to each character in order and joins the results without a separator. */
    private function concatenate(\Closure $render): string
    {
        $output = '';
        foreach ($this->characters as $character) {
            $output .= $render($character);
        }

        return $output;
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment