Skip to content

Instantly share code, notes, and snippets.

@ahmedash95
Created October 19, 2016 15:54
Show Gist options
  • Save ahmedash95/35ec5c2c6d5c0ee1ff34b27b229759e4 to your computer and use it in GitHub Desktop.
Save ahmedash95/35ec5c2c6d5c0ee1ff34b27b229759e4 to your computer and use it in GitHub Desktop.
Glyphs Class for UTF8
<?php
/**
* This file is part of the AIP package.
*
* (c) Khaled Al-Sham'aa <khaled@ar-php.org> && Maher El Gamil <maherbusnes@gmail.com>
*
* For the full copyright and license information, please view the LICENSE
* file that was distributed with this source code.
*/
class Glyphs
{
/**
* Every character that has an entry in $_hex, concatenated into one string.
* getGlyphs() locates a character here with mb_strpos(); the trailing entries
* appended last by the constructor are two-character lam-alef sequences.
*
* @var null|string
*/
private $_glyphs = null;
/**
* Concatenated 4-digit hexadecimal code points. Each entry of $_glyphs owns
* sixteen consecutive digits here (four code points), which getGlyphs()
* addresses as position * 16 + form * 4.
*
* @var null|string
*/
private $_hex = null;
/**
* Characters checked against the preceding character in preConvert(); a match
* adds 1 to the form index of the character being shaped.
*
* @var null|string
*/
private $_prevLink = null;
/**
* Characters checked against the following character in preConvert(); a match
* adds 2 to the form index of the character being shaped.
*
* @var null|string
*/
private $_nextLink = null;
/**
* Diacritic marks. preConvert() skips over them when looking for the preceding
* letter and emits them through a separate branch.
*
* @var null|string
*/
private $_vowel = null;
/**
* Populate the character sets and the glyph lookup tables.
*
* $_glyphs and $_hex are built in parallel: for every entry appended to
* $_glyphs, sixteen hexadecimal digits (four 4-digit code points, one per
* form index 0-3) are appended to $_hex. getGlyphs() relies on that fixed
* layout, so the two tables must always be extended together.
*
* @ignore
*/
public function __construct()
{
// A preceding character from this set adds 1 to the form index.
$this->_prevLink = '،؟؛ـئبتثجحخسشصضطظعغفقكلمنهي';
// A following character from this set adds 2 to the form index.
$this->_nextLink = 'ـآأؤإائبةتثجحخدذرز';
$this->_nextLink .= 'سشصضطظعغفقكلمنهوىي';
// Diacritics that are skipped when searching for the preceding letter.
$this->_vowel = 'ًٌٍَُِّْ';
/*
$this->_glyphs['ً'] = array('FE70','FE71');
$this->_glyphs['ٌ'] = array('FE72','FE72');
$this->_glyphs['ٍ'] = array('FE74','FE74');
$this->_glyphs['َ'] = array('FE76','FE77');
$this->_glyphs['ُ'] = array('FE78','FE79');
$this->_glyphs['ِ'] = array('FE7A','FE7B');
$this->_glyphs['ّ'] = array('FE7C','FE7D');
$this->_glyphs['ْ'] = array('FE7E','FE7E');
*/
// Diacritics: all four forms map back to the plain combining mark
// (064B-0652 and 0670), so they pass through unchanged.
$this->_glyphs = 'ًٌٍَُِّْٰ';
$this->_hex = '064B064B064B064B064C064C064C064C064D064D064D064D064E064E';
$this->_hex .= '064E064E064F064F064F064F06500650065006500651065106510651';
$this->_hex .= '06520652065206520670067006700670';
// Letters, eight per group, each with four presentation-form code points.
$this->_glyphs .= 'ءآأؤإئاب';
$this->_hex .= 'FE80FE80FE80FE80FE81FE82FE81FE82FE83FE84FE83FE84FE85FE86';
$this->_hex .= 'FE85FE86FE87FE88FE87FE88FE89FE8AFE8BFE8CFE8DFE8EFE8DFE8E';
$this->_hex .= 'FE8FFE90FE91FE92';
$this->_glyphs .= 'ةتثجحخدذ';
$this->_hex .= 'FE93FE94FE93FE94FE95FE96FE97FE98FE99FE9AFE9BFE9CFE9DFE9E';
$this->_hex .= 'FE9FFEA0FEA1FEA2FEA3FEA4FEA5FEA6FEA7FEA8FEA9FEAAFEA9FEAA';
$this->_hex .= 'FEABFEACFEABFEAC';
$this->_glyphs .= 'رزسشصضطظ';
$this->_hex .= 'FEADFEAEFEADFEAEFEAFFEB0FEAFFEB0FEB1FEB2FEB3FEB4FEB5FEB6';
$this->_hex .= 'FEB7FEB8FEB9FEBAFEBBFEBCFEBDFEBEFEBFFEC0FEC1FEC2FEC3FEC4';
$this->_hex .= 'FEC5FEC6FEC7FEC8';
$this->_glyphs .= 'عغفقكلمن';
$this->_hex .= 'FEC9FECAFECBFECCFECDFECEFECFFED0FED1FED2FED3FED4FED5FED6';
$this->_hex .= 'FED7FED8FED9FEDAFEDBFEDCFEDDFEDEFEDFFEE0FEE1FEE2FEE3FEE4';
$this->_hex .= 'FEE5FEE6FEE7FEE8';
// Last letter group; tatweel and the Arabic punctuation marks map to themselves.
$this->_glyphs .= 'هوىيـ،؟؛';
$this->_hex .= 'FEE9FEEAFEEBFEECFEEDFEEEFEEDFEEEFEEFFEF0FEEFFEF0FEF1FEF2';
$this->_hex .= 'FEF3FEF40640064006400640060C060C060C060C061F061F061F061F';
$this->_hex .= '061B061B061B061B';
// Support the extra 4 Persian letters (p), (ch), (zh) and (g)
// This needs value in getGlyphs function to be 52 instead of 48
// NOTE(review): getGlyphs() in this file compares against 49, not 48, so the
// numbers in the line above look out of date - re-derive them before enabling.
// $this->_glyphs .= chr(129).chr(141).chr(142).chr(144);
// $this->_hex .= 'FB56FB57FB58FB59FB7AFB7BFB7CFB7DFB8AFB8BFB8AFB8BFB92';
// $this->_hex .= 'FB93FB94FB95';
//
// $this->_prevLink .= chr(129).chr(141).chr(142).chr(144);
// $this->_nextLink .= chr(129).chr(141).chr(142).chr(144);
//
// Example: $text = 'نمونة قلم: لاگچ ژافپ';
// Email Yossi Beck <yosbeck@gmail.com> ask him to save that example
// string using ANSI encoding in Notepad
// The next two statements append empty strings and therefore change nothing;
// they mark where the disabled Persian entries above would be inserted.
$this->_glyphs .= '';
$this->_hex .= '';
// Lam-alef ligatures: each entry is TWO characters wide in $_glyphs, which is
// why getGlyphs() halves the offset for positions past the first ligature.
$this->_glyphs .= 'لآلألإلا';
$this->_hex .= 'FEF5FEF6FEF5FEF6FEF7FEF8FEF7FEF8FEF9FEFAFEF9FEFAFEFBFEFC';
$this->_hex .= 'FEFBFEFC';
}
/**
 * Look up the hexadecimal code point for one presentation form of a character.
 *
 * The character is located in $_glyphs and its position is translated into an
 * offset in $_hex (16 hex digits per entry, 4 per form). Lam-alef ligature
 * entries occupy two characters each in $_glyphs, so positions beyond the
 * first ligature are halved before the offset is computed.
 *
 * A character that has no entry in the table is returned as its own code
 * point instead of being mistaken for the first table entry (U+064B), so
 * unsupported letters pass through preConvert() unchanged.
 *
 * @param string $char Non-empty UTF-8 character, or a two-character lam-alef
 *                     sequence
 * @param int    $type Form index, 0 to 3
 *
 * @return string Upper-case hexadecimal code point (at least four digits)
 */
protected function getGlyphs($char, $type)
{
    // Position in $_glyphs of the first two-character (lam-alef) entry.
    $firstLigature = 49;

    // The encoding is given explicitly so the lookup does not depend on
    // mb_internal_encoding().
    $pos = mb_strpos($this->_glyphs, $char, 0, 'UTF-8');

    if ($pos === false) {
        // Not in the table: emit the character's own code point. UCS-4BE is a
        // fixed-width big-endian encoding, so unpack('N') yields the code point.
        $first  = mb_substr($char, 0, 1, 'UTF-8');
        $packed = mb_convert_encoding($first, 'UCS-4BE', 'UTF-8');
        $code   = unpack('N', $packed);

        return sprintf('%04X', $code[1]);
    }

    if ($pos > $firstLigature) {
        // Ligature entries are two characters wide; collapse to an entry index.
        $pos = (int) (($pos - $firstLigature) / 2) + $firstLigature;
    }

    return substr($this->_hex, $pos * 16 + $type * 4, 4);
}
/**
 * Shape a string into Arabic presentation-form glyphs, in reversed order.
 *
 * The input is split into characters with the mb_* functions and walked from
 * the last character to the first. For each Arabic character a form index is
 * derived from its neighbours (+1 when the preceding letter is in $_prevLink,
 * +2 when the following one is in $_nextLink) and the matching code point is
 * taken from getGlyphs(). ASCII characters are copied as they are, except that
 * brackets are mirrored. Lam followed directly by an alef variant is emitted
 * as a single ligature. The intermediate "&#x....;" stream is finally turned
 * into UTF-8 by decodeEntities().
 *
 * NOTE(review): earlier revisions documented the input as Windows-1256; the
 * mb_* calls and the UTF-8 literals in this class indicate UTF-8 - confirm.
 *
 * @param string $str Text in logical order
 *
 * @return string Shaped text in reversed character order, UTF-8 encoded
 *
 * @author Khaled Al-Sham'aa <khaled@ar-php.org>
 */
protected function preConvert($str)
{
    $alefForms = 'آأإا';
    $closers   = ')]>}';
    $openers   = '([<{';

    $chars  = [];
    $length = mb_strlen($str);
    for ($i = 0; $i < $length; $i++) {
        $chars[] = mb_substr($str, $i, 1);
    }

    $output   = '';
    $nextChar = null;

    for ($i = count($chars) - 1; $i >= 0; $i--) {
        $crntChar = $chars[$i];

        // Preceding letter, looking past at most two diacritics. A space stands
        // in whenever the search would run off the start of the text; reading a
        // negative index used to yield null, which mb_strpos() treats as an
        // empty needle (found at offset 0 on PHP 8).
        $prevChar = $i > 0 ? $chars[$i - 1] : ' ';
        if (mb_strpos($this->_vowel, $prevChar) !== false) {
            $prevChar = $i > 1 ? $chars[$i - 2] : ' ';
            if (mb_strpos($this->_vowel, $prevChar) !== false) {
                $prevChar = $i > 2 ? $chars[$i - 3] : ' ';
            }
        }

        // The output is reversed, so brackets are swapped for their mirror.
        if (($at = mb_strpos($closers, $crntChar)) !== false) {
            $crntChar = $openers[$at];
        } elseif (($at = mb_strpos($openers, $crntChar)) !== false) {
            $crntChar = $closers[$at];
        }

        // ASCII is copied verbatim.
        if (ord($crntChar) < 128) {
            $output .= $crntChar;
            $nextChar = $crntChar;
            continue;
        }

        // A lam directly followed by an alef variant was already emitted as
        // part of the ligature when the alef was processed.
        if ($crntChar === 'ل' && isset($chars[$i + 1])
            && mb_strpos($alefForms, $chars[$i + 1]) !== false
        ) {
            continue;
        }

        // Diacritics: form 1 when sitting between two joining letters.
        if (mb_strpos($this->_vowel, $crntChar) !== false) {
            $joined = isset($chars[$i + 1])
                && mb_strpos($this->_nextLink, $chars[$i + 1]) !== false
                && mb_strpos($this->_prevLink, $prevChar) !== false;
            $output .= '&#x'.$this->getGlyphs($crntChar, $joined ? 1 : 0).';';
            continue;
        }

        $form = 0;

        // Alef variant whose preceding letter is lam. $prevChar is always a
        // single character, so only the plain lam comparison can match.
        if ($prevChar === 'ل' && mb_strpos($alefForms, $crntChar) !== false) {
            if ($i > 1 && mb_strpos($this->_prevLink, $chars[$i - 2]) !== false) {
                $form++;
            }
            // Strict comparison: the first diacritic sits at offset 0, which a
            // truthiness test would read as "not found".
            if (mb_strpos($this->_vowel, $chars[$i - 1]) !== false) {
                // A diacritic separates lam and alef: shape the alef alone,
                // the lam is emitted on its own iteration.
                $output .= '&#x'.$this->getGlyphs($crntChar, $form).';';
            } else {
                $output .= '&#x'.$this->getGlyphs($prevChar.$crntChar, $form).';';
            }
            $nextChar = $prevChar;
            continue;
        }

        if (mb_strpos($this->_prevLink, $prevChar) !== false) {
            $form++;
        }
        if ($nextChar && mb_strpos($this->_nextLink, $nextChar) !== false) {
            $form += 2;
        }

        $output .= '&#x'.$this->getGlyphs($crntChar, $form).';';
        $nextChar = $crntChar;
    }

    // from Arabic Presentation Forms-B, Range: FE70-FEFF,
    // file "UFE70.pdf" (in reversed order)
    // into Arabic Presentation Forms-A, Range: FB50-FDFF, file "UFB50.pdf"
    // Example: $output = str_replace('&#xFEA0;&#xFEDF;', '&#xFCC9;', $output);
    // Lam Jeem
    return $this->decodeEntities($output, ['&']);
}
/**
 * Estimate how many characters fit on one line of an A4 page.
 *
 * The estimate is a fourth-degree polynomial in the font size (obtained by
 * regression analysis, per the original author), lowered by 2 and rounded
 * down.
 *
 * @param int $font Font size
 *
 * @return float Whole-number estimate of the characters per line (floor()
 *               returns a float)
 *
 * @author Khaled Al-Sham'aa <khaled@ar-php.org>
 */
public function a4MaxChars($font)
{
    // Polynomial terms, kept in the original left-to-right evaluation order
    // so the floating-point result is unchanged.
    $linear    = 31.57 * $font;
    $quadratic = 1.182 * pow($font, 2);
    $cubic     = 0.02052 * pow($font, 3);
    $quartic   = 0.0001342 * pow($font, 4);

    $estimate = 381.6 - $linear + $quadratic - $cubic + $quartic;

    return floor($estimate - 2);
}
/**
 * Count the lines a text occupies on an A4 page at a given font size.
 *
 * Line endings are normalised to "\n" and every resulting paragraph is
 * word-wrapped against a4MaxChars($font): a word (its length plus one for the
 * separating space) stays on the current line while the running total remains
 * below the limit, otherwise it starts a new line. Every paragraph contributes
 * at least one line, so each explicit line break is counted - previously the
 * first break inside a token was never counted ("a\nb" reported one line).
 *
 * @param string $str  Text to measure
 * @param int    $font Font size
 *
 * @return int Number of lines
 *
 * @author Khaled Al-Sham'aa <khaled@ar-php.org>
 */
public function a4Lines($str, $font)
{
    $str       = str_replace(["\r\n", "\r"], "\n", $str);
    $max_chars = $this->a4MaxChars($font);
    $lines     = 0;

    foreach (explode("\n", $str) as $paragraph) {
        $chars = 0;

        foreach (explode(' ', $paragraph) as $word) {
            $w_len = mb_strlen($word) + 1;

            if ($chars + $w_len < $max_chars) {
                $chars += $w_len;
            } else {
                // The word does not fit: close the current line.
                $lines++;
                $chars = $w_len;
            }
        }

        // The line still open at the end of the paragraph.
        $lines++;
    }

    return $lines;
}
/**
* Wrap a text into lines, reverse it for right-to-left display and shape the
* Arabic characters into joined glyphs.
*
* Steps, in order: line breaks are isolated as separate words and tabs are
* replaced by spaces; words matching the "English" pattern are reversed and
* each run of them is put in reverse word order; the words are wrapped into
* lines of fewer than $max_chars characters; the lines are joined in reverse
* order; the result is passed to preConvert(); finally, when $hindo is true,
* digits are converted as described below.
*
* NOTE(review): older documentation describes the input as Windows-1256. The
* mb_* calls and the UTF-8 literals used in this class indicate UTF-8 input -
* confirm before relying on either.
*
* @param string $str Text to convert
* @param int $max_chars Line width: a word is kept on the current line while
* the running length stays below this value
* @param bool $hindo If true, ASCII digits are replaced by the digits
* U+0660-U+0669, except those adjacent to Latin
* letters, hyphens or other ASCII digits
*
* @return string Shaped text, lines separated by "\n" and in reverse order
*
* @author Khaled Al-Sham'aa <khaled@ar-php.org>
*/
public function utf8Glyphs($str, $max_chars = 50, $hindo = true)
{
// Surround line breaks with spaces so that each "\n" becomes its own word.
$str = str_replace(["\r\n", "\n", "\r"], " \n ", $str);
$str = str_replace("\t", ' ', $str);
$lines = [];
$words = explode(' ', $str);
$w_count = count($words);
$c_chars = 0;
$c_words = [];
// Reversed English words of the run currently being collected.
$english = [];
// Index of the first word of the current English run, -1 when outside a run.
$en_index = -1;
$en_words = [];
$en_stack = [];
for ($i = 0; $i < $w_count; $i++) {
// An "English" word: optional leading newline, then ASCII letters, digits
// and punctuation, then at most one trailing punctuation mark.
// NOTE(review): the pattern has no "u" modifier although its last character
// class contains multi-byte literals, so those are matched byte by byte -
// verify that the Arabic comma / question mark case behaves as intended.
$pattern = '/^(\n?)';
$pattern .= '[a-z\d\\/\@\#\$\%\^\&\*\(\)\_\~\"\'\[\]\{\}\;\,\|\-\.\:!]*';
$pattern .= '([\.\:\+\=\-\!،؟]?)$/i';
if (preg_match($pattern, $words[$i], $matches)) {
// Move a leading newline to the end of the word...
if ($matches[1]) {
$words[$i] = mb_substr($words[$i], 1).$matches[1];
}
// ...and a trailing punctuation mark to the front, so that both end up on
// the expected side once the word is reversed.
if ($matches[2]) {
$words[$i] = $matches[2].mb_substr($words[$i], 0, -1);
}
// preConvert() reverses the whole text; reversing here keeps the word
// readable afterwards. strrev() works on bytes, not characters.
$words[$i] = strrev($words[$i]);
array_push($english, $words[$i]);
if ($en_index == -1) {
$en_index = $i;
}
$en_words[] = true;
} elseif ($en_index != -1) {
// First non-English word after a run: write the run back in reverse order.
$en_count = count($english);
for ($j = 0; $j < $en_count; $j++) {
$words[$en_index + $j] = $english[$en_count - 1 - $j];
}
$en_index = -1;
$english = [];
$en_words[] = false;
} else {
$en_words[] = false;
}
}
// Flush a run that reaches the end of the text.
if ($en_index != -1) {
$en_count = count($english);
for ($j = 0; $j < $en_count; $j++) {
$words[$en_index + $j] = $english[$en_count - 1 - $j];
}
}
// need more work to fix lines starts by English words
// NOTE(review): $en_start is never assigned in this method, so the block
// below never runs; $en_words and $en_stack are only used inside it.
if (isset($en_start)) {
$last = true;
$from = 0;
foreach ($en_words as $key => $value) {
if ($last !== $value) {
$to = $key - 1;
array_push($en_stack, [$from, $to]);
$from = $key;
}
$last = $value;
}
array_push($en_stack, [$from, $key]);
$new_words = [];
while (list($from, $to) = array_pop($en_stack)) {
for ($i = $from; $i <= $to; $i++) {
$new_words[] = $words[$i];
}
}
$words = $new_words;
}
// Word wrap: $c_words holds the words of the line being built and $c_chars
// its length, counting one extra character per word for the space.
for ($i = 0; $i < $w_count; $i++) {
$w_len = mb_strlen($words[$i]) + 1;
if ($c_chars + $w_len < $max_chars) {
if (mb_strpos($words[$i], "\n") !== false) {
// Explicit line break: close the current line with the text before the
// break, emit any middle segments as lines of their own and start the next
// line with the text after the last break.
$words_nl = explode("\n", $words[$i]);
array_push($c_words, $words_nl[0]);
array_push($lines, implode(' ', $c_words));
$nl_num = count($words_nl) - 1;
for ($j = 1; $j < $nl_num; $j++) {
array_push($lines, $words_nl[$j]);
}
$c_words = [$words_nl[$nl_num]];
$c_chars = mb_strlen($words_nl[$nl_num]) + 1;
} else {
array_push($c_words, $words[$i]);
$c_chars += $w_len;
}
} else {
// The word does not fit: close the line and start a new one with it.
array_push($lines, implode(' ', $c_words));
$c_words = [$words[$i]];
$c_chars = $w_len;
}
}
array_push($lines, implode(' ', $c_words));
// Join the lines last-to-first: preConvert() reverses the whole string, which
// restores the original line order.
$maxLine = count($lines);
$output = '';
for ($j = $maxLine - 1; $j >= 0; $j--) {
$output .= $lines[$j]."\n";
}
$output = rtrim($output);
$output = $this->preConvert($output);
if ($hindo) {
$nums = [
'0', '1', '2', '3', '4',
'5', '6', '7', '8', '9',
];
$arNums = [
'٠', '١', '٢', '٣', '٤',
'٥', '٦', '٧', '٨', '٩',
];
// Pass 1: replace every ASCII digit with its counterpart from $arNums.
foreach ($nums as $k => $v) {
$p_nums[$k] = '/'.$v.'/ui';
}
$output = preg_replace($p_nums, $arNums, $output);
// Pass 2: turn a converted digit back into ASCII when it directly follows
// Latin letters, hyphens or ASCII digits.
foreach ($arNums as $k => $v) {
$p_arNums[$k] = '/([a-z-\d]+)'.$v.'/ui';
}
foreach ($nums as $k => $v) {
$r_nums[$k] = '${1}'.$v;
}
$output = preg_replace($p_arNums, $r_nums, $output);
// Pass 3: the same when it directly precedes such a run.
foreach ($arNums as $k => $v) {
$p_arNums[$k] = '/'.$v.'([a-z-\d]+)/ui';
}
foreach ($nums as $k => $v) {
$r_nums[$k] = $v.'${1}';
}
$output = preg_replace($p_arNums, $r_nums, $output);
}
return $output;
}
/**
 * Decode HTML entities (named, decimal and hexadecimal) to UTF-8.
 *
 * All entities are replaced in a single pass, so double-escaped input is only
 * decoded once ("&amp;lt;" becomes "&lt;", not "<"). Anything that is not a
 * complete, well-formed entity - a lone "&", an entity without the closing
 * ";", a numeric reference with invalid digits - is left exactly as written.
 *
 * @param string $text    The text to decode entities in.
 * @param array  $exclude Characters which must stay encoded, for example
 *                        array('<', '&', '"'). Applies to both named and
 *                        numerical entities.
 *
 * @return string
 */
protected function decodeEntities($text, $exclude = [])
{
    static $table;

    // Named entities are cached in a table for quick processing.
    if (!isset($table)) {
        // The table is requested in UTF-8 directly. Passing it through
        // utf8_encode() as well would encode every non-ASCII character twice.
        $table = array_flip(
            get_html_translation_table(HTML_ENTITIES, ENT_QUOTES | ENT_HTML401, 'UTF-8')
        );
        // Add apostrophe (XML)
        $table['&apos;'] = "'";
    }

    $newtable = array_diff($table, $exclude);

    $decoded = preg_replace_callback(
        '/&(#x?)?([A-Za-z0-9]+);/',
        function ($match) use ($newtable, $exclude) {
            $prefix = $match[1];
            $body   = $match[2];

            // Reject numeric references whose digits do not fit the radix.
            if ($prefix === '#x' && !ctype_xdigit($body)) {
                return $match[0];
            }
            if ($prefix === '#' && !ctype_digit($body)) {
                return $match[0];
            }

            return $this->decodeEntities2($prefix, $body, $match[0], $newtable, $exclude);
        },
        $text
    );

    // preg_replace_callback() returns null on a PCRE failure.
    return $decoded === null ? $text : $decoded;
}
/**
 * Helper function for decodeEntities: decode a single entity.
 *
 * Named entities are looked up in $table. Numerical entities are parsed as
 * hexadecimal ("#x" prefix) or decimal ("#" prefix) and encoded as UTF-8. The
 * original text is returned unchanged when the entity is unknown, when the
 * number is malformed, when it lies outside U+0000-U+10FFFF or in the
 * surrogate range, or when the decoded character is listed in $exclude.
 *
 * @param string $prefix    "" for a named entity, "#" for decimal, "#x" for
 *                          hexadecimal
 * @param string $codepoint Entity name, or the digits of a numerical entity
 * @param string $original  The complete entity as found in the text
 * @param array  &$table    Named entities, keyed by entity ("&amp;" => "&")
 * @param array  &$exclude  Characters which should not be decoded
 *
 * @return string The decoded character, or $original
 */
protected function decodeEntities2(
    $prefix, $codepoint, $original, &$table, &$exclude
) {
    // Named entity
    if (!$prefix) {
        return isset($table[$original]) ? $table[$original] : $original;
    }

    // Parse the digits into a number; never do arithmetic on the raw string.
    $digits = (string) $codepoint;
    if ($prefix === '#x') {
        if (!ctype_xdigit($digits)) {
            return $original;
        }
        $code = hexdec($digits);
    } else {
        if (!ctype_digit($digits)) {
            return $original;
        }
        $code = (int) $digits;
    }

    // Not encodable as UTF-8: beyond the Unicode range or a surrogate.
    if ($code > 0x10FFFF || ($code >= 0xD800 && $code <= 0xDFFF)) {
        return $original;
    }
    $code = (int) $code;

    // Encode codepoint as UTF-8 bytes
    if ($code < 0x80) {
        $str = chr($code);
    } elseif ($code < 0x800) {
        $str = chr(0xC0 | ($code >> 6)).
            chr(0x80 | ($code & 0x3F));
    } elseif ($code < 0x10000) {
        $str = chr(0xE0 | ($code >> 12)).
            chr(0x80 | (($code >> 6) & 0x3F)).
            chr(0x80 | ($code & 0x3F));
    } else {
        $str = chr(0xF0 | ($code >> 18)).
            chr(0x80 | (($code >> 12) & 0x3F)).
            chr(0x80 | (($code >> 6) & 0x3F)).
            chr(0x80 | ($code & 0x3F));
    }

    // Check for excluded characters
    return in_array($str, $exclude, true) ? $original : $str;
}
}
// Usage example: utf8Glyphs() returns the shaped string. The return value is
// discarded here, so assign it or echo it to make use of the result.
(new Glyphs)->utf8Glyphs('احمد اشرف');
// That is all that is needed.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment