Skip to content

Instantly share code, notes, and snippets.

@jfcherng
Forked from blackbing/big5_stroke.tab
Last active December 15, 2020 08:26
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jfcherng/02c013631a3be12164f113dada0cb252 to your computer and use it in GitHub Desktop.
Save jfcherng/02c013631a3be12164f113dada0cb252 to your computer and use it in GitHub Desktop.
中文筆劃排序(只適用繁體中文) for PHP 7.2
<?php
/**
* Author: blackbing@gmail.com
* Desc: Sorts Chinese strings by stroke count (Traditional Chinese only).
* Usage: call ChtStroke::chtStrokesort() directly from PHP.
*/
/**
 * Stroke-count ordering for Traditional Chinese text.
 *
 * Every UTF-8 character is converted to Big5 and looked up in a table of Big5
 * code ranges grouped by stroke count. Characters found in the table get the
 * sort key STROKE_BASE + strokes; every other character falls back to the
 * value of its first byte, so ASCII text sorts ahead of Chinese text.
 *
 * Requires the mbstring extension.
 */
final class ChtStroke
{
    public const BIG5_HB_MIN = 0x81; // high (lead) byte: minimum
    public const BIG5_HB_MAX = 0xfe; // high (lead) byte: maximum
    public const BIG5_LB1_MIN = 0x40; // low (trail) byte, first range: minimum
    public const BIG5_LB1_MAX = 0x7e; // low (trail) byte, first range: maximum
    public const BIG5_LB2_MIN = 0xa1; // low (trail) byte, second range: minimum
    public const BIG5_LB2_MAX = 0xfe; // low (trail) byte, second range: maximum

    /**
     * Offset added to a stroke count to build a sort key. Characters without
     * stroke information are keyed by a single byte value (0-255), so they
     * always sort before any character that has a stroke count.
     */
    private const STROKE_BASE = 50000;

    /**
     * Rows of [stroke count, first Big5 code, last Big5 code] (inclusive).
     * Rows are tested in order and the first matching row wins.
     */
    private const STROKE_RANGES = [
        [1, 0xa440, 0xa441],
        [2, 0xa442, 0xa453],
        [3, 0xa454, 0xa47e],
        [4, 0xa4a1, 0xa4fd],
        [5, 0xa4fe, 0xa5df],
        [6, 0xa5e0, 0xa6e9],
        [7, 0xa6ea, 0xa8c2],
        [8, 0xa8c3, 0xab44],
        [9, 0xab45, 0xadbb],
        [10, 0xadbc, 0xb0ad],
        [11, 0xb0ae, 0xb3c2],
        [12, 0xb3c3, 0xb6c3],
        [13, 0xb6c4, 0xb9ab],
        [14, 0xb9ac, 0xbbf4],
        [15, 0xbbf5, 0xbea6],
        [16, 0xbea7, 0xc074],
        [17, 0xc075, 0xc24e],
        [18, 0xc24f, 0xc35e],
        [19, 0xc35f, 0xc454],
        [20, 0xc455, 0xc4d6],
        // Lower bound was 0xc3d7, which overlapped the three rows above. Those
        // rows match first, so correcting it does not change any result.
        [21, 0xc4d7, 0xc56a],
        [22, 0xc56b, 0xc5c7],
        // Upper bound was 0xc5c7 (below the lower bound), so this row matched
        // nothing. Restored to one below the start of the next row.
        [23, 0xc5c8, 0xc5f0],
        [24, 0xc5f1, 0xc654],
        [25, 0xc655, 0xc664],
        [26, 0xc665, 0xc66b],
        [27, 0xc66c, 0xc675],
        [28, 0xc676, 0xc67a],
        [29, 0xc67b, 0xc67e],
        [2, 0xc940, 0xc944],
        [3, 0xc945, 0xc94c],
        [4, 0xc94d, 0xc95c],
        [5, 0xc95d, 0xc9aa],
        // Upper bound was 0xc959 (below the lower bound), so this row matched
        // nothing. Restored to one below the start of the next row.
        [6, 0xc9ab, 0xca59],
        [7, 0xca5a, 0xcbb0],
        [8, 0xcbb1, 0xcddc],
        [9, 0xcddd, 0xd0c7],
        [10, 0xd0c8, 0xd44a],
        [11, 0xd44b, 0xd850],
        [12, 0xd851, 0xdcb0],
        [13, 0xdcb1, 0xe0ef],
        [14, 0xe0f0, 0xe4e5],
        [15, 0xe4e6, 0xe8f3],
        [16, 0xe8f4, 0xecb8],
        [17, 0xecb9, 0xefb6],
        [18, 0xefb7, 0xf1ea],
        [19, 0xf1eb, 0xf3fc],
        [20, 0xf3fd, 0xf5bf],
        [21, 0xf5c0, 0xf6d5],
        [22, 0xf6d6, 0xf7cf],
        // Was an exact copy of the 22-stroke row above and therefore never
        // reached. Restored to the gap between the neighbouring rows.
        [23, 0xf7d0, 0xf8a4],
        [24, 0xf8a5, 0xf8ed],
        // Lower bound was 0xf8e9, overlapping the row above. That row matches
        // first, so correcting it does not change any result.
        [25, 0xf8ee, 0xf96a],
        [26, 0xf96b, 0xf9a1],
        [27, 0xf9a2, 0xf9b9],
        [28, 0xf9ba, 0xf9c5],
        [29, 0xf9c6, 0xf9dc],
        // NOTE(review): every row below overlaps an earlier range (0xc67b-0xc67e
        // or 0xf9c6-0xf9dc) and is never reached because the first match wins.
        // They look like per-character overrides; confirm the intended
        // precedence before reordering. Kept in their original order.
        [9, 0xf9da, 0xf9da],
        [12, 0xf9db, 0xf9db],
        [13, 0xf9d6, 0xf9d8],
        [15, 0xf9dc, 0xf9dc],
        [16, 0xf9d9, 0xf9d9],
        [30, 0xc67b, 0xc67d],
        [30, 0xf9cc, 0xf9cf],
        [31, 0xf9c6, 0xf9c6],
        [31, 0xf9d0, 0xf9d0],
        [32, 0xf9d1, 0xf9d1],
        [33, 0xc67e, 0xc67e],
        [33, 0xf9d2, 0xf9d2],
        [34, 0xf9d3, 0xf9d3],
        [36, 0xf9d4, 0xf9d5],
    ];

    /**
     * Analyse a list of strings and, unless told otherwise, sort them by stroke count.
     *
     * Each input value is passed through urldecode() before it is analysed, so
     * "%XX" sequences and "+" in the input are decoded first.
     *
     * @param array $str_arr  The strings to analyse; keys are reported back as 'index_old'
     * @param bool  $dontSort When true, the result keeps the input order
     *
     * @return array[] One entry per input string with the keys 'string' (decoded
     *                 value), 'index_old' (original key), 'strokes' (stroke count
     *                 per character, 0 when unknown) and 'ords' (sort key per character)
     */
    public static function chtStrokesort(array $str_arr, bool $dontSort = false): array
    {
        $ord_arr = [];

        foreach ($str_arr as $key => $value) {
            $value = \urldecode($value);
            [$strokes, $ords] = self::strokesAndOrds($value);

            $ord_arr[] = [
                'string' => $value,
                'index_old' => $key,
                'strokes' => $strokes,
                'ords' => $ords,
            ];
        }

        if (!$dontSort) {
            \usort($ord_arr, self::uDictArrayCompare('ords'));
        }

        return $ord_arr;
    }

    /**
     * Sort the array in place by the stroke information of one of its rows' columns.
     *
     * The array is re-indexed by usort(), so original keys are not preserved.
     *
     * @param array[] $array  The rows to sort
     * @param string  $column The column whose string value drives the ordering
     */
    public static function chtStrokeSortByColumn(array &$array, string $column): void
    {
        $tmpColumn = "__stroke__{$column}";

        // attach the sort keys to every row
        foreach ($array as &$item) {
            $item[$tmpColumn] = self::chtOrds($item[$column]);
        }
        unset($item);

        \usort($array, self::uDictArrayCompare($tmpColumn));

        // Remove the helper column again. This has to go through a reference:
        // unsetting on a by-value copy would leave the column in the caller's rows.
        foreach ($array as &$item) {
            unset($item[$tmpColumn]);
        }
        unset($item);
    }

    /**
     * Build the list of per-character sort keys for a UTF-8 string.
     *
     * @param string $str The UTF-8 string
     *
     * @return int[] STROKE_BASE + strokes for characters with stroke information,
     *               otherwise the value of the character's first byte
     */
    public static function chtOrds(string $str): array
    {
        return self::strokesAndOrds($str)[1];
    }

    /**
     * Create a usort() callback that orders rows by the integer list stored under $key.
     *
     * Lists are compared element by element; when one list is a prefix of the
     * other, the shorter list sorts first.
     *
     * @param string $key The row key holding the list to compare
     */
    public static function uDictArrayCompare(string $key): \Closure
    {
        return static function (array $a, array $b) use ($key): int {
            $left = $a[$key];
            $right = $b[$key];

            if ($left === $right) {
                return 0;
            }

            $leftCount = \count($left);
            for ($i = 0; $i < $leftCount; ++$i) {
                // the right list ran out first, so it is the shorter one
                if (!isset($right[$i])) {
                    return 1;
                }

                $order = $left[$i] <=> $right[$i];
                if ($order !== 0) {
                    return $order;
                }
            }

            // the left list is a prefix of the right one
            return -1;
        };
    }

    /**
     * Compute the stroke counts and the sort keys of every character in one pass.
     *
     * @param string $str The UTF-8 string
     *
     * @return array{0: int[], 1: int[]} [stroke counts, sort keys]
     */
    private static function strokesAndOrds(string $str): array
    {
        $strokes = [];
        $ords = [];

        $length = \mb_strlen($str, 'UTF-8');
        for ($i = 0; $i < $length; ++$i) {
            $char = \mb_substr($str, $i, 1, 'UTF-8');
            $stroke = self::getStringStroke($char);

            $strokes[] = $stroke;
            // NOTE(review): ord() only sees the first byte, so multi-byte
            // characters without stroke information that share a lead byte get
            // the same key. Kept as is because the keys are part of the public output.
            $ords[] = $stroke > 0 ? self::STROKE_BASE + $stroke : \ord($char);
        }

        return [$strokes, $ords];
    }

    /**
     * Look up the stroke count of the Big5 character at the start of $str.
     *
     * @param string $str Big5 encoded bytes; only the first two bytes are used
     *
     * @return null|int The stroke count, or null when the bytes are not a
     *                  double-byte Big5 code or no table row covers the code
     */
    private static function big5Stroke(string $str): ?int
    {
        if (\strlen($str) < 2) {
            return null;
        }

        if (!self::big5IsHb($str[0]) || !self::big5IsLb($str[1])) {
            return null;
        }

        $code = (\ord($str[0]) << 8) | \ord($str[1]);

        foreach (self::STROKE_RANGES as [$strokes, $first, $last]) {
            if ($first <= $code && $code <= $last) {
                return $strokes;
            }
        }

        return null;
    }

    /**
     * Get the stroke count of a UTF-8 character.
     *
     * @param string $str The UTF-8 character
     *
     * @return int The stroke count, or 0 when it is unknown
     */
    private static function getStringStroke(string $str): int
    {
        return (int) self::big5Stroke(self::utf8ToBig5($str));
    }

    /**
     * Tell whether the byte lies in the Big5 high (lead) byte range.
     */
    private static function big5IsHb(string $c): bool
    {
        $asc = \ord($c);

        return $asc >= self::BIG5_HB_MIN && $asc <= self::BIG5_HB_MAX;
    }

    /**
     * Tell whether the byte lies in one of the two Big5 low (trail) byte ranges.
     */
    private static function big5IsLb(string $c): bool
    {
        $asc = \ord($c);

        return
            ($asc >= self::BIG5_LB1_MIN && $asc <= self::BIG5_LB1_MAX) ||
            ($asc >= self::BIG5_LB2_MIN && $asc <= self::BIG5_LB2_MAX);
    }

    /**
     * Convert a UTF-8 string to Big5, one character at a time.
     *
     * ASCII bytes are copied unchanged. Bytes that cannot start a UTF-8
     * sequence are dropped. A character whose conversion comes back empty is
     * written as a numeric HTML entity instead.
     *
     * @param string $utf8_str The UTF-8 string
     */
    private static function utf8ToBig5(string $utf8_str): string
    {
        $big5_str = '';
        $len = \strlen($utf8_str);
        $i = 0;

        while ($i < $len) {
            $lead = \ord($utf8_str[$i]);

            if ($lead < 0x80) {
                $big5_str .= $utf8_str[$i];
                ++$i;

                continue;
            }

            // sequence length announced by the lead byte
            if ($lead >= 0xc0 && $lead < 0xe0) {
                $width = 2;
            } elseif ($lead >= 0xe0 && $lead < 0xf0) {
                $width = 3;
            } elseif ($lead >= 0xf0 && $lead < 0xf8) {
                $width = 4;
            } else {
                // stray continuation byte or invalid lead byte: skip it
                ++$i;

                continue;
            }

            // The fallback works on exactly the bytes of this character; the
            // old code re-read a fixed 3 bytes regardless of the sequence length.
            $chunk = \substr($utf8_str, $i, $width);
            $converted = (string) \mb_convert_encoding($chunk, 'Big5', 'UTF-8');

            // mb_encode_numericentity() replaces the 'HTML-ENTITIES' encoding,
            // which is deprecated as of PHP 8.2.
            $big5_str .= $converted !== ''
                ? $converted
                : \mb_encode_numericentity($chunk, [0x80, 0x10ffff, 0, 0x1fffff], 'UTF-8');

            $i += $width;
        }

        return $big5_str;
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment