Skip to content

Instantly share code, notes, and snippets.

@alexdowad
Created March 18, 2023 08:46
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save alexdowad/7d23f9bd183a575a9b424b69f59c8073 to your computer and use it in GitHub Desktop.
Save alexdowad/7d23f9bd183a575a9b424b69f59c8073 to your computer and use it in GitHub Desktop.
Script for benchmarking mb_detect_encoding
<?php
/*
 * Benchmark configuration.
 *
 * 'commit' labels every result line. It is the file name of the running PHP
 * executable with every 'php-' removed, so a binary named 'php-abc123' is
 * reported as 'abc123'.
 */
$exe = is_link('/proc/self/exe') ? readlink('/proc/self/exe') : false;
if ($exe === false || $exe === '') {
    /* No usable /proc/self/exe (for example, not running on Linux): use the
       path PHP reports for its own binary instead of an empty label. */
    $exe = PHP_BINARY;
}
$GLOBALS['commit'] = str_replace('php-', '', basename($exe));

/* Number of calls timed for each individual test string. */
$GLOBALS['iterations_per_test'] = 5000;
/**
 * Time repeated calls of a callable with a single argument.
 *
 * @param callable $func       Function under test; invoked as $func($arg).
 * @param mixed    $arg        Argument handed to every call.
 * @param int      $iterations Number of calls to make; zero or less makes none.
 *
 * @return float Elapsed time in seconds for the whole loop.
 */
function bench(callable $func, $arg, $iterations): float {
    /* hrtime() is monotonic, so the measurement cannot be skewed by the
       wall clock being adjusted while the loop runs. */
    $start = hrtime(true);
    for ($remaining = $iterations; $remaining > 0; $remaining--) {
        $func($arg);
    }
    /* hrtime(true) counts nanoseconds; callers expect seconds. */
    return (hrtime(true) - $start) / 1e9;
}
/**
 * Benchmark a callable against several arguments and average the results.
 *
 * Each argument is timed separately with bench(); the return value is the
 * mean of those per-argument timings.
 *
 * @param callable $func       Function under test; invoked as $func($arg).
 * @param array    $args       Arguments to benchmark, one timing run each.
 * @param int      $iterations Number of calls made for each argument.
 *
 * @return float Mean elapsed seconds per argument, or 0.0 when $args is empty.
 */
function benchAll(callable $func, $args, $iterations): float {
    $count = count($args);
    if ($count === 0) {
        /* Nothing to measure; the division below would otherwise be a
           division by zero. */
        return 0.0;
    }
    $total = 0.0;
    foreach ($args as $arg) {
        $total += bench($func, $arg, $iterations);
    }
    return $total / $count;
}
/*
 * Test strings: ASCII-only samples, grouped by length.
 */
$ascii_short = ['a', 'This', 'cat', '', 'APART', '``', '_x_', '@'];

/* Characters 33 through 126 in code order; repeated for the medium and long sets. */
$ascii_printable = implode('', array_map('chr', range(33, 126)));

/* Concatenated MD5 hex digests of the integers $from through $to. */
$md5_digests = static function ($from, $to) {
    $digests = '';
    foreach (range($from, $to) as $number) {
        $digests .= md5((string) $number);
    }
    return $digests;
};

$ascii_medium = [
    str_repeat('abcde', 20),
    $md5_digests(1, 5),
    str_repeat($ascii_printable, 2),
];
$ascii_long = [
    str_repeat('This is a long string! ', 500),
    $md5_digests(10, 310),
    str_repeat($ascii_printable, 120),
];
/*
 * UTF-8 test strings in three size classes, followed by the same strings
 * converted to UTF-16LE.
 */
$utf8_short = ["ひらがな", "Καλημέρα", "АБВГ", "ABC", "", "⡌⠁⠧⠑ ⠼", "∑ f(i)"];

$utf8_medium = [
    "Οὐχὶ ταὐτὰ παρίσταταί μοι γιγνώσκειν, ὦ ἄνδρες ᾿Αθηναῖοι, ὅταν τ᾿ εἰς τὰ πράγματα ἀποβλέψω καὶ ὅταν πρὸς τοὺς",
    /* This literal contains a line break, so its second line must stay unindented. */
    "Зарегистрируйтесь сейчас на Десятую Международную Конференцию по
Unicode, которая состоится 10-12 марта 1997 года в Майнце в Германии.",
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ /0123456789 abcdefghijklmnopqrstuvwxyz £©µÀÆÖÞßéöÿАБВГДабвгд∀∂∈ℝ∧∪≡∞ ↑↗↨↻⇣",
    "⡍⠔⠙⠖ ⡊ ⠙⠕⠝⠰⠞ ⠍⠑⠁⠝ ⠞⠕ ⠎⠁⠹ ⠹⠁⠞ ⡊ ⠅⠝⠪⠂ ⠕⠋ ⠍⠹⠪⠝ ⠅⠝⠪⠇⠫⠛⠑⠂ ⠱⠁⠞ ⠹⠻⠑ ⠊⠎ ⠏⠜⠞⠊⠊⠥⠇⠜⠇⠹ ⠙⠑⠁⠙ ⠁⠃⠳⠞",
    "∮ E⋅da = Q, n → ∞, ∑ f(i) = ∏ g(i), ∀x∈ℝ: ⌈x⌉ = −⌊−x⌋, α ∧ ¬β = ¬(¬α ∨ β), ℕ ⊆ ℕ₀ ⊂ ℤ ⊂ ℚ ⊂ ℝ ⊂ ℂ, ⊥ < a ≠ b ≡ c ≤ d ≪ ⊤ ⇒ (A ⇔ B)",
];

/* Code points 1 through 10000, each encoded as UTF-8 and joined into one string. */
$utf8_code_points = '';
for ($code_point = 1; $code_point <= 10000; $code_point++) {
    $utf8_code_points .= mb_chr($code_point, 'UTF-8');
}

$utf8_long = [
    str_repeat("Σὲ γνωρίζω ἀπὸ τὴν κόψη τοῦ σπαθιοῦ τὴν τρομερή, σὲ γνωρίζω ἀπὸ τὴν ὄψη ποὺ μὲ βία μετράει τὴ γῆ.", 100),
    $utf8_code_points,
    str_repeat("გთხოვთ ახლავე გაიაროთ რეგისტრაცია Unicode-ის მეათე საერთაშორისო კონფერენციაზე დასასწრებად, რომელიც გაიმართება 10-12 მარტს,", 80),
];

/* Converts every string of a list from UTF-8 to UTF-16LE, keeping the order. */
$to_utf16le = static function ($utf8_strings) {
    $converted = [];
    foreach ($utf8_strings as $utf8_string) {
        $converted[] = mb_convert_encoding($utf8_string, 'UTF-16LE', 'UTF-8');
    }
    return $converted;
};

$utf16_short = $to_utf16le($utf8_short);
$utf16_medium = $to_utf16le($utf8_medium);
$utf16_long = $to_utf16le($utf8_long);
/*
 * Japanese test strings. The literals below are UTF-8; the JIS and SJIS sets
 * are produced from them by conversion.
 */
$jp_short = ["こんにちは", "テストのデータ", "漢字も", "123", ""];

$jp_medium = [
    "むかし、むかし、あるところにおじいさんとおばあさんがいました。おじいさんが山へ木をきりにいけば、おばあさんは川へせんたくにでかけます。「おじいさん、はようもどってきなされ。」「おばあさんもきをつけてな。」まい日やさしくいいあってでかけます。",
    "静岡県警によると、現場は静岡市葵区柚木の線路内で、人が立ち入り、走行中の新幹線がはねたとみられるという。県警はJR東海からの通報を受けて現場検証を実施、詳しい状況を調べている。",
    "8千人はっせんにん……、毎年まいとし8千人はっせんにんが全国ぜんこくに81はちじゅういちある大学だいがく医い学がく部ぶを卒そつ業ぎょうしてゆく。君達きみたちはその8千人はっせんにんのトップの80はちじゅう人にん",
];

/* Long set: every medium string repeated 100 times. */
$jp_long = array_map(
    static function ($medium_text) {
        return str_repeat($medium_text, 100);
    },
    $jp_medium
);

/* Converts every string of a list from UTF-8 to $encoding, keeping the order. */
$convert_from_utf8 = static function ($utf8_strings, $encoding) {
    $converted = [];
    foreach ($utf8_strings as $utf8_string) {
        $converted[] = mb_convert_encoding($utf8_string, $encoding, 'UTF-8');
    }
    return $converted;
};

$jis_short = $convert_from_utf8($jp_short, 'JIS');
$jis_medium = $convert_from_utf8($jp_medium, 'JIS');
$jis_long = $convert_from_utf8($jp_long, 'JIS');

$sjis_short = $convert_from_utf8($jp_short, 'SJIS');
$sjis_medium = $convert_from_utf8($jp_medium, 'SJIS');
$sjis_long = $convert_from_utf8($jp_long, 'SJIS');
/**
 * Benchmark mb_detect_encoding() over a set of strings and print one result line.
 *
 * The printed line has the form
 * "commit|mb_detect_encoding|description - mode|seconds|iterations", where
 * seconds is the value returned by benchAll() and mode is "strict" or
 * "non-strict".
 *
 * @param array  $strings   Input strings to run detection on.
 * @param mixed  $encodings Candidate encodings, passed unchanged to mb_detect_encoding().
 * @param string $desc      Label identifying this test in the output.
 * @param bool   $strict    Value for mb_detect_encoding()'s strict argument.
 *                          Defaults to true, the only mode that ran before this
 *                          parameter existed; pass false for the non-strict run.
 */
function benchMBDetectEncoding($strings, $encodings, $desc, $strict = true) {
    $iterations = $GLOBALS['iterations_per_test'];
    $elapsed = benchAll(
        function ($str) use ($encodings, $strict) {
            mb_detect_encoding($str, $encodings, $strict);
        },
        $strings,
        $iterations
    );
    $mode = $strict ? 'strict' : 'non-strict';
    echo "{$GLOBALS['commit']}|mb_detect_encoding|$desc - $mode|$elapsed|$iterations\n";
}
/*
 * Benchmark runs. Every run detects against the same candidate encodings.
 * Only the long JIS case is active; the remaining cases are left commented out.
 */
$candidate_encodings = ['UTF-8', 'UTF-16LE', 'UTF-16BE', 'JIS', 'SJIS'];
/*
benchMBDetectEncoding($utf8_medium, $candidate_encodings, 'UTF-8, medium');
benchMBDetectEncoding($utf8_long, $candidate_encodings, 'UTF-8, long');
benchMBDetectEncoding($utf16_medium, $candidate_encodings, 'UTF-16LE, medium');
benchMBDetectEncoding($utf16_long, $candidate_encodings, 'UTF-16LE, long');
*/
benchMBDetectEncoding($jis_long, $candidate_encodings, 'JIS, long');
/*
benchMBDetectEncoding($sjis_long, $candidate_encodings, 'SJIS, long');
*/
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment