Skip to content

Instantly share code, notes, and snippets.

@rybakit
Last active September 12, 2019 08:18
Show Gist options
  • Save rybakit/2c75152577fdcb9f4718d44e7123a539 to your computer and use it in GitHub Desktop.
Save rybakit/2c75152577fdcb9f4718d44e7123a539 to your computer and use it in GitHub Desktop.
<?php
// Hand-written UTF-8 validator. The pattern is anchored with \A ... \z, so it
// only matches when the *entire* subject is a sequence of the byte groups
// listed below (ASCII, 2-, 3- and 4-byte sequences, with overlong encodings and
// the surrogate range excluded). The `x` flag makes PCRE ignore the whitespace
// and the `# ...` comments inside the literal; the possessive quantifiers
// (`++`, `*+`) keep the engine from backtracking into already-consumed bytes.
const UTF8_REGEX = '/\A(?:
[\x00-\x7F]++ # ASCII
| [\xC2-\xDF][\x80-\xBF] # non-overlong 2-byte
| \xE0[\xA0-\xBF][\x80-\xBF] # excluding overlongs
| [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2} # straight 3-byte
| \xED[\x80-\x9F][\x80-\xBF] # excluding surrogates
| \xF0[\x90-\xBF][\x80-\xBF]{2} # planes 1-3
| [\xF1-\xF3][\x80-\xBF]{3} # planes 4-15
| \xF4[\x80-\x8F][\x80-\xBF]{2} # plane 16
)*+\z/x';
// Empty pattern with only the `u` modifier: the competing approach, which
// relies on PCRE's own UTF-8 check of the subject (see the PHP pattern
// modifier documentation) instead of an explicit byte-level pattern.
const UTF8_U = '//u';
/**
 * Benchmark body: applies the explicit UTF8_REGEX pattern to a subject
 * repeatedly. The match result is discarded; callers only time the call.
 *
 * @param int    $n   how many preg_match() calls to make
 * @param string $str subject string handed to every call
 */
function utf8_regex($n, $str)
{
    $done = 0;
    while ($done < $n) {
        \preg_match(UTF8_REGEX, $str);
        ++$done;
    }
}
/**
 * Benchmark body: applies the UTF8_U pattern (empty pattern, `u` modifier)
 * to a subject repeatedly. The match result is discarded; callers only time
 * the call.
 *
 * @param int    $n   how many preg_match() calls to make
 * @param string $str subject string handed to every call
 */
function utf8_u($n, $str)
{
    $done = 0;
    while ($done < $n) {
        \preg_match(UTF8_U, $str);
        ++$done;
    }
}
/**
 * Baseline for the benchmark: spins through the same number of iterations as
 * the real test bodies without doing any work, so the bare loop cost can be
 * measured on its own.
 *
 * @param int    $n   number of iterations
 * @param string $str unused; kept so the signature mirrors the real test bodies
 */
function empty_loop($n, $str)
{
    $done = 0;
    while ($done < $n) {
        ++$done;
    }
}
/**
 * Returns the current wall-clock time in seconds, with the microsecond part
 * folded in as a fraction.
 *
 * @return float|int seconds since the Unix epoch
 */
function getmicrotime()
{
    ['sec' => $seconds, 'usec' => $microseconds] = \gettimeofday();

    return $seconds + $microseconds / 1000000;
}
/**
 * Opens the first timing window: starts an output buffer (so anything printed
 * by the code under test can be discarded later) and returns the start time.
 *
 * @return float|int timestamp from getmicrotime()
 */
function start_test()
{
    \ob_start();
    $startedAt = getmicrotime();

    return $startedAt;
}
/**
 * Closes the current timing window, prints one result row, and opens the
 * next window.
 *
 * Side effects: stores the elapsed time in the global $last_time, adds it to
 * the global $total, discards the active output buffer and starts a new one.
 *
 * @param float|int      $start    timestamp returned by start_test() or a previous end_test()
 * @param string         $name     row label
 * @param float|int|null $overhead when given, a second column with the elapsed
 *                                 time minus this value is printed
 *
 * @return float|int start timestamp for the next measurement
 */
function end_test($start, $name, $overhead = null)
{
    global $total;
    global $last_time;

    $end = getmicrotime();
    // Drop whatever was buffered during the measured section so it does not
    // mix with the result table.
    \ob_end_clean();

    $last_time = $end - $start;
    $total += $last_time;

    $num = \number_format($last_time, 3);

    // Right-align the time in a 24-character column. A long label or a large
    // time can make the remaining width negative, and str_repeat() rejects a
    // negative count (warning on PHP 7, ValueError on PHP 8), so fall back to
    // no padding in that case.
    $width = 24 - \strlen($name) - \strlen($num);
    $pad = $width > 0 ? \str_repeat(' ', $width) : '';

    if (null === $overhead) {
        echo $name.$pad.$num."\n";
    } else {
        $num2 = \number_format($last_time - $overhead, 3);
        echo $name.$pad.$num.' '.$num2."\n";
    }

    // Re-open the buffer for the next measured section and restart the clock
    // only after the row has been printed.
    \ob_start();

    return getmicrotime();
}
/**
 * Prints the table footer: a 24-character dashed rule followed by the
 * accumulated global $total, right-aligned to the same 24-character width.
 */
function total()
{
    global $total;

    echo \str_repeat('-', 24), "\n";

    $label = 'Total';
    $formatted = \number_format($total, 3);
    $gap = \str_repeat(' ', 24 - \strlen($label) - \strlen($formatted));

    echo $label, $gap, $formatted, "\n";
}
// Number of preg_match() calls per measurement.
const N = 200000;

// Valid subjects: pure ASCII in three sizes (100, 1000 and 10000 bytes).
// Each larger string is built by repeating the previous one ten times.
$str1_s = \str_repeat('c', 100);
$str1_m = \str_repeat($str1_s, 10);
$str1_l = \str_repeat($str1_m, 10);

// Invalid subjects: the same three sizes with a single stray "\x80" byte
// placed just before the midpoint.
$str2_s = \str_repeat('c', 49)."\x80".\str_repeat('c', 50);
$str2_m = \str_repeat('c', 499)."\x80".\str_repeat('c', 500);
$str2_l = \str_repeat('c', 4999)."\x80".\str_repeat('c', 5000);
// Calibration run: time a loop that does nothing so the bare iteration cost
// can be subtracted from every later measurement.
$t0 = $t = start_test();
// empty_loop() ignores its string argument; pass a defined fixture rather
// than the undefined $str the script used to reference here.
empty_loop(N, $str1_s);
$t = end_test($t, 'empty_loop');
// end_test() publishes the elapsed time of the run it just closed in $last_time.
$overhead = $last_time;
// Step out of the buffer that end_test() left open so the section header is
// actually printed, then re-open it for the measured code.
\ob_end_clean();
echo "\nutf8:\n\n";
\ob_start();

// Valid subjects, smallest to largest; each is timed with the explicit
// pattern first and the `u`-modifier pattern second.
foreach ([
    ['S regex', 'S //u', $str1_s],
    ['M regex', 'M //u', $str1_m],
    ['L regex', 'L //u', $str1_l],
] as [$regexLabel, $uLabel, $subject]) {
    utf8_regex(N, $subject);
    $t = end_test($t, $regexLabel, $overhead);
    utf8_u(N, $subject);
    $t = end_test($t, $uLabel, $overhead);
}
// Step out of the buffer that end_test() left open so the section header is
// actually printed, then re-open it for the measured code.
\ob_end_clean();
echo "\nnon-utf8:\n\n";
\ob_start();

// Subjects containing one invalid byte, smallest to largest; each is timed
// with the explicit pattern first and the `u`-modifier pattern second.
foreach ([
    ['S regex', 'S //u', $str2_s],
    ['M regex', 'M //u', $str2_m],
    ['L regex', 'L //u', $str2_l],
] as [$regexLabel, $uLabel, $subject]) {
    utf8_regex(N, $subject);
    $t = end_test($t, $regexLabel, $overhead);
    utf8_u(N, $subject);
    $t = end_test($t, $uLabel, $overhead);
}

// Footer with the accumulated time of every measured section. It is written
// into the buffer the last end_test() opened; nothing below discards it.
total();
$ php -n -d pcre.jit=1 bench_utf8_regex_vs_u.php
empty_loop 0.003
utf8:
S regex 0.036 0.033
S //u 0.055 0.052
M regex 0.120 0.117
M //u 0.215 0.212
L regex 0.928 0.925
L //u 1.843 1.840
non-utf8:
S regex 0.030 0.027
S //u 0.023 0.021
M regex 0.072 0.069
M //u 0.106 0.103
L regex 0.475 0.472
L //u 0.932 0.929
------------------------
Total 4.837
$ php -n -d pcre.jit=0 bench_utf8_regex_vs_u.php
empty_loop 0.004
utf8:
S regex 0.094 0.090
S //u 0.061 0.057
M regex 0.306 0.302
M //u 0.222 0.218
L regex 2.490 2.486
L //u 1.843 1.839
non-utf8:
S regex 0.079 0.076
S //u 0.023 0.019
M regex 0.189 0.186
M //u 0.107 0.103
L regex 1.271 1.267
L //u 0.915 0.911
------------------------
Total 7.604
$ php -n -v
PHP 7.3.8 (cli) (built: Jul 30 2019 09:26:16) ( NTS )
Copyright (c) 1997-2018 The PHP Group
Zend Engine v3.3.8, Copyright (c) 1998-2018 Zend Technologies
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment