Skip to content

Instantly share code, notes, and snippets.

@rybakit
Created September 27, 2016 16:10
Show Gist options
  • Save rybakit/717018d5f3292ef28fd9de6210f687aa to your computer and use it in GitHub Desktop.
Save rybakit/717018d5f3292ef28fd9de6210f687aa to your computer and use it in GitHub Desktop.
<?php
/**
 * PCRE pattern (extended mode, `x` flag) that matches only when the whole
 * subject, from `\A` to `\z`, is made up of the byte sequences listed in the
 * alternation below. The inline `#` comments label each alternative.
 *
 * Both the ASCII run (`++`) and the outer group (`*+`) use possessive
 * quantifiers, so the engine never backtracks into bytes already consumed.
 * An empty subject also matches, because the group may repeat zero times.
 */
const UTF8_REGEX = '/\A(?:
[\x00-\x7F]++ # ASCII
| [\xC2-\xDF][\x80-\xBF] # non-overlong 2-byte
| \xE0[\xA0-\xBF][\x80-\xBF] # excluding overlongs
| [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2} # straight 3-byte
| \xED[\x80-\x9F][\x80-\xBF] # excluding surrogates
| \xF0[\x90-\xBF][\x80-\xBF]{2} # planes 1-3
| [\xF1-\xF3][\x80-\xBF]{3} # planes 4-15
| \xF4[\x80-\x8F][\x80-\xBF]{2} # plane 16
)*+\z/x';
/**
 * PCRE pattern (extended mode, `x` flag) that matches the first byte or byte
 * pair in the subject that cannot be part of well-formed UTF-8. It is the
 * "search for a violation" counterpart of UTF8_REGEX: preg_match() returns 1
 * when the subject is malformed and 0 when no violation is found.
 *
 * The alternatives for `\xED[\xA0-\xBF]` (UTF-16 surrogates) and
 * `\xF4[\x90-\xBF]` (above U+10FFFF) keep this pattern in agreement with
 * UTF8_REGEX, which rejects both ranges.
 *
 * Do not put the pattern delimiter or a single quote inside the inline `#`
 * comments: they would terminate the pattern or the PHP string early.
 */
const NON_UTF8_REGEX = '/(
[\xC0-\xC1] # Invalid UTF-8 Bytes
| [\xF5-\xFF] # Invalid UTF-8 Bytes
| \xE0[\x80-\x9F] # Overlong encoding of prior code point
| \xF0[\x80-\x8F] # Overlong encoding of prior code point
| \xED[\xA0-\xBF] # UTF-16 surrogate code points (U+D800..U+DFFF)
| \xF4[\x90-\xBF] # Code points above U+10FFFF
| [\xC2-\xDF](?![\x80-\xBF]) # Invalid UTF-8 Sequence Start
| [\xE0-\xEF](?![\x80-\xBF]{2}) # Invalid UTF-8 Sequence Start
| [\xF0-\xF4](?![\x80-\xBF]{3}) # Invalid UTF-8 Sequence Start
| (?<=[\x0-\x7F\xF5-\xFF])[\x80-\xBF] # Invalid UTF-8 Sequence Middle
| (?<![\xC2-\xDF]|[\xE0-\xEF]|[\xE0-\xEF][\x80-\xBF]|[\xF0-\xF4]|[\xF0-\xF4][\x80-\xBF]|[\xF0-\xF4][\x80-\xBF]{2})[\x80-\xBF] # Overlong Sequence
| (?<=[\xE0-\xEF])[\x80-\xBF](?![\x80-\xBF]) # Short 3 byte sequence
| (?<=[\xF0-\xF4])[\x80-\xBF](?![\x80-\xBF]{2}) # Short 4 byte sequence
| (?<=[\xF0-\xF4][\x80-\xBF])[\x80-\xBF](?![\x80-\xBF]) # Short 4 byte sequence (2)
)/x';
/**
 * Benchmark body: applies UTF8_REGEX to the subject $n times.
 *
 * The preg_match() result is discarded; only the elapsed time matters to the caller.
 *
 * @param int    $n   Number of times to run the match.
 * @param string $str Subject string handed to preg_match().
 */
function utf8($n, $str) {
    $iteration = 0;
    while ($iteration < $n) {
        \preg_match(UTF8_REGEX, $str);
        ++$iteration;
    }
}
/**
 * Benchmark body: applies NON_UTF8_REGEX to the subject $n times.
 *
 * The preg_match() result is discarded; only the elapsed time matters to the caller.
 *
 * @param int    $n   Number of times to run the match.
 * @param string $str Subject string handed to preg_match().
 */
function non_utf8($n, $str) {
    $iteration = 0;
    while ($iteration < $n) {
        \preg_match(NON_UTF8_REGEX, $str);
        ++$iteration;
    }
}
/**
 * Baseline for the benchmark: counts to $n without doing any work.
 *
 * It takes the same arguments as the real benchmark bodies; $str is accepted
 * but never read.
 *
 * @param int    $n   Number of loop iterations.
 * @param string $str Unused.
 */
function empty_loop($n, $str) {
    $iteration = 0;
    while ($iteration < $n) {
        ++$iteration;
    }
}
/**
 * Returns the current time of day as a single number of seconds.
 *
 * @return float Whole seconds from gettimeofday() plus its microsecond part as a fraction.
 */
function getmicrotime()
{
    $now = gettimeofday();
    $wholeSeconds = $now['sec'];
    $fraction = $now['usec'] / 1000000;

    return ($wholeSeconds + $fraction);
}
/**
 * Opens the first timed section of the benchmark.
 *
 * An output buffer is started before the clock is read, so anything the
 * benchmarked code prints is captured rather than shown.
 *
 * @return float Timestamp from getmicrotime() marking the start of the section.
 */
function start_test()
{
    ob_start();
    $startedAt = getmicrotime();

    return $startedAt;
}
/**
 * Closes the current timed section, prints its result row and opens the next section.
 *
 * The row is the label, padding up to a 24-character column, and the elapsed
 * seconds with three decimals. When $overhead is given, a second number
 * (elapsed minus overhead) is appended after a space.
 *
 * Side effects: sets the global $last_time to the elapsed time, adds it to the
 * global $total, discards the output buffer that was active on entry and
 * starts a new one before returning.
 *
 * @param float      $start    Timestamp at which the section began.
 * @param string     $name     Label printed at the start of the row.
 * @param float|null $overhead Baseline in seconds to subtract, or null to print one number only.
 *
 * @return float Timestamp marking the start of the next section.
 */
function end_test($start, $name, $overhead = null)
{
    global $total;
    global $last_time;

    $end = getmicrotime();
    // Throw away whatever the benchmarked code printed during the section.
    ob_end_clean();

    $last_time = $end - $start;
    $total += $last_time;

    $num = number_format($last_time, 3);
    // Clamp the padding: a long label or a large timing would otherwise yield a
    // negative repeat count, which str_repeat() rejects (ValueError on PHP 8).
    $pad = str_repeat(' ', max(0, 24 - strlen($name) - strlen($num)));

    if ($overhead === null) {
        echo $name . $pad . $num . "\n";
    } else {
        $num2 = number_format($last_time - $overhead, 3);
        echo $name . $pad . $num . ' ' . $num2 . "\n";
    }

    // Capture the output of the next section, then read the clock last so the
    // printing above is not counted against it.
    ob_start();
    return getmicrotime();
}
/**
 * Prints the summary footer: a 24-character dashed rule followed by a
 * "Total" row holding the global $total formatted with three decimals.
 *
 * If nothing has been accumulated yet ($total is unset or null), the total is
 * reported as 0.000.
 */
function total()
{
    global $total;

    echo str_repeat('-', 24) . "\n";

    // `global` creates $total as null when no section has been timed yet;
    // normalise it so number_format() always receives a float.
    $num = number_format((float) ($total ?? 0.0), 3);
    // Clamp the padding so an over-wide number cannot produce a negative
    // repeat count, which str_repeat() rejects (ValueError on PHP 8).
    $pad = str_repeat(' ', max(0, 24 - strlen('Total') - strlen($num)));
    echo 'Total' . $pad . $num . "\n";
}
// Benchmark driver: times an empty loop as a baseline, then each regex
// function, and prints one row per section followed by the total.

// Number of iterations passed to every benchmarked function.
const N = 500000;
// Subject string: the ASCII letter "c" repeated 65535 times.
$str = str_repeat('c', 65535);
// Start the first timed section. $t carries the start timestamp from one
// section to the next; $t0 keeps the initial one but is not read again in the
// code visible here.
$t0 = $t = start_test();
empty_loop(N, $str);
$t = end_test($t, 'empty_loop');
// end_test() stored the elapsed time of the empty loop in the global
// $last_time; keep it as the overhead reported against the later rows.
$overhead = $last_time;
utf8(N, $str);
$t = end_test($t, 'utf8', $overhead);
non_utf8(N, $str);
$t = end_test($t, 'non_utf8', $overhead);
total();
$ php -d pcre.jit=1 tests/bench_utf8_regex.php
empty_loop 0.004
utf8 14.833 14.829
non_utf8 14.713 14.709
------------------------
Total 29.550
$ php -d pcre.jit=0 tests/bench_utf8_regex.php
empty_loop 0.004
utf8 81.329 81.325
^C
$ php -v
PHP 7.0.5 (cli) (built: Apr 23 2016 10:48:01) ( NTS )
Copyright (c) 1997-2016 The PHP Group
Zend Engine v3.0.0, Copyright (c) 1998-2016 Zend Technologies
with Zend OPcache v7.0.6-dev, Copyright (c) 1999-2016, by Zend Technologies
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment