Skip to content

Instantly share code, notes, and snippets.

@lifthrasiir
Created November 18, 2010 07:59
Show Gist options
  • Star 3 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save lifthrasiir/704754 to your computer and use it in GitHub Desktop.
Save lifthrasiir/704754 to your computer and use it in GitHub Desktop.
a drop-in replacement for ereg* functions in PHP
<?php
// ere2pcre -- a drop-in replacement for ereg* functions
// written by Kang Seonghoon <public+ere2pcre@mearie.org>.
//
// this library is dedicated to the public domain. for jurisdictions that
// do not recognize the public domain, the CC0 1.0 Universal Public Domain
// Dedication applies.
// quotes the single character $c so that it stands for itself in the
// generated PCRE: characters from the metacharacter list below (which also
// holds "-" and the "/" delimiter) get a backslash prefix, anything else is
// returned as is. a null byte is reported via trigger_error(E_USER_ERROR).
function _ere2pcre_escape($c) {
    if ($c == "\0") {
        trigger_error('ere2pcre: a literal null byte in the regex', E_USER_ERROR);
        return; // only reached when an error handler lets execution go on
    }
    $special = (strpos('\^$.[]|()?*+{}-/', $c) !== false);
    return $special ? "\\".$c : $c;
}
// recursively converts the ERE $s into PCRE syntax, starting at offset $i.
//
// parsing runs until the end of $s or until a ")" that was not opened at this
// nesting level. returns array($pcre, $pos): $pcre is the converted fragment
// (its branches joined with "|", no delimiters) and $pos is the offset where
// parsing stopped, which lets the caller tell a closing ")" from the end of
// the input. every syntax error is reported via trigger_error(E_USER_ERROR).
function _ere2pcre($s, $i) {
    // POSIX character classes and their spelling inside a PCRE bracket
    // expression. names missing here (as well as collating symbols and
    // equivalence classes) are rejected as unsupported.
    static $cclsmap = array(
        ':alnum:' => '[:alnum:]',
        ':alpha:' => '[:alpha:]',
        ':blank:' => '[:blank:]',
        ':cntrl:' => '[:cntrl:]',
        ':digit:' => '\d',
        ':graph:' => '[:graph:]',
        ':lower:' => '[:lower:]',
        ':print:' => '[:print:]',
        ':punct:' => '[:punct:]',
        ':space:' => '\013\s', // should include VT
        ':upper:' => '[:upper:]',
        ':xdigit:' => '[:xdigit:]',
    );

    $r = array(''); // converted branches; $r[$rr] is the one being built
    $rr = 0;
    $l = strlen($s);
    while ($i < $l) {
        // atom
        $c = $s[$i];
        if ($c == '(') {
            if ($i + 1 < $l && $s[$i+1] == ')') { // special case
                $r[$rr] .= '()';
                ++$i;
            } else {
                list($t, $ii) = _ere2pcre($s, $i + 1);
                if ($ii >= $l || $s[$ii] != ')') {
                    trigger_error('ere2pcre: "(" does not have a matching ")"',
                                  E_USER_ERROR);
                }
                $r[$rr] .= '('.$t.')';
                $i = $ii; // rests on ")"; the shared ++$i below skips it
            }
        } elseif ($c == '[') {
            ++$i;
            $cls = '';
            if ($i < $l && $s[$i] == '^') {
                $cls .= '^';
                ++$i;
            }
            if ($i >= $l) {
                trigger_error('ere2pcre: "[" does not have a matching "]"',
                              E_USER_ERROR);
            }
            // do-while: the first member is taken unconditionally, so a
            // leading "]" is a literal. every exit from the loop must leave
            // $i on the closing "]" (or at the end of input on error).
            $start = true;
            do {
                if ($s[$i] == '[' &&
                    $i+1 < $l && strpos('.=:', $s[$i+1]) !== false) {
                    $ii = strpos($s, ']', $i);
                    if ($ii === false) {
                        trigger_error('ere2pcre: "[" does not have a matching '.
                                      '"]"', E_USER_ERROR);
                    }
                    $ccls = substr($s, $i+1, $ii-($i+1));
                    if (!isset($cclsmap[$ccls])) {
                        trigger_error('ere2pcre: an invalid or unsupported '.
                                      'character class ['.$ccls.']',
                                      E_USER_ERROR);
                    }
                    $cls .= $cclsmap[$ccls];
                    $i = $ii + 1;
                } else {
                    $a = $s[$i++];
                    if ($a === '-' && !$start && !($i < $l && $s[$i] == ']')) {
                        trigger_error('ere2pcre: "-" is invalid for the start '.
                                      'character in the brackets',
                                      E_USER_ERROR);
                    }
                    if ($i < $l && $s[$i] === '-') {
                        ++$i;
                        if ($i >= $l) {
                            // the input ends right after "x-": leave the loop
                            // and let the check below report the missing "]"
                            // instead of reading past the end of $s.
                            break;
                        }
                        $b = $s[$i++];
                        if ($b == ']') {
                            // "x-]": the hyphen is a literal. step back onto
                            // the "]" so that the code after the loop sees
                            // the same position as for any other exit.
                            $cls .= _ere2pcre_escape($a).'\-';
                            --$i;
                            break;
                        } elseif (ord($a) > ord($b)) {
                            trigger_error('ere2pcre: an invalid character '.
                                          'range "'.$a.'-'.$b.'"',
                                          E_USER_ERROR);
                        }
                        $cls .= _ere2pcre_escape($a).'-'._ere2pcre_escape($b);
                    } else {
                        $cls .= _ere2pcre_escape($a);
                    }
                }
                $start = false;
            } while ($i < $l && $s[$i] != ']');
            if ($i >= $l) {
                trigger_error('ere2pcre: "[" does not have a matching "]"',
                              E_USER_ERROR);
            }
            $r[$rr] .= '['.$cls.']';
        } elseif ($c == ')') {
            break; // belongs to the caller (or is unbalanced at the top level)
        } elseif ($c == '*' || $c == '+' || $c == '?') {
            trigger_error('ere2pcre: unescaped metacharacter "'.$c.'"',
                          E_USER_ERROR);
        } elseif ($c == '{') {
            // "{" is an ordinary character unless a digit follows; with a
            // digit it would start a bound that has nothing to repeat.
            if ($i + 1 < $l && strpos('0123456789', $s[$i+1]) !== false) {
                trigger_error('ere2pcre: unescaped metacharacter "'.$c.'"',
                              E_USER_ERROR);
            } else {
                $r[$rr] .= '\{';
            }
        } elseif ($c == '.') {
            $r[$rr] .= $c;
        } elseif ($c == '^' || $c == '$') {
            // anchors cannot take a repetition, so skip the piece handling
            $r[$rr] .= $c;
            ++$i;
            continue;
        } elseif ($c == '|') {
            if ($r[$rr] === '') {
                trigger_error('ere2pcre: empty branch', E_USER_ERROR);
            }
            $r[] = '';
            ++$rr;
            ++$i;
            continue;
        } elseif ($c == "\\") {
            if (++$i >= $l) {
                trigger_error('ere2pcre: an invalid escape sequence at the end',
                              E_USER_ERROR);
            }
            $r[$rr] .= _ere2pcre_escape($s[$i]);
        } else { // including ] and } which are allowed as a literal character
            $r[$rr] .= _ere2pcre_escape($c);
        }
        ++$i;
        if ($i >= $l) break;

        // piece after the atom (only ONE of them is possible)
        $c = $s[$i];
        if ($c == '*' || $c == '+' || $c == '?') {
            $r[$rr] .= $c;
            ++$i;
        } elseif ($c == '{' &&
                  $i + 1 < $l && strpos('0123456789', $s[$i+1]) !== false) {
            // "{" opens a bound only when a digit follows; any other "{" is
            // left for the atom branch above, which takes it as a literal.
            $ii = strpos($s, '}', $i);
            if ($ii === false) {
                trigger_error('ere2pcre: "{" does not have a matching "}"',
                              E_USER_ERROR);
            }
            $bound = substr($s, $i+1, $ii-($i+1));
            // "n", "n," or "n,m" with every number within 0..255
            if (!preg_match('/^([0-9]|[1-9][0-9]|1[0-9][0-9]|
                                2[0-4][0-9]|25[0-5])
                              (,([0-9]|[1-9][0-9]|1[0-9][0-9]|
                                 2[0-4][0-9]|25[0-5])?)?$/x',
                            $bound, $m)) {
                trigger_error('ere2pcre: an invalid bound', E_USER_ERROR);
            }
            if (isset($m[3])) {
                if ($m[1] > $m[3]) {
                    trigger_error('ere2pcre: an invalid bound', E_USER_ERROR);
                }
                $r[$rr] .= '{'.$m[1].','.$m[3].'}';
            } elseif (isset($m[2])) {
                $r[$rr] .= '{'.$m[1].',}';
            } else {
                $r[$rr] .= '{'.$m[1].'}';
            }
            $i = $ii + 1;
        }
    }
    if ($r[$rr] === '') {
        trigger_error('ere2pcre: empty regular expression or branch',
                      E_USER_ERROR);
    }
    return array(implode('|', $r), $i);
}
// converts the ERE $s into the PCRE $r. triggers error on any invalid input.
//
// the result is a complete pattern with "/" delimiters and modifiers, matching
// case-insensitively when $ignorecase is true. conversions are memoised per
// input string, separately for the two case modes.
function ere2pcre($s, $ignorecase) {
    static $cache = array(), $icache = array();
    if ($ignorecase) {
        if (isset($icache[$s])) return $icache[$s];
    } else {
        if (isset($cache[$s])) return $cache[$s];
    }
    list($r, $i) = _ere2pcre($s, 0);
    if ($i != strlen($s)) {
        // the parser stops early only on a ")" that no "(" has opened
        trigger_error('ere2pcre: unescaped metacharacter ")"', E_USER_ERROR);
    }
    // the ereg functions compile without REG_NEWLINE, i.e. a newline is an
    // ordinary character: "^" and "$" match only at the very ends of the
    // subject and "." matches a newline too. in PCRE terms that is no "m",
    // plus "s" (dot matches all) and "D" ("$" only at the very end).
    if ($ignorecase) {
        return ($icache[$s] = '/'.$r.'/sDi');
    } else {
        return ($cache[$s] = '/'.$r.'/sD');
    }
}
// drop-in replacement for ereg.
//
// matches the ERE $r against $s, case-sensitively. when a third argument is
// passed, $m receives the matches array filled in by preg_match().
// returns false when there is no match. otherwise returns 1, or, when $m was
// requested, the byte length of the whole match, but never less than 1: ereg
// reported 1 for an empty match, so success is always truthy.
function myereg($r, $s, &$m = null) {
    $r = ere2pcre($r, false);
    if (func_num_args() > 2) { // fix the result
        if (!preg_match($r, $s, $m)) {
            return false;
        }
        return max(1, strlen($m[0]));
    } else {
        return (preg_match($r, $s) ? 1 : false);
    }
}
// drop-in replacement for eregi.
//
// matches the ERE $r against $s, ignoring case. when a third argument is
// passed, $m receives the matches array filled in by preg_match().
// returns false when there is no match. otherwise returns 1, or, when $m was
// requested, the byte length of the whole match, but never less than 1: eregi
// reported 1 for an empty match, so success is always truthy.
function myeregi($r, $s, &$m = null) {
    $r = ere2pcre($r, true);
    if (func_num_args() > 2) { // fix the result
        if (!preg_match($r, $s, $m)) {
            return false;
        }
        return max(1, strlen($m[0]));
    } else {
        return (preg_match($r, $s) ? 1 : false);
    }
}
// drop-in replacement for ereg_replace.
//
// replaces every match of the ERE $r (case-sensitive) in $s by $t and returns
// what preg_replace() returns.
// ereg_replace only knew the back-references "\0".."\9" in $t; everything
// else, "$" and other backslashes included, was literal text. a string $t is
// therefore rewritten before it is handed to preg_replace(), which would
// otherwise also interpret "$n", "${n}", "\\" and two-digit "\nn".
// (a reference to a group the pattern does not have expands to nothing.)
function myereg_replace($r, $t, $s) {
    if (is_string($t)) {
        $pt = '';
        $n = strlen($t);
        for ($k = 0; $k < $n; ++$k) {
            $ch = $t[$k];
            if ($ch === '\\' && $k + 1 < $n &&
                strpos('0123456789', $t[$k+1]) !== false) {
                // "\d" becomes "${d}" so that a digit following it is not
                // absorbed into the group number.
                $pt .= '${'.$t[++$k].'}';
            } elseif ($ch === '\\' || $ch === '$') {
                $pt .= '\\'.$ch; // literal: shield it from preg_replace()
            } else {
                $pt .= $ch;
            }
        }
        $t = $pt;
    }
    return preg_replace(ere2pcre($r, false), $t, $s);
}
// drop-in replacement for eregi_replace.
//
// replaces every match of the ERE $r (ignoring case) in $s by $t and returns
// what preg_replace() returns.
// eregi_replace only knew the back-references "\0".."\9" in $t; everything
// else, "$" and other backslashes included, was literal text. a string $t is
// therefore rewritten before it is handed to preg_replace(), which would
// otherwise also interpret "$n", "${n}", "\\" and two-digit "\nn".
// (a reference to a group the pattern does not have expands to nothing.)
function myeregi_replace($r, $t, $s) {
    if (is_string($t)) {
        $pt = '';
        $n = strlen($t);
        for ($k = 0; $k < $n; ++$k) {
            $ch = $t[$k];
            if ($ch === '\\' && $k + 1 < $n &&
                strpos('0123456789', $t[$k+1]) !== false) {
                // "\d" becomes "${d}" so that a digit following it is not
                // absorbed into the group number.
                $pt .= '${'.$t[++$k].'}';
            } elseif ($ch === '\\' || $ch === '$') {
                $pt .= '\\'.$ch; // literal: shield it from preg_replace()
            } else {
                $pt .= $ch;
            }
        }
        $t = $pt;
    }
    return preg_replace(ere2pcre($r, true), $t, $s);
}
// drop-in replacement for split.
//
// splits $s on matches of the ERE $r (case-sensitive) via preg_split().
// $l is the limit and defaults to -1; a limit of 0 is handed on as 1.
function mysplit($r, $s, $l=-1) {
    $pattern = ere2pcre($r, false);
    if ($l == 0) {
        $l = 1;
    }
    return preg_split($pattern, $s, $l);
}
// drop-in replacement for spliti.
//
// splits $s on matches of the ERE $r (ignoring case) via preg_split().
// $l is the limit and defaults to -1; a limit of 0 is handed on as 1.
function myspliti($r, $s, $l=-1) {
    $pattern = ere2pcre($r, true);
    if ($l == 0) {
        $l = 1;
    }
    return preg_split($pattern, $s, $l);
}
// some test cases.
//
// prints each sample ERE next to its case-sensitive conversion, two
// left-aligned 40-column fields per line, followed by a long URL-matching
// expression and its conversion on two separate lines. nothing is asserted;
// the output is meant to be inspected by eye.
function ere2pcre_test() {
    $samples = array(
        'mearie.org',
        'mearie\.org',
        'mearie[.,]org',
        '[a-z]+[.,][a-z]+',
        '^[a-z]+[.,][a-z]+$',
        '^[a-z]+[.,][a-z]{3,}$',
        'a|b|(c|d)|e',
        'a|b|()|c',
        '[[:alnum:][:punct:]]',
        '[]-z]',
        '[[a]]',
        '[---]',
        '[a\z]',
        '[^^]',
        '^$^$^$^$',
        '\([^>]*\"?[^)]*\)',
    );
    foreach ($samples as $ere) {
        $pcre = ere2pcre($ere, false);
        printf("%-40s%-40s\n", $ere, $pcre);
    }

    $url = "^(http(s?):\/\/|ftp:\/\/)*([[:alpha:]][-[:alnum:]]*[[:alnum:]])".
           "(\.[[:alpha:]][-[:alnum:]]*[[:alpha:]])+(/[[:alpha:]][-[:alnum:]]*".
           "[[:alnum:]])*(\/?)(/[[:alpha:]][-[:alnum:]]*\.[[:alpha:]]{3,5})?".
           "(\?([[:alnum:]][-_%[:alnum:]]*=[-_%[:alnum:]]+)(&([[:alnum:]]".
           "[-_%[:alnum:]]*=[-_%[:alnum:]]+))*)?$";
    $converted = ere2pcre($url, false);
    printf("%s\n%s\n", $url, $converted);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment