Skip to content

Instantly share code, notes, and snippets.

@imme-emosol
Created December 5, 2010 09:37
Show Gist options
  • Save imme-emosol/728973 to your computer and use it in GitHub Desktop.
Save imme-emosol/728973 to your computer and use it in GitHub Desktop.
uri checkers testbed in php
<?php
/*
* Note: You'll need PHP 5.3 or newer to run this script (it uses anonymous functions)!
*/
/*
* Patterns originate from http://mathiasbynens.be/demo/url-regex
*
* Note: None of the patterns had the S-modifier. I added it to speed up the tests.
* When patterns are used repeatedly, /S can improve performance :)
*/
$patterns = array(
'spoon' => '/(((http|ftp|https):\/{2})+(([0-9a-z_-]+\.)+(aero|asia|biz|cat|com|coop|edu|gov|info|int|jobs|mil|mobi|museum|name|net|org|pro|tel|travel|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cu|cv|cx|cy|cz|cz|de|dj|dk|dm|do|dz|ec|ee|eg|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mn|mn|mo|mp|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|nom|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ra|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sj|sk|sl|sm|sn|so|sr|st|su|sv|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw|arpa)(:[0-9]+)?((\/([~0-9a-zA-Z\#\+\%@\.\/_-]+))?(\?[0-9a-zA-Z\+\%@\/&\[\];=_-]+)?)?))\b/imuS',
'krijnhoetmer' => '_(^|[\s.:;?\-\]<\(])(https?://[-\w;/?:@&=+$\|\_.!~*\|\'()\[\]%#,☺]+[\w/#](\(\))?)(?=$|[\s\',\|\(\).:;?\-\[\]>\)])_iS',
'gruber' => '#\b(([\w-]+://?|www[.])[^\s()<>]+(?:\([\w\d]+\)|([^[:punct:]\s]|/)))#iS',
'gruber revised' => "#(?i)\b((?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))#iS",
'cowboy' => '~(?:\b[a-z\d.-]+://[^<>\s]+|\b(?:(?:(?:[^\s!@#$%^&*()_=+[\]{}\|;:\'",.<>/?]+)\.)+(?:ac|ad|aero|ae|af|ag|ai|al|am|an|ao|aq|arpa|ar|asia|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|biz|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|cat|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|coop|com|co|cr|cu|cv|cx|cy|cz|de|dj|dk|dm|do|dz|ec|edu|ee|eg|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gov|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|info|int|in|io|iq|ir|is|it|je|jm|jobs|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mil|mk|ml|mm|mn|mobi|mo|mp|mq|mr|ms|mt|museum|mu|mv|mw|mx|my|mz|name|na|nc|net|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|org|pa|pe|pf|pg|ph|pk|pl|pm|pn|pro|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|sl|sm|sn|so|sr|st|su|sv|sy|sz|tc|td|tel|tf|tg|th|tj|tk|tl|tm|tn|to|tp|travel|tr|tt|tv|tw|tz|ua|ug|uk|um|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|xn--0zwm56d|xn--11b5bs3a9aj6g|xn--80akhbyknj4f|xn--9t4b11yi5a|xn--deba0ad|xn--g6w251d|xn--hgbk6aj7f53bba|xn--hlcj6aya9esc7a|xn--jxalpdlp|xn--kgbechtv|xn--zckzah|ye|yt|yu|za|zm|zw)|(?:(?:[0-9]|[1-9]\d|1\d{2}|2[0-4]\d|25[0-5])\.){3}(?:[0-9]|[1-9]\d|1\d{2}|2[0-4]\d|25[0-5]))(?:[;/][^#?<>\s]*)?(?:\?[^#<>\s]*)?(?:#[^<>\s]*)?(?!\w))~iS',
// jeffrey friedl
// NOTE(review): the listed pattern had a blank in front of `com\b` - presumably a leftover from an
// extended-mode (/x) layout. This string is compiled without /x, so the blank was matched literally and
// the ".com" alternative could never match (a scheme-less "example.com" was not recognised at all).
// The blank is removed here; everything else is unchanged.
'jeffrey friedl' => '@\b((ftp|https?)://[-\w]+(\.\w[-\w]*)+|(?:[a-z0-9](?:[-a-z0-9]*[a-z0-9])?\.)+(?:com\b|edu\b|biz\b|gov\b|in(?:t|fo)\b|mil\b|net\b|org\b|[a-z][a-z]\b))(\:\d+)?(/[^.!,?;"\'<>()\[\]{}\s\x7F-\xFF]*(?:[.!,?]+[^.!,?;"\'<>()\[\]{}\s\x7F-\xFF]+)*)?@iS',
'mattfarina' => '/^([a-z][a-z0-9\*\-\.]*):\/\/(?:(?:(?:[\w\.\-\+!$&\'\(\)*\+,;=]|%[0-9a-f]{2})+:)*(?:[\w\.\-\+%!$&\'\(\)*\+,;=]|%[0-9a-f]{2})+@)?(?:(?:[a-z0-9\-\.]|%[0-9a-f]{2})+|(?:\[(?:[0-9a-f]{0,4}:)*(?:[0-9a-f]{0,4})\]))(?::[0-9]+)?(?:[\/|\?](?:[\w#!:\.\?\+=&@!$\'~*,;\/\(\)\[\]\-]|%[0-9a-f]{2})*)?$/xiS',
'stephenhay' => '@^[httprads:]*\/\/[^$.?#].*$@iS',
'stephenhay revised' => '@^[hftps]*:\/\/[^/$.?#].[^\s]*$@iS',
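// hint: JavaScript writes code points as \uXXXX, while PCRE (PHP) needs \x{XXXX} together with the /u modifier.
// A bare \x consumes at most two hex digits, so e.g. \x00A0 below means NUL followed by the literal "A0".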
'scottgonzales' => '#([a-z]([a-z]|\d|\+|-|\.)*):(\/\/(((([a-z]|\d|-|\.|_|~|[\x00A0-\xD7FF\xF900-\xFDCF\xFDF0-\xFFEF])|(%[\da-f]{2})|[!\$&\'\(\)\*\+,;=]|:)*@)?((\[(|(v[\da-f]{1,}\.(([a-z]|\d|-|\.|_|~)|[!\$&\'\(\)\*\+,;=]|:)+))\])|((\d|[1-9]\d|1\d\d|2[0-4]\d|25[0-5])\.(\d|[1-9]\d|1\d\d|2[0-4]\d|25[0-5])\.(\d|[1-9]\d|1\d\d|2[0-4]\d|25[0-5])\.(\d|[1-9]\d|1\d\d|2[0-4]\d|25[0-5]))|(([a-z]|\d|-|\.|_|~|[\x00A0-\xD7FF\xF900-\xFDCF\xFDF0-\xFFEF])|(%[\da-f]{2})|[!\$&\'\(\)\*\+,;=])*)(:\d*)?)(\/(([a-z]|\d|-|\.|_|~|[\x00A0-\xD7FF\xF900-\xFDCF\xFDF0-\xFFEF])|(%[\da-f]{2})|[!\$&\'\(\)\*\+,;=]|:|@)*)*|(\/((([a-z]|\d|-|\.|_|~|[\x00A0-\xD7FF\xF900-\xFDCF\xFDF0-\xFFEF])|(%[\da-f]{2})|[!\$&\'\(\)\*\+,;=]|:|@)+(\/(([a-z]|\d|-|\.|_|~|[\x00A0-\xD7FF\xF900-\xFDCF\xFDF0-\xFFEF])|(%[\da-f]{2})|[!\$&\'\(\)\*\+,;=]|:|@)*)*)?)|((([a-z]|\d|-|\.|_|~|[\x00A0-\xD7FF\xF900-\xFDCF\xFDF0-\xFFEF])|(%[\da-f]{2})|[!\$&\'\(\)\*\+,;=]|:|@)+(\/(([a-z]|\d|-|\.|_|~|[\x00A0-\xD7FF\xF900-\xFDCF\xFDF0-\xFFEF])|(%[\da-f]{2})|[!\$&\'\(\)\*\+,;=]|:|@)*)*)|((([a-z]|\d|-|\.|_|~|[\x00A0-\xD7FF\xF900-\xFDCF\xFDF0-\xFFEF])|(%[\da-f]{2})|[!\$&\'\(\)\*\+,;=]|:|@)){0})(\?((([a-z]|\d|-|\.|_|~|[\x00A0-\xD7FF\xF900-\xFDCF\xFDF0-\xFFEF])|(%[\da-f]{2})|[!\$&\'\(\)\*\+,;=]|:|@)|[\xE000-\xF8FF]|\/|\?)*)?(\#((([a-z]|\d|-|\.|_|~|[\x00A0-\xD7FF\xF900-\xFDCF\xFDF0-\xFFEF])|(%[\da-f]{2})|[!\$&\'\(\)\*\+,;=]|:|@)|\/|\?)*)?#iS',
'rodneyrehm' => '#((https?://|ftp://|www\.|[^\s:=]+@www\.).*?[a-z_\/0-9\-\#=&])(?=(\.|,|;|\?|\!)?("|\'|«|»|\[|\s|\r|\n|$))#iS',
'rodneyrehm revised' => '#(([a-z]+://|www\.|[^\s:=]+@www\.)([^/].*?[a-z0-9].*?)([a-z_\/0-9\-\#=&]|))(?=[\.,;\?\!]?(["\'«»\[\s\r\n]|$))#iS',
'imme_emosol' => '@(https?|ftp|torrent|image|irc)://(-\.)?([^\s/?\.#-]+\.?)+(/[^\s]*)?$@iS' ,
'imme_emosol ht-&f-tp(s)' => '@(https?|ftp)://(-\.)?([^\s/?\.#-]+\.?)+(/[^\s]*)?$@iS' ,
// Adapter so filter_var() can be run through the same tests as the regexes:
// yields a one-element array (the slot preg_match() uses for the full match)
// holding the validated URL, or false when validation does not succeed.
'filter_var' => function( $subject )
{
    $validated = filter_var( $subject, FILTER_VALIDATE_URL );

    return $validated ? array( $validated ) : false;
},
/*
 * Adapter so parse_url() can be run through the same tests as the regexes.
 *
 * The subject is parsed and its components are glued back together in URL
 * order (scheme://user:pass@host:port/path?query#fragment), so the rebuilt
 * string can be compared against what was expected.
 *
 * Returns a one-element array holding the rebuilt URL (the slot preg_match()
 * uses for the full match), or false when parse_url() returns a falsy value.
 */
'parse_url' => function( $subject )
{
    $parts = parse_url( $subject );
    if( !$parts )
    {
        return false;
    }

    // isset() rather than !empty(): empty() treats "0" (and "") as missing, which
    // silently dropped legitimate components such as a host, path, password or
    // query of "0" from the rebuilt URL.
    $url = '';
    if( isset( $parts['scheme'] ) )
    {
        $url .= $parts['scheme'] . '://';
    }
    if( isset( $parts['user'] ) )
    {
        $url .= $parts['user'];
        // a password only makes sense after a user name
        if( isset( $parts['pass'] ) )
        {
            $url .= ':' . $parts['pass'];
        }
        $url .= '@';
    }
    if( isset( $parts['host'] ) )
    {
        $url .= $parts['host'];
    }
    if( isset( $parts['port'] ) )
    {
        $url .= ':' . $parts['port'];
    }
    if( isset( $parts['path'] ) )
    {
        $url .= $parts['path'];
    }
    if( isset( $parts['query'] ) )
    {
        $url .= '?' . $parts['query'];
    }
    if( isset( $parts['fragment'] ) )
    {
        $url .= '#' . $parts['fragment'];
    }

    return array( $url );
}
);
/*
 * Pattern name => page of its author (or the manual page for the PHP core
 * functions). Pattern names without an entry of their own, such as
 * 'stephenhay revised', are resolved through the part of the name before the
 * first space when the regex table is rendered.
 */
$authors = array(
    'spoon'          => 'http://www.spoon-library.com/',
    'krijnhoetmer'   => 'http://twitter.com/krijnhoetmer',
    'gruber'         => 'http://twitter.com/gruber',
    'gruber revised' => 'http://twitter.com/gruber',
    'cowboy'         => 'http://twitter.com/cowboy',
    'jeffrey friedl' => 'http://regex.info/blog/',
    'mattfarina'     => 'http://twitter.com/mattfarina',
    'stephenhay'     => 'http://twitter.com/stephenhay',
    'scottgonzales'  => 'http://twitter.com/scottgonzales',
    'rodneyrehm'     => 'http://twitter.com/rodneyrehm',
    'imme_emosol'    => 'http://twitter.com/imme_emosol',
    'filter_var'     => 'http://php.net/filter_var',
    'parse_url'      => 'http://php.net/parse_url',
);
/*
 * Test fixtures, grouped by test type. The group name selects the callback of
 * the same name in $testAssertions and the heading in $testNames.
 */
$tests = array(
// Subjects that are URLs: a checker passes when its full match equals the subject.
'positive' => array(
'http://foo.com/blah_blah',
'http://foo.com/blah_blah/',
'http://foo.com/blah_blah_(wikipedia)',
'http://foo.com/blah_blah_(wikipedia)_(again)',
'http://www.example.com/wpstyle/?p=364',
'https://www.example.com/foo/?bar=baz&inga=42&quux',
'http://✪df.ws/123',
'http://userid:password@example.com:8080',
'http://userid:password@example.com:8080/',
'http://userid@example.com',
'http://userid@example.com/',
'http://userid@example.com:8080',
'http://userid@example.com:8080/',
'http://userid:password@example.com',
'http://userid:password@example.com/',
'http://192.168.1.1/',
'http://192.168.1.1:8080/',
'http://➡.ws/䨹',
'http://⌘.ws',
'http://⌘.ws/',
'http://foo.com/blah_(wikipedia)#cite-1',
'http://foo.com/blah_(wikipedia)_blah#cite-1',
'http://foo.com/unicode_(✪)_in_parens',
'http://foo.com/(something)?after=parens',
'http://☺.damowmow.com/',
'http://code.example.com/events/#&product=browser',
'http://j.mp',
'ftp://foo.bar/baz',
'torrent://foo.bar/baz',
'image://foo.bar:993',
'irc://foo.bar:6667',
),
// Subjects that are not URLs: a checker passes only when it matches nothing at all.
'negative' => array(
'rdar://1234',
'http://',
'http://.',
'http://..',
'http://../',
'http://?',
'http://??',
'http://??/',
'http://#',
'http://##',
'http://##/',
'http://foo.bar?q=Spaces should be encoded',
'//',
'//a',
'///a',
'///',
'http:///a',
'foo.com',
'http://-a.b.co',
'http://a.b-.co',
),
// Natural text: the key is the URL the checker is expected to extract, the value is the text it is run on.
'fulltext' => array(
'http://example.com' => 'The brown http://example.com, jumped over the fox',
'http://example.com?la#mc' => 'http://example.com?la#mc. nice site',
'http://example.com?la#mc-' => '"http://example.com?la#mc-" looks bad',
'www.example.com?la#mc-' => 'www.example.com?la#mc-" is off',
'ftp://foo.example.com?la#mc-' => 'ftp://foo.example.com?la#mc-" is oldschool',
'www.example.org' => 'yeah, looks nice www.example.org?',
'http://✪df.ws/123' => 'what about IDN? http://✪df.ws/123',
'irc://foo:6667' => 'do you speak IRC? irc://foo:6667',
// and so on... too lazy to come up with more tests
),
);
// Heading shown above each test group in the results table, keyed by group name.
$testNames = array(
    'positive' => 'URL verification (on URLs)',
    'negative' => 'URL verification (on non URLs, "passed" being good, "failed" being "false positive")',
    'fulltext' => 'URL scanning in natural text',
);
/*
 * Verdict callbacks, one per test group.
 *
 * Every callback receives ( $_subject, $subject, $matches ):
 *   $_subject  key of the test entry (for 'fulltext': the URL expected to be found)
 *   $subject   value of the test entry (the string the checker was run on)
 *   $matches   what the checker produced: the $matches array filled by
 *              preg_match(), or the array/false returned by an adapter closure
 *
 * Every callback returns 0, 1 or 2, which the results table renders as
 * 'failed', 'passed' and 'maybe' respectively.
 *
 * Matches are compared with ===, so numeric-looking strings that merely
 * compare equal under == (e.g. "1e3" and "1000") are not taken for the same text.
 */
$testAssertions = array(
    // The whole subject is a URL and should be matched in full.
    'positive' => function( $_subject, $subject, $matches )
    {
        if( !$matches )
        {
            return 0; // matched nothing
        }

        // 1: matched the complete subject, 2: matched only something else
        return $matches[0] === $subject ? 1 : 2;
    },
    // The subject is not a URL: any match at all is a false positive.
    'negative' => function( $_subject, $subject, $matches )
    {
        return $matches ? 0 : 1;
    },
    // The subject is running text; the entry's key is the URL that should be extracted from it.
    'fulltext' => function( $_subject, $subject, $matches )
    {
        if( !$matches )
        {
            return 0; // matched nothing
        }

        // array keys that look like integers are stored as int, hence the cast before the strict comparison
        return $matches[0] === (string) $_subject ? 1 : 2;
    },
);
/*
 * Run every checker against every test subject.
 *
 * $results[ pattern name ][ test group ][ subject ] receives the verdict code
 * returned by the group's callback in $testAssertions.
 * $maybes[ "pattern#group#subject" ] receives the text that was matched
 * whenever the verdict is 2, so the table can show it as a tooltip.
 */
$results = array();
$maybes = array();
foreach( $tests as $_test => $test )
{
    foreach( $test as $_subject => $subject )
    {
        foreach( $patterns as $_pattern => $pattern )
        {
            if( is_string( $pattern ) )
            {
                // a regular expression: preg_match() fills $matches
                preg_match( $pattern, $subject, $matches );
            }
            else
            {
                // an adapter closure: it returns the match list (or false) itself
                $matches = $pattern( $subject );
            }

            $verdict = $testAssertions[ $_test ]( $_subject, $subject, $matches );
            // the nested arrays are created on first write
            $results[ $_pattern ][ $_test ][ $subject ] = $verdict;

            if( $verdict == 2 )
            {
                $maybes[ $_pattern .'#'. $_test .'#'. $subject ] = $matches[0];
            }
        }
    }
}
?><!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
<meta charset="utf-8" />
<meta http-equiv="content-type" content="text/html; charset=utf-8" />
<title>URL identification</title>
<style type="text/css">
th, td { padding: 3px; }
th { text-align: left; font-weight: bold; background-color: #EEE; }
td { text-align: center; background-color: #EFEFEF; }
.passed { color: #00B530; background-color: #50F78B; }
.failed { color: #FF0546; background-color: #FFB9CA; }
.maybe { color: #D29331; background-color: #FCE484; }
.results tbody tr:first-child th { background-color: #DDD; }
.regex { }
.regex th, .regex td { text-align:left; }
.regex .number { text-align: right; }
.regex .code { overflow:auto; white-space:nowrap; }
:target th, :target td { background-color: lightblue; }
</style>
</head>
<body>
<h1>URL verification and identification</h1>
<table class="results">
<thead>
<tr>
<th>Subject</th>
<?php
// One header cell per checker; each links to the anchor built from the url-encoded pattern name.
foreach( $patterns as $_pattern => $pattern )
{
    printf(
        "\t\t\t\t\t\t<th><a href=\"#%s\">%s</a></th>\n",
        urlencode( $_pattern ),
        htmlspecialchars( $_pattern )
    );
}
?>
</tr>
</thead>
<?php
/*
 * Body of the results table: per test group one heading row spanning all
 * columns, then one row per subject with one verdict cell per checker.
 */
$patternsCount = count( $patterns );
// CSS class and cell label, indexed by the verdict code stored in $results (0, 1, 2)
$styles = array(
    'failed' ,
    'passed' ,
    'maybe' ,
);
echo '<tbody>';
foreach( $tests as $_test => $test )
{
    // group heading; +1 accounts for the leading "Subject" column
    echo '<tr><th colspan="' .
        ( $patternsCount + 1 ) .
        '">' .
        htmlspecialchars( $testNames[ $_test ] ) .
        '</th></tr>';

    foreach( $test as $subject )
    {
        echo '<tr><th>' . htmlspecialchars( $subject ) . '</th>';

        foreach( $patterns as $_pattern => $pattern )
        {
            $_maybe = $_pattern .'#'. $_test .'#'. $subject;
            $state = $styles[ $results[ $_pattern ][ $_test ][ $subject ] ];

            // a "maybe" cell carries what was actually matched as its tooltip
            $title = empty( $maybes[ $_maybe ] )
                ? ''
                : ' title="' . htmlspecialchars( $maybes[ $_maybe ] ) . '"';

            echo '<td class="' . $state . '"' . $title . '>' . $state . '</td>';
        }

        echo '</tr>';
    }
}
// No closing tag is echoed here: the markup directly after this PHP block already
// contains the </tbody>, and echoing a second one produced a stray end tag.
?>
</tbody>
</table>
<h2>The Regular Expressions</h2>
<table class="regex">
<thead>
<tr>
<th>Name</th>
<th>Characters</th>
<th>Expression</th>
</tr>
</thead>
<tbody>
<?php
/*
 * One row per checker in the "Regular Expressions" table: the name (linked to
 * the author's page, anchored by the url-encoded name), then either the
 * pattern with its length in characters or a note for the non-regex checkers.
 */
foreach( $patterns as $_pattern => $pattern )
{
    // Author lookup: the full name first, then the part before the first
    // space ('' when the name has none), e.g. 'stephenhay revised' -> 'stephenhay'.
    $spaceAt = strpos( $_pattern , ' ' );
    $shortName = ( $spaceAt === false ) ? '' : substr( $_pattern , 0 , $spaceAt );

    $author_link = '';
    foreach( array( $_pattern , $shortName ) as $candidate )
    {
        if ( isset( $authors[ $candidate ] ) )
        {
            $author_link = htmlspecialchars( $authors[ $candidate ] );
            break;
        }
    }

    $row = '<tr id="' . urlencode( $_pattern ) . '">' .
        '<th><a href="' . $author_link . '">' .
        htmlspecialchars( $_pattern ) .
        '</a></th>';

    if( is_string( $pattern ) )
    {
        // a regular expression: show its length and its source
        $row .= '<td class="number">' .
            mb_strlen( $pattern , 'UTF-8' ) .
            '</td>' .
            '<td><div class="code">' .
            htmlspecialchars( $pattern ) .
            '</div></td>';
    }
    else
    {
        // a closure wrapping a PHP function: there is no pattern to show
        $row .= '<td colspan="2">PHP core function, not a Regular expression!</td>';
    }

    echo $row . '</tr>';
}
?>
</tbody>
</table>
</body>
</html>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment