Skip to content

Instantly share code, notes, and snippets.

@xeoncross
Created May 2, 2014 18:03
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save xeoncross/8c568c746546db51788b to your computer and use it in GitHub Desktop.
Save xeoncross/8c568c746546db51788b to your computer and use it in GitHub Desktop.
How to normalize a web URL (also removes the scheme)
{
"require": {
"glenscott/url-normalizer" : "dev-master"
}
}
/**
 * Normalize a URL and return it without its scheme, as "host/path?query".
 *
 * The URL is passed through URL\Normalizer, the normalized string is split
 * with parse_url(), and only the host, path and query components are glued
 * back together. Components other than host, path and query are not part of
 * the result.
 *
 * @param string $url URL to normalize.
 * @return string Scheme-less form of the normalized URL ('' when nothing
 *                could be parsed).
 */
function normalizeUrl($url)
{
    $normalizer = new URL\Normalizer;
    $normalizer->setUrl($url);

    $parts = parse_url($normalizer->normalize());
    if (!is_array($parts)) {
        // parse_url() returns false for seriously malformed URLs
        $parts = array();
    }

    $host = isset($parts['host']) ? $parts['host'] : '';
    $path = isset($parts['path']) ? ltrim($parts['path'], '/') : '';
    $query = isset($parts['query']) ? $parts['query'] : '';

    // Compare against '' explicitly: "0" is a legitimate path or query but is
    // falsy in PHP, which previously produced "example.com0" for ".../0".
    if ($host !== '' && ($path !== '' || $query !== '')) {
        $host .= '/';
    }

    return $host . $path . ($query !== '' ? "?$query" : '');
}
<?php
require('vendor/autoload.php');
// Sample inputs for the comparison run, one URL per line.
$urls = 'HTTP://www.Example.com/
http://www.example.com/
http://www.example.com/a%c2%b1b
http://www.example.com/a%C2%B1b
http://www.example.com/%7Eusername/
http://www.example.com/~username/
http://www.example.com
http://www.example.com/
http://www.example.com:80/bar.html
http://www.example.com/bar.html
http://www.example.com/../a/b/../../c/./d.html
http://www.example.com/../a/b/../c/./d.html
http://www.example.com/a/c/d.html
http://www.example.com/../a/b/../c/./d.html
http://www.example.com/a/c/d.html
//www.example.com/path?googleguy=googley
www.example.com/path
http://www.example.com/?array[key]=value
http://www.example.com/?array%5Bkey%5D=value
http://example.com/url=http://site.com&value=sure
';
// Split into one entry per line. The literal ends with a newline, so a plain
// explode() leaves a trailing empty "URL"; trim each line (this also strips a
// stray "\r") and drop blank entries, then reindex from 0.
$urls = array_values(array_filter(array_map('trim', explode("\n", $urls)), 'strlen'));
/**
 * Rebuild a URL as "scheme://host/path?query" with a cleaned-up path.
 *
 * Steps:
 *  - split the input with parse_url(), defaulting the scheme to "http";
 *  - strip leading/trailing slashes from the path and re-encode it
 *    (rawurldecode + rawurlencode) so percent-escapes come out uniform,
 *    then restore "/", ":" and "@" which must stay literal;
 *  - collapse "/./" and "segment/../" pairs and drop any leading "../";
 *  - lower-case the "scheme://host/" prefix.
 *
 * Only scheme, host, path and query are used; any other component reported
 * by parse_url() is left out of the result.
 *
 * Known limitation (noted by the original author): inputs such as
 * "example.com/path" and "//example.com/js.js" have problems, and no
 * attempt is made here to recover a host for them.
 *
 * @param string $url URL to standardize.
 * @return string The rebuilt URL.
 */
function standardizeUrl($url)
{
    $parts = parse_url($url);
    if (!is_array($parts)) {
        // parse_url() returns false for seriously malformed URLs
        $parts = array();
    }

    // parse_url() reports this component under the key "scheme"; looking it
    // up as "schema" meant the default always won and https became http.
    $scheme = isset($parts['scheme']) ? $parts['scheme'] : 'http';
    $host = isset($parts['host']) ? $parts['host'] : '';
    $path = isset($parts['path']) ? trim($parts['path'], '/') : '';
    $query = isset($parts['query']) ? $parts['query'] : '';

    if ($host !== '') {
        $host .= '/';
    }

    if ($path !== '') {
        // Decode then encode so that e.g. %7E and %c2 come out uniformly
        $path = rawurlencode(rawurldecode($path));

        // Undo the encoding of characters that must stay literal in a path,
        // then collapse "/./" (replacements are applied in array order).
        $path = str_replace(
            array('%2F', '%3A', '%40', '/./'),
            array('/', ':', '@', '/'),
            $path
        );

        // Remove "foo/../" pairs until the path stops changing
        while (($new = preg_replace('~((?!\.\.|/).)+/\.\./~', '', $path)) !== $path) {
            $path = $new;
        }

        // Remove any "../" left at the start
        while (substr($path, 0, 3) === '../') {
            $path = substr($path, 3);
        }
    }

    // Compare the query against '' explicitly so a query of "0" is kept
    return strtolower("$scheme://$host") . $path . ($query !== '' ? "?$query" : '');
}
/**
 * Normalize a URL and return it without its scheme, as "host/path?query".
 *
 * The URL is passed through URL\Normalizer, the normalized string is split
 * with parse_url(), and only the host, path and query components are glued
 * back together. Components other than host, path and query are not part of
 * the result.
 *
 * @param string $url URL to normalize.
 * @return string Scheme-less form of the normalized URL ('' when nothing
 *                could be parsed).
 */
function normalizeUrl($url)
{
    $normalizer = new URL\Normalizer;
    $normalizer->setUrl($url);

    $parts = parse_url($normalizer->normalize());
    if (!is_array($parts)) {
        // parse_url() returns false for seriously malformed URLs
        $parts = array();
    }

    $host = isset($parts['host']) ? $parts['host'] : '';
    $path = isset($parts['path']) ? ltrim($parts['path'], '/') : '';
    $query = isset($parts['query']) ? $parts['query'] : '';

    // Compare against '' explicitly: "0" is a legitimate path or query but is
    // falsy in PHP, which previously produced "example.com0" for ".../0".
    if ($host !== '' && ($path !== '' || $query !== '')) {
        $host .= '/';
    }

    return $host . $path . ($query !== '' ? "?$query" : '');
}
// Comparison run: push every sample URL through standardizeUrl(),
// normalizeUrl() and the bare URL\Normalizer, echoing each result and
// collecting the wall-clock time of every call.
$normalizer = new URL\Normalizer;
$end_me = array();        // timings for standardizeUrl()
$end_me2 = array();       // timings for normalizeUrl()
$end_normalize = array(); // timings for URL\Normalizer used directly

foreach ($urls as $url) {
    print "$url\n";

    // Each result is computed inside its timed window and printed only after
    // the clock has been read, so output I/O does not skew the measurement.
    $start = microtime(true);
    $result = standardizeUrl($url);
    $end_me[] = microtime(true) - $start;
    print $result . "\n";

    $start = microtime(true);
    $result = normalizeUrl($url);
    $end_me2[] = microtime(true) - $start;
    print $result . "\n";

    $start = microtime(true);
    $normalizer->setUrl($url);
    $result = $normalizer->normalize();
    $end_normalize[] = microtime(true) - $start;
    print $result . "\n\n";
}

// Summary: one "total = average" line (seconds) per implementation, in the
// same order as above. max(1, ...) avoids dividing by zero on an empty list.
foreach (array($end_me, $end_me2, $end_normalize) as $timings) {
    $total = array_sum($timings);
    print $total . ' = ' . ($total / max(1, count($timings))) . "\n";
}
HTTP://www.Example.com/
http://www.example.com/
www.example.com
http://www.example.com/
http://www.example.com/
http://www.example.com/
www.example.com
http://www.example.com/
http://www.example.com/a%c2%b1b
http://www.example.com/a%C2%B1b
www.example.com/a%C2%B1b
http://www.example.com/a%C2%B1b
http://www.example.com/a%C2%B1b
http://www.example.com/a%C2%B1b
www.example.com/a%C2%B1b
http://www.example.com/a%C2%B1b
http://www.example.com/%7Eusername/
http://www.example.com/~username
www.example.com/~username/
http://www.example.com/~username/
http://www.example.com/~username/
http://www.example.com/~username
www.example.com/~username/
http://www.example.com/~username/
http://www.example.com
http://www.example.com/
www.example.com
http://www.example.com/
http://www.example.com/
http://www.example.com/
www.example.com
http://www.example.com/
http://www.example.com:80/bar.html
http://www.example.com/bar.html
www.example.com/bar.html
http://www.example.com/bar.html
http://www.example.com/bar.html
http://www.example.com/bar.html
www.example.com/bar.html
http://www.example.com/bar.html
http://www.example.com/../a/b/../../c/./d.html
http://www.example.com/c/d.html
www.example.com/c/d.html
http://www.example.com/c/d.html
http://www.example.com/../a/b/../c/./d.html
http://www.example.com/a/c/d.html
www.example.com/a/c/d.html
http://www.example.com/a/c/d.html
http://www.example.com/a/c/d.html
http://www.example.com/a/c/d.html
www.example.com/a/c/d.html
http://www.example.com/a/c/d.html
http://www.example.com/../a/b/../c/./d.html
http://www.example.com/a/c/d.html
www.example.com/a/c/d.html
http://www.example.com/a/c/d.html
http://www.example.com/a/c/d.html
http://www.example.com/a/c/d.html
www.example.com/a/c/d.html
http://www.example.com/a/c/d.html
//www.example.com/path?googleguy=googley
http://www.example.com/path?googleguy=googley
www.example.com/path?googleguy=googley
//www.example.com/path?googleguy=googley
www.example.com/path
http://www.example.com/path
www.example.com/path
www.example.com/path
http://www.example.com/?array[key]=value
http://www.example.com/?array[key]=value
www.example.com/?array%5Bkey%5D=value
http://www.example.com/?array%5Bkey%5D=value
http://www.example.com/?array%5Bkey%5D=value
http://www.example.com/?array%5Bkey%5D=value
www.example.com/?array%5Bkey%5D=value
http://www.example.com/?array%5Bkey%5D=value
http://example.com/url=http://site.com&value=sure
http://example.com/url%3Dhttp://site.com%26value%3Dsure
example.com/url=http://site.com&value=sure
http://example.com/url=http://site.com&value=sure
http://
0.0015895366668701 = 7.569222223191E-5
0.0075552463531494 = 0.00035977363586426
0.0063259601593018 = 0.00030123619806199
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment