Created
May 2, 2014 18:03
-
-
Save xeoncross/8c568c746546db51788b to your computer and use it in GitHub Desktop.
How to normalize a web URL (also removes the scheme)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{
    "require": {
        "glenscott/url-normalizer" : "dev-master"
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Normalize a URL with URL\Normalizer and return it without its scheme.
 *
 * The result has the form "host[:port][/path][?query]". A slash is inserted
 * after the host only when a path or query follows, so a bare
 * "http://www.example.com/" becomes "www.example.com".
 *
 * Components are compared against the empty string rather than tested for
 * truthiness, because PHP treats the string "0" as false: a path of "/0" or
 * a query of "0" must not be mistaken for "nothing there".
 *
 * @param string $url The URL to normalize.
 * @return string The normalized URL with scheme, credentials and fragment removed.
 */
function normalizeUrl($url)
{
    $normalizer = new URL\Normalizer;
    $normalizer->setUrl($url);

    $parts = parse_url($normalizer->normalize());
    if (!is_array($parts)) {
        // parse_url() returns false for seriously malformed URLs.
        $parts = array();
    }
    $parts += array(
        'host'  => '',
        'path'  => '',
        'query' => '',
    );

    $host  = $parts['host'];
    $path  = ltrim($parts['path'], '/');
    $query = $parts['query'];

    // Keep any port that is still present after normalization; dropping it
    // would make "example.com:8080" indistinguishable from "example.com".
    // NOTE(review): assumes the normalizer has already removed default ports - confirm.
    if ($host !== '' && isset($parts['port'])) {
        $host .= ':' . $parts['port'];
    }

    if ($host !== '' && ($path !== '' || $query !== '')) {
        $host .= '/';
    }

    return $host . $path . ($query !== '' ? '?' . $query : '');
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
// Benchmark script: compares a hand-rolled URL standardizer against the
// glenscott/url-normalizer package loaded through Composer's autoloader.
require('vendor/autoload.php');

// Sample URLs, one per line.
$urls = 'HTTP://www.Example.com/
http://www.example.com/
http://www.example.com/a%c2%b1b
http://www.example.com/a%C2%B1b
http://www.example.com/%7Eusername/
http://www.example.com/~username/
http://www.example.com
http://www.example.com/
http://www.example.com:80/bar.html
http://www.example.com/bar.html
http://www.example.com/../a/b/../../c/./d.html
http://www.example.com/../a/b/../c/./d.html
http://www.example.com/a/c/d.html
http://www.example.com/../a/b/../c/./d.html
http://www.example.com/a/c/d.html
//www.example.com/path?googleguy=googley
www.example.com/path
http://www.example.com/?array[key]=value
http://www.example.com/?array%5Bkey%5D=value
http://example.com/url=http://site.com&value=sure
';

// Split into one URL per entry. Surrounding whitespace is stripped and blank
// entries are dropped, so the newline that ends the literal above does not
// produce an empty "URL" that would be processed and timed like a real one.
$urls = preg_split('/\s*\n\s*/', trim($urls), -1, PREG_SPLIT_NO_EMPTY);
/**
 * Standardize a URL into a "scheme://host[:port]/path[?query]" string.
 *
 * - The scheme and host are lower-cased; the scheme defaults to "http" when
 *   the input has none.
 * - A port is kept unless it is the default for the scheme (80 for http,
 *   443 for https).
 * - The path is percent-decoded and re-encoded so escapes come out
 *   upper-case, with "/", ":" and "@" left readable. "/./" and "segment/../"
 *   sequences are collapsed, and leading and trailing slashes are removed.
 * - The query string is appended unchanged. User info and fragment are dropped.
 *
 * Input without a scheme or leading "//" (e.g. "www.example.com/path") has
 * no host as far as parse_url() is concerned; the whole string is treated
 * as the path and simply lands after "http://".
 *
 * @param string $url The URL to standardize.
 * @return string The standardized URL ("http://" for empty or unparseable input).
 */
function standardizeUrl($url)
{
    $parts = parse_url($url);
    if (!is_array($parts)) {
        // parse_url() returns false for seriously malformed URLs.
        $parts = array();
    }

    // parse_url() reports the component as "scheme"; defaulting any other
    // key would leave the real scheme ignored and force every URL to http.
    $parts += array(
        'scheme' => 'http',
        'host'   => '',
        'path'   => '',
        'query'  => '',
    );

    $scheme = strtolower($parts['scheme']);
    $host   = strtolower($parts['host']);
    $query  = $parts['query'];

    // Keep the port only when it differs from the scheme's default.
    $defaultPorts = array('http' => 80, 'https' => 443);
    if ($host !== '' && isset($parts['port'])) {
        $port = (int) $parts['port'];
        if (!isset($defaultPorts[$scheme]) || $defaultPorts[$scheme] !== $port) {
            $host .= ':' . $port;
        }
    }

    if ($host !== '') {
        $host .= '/';
    }

    $path = trim($parts['path'], '/');
    if ($path !== '') {
        // Decode then re-encode to get a single canonical escaping, then
        // restore the characters that are meant to stay literal in a path.
        $path = rawurlencode(rawurldecode($path));
        $path = str_replace(
            array('%2F', '%3A', '%40'),
            array('/', ':', '@'),
            $path
        );

        // str_replace() does not rescan its own output, so "a/././b" needs
        // more than one pass before every "/./" is gone.
        while (strpos($path, '/./') !== false) {
            $path = str_replace('/./', '/', $path);
        }

        // Remove "segment/../" pairs until nothing changes.
        while (($collapsed = preg_replace('~((?!\.\.|/).)+/\.\./~', '', $path)) !== $path) {
            $path = $collapsed;
        }

        // Remove any "../" left at the start; there is nothing above the root.
        while (substr($path, 0, 3) == '../') {
            $path = substr($path, 3);
        }
    }

    // Compare against '' so that a query of "0" is not discarded.
    return "$scheme://$host" . $path . ($query !== '' ? "?$query" : '');
}
/**
 * Normalize a URL with URL\Normalizer and return it without its scheme.
 *
 * The result has the form "host[:port][/path][?query]". A slash is inserted
 * after the host only when a path or query follows, so a bare
 * "http://www.example.com/" becomes "www.example.com".
 *
 * Components are compared against the empty string rather than tested for
 * truthiness, because PHP treats the string "0" as false: a path of "/0" or
 * a query of "0" must not be mistaken for "nothing there".
 *
 * @param string $url The URL to normalize.
 * @return string The normalized URL with scheme, credentials and fragment removed.
 */
function normalizeUrl($url)
{
    $normalizer = new URL\Normalizer;
    $normalizer->setUrl($url);

    $parts = parse_url($normalizer->normalize());
    if (!is_array($parts)) {
        // parse_url() returns false for seriously malformed URLs.
        $parts = array();
    }
    $parts += array(
        'host'  => '',
        'path'  => '',
        'query' => '',
    );

    $host  = $parts['host'];
    $path  = ltrim($parts['path'], '/');
    $query = $parts['query'];

    // Keep any port that is still present after normalization; dropping it
    // would make "example.com:8080" indistinguishable from "example.com".
    // NOTE(review): assumes the normalizer has already removed default ports - confirm.
    if ($host !== '' && isset($parts['port'])) {
        $host .= ':' . $parts['port'];
    }

    if ($host !== '' && ($path !== '' || $query !== '')) {
        $host .= '/';
    }

    return $host . $path . ($query !== '' ? '?' . $query : '');
}
// One normalizer instance is reused for the "library only" measurement.
$normalizer = new URL\Normalizer;

// Elapsed seconds per URL for: standardizeUrl(), normalizeUrl(), and the
// bare URL\Normalizer call, in that order.
$end_me = array();
$end_me2 = array();
$end_normalize = array();

foreach ($urls as $url) {
    // Echo the input first so each group of output lines is self-describing.
    echo "$url\n";

    // Each timed span covers the call together with printing its result.
    $start = microtime(true);
    echo standardizeUrl($url), "\n";
    $end_me[] = microtime(true) - $start;

    $start = microtime(true);
    echo normalizeUrl($url), "\n";
    $end_me2[] = microtime(true) - $start;

    $start = microtime(true);
    $normalizer->setUrl($url);
    echo $normalizer->normalize(), "\n\n";
    $end_normalize[] = microtime(true) - $start;
}

// Summary: one "total = average" line per approach, in the order measured.
foreach (array($end_me, $end_me2, $end_normalize) as $samples) {
    $total = array_sum($samples);
    echo $total, ' = ', $total / count($samples), "\n";
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
HTTP://www.Example.com/ | |
http://www.example.com/ | |
www.example.com | |
http://www.example.com/ | |
http://www.example.com/ | |
http://www.example.com/ | |
www.example.com | |
http://www.example.com/ | |
http://www.example.com/a%c2%b1b | |
http://www.example.com/a%C2%B1b | |
www.example.com/a%C2%B1b | |
http://www.example.com/a%C2%B1b | |
http://www.example.com/a%C2%B1b | |
http://www.example.com/a%C2%B1b | |
www.example.com/a%C2%B1b | |
http://www.example.com/a%C2%B1b | |
http://www.example.com/%7Eusername/ | |
http://www.example.com/~username | |
www.example.com/~username/ | |
http://www.example.com/~username/ | |
http://www.example.com/~username/ | |
http://www.example.com/~username | |
www.example.com/~username/ | |
http://www.example.com/~username/ | |
http://www.example.com | |
http://www.example.com/ | |
www.example.com | |
http://www.example.com/ | |
http://www.example.com/ | |
http://www.example.com/ | |
www.example.com | |
http://www.example.com/ | |
http://www.example.com:80/bar.html | |
http://www.example.com/bar.html | |
www.example.com/bar.html | |
http://www.example.com/bar.html | |
http://www.example.com/bar.html | |
http://www.example.com/bar.html | |
www.example.com/bar.html | |
http://www.example.com/bar.html | |
http://www.example.com/../a/b/../../c/./d.html | |
http://www.example.com/c/d.html | |
www.example.com/c/d.html | |
http://www.example.com/c/d.html | |
http://www.example.com/../a/b/../c/./d.html | |
http://www.example.com/a/c/d.html | |
www.example.com/a/c/d.html | |
http://www.example.com/a/c/d.html | |
http://www.example.com/a/c/d.html | |
http://www.example.com/a/c/d.html | |
www.example.com/a/c/d.html | |
http://www.example.com/a/c/d.html | |
http://www.example.com/../a/b/../c/./d.html | |
http://www.example.com/a/c/d.html | |
www.example.com/a/c/d.html | |
http://www.example.com/a/c/d.html | |
http://www.example.com/a/c/d.html | |
http://www.example.com/a/c/d.html | |
www.example.com/a/c/d.html | |
http://www.example.com/a/c/d.html | |
//www.example.com/path?googleguy=googley | |
http://www.example.com/path?googleguy=googley | |
www.example.com/path?googleguy=googley | |
//www.example.com/path?googleguy=googley | |
www.example.com/path | |
http://www.example.com/path | |
www.example.com/path | |
www.example.com/path | |
http://www.example.com/?array[key]=value | |
http://www.example.com/?array[key]=value | |
www.example.com/?array%5Bkey%5D=value | |
http://www.example.com/?array%5Bkey%5D=value | |
http://www.example.com/?array%5Bkey%5D=value | |
http://www.example.com/?array%5Bkey%5D=value | |
www.example.com/?array%5Bkey%5D=value | |
http://www.example.com/?array%5Bkey%5D=value | |
http://example.com/url=http://site.com&value=sure | |
http://example.com/url%3Dhttp://site.com%26value%3Dsure | |
example.com/url=http://site.com&value=sure | |
http://example.com/url=http://site.com&value=sure | |
http:// | |
0.0015895366668701 = 7.569222223191E-5 | |
0.0075552463531494 = 0.00035977363586426 | |
0.0063259601593018 = 0.00030123619806199 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment