-
-
Save supersonictw/ae583e26fc1c9dea03403463323b7960 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/**
 * Normalize a URL so that equivalent URLs yield the same string.
 *
 * Steps applied, in order:
 *  - a missing scheme defaults to "http";
 *  - scheme and host are lower-cased;
 *  - a leading "www." label is dropped when the bare host resolves to the
 *    same address (this performs DNS lookups through gethostbyname());
 *  - the default port of the scheme (80 for http, 443 for https) is dropped;
 *  - the path is normalized, see normalizeUrlPath();
 *  - query parameters are sorted as strings and empty ones are dropped;
 *  - the fragment is discarded.
 *
 * User info ("user:pass@") is not carried over into the result.
 *
 * @param string $input_url URL to normalize, with or without a scheme.
 * @return string Normalized URL, or the input unchanged when it cannot be parsed.
 */
function normalizeUrl($input_url)
{
    $defaultPorts = ['http' => 80, 'https' => 443];

    $url = parse_url($input_url);
    if (!is_array($url) || !isset($url['scheme'])) {
        // No scheme: assume http. The retry happens exactly once so that
        // input which stays unparseable cannot recurse without bound.
        $url = parse_url('http://' . $input_url);
        if (!is_array($url) || !isset($url['scheme'])) {
            return (string) $input_url;
        }
    }

    $scheme = strtolower($url['scheme']);
    $normalized = "{$scheme}://";

    if (isset($url['host'])) {
        $host = strtolower($url['host']);
        // Only hosts ending in a letter are treated as domain names (this
        // skips IP addresses); real validation belongs in higher layers.
        if (preg_match('/[a-z]+\z/', $host) && strncmp($host, 'www.', 4) === 0) {
            // Strip only the leading label, never a "www." found further in.
            $bareHost = (string) substr($host, 4);
            if (gethostbyname($host) === gethostbyname($bareHost)) {
                $host = $bareHost;
            }
        }
        $normalized .= $host;
    }

    if (isset($url['port'])) {
        // Schemes without a known default port always keep their port.
        $isDefaultPort = isset($defaultPorts[$scheme]) && $defaultPorts[$scheme] === $url['port'];
        if (!$isDefaultPort) {
            $normalized .= ":{$url['port']}";
        }
    }

    if (isset($url['path'])) {
        $normalized .= normalizeUrlPath($url['path']);
    }

    // Sort GET parameters: not required by the RFC, but it makes URLs that
    // differ only in parameter order normalize to the same string.
    if (isset($url['query'])) {
        $params = array_filter(
            explode('&', $url['query']),
            static function ($param) {
                return $param !== '';
            }
        );
        // SORT_STRING keeps the order independent of numeric-looking values.
        sort($params, SORT_STRING);
        if ($params) {
            $normalized .= '?' . implode('&', $params);
        }
    }

    return $normalized;
}

/**
 * Normalize the path component of a URL.
 *
 * Decodes percent-encoded unreserved characters (RFC 3986 section 2.3),
 * lower-cases the path, collapses repeated slashes, removes a trailing
 * directory index file name and resolves "." / ".." segments.
 *
 * Note: lower-casing is lossy for servers with case-sensitive paths.
 *
 * @param string $path Raw path as returned by parse_url().
 * @return string Normalized path.
 */
function normalizeUrlPath($path)
{
    // Decode first and match hex digits case-insensitively, so "%7E" and
    // "%7e" both become "~" and decoded letters get lower-cased below.
    $path = preg_replace_callback(
        '/%([0-9A-Fa-f]{2})/',
        static function ($match) {
            $char = chr(hexdec($match[1]));

            return preg_match('/^[A-Za-z0-9._~-]\z/', $char) ? $char : $match[0];
        },
        $path
    );

    // Case normalization.
    $path = strtolower($path);

    // Strip duplicate slashes.
    $path = preg_replace('#/{2,}#', '/', $path);

    // Remove a directory index, but only when it is the final path segment,
    // so names such as "myindex.php" are left alone.
    $path = preg_replace(
        '#(^|/)(?:index\.(?:html?|php|jsp)|default\.(?:aspx?|html?))\z#',
        '$1',
        $path
    );

    return normalizeUrlRemoveDotSegments($path);
}

/**
 * Remove "." and ".." segments from a path, following the algorithm in
 * RFC 3986 section 5.2.4.
 *
 * @param string $path Path that may contain dot segments.
 * @return string Path with dot segments resolved.
 */
function normalizeUrlRemoveDotSegments($path)
{
    $output = '';
    // Compare against '' instead of using empty(): the string "0" is "empty".
    while ($path !== '') {
        if (strncmp($path, '../', 3) === 0) {
            $path = (string) substr($path, 3);
        } elseif (strncmp($path, './', 2) === 0) {
            $path = (string) substr($path, 2);
        } elseif (strncmp($path, '/./', 3) === 0) {
            $path = (string) substr($path, 2);
        } elseif ($path === '/.') {
            $path = '/';
        } elseif (strncmp($path, '/../', 4) === 0 || $path === '/..') {
            // ".." must be a complete segment; "/..foo" is a regular name.
            $path = ($path === '/..') ? '/' : (string) substr($path, 3);
            // Drop the last output segment together with its leading slash.
            $cut = strrpos($output, '/');
            $output = ($cut === false) ? '' : (string) substr($output, 0, $cut);
        } elseif ($path === '.' || $path === '..') {
            $path = '';
        } else {
            // Move the first segment (with its leading slash, if any) to the output.
            $next = strpos($path, '/', 1);
            if ($next === false) {
                $output .= $path;
                $path = '';
            } else {
                $output .= substr($path, 0, $next);
                $path = (string) substr($path, $next);
            }
        }
    }

    return $output;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment