Skip to content

Instantly share code, notes, and snippets.

@supersonictw
Forked from collegeman/normalize-url.php
Last active May 3, 2020 14:47
Show Gist options
  • Save supersonictw/ae583e26fc1c9dea03403463323b7960 to your computer and use it in GitHub Desktop.
Save supersonictw/ae583e26fc1c9dea03403463323b7960 to your computer and use it in GitHub Desktop.
<?php
/**
 * Normalize a URL so that equivalent URLs compare equal as strings.
 *
 * Steps applied, in order:
 *  - a missing scheme defaults to "http" (protocol-relative "//host/..." input is accepted);
 *  - scheme and host are lower-cased;
 *  - a leading "www." label is dropped, but only when DNS resolves both host names
 *    to the same address (this performs network lookups via gethostbyname());
 *  - the default port of http (80) / https (443) is dropped;
 *  - the path is percent-decoded for RFC 3986 unreserved characters, lower-cased,
 *    stripped of duplicate slashes, of a trailing directory index file and of
 *    "." / ".." segments;
 *  - query parameters are sorted alphabetically.
 *
 * User info ("user:pass@") and the fragment are not part of the result.
 *
 * @param string $input_url URL to normalize.
 * @return string Normalized URL.
 * @throws InvalidArgumentException When the URL cannot be parsed even after
 *                                  defaulting the scheme to "http".
 */
function normalizeUrl($input_url)
{
    $input_url = (string) $input_url;
    $defaultPorts = ['http' => 80, 'https' => 443];

    $url = parse_url($input_url);
    if ($url === false || !isset($url['scheme'])) {
        // Retry exactly once with a default scheme. The retry is bounded on purpose:
        // recursing here never terminates for input parse_url() keeps rejecting.
        $prefix = (strncmp($input_url, '//', 2) === 0) ? 'http:' : 'http://';
        $url = parse_url($prefix . $input_url);
        if ($url === false || !isset($url['scheme'])) {
            throw new InvalidArgumentException("Unable to parse URL: {$input_url}");
        }
    }

    $scheme = strtolower($url['scheme']);
    $newUrl = "{$scheme}://";

    if (isset($url['host'])) {
        $host = strtolower($url['host']);
        // Only hosts ending in letters look like domain names (as opposed to IP
        // literals); proper validation belongs to higher layers.
        if (preg_match('/[a-z]+\Z/', $host) && strncmp($host, 'www.', 4) === 0) {
            // Strip the leading label only; "www." may legitimately occur elsewhere in the host.
            $bareHost = substr($host, 4);
            // gethostbyname() returns its argument unchanged on failure, so an
            // unresolvable pair compares unequal and the "www." prefix is kept.
            if (gethostbyname($host) === gethostbyname($bareHost)) {
                $host = $bareHost;
            }
        }
        $newUrl .= $host;
    }

    if (isset($url['port'])) {
        // Strip the scheme's default port; schemes without a known default keep theirs.
        $defaultPort = isset($defaultPorts[$scheme]) ? $defaultPorts[$scheme] : null;
        if ($defaultPort !== $url['port']) {
            $newUrl .= ":{$url['port']}";
        }
    }

    if (isset($url['path'])) {
        $newUrl .= normalizeUrlPath($url['path']);
    }

    // Sort GET params alphabetically, not because the RFC requires it but because
    // it makes differently ordered query strings compare equal.
    if (isset($url['query'])) {
        $params = explode('&', $url['query']);
        // SORT_STRING keeps numeric-looking parameters from being compared as numbers.
        sort($params, SORT_STRING);
        $newUrl .= '?' . implode('&', $params);
    }

    return $newUrl;
}

/**
 * Normalize the path component of a URL (helper of normalizeUrl()).
 *
 * @param string $path Raw path as returned by parse_url().
 * @return string Normalized path.
 */
function normalizeUrlPath($path)
{
    // Decode percent-encoded unreserved characters (RFC 3986, section 2.3).
    // This runs before lower-casing so "/%41bc" and "/Abc" normalize identically,
    // and it matches hex digits case-insensitively so "%7e" and "%7E" both decode.
    $path = preg_replace_callback(
        '/%([0-9A-Fa-f]{2})/',
        function ($match) {
            $char = chr(hexdec($match[1]));
            return preg_match('/^[A-Za-z0-9._~-]\z/', $char) ? $char : $match[0];
        },
        $path
    );

    // Case normalization.
    $path = strtolower($path);

    // Strip duplicate slashes.
    $path = preg_replace('#/{2,}#', '/', $path);

    // Remove a directory index file, but only when it is the final path segment.
    $path = preg_replace(
        '#(^|/)(?:default\.aspx?|default\.html?|index\.html?|index\.php|index\.jsp)$#',
        '$1',
        $path
    );

    return normalizeUrlRemoveDotSegments($path);
}

/**
 * Remove "." and ".." segments from a path, following RFC 3986, section 5.2.4
 * (helper of normalizeUrl()).
 *
 * @param string $path Path that may contain dot segments.
 * @return string Path without dot segments.
 */
function normalizeUrlRemoveDotSegments($path)
{
    $output = '';
    // Compare against '' rather than using empty(): a remaining path of "0" is not empty.
    while ($path !== '') {
        if (strncmp($path, '../', 3) === 0) {
            $path = (string) substr($path, 3);
        } elseif (strncmp($path, './', 2) === 0) {
            $path = (string) substr($path, 2);
        } elseif (strncmp($path, '/./', 3) === 0) {
            // "/./x" -> "/x"
            $path = substr($path, 2);
        } elseif ($path === '/.') {
            $path = '/';
        } elseif (strncmp($path, '/../', 4) === 0 || $path === '/..') {
            // "/.." must be a complete segment; "/..foo" is an ordinary name.
            $path = ($path === '/..') ? '/' : substr($path, 3);
            // Drop the last segment already written to the output buffer.
            $lastSlash = strrpos($output, '/');
            $output = ($lastSlash === false) ? '' : substr($output, 0, $lastSlash);
        } elseif ($path === '.' || $path === '..') {
            $path = '';
        } else {
            // Move the first segment (with its leading slash) to the output buffer.
            preg_match('#^/*[^/]*#', $path, $matches);
            $segment = $matches[0];
            $output .= $segment;
            $path = (string) substr($path, strlen($segment));
        }
    }

    return $output;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment