Skip to content

Instantly share code, notes, and snippets.

Created February 18, 2012 08:53
Show Gist options
  • Save sebacruz/1858327 to your computer and use it in GitHub Desktop.
Save sebacruz/1858327 to your computer and use it in GitHub Desktop.
Normalizes URL according to RFC 3986 to use it in comparison operations.
/**
 * Normalizes a URL according to RFC 3986 so it can be used in comparison operations.
 *
 * The URL is taken by value: the caller's variable is not modified and the
 * normalized form is returned.
 *
 * Normalization steps:
 *  - percent-encoding: escapes of unreserved characters (letters, digits,
 *    "-", ".", "_", "~") are decoded, all other escapes are upper-cased;
 *  - case: scheme and host are lower-cased;
 *  - path: "." and ".." segments are resolved and runs of "/" are collapsed;
 *  - scheme defaults: port 80 for http and 443 for https is removed, and an
 *    empty path becomes "/";
 *  - the fragment ("#...") is stripped.
 *
 * @copyright Copyright (c) 2005-2012 Zend Technologies USA Inc. (http://www.zend.com)
 * @license New BSD License
 * @package Zend_OpenId
 * @filesource
 * @param string $url URL to be normalized
 * @return string|bool Normalized URL on success and FALSE on failure
 *                     (non-string input, malformed percent-escape, or a URL
 *                     without a "scheme://host" part).
 */
function normalize_url($url) {
    if (!is_string($url)) {
        return false;
    }

    // RFC 3986, 6.2.2. Syntax-Based Normalization

    // RFC 3986, 6.2.2.2. Percent-Encoding Normalization
    $hexDigits = '0123456789ABCDEFabcdef';
    // RFC 3986, 2.3: unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
    $unreserved = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-._~';
    $length = strlen($url);
    $res = '';
    for ($i = 0; $i < $length; ++$i) {
        if ($url[$i] !== '%') {
            $res .= $url[$i];
            continue;
        }
        // A "%" must be followed by exactly two hex digits.
        if (strspn($url, $hexDigits, $i + 1, 2) !== 2) {
            return false;
        }
        $hex = substr($url, $i + 1, 2);
        $ch = chr(hexdec($hex));
        if (strpos($unreserved, $ch) !== false) {
            // Escaping an unreserved character is redundant: decode it.
            $res .= $ch;
        } else {
            // Keep the escape, but with upper-case hex digits.
            $res .= '%' . strtoupper($hex);
        }
        $i += 2; // skip the two hex digits just consumed
    }

    if (!preg_match('|^([^:]+)://([^:@]*(?:[:][^@]*)?@)?([^/:@?#]*)(?:[:]([^/?#]*))?(/[^?#]*)?((?:[?](?:[^#]*))?)((?:#.*)?)$|', $res, $reg)) {
        return false;
    }
    $scheme = $reg[1];
    $auth = $reg[2];
    $host = $reg[3];
    $port = $reg[4];
    $path = $reg[5];
    $query = $reg[6];
    // $reg[7] is the fragment; it is intentionally stripped.

    // Compare against '' rather than empty(): "0" is a legal host.
    if ($scheme === '' || $host === '') {
        return false;
    }

    // RFC 3986, 6.2.2.1. Case Normalization
    $scheme = strtolower($scheme);
    $host = strtolower($host);

    // RFC 3986, 6.2.2.3. Path Segment Normalization
    if ($path !== '') {
        $endsWithSlash = substr($path, -1) === '/';
        $kept = array();
        foreach (explode('/', $path) as $segment) {
            if ($segment === '' || $segment === '.') {
                // Empty segments come from repeated slashes; "." is a no-op.
                continue;
            }
            if ($segment === '..') {
                // Go up one level; at the root there is nothing to remove.
                array_pop($kept);
                continue;
            }
            $kept[] = $segment;
        }
        $path = (count($kept) > 0 ? '/' . implode('/', $kept) : '')
            . ($endsWithSlash ? '/' : '');
    }

    // RFC 3986, 6.2.3. Scheme-Based Normalization
    $defaultPorts = array('http' => 80, 'https' => 443);
    if (isset($defaultPorts[$scheme])
        && $port !== ''
        && strspn($port, '0123456789') === strlen($port)
        && (int) $port === $defaultPorts[$scheme]
    ) {
        $port = '';
    }
    if ($path === '') {
        $path = '/';
    }

    return $scheme
        . '://'
        . $auth
        . $host
        . ($port === '' ? '' : (':' . $port))
        . $path
        . $query;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment