Skip to content

Instantly share code, notes, and snippets.

@rodneyrehm
Created April 8, 2011 11:52
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rodneyrehm/909692 to your computer and use it in GitHub Desktop.
Save rodneyrehm/909692 to your computer and use it in GitHub Desktop.
Reduce (UTF-8) strings to alphanumeric
<?php
/*
consider decomposing the characters to "capture" more "obscure" characters such as ṩ
- http://www.php.net/manual/en/normalizer.normalize.php#92592
*/
/**
* Normalize a string to only contain alphanumeric characters and dashes.
*
* Replace accents by their entities.
* Replace everything else by - (dash).
* @note mb_internal_charset() must be set to whatever encoding $string had originally
* @param string $string String to normalize
* @param boolean $trim Trim string to not start/end with a dash and not contain dash sequences
* @param array|string $allow List of Characters that are to be ignored while urlifying array( unicode => true ) or string like "$é@"
* @param array $replace List of Characters that are to be replaced while urlifying array( unicode => "character" )
* @return string normalized string
* @author Christian Kruse <cjk+os@wwwtech.de>
* @author Rodney Rehm <rodney.rehm@medialize.de>
*/
function urlify($string, $trim=true, $allow=null, $replace=null)
{
if (!is_string($string)) {
throw new Exception('$string must be a string');
}
$_replace = array(
0xE4 => "\0\0\0\x61\0\0\0\x65", // ä
0xC4 => "\0\0\0\x41\0\0\0\x65", // Ä
0xF6 => "\0\0\0\x6F\0\0\0\x65", // ö
0xD6 => "\0\0\0\x4F\0\0\0\x65", // Ö
0xFC => "\0\0\0\x75\0\0\0\x65", // ü
0xDC => "\0\0\0\x55\0\0\0\x65", // Ü
0xDF => "\0\0\0\x73\0\0\0\x73", // ß
0xE6 => "\0\0\0\x61\0\0\0\x65", // æ
0xC6 => "\0\0\0\x41\0\0\0\x65", // Æ
);
if ($replace && is_array($replace)) {
foreach ($replace as $k => $v) {
$_replace[$k] = mb_convert_encoding($v, "UTF-32BE");
}
}
if ($allow && is_string($allow)) {
$t = mb_convert_encoding($allow, "UTF-32BE");
$t = unpack("N*", $t);
$allow = array();
foreach ($t as $k) {
$allow[$k] = true;
}
} elseif ($allow && !is_array($allow)) {
$allow = null;
}
$res = '';
$string = mb_convert_encoding($string, "UTF-32BE");
$unicodes = unpack("N*", $string);
$i = -1;
foreach ($unicodes as $code) {
$i++;
if (($code >= 97 && $code <= 122) || ($code >= 65 && $code <= 90) || ($code >= 48 && $code <= 57) || $code == 95 || $code == 45) {
// skip normalization for alphanumeric characters [a-zA-Z0-9_-]
$res .= mb_substr($string, $i, 1, "UTF-32BE");
} elseif ($allow && isset($allow[$code])) {
// skip normalization for allowed characters
$res .= mb_substr($string, $i, 1, "UTF-32BE");
} elseif (isset($_replace[$code])) {
// replace as defined
$res .= $_replace[$code];
} elseif (($code >= 0xC0 && $code <= 0xC6) || ($code >= 0xE0 && $code <= 0xE6) || ($code >= 0x100 && $code <= 0x105)) {
$res .= "\0\0\0\x61"; // a
} elseif ($code == 0xC7 || $code == 0xE7 || ($code >= 0x106 && $code <= 0x10D)) {
$res .= "\0\0\0\x63"; // c
} elseif ($code == 0xD0 || ($code >= 0x10E && $code <= 0x111)) {
$res .= "\0\0\0\x64"; // d
} elseif (($code >= 0xC8 && $code <= 0xCB) || ($code >= 0xE8 && $code <= 0xEB) || ($code >= 0x112 && $code <= 0x11B)) {
$res .= "\0\0\0\x65"; // e
} elseif (($code >= 0xCC && $code <= 0xCF) || ($code >= 0xEC && $code <= 0xEF)) {
$res .= "\0\0\0\x69"; // i
} elseif ($code == 0xD1 || $code == 0xF1) {
$res .= "\0\0\0\x6E"; // n
} elseif (($code >= 0xD2 && $code <= 0xD8) || ($code >= 0xF2 && $code <= 0xF8)) {
$res .= "\0\0\0\x6F"; // o
} elseif (($code >= 0xD9 && $code <= 0xDB) || ($code >= 0xF9 && $code <= 0xFB)) {
$res .= "\0\0\0\x75"; // u
} elseif ($code == 0xDD || $code == 0xFD || $code == 0xFF) {
$res .= "\0\0\0\x79"; // y
} else {
$res .= "\0\0\0\x2D"; // -
}
}
if ($trim) {
$res = preg_replace('#(\0\0\0\x2D){2,}#', "\0\0\0\x2D", $res);
$res = preg_replace('#(^\0\0\0\x2D)|(\0\0\0\x2D$)#', "", $res);
}
return mb_convert_encoding($res, mb_internal_encoding(), "UTF-32BE");
}
<?php
include dirname(__FILE__) . '/urlify.php';
mb_internal_encoding('UTF-8');
$test = array(
'hällö wörld' => 'haelloe-woerld',
'hällö wörldß' => 'haelloe-woerldss',
'hällö wörldß' => 'haelloe-woerldss',
'hällö wörld ' => 'haelloe-woerld',
'hällö wörld %' => 'haelloe-woerld',
'héllò peôplë ÑO?' => 'hello-people-nO',
);
echo '<pre>';
foreach ($test as $o => $r){
$_r = urlify($o);
echo $o .' - '. $_r .' --- '. ($r == $_r ? 'OK' : 'FAILED') .' --- '. bin2hex($_r) ."\n";
}
echo '</pre>';
$test = array(
'hällö wörld' => 'haelloe-woerld',
'hällö wörldß' => 'haelloe-woerldss',
'hällö wörldß' => 'haelloe-woerldss',
'hällö wörld ' => 'haelloe-woerld-',
'hällö wörld %' => 'haelloe-woerld--',
'héllò peôplë ÑO?' => 'hello-people-nO-',
);
echo '<pre>';
foreach ($test as $o => $r){
$_r = urlify($o, false);
echo $o .' - '. $_r .' --- '. ($r == $_r ? 'OK' : 'FAILED') .' --- '. bin2hex($_r) ."\n";
}
echo '</pre>';
$test = array(
'hällö wörld' => 'haelloe-woerld',
'hällö wörldß' => 'haelloe-woerldss',
'hällö wörldß' => 'haelloe-woerldss',
'hällö wörld ' => 'haelloe-woerld',
'hällö wörld %' => 'haelloe-woerld',
'héllò peôplë ÑO?' => 'hello-people-nO',
'héllò peôplë ñO?' => 'hello-people-ñO',
);
echo '<pre>';
foreach ($test as $o => $r){
$_r = urlify($o, true, array(0xF1 => true));
echo $o .' - '. $_r .' --- '. ($r == $_r ? 'OK' : 'FAILED') .' --- '. bin2hex($_r) ."\n";
}
echo '</pre>';
$test = array(
'hällö wörld' => 'haelloe-woerld',
'hällö wörldß' => 'haelloe-woerldss',
'hällö wörldß' => 'haelloe-woerldss',
'hällö wörld ' => 'haelloe-woerld',
'hällö wörld %' => 'haelloe-woerld',
'héllò peôplë ÑO?' => 'hello-people-nO',
'héllò peôplë ñO?' => 'hello-people-XXXO',
);
echo '<pre>';
foreach ($test as $o => $r){
$_r = urlify($o, true, null, array(0xF1 => 'XXX'));
echo $o .' - '. $_r .' --- '. ($r == $_r ? 'OK' : 'FAILED') .' --- '. bin2hex($_r) ."\n";
}
echo '</pre>';
$test = array(
'hällö wörld' => 'haelloe-woerld',
'hällö wörldß' => 'haelloe-woerldss',
'hällö wörldß' => 'haelloe-woerldss',
'hällö wörld ' => 'haelloe-woerld',
'hällö wörld %' => 'haelloe-woerld',
'héllò peôplë ÑO?' => 'hello-people-nO?',
'héllò peôplë ñO?' => 'hello-people-ñO?',
);
echo '<pre>';
foreach ($test as $o => $r){
$_r = urlify($o, true, 'ñ?');
echo $o .' - '. $_r .' --- '. ($r == $_r ? 'OK' : 'FAILED') .' --- '. bin2hex($_r) ."\n";
}
echo '</pre>';
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment