Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
URL Slugs in PHP (with UTF-8 and Transliteration Support)
<?php
/**
 * Create a web friendly URL slug from a string.
 *
 * Although supported, transliteration is discouraged because
 * 1) most web browsers support UTF-8 characters in URLs
 * 2) transliteration causes a loss of information
 *
 * Recognised keys in $options (all optional):
 * - 'delimiter'     string   Separator put in place of every run of characters that
 *                            are neither letters nor decimal digits. Default '-'.
 * - 'limit'         int|null Maximum slug length in characters, applied before the
 *                            ends are trimmed. null or 0 means "no limit". Default null.
 * - 'lowercase'     bool     Lowercase the finished slug. Default true.
 * - 'replacements'  array    Map of PCRE pattern => replacement string, applied to the
 *                            input before any other processing. Default array().
 * - 'transliterate' bool     Replace characters listed in the built-in map with ASCII
 *                            equivalents. Default false.
 *
 * @author Sean Murphy <sean@iamseanmurphy.com>
 * @copyright Copyright 2012 Sean Murphy. All rights reserved.
 * @license http://creativecommons.org/publicdomain/zero/1.0/
 *
 * @param string $str     Text to turn into a slug; it is cast to string and handled as UTF-8.
 * @param array  $options Overrides for the defaults listed above.
 * @return string The slug; empty when the input holds no letters or digits.
 */
function url_slug($str, $options = array()) {
    $str = (string)$str;

    // Leave valid UTF-8 alone: guessing the source encoding among every encoding
    // mbstring knows can misidentify text that is already UTF-8. Only input that
    // is not valid UTF-8 is scrubbed, which swaps each invalid byte sequence for
    // mbstring's substitute character.
    if (!mb_check_encoding($str, 'UTF-8')) {
        $str = (string)mb_convert_encoding($str, 'UTF-8', 'UTF-8');
    }

    $defaults = array(
        'delimiter' => '-',
        'limit' => null,
        'lowercase' => true,
        'replacements' => array(),
        'transliterate' => false,
    );

    // Caller-supplied options win over the defaults
    $options = array_merge($defaults, $options);
    $delimiter = (string)$options['delimiter'];

    // Some characters are listed under more than one language; PHP keeps the
    // last value given for a repeated key, so repeated entries must agree.
    $char_map = array(
        // Latin
        'À' => 'A', 'Á' => 'A', 'Â' => 'A', 'Ã' => 'A', 'Ä' => 'A', 'Å' => 'A', 'Æ' => 'AE', 'Ç' => 'C',
        'È' => 'E', 'É' => 'E', 'Ê' => 'E', 'Ë' => 'E', 'Ì' => 'I', 'Í' => 'I', 'Î' => 'I', 'Ï' => 'I',
        'Ð' => 'D', 'Ñ' => 'N', 'Ò' => 'O', 'Ó' => 'O', 'Ô' => 'O', 'Õ' => 'O', 'Ö' => 'O', 'Ő' => 'O',
        'Ø' => 'O', 'Ù' => 'U', 'Ú' => 'U', 'Û' => 'U', 'Ü' => 'U', 'Ű' => 'U', 'Ý' => 'Y', 'Þ' => 'TH',
        'ß' => 'ss',
        'à' => 'a', 'á' => 'a', 'â' => 'a', 'ã' => 'a', 'ä' => 'a', 'å' => 'a', 'æ' => 'ae', 'ç' => 'c',
        'è' => 'e', 'é' => 'e', 'ê' => 'e', 'ë' => 'e', 'ì' => 'i', 'í' => 'i', 'î' => 'i', 'ï' => 'i',
        'ð' => 'd', 'ñ' => 'n', 'ò' => 'o', 'ó' => 'o', 'ô' => 'o', 'õ' => 'o', 'ö' => 'o', 'ő' => 'o',
        'ø' => 'o', 'ù' => 'u', 'ú' => 'u', 'û' => 'u', 'ü' => 'u', 'ű' => 'u', 'ý' => 'y', 'þ' => 'th',
        'ÿ' => 'y',
        // Latin symbols
        '©' => '(c)',
        // Greek
        'Α' => 'A', 'Β' => 'B', 'Γ' => 'G', 'Δ' => 'D', 'Ε' => 'E', 'Ζ' => 'Z', 'Η' => 'H', 'Θ' => '8',
        'Ι' => 'I', 'Κ' => 'K', 'Λ' => 'L', 'Μ' => 'M', 'Ν' => 'N', 'Ξ' => '3', 'Ο' => 'O', 'Π' => 'P',
        'Ρ' => 'R', 'Σ' => 'S', 'Τ' => 'T', 'Υ' => 'Y', 'Φ' => 'F', 'Χ' => 'X', 'Ψ' => 'PS', 'Ω' => 'W',
        'Ά' => 'A', 'Έ' => 'E', 'Ί' => 'I', 'Ό' => 'O', 'Ύ' => 'Y', 'Ή' => 'H', 'Ώ' => 'W', 'Ϊ' => 'I',
        'Ϋ' => 'Y',
        'α' => 'a', 'β' => 'b', 'γ' => 'g', 'δ' => 'd', 'ε' => 'e', 'ζ' => 'z', 'η' => 'h', 'θ' => '8',
        'ι' => 'i', 'κ' => 'k', 'λ' => 'l', 'μ' => 'm', 'ν' => 'n', 'ξ' => '3', 'ο' => 'o', 'π' => 'p',
        'ρ' => 'r', 'σ' => 's', 'τ' => 't', 'υ' => 'y', 'φ' => 'f', 'χ' => 'x', 'ψ' => 'ps', 'ω' => 'w',
        'ά' => 'a', 'έ' => 'e', 'ί' => 'i', 'ό' => 'o', 'ύ' => 'y', 'ή' => 'h', 'ώ' => 'w', 'ς' => 's',
        'ϊ' => 'i', 'ΰ' => 'y', 'ϋ' => 'y', 'ΐ' => 'i',
        // Turkish
        'Ş' => 'S', 'İ' => 'I', 'Ç' => 'C', 'Ü' => 'U', 'Ö' => 'O', 'Ğ' => 'G',
        'ş' => 's', 'ı' => 'i', 'ç' => 'c', 'ü' => 'u', 'ö' => 'o', 'ğ' => 'g',
        // Russian
        'А' => 'A', 'Б' => 'B', 'В' => 'V', 'Г' => 'G', 'Д' => 'D', 'Е' => 'E', 'Ё' => 'Yo', 'Ж' => 'Zh',
        'З' => 'Z', 'И' => 'I', 'Й' => 'J', 'К' => 'K', 'Л' => 'L', 'М' => 'M', 'Н' => 'N', 'О' => 'O',
        'П' => 'P', 'Р' => 'R', 'С' => 'S', 'Т' => 'T', 'У' => 'U', 'Ф' => 'F', 'Х' => 'H', 'Ц' => 'C',
        'Ч' => 'Ch', 'Ш' => 'Sh', 'Щ' => 'Sh', 'Ъ' => '', 'Ы' => 'Y', 'Ь' => '', 'Э' => 'E', 'Ю' => 'Yu',
        'Я' => 'Ya',
        'а' => 'a', 'б' => 'b', 'в' => 'v', 'г' => 'g', 'д' => 'd', 'е' => 'e', 'ё' => 'yo', 'ж' => 'zh',
        'з' => 'z', 'и' => 'i', 'й' => 'j', 'к' => 'k', 'л' => 'l', 'м' => 'm', 'н' => 'n', 'о' => 'o',
        'п' => 'p', 'р' => 'r', 'с' => 's', 'т' => 't', 'у' => 'u', 'ф' => 'f', 'х' => 'h', 'ц' => 'c',
        'ч' => 'ch', 'ш' => 'sh', 'щ' => 'sh', 'ъ' => '', 'ы' => 'y', 'ь' => '', 'э' => 'e', 'ю' => 'yu',
        'я' => 'ya',
        // Ukrainian
        'Є' => 'Ye', 'І' => 'I', 'Ї' => 'Yi', 'Ґ' => 'G',
        'є' => 'ye', 'і' => 'i', 'ї' => 'yi', 'ґ' => 'g',
        // Czech
        'Č' => 'C', 'Ď' => 'D', 'Ě' => 'E', 'Ň' => 'N', 'Ř' => 'R', 'Š' => 'S', 'Ť' => 'T', 'Ů' => 'U',
        'Ž' => 'Z',
        'č' => 'c', 'ď' => 'd', 'ě' => 'e', 'ň' => 'n', 'ř' => 'r', 'š' => 's', 'ť' => 't', 'ů' => 'u',
        'ž' => 'z',
        // Polish (uppercase letters map to uppercase ASCII)
        'Ą' => 'A', 'Ć' => 'C', 'Ę' => 'E', 'Ł' => 'L', 'Ń' => 'N', 'Ó' => 'O', 'Ś' => 'S', 'Ź' => 'Z',
        'Ż' => 'Z',
        'ą' => 'a', 'ć' => 'c', 'ę' => 'e', 'ł' => 'l', 'ń' => 'n', 'ó' => 'o', 'ś' => 's', 'ź' => 'z',
        'ż' => 'z',
        // Latvian (uppercase letters map to uppercase ASCII)
        'Ā' => 'A', 'Č' => 'C', 'Ē' => 'E', 'Ģ' => 'G', 'Ī' => 'I', 'Ķ' => 'K', 'Ļ' => 'L', 'Ņ' => 'N',
        'Š' => 'S', 'Ū' => 'U', 'Ž' => 'Z',
        'ā' => 'a', 'č' => 'c', 'ē' => 'e', 'ģ' => 'g', 'ī' => 'i', 'ķ' => 'k', 'ļ' => 'l', 'ņ' => 'n',
        'š' => 's', 'ū' => 'u', 'ž' => 'z'
    );

    // Make custom replacements (pattern => replacement pairs supplied by the caller)
    if (!empty($options['replacements'])) {
        $str = (string)preg_replace(
            array_keys($options['replacements']),
            array_values($options['replacements']),
            $str
        );
    }

    // Transliterate mapped characters in a single pass
    if ($options['transliterate']) {
        $str = strtr($str, $char_map);
    }

    // Replace every run of non-alphanumeric characters with the delimiter
    $str = (string)preg_replace('/[^\p{L}\p{Nd}]+/u', $delimiter, $str);

    if ($delimiter !== '') {
        // Collapse repeated delimiters (possible when the delimiter itself is alphanumeric)
        $str = (string)preg_replace('/(' . preg_quote($delimiter, '/') . '){2,}/', '$1', $str);
    }

    // Truncate slug to max. characters (counted in characters, not bytes)
    if ($options['limit']) {
        $str = mb_substr($str, 0, (int)$options['limit'], 'UTF-8');
    }

    // Remove delimiter characters from both ends. This is done per code point:
    // trim() works on bytes and would cut a multibyte character in half when
    // the delimiter is itself multibyte and shares a byte with it.
    if ($delimiter !== '') {
        $chars = preg_quote($delimiter, '/');
        $trimmed = preg_replace('/\A[' . $chars . ']+|[' . $chars . ']+\z/u', '', $str);
        // preg_replace() yields null on failure (e.g. a delimiter that is not valid UTF-8)
        $str = ($trimmed === null) ? trim($str, $delimiter) : $trimmed;
    }

    return $options['lowercase'] ? mb_strtolower($str, 'UTF-8') : $str;
}
?>
<?php
// Demo script: prints each sample string followed by the slug url_slug() builds from it.
// require_once with __DIR__ loads the function from the directory of this script
// (not the include path or working directory) and stops right away if it is missing.
require_once __DIR__ . '/url_slug.php';
header('Content-type: text/plain; charset=utf-8');
// Basic usage
echo "This is an example string. Nothing fancy." . "\n";
echo url_slug("This is an example string. Nothing fancy.") . "\n\n";
// Example using French with unwanted characters ('?)
echo "Qu'en est-il français? Ça marche alors?" . "\n";
echo url_slug("Qu'en est-il français? Ça marche alors?") . "\n\n";
// Example using transliteration
echo "Что делать, если я не хочу, UTF-8?" . "\n";
echo url_slug("Что делать, если я не хочу, UTF-8?", array('transliterate' => true)) . "\n\n";
// Example using transliteration on an unsupported language:
// characters missing from the transliteration map are kept as they are
echo "מה אם אני לא רוצה UTF-8 תווים?" . "\n";
echo url_slug("מה אם אני לא רוצה UTF-8 תווים?", array('transliterate' => true)) . "\n\n";
// Some other options: custom delimiter, length limit, original case, regex replacements
echo "This is an Example String. What's Going to Happen to Me?" . "\n";
echo url_slug(
    "This is an Example String. What's Going to Happen to Me?",
    array(
        'delimiter' => '_',
        'limit' => 40,
        'lowercase' => false,
        'replacements' => array(
            '/\b(an)\b/i' => 'a',
            '/\b(example)\b/i' => 'Test'
        )
    )
);
/*
Output:
This is an example string. Nothing fancy.
this-is-an-example-string-nothing-fancy
Qu'en est-il français? Ça marche alors?
qu-en-est-il-français-ça-marche-alors
Что делать, если я не хочу, UTF-8?
chto-delat-esli-ya-ne-hochu-utf-8
מה אם אני לא רוצה UTF-8 תווים?
מה-אם-אני-לא-רוצה-utf-8-תווים
This is an Example String. What's Going to Happen to Me?
This_is_a_Test_String_What_s_Going_to_Ha
*/
?>
@cobyl
Copy link

cobyl commented Jul 17, 2013

@aduycuong
Copy link

transliterate = true has no effect on [ộ, ủ, ĩ, ệ]

@arturssmirnovs
Copy link

Awesome stuff :)

@geejay101
Copy link

Seems to work. Fingers crossed.

Thanks!

@sokai
Copy link

sokai commented Feb 3, 2017

And German:

		// German
		'Ä' => 'AE', 'Ö' => 'OE', 'Ü' => 'UE',
		'ä' => 'ae', 'ö' => 'oe', 'ü' => 'ue',

@ausi
Copy link

ausi commented Nov 3, 2017

@onassar
Copy link

onassar commented Dec 13, 2017

@ausi can you compare the two libs?

@ADProperant
Copy link

Hi, I'm using your code because it's the most complete that I have found.
In some cases, for example when sharing a URL on Twitter (via the API), some alphanumeric characters are handled incorrectly, such as ordinal characters (like ª).
I suggest completing this code by also replacing some Latin-1 Supplement characters. (https://cloford.com/resources/charcodes/utf-8_latin1_supplement.htm)

@aramokay
Copy link

aramokay commented Dec 5, 2018

And Armenian
//Armenian "և"=>"ev","ու"=>"u", "Ա"=>"A","Բ"=>"B","Գ"=>"G","Դ"=>"D","Ե"=>"Ye","Զ"=>"Z","Է"=>"E", "Ը"=>"Y","Թ"=>"T","Ժ"=>"Zh","Ի"=>"I","Լ"=>"L","Խ"=>"X","Ծ"=>"Tc", "Կ"=>"K","Հ"=>"H","Ձ"=>"Dz","Ղ"=>"Gh","Ճ"=>"Tch","Մ"=>"M","Յ"=>"Y", "Ն"=>"N","Շ"=>"Sh","Ո"=>"Vo","Չ"=>"Ch","Պ"=>"P","Ջ"=>"J","Ռ"=>"R", "Ս"=>"S","Վ"=>"V","Տ"=>"T","Ր"=>"R","Ց"=>"C","Փ"=>"P","Ք"=>"Q", "Օ"=>"O","Ֆ"=>"F", "Ու"=>"U", "ա"=>"a","բ"=>"b","գ"=>"g","դ"=>"d","ե"=>"e","զ"=>"z","է"=>"e", "ը"=>"y","թ"=>"th","ժ"=>"zh","ի"=>"i","լ"=>"l","խ"=>"x","ծ"=>"ts", "կ"=>"k","հ"=>"h","ձ"=>"dz","ղ"=>"gh","ճ"=>"ch","մ"=>"m","յ"=>"y", "ն"=>"n","շ"=>"sh","ո"=>"o","չ"=>"ch","պ"=>"p","ջ"=>"j","ռ"=>"r", "ս"=>"s","վ"=>"v","տ"=>"t","ր"=>"r","ց"=>"c","փ"=>"p","ք"=>"q", "օ"=>"o","ֆ"=>"f", "№"=>"#","—"=>"-","«"=>"","»"=>"","…"=>""

@BonBonSlick
Copy link

BonBonSlick commented Dec 24, 2018

Why does the comment say "to ASCII"? We just do a replacement without changing the encoding.

// Transliterate characters to ASCII

$str = str_replace(array_keys($char_map), $char_map, $str);

@Samirdatainflow
Copy link

If you are working with the PHP CodeIgniter framework and want to generate a URL slug in CodeIgniter, follow this link:
https://datainflow.com/generate-url-slug-codeigniter/

@egorsmkv
Copy link

egorsmkv commented Aug 4, 2019

The Ukrainian transliteration array seems to be incomplete.

@geekyayush
Copy link

Woah! Thanks a lot, man.
It really helped.
I've also implemented most of the suggestions from comments. :)

@michaelbutler
Copy link

Cool, I like how it keeps most UTF-8 characters so it works internationally. I want to remind everyone reading this that UTF-8 characters aren't allowed in HTTP message headers (Body is fine of course), which includes the HTTP method and DOCUMENT PATH line. So, when sending an HTTP message with a URL path that contains non-ASCII special UTF-8 characters, or when doing a Location: redirect header, please be sure to properly URL encode the data.

Example:

GET /cookbook/recipes/Что-делать.html HTTP/1.1
Host: www.rebol.net
User-Agent: Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:88.0) Gecko/20100101 Firefox/88.0
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8
Accept-Language: en-US,en;q=0.5
Accept-Encoding: gzip, deflate
Referer: https://www.google.com/

That looks like a typical GET request. What's wrong? The GET line includes non-ASCII characters: Что-делать.html. The correct Request should be:

GET /cookbook/recipes/%D0%A7%D1%82%D0%BE-%D0%B4%D0%B5%D0%BB%D0%B0%D1%82%D1%8C.html HTTP/1.1
Host: www.rebol.net
User-Agent: Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:88.0) Gecko/20100101 Firefox/88.0
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8
Accept-Language: en-US,en;q=0.5
Accept-Encoding: gzip, deflate
Referer: https://www.google.com/

Though in general, I would recommend only using plain ASCII in URLs for maximum compatibility, but sometimes it cannot be avoided.

For HTTP responses, be sure to also use plain ASCII only:

HTTP/1.1 301 Moved Permanently
Server: nginx
Date: Tue, 18 May 2021 22:52:47 GMT
Content-Type: text/html
Content-Length: 162
Connection: keep-alive
Location: https://www.tumblr.com/%D0%A7%D1%82%D0%BE-%D0%B4%D0%B5%D0%BB%D0%B0%D1%82%D1%8C.html
 
<html>
<head><title>301 Moved Permanently</title></head>
<body>
<center><h1>301 Moved Permanently</h1></center>
<hr><center>nginx</center>
</body>
</html>

Notice the Location: header on the Response does not include any UTF-8 characters.

Some browsers and systems will fudge it and allow this, but others will be strict and deny accepting the Request or Response.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment