Skip to content

Instantly share code, notes, and snippets.

@rxu
Created November 9, 2014 16:24
Show Gist options
  • Save rxu/0660eef7a2f9e7992db6 to your computer and use it in GitHub Desktop.
Save rxu/0660eef7a2f9e7992db6 to your computer and use it in GitHub Desktop.
IDN URL validation example
// IDN URL validation example.
//
// Builds a set of PCRE character classes describing which code points are
// excluded from / added to internationalized host names, appends them to a
// generic URL pattern and runs the result against a sample IDN URL.
//
// All patterns are meant to be used with the `u` (UTF-8) modifier so that
// \p{..} properties and \x{....} code points are interpreted as Unicode.

// Start with characters that are NFKC Case folded (as in IDNA2003)
// \P{Changes_When_NFKC_Casefolded}
// Remove Control Characters and Whitespace (as in IDNA2003)
$no_cc = '[^\p{C}\p{Z}]';
// Remove Symbols, Punctuation, non-decimal Numbers, and Enclosing Marks
$no_symbol = '[^\p{S}\p{P}\p{Nl}\p{No}\p{Me}]';
// Remove characters used for archaic Hangul (Korean) - \p{HST=L} and \p{HST=V}
// as per http://unicode.org/Public/UNIDATA/HangulSyllableType.txt
$no_hangul = '[^\x{1100}-\x{115F}\x{A960}-\x{A97C}\x{1160}-\x{11A7}\x{D7B0}-\x{D7C6}]';
// Remove three blocks of technical or archaic symbols.
$no_cdm = '[^\x{20D0}-\x{20FF}]'; // \p{block=Combining_Diacritical_Marks_For_Symbols}
$no_musical = '[^\x{1D100}-\x{1D1FF}]'; // \p{block=Musical_Symbols}
// The range separator must be an ASCII hyphen-minus. An EN DASH (U+2013) here
// would be taken as a literal class member and only the two end points of the
// block (plus the dash itself) would be excluded.
$no_ancient_greek_musical = '[^\x{1D200}-\x{1D24F}]'; // \p{block=Ancient_Greek_Musical_Notation}
/* Remove certain exceptions:
** U+0640 ARABIC TATWEEL
** U+07FA NKO LAJANYALAN
** U+302E HANGUL SINGLE DOT TONE MARK
** U+302F HANGUL DOUBLE DOT TONE MARK
** U+3031 VERTICAL KANA REPEAT MARK
** U+3032 VERTICAL KANA REPEAT WITH VOICED SOUND MARK
** ..
** U+3035 VERTICAL KANA REPEAT MARK LOWER HALF
** U+303B VERTICAL IDEOGRAPHIC ITERATION MARK
*/
$no_certain_exceptions = '[^\x{0640}\x{07FA}\x{302E}\x{302F}\x{3031}-\x{3035}\x{303B}]';
/* Add certain exceptions:
** U+00B7 MIDDLE DOT
** U+0375 GREEK LOWER NUMERAL SIGN
** U+05F3 HEBREW PUNCTUATION GERESH
** U+05F4 HEBREW PUNCTUATION GERSHAYIM
** U+30FB KATAKANA MIDDLE DOT
** U+002D HYPHEN-MINUS
** U+06FD ARABIC SIGN SINDHI AMPERSAND
** U+06FE ARABIC SIGN SINDHI POSTPOSITION MEN
** U+0F0B TIBETAN MARK INTERSYLLABIC TSHEG
** U+3007 IDEOGRAPHIC NUMBER ZERO
*/
$add_certain_exceptions = '[\x{00B7}\x{0375}\x{05F3}\x{05F4}\x{30FB}\x{002D}\x{06FD}\x{06FE}\x{0F0B}\x{3007}]';
/* Add special exceptions (Deviations):
** U+00DF LATIN SMALL LETTER SHARP S
** U+03C2 GREEK SMALL LETTER FINAL SIGMA
** U+200C ZERO WIDTH NON-JOINER
** U+200D ZERO WIDTH JOINER
*/
$add_deviations = '[\x{00DF}\x{03C2}\x{200C}\x{200D}]';

// The URL pattern below interpolates $scheme and $inline. They were never
// defined, which raises an "undefined variable" diagnostic and silently
// substitutes empty strings (turning the scheme part into `[a-z]*` and
// dropping `)` from the allowed delimiters).
// NOTE(review): the values mirror the plain 'url' mode of phpBB's
// get_preg_expression(), from which this pattern appears to be derived -
// confirm that this is the intended mode.
$scheme = '[a-z\d+\-.]'; // characters allowed after the first letter of the scheme
$inline = ')';           // closing parenthesis is a valid sub-delimiter outside inline mode

// Generic URL pattern: scheme "://" host (name, IPv4-like or bracketed IPv6-like)
// followed by optional port, path segments, query and fragment.
$preg_expression = "[a-z]$scheme*:/{2}(?:(?:[\pLa-z0-9\-._~!$&'($inline*+,;=:@|]+|%[\dA-F]{2})+|[0-9.]+|\[[\pLa-z0-9.]+:[\pLa-z0-9.]+:[\pLa-z0-9.:]+\])(?::\d*)?(?:/(?:[\pLa-z0-9\-._~!$&'($inline*+,;=:@|]+|%[\dA-F]{2})*)*(?:\?(?:[\pLa-z0-9\-._~!$&'($inline*+,;=:@/?|]+|%[\dA-F]{2})*)?(?:\#(?:[\pLa-z0-9\-._~!$&'($inline*+,;=:@/?|]+|%[\dA-F]{2})*)?";

// NOTE(review): concatenating negated classes makes every class consume its
// own character (seven consecutive characters in total), it does not restrict
// a single character to the intersection of the sets. If the goal is "one
// character that is in none of the removed sets", the exclusions need to be
// merged into a single `[^...]` class - confirm the intended semantics.
$regex = $no_cc . $no_symbol . $no_hangul . $no_cdm . $no_musical . $no_ancient_greek_musical . $no_certain_exceptions;// . $add_certain_exceptions . $add_deviations;

// Sample IDN URL; `i` = case-insensitive, `u` = treat pattern and subject as UTF-8.
$url = 'http://www.täst.de';
var_dump(preg_match('#' . $preg_expression . $regex . '#iu', $url));
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment