Created
November 9, 2014 16:24
-
-
Save rxu/0660eef7a2f9e7992db6 to your computer and use it in GitHub Desktop.
IDN URL validation example
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Exclusion filters for characters that should not appear in an IDN label.
// Each variable holds a single-character negated PCRE class and is meant to be
// used with the /u modifier.

// Start with characters that are NFKC case-folded (as in IDNA2003):
//   \P{Changes_When_NFKC_Casefolded}
// NOTE(review): no pattern is defined for this step in this section.

// Exclude control characters and whitespace (as in IDNA2003).
$no_cc = '[^\p{C}\p{Z}]';

// Exclude symbols, punctuation, non-decimal numbers, and enclosing marks.
$no_symbol = '[^\p{S}\p{P}\p{Nl}\p{No}\p{Me}]';

// Exclude characters used for archaic Hangul (Korean): \p{HST=L} and \p{HST=V},
// as per http://unicode.org/Public/UNIDATA/HangulSyllableType.txt
$no_hangul = '[^\x{1100}-\x{115F}\x{A960}-\x{A97C}\x{1160}-\x{11A7}\x{D7B0}-\x{D7C6}]';

// Exclude three blocks of technical or archaic symbols.
$no_cdm = '[^\x{20D0}-\x{20FF}]'; // \p{block=Combining_Diacritical_Marks_For_Symbols}
$no_musical = '[^\x{1D100}-\x{1D1FF}]'; // \p{block=Musical_Symbols}
// The range separator must be an ASCII hyphen-minus. Any other dash character
// is a literal class member, which would exclude only the two end points
// instead of the whole block.
$no_ancient_greek_musical = '[^\x{1D200}-\x{1D24F}]'; // \p{block=Ancient_Greek_Musical_Notation}
/* Exclude individual characters that the category-based filters do not catch:
** U+0640 ARABIC TATWEEL
** U+07FA NKO LAJANYALAN
** U+302E HANGUL SINGLE DOT TONE MARK
** U+302F HANGUL DOUBLE DOT TONE MARK
** U+3031 VERTICAL KANA REPEAT MARK
** U+3032 VERTICAL KANA REPEAT WITH VOICED SOUND MARK
** ..     (the range U+3031-U+3035 is excluded as a whole)
** U+3035 VERTICAL KANA REPEAT MARK LOWER HALF
** U+303B VERTICAL IDEOGRAPHIC ITERATION MARK
*/
// Single-character negated PCRE class; requires the /u modifier for \x{...} code points.
$no_certain_exceptions = '[^\x{0640}\x{07FA}\x{302E}\x{302F}\x{3031}-\x{3035}\x{303B}]';
/* Characters to allow even though the category-based filters exclude them:
** U+00B7 MIDDLE DOT
** U+0375 GREEK LOWER NUMERAL SIGN
** U+05F3 HEBREW PUNCTUATION GERESH
** U+05F4 HEBREW PUNCTUATION GERSHAYIM
** U+30FB KATAKANA MIDDLE DOT
** U+002D HYPHEN-MINUS
** U+06FD ARABIC SIGN SINDHI AMPERSAND
** U+06FE ARABIC SIGN SINDHI POSTPOSITION MEN
** U+0F0B TIBETAN MARK INTERSYLLABIC TSHEG
** U+3007 IDEOGRAPHIC NUMBER ZERO
*/
// Positive single-character PCRE class; requires the /u modifier for \x{...} code points.
$add_certain_exceptions = '[\x{00B7}\x{0375}\x{05F3}\x{05F4}\x{30FB}\x{002D}\x{06FD}\x{06FE}\x{0F0B}\x{3007}]';
/* Special exceptions to allow (the "Deviations"):
** U+00DF LATIN SMALL LETTER SHARP S
** U+03C2 GREEK SMALL LETTER FINAL SIGMA
** U+200C ZERO WIDTH NON-JOINER
** U+200D ZERO WIDTH JOINER
*/
// Positive single-character PCRE class; requires the /u modifier for \x{...} code points.
$add_deviations = '[\x{00DF}\x{03C2}\x{200C}\x{200D}]';
// Placeholders interpolated into the URL pattern below. They must be defined
// before the double-quoted string is evaluated; otherwise PHP substitutes empty
// strings (and reports undefined variables), which turns the scheme part into
// "[a-z]*" and drops ")" from the allowed sub-delimiters.
// $scheme: characters allowed after the first letter of the scheme
//          (RFC 3986: ALPHA / DIGIT / "+" / "-" / ".").
$scheme = '[a-z\d+\-.]';
// $inline: the closing parenthesis, completing the RFC 3986 sub-delims
//          "!$&'()*+,;=" inside the character classes.
$inline = ')';

// URL pattern: scheme "://" (host | dotted digits | bracketed literal)
// [":" port] path ["?" query] ["#" fragment]. \pL additionally admits any
// Unicode letter in host, path, query and fragment.
$preg_expression = "[a-z]$scheme*:/{2}(?:(?:[\pLa-z0-9\-._~!$&'($inline*+,;=:@|]+|%[\dA-F]{2})+|[0-9.]+|\[[\pLa-z0-9.]+:[\pLa-z0-9.]+:[\pLa-z0-9.:]+\])(?::\d*)?(?:/(?:[\pLa-z0-9\-._~!$&'($inline*+,;=:@|]+|%[\dA-F]{2})*)*(?:\?(?:[\pLa-z0-9\-._~!$&'($inline*+,;=:@/?|]+|%[\dA-F]{2})*)?(?:\#(?:[\pLa-z0-9\-._~!$&'($inline*+,;=:@/?|]+|%[\dA-F]{2})*)?";

// Every exclusion class must hold for the SAME character, so each one is
// applied as a lookahead in front of a single consuming class. Concatenating
// the classes directly would instead require one extra character per filter
// and would not inspect the URL's own characters at all.
// $add_certain_exceptions and $add_deviations are not applied here.
$filters = array(
	$no_cc,
	$no_symbol,
	$no_hangul,
	$no_cdm,
	$no_musical,
	$no_ancient_greek_musical,
	$no_certain_exceptions,
);
$idn_char = '(?=' . implode(')(?=', $filters) . ')[^\x{00}-\x{7F}]';

// Zero-width guard over the whole subject: ASCII characters are left to the
// URL pattern (the filters exclude punctuation such as "." and "/", which URL
// syntax needs), while every non-ASCII character must pass all filters.
$regex = '(?=(?:[\x{00}-\x{7F}]|' . $idn_char . ')*+\z)';

$url = 'http://www.täst.de';

// Anchored so that the whole string, not just a substring, has to be a URL.
var_dump(preg_match('#\A' . $regex . '(?:' . $preg_expression . ')\z#iu', $url));
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment