Instantly share code, notes, and snippets.
Created
July 26, 2020 08:27
-
Star
(0)
0
You must be signed in to star a gist -
Fork
(0)
0
You must be signed in to fork a gist
-
Save polevaultweb/bcce0b2eeaa09609abea4cf75b865d06 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/**
 * Build the PCRE pattern that locates hashtag text inside a string.
 *
 * Adapted from the hashtag expressions in https://github.com/ngnpope/twitter-text-php
 *
 * The pattern itself contains no '#' sign: it matches the tag text only.
 * A tag is a run of letters, digits and underscores that contains at least
 * one character which is not a digit.
 *
 * Capture groups:
 *   1 - the boundary in front of the tag (empty at the start of the subject,
 *       otherwise the single non-tag character that precedes it);
 *   2 - the tag text;
 *   3 - the rest of the current line after the tag (captured inside a
 *       lookahead, so it is not consumed by the match).
 *
 * @return string Regex pattern.
 */
protected function valid_hashtag_regex() {
    // Latin letters carrying diacritics: Latin-1 Supplement letters, Latin
    // Extended-A/B, a handful of IPA letters, U+02BB, the combining diacritical
    // marks (U+0300-U+036F) and Latin Extended Additional (U+1E00-U+1EFF).
    // U+00D7 (multiplication sign, confusable with 'x') and U+00F7 (division
    // sign) are deliberately left out of the Latin-1 ranges.
    $accented_latin = '\x{00c0}-\x{00d6}\x{00d8}-\x{00f6}\x{00f8}-\x{00ff}'
        . '\x{0100}-\x{024f}\x{0253}-\x{0254}\x{0256}-\x{0257}'
        . '\x{0259}\x{025b}\x{0263}\x{0268}\x{026f}\x{0272}\x{0289}\x{028b}\x{02bb}\x{0300}-\x{036f}\x{1e00}-\x{1eff}';

    // Cyrillic (Russian, Ukrainian, ...): base block, Supplement, Extended-A
    // and Extended-B.
    $cyrillic = '\x{0400}-\x{04ff}\x{0500}-\x{0527}\x{2de0}-\x{2dff}\x{a640}-\x{a69f}';

    // Hebrew letters and points, followed by the Hebrew presentation forms.
    $hebrew = '\x{0591}-\x{05bf}\x{05c1}-\x{05c2}\x{05c4}-\x{05c5}\x{05c7}\x{05d0}-\x{05ea}\x{05f0}-\x{05f4}'
        . '\x{fb12}-\x{fb28}\x{fb2a}-\x{fb36}\x{fb38}-\x{fb3c}\x{fb3e}\x{fb40}-\x{fb41}\x{fb43}-\x{fb44}\x{fb46}-\x{fb4f}';

    // Arabic, Arabic Supplement, Arabic Extended-A and the Arabic
    // presentation forms A and B.
    $arabic = '\x{0610}-\x{061a}\x{0620}-\x{065f}\x{066e}-\x{06d3}\x{06d5}-\x{06dc}\x{06de}-\x{06e8}\x{06ea}-\x{06ef}\x{06fa}-\x{06fc}\x{06ff}'
        . '\x{0750}-\x{077f}\x{08a0}\x{08a2}-\x{08ac}\x{08e4}-\x{08fe}'
        . '\x{fb50}-\x{fbb1}\x{fbd3}-\x{fd3d}\x{fd50}-\x{fd8f}\x{fd92}-\x{fdc7}\x{fdf0}-\x{fdfb}\x{fe70}-\x{fe74}\x{fe76}-\x{fefc}';

    // Zero-width non-joiner (U+200C) and Thai U+0E01-U+0E3A.
    $zwnj_and_thai = '\x{200c}\x{0e01}-\x{0e3a}';

    // U+0E40-U+0E4E lies in the Unicode Thai block (the upstream source groups
    // it with Hangul). The remaining ranges are Hangul: Jamo, Compatibility
    // Jamo, Jamo Extended-A, Syllables, Jamo Extended-B and half-width Hangul.
    $thai_marks_and_hangul = '\x{0e40}-\x{0e4e}\x{1100}-\x{11ff}\x{3130}-\x{3185}\x{a960}-\x{a97f}\x{ac00}-\x{d7af}\x{d7b0}-\x{d7ff}\x{ffa1}-\x{ffdc}';

    // Japanese / Chinese: full- and half-width Katakana, full-width Latin
    // digits and letters, Hiragana, CJK Extension A, CJK Unified Ideographs,
    // U+3003 / U+3005 / U+303B, then CJK Extensions B-D and the CJK
    // compatibility supplement in the supplementary planes.
    $cjk = '\x{30A1}-\x{30FA}\x{30FC}-\x{30FE}\x{FF66}-\x{FF9F}\x{FF10}-\x{FF19}\x{FF21}-\x{FF3A}\x{FF41}-\x{FF5A}\x{3041}-\x{3096}\x{3099}-\x{309E}\x{3400}-\x{4DBF}\x{4E00}-\x{9FFF}\x{3003}\x{3005}\x{303B}\x{020000}-\x{02a6df}\x{02a700}-\x{02b73f}\x{02b740}-\x{02b81f}\x{02f800}-\x{02fa1f}';

    // Every non-ASCII code point allowed inside a tag, assembled once and
    // shared by the three character classes below.
    $extended_chars = $accented_latin
        . $cyrillic
        . $hebrew
        . $arabic
        . $zwnj_and_thai
        . $thai_marks_and_hangul
        . $cjk;

    // A tag character that is not a digit; at least one is required per tag.
    $non_digit_char = '[a-z_' . $extended_chars . ']';

    // Any tag character, digits included.
    $any_tag_char = '[a-z0-9_' . $extended_chars . ']';

    // What may precede a tag: start or end of the subject, or one character
    // that is neither a tag character nor '&'.
    $boundary = '(?:\A|\z|[^&a-z0-9_' . $extended_chars . '])';

    // 'i' makes the ASCII ranges case-insensitive; 'u' enables UTF-8 mode so
    // the \x{...} escapes are treated as code points.
    return '/(' . $boundary . ')(' . $any_tag_char . '*' . $non_digit_char . $any_tag_char . '*)(?=(.*|$))/iu';
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment