Created
September 10, 2015 13:51
-
-
Save ShimmerFairy/392d0c3550a5e7aa4287 to your computer and use it in GitHub Desktop.
The Grapheme Cluster regexes, in P6 regex form
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# The EGC rule, in P6 form (UAX #29, Table 1b)
# Declare that this file is written in Perl 6 (Raku).
use v6;
# Note that the | alternation in the Unicode syntax corresponds to || in P6 syntax
# ("Note that given alternates (X|Y), the first match is taken.")
# A character with Grapheme_Cluster_Break=CR immediately followed by one with
# Grapheme_Cluster_Break=LF. UAX #29 (rule GB3) never breaks inside this pair,
# so both cluster tokens try it before anything else.
my token CRLF { <:GCB<CR>> <:GCB<LF>> }
# One or more consecutive characters whose Grapheme_Cluster_Break value is
# Regional_Indicator, taken greedily as a single unit.
my token RI-Sequence { <:GCB<Regional_Indicator>>+ }
# A Hangul syllable sequence, expressed with the Grapheme_Cluster_Break values
# L (leading jamo), V (vowel jamo), T (trailing jamo), LV and LVT (precomposed
# syllables). The alternatives are joined with || so they are tried in order
# and the first one that matches wins, as the UAX's regex notation requires.
my token Hangul-Syllable {
    # any leading jamo, at least one vowel jamo, any trailing jamo
    || <:GCB<L>>* <:GCB<V>>+ <:GCB<T>>*
    # any leading jamo, one LV syllable, further vowels, any trailing jamo
    || <:GCB<L>>* <:GCB<LV>> <:GCB<V>>* <:GCB<T>>*
    # any leading jamo, one LVT syllable, any trailing jamo
    || <:GCB<L>>* <:GCB<LVT>> <:GCB<T>>*
    # leading jamo only
    || <:GCB<L>>+
    # trailing jamo only
    || <:GCB<T>>+
}
# Any single character that is not in the Unicode general category M (Mark).
my token base { <-:M> }
# A <base> character, or one of the two zero-width joining controls
# (U+200D ZERO WIDTH JOINER, U+200C ZERO WIDTH NON-JOINER).
# NOTE(review): neither this token nor <base> is referenced by the cluster
# tokens visible in this file.
my token extended_base { <base> | \c[ZERO WIDTH JOINER] | \c[ZERO WIDTH NON-JOINER] }
# Legacy grapheme cluster, following the UAX #29 regex:
#   CRLF
#   | ( RI-sequence | Hangul-Syllable | !Control ) Grapheme_Extend*
#   | .
# Every UAX alternation is written as || (ordered, first match wins) because
# the UAX states that the first matching alternate is taken.
my token LGC {
    || <CRLF>
    # CR and LF have their own Grapheme_Cluster_Break values, so "not Control"
    # alone would accept a lone CR or LF and then attach following extenders
    # to it. UAX #29 rule GB4 breaks after CR, LF and Control, so they are
    # excluded here and fall through to the single-character case below.
    || [ <RI-Sequence> || <Hangul-Syllable> || <-:GCB<Control> -:GCB<CR> -:GCB<LF>> ] <:Grapheme_Extend>*
    # anything else (controls, lone CR/LF) is a cluster by itself
    || .
}
# Extended grapheme cluster, following the UAX #29 regex:
#   CRLF
#   | Prepend* ( RI-sequence | Hangul-Syllable | !Control )
#     ( Grapheme_Extend | SpacingMark )*
#   | .
# Every UAX alternation is written as || (ordered, first match wins) because
# the UAX states that the first matching alternate is taken.
my token EGC {
    || <CRLF>
    || <:GCB<Prepend>>*
       # CR and LF have their own Grapheme_Cluster_Break values, so "not
       # Control" alone would accept a lone CR or LF and then attach following
       # marks to it. UAX #29 rule GB4 breaks after CR, LF and Control, so
       # they are excluded here and fall through to the last alternative.
       [ <RI-Sequence> || <Hangul-Syllable> || <-:GCB<Control> -:GCB<CR> -:GCB<LF>> ]
       # trailing extenders and spacing marks; each alternative is a single
       # character, so ordered alternation is sufficient
       [ <:Grapheme_Extend> || <:GCB<SpacingMark>> ]*
    # anything else (controls, lone CR/LF) is a cluster by itself
    || .
}
# Demo: count how many clusters each definition finds in the sample string
# U+0928 DEVANAGARI LETTER NA + U+093F DEVANAGARI VOWEL SIGN I.
# U+093F is a SpacingMark, which only the extended definition attaches to the
# preceding base, so UAX #29 expects 2 legacy clusters and 1 extended cluster.
# NOTE(review): the printed numbers depend on how the running implementation
# presents the string to the regex engine - confirm on the target compiler.
my $sample = "नि";

# With :g, $/ holds the list of all matches; numifying it gives the count.
$sample ~~ m:g/<LGC>/;
say "Legacy Grapheme Cluster: ", +$/;

$sample ~~ m:g/<EGC>/;
say "Extended Grapheme Cluster: ", +$/;
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment