Skip to content

Instantly share code, notes, and snippets.

@tricky-labyrinth
Last active January 21, 2025 19:34
A hardcoded version of Llama 3's regex pattern, with some test cases.
mod unicode_tables;
use fancy_regex::Regex;
use bitvec::prelude::*;
/// Splits `text` into pre-tokenization fragments; the hand-written replacement
/// for `regex.find_iter`.
///
/// At every position the matchers are tried in a fixed order (contractions,
/// word, numbers, symbol block, newlines, trailing whitespace, whitespace) and
/// the first one that succeeds decides the fragment length, mirroring the
/// left-to-right alternation of the regex. Each returned slice borrows from
/// `text`, and the slices are contiguous, so concatenating them gives `text` back.
pub fn fragmentize(text: &str) -> Vec<&str> {
    let mut fragments = Vec::with_capacity(text.len());
    let mut chars = Vec::with_capacity(text.len());
    let mut char_to_byte = Vec::with_capacity(text.len() + 1);
    for (idx, c) in text.char_indices() {
        chars.push(c);
        char_to_byte.push(idx);
    }
    // Sentinel entry: char_to_byte[chars.len()] == text.len(), so the end of a
    // fragment that reaches the last char can be looked up like any other.
    char_to_byte.push(text.len());

    // Classify every char once up front; the matchers then only read bits.
    let mut is_letter_cache = BitVec::repeat(false, chars.len());
    let mut is_number_cache = BitVec::repeat(false, chars.len());
    let mut is_whitespace_cache = BitVec::repeat(false, chars.len());
    for (j, &c) in chars.iter().enumerate() {
        // Checked in this order, so at most one bit is set per char.
        if is_whitespace(c) {
            is_whitespace_cache.set(j, true);
        } else if is_letter(c) {
            is_letter_cache.set(j, true);
        } else if is_number(c) {
            is_number_cache.set(j, true);
        }
    }

    let mut i = 0;
    while i < chars.len() {
        // `or_else` keeps the evaluation lazy: later matchers only run when
        // every earlier one returned `None`.
        let len = match_contractions(&chars, i)
            .or_else(|| match_word(&chars, i, &is_letter_cache, &is_number_cache))
            .or_else(|| match_numbers(&chars, i, &is_number_cache))
            .or_else(|| {
                match_symbol_block(
                    &chars,
                    i,
                    &is_letter_cache,
                    &is_number_cache,
                    &is_whitespace_cache,
                )
            })
            .or_else(|| match_newlines(&chars, i, &is_whitespace_cache))
            .or_else(|| match_trailing_whitespace(&chars, i, &is_whitespace_cache))
            .or_else(|| match_whitespace(&chars, i, &is_whitespace_cache))
            // If nothing matches, consume one char and continue: the regex
            // equivalent of a char being completely matchless. LLM creators
            // should design their regex pattern so this ~never happens.
            .unwrap_or(1);
        fragments.push(&text[char_to_byte[i]..char_to_byte[i + len]]);
        i += len;
    }
    fragments
}
/// Returns `true` when `c` falls inside one of the inclusive `(start, end)`
/// ranges of `unicode_tables::LETTER`.
///
/// Uses a binary search, so it assumes the table is sorted and its ranges do
/// not overlap.
fn is_letter(c: char) -> bool {
    use std::cmp::Ordering::{Equal, Greater, Less};

    unicode_tables::LETTER
        .binary_search_by(|&(lo, hi)| match (c < lo, c > hi) {
            // The range lies above `c`: keep searching to the left.
            (true, _) => Greater,
            // The range lies below `c`: keep searching to the right.
            (false, true) => Less,
            // lo <= c <= hi: found the containing range.
            (false, false) => Equal,
        })
        .is_ok()
}
/// Returns `true` when `c` falls inside one of the inclusive `(start, end)`
/// ranges of `unicode_tables::NUMBER`.
///
/// Uses a binary search, so it assumes the table is sorted and its ranges do
/// not overlap.
fn is_number(c: char) -> bool {
    use std::cmp::Ordering::{Equal, Greater, Less};

    unicode_tables::NUMBER
        .binary_search_by(|&(lo, hi)| match (c < lo, c > hi) {
            // The range lies above `c`: keep searching to the left.
            (true, _) => Greater,
            // The range lies below `c`: keep searching to the right.
            (false, true) => Less,
            // lo <= c <= hi: found the containing range.
            (false, false) => Equal,
        })
        .is_ok()
}
/// Returns `true` when `c` falls inside one of the inclusive `(start, end)`
/// ranges of `unicode_tables::WHITE_SPACE`.
///
/// Uses a binary search, so it assumes the table is sorted and its ranges do
/// not overlap.
fn is_whitespace(c: char) -> bool {
    use std::cmp::Ordering::{Equal, Greater, Less};

    unicode_tables::WHITE_SPACE
        .binary_search_by(|&(lo, hi)| match (c < lo, c > hi) {
            // The range lies above `c`: keep searching to the left.
            (true, _) => Greater,
            // The range lies below `c`: keep searching to the right.
            (false, true) => Less,
            // lo <= c <= hi: found the containing range.
            (false, false) => Equal,
        })
        .is_ok()
}
/// Matches `(?i:'s|'t|'re|'ve|'m|'ll|'d)` at `chars[start]`.
///
/// Returns the number of chars matched (2 or 3), or `None` when no contraction
/// starts at `start` (including when `start` is out of range).
fn match_contractions(chars: &[char], start: usize) -> Option<usize> {
    // Case folding restricted to what can matter for this pattern. Besides the
    // ASCII upper/lower pairs, Unicode simple case folding (which the regex
    // engine applies for `(?i)`) also maps U+017F LATIN SMALL LETTER LONG S to
    // 's'. No other non-ASCII char folds to a letter used by the pattern.
    fn fold(c: char) -> char {
        if c == '\u{017F}' {
            's'
        } else {
            c.to_ascii_lowercase()
        }
    }

    // Every alternative starts with an apostrophe.
    let (&quote, tail) = chars.get(start..)?.split_first()?;
    if quote != '\'' {
        return None;
    }

    let second = tail.first().copied().map(fold)?;
    let third = tail.get(1).copied().map(fold);
    match second {
        // 's, 't, 'm, 'd are two-char matches.
        's' | 't' | 'm' | 'd' => Some(2),
        // 're and 've
        'r' | 'v' if third == Some('e') => Some(3),
        // 'll
        'l' if third == Some('l') => Some(3),
        _ => None,
    }
}
/// Matches `[^\r\n\p{L}\p{N}]?\p{L}+` at `chars[start]`: an optional single
/// char that is not CR, LF, a letter, or a number, followed by one or more
/// letters.
///
/// Returns the number of chars matched, or `None` when no letters are found
/// (including when `start` is out of range). The caches are indexed by char
/// position and are expected to cover all of `chars`.
fn match_word(
    chars: &[char],
    start: usize,
    is_letter_cache: &BitVec,
    is_number_cache: &BitVec,
) -> Option<usize> {
    let mut i = start;

    // Step 1: optional prefix. `get` is used so an out-of-range `start` falls
    // through to `None` instead of panicking on the index.
    if let Some(&c) = chars.get(i) {
        if c != '\r' && c != '\n' && !is_letter_cache[i] && !is_number_cache[i] {
            i += 1;
        }
    }

    // Step 2: one or more letters.
    let start_of_letters = i;
    while i < chars.len() && is_letter_cache[i] {
        i += 1;
    }

    if i > start_of_letters {
        Some(i - start)
    } else {
        // A prefix char without any letter after it is not a valid fragment.
        None
    }
}
/// Matches `\p{N}{1,3}` at `chars[start]`: between one and three numeric chars.
///
/// Returns how many chars were matched, or `None` when the char at `start` is
/// not numeric or `start` is out of range.
fn match_numbers(chars: &[char], start: usize, is_number_cache: &BitVec) -> Option<usize> {
    // Length of the leading numeric run, capped at three chars.
    let digits = (start..chars.len())
        .take_while(|&j| is_number_cache[j])
        .take(3)
        .count();
    match digits {
        0 => None,
        n => Some(n),
    }
}
/// Matches ` ?[^\s\p{L}\p{N}]+[\r\n]*` at `chars[start]`: an optional single
/// space, one or more chars that are neither whitespace, letter, nor number,
/// then any number of CR/LF chars.
///
/// Returns the number of chars matched, or `None` when no symbol char is found.
fn match_symbol_block(
    chars: &[char],
    start: usize,
    is_letter_cache: &BitVec,
    is_number_cache: &BitVec,
    is_whitespace_cache: &BitVec,
) -> Option<usize> {
    let len = chars.len();
    let is_symbol =
        |j: usize| !(is_whitespace_cache[j] || is_letter_cache[j] || is_number_cache[j]);

    // Optional leading space.
    let symbols_from = if start < len && chars[start] == ' ' {
        start + 1
    } else {
        start
    };

    // Mandatory run of symbol chars.
    let mut pos = symbols_from;
    while pos < len && is_symbol(pos) {
        pos += 1;
    }
    if pos == symbols_from {
        return None;
    }

    // Optional trailing line breaks.
    while pos < len && matches!(chars[pos], '\r' | '\n') {
        pos += 1;
    }
    Some(pos - start)
}
/// Matches `\s*[\r\n]+` at `chars[start]`: zero or more whitespace chars, then
/// one or more line breaks (`\r` or `\n`).
///
/// `\s*` is greedy and itself matches `\r` and `\n`, so the regex consumes the
/// whole whitespace run and then backtracks only as far as needed to leave a
/// line break for `[\r\n]+`. The match therefore ends right after the *last*
/// line break of the run, not after the first group of them: for `"\n \nx"`
/// the match is `"\n \n"`.
///
/// Returns the number of chars matched, or `None` when the whitespace run
/// starting at `start` contains no line break.
fn match_newlines(chars: &[char], start: usize, is_whitespace_cache: &BitVec) -> Option<usize> {
    let mut i = start;
    // Index just past the last line break seen so far in the run.
    let mut match_end = None;
    while i < chars.len() {
        let is_line_break = matches!(chars[i], '\r' | '\n');
        // Line breaks are checked explicitly so they extend the run regardless
        // of how the whitespace table classifies them.
        if !is_line_break && !is_whitespace_cache[i] {
            break;
        }
        i += 1;
        if is_line_break {
            match_end = Some(i);
        }
    }
    match_end.map(|end| end - start)
}
/// Handles the `\s+(?!\S)` alternative at `chars[start]`: a whitespace run that
/// is not directly followed by a non-whitespace char.
///
/// * Run reaches the end of input: the whole run matches.
/// * Run of two or more chars followed by a non-whitespace char: everything but
///   the last whitespace char matches, so the lookahead sees whitespace.
/// * Single whitespace char followed by a non-whitespace char: returns
///   `Some(1)`. The lookahead alternative itself cannot match here; the
///   one-char length is what the plain `\s+` fallback (`match_whitespace`)
///   would produce for the same position.
///
/// Returns `None` only when there is no whitespace at `start`.
fn match_trailing_whitespace(
    chars: &[char],
    start: usize,
    is_whitespace_cache: &BitVec,
) -> Option<usize> {
    let mut end = start;
    while end < chars.len() && is_whitespace_cache[end] {
        end += 1;
    }

    let run = end - start;
    if run == 0 {
        return None;
    }

    if end >= chars.len() || run == 1 {
        Some(run)
    } else {
        // Give the last whitespace char back so the lookahead is satisfied.
        Some(run - 1)
    }
}
/// Matches `\s+` at `chars[start]`: one or more whitespace chars.
///
/// Returns the length of the whitespace run, or `None` when the char at
/// `start` is not whitespace or `start` is out of range.
fn match_whitespace(chars: &[char], start: usize, is_whitespace_cache: &BitVec) -> Option<usize> {
    let run = (start..chars.len())
        .take_while(|&j| is_whitespace_cache[j])
        .count();
    match run {
        0 => None,
        n => Some(n),
    }
}
fn main() {
let test_cases = vec![
"newline test\nwith multiple lines\n\nend",
"Hello world\n 'Re is good \n\n 123.",
"Hello world",
"Hello world",
"Hello world\n\n",
"'s 't 're 've 'm 'll 'd",
"Hello-world",
"abc123 xyz 'Re end",
"Some punctuation!!!",
"Trailing space ",
"Multiple\nLines\nHere",
"simple 's test",
"trailing space ",
"no contractions here",
"'m 'll 'd 've 're 't 's mixed up",
"123 4567 89",
"emojis 🤔😀 check",
"punct !?!? end",
" multiple spaces before words ",
"你好世界 你好世界",
"y̆es",
"y̆y̆y̆y̆y̆y̆y̆es",
// ~20 more tests for wide coverage
"Zalgo: Z̸̹̓̀ą̵̮͑̎l̶̞̈ͅg̴̙̐̿ő̵̜̒!",
"Mixed languages: Bonjour 世界 Hello",
" Leading spaces",
"Tabs\tand other whitespace",
"Numbers: 000 9999",
"Contractions with caps: 'S 'Ll 'D",
"Emoji clusters: 👨‍👩‍👦 👨‍❤️‍👨",
"Latin Extended: ŚŌMĒ ÚNĬČŐDĖ",
"Arabic text: مرحبا بالعالم",
"Hebrew text: שלום עולם",
"Thai text: สวัสดีโลก",
"Devanagari: नमस्ते दुनिया",
"A string with only punctuation: !!!???$$$",
"No break space: hello\u{00A0}world",
"Soft hyphen test: hel\u{00AD}lo",
"Invisible joiner test: a\u{200D}b",
"Right-to-left text: שלום",
"Combining accents: e\u{0301} e\u{0308} e\u{0323}",
"Full width characters: Hello world",
"Long run of whitespace:\n\n\n \t\t\n",
"Numbers with letters: abc123xyz",
"Zero width space: hello\u{200B}world",
// big tests
// approx. 1k chars: mixture of normal text, zalgo, unicode, emojis, punctuation, multiple whitespaces, etc.
r#"This is a long test case with a variety of strange characters. Let's go crazy:
Zalgo: Z̵̮̜̼̼̦͌̎̉̚̕a̵̱̟̲̬̫̱̅͂l̵̠̹̏̅̓̓ğ̴̨̥̦̦̠́̄̃ȏ̴̢̼̱̏̀͋͘!
A line with emojis: 🤔😂🐱‍👤🔥✨💯
Multiple spaces: "Hello world" and tabs: "Hello\t\tworld"
Mixed scripts: English, 中文, عربى, हिन्दी, Русский, Ελληνικά.
Accents: café, coöperate, façades. Combining marks: e\u{0301} a\u{0308} i\u{0323}.
Weird punctuation: !!!???$$$@@@###~~~===+++!!!
No-break space: hello\u{00A0}world. Zero width space: hello\u{200B}world.
RTL text: שלום עולם. BiDi test: ABC‏DEF‎GHI.
A bunch of newlines:
End with a lot of random ASCII noise: !!!!!????????/////\\\\\||||||||@@@@@@@
And some final whitespace:
"#,
// another ~1k chars, focusing on large unicode blocks, multiple lines, nested scripts
r#"Lorem ipsum dolor sit amet, consectetur adipiscing elit.
Here is a large block of text with unusual Unicode:
Devanagari: नमस्ते दुनिया
Thai: สวัสดีโลก
Arabic: مرحبا بالعالم
Hebrew: שלום עולם
Emoji family: 👨‍👩‍👦 👩‍❤️‍💋‍👩 👨‍❤️‍💋‍👨
Full width: Hello world
Zalgo extended: H̷̫̰̘̅ę̸͕̺̼̹̀̄́̔̋ľ̵̼̠̝́͊l̸̛̮̖̼̃̓ȯ̵̙͔
Invisible joiner: a\u{200D}b c\u{200D}d
Soft hyphen: hel\u{00AD}lo
Accented run: áéíóú àèìòù âêîôû
Numbers: 123 456 7890 000 9999 12 ᧐᧑᧒
Weird spacing: "word word", "word\n\n\nword", "word\r\nword"
Trailing spaces:
Another line of punctuation: (((((( )))))) {{{{{ }}}}} [[[[[ ]]]]] <<<<< >>>>>
End of big block.
"#,
// third ~1k chars: lots of random punctuation, symbols, mixed casing, random languages
r#"This test has random CJK: 你好世界 こんにちは世界 안녕하세요 세계
Mixing math symbols: ∑ ∏ ∫ ± ∞ ≤ ≥
Line with random ASCII art:
>>>>>>>>>>>----->>>>>>>[[[[||]]]]<<<<<-----<<<<<<<<<<
Mixed quotes: “Hello” ‘World’ «Привет» „Hallo”
Random diacritics stacked: o\u{0302}\u{0311}\u{0300}\u{0323}\u{0339}
Spaced out text: W O R D
Tabs and newlines in weird places:
\t\t\n\n \t \n
Invisible spaces: hello\u{200B}again\u{200B}friend
Punctuation explosion: !!!???###@@@$$$%%%^^^&&&***((()))---___+++===???
Zalgo with newline:
R̴̨̛̫̺̰̦̹͓̗̠̠̹̔̿̇̔̒̈́̍̕
End with a sequence of random Unicode blocks:
Latin Extended: Ŵŷ
Greek: ΞΩψ
Arabic again: س
Hebrew again: ז
Cyrllic: Ж
Emoji: 🤯😱
Done.
"#,
// fourth ~1k chars: a large narrative-like text but with random unicode and whitespace
r#"Once upon a time, in a land far away, there were characters of all kinds:
The villagers spoke: "Hello world" (with triple spaces).
They wrote runes: ᚠᚢᚦᚨᚱᚲ
They drew glyphs: 𓀀 𓁹 𓂀
They laughed in all scripts: 哈哈, ههه, हहा, хаха, haha!
They danced: 💃🕺💃🕺
Between them stood lines of whitespace:
Numbers scattered: 42 007 999999
Soft hyphen words: co\u{00AD}op re\u{00AD}act
Accents: naïve fiancé jalapeño
Fullwidth forms again: This is wide
Mixing direction: ABC‏DEF‎GHI one more time.
At the end, punctuation stands guard: ~~~###@@@!!!
The end.
"#,
// fifth ~1k chars: heavy use of multiple combining marks, random direction changes, etc.
r#"A final insanity test:
Combinations of letters + accents:
a\u{0300}\u{0301}\u{0302}\u{0323}, e\u{0308}\u{0301}\u{031c}, i\u{0339}\u{0342}\u{035c}, o\u{0361}\u{0311}\u{0338}, u\u{035f}\u{0334}\u{0345}
Line with ZWJ sequences: 👩‍❤️‍💋‍👩👨‍👨‍👧‍👦👩‍👩‍👦
Huge whitespace block with various breaks:
\n\n\n\r\n\r\n
"End-of-line" words:
hello\
world\
again
Random Unicode blocks:
Ethiopic: ሰላም
Gujarati: નમસ્તે
Gurmukhi: ਸਤ ਸ੍ਰੀ ਅਕਾਲ
Tamil: வணக்கம்
Ornamental dingbats: ❀❁✿❁❀
Punctuation run: !!!???///\\\\///???
No-break space again: test\u{00A0}test
Zalgo one last time: W̸̱̞̓̑̕ḛ̵̤̞̦̟̈́̇̕ɨ̴̳̰͙̱̈ř̵̜̦̰̊ď̴̠̻
The end.
"#,

"aaa, aab, aac, aad, aae, aaf, aag, aah, aai, aaj, aak, aal, aam, aan, aao, aap, aaq, aar, aas, aat, aau, aav, aaw, aax, aay, aaz, aba, abb, abc, abd, abe, abf, abg, abh, abi, abj, abk, abl, abm, abn, abo, abp, abq, abr, abs, abt, abu, abv, abw, abx, aby, abz, aca, acb, acc, acd, ace, acf, acg, ach, aci, acj, ack, acl, acm, acn, aco, acp, acq, acr, acs, act, acu, acv, acw, acx, acy, acz, ada, adb, adc, add, ade, adf, adg, adh, adi, adj, adk, adl, adm, adn, ado, adp, adq, adr, ads, adt, adu, adv, adw, adx, ady, adz, aea, aeb, aec, aed, aee, aef, aeg, aeh, aei, aej, aek, ael, aem, aen, aeo, aep, aeq, aer, aes, aet, aeu, aev, aew, aex, aey, aez, afa, afb, afc, afd, afe, aff, afg, afh, afi, afj, afk, afl, afm, afn, afo, afp, afq, afr, afs, aft, afu, afv, afw, afx, afy, afz, aga, agb, agc, agd, age, agf, agg, agh, agi, agj, agk, agl, agm, agn, ago, agp, agq, agr, ags, agt, agu, agv, agw, agx, agy, agz, aha, ahb, ahc, ahd, ahe, ahf, ahg, ahh, ahi, ahj, ahk, ahl, ahm, ahn, aho, ahp, ahq, ahr, ahs, aht, ahu, ahv, ahw, ahx, ahy, ahz, aia, aib, aic, aid, aie, aif, aig, aih, aii, aij, aik, ail, aim, ain, aio, aip, aiq, air, ais, ait, aiu, aiv, aiw, aix, aiy, aiz, aja, ajb, ajc, ajd, aje, ajf, ajg, ajh, aji, ajj, ajk, ajl, ajm, ajn, ajo, ajp, ajq, ajr, ajs, ajt, aju, ajv, ajw, ajx, ajy, ajz, aka, akb, akc, akd, ake, akf, akg, akh, aki, akj, akk, akl, akm, akn, ako, akp, akq, akr, aks, akt, aku, akv, akw, akx, aky, akz, ala, alb, alc, ald, ale, alf, alg, alh, ali, alj, alk, all, alm, aln, alo, alp, alq, alr, als, alt, alu, alv, alw, alx, aly, alz, ama, amb, amc, amd, ame, amf, amg, amh, ami, amj, amk, aml, amm, amn, amo, amp, amq, amr, ams, amt, amu, amv, amw, amx, amy, amz, ana, anb, anc, and, ane, anf, ang, anh, ani, anj, ank, anl, anm, ann, ano, anp, anq, anr, ans, ant, anu, anv, anw, anx, any, anz, aoa, aob, aoc, aod, aoe, aof, aog, aoh, aoi, aoj, aok, aol, aom, aon, aoo, aop, aoq, aor, aos, aot, aou, aov, aow, aox, aoy, aoz, apa, apb, apc, apd, ape, apf, apg, aph, api, 
apj, apk, apl, apm, apn, apo, app, apq, apr, aps, apt, apu, apv, apw, apx, apy, apz, aqa, aqb, aqc, aqd, aqe, aqf, aqg, aqh, aqi, aqj, aqk, aql, aqm, aqn, aqo, aqp, aqq, aqr, aqs, aqt, aqu, aqv, aqw, aqx, aqy, aqz, ara, arb, arc, ard, are, arf, arg, arh, ari, arj, ark, arl, arm, arn, aro, arp, arq, arr, ars, art, aru, arv, arw, arx, ary, arz, asa, asb, asc, asd, ase, asf, asg, ash, asi, asj, ask, asl, asm, asn, aso, asp, asq, asr, ass, ast, asu, asv, asw, asx, asy, asz, ata, atb, atc, atd, ate, atf, atg, ath, ati, atj, atk, atl, atm, atn, ato, atp, atq, atr, ats, att, atu, atv, atw, atx, aty, atz, aua, aub, auc, aud, aue, auf, aug, auh, aui, auj, auk, aul, aum, aun, auo, aup, aqu, aur, aus, aut, auu, auv, auw, aux, auy, auz, ava, avb, avc, avd, ave, avf, avg, avh, avi, avj, avk, avl, avm, avn, avo, avp, avq, avr, avs, avt, avu, avv, avw, avx, avy, avz, awa, awb, awc, awd, awe, awf, awg, awh, awi, awj, awk, awl, awm, awn, awo, awp, awq, awr, aws, awt, awu, awv, awx, awy, awz, axa, axb, axc, axd, axe, axf, axg, axh, axi, axj, axk, axl, axm, axn, axo, axp, axq, axr, axs, axt, axu, axv, axw, axx, axy, axz, aya, ayb, ayc, ayd, aye, ayf, ayg, ayh, ayi, ayj, ayk, ayl, aym, ayn, ayo, ayp, ayq, ayr, ays, ayt, ayu, ayv, ayw, ayx, ayy, ayz, aza, azb, azc, azd, aze, azf, azg, azh, azi, azj, azk, azl, azm, azn, azo, azp, azq, azr, azs, azt, azu, azv, azw, azx, azy, azz, uce, ucf, ucg, uch, uci, ucj, uck, ucl, ucm, ucn, uco, ucp, ucq, ucr, ucs, uct, ucu, ucv, ucw, ucx, ucy, ucz, uda, udb, udc, udd, ude, udf, udg, udh, udi, udj, udk, udl, udm, udn, udo, udp, udq, udr, uds, udt, udu, udv, udw, udx, udy, udz, uea, ueb, uec, ued, uee, uef, ueg, ueh, uei, uej, uek, uel, uem, uen, ueo, uep, ueq, uer, ues, uet, ueu, uev, uew, uex, uey, uez, ufa, ufb, ufc, ufd, ufe, uff, ufg, ufh, ufi, ufj, ufk, ufl, ufm, ufn, ufo, ufp, ufq, ufr, ufs, uft, ufu, ufv, ufw, ufx, ufy, ufz, uga, ugb, ugc, ugd, uge, ugf, ugg, ugh, ugi, ugj, ugk, ugl, ugm, ugn, ugo, ugp, ugq, ugr, ugs, ugt, ugu, ugv, ugw, ugx, 
ugy, ugz, uha, uhb, uhc, uhd, uhe, uhf, uhg, uhh, uhi, uhj, uhk, uhl, uhm, uhn, uho, uhp, uhq, uhr, uhs, uht, uhu, uhv, uhw, uhx, uhy, uhz, uia, uib, uic, uid, uie, uif, uig, uih, uii, uij, uik, uil, uim, uin, uio, uip, uiq, uir, uis, uit, uiu, uiv, uiw, uix, uiy, uiz, uja, ujb, ujc, ujd, uje, ujf, ujg, ujh, uji, ujj, ujk, ujl, ujm, ujn, ujo, ujp, ujq, ujr, ujs, ujt, uju, ujv, ujw, ujx, ujy, ujz, uka, ukb, ukc, ukd, uke, ukf, ukg, ukh, uki, ukj, ukk, ukl, ukm, ukn, uko, ukp, ukq, ukr, uks, ukt, uku, ukv, ukw, ukx, uky, ukz, ula, ulb, ulc, uld, ule, ulf, ulg, ulh, uli, ulj, ulk, ull, ulm, uln, ulo, ulp, ulq, ulr, uls, ult, ulu, ulv, ulw, ulx, uly, ulz, uma, umb, umc, umd, ume, umf, umg, umh, umi, umj, umk, uml, umm, umn, umo, ump, umq, umr, ums, umt, umu, umv, umw, umx, umy, umz, una, unb, unc, und, une, unf, ung, unh, uni, unj, unk, unl, unm, unn, uno, unp, unq, unr, uns, unt, unu, unv, unw, unx, uny, unz, uoa, uob, uoc, uod, uoe, uof, uog, uoh, uoi, uoj, uok, uol, uom, uon, uoo, uop, uoq, uor, uos, uot, uou, uov, uow, uox, uoy, uoz, upa, upb, upc, upd, upe, upf, upg, uph, upi, upj, upk, upl, upm, upn, upo, upp, upq, upr, ups, upt, upu, upv, upw, upx, upy, upz, uqa, uqb, uqc, uqd, uqe, uqf, uqg, uqh, uqi, uqj, uqk, uql, uqm, uqn, uqo, uqp, uqq, uqr, uqs, uqt, uqu, uqv, uqw, uqx, uqy, uqz, ura, urb, urc, urd, ure, urf, urg, urh, uri, urj, urk, url, urm, urn, uro, urp, urq, urr, urs, urt, uru, urv, urw, urx, ury, urz, usa, usb, usc, usd, use, usf, usg, ush, usi, usj, usk, usl, usm, usn, uso, usp, usq, usr, uss, ust, usu, usv, usw, usx, usy, usz, uta, utb, utc, utd, ute, utf, utg, uth, uti, utj, utk, utl, utm, utn, uto, utp, utq, utr, uts, utt, utu, utv, utw, utx, uty, utz, uua, uub, uuc, uud, uue, uuf, uug, uuh, uui, uuj, uuk, uul, uum, uun, uuo, uup, uuq, uur, uus, uut, uuu, uuv, uuw, uux, uuy, uuz, uva, uvb, uvc, uvd, uve, uvf, uvg, uvh, uvi, uvj, uvk, uvl, uvm, uvn, uvo, uvp, uvq, uvr, uvs, uvt, uvu, uvv, uvw, uvx, uvy, uvz, uwa, uwb, uwc, uwd, uwe, uwf, uwg, uwh, 
uwi, uwj, uwk, uwl, uwm, uwn, uwo, uwp, uwq, uwr, uws, uwt, uwu, uwv, uww, uwx, uwy, uwz, uxa, uxb, uxc, uxd, uxe, uxf, uxg, uxh, uxi, uxj, uxk, uxl, uxm, uxn, uxo, uxp, uxq, uxr, uxs, uxt, uxu, uxv, uxw, uxx, uxy, uxz, uya, uyb, uyc, uyd, uye, uyf, uyg, uyh, uyi, uyj, uyk, uyl, uym, uyn, uyo, uyp, uyq, uyr, uys, uyt, uyu, uyv, uyw, uyx, uyy, uyz, uza, uzb, uzc, uzd, uze, uzf, uzg, uzh, uzi, uzj, uzk, uzl, uzm, uzn, uzo, uzp, uzq, uzr, uzs, uzt, uzu, uzv, uzw, uzx, uzy, uzz, vaa, vab, vac, vad, vae, vaf, vag, vah, vai, vaj, vak, val, vam, van, vao, vap, vaq, var, vas, vat, vau, vav, vaw, vax, vay, vaz, vba, vbb, vbc, vbd, vbe, vbf, vbg, vbh, vbi, vbj, vbk, vbl, vbm, vbn, vbo, vbp, vbq, vbr, vbs, vbt, vbu, vbv, vbw, vbx, vby, vbz, vca, vcb, vcc, vcd, vce, vcf, vcg, vch, vci, vcj, vck, vcl, vcm, vcn, vco, vcp, vcq, vcr, vcs, vct, vcu, vcv, vcw, vcx, vcy, vcz, vda, vdb, vdc, vdd, vde, vdf, vdg, vdh, vdi, vdj, vdk, vdl, vdm, vdn, vdo, vdp, vdq, vdr, vds, vdt, vdu, vdv, vdw, vdx, vdy, vdz, vea, veb, vec, ved, vee, vef, veg, veh, vei, vej, vek, vel, vem, ven, veo, vep, veq, ver, ves, vet, veu, vev, vew, vex, vey, vez, vfa, vfb, vfc, vfd, vfe, vff, vfg, vfh, vfi, vfj, vfk, vfl, vfm, vfn, vfo, vfp, vfq, vfr, vfs, vft, vfu, vfv, vfw, vfx, vfy, vfz, vga, vgb, vgc, vgd, vge, vgf, vgg, vgh, vgi, vgj, vgk, vgl, vgm, vgn, vgo, vgp, vgq, vgr, vgs, vgt, vgu, vgv, vgw, vgx, vgy, vgz, vha, vhb, vhc, vhd, vhe, vhf, vhg, vhh, vhi, vhj, vhk, vhl, vhm, vhn, vho, vhp, vhq, vhr, vhs, vht, vhu, vhv, vhw, vhx, vhy, vhz, via, vib, vic, vid, vie, vif, vig, vih, vii, vij, vik, vil, vim, vin, vio, vip, viq, vir, vis, vit, viu, viv, viw, vix, viy, viz, vja, vjb, vjc, vjd, vje, vjf, vjg, vjh, vji, vjj, vjk, vjl, vjm, vjn, vjo, vjp, vjq, vjr, vjs, vjt, vju, vjv, vjw, vjx, vjy, vjz, vka, vkb, vkc, vkd, vke, vkf, vkg, vkh, vki, vkj, vkk, vkl, vkm, vkn, vko, vkp, vkq, vkr, vks, vkt, vku, vkv, vkw, vkx, vky, vkz, vla, vlb, vlc, vld, vle, vlf, vlg, vlh, vli, vlj, vlk, vll, vlm, vln, vlo, vlp, vlq, vlr, 
vls, vlt, vlu, vlv, vlw, vlx, vly, vlz, vma, vmb, vmc, vmd, vme, vmf, vmg, vmh, vmi, vmj, vmk, vml, vmm, vmn, vmo, vmp, vmq, vmr, vms, vmt, vmu, vmv, vmw, vmx, vmy, vmz, vna, vnb, vnc, vnd, vne, vnf, vng, vnh, vni, vnj, vnk, vnl, vnm, vnn, vno, vnp, vnq, vnr, vns, vnt, vnu, vnv, vnw, vnx, vny, vnz, voa, vob, voc, vod, voe, vof, vog, voh, voi, voj, vok, vol, vom, von, voo, vop, voq, vor, vos, vot, vou, vov, vow, vox, voy, voz, vpa, vpb, vpc, vpd, vpe, vpf, vpg, vph, vpi, vpj, vpk, vpl, vpm, vpn, vpo, vpp, vpq, vpr, vps, vpt, vpu, vpv, vpw, vpx, vpy, vpz, vqa, vqb, vqc, vqd, vqe, vqf, vqg, vqh, vqi, vqj, vqk, vql, vqm, vqn, vqo, vqp, vqq, vqr, vqs, vqt, vqu, vqv, vqw, vqx, vqy, vqz, vra, vrb, vrc, vrd, vre, vrf, vrg, vrh, vri, vrj, vrk, vrl, vrm, vrn, vro, vrp, vrq, vrr, vrs, vrt, vru, vrv, vrw, vrx, vry, vrz, vsa, vsb, vsc, vsd, vse, vsf, vsg, vsh, vsi, vsj, vsk, vsl, vsm, vsn, vso, vsp, vsq, vsr, vss, vst, vsu, vsv, vsw, vsx, vsy, vsz, vta, vtb, vtc, vtd, vte, vtf, vtg, vth, vti, vtj, vtk, vtl, vtm, vtn, vto, vtp, vtq, vtr, vts, vtt, vtu, vtv, vtw, vtx, vty, vtz, vua, vub, vuc, vud, vue, vuf, vug, vuh, vui, vuj, vuk, vul, vum, vun, vuo, vup, vuq, vur, vus, vut, vuu, vuv, vuw, vux, vuy, vuz, vva, vvb, vvc, vvd, vve, vvf, vvg, vvh, vvi, vvj, vvk, vvl, vvm, vvn, vvo, vvp, vvq, vvr, vvs, vvt, vvu, vvv, vvw, vvx, vvy, vvz, vwa, vwb, vwc, vwd, vwe, vwf, vwg, vwh, vwi, vwj, vwk, vwl, vwm, vwn, vwo, vwp, vwq, vwr, vws, vwt, vwu, vwv, vww, vwx, vwy, vwz, vxa, vxb, vxc, vxd, vxe, vxf, vxg, vxh, vxi, vxj, vxk, vxl, vxm, vxn, vxo, vxp, vxq, vxr, vxs, vxt, vxu, vxv, vxw, vxx, vxy, vxz, vya, vyb, vyc, vyd, vye, vyf, vyg, vyh, vyi, vyj, vyk, vyl, vym, vyn, vyo, vyp, vyq, vyr, vys, vyt, vyu, vyv, vyw, vyx, vyy, vyz, vza, vzb, vzc, vzd, vze, vzf, vzg, vzh, vzi, vzj, vzk, vzl, vzm, vzn, vzo, vzp, vzq, vzr, vzs, vzt, vzu, vzv, vzw, vzx, vzy, vzz, waa, wab, wac, wad, wae, waf, wag, wah, wai, waj, wak, wal, wam, wan, wao, wap, waq, war, was, wat, wau, wav, waw, wax, way, waz, wba, wbb, 
wbc, wbd, wbe, wbf, wbg, wbh, wbi, wbj, wbk, wbl, wbm, wbn, wbo, wbp, wbq, wbr, wbs, wbt, wbu, wbv, wbw, wbx, wby, wbz, wca, wcb, wcc, wcd, wce, wcf, wcg, wch, wci, wcj, wck, wcl, wcm, wcn, wco, wcp, wcq, wcr, wcs, wct, wcu, wcv, wcw, wcx, wcy, wcz, wda, wdb, wdc, wdd, wde, wdf, wdg, wdh, wdi, wdj, wdk, wdl, wdm, wdn, wdo, wdp, wdq, wdr, wds, wdt, wdu, wdv, wdw, wdx, wdy, wdz, wea, web, wec, wed, wee, wef, weg, weh, wei, wej, wek, wel, wem, wen, weo, wep, weq, wer, wes, wet, weu, wev, wew, wex, wey, wez, wfa, wfb, wfc, wfd, wfe, wff, wfg, wfh, wfi, wfj, wfk, wfl, wfm, wfn, wfo, wfp, wfq, wfr, wfs, wft, wfu, wfv, wfw, wfx, wfy, wfz, wga, wgb, wgc, wgd, wge, wgf, wgg, wgh, wgi, wgj, wgk, wgl, wgm, wgn, wgo, wgp, wgq, wgr, wgs, wgt, wgu, wgv, wgw, wgx, wgy, wgz, wha, whb, whc, whd, whe, whf, whg, whh, whi, whj, whk, whl, whm, whn, who, whp, whq, whr, whs, wht, whu, whv, whw, whx, why, whz, wia, wib, wic, wid, wie, wif, wig, wih, wii, wij, wik, wil, wim, win, wio, wip, wiq, wir, wis, wit, wiu, wiv, wiw, wix, wiy, wiz, wja, wjb, wjc, wjd, wje, wjf, wjg, wjh, wji, wjj, wjk, wjl, wjm, wjn, wjo, wjp, wjq, wjr, wjs, wjt, wju, wjv, wjw, wjx, wjy, wjz, wka, wkb, wkc, wkd, wke, wkf, wkg, wkh, wki, wkj, wkk, wkl, wkm, wkn, wko, wkp, wkq, wkr, wks, wkt, wku, wkv, wkw, wkx, wky, wkz, wla, wlb, wlc, wld, wle, wlf, wlg, wlh, wli, wlj, wlk, wll, wlm, wln, wlo, wlp, wlq, wlr, wls, wlt, wlu, wlv, wlw, wlx, wly, wlz, wma, wmb, wmc, wmd, wme, wmf, wmg, wmh, wmi, wmj, wmk, wml, wmm, wmn, wmo, wmp, wmq, wmr, wms, wmt, wmu, wmv, wmw, wmx, wmy, wmz, wna, wnb, wnc, wnd, wne, wnf, wng, wnh, wni, wnj, wnk, wnl, wnm, wnn, wno, wnp, wnq, wnr, wns, wnt, wnu, wnv, wnw, wnx, wny, wnz, woa, wob, woc, wod, woe, wof, wog, woh, woi, woj, wok, wol, wom, won, woo, wop, woq, wor, wos, wot, wou, wov, wow, wox, woy, woz, wpa, wpb, wpc, wpd, wpe, wpf, wpg, wph, wpi, wpj, wpk, wpl, wpm, wpn, wpo, wpp, wpq, wpr, wps, wpt, wpu, wpv, wpw, wpx, wpy, wpz, wqa, wqb, wqc, wqd, wqe, wqf, wqg, wqh, wqi, wqj, wqk, wql, 
wqm, wqn, wqo, wqp, wqq, wqr, wqs, wqt, wqu, wqv, wqw, wqx, wqy, wqz, wra, wrb, wrc, wrd, wre, wrf, wrg, wrh, wri, wrj, wrk, wrl, wrm, wrn, wro, wrp, wrq, wrr, wrs, wrt, wru, wrv, wrw, wrx, wry, wrz, wsa, wsb, wsc, wsd, wse, wsf, wsg, wsh, wsi, wsj, wsk, wsl, wsm, wsn, wso, wsp, wsq, wsr, wss, wst, wsu, wsv, wsw, wsx, wsy, wsz, wta, wtb, wtc, wtd, wte, wtf, wtg, wth, wti, wtj, wtk, wtl, wtm, wtn, wto, wtp, wtq, wtr, wts, wtt, wtu, wtv, wtw, wtx, wty, wtz, wua, wub, wuc, wud, wue, wuf, wug, wuh, wui, wuj, wuk, wul, wum, wun, wuo, wup, wuq, wur, wus, wut, wuu, wuv, wuw, wux, wuy, wuz, wva, wvb, wvc, wvd, wve, wvf, wvg, wvh, wvi, wvj, wvk, wvl, wvm, wvn, wvo, wvp, wvq, wvr, wvs, wvt, wvu, wvv, wvw, wvx, wvy, wvz, wwa, wwb, wwc, wwd, wwe, wwf, wwg, wwh, wwi, wwj, wwk, wwl, wwm, wwn, wwo, wwp, wwq, wwr, wws, wwt, wwu, wwv, www, wwx, wwy, wwz, wxa, wxb, wxc, wxd, wxe, wxf, wxg, wxh, wxi, wxj, wxk, wxl, wxm, wxn, wxo, wxp, wxq, wxr, wxs, wxt, wxu, wxv, wxw, wxx, wxy, wxz, wya, wyb, wyc, wyd, wye, wyf, wyg, wyh, wyi, wyj, wyk, wyl, wym, wyn, wyo, wyp, wyq, wyr, wys, wyt, wyu, wyv, wyw, wyx, wyy, wyz, wza, wzb, wzc, wzd, wze, wzf, wzg, wzh, wzi, wzj, wzk, wzl, wzm, wzn, wzo, wzp, wzq, wzr, wzs, wzt, wzu, wzv, wzw, wzx, wzy, wzz, xaa, xab, xac, xad, xae, xaf, xag, xah, xai, xaj, xak, xal, xam, xan, xao, xap, xaq, xar, xas, xat, xau, xav, xaw, xax, xay, xaz, xba, xbb, xbc, xbd, xbe, xbf, xbg, xbh, xbi, xbj, xbk, xbl, xbm, xbn, xbo, xbp, xbq, xbr, xbs, xbt, xbu, xbv, xbw, xbx, xby, xbz, xca, xcb, xcc, xcd, xce, xcf, xcg, xch, xci, xcj, xck, xcl, xcm, xcn, xco, xcp, xcq, xcr, xcs, xct, xcu, xcv, xcw, xcx, xcy, xcz, xda, xdb, xdc, xdd, xde, xdf, xdg, xdh, xdi, xdj, xdk, xdl, xdm, xdn, xdo, xdp, xdq, xdr, xds, xdt, xdu, xdv, xdw, xdx, xdy, xdz, xea, xeb, xec, xed, xee, xef, xeg, xeh, xei, xej, xek, xel, xem, xen, xeo, xep, xeq, xer, xes, xet, xeu, xev, xew, xex, xey, xez, xfa, xfb, xfc, xfd, xfe, xff, xfg, xfh, xfi, xfj, xfk, xfl, xfm, xfn, xfo, xfp, xfq, xfr, xfs, xft, xfu, xfv, 
xfw, xfx, xfy, xfz, xga, xgb, xgc, xgd, xge, xgf, xgg, xgh, xgi, xgj, xgk, xgl, xgm, xgn, xgo, xgp, xgq, xgr, xgs, xgt, xgu, xgv, xgw, xgx, xgy, xgz, xha, xhb, xhc, xhd, xhe, xhf, xhg, xhh, xhi, xhj, xhk, xhl, xhm, xhn, xho, xhp, xhq, xhr, xhs, xht, xhu, xhv, xhw, xhx, xhy, xhz, xia, xib, xic, xid, xie, xif, xig, xih, xii, xij, xik, xil, xim, xin, xio, xip, xiq, xir, xis, xit, xiu, xiv, xiw, xix, xiy, xiz, xja, xjb, xjc, xjd, xje, xjf, xjg, xjh, xji, xjj, xjk, xjl, xjm, xjn, xjo, xjp, xjq, xjr, xjs, xjt, xju, xjv, xjw, xjx, xjy, xjz, xka, xkb, xkc, xkd, xke, xkf, xkg, xkh, xki, xkj, xkk, xkl, xkm, xkn, xko, xkp, xkq, xkr, xks, xkt, xku, xkv, xkw, xkx, xky, xkz, xla, xlb, xlc, xld, xle, xlf, xlg, xlh, xli, xlj, xlk, xll, xlm, xln, xlo, xlp, xlq, xlr, xls, xlt, xlu, xlv, xlw, xlx, xly, xlz, xma, xmb, xmc, xmd, xme, xmf, xmg, xmh, xmi, xmj, xmk, xml, xmm, xmn, xmo, xmp, xmq, xmr, xms, xmt, xmu, xmv, xmw, xmx, xmy, xmz, xna, xnb, xnc, xnd, xne, xnf, xng, xnh, xni, xnj, xnk, xnl, xnm, xnn, xno, xnp, xnq, xnr, xns, xnt, xnu, xnv, xnw, xnx, xny, xnz, xoa, xob, xoc, xod, xoe, xof, xog, xoh, xoi, xoj, xok, xol, xom, xon, xoo, xop, xoq, xor, xos, xot, xou, xov, xow, xox, xoy, xoz, xpa, xpb, xpc, xpd, xpe, xpf, xpg, xph, xpi, xpj, xpk, xpl, xpm, xpn, xpo, xpp, xpq, xpr, xps, xpt, xpu, xpv, xpw, xpx, xpy, xpz, xqa, xqb, xqc, xqd, xqe, xqf, xqg, xqh, xqi, xqj, xqk, xql, xqm, xqn, xqo, xqp, xqq, xqr, xqs, xqt, xqu, xqv, xqw, xqx, xqy, xqz, xra, xrb, xrc, xrd, xre, xrf, xrg, xrh, xri, xrj, xrk, xrl, xrm, xrn, xro, xrp, xrq, xrr, xrs, xrt, xru, xrv, xrw, xrx, xry, xrz, xsa, xsb, xsc, xsd, xse, xsf, xsg, xsh, xsi, xsj, xsk, xsl, xsm, xsn, xso, xsp, xsq, xsr, xss, xst, xsu, xsv, xsw, xsx, xsy, xsz, xta, xtb, xtc, xtd, xte, xtf, xtg, xth, xti, xtj, xtk, xtl, xtm, xtn, xto, xtp, xtq, xtr, xts, xtt, xtu, xtv, xtw, xtx, xty, xtz, xua, xub, xuc, xud, xue, xuf, xug, xuh, xui, xuj, xuk, xul, xum, xun, xuo, xup, xuq, xur, xus, xut, xuu, xuv, xuw, xux, xuy, xuz, xva, xvb, xvc, xvd, xve, xvf, 
xvg, xvh, xvi, xvj, xvk, xvl, xvm, xvn, xvo, xvp, xvq, xvr, xvs, xvt, xvu, xvv, xvw, xvx, xvy, xvz, xwa, xwb, xwc, xwd, xwe, xwf, xwg, xwh, xwi, xwj, xwk, xwl, xwm, xwn, xwo, xwp, xwq, xwr, xws, xwt, xwu, xwv, xww, xwx, xwy, xwz, xxa, xxb, xxc, xxd, xxe, xxf, xxg, xxh, xxi, xxj, xxk, xxl, xxm, xxn, xxo, xxp, xxq, xxr, xxs, xxt, xxu, xxv, xxw, xxx, xxy, xxz, xya, xyb, xyc, xyd, xye, xyf, xyg, xyh, xyi, xyj, xyk, xyl, xym, xyn, xyo, xyp, xyq, xyr, xys, xyt, xyu, xyv, xyw, xyx, xyy, xyz, xza, xzb, xzc, xzd, xze, xzf, xzg, xzh, xzi, xzj, xzk, xzl, xzm, xzn, xzo, xzp, xzq, xzr, xzs, xzt, xzu, xzv, xzw, xzx, xzy, xzz, yaa, yab, yac, yad, yae, yaf, yag, yah, yai, yaj, yak, yal, yam, yan, yao, yap, yaq, yar, yas, yat, yau, yav, yaw, yax, yay, yaz, yba, ybb, ybc, ybd, ybe, ybf, ybg, ybh, ybi, ybj, ybk, ybl, ybm, ybn, ybo, ybp, ybq, ybr, ybs, ybt, ybu, ybv, ybw, ybx, yby, ybz, yca, ycb, ycc, ycd, yce, ycf, ycg, ych, yci, ycj, yck, ycl, ycm, ycn, yco, ycp, ycq, ycr, ycs, yct, ycu, ycv, ycw, ycx, ycy, ycz, yda, ydb, ydc, ydd, yde, ydf, ydg, ydh, ydi, ydj, ydk, ydl, ydm, ydn, ydo, ydp, ydq, ydr, yds, ydt, ydu, ydv, ydw, ydx, ydy, ydz, yea, yeb, yec, yed, yee, yef, yeg, yeh, yei, yej, yek, yel, yem, yen, yeo, yep, yeq, yer, yes, yet, yeu, yev, yew, yex, yey, yez, yfa, yfb, yfc, yfd, yfe, yff, yfg, yfh, yfi, yfj, yfk, yfl, yfm, yfn, yfo, yfp, yfq, yfr, yfs, yft, yfu, yfv, yfw, yfx, yfy, yfz, yga, ygb, ygc, ygd, yge, ygf, ygg, ygh, ygi, ygj, ygk, ygl, ygm, ygn, ygo, ygp, ygq, ygr, ygs, ygt, ygu, ygv, ygw, ygx, ygy, ygz, yha, yhb, yhc, yhd, yhe, yhf, yhg, yhh, yhi, yhj, yhk, yhl, yhm, yhn, yho, yhp, yhq, yhr, yhs, yht, yhu, yhv, yhw, yhx, yhy, yhz, yia, yib, yic, yid, yie, yif, yig, yih, yii, yij, yik, yil, yim, yin, yio, yip, yiq, yir, yis, yit, yiu, yiv, yiw, yix, yiy, yiz, yja, yjb, yjc, yjd, yje, yjf, yjg, yjh, yji, yjj, yjk, yjl, yjm, yjn, yjo, yjp, yjq, yjr, yjs, yjt, yju, yjv, yjw, yjx, yjy, yjz, yka, ykb, ykc, ykd, yke, ykf, ykg, ykh, yki, ykj, ykk, ykl, ykm, ykn, yko, ykp, 
ykq, ykr, yks, ykt, yku, ykv, ykw, ykx, yky, ykz, yla, ylb, ylc, yld, yle, ylf, ylg, ylh, yli, ylj, ylk, yll, ylm, yln, ylo, ylp, ylq, ylr, yls, ylt, ylu, ylv, ylw, ylx, yly, ylz, yma, ymb, ymc, ymd, yme, ymf, ymg, ymh, ymi, ymj, ymk, yml, ymm, ymn, ymo, ymp, ymq, ymr, yms, ymt, ymu, ymv, ymw, ymx, ymy, ymz, yna, ynb, ync, ynd, yne, ynf, yng, ynh, yni, ynj, ynk, ynl, ynm, ynn, yno, ynp, ynq, ynr, yns, ynt, ynu, ynv, ynw, ynx, yny, ynz, yoa, yob, yoc, yod, yoe, yof, yog, yoh, yoi, yoj, yok, yol, yom, yon, yoo, yop, yoq, yor, yos, yot, you, yov, yow, yox, yoy, yoz, ypa, ypb, ypc, ypd, ype, ypf, ypg, yph, ypi, ypj, ypk, ypl, ypm, ypn, ypo, ypp, ypq, ypr, yps, ypt, ypu, ypv, ypw, ypx, ypy, ypz, yqa, yqb, yqc, yqd, yqe, yqf, yqg, yqh, yqi, yqj, yqk, yql, yqm, yqn, yqo, yqp, yqq, yqr, yqs, yqt, yqu, yqv, yqw, yqx, yqy, yqz, yra, yrb, yrc, yrd, yre, yrf, yrg, yrh, yri, yrj, yrk, yrl, yrm, yrn, yro, yrp, yrq, yrr, yrs, yrt, yru, yrv, yrw, yrx, yry, yrz, ysa, ysb, ysc, ysd, yse, ysf, ysg, ysh, ysi, ysj, ysk, ysl, ysm, ysn, yso, ysp, ysq, ysr, yss, yst, ysu, ysv, ysw, ysx, ysy, ysz, yta, ytb, ytc, ytd, yte, ytf, ytg, yth, yti, ytj, ytk, ytl, ytm, ytn, yto, ytp, ytq, ytr, yts, ytt, ytu, ytv, ytw, ytx, yty, ytz, yua, yub, yuc, yud, yue, yuf, yug, yuh, yui, yuj, yuk, yul, yum, yun, yuo, yup, yuq, yur, yus, yut, yuu, yuv, yuw, yux, yuy, yuz, yva, yvb, yvc, yvd, yve, yvf, yvg, yvh, yvi, yvj, yvk, yvl, yvm, yvn, yvo, yvp, yvq, yvr, yvs, yvt, yvu, yvv, yvw, yvx, yvy, yvz, ywa, ywb, ywc, ywd, ywe, ywf, ywg, ywh, ywi, ywj, ywk, ywl, ywm, ywn, ywo, ywp, ywq, ywr, yws, ywt, ywu, ywv, yww, ywx, ywy, ywz, yxa, yxb, yxc, yxd, yxe, yxf, yxg, yxh, yxi, yxj, yxk, yxl, yxm, yxn, yxo, yxp, yxq, yxr, yxs, yxt, yxu, yxv, yxw, yxx, yxy, yxz, yya, yyb, yyc, yyd, yye, yyf, yyg, yyh, yyi, yyj, yyk, yyl, yym, yyn, yyo, yyp, yyq, yyr, yys, yyt, yyu, yyv, yyw, yyx, yyy, yyz, yza, yzb, yzc, yzd, yze, yzf, yzg, yzh, yzi, yzj, yzk, yzl, yzm, yzn, yzo, yzp, yzq, yzr, yzs, yzt, yzu, yzv, yzw, yzx, yzy, yzz, 
zaa, zab, zac, zad, zae, zaf, zag, zah, zai, zaj, zak, zal, zam, zan, zao, zap, zaq, zar, zas, zat, zau, zav, zaw, zax, zay, zaz, zba, zbb, zbc, zbd, zbe, zbf, zbg, zbh, zbi, zbj, zbk, zbl, zbm, zbn, zbo, zbp, zbq, zbr, zbs, zbt, zbu, zbv, zbw, zbx, zby, zbz, zca, zcb, zcc, zcd, zce, zcf, zcg, zch, zci, zcj, zck, zcl, zcm, zcn, zco, zcp, zcq, zcr, zcs, zct, zcu, zcv, zcw, zcx, zcy, zcz, zda, zdb, zdc, zdd, zde, zdf, zdg, zdh, zdi, zdj, zdk, zdl, zdm, zdn, zdo, zdp, zdq, zdr, zds, zdt, zdu, zdv, zdw, zdx, zdy, zdz, zea, zeb, zec, zed, zee, zef, zeg, zeh, zei, zej, zek, zel, zem, zen, zeo, zep, zeq, zer, zes, zet, zeu, zev, zew, zex, zey, zez, zfa, zfb, zfc, zfd, zfe, zff, zfg, zfh, zfi, zfj, zfk, zfl, zfm, zfn, zfo, zfp, zfq, zfr, zfs, zft, zfu, zfv, zfw, zfx, zfy, zfz, zga, zgb, zgc, zgd, zge, zgf, zgg, zgh, zgi, zgj, zgk, zgl, zgm, zgn, zgo, zgp, zgq, zgr, zgs, zgt, zgu, zgv, zgw, zgx, zgy, zgz, zha, zhb, zhc, zhd, zhe, zhf, zhg, zhh, zhi, zhj, zhk, zhl, zhm, zhn, zho, zhp, zhq, zhr, zhs, zht, zhu, zhv, zhw, zhx, zhy, zhz, zia, zib, zic, zid, zie, zif, zig, zih, zii, zij, zik, zil, zim, zin, zio, zip, ziq, zir, zis, zit, ziu, ziv, ziw, zix, ziy, ziz, zja, zjb, zjc, zjd, zje, zjf, zjg, zjh, zji, zjj, zjk, zjl, zjm, zjn, zjo, zjp, zjq, zjr, zjs, zjt, zju, zjv, zjw, zjx, zjy, zjz, zka, zkb, zkc, zkd, zke, zkf, zkg, zkh, zki, zkj, zkk, zkl, zkm, zkn, zko, zkp, zkq, zkr, zks, zkt, zku, zkv, zkw, zkx, zky, zkz, zla, zlb, zlc, zld, zle, zlf, zlg, zlh, zli, zlj, zlk, zll, zlm, zln, zlo, zlp, zlq, zlr, zls, zlt, zlu, zlv, zlw, zlx, zly, zlz, zma, zmb, zmc, zmd, zme, zmf, zmg, zmh, zmi, zmj, zmk, zml, zmm, zmn, zmo, zmp, zmq, zmr, zms, zmt, zmu, zmv, zmw, zmx, zmy, zmz, zna, znb, znc, znd, zne, znf, zng, znh, zni, znj, znk, znl, znm, znn, zno, znp, znq, znr, zns, znt, znu, znv, znw, znx, zny, znz, zoa, zob, zoc, zod, zoe, zof, zog, zoh, zoi, zoj, zok, zol, zom, zon, zoo, zop, zoq, zor, zos, zot, zou, zov, zow, zox, zoy, zoz, zpa, zpb, zpc, zpd, zpe, zpf, zpg, zph, zpi, zpj, 
zpk, zpl, zpm, zpn, zpo, zpp, zpq, zpr, zps, zpt, zpu, zpv, zpw, zpx, zpy, zpz, zqa, zqb, zqc, zqd, zqe, zqf, zqg, zqh, zqi, zqj, zqk, zql, zqm, zqn, zqo, zqp, zqq, zqr, zqs, zqt, zqu, zqv, zqw, zqx, zqy, zqz, zra, zrb, zrc, zrd, zre, zrf, zrg, zrh, zri, zrj, zrk, zrl, zrm, zrn, zro, zrp, zrq, zrr, zrs, zrt, zru, zrv, zrw, zrx, zry, zrz, zsa, zsb, zsc, zsd, zse, zsf, zsg, zsh, zsi, zsj, zsk, zsl, zsm, zsn, zso, zsp, zsq, zsr, zss, zst, zsu, zsv, zsw, zsx, zsy, zsz, zta, ztb, ztc, ztd, zte, ztf, ztg, zth, zti, ztj, ztk, ztl, ztm, ztn, zto, ztp, ztq, ztr, zts, ztt, ztu, ztv, ztw, ztx, zty, ztz, zua, zub, zuc, zud, zue, zuf, zug, zuh, zui, zuj, zuk, zul, zum, zun, zuo, zup, zuq, zur, zus, zut, zuu, zuv, zuw, zux, zuy, zuz, zva, zvb, zvc, zvd, zve, zvf, zvg, zvh, zvi, zvj, zvk, zvl, zvm, zvn, zvo, zvp, zvq, zvr, zvs, zvt, zvu, zvv, zvw, zvx, zvy, zvz, zwa, zwb, zwc, zwd, zwe, zwf, zwg, zwh, zwi, zwj, zwk, zwl, zwm, zwn, zwo, zwp, zwq, zwr, zws, zwt, zwu, zwv, zww, zwx, zwy, zwz, zxa, zxb, zxc, zxd, zxe, zxf, zxg, zxh, zxi, zxj, zxk, zxl, zxm, zxn, zxo, zxp, zxq, zxr, zxs, zxt, zxu, zxv, zxw, zxx, zxy, zxz, zya, zyb, zyc, zyd, zye, zyf, zyg, zyh, zyi, zyj, zyk, zyl, zym, zyn, zyo, zyp, zyq, zyr, zys, zyt, zyu, zyv, zyw, zyx, zyy, zyz, zza, zzb, zzc, zzd, zze, zzf, zzg, zzh, zzi, zzj, zzk, zzl, zzm, zzn, zzo, zzp, zzq, zzr, zzs, zzt, zzu, zzv, zzw, zzx, zzy, zzz",
"禿, 尃, 夲, 帳, 檛, 祭, 驿, 荂, 蕎, 緩, 逾, 笱, 短, 蛔, 磺, 珤, 珫, 纡, 梊, 皻, 樹, 斧, 铩, 鰞, 咤, 腞, 塦, 笕, 瓿, 甂, 鷍, 爌, 踽, 撚, 适, 槧, 纇, 抹, 抳, 舞, 乪, 蜥, 崜, 嵱, 仁, 遒, 栨, 濖, 噏, 抐, 眸, 本, 议, 臎, 糚, 屙, 吽, 茰, 娱, 糄, 麖, 葌, 鄜, 篬, 鑩, 鶳, 源, 疿, 侤, 诿, 圅, 穌, 脗, 给, 舸, 痍, 简, 硷, 耋, 爼, 爎, 布, 喏, 核, 藜, 揎, 玝, 椛, 荝, 腙, 榀, 曤, 鋸, 雑, 戂, 蕠, 觪, 瀄, 鬮, 萈, 檽, 陱, 猰, 燹, 宿, 敠, 橖, 汙, 釂, 狄, 俑, 篖, 錎, 併, 缺, 赋, 椂, 頑, 鿏, 後, 涹, 净, 鈱, 殎, 香, 遫, 臺, 靚, 殎, 橔, 玐, 憿, 咳, 馈, 鹆, 鋙, 蟔, 摢, 髗, 稞, 弸, 垧, 螣, 逴, 喵, 梥, 辝, 舒, 荦, 曆, 黒, 梥, 昻, 珙, 唃, 朵, 魰, 婸, 蒕, 豣, 範, 锸, 碬, 峡, 孯, 辌, 毪, 崋, 鑫, 嬑, 灣, 宭, 焋, 攷, 箇, 竔, 鹫, 锢, 勽, 繍, 恷, 桪, 凇, 榋, 迅, 搳, 踤, 虲, 椋, 汣, 謀, 傢, 味, 焳, 羭, 嘚, 竏, 襝, 娦, 僘, 韩, 當, 芬, 埭, 鍏, 鸼, 複, 頦, 咝, 鵊, 衯, 欚, 枌, 鑽, 螐, 蘸, 銣, 唏, 濏, 纊, 萜, 瑛, 弨, 支, 栋, 嵤, 紴, 訫, 婀, 靑, 楴, 胡, 镅, 僊, 匔, 嫋, 儼, 櫏, 瘅, 飰, 溯, 垲, 症, 簓, 訳, 諃, 訔, 诊, 梻, 廒, 鵭, 藋, 极, 擑, 鸎, 詣, 縀, 繇, 廱, 池, 潠, 鞵, 雸, 貤, 放, 笼, 楃, 鸑, 麆, 莚, 佱, 喁, 蠼, 鸀, 黯, 浺, 臂, 跧, 榶, 扑, 繀, 陬, 攸, 峍, 版, 笁, 门, 燇, 叇, 猻, 荩, 崦, 乹, 痡, 鼽, 偶, 建, 暇, 俆, 乡, 猸, 钩, 諘, 齱, 動, 亃, 敬, 厦, 獧, 竬, 瞜, 搥, 趩, 騗, 跎, 鳆, 賡, 弴, 窿, 腟, 嫰, 暒, 凳, 樊, 遀, 炿, 鳚, 魨, 螎, 畭, 曡, 腝, 蛍, 匣, 闔, 隤, 幽, 菮, 牾, 斑, 灡, 缂, 觻, 竿, 澪, 趔, 鎯, 翯, 靎, 齢, 汲, 鞿, 竂, 撋, 濷, 摺, 祙, 傳, 臒, 房, 臨, 黤, 娧, 嚻, 綶, 憓, 骊, 迓, 耴, 塊, 蘾, 入, 閼, 獐, 鵥, 蠑, 箘, 夽, 徹, 揇, 妗, 鷡, 毬, 儚, 筪, 逎, 僞, 亹, 瑁, 旘, 胄, 鎞, 劮, 喻, 簚, 噫, 泏, 矠, 氕, 豠, 彌, 栎, 蓍, 匏, 惯, 傺, 廱, 啚, 寊, 馆, 鐪, 怎, 团, 傱, 矴, 艩, 觐, 帴, 蜧, 暒, 仟, 耠, 羌, 鷎, 菙, 筎, 咐, 夂, 紐, 躢, 叹, 薽, 嵸, 驠, 汒, 蘏, 氾, 暙, 粵, 詏, 靯, 嬌, 蘜, 粔, 渲, 鬀, 辻, 鸨, 畛, 刺, 螖, 犰, 鏬, 很, 辞, 元, 铬, 毮, 蛹, 廉, 廒, 嗙, 鵒, 憦, 凲, 阭, 燾, 鮜, 鋣, 骲, 窺, 鲕, 鋺, 項, 瞨, 友, 竖, 闠, 涛, 斫, 釿, 撀, 逾, 袷, 嵦, 朑, 萗, 軣, 部, 匥, 褂, 楾, 籏, 京, 彷, 椃, 怇, 踂, 猅, 覠, 颲, 鸲, 鰌, 筼, 筃, 聉, 塑, 窕, 峵, 謱, 椉, 鎟, 篓, 煌, 蚕, 覩, 櫭, 鬤, 薰, 泗, 騶, 篮, 隲, 苡, 茚, 虿, 讍, 怆, 戋, 窫, 凁, 懈, 塇, 鎲, 癴, 蝽, 浂, 塌, 襓, 帴, 炀, 鱟, 蚴, 誋, 蠽, 踀, 谶, 猾, 鑠, 峀, 芰, 斗, 窃, 蝨, 篂, 闚, 鹣, 娻, 只, 鯈, 捦, 玫, 缡, 崒, 贜, 無, 冊, 鞻, 紤, 瑿, 州, 飴, 噯, 夼, 摘, 惆, 嶥, 蓭, 柱, 旤, 息, 燌, 炀, 拃, 轓, 脡, 簐, 腇, 揯, 粪, 羪, 畍, 挋, 蘔, 藋, 墎, 嚱, 忺, 捅, 萦, 勞, 軺, 蜅, 坩, 轇, 恂, 苼, 厣, 滐, 怚, 簶, 剡, 脹, 讫, 內, 潬, 褁, 烔, 吾, 蒲, 驷, 鐴, 煽, 編, 藛, 燲, 瘯, 捣, 蹌, 垾, 鮐, 蛩, 广, 帑, 恹, 網, 恕, 螊, 鵸, 魓, 罨, 藧, 潏, 渪, 籾, 畟, 鵑, 綅, 飼, 嚟, 梷, 趖, 讇, 毤, 躨, 虇, 橉, 匚, 蠾, 撷, 浥, 罃, 鳀, 音, 洉, 蹅, 
饼, 猃, 逩, 吁, 縡, 笖, 蛗, 忮, 蕫, 悈, 莞, 鉌, 崱, 泃, 柘, 繻, 茘, 箽, 葝, 龑, 僨, 蛟, 陆, 頻, 疅, 浿, 翦, 夕, 踧, 嬺, 墦, 蔊, 红, 閰, 鏝, 菞, 輍, 涺, 敩, 祠, 粓, 擛, 襊, 哗, 彾, 亘, 巍, 淼, 镳, 昩, 讃, 佴, 紺, 坪, 澔, 溆, 洐, 扑, 鞶, 矮, 岂, 溔, 死, 絛, 猧, 溷, 欔, 鏵, 犯, 碃, 詹, 昵, 鉹, 姣, 炚, 卙, 聒, 妼, 悅, 诣, 墹, 繕, 鞕, 譕, 珝, 眓, 粬, 躰, 孼, 蠀, 锢, 琣, 趝, 椉, 蕐, 渄, 鲨, 戽, 袹, 枵, 摼, 鷥, 丢, 坛, 蒉, 郹, 耀, 應, 乴, 磙, 戹, 荟, 膺, 噽, 燭, 覨, 睮, 鄜, 佧, 簏, 邗, 駚, 蠮, 蠖, 賍, 頗, 甯, 魰, 恲, 灵, 帖, 帹, 蛓, 维, 瓝, 痡, 駗, 矱, 鳵, 鍕, 昼, 谡, 帴, 蒇, 荭, 毉, 谌, 宬, 跀, 臙, 覃, 勮, 懓, 謅, 蛤, 靸, 廦, 笊, 薼, 圙, 稶, 苟, 鷕, 譛, 鵏, 锁, 購, 緛, 栤, 諮, 铐, 頺, 楀, 蕈, 嬻, 毮, 遖, 嘞, 惻, 碞, 霴, 淲, 斚, 呬, 阕, 鉚, 氻, 亊, 控, 葲, 髤, 闽, 婹, 蚮, 汪, 纛, 麶, 齱, 扁, 呬, 竵, 酊, 砺, 俇, 橽, 亶, 膯, 截, 躜, 剞, 鿂, 嵛, 潵, 辥, 浱, 詾, 劰, 钍, 焛, 佡, 諼, 呻, 敱, 谬, 淣, 煙, 輚, 捯, 恔, 炝, 鴘, 峈, 鑴, 蟲, 茱, 璖, 鮖, 询, 謂, 夙, 憻, 夶, 桻, 姩, 龦, 蕛, 浻, 鎤, 种, 鏽, 那, 耟, 俖, 溈, 関, 轌, 效, 乃, 沁, 逰, 苯, 襋, 猃, 鴅, 櫜, 椹, 兗, 牦, 傴, 肳, 粪, 憽, 溍, 迦, 攬, 瀁, 箢, 浈, 閰, 柸, 骤, 猱, 瘓, 呡, 撆, 鉇, 玊, 遶, 嘬, 窌, 淟, 攧, 穏, 贾, 昨, 扒, 洨, 價, 參, 靣, 鷆, 箚, 嶼, 缱, 埝, 臱, 浜, 遾, 嬻, 嬌, 錦, 付, 齑, 淺, 臰, 筡, 楅, 鰢, 藁, 积, 畈, 刷, 筢, 源, 撔, 餏, 貍, 赥, 囱, 旍, 嚬, 鈴, 喎, 呯",
];
let regex = Regex::new(r"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+").unwrap();
for (i, text) in test_cases.iter().enumerate() {
let my_frags = fragmentize(text);
let re_frags: Vec<_> = regex.find_iter(text)
.filter_map(|m| m.ok())
.map(|m| m.as_str().to_string())
.collect();
let same = my_frags.len() == re_frags.len() && my_frags.iter().zip(&re_frags).all(|(a,b)| a == b);
println!("Test case {}, {} chars: {}", i, text.chars().count(), if same { "MATCH" } else { "DIFFER" });
if !same {
for (index, (a, b)) in my_frags.iter().zip(&re_frags).enumerate() {
if a != b {
println!(" First discrepancy at index {}: my_frags={:?}, re_frags={:?}", index, a, b);
break;
}
}
}
}
}
// Unicode version: 16.0.0.
//
// ucd-generate 0.3.1 is available on crates.io.
/// Inclusive `(first, last)` code-point ranges for Unicode general category
/// `L` (Letter), Unicode 16.0.0.
///
/// The ranges are strictly ascending and never overlap, so a membership test
/// can binary-search the slice.
///
/// Endpoints that text normalisation rewrites are spelled as `\u{…}` escapes
/// rather than literal characters. This covers NFC singletons and composition
/// exclusions (Kelvin sign, Ohm sign, CJK compatibility ideographs, Hebrew
/// presentation forms, nukta letters), Latin ligatures, and fullwidth /
/// halfwidth forms. Pasting those as literals previously produced entries such
/// as `('ff', 'st')` (not valid `char` literals) and a second `('A', 'Z')`
/// row, which broke both compilation and the ordering guarantee.
pub const LETTER: &'static [(char, char)] = &[
    ('A', 'Z'),
    ('a', 'z'),
    ('ª', 'ª'),
    ('µ', 'µ'),
    ('º', 'º'),
    ('À', 'Ö'),
    ('Ø', 'ö'),
    ('ø', 'ˁ'),
    ('ˆ', 'ˑ'),
    ('ˠ', 'ˤ'),
    ('ˬ', 'ˬ'),
    ('ˮ', 'ˮ'),
    ('Ͱ', '\u{374}'), // U+0374 GREEK NUMERAL SIGN normalises to U+02B9
    ('Ͷ', 'ͷ'),
    ('ͺ', 'ͽ'),
    ('Ϳ', 'Ϳ'),
    ('Ά', 'Ά'),
    ('Έ', 'Ί'),
    ('Ό', 'Ό'),
    ('Ύ', 'Ρ'),
    ('Σ', 'ϵ'),
    ('Ϸ', 'ҁ'),
    ('Ҋ', 'ԯ'),
    ('Ա', 'Ֆ'),
    ('ՙ', 'ՙ'),
    ('ՠ', 'ֈ'),
    ('א', 'ת'),
    ('ׯ', 'ײ'),
    ('ؠ', 'ي'),
    ('ٮ', 'ٯ'),
    ('ٱ', 'ۓ'),
    ('ە', 'ە'),
    ('ۥ', 'ۦ'),
    ('ۮ', 'ۯ'),
    ('ۺ', 'ۼ'),
    ('ۿ', 'ۿ'),
    ('ܐ', 'ܐ'),
    ('ܒ', 'ܯ'),
    ('ݍ', 'ޥ'),
    ('ޱ', 'ޱ'),
    ('ߊ', 'ߪ'),
    ('ߴ', 'ߵ'),
    ('ߺ', 'ߺ'),
    ('ࠀ', 'ࠕ'),
    ('ࠚ', 'ࠚ'),
    ('ࠤ', 'ࠤ'),
    ('ࠨ', 'ࠨ'),
    ('ࡀ', 'ࡘ'),
    ('ࡠ', 'ࡪ'),
    ('ࡰ', 'ࢇ'),
    ('ࢉ', 'ࢎ'),
    ('ࢠ', 'ࣉ'),
    ('ऄ', 'ह'),
    ('ऽ', 'ऽ'),
    ('ॐ', 'ॐ'),
    ('\u{958}', '\u{961}'), // Devanagari QA..VOCALIC LL; U+0958 is a composition exclusion
    ('ॱ', 'ঀ'),
    ('অ', 'ঌ'),
    ('এ', 'ঐ'),
    ('ও', 'ন'),
    ('প', 'র'),
    ('ল', 'ল'),
    ('শ', 'হ'),
    ('ঽ', 'ঽ'),
    ('ৎ', 'ৎ'),
    ('\u{9dc}', '\u{9dd}'), // Bengali RRA..RHA (composition exclusions)
    ('\u{9df}', '\u{9e1}'), // Bengali YYA..VOCALIC LL
    ('ৰ', 'ৱ'),
    ('ৼ', 'ৼ'),
    ('ਅ', 'ਊ'),
    ('ਏ', 'ਐ'),
    ('ਓ', 'ਨ'),
    ('ਪ', 'ਰ'),
    ('\u{a32}', '\u{a33}'), // Gurmukhi LA..LLA
    ('\u{a35}', '\u{a36}'), // Gurmukhi VA..SHA
    ('ਸ', 'ਹ'),
    ('\u{a59}', '\u{a5c}'), // Gurmukhi KHHA..RRA
    ('\u{a5e}', '\u{a5e}'), // Gurmukhi FA
    ('ੲ', 'ੴ'),
    ('અ', 'ઍ'),
    ('એ', 'ઑ'),
    ('ઓ', 'ન'),
    ('પ', 'ર'),
    ('લ', 'ળ'),
    ('વ', 'હ'),
    ('ઽ', 'ઽ'),
    ('ૐ', 'ૐ'),
    ('ૠ', 'ૡ'),
    ('ૹ', 'ૹ'),
    ('ଅ', 'ଌ'),
    ('ଏ', 'ଐ'),
    ('ଓ', 'ନ'),
    ('ପ', 'ର'),
    ('ଲ', 'ଳ'),
    ('ଵ', 'ହ'),
    ('ଽ', 'ଽ'),
    ('\u{b5c}', '\u{b5d}'), // Oriya RRA..RHA (composition exclusions)
    ('ୟ', 'ୡ'),
    ('ୱ', 'ୱ'),
    ('ஃ', 'ஃ'),
    ('அ', 'ஊ'),
    ('எ', 'ஐ'),
    ('ஒ', 'க'),
    ('ங', 'ச'),
    ('ஜ', 'ஜ'),
    ('ஞ', 'ட'),
    ('ண', 'த'),
    ('ந', 'ப'),
    ('ம', 'ஹ'),
    ('ௐ', 'ௐ'),
    ('అ', 'ఌ'),
    ('ఎ', 'ఐ'),
    ('ఒ', 'న'),
    ('ప', 'హ'),
    ('ఽ', 'ఽ'),
    ('ౘ', 'ౚ'),
    ('ౝ', 'ౝ'),
    ('ౠ', 'ౡ'),
    ('ಀ', 'ಀ'),
    ('ಅ', 'ಌ'),
    ('ಎ', 'ಐ'),
    ('ಒ', 'ನ'),
    ('ಪ', 'ಳ'),
    ('ವ', 'ಹ'),
    ('ಽ', 'ಽ'),
    ('ೝ', 'ೞ'),
    ('ೠ', 'ೡ'),
    ('ೱ', 'ೲ'),
    ('ഄ', 'ഌ'),
    ('എ', 'ഐ'),
    ('ഒ', 'ഺ'),
    ('ഽ', 'ഽ'),
    ('ൎ', 'ൎ'),
    ('ൔ', 'ൖ'),
    ('ൟ', 'ൡ'),
    ('ൺ', 'ൿ'),
    ('අ', 'ඖ'),
    ('ක', 'න'),
    ('ඳ', 'ර'),
    ('ල', 'ල'),
    ('ව', 'ෆ'),
    ('ก', 'ะ'),
    ('า', 'ำ'),
    ('เ', 'ๆ'),
    ('ກ', 'ຂ'),
    ('ຄ', 'ຄ'),
    ('ຆ', 'ຊ'),
    ('ຌ', 'ຣ'),
    ('ລ', 'ລ'),
    ('ວ', 'ະ'),
    ('າ', 'ຳ'),
    ('ຽ', 'ຽ'),
    ('ເ', 'ໄ'),
    ('ໆ', 'ໆ'),
    ('ໜ', 'ໟ'),
    ('ༀ', 'ༀ'),
    ('ཀ', 'ཇ'),
    ('ཉ', 'ཬ'),
    ('ྈ', 'ྌ'),
    ('က', 'ဪ'),
    ('ဿ', 'ဿ'),
    ('ၐ', 'ၕ'),
    ('ၚ', 'ၝ'),
    ('ၡ', 'ၡ'),
    ('ၥ', 'ၦ'),
    ('ၮ', 'ၰ'),
    ('ၵ', 'ႁ'),
    ('ႎ', 'ႎ'),
    ('Ⴀ', 'Ⴥ'),
    ('Ⴧ', 'Ⴧ'),
    ('Ⴭ', 'Ⴭ'),
    ('ა', 'ჺ'),
    ('ჼ', 'ቈ'),
    ('ቊ', 'ቍ'),
    ('ቐ', 'ቖ'),
    ('ቘ', 'ቘ'),
    ('ቚ', 'ቝ'),
    ('በ', 'ኈ'),
    ('ኊ', 'ኍ'),
    ('ነ', 'ኰ'),
    ('ኲ', 'ኵ'),
    ('ኸ', 'ኾ'),
    ('ዀ', 'ዀ'),
    ('ዂ', 'ዅ'),
    ('ወ', 'ዖ'),
    ('ዘ', 'ጐ'),
    ('ጒ', 'ጕ'),
    ('ጘ', 'ፚ'),
    ('ᎀ', 'ᎏ'),
    ('Ꭰ', 'Ᏽ'),
    ('ᏸ', 'ᏽ'),
    ('ᐁ', 'ᙬ'),
    ('ᙯ', 'ᙿ'),
    ('ᚁ', 'ᚚ'),
    ('ᚠ', 'ᛪ'),
    ('ᛱ', 'ᛸ'),
    ('ᜀ', 'ᜑ'),
    ('ᜟ', 'ᜱ'),
    ('ᝀ', 'ᝑ'),
    ('ᝠ', 'ᝬ'),
    ('ᝮ', 'ᝰ'),
    ('ក', 'ឳ'),
    ('ៗ', 'ៗ'),
    ('ៜ', 'ៜ'),
    ('ᠠ', 'ᡸ'),
    ('ᢀ', 'ᢄ'),
    ('ᢇ', 'ᢨ'),
    ('ᢪ', 'ᢪ'),
    ('ᢰ', 'ᣵ'),
    ('ᤀ', 'ᤞ'),
    ('ᥐ', 'ᥭ'),
    ('ᥰ', 'ᥴ'),
    ('ᦀ', 'ᦫ'),
    ('ᦰ', 'ᧉ'),
    ('ᨀ', 'ᨖ'),
    ('ᨠ', 'ᩔ'),
    ('ᪧ', 'ᪧ'),
    ('ᬅ', 'ᬳ'),
    ('ᭅ', 'ᭌ'),
    ('ᮃ', 'ᮠ'),
    ('ᮮ', 'ᮯ'),
    ('ᮺ', 'ᯥ'),
    ('ᰀ', 'ᰣ'),
    ('ᱍ', 'ᱏ'),
    ('ᱚ', 'ᱽ'),
    ('ᲀ', 'ᲊ'),
    ('Ა', 'Ჺ'),
    ('Ჽ', 'Ჿ'),
    ('ᳩ', 'ᳬ'),
    ('ᳮ', 'ᳳ'),
    ('ᳵ', 'ᳶ'),
    ('ᳺ', 'ᳺ'),
    ('ᴀ', 'ᶿ'),
    ('Ḁ', 'ἕ'),
    ('Ἐ', 'Ἕ'),
    ('ἠ', 'ὅ'),
    ('Ὀ', 'Ὅ'),
    ('ὐ', 'ὗ'),
    ('Ὑ', 'Ὑ'),
    ('Ὓ', 'Ὓ'),
    ('Ὕ', 'Ὕ'),
    ('\u{1f5f}', '\u{1f7d}'), // U+1F7D (omega with oxia) normalises to U+03CE
    ('ᾀ', 'ᾴ'),
    ('ᾶ', 'ᾼ'),
    ('\u{1fbe}', '\u{1fbe}'), // GREEK PROSGEGRAMMENI; normalises to U+03B9
    ('ῂ', 'ῄ'),
    ('ῆ', 'ῌ'),
    ('\u{1fd0}', '\u{1fd3}'), // U+1FD3 normalises to U+0390
    ('\u{1fd6}', '\u{1fdb}'), // U+1FDB normalises to U+038A
    ('ῠ', 'Ῥ'),
    ('ῲ', 'ῴ'),
    ('ῶ', 'ῼ'),
    ('ⁱ', 'ⁱ'),
    ('ⁿ', 'ⁿ'),
    ('ₐ', 'ₜ'),
    ('ℂ', 'ℂ'),
    ('ℇ', 'ℇ'),
    ('ℊ', 'ℓ'),
    ('ℕ', 'ℕ'),
    ('ℙ', 'ℝ'),
    ('ℤ', 'ℤ'),
    ('\u{2126}', '\u{2126}'), // OHM SIGN; normalises to U+03A9
    ('ℨ', 'ℨ'),
    ('\u{212a}', '\u{212d}'), // KELVIN SIGN..BLACK-LETTER CAPITAL C; U+212A normalises to 'K'
    ('ℯ', 'ℹ'),
    ('ℼ', 'ℿ'),
    ('ⅅ', 'ⅉ'),
    ('ⅎ', 'ⅎ'),
    ('Ↄ', 'ↄ'),
    ('Ⰰ', 'ⳤ'),
    ('Ⳬ', 'ⳮ'),
    ('Ⳳ', 'ⳳ'),
    ('ⴀ', 'ⴥ'),
    ('ⴧ', 'ⴧ'),
    ('ⴭ', 'ⴭ'),
    ('ⴰ', 'ⵧ'),
    ('ⵯ', 'ⵯ'),
    ('ⶀ', 'ⶖ'),
    ('ⶠ', 'ⶦ'),
    ('ⶨ', 'ⶮ'),
    ('ⶰ', 'ⶶ'),
    ('ⶸ', 'ⶾ'),
    ('ⷀ', 'ⷆ'),
    ('ⷈ', 'ⷎ'),
    ('ⷐ', 'ⷖ'),
    ('ⷘ', 'ⷞ'),
    ('ⸯ', 'ⸯ'),
    ('々', '〆'),
    ('〱', '〵'),
    ('〻', '〼'),
    ('ぁ', 'ゖ'),
    ('ゝ', 'ゟ'),
    ('ァ', 'ヺ'),
    ('ー', 'ヿ'),
    ('ㄅ', 'ㄯ'),
    ('ㄱ', 'ㆎ'),
    ('ㆠ', 'ㆿ'),
    ('ㇰ', 'ㇿ'),
    ('㐀', '䶿'),
    ('一', 'ꒌ'),
    ('ꓐ', 'ꓽ'),
    ('ꔀ', 'ꘌ'),
    ('ꘐ', 'ꘟ'),
    ('ꘪ', 'ꘫ'),
    ('Ꙁ', 'ꙮ'),
    ('ꙿ', 'ꚝ'),
    ('ꚠ', 'ꛥ'),
    ('ꜗ', 'ꜟ'),
    ('Ꜣ', 'ꞈ'),
    ('Ꞌ', 'ꟍ'),
    ('Ꟑ', 'ꟑ'),
    ('ꟓ', 'ꟓ'),
    ('ꟕ', 'Ƛ'),
    ('ꟲ', 'ꠁ'),
    ('ꠃ', 'ꠅ'),
    ('ꠇ', 'ꠊ'),
    ('ꠌ', 'ꠢ'),
    ('ꡀ', 'ꡳ'),
    ('ꢂ', 'ꢳ'),
    ('ꣲ', 'ꣷ'),
    ('ꣻ', 'ꣻ'),
    ('ꣽ', 'ꣾ'),
    ('ꤊ', 'ꤥ'),
    ('ꤰ', 'ꥆ'),
    ('ꥠ', 'ꥼ'),
    ('ꦄ', 'ꦲ'),
    ('ꧏ', 'ꧏ'),
    ('ꧠ', 'ꧤ'),
    ('ꧦ', 'ꧯ'),
    ('ꧺ', 'ꧾ'),
    ('ꨀ', 'ꨨ'),
    ('ꩀ', 'ꩂ'),
    ('ꩄ', 'ꩋ'),
    ('ꩠ', 'ꩶ'),
    ('ꩺ', 'ꩺ'),
    ('ꩾ', 'ꪯ'),
    ('ꪱ', 'ꪱ'),
    ('ꪵ', 'ꪶ'),
    ('ꪹ', 'ꪽ'),
    ('ꫀ', 'ꫀ'),
    ('ꫂ', 'ꫂ'),
    ('ꫛ', 'ꫝ'),
    ('ꫠ', 'ꫪ'),
    ('ꫲ', 'ꫴ'),
    ('ꬁ', 'ꬆ'),
    ('ꬉ', 'ꬎ'),
    ('ꬑ', 'ꬖ'),
    ('ꬠ', 'ꬦ'),
    ('ꬨ', 'ꬮ'),
    ('ꬰ', 'ꭚ'),
    ('ꭜ', 'ꭩ'),
    ('ꭰ', 'ꯢ'),
    ('가', '힣'),
    ('ힰ', 'ퟆ'),
    ('ퟋ', 'ퟻ'),
    ('\u{f900}', '\u{fa6d}'), // CJK compatibility ideographs (normalise to unified ideographs)
    ('\u{fa70}', '\u{fad9}'), // CJK compatibility ideographs, continued
    ('\u{fb00}', '\u{fb06}'), // LATIN SMALL LIGATURE FF..ST
    ('ﬓ', 'ﬗ'),
    ('\u{fb1d}', '\u{fb1d}'), // Hebrew presentation forms (composition exclusions) from here...
    ('\u{fb1f}', '\u{fb28}'),
    ('\u{fb2a}', '\u{fb36}'),
    ('\u{fb38}', '\u{fb3c}'),
    ('\u{fb3e}', '\u{fb3e}'),
    ('\u{fb40}', '\u{fb41}'),
    ('\u{fb43}', '\u{fb44}'),
    ('\u{fb46}', '\u{fbb1}'), // ...through the start of Arabic presentation forms-A
    ('ﯓ', 'ﴽ'),
    ('ﵐ', 'ﶏ'),
    ('ﶒ', 'ﷇ'),
    ('ﷰ', 'ﷻ'),
    ('ﹰ', 'ﹴ'),
    ('ﹶ', 'ﻼ'),
    ('\u{ff21}', '\u{ff3a}'), // FULLWIDTH LATIN CAPITAL LETTER A..Z
    ('\u{ff41}', '\u{ff5a}'), // FULLWIDTH LATIN SMALL LETTER A..Z
    ('\u{ff66}', '\u{ffbe}'), // HALFWIDTH KATAKANA WO..HALFWIDTH HANGUL LETTER HIEUH
    ('\u{ffc2}', '\u{ffc7}'), // HALFWIDTH HANGUL LETTER A..E
    ('\u{ffca}', '\u{ffcf}'), // HALFWIDTH HANGUL LETTER YEO..OE
    ('\u{ffd2}', '\u{ffd7}'), // HALFWIDTH HANGUL LETTER YO..YU
    ('\u{ffda}', '\u{ffdc}'), // HALFWIDTH HANGUL LETTER EU..I
    ('𐀀', '𐀋'),
    ('𐀍', '𐀦'),
    ('𐀨', '𐀺'),
    ('𐀼', '𐀽'),
    ('𐀿', '𐁍'),
    ('𐁐', '𐁝'),
    ('𐂀', '𐃺'),
    ('𐊀', '𐊜'),
    ('𐊠', '𐋐'),
    ('𐌀', '𐌟'),
    ('𐌭', '𐍀'),
    ('𐍂', '𐍉'),
    ('𐍐', '𐍵'),
    ('𐎀', '𐎝'),
    ('𐎠', '𐏃'),
    ('𐏈', '𐏏'),
    ('𐐀', '𐒝'),
    ('𐒰', '𐓓'),
    ('𐓘', '𐓻'),
    ('𐔀', '𐔧'),
    ('𐔰', '𐕣'),
    ('𐕰', '𐕺'),
    ('𐕼', '𐖊'),
    ('𐖌', '𐖒'),
    ('𐖔', '𐖕'),
    ('𐖗', '𐖡'),
    ('𐖣', '𐖱'),
    ('𐖳', '𐖹'),
    ('𐖻', '𐖼'),
    ('𐗀', '𐗳'),
    ('𐘀', '𐜶'),
    ('𐝀', '𐝕'),
    ('𐝠', '𐝧'),
    ('𐞀', '𐞅'),
    ('𐞇', '𐞰'),
    ('𐞲', '𐞺'),
    ('𐠀', '𐠅'),
    ('𐠈', '𐠈'),
    ('𐠊', '𐠵'),
    ('𐠷', '𐠸'),
    ('𐠼', '𐠼'),
    ('𐠿', '𐡕'),
    ('𐡠', '𐡶'),
    ('𐢀', '𐢞'),
    ('𐣠', '𐣲'),
    ('𐣴', '𐣵'),
    ('𐤀', '𐤕'),
    ('𐤠', '𐤹'),
    ('𐦀', '𐦷'),
    ('𐦾', '𐦿'),
    ('𐨀', '𐨀'),
    ('𐨐', '𐨓'),
    ('𐨕', '𐨗'),
    ('𐨙', '𐨵'),
    ('𐩠', '𐩼'),
    ('𐪀', '𐪜'),
    ('𐫀', '𐫇'),
    ('𐫉', '𐫤'),
    ('𐬀', '𐬵'),
    ('𐭀', '𐭕'),
    ('𐭠', '𐭲'),
    ('𐮀', '𐮑'),
    ('𐰀', '𐱈'),
    ('𐲀', '𐲲'),
    ('𐳀', '𐳲'),
    ('𐴀', '𐴣'),
    ('𐵊', '𐵥'),
    ('𐵯', '𐶅'),
    ('𐺀', '𐺩'),
    ('𐺰', '𐺱'),
    ('𐻂', '𐻄'),
    ('𐼀', '𐼜'),
    ('𐼧', '𐼧'),
    ('𐼰', '𐽅'),
    ('𐽰', '𐾁'),
    ('𐾰', '𐿄'),
    ('𐿠', '𐿶'),
    ('𑀃', '𑀷'),
    ('𑁱', '𑁲'),
    ('𑁵', '𑁵'),
    ('𑂃', '𑂯'),
    ('𑃐', '𑃨'),
    ('𑄃', '𑄦'),
    ('𑅄', '𑅄'),
    ('𑅇', '𑅇'),
    ('𑅐', '𑅲'),
    ('𑅶', '𑅶'),
    ('𑆃', '𑆲'),
    ('𑇁', '𑇄'),
    ('𑇚', '𑇚'),
    ('𑇜', '𑇜'),
    ('𑈀', '𑈑'),
    ('𑈓', '𑈫'),
    ('𑈿', '𑉀'),
    ('𑊀', '𑊆'),
    ('𑊈', '𑊈'),
    ('𑊊', '𑊍'),
    ('𑊏', '𑊝'),
    ('𑊟', '𑊨'),
    ('𑊰', '𑋞'),
    ('𑌅', '𑌌'),
    ('𑌏', '𑌐'),
    ('𑌓', '𑌨'),
    ('𑌪', '𑌰'),
    ('𑌲', '𑌳'),
    ('𑌵', '𑌹'),
    ('𑌽', '𑌽'),
    ('𑍐', '𑍐'),
    ('𑍝', '𑍡'),
    ('𑎀', '𑎉'),
    ('𑎋', '𑎋'),
    ('𑎎', '𑎎'),
    ('𑎐', '𑎵'),
    ('𑎷', '𑎷'),
    ('𑏑', '𑏑'),
    ('𑏓', '𑏓'),
    ('𑐀', '𑐴'),
    ('𑑇', '𑑊'),
    ('𑑟', '𑑡'),
    ('𑒀', '𑒯'),
    ('𑓄', '𑓅'),
    ('𑓇', '𑓇'),
    ('𑖀', '𑖮'),
    ('𑗘', '𑗛'),
    ('𑘀', '𑘯'),
    ('𑙄', '𑙄'),
    ('𑚀', '𑚪'),
    ('𑚸', '𑚸'),
    ('𑜀', '𑜚'),
    ('𑝀', '𑝆'),
    ('𑠀', '𑠫'),
    ('𑢠', '𑣟'),
    ('𑣿', '𑤆'),
    ('𑤉', '𑤉'),
    ('𑤌', '𑤓'),
    ('𑤕', '𑤖'),
    ('𑤘', '𑤯'),
    ('𑤿', '𑤿'),
    ('𑥁', '𑥁'),
    ('𑦠', '𑦧'),
    ('𑦪', '𑧐'),
    ('𑧡', '𑧡'),
    ('𑧣', '𑧣'),
    ('𑨀', '𑨀'),
    ('𑨋', '𑨲'),
    ('𑨺', '𑨺'),
    ('𑩐', '𑩐'),
    ('𑩜', '𑪉'),
    ('𑪝', '𑪝'),
    ('𑪰', '𑫸'),
    ('𑯀', '𑯠'),
    ('𑰀', '𑰈'),
    ('𑰊', '𑰮'),
    ('𑱀', '𑱀'),
    ('𑱲', '𑲏'),
    ('𑴀', '𑴆'),
    ('𑴈', '𑴉'),
    ('𑴋', '𑴰'),
    ('𑵆', '𑵆'),
    ('𑵠', '𑵥'),
    ('𑵧', '𑵨'),
    ('𑵪', '𑶉'),
    ('𑶘', '𑶘'),
    ('𑻠', '𑻲'),
    ('𑼂', '𑼂'),
    ('𑼄', '𑼐'),
    ('𑼒', '𑼳'),
    ('𑾰', '𑾰'),
    ('𒀀', '𒎙'),
    ('𒒀', '𒕃'),
    ('𒾐', '𒿰'),
    ('𓀀', '𓐯'),
    ('𓑁', '𓑆'),
    ('𓑠', '𔏺'),
    ('𔐀', '𔙆'),
    ('𖄀', '𖄝'),
    ('𖠀', '𖨸'),
    ('𖩀', '𖩞'),
    ('𖩰', '𖪾'),
    ('𖫐', '𖫭'),
    ('𖬀', '𖬯'),
    ('𖭀', '𖭃'),
    ('𖭣', '𖭷'),
    ('𖭽', '𖮏'),
    ('𖵀', '𖵬'),
    ('𖹀', '𖹿'),
    ('𖼀', '𖽊'),
    ('𖽐', '𖽐'),
    ('𖾓', '𖾟'),
    ('𖿠', '𖿡'),
    ('𖿣', '𖿣'),
    ('𗀀', '𘟷'),
    ('𘠀', '𘳕'),
    ('𘳿', '𘴈'),
    ('𚿰', '𚿳'),
    ('𚿵', '𚿻'),
    ('𚿽', '𚿾'),
    ('𛀀', '𛄢'),
    ('𛄲', '𛄲'),
    ('𛅐', '𛅒'),
    ('𛅕', '𛅕'),
    ('𛅤', '𛅧'),
    ('𛅰', '𛋻'),
    ('𛰀', '𛱪'),
    ('𛱰', '𛱼'),
    ('𛲀', '𛲈'),
    ('𛲐', '𛲙'),
    ('𝐀', '𝑔'),
    ('𝑖', '𝒜'),
    ('𝒞', '𝒟'),
    ('𝒢', '𝒢'),
    ('𝒥', '𝒦'),
    ('𝒩', '𝒬'),
    ('𝒮', '𝒹'),
    ('𝒻', '𝒻'),
    ('𝒽', '𝓃'),
    ('𝓅', '𝔅'),
    ('𝔇', '𝔊'),
    ('𝔍', '𝔔'),
    ('𝔖', '𝔜'),
    ('𝔞', '𝔹'),
    ('𝔻', '𝔾'),
    ('𝕀', '𝕄'),
    ('𝕆', '𝕆'),
    ('𝕊', '𝕐'),
    ('𝕒', '𝚥'),
    ('𝚨', '𝛀'),
    ('𝛂', '𝛚'),
    ('𝛜', '𝛺'),
    ('𝛼', '𝜔'),
    ('𝜖', '𝜴'),
    ('𝜶', '𝝎'),
    ('𝝐', '𝝮'),
    ('𝝰', '𝞈'),
    ('𝞊', '𝞨'),
    ('𝞪', '𝟂'),
    ('𝟄', '𝟋'),
    ('𝼀', '𝼞'),
    ('𝼥', '𝼪'),
    ('𞀰', '𞁭'),
    ('𞄀', '𞄬'),
    ('𞄷', '𞄽'),
    ('𞅎', '𞅎'),
    ('𞊐', '𞊭'),
    ('𞋀', '𞋫'),
    ('𞓐', '𞓫'),
    ('𞗐', '𞗭'),
    ('𞗰', '𞗰'),
    ('𞟠', '𞟦'),
    ('𞟨', '𞟫'),
    ('𞟭', '𞟮'),
    ('𞟰', '𞟾'),
    ('𞠀', '𞣄'),
    ('𞤀', '𞥃'),
    ('𞥋', '𞥋'),
    ('𞸀', '𞸃'),
    ('𞸅', '𞸟'),
    ('𞸡', '𞸢'),
    ('𞸤', '𞸤'),
    ('𞸧', '𞸧'),
    ('𞸩', '𞸲'),
    ('𞸴', '𞸷'),
    ('𞸹', '𞸹'),
    ('𞸻', '𞸻'),
    ('𞹂', '𞹂'),
    ('𞹇', '𞹇'),
    ('𞹉', '𞹉'),
    ('𞹋', '𞹋'),
    ('𞹍', '𞹏'),
    ('𞹑', '𞹒'),
    ('𞹔', '𞹔'),
    ('𞹗', '𞹗'),
    ('𞹙', '𞹙'),
    ('𞹛', '𞹛'),
    ('𞹝', '𞹝'),
    ('𞹟', '𞹟'),
    ('𞹡', '𞹢'),
    ('𞹤', '𞹤'),
    ('𞹧', '𞹪'),
    ('𞹬', '𞹲'),
    ('𞹴', '𞹷'),
    ('𞹹', '𞹼'),
    ('𞹾', '𞹾'),
    ('𞺀', '𞺉'),
    ('𞺋', '𞺛'),
    ('𞺡', '𞺣'),
    ('𞺥', '𞺩'),
    ('𞺫', '𞺻'),
    ('𠀀', '𪛟'),
    ('𪜀', '𫜹'),
    ('𫝀', '𫠝'),
    ('𫠠', '𬺡'),
    ('𬺰', '𮯠'),
    ('𮯰', '𮹝'),
    ('\u{2f800}', '\u{2fa1d}'), // CJK compatibility ideographs supplement
    ('𰀀', '𱍊'),
    ('𱍐', '𲎯'),
];
/// Inclusive `(first, last)` code-point ranges for Unicode general category
/// `N` (Number), Unicode 16.0.0.
///
/// The ranges are strictly ascending and never overlap, so a membership test
/// can binary-search the slice.
///
/// The fullwidth digit range is written with `\u{…}` escapes: as literal
/// characters it had been width-folded into a second `('0', '9')` row, which
/// dropped U+FF10..=U+FF19 from the table and broke the ascending order.
pub const NUMBER: &'static [(char, char)] = &[
    ('0', '9'),
    ('²', '³'),
    ('¹', '¹'),
    ('¼', '¾'),
    ('٠', '٩'),
    ('۰', '۹'),
    ('߀', '߉'),
    ('०', '९'),
    ('০', '৯'),
    ('৴', '৹'),
    ('੦', '੯'),
    ('૦', '૯'),
    ('୦', '୯'),
    ('୲', '୷'),
    ('௦', '௲'),
    ('౦', '౯'),
    ('౸', '౾'),
    ('೦', '೯'),
    ('൘', '൞'),
    ('൦', '൸'),
    ('෦', '෯'),
    ('๐', '๙'),
    ('໐', '໙'),
    ('༠', '༳'),
    ('၀', '၉'),
    ('႐', '႙'),
    ('፩', '፼'),
    ('ᛮ', 'ᛰ'),
    ('០', '៩'),
    ('៰', '៹'),
    ('᠐', '᠙'),
    ('᥆', '᥏'),
    ('᧐', '᧚'),
    ('᪀', '᪉'),
    ('᪐', '᪙'),
    ('᭐', '᭙'),
    ('᮰', '᮹'),
    ('᱀', '᱉'),
    ('᱐', '᱙'),
    ('⁰', '⁰'),
    ('⁴', '⁹'),
    ('₀', '₉'),
    ('⅐', 'ↂ'),
    ('ↅ', '↉'),
    ('①', '⒛'),
    ('⓪', '⓿'),
    ('❶', '➓'),
    ('⳽', '⳽'),
    ('〇', '〇'),
    ('〡', '〩'),
    ('〸', '〺'),
    ('㆒', '㆕'),
    ('㈠', '㈩'),
    ('㉈', '㉏'),
    ('㉑', '㉟'),
    ('㊀', '㊉'),
    ('㊱', '㊿'),
    ('꘠', '꘩'),
    ('ꛦ', 'ꛯ'),
    ('꠰', '꠵'),
    ('꣐', '꣙'),
    ('꤀', '꤉'),
    ('꧐', '꧙'),
    ('꧰', '꧹'),
    ('꩐', '꩙'),
    ('꯰', '꯹'),
    ('\u{ff10}', '\u{ff19}'), // FULLWIDTH DIGIT ZERO..NINE
    ('𐄇', '𐄳'),
    ('𐅀', '𐅸'),
    ('𐆊', '𐆋'),
    ('𐋡', '𐋻'),
    ('𐌠', '𐌣'),
    ('𐍁', '𐍁'),
    ('𐍊', '𐍊'),
    ('𐏑', '𐏕'),
    ('𐒠', '𐒩'),
    ('𐡘', '𐡟'),
    ('𐡹', '𐡿'),
    ('𐢧', '𐢯'),
    ('𐣻', '𐣿'),
    ('𐤖', '𐤛'),
    ('𐦼', '𐦽'),
    ('𐧀', '𐧏'),
    ('𐧒', '𐧿'),
    ('𐩀', '𐩈'),
    ('𐩽', '𐩾'),
    ('𐪝', '𐪟'),
    ('𐫫', '𐫯'),
    ('𐭘', '𐭟'),
    ('𐭸', '𐭿'),
    ('𐮩', '𐮯'),
    ('𐳺', '𐳿'),
    ('𐴰', '𐴹'),
    ('𐵀', '𐵉'),
    ('𐹠', '𐹾'),
    ('𐼝', '𐼦'),
    ('𐽑', '𐽔'),
    ('𐿅', '𐿋'),
    ('𑁒', '𑁯'),
    ('𑃰', '𑃹'),
    ('𑄶', '𑄿'),
    ('𑇐', '𑇙'),
    ('𑇡', '𑇴'),
    ('𑋰', '𑋹'),
    ('𑑐', '𑑙'),
    ('𑓐', '𑓙'),
    ('𑙐', '𑙙'),
    ('𑛀', '𑛉'),
    ('𑛐', '𑛣'),
    ('𑜰', '𑜻'),
    ('𑣠', '𑣲'),
    ('𑥐', '𑥙'),
    ('𑯰', '𑯹'),
    ('𑱐', '𑱬'),
    ('𑵐', '𑵙'),
    ('𑶠', '𑶩'),
    ('𑽐', '𑽙'),
    ('𑿀', '𑿔'),
    ('𒐀', '𒑮'),
    ('𖄰', '𖄹'),
    ('𖩠', '𖩩'),
    ('𖫀', '𖫉'),
    ('𖭐', '𖭙'),
    ('𖭛', '𖭡'),
    ('𖵰', '𖵹'),
    ('𖺀', '𖺖'),
    ('𜳰', '𜳹'),
    ('𝋀', '𝋓'),
    ('𝋠', '𝋳'),
    ('𝍠', '𝍸'),
    ('𝟎', '𝟿'),
    ('𞅀', '𞅉'),
    ('𞋰', '𞋹'),
    ('𞓰', '𞓹'),
    ('𞗱', '𞗺'),
    ('𞣇', '𞣏'),
    ('𞥐', '𞥙'),
    ('𞱱', '𞲫'),
    ('𞲭', '𞲯'),
    ('𞲱', '𞲴'),
    ('𞴁', '𞴭'),
    ('𞴯', '𞴽'),
    ('🄀', '🄌'),
    ('🯰', '🯹'),
];
/// Inclusive `(first, last)` code-point ranges of the characters treated as
/// Unicode white space, in ascending order.
///
/// Every endpoint is written as a `\u{…}` escape so that no entry depends on
/// an invisible literal character surviving editors or copy-paste.
pub const WHITE_SPACE: &'static [(char, char)] = &[
    ('\u{9}', '\u{d}'),       // TAB, LINE FEED, VERTICAL TAB, FORM FEED, CARRIAGE RETURN
    ('\u{20}', '\u{20}'),     // SPACE
    ('\u{85}', '\u{85}'),     // NEXT LINE
    ('\u{a0}', '\u{a0}'),     // NO-BREAK SPACE
    ('\u{1680}', '\u{1680}'), // OGHAM SPACE MARK
    ('\u{2000}', '\u{200a}'), // EN QUAD ..= HAIR SPACE
    ('\u{2028}', '\u{2029}'), // LINE SEPARATOR, PARAGRAPH SEPARATOR
    ('\u{202f}', '\u{202f}'), // NARROW NO-BREAK SPACE
    ('\u{205f}', '\u{205f}'), // MEDIUM MATHEMATICAL SPACE
    ('\u{3000}', '\u{3000}'), // IDEOGRAPHIC SPACE
];
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment