-
-
Save thomcc/db7ec784d6e14b32695f9d22766c50ca to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/// Run kinds stored in the 4-bit type field of each `CASE_CONV_TABLE1` entry.
///
/// The descriptions below summarize how `case_conv` interprets each kind;
/// "data" is the 12-bit per-run payload assembled from both lookup tables.
mod run_type {
    /// Uppercasing only: offset into the run whose table index is `data`.
    pub const U: u32 = 0;
    /// Lowercasing only: offset into the run whose table index is `data`.
    pub const L: u32 = 1;
    /// Like `U`, but also applied when case folding.
    pub const UF: u32 = 2;
    /// Like `L`, but also applied when case folding.
    pub const LF: u32 = 3;
    /// Alternating pairs: even offsets lower to the next code point, odd
    /// offsets upper to the previous one.
    pub const UL: u32 = 4;
    /// Three-code-point group whose middle member maps to either neighbour.
    pub const LSU: u32 = 5;
    /// Uppercase is two chars (a table char followed by U+0399); lowercase and
    /// folding yield a single table char.
    pub const U2L_399_EXT2: u32 = 6;
    /// Uppercase is `data`; folding is `data + 0x20`; lowercasing is a no-op.
    pub const UF_D20: u32 = 7;
    /// Uppercase is `CASE_CONV_EXT[data]`; folding is that value plus one;
    /// lowercasing is a no-op.
    pub const UF_D1_EXT: u32 = 8;
    /// Uppercasing only: result is `CASE_CONV_EXT[data]`.
    pub const U_EXT: u32 = 9;
    /// Lowercasing and folding: result is `CASE_CONV_EXT[data]`.
    pub const LF_EXT: u32 = 10;
    /// Uppercasing only: two-char result taken from `CASE_CONV_EXT`.
    pub const U_EXT2: u32 = 11;
    /// Lowercasing only: two-char result taken from `CASE_CONV_EXT`.
    pub const L_EXT2: u32 = 12;
    /// Uppercasing only: three-char result taken from `CASE_CONV_EXT`.
    pub const U_EXT3: u32 = 13;
}
/// Which case mapping `case_conv` should perform.
///
/// The discriminant values are significant: `case_conv` casts the variant to an
/// integer and compares it against `0`, `1` and `2`, so they are spelled out
/// explicitly here.
#[repr(u8)]
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
enum CaseConv {
    /// Map to uppercase.
    Upper = 0,
    /// Map to lowercase.
    Lower = 1,
    /// Apply case folding (always a single-char result in this implementation).
    Fold = 2,
}
fn case_conv(c: char, conv_ty: CaseConv) -> ([char; 3], u8) { | |
if c.is_ascii() { | |
return if conv_ty == CaseConv::Upper { | |
([c.to_ascii_uppercase(), '\0', '\0'], 1) | |
} else { | |
([c.to_ascii_lowercase(), '\0', '\0'], 1) | |
}; | |
} | |
debug_assert_eq!(CASE_CONV_TABLE2.len(), CASE_CONV_TABLE1.len()); | |
let conv_type = conv_ty as u32; | |
let is_lower = (conv_ty != CaseConv::Upper) as u32; | |
let mut idx_min = 0usize; | |
let mut idx_max = CASE_CONV_TABLE1.len() - 1; | |
let mut c = c as u32; | |
while idx_min <= idx_max { | |
debug_assert!(idx_max < CASE_CONV_TABLE1.len()); | |
debug_assert!(CASE_CONV_TABLE1.len() < usize::MAX / 2); | |
let idx = (idx_max + idx_min) / 2; | |
debug_assert!(idx < CASE_CONV_TABLE1.len()); | |
let v = CASE_CONV_TABLE1[idx]; | |
let code = v >> (32 - 17); | |
let len = (v >> (32 - 17 - 7)) & 0x7f; | |
if c < code { | |
idx_max = idx - 1; | |
continue; | |
} else if c >= code + len { | |
idx_min = idx + 1; | |
continue; | |
} else { | |
let ty = (v >> (32 - 17 - 7 - 4)) & 0xf; | |
let data = ((v & 0xf) << 8) | (CASE_CONV_TABLE2[idx] as u32); | |
match ty { | |
run_type::U | run_type::L | run_type::UF | run_type::LF => { | |
if (conv_type == (ty & 1)) || (ty >= run_type::UF && conv_type == 2) { | |
c = c - code + (CASE_CONV_TABLE1[data as usize] >> (32 - 17)); | |
} | |
} | |
run_type::UL => { | |
let a = c - code; | |
if (a & 1) != (1 - is_lower) { | |
break; | |
} | |
c = (a ^ 1) + code; | |
} | |
run_type::LSU => { | |
let a = c - code; | |
if a == 1 { | |
c = c.wrapping_add((2 * is_lower).wrapping_sub(1)); | |
} else if a == (1 - is_lower) * 2 { | |
c = c.wrapping_add((2 * is_lower).wrapping_sub(1).wrapping_mul(2)); | |
} | |
} | |
run_type::U2L_399_EXT2 => { | |
if is_lower == 0 { | |
let r = c - code + (CASE_CONV_EXT[(data >> 6) as usize] as u32); | |
return ([unsafe { chr(r) }, '\u{399}', '\0'], 2); | |
} else { | |
c = c - code + (CASE_CONV_EXT[(data & 0x3f) as usize] as u32); | |
} | |
} | |
run_type::UF_D20 => { | |
if conv_type == 1 { | |
break; | |
} | |
c = (data as u32) + ((conv_type == 2) as u32 * 0x20); | |
} | |
run_type::UF_D1_EXT => { | |
if conv_type == 1 { | |
break; | |
} | |
c = (CASE_CONV_EXT[data as usize] as u32) + ((conv_type == 2) as u32); | |
} | |
run_type::U_EXT | run_type::LF_EXT => { | |
if is_lower != ty.wrapping_sub(run_type::U_EXT) { | |
break; | |
} | |
c = CASE_CONV_EXT[data as usize] as u32; | |
} | |
run_type::U_EXT2 | run_type::L_EXT2 => { | |
if conv_type != ty.wrapping_sub(run_type::U_EXT2) { | |
break; | |
} | |
let r0 = c - code + (CASE_CONV_EXT[(data >> 6) as usize] as u32); | |
let r1 = CASE_CONV_EXT[(data & 0x3f) as usize] as u32; | |
return unsafe { ([chr(r0), chr(r1), '\0'], 2) }; | |
} | |
run_type::U_EXT3 => { | |
if conv_ty != CaseConv::Upper { | |
break; | |
} | |
let r0 = CASE_CONV_EXT[data as usize >> 8] as u32; | |
let r1 = CASE_CONV_EXT[(data as usize >> 4) & 0xf] as u32; | |
let r2 = CASE_CONV_EXT[data as usize & 0xf] as u32; | |
return unsafe { ([chr(r0), chr(r1), chr(r2)], 3) }; | |
} | |
ty => { | |
debug_assert!(false, "invalid: {}", ty); | |
} | |
} | |
break; | |
} | |
} | |
unsafe { ([chr(c), '\0', '\0'], 1) } | |
} | |
/// Converts a raw code point to `char` without a release-mode validity check.
///
/// In debug builds the value is validated and an invalid one triggers a panic.
///
/// # Safety
///
/// `c` must be a valid Unicode scalar value, i.e. `core::char::from_u32(c)`
/// must return `Some`.
#[inline]
unsafe fn chr(c: u32) -> char {
    debug_assert!(core::char::from_u32(c).is_some(), "bad char: {:#x?}", c);
    // SAFETY: the caller guarantees `c` is a valid Unicode scalar value.
    unsafe { core::char::from_u32_unchecked(c) }
}
/// Primary case-conversion table: one packed `u32` per run of code points,
/// sorted by starting code point so `case_conv` can binary-search it.
///
/// Layout of each entry, as decoded by `case_conv`:
///
/// * bits 31..=15 — first code point of the run (17 bits)
/// * bits 14..=8  — number of code points in the run (7 bits)
/// * bits 7..=4   — run kind, one of the `run_type` constants (4 bits)
/// * bits 3..=0   — high nibble of the run's 12-bit data; the low byte lives
///   at the same index in `CASE_CONV_TABLE2`
static CASE_CONV_TABLE1: [u32; 361] = [
    0x00209a30, 0x00309a00, 0x005a8173, 0x00601730, 0x006c0730, 0x006f81b3, 0x00701700, 0x007c0700,
    0x007f8100, 0x00803040, 0x009801c3, 0x00988190, 0x00990640, 0x009c9040, 0x00a481b4, 0x00a52e40,
    0x00bc0130, 0x00bc8640, 0x00bf8170, 0x00c00100, 0x00c08130, 0x00c10440, 0x00c30130, 0x00c38240,
    0x00c48230, 0x00c58240, 0x00c70130, 0x00c78130, 0x00c80130, 0x00c88240, 0x00c98130, 0x00ca0130,
    0x00ca8100, 0x00cb0130, 0x00cb8130, 0x00cc0240, 0x00cd0100, 0x00ce0130, 0x00ce8130, 0x00cf0100,
    0x00cf8130, 0x00d00640, 0x00d30130, 0x00d38240, 0x00d48130, 0x00d60240, 0x00d70130, 0x00d78240,
    0x00d88230, 0x00d98440, 0x00db8130, 0x00dc0240, 0x00de0240, 0x00df8100, 0x00e20350, 0x00e38350,
    0x00e50350, 0x00e69040, 0x00ee8100, 0x00ef1240, 0x00f801b4, 0x00f88350, 0x00fa0240, 0x00fb0130,
    0x00fb8130, 0x00fc2840, 0x01100130, 0x01111240, 0x011d0131, 0x011d8240, 0x011e8130, 0x011f0131,
    0x011f8201, 0x01208240, 0x01218130, 0x01220130, 0x01228130, 0x01230a40, 0x01280101, 0x01288101,
    0x01290101, 0x01298100, 0x012a0100, 0x012b0200, 0x012c8100, 0x012d8100, 0x012e0101, 0x01300100,
    0x01308101, 0x01318100, 0x01328101, 0x01330101, 0x01340100, 0x01348100, 0x01350101, 0x01358101,
    0x01360101, 0x01378100, 0x01388101, 0x01390100, 0x013a8100, 0x013e8101, 0x01400100, 0x01410101,
    0x01418100, 0x01438101, 0x01440100, 0x01448100, 0x01450200, 0x01460100, 0x01490100, 0x014e8101,
    0x014f0101, 0x01a28173, 0x01b80440, 0x01bb0240, 0x01bd8300, 0x01bf8130, 0x01c30130, 0x01c40330,
    0x01c60130, 0x01c70230, 0x01c801d0, 0x01c89130, 0x01d18930, 0x01d60100, 0x01d68300, 0x01d801d3,
    0x01d89100, 0x01e10173, 0x01e18900, 0x01e60100, 0x01e68200, 0x01e78130, 0x01e80173, 0x01e88173,
    0x01ea8173, 0x01eb0173, 0x01eb8100, 0x01ec1840, 0x01f80173, 0x01f88173, 0x01f90100, 0x01f98100,
    0x01fa01a0, 0x01fa8173, 0x01fb8240, 0x01fc8130, 0x01fd0240, 0x01fe8330, 0x02001030, 0x02082030,
    0x02182000, 0x02281000, 0x02302240, 0x02453640, 0x02600130, 0x02608e40, 0x02678100, 0x02686040,
    0x0298a630, 0x02b0a600, 0x02c381b5, 0x08502631, 0x08638131, 0x08668131, 0x08682b00, 0x087e8300,
    0x09d05011, 0x09f80610, 0x09fc0620, 0x0e400174, 0x0e408174, 0x0e410174, 0x0e418174, 0x0e420174,
    0x0e428174, 0x0e430174, 0x0e438180, 0x0e440180, 0x0e482b30, 0x0e5e8330, 0x0ebc8101, 0x0ebe8101,
    0x0ec70101, 0x0f007e40, 0x0f3f1840, 0x0f4b01b5, 0x0f4b81b6, 0x0f4c01b6, 0x0f4c81b6, 0x0f4d01b7,
    0x0f4d8180, 0x0f4f0130, 0x0f506040, 0x0f800800, 0x0f840830, 0x0f880600, 0x0f8c0630, 0x0f900800,
    0x0f940830, 0x0f980800, 0x0f9c0830, 0x0fa00600, 0x0fa40630, 0x0fa801b0, 0x0fa88100, 0x0fa901d3,
    0x0fa98100, 0x0faa01d3, 0x0faa8100, 0x0fab01d3, 0x0fab8100, 0x0fac8130, 0x0fad8130, 0x0fae8130,
    0x0faf8130, 0x0fb00800, 0x0fb40830, 0x0fb80200, 0x0fb90400, 0x0fbb0200, 0x0fbc0201, 0x0fbd0201,
    0x0fbe0201, 0x0fc008b7, 0x0fc40867, 0x0fc808b8, 0x0fcc0868, 0x0fd008b8, 0x0fd40868, 0x0fd80200,
    0x0fd901b9, 0x0fd981b1, 0x0fda01b9, 0x0fdb01b1, 0x0fdb81d7, 0x0fdc0230, 0x0fdd0230, 0x0fde0161,
    0x0fdf0173, 0x0fe101b9, 0x0fe181b2, 0x0fe201ba, 0x0fe301b2, 0x0fe381d8, 0x0fe40430, 0x0fe60162,
    0x0fe80200, 0x0fe901d0, 0x0fe981d0, 0x0feb01b0, 0x0feb81d0, 0x0fec0230, 0x0fed0230, 0x0ff00201,
    0x0ff101d3, 0x0ff181d3, 0x0ff201ba, 0x0ff28101, 0x0ff301b0, 0x0ff381d3, 0x0ff40230, 0x0ff50230,
    0x0ff60131, 0x0ff901ba, 0x0ff981b2, 0x0ffa01bb, 0x0ffb01b2, 0x0ffb81d9, 0x0ffc0230, 0x0ffd0230,
    0x0ffe0162, 0x109301a0, 0x109501a0, 0x109581a0, 0x10990131, 0x10a70101, 0x10b01031, 0x10b81001,
    0x10c18240, 0x125b1a31, 0x12681a01, 0x16002f31, 0x16182f01, 0x16300240, 0x16310130, 0x16318130,
    0x16320130, 0x16328100, 0x16330100, 0x16338640, 0x16368130, 0x16370130, 0x16378130, 0x16380130,
    0x16390240, 0x163a8240, 0x163f0230, 0x16406440, 0x16758440, 0x16790240, 0x16802600, 0x16938100,
    0x16968100, 0x53202e40, 0x53401c40, 0x53910e40, 0x53993e40, 0x53bc8440, 0x53be8130, 0x53bf0a40,
    0x53c58240, 0x53c68130, 0x53c80440, 0x53ca0101, 0x53cb1440, 0x53d50130, 0x53d58130, 0x53d60130,
    0x53d68130, 0x53d70130, 0x53d80130, 0x53d88130, 0x53d90130, 0x53d98131, 0x53da0c40, 0x53e10240,
    0x53e20131, 0x53e28130, 0x53e30130, 0x53e38440, 0x53fa8240, 0x55a98101, 0x55b85020, 0x7d8001b2,
    0x7d8081b2, 0x7d8101b2, 0x7d8181da, 0x7d8201da, 0x7d8281b3, 0x7d8301b3, 0x7d8981bb, 0x7d8a01bb,
    0x7d8a81bb, 0x7d8b01bc, 0x7d8b81bb, 0x7f909a31, 0x7fa09a01, 0x82002831, 0x82142801, 0x82582431,
    0x826c2401, 0x86403331, 0x86603301, 0x8c502031, 0x8c602001, 0xb7202031, 0xb7302001, 0xf4802231,
    0xf4912201,
];
/// Companion to `CASE_CONV_TABLE1`: entry `i` holds the low 8 bits of the
/// 12-bit data value for run `i` (the high 4 bits are the low nibble of
/// `CASE_CONV_TABLE1[i]`). Both tables must have the same length.
static CASE_CONV_TABLE2: [u8; 361] = [
    0x01, 0x00, 0x9c, 0x06, 0x07, 0x4d, 0x03, 0x04, 0x10, 0x00, 0x8f, 0x0b, 0x00, 0x00, 0x11, 0x00,
    0x08, 0x00, 0x53, 0x4a, 0x51, 0x00, 0x52, 0x00, 0x53, 0x00, 0x3a, 0x54, 0x55, 0x00, 0x57, 0x59,
    0x3f, 0x5d, 0x5c, 0x00, 0x46, 0x61, 0x63, 0x42, 0x64, 0x00, 0x66, 0x00, 0x68, 0x00, 0x6a, 0x00,
    0x6c, 0x00, 0x6e, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x00, 0x1a, 0x00, 0x93, 0x00, 0x00, 0x20,
    0x35, 0x00, 0x27, 0x00, 0x21, 0x00, 0x24, 0x22, 0x2a, 0x00, 0x13, 0x6b, 0x6d, 0x00, 0x26, 0x24,
    0x27, 0x14, 0x16, 0x18, 0x1b, 0x1c, 0x3e, 0x1e, 0x3f, 0x1f, 0x39, 0x3d, 0x22, 0x21, 0x41, 0x1e,
    0x40, 0x25, 0x25, 0x26, 0x28, 0x20, 0x2a, 0x49, 0x2c, 0x43, 0x2e, 0x4b, 0x30, 0x4c, 0x32, 0x44,
    0x42, 0x99, 0x00, 0x00, 0x95, 0x8f, 0x7d, 0x7e, 0x83, 0x84, 0x12, 0x80, 0x82, 0x76, 0x77, 0x12,
    0x7b, 0xa3, 0x7c, 0x78, 0x79, 0x8a, 0x92, 0x98, 0xa6, 0xa0, 0x85, 0x00, 0x9a, 0xa1, 0x93, 0x75,
    0x33, 0x95, 0x00, 0x8e, 0x00, 0x74, 0x99, 0x98, 0x97, 0x96, 0x00, 0x00, 0x9e, 0x00, 0x9c, 0x00,
    0xa1, 0xa0, 0x15, 0x2e, 0x2f, 0x30, 0xb4, 0xb5, 0x4e, 0xaa, 0xa9, 0x12, 0x14, 0x1e, 0x21, 0x22,
    0x22, 0x2a, 0x34, 0x35, 0xa6, 0xa7, 0x36, 0x1f, 0x4a, 0x00, 0x00, 0x97, 0x01, 0x5a, 0xda, 0x1d,
    0x36, 0x05, 0x00, 0xc4, 0xc3, 0xc6, 0xc5, 0xc8, 0xc7, 0xca, 0xc9, 0xcc, 0xcb, 0xc4, 0xd5, 0x45,
    0xd6, 0x42, 0xd7, 0x46, 0xd8, 0xce, 0xd0, 0xd2, 0xd4, 0xda, 0xd9, 0xee, 0xf6, 0xfe, 0x0e, 0x07,
    0x0f, 0x80, 0x9f, 0x00, 0x21, 0x80, 0xa3, 0xed, 0x00, 0xc0, 0x40, 0xc6, 0x60, 0xe7, 0xdb, 0xe6,
    0x99, 0xc0, 0x00, 0x00, 0x06, 0x60, 0xdc, 0x29, 0xfd, 0x15, 0x12, 0x06, 0x16, 0xf8, 0xdd, 0x06,
    0x15, 0x12, 0x84, 0x08, 0xc6, 0x16, 0xff, 0xdf, 0x03, 0xc0, 0x40, 0x00, 0x46, 0x60, 0xde, 0xe0,
    0x6d, 0x37, 0x38, 0x39, 0x15, 0x14, 0x17, 0x16, 0x00, 0x1a, 0x19, 0x1c, 0x1b, 0x00, 0x5f, 0xb7,
    0x65, 0x44, 0x47, 0x00, 0x4f, 0x62, 0x4e, 0x50, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, 0xa3, 0xa4,
    0xa5, 0x00, 0x00, 0x00, 0x00, 0x00, 0xb6, 0x00, 0x00, 0x5a, 0x00, 0x48, 0x00, 0x5b, 0x56, 0x58,
    0x60, 0x5e, 0x70, 0x69, 0x6f, 0x4d, 0x00, 0x00, 0x3b, 0x67, 0xb8, 0x00, 0x00, 0x45, 0xa8, 0x8a,
    0x8b, 0x8c, 0xab, 0xac, 0x58, 0x58, 0xaf, 0x94, 0xb0, 0x6f, 0xb2, 0x5c, 0x5b, 0x5e, 0x5d, 0x60,
    0x5f, 0x62, 0x61, 0x64, 0x63, 0x66, 0x65, 0x68, 0x67,
];
/// Code points used by the `*_EXT*` run kinds. `case_conv` indexes this table
/// with values unpacked from a run's data (as one 12-bit index, two 6-bit
/// indices, or three 4-bit indices depending on the run kind).
static CASE_CONV_EXT: [u16; 58] = [
    0x0399, 0x0308, 0x0301, 0x03a5, 0x0313, 0x0300, 0x0342, 0x0391, 0x0397, 0x03a9, 0x0046, 0x0049,
    0x004c, 0x0053, 0x0069, 0x0307, 0x02bc, 0x004e, 0x004a, 0x030c, 0x0535, 0x0552, 0x0048, 0x0331,
    0x0054, 0x0057, 0x030a, 0x0059, 0x0041, 0x02be, 0x1f08, 0x1f80, 0x1f28, 0x1f90, 0x1f68, 0x1fa0,
    0x1fba, 0x0386, 0x1fb3, 0x1fca, 0x0389, 0x1fc3, 0x03a1, 0x1ffa, 0x038f, 0x1ff3, 0x0544, 0x0546,
    0x053b, 0x054e, 0x053d, 0x03b8, 0x0462, 0xa64a, 0x1e60, 0x03c9, 0x006b, 0x00e5,
];
/// Return an iterator that uppercases its input. | |
#[inline] | |
pub fn to_uppercase(c: char) -> CaseMapIter { | |
if c.is_ascii() { | |
CaseMapIter::single(c.to_ascii_uppercase()) | |
} else { | |
CaseMapIter::new(case_conv(c, CaseConv::Upper)) | |
} | |
} | |
/// Return an iterator that lowercases its input. | |
#[inline] | |
pub fn to_lowercase(c: char) -> CaseMapIter { | |
if c.is_ascii() { | |
CaseMapIter::single(c.to_ascii_lowercase()) | |
} else { | |
CaseMapIter::new(case_conv(c, CaseConv::Lower)) | |
} | |
} | |
/// Return an iterator that performs simple case folding on its input. | |
#[inline] | |
pub fn case_fold(c: char) -> CaseMapIter { | |
if c.is_ascii() { | |
CaseMapIter::single(c.to_ascii_lowercase()) | |
} else { | |
CaseMapIter::new(case_conv(c, CaseConv::Fold)) | |
} | |
} | |
/// Case conversion iterator. Produced by `to_lowercase`, `to_uppercase`, and
/// `case_fold`.
///
/// Yields the one to three chars of a conversion result in order. It knows its
/// exact remaining length, can be iterated from the back, and keeps returning
/// `None` once exhausted.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct CaseMapIter {
    /// Indices into `buf` that have not been yielded yet; `end` is at most 3.
    range: core::ops::Range<u8>,
    /// Conversion result; slots at or past the original length are `'\0'`.
    buf: [char; 3],
}

impl CaseMapIter {
    /// Builds an iterator that yields exactly `v`.
    #[inline]
    fn single(v: char) -> Self {
        Self { range: 0..1, buf: [v, '\0', '\0'] }
    }

    /// Builds an iterator from a `(buffer, length)` pair as returned by
    /// `case_conv`. The length must not exceed 3.
    #[inline]
    fn new(v: ([char; 3], u8)) -> Self {
        debug_assert!(v.1 <= 3);
        Self { range: 0..v.1, buf: v.0 }
    }
}

impl Iterator for CaseMapIter {
    type Item = char;

    #[inline]
    fn next(&mut self) -> Option<Self::Item> {
        debug_assert!(self.range.end <= 3);
        self.range.next().map(|p| self.buf[usize::from(p)])
    }

    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        self.range.size_hint()
    }

    #[inline]
    fn count(self) -> usize {
        self.range.count()
    }
}

impl DoubleEndedIterator for CaseMapIter {
    #[inline]
    fn next_back(&mut self) -> Option<Self::Item> {
        debug_assert!(self.range.end <= 3);
        self.range.next_back().map(|p| self.buf[usize::from(p)])
    }
}

// `Range<u8>` reports an exact `size_hint`, so the default `len` is correct.
impl ExactSizeIterator for CaseMapIter {}

// `Range<u8>` keeps yielding `None` once empty, and so does this wrapper.
impl core::iter::FusedIterator for CaseMapIter {}
/// Exhaustively compares `case_conv` with the standard library's
/// `char::to_lowercase` / `char::to_uppercase` for every `char`, and checks
/// that folding always produces exactly one char.
#[test]
fn test_caseconv() {
    /// Packs the chars yielded by `it` into the `([char; 3], len)` shape that
    /// `case_conv` returns. Panics if `it` yields nothing or more than three
    /// chars; `c` and `conv` are only used in the panic messages.
    #[track_caller]
    fn to_conv_result(
        c: char,
        conv: CaseConv,
        it: impl Iterator<Item = char>,
    ) -> ([char; 3], u8) {
        let mut buf = ['\0'; 3];
        let mut len = 0u8;
        for ch in it {
            assert!(
                usize::from(len) < buf.len(),
                "Conversion longer than 3 chars ({:?}): {:?}",
                conv,
                c
            );
            buf[usize::from(len)] = ch;
            len += 1;
        }
        assert!(len > 0, "Unknown conversion ({:?}): {:?}", conv, c);
        (buf, len)
    }

    for c in '\0'..=core::char::MAX {
        let got_lower = case_conv(c, CaseConv::Lower);
        let want_lower = to_conv_result(c, CaseConv::Lower, c.to_lowercase());
        assert_eq!(got_lower, want_lower, "{:?}", c);

        let got_upper = case_conv(c, CaseConv::Upper);
        let want_upper = to_conv_result(c, CaseConv::Upper, c.to_uppercase());
        assert_eq!(got_upper, want_upper, "{:?}", c);

        // TODO: find point of comparison. We only support simple and common
        // case foldings, so the only thing checked is the result length.
        let got_fold = case_conv(c, CaseConv::Fold);
        assert_eq!(got_fold.1, 1, "{:?}", c);
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment