Skip to content

Instantly share code, notes, and snippets.

@thomcc
Created April 11, 2021 00:00
Show Gist options
  • Save thomcc/db7ec784d6e14b32695f9d22766c50ca to your computer and use it in GitHub Desktop.
Save thomcc/db7ec784d6e14b32695f9d22766c50ca to your computer and use it in GitHub Desktop.
/// Run-type tags. Each entry of `CASE_CONV_TABLE1` carries one of these values
/// in a 4-bit field, and `case_conv` dispatches on it to decide how a code
/// point inside that run is converted. "Unchanged" below means `case_conv`
/// returns the input character as-is for that conversion kind.
mod run_type {
/// Offset mapping into another run (whose start is read from
/// `CASE_CONV_TABLE1[data]`), applied when uppercasing only.
pub const U: u32 = 0;
/// Same offset mapping as `U`, applied when lowercasing only.
pub const L: u32 = 1;
/// Same offset mapping as `U`, applied when uppercasing or case folding.
pub const UF: u32 = 2;
/// Same offset mapping as `U`, applied when lowercasing or case folding.
pub const LF: u32 = 3;
/// Alternating pairs: the lowest bit of the offset within the run is toggled.
/// Odd offsets change when uppercasing, even offsets when lowercasing/folding.
pub const UL: u32 = 4;
/// Groups of three code points: offset 1 moves by one and an end of the group
/// moves by two, towards offset 0 when uppercasing and towards offset 2 when
/// lowercasing/folding.
pub const LSU: u32 = 5;
/// Uppercasing yields two characters (a `CASE_CONV_EXT`-based character
/// followed by U+0399); lowercasing/folding yields one `CASE_CONV_EXT`-based
/// character.
pub const U2L_399_EXT2: u32 = 6;
/// Uppercasing yields `data`; folding yields `data + 0x20`; lowercasing is
/// unchanged.
pub const UF_D20: u32 = 7;
/// Uppercasing yields `CASE_CONV_EXT[data]`; folding yields that value plus
/// one; lowercasing is unchanged.
pub const UF_D1_EXT: u32 = 8;
/// Uppercasing yields the single character `CASE_CONV_EXT[data]`.
pub const U_EXT: u32 = 9;
/// Lowercasing or folding yields the single character `CASE_CONV_EXT[data]`.
pub const LF_EXT: u32 = 10;
/// Uppercasing yields two characters taken from `CASE_CONV_EXT`.
pub const U_EXT2: u32 = 11;
/// Lowercasing yields two characters taken from `CASE_CONV_EXT` (folding is
/// unchanged).
pub const L_EXT2: u32 = 12;
/// Uppercasing yields three characters taken from `CASE_CONV_EXT`.
pub const U_EXT3: u32 = 13;
}
/// Which case mapping `case_conv` should perform.
///
/// The numeric values are significant: `case_conv` casts the variant to `u32`
/// and compares it against bits of the run type stored in the lookup tables,
/// so the discriminants are spelled out rather than left implicit.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
#[repr(u8)]
enum CaseConv {
    /// Uppercase mapping.
    Upper = 0,
    /// Lowercase mapping.
    Lower = 1,
    /// Simple case folding.
    Fold = 2,
}
/// Table-driven case conversion of a single `char`.
///
/// Returns `(buf, len)`: the first `len` entries of `buf` hold the converted
/// character(s) and the remaining entries are `'\0'`. When no mapping applies
/// to `c` for the requested conversion, `c` itself is returned with `len == 1`.
///
/// ASCII input is answered directly with `to_ascii_uppercase` (for
/// `CaseConv::Upper`) or `to_ascii_lowercase` (for `Lower` and `Fold`). Any
/// other input is located by binary search over the runs packed into
/// `CASE_CONV_TABLE1` and then converted according to that run's `run_type`.
fn case_conv(c: char, conv_ty: CaseConv) -> ([char; 3], u8) {
// ASCII fast path: no table lookup needed.
if c.is_ascii() {
return if conv_ty == CaseConv::Upper {
([c.to_ascii_uppercase(), '\0', '\0'], 1)
} else {
([c.to_ascii_lowercase(), '\0', '\0'], 1)
};
}
// The two tables are indexed in parallel (same `idx`), so they must match in length.
debug_assert_eq!(CASE_CONV_TABLE2.len(), CASE_CONV_TABLE1.len());
// Numeric conversion kind, following the declaration order of `CaseConv`:
// Upper = 0, Lower = 1, Fold = 2.
let conv_type = conv_ty as u32;
// 1 for Lower and Fold, 0 for Upper.
let is_lower = (conv_ty != CaseConv::Upper) as u32;
// Binary search over the closed index interval [idx_min, idx_max].
let mut idx_min = 0usize;
let mut idx_max = CASE_CONV_TABLE1.len() - 1;
// From here on `c` is the code point as an integer; it is overwritten with
// the converted code point when a single-character mapping applies.
let mut c = c as u32;
while idx_min <= idx_max {
debug_assert!(idx_max < CASE_CONV_TABLE1.len());
debug_assert!(CASE_CONV_TABLE1.len() < usize::MAX / 2);
let idx = (idx_max + idx_min) / 2;
debug_assert!(idx < CASE_CONV_TABLE1.len());
let v = CASE_CONV_TABLE1[idx];
// Top 17 bits: first code point of the run.
let code = v >> (32 - 17);
// Next 7 bits: number of code points in the run.
let len = (v >> (32 - 17 - 7)) & 0x7f;
if c < code {
// NOTE(review): `idx - 1` would underflow for idx == 0. That looks
// unreachable here: ASCII returned above, so c >= 0x80, while the first
// table entry starts at 0x41, so `c < code` cannot hold at index 0.
idx_max = idx - 1;
continue;
} else if c >= code + len {
idx_min = idx + 1;
continue;
} else {
// `c` lies inside this run: decode the 4-bit run type and the 12-bit
// payload (high 4 bits from TABLE1, low 8 bits from TABLE2).
let ty = (v >> (32 - 17 - 7 - 4)) & 0xf;
let data = ((v & 0xf) << 8) | (CASE_CONV_TABLE2[idx] as u32);
// Inside the arms below, `break` leaves the search loop with `c` unchanged,
// i.e. the character has no mapping for the requested conversion.
match ty {
run_type::U | run_type::L | run_type::UF | run_type::LF => {
// Bit 0 of `ty` selects upper (0) vs lower (1); the `*F` variants also
// apply to case folding.
if (conv_type == (ty & 1)) || (ty >= run_type::UF && conv_type == 2) {
// `data` indexes the target run in TABLE1; keep the offset within the run.
c = c - code + (CASE_CONV_TABLE1[data as usize] >> (32 - 17));
}
}
run_type::UL => {
// Alternating pairs: only odd offsets change when uppercasing and only
// even offsets change when lowercasing/folding.
let a = c - code;
if (a & 1) != (1 - is_lower) {
break;
}
// Move to the other member of the pair.
c = (a ^ 1) + code;
}
run_type::LSU => {
// `(2 * is_lower).wrapping_sub(1)` is +1 when lowering and u32::MAX when
// uppercasing, which acts as -1 under `wrapping_add`.
let a = c - code;
if a == 1 {
c = c.wrapping_add((2 * is_lower).wrapping_sub(1));
} else if a == (1 - is_lower) * 2 {
// Offset 0 when lowering, offset 2 when uppercasing: move by two.
c = c.wrapping_add((2 * is_lower).wrapping_sub(1).wrapping_mul(2));
}
}
run_type::U2L_399_EXT2 => {
// `data` packs two 6-bit indices into CASE_CONV_EXT.
if is_lower == 0 {
// Uppercase: two characters, the second always U+0399.
let r = c - code + (CASE_CONV_EXT[(data >> 6) as usize] as u32);
// NOTE(review): relies on the tables only producing valid scalar values.
return ([unsafe { chr(r) }, '\u{399}', '\0'], 2);
} else {
c = c - code + (CASE_CONV_EXT[(data & 0x3f) as usize] as u32);
}
}
run_type::UF_D20 => {
// No lowercase mapping for this run type.
if conv_type == 1 {
break;
}
// Upper: `data` itself; Fold: `data + 0x20`.
c = (data as u32) + ((conv_type == 2) as u32 * 0x20);
}
run_type::UF_D1_EXT => {
// No lowercase mapping for this run type.
if conv_type == 1 {
break;
}
// Upper: the EXT entry; Fold: the EXT entry plus one.
c = (CASE_CONV_EXT[data as usize] as u32) + ((conv_type == 2) as u32);
}
run_type::U_EXT | run_type::LF_EXT => {
// `ty - U_EXT` is 0 for U_EXT (upper only) and 1 for LF_EXT (lower/fold).
if is_lower != ty.wrapping_sub(run_type::U_EXT) {
break;
}
c = CASE_CONV_EXT[data as usize] as u32;
}
run_type::U_EXT2 | run_type::L_EXT2 => {
// `ty - U_EXT2` is 0 for U_EXT2 (upper) and 1 for L_EXT2 (lower); Fold (2)
// never matches, so folding leaves the character unchanged.
if conv_type != ty.wrapping_sub(run_type::U_EXT2) {
break;
}
// Two-character result from two 6-bit EXT indices; only the first keeps
// the offset within the run.
let r0 = c - code + (CASE_CONV_EXT[(data >> 6) as usize] as u32);
let r1 = CASE_CONV_EXT[(data & 0x3f) as usize] as u32;
// NOTE(review): relies on the tables only producing valid scalar values.
return unsafe { ([chr(r0), chr(r1), '\0'], 2) };
}
run_type::U_EXT3 => {
// Three-character expansion exists for uppercasing only.
if conv_ty != CaseConv::Upper {
break;
}
// `data` packs three 4-bit indices into CASE_CONV_EXT.
let r0 = CASE_CONV_EXT[data as usize >> 8] as u32;
let r1 = CASE_CONV_EXT[(data as usize >> 4) & 0xf] as u32;
let r2 = CASE_CONV_EXT[data as usize & 0xf] as u32;
// NOTE(review): relies on the tables only producing valid scalar values.
return unsafe { ([chr(r0), chr(r1), chr(r2)], 3) };
}
ty => {
// Types 14 and 15 are not defined in `run_type`; in release builds this
// arm does nothing and `c` is returned unchanged.
debug_assert!(false, "invalid: {}", ty);
}
}
// The containing run was found and handled; stop searching.
break;
}
}
// Single-character result: either the converted code point or, if no run or
// mapping applied, the original input.
unsafe { ([chr(c), '\0', '\0'], 1) }
}
/// Converts a raw code point to a `char` without a release-mode validity
/// check; debug builds assert that the value is a valid `char`.
///
/// # Safety
///
/// `c` must be a valid Unicode scalar value, i.e. `core::char::from_u32(c)`
/// must return `Some`.
#[inline]
unsafe fn chr(c: u32) -> char {
    debug_assert!(core::char::from_u32(c).is_some(), "bad char: {:#x?}", c);
    // SAFETY: the caller guarantees that `c` is a valid Unicode scalar value.
    unsafe { core::char::from_u32_unchecked(c) }
}
/// Run table that `case_conv` binary-searches; one packed `u32` per run of
/// consecutive code points. Entries must stay ordered by starting code point
/// for the search to work.
///
/// Bit layout of each entry, as decoded in `case_conv`:
/// - bits 31..=15 (17 bits): first code point of the run,
/// - bits 14..=8 (7 bits): number of code points in the run,
/// - bits 7..=4 (4 bits): run type, one of the `run_type` constants,
/// - bits 3..=0 (4 bits): high nibble of the 12-bit `data` payload; the low
///   byte is stored at the same index in `CASE_CONV_TABLE2`.
///
/// For the `U`/`L`/`UF`/`LF` run types, `data` is itself an index into this
/// table, and the top 17 bits of that entry give the start of the target run.
static CASE_CONV_TABLE1: [u32; 361] = [
0x00209a30, 0x00309a00, 0x005a8173, 0x00601730, 0x006c0730, 0x006f81b3, 0x00701700, 0x007c0700,
0x007f8100, 0x00803040, 0x009801c3, 0x00988190, 0x00990640, 0x009c9040, 0x00a481b4, 0x00a52e40,
0x00bc0130, 0x00bc8640, 0x00bf8170, 0x00c00100, 0x00c08130, 0x00c10440, 0x00c30130, 0x00c38240,
0x00c48230, 0x00c58240, 0x00c70130, 0x00c78130, 0x00c80130, 0x00c88240, 0x00c98130, 0x00ca0130,
0x00ca8100, 0x00cb0130, 0x00cb8130, 0x00cc0240, 0x00cd0100, 0x00ce0130, 0x00ce8130, 0x00cf0100,
0x00cf8130, 0x00d00640, 0x00d30130, 0x00d38240, 0x00d48130, 0x00d60240, 0x00d70130, 0x00d78240,
0x00d88230, 0x00d98440, 0x00db8130, 0x00dc0240, 0x00de0240, 0x00df8100, 0x00e20350, 0x00e38350,
0x00e50350, 0x00e69040, 0x00ee8100, 0x00ef1240, 0x00f801b4, 0x00f88350, 0x00fa0240, 0x00fb0130,
0x00fb8130, 0x00fc2840, 0x01100130, 0x01111240, 0x011d0131, 0x011d8240, 0x011e8130, 0x011f0131,
0x011f8201, 0x01208240, 0x01218130, 0x01220130, 0x01228130, 0x01230a40, 0x01280101, 0x01288101,
0x01290101, 0x01298100, 0x012a0100, 0x012b0200, 0x012c8100, 0x012d8100, 0x012e0101, 0x01300100,
0x01308101, 0x01318100, 0x01328101, 0x01330101, 0x01340100, 0x01348100, 0x01350101, 0x01358101,
0x01360101, 0x01378100, 0x01388101, 0x01390100, 0x013a8100, 0x013e8101, 0x01400100, 0x01410101,
0x01418100, 0x01438101, 0x01440100, 0x01448100, 0x01450200, 0x01460100, 0x01490100, 0x014e8101,
0x014f0101, 0x01a28173, 0x01b80440, 0x01bb0240, 0x01bd8300, 0x01bf8130, 0x01c30130, 0x01c40330,
0x01c60130, 0x01c70230, 0x01c801d0, 0x01c89130, 0x01d18930, 0x01d60100, 0x01d68300, 0x01d801d3,
0x01d89100, 0x01e10173, 0x01e18900, 0x01e60100, 0x01e68200, 0x01e78130, 0x01e80173, 0x01e88173,
0x01ea8173, 0x01eb0173, 0x01eb8100, 0x01ec1840, 0x01f80173, 0x01f88173, 0x01f90100, 0x01f98100,
0x01fa01a0, 0x01fa8173, 0x01fb8240, 0x01fc8130, 0x01fd0240, 0x01fe8330, 0x02001030, 0x02082030,
0x02182000, 0x02281000, 0x02302240, 0x02453640, 0x02600130, 0x02608e40, 0x02678100, 0x02686040,
0x0298a630, 0x02b0a600, 0x02c381b5, 0x08502631, 0x08638131, 0x08668131, 0x08682b00, 0x087e8300,
0x09d05011, 0x09f80610, 0x09fc0620, 0x0e400174, 0x0e408174, 0x0e410174, 0x0e418174, 0x0e420174,
0x0e428174, 0x0e430174, 0x0e438180, 0x0e440180, 0x0e482b30, 0x0e5e8330, 0x0ebc8101, 0x0ebe8101,
0x0ec70101, 0x0f007e40, 0x0f3f1840, 0x0f4b01b5, 0x0f4b81b6, 0x0f4c01b6, 0x0f4c81b6, 0x0f4d01b7,
0x0f4d8180, 0x0f4f0130, 0x0f506040, 0x0f800800, 0x0f840830, 0x0f880600, 0x0f8c0630, 0x0f900800,
0x0f940830, 0x0f980800, 0x0f9c0830, 0x0fa00600, 0x0fa40630, 0x0fa801b0, 0x0fa88100, 0x0fa901d3,
0x0fa98100, 0x0faa01d3, 0x0faa8100, 0x0fab01d3, 0x0fab8100, 0x0fac8130, 0x0fad8130, 0x0fae8130,
0x0faf8130, 0x0fb00800, 0x0fb40830, 0x0fb80200, 0x0fb90400, 0x0fbb0200, 0x0fbc0201, 0x0fbd0201,
0x0fbe0201, 0x0fc008b7, 0x0fc40867, 0x0fc808b8, 0x0fcc0868, 0x0fd008b8, 0x0fd40868, 0x0fd80200,
0x0fd901b9, 0x0fd981b1, 0x0fda01b9, 0x0fdb01b1, 0x0fdb81d7, 0x0fdc0230, 0x0fdd0230, 0x0fde0161,
0x0fdf0173, 0x0fe101b9, 0x0fe181b2, 0x0fe201ba, 0x0fe301b2, 0x0fe381d8, 0x0fe40430, 0x0fe60162,
0x0fe80200, 0x0fe901d0, 0x0fe981d0, 0x0feb01b0, 0x0feb81d0, 0x0fec0230, 0x0fed0230, 0x0ff00201,
0x0ff101d3, 0x0ff181d3, 0x0ff201ba, 0x0ff28101, 0x0ff301b0, 0x0ff381d3, 0x0ff40230, 0x0ff50230,
0x0ff60131, 0x0ff901ba, 0x0ff981b2, 0x0ffa01bb, 0x0ffb01b2, 0x0ffb81d9, 0x0ffc0230, 0x0ffd0230,
0x0ffe0162, 0x109301a0, 0x109501a0, 0x109581a0, 0x10990131, 0x10a70101, 0x10b01031, 0x10b81001,
0x10c18240, 0x125b1a31, 0x12681a01, 0x16002f31, 0x16182f01, 0x16300240, 0x16310130, 0x16318130,
0x16320130, 0x16328100, 0x16330100, 0x16338640, 0x16368130, 0x16370130, 0x16378130, 0x16380130,
0x16390240, 0x163a8240, 0x163f0230, 0x16406440, 0x16758440, 0x16790240, 0x16802600, 0x16938100,
0x16968100, 0x53202e40, 0x53401c40, 0x53910e40, 0x53993e40, 0x53bc8440, 0x53be8130, 0x53bf0a40,
0x53c58240, 0x53c68130, 0x53c80440, 0x53ca0101, 0x53cb1440, 0x53d50130, 0x53d58130, 0x53d60130,
0x53d68130, 0x53d70130, 0x53d80130, 0x53d88130, 0x53d90130, 0x53d98131, 0x53da0c40, 0x53e10240,
0x53e20131, 0x53e28130, 0x53e30130, 0x53e38440, 0x53fa8240, 0x55a98101, 0x55b85020, 0x7d8001b2,
0x7d8081b2, 0x7d8101b2, 0x7d8181da, 0x7d8201da, 0x7d8281b3, 0x7d8301b3, 0x7d8981bb, 0x7d8a01bb,
0x7d8a81bb, 0x7d8b01bc, 0x7d8b81bb, 0x7f909a31, 0x7fa09a01, 0x82002831, 0x82142801, 0x82582431,
0x826c2401, 0x86403331, 0x86603301, 0x8c502031, 0x8c602001, 0xb7202031, 0xb7302001, 0xf4802231,
0xf4912201,
];
/// Companion to `CASE_CONV_TABLE1`, indexed in parallel with it (both tables
/// have 361 entries). Each byte is the low 8 bits of the 12-bit `data` payload
/// of the run at the same index; `case_conv` combines it with the low nibble
/// of the `CASE_CONV_TABLE1` entry.
static CASE_CONV_TABLE2: [u8; 361] = [
0x01, 0x00, 0x9c, 0x06, 0x07, 0x4d, 0x03, 0x04, 0x10, 0x00, 0x8f, 0x0b, 0x00, 0x00, 0x11, 0x00,
0x08, 0x00, 0x53, 0x4a, 0x51, 0x00, 0x52, 0x00, 0x53, 0x00, 0x3a, 0x54, 0x55, 0x00, 0x57, 0x59,
0x3f, 0x5d, 0x5c, 0x00, 0x46, 0x61, 0x63, 0x42, 0x64, 0x00, 0x66, 0x00, 0x68, 0x00, 0x6a, 0x00,
0x6c, 0x00, 0x6e, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x00, 0x1a, 0x00, 0x93, 0x00, 0x00, 0x20,
0x35, 0x00, 0x27, 0x00, 0x21, 0x00, 0x24, 0x22, 0x2a, 0x00, 0x13, 0x6b, 0x6d, 0x00, 0x26, 0x24,
0x27, 0x14, 0x16, 0x18, 0x1b, 0x1c, 0x3e, 0x1e, 0x3f, 0x1f, 0x39, 0x3d, 0x22, 0x21, 0x41, 0x1e,
0x40, 0x25, 0x25, 0x26, 0x28, 0x20, 0x2a, 0x49, 0x2c, 0x43, 0x2e, 0x4b, 0x30, 0x4c, 0x32, 0x44,
0x42, 0x99, 0x00, 0x00, 0x95, 0x8f, 0x7d, 0x7e, 0x83, 0x84, 0x12, 0x80, 0x82, 0x76, 0x77, 0x12,
0x7b, 0xa3, 0x7c, 0x78, 0x79, 0x8a, 0x92, 0x98, 0xa6, 0xa0, 0x85, 0x00, 0x9a, 0xa1, 0x93, 0x75,
0x33, 0x95, 0x00, 0x8e, 0x00, 0x74, 0x99, 0x98, 0x97, 0x96, 0x00, 0x00, 0x9e, 0x00, 0x9c, 0x00,
0xa1, 0xa0, 0x15, 0x2e, 0x2f, 0x30, 0xb4, 0xb5, 0x4e, 0xaa, 0xa9, 0x12, 0x14, 0x1e, 0x21, 0x22,
0x22, 0x2a, 0x34, 0x35, 0xa6, 0xa7, 0x36, 0x1f, 0x4a, 0x00, 0x00, 0x97, 0x01, 0x5a, 0xda, 0x1d,
0x36, 0x05, 0x00, 0xc4, 0xc3, 0xc6, 0xc5, 0xc8, 0xc7, 0xca, 0xc9, 0xcc, 0xcb, 0xc4, 0xd5, 0x45,
0xd6, 0x42, 0xd7, 0x46, 0xd8, 0xce, 0xd0, 0xd2, 0xd4, 0xda, 0xd9, 0xee, 0xf6, 0xfe, 0x0e, 0x07,
0x0f, 0x80, 0x9f, 0x00, 0x21, 0x80, 0xa3, 0xed, 0x00, 0xc0, 0x40, 0xc6, 0x60, 0xe7, 0xdb, 0xe6,
0x99, 0xc0, 0x00, 0x00, 0x06, 0x60, 0xdc, 0x29, 0xfd, 0x15, 0x12, 0x06, 0x16, 0xf8, 0xdd, 0x06,
0x15, 0x12, 0x84, 0x08, 0xc6, 0x16, 0xff, 0xdf, 0x03, 0xc0, 0x40, 0x00, 0x46, 0x60, 0xde, 0xe0,
0x6d, 0x37, 0x38, 0x39, 0x15, 0x14, 0x17, 0x16, 0x00, 0x1a, 0x19, 0x1c, 0x1b, 0x00, 0x5f, 0xb7,
0x65, 0x44, 0x47, 0x00, 0x4f, 0x62, 0x4e, 0x50, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, 0xa3, 0xa4,
0xa5, 0x00, 0x00, 0x00, 0x00, 0x00, 0xb6, 0x00, 0x00, 0x5a, 0x00, 0x48, 0x00, 0x5b, 0x56, 0x58,
0x60, 0x5e, 0x70, 0x69, 0x6f, 0x4d, 0x00, 0x00, 0x3b, 0x67, 0xb8, 0x00, 0x00, 0x45, 0xa8, 0x8a,
0x8b, 0x8c, 0xab, 0xac, 0x58, 0x58, 0xaf, 0x94, 0xb0, 0x6f, 0xb2, 0x5c, 0x5b, 0x5e, 0x5d, 0x60,
0x5f, 0x62, 0x61, 0x64, 0x63, 0x66, 0x65, 0x68, 0x67,
];
/// Extra code-point values referenced by the `*_EXT*` run types (and
/// `U2L_399_EXT2`) in `case_conv`. Entries are widened to `u32` and used either
/// directly as a result code point or as the base that the offset within the
/// run is added to. The index comes from the run's `data` payload: the whole
/// value, two 6-bit fields, or three 4-bit fields depending on the run type.
static CASE_CONV_EXT: [u16; 58] = [
0x0399, 0x0308, 0x0301, 0x03a5, 0x0313, 0x0300, 0x0342, 0x0391, 0x0397, 0x03a9, 0x0046, 0x0049,
0x004c, 0x0053, 0x0069, 0x0307, 0x02bc, 0x004e, 0x004a, 0x030c, 0x0535, 0x0552, 0x0048, 0x0331,
0x0054, 0x0057, 0x030a, 0x0059, 0x0041, 0x02be, 0x1f08, 0x1f80, 0x1f28, 0x1f90, 0x1f68, 0x1fa0,
0x1fba, 0x0386, 0x1fb3, 0x1fca, 0x0389, 0x1fc3, 0x03a1, 0x1ffa, 0x038f, 0x1ff3, 0x0544, 0x0546,
0x053b, 0x054e, 0x053d, 0x03b8, 0x0462, 0xa64a, 0x1e60, 0x03c9, 0x006b, 0x00e5,
];
/// Returns an iterator over the uppercase mapping of `c`.
///
/// The iterator yields between one and three characters. ASCII input is
/// converted with `char::to_ascii_uppercase`; all other input goes through the
/// table-driven `case_conv` with `CaseConv::Upper`.
#[inline]
pub fn to_uppercase(c: char) -> CaseMapIter {
    if !c.is_ascii() {
        return CaseMapIter::new(case_conv(c, CaseConv::Upper));
    }
    CaseMapIter::single(c.to_ascii_uppercase())
}
/// Returns an iterator over the lowercase mapping of `c`.
///
/// The iterator yields at least one character. ASCII input is converted with
/// `char::to_ascii_lowercase`; all other input goes through the table-driven
/// `case_conv` with `CaseConv::Lower`.
#[inline]
pub fn to_lowercase(c: char) -> CaseMapIter {
    if !c.is_ascii() {
        return CaseMapIter::new(case_conv(c, CaseConv::Lower));
    }
    CaseMapIter::single(c.to_ascii_lowercase())
}
/// Returns an iterator that performs simple case folding on `c`.
///
/// ASCII input is folded with `char::to_ascii_lowercase`; all other input goes
/// through the table-driven `case_conv` with `CaseConv::Fold`, which produces
/// a single character for this conversion kind.
#[inline]
pub fn case_fold(c: char) -> CaseMapIter {
    if !c.is_ascii() {
        return CaseMapIter::new(case_conv(c, CaseConv::Fold));
    }
    CaseMapIter::single(c.to_ascii_lowercase())
}
/// Case conversion iterator. Produced by `to_lowercase`, `to_uppercase`, and
/// `case_fold`.
///
/// Yields the (at most three) characters of a case mapping in order. Like the
/// standard library's `ToUppercase`/`ToLowercase`, it knows its exact
/// remaining length, can be iterated from the back, and keeps returning
/// `None` once exhausted.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct CaseMapIter {
    /// Indices into `buf` that have not been yielded yet.
    range: core::ops::Range<u8>,
    /// Mapping result; slots past the mapping's length are `'\0'` padding.
    buf: [char; 3],
}

impl CaseMapIter {
    /// Builds an iterator that yields exactly the one character `v`.
    #[inline]
    fn single(v: char) -> Self {
        Self { range: 0..1, buf: [v, '\0', '\0'] }
    }

    /// Builds an iterator from a `(buffer, length)` pair as returned by
    /// `case_conv`; `length` must not exceed 3.
    #[inline]
    fn new(v: ([char; 3], u8)) -> Self {
        debug_assert!(v.1 <= 3);
        Self { range: 0..v.1, buf: v.0 }
    }
}

impl Iterator for CaseMapIter {
    type Item = char;

    #[inline]
    fn next(&mut self) -> Option<Self::Item> {
        debug_assert!(self.range.end <= 3);
        // The mask keeps the index within two bits; valid positions are 0..=2.
        self.range.next().map(|p| self.buf[(p as usize) & 0b11])
    }

    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        self.range.size_hint()
    }

    #[inline]
    fn count(self) -> usize {
        self.range.count()
    }
}

impl DoubleEndedIterator for CaseMapIter {
    #[inline]
    fn next_back(&mut self) -> Option<Self::Item> {
        debug_assert!(self.range.end <= 3);
        // Shares the cursor with `next`, so front and back iteration never
        // yield the same slot twice.
        self.range.next_back().map(|p| self.buf[(p as usize) & 0b11])
    }
}

// `size_hint` forwards the exact bounds of the underlying `Range<u8>`, so the
// default `len` is correct.
impl ExactSizeIterator for CaseMapIter {}

// The underlying `Range<u8>` keeps returning `None` after exhaustion.
impl core::iter::FusedIterator for CaseMapIter {}
/// Exhaustively compares `case_conv` against the standard library's
/// `char::to_lowercase` / `char::to_uppercase` for every `char`, and checks
/// that case folding always yields exactly one character.
#[test]
fn test_caseconv() {
    // Drains a std case-mapping iterator into the `(buffer, length)` shape
    // that `case_conv` returns, padding unused slots with '\0'.
    #[track_caller]
    fn to_conv_result(
        c: char,
        conv: CaseConv,
        mut it: impl Iterator<Item = char>,
    ) -> ([char; 3], u8) {
        let mut out = ['\0'; 3];
        let mut filled = 0u8;
        for slot in out.iter_mut() {
            match it.next() {
                Some(ch) => {
                    *slot = ch;
                    filled += 1;
                }
                None => break,
            }
        }
        match filled {
            // The std iterators always yield at least one character.
            0 => unreachable!("Unknown conversion ({:?}): {:?}", conv, c),
            // A full buffer must also be the end of the mapping.
            3 => assert_eq!(it.next(), None),
            _ => {}
        }
        (out, filled)
    }

    for ch in '\0'..=core::char::MAX {
        let actual_lower = case_conv(ch, CaseConv::Lower);
        let expected_lower = to_conv_result(ch, CaseConv::Lower, ch.to_lowercase());
        assert_eq!(actual_lower, expected_lower, "{:?}", ch);

        let actual_upper = case_conv(ch, CaseConv::Upper);
        let expected_upper = to_conv_result(ch, CaseConv::Upper, ch.to_uppercase());
        assert_eq!(actual_upper, expected_upper, "{:?}", ch);

        // TODO: find point of comparison. We only support simple and common
        // case foldings.
        let (_, fold_len) = case_conv(ch, CaseConv::Fold);
        assert!(fold_len > 0 && fold_len <= 3);
        assert_eq!(fold_len, 1);
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment