Skip to content

Instantly share code, notes, and snippets.

@m1el
Created December 18, 2023 20:15
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save m1el/48339a83a49f46aa0f6cfd13d3f80e09 to your computer and use it in GitHub Desktop.
Save m1el/48339a83a49f46aa0f6cfd13d3f80e09 to your computer and use it in GitHub Desktop.
/// Returns the total length in bytes of the UTF-8 sequence introduced by
/// the lead byte `start`, or `0` if `start` cannot begin a well-formed
/// sequence.
///
/// Bytes rejected with `0`:
/// * `0x80..=0xBF` — continuation bytes, never valid in lead position;
/// * `0xC0` and `0xC1` — could only introduce an overlong two-byte encoding
///   of a value below `0x80`;
/// * `0xF5..=0xFF` — would encode values above U+10FFFF or belong to the
///   obsolete five/six-byte forms.
fn utf8_len(start: u8) -> usize {
    match start {
        0x00..=0x7F => 1,
        0xC2..=0xDF => 2,
        0xE0..=0xEF => 3,
        0xF0..=0xF4 => 4,
        _ => 0,
    }
}
/// Reasons why decoding a single UTF-8 encoded character can fail.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum Utf8Error {
    /// The first byte is not a valid lead byte of a UTF-8 sequence.
    InvalidStart,
    /// A byte following the lead byte does not have the `10xxxxxx` form.
    InvalidContinuation,
    /// The sequence is structurally sound but does not yield a valid `char`.
    InvalidCodepoint,
    /// The input ends before the sequence announced by the lead byte is complete.
    TruncatedCodepoint,
}

impl std::fmt::Display for Utf8Error {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let message = match self {
            Utf8Error::InvalidStart => "invalid UTF-8 start byte",
            Utf8Error::InvalidContinuation => "invalid UTF-8 continuation byte",
            Utf8Error::InvalidCodepoint => "byte sequence does not encode a valid code point",
            Utf8Error::TruncatedCodepoint => "input ends in the middle of a UTF-8 sequence",
        };
        f.write_str(message)
    }
}

impl std::error::Error for Utf8Error {}
/// Decodes the first UTF-8 encoded character at the front of `bytes`.
///
/// Returns `None` when `bytes` is empty. Otherwise returns
/// `Some(Ok((chr, rest)))`, where `chr` is the decoded character and `rest`
/// is the input that follows it, or `Some(Err(_))` describing the failure:
///
/// * `InvalidStart` — `utf8_len` reports that the first byte cannot begin a
///   sequence;
/// * `TruncatedCodepoint` — fewer bytes are available than the lead byte
///   announces;
/// * `InvalidContinuation` — a trailing byte is not of the form `10xxxxxx`;
/// * `InvalidCodepoint` — the sequence is an overlong encoding, or the
///   decoded value is rejected by `char::from_u32` (surrogates, values above
///   U+10FFFF).
fn utf8_split(bytes: &[u8]) -> Option<Result<(char, &[u8]), Utf8Error>> {
    let (&head, _) = bytes.split_first()?;

    let len = utf8_len(head);
    if len == 0 {
        return Some(Err(Utf8Error::InvalidStart));
    }
    if bytes.len() < len {
        return Some(Err(Utf8Error::TruncatedCodepoint));
    }
    let (encoded, rest) = bytes.split_at(len);

    // The top `len` bits of the lead byte are the length marker; only the
    // remaining low bits carry payload.
    let mut code = u32::from(head & (0xFF_u8 >> len));
    for &byte in &encoded[1..] {
        if byte & 0b1100_0000 != 0b1000_0000 {
            return Some(Err(Utf8Error::InvalidContinuation));
        }
        code = (code << 6) | u32::from(byte & 0b0011_1111);
    }

    // Reject overlong encodings: every value must use the shortest sequence
    // able to represent it, otherwise e.g. `E0 80 80` would sneak in a NUL.
    let smallest_for_len = match len {
        1 => 0x0000,
        2 => 0x0080,
        3 => 0x0800,
        _ => 0x1_0000,
    };
    if code < smallest_for_len {
        return Some(Err(Utf8Error::InvalidCodepoint));
    }

    Some(
        char::from_u32(code)
            .map(|chr| (chr, rest))
            .ok_or(Utf8Error::InvalidCodepoint),
    )
}
/// Demo driver: decodes a sample string one character at a time with
/// `utf8_split` and prints each character in single quotes, stopping at the
/// end of the input or at the first decoding error.
fn main() {
    let sample = "привіт, світ! ❤️🧡💛💚💙💜";
    let mut remaining = sample.as_bytes();
    loop {
        match utf8_split(remaining) {
            Some(Ok((decoded, rest))) => {
                remaining = rest;
                print!("'{}', ", decoded);
            }
            // End of input or a decoding error: stop either way.
            _ => break,
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment