Skip to content

Instantly share code, notes, and snippets.

@fand
Last active January 18, 2024 00:20
Show Gist options
  • Save fand/4deb0ae2242bbdab5743085ea9918d8a to your computer and use it in GitHub Desktop.
Save fand/4deb0ae2242bbdab5743085ea9918d8a to your computer and use it in GitHub Desktop.
Find emoji byte index in a String in Rust
extern crate unicode_segmentation;
use unicode_segmentation::UnicodeSegmentation;
use regex;
/// Locates emoji inside UTF-8 text.
///
/// Holds the compiled emoji pattern so that it is built once (in `new`) and
/// reused for every `find` call.
#[derive(Debug, Clone)]
struct EmojiFinder {
    /// Compiled pattern that is tested against each grapheme cluster.
    re: regex::Regex,
}
impl EmojiFinder {
pub fn new() -> Self {
Self {
re: regex::Regex::new(r"\p{Emoji}|\p{Emoji_Presentation}|\p{Emoji_Modifier}|\p{Emoji_Modifier_Base}|\p{Emoji_Component}").unwrap(),
}
}
/// Return byte indices of emojis in the text.
pub fn find(&self, s: &str) -> Vec<usize> {
let mut indices = vec![];
let mut index = 0;
for grapheme in s.graphemes(true) {
if self.re.is_match(grapheme) {
indices.push(index);
}
index += grapheme.bytes().len();
}
indices
}
}
/// Demonstrates `EmojiFinder::find` on a set of sample strings.
///
/// Every emoji is written as a `\u{...}` escape so the inputs survive being
/// copied through tools that mangle non-ASCII text. The trailing comment on
/// each call is the expected list of byte offsets.
fn main() {
    let finder = EmojiFinder::new();

    // "Hello" + grinning face + smiling face with halo
    dbg!(finder.find("Hello\u{1F600}\u{1F607}")); // [5, 9]

    // ZWJ (3 bytes) joining two 4-byte emoji into one grapheme cluster
    dbg!(finder.find("\u{1F469}\u{1F607}")); // [0, 4]  woman, halo
    dbg!(finder.find("\u{1F4BB}\u{1F607}")); // [0, 4]  laptop, halo
    dbg!(finder.find("\u{1F469}\u{200D}\u{1F4BB}\u{1F607}")); // [0, 11]  woman technologist, halo

    // Family (4-byte char + ZWJ for each member)
    dbg!(finder.find("\u{1F468}\u{1F607}")); // [0, 4]  man, halo
    dbg!(finder.find("\u{1F468}\u{200D}\u{1F466}\u{1F607}")); // [0, 11]  man-boy, halo
    dbg!(finder.find("\u{1F468}\u{200D}\u{1F469}\u{200D}\u{1F466}\u{1F607}")); // [0, 18]  man-woman-boy, halo
    dbg!(finder.find(
        "\u{1F469}\u{200D}\u{1F469}\u{200D}\u{1F466}\u{200D}\u{1F466}\u{1F607}"
    )); // [0, 25]  woman-woman-boy-boy, halo

    // Skin-tone modifier (4 bytes) attached to the preceding emoji
    dbg!(finder.find("\u{1F44D}\u{1F607}")); // [0, 4]  thumbs up, halo
    dbg!(finder.find("\u{1F44D}\u{1F3FD}\u{1F607}")); // [0, 8]  thumbs up + medium skin tone, halo
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment