Skip to content

Instantly share code, notes, and snippets.

@ssokolow
Last active June 20, 2022 10:28
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ssokolow/4551bd0c97f0c6a797463b83ec9e4a50 to your computer and use it in GitHub Desktop.
Save ssokolow/4551bd0c97f0c6a797463b83ec9e4a50 to your computer and use it in GitHub Desktop.
CamelCase parser for Rust
//! Routines for parsing camelcase strings
//!
use std::mem::replace;
use unicode_categories::UnicodeCategories;
use unicode_segmentation::{GraphemeIndices, UnicodeSegmentation};
// --== Enums ==--
/// Phase 1 intermediate representation used to separate classifying Unicode grapheme clusters from
/// defining state transitions between classes.
///
/// The type is a plain field-less enum, so equality is total and `Eq`/`Hash` are derived next to
/// `PartialEq`. That lets values be used as map/set keys (e.g. for gathering statistics) at no
/// cost to existing callers.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
enum CharType {
    /// No data has yet been processed
    Start, // TODO: Is there any way to make this only usable as an initialization value?
    /// Uppercase
    Uppercase,
    /// Lowercase
    Lowercase,
    /// Character which combines an uppercase and lowercase character in the same glyph to allow
    /// round-trip compatibility with legacy encodings.
    Titlecase,
    /// One of the various types of ampersands Unicode defines
    Ampersand,
    /// One of the various types of apostrophes Unicode defines
    Apostrophe,
    /// A "numeric" character, as defined by Unicode
    Numeric,
    /// A decimal separator, thousands separator, or other "Number Separator"
    NumSep,
    /// A piece of punctuation which should not have a space after it, such as "(" or "#"
    StartPunct,
    /// A piece of punctuation which should not have a space before it, such as ")" or "%"
    EndPunct,
    /// A whitespace character
    Whitespace,
    /// Any character which does not fall into the other classes
    Other,
}
/// Phase 2 intermediate representation used to separate defining state transitions between
/// character classes from actually processing the text to apply the defined transitions.
///
/// This acts as input to an algorithm which walks a `start_offset` and `end_offset` along the
/// input string, with `end_offset` always remaining one character behind the word's actual end
/// so that no fancy reverse-walking of UTF-8 is necessary to detect an <upper><lower> digraph
/// and then break before, rather than within it.
///
/// `Eq` and `Hash` are derived next to `PartialEq` because the enum is field-less, so equality
/// is total and the values are usable as map/set keys.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
enum CCaseAction {
    /// Just advance end_offset
    Literal,
    /// Emit accumulated word (if non-empty) and begin a new word starting with this grapheme
    StartWord,
    /// Shift a grapheme back out of the accumulator, then operate as in `StartWord`
    /// (Necessary to implement camelcase "<upper><lower>" handling in a single pass
    /// in cases like "RARFile" becoming "RAR File")
    AlreadyStartedWord,
    /// Like `Literal`, but prevent the following character from being the split point for a new
    /// word (Used to suppress AlreadyStartedWord in cases like "[Hello]")
    Suppress,
    /// Emit accumulated word (if non-empty) and reset accumulator WITHOUT adding this grapheme
    /// (Necessary to skip whitespace characters)
    Skip,
}
// --== Classifier Functions ==--
/// Map a single `char` onto the [`CharType`] role it plays for camelcase splitting.
///
/// Arms are tried top to bottom: the whitespace test comes first, then the hand-maintained code
/// point tables, then the general-category predicates, and finally `CharType::Other`.
fn classify_char(in_char: char) -> CharType {
    // Several arms intentionally yield the same `CharType` so that each table can keep its own
    // rationale next to it; tell clippy not to ask for them to be merged.
    #[allow(clippy::match_same_arms)]
    match in_char {
        // TODO: Use either https://github.com/BurntSushi/ucd-generate or unicode.py from
        //       unicode-categories and auto-generate the "BIDI" categories.
        // TODO: Find a crate to which I can delegate "BIDI" category membership checking
        //       (Membership checked at http://www.unicode.org/Public/UNIDATA/UnicodeData.txt)

        // Note: Whitespace stays first so that code points such as U+00A0 cannot be claimed by a
        //       later table on the strength of attributes like a "BIDI: CS" classification.
        c if c.is_whitespace() => CharType::Whitespace,

        // TODO: Is there any DB I can use to delegate "Ampersand" and "Apostrophe" definitions?
        '&' | '\u{fe60}' | '\u{ff06}' | '\u{1f674}' => CharType::Ampersand,

        // Note: U+2019 (Right Single Quotation Mark) is listed because FileFormat.info carries
        //       "U+2019 is preferred for apostrophe" in its "Comments" field.
        '\'' | '\u{2019}' | '\u{ff07}' => CharType::Apostrophe,

        // "BIDI: Common Number Separators [CS]", treated as non-space-inducing
        // TODO: Add unit tests for all of these
        ',' | '.' | '/' | ':' | '\u{060c}' | '\u{2044}' | '\u{fe50}' | '\u{fe52}' | '\u{fe55}'
        | '\u{ff0c}' | '\u{ff0e}' | '\u{ff0f}' | '\u{ff1a}'
        // "BIDI: European Number Separator [ES]", non-breaking based on the test corpus
        // TODO: Add unit tests for all of these
        | '+' | '-' | '\u{207a}' | '\u{207b}' | '\u{208a}' | '\u{208b}' | '\u{2212}'
        | '\u{fb29}' | '\u{fe62}' | '\u{fe63}' | '\u{ff0b}' | '\u{ff0d}' => CharType::NumSep,

        // "BIDI: European Number Terminator [ET]", asymmetrically non-breaking according to
        // hard-coded rules like "$ breaks before" and "% breaks after".
        // TODO: Add characters from these "see also" lists:
        //  - http://www.fileformat.info/info/unicode/char/003c/index.htm
        //  - http://www.fileformat.info/info/unicode/char/003e/index.htm
        //  - http://www.fileformat.info/info/unicode/char/search.htm?q=%22&preview=entity
        // TODO: Add unit tests for at least a large swathe of these
        // XXX: Is there an attribute that identifies asymmetric quote characters?
        // XXX: Try to build/intuit a corpus which would tell me whether it's feasible to make
        //      the "BIDI:ET" elements their own class which autodetects which side to break on
        //      based on surrounding characters. (Because that'd let me autogenerate it)
        '#' | '$' | '\u{a3}' | '\u{a4}' | '\u{a5}' | '\u{ab}' | '\u{b1}' | '\u{20a0}'
        | '\u{20ac}' | '\u{fe5f}' | '\u{fe69}' | '\u{ff03}' | '\u{ff04}' | '\u{ffe1}' => {
            CharType::StartPunct
        }
        '%' | '\u{a2}' | '\u{b0}' | '\u{bb}' | '\u{2030}' | '\u{2031}' | '\u{2032}'
        | '\u{2033}' | '\u{2034}' | '\u{fe6a}' | '\u{ff05}' | '\u{ffe0}' => CharType::EndPunct,

        // Hand-picked subset of "BIDI: Other Neutrals [ON]", asymmetrically non-breaking
        // TODO: Which side should U+2E2E break on?
        '<' | '\u{a1}' | '\u{bf}' | '\u{2e18}' | '\u{fe64}' | '\u{ff1c}' => CharType::StartPunct,
        '!' | ';' | '>' | '?' | '\u{37e}' | '\u{2026}' | '\u{203c}' | '\u{203d}' | '\u{2047}'
        | '\u{2048}' | '\u{2049}' | '\u{2762}' | '\u{fe54}' | '\u{fe56}' | '\u{fe57}'
        | '\u{fe65}' | '\u{ff01}' | '\u{ff02}' | '\u{ff1b}' | '\u{ff1e}' | '\u{ff1f}'
        | '\u{1f679}' => CharType::EndPunct,

        // Remaining opening/closing punctuation only triggers whitespace on one side
        c if c.is_punctuation_open() => CharType::StartPunct,
        c if c.is_punctuation_close() => CharType::EndPunct,

        // Plain numbers and cased letters
        c if c.is_numeric() => CharType::Numeric,
        c if c.is_uppercase() => CharType::Uppercase,
        c if c.is_lowercase() => CharType::Lowercase,
        c if c.is_letter_titlecase() => CharType::Titlecase,

        // Everything else (other kinds of symbols, etc.)
        _ => CharType::Other,
    }
}
/// Decide which [`CCaseAction`] applies when the text moves from a grapheme classified as
/// `old_type` to one classified as `new_type`.
///
/// The arms are ordered from highest to lowest precedence and the first match wins. `strict`
/// only influences how an incoming whitespace character is treated: it is passed through as a
/// literal when `strict` is true and skipped (ending the current word) otherwise.
fn transition_to_action(old_type: CharType, new_type: CharType, strict: bool) -> CCaseAction {
    // FIXME: Silence `match_same_arms` lint. It could prompt someone to mess with precedence.
    match (old_type, new_type) {
        // Incoming whitespace outranks every other rule: split instead of emitting it, unless
        // strict mode asks for it to be kept.
        (_, CharType::Whitespace) => {
            if strict {
                CCaseAction::Literal
            } else {
                CCaseAction::Skip
            }
        }

        // Block AlreadyStartedWord in situations like "(Hello"
        (CharType::StartPunct, _) => CCaseAction::Suppress,

        // A new word always begins after whitespace, before titlecase, and on either side of
        // an ampersand.
        // TODO: More unit tests for the interaction between Ampersand and NumSep/etc.
        (CharType::Whitespace, _)
        | (CharType::Ampersand, _)
        | (_, CharType::Titlecase)
        | (_, CharType::Ampersand) => CCaseAction::StartWord,

        // Never split around a "Number Separator" or an apostrophe, nor before closing
        // punctuation (eg. parens), unless one of the rules above already decided otherwise.
        (CharType::NumSep, _)
        | (CharType::Apostrophe, _)
        | (_, CharType::NumSep)
        | (_, CharType::Apostrophe)
        | (_, CharType::EndPunct) => CCaseAction::Literal,

        // A lowercase letter right after a titlecase/uppercase one means the word really began
        // one grapheme earlier, so place the break retroactively.
        // FIXME: An additional CCaseAction needs to be defined so StartPunct can overrule this
        (CharType::Titlecase, CharType::Lowercase)
        | (CharType::Uppercase, CharType::Lowercase) => CCaseAction::AlreadyStartedWord,

        // Anything left: a change of character type starts a new word, while staying within
        // the same type passes the grapheme through verbatim.
        // TODO: I'll probably want to refine this with regards to CharType::Other
        (before, after) => {
            if before == after {
                CCaseAction::Literal
            } else {
                CCaseAction::StartWord
            }
        }
    }
}
// --== Iterators ==--
/// External iterator for offsets of words as defined by camelcase rules.
///
/// Each item is a `(start, end)` pair of byte offsets into the source string. Instances are
/// created by `CamelCaseIterators::camelcase_offsets`.
pub struct WordOffsets<'a> {
/// Grapheme iterator wrapping the source string
in_iter: GraphemeIndices<'a>,
/// Maximum valid end offset (the byte length of the source string).
/// Used for the final drain operation after the iterator runs out.
in_len: usize,
/// If true, split only on CamelCase transitions, passing other delimiters through as literals
///
/// This is useful for counting camelcase transitions relative to other kinds of delimiters
///
/// TODO: Actually implement this. (For now the flag is only forwarded to
/// `transition_to_action`, where it changes how incoming whitespace is handled.)
strict: bool,
// Used by the middle phase of each next() call
/// The abstract type of the previous grapheme's base `char`. Used by `transition_to_action`.
prev_type: CharType,
// Used by the final phase of each next() call
/// The start offset (in bytes) for the word currently being accumulated
start_offset: usize,
/// The byte offset of the grapheme handled by the previous loop iteration of `next()`.
/// Used by `AlreadyStartedWord` to rewind split points by one grapheme.
prev_offset: usize,
/// Used to allow `CCaseAction::Skip` to not emit whitespace-only words
skipping: bool,
/// Byte offset recorded by the most recent `CCaseAction::Suppress`; a split point that would
/// land on this offset is not taken. Used to allow `Suppress` to block `AlreadyStartedWord`.
suppress: usize,
}
impl<'a> WordOffsets<'a> {
    /// Close the word currently being accumulated at `end_offset` and open a new one there.
    ///
    /// `skip` becomes the new value of the `skipping` flag, i.e. it says whether the word that
    /// starts at `end_offset` should be discarded when it is closed in turn.
    ///
    /// Returns the `(start, end)` byte range of the word that was just closed, or `None` when
    /// that word was empty or had been marked as skipped.
    fn _next_word(&mut self, end_offset: usize, skip: bool) -> Option<(usize, usize)> {
        // State is swapped unconditionally; only the decision to report the old word varies.
        let was_skipping = replace(&mut self.skipping, skip);
        let word_start = replace(&mut self.start_offset, end_offset);

        if was_skipping || word_start >= end_offset {
            return None;
        }
        Some((word_start, end_offset))
    }
}
impl<'a> Iterator for WordOffsets<'a> {
    type Item = (usize, usize);

    /// Consume graphemes until a word boundary closes a reportable word and return its byte
    /// range. Once the input is exhausted, whatever is still accumulated is drained as the
    /// final word.
    fn next(&mut self) -> Option<(usize, usize)> {
        // Note: `while let` rather than `for` is necessary to avoid a borrow conflict, because
        //       the loop body needs `self` as a whole.
        #[allow(clippy::while_let_on_iterator)]
        while let Some((byte_offset, grapheme)) = self.in_iter.next() {
            // Only the base `char` is classified, so `classify_char` can call things like
            // `is_uppercase`; any further chars in the cluster just ride along.
            let base = grapheme.chars().next().expect("non-empty grapheme cluster");
            let curr_type = classify_char(base);

            // Remember this grapheme for the next iteration while fetching the previous state
            let prev_type = replace(&mut self.prev_type, curr_type);
            let prev_offset = replace(&mut self.prev_offset, byte_offset);

            // Apply the action for this transition to the iterator's state.
            // TODO: Consider using an enum for the skip=true/false
            let emitted = match transition_to_action(prev_type, curr_type, self.strict) {
                CCaseAction::Skip => self._next_word(byte_offset, true),
                CCaseAction::StartWord if self.suppress != byte_offset => {
                    self._next_word(byte_offset, false)
                }
                CCaseAction::AlreadyStartedWord if self.suppress != prev_offset => {
                    self._next_word(prev_offset, false)
                }
                CCaseAction::Suppress => {
                    self.suppress = byte_offset;
                    None
                }
                // `Literal`, and any action whose guard failed, just keeps accumulating
                _ => None,
            };

            // A finished word ends this call; otherwise keep consuming graphemes
            if emitted.is_some() {
                return emitted;
            }
        }

        // Drain the remaining graphemes into a final word, if present
        let total_len = self.in_len;
        self._next_word(total_len, true)
    }
}
/// External iterator for words in a string as defined by camelcase rules.
///
/// Each item is a `&str` sub-slice of the source string, cut at the byte offsets produced by the
/// wrapped `WordOffsets` iterator.
///
/// NOTE: This API should be considered unstable as I have plans to rewrite it once
/// `impl Iterator<Item=&str>` is stabilized.
pub struct Words<'a> {
/// Source string from which slices will be returned
in_str: &'a str,
/// Offset iterator wrapping the source string
in_iter: WordOffsets<'a>,
}
impl<'a> Iterator for Words<'a> {
    type Item = &'a str;

    /// Fetch the next `(start, end)` pair from the offset iterator and return the matching
    /// slice of the source string.
    fn next(&mut self) -> Option<&'a str> {
        // The slice bounds come from the wrapped offset iterator rather than from user input
        #![allow(clippy::indexing_slicing)]
        let source = self.in_str;
        self.in_iter.next().map(|(start, end)| &source[start..end])
    }
}
/// Extension trait to add camelcase-based wordwise iterators to &str
pub trait CamelCaseIterators {
/// Returns an iterator over the `(start_offset, end_offset)` tuples defining words within the
/// string, as separated by camelcase rules.
///
/// The offsets are byte offsets into the string; `end_offset` is exclusive, so
/// `&s[start_offset..end_offset]` is the word.
///
/// This implementation differs from the form of camelcase typically used for function names in
/// that it will insert spaces between words and numbers.
/// (ie. "Thing Part 1" rather than "Thing Part1")
///
/// This decision was made based on the following observations taken from a corpus of over 800
/// real-world computer game directory and installer/archive file names:
///
/// 1. It produces a more accurate translation to the intended titles.
/// 2. It is in accordance with how, unlike method names, `snake_case` in video game filenames
/// separates numbers from the words they follow.
///
/// The test data in question can be found in the `filename_to_name_data.json` file used by the
/// top-level integration tests for this project.
///
/// TODO: If strict `true`, only split on camelcase boundaries, passing other delimiters
/// through literally. (Useful for stats gathering)
fn camelcase_offsets(&self, strict: bool) -> WordOffsets;
/// Returns an iterator over the words of the string, separated by camelcase rules.
///
/// Each word is returned as a sub-slice of the original string.
///
/// See `camelcase_offsets` for details.
///
/// TODO: If strict `true`, only split on camelcase boundaries, passing other delimiters
/// through literally. (Useful for stats gathering)
fn camelcase_words(&self, strict: bool) -> Words;
}
impl CamelCaseIterators for str {
    // TODO: Once I'm set up for benchmarking, check whether I should copy the tactic
    //       unicode_segmentation applies involving #[inline] annotations

    /// Build a `WordOffsets` iterator positioned at the start of the string.
    fn camelcase_offsets(&self, strict: bool) -> WordOffsets {
        WordOffsets {
            // `true` asks unicode_segmentation for extended grapheme clusters
            in_iter: self.grapheme_indices(true),
            in_len: self.len(),
            // TODO: Implement strict and unit test it
            strict,
            prev_type: CharType::Start,
            start_offset: 0,
            prev_offset: 0,
            skipping: false,
            // Use the maximum possible value for `suppress` to mean "unset": it is only ever
            // compared against grapheme byte offsets, and no grapheme can start at this offset,
            // which means this can't collide with anything.
            suppress: usize::MAX,
        }
    }

    /// Build a `Words` iterator which slices this string at the offsets `camelcase_offsets`
    /// yields.
    fn camelcase_words(&self, strict: bool) -> Words {
        let in_iter = self.camelcase_offsets(strict);
        Words {
            in_str: self,
            in_iter,
        }
    }
}
// --== Tests ==--
#[cfg(test)]
mod tests {
use super::CamelCaseIterators;
// TODO: Set up fuzzing too so I can shake out any flaws I *didn't* anticipate.
/// Helper for testing `camelcase_words` on strings which rely on whitespace for some of their
/// word boundaries.
///
/// Asserts that `input` splits into `expected`, that splitting is stable when the words are
/// re-joined with single spaces, and that `camelcase_offsets` yields as many items as there are
/// expected words. Returns the words of the second pass concatenated without any separator.
fn check_camelcase_words_limited(input: &str, expected: &[&str]) -> String {
    // First pass: the raw input must split into exactly the expected words
    let first_pass: Vec<&str> = input.camelcase_words(false).collect();
    assert_eq!(first_pass, expected, "(with input {:?})", input);

    // Second pass: re-joining with " " and then re-splitting must not change anything
    let spaced = first_pass.join(" ");
    let second_pass: Vec<&str> = spaced.camelcase_words(false).collect();
    assert_eq!(
        second_pass, first_pass,
        "camelcase_words should be a no-op when re-run on its own output (space)"
    );

    // Basic sanity check that camelcase_offsets produces the same number of values as _words
    let offset_count = input.camelcase_offsets(false).count();
    assert_eq!(offset_count, expected.len());

    // Hand back the words glued together without separators so that check_camelcase_words can
    // easily reuse them
    second_pass.concat()
}
/// Helper to deduplicate verifying that CamelCaseIterators output is stable
///
/// Runs every check from `check_camelcase_words_limited` and additionally requires that the
/// words, once concatenated without separators, split back into `expected`.
fn check_camelcase_words(input: &str, expected: &[&str]) {
    let glued = check_camelcase_words_limited(input, expected);
    let resplit: Vec<&str> = glued.camelcase_words(false).collect();
    assert_eq!(
        resplit, expected,
        "camelcase_words should be a no-op when re-run on its own output (no space)"
    );
}
/// Helper to deduplicate verifying that `strict` CamelCaseIterators output is as expected
///
/// Asserts that `input.camelcase_words(true)` yields `expected`, that the resulting words are
/// stable when re-joined (with and without spaces) and re-split, and that
/// `camelcase_offsets(true)` yields as many items as there are expected words.
fn check_camelcase_strict(input: &str, expected: &[&str]) {
    let result = input.camelcase_words(true).collect::<Vec<_>>();
    assert_eq!(result, expected, "(with input {:?})", input);

    // NOTE(review): the round-trip checks re-split in non-strict mode, as the original helper
    // did. Confirm that this is the intended contract once `strict` is fully implemented.
    let result_j = result.join(" ");
    let result2 = result_j.camelcase_words(false).collect::<Vec<_>>();
    assert_eq!(
        result2, result,
        "camelcase_words should be a no-op when re-run on its own output (space)"
    );
    let result2_j = result2.join("");
    assert_eq!(
        result2_j.camelcase_words(false).collect::<Vec<_>>(),
        result,
        "camelcase_words should be a no-op when re-run on its own output (no space)"
    );

    // Basic sanity check that camelcase_offsets produces the same number of values as _words.
    // The offsets must be counted in the same (strict) mode the words were produced in,
    // otherwise the comparison with `expected` is meaningless.
    assert_eq!(input.camelcase_offsets(true).count(), expected.len());
}
/// Basic sanity test to catch if camelcase_words is only passing tests because it reverses an
/// indexing mistake camelcase_offsets makes.
#[test]
fn camelcase_offsets_basic_function() {
    let offsets: Vec<(usize, usize)> = "fooBar2 baz".camelcase_offsets(false).collect();
    // "foo", "Bar", "2", then "baz" after the skipped space at byte 7
    assert_eq!(offsets, [(0, 3), (3, 6), (6, 7), (8, 11)]);
}
#[test]
/// Basic camelcase splitting, including acronyms and single-letter words
fn camelcase_words_basic_function() {
    let cases: &[(&str, &[&str])] = &[
        ("NeonChrome", &["Neon", "Chrome"]),           // Basic upper-starting camelcase
        ("projectShyknight", &["project", "Shyknight"]), // ...lower-starting
        ("AndroidVM", &["Android", "VM"]),             // Acronym at the end
        ("RARFile", &["RAR", "File"]),                 // Acronym at the beginning
        // TODO: Find a real-world "acronym in the middle" test which allows corpus-friendly
        //       rules
        // Regression tests
        ("ADruidsDuel", &["A", "Druids", "Duel"]), // Single-letter first word
        ("PickACard", &["Pick", "A", "Card"]),     // Single-letter middle word
        ("AxelF", &["Axel", "F"]),                 // Single-letter end word
    ];
    for &(input, expected) in cases {
        check_camelcase_words(input, expected);
    }
}
#[test]
/// Splitting must return the words exactly as they are capitalised in the input
fn camelcase_words_leaves_capitalization_alone() {
    let cases: &[(&str, &[&str])] = &[
        ("foo", &["foo"]),
        ("Foo", &["Foo"]),
        ("fooBar", &["foo", "Bar"]),
        ("FooBar", &["Foo", "Bar"]),
        ("Foo Bar", &["Foo", "Bar"]),
    ];
    for &(input, expected) in cases {
        check_camelcase_words(input, expected);
    }
    // All-lowercase words depend on the space to be told apart, so only the limited check
    // (which never re-joins without spaces) applies here.
    check_camelcase_words_limited("foo bar", &["foo", "bar"]);
}
#[test]
/// ASCII digits form their own words, and a decimal point stays inside the number
fn camelcase_words_ascii_number_handling() {
    let cases: &[(&str, &[&str])] = &[
        ("6LittleEggs", &["6", "Little", "Eggs"]),
        ("the12chairs", &["the", "12", "chairs"]),
        ("The12Chairs", &["The", "12", "Chairs"]),
        ("1.5 Children", &["1.5", "Children"]),
        ("The1.5Children", &["The", "1.5", "Children"]),
        ("the1.5children", &["the", "1.5", "children"]),
        ("Version1.1", &["Version", "1.1"]),
        ("catch22", &["catch", "22"]),
        ("Catch22", &["Catch", "22"]),
        ("1Two3", &["1", "Two", "3"]),
        ("One2Three", &["One", "2", "Three"]),
        ("ONE2", &["ONE", "2"]),
        ("ONE2THREE", &["ONE", "2", "THREE"]),
    ];
    for &(input, expected) in cases {
        check_camelcase_words(input, expected);
    }
}
#[test]
/// Non-ASCII counterparts of the "6LittleEggs" case: each input starts with a non-ASCII code
/// point which is expected to be split off as its own word, followed by words containing
/// non-ASCII letters.
fn camelcase_words_basic_unicode_handling() {
// Without any whitespace in the input
check_camelcase_words("\u{1D7DE}ŁittléEggs", &["\u{1D7DE}", "Łittlé", "Eggs"]);
check_camelcase_words("ⅥŁittłeEggs", &["Ⅵ", "Łittłe", "Eggs"]);
check_camelcase_words("➅LittleEggs", &["➅", "Little", "Eggs"]);
// The same strings, already separated by spaces
check_camelcase_words("\u{1D7DE} Łittlé Eggs", &["\u{1D7DE}", "Łittlé", "Eggs"]);
check_camelcase_words("Ⅵ Łittłe Eggs", &["Ⅵ", "Łittłe", "Eggs"]);
check_camelcase_words("➅ Little Eggs", &["➅", "Little", "Eggs"]);
}
#[test]
/// A titlecase code point always begins a new word and is never separated from the letters
/// which follow it.
fn camelcase_words_titlecase_handling() {
// Actual word (Serbo-Croatian for "jungle")
check_camelcase_words("Džungla", &["Džungla"]);
// Synthetic cases for exhaustiveness
check_camelcase_words("Dž", &["Dž"]);
check_camelcase_words("DžX", &["Dž", "X"]);
check_camelcase_words("XDž", &["X", "Dž"]);
check_camelcase_words("XxDž", &["Xx", "Dž"]);
check_camelcase_words("DžXx", &["Dž", "Xx"]);
check_camelcase_words("1Dž2", &["1", "Dž", "2"]);
check_camelcase_words("Dž&Dž", &["Dž", "&", "Dž"]);
}
#[test]
/// An ampersand is always a word of its own, whichever of the ampersand code points known to
/// `classify_char` is used.
///
/// The non-ASCII code points are written as escapes so that tools which normalise compatibility
/// characters cannot silently turn these cases into duplicates of the ASCII one.
fn camelcase_words_ampersand_handling() {
    // Basic function with all known ampersand code points
    check_camelcase_words("TheKing&I", &["The", "King", "&", "I"]); // U+0026
    check_camelcase_words("TheKing\u{FE60}I", &["The", "King", "\u{FE60}", "I"]); // Small
    check_camelcase_words("TheKing\u{FF06}I", &["The", "King", "\u{FF06}", "I"]); // Fullwidth
    check_camelcase_words("TheKing\u{1F674}I", &["The", "King", "\u{1F674}", "I"]);
    // Ampersand interaction with titlecase codepoints (U+01C5)
    check_camelcase_words("\u{1C5}&\u{1C5}", &["\u{1C5}", "&", "\u{1C5}"]);
    // Ampersand followed by punctuation
    check_camelcase_words("Forsooth&'tisTrue", &["Forsooth", "&", "'tis", "True"]);
    // Regression tests (Ampersand between single-letter words)
    check_camelcase_words("A&b", &["A", "&", "b"]);
    check_camelcase_words("A\u{FE60}b", &["A", "\u{FE60}", "b"]);
    check_camelcase_words("A\u{FF06}b", &["A", "\u{FF06}", "b"]);
    check_camelcase_words("A\u{1F674}b", &["A", "\u{1F674}", "b"]);
    check_camelcase_words("1&2", &["1", "&", "2"]);
}
#[test]
/// Apostrophes stay attached to the word they occur in
fn camelcase_words_apostrophe_handling() {
    // Basic check for common apostrophe characters
    // Note: U+2019 (Right Single Quotation Mark) is included here because FileFormat.info
    //       includes "U+2019 is preferred for apostrophe" in the "Comments" field.
    let within_words: &[(&str, &[&str])] = &[
        ("Don'tMove", &["Don't", "Move"]),
        ("Don\u{FF07}tMove", &["Don\u{FF07}t", "Move"]), // Double-width
        ("Don\u{2019}tMove", &["Don\u{2019}t", "Move"]),
    ];
    for &(input, expected) in within_words {
        check_camelcase_words(input, expected);
    }

    // Use an odd but valid sentence to test apostrophes within words, before a space,
    // and at the end of the string.
    let sentences: &[(&str, &[&str])] = &[
        ("It's my kids' kids'", &["It's", "my", "kids'", "kids'"]),
        (
            "it\u{2019}s my kids\u{2019} kids\u{2019}",
            &["it\u{2019}s", "my", "kids\u{2019}", "kids\u{2019}"],
        ),
        (
            "it\u{FF07}s my kids\u{FF07} kids\u{FF07}",
            &["it\u{FF07}s", "my", "kids\u{FF07}", "kids\u{FF07}"],
        ),
    ];
    for &(input, expected) in sentences {
        check_camelcase_words_limited(input, expected);
    }
}
#[test]
/// Test that the "retroactively insert word break" signal from the first two characters of a
/// camelcase word adjusts for opening punctuation marks like "(" and "["
///
/// The non-ASCII delimiters are written as escapes so that tools which normalise compatibility
/// characters cannot silently turn these cases into duplicates of the ASCII "<Hello>" one.
fn camelcase_words_open_close_plus_upper_lower() {
    check_camelcase_words("Test [Hello]", &["Test", "[Hello]"]);
    check_camelcase_words("Test (Hello)", &["Test", "(Hello)"]);
    check_camelcase_words("Test {Hello}", &["Test", "{Hello}"]);
    // Guillemets (U+00AB / U+00BB)
    check_camelcase_words("Test \u{AB}Hello\u{BB}", &["Test", "\u{AB}Hello\u{BB}"]);
    check_camelcase_words("Test <Hello>", &["Test", "<Hello>"]);
    // Small less-than/greater-than signs (U+FE64 / U+FE65)
    check_camelcase_words("Test \u{FE64}Hello\u{FE65}", &["Test", "\u{FE64}Hello\u{FE65}"]);
    // Fullwidth less-than/greater-than signs (U+FF1C / U+FF1E)
    check_camelcase_words("Test \u{FF1C}Hello\u{FF1E}", &["Test", "\u{FF1C}Hello\u{FF1E}"]);
    // XXX: Consider just using a string-building loop so the "Test " and "Hello" don't need to
    //      be specified repeatedly.
    // XXX: Decide whether it's within the scope of our concerns to specify a behaviour
    //      for a testcase like "[Hello)" where a parser might be trying to enforce
    //      balanced parens.
}
#[test]
/// General tests for proper handling of characters which should force a word break on one side
/// but not the other. (eg. brackets, exclamation marks, etc.)
///
/// TODO: Find real-world sample strings for all characters I want to include in my tables to
/// guard against accidentally mis-filing a character in both the tables and the tests.
/// (Because mistakes are much easier to see in context)
fn camelcase_words_open_close_handling() {
// Punctuation symbol associativity
// ("?", "!" and "%" stay with the preceding word; "#" stays with the following one)
check_camelcase_words(
"Who?Him!Really?Yeah!",
&["Who?", "Him!", "Really?", "Yeah!"],
);
check_camelcase_words("100%Juice", &["100%", "Juice"]);
check_camelcase_words("WeAre#1", &["We", "Are", "#1"]);
// Bracket associativity
// (an opening bracket starts a word, a closing bracket ends one)
check_camelcase_words("ShadowWarrior(2013)", &["Shadow", "Warrior", "(2013)"]);
check_camelcase_words("The<html>tag", &["The", "<html>", "tag"]);
check_camelcase_words(
"[She]said[...]and[…].",
&["[She]", "said", "[...]", "and", "[…]."],
);
// Word-break insertion in the presence of titlecase codepoints
check_camelcase_words("[Džungla]", &["[Džungla]"]); // Opening bracket, then titlecase char
check_camelcase_words(" [Džungla] ", &["[Džungla]"]); // ... with leading space
// Regression tests and variations thereof
check_camelcase_words("SallyFace[linux]", &["Sally", "Face", "[linux]"]);
check_camelcase_words("SallyFace[Linux]", &["Sally", "Face", "[Linux]"]);
// Guillemet associativity
// (Apologies to fans of "Un cœur simple". A more real-world example will be welcomed.)
// See Also: https://www.thoughtco.com/capitalize-french-titles-4086495
check_camelcase_words("UnCœur«simple»2", &["Un", "Cœur", "«simple»", "2"]);
// fr_CH
// XXX: Once I've got English solid, I need to evaluate the feasibility of unconditionally
// ignoring non-breaking spaces after "«" and before "»" when splitting so that
// already-split French strings from outside Switzerland don't get mangled.
// TODO: figure out how to handle double-quote associativity.
// (The triggering string was "[She]said\"He's[...]boorish[…]andCrude.\"")
}
#[test]
/// Numbers must come out as a single word, including any sign, decimal/thousands separators
/// and leading currency symbol.
fn camelcase_words_doesnt_subdivide_numbers() {
check_camelcase_words("3.14", &["3.14"]); // Decimal
check_camelcase_words("255", &["255"]); // Positive integer
check_camelcase_words("-127", &["-127"]); // Negative integer
check_camelcase_words("1000000", &["1000000"]); // Repeating zeros
// Numeric Separators
check_camelcase_words("1,000,000 BCE", &["1,000,000", "BCE"]); // Thousands sep. (en)
check_camelcase_words("1.000.000 AEC", &["1.000.000", "AEC"]); // Thousands sep. (fr)
check_camelcase_words("$1,499.95", &["$1,499.95"]); // Comma and period (English)
check_camelcase_words("€1.499,95", &["€1.499,95"]); // Comma and period (Français)
check_camelcase_words("2.6.12", &["2.6.12"]); // Raw version number
// Regression tests
check_camelcase_words("ut2003", &["ut", "2003"]);
// XXX: Where in the stack of transforms is it most appropriate to ensure that "v1.5rc2"
// doesn't get split up into &["v", "1.5", "rc", "2"]?
}
#[test]
fn camelcase_words_unicode_segmentation() {
// Zalgo text generated using http://eeemo.net/
// Zalgo'd synthetic tests for situations where some algorithms could fail
check_camelcase_words("f̴͘͟͜ǫ̴̸̧͘ó̵̢̢͏B̴̨͠á̵̸͡r̶̵͢͠", &["f̴͘͟͜ǫ̴̸̧͘ó̵̢̢͏", "B̴̨͠á̵̸͡r̶̵͢͠"]); // Basic test
check_camelcase_words("Ŕ̀̕͟͞À̸̛͞͞Ŕ̨̕F̕͜͟͠í̵͜l҉̨e̶̵", &["Ŕ̀̕͟͞À̸̛͞͞Ŕ̨̕", "F̕͜͟͠í̵͜l҉̨e̶̵"]); // Acronym
check_camelcase_words("A̴&b͝", &["A̴", "&", "b͝"]); // Ampersand (No combining chars on &)
check_camelcase_words("A̴&͏̵̛b͝", &["A̴", "&͏̵̛", "b͝"]); // Ampersand (Combining chars on &)
check_camelcase_words("P̕͟͠i҉͢c̨̨͞͡ḱ̸̕Ą̸Ç͘͜a͘͟r̀͟͢҉̵d̕͜", &["P̕͟͠i҉͢c̨̨͞͡ḱ̸̕", "Ą̸", "Ç͘͜a͘͟r̀͟͢҉̵d̕͜"]); // Single-letter word
check_camelcase_words("6̢L̢͏͏͠i̷̛͜t̷̕t̷͟ļ͟͢ȩ̨̕̕È̷̸g̵̷̨͢͡g̷s͟͞", &["6̢", "L̢͏͏͠i̷̛͜t̷̕t̷͟ļ͟͢ȩ̨̕̕", "È̷̸g̵̷̨͢͡g̷s͟͞"]); // Initial number
check_camelcase_words("T̶͡ḩ̷̷͟ȩ̛́͘͡1̵̨̕͢2̕͝C̸̡͞͏͟h̴̵̀a҉͜͢i̵̸̡̕ŗ̴͢s̴͏͘͡", &["T̶͡ḩ̷̷͟ȩ̛́͘͡", "1̵̨̕͢2̕͝", "C̸̡͞͏͟h̴̵̀a҉͜͢i̵̸̡̕ŗ̴͢s̴͏͘͡"]); // Number in the middle
check_camelcase_words("t̶̨͞h̨͝͝e̡͟͢1̴̧̀͘͟2͘͘c̷̴̢͘h̶̴̢͢à͘͏i̡̛r͜s̷͏", &["t̶̨͞h̨͝͝e̡͟͢", "1̴̧̀͘͟2͘͘", "c̷̴̢͘h̶̴̢͢à͘͏i̡̛r͜s̷͏"]); // ...starting lowercase
check_camelcase_words("T͠҉̸̷h̀͡e̡̨͝͠1̴́͏.͏̨́͠͝5̨́̕C̷͜͏͠h̢̧͝ì̡̢̕l̸͞͡d̵̕͢͡ŕ̶͘͡͞e͜͝n̨҉̕", &["T͠҉̸̷h̀͡e̡̨͝͠", "1̴́͏.͏̨́͠͝5̨́̕", "C̷͜͏͠h̢̧͝ì̡̢̕l̸͞͡d̵̕͢͡ŕ̶͘͡͞e͜͝n̨҉̕"]); // ...with decimal
check_camelcase_words("t̡̛͟h͏҉҉́è͝͠1̢̕͟͟.̶̛5̶͜ć̀ḩ̶̸̕͜i̸̕͢l̢͡͝͝͏d͘͟r̨͢e̢҉̵͞͠n̛", &["t̡̛͟h͏҉҉́è͝͠", "1̢̕͟͟.̶̛5̶͜", "ć̀ḩ̶̸̕͜i̸̕͢l̢͡͝͝͏d͘͟r̨͢e̢҉̵͞͠n̛"]); // ...with decimal
check_camelcase_words("V̶͞e̡͜͟͠r̢͟s̀͏̧̢̕i̸̧͞͠o̷̸̧n̡͞1̧̀͘͟͞.̸̕1́͞҉", &["V̶͞e̡͜͟͠r̢͟s̀͏̧̢̕i̸̧͞͠o̷̸̧n̡͞", "1̧̀͘͟͞.̸̕1́͞҉"]); // Decimal number at the end
check_camelcase_words("2̶͏͡0́̕҉̶0̡͞͡3̴̷͟", &["2̶͏͡0́̕҉̶0̡͞͡3̴̷͟"]); // Multi-digit integer with combining characters
// Zalgo'd regression tests
check_camelcase_words("u̢҉͡t̸̷̛2003", &["u̢҉͡t̸̷̛", "2003"]);
check_camelcase_words("u̢҉͡t̸̷̛2̶͏͡0́̕҉̶0̡͞͡3̴̷͟", &["u̢҉͡t̸̷̛", "2̶͏͡0́̕҉̶0̡͞͡3̴̷͟"]);
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment