pzel/UTF8-trim.md

## example.pony

// The horse emoji is described here:
// https://unicode-table.com/en/1F40E/
// UTF-8  F0 9F 90 8E  240 159 144 142  4036989070

// Reproducer for the String.strip() behaviour described in this gist:
// dumps the raw bytes of a string holding a single 4-byte UTF-8 emoji,
// with and without a trailing space, before and after calling strip().
actor Main
  new create(env: Env) =>
    let horse = "🐎"
    let horse_then_space = "🐎 "

    _print_octets(env.out, "The Pony as StringBytes == [U8] octets:", horse)
    // reported output: 240 159 144 142

    _print_octets(
      env.out,
      "The Pony + space as StringBytes == [U8] octets:",
      horse_then_space)
    // reported output: 240 159 144 142 32

    // strip() mutates its receiver, so each literal is cloned into a
    // mutable copy first and only then viewed through a `box` alias.
    let horse_copy: String ref = horse.clone()
    horse_copy.strip()
    let stripped_horse: String box = horse_copy

    let horse_then_space_copy: String ref = horse_then_space.clone()
    horse_then_space_copy.strip()
    let stripped_horse_then_space: String box = horse_then_space_copy

    _print_octets(
      env.out,
      "Stripped Pony as StringBytes == [U8] octets:",
      stripped_horse)
    // reported output: 240 (!!)

    _print_octets(
      env.out,
      "Stripped Pony+space as StringBytes == [U8] octets:",
      stripped_horse_then_space)
    // reported output: 240 (!!)

  // Prints `heading`, then every byte of `text` as a decimal number,
  // one byte per line.
  fun _print_octets(out: OutStream, heading: String, text: String box) =>
    out.print(heading)
    for octet in text.values() do
      out.print(octet.string())
    end

## UTF8-trim.md

      
    Raw
  

              UTF8-trim.md
            
          
    Unintuitive behavior in String.strip()

I just ran across unintuitive behavior in String.strip().
When the last U8 preceding the trailing whitespace is a fragment of a UTF-8 sequence,
it gets chopped off along with the following whitespace.

When the last U8 preceding the end of the string is a fragment of a UTF-8 sequence,
that last codepoint gets mangled to its first octet.
This goes against the principle of least surprise. Encoding issues notwithstanding,
I think a programmer is justified in expecting String.strip() to not modify a string that contains
no whitespace.
I've added test cases to the builtin_test/_test.pony suite, and I'm trying to understand
how String.rstrip() works, to try and propose a solution.

	// The horse emoji is described here:
	// https://unicode-table.com/en/1F40E/
	// UTF-8 F0 9F 90 8E 240 159 144 142 4036989070

	// Demonstrates the reported String.strip() behaviour on a string whose
	// only non-whitespace content is one 4-byte UTF-8 emoji.
	actor Main
	  new create(env: Env) =>
	    let emoji = "🐎"
	    let emoji_padded = "🐎 "

	    // Raw bytes of the untouched literals.
	    env.out.print("The Pony as StringBytes == [U8] octets:")
	    for byte in emoji.values() do
	      env.out.print(byte.string())
	    end
	    // reported output: 240 159 144 142

	    env.out.print("The Pony + space as StringBytes == [U8] octets:")
	    for byte in emoji_padded.values() do
	      env.out.print(byte.string())
	    end
	    // reported output: 240 159 144 142 32

	    // strip() works in place, so it is applied to mutable clones.
	    let emoji_work: String ref = emoji.clone()
	    emoji_work.strip()
	    let emoji_stripped: String box = emoji_work

	    let emoji_padded_work: String ref = emoji_padded.clone()
	    emoji_padded_work.strip()
	    let emoji_padded_stripped: String box = emoji_padded_work

	    // Raw bytes after strip().
	    env.out.print("Stripped Pony as StringBytes == [U8] octets:")
	    for byte in emoji_stripped.values() do
	      env.out.print(byte.string())
	    end
	    // reported output: 240 (!!)

	    env.out.print("Stripped Pony+space as StringBytes == [U8] octets:")
	    for byte in emoji_padded_stripped.values() do
	      env.out.print(byte.string())
	    end
	    // reported output: 240 (!!)