-
-
Save rendello/d37552507a389656e248f3255a618127 to your computer and use it in GitHub Desktop.
/*
Copyright (c) 2024 Rendello
Permission to use, copy, modify, and/or distribute this software for any
purpose with or without fee is hereby granted.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,
INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
// ==========================================================================
//! Unicode codepoints that expand or contract when case is changed in UTF-8.
// ==========================================================================
/// Single code points whose lowercase form takes fewer UTF-8 bytes.
///
/// Each trailing comment gives the lowercase result, the UTF-8 byte lengths
/// `(before->after)`, and the byte delta.
///
/// U+2126 OHM SIGN, U+212B ANGSTROM SIGN and U+212A KELVIN SIGN are written as
/// `\u{...}` escapes on purpose: they are canonically equivalent to U+03A9,
/// U+00C5 and U+004B, so any tool that NFC-normalizes this file would replace
/// the literal characters with ones that no longer contract.
pub const LOWERCASING_CONTRACTS: [&str; 22] = [
    "ẞ", /* ß (3->2), -1 bytes */
    "\u{2126}", /* OHM SIGN: ω (3->2), -1 bytes */
    "\u{212B}", /* ANGSTROM SIGN: å (3->2), -1 bytes */
    "Ɫ", /* ɫ (3->2), -1 bytes */
    "Ɽ", /* ɽ (3->2), -1 bytes */
    "Ɑ", /* ɑ (3->2), -1 bytes */
    "Ɱ", /* ɱ (3->2), -1 bytes */
    "Ɐ", /* ɐ (3->2), -1 bytes */
    "Ɒ", /* ɒ (3->2), -1 bytes */
    "Ȿ", /* ȿ (3->2), -1 bytes */
    "Ɀ", /* ɀ (3->2), -1 bytes */
    "Ɥ", /* ɥ (3->2), -1 bytes */
    "Ɦ", /* ɦ (3->2), -1 bytes */
    "Ɜ", /* ɜ (3->2), -1 bytes */
    "Ɡ", /* ɡ (3->2), -1 bytes */
    "Ɬ", /* ɬ (3->2), -1 bytes */
    "Ɪ", /* ɪ (3->2), -1 bytes */
    "Ʞ", /* ʞ (3->2), -1 bytes */
    "Ʇ", /* ʇ (3->2), -1 bytes */
    "Ʝ", /* ʝ (3->2), -1 bytes */
    "Ʂ", /* ʂ (3->2), -1 bytes */
    "\u{212A}", /* KELVIN SIGN: k (3->1), -2 bytes */
];
/// Single code points whose lowercase form takes more UTF-8 bytes.
///
/// Each trailing comment gives the lowercase result, the UTF-8 byte lengths
/// `(before->after)`, and the byte delta.
pub const LOWERCASING_EXPANDS: [&str; 2] = [
    "Ⱥ", /* ⱥ (2->3), +1 bytes */
    "Ⱦ", /* ⱦ (2->3), +1 bytes */
];
/// Single code points whose lowercase form takes more UTF-8 bytes and is more
/// than one `char` long.
///
/// The trailing comment gives the lowercase result, the UTF-8 byte lengths
/// `(before->after)`, the byte delta, and the change in `char` count.
pub const LOWERCASING_EXPANDS_MULTI_CHAR: [&str; 1] = [
    "İ", /* i̇ (2->3), +1 bytes, +1 chars */
];
/// Single code points whose uppercase form takes fewer UTF-8 bytes.
///
/// Each trailing comment gives the uppercase result, the UTF-8 byte lengths
/// `(before->after)`, and the byte delta.
///
/// U+1FBE GREEK PROSGEGRAMMENI is written as a `\u{...}` escape on purpose: it
/// is canonically equivalent to U+03B9, so any tool that NFC-normalizes this
/// file would replace the literal with a character that no longer contracts.
pub const UPPERCASING_CONTRACTS: [&str; 13] = [
    "ı", /* I (2->1), -1 bytes */
    "ſ", /* S (2->1), -1 bytes */
    "ᲀ", /* В (3->2), -1 bytes */
    "ᲁ", /* Д (3->2), -1 bytes */
    "ᲂ", /* О (3->2), -1 bytes */
    "ᲃ", /* С (3->2), -1 bytes */
    "ᲄ", /* Т (3->2), -1 bytes */
    "ᲅ", /* Т (3->2), -1 bytes */
    "ᲆ", /* Ъ (3->2), -1 bytes */
    "ᲇ", /* Ѣ (3->2), -1 bytes */
    "\u{1FBE}", /* GREEK PROSGEGRAMMENI: Ι (3->2), -1 bytes */
    "ⱥ", /* Ⱥ (3->2), -1 bytes */
    "ⱦ", /* Ⱦ (3->2), -1 bytes */
];
/// Single code points (Latin ligatures) whose uppercase form takes fewer UTF-8
/// bytes even though it is more than one `char` long.
///
/// Each trailing comment gives the uppercase result, the UTF-8 byte lengths
/// `(before->after)`, the byte delta, and the change in `char` count.
pub const UPPERCASING_CONTRACTS_MULTI_CHAR: [&str; 5] = [
    "ﬀ", /* FF (3->2), -1 bytes, +1 chars */
    "ﬁ", /* FI (3->2), -1 bytes, +1 chars */
    "ﬂ", /* FL (3->2), -1 bytes, +1 chars */
    "ﬅ", /* ST (3->2), -1 bytes, +1 chars */
    "ﬆ", /* ST (3->2), -1 bytes, +1 chars */
];
/// Single code points whose uppercase form takes more UTF-8 bytes.
///
/// Each trailing comment gives the uppercase result, the UTF-8 byte lengths
/// `(before->after)`, and the byte delta.
pub const UPPERCASING_EXPANDS: [&str; 18] = [
    "ȿ", /* Ȿ (2->3), +1 bytes */
    "ɀ", /* Ɀ (2->3), +1 bytes */
    "ɐ", /* Ɐ (2->3), +1 bytes */
    "ɑ", /* Ɑ (2->3), +1 bytes */
    "ɒ", /* Ɒ (2->3), +1 bytes */
    "ɜ", /* Ɜ (2->3), +1 bytes */
    "ɡ", /* Ɡ (2->3), +1 bytes */
    "ɥ", /* Ɥ (2->3), +1 bytes */
    "ɦ", /* Ɦ (2->3), +1 bytes */
    "ɪ", /* Ɪ (2->3), +1 bytes */
    "ɫ", /* Ɫ (2->3), +1 bytes */
    "ɬ", /* Ɬ (2->3), +1 bytes */
    "ɱ", /* Ɱ (2->3), +1 bytes */
    "ɽ", /* Ɽ (2->3), +1 bytes */
    "ʂ", /* Ʂ (2->3), +1 bytes */
    "ʇ", /* Ʇ (2->3), +1 bytes */
    "ʝ", /* Ʝ (2->3), +1 bytes */
    "ʞ", /* Ʞ (2->3), +1 bytes */
];
/// Single code points whose uppercase form takes more UTF-8 bytes and is more
/// than one `char` long.
///
/// Each trailing comment gives the uppercase result, the UTF-8 byte lengths
/// `(before->after)`, the byte delta, and the change in `char` count.
///
/// U+1FD3 and U+1FE3 are written as `\u{...}` escapes on purpose: they are
/// canonically equivalent to U+0390 and U+03B0 (the first two entries), so any
/// tool that NFC-normalizes this file would turn the literals into duplicates
/// of those entries.
pub const UPPERCASING_EXPANDS_MULTI_CHAR: [&str; 89] = [
    "ΐ", /* Ϊ́ (2->6), +4 bytes, +2 chars */
    "ΰ", /* Ϋ́ (2->6), +4 bytes, +2 chars */
    "ὒ", /* Υ̓̀ (3->6), +3 bytes, +2 chars */
    "ὔ", /* Υ̓́ (3->6), +3 bytes, +2 chars */
    "ὖ", /* Υ̓͂ (3->6), +3 bytes, +2 chars */
    "ᾷ", /* Α͂Ι (3->6), +3 bytes, +2 chars */
    "ῇ", /* Η͂Ι (3->6), +3 bytes, +2 chars */
    "ῒ", /* Ϊ̀ (3->6), +3 bytes, +2 chars */
    "\u{1FD3}", /* U+1FD3: Ϊ́ (3->6), +3 bytes, +2 chars */
    "ῗ", /* Ϊ͂ (3->6), +3 bytes, +2 chars */
    "ῢ", /* Ϋ̀ (3->6), +3 bytes, +2 chars */
    "\u{1FE3}", /* U+1FE3: Ϋ́ (3->6), +3 bytes, +2 chars */
    "ῧ", /* Ϋ͂ (3->6), +3 bytes, +2 chars */
    "ῷ", /* Ω͂Ι (3->6), +3 bytes, +2 chars */
    "և", /* ԵՒ (2->4), +2 bytes, +1 chars */
    "ᾀ", /* ἈΙ (3->5), +2 bytes, +1 chars */
    "ᾁ", /* ἉΙ (3->5), +2 bytes, +1 chars */
    "ᾂ", /* ἊΙ (3->5), +2 bytes, +1 chars */
    "ᾃ", /* ἋΙ (3->5), +2 bytes, +1 chars */
    "ᾄ", /* ἌΙ (3->5), +2 bytes, +1 chars */
    "ᾅ", /* ἍΙ (3->5), +2 bytes, +1 chars */
    "ᾆ", /* ἎΙ (3->5), +2 bytes, +1 chars */
    "ᾇ", /* ἏΙ (3->5), +2 bytes, +1 chars */
    "ᾈ", /* ἈΙ (3->5), +2 bytes, +1 chars */
    "ᾉ", /* ἉΙ (3->5), +2 bytes, +1 chars */
    "ᾊ", /* ἊΙ (3->5), +2 bytes, +1 chars */
    "ᾋ", /* ἋΙ (3->5), +2 bytes, +1 chars */
    "ᾌ", /* ἌΙ (3->5), +2 bytes, +1 chars */
    "ᾍ", /* ἍΙ (3->5), +2 bytes, +1 chars */
    "ᾎ", /* ἎΙ (3->5), +2 bytes, +1 chars */
    "ᾏ", /* ἏΙ (3->5), +2 bytes, +1 chars */
    "ᾐ", /* ἨΙ (3->5), +2 bytes, +1 chars */
    "ᾑ", /* ἩΙ (3->5), +2 bytes, +1 chars */
    "ᾒ", /* ἪΙ (3->5), +2 bytes, +1 chars */
    "ᾓ", /* ἫΙ (3->5), +2 bytes, +1 chars */
    "ᾔ", /* ἬΙ (3->5), +2 bytes, +1 chars */
    "ᾕ", /* ἭΙ (3->5), +2 bytes, +1 chars */
    "ᾖ", /* ἮΙ (3->5), +2 bytes, +1 chars */
    "ᾗ", /* ἯΙ (3->5), +2 bytes, +1 chars */
    "ᾘ", /* ἨΙ (3->5), +2 bytes, +1 chars */
    "ᾙ", /* ἩΙ (3->5), +2 bytes, +1 chars */
    "ᾚ", /* ἪΙ (3->5), +2 bytes, +1 chars */
    "ᾛ", /* ἫΙ (3->5), +2 bytes, +1 chars */
    "ᾜ", /* ἬΙ (3->5), +2 bytes, +1 chars */
    "ᾝ", /* ἭΙ (3->5), +2 bytes, +1 chars */
    "ᾞ", /* ἮΙ (3->5), +2 bytes, +1 chars */
    "ᾟ", /* ἯΙ (3->5), +2 bytes, +1 chars */
    "ᾠ", /* ὨΙ (3->5), +2 bytes, +1 chars */
    "ᾡ", /* ὩΙ (3->5), +2 bytes, +1 chars */
    "ᾢ", /* ὪΙ (3->5), +2 bytes, +1 chars */
    "ᾣ", /* ὫΙ (3->5), +2 bytes, +1 chars */
    "ᾤ", /* ὬΙ (3->5), +2 bytes, +1 chars */
    "ᾥ", /* ὭΙ (3->5), +2 bytes, +1 chars */
    "ᾦ", /* ὮΙ (3->5), +2 bytes, +1 chars */
    "ᾧ", /* ὯΙ (3->5), +2 bytes, +1 chars */
    "ᾨ", /* ὨΙ (3->5), +2 bytes, +1 chars */
    "ᾩ", /* ὩΙ (3->5), +2 bytes, +1 chars */
    "ᾪ", /* ὪΙ (3->5), +2 bytes, +1 chars */
    "ᾫ", /* ὫΙ (3->5), +2 bytes, +1 chars */
    "ᾬ", /* ὬΙ (3->5), +2 bytes, +1 chars */
    "ᾭ", /* ὭΙ (3->5), +2 bytes, +1 chars */
    "ᾮ", /* ὮΙ (3->5), +2 bytes, +1 chars */
    "ᾯ", /* ὯΙ (3->5), +2 bytes, +1 chars */
    "ᾲ", /* ᾺΙ (3->5), +2 bytes, +1 chars */
    "ῂ", /* ῊΙ (3->5), +2 bytes, +1 chars */
    "ῲ", /* ῺΙ (3->5), +2 bytes, +1 chars */
    "ŉ", /* ʼN (2->3), +1 bytes, +1 chars */
    "ǰ", /* J̌ (2->3), +1 bytes, +1 chars */
    "ὐ", /* Υ̓ (3->4), +1 bytes, +1 chars */
    "ᾳ", /* ΑΙ (3->4), +1 bytes, +1 chars */
    "ᾴ", /* ΆΙ (3->4), +1 bytes, +1 chars */
    "ᾶ", /* Α͂ (3->4), +1 bytes, +1 chars */
    "ᾼ", /* ΑΙ (3->4), +1 bytes, +1 chars */
    "ῃ", /* ΗΙ (3->4), +1 bytes, +1 chars */
    "ῄ", /* ΉΙ (3->4), +1 bytes, +1 chars */
    "ῆ", /* Η͂ (3->4), +1 bytes, +1 chars */
    "ῌ", /* ΗΙ (3->4), +1 bytes, +1 chars */
    "ῖ", /* Ι͂ (3->4), +1 bytes, +1 chars */
    "ῤ", /* Ρ̓ (3->4), +1 bytes, +1 chars */
    "ῦ", /* Υ͂ (3->4), +1 bytes, +1 chars */
    "ῳ", /* ΩΙ (3->4), +1 bytes, +1 chars */
    "ῴ", /* ΏΙ (3->4), +1 bytes, +1 chars */
    "ῶ", /* Ω͂ (3->4), +1 bytes, +1 chars */
    "ῼ", /* ΩΙ (3->4), +1 bytes, +1 chars */
    "ﬓ", /* ՄՆ (3->4), +1 bytes, +1 chars */
    "ﬔ", /* ՄԵ (3->4), +1 bytes, +1 chars */
    "ﬕ", /* ՄԻ (3->4), +1 bytes, +1 chars */
    "ﬖ", /* ՎՆ (3->4), +1 bytes, +1 chars */
    "ﬗ", /* ՄԽ (3->4), +1 bytes, +1 chars */
];
"""
Copyright (c) 2024 Rendello
Permission to use, copy, modify, and/or distribute this software for any
purpose with or without fee is hereby granted.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,
INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
"""
import sys
from dataclasses import dataclass
from typing import List, Dict
@dataclass
class Entry:
    """A case mapping from string ``a`` to string ``b`` with its size metrics.

    Attributes:
        a: Source string.
        b: Case-mapped form of ``a``.
        a_len: Length of ``a`` in UTF-8 bytes.
        b_len: Length of ``b`` in UTF-8 bytes.
        delta: ``b_len - a_len``.
        a_char_count: Length of ``a`` in code points.
        b_char_count: Length of ``b`` in code points.
        delta_char_count: ``b_char_count - a_char_count``.
    """

    a: str
    b: str
    a_len: int
    b_len: int
    delta: int
    a_char_count: int
    b_char_count: int
    delta_char_count: int
def sort_entries(l: "List[Entry]") -> "List[Entry]":
    """Return a new list of entries in output order.

    Entries are ordered by character-count delta (largest first), then by
    byte delta (largest first), then by the source string ``a`` ascending.
    The input list is not modified.
    """
    return sorted(l, key=lambda p: (-p.delta_char_count, -p.delta, p.a))
def create_entry_map() -> "Dict[str, List[Entry]]":
    """Collect every code point whose case mapping changes its UTF-8 length.

    Every code point from 0 through ``sys.maxunicode`` is upper- and
    lowercased. Mappings that keep the same UTF-8 byte length are skipped.

    Returns:
        A dict keyed by category name, built by joining with ``_``:
        ``uppercasing``/``lowercasing``, then ``expands``/``contracts``, then
        ``multi_char`` when the mapped string is longer than one code point.
        Each value is the list of matching entries in code-point order.
    """
    entry_map: "Dict[str, List[Entry]]" = {}
    for i in range(sys.maxunicode + 1):
        a = chr(i)
        for case, b in (("uppercasing", a.upper()), ("lowercasing", a.lower())):
            try:
                a_len = len(a.encode("utf8"))
                b_len = len(b.encode("utf8"))
            except UnicodeEncodeError:
                # Code points that cannot be encoded as UTF-8 are skipped.
                continue
            if a_len == b_len:
                continue

            delta = b_len - a_len
            a_char_count = len(a)
            b_char_count = len(b)
            delta_char_count = b_char_count - a_char_count

            # Equal lengths were skipped above, so this is a two-way choice.
            attributes = [case, "expands" if a_len < b_len else "contracts"]
            if b_char_count > 1:
                attributes.append("multi_char")

            entry = Entry(
                a, b, a_len, b_len, delta,
                a_char_count, b_char_count, delta_char_count,
            )
            entry_map.setdefault("_".join(attributes), []).append(entry)
    return entry_map
def entry_map_to_string(entry_map: "Dict[str, List[Entry]]") -> str:
    """Render the entry map as Rust source text.

    Emits a header comment followed by one ``pub const NAME: [&str; N]`` array
    per key, in ascending key order, with the key upper-cased as the constant
    name. Each element line carries a comment with the mapped string, the
    UTF-8 byte lengths, the byte delta and, when non-zero, the character-count
    delta. Leading and trailing whitespace is stripped from the result.
    """
    parts = [
        "// =======================================================================\n"
        "//! Automatically generated using `task generate-utf8-case-data`.\n//!\n"
        "//! Unicode characters that behave oddly when the case is changed, for use\n"
        "//! with property tests.\n"
        "// =======================================================================\n\n"
    ]
    # Sort on the key alone; the entry lists themselves are not orderable.
    for key in sorted(entry_map):
        entries = sort_entries(entry_map[key])
        parts.append(f'pub const {key.upper()}: [&str; {len(entries)}] = [\n')
        for e in entries:
            ds = f", {e.delta_char_count:+} chars" if e.delta_char_count != 0 else ""
            parts.append(
                f' "{e.a}",\t/* {e.b}\t({e.a_len}->{e.b_len}), {e.delta:+} bytes{ds} */\n'
            )
        parts.append("];\n\n")
    return "".join(parts).strip()
def generate_utf8_case_data() -> str:
    """Build the entry map and return it rendered as Rust source text."""
    return entry_map_to_string(create_entry_map())
The generation script:
https://gist.github.com/rendello/b06ca3d976d26fa011897bd1603ea044
@pannous I don't think so, although I'm not sure I understand what you mean
See also "Unicode roundtrip-unsafe characters":
https://gist.github.com/rendello/4d8266b7c52bf0e98eab2073b38829d9
Make it a Git repository instead of a gist; it will be more timeless, and people can contribute as well. Provide a Jupyter notebook, the generation script, and other goodies there.
(Gists are a terrible idea and they are made just to replace pastebin in my very own personal opinion. This nice work I see here deserves more than a "pastebin")
This nice work I see here deserves more than a "pastebin"
Thanks! I might combine this with the "Unicode roundtrip-unsafe characters" gist and make a little data file / library. Mostly, if I'm going to put more effort into it, I'm not sure which direction to go in. This is really programming-language agnostic, so I could have a generator that spits out Rust, Python, etc. Or keep it as TSVs, or maybe I should include some of the property-testing generators I created in Rust, etc. In my own project I'm using this in automated property tests, so I suppose that's one avenue.
a generator that spits out Rust, Python, etc
You can make it in the language that's more fluent to you and it will be more helpful for your own purposes and let others contribute with other languages. Something like a mega git that has directories "Rust", "Python", "Go", "C" and let other people port it to the languages that they like.
I do such gits and called them "various", for example, "monthy-hall-various" and there I have different implementations of different languages. Also some famous projects like the "cern-httpd" (the first server ever) have such structure but with platforms instead of languages - so it's not so much just my own kind of taste.
I could contribute with some languages.
so I could have a generator that spits out Rust, Python, etc.
As for generators, I wouldn't trust a generator so much to do that work for me. I know a makefile can work miracles, but still... different languages, different worlds.
@rept0id If you look at the included generate_utf8.py file, it's creating the whole list. Same with the "Unicode Roundtrip" Gist I linked. It's all automatic anyway, so changing the language generator would just be changing the output string format. The main issue would be structuring the repo: should it be just the generators and have the "outputs" be "releases"? Or should the outputs live beside the generators? I feel like having two languages in the same repo might not be useful, but at the same time I might like to use this test code for both Python and Rust.
Perhaps the best solution would be to have the generator files, and have them generate the files in the repo itself so they're easily visible, with the caveat in comment form saying they're auto-generated (this is what my current project does). Then, I could potentially use the GH releases features to build libraries for Python, Rust, etc. That way property-based testing generators (a different kind of generator, basically a type containing random values) could be bundled in.
I don't know 😆
Feel free to remix this code yourself too, the licence is in the files and is basically "do anything".
For the case of contractions, does UTF-8 contain a control character that can be skipped, so that the whole sequence does not need to be shifted?