Skip to content

Instantly share code, notes, and snippets.

@sixtyfive
Last active July 13, 2022 12:20
Show Gist options
  • Save sixtyfive/0b951090eab449f146e3727ab49e0ea1 to your computer and use it in GitHub Desktop.
Save sixtyfive/0b951090eab449f146e3727ab49e0ea1 to your computer and use it in GitHub Desktop.
# relevant?
# https://unicode.org/reports/tr15/
# ↓ (./string.c)
/*
 * Thin wrapper: forwards argc/argv and the receiver string unchanged to
 * unicode_normalize_common(), selecting the Ruby-level method via id_normalize.
 * Presumably the C function behind String#unicode_normalize -- the method
 * registration is not part of this excerpt.
 */
static VALUE
rb_str_unicode_normalize(int argc, VALUE *argv, VALUE str)
{
return unicode_normalize_common(argc, argv, str, id_normalize);
}
# ↓ (./string.c)
/*
 * Shared helper: loads the Ruby-level normalizer on first use, then calls the
 * method named by `id` on mUnicodeNormalize with `str` as the first argument,
 * followed by the caller's single optional argument (if one was given).
 * Returns whatever that Ruby method returns.
 */
static VALUE
unicode_normalize_common(int argc, VALUE *argv, VALUE str, ID id)
{
/* Function-local static flag: the rb_require below runs only on the first call. */
static int UnicodeNormalizeRequired = 0;
/* Slot 0 holds the string, slot 1 the optional caller argument. */
VALUE argv2[2];
if (!UnicodeNormalizeRequired) {
rb_require("unicode_normalize/normalize.rb");
UnicodeNormalizeRequired = 1;
}
argv2[0] = str;
/* rb_check_arity raises unless 0 <= argc <= 1 and otherwise returns argc,
 * so argv[0] is copied only when the caller actually passed an argument. */
if (rb_check_arity(argc, 0, 1)) argv2[1] = argv[0];
/* argc+1: the caller's arguments plus the prepended string. */
return rb_funcallv(mUnicodeNormalize, id, argc+1, argv2);
}
# ↓ (./lib/unicode_normalize/normalize.rb)
# Excerpt of the Ruby-side normalizer. Lines reading `# ...` mark code that was
# omitted from this excerpt, so the module and method are not closed here.
module UnicodeNormalize
# ...
# Dispatches first on the string's encoding, then on the requested form
# (default :nfc). Only the UTF-8 / :nfc path is shown; other encodings and
# forms are elided.
def self.normalize(string, form = :nfc)
encoding = string.encoding
case encoding
when Encoding::UTF_8
case form
when :nfc then
# gsub with a Hash as replacement: each substring matched by REGEXP_C is
# replaced by the value NF_HASH_C yields for that matched text.
string.gsub REGEXP_C, NF_HASH_C
# ...
end
# ...
end
# ↓
# Pattern compiled from REGEXP_C_STRING with the EXTENDED flag; used as the
# gsub pattern on the :nfc path.
REGEXP_C = Regexp.compile(REGEXP_C_STRING, Regexp::EXTENDED)
# Memoizing lookup table used as the gsub replacement: on a missing key the
# block computes nfc_one(key), stores it under that key, and returns it.
NF_HASH_C = Hash.new do |hash, key|
# Evict one entry (Hash#shift removes the first, i.e. oldest-inserted, pair)
# once the table has grown past MAX_HASH_LENGTH, so it cannot grow unboundedly.
hash.shift if hash.length>MAX_HASH_LENGTH # prevent DoS attack
hash[key] = nfc_one(key)
end
# ↓ (lib/unicode_normalize/tables.rb)
# Source text for REGEXP_C, assembled from adjacent string literals joined by
# trailing backslashes. The empty-string interpolations contribute nothing to
# the pattern; they exist only to carry an inline comment. The visible parts
# are two alternatives: a long character class (elided in this excerpt)
# followed by one character in U+11A8..U+11C2, or the three-part sequence
# labelled "decomposed Hangul syllables" whose last part is optional.
REGEXP_C_STRING = "#{'' # composition exclusions
}" \
"[\u0340\u0341" \
"\u0343\u0344" \
# ... (goes on and on)
"\uD788" \
"][\u11A8-\u11C2]" \
"|#{'' # decomposed Hangul syllables
}" \
"[\u1100-\u1112][\u1161-\u1175][\u11A8-\u11C2]?"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment