Skip to content

Instantly share code, notes, and snippets.

@zbraniecki
Last active February 7, 2024 01:26
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save zbraniecki/9efa7e4d03d1bfb531e8b0af2010f6c8 to your computer and use it in GitHub Desktop.
Save zbraniecki/9efa7e4d03d1bfb531e8b0af2010f6c8 to your computer and use it in GitHub Desktop.
Hermes ICU4X
use widestring::U16CString;
use icu_locid::LanguageIdentifier;
/// Round-trips a UTF-16 encoded locale identifier through
/// [`LanguageIdentifier`].
///
/// The code units in `input` are decoded into a `String`, parsed as a
/// `LanguageIdentifier`, and the identifier's `to_string()` output is
/// re-encoded as UTF-16 and returned.
///
/// # Panics
///
/// Panics if `input` is not valid UTF-16 or if the decoded text does not
/// parse as a `LanguageIdentifier`.
fn canonicalize_utf16_locale(input: &[u16]) -> Vec<u16> {
    let decoded = String::from_utf16(input).unwrap();
    let lang_id = decoded.parse::<LanguageIdentifier>().unwrap();
    lang_id.to_string().encode_utf16().collect()
}
/// C entry point that canonicalizes the UTF-16 locale identifier at `input`.
///
/// `input` must point to `len` readable `u16` code units; no terminator is
/// required. On success the return value is a newly allocated,
/// NUL-terminated UTF-16 string whose ownership passes to the caller; it must
/// be released with `free_canonicalized_locale`.
///
/// Returns a null pointer when `input` is null.
///
/// # Panics
///
/// Any panic raised by `canonicalize_utf16_locale` (invalid UTF-16 or an
/// unparsable identifier) is not caught here.
#[no_mangle]
pub extern "C" fn canonicalize_locale(input: *const u16, len: usize) -> *mut u16 {
    // `slice::from_raw_parts` requires a non-null pointer even when
    // `len == 0`, so a null input is rejected before building the slice.
    if input.is_null() {
        return std::ptr::null_mut();
    }
    // SAFETY: `input` is non-null (checked above); the caller guarantees it
    // points to `len` initialized, properly aligned `u16` values that remain
    // valid for the duration of this call.
    let input_slice = unsafe { std::slice::from_raw_parts(input, len) };
    let output = canonicalize_utf16_locale(input_slice);
    // SAFETY: `from_vec_unchecked` requires a vector without interior NULs.
    // `output` is the UTF-16 encoding of a formatted `LanguageIdentifier`,
    // which is assumed never to contain a NUL code unit — TODO confirm.
    unsafe { U16CString::from_vec_unchecked(output) }.into_raw()
}
/// Releases a string handed out by `canonicalize_locale`.
///
/// A null `ptr` is ignored. Any other pointer is turned back into a
/// `U16CString`, which is then dropped.
#[no_mangle]
pub extern "C" fn free_canonicalized_locale(ptr: *mut u16) {
    if ptr.is_null() {
        return;
    }
    // SAFETY (caller contract): `ptr` is assumed to come from
    // `U16CString::into_raw` and not to have been freed already.
    // Dropping the rebuilt `U16CString` frees the memory.
    drop(unsafe { U16CString::from_raw(ptr) });
}
// https://github.com/facebook/hermes/blob/main/lib/Platform/Intl/PlatformIntlApple.mm#L319-L353
//
// Canonicalizes every tag in `locales` and returns the canonical tags in
// first-seen order with duplicates removed. Raises a RangeError on `runtime`
// (and returns its result) for the first tag that fails to parse.
vm::CallResult<std::vector<std::u16string>> canonicalizeLocaleList(
    vm::Runtime &runtime,
    const std::vector<std::u16string> &locales) {
  // 1. If locales is undefined, then
  // a. Return a new empty List
  // Not needed, this validation occurs closer to VM in 'normalizeLocales'.
  // 2. Let seen be a new empty List.
  std::vector<std::u16string> seen;
  // 3. If Type(locales) is String or Type(locales) is Object and locales has an
  // [[InitializedLocale]] internal slot, then
  // 4. Else
  // We don't yet support Locale object -
  // https://402.ecma-international.org/8.0/#locale-objects As of now, 'locales'
  // can only be a string list/array. Validation occurs in normalizeLocaleList,
  // so this function just takes a vector of strings.
  // 5. Let len be ? ToLength(? Get(O, "length")).
  // 6. Let k be 0.
  // 7. Repeat, while k < len
  for (const auto &locale : locales) {
    // 7.c.vi. Let canonicalizedTag be CanonicalizeUnicodeLocaleId(tag).
    auto parsedOpt = ParsedLocaleIdentifier::parse(locale);
    if (!parsedOpt)
      return runtime.raiseRangeError(
          vm::TwineChar16("Invalid language tag: ") +
          vm::TwineChar16(locale.c_str()));
    auto canonicalizedTag = parsedOpt->canonicalize();
    // 7.c.vii. If canonicalizedTag is not an element of seen, append
    // canonicalizedTag as the last element of seen.
    if (std::find(seen.begin(), seen.end(), canonicalizedTag) == seen.end()) {
      seen.push_back(std::move(canonicalizedTag));
    }
  }
  return seen;
}
// Declarations of the functions exported with C linkage by the Rust shim.
extern "C" {
// Returns a NUL-terminated UTF-16 string that must be released with
// free_canonicalized_locale. `len` is a `usize` on the Rust side, so it is
// declared as `size_t` here; `int` would have a different width on 64-bit
// targets and would silently narrow `std::u16string::length()`.
const char16_t* canonicalize_locale(const char16_t* input, size_t len);
// Frees a string returned by canonicalize_locale.
void free_canonicalized_locale(char16_t* ptr);
}
// Canonicalizes each tag in `locales` through the `canonicalize_locale` FFI
// function and returns the results in first-seen order with duplicates
// removed. Raises a RangeError on `runtime` if the FFI call yields no string.
vm::CallResult<std::vector<std::u16string>> getCanonicalLocales(
    vm::Runtime &runtime,
    const std::vector<std::u16string> &locales) {
  std::vector<std::u16string> seen;
  for (const auto &loc : locales) {
    auto new_locale_ptr = canonicalize_locale(loc.c_str(), loc.length());
    // Constructing a std::u16string from a null pointer is undefined
    // behaviour, so a null result is reported with the same RangeError that
    // canonicalizeLocaleList raises for an unparsable tag.
    if (!new_locale_ptr)
      return runtime.raiseRangeError(
          vm::TwineChar16("Invalid language tag: ") +
          vm::TwineChar16(loc.c_str()));
    // Copy the tag out of the foreign buffer, then release the buffer right
    // away so it is not leaked if the push_back below throws.
    std::u16string canonicalizedTag(new_locale_ptr);
    free_canonicalized_locale(const_cast<char16_t*>(new_locale_ptr));
    if (std::find(seen.begin(), seen.end(), canonicalizedTag) == seen.end()) {
      seen.push_back(std::move(canonicalizedTag));
    }
  }
  return seen;
}
Apple:
~/projects/hermes〉time ./build_release/bin/hermes ./test.js
1.85 real 1.82 user 0.00 sys
ICU4X:
~/projects/hermes〉time ./build_release/bin/hermes ./test.js
2.33 real 2.29 user 0.00 sys
// Benchmark driver (presumably the test.js timed above — confirm): calls
// Intl.getCanonicalLocales one million times with the same ten-tag list.
// The result is assigned but never read; only the call cost is of interest.
for (let i = 0; i < 1000000; i++) {
let result = Intl.getCanonicalLocales(["pl-pl", "de-de", "it-IT", "sr-Cyrl", "ja", "en-Latn-us", "de-at", "es-419", "und", "zh-CN"]);
}
@zbraniecki
Copy link
Author

zbraniecki commented Feb 7, 2024

Apple: 1.82

Split of ICU4X:

  1. no-op - 0.63

  2. Just iterating over locales and writing to seen if not seen - 0.80

  3. Allocating Vec<u16> in Rust from the passed &[u16] - 0.94

  4. Executing U16CString::from_vec_unchecked(Vec<u16>).into_raw() -> *mut u16 - 1.43

  5. Assembling std::u16string from *mut u16 - 1.47

  6. Adding the free_canonicalized_locale call - 1.58

  7. Adding String::from_utf16(&[u16]) in canonicalize_utf16_locale - 1.85

  8. Adding LanguageIdentifier parsing from String in canonicalize_utf16_locale - 2.07

  9. Adding U16CString::from_vec_unchecked(Vec<u16>).into_raw() -> *mut u16 in canonicalize_utf16_locale - 2.30

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment