Skip to content

Instantly share code, notes, and snippets.

@yawaramin
Last active August 26, 2023 13:44
Show Gist options
  • Save yawaramin/928bf9c7c4c99d441a5fa1e97e261685 to your computer and use it in GitHub Desktop.
Save yawaramin/928bf9c7c4c99d441a5fa1e97e261685 to your computer and use it in GitHub Desktop.
Example of encoding Unicode code points as a UTF-8 string and taking its length in code points
(* Re: https://twitter.com/llaisdy/status/1558536851560054786?s=20&t=us8J3LvoJTlwVwod5bOqeQ *)
(* The #require directive below only works in the REPL (e.g. utop). When compiling, remove it and declare uutf as a library dependency in dune instead. *)
#require "uutf";;
(** [utf8_to_string uchars] is the UTF-8 encoding of the integers in [uchars],
    concatenated in array order.

    Despite the name, the input is not UTF-8: each element is an integer code
    point that is converted with [Uchar.of_int], so it must be a Unicode
    scalar value. These are code points, not grapheme clusters.

    @raise Invalid_argument if an element is rejected by [Uchar.of_int]. *)
let utf8_to_string uchars =
  (* The size is only a starting capacity; the buffer grows on demand. *)
  let out = Buffer.create (Array.length uchars * 2) in
  let append_code_point code =
    code |> Uchar.of_int |> Uutf.Buffer.add_utf_8 out
  in
  Array.iter append_code_point uchars;
  Buffer.contents out
(** [utf8_len s] is the number of Unicode scalar values (code points) encoded
    in the UTF-8 string [s].

    This is not a count of user-perceived characters: a grapheme cluster made
    of several code points (for example a base letter followed by a combining
    mark) contributes one per code point.

    @raise Invalid_argument if [s] contains a malformed UTF-8 sequence; the
    message names this function and gives the offending bytes (escaped) and
    the byte offset at which they start. *)
let utf8_len s =
  let count_uchar count pos = function
    | `Uchar _ -> succ count
    | `Malformed bytes ->
      (* [bytes] is raw, possibly unprintable input: escape it and report
         where it starts instead of using it verbatim as the message. *)
      invalid_arg
        (Printf.sprintf "utf8_len: malformed UTF-8 sequence %S at byte %d"
           bytes pos)
  in
  Uutf.String.fold_utf_8 count_uchar 0 s
(* Now we can replicate the examples in the other languages: *)
(* Five code points, all outside the Basic Multilingual Plane. *)
let s =
  [| 0x25105; 0x26159; 0x33521; 0x22269; 0x20154 |]
  |> utf8_to_string (* "𥄅𦅙𳔡𢉩𠅔" *)

(* Number of code points in [s], not its length in bytes. *)
let len = utf8_len s (* 5 *)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment