Skip to content

Instantly share code, notes, and snippets.

@sagotch
Created April 18, 2014 09:25
Show Gist options
  • Save sagotch/11033905 to your computer and use it in GitHub Desktop.
Save sagotch/11033905 to your computer and use it in GitHub Desktop.
Convert Unicode escaped \uXXXX sequence to utf-8 encoded string
(*
* Imported from https://gist.github.com/sagotch/
*
* For more information about the Unicode to UTF-8 conversion method used, see:
* http://www.ietf.org/rfc/rfc3629.txt (Page3, section "3. UTF-8 definition")
*
* decimal conversions of binary used:
* 10000000 -> 128; 11000000 -> 192; 11100000 -> 224
*)
(** [to_utf8 u] converts the [XXXX] part of a Unicode escape sequence
    [\uXXXX] into the UTF-8 encoding of that code point.

    [u] is the hexadecimal digits only, without the leading [\u]
    (e.g. ["00e9"]). The result is 1 to 3 bytes long, following the layout
    of RFC 3629 section 3:
    - [0x0000 .. 0x007F] -> [0xxxxxxx]
    - [0x0080 .. 0x07FF] -> [110xxxxx 10xxxxxx]
    - [0x0800 .. 0xFFFF] -> [1110xxxx 10xxxxxx 10xxxxxx]

    Surrogate code points ([D800] .. [DFFF]) are not rejected: they are
    encoded individually with the 3-byte layout, which is not valid UTF-8
    per RFC 3629. Callers that receive surrogate pairs must combine them
    before encoding.

    @raise Failure if [u] is not a valid hexadecimal literal (raised by
      [int_of_string]) or if the value lies outside [0 .. 0xFFFF]. *)
let to_utf8 (u : string) : string =
  let dec = int_of_string ("0x" ^ u) in
  (* One output byte: the fixed marker bits [tag] combined with the
     [mask]-wide slice of the code point starting at bit [shift]. The marker
     and the mask never overlap, so [lor] is equivalent to addition here. *)
  let byte tag shift mask = Char.chr (tag lor ((dec lsr shift) land mask)) in
  (* Continuation bytes always look like 10xxxxxx and carry 6 payload bits. *)
  let continuation shift = byte 0x80 shift 0x3F in
  (* Strings are immutable, so assemble the result in a [Bytes] buffer. *)
  let of_chars chars =
    let buf = Bytes.create (List.length chars) in
    List.iteri (Bytes.set buf) chars;
    Bytes.to_string buf
  in
  (* [int_of_string] on a "0x" literal may wrap to a negative value for
     very large inputs; treat that as out of range as well. *)
  if dec < 0 || dec > 0xFFFF then
    failwith ("Invalid escaped unicode \\u" ^ u)
  else if dec > 0x7FF then
    of_chars [ byte 0xE0 12 0x0F; continuation 6; continuation 0 ]
  else if dec > 0x7F then
    of_chars [ byte 0xC0 6 0x1F; continuation 0 ]
  else
    String.make 1 (Char.chr dec)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment