Created
April 18, 2014 09:25
-
-
Save sagotch/11033905 to your computer and use it in GitHub Desktop.
Convert Unicode escaped \uXXXX sequence to utf-8 encoded string
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(*
 * Imported from https://gist.github.com/sagotch/
 *
 * For more information about the Unicode to UTF-8 conversion method used, see:
 * http://www.ietf.org/rfc/rfc3629.txt (page 3, section "3. UTF-8 definition")
 *
 * Decimal values of the binary byte prefixes used:
 * 10000000 -> 128; 11000000 -> 192; 11100000 -> 224
 *)
(**
 * Convert the XXXX part of a Unicode escaped \uXXXX sequence to a UTF-8
 * encoded string.
 *
 * [u] XXXX part of the Unicode sequence, as a hexadecimal string. It is
 * parsed with [int_of_string] after prepending the 0x prefix.
 *
 * Returns the one, two or three byte UTF-8 encoding of the decoded value.
 *
 * @raise Failure if [u] is rejected by [int_of_string], or if the decoded
 * value lies outside the range 0x0000 - 0xFFFF.
 *
 * NOTE: values in the surrogate range 0xD800 - 0xDFFF are not rejected; they
 * are encoded like any other three byte value. Callers that receive
 * surrogate pairs have to combine them before the result is valid UTF-8.
 *)
let to_utf8 (u : string) : string =
  let dec = int_of_string ("0x" ^ u) in
  (* A very long hexadecimal input can wrap around to a negative integer, so
     the lower bound is checked as well as the upper one. *)
  if dec < 0 || dec > 0xFFFF then
    failwith ("Invalid escaped unicode \\u" ^ u)
  else begin
    (* Strings are immutable, so the bytes are accumulated in a buffer. *)
    let buf = Buffer.create 3 in
    (* [add prefix shift mask] appends one byte made of the fixed high bits
       [prefix] and the bits of [dec] selected by [mask] once [dec] has been
       shifted right by [shift]. *)
    let add prefix shift mask =
      Buffer.add_char buf (Char.chr (prefix lor ((dec lsr shift) land mask)))
    in
    if dec > 0x7FF then begin
      (* Three bytes: 1110xxxx 10xxxxxx 10xxxxxx *)
      add 0b11100000 12 0b00001111;
      add 0b10000000 6 0b00111111;
      add 0b10000000 0 0b00111111
    end else if dec > 0x7F then begin
      (* Two bytes: 110xxxxx 10xxxxxx *)
      add 0b11000000 6 0b00011111;
      add 0b10000000 0 0b00111111
    end else
      (* One byte: 0xxxxxxx *)
      add 0b00000000 0 0b01111111;
    Buffer.contents buf
  end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment