Created
October 25, 2011 12:25
-
-
Save meijeru/1312555 to your computer and use it in GitHub Desktop.
Encoding/decoding between Unicode codepoints and UTF-8 binary strings
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
REBOL [] | |
utf8-to-codepoint: func [
    ; Decodes ONE UTF-8 encoded character to its Unicode codepoint.
    ; Yields an integer >= 0 and < 1114112, or none when the bytes are not
    ; a well-formed sequence: wrong length, bad lead or continuation byte,
    ; overlong (non-shortest) form, or a value of 1114112 and above.
    ; Surrogate codepoints (55296 to 57343) are NOT rejected here.
    u [binary!] ; should have length 1 to 4
    /local b1 b2 b3 b4 cp
][
    switch/default length? u [
        1 [
            ; single byte: plain ASCII, high bit must be clear
            b1: u/1
            either b1 < 128 [b1] [none]
        ]
        2 [
            ; 110xxxxx 10xxxxxx
            b1: u/1 - 192
            b2: u/2 - 128
            either all [
                b1 >= 2 b1 < 32 ; b1 of 0 or 1 would be an overlong form of a value < 128
                b2 >= 0 b2 < 64
            ][
                (shift/left b1 6) or b2
            ][
                none
            ]
        ]
        3 [
            ; 1110xxxx 10xxxxxx 10xxxxxx
            b1: u/1 - 224
            b2: u/2 - 128
            b3: u/3 - 128
            either all [
                b1 >= 0 b1 < 16
                b2 >= 0 b2 < 64
                b3 >= 0 b3 < 64
            ][
                cp: (shift/left b1 12) or (shift/left b2 6) or b3
                ; anything below 2048 fits in two bytes, so this form would be overlong
                either cp >= 2048 [cp] [none]
            ][
                none
            ]
        ]
        4 [
            ; 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            b1: u/1 - 240
            b2: u/2 - 128
            b3: u/3 - 128
            b4: u/4 - 128
            either all [
                b1 >= 0 b1 < 8
                b2 >= 0 b2 < 64
                b3 >= 0 b3 < 64
                b4 >= 0 b4 < 64
            ][
                cp: (shift/left b1 18) or (shift/left b2 12) or (shift/left b3 6) or b4
                ; lower bound rejects overlong forms, upper bound enforces the
                ; documented limit (the bit pattern alone reaches 2097151)
                either all [cp >= 65536 cp < 1114112] [cp] [none]
            ][
                none
            ]
        ]
    ][
        ; empty input or more than four bytes
        none
    ]
]
codepoint-to-utf8: func [
    ; Encodes a Unicode codepoint as UTF-8.
    ; Yields a freshly allocated binary value of length 1 to 4, or none
    ; when cp is negative or >= 1114112.
    ; Surrogate codepoints (55296 to 57343) are NOT rejected here.
    cp [integer!] ; should be >= 0 and < 1114112
    /local b
][
    case [
        cp < 0 [
            b: none
        ]
        cp < 128 [
            ; 0xxxxxxx
            b: to-binary to-char cp
        ]
        cp < 2048 [
            ; 110xxxxx 10xxxxxx
            ; copy is required: a literal binary in the body is shared by all
            ; calls, so poking into it would overwrite earlier results
            b: copy #{0000}
            b/1: to-char (shift cp 6) or 192
            b/2: to-char cp and 63 or 128
        ]
        cp < 65536 [
            ; 1110xxxx 10xxxxxx 10xxxxxx
            b: copy #{000000}
            b/1: to-char (shift cp 12) or 224
            b/2: to-char (shift cp and 4095 6) or 128
            b/3: to-char cp and 63 or 128
        ]
        cp < 1114112 [
            ; 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            b: copy #{00000000}
            b/1: to-char (shift cp 18) or 240
            b/2: to-char (shift cp and 262143 12) or 128
            b/3: to-char (shift cp and 4095 6) or 128
            b/4: to-char cp and 63 or 128
        ]
        true [
            b: none
        ]
    ]
    b
]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment