Skip to content

Instantly share code, notes, and snippets.

@rmccullagh
Created August 20, 2015 23:25
Show Gist options
  • Save rmccullagh/636c3e6ea477383d8c79 to your computer and use it in GitHub Desktop.
Save rmccullagh/636c3e6ea477383d8c79 to your computer and use it in GitHub Desktop.
/*
* ascii whitespace:
* U+0009
* U+000A
* U+000C
* U+000D
* U+0020
*/
/*
* A code point is written as "U+" followed by four to six hexadecimal digits.
*/
/*
* Report whether val is an ASCII code point, i.e. lies in the inclusive
* range U+0000..U+007F (decimal 0..127).
* Returns 1 when it does, 0 otherwise.
*/
static int is_ascii_code_point(int val)
{
    /* below U+0000: not a code point at all */
    if(val < 0x0000) {
        return 0;
    }
    /* above U+007F: outside the ASCII range */
    if(val > 0x007F) {
        return 0;
    }
    return 1;
}
/*
* Encode a single code point as a UTF-8 byte sequence.
* https://encoding.spec.whatwg.org/#code-point
* https://encoding.spec.whatwg.org/#utf-8-encoder
*
* buf       - output buffer; up to 4 bytes are written, so the caller must
*             provide at least that much room. No terminating NUL is added.
* codepoint - value to encode, U+0000 to U+10FFFF inclusive.
*
* Returns the number of bytes written (1 to 4). Returns 0 and leaves buf
* untouched when codepoint is negative or greater than U+10FFFF.
*
* Byte layout:
*   U+0000..U+007F     1 byte,  the code point itself
*   U+0080..U+07FF     2 bytes, lead byte 0xC0 + top bits
*   U+0800..U+FFFF     3 bytes, lead byte 0xE0 + top bits
*   U+10000..U+10FFFF  4 bytes, lead byte 0xF0 + top bits
* Every byte after the lead byte is 0x80 | (next six bits), so each step
* down the sequence is a right shift by a further multiple of 6.
*
* Surrogate values (U+D800..U+DFFF) are not rejected; they are encoded
* as ordinary three-byte sequences.
*/
static int utf8encode(char* buf, int codepoint)
{
    int count;  /* number of continuation bytes that follow the lead byte */
    int offset; /* length marker added to the lead byte */
    int i;

    if(is_ascii_code_point(codepoint)) {
        buf[0] = (char)codepoint;
        return 1;
    }

    if(codepoint >= 0x0080 && codepoint <= 0x07FF) {
        count = 1;
        offset = 0xC0;
    } else if(codepoint >= 0x0800 && codepoint <= 0xFFFF) {
        count = 2;
        offset = 0xE0;
    } else if(codepoint >= 0x10000 && codepoint <= 0x10FFFF) {
        count = 3;
        offset = 0xF0;
    } else {
        /*
        * Negative or beyond U+10FFFF: not a code point. Bail out here so
        * count and offset are never read uninitialized.
        */
        return 0;
    }

    i = 0;
    /* lead byte: length marker plus the bits above the continuation bytes */
    buf[i++] = (char)((codepoint >> (6 * count)) + offset);

    /* continuation bytes, most significant six-bit group first */
    while(count > 0) {
        count--;
        buf[i++] = (char)(0x80 | ((codepoint >> (6 * count)) & 0x3F));
    }

    return i;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment