Skip to content

Instantly share code, notes, and snippets.

@zihengCat
Last active January 30, 2018 19:00
Show Gist options
  • Save zihengCat/1a80a31b671e5bb3db6eb7af5a4b30f7 to your computer and use it in GitHub Desktop.
Save zihengCat/1a80a31b671e5bb3db6eb7af5a4b30f7 to your computer and use it in GitHub Desktop.
UTF-8 (ANSI C)
/*
 * Report how many bytes a UTF-8 sequence occupies, judging only by its
 * first (lead) byte.
 *
 * Parameters:
 *   p_input  points at the lead byte of the sequence; only that single
 *            byte is read.
 *
 * Returns:
 *   1..6 for a recognised lead byte, or -1 when the byte cannot start a
 *   sequence (a continuation byte 10xxxxxx, or 0xFE / 0xFF).
 *
 * Lead-byte patterns:
 *   0xxxxxxx --> 1
 *   10xxxxxx --> -1 (continuation byte, invalid as a lead byte)
 *   110xxxxx --> 2
 *   1110xxxx --> 3
 *   11110xxx --> 4
 *   111110xx --> 5
 *   1111110x --> 6
 *   1111111x --> -1 (0xFE / 0xFF match no pattern above)
 */
int get_utf8_size(const unsigned char *p_input) {
    unsigned char c = *p_input; /* lead byte of the sequence */

    /* The ranges are contiguous, so each test only needs its upper bound. */
    if (c < 0x80) { return 1; }
    if (c < 0xC0) { return -1; }
    if (c < 0xE0) { return 2; }
    if (c < 0xF0) { return 3; }
    if (c < 0xF8) { return 4; }
    if (c < 0xFC) { return 5; }
    if (c < 0xFE) { return 6; }
    return -1; /* 0xFE and 0xFF */
}
/*****************************************************************************
 * Convert one UTF-8 encoded character to its Unicode (UCS-4) code point.
 *
 * Parameters:
 *   p_input   points to the input buffer holding the character's UTF-8
 *             encoding.
 *   p_output  points to the output; on success it receives the code point
 *             as an unsigned int value. It is reset to 0 before decoding
 *             and stays 0 on failure.
 *
 * Returns:
 *   On success, the number of bytes the character occupies in UTF-8.
 *   On failure, -1: the lead byte is rejected by get_utf8_size(), or one
 *   of the following bytes is not a continuation byte (10xxxxxx).
 *
 * Notes:
 *   1. The code point is assembled arithmetically and stored as a native
 *      unsigned int, so the result does not depend on host byte order.
 *   2. Continuation bytes are checked one at a time and decoding stops at
 *      the first bad one, so a sequence cut short by a NUL terminator is
 *      rejected without reading past the terminator.
 *   3. Overlong encodings and surrogate code points are not rejected.
 ****************************************************************************/
#define UTF8_MAX_BYTES 6
int utf8_to_unicode(const unsigned char *p_input, unsigned int *p_output){
    /* Payload bits carried by the lead byte, indexed by sequence length. */
    static const unsigned char lead_mask[UTF8_MAX_BYTES + 1] = {
        0x00, 0x7F, 0x1F, 0x0F, 0x07, 0x03, 0x01
    };

    *p_output = 0x0;

    int size = get_utf8_size(p_input);
    if (size < 1 || size > UTF8_MAX_BYTES) {
        return -1;
    }

    unsigned int code_point = (unsigned int)(p_input[0] & lead_mask[size]);
    for (int i = 1; i < size; i++) {
        /* Every byte after the lead byte must look like 10xxxxxx. */
        if ((p_input[i] & 0xC0) != 0x80) {
            return -1;
        }
        /* Each continuation byte contributes six more low-order bits. */
        code_point = (code_point << 6) | (unsigned int)(p_input[i] & 0x3F);
    }

    *p_output = code_point;
    return size;
}
#undef UTF8_MAX_BYTES
@zihengCat
Copy link
Author

zihengCat commented Aug 23, 2017

表 1. UTF-8 编码规则

UTF-8 编码字节数 Unicode 符号范围 UTF-8 编码方式
n (Bytes) 十六进制 (Hexadecimal) 二进制 (Binary)
1 0000 0000 - 0000 007F 0xxxxxxx
2 0000 0080 - 0000 07FF 110xxxxx 10xxxxxx
3 0000 0800 - 0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
4 0001 0000 - 001F FFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
5 0020 0000 - 03FF FFFF 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
6 0400 0000 - 7FFF FFFF 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx

RFC3629: UTF-8, a transformation format of ISO 10646

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment