Skip to content

Instantly share code, notes, and snippets.

@Liutos
Created October 19, 2013 14:31
Show Gist options
  • Save Liutos/7056664 to your computer and use it in GitHub Desktop.
Save Liutos/7056664 to your computer and use it in GitHub Desktop.
在UTF-8和Code Point之间转换的代码及配套工具
#include <stdio.h>
#include <stdlib.h>
// Most significant bit of an 8-bit byte.
#define MASK 0x80
// Count the consecutive 1 bits at the top of a byte, starting from the most
// significant bit. For a UTF-8 lead byte this is the length of the encoded
// sequence: 0 means a single-byte (ASCII) character, 1 means the byte is a
// continuation byte, 2..4 are multi-byte lead bytes.
//
// The byte is examined as an unsigned 8-bit value, so the result does not
// depend on whether plain char is signed on the target platform, and no
// negative value is ever left-shifted.
int count1(char byte) {
    unsigned char bits = (unsigned char)byte;
    int count = 0;
    while ((bits & MASK) == MASK) {
        count++;
        // Truncate back to 8 bits so the loop ends after at most 8 rounds.
        bits = (unsigned char)(bits << 1);
    }
    return count;
}
// Return the number formed by the lowest n bits of a byte.
//
// The byte is treated as an unsigned 8-bit value, so sign extension of a
// negative char never leaks into the result.
//   n <= 0  -> 0
//   n >= 8  -> the whole byte (0..255)
int get_low_bits(char byte, int n) {
    unsigned int value = (unsigned char)byte;
    if (n <= 0) {
        return 0;
    }
    if (n >= 8) {
        return (int)value;
    }
    // 1 <= n <= 7 here, so the shift is well within the width of unsigned.
    return (int)(value & ((1u << n) - 1u));
}
// Print a byte to stdout in binary, followed by a newline. Leading zeros are
// not printed (no width alignment); a zero byte prints as "0".
//
// The byte is handled as an unsigned 8-bit value: shifting a negative signed
// char to the right may never reach zero, which would loop forever.
void print_binary(char byte) {
    unsigned char bits = (unsigned char)byte;
    if (bits == 0) {
        puts("0");
        return;
    }
    int started = 0;  // becomes 1 once the highest set bit has been seen
    for (int shift = 7; shift >= 0; shift--) {
        int bit = (bits >> shift) & 0x1;
        if (bit) {
            started = 1;
        }
        if (started) {
            printf("%d", bit);
        }
    }
    putchar('\n');
}
// Extract the code point of one UTF-8 encoded character.
//
// str points at the first byte of the encoded character. The decoding follows
// the bit layout table of the UTF-8 encoding:
//   0xxxxxxx                             -> 1 byte
//   110xxxxx 10xxxxxx                    -> 2 bytes
//   1110xxxx 10xxxxxx 10xxxxxx           -> 3 bytes
//   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx  -> 4 bytes
//
// Returns the code point, or -1 when str is NULL, when the first byte is not
// a valid lead byte (a continuation byte or a 5+ byte lead), or when one of
// the following bytes is not a continuation byte. Checking the continuation
// bytes also stops the decoder at the terminating NUL of a truncated string.
int get_code_point(char *str) {
    if (str == NULL) {
        return -1;
    }
    // Work on unsigned bytes so the result does not depend on char signedness.
    const unsigned char *bytes = (const unsigned char *)str;
    unsigned char lead = bytes[0];
    int length;
    int cp;

    if (lead < 0x80) {
        return lead;
    } else if ((lead & 0xE0) == 0xC0) {
        length = 2;
        cp = lead & 0x1F;
    } else if ((lead & 0xF0) == 0xE0) {
        length = 3;
        cp = lead & 0x0F;
    } else if ((lead & 0xF8) == 0xF0) {
        length = 4;
        cp = lead & 0x07;
    } else {
        return -1;
    }

    // Each continuation byte contributes its low 6 bits.
    for (int i = 1; i < length; i++) {
        unsigned char next = bytes[i];
        if ((next & 0xC0) != 0x80) {
            return -1;
        }
        cp = (cp << 6) | (next & 0x3F);
    }
    return cp;
}
// Convert a code point, given as an unsigned integer, to its UTF-8 encoding.
// The encoding follows the bit layout table of UTF-8 (1 to 4 bytes).
//
// Returns a heap buffer holding the encoded bytes followed by a NUL
// terminator; the caller owns it and must free() it.
//
// Code points of 0x200000 and above cannot be encoded in 4 bytes: the
// function prints an error message and terminates the process with exit(1).
// The same happens when the allocation fails.
// NOTE(review): values in 0x110000..0x1FFFFF are still encoded although they
// are outside the Unicode range; kept for backward compatibility.
char *code_point_to_utf8(unsigned int cp) {
    size_t length;
    unsigned int lead_marker;  // high bits that announce the sequence length

    if (cp < 0x80) {
        length = 1;
        lead_marker = 0x00;
    } else if (cp < 0x0800) {
        length = 2;
        lead_marker = 0xC0;
    } else if (cp < 0x10000) {
        length = 3;
        lead_marker = 0xE0;
    } else if (cp < 0x200000) {
        length = 4;
        lead_marker = 0xF0;
    } else {
        printf("Error happens...");
        exit(1);
    }

    // One extra zeroed byte so the result is a valid C string.
    char *str = calloc(length + 1, sizeof *str);
    if (str == NULL) {
        printf("Error happens...");
        exit(1);
    }

    // Fill continuation bytes from the end, 6 payload bits each.
    for (size_t i = length - 1; i > 0; i--) {
        str[i] = (char)(0x80 | (cp & 0x3F));
        cp = cp >> 6;
    }
    // The range checks above guarantee the remaining bits fit the lead byte.
    str[0] = (char)(lead_marker | cp);
    return str;
}
// Small demo of the helpers above. The text after each "=>" marker shows the
// expected output of that line.
int main(int argc, char *argv[]) {
    (void)argc;
    (void)argv;

    char *str = "汉";
    printf("count1('汉') == %d\n", count1(*str)); // => 3
    print_binary(get_low_bits(8, 8)); // => 1000
    print_binary(get_low_bits(13, 3)); // => 101
    printf("code point of %s is %x\n", str, get_code_point(str)); // => 6c49
    printf("code point of %s is %d\n", str, get_code_point(str)); // => 27721

    // The encoder returns a heap buffer; keep it in its own variable so it
    // can be released once it has been printed.
    char *encoded = code_point_to_utf8(27721);
    int len = count1(encoded[0]);
    if (len == 0) {
        len = 1;  // a lead byte with no leading 1 bits is a 1-byte character
    }
    for (int i = 0; i < len; i++) {
        printf("%c", encoded[i]);
    } // => 汉
    putchar('\n');
    free(encoded);
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment