Skip to content

Instantly share code, notes, and snippets.

@ynakajima
Created February 25, 2014 09:54
Show Gist options
  • Save ynakajima/9206063 to your computer and use it in GitHub Desktop.
Save ynakajima/9206063 to your computer and use it in GitHub Desktop.
C言語で、UTF-8 の文字列から Unicode のコードポイントを取得するやりかた ref: http://qiita.com/ynakajima/items/0510115ba19ab66e6670
// Convert the UTF-8 text to UCS-4
gunichar* codepoints = g_utf8_to_ucs4_fast(utf8_text, -1, NULL);
$ brew install glib
$ gcc -Wall -I/usr/local/include/glib-2.0 -I/usr/local/lib/glib-2.0/include -lglib-2.0 -o utf8_to_codepoint utf8_to_codepoint.c
$ ./utf8_to_codepoint "UTF-8の文字列を変換"
[U+0055] [U+0054] [U+0046] [U+002D] [U+0038] [U+306E] [U+6587] [U+5B57] [U+5217] [U+3092] [U+5909] [U+63DB]
$ ./utf8_to_codepoint "𠀋𡈽𡌛𡑮"
[U+2000B] [U+2123D] [U+2131B] [U+2146E]
#include <glib.h>
/*
 * Prototype as declared by <glib.h>: converts the UTF-8 string `str` to an
 * array of UCS-4 code points (gunichar). The example program releases the
 * returned array with g_free().
 * NOTE(review): see the GLib reference for the exact meaning of `len` and
 * `items_written`; this function is documented as performing no validation
 * of its input.
 */
gunichar* g_utf8_to_ucs4_fast(const gchar *str,
glong len,
glong *items_written);
#include <stdio.h>
#include <glib.h>
int main (int argc, char* argv[]) {
// 引き数が渡されてなかったら終了
if (argc < 2) {
fprintf(stderr, "usage: %s text\n", argv[0]);
return 1;
}
// 引き数として渡された文字列を代入
char* utf8_text = argv[1];
// テキストの文字数を取得
glong length = g_utf8_strlen(utf8_text, -1);
// UCS4に変換
gunichar* codepoints = g_utf8_to_ucs4_fast(utf8_text, -1, NULL);
// 1文字づつ code point を出力
for (int i = 0; i < length; i++) {
printf("[U+%04X] ", codepoints[i]);
}
printf("\n");
// 終了処理
g_free(codepoints);
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment