Skip to content

Instantly share code, notes, and snippets.

@tsuu32
Last active November 17, 2019 00:58
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save tsuu32/111e1de869a980b1fcb04abf650ab2e5 to your computer and use it in GitHub Desktop.
Save tsuu32/111e1de869a980b1fcb04abf650ab2e5 to your computer and use it in GitHub Desktop.
CoreFoundation の CFStringTokenizer を使って(日本語でも)tokenize するやつ
/* compile: cc tokenize.c -o tokenize -framework CoreFoundation */
/* usage: ./tokenize 今日は晴れでいい気分 */
/* 参考: https://stackoverflow.com/questions/6877659/how-to-get-an-array-of-sentences-using-cfstringtokenizer */
#include <stdio.h>
#include <CoreFoundation/CoreFoundation.h>
/*
 * Print one CFString to stdout as UTF-8, followed by a newline.
 *
 * printf is used instead of CFShow because CFShow renders Japanese text
 * as escaped code points (e.g. \u4eca) rather than readable characters.
 *
 * Returns 0 on success, -1 if the string could not be converted to UTF-8
 * or a temporary buffer could not be allocated.
 */
static int print_utf8_line(CFStringRef str)
{
    /*
     * Fast path: CFStringGetCStringPtr hands back a pointer to the string's
     * own storage when one is available in the requested encoding. It is
     * allowed to return NULL, in which case the contents must be copied out.
     */
    const char *direct = CFStringGetCStringPtr(str, kCFStringEncodingUTF8);
    if (direct != NULL) {
        printf("%s\n", direct);
        return 0;
    }

    /* Slow path: copy into a buffer sized for the worst-case UTF-8 length. */
    CFIndex length = CFStringGetLength(str);
    CFIndex maxSize = CFStringGetMaximumSizeForEncoding(length, kCFStringEncodingUTF8);
    if (maxSize == kCFNotFound) {
        return -1;
    }
    maxSize += 1; /* room for the terminating NUL */

    char *buffer = malloc((size_t)maxSize);
    if (buffer == NULL) {
        return -1;
    }

    int rc = -1;
    /* Only print when the conversion succeeded; otherwise buffer is unspecified. */
    if (CFStringGetCString(str, buffer, maxSize, kCFStringEncodingUTF8)) {
        printf("%s\n", buffer);
        rc = 0;
    }
    free(buffer);
    return rc;
}

/*
 * Split the single command-line argument into word tokens with
 * CFStringTokenizer (using the current locale) and print one token per line.
 *
 * Exit status: 0 on success, 1 on a usage error, on an argument that is not
 * valid UTF-8, or when any token could not be printed.
 */
int main(int argc, char *argv[])
{
    const char *progname = (argc > 0 && argv[0] != NULL) ? argv[0] : "tokenize";

    if (argc != 2) {
        fprintf(stderr, "usage: %s String\n", progname);
        return 1;
    }

    int status = 1;
    CFLocaleRef locale = NULL;
    CFStringTokenizerRef tokenizer = NULL;

    /* Returns NULL when argv[1] is not well-formed UTF-8. */
    CFStringRef string = CFStringCreateWithCString(kCFAllocatorDefault, argv[1],
                                                   kCFStringEncodingUTF8);
    if (string == NULL) {
        fprintf(stderr, "%s: argument is not valid UTF-8\n", progname);
        goto cleanup;
    }

    locale = CFLocaleCopyCurrent();
    tokenizer = CFStringTokenizerCreate(kCFAllocatorDefault,
                                        string,
                                        CFRangeMake(0, CFStringGetLength(string)),
                                        kCFStringTokenizerUnitWord,
                                        locale);
    if (tokenizer == NULL) {
        fprintf(stderr, "%s: could not create tokenizer\n", progname);
        goto cleanup;
    }

    status = 0;
    while (CFStringTokenizerAdvanceToNextToken(tokenizer) != kCFStringTokenizerTokenNone) {
        CFRange tokenRange = CFStringTokenizerGetCurrentTokenRange(tokenizer);
        CFStringRef tokenValue = CFStringCreateWithSubstring(kCFAllocatorDefault,
                                                             string,
                                                             tokenRange);
        if (tokenValue == NULL) {
            fprintf(stderr, "%s: could not extract token\n", progname);
            status = 1;
            continue;
        }
        if (print_utf8_line(tokenValue) != 0) {
            fprintf(stderr, "%s: could not convert token to UTF-8\n", progname);
            status = 1;
        }
        CFRelease(tokenValue);
    }

cleanup:
    /* CFRelease must not be called with NULL, so guard each release. */
    if (tokenizer != NULL) {
        CFRelease(tokenizer);
    }
    if (locale != NULL) {
        CFRelease(locale);
    }
    if (string != NULL) {
        CFRelease(string);
    }
    return status;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment