Skip to content

Instantly share code, notes, and snippets.

@shenlebantongying
Last active January 19, 2023 08:05
Show Gist options
  • Save shenlebantongying/3f055e66224c7dc10a8e12d47945981f to your computer and use it in GitHub Desktop.
Save shenlebantongying/3f055e66224c7dc10a8e12d47945981f to your computer and use it in GitHub Desktop.
Add a space between CJK characters and adjacent ASCII letters or digits
/*
gcc ./cjkpad.c -o cjkpad `pkg-config --libs --cflags icu-uc icu-io` && ./cjkpad
*/
#include <stdio.h>
#include <string.h>
#include <unicode/utext.h>
#include <unicode/ustdio.h>
// File-wide ICU status variable; its address is handed to ICU calls
// (e.g. utext_openUTF8 in padCjk) so they can report failures through it.
static UErrorCode err = U_ZERO_ERROR;
// One Unicode code point of the input string plus where it lives in the
// UTF-8 byte buffer. Entries are produced per code point (utext_next32),
// not per grapheme cluster.
typedef struct gra {
UChar32 cp; // Code point value
UBlockCode block; // Unicode block of cp, as returned by ublock_getCode()
size_t start; // Byte offset of this code point within str[]
size_t length; // Number of bytes (chars) it occupies in str[]; CJK ideographs are mostly 3 bytes in UTF-8
} gra;
/*
 * Return true when the code point in `g` is an ASCII letter or digit
 * (a-z, A-Z, 0-9), false otherwise.
 *
 * The range checks are inclusive: the previous strict comparisons left out
 * the endpoints 'a', 'z', 'A', 'Z', '0' and '9'.
 */
bool isAlphaNum(gra *g) {
    const UChar32 c = g->cp;

    return ('a' <= c && c <= 'z')
        || ('A' <= c && c <= 'Z')
        || ('0' <= c && c <= '9');
}
/*
 * Return true when the code point in `g` was classified into the
 * "CJK Unified Ideographs" Unicode block, false for every other block.
 */
bool isCjk(gra *g) {
    switch (g->block) {
    case UBLOCK_CJK_UNIFIED_IDEOGRAPHS:
        return true;
    default:
        return false;
    }
}
void padCjk(const char str[]) {
char formatted[1024] = {0}; // 1kiB
gra Gra[1024];
int graLength; // number of Gra
{ // 1st loop that analysis str
UText *ut = utext_openUTF8(NULL, str, -1, &err);
int begin, end;
int i = 0;
for (UChar32 cp = utext_next32From(ut, 0);
cp > -1;
cp = utext_next32(ut), i++) {
end = utext_getNativeIndex(ut);
Gra[i].cp = cp;
Gra[i].block = ublock_getCode(cp);
Gra[i].start = begin;
Gra[i].length = end - begin;
begin = end;
}
graLength = i;
}
{ // 2nd loop that add spaces
strncat(formatted, &str[Gra[0].start], Gra[0].length);
for (int j = 1; j <= graLength; j++) {
gra *prev = &Gra[j - 1];
gra *curr = &Gra[j];
gra *next = &Gra[j + 1];
u_printf("%C%C%C\n", prev->cp, curr->cp, next->cp);
if ((isAlphaNum(prev) && isCjk(curr))
|| (isCjk(prev) && isAlphaNum(curr))) {
strcat(formatted, " ");
}
strncat(formatted, &str[curr->start], curr->length);
}
}
printf("\n%s\n", formatted);
}
/*
 * Demo entry point: runs padCjk on a fixed sample that mixes CJK text,
 * ASCII words and punctuation.
 */
int main(void) {
    static const char sample[] = "*你好world哈哈, 句子#中间you英文和,。/符号";

    padCjk(sample);
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment