Skip to content

Instantly share code, notes, and snippets.

@vurtun
Last active December 26, 2023 11:58
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save vurtun/440eccd6e781c2a74be0ac719a1b7e96 to your computer and use it in GitHub Desktop.
Save vurtun/440eccd6e781c2a74be0ac719a1b7e96 to your computer and use it in GitHub Desktop.
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <assert.h>
/* Code point reported for input that cannot be decoded (U+FFFD). */
#define UTF_INVALID 0xFFFD

/*
 * Decode a single UTF-8 sequence from the byte range [p, e).
 *
 *   dst  receives the decoded code point, or UTF_INVALID on malformed input
 *   p    first byte to decode
 *   e    one past the last readable byte
 *
 * Returns the position at which decoding should continue:
 *   - one past the sequence on success,
 *   - `e` when the range is empty or the sequence is cut off by `e`,
 *   - one past a byte that cannot start a sequence (0x80..0xBF, 0xF8..0xFF),
 *   - the offending byte when an expected continuation byte is missing, so
 *     the caller resynchronizes on it.
 * For p < e the returned pointer is always greater than p, so decode loops
 * make progress on any input.
 *
 * Overlong encodings, surrogates and values above U+10FFFF are not rejected.
 */
static const char*
utf_dec(unsigned *dst, const char *p, const char *e)
{
    assert(e);
    assert(p);
    assert(dst);

    /* Empty range: report invalid without ever reading *p. */
    if (p >= e) {
        *dst = UTF_INVALID;
        return e;
    }

    /* Work on unsigned bytes so the result does not depend on char signedness. */
    const unsigned char lead = (unsigned char)*p;
    unsigned res = 0;
    int n = 0; /* number of continuation bytes that must follow the lead */

    if (lead < 0x80) {
        res = lead;
        n = 0;
    } else if (lead < 0xc0 || lead >= 0xf8) {
        /* Stray continuation byte, or a byte that never starts a sequence. */
        *dst = UTF_INVALID;
        return p + 1;
    } else if (lead < 0xe0) {
        res = lead & 0x1fu;
        n = 1;
    } else if (lead < 0xf0) {
        res = lead & 0x0fu;
        n = 2;
    } else {
        res = lead & 0x07u;
        n = 3;
    }

    /* Sequence is truncated by the end of the range. Comparing distances
     * avoids forming a pointer beyond one-past-the-end. */
    if (e - p <= n) {
        *dst = UTF_INVALID;
        return e;
    }

    while (n--) {
        const unsigned char c = (unsigned char)*(++p);
        if ((c & 0xc0) != 0x80) {
            /* Not a continuation byte: stop here and let the caller retry it. */
            *dst = UTF_INVALID;
            return p;
        }
        res = (res << 6) | (c & 0x3fu);
    }
    *dst = res;
    return p + 1;
}
/*
 * Cursor-style wrapper around utf_dec.
 *
 *   p  in/out: address of the read position; overwritten with the position
 *      utf_dec returns
 *   e  end of the readable range, forwarded to utf_dec
 *
 * Returns the value utf_dec stored for the decoded code point.
 */
static unsigned
utf_decode(const char **p, const char *e)
{
    assert(p);
    assert(e);

    unsigned cp = 0;
    const char *next = utf_dec(&cp, *p, e);
    *p = next;
    return cp;
}
/*
 * Step backwards from `s` to the nearest preceding byte that is not a
 * UTF-8 continuation byte (i.e. whose top two bits are not 10).
 *
 *   b  lower bound of the search; bytes before it are never read
 *   s  position to step back from; the byte at `s` itself is not examined
 *
 * Returns a pointer to that byte, or a null pointer when `s` is at (or before)
 * `b` or only continuation bytes lie between `b` and `s`.
 */
static const char*
utf_prev(const char *b, const char *s)
{
    assert(b);
    assert(s);

    for (const char *it = s; it > b;) {
        --it;
        if ((*it & 0xC0) == 0x80)
            continue; /* continuation byte: keep walking back */
        return it;
    }
    return NULL;
}
/*
 * Advance `idx` decode steps into the text and return the resulting position.
 *
 *   str  start of the text
 *   end  one past the last byte, or a null pointer to use str + strlen(str)
 *   idx  number of utf_dec calls to skip over; must be >= 0
 *
 * Stops early once the position reaches `end`, so the result is wherever the
 * last utf_dec call left off (or `str` itself when nothing was skipped).
 */
static const char*
utf_at(const char *str, const char *end, int idx)
{
    assert(str);
    assert(idx >= 0);

    if (!end)
        end = str + strlen(str);

    unsigned ignored = 0u; /* decoded values are not needed, only the position */
    int remaining = idx;
    while (remaining > 0 && str < end) {
        str = utf_dec(&ignored, str, end);
        --remaining;
    }
    return str;
}
/*
 * Count how many utf_dec steps it takes to walk the text.
 *
 *   str  start of the text; must not be null
 *   end  one past the last byte, or a null pointer to use str + strlen(str)
 *
 * Counting stops at `end` or at the first NUL byte, whichever comes first.
 * Every utf_dec call counts as one, whatever value it reports.
 */
static int
utf_len(const char *str, const char *end)
{
    /* Same precondition as the sibling helpers; without it a null `str`
     * would be handed straight to strlen. */
    assert(str);

    if (!end)
        end = str + strlen(str);

    int count = 0;
    while (str < end && *str) {
        unsigned rune = 0;
        str = utf_dec(&rune, str, end);
        count++;
    }
    return count;
}
/*
 * Demo driver for the UTF-8 helpers. Each preprocessor section is an
 * independent experiment; only the last one is currently enabled.
 */
int main(void)
{
#if 0
    /* Walk the sample forwards with utf_dec and print each decoded value. */
    {
        const char text[] = "„Führ“";
        const char *stop = text + sizeof(text) - 1;
        for (const char *it = text; it < stop;) {
            unsigned cp = 0;
            it = utf_dec(&cp, it, stop);
            printf("%u\n", cp);
        }
    }
#endif
#if 0
    /* Random access with utf_at, then a decode attempt at the end of the text. */
    {
        const char text[] = "„Führ“";
        const char *stop = text + sizeof(text) - 1;
        unsigned cp = 0;
        const char *after = utf_dec(&cp, utf_at(text, stop, 2), stop);
        assert(cp == 252);
        const char *tail = utf_dec(&cp, utf_at(after, stop, 4), stop);
        assert(cp == UTF_INVALID);
        assert(tail == stop);
    }
#endif
#if 0
    /* Walk an ASCII string backwards with utf_prev, printing each character. */
    {
        const char text[] = "test";
        const char *stop = text + sizeof(text) - 1;
        for (const char *it = utf_prev(text, stop); it; it = utf_prev(text, it)) {
            unsigned cp = 0;
            utf_dec(&cp, it, stop);
            printf("%c\n", (char)cp);
        }
    }
#endif
#if 1
    /* Walk the sample forwards with the cursor-style utf_decode wrapper. */
    {
        const char text[] = "„Führ“";
        const char *stop = text + sizeof(text) - 1;
        for (const char *it = text; it < stop;) {
            unsigned cp = utf_decode(&it, stop);
            printf("%u\n", cp);
        }
    }
#endif
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment