Skip to content

Instantly share code, notes, and snippets.

@fasterthanlime
Created February 19, 2020 12:41
Show Gist options
  • Star 10 You must be signed in to star a gist
  • Fork 2 You must be signed in to fork a gist
  • Save fasterthanlime/6a20aeaeb0ce34abbcda42dc9fded955 to your computer and use it in GitHub Desktop.
Save fasterthanlime/6a20aeaeb0ce34abbcda42dc9fded955 to your computer and use it in GitHub Desktop.
#include <stdio.h> // printf
#include <stdint.h> // uint8_t, uint32_t
#include <stdlib.h> // exit
#include <ctype.h> // toupper
#include <string.h> // strlen
/*
 * Encode a zero-terminated array of Unicode scalar values as UTF-8.
 *
 * src: scalar values; a 0 entry marks the end of the input.
 * dst: output buffer. Receives the encoded bytes followed by a NUL
 *      terminator. No bounds checking is performed, so the caller must
 *      provide room for 2 bytes per scalar plus 1 byte for the terminator.
 *
 * Only 1-byte and 2-byte sequences are produced. If a scalar above 0x7FF
 * (more than 11 bits) is encountered, an error is printed to stderr and the
 * process exits with status 1.
 */
void encode_utf8(uint32_t *src, char *dst) {
    size_t in = 0;
    size_t out = 0;
    while (1) {
        uint32_t scalar = src[in];
        if (scalar == 0) {
            dst[out] = 0; // null terminator
            break;
        }
        if (scalar > 0x7FF) { // needs more than 11 bits => 3+ byte sequence
            // fprintf uses printf-style conversions; "{:x}" would be
            // printed literally and the argument ignored.
            fprintf(stderr, "Can only encode codepoints <= 0x%x\n", 0x7FFu);
            exit(1);
        }
        if (scalar > 0x7F) { // does not fit in 7 bits
            // 2-byte sequence:
            //   110xxxxx  marker + first 5 of 11 bits
            //   10xxxxxx  continuation + last 6 of 11 bits
            uint8_t b1 = (uint8_t) (0xC0 | ((scalar >> 6) & 0x1F));
            uint8_t b2 = (uint8_t) (0x80 | (scalar & 0x3F));
            dst[out + 0] = (char) b1;
            dst[out + 1] = (char) b2;
            out += 2;
        } else {
            // 1-byte sequence (plain ASCII)
            dst[out] = (char) scalar;
            out++;
        }
        in++;
    }
}
/*
 * Decode a NUL-terminated UTF-8 string into Unicode scalar values.
 *
 * src: NUL-terminated input bytes.
 * dst: output array. Receives one scalar per decoded sequence followed by a
 *      0 terminator. No bounds checking is performed; since every sequence
 *      consumes at least one byte, strlen(src) + 1 entries always suffice.
 *
 * Only 1-byte and 2-byte sequences are supported. On a 3- or 4-byte lead
 * byte, or a 2-byte lead that is not followed by a continuation byte
 * (10xxxxxx), an error is printed to stderr and the process exits with
 * status 1.
 *
 * NOTE: bytes that are not valid lead bytes (0x80-0xBF, 0xF8-0xFF) are
 * passed through unchanged as single-byte scalars; overlong encodings are
 * not rejected.
 */
void decode_utf8(char *src, uint32_t *dst) {
    size_t in = 0;
    size_t out = 0;
    while (1) {
        // Read as unsigned so high bytes are not sign-extended.
        uint8_t lead = (uint8_t) src[in];
        if (lead == 0) {
            dst[out] = 0;
            break; // null terminator
        }
        uint32_t scalar;
        size_t len;
        if (lead >> 3 == 0x1E) { // 11110xxx
            fprintf(stderr, "decode_utf8: 4-byte sequences are not supported!\n");
            exit(1);
        } else if (lead >> 4 == 0xE) { // 1110xxxx
            fprintf(stderr, "decode_utf8: 3-byte sequences are not supported!\n");
            exit(1);
        } else if (lead >> 5 == 0x6) { // 110xxxxx
            // 2-byte sequence. The second byte must be a continuation byte;
            // in particular it must not be the NUL terminator, otherwise
            // advancing by 2 would step past the end of the string.
            uint8_t cont = (uint8_t) src[in + 1];
            if (cont >> 6 != 0x2) { // not 10xxxxxx
                fprintf(stderr, "decode_utf8: invalid or truncated 2-byte sequence!\n");
                exit(1);
            }
            // low 5 bits of the lead are the high bits of the scalar,
            // low 6 bits of the continuation are the low bits
            scalar = ((uint32_t) (lead & 0x1F) << 6) | (uint32_t) (cont & 0x3F);
            len = 2;
        } else {
            // 1-byte sequence
            scalar = (uint32_t) lead;
            len = 1;
        }
        dst[out++] = scalar;
        in += len;
    }
}
enum {
    // capacity (in scalars, terminator included) of the decode buffers
    MAX_SCALARS = 1024,
    // one table entry per codepoint we can decode/encode: 0..0x7FF inclusive
    TABLE_SIZE = 0x7FF + 1
};

/*
 * Record lower-case -> upper-case mappings in `table`.
 *
 * `lower` and `upper` are UTF-8 strings whose characters correspond
 * position by position. Each must decode to fewer than MAX_SCALARS scalars,
 * and `table` must have TABLE_SIZE entries. Pairing stops at the end of the
 * shorter string.
 */
static void set_upper_mappings(uint32_t *table, char *lower, char *upper) {
    uint32_t lower_s[MAX_SCALARS];
    uint32_t upper_s[MAX_SCALARS];
    decode_utf8(lower, lower_s);
    decode_utf8(upper, upper_s);
    for (size_t i = 0; lower_s[i] != 0 && upper_s[i] != 0; i++) {
        table[lower_s[i]] = upper_s[i];
    }
}

/*
 * Decode argv[1] as UTF-8, print its codepoints as "U+XXXX", then print the
 * string converted to upper case using a small lookup table.
 *
 * Returns 0 on success, 1 if the argument is missing or too long.
 */
int main(int argc, char **argv) {
    if (argc < 2) {
        fprintf(stderr, "Usage: %s <text>\n", argc > 0 ? argv[0] : "upper");
        return 1;
    }
    // decode_utf8 emits at most one scalar per input byte plus a terminator,
    // so bounding the byte length bounds the scalar count.
    if (strlen(argv[1]) >= MAX_SCALARS) {
        fprintf(stderr, "Input too long (max %d bytes)\n", MAX_SCALARS - 1);
        return 1;
    }

    uint32_t scalars[MAX_SCALARS];
    decode_utf8(argv[1], scalars);
    for (size_t i = 0; scalars[i] != 0; i++) {
        printf("U+%04X ", (unsigned) scalars[i]);
    }
    printf("\n");

    // initialize the table to just return the codepoint unchanged
    uint32_t lower_to_upper[TABLE_SIZE];
    for (uint32_t cp = 0; cp < TABLE_SIZE; cp++) {
        lower_to_upper[cp] = cp;
    }
    // set a-z => A-Z
    for (int c = 'a'; c <= 'z'; c++) {
        lower_to_upper[c] = (uint32_t) toupper(c);
    }
    // set a few more
    set_upper_mappings(
        lower_to_upper,
        "éêèàâëüöïÿôîçæœ",
        "ÉÊÈÀÂËÜÖÏŸÔÎÇÆŒ"
    );

    // now convert our scalars to upper-case
    for (size_t i = 0; scalars[i] != 0; i++) {
        scalars[i] = lower_to_upper[scalars[i]];
    }

    // at most 2 bytes per scalar (MAX_SCALARS - 1 of them) plus the NUL
    char result[2 * MAX_SCALARS];
    encode_utf8(scalars, result);
    printf("%s\n", result);
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment