/*
 * Source: GitHub Gist samcv/ebbcf638b92501ac71e22f4dde1a510f by @samcv,
 * last active January 27, 2017 15:41.
 * (The GitHub page navigation text that surrounded the code -- "Skip to
 * content", the star/fork counters, "Save ... to your computer" -- is not
 * part of the program.)
 */
#include <stdio.h>
#include <stdint.h>
#include <string.h>
/* Number of 16-bit elements in the uninames[] table (the table itself is
 * declared with the literal 191). */
#define uninames_elems 191
/* Capacity of Decompressor.out_buf in characters, not counting the
 * terminating NUL (the buffer is declared as LONGEST_NAME + 1 bytes). */
#define LONGEST_NAME 22
/* Inclusive upper bound of the code point loop in main(). */
#define HIGHEST_NAME_CP 100
/* Writes "<control-XXXX>" into out and returns 0 for cp <= 31; otherwise
 * leaves out untouched and returns 32. */
uint32_t get_uninames ( char * out, uint32_t cp );
/* Forward declaration of the packed name table; the data follows further down. */
const static uint16_t uninames[191];

/*
 * Produce the name of a code point that does not need the packed table.
 *
 * out - destination buffer; for cp <= 31 it receives "<control-XXXX>"
 *       (14 characters plus the terminating NUL, so at least 15 bytes are
 *       required). It is not written for any other cp.
 * cp  - code point to name.
 *
 * Returns 0 when the name was written to out. Returns 32 for every
 * cp > 31; get_cp_name() subtracts that value from cp before indexing
 * name_index[].
 */
uint32_t get_uninames ( char * out, uint32_t cp ) {
    /* cp is unsigned, so only the upper bound needs checking. */
    if (cp <= 31) {
        /* %X expects an unsigned int; uint32_t is not guaranteed to be one. */
        sprintf(out, "<control-%.4X>", (unsigned int)cp);
        return 0;
    }
    return 32;
}
/*
 * Single-character alphabet of the base-40 name encoding, indexed by digit.
 * Digit 0 maps to NUL, which eat_a_string() uses as the end-of-name marker.
 * Digit 39 is intercepted by eat_a_string() as the "word follows" shift
 * code before this table is consulted, so its '\a' entry is only a
 * placeholder.
 */
char ctable[40] = {
    /*  0      */ '\0',
    /*  1 - 13 */ 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
    /* 14 - 26 */ 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
    /* 27 - 36 */ '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
    /* 37      */ ' ',
    /* 38      */ '-',
    /* 39      */ '\a'
};
/*
 * Whole-word dictionary of the name encoding. When eat_a_string() meets the
 * shift digit 39, the digit that follows it selects one of these words.
 * The number in each leading comment is the index of the first word on
 * that row.
 */
static const char *s_table[40] = {
    /*  0 */ "CAPITAL",    "LETTER",    "LATIN",     "DIGIT",      "PARENTHESIS",
    /*  5 */ "SIGN",       "SMALL",     "BRACKET",   "SOLIDUS",    "EXCLAMATION",
    /* 10 */ "COMMERCIAL", "SQUARE",    "ACCENT",    "CIRCUMFLEX", "APOSTROPHE",
    /* 15 */ "QUOTATION",  "AMPERSAND", "SEMICOLON", "MARK",       "QUESTION",
    /* 20 */ "ASTERISK",   "RIGHT",     "PERCENT",   "GREATER",    "REVERSE",
    /* 25 */ "HYPHEN",     "EQUALS",    "NUMBER",    "DOLLAR",     "LEFT",
    /* 30 */ "THAN",       "COLON",     "COMMA",     "SEVEN",      "EIGHT",
    /* 35 */ "SPACE",      "MINUS",     "GRAVE",     "THREE",      "LESS"
};
/* Number of code points whose names are stored in uninames[]
 * (HIGHEST_NAME_CP 100 minus the first encoded code point 32, plus one).
 * A #define takes no '=': with one, the macro would expand to "= 69". */
#define num_encoded_codepoints 69
/*
 * Index into uninames[], one entry per two code points: get_cp_name() reads
 * name_index[(cp - 32) / 2] and starts decoding at that element.
 *
 * NOTE(review): decoding uninames[] sequentially by hand shows that entry i
 * is the element in which the name of code point 32 + 2*i ENDS (e.g. entry 1
 * is 4, while "QUOTATION MARK" for U+0022 occupies elements 3..4), not the
 * element where it begins. Starting a decode at that element therefore only
 * yields the right name for entry 0 -- confirm against the table generator.
 */
const static uint8_t name_index[35] = {
0,4,8,11,14,17,21,26,30,35,39,44,48,52,56,61,65,72,80,87,94,102,109,116,124,131,
138,146,153,160,165,170,175,183,190};
/*
 * Packed names of code points 32..100. Every element carries three base-40
 * digits (element / 1600, (element % 1600) / 40, element % 40; see
 * digest_one_chunk). eat_a_string() interprets the digit stream as follows:
 *   0      end of the current name
 *   1..38  the character ctable[digit]
 *   39     shift: the NEXT digit selects the word s_table[digit]
 * Names are packed back to back, so a name may begin in the middle of an
 * element and a shift digit may be separated from its argument by an
 * element boundary.
 */
const static uint16_t uninames[191] = {
63800,62797,63120,63037,63120,63517,62600,63557,62600,63317,62600,63040,62960,
63597,62560,63277,62560,63200,26101,31919,8039,51239,41559,57606,34092,59980,
24640,62720,62557,41818,24039,6295,22600,62557,32935,1563,60798,1563,59455,
34320,62557,9982,8039,6299,15360,62557,63720,62557,63760,62557,22774,8039,49639,
27239,63959,49519,8039,43119,8039,38359,49519,8039,31919,28839,17481,32039,4719,
1519,3081,1562,60760,60761,59280,62517,62437,62477,4839,4719,1519,3084,1562,
60760,60761,59400,62517,62437,62477,9639,4719,1519,3087,1562,60760,60761,59520,
62517,62437,62477,14439,4719,1519,3090,1562,60760,60761,59640,62517,62437,62477,
19239,4719,1519,3093,1562,60760,60761,59760,62517,62437,62477,24039,4719,1519,
3096,1562,60760,60761,59880,62517,62437,62477,28839,4719,1519,3099,1562,60760,
60761,60000,62517,62437,62477,33639,4719,1519,3102,1562,60760,60761,60120,62517,
62437,62477,38439,4719,1519,3105,1562,60760,60761,60240,63597,62877,62680,63397,
62720,63277,62877,62680,62957,62880,19823,59689,22600,63917,62880,62517,62677,
62477,1639,4719,11119,3082,1562,60766,60761,59320,62517,62677,62477,6400};
/*
 * State of one decoder over the base-40 packed name stream. Callers in this
 * file zero-initialize it and then point input_position at the data.
 */
typedef struct Decompressor {
/* Decoding one input element gives us three "commands" (base-40 digits) that
 * can be a character or a shift to the word table. Hold them in here until
 * eat_a_string consumes them from the front. */
int16_t queue[6];
/* How many valid entries are currently in the queue? */
uint16_t queue_len;
/* Next 16-bit element to read; digest_one_chunk advances it by one. */
const uint16_t * input_position;
/* Were we signalled to end reading this string and continue with the next one?
 * Cleared at the start of eat_a_string, set when it consumes digit 0. */
uint8_t eos_signalled;
/* Index in out_buf where the next character is written; eat_a_string resets
 * it to 0 after terminating a string. */
uint8_t out_buf_pos;
/* We put our characters here. */
char out_buf[LONGEST_NAME + 1];
} Decompressor;
/*
 * Read the next 16-bit element from ds->input_position, advance the
 * position, and append the element's three base-40 digits (most significant
 * first) to the end of ds->queue.
 *
 * No bounds are checked here: the caller must make sure three more queue
 * slots are free and that another input element is available.
 */
void digest_one_chunk(Decompressor *ds) {
    const uint16_t word = *ds->input_position;
    ds->input_position++;

    int16_t *slot = ds->queue + ds->queue_len;
    slot[0] = word / 1600;          /* high digit   */
    slot[1] = (word % 1600) / 40;   /* middle digit */
    slot[2] = word % 40;            /* low digit    */
    ds->queue_len += 3;
}
/*
 * Decode one name from the digit stream into ds->out_buf.
 *
 * Digits are taken from the front of ds->queue, which is refilled from
 * ds->input_position through digest_one_chunk() whenever it runs dry:
 *   0      terminates the name: a NUL is stored, eos_signalled is set and
 *          out_buf_pos is reset to 0 for the next call;
 *   39     shift: the following digit selects a whole word from s_table;
 *   other  one character from ctable.
 * Digits left in the queue after the terminator stay there and belong to the
 * next name.
 *
 * ds         - decoder state; on return ds->out_buf holds the NUL-terminated
 *              name.
 * skip_no_cp - number of names the caller would like skipped first. It is
 *              only reported on stderr; nothing is actually skipped.
 *
 * Output never runs past out_buf: a name longer than LONGEST_NAME characters
 * is truncated. There is no end-of-input check; the caller must make sure a
 * terminator is reachable from ds->input_position.
 */
void eat_a_string( Decompressor *ds, uint32_t skip_no_cp ) {
    ds->eos_signalled = 0;

    if (skip_no_cp) {
        /* Cast so the argument really has the type %lu expects. */
        fprintf(stderr, "Have been asked to skip %lu cp's\n", (unsigned long)skip_no_cp);
    }

    /* Never trust an incoming write position that is already out of range. */
    if (ds->out_buf_pos > LONGEST_NAME) {
        ds->out_buf_pos = LONGEST_NAME;
    }

    while (!ds->eos_signalled) {
        if (ds->queue_len == 0) {
            digest_one_chunk(ds);
        }

        if (ds->queue[0] == 39) {
            /* The word index may live in the next input element. */
            if (ds->queue_len == 1) {
                digest_one_chunk(ds);
            }

            const char *word = s_table[ds->queue[1]];
            size_t room = (size_t)(LONGEST_NAME - ds->out_buf_pos);
            size_t len = strlen(word);
            if (len > room) {
                len = room;     /* truncate instead of overflowing out_buf */
            }
            memcpy(ds->out_buf + ds->out_buf_pos, word, len);
            /* Advance by what was actually copied, not by the word length. */
            ds->out_buf_pos = (uint8_t)(ds->out_buf_pos + len);

            /* Let the two digits flow out of the queue. */
            memmove(ds->queue, ds->queue + 2, (6 - 2) * sizeof ds->queue[0]);
            ds->queue_len -= 2;
        }
        else {
            const int16_t code = ds->queue[0];

            if (code == 0) {
                /* out_buf_pos <= LONGEST_NAME, and out_buf has one spare byte
                 * reserved for exactly this terminator. */
                ds->out_buf[ds->out_buf_pos] = ctable[0];
                ds->eos_signalled = 1;
                ds->out_buf_pos = 0;
            }
            else if (ds->out_buf_pos < LONGEST_NAME) {
                ds->out_buf[ds->out_buf_pos++] = ctable[code];
            }

            memmove(ds->queue, ds->queue + 1, (6 - 1) * sizeof ds->queue[0]);
            ds->queue_len--;
        }
    }
}
/*
 * Look up the name of one code point and print it, together with debugging
 * details of the lookup, on stdout.
 *
 * For cp <= 31 the name comes straight from get_uninames(). Otherwise a
 * fresh decoder is started at uninames[name_index[(cp - ret) / 2]] and one
 * string is decoded with eat_a_string().
 *
 * NOTE(review): hand-decoding the table suggests name_index[] holds the
 * element where a name ends rather than where it starts, and eat_a_string()
 * does not act on its skip argument, so the printed name looks reliable only
 * for cp <= 32 -- confirm before depending on this lookup.
 *
 * Returns the value get_uninames() returned: 0 if the name was generated
 * algorithmically, 32 otherwise (also when cp lies beyond the index, in
 * which case a message goes to stderr and no name is printed).
 */
uint32_t get_cp_name (uint32_t cp) {
    Decompressor ds = {0};
    uint32_t ret = get_uninames(ds.out_buf, cp);

    if (ret == 0) {
        /* get_uninames() already wrote the complete name. */
        printf("cp: %i name: %s\n", (int)cp, ds.out_buf);
        return ret;
    }

    printf("ret: %i\n", (int)ret);

    const uint32_t offset = cp - ret;
    const uint32_t slot = offset / 2;
    if (slot >= sizeof name_index / sizeof name_index[0]) {
        /* Reading name_index[] out of bounds would be undefined behavior. */
        fprintf(stderr, "cp: %lu has no entry in name_index\n", (unsigned long)cp);
        return ret;
    }

    int index = name_index[slot];
    /* Arguments are cast so each one matches its conversion specifier. */
    printf("name_index[%i]=%i, cp %lu, ret %lu, cp - ret = %lu\n",
           (int)slot, index, (unsigned long)cp, (unsigned long)ret, (unsigned long)offset);

    ds.input_position = uninames + index;

    /* A literal percent sign has to be written as %% in a format string. */
    printf("(cp - ret) %% 2 = %i\n", (int)(offset % 2));
    eat_a_string(&ds, offset % 2);

    printf("cp: %i name: %s\n", (int)cp, ds.out_buf);
    return ret;
}
/*
 * Demo driver.
 *
 * First runs the single-code-point lookup for U+0020, then prints the name
 * of every code point from 0 to HIGHEST_NAME_CP. The dump uses ONE decoder
 * that walks uninames[] from its first element: names are packed back to
 * back, so each eat_a_string() call continues where the previous one
 * stopped. No return statement may sit between the lookup and the loop,
 * otherwise the dump is never reached.
 */
int main (void) {
    Decompressor ds = {0};
    ds.input_position = uninames;

    get_cp_name(0x20); /* U+20 SPACE */

    for (uint32_t cp = 0; cp <= HIGHEST_NAME_CP; cp++) {
        /* A non-zero result means the name has to come from the packed table. */
        if (get_uninames(ds.out_buf, cp) != 0) {
            eat_a_string(&ds, 0);
        }
        printf("U+%X '%s'\n", (unsigned int)cp, ds.out_buf);
    }
    return 0;
}
/* GitHub page footer, not part of the program: "Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment" */