Last active
January 27, 2017 15:41
-
-
Save samcv/ebbcf638b92501ac71e22f4dde1a510f to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
/* Element count; matches the declared length of the uninames[] table. */
#define uninames_elems  191

/* Longest decoded name in characters, not counting the terminating NUL.
 * The decoder's output buffer is sized LONGEST_NAME + 1. */
#define LONGEST_NAME    22

/* Last codepoint visited by the dump loop in main(). */
#define HIGHEST_NAME_CP 100

/* Writes a synthetic name for control codepoints, or reports the base
 * codepoint of the packed table; see the definition for details. */
uint32_t get_uninames(char *out, uint32_t cp);
/* Tentative declaration of the packed name table; the initialised
 * definition appears later in the file. */
static const uint16_t uninames[191];

/*
 * Handle the codepoints whose names are generated rather than stored.
 *
 * out: destination buffer; needs room for at least 15 bytes
 *      ("<control-XXXX>" plus the terminating NUL).
 * cp:  codepoint to classify.
 *
 * Returns 0 after writing "<control-XXXX>" into `out` when cp is in
 * U+0000..U+001F.  For any other codepoint nothing is written and 32 is
 * returned; get_cp_name() subtracts that value from cp to index the packed
 * table.
 */
uint32_t get_uninames(char *out, uint32_t cp)
{
    /* cp is unsigned, so only the upper bound needs testing. */
    if (cp <= 31) {
        sprintf(out, "<control-%.4" PRIX32 ">", cp);
        return 0;
    }
    return 32;
}
/*
 * Maps a decoded codeme (0..39) to the character it stands for.
 * Codeme 0 yields the NUL that ends a name.  Codeme 39 is intercepted by
 * eat_a_string() as the "shift to s_table" marker before this table is
 * consulted, so its '\a' entry acts only as a placeholder.
 */
char ctable[40] = {
    /*  0      */ '\0',
    /*  1 - 13 */ 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
    /* 14 - 26 */ 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
    /* 27 - 36 */ '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
    /* 37 - 38 */ ' ', '-',
    /* 39      */ '\a'
};
/*
 * Whole-word dictionary.  eat_a_string() copies s_table[k] into the output
 * when codeme 39 is followed by codeme k.
 */
static const char *s_table[40] = {
    /*  0 */ "CAPITAL",    "LETTER",      "LATIN",      "DIGIT",
    /*  4 */ "PARENTHESIS", "SIGN",       "SMALL",      "BRACKET",
    /*  8 */ "SOLIDUS",    "EXCLAMATION", "COMMERCIAL", "SQUARE",
    /* 12 */ "ACCENT",     "CIRCUMFLEX",  "APOSTROPHE", "QUOTATION",
    /* 16 */ "AMPERSAND",  "SEMICOLON",   "MARK",       "QUESTION",
    /* 20 */ "ASTERISK",   "RIGHT",       "PERCENT",    "GREATER",
    /* 24 */ "REVERSE",    "HYPHEN",      "EQUALS",     "NUMBER",
    /* 28 */ "DOLLAR",     "LEFT",        "THAN",       "COLON",
    /* 32 */ "COMMA",      "SEVEN",       "EIGHT",      "SPACE",
    /* 36 */ "MINUS",      "GRAVE",       "THREE",      "LESS"
};
/* Count of codepoints with a packed name.  An object-like macro takes no
 * '=': with one, every use would expand to "= 69".
 * NOTE(review): 69 equals HIGHEST_NAME_CP - 32 + 1; presumably U+0020..U+0064 - confirm. */
#define num_encoded_codepoints 69
/*
 * Starting offsets into uninames[].  get_cp_name() looks up entry
 * (cp - 32) / 2, i.e. one entry per pair of codepoints.
 */
static const uint8_t name_index[35] = {
    /*  0 */   0,   4,   8,  11,  14,  17,  21,  26,  30,  35,
    /* 10 */  39,  44,  48,  52,  56,  61,  65,  72,  80,  87,
    /* 20 */  94, 102, 109, 116, 124, 131, 138, 146, 153, 160,
    /* 30 */ 165, 170, 175, 183, 190
};
/*
 * Packed name data.  digest_one_chunk() splits each 16-bit entry into three
 * base-40 codemes: entry / 1600, (entry / 40) % 40 and entry % 40.
 * Rows hold ten entries; the leading comment is the index of the first one.
 */
static const uint16_t uninames[191] = {
    /*   0 */ 63800, 62797, 63120, 63037, 63120, 63517, 62600, 63557, 62600, 63317,
    /*  10 */ 62600, 63040, 62960, 63597, 62560, 63277, 62560, 63200, 26101, 31919,
    /*  20 */  8039, 51239, 41559, 57606, 34092, 59980, 24640, 62720, 62557, 41818,
    /*  30 */ 24039,  6295, 22600, 62557, 32935,  1563, 60798,  1563, 59455, 34320,
    /*  40 */ 62557,  9982,  8039,  6299, 15360, 62557, 63720, 62557, 63760, 62557,
    /*  50 */ 22774,  8039, 49639, 27239, 63959, 49519,  8039, 43119,  8039, 38359,
    /*  60 */ 49519,  8039, 31919, 28839, 17481, 32039,  4719,  1519,  3081,  1562,
    /*  70 */ 60760, 60761, 59280, 62517, 62437, 62477,  4839,  4719,  1519,  3084,
    /*  80 */  1562, 60760, 60761, 59400, 62517, 62437, 62477,  9639,  4719,  1519,
    /*  90 */  3087,  1562, 60760, 60761, 59520, 62517, 62437, 62477, 14439,  4719,
    /* 100 */  1519,  3090,  1562, 60760, 60761, 59640, 62517, 62437, 62477, 19239,
    /* 110 */  4719,  1519,  3093,  1562, 60760, 60761, 59760, 62517, 62437, 62477,
    /* 120 */ 24039,  4719,  1519,  3096,  1562, 60760, 60761, 59880, 62517, 62437,
    /* 130 */ 62477, 28839,  4719,  1519,  3099,  1562, 60760, 60761, 60000, 62517,
    /* 140 */ 62437, 62477, 33639,  4719,  1519,  3102,  1562, 60760, 60761, 60120,
    /* 150 */ 62517, 62437, 62477, 38439,  4719,  1519,  3105,  1562, 60760, 60761,
    /* 160 */ 60240, 63597, 62877, 62680, 63397, 62720, 63277, 62877, 62680, 62957,
    /* 170 */ 62880, 19823, 59689, 22600, 63917, 62880, 62517, 62677, 62477,  1639,
    /* 180 */  4719, 11119,  3082,  1562, 60766, 60761, 59320, 62517, 62677, 62477,
    /* 190 */  6400
};
typedef struct Decompressor { | |
/* Encoding an entry gives us three "commands" that can be a character or | |
* something in a further shift level. Hold them in here for future | |
* consumption. */ | |
int16_t queue[6]; | |
/* How many valid entries are currently in the queue? */ | |
uint16_t queue_len; | |
const uint16_t * input_position; | |
/* Were we signalled to end reading this string and continue with the next one? */ | |
uint8_t eos_signalled; | |
uint8_t out_buf_pos; | |
/* We put our characters here. */ | |
char out_buf[LONGEST_NAME + 1]; | |
} Decompressor; | |
/*
 * Read one packed 16-bit entry from ds->input_position, advance the input
 * pointer, and append the entry's three base-40 codemes (most significant
 * first) to the end of ds->queue.
 * The caller must leave room for three more codemes in the queue.
 */
void digest_one_chunk(Decompressor *ds)
{
    const uint16_t packed = *ds->input_position;
    int16_t *slot = ds->queue + ds->queue_len;

    ds->input_position++;

    slot[0] = (int16_t)(packed / 1600);
    slot[1] = (int16_t)((packed % 1600) / 40);
    slot[2] = (int16_t)(packed % 40);
    ds->queue_len += 3;
}
/*
 * Decode one name from the packed stream into ds->out_buf.
 *
 * Codemes are consumed from ds->queue (refilled via digest_one_chunk())
 * until the end-of-string codeme 0 is seen.  Codeme 39 is a shift marker:
 * the codeme after it selects a whole word from s_table.  Any other codeme
 * is mapped to a single character through ctable.
 *
 * On return ds->out_buf holds a NUL-terminated name, ds->out_buf_pos is
 * reset to 0 and ds->eos_signalled is 1.  Codemes that followed the
 * terminator inside the same packed entry stay queued for the next call.
 * Output that would exceed LONGEST_NAME characters is truncated instead of
 * being written past the end of out_buf.
 *
 * ds:         decoder state; ds->out_buf_pos must be <= LONGEST_NAME on entry
 *             (true for a zero-initialised struct and after any prior call).
 * skip_no_cp: number of names the caller wants skipped first.
 *             NOTE(review): this value is only reported on stderr; no
 *             skipping is performed here - confirm intended behaviour.
 *
 * NOTE(review): codeme values are not range-checked; an entry >= 64000
 * would decode to codeme 40 and index past ctable/s_table.  The table in
 * this file stays below that.
 */
void eat_a_string(Decompressor *ds, uint32_t skip_no_cp)
{
    enum { QUEUE_CAP = sizeof ds->queue / sizeof ds->queue[0] };

    ds->eos_signalled = 0;
    if (skip_no_cp) {
        fprintf(stderr, "Have been asked to skip %" PRIu32 " cp's\n", skip_no_cp);
    }

    while (!ds->eos_signalled) {
        if (ds->queue_len == 0) {
            digest_one_chunk(ds);
        }

        if (ds->queue[0] == 39) {
            /* The word selector may live in the next packed entry. */
            if (ds->queue_len == 1) {
                digest_one_chunk(ds);
            }

            const char *word = s_table[ds->queue[1]];
            size_t room = (size_t)LONGEST_NAME - ds->out_buf_pos;
            size_t len = strlen(word);

            /* Clamp so the write offset can never pass LONGEST_NAME. */
            if (len > room) {
                len = room;
            }
            memcpy(ds->out_buf + ds->out_buf_pos, word, len);
            ds->out_buf_pos = (uint8_t)(ds->out_buf_pos + len);

            /* Drop the shift marker and the word selector. */
            memmove(ds->queue, ds->queue + 2, (QUEUE_CAP - 2) * sizeof ds->queue[0]);
            ds->queue_len -= 2;
        } else {
            if (ds->queue[0] == 0) {
                /* End of name: terminate and rewind for the next one. */
                ds->out_buf[ds->out_buf_pos] = '\0';
                ds->eos_signalled = 1;
                ds->out_buf_pos = 0;
            } else if (ds->out_buf_pos < LONGEST_NAME) {
                ds->out_buf[ds->out_buf_pos++] = ctable[ds->queue[0]];
            }

            /* Drop the consumed codeme. */
            memmove(ds->queue, ds->queue + 1, (QUEUE_CAP - 1) * sizeof ds->queue[0]);
            ds->queue_len--;
        }
    }
}
uint32_t get_cp_name (uint32_t cp) { | |
Decompressor ds = {}; | |
uint32_t ret; | |
ret = get_uninames(ds.out_buf, cp); | |
if (ret == 0) { | |
printf("cp: %i name: %s\n", cp, ds.out_buf); | |
} | |
else { | |
printf("ret: %i\n", ret); | |
int index = name_index[(cp - ret) / 2]; | |
printf("name_index[%i]=%i, cp %lu, ret %lu, cp - ret = %lu\n", (cp - ret)/2, index, cp, ret, cp - ret); | |
ds.input_position = (const unsigned short *) &uninames + index; | |
printf("(cp - ret) % 2 = %i\n", (cp - ret) % 2); | |
eat_a_string(&ds, ( (cp - ret) % 2) ); | |
printf("cp: %i name: %s\n", cp, ds.out_buf); | |
} | |
} | |
int main (void) { | |
uint32_t cp = 0; | |
Decompressor ds = {}; | |
ds.input_position = (const uint16_t *) &uninames; | |
int i; | |
int ret; | |
get_cp_name(0x20); /* U+20 SPACE */ | |
return 0; | |
for (i = 0; i <= HIGHEST_NAME_CP; i++) { | |
ret = get_uninames(ds.out_buf, cp); | |
if (ret == 0) { | |
} | |
else { | |
eat_a_string(&ds, 0); | |
} | |
printf("U+%X '%s'\n", cp, ds.out_buf); | |
cp++; | |
} | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment