Created
March 19, 2019 13:36
-
-
Save pnck/88324ab6d19e2458056ac6998a81c21d to your computer and use it in GitHub Desktop.
Scan files for specially tagged strings and generate a Ragel source that replaces each tag with its translation.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import sys | |
import re | |
import json | |
import hashlib | |
import itertools | |
import binascii | |
# Locale names: each becomes a key in every translation.json entry and a
# `codepage_map` member of the generated `string_map` struct.
LANGUAGES = ['zh_CN', 'en_US', ]
# Python codec names each translation is encoded with; every name also becomes
# a `const char *` member of `codepage_map`. The first entry is the codepage
# the generated matcher falls back to when the caller names only a language.
CODEPAGES = ['utf8', 'latin1']
# (opening, closing) markers that delimit a translatable tag in scanned files.
WRAP_PAIR = ('%%T{', '}%%')
def usage():
    """Print the command-line help: invocation syntax and the scan/translate/compile workflow."""
    open_mark, close_mark = WRAP_PAIR
    help_messages = (
        'usage: {} file1 [file2 ...]\n'.format(sys.argv[0]),
        'Any literal string wrapped by `{0}{1}` in the specified files will be scanned out,\n'.format(
            open_mark, close_mark)
        + 'and a translation table file named `translation.json` will be generated from the scan result.',
        'Do translation job with `translation.json` then run the scanner again.',
        'The translation table will be convert into a C-like source file (actually, a Ragel source) `translation.rl`.',
        'Just compile (ragel required) and link the generated file as normal C source and everything is done.',
    )
    # One print per message keeps the output identical to separate print() calls.
    for message in help_messages:
        print(message)
def make_translation():
    """Scan the files named on the command line and refresh ``translation.json``.

    Any existing ``translation.json`` in the current directory is loaded, every
    file in ``sys.argv[1:]`` is searched for text wrapped in ``WRAP_PAIR``, and
    the merged table is written back. Tags already present keep their
    translations; only their ``'span'`` is refreshed. Every entry is guaranteed
    to have a (possibly empty) string for each name in ``LANGUAGES``.

    Returns:
        dict: ``{file_name: {tag: {'span': str, <lang>: str, ...}}}`` where
        ``'span'`` is ``str()`` of the (start, end) offsets of the whole
        wrapped match in that file.

    Raises:
        json.JSONDecodeError: if ``translation.json`` exists, is not blank and
            is not valid JSON. The file is left untouched so that translations
            already entered are not overwritten by an empty table.
        OSError: if a source file or ``translation.json`` cannot be opened.
    """
    # Load the previous table. Only a genuinely missing (or blank) file means
    # "start from scratch"; any other failure must not be mistaken for that.
    try:
        with open('translation.json', 'rt', encoding='utf-8') as translation_file:
            previous = translation_file.read()
    except FileNotFoundError:
        previous = ''
    if previous.strip():
        try:
            translate_table = json.loads(previous)
        except json.JSONDecodeError:
            print('translation.json is not valid JSON; fix or remove it and run again.',
                  file=sys.stderr)
            raise
    else:
        translate_table = {}

    # The markers are literal text, so escape them instead of relying on the
    # regex engine treating the stray `{` / `}` as literals. DOTALL lets a tag
    # span several lines. Compiled once, outside the per-file loop.
    tag_pattern = re.compile(
        re.escape(WRAP_PAIR[0]) + r'(.*?)' + re.escape(WRAP_PAIR[1]),
        re.DOTALL)

    for fname in sys.argv[1:]:
        file_tags = translate_table.setdefault(fname, {})
        with open(fname, 'rt', encoding='utf-8') as loaded:
            src = loaded.read()
        for found in tag_pattern.finditer(src):
            tag = found.group(1)
            if not tag:
                # An empty tag has nothing to translate.
                continue
            # Create the entry if it is new; either way record where it was seen.
            file_tags.setdefault(tag, {})['span'] = str(found.span())

    # Add an empty translation for every language a tag does not have yet.
    for tags in translate_table.values():
        for entry in tags.values():
            for lang in LANGUAGES:
                entry.setdefault(lang, '')

    # Rewrite the table only after scanning succeeded, so a failure above
    # never leaves a truncated file behind.
    with open('translation.json', 'wt', encoding='utf-8') as translation_file:
        json.dump(translate_table, translation_file, ensure_ascii=False, indent=2)
    return translate_table
def _hex_escaped(raw):
    """Return bytes *raw* as the body of a C string literal made only of ``\\xNN`` escapes."""
    return ''.join('\\x{:02x}'.format(byte) for byte in raw)


def _merge_by_tag(translate_table):
    """Collapse the per-file table into one entry per distinct tag.

    Returns ``{tag: {'spans': [(file, span), ...], 'langs': {lang: text}}}`` in
    first-seen order. When the same tag occurs in several files, the first
    non-empty translation found for a language is the one kept.
    """
    merged = {}
    for fname, tags in translate_table.items():
        for tag, entry in tags.items():
            slot = merged.setdefault(tag, {'spans': [], 'langs': {}})
            if 'span' in entry:
                slot['spans'].append((fname, entry['span']))
            for lang in LANGUAGES:
                if lang in entry and not slot['langs'].get(lang):
                    slot['langs'][lang] = entry[lang]
    return merged


def make_fsm_definition(translate_table):
    """Write ``translation.rl``, a C/Ragel source built from *translate_table*.

    The generated file contains, in order: the ``codepage_map`` / ``string_map``
    struct typedefs (one member per ``CODEPAGES`` / ``LANGUAGES`` entry), one
    ``static const string_map __string_map_<md5-of-tag>`` per distinct tag with
    each translation encoded as ``\\xNN`` bytes per codepage, and two Ragel
    machines: ``match_lang`` (selects language/codepage) and ``find_map``
    (replaces wrapped tags inside ``get_translation``).

    Args:
        translate_table: ``{file: {tag: {'span': str, <lang>: str}}}`` as
            returned by ``make_translation``.

    Tags are inserted verbatim into the C and Ragel string literals: they were
    scanned from source text and are therefore already in escaped form.
    """
    # One struct per distinct tag: a tag that appears in several scanned files
    # must not emit the same `__string_map_<hash>` symbol more than once.
    merged = _merge_by_tag(translate_table)
    tagmap = {hashlib.md5(tag.encode('utf8')).hexdigest(): tag for tag in merged}

    with open('translation.rl', 'wt', encoding='utf-8') as cdef_file:
        cdef_file.write('''
#ifndef TRANSLATOR_GENERATED
#define TRANSLATOR_GENERATED
#include <string.h>
#include <ctype.h>
#include <stdlib.h>
#include <stdio.h>
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-const-variable"
typedef struct _lang_codepage_map {
'''.strip())
        cdef_file.writelines('\n    const char *{};'.format(cp) for cp in CODEPAGES)
        cdef_file.write('\n' + '''
} codepage_map;
typedef struct _translated_string_map {
    const char *tag; // original tag
'''.strip())
        cdef_file.writelines('\n    const codepage_map {};'.format(lang) for lang in LANGUAGES)
        cdef_file.write('\n} string_map;\n\n')

        # ---- one initialised string_map per tag ----
        for h, tag in tagmap.items():
            entry = merged[tag]
            cdef_file.write('''
static const string_map __string_map_{h} = {{
    .tag = "{t}",
'''.rstrip().format(h=h, t=tag))
            for fname, span in entry['spans']:
                cdef_file.write('\n    /* {file}:{span} */'.format(file=fname, span=span))
            for lang in LANGUAGES:
                if lang not in entry['langs']:
                    continue
                translation = entry['langs'][lang]
                # The readable copy lives inside a C comment, so a literal
                # "*/" in the translation must not be allowed to close it.
                readable = translation.replace('*/', '* /')
                lang_block = '''
    .{} = {{ /*"{}"*/
'''.format(lang, readable).rstrip()
                for cp in CODEPAGES:
                    if not translation:
                        # Untranslated: leave the member zero-initialised.
                        continue
                    try:
                        assign = '"{}"'.format(_hex_escaped(translation.encode(cp)))
                    except UnicodeEncodeError:
                        # Not representable in this codepage.
                        assign = 'NULL'
                    lang_block += '\n        .{} = {},'.format(cp, assign)
                lang_block += '\n    },'
                cdef_file.write(lang_block)
            cdef_file.write('\n};\n')

        # ---- Ragel machine `find_map`: locate wrapped tags in the input ----
        find_map_def = ''.join(
            '\n        action match_{h} {{target=&__string_map_{h};}}'.format(h=h)
            for h in tagmap)
        alternatives = ''.join(
            '\n            "{tag}" % match_{h} |'.format(tag=tag, h=h)
            for h, tag in tagmap.items())
        # Drop the trailing '|' of the last alternative. Slicing the
        # alternatives alone keeps the opening parenthesis intact when there
        # are no tags.
        # NOTE(review): an empty table still yields `matches = ( );`, which
        # Ragel presumably rejects - confirm whether that case needs handling.
        find_map_def += '\n        matches =\n        (' + alternatives[:-1] + '\n        );'
        find_map_def += '\n        prologue = "{}";'.format(WRAP_PAIR[0])
        find_map_def += '\n        epilogue = "{}";'.format(WRAP_PAIR[1])
        find_map_def += r'''
        action Eno_translation{target=NULL; fhold; fgoto skip_no_match;}
        action Ebrak_all{CONCAT_STR(view_end);fbreak;}
        action matched{const char* translated = match_lang(target,lang,current_lang);CONCAT_STR(translated);fgoto main;}
        action skipped{fgoto main;}
        action set_view_end{if(!at_prologue){view_end=p;at_prologue=1;}}
        action set_view_begin{view_begin=p;}
        action done{view_end = p;CONCAT_VIEW;fbreak;}
        action do_strcat {CONCAT_VIEW;at_prologue = 0;fhold;fgoto entry;}
        skip_no_match := any* epilogue @skipped $eof(Ebrak_all);
        entry := matches $^Eno_translation epilogue @matched $^Eno_translation;
        main := any* >set_view_begin %eof(done) prologue >set_view_end %do_strcat any;
'''.rstrip()

        # ---- Ragel machine `match_lang`: pick language, then codepage ----
        match_lang_def = ''.join(
            '\n        action lang_{0}{{lang_ptr=&(target->{0});}}'.format(lang)
            for lang in LANGUAGES)
        # `%s` below is filled with the default (first) codepage.
        match_lang_def += '''
        action decay_cp{
            ret=lang_ptr->%s;
            _lang=strdup(p);
            for(char *_p=_lang;*++_p;){*_p=tolower(*_p);}
            /*remove '_' '-'*/
            int len = strlen(_lang);
            char * sign = memchr(_lang,'_',strlen(_lang));
            while(sign){
                memmove(sign,sign+1,_lang+len-sign);
                sign = memchr(_lang+1,'_',strlen(_lang));
            }
            sign = memchr(_lang+1,'-',strlen(_lang));
            while(sign){
                memmove(sign,sign+1,_lang+len-sign);
                sign = memchr(_lang+1,'-',strlen(_lang));
            }
            p = _lang;
            pe = _lang+strlen(_lang);
            eof = pe;
        }
'''.rstrip() % CODEPAGES[0]
        match_lang_def += '\n        main :='
        for lang, cp in itertools.product(LANGUAGES, CODEPAGES):
            # The language part is a regex so that '_' and '-' are interchangeable.
            match_lang_def += '''
            (/{m}/ @lang_{l} %eof{{ret=lang_ptr->{cp0};cl="{l}.{cp0}";}} '.' @decay_cp "{cp}" %eof{{if(lang_ptr->{cp}){{ret=lang_ptr->{cp};cl="{l}.{cp}";}}else{{ret=target->tag;cl=NULL;}}}})|
'''.format(m=lang.replace('_', r'[_\-]'), l=lang, cp0=CODEPAGES[0], cp=cp.lower()).rstrip()
        # Replace the trailing '|' of the last alternative with the terminator.
        match_lang_def = match_lang_def[:-1] + ';'

        # ---- C scaffolding around both machines (braces doubled for str.format) ----
        ragel_codes = r'''
static const char* match_lang(const string_map *target,const char* lang,const char** current_lang) {{
    char *_lang = NULL;
    //for (int i = 0; _lang[i]; ++i) {{_lang[i] = tolower(_lang[i]);}}
    int cs;
    const char *p = lang;
    const char *pe = lang+strlen(lang);
    const char *eof = pe;
    const codepage_map *lang_ptr = NULL;
    const char *ret = target->tag;
    const char *cl = NULL;
    %%{{
        machine match_lang;
        {fsm_match_lang_def}
        write data;
        write init;
        write exec;
    }}%%
    if(_lang){{free(_lang);}}
    if(current_lang){{*current_lang = cl;}}
    return ret;
}}
%%{{
    machine find_map;
    write data;
}}%%
__attribute__((visibility("default"))) char *get_translation(const char *in, const char *lang, const char** current_lang) {{
    int cs;
    const char *p = in;
    const char *pe = in+strlen(in);
    const char *eof = pe;
    const string_map *target = NULL;
    char *ret = NULL;
    size_t ret_len = 0;
    const char* view_begin = in;
    const char* view_end = in;
    int at_prologue = 0;
#define _CONCAT_WITH(A_SRC,A_LEN) \
    char *t = ret; \
    size_t l = A_LEN; \
    ret = (char*)malloc(ret_len+l+1); \
    if(t){{ \
        memcpy(ret,t,ret_len); \
        free(t); \
    }} \
    memcpy(ret+ret_len,A_SRC,l); \
    ret_len += l; \
    ret[ret_len]='\0'
#define CONCAT_STR(S) _CONCAT_WITH(S,strlen(S))
#define CONCAT_VIEW _CONCAT_WITH(view_begin,view_end-view_begin)
    %%{{
        machine find_map;
        alphtype int;
        {fsm_find_def}
        write init;
        write exec;
    }}%%
    return ret;
}}
#pragma GCC diagnostic pop
/* NOTE: copy these declarations below to your include header */
char *get_translation(const char *, const char *, const char **);
#define TRANSLATION_LANGUAGE_VARIABLE(LANG) const char *__to_language = LANG, *__translated_to = NULL
'''.format(fsm_find_def=find_map_def, fsm_match_lang_def=match_lang_def).rstrip()
        cdef_file.write(ragel_codes)
        cdef_file.write('\n#endif')
if __name__ == '__main__':
    # At least one file to scan is required; otherwise show help and fail.
    if len(sys.argv) < 2:
        usage()
        # sys.exit, not the bare exit(): the latter is injected by the `site`
        # module for interactive use and is missing under `python -S` or in
        # frozen/embedded interpreters.
        sys.exit(1)
    # Refresh translation.json first, then compile the table to translation.rl.
    table = make_translation()
    make_fsm_definition(table)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment