Skip to content

Instantly share code, notes, and snippets.

@pnck
Created March 19, 2019 13:36
Show Gist options
  • Save pnck/88324ab6d19e2458056ac6998a81c21d to your computer and use it in GitHub Desktop.
Save pnck/88324ab6d19e2458056ac6998a81c21d to your computer and use it in GitHub Desktop.
Scan files for specially tagged strings and generate a Ragel source that replaces each tag with its translation.
#!/usr/bin/env python3
import sys
import re
import json
import hashlib
import itertools
import binascii
# Locales for which every scanned tag gets a translation slot.
LANGUAGES = ['zh_CN', 'en_US']
# Encodings emitted for each translation; the first entry is the one used when
# a language is requested without an explicit codepage.
CODEPAGES = ['utf8', 'latin1']
# (opening, closing) markers that delimit a translatable tag in scanned files.
WRAP_PAIR = ('%%T{', '}%%')
def usage():
    """Print the command-line help text to standard output.

    The text names the invoking script (``sys.argv[0]``), shows the tag
    wrapper taken from ``WRAP_PAIR`` and outlines the scan / translate /
    regenerate workflow.
    """
    help_lines = (
        'usage: {} file1 [file2 ...]\n'.format(sys.argv[0]),
        'Any literal string wrapped by `{0}{1}` in the specified files will be scanned out,\n'.format(*WRAP_PAIR)
        + 'and a translation table file named `translation.json` will be generated from the scan result.',
        'Do translation job with `translation.json` then run the scanner again.',
        'The translation table will be convert into a C-like source file (actually, a Ragel source) `translation.rl`.',
        'Just compile (ragel required) and link the generated file as normal C source and everything is done.',
    )
    for help_line in help_lines:
        print(help_line)
def _load_translation_table(path):
    """Return the translation table stored at `path`, or `{}` if unusable.

    A missing or empty file yields an empty table.  If the file has content
    that is not a UTF-8 encoded JSON object, the traceback is printed, the
    raw bytes are copied to `path + '.bak'` (so existing translations are not
    lost when the table is rewritten) and an empty table is returned.
    """
    try:
        with open(path, 'rb') as existing:
            raw = existing.read()
    except FileNotFoundError:
        return {}
    if not raw.strip():
        return {}
    try:
        table = json.loads(raw.decode('utf-8'))
        if not isinstance(table, dict):
            raise ValueError('top-level JSON value of {} is not an object'.format(path))
        return table
    except ValueError:
        # UnicodeDecodeError and json.JSONDecodeError are both ValueErrors.
        import traceback
        traceback.print_exc()
        backup_path = path + '.bak'
        with open(backup_path, 'wb') as backup:
            backup.write(raw)
        print('{} is malformed; its previous content was saved to {}'.format(path, backup_path),
              file=sys.stderr)
        return {}


def make_translation():
    """Scan the files named in `sys.argv[1:]` and refresh `translation.json`.

    Every non-empty string wrapped by `WRAP_PAIR` is recorded under its file
    name together with the `(start, end)` span of the match (stored as a
    string).  Entries already present in `translation.json` are kept, every
    tag gets an empty-string slot for each language in `LANGUAGES` it does
    not have yet, and the merged table is written back as UTF-8 JSON.

    Returns:
        dict: `{file name: {tag: {'span': str, <language>: translation}}}`.

    Raises:
        OSError: if a scanned file or `translation.json` cannot be read or
            written (a missing `translation.json` is not an error).
    """
    table_path = 'translation.json'
    translate_table = _load_translation_table(table_path)

    # Escape the markers so they are always matched literally, and compile
    # once instead of once per scanned file.  DOTALL lets a tag span lines.
    tag_pattern = re.compile(
        '{0}(.*?){1}'.format(*map(re.escape, WRAP_PAIR)), re.DOTALL)

    for fname in sys.argv[1:]:
        file_entries = translate_table.setdefault(fname, {})
        with open(fname, 'rt', encoding='utf-8') as loaded:
            src = loaded.read()
        for macro_wrapped in tag_pattern.finditer(src):
            tag = macro_wrapped.group(1)
            if not tag:
                continue
            # Refresh the span but keep any translations already present.
            file_entries.setdefault(tag, {})['span'] = str(macro_wrapped.span())

    # Add an empty translation for newly added tags / languages.
    for entries in translate_table.values():
        for translations in entries.values():
            for lang in LANGUAGES:
                translations.setdefault(lang, '')

    with open(table_path, 'w', encoding='utf-8') as translation_file:
        json.dump(translate_table, translation_file, ensure_ascii=False, indent=2)
    return translate_table
def _collect_unique_tags(translate_table):
    """Merge the per-file entries of `translate_table` into one record per tag.

    Returns a dict, ordered by first occurrence, mapping each tag to
    `{'locations': [(file, span), ...], 'translations': {language: text}}`.
    For every language the first non-empty translation found wins; a language
    is present in `translations` only if at least one entry carried it.
    """
    unique = {}
    for fname, entries in translate_table.items():
        for tag, info in entries.items():
            record = unique.setdefault(tag, {'locations': [], 'translations': {}})
            if 'span' in info:
                record['locations'].append((fname, info['span']))
            for lang in LANGUAGES:
                if lang in info and not record['translations'].get(lang):
                    record['translations'][lang] = info[lang]
    return unique


def make_fsm_definition(translate_table):
    """Write the Ragel source `translation.rl` for `translate_table`.

    The generated file contains one `string_map` constant per distinct tag
    (named after the md5 hex digest of the tag), a `match_lang` state machine
    that picks a language/codepage entry, and the `get_translation` state
    machine that replaces wrapped tags in an input string.

    A tag that occurs in several files is emitted only once, because a second
    `static const string_map __string_map_<md5>` would be a redefinition in
    the generated C; its translations are merged across those files.

    Tag text is inserted into the C and Ragel string literals verbatim.
    NOTE(review): this presumably relies on tags being scanned out of C string
    literals and therefore already being C-escaped - confirm before adding
    any escaping here.

    Args:
        translate_table: mapping `{file: {tag: {'span': str, language: str}}}`
            as returned by `make_translation`.
    """
    unique_tags = _collect_unique_tags(translate_table)
    # md5 only serves to derive an identifier-safe suffix from the tag text.
    tagmap = {hashlib.md5(tag.encode('utf8')).hexdigest(): tag for tag in unique_tags}

    with open('translation.rl', 'wt', encoding='utf-8') as cdef_file:
        cdef_file.write('''
#ifndef TRANSLATOR_GENERATED
#define TRANSLATOR_GENERATED
#include <string.h>
#include <ctype.h>
#include <stdlib.h>
#include <stdio.h>
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-const-variable"
typedef struct _lang_codepage_map {
'''.strip())
        cdef_file.writelines('\n const char *{};'.format(cp) for cp in CODEPAGES)
        cdef_file.write('\n' + '''
} codepage_map;
typedef struct _translated_string_map {
const char *tag; // original tag
'''.strip())
        cdef_file.writelines('\n const codepage_map {};'.format(lang) for lang in LANGUAGES)
        cdef_file.write('\n} string_map;\n\n')

        # One string_map constant per distinct tag.
        for h, tag in tagmap.items():
            record = unique_tags[tag]
            cdef_file.write('''
static const string_map __string_map_{h} = {{
.tag = "{t}",
'''.rstrip().format(h=h, t=tag))
            for fname, span in record['locations']:
                cdef_file.write('\n /* {file}:{span} */'.format(file=fname, span=span))
            for lang in LANGUAGES:
                if lang not in record['translations']:
                    continue
                translation = record['translations'][lang]
                # The translation is echoed inside a C comment; break up any
                # `*/` so it cannot terminate that comment early.
                lang_block = '''
.{} = {{ /*"{}"*/
'''.format(lang, translation.replace('*/', '* /')).rstrip()
                if translation:
                    for cp in CODEPAGES:
                        try:
                            encoded = translation.encode(cp)
                        except UnicodeEncodeError:
                            # Not representable in this codepage.
                            lang_block += '\n .{} = NULL,'.format(cp)
                            continue
                        # Emit every byte as a \xNN escape.
                        hexstr = ''.join('\\x{:02x}'.format(byte) for byte in encoded)
                        lang_block += '\n .{} = "{}",'.format(cp, hexstr)
                lang_block += '\n },'
                cdef_file.write(lang_block)
            cdef_file.write('\n};\n')

        # Ragel definitions for the tag-finding machine.
        find_map_def = ''.join(
            '\n action match_{h} {{target=&__string_map_{h};}}'.format(h=h) for h in tagmap)
        find_map_def += '\n matches =\n ('
        # NOTE(review): with no tags at all this alternation is empty, which
        # Ragel presumably rejects - confirm whether that case needs support.
        find_map_def += '|'.join(
            '\n "{tag}" % match_{h} '.format(tag=tag, h=h) for h, tag in tagmap.items())
        find_map_def += '\n );'
        find_map_def += '\n prologue = "{}";'.format(WRAP_PAIR[0])
        find_map_def += '\n epilogue = "{}";'.format(WRAP_PAIR[1])
        find_map_def += r'''
action Eno_translation{target=NULL; fhold; fgoto skip_no_match;}
action Ebrak_all{CONCAT_STR(view_end);fbreak;}
action matched{const char* translated = match_lang(target,lang,current_lang);CONCAT_STR(translated);fgoto main;}
action skipped{fgoto main;}
action set_view_end{if(!at_prologue){view_end=p;at_prologue=1;}}
action set_view_begin{view_begin=p;}
action done{view_end = p;CONCAT_VIEW;fbreak;}
action do_strcat {CONCAT_VIEW;at_prologue = 0;fhold;fgoto entry;}
skip_no_match := any* epilogue @skipped $eof(Ebrak_all);
entry := matches $^Eno_translation epilogue @matched $^Eno_translation;
main := any* >set_view_begin %eof(done) prologue >set_view_end %do_strcat any;
'''.rstrip()

        # Ragel definitions for the language/codepage selection machine.
        match_lang_def = ''
        for lang in LANGUAGES:
            match_lang_def += '\n action lang_{0}{{lang_ptr=&(target->{0});}}'.format(lang)
        match_lang_def += '''
action decay_cp{
ret=lang_ptr->%s;
_lang=strdup(p);
for(char *_p=_lang;*++_p;){*_p=tolower(*_p);}
/*remove '_' '-'*/
int len = strlen(_lang);
char * sign = memchr(_lang,'_',strlen(_lang));
while(sign){
memmove(sign,sign+1,_lang+len-sign);
sign = memchr(_lang+1,'_',strlen(_lang));
}
sign = memchr(_lang+1,'-',strlen(_lang));
while(sign){
memmove(sign,sign+1,_lang+len-sign);
sign = memchr(_lang+1,'-',strlen(_lang));
}
p = _lang;
pe = _lang+strlen(_lang);
eof = pe;
}
'''.rstrip() % CODEPAGES[0]
        match_lang_def += '\n main :='
        for lang, cp in itertools.product(LANGUAGES, CODEPAGES):
            match_lang_def += '''
(/{m}/ @lang_{l} %eof{{ret=lang_ptr->{cp0};cl="{l}.{cp0}";}} '.' @decay_cp "{cp}" %eof{{if(lang_ptr->{cp}){{ret=lang_ptr->{cp};cl="{l}.{cp}";}}else{{ret=target->tag;cl=NULL;}}}})|
'''.format(m=lang.replace('_', r'[_\-]'), l=lang, cp0=CODEPAGES[0], cp=cp.lower()).rstrip()
        # Replace the trailing `|` of the last alternative with the terminator.
        match_lang_def = match_lang_def[:-1] + ';'

        ragel_codes = r'''
static const char* match_lang(const string_map *target,const char* lang,const char** current_lang) {{
char *_lang = NULL;
//for (int i = 0; _lang[i]; ++i) {{_lang[i] = tolower(_lang[i]);}}
int cs;
const char *p = lang;
const char *pe = lang+strlen(lang);
const char *eof = pe;
const codepage_map *lang_ptr = NULL;
const char *ret = target->tag;
const char *cl = NULL;
%%{{
machine match_lang;
{fsm_match_lang_def}
write data;
write init;
write exec;
}}%%
if(_lang){{free(_lang);}}
if(current_lang){{*current_lang = cl;}}
return ret;
}}
%%{{
machine find_map;
write data;
}}%%
__attribute__((visibility("default"))) char *get_translation(const char *in, const char *lang, const char** current_lang) {{
int cs;
const char *p = in;
const char *pe = in+strlen(in);
const char *eof = pe;
const string_map *target = NULL;
char *ret = NULL;
size_t ret_len = 0;
const char* view_begin = in;
const char* view_end = in;
int at_prologue = 0;
#define _CONCAT_WITH(A_SRC,A_LEN) \
char *t = ret; \
size_t l = A_LEN; \
ret = (char*)malloc(ret_len+l+1); \
if(t){{ \
memcpy(ret,t,ret_len); \
free(t); \
}} \
memcpy(ret+ret_len,A_SRC,l); \
ret_len += l; \
ret[ret_len]='\0'
#define CONCAT_STR(S) _CONCAT_WITH(S,strlen(S))
#define CONCAT_VIEW _CONCAT_WITH(view_begin,view_end-view_begin)
%%{{
machine find_map;
alphtype int;
{fsm_find_def}
write init;
write exec;
}}%%
return ret;
}}
#pragma GCC diagnostic pop
/* NOTE: copy these declarations below to your include header */
char *get_translation(const char *, const char *, const char **);
#define TRANSLATION_LANGUAGE_VARIABLE(LANG) const char *__to_language = LANG, *__translated_to = NULL
'''.format(fsm_find_def=find_map_def, fsm_match_lang_def=match_lang_def).rstrip()
        cdef_file.write(ragel_codes)
        cdef_file.write('\n#endif')
# Script entry point: require at least one file to scan, then refresh the
# translation table and regenerate the Ragel source from it.
if __name__ == '__main__':
    if len(sys.argv) < 2:
        usage()
        # Use sys.exit rather than the bare `exit` helper, which is injected
        # by the `site` module and is not guaranteed to exist (e.g. `python -S`).
        sys.exit(1)
    table = make_translation()
    make_fsm_definition(table)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment