Skip to content

Instantly share code, notes, and snippets.

@pietrobraione
Last active January 5, 2019 19:36
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save pietrobraione/895fe84dc61c8dfb6957f4be34dea3b2 to your computer and use it in GitHub Desktop.
Save pietrobraione/895fe84dc61c8dfb6957f4be34dea3b2 to your computer and use it in GitHub Desktop.
/* This gist shows how to build an almost zero-assembly template compiler from a threaded
* interpreter.
* See http://eli.thegreenplace.net/2012/07/12/computed-goto-for-efficient-dispatch-tables
* for the original threaded code interpreter. To illustrate compilation of jumps I added
* a couple of jump bytecodes. Works under macOS High Sierra (clang) and Ubuntu 18.04 (gcc),
* versions:
*
* $ clang --version
* Apple LLVM version 10.0.0 (clang-1000.11.45.5)
* Target: x86_64-apple-darwin17.7.0
*
* $ gcc --version
* gcc (Ubuntu 7.3.0-27ubuntu1~18.04) 7.3.0
*
* Note that with gcc seems to work also when compiled with optimizations, except for gcc -O1.
* With clang optimizations make mmap fail, don't know why.
*/
#define DEBUG
#define _DEFAULT_SOURCE
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <sys/mman.h>
#define OP_INC 0x0
#define OP_DEC 0x1
#define OP_MUL2 0x2
#define OP_DIV2 0x3
#define OP_JMP 0x4
#define OP_BRZ 0x5
#define OP_HALT 0x6
#define RET 0x7 /* not a bytecode! */
/* I2I_DISPATCH: from interpreted to interpreted
* A2C_DISPATCH: from interpreted/compiled to compiled
* C2I_DISPATCH: from compiled to interpreted
* I2A_DISPATCH: from interpreted to interpreted/compiled
* HALT: from interpreted/compiled to interpreted (only return block)
*/
#define I2I_DISPATCH goto *dispatch_table[code[pc]]
#define A2C_DISPATCH __asm__ volatile("jmp *%0" : : "r" (_tgt_table[pc]), "r" (pc), "r" (_tgt_table))
#define C2I_DISPATCH __asm__ volatile("jmp *%0" : : "r" (_dispatch_table[code[pc]]), "r" (pc), "r" (code), "r" (_dispatch_table))
#define I2A_DISPATCH \
if (_tgt_table != 0 && _tgt_table[pc] != 0) { \
A2C_DISPATCH; \
} else { \
I2I_DISPATCH; \
}
#define HALT __asm__ volatile("jmp *%0" : : "r" (_dispatch_table[RET]), "r" (pc), "r" (code), "r" (_dispatch_table))
/* pointers to binary code templates */
/* templates for bytecodes */
void* *entry_table;
void* *exit_table;
/* templates for trampolines */
void* jump_trampoline_entry;
void* jump_trampoline_exit;
void* breakout_trampoline_entry;
void* breakout_trampoline_exit;
/* how big can be a code template? */
unsigned long max_template_size;
/* outputs of the compiler */
/* associates each bytecode in the source code with the address of its
* translation in the binary
*/
void* *tgt_table;
/* the root of the compiled code; this pointer is not used right now,
* but it could be used to unmap the allocated memory
*/
char *binary = 0;
/* the size of the compiled code */
unsigned int compiled_code_size = 0;
/* forward declarations */
void calc_max_template_size(void);
void compile(char*, unsigned int, unsigned int, unsigned int);
/* execute bytecode */
int exec(char* code, int initval) {
int pc = 0;
int val = initval;
void* *_tgt_table = tgt_table; /* necessary to force absolute addressing in jit dispatch asm */
/* The indices of labels in the dispatch_table are the relevant opcodes */
static void* dispatch_table[] = {
&&do_inc, &&do_dec, &&do_mul2, &&do_div2, &&do_jmp, &&do_brz, &&do_halt, &&do_ret};
static void* _exit_table[] = {
&&do_inc_end, &&do_dec_end, &&do_mul2_end, &&do_div2_end, &&do_jmp_end, &&do_brz_end, &&do_halt_end};
void* *_dispatch_table = dispatch_table; /* necessary to force absolute addressing in breakout dispatch asm */
/* prepares global environment for compiler
* (it would be better doing it once)
*/
entry_table = dispatch_table;
exit_table = _exit_table;
jump_trampoline_entry = &&jump_trampoline;
jump_trampoline_exit = &&jump_trampoline_end;
breakout_trampoline_entry = &&breakout_trampoline;
breakout_trampoline_exit = &&breakout_trampoline_end;
calc_max_template_size();
/* starts */
I2A_DISPATCH;
/* some code templates for the compiler,
* not used by the interpreter
*/
jump_trampoline:
if (_tgt_table[pc] != 0) {
A2C_DISPATCH;
} else {
C2I_DISPATCH;
}
jump_trampoline_end:
breakout_trampoline:
C2I_DISPATCH;
breakout_trampoline_end:
/* here come all the bytecode operations */
do_inc:
++val;
++pc;
do_inc_end:
I2A_DISPATCH;
do_dec:
--val;
++pc;
do_dec_end:
I2A_DISPATCH;
do_mul2:
val *= 2;
++pc;
do_mul2_end:
I2A_DISPATCH;
do_div2:
val /= 2;
++pc;
do_div2_end:
I2A_DISPATCH;
do_jmp:
pc += code[pc + 1];
do_jmp_end:
I2A_DISPATCH;
do_brz:
if (val == 0) {
pc += code[pc + 1];
} else {
pc += 2;
}
do_brz_end:
I2A_DISPATCH;
do_halt:
HALT;
do_halt_end:
do_ret:
return val; /* this MUST be the last statement, or the block will not be contiguous! */
}
/* calculates the maximum size of the binary code blocks
* that implement the semantics of the bytecodes
*/
void calc_max_template_size(void) {
max_template_size = 0;
for (int i = 0; i < 7; ++i) {
long binary_block_size = exit_table[i] - entry_table[i];
if (binary_block_size > max_template_size) {
max_template_size = binary_block_size;
}
}
/* (to be safe we should also add to max_template_size the size of the
* trampolines)
*/
}
/* compiles to binary */
void compile(char *code, unsigned int code_size, unsigned int start, unsigned int end) {
/* does some basic sanity checks */
if (end > code_size || start >= end) {
return;
}
/* allocates a big enough buffer */
char *code_buffer =
(char *) mmap(0, (end - start) * sizeof(char) * max_template_size,
PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
if (code_buffer == MAP_FAILED) {
printf("FAILED ALLOCATION OF CODE BUFFER -- exiting\n");
exit(1);
}
/* allocates and initializes the tgt_table */
tgt_table = malloc(code_size * sizeof(void *));
if (tgt_table == 0) {
printf("FAILED ALLOCATION OF TARGET TABLE -- exiting\n");
exit(1);
}
for (unsigned long long i = 0; i < code_size; ++i) {
tgt_table[i] = 0;
}
/* dispatches on source code and copies the
* corresponding blocks in a buffer; at the
* same times builds the target table
*/
char *to = code_buffer;
for (unsigned long long pc = start; pc < end; ++pc) {
char cur_bytecode = code[pc];
/* updates tgt_table */
tgt_table[pc] = to;
/* appends in code_buffer the translation of the bytecode */
for (char *from = (char *) entry_table[cur_bytecode];
from != (char *) exit_table[cur_bytecode]; ++from, ++to) {
*to = *from;
++compiled_code_size;
}
/* if the bytecode is jump/brz also appends the trampoline,
* and increments pc by one, to keep into account for the operand
*/
if (cur_bytecode == OP_JMP || cur_bytecode == OP_BRZ) {
for (char *from = (char *) jump_trampoline_entry;
from != (char *) jump_trampoline_exit; ++from, ++to) {
*to = *from;
++compiled_code_size;
}
++pc;
}
/* if this is the last bytecode and it is not a halt or
* a jump, appends the breakout trampoline
*/
if (pc == end - 1 &&
cur_bytecode != OP_HALT &&
cur_bytecode != OP_JMP &&
cur_bytecode != OP_BRZ) {
for (char *from = (char *) breakout_trampoline_entry;
from != (char *) breakout_trampoline_exit; ++from, ++to) {
*to = *from;
++compiled_code_size;
}
}
}
/* makes the buffer executable */
mprotect(code_buffer, compiled_code_size, PROT_READ | PROT_EXEC);
/* stores the root to the compiled code */
binary = code_buffer;
}
#define DUMP_SOURCE \
do { \
for (int i = 0; i < code_size; ++i) { \
char* dis[] = { "inc", "dec", "mul2", "div2", "jmp", "brz", "halt" }; \
printf("%d %s", i, dis[code[i]]); \
if (code[i] == OP_JMP || code[i] == OP_BRZ) { \
printf(" %d", code[++i]); \
} \
printf("\n");\
} \
printf("\n"); \
} while (0)
#define DUMP_BINARY \
do { \
printf("\n"); \
int i; \
for (i = 0; i < compiled_code_size; ++i) { \
printf("%02hhX ", binary[i]); \
if (i % 16 == 15) { \
printf("\n"); \
} \
} \
if (i % 16 != 0) { \
printf("\n"); \
} \
printf("\n"); \
} while (0)
int main(void) {
unsigned int code_size;
char *code;
unsigned int code_start_jit, code_end_jit;
int initval;
#if 0
/* generates some random code (without jumps because it may diverge) */
code_size = 10000;
code = malloc(code_size * sizeof(char));
srand(2);
for (unsigned long long i = 0; i < code_size - 1; ++i) {
code[i] = (rand() % 4); /* all bytecodes except jumps and halt */
}
code[code_size - 1] = OP_HALT;
code_start_jit = 0; code_end_jit = code_size;
initval = 0;
#endif
#if 0
/* alternatively, generates a very silly example with brz */
code_size = 4;
code = malloc(code_size * sizeof(char));
code[0] = OP_BRZ;
code[1] = 3;
code[2] = OP_INC;
code[3] = OP_HALT;
code_start_jit = 0; code_end_jit = code_size;
initval = 0;
#endif
/* alternatively, generates an example with a long loop and
* jits a part of the loop body
*/
code_size = 19;
code = malloc(code_size * sizeof(char));
code[0] = OP_BRZ;
code[1] = 18; /* goes to halt */
code[2] = OP_DEC;
code[3] = OP_DEC;
code[4] = OP_DEC;
code[5] = OP_DEC;
code[6] = OP_DEC;
code[7] = OP_DEC;
code[8] = OP_DEC;
code[9] = OP_DEC;
code[10] = OP_DEC;
code[11] = OP_DEC;
code[12] = OP_INC;
code[13] = OP_DEC;
code[14] = OP_INC;
code[15] = OP_DEC; /* subtracts 10 to the register at each iteration*/
code[16] = OP_JMP;
code[17] = -16;
code[18] = OP_HALT;
code_start_jit = 2; code_end_jit = 10;
initval = 1000000;
#if defined DEBUG
DUMP_SOURCE;
#endif
clock_t start, end;
int result;
/* first runs the code with the threaded interpreter... */
start = clock();
result = exec(code, initval);
end = clock();
printf("result: %d\ttime: %f sec\n", result, ((double) (end - start)) / CLOCKS_PER_SEC);
/* ...then compiles to binary... */
start = clock();
compile(code, code_size, code_start_jit, code_end_jit);
end = clock();
printf("compiled\ttime: %f sec\n", ((double) (end - start)) / CLOCKS_PER_SEC);
#if defined DEBUG
DUMP_BINARY;
#endif
/* ...and runs the compiled code */
start = clock();
result = exec(code, initval);
end = clock();
printf("result: %d\ttime: %f sec\n", result, ((double) (end - start)) / CLOCKS_PER_SEC);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment