Last active
January 5, 2019 19:36
-
-
Save pietrobraione/895fe84dc61c8dfb6957f4be34dea3b2 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* This gist shows how to build an almost zero-assembly template compiler from a threaded | |
* interpreter. | |
* See http://eli.thegreenplace.net/2012/07/12/computed-goto-for-efficient-dispatch-tables | |
* for the original threaded code interpreter. To illustrate compilation of jumps I added | |
* a couple of jump bytecodes. Works under macOS High Sierra (clang) and Ubuntu 18.04 (gcc), | |
* versions: | |
* | |
* $ clang --version | |
* Apple LLVM version 10.0.0 (clang-1000.11.45.5) | |
* Target: x86_64-apple-darwin17.7.0 | |
* | |
* $ gcc --version | |
* gcc (Ubuntu 7.3.0-27ubuntu1~18.04) 7.3.0 | |
* | |
* Note that with gcc seems to work also when compiled with optimizations, except for gcc -O1. | |
* With clang optimizations make mmap fail, don't know why. | |
*/ | |
#define DEBUG | |
#define _DEFAULT_SOURCE | |
#include <stdlib.h> | |
#include <stdio.h> | |
#include <time.h> | |
#include <sys/mman.h> | |
#define OP_INC 0x0 | |
#define OP_DEC 0x1 | |
#define OP_MUL2 0x2 | |
#define OP_DIV2 0x3 | |
#define OP_JMP 0x4 | |
#define OP_BRZ 0x5 | |
#define OP_HALT 0x6 | |
#define RET 0x7 /* not a bytecode! */ | |
/* I2I_DISPATCH: from interpreted to interpreted | |
* A2C_DISPATCH: from interpreted/compiled to compiled | |
* C2I_DISPATCH: from compiled to interpreted | |
* I2A_DISPATCH: from interpreted to interpreted/compiled | |
* HALT: from interpreted/compiled to interpreted (only return block) | |
*/ | |
#define I2I_DISPATCH goto *dispatch_table[code[pc]] | |
#define A2C_DISPATCH __asm__ volatile("jmp *%0" : : "r" (_tgt_table[pc]), "r" (pc), "r" (_tgt_table)) | |
#define C2I_DISPATCH __asm__ volatile("jmp *%0" : : "r" (_dispatch_table[code[pc]]), "r" (pc), "r" (code), "r" (_dispatch_table)) | |
#define I2A_DISPATCH \ | |
if (_tgt_table != 0 && _tgt_table[pc] != 0) { \ | |
A2C_DISPATCH; \ | |
} else { \ | |
I2I_DISPATCH; \ | |
} | |
#define HALT __asm__ volatile("jmp *%0" : : "r" (_dispatch_table[RET]), "r" (pc), "r" (code), "r" (_dispatch_table)) | |
/* pointers to binary code templates */ | |
/* templates for bytecodes */ | |
void* *entry_table; | |
void* *exit_table; | |
/* templates for trampolines */ | |
void* jump_trampoline_entry; | |
void* jump_trampoline_exit; | |
void* breakout_trampoline_entry; | |
void* breakout_trampoline_exit; | |
/* how big can be a code template? */ | |
unsigned long max_template_size; | |
/* outputs of the compiler */ | |
/* associates each bytecode in the source code with the address of its | |
* translation in the binary | |
*/ | |
void* *tgt_table; | |
/* the root of the compiled code; this pointer is not used right now, | |
* but it could be used to unmap the allocated memory | |
*/ | |
char *binary = 0; | |
/* the size of the compiled code */ | |
unsigned int compiled_code_size = 0; | |
/* forward declarations */ | |
void calc_max_template_size(void); | |
void compile(char*, unsigned int, unsigned int, unsigned int); | |
/* execute bytecode */ | |
int exec(char* code, int initval) { | |
int pc = 0; | |
int val = initval; | |
void* *_tgt_table = tgt_table; /* necessary to force absolute addressing in jit dispatch asm */ | |
/* The indices of labels in the dispatch_table are the relevant opcodes */ | |
static void* dispatch_table[] = { | |
&&do_inc, &&do_dec, &&do_mul2, &&do_div2, &&do_jmp, &&do_brz, &&do_halt, &&do_ret}; | |
static void* _exit_table[] = { | |
&&do_inc_end, &&do_dec_end, &&do_mul2_end, &&do_div2_end, &&do_jmp_end, &&do_brz_end, &&do_halt_end}; | |
void* *_dispatch_table = dispatch_table; /* necessary to force absolute addressing in breakout dispatch asm */ | |
/* prepares global environment for compiler | |
* (it would be better doing it once) | |
*/ | |
entry_table = dispatch_table; | |
exit_table = _exit_table; | |
jump_trampoline_entry = &&jump_trampoline; | |
jump_trampoline_exit = &&jump_trampoline_end; | |
breakout_trampoline_entry = &&breakout_trampoline; | |
breakout_trampoline_exit = &&breakout_trampoline_end; | |
calc_max_template_size(); | |
/* starts */ | |
I2A_DISPATCH; | |
/* some code templates for the compiler, | |
* not used by the interpreter | |
*/ | |
jump_trampoline: | |
if (_tgt_table[pc] != 0) { | |
A2C_DISPATCH; | |
} else { | |
C2I_DISPATCH; | |
} | |
jump_trampoline_end: | |
breakout_trampoline: | |
C2I_DISPATCH; | |
breakout_trampoline_end: | |
/* here come all the bytecode operations */ | |
do_inc: | |
++val; | |
++pc; | |
do_inc_end: | |
I2A_DISPATCH; | |
do_dec: | |
--val; | |
++pc; | |
do_dec_end: | |
I2A_DISPATCH; | |
do_mul2: | |
val *= 2; | |
++pc; | |
do_mul2_end: | |
I2A_DISPATCH; | |
do_div2: | |
val /= 2; | |
++pc; | |
do_div2_end: | |
I2A_DISPATCH; | |
do_jmp: | |
pc += code[pc + 1]; | |
do_jmp_end: | |
I2A_DISPATCH; | |
do_brz: | |
if (val == 0) { | |
pc += code[pc + 1]; | |
} else { | |
pc += 2; | |
} | |
do_brz_end: | |
I2A_DISPATCH; | |
do_halt: | |
HALT; | |
do_halt_end: | |
do_ret: | |
return val; /* this MUST be the last statement, or the block will not be contiguous! */ | |
} | |
/* calculates the maximum size of the binary code blocks | |
* that implement the semantics of the bytecodes | |
*/ | |
void calc_max_template_size(void) { | |
max_template_size = 0; | |
for (int i = 0; i < 7; ++i) { | |
long binary_block_size = exit_table[i] - entry_table[i]; | |
if (binary_block_size > max_template_size) { | |
max_template_size = binary_block_size; | |
} | |
} | |
/* (to be safe we should also add to max_template_size the size of the | |
* trampolines) | |
*/ | |
} | |
/* compiles to binary */ | |
void compile(char *code, unsigned int code_size, unsigned int start, unsigned int end) { | |
/* does some basic sanity checks */ | |
if (end > code_size || start >= end) { | |
return; | |
} | |
/* allocates a big enough buffer */ | |
char *code_buffer = | |
(char *) mmap(0, (end - start) * sizeof(char) * max_template_size, | |
PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); | |
if (code_buffer == MAP_FAILED) { | |
printf("FAILED ALLOCATION OF CODE BUFFER -- exiting\n"); | |
exit(1); | |
} | |
/* allocates and initializes the tgt_table */ | |
tgt_table = malloc(code_size * sizeof(void *)); | |
if (tgt_table == 0) { | |
printf("FAILED ALLOCATION OF TARGET TABLE -- exiting\n"); | |
exit(1); | |
} | |
for (unsigned long long i = 0; i < code_size; ++i) { | |
tgt_table[i] = 0; | |
} | |
/* dispatches on source code and copies the | |
* corresponding blocks in a buffer; at the | |
* same times builds the target table | |
*/ | |
char *to = code_buffer; | |
for (unsigned long long pc = start; pc < end; ++pc) { | |
char cur_bytecode = code[pc]; | |
/* updates tgt_table */ | |
tgt_table[pc] = to; | |
/* appends in code_buffer the translation of the bytecode */ | |
for (char *from = (char *) entry_table[cur_bytecode]; | |
from != (char *) exit_table[cur_bytecode]; ++from, ++to) { | |
*to = *from; | |
++compiled_code_size; | |
} | |
/* if the bytecode is jump/brz also appends the trampoline, | |
* and increments pc by one, to keep into account for the operand | |
*/ | |
if (cur_bytecode == OP_JMP || cur_bytecode == OP_BRZ) { | |
for (char *from = (char *) jump_trampoline_entry; | |
from != (char *) jump_trampoline_exit; ++from, ++to) { | |
*to = *from; | |
++compiled_code_size; | |
} | |
++pc; | |
} | |
/* if this is the last bytecode and it is not a halt or | |
* a jump, appends the breakout trampoline | |
*/ | |
if (pc == end - 1 && | |
cur_bytecode != OP_HALT && | |
cur_bytecode != OP_JMP && | |
cur_bytecode != OP_BRZ) { | |
for (char *from = (char *) breakout_trampoline_entry; | |
from != (char *) breakout_trampoline_exit; ++from, ++to) { | |
*to = *from; | |
++compiled_code_size; | |
} | |
} | |
} | |
/* makes the buffer executable */ | |
mprotect(code_buffer, compiled_code_size, PROT_READ | PROT_EXEC); | |
/* stores the root to the compiled code */ | |
binary = code_buffer; | |
} | |
#define DUMP_SOURCE \ | |
do { \ | |
for (int i = 0; i < code_size; ++i) { \ | |
char* dis[] = { "inc", "dec", "mul2", "div2", "jmp", "brz", "halt" }; \ | |
printf("%d %s", i, dis[code[i]]); \ | |
if (code[i] == OP_JMP || code[i] == OP_BRZ) { \ | |
printf(" %d", code[++i]); \ | |
} \ | |
printf("\n");\ | |
} \ | |
printf("\n"); \ | |
} while (0) | |
#define DUMP_BINARY \ | |
do { \ | |
printf("\n"); \ | |
int i; \ | |
for (i = 0; i < compiled_code_size; ++i) { \ | |
printf("%02hhX ", binary[i]); \ | |
if (i % 16 == 15) { \ | |
printf("\n"); \ | |
} \ | |
} \ | |
if (i % 16 != 0) { \ | |
printf("\n"); \ | |
} \ | |
printf("\n"); \ | |
} while (0) | |
int main(void) { | |
unsigned int code_size; | |
char *code; | |
unsigned int code_start_jit, code_end_jit; | |
int initval; | |
#if 0 | |
/* generates some random code (without jumps because it may diverge) */ | |
code_size = 10000; | |
code = malloc(code_size * sizeof(char)); | |
srand(2); | |
for (unsigned long long i = 0; i < code_size - 1; ++i) { | |
code[i] = (rand() % 4); /* all bytecodes except jumps and halt */ | |
} | |
code[code_size - 1] = OP_HALT; | |
code_start_jit = 0; code_end_jit = code_size; | |
initval = 0; | |
#endif | |
#if 0 | |
/* alternatively, generates a very silly example with brz */ | |
code_size = 4; | |
code = malloc(code_size * sizeof(char)); | |
code[0] = OP_BRZ; | |
code[1] = 3; | |
code[2] = OP_INC; | |
code[3] = OP_HALT; | |
code_start_jit = 0; code_end_jit = code_size; | |
initval = 0; | |
#endif | |
/* alternatively, generates an example with a long loop and | |
* jits a part of the loop body | |
*/ | |
code_size = 19; | |
code = malloc(code_size * sizeof(char)); | |
code[0] = OP_BRZ; | |
code[1] = 18; /* goes to halt */ | |
code[2] = OP_DEC; | |
code[3] = OP_DEC; | |
code[4] = OP_DEC; | |
code[5] = OP_DEC; | |
code[6] = OP_DEC; | |
code[7] = OP_DEC; | |
code[8] = OP_DEC; | |
code[9] = OP_DEC; | |
code[10] = OP_DEC; | |
code[11] = OP_DEC; | |
code[12] = OP_INC; | |
code[13] = OP_DEC; | |
code[14] = OP_INC; | |
code[15] = OP_DEC; /* subtracts 10 to the register at each iteration*/ | |
code[16] = OP_JMP; | |
code[17] = -16; | |
code[18] = OP_HALT; | |
code_start_jit = 2; code_end_jit = 10; | |
initval = 1000000; | |
#if defined DEBUG | |
DUMP_SOURCE; | |
#endif | |
clock_t start, end; | |
int result; | |
/* first runs the code with the threaded interpreter... */ | |
start = clock(); | |
result = exec(code, initval); | |
end = clock(); | |
printf("result: %d\ttime: %f sec\n", result, ((double) (end - start)) / CLOCKS_PER_SEC); | |
/* ...then compiles to binary... */ | |
start = clock(); | |
compile(code, code_size, code_start_jit, code_end_jit); | |
end = clock(); | |
printf("compiled\ttime: %f sec\n", ((double) (end - start)) / CLOCKS_PER_SEC); | |
#if defined DEBUG | |
DUMP_BINARY; | |
#endif | |
/* ...and runs the compiled code */ | |
start = clock(); | |
result = exec(code, initval); | |
end = clock(); | |
printf("result: %d\ttime: %f sec\n", result, ((double) (end - start)) / CLOCKS_PER_SEC); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment