stb_vm excerpts
#define atom_array_ref_preshifted(arr,offset) ((svmatom *) (((uint8 *) (arr)) + (offset))) | |
#define atom_array_ref_naive(arr,offset) (&(arr)[offset]) | |
#ifdef STB_VM_CONFIG_PRESHIFTED_BYTESTREAM_CPOOL | |
#define atom_cpool_ref atom_array_ref_preshifted | |
#else | |
#define atom_cpool_ref atom_array_ref_naive | |
#endif | |
#ifdef STB_VM_CONFIG_PRESHIFTED_BYTESTREAM_REG | |
#define atom_reg_ref atom_array_ref_preshifted | |
#else | |
#define atom_reg_ref atom_array_ref_naive | |
#endif | |
#ifdef STB_VM_CONFIG_PRESHIFTED_STRUCT_OFFSETS
#define struct_access(str,off) ((svmvalue *) (((uint8 *) (str)) + (off)))
#else | |
#define struct_access(str,off) (str)[off] | |
#endif | |
#ifdef STB_VM_CONFIG_HOLD_REGISTERS_IN_SMALL_INT | |
typedef ubytecode regnum; | |
#else | |
typedef int regnum; | |
#endif | |
#ifdef STB_VM_CONFIG_PTRCODE
#define STB_VM_CONFIG_USE_LABEL_ADDRESSES | |
#endif | |
// svm_interpret takes as input a just-resumed thread. this | |
// is either the result of a thread that has had a new | |
// function set up using 'thread_prepare_function', or is | |
// a thread that previously yielded, and is now being resumed. | |
// if the thread previously yielded and that yield call itself | |
// returns a value back to the thread, the caller must have | |
// already prepared this. | |
svmatom * svm_interpret(svmthread *input_t, svmcontext *input_c) | |
{ | |
// with gcc, we can make the interpreter faster by storing | |
// the addresses of all the instructions and branching directly | |
// to them. so we make that list now. | |
#ifdef STB_VM_CONFIG_USE_LABEL_ADDRESSES | |
static void *opcode_targets_list[SVMOP__total_bytecodes] = | |
{ | |
#define OP(x,y) &&LABEL_##x, | |
OPCODES() | |
#undef OP | |
}; | |
register void **opcode_targets = opcode_targets_list; | |
#endif | |
// we're going to try to run as much code as possible inside | |
// this one function. as long as we're in the function, we don't | |
// have to save/restore registers or anything. to make this explicit | |
// on machines with small register pools (like x86), we'll ask for | |
// our most important locals in registers (though most compilers | |
// will ignore this) | |
register svmthread *t = input_t; | |
register sbytecode *code; | |
register svmstackframe *frame; | |
register svmfunc *func; | |
// the stack pointer is mostly only manipulated on function call/return, | |
// so it's the lowest priority for being a register | |
void *stack; | |
// for a threaded interpreter, we want to store the actual | |
// label addresses in the "bytecode". to do so, the routine | |
// that does the storing needs to be able to get
// at the addresses, but they're only accessible inside the | |
// function. so we provide a special path that returns them. | |
#ifdef STB_VM_CONFIG_PTRCODE | |
if (t == NULL) | |
return (svmatom *) opcode_targets; | |
#endif | |
// save the context in the thread so we don't have to pass it around separately | |
t->context = input_c; | |
// note that the function parameters are now dead--they're never accessed again. | |
// just to be on the safe side we'll tell the compiler that as best we can | |
input_t = NULL; | |
input_c = NULL; | |
// now we're ready to begin interpreting. the first thing we do | |
// is load up our registers with the full execution state | |
code = t->saved_instruction_ptr; | |
frame = t->saved_frame; | |
func = t->saved_func; | |
stack = t->saved_stack_top; | |
// now we're ready to start dispatching bytecodes. to do this, | |
// we use two different methods. for portability, we use code | |
// that looks like "for(;;) switch(bytecode) { }". with gcc, | |
// we use code that looks like "goto *opcode_table[bytecode]", and | |
// we put this branch at the *end* of every opcode's implementation. | |
// if TWOBYTECODE is enabled, then the stored bytecodes are *byte* | |
// offsets into the jumptable, instead of indices; this lets | |
// us avoid a shift on the powerpc. If PTRCODE is enabled, | |
// then we store the labels directly in the code, and don't | |
// even have to look in opcode_targets table. | |
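// To make that concrete (a hypothetical illustration, not actual stb_vm
// output): for the opcode SVMOP_nop, the value stored in the instruction
// stream would be
//    BYTECODE:     the small integer SVMOP_nop
//    TWOBYTECODE:  SVMOP_nop * sizeof(void *)   (a preshifted byte offset)
//    PTRCODE:      the label address &&LABEL_nop itself
// and the corresponding dispatch is an indexed load plus an indirect branch,
// an offset load plus an indirect branch, or just an indirect branch.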
// we want a convenient shortcut that lets us refer to the | |
// data in the instruction stream as unsigned instead of signed. | |
#define ucode ((ubytecode *) code) | |
#ifndef STB_VM_CONFIG_USE_LABEL_ADDRESSES | |
// for portable code, we use a switch() to dispatch, | |
// and it's inside a for(;;) loop. | |
#define START_CASES \ | |
for (;;) \ | |
switch(*ucode) | |
// NEXT() then breaks out of the switch(), causing it to loop around the | |
// for(;;) to hit the switch() again. CASE() ends the previous opcode | |
// with NEXT(), then defines a normal switch() case... note that NEXT() | |
// won't work correctly inside a loop! | |
#define NEXT() break | |
#define CASE_RAW(x) case SVMOP_##x: | |
#ifdef STB_VM_CONFIG_PTRCODE
#error not-reached | |
#endif | |
#else | |
// using gcc label addresses, we don't need a loop since we always | |
// branch to next case, and we don't need to do the initial bytecode | |
// invocation because the first CASE() will generate a NEXT() | |
// automatically | |
#define START_CASES | |
// with gcc label addresses, NEXT explicitly jumps to the next | |
// instruction; CASE ends the previous opcode by calling NEXT | |
// and then declares the label for this opcode | |
#if defined(STB_VM_CONFIG_PTRCODE) | |
// if PTRCODE, then the void * is stored explicitly in the code | |
#define OPCODE_TARGET(n) ((void*) (n)) | |
#elif defined(STB_VM_CONFIG_BYTECODE) | |
// if bytecode, we look it up in the table | |
#define OPCODE_TARGET(n) opcode_targets[n] | |
#else | |
// if two-byte code, we assume it's been preshifted, so it's a byte offset | |
#define OPCODE_TARGET(n) *((void **) (((char *) opcode_targets) + (n))) | |
#endif | |
#define NEXT() goto *OPCODE_TARGET(*ucode) | |
#define CASE_RAW(x) LABEL_##x: | |
#endif | |
#define CASE(y) NEXT(); CASE_RAW(y) | |
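// As an example of how these macros compose (a sketch of the expansion):
// "CASE(nop) code += 1;" expands in the portable build to
//    break; case SVMOP_nop: code += 1;
// and in the label-address build to
//    goto *OPCODE_TARGET(*ucode); LABEL_nop: code += 1;
// i.e. every opcode body is terminated by the dispatch of the *next* opcode.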
START_CASES { | |
// next we define a bunch of macros that will make the source | |
// code smaller and easier to follow. | |
// the constants used in a given function are stored in memory | |
// immediately before the function, so to index them, we cast | |
// the function pointer to the right type, and index that. the | |
// indexing uses the previously defined macro that lets us | |
// precompute a shifted index to save an instruction on non-x86 | |
#define cpool_ptr ((svmatom *) func) | |
#define cpool(n) (*atom_cpool_ref(cpool_ptr, n)) | |
// if an instruction contains a constant index which cannot be | |
// a register, and we use PTRCODE, then the constant index can | |
// be a direct pointer to the constant. | |
#ifdef STB_VM_CONFIG_PTRCODE | |
#define cpool_notreg(n) *((svmatom *) (n)) | |
#else | |
#define cpool_notreg(n) cpool(n) | |
#endif | |
#ifdef STB_VM_CONFIG_BYTECODE | |
#define cpoolw(n) ((svm_uintptr) cpool(n).v.u) | |
#else | |
#define cpoolw(n) ((svm_uintptr) ((-(n)-1))) // -1 => 0, -2 => 1, -3 => 2
#endif | |
// this should be true iff ints and pointers are the same size, and PTRCODE | |
#ifdef STB_VM_CONFIG_INLINE_INT_CONSTANTS | |
// read the value directly from the instruction stream | |
#define cpooli(code_lvalue) (*(svmvalue *) &(code_lvalue)) | |
#else | |
// read the value the usual way | |
#define cpooli(code_lvalue) (cpool_notreg(code_lvalue).v) | |
#endif | |
// this should be true iff floats and pointers are the same size, and PTRCODE | |
#ifdef STB_VM_CONFIG_INLINE_FLOAT_CONSTANTS | |
// read the value directly from the instruction stream | |
#define cpoolf(code_lvalue) (*(svmvalue *) &(code_lvalue)) | |
#else | |
// read the value the usual way | |
#define cpoolf(code_lvalue) (cpool_notreg(code_lvalue).v) | |
#endif | |
#ifdef STB_VM_CONFIG_PTRCODE | |
// read the value directly from the instruction stream | |
#define cpoolp(code_lvalue) (*(svmvalue *) &(code_lvalue)) | |
#else | |
// read the value the usual way | |
#define cpoolp(code_lvalue) (cpool(code_lvalue).v) | |
#endif | |
// registers are stored immediately after the "main" stack frame | |
// data. pretty much all processors have addressing modes with | |
// fixed offsets, so we don't bother trying to optimize the frame | |
// pointer to point exactly to the registers (you could imagine | |
// an instruction set where addressing of reg+constant is available, | |
// or reg+reg, but not reg+reg+constant, in which case it would be | |
// more efficient to always have the pointer point to the beginning | |
// of the array (the array would be indexed as reg+reg, and the other | |
// elements of the structure would be reg+constant). Since there | |
// don't seem to be any important CPUs along those lines, I don't. | |
#define regs(n) (*atom_reg_ref(frame->regs, (n))) | |
// the most general instructions may read from either registers or | |
// constants, so we make macros that hide the details of that. | |
#define val(n) ((n) < 0 ? cpool(n) : regs(n)) | |
#define pval(n) ((n) < 0 ? &cpool(n) : ®s(n)) | |
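// For example (hypothetical operand values): if code[1] holds 4, then
//    regs(code[2]) = val(code[1]);
// copies register 4 of the current frame; if code[1] holds -1, it instead
// copies the constant atom stored one slot before the function descriptor.
// pval() makes the same selection but yields a pointer, for passing by address.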
// we need to do array length checking. if the length is exceeded, | |
// we branch to a array-range-check failure handler | |
#define acheck(a,i) if (i >= atom_svmarray(a)->count) goto array_rangecheck; | |
#define awritecheck(a,i) if (i >= atom_svmarray(a)->count) if (!expand_array(atom_svmarray(a), i)) goto array_rangecheck; | |
// at this point, we're ready for opcodes | |
// the simplest opcode is a no-op; it does nothing, at least in the virtual | |
// machine semantics. However, in STABVM, every opcode is responsible for | |
// advancing the instruction pointer, which is the variable named "code". | |
// We do this because instructions are different lengths. We could try | |
// to make some code that decodes the opcode and figures out the length | |
// by checking a table, but this would involve extra indirections; it's | |
// much faster just to do the add by the right amount in the opcode | |
// implementation itself (although it uses more codespace). And once we're | |
// doing that, it doesn't make any sense for the dispatcher to try to advance | |
// by *only* the opcode. The work it is doing is redundant to that done in | |
// most opcodes, so it makes more sense to move it into the opcode itself. | |
// | |
// The bytecode might be bytes or it might actually be words, but whichever | |
// it is, the vast majority of opcodes use the same number of bytes/words, | |
// so we can just add that number to the pointer, which is itself either | |
// bytes or words as appropriate. | |
CASE(nop) | |
code += 1; | |
// We saw at the start how threads are restored from suspension. | |
// while that's still fresh, let's look at the corresponding call-- | |
// a bytecode that *causes* a suspension. | |
CASE(yield) | |
// This instruction takes up 3 spots in the bytecode stream: | |
// #0 : OP_yield | |
// #1 : value to "return" on yielding | |
// #2 : register to write the result when we "restore" from yielding | |
// implementation is trivial: we just save everything and return | |
// the return value. we don't worry about the result writing, we | |
// leave that to the unsuspend logic. | |
t->context->yield_state = SVM_STATE_yield; | |
t->saved_frame = frame; | |
t->saved_func = func; | |
t->saved_stack_top = stack; | |
// points to the next instruction | |
t->saved_instruction_ptr = code+3; | |
// now return a pointer to the "return" value | |
return pval(code[1]); | |
// Optimized bytecodes are provided which are much simpler and faster | |
// than the general-purpose bytecodes defined by the specification, | |
// with optimized decoding of common cases and/or optimizing behavior | |
// if types are statically known. | |
// We'll start with some of the simplest ones. The first two instructions | |
// perform moves from a register to a register, and from a constant to a | |
// register. We write simple opcodes out as one-liners so that the code | |
// lines up with other opcodes, making it easier to see patterns and bugs. | |
// Each operation like code[1] fetches opcode parameters out of the | |
// bytestream; the macro regs(N) fetches the Nth register, and cpool(N) | |
// fetches the Nth constant from the constant pool (constants are actually | |
// accessed with negative numbers, which is why we must access through | |
// the signed value 'code' not the unsigned value 'ucode') | |
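// A concrete (hypothetical) encoding: "move r5 <- r2" occupies three slots,
//    { SVMOP_move_rr_dyn, 2, 5 }
// while "move r5 <- K", where K is the constant stored one atom before the
// function, would be
//    { SVMOP_move_rc_dyn, -1, 5 }
// (the exact negative index depends on where the compiler placed K).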
CASE(move_rr_dyn) regs(code[2]) = regs (code[1]); code += 3; | |
CASE(move_rc_dyn) regs(code[2]) = cpool_notreg(code[1]); code += 3; | |
// The next two instructions are similar to the previous. However, | |
// each register/constant is an "atom" which consists of two fields: | |
// a type field (saying what dynamic type the atom has) and a "value" | |
// field (whose encoding depends on the type). For optimized bytecodes | |
// for statically-typed languages, the compiler knows what types the | |
// registers are, and doesn't need to update the type field, so we | |
// only copy the value field, which is denoted for conciseness as ".v". | |
// The ".v" of the register is a union, so later we'll see accesses | |
// to fields in the union, but for these generic instructions that | |
// work on arbitrary types, we just copy the whole value union. | |
CASE(move_rr_static) regs(code[2]).v = regs (code[1]).v; code += 3; | |
CASE(move_rc_static) regs(code[2]).v = cpool_notreg(code[1]).v; code += 3; | |
// Now we have the generic move case, which is a dynamic move, but | |
// we don't know at interpretation time whether we're moving from | |
// a register or the constant pool. (The compiler does know, and | |
// should have just output the right one, but for simplicity we | |
// allow the compiler to target these less-efficient instructions.) | |
CASE(move) { | |
regs(code[2]) = val(code[1]); | |
code += 3; | |
} | |
// A quick side note: when we return from a function, we need to write | |
// the result to a register. To avoid needing extra register moves, | |
// we don't use a fixed register; the instruction says which | |
// register to write to. This will happen a long time after we decoded | |
// the function call, though. So we can either save away which register | |
// it was to the stack, or we can redecode the previous instruction when | |
// we return, and extract the register then. The latter sounds hard, | |
// because instructions are variable length, but we use a simple trick: | |
// we put the register to write the result to in the last byte/word of | |
// the instruction; then on return we always just look one byte back | |
// in the codestream to find the target register. (Note that we cannot | |
// have function calls that discard their return value; they have to | |
// have a dummy register and write it there.) | |
// | |
// So for consistency, the register written by non-function call
// instructions ALSO appears last in the opcode bytes. This allows us | |
// to make any instruction trigger a trap handler routine, and the return | |
// from that routine will write the return value to the correct place, | |
// allowing us to perform such trap handlers within a single interpreter | |
// loop (without recursing into the interpreter). | |
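// As a sketch of that trick: if the caller executed
//    { SVMOP_call_fc_bcode_a_none, <func>, r7 }
// then the saved instruction pointer points just past that instruction, so on
// return caller_code[-1] is r7, no matter how long the call instruction was
// or which call opcode it used.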
// There's a second move-like instruction, which includes a coercion between
// the input type and an arbitrary type. Here we see our first exception; | |
// if the coercion is impossible, we can tell from the return code, and
// we branch to a common chunk of code that will set up a typecheck | |
// exception, and then search up the stack for exception handlers. | |
CASE(coerce) { | |
regnum outreg = code[3]; | |
if (svm_coerce_explicit(t, ®s(outreg), func->type_constants[ucode[2]], pval(code[1]))) | |
goto typecheck; | |
code += 4; | |
} | |
// There's a similar convert operation for when we explicitly request a coercion.
// In this case, we allow for the possibility of user-defined conversion
// code, which might throw an exception that's *not* a typecheck exception,
// so if the conversion throws an exception, we require it to have already | |
// set it up properly. | |
CASE(convert) { | |
regnum outreg = code[3]; | |
svm_convert(t, ®s(outreg), func->type_constants[ucode[2]], pval(code[1])); | |
if (t->context->has_exception) | |
goto exception_handler; | |
code += 4; | |
} | |
// And a type-verifying operation which checks if the type is correct | |
CASE(typecheck) { | |
regnum reg = code[1]; | |
if (func->type_constants[ucode[2]].code != regs(reg).type.code) | |
goto typecheck; | |
code += 3; | |
} | |
// now that we've seen the basics of this, let's look at some instructions | |
// that actually compute something. first we'll only look at the statically typed | |
// ones, since those are one-liners. for example, integer addition always | |
// produces integers (they wrap mod 2^32), so they can't throw exceptions | |
// and don't need coercion. first we have the simple case where we add two
// registers and store to a third. | |
CASE(add_rr_int ) regs(code[3]).v.u = regs (code[1]).v.u + regs (code[2]).v.u; code += 4; | |
// next we have the possibility of adding a constant. since addition is commutative, | |
// we only need to allow a constant on the right. we make two versions: one loads | |
// the constant from the constant pool, and the other, faster one uses the value | |
// from the bytecode stream itself. This code is written as if the values are | |
// unsigned, but it will actually handle both signed and unsigned values (and | |
// mixtures of them), because we only support platforms that use 2's complement arithmetic. | |
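// (worked example: adding the 32-bit patterns 0xFFFFFFFF and 0x00000002 gives
// 0x00000001 whether you read them as unsigned 4294967295 + 2 mod 2^32, or as
// signed -1 + 2)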
CASE(add_rc_int ) regs(code[3]).v.u = regs (code[1]).v.u + cpooli(code[2]) .u; code += 4; | |
CASE(add_rb_int ) regs(code[3]).v.u = regs (code[1]).v.u + code[2] ; code += 4; | |
// The third possibility is adding two constants, but an optimizing compiler | |
// should just replace this with the sum of the constants itself, thus an optimizing | |
// compiler never needs to generate that (and the unoptimized OP_add instruction | |
// *does* support this.) | |
// Now we need to repeat the same thing for floats. first we support adding | |
// two registers or adding a register and a constant. we also support the case of | |
// adding a constant integer out of the bytestream, although that integer will have | |
// to be converted to float, which may be slower than fetching from the constant | |
// pool on some platforms. but we define it just in case it isn't. | |
CASE(add_rr_float) regs(code[3]).v.f = regs (code[1]).v.f + regs (code[2]).v.f; code += 4; | |
CASE(add_rc_float) regs(code[3]).v.f = regs (code[1]).v.f + cpoolf(code[2]) .f; code += 4; | |
CASE(add_rb_float) regs(code[3]).v.f = regs (code[1]).v.f + code[2] ; code += 4; | |
// Next, we move on to subtraction. Now, subtraction isn't commutative, so we | |
// might seem to need to implement 'reg - reg', 'reg - const', and 'const - reg' | |
// to cover all the cases. However, 'reg - const' can be implemented using | |
// 'reg + const' with the constant negated, so we omit that one. So subtraction | |
// only has two variations, with the left side being either a register or a | |
// constant. To save bytecode space, we don't implement having the constant | |
// in the bytecode stream, since that operation is more rare. | |
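// For instance (what a compiler could emit; not a new opcode): "r3 = r1 - 5"
// becomes an add with a negated constant,
//    { SVMOP_add_rb_int, r1, -5, r3 }
// so no sub_rc variant is needed.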
CASE(sub_rr_int ) regs(code[3]).v.u = regs (code[1]).v.u - regs (code[2]).v.u; code += 4; | |
CASE(sub_cr_int ) regs(code[3]).v.u = cpooli(code[1]) .u - regs (code[2]).v.u; code += 4; | |
CASE(sub_rr_float) regs(code[3]).v.f = regs (code[1]).v.f - regs (code[2]).v.f; code += 4;
CASE(sub_cr_float) regs(code[3]).v.f = cpoolf(code[1]) .f - regs (code[2]).v.f; code += 4;
// Negation could be viewed as a special case of subtraction, but is common | |
// enough that we optimize for it (and we'll see another negation optimization | |
// in OP_mulneg). | |
CASE(neg_rr_int ) regs(code[2]).v.s = -regs(code[1]).v.s; code += 3; | |
CASE(neg_rr_float) regs(code[2]).v.f = -regs(code[1]).v.f; code += 3; | |
// Absolute value is so simple to implement on most CPUs that it would be | |
// a shame to have to implement it as a native function, so we go ahead | |
// and provide it even if it's not all that common. We don't need to worry | |
// about constants; the only plausible case is register-to-register. | |
CASE(abs_rr_int ) regs(code[2]).v.s = svm__abs (regs(code[1]).v.s); code += 3; | |
CASE(abs_rr_float) regs(code[2]).v.f = (float) svm__fabs(regs(code[1]).v.f); code += 3; | |
// As a special case, we have one which forces the output type to be unsigned; | |
// this is useful for taking the absolute value of -2^31, which is still negative | |
// if we keep the result as a signed int | |
CASE(absu_rr_int ) {
regnum d = code[2];
regs(d).v.s = svm__abs(regs(code[1]).v.s);
regs(d).type.code = SVMT_u32;
code += 3; // same 3-slot layout as abs_rr_int
}
// Next we have multiplication. This is similar to addition (it's commutative), | |
// but it's different in that signed and unsigned multiplication are different | |
// operations. In fact, we need to support multiplying a signed by an unsigned | |
// number, but that turns out to just be an unsigned multiply as well. Since | |
// it's commutative, each type has two cases; again we make two versions of the | |
// constant multiply, with the constant stored in different places, except for | |
// float multiply. | |
CASE(mul_rr_int ) regs(code[3]).v.s = regs(code[1]).v.s * regs (code[2]).v.s; code += 4; | |
CASE(mul_rc_int ) regs(code[3]).v.s = regs(code[1]).v.s * cpooli(code[2]) .s; code += 4; | |
CASE(mul_rb_int ) regs(code[3]).v.s = regs(code[1]).v.s * code[2] ; code += 4; | |
CASE(mul_rr_uint ) regs(code[3]).v.u = regs(code[1]).v.u * regs (code[2]).v.u; code += 4; | |
CASE(mul_rc_uint ) regs(code[3]).v.u = regs(code[1]).v.u * cpooli(code[2]) .u; code += 4; | |
CASE(mul_rb_uint ) regs(code[3]).v.u = regs(code[1]).v.u * ucode[2] ; code += 4; | |
CASE(mul_rr_float) regs(code[3]).v.f = regs(code[1]).v.f * regs (code[2]).v.f; code += 4; | |
CASE(mul_rc_float) regs(code[3]).v.f = regs(code[1]).v.f * cpoolf(code[2]) .f; code += 4; | |
// That's it for the static jump statements. Now let's take a look at | |
// much more complicated flow control: function call and return. | |
// | |
// STABVM uses an explicit stack for function call/return. We push | |
// function arguments on the stack, save the current function state, | |
// and start up the next function. | |
// | |
// This gets very complicated very quickly, so we'll only look at | |
// the very simplest case first: calling a bytecoded function which | |
// we know statically takes no arguments, and does no special | |
// initialization. This isn't a very important case to optimize | |
// for, but it's the case that *can* go the fastest, so we do, | |
// just in case it's useful. Furthermore, it lets us look at exactly | |
// how much overhead an interpreted function call requires. Function | |
// call/return is important for code that's less "big scripts" and more | |
// "traditional code". | |
// | |
// Other function calls will do more and be slower, but this establishes | |
// our baseline for function call overhead. | |
// | |
// We need to define some local variables, and we want the scope of | |
// these to be explicit for the optimizer, so we'll start up a {} block. | |
CASE(call_fc_bcode_a_none) // 0=opcode, 1=func, 2=retval reg | |
{ | |
// the first field of the instruction is the pointer to the function | |
// to call, so we load that up now. | |
svmfunc *newfunc = (svmfunc *) cpoolp(code[1]).ptr; | |
// In PTRCODE mode, this is one CPU instruction: | |
// load reg_temp,4(reg_code); | |
// in other modes, it's indirected through cpool, so 2-3 instructions. | |
// the new "execution context" (stack frame) we'll run in is located | |
// wherever the current top of stack is. | |
svmstackframe *newframe = (svmstackframe *) stack; | |
// If stack is in a CPU register, this is free. | |
// we do our stack computations in a temporary variable to | |
// encourage the compiler to register allocate it. | |
void *newstack; | |
// when we're done with this function, we'll want to return and | |
// continue executing from this same spot. we'll do that by reloading | |
// these values out of what will be the current frame pointer, | |
// but for now is "newframe". so we need to save everything there. | |
// note that we never actually access code[2], which stores the | |
// register where we'll put the function's return value. We use a trick | |
// to save an instruction or two instead, which we'll see later. | |
newframe->saved_func = func; | |
newframe->saved_stackframe = frame; | |
newframe->saved_instruction_ptr = code + 3; | |
// This requires 4 instructions (3 stores and one add; a bit worse | |
// on x86 where the add will stomp a register). | |
// next we need to compute how much stack space we take up. | |
// this depends on the size of the stack used by the function | |
// description, so we store that explicitly in the function. | |
newstack = (char *) stack + newfunc->stack_frame_size; | |
// this requires two RISC instructions (we assume stack is | |
// in a register) or two x86 instructions (load stack, add | |
// from memory) | |
// next we need to check for stack overflow. by design, to | |
// make this test fast, the stack is stored right before the | |
// thread pointer and grows upwards towards it, so the | |
// stack overflows if it is past the thread pointer. | |
if ((char *) newstack > (char *) t) goto stack_overflow; | |
// this typically requires two instructions | |
// at this point, we've finished saving everything, and we've | |
// checked every failure case, so we just need to update our | |
// locals so we're actually set up to run the new function | |
code = newfunc->bytecode; | |
func = newfunc; | |
frame = newframe; | |
stack = newstack; | |
// each of the above lines requires one instruction (the compiler | |
// can't put newstack in the same register as stack since stack | |
// is still alive after computing newstack if we goto stack_overflow) | |
// and we're done. so, not counting the dispatch code, this | |
// function call (static, no arguments) requires around | |
// 8 CPU instructions for BIGCODE and 10 CPU instructions | |
// otherwise, not counting any register spilling on x86. | |
} | |
// now let's look at the other side of the equation: we want to return
// from a function as fast as possible. | |
// | |
// Again, we'll consider the static case. For a return, static means | |
// that the type of the value we're returning is already known to be | |
// the type that the function is defined to be. We don't care about | |
// what type the *caller* is expecting; the caller is required to do | |
// any final coercion (this requires an extra instruction in the
// dynamic-to-static case, but the alternative requires us to pay an | |
// extra test in the static-to-static and dynamic-to-dynamic path, or | |
// generate multiple versions of the same function for use in different | |
// contexts). However, since the caller might be calling through | |
// a dynamic interface, we DO have to write out the correct type. | |
// To do this, we rely on the compiler to have arranged for | |
// the chosen register to have the type in it. | |
// again, we need to declare a local variable | |
CASE(return_r_static) | |
{ | |
regnum outreg; | |
// in this case, we want to get a pointer to the frame we're returning to | |
svmstackframe *caller_frame = frame->saved_stackframe; | |
// this requires one CPU instruction | |
// we restore the saved bytecode pointer, which points to immediately | |
// *after* the instruction we last executed in the caller. | |
sbytecode *caller_code = frame->saved_instruction_ptr; | |
// this requires one CPU instruction | |
// now we want to write the return value into the caller's frame. | |
// to know where to write it, we require that *any* caller always | |
// leaves the bytecode pointer so that the *previous* byte/word/dword | |
// before the bytecode pointer points to the register where the | |
// returned result should be written. this is why the bytecode format | |
// places the returned value slot last in the instruction. if we | |
// didn't do this, we would have to save a pointer to where to write | |
// the result, and writing/reading that pointer would be extra overhead. | |
outreg = caller_code[-1]; | |
*atom_reg_ref(caller_frame->regs, outreg) = regs(code[1]); // copy out type in case caller needs it | |
// this effectively computes: | |
// p1 = &caller_frame->regs[caller_code[-1]] | |
// p2 = &frame->regs[code[1]] | |
// p1->v = p2->v | |
// p1->type = p2->type | |
// Let's look at this in two stages, the first two lines then | |
// the second two. | |
// | |
// Loading p1 and p2 require computing "reg + constant + memory" (reg | |
// is 'frame', constants is offset of regs[], and memory is 'code[1]'). | |
// However, we can fold the constant into the uses of p1 and p2, so it's | |
// really just "reg + memory". This requires two CPU instructions EACH on both | |
// x86 and RISC. (x86 adds from memory in one instruction, but needs | |
// to copy the source register to another register first.) | |
// | |
// Copying the data between the two pointers requires 4 RISC instructions | |
// (unless you use some wider data type). x86 seems like it would be the | |
// same. | |
// | |
// This brings the total for that line of code to 8 CPU instructions. | |
// and we restore all the register describing the execution context | |
code = caller_code; | |
stack = (void *) frame; | |
func = frame->saved_func; | |
frame = caller_frame; | |
// this requires 4 CPU instructions | |
// and we're done, ready to continue running in the old frame context. | |
// The total was ~14 CPU instructions (more for register spills); this | |
// is actually more than the void function call was. But more than half of | |
// this was spent in copying the return value. | |
// This means it's worth having some *more* specialized bytecodes for | |
// returns that could be faster. For example, returning a constant | |
// out of the bytecode stream; not writing the type field, or even | |
// a void function that doesn't write anything. So we'll implement | |
// those next. | |
} | |
// first let's do void. it's just the same as the above code, but | |
// without writing anything into the caller's return value. | |
CASE(return_void_static) | |
code = frame->saved_instruction_ptr; | |
stack = (void *) frame; | |
func = frame->saved_func; | |
frame = frame->saved_stackframe; | |
// that's it! it requires only four CPU instructions | |
// let's try reading a small integer out of the bytestream and not writing | |
// the type. this can be used to return a small integer, a "small" unsigned | |
// integer, or a boolean. we could create special 'return true' and | |
// 'return false' for even "more" performance, but on some CPUs they wouldn't | |
// even save anything because the 0 and 1 values would still have to be loaded | |
// or computed into a register somehow. (well, it would save a little memory) | |
CASE(return_b_static) | |
{ | |
svmstackframe *caller_frame = frame->saved_stackframe; | |
int32 retval = code[1]; | |
regnum outreg; | |
code = frame->saved_instruction_ptr; | |
outreg = code[-1]; | |
atom_reg_ref(caller_frame->regs, outreg)->v.s = retval; | |
stack = (void *) frame; | |
func = frame->saved_func; | |
frame = caller_frame; | |
// this is ~9 CPU instructions; we save 2 not computing the | |
// register to load from, we save 2 not reading and writing | |
// the type, and we save one by hoisting code up earlier | |
// in the code, avoiding a register move. | |
// | |
// for an explicit comparison | |
// http://nothings.org/games/if/slux/index.html shows the | |
// disassembly for "return 0" for an earlier VM of mine | |
// which uses the same return-value-location trick. It | |
// instead uses tag bits for type info. It ends up using | |
// six CPU instructions; one isn't RISC-like, but one appears | |
// to be redundant. Regardless, this means it's definitely | |
// possible with different design constraints | |
// to do better than 9 CPU instructions for this operation, | |
// so this is a place where using STABVM compromises | |
// efficiency for semantic flexibility. | |
} | |
// for completeness, let's finish off the return cases.
// first we have the equivalent to the first one, but with a constant | |
// instead of a register. because the cpool() could reference the | |
// current function OR the current bytecode stream in PTRCODE, | |
// we have to be careful not to update those before the copy | |
CASE(return_c_static) { | |
svmstackframe *caller_frame = frame->saved_stackframe; | |
sbytecode *caller_code = frame->saved_instruction_ptr; | |
regnum outreg = caller_code[-1]; | |
*atom_reg_ref(caller_frame->regs, outreg) = cpool(code[1]); | |
code = caller_code; | |
stack = (void *) frame; | |
func = frame->saved_func; | |
frame = caller_frame; | |
} | |
// next, we have the fully generic regular return function. this | |
// needs to (a) decode from either regs or constants, and (b) | |
// needs to do a coercion to the return type
CASE(return) { | |
svmstackframe *caller_frame = frame->saved_stackframe; | |
sbytecode *caller_code = frame->saved_instruction_ptr; | |
regnum outreg = caller_code[-1]; | |
stack = (void *) frame; | |
if (svm_coerce_explicit(t, atom_reg_ref(caller_frame->regs, outreg), func->return_type, pval(code[1]))) | |
goto typecheck; | |
code = caller_code; | |
func = frame->saved_func; | |
frame = caller_frame; | |
} | |
// Finally, we need a return that's alloca friendly. any stack frame | |
// that might contains allocas has an extra pointer which is a linked | |
// list of allocas from that frame. The VM also keeps a linked list of | |
// all the stack frames. So the stack walker could walk every stack | |
// frame and check the frame's list-of-allocas pointer, but that would | |
// require us to initialize the list-of-allocas pointer on every function | |
// call. Since we don't want to pay the alloca cost if the user never | |
// uses the allocas, that cost is unacceptable. So instead, we keep | |
// a second linked list of the stack frames that have had their | |
// list-of-alloca pointers initialized. Then on return, we have to | |
// unlink that pointer. | |
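// With those two lists, a stack walker can visit every live walkable alloca
// with something like this (hypothetical sketch; 'visit' is not a real
// function, but the field names are the ones used in this file):
//    for (f = t->alloca_stackframe_head; f; f = f->alloca_next_stackframe)
//       for (a = f->alloca_local_chain; a; a = a->alloca_chain)
//          visit(a);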
CASE(return_alloca) { | |
svmstackframe *caller_frame = frame->saved_stackframe; | |
sbytecode *caller_code = frame->saved_instruction_ptr; | |
regnum outreg = caller_code[-1]; | |
#ifdef STB_VM_ALLOCA_WALKER // if this isn't defined, it's same as CASE(return); should share the code | |
t->alloca_stackframe_head = frame->alloca_next_stackframe; // this is the only extra operation | |
#endif | |
stack = (void *) frame; | |
if (svm_coerce_explicit(t, atom_reg_ref(caller_frame->regs, outreg), func->return_type, pval(code[1]))) | |
goto typecheck; | |
code = caller_code; | |
func = frame->saved_func; | |
frame = caller_frame; | |
} | |
// and we're done with all the return variants of every kind. | |
// let's look at some more function dispatch. We'll implement most of the | |
// function calls where the function call is a constant and is implemented | |
// in bytecode. this subset still comes in multiple flavors, but we'll | |
// start with one of the most general. | |
CASE(call_fc_bcode_static) | |
{ // 0=opcode, 1=func, 2=argc, 3..argc+2=arguments, argc+3=retval, | |
// the beginning of this is the same as the void call we | |
// looked at already (OP_call_fc_bcode_a): | |
svmfunc *newfunc = (svmfunc *) cpool(code[1]).v.ptr; | |
svmstackframe *newframe = (svmstackframe *) stack; | |
int i,num_arg; | |
void *newstack; | |
newframe->saved_func = func; | |
newframe->saved_stackframe = frame; | |
// now we need to get the number of arguments | |
num_arg = code[2]; | |
// the instruction is variable length, depending on the argument count | |
newframe->saved_instruction_ptr = code + num_arg+4; | |
// advance the stack and range-check it | |
newstack = (char *) stack + newfunc->stack_frame_size; | |
if ((char *) newstack > (char *) t) | |
goto stack_overflow; | |
// with varargs, the argument array could be longer than num_regs, so we need to check for varargs overflowing | |
if ((char *) &newframe->regs[num_arg] > (char *) t) | |
goto stack_overflow; | |
// copy the function arguments, this version assumes no coersions needed | |
// even so, performance is kind of sucky, because val(k) requires a | |
// branch, and so we're using the *same* branch every time through, | |
// so branch prediction can't work. so later we'll specialize this | |
// with things that minimize the decoding work | |
for (i=0; i < num_arg; ++i) | |
newframe->regs[i] = val(code[3+i]); | |
// now set everything up to be in the new context | |
code = newfunc->bytecode; | |
func = newfunc; | |
frame = newframe; | |
stack = newstack; | |
// now handle the function having special initialization | |
// we do this by branching to shared code that does the initialization, | |
// since it will be the same in every function. | |
if (func->has_initialization) { | |
// we store the argument count in the thread context so we don't | |
// have to store it in a function-wide variable where
// it might trigger save/restores since the compiler might | |
// not figure out that it doesn't flow across the | |
// indirect branches | |
t->context->temp = num_arg; | |
goto func_initializer; | |
} | |
NEXT(); | |
} | |
// this is shared code that is used for all bytecode function prologues | |
// that request initialization | |
func_initializer: { | |
// we copy the argument count back out of the thread pointer | |
int num_arguments_provided_to_call = t->context->temp; | |
// first thing we do is, if we support stackwalking things allocated | |
// by alloca, then we check if this function has any of those, and if | |
// so we add it to the main chain. additionally, we start a "local"
// chain of just the allocations for this stack frame. | |
#ifdef STB_VM_ALLOCA_WALKER | |
if (func->has_walkable_alloca) { | |
// initialize the local chain | |
frame->alloca_local_chain = 0; | |
// update the global chain -- we could actually do this lazily | |
// the first time we add anything to the local chain, but this | |
// is simpler to code and it avoids extra branches to test | |
frame->alloca_next_stackframe = t->alloca_stackframe_head; | |
t->alloca_stackframe_head = frame; | |
} | |
#endif | |
// next step is to handle variable-argument-count functions. we provide | |
// a system which supports two important behaviors: you can ask for a | |
// full args list by setting func->arg_array_start to 0, or | |
// you can ask for a list starting after some known number of | |
// elements by setting it to that number of elements; typically | |
// this would be the start point of a '...' list. You can't ask | |
// for both, since only one array can be generated. (If your language | |
// allows both, you could generate code to dynamically synthesize the | |
// varargs list from the full list as the first instructions of the function). | |
// Also, this doesn't preclude still accessing the elements | |
// from registers (i.e. as named variables). Set arg_array_start to 0x7fffffff
// to not create an args array. Set func->max_args to 0x7fffffff to allow an | |
// arbitrary length varargs input. | |
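// For example (hypothetical function): for "f(a, b, ...)" the compiler would
// set arg_array_start to 2, so a call "f(x, y, p, q)" reaches here with
// num_arguments_provided_to_call == 4 and builds a 2-element array holding
// {p, q}, copied out of frame->regs[2..3].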
if (num_arguments_provided_to_call > func->arg_array_start) { | |
// copy excess arguments into alloca'd array; this can either be an 'args' array or a 'varargs' array | |
svmarray *a; | |
int array_len = num_arguments_provided_to_call - func->arg_array_start; | |
int array_bytes = sizeof(*a) + sizeof(a->data[0]) * (array_len - 1); | |
a = (svmarray *) stack; | |
if ((char *) stack + array_bytes < (char *) stack) | |
goto stack_overflow; | |
stack = (char *) stack + array_bytes; | |
if ((char *) stack > (char *) t) | |
goto stack_overflow; | |
// now, the arguments are currently on the stack. if there were | |
// more arguments than the function has registers, it wrote off | |
// the end, *into* the extra space, where we just allocated this | |
// array. so before we write anything else into the array structure, | |
// we have to copy the data itself, and we have to use memmove() | |
// since it might be self-overlapping | |
// @TODO: explicitly code loop that goes in right direction, to remove dependency on memmove | |
svm__memmove(a->data, &frame->regs[func->arg_array_start], sizeof(svmatom) * array_len); | |
// now that we've copied the arguments, we can fill out the array header | |
a->count = a->limit = array_len; | |
// now, we add this object to the chain of local walkable-allocas, | |
// so the stack walker can find it | |
#ifdef STB_VM_ALLOCA_WALKER | |
assert(func->has_walkable_alloca); | |
a->alloca_chain = frame->alloca_local_chain; | |
frame->alloca_local_chain = (void *) a; | |
// and we set its type field to NULL, which is the stack-walker flag that it's an array not a structure | |
a->type = NULL; | |
#endif | |
} | |
// next we have a single system used for three purposes: | |
// - providing default parameter values | |
// - initializing variables to constants | |
// - initializing static types for registers for static-to-dynamic conversions | |
// the way this works is that the variables/registers that need initialization | |
// should be low-numbered registers, just after the arguments. parameters needing | |
// initialization are always last. So the initialization just needs to initialize | |
// a continuous block of memory -- any omitted parameters with defaults, and then | |
// some number of low-numbered registers immediately after the parameters. | |
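// A worked (hypothetical) example: "f(a, b=1, c=2)" plus one local that is
// initialized to a constant, laid out as regs 0..3 = a, b, c, local. Then
// earliest_init_reg = 1, first_mandatory_init_reg = 3, and init_reg_data holds
// three atoms (defaults for b and c, plus the local's initializer).
//  - called with 1 arg:  1 > 1 is false, so regs 1..3 are all initialized
//  - called with 2 args: start becomes 3, then is clamped back to 2, so only
//    regs 2..3 (c's default and the local) are initialized
//  - called with 3 args: start becomes 3, so only reg 3 (the local) is initialized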
if (func->init_reg_size_in_bytes) { | |
// the initialization uses a memcpy; that copy needs three things: | |
// - the place to write, represented as a register count within the frame | |
// - the amount of data to copy, represented in bytes
// - the place to copy the data from, in bytes | |
// We start by loading three values with the *full* data to copy; | |
// in the absence of argument-initializers, we'll actually do this; | |
// also if *all* default initializers are required. | |
int start = func->earliest_init_reg; | |
int size = func->init_reg_size_in_bytes; | |
svmatom *data = func->init_reg_data; | |
// Next we need to see if we have initializers that we *shouldn't* set. | |
// This happens iff there are argument initializers and there are more | |
// arguments than the minimum number possible. However, because we ALSO | |
// allow varargs, in which case we'll pass in arbitrary more arguments | |
// then those allowed, but we still need to initialize all the real registers, | |
// the logic here is more complicated. First, we only even have to think | |
// about testing this if the number of arguments passed in extend past | |
// the beginning of the region to initialize. It is precisely to make | |
// this test fast that we chose to encode earliest_init_reg in registers, | |
// not bytes. | |
if (num_arguments_provided_to_call > start) { | |
// tentatively, we assume we start at the first "real" reg for | |
// the function (after the "real" function arguments) | |
start = func->first_mandatory_init_reg; | |
// but if we had fewer arguments than that, then we need to initialize
// the missing arguments | |
if (start > num_arguments_provided_to_call) | |
start = num_arguments_provided_to_call; | |
// now we've computed an updated destination register. now we need
// to update where to copy from, and how much to copy. Both of these | |
// will update by the same number of bytes, so we lift the common | |
// subexpression out explicitly to make sure the compiler doesn't | |
// miss it. | |
{ | |
int offset = (start - func->earliest_init_reg) * sizeof(svmatom); | |
size -= offset; | |
data = (svmatom *) ((char *) data + offset); | |
} | |
} | |
// now we've updated all the pointers and sizes and we can just initialize | |
// the memory | |
svm__memcpy(&frame->regs[start], data, size); | |
} | |
// and we're done with the function prologue, and are ready to run | |
// the first instruction in the function | |
} | |
// now let's look at a version of a function call opcode that does do coercions.
// of course if the function is a constant, we know exactly what the | |
// coersions we need to do are at compile time. so ideally we'd only convert | |
// the ones that need it. however, we have to loop through the N arguments | |
// constructing the stack no matter what, and there's not much we can do | |
// that's faster than testing whether we do in fact need to convert. | |
// | |
// so this code is basically identical to call_fc_bcode_static, except it does coercions
CASE(call_fc_bcode) { // 0=opcode, 1=func, 2=argc, 3..argc+2=arguments, argc+3=retval, | |
// everything is the same as before up to when we copy the data | |
svmfunc *newfunc = (svmfunc *) cpool(code[1]).v.ptr; | |
svmstackframe *newframe = (svmstackframe *) stack; | |
int i,num_arg; | |
void *newstack; | |
newframe->saved_func = func; | |
newframe->saved_stackframe = frame; | |
num_arg = code[2]; | |
newframe->saved_instruction_ptr = code + num_arg+4; | |
newstack = (char *) stack + newfunc->stack_frame_size; | |
if ((char *) newstack > (char *) t) | |
goto stack_overflow; | |
if ((char *) &newframe->regs[num_arg] > (char *) t) // varargs array could be longer than num_regs | |
goto stack_overflow; | |
// now we copy the data using svm_coerce, which will do the conversions | |
// for us. but, if we're calling a varargs function, we might have more | |
// arguments than it has types, so we have to test for that | |
if (num_arg <= newfunc->max_arg_types) | |
for (i=0; i < num_arg; ++i) | |
svm_coerce(t, &newframe->regs[i], newfunc->type_constants[i], pval(code[i+3])); | |
else { | |
// if we have more args than it has types, then we only coerce those with types
int num_types = newfunc->max_arg_types; | |
for (i=0; i < num_types; ++i) | |
svm_coerce(t, &newframe->regs[i], newfunc->type_constants[i], pval(code[i+3])); | |
// and we copy the rest uncoerced | |
for ( ; i < num_arg; ++i) | |
newframe->regs[i] = val(code[i+3]); | |
} | |
// and the rest of the code is unchanged | |
code = newfunc->bytecode; | |
func = newfunc; | |
frame = newframe; | |
stack = newstack; | |
if (func->has_initialization) { | |
t->context->temp = num_arg; | |
goto func_initializer; | |
} | |
} | |
// The last opcode showed a truly fully-general implementation of
// a call which we know is going to a bytecode-implemented function. | |
// But there are also native-coded functions. Let's look at those. | |
// The first case to consider is a native function which is statically | |
// known, but where the native function is designed for interfacing | |
// with STABVM by doing any coercions internally: it receives a C array
// of svmatoms and parses the arguments itself. We don't have to do | |
// any type coercion (it'll do it if it needs it). So we mostly just
// have to build the array and go. | |
CASE(call_fc_ndyn) | |
{ // 0=opcode, 1=func, 2=argc, 3..argc+2=arguments, argc+3=retval, | |
// get the C function pointer, and figure out how many arguments we'll need | |
native_func_dynamicargs *funcptr = (native_func_dynamicargs *) cpool(code[1]).v.ptr; | |
int num_arg = code[2], i; | |
// we get an alias for the stack that we use to build the argument list | |
svmatom *arglist = (svmatom *) stack; | |
// we're going to store the arguments on the stack, so check | |
// that there's room | |
if ((char *) &arglist[num_arg] > (char *) t) | |
goto stack_overflow; | |
// copy the arguments onto the stack; we rely on the callee to | |
// coerce (or to statically know the types, as appropriate) | |
for (i=0; i < num_arg; ++i) | |
arglist[i] = val(code[3+i]); | |
// now we have everything the function call will need. But we | |
// have a few more things to do. | |
// First, we want to allow the function to recurse into the | |
// interpreter. (Imagine, for example, that we're calling a sort
// routine, and it wants to call back to a comparison function.) | |
// To do that, we just need to save the stack depth we've used. | |
t->saved_stack_top = arglist + num_arg; | |
// Next, when we call the function, the C compiler may save and | |
// restore our registers (depending on the caller-save situation). | |
// If it's going to save and restore them anyway, then there's no | |
// extra overhead for us to save and restore them (if the compiler | |
// is smart enough to notice). so we'll save and restore the bytecode | |
// pointer (but nothing else). this could waste a *little* bit | |
// on machines with good callee-save support. but it saves | |
// a conditional branch, as we'll see in a moment. | |
t->saved_instruction_ptr = code + num_arg+4; | |
// now we call the C-native function | |
funcptr(t, &frame->regs[code[3+num_arg]], num_arg, arglist); | |
// now we restore the bytecode pointer and go back to running. | |
// except this is actually a trick. we load the value from | |
// t->saved_instruction_ptr which we just saved above, so | |
// *normally* this just restores what we saved. However, | |
// the function we call can overwrite that value. | |
code = t->saved_instruction_ptr; | |
// Now we'll dispatch to the next opcode pointed to by 'code'. | |
// if it was the value we saved ourselves, execution continues | |
// normally. However, if it points somewhere else, something | |
// else happens. In particular, if the function we just called | |
// calls | |
// | |
// svm_from_nativefunc_defer_exception(svmthread *t, svmatom exception_object); | |
// | |
// with this thread's handle, that function will overwrite | |
// t->saved_instruction_ptr with the pointer to a special | |
// instruction sequence that throws the exception. (It'll | |
// copy the old t->saved_instruction_ptr somewhere else, | |
// so that we can restore the interpreter state properly.) | |
// | |
// The point of this is it allows the function we call to | |
// control the behavior of the interpreter at this point, | |
// without us checking a return value from the native function. | |
// On x86 this may save a few instructions; on others, it | |
// probably costs about the same number of instructions | |
// but avoids using another branch prediction slot, etc. | |
// | |
// Because that behavior is not a path that needs to be high | |
// performance, and to avoid overloading the bytecode namespace | |
// in BYTECODE mode, the special instruction sequence doesn't even | |
// get a custom opcode for this behavior; instead we use the regular | |
// SVMOP_break bytecode, and make the SVMOP_break implementation check | |
// if it's in the special bytecode. | |
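// For example, a native function using this mechanism might look roughly like
// this (a hypothetical sketch: the parameter list is inferred from the call
// above, and make_error_atom() stands in for however the embedder builds an
// exception atom):
//    void my_native(svmthread *t, svmatom *result, int num_arg, svmatom *args)
//    {
//       if (num_arg == 0) {
//          svm_from_nativefunc_defer_exception(t, make_error_atom());
//          return;  // the dispatch below then runs the exception sequence
//       }
//       ... // normal path: write the result through 'result'
//    }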
} | |
// there's another, different calling convention for C code. in | |
// this case, it takes a single structure pointer, and the fields | |
// in it do not have types. they're still all svmvalues; their | |
// layout is identical to the svmstructs that we haven't yet talked | |
// about (and in fact you can use "apply" to call a C function | |
// with a pre-baked struct). | |
// | |
// Again, we'll look at a static one. With this calling convention, | |
// it's our job to do coercions, but we'll start with a fully static
// one that doesn't have to do coercions.
CASE(call_fc_nstat_static) | |
{ // 0=opcode, 1=func, 2=argc, 3..argc+2=arguments, argc+3=retval, | |
// this proceeds as before | |
native_func_staticargs *funcptr = (native_func_staticargs *) cpool(code[1]).v.ptr; | |
int num_arg = code[2], i; | |
// now the arglist only has values, not types | |
svmvalue *arglist = (svmvalue *) stack; | |
// we're going to store the arguments on the stack, so check | |
// that there's room | |
if ((char *) &arglist[num_arg] > (char *) t) | |
goto stack_overflow; | |
// copy the arguments into the arglist | |
for (i=0; i < num_arg; ++i) | |
arglist[i] = val(code[3+i]).v; | |
// we use the same trick again | |
t->saved_instruction_ptr = code + num_arg+4; | |
funcptr(t, &frame->regs[code[3+num_arg]], arglist); | |
code = t->saved_instruction_ptr; | |
} | |
// now that we've seen all the pieces, let's look at one of the most complicated | |
// opcodes: SVMOP_call in all its full-on glory. the complexity of SVMOP_call here is | |
// something you absolutely want to avoid for performance, which is why we have | |
// all the special cases (and more to come). Generally, SVMOP_call should | |
// only be used when the internal type of the function isn't known in advance. | |
CASE(call) { | |
// the first thing we have to do is get at the function and see what it is | |
svmatom *newfuncatom = pval(code[1]); | |
// there are four cases: closure, bytecode function, native function, | |
// or something else. for maximum generality, if it's something else, | |
// we could try coercing it to a function. but no! that way lies madness, | |
// so we'll just throw an error. | |
// Closures are special because they just set a temp variable, replace | |
// newfuncatom, then run the regular dispatch. So the natural way to | |
// code this would be to do this right here: | |
// if (closure) { | |
// t->context->something = closure->something; | |
// newfuncatom = closure->something; | |
// } | |
// and then it falls through to the regular code that deals | |
// with newfuncatom when it's not a closure. | |
// | |
// However, since closures have more overhead anyway and are likely | |
// to be rare, we don't want to pay the cost of testing for a closure | |
// on every regular call. So we'll make the closure path the least efficient
// by testing for closure *last*. If we find one, we want it to behave
// the same as the above code, so we'll issue a *goto* that jumps | |
// back here, as if it had just executed the above if. | |
// | |
// An alternative way to avoid paying the 'if (closure)' cost above | |
// would be to always have an environment and just always copy it to | |
// the temp without testing. However, we'd pay 3-4 instructions on | |
// copying the environment, and we'd have to indirect the function | |
// one deeper. | |
call_got_closure: | |
if (newfuncatom->type.code == SVMT_bytecode_func) { | |
// now we just have the normal bytecode call; this is pretty long, | |
// but it's identical to OP_call_fc_bcode except for where we load the func from | |
svmfunc *newfunc = (svmfunc *) newfuncatom->v.ptr; | |
svmstackframe *newframe = (svmstackframe *) stack; | |
int i,num_arg; | |
void *newstack; | |
newframe->saved_func = func; | |
newframe->saved_stackframe = frame; | |
num_arg = code[2]; | |
newframe->saved_instruction_ptr = code + num_arg+4; | |
newstack = (char *) stack + newfunc->stack_frame_size; | |
if (num_arg < newfunc->min_args || num_arg > newfunc->max_args) | |
goto typecheck; | |
if ((char *) newstack > (char *) t) | |
goto stack_overflow; | |
if ((char *) &newframe->regs[num_arg] > (char *) t) | |
goto stack_overflow; | |
if (num_arg <= newfunc->max_arg_types) | |
for (i=0; i < num_arg; ++i) | |
svm_coerce(t, &newframe->regs[i], newfunc->type_constants[i], pval(code[i+3])); | |
else { | |
int num_types = newfunc->max_arg_types; | |
for (i=0; i < num_types; ++i) | |
svm_coerce(t, &newframe->regs[i], newfunc->type_constants[i], pval(code[i+3])); | |
for ( ; i < num_arg; ++i) | |
newframe->regs[i] = val(code[i+3]); | |
} | |
code = newfunc->bytecode; | |
func = newfunc; | |
frame = newframe; | |
stack = newstack; | |
if (func->has_initialization) { | |
t->context->temp = num_arg; | |
goto func_initializer; | |
} | |
} else if (newfuncatom->type.code == SVMT_native_func) { | |
// now we have a function call. we have to identify which kind of function it is. | |
svmnativefunc *newfunc = (svmnativefunc *) newfuncatom->v.ptr; | |
if (newfunc->is_statictyped) { | |
// this is a statically-known arguments call, so the code below is similar | |
// to OP_call_fc_nstat except for where we load the func from, | |
// and we have to do coercions while packing the arglist
svmfunctype *type = newfunc->type; | |
native_func_staticargs *funcptr = newfunc->func.stat; | |
int num_arg = code[2], i; | |
svmvalue *arglist = (svmvalue *) stack; | |
// with packed arguments like this, we require the number of arguments | |
// be a single specific value, so instead of range-checking num_arg, | |
// we require it match exactly; | |
// | |
// in other words we assume type->max_args == type->num_args | |
if (num_arg != type->max_args) | |
goto typecheck; | |
if ((char *) &arglist[num_arg] > (char *) t) | |
goto stack_overflow; | |
for (i=0; i < num_arg; ++i) | |
svm_coerce_static(t, &arglist[i], type->arg_types[i], pval(code[3+i])); | |
if (t->context->has_exception) | |
goto typecheck; | |
t->saved_instruction_ptr = code + num_arg+4; | |
funcptr(t, &frame->regs[code[3+num_arg]], arglist); | |
code = t->saved_instruction_ptr; | |
} else { | |
            // this is a call where the callee decodes the arguments itself,
// so the code below is identical to OP_call_fc_native except | |
// where we load the func from | |
native_func_dynamicargs *funcptr = newfunc->func.dyn; | |
int num_arg = code[2], i; | |
svmatom *arglist = (svmatom *) stack; | |
if (num_arg < newfunc->type->min_args || num_arg > newfunc->type->max_args) | |
goto typecheck; | |
if ((char *) &arglist[num_arg] > (char *) t) | |
goto stack_overflow; | |
for (i=0; i < num_arg; ++i) | |
arglist[i] = val(code[3+i]); | |
t->saved_instruction_ptr = code + num_arg+4; | |
funcptr(t, &frame->regs[code[3+num_arg]], num_arg, arglist); | |
code = t->saved_instruction_ptr; | |
} | |
} else if (newfuncatom->type.code == SVMT_closure) { | |
svmclosure *c = (svmclosure *) newfuncatom->v.ptr; | |
t->context->environment = c->environment; | |
newfuncatom = &c->function; | |
goto call_got_closure; | |
} else | |
goto typecheck; | |
} | |
// And we have an even crazier case: 'apply', which accepts an svmarray or | |
// svmstruct for the arguments. | |
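   // Operand layout assumed below (again inferred from the indices used):
   // code[1] names the function, code[2] the array/struct of arguments,
   // code[3] the destination register; the next instruction is at code + 4.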
CASE(apply) | |
{ | |
int is_array = 0; | |
svmatom *arguments = pval(code[2]); | |
svmatom *newfuncatom = pval(code[1]); | |
// check the argument type, and check for fast cases while we're in there | |
if (TYPE_IS_ARRAY(arguments->type)) { | |
if (newfuncatom->type.code == SVMT_native_func) { | |
svmnativefunc *newfunc = (svmnativefunc *) newfuncatom->v.ptr; | |
// if we have a dynamic-typed native function, then we just pass the array | |
if (!newfunc->is_statictyped) { | |
svmarray *a = arguments->v.array_ptr; | |
int num_arg = a->count; | |
if (num_arg < newfunc->type->min_args || num_arg > newfunc->type->max_args) | |
goto typecheck; | |
t->saved_instruction_ptr = code+4; | |
(newfunc->func.dyn)(t, &frame->regs[code[3]], num_arg, a->data); | |
code = t->saved_instruction_ptr; | |
NEXT(); | |
} | |
} | |
is_array = 1; | |
} else if (TYPE_IS_POINTER_TO_TYPE(arguments->type) && TYPE_IS_STRUCT(arguments->type)) { | |
if (newfuncatom->type.code == SVMT_native_func) { | |
svmnativefunc *newfunc = (svmnativefunc *) newfuncatom->v.ptr; | |
if (newfunc->is_statictyped && newfunc->type->struct_type == arguments->type.ptr) { | |
t->saved_instruction_ptr = code + 4; | |
newfunc->func.stat(t, &frame->regs[code[3]], arguments->v.struct_ptr->data); | |
code = t->saved_instruction_ptr; | |
NEXT(); | |
} | |
} | |
is_array = 0; | |
} else { | |
goto typecheck; | |
} | |
// now we have the non-fast path. first, we handle a closure, | |
// rather than doing that "late" the way we did for SVMOP_call. | |
if (newfuncatom->type.code == SVMT_closure) { | |
svmclosure *c = (svmclosure *) newfuncatom->v.ptr; | |
t->context->environment = c->environment; | |
newfuncatom = &c->function; | |
} | |
// next, we'll copy the arguments onto the thread stack, so that we | |
// can ignore the distinction between the arguments being in | |
      // an array or being in a structure.
{ | |
int num_arg; | |
svmatom *arglist; | |
// To do that, we need to know where they go on the stack, | |
// which depends on the type of function it is. | |
if (newfuncatom->type.code == SVMT_bytecode_func) { | |
svmstackframe *newframe = (svmstackframe *) stack; | |
arglist = &newframe->regs[0]; | |
} else { | |
arglist = (svmatom *) stack; | |
} | |
if (is_array) { | |
int i; | |
svmarray *a = arguments->v.array_ptr; | |
num_arg = a->count; | |
if (arglist + num_arg > (svmatom *) t) | |
goto stack_overflow; | |
for (i=0; i < num_arg; ++i) | |
arglist[i] = a->data[i]; | |
} else { | |
int i; | |
svmstructtype *type = GET_STRUCT_TYPE(arguments->type); | |
num_arg = type->num_fields; | |
if (arglist + num_arg > (svmatom *) t) | |
goto stack_overflow; | |
for (i=0; i < num_arg; ++i) { | |
arglist[i].v = arguments->v.struct_ptr->data[i]; | |
arglist[i].type = type->field[i]; | |
} | |
} | |
if (newfuncatom->type.code == SVMT_bytecode_func) { | |
svmfunc *newfunc = (svmfunc *) newfuncatom->v.ptr; | |
svmstackframe *newframe = (svmstackframe *) stack; | |
int i; | |
void *newstack; | |
newframe->saved_func = func; | |
newframe->saved_stackframe = frame; | |
newframe->saved_instruction_ptr = code + 4; | |
newstack = (char *) stack + newfunc->stack_frame_size; | |
if (num_arg < newfunc->min_args || num_arg > newfunc->max_args) | |
goto typecheck; | |
if ((char *) newstack > (char *) t) | |
goto stack_overflow; | |
if ((char *) &newframe->regs[num_arg] > (char *) t) | |
goto stack_overflow; | |
svm__assert(arglist == newframe->regs); | |
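            // the arguments are already sitting in the new frame's registers
            // (they were copied there above), so unlike the SVMOP_call path we
            // only need to coerce the typed ones in place, not move anything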
if (num_arg <= newfunc->max_arg_types) | |
for (i=0; i < num_arg; ++i) | |
svm_coerce_inplace(t, &newframe->regs[i], newfunc->type_constants[i]); | |
else { | |
int num_types = newfunc->max_arg_types; | |
for (i=0; i < num_types; ++i) | |
svm_coerce_inplace(t, &newframe->regs[i], newfunc->type_constants[i]); | |
} | |
code = newfunc->bytecode; | |
func = newfunc; | |
frame = newframe; | |
stack = newstack; | |
if (func->has_initialization) { | |
t->context->temp = num_arg; | |
goto func_initializer; | |
} | |
} else if (newfuncatom->type.code == SVMT_native_func) { | |
svmnativefunc *newfunc = (svmnativefunc *) newfuncatom->v.ptr; | |
if (newfunc->is_statictyped) { | |
svmfunctype *type = newfunc->type; | |
native_func_staticargs *funcptr = newfunc->func.stat; | |
int i; | |
svmvalue *final_args = (svmvalue *) stack; | |
if (num_arg != type->max_args) | |
goto typecheck; | |
if ((char *) &arglist[num_arg] > (char *) t) | |
goto stack_overflow; | |
               // note that final_args and arglist overlap, but in a safe way:
               // svmvalue is presumably no larger than svmatom, so the slot
               // written for argument i never reaches past the slot argument i
               // was read from, and the not-yet-read arguments stay intact
for (i=0; i < num_arg; ++i) | |
svm_coerce_static(t, &final_args[i], type->arg_types[i], &arglist[i]); | |
if (t->context->has_exception) | |
goto typecheck; | |
t->saved_instruction_ptr = code + 4; | |
funcptr(t, &frame->regs[code[3]], final_args); | |
code = t->saved_instruction_ptr; | |
} else { | |
native_func_dynamicargs *funcptr = newfunc->func.dyn; | |
svmatom *arglist = (svmatom *) stack; | |
if (num_arg < newfunc->type->min_args || num_arg > newfunc->type->max_args) | |
goto typecheck; | |
t->saved_instruction_ptr = code + 4; | |
funcptr(t, &frame->regs[code[3]], num_arg, arglist); | |
code = t->saved_instruction_ptr; | |
} | |
} else { | |
goto typecheck; | |
} | |
} | |
} | |
// Now let's make a few call variants that are optimized for when the function to | |
// call is in a register, but the types of everything are statically known, | |
// and we know there's no varargs and such. We're going to assume that languages | |
// allow free intermixing of bytecode and native coded functions, so they can't | |
// know whether a register contains one or the other. So we're not going to | |
// bother producing specialized cases that limit to one or the other, and | |
// because this general case has to handle all three, it's pretty long, so | |
   // we're only going to have a couple of specialized ones. We'll also
   // assume this means that while we can tell whether a function might use
   // varargs (that's part of the public signature), we don't know whether it
   // might use alloca or even build an 'args' array (that's not part of the
   // public function signature). Thus we still have to do all the function
   // initialization for those cases.
// | |
// The first case we have is a function call with no arguments, which is | |
// much simpler since all the argument processing is gone. | |
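   // Operand layout assumed below (inferred from the indices used):
   // code[1] is the register holding the function, code[2] the destination
   // register; the next instruction is at code + 3.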
CASE(call_fr_a_static) | |
{ | |
// this proceeds largely the same as SVMOP_call, except we skip all the | |
// argument processing. | |
      svmatom *newfuncatom = &regs(code[1]);
if (newfuncatom->type.code == SVMT_bytecode_func) { | |
svmfunc *newfunc = (svmfunc *) newfuncatom->v.ptr; | |
svmstackframe *newframe = (svmstackframe *) stack; | |
void *newstack; | |
newframe->saved_func = func; | |
newframe->saved_stackframe = frame; | |
newframe->saved_instruction_ptr = code + 3; | |
newstack = (char *) stack + newfunc->stack_frame_size; | |
if ((char *) newstack > (char *) t) | |
goto stack_overflow; | |
code = newfunc->bytecode; | |
func = newfunc; | |
frame = newframe; | |
stack = newstack; | |
if (func->has_initialization) { | |
t->context->temp = 0; | |
goto func_initializer; | |
} | |
} else if (newfuncatom->type.code == SVMT_native_func) { | |
svmnativefunc *newfunc = (svmnativefunc *) newfuncatom->v.ptr; | |
t->saved_instruction_ptr = code + 3; | |
if (newfunc->is_statictyped) | |
newfunc->func.stat(t, &frame->regs[code[2]], SVM__NULL); | |
else | |
newfunc->func.dyn(t, &frame->regs[code[2]], 0, SVM__NULL); | |
code = t->saved_instruction_ptr; | |
} else | |
goto typecheck; | |
} | |
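   // For illustration only (not from the original source): a minimal sketch
   // of a dynamic-args native function that would plug into the call paths
   // above. The signature is inferred from the call sites
   // 'funcptr(t, &frame->regs[...], num_arg, arglist)'; the void return type
   // and the helpers svm_to_number/svm_make_number are assumptions, not part
   // of the real API.
   //
   //    static void my_native_sum(svmthread *t, svmatom *dest,
   //                              int num_arg, svmatom *arglist)
   //    {
   //       double total = 0;
   //       int i;
   //       for (i=0; i < num_arg; ++i)
   //          total += svm_to_number(t, &arglist[i]);  // hypothetical coercion helper
   //       *dest = svm_make_number(total);             // hypothetical constructor
   //    }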