// stb_vm excerpts
// @nothings, created August 4, 2014
#define atom_array_ref_preshifted(arr,offset) ((svmatom *) (((uint8 *) (arr)) + (offset)))
#define atom_array_ref_naive(arr,offset) (&(arr)[offset])
#ifdef STB_VM_CONFIG_PRESHIFTED_BYTESTREAM_CPOOL
#define atom_cpool_ref atom_array_ref_preshifted
#else
#define atom_cpool_ref atom_array_ref_naive
#endif
#ifdef STB_VM_CONFIG_PRESHIFTED_BYTESTREAM_REG
#define atom_reg_ref atom_array_ref_preshifted
#else
#define atom_reg_ref atom_array_ref_naive
#endif
#ifdef STB_VM_CONFIG_PRESHIFTED_STRUCT_OFFSETS
#define struct_access(str,off) (*(svmvalue *) (((uint8 *) (str)) + (off)))
#else
#define struct_access(str,off) (str)[off]
#endif
#ifdef STB_VM_CONFIG_HOLD_REGISTERS_IN_SMALL_INT
typedef ubytecode regnum;
#else
typedef int regnum;
#endif
#ifdef STB_VM_CONFIG_PTRCODE
#define STB_VM_CONFIG_USE_LABEL_ADDRESSES
#endif
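// for illustration only: a build selects among these configurations at
// compile time, so a threaded-code build on gcc might look like this
// (the config macro names are the real ones defined above; the
// include-style usage is a hypothetical sketch):
//    #define STB_VM_CONFIG_PTRCODE
//    #define STB_VM_CONFIG_HOLD_REGISTERS_IN_SMALL_INT
//    #include "stb_vm.c"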
// svm_interpret takes as input a just-resumed thread. this
// is either the result of a thread that has had a new
// function set up using 'thread_prepare_function', or is
// a thread that previously yielded, and is now being resumed.
// if the thread previously yielded and that yield call itself
// returns a value back to the thread, the caller must have
// already prepared this.
svmatom * svm_interpret(svmthread *input_t, svmcontext *input_c)
{
// with gcc, we can make the interpreter faster by storing
// the addresses of all the instructions and branching directly
// to them. so we make that list now.
#ifdef STB_VM_CONFIG_USE_LABEL_ADDRESSES
static void *opcode_targets_list[SVMOP__total_bytecodes] =
{
#define OP(x,y) &&LABEL_##x,
OPCODES()
#undef OP
};
register void **opcode_targets = opcode_targets_list;
#endif
// we're going to try to run as much code as possible inside
// this one function. as long as we're in the function, we don't
// have to save/restore registers or anything. to make this explicit
// on machines with small register pools (like x86), we'll ask for
// our most important locals in registers (though most compilers
// will ignore this)
register svmthread *t = input_t;
register sbytecode *code;
register svmstackframe *frame;
register svmfunc *func;
// the stack pointer is mostly only manipulated on function call/return,
// so it's the lowest priority for being a register
void *stack;
// for a threaded interpreter, we want to store the actual
// label addresses in the "bytecode". to do so, the routine
// that does the storing needs to be able to get
// at the addresses, but they're only accessible inside the
// function. so we provide a special path that returns them.
#ifdef STB_VM_CONFIG_PTRCODE
if (t == NULL)
return (svmatom *) opcode_targets;
#endif
// save the context in the thread so we don't have to pass it around separately
t->context = input_c;
// note that the function parameters are now dead--they're never accessed again.
// just to be on the safe side we'll tell the compiler that as best we can
input_t = NULL;
input_c = NULL;
// now we're ready to begin interpreting. the first thing we do
// is load up our registers with the full execution state
code = t->saved_instruction_ptr;
frame = t->saved_frame;
func = t->saved_func;
stack = t->saved_stack_top;
// now we're ready to start dispatching bytecodes. to do this,
// we use two different methods. for portability, we use code
// that looks like "for(;;) switch(bytecode) { }". with gcc,
// we use code that looks like "goto *opcode_table[bytecode]", and
// we put this branch at the *end* of every opcode's implementation.
// if TWOBYTECODE is enabled, then the stored bytecodes are *byte*
// offsets into the jumptable, instead of indices; this lets
// us avoid a shift on the powerpc. If PTRCODE is enabled,
// then we store the labels directly in the code, and don't
// even have to look in opcode_targets table.
// we want a convenient shortcut that lets us refer to the
// data in the instruction stream as unsigned instead of signed.
#define ucode ((ubytecode *) code)
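// for example (illustrative), the same operand slot reads differently
// through the two views:
//    code[1]     // signed view: -1,-2,... can index the constant pool
//    ucode[1]    // unsigned view: counts, type indices, byte offsets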
#ifndef STB_VM_CONFIG_USE_LABEL_ADDRESSES
// for portable code, we use a switch() to dispatch,
// and it's inside a for(;;) loop.
#define START_CASES \
for (;;) \
switch(*ucode)
// NEXT() then breaks out of the switch(), causing it to loop around the
// for(;;) to hit the switch() again. CASE() ends the previous opcode
// with NEXT(), then defines a normal switch() case... note that NEXT()
// won't work correctly inside a loop!
#define NEXT() break
#define CASE_RAW(x) case SVMOP_##x:
#ifdef STB_VM_CONFIG_PTRCODE
#error not-reached
#endif
#else
// using gcc label addresses, we don't need a loop since we always
// branch to next case, and we don't need to do the initial bytecode
// invocation because the first CASE() will generate a NEXT()
// automatically
#define START_CASES
// with gcc label addresses, NEXT explicitly jumps to the next
// instruction; CASE ends the previous opcode by calling NEXT
// and then declares the label for this opcode
#if defined(STB_VM_CONFIG_PTRCODE)
// if PTRCODE, then the void * is stored explicitly in the code
#define OPCODE_TARGET(n) ((void*) (n))
#elif defined(STB_VM_CONFIG_BYTECODE)
// if bytecode, we look it up in the table
#define OPCODE_TARGET(n) opcode_targets[n]
#else
// if two-byte code, we assume it's been preshifted, so it's a byte offset
#define OPCODE_TARGET(n) *((void **) (((char *) opcode_targets) + (n)))
#endif
#define NEXT() goto *OPCODE_TARGET(*ucode)
#define CASE_RAW(x) LABEL_##x:
#endif
#define CASE(y) NEXT(); CASE_RAW(y)
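// to make the two dispatch styles concrete, a one-word opcode like nop
// (defined below) expands roughly as follows (illustrative):
//    portable build:  break; case SVMOP_nop: ... /* body */
//    gcc build:       goto *OPCODE_TARGET(*ucode); LABEL_nop: ... /* body */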
START_CASES {
// next we define a bunch of macros that will make the source
// code smaller and easier to follow.
// the constants used in a given function are stored in memory
// immediately before the function, so to index them, we cast
// the function pointer to the right type, and index that. the
// indexing uses the previously defined macro that lets us
// precompute a shifted index to save an instruction on non-x86
#define cpool_ptr ((svmatom *) func)
#define cpool(n) (*atom_cpool_ref(cpool_ptr, n))
// if an instruction contains a constant index which cannot be
// a register, and we use PTRCODE, then the constant index can
// be a direct pointer to the constant.
#ifdef STB_VM_CONFIG_PTRCODE
#define cpool_notreg(n) *((svmatom *) (n))
#else
#define cpool_notreg(n) cpool(n)
#endif
#ifdef STB_VM_CONFIG_BYTECODE
#define cpoolw(n) ((svm_uintptr) cpool(n).v.u)
#else
#define cpoolw(n) ((svm_uintptr) ((-(n)-1))) // -1 => 0, -2 => 1, -3 => 2
#endif
// this should be true iff ints and pointers are the same size, and PTRCODE
#ifdef STB_VM_CONFIG_INLINE_INT_CONSTANTS
// read the value directly from the instruction stream
#define cpooli(code_lvalue) (*(svmvalue *) &(code_lvalue))
#else
// read the value the usual way
#define cpooli(code_lvalue) (cpool_notreg(code_lvalue).v)
#endif
// this should be true iff floats and pointers are the same size, and PTRCODE
#ifdef STB_VM_CONFIG_INLINE_FLOAT_CONSTANTS
// read the value directly from the instruction stream
#define cpoolf(code_lvalue) (*(svmvalue *) &(code_lvalue))
#else
// read the value the usual way
#define cpoolf(code_lvalue) (cpool_notreg(code_lvalue).v)
#endif
#ifdef STB_VM_CONFIG_PTRCODE
// read the value directly from the instruction stream
#define cpoolp(code_lvalue) (*(svmvalue *) &(code_lvalue))
#else
// read the value the usual way
#define cpoolp(code_lvalue) (cpool(code_lvalue).v)
#endif
// registers are stored immediately after the "main" stack frame
// data. pretty much all processors have addressing modes with
// fixed offsets, so we don't bother trying to optimize the frame
// pointer to point exactly to the registers (you could imagine
// an instruction set where addressing of reg+constant is available,
// or reg+reg, but not reg+reg+constant, in which case it would be
// more efficient to always have the pointer point to the beginning
// of the array (the array would be indexed as reg+reg, and the other
// elements of the structure would be reg+constant). Since there
// don't seem to be any important CPUs along those lines, I don't.
#define regs(n) (*atom_reg_ref(frame->regs, (n)))
// the most general instructions may read from either registers or
// constants, so we make macros that hide the details of that.
#define val(n) ((n) < 0 ? cpool(n) : regs(n))
#define pval(n) ((n) < 0 ? &cpool(n) : &regs(n))
// we need to do array length checking. if the length is exceeded,
// we branch to an array-range-check failure handler
#define acheck(a,i) if ((i) >= atom_svmarray(a)->count) goto array_rangecheck;
#define awritecheck(a,i) if ((i) >= atom_svmarray(a)->count) if (!expand_array(atom_svmarray(a), (i))) goto array_rangecheck;
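// as a sketch of how these get used (hypothetical opcode, not part of
// this excerpt; assumes indices are unsigned so the single compare also
// rejects negative values):
//    CASE(aload) {
//        svmatom *a = pval(code[1]);
//        svm_uintptr i = (svm_uintptr) regs(code[2]).v.u;
//        acheck(a, i);
//        regs(code[3]) = atom_svmarray(a)->data[i];
//        code += 4;
//    }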
// at this point, we're ready for opcodes
// the simplest opcode is a no-op; it does nothing, at least in the virtual
// machine semantics. However, in STABVM, every opcode is responsible for
// advancing the instruction pointer, which is the variable named "code".
// We do this because instructions are different lengths. We could try
// to make some code that decodes the opcode and figures out the length
// by checking a table, but this would involve extra indirections; it's
// much faster just to do the add by the right amount in the opcode
// implementation itself (although it uses more codespace). And once we're
// doing that, it doesn't make any sense for the dispatcher to try to advance
// by *only* the opcode. The work it is doing is redundant to that done in
// most opcodes, so it makes more sense to move it into the opcode itself.
//
// The bytecode might be bytes or it might actually be words, but whichever
// it is, the vast majority of opcodes use the same number of bytes/words,
// so we can just add that number to the pointer, which is itself either
// bytes or words as appropriate.
CASE(nop)
code += 1;
// We saw at the start how threads are restored from suspension.
// while that's still fresh, let's look at the corresponding call--
// a bytecode that *causes* a suspension.
CASE(yield)
// This instruction takes up 3 spots in the bytecode stream:
// #0 : OP_yield
// #1 : value to "return" on yielding
// #2 : register to write the result when we "restore" from yielding
// implementation is trivial: we just save everything and return
// the return value. we don't worry about the result writing, we
// leave that to the unsuspend logic.
t->context->yield_state = SVM_STATE_yield;
t->saved_frame = frame;
t->saved_func = func;
t->saved_stack_top = stack;
// points to the next instruction
t->saved_instruction_ptr = code+3;
// now return a pointer to the "return" value
return pval(code[1]);
// Optimized bytecodes are provided which are much simpler and faster
// than the general-purpose bytecodes defined by the specification,
// with optimized decoding of common cases and/or optimized behavior
// when types are statically known.
// We'll start with some of the simplest ones. The first two instructions
// perform moves from a register to a register, and from a constant to a
// register. We write simple opcodes out as one-liners so that the code
// lines up with other opcodes, making it easier to see patterns and bugs.
// Each operation like code[1] fetches opcode parameters out of the
// bytestream; the macro regs(N) fetches the Nth register, and cpool(N)
// fetches the Nth constant from the constant pool (constants are actually
// accessed with negative numbers, which is why we must access through
// the signed value 'code' not the unsigned value 'ucode')
CASE(move_rr_dyn) regs(code[2]) = regs (code[1]); code += 3;
CASE(move_rc_dyn) regs(code[2]) = cpool_notreg(code[1]); code += 3;
// The next two instructions are similar to the previous. However,
// each register/constant is an "atom" which consists of two fields:
// a type field (saying what dynamic type the atom has) and a "value"
// field (whose encoding depends on the type). For optimized bytecodes
// for statically-typed languages, the compiler knows what types the
// registers are, and doesn't need to update the type field, so we
// only copy the value field, which is denoted for conciseness as ".v".
// The ".v" of the register is a union, so later we'll see accesses
// to fields in the union, but for these generic instructions that
// work on arbitrary types, we just copy the whole value union.
CASE(move_rr_static) regs(code[2]).v = regs (code[1]).v; code += 3;
CASE(move_rc_static) regs(code[2]).v = cpool_notreg(code[1]).v; code += 3;
// Now we have the generic move case, which is a dynamic move, but
// we don't know at interpretation time whether we're moving from
// a register or the constant pool. (The compiler does know, and
// should have just output the right one, but for simplicity we
// allow the compiler to target these less-efficient instructions.)
CASE(move) {
regs(code[2]) = val(code[1]);
code += 3;
}
// A quick side note: when we return from a function, we need to write
// the result to a register. To avoid needing extra register moves,
// we don't use a fixed register; the instruction says which
// register to write to. This will happen a long time after we decoded
// the function call, though. So we can either save away which register
// it was to the stack, or we can redecode the previous instruction when
// we return, and extract the register then. The latter sounds hard,
// because instructions are variable length, but we use a simple trick:
// we put the register to write the result to in the last byte/word of
// the instruction; then on return we always just look one byte back
// in the codestream to find the target register. (Note that we cannot
// have function calls that discard their return value; they have to
// have a dummy register and write it there.)
//
// So for consistency, the register written by non-function call
// instructions ALSO appears last in the opcode bytes. This allows us
// to make any instruction trigger a trap handler routine, and the return
// from that routine will write the return value to the correct place,
// allowing us to perform such trap handlers within a single interpreter
// loop (without recursing into the interpreter).
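// as a concrete (illustrative) example of that layout, a call encoded as
//    [OP_call] [func] [argc=2] [arg0] [arg1] [retreg]
// leaves the saved instruction pointer just past [retreg], so on return
// caller_code[-1] is always the destination register, no matter which
// call opcode created the frame.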
// There's a second move-like instruction, which includes a coercion between
// the input type and an arbitrary type. Here we see our first exception;
// if the coercion is impossible, we can tell from the return code, and
// we branch to a common chunk of code that will set up a typecheck
// exception, and then search up the stack for exception handlers.
CASE(coerce) {
regnum outreg = code[3];
if (svm_coerce_explicit(t, &regs(outreg), func->type_constants[ucode[2]], pval(code[1])))
goto typecheck;
code += 4;
}
// There's a similar convert operation for when we explicitly request a coercion.
// In this case, we allow for the possibility that user-defined conversion
// code runs and might throw an exception that's *not* a typecheck exception,
// so if the conversion throws an exception, we require it to have already
// set it up properly.
CASE(convert) {
regnum outreg = code[3];
svm_convert(t, &regs(outreg), func->type_constants[ucode[2]], pval(code[1]));
if (t->context->has_exception)
goto exception_handler;
code += 4;
}
// And a type-verifying operation which checks if the type is correct
CASE(typecheck) {
regnum reg = code[1];
if (func->type_constants[ucode[2]].code != regs(reg).type.code)
goto typecheck;
code += 3;
}
// now that we've seen the basics of this, let's look at some instructions
// that actually compute something. first we'll only look at the statically typed
// ones, since those are one-liners. for example, integer addition always
// produces integers (they wrap mod 2^32), so they can't throw exceptions
// and don't need coercion. first we have the simple case where we add two
// registers and store to a third.
CASE(add_rr_int ) regs(code[3]).v.u = regs (code[1]).v.u + regs (code[2]).v.u; code += 4;
// next we have the possibility of adding a constant. since addition is commutative,
// we only need to allow a constant on the right. we make two versions: one loads
// the constant from the constant pool, and the other, faster one uses the value
// from the bytecode stream itself. This code is written as if the values are
// unsigned, but it will actually handle both signed and unsigned values (and
// mixtures of them), because we only support platforms that use 2's complement arithmetic.
CASE(add_rc_int ) regs(code[3]).v.u = regs (code[1]).v.u + cpooli(code[2]) .u; code += 4;
CASE(add_rb_int ) regs(code[3]).v.u = regs (code[1]).v.u + code[2] ; code += 4;
// The third possibility is adding two constants, but an optimizing compiler
// should just replace this with the sum of the constants itself, thus an optimizing
// compiler never needs to generate that (and the unoptimized OP_add instruction
// *does* support this.)
// Now we need to repeat the same thing for floats. first we support adding
// two registers or adding a register and a constant. we also support the case of
// adding a constant integer out of the bytestream, although that integer will have
// to be converted to float, which may be slower than fetching from the constant
// pool on some platforms. but we define it just in case it isn't.
CASE(add_rr_float) regs(code[3]).v.f = regs (code[1]).v.f + regs (code[2]).v.f; code += 4;
CASE(add_rc_float) regs(code[3]).v.f = regs (code[1]).v.f + cpoolf(code[2]) .f; code += 4;
CASE(add_rb_float) regs(code[3]).v.f = regs (code[1]).v.f + code[2] ; code += 4;
// Next, we move on to subtraction. Now, subtraction isn't commutative, so we
// might seem to need to implement 'reg - reg', 'reg - const', and 'const - reg'
// to cover all the cases. However, 'reg - const' can be implemented using
// 'reg + const' with the constant negated, so we omit that one. So subtraction
// only has two variations, with the left side being either a register or a
// constant. To save bytecode space, we don't implement having the constant
// in the bytecode stream, since that operation is more rare.
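// so, illustratively, a compiler lowers "r3 = r1 - 7" not with a
// sub_rc opcode but as
//    add_rc_int r1, (constant -7), r3
// which is why only the rr and cr variants appear below: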
CASE(sub_rr_int ) regs(code[3]).v.u = regs (code[1]).v.u - regs (code[2]).v.u; code += 4;
CASE(sub_cr_int ) regs(code[3]).v.u = cpooli(code[1]) .u - regs (code[2]).v.u; code += 4;
CASE(sub_rr_float) regs(code[3]).v.f = regs (code[1]).v.f - regs (code[2]).v.f; code += 4;
CASE(sub_cr_float) regs(code[3]).v.f = cpoolf(code[1]) .f - regs (code[2]).v.f; code += 4;
// Negation could be viewed as a special case of subtraction, but is common
// enough that we optimize for it (and we'll see another negation optimization
// in OP_mulneg).
CASE(neg_rr_int ) regs(code[2]).v.s = -regs(code[1]).v.s; code += 3;
CASE(neg_rr_float) regs(code[2]).v.f = -regs(code[1]).v.f; code += 3;
// Absolute value is so simple to implement on most CPUs that it would be
// a shame to have to implement it as a native function, so we go ahead
// and provide it even if it's not all that common. We don't need to worry
// about constants; the only plausible case is register-to-register.
CASE(abs_rr_int ) regs(code[2]).v.s = svm__abs (regs(code[1]).v.s); code += 3;
CASE(abs_rr_float) regs(code[2]).v.f = (float) svm__fabs(regs(code[1]).v.f); code += 3;
// As a special case, we have one which forces the output type to be unsigned;
// this is useful for taking the absolute value of -2^31, which is still negative
// if we keep the result as a signed int
CASE(absu_rr_int ) {
regnum d = code[2];
regs(d).v.s = svm__abs(regs(code[1]).v.s);
regs(d).type.code = SVMT_u32;
code += 3;
}
// Next we have multiplication. This is similar to addition (it's commutative),
// but it's different in that signed and unsigned multiplication are different
// operations. In fact, we need to support multiplying a signed by an unsigned
// number, but that turns out to just be an unsigned multiply as well. Since
// it's commutative, each type has two cases; again we make two versions of the
// constant multiply, with the constant stored in different places, except for
// float multiply.
CASE(mul_rr_int ) regs(code[3]).v.s = regs(code[1]).v.s * regs (code[2]).v.s; code += 4;
CASE(mul_rc_int ) regs(code[3]).v.s = regs(code[1]).v.s * cpooli(code[2]) .s; code += 4;
CASE(mul_rb_int ) regs(code[3]).v.s = regs(code[1]).v.s * code[2] ; code += 4;
CASE(mul_rr_uint ) regs(code[3]).v.u = regs(code[1]).v.u * regs (code[2]).v.u; code += 4;
CASE(mul_rc_uint ) regs(code[3]).v.u = regs(code[1]).v.u * cpooli(code[2]) .u; code += 4;
CASE(mul_rb_uint ) regs(code[3]).v.u = regs(code[1]).v.u * ucode[2] ; code += 4;
CASE(mul_rr_float) regs(code[3]).v.f = regs(code[1]).v.f * regs (code[2]).v.f; code += 4;
CASE(mul_rc_float) regs(code[3]).v.f = regs(code[1]).v.f * cpoolf(code[2]) .f; code += 4;
// That's it for the static arithmetic opcodes. Now let's take a look at
// much more complicated flow control: function call and return.
//
// STABVM uses an explicit stack for function call/return. We push
// function arguments on the stack, save the current function state,
// and start up the next function.
//
// This gets very complicated very quickly, so we'll only look at
// the very simplest case first: calling a bytecoded function which
// we know statically takes no arguments, and does no special
// initialization. This isn't a very important case to optimize
// for, but it's the case that *can* go the fastest, so we do,
// just in case it's useful. Furthermore, it lets us look at exactly
// how much overhead an interpreted function call requires. Function
// call/return is important for code that's less "big scripts" and more
// "traditional code".
//
// Other function calls will do more and be slower, but this establishes
// our baseline for function call overhead.
//
// We need to define some local variables, and we want the scope of
// these to be explicit for the optimizer, so we'll start up a {} block.
CASE(call_fc_bcode_a_none) // 0=opcode, 1=func, 2=retval reg
{
// the first field of the instruction is the pointer to the function
// to call, so we load that up now.
svmfunc *newfunc = (svmfunc *) cpoolp(code[1]).ptr;
// In PTRCODE mode, this is one CPU instruction:
// load reg_temp,4(reg_code);
// in other modes, it's indirected through cpool, so 2-3 instructions.
// the new "execution context" (stack frame) we'll run in is located
// wherever the current top of stack is.
svmstackframe *newframe = (svmstackframe *) stack;
// If stack is in a CPU register, this is free.
// we do our stack computations in a temporary variable to
// encourage the compiler to register allocate it.
void *newstack;
// when we're done with this function, we'll want to return and
// continue executing from this same spot. we'll do that by reloading
// these values out of what will be the current frame pointer,
// but for now is "newframe". so we need to save everything there.
// note that we never actually access code[2], which stores the
// register where we'll put the function's return value. We use a trick
// to save an instruction or two instead, which we'll see later.
newframe->saved_func = func;
newframe->saved_stackframe = frame;
newframe->saved_instruction_ptr = code + 3;
// This requires 4 instructions (3 stores and one add; a bit worse
// on x86 where the add will stomp a register).
// next we need to compute how much stack space we take up.
// this depends on the size of the stack used by the function
// description, so we store that explicitly in the function.
newstack = (char *) stack + newfunc->stack_frame_size;
// this requires two RISC instructions (we assume stack is
// in a register) or two x86 instructions (load stack, add
// from memory)
// next we need to check for stack overflow. by design, to
// make this test fast, the stack is stored right before the
// thread pointer and grows upwards towards it, so the
// stack overflows if it is past the thread pointer.
if ((char *) newstack > (char *) t) goto stack_overflow;
// this typically requires two instructions
// at this point, we've finished saving everything, and we've
// checked every failure case, so we just need to update our
// locals so we're actually set up to run the new function
code = newfunc->bytecode;
func = newfunc;
frame = newframe;
stack = newstack;
// each of the above lines requires one instruction (the compiler
// can't put newstack in the same register as stack since stack
// is still alive after computing newstack if we goto stack_overflow)
// and we're done. so, not counting the dispatch code, this
// function call (static, no arguments) requires around
// 8 CPU instructions for PTRCODE and 10 CPU instructions
// otherwise, not counting any register spilling on x86.
}
// now let's look at the other side of the equation: we want to return
// from a function as fast as possible.
//
// Again, we'll consider the static case. For a return, static means
// that the type of the value we're returning is already known to be
// the type that the function is defined to be. We don't care about
// what type the *caller* is expecting; the caller is required to do
// any final coercion (this requires an extra instruction in the
// dynamic-to-static case, but the alternative requires us to pay an
// extra test in the static-to-static and dynamic-to-dynamic path, or
// generate multiple versions of the same function for use in different
// contexts). However, since the caller might be calling through
// a dynamic interface, we DO have to write out the correct type.
// To do this, we rely on the compiler to have arranged for
// the chosen register to have the type in it.
// again, we need to declare a local variable
CASE(return_r_static)
{
regnum outreg;
// in this case, we want to get a pointer to the frame we're returning to
svmstackframe *caller_frame = frame->saved_stackframe;
// this requires one CPU instruction
// we restore the saved bytecode pointer, which points to immediately
// *after* the instruction we last executed in the caller.
sbytecode *caller_code = frame->saved_instruction_ptr;
// this requires one CPU instruction
// now we want to write the return value into the caller's frame.
// to know where to write it, we require that *any* caller always
// leaves the bytecode pointer so that the *previous* byte/word/dword
// before the bytecode pointer points to the register where the
// returned result should be written. this is why the bytecode format
// places the returned value slot last in the instruction. if we
// didn't do this, we would have to save a pointer to where to write
// the result, and writing/reading that pointer would be extra overhead.
outreg = caller_code[-1];
*atom_reg_ref(caller_frame->regs, outreg) = regs(code[1]); // copy out type in case caller needs it
// this effectively computes:
// p1 = &caller_frame->regs[caller_code[-1]]
// p2 = &frame->regs[code[1]]
// p1->v = p2->v
// p1->type = p2->type
// Let's look at this in two stages, the first two lines then
// the second two.
//
// Loading p1 and p2 requires computing "reg + constant + memory" (reg
// is 'frame', the constant is the offset of regs[], and memory is 'code[1]').
// However, we can fold the constant into the uses of p1 and p2, so it's
// really just "reg + memory". This requires two CPU instructions EACH on both
// x86 and RISC. (x86 adds from memory in one instruction, but needs
// to copy the source register to another register first.)
//
// Copying the data between the two pointers requires 4 RISC instructions
// (unless you use some wider data type). x86 seems like it would be the
// same.
//
// This brings the total for that line of code to 8 CPU instructions.
// and we restore all the registers describing the execution context
code = caller_code;
stack = (void *) frame;
func = frame->saved_func;
frame = caller_frame;
// this requires 4 CPU instructions
// and we're done, ready to continue running in the old frame context.
// The total was ~14 CPU instructions (more for register spills); this
// is actually more than the void function call was. But more than half of
// this was spent in copying the return value.
// This means it's worth having some *more* specialized bytecodes for
// returns that could be faster. For example, returning a constant
// out of the bytecode stream; not writing the type field, or even
// a void function that doesn't write anything. So we'll implement
// those next.
}
// first let's do void. it's just the same as the above code, but
// without writing anything into the caller's return value.
CASE(return_void_static)
code = frame->saved_instruction_ptr;
stack = (void *) frame;
func = frame->saved_func;
frame = frame->saved_stackframe;
// that's it! it requires only four CPU instructions
// let's try reading a small integer out of the bytestream and not writing
// the type. this can be used to return a small integer, a "small" unsigned
// integer, or a boolean. we could create special 'return true' and
// 'return false' for even "more" performance, but on some CPUs they wouldn't
// even save anything because the 0 and 1 values would still have to be loaded
// or computed into a register somehow. (well, it would save a little memory)
CASE(return_b_static)
{
svmstackframe *caller_frame = frame->saved_stackframe;
int32 retval = code[1];
regnum outreg;
code = frame->saved_instruction_ptr;
outreg = code[-1];
atom_reg_ref(caller_frame->regs, outreg)->v.s = retval;
stack = (void *) frame;
func = frame->saved_func;
frame = caller_frame;
// this is ~9 CPU instructions; we save 2 by not computing the
// register to load from, we save 2 by not reading and writing
// the type, and we save one by reloading 'code' earlier,
// avoiding a register move.
//
// for an explicit comparison
// http://nothings.org/games/if/slux/index.html shows the
// disassembly for "return 0" for an earlier VM of mine
// which uses the same return-value-location trick. It
// instead uses tag bits for type info. It ends up using
// six CPU instructions; one isn't RISC-like, but one appears
// to be redundant. Regardless, this means it's definitely
// possible with different design constraints
// to do better than 9 CPU instructions for this operation,
// so this is a place where using STABVM compromises
// efficiency for semantic flexibility.
}
// for completeness, let's finish off the return cases.
// first we have the equivalent to the first one, but with a constant
// instead of a register. because the cpool() could reference the
// current function OR the current bytecode stream in PTRCODE,
// we have to be careful not to update those before the copy
CASE(return_c_static) {
svmstackframe *caller_frame = frame->saved_stackframe;
sbytecode *caller_code = frame->saved_instruction_ptr;
regnum outreg = caller_code[-1];
*atom_reg_ref(caller_frame->regs, outreg) = cpool(code[1]);
code = caller_code;
stack = (void *) frame;
func = frame->saved_func;
frame = caller_frame;
}
// next, we have the fully generic regular return function. this
// needs to (a) decode from either regs or constants, and (b)
// needs to do a coercion to the return type
CASE(return) {
svmstackframe *caller_frame = frame->saved_stackframe;
sbytecode *caller_code = frame->saved_instruction_ptr;
regnum outreg = caller_code[-1];
stack = (void *) frame;
if (svm_coerce_explicit(t, atom_reg_ref(caller_frame->regs, outreg), func->return_type, pval(code[1])))
goto typecheck;
code = caller_code;
func = frame->saved_func;
frame = caller_frame;
}
// Finally, we need a return that's alloca friendly. any stack frame
// that might contain allocas has an extra pointer which is a linked
// list of allocas from that frame. The VM also keeps a linked list of
// all the stack frames. So the stack walker could walk every stack
// frame and check the frame's list-of-allocas pointer, but that would
// require us to initialize the list-of-allocas pointer on every function
// call. Since we don't want to pay the alloca cost if the user never
// uses the allocas, that cost is unacceptable. So instead, we keep
// a second linked list of the stack frames that have had their
// list-of-alloca pointers initialized. Then on return, we have to
// unlink that pointer.
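// pictorially (illustrative), after frames A and B have both initialized
// their alloca lists and B has alloca'd twice:
//    t->alloca_stackframe_head --> frameB --> frameA --> NULL
//    frameB->alloca_local_chain --> allocY --> allocX --> NULL
// so the return below only has to unlink frameB from the first list.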
CASE(return_alloca) {
svmstackframe *caller_frame = frame->saved_stackframe;
sbytecode *caller_code = frame->saved_instruction_ptr;
regnum outreg = caller_code[-1];
#ifdef STB_VM_ALLOCA_WALKER // if this isn't defined, it's same as CASE(return); should share the code
t->alloca_stackframe_head = frame->alloca_next_stackframe; // this is the only extra operation
#endif
stack = (void *) frame;
if (svm_coerce_explicit(t, atom_reg_ref(caller_frame->regs, outreg), func->return_type, pval(code[1])))
goto typecheck;
code = caller_code;
func = frame->saved_func;
frame = caller_frame;
}
// and we're done with all the return variants of every kind.
// let's look at some more function dispatch. We'll implement most of the
// function calls where the function being called is a constant and is implemented
// in bytecode. this subset still comes in multiple flavors, but we'll
// start with one of the most general.
CASE(call_fc_bcode_static)
{ // 0=opcode, 1=func, 2=argc, 3..argc+2=arguments, argc+3=retval,
// the beginning of this is the same as the void call we
// looked at already (OP_call_fc_bcode_a_none):
svmfunc *newfunc = (svmfunc *) cpool(code[1]).v.ptr;
svmstackframe *newframe = (svmstackframe *) stack;
int i,num_arg;
void *newstack;
newframe->saved_func = func;
newframe->saved_stackframe = frame;
// now we need to get the number of arguments
num_arg = code[2];
// the instruction is variable length, depending on the argument count
newframe->saved_instruction_ptr = code + num_arg+4;
// advance the stack and range-check it
newstack = (char *) stack + newfunc->stack_frame_size;
if ((char *) newstack > (char *) t)
goto stack_overflow;
// with varargs, the argument array could be longer than num_regs, so we need to check for varargs overflowing
if ((char *) &newframe->regs[num_arg] > (char *) t)
goto stack_overflow;
// copy the function arguments; this version assumes no coercions are needed.
// even so, performance is kind of sucky, because val(k) requires a
// branch, and we hit the *same* data-dependent branch every time
// through the loop, so branch prediction can't work. so later we'll
// specialize this with things that minimize the decoding work
for (i=0; i < num_arg; ++i)
newframe->regs[i] = val(code[3+i]);
// now set everything up to be in the new context
code = newfunc->bytecode;
func = newfunc;
frame = newframe;
stack = newstack;
// now handle the function having special initialization
// we do this by branching to shared code that does the initialization,
// since it will be the same in every function.
if (func->has_initialization) {
// we store the argument count in the thread context so we don't
// have to store it in a function-wide variable where
// it might trigger save/restores since the compiler might
// not figure out that it doesn't flow across the
// indirect branches
t->context->temp = num_arg;
goto func_initializer;
}
NEXT();
}
// this is shared code that is used for all bytecode function prologues
// that request initialization
func_initializer: {
// we copy the argument count back out of the thread pointer
int num_arguments_provided_to_call = t->context->temp;
// first thing we do is, if we support stackwalking things allocated
// by alloca, then we check if this function has any of those, and if
// so we add it to the main chain. additionally, we start a "local"
// chain of just the allocations for this stack frame.
#ifdef STB_VM_ALLOCA_WALKER
if (func->has_walkable_alloca) {
// initialize the local chain
frame->alloca_local_chain = 0;
// update the global chain -- we could actually do this lazily
// the first time we add anything to the local chain, but this
// is simpler to code and it avoids extra branches to test
frame->alloca_next_stackframe = t->alloca_stackframe_head;
t->alloca_stackframe_head = frame;
}
#endif
// next step is to handle variable-argument-count functions. we provide
// a system which supports two important behaviors: you can ask for a
// full args list by setting func->arg_array_start to 0, or
// you can ask for a list starting after some known number of
// elements by setting it to that number of elements; typically
// this would be the start point of a '...' list. You can't ask
// for both, since only one array can be generated. (If your language
// allows both, you could generate code to dynamically synthesize the
// varargs list from the full list as the first instructions of the function).
// Also, this doesn't preclude still accessing the elements
// from registers (i.e. as named variables). Set arg_array_start to 0x7fffffff
// to not create an args array. Set func->max_args to 0x7fffffff to allow an
// arbitrary length varargs input.
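// for example (illustrative settings): a function f(a, b, ...rest)
// would set arg_array_start to 2, so a call f(1,2,3,4) leaves 'rest'
// as a 2-element array; a function wanting a full 'args' list sets it
// to 0; a plain fixed-arity function sets it to 0x7fffffff so no array
// is ever built.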
if (num_arguments_provided_to_call > func->arg_array_start) {
// copy excess arguments into alloca'd array; this can either be an 'args' array or a 'varargs' array
svmarray *a;
int array_len = num_arguments_provided_to_call - func->arg_array_start;
int array_bytes = sizeof(*a) + sizeof(a->data[0]) * (array_len - 1);
a = (svmarray *) stack;
if ((char *) stack + array_bytes < (char *) stack)
goto stack_overflow;
stack = (char *) stack + array_bytes;
if ((char *) stack > (char *) t)
goto stack_overflow;
// now, the arguments are currently on the stack. if there were
// more arguments than the function has registers, it wrote off
// the end, *into* the extra space, where we just allocated this
// array. so before we write anything else into the array structure,
// we have to copy the data itself, and we have to use memmove()
// since it might be self-overlapping
// @TODO: explicitly code loop that goes in right direction, to remove dependency on memmove
svm__memmove(a->data, &frame->regs[func->arg_array_start], sizeof(svmatom) * array_len);
// now that we've copied the arguments, we can fill out the array header
a->count = a->limit = array_len;
// now, we add this object to the chain of local walkable-allocas,
// so the stack walker can find it
#ifdef STB_VM_ALLOCA_WALKER
svm__assert(func->has_walkable_alloca);
a->alloca_chain = frame->alloca_local_chain;
frame->alloca_local_chain = (void *) a;
// and we set its type field to NULL, which is the stack-walker flag that it's an array not a structure
a->type = NULL;
#endif
}
// next we have a single system used for three purposes:
// - providing default parameter values
// - initializing variables to constants
// - initializing static types for registers for static-to-dynamic conversions
// the way this works is that the variables/registers that need initialization
// should be low-numbered registers, just after the arguments. parameters needing
// initialization are always last. So the initialization just needs to initialize
// a contiguous block of memory -- any omitted parameters with defaults, and then
// some number of low-numbered registers immediately after the parameters.
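// as an illustrative example (hypothetical source function): for
// "f(a, b = 10) { var c = 0; ... }" the layout is regs[0]=a, regs[1]=b,
// regs[2]=c, with earliest_init_reg = 1 (b's default),
// first_mandatory_init_reg = 2 (c always needs its initializer), and
// init_reg_data holding the atoms for 10 and 0. A two-argument call
// then copies only c's initializer; a one-argument call copies both.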
if (func->init_reg_size_in_bytes) {
// the initialization uses a memcpy; that copy needs three things:
// - the place to write, represented as a register count within the frame
// - the amount of data to copy, represented in bytes
// - the place to copy the data from, in bytes
// We start by loading three values describing the *full* data to copy;
// in the absence of argument initializers we'll actually copy all of it,
// and likewise if *all* default initializers are required.
int start = func->earliest_init_reg;
int size = func->init_reg_size_in_bytes;
svmatom *data = func->init_reg_data;
// Next we need to see if we have initializers that we *shouldn't* set.
// This happens iff there are argument initializers and more arguments
// were passed than the minimum number possible. However, because we ALSO
// allow varargs, in which case arbitrarily many more arguments than the
// declared ones can be passed in while we still need to initialize all
// the real registers, the logic here is more complicated. First, we only
// even have to think about testing this if the arguments passed in extend
// past the beginning of the region to initialize. It is precisely to make
// this test fast that we chose to encode earliest_init_reg in registers,
// not bytes.
if (num_arguments_provided_to_call > start) {
// tentatively, we assume we start at the first "real" reg for
// the function (after the "real" function arguments)
start = func->first_mandatory_init_reg;
// but if we had fewer arguments than that, then we need to initialize
// the missing arguments
if (start > num_arguments_provided_to_call)
start = num_arguments_provided_to_call;
// now we've computed an updated location to copy to. next we need
// to update where to copy from, and how much to copy. Both of these
// will update by the same number of bytes, so we lift the common
// subexpression out explicitly to make sure the compiler doesn't
// miss it.
{
int offset = (start - func->earliest_init_reg) * sizeof(svmatom);
size -= offset;
data = (svmatom *) ((char *) data + offset);
}
}
// now we've updated all the pointers and sizes and we can just initialize
// the memory
svm__memcpy(&frame->regs[start], data, size);
}
// and we're done with the function prologue, and are ready to run
// the first instruction in the function
}
// now let's look at a version of the function call opcode that does do coercions.
// of course if the function is a constant, we know exactly what the
// coercions we need to do are at compile time. so ideally we'd only convert
// the ones that need it. however, we have to loop through the N arguments
// constructing the stack no matter what, and there's not much we can do
// that's faster than testing whether we do in fact need to convert.
//
// so this code is basically identical to call_fc_bcode_static, except it does coercions
CASE(call_fc_bcode) { // 0=opcode, 1=func, 2=argc, 3..argc+2=arguments, argc+3=retval,
// everything is the same as before up to when we copy the data
svmfunc *newfunc = (svmfunc *) cpool(code[1]).v.ptr;
svmstackframe *newframe = (svmstackframe *) stack;
int i,num_arg;
void *newstack;
newframe->saved_func = func;
newframe->saved_stackframe = frame;
num_arg = code[2];
newframe->saved_instruction_ptr = code + num_arg+4;
newstack = (char *) stack + newfunc->stack_frame_size;
if ((char *) newstack > (char *) t)
goto stack_overflow;
if ((char *) &newframe->regs[num_arg] > (char *) t) // varargs array could be longer than num_regs
goto stack_overflow;
// now we copy the data using svm_coerce, which will do the conversions
// for us. but, if we're calling a varargs function, we might have more
// arguments than it has types, so we have to test for that
if (num_arg <= newfunc->max_arg_types)
for (i=0; i < num_arg; ++i)
svm_coerce(t, &newframe->regs[i], newfunc->type_constants[i], pval(code[i+3]));
else {
// if we have more args than it has types, then we only coerce those with types
int num_types = newfunc->max_arg_types;
for (i=0; i < num_types; ++i)
svm_coerce(t, &newframe->regs[i], newfunc->type_constants[i], pval(code[i+3]));
// and we copy the rest uncoerced
for ( ; i < num_arg; ++i)
newframe->regs[i] = val(code[i+3]);
}
// and the rest of the code is unchanged
code = newfunc->bytecode;
func = newfunc;
frame = newframe;
stack = newstack;
if (func->has_initialization) {
t->context->temp = num_arg;
goto func_initializer;
}
}
// The last opcode showed a truly fully-general implementation of
// a call which we know is going to a bytecode-implemented function.
// But there are also native-coded functions. Let's look at those.
// The first case to consider is a native function which is statically
// known, but where the native function is designed for interfacing
// with STABVM by doing any coercions internally: it receives a C array
// of svmatoms and parses the arguments itself. We don't have to do
// any type coercion (it'll do it if it needs it). So we mostly just
// have to build the array and go.
CASE(call_fc_ndyn)
{ // 0=opcode, 1=func, 2=argc, 3..argc+2=arguments, argc+3=retval,
// get the C function pointer, and figure out how many arguments we'll need
native_func_dynamicargs *funcptr = (native_func_dynamicargs *) cpool(code[1]).v.ptr;
int num_arg = code[2], i;
// we get an alias for the stack that we use to build the argument list
svmatom *arglist = (svmatom *) stack;
// we're going to store the arguments on the stack, so check
// that there's room
if ((char *) &arglist[num_arg] > (char *) t)
goto stack_overflow;
// copy the arguments onto the stack; we rely on the callee to
// coerce (or to statically know the types, as appropriate)
for (i=0; i < num_arg; ++i)
arglist[i] = val(code[3+i]);
// now we have everything the function call will need. But we
// have a few more things to do.
// First, we want to allow the function to recurse into the
// interpreter. (Imagine, for example, that we're calling a sort
// routine, and it wants to call back to a comparison function.)
// To do that, we just need to save the stack depth we've used.
t->saved_stack_top = arglist + num_arg;
// Next, when we call the function, the C compiler may save and
// restore our registers (depending on the caller-save situation).
// If it's going to save and restore them anyway, then there's no
// extra overhead for us to save and restore them (if the compiler
// is smart enough to notice). so we'll save and restore the bytecode
// pointer (but nothing else). this could waste a *little* bit
// on machines with good callee-save support. but it saves
// a conditional branch, as we'll see in a moment.
t->saved_instruction_ptr = code + num_arg+4;
// now we call the C-native function
funcptr(t, &frame->regs[code[3+num_arg]], num_arg, arglist);
// now we restore the bytecode pointer and go back to running.
// except this is actually a trick. we load the value from
// t->saved_instruction_ptr which we just saved above, so
// *normally* this just restores what we saved. However,
// the function we call can overwrite that value.
code = t->saved_instruction_ptr;
// Now we'll dispatch to the next opcode pointed to by 'code'.
// if it was the value we saved ourselves, execution continues
// normally. However, if it points somewhere else, something
// else happens. In particular, if the function we just called
// calls
//
// svm_from_nativefunc_defer_exception(svmthread *t, svmatom exception_object);
//
// with this thread's handle, that function will overwrite
// t->saved_instruction_ptr with the pointer to a special
// instruction sequence that throws the exception. (It'll
// copy the old t->saved_instruction_ptr somewhere else,
// so that we can restore the interpreter state properly.)
//
// The point of this is it allows the function we call to
// control the behavior of the interpreter at this point,
// without us checking a return value from the native function.
// On x86 this may save a few instructions; on others, it
// probably costs about the same number of instructions
// but avoids using another branch prediction slot, etc.
//
// Because that behavior is not a path that needs to be high
// performance, and to avoid overloading the bytecode namespace
// in BYTECODE mode, the special instruction sequence doesn't even
// get a custom opcode for this behavior; instead we use the regular
// SVMOP_break bytecode, and make the SVMOP_break implementation check
// if it's in the special bytecode.
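// a native function would use that hook roughly like this (sketch; the
// body and the error-atom constructor are hypothetical, only
// svm_from_nativefunc_defer_exception is from this excerpt):
//    static void my_native(svmthread *t, svmatom *ret, int argc, svmatom *args)
//    {
//        if (argc < 1) {
//            svm_from_nativefunc_defer_exception(t, make_range_error());
//            return; // interpreter will now dispatch into the exception path
//        }
//        // ... normal work, writing the result through 'ret' ...
//    }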
}
// there's another, different calling convention for C code. in
// this case, it takes a single structure pointer, and the fields
// in it do not have types. they're still all svmvalues; their
// layout is identical to the svmstructs that we haven't yet talked
// about (and in fact you can use "apply" to call a C function
// with a pre-baked struct).
//
// Again, we'll look at a static one. With this calling convention,
// it's our job to do coercions, but we'll start with a fully static
// one that doesn't have to do coercions.
CASE(call_fc_nstat_static)
{ // 0=opcode, 1=func, 2=argc, 3..argc+2=arguments, argc+3=retval,
// this proceeds as before
native_func_staticargs *funcptr = (native_func_staticargs *) cpool(code[1]).v.ptr;
int num_arg = code[2], i;
// now the arglist only has values, not types
svmvalue *arglist = (svmvalue *) stack;
// we're going to store the arguments on the stack, so check
// that there's room
if ((char *) &arglist[num_arg] > (char *) t)
goto stack_overflow;
// copy the arguments into the arglist
for (i=0; i < num_arg; ++i)
arglist[i] = val(code[3+i]).v;
// we use the same trick again
t->saved_instruction_ptr = code + num_arg+4;
funcptr(t, &frame->regs[code[3+num_arg]], arglist);
code = t->saved_instruction_ptr;
}
// now that we've seen all the pieces, let's look at one of the most complicated
// opcodes: SVMOP_call in all its full-on glory. the complexity of SVMOP_call here is
// something you absolutely want to avoid for performance, which is why we have
// all the special cases (and more to come). Generally, SVMOP_call should
// only be used when the internal type of the function isn't known in advance.
CASE(call) {
// the first thing we have to do is get at the function and see what it is
svmatom *newfuncatom = pval(code[1]);
// there are four cases: closure, bytecode function, native function,
// or something else. for maximum generality, if it's something else,
// we could try coercing it to a function. but no! that way lies madness,
// so we'll just throw an error.
// Closures are special because they just set a temp variable, replace
// newfuncatom, then run the regular dispatch. So the natural way to
// code this would be to do this right here:
// if (closure) {
// t->context->something = closure->something;
// newfuncatom = closure->something;
// }
// and then it falls through to the regular code that deals
// with newfuncatom when it's not a closure.
//
// However, since closures have more overhead anyway and are likely
// to be rare, we don't want to pay the cost of testing for a closure
// on every regular call. So we'll make the closure path least efficient
// by testing for closure *last. If we find one, we want it to behave
// the same as the above code, so we'll issue a *goto* that jumps
// back here, as if it had just executed the above if.
//
// An alternative way to avoid paying the 'if (closure)' cost above
// would be to always have an environment and just always copy it to
// the temp without testing. However, we'd pay 3-4 instructions on
// copying the environment, and we'd have to indirect the function
// one deeper.
call_got_closure:
if (newfuncatom->type.code == SVMT_bytecode_func) {
// now we just have the normal bytecode call; this is pretty long,
// but it's identical to OP_call_fc_bcode except for where we load the func from
svmfunc *newfunc = (svmfunc *) newfuncatom->v.ptr;
svmstackframe *newframe = (svmstackframe *) stack;
int i,num_arg;
void *newstack;
newframe->saved_func = func;
newframe->saved_stackframe = frame;
num_arg = code[2];
newframe->saved_instruction_ptr = code + num_arg+4;
newstack = (char *) stack + newfunc->stack_frame_size;
if (num_arg < newfunc->min_args || num_arg > newfunc->max_args)
goto typecheck;
if ((char *) newstack > (char *) t)
goto stack_overflow;
if ((char *) &newframe->regs[num_arg] > (char *) t)
goto stack_overflow;
if (num_arg <= newfunc->max_arg_types)
for (i=0; i < num_arg; ++i)
svm_coerce(t, &newframe->regs[i], newfunc->type_constants[i], pval(code[i+3]));
else {
int num_types = newfunc->max_arg_types;
for (i=0; i < num_types; ++i)
svm_coerce(t, &newframe->regs[i], newfunc->type_constants[i], pval(code[i+3]));
for ( ; i < num_arg; ++i)
newframe->regs[i] = val(code[i+3]);
}
code = newfunc->bytecode;
func = newfunc;
frame = newframe;
stack = newstack;
if (func->has_initialization) {
t->context->temp = num_arg;
goto func_initializer;
}
} else if (newfuncatom->type.code == SVMT_native_func) {
// now we have a native function call. we have to identify which kind of native function it is.
svmnativefunc *newfunc = (svmnativefunc *) newfuncatom->v.ptr;
if (newfunc->is_statictyped) {
// this is a statically-known arguments call, so the code below is similar
// to OP_call_fc_nstat except for where we load the func from,
// and we have to do coercions while packing the arglist
svmfunctype *type = newfunc->type;
native_func_staticargs *funcptr = newfunc->func.stat;
int num_arg = code[2], i;
svmvalue *arglist = (svmvalue *) stack;
// with packed arguments like this, we require the number of arguments
// be a single specific value, so instead of range-checking num_arg,
// we require it match exactly;
//
// in other words we assume type->max_args == type->num_args
if (num_arg != type->max_args)
goto typecheck;
if ((char *) &arglist[num_arg] > (char *) t)
goto stack_overflow;
for (i=0; i < num_arg; ++i)
svm_coerce_static(t, &arglist[i], type->arg_types[i], pval(code[3+i]));
if (t->context->has_exception)
goto typecheck;
t->saved_instruction_ptr = code + num_arg+4;
funcptr(t, &frame->regs[code[3+num_arg]], arglist);
code = t->saved_instruction_ptr;
} else {
// this is a call where the callee decodes the arguments itself,
// so the code below is identical to OP_call_fc_ndyn except
// where we load the func from
native_func_dynamicargs *funcptr = newfunc->func.dyn;
int num_arg = code[2], i;
svmatom *arglist = (svmatom *) stack;
if (num_arg < newfunc->type->min_args || num_arg > newfunc->type->max_args)
goto typecheck;
if ((char *) &arglist[num_arg] > (char *) t)
goto stack_overflow;
for (i=0; i < num_arg; ++i)
arglist[i] = val(code[3+i]);
t->saved_instruction_ptr = code + num_arg+4;
funcptr(t, &frame->regs[code[3+num_arg]], num_arg, arglist);
code = t->saved_instruction_ptr;
}
} else if (newfuncatom->type.code == SVMT_closure) {
svmclosure *c = (svmclosure *) newfuncatom->v.ptr;
t->context->environment = c->environment;
newfuncatom = &c->function;
goto call_got_closure;
} else
goto typecheck;
}
// And we have an even crazier case: 'apply', which accepts an svmarray or
// svmstruct for the arguments.
CASE(apply)
{
int is_array = 0;
svmatom *arguments = pval(code[2]);
svmatom *newfuncatom = pval(code[1]);
// check the argument type, and check for fast cases while we're in there
if (TYPE_IS_ARRAY(arguments->type)) {
if (newfuncatom->type.code == SVMT_native_func) {
svmnativefunc *newfunc = (svmnativefunc *) newfuncatom->v.ptr;
// if we have a dynamic-typed native function, then we just pass the array
if (!newfunc->is_statictyped) {
svmarray *a = arguments->v.array_ptr;
int num_arg = a->count;
if (num_arg < newfunc->type->min_args || num_arg > newfunc->type->max_args)
goto typecheck;
t->saved_instruction_ptr = code+4;
(newfunc->func.dyn)(t, &frame->regs[code[3]], num_arg, a->data);
code = t->saved_instruction_ptr;
NEXT();
}
}
is_array = 1;
} else if (TYPE_IS_POINTER_TO_TYPE(arguments->type) && TYPE_IS_STRUCT(arguments->type)) {
if (newfuncatom->type.code == SVMT_native_func) {
svmnativefunc *newfunc = (svmnativefunc *) newfuncatom->v.ptr;
if (newfunc->is_statictyped && newfunc->type->struct_type == arguments->type.ptr) {
t->saved_instruction_ptr = code + 4;
newfunc->func.stat(t, &frame->regs[code[3]], arguments->v.struct_ptr->data);
code = t->saved_instruction_ptr;
NEXT();
}
}
is_array = 0;
} else {
goto typecheck;
}
// now we have the non-fast path. first, we handle a closure,
// rather than doing that "late" the way we did for SVMOP_call.
if (newfuncatom->type.code == SVMT_closure) {
svmclosure *c = (svmclosure *) newfuncatom->v.ptr;
t->context->environment = c->environment;
newfuncatom = &c->function;
}
// next, we'll copy the arguments onto the thread stack, so that we
// can ignore the distinction between the arguments being in
// an array or being in a structure.
{
int num_arg;
svmatom *arglist;
// To do that, we need to know where they go on the stack,
// which depends on the type of function it is.
if (newfuncatom->type.code == SVMT_bytecode_func) {
svmstackframe *newframe = (svmstackframe *) stack;
arglist = &newframe->regs[0];
} else {
arglist = (svmatom *) stack;
}
if (is_array) {
int i;
svmarray *a = arguments->v.array_ptr;
num_arg = a->count;
if (arglist + num_arg > (svmatom *) t)
goto stack_overflow;
for (i=0; i < num_arg; ++i)
arglist[i] = a->data[i];
} else {
int i;
svmstructtype *type = GET_STRUCT_TYPE(arguments->type);
num_arg = type->num_fields;
if (arglist + num_arg > (svmatom *) t)
goto stack_overflow;
for (i=0; i < num_arg; ++i) {
arglist[i].v = arguments->v.struct_ptr->data[i];
arglist[i].type = type->field[i];
}
}
if (newfuncatom->type.code == SVMT_bytecode_func) {
svmfunc *newfunc = (svmfunc *) newfuncatom->v.ptr;
svmstackframe *newframe = (svmstackframe *) stack;
int i;
void *newstack;
newframe->saved_func = func;
newframe->saved_stackframe = frame;
newframe->saved_instruction_ptr = code + 4;
newstack = (char *) stack + newfunc->stack_frame_size;
if (num_arg < newfunc->min_args || num_arg > newfunc->max_args)
goto typecheck;
if ((char *) newstack > (char *) t)
goto stack_overflow;
if ((char *) &newframe->regs[num_arg] > (char *) t)
goto stack_overflow;
svm__assert(arglist == newframe->regs);
if (num_arg <= newfunc->max_arg_types)
for (i=0; i < num_arg; ++i)
svm_coerce_inplace(t, &newframe->regs[i], newfunc->type_constants[i]);
else {
int num_types = newfunc->max_arg_types;
for (i=0; i < num_types; ++i)
svm_coerce_inplace(t, &newframe->regs[i], newfunc->type_constants[i]);
}
code = newfunc->bytecode;
func = newfunc;
frame = newframe;
stack = newstack;
if (func->has_initialization) {
t->context->temp = num_arg;
goto func_initializer;
}
} else if (newfuncatom->type.code == SVMT_native_func) {
svmnativefunc *newfunc = (svmnativefunc *) newfuncatom->v.ptr;
if (newfunc->is_statictyped) {
svmfunctype *type = newfunc->type;
native_func_staticargs *funcptr = newfunc->func.stat;
int i;
svmvalue *final_args = (svmvalue *) stack;
if (num_arg != type->max_args)
goto typecheck;
if ((char *) &arglist[num_arg] > (char *) t)
goto stack_overflow;
// note that final_args and arglist overlap, but in a safe way
for (i=0; i < num_arg; ++i)
svm_coerce_static(t, &final_args[i], type->arg_types[i], &arglist[i]);
if (t->context->has_exception)
goto typecheck;
t->saved_instruction_ptr = code + 4;
funcptr(t, &frame->regs[code[3]], final_args);
code = t->saved_instruction_ptr;
} else {
native_func_dynamicargs *funcptr = newfunc->func.dyn;
svmatom *arglist = (svmatom *) stack;
if (num_arg < newfunc->type->min_args || num_arg > newfunc->type->max_args)
goto typecheck;
t->saved_instruction_ptr = code + 4;
funcptr(t, &frame->regs[code[3]], num_arg, arglist);
code = t->saved_instruction_ptr;
}
} else {
goto typecheck;
}
}
}
// Now let's make a few call variants that are optimized for when the function to
// call is in a register, but the types of everything are statically known,
// and we know there's no varargs and such. We're going to assume that languages
// allow free intermixing of bytecode and native coded functions, so they can't
// know whether a register contains one or the other. So we're not going to
// bother producing specialized cases that limit to one or the other, and
// because this general case has to handle all three, it's pretty long, so
// we're only going to have a couple of specialized ones. We're also going
// to assume that this means that while we can tell whether a
// function might use varargs or not (this is part of the public signature),
// we don't know whether it might use alloca or might even build an 'args' array
// (this is not part of the public function signature). Thus we still have to
// do all the function initialization for those cases.
//
// The first case we have is a function call with no arguments, which is
// much simpler since all the argument processing is gone.
CASE(call_fr_a_static)
{
// this proceeds largely the same as SVMOP_call, except we skip all the
// argument processing.
svmatom *newfuncatom = &regs(code[1]);
if (newfuncatom->type.code == SVMT_bytecode_func) {
svmfunc *newfunc = (svmfunc *) newfuncatom->v.ptr;
svmstackframe *newframe = (svmstackframe *) stack;
void *newstack;
newframe->saved_func = func;
newframe->saved_stackframe = frame;
newframe->saved_instruction_ptr = code + 3;
newstack = (char *) stack + newfunc->stack_frame_size;
if ((char *) newstack > (char *) t)
goto stack_overflow;
code = newfunc->bytecode;
func = newfunc;
frame = newframe;
stack = newstack;
if (func->has_initialization) {
t->context->temp = 0;
goto func_initializer;
}
} else if (newfuncatom->type.code == SVMT_native_func) {
svmnativefunc *newfunc = (svmnativefunc *) newfuncatom->v.ptr;
t->saved_instruction_ptr = code + 3;
if (newfunc->is_statictyped)
newfunc->func.stat(t, &frame->regs[code[2]], SVM__NULL);
else
newfunc->func.dyn(t, &frame->regs[code[2]], 0, SVM__NULL);
code = t->saved_instruction_ptr;
} else
goto typecheck;
}