stb_vm excerpts
#define atom_array_ref_preshifted(arr,offset) ((svmatom *) (((uint8 *) (arr)) + (offset))) | |
#define atom_array_ref_naive(arr,offset) (&(arr)[offset]) | |
#ifdef STB_VM_CONFIG_PRESHIFTED_BYTESTREAM_CPOOL | |
#define atom_cpool_ref atom_array_ref_preshifted | |
#else | |
#define atom_cpool_ref atom_array_ref_naive | |
#endif | |
#ifdef STB_VM_CONFIG_PRESHIFTED_BYTESTREAM_REG | |
#define atom_reg_ref atom_array_ref_preshifted | |
#else | |
#define atom_reg_ref atom_array_ref_naive | |
#endif | |
#ifdef STB_VM_CONFIG_PRESHIFTED_STRUCT_OFFSETS
#define struct_access(str,off) ((svmvalue *) (((uint8 *) (str)) + (off)))
#else | |
#define struct_access(str,off) (str)[off] | |
#endif | |
#ifdef STB_VM_CONFIG_HOLD_REGISTERS_IN_SMALL_INT | |
typedef ubytecode regnum; | |
#else | |
typedef int regnum; | |
#endif | |
#ifdef STB_VM_CONFIG_PTRCODE
#define STB_VM_CONFIG_USE_LABEL_ADDRESSES | |
#endif | |
// svm_interpret takes as input a just-resumed thread. this | |
// is either the result of a thread that has had a new | |
// function set up using 'thread_prepare_function', or is | |
// a thread that previously yielded, and is now being resumed. | |
// if the thread previously yielded and that yield call itself | |
// returns a value back to the thread, the caller must have | |
// already prepared this. | |
svmatom * svm_interpret(svmthread *input_t, svmcontext *input_c) | |
{ | |
// with gcc, we can make the interpreter faster by storing | |
// the addresses of all the instructions and branching directly | |
// to them. so we make that list now. | |
#ifdef STB_VM_CONFIG_USE_LABEL_ADDRESSES | |
static void *opcode_targets_list[SVMOP__total_bytecodes] = | |
{ | |
#define OP(x,y) &&LABEL_##x, | |
OPCODES() | |
#undef OP | |
}; | |
register void **opcode_targets = opcode_targets_list; | |
#endif | |
// we're going to try to run as much code as possible inside | |
// this one function. as long as we're in the function, we don't | |
// have to save/restore registers or anything. to make this explicit | |
// on machines with small register pools (like x86), we'll ask for | |
// our most important locals in registers (though most compilers | |
// will ignore this) | |
register svmthread *t = input_t; | |
register sbytecode *code; | |
register svmstackframe *frame; | |
register svmfunc *func; | |
// the stack pointer is mostly only manipulated on function call/return, | |
// so it's the lowest priority for being a register | |
void *stack; | |
// for a threaded interpreter, we want to store the actual | |
// label addresses in the "bytecode". to do so, the routine | |
// that does the storing needs to be able to get
// at the addresses, but they're only accessible inside the | |
// function. so we provide a special path that returns them. | |
#ifdef STB_VM_CONFIG_PTRCODE | |
if (t == NULL) | |
return (svmatom *) opcode_targets; | |
#endif | |
// save the context in the thread so we don't have to pass it around separately | |
t->context = input_c; | |
// note that the function parameters are now dead--they're never accessed again. | |
// just to be on the safe side we'll tell the compiler that as best we can | |
input_t = NULL; | |
input_c = NULL; | |
// now we're ready to begin interpreting. the first thing we do | |
// is load up our registers with the full execution state | |
code = t->saved_instruction_ptr; | |
frame = t->saved_frame; | |
func = t->saved_func; | |
stack = t->saved_stack_top; | |
// now we're ready to start dispatching bytecodes. to do this, | |
// we use two different methods. for portability, we use code | |
// that looks like "for(;;) switch(bytecode) { }". with gcc, | |
// we use code that looks like "goto *opcode_table[bytecode]", and | |
// we put this branch at the *end* of every opcode's implementation. | |
// if TWOBYTECODE is enabled, then the stored bytecodes are *byte* | |
// offsets into the jumptable, instead of indices; this lets | |
// us avoid a shift on the powerpc. If PTRCODE is enabled, | |
// then we store the labels directly in the code, and don't | |
// even have to look in opcode_targets table. | |
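// To make that concrete (a hypothetical illustration, not actual stb_vm
// output): for the opcode SVMOP_nop, the value stored in the instruction
// stream would be
//    BYTECODE:     the small integer SVMOP_nop
//    TWOBYTECODE:  SVMOP_nop * sizeof(void *)   (a preshifted byte offset)
//    PTRCODE:      the label address &&LABEL_nop itself
// and the corresponding dispatch is an indexed load plus an indirect branch,
// an offset load plus an indirect branch, or just an indirect branch.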
// we want a convenient shortcut that lets us refer to the | |
// data in the instruction stream as unsigned instead of signed. | |
#define ucode ((ubytecode *) code) | |
#ifndef STB_VM_CONFIG_USE_LABEL_ADDRESSES | |
// for portable code, we use a switch() to dispatch, | |
// and it's inside a for(;;) loop. | |
#define START_CASES \ | |
for (;;) \ | |
switch(*ucode) | |
// NEXT() then breaks out of the switch(), causing it to loop around the | |
// for(;;) to hit the switch() again. CASE() ends the previous opcode | |
// with NEXT(), then defines a normal switch() case... note that NEXT() | |
// won't work correctly inside a loop! | |
#define NEXT() break | |
#define CASE_RAW(x) case SVMOP_##x: | |
#ifdef STB_VM_CONFIG_PTRCODE
#error not-reached | |
#endif | |
#else | |
// using gcc label addresses, we don't need a loop since we always | |
// branch to next case, and we don't need to do the initial bytecode | |
// invocation because the first CASE() will generate a NEXT() | |
// automatically | |
#define START_CASES | |
// with gcc label addresses, NEXT explicitly jumps to the next | |
// instruction; CASE ends the previous opcode by calling NEXT | |
// and then declares the label for this opcode | |
#if defined(STB_VM_CONFIG_PTRCODE) | |
// if PTRCODE, then the void * is stored explicitly in the code | |
#define OPCODE_TARGET(n) ((void*) (n)) | |
#elif defined(STB_VM_CONFIG_BYTECODE) | |
// if bytecode, we look it up in the table | |
#define OPCODE_TARGET(n) opcode_targets[n] | |
#else | |
// if two-byte code, we assume it's been preshifted, so it's a byte offset | |
#define OPCODE_TARGET(n) *((void **) (((char *) opcode_targets) + (n))) | |
#endif | |
#define NEXT() goto *OPCODE_TARGET(*ucode) | |
#define CASE_RAW(x) LABEL_##x: | |
#endif | |
#define CASE(y) NEXT(); CASE_RAW(y) | |
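// As an example of how these macros compose (a sketch of the expansion):
// "CASE(nop) code += 1;" expands in the portable build to
//    break; case SVMOP_nop: code += 1;
// and in the label-address build to
//    goto *OPCODE_TARGET(*ucode); LABEL_nop: code += 1;
// i.e. every opcode body is terminated by the dispatch of the *next* opcode.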
START_CASES { | |
// next we define a bunch of macros that will make the source | |
// code smaller and easier to follow. | |
// the constants used in a given function are stored in memory | |
// immediately before the function, so to index them, we cast | |
// the function pointer to the right type, and index that. the | |
// indexing uses the previously defined macro that lets us | |
// precompute a shifted index to save an instruction on non-x86 | |
#define cpool_ptr ((svmatom *) func) | |
#define cpool(n) (*atom_cpool_ref(cpool_ptr, n)) | |
// if an instruction contains a constant index which cannot be | |
// a register, and we use PTRCODE, then the constant index can | |
// be a direct pointer to the constant. | |
#ifdef STB_VM_CONFIG_PTRCODE | |
#define cpool_notreg(n) *((svmatom *) (n)) | |
#else | |
#define cpool_notreg(n) cpool(n) | |
#endif | |
#ifdef STB_VM_CONFIG_BYTECODE | |
#define cpoolw(n) ((svm_uintptr) cpool(n).v.u) | |
#else | |
#define cpoolw(n) ((svm_uintptr) ((-(n)-1))) // -1 => 0, -2 => 1, -3 => 2
#endif | |
// this should be true iff ints and pointers are the same size, and PTRCODE | |
#ifdef STB_VM_CONFIG_INLINE_INT_CONSTANTS | |
// read the value directly from the instruction stream | |
#define cpooli(code_lvalue) (*(svmvalue *) &(code_lvalue)) | |
#else | |
// read the value the usual way | |
#define cpooli(code_lvalue) (cpool_notreg(code_lvalue).v) | |
#endif | |
// this should be true iff floats and pointers are the same size, and PTRCODE | |
#ifdef STB_VM_CONFIG_INLINE_FLOAT_CONSTANTS | |
// read the value directly from the instruction stream | |
#define cpoolf(code_lvalue) (*(svmvalue *) &(code_lvalue)) | |
#else | |
// read the value the usual way | |
#define cpoolf(code_lvalue) (cpool_notreg(code_lvalue).v) | |
#endif | |
#ifdef STB_VM_CONFIG_PTRCODE | |
// read the value directly from the instruction stream | |
#define cpoolp(code_lvalue) (*(svmvalue *) &(code_lvalue)) | |
#else | |
// read the value the usual way | |
#define cpoolp(code_lvalue) (cpool(code_lvalue).v) | |
#endif | |
// registers are stored immediately after the "main" stack frame | |
// data. pretty much all processors have addressing modes with | |
// fixed offsets, so we don't bother trying to optimize the frame | |
// pointer to point exactly to the registers (you could imagine | |
// an instruction set where addressing of reg+constant is available, | |
// or reg+reg, but not reg+reg+constant, in which case it would be | |
// more efficient to always have the pointer point to the beginning | |
// of the array (the array would be indexed as reg+reg, and the other | |
// elements of the structure would be reg+constant). Since there | |
// don't seem to be any important CPUs along those lines, I don't. | |
#define regs(n) (*atom_reg_ref(frame->regs, (n))) | |
// the most general instructions may read from either registers or | |
// constants, so we make macros that hide the details of that. | |
#define val(n) ((n) < 0 ? cpool(n) : regs(n)) | |
#define pval(n) ((n) < 0 ? &cpool(n) : ®s(n)) | |
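// For example (hypothetical operand values): if code[1] holds 4, then
//    regs(code[2]) = val(code[1]);
// copies register 4 of the current frame; if code[1] holds -1, it instead
// copies the constant atom stored one slot before the function descriptor.
// pval() makes the same selection but yields a pointer, for passing by address.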
// we need to do array length checking. if the length is exceeded, | |
// we branch to a array-range-check failure handler | |
#define acheck(a,i) if (i >= atom_svmarray(a)->count) goto array_rangecheck; | |
#define awritecheck(a,i) if (i >= atom_svmarray(a)->count) if (!expand_array(atom_svmarray(a), i)) goto array_rangecheck; | |
// at this point, we're ready for opcodes | |
// the simplest opcode is a no-op; it does nothing, at least in the virtual | |
// machine semantics. However, in STABVM, every opcode is responsible for | |
// advancing the instruction pointer, which is the variable named "code". | |
// We do this because instructions are different lengths. We could try | |
// to make some code that decodes the opcode and figures out the length | |
// by checking a table, but this would involve extra indirections; it's | |
// much faster just to do the add by the right amount in the opcode | |
// implementation itself (although it uses more codespace). And once we're | |
// doing that, it doesn't make any sense for the dispatcher to try to advance | |
// by *only* the opcode. The work it is doing is redundant to that done in | |
// most opcodes, so it makes more sense to move it into the opcode itself. | |
// | |
// The bytecode might be bytes or it might actually be words, but whichever | |
// it is, the vast majority of opcodes use the same number of bytes/words, | |
// so we can just add that number to the pointer, which is itself either | |
// bytes or words as appropriate. | |
CASE(nop) | |
code += 1; | |
// We saw at the start how threads are restored from suspension. | |
// while that's still fresh, let's look at the corresponding call-- | |
// a bytecode that *causes* a suspension. | |
CASE(yield) | |
// This instruction takes up 3 spots in the bytecode stream: | |
// #0 : OP_yield | |
// #1 : value to "return" on yielding | |
// #2 : register to write the result when we "restore" from yielding | |
// implementation is trivial: we just save everything and return | |
// the return value. we don't worry about the result writing, we | |
// leave that to the unsuspend logic. | |
t->context->yield_state = SVM_STATE_yield; | |
t->saved_frame = frame; | |
t->saved_func = func; | |
t->saved_stack_top = stack; | |
// points to the next instruction | |
t->saved_instruction_ptr = code+3; | |
// now return a pointer to the "return" value | |
return pval(code[1]); | |
// Optimized bytecodes are provided which are much simpler and faster | |
// than the general-purpose bytecodes defined by the specification, | |
// with optimized decoding of common cases and/or optimizing behavior | |
// if types are statically known. | |
// We'll start with some of the simplest ones. The first two instructions | |
// perform moves from a register to a register, and from a constant to a | |
// register. We write simple opcodes out as one-liners so that the code | |
// lines up with other opcodes, making it easier to see patterns and bugs. | |
// Each operation like code[1] fetches opcode parameters out of the | |
// bytestream; the macro regs(N) fetches the Nth register, and cpool(N) | |
// fetches the Nth constant from the constant pool (constants are actually | |
// accessed with negative numbers, which is why we must access through | |
// the signed value 'code' not the unsigned value 'ucode') | |
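// A concrete (hypothetical) encoding: "move r5 <- r2" occupies three slots,
//    { SVMOP_move_rr_dyn, 2, 5 }
// while "move r5 <- K", where K is the constant stored one atom before the
// function, would be
//    { SVMOP_move_rc_dyn, -1, 5 }
// (the exact negative index depends on where the compiler placed K).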
CASE(move_rr_dyn) regs(code[2]) = regs (code[1]); code += 3; | |
CASE(move_rc_dyn) regs(code[2]) = cpool_notreg(code[1]); code += 3; | |
// The next two instructions are similar to the previous. However, | |
// each register/constant is an "atom" which consists of two fields: | |
// a type field (saying what dynamic type the atom has) and a "value" | |
// field (whose encoding depends on the type). For optimized bytecodes | |
// for statically-typed languages, the compiler knows what types the | |
// registers are, and doesn't need to update the type field, so we | |
// only copy the value field, which is denoted for conciseness as ".v". | |
// The ".v" of the register is a union, so later we'll see accesses | |
// to fields in the union, but for these generic instructions that | |
// work on arbitrary types, we just copy the whole value union. | |
CASE(move_rr_static) regs(code[2]).v = regs (code[1]).v; code += 3; | |
CASE(move_rc_static) regs(code[2]).v = cpool_notreg(code[1]).v; code += 3; | |
// Now we have the generic move case, which is a dynamic move, but | |
// we don't know at interpretation time whether we're moving from | |
// a register or the constant pool. (The compiler does know, and | |
// should have just output the right one, but for simplicity we | |
// allow the compiler to target these less-efficient instructions.) | |
CASE(move) { | |
regs(code[2]) = val(code[1]); | |
code += 3; | |
} | |
// A quick side note: when we return from a function, we need to write | |
// the result to a register. To avoid needing extra register moves, | |
// we don't use a fixed register; the instruction says which | |
// register to write to. This will happen a long time after we decoded | |
// the function call, though. So we can either save away which register | |
// it was to the stack, or we can redecode the previous instruction when | |
// we return, and extract the register then. The latter sounds hard, | |
// because instructions are variable length, but we use a simple trick: | |
// we put the register to write the result to in the last byte/word of | |
// the instruction; then on return we always just look one byte back | |
// in the codestream to find the target register. (Note that we cannot | |
// have function calls that discard their return value; they have to | |
// have a dummy register and write it there.) | |
// | |
// So for consistency, the register written by non-function call
// instructions ALSO appears last in the opcode bytes. This allows us | |
// to make any instruction trigger a trap handler routine, and the return | |
// from that routine will write the return value to the correct place, | |
// allowing us to perform such trap handlers within a single interpreter | |
// loop (without recursing into the interpreter). | |
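// As a sketch of that trick: if the caller executed
//    { SVMOP_call_fc_bcode_a_none, <func>, r7 }
// then the saved instruction pointer points just past that instruction, so on
// return caller_code[-1] is r7, no matter how long the call instruction was
// or which call opcode it used.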
// There's a second move-like instruction, which includes a coercion between
// the input type and an arbitrary type. Here we see our first exception; | |
// if the coercion is impossible, we can tell from the return code, and
// we branch to a common chunk of code that will set up a typecheck | |
// exception, and then search up the stack for exception handlers. | |
CASE(coerce) { | |
regnum outreg = code[3]; | |
if (svm_coerce_explicit(t, ®s(outreg), func->type_constants[ucode[2]], pval(code[1]))) | |
goto typecheck; | |
code += 4; | |
} | |
// There's a similar convert operation for when we explicitly request a coercion.
// In this case, we allow for the possibility of user-defined conversion
// code, which might throw an exception that's *not* a typecheck exception,
// so if the conversion throws an exception, we require it to have already | |
// set it up properly. | |
CASE(convert) { | |
regnum outreg = code[3]; | |
svm_convert(t, ®s(outreg), func->type_constants[ucode[2]], pval(code[1])); | |
if (t->context->has_exception) | |
goto exception_handler; | |
code += 4; | |
} | |
// And a type-verifying operation which checks if the type is correct | |
CASE(typecheck) { | |
regnum reg = code[1]; | |
if (func->type_constants[ucode[2]].code != regs(reg).type.code) | |
goto typecheck; | |
code += 3; | |
} | |
// now that we've seen the basics of this, let's look at some instructions | |
// that actually compute something. first we'll only look at the statically typed | |
// ones, since those are one-liners. for example, integer addition always | |
// produces integers (they wrap mod 2^32), so they can't throw exceptions | |
// and don't need coercion. first we have the simple case where we add two
// registers and store to a third. | |
CASE(add_rr_int ) regs(code[3]).v.u = regs (code[1]).v.u + regs (code[2]).v.u; code += 4; | |
// next we have the possibility of adding a constant. since addition is commutative, | |
// we only need to allow a constant on the right. we make two versions: one loads | |
// the constant from the constant pool, and the other, faster one uses the value | |
// from the bytecode stream itself. This code is written as if the values are | |
// unsigned, but it will actually handle both signed and unsigned values (and | |
// mixtures of them), because we only support platforms that use 2's complement arithmetic. | |
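// (worked example: adding the 32-bit patterns 0xFFFFFFFF and 0x00000002 gives
// 0x00000001 whether you read them as unsigned 4294967295 + 2 mod 2^32, or as
// signed -1 + 2)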
CASE(add_rc_int ) regs(code[3]).v.u = regs (code[1]).v.u + cpooli(code[2]) .u; code += 4; | |
CASE(add_rb_int ) regs(code[3]).v.u = regs (code[1]).v.u + code[2] ; code += 4; | |
// The third possibility is adding two constants, but an optimizing compiler | |
// should just replace this with the sum of the constants itself, thus an optimizing | |
// compiler never needs to generate that (and the unoptimized OP_add instruction | |
// *does* support this.) | |
// Now we need to repeat the same thing for floats. first we support adding | |
// two registers or adding a register and a constant. we also support the case of | |
// adding a constant integer out of the bytestream, although that integer will have | |
// to be converted to float, which may be slower than fetching from the constant | |
// pool on some platforms. but we define it just in case it isn't. | |
CASE(add_rr_float) regs(code[3]).v.f = regs (code[1]).v.f + regs (code[2]).v.f; code += 4; | |
CASE(add_rc_float) regs(code[3]).v.f = regs (code[1]).v.f + cpoolf(code[2]) .f; code += 4; | |
CASE(add_rb_float) regs(code[3]).v.f = regs (code[1]).v.f + code[2] ; code += 4; | |
// Next, we move on to subtraction. Now, subtraction isn't commutative, so we | |
// might seem to need to implement 'reg - reg', 'reg - const', and 'const - reg' | |
// to cover all the cases. However, 'reg - const' can be implemented using | |
// 'reg + const' with the constant negated, so we omit that one. So subtraction | |
// only has two variations, with the left side being either a register or a | |
// constant. To save bytecode space, we don't implement having the constant | |
// in the bytecode stream, since that operation is more rare. | |
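// For instance (what a compiler could emit; not a new opcode): "r3 = r1 - 5"
// becomes an add with a negated constant,
//    { SVMOP_add_rb_int, r1, -5, r3 }
// so no sub_rc variant is needed.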
CASE(sub_rr_int ) regs(code[3]).v.u = regs (code[1]).v.u - regs (code[2]).v.u; code += 4; | |
CASE(sub_cr_int ) regs(code[3]).v.u = cpooli(code[1]) .u - regs (code[2]).v.u; code += 4; | |
CASE(sub_rr_float) regs(code[3]).v.f = regs (code[1]).v.f - regs (code[2]).v.f; code += 4;
CASE(sub_cr_float) regs(code[3]).v.f = cpoolf(code[1]) .f - regs (code[2]).v.f; code += 4;
// Negation could be viewed as a special case of subtraction, but is common | |
// enough that we optimize for it (and we'll see another negation optimization | |
// in OP_mulneg). | |
CASE(neg_rr_int ) regs(code[2]).v.s = -regs(code[1]).v.s; code += 3; | |
CASE(neg_rr_float) regs(code[2]).v.f = -regs(code[1]).v.f; code += 3; | |
// Absolute value is so simple to implement on most CPUs that it would be | |
// a shame to have to implement it as a native function, so we go ahead | |
// and provide it even if it's not all that common. We don't need to worry | |
// about constants; the only plausible case is register-to-register. | |
CASE(abs_rr_int ) regs(code[2]).v.s = svm__abs (regs(code[1]).v.s); code += 3; | |
CASE(abs_rr_float) regs(code[2]).v.f = (float) svm__fabs(regs(code[1]).v.f); code += 3; | |
// As a special case, we have one which forces the output type to be unsigned; | |
// this is useful for taking the absolute value of -2^31, which is still negative | |
// if we keep the result as a signed int | |
CASE(absu_rr_int ) {
regnum d = code[2];
regs(d).v.s = svm__abs(regs(code[1]).v.s);
regs(d).type.code = SVMT_u32;
code += 3; // same 3-slot layout as abs_rr_int
}
// Next we have multiplication. This is similar to addition (it's commutative), | |
// but it's different in that signed and unsigned multiplication are different | |
// operations. In fact, we need to support multiplying a signed by an unsigned | |
// number, but that turns out to just be an unsigned multiply as well. Since | |
// it's commutative, each type has two cases; again we make two versions of the | |
// constant multiply, with the constant stored in different places, except for | |
// float multiply. | |
CASE(mul_rr_int ) regs(code[3]).v.s = regs(code[1]).v.s * regs (code[2]).v.s; code += 4; | |
CASE(mul_rc_int ) regs(code[3]).v.s = regs(code[1]).v.s * cpooli(code[2]) .s; code += 4; | |
CASE(mul_rb_int ) regs(code[3]).v.s = regs(code[1]).v.s * code[2] ; code += 4; | |
CASE(mul_rr_uint ) regs(code[3]).v.u = regs(code[1]).v.u * regs (code[2]).v.u; code += 4; | |
CASE(mul_rc_uint ) regs(code[3]).v.u = regs(code[1]).v.u * cpooli(code[2]) .u; code += 4; | |
CASE(mul_rb_uint ) regs(code[3]).v.u = regs(code[1]).v.u * ucode[2] ; code += 4; | |
CASE(mul_rr_float) regs(code[3]).v.f = regs(code[1]).v.f * regs (code[2]).v.f; code += 4; | |
CASE(mul_rc_float) regs(code[3]).v.f = regs(code[1]).v.f * cpoolf(code[2]) .f; code += 4; | |
// That's it for the static jump statements. Now let's take a look at | |
// much more complicated flow control: function call and return. | |
// | |
// STABVM uses an explicit stack for function call/return. We push | |
// function arguments on the stack, save the current function state, | |
// and start up the next function. | |
// | |
// This gets very complicated very quickly, so we'll only look at | |
// the very simplest case first: calling a bytecoded function which | |
// we know statically takes no arguments, and does no special | |
// initialization. This isn't a very important case to optimize | |
// for, but it's the case that *can* go the fastest, so we do, | |
// just in case it's useful. Furthermore, it lets us look at exactly | |
// how much overhead an interpreted function call requires. Function | |
// call/return is important for code that's less "big scripts" and more | |
// "traditional code". | |
// | |
// Other function calls will do more and be slower, but this establishes | |
// our baseline for function call overhead. | |
// | |
// We need to define some local variables, and we want the scope of | |
// these to be explicit for the optimizer, so we'll start up a {} block. | |
CASE(call_fc_bcode_a_none) // 0=opcode, 1=func, 2=retval reg | |
{ | |
// the first field of the instruction is the pointer to the function | |
// to call, so we load that up now. | |
svmfunc *newfunc = (svmfunc *) cpoolp(code[1]).ptr; | |
// In PTRCODE mode, this is one CPU instruction: | |
// load reg_temp,4(reg_code); | |
// in other modes, it's indirected through cpool, so 2-3 instructions. | |
// the new "execution context" (stack frame) we'll run in is located | |
// wherever the current top of stack is. | |
svmstackframe *newframe = (svmstackframe *) stack; | |
// If stack is in a CPU register, this is free. | |
// we do our stack computations in a temporary variable to | |
// encourage the compiler to register allocate it. | |
void *newstack; | |
// when we're done with this function, we'll want to return and | |
// continue executing from this same spot. we'll do that by reloading | |
// these values out of what will be the current frame pointer, | |
// but for now is "newframe". so we need to save everything there. | |
// note that we never actually access code[2], which stores the | |
// register where we'll put the function's return value. We use a trick | |
// to save an instruction or two instead, which we'll see later. | |
newframe->saved_func = func; | |
newframe->saved_stackframe = frame; | |
newframe->saved_instruction_ptr = code + 3; | |
// This requires 4 instructions (3 stores and one add; a bit worse | |
// on x86 where the add will stomp a register). | |
// next we need to compute how much stack space we take up. | |
// this depends on the size of the stack used by the function | |
// description, so we store that explicitly in the function. | |
newstack = (char *) stack + newfunc->stack_frame_size; | |
// this requires two RISC instructions (we assume stack is | |
// in a register) or two x86 instructions (load stack, add | |
// from memory) | |
// next we need to check for stack overflow. by design, to | |
// make this test fast, the stack is stored right before the | |
// thread pointer and grows upwards towards it, so the | |
// stack overflows if it is past the thread pointer. | |
if ((char *) newstack > (char *) t) goto stack_overflow; | |
// this typically requires two instructions | |
// at this point, we've finished saving everything, and we've | |
// checked every failure case, so we just need to update our | |
// locals so we're actually set up to run the new function | |
code = newfunc->bytecode; | |
func = newfunc; | |
frame = newframe; | |
stack = newstack; | |
// each of the above lines requires one instruction (the compiler | |
// can't put newstack in the same register as stack since stack | |
// is still alive after computing newstack if we goto stack_overflow) | |
// and we're done. so, not counting the dispatch code, this | |
// function call (static, no arguments) requires around | |
// 8 CPU instructions for BIGCODE and 10 CPU instructions | |
// otherwise, not counting any register spilling on x86. | |
} | |
// now let's look at the other side of the equation: we want to return
// from a function as fast as possible. | |
// | |
// Again, we'll consider the static case. For a return, static means | |
// that the type of the value we're returning is already known to be | |
// the type that the function is defined to be. We don't care about | |
// what type the *caller* is expecting; the caller is required to do | |
// any final coercion (this requires an extra instruction in the
// dynamic-to-static case, but the alternative requires us to pay an | |
// extra test in the static-to-static and dynamic-to-dynamic path, or | |
// generate multiple versions of the same function for use in different | |
// contexts). However, since the caller might be calling through | |
// a dynamic interface, we DO have to write out the correct type. | |
// To do this, we rely on the compiler to have arranged for | |
// the chosen register to have the type in it. | |
// again, we need to declare a local variable | |
CASE(return_r_static) | |
{ | |
regnum outreg; | |
// in this case, we want to get a pointer to the frame we're returning to | |
svmstackframe *caller_frame = frame->saved_stackframe; | |
// this requires one CPU instruction | |
// we restore the saved bytecode pointer, which points to immediately | |
// *after* the instruction we last executed in the caller. | |
sbytecode *caller_code = frame->saved_instruction_ptr; | |
// this requires one CPU instruction | |
// now we want to write the return value into the caller's frame. | |
// to know where to write it, we require that *any* caller always | |
// leaves the bytecode pointer so that the *previous* byte/word/dword | |
// before the bytecode pointer points to the register where the | |
// returned result should be written. this is why the bytecode format | |
// places the returned value slot last in the instruction. if we | |
// didn't do this, we would have to save a pointer to where to write | |
// the result, and writing/reading that pointer would be extra overhead. | |
outreg = caller_code[-1]; | |
*atom_reg_ref(caller_frame->regs, outreg) = regs(code[1]); // copy out type in case caller needs it | |
// this effectively computes: | |
// p1 = &caller_frame->regs[caller_code[-1]] | |
// p2 = &frame->regs[code[1]] | |
// p1->v = p2->v | |
// p1->type = p2->type | |
// Let's look at this in two stages, the first two lines then | |
// the second two. | |
// | |
// Loading p1 and p2 require computing "reg + constant + memory" (reg | |
// is 'frame', constants is offset of regs[], and memory is 'code[1]'). | |
// However, we can fold the constant into the uses of p1 and p2, so it's | |
// really just "reg + memory". This requires two CPU instructions EACH on both | |
// x86 and RISC. (x86 adds from memory in one instruction, but needs | |
// to copy the source register to another register first.) | |
// | |
// Copying the data between the two pointers requires 4 RISC instructions | |
// (unless you use some wider data type). x86 seems like it would be the | |
// same. | |
// | |
// This brings the total for that line of code to 8 CPU instructions. | |
// and we restore all the register describing the execution context | |
code = caller_code; | |
stack = (void *) frame; | |
func = frame->saved_func; | |
frame = caller_frame; | |
// this requires 4 CPU instructions | |
// and we're done, ready to continue running in the old frame context. | |
// The total was ~14 CPU instructions (more for register spills); this | |
// is actually more than the void function call was. But more than half of | |
// this was spent in copying the return value. | |
// This means it's worth having some *more* specialized bytecodes for | |
// returns that could be faster. For example, returning a constant | |
// out of the bytecode stream; not writing the type field, or even | |
// a void function that doesn't write anything. So we'll implement | |
// those next. | |
} | |
// first let's do void. it's just the same as the above code, but | |
// without writing anything into the caller's return value. | |
CASE(return_void_static) | |
code = frame->saved_instruction_ptr; | |
stack = (void *) frame; | |
func = frame->saved_func; | |
frame = frame->saved_stackframe; | |
// that's it! it requires only four CPU instructions | |
// let's try reading a small integer out of the bytestream and not writing | |
// the type. this can be used to return a small integer, a "small" unsigned | |
// integer, or a boolean. we could create special 'return true' and | |
// 'return false' for even "more" performance, but on some CPUs they wouldn't | |
// even save anything because the 0 and 1 values would still have to be loaded | |
// or computed into a register somehow. (well, it would save a little memory) | |
CASE(return_b_static) | |
{ | |
svmstackframe *caller_frame = frame->saved_stackframe; | |
int32 retval = code[1]; | |
regnum outreg; | |
code = frame->saved_instruction_ptr; | |
outreg = code[-1]; | |
atom_reg_ref(caller_frame->regs, outreg)->v.s = retval; | |
stack = (void *) frame; | |
func = frame->saved_func; | |
frame = caller_frame; | |
// this is ~9 CPU instructions; we save 2 not computing the | |
// register to load from, we save 2 not reading and writing | |
// the type, and we save one by hoisting code up earlier | |
// in the code, avoiding a register move. | |
// | |
// for an explicit comparison | |
// http://nothings.org/games/if/slux/index.html shows the | |
// disassembly for "return 0" for an earlier VM of mine | |
// which uses the same return-value-location trick. It | |
// instead uses tag bits for type info. It ends up using | |
// six CPU instructions; one isn't RISC-like, but one appears | |
// to be redundant. Regardless, this means it's definitely | |
// possible with different design constraints | |
// to do better than 9 CPU instructions for this operation, | |
// so this is a place where using STABVM compromises | |
// efficiency for semantic flexibility. | |
} | |
// for completeness, let's finish off the return cases.
// first we have the equivalent to the first one, but with a constant | |
// instead of a register. because the cpool() could reference the | |
// current function OR the current bytecode stream in PTRCODE, | |
// we have to be careful not to update those before the copy | |
CASE(return_c_static) { | |
svmstackframe *caller_frame = frame->saved_stackframe; | |
sbytecode *caller_code = frame->saved_instruction_ptr; | |
regnum outreg = caller_code[-1]; | |
*atom_reg_ref(caller_frame->regs, outreg) = cpool(code[1]); | |
code = caller_code; | |
stack = (void *) frame; | |
func = frame->saved_func; | |
frame = caller_frame; | |
} | |
// next, we have the fully generic regular return function. this | |
// needs to (a) decode from either regs or constants, and (b) | |
// needs to do a coercion to the return type
CASE(return) { | |
svmstackframe *caller_frame = frame->saved_stackframe; | |
sbytecode *caller_code = frame->saved_instruction_ptr; | |
regnum outreg = caller_code[-1]; | |
stack = (void *) frame; | |
if (svm_coerce_explicit(t, atom_reg_ref(caller_frame->regs, outreg), func->return_type, pval(code[1]))) | |
goto typecheck; | |
code = caller_code; | |
func = frame->saved_func; | |
frame = caller_frame; | |
} | |
// Finally, we need a return that's alloca friendly. any stack frame | |
// that might contains allocas has an extra pointer which is a linked | |
// list of allocas from that frame. The VM also keeps a linked list of | |
// all the stack frames. So the stack walker could walk every stack | |
// frame and check the frame's list-of-allocas pointer, but that would | |
// require us to initialize the list-of-allocas pointer on every function | |
// call. Since we don't want to pay the alloca cost if the user never | |
// uses the allocas, that cost is unacceptable. So instead, we keep | |
// a second linked list of the stack frames that have had their | |
// list-of-alloca pointers initialized. Then on return, we have to | |
// unlink that pointer. | |
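// With those two lists, a stack walker can visit every live walkable alloca
// with something like this (hypothetical sketch; 'visit' is not a real
// function, but the field names are the ones used in this file):
//    for (f = t->alloca_stackframe_head; f; f = f->alloca_next_stackframe)
//       for (a = f->alloca_local_chain; a; a = a->alloca_chain)
//          visit(a);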
CASE(return_alloca) { | |
svmstackframe *caller_frame = frame->saved_stackframe; | |
sbytecode *caller_code = frame->saved_instruction_ptr; | |
regnum outreg = caller_code[-1]; | |
#ifdef STB_VM_ALLOCA_WALKER // if this isn't defined, it's same as CASE(return); should share the code | |
t->alloca_stackframe_head = frame->alloca_next_stackframe; // this is the only extra operation | |
#endif | |
stack = (void *) frame; | |
if (svm_coerce_explicit(t, atom_reg_ref(caller_frame->regs, outreg), func->return_type, pval(code[1]))) | |
goto typecheck; | |
code = caller_code; | |
func = frame->saved_func; | |
frame = caller_frame; | |
} | |
// and we're done with all the return variants of every kind. | |
// let's look at some more function dispatch. We'll implement most of the | |
// function calls where the function call is a constant and is implemented | |
// in bytecode. this subset still comes in multiple flavors, but we'll | |
// start with one of the most general. | |
CASE(call_fc_bcode_static) | |
{ // 0=opcode, 1=func, 2=argc, 3..argc+2=arguments, argc+3=retval, | |
// the beginning of this is the same as the void call we | |
// looked at already (OP_call_fc_bcode_a): | |
svmfunc *newfunc = (svmfunc *) cpool(code[1]).v.ptr; | |
svmstackframe *newframe = (svmstackframe *) stack; | |
int i,num_arg; | |
void *newstack; | |
newframe->saved_func = func; | |
newframe->saved_stackframe = frame; | |
// now we need to get the number of arguments | |
num_arg = code[2]; | |
// the instruction is variable length, depending on the argument count | |
newframe->saved_instruction_ptr = code + num_arg+4; | |
// advance the stack and range-check it | |
newstack = (char *) stack + newfunc->stack_frame_size; | |
if ((char *) newstack > (char *) t) | |
goto stack_overflow; | |
// with varargs, the argument array could be longer than num_regs, so we need to check for varargs overflowing | |
if ((char *) &newframe->regs[num_arg] > (char *) t) | |
goto stack_overflow; | |
// copy the function arguments, this version assumes no coersions needed | |
// even so, performance is kind of sucky, because val(k) requires a | |
// branch, and so we're using the *same* branch every time through, | |
// so branch prediction can't work. so later we'll specialize this | |
// with things that minimize the decoding work | |
for (i=0; i < num_arg; ++i) | |
newframe->regs[i] = val(code[3+i]); | |
// now set everything up to be in the new context | |
code = newfunc->bytecode; | |
func = newfunc; | |
frame = newframe; | |
stack = newstack; | |
// now handle the function having special initialization | |
// we do this by branching to shared code that does the initialization, | |
// since it will be the same in every function. | |
if (func->has_initialization) { | |
// we store the argument count in the thread context so we don't | |
// have to store it in a function-wide variable where
// it might trigger save/restores since the compiler might | |
// not figure out that it doesn't flow across the | |
// indirect branches | |
t->context->temp = num_arg; | |
goto func_initializer; | |
} | |
NEXT(); | |
} | |
// this is shared code that is used for all bytecode function prologues | |
// that request initialization | |
func_initializer: { | |
// we copy the argument count back out of the thread pointer | |
int num_arguments_provided_to_call = t->context->temp; | |
// first thing we do is, if we support stackwalking things allocated | |
// by alloca, then we check if this function has any of those, and if | |
// so we add it to the main chain. additionally, we start a "local"
// chain of just the allocations for this stack frame. | |
#ifdef STB_VM_ALLOCA_WALKER | |
if (func->has_walkable_alloca) { | |
// initialize the local chain | |
frame->alloca_local_chain = 0; | |
// update the global chain -- we could actually do this lazily | |
// the first time we add anything to the local chain, but this | |
// is simpler to code and it avoids extra branches to test | |
frame->alloca_next_stackframe = t->alloca_stackframe_head; | |
t->alloca_stackframe_head = frame; | |
} | |
#endif | |
// next step is to handle variable-argument-count functions. we provide | |
// a system which supports two important behaviors: you can ask for a | |
// full args list by setting func->arg_array_start to 0, or | |
// you can ask for a list starting after some known number of | |
// elements by setting it to that number of elements; typically | |
// this would be the start point of a '...' list. You can't ask | |
// for both, since only one array can be generated. (If your language | |
// allows both, you could generate code to dynamically synthesize the | |
// varargs list from the full list as the first instructions of the function). | |
// Also, this doesn't preclude still accessing the elements | |
// from registers (i.e. as named variables). Set arg_array_start to 0x7fffffff
// to not create an args array. Set func->max_args to 0x7fffffff to allow an | |
// arbitrary length varargs input. | |
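// For example (hypothetical function): for "f(a, b, ...)" the compiler would
// set arg_array_start to 2, so a call "f(x, y, p, q)" reaches here with
// num_arguments_provided_to_call == 4 and builds a 2-element array holding
// {p, q}, copied out of frame->regs[2..3].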
if (num_arguments_provided_to_call > func->arg_array_start) { | |
// copy excess arguments into alloca'd array; this can either be an 'args' array or a 'varargs' array | |
svmarray *a; | |
int array_len = num_arguments_provided_to_call - func->arg_array_start; | |
int array_bytes = sizeof(*a) + sizeof(a->data[0]) * (array_len - 1); | |
a = (svmarray *) stack; | |
if ((char *) stack + array_bytes < (char *) stack) | |
goto stack_overflow; | |
stack = (char *) stack + array_bytes; | |
if ((char *) stack > (char *) t) | |
goto stack_overflow; | |
// now, the arguments are currently on the stack. if there were | |
// more arguments than the function has registers, it wrote off | |
// the end, *into* the extra space, where we just allocated this | |
// array. so before we write anything else into the array structure, | |
// we have to copy the data itself, and we have to use memmove() | |
// since it might be self-overlapping | |
// @TODO: explicitly code loop that goes in right direction, to remove dependency on memmove | |
svm__memmove(a->data, &frame->regs[func->arg_array_start], sizeof(svmatom) * array_len); | |
// now that we've copied the arguments, we can fill out the array header | |
a->count = a->limit = array_len; | |
// now, we add this object to the chain of local walkable-allocas, | |
// so the stack walker can find it | |
#ifdef STB_VM_ALLOCA_WALKER | |
assert(func->has_walkable_alloca); | |
a->alloca_chain = frame->alloca_local_chain; | |
frame->alloca_local_chain = (void *) a; | |
// and we set its type field to NULL, which is the stack-walker flag that it's an array not a structure | |
a->type = NULL; | |
#endif | |
} | |
// next we have a single system used for three purposes: | |
// - providing default parameter values | |
// - initializing variables to constants | |
// - initializing static types for registers for static-to-dynamic conversions | |
// the way this works is that the variables/registers that need initialization | |
// should be low-numbered registers, just after the arguments. parameters needing | |
// initialization are always last. So the initialization just needs to initialize | |
// a continuous block of memory -- any omitted parameters with defaults, and then | |
// some number of low-numbered registers immediately after the parameters. | |
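// A worked (hypothetical) example: "f(a, b=1, c=2)" plus one local that is
// initialized to a constant, laid out as regs 0..3 = a, b, c, local. Then
// earliest_init_reg = 1, first_mandatory_init_reg = 3, and init_reg_data holds
// three atoms (defaults for b and c, plus the local's initializer).
//  - called with 1 arg:  1 > 1 is false, so regs 1..3 are all initialized
//  - called with 2 args: start becomes 3, then is clamped back to 2, so only
//    regs 2..3 (c's default and the local) are initialized
//  - called with 3 args: start becomes 3, so only reg 3 (the local) is initialized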
if (func->init_reg_size_in_bytes) { | |
// the initialization uses a memcpy; that copy needs three things: | |
// - the place to write, represented as a register count within the frame | |
// - the amount of data to copy, represented in bytes
// - the place to copy the data from, in bytes | |
// We start by loading three values with the *full* data to copy; | |
// in the absence of argument-initializers, we'll actually do this; | |
// also if *all* default initializers are required. | |
int start = func->earliest_init_reg; | |
int size = func->init_reg_size_in_bytes; | |
svmatom *data = func->init_reg_data; | |
// Next we need to see if we have initializers that we *shouldn't* set. | |
// This happens iff there are argument initializers and there are more | |
// arguments than the minimum number possible. However, because we ALSO | |
// allow varargs, in which case we'll pass in arbitrary more arguments | |
// then those allowed, but we still need to initialize all the real registers, | |
// the logic here is more complicated. First, we only even have to think | |
// about testing this if the number of arguments passed in extend past | |
// the beginning of the region to initialize. It is precisely to make | |
// this test fast that we chose to encode earliest_init_reg in registers, | |
// not bytes. | |
if (num_arguments_provided_to_call > start) { | |
// tentatively, we assume we start at the first "real" reg for | |
// the function (after the "real" function arguments) | |
start = func->first_mandatory_init_reg; | |
// but if we had fewer arguments than that, then we need to initialize
// the missing arguments | |
if (start > num_arguments_provided_to_call) | |
start = num_arguments_provided_to_call; | |
// now we've computed an updated destination register. now we need
// to update where to copy from, and how much to copy. Both of these | |
// will update by the same number of bytes, so we lift the common | |
// subexpression out explicitly to make sure the compiler doesn't | |
// miss it. | |
{ | |
int offset = (start - func->earliest_init_reg) * sizeof(svmatom); | |
size -= offset; | |
data = (svmatom *) ((char *) data + offset); | |
} | |
} | |
// now we've updated all the pointers and sizes and we can just initialize | |
// the memory | |
svm__memcpy(&frame->regs[start], data, size); | |
} | |
// and we're done with the function prologue, and are ready to run | |
// the first instruction in the function | |
} | |
// now let's look at a version of a function call opcode that does do coercions.
// of course if the function is a constant, we know exactly what the | |
// coersions we need to do are at compile time. so ideally we'd only convert | |
// the ones that need it. however, we have to loop through the N arguments | |
// constructing the stack no matter what, and there's not much we can do | |
// that's faster than testing whether we do in fact need to convert. | |
// | |
// so this code is basically identical to call_fc_bcode_static, except it does coercions
CASE(call_fc_bcode) { // 0=opcode, 1=func, 2=argc, 3..argc+2=arguments, argc+3=retval, | |
// everything is the same as before up to when we copy the data | |
svmfunc *newfunc = (svmfunc *) cpool(code[1]).v.ptr; | |
svmstackframe *newframe = (svmstackframe *) stack; | |
int i,num_arg; | |
void *newstack; | |
newframe->saved_func = func; | |
newframe->saved_stackframe = frame; | |
num_arg = code[2]; | |
newframe->saved_instruction_ptr = code + num_arg+4; | |
newstack = (char *) stack + newfunc->stack_frame_size; | |
if ((char *) newstack > (char *) t) | |
goto stack_overflow; | |
if ((char *) &newframe->regs[num_arg] > (char *) t) // varargs array could be longer than num_regs | |
goto stack_overflow; | |
// now we copy the data using svm_coerce, which will do the conversions | |
// for us. but, if we're calling a varargs function, we might have more | |
// arguments than it has types, so we have to test for that | |
if (num_arg <= newfunc->max_arg_types) | |
for (i=0; i < num_arg; ++i) | |
svm_coerce(t, &newframe->regs[i], newfunc->type_constants[i], pval(code[i+3])); | |
else { | |
// if we have more args than it has types, then we only coerce those with types
int num_types = newfunc->max_arg_types; | |
for (i=0; i < num_types; ++i) | |
svm_coerce(t, &newframe->regs[i], newfunc->type_constants[i], pval(code[i+3])); | |
// and we copy the rest uncoerced | |
for ( ; i < num_arg; ++i) | |
newframe->regs[i] = val(code[i+3]); | |
} | |
// and the rest of the code is unchanged | |
code = newfunc->bytecode; | |
func = newfunc; | |
frame = newframe; | |
stack = newstack; | |
if (func->has_initialization) { | |
t->context->temp = num_arg; | |
goto func_initializer; | |
} | |
} | |
// The last opcode showed a truly fully-general implementation of
// a call which we know is going to a bytecode-implemented function. | |
// But there are also native-coded functions. Let's look at those. | |
// The first case to consider is a native function which is statically | |
// known, but where the native function is designed for interfacing | |
// with STABVM by doing any coercions internally: it receives a C array
// of svmatoms and parses the arguments itself. We don't have to do | |
// any type coercion (it'll do it if it needs it). So we mostly just
// have to build the array and go. | |
CASE(call_fc_ndyn) | |
{ // 0=opcode, 1=func, 2=argc, 3..argc+2=arguments, argc+3=retval, | |
// get the C function pointer, and figure out how many arguments we'll need | |
native_func_dynamicargs *funcptr = (native_func_dynamicargs *) cpool(code[1]).v.ptr; | |
int num_arg = code[2], i; | |
// we get an alias for the stack that we use to build the argument list | |
svmatom *arglist = (svmatom *) stack; | |
// we're going to store the arguments on the stack, so check | |
// that there's room | |
if ((char *) &arglist[num_arg] > (char *) t) | |
goto stack_overflow; | |
// copy the arguments onto the stack; we rely on the callee to | |
// coerce (or to statically know the types, as appropriate) | |
for (i=0; i < num_arg; ++i) | |
arglist[i] = val(code[3+i]); | |
// now we have everything the function call will need. But we | |
// have a few more things to do. | |
// First, we want to allow the function to recurse into the | |
// interpreter. (Imagine, for example, that we're calling a sort
// routine, and it wants to call back to a comparison function.) | |
// To do that, we just need to save the stack depth we've used. | |
t->saved_stack_top = arglist + num_arg; | |
// Next, when we call the function, the C compiler may save and | |
// restore our registers (depending on the caller-save situation). | |
// If it's going to save and restore them anyway, then there's no | |
// extra overhead for us to save and restore them (if the compiler | |
// is smart enough to notice). so we'll save and restore the bytecode | |
// pointer (but nothing else). this could waste a *little* bit | |
// on machines with good callee-save support. but it saves | |
// a conditional branch, as we'll see in a moment. | |
t->saved_instruction_ptr = code + num_arg+4; | |
// now we call the C-native function | |
funcptr(t, &frame->regs[code[3+num_arg]], num_arg, arglist); | |
// now we restore the bytecode pointer and go back to running. | |
// except this is actually a trick. we load the value from | |
// t->saved_instruction_ptr which we just saved above, so | |
// *normally* this just restores what we saved. However, | |
// the function we call can overwrite that value. | |
code = t->saved_instruction_ptr; | |
// Now we'll dispatch to the next opcode pointed to by 'code'. | |
// if it was the value we saved ourselves, execution continues | |
// normally. However, if it points somewhere else, something | |
// else happens. In particular, if the function we just called | |
// calls | |
// | |
// svm_from_nativefunc_defer_exception(svmthread *t, svmatom exception_object); | |
// | |
// with this thread's handle, that function will overwrite | |
// t->saved_instruction_ptr with the pointer to a special | |
// instruction sequence that throws the exception. (It'll | |
// copy the old t->saved_instruction_ptr somewhere else, | |
// so that we can restore the interpreter state properly.) | |
// | |
// The point of this is it allows the function we call to | |
// control the behavior of the interpreter at this point, | |
// without us checking a return value from the native function. | |
// On x86 this may save a few instructions; on others, it | |
// probably costs about the same number of instructions | |
// but avoids using another branch prediction slot, etc. | |
// | |
// Because that behavior is not a path that needs to be high | |
// performance, and to avoid overloading the bytecode namespace | |
// in BYTECODE mode, the special instruction sequence doesn't even | |
// get a custom opcode for this behavior; instead we use the regular | |
// SVMOP_break bytecode, and make the SVMOP_break implementation check | |
// if it's in the special bytecode. | |
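// For example, a native function using this mechanism might look roughly like
// this (a hypothetical sketch: the parameter list is inferred from the call
// above, and make_error_atom() stands in for however the embedder builds an
// exception atom):
//    void my_native(svmthread *t, svmatom *result, int num_arg, svmatom *args)
//    {
//       if (num_arg == 0) {
//          svm_from_nativefunc_defer_exception(t, make_error_atom());
//          return;  // the dispatch below then runs the exception sequence
//       }
//       ... // normal path: write the result through 'result'
//    }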
} | |
// there's another, different calling convention for C code. in | |
// this case, it takes a single structure pointer, and the fields | |
// in it do not have types. they're still all svmvalues; their | |
// layout is identical to the svmstructs that we haven't yet talked | |
// about (and in fact you can use "apply" to call a C function | |
// with a pre-baked struct). | |
// | |
// Again, we'll look at a static one. With this calling convention, | |
// it's our job to do coercions, but we'll start with a fully static
// one that doesn't have to do coercions.
CASE(call_fc_nstat_static) | |
{ // 0=opcode, 1=func, 2=argc, 3..argc+2=arguments, argc+3=retval, | |
// this proceeds as before | |
native_func_staticargs *funcptr = (native_func_staticargs *) cpool(code[1]).v.ptr; | |
int num_arg = code[2], i; | |
// now the arglist only has values, not types | |
svmvalue *arglist = (svmvalue *) stack; | |
// we're going to store the arguments on the stack, so check | |
// that there's room | |
if ((char *) &arglist[num_arg] > (char *) t) | |
goto stack_overflow; | |
// copy the arguments into the arglist | |
for (i=0; i < num_arg; ++i) | |
arglist[i] = val(code[3+i]).v; | |
// we use the same trick again | |
t->saved_instruction_ptr = code + num_arg+4; | |
funcptr(t, &frame->regs[code[3+num_arg]], arglist); | |
code = t->saved_instruction_ptr; | |
} | |
// now that we've seen all the pieces, let's look at one of the most complicated | |
// opcodes: SVMOP_call in all its full-on glory. the complexity of SVMOP_call here is | |
// something you absolutely want to avoid for performance, which is why we have | |
// all the special cases (and more to come). Generally, SVMOP_call should | |
// only be used when the internal type of the function isn't known in advance. | |
CASE(call) { | |
// the first thing we have to do is get at the function and see what it is | |
svmatom *newfuncatom = pval(code[1]); | |
// there are four cases: closure, bytecode function, native function, | |
// or something else. for maximum generality, if it's something else, | |
// we could try coercing it to a function. but no! that way lies madness, | |
// so we'll just throw an error. | |
// Closures are special because they just set a temp variable, replace | |
// newfuncatom, then run the regular dispatch. So the natural way to | |
// code this would be to do this right here: | |
// if (closure) { | |
// t->context->something = closure->something; | |
// newfuncatom = closure->something; | |
// } | |
// and then it falls through to the regular code that deals | |
// with newfuncatom when it's not a closure. | |
// | |
// However, since closures have more overhead anyway and are likely | |
// to be rare, we don't want to pay the cost of testing for a closure | |
// on every regular call. So we'll make the closure path the least efficient
// by testing for closure *last*. If we find one, we want it to behave
// the same as the above code, so we'll issue a *goto* that jumps | |
// back here, as if it had just executed the above if. | |
// | |
// An alternative way to avoid paying the 'if (closure)' cost above | |
// would be to always have an environment and just always copy it to | |
// the temp without testing. However, we'd pay 3-4 instructions on | |
// copying the environment, and we'd have to indirect the function | |
// one deeper. | |
call_got_closure: | |
if (newfuncatom->type.code == SVMT_bytecode_func) { | |
// now we just have the normal bytecode call; this is pretty long, | |
// but it's identical to OP_call_fc_bcode except for where we load the func from | |
svmfunc *newfunc = (svmfunc *) newfuncatom->v.ptr; | |
svmstackframe *newframe = (svmstackframe *) stack; | |
int i,num_arg; | |
void *newstack; | |
newframe->saved_func = func; | |
newframe->saved_stackframe = frame; | |
num_arg = code[2]; | |
newframe->saved_instruction_ptr = code + num_arg+4; | |
newstack = (char *) stack + newfunc->stack_frame_size; | |
if (num_arg < newfunc->min_args || num_arg > newfunc->max_args) | |
goto typecheck; | |
if ((char *) newstack > (char *) t) | |
goto stack_overflow; | |
if ((char *) &newframe->regs[num_arg] > (char *) t) | |
goto stack_overflow; | |
if (num_arg <= newfunc->max_arg_types) | |
for (i=0; i < num_arg; ++i) | |
svm_coerce(t, &newframe->regs[i], newfunc->type_constants[i], pval(code[i+3])); | |
else { | |
int num_types = newfunc->max_arg_types; | |
for (i=0; i < num_types; ++i) | |
svm_coerce(t, &newframe->regs[i], newfunc->type_constants[i], pval(code[i+3])); | |
for ( ; i < num_arg; ++i) | |
newframe->regs[i] = val(code[i+3]); | |
} | |
code = newfunc->bytecode; | |
func = newfunc; | |
frame = newframe; | |
stack = newstack; | |
if (func->has_initialization) { | |
t->context->temp = num_arg; | |
goto func_initializer; | |
} | |
} else if (newfuncatom->type.code == SVMT_native_func) { | |
// now we have a function call. we have to identify which kind of function it is. | |
svmnativefunc *newfunc = (svmnativefunc *) newfuncatom->v.ptr; | |
if (newfunc->is_statictyped) { | |
// this is a statically-known arguments call, so the code below is similar | |
// to OP_call_fc_nstat except for where we load the func from, | |
// and we have to do coercions while packing the arglist
svmfunctype *type = newfunc->type; | |
native_func_staticargs *funcptr = newfunc->func.stat; | |
int num_arg = code[2], i; | |
svmvalue *arglist = (svmvalue *) stack; | |
// with packed arguments like this, we require the number of arguments | |
// be a single specific value, so instead of range-checking num_arg, | |
// we require it match exactly; | |
// | |
// in other words we assume type->max_args == type->num_args | |
if (num_arg != type->max_args) | |
goto typecheck; | |
if ((char *) &arglist[num_arg] > (char *) t) | |
goto stack_overflow; | |
for (i=0; i < num_arg; ++i) | |
svm_coerce_static(t, &arglist[i], type->arg_types[i], pval(code[3+i])); | |
if (t->context->has_exception) | |
goto typecheck; | |
t->saved_instruction_ptr = code + num_arg+4; | |
funcptr(t, &frame->regs[code[3+num_arg]], arglist); | |
code = t->saved_instruction_ptr; | |
} else { | |
            // this is a call where the callee decodes the arguments itself,
// so the code below is identical to OP_call_fc_native except | |
// where we load the func from | |
native_func_dynamicargs *funcptr = newfunc->func.dyn; | |
int num_arg = code[2], i; | |
svmatom *arglist = (svmatom *) stack; | |
if (num_arg < newfunc->type->min_args || num_arg > newfunc->type->max_args) | |
goto typecheck; | |
if ((char *) &arglist[num_arg] > (char *) t) | |
goto stack_overflow; | |
for (i=0; i < num_arg; ++i) | |
arglist[i] = val(code[3+i]); | |
t->saved_instruction_ptr = code + num_arg+4; | |
funcptr(t, &frame->regs[code[3+num_arg]], num_arg, arglist); | |
code = t->saved_instruction_ptr; | |
} | |
} else if (newfuncatom->type.code == SVMT_closure) { | |
svmclosure *c = (svmclosure *) newfuncatom->v.ptr; | |
t->context->environment = c->environment; | |
newfuncatom = &c->function; | |
goto call_got_closure; | |
} else | |
goto typecheck; | |
} | |
// And we have an even crazier case: 'apply', which accepts an svmarray or | |
// svmstruct for the arguments. | |
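   // Operand layout assumed below (again inferred from the indices used):
   // code[1] names the function, code[2] the array/struct of arguments,
   // code[3] the destination register; the next instruction is at code + 4.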
CASE(apply) | |
{ | |
int is_array = 0; | |
svmatom *arguments = pval(code[2]); | |
svmatom *newfuncatom = pval(code[1]); | |
// check the argument type, and check for fast cases while we're in there | |
if (TYPE_IS_ARRAY(arguments->type)) { | |
if (newfuncatom->type.code == SVMT_native_func) { | |
svmnativefunc *newfunc = (svmnativefunc *) newfuncatom->v.ptr; | |
// if we have a dynamic-typed native function, then we just pass the array | |
if (!newfunc->is_statictyped) { | |
svmarray *a = arguments->v.array_ptr; | |
int num_arg = a->count; | |
if (num_arg < newfunc->type->min_args || num_arg > newfunc->type->max_args) | |
goto typecheck; | |
t->saved_instruction_ptr = code+4; | |
(newfunc->func.dyn)(t, &frame->regs[code[3]], num_arg, a->data); | |
code = t->saved_instruction_ptr; | |
NEXT(); | |
} | |
} | |
is_array = 1; | |
} else if (TYPE_IS_POINTER_TO_TYPE(arguments->type) && TYPE_IS_STRUCT(arguments->type)) { | |
if (newfuncatom->type.code == SVMT_native_func) { | |
svmnativefunc *newfunc = (svmnativefunc *) newfuncatom->v.ptr; | |
if (newfunc->is_statictyped && newfunc->type->struct_type == arguments->type.ptr) { | |
t->saved_instruction_ptr = code + 4; | |
newfunc->func.stat(t, &frame->regs[code[3]], arguments->v.struct_ptr->data); | |
code = t->saved_instruction_ptr; | |
NEXT(); | |
} | |
} | |
is_array = 0; | |
} else { | |
goto typecheck; | |
} | |
// now we have the non-fast path. first, we handle a closure, | |
// rather than doing that "late" the way we did for SVMOP_call. | |
if (newfuncatom->type.code == SVMT_closure) { | |
svmclosure *c = (svmclosure *) newfuncatom->v.ptr; | |
t->context->environment = c->environment; | |
newfuncatom = &c->function; | |
} | |
// next, we'll copy the arguments onto the thread stack, so that we | |
// can ignore the distinction between the arguments being in | |
      // an array or being in a structure.
{ | |
int num_arg; | |
svmatom *arglist; | |
// To do that, we need to know where they go on the stack, | |
// which depends on the type of function it is. | |
if (newfuncatom->type.code == SVMT_bytecode_func) { | |
svmstackframe *newframe = (svmstackframe *) stack; | |
arglist = &newframe->regs[0]; | |
} else { | |
arglist = (svmatom *) stack; | |
} | |
if (is_array) { | |
int i; | |
svmarray *a = arguments->v.array_ptr; | |
num_arg = a->count; | |
if (arglist + num_arg > (svmatom *) t) | |
goto stack_overflow; | |
for (i=0; i < num_arg; ++i) | |
arglist[i] = a->data[i]; | |
} else { | |
int i; | |
svmstructtype *type = GET_STRUCT_TYPE(arguments->type); | |
num_arg = type->num_fields; | |
if (arglist + num_arg > (svmatom *) t) | |
goto stack_overflow; | |
for (i=0; i < num_arg; ++i) { | |
arglist[i].v = arguments->v.struct_ptr->data[i]; | |
arglist[i].type = type->field[i]; | |
} | |
} | |
if (newfuncatom->type.code == SVMT_bytecode_func) { | |
svmfunc *newfunc = (svmfunc *) newfuncatom->v.ptr; | |
svmstackframe *newframe = (svmstackframe *) stack; | |
int i; | |
void *newstack; | |
newframe->saved_func = func; | |
newframe->saved_stackframe = frame; | |
newframe->saved_instruction_ptr = code + 4; | |
newstack = (char *) stack + newfunc->stack_frame_size; | |
if (num_arg < newfunc->min_args || num_arg > newfunc->max_args) | |
goto typecheck; | |
if ((char *) newstack > (char *) t) | |
goto stack_overflow; | |
if ((char *) &newframe->regs[num_arg] > (char *) t) | |
goto stack_overflow; | |
svm__assert(arglist == newframe->regs); | |
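            // the arguments are already sitting in the new frame's registers
            // (they were copied there above), so unlike the SVMOP_call path we
            // only need to coerce the typed ones in place, not move anything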
if (num_arg <= newfunc->max_arg_types) | |
for (i=0; i < num_arg; ++i) | |
svm_coerce_inplace(t, &newframe->regs[i], newfunc->type_constants[i]); | |
else { | |
int num_types = newfunc->max_arg_types; | |
for (i=0; i < num_types; ++i) | |
svm_coerce_inplace(t, &newframe->regs[i], newfunc->type_constants[i]); | |
} | |
code = newfunc->bytecode; | |
func = newfunc; | |
frame = newframe; | |
stack = newstack; | |
if (func->has_initialization) { | |
t->context->temp = num_arg; | |
goto func_initializer; | |
} | |
} else if (newfuncatom->type.code == SVMT_native_func) { | |
svmnativefunc *newfunc = (svmnativefunc *) newfuncatom->v.ptr; | |
if (newfunc->is_statictyped) { | |
svmfunctype *type = newfunc->type; | |
native_func_staticargs *funcptr = newfunc->func.stat; | |
int i; | |
svmvalue *final_args = (svmvalue *) stack; | |
if (num_arg != type->max_args) | |
goto typecheck; | |
if ((char *) &arglist[num_arg] > (char *) t) | |
goto stack_overflow; | |
               // note that final_args and arglist overlap, but in a safe way:
               // svmvalue is presumably no larger than svmatom, so the slot
               // written for argument i never reaches past the slot argument i
               // was read from, and the not-yet-read arguments stay intact
for (i=0; i < num_arg; ++i) | |
svm_coerce_static(t, &final_args[i], type->arg_types[i], &arglist[i]); | |
if (t->context->has_exception) | |
goto typecheck; | |
t->saved_instruction_ptr = code + 4; | |
funcptr(t, &frame->regs[code[3]], final_args); | |
code = t->saved_instruction_ptr; | |
} else { | |
native_func_dynamicargs *funcptr = newfunc->func.dyn; | |
svmatom *arglist = (svmatom *) stack; | |
if (num_arg < newfunc->type->min_args || num_arg > newfunc->type->max_args) | |
goto typecheck; | |
t->saved_instruction_ptr = code + 4; | |
funcptr(t, &frame->regs[code[3]], num_arg, arglist); | |
code = t->saved_instruction_ptr; | |
} | |
} else { | |
goto typecheck; | |
} | |
} | |
} | |
// Now let's make a few call variants that are optimized for when the function to | |
// call is in a register, but the types of everything are statically known, | |
// and we know there's no varargs and such. We're going to assume that languages | |
// allow free intermixing of bytecode and native coded functions, so they can't | |
// know whether a register contains one or the other. So we're not going to | |
// bother producing specialized cases that limit to one or the other, and | |
// because this general case has to handle all three, it's pretty long, so | |
   // we're only going to have a couple of specialized ones. We'll also
   // assume this means that while we can tell whether a function might use
   // varargs (that's part of the public signature), we don't know whether it
   // might use alloca or even build an 'args' array (that's not part of the
   // public function signature). Thus we still have to do all the function
   // initialization for those cases.
// | |
// The first case we have is a function call with no arguments, which is | |
// much simpler since all the argument processing is gone. | |
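   // Operand layout assumed below (inferred from the indices used):
   // code[1] is the register holding the function, code[2] the destination
   // register; the next instruction is at code + 3.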
CASE(call_fr_a_static) | |
{ | |
// this proceeds largely the same as SVMOP_call, except we skip all the | |
// argument processing. | |
      svmatom *newfuncatom = &regs(code[1]);
if (newfuncatom->type.code == SVMT_bytecode_func) { | |
svmfunc *newfunc = (svmfunc *) newfuncatom->v.ptr; | |
svmstackframe *newframe = (svmstackframe *) stack; | |
void *newstack; | |
newframe->saved_func = func; | |
newframe->saved_stackframe = frame; | |
newframe->saved_instruction_ptr = code + 3; | |
newstack = (char *) stack + newfunc->stack_frame_size; | |
if ((char *) newstack > (char *) t) | |
goto stack_overflow; | |
code = newfunc->bytecode; | |
func = newfunc; | |
frame = newframe; | |
stack = newstack; | |
if (func->has_initialization) { | |
t->context->temp = 0; | |
goto func_initializer; | |
} | |
} else if (newfuncatom->type.code == SVMT_native_func) { | |
svmnativefunc *newfunc = (svmnativefunc *) newfuncatom->v.ptr; | |
t->saved_instruction_ptr = code + 3; | |
if (newfunc->is_statictyped) | |
newfunc->func.stat(t, &frame->regs[code[2]], SVM__NULL); | |
else | |
newfunc->func.dyn(t, &frame->regs[code[2]], 0, SVM__NULL); | |
code = t->saved_instruction_ptr; | |
} else | |
goto typecheck; | |
} | |
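   // For illustration only (not from the original source): a minimal sketch
   // of a dynamic-args native function that would plug into the call paths
   // above. The signature is inferred from the call sites
   // 'funcptr(t, &frame->regs[...], num_arg, arglist)'; the void return type
   // and the helpers svm_to_number/svm_make_number are assumptions, not part
   // of the real API.
   //
   //    static void my_native_sum(svmthread *t, svmatom *dest,
   //                              int num_arg, svmatom *arglist)
   //    {
   //       double total = 0;
   //       int i;
   //       for (i=0; i < num_arg; ++i)
   //          total += svm_to_number(t, &arglist[i]);  // hypothetical coercion helper
   //       *dest = svm_make_number(total);             // hypothetical constructor
   //    }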